From 023063a0980ad807e08c538c35654cee8e721b4b Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Mon, 26 Aug 2024 18:01:09 +0000 Subject: [PATCH 01/12] DAOS-16447 test: set D_IL_REPORT per test (#15012) set D_IL_REPORT per test instead of setting defaults values in utilities. This allows running without it set. Features: IoctlPoolHandles IoSysAdmin performance ior dfuse soak_smoke Skip-unit-tests: true Skip-fault-injection-test: true Required-githooks: true Signed-off-by: Dalton Bohning --- src/tests/ftest/io/large_file_count.yaml | 4 ++++ src/tests/ftest/io/small_file_count.yaml | 4 ++++ src/tests/ftest/ior/small.yaml | 2 ++ src/tests/ftest/util/file_count_test_base.py | 2 +- src/tests/ftest/util/ior_test_base.py | 2 -- src/tests/ftest/util/ior_utils.py | 6 +++--- src/tests/ftest/util/soak_utils.py | 8 -------- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/tests/ftest/io/large_file_count.yaml b/src/tests/ftest/io/large_file_count.yaml index 619143a83e8..6ff375cf3a9 100644 --- a/src/tests/ftest/io/large_file_count.yaml +++ b/src/tests/ftest/io/large_file_count.yaml @@ -44,6 +44,8 @@ ior: repetitions: 1 transfer_size: '1M' block_size: '7G' + env_vars: + - D_IL_REPORT=1 dfuse: disable_caching: true @@ -59,3 +61,5 @@ mdtest: write_bytes: 4096 read_bytes: 4096 depth: 0 + env_vars: + - D_IL_REPORT=1 diff --git a/src/tests/ftest/io/small_file_count.yaml b/src/tests/ftest/io/small_file_count.yaml index b9bf23cd126..79e02c3d787 100644 --- a/src/tests/ftest/io/small_file_count.yaml +++ b/src/tests/ftest/io/small_file_count.yaml @@ -45,6 +45,8 @@ ior: repetitions: 1 transfer_size: '1M' block_size: '2G' + env_vars: + - D_IL_REPORT=1 dfuse: disable_caching: true @@ -60,3 +62,5 @@ mdtest: write_bytes: 4096 read_bytes: 4096 depth: 0 + env_vars: + - D_IL_REPORT=1 diff --git a/src/tests/ftest/ior/small.yaml b/src/tests/ftest/ior/small.yaml index b0b21620a93..b638a396580 100644 --- a/src/tests/ftest/ior/small.yaml +++ b/src/tests/ftest/ior/small.yaml @@ -38,6 +38,8 @@ container: properties: cksum:crc16,cksum_size:16384,srv_cksum:on control_method: daos ior: + env_vars: + - D_IL_REPORT=1 ior_timeout: 75 client_processes: ppn: 32 diff --git a/src/tests/ftest/util/file_count_test_base.py b/src/tests/ftest/util/file_count_test_base.py index be21183c97a..12c66d76b8c 100644 --- a/src/tests/ftest/util/file_count_test_base.py +++ b/src/tests/ftest/util/file_count_test_base.py @@ -97,7 +97,7 @@ def run_file_count(self): self.processes = mdtest_np self.ppn = mdtest_ppn if self.mdtest_cmd.api.value == 'POSIX': - self.mdtest_cmd.env.update(LD_PRELOAD=intercept, D_IL_REPORT='1') + self.mdtest_cmd.env.update(LD_PRELOAD=intercept) self.execute_mdtest() else: self.execute_mdtest() diff --git a/src/tests/ftest/util/ior_test_base.py b/src/tests/ftest/util/ior_test_base.py index 625a283593e..8f056c34002 100644 --- a/src/tests/ftest/util/ior_test_base.py +++ b/src/tests/ftest/util/ior_test_base.py @@ -225,8 +225,6 @@ def run_ior(self, manager, processes, intercept=None, display_space=True, env = self.ior_cmd.get_default_env(str(manager), self.client_log) if intercept: env['LD_PRELOAD'] = intercept - if 'D_IL_REPORT' not in env: - env['D_IL_REPORT'] = '1' if plugin_path: env["HDF5_VOL_CONNECTOR"] = "daos" env["HDF5_PLUGIN_PATH"] = str(plugin_path) diff --git a/src/tests/ftest/util/ior_utils.py b/src/tests/ftest/util/ior_utils.py index 7851e4587d7..ffde4454fcb 100644 --- a/src/tests/ftest/util/ior_utils.py +++ b/src/tests/ftest/util/ior_utils.py @@ -588,7 +588,7 @@ def get_unique_log(self, container): return '.'.join(['_'.join(parts), 'log']) def run(self, pool, container, processes, ppn=None, intercept=None, plugin_path=None, - dfuse=None, display_space=True, fail_on_warning=False, unique_log=True, il_report=1): + dfuse=None, display_space=True, fail_on_warning=False, unique_log=True, il_report=None): # pylint: disable=too-many-arguments """Run ior. @@ -609,7 +609,7 @@ def run(self, pool, container, processes, ppn=None, intercept=None, plugin_path= unique_log (bool, optional): whether or not to update the log file with a new unique log file name. Defaults to True. il_report (int, optional): D_IL_REPORT value to use when 'intercept' is specified and a - value does not already exist in the environment. Defaults to 1. + value does not already exist in the environment. Defaults to None. Raises: CommandFailure: if there is an error running the ior command @@ -627,7 +627,7 @@ def run(self, pool, container, processes, ppn=None, intercept=None, plugin_path= self.env["LD_PRELOAD"] = intercept if "D_LOG_MASK" not in self.env: self.env["D_LOG_MASK"] = "INFO" - if "D_IL_REPORT" not in self.env: + if "D_IL_REPORT" not in self.env and il_report is not None: self.env["D_IL_REPORT"] = str(il_report) if plugin_path: diff --git a/src/tests/ftest/util/soak_utils.py b/src/tests/ftest/util/soak_utils.py index 39178e0e0d9..9e523c6096c 100644 --- a/src/tests/ftest/util/soak_utils.py +++ b/src/tests/ftest/util/soak_utils.py @@ -997,10 +997,8 @@ def create_ior_cmdline(self, job_spec, pool, ppn, nodesperjob, oclass_list=None, mpirun_cmd.get_params(self) if api == "POSIX-LIBPIL4DFS": env["LD_PRELOAD"] = os.path.join(self.prefix, 'lib64', 'libpil4dfs.so') - env["D_IL_REPORT"] = "1" if api == "POSIX-LIBIOIL": env["LD_PRELOAD"] = os.path.join(self.prefix, 'lib64', 'libioil.so') - env["D_IL_REPORT"] = "1" # add envs if api is HDF5-VOL if api == "HDF5-VOL": vol = True @@ -1166,10 +1164,8 @@ def create_mdtest_cmdline(self, job_spec, pool, ppn, nodesperjob): if self.enable_il and api == "POSIX-LIBPIL4DFS": env["LD_PRELOAD"] = os.path.join( self.prefix, 'lib64', 'libpil4dfs.so') - env["D_IL_REPORT"] = "1" if self.enable_il and api == "POSIX-LIBIOIL": env["LD_PRELOAD"] = os.path.join(self.prefix, 'lib64', 'libioil.so') - env["D_IL_REPORT"] = "1" mpirun_cmd = Mpirun(mdtest_cmd, mpi_type=self.mpi_module) mpirun_cmd.get_params(self) mpirun_cmd.assign_processes(nodesperjob * ppn) @@ -1297,10 +1293,8 @@ def create_fio_cmdline(self, job_spec, pool): cmds.append(f"cd {dfuse.mount_dir.value};") if self.enable_il and api == "POSIX-LIBPIL4DFS": cmds.append(f"export LD_PRELOAD={os.path.join(self.prefix, 'lib64', 'libpil4dfs.so')}") - cmds.append("export D_IL_REPORT=1") if self.enable_il and api == "POSIX-LIBIOIL": cmds.append(f"export LD_PRELOAD={os.path.join(self.prefix, 'lib64', 'libioil.so')}") - cmds.append("export D_IL_REPORT=1") cmds.append(str(fio_cmd)) cmds.append("status=$?") cmds.append("cd -") @@ -1372,10 +1366,8 @@ def create_app_cmdline(self, job_spec, pool, ppn, nodesperjob): env["DAOS_UNS_PREFIX"] = format_path(pool, self.container[-1]) if self.enable_il and api == "POSIX-LIBPIL4DFS": env["LD_PRELOAD"] = os.path.join(self.prefix, 'lib64', 'libpil4dfs.so') - env["D_IL_REPORT"] = "1" if self.enable_il and api == "POSIX-LIBIOIL": env["LD_PRELOAD"] = os.path.join(self.prefix, 'lib64', 'libioil.so') - env["D_IL_REPORT"] = "1" mpirun_cmd.assign_environment(env, True) mpirun_cmd.assign_processes(nodesperjob * ppn) mpirun_cmd.ppn.update(ppn) From 663d70161255d177594c5b6c5ea79bc9694c3f88 Mon Sep 17 00:00:00 2001 From: Makito Kano Date: Fri, 6 Sep 2024 06:59:02 +0900 Subject: [PATCH 02/12] =?UTF-8?q?DAOS-16446=20test:=20HDF5-VOL=20test=20-?= =?UTF-8?q?=20Set=20object=20class=20and=20container=20prope=E2=80=A6=20(#?= =?UTF-8?q?15004)=20#15098?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In HDF5, DFS, MPIIO, or POSIX, object class and container properties are defined during the container create. If it’s DFS, object class is also set to the IOR parameter. However, in HDF5-VOL, object class and container properties are defined with the following environment variables of mpirun. HDF5_DAOS_OBJ_CLASS (Object class) HDF5_DAOS_FILE_PROP (Container properties) The infrastructure to set these variables are already there in run_ior_with_pool(). In file_count_test_base.py, pass in the env vars to run_ior_with_pool(env=env) as a dictionary. Object class is the oclass variable. Container properties can be obtained from self.container.properties.value. This fix is discussed in PR #14964. Skip-unit-tests: true Skip-fault-injection-test: true Skip-func-hw-test-medium-md-on-ssd: false Test-tag: test_io_sys_admin test_largefilecount test_smallfilecount Required-githooks: true Signed-off-by: Makito Kano --- src/tests/ftest/util/file_count_test_base.py | 31 ++++++++++++++------ 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/src/tests/ftest/util/file_count_test_base.py b/src/tests/ftest/util/file_count_test_base.py index 12c66d76b8c..f95e22bd4bd 100644 --- a/src/tests/ftest/util/file_count_test_base.py +++ b/src/tests/ftest/util/file_count_test_base.py @@ -17,15 +17,15 @@ class FileCountTestBase(IorTestBase, MdtestBase): :avocado: recursive """ - def add_containers(self, file_oclass=None, dir_oclass=None): - """Create a list of containers that the various jobs use for storage. + def get_file_write_container(self, file_oclass=None, dir_oclass=None): + """Create a container, set oclass, dir_oclass, and add rd_fac property based on oclass. Args: - file_oclass (str, optional): file object class of container. - Defaults to None. - dir_oclass (str, optional): dir object class of container. - Defaults to None. + file_oclass (str, optional): file object class of container. Defaults to None. + dir_oclass (str, optional): dir object class of container. Defaults to None. + Returns: + TestContainer: Created container with oclass, dir_oclass, and rd_fac set. """ # Create a container and add it to the overall list of containers @@ -92,7 +92,7 @@ def run_file_count(self): rd_fac = extract_redundancy_factor(oclass) dir_oclass = self.get_diroclass(rd_fac) self.mdtest_cmd.dfs_dir_oclass.update(dir_oclass) - self.container = self.add_containers(oclass, dir_oclass) + self.container = self.get_file_write_container(oclass, dir_oclass) try: self.processes = mdtest_np self.ppn = mdtest_ppn @@ -111,14 +111,27 @@ def run_file_count(self): # run ior self.log.info("=======>>>Starting IOR with %s and %s", api, oclass) self.ior_cmd.dfs_oclass.update(oclass) - self.container = self.add_containers(oclass) + self.container = self.get_file_write_container(oclass) self.update_ior_cmd_with_pool(False) try: self.processes = ior_np self.ppn = ior_ppn if api == 'HDF5-VOL': + # Format the container properties so that it works with HDF5-VOL env var. + # Each entry:value pair needs to be separated by a semicolon. Since we're + # using this in the mpirun command, semicolon would indicate the end of the + # command, so quote the whole thing. + cont_props = self.container.properties.value + cont_props_hdf5_vol = '"' + cont_props.replace(",", ";") + '"' + self.log.info("cont_props_hdf5_vol = %s", cont_props_hdf5_vol) + env = self.ior_cmd.env.copy() + env.update({ + "HDF5_DAOS_OBJ_CLASS": oclass, + "HDF5_DAOS_FILE_PROP": cont_props_hdf5_vol + }) self.ior_cmd.api.update('HDF5') - self.run_ior_with_pool(create_pool=False, plugin_path=hdf5_plugin_path) + self.run_ior_with_pool( + create_pool=False, plugin_path=hdf5_plugin_path, env=env) elif self.ior_cmd.api.value == 'POSIX': self.run_ior_with_pool(create_pool=False, intercept=intercept) else: From 031f37218a21b2c37a10f6a83cf690dcf87a084f Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Mon, 12 Aug 2024 16:55:06 +0000 Subject: [PATCH 03/12] wip: more support for sudo-less client Required-githooks: true Signed-off-by: Dalton Bohning --- src/tests/ftest/launch.py | 72 +++++++++++++++++--- src/tests/ftest/network/cart_self_test.py | 5 +- src/tests/ftest/util/agent_utils.py | 8 +-- src/tests/ftest/util/apricot/apricot/test.py | 43 +++++++----- src/tests/ftest/util/command_utils.py | 46 +++++++++---- src/tests/ftest/util/dmg_utils.py | 7 +- src/tests/ftest/util/dmg_utils_base.py | 6 +- src/tests/ftest/util/general_utils.py | 21 ++++-- src/tests/ftest/util/job_manager_utils.py | 60 ++++++++-------- src/tests/ftest/util/launch_utils.py | 18 ++--- src/tests/ftest/util/network_utils.py | 2 + src/tests/ftest/util/run_utils.py | 20 ++++-- src/tests/ftest/util/server_utils.py | 10 +-- src/tests/ftest/util/server_utils_base.py | 6 +- src/tests/ftest/util/server_utils_params.py | 1 + src/tests/ftest/util/soak_test_base.py | 2 +- src/tests/ftest/util/soak_utils.py | 6 +- src/tests/ftest/util/systemctl_utils.py | 6 +- 18 files changed, 228 insertions(+), 111 deletions(-) diff --git a/src/tests/ftest/launch.py b/src/tests/ftest/launch.py index f3daa8464d7..550c02d7176 100755 --- a/src/tests/ftest/launch.py +++ b/src/tests/ftest/launch.py @@ -282,7 +282,8 @@ def _run(self, args): else: set_test_environment( logger, test_env, args.test_servers, args.test_clients, args.provider, - args.insecure_mode, self.details) + args.insecure_mode, self.details, args.agent_user, args.test_log_dir, + args.server_ld_lib) except TestEnvironmentException as error: message = f"Error setting up test environment: {str(error)}" return self.get_exit_status(1, message, "Setup", sys.exc_info()) @@ -320,12 +321,13 @@ def _run(self, args): return self.get_exit_status(0, "Listing tests complete") # Setup the fuse configuration - try: - setup_fuse_config(logger, args.test_servers | args.test_clients) - except LaunchException: - # Warn but don't fail - message = "Issue detected setting up the fuse configuration" - setup_result.warn_test(logger, "Setup", message, sys.exc_info()) + if args.fuse_setup: + try: + setup_fuse_config(logger, args.test_servers | args.test_clients) + except LaunchException: + # Warn but don't fail + message = "Issue detected setting up the fuse configuration" + setup_result.warn_test(logger, "Setup", message, sys.exc_info()) # Setup override systemctl files try: @@ -358,8 +360,8 @@ def _run(self, args): group.update_test_yaml( logger, args.scm_size, args.scm_mount, args.extra_yaml, args.timeout_multiplier, args.override, args.verbose, args.include_localhost) - except (RunException, YamlException) as e: - message = "Error modifying the test yaml files: {}".format(e) + except (RunException, YamlException) as error: + message = f"Error modifying the test yaml files: {str(error)}" status |= self.get_exit_status(1, message, "Setup", sys.exc_info()) except StorageException: message = "Error detecting storage information for test yaml files" @@ -540,6 +542,12 @@ def main(): "-a", "--archive", action="store_true", help="archive host log files in the avocado job-results directory") + parser.add_argument( + "-au", "--agent_user", + action="store", + default=None, + type=str, + help="user account to use when running the daos_agent") parser.add_argument( "-c", "--clear_mounts", action="append", @@ -562,6 +570,10 @@ def main(): "--failfast", action="store_true", help="stop the test suite after the first failure") + parser.add_argument( + "-fs", "--fuse_setup", + action="store_true", + help="enable setting up fuse configuration files") parser.add_argument( "-i", "--include_localhost", action="store_true", @@ -584,7 +596,7 @@ def main(): help="modify the test yaml files but do not run the tests") parser.add_argument( "-mo", "--mode", - choices=['normal', 'manual', 'ci'], + choices=['normal', 'manual', 'ci', 'custom_a'], default='normal', help="provide the mode of test to be run under. Default is normal, " "in which the final return code of launch.py is still zero if " @@ -649,6 +661,12 @@ def main(): "-si", "--slurm_install", action="store_true", help="enable installing slurm RPMs if required by the tests") + parser.add_argument( + "-sl", "--server_ld_lib", + action="store", + default=None, + type=str, + help="LD_LIBRARY_PATH environment variable to use in the daos_server config file") parser.add_argument( "--scm_mount", action="store", @@ -681,6 +699,12 @@ def main(): default=NodeSet(), help="comma-separated list of hosts to use as replacement values for " "client placeholders in each test's yaml file") + parser.add_argument( + "-tld", "--test_log_dir", + action="store", + default=None, + type=str, + help="test log directory base path") parser.add_argument( "-th", "--logs_threshold", action="store", @@ -744,10 +768,38 @@ def main(): args.slurm_install = True args.slurm_setup = True args.user_create = True + args.fuse_setup = True args.clear_mounts.append("/mnt/daos") args.clear_mounts.append("/mnt/daos0") args.clear_mounts.append("/mnt/daos1") + elif args.mode == "custom_a": + if args.agent_user is None: + # Run the agent with the current user by default + args.agent_user = getpass.getuser() + if os.environ.get("DAOS_TEST_LOG_DIR", args.test_log_dir) is None: + # Use a user-specific test log dir by default + args.test_log_dir = os.path.join( + os.sep, "var", "tmp", f"daos_testing_{args.agent_user}") + if os.environ.get("DAOS_TEST_CONTROL_CONFIG") is None: + os.environ["DAOS_TEST_CONTROL_CONFIG"] = os.path.join( + os.environ.get("DAOS_TEST_LOG_DIR", args.test_log_dir), + "daos_control.yml") + if os.environ.get("DAOS_TEST_AGENT_CONFIG") is None: + os.environ["DAOS_TEST_AGENT_CONFIG"] = os.path.join( + os.environ.get("DAOS_TEST_LOG_DIR", args.test_log_dir), + "daos_agent.yml") + if os.environ.get("DAOS_TEST_SERVER_CONFIG") is None: + os.environ["DAOS_TEST_SERVER_CONFIG"] = os.path.join( + os.environ.get("DAOS_TEST_LOG_DIR", args.test_log_dir), + "daos_server.yml") + args.process_cores = False + args.logs_threshold = None + args.slurm_install = False + args.slurm_setup = False + args.user_create = False + args.fuse_setup = False + # Setup the Launch object launch = Launch(args.name, args.mode, args.slurm_install, args.slurm_setup) diff --git a/src/tests/ftest/network/cart_self_test.py b/src/tests/ftest/network/cart_self_test.py index 9600531c5f2..a1e78684c8e 100644 --- a/src/tests/ftest/network/cart_self_test.py +++ b/src/tests/ftest/network/cart_self_test.py @@ -3,6 +3,8 @@ SPDX-License-Identifier: BSD-2-Clause-Patent """ +import os + from apricot import TestWithServers from command_utils import ExecutableCommand from command_utils_base import EnvironmentVariables, FormattedParameter @@ -70,7 +72,8 @@ def setUp(self): self.server_managers[0].get_config_value("provider") self.cart_env["D_INTERFACE"] = \ self.server_managers[0].get_config_value("fabric_iface") - self.cart_env["DAOS_AGENT_DRPC_DIR"] = "/var/run/daos_agent/" + self.cart_env["DAOS_AGENT_DRPC_DIR"] = os.environ.get( + "DAOS_AGENT_DRPC_DIR", "/var/run/daos_agent/") self.server_managers[0].manager.assign_environment(self.cart_env, True) self.server_managers[0].detect_start_via_dmg = True diff --git a/src/tests/ftest/util/agent_utils.py b/src/tests/ftest/util/agent_utils.py index 74b79fb9796..d3e32997343 100644 --- a/src/tests/ftest/util/agent_utils.py +++ b/src/tests/ftest/util/agent_utils.py @@ -229,9 +229,8 @@ def __init__(self, group, bin_dir, cert_dir, config_file, run_user, config_temp= the hosts using the config_file specification. Defaults to None. manager (str, optional): the name of the JobManager class used to manage the YamlCommand defined through the "job" attribute. - Defaults to "Orterun". - outputdir (str, optional): path to avocado test outputdir. Defaults - to None. + Defaults to "Systemctl". + outputdir (str, optional): path to avocado test outputdir. Defaults to None. """ agent_command = get_agent_command( group, cert_dir, bin_dir, config_file, run_user, config_temp) @@ -283,8 +282,7 @@ def start(self): self._hosts, self.manager.command) # Copy certificates - self.manager.job.copy_certificates( - get_log_file("daosCA/certs"), self._hosts) + self.manager.job.copy_certificates(get_log_file("daosCA/certs"), self._hosts) # Verify the socket directory exists when using a non-systemctl manager if self.verify_socket_dir: diff --git a/src/tests/ftest/util/apricot/apricot/test.py b/src/tests/ftest/util/apricot/apricot/test.py index 42e05937f37..a70973659c9 100644 --- a/src/tests/ftest/util/apricot/apricot/test.py +++ b/src/tests/ftest/util/apricot/apricot/test.py @@ -10,6 +10,7 @@ import re import sys from ast import literal_eval +from getpass import getuser from time import time from agent_utils import DaosAgentManager, include_local_host @@ -736,7 +737,7 @@ def setUp(self): # Toggle whether to dump server ULT stacks on failure self.__dump_engine_ult_on_failure = self.params.get( - "dump_engine_ult_on_failure", "/run/setup/*", True) + "dump_engine_ult_on_failure", "/run/setup/*", self.__dump_engine_ult_on_failure) # # Find a configuration that meets the test requirements # self.config = Configuration( @@ -1065,12 +1066,10 @@ def add_agent_manager(self, group=None, config_file=None, config_temp=None): """ if group is None: group = self.server_group - if config_file is None and self.agent_manager_class == "Systemctl": + + if config_file is None: config_file = self.test_env.agent_config config_temp = self.get_config_file(group, "agent", self.test_dir) - elif config_file is None: - config_file = self.get_config_file(group, "agent") - config_temp = None # Verify the correct configuration files have been provided if self.agent_manager_class == "Systemctl" and config_temp is None: @@ -1079,10 +1078,12 @@ def add_agent_manager(self, group=None, config_file=None, config_temp=None): "file provided for the Systemctl manager class!") # Define the location of the certificates - if self.agent_manager_class == "Systemctl": + if self.agent_manager_class == "Systemctl" and self.test_env.agent_user != getuser(): + # Default directory requiring privileged access cert_dir = os.path.join(os.sep, "etc", "daos", "certs") else: - cert_dir = self.workdir + # Test-specific directory not requiring privileged access + cert_dir = os.path.join(self.test_dir, "certs") self.agent_managers.append( DaosAgentManager( @@ -1115,6 +1116,8 @@ def add_server_manager(self, group=None, svr_config_file=None, """ if group is None: group = self.server_group + + # Set default server config files if svr_config_file is None and self.server_manager_class == "Systemctl": svr_config_file = self.test_env.server_config svr_config_temp = self.get_config_file( @@ -1122,12 +1125,6 @@ def add_server_manager(self, group=None, svr_config_file=None, elif svr_config_file is None: svr_config_file = self.get_config_file(group, "server") svr_config_temp = None - if dmg_config_file is None and self.server_manager_class == "Systemctl": - dmg_config_file = self.test_env.control_config - dmg_config_temp = self.get_config_file(group, "dmg", self.test_dir) - elif dmg_config_file is None: - dmg_config_file = self.get_config_file(group, "dmg") - dmg_config_temp = None # Verify the correct configuration files have been provided if self.server_manager_class == "Systemctl" and svr_config_temp is None: @@ -1135,13 +1132,25 @@ def add_server_manager(self, group=None, svr_config_file=None, "Error adding a DaosServerManager: no temporary configuration " "file provided for the Systemctl manager class!") - # Define the location of the certificates + # Set default dmg config files + if dmg_config_file is None: + if self.server_manager_class == "Systemctl": + dmg_config_file = self.test_env.control_config + dmg_config_temp = self.get_config_file(group, "dmg", self.test_dir) + else: + dmg_config_file = os.path.join(self.test_dir, "daos_control.yml") + + # Define server certificate directory if self.server_manager_class == "Systemctl": svr_cert_dir = os.path.join(os.sep, "etc", "daos", "certs") - dmg_cert_dir = os.path.join(os.sep, "etc", "daos", "certs") else: svr_cert_dir = self.workdir - dmg_cert_dir = self.workdir + + # Define dmg certificate directory + if self.server_manager_class == "Systemctl" and self.test_env.agent_user != getuser(): + dmg_cert_dir = os.path.join(os.sep, "etc", "daos", "certs") + else: + dmg_cert_dir = os.path.join(self.test_dir, "certs") self.server_managers.append( DaosServerManager( @@ -1681,7 +1690,7 @@ def get_dmg_command(self, index=0): dmg_cmd = get_dmg_command( self.server_group, dmg_cert_dir, self.bin, dmg_config_file, - dmg_config_temp, self.access_points_suffix) + dmg_config_temp, self.access_points_suffix, getuser()) dmg_cmd.hostlist = self.access_points return dmg_cmd diff --git a/src/tests/ftest/util/command_utils.py b/src/tests/ftest/util/command_utils.py index 06da3b27868..2a533e66e9e 100644 --- a/src/tests/ftest/util/command_utils.py +++ b/src/tests/ftest/util/command_utils.py @@ -59,7 +59,13 @@ def __init__(self, namespace, command, path="", subprocess=False, check_results= self.exit_status_exception = True self.output_check = "both" self.verbose = True + # TODO proper. Really just need this for all "client" commands self.env = EnvironmentVariables() + _env_from_os = ("DAOS_AGENT_DRPC_DIR",) + for key in _env_from_os: + val = os.environ.get(key) + if val is not None: + self.env[key] = val # User to run the command as. "root" is equivalent to sudo self.run_user = run_user @@ -1022,12 +1028,20 @@ def copy_certificates(self, source, hosts): src_file = os.path.join(source, file_name) dst_file = os.path.join(name, file_name) self.log.debug(" %s -> %s", src_file, dst_file) + # Don't use sudo if running as the current user + # TODO proper + _sudo = self.run_user != getuser() or \ + self.certificate_owner != getuser() or dst_file.startswith('/etc') + _owner = self.certificate_owner if _sudo else None result = distribute_files( self.log, hosts, src_file, dst_file, mkdir=False, - verbose=False, sudo=True, owner=self.certificate_owner) + verbose=True, sudo=_sudo, owner=_owner) if not result.passed: + # TODO warns on copying dmg certs because it is done multiple times + # to the same destination self.log.info( - " WARNING: %s copy failed on %s", dst_file, result.failed_hosts) + " WARNING: %s copy failed on %s:\n%s", + dst_file, result.failed_hosts, result) names.add(name) yaml = yaml.other_params @@ -1055,9 +1069,14 @@ def copy_configuration(self, hosts): self.log.info( "Copying %s yaml configuration file to %s on %s", self.temporary_file, self.yaml.filename, hosts) + # Don't use sudo if running as the current user + # TODO proper + _sudo = self.run_user != getuser() or self.certificate_owner != getuser() or \ + self.yaml.filename.startswith('/etc') + _owner = self.certificate_owner if _sudo else None result = distribute_files( - self.log, hosts, self.temporary_file, self.yaml.filename, verbose=False, - sudo=True) + self.log, hosts, self.temporary_file, self.yaml.filename, + verbose=True, sudo=_sudo, owner=_owner) if not result.passed: raise CommandFailure( f"ERROR: Copying yaml configuration file to {result.failed_hosts}") @@ -1083,17 +1102,16 @@ def verify_socket_directory(self, user, hosts): self.log.info( "%s: creating socket directory %s for user %s on %s", self.command, directory, user, nodes) - result = create_directory(self.log, nodes, directory, user="root") + if user == getuser(): + result = create_directory(self.log, nodes, directory) + else: + result = create_directory(self.log, nodes, directory, user="root") + change_file_owner( + self.log, nodes, directory, user, get_primary_group(user), user="root") if not result.passed: raise CommandFailure( f"{self.command}: error creating socket directory {directory} for user " f"{user} on {result.failed_hosts}") - result = change_file_owner( - self.log, nodes, directory, user, get_primary_group(user), user="root") - if not result.passed: - raise CommandFailure( - f"{self.command}: error setting socket directory {directory} owner for " - f"user {user} on {result.failed_hosts}") def get_socket_dir(self): """Get the socket directory. @@ -1119,14 +1137,14 @@ def _get_new(self): class SubprocessManager(ObjectWithParameters): """Defines an object that manages a sub process launched with orterun.""" - def __init__(self, command, manager="Orterun", namespace=None): + def __init__(self, command, manager="Systemctl", namespace=None): """Create a SubprocessManager object. Args: command (YamlCommand): command to manage as a subprocess manager (str, optional): the name of the JobManager class used to manage the YamlCommand defined through the "job" attribute. - Defaults to "OpenMpi" + Defaults to "Systemctl" namespace (str): yaml namespace (path to parameters) """ super().__init__(namespace) @@ -1290,7 +1308,7 @@ def get_config_value(self, name): return value def get_current_state(self): - """Get the current state of the daos_server ranks. + """Get the current state of the service. Returns: dict: dictionary of server rank keys, each referencing a dictionary diff --git a/src/tests/ftest/util/dmg_utils.py b/src/tests/ftest/util/dmg_utils.py index 34e21f66d0a..a12646af389 100644 --- a/src/tests/ftest/util/dmg_utils.py +++ b/src/tests/ftest/util/dmg_utils.py @@ -19,7 +19,8 @@ class DmgJsonCommandFailure(CommandFailure): """Exception raised when a dmg --json command fails.""" -def get_dmg_command(group, cert_dir, bin_dir, config_file, config_temp=None, hostlist_suffix=None): +def get_dmg_command(group, cert_dir, bin_dir, config_file, config_temp=None, hostlist_suffix=None, + run_user=None): """Get a dmg command object. Args: @@ -33,6 +34,8 @@ def get_dmg_command(group, cert_dir, bin_dir, config_file, config_temp=None, hos utilizes the file specified by config_file. hostlist_suffix (str, optional): Suffix to append to each host name. Defaults to None. + run_user (str, optional): user to run as. Defaults to None, which will run commands as + the current user. Returns: DmgCommand: the dmg command object @@ -40,7 +43,7 @@ def get_dmg_command(group, cert_dir, bin_dir, config_file, config_temp=None, hos """ transport_config = DmgTransportCredentials(cert_dir) config = DmgYamlParameters(config_file, group, transport_config) - command = DmgCommand(bin_dir, config, hostlist_suffix) + command = DmgCommand(bin_dir, config, hostlist_suffix, run_user) if config_temp: # Setup the DaosServerCommand to write the config file data to the # temporary file and then copy the file to all the hosts using the diff --git a/src/tests/ftest/util/dmg_utils_base.py b/src/tests/ftest/util/dmg_utils_base.py index a601b590033..5bff504241a 100644 --- a/src/tests/ftest/util/dmg_utils_base.py +++ b/src/tests/ftest/util/dmg_utils_base.py @@ -15,7 +15,7 @@ class DmgCommandBase(YamlCommand): """Defines a base object representing a dmg command.""" - def __init__(self, path, yaml_cfg=None, hostlist_suffix=None): + def __init__(self, path, yaml_cfg=None, hostlist_suffix=None, run_user=None): """Create a dmg Command object. Args: @@ -24,8 +24,10 @@ def __init__(self, path, yaml_cfg=None, hostlist_suffix=None): settings. Defaults to None, in which case settings must be supplied as command-line parameters. hostlist_suffix (str, optional): Suffix to append to each host name. Defaults to None. + run_user (str, optional): user to run as. Defaults to None, which will run commands as + the current user. """ - super().__init__("/run/dmg/*", "dmg", path, yaml_cfg) + super().__init__("/run/dmg/*", "dmg", path, yaml_cfg, run_user=run_user) # If running dmg on remote hosts, this list needs to include those hosts self.temporary_file_hosts = NodeSet(gethostname().split(".")[0]) diff --git a/src/tests/ftest/util/general_utils.py b/src/tests/ftest/util/general_utils.py index 84e55601ff2..5dcaf11d7f3 100644 --- a/src/tests/ftest/util/general_utils.py +++ b/src/tests/ftest/util/general_utils.py @@ -137,6 +137,10 @@ def run_command(command, timeout=60, verbose=True, raise_exception=True, """ log = getLogger() msg = None + if env is not None and "DAOS_AGENT_DRPC_DIR" not in env: + daos_agent_drpc_dir = os.environ.get("DAOS_AGENT_DRPC_DIR") + if daos_agent_drpc_dir: + env["DAOS_AGENT_DRPC_DIR"] = daos_agent_drpc_dir kwargs = { "cmd": command, "timeout": timeout, @@ -148,6 +152,7 @@ def run_command(command, timeout=60, verbose=True, raise_exception=True, } if verbose: log.info("Command environment vars:\n %s", env) + try: # Block until the command is complete or times out return process.run(**kwargs) @@ -1024,7 +1029,8 @@ def percent_change(val1, val2): return math.nan -def get_journalctl_command(since, until=None, system=False, units=None, identifiers=None): +def get_journalctl_command(since, until=None, system=False, units=None, identifiers=None, + run_user="root"): """Get the journalctl command to capture all unit/identifier activity from since to until. Args: @@ -1036,21 +1042,24 @@ def get_journalctl_command(since, until=None, system=False, units=None, identifi None. identifiers (str/list, optional): show messages for the specified syslog identifier(s). Defaults to None. + run_user (str, optional): user to run as. Defaults to root Returns: str: journalctl command to capture all unit activity """ - command = ["sudo", os.path.join(os.sep, "usr", "bin", "journalctl")] + command = [os.path.join(os.sep, "usr", "bin", "journalctl")] if system: command.append("--system") + if run_user != "root": + command.append("--user") for key, values in {"unit": units or [], "identifier": identifiers or []}.items(): for item in values if isinstance(values, (list, tuple)) else [values]: - command.append("--{}={}".format(key, item)) - command.append("--since=\"{}\"".format(since)) + command.append(f"--{key}={item}") + command.append(f'--since="{since}"') if until: - command.append("--until=\"{}\"".format(until)) - return " ".join(command) + command.append(f'--until="{until}"') + return command_as_user(" ".join(command), run_user) def get_journalctl(hosts, since, until, journalctl_type): diff --git a/src/tests/ftest/util/job_manager_utils.py b/src/tests/ftest/util/job_manager_utils.py index 2b5f2cd6c26..23c6bb7c358 100644 --- a/src/tests/ftest/util/job_manager_utils.py +++ b/src/tests/ftest/util/job_manager_utils.py @@ -5,8 +5,10 @@ """ import os import re +import tempfile import time # pylint: disable=too-many-lines +from getpass import getuser from shutil import which from ClusterShell.NodeSet import NodeSet @@ -14,6 +16,7 @@ from command_utils_base import BasicParameter, EnvironmentVariables, FormattedParameter from env_modules import load_mpi from exception_utils import CommandFailure, MPILoadError +from file_utils import distribute_files from general_utils import (get_job_manager_class, get_journalctl_command, journalctl_time, pcmd, run_pcmd) from run_utils import run_remote, stop_processes @@ -651,7 +654,7 @@ def __init__(self, job): """ super().__init__("/run/systemctl/*", "systemd", job) self.job = job - self._systemctl = SystemctlCommand() + self._systemctl = SystemctlCommand(run_user=job.run_user) self._systemctl.service.value = self.job.service_name self.timestamps = { @@ -816,25 +819,16 @@ def _run_unit_command(self, command): CommandFailure: if there is an issue running the command Returns: - dict: a dictionary of return codes keys and accompanying NodeSet - values indicating which hosts yielded the return code. + RemoteCommandResult: a grouping of the command results from the same hosts with the + same return status """ self._systemctl.unit_command.value = command self.timestamps[command] = journalctl_time() - result = pcmd(self._hosts, str(self), self.verbose, self.timeout) - if 255 in result: - raise CommandFailure( - "Timeout detected running '{}' with a {}s timeout on {}".format( - str(self), self.timeout, NodeSet.fromlist(result[255]))) - - if 0 not in result or len(result) > 1: - failed = [] - for item, value in list(result.items()): - if item != 0: - failed.extend(value) + result = run_remote(self.log, self._hosts, str(self), self.verbose, self.timeout) + if not result.passed: raise CommandFailure( - "Error occurred running '{}' on {}".format(str(self), NodeSet.fromlist(failed))) + "Error occurred running '{}' on {}".format(str(self), result.failed_hosts)) return result def _report_unit_command(self, command): @@ -847,8 +841,8 @@ def _report_unit_command(self, command): CommandFailure: if there is an issue running the command Returns: - dict: a dictionary of return codes keys and accompanying NodeSet - values indicating which hosts yielded the return code. + RemoteCommandResult: a grouping of the command results from the same hosts with the + same return status """ try: @@ -856,7 +850,8 @@ def _report_unit_command(self, command): except CommandFailure as error: self.log.info(error) command = get_journalctl_command( - self.timestamps[command], units=self._systemctl.service.value) + self.timestamps[command], units=self._systemctl.service.value, + run_user=self.job.run_user) self.display_log_data(self.get_log_data(self._hosts, command)) raise CommandFailure(error) from error @@ -867,8 +862,8 @@ def service_enable(self): CommandFailure: if unable to enable Returns: - dict: a dictionary of return codes keys and accompanying NodeSet - values indicating which hosts yielded the return code. + RemoteCommandResult: a grouping of the command results from the same hosts with the + same return status """ return self._report_unit_command("enable") @@ -880,8 +875,8 @@ def service_disable(self): CommandFailure: if unable to disable Returns: - dict: a dictionary of return codes keys and accompanying NodeSet - values indicating which hosts yielded the return code. + RemoteCommandResult: a grouping of the command results from the same hosts with the + same return status """ return self._report_unit_command("disable") @@ -893,8 +888,8 @@ def service_start(self): CommandFailure: if unable to start Returns: - dict: a dictionary of return codes keys and accompanying NodeSet - values indicating which hosts yielded the return code. + RemoteCommandResult: a grouping of the command results from the same hosts with the + same return status """ return self._report_unit_command("start") @@ -906,8 +901,8 @@ def service_stop(self): CommandFailure: if unable to stop Returns: - dict: a dictionary of return codes keys and accompanying NodeSet - values indicating which hosts yielded the return code. + RemoteCommandResult: a grouping of the command results from the same hosts with the + same return status """ return self._report_unit_command("stop") @@ -919,8 +914,8 @@ def service_status(self): CommandFailure: if unable to get the status Returns: - dict: a dictionary of return codes keys and accompanying NodeSet - values indicating which hosts yielded the return code. + RemoteCommandResult: a grouping of the command results from the same hosts with the + same return status """ return self._report_unit_command("status") @@ -1084,7 +1079,8 @@ def search_logs(self, pattern, since, until, quantity=1, timeout=60, verbose=Fal (str) - string indicating the number of patterns found in what duration """ - command = get_journalctl_command(since, until, units=self._systemctl.service.value) + command = get_journalctl_command( + since, until, units=self._systemctl.service.value, run_user=self.job.run_user) self.log.info("Searching for '%s' in '%s' output on %s", pattern, command, self._hosts) log_data = None @@ -1171,7 +1167,8 @@ def dump_logs(self, hosts=None, timestamp=None): if timestamp: if hosts is None: hosts = self._hosts - command = get_journalctl_command(timestamp, units=self._systemctl.service.value) + command = get_journalctl_command( + timestamp, units=self._systemctl.service.value, run_user=self.job.run_user) self.display_log_data(self.get_log_data(hosts, command)) def log_additional_debug_data(self, hosts, since, until): @@ -1184,7 +1181,8 @@ def log_additional_debug_data(self, hosts, since, until): to None, in which case it is not utilized. """ command = get_journalctl_command( - since, until, True, identifiers=["kernel", self._systemctl.service.value]) + since, until, True, identifiers=["kernel", self._systemctl.service.value], + run_user=self.job.run_user) details = self.str_log_data(self.get_log_data(hosts, command)) self.log.info("Additional '%s' output:\n%s", command, details) diff --git a/src/tests/ftest/util/launch_utils.py b/src/tests/ftest/util/launch_utils.py index 0f7284c50ef..4df840cd86b 100644 --- a/src/tests/ftest/util/launch_utils.py +++ b/src/tests/ftest/util/launch_utils.py @@ -129,12 +129,14 @@ def setup_systemctl(logger, servers, clients, test_env): __add_systemctl_override( logger, servers, "daos_server.service", "root", os.path.join(test_env.daos_prefix, "bin", "daos_server"), test_env.server_config, - None, None)) + os.environ.get("DAOS_TEST_SYSTEMD_PATH"), + os.environ.get("DAOS_TEST_SYSTEMD_LIBRARY_PATH"))) systemctl_configs.update( __add_systemctl_override( logger, clients, "daos_agent.service", test_env.agent_user, os.path.join(test_env.daos_prefix, "bin", "daos_agent"), test_env.agent_config, - None, None)) + os.environ.get("DAOS_TEST_SYSTEMD_PATH"), + os.environ.get("DAOS_TEST_SYSTEMD_LIBRARY_PATH"))) return systemctl_configs @@ -602,7 +604,7 @@ def _setup_test_directory(self, logger, test): hosts.add(self.local_host) logger.debug("Setting up '%s' on %s:", test_env.log_dir, hosts) commands = [ - f"sudo -n rm -fr {test_env.log_dir}", + f"rm -fr {test_env.log_dir}", f"mkdir -p {test_env.log_dir}", f"chmod a+wrx {test_env.log_dir}", ] @@ -612,11 +614,11 @@ def _setup_test_directory(self, logger, test): directories.append(os.path.join(test_env.log_dir, directory)) commands.append(f"mkdir -p {' '.join(directories)}") commands.append(f"ls -al {test_env.log_dir}") - for command in commands: - if not run_remote(logger, hosts, command).passed: - message = "Error setting up the common test directory on all hosts" - self.test_result.fail_test(logger, "Prepare", message, sys.exc_info()) - return 128 + command = " && ".join(commands) + if not run_remote(logger, hosts, command).passed: + message = "Error setting up the common test directory on all hosts" + self.test_result.fail_test(logger, "Prepare", message, sys.exc_info()) + return 128 return 0 def _user_setup(self, logger, test, create=False): diff --git a/src/tests/ftest/util/network_utils.py b/src/tests/ftest/util/network_utils.py index e3802364d8f..d2ee78ea269 100644 --- a/src/tests/ftest/util/network_utils.py +++ b/src/tests/ftest/util/network_utils.py @@ -10,6 +10,7 @@ from ClusterShell.NodeSet import NodeSet # pylint: disable=import-error,no-name-in-module +from util.host_utils import get_local_host from util.run_utils import run_remote # Order here is used to select default provider in environment_utils @@ -398,6 +399,7 @@ def get_fastest_interface(logger, hosts, verbose=True): Returns: str: the fastest active interface common to all hosts specified """ + hosts = NodeSet(hosts) | NodeSet(get_local_host()) common_interfaces = get_common_interfaces(logger, hosts, verbose) # Find the speed of each common active interface in order to be able to choose the fastest diff --git a/src/tests/ftest/util/run_utils.py b/src/tests/ftest/util/run_utils.py index 2f9d33b07c5..d1ac0b791c1 100644 --- a/src/tests/ftest/util/run_utils.py +++ b/src/tests/ftest/util/run_utils.py @@ -345,7 +345,8 @@ def log_result_data(log, data): log.debug("%s%s", " " * indent, line) -def get_clush_command(hosts, args=None, command="", command_env=None, command_sudo=False): +def get_clush_command(hosts, args=None, command="", command_env=None, command_sudo=False, + timeout=None, fanout=None): """Get the clush command with optional sudo arguments. Args: @@ -355,11 +356,21 @@ def get_clush_command(hosts, args=None, command="", command_env=None, command_su command_env (EnvironmentVariables, optional): environment variables to export with the command. Defaults to None. sudo (bool, optional): whether to run the command with sudo privileges. Defaults to False. + timeout (int, optional): number of seconds to wait for the command to complete. + Defaults to None. + fanout (int, optional): fanout to use. Default uses the max of the + clush default (64) or available cores Returns: str: the clush command """ + if fanout is None: + fanout = max(64, len(os.sched_getaffinity(0))) cmd_list = ["clush"] + if timeout is not None: + cmd_list.extend(["-u", str(timeout)]) + if fanout is not None: + cmd_list.extend(["-f", str(fanout)]) if args: cmd_list.append(args) cmd_list.extend(["-w", str(hosts)]) @@ -453,8 +464,9 @@ def run_remote(log, hosts, command, verbose=True, timeout=120, task_debug=False, if fanout is None: fanout = max(task.info('fanout'), len(os.sched_getaffinity(0))) task.set_info('fanout', fanout) - # Enable forwarding of the ssh authentication agent connection - task.set_info("ssh_options", "-oForwardAgent=yes") + # Enable forwarding of the ssh authentication agent connection. + # Force pseudo-terminal allocation so timed-out commands are killed remotely. + task.set_info("ssh_options", "-oForwardAgent=yes -q -t -t") if verbose: if timeout is None: log.debug("Running on %s without a timeout: %s", hosts, command) @@ -571,7 +583,7 @@ def stop_processes(log, hosts, pattern, verbose=True, timeout=60, exclude=None, log.debug( "Killing%s any processes on %s that match %s and then waiting %s seconds", step[0], result.passed_hosts, pattern_match, step[1]) - kill_command = f"sudo /usr/bin/pkill{step[0]} {pattern}" + kill_command = f"sudo -n /usr/bin/pkill{step[0]} {pattern}" run_remote(log, result.passed_hosts, kill_command, verbose, timeout) time.sleep(step[1]) result = run_remote(log, result.passed_hosts, command, verbose, timeout) diff --git a/src/tests/ftest/util/server_utils.py b/src/tests/ftest/util/server_utils.py index 752473021a3..649885ac974 100644 --- a/src/tests/ftest/util/server_utils.py +++ b/src/tests/ftest/util/server_utils.py @@ -5,7 +5,6 @@ """ # pylint: disable=too-many-lines -import os import random import time from getpass import getuser @@ -46,6 +45,7 @@ def get_server_command(group, cert_dir, bin_dir, config_file, config_temp=None): common_config = CommonConfig(group, transport_config) config = DaosServerYamlParameters(config_file, common_config) command = DaosServerCommand(bin_dir, config, None) + command.run_user = "root" if config_temp: # Setup the DaosServerCommand to write the config file data to the # temporary file and then copy the file to all the hosts using the @@ -104,7 +104,8 @@ def __init__(self, group, bin_dir, # Dmg command to access this group of servers which will be configured # to access the daos_servers when they are started self.dmg = get_dmg_command( - group, dmg_cert_dir, bin_dir, dmg_config_file, dmg_config_temp, access_points_suffix) + group, dmg_cert_dir, bin_dir, dmg_config_file, dmg_config_temp, access_points_suffix, + getuser()) # Set the correct certificate file ownership if manager == "Systemctl": @@ -262,7 +263,7 @@ def prepare(self, storage=True): self.manager.mca.update({"plm_rsh_args": "-l root"}, "orterun.mca", True) # Verify the socket directory exists when using a non-systemctl manager - self.verify_socket_directory(getuser()) + self.verify_socket_directory(self.manager.job.certificate_owner) def clean_files(self, verbose=True): """Clean up the daos server files. @@ -1156,10 +1157,9 @@ def get_daos_metrics(self, verbose=False, timeout=60): """ engines_per_host = self.get_config_value("engines_per_host") or 1 engines = [] - daos_metrics_exe = os.path.join(self.manager.job.command_path, "daos_metrics") for engine in range(engines_per_host): results = run_pcmd( hosts=self._hosts, verbose=verbose, timeout=timeout, - command="sudo {} -S {} --csv".format(daos_metrics_exe, engine)) + command=f"sudo daos_metrics -S {engine} --csv") engines.append(results) return engines diff --git a/src/tests/ftest/util/server_utils_base.py b/src/tests/ftest/util/server_utils_base.py index bbda608faad..8e211d2bc00 100644 --- a/src/tests/ftest/util/server_utils_base.py +++ b/src/tests/ftest/util/server_utils_base.py @@ -34,7 +34,7 @@ class DaosServerCommand(YamlCommand): DEFAULT_CONFIG_FILE = os.path.join(os.sep, "etc", "daos", "daos_server.yml") - def __init__(self, path="", yaml_cfg=None, timeout=45): + def __init__(self, path="", yaml_cfg=None, timeout=45, run_user=None): """Create a daos_server command object. Args: @@ -43,9 +43,11 @@ def __init__(self, path="", yaml_cfg=None, timeout=45): Defaults to None. timeout (int, optional): number of seconds to wait for patterns to appear in the subprocess output. Defaults to 45 seconds. + run_user (str, optional): user to run as. Defaults to None, which will run commands as + the current user. """ super().__init__( - "/run/daos_server/*", "daos_server", path, yaml_cfg, timeout) + "/run/daos_server/*", "daos_server", path, yaml_cfg, timeout, run_user) self.pattern = self.NORMAL_PATTERN # Command line parameters: diff --git a/src/tests/ftest/util/server_utils_params.py b/src/tests/ftest/util/server_utils_params.py index 440ffe68f82..b80201a6fdf 100644 --- a/src/tests/ftest/util/server_utils_params.py +++ b/src/tests/ftest/util/server_utils_params.py @@ -502,6 +502,7 @@ def __init__(self, base_namespace, index, provider=None, max_storage_tiers=MAX_S if name in self.REQUIRED_ENV_VARS: default_env_vars.extend(self.REQUIRED_ENV_VARS[name]) self.env_vars = BasicParameter(None, default_env_vars) + self.env_pass_through = BasicParameter(None, None) # the storage configuration for this engine self.storage = StorageYamlParameters(self.namespace, max_storage_tiers) diff --git a/src/tests/ftest/util/soak_test_base.py b/src/tests/ftest/util/soak_test_base.py index f32e068cb16..5716a9c5ee1 100644 --- a/src/tests/ftest/util/soak_test_base.py +++ b/src/tests/ftest/util/soak_test_base.py @@ -570,7 +570,7 @@ def run_soak(self, test_param): resv_bytes = self.params.get("resv_bytes", test_param + "*", 500000000) ignore_soak_errors = self.params.get("ignore_soak_errors", test_param + "*", False) self.enable_il = self.params.get("enable_intercept_lib", test_param + "*", False) - self.sudo_cmd = "sudo" if enable_sudo else "" + self.sudo_cmd = "sudo -n" if enable_sudo else "" self.enable_remote_logging = self.params.get( "enable_remote_logging", os.path.join(test_param, "*"), False) self.enable_scrubber = self.params.get( diff --git a/src/tests/ftest/util/soak_utils.py b/src/tests/ftest/util/soak_utils.py index 9e523c6096c..397d5a77008 100644 --- a/src/tests/ftest/util/soak_utils.py +++ b/src/tests/ftest/util/soak_utils.py @@ -15,6 +15,7 @@ import slurm_utils from avocado.core.exceptions import TestFail from avocado.utils.distro import detect +from command_utils import command_as_user from command_utils_base import EnvironmentVariables from daos_racer_utils import DaosRacerCommand from data_mover_utils import DcpCommand, FsCopy @@ -497,7 +498,7 @@ def launch_reboot(self, pools, name, results, args): self.log.info( "<<>>\n", self.loop, name, ranks, time.ctime()) # reboot host in 1 min - result = run_remote(self.log, reboot_host, "sudo shutdown -r +1") + result = run_remote(self.log, reboot_host, command_as_user("shutdown -r +1", "root")) if result.passed: status = True else: @@ -536,7 +537,8 @@ def launch_reboot(self, pools, name, results, args): # issue a restart self.log.info("<<>>\n", self.loop, name, reboot_host, time.ctime()) - cmd_results = run_remote(self.log, reboot_host, "sudo systemctl restart daos_server") + cmd_results = run_remote( + self.log, reboot_host, command_as_user("systemctl restart daos_server", "root")) if cmd_results.passed: self.dmg_command.system_query() for pool in pools: diff --git a/src/tests/ftest/util/systemctl_utils.py b/src/tests/ftest/util/systemctl_utils.py index 9ee809a1143..32b5ec9f992 100644 --- a/src/tests/ftest/util/systemctl_utils.py +++ b/src/tests/ftest/util/systemctl_utils.py @@ -125,7 +125,7 @@ def get_systemctl_command(unit_command, service, user="root"): """ command = ["systemctl"] if user != "root": - command.append(f"--user {user}") + command.append("--user") if unit_command: command.append(unit_command) if service: @@ -190,6 +190,10 @@ def create_override_config(logger, hosts, service, user, service_command, servic Returns: str: the systemctl override config file path """ + # TODO proper + # Reload since teardown removes the files + daemon_reload(logger, hosts, user, verbose, timeout) + # Get the existing service file service_file = get_service_file(logger, hosts, service, user, verbose, timeout) From 9a65ac2b5915d70a36bb84ab9a81cea190c2a48a Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Wed, 24 Jul 2024 19:41:37 +0000 Subject: [PATCH 04/12] export daos-related envs in run_remote Skip-test: true Skip-build: true Signed-off-by: Dalton Bohning --- src/tests/ftest/util/run_utils.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/src/tests/ftest/util/run_utils.py b/src/tests/ftest/util/run_utils.py index d1ac0b791c1..a57e5d80c85 100644 --- a/src/tests/ftest/util/run_utils.py +++ b/src/tests/ftest/util/run_utils.py @@ -438,6 +438,35 @@ def run_local(log, command, verbose=True, timeout=None, stderr=False, capture_ou return results +def daos_env_str(env): + """Return a copy of an env including only daos-relevant variables. + + TODO ideally should be under EnvironmentVariables but the imports are broken + + Args: + enc (dict): the original env + + Returns: + str: a copy of env env including only daos-relevant variables, + converted to an export str + """ + def _include(key): + return key.startswith('FI_') or \ + key.startswith('OFI_') or \ + key.startswith('DAOS_') or \ + key.startswith('D_') or \ + key.startswith('CRT_') or \ + key.startswith('DD_') or \ + key.startswith('MPI') or \ + key in ('PATH', 'LD_LIBRARY_PATH') + export_str = ';'.join( + f'export {key}' if value is None else "export {}='{}'".format(key, value) + for key, value in env.items() if _include(key)) + if export_str: + export_str = "".join([export_str, ';']) + return export_str + + def run_remote(log, hosts, command, verbose=True, timeout=120, task_debug=False, stderr=False, fanout=None): """Run the command on the remote hosts. @@ -472,6 +501,8 @@ def run_remote(log, hosts, command, verbose=True, timeout=120, task_debug=False, log.debug("Running on %s without a timeout: %s", hosts, command) else: log.debug("Running on %s with a %s second timeout: %s", hosts, timeout, command) + env_str = daos_env_str(os.environ) + command = f'{env_str}{command}' task.run(command=command, nodes=hosts, timeout=timeout) results = CommandResult(command, task) if verbose: From b39444cb836eae4832292761d1739e23a6d9521b Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Fri, 2 Aug 2024 16:22:54 +0000 Subject: [PATCH 05/12] export daos-related envs in soak clush dfuse Skip-build: true Signed-off-by: Dalton Bohning --- src/tests/ftest/util/soak_utils.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/tests/ftest/util/soak_utils.py b/src/tests/ftest/util/soak_utils.py index 397d5a77008..e2abe4ca0ad 100644 --- a/src/tests/ftest/util/soak_utils.py +++ b/src/tests/ftest/util/soak_utils.py @@ -33,7 +33,7 @@ from mdtest_utils import MdtestCommand from oclass_utils import extract_redundancy_factor from pydaos.raw import DaosApiError, DaosSnapshot -from run_utils import run_remote +from run_utils import daos_env_str, run_remote from test_utils_container import add_container H_LOCK = threading.Lock() @@ -857,7 +857,8 @@ def start_dfuse(self, pool, container, name=None, job_spec=None): self.soak_log_dir, self.test_name + "_" + name + "_`hostname -s`_" "" + "${SLURM_JOB_ID}_" + "daos_dfuse.log") - dfuse_env = f"export D_LOG_FILE_APPEND_PID=1;export D_LOG_MASK=ERR;export D_LOG_FILE={dfuselog}" + dfuse_env = daos_env_str(os.environ) + \ + f"export D_LOG_FILE_APPEND_PID=1;export D_LOG_MASK=ERR;export D_LOG_FILE={dfuselog}" module_load = f"module use {self.mpi_module_use};module load {self.mpi_module}" dfuse_start_cmds = [ @@ -886,9 +887,10 @@ def stop_dfuse(dfuse, vol=False): "do daos container destroy --path={0}/\"$file\" ; done".format( dfuse.mount_dir.value)]) + dfuse_env = daos_env_str(os.environ) dfuse_stop_cmds.extend([ - "clush -S -w $SLURM_JOB_NODELIST \"fusermount3 -uz {0}\"".format(dfuse.mount_dir.value), - "clush -S -w $SLURM_JOB_NODELIST \"rm -rf {0}\"".format(dfuse.mount_dir.value)]) + f'clush -S -w $SLURM_JOB_NODELIST "{dfuse_env}fusermount3 -uz {dfuse.mount_dir.value}"', + f'clush -S -w $SLURM_JOB_NODELIST "rm -rf {dfuse.mount_dir.value}"']) return dfuse_stop_cmds From 2a43e641a3e7ab72458dbf3480772f29df5f60bb Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Thu, 25 Jul 2024 19:49:52 +0000 Subject: [PATCH 06/12] update agent_failure to run journalctl as user Skip-test: true Skip-build: true Signed-off-by: Dalton Bohning --- src/tests/ftest/deployment/agent_failure.py | 6 +++--- src/tests/ftest/util/general_utils.py | 6 ++++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/tests/ftest/deployment/agent_failure.py b/src/tests/ftest/deployment/agent_failure.py index d1964df3c27..22cbbbdaa79 100644 --- a/src/tests/ftest/deployment/agent_failure.py +++ b/src/tests/ftest/deployment/agent_failure.py @@ -122,7 +122,7 @@ def test_agent_failure(self): # 5. Verify journalctl shows the log that the agent is stopped. results = get_journalctl( hosts=self.hostlist_clients, since=since, until=until, - journalctl_type="daos_agent") + journalctl_type="daos_agent", run_user=self.test_env.agent_user) self.log.info("journalctl results = %s", results) if "shutting down" not in results[0]["data"]: msg = "Agent shut down message not found in journalctl! Output = {}".format( @@ -240,7 +240,7 @@ def test_agent_failure_isolation(self): # stopped. results = get_journalctl( hosts=[agent_host_kill], since=since, until=until, - journalctl_type="daos_agent") + journalctl_type="daos_agent", run_user=self.test_env.agent_user) self.log.info("journalctl results (kill) = %s", results) if "shutting down" not in results[0]["data"]: msg = ("Agent shut down message not found in journalctl on killed client! " @@ -251,7 +251,7 @@ def test_agent_failure_isolation(self): # in the previous step doesn't show that the agent is stopped. results = get_journalctl( hosts=[agent_host_keep], since=since, until=until, - journalctl_type="daos_agent") + journalctl_type="daos_agent", run_user=self.test_env.agent_user) self.log.info("journalctl results (keep) = %s", results) if "shutting down" in results[0]["data"]: msg = ("Agent shut down message found in journalctl on keep client! " diff --git a/src/tests/ftest/util/general_utils.py b/src/tests/ftest/util/general_utils.py index 5dcaf11d7f3..09f4004aecb 100644 --- a/src/tests/ftest/util/general_utils.py +++ b/src/tests/ftest/util/general_utils.py @@ -1062,7 +1062,7 @@ def get_journalctl_command(since, until=None, system=False, units=None, identifi return command_as_user(" ".join(command), run_user) -def get_journalctl(hosts, since, until, journalctl_type): +def get_journalctl(hosts, since, until, journalctl_type, run_user="root"): """Run the journalctl on the hosts. Args: @@ -1070,6 +1070,7 @@ def get_journalctl(hosts, since, until, journalctl_type): since (str): Start time to search the log. until (str): End time to search the log. journalctl_type (str): String to search in the log. -t param for journalctl. + run_user (str, optional): user to run as. Defaults to root Returns: list: a list of dictionaries containing the following key/value pairs: @@ -1077,7 +1078,8 @@ def get_journalctl(hosts, since, until, journalctl_type): "data": data requested for the group of hosts """ - command = get_journalctl_command(since, until, True, identifiers=journalctl_type) + system = run_user != getuser() + command = get_journalctl_command(since, until, system, identifiers=journalctl_type, run_user=run_user) err = "Error gathering system log events" return get_host_data(hosts=hosts, command=command, text="journalctl", error=err) From 00367666e97ac0cada393d1188c64f49e6a2fdf4 Mon Sep 17 00:00:00 2001 From: Makito Kano Date: Wed, 10 Jul 2024 00:28:37 +0900 Subject: [PATCH 07/12] DAOS-16076 test: Automate dmg scale test to be run on Aurora (#14616) #15126 Skip-test: true Skip-build: true Steps: 1. Format storages 2. System query 3. Create a 100% pool that spans all engines 4. Pool query 5. Pool destroy 6. Create 49 pools spanning all the engines with each pool using a 1/50th of the capacity 7. Pool list 8. Get around 80 pool metrics 9. Destroy all 49 pools 10. System stop 11. System start Skip-unit-tests: true Skip-fault-injection-test: true Signed-off-by: Makito Kano Signed-off-by: Dalton Bohning --- src/tests/ftest/control/dmg_scale.py | 182 +++++++++++++++++++++++++ src/tests/ftest/control/dmg_scale.yaml | 37 +++++ 2 files changed, 219 insertions(+) create mode 100644 src/tests/ftest/control/dmg_scale.py create mode 100644 src/tests/ftest/control/dmg_scale.yaml diff --git a/src/tests/ftest/control/dmg_scale.py b/src/tests/ftest/control/dmg_scale.py new file mode 100644 index 00000000000..5a268a798c7 --- /dev/null +++ b/src/tests/ftest/control/dmg_scale.py @@ -0,0 +1,182 @@ +""" + (C) Copyright 2024 Intel Corporation. + + SPDX-License-Identifier: BSD-2-Clause-Patent +""" +from apricot import TestWithServers +from telemetry_utils import TelemetryUtils +from test_utils_pool import time_pool_create + +ENGINE_POOL_METRICS_SHORT = [ + "engine_pool_entries_dtx_batched_degree", + "engine_pool_entries_dtx_batched_total", + "engine_pool_ops_akey_enum", + "engine_pool_ops_akey_punch", + "engine_pool_ops_compound", + "engine_pool_ops_dkey_enum", + "engine_pool_ops_dkey_punch", + "engine_pool_ops_dtx_abort", + "engine_pool_ops_dtx_check", + "engine_pool_ops_dtx_commit", + "engine_pool_ops_dtx_refresh", + "engine_pool_ops_ec_agg", + "engine_pool_ops_ec_rep", + "engine_pool_ops_fetch", + "engine_pool_ops_key_query", + "engine_pool_ops_migrate", + "engine_pool_ops_obj_enum", + "engine_pool_ops_obj_punch", + "engine_pool_ops_obj_sync", + "engine_pool_ops_recx_enum", + "engine_pool_ops_tgt_akey_punch", + "engine_pool_ops_tgt_dkey_punch", + "engine_pool_ops_tgt_punch", + "engine_pool_ops_tgt_update", + "engine_pool_ops_update", + "engine_pool_ops_pool_connect", + "engine_pool_ops_pool_disconnect", + "engine_pool_ops_pool_evict", + "engine_pool_ops_pool_query", + "engine_pool_ops_pool_query_space", + "engine_pool_resent", + "engine_pool_restarted", + "engine_pool_retry", + "engine_pool_scrubber_busy_time", + "engine_pool_scrubber_bytes_scrubbed_current", + "engine_pool_scrubber_bytes_scrubbed_prev", + "engine_pool_scrubber_bytes_scrubbed_total", + "engine_pool_scrubber_corruption_current", + "engine_pool_scrubber_corruption_total", + "engine_pool_scrubber_csums_current", + "engine_pool_scrubber_csums_prev", + "engine_pool_scrubber_csums_total", + "engine_pool_scrubber_next_csum_scrub", + "engine_pool_scrubber_next_tree_scrub", + "engine_pool_scrubber_prev_duration", + "engine_pool_scrubber_prev_duration_max", + "engine_pool_scrubber_prev_duration_mean", + "engine_pool_scrubber_prev_duration_min", + "engine_pool_scrubber_prev_duration_stddev", + "engine_pool_scrubber_scrubber_started", + "engine_pool_scrubber_scrubs_completed", + "engine_pool_started_at", + "engine_pool_vos_aggregation_akey_deleted", + "engine_pool_vos_aggregation_akey_scanned", + "engine_pool_vos_aggregation_akey_skipped", + "engine_pool_vos_aggregation_csum_errors", + "engine_pool_vos_aggregation_deleted_ev", + "engine_pool_vos_aggregation_deleted_sv", + "engine_pool_vos_aggregation_dkey_deleted", + "engine_pool_vos_aggregation_dkey_scanned", + "engine_pool_vos_aggregation_dkey_skipped", + "engine_pool_vos_aggregation_epr_duration", + "engine_pool_vos_aggregation_epr_duration_max", + "engine_pool_vos_aggregation_epr_duration_mean", + "engine_pool_vos_aggregation_epr_duration_min", + "engine_pool_vos_aggregation_epr_duration_stddev", + "engine_pool_vos_aggregation_merged_recs", + "engine_pool_vos_aggregation_merged_size", + "engine_pool_vos_aggregation_obj_deleted", + "engine_pool_vos_aggregation_obj_scanned", + "engine_pool_vos_aggregation_obj_skipped", + "engine_pool_vos_aggregation_uncommitted", + "engine_pool_vos_space_nvme_used", + "engine_pool_vos_space_scm_used", + "engine_pool_xferred_fetch", + "engine_pool_xferred_update", + "engine_pool_EC_update_full_stripe", + "engine_pool_EC_update_partial", + "engine_pool_block_allocator_alloc_hint", + "engine_pool_block_allocator_alloc_large", + "engine_pool_block_allocator_alloc_small", + "engine_pool_block_allocator_frags_aging", + "engine_pool_block_allocator_frags_large", + "engine_pool_block_allocator_frags_small", + "engine_pool_block_allocator_free_blks", + "engine_pool_ops_key2anchor" +] + + +class DmgScale(TestWithServers): + """Verify dmg commands works as expected in a large scale system. + + :avocado: recursive + """ + + def test_dmg_scale(self): + """Run the following steps and manually collect duration for each step. + + 0. Format storage + 1. System query + 2. Create a 100% pool that spans all engines + 3. Pool query + 4. Pool destroy + 5. Create 49 pools spanning all the engines with each pool using a 1/50th of the capacity + 6. Pool list + 7. Query around 80 pool metrics + 8. Destroy all 49 pools + 9. System stop + 10. System start + + Jira ID: DAOS-10508. + + :avocado: tags=all,manual + :avocado: tags=deployment + :avocado: tags=DmgScale,test_dmg_scale + """ + # This is a manual test and we need to find the durations from job.log, so add "##" to make + # it easy to search. The log is usually over 1 million lines. + self.log_step("## System query") + dmg_command = self.get_dmg_command() + dmg_command.system_query() + + self.log_step("## Create a 100% pool that spans all engines") + pool = self.get_pool(namespace="/run/pool_100/*", create=False) + duration = time_pool_create(log=self.log, number=1, pool=pool) + self.log.info("## Single pool create duration = %.1f", duration) + + self.log_step("## Pool query") + pool.query() + + self.log_step("## Pool destroy") + pool.destroy() + + quantity = self.params.get("quantity", "/run/pool_small/*", 1) + msg = (f"## Create {quantity} small pools spanning all the engines where the pools fill up " + f"the capacity") + self.log_step(msg) + pool_0 = self.get_pool(namespace="/run/pool_small/*", create=False) + duration_0 = time_pool_create(log=self.log, number=0, pool=pool_0) + pools = [pool_0] + durations = [duration_0] + for count in range(1, quantity): + pools.append(self.get_pool(create=False)) + # Use the SCM and NVMe size of the first pool for the rest of the (quantity - 1) pools. + pools[-1].scm_size.update(pool_0.scm_per_rank) + pools[-1].nvme_size.update(pool_0.nvme_per_rank) + durations.append(time_pool_create(log=self.log, number=count, pool=pools[-1])) + msg = (f"Pool {count} created. SCM = {pools[-1].scm_per_rank}; " + f"NVMe = {pools[-1].nvme_per_rank}") + self.log.info(msg) + self.log.info("## durations = %s", durations) + total_duration = sum(durations) + self.log.info("## %d pools create duration = %.1f", quantity, total_duration) + + self.log_step("## Pool list") + dmg_command.pool_list() + + self.log_step("## Query around 80 pool metrics") + # To save time and logs, call telemetry on the first host only. With the 80 pool metrics + # above, ~100K lines are printed per host. + telemetry_utils = TelemetryUtils( + dmg=dmg_command, servers=[self.server_managers[0].hosts[0]]) + telemetry_utils.get_metrics(name=",".join(ENGINE_POOL_METRICS_SHORT)) + + self.log_step(f"## Destroy all {quantity} pools") + self.destroy_pools(pools=pools) + + self.log_step("## System stop") + self.server_managers[0].system_stop() + + self.log_step("## System start") + self.server_managers[0].system_start() diff --git a/src/tests/ftest/control/dmg_scale.yaml b/src/tests/ftest/control/dmg_scale.yaml new file mode 100644 index 00000000000..84f4e35bc4d --- /dev/null +++ b/src/tests/ftest/control/dmg_scale.yaml @@ -0,0 +1,37 @@ +# Note: We usually use the extra yaml in aurora-tools, but that extra yaml has test_clients while +# this test doesn't need any client, so update the extra yaml or provide some dummy client to -tc. +hosts: + test_servers: 256 + +timeout: 900 + +daos_server: + pattern_timeout: 60 + +server_config: + name: daos_server + engines_per_host: 2 + engines: + 0: + pinned_numa_node: 0 + nr_xs_helpers: 1 + fabric_iface: ib0 + fabric_iface_port: 31317 + log_file: daos_server0.log + storage: auto + targets: 8 + 1: + pinned_numa_node: 1 + nr_xs_helpers: 1 + fabric_iface: ib1 + fabric_iface_port: 31417 + log_file: daos_server1.log + storage: auto + targets: 8 + +pool_100: + size: 100% +pool_small: + size: 2% + # If we use --size=2% during pool create, we can only create up to 49 pools. + quantity: 49 From 938c31020cd99326e600c5bde0908fdd7af663a3 Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Tue, 17 Sep 2024 13:35:34 -0700 Subject: [PATCH 08/12] DAOS-16550 test: use correct stonewall file with mdtest (#15109) Use a dynamic stonewall file with mdtest instead of hardcoded path. Skip-test: true Skip-build: true Signed-off-by: Dalton Bohning --- src/tests/ftest/deployment/basic_checkout.yaml | 2 +- src/tests/ftest/performance/mdtest_easy.yaml | 2 +- src/tests/ftest/performance/mdtest_hard.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/tests/ftest/deployment/basic_checkout.yaml b/src/tests/ftest/deployment/basic_checkout.yaml index 03d420ab82b..7ce9515bae8 100644 --- a/src/tests/ftest/deployment/basic_checkout.yaml +++ b/src/tests/ftest/deployment/basic_checkout.yaml @@ -70,7 +70,7 @@ mdtest_easy: &mdtest_easy_base write_bytes: 0 num_of_files_dirs: 100000000 stonewall_timer: 30 - stonewall_statusfile: "/var/tmp/daos_testing/stoneWallingStatusFile" + stonewall_statusfile: stoneWallingStatusFile dfs_destroy: false mdtest_dfs_s1: <<: *mdtest_easy_base diff --git a/src/tests/ftest/performance/mdtest_easy.yaml b/src/tests/ftest/performance/mdtest_easy.yaml index 8fdd27031c2..a81db811686 100644 --- a/src/tests/ftest/performance/mdtest_easy.yaml +++ b/src/tests/ftest/performance/mdtest_easy.yaml @@ -46,7 +46,7 @@ mdtest: &mdtest_base write_bytes: 0 num_of_files_dirs: 100000000 stonewall_timer: 30 - stonewall_statusfile: "/var/tmp/daos_testing/stoneWallingStatusFile" + stonewall_statusfile: stoneWallingStatusFile dfs_destroy: false mdtest_s1: &mdtest_s1 diff --git a/src/tests/ftest/performance/mdtest_hard.yaml b/src/tests/ftest/performance/mdtest_hard.yaml index ae3fcebaf5c..2bf5e0d73ca 100644 --- a/src/tests/ftest/performance/mdtest_hard.yaml +++ b/src/tests/ftest/performance/mdtest_hard.yaml @@ -47,7 +47,7 @@ mdtest: &mdtest_base write_bytes: 3901 num_of_files_dirs: 100000000 stonewall_timer: 30 - stonewall_statusfile: "/var/tmp/daos_testing/stoneWallingStatusFile" + stonewall_statusfile: stoneWallingStatusFile dfs_destroy: false mdtest_s1: &mdtest_s1 From 2cd69aa29b445eccaf287df9e5c0e2edee5496ae Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Tue, 17 Sep 2024 15:31:14 -0700 Subject: [PATCH 09/12] DAOS-623 test: Support running independent io sys admin steps (#15134) Support running independent io sys admin steps from the yaml. Skip-test: true Skip-build: true Signed-off-by: Dalton Bohning --- src/tests/ftest/deployment/io_sys_admin.py | 140 +++++++++++-------- src/tests/ftest/deployment/io_sys_admin.yaml | 8 ++ 2 files changed, 89 insertions(+), 59 deletions(-) diff --git a/src/tests/ftest/deployment/io_sys_admin.py b/src/tests/ftest/deployment/io_sys_admin.py index bca8373ba5c..265c1ad42f3 100644 --- a/src/tests/ftest/deployment/io_sys_admin.py +++ b/src/tests/ftest/deployment/io_sys_admin.py @@ -40,66 +40,88 @@ def test_io_sys_admin(self): new_cont_user = self.params.get("user", "/run/container_set_owner/*") new_cont_group = self.params.get("group", "/run/container_set_owner/*") + # Toggle independent steps + steps_to_run = { + "pool_create_ownership": True, + "storage_system_query": True, + "io": True, + "snapshot": True, + "datamover": True + } + for step in steps_to_run: + run = self.params.get(step, "/run/io_sys_admin/steps_to_run/*", None) + if run is not None: + steps_to_run[step] = run + dmg = self.get_dmg_command() daos = self.get_daos_command() - for idx in range(1, 4): - pool = self.get_pool(namespace=f"/run/pool_{idx}/", create=False) - check_pool_creation(self, [pool], 60) - containers = [] - for cont_idx in range(1, 4): - containers.append( - self.get_container(pool, namespace=f"/run/container_{cont_idx}/")) - containers[-1].set_owner(f"{new_cont_user}@", f"{new_cont_group}@") - - daos.container_list(pool.identifier) - self.destroy_containers(containers) - pool.destroy() - - # dmg storage scan - dmg.storage_scan() - dmg.system_query() - dmg.system_leader_query() - - # write large data sets - self.run_file_count() - # create snapshot - self.container[-1].create_snap() - # overwrite the last ior file - self.ior_cmd.signature.update('456') - self.processes = self.ior_np - self.ppn = self.ior_ppn - self.run_ior_with_pool(create_pool=False, create_cont=False) - - nvme_free_space_before_snap_destroy = self.get_free_space()[1] - # delete snapshot - self.container[-1].destroy_snap(epc=self.container[-1].epoch) - # Now check if the space is returned back. - counter = 1 - returned_space = self.get_free_space()[1] - nvme_free_space_before_snap_destroy - - data_written = (int(self.ppn) * human_to_bytes(self.ior_cmd.block_size.value)) - while returned_space < int(data_written): - # try to wait for 4 x 60 secs for aggregation to be completed or - # else exit the test with a failure. - if counter > 4: - self.log.info("Free space before snapshot destroy: %s", - nvme_free_space_before_snap_destroy) - self.log.info("Free space when test terminated: %s", - self.get_free_space()[1]) - self.fail("Aggregation did not complete as expected") - - time.sleep(60) + if steps_to_run["pool_create_ownership"]: + self.log_step("Verify pool creation time and container set-owner") + for idx in range(1, 4): + pool = self.get_pool(namespace=f"/run/pool_{idx}/", create=False) + check_pool_creation(self, [pool], 60) + containers = [] + for cont_idx in range(1, 4): + containers.append( + self.get_container(pool, namespace=f"/run/container_{cont_idx}/")) + containers[-1].set_owner(f"{new_cont_user}@", f"{new_cont_group}@") + + daos.container_list(pool.identifier) + self.destroy_containers(containers) + pool.destroy() + + if steps_to_run["storage_system_query"]: + self.log_step("Verify storage scan and system query") + dmg.storage_scan() + dmg.system_query() + dmg.system_leader_query() + + if steps_to_run["io"]: + self.log_step("Verifying large dataset IO") + self.run_file_count() + + if steps_to_run["snapshot"]: + self.log_step("Verifying snapshot creation and aggregation") + self.container[-1].create_snap() + # overwrite the last ior file + self.ior_cmd.signature.update('456') + self.processes = self.ior_np + self.ppn = self.ior_ppn + self.run_ior_with_pool(create_pool=False, create_cont=False) + + nvme_free_space_before_snap_destroy = self.get_free_space()[1] + # delete snapshot + self.container[-1].destroy_snap(epc=self.container[-1].epoch) + # Now check if the space is returned back. + counter = 1 returned_space = self.get_free_space()[1] - nvme_free_space_before_snap_destroy - counter += 1 - - self.log.info("#####Starting FS_COPY Test") - self.run_dm_activities_with_ior("FS_COPY", self.pool, self.container[-1]) - self.log.info("#####Starting DCP Test") - self.run_dm_activities_with_ior("DCP", self.pool, self.container[-1]) - self.log.info("#####Starting DSERIAL Test") - self.run_dm_activities_with_ior("DSERIAL", self.pool, self.container[-1]) - self.log.info("#####Starting CONT_CLONE Test") - self.run_dm_activities_with_ior("CONT_CLONE", self.pool, self.container[-1]) - self.log.info("#####Completed all Datamover tests") - self.container.pop(0) + + data_written = (int(self.ppn) * human_to_bytes(self.ior_cmd.block_size.value)) + while returned_space < int(data_written): + # try to wait for 4 x 60 secs for aggregation to be completed or + # else exit the test with a failure. + if counter > 4: + self.log.info( + "Free space before snapshot destroy: %s", + nvme_free_space_before_snap_destroy) + self.log.info( + "Free space when test terminated: %s", self.get_free_space()[1]) + self.fail("Aggregation did not complete as expected") + + time.sleep(60) + returned_space = self.get_free_space()[1] - nvme_free_space_before_snap_destroy + counter += 1 + + if steps_to_run["datamover"]: + self.log_step("Verifying datamover") + self.log.info("#####Starting FS_COPY Test") + self.run_dm_activities_with_ior("FS_COPY", self.pool, self.container[-1]) + self.log.info("#####Starting DCP Test") + self.run_dm_activities_with_ior("DCP", self.pool, self.container[-1]) + self.log.info("#####Starting DSERIAL Test") + self.run_dm_activities_with_ior("DSERIAL", self.pool, self.container[-1]) + self.log.info("#####Starting CONT_CLONE Test") + self.run_dm_activities_with_ior("CONT_CLONE", self.pool, self.container[-1]) + self.log.info("#####Completed all Datamover tests") + self.container.pop(0) diff --git a/src/tests/ftest/deployment/io_sys_admin.yaml b/src/tests/ftest/deployment/io_sys_admin.yaml index 6c3edab15b3..f2a238ad4b5 100644 --- a/src/tests/ftest/deployment/io_sys_admin.yaml +++ b/src/tests/ftest/deployment/io_sys_admin.yaml @@ -104,3 +104,11 @@ dcp: np: 16 hdf5_vol: plugin_path: /usr/lib64/mpich/lib + +io_sys_admin: + steps_to_run: + pool_create_ownership: True + storage_system_query: True + io: True + snapshot: True + datamover: True From b452c443f80ff3ea8d64dd1660f1e4d60daa9ffb Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Tue, 17 Sep 2024 22:09:52 +0000 Subject: [PATCH 10/12] DAOS-16590 test: misc ftest/performance updates - Add variants for mdtest with RP_3G1 - Update ec cell size to 1MiB for mdtest and ior easy Skip-test: true skip-build: true Required-githooks: true Signed-off-by: Dalton Bohning --- src/tests/ftest/performance/ior_easy.yaml | 2 +- src/tests/ftest/performance/mdtest_easy.py | 18 ++++++++++++++++++ src/tests/ftest/performance/mdtest_easy.yaml | 16 +++++++++++++++- src/tests/ftest/performance/mdtest_hard.py | 18 ++++++++++++++++++ src/tests/ftest/performance/mdtest_hard.yaml | 20 ++++++++++++++++++-- 5 files changed, 70 insertions(+), 4 deletions(-) diff --git a/src/tests/ftest/performance/ior_easy.yaml b/src/tests/ftest/performance/ior_easy.yaml index fca6fa3ba70..b846b179277 100644 --- a/src/tests/ftest/performance/ior_easy.yaml +++ b/src/tests/ftest/performance/ior_easy.yaml @@ -28,7 +28,7 @@ server_config: pool: size: 95% - properties: ec_cell_sz:128KiB + properties: ec_cell_sz:1MiB container: type: POSIX diff --git a/src/tests/ftest/performance/mdtest_easy.py b/src/tests/ftest/performance/mdtest_easy.py index 86db9f0c49d..c1a768694af 100644 --- a/src/tests/ftest/performance/mdtest_easy.py +++ b/src/tests/ftest/performance/mdtest_easy.py @@ -33,6 +33,15 @@ def test_performance_mdtest_easy_dfs_ec_16p2g1(self): """ self.run_performance_mdtest(namespace="/run/mdtest_dfs_ec_16p2g1/*") + def test_performance_mdtest_easy_dfs_rp_3g1(self): + """Test Description: Run MDTest Easy, DFS, RP_3G1. + + :avocado: tags=all,manual + :avocado: tags=performance + :avocado: tags=MdtestEasy,test_performance_mdtest_easy_dfs_rp_3g1 + """ + self.run_performance_mdtest(namespace="/run/mdtest_dfs_rp_3g1/*") + def test_performance_mdtest_easy_pil4dfs_s1(self): """Test Description: Run MDTest Easy, dfuse + pil4dfs, S1. @@ -51,3 +60,12 @@ def test_performance_mdtest_easy_pil4dfs_ec_16p2g1(self): :avocado: tags=MdtestEasy,test_performance_mdtest_easy_pil4dfs_ec_16p2g1 """ self.run_performance_mdtest(namespace="/run/mdtest_pil4dfs_ec_16p2g1/*") + + def test_performance_mdtest_easy_pil4dfs_rp_3g1(self): + """Test Description: Run MDTest Easy, dfuse + pil4dfs, RP_3G1. + + :avocado: tags=all,manual + :avocado: tags=performance + :avocado: tags=MdtestEasy,test_performance_mdtest_easy_pil4dfs_rp_3g1 + """ + self.run_performance_mdtest(namespace="/run/mdtest_pil4dfs_rp_3g1/*") diff --git a/src/tests/ftest/performance/mdtest_easy.yaml b/src/tests/ftest/performance/mdtest_easy.yaml index a81db811686..d2925536b79 100644 --- a/src/tests/ftest/performance/mdtest_easy.yaml +++ b/src/tests/ftest/performance/mdtest_easy.yaml @@ -28,7 +28,7 @@ server_config: pool: size: 95% - properties: ec_cell_sz:128KiB + properties: ec_cell_sz:1MiB container: type: POSIX @@ -61,6 +61,12 @@ mdtest_ec_16p2g1: &mdtest_ec_16p2g1 dfs_dir_oclass: RP_3GX dfs_chunk: 16MiB +mdtest_rp_3g1: &mdtest_rp_3g1 + <<: *mdtest_base + dfs_oclass: RP_3G1 + dfs_dir_oclass: RP_3GX + dfs_chunk: 1MiB + mdtest_dfs_s1: api: DFS <<: *mdtest_s1 @@ -69,6 +75,10 @@ mdtest_dfs_ec_16p2g1: api: DFS <<: *mdtest_ec_16p2g1 +mdtest_dfs_rp_3g1: + api: DFS + <<: *mdtest_rp_3g1 + mdtest_pil4dfs_s1: api: POSIX+PIL4DFS # handled by ftest <<: *mdtest_s1 @@ -77,6 +87,10 @@ mdtest_pil4dfs_ec_16p2g1: api: POSIX+PIL4DFS # handled by ftest <<: *mdtest_ec_16p2g1 +mdtest_pil4dfs_rp_3g1: + api: POSIX+PIL4DFS # handled by ftest + <<: *mdtest_rp_3g1 + dfuse: disable_caching: true diff --git a/src/tests/ftest/performance/mdtest_hard.py b/src/tests/ftest/performance/mdtest_hard.py index 2eebc5738a8..a1bf2ec3076 100644 --- a/src/tests/ftest/performance/mdtest_hard.py +++ b/src/tests/ftest/performance/mdtest_hard.py @@ -33,6 +33,15 @@ def test_performance_mdtest_hard_dfs_ec_16p2g1(self): """ self.run_performance_mdtest(namespace="/run/mdtest_dfs_ec_16p2g1/*") + def test_performance_mdtest_hard_dfs_rp_3g1(self): + """Test Description: Run MdTest Hard, DFS, RP_3G1. + + :avocado: tags=all,manual + :avocado: tags=performance + :avocado: tags=MdtestHard,test_performance_mdtest_hard_dfs_rp_3g1 + """ + self.run_performance_mdtest(namespace="/run/mdtest_dfs_rp_3g1/*") + def test_performance_mdtest_hard_pil4dfs_s1(self): """Test Description: Run MDTest Hard, dfuse + pil4dfs, S1. @@ -51,3 +60,12 @@ def test_performance_mdtest_hard_pil4dfs_ec_16p2g1(self): :avocado: tags=MdtestHard,test_performance_mdtest_hard_pil4dfs_ec_16p2g1 """ self.run_performance_mdtest(namespace="/run/mdtest_pil4dfs_ec_16p2g1/*") + + def test_performance_mdtest_hard_pil4dfs_rp_3g1(self): + """Test Description: Run MDTest Hard, dfuse + pil4dfs, RP_3G1. + + :avocado: tags=all,manual + :avocado: tags=performance + :avocado: tags=MdtestHard,test_performance_mdtest_hard_pil4dfs_rp_3g1 + """ + self.run_performance_mdtest(namespace="/run/mdtest_pil4dfs_rp_3g1/*") diff --git a/src/tests/ftest/performance/mdtest_hard.yaml b/src/tests/ftest/performance/mdtest_hard.yaml index 2bf5e0d73ca..0599ea61319 100644 --- a/src/tests/ftest/performance/mdtest_hard.yaml +++ b/src/tests/ftest/performance/mdtest_hard.yaml @@ -28,8 +28,7 @@ server_config: pool: size: 95% - control_method: dmg - properties: ec_cell_sz:128KiB + properties: ec_cell_sz:1MiB container: type: POSIX @@ -62,6 +61,12 @@ mdtest_ec_16p2g1: &mdtest_ec_16p2g1 dfs_dir_oclass: RP_3GX dfs_chunk: 16MiB +mdtest_rp_3g1: &mdtest_rp_3g1 + <<: *mdtest_base + dfs_oclass: RP_3G1 + dfs_dir_oclass: RP_3GX + dfs_chunk: 1MiB + mdtest_dfs_s1: api: DFS <<: *mdtest_s1 @@ -70,6 +75,10 @@ mdtest_dfs_ec_16p2g1: api: DFS <<: *mdtest_ec_16p2g1 +mdtest_dfs_rp_3g1: + api: DFS + <<: *mdtest_rp_3g1 + mdtest_pil4dfs_s1: api: POSIX+PIL4DFS # handled by ftest <<: *mdtest_s1 @@ -78,6 +87,13 @@ mdtest_pil4dfs_ec_16p2g1: api: POSIX+PIL4DFS # handled by ftest <<: *mdtest_ec_16p2g1 +mdtest_pil4dfs_rp_3g1: + api: POSIX+PIL4DFS # handled by ftest + <<: *mdtest_rp_3g1 + +dfuse: + disable_caching: true + client: env_vars: - D_LOG_MASK=INFO From 72afaffd356edfca6b59c6913b45ae11ec77d83b Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Wed, 18 Sep 2024 20:08:48 +0000 Subject: [PATCH 11/12] disable some unused / not working teardown steps Skip-test: true Skip-build: true Signed-off-by: Dalton Bohning --- src/tests/ftest/launch.py | 2 +- src/tests/ftest/util/collection_utils.py | 36 ++++++++++++------------ 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/tests/ftest/launch.py b/src/tests/ftest/launch.py index 550c02d7176..749993ef637 100755 --- a/src/tests/ftest/launch.py +++ b/src/tests/ftest/launch.py @@ -353,7 +353,7 @@ def _run(self, args): # Determine if bullseye code coverage collection is enabled code_coverage = CodeCoverage(test_env) # pylint: disable=unsupported-binary-operation - code_coverage.check(logger, args.test_servers | self.local_host) + #code_coverage.check(logger, args.test_servers | self.local_host) # Update the test yaml files for the tests in this test group try: diff --git a/src/tests/ftest/util/collection_utils.py b/src/tests/ftest/util/collection_utils.py index a900769f4d6..bf09b73b983 100644 --- a/src/tests/ftest/util/collection_utils.py +++ b/src/tests/ftest/util/collection_utils.py @@ -191,23 +191,23 @@ def archive_files(logger, summary, hosts, source, pattern, destination, depth, t logger.debug("No %s files found on %s", os.path.join(source, pattern), hosts) return return_code - if "log" in pattern: - # Remove any empty files - return_code |= remove_empty_files(logger, file_hosts, source, pattern, depth, test_result) + # if "log" in pattern: + # # Remove any empty files + # return_code |= remove_empty_files(logger, file_hosts, source, pattern, depth, test_result) - # Report an error if any files sizes exceed the threshold - if threshold is not None: - return_code |= check_log_size( - logger, file_hosts, source, pattern, depth, threshold, test_result) + # # Report an error if any files sizes exceed the threshold + # if threshold is not None: + # return_code |= check_log_size( + # logger, file_hosts, source, pattern, depth, threshold, test_result) - # Run cart_logtest on log files - return_code |= cart_log_test(logger, file_hosts, source, pattern, depth, test_result) + # # Run cart_logtest on log files + # return_code |= cart_log_test(logger, file_hosts, source, pattern, depth, test_result) # Remove any empty files return_code |= remove_empty_files(logger, file_hosts, source, pattern, depth, test_result) # Compress any files larger than 1 MB - return_code |= compress_files(logger, file_hosts, source, pattern, depth, test_result) + #return_code |= compress_files(logger, file_hosts, source, pattern, depth, test_result) # Move the test files to the test-results directory on this host return_code |= move_files( @@ -924,14 +924,14 @@ def collect_test_result(logger, test, test_result, job_results_dir, stop_daos, a "depth": 1, "timeout": 900, } - remote_files["valgrind log files"] = { - "source": test_env.shared_dir, - "destination": os.path.join(job_results_dir, "latest", TEST_RESULTS_DIRS[4]), - "pattern": "valgrind*", - "hosts": test.host_info.servers.hosts, - "depth": 1, - "timeout": 900, - } + # remote_files["valgrind log files"] = { + # "source": test_env.shared_dir, + # "destination": os.path.join(job_results_dir, "latest", TEST_RESULTS_DIRS[4]), + # "pattern": "valgrind*", + # "hosts": test.host_info.servers.hosts, + # "depth": 1, + # "timeout": 900, + # } for index, hosts in enumerate(core_files): remote_files[f"core files {index + 1}/{len(core_files)}"] = { "source": core_files[hosts]["path"], From 9f107a95fa9aad14165c6338b82a5ddd097e062b Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Wed, 18 Sep 2024 19:32:28 +0000 Subject: [PATCH 12/12] optimize move_files when archiving to /lus on computes Skip-test: true Skip-build: true Signed-off-by: Dalton Bohning --- src/tests/ftest/util/collection_utils.py | 35 +++++++++++++++++------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/src/tests/ftest/util/collection_utils.py b/src/tests/ftest/util/collection_utils.py index bf09b73b983..f651c603f29 100644 --- a/src/tests/ftest/util/collection_utils.py +++ b/src/tests/ftest/util/collection_utils.py @@ -445,21 +445,36 @@ def move_files(logger, hosts, source, pattern, destination, depth, timeout, test tmp_copy_dir = os.path.join(source, tmp_copy_dir) sudo_command = "" - # Create a temporary remote directory - should already exist, see _setup_test_directory() - command = f"mkdir -p '{tmp_copy_dir}'" - result = run_remote(logger, hosts, command) - if not result.passed: - message = (f"Error creating temporary remote copy directory '{tmp_copy_dir}' on " - f"{result.failed_hosts}") - test_result.fail_test(logger, "Process", message) - return_code = 16 - hosts = result.passed_hosts.copy() + compute_hosts = NodeSet.fromlist(_host for _host in hosts if _host.startswith('x')) + if destination.startswith("/lus/") and compute_hosts: + # Optimize collection to run a single command and not use clush --rcopy + hosts = hosts - compute_hosts + commands = [] + + # Create the host-specific directory + commands.append(f"mkdir -p '{destination}'.$(hostname)") + + # Move all the source files matching the pattern into the host-specific directory + other = f"-print0 | xargs -0 -r0 -I '{{}}' {sudo_command}mv '{{}}' " \ + f"'{destination}'.$(hostname)/" + commands.append(find_command(source, pattern, depth, other)) + + result = run_remote(logger, compute_hosts, " && ".join(commands), timeout=timeout) + if not result.passed: + message = f"Error moving files from {source} to {destination}" + test_result.fail_test(logger, "Process", message) + return_code = 16 + + # Continue clush --rcopy archiving if there are remaining hosts if not hosts: return return_code + # Create a temporary remote directory - should already exist, see _setup_test_directory() # Move all the source files matching the pattern into the temporary remote directory + mkdir_command = f"mkdir -p '{tmp_copy_dir}'" other = f"-print0 | xargs -0 -r0 -I '{{}}' {sudo_command}mv '{{}}' '{tmp_copy_dir}'/" - result = run_remote(logger, hosts, find_command(source, pattern, depth, other)) + command = f"{mkdir_command} && {find_command(source, pattern, depth, other)}" + result = run_remote(logger, hosts, command) if not result.passed: message = (f"Error moving files to temporary remote copy directory '{tmp_copy_dir}' on " f"{result.failed_hosts}")