diff --git a/src/tests/ftest/container/per_server_fault_domain.yaml b/src/tests/ftest/container/per_server_fault_domain.yaml index a9dcc1cb96a..266e8c8e600 100644 --- a/src/tests/ftest/container/per_server_fault_domain.yaml +++ b/src/tests/ftest/container/per_server_fault_domain.yaml @@ -6,6 +6,7 @@ timeout: 300 server_config: name: daos_server engines_per_host: 2 + crt_timeout: 10 engines: 0: pinned_numa_node: 0 diff --git a/src/tests/ftest/erasurecode/cell_size_property.yaml b/src/tests/ftest/erasurecode/cell_size_property.yaml index ca05427711d..f87d9798539 100644 --- a/src/tests/ftest/erasurecode/cell_size_property.yaml +++ b/src/tests/ftest/erasurecode/cell_size_property.yaml @@ -5,6 +5,7 @@ timeout: 900 server_config: name: daos_server engines_per_host: 2 + crt_timeout: 10 engines: 0: pinned_numa_node: 0 diff --git a/src/tests/ftest/erasurecode/offline_rebuild.yaml b/src/tests/ftest/erasurecode/offline_rebuild.yaml index 7d1a6e7aa08..a61b47760b5 100644 --- a/src/tests/ftest/erasurecode/offline_rebuild.yaml +++ b/src/tests/ftest/erasurecode/offline_rebuild.yaml @@ -15,6 +15,7 @@ setup: server_config: name: daos_server engines_per_host: 2 + crt_timeout: 10 engines: 0: pinned_numa_node: 0 diff --git a/src/tests/ftest/erasurecode/rebuild_disabled_single.yaml b/src/tests/ftest/erasurecode/rebuild_disabled_single.yaml index e8a2afcb644..006e75079fb 100644 --- a/src/tests/ftest/erasurecode/rebuild_disabled_single.yaml +++ b/src/tests/ftest/erasurecode/rebuild_disabled_single.yaml @@ -36,7 +36,7 @@ pool: pool_query_timeout: 30 container: type: POSIX - control_method: API + control_method: daos single_data_set: # [object_qty, record_qty, dkey, akey, data_size] - [1, 1, 1, 1, 4194304] diff --git a/src/tests/ftest/rebuild/container_create_race.yaml b/src/tests/ftest/rebuild/container_create_race.yaml index 501d98378a4..46b8fe2791a 100644 --- a/src/tests/ftest/rebuild/container_create_race.yaml +++ b/src/tests/ftest/rebuild/container_create_race.yaml @@ -7,6 +7,7 @@ timeout: 360 server_config: name: daos_server engines_per_host: 2 + crt_timeout: 10 engines: 0: targets: 2 diff --git a/src/tests/ftest/soak/faults.yaml b/src/tests/ftest/soak/faults.yaml index 793974b8d46..165cb573031 100644 --- a/src/tests/ftest/soak/faults.yaml +++ b/src/tests/ftest/soak/faults.yaml @@ -120,7 +120,7 @@ ior_faults: dfs_oclass: - ["EC_2P1GX", "RP_2GX"] dfuse: - mount_dir: "/tmp/daos_dfuse/ior/" + mount_dir: "/tmp/soak_dfuse_ior/" disable_caching: true hdf5_vol: plugin_path: "/usr/lib64/mpich/lib" diff --git a/src/tests/ftest/soak/harassers.yaml b/src/tests/ftest/soak/harassers.yaml index e56c0527f1d..84b6a25f85a 100644 --- a/src/tests/ftest/soak/harassers.yaml +++ b/src/tests/ftest/soak/harassers.yaml @@ -8,8 +8,7 @@ hosts: orterun: allow_run_as_root: true # This timeout must be longer than the test_timeout param (+15minutes) -# 12 hour test -timeout: 12H15M +timeout: 24H30M setup: start_servers: true start_agents: true @@ -56,7 +55,7 @@ container: daos_timeout: 30 container_reserved: type: POSIX - properties: cksum:crc16,cksum_size:16384,srv_cksum:on,rd_fac:1 + properties: cksum:crc16,cksum_size:16384,srv_cksum:on file_oclass: SX dir_oclass: SX control_method: daos @@ -78,8 +77,8 @@ soak_harassers: # harasser test timeout in hours single_test_pool: false test_timeout: - test_soak_online_harassers: 12 - test_soak_offline_harassers: 12 + test_soak_online_harassers: 24 + test_soak_offline_harassers: 24 # maximum timeout for a single job in test in minutes joblist: - ior_harasser @@ -93,12 +92,14 @@ soak_harassers: - 
server-stop_server-reintegrate-offline - extend-pool-offline # - vmd-identify-check-offline + - reboot_reboot-reintegrate-offline test_soak_online_harassers: - exclude_reintegrate - server-stop_server-start - server-stop_server-reintegrate - extend-pool # - vmd-identify-check + - reboot_reboot-reintegrate harasser_to: 1200 # drain rank from all pools before stopping server enable_drain: true @@ -140,7 +141,7 @@ ior_harasser: - ["EC_2P1GX", "RP_2GX"] - ["EC_4P2GX", "RP_3GX"] dfuse: - mount_dir: "/tmp/daos_dfuse/ior" + mount_dir: "/tmp/soak_dfuse_ior" disable_caching: true fio_harasser: api: @@ -150,7 +151,7 @@ fio_harasser: - global - test global: - directory: "/tmp/daos_dfuse/fio/" + directory: "/tmp/soak_dfuse_fio/" ioengine: 'libaio' thread: 1 group_reporting: 1 @@ -172,7 +173,7 @@ fio_harasser: - ["EC_2P1GX", "RP_2GX"] - ["EC_4P2GX", "RP_3GX"] dfuse: - mount_dir: "/tmp/daos_dfuse/fio/" + mount_dir: "/tmp/soak_dfuse_fio/" disable_caching: true mdtest_harasser: # maximum timeout for a single job in test in minutes @@ -204,7 +205,7 @@ mdtest_harasser: - ["EC_4P2G1", "RP_3G1"] dfs_destroy: false dfuse: - mount_dir: "/tmp/daos_dfuse/mdtest/" + mount_dir: "/tmp/soak_dfuse_mdtest/" disable_caching: true hdf5_vol: plugin_path: "/usr/lib64/mpich/lib" diff --git a/src/tests/ftest/soak/smoke.yaml b/src/tests/ftest/soak/smoke.yaml index 4df4317eedb..b65ad1106ee 100644 --- a/src/tests/ftest/soak/smoke.yaml +++ b/src/tests/ftest/soak/smoke.yaml @@ -120,7 +120,7 @@ ior_smoke: dfs_oclass: - ["EC_2P1GX", "RP_2GX"] dfuse: - mount_dir: "/tmp/daos_dfuse/ior/" + mount_dir: "/tmp/soak_dfuse_ior/" disable_caching: true thread_count: 8 cores: '0-7' @@ -134,7 +134,7 @@ fio_smoke: - global - test global: - directory: "/tmp/daos_dfuse/fio/" + directory: "/tmp/soak_dfuse_fio/" ioengine: 'libaio' thread: 1 group_reporting: 1 @@ -155,7 +155,7 @@ fio_smoke: oclass: - ["EC_2P1GX", "RP_2GX"] dfuse: - mount_dir: "/tmp/daos_dfuse/fio/" + mount_dir: "/tmp/soak_dfuse_fio/" disable_caching: true thread_count: 8 cores: '0-7' @@ -172,9 +172,9 @@ vpic_smoke: - POSIX - POSIX-LIBIOIL - POSIX-LIBPIL4DFS - workdir: "/tmp/daos_dfuse/vpic/" + workdir: "/tmp/soak_dfuse_vpic/" dfuse: - mount_dir: "/tmp/daos_dfuse/vpic/" + mount_dir: "/tmp/soak_dfuse_vpic/" disable_caching: true thread_count: 8 cores: '0-7' @@ -191,9 +191,9 @@ lammps_smoke: - POSIX - POSIX-LIBIOIL - POSIX-LIBPIL4DFS - workdir: "/tmp/daos_dfuse/lammps/" + workdir: "/tmp/soak_dfuse_lammps/" dfuse: - mount_dir: "/tmp/daos_dfuse/lammps/" + mount_dir: "/tmp/soak_dfuse_lammps/" disable_caching: true thread_count: 8 cores: '0-7' @@ -226,7 +226,7 @@ mdtest_smoke: - ["EC_2P1G1", "RP_2G1"] dfs_destroy: false dfuse: - mount_dir: "/tmp/daos_dfuse/mdtest/" + mount_dir: "/tmp/soak_dfuse_mdtest/" disable_caching: true thread_count: 8 cores: '0-7' @@ -250,7 +250,7 @@ macsio_smoke: oclass: - ["EC_2P1GX", "RP_2GX"] dfuse: - mount_dir: "/tmp/daos_dfuse/macsio/" + mount_dir: "/tmp/soak_dfuse_macsio/" disable_caching: true thread_count: 8 cores: '0-7' diff --git a/src/tests/ftest/soak/stress.yaml b/src/tests/ftest/soak/stress.yaml index 12c9bf8b0e0..75585401684 100644 --- a/src/tests/ftest/soak/stress.yaml +++ b/src/tests/ftest/soak/stress.yaml @@ -134,7 +134,7 @@ ior_stress: # - ["EC_8P2G1", "RP_3GX"] # - ["EC_16P2GX", "RP_3GX"] dfuse: - mount_dir: "/tmp/daos_dfuse/ior/" + mount_dir: "/tmp/soak_dfuse_ior/" disable_caching: true thread_count: 8 cores: '0-7' @@ -149,7 +149,7 @@ fio_stress: - test global: create_serialize: 0 - directory: "/tmp/daos_dfuse/fio/" + directory: 
"/tmp/soak_dfuse_fio/" ioengine: 'libaio' thread: 1 group_reporting: 1 @@ -176,7 +176,7 @@ fio_stress: # - ["EC_8P2G1", "RP_3GX"] # - ["EC_16P2GX", "RP_3GX"] dfuse: - mount_dir: "/tmp/daos_dfuse/fio/" + mount_dir: "/tmp/soak_dfuse_fio/" disable_caching: true thread_count: 8 cores: '0-7' @@ -194,9 +194,9 @@ vpic_stress: - POSIX - POSIX-LIBIOIL - POSIX-LIBPIL4DFS - workdir: "/tmp/daos_dfuse/vpic/" + workdir: "/tmp/soak_dfuse_vpic/" dfuse: - mount_dir: "/tmp/daos_dfuse/vpic/" + mount_dir: "/tmp/soak_dfuse_vpic/" disable_caching: true oclass: - ["EC_2P1GX", "RP_2GX"] @@ -211,9 +211,9 @@ lammps_stress: - POSIX - POSIX-LIBIOIL - POSIX-LIBPIL4DFS - workdir: "/tmp/daos_dfuse/lammps/" + workdir: "/tmp/soak_dfuse_lammps/" dfuse: - mount_dir: "/tmp/daos_dfuse/lammps/" + mount_dir: "/tmp/soak_dfuse_lammps/" disable_caching: true thread_count: 8 cores: '0-7' @@ -254,7 +254,7 @@ mdtest_stress: # - ["EC_16P2G1", "RP_3G1"] dfs_destroy: false dfuse: - mount_dir: "/tmp/daos_dfuse/mdtest/" + mount_dir: "/tmp/soak_dfuse_mdtest/" disable_caching: true thread_count: 8 cores: '0-7' @@ -285,18 +285,15 @@ macsio_stress: # - ["EC_8P2G1", "RP_3GX"] # - ["EC_16P2GX", "RP_3GX"] dfuse: - mount_dir: "/tmp/daos_dfuse/macsio/" + mount_dir: "/tmp/soak_dfuse_macsio/" disable_caching: true thread_count: 8 cores: '0-7' datamover_stress: job_timeout: 10 nodesperjob: - - 1 - - 4 - 8 taskspernode: - - 16 - 32 oclass: - ["SX","SX"] diff --git a/src/tests/ftest/soak/stress_2h.yaml b/src/tests/ftest/soak/stress_2h.yaml index 68574a91c97..fd34958f368 100644 --- a/src/tests/ftest/soak/stress_2h.yaml +++ b/src/tests/ftest/soak/stress_2h.yaml @@ -97,7 +97,7 @@ ior_stress: dfs_oclass: - ["SX","SX"] dfuse: - mount_dir: "/tmp/daos_dfuse/ior/" + mount_dir: "/tmp/soak_dfuse_ior/" disable_caching: true mdtest_stress: # maximum timeout for a single job in test in minutes @@ -124,7 +124,7 @@ mdtest_stress: - ["S1","S1"] dfs_destroy: false dfuse: - mount_dir: "/tmp/daos_dfuse/mdtest/" + mount_dir: "/tmp/soak_dfuse_mdtest/" disable_caching: true vpic_stress: job_timeout: 20 @@ -134,9 +134,9 @@ vpic_stress: - 16 cmdline: "/var/hit/daos/builds/vpic-install/bin/harris.Linux" posix: true - workdir: "/tmp/daos_dfuse/vpic/" + workdir: "/tmp/soak_dfuse_vpic/" dfuse: - mount_dir: "/tmp/daos_dfuse/vpic/" + mount_dir: "/tmp/soak_dfuse_vpic/" disable_caching: true oclass: - ["SX","SX"] @@ -148,9 +148,9 @@ lammps_stress: - 16 cmdline: "/var/hit/daos/builds/lammps/src/lmp_mpi -i /var/hit/daos/builds/lammps/bench/in.lj" posix: true - workdir: "/tmp/daos_dfuse/lammps/" + workdir: "/tmp/soak_dfuse_lammps/" dfuse: - mount_dir: "/tmp/daos_dfuse/lammps/" + mount_dir: "/tmp/soak_dfuse_lammps/" disable_caching: true oclass: - ["SX","SX"] diff --git a/src/tests/ftest/util/ec_utils.py b/src/tests/ftest/util/ec_utils.py index d706b32320f..141db8f1710 100644 --- a/src/tests/ftest/util/ec_utils.py +++ b/src/tests/ftest/util/ec_utils.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -280,6 +280,10 @@ def ec_container_create(self, oclass): oclass (str): object class for creating the container. """ self.container.append(self.get_container(self.pool, create=False, oclass=oclass)) + if self.container[-1].control_method.value == \ + self.container[-1].USE_DAOS and self.container[-1].oclass.value: + self.container[-1].oclass.update(self.container[-1].oclass.value.replace("OC_", ""), + "container.oclass") # Get the Parity count for setting the container RF property. 
ec_object = get_data_parity_number(self.log, oclass) diff --git a/src/tests/ftest/util/server_utils.py b/src/tests/ftest/util/server_utils.py index 2be8cdb8b3b..39c212c13a8 100644 --- a/src/tests/ftest/util/server_utils.py +++ b/src/tests/ftest/util/server_utils.py @@ -477,7 +477,7 @@ def display_memory_info(self): """Display server hosts memory info.""" self.log.debug("#" * 80) self.log.debug(" Collection debug memory info") - run_remote(self.log, self._hosts, "free -m") + run_remote(self.log, self._hosts, "free -m && df -h --type=tmpfs") run_remote(self.log, self._hosts, "ps -eo size,pid,user,command --sort -size | head -n 6") self.log.debug("#" * 80) @@ -720,6 +720,9 @@ def stop(self): # Make sure the mount directory belongs to non-root user self.set_scm_mount_ownership() + # Collective memory usage after stop. + self.display_memory_info() + # Report any errors after all stop actions have been attempted if messages: raise ServerFailed("Failed to stop servers:\n {}".format("\n ".join(messages))) diff --git a/src/tests/ftest/util/soak_test_base.py b/src/tests/ftest/util/soak_test_base.py index c4ea919e2e6..886e667d9ca 100644 --- a/src/tests/ftest/util/soak_test_base.py +++ b/src/tests/ftest/util/soak_test_base.py @@ -26,7 +26,7 @@ create_app_cmdline, create_dm_cmdline, create_fio_cmdline, create_ior_cmdline, create_macsio_cmdline, create_mdtest_cmdline, create_racer_cmdline, ddhhmmss_format, get_daos_server_logs, get_harassers, - get_journalctl, launch_exclude_reintegrate, launch_extend, + get_journalctl, launch_exclude_reintegrate, launch_extend, launch_reboot, launch_server_stop_start, launch_snapshot, launch_vmd_identify_check, reserved_file_copy, run_event_check, run_metrics_check, run_monitor_check) @@ -140,8 +140,7 @@ def pre_tear_down(self): run_local(self.log, cmd, timeout=120) except RunException as error: # Exception was raised due to a non-zero exit status - errors.append("Failed to cancel jobs {}: {}".format( - self.failed_job_id_list, error)) + errors.append(f"Failed to cancel jobs {self.failed_job_id_list}: {error}") if self.all_failed_jobs: errors.append("SOAK FAILED: The following jobs failed {} ".format( " ,".join(str(j_id) for j_id in self.all_failed_jobs))) @@ -169,7 +168,7 @@ def pre_tear_down(self): try: get_daos_server_logs(self) except SoakTestError as error: - errors.append("<>".format(error)) + errors.append(f"<>") # Gather journalctl logs hosts = list(set(self.hostlist_servers)) since = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.start_time)) @@ -250,6 +249,16 @@ def launch_harasser(self, harasser, pool): name = "VMD_LED_CHECK" params = (self, name, results, args) job = multiprocessing.Process(target=method, args=params, name=name) + elif harasser == "reboot": + method = launch_reboot + name = "REBOOT" + params = (self, pool, name, results, args) + job = multiprocessing.Process(target=method, args=params, name=name) + elif harasser == "reboot-reintegrate": + method = launch_reboot + name = "REBOOT_REINTEGRATE" + params = (self, pool, name, results, args) + job = multiprocessing.Process(target=method, args=params, name=name) else: raise SoakTestError(f"<>", job.name) if name not in ["REBUILD", "SNAPSHOT"]: job.terminate() - status_msg = "<>" self.log.error(status_msg) return status_msg @@ -336,8 +343,7 @@ def job_setup(self, jobs, pool): elif "datamover" in job: commands = create_dm_cmdline(self, job, pool, ppn, npj) else: - raise SoakTestError( - "<>".format(self.test_name, err_msg)) + raise SoakTestError(f"<>") return job_id_list def 
job_completion(self, job_id_list): @@ -414,7 +420,7 @@ def job_completion(self, job_id_list): time.ctime()) for job in job_id_list: if not slurm_utils.cancel_jobs(self.log, self.control, int(job)).passed: - self.fail("Error canceling Job {}".format(job)) + self.fail(f"Error canceling Job {job}") # monitor events every 15 min if datetime.now() > check_time: run_monitor_check(self) diff --git a/src/tests/ftest/util/soak_utils.py b/src/tests/ftest/util/soak_utils.py index 06857bf431e..50a37766c19 100644 --- a/src/tests/ftest/util/soak_utils.py +++ b/src/tests/ftest/util/soak_utils.py @@ -18,12 +18,14 @@ from command_utils_base import EnvironmentVariables from daos_racer_utils import DaosRacerCommand from data_mover_utils import DcpCommand, FsCopy -from dfuse_utils import Dfuse +from dfuse_utils import get_dfuse from dmg_utils import get_storage_query_device_info from duns_utils import format_path +from exception_utils import CommandFailure from fio_utils import FioCommand -from general_utils import (DaosTestError, get_host_data, get_log_file, get_random_bytes, - get_random_string, list_to_str, pcmd, run_command, run_pcmd) +from general_utils import (DaosTestError, check_ping, check_ssh, get_host_data, get_log_file, + get_random_bytes, get_random_string, list_to_str, pcmd, run_command, + run_pcmd, wait_for_result) from ior_utils import IorCommand from job_manager_utils import Mpirun from macsio_util import MacsioCommand @@ -239,8 +241,7 @@ def get_daos_server_logs(self): try: run_command(" ".join(command), timeout=600) except DaosTestError as error: - raise SoakTestError( - "<>".format(hosts)) from error + raise SoakTestError(f"<>") from error def run_monitor_check(self): @@ -459,6 +460,136 @@ def launch_vmd_identify_check(self, name, results, args): self.log.info("<<>>\n", self.loop, name, time.ctime()) +def launch_reboot(self, pools, name, results, args): + """Execute server unexpected reboot. 
+ + Args: + self (obj): soak obj + pools (TestPool): list of TestPool obj + name (str): name of dmg subcommand + results (queue): multiprocessing queue + args (queue): multiprocessing queue + """ + # Harasser is run in two parts REBOOT and then REBOOT_REINTEGRATE + # REBOOT test steps + # shutdown random node + # wait for node to reboot + # If node rebooted ok wait for rebuild on both pool to complete + # Update multiprocessing queue with results and args + # REBOOT_REINTEGRATE test steps + # if REBOOT completed ok then + # Issue systemctl restart daos_server + # Verify that all ranks are joined + # If all ranks "joined", issue reintegrate for all pool on all ranks and wait for + # rebuild to complete + # Update multiprocessing queue with results and args + status = False + params = {} + ranks = None + if name == "REBOOT": + reboot_host = self.random.choice(self.hostlist_servers) + ranklist = self.server_managers[0].get_host_ranks(reboot_host) + ranks = ",".join(str(rank) for rank in ranklist) + # init the status dictionary + params = {"name": name, + "status": status, + "vars": {"host": reboot_host, "ranks": ranklist}} + self.log.info( + "<<>>\n", self.loop, name, ranks, time.ctime()) + # reboot host in 1 min + result = run_remote(self.log, reboot_host, "sudo shutdown -r +1") + if result.passed: + status = True + else: + self.log.error(f"<<>>\n", self.loop, name, reboot_host, + time.ctime()) + status = True + self.dmg_command.system_query() + # wait for node to complete rebooting + if not wait_for_result(self.log, check_ping, 60, 5, True, host=reboot_host, + expected_ping=True, cmd_timeout=60, verbose=True): + self.log.error(f"<<>>\n", + self.loop, name, reboot_host, time.ctime()) + cmd_results = run_remote(self.log, reboot_host, "sudo systemctl restart daos_server") + if cmd_results.passed: + self.dmg_command.system_query() + for pool in pools: + self.dmg_command.pool_query(pool.identifier) + # wait server to be started + try: + self.dmg_command.system_start(ranks=ranks) + except CommandFailure as error: + self.log.error("<<>>", self.loop, name) + status = False + + params = {"name": name, + "status": status, + "vars": {"host": reboot_host, "ranks": ranklist}} + if not status: + self.log.error("<<< %s failed - check logs for failure data>>>", name) + self.dmg_command.system_query() + self.harasser_job_done(params) + results.put(self.harasser_results) + args.put(self.harasser_args) + self.log.info("Harasser results: %s", self.harasser_results) + self.log.info("Harasser args: %s", self.harasser_args) + self.log.info("<<>>\n", self.loop, name, time.ctime()) + + def launch_extend(self, pool, name, results, args): """Execute dmg extend ranks. 
@@ -475,14 +606,13 @@ def launch_extend(self, pool, name, results, args): if self.selected_host: ranklist = self.server_managers[0].get_host_ranks(self.selected_host) - + ranks = ",".join(str(rank) for rank in ranklist) # init the status dictionary params = {"name": name, "status": status, "vars": {"host": self.selected_host, "ranks": ranks}} self.log.info( "<<>>\n", self.loop, name, ranks, time.ctime()) - ranks = ",".join(str(rank) for rank in ranklist) try: pool.extend(ranks) status = True @@ -497,6 +627,7 @@ def launch_extend(self, pool, name, results, args): "vars": {"host": self.selected_host, "ranks": ranks}} if not status: self.log.error("<<< %s failed - check logs for failure data>>>", name) + self.dmg_command.system_query() self.harasser_job_done(params) results.put(self.harasser_results) args.put(self.harasser_args) @@ -630,7 +761,7 @@ def launch_server_stop_start(self, pools, name, results, args): self.log.error("<<
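For reference, below is a minimal, self-contained sketch (not part of the patch) of the ping-wait pattern the new launch_reboot() harasser relies on. It reuses check_ping and wait_for_result with the same call shape shown in the soak_utils.py hunk above; the logger name, host name, and timeout/delay values are placeholder assumptions, not values taken from the patch.

```python
# Illustrative sketch only -- not part of this patch. It mirrors how launch_reboot()
# waits on a rebooted server: wait_for_result() polls check_ping() until the expected
# ping state is reached or the timeout expires. The host name, logger, and the
# timeout/delay values below are placeholders.
import logging

# Helpers from the ftest util directory, imported by soak_utils.py in this patch;
# running this sketch assumes that directory is on PYTHONPATH.
from general_utils import check_ping, wait_for_result

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("soak_reboot_sketch")
reboot_host = "example-server-01"  # hypothetical server host

# REBOOT phase: after "sudo shutdown -r +1", wait for the host to stop answering pings.
host_down = wait_for_result(log, check_ping, 120, 5, True,
                            host=reboot_host, expected_ping=False,
                            cmd_timeout=60, verbose=True)
if not host_down:
    log.error("%s did not go down within 120s", reboot_host)

# REBOOT_REINTEGRATE phase: wait for the host to answer pings again before restarting
# daos_server and reintegrating its ranks into the pools.
host_up = wait_for_result(log, check_ping, 600, 5, True,
                          host=reboot_host, expected_ping=True,
                          cmd_timeout=60, verbose=True)
if not host_up:
    log.error("%s did not come back within 600s", reboot_host)
```

The two-phase split matches the comment block in launch_reboot(): the REBOOT harasser takes the node down and waits for rebuild, while REBOOT_REINTEGRATE restarts daos_server once the node responds again and then reintegrates its ranks.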