aurora/2.6 test branch #14539

Draft · wants to merge 12 commits into base: release/2.6
182 changes: 182 additions & 0 deletions src/tests/ftest/control/dmg_scale.py
@@ -0,0 +1,182 @@
"""
(C) Copyright 2024 Intel Corporation.

SPDX-License-Identifier: BSD-2-Clause-Patent
"""
from apricot import TestWithServers
from telemetry_utils import TelemetryUtils
from test_utils_pool import time_pool_create

ENGINE_POOL_METRICS_SHORT = [
"engine_pool_entries_dtx_batched_degree",
"engine_pool_entries_dtx_batched_total",
"engine_pool_ops_akey_enum",
"engine_pool_ops_akey_punch",
"engine_pool_ops_compound",
"engine_pool_ops_dkey_enum",
"engine_pool_ops_dkey_punch",
"engine_pool_ops_dtx_abort",
"engine_pool_ops_dtx_check",
"engine_pool_ops_dtx_commit",
"engine_pool_ops_dtx_refresh",
"engine_pool_ops_ec_agg",
"engine_pool_ops_ec_rep",
"engine_pool_ops_fetch",
"engine_pool_ops_key_query",
"engine_pool_ops_migrate",
"engine_pool_ops_obj_enum",
"engine_pool_ops_obj_punch",
"engine_pool_ops_obj_sync",
"engine_pool_ops_recx_enum",
"engine_pool_ops_tgt_akey_punch",
"engine_pool_ops_tgt_dkey_punch",
"engine_pool_ops_tgt_punch",
"engine_pool_ops_tgt_update",
"engine_pool_ops_update",
"engine_pool_ops_pool_connect",
"engine_pool_ops_pool_disconnect",
"engine_pool_ops_pool_evict",
"engine_pool_ops_pool_query",
"engine_pool_ops_pool_query_space",
"engine_pool_resent",
"engine_pool_restarted",
"engine_pool_retry",
"engine_pool_scrubber_busy_time",
"engine_pool_scrubber_bytes_scrubbed_current",
"engine_pool_scrubber_bytes_scrubbed_prev",
"engine_pool_scrubber_bytes_scrubbed_total",
"engine_pool_scrubber_corruption_current",
"engine_pool_scrubber_corruption_total",
"engine_pool_scrubber_csums_current",
"engine_pool_scrubber_csums_prev",
"engine_pool_scrubber_csums_total",
"engine_pool_scrubber_next_csum_scrub",
"engine_pool_scrubber_next_tree_scrub",
"engine_pool_scrubber_prev_duration",
"engine_pool_scrubber_prev_duration_max",
"engine_pool_scrubber_prev_duration_mean",
"engine_pool_scrubber_prev_duration_min",
"engine_pool_scrubber_prev_duration_stddev",
"engine_pool_scrubber_scrubber_started",
"engine_pool_scrubber_scrubs_completed",
"engine_pool_started_at",
"engine_pool_vos_aggregation_akey_deleted",
"engine_pool_vos_aggregation_akey_scanned",
"engine_pool_vos_aggregation_akey_skipped",
"engine_pool_vos_aggregation_csum_errors",
"engine_pool_vos_aggregation_deleted_ev",
"engine_pool_vos_aggregation_deleted_sv",
"engine_pool_vos_aggregation_dkey_deleted",
"engine_pool_vos_aggregation_dkey_scanned",
"engine_pool_vos_aggregation_dkey_skipped",
"engine_pool_vos_aggregation_epr_duration",
"engine_pool_vos_aggregation_epr_duration_max",
"engine_pool_vos_aggregation_epr_duration_mean",
"engine_pool_vos_aggregation_epr_duration_min",
"engine_pool_vos_aggregation_epr_duration_stddev",
"engine_pool_vos_aggregation_merged_recs",
"engine_pool_vos_aggregation_merged_size",
"engine_pool_vos_aggregation_obj_deleted",
"engine_pool_vos_aggregation_obj_scanned",
"engine_pool_vos_aggregation_obj_skipped",
"engine_pool_vos_aggregation_uncommitted",
"engine_pool_vos_space_nvme_used",
"engine_pool_vos_space_scm_used",
"engine_pool_xferred_fetch",
"engine_pool_xferred_update",
"engine_pool_EC_update_full_stripe",
"engine_pool_EC_update_partial",
"engine_pool_block_allocator_alloc_hint",
"engine_pool_block_allocator_alloc_large",
"engine_pool_block_allocator_alloc_small",
"engine_pool_block_allocator_frags_aging",
"engine_pool_block_allocator_frags_large",
"engine_pool_block_allocator_frags_small",
"engine_pool_block_allocator_free_blks",
"engine_pool_ops_key2anchor"
]
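# The test below queries all of these in a single TelemetryUtils.get_metrics() call.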


class DmgScale(TestWithServers):
"""Verify dmg commands works as expected in a large scale system.

:avocado: recursive
"""

def test_dmg_scale(self):
"""Run the following steps and manually collect duration for each step.

0. Format storage
1. System query
2. Create a 100% pool that spans all engines
3. Pool query
4. Pool destroy
5. Create 49 pools spanning all the engines, each pool using 1/50th of the capacity
6. Pool list
7. Query around 80 pool metrics
8. Destroy all 49 pools
9. System stop
10. System start

Jira ID: DAOS-10508.

:avocado: tags=all,manual
:avocado: tags=deployment
:avocado: tags=DmgScale,test_dmg_scale
"""
# This is a manual test and we need to find the durations from job.log, so add "##" to make
# it easy to search. The log is usually over 1 million lines.
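# For example (illustrative path; the actual job.log location depends on the avocado
# setup):
#   grep "##" ~/avocado/job-results/latest/job.log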
self.log_step("## System query")
dmg_command = self.get_dmg_command()
dmg_command.system_query()

self.log_step("## Create a 100% pool that spans all engines")
pool = self.get_pool(namespace="/run/pool_100/*", create=False)
duration = time_pool_create(log=self.log, number=1, pool=pool)
self.log.info("## Single pool create duration = %.1f", duration)

self.log_step("## Pool query")
pool.query()

self.log_step("## Pool destroy")
pool.destroy()

quantity = self.params.get("quantity", "/run/pool_small/*", 1)
msg = (f"## Create {quantity} small pools spanning all the engines where the pools fill up "
f"the capacity")
self.log_step(msg)
pool_0 = self.get_pool(namespace="/run/pool_small/*", create=False)
duration_0 = time_pool_create(log=self.log, number=0, pool=pool_0)
pools = [pool_0]
durations = [duration_0]
for count in range(1, quantity):
pools.append(self.get_pool(create=False))
# Use the SCM and NVMe size of the first pool for the rest of the (quantity - 1) pools.
pools[-1].scm_size.update(pool_0.scm_per_rank)
pools[-1].nvme_size.update(pool_0.nvme_per_rank)
durations.append(time_pool_create(log=self.log, number=count, pool=pools[-1]))
msg = (f"Pool {count} created. SCM = {pools[-1].scm_per_rank}; "
f"NVMe = {pools[-1].nvme_per_rank}")
self.log.info(msg)
self.log.info("## durations = %s", durations)
total_duration = sum(durations)
self.log.info("## %d pools create duration = %.1f", quantity, total_duration)

self.log_step("## Pool list")
dmg_command.pool_list()

self.log_step("## Query around 80 pool metrics")
# To save time and logs, call telemetry on the first host only. With the 80 pool metrics
# above, ~100K lines are printed per host.
telemetry_utils = TelemetryUtils(
dmg=dmg_command, servers=[self.server_managers[0].hosts[0]])
telemetry_utils.get_metrics(name=",".join(ENGINE_POOL_METRICS_SHORT))

self.log_step(f"## Destroy all {quantity} pools")
self.destroy_pools(pools=pools)

self.log_step("## System stop")
self.server_managers[0].system_stop()

self.log_step("## System start")
self.server_managers[0].system_start()
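
For reference, a minimal sketch of what a timing helper in the spirit of time_pool_create
could look like, assuming it simply wraps pool.create() in a monotonic timer (the real
helper lives in test_utils_pool and may differ):

    import time

    def time_pool_create(log, number, pool):
        """Create a pool and return the elapsed creation time in seconds (sketch)."""
        start = time.monotonic()
        pool.create()
        duration = time.monotonic() - start
        log.info("Pool %s create duration = %.1f sec", number, duration)
        return duration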
37 changes: 37 additions & 0 deletions src/tests/ftest/control/dmg_scale.yaml
@@ -0,0 +1,37 @@
# Note: We usually use the extra yaml in aurora-tools, but that extra yaml defines test_clients
# while this test doesn't need any clients, so either update the extra yaml or pass a dummy
# client to -tc.
hosts:
test_servers: 256

timeout: 900

daos_server:
pattern_timeout: 60

server_config:
name: daos_server
engines_per_host: 2
engines:
0:
pinned_numa_node: 0
nr_xs_helpers: 1
fabric_iface: ib0
fabric_iface_port: 31317
log_file: daos_server0.log
storage: auto
targets: 8
1:
pinned_numa_node: 1
nr_xs_helpers: 1
fabric_iface: ib1
fabric_iface_port: 31417
log_file: daos_server1.log
storage: auto
targets: 8

pool_100:
size: 100%
pool_small:
size: 2%
# If we use --size=2% during pool create, we can only create up to 49 pools.
quantity: 49
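
The test reads these values through Avocado's parameter tree; the relevant lookups from
dmg_scale.py above, annotated with the yaml keys they resolve to:

    # "/run/pool_100/*" resolves to the pool_100 section (size: 100%).
    pool = self.get_pool(namespace="/run/pool_100/*", create=False)
    # "/run/pool_small/*" resolves to the pool_small section (size: 2%, quantity: 49).
    quantity = self.params.get("quantity", "/run/pool_small/*", 1)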
6 changes: 3 additions & 3 deletions src/tests/ftest/deployment/agent_failure.py
@@ -122,7 +122,7 @@ def test_agent_failure(self):
# 5. Verify journalctl shows the log that the agent is stopped.
results = get_journalctl(
hosts=self.hostlist_clients, since=since, until=until,
-journalctl_type="daos_agent")
+journalctl_type="daos_agent", run_user=self.test_env.agent_user)
self.log.info("journalctl results = %s", results)
if "shutting down" not in results[0]["data"]:
msg = "Agent shut down message not found in journalctl! Output = {}".format(
@@ -240,7 +240,7 @@ def test_agent_failure_isolation(self):
# stopped.
results = get_journalctl(
hosts=[agent_host_kill], since=since, until=until,
-journalctl_type="daos_agent")
+journalctl_type="daos_agent", run_user=self.test_env.agent_user)
self.log.info("journalctl results (kill) = %s", results)
if "shutting down" not in results[0]["data"]:
msg = ("Agent shut down message not found in journalctl on killed client! "
@@ -251,7 +251,7 @@
# in the previous step doesn't show that the agent is stopped.
results = get_journalctl(
hosts=[agent_host_keep], since=since, until=until,
-journalctl_type="daos_agent")
+journalctl_type="daos_agent", run_user=self.test_env.agent_user)
self.log.info("journalctl results (keep) = %s", results)
if "shutting down" in results[0]["data"]:
msg = ("Agent shut down message found in journalctl on keep client! "
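
The run_user argument added above indicates the journalctl query should run as the agent's
user rather than root. A hypothetical sketch of how such a command might be assembled
(names and flags here are illustrative, not the actual ftest implementation):

    def build_journalctl_command(journalctl_type, since, until, run_user="root"):
        """Sketch: journalctl filtered by syslog identifier and a time window."""
        base = f'journalctl -t {journalctl_type} --since="{since}" --until="{until}"'
        if run_user == "root":
            return f"sudo {base}"
        # Assumption: a non-root agent logs to that user's journal.
        return f"sudo -u {run_user} {base} --user"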
2 changes: 1 addition & 1 deletion src/tests/ftest/deployment/basic_checkout.yaml
@@ -70,7 +70,7 @@ mdtest_easy: &mdtest_easy_base
write_bytes: 0
num_of_files_dirs: 100000000
stonewall_timer: 30
-stonewall_statusfile: "/var/tmp/daos_testing/stoneWallingStatusFile"
+stonewall_statusfile: stoneWallingStatusFile
dfs_destroy: false
mdtest_dfs_s1:
<<: *mdtest_easy_base