From 24cf0ed6e4f74be09a542fd1c478f5082b7646dd Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Wed, 27 Mar 2024 07:46:14 -0400 Subject: [PATCH] DAOS-11626 test: Adding MD on SSD metrics tests (#13661) Adding tests for WAL commit, reply, and checkpoint metrics. Signed-off-by: Phil Henderson --- src/tests/ftest/server/replay.py | 58 +-- src/tests/ftest/server/replay.yaml | 3 +- .../ftest/telemetry/dkey_akey_enum_punch.py | 16 +- src/tests/ftest/telemetry/wal_metrics.py | 217 ++++++++++ src/tests/ftest/telemetry/wal_metrics.yaml | 34 ++ src/tests/ftest/util/ior_utils.py | 80 +++- src/tests/ftest/util/telemetry_utils.py | 388 ++++++++++++++++-- 7 files changed, 709 insertions(+), 87 deletions(-) create mode 100644 src/tests/ftest/telemetry/wal_metrics.py create mode 100644 src/tests/ftest/telemetry/wal_metrics.yaml diff --git a/src/tests/ftest/server/replay.py b/src/tests/ftest/server/replay.py index 1b9f08b114cf..61e8f50a6f49 100644 --- a/src/tests/ftest/server/replay.py +++ b/src/tests/ftest/server/replay.py @@ -9,8 +9,7 @@ from apricot import TestWithServers from dfuse_utils import get_dfuse, start_dfuse, stop_dfuse from general_utils import join -from ior_utils import get_ior -from job_manager_utils import get_job_manager +from ior_utils import read_data, write_data from test_utils_pool import add_pool @@ -38,24 +37,6 @@ def create_container(self, details=None, **pool_params): self.log_step(join(' ', 'Creating a container (daos container create)', '-', details)) return self.get_container(pool) - def write_data(self, container, ppn, dfuse=None): - """Write data to the container/dfuse using ior. - - Args: - container (TestContainer): the container to populate - ppn (int): processes per node to use with the ior command - dfuse (Dfuse, optional): dfuse object defining the dfuse mount point. Defaults to None. - - Returns: - Ior: the Ior object used to populate the container - """ - job_manager = get_job_manager(self, subprocess=False, timeout=60) - ior = get_ior( - self, job_manager, self.hostlist_clients, self.workdir, None, - namespace='/run/ior_write/*') - ior.run(self.server_group, container.pool, container, None, ppn, dfuse=dfuse) - return ior - def stop_engines(self): """Stop each server engine and verify they are not running.""" self.log_step('Shutting down the engines (dmg system stop)') @@ -80,18 +61,6 @@ def restart_engines(self): self.log.info('Ranks %s failed to start', rank_check) self.fail('Failed to start ranks cleanly') - def read_data(self, ior, container, ppn, dfuse=None): - """Verify the data used to populate the container. - - Args: - ior (Ior): the ior command used to populate the container - container (TestContainer): the container to verify - ppn (int): processes per node to use with the ior command - dfuse (Dfuse, optional): dfuse object defining the dfuse mount point. Defaults to None. - """ - ior.update('flags', self.params.get('flags', '/run/ior_read/*')) - ior.run(self.server_group, container.pool, container, None, ppn, dfuse=dfuse) - def verify_snapshots(self, container, expected): """Verify the snapshots listed for the container match the expected list of snapshots. 
@@ -126,17 +95,16 @@ def test_restart(self): :avocado: tags=server,replay :avocado: tags=ReplayTests,test_restart """ - ppn = self.params.get('ppn', '/run/ior_write/*', 1) container = self.create_container() self.log_step('Write data to the container (ior)') - ior = self.write_data(container, ppn) + ior = write_data(self, container) self.stop_engines() self.restart_engines() self.log_step('Verifying data previously written to the container (ior)') - self.read_data(ior, container, ppn) + read_data(self, ior, container) self.log_step('Test passed') def test_replay_posix(self): @@ -159,7 +127,6 @@ def test_replay_posix(self): :avocado: tags=server,replay :avocado: tags=ReplayTests,test_replay_posix """ - ppn = self.params.get('ppn', '/run/ior_write/*', 1) container = self.create_container() self.log_step('Start dfuse') @@ -167,7 +134,7 @@ def test_replay_posix(self): start_dfuse(self, dfuse, container.pool, container) self.log_step('Write data to the dfuse mount point (ior)') - ior = self.write_data(container, ppn, dfuse) + ior = write_data(self, container, dfuse=dfuse) self.log_step('After the read has completed, unmount dfuse') stop_dfuse(self, dfuse) @@ -179,10 +146,10 @@ def test_replay_posix(self): start_dfuse(self, dfuse) self.log_step('Verifying data previously written to the dfuse mount point (ior)') - self.read_data(ior, container, ppn, dfuse) + read_data(self, ior, container, dfuse=dfuse) self.log_step('Write additional data to the dfuse mount point (ior)') - ior = self.write_data(container, ppn, dfuse) + ior = write_data(self, container, dfuse=dfuse) self.log.info('Test passed') @@ -210,14 +177,13 @@ def test_replay_snapshots(self): :avocado: tags=server,replay :avocado: tags=ReplayTests,test_replay_snapshots """ - ppn = self.params.get('ppn', '/run/ior_write/*', 1) container = self.create_container() snapshots = [] for index in range(1, 4): step = join(' ', index, 'of', 3) self.log_step(join(' ', 'Write data to the container (ior)', '-', step)) - self.write_data(container, ppn) + write_data(self, container) self.log_step(join(' ', 'Creating a snapshot (daos container create-snap)', '-', step)) snapshots.append(container.create_snap()['response']['epoch']) @@ -348,7 +314,6 @@ def test_replay_no_check_pointing(self): :avocado: tags=server,replay :avocado: tags=ReplayTests,test_replay_no_check_pointing """ - ppn = self.params.get('ppn', '/run/ior_write/*', 1) container = self.create_container() self.log_step('Disabling check pointing on {}'.format(container.pool)) @@ -358,7 +323,7 @@ def test_replay_no_check_pointing(self): self.fail('Pool check pointing not disabled before engine restart') self.log_step('Write data to the container (ior)') - ior = self.write_data(container, ppn) + ior = write_data(self, container) self.stop_engines() self.restart_engines() @@ -371,7 +336,7 @@ def test_replay_no_check_pointing(self): self.fail('Pool check pointing not disabled after engine restart') self.log_step('Verifying data previously written to the container (ior)') - self.read_data(ior, container, ppn) + read_data(self, ior, container) self.log_step('Test passed') def test_replay_check_pointing(self): @@ -392,14 +357,13 @@ def test_replay_check_pointing(self): :avocado: tags=server,replay :avocado: tags=ReplayTests,test_replay_check_pointing """ - ppn = self.params.get('ppn', '/run/ior_write/*', 1) frequency = 5 container = self.create_container( properties=f'checkpoint:timed,checkpoint_freq:{frequency}') self.log.info('%s check point frequency: %s seconds', container.pool, frequency) 
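# With 'checkpoint:timed' and 'checkpoint_freq:5', the engine is expected to checkpoint
# committed WAL transactions roughly every 5 seconds, so the frequency * 2 sleep that
# follows should give at least one checkpoint cycle time to complete before the engines
# are stopped and restarted.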
self.log_step('Write data to the container (ior)') - ior = self.write_data(container, ppn) + ior = write_data(self, container) self.log_step('Waiting for check pointing to complete (sleep {})'.format(frequency * 2)) time.sleep(frequency * 2) @@ -408,5 +372,5 @@ def test_replay_check_pointing(self): self.restart_engines() self.log_step('Verifying data previously written to the container (ior)') - self.read_data(ior, container, ppn) + read_data(self, ior, container) self.log_step('Test passed') diff --git a/src/tests/ftest/server/replay.yaml b/src/tests/ftest/server/replay.yaml index 161582536414..1e4fb0b81e0f 100644 --- a/src/tests/ftest/server/replay.yaml +++ b/src/tests/ftest/server/replay.yaml @@ -21,8 +21,7 @@ container: dfs_oclass: SX ior: &ior_base - client_processes: - ppn: 4 + ppn: 4 api: DFS transfer_size: 512K block_size: 1G diff --git a/src/tests/ftest/telemetry/dkey_akey_enum_punch.py b/src/tests/ftest/telemetry/dkey_akey_enum_punch.py index 99481406d8ee..b97d1526b2b7 100644 --- a/src/tests/ftest/telemetry/dkey_akey_enum_punch.py +++ b/src/tests/ftest/telemetry/dkey_akey_enum_punch.py @@ -1,5 +1,5 @@ ''' - (C) Copyright 2018-2023 Intel Corporation. + (C) Copyright 2018-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -232,13 +232,13 @@ def test_dkey_akey_enum_punch(self): # Obtain and verify the io metrics 1 to 4. ### # engine_pool_ops_dkey_enum - pool_dkey_enum = self.telemetry.ENGINE_POOL_METRICS[5] + pool_dkey_enum = self.telemetry.ENGINE_POOL_OPS_DKEY_ENUM_METRICS # engine_pool_ops_akey_enum - pool_akey_enum = self.telemetry.ENGINE_POOL_METRICS[2] + pool_akey_enum = self.telemetry.ENGINE_POOL_OPS_AKEY_ENUM_METRICS # engine_pool_ops_dkey_punch - pool_dkey_punch = self.telemetry.ENGINE_POOL_METRICS[6] + pool_dkey_punch = self.telemetry.ENGINE_POOL_OPS_DKEY_PUNCH_METRICS # engine_pool_ops_akey_punch - pool_akey_punch = self.telemetry.ENGINE_POOL_METRICS[3] + pool_akey_punch = self.telemetry.ENGINE_POOL_OPS_AKEY_PUNCH_METRICS specific_metrics = [ pool_dkey_enum, pool_akey_enum, pool_dkey_punch, pool_akey_punch, @@ -357,9 +357,9 @@ def test_pool_tgt_dkey_akey_punch(self): self.telemetry.dmg.verbose = False - # Obtain and verify the pool metrics 1 and 2 ### - pool_tgt_dkey_punch = self.telemetry.ENGINE_POOL_METRICS[21] - pool_tgt_akey_punch = self.telemetry.ENGINE_POOL_METRICS[20] + # Obtain and verify the pool target punch metrics + pool_tgt_dkey_punch = self.telemetry.ENGINE_POOL_OPS_TGT_DKEY_PUNCH_METRICS + pool_tgt_akey_punch = self.telemetry.ENGINE_POOL_OPS_TGT_AKEY_PUNCH_METRICS specific_metrics = [pool_tgt_dkey_punch, pool_tgt_akey_punch] pool_out = self.telemetry.get_pool_metrics( specific_metrics=specific_metrics) diff --git a/src/tests/ftest/telemetry/wal_metrics.py b/src/tests/ftest/telemetry/wal_metrics.py new file mode 100644 index 000000000000..dc553f4b106a --- /dev/null +++ b/src/tests/ftest/telemetry/wal_metrics.py @@ -0,0 +1,217 @@ +""" + (C) Copyright 2018-2024 Intel Corporation. + + SPDX-License-Identifier: BSD-2-Clause-Patent +""" +import time + +from ior_utils import write_data +from telemetry_test_base import TestWithTelemetry +from test_utils_pool import add_pool + + +class WalMetrics(TestWithTelemetry): + """Tests for new specific metrics to track activity of md_on_ssd. + + :avocado: recursive + """ + + def test_wal_commit_metrics(self): + """JIRA ID: DAOS-11626. + + The WAL commit metrics is per-pool metrics, it includes 'wal_sz', 'wal_qd' and 'wal_waiters' + (see vos_metrics_alloc() in src/vos/vos_common.c). 
WAL commit metrics are updated on each + local transaction (for example, a transaction for an update request, etc.) + + Test steps: + 1) Create a pool + 2) Verify WAL commit metrics after pool creation (non-zero w/ MD on SSD) + + :avocado: tags=all,daily_regression + :avocado: tags=hw,medium + :avocado: tags=telemetry + :avocado: tags=WalMetrics,test_wal_commit_metrics + """ + wal_metrics = list(self.telemetry.ENGINE_POOL_VOS_WAL_METRICS) + + self.log_step('Creating a pool (dmg pool create)') + add_pool(self) + + self.log_step( + 'Collect WAL commit metrics after creating a pool (dmg telemetry metrics query)') + ranges = self.telemetry.collect_data(wal_metrics) + for metric in list(ranges): + if '_sz' in metric and not metric.endswith('_mean') and not metric.endswith('_stddev'): + for label in ranges[metric]: + if self.server_managers[0].manager.job.using_control_metadata: + # The min/max/actual size should be greater than 0 for MD on SSD + ranges[metric][label] = [1] + else: + ranges[metric][label] = [0, 0] + elif '_waiters' not in metric: + ranges.pop(metric) + if self.server_managers[0].manager.job.using_control_metadata: + self.log_step( + 'Verify WAL commit size metrics are > 0 and waiters are 0 after creating a pool') + else: + self.log_step('Verify WAL commit metrics are 0 after creating a pool') + if not self.telemetry.verify_data(ranges): + self.fail('Unexpected WAL commit metric values after pool create') + + self.log_step('Test passed') + + def test_wal_reply_metrics(self): + """JIRA ID: DAOS-11626. + + The WAL replay metrics are per-pool metrics in 'vos_rehydration' under each pool folder; they + include 'replay_size', 'replay_time', 'replay_entries', 'replay_count' and + 'replay_transactions' (see vos_metrics_alloc() in src/vos/vos_common.c). WAL replay + metrics are only updated when a pool is opened on engine start (or when a pool is created). 
+ + Test steps: + 1) Create a pool + 2) Verify WAL replay metrics after pool creation (non-zero w/ MD on SSD) + + :avocado: tags=all,daily_regression + :avocado: tags=hw,medium + :avocado: tags=telemetry + :avocado: tags=WalMetrics,test_wal_reply_metrics + """ + wal_metrics = list(self.telemetry.ENGINE_POOL_VOS_WAL_REPLAY_METRICS) + + self.log_step('Creating a pool (dmg pool create)') + add_pool(self) + + self.log_step( + 'Collect WAL replay metrics after creating a pool (dmg telemetry metrics query)') + ranges = self.telemetry.collect_data(wal_metrics) + for metric in sorted(ranges): + for label in ranges[metric]: + if self.server_managers[0].manager.job.using_control_metadata: + if metric.endswith('_replay_count'): + # Replay count should be 1 after pool create for MD on SSD + ranges[metric][label] = [1, 1] + elif metric.endswith('_replay_entries'): + # Replay entries should be > 0 after pool create for MD on SSD + ranges[metric][label] = [1] + elif metric.endswith('_replay_size'): + # Replay size should be > 0 after pool create for MD on SSD + ranges[metric][label] = [1] + elif metric.endswith('_replay_time'): + # Replay time should be 10,000 - 50,000 after pool create for MD on SSD + ranges[metric][label] = [10000, 50000] + elif metric.endswith('_replay_transactions'): + # Replay transactions should be > 0 after pool create for MD on SSD + ranges[metric][label] = [1] + else: + ranges[metric][label] = [0, 0] + + self.log_step('Verify WAL replay metrics after pool creation (dmg telemetry metrics query)') + if not self.telemetry.verify_data(ranges): + self.fail('WAL replay metrics verification failed after pool creation') + + self.log_step('Test passed') + + def test_wal_checkpoint_metrics(self): + """JIRA ID: DAOS-11626. + + The WAL checkpoint metrics are per-pool metrics in 'checkpoint' under each pool folder; they + include 'duration', 'dirty_pages', 'dirty_chunks', 'iovs_copied' and 'wal_purged' (see + vos_chkpt_metrics_init() in src/vos/vos_pool.c). WAL checkpoint metrics are updated on + check pointing. Check pointing regularly happens in the background (see the 'Checkpoint policy' + in the manual); when there is nothing to be checkpoint-ed (no new commits since the last + checkpoint), the checkpoint is a no-op and the metrics won't be updated. 
+ + Test steps: + 1) Create a pool w/o check pointing + 2) Verify WAL checkpoint metrics are zero after pool creation + 3) Create a second pool w/ check pointing enabled + 4) Verify WAL checkpoint metrics are zero for both pools after pool creation + 5) Write some data to a container in the second pool + 6) Wait enough time for check pointing to have occurred + 7) Verify WAL checkpoint purged metrics are non-zero for the second pool (for MD on SSD) + + :avocado: tags=all,daily_regression + :avocado: tags=hw,medium + :avocado: tags=telemetry + :avocado: tags=WalMetrics,test_wal_checkpoint_metrics + """ + frequency = 5 + wal_metrics = list(self.telemetry.ENGINE_POOL_CHECKPOINT_METRICS) + + self.log_step('Creating a pool with check pointing disabled (dmg pool create)') + add_pool(self, properties='checkpoint:disabled') + + self.log_step( + 'Collect WAL checkpoint metrics after creating a pool w/o check pointing ' + '(dmg telemetry metrics query)') + ranges = self.telemetry.collect_data(wal_metrics) + for metric, values in ranges.items(): + for label in values: + # Initially all metrics should be 0 + values[label] = [0, 0] + + self.log_step( + 'Verifying WAL checkpoint metrics are all 0 after creating a pool w/o check pointing') + if not self.telemetry.verify_data(ranges): + self.fail('WAL check point metrics not zero after creating a pool w/o check pointing') + + self.log_step('Creating a pool with timed check pointing (dmg pool create)') + pool = add_pool(self, properties=f'checkpoint:timed,checkpoint_freq:{frequency}') + + self.log_step( + 'Collect WAL checkpoint metrics after creating a pool w/ check pointing ' + '(dmg telemetry metrics query)') + ranges = self.telemetry.collect_data(wal_metrics) + for metric, values in ranges.items(): + for label in values: + uuid = pool.uuid + if uuid in label and self.server_managers[0].manager.job.using_control_metadata: + if '_dirty_chunks' in metric: + # Check point dirty chunks should be 0-300 after pool create for MD on SSD + values[label] = [0, 300] + elif '_dirty_pages' in metric: + # Check point dirty pages should be 0-3 after pool create for MD on SSD + values[label] = [0, 3] + elif '_duration' in metric: + # Check point duration should be 0-1,000,000 after pool create for MD on SSD + values[label] = [0, 1000000] + elif '_iovs_copied' in metric: + # Check point iovs copied should be >= 0 after pool create for MD on SSD + values[label] = [0] + elif '_wal_purged' in metric: + # Check point wal purged should be >= 0 after pool create for MD on SSD + values[label] = [0] + else: + # All metrics for the pool w/o check pointing or w/o MD on SSD should be 0 + values[label] = [0, 0] + self.log_step('Verifying WAL check point metrics after creating a pool w/ check pointing') + if not self.telemetry.verify_data(ranges): + self.fail('WAL replay metrics verification failed after pool w/ check pointing create') + + self.log_step('Creating a container for the pool w/ check pointing (daos container create)') + container = self.get_container(pool) + self.log.info('%s check point frequency: %s seconds', container.pool, frequency) + + self.log_step('Writing data to the pool w/ check pointing (ior)') + write_data(self, container) + + self.log_step(f'Waiting for check pointing to complete (sleep {frequency * 2})') + time.sleep(frequency * 2) + + self.log_step('Collect WAL checkpoint metrics after check pointing is complete') + self.telemetry.collect_data(wal_metrics) + if self.server_managers[0].manager.job.using_control_metadata: + for metric, values in 
ranges.items(): + for label in values: + if pool.uuid in label: + if '_wal_purged' in metric: + # Check point wal purged should be > 0 after check point for MD on SSD + values[label] = [1] + self.log_step( + 'Verify WAL checkpoint metrics after check pointing is complete ' + '(dmg telemetry metrics query)') + if not self.telemetry.verify_data(ranges): + self.fail('WAL replay metrics verification failed after check pointing completion') + + self.log_step('Test passed') diff --git a/src/tests/ftest/telemetry/wal_metrics.yaml b/src/tests/ftest/telemetry/wal_metrics.yaml new file mode 100644 index 000000000000..71ba8cbc17b2 --- /dev/null +++ b/src/tests/ftest/telemetry/wal_metrics.yaml @@ -0,0 +1,34 @@ +hosts: + test_servers: 2 + test_clients: 2 + +timeout: 180 + +server_config: + engines_per_host: 1 + engines: + 0: + storage: auto + +pool: + size: 20G + +container: + control_method: daos + type: POSIX + dfs_oclass: SX + +ior: &ior_base + ppn: 4 + api: DFS + transfer_size: 512K + block_size: 1G + dfs_oclass: SX + +ior_write: + <<: *ior_base + flags: "-k -v -w -W -G 1" + +ior_read: + <<: *ior_base + flags: "-v -r -R -G 1" diff --git a/src/tests/ftest/util/ior_utils.py b/src/tests/ftest/util/ior_utils.py index cd54b0e19aff..b729afc00ee0 100644 --- a/src/tests/ftest/util/ior_utils.py +++ b/src/tests/ftest/util/ior_utils.py @@ -1,5 +1,5 @@ """ -(C) Copyright 2018-2023 Intel Corporation. +(C) Copyright 2018-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -13,6 +13,7 @@ from duns_utils import format_path from exception_utils import CommandFailure from general_utils import get_log_file +from job_manager_utils import get_job_manager def get_ior(test, manager, hosts, path, slots, namespace="/run/ior/*", ior_params=None): @@ -139,6 +140,83 @@ def thread_run_ior(thread_queue, job_id, test, manager, log, hosts, path, slots, thread_queue.put(thread_result) +def write_data(test, container, namespace='/run/ior_write/*', **ior_run_params): + """Write data to the container/dfuse using ior. + + Simple method for test classes to use to write data with ior. While not required, this is setup + by default to pull in ior parameters from the test yaml using a format similar to: + + ior: &ior_base + api: DFS + transfer_size: 512K + block_size: 1G + ppn: 2 + + ior_write: + <<: *ior_base + flags: "-k -v -w -W -G 1" + + ior_read: + <<: *ior_base + flags: "-v -r -R -G 1" + + Args: + test (Test): avocado Test object + container (TestContainer): the container to populate + namespace (str, optional): path to ior yaml parameters. Defaults to '/run/ior_write/*'. + ior_run_params (dict): optional params for the Ior.run() command, like ppn, dfuse, etc. + + Returns: + Ior: the Ior object used to populate the container + """ + job_manager = get_job_manager(test, subprocess=False, timeout=60) + ior = get_ior(test, job_manager, test.hostlist_clients, test.workdir, None, namespace) + + if 'processes' not in ior_run_params: + ior_run_params['processes'] = test.params.get('processes', namespace, None) + elif 'ppn' not in ior_run_params: + ior_run_params['ppn'] = test.params.get('ppn', namespace, None) + + ior.run(test.server_group, container.pool, container, **ior_run_params) + return ior + + +def read_data(test, ior, container, namespace='/run/ior_read/*', **ior_run_params): + """Verify the data used to populate the container. + + Simple method for test classes to use to read data with ior designed to be used with the Ior + object returned by the write_data() method. 
While not required, this is setup by default to pull + in ior parameters from the test yaml using a format similar to: + + ior: &ior_base + api: DFS + transfer_size: 512K + block_size: 1G + ppn: 2 + + ior_write: + <<: *ior_base + flags: "-k -v -w -W -G 1" + + ior_read: + <<: *ior_base + flags: "-v -r -R -G 1" + + Args: + test (Test): avocado Test object + ior (Ior): the ior command used to populate the container + container (TestContainer): the container to verify + namespace (str, optional): path to ior yaml parameters. Defaults to '/run/ior_read/*'. + ior_run_params (dict): optional params for the Ior.run() command, like ppn, dfuse, etc. + """ + if 'processes' not in ior_run_params: + ior_run_params['processes'] = test.params.get('processes', namespace, None) + elif 'ppn' not in ior_run_params: + ior_run_params['ppn'] = test.params.get('ppn', namespace, 1) + ior.update('flags', test.params.get('flags', namespace)) + ior.run(test.server_group, container.pool, container, **ior_run_params) + + class IorCommand(SubProcessCommand): # pylint: disable=too-many-instance-attributes # pylint: disable=wrong-spelling-in-docstring diff --git a/src/tests/ftest/util/telemetry_utils.py b/src/tests/ftest/util/telemetry_utils.py index 1a51cf23848f..0666e47bd7f5 100644 --- a/src/tests/ftest/util/telemetry_utils.py +++ b/src/tests/ftest/util/telemetry_utils.py @@ -3,6 +3,8 @@ SPDX-License-Identifier: BSD-2-Clause-Patent """ +# pylint: disable=too-many-lines +import copy import re from logging import getLogger @@ -41,14 +43,45 @@ class TelemetryUtils(): "engine_pool_ops_cont_create", "engine_pool_ops_cont_destroy", "engine_pool_ops_cont_query"] - ENGINE_POOL_METRICS = [ + ENGINE_POOL_ACTION_METRICS = [ + "engine_pool_resent", + "engine_pool_restarted", + "engine_pool_retry", + "engine_pool_started_at", + "engine_pool_xferred_fetch", + "engine_pool_xferred_update"] + ENGINE_POOL_BLOCK_ALLOCATOR_METRICS = [ + "engine_pool_block_allocator_alloc_hint", + "engine_pool_block_allocator_alloc_large", + "engine_pool_block_allocator_alloc_small", + "engine_pool_block_allocator_frags_aging", + "engine_pool_block_allocator_frags_large", + "engine_pool_block_allocator_frags_small", + "engine_pool_block_allocator_free_blks"] + ENGINE_POOL_CHECKPOINT_METRICS = [ + *_gen_stats_metrics("engine_pool_checkpoint_dirty_chunks"), + *_gen_stats_metrics("engine_pool_checkpoint_dirty_pages"), + *_gen_stats_metrics("engine_pool_checkpoint_duration"), + *_gen_stats_metrics("engine_pool_checkpoint_iovs_copied"), + *_gen_stats_metrics("engine_pool_checkpoint_wal_purged")] + ENGINE_POOL_EC_UPDATE_METRICS = [ + "engine_pool_EC_update_full_stripe", + "engine_pool_EC_update_partial"] + ENGINE_POOL_ENTRIES_METRICS = [ "engine_pool_entries_dtx_batched_degree", - "engine_pool_entries_dtx_batched_total", - "engine_pool_ops_akey_enum", - "engine_pool_ops_akey_punch", + "engine_pool_entries_dtx_batched_total"] + ENGINE_POOL_OPS_AKEY_ENUM_METRICS = "engine_pool_ops_akey_enum" + ENGINE_POOL_OPS_DKEY_ENUM_METRICS = "engine_pool_ops_dkey_enum" + ENGINE_POOL_OPS_AKEY_PUNCH_METRICS = "engine_pool_ops_akey_punch" + ENGINE_POOL_OPS_DKEY_PUNCH_METRICS = "engine_pool_ops_dkey_punch" + ENGINE_POOL_OPS_TGT_AKEY_PUNCH_METRICS = "engine_pool_ops_tgt_akey_punch" + ENGINE_POOL_OPS_TGT_DKEY_PUNCH_METRICS = "engine_pool_ops_tgt_dkey_punch" + ENGINE_POOL_OPS_METRICS = [ + ENGINE_POOL_OPS_AKEY_ENUM_METRICS, + ENGINE_POOL_OPS_DKEY_ENUM_METRICS, + ENGINE_POOL_OPS_AKEY_PUNCH_METRICS, + ENGINE_POOL_OPS_DKEY_PUNCH_METRICS, "engine_pool_ops_compound", - 
"engine_pool_ops_dkey_enum", - "engine_pool_ops_dkey_punch", "engine_pool_ops_dtx_abort", "engine_pool_ops_dtx_check", "engine_pool_ops_dtx_commit", @@ -57,13 +90,14 @@ class TelemetryUtils(): "engine_pool_ops_ec_rep", "engine_pool_ops_fetch", "engine_pool_ops_key_query", + "engine_pool_ops_key2anchor", "engine_pool_ops_migrate", "engine_pool_ops_obj_enum", "engine_pool_ops_obj_punch", "engine_pool_ops_obj_sync", "engine_pool_ops_recx_enum", - "engine_pool_ops_tgt_akey_punch", - "engine_pool_ops_tgt_dkey_punch", + ENGINE_POOL_OPS_TGT_AKEY_PUNCH_METRICS, + ENGINE_POOL_OPS_TGT_DKEY_PUNCH_METRICS, "engine_pool_ops_tgt_punch", "engine_pool_ops_tgt_update", "engine_pool_ops_update", @@ -71,10 +105,8 @@ class TelemetryUtils(): "engine_pool_ops_pool_disconnect", "engine_pool_ops_pool_evict", "engine_pool_ops_pool_query", - "engine_pool_ops_pool_query_space", - "engine_pool_resent", - "engine_pool_restarted", - "engine_pool_retry", + "engine_pool_ops_pool_query_space"] + ENGINE_POOL_SCRUBBER_METRICS = [ "engine_pool_scrubber_busy_time", "engine_pool_scrubber_bytes_scrubbed_current", "engine_pool_scrubber_bytes_scrubbed_prev", @@ -88,8 +120,8 @@ class TelemetryUtils(): "engine_pool_scrubber_next_tree_scrub", *_gen_stats_metrics("engine_pool_scrubber_prev_duration"), "engine_pool_scrubber_scrubber_started", - "engine_pool_scrubber_scrubs_completed", - "engine_pool_started_at", + "engine_pool_scrubber_scrubs_completed"] + ENGINE_POOL_VOS_AGGREGATION_METRICS = [ "engine_pool_vos_aggregation_akey_deleted", "engine_pool_vos_aggregation_akey_scanned", "engine_pool_vos_aggregation_akey_skipped", @@ -105,21 +137,31 @@ class TelemetryUtils(): "engine_pool_vos_aggregation_obj_deleted", "engine_pool_vos_aggregation_obj_scanned", "engine_pool_vos_aggregation_obj_skipped", - "engine_pool_vos_aggregation_uncommitted", + "engine_pool_vos_aggregation_uncommitted"] + ENGINE_POOL_VOS_SPACE_METRICS = [ "engine_pool_vos_space_nvme_used", - "engine_pool_vos_space_scm_used", - "engine_pool_xferred_fetch", - "engine_pool_xferred_update", - "engine_pool_EC_update_full_stripe", - "engine_pool_EC_update_partial", - "engine_pool_block_allocator_alloc_hint", - "engine_pool_block_allocator_alloc_large", - "engine_pool_block_allocator_alloc_small", - "engine_pool_block_allocator_frags_aging", - "engine_pool_block_allocator_frags_large", - "engine_pool_block_allocator_frags_small", - "engine_pool_block_allocator_free_blks", - "engine_pool_ops_key2anchor"] + "engine_pool_vos_space_scm_used"] + ENGINE_POOL_VOS_WAL_METRICS = [ + *_gen_stats_metrics("engine_pool_vos_wal_wal_sz"), + *_gen_stats_metrics("engine_pool_vos_wal_wal_qd"), + *_gen_stats_metrics("engine_pool_vos_wal_wal_waiters")] + ENGINE_POOL_VOS_WAL_REPLAY_METRICS = [ + "engine_pool_vos_wal_replay_count", + "engine_pool_vos_wal_replay_entries", + "engine_pool_vos_wal_replay_size", + "engine_pool_vos_wal_replay_time", + "engine_pool_vos_wal_replay_transactions"] + ENGINE_POOL_METRICS = ENGINE_POOL_ACTION_METRICS +\ + ENGINE_POOL_BLOCK_ALLOCATOR_METRICS +\ + ENGINE_POOL_CHECKPOINT_METRICS +\ + ENGINE_POOL_EC_UPDATE_METRICS +\ + ENGINE_POOL_ENTRIES_METRICS +\ + ENGINE_POOL_OPS_METRICS +\ + ENGINE_POOL_SCRUBBER_METRICS +\ + ENGINE_POOL_VOS_AGGREGATION_METRICS +\ + ENGINE_POOL_VOS_SPACE_METRICS + \ + ENGINE_POOL_VOS_WAL_METRICS + \ + ENGINE_POOL_VOS_WAL_REPLAY_METRICS ENGINE_EVENT_METRICS = [ "engine_events_dead_ranks", "engine_events_last_event_ts", @@ -383,6 +425,7 @@ def __init__(self, dmg, servers): self.log = getLogger(__name__) self.dmg = dmg self.hosts = 
NodeSet.fromlist(servers) + self._data = MetricData() def get_all_server_metrics_names(self, server, with_pools=False): """Get all the telemetry metrics names for this server. @@ -449,6 +492,43 @@ def list_metrics(self): info[host].append(entry["name"]) return info + def collect_data(self, names): + """Collect telemetry data for the specified metrics. + + Args: + names (list): list of metric names + + Returns: + dict: dictionary of metric values keyed by the metric name and combination of metric + labels and values, e.g. + <metric_name>: { + <label_1>: <value_1>, + <label_2>: <value_2>, + ... + }, + ... + """ + return self._data.collect(self.log, names, self.hosts, self.dmg) + + def display_data(self): + """Display the telemetry metric values.""" + return self._data.display(self.log) + + def verify_data(self, ranges): + """Verify the telemetry metric values. + + Args: + ranges (dict): dictionary of min/max lists for each metric to be verified, e.g. + { + <metric_name>: [10], <--- will verify value of <metric_name> is at least 10 + <metric_name>: [0, 9] <--- will verify value of <metric_name> is between 0-9 + } + + Returns: + bool: True if all metric values are within the ranges specified; False otherwise + """ + return self._data.verify(self.log, ranges) + def get_metrics(self, name): """Obtain the specified metric information for each host. @@ -651,7 +731,7 @@ def get_nvme_metrics(self, specific_metrics=None): """Get the NVMe telemetry metrics. Args: - specific_metrics(list): list of specific NVMe metrics + specific_metrics (list): list of specific NVMe metrics Returns: dict: dictionary of dictionaries of NVMe metric names and @@ -728,3 +808,253 @@ def verify_metric_value(self, metrics_data, min_value=None, max_value=None): self.log.info(" %-12s %-4s %s %s", host, rank, value, invalid) return status + + +class MetricData(): + """Defines an object used to collect, display, and verify telemetry metric data.""" + + def __init__(self): + """Initialize a MetricData object.""" + self._data = {} + self._display = {'data': {}, 'labels': set(), 'widths': {}} + + def collect(self, log, names, hosts, dmg): + """Collect telemetry data for the specified metrics. + + Args: + log (logger): logger for the messages produced by this method + names (list): list of metric names + hosts (NodeSet): set of servers from which to collect the telemetry metrics + dmg (DmgCommand): the DmgCommand object configured to communicate with the servers + + Returns: + dict: dictionary of metric values keyed by the metric name and combination of metric + labels and values, e.g. + <metric_name>: { + <label_1>: <value_1>, + <label_2>: <value_2>, + ... + }, + ... + """ + info = self._get_metrics(log, ','.join(names), hosts, dmg) + self._data = self._get_data(names, info) + return copy.deepcopy(self._data) + + def display(self, log): + """Display the telemetry metric values. + + Args: + log (logger): logger for the messages produced by this method + """ + self._set_display() + columns = ['metric'] + self._display['labels'] + ['value'] + format_str = ' '.join([f"%-{self._display['widths'][name]}s" for name in columns]) + + log.info('-' * 80) + log.info('Telemetry Metric Information') + log.info(format_str, *[name.title() for name in columns]) + log.info(format_str, *['-' * self._display['widths'][name] for name in columns]) + for metric in sorted(self._display['data']): + for value, labels_list in self._display['data'][metric].items(): + for labels in labels_list: + log.info(format_str, metric, *self._label_values(labels), value) + + def verify(self, log, ranges): + """Verify the telemetry metric values. 
+ + Args: + log (logger): logger for the messages produced by this method + ranges (dict): dictionary of expected metric value ranges with a minimum metric key and + optional label key to at least a minimum metric value and optional maximum metric + value, e.g. + {<metric>: <min>} or + {<metric>: [<min>]} or + {<metric>: [<min>, <max>]} or + {<metric>: {
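For reference, the WAL tests added above drive these collect/verify helpers in a pattern like the following minimal sketch; the metric list and the all-zero expected ranges are illustrative assumptions rather than lines taken from the patch:

wal_metrics = list(self.telemetry.ENGINE_POOL_VOS_WAL_METRICS)
ranges = self.telemetry.collect_data(wal_metrics)   # query telemetry via dmg and keep the reported values
for metric, labels in ranges.items():
    for label in labels:
        labels[label] = [0, 0]   # expect each reported value to be exactly 0
if not self.telemetry.verify_data(ranges):   # compare the collected values against the expected ranges
    self.fail('Unexpected WAL metric values')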