From 24cf0ed6e4f74be09a542fd1c478f5082b7646dd Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Wed, 27 Mar 2024 07:46:14 -0400 Subject: [PATCH] DAOS-11626 test: Adding MD on SSD metrics tests (#13661) Adding tests for WAL commit, reply, and checkpoint metrics. Signed-off-by: Phil Henderson --- src/tests/ftest/server/replay.py | 58 +-- src/tests/ftest/server/replay.yaml | 3 +- .../ftest/telemetry/dkey_akey_enum_punch.py | 16 +- src/tests/ftest/telemetry/wal_metrics.py | 217 ++++++++++ src/tests/ftest/telemetry/wal_metrics.yaml | 34 ++ src/tests/ftest/util/ior_utils.py | 80 +++- src/tests/ftest/util/telemetry_utils.py | 388 ++++++++++++++++-- 7 files changed, 709 insertions(+), 87 deletions(-) create mode 100644 src/tests/ftest/telemetry/wal_metrics.py create mode 100644 src/tests/ftest/telemetry/wal_metrics.yaml diff --git a/src/tests/ftest/server/replay.py b/src/tests/ftest/server/replay.py index 1b9f08b114cf..61e8f50a6f49 100644 --- a/src/tests/ftest/server/replay.py +++ b/src/tests/ftest/server/replay.py @@ -9,8 +9,7 @@ from apricot import TestWithServers from dfuse_utils import get_dfuse, start_dfuse, stop_dfuse from general_utils import join -from ior_utils import get_ior -from job_manager_utils import get_job_manager +from ior_utils import read_data, write_data from test_utils_pool import add_pool @@ -38,24 +37,6 @@ def create_container(self, details=None, **pool_params): self.log_step(join(' ', 'Creating a container (daos container create)', '-', details)) return self.get_container(pool) - def write_data(self, container, ppn, dfuse=None): - """Write data to the container/dfuse using ior. - - Args: - container (TestContainer): the container to populate - ppn (int): processes per node to use with the ior command - dfuse (Dfuse, optional): dfuse object defining the dfuse mount point. Defaults to None. - - Returns: - Ior: the Ior object used to populate the container - """ - job_manager = get_job_manager(self, subprocess=False, timeout=60) - ior = get_ior( - self, job_manager, self.hostlist_clients, self.workdir, None, - namespace='/run/ior_write/*') - ior.run(self.server_group, container.pool, container, None, ppn, dfuse=dfuse) - return ior - def stop_engines(self): """Stop each server engine and verify they are not running.""" self.log_step('Shutting down the engines (dmg system stop)') @@ -80,18 +61,6 @@ def restart_engines(self): self.log.info('Ranks %s failed to start', rank_check) self.fail('Failed to start ranks cleanly') - def read_data(self, ior, container, ppn, dfuse=None): - """Verify the data used to populate the container. - - Args: - ior (Ior): the ior command used to populate the container - container (TestContainer): the container to verify - ppn (int): processes per node to use with the ior command - dfuse (Dfuse, optional): dfuse object defining the dfuse mount point. Defaults to None. - """ - ior.update('flags', self.params.get('flags', '/run/ior_read/*')) - ior.run(self.server_group, container.pool, container, None, ppn, dfuse=dfuse) - def verify_snapshots(self, container, expected): """Verify the snapshots listed for the container match the expected list of snapshots. 
@@ -126,17 +95,16 @@ def test_restart(self): :avocado: tags=server,replay :avocado: tags=ReplayTests,test_restart """ - ppn = self.params.get('ppn', '/run/ior_write/*', 1) container = self.create_container() self.log_step('Write data to the container (ior)') - ior = self.write_data(container, ppn) + ior = write_data(self, container) self.stop_engines() self.restart_engines() self.log_step('Verifying data previously written to the container (ior)') - self.read_data(ior, container, ppn) + read_data(self, ior, container) self.log_step('Test passed') def test_replay_posix(self): @@ -159,7 +127,6 @@ def test_replay_posix(self): :avocado: tags=server,replay :avocado: tags=ReplayTests,test_replay_posix """ - ppn = self.params.get('ppn', '/run/ior_write/*', 1) container = self.create_container() self.log_step('Start dfuse') @@ -167,7 +134,7 @@ def test_replay_posix(self): start_dfuse(self, dfuse, container.pool, container) self.log_step('Write data to the dfuse mount point (ior)') - ior = self.write_data(container, ppn, dfuse) + ior = write_data(self, container, dfuse=dfuse) self.log_step('After the read has completed, unmount dfuse') stop_dfuse(self, dfuse) @@ -179,10 +146,10 @@ def test_replay_posix(self): start_dfuse(self, dfuse) self.log_step('Verifying data previously written to the dfuse mount point (ior)') - self.read_data(ior, container, ppn, dfuse) + read_data(self, ior, container, dfuse=dfuse) self.log_step('Write additional data to the dfuse mount point (ior)') - ior = self.write_data(container, ppn, dfuse) + ior = write_data(self, container, dfuse=dfuse) self.log.info('Test passed') @@ -210,14 +177,13 @@ def test_replay_snapshots(self): :avocado: tags=server,replay :avocado: tags=ReplayTests,test_replay_snapshots """ - ppn = self.params.get('ppn', '/run/ior_write/*', 1) container = self.create_container() snapshots = [] for index in range(1, 4): step = join(' ', index, 'of', 3) self.log_step(join(' ', 'Write data to the container (ior)', '-', step)) - self.write_data(container, ppn) + write_data(self, container) self.log_step(join(' ', 'Creating a snapshot (daos container create-snap)', '-', step)) snapshots.append(container.create_snap()['response']['epoch']) @@ -348,7 +314,6 @@ def test_replay_no_check_pointing(self): :avocado: tags=server,replay :avocado: tags=ReplayTests,test_replay_no_check_pointing """ - ppn = self.params.get('ppn', '/run/ior_write/*', 1) container = self.create_container() self.log_step('Disabling check pointing on {}'.format(container.pool)) @@ -358,7 +323,7 @@ def test_replay_no_check_pointing(self): self.fail('Pool check pointing not disabled before engine restart') self.log_step('Write data to the container (ior)') - ior = self.write_data(container, ppn) + ior = write_data(self, container) self.stop_engines() self.restart_engines() @@ -371,7 +336,7 @@ def test_replay_no_check_pointing(self): self.fail('Pool check pointing not disabled after engine restart') self.log_step('Verifying data previously written to the container (ior)') - self.read_data(ior, container, ppn) + read_data(self, ior, container) self.log_step('Test passed') def test_replay_check_pointing(self): @@ -392,14 +357,13 @@ def test_replay_check_pointing(self): :avocado: tags=server,replay :avocado: tags=ReplayTests,test_replay_check_pointing """ - ppn = self.params.get('ppn', '/run/ior_write/*', 1) frequency = 5 container = self.create_container( properties=f'checkpoint:timed,checkpoint_freq:{frequency}') self.log.info('%s check point frequency: %s seconds', container.pool, frequency) 
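# With 'checkpoint:timed' and 'checkpoint_freq:5', the engine is expected to checkpoint
# committed WAL transactions roughly every 5 seconds, so the frequency * 2 sleep that
# follows should give at least one checkpoint cycle time to complete before the engines
# are stopped and restarted.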
self.log_step('Write data to the container (ior)') - ior = self.write_data(container, ppn) + ior = write_data(self, container) self.log_step('Waiting for check pointing to complete (sleep {})'.format(frequency * 2)) time.sleep(frequency * 2) @@ -408,5 +372,5 @@ def test_replay_check_pointing(self): self.restart_engines() self.log_step('Verifying data previously written to the container (ior)') - self.read_data(ior, container, ppn) + read_data(self, ior, container) self.log_step('Test passed') diff --git a/src/tests/ftest/server/replay.yaml b/src/tests/ftest/server/replay.yaml index 161582536414..1e4fb0b81e0f 100644 --- a/src/tests/ftest/server/replay.yaml +++ b/src/tests/ftest/server/replay.yaml @@ -21,8 +21,7 @@ container: dfs_oclass: SX ior: &ior_base - client_processes: - ppn: 4 + ppn: 4 api: DFS transfer_size: 512K block_size: 1G diff --git a/src/tests/ftest/telemetry/dkey_akey_enum_punch.py b/src/tests/ftest/telemetry/dkey_akey_enum_punch.py index 99481406d8ee..b97d1526b2b7 100644 --- a/src/tests/ftest/telemetry/dkey_akey_enum_punch.py +++ b/src/tests/ftest/telemetry/dkey_akey_enum_punch.py @@ -1,5 +1,5 @@ ''' - (C) Copyright 2018-2023 Intel Corporation. + (C) Copyright 2018-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -232,13 +232,13 @@ def test_dkey_akey_enum_punch(self): # Obtain and verify the io metrics 1 to 4. ### # engine_pool_ops_dkey_enum - pool_dkey_enum = self.telemetry.ENGINE_POOL_METRICS[5] + pool_dkey_enum = self.telemetry.ENGINE_POOL_OPS_DKEY_ENUM_METRICS # engine_pool_ops_akey_enum - pool_akey_enum = self.telemetry.ENGINE_POOL_METRICS[2] + pool_akey_enum = self.telemetry.ENGINE_POOL_OPS_AKEY_ENUM_METRICS # engine_pool_ops_dkey_punch - pool_dkey_punch = self.telemetry.ENGINE_POOL_METRICS[6] + pool_dkey_punch = self.telemetry.ENGINE_POOL_OPS_DKEY_PUNCH_METRICS # engine_pool_ops_akey_punch - pool_akey_punch = self.telemetry.ENGINE_POOL_METRICS[3] + pool_akey_punch = self.telemetry.ENGINE_POOL_OPS_AKEY_PUNCH_METRICS specific_metrics = [ pool_dkey_enum, pool_akey_enum, pool_dkey_punch, pool_akey_punch, @@ -357,9 +357,9 @@ def test_pool_tgt_dkey_akey_punch(self): self.telemetry.dmg.verbose = False - # Obtain and verify the pool metrics 1 and 2 ### - pool_tgt_dkey_punch = self.telemetry.ENGINE_POOL_METRICS[21] - pool_tgt_akey_punch = self.telemetry.ENGINE_POOL_METRICS[20] + # Obtain and verify the pool target punch metrics + pool_tgt_dkey_punch = self.telemetry.ENGINE_POOL_OPS_TGT_DKEY_PUNCH_METRICS + pool_tgt_akey_punch = self.telemetry.ENGINE_POOL_OPS_TGT_AKEY_PUNCH_METRICS specific_metrics = [pool_tgt_dkey_punch, pool_tgt_akey_punch] pool_out = self.telemetry.get_pool_metrics( specific_metrics=specific_metrics) diff --git a/src/tests/ftest/telemetry/wal_metrics.py b/src/tests/ftest/telemetry/wal_metrics.py new file mode 100644 index 000000000000..dc553f4b106a --- /dev/null +++ b/src/tests/ftest/telemetry/wal_metrics.py @@ -0,0 +1,217 @@ +""" + (C) Copyright 2018-2024 Intel Corporation. + + SPDX-License-Identifier: BSD-2-Clause-Patent +""" +import time + +from ior_utils import write_data +from telemetry_test_base import TestWithTelemetry +from test_utils_pool import add_pool + + +class WalMetrics(TestWithTelemetry): + """Tests for new specific metrics to track activity of md_on_ssd. + + :avocado: recursive + """ + + def test_wal_commit_metrics(self): + """JIRA ID: DAOS-11626. + + The WAL commit metrics is per-pool metrics, it includes 'wal_sz', 'wal_qd' and 'wal_waiters' + (see vos_metrics_alloc() in src/vos/vos_common.c). 
WAL commit metrics are updated on each + local transaction (for example, a transaction for an update request, etc.) + + Test steps: + 1) Create a pool + 2) Verify WAL commit metrics after pool creation (non-zero w/ MD on SSD) + + :avocado: tags=all,daily_regression + :avocado: tags=hw,medium + :avocado: tags=telemetry + :avocado: tags=WalMetrics,test_wal_commit_metrics + """ + wal_metrics = list(self.telemetry.ENGINE_POOL_VOS_WAL_METRICS) + + self.log_step('Creating a pool (dmg pool create)') + add_pool(self) + + self.log_step( + 'Collect WAL commit metrics after creating a pool (dmg telemetry metrics query)') + ranges = self.telemetry.collect_data(wal_metrics) + for metric in list(ranges): + if '_sz' in metric and not metric.endswith('_mean') and not metric.endswith('_stddev'): + for label in ranges[metric]: + if self.server_managers[0].manager.job.using_control_metadata: + # The min/max/actual size should be greater than 0 for MD on SSD + ranges[metric][label] = [1] + else: + ranges[metric][label] = [0, 0] + elif '_waiters' not in metric: + ranges.pop(metric) + if self.server_managers[0].manager.job.using_control_metadata: + self.log_step( + 'Verify WAL commit size metrics are > 0 and waiters are 0 after creating a pool') + else: + self.log_step('Verify WAL commit metrics are 0 after creating a pool') + if not self.telemetry.verify_data(ranges): + self.fail('Unexpected WAL commit metric values after pool create') + + self.log_step('Test passed') + + def test_wal_reply_metrics(self): + """JIRA ID: DAOS-11626. + + The WAL replay metrics are per-pool metrics in 'vos_rehydration' under each pool folder; they + include 'replay_size', 'replay_time', 'replay_entries', 'replay_count' and + 'replay_transactions' (see vos_metrics_alloc() in src/vos/vos_common.c). WAL replay + metrics are only updated when a pool is opened on engine start (or when a pool is created). 
+ + Test steps: + 1) Create a pool + 2) Verify WAL replay metrics after pool creation (non-zero w/ MD on SSD) + + :avocado: tags=all,daily_regression + :avocado: tags=hw,medium + :avocado: tags=telemetry + :avocado: tags=WalMetrics,test_wal_reply_metrics + """ + wal_metrics = list(self.telemetry.ENGINE_POOL_VOS_WAL_REPLAY_METRICS) + + self.log_step('Creating a pool (dmg pool create)') + add_pool(self) + + self.log_step( + 'Collect WAL replay metrics after creating a pool (dmg telemetry metrics query)') + ranges = self.telemetry.collect_data(wal_metrics) + for metric in sorted(ranges): + for label in ranges[metric]: + if self.server_managers[0].manager.job.using_control_metadata: + if metric.endswith('_replay_count'): + # Replay count should be 1 after pool create for MD on SSD + ranges[metric][label] = [1, 1] + elif metric.endswith('_replay_entries'): + # Replay entries should be > 0 after pool create for MD on SSD + ranges[metric][label] = [1] + elif metric.endswith('_replay_size'): + # Replay size should be > 0 after pool create for MD on SSD + ranges[metric][label] = [1] + elif metric.endswith('_replay_time'): + # Replay time should be 10,000 - 50,000 after pool create for MD on SSD + ranges[metric][label] = [10000, 50000] + elif metric.endswith('_replay_transactions'): + # Replay transactions should be > 0 after pool create for MD on SSD + ranges[metric][label] = [1] + else: + ranges[metric][label] = [0, 0] + + self.log_step('Verify WAL replay metrics after pool creation (dmg telemetry metrics query)') + if not self.telemetry.verify_data(ranges): + self.fail('WAL replay metrics verification failed after pool creation') + + self.log_step('Test passed') + + def test_wal_checkpoint_metrics(self): + """JIRA ID: DAOS-11626. + + The WAL checkpoint metrics are per-pool metrics in 'checkpoint' under each pool folder; they + include 'duration', 'dirty_pages', 'dirty_chunks', 'iovs_copied' and 'wal_purged' (see + vos_chkpt_metrics_init() in src/vos/vos_pool.c). WAL checkpoint metrics are updated on + check pointing. Check pointing regularly happens in the background (see the 'Checkpoint policy' + in the manual); when there is nothing to be checkpoint-ed (no new commits since the last + checkpoint), the checkpoint is a no-op and the metrics won't be updated. 
+ + Test steps: + 1) Create a pool w/o check pointing + 2) Verify WAL checkpoint metrics are zero after pool creation + 3) Create a second pool w/ check pointing enabled + 4) Verify WAL checkpoint metrics are zero for both pools after pool creation + 5) Write some data to a container in the second pool + 6) Wait enough time for check pointing to have occurred + 7) Verify WAL checkpoint purged metrics are non-zero for the second pool (for MD on SSD) + + :avocado: tags=all,daily_regression + :avocado: tags=hw,medium + :avocado: tags=telemetry + :avocado: tags=WalMetrics,test_wal_checkpoint_metrics + """ + frequency = 5 + wal_metrics = list(self.telemetry.ENGINE_POOL_CHECKPOINT_METRICS) + + self.log_step('Creating a pool with check pointing disabled (dmg pool create)') + add_pool(self, properties='checkpoint:disabled') + + self.log_step( + 'Collect WAL checkpoint metrics after creating a pool w/o check pointing ' + '(dmg telemetry metrics query)') + ranges = self.telemetry.collect_data(wal_metrics) + for metric, values in ranges.items(): + for label in values: + # Initially all metrics should be 0 + values[label] = [0, 0] + + self.log_step( + 'Verifying WAL checkpoint metrics are all 0 after creating a pool w/o check pointing') + if not self.telemetry.verify_data(ranges): + self.fail('WAL check point metrics not zero after creating a pool w/o check pointing') + + self.log_step('Creating a pool with timed check pointing (dmg pool create)') + pool = add_pool(self, properties=f'checkpoint:timed,checkpoint_freq:{frequency}') + + self.log_step( + 'Collect WAL checkpoint metrics after creating a pool w/ check pointing ' + '(dmg telemetry metrics query)') + ranges = self.telemetry.collect_data(wal_metrics) + for metric, values in ranges.items(): + for label in values: + uuid = pool.uuid + if uuid in label and self.server_managers[0].manager.job.using_control_metadata: + if '_dirty_chunks' in metric: + # Check point dirty chunks should be 0-300 after pool create for MD on SSD + values[label] = [0, 300] + elif '_dirty_pages' in metric: + # Check point dirty pages should be 0-3 after pool create for MD on SSD + values[label] = [0, 3] + elif '_duration' in metric: + # Check point duration should be 0-1,000,000 after pool create for MD on SSD + values[label] = [0, 1000000] + elif '_iovs_copied' in metric: + # Check point iovs copied should be >= 0 after pool create for MD on SSD + values[label] = [0] + elif '_wal_purged' in metric: + # Check point wal purged should be >= 0 after pool create for MD on SSD + values[label] = [0] + else: + # All metrics for the pool w/o check pointing or w/o MD on SSD should be 0 + values[label] = [0, 0] + self.log_step('Verifying WAL check point metrics after creating a pool w/ check pointing') + if not self.telemetry.verify_data(ranges): + self.fail('WAL replay metrics verification failed after pool w/ check pointing create') + + self.log_step('Creating a container for the pool w/ check pointing (daos container create)') + container = self.get_container(pool) + self.log.info('%s check point frequency: %s seconds', container.pool, frequency) + + self.log_step('Writing data to the pool w/ check pointing (ior)') + write_data(self, container) + + self.log_step(f'Waiting for check pointing to complete (sleep {frequency * 2})') + time.sleep(frequency * 2) + + self.log_step('Collect WAL checkpoint metrics after check pointing is complete') + self.telemetry.collect_data(wal_metrics) + if self.server_managers[0].manager.job.using_control_metadata: + for metric, values in 
ranges.items(): + for label in values: + if pool.uuid in label: + if '_wal_purged' in metric: + # Check point wal purged should be > 0 after check point for MD on SSD + values[label] = [1] + self.log_step( + 'Verify WAL checkpoint metrics after check pointing is complete ' + '(dmg telemetry metrics query)') + if not self.telemetry.verify_data(ranges): + self.fail('WAL replay metrics verification failed after check pointing completion') + + self.log_step('Test passed') diff --git a/src/tests/ftest/telemetry/wal_metrics.yaml b/src/tests/ftest/telemetry/wal_metrics.yaml new file mode 100644 index 000000000000..71ba8cbc17b2 --- /dev/null +++ b/src/tests/ftest/telemetry/wal_metrics.yaml @@ -0,0 +1,34 @@ +hosts: + test_servers: 2 + test_clients: 2 + +timeout: 180 + +server_config: + engines_per_host: 1 + engines: + 0: + storage: auto + +pool: + size: 20G + +container: + control_method: daos + type: POSIX + dfs_oclass: SX + +ior: &ior_base + ppn: 4 + api: DFS + transfer_size: 512K + block_size: 1G + dfs_oclass: SX + +ior_write: + <<: *ior_base + flags: "-k -v -w -W -G 1" + +ior_read: + <<: *ior_base + flags: "-v -r -R -G 1" diff --git a/src/tests/ftest/util/ior_utils.py b/src/tests/ftest/util/ior_utils.py index cd54b0e19aff..b729afc00ee0 100644 --- a/src/tests/ftest/util/ior_utils.py +++ b/src/tests/ftest/util/ior_utils.py @@ -1,5 +1,5 @@ """ -(C) Copyright 2018-2023 Intel Corporation. +(C) Copyright 2018-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -13,6 +13,7 @@ from duns_utils import format_path from exception_utils import CommandFailure from general_utils import get_log_file +from job_manager_utils import get_job_manager def get_ior(test, manager, hosts, path, slots, namespace="/run/ior/*", ior_params=None): @@ -139,6 +140,83 @@ def thread_run_ior(thread_queue, job_id, test, manager, log, hosts, path, slots, thread_queue.put(thread_result) +def write_data(test, container, namespace='/run/ior_write/*', **ior_run_params): + """Write data to the container/dfuse using ior. + + Simple method for test classes to use to write data with ior. While not required, this is setup + by default to pull in ior parameters from the test yaml using a format similar to: + + ior: &ior_base + api: DFS + transfer_size: 512K + block_size: 1G + ppn: 2 + + ior_write: + <<: *ior_base + flags: "-k -v -w -W -G 1" + + ior_read: + <<: *ior_base + flags: "-v -r -R -G 1" + + Args: + test (Test): avocado Test object + container (TestContainer): the container to populate + namespace (str, optional): path to ior yaml parameters. Defaults to '/run/ior_write/*'. + ior_run_params (dict): optional params for the Ior.run() command, like ppn, dfuse, etc. + + Returns: + Ior: the Ior object used to populate the container + """ + job_manager = get_job_manager(test, subprocess=False, timeout=60) + ior = get_ior(test, job_manager, test.hostlist_clients, test.workdir, None, namespace) + + if 'processes' not in ior_run_params: + ior_run_params['processes'] = test.params.get('processes', namespace, None) + elif 'ppn' not in ior_run_params: + ior_run_params['ppn'] = test.params.get('ppn', namespace, None) + + ior.run(test.server_group, container.pool, container, **ior_run_params) + return ior + + +def read_data(test, ior, container, namespace='/run/ior_read/*', **ior_run_params): + """Verify the data used to populate the container. + + Simple method for test classes to use to read data with ior designed to be used with the Ior + object returned by the write_data() method. 
While not required, this is setup by default to pull + in ior parameters from the test yaml using a format similar to: + + ior: &ior_base + api: DFS + transfer_size: 512K + block_size: 1G + ppn: 2 + + ior_write: + <<: *ior_base + flags: "-k -v -w -W -G 1" + + ior_read: + <<: *ior_base + flags: "-v -r -R -G 1" + + Args: + test (Test): avocado Test object + ior (Ior): the ior command used to populate the container + container (TestContainer): the container to verify + namespace (str, optional): path to ior yaml parameters. Defaults to '/run/ior_read/*'. + ior_run_params (dict): optional params for the Ior.run() command, like ppn, dfuse, etc. + """ + if 'processes' not in ior_run_params: + ior_run_params['processes'] = test.params.get('processes', namespace, None) + elif 'ppn' not in ior_run_params: + ior_run_params['ppn'] = test.params.get('ppn', namespace, 1) + ior.update('flags', test.params.get('flags', namespace)) + ior.run(test.server_group, container.pool, container, **ior_run_params) + + class IorCommand(SubProcessCommand): # pylint: disable=too-many-instance-attributes # pylint: disable=wrong-spelling-in-docstring diff --git a/src/tests/ftest/util/telemetry_utils.py b/src/tests/ftest/util/telemetry_utils.py index 1a51cf23848f..0666e47bd7f5 100644 --- a/src/tests/ftest/util/telemetry_utils.py +++ b/src/tests/ftest/util/telemetry_utils.py @@ -3,6 +3,8 @@ SPDX-License-Identifier: BSD-2-Clause-Patent """ +# pylint: disable=too-many-lines +import copy import re from logging import getLogger @@ -41,14 +43,45 @@ class TelemetryUtils(): "engine_pool_ops_cont_create", "engine_pool_ops_cont_destroy", "engine_pool_ops_cont_query"] - ENGINE_POOL_METRICS = [ + ENGINE_POOL_ACTION_METRICS = [ + "engine_pool_resent", + "engine_pool_restarted", + "engine_pool_retry", + "engine_pool_started_at", + "engine_pool_xferred_fetch", + "engine_pool_xferred_update"] + ENGINE_POOL_BLOCK_ALLOCATOR_METRICS = [ + "engine_pool_block_allocator_alloc_hint", + "engine_pool_block_allocator_alloc_large", + "engine_pool_block_allocator_alloc_small", + "engine_pool_block_allocator_frags_aging", + "engine_pool_block_allocator_frags_large", + "engine_pool_block_allocator_frags_small", + "engine_pool_block_allocator_free_blks"] + ENGINE_POOL_CHECKPOINT_METRICS = [ + *_gen_stats_metrics("engine_pool_checkpoint_dirty_chunks"), + *_gen_stats_metrics("engine_pool_checkpoint_dirty_pages"), + *_gen_stats_metrics("engine_pool_checkpoint_duration"), + *_gen_stats_metrics("engine_pool_checkpoint_iovs_copied"), + *_gen_stats_metrics("engine_pool_checkpoint_wal_purged")] + ENGINE_POOL_EC_UPDATE_METRICS = [ + "engine_pool_EC_update_full_stripe", + "engine_pool_EC_update_partial"] + ENGINE_POOL_ENTRIES_METRICS = [ "engine_pool_entries_dtx_batched_degree", - "engine_pool_entries_dtx_batched_total", - "engine_pool_ops_akey_enum", - "engine_pool_ops_akey_punch", + "engine_pool_entries_dtx_batched_total"] + ENGINE_POOL_OPS_AKEY_ENUM_METRICS = "engine_pool_ops_akey_enum" + ENGINE_POOL_OPS_DKEY_ENUM_METRICS = "engine_pool_ops_dkey_enum" + ENGINE_POOL_OPS_AKEY_PUNCH_METRICS = "engine_pool_ops_akey_punch" + ENGINE_POOL_OPS_DKEY_PUNCH_METRICS = "engine_pool_ops_dkey_punch" + ENGINE_POOL_OPS_TGT_AKEY_PUNCH_METRICS = "engine_pool_ops_tgt_akey_punch" + ENGINE_POOL_OPS_TGT_DKEY_PUNCH_METRICS = "engine_pool_ops_tgt_dkey_punch" + ENGINE_POOL_OPS_METRICS = [ + ENGINE_POOL_OPS_AKEY_ENUM_METRICS, + ENGINE_POOL_OPS_DKEY_ENUM_METRICS, + ENGINE_POOL_OPS_AKEY_PUNCH_METRICS, + ENGINE_POOL_OPS_DKEY_PUNCH_METRICS, "engine_pool_ops_compound", - 
"engine_pool_ops_dkey_enum", - "engine_pool_ops_dkey_punch", "engine_pool_ops_dtx_abort", "engine_pool_ops_dtx_check", "engine_pool_ops_dtx_commit", @@ -57,13 +90,14 @@ class TelemetryUtils(): "engine_pool_ops_ec_rep", "engine_pool_ops_fetch", "engine_pool_ops_key_query", + "engine_pool_ops_key2anchor", "engine_pool_ops_migrate", "engine_pool_ops_obj_enum", "engine_pool_ops_obj_punch", "engine_pool_ops_obj_sync", "engine_pool_ops_recx_enum", - "engine_pool_ops_tgt_akey_punch", - "engine_pool_ops_tgt_dkey_punch", + ENGINE_POOL_OPS_TGT_AKEY_PUNCH_METRICS, + ENGINE_POOL_OPS_TGT_DKEY_PUNCH_METRICS, "engine_pool_ops_tgt_punch", "engine_pool_ops_tgt_update", "engine_pool_ops_update", @@ -71,10 +105,8 @@ class TelemetryUtils(): "engine_pool_ops_pool_disconnect", "engine_pool_ops_pool_evict", "engine_pool_ops_pool_query", - "engine_pool_ops_pool_query_space", - "engine_pool_resent", - "engine_pool_restarted", - "engine_pool_retry", + "engine_pool_ops_pool_query_space"] + ENGINE_POOL_SCRUBBER_METRICS = [ "engine_pool_scrubber_busy_time", "engine_pool_scrubber_bytes_scrubbed_current", "engine_pool_scrubber_bytes_scrubbed_prev", @@ -88,8 +120,8 @@ class TelemetryUtils(): "engine_pool_scrubber_next_tree_scrub", *_gen_stats_metrics("engine_pool_scrubber_prev_duration"), "engine_pool_scrubber_scrubber_started", - "engine_pool_scrubber_scrubs_completed", - "engine_pool_started_at", + "engine_pool_scrubber_scrubs_completed"] + ENGINE_POOL_VOS_AGGREGATION_METRICS = [ "engine_pool_vos_aggregation_akey_deleted", "engine_pool_vos_aggregation_akey_scanned", "engine_pool_vos_aggregation_akey_skipped", @@ -105,21 +137,31 @@ class TelemetryUtils(): "engine_pool_vos_aggregation_obj_deleted", "engine_pool_vos_aggregation_obj_scanned", "engine_pool_vos_aggregation_obj_skipped", - "engine_pool_vos_aggregation_uncommitted", + "engine_pool_vos_aggregation_uncommitted"] + ENGINE_POOL_VOS_SPACE_METRICS = [ "engine_pool_vos_space_nvme_used", - "engine_pool_vos_space_scm_used", - "engine_pool_xferred_fetch", - "engine_pool_xferred_update", - "engine_pool_EC_update_full_stripe", - "engine_pool_EC_update_partial", - "engine_pool_block_allocator_alloc_hint", - "engine_pool_block_allocator_alloc_large", - "engine_pool_block_allocator_alloc_small", - "engine_pool_block_allocator_frags_aging", - "engine_pool_block_allocator_frags_large", - "engine_pool_block_allocator_frags_small", - "engine_pool_block_allocator_free_blks", - "engine_pool_ops_key2anchor"] + "engine_pool_vos_space_scm_used"] + ENGINE_POOL_VOS_WAL_METRICS = [ + *_gen_stats_metrics("engine_pool_vos_wal_wal_sz"), + *_gen_stats_metrics("engine_pool_vos_wal_wal_qd"), + *_gen_stats_metrics("engine_pool_vos_wal_wal_waiters")] + ENGINE_POOL_VOS_WAL_REPLAY_METRICS = [ + "engine_pool_vos_wal_replay_count", + "engine_pool_vos_wal_replay_entries", + "engine_pool_vos_wal_replay_size", + "engine_pool_vos_wal_replay_time", + "engine_pool_vos_wal_replay_transactions"] + ENGINE_POOL_METRICS = ENGINE_POOL_ACTION_METRICS +\ + ENGINE_POOL_BLOCK_ALLOCATOR_METRICS +\ + ENGINE_POOL_CHECKPOINT_METRICS +\ + ENGINE_POOL_EC_UPDATE_METRICS +\ + ENGINE_POOL_ENTRIES_METRICS +\ + ENGINE_POOL_OPS_METRICS +\ + ENGINE_POOL_SCRUBBER_METRICS +\ + ENGINE_POOL_VOS_AGGREGATION_METRICS +\ + ENGINE_POOL_VOS_SPACE_METRICS + \ + ENGINE_POOL_VOS_WAL_METRICS + \ + ENGINE_POOL_VOS_WAL_REPLAY_METRICS ENGINE_EVENT_METRICS = [ "engine_events_dead_ranks", "engine_events_last_event_ts", @@ -383,6 +425,7 @@ def __init__(self, dmg, servers): self.log = getLogger(__name__) self.dmg = dmg self.hosts = 
NodeSet.fromlist(servers) + self._data = MetricData() def get_all_server_metrics_names(self, server, with_pools=False): """Get all the telemetry metrics names for this server. @@ -449,6 +492,43 @@ def list_metrics(self): info[host].append(entry["name"]) return info + def collect_data(self, names): + """Collect telemetry data for the specified metrics. + + Args: + names (list): list of metric names + + Returns: + dict: dictionary of metric values keyed by the metric name and combination of metric + labels and values, e.g. + <metric_name>: { + <label_1>: <value_1>, + <label_2>: <value_2>, + ... + }, + ... + """ + return self._data.collect(self.log, names, self.hosts, self.dmg) + + def display_data(self): + """Display the telemetry metric values.""" + return self._data.display(self.log) + + def verify_data(self, ranges): + """Verify the telemetry metric values. + + Args: + ranges (dict): dictionary of min/max lists for each metric to be verified, e.g. + { + <metric_name>: [10], <--- will verify value of <metric_name> is at least 10 + <metric_name>: [0, 9] <--- will verify value of <metric_name> is between 0-9 + } + + Returns: + bool: True if all metric values are within the ranges specified; False otherwise + """ + return self._data.verify(self.log, ranges) + def get_metrics(self, name): """Obtain the specified metric information for each host. @@ -651,7 +731,7 @@ def get_nvme_metrics(self, specific_metrics=None): """Get the NVMe telemetry metrics. Args: - specific_metrics(list): list of specific NVMe metrics + specific_metrics (list): list of specific NVMe metrics Returns: dict: dictionary of dictionaries of NVMe metric names and @@ -728,3 +808,253 @@ def verify_metric_value(self, metrics_data, min_value=None, max_value=None): self.log.info(" %-12s %-4s %s %s", host, rank, value, invalid) return status + + +class MetricData(): + """Defines an object used to collect, display, and verify telemetry metric data.""" + + def __init__(self): + """Initialize a MetricData object.""" + self._data = {} + self._display = {'data': {}, 'labels': set(), 'widths': {}} + + def collect(self, log, names, hosts, dmg): + """Collect telemetry data for the specified metrics. + + Args: + log (logger): logger for the messages produced by this method + names (list): list of metric names + hosts (NodeSet): set of servers from which to collect the telemetry metrics + dmg (DmgCommand): the DmgCommand object configured to communicate with the servers + + Returns: + dict: dictionary of metric values keyed by the metric name and combination of metric + labels and values, e.g. + <metric_name>: { + <label_1>: <value_1>, + <label_2>: <value_2>, + ... + }, + ... + """ + info = self._get_metrics(log, ','.join(names), hosts, dmg) + self._data = self._get_data(names, info) + return copy.deepcopy(self._data) + + def display(self, log): + """Display the telemetry metric values. + + Args: + log (logger): logger for the messages produced by this method + """ + self._set_display() + columns = ['metric'] + self._display['labels'] + ['value'] + format_str = ' '.join([f"%-{self._display['widths'][name]}s" for name in columns]) + + log.info('-' * 80) + log.info('Telemetry Metric Information') + log.info(format_str, *[name.title() for name in columns]) + log.info(format_str, *['-' * self._display['widths'][name] for name in columns]) + for metric in sorted(self._display['data']): + for value, labels_list in self._display['data'][metric].items(): + for labels in labels_list: + log.info(format_str, metric, *self._label_values(labels), value) + + def verify(self, log, ranges): + """Verify the telemetry metric values. 
+ + Args: + log (logger): logger for the messages produced by this method + ranges (dict): dictionary of expected metric value ranges with a minimum metric key and + optional label key to at least a minimum metric value and optional maximum metric + value, e.g. + {<metric>: <min>} or + {<metric>: [<min>]} or + {<metric>: [<min>, <max>]} or + {<metric>: {
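For reference, the WAL tests added above drive these collect/verify helpers in a pattern like the following minimal sketch; the metric list and the all-zero expected ranges are illustrative assumptions rather than lines taken from the patch:

wal_metrics = list(self.telemetry.ENGINE_POOL_VOS_WAL_METRICS)
ranges = self.telemetry.collect_data(wal_metrics)   # query telemetry via dmg and keep the reported values
for metric, labels in ranges.items():
    for label in labels:
        labels[label] = [0, 0]   # expect each reported value to be exactly 0
if not self.telemetry.verify_data(ranges):   # compare the collected values against the expected ranges
    self.fail('Unexpected WAL metric values')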