DAOS-11626 test: Adding MD on SSD metrics tests (#13661)

Adding tests for WAL commit, replay, and checkpoint metrics.

Signed-off-by: Phil Henderson <phillip.henderson@intel.com>
daos-stack · Apr 23, 2024 · 24cf0ed · 24cf0ed
1 parent 1ce8e5c
commit 24cf0ed
Show file tree

Hide file tree

Showing 7 changed files with 709 additions and 87 deletions.
diff --git a/src/tests/ftest/server/replay.py b/src/tests/ftest/server/replay.py
@@ -9,8 +9,7 @@
 from apricot import TestWithServers
 from dfuse_utils import get_dfuse, start_dfuse, stop_dfuse
 from general_utils import join
-from ior_utils import get_ior
-from job_manager_utils import get_job_manager
+from ior_utils import read_data, write_data
 from test_utils_pool import add_pool
 
 
@@ -38,24 +37,6 @@ def create_container(self, details=None, **pool_params):
  self.log_step(join(' ', 'Creating a container (daos container create)', '-', details))
  return self.get_container(pool)
 
- def write_data(self, container, ppn, dfuse=None):
- """Write data to the container/dfuse using ior.
-
- Args:
- container (TestContainer): the container to populate
- ppn (int): processes per node to use with the ior command
- dfuse (Dfuse, optional): dfuse object defining the dfuse mount point. Defaults to None.
-
- Returns:
- Ior: the Ior object used to populate the container
- """
- job_manager = get_job_manager(self, subprocess=False, timeout=60)
- ior = get_ior(
- self, job_manager, self.hostlist_clients, self.workdir, None,
- namespace='/run/ior_write/*')
- ior.run(self.server_group, container.pool, container, None, ppn, dfuse=dfuse)
- return ior
-
  def stop_engines(self):
  """Stop each server engine and verify they are not running."""
  self.log_step('Shutting down the engines (dmg system stop)')
@@ -80,18 +61,6 @@ def restart_engines(self):
  self.log.info('Ranks %s failed to start', rank_check)
  self.fail('Failed to start ranks cleanly')
 
- def read_data(self, ior, container, ppn, dfuse=None):
- """Verify the data used to populate the container.
-
- Args:
- ior (Ior): the ior command used to populate the container
- container (TestContainer): the container to verify
- ppn (int): processes per node to use with the ior command
- dfuse (Dfuse, optional): dfuse object defining the dfuse mount point. Defaults to None.
- """
- ior.update('flags', self.params.get('flags', '/run/ior_read/*'))
- ior.run(self.server_group, container.pool, container, None, ppn, dfuse=dfuse)
-
  def verify_snapshots(self, container, expected):
  """Verify the snapshots listed for the container match the expected list of snapshots.
 
@@ -126,17 +95,16 @@ def test_restart(self):
  :avocado: tags=server,replay
  :avocado: tags=ReplayTests,test_restart
  """
- ppn = self.params.get('ppn', '/run/ior_write/*', 1)
  container = self.create_container()
 
  self.log_step('Write data to the container (ior)')
- ior = self.write_data(container, ppn)
+ ior = write_data(self, container)
 
  self.stop_engines()
  self.restart_engines()
 
  self.log_step('Verifying data previously written to the container (ior)')
- self.read_data(ior, container, ppn)
+ read_data(self, ior, container)
  self.log_step('Test passed')
 
  def test_replay_posix(self):
@@ -159,15 +127,14 @@ def test_replay_posix(self):
  :avocado: tags=server,replay
  :avocado: tags=ReplayTests,test_replay_posix
  """
- ppn = self.params.get('ppn', '/run/ior_write/*', 1)
  container = self.create_container()
 
  self.log_step('Start dfuse')
  dfuse = get_dfuse(self, self.hostlist_clients)
  start_dfuse(self, dfuse, container.pool, container)
 
  self.log_step('Write data to the dfuse mount point (ior)')
- ior = self.write_data(container, ppn, dfuse)
+ ior = write_data(self, container, dfuse=dfuse)
 
  self.log_step('After the read has completed, unmount dfuse')
  stop_dfuse(self, dfuse)
@@ -179,10 +146,10 @@ def test_replay_posix(self):
  start_dfuse(self, dfuse)
 
  self.log_step('Verifying data previously written to the dfuse mount point (ior)')
- self.read_data(ior, container, ppn, dfuse)
+ read_data(self, ior, container, dfuse=dfuse)
 
  self.log_step('Write additional data to the dfuse mount point (ior)')
- ior = self.write_data(container, ppn, dfuse)
+ ior = write_data(self, container, dfuse=dfuse)
 
  self.log.info('Test passed')
 
@@ -210,14 +177,13 @@ def test_replay_snapshots(self):
  :avocado: tags=server,replay
  :avocado: tags=ReplayTests,test_replay_snapshots
  """
- ppn = self.params.get('ppn', '/run/ior_write/*', 1)
  container = self.create_container()
 
  snapshots = []
  for index in range(1, 4):
  step = join(' ', index, 'of', 3)
  self.log_step(join(' ', 'Write data to the container (ior)', '-', step))
- self.write_data(container, ppn)
+ write_data(self, container)
 
  self.log_step(join(' ', 'Creating a snapshot (daos container create-snap)', '-', step))
  snapshots.append(container.create_snap()['response']['epoch'])
@@ -348,7 +314,6 @@ def test_replay_no_check_pointing(self):
  :avocado: tags=server,replay
  :avocado: tags=ReplayTests,test_replay_no_check_pointing
  """
- ppn = self.params.get('ppn', '/run/ior_write/*', 1)
  container = self.create_container()
 
  self.log_step('Disabling check pointing on {}'.format(container.pool))
@@ -358,7 +323,7 @@ def test_replay_no_check_pointing(self):
  self.fail('Pool check pointing not disabled before engine restart')
 
  self.log_step('Write data to the container (ior)')
- ior = self.write_data(container, ppn)
+ ior = write_data(self, container)
 
  self.stop_engines()
  self.restart_engines()
@@ -371,7 +336,7 @@ def test_replay_no_check_pointing(self):
  self.fail('Pool check pointing not disabled after engine restart')
 
  self.log_step('Verifying data previously written to the container (ior)')
- self.read_data(ior, container, ppn)
+ read_data(self, ior, container)
  self.log_step('Test passed')
 
  def test_replay_check_pointing(self):
@@ -392,14 +357,13 @@ def test_replay_check_pointing(self):
  :avocado: tags=server,replay
  :avocado: tags=ReplayTests,test_replay_check_pointing
  """
- ppn = self.params.get('ppn', '/run/ior_write/*', 1)
  frequency = 5
  container = self.create_container(
  properties=f'checkpoint:timed,checkpoint_freq:{frequency}')
  self.log.info('%s check point frequency: %s seconds', container.pool, frequency)
 
  self.log_step('Write data to the container (ior)')
- ior = self.write_data(container, ppn)
+ ior = write_data(self, container)
 
  self.log_step('Waiting for check pointing to complete (sleep {})'.format(frequency * 2))
  time.sleep(frequency * 2)
@@ -408,5 +372,5 @@ def test_replay_check_pointing(self):
  self.restart_engines()
 
  self.log_step('Verifying data previously written to the container (ior)')
- self.read_data(ior, container, ppn)
+ read_data(self, ior, container)
  self.log_step('Test passed')
diff --git a/src/tests/ftest/server/replay.yaml b/src/tests/ftest/server/replay.yaml
@@ -21,8 +21,7 @@ container:
  dfs_oclass: SX
 
 ior: &ior_base
- client_processes:
- ppn: 4
+ ppn: 4
  api: DFS
  transfer_size: 512K
  block_size: 1G

diff --git a/src/tests/ftest/telemetry/dkey_akey_enum_punch.py b/src/tests/ftest/telemetry/dkey_akey_enum_punch.py
@@ -1,5 +1,5 @@
 '''
- (C) Copyright 2018-2023 Intel Corporation.
+ (C) Copyright 2018-2024 Intel Corporation.
 
  SPDX-License-Identifier: BSD-2-Clause-Patent
 '''
@@ -232,13 +232,13 @@ def test_dkey_akey_enum_punch(self):
 
  # Obtain and verify the io metrics 1 to 4. ###
  # engine_pool_ops_dkey_enum
- pool_dkey_enum = self.telemetry.ENGINE_POOL_METRICS[5]
+ pool_dkey_enum = self.telemetry.ENGINE_POOL_OPS_DKEY_ENUM_METRICS
  # engine_pool_ops_akey_enum
- pool_akey_enum = self.telemetry.ENGINE_POOL_METRICS[2]
+ pool_akey_enum = self.telemetry.ENGINE_POOL_OPS_AKEY_ENUM_METRICS
  # engine_pool_ops_dkey_punch
- pool_dkey_punch = self.telemetry.ENGINE_POOL_METRICS[6]
+ pool_dkey_punch = self.telemetry.ENGINE_POOL_OPS_DKEY_PUNCH_METRICS
  # engine_pool_ops_akey_punch
- pool_akey_punch = self.telemetry.ENGINE_POOL_METRICS[3]
+ pool_akey_punch = self.telemetry.ENGINE_POOL_OPS_AKEY_PUNCH_METRICS
  specific_metrics = [
  pool_dkey_enum, pool_akey_enum,
  pool_dkey_punch, pool_akey_punch,
@@ -357,9 +357,9 @@ def test_pool_tgt_dkey_akey_punch(self):
 
  self.telemetry.dmg.verbose = False
 
- # Obtain and verify the pool metrics 1 and 2 ###
- pool_tgt_dkey_punch = self.telemetry.ENGINE_POOL_METRICS[21]
- pool_tgt_akey_punch = self.telemetry.ENGINE_POOL_METRICS[20]
+ # Obtain and verify the pool target punch metrics
+ pool_tgt_dkey_punch = self.telemetry.ENGINE_POOL_OPS_TGT_DKEY_PUNCH_METRICS
+ pool_tgt_akey_punch = self.telemetry.ENGINE_POOL_OPS_TGT_AKEY_PUNCH_METRICS
  specific_metrics = [pool_tgt_dkey_punch, pool_tgt_akey_punch]
  pool_out = self.telemetry.get_pool_metrics(
  specific_metrics=specific_metrics)