Skip to content

Commit

Permalink
DAOS-11626 test: Adding MD on SSD metrics tests (#13661)
Browse files Browse the repository at this point in the history
Adding tests for WAL commit, replay, and checkpoint metrics.

Signed-off-by: Phil Henderson <phillip.henderson@intel.com>
  • Loading branch information
phender authored and mjmac committed Apr 23, 2024
1 parent 1ce8e5c commit 24cf0ed
Show file tree
Hide file tree
Showing 7 changed files with 709 additions and 87 deletions.
58 changes: 11 additions & 47 deletions src/tests/ftest/server/replay.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@
from apricot import TestWithServers
from dfuse_utils import get_dfuse, start_dfuse, stop_dfuse
from general_utils import join
from ior_utils import get_ior
from job_manager_utils import get_job_manager
from ior_utils import read_data, write_data
from test_utils_pool import add_pool


Expand Down Expand Up @@ -38,24 +37,6 @@ def create_container(self, details=None, **pool_params):
self.log_step(join(' ', 'Creating a container (daos container create)', '-', details))
return self.get_container(pool)

def write_data(self, container, ppn, dfuse=None):
"""Write data to the container/dfuse using ior.
Args:
container (TestContainer): the container to populate
ppn (int): processes per node to use with the ior command
dfuse (Dfuse, optional): dfuse object defining the dfuse mount point. Defaults to None.
Returns:
Ior: the Ior object used to populate the container
"""
job_manager = get_job_manager(self, subprocess=False, timeout=60)
ior = get_ior(
self, job_manager, self.hostlist_clients, self.workdir, None,
namespace='/run/ior_write/*')
ior.run(self.server_group, container.pool, container, None, ppn, dfuse=dfuse)
return ior

def stop_engines(self):
"""Stop each server engine and verify they are not running."""
self.log_step('Shutting down the engines (dmg system stop)')
Expand All @@ -80,18 +61,6 @@ def restart_engines(self):
self.log.info('Ranks %s failed to start', rank_check)
self.fail('Failed to start ranks cleanly')

def read_data(self, ior, container, ppn, dfuse=None):
"""Verify the data used to populate the container.
Args:
ior (Ior): the ior command used to populate the container
container (TestContainer): the container to verify
ppn (int): processes per node to use with the ior command
dfuse (Dfuse, optional): dfuse object defining the dfuse mount point. Defaults to None.
"""
ior.update('flags', self.params.get('flags', '/run/ior_read/*'))
ior.run(self.server_group, container.pool, container, None, ppn, dfuse=dfuse)

def verify_snapshots(self, container, expected):
"""Verify the snapshots listed for the container match the expected list of snapshots.
Expand Down Expand Up @@ -126,17 +95,16 @@ def test_restart(self):
:avocado: tags=server,replay
:avocado: tags=ReplayTests,test_restart
"""
ppn = self.params.get('ppn', '/run/ior_write/*', 1)
container = self.create_container()

self.log_step('Write data to the container (ior)')
ior = self.write_data(container, ppn)
ior = write_data(self, container)

self.stop_engines()
self.restart_engines()

self.log_step('Verifying data previously written to the container (ior)')
self.read_data(ior, container, ppn)
read_data(self, ior, container)
self.log_step('Test passed')

def test_replay_posix(self):
Expand All @@ -159,15 +127,14 @@ def test_replay_posix(self):
:avocado: tags=server,replay
:avocado: tags=ReplayTests,test_replay_posix
"""
ppn = self.params.get('ppn', '/run/ior_write/*', 1)
container = self.create_container()

self.log_step('Start dfuse')
dfuse = get_dfuse(self, self.hostlist_clients)
start_dfuse(self, dfuse, container.pool, container)

self.log_step('Write data to the dfuse mount point (ior)')
ior = self.write_data(container, ppn, dfuse)
ior = write_data(self, container, dfuse=dfuse)

self.log_step('After the read has completed, unmount dfuse')
stop_dfuse(self, dfuse)
Expand All @@ -179,10 +146,10 @@ def test_replay_posix(self):
start_dfuse(self, dfuse)

self.log_step('Verifying data previously written to the dfuse mount point (ior)')
self.read_data(ior, container, ppn, dfuse)
read_data(self, ior, container, dfuse=dfuse)

self.log_step('Write additional data to the dfuse mount point (ior)')
ior = self.write_data(container, ppn, dfuse)
ior = write_data(self, container, dfuse=dfuse)

self.log.info('Test passed')

Expand Down Expand Up @@ -210,14 +177,13 @@ def test_replay_snapshots(self):
:avocado: tags=server,replay
:avocado: tags=ReplayTests,test_replay_snapshots
"""
ppn = self.params.get('ppn', '/run/ior_write/*', 1)
container = self.create_container()

snapshots = []
for index in range(1, 4):
step = join(' ', index, 'of', 3)
self.log_step(join(' ', 'Write data to the container (ior)', '-', step))
self.write_data(container, ppn)
write_data(self, container)

self.log_step(join(' ', 'Creating a snapshot (daos container create-snap)', '-', step))
snapshots.append(container.create_snap()['response']['epoch'])
Expand Down Expand Up @@ -348,7 +314,6 @@ def test_replay_no_check_pointing(self):
:avocado: tags=server,replay
:avocado: tags=ReplayTests,test_replay_no_check_pointing
"""
ppn = self.params.get('ppn', '/run/ior_write/*', 1)
container = self.create_container()

self.log_step('Disabling check pointing on {}'.format(container.pool))
Expand All @@ -358,7 +323,7 @@ def test_replay_no_check_pointing(self):
self.fail('Pool check pointing not disabled before engine restart')

self.log_step('Write data to the container (ior)')
ior = self.write_data(container, ppn)
ior = write_data(self, container)

self.stop_engines()
self.restart_engines()
Expand All @@ -371,7 +336,7 @@ def test_replay_no_check_pointing(self):
self.fail('Pool check pointing not disabled after engine restart')

self.log_step('Verifying data previously written to the container (ior)')
self.read_data(ior, container, ppn)
read_data(self, ior, container)
self.log_step('Test passed')

def test_replay_check_pointing(self):
Expand All @@ -392,14 +357,13 @@ def test_replay_check_pointing(self):
:avocado: tags=server,replay
:avocado: tags=ReplayTests,test_replay_check_pointing
"""
ppn = self.params.get('ppn', '/run/ior_write/*', 1)
frequency = 5
container = self.create_container(
properties=f'checkpoint:timed,checkpoint_freq:{frequency}')
self.log.info('%s check point frequency: %s seconds', container.pool, frequency)

self.log_step('Write data to the container (ior)')
ior = self.write_data(container, ppn)
ior = write_data(self, container)

self.log_step('Waiting for check pointing to complete (sleep {})'.format(frequency * 2))
time.sleep(frequency * 2)
Expand All @@ -408,5 +372,5 @@ def test_replay_check_pointing(self):
self.restart_engines()

self.log_step('Verifying data previously written to the container (ior)')
self.read_data(ior, container, ppn)
read_data(self, ior, container)
self.log_step('Test passed')
3 changes: 1 addition & 2 deletions src/tests/ftest/server/replay.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ container:
dfs_oclass: SX

ior: &ior_base
client_processes:
ppn: 4
ppn: 4
api: DFS
transfer_size: 512K
block_size: 1G
Expand Down
16 changes: 8 additions & 8 deletions src/tests/ftest/telemetry/dkey_akey_enum_punch.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
'''
(C) Copyright 2018-2023 Intel Corporation.
(C) Copyright 2018-2024 Intel Corporation.
SPDX-License-Identifier: BSD-2-Clause-Patent
'''
Expand Down Expand Up @@ -232,13 +232,13 @@ def test_dkey_akey_enum_punch(self):

# Obtain and verify the io metrics 1 to 4. ###
# engine_pool_ops_dkey_enum
pool_dkey_enum = self.telemetry.ENGINE_POOL_METRICS[5]
pool_dkey_enum = self.telemetry.ENGINE_POOL_OPS_DKEY_ENUM_METRICS
# engine_pool_ops_akey_enum
pool_akey_enum = self.telemetry.ENGINE_POOL_METRICS[2]
pool_akey_enum = self.telemetry.ENGINE_POOL_OPS_AKEY_ENUM_METRICS
# engine_pool_ops_dkey_punch
pool_dkey_punch = self.telemetry.ENGINE_POOL_METRICS[6]
pool_dkey_punch = self.telemetry.ENGINE_POOL_OPS_DKEY_PUNCH_METRICS
# engine_pool_ops_akey_punch
pool_akey_punch = self.telemetry.ENGINE_POOL_METRICS[3]
pool_akey_punch = self.telemetry.ENGINE_POOL_OPS_AKEY_PUNCH_METRICS
specific_metrics = [
pool_dkey_enum, pool_akey_enum,
pool_dkey_punch, pool_akey_punch,
Expand Down Expand Up @@ -357,9 +357,9 @@ def test_pool_tgt_dkey_akey_punch(self):

self.telemetry.dmg.verbose = False

# Obtain and verify the pool metrics 1 and 2 ###
pool_tgt_dkey_punch = self.telemetry.ENGINE_POOL_METRICS[21]
pool_tgt_akey_punch = self.telemetry.ENGINE_POOL_METRICS[20]
# Obtain and verify the pool target punch metrics
pool_tgt_dkey_punch = self.telemetry.ENGINE_POOL_OPS_TGT_DKEY_PUNCH_METRICS
pool_tgt_akey_punch = self.telemetry.ENGINE_POOL_OPS_TGT_AKEY_PUNCH_METRICS
specific_metrics = [pool_tgt_dkey_punch, pool_tgt_akey_punch]
pool_out = self.telemetry.get_pool_metrics(
specific_metrics=specific_metrics)
Expand Down
Loading

0 comments on commit 24cf0ed

Please sign in to comment.