Skip to content

Commit

Permalink
Add lock to config reload/load_minigraph (sonic-net#3475)
Browse files Browse the repository at this point in the history
What I did
In some cases, if multiple config reload/load_minigraph are running in parallel, they might leave the system in an error state.
In this PR, a flock is added to config reload/load_minigraph so they will not run in parallel.
The file lock is binding to /etc/sonic/reload.lock.

This is to fix issue: #19855

Microsoft ADO (number only): 28877643
Signed-off-by: Longxiang Lyu lolv@microsoft.com

How I did it
Add flock utility and decoate the reload and load_minigraph with the try_lock to ensure the lock is acquired before reload/load_minigraph.

How to verify it
UT and on testbed.

New command output (if the output of a command-line utility has changed)
reload with locking success
# config reload
Acquired lock on /etc/sonic/reload.lock
Clear current config and reload config in config_db format from the default config file(s) ? [y/N]: y
Disabling container monitoring ...
Stopping SONiC target ...
Running command: /usr/local/bin/sonic-cfggen -j /etc/sonic/init_cfg.json -j /etc/sonic/config_db.json --write-to-db
Running command: /usr/local/bin/db_migrator.py -o migrate
Running command: /usr/local/bin/sonic-cfggen -d -y /etc/sonic/sonic_version.yml -t /usr/share/sonic/templates/sonic-environment.j2,/etc/sonic/sonic-environment
Restarting SONiC target ...
Enabling container monitoring ...
Reloading Monit configuration ...
Reinitializing monit daemon
Released lock on /etc/sonic/reload.lock
reload with locking failure
# config reload
Failed to acquire lock on /etc/sonic/reload.lock
  • Loading branch information
lolyu committed Aug 20, 2024
1 parent 1c4300f commit 4372ced
Show file tree
Hide file tree
Showing 4 changed files with 448 additions and 12 deletions.
15 changes: 13 additions & 2 deletions config/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from utilities_common.general import load_db_config, load_module_from_source
from .validated_config_db_connector import ValidatedConfigDBConnector
import utilities_common.multi_asic as multi_asic_util
from utilities_common.flock import try_lock

from .utils import log

Expand Down Expand Up @@ -124,6 +125,12 @@
GRE_TYPE_RANGE = click.IntRange(min=0, max=65535)
ADHOC_VALIDATION = True

if os.environ.get("UTILITIES_UNIT_TESTING", "0") in ("1", "2"):
temp_system_reload_lockfile = tempfile.NamedTemporaryFile()
SYSTEM_RELOAD_LOCK = temp_system_reload_lockfile.name
else:
SYSTEM_RELOAD_LOCK = "/etc/sonic/reload.lock"

# Load sonic-cfggen from source since /usr/local/bin/sonic-cfggen does not have .py extension.
sonic_cfggen = load_module_from_source('sonic_cfggen', '/usr/local/bin/sonic-cfggen')

Expand Down Expand Up @@ -1753,9 +1760,11 @@ def list_checkpoints(ctx, verbose):
@click.option('-n', '--no_service_restart', default=False, is_flag=True, help='Do not restart docker services')
@click.option('-f', '--force', default=False, is_flag=True, help='Force config reload without system checks')
@click.option('-t', '--file_format', default='config_db',type=click.Choice(['config_yang', 'config_db']),show_default=True,help='specify the file format')
@click.option('-b', '--bypass-lock', default=False, is_flag=True, help='Do reload without acquiring lock')
@click.argument('filename', required=False)
@clicommon.pass_db
def reload(db, filename, yes, load_sysinfo, no_service_restart, force, file_format):
@try_lock(SYSTEM_RELOAD_LOCK, timeout=0)
def reload(db, filename, yes, load_sysinfo, no_service_restart, force, file_format, bypass_lock):
"""Clear current configuration and import a previous saved config DB dump file.
<filename> : Names of configuration file(s) to load, separated by comma with no spaces in between
"""
Expand Down Expand Up @@ -1968,8 +1977,10 @@ def load_mgmt_config(filename):
@click.option('-t', '--traffic_shift_away', default=False, is_flag=True, help='Keep device in maintenance with TSA')
@click.option('-o', '--override_config', default=False, is_flag=True, help='Enable config override. Proceed with default path.')
@click.option('-p', '--golden_config_path', help='Provide golden config path to override. Use with --override_config')
@click.option('-b', '--bypass-lock', default=False, is_flag=True, help='Do load minigraph without acquiring lock')
@clicommon.pass_db
def load_minigraph(db, no_service_restart, traffic_shift_away, override_config, golden_config_path):
@try_lock(SYSTEM_RELOAD_LOCK, timeout=0)
def load_minigraph(db, no_service_restart, traffic_shift_away, override_config, golden_config_path, bypass_lock):
"""Reconfigure based on minigraph."""
argv_str = ' '.join(['config', *sys.argv[1:]])
log.log_notice(f"'load_minigraph' executing with command: {argv_str}")
Expand Down
169 changes: 159 additions & 10 deletions tests/config_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from click.testing import CliRunner

from sonic_py_common import device_info, multi_asic
from utilities_common import flock
from utilities_common.db import Db
from utilities_common.general import load_module_from_source
from mock import call, patch, mock_open, MagicMock
Expand Down Expand Up @@ -45,6 +46,23 @@
load_minigraph_platform_false_path = os.path.join(load_minigraph_input_path, "platform_false")

load_minigraph_command_output="""\
Acquired lock on {0}
Stopping SONiC target ...
Running command: /usr/local/bin/sonic-cfggen -H -m --write-to-db
Running command: config qos reload --no-dynamic-buffer --no-delay
Running command: pfcwd start_default
Restarting SONiC target ...
Reloading Monit configuration ...
Please note setting loaded from minigraph will be lost after system reboot. To preserve setting, run `config save`.
Released lock on {0}
"""

load_minigraph_lock_failure_output = """\
Failed to acquire lock on {0}
"""

load_minigraph_command_bypass_lock_output = """\
Bypass lock on {}
Stopping SONiC target ...
Running command: /usr/local/bin/sonic-cfggen -H -m --write-to-db
Running command: config qos reload --no-dynamic-buffer --no-delay
Expand All @@ -55,6 +73,7 @@
"""

load_minigraph_platform_plugin_command_output="""\
Acquired lock on {0}
Stopping SONiC target ...
Running command: /usr/local/bin/sonic-cfggen -H -m --write-to-db
Running command: config qos reload --no-dynamic-buffer --no-delay
Expand All @@ -63,6 +82,7 @@
Restarting SONiC target ...
Reloading Monit configuration ...
Please note setting loaded from minigraph will be lost after system reboot. To preserve setting, run `config save`.
Released lock on {0}
"""

load_mgmt_config_command_ipv4_only_output="""\
Expand Down Expand Up @@ -137,51 +157,76 @@
"""

RELOAD_CONFIG_DB_OUTPUT = """\
Acquired lock on {0}
Stopping SONiC target ...
Running command: /usr/local/bin/sonic-cfggen -j /tmp/config.json --write-to-db
Restarting SONiC target ...
Reloading Monit configuration ...
Released lock on {0}
"""

RELOAD_CONFIG_DB_LOCK_FAILURE_OUTPUT = """\
Failed to acquire lock on {0}
"""

RELOAD_CONFIG_DB_BYPASS_LOCK_OUTPUT = """\
Bypass lock on {0}
Stopping SONiC target ...
Running command: /usr/local/bin/sonic-cfggen -j /tmp/config.json --write-to-db
Restarting SONiC target ...
Reloading Monit configuration ...
"""

RELOAD_YANG_CFG_OUTPUT = """\
Acquired lock on {0}
Stopping SONiC target ...
Running command: /usr/local/bin/sonic-cfggen -Y /tmp/config.json --write-to-db
Restarting SONiC target ...
Reloading Monit configuration ...
Released lock on {0}
"""

RELOAD_MASIC_CONFIG_DB_OUTPUT = """\
Acquired lock on {0}
Stopping SONiC target ...
Running command: /usr/local/bin/sonic-cfggen -j /tmp/config.json --write-to-db
Running command: /usr/local/bin/sonic-cfggen -j /tmp/config.json -n asic0 --write-to-db
Running command: /usr/local/bin/sonic-cfggen -j /tmp/config.json -n asic1 --write-to-db
Restarting SONiC target ...
Reloading Monit configuration ...
Released lock on {0}
"""

reload_config_with_sys_info_command_output="""\
Acquired lock on {0}
Running command: /usr/local/bin/sonic-cfggen -H -k Seastone-DX010-25-50 --write-to-db"""

reload_config_with_disabled_service_output="""\
Acquired lock on {0}
Stopping SONiC target ...
Running command: /usr/local/bin/sonic-cfggen -j /tmp/config.json --write-to-db
Restarting SONiC target ...
Reloading Monit configuration ...
Released lock on {0}
"""

reload_config_masic_onefile_output = """\
Acquired lock on {0}
Stopping SONiC target ...
Restarting SONiC target ...
Reloading Monit configuration ...
Released lock on {0}
"""

reload_config_masic_onefile_gen_sysinfo_output = """\
Acquired lock on {0}
Stopping SONiC target ...
Running command: /usr/local/bin/sonic-cfggen -H -k Mellanox-SN3800-D112C8 --write-to-db
Running command: /usr/local/bin/sonic-cfggen -H -k multi_asic -n asic0 --write-to-db
Running command: /usr/local/bin/sonic-cfggen -H -k multi_asic -n asic1 --write-to-db
Restarting SONiC target ...
Reloading Monit configuration ...
Released lock on {0}
"""

save_config_output = """\
Expand Down Expand Up @@ -601,7 +646,8 @@ def test_config_reload(self, get_cmd_module, setup_single_broadcom_asic):

assert result.exit_code == 0

assert "\n".join([l.rstrip() for l in result.output.split('\n')][:1]) == reload_config_with_sys_info_command_output
assert "\n".join([line.rstrip() for line in result.output.split('\n')][:2]) == \
reload_config_with_sys_info_command_output.format(config.SYSTEM_RELOAD_LOCK)

def test_config_reload_stdin(self, get_cmd_module, setup_single_broadcom_asic):
def mock_json_load(f):
Expand Down Expand Up @@ -641,7 +687,8 @@ def mock_json_load(f):

assert result.exit_code == 0

assert "\n".join([l.rstrip() for l in result.output.split('\n')][:1]) == reload_config_with_sys_info_command_output
assert "\n".join([line.rstrip() for line in result.output.split('\n')][:2]) == \
reload_config_with_sys_info_command_output.format(config.SYSTEM_RELOAD_LOCK)

@classmethod
def teardown_class(cls):
Expand Down Expand Up @@ -747,7 +794,8 @@ def read_json_file_side_effect(filename):
traceback.print_tb(result.exc_info[2])

assert result.exit_code == 0
assert "\n".join([li.rstrip() for li in result.output.split('\n')]) == reload_config_masic_onefile_output
assert "\n".join([li.rstrip() for li in result.output.split('\n')]) == \
reload_config_masic_onefile_output.format(config.SYSTEM_RELOAD_LOCK)

def test_config_reload_onefile_gen_sysinfo_masic(self):
def read_json_file_side_effect(filename):
Expand Down Expand Up @@ -823,7 +871,7 @@ def read_json_file_side_effect(filename):
assert result.exit_code == 0
assert "\n".join(
[li.rstrip() for li in result.output.split('\n')]
) == reload_config_masic_onefile_gen_sysinfo_output
) == reload_config_masic_onefile_gen_sysinfo_output.format(config.SYSTEM_RELOAD_LOCK)

def test_config_reload_onefile_bad_format_masic(self):
def read_json_file_side_effect(filename):
Expand Down Expand Up @@ -878,11 +926,58 @@ def test_load_minigraph(self, get_cmd_module, setup_single_broadcom_asic):
print(result.output)
traceback.print_tb(result.exc_info[2])
assert result.exit_code == 0
assert "\n".join([l.rstrip() for l in result.output.split('\n')]) == load_minigraph_command_output
assert "\n".join([line.rstrip() for line in result.output.split('\n')]) == \
(load_minigraph_command_output.format(config.SYSTEM_RELOAD_LOCK))
# Verify "systemctl reset-failed" is called for services under sonic.target
mock_run_command.assert_any_call(['systemctl', 'reset-failed', 'swss'])
assert mock_run_command.call_count == 12

@mock.patch('sonic_py_common.device_info.get_paths_to_platform_and_hwsku_dirs',
mock.MagicMock(return_value=("dummy_path", None)))
def test_load_minigraph_lock_failure(self, get_cmd_module, setup_single_broadcom_asic):
with mock.patch("utilities_common.cli.run_command",
mock.MagicMock(side_effect=mock_run_command_side_effect)) as mock_run_command:
(config, _) = get_cmd_module

fd = open(config.SYSTEM_RELOAD_LOCK, 'r')
assert flock.acquire_flock(fd, 0)

try:
runner = CliRunner()
result = runner.invoke(config.config.commands["load_minigraph"], ["-y"])
print(result.exit_code)
print(result.output)
traceback.print_tb(result.exc_info[2])
assert result.exit_code != 0
assert result.output == \
(load_minigraph_lock_failure_output.format(config.SYSTEM_RELOAD_LOCK))
assert mock_run_command.call_count == 0
finally:
flock.release_flock(fd)

@mock.patch('sonic_py_common.device_info.get_paths_to_platform_and_hwsku_dirs',
mock.MagicMock(return_value=("dummy_path", None)))
def test_load_minigraph_bypass_lock(self, get_cmd_module, setup_single_broadcom_asic):
with mock.patch("utilities_common.cli.run_command",
mock.MagicMock(side_effect=mock_run_command_side_effect)) as mock_run_command:
(config, _) = get_cmd_module

fd = open(config.SYSTEM_RELOAD_LOCK, 'r')
assert flock.acquire_flock(fd, 0)

try:
runner = CliRunner()
result = runner.invoke(config.config.commands["load_minigraph"], ["-y", "-b"])
print(result.exit_code)
print(result.output)
traceback.print_tb(result.exc_info[2])
assert result.exit_code == 0
assert result.output == \
load_minigraph_command_bypass_lock_output.format(config.SYSTEM_RELOAD_LOCK)
assert mock_run_command.call_count == 12
finally:
flock.release_flock(fd)

@mock.patch('sonic_py_common.device_info.get_paths_to_platform_and_hwsku_dirs', mock.MagicMock(return_value=(load_minigraph_platform_path, None)))
def test_load_minigraph_platform_plugin(self, get_cmd_module, setup_single_broadcom_asic):
with mock.patch("utilities_common.cli.run_command", mock.MagicMock(side_effect=mock_run_command_side_effect)) as mock_run_command:
Expand All @@ -893,7 +988,8 @@ def test_load_minigraph_platform_plugin(self, get_cmd_module, setup_single_broad
print(result.output)
traceback.print_tb(result.exc_info[2])
assert result.exit_code == 0
assert "\n".join([l.rstrip() for l in result.output.split('\n')]) == load_minigraph_platform_plugin_command_output
assert "\n".join([line.rstrip() for line in result.output.split('\n')]) == \
(load_minigraph_platform_plugin_command_output.format(config.SYSTEM_RELOAD_LOCK))
# Verify "systemctl reset-failed" is called for services under sonic.target
mock_run_command.assert_any_call(['systemctl', 'reset-failed', 'swss'])
assert mock_run_command.call_count == 12
Expand Down Expand Up @@ -1171,7 +1267,59 @@ def test_reload_config(self, get_cmd_module, setup_single_broadcom_asic):
traceback.print_tb(result.exc_info[2])
assert result.exit_code == 0
assert "\n".join([l.rstrip() for l in result.output.split('\n')]) \
== RELOAD_CONFIG_DB_OUTPUT
== RELOAD_CONFIG_DB_OUTPUT.format(config.SYSTEM_RELOAD_LOCK)

def test_reload_config_lock_failure(self, get_cmd_module, setup_single_broadcom_asic):
self.add_sysinfo_to_cfg_file()
with mock.patch(
"utilities_common.cli.run_command",
mock.MagicMock(side_effect=mock_run_command_side_effect)
):
(config, show) = get_cmd_module
runner = CliRunner()

fd = open(config.SYSTEM_RELOAD_LOCK, 'r')
assert flock.acquire_flock(fd, 0)

try:
result = runner.invoke(
config.config.commands["reload"],
[self.dummy_cfg_file, '-y', '-f'])

print(result.exit_code)
print(result.output)
traceback.print_tb(result.exc_info[2])
assert result.exit_code != 0
assert "\n".join([line.rstrip() for line in result.output.split('\n')]) \
== RELOAD_CONFIG_DB_LOCK_FAILURE_OUTPUT.format(config.SYSTEM_RELOAD_LOCK)
finally:
flock.release_flock(fd)

def test_reload_config_bypass_lock(self, get_cmd_module, setup_single_broadcom_asic):
self.add_sysinfo_to_cfg_file()
with mock.patch(
"utilities_common.cli.run_command",
mock.MagicMock(side_effect=mock_run_command_side_effect)
):
(config, show) = get_cmd_module
runner = CliRunner()

fd = open(config.SYSTEM_RELOAD_LOCK, 'r')
assert flock.acquire_flock(fd, 0)

try:
result = runner.invoke(
config.config.commands["reload"],
[self.dummy_cfg_file, '-y', '-f', '-b'])

print(result.exit_code)
print(result.output)
traceback.print_tb(result.exc_info[2])
assert result.exit_code == 0
assert "\n".join([line.rstrip() for line in result.output.split('\n')]) \
== RELOAD_CONFIG_DB_BYPASS_LOCK_OUTPUT.format(config.SYSTEM_RELOAD_LOCK)
finally:
flock.release_flock(fd)

def test_config_reload_disabled_service(self, get_cmd_module, setup_single_broadcom_asic):
self.add_sysinfo_to_cfg_file()
Expand All @@ -1191,7 +1339,8 @@ def test_config_reload_disabled_service(self, get_cmd_module, setup_single_broad

assert result.exit_code == 0

assert "\n".join([l.rstrip() for l in result.output.split('\n')]) == reload_config_with_disabled_service_output
assert "\n".join([line.rstrip() for line in result.output.split('\n')]) == \
reload_config_with_disabled_service_output.format(config.SYSTEM_RELOAD_LOCK)

def test_reload_config_masic(self, get_cmd_module, setup_multi_broadcom_masic):
self.add_sysinfo_to_cfg_file()
Expand All @@ -1215,7 +1364,7 @@ def test_reload_config_masic(self, get_cmd_module, setup_multi_broadcom_masic):
traceback.print_tb(result.exc_info[2])
assert result.exit_code == 0
assert "\n".join([l.rstrip() for l in result.output.split('\n')]) \
== RELOAD_MASIC_CONFIG_DB_OUTPUT
== RELOAD_MASIC_CONFIG_DB_OUTPUT.format(config.SYSTEM_RELOAD_LOCK)

def test_reload_yang_config(self, get_cmd_module,
setup_single_broadcom_asic):
Expand All @@ -1234,7 +1383,7 @@ def test_reload_yang_config(self, get_cmd_module,
traceback.print_tb(result.exc_info[2])
assert result.exit_code == 0
assert "\n".join([l.rstrip() for l in result.output.split('\n')]) \
== RELOAD_YANG_CFG_OUTPUT
== RELOAD_YANG_CFG_OUTPUT.format(config.SYSTEM_RELOAD_LOCK)

@classmethod
def teardown_class(cls):
Expand Down
Loading

0 comments on commit 4372ced

Please sign in to comment.