diff --git a/config/main.py b/config/main.py index 0d94359788..f3c26bfe9b 100644 --- a/config/main.py +++ b/config/main.py @@ -38,6 +38,7 @@ from utilities_common.general import load_db_config, load_module_from_source from .validated_config_db_connector import ValidatedConfigDBConnector import utilities_common.multi_asic as multi_asic_util +from utilities_common.flock import try_lock from .utils import log @@ -116,6 +117,12 @@ GRE_TYPE_RANGE = click.IntRange(min=0, max=65535) ADHOC_VALIDATION = True +if os.environ.get("UTILITIES_UNIT_TESTING", "0") in ("1", "2"): + temp_system_reload_lockfile = tempfile.NamedTemporaryFile() + SYSTEM_RELOAD_LOCK = temp_system_reload_lockfile.name +else: + SYSTEM_RELOAD_LOCK = "/etc/sonic/reload.lock" + # Load sonic-cfggen from source since /usr/local/bin/sonic-cfggen does not have .py extension. sonic_cfggen = load_module_from_source('sonic_cfggen', '/usr/local/bin/sonic-cfggen') @@ -1511,9 +1518,11 @@ def list_checkpoints(ctx, verbose): @click.option('-n', '--no_service_restart', default=False, is_flag=True, help='Do not restart docker services') @click.option('-f', '--force', default=False, is_flag=True, help='Force config reload without system checks') @click.option('-t', '--file_format', default='config_db',type=click.Choice(['config_yang', 'config_db']),show_default=True,help='specify the file format') +@click.option('-b', '--bypass-lock', default=False, is_flag=True, help='Do reload without acquiring lock') @click.argument('filename', required=False) @clicommon.pass_db -def reload(db, filename, yes, load_sysinfo, no_service_restart, force, file_format): +@try_lock(SYSTEM_RELOAD_LOCK, timeout=0) +def reload(db, filename, yes, load_sysinfo, no_service_restart, force, file_format, bypass_lock): """Clear current configuration and import a previous saved config DB dump file. : Names of configuration file(s) to load, separated by comma with no spaces in between """ @@ -1740,8 +1749,10 @@ def load_mgmt_config(filename): @click.option('-t', '--traffic_shift_away', default=False, is_flag=True, help='Keep device in maintenance with TSA') @click.option('-o', '--override_config', default=False, is_flag=True, help='Enable config override. Proceed with default path.') @click.option('-p', '--golden_config_path', help='Provide golden config path to override. Use with --override_config') +@click.option('-b', '--bypass-lock', default=False, is_flag=True, help='Do load minigraph without acquiring lock') @clicommon.pass_db -def load_minigraph(db, no_service_restart, traffic_shift_away, override_config, golden_config_path): +@try_lock(SYSTEM_RELOAD_LOCK, timeout=0) +def load_minigraph(db, no_service_restart, traffic_shift_away, override_config, golden_config_path, bypass_lock): """Reconfigure based on minigraph.""" argv_str = ' '.join(['config', *sys.argv[1:]]) log.log_notice(f"'load_minigraph' executing with command: {argv_str}") diff --git a/tests/config_test.py b/tests/config_test.py index 26d03c4f0b..be595dadfe 100644 --- a/tests/config_test.py +++ b/tests/config_test.py @@ -17,6 +17,7 @@ from click.testing import CliRunner from sonic_py_common import device_info +from utilities_common import flock from utilities_common.db import Db from utilities_common.general import load_module_from_source from mock import call, patch, mock_open, MagicMock @@ -44,6 +45,23 @@ load_minigraph_platform_false_path = os.path.join(load_minigraph_input_path, "platform_false") load_minigraph_command_output="""\ +Acquired lock on {0} +Stopping SONiC target ... +Running command: /usr/local/bin/sonic-cfggen -H -m --write-to-db +Running command: config qos reload --no-dynamic-buffer --no-delay +Running command: pfcwd start_default +Restarting SONiC target ... +Reloading Monit configuration ... +Please note setting loaded from minigraph will be lost after system reboot. To preserve setting, run `config save`. +Released lock on {0} +""" + +load_minigraph_lock_failure_output = """\ +Failed to acquire lock on {0} +""" + +load_minigraph_command_bypass_lock_output = """\ +Bypass lock on {} Stopping SONiC target ... Running command: /usr/local/bin/sonic-cfggen -H -m --write-to-db Running command: config qos reload --no-dynamic-buffer --no-delay @@ -54,6 +72,7 @@ """ load_minigraph_platform_plugin_command_output="""\ +Acquired lock on {0} Stopping SONiC target ... Running command: /usr/local/bin/sonic-cfggen -H -m --write-to-db Running command: config qos reload --no-dynamic-buffer --no-delay @@ -62,6 +81,7 @@ Restarting SONiC target ... Reloading Monit configuration ... Please note setting loaded from minigraph will be lost after system reboot. To preserve setting, run `config save`. +Released lock on {0} """ load_mgmt_config_command_ipv4_only_output="""\ @@ -136,6 +156,20 @@ """ RELOAD_CONFIG_DB_OUTPUT = """\ +Acquired lock on {0} +Stopping SONiC target ... +Running command: /usr/local/bin/sonic-cfggen -j /tmp/config.json --write-to-db +Restarting SONiC target ... +Reloading Monit configuration ... +Released lock on {0} +""" + +RELOAD_CONFIG_DB_LOCK_FAILURE_OUTPUT = """\ +Failed to acquire lock on {0} +""" + +RELOAD_CONFIG_DB_BYPASS_LOCK_OUTPUT = """\ +Bypass lock on {0} Stopping SONiC target ... Running command: /usr/local/bin/sonic-cfggen -j /tmp/config.json --write-to-db Restarting SONiC target ... @@ -143,29 +177,36 @@ """ RELOAD_YANG_CFG_OUTPUT = """\ +Acquired lock on {0} Stopping SONiC target ... Running command: /usr/local/bin/sonic-cfggen -Y /tmp/config.json --write-to-db Restarting SONiC target ... Reloading Monit configuration ... +Released lock on {0} """ RELOAD_MASIC_CONFIG_DB_OUTPUT = """\ +Acquired lock on {0} Stopping SONiC target ... Running command: /usr/local/bin/sonic-cfggen -j /tmp/config.json --write-to-db Running command: /usr/local/bin/sonic-cfggen -j /tmp/config.json -n asic0 --write-to-db Running command: /usr/local/bin/sonic-cfggen -j /tmp/config.json -n asic1 --write-to-db Restarting SONiC target ... Reloading Monit configuration ... +Released lock on {0} """ reload_config_with_sys_info_command_output="""\ +Acquired lock on {0} Running command: /usr/local/bin/sonic-cfggen -H -k Seastone-DX010-25-50 --write-to-db""" reload_config_with_disabled_service_output="""\ +Acquired lock on {0} Stopping SONiC target ... Running command: /usr/local/bin/sonic-cfggen -j /tmp/config.json --write-to-db Restarting SONiC target ... Reloading Monit configuration ... +Released lock on {0} """ def mock_run_command_side_effect(*args, **kwargs): @@ -343,7 +384,8 @@ def test_config_reload(self, get_cmd_module, setup_single_broadcom_asic): assert result.exit_code == 0 - assert "\n".join([l.rstrip() for l in result.output.split('\n')][:1]) == reload_config_with_sys_info_command_output + assert "\n".join([line.rstrip() for line in result.output.split('\n')][:2]) == \ + reload_config_with_sys_info_command_output.format(config.SYSTEM_RELOAD_LOCK) def test_config_reload_stdin(self, get_cmd_module, setup_single_broadcom_asic): def mock_json_load(f): @@ -383,7 +425,8 @@ def mock_json_load(f): assert result.exit_code == 0 - assert "\n".join([l.rstrip() for l in result.output.split('\n')][:1]) == reload_config_with_sys_info_command_output + assert "\n".join([line.rstrip() for line in result.output.split('\n')][:2]) == \ + reload_config_with_sys_info_command_output.format(config.SYSTEM_RELOAD_LOCK) @classmethod def teardown_class(cls): @@ -415,11 +458,58 @@ def test_load_minigraph(self, get_cmd_module, setup_single_broadcom_asic): print(result.output) traceback.print_tb(result.exc_info[2]) assert result.exit_code == 0 - assert "\n".join([l.rstrip() for l in result.output.split('\n')]) == load_minigraph_command_output + assert "\n".join([line.rstrip() for line in result.output.split('\n')]) == \ + (load_minigraph_command_output.format(config.SYSTEM_RELOAD_LOCK)) # Verify "systemctl reset-failed" is called for services under sonic.target mock_run_command.assert_any_call(['systemctl', 'reset-failed', 'swss']) assert mock_run_command.call_count == 12 + @mock.patch('sonic_py_common.device_info.get_paths_to_platform_and_hwsku_dirs', + mock.MagicMock(return_value=("dummy_path", None))) + def test_load_minigraph_lock_failure(self, get_cmd_module, setup_single_broadcom_asic): + with mock.patch("utilities_common.cli.run_command", + mock.MagicMock(side_effect=mock_run_command_side_effect)) as mock_run_command: + (config, _) = get_cmd_module + + fd = open(config.SYSTEM_RELOAD_LOCK, 'r') + assert flock.acquire_flock(fd, 0) + + try: + runner = CliRunner() + result = runner.invoke(config.config.commands["load_minigraph"], ["-y"]) + print(result.exit_code) + print(result.output) + traceback.print_tb(result.exc_info[2]) + assert result.exit_code != 0 + assert result.output == \ + (load_minigraph_lock_failure_output.format(config.SYSTEM_RELOAD_LOCK)) + assert mock_run_command.call_count == 0 + finally: + flock.release_flock(fd) + + @mock.patch('sonic_py_common.device_info.get_paths_to_platform_and_hwsku_dirs', + mock.MagicMock(return_value=("dummy_path", None))) + def test_load_minigraph_bypass_lock(self, get_cmd_module, setup_single_broadcom_asic): + with mock.patch("utilities_common.cli.run_command", + mock.MagicMock(side_effect=mock_run_command_side_effect)) as mock_run_command: + (config, _) = get_cmd_module + + fd = open(config.SYSTEM_RELOAD_LOCK, 'r') + assert flock.acquire_flock(fd, 0) + + try: + runner = CliRunner() + result = runner.invoke(config.config.commands["load_minigraph"], ["-y", "-b"]) + print(result.exit_code) + print(result.output) + traceback.print_tb(result.exc_info[2]) + assert result.exit_code == 0 + assert result.output == \ + load_minigraph_command_bypass_lock_output.format(config.SYSTEM_RELOAD_LOCK) + assert mock_run_command.call_count == 12 + finally: + flock.release_flock(fd) + @mock.patch('sonic_py_common.device_info.get_paths_to_platform_and_hwsku_dirs', mock.MagicMock(return_value=(load_minigraph_platform_path, None))) def test_load_minigraph_platform_plugin(self, get_cmd_module, setup_single_broadcom_asic): with mock.patch("utilities_common.cli.run_command", mock.MagicMock(side_effect=mock_run_command_side_effect)) as mock_run_command: @@ -430,7 +520,8 @@ def test_load_minigraph_platform_plugin(self, get_cmd_module, setup_single_broad print(result.output) traceback.print_tb(result.exc_info[2]) assert result.exit_code == 0 - assert "\n".join([l.rstrip() for l in result.output.split('\n')]) == load_minigraph_platform_plugin_command_output + assert "\n".join([line.rstrip() for line in result.output.split('\n')]) == \ + (load_minigraph_platform_plugin_command_output.format(config.SYSTEM_RELOAD_LOCK)) # Verify "systemctl reset-failed" is called for services under sonic.target mock_run_command.assert_any_call(['systemctl', 'reset-failed', 'swss']) assert mock_run_command.call_count == 12 @@ -708,7 +799,59 @@ def test_reload_config(self, get_cmd_module, setup_single_broadcom_asic): traceback.print_tb(result.exc_info[2]) assert result.exit_code == 0 assert "\n".join([l.rstrip() for l in result.output.split('\n')]) \ - == RELOAD_CONFIG_DB_OUTPUT + == RELOAD_CONFIG_DB_OUTPUT.format(config.SYSTEM_RELOAD_LOCK) + + def test_reload_config_lock_failure(self, get_cmd_module, setup_single_broadcom_asic): + self.add_sysinfo_to_cfg_file() + with mock.patch( + "utilities_common.cli.run_command", + mock.MagicMock(side_effect=mock_run_command_side_effect) + ): + (config, show) = get_cmd_module + runner = CliRunner() + + fd = open(config.SYSTEM_RELOAD_LOCK, 'r') + assert flock.acquire_flock(fd, 0) + + try: + result = runner.invoke( + config.config.commands["reload"], + [self.dummy_cfg_file, '-y', '-f']) + + print(result.exit_code) + print(result.output) + traceback.print_tb(result.exc_info[2]) + assert result.exit_code != 0 + assert "\n".join([line.rstrip() for line in result.output.split('\n')]) \ + == RELOAD_CONFIG_DB_LOCK_FAILURE_OUTPUT.format(config.SYSTEM_RELOAD_LOCK) + finally: + flock.release_flock(fd) + + def test_reload_config_bypass_lock(self, get_cmd_module, setup_single_broadcom_asic): + self.add_sysinfo_to_cfg_file() + with mock.patch( + "utilities_common.cli.run_command", + mock.MagicMock(side_effect=mock_run_command_side_effect) + ): + (config, show) = get_cmd_module + runner = CliRunner() + + fd = open(config.SYSTEM_RELOAD_LOCK, 'r') + assert flock.acquire_flock(fd, 0) + + try: + result = runner.invoke( + config.config.commands["reload"], + [self.dummy_cfg_file, '-y', '-f', '-b']) + + print(result.exit_code) + print(result.output) + traceback.print_tb(result.exc_info[2]) + assert result.exit_code == 0 + assert "\n".join([line.rstrip() for line in result.output.split('\n')]) \ + == RELOAD_CONFIG_DB_BYPASS_LOCK_OUTPUT.format(config.SYSTEM_RELOAD_LOCK) + finally: + flock.release_flock(fd) def test_config_reload_disabled_service(self, get_cmd_module, setup_single_broadcom_asic): self.add_sysinfo_to_cfg_file() @@ -728,7 +871,8 @@ def test_config_reload_disabled_service(self, get_cmd_module, setup_single_broad assert result.exit_code == 0 - assert "\n".join([l.rstrip() for l in result.output.split('\n')]) == reload_config_with_disabled_service_output + assert "\n".join([line.rstrip() for line in result.output.split('\n')]) == \ + reload_config_with_disabled_service_output.format(config.SYSTEM_RELOAD_LOCK) def test_reload_config_masic(self, get_cmd_module, setup_multi_broadcom_masic): self.add_sysinfo_to_cfg_file() @@ -752,7 +896,7 @@ def test_reload_config_masic(self, get_cmd_module, setup_multi_broadcom_masic): traceback.print_tb(result.exc_info[2]) assert result.exit_code == 0 assert "\n".join([l.rstrip() for l in result.output.split('\n')]) \ - == RELOAD_MASIC_CONFIG_DB_OUTPUT + == RELOAD_MASIC_CONFIG_DB_OUTPUT.format(config.SYSTEM_RELOAD_LOCK) def test_reload_yang_config(self, get_cmd_module, setup_single_broadcom_asic): @@ -771,7 +915,7 @@ def test_reload_yang_config(self, get_cmd_module, traceback.print_tb(result.exc_info[2]) assert result.exit_code == 0 assert "\n".join([l.rstrip() for l in result.output.split('\n')]) \ - == RELOAD_YANG_CFG_OUTPUT + == RELOAD_YANG_CFG_OUTPUT.format(config.SYSTEM_RELOAD_LOCK) @classmethod def teardown_class(cls): diff --git a/tests/flock_test.py b/tests/flock_test.py new file mode 100644 index 0000000000..7d9039dd2d --- /dev/null +++ b/tests/flock_test.py @@ -0,0 +1,187 @@ +import pytest +import tempfile +import threading +import time + +from unittest import mock +from utilities_common import flock + + +f0_exit = threading.Event() +f1_exit = threading.Event() +f2_exit = threading.Event() + + +def dummy_f0(): + while not f0_exit.is_set(): + time.sleep(1) + + +def dummy_f1(bypass_lock=False): + while not f1_exit.is_set(): + time.sleep(1) + + +def dummy_f2(bypass_lock=True): + while not f2_exit.is_set(): + time.sleep(1) + + +class TestFLock: + def setup(self): + print("SETUP") + f0_exit.clear() + f1_exit.clear() + f2_exit.clear() + + def test_flock_acquire_lock_non_blocking(self): + """Test flock non-blocking acquire lock.""" + with tempfile.NamedTemporaryFile() as fd0: + fd1 = open(fd0.name, "r") + + assert flock.acquire_flock(fd0.fileno(), 0) + assert not flock.acquire_flock(fd1.fileno(), 0) + + flock.release_flock(fd0.fileno()) + + assert flock.acquire_flock(fd1.fileno(), 0) + flock.release_flock(fd1.fileno()) + + def test_flock_acquire_lock_blocking(self): + """Test flock blocking acquire.""" + with tempfile.NamedTemporaryFile() as fd0: + fd1 = open(fd0.name, "r") + res = [] + + assert flock.acquire_flock(fd0.fileno(), 0) + thrd = threading.Thread(target=lambda: res.append(flock.acquire_flock(fd1.fileno(), -1))) + thrd.start() + + time.sleep(5) + assert thrd.is_alive() + + flock.release_flock(fd0.fileno()) + thrd.join() + assert len(res) == 1 and res[0] + + fd2 = open(fd0.name, "r") + assert not flock.acquire_flock(fd2.fileno(), 0) + + flock.release_flock(fd1.fileno()) + assert flock.acquire_flock(fd2.fileno(), 0) + flock.release_flock(fd2.fileno()) + + def test_flock_acquire_lock_timeout(self): + """Test flock timeout acquire.""" + with tempfile.NamedTemporaryFile() as fd0: + def acquire_helper(): + nonlocal elapsed + start = time.time() + res.append(flock.acquire_flock(fd1.fileno(), 5)) + end = time.time() + elapsed = end - start + + fd1 = open(fd0.name, "r") + elapsed = 0 + res = [] + + assert flock.acquire_flock(fd0.fileno(), 0) + thrd = threading.Thread(target=acquire_helper) + thrd.start() + + thrd.join() + assert ((len(res) == 1) and (not res[0])) + assert elapsed >= 5 + + flock.release_flock(fd0.fileno()) + + @mock.patch("click.echo") + def test_try_lock(self, mock_echo): + """Test try_lock decorator.""" + with tempfile.NamedTemporaryFile() as fd0: + def get_file_content(fd): + fd.seek(0) + return fd.read() + + f0_with_try_lock = flock.try_lock(fd0.name, timeout=0)(dummy_f0) + f1_with_try_lock = flock.try_lock(fd0.name, timeout=0)(dummy_f1) + + thrd = threading.Thread(target=f0_with_try_lock) + thrd.start() + time.sleep(2) + + try: + assert mock_echo.call_args_list == [mock.call(f"Acquired lock on {fd0.name}")] + assert b"dummy_f0" in get_file_content(fd0) + + with pytest.raises(SystemExit): + f1_with_try_lock() + assert mock_echo.call_args_list == [mock.call(f"Acquired lock on {fd0.name}"), + mock.call(f"Failed to acquire lock on {fd0.name}")] + finally: + f0_exit.set() + thrd.join() + + assert b"dummy_f0" not in get_file_content(fd0) + + thrd = threading.Thread(target=f1_with_try_lock) + thrd.start() + time.sleep(2) + + try: + assert mock_echo.call_args_list == [mock.call(f"Acquired lock on {fd0.name}"), + mock.call(f"Failed to acquire lock on {fd0.name}"), + mock.call(f"Released lock on {fd0.name}"), + mock.call(f"Acquired lock on {fd0.name}")] + assert b"dummy_f1" in get_file_content(fd0) + finally: + f1_exit.set() + thrd.join() + + assert b"dummy_f1" not in get_file_content(fd0) + + @mock.patch("click.echo") + def test_try_lock_with_bypass(self, mock_echo): + with tempfile.NamedTemporaryFile() as fd0: + def get_file_content(fd): + fd.seek(0) + return fd.read() + + f1_with_try_lock = flock.try_lock(fd0.name, timeout=0)(dummy_f1) + + thrd = threading.Thread(target=f1_with_try_lock, args=(True,)) + thrd.start() + time.sleep(2) + + try: + assert mock_echo.call_args_list == [mock.call(f"Bypass lock on {fd0.name}")] + assert b"dummy_f1" not in get_file_content(fd0) + finally: + f1_exit.set() + thrd.join() + + @mock.patch("click.echo") + def test_try_lock_with_bypass_default(self, mock_echo): + with tempfile.NamedTemporaryFile() as fd0: + def get_file_content(fd): + fd.seek(0) + return fd.read() + + f2_with_try_lock = flock.try_lock(fd0.name, timeout=0)(dummy_f2) + + thrd = threading.Thread(target=f2_with_try_lock) + thrd.start() + time.sleep(2) + + try: + assert mock_echo.call_args_list == [mock.call(f"Bypass lock on {fd0.name}")] + assert b"dummy_f2" not in get_file_content(fd0) + finally: + f2_exit.set() + thrd.join() + + def teardown(self): + print("TEARDOWN") + f0_exit.clear() + f1_exit.clear() + f2_exit.clear() diff --git a/utilities_common/flock.py b/utilities_common/flock.py new file mode 100644 index 0000000000..c8faa8bfd9 --- /dev/null +++ b/utilities_common/flock.py @@ -0,0 +1,89 @@ +"""File lock utilities.""" +import click +import fcntl +import functools +import inspect +import os +import sys +import time + +from sonic_py_common import logger + + +log = logger.Logger() + + +def acquire_flock(fd, timeout=-1): + """Acquire the flock.""" + flags = fcntl.LOCK_EX + if timeout >= 0: + flags |= fcntl.LOCK_NB + else: + timeout = 0 + + start_time = current_time = time.time() + ret = False + while current_time - start_time <= timeout: + try: + fcntl.flock(fd, flags) + except (IOError, OSError): + ret = False + else: + ret = True + break + current_time = time.time() + if timeout != 0: + time.sleep(0.2) + return ret + + +def release_flock(fd): + """Release the flock.""" + fcntl.flock(fd, fcntl.LOCK_UN) + + +def try_lock(lock_file, timeout=-1): + """Decorator to try lock file using fcntl.flock.""" + def _decorator(func): + @functools.wraps(func) + def _wrapper(*args, **kwargs): + bypass_lock = False + + # Get the bypass_lock argument from the function signature + func_signature = inspect.signature(func) + has_bypass_lock = "bypass_lock" in func_signature.parameters + if has_bypass_lock: + func_ba = func_signature.bind(*args, **kwargs) + func_ba.apply_defaults() + bypass_lock = func_ba.arguments["bypass_lock"] + + if bypass_lock: + click.echo(f"Bypass lock on {lock_file}") + return func(*args, **kwargs) + else: + fd = os.open(lock_file, os.O_CREAT | os.O_RDWR) + if acquire_flock(fd, timeout): + click.echo(f"Acquired lock on {lock_file}") + os.truncate(fd, 0) + # Write pid and the function name to the lock file as a record + os.write(fd, f"{func.__name__}, pid {os.getpid()}\n".encode()) + try: + return func(*args, **kwargs) + finally: + release_flock(fd) + click.echo(f"Released lock on {lock_file}") + os.truncate(fd, 0) + os.close(fd) + else: + click.echo(f"Failed to acquire lock on {lock_file}") + lock_owner = os.read(fd, 1024).decode() + if not lock_owner: + lock_owner = "unknown" + log.log_notice( + (f"{func.__name__} failed to acquire lock on {lock_file}," + f" which is taken by {lock_owner}") + ) + os.close(fd) + sys.exit(1) + return _wrapper + return _decorator