From 4372ced5f87d6c2a048a6df5e49de2179cbf06c1 Mon Sep 17 00:00:00 2001 From: Longxiang Lyu <35479537+lolyu@users.noreply.github.com> Date: Tue, 20 Aug 2024 13:29:15 +0800 Subject: [PATCH] Add lock to config reload/load_minigraph (#3475) What I did In some cases, if multiple config reload/load_minigraph are running in parallel, they might leave the system in an error state. In this PR, a flock is added to config reload/load_minigraph so they will not run in parallel. The file lock is binding to /etc/sonic/reload.lock. This is to fix issue: #19855 Microsoft ADO (number only): 28877643 Signed-off-by: Longxiang Lyu lolv@microsoft.com How I did it Add flock utility and decoate the reload and load_minigraph with the try_lock to ensure the lock is acquired before reload/load_minigraph. How to verify it UT and on testbed. New command output (if the output of a command-line utility has changed) reload with locking success # config reload Acquired lock on /etc/sonic/reload.lock Clear current config and reload config in config_db format from the default config file(s) ? [y/N]: y Disabling container monitoring ... Stopping SONiC target ... Running command: /usr/local/bin/sonic-cfggen -j /etc/sonic/init_cfg.json -j /etc/sonic/config_db.json --write-to-db Running command: /usr/local/bin/db_migrator.py -o migrate Running command: /usr/local/bin/sonic-cfggen -d -y /etc/sonic/sonic_version.yml -t /usr/share/sonic/templates/sonic-environment.j2,/etc/sonic/sonic-environment Restarting SONiC target ... Enabling container monitoring ... Reloading Monit configuration ... Reinitializing monit daemon Released lock on /etc/sonic/reload.lock reload with locking failure # config reload Failed to acquire lock on /etc/sonic/reload.lock --- config/main.py | 15 ++- tests/config_test.py | 169 ++++++++++++++++++++++++++++++++-- tests/flock_test.py | 187 ++++++++++++++++++++++++++++++++++++++ utilities_common/flock.py | 89 ++++++++++++++++++ 4 files changed, 448 insertions(+), 12 deletions(-) create mode 100644 tests/flock_test.py create mode 100644 utilities_common/flock.py diff --git a/config/main.py b/config/main.py index f48c446adf..4a46efda5e 100644 --- a/config/main.py +++ b/config/main.py @@ -42,6 +42,7 @@ from utilities_common.general import load_db_config, load_module_from_source from .validated_config_db_connector import ValidatedConfigDBConnector import utilities_common.multi_asic as multi_asic_util +from utilities_common.flock import try_lock from .utils import log @@ -124,6 +125,12 @@ GRE_TYPE_RANGE = click.IntRange(min=0, max=65535) ADHOC_VALIDATION = True +if os.environ.get("UTILITIES_UNIT_TESTING", "0") in ("1", "2"): + temp_system_reload_lockfile = tempfile.NamedTemporaryFile() + SYSTEM_RELOAD_LOCK = temp_system_reload_lockfile.name +else: + SYSTEM_RELOAD_LOCK = "/etc/sonic/reload.lock" + # Load sonic-cfggen from source since /usr/local/bin/sonic-cfggen does not have .py extension. sonic_cfggen = load_module_from_source('sonic_cfggen', '/usr/local/bin/sonic-cfggen') @@ -1753,9 +1760,11 @@ def list_checkpoints(ctx, verbose): @click.option('-n', '--no_service_restart', default=False, is_flag=True, help='Do not restart docker services') @click.option('-f', '--force', default=False, is_flag=True, help='Force config reload without system checks') @click.option('-t', '--file_format', default='config_db',type=click.Choice(['config_yang', 'config_db']),show_default=True,help='specify the file format') +@click.option('-b', '--bypass-lock', default=False, is_flag=True, help='Do reload without acquiring lock') @click.argument('filename', required=False) @clicommon.pass_db -def reload(db, filename, yes, load_sysinfo, no_service_restart, force, file_format): +@try_lock(SYSTEM_RELOAD_LOCK, timeout=0) +def reload(db, filename, yes, load_sysinfo, no_service_restart, force, file_format, bypass_lock): """Clear current configuration and import a previous saved config DB dump file. : Names of configuration file(s) to load, separated by comma with no spaces in between """ @@ -1968,8 +1977,10 @@ def load_mgmt_config(filename): @click.option('-t', '--traffic_shift_away', default=False, is_flag=True, help='Keep device in maintenance with TSA') @click.option('-o', '--override_config', default=False, is_flag=True, help='Enable config override. Proceed with default path.') @click.option('-p', '--golden_config_path', help='Provide golden config path to override. Use with --override_config') +@click.option('-b', '--bypass-lock', default=False, is_flag=True, help='Do load minigraph without acquiring lock') @clicommon.pass_db -def load_minigraph(db, no_service_restart, traffic_shift_away, override_config, golden_config_path): +@try_lock(SYSTEM_RELOAD_LOCK, timeout=0) +def load_minigraph(db, no_service_restart, traffic_shift_away, override_config, golden_config_path, bypass_lock): """Reconfigure based on minigraph.""" argv_str = ' '.join(['config', *sys.argv[1:]]) log.log_notice(f"'load_minigraph' executing with command: {argv_str}") diff --git a/tests/config_test.py b/tests/config_test.py index 748d434fc2..74bc0e1093 100644 --- a/tests/config_test.py +++ b/tests/config_test.py @@ -18,6 +18,7 @@ from click.testing import CliRunner from sonic_py_common import device_info, multi_asic +from utilities_common import flock from utilities_common.db import Db from utilities_common.general import load_module_from_source from mock import call, patch, mock_open, MagicMock @@ -45,6 +46,23 @@ load_minigraph_platform_false_path = os.path.join(load_minigraph_input_path, "platform_false") load_minigraph_command_output="""\ +Acquired lock on {0} +Stopping SONiC target ... +Running command: /usr/local/bin/sonic-cfggen -H -m --write-to-db +Running command: config qos reload --no-dynamic-buffer --no-delay +Running command: pfcwd start_default +Restarting SONiC target ... +Reloading Monit configuration ... +Please note setting loaded from minigraph will be lost after system reboot. To preserve setting, run `config save`. +Released lock on {0} +""" + +load_minigraph_lock_failure_output = """\ +Failed to acquire lock on {0} +""" + +load_minigraph_command_bypass_lock_output = """\ +Bypass lock on {} Stopping SONiC target ... Running command: /usr/local/bin/sonic-cfggen -H -m --write-to-db Running command: config qos reload --no-dynamic-buffer --no-delay @@ -55,6 +73,7 @@ """ load_minigraph_platform_plugin_command_output="""\ +Acquired lock on {0} Stopping SONiC target ... Running command: /usr/local/bin/sonic-cfggen -H -m --write-to-db Running command: config qos reload --no-dynamic-buffer --no-delay @@ -63,6 +82,7 @@ Restarting SONiC target ... Reloading Monit configuration ... Please note setting loaded from minigraph will be lost after system reboot. To preserve setting, run `config save`. +Released lock on {0} """ load_mgmt_config_command_ipv4_only_output="""\ @@ -137,6 +157,20 @@ """ RELOAD_CONFIG_DB_OUTPUT = """\ +Acquired lock on {0} +Stopping SONiC target ... +Running command: /usr/local/bin/sonic-cfggen -j /tmp/config.json --write-to-db +Restarting SONiC target ... +Reloading Monit configuration ... +Released lock on {0} +""" + +RELOAD_CONFIG_DB_LOCK_FAILURE_OUTPUT = """\ +Failed to acquire lock on {0} +""" + +RELOAD_CONFIG_DB_BYPASS_LOCK_OUTPUT = """\ +Bypass lock on {0} Stopping SONiC target ... Running command: /usr/local/bin/sonic-cfggen -j /tmp/config.json --write-to-db Restarting SONiC target ... @@ -144,44 +178,55 @@ """ RELOAD_YANG_CFG_OUTPUT = """\ +Acquired lock on {0} Stopping SONiC target ... Running command: /usr/local/bin/sonic-cfggen -Y /tmp/config.json --write-to-db Restarting SONiC target ... Reloading Monit configuration ... +Released lock on {0} """ RELOAD_MASIC_CONFIG_DB_OUTPUT = """\ +Acquired lock on {0} Stopping SONiC target ... Running command: /usr/local/bin/sonic-cfggen -j /tmp/config.json --write-to-db Running command: /usr/local/bin/sonic-cfggen -j /tmp/config.json -n asic0 --write-to-db Running command: /usr/local/bin/sonic-cfggen -j /tmp/config.json -n asic1 --write-to-db Restarting SONiC target ... Reloading Monit configuration ... +Released lock on {0} """ reload_config_with_sys_info_command_output="""\ +Acquired lock on {0} Running command: /usr/local/bin/sonic-cfggen -H -k Seastone-DX010-25-50 --write-to-db""" reload_config_with_disabled_service_output="""\ +Acquired lock on {0} Stopping SONiC target ... Running command: /usr/local/bin/sonic-cfggen -j /tmp/config.json --write-to-db Restarting SONiC target ... Reloading Monit configuration ... +Released lock on {0} """ reload_config_masic_onefile_output = """\ +Acquired lock on {0} Stopping SONiC target ... Restarting SONiC target ... Reloading Monit configuration ... +Released lock on {0} """ reload_config_masic_onefile_gen_sysinfo_output = """\ +Acquired lock on {0} Stopping SONiC target ... Running command: /usr/local/bin/sonic-cfggen -H -k Mellanox-SN3800-D112C8 --write-to-db Running command: /usr/local/bin/sonic-cfggen -H -k multi_asic -n asic0 --write-to-db Running command: /usr/local/bin/sonic-cfggen -H -k multi_asic -n asic1 --write-to-db Restarting SONiC target ... Reloading Monit configuration ... +Released lock on {0} """ save_config_output = """\ @@ -601,7 +646,8 @@ def test_config_reload(self, get_cmd_module, setup_single_broadcom_asic): assert result.exit_code == 0 - assert "\n".join([l.rstrip() for l in result.output.split('\n')][:1]) == reload_config_with_sys_info_command_output + assert "\n".join([line.rstrip() for line in result.output.split('\n')][:2]) == \ + reload_config_with_sys_info_command_output.format(config.SYSTEM_RELOAD_LOCK) def test_config_reload_stdin(self, get_cmd_module, setup_single_broadcom_asic): def mock_json_load(f): @@ -641,7 +687,8 @@ def mock_json_load(f): assert result.exit_code == 0 - assert "\n".join([l.rstrip() for l in result.output.split('\n')][:1]) == reload_config_with_sys_info_command_output + assert "\n".join([line.rstrip() for line in result.output.split('\n')][:2]) == \ + reload_config_with_sys_info_command_output.format(config.SYSTEM_RELOAD_LOCK) @classmethod def teardown_class(cls): @@ -747,7 +794,8 @@ def read_json_file_side_effect(filename): traceback.print_tb(result.exc_info[2]) assert result.exit_code == 0 - assert "\n".join([li.rstrip() for li in result.output.split('\n')]) == reload_config_masic_onefile_output + assert "\n".join([li.rstrip() for li in result.output.split('\n')]) == \ + reload_config_masic_onefile_output.format(config.SYSTEM_RELOAD_LOCK) def test_config_reload_onefile_gen_sysinfo_masic(self): def read_json_file_side_effect(filename): @@ -823,7 +871,7 @@ def read_json_file_side_effect(filename): assert result.exit_code == 0 assert "\n".join( [li.rstrip() for li in result.output.split('\n')] - ) == reload_config_masic_onefile_gen_sysinfo_output + ) == reload_config_masic_onefile_gen_sysinfo_output.format(config.SYSTEM_RELOAD_LOCK) def test_config_reload_onefile_bad_format_masic(self): def read_json_file_side_effect(filename): @@ -878,11 +926,58 @@ def test_load_minigraph(self, get_cmd_module, setup_single_broadcom_asic): print(result.output) traceback.print_tb(result.exc_info[2]) assert result.exit_code == 0 - assert "\n".join([l.rstrip() for l in result.output.split('\n')]) == load_minigraph_command_output + assert "\n".join([line.rstrip() for line in result.output.split('\n')]) == \ + (load_minigraph_command_output.format(config.SYSTEM_RELOAD_LOCK)) # Verify "systemctl reset-failed" is called for services under sonic.target mock_run_command.assert_any_call(['systemctl', 'reset-failed', 'swss']) assert mock_run_command.call_count == 12 + @mock.patch('sonic_py_common.device_info.get_paths_to_platform_and_hwsku_dirs', + mock.MagicMock(return_value=("dummy_path", None))) + def test_load_minigraph_lock_failure(self, get_cmd_module, setup_single_broadcom_asic): + with mock.patch("utilities_common.cli.run_command", + mock.MagicMock(side_effect=mock_run_command_side_effect)) as mock_run_command: + (config, _) = get_cmd_module + + fd = open(config.SYSTEM_RELOAD_LOCK, 'r') + assert flock.acquire_flock(fd, 0) + + try: + runner = CliRunner() + result = runner.invoke(config.config.commands["load_minigraph"], ["-y"]) + print(result.exit_code) + print(result.output) + traceback.print_tb(result.exc_info[2]) + assert result.exit_code != 0 + assert result.output == \ + (load_minigraph_lock_failure_output.format(config.SYSTEM_RELOAD_LOCK)) + assert mock_run_command.call_count == 0 + finally: + flock.release_flock(fd) + + @mock.patch('sonic_py_common.device_info.get_paths_to_platform_and_hwsku_dirs', + mock.MagicMock(return_value=("dummy_path", None))) + def test_load_minigraph_bypass_lock(self, get_cmd_module, setup_single_broadcom_asic): + with mock.patch("utilities_common.cli.run_command", + mock.MagicMock(side_effect=mock_run_command_side_effect)) as mock_run_command: + (config, _) = get_cmd_module + + fd = open(config.SYSTEM_RELOAD_LOCK, 'r') + assert flock.acquire_flock(fd, 0) + + try: + runner = CliRunner() + result = runner.invoke(config.config.commands["load_minigraph"], ["-y", "-b"]) + print(result.exit_code) + print(result.output) + traceback.print_tb(result.exc_info[2]) + assert result.exit_code == 0 + assert result.output == \ + load_minigraph_command_bypass_lock_output.format(config.SYSTEM_RELOAD_LOCK) + assert mock_run_command.call_count == 12 + finally: + flock.release_flock(fd) + @mock.patch('sonic_py_common.device_info.get_paths_to_platform_and_hwsku_dirs', mock.MagicMock(return_value=(load_minigraph_platform_path, None))) def test_load_minigraph_platform_plugin(self, get_cmd_module, setup_single_broadcom_asic): with mock.patch("utilities_common.cli.run_command", mock.MagicMock(side_effect=mock_run_command_side_effect)) as mock_run_command: @@ -893,7 +988,8 @@ def test_load_minigraph_platform_plugin(self, get_cmd_module, setup_single_broad print(result.output) traceback.print_tb(result.exc_info[2]) assert result.exit_code == 0 - assert "\n".join([l.rstrip() for l in result.output.split('\n')]) == load_minigraph_platform_plugin_command_output + assert "\n".join([line.rstrip() for line in result.output.split('\n')]) == \ + (load_minigraph_platform_plugin_command_output.format(config.SYSTEM_RELOAD_LOCK)) # Verify "systemctl reset-failed" is called for services under sonic.target mock_run_command.assert_any_call(['systemctl', 'reset-failed', 'swss']) assert mock_run_command.call_count == 12 @@ -1171,7 +1267,59 @@ def test_reload_config(self, get_cmd_module, setup_single_broadcom_asic): traceback.print_tb(result.exc_info[2]) assert result.exit_code == 0 assert "\n".join([l.rstrip() for l in result.output.split('\n')]) \ - == RELOAD_CONFIG_DB_OUTPUT + == RELOAD_CONFIG_DB_OUTPUT.format(config.SYSTEM_RELOAD_LOCK) + + def test_reload_config_lock_failure(self, get_cmd_module, setup_single_broadcom_asic): + self.add_sysinfo_to_cfg_file() + with mock.patch( + "utilities_common.cli.run_command", + mock.MagicMock(side_effect=mock_run_command_side_effect) + ): + (config, show) = get_cmd_module + runner = CliRunner() + + fd = open(config.SYSTEM_RELOAD_LOCK, 'r') + assert flock.acquire_flock(fd, 0) + + try: + result = runner.invoke( + config.config.commands["reload"], + [self.dummy_cfg_file, '-y', '-f']) + + print(result.exit_code) + print(result.output) + traceback.print_tb(result.exc_info[2]) + assert result.exit_code != 0 + assert "\n".join([line.rstrip() for line in result.output.split('\n')]) \ + == RELOAD_CONFIG_DB_LOCK_FAILURE_OUTPUT.format(config.SYSTEM_RELOAD_LOCK) + finally: + flock.release_flock(fd) + + def test_reload_config_bypass_lock(self, get_cmd_module, setup_single_broadcom_asic): + self.add_sysinfo_to_cfg_file() + with mock.patch( + "utilities_common.cli.run_command", + mock.MagicMock(side_effect=mock_run_command_side_effect) + ): + (config, show) = get_cmd_module + runner = CliRunner() + + fd = open(config.SYSTEM_RELOAD_LOCK, 'r') + assert flock.acquire_flock(fd, 0) + + try: + result = runner.invoke( + config.config.commands["reload"], + [self.dummy_cfg_file, '-y', '-f', '-b']) + + print(result.exit_code) + print(result.output) + traceback.print_tb(result.exc_info[2]) + assert result.exit_code == 0 + assert "\n".join([line.rstrip() for line in result.output.split('\n')]) \ + == RELOAD_CONFIG_DB_BYPASS_LOCK_OUTPUT.format(config.SYSTEM_RELOAD_LOCK) + finally: + flock.release_flock(fd) def test_config_reload_disabled_service(self, get_cmd_module, setup_single_broadcom_asic): self.add_sysinfo_to_cfg_file() @@ -1191,7 +1339,8 @@ def test_config_reload_disabled_service(self, get_cmd_module, setup_single_broad assert result.exit_code == 0 - assert "\n".join([l.rstrip() for l in result.output.split('\n')]) == reload_config_with_disabled_service_output + assert "\n".join([line.rstrip() for line in result.output.split('\n')]) == \ + reload_config_with_disabled_service_output.format(config.SYSTEM_RELOAD_LOCK) def test_reload_config_masic(self, get_cmd_module, setup_multi_broadcom_masic): self.add_sysinfo_to_cfg_file() @@ -1215,7 +1364,7 @@ def test_reload_config_masic(self, get_cmd_module, setup_multi_broadcom_masic): traceback.print_tb(result.exc_info[2]) assert result.exit_code == 0 assert "\n".join([l.rstrip() for l in result.output.split('\n')]) \ - == RELOAD_MASIC_CONFIG_DB_OUTPUT + == RELOAD_MASIC_CONFIG_DB_OUTPUT.format(config.SYSTEM_RELOAD_LOCK) def test_reload_yang_config(self, get_cmd_module, setup_single_broadcom_asic): @@ -1234,7 +1383,7 @@ def test_reload_yang_config(self, get_cmd_module, traceback.print_tb(result.exc_info[2]) assert result.exit_code == 0 assert "\n".join([l.rstrip() for l in result.output.split('\n')]) \ - == RELOAD_YANG_CFG_OUTPUT + == RELOAD_YANG_CFG_OUTPUT.format(config.SYSTEM_RELOAD_LOCK) @classmethod def teardown_class(cls): diff --git a/tests/flock_test.py b/tests/flock_test.py new file mode 100644 index 0000000000..7d9039dd2d --- /dev/null +++ b/tests/flock_test.py @@ -0,0 +1,187 @@ +import pytest +import tempfile +import threading +import time + +from unittest import mock +from utilities_common import flock + + +f0_exit = threading.Event() +f1_exit = threading.Event() +f2_exit = threading.Event() + + +def dummy_f0(): + while not f0_exit.is_set(): + time.sleep(1) + + +def dummy_f1(bypass_lock=False): + while not f1_exit.is_set(): + time.sleep(1) + + +def dummy_f2(bypass_lock=True): + while not f2_exit.is_set(): + time.sleep(1) + + +class TestFLock: + def setup(self): + print("SETUP") + f0_exit.clear() + f1_exit.clear() + f2_exit.clear() + + def test_flock_acquire_lock_non_blocking(self): + """Test flock non-blocking acquire lock.""" + with tempfile.NamedTemporaryFile() as fd0: + fd1 = open(fd0.name, "r") + + assert flock.acquire_flock(fd0.fileno(), 0) + assert not flock.acquire_flock(fd1.fileno(), 0) + + flock.release_flock(fd0.fileno()) + + assert flock.acquire_flock(fd1.fileno(), 0) + flock.release_flock(fd1.fileno()) + + def test_flock_acquire_lock_blocking(self): + """Test flock blocking acquire.""" + with tempfile.NamedTemporaryFile() as fd0: + fd1 = open(fd0.name, "r") + res = [] + + assert flock.acquire_flock(fd0.fileno(), 0) + thrd = threading.Thread(target=lambda: res.append(flock.acquire_flock(fd1.fileno(), -1))) + thrd.start() + + time.sleep(5) + assert thrd.is_alive() + + flock.release_flock(fd0.fileno()) + thrd.join() + assert len(res) == 1 and res[0] + + fd2 = open(fd0.name, "r") + assert not flock.acquire_flock(fd2.fileno(), 0) + + flock.release_flock(fd1.fileno()) + assert flock.acquire_flock(fd2.fileno(), 0) + flock.release_flock(fd2.fileno()) + + def test_flock_acquire_lock_timeout(self): + """Test flock timeout acquire.""" + with tempfile.NamedTemporaryFile() as fd0: + def acquire_helper(): + nonlocal elapsed + start = time.time() + res.append(flock.acquire_flock(fd1.fileno(), 5)) + end = time.time() + elapsed = end - start + + fd1 = open(fd0.name, "r") + elapsed = 0 + res = [] + + assert flock.acquire_flock(fd0.fileno(), 0) + thrd = threading.Thread(target=acquire_helper) + thrd.start() + + thrd.join() + assert ((len(res) == 1) and (not res[0])) + assert elapsed >= 5 + + flock.release_flock(fd0.fileno()) + + @mock.patch("click.echo") + def test_try_lock(self, mock_echo): + """Test try_lock decorator.""" + with tempfile.NamedTemporaryFile() as fd0: + def get_file_content(fd): + fd.seek(0) + return fd.read() + + f0_with_try_lock = flock.try_lock(fd0.name, timeout=0)(dummy_f0) + f1_with_try_lock = flock.try_lock(fd0.name, timeout=0)(dummy_f1) + + thrd = threading.Thread(target=f0_with_try_lock) + thrd.start() + time.sleep(2) + + try: + assert mock_echo.call_args_list == [mock.call(f"Acquired lock on {fd0.name}")] + assert b"dummy_f0" in get_file_content(fd0) + + with pytest.raises(SystemExit): + f1_with_try_lock() + assert mock_echo.call_args_list == [mock.call(f"Acquired lock on {fd0.name}"), + mock.call(f"Failed to acquire lock on {fd0.name}")] + finally: + f0_exit.set() + thrd.join() + + assert b"dummy_f0" not in get_file_content(fd0) + + thrd = threading.Thread(target=f1_with_try_lock) + thrd.start() + time.sleep(2) + + try: + assert mock_echo.call_args_list == [mock.call(f"Acquired lock on {fd0.name}"), + mock.call(f"Failed to acquire lock on {fd0.name}"), + mock.call(f"Released lock on {fd0.name}"), + mock.call(f"Acquired lock on {fd0.name}")] + assert b"dummy_f1" in get_file_content(fd0) + finally: + f1_exit.set() + thrd.join() + + assert b"dummy_f1" not in get_file_content(fd0) + + @mock.patch("click.echo") + def test_try_lock_with_bypass(self, mock_echo): + with tempfile.NamedTemporaryFile() as fd0: + def get_file_content(fd): + fd.seek(0) + return fd.read() + + f1_with_try_lock = flock.try_lock(fd0.name, timeout=0)(dummy_f1) + + thrd = threading.Thread(target=f1_with_try_lock, args=(True,)) + thrd.start() + time.sleep(2) + + try: + assert mock_echo.call_args_list == [mock.call(f"Bypass lock on {fd0.name}")] + assert b"dummy_f1" not in get_file_content(fd0) + finally: + f1_exit.set() + thrd.join() + + @mock.patch("click.echo") + def test_try_lock_with_bypass_default(self, mock_echo): + with tempfile.NamedTemporaryFile() as fd0: + def get_file_content(fd): + fd.seek(0) + return fd.read() + + f2_with_try_lock = flock.try_lock(fd0.name, timeout=0)(dummy_f2) + + thrd = threading.Thread(target=f2_with_try_lock) + thrd.start() + time.sleep(2) + + try: + assert mock_echo.call_args_list == [mock.call(f"Bypass lock on {fd0.name}")] + assert b"dummy_f2" not in get_file_content(fd0) + finally: + f2_exit.set() + thrd.join() + + def teardown(self): + print("TEARDOWN") + f0_exit.clear() + f1_exit.clear() + f2_exit.clear() diff --git a/utilities_common/flock.py b/utilities_common/flock.py new file mode 100644 index 0000000000..c8faa8bfd9 --- /dev/null +++ b/utilities_common/flock.py @@ -0,0 +1,89 @@ +"""File lock utilities.""" +import click +import fcntl +import functools +import inspect +import os +import sys +import time + +from sonic_py_common import logger + + +log = logger.Logger() + + +def acquire_flock(fd, timeout=-1): + """Acquire the flock.""" + flags = fcntl.LOCK_EX + if timeout >= 0: + flags |= fcntl.LOCK_NB + else: + timeout = 0 + + start_time = current_time = time.time() + ret = False + while current_time - start_time <= timeout: + try: + fcntl.flock(fd, flags) + except (IOError, OSError): + ret = False + else: + ret = True + break + current_time = time.time() + if timeout != 0: + time.sleep(0.2) + return ret + + +def release_flock(fd): + """Release the flock.""" + fcntl.flock(fd, fcntl.LOCK_UN) + + +def try_lock(lock_file, timeout=-1): + """Decorator to try lock file using fcntl.flock.""" + def _decorator(func): + @functools.wraps(func) + def _wrapper(*args, **kwargs): + bypass_lock = False + + # Get the bypass_lock argument from the function signature + func_signature = inspect.signature(func) + has_bypass_lock = "bypass_lock" in func_signature.parameters + if has_bypass_lock: + func_ba = func_signature.bind(*args, **kwargs) + func_ba.apply_defaults() + bypass_lock = func_ba.arguments["bypass_lock"] + + if bypass_lock: + click.echo(f"Bypass lock on {lock_file}") + return func(*args, **kwargs) + else: + fd = os.open(lock_file, os.O_CREAT | os.O_RDWR) + if acquire_flock(fd, timeout): + click.echo(f"Acquired lock on {lock_file}") + os.truncate(fd, 0) + # Write pid and the function name to the lock file as a record + os.write(fd, f"{func.__name__}, pid {os.getpid()}\n".encode()) + try: + return func(*args, **kwargs) + finally: + release_flock(fd) + click.echo(f"Released lock on {lock_file}") + os.truncate(fd, 0) + os.close(fd) + else: + click.echo(f"Failed to acquire lock on {lock_file}") + lock_owner = os.read(fd, 1024).decode() + if not lock_owner: + lock_owner = "unknown" + log.log_notice( + (f"{func.__name__} failed to acquire lock on {lock_file}," + f" which is taken by {lock_owner}") + ) + os.close(fd) + sys.exit(1) + return _wrapper + return _decorator