From 5c80d3804bc8ab93464bbc36a84b71f01e87cfa5 Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox <57339448+Junchao-Mellanox@users.noreply.github.com> Date: Wed, 4 Oct 2023 09:58:31 +0800 Subject: [PATCH] [Mellanox] wait reset cause ready (#16722) Why I did it The SONiC service determine-reboot-cause might run before the driver has created the reset cause files. In that case, the reset cause will be "Unknown". This PR introduces a mechanism that waits for the reset cause sysfs files to be ready. How I did it /run/hw-management/config/reset_attr_ready is the file that indicates all reset cause files are ready. The chassis.get_reboot_cause function waits up to 45 seconds for /run/hw-management/config/reset_attr_ready to indicate readiness. How to verify it Manual test on master/202211/202205 --- .../sonic_platform/chassis.py | 19 +++++++++++++++++-- .../mlnx-platform-api/tests/test_chassis.py | 17 +++++++++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index 17d66b2e8f64..41caf52cf69e 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -59,8 +59,9 @@ #reboot cause related definitions REBOOT_CAUSE_ROOT = HWMGMT_SYSTEM_ROOT - -REBOOT_CAUSE_FILE_LENGTH = 1 +REBOOT_CAUSE_MAX_WAIT_TIME = 45 +REBOOT_CAUSE_CHECK_INTERVAL = 5 +REBOOT_CAUSE_READY_FILE = '/run/hw-management/config/reset_attr_ready' REBOOT_TYPE_KEXEC_FILE = "/proc/cmdline" REBOOT_TYPE_KEXEC_PATTERN_WARM = ".*SONIC_BOOT_TYPE=(warm|fastfast).*" @@ -757,6 +758,16 @@ def _parse_warmfast_reboot_from_proc_cmdline(self): return 'fast-reboot' return None + def _wait_reboot_cause_ready(self): + max_wait_time = REBOOT_CAUSE_MAX_WAIT_TIME + while max_wait_time > 0: + if utils.read_int_from_file(REBOOT_CAUSE_READY_FILE, log_func=None) == 1: + return True + time.sleep(REBOOT_CAUSE_CHECK_INTERVAL) + max_wait_time -= 
REBOOT_CAUSE_CHECK_INTERVAL + + return False + def get_reboot_cause(self): """ Retrieves the cause of the previous reboot @@ -777,6 +788,10 @@ def get_reboot_cause(self): if reboot_cause: return self.REBOOT_CAUSE_NON_HARDWARE, '' + if not self._wait_reboot_cause_ready(): + logger.log_error("Hardware reboot cause is not ready") + return self.REBOOT_CAUSE_NON_HARDWARE, '' + if not self.reboot_cause_initialized: self.initialize_reboot_cause() diff --git a/platform/mellanox/mlnx-platform-api/tests/test_chassis.py b/platform/mellanox/mlnx-platform-api/tests/test_chassis.py index e71f75f0a954..4904b7185386 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_chassis.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_chassis.py @@ -194,6 +194,7 @@ def mock_check_sfp_status(self, port_dict, error_dict, timeout): assert status is True assert 'sfp' in event_dict and not event_dict['sfp'] + @mock.patch('sonic_platform.chassis.Chassis._wait_reboot_cause_ready', MagicMock(return_value=True)) def test_reboot_cause(self): from sonic_platform import utils from sonic_platform.chassis import REBOOT_CAUSE_ROOT @@ -242,6 +243,22 @@ def read_int_from_file(file_path, *args, **kwargs): assert minor == value mock_file_content[file_path] = 0 + @mock.patch('sonic_platform.chassis.Chassis._wait_reboot_cause_ready', MagicMock(return_value=False)) + def test_reboot_cause_timeout(self): + chassis = Chassis() + major, minor = chassis.get_reboot_cause() + assert major == chassis.REBOOT_CAUSE_NON_HARDWARE + assert minor == '' + + @mock.patch('sonic_platform.utils.read_int_from_file') + @mock.patch('sonic_platform.chassis.time.sleep', mock.MagicMock()) + def test_wait_reboot_cause_ready(self, mock_read_int): + mock_read_int.return_value = 1 + chassis = Chassis() + assert chassis._wait_reboot_cause_ready() + mock_read_int.return_value = 0 + assert not chassis._wait_reboot_cause_ready() + def test_parse_warmfast_reboot_from_proc_cmdline(self): chassis = Chassis() with 
mock.patch("builtins.open", mock.mock_open(read_data="SONIC_BOOT_TYPE=warm")):