Skip to content

Commit

Permalink
[Mellanox] wait reset cause ready (#16722)
Browse files Browse the repository at this point in the history
Why I did it
The SONiC service determine-reboot-cause might run before the driver has created the reset cause files. In that case, the reset cause will be "Unknown". This PR introduces a mechanism that waits for the reset cause sysfs files to be ready.

How I did it
/run/hw-management/config/reset_attr_ready is the file that indicates all reset cause files are ready. The chassis.get_reboot_cause function waits for /run/hw-management/config/reset_attr_ready for up to 45 seconds.

How to verify it
Manual test on master/202211/202205
  • Loading branch information
Junchao-Mellanox authored and mssonicbld committed Oct 4, 2023
1 parent 185a63b commit 648c94d
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 2 deletions.
19 changes: 17 additions & 2 deletions platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,9 @@

# Reboot cause related definitions
REBOOT_CAUSE_ROOT = HWMGMT_SYSTEM_ROOT

REBOOT_CAUSE_FILE_LENGTH = 1
# Upper bound, in seconds, on how long to wait for the ready file below.
REBOOT_CAUSE_MAX_WAIT_TIME = 45
# Delay, in seconds, between two consecutive reads of the ready file.
REBOOT_CAUSE_CHECK_INTERVAL = 5
# File that reads as 1 once all reset cause files are ready.
REBOOT_CAUSE_READY_FILE = '/run/hw-management/config/reset_attr_ready'

# Kernel command line; matched against the pattern below to detect a
# warm / fastfast boot via the SONIC_BOOT_TYPE argument.
REBOOT_TYPE_KEXEC_FILE = "/proc/cmdline"
REBOOT_TYPE_KEXEC_PATTERN_WARM = ".*SONIC_BOOT_TYPE=(warm|fastfast).*"
Expand Down Expand Up @@ -782,6 +783,16 @@ def _parse_warmfast_reboot_from_proc_cmdline(self):
return 'fast-reboot'
return None

def _wait_reboot_cause_ready(self):
    """
    Wait until the reset cause ready file reports that it is ready.

    Polls REBOOT_CAUSE_READY_FILE every REBOOT_CAUSE_CHECK_INTERVAL
    seconds, for at most REBOOT_CAUSE_MAX_WAIT_TIME seconds in total.
    The file is read once more after the final sleep, so a file that
    becomes ready during the last interval is still detected and the
    last sleep is never wasted.

    Returns:
        True if the ready file contained 1 before the wait budget was
        exhausted, False otherwise.
    """
    remaining_time = REBOOT_CAUSE_MAX_WAIT_TIME
    while True:
        # log_func=None is forwarded unchanged from the original call;
        # presumably it silences logging while the file is absent -- confirm
        # against utils.read_int_from_file.
        if utils.read_int_from_file(REBOOT_CAUSE_READY_FILE, log_func=None) == 1:
            return True
        if remaining_time <= 0:
            # Budget exhausted and the check at the deadline also failed.
            return False
        time.sleep(REBOOT_CAUSE_CHECK_INTERVAL)
        remaining_time -= REBOOT_CAUSE_CHECK_INTERVAL

def get_reboot_cause(self):
"""
Retrieves the cause of the previous reboot
Expand All @@ -802,6 +813,10 @@ def get_reboot_cause(self):
if reboot_cause:
return self.REBOOT_CAUSE_NON_HARDWARE, ''

if not self._wait_reboot_cause_ready():
logger.log_error("Hardware reboot cause is not ready")
return self.REBOOT_CAUSE_NON_HARDWARE, ''

if not self.reboot_cause_initialized:
self.initialize_reboot_cause()

Expand Down
17 changes: 17 additions & 0 deletions platform/mellanox/mlnx-platform-api/tests/test_chassis.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ def mock_check_sfp_status(self, port_dict, error_dict, timeout):
assert status is True
assert 'sfp' in event_dict and not event_dict['sfp']

@mock.patch('sonic_platform.chassis.Chassis._wait_reboot_cause_ready', MagicMock(return_value=True))
def test_reboot_cause(self):
from sonic_platform import utils
from sonic_platform.chassis import REBOOT_CAUSE_ROOT
Expand Down Expand Up @@ -242,6 +243,22 @@ def read_int_from_file(file_path, *args, **kwargs):
assert minor == value
mock_file_content[file_path] = 0

@mock.patch('sonic_platform.chassis.Chassis._wait_reboot_cause_ready', MagicMock(return_value=False))
def test_reboot_cause_timeout(self):
    """
    With _wait_reboot_cause_ready patched to return False,
    get_reboot_cause() must report (REBOOT_CAUSE_NON_HARDWARE, '').
    """
    chassis_under_test = Chassis()
    cause, description = chassis_under_test.get_reboot_cause()
    # Check the description first, then the cause; both must hold.
    assert description == ''
    assert cause == chassis_under_test.REBOOT_CAUSE_NON_HARDWARE

@mock.patch('sonic_platform.utils.read_int_from_file')
@mock.patch('sonic_platform.chassis.time.sleep', mock.MagicMock())
def test_wait_reboot_cause_ready(self, mock_read_int):
    """
    _wait_reboot_cause_ready() succeeds when the ready file reads 1 and
    gives up when it keeps reading 0. time.sleep is patched out so the
    timeout path does not actually wait.
    """
    # Ready file reports 1: the wait must succeed.
    mock_read_int.return_value = 1
    chassis = Chassis()
    became_ready = chassis._wait_reboot_cause_ready()
    assert became_ready

    # Ready file stays at 0: the wait must time out.
    mock_read_int.return_value = 0
    became_ready = chassis._wait_reboot_cause_ready()
    assert not became_ready

def test_parse_warmfast_reboot_from_proc_cmdline(self):
chassis = Chassis()
with mock.patch("builtins.open", mock.mock_open(read_data="SONIC_BOOT_TYPE=warm")):
Expand Down

0 comments on commit 648c94d

Please sign in to comment.