From e312281706aa942758e215b4e49a230d6d8d1e34 Mon Sep 17 00:00:00 2001
From: Junchao-Mellanox <57339448+Junchao-Mellanox@users.noreply.github.com>
Date: Thu, 14 Dec 2023 18:04:24 +0800
Subject: [PATCH] [202311][Mellanox] implement platform wait in python code (#17398)

- Why I did it

New implementation of Nvidia platform_wait due to:

1. sysfs deprecated by hw-mgmt
2. new dependencies to SDK
3. For CMIS host management mode

- How I did it

wait hw-management ready
wait SDK sysfs nodes ready

- How to verify it

manual test
unit test
sonic-mgmt regression
---
 .../x86_64-mlnx_msn2700-r0/platform_wait      | 101 ++++++------------
 .../sonic_platform/device_data.py             |  29 ++++-
 .../mlnx-platform-api/sonic_platform/utils.py |  24 +++++
 .../tests/test_device_data.py                 |  24 ++++-
 .../mlnx-platform-api/tests/test_utils.py     |   7 ++
 5 files changed, 112 insertions(+), 73 deletions(-)

diff --git a/device/mellanox/x86_64-mlnx_msn2700-r0/platform_wait b/device/mellanox/x86_64-mlnx_msn2700-r0/platform_wait
index a233eb41de42..ea76db07a6d8 100755
--- a/device/mellanox/x86_64-mlnx_msn2700-r0/platform_wait
+++ b/device/mellanox/x86_64-mlnx_msn2700-r0/platform_wait
@@ -1,69 +1,32 @@
-#!/bin/bash
-
-declare -r SYSLOG_LOGGER="/usr/bin/logger"
-declare -r SYSLOG_IDENTIFIER="platform_wait"
-declare -r SYSLOG_ERROR="error"
-declare -r SYSLOG_NOTICE="notice"
-declare -r SYSLOG_INFO="info"
-
-declare -r HW_MGMT_CONFIG="/var/run/hw-management/config"
-
-declare -r MODULE_COUNTER="${HW_MGMT_CONFIG}/module_counter"
-declare -r SFP_COUNTER="${HW_MGMT_CONFIG}/sfp_counter"
-
-declare -r EXIT_SUCCESS="0"
-declare -r EXIT_TIMEOUT="1"
-
-function log_error() {
-    eval "${SYSLOG_LOGGER} -t ${SYSLOG_IDENTIFIER} -p ${SYSLOG_ERROR} $@"
-}
-
-function log_notice() {
-    eval "${SYSLOG_LOGGER} -t ${SYSLOG_IDENTIFIER} -p ${SYSLOG_NOTICE} $@"
-}
-
-function log_info() {
-    eval "${SYSLOG_LOGGER} -t ${SYSLOG_IDENTIFIER} -p ${SYSLOG_INFO} $@"
-}
-
-function wait_for_sfp() {
-    local -r _NUM_MATCH="^[0-9]+$"
-    local -r _NUM_ZERO="0"
-
-    local _MODULE_CNT="0"
-    local _SFP_CNT="0"
-
-    local -i _WDOG_CNT="1"
-    local -ir _WDOG_MAX="300"
-
-    local -r _TIMEOUT="1s"
-
-    while [[ "${_WDOG_CNT}" -le "${_WDOG_MAX}" ]]; do
-        _MODULE_CNT="$(cat ${MODULE_COUNTER} 2>&1)"
-        _SFP_CNT="$(cat ${SFP_COUNTER} 2>&1)"
-
-        if [[ "${_MODULE_CNT}" =~ ${_NUM_MATCH} && "${_SFP_CNT}" =~ ${_NUM_MATCH} ]]; then
-            if [[ "${_SFP_CNT}" -gt "${_NUM_ZERO}" && "${_MODULE_CNT}" -eq "${_SFP_CNT}" ]]; then
-                return "${EXIT_SUCCESS}"
-            fi
-        fi
-
-        let "_WDOG_CNT++"
-        sleep "${_TIMEOUT}"
-    done
-
-    return "${EXIT_TIMEOUT}"
-}
-
-log_info "Wait for SFP interfaces to be ready"
-
-wait_for_sfp
-EXIT_CODE="$?"
-if [[ "${EXIT_CODE}" != "${EXIT_SUCCESS}" ]]; then
-    log_error "SFP interfaces are not ready: timeout"
-    exit "${EXIT_CODE}"
-fi
-
-log_info "SFP interfaces are ready"
-
-exit "${EXIT_SUCCESS}"
+#!/usr/bin/python3
+
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+# Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import sys
+from sonic_platform.device_data import DeviceDataManager
+from sonic_py_common.logger import Logger
+
+
+logger = Logger(log_identifier='platform_wait')
+logger.log_notice('Nvidia: Wait for PMON dependencies to be ready')
+if DeviceDataManager.wait_platform_ready():
+    logger.log_notice('Nvidia: PMON dependencies are ready')
+    sys.exit(0)
+else:
+    logger.log_error('Nvidia: PMON dependencies are not ready: timeout')
+    sys.exit(-1)
diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py b/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py
index 6bf0a9945a85..aeceb15d1983 100644
--- a/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py
+++ b/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py
@@ -17,6 +17,7 @@
 
 import glob
 import os
+import time
 
 from . import utils
 
@@ -167,8 +168,11 @@ def is_psu_hotswapable(cls):
     @classmethod
     @utils.read_only_cache()
     def get_sfp_count(cls):
-        sfp_count = utils.read_int_from_file('/run/hw-management/config/sfp_counter')
-        return sfp_count if sfp_count > 0 else len(glob.glob('/sys/module/sx_core/asic0/module*'))
+        from sonic_py_common import device_info
+        platform_path = device_info.get_path_to_platform_dir()
+        platform_json_path = os.path.join(platform_path, 'platform.json')
+        platform_data = utils.load_json_file(platform_json_path)
+        return len(platform_data['chassis']['sfps'])
 
     @classmethod
     def get_linecard_sfp_count(cls, lc_index):
@@ -244,3 +248,24 @@ def is_independent_mode(cls):
         sai_profile_file = os.path.join(hwsku_dir, 'sai.profile')
         data = utils.read_key_value_file(sai_profile_file, delimeter='=')
         return data.get('SAI_INDEPENDENT_MODULE_MODE') == '1'
+
+    @classmethod
+    def wait_platform_ready(cls):
+        """
+        Wait for Nvidia platform related services(SDK, hw-management) ready
+        Returns:
+            bool: True if wait success else timeout
+        """
+        conditions = []
+        sysfs_nodes = ['power_mode', 'power_mode_policy', 'present', 'reset', 'status', 'statuserror']
+        if cls.is_independent_mode():
+            sysfs_nodes.extend(['control', 'frequency', 'frequency_support', 'hw_present', 'hw_reset',
+                                'power_good', 'power_limit', 'power_on', 'temperature/input'])
+        else:
+            conditions.append(lambda: utils.read_int_from_file('/var/run/hw-management/config/asics_init_done') == 1)
+        sfp_count = cls.get_sfp_count()
+        for sfp_index in range(sfp_count):
+            for sysfs_node in sysfs_nodes:
+                # Bind loop variables now; a plain closure would only see the last module/node
+                conditions.append(lambda idx=sfp_index, node=sysfs_node: os.path.exists(f'/sys/module/sx_core/asic0/module{idx}/{node}'))
+        return utils.wait_until_conditions(conditions, 300, 1)
diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/utils.py b/platform/mellanox/mlnx-platform-api/sonic_platform/utils.py
index 9db38e6b4147..1135903c24bf 100644
--- a/platform/mellanox/mlnx-platform-api/sonic_platform/utils.py
+++ b/platform/mellanox/mlnx-platform-api/sonic_platform/utils.py
@@ -290,6 +290,30 @@ def wait_until(predict, timeout, interval=1, *args, **kwargs):
     return False
 
 
+def wait_until_conditions(conditions, timeout, interval=1):
+    """
+    Wait until all the conditions become true
+    Args:
+        conditions (list): a list of callable which generate True|False
+        timeout (int): wait time in seconds
+        interval (int, optional): interval to check the predict. Defaults to 1.
+
+    Returns:
+        bool: True if wait success else False
+    """
+    while timeout > 0:
+        pending_conditions = []
+        for condition in conditions:
+            if not condition():
+                pending_conditions.append(condition)
+        if not pending_conditions:
+            return True
+        conditions = pending_conditions
+        time.sleep(interval)
+        timeout -= interval
+    return False
+
+
 class TimerEvent:
     def __init__(self, interval, cb, repeat):
         self.interval = interval
diff --git a/platform/mellanox/mlnx-platform-api/tests/test_device_data.py b/platform/mellanox/mlnx-platform-api/tests/test_device_data.py
index 866f01c3e7e3..c172b82a30b7 100644
--- a/platform/mellanox/mlnx-platform-api/tests/test_device_data.py
+++ b/platform/mellanox/mlnx-platform-api/tests/test_device_data.py
@@ -60,6 +60,26 @@ def test_is_independent_mode(self, mock_read):
         mock_read.return_value = {'SAI_INDEPENDENT_MODULE_MODE': '1'}
         assert DeviceDataManager.is_independent_mode()
 
+    @mock.patch('sonic_py_common.device_info.get_path_to_platform_dir', mock.MagicMock(return_value='/tmp'))
+    @mock.patch('sonic_platform.device_data.utils.load_json_file')
+    def test_get_sfp_count(self, mock_load_json):
+        mock_load_json.return_value = {
+            'chassis': {
+                'sfps': [1,2,3]
+            }
+        }
+        assert DeviceDataManager.get_sfp_count() == 3
 
-
-
+    @mock.patch('sonic_platform.device_data.time.sleep', mock.MagicMock())
+    @mock.patch('sonic_platform.device_data.DeviceDataManager.get_sfp_count', mock.MagicMock(return_value=3))
+    @mock.patch('sonic_platform.device_data.utils.read_int_from_file', mock.MagicMock(return_value=1))
+    @mock.patch('sonic_platform.device_data.os.path.exists')
+    @mock.patch('sonic_platform.device_data.DeviceDataManager.is_independent_mode')
+    def test_wait_platform_ready(self, mock_is_indep, mock_exists):
+        mock_exists.return_value = True
+        mock_is_indep.return_value = True
+        assert DeviceDataManager.wait_platform_ready()
+        mock_is_indep.return_value = False
+        assert DeviceDataManager.wait_platform_ready()
+        mock_exists.return_value = False
+        assert not DeviceDataManager.wait_platform_ready()
diff --git a/platform/mellanox/mlnx-platform-api/tests/test_utils.py b/platform/mellanox/mlnx-platform-api/tests/test_utils.py
index 2a186de7e5b0..b6ec67975f80 100644
--- a/platform/mellanox/mlnx-platform-api/tests/test_utils.py
+++ b/platform/mellanox/mlnx-platform-api/tests/test_utils.py
@@ -195,6 +195,13 @@ def test_read_key_value_file(self):
         mock_os_open = mock.mock_open(read_data='a=b')
         with mock.patch('sonic_platform.utils.open', mock_os_open):
             assert utils.read_key_value_file('some_file', delimeter='=') == {'a':'b'}
+
+    @mock.patch('sonic_platform.utils.time.sleep', mock.MagicMock())
+    def test_wait_until_conditions(self):
+        conditions = [lambda: True]
+        assert utils.wait_until_conditions(conditions, 1)
+        conditions = [lambda: False]
+        assert not utils.wait_until_conditions(conditions, 1)
 
     def test_timer(self):
         timer = utils.Timer()