diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py b/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py index d03c0fe10e79..f6a9380bf6aa 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py @@ -327,17 +327,10 @@ def get_presence(self): Returns: bool: True if device is present, False if not """ - if DeviceDataManager.is_independent_mode(): - if utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/control') != 0: - if not utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/hw_present'): - return False - if not utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/power_good'): - return False - if not utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/power_on'): - return False - if utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/hw_reset') == 1: - return False - + try: + self.is_sw_control() + except: + return False eeprom_raw = self._read_eeprom(0, 1, log_on_error=False) return eeprom_raw is not None @@ -877,6 +870,13 @@ def get_tx_fault(self): return [False] * api.NUM_CHANNELS if api else None def get_temperature(self): + """Get SFP temperature + + Returns: + None if there is an error (sysfs does not exist or sysfs return None or module EEPROM not readable) + 0.0 if module temperature is not supported or module is under initialization + other float value if module temperature is available + """ try: if not self.is_sw_control(): temp_file = f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/input' @@ -893,59 +893,68 @@ def get_temperature(self): temperature = super().get_temperature() return temperature if temperature is not None else None - def get_temperature_warning_threashold(self): + def get_temperature_warning_threshold(self): """Get temperature warning threshold Returns: - int: temperature warning threshold + None if there is an error (module EEPROM not readable) + 0.0 if warning threshold is not supported or module is under initialization + other float value if warning threshold is available """ try: - if not self.is_sw_control(): - emergency = utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/emergency', - log_func=None, - default=None) - return emergency / SFP_TEMPERATURE_SCALE if emergency is not None else SFP_DEFAULT_TEMP_WARNNING_THRESHOLD + self.is_sw_control() except: - return SFP_DEFAULT_TEMP_WARNNING_THRESHOLD - - thresh = self._get_temperature_threshold() - if thresh and consts.TEMP_HIGH_WARNING_FIELD in thresh: - return thresh[consts.TEMP_HIGH_WARNING_FIELD] - return SFP_DEFAULT_TEMP_WARNNING_THRESHOLD + return 0.0 + + support, thresh = self._get_temperature_threshold() + if support is None or thresh is None: + # Failed to read from EEPROM + return None + if support is False: + # Do not support + return 0.0 + return thresh.get(consts.TEMP_HIGH_WARNING_FIELD, SFP_DEFAULT_TEMP_WARNNING_THRESHOLD) - def get_temperature_critical_threashold(self): + def get_temperature_critical_threshold(self): """Get temperature critical threshold Returns: - int: temperature critical threshold + None if there is an error (module EEPROM not readable) + 0.0 if critical threshold is not supported or module is under initialization + other float value if critical threshold is available """ try: - if not self.is_sw_control(): - critical = utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/critical', - log_func=None, - default=None) - return critical / SFP_TEMPERATURE_SCALE if critical is not None else SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD + self.is_sw_control() except: - return SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD + return 0.0 - thresh = self._get_temperature_threshold() - if thresh and consts.TEMP_HIGH_ALARM_FIELD in thresh: - return thresh[consts.TEMP_HIGH_ALARM_FIELD] - return SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD + support, thresh = self._get_temperature_threshold() + if support is None or thresh is None: + # Failed to read from EEPROM + return None + if support is False: + # Do not support + return 0.0 + return thresh.get(consts.TEMP_HIGH_ALARM_FIELD, SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD) def _get_temperature_threshold(self): + """Get temperature thresholds data from EEPROM + + Returns: + tuple: (support, thresh_dict) + """ self.reinit() api = self.get_xcvr_api() if not api: - return None + return None, None thresh_support = api.get_transceiver_thresholds_support() if thresh_support: if isinstance(api, sff8636.Sff8636Api) or isinstance(api, sff8436.Sff8436Api): - return api.xcvr_eeprom.read(consts.TEMP_THRESHOLDS_FIELD) - return api.xcvr_eeprom.read(consts.THRESHOLDS_FIELD) + return thresh_support, api.xcvr_eeprom.read(consts.TEMP_THRESHOLDS_FIELD) + return thresh_support, api.xcvr_eeprom.read(consts.THRESHOLDS_FIELD) else: - return None + return thresh_support, {} def get_xcvr_api(self): """ @@ -964,17 +973,22 @@ def get_xcvr_api(self): def is_sw_control(self): if not DeviceDataManager.is_independent_mode(): return False - + db = utils.DbUtils.get_db_instance('STATE_DB') logical_port = NvidiaSFPCommon.get_logical_port_by_sfp_index(self.sdk_index) if not logical_port: - raise Exception(f'Module {self.sdk_index} is not present or in initialization') + raise Exception(f'Module {self.sdk_index} is not present or under initialization') initialized = db.exists('STATE_DB', f'TRANSCEIVER_STATUS|{logical_port}') if not initialized: - raise Exception(f'Module {self.sdk_index} is not present or in initialization') + raise Exception(f'Module {self.sdk_index} is not present or under initialization') - return utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/control') == 1 + try: + return utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/control', + raise_exception=True, log_func=None) == 1 + except: + # just in case control file does not exist + raise Exception(f'Module {self.sdk_index} is under initialization') class RJ45Port(NvidiaSFPCommon): diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py index 435389321d0b..cdd47af607c2 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py @@ -431,7 +431,8 @@ def get_temperature(self): A float number of current temperature in Celsius up to nearest thousandth of one degree Celsius, e.g. 30.125 """ - return self.sfp.get_temperature() + value = self.sfp.get_temperature() + return value if (value != 0.0 and value is not None) else None def get_high_threshold(self): """ @@ -441,7 +442,8 @@ def get_high_threshold(self): A float number, the high threshold temperature of thermal in Celsius up to nearest thousandth of one degree Celsius, e.g. 30.125 """ - return self.sfp.get_temperature_warning_threashold() + value = self.sfp.get_temperature_warning_threshold() + return value if (value != 0.0 and value is not None) else None def get_high_critical_threshold(self): """ @@ -451,7 +453,8 @@ def get_high_critical_threshold(self): A float number, the high critical threshold temperature of thermal in Celsius up to nearest thousandth of one degree Celsius, e.g. 30.125 """ - return self.sfp.get_temperature_critical_threashold() + value = self.sfp.get_temperature_critical_threshold() + return value if (value != 0.0 and value is not None) else None def get_position_in_parent(self): """ diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py index 9e1aaded0586..5c118b4c9a07 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py @@ -46,5 +46,5 @@ def deinitialize(cls): is a no-op. :return: """ - if DeviceDataManager.is_independent_mode(): + if DeviceDataManager.is_independent_mode() and cls.thermal_updater_task: cls.thermal_updater_task.stop() diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py index ad0b92ef4ee6..f2f0f75b2fd1 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py @@ -56,7 +56,7 @@ def __init__(self, sfp_list): def load_tc_config(self): asic_poll_interval = 1 sfp_poll_interval = 10 - data = utils.load_json_file(TC_CONFIG_FILE) + data = utils.load_json_file(TC_CONFIG_FILE, log_func=None) if not data: logger.log_notice(f'{TC_CONFIG_FILE} does not exist, use default polling interval') @@ -108,7 +108,7 @@ def clean_thermal_data(self): def wait_all_sfp_ready(self): logger.log_notice('Waiting for all SFP modules ready...') - max_wait_time = 60 + max_wait_time = 300 ready_set = set() while len(ready_set) != len(self._sfp_list): for sfp in self._sfp_list: @@ -129,11 +129,11 @@ def get_asic_temp(self): temperature = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/input', default=None) return temperature * ASIC_TEMPERATURE_SCALE if temperature is not None else None - def get_asic_temp_warning_threashold(self): + def get_asic_temp_warning_threshold(self): emergency = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/emergency', default=None, log_func=None) return emergency * ASIC_TEMPERATURE_SCALE if emergency is not None else ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD - def get_asic_temp_critical_threashold(self): + def get_asic_temp_critical_threshold(self): critical = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/critical', default=None, log_func=None) return critical * ASIC_TEMPERATURE_SCALE if critical is not None else ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD @@ -148,19 +148,19 @@ def update_single_module(self, sfp): critical_thresh = 0 fault = 0 else: - warning_thresh = sfp.get_temperature_warning_threashold() - critical_thresh = sfp.get_temperature_critical_threashold() + warning_thresh = sfp.get_temperature_warning_threshold() + critical_thresh = sfp.get_temperature_critical_threshold() fault = ERROR_READ_THERMAL_DATA if (temperature is None or warning_thresh is None or critical_thresh is None) else 0 - temperature = 0 if temperature is None else int(temperature * SFP_TEMPERATURE_SCALE) - warning_thresh = 0 if warning_thresh is None else int(warning_thresh * SFP_TEMPERATURE_SCALE) - critical_thresh = 0 if critical_thresh is None else int(critical_thresh * SFP_TEMPERATURE_SCALE) + temperature = 0 if temperature is None else temperature * SFP_TEMPERATURE_SCALE + warning_thresh = 0 if warning_thresh is None else warning_thresh * SFP_TEMPERATURE_SCALE + critical_thresh = 0 if critical_thresh is None else critical_thresh * SFP_TEMPERATURE_SCALE hw_management_independent_mode_update.thermal_data_set_module( 0, # ASIC index always 0 for now sfp.sdk_index + 1, - temperature, - critical_thresh, - warning_thresh, + int(temperature), + int(critical_thresh), + int(warning_thresh), fault ) else: @@ -170,7 +170,7 @@ def update_single_module(self, sfp): if pre_presence != presence: self._sfp_status[sfp.sdk_index] = presence except Exception as e: - logger.log_error('Failed to update module {sfp.sdk_index} thermal data - {e}') + logger.log_error(f'Failed to update module {sfp.sdk_index} thermal data - {e}') hw_management_independent_mode_update.thermal_data_set_module( 0, # ASIC index always 0 for now sfp.sdk_index + 1, @@ -187,8 +187,8 @@ def update_module(self): def update_asic(self): try: asic_temp = self.get_asic_temp() - warn_threshold = self.get_asic_temp_warning_threashold() - critical_threshold = self.get_asic_temp_critical_threashold() + warn_threshold = self.get_asic_temp_warning_threshold() + critical_threshold = self.get_asic_temp_critical_threshold() fault = 0 if asic_temp is None: logger.log_error('Failed to read ASIC temperature, send fault to hw-management-tc') @@ -203,7 +203,7 @@ def update_asic(self): fault ) except Exception as e: - logger.log_error('Failed to update ASIC thermal data - {e}') + logger.log_error(f'Failed to update ASIC thermal data - {e}') hw_management_independent_mode_update.thermal_data_set_asic( 0, # ASIC index always 0 for now 0, diff --git a/platform/mellanox/mlnx-platform-api/tests/test_sfp.py b/platform/mellanox/mlnx-platform-api/tests/test_sfp.py index d273e9bce700..499983a01e15 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_sfp.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_sfp.py @@ -230,14 +230,18 @@ def test_get_page_and_page_offset(self, mock_get_type_str, mock_eeprom_path, moc assert page == '/tmp/1/data' assert page_offset is 0 + @mock.patch('sonic_platform.sfp.SFP.is_sw_control') @mock.patch('sonic_platform.sfp.SFP._read_eeprom') - def test_sfp_get_presence(self, mock_read): + def test_sfp_get_presence(self, mock_read, mock_control): sfp = SFP(0) mock_read.return_value = None assert not sfp.get_presence() mock_read.return_value = 0 assert sfp.get_presence() + + mock_control.side_effect = RuntimeError('') + assert not sfp.get_presence() @mock.patch('sonic_platform.utils.read_int_from_file') def test_rj45_get_presence(self, mock_read_int): @@ -318,14 +322,16 @@ def test_get_temperature(self, mock_read, mock_exists): def test_get_temperature_threshold(self): sfp = SFP(0) sfp.is_sw_control = mock.MagicMock(return_value=True) - assert sfp.get_temperature_warning_threashold() == 70.0 - assert sfp.get_temperature_critical_threashold() == 80.0 mock_api = mock.MagicMock() mock_api.get_transceiver_thresholds_support = mock.MagicMock(return_value=False) - sfp.get_xcvr_api = mock.MagicMock(return_value=mock_api) - assert sfp.get_temperature_warning_threashold() == 70.0 - assert sfp.get_temperature_critical_threashold() == 80.0 + sfp.get_xcvr_api = mock.MagicMock(return_value=None) + assert sfp.get_temperature_warning_threshold() is None + assert sfp.get_temperature_critical_threshold() is None + + sfp.get_xcvr_api.return_value = mock_api + assert sfp.get_temperature_warning_threshold() == 0.0 + assert sfp.get_temperature_critical_threshold() == 0.0 from sonic_platform_base.sonic_xcvr.fields import consts mock_api.get_transceiver_thresholds_support.return_value = True @@ -334,8 +340,8 @@ def test_get_temperature_threshold(self): consts.TEMP_HIGH_ALARM_FIELD: 85.0, consts.TEMP_HIGH_WARNING_FIELD: 75.0 }) - assert sfp.get_temperature_warning_threashold() == 75.0 - assert sfp.get_temperature_critical_threashold() == 85.0 + assert sfp.get_temperature_warning_threshold() == 75.0 + assert sfp.get_temperature_critical_threshold() == 85.0 @mock.patch('sonic_platform.sfp.NvidiaSFPCommon.get_logical_port_by_sfp_index') @mock.patch('sonic_platform.utils.read_int_from_file') diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal.py index a59b8dda4055..e17d91cb0818 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_thermal.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_thermal.py @@ -160,11 +160,17 @@ def test_sfp_thermal(self): assert thermal.get_position_in_parent() == 1 assert thermal.is_replaceable() == False sfp.get_temperature = mock.MagicMock(return_value=35.4) - sfp.get_temperature_warning_threashold = mock.MagicMock(return_value=70) - sfp.get_temperature_critical_threashold = mock.MagicMock(return_value=80) + sfp.get_temperature_warning_threshold = mock.MagicMock(return_value=70) + sfp.get_temperature_critical_threshold = mock.MagicMock(return_value=80) assert thermal.get_temperature() == 35.4 assert thermal.get_high_threshold() == 70 assert thermal.get_high_critical_threshold() == 80 + sfp.get_temperature = mock.MagicMock(return_value=0) + sfp.get_temperature_warning_threshold = mock.MagicMock(return_value=0) + sfp.get_temperature_critical_threshold = mock.MagicMock(return_value=None) + assert thermal.get_temperature() is None + assert thermal.get_high_threshold() is None + assert thermal.get_high_critical_threshold() is None @mock.patch('sonic_platform.utils.read_float_from_file') def test_get_temperature(self, mock_read): diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py index 1a34a7440a2d..8e7509ce9b69 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py @@ -97,23 +97,23 @@ def test_update_asic(self, mock_read): mock_read.return_value = 8 updater = ThermalUpdater(None) assert updater.get_asic_temp() == 1000 - assert updater.get_asic_temp_warning_threashold() == 1000 - assert updater.get_asic_temp_critical_threashold() == 1000 + assert updater.get_asic_temp_warning_threshold() == 1000 + assert updater.get_asic_temp_critical_threshold() == 1000 updater.update_asic() hw_management_independent_mode_update.thermal_data_set_asic.assert_called_once() mock_read.return_value = None assert updater.get_asic_temp() is None - assert updater.get_asic_temp_warning_threashold() == ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD - assert updater.get_asic_temp_critical_threashold() == ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD + assert updater.get_asic_temp_warning_threshold() == ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD + assert updater.get_asic_temp_critical_threshold() == ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD def test_update_module(self): mock_sfp = mock.MagicMock() mock_sfp.sdk_index = 10 mock_sfp.get_presence = mock.MagicMock(return_value=True) mock_sfp.get_temperature = mock.MagicMock(return_value=55.0) - mock_sfp.get_temperature_warning_threashold = mock.MagicMock(return_value=70.0) - mock_sfp.get_temperature_critical_threashold = mock.MagicMock(return_value=80.0) + mock_sfp.get_temperature_warning_threshold = mock.MagicMock(return_value=70.0) + mock_sfp.get_temperature_critical_threshold = mock.MagicMock(return_value=80.0) updater = ThermalUpdater([mock_sfp]) updater.update_module() hw_management_independent_mode_update.thermal_data_set_module.assert_called_once_with(0, 11, 55000, 80000, 70000, 0)