Skip to content

Commit

Permalink
[Mellanox] Fix issues found for CMIS host management (#17637)
Browse files Browse the repository at this point in the history
- Why I did it
1. Thermal updater should wait more time for module to be initialized
2. sfp should get temperature threshold from EEPROM because SDK sysfs is not yet supported
3. Rename sfp function to fix typo
4. sfp.get_presence should return False if module is under initialization

- How I did it
1. Thermal updater should wait more time for module to be initialized
2. sfp should get temperature threshold from EEPROM because SDK sysfs is not yet supported
3. Rename sfp function to fix typo
4. sfp.get_presence should return False if module is under initialization

- How to verify it
Manual test
Unit test
  • Loading branch information
Junchao-Mellanox authored and mssonicbld committed Jan 19, 2024
1 parent 3f29b28 commit 8d65e2c
Show file tree
Hide file tree
Showing 7 changed files with 109 additions and 80 deletions.
102 changes: 58 additions & 44 deletions platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,17 +327,10 @@ def get_presence(self):
Returns:
bool: True if device is present, False if not
"""
if DeviceDataManager.is_independent_mode():
if utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/control') != 0:
if not utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/hw_present'):
return False
if not utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/power_good'):
return False
if not utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/power_on'):
return False
if utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/hw_reset') == 1:
return False

try:
self.is_sw_control()
except:
return False
eeprom_raw = self._read_eeprom(0, 1, log_on_error=False)
return eeprom_raw is not None

Expand Down Expand Up @@ -877,6 +870,13 @@ def get_tx_fault(self):
return [False] * api.NUM_CHANNELS if api else None

def get_temperature(self):
"""Get SFP temperature
Returns:
None if there is an error (sysfs does not exist or sysfs return None or module EEPROM not readable)
0.0 if module temperature is not supported or module is under initialization
other float value if module temperature is available
"""
try:
if not self.is_sw_control():
temp_file = f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/input'
Expand All @@ -893,59 +893,68 @@ def get_temperature(self):
temperature = super().get_temperature()
return temperature if temperature is not None else None

def get_temperature_warning_threashold(self):
def get_temperature_warning_threshold(self):
"""Get temperature warning threshold
Returns:
int: temperature warning threshold
None if there is an error (module EEPROM not readable)
0.0 if warning threshold is not supported or module is under initialization
other float value if warning threshold is available
"""
try:
if not self.is_sw_control():
emergency = utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/emergency',
log_func=None,
default=None)
return emergency / SFP_TEMPERATURE_SCALE if emergency is not None else SFP_DEFAULT_TEMP_WARNNING_THRESHOLD
self.is_sw_control()
except:
return SFP_DEFAULT_TEMP_WARNNING_THRESHOLD

thresh = self._get_temperature_threshold()
if thresh and consts.TEMP_HIGH_WARNING_FIELD in thresh:
return thresh[consts.TEMP_HIGH_WARNING_FIELD]
return SFP_DEFAULT_TEMP_WARNNING_THRESHOLD
return 0.0

support, thresh = self._get_temperature_threshold()
if support is None or thresh is None:
# Failed to read from EEPROM
return None
if support is False:
# Do not support
return 0.0
return thresh.get(consts.TEMP_HIGH_WARNING_FIELD, SFP_DEFAULT_TEMP_WARNNING_THRESHOLD)

def get_temperature_critical_threashold(self):
def get_temperature_critical_threshold(self):
"""Get temperature critical threshold
Returns:
int: temperature critical threshold
None if there is an error (module EEPROM not readable)
0.0 if critical threshold is not supported or module is under initialization
other float value if critical threshold is available
"""
try:
if not self.is_sw_control():
critical = utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/critical',
log_func=None,
default=None)
return critical / SFP_TEMPERATURE_SCALE if critical is not None else SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD
self.is_sw_control()
except:
return SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD
return 0.0

thresh = self._get_temperature_threshold()
if thresh and consts.TEMP_HIGH_ALARM_FIELD in thresh:
return thresh[consts.TEMP_HIGH_ALARM_FIELD]
return SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD
support, thresh = self._get_temperature_threshold()
if support is None or thresh is None:
# Failed to read from EEPROM
return None
if support is False:
# Do not support
return 0.0
return thresh.get(consts.TEMP_HIGH_ALARM_FIELD, SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD)

def _get_temperature_threshold(self):
"""Get temperature thresholds data from EEPROM
Returns:
tuple: (support, thresh_dict)
"""
self.reinit()
api = self.get_xcvr_api()
if not api:
return None
return None, None

thresh_support = api.get_transceiver_thresholds_support()
if thresh_support:
if isinstance(api, sff8636.Sff8636Api) or isinstance(api, sff8436.Sff8436Api):
return api.xcvr_eeprom.read(consts.TEMP_THRESHOLDS_FIELD)
return api.xcvr_eeprom.read(consts.THRESHOLDS_FIELD)
return thresh_support, api.xcvr_eeprom.read(consts.TEMP_THRESHOLDS_FIELD)
return thresh_support, api.xcvr_eeprom.read(consts.THRESHOLDS_FIELD)
else:
return None
return thresh_support, {}

def get_xcvr_api(self):
"""
Expand All @@ -964,17 +973,22 @@ def get_xcvr_api(self):
def is_sw_control(self):
if not DeviceDataManager.is_independent_mode():
return False

db = utils.DbUtils.get_db_instance('STATE_DB')
logical_port = NvidiaSFPCommon.get_logical_port_by_sfp_index(self.sdk_index)
if not logical_port:
raise Exception(f'Module {self.sdk_index} is not present or in initialization')
raise Exception(f'Module {self.sdk_index} is not present or under initialization')

initialized = db.exists('STATE_DB', f'TRANSCEIVER_STATUS|{logical_port}')
if not initialized:
raise Exception(f'Module {self.sdk_index} is not present or in initialization')
raise Exception(f'Module {self.sdk_index} is not present or under initialization')

return utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/control') == 1
try:
return utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/control',
raise_exception=True, log_func=None) == 1
except:
# just in case control file does not exist
raise Exception(f'Module {self.sdk_index} is under initialization')


class RJ45Port(NvidiaSFPCommon):
Expand Down
9 changes: 6 additions & 3 deletions platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,7 +431,8 @@ def get_temperature(self):
A float number of current temperature in Celsius up to nearest thousandth
of one degree Celsius, e.g. 30.125
"""
return self.sfp.get_temperature()
value = self.sfp.get_temperature()
return value if (value != 0.0 and value is not None) else None

def get_high_threshold(self):
"""
Expand All @@ -441,7 +442,8 @@ def get_high_threshold(self):
A float number, the high threshold temperature of thermal in Celsius
up to nearest thousandth of one degree Celsius, e.g. 30.125
"""
return self.sfp.get_temperature_warning_threashold()
value = self.sfp.get_temperature_warning_threshold()
return value if (value != 0.0 and value is not None) else None

def get_high_critical_threshold(self):
"""
Expand All @@ -451,7 +453,8 @@ def get_high_critical_threshold(self):
A float number, the high critical threshold temperature of thermal in Celsius
up to nearest thousandth of one degree Celsius, e.g. 30.125
"""
return self.sfp.get_temperature_critical_threashold()
value = self.sfp.get_temperature_critical_threshold()
return value if (value != 0.0 and value is not None) else None

def get_position_in_parent(self):
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,5 +46,5 @@ def deinitialize(cls):
is a no-op.
:return:
"""
if DeviceDataManager.is_independent_mode():
if DeviceDataManager.is_independent_mode() and cls.thermal_updater_task:
cls.thermal_updater_task.stop()
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def __init__(self, sfp_list):
def load_tc_config(self):
asic_poll_interval = 1
sfp_poll_interval = 10
data = utils.load_json_file(TC_CONFIG_FILE)
data = utils.load_json_file(TC_CONFIG_FILE, log_func=None)
if not data:
logger.log_notice(f'{TC_CONFIG_FILE} does not exist, use default polling interval')

Expand Down Expand Up @@ -108,7 +108,7 @@ def clean_thermal_data(self):

def wait_all_sfp_ready(self):
logger.log_notice('Waiting for all SFP modules ready...')
max_wait_time = 60
max_wait_time = 300
ready_set = set()
while len(ready_set) != len(self._sfp_list):
for sfp in self._sfp_list:
Expand All @@ -129,11 +129,11 @@ def get_asic_temp(self):
temperature = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/input', default=None)
return temperature * ASIC_TEMPERATURE_SCALE if temperature is not None else None

def get_asic_temp_warning_threashold(self):
def get_asic_temp_warning_threshold(self):
emergency = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/emergency', default=None, log_func=None)
return emergency * ASIC_TEMPERATURE_SCALE if emergency is not None else ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD

def get_asic_temp_critical_threashold(self):
def get_asic_temp_critical_threshold(self):
critical = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/critical', default=None, log_func=None)
return critical * ASIC_TEMPERATURE_SCALE if critical is not None else ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD

Expand All @@ -148,19 +148,19 @@ def update_single_module(self, sfp):
critical_thresh = 0
fault = 0
else:
warning_thresh = sfp.get_temperature_warning_threashold()
critical_thresh = sfp.get_temperature_critical_threashold()
warning_thresh = sfp.get_temperature_warning_threshold()
critical_thresh = sfp.get_temperature_critical_threshold()
fault = ERROR_READ_THERMAL_DATA if (temperature is None or warning_thresh is None or critical_thresh is None) else 0
temperature = 0 if temperature is None else int(temperature * SFP_TEMPERATURE_SCALE)
warning_thresh = 0 if warning_thresh is None else int(warning_thresh * SFP_TEMPERATURE_SCALE)
critical_thresh = 0 if critical_thresh is None else int(critical_thresh * SFP_TEMPERATURE_SCALE)
temperature = 0 if temperature is None else temperature * SFP_TEMPERATURE_SCALE
warning_thresh = 0 if warning_thresh is None else warning_thresh * SFP_TEMPERATURE_SCALE
critical_thresh = 0 if critical_thresh is None else critical_thresh * SFP_TEMPERATURE_SCALE

hw_management_independent_mode_update.thermal_data_set_module(
0, # ASIC index always 0 for now
sfp.sdk_index + 1,
temperature,
critical_thresh,
warning_thresh,
int(temperature),
int(critical_thresh),
int(warning_thresh),
fault
)
else:
Expand All @@ -170,7 +170,7 @@ def update_single_module(self, sfp):
if pre_presence != presence:
self._sfp_status[sfp.sdk_index] = presence
except Exception as e:
logger.log_error('Failed to update module {sfp.sdk_index} thermal data - {e}')
logger.log_error(f'Failed to update module {sfp.sdk_index} thermal data - {e}')
hw_management_independent_mode_update.thermal_data_set_module(
0, # ASIC index always 0 for now
sfp.sdk_index + 1,
Expand All @@ -187,8 +187,8 @@ def update_module(self):
def update_asic(self):
try:
asic_temp = self.get_asic_temp()
warn_threshold = self.get_asic_temp_warning_threashold()
critical_threshold = self.get_asic_temp_critical_threashold()
warn_threshold = self.get_asic_temp_warning_threshold()
critical_threshold = self.get_asic_temp_critical_threshold()
fault = 0
if asic_temp is None:
logger.log_error('Failed to read ASIC temperature, send fault to hw-management-tc')
Expand All @@ -203,7 +203,7 @@ def update_asic(self):
fault
)
except Exception as e:
logger.log_error('Failed to update ASIC thermal data - {e}')
logger.log_error(f'Failed to update ASIC thermal data - {e}')
hw_management_independent_mode_update.thermal_data_set_asic(
0, # ASIC index always 0 for now
0,
Expand Down
22 changes: 14 additions & 8 deletions platform/mellanox/mlnx-platform-api/tests/test_sfp.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,14 +230,18 @@ def test_get_page_and_page_offset(self, mock_get_type_str, mock_eeprom_path, moc
assert page == '/tmp/1/data'
assert page_offset is 0

@mock.patch('sonic_platform.sfp.SFP.is_sw_control')
@mock.patch('sonic_platform.sfp.SFP._read_eeprom')
def test_sfp_get_presence(self, mock_read):
def test_sfp_get_presence(self, mock_read, mock_control):
sfp = SFP(0)
mock_read.return_value = None
assert not sfp.get_presence()

mock_read.return_value = 0
assert sfp.get_presence()

mock_control.side_effect = RuntimeError('')
assert not sfp.get_presence()

@mock.patch('sonic_platform.utils.read_int_from_file')
def test_rj45_get_presence(self, mock_read_int):
Expand Down Expand Up @@ -318,14 +322,16 @@ def test_get_temperature(self, mock_read, mock_exists):
def test_get_temperature_threshold(self):
sfp = SFP(0)
sfp.is_sw_control = mock.MagicMock(return_value=True)
assert sfp.get_temperature_warning_threashold() == 70.0
assert sfp.get_temperature_critical_threashold() == 80.0

mock_api = mock.MagicMock()
mock_api.get_transceiver_thresholds_support = mock.MagicMock(return_value=False)
sfp.get_xcvr_api = mock.MagicMock(return_value=mock_api)
assert sfp.get_temperature_warning_threashold() == 70.0
assert sfp.get_temperature_critical_threashold() == 80.0
sfp.get_xcvr_api = mock.MagicMock(return_value=None)
assert sfp.get_temperature_warning_threshold() is None
assert sfp.get_temperature_critical_threshold() is None

sfp.get_xcvr_api.return_value = mock_api
assert sfp.get_temperature_warning_threshold() == 0.0
assert sfp.get_temperature_critical_threshold() == 0.0

from sonic_platform_base.sonic_xcvr.fields import consts
mock_api.get_transceiver_thresholds_support.return_value = True
Expand All @@ -334,8 +340,8 @@ def test_get_temperature_threshold(self):
consts.TEMP_HIGH_ALARM_FIELD: 85.0,
consts.TEMP_HIGH_WARNING_FIELD: 75.0
})
assert sfp.get_temperature_warning_threashold() == 75.0
assert sfp.get_temperature_critical_threashold() == 85.0
assert sfp.get_temperature_warning_threshold() == 75.0
assert sfp.get_temperature_critical_threshold() == 85.0

@mock.patch('sonic_platform.sfp.NvidiaSFPCommon.get_logical_port_by_sfp_index')
@mock.patch('sonic_platform.utils.read_int_from_file')
Expand Down
10 changes: 8 additions & 2 deletions platform/mellanox/mlnx-platform-api/tests/test_thermal.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,11 +160,17 @@ def test_sfp_thermal(self):
assert thermal.get_position_in_parent() == 1
assert thermal.is_replaceable() == False
sfp.get_temperature = mock.MagicMock(return_value=35.4)
sfp.get_temperature_warning_threashold = mock.MagicMock(return_value=70)
sfp.get_temperature_critical_threashold = mock.MagicMock(return_value=80)
sfp.get_temperature_warning_threshold = mock.MagicMock(return_value=70)
sfp.get_temperature_critical_threshold = mock.MagicMock(return_value=80)
assert thermal.get_temperature() == 35.4
assert thermal.get_high_threshold() == 70
assert thermal.get_high_critical_threshold() == 80
sfp.get_temperature = mock.MagicMock(return_value=0)
sfp.get_temperature_warning_threshold = mock.MagicMock(return_value=0)
sfp.get_temperature_critical_threshold = mock.MagicMock(return_value=None)
assert thermal.get_temperature() is None
assert thermal.get_high_threshold() is None
assert thermal.get_high_critical_threshold() is None

@mock.patch('sonic_platform.utils.read_float_from_file')
def test_get_temperature(self, mock_read):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,23 +97,23 @@ def test_update_asic(self, mock_read):
mock_read.return_value = 8
updater = ThermalUpdater(None)
assert updater.get_asic_temp() == 1000
assert updater.get_asic_temp_warning_threashold() == 1000
assert updater.get_asic_temp_critical_threashold() == 1000
assert updater.get_asic_temp_warning_threshold() == 1000
assert updater.get_asic_temp_critical_threshold() == 1000
updater.update_asic()
hw_management_independent_mode_update.thermal_data_set_asic.assert_called_once()

mock_read.return_value = None
assert updater.get_asic_temp() is None
assert updater.get_asic_temp_warning_threashold() == ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD
assert updater.get_asic_temp_critical_threashold() == ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD
assert updater.get_asic_temp_warning_threshold() == ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD
assert updater.get_asic_temp_critical_threshold() == ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD

def test_update_module(self):
mock_sfp = mock.MagicMock()
mock_sfp.sdk_index = 10
mock_sfp.get_presence = mock.MagicMock(return_value=True)
mock_sfp.get_temperature = mock.MagicMock(return_value=55.0)
mock_sfp.get_temperature_warning_threashold = mock.MagicMock(return_value=70.0)
mock_sfp.get_temperature_critical_threashold = mock.MagicMock(return_value=80.0)
mock_sfp.get_temperature_warning_threshold = mock.MagicMock(return_value=70.0)
mock_sfp.get_temperature_critical_threshold = mock.MagicMock(return_value=80.0)
updater = ThermalUpdater([mock_sfp])
updater.update_module()
hw_management_independent_mode_update.thermal_data_set_module.assert_called_once_with(0, 11, 55000, 80000, 70000, 0)
Expand Down

0 comments on commit 8d65e2c

Please sign in to comment.