diff --git a/scripts/fast-reboot b/scripts/fast-reboot index e183c34219..88c9c9a6f4 100755 --- a/scripts/fast-reboot +++ b/scripts/fast-reboot @@ -758,8 +758,8 @@ trap '' EXIT HUP INT QUIT TERM KILL ABRT ALRM # this is a non-blocking call, and the process will die in 300s debug "Starting lag_keepalive to send LACPDUs ..." timeout 300 python ${LAG_KEEPALIVE_SCRIPT} & -# give the lag_keepalive script a chance to get ready (30s) and collect one lacpdu before going down (30s) -sleep 60 +# give the lag_keepalive script a chance to send some LACPDUs +sleep 10 if [ -x ${LOG_SSD_HEALTH} ]; then debug "Collecting logs to check ssd health before ${REBOOT_TYPE}..." diff --git a/scripts/lag_keepalive.py b/scripts/lag_keepalive.py index 04ac01a074..45a1f2ff06 100644 --- a/scripts/lag_keepalive.py +++ b/scripts/lag_keepalive.py @@ -2,12 +2,19 @@ from scapy.config import conf conf.ipv6_enabled = False -from scapy.all import sendp, sniff -from swsscommon.swsscommon import ConfigDBConnector -import time, threading, traceback -import syslog +from scapy.layers.l2 import Ether # noqa: E402 +from scapy.sendrecv import sendp # noqa: E402 +import scapy.contrib.lacp # noqa: E402 +import subprocess # noqa: E402 +import json # noqa: E402 +import time # noqa: E402 +import traceback # noqa: E402 +import syslog # noqa: E402 +from swsscommon.swsscommon import ConfigDBConnector # noqa: E402 SYSLOG_ID = 'lag_keepalive' +SLOW_PROTOCOL_MAC_ADDRESS = "01:80:c2:00:00:02" +LACP_ETHERTYPE = 0x8809 def log_info(msg): @@ -22,41 +29,57 @@ def log_error(msg): syslog.closelog() -def sniff_lacpdu(device_mac, lag_member, lag_member_to_packet): - sniffed_packet = sniff(iface=lag_member, - filter="ether proto 0x8809 and ether src {}".format(device_mac), - count=1, timeout=30) - lag_member_to_packet[lag_member] = sniffed_packet +def getCmdOutput(cmd): + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE) + return proc.communicate()[0], proc.returncode + + +def get_port_channel_config(portChannelName): + (processStdout, _) = getCmdOutput(["teamdctl", portChannelName, "state", "dump"]) + return json.loads(processStdout) + + +def craft_lacp_packet(portChannelConfig, portName): + portConfig = portChannelConfig["ports"][portName] + actorConfig = portConfig["runner"]["actor_lacpdu_info"] + partnerConfig = portConfig["runner"]["partner_lacpdu_info"] + l2 = Ether(dst=SLOW_PROTOCOL_MAC_ADDRESS, src=portConfig["ifinfo"]["dev_addr"], type=LACP_ETHERTYPE) + l3 = scapy.contrib.lacp.SlowProtocol(subtype=0x01) + l4 = scapy.contrib.lacp.LACP() + l4.version = 0x1 + l4.actor_system_priority = actorConfig["system_priority"] + l4.actor_system = actorConfig["system"] + l4.actor_key = actorConfig["key"] + l4.actor_port_priority = actorConfig["port_priority"] + l4.actor_port_number = actorConfig["port"] + l4.actor_state = actorConfig["state"] + l4.partner_system_priority = partnerConfig["system_priority"] + l4.partner_system = partnerConfig["system"] + l4.partner_key = partnerConfig["key"] + l4.partner_port_priority = partnerConfig["port_priority"] + l4.partner_port_number = partnerConfig["port"] + l4.partner_state = partnerConfig["state"] + packet = l2 / l3 / l4 + return packet def get_lacpdu_per_lag_member(): appDB = ConfigDBConnector() appDB.db_connect('APPL_DB') appDB_lag_info = appDB.get_keys('LAG_MEMBER_TABLE') - configDB = ConfigDBConnector() - configDB.db_connect('CONFIG_DB') - device_mac = configDB.get(configDB.CONFIG_DB, "DEVICE_METADATA|localhost", "mac") - hwsku = configDB.get(configDB.CONFIG_DB, "DEVICE_METADATA|localhost", "hwsku") active_lag_members = list() lag_member_to_packet = dict() - sniffer_threads = list() for lag_entry in appDB_lag_info: lag_name = str(lag_entry[0]) - oper_status = appDB.get(appDB.APPL_DB,"LAG_TABLE:{}".format(lag_name), "oper_status") + oper_status = appDB.get(appDB.APPL_DB, "LAG_TABLE:{}".format(lag_name), "oper_status") if oper_status == "up": # only apply the workaround for active lags lag_member = str(lag_entry[1]) active_lag_members.append(lag_member) - # use threading to capture lacpdus from several lag members simultaneously - sniffer_thread = threading.Thread(target=sniff_lacpdu, - args=(device_mac, lag_member, lag_member_to_packet)) - sniffer_thread.start() - sniffer_threads.append(sniffer_thread) - - # sniff for lacpdu should finish in <= 30s. sniff timeout is also set to 30s - for sniffer in sniffer_threads: - sniffer.join(timeout=30) - + # craft lacpdu packets for each lag member based on config + port_channel_config = get_port_channel_config(lag_name) + lag_member_to_packet[lag_member] = craft_lacp_packet(port_channel_config, lag_member) + return active_lag_members, lag_member_to_packet @@ -80,9 +103,9 @@ def main(): try: active_lag_members, lag_member_to_packet = get_lacpdu_per_lag_member() if len(active_lag_members) != len(lag_member_to_packet.keys()): - log_error("Failed to capture LACPDU packets for some lag members. " +\ - "Active lag members: {}. LACPDUs captured for: {}".format( - active_lag_members, lag_member_to_packet.keys())) + log_error("Failed to craft LACPDU packets for some lag members. " + + "Active lag members: {}. LACPDUs craft for: {}".format( + active_lag_members, lag_member_to_packet.keys())) log_info("ready to send LACPDU packets via {}".format(lag_member_to_packet.keys())) except Exception: @@ -98,5 +121,6 @@ def main(): # start an infinite loop to keep sending lacpdus from lag member ports lag_keepalive(lag_member_to_packet) + if __name__ == "__main__": main()