From fec49f210759fdfc36448a9fccf5d6ad0fcc4815 Mon Sep 17 00:00:00 2001 From: Saikrishna Arcot Date: Mon, 19 Feb 2024 11:10:30 -0800 Subject: [PATCH 1/2] Optimize lag_keepalive by crafting the LACPDU packet ourselves Instead of waiting for a LACPDU packet to be sent and capturing that (which involves waiting roughly 30 seconds), get the necessary information from teamd and craft it ourselves. This means that the 60-second wait in making sure that a LACPDU packet is captured and the keepalive script is ready can be largely eliminated (this has been reduced to 10 seconds to make sure the script has a chance to craft the packets and send some LACPDUs). Signed-off-by: Saikrishna Arcot --- scripts/fast-reboot | 4 +-- scripts/lag_keepalive.py | 72 ++++++++++++++++++++++++++-------------- 2 files changed, 50 insertions(+), 26 deletions(-) diff --git a/scripts/fast-reboot b/scripts/fast-reboot index 922d217e3f..3dab4e3995 100755 --- a/scripts/fast-reboot +++ b/scripts/fast-reboot @@ -689,8 +689,8 @@ trap '' EXIT HUP INT QUIT TERM KILL ABRT ALRM # this is a non-blocking call, and the process will die in 300s debug "Starting lag_keepalive to send LACPDUs ..." timeout 300 python ${LAG_KEEPALIVE_SCRIPT} & -# give the lag_keepalive script a chance to get ready (30s) and collect one lacpdu before going down (30s) -sleep 60 +# give the lag_keepalive script a chance to send some LACPDUs +sleep 10 if [ -x ${LOG_SSD_HEALTH} ]; then debug "Collecting logs to check ssd health before ${REBOOT_TYPE}..." diff --git a/scripts/lag_keepalive.py b/scripts/lag_keepalive.py index 04ac01a074..4855e60841 100644 --- a/scripts/lag_keepalive.py +++ b/scripts/lag_keepalive.py @@ -2,12 +2,20 @@ from scapy.config import conf conf.ipv6_enabled = False -from scapy.all import sendp, sniff +from scapy.fields import ByteField, ShortField, MACField, XStrFixedLenField, ConditionalField +from scapy.layers.l2 import Ether +from scapy.sendrecv import sendp, sniff +from scapy.packet import Packet, split_layers, bind_layers +import scapy.contrib.lacp +import subprocess +import json from swsscommon.swsscommon import ConfigDBConnector -import time, threading, traceback +import time, traceback import syslog SYSLOG_ID = 'lag_keepalive' +SLOW_PROTOCOL_MAC_ADDRESS = "01:80:c2:00:00:02" +LACP_ETHERTYPE = 0x8809 def log_info(msg): @@ -22,24 +30,46 @@ def log_error(msg): syslog.closelog() -def sniff_lacpdu(device_mac, lag_member, lag_member_to_packet): - sniffed_packet = sniff(iface=lag_member, - filter="ether proto 0x8809 and ether src {}".format(device_mac), - count=1, timeout=30) - lag_member_to_packet[lag_member] = sniffed_packet +def getCmdOutput(cmd): + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE) + return proc.communicate()[0], proc.returncode + + +def get_port_channel_config(portChannelName): + (processStdout, _) = getCmdOutput(["teamdctl", portChannelName, "state", "dump"]) + return json.loads(processStdout) + + +def craft_lacp_packet(portChannelConfig, portName): + portConfig = portChannelConfig["ports"][portName] + actorConfig = portConfig["runner"]["actor_lacpdu_info"] + partnerConfig = portConfig["runner"]["partner_lacpdu_info"] + l2 = Ether(dst=SLOW_PROTOCOL_MAC_ADDRESS, src=portConfig["ifinfo"]["dev_addr"], type=LACP_ETHERTYPE) + l3 = scapy.contrib.lacp.SlowProtocol(subtype=0x01) + l4 = scapy.contrib.lacp.LACP() + l4.version = 0x1 + l4.actor_system_priority = actorConfig["system_priority"] + l4.actor_system = actorConfig["system"] + l4.actor_key = actorConfig["key"] + l4.actor_port_priority = actorConfig["port_priority"] + l4.actor_port_number = actorConfig["port"] + l4.actor_state = actorConfig["state"] + l4.partner_system_priority = partnerConfig["system_priority"] + l4.partner_system = partnerConfig["system"] + l4.partner_key = partnerConfig["key"] + l4.partner_port_priority = partnerConfig["port_priority"] + l4.partner_port_number = partnerConfig["port"] + l4.partner_state = partnerConfig["state"] + packet = l2 / l3 / l4 + return packet def get_lacpdu_per_lag_member(): appDB = ConfigDBConnector() appDB.db_connect('APPL_DB') appDB_lag_info = appDB.get_keys('LAG_MEMBER_TABLE') - configDB = ConfigDBConnector() - configDB.db_connect('CONFIG_DB') - device_mac = configDB.get(configDB.CONFIG_DB, "DEVICE_METADATA|localhost", "mac") - hwsku = configDB.get(configDB.CONFIG_DB, "DEVICE_METADATA|localhost", "hwsku") active_lag_members = list() lag_member_to_packet = dict() - sniffer_threads = list() for lag_entry in appDB_lag_info: lag_name = str(lag_entry[0]) oper_status = appDB.get(appDB.APPL_DB,"LAG_TABLE:{}".format(lag_name), "oper_status") @@ -47,16 +77,10 @@ def get_lacpdu_per_lag_member(): # only apply the workaround for active lags lag_member = str(lag_entry[1]) active_lag_members.append(lag_member) - # use threading to capture lacpdus from several lag members simultaneously - sniffer_thread = threading.Thread(target=sniff_lacpdu, - args=(device_mac, lag_member, lag_member_to_packet)) - sniffer_thread.start() - sniffer_threads.append(sniffer_thread) - - # sniff for lacpdu should finish in <= 30s. sniff timeout is also set to 30s - for sniffer in sniffer_threads: - sniffer.join(timeout=30) - + # craft lacpdu packets for each lag member based on config + port_channel_config = get_port_channel_config(lag_name) + lag_member_to_packet[lag_member] = craft_lacp_packet(port_channel_config, lag_member) + return active_lag_members, lag_member_to_packet @@ -80,8 +104,8 @@ def main(): try: active_lag_members, lag_member_to_packet = get_lacpdu_per_lag_member() if len(active_lag_members) != len(lag_member_to_packet.keys()): - log_error("Failed to capture LACPDU packets for some lag members. " +\ - "Active lag members: {}. LACPDUs captured for: {}".format( + log_error("Failed to craft LACPDU packets for some lag members. " +\ + "Active lag members: {}. LACPDUs craft for: {}".format( active_lag_members, lag_member_to_packet.keys())) log_info("ready to send LACPDU packets via {}".format(lag_member_to_packet.keys())) From 7b60ffe5529970c0984be4310a41c03929266c91 Mon Sep 17 00:00:00 2001 From: Saikrishna Arcot Date: Sun, 15 Sep 2024 23:06:09 -0700 Subject: [PATCH 2/2] Fix pre-commit errors Signed-off-by: Saikrishna Arcot --- scripts/lag_keepalive.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/scripts/lag_keepalive.py b/scripts/lag_keepalive.py index 4855e60841..45a1f2ff06 100644 --- a/scripts/lag_keepalive.py +++ b/scripts/lag_keepalive.py @@ -2,16 +2,15 @@ from scapy.config import conf conf.ipv6_enabled = False -from scapy.fields import ByteField, ShortField, MACField, XStrFixedLenField, ConditionalField -from scapy.layers.l2 import Ether -from scapy.sendrecv import sendp, sniff -from scapy.packet import Packet, split_layers, bind_layers -import scapy.contrib.lacp -import subprocess -import json -from swsscommon.swsscommon import ConfigDBConnector -import time, traceback -import syslog +from scapy.layers.l2 import Ether # noqa: E402 +from scapy.sendrecv import sendp # noqa: E402 +import scapy.contrib.lacp # noqa: E402 +import subprocess # noqa: E402 +import json # noqa: E402 +import time # noqa: E402 +import traceback # noqa: E402 +import syslog # noqa: E402 +from swsscommon.swsscommon import ConfigDBConnector # noqa: E402 SYSLOG_ID = 'lag_keepalive' SLOW_PROTOCOL_MAC_ADDRESS = "01:80:c2:00:00:02" @@ -72,7 +71,7 @@ def get_lacpdu_per_lag_member(): lag_member_to_packet = dict() for lag_entry in appDB_lag_info: lag_name = str(lag_entry[0]) - oper_status = appDB.get(appDB.APPL_DB,"LAG_TABLE:{}".format(lag_name), "oper_status") + oper_status = appDB.get(appDB.APPL_DB, "LAG_TABLE:{}".format(lag_name), "oper_status") if oper_status == "up": # only apply the workaround for active lags lag_member = str(lag_entry[1]) @@ -104,9 +103,9 @@ def main(): try: active_lag_members, lag_member_to_packet = get_lacpdu_per_lag_member() if len(active_lag_members) != len(lag_member_to_packet.keys()): - log_error("Failed to craft LACPDU packets for some lag members. " +\ - "Active lag members: {}. LACPDUs craft for: {}".format( - active_lag_members, lag_member_to_packet.keys())) + log_error("Failed to craft LACPDU packets for some lag members. " + + "Active lag members: {}. LACPDUs craft for: {}".format( + active_lag_members, lag_member_to_packet.keys())) log_info("ready to send LACPDU packets via {}".format(lag_member_to_packet.keys())) except Exception: @@ -122,5 +121,6 @@ def main(): # start an infinite loop to keep sending lacpdus from lag member ports lag_keepalive(lag_member_to_packet) + if __name__ == "__main__": main()