Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize lag_keepalive by crafting the LACPDU packet ourselves #3170

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions scripts/fast-reboot
Original file line number Diff line number Diff line change
Expand Up @@ -758,8 +758,8 @@ trap '' EXIT HUP INT QUIT TERM KILL ABRT ALRM
# this is a non-blocking call, and the process will die in 300s
debug "Starting lag_keepalive to send LACPDUs ..."
timeout 300 python ${LAG_KEEPALIVE_SCRIPT} &
# give the lag_keepalive script a chance to get ready (30s) and collect one lacpdu before going down (30s)
sleep 60
# give the lag_keepalive script a chance to send some LACPDUs
sleep 10

if [ -x ${LOG_SSD_HEALTH} ]; then
debug "Collecting logs to check ssd health before ${REBOOT_TYPE}..."
Expand Down
80 changes: 52 additions & 28 deletions scripts/lag_keepalive.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,19 @@

from scapy.config import conf
conf.ipv6_enabled = False
from scapy.all import sendp, sniff
from swsscommon.swsscommon import ConfigDBConnector
import time, threading, traceback
import syslog
from scapy.layers.l2 import Ether # noqa: E402
from scapy.sendrecv import sendp # noqa: E402
import scapy.contrib.lacp # noqa: E402
import subprocess # noqa: E402
import json # noqa: E402
import time # noqa: E402
import traceback # noqa: E402
import syslog # noqa: E402
from swsscommon.swsscommon import ConfigDBConnector # noqa: E402

SYSLOG_ID = 'lag_keepalive'
SLOW_PROTOCOL_MAC_ADDRESS = "01:80:c2:00:00:02"
LACP_ETHERTYPE = 0x8809


def log_info(msg):
Expand All @@ -22,41 +29,57 @@ def log_error(msg):
syslog.closelog()


def sniff_lacpdu(device_mac, lag_member, lag_member_to_packet):
sniffed_packet = sniff(iface=lag_member,
filter="ether proto 0x8809 and ether src {}".format(device_mac),
count=1, timeout=30)
lag_member_to_packet[lag_member] = sniffed_packet
def getCmdOutput(cmd):
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
return proc.communicate()[0], proc.returncode


def get_port_channel_config(portChannelName):
(processStdout, _) = getCmdOutput(["teamdctl", portChannelName, "state", "dump"])
return json.loads(processStdout)


def craft_lacp_packet(portChannelConfig, portName):
portConfig = portChannelConfig["ports"][portName]
actorConfig = portConfig["runner"]["actor_lacpdu_info"]
partnerConfig = portConfig["runner"]["partner_lacpdu_info"]
l2 = Ether(dst=SLOW_PROTOCOL_MAC_ADDRESS, src=portConfig["ifinfo"]["dev_addr"], type=LACP_ETHERTYPE)
l3 = scapy.contrib.lacp.SlowProtocol(subtype=0x01)
l4 = scapy.contrib.lacp.LACP()
l4.version = 0x1
l4.actor_system_priority = actorConfig["system_priority"]
l4.actor_system = actorConfig["system"]
l4.actor_key = actorConfig["key"]
l4.actor_port_priority = actorConfig["port_priority"]
l4.actor_port_number = actorConfig["port"]
l4.actor_state = actorConfig["state"]
l4.partner_system_priority = partnerConfig["system_priority"]
l4.partner_system = partnerConfig["system"]
l4.partner_key = partnerConfig["key"]
l4.partner_port_priority = partnerConfig["port_priority"]
l4.partner_port_number = partnerConfig["port"]
l4.partner_state = partnerConfig["state"]
packet = l2 / l3 / l4
return packet


def get_lacpdu_per_lag_member():
appDB = ConfigDBConnector()
appDB.db_connect('APPL_DB')
appDB_lag_info = appDB.get_keys('LAG_MEMBER_TABLE')
configDB = ConfigDBConnector()
configDB.db_connect('CONFIG_DB')
device_mac = configDB.get(configDB.CONFIG_DB, "DEVICE_METADATA|localhost", "mac")
hwsku = configDB.get(configDB.CONFIG_DB, "DEVICE_METADATA|localhost", "hwsku")
active_lag_members = list()
lag_member_to_packet = dict()
sniffer_threads = list()
for lag_entry in appDB_lag_info:
lag_name = str(lag_entry[0])
oper_status = appDB.get(appDB.APPL_DB,"LAG_TABLE:{}".format(lag_name), "oper_status")
oper_status = appDB.get(appDB.APPL_DB, "LAG_TABLE:{}".format(lag_name), "oper_status")
if oper_status == "up":
# only apply the workaround for active lags
lag_member = str(lag_entry[1])
active_lag_members.append(lag_member)
# use threading to capture lacpdus from several lag members simultaneously
sniffer_thread = threading.Thread(target=sniff_lacpdu,
args=(device_mac, lag_member, lag_member_to_packet))
sniffer_thread.start()
sniffer_threads.append(sniffer_thread)

# sniff for lacpdu should finish in <= 30s. sniff timeout is also set to 30s
for sniffer in sniffer_threads:
sniffer.join(timeout=30)

# craft lacpdu packets for each lag member based on config
port_channel_config = get_port_channel_config(lag_name)
lag_member_to_packet[lag_member] = craft_lacp_packet(port_channel_config, lag_member)

return active_lag_members, lag_member_to_packet


Expand All @@ -80,9 +103,9 @@ def main():
try:
active_lag_members, lag_member_to_packet = get_lacpdu_per_lag_member()
if len(active_lag_members) != len(lag_member_to_packet.keys()):
log_error("Failed to capture LACPDU packets for some lag members. " +\
"Active lag members: {}. LACPDUs captured for: {}".format(
active_lag_members, lag_member_to_packet.keys()))
log_error("Failed to craft LACPDU packets for some lag members. " +
"Active lag members: {}. LACPDUs craft for: {}".format(
active_lag_members, lag_member_to_packet.keys()))

log_info("ready to send LACPDU packets via {}".format(lag_member_to_packet.keys()))
except Exception:
Expand All @@ -98,5 +121,6 @@ def main():
# start an infinite loop to keep sending lacpdus from lag member ports
lag_keepalive(lag_member_to_packet)


if __name__ == "__main__":
main()
Loading