Skip to content

Commit

Permalink
[202305] Enhanced route_check.py for multi_asic platforms (sonic-net#…
Browse files Browse the repository at this point in the history
…3112)

What I did
Cherry pick of PR sonic-net#3077

MSFT ADO: 25416673

How I did it
Enhanced the route_check.py script to take additional optional parameter(--n/namespace). Without this parameter, the check will be run on all asics in multi-asic platforms.
Different connections to DBs are modified accordingly to connect to relevant ns dbs.
Result will be encapsulated under different namespace. For single asic, results will be displayed under Default-Namespace("").

testData and the testfiles are enhanced accordinly.

How to verify it
Verified that all pytest UT cases are passing.
Verified the route_check_test.sh script on single asic and multi-asic platforms.
Verified Monit routecheck outputs by simulating a failure scenario on both single asic and multi-asic platforms.
output from Monit Check:
Single Asic:
xxx/usr/local/bin# monit status routecheck
Monit 5.20.0 uptime: 1d 20h 32m

Program 'routeCheck'
status Status failed
monitoring status Monitored
monitoring mode active
on reboot start
last exit value 255
last output Failure results: {{
"": {
"missed_ROUTE_TABLE_routes": [
"20c0:d9b8:99:80::/64"
]
}
}}
Failed. Look at reported mismatches above
add: {
"": []
}
del: {
"": []
}
data collected Tue, 12 Dec 2023 20:30:11

'''
Multi Asic:
'''
/bin# monit status routecheck
Monit 5.20.0 uptime: 1d 23h 51m

Program 'routeCheck'
status Status failed
monitoring status Monitored
monitoring mode active
on reboot start
last exit value 255
last output Failure results: {{
"asic0": {
"missed_ROUTE_TABLE_routes": [
"1.0.0.0/16"
]
},
"asic1": {
"missed_ROUTE_TABLE_routes": [
"1.0.0.0/16"
]
},
"asic2": {
"missed_ROUTE_TABLE_routes": [
"1.0.0.0/16"
]
}
}}
Failed. Look at reported mismatches above
add: {
"asic0": [],
"asic1": [],
"asic2": []
}
del: {
"asic0": [],
"asic1": [],
"asic2": []
}
data collected Tue, 12 Dec 2023 23:54:23
  • Loading branch information
deepak-singhal0408 authored Jan 29, 2024
1 parent 83a548d commit c5e30e3
Show file tree
Hide file tree
Showing 4 changed files with 831 additions and 483 deletions.
168 changes: 98 additions & 70 deletions scripts/route_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,12 @@
import time
import signal
import traceback
from ipaddress import ip_network

from ipaddress import ip_network
from swsscommon import swsscommon
from utilities_common import chassis
from sonic_py_common import multi_asic
from utilities_common.general import load_db_config

APPL_DB_NAME = 'APPL_DB'
ASIC_DB_NAME = 'ASIC_DB'
Expand All @@ -72,6 +74,8 @@

PRINT_MSG_LEN_MAX = 1000

REDIS_TIMEOUT_MSECS = 0

class Level(Enum):
ERR = 'ERR'
INFO = 'INFO'
Expand Down Expand Up @@ -272,12 +276,12 @@ def is_vrf(k):
return k.startswith("Vrf")


def get_routes():
def get_appdb_routes(namespace):
"""
helper to read route table from APPL-DB.
:return list of sorted routes with prefix ensured
"""
db = swsscommon.DBConnector(APPL_DB_NAME, 0)
db = swsscommon.DBConnector(APPL_DB_NAME, REDIS_TIMEOUT_MSECS, True, namespace)
print_message(syslog.LOG_DEBUG, "APPL DB connected for routes")
tbl = swsscommon.Table(db, 'ROUTE_TABLE')
keys = tbl.getKeys()
Expand All @@ -294,15 +298,15 @@ def get_routes():
return sorted(valid_rt)


def get_route_entries():
def get_asicdb_routes(namespace):
"""
helper to read present route entries from ASIC-DB and
as well initiate selector for ASIC-DB:ASIC-state updates.
:return (selector, subscriber, <list of sorted routes>)
"""
db = swsscommon.DBConnector(ASIC_DB_NAME, 0)
db = swsscommon.DBConnector(ASIC_DB_NAME, REDIS_TIMEOUT_MSECS, True, namespace)
subs = swsscommon.SubscriberStateTable(db, ASIC_TABLE_NAME)
print_message(syslog.LOG_DEBUG, "ASIC DB connected")
print_message(syslog.LOG_DEBUG, "ASIC DB {} connected".format(namespace))

rt = []
while True:
Expand All @@ -320,12 +324,12 @@ def get_route_entries():
return (selector, subs, sorted(rt))


def get_interfaces():
def get_interfaces(namespace):
"""
helper to read interface table from APPL-DB.
:return sorted list of IP addresses with added prefix
"""
db = swsscommon.DBConnector(APPL_DB_NAME, 0)
db = swsscommon.DBConnector(APPL_DB_NAME, REDIS_TIMEOUT_MSECS, True, namespace)
print_message(syslog.LOG_DEBUG, "APPL DB connected for interfaces")
tbl = swsscommon.Table(db, 'INTF_TABLE')
keys = tbl.getKeys()
Expand All @@ -345,20 +349,20 @@ def get_interfaces():
return sorted(intf)


def filter_out_local_interfaces(keys):
def filter_out_local_interfaces(namespace, keys):
"""
helper to filter out local interfaces
:param keys: APPL-DB:ROUTE_TABLE Routes to check.
:return keys filtered out of local
"""
rt = []
local_if_lst = {'eth0', 'docker0'}
local_if_lst = {'eth0', 'eth1', 'docker0'} #eth1 is added to skip route installed in AAPL_DB on packet-chassis
local_if_lo = [r'tun0', r'lo', r'Loopback\d+']

chassis_local_intfs = chassis.get_chassis_local_interfaces()
local_if_lst.update(set(chassis_local_intfs))

db = swsscommon.DBConnector(APPL_DB_NAME, 0)
db = swsscommon.DBConnector(APPL_DB_NAME, REDIS_TIMEOUT_MSECS, True, namespace)
tbl = swsscommon.Table(db, 'ROUTE_TABLE')

for k in keys:
Expand All @@ -378,20 +382,20 @@ def filter_out_local_interfaces(keys):
return rt


def filter_out_voq_neigh_routes(keys):
def filter_out_voq_neigh_routes(namespace, keys):
"""
helper to filter out voq neigh routes. These are the
routes statically added for the voq neighbors. We skip
writing route entries in asic db for these. We filter
out reporting error on all the host routes written on
inband interface prefixed with "Ethernte-IB"
:param keys: APPL-DB:ROUTE_TABLE Routes to check.
:param namespace: Asic namespace, keys: APPL-DB:ROUTE_TABLE Routes to check.
:return keys filtered out for voq neigh routes
"""
rt = []
local_if_re = [r'Ethernet-IB\d+']

db = swsscommon.DBConnector(APPL_DB_NAME, 0)
db = swsscommon.DBConnector(APPL_DB_NAME, REDIS_TIMEOUT_MSECS, True, namespace)
tbl = swsscommon.Table(db, 'ROUTE_TABLE')

for k in keys:
Expand Down Expand Up @@ -423,13 +427,13 @@ def filter_out_default_routes(lst):
return upd


def filter_out_vnet_routes(routes):
def filter_out_vnet_routes(namespace, routes):
"""
Helper to filter out VNET routes
:param routes: list of routes to filter
:return filtered list of routes.
"""
db = swsscommon.DBConnector('APPL_DB', 0)
db = swsscommon.DBConnector('APPL_DB', REDIS_TIMEOUT_MSECS, True, namespace)

vnet_route_table = swsscommon.Table(db, 'VNET_ROUTE_TABLE')
vnet_route_tunnel_table = swsscommon.Table(db, 'VNET_ROUTE_TUNNEL_TABLE')
Expand Down Expand Up @@ -459,14 +463,14 @@ def is_dualtor(config_db):
return subtype.lower() == 'dualtor'


def filter_out_standalone_tunnel_routes(routes):
config_db = swsscommon.ConfigDBConnector()
config_db.connect()
def filter_out_standalone_tunnel_routes(namespace, routes):

config_db = multi_asic.connect_config_db_for_ns(namespace)

if not is_dualtor(config_db):
return routes

app_db = swsscommon.DBConnector('APPL_DB', 0)
app_db = swsscommon.DBConnector('APPL_DB', REDIS_TIMEOUT_MSECS, True, namespace)
neigh_table = swsscommon.Table(app_db, 'NEIGH_TABLE')
neigh_keys = neigh_table.getKeys()
standalone_tunnel_route_ips = []
Expand Down Expand Up @@ -510,7 +514,7 @@ def get_soc_ips(config_db):
return soc_ips


def filter_out_soc_ip_routes(routes):
def filter_out_soc_ip_routes(namespace, routes):
"""
Ignore ASIC only routes for SOC IPs
Expand All @@ -520,8 +524,7 @@ def filter_out_soc_ip_routes(routes):
will use the kernel routing table), but still provide connectivity to any external
traffic in case of a link issue (since this traffic will be forwarded by the ASIC).
"""
config_db = swsscommon.ConfigDBConnector()
config_db.connect()
config_db = multi_asic.connect_config_db_for_ns(namespace)

if not is_dualtor(config_db):
return routes
Expand All @@ -539,9 +542,9 @@ def filter_out_soc_ip_routes(routes):
return updated_routes


def get_vlan_neighbors():
def get_vlan_neighbors(namespace):
"""Return a list of VLAN neighbors."""
db = swsscommon.DBConnector(APPL_DB_NAME, 0)
db = swsscommon.DBConnector(APPL_DB_NAME, REDIS_TIMEOUT_MSECS, True, namespace)
print_message(syslog.LOG_DEBUG, "APPL DB connected for neighbors")
tbl = swsscommon.Table(db, 'NEIGH_TABLE')
neigh_entries = tbl.getKeys()
Expand All @@ -557,7 +560,7 @@ def get_vlan_neighbors():
return valid_neighs


def filter_out_vlan_neigh_route_miss(rt_appl_miss, rt_asic_miss):
def filter_out_vlan_neigh_route_miss(namespace, rt_appl_miss, rt_asic_miss):
"""Ignore any route miss for vlan neighbor IPs."""

def _filter_out_neigh_route(routes, neighs):
Expand All @@ -570,12 +573,10 @@ def _filter_out_neigh_route(routes, neighs):
updated_routes.append(route)
return updated_routes, ignored_routes

config_db = swsscommon.ConfigDBConnector()
config_db.connect()
config_db = multi_asic.connect_config_db_for_ns(namespace)

print_message(syslog.LOG_DEBUG, "Ignore vlan neighbor route miss")
if is_dualtor(config_db):
vlan_neighs = set(get_vlan_neighbors())
vlan_neighs = set(get_vlan_neighbors(namespace))
rt_appl_miss, ignored_rt_appl_miss = _filter_out_neigh_route(rt_appl_miss, vlan_neighs)
print_message(syslog.LOG_DEBUG, "Ignored appl route miss:", json.dumps(ignored_rt_appl_miss, indent=4))
rt_asic_miss, ignored_rt_asic_miss = _filter_out_neigh_route(rt_asic_miss, vlan_neighs)
Expand All @@ -584,7 +585,7 @@ def _filter_out_neigh_route(routes, neighs):
return rt_appl_miss, rt_asic_miss


def check_routes():
def check_routes(namespace):
"""
The heart of this script which runs the checks.
Read APPL-DB & ASIC-DB, the relevant tables for route checking.
Expand All @@ -600,61 +601,77 @@ def check_routes():
:return (0, None) on sucess, else (-1, results) where results holds
the unjustifiable entries.
"""
intf_appl_miss = []
rt_appl_miss = []
rt_asic_miss = []
namespace_list = []
if namespace is not multi_asic.DEFAULT_NAMESPACE and namespace in multi_asic.get_namespace_list():
namespace_list.append(namespace)
else:
namespace_list = multi_asic.get_namespace_list()
print_message(syslog.LOG_INFO, "Checking routes for namespaces: ", namespace_list)

results = {}
adds = []
deletes = []
adds = {}
deletes = {}
for namespace in namespace_list:
intf_appl_miss = []
rt_appl_miss = []
rt_asic_miss = []
adds[namespace] = []
deletes[namespace] = []

selector, subs, rt_asic = get_asicdb_routes(namespace)

selector, subs, rt_asic = get_route_entries()
rt_appl = get_appdb_routes(namespace)
intf_appl = get_interfaces(namespace)

rt_appl = get_routes()
intf_appl = get_interfaces()
# Diff APPL-DB routes & ASIC-DB routes
rt_appl_miss, rt_asic_miss = diff_sorted_lists(rt_appl, rt_asic)

# Diff APPL-DB routes & ASIC-DB routes
rt_appl_miss, rt_asic_miss = diff_sorted_lists(rt_appl, rt_asic)
# Check missed ASIC routes against APPL-DB INTF_TABLE
_, rt_asic_miss = diff_sorted_lists(intf_appl, rt_asic_miss)
rt_asic_miss = filter_out_default_routes(rt_asic_miss)
rt_asic_miss = filter_out_vnet_routes(namespace, rt_asic_miss)
rt_asic_miss = filter_out_standalone_tunnel_routes(namespace, rt_asic_miss)
rt_asic_miss = filter_out_soc_ip_routes(namespace, rt_asic_miss)

# Check missed ASIC routes against APPL-DB INTF_TABLE
_, rt_asic_miss = diff_sorted_lists(intf_appl, rt_asic_miss)
rt_asic_miss = filter_out_default_routes(rt_asic_miss)
rt_asic_miss = filter_out_vnet_routes(rt_asic_miss)
rt_asic_miss = filter_out_standalone_tunnel_routes(rt_asic_miss)
rt_asic_miss = filter_out_soc_ip_routes(rt_asic_miss)

# Check APPL-DB INTF_TABLE with ASIC table route entries
intf_appl_miss, _ = diff_sorted_lists(intf_appl, rt_asic)
# Check APPL-DB INTF_TABLE with ASIC table route entries
intf_appl_miss, _ = diff_sorted_lists(intf_appl, rt_asic)

if rt_appl_miss:
rt_appl_miss = filter_out_local_interfaces(rt_appl_miss)
if rt_appl_miss:
rt_appl_miss = filter_out_local_interfaces(namespace, rt_appl_miss)

if rt_appl_miss:
rt_appl_miss = filter_out_voq_neigh_routes(rt_appl_miss)
if rt_appl_miss:
rt_appl_miss = filter_out_voq_neigh_routes(namespace, rt_appl_miss)

# NOTE: On dualtor environment, ignore any route miss for the
# neighbors learned from the vlan subnet.
if rt_appl_miss or rt_asic_miss:
rt_appl_miss, rt_asic_miss = filter_out_vlan_neigh_route_miss(rt_appl_miss, rt_asic_miss)
# NOTE: On dualtor environment, ignore any route miss for the
# neighbors learned from the vlan subnet.
if rt_appl_miss or rt_asic_miss:
rt_appl_miss, rt_asic_miss = filter_out_vlan_neigh_route_miss(namespace, rt_appl_miss, rt_asic_miss)

if rt_appl_miss or rt_asic_miss:
# Look for subscribe updates for a second
adds, deletes = get_subscribe_updates(selector, subs)
if rt_appl_miss or rt_asic_miss:
# Look for subscribe updates for a second
adds[namespace], deletes[namespace] = get_subscribe_updates(selector, subs)

# Drop all those for which SET received
rt_appl_miss, _ = diff_sorted_lists(rt_appl_miss, adds)
rt_appl_miss, _ = diff_sorted_lists(rt_appl_miss, adds[namespace])

# Drop all those for which DEL received
rt_asic_miss, _ = diff_sorted_lists(rt_asic_miss, deletes)
rt_asic_miss, _ = diff_sorted_lists(rt_asic_miss, deletes[namespace])

if rt_appl_miss:
results["missed_ROUTE_TABLE_routes"] = rt_appl_miss
if rt_appl_miss:
if namespace not in results:
results[namespace] = {}
results[namespace]["missed_ROUTE_TABLE_routes"] = rt_appl_miss

if intf_appl_miss:
results["missed_INTF_TABLE_entries"] = intf_appl_miss
if intf_appl_miss:
if namespace not in results:
results[namespace] = {}
results[namespace]["missed_INTF_TABLE_entries"] = intf_appl_miss

if rt_asic_miss:
results["Unaccounted_ROUTE_ENTRY_TABLE_entries"] = rt_asic_miss
if rt_asic_miss:
if namespace not in results:
results[namespace] = {}
results[namespace]["Unaccounted_ROUTE_ENTRY_TABLE_entries"] = rt_asic_miss

if results:
print_message(syslog.LOG_WARNING, "Failure results: {", json.dumps(results, indent=4), "}")
Expand All @@ -666,7 +683,6 @@ def check_routes():
print_message(syslog.LOG_INFO, "All good!")
return 0, None


def main():
"""
main entry point, which mainly parses the args and call check_routes
Expand All @@ -679,8 +695,18 @@ def main():
parser.add_argument('-m', "--mode", type=Level, choices=list(Level), default='ERR')
parser.add_argument("-i", "--interval", type=int, default=0, help="Scan interval in seconds")
parser.add_argument("-s", "--log_to_syslog", action="store_true", default=True, help="Write message to syslog")
parser.add_argument('-n','--namespace', default=multi_asic.DEFAULT_NAMESPACE, help='Verify routes for this specific namespace')
args = parser.parse_args()

namespace = args.namespace
if namespace is not multi_asic.DEFAULT_NAMESPACE and not multi_asic.is_multi_asic():
print_message(syslog.LOG_ERR, "Namespace option is not valid for a single-ASIC device")
return -1, None

if namespace is not multi_asic.DEFAULT_NAMESPACE and namespace not in multi_asic.get_namespace_list():
print_message(syslog.LOG_ERR, "Namespace option is not valid. Choose one of {}".format(multi_asic.get_namespace_list()))
return -1, None

set_level(args.mode, args.log_to_syslog)

if args.interval:
Expand All @@ -694,10 +720,12 @@ def main():
interval = 1

signal.signal(signal.SIGALRM, handler)
load_db_config()

while True:
signal.alarm(TIMEOUT_SECONDS)
ret, res= check_routes()
ret, res= check_routes(namespace)
print_message(syslog.LOG_DEBUG, "ret={}, res={}".format(ret, res))
signal.alarm(0)

if interval:
Expand Down
Loading

0 comments on commit c5e30e3

Please sign in to comment.