Skip to content

Commit

Permalink
[sanity_check][bgp] Add default route check in sanity for single asic (
Browse files Browse the repository at this point in the history
…sonic-net#16235)

What is the motivation for this PR?
BGP routes would be setup during add-topo https://github.com/sonic-net/sonic-mgmt/blob/master/ansible/roles/vm_set/tasks/add_topo.yml#L276.
But there are some scenarios that route in DUT has been messed up, but bgp sessions are all up, sanity would treat it as healthy and wouldn't take action to recover it.

Loopbackv4 address has been replaced, it would cause all kernel routes from bgp miss
In some test cases announce or withdraw routes from ptf but fail to recover (i.e. test_stress_routes)
Healthy status:

admin@sonic:~$ ip route show default
default nhid 282 proto bgp src 10.1.0.32 metric 20 
        nexthop via 10.0.0.57 dev PortChannel101 weight 1 
        nexthop via 10.0.0.59 dev PortChannel103 weight 1 
        nexthop via 10.0.0.61 dev PortChannel105 weight 1 
        nexthop via 10.0.0.63 dev PortChannel106 weight 1 
admin@sonic:~$ show ip bgp sum

IPv4 Unicast Summary:
BGP router identifier 10.1.0.32, local AS number 65100 vrf-id 0
BGP table version 2890
RIB entries 2893, using 648032 bytes of memory
Peers 6, using 4451856 KiB of memory
Peer groups 4, using 256 bytes of memory


Neighbhor      V     AS    MsgRcvd    MsgSent    TblVer    InQ    OutQ  Up/Down      State/PfxRcd  NeighborName
-----------  ---  -----  ---------  ---------  --------  -----  ------  ---------  --------------  --------------
10.0.0.57      4  65200        763        764         0      0       0  11:46:17             1439  ARISTA01M1
10.0.0.59      4  65200        763        765         0      0       0  11:46:17             1439  ARISTA02M1
10.0.0.61      4  65200        763        765         0      0       0  11:46:17             1439  ARISTA03M1
10.0.0.63      4  65200        763        765         0      0       0  11:46:17             1439  ARISTA04M1
10.0.0.65      4  64001        712        761         0      0       0  11:46:15                2  ARISTA01MX
10.0.0.67      4  64002        712        761         0      0       0  11:46:15                2  ARISTA02MX

Total number of neighbors 6
Issue status, no default route, but show ip bgp sum looks good

admin@sonic:~$ ip route show default
admin@sonic:~$ show ip bgp sum

IPv4 Unicast Summary:
BGP router identifier 10.1.0.32, local AS number 65100 vrf-id 0
BGP table version 2892
RIB entries 2893, using 648032 bytes of memory
Peers 6, using 4451856 KiB of memory
Peer groups 4, using 256 bytes of memory


Neighbhor      V     AS    MsgRcvd    MsgSent    TblVer    InQ    OutQ  Up/Down      State/PfxRcd  NeighborName
-----------  ---  -----  ---------  ---------  --------  -----  ------  ---------  --------------  --------------
10.0.0.57      4  65200        764        767         0      0       0  11:47:14             1439  ARISTA01M1
10.0.0.59      4  65200        764        768         0      0       0  11:47:14             1439  ARISTA02M1
10.0.0.61      4  65200        764        768         0      0       0  11:47:14             1439  ARISTA03M1
10.0.0.63      4  65200        764        768         0      0       0  11:47:14             1439  ARISTA04M1
10.0.0.65      4  64001        713        764         0      0       0  11:47:12                2  ARISTA01MX
10.0.0.67      4  64002        713        764         0      0       0  11:47:12                2  ARISTA02MX

Total number of neighbors 6
How did you do it?
Add default routes check in sanity check, and re-announce routes if issue happen

How did you verify/test it?
Run sanity check
  • Loading branch information
yaqiangz authored and mssonicbld committed Dec 27, 2024
1 parent 0fa7639 commit 66c9308
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 12 deletions.
43 changes: 32 additions & 11 deletions tests/common/plugins/sanity_check/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,12 @@ def _check_bgp_on_dut(*args, **kwargs):
dut = kwargs['node']
results = kwargs['results']

def _check_default_route(version, dut):
# Return True if successfully get default route
res = dut.shell("ip {} route show default".format("" if version == 4 else "-6"),
module_ignore_errors=True)
return not res["rc"] and len(res["stdout"].strip()) != 0

def _check_bgp_status_helper():
asic_check_results = []
bgp_facts = dut.bgp_facts(asic_index='all')
Expand All @@ -184,23 +190,34 @@ def _check_bgp_status_helper():
if a_asic_neighbors is not None and len(a_asic_neighbors) > 0:
down_neighbors = [k for k, v in list(a_asic_neighbors.items())
if v['state'] != 'established']
asic_key = 'bgp' if dut.facts['num_asic'] == 1 else 'bgp' + str(asic_index)
if down_neighbors:
if dut.facts['num_asic'] == 1:
check_result['bgp'] = {'down_neighbors': down_neighbors}
else:
check_result['bgp' + str(asic_index)] = {'down_neighbors': down_neighbors}
check_result[asic_key] = {'down_neighbors': down_neighbors}
a_asic_result = True
else:
a_asic_result = False
if dut.facts['num_asic'] == 1:
if 'bgp' in check_result:
check_result['bgp'].pop('down_neighbors', None)
else:
if 'bgp' + str(asic_index) in check_result:
check_result['bgp' + str(asic_index)].pop('down_neighbors', None)
if asic_key in check_result:
check_result[asic_key].pop('down_neighbors', None)
else:
a_asic_result = True

# Add default route check for default route missing issue in below scenario:
# 1) Loopbackv4 ip address replace, it would cause all bgp routes missing
# 2) Announce or withdraw routes in some test cases and doesn't recover it
# Chassis and multi_asic is not supported for now
if not dut.is_multi_asic and not (dut.get_facts().get("modular_chassis") is True or
dut.get_facts().get("modular_chassis") == "True"):
if not _check_default_route(4, dut):
if asic_key not in check_result:
check_result[asic_key] = {}
check_result[asic_key]["no_v4_default_route"] = True
a_asic_result = True
if not _check_default_route(6, dut):
if asic_key not in check_result:
check_result[asic_key] = {}
check_result[asic_key]["no_v6_default_route"] = True
a_asic_result = True

asic_check_results.append(a_asic_result)

if any(asic_check_results):
Expand Down Expand Up @@ -238,8 +255,12 @@ def _check_bgp_status_helper():
if 'down_neighbors' in check_result[a_result]:
logger.info('BGP neighbors down: %s on bgp instance %s on dut %s' % (
check_result[a_result]['down_neighbors'], a_result, dut.hostname))
if "no_v4_default_route" in check_result[a_result]:
logger.info('Deafult v4 route for {} {} is missing'.format(dut.hostname, a_result))
if "no_v6_default_route" in check_result[a_result]:
logger.info('Deafult v6 route for {} {} is missing'.format(dut.hostname, a_result))
else:
logger.info('No BGP neighbors are down on %s' % dut.hostname)
logger.info('No BGP neighbors are down or default route missing on %s' % dut.hostname)

mgFacts = dut.get_extended_minigraph_facts(tbinfo)
if dut.num_asics() == 1 and tbinfo['topo']['type'] != 't2' and \
Expand Down
17 changes: 16 additions & 1 deletion tests/common/plugins/sanity_check/recover.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,12 @@ def _recover_with_command(dut, cmd, wait_time):
wait(wait_time, msg="Wait {} seconds for system to be stable.".format(wait_time))


def re_announce_routes(localhost, topo_name, ptf_ip):
localhost.announce_routes(topo_name=topo_name, ptf_ip=ptf_ip, action="withdraw", path="../ansible/")
localhost.announce_routes(topo_name=topo_name, ptf_ip=ptf_ip, action="announce", path="../ansible/")
return None


def adaptive_recover(dut, localhost, fanouthosts, nbrhosts, tbinfo, check_results, wait_time):
outstanding_action = None
for result in check_results:
Expand All @@ -154,7 +160,16 @@ def adaptive_recover(dut, localhost, fanouthosts, nbrhosts, tbinfo, check_result
action = _recover_interfaces(dut, fanouthosts, result, wait_time)
elif result['check_item'] == 'services':
action = _recover_services(dut, result)
elif result['check_item'] == 'bgp' or result['check_item'] == "neighbor_macsec_empty":
elif result['check_item'] == 'bgp':
# If there is only default route missing issue, only need to re-announce routes to recover
if ("no_v4_default_route" in result['bgp'] and len(result['bgp']) == 1 or
"no_v6_default_route" in result['bgp'] and len(result['bgp']) == 1 or
("no_v4_default_route" in result['bgp'] and "no_v6_default_route" in result['bgp'] and
len(result['bgp']) == 2)):
action = re_announce_routes(localhost, tbinfo["topo"]["name"], tbinfo["ptf_ip"])
else:
action = neighbor_vm_restore(dut, nbrhosts, tbinfo, result)
elif result['check_item'] == "neighbor_macsec_empty":
action = neighbor_vm_restore(dut, nbrhosts, tbinfo, result)
elif result['check_item'] in ['processes', 'mux_simulator']:
action = 'config_reload'
Expand Down

0 comments on commit 66c9308

Please sign in to comment.