From 690497fb1071f9959ed39362027f2c5e79124061 Mon Sep 17 00:00:00 2001 From: GalaxyGorilla Date: Tue, 19 May 2020 11:52:04 +0000 Subject: [PATCH] isisd: Fast RIB recovery from BFD recognized link failures Unfortunately, as the topotests show, a fast recovery after failure detection by BFD is currently not possible because of the following issue: There are multiple scheduling mechanisms within isisd to prevent overload situations. For our problem, these two are important: * the scheduler for regenerating ISIS Link State PDUs * the scheduler for managing consecutive SPF calculations In fact, both schedulers are coupled: the first one triggers the second one, and is itself triggered by isis_adj_state_change (which in turn is triggered by a BFD 'down' message). The re-calculation of SPF paths finally triggers updates in zebra for the RIB. Both schedulers work as a throttle, i.e. they allow the regeneration of Link State PDUs or a re-calculation of SPF paths only once within a certain time interval, which is configurable (and, by default, different for the two!). This means that a request can pass the first scheduler but might still be 'stuck' at the second one for a while. Or a request can be 'stuck' at the first scheduler even though the second one is ready. This also explains the 'random' behaviour one can observe when testing, since a 'fast' recovery is only possible if both schedulers are ready to process the request. Note that the solution in this commit is 'thread safe' in the sense that both schedulers use the same thread master, so that each of the introduced flags is used exactly once (and one after another) for a 'fast' execution. Further, there are some confusing comments and logs which I partially removed. They seem to be no longer valid due to changes in thread management (or they were never valid in the first place). 
Signed-off-by: GalaxyGorilla --- isisd/isis_adjacency.c | 6 +++--- isisd/isis_bfd.c | 2 ++ isisd/isis_lsp.c | 42 +++++++++++++++++++++++++++++------------- isisd/isis_spf.c | 15 +++++++++++++-- isisd/isisd.c | 3 +++ isisd/isisd.h | 3 +++ 6 files changed, 53 insertions(+), 18 deletions(-) diff --git a/isisd/isis_adjacency.c b/isisd/isis_adjacency.c index 66de11e6f..425627485 100644 --- a/isisd/isis_adjacency.c +++ b/isisd/isis_adjacency.c @@ -171,14 +171,14 @@ void isis_delete_adj(void *arg) return; THREAD_TIMER_OFF(adj->t_expire); - if (adj->adj_state != ISIS_ADJ_DOWN) { + if (adj->adj_state != ISIS_ADJ_DOWN) adj->adj_state = ISIS_ADJ_DOWN; - hook_call(isis_adj_state_change_hook, adj); - } /* remove from SPF trees */ spftree_area_adj_del(adj->circuit->area, adj); + hook_call(isis_adj_state_change_hook, adj); + XFREE(MTYPE_ISIS_ADJACENCY_INFO, adj->area_addresses); XFREE(MTYPE_ISIS_ADJACENCY_INFO, adj->ipv4_addresses); XFREE(MTYPE_ISIS_ADJACENCY_INFO, adj->ipv6_addresses); diff --git a/isisd/isis_bfd.c b/isisd/isis_bfd.c index 9f8424fcd..69c971ee2 100644 --- a/isisd/isis_bfd.c +++ b/isisd/isis_bfd.c @@ -138,6 +138,8 @@ static void bfd_adj_event(struct isis_adjacency *adj, struct prefix *dst, return; } + adj->circuit->area->bfd_signalled_down = true; + isis_adj_state_change(&adj, ISIS_ADJ_DOWN, "bfd session went down"); } diff --git a/isisd/isis_lsp.c b/isisd/isis_lsp.c index 5cf652b29..63303e230 100644 --- a/isisd/isis_lsp.c +++ b/isisd/isis_lsp.c @@ -1376,7 +1376,13 @@ static int lsp_refresh(struct thread *thread) if ((area->is_type & level) == 0) return ISIS_ERROR; - if (monotime_since(&area->last_lsp_refresh_event[level - 1], NULL) < 100000L) { + /* + * Throttle regeneration of LSPs (but not when BFD signalled a 'down' + * message) + */ + if (monotime_since(&area->last_lsp_refresh_event[level - 1], NULL) + < 100000L + && !(area->bfd_force_spf_refresh)) { sched_debug("ISIS (%s): Still unstable, postpone LSP L%d refresh", area->area_tag, level); 
_lsp_regenerate_schedule(area, level, 0, false, @@ -1429,7 +1435,12 @@ int _lsp_regenerate_schedule(struct isis_area *area, int level, "ISIS (%s): Checking whether L%d needs to be scheduled", area->area_tag, lvl); - if (area->lsp_regenerate_pending[lvl - 1]) { + if (area->lsp_regenerate_pending[lvl - 1] + && !(area->bfd_signalled_down)) { + /* + * Note: in case of a BFD 'down' message the refresh is + * scheduled once again just to be sure + */ struct timeval remain = thread_timer_remain( area->t_lsp_refresh[lvl - 1]); sched_debug( @@ -1457,7 +1468,8 @@ int _lsp_regenerate_schedule(struct isis_area *area, int level, (long long)now); THREAD_TIMER_OFF(area->t_lsp_refresh[lvl - 1]); diff = now - lsp->last_generated; - if (diff < area->lsp_gen_interval[lvl - 1]) { + if (diff < area->lsp_gen_interval[lvl - 1] + && !(area->bfd_signalled_down)) { timeout = 1000 * (area->lsp_gen_interval[lvl - 1] - diff); sched_debug( @@ -1465,17 +1477,21 @@ int _lsp_regenerate_schedule(struct isis_area *area, int level, area->area_tag, timeout); } else { /* - * lsps are not regenerated if lsp_regenerate function - * is called - * directly. However if the lsp_regenerate call is - * queued for - * later execution it works. + * Schedule LSP refresh ASAP */ - timeout = 100; - sched_debug( - "ISIS (%s): Last generation was more than lsp_gen_interval ago." - " Scheduling for execution in %ld ms.", - area->area_tag, timeout); + timeout = 0; + + if (area->bfd_signalled_down) { + sched_debug( + "ISIS (%s): Scheduling immediately due to BDF 'down' message.", + area->area_tag); + area->bfd_signalled_down = false; + area->bfd_force_spf_refresh = true; + } else { + sched_debug( + "ISIS (%s): Last generation was more than lsp_gen_interval ago. 
Scheduling for execution now.", + area->area_tag); + } } area->lsp_regenerate_pending[lvl - 1] = 1; diff --git a/isisd/isis_spf.c b/isisd/isis_spf.c index daf97859f..e43153372 100644 --- a/isisd/isis_spf.c +++ b/isisd/isis_spf.c @@ -1275,9 +1275,20 @@ int _isis_spf_schedule(struct isis_area *area, int level, /* wait configured min_spf_interval before doing the SPF */ long timer; - if (diff >= area->min_spf_interval[level - 1]) { - /* Last run is more than min interval ago, schedule immediate run */ + if (diff >= area->min_spf_interval[level - 1] + || area->bfd_force_spf_refresh) { + /* + * Last run is more than min interval ago or BFD signalled a + * 'down' message, schedule immediate run + */ timer = 0; + + if (area->bfd_force_spf_refresh) { + zlog_debug( + "ISIS-Spf (%s) L%d SPF scheduled immediately due to BFD 'down' message", + area->area_tag, level); + area->bfd_force_spf_refresh = false; + } } else { timer = area->min_spf_interval[level - 1] - diff; } diff --git a/isisd/isisd.c b/isisd/isisd.c index 286542c8d..53e48bd1c 100644 --- a/isisd/isisd.c +++ b/isisd/isisd.c @@ -203,6 +203,9 @@ struct isis_area *isis_area_create(const char *area_tag) area->lsp_refresh_arg[1].area = area; area->lsp_refresh_arg[1].level = IS_LEVEL_2; + area->bfd_signalled_down = false; + area->bfd_force_spf_refresh = false; + QOBJ_REG(area, isis_area); diff --git a/isisd/isisd.h b/isisd/isisd.h index def2027aa..57d9691cc 100644 --- a/isisd/isisd.h +++ b/isisd/isisd.h @@ -127,6 +127,9 @@ struct isis_area { */ int lsp_regenerate_pending[ISIS_LEVELS]; + bool bfd_signalled_down; + bool bfd_force_spf_refresh; + struct fabricd *fabricd; /* -- 2.39.5