]> git.proxmox.com Git - mirror_frr.git/commitdiff
lib: Figure out if we are being starved for cpu
author Donald Sharp <sharpd@nvidia.com>
Wed, 3 Feb 2021 14:13:59 +0000 (09:13 -0500)
committer Donald Sharp <sharpd@nvidia.com>
Thu, 20 Jan 2022 16:56:27 +0000 (11:56 -0500)
If a thread timer should have popped CPU_CONSUMED_CHECK
seconds in the past and we are only handling it now, consider
the thread starved and log a warning about it.

Signed-off-by: Donald Sharp <sharpd@nvidia.com>
lib/lib_errors.c
lib/lib_errors.h
lib/thread.c

index a139b9a14ca6c36f3d7f9b7145b19408819ecc01..acc9a05c33528a40cf32f3e56be602cef79c4e2e 100644 (file)
@@ -56,6 +56,12 @@ static struct log_ref ferr_lib_warn[] = {
                .description = "The Event subsystem has detected a slow process, this typically indicates that FRR is having trouble completing work in a timely manner.  This can be either a misconfiguration, bug or some combination thereof.  In this case total WALL time was over 5 seconds.  Which indicates that FRR might be having trouble being scheduled or some system call is delaying",
                .suggestion = "Gather log data and open an Issue",
        },
+       {
+               .code = EC_LIB_STARVE_THREAD,
+               .title = "The Event subsystem has detected a thread starvation issue",
+               .description = "The event subsystem has detected a thread starvation issue.  This typically indicates that the system FRR is running on is heavily loaded and this load might be impacting FRR's ability to handle events in a timely fashion",
+               .suggestion = "Gather log data and open an Issue",
+       },
        {
                .code = EC_LIB_NO_THREAD,
                .title = "The Event subsystem has detected an internal FD problem",
index 9f0f58d20b1842a11fc83edc27952c99922cfd3e..64ac6c1cebad85790427fc77fe27825456271825 100644 (file)
@@ -46,6 +46,7 @@ enum lib_log_refs {
        EC_LIB_LINUX_NS,
        EC_LIB_SLOW_THREAD_CPU,
        EC_LIB_SLOW_THREAD_WALL,
+       EC_LIB_STARVE_THREAD,
        EC_LIB_NO_THREAD,
        EC_LIB_RMAP_RECURSION_LIMIT,
        EC_LIB_BACKUP_CONFIG,
index 77e34f48f342731f36561d3598779b9cf99e363c..73e0e4887c32c6c64130d177b58b5c37d83be66a 100644 (file)
@@ -1651,12 +1651,31 @@ static void thread_process_io(struct thread_master *m, unsigned int num)
 static unsigned int thread_process_timers(struct thread_master *m,
                                          struct timeval *timenow)
 {
+       struct timeval prev = *timenow;
+       bool displayed = false;
        struct thread *thread;
        unsigned int ready = 0;
 
        while ((thread = thread_timer_list_first(&m->timer))) {
                if (timercmp(timenow, &thread->u.sands, <))
                        break;
+               prev = thread->u.sands;
+               prev.tv_sec += 4;
+               /*
+                * If the timer would have popped 4 seconds in the
+                * past then we are in a situation where we are
+                * really getting behind on handling of events.
+                * Let's log it and do the right thing with it.
+                */
+               if (timercmp(timenow, &prev, >)) {
+                       if (!displayed)
+                               flog_warn(
+                                       EC_LIB_STARVE_THREAD,
+                                       "Thread Starvation: %pTHD was scheduled to pop greater than 4s ago",
+                                       thread);
+                       displayed = true;
+               }
+
                thread_timer_list_pop(&m->timer);
                thread->type = THREAD_READY;
                thread_list_add_tail(&m->ready, thread);