lib/thread.c

   1 /* Thread management routine
   2  * Copyright (C) 1998, 2000 Kunihiro Ishiguro <kunihiro@zebra.org>
   3  *
   4  * This file is part of GNU Zebra.
   5  *
   6  * GNU Zebra is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License as published by the
   8  * Free Software Foundation; either version 2, or (at your option) any
   9  * later version.
  10  *
  11  * GNU Zebra is distributed in the hope that it will be useful, but
  12  * WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License along
  17  * with this program; see the file COPYING; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 /* #define DEBUG */
  22
  23 #include <zebra.h>
  24 #include <sys/resource.h>
  25
  26 #include "thread.h"
  27 #include "memory.h"
  28 #include "frrcu.h"
  29 #include "log.h"
  30 #include "hash.h"
  31 #include "command.h"
  32 #include "sigevent.h"
  33 #include "network.h"
  34 #include "jhash.h"
  35 #include "frratomic.h"
  36 #include "frr_pthread.h"
  37 #include "lib_errors.h"
  38 #include "libfrr_trace.h"
  39 #include "libfrr.h"
  40
  41 DEFINE_MTYPE_STATIC(LIB, THREAD, "Thread");
  42 DEFINE_MTYPE_STATIC(LIB, THREAD_MASTER, "Thread master");
  43 DEFINE_MTYPE_STATIC(LIB, THREAD_POLL, "Thread Poll Info");
  44 DEFINE_MTYPE_STATIC(LIB, THREAD_STATS, "Thread stats");
  45
  46 DECLARE_LIST(thread_list, struct thread, threaditem);
  47
  48 struct cancel_req {
  49         int flags;
  50         struct thread *thread;
  51         void *eventobj;
  52         struct thread **threadref;
  53 };
  54
  55 /* Flags for task cancellation */
  56 #define THREAD_CANCEL_FLAG_READY     0x01
  57
  58 static int thread_timer_cmp(const struct thread *a, const struct thread *b)
  59 {
  60         if (a->u.sands.tv_sec < b->u.sands.tv_sec)
  61                 return -1;
  62         if (a->u.sands.tv_sec > b->u.sands.tv_sec)
  63                 return 1;
  64         if (a->u.sands.tv_usec < b->u.sands.tv_usec)
  65                 return -1;
  66         if (a->u.sands.tv_usec > b->u.sands.tv_usec)
  67                 return 1;
  68         return 0;
  69 }
  70
  71 DECLARE_HEAP(thread_timer_list, struct thread, timeritem, thread_timer_cmp);
  72
  73 #if defined(__APPLE__)
  74 #include <mach/mach.h>
  75 #include <mach/mach_time.h>
  76 #endif
  77
  78 #define AWAKEN(m)                                                              \
  79         do {                                                                   \
  80                 const unsigned char wakebyte = 0x01;                           \
  81                 write(m->io_pipe[1], &wakebyte, 1);                            \
  82         } while (0);
  83
  84 /* control variable for initializer */
  85 static pthread_once_t init_once = PTHREAD_ONCE_INIT;
  86 pthread_key_t thread_current;
  87
  88 static pthread_mutex_t masters_mtx = PTHREAD_MUTEX_INITIALIZER;
  89 static struct list *masters;
  90
  91 static void thread_free(struct thread_master *master, struct thread *thread);
  92
  93 #ifndef EXCLUDE_CPU_TIME
  94 #define EXCLUDE_CPU_TIME 0
  95 #endif
  96 #ifndef CONSUMED_TIME_CHECK
  97 #define CONSUMED_TIME_CHECK 0
  98 #endif
  99
 100 bool cputime_enabled = !EXCLUDE_CPU_TIME;
 101 unsigned long cputime_threshold = CONSUMED_TIME_CHECK;
 102 unsigned long walltime_threshold = CONSUMED_TIME_CHECK;
 103
 104 /* CLI start ---------------------------------------------------------------- */
 105 #ifndef VTYSH_EXTRACT_PL
 106 #include "lib/thread_clippy.c"
 107 #endif
 108
 109 static unsigned int cpu_record_hash_key(const struct cpu_thread_history *a)
 110 {
 111         int size = sizeof(a->func);
 112
 113         return jhash(&a->func, size, 0);
 114 }
 115
 116 static bool cpu_record_hash_cmp(const struct cpu_thread_history *a,
 117                                const struct cpu_thread_history *b)
 118 {
 119         return a->func == b->func;
 120 }
 121
 122 static void *cpu_record_hash_alloc(struct cpu_thread_history *a)
 123 {
 124         struct cpu_thread_history *new;
 125         new = XCALLOC(MTYPE_THREAD_STATS, sizeof(struct cpu_thread_history));
 126         new->func = a->func;
 127         new->funcname = a->funcname;
 128         return new;
 129 }
 130
 131 static void cpu_record_hash_free(void *a)
 132 {
 133         struct cpu_thread_history *hist = a;
 134
 135         XFREE(MTYPE_THREAD_STATS, hist);
 136 }
 137
 138 static void vty_out_cpu_thread_history(struct vty *vty,
 139                                        struct cpu_thread_history *a)
 140 {
 141         vty_out(vty,
 142                 "%5zu %10zu.%03zu %9zu %8zu %9zu %8zu %9zu %9zu %9zu %10zu",
 143                 a->total_active, a->cpu.total / 1000, a->cpu.total % 1000,
 144                 a->total_calls, (a->cpu.total / a->total_calls), a->cpu.max,
 145                 (a->real.total / a->total_calls), a->real.max,
 146                 a->total_cpu_warn, a->total_wall_warn, a->total_starv_warn);
 147         vty_out(vty, "  %c%c%c%c%c  %s\n",
 148                 a->types & (1 << THREAD_READ) ? 'R' : ' ',
 149                 a->types & (1 << THREAD_WRITE) ? 'W' : ' ',
 150                 a->types & (1 << THREAD_TIMER) ? 'T' : ' ',
 151                 a->types & (1 << THREAD_EVENT) ? 'E' : ' ',
 152                 a->types & (1 << THREAD_EXECUTE) ? 'X' : ' ', a->funcname);
 153 }
 154
 155 static void cpu_record_hash_print(struct hash_bucket *bucket, void *args[])
 156 {
 157         struct cpu_thread_history *totals = args[0];
 158         struct cpu_thread_history copy;
 159         struct vty *vty = args[1];
 160         uint8_t *filter = args[2];
 161
 162         struct cpu_thread_history *a = bucket->data;
 163
 164         copy.total_active =
 165                 atomic_load_explicit(&a->total_active, memory_order_seq_cst);
 166         copy.total_calls =
 167                 atomic_load_explicit(&a->total_calls, memory_order_seq_cst);
 168         copy.total_cpu_warn =
 169                 atomic_load_explicit(&a->total_cpu_warn, memory_order_seq_cst);
 170         copy.total_wall_warn =
 171                 atomic_load_explicit(&a->total_wall_warn, memory_order_seq_cst);
 172         copy.total_starv_warn = atomic_load_explicit(&a->total_starv_warn,
 173                                                      memory_order_seq_cst);
 174         copy.cpu.total =
 175                 atomic_load_explicit(&a->cpu.total, memory_order_seq_cst);
 176         copy.cpu.max = atomic_load_explicit(&a->cpu.max, memory_order_seq_cst);
 177         copy.real.total =
 178                 atomic_load_explicit(&a->real.total, memory_order_seq_cst);
 179         copy.real.max =
 180                 atomic_load_explicit(&a->real.max, memory_order_seq_cst);
 181         copy.types = atomic_load_explicit(&a->types, memory_order_seq_cst);
 182         copy.funcname = a->funcname;
 183
 184         if (!(copy.types & *filter))
 185                 return;
 186
 187         vty_out_cpu_thread_history(vty, &copy);
 188         totals->total_active += copy.total_active;
 189         totals->total_calls += copy.total_calls;
 190         totals->total_cpu_warn += copy.total_cpu_warn;
 191         totals->total_wall_warn += copy.total_wall_warn;
 192         totals->total_starv_warn += copy.total_starv_warn;
 193         totals->real.total += copy.real.total;
 194         if (totals->real.max < copy.real.max)
 195                 totals->real.max = copy.real.max;
 196         totals->cpu.total += copy.cpu.total;
 197         if (totals->cpu.max < copy.cpu.max)
 198                 totals->cpu.max = copy.cpu.max;
 199 }
 200
 201 static void cpu_record_print(struct vty *vty, uint8_t filter)
 202 {
 203         struct cpu_thread_history tmp;
 204         void *args[3] = {&tmp, vty, &filter};
 205         struct thread_master *m;
 206         struct listnode *ln;
 207
 208         if (!cputime_enabled)
 209                 vty_out(vty,
 210                         "\n"
 211                         "Collecting CPU time statistics is currently disabled.  Following statistics\n"
 212                         "will be zero or may display data from when collection was enabled.  Use the\n"
 213                         "  \"service cputime-stats\"  command to start collecting data.\n"
 214                         "\nCounters and wallclock times are always maintained and should be accurate.\n");
 215
 216         memset(&tmp, 0, sizeof(tmp));
 217         tmp.funcname = "TOTAL";
 218         tmp.types = filter;
 219
 220         frr_with_mutex(&masters_mtx) {
 221                 for (ALL_LIST_ELEMENTS_RO(masters, ln, m)) {
 222                         const char *name = m->name ? m->name : "main";
 223
 224                         char underline[strlen(name) + 1];
 225                         memset(underline, '-', sizeof(underline));
 226                         underline[sizeof(underline) - 1] = '\0';
 227
 228                         vty_out(vty, "\n");
 229                         vty_out(vty, "Showing statistics for pthread %s\n",
 230                                 name);
 231                         vty_out(vty, "-------------------------------%s\n",
 232                                 underline);
 233                         vty_out(vty, "%30s %18s %18s\n", "",
 234                                 "CPU (user+system):", "Real (wall-clock):");
 235                         vty_out(vty,
 236                                 "Active   Runtime(ms)   Invoked Avg uSec Max uSecs");
 237                         vty_out(vty, " Avg uSec Max uSecs");
 238                         vty_out(vty,
 239                                 "  CPU_Warn Wall_Warn Starv_Warn Type   Thread\n");
 240
 241                         if (m->cpu_record->count)
 242                                 hash_iterate(
 243                                         m->cpu_record,
 244                                         (void (*)(struct hash_bucket *,
 245                                                   void *))cpu_record_hash_print,
 246                                         args);
 247                         else
 248                                 vty_out(vty, "No data to display yet.\n");
 249
 250                         vty_out(vty, "\n");
 251                 }
 252         }
 253
 254         vty_out(vty, "\n");
 255         vty_out(vty, "Total thread statistics\n");
 256         vty_out(vty, "-------------------------\n");
 257         vty_out(vty, "%30s %18s %18s\n", "",
 258                 "CPU (user+system):", "Real (wall-clock):");
 259         vty_out(vty, "Active   Runtime(ms)   Invoked Avg uSec Max uSecs");
 260         vty_out(vty, " Avg uSec Max uSecs  CPU_Warn Wall_Warn");
 261         vty_out(vty, "  Type  Thread\n");
 262
 263         if (tmp.total_calls > 0)
 264                 vty_out_cpu_thread_history(vty, &tmp);
 265 }
 266
 267 static void cpu_record_hash_clear(struct hash_bucket *bucket, void *args[])
 268 {
 269         uint8_t *filter = args[0];
 270         struct hash *cpu_record = args[1];
 271
 272         struct cpu_thread_history *a = bucket->data;
 273
 274         if (!(a->types & *filter))
 275                 return;
 276
 277         hash_release(cpu_record, bucket->data);
 278 }
 279
 280 static void cpu_record_clear(uint8_t filter)
 281 {
 282         uint8_t *tmp = &filter;
 283         struct thread_master *m;
 284         struct listnode *ln;
 285
 286         frr_with_mutex(&masters_mtx) {
 287                 for (ALL_LIST_ELEMENTS_RO(masters, ln, m)) {
 288                         frr_with_mutex(&m->mtx) {
 289                                 void *args[2] = {tmp, m->cpu_record};
 290                                 hash_iterate(
 291                                         m->cpu_record,
 292                                         (void (*)(struct hash_bucket *,
 293                                                   void *))cpu_record_hash_clear,
 294                                         args);
 295                         }
 296                 }
 297         }
 298 }
 299
 300 static uint8_t parse_filter(const char *filterstr)
 301 {
 302         int i = 0;
 303         int filter = 0;
 304
 305         while (filterstr[i] != '\0') {
 306                 switch (filterstr[i]) {
 307                 case 'r':
 308                 case 'R':
 309                         filter |= (1 << THREAD_READ);
 310                         break;
 311                 case 'w':
 312                 case 'W':
 313                         filter |= (1 << THREAD_WRITE);
 314                         break;
 315                 case 't':
 316                 case 'T':
 317                         filter |= (1 << THREAD_TIMER);
 318                         break;
 319                 case 'e':
 320                 case 'E':
 321                         filter |= (1 << THREAD_EVENT);
 322                         break;
 323                 case 'x':
 324                 case 'X':
 325                         filter |= (1 << THREAD_EXECUTE);
 326                         break;
 327                 default:
 328                         break;
 329                 }
 330                 ++i;
 331         }
 332         return filter;
 333 }
 334
 335 DEFUN_NOSH (show_thread_cpu,
 336             show_thread_cpu_cmd,
 337             "show thread cpu [FILTER]",
 338             SHOW_STR
 339             "Thread information\n"
 340             "Thread CPU usage\n"
 341             "Display filter (rwtex)\n")
 342 {
 343         uint8_t filter = (uint8_t)-1U;
 344         int idx = 0;
 345
 346         if (argv_find(argv, argc, "FILTER", &idx)) {
 347                 filter = parse_filter(argv[idx]->arg);
 348                 if (!filter) {
 349                         vty_out(vty,
 350                                 "Invalid filter \"%s\" specified; must contain at leastone of 'RWTEXB'\n",
 351                                 argv[idx]->arg);
 352                         return CMD_WARNING;
 353                 }
 354         }
 355
 356         cpu_record_print(vty, filter);
 357         return CMD_SUCCESS;
 358 }
 359
 360 DEFPY (service_cputime_stats,
 361        service_cputime_stats_cmd,
 362        "[no] service cputime-stats",
 363        NO_STR
 364        "Set up miscellaneous service\n"
 365        "Collect CPU usage statistics\n")
 366 {
 367         cputime_enabled = !no;
 368         return CMD_SUCCESS;
 369 }
 370
 371 DEFPY (service_cputime_warning,
 372        service_cputime_warning_cmd,
 373        "[no] service cputime-warning (1-4294967295)",
 374        NO_STR
 375        "Set up miscellaneous service\n"
 376        "Warn for tasks exceeding CPU usage threshold\n"
 377        "Warning threshold in milliseconds\n")
 378 {
 379         if (no)
 380                 cputime_threshold = 0;
 381         else
 382                 cputime_threshold = cputime_warning * 1000;
 383         return CMD_SUCCESS;
 384 }
 385
 386 ALIAS (service_cputime_warning,
 387        no_service_cputime_warning_cmd,
 388        "no service cputime-warning",
 389        NO_STR
 390        "Set up miscellaneous service\n"
 391        "Warn for tasks exceeding CPU usage threshold\n")
 392
 393 DEFPY (service_walltime_warning,
 394        service_walltime_warning_cmd,
 395        "[no] service walltime-warning (1-4294967295)",
 396        NO_STR
 397        "Set up miscellaneous service\n"
 398        "Warn for tasks exceeding total wallclock threshold\n"
 399        "Warning threshold in milliseconds\n")
 400 {
 401         if (no)
 402                 walltime_threshold = 0;
 403         else
 404                 walltime_threshold = walltime_warning * 1000;
 405         return CMD_SUCCESS;
 406 }
 407
 408 ALIAS (service_walltime_warning,
 409        no_service_walltime_warning_cmd,
 410        "no service walltime-warning",
 411        NO_STR
 412        "Set up miscellaneous service\n"
 413        "Warn for tasks exceeding total wallclock threshold\n")
 414
 415 static void show_thread_poll_helper(struct vty *vty, struct thread_master *m)
 416 {
 417         const char *name = m->name ? m->name : "main";
 418         char underline[strlen(name) + 1];
 419         struct thread *thread;
 420         uint32_t i;
 421
 422         memset(underline, '-', sizeof(underline));
 423         underline[sizeof(underline) - 1] = '\0';
 424
 425         vty_out(vty, "\nShowing poll FD's for %s\n", name);
 426         vty_out(vty, "----------------------%s\n", underline);
 427         vty_out(vty, "Count: %u/%d\n", (uint32_t)m->handler.pfdcount,
 428                 m->fd_limit);
 429         for (i = 0; i < m->handler.pfdcount; i++) {
 430                 vty_out(vty, "\t%6d fd:%6d events:%2d revents:%2d\t\t", i,
 431                         m->handler.pfds[i].fd, m->handler.pfds[i].events,
 432                         m->handler.pfds[i].revents);
 433
 434                 if (m->handler.pfds[i].events & POLLIN) {
 435                         thread = m->read[m->handler.pfds[i].fd];
 436
 437                         if (!thread)
 438                                 vty_out(vty, "ERROR ");
 439                         else
 440                                 vty_out(vty, "%s ", thread->xref->funcname);
 441                 } else
 442                         vty_out(vty, " ");
 443
 444                 if (m->handler.pfds[i].events & POLLOUT) {
 445                         thread = m->write[m->handler.pfds[i].fd];
 446
 447                         if (!thread)
 448                                 vty_out(vty, "ERROR\n");
 449                         else
 450                                 vty_out(vty, "%s\n", thread->xref->funcname);
 451                 } else
 452                         vty_out(vty, "\n");
 453         }
 454 }
 455
 456 DEFUN_NOSH (show_thread_poll,
 457             show_thread_poll_cmd,
 458             "show thread poll",
 459             SHOW_STR
 460             "Thread information\n"
 461             "Show poll FD's and information\n")
 462 {
 463         struct listnode *node;
 464         struct thread_master *m;
 465
 466         frr_with_mutex(&masters_mtx) {
 467                 for (ALL_LIST_ELEMENTS_RO(masters, node, m)) {
 468                         show_thread_poll_helper(vty, m);
 469                 }
 470         }
 471
 472         return CMD_SUCCESS;
 473 }
 474
 475
 476 DEFUN (clear_thread_cpu,
 477        clear_thread_cpu_cmd,
 478        "clear thread cpu [FILTER]",
 479        "Clear stored data in all pthreads\n"
 480        "Thread information\n"
 481        "Thread CPU usage\n"
 482        "Display filter (rwtexb)\n")
 483 {
 484         uint8_t filter = (uint8_t)-1U;
 485         int idx = 0;
 486
 487         if (argv_find(argv, argc, "FILTER", &idx)) {
 488                 filter = parse_filter(argv[idx]->arg);
 489                 if (!filter) {
 490                         vty_out(vty,
 491                                 "Invalid filter \"%s\" specified; must contain at leastone of 'RWTEXB'\n",
 492                                 argv[idx]->arg);
 493                         return CMD_WARNING;
 494                 }
 495         }
 496
 497         cpu_record_clear(filter);
 498         return CMD_SUCCESS;
 499 }
 500
 501 static void show_thread_timers_helper(struct vty *vty, struct thread_master *m)
 502 {
 503         const char *name = m->name ? m->name : "main";
 504         char underline[strlen(name) + 1];
 505         struct thread *thread;
 506
 507         memset(underline, '-', sizeof(underline));
 508         underline[sizeof(underline) - 1] = '\0';
 509
 510         vty_out(vty, "\nShowing timers for %s\n", name);
 511         vty_out(vty, "-------------------%s\n", underline);
 512
 513         frr_each (thread_timer_list, &m->timer, thread) {
 514                 vty_out(vty, "  %-50s%pTH\n", thread->hist->funcname, thread);
 515         }
 516 }
 517
 518 DEFPY_NOSH (show_thread_timers,
 519             show_thread_timers_cmd,
 520             "show thread timers",
 521             SHOW_STR
 522             "Thread information\n"
 523             "Show all timers and how long they have in the system\n")
 524 {
 525         struct listnode *node;
 526         struct thread_master *m;
 527
 528         frr_with_mutex (&masters_mtx) {
 529                 for (ALL_LIST_ELEMENTS_RO(masters, node, m))
 530                         show_thread_timers_helper(vty, m);
 531         }
 532
 533         return CMD_SUCCESS;
 534 }
 535
 536 void thread_cmd_init(void)
 537 {
 538         install_element(VIEW_NODE, &show_thread_cpu_cmd);
 539         install_element(VIEW_NODE, &show_thread_poll_cmd);
 540         install_element(ENABLE_NODE, &clear_thread_cpu_cmd);
 541
 542         install_element(CONFIG_NODE, &service_cputime_stats_cmd);
 543         install_element(CONFIG_NODE, &service_cputime_warning_cmd);
 544         install_element(CONFIG_NODE, &no_service_cputime_warning_cmd);
 545         install_element(CONFIG_NODE, &service_walltime_warning_cmd);
 546         install_element(CONFIG_NODE, &no_service_walltime_warning_cmd);
 547
 548         install_element(VIEW_NODE, &show_thread_timers_cmd);
 549 }
 550 /* CLI end ------------------------------------------------------------------ */
 551
 552
 553 static void cancelreq_del(void *cr)
 554 {
 555         XFREE(MTYPE_TMP, cr);
 556 }
 557
 558 /* initializer, only ever called once */
 559 static void initializer(void)
 560 {
 561         pthread_key_create(&thread_current, NULL);
 562 }
 563
 564 struct thread_master *thread_master_create(const char *name)
 565 {
 566         struct thread_master *rv;
 567         struct rlimit limit;
 568
 569         pthread_once(&init_once, &initializer);
 570
 571         rv = XCALLOC(MTYPE_THREAD_MASTER, sizeof(struct thread_master));
 572
 573         /* Initialize master mutex */
 574         pthread_mutex_init(&rv->mtx, NULL);
 575         pthread_cond_init(&rv->cancel_cond, NULL);
 576
 577         /* Set name */
 578         name = name ? name : "default";
 579         rv->name = XSTRDUP(MTYPE_THREAD_MASTER, name);
 580
 581         /* Initialize I/O task data structures */
 582
 583         /* Use configured limit if present, ulimit otherwise. */
 584         rv->fd_limit = frr_get_fd_limit();
 585         if (rv->fd_limit == 0) {
 586                 getrlimit(RLIMIT_NOFILE, &limit);
 587                 rv->fd_limit = (int)limit.rlim_cur;
 588         }
 589
 590         rv->read = XCALLOC(MTYPE_THREAD_POLL,
 591                            sizeof(struct thread *) * rv->fd_limit);
 592
 593         rv->write = XCALLOC(MTYPE_THREAD_POLL,
 594                             sizeof(struct thread *) * rv->fd_limit);
 595
 596         char tmhashname[strlen(name) + 32];
 597         snprintf(tmhashname, sizeof(tmhashname), "%s - threadmaster event hash",
 598                  name);
 599         rv->cpu_record = hash_create_size(
 600                 8, (unsigned int (*)(const void *))cpu_record_hash_key,
 601                 (bool (*)(const void *, const void *))cpu_record_hash_cmp,
 602                 tmhashname);
 603
 604         thread_list_init(&rv->event);
 605         thread_list_init(&rv->ready);
 606         thread_list_init(&rv->unuse);
 607         thread_timer_list_init(&rv->timer);
 608
 609         /* Initialize thread_fetch() settings */
 610         rv->spin = true;
 611         rv->handle_signals = true;
 612
 613         /* Set pthread owner, should be updated by actual owner */
 614         rv->owner = pthread_self();
 615         rv->cancel_req = list_new();
 616         rv->cancel_req->del = cancelreq_del;
 617         rv->canceled = true;
 618
 619         /* Initialize pipe poker */
 620         pipe(rv->io_pipe);
 621         set_nonblocking(rv->io_pipe[0]);
 622         set_nonblocking(rv->io_pipe[1]);
 623
 624         /* Initialize data structures for poll() */
 625         rv->handler.pfdsize = rv->fd_limit;
 626         rv->handler.pfdcount = 0;
 627         rv->handler.pfds = XCALLOC(MTYPE_THREAD_MASTER,
 628                                    sizeof(struct pollfd) * rv->handler.pfdsize);
 629         rv->handler.copy = XCALLOC(MTYPE_THREAD_MASTER,
 630                                    sizeof(struct pollfd) * rv->handler.pfdsize);
 631
 632         /* add to list of threadmasters */
 633         frr_with_mutex(&masters_mtx) {
 634                 if (!masters)
 635                         masters = list_new();
 636
 637                 listnode_add(masters, rv);
 638         }
 639
 640         return rv;
 641 }
 642
 643 void thread_master_set_name(struct thread_master *master, const char *name)
 644 {
 645         frr_with_mutex(&master->mtx) {
 646                 XFREE(MTYPE_THREAD_MASTER, master->name);
 647                 master->name = XSTRDUP(MTYPE_THREAD_MASTER, name);
 648         }
 649 }
 650
 651 #define THREAD_UNUSED_DEPTH 10
 652
 653 /* Move thread to unuse list. */
 654 static void thread_add_unuse(struct thread_master *m, struct thread *thread)
 655 {
 656         pthread_mutex_t mtxc = thread->mtx;
 657
 658         assert(m != NULL && thread != NULL);
 659
 660         thread->hist->total_active--;
 661         memset(thread, 0, sizeof(struct thread));
 662         thread->type = THREAD_UNUSED;
 663
 664         /* Restore the thread mutex context. */
 665         thread->mtx = mtxc;
 666
 667         if (thread_list_count(&m->unuse) < THREAD_UNUSED_DEPTH) {
 668                 thread_list_add_tail(&m->unuse, thread);
 669                 return;
 670         }
 671
 672         thread_free(m, thread);
 673 }
 674
 675 /* Free all unused thread. */
 676 static void thread_list_free(struct thread_master *m,
 677                 struct thread_list_head *list)
 678 {
 679         struct thread *t;
 680
 681         while ((t = thread_list_pop(list)))
 682                 thread_free(m, t);
 683 }
 684
 685 static void thread_array_free(struct thread_master *m,
 686                               struct thread **thread_array)
 687 {
 688         struct thread *t;
 689         int index;
 690
 691         for (index = 0; index < m->fd_limit; ++index) {
 692                 t = thread_array[index];
 693                 if (t) {
 694                         thread_array[index] = NULL;
 695                         thread_free(m, t);
 696                 }
 697         }
 698         XFREE(MTYPE_THREAD_POLL, thread_array);
 699 }
 700
 701 /*
 702  * thread_master_free_unused
 703  *
 704  * As threads are finished with they are put on the
 705  * unuse list for later reuse.
 706  * If we are shutting down, Free up unused threads
 707  * So we can see if we forget to shut anything off
 708  */
 709 void thread_master_free_unused(struct thread_master *m)
 710 {
 711         frr_with_mutex(&m->mtx) {
 712                 struct thread *t;
 713                 while ((t = thread_list_pop(&m->unuse)))
 714                         thread_free(m, t);
 715         }
 716 }
 717
 718 /* Stop thread scheduler. */
 719 void thread_master_free(struct thread_master *m)
 720 {
 721         struct thread *t;
 722
 723         frr_with_mutex(&masters_mtx) {
 724                 listnode_delete(masters, m);
 725                 if (masters->count == 0) {
 726                         list_delete(&masters);
 727                 }
 728         }
 729
 730         thread_array_free(m, m->read);
 731         thread_array_free(m, m->write);
 732         while ((t = thread_timer_list_pop(&m->timer)))
 733                 thread_free(m, t);
 734         thread_list_free(m, &m->event);
 735         thread_list_free(m, &m->ready);
 736         thread_list_free(m, &m->unuse);
 737         pthread_mutex_destroy(&m->mtx);
 738         pthread_cond_destroy(&m->cancel_cond);
 739         close(m->io_pipe[0]);
 740         close(m->io_pipe[1]);
 741         list_delete(&m->cancel_req);
 742         m->cancel_req = NULL;
 743
 744         hash_clean(m->cpu_record, cpu_record_hash_free);
 745         hash_free(m->cpu_record);
 746         m->cpu_record = NULL;
 747
 748         XFREE(MTYPE_THREAD_MASTER, m->name);
 749         XFREE(MTYPE_THREAD_MASTER, m->handler.pfds);
 750         XFREE(MTYPE_THREAD_MASTER, m->handler.copy);
 751         XFREE(MTYPE_THREAD_MASTER, m);
 752 }
 753
 754 /* Return remain time in milliseconds. */
 755 unsigned long thread_timer_remain_msec(struct thread *thread)
 756 {
 757         int64_t remain;
 758
 759         frr_with_mutex(&thread->mtx) {
 760                 remain = monotime_until(&thread->u.sands, NULL) / 1000LL;
 761         }
 762
 763         return remain < 0 ? 0 : remain;
 764 }
 765
 766 /* Return remain time in seconds. */
 767 unsigned long thread_timer_remain_second(struct thread *thread)
 768 {
 769         return thread_timer_remain_msec(thread) / 1000LL;
 770 }
 771
 772 struct timeval thread_timer_remain(struct thread *thread)
 773 {
 774         struct timeval remain;
 775         frr_with_mutex(&thread->mtx) {
 776                 monotime_until(&thread->u.sands, &remain);
 777         }
 778         return remain;
 779 }
 780
 781 static int time_hhmmss(char *buf, int buf_size, long sec)
 782 {
 783         long hh;
 784         long mm;
 785         int wr;
 786
 787         assert(buf_size >= 8);
 788
 789         hh = sec / 3600;
 790         sec %= 3600;
 791         mm = sec / 60;
 792         sec %= 60;
 793
 794         wr = snprintf(buf, buf_size, "%02ld:%02ld:%02ld", hh, mm, sec);
 795
 796         return wr != 8;
 797 }
 798
 799 char *thread_timer_to_hhmmss(char *buf, int buf_size,
 800                 struct thread *t_timer)
 801 {
 802         if (t_timer) {
 803                 time_hhmmss(buf, buf_size,
 804                                 thread_timer_remain_second(t_timer));
 805         } else {
 806                 snprintf(buf, buf_size, "--:--:--");
 807         }
 808         return buf;
 809 }
 810
 811 /* Get new thread.  */
 812 static struct thread *thread_get(struct thread_master *m, uint8_t type,
 813                                  void (*func)(struct thread *), void *arg,
 814                                  const struct xref_threadsched *xref)
 815 {
 816         struct thread *thread = thread_list_pop(&m->unuse);
 817         struct cpu_thread_history tmp;
 818
 819         if (!thread) {
 820                 thread = XCALLOC(MTYPE_THREAD, sizeof(struct thread));
 821                 /* mutex only needs to be initialized at struct creation. */
 822                 pthread_mutex_init(&thread->mtx, NULL);
 823                 m->alloc++;
 824         }
 825
 826         thread->type = type;
 827         thread->add_type = type;
 828         thread->master = m;
 829         thread->arg = arg;
 830         thread->yield = THREAD_YIELD_TIME_SLOT; /* default */
 831         thread->ref = NULL;
 832         thread->ignore_timer_late = false;
 833
 834         /*
 835          * So if the passed in funcname is not what we have
 836          * stored that means the thread->hist needs to be
 837          * updated.  We keep the last one around in unused
 838          * under the assumption that we are probably
 839          * going to immediately allocate the same
 840          * type of thread.
 841          * This hopefully saves us some serious
 842          * hash_get lookups.
 843          */
 844         if ((thread->xref && thread->xref->funcname != xref->funcname)
 845             || thread->func != func) {
 846                 tmp.func = func;
 847                 tmp.funcname = xref->funcname;
 848                 thread->hist =
 849                         hash_get(m->cpu_record, &tmp,
 850                                  (void *(*)(void *))cpu_record_hash_alloc);
 851         }
 852         thread->hist->total_active++;
 853         thread->func = func;
 854         thread->xref = xref;
 855
 856         return thread;
 857 }
 858
 859 static void thread_free(struct thread_master *master, struct thread *thread)
 860 {
 861         /* Update statistics. */
 862         assert(master->alloc > 0);
 863         master->alloc--;
 864
 865         /* Free allocated resources. */
 866         pthread_mutex_destroy(&thread->mtx);
 867         XFREE(MTYPE_THREAD, thread);
 868 }
 869
 870 static int fd_poll(struct thread_master *m, const struct timeval *timer_wait,
 871                    bool *eintr_p)
 872 {
 873         sigset_t origsigs;
 874         unsigned char trash[64];
 875         nfds_t count = m->handler.copycount;
 876
 877         /*
 878          * If timer_wait is null here, that means poll() should block
 879          * indefinitely, unless the thread_master has overridden it by setting
 880          * ->selectpoll_timeout.
 881          *
 882          * If the value is positive, it specifies the maximum number of
 883          * milliseconds to wait. If the timeout is -1, it specifies that
 884          * we should never wait and always return immediately even if no
 885          * event is detected. If the value is zero, the behavior is default.
 886          */
 887         int timeout = -1;
 888
 889         /* number of file descriptors with events */
 890         int num;
 891
 892         if (timer_wait != NULL
 893             && m->selectpoll_timeout == 0) // use the default value
 894                 timeout = (timer_wait->tv_sec * 1000)
 895                           + (timer_wait->tv_usec / 1000);
 896         else if (m->selectpoll_timeout > 0) // use the user's timeout
 897                 timeout = m->selectpoll_timeout;
 898         else if (m->selectpoll_timeout
 899                  < 0) // effect a poll (return immediately)
 900                 timeout = 0;
 901
 902         zlog_tls_buffer_flush();
 903         rcu_read_unlock();
 904         rcu_assert_read_unlocked();
 905
 906         /* add poll pipe poker */
 907         assert(count + 1 < m->handler.pfdsize);
 908         m->handler.copy[count].fd = m->io_pipe[0];
 909         m->handler.copy[count].events = POLLIN;
 910         m->handler.copy[count].revents = 0x00;
 911
 912         /* We need to deal with a signal-handling race here: we
 913          * don't want to miss a crucial signal, such as SIGTERM or SIGINT,
 914          * that may arrive just before we enter poll(). We will block the
 915          * key signals, then check whether any have arrived - if so, we return
 916          * before calling poll(). If not, we'll re-enable the signals
 917          * in the ppoll() call.
 918          */
 919
 920         sigemptyset(&origsigs);
 921         if (m->handle_signals) {
 922                 /* Main pthread that handles the app signals */
 923                 if (frr_sigevent_check(&origsigs)) {
 924                         /* Signal to process - restore signal mask and return */
 925                         pthread_sigmask(SIG_SETMASK, &origsigs, NULL);
 926                         num = -1;
 927                         *eintr_p = true;
 928                         goto done;
 929                 }
 930         } else {
 931                 /* Don't make any changes for the non-main pthreads */
 932                 pthread_sigmask(SIG_SETMASK, NULL, &origsigs);
 933         }
 934
 935 #if defined(HAVE_PPOLL)
 936         struct timespec ts, *tsp;
 937
 938         if (timeout >= 0) {
 939                 ts.tv_sec = timeout / 1000;
 940                 ts.tv_nsec = (timeout % 1000) * 1000000;
 941                 tsp = &ts;
 942         } else
 943                 tsp = NULL;
 944
 945         num = ppoll(m->handler.copy, count + 1, tsp, &origsigs);
 946         pthread_sigmask(SIG_SETMASK, &origsigs, NULL);
 947 #else
 948         /* Not ideal - there is a race after we restore the signal mask */
 949         pthread_sigmask(SIG_SETMASK, &origsigs, NULL);
 950         num = poll(m->handler.copy, count + 1, timeout);
 951 #endif
 952
 953 done:
 954
 955         if (num < 0 && errno == EINTR)
 956                 *eintr_p = true;
 957
 958         if (num > 0 && m->handler.copy[count].revents != 0 && num--)
 959                 while (read(m->io_pipe[0], &trash, sizeof(trash)) > 0)
 960                         ;
 961
 962         rcu_read_lock();
 963
 964         return num;
 965 }
 966
 967 /* Add new read thread. */
 968 void _thread_add_read_write(const struct xref_threadsched *xref,
 969                             struct thread_master *m,
 970                             void (*func)(struct thread *), void *arg, int fd,
 971                             struct thread **t_ptr)
 972 {
 973         int dir = xref->thread_type;
 974         struct thread *thread = NULL;
 975         struct thread **thread_array;
 976
 977         if (dir == THREAD_READ)
 978                 frrtrace(9, frr_libfrr, schedule_read, m,
 979                          xref->funcname, xref->xref.file, xref->xref.line,
 980                          t_ptr, fd, 0, arg, 0);
 981         else
 982                 frrtrace(9, frr_libfrr, schedule_write, m,
 983                          xref->funcname, xref->xref.file, xref->xref.line,
 984                          t_ptr, fd, 0, arg, 0);
 985
 986         assert(fd >= 0);
 987         if (fd >= m->fd_limit)
 988                 assert(!"Number of FD's open is greater than FRR currently configured to handle, aborting");
 989
 990         frr_with_mutex(&m->mtx) {
 991                 if (t_ptr && *t_ptr)
 992                         // thread is already scheduled; don't reschedule
 993                         break;
 994
 995                 /* default to a new pollfd */
 996                 nfds_t queuepos = m->handler.pfdcount;
 997
 998                 if (dir == THREAD_READ)
 999                         thread_array = m->read;
1000                 else
1001                         thread_array = m->write;
1002
1003                 /* if we already have a pollfd for our file descriptor, find and
1004                  * use it */
1005                 for (nfds_t i = 0; i < m->handler.pfdcount; i++)
1006                         if (m->handler.pfds[i].fd == fd) {
1007                                 queuepos = i;
1008
1009 #ifdef DEV_BUILD
1010                                 /*
1011                                  * What happens if we have a thread already
1012                                  * created for this event?
1013                                  */
1014                                 if (thread_array[fd])
1015                                         assert(!"Thread already scheduled for file descriptor");
1016 #endif
1017                                 break;
1018                         }
1019
1020                 /* make sure we have room for this fd + pipe poker fd */
1021                 assert(queuepos + 1 < m->handler.pfdsize);
1022
1023                 thread = thread_get(m, dir, func, arg, xref);
1024
1025                 m->handler.pfds[queuepos].fd = fd;
1026                 m->handler.pfds[queuepos].events |=
1027                         (dir == THREAD_READ ? POLLIN : POLLOUT);
1028
1029                 if (queuepos == m->handler.pfdcount)
1030                         m->handler.pfdcount++;
1031
1032                 if (thread) {
1033                         frr_with_mutex(&thread->mtx) {
1034                                 thread->u.fd = fd;
1035                                 thread_array[thread->u.fd] = thread;
1036                         }
1037
1038                         if (t_ptr) {
1039                                 *t_ptr = thread;
1040                                 thread->ref = t_ptr;
1041                         }
1042                 }
1043
1044                 AWAKEN(m);
1045         }
1046 }
1047
1048 static void _thread_add_timer_timeval(const struct xref_threadsched *xref,
1049                                       struct thread_master *m,
1050                                       void (*func)(struct thread *), void *arg,
1051                                       struct timeval *time_relative,
1052                                       struct thread **t_ptr)
1053 {
1054         struct thread *thread;
1055         struct timeval t;
1056
1057         assert(m != NULL);
1058
1059         assert(time_relative);
1060
1061         frrtrace(9, frr_libfrr, schedule_timer, m,
1062                  xref->funcname, xref->xref.file, xref->xref.line,
1063                  t_ptr, 0, 0, arg, (long)time_relative->tv_sec);
1064
1065         /* Compute expiration/deadline time. */
1066         monotime(&t);
1067         timeradd(&t, time_relative, &t);
1068
1069         frr_with_mutex(&m->mtx) {
1070                 if (t_ptr && *t_ptr)
1071                         /* thread is already scheduled; don't reschedule */
1072                         return;
1073
1074                 thread = thread_get(m, THREAD_TIMER, func, arg, xref);
1075
1076                 frr_with_mutex(&thread->mtx) {
1077                         thread->u.sands = t;
1078                         thread_timer_list_add(&m->timer, thread);
1079                         if (t_ptr) {
1080                                 *t_ptr = thread;
1081                                 thread->ref = t_ptr;
1082                         }
1083                 }
1084
1085                 /* The timer list is sorted - if this new timer
1086                  * might change the time we'll wait for, give the pthread
1087                  * a chance to re-compute.
1088                  */
1089                 if (thread_timer_list_first(&m->timer) == thread)
1090                         AWAKEN(m);
1091         }
1092 #define ONEYEAR2SEC (60 * 60 * 24 * 365)
1093         if (time_relative->tv_sec > ONEYEAR2SEC)
1094                 flog_err(
1095                         EC_LIB_TIMER_TOO_LONG,
1096                         "Timer: %pTHD is created with an expiration that is greater than 1 year",
1097                         thread);
1098 }
1099
1100
1101 /* Add timer event thread. */
1102 void _thread_add_timer(const struct xref_threadsched *xref,
1103                        struct thread_master *m, void (*func)(struct thread *),
1104                        void *arg, long timer, struct thread **t_ptr)
1105 {
1106         struct timeval trel;
1107
1108         assert(m != NULL);
1109
1110         trel.tv_sec = timer;
1111         trel.tv_usec = 0;
1112
1113         _thread_add_timer_timeval(xref, m, func, arg, &trel, t_ptr);
1114 }
1115
1116 /* Add timer event thread with "millisecond" resolution */
1117 void _thread_add_timer_msec(const struct xref_threadsched *xref,
1118                             struct thread_master *m,
1119                             void (*func)(struct thread *), void *arg,
1120                             long timer, struct thread **t_ptr)
1121 {
1122         struct timeval trel;
1123
1124         assert(m != NULL);
1125
1126         trel.tv_sec = timer / 1000;
1127         trel.tv_usec = 1000 * (timer % 1000);
1128
1129         _thread_add_timer_timeval(xref, m, func, arg, &trel, t_ptr);
1130 }
1131
1132 /* Add timer event thread with "timeval" resolution */
1133 void _thread_add_timer_tv(const struct xref_threadsched *xref,
1134                           struct thread_master *m,
1135                           void (*func)(struct thread *), void *arg,
1136                           struct timeval *tv, struct thread **t_ptr)
1137 {
1138         _thread_add_timer_timeval(xref, m, func, arg, tv, t_ptr);
1139 }
1140
1141 /* Add simple event thread. */
1142 void _thread_add_event(const struct xref_threadsched *xref,
1143                        struct thread_master *m, void (*func)(struct thread *),
1144                        void *arg, int val, struct thread **t_ptr)
1145 {
1146         struct thread *thread = NULL;
1147
1148         frrtrace(9, frr_libfrr, schedule_event, m,
1149                  xref->funcname, xref->xref.file, xref->xref.line,
1150                  t_ptr, 0, val, arg, 0);
1151
1152         assert(m != NULL);
1153
1154         frr_with_mutex(&m->mtx) {
1155                 if (t_ptr && *t_ptr)
1156                         /* thread is already scheduled; don't reschedule */
1157                         break;
1158
1159                 thread = thread_get(m, THREAD_EVENT, func, arg, xref);
1160                 frr_with_mutex(&thread->mtx) {
1161                         thread->u.val = val;
1162                         thread_list_add_tail(&m->event, thread);
1163                 }
1164
1165                 if (t_ptr) {
1166                         *t_ptr = thread;
1167                         thread->ref = t_ptr;
1168                 }
1169
1170                 AWAKEN(m);
1171         }
1172 }
1173
1174 /* Thread cancellation ------------------------------------------------------ */
1175
1176 /**
1177  * NOT's out the .events field of pollfd corresponding to the given file
1178  * descriptor. The event to be NOT'd is passed in the 'state' parameter.
1179  *
1180  * This needs to happen for both copies of pollfd's. See 'thread_fetch'
1181  * implementation for details.
1182  *
1183  * @param master
1184  * @param fd
1185  * @param state the event to cancel. One or more (OR'd together) of the
1186  * following:
1187  *   - POLLIN
1188  *   - POLLOUT
1189  */
1190 static void thread_cancel_rw(struct thread_master *master, int fd, short state,
1191                              int idx_hint)
1192 {
1193         bool found = false;
1194
1195         /* find the index of corresponding pollfd */
1196         nfds_t i;
1197
1198         /* Cancel POLLHUP too just in case some bozo set it */
1199         state |= POLLHUP;
1200
1201         /* Some callers know the index of the pfd already */
1202         if (idx_hint >= 0) {
1203                 i = idx_hint;
1204                 found = true;
1205         } else {
1206                 /* Have to look for the fd in the pfd array */
1207                 for (i = 0; i < master->handler.pfdcount; i++)
1208                         if (master->handler.pfds[i].fd == fd) {
1209                                 found = true;
1210                                 break;
1211                         }
1212         }
1213
1214         if (!found) {
1215                 zlog_debug(
1216                         "[!] Received cancellation request for nonexistent rw job");
1217                 zlog_debug("[!] threadmaster: %s | fd: %d",
1218                            master->name ? master->name : "", fd);
1219                 return;
1220         }
1221
1222         /* NOT out event. */
1223         master->handler.pfds[i].events &= ~(state);
1224
1225         /* If all events are canceled, delete / resize the pollfd array. */
1226         if (master->handler.pfds[i].events == 0) {
1227                 memmove(master->handler.pfds + i, master->handler.pfds + i + 1,
1228                         (master->handler.pfdcount - i - 1)
1229                                 * sizeof(struct pollfd));
1230                 master->handler.pfdcount--;
1231                 master->handler.pfds[master->handler.pfdcount].fd = 0;
1232                 master->handler.pfds[master->handler.pfdcount].events = 0;
1233         }
1234
1235         /* If we have the same pollfd in the copy, perform the same operations,
1236          * otherwise return. */
1237         if (i >= master->handler.copycount)
1238                 return;
1239
1240         master->handler.copy[i].events &= ~(state);
1241
1242         if (master->handler.copy[i].events == 0) {
1243                 memmove(master->handler.copy + i, master->handler.copy + i + 1,
1244                         (master->handler.copycount - i - 1)
1245                                 * sizeof(struct pollfd));
1246                 master->handler.copycount--;
1247                 master->handler.copy[master->handler.copycount].fd = 0;
1248                 master->handler.copy[master->handler.copycount].events = 0;
1249         }
1250 }
1251
1252 /*
1253  * Process task cancellation given a task argument: iterate through the
1254  * various lists of tasks, looking for any that match the argument.
1255  */
1256 static void cancel_arg_helper(struct thread_master *master,
1257                               const struct cancel_req *cr)
1258 {
1259         struct thread *t;
1260         nfds_t i;
1261         int fd;
1262         struct pollfd *pfd;
1263
1264         /* We're only processing arg-based cancellations here. */
1265         if (cr->eventobj == NULL)
1266                 return;
1267
1268         /* First process the ready lists. */
1269         frr_each_safe(thread_list, &master->event, t) {
1270                 if (t->arg != cr->eventobj)
1271                         continue;
1272                 thread_list_del(&master->event, t);
1273                 if (t->ref)
1274                         *t->ref = NULL;
1275                 thread_add_unuse(master, t);
1276         }
1277
1278         frr_each_safe(thread_list, &master->ready, t) {
1279                 if (t->arg != cr->eventobj)
1280                         continue;
1281                 thread_list_del(&master->ready, t);
1282                 if (t->ref)
1283                         *t->ref = NULL;
1284                 thread_add_unuse(master, t);
1285         }
1286
1287         /* If requested, stop here and ignore io and timers */
1288         if (CHECK_FLAG(cr->flags, THREAD_CANCEL_FLAG_READY))
1289                 return;
1290
1291         /* Check the io tasks */
1292         for (i = 0; i < master->handler.pfdcount;) {
1293                 pfd = master->handler.pfds + i;
1294
1295                 if (pfd->events & POLLIN)
1296                         t = master->read[pfd->fd];
1297                 else
1298                         t = master->write[pfd->fd];
1299
1300                 if (t && t->arg == cr->eventobj) {
1301                         fd = pfd->fd;
1302
1303                         /* Found a match to cancel: clean up fd arrays */
1304                         thread_cancel_rw(master, pfd->fd, pfd->events, i);
1305
1306                         /* Clean up thread arrays */
1307                         master->read[fd] = NULL;
1308                         master->write[fd] = NULL;
1309
1310                         /* Clear caller's ref */
1311                         if (t->ref)
1312                                 *t->ref = NULL;
1313
1314                         thread_add_unuse(master, t);
1315
1316                         /* Don't increment 'i' since the cancellation will have
1317                          * removed the entry from the pfd array
1318                          */
1319                 } else
1320                         i++;
1321         }
1322
1323         /* Check the timer tasks */
1324         t = thread_timer_list_first(&master->timer);
1325         while (t) {
1326                 struct thread *t_next;
1327
1328                 t_next = thread_timer_list_next(&master->timer, t);
1329
1330                 if (t->arg == cr->eventobj) {
1331                         thread_timer_list_del(&master->timer, t);
1332                         if (t->ref)
1333                                 *t->ref = NULL;
1334                         thread_add_unuse(master, t);
1335                 }
1336
1337                 t = t_next;
1338         }
1339 }
1340
1341 /**
1342  * Process cancellation requests.
1343  *
1344  * This may only be run from the pthread which owns the thread_master.
1345  *
1346  * @param master the thread master to process
1347  * @REQUIRE master->mtx
1348  */
1349 static void do_thread_cancel(struct thread_master *master)
1350 {
1351         struct thread_list_head *list = NULL;
1352         struct thread **thread_array = NULL;
1353         struct thread *thread;
1354
1355         struct cancel_req *cr;
1356         struct listnode *ln;
1357         for (ALL_LIST_ELEMENTS_RO(master->cancel_req, ln, cr)) {
1358                 /*
1359                  * If this is an event object cancellation, search
1360                  * through task lists deleting any tasks which have the
1361                  * specified argument - use this handy helper function.
1362                  */
1363                 if (cr->eventobj) {
1364                         cancel_arg_helper(master, cr);
1365                         continue;
1366                 }
1367
1368                 /*
1369                  * The pointer varies depending on whether the cancellation
1370                  * request was made asynchronously or not. If it was, we
1371                  * need to check whether the thread even exists anymore
1372                  * before cancelling it.
1373                  */
1374                 thread = (cr->thread) ? cr->thread : *cr->threadref;
1375
1376                 if (!thread)
1377                         continue;
1378
1379                 /* Determine the appropriate queue to cancel the thread from */
1380                 switch (thread->type) {
1381                 case THREAD_READ:
1382                         thread_cancel_rw(master, thread->u.fd, POLLIN, -1);
1383                         thread_array = master->read;
1384                         break;
1385                 case THREAD_WRITE:
1386                         thread_cancel_rw(master, thread->u.fd, POLLOUT, -1);
1387                         thread_array = master->write;
1388                         break;
1389                 case THREAD_TIMER:
1390                         thread_timer_list_del(&master->timer, thread);
1391                         break;
1392                 case THREAD_EVENT:
1393                         list = &master->event;
1394                         break;
1395                 case THREAD_READY:
1396                         list = &master->ready;
1397                         break;
1398                 default:
1399                         continue;
1400                         break;
1401                 }
1402
1403                 if (list) {
1404                         thread_list_del(list, thread);
1405                 } else if (thread_array) {
1406                         thread_array[thread->u.fd] = NULL;
1407                 }
1408
1409                 if (thread->ref)
1410                         *thread->ref = NULL;
1411
1412                 thread_add_unuse(thread->master, thread);
1413         }
1414
1415         /* Delete and free all cancellation requests */
1416         if (master->cancel_req)
1417                 list_delete_all_node(master->cancel_req);
1418
1419         /* Wake up any threads which may be blocked in thread_cancel_async() */
1420         master->canceled = true;
1421         pthread_cond_broadcast(&master->cancel_cond);
1422 }
1423
1424 /*
1425  * Helper function used for multiple flavors of arg-based cancellation.
1426  */
1427 static void cancel_event_helper(struct thread_master *m, void *arg, int flags)
1428 {
1429         struct cancel_req *cr;
1430
1431         assert(m->owner == pthread_self());
1432
1433         /* Only worth anything if caller supplies an arg. */
1434         if (arg == NULL)
1435                 return;
1436
1437         cr = XCALLOC(MTYPE_TMP, sizeof(struct cancel_req));
1438
1439         cr->flags = flags;
1440
1441         frr_with_mutex(&m->mtx) {
1442                 cr->eventobj = arg;
1443                 listnode_add(m->cancel_req, cr);
1444                 do_thread_cancel(m);
1445         }
1446 }
1447
1448 /**
1449  * Cancel any events which have the specified argument.
1450  *
1451  * MT-Unsafe
1452  *
1453  * @param m the thread_master to cancel from
1454  * @param arg the argument passed when creating the event
1455  */
1456 void thread_cancel_event(struct thread_master *master, void *arg)
1457 {
1458         cancel_event_helper(master, arg, 0);
1459 }
1460
1461 /*
1462  * Cancel ready tasks with an arg matching 'arg'
1463  *
1464  * MT-Unsafe
1465  *
1466  * @param m the thread_master to cancel from
1467  * @param arg the argument passed when creating the event
1468  */
1469 void thread_cancel_event_ready(struct thread_master *m, void *arg)
1470 {
1471
1472         /* Only cancel ready/event tasks */
1473         cancel_event_helper(m, arg, THREAD_CANCEL_FLAG_READY);
1474 }
1475
1476 /**
1477  * Cancel a specific task.
1478  *
1479  * MT-Unsafe
1480  *
1481  * @param thread task to cancel
1482  */
1483 void thread_cancel(struct thread **thread)
1484 {
1485         struct thread_master *master;
1486
1487         if (thread == NULL || *thread == NULL)
1488                 return;
1489
1490         master = (*thread)->master;
1491
1492         frrtrace(9, frr_libfrr, thread_cancel, master,
1493                  (*thread)->xref->funcname, (*thread)->xref->xref.file,
1494                  (*thread)->xref->xref.line, NULL, (*thread)->u.fd,
1495                  (*thread)->u.val, (*thread)->arg, (*thread)->u.sands.tv_sec);
1496
1497         assert(master->owner == pthread_self());
1498
1499         frr_with_mutex(&master->mtx) {
1500                 struct cancel_req *cr =
1501                         XCALLOC(MTYPE_TMP, sizeof(struct cancel_req));
1502                 cr->thread = *thread;
1503                 listnode_add(master->cancel_req, cr);
1504                 do_thread_cancel(master);
1505         }
1506
1507         *thread = NULL;
1508 }
1509
1510 /**
1511  * Asynchronous cancellation.
1512  *
1513  * Called with either a struct thread ** or void * to an event argument,
1514  * this function posts the correct cancellation request and blocks until it is
1515  * serviced.
1516  *
1517  * If the thread is currently running, execution blocks until it completes.
1518  *
1519  * The last two parameters are mutually exclusive, i.e. if you pass one the
1520  * other must be NULL.
1521  *
1522  * When the cancellation procedure executes on the target thread_master, the
1523  * thread * provided is checked for nullity. If it is null, the thread is
1524  * assumed to no longer exist and the cancellation request is a no-op. Thus
1525  * users of this API must pass a back-reference when scheduling the original
1526  * task.
1527  *
1528  * MT-Safe
1529  *
1530  * @param master the thread master with the relevant event / task
1531  * @param thread pointer to thread to cancel
1532  * @param eventobj the event
1533  */
1534 void thread_cancel_async(struct thread_master *master, struct thread **thread,
1535                          void *eventobj)
1536 {
1537         assert(!(thread && eventobj) && (thread || eventobj));
1538
1539         if (thread && *thread)
1540                 frrtrace(9, frr_libfrr, thread_cancel_async, master,
1541                          (*thread)->xref->funcname, (*thread)->xref->xref.file,
1542                          (*thread)->xref->xref.line, NULL, (*thread)->u.fd,
1543                          (*thread)->u.val, (*thread)->arg,
1544                          (*thread)->u.sands.tv_sec);
1545         else
1546                 frrtrace(9, frr_libfrr, thread_cancel_async, master, NULL, NULL,
1547                          0, NULL, 0, 0, eventobj, 0);
1548
1549         assert(master->owner != pthread_self());
1550
1551         frr_with_mutex(&master->mtx) {
1552                 master->canceled = false;
1553
1554                 if (thread) {
1555                         struct cancel_req *cr =
1556                                 XCALLOC(MTYPE_TMP, sizeof(struct cancel_req));
1557                         cr->threadref = thread;
1558                         listnode_add(master->cancel_req, cr);
1559                 } else if (eventobj) {
1560                         struct cancel_req *cr =
1561                                 XCALLOC(MTYPE_TMP, sizeof(struct cancel_req));
1562                         cr->eventobj = eventobj;
1563                         listnode_add(master->cancel_req, cr);
1564                 }
1565                 AWAKEN(master);
1566
1567                 while (!master->canceled)
1568                         pthread_cond_wait(&master->cancel_cond, &master->mtx);
1569         }
1570
1571         if (thread)
1572                 *thread = NULL;
1573 }
1574 /* ------------------------------------------------------------------------- */
1575
1576 static struct timeval *thread_timer_wait(struct thread_timer_list_head *timers,
1577                                          struct timeval *timer_val)
1578 {
1579         if (!thread_timer_list_count(timers))
1580                 return NULL;
1581
1582         struct thread *next_timer = thread_timer_list_first(timers);
1583         monotime_until(&next_timer->u.sands, timer_val);
1584         return timer_val;
1585 }
1586
1587 static struct thread *thread_run(struct thread_master *m, struct thread *thread,
1588                                  struct thread *fetch)
1589 {
1590         *fetch = *thread;
1591         thread_add_unuse(m, thread);
1592         return fetch;
1593 }
1594
1595 static int thread_process_io_helper(struct thread_master *m,
1596                                     struct thread *thread, short state,
1597                                     short actual_state, int pos)
1598 {
1599         struct thread **thread_array;
1600
1601         /*
1602          * poll() clears the .events field, but the pollfd array we
1603          * pass to poll() is a copy of the one used to schedule threads.
1604          * We need to synchronize state between the two here by applying
1605          * the same changes poll() made on the copy of the "real" pollfd
1606          * array.
1607          *
1608          * This cleans up a possible infinite loop where we refuse
1609          * to respond to a poll event but poll is insistent that
1610          * we should.
1611          */
1612         m->handler.pfds[pos].events &= ~(state);
1613
1614         if (!thread) {
1615                 if ((actual_state & (POLLHUP|POLLIN)) != POLLHUP)
1616                         flog_err(EC_LIB_NO_THREAD,
1617                                  "Attempting to process an I/O event but for fd: %d(%d) no thread to handle this!",
1618                                  m->handler.pfds[pos].fd, actual_state);
1619                 return 0;
1620         }
1621
1622         if (thread->type == THREAD_READ)
1623                 thread_array = m->read;
1624         else
1625                 thread_array = m->write;
1626
1627         thread_array[thread->u.fd] = NULL;
1628         thread_list_add_tail(&m->ready, thread);
1629         thread->type = THREAD_READY;
1630
1631         return 1;
1632 }
1633
1634 /**
1635  * Process I/O events.
1636  *
1637  * Walks through file descriptor array looking for those pollfds whose .revents
1638  * field has something interesting. Deletes any invalid file descriptors.
1639  *
1640  * @param m the thread master
1641  * @param num the number of active file descriptors (return value of poll())
1642  */
1643 static void thread_process_io(struct thread_master *m, unsigned int num)
1644 {
1645         unsigned int ready = 0;
1646         struct pollfd *pfds = m->handler.copy;
1647
1648         for (nfds_t i = 0; i < m->handler.copycount && ready < num; ++i) {
1649                 /* no event for current fd? immediately continue */
1650                 if (pfds[i].revents == 0)
1651                         continue;
1652
1653                 ready++;
1654
1655                 /*
1656                  * Unless someone has called thread_cancel from another
1657                  * pthread, the only thing that could have changed in
1658                  * m->handler.pfds while we were asleep is the .events
1659                  * field in a given pollfd. Barring thread_cancel() that
1660                  * value should be a superset of the values we have in our
1661                  * copy, so there's no need to update it. Similarily,
1662                  * barring deletion, the fd should still be a valid index
1663                  * into the master's pfds.
1664                  *
1665                  * We are including POLLERR here to do a READ event
1666                  * this is because the read should fail and the
1667                  * read function should handle it appropriately
1668                  */
1669                 if (pfds[i].revents & (POLLIN | POLLHUP | POLLERR)) {
1670                         thread_process_io_helper(m, m->read[pfds[i].fd], POLLIN,
1671                                                  pfds[i].revents, i);
1672                 }
1673                 if (pfds[i].revents & POLLOUT)
1674                         thread_process_io_helper(m, m->write[pfds[i].fd],
1675                                                  POLLOUT, pfds[i].revents, i);
1676
1677                 /* if one of our file descriptors is garbage, remove the same
1678                  * from
1679                  * both pfds + update sizes and index */
1680                 if (pfds[i].revents & POLLNVAL) {
1681                         memmove(m->handler.pfds + i, m->handler.pfds + i + 1,
1682                                 (m->handler.pfdcount - i - 1)
1683                                         * sizeof(struct pollfd));
1684                         m->handler.pfdcount--;
1685                         m->handler.pfds[m->handler.pfdcount].fd = 0;
1686                         m->handler.pfds[m->handler.pfdcount].events = 0;
1687
1688                         memmove(pfds + i, pfds + i + 1,
1689                                 (m->handler.copycount - i - 1)
1690                                         * sizeof(struct pollfd));
1691                         m->handler.copycount--;
1692                         m->handler.copy[m->handler.copycount].fd = 0;
1693                         m->handler.copy[m->handler.copycount].events = 0;
1694
1695                         i--;
1696                 }
1697         }
1698 }
1699
1700 /* Add all timers that have popped to the ready list. */
1701 static unsigned int thread_process_timers(struct thread_master *m,
1702                                           struct timeval *timenow)
1703 {
1704         struct timeval prev = *timenow;
1705         bool displayed = false;
1706         struct thread *thread;
1707         unsigned int ready = 0;
1708
1709         while ((thread = thread_timer_list_first(&m->timer))) {
1710                 if (timercmp(timenow, &thread->u.sands, <))
1711                         break;
1712                 prev = thread->u.sands;
1713                 prev.tv_sec += 4;
1714                 /*
1715                  * If the timer would have popped 4 seconds in the
1716                  * past then we are in a situation where we are
1717                  * really getting behind on handling of events.
1718                  * Let's log it and do the right thing with it.
1719                  */
1720                 if (timercmp(timenow, &prev, >)) {
1721                         atomic_fetch_add_explicit(
1722                                 &thread->hist->total_starv_warn, 1,
1723                                 memory_order_seq_cst);
1724                         if (!displayed && !thread->ignore_timer_late) {
1725                                 flog_warn(
1726                                         EC_LIB_STARVE_THREAD,
1727                                         "Thread Starvation: %pTHD was scheduled to pop greater than 4s ago",
1728                                         thread);
1729                                 displayed = true;
1730                         }
1731                 }
1732
1733                 thread_timer_list_pop(&m->timer);
1734                 thread->type = THREAD_READY;
1735                 thread_list_add_tail(&m->ready, thread);
1736                 ready++;
1737         }
1738
1739         return ready;
1740 }
1741
1742 /* process a list en masse, e.g. for event thread lists */
1743 static unsigned int thread_process(struct thread_list_head *list)
1744 {
1745         struct thread *thread;
1746         unsigned int ready = 0;
1747
1748         while ((thread = thread_list_pop(list))) {
1749                 thread->type = THREAD_READY;
1750                 thread_list_add_tail(&thread->master->ready, thread);
1751                 ready++;
1752         }
1753         return ready;
1754 }
1755
1756
1757 /* Fetch next ready thread. */
1758 struct thread *thread_fetch(struct thread_master *m, struct thread *fetch)
1759 {
1760         struct thread *thread = NULL;
1761         struct timeval now;
1762         struct timeval zerotime = {0, 0};
1763         struct timeval tv;
1764         struct timeval *tw = NULL;
1765         bool eintr_p = false;
1766         int num = 0;
1767
1768         do {
1769                 /* Handle signals if any */
1770                 if (m->handle_signals)
1771                         frr_sigevent_process();
1772
1773                 pthread_mutex_lock(&m->mtx);
1774
1775                 /* Process any pending cancellation requests */
1776                 do_thread_cancel(m);
1777
1778                 /*
1779                  * Attempt to flush ready queue before going into poll().
1780                  * This is performance-critical. Think twice before modifying.
1781                  */
1782                 if ((thread = thread_list_pop(&m->ready))) {
1783                         fetch = thread_run(m, thread, fetch);
1784                         if (fetch->ref)
1785                                 *fetch->ref = NULL;
1786                         pthread_mutex_unlock(&m->mtx);
1787                         if (!m->ready_run_loop)
1788                                 GETRUSAGE(&m->last_getrusage);
1789                         m->ready_run_loop = true;
1790                         break;
1791                 }
1792
1793                 m->ready_run_loop = false;
1794                 /* otherwise, tick through scheduling sequence */
1795
1796                 /*
1797                  * Post events to ready queue. This must come before the
1798                  * following block since events should occur immediately
1799                  */
1800                 thread_process(&m->event);
1801
1802                 /*
1803                  * If there are no tasks on the ready queue, we will poll()
1804                  * until a timer expires or we receive I/O, whichever comes
1805                  * first. The strategy for doing this is:
1806                  *
1807                  * - If there are events pending, set the poll() timeout to zero
1808                  * - If there are no events pending, but there are timers
1809                  * pending, set the timeout to the smallest remaining time on
1810                  * any timer.
1811                  * - If there are neither timers nor events pending, but there
1812                  * are file descriptors pending, block indefinitely in poll()
1813                  * - If nothing is pending, it's time for the application to die
1814                  *
1815                  * In every case except the last, we need to hit poll() at least
1816                  * once per loop to avoid starvation by events
1817                  */
1818                 if (!thread_list_count(&m->ready))
1819                         tw = thread_timer_wait(&m->timer, &tv);
1820
1821                 if (thread_list_count(&m->ready) ||
1822                                 (tw && !timercmp(tw, &zerotime, >)))
1823                         tw = &zerotime;
1824
1825                 if (!tw && m->handler.pfdcount == 0) { /* die */
1826                         pthread_mutex_unlock(&m->mtx);
1827                         fetch = NULL;
1828                         break;
1829                 }
1830
1831                 /*
1832                  * Copy pollfd array + # active pollfds in it. Not necessary to
1833                  * copy the array size as this is fixed.
1834                  */
1835                 m->handler.copycount = m->handler.pfdcount;
1836                 memcpy(m->handler.copy, m->handler.pfds,
1837                        m->handler.copycount * sizeof(struct pollfd));
1838
1839                 pthread_mutex_unlock(&m->mtx);
1840                 {
1841                         eintr_p = false;
1842                         num = fd_poll(m, tw, &eintr_p);
1843                 }
1844                 pthread_mutex_lock(&m->mtx);
1845
1846                 /* Handle any errors received in poll() */
1847                 if (num < 0) {
1848                         if (eintr_p) {
1849                                 pthread_mutex_unlock(&m->mtx);
1850                                 /* loop around to signal handler */
1851                                 continue;
1852                         }
1853
1854                         /* else die */
1855                         flog_err(EC_LIB_SYSTEM_CALL, "poll() error: %s",
1856                                  safe_strerror(errno));
1857                         pthread_mutex_unlock(&m->mtx);
1858                         fetch = NULL;
1859                         break;
1860                 }
1861
1862                 /* Post timers to ready queue. */
1863                 monotime(&now);
1864                 thread_process_timers(m, &now);
1865
1866                 /* Post I/O to ready queue. */
1867                 if (num > 0)
1868                         thread_process_io(m, num);
1869
1870                 pthread_mutex_unlock(&m->mtx);
1871
1872         } while (!thread && m->spin);
1873
1874         return fetch;
1875 }
1876
1877 static unsigned long timeval_elapsed(struct timeval a, struct timeval b)
1878 {
1879         return (((a.tv_sec - b.tv_sec) * TIMER_SECOND_MICRO)
1880                 + (a.tv_usec - b.tv_usec));
1881 }
1882
1883 unsigned long thread_consumed_time(RUSAGE_T *now, RUSAGE_T *start,
1884                                    unsigned long *cputime)
1885 {
1886 #ifdef HAVE_CLOCK_THREAD_CPUTIME_ID
1887
1888 #ifdef __FreeBSD__
1889         /*
1890          * FreeBSD appears to have an issue when calling clock_gettime
1891          * with CLOCK_THREAD_CPUTIME_ID really close to each other
1892          * occassionally the now time will be before the start time.
1893          * This is not good and FRR is ending up with CPU HOG's
1894          * when the subtraction wraps to very large numbers
1895          *
1896          * What we are going to do here is cheat a little bit
1897          * and notice that this is a problem and just correct
1898          * it so that it is impossible to happen
1899          */
1900         if (start->cpu.tv_sec == now->cpu.tv_sec &&
1901             start->cpu.tv_nsec > now->cpu.tv_nsec)
1902                 now->cpu.tv_nsec = start->cpu.tv_nsec + 1;
1903         else if (start->cpu.tv_sec > now->cpu.tv_sec) {
1904                 now->cpu.tv_sec = start->cpu.tv_sec;
1905                 now->cpu.tv_nsec = start->cpu.tv_nsec + 1;
1906         }
1907 #endif
1908         *cputime = (now->cpu.tv_sec - start->cpu.tv_sec) * TIMER_SECOND_MICRO
1909                    + (now->cpu.tv_nsec - start->cpu.tv_nsec) / 1000;
1910 #else
1911         /* This is 'user + sys' time.  */
1912         *cputime = timeval_elapsed(now->cpu.ru_utime, start->cpu.ru_utime)
1913                    + timeval_elapsed(now->cpu.ru_stime, start->cpu.ru_stime);
1914 #endif
1915         return timeval_elapsed(now->real, start->real);
1916 }
1917
1918 /* We should aim to yield after yield milliseconds, which defaults
1919    to THREAD_YIELD_TIME_SLOT .
1920    Note: we are using real (wall clock) time for this calculation.
1921    It could be argued that CPU time may make more sense in certain
1922    contexts.  The things to consider are whether the thread may have
1923    blocked (in which case wall time increases, but CPU time does not),
1924    or whether the system is heavily loaded with other processes competing
1925    for CPU time.  On balance, wall clock time seems to make sense.
1926    Plus it has the added benefit that gettimeofday should be faster
1927    than calling getrusage. */
1928 int thread_should_yield(struct thread *thread)
1929 {
1930         int result;
1931         frr_with_mutex(&thread->mtx) {
1932                 result = monotime_since(&thread->real, NULL)
1933                          > (int64_t)thread->yield;
1934         }
1935         return result;
1936 }
1937
1938 void thread_set_yield_time(struct thread *thread, unsigned long yield_time)
1939 {
1940         frr_with_mutex(&thread->mtx) {
1941                 thread->yield = yield_time;
1942         }
1943 }
1944
1945 void thread_getrusage(RUSAGE_T *r)
1946 {
1947         monotime(&r->real);
1948         if (!cputime_enabled) {
1949                 memset(&r->cpu, 0, sizeof(r->cpu));
1950                 return;
1951         }
1952
1953 #ifdef HAVE_CLOCK_THREAD_CPUTIME_ID
1954         /* not currently implemented in Linux's vDSO, but maybe at some point
1955          * in the future?
1956          */
1957         clock_gettime(CLOCK_THREAD_CPUTIME_ID, &r->cpu);
1958 #else /* !HAVE_CLOCK_THREAD_CPUTIME_ID */
1959 #if defined RUSAGE_THREAD
1960 #define FRR_RUSAGE RUSAGE_THREAD
1961 #else
1962 #define FRR_RUSAGE RUSAGE_SELF
1963 #endif
1964         getrusage(FRR_RUSAGE, &(r->cpu));
1965 #endif
1966 }
1967
1968 /*
1969  * Call a thread.
1970  *
1971  * This function will atomically update the thread's usage history. At present
1972  * this is the only spot where usage history is written. Nevertheless the code
1973  * has been written such that the introduction of writers in the future should
1974  * not need to update it provided the writers atomically perform only the
1975  * operations done here, i.e. updating the total and maximum times. In
1976  * particular, the maximum real and cpu times must be monotonically increasing
1977  * or this code is not correct.
1978  */
1979 void thread_call(struct thread *thread)
1980 {
1981         RUSAGE_T before, after;
1982
1983         /* if the thread being called is the CLI, it may change cputime_enabled
1984          * ("service cputime-stats" command), which can result in nonsensical
1985          * and very confusing warnings
1986          */
1987         bool cputime_enabled_here = cputime_enabled;
1988
1989         if (thread->master->ready_run_loop)
1990                 before = thread->master->last_getrusage;
1991         else
1992                 GETRUSAGE(&before);
1993
1994         thread->real = before.real;
1995
1996         frrtrace(9, frr_libfrr, thread_call, thread->master,
1997                  thread->xref->funcname, thread->xref->xref.file,
1998                  thread->xref->xref.line, NULL, thread->u.fd,
1999                  thread->u.val, thread->arg, thread->u.sands.tv_sec);
2000
2001         pthread_setspecific(thread_current, thread);
2002         (*thread->func)(thread);
2003         pthread_setspecific(thread_current, NULL);
2004
2005         GETRUSAGE(&after);
2006         thread->master->last_getrusage = after;
2007
2008         unsigned long walltime, cputime;
2009         unsigned long exp;
2010
2011         walltime = thread_consumed_time(&after, &before, &cputime);
2012
2013         /* update walltime */
2014         atomic_fetch_add_explicit(&thread->hist->real.total, walltime,
2015                                   memory_order_seq_cst);
2016         exp = atomic_load_explicit(&thread->hist->real.max,
2017                                    memory_order_seq_cst);
2018         while (exp < walltime
2019                && !atomic_compare_exchange_weak_explicit(
2020                        &thread->hist->real.max, &exp, walltime,
2021                        memory_order_seq_cst, memory_order_seq_cst))
2022                 ;
2023
2024         if (cputime_enabled_here && cputime_enabled) {
2025                 /* update cputime */
2026                 atomic_fetch_add_explicit(&thread->hist->cpu.total, cputime,
2027                                           memory_order_seq_cst);
2028                 exp = atomic_load_explicit(&thread->hist->cpu.max,
2029                                            memory_order_seq_cst);
2030                 while (exp < cputime
2031                        && !atomic_compare_exchange_weak_explicit(
2032                                &thread->hist->cpu.max, &exp, cputime,
2033                                memory_order_seq_cst, memory_order_seq_cst))
2034                         ;
2035         }
2036
2037         atomic_fetch_add_explicit(&thread->hist->total_calls, 1,
2038                                   memory_order_seq_cst);
2039         atomic_fetch_or_explicit(&thread->hist->types, 1 << thread->add_type,
2040                                  memory_order_seq_cst);
2041
2042         if (cputime_enabled_here && cputime_enabled && cputime_threshold
2043             && cputime > cputime_threshold) {
2044                 /*
2045                  * We have a CPU Hog on our hands.  The time FRR has spent
2046                  * doing actual work (not sleeping) is greater than 5 seconds.
2047                  * Whinge about it now, so we're aware this is yet another task
2048                  * to fix.
2049                  */
2050                 atomic_fetch_add_explicit(&thread->hist->total_cpu_warn,
2051                                           1, memory_order_seq_cst);
2052                 flog_warn(
2053                         EC_LIB_SLOW_THREAD_CPU,
2054                         "CPU HOG: task %s (%lx) ran for %lums (cpu time %lums)",
2055                         thread->xref->funcname, (unsigned long)thread->func,
2056                         walltime / 1000, cputime / 1000);
2057
2058         } else if (walltime_threshold && walltime > walltime_threshold) {
2059                 /*
2060                  * The runtime for a task is greater than 5 seconds, but the
2061                  * cpu time is under 5 seconds.  Let's whine about this because
2062                  * this could imply some sort of scheduling issue.
2063                  */
2064                 atomic_fetch_add_explicit(&thread->hist->total_wall_warn,
2065                                           1, memory_order_seq_cst);
2066                 flog_warn(
2067                         EC_LIB_SLOW_THREAD_WALL,
2068                         "STARVATION: task %s (%lx) ran for %lums (cpu time %lums)",
2069                         thread->xref->funcname, (unsigned long)thread->func,
2070                         walltime / 1000, cputime / 1000);
2071         }
2072 }
2073
2074 /* Execute thread */
2075 void _thread_execute(const struct xref_threadsched *xref,
2076                      struct thread_master *m, void (*func)(struct thread *),
2077                      void *arg, int val)
2078 {
2079         struct thread *thread;
2080
2081         /* Get or allocate new thread to execute. */
2082         frr_with_mutex(&m->mtx) {
2083                 thread = thread_get(m, THREAD_EVENT, func, arg, xref);
2084
2085                 /* Set its event value. */
2086                 frr_with_mutex(&thread->mtx) {
2087                         thread->add_type = THREAD_EXECUTE;
2088                         thread->u.val = val;
2089                         thread->ref = &thread;
2090                 }
2091         }
2092
2093         /* Execute thread doing all accounting. */
2094         thread_call(thread);
2095
2096         /* Give back or free thread. */
2097         thread_add_unuse(m, thread);
2098 }
2099
2100 /* Debug signal mask - if 'sigs' is NULL, use current effective mask. */
2101 void debug_signals(const sigset_t *sigs)
2102 {
2103         int i, found;
2104         sigset_t tmpsigs;
2105         char buf[300];
2106
2107         /*
2108          * We're only looking at the non-realtime signals here, so we need
2109          * some limit value. Platform differences mean at some point we just
2110          * need to pick a reasonable value.
2111          */
2112 #if defined SIGRTMIN
2113 #  define LAST_SIGNAL SIGRTMIN
2114 #else
2115 #  define LAST_SIGNAL 32
2116 #endif
2117
2118
2119         if (sigs == NULL) {
2120                 sigemptyset(&tmpsigs);
2121                 pthread_sigmask(SIG_BLOCK, NULL, &tmpsigs);
2122                 sigs = &tmpsigs;
2123         }
2124
2125         found = 0;
2126         buf[0] = '\0';
2127
2128         for (i = 0; i < LAST_SIGNAL; i++) {
2129                 char tmp[20];
2130
2131                 if (sigismember(sigs, i) > 0) {
2132                         if (found > 0)
2133                                 strlcat(buf, ",", sizeof(buf));
2134                         snprintf(tmp, sizeof(tmp), "%d", i);
2135                         strlcat(buf, tmp, sizeof(buf));
2136                         found++;
2137                 }
2138         }
2139
2140         if (found == 0)
2141                 snprintf(buf, sizeof(buf), "<none>");
2142
2143         zlog_debug("%s: %s", __func__, buf);
2144 }
2145
2146 bool thread_is_scheduled(struct thread *thread)
2147 {
2148         if (thread == NULL)
2149                 return false;
2150
2151         return true;
2152 }
2153
2154 static ssize_t printfrr_thread_dbg(struct fbuf *buf, struct printfrr_eargs *ea,
2155                                    const struct thread *thread)
2156 {
2157         static const char * const types[] = {
2158                 [THREAD_READ] = "read",
2159                 [THREAD_WRITE] = "write",
2160                 [THREAD_TIMER] = "timer",
2161                 [THREAD_EVENT] = "event",
2162                 [THREAD_READY] = "ready",
2163                 [THREAD_UNUSED] = "unused",
2164                 [THREAD_EXECUTE] = "exec",
2165         };
2166         ssize_t rv = 0;
2167         char info[16] = "";
2168
2169         if (!thread)
2170                 return bputs(buf, "{(thread *)NULL}");
2171
2172         rv += bprintfrr(buf, "{(thread *)%p arg=%p", thread, thread->arg);
2173
2174         if (thread->type < array_size(types) && types[thread->type])
2175                 rv += bprintfrr(buf, " %-6s", types[thread->type]);
2176         else
2177                 rv += bprintfrr(buf, " INVALID(%u)", thread->type);
2178
2179         switch (thread->type) {
2180         case THREAD_READ:
2181         case THREAD_WRITE:
2182                 snprintfrr(info, sizeof(info), "fd=%d", thread->u.fd);
2183                 break;
2184
2185         case THREAD_TIMER:
2186                 snprintfrr(info, sizeof(info), "r=%pTVMud", &thread->u.sands);
2187                 break;
2188         }
2189
2190         rv += bprintfrr(buf, " %-12s %s() %s from %s:%d}", info,
2191                         thread->xref->funcname, thread->xref->dest,
2192                         thread->xref->xref.file, thread->xref->xref.line);
2193         return rv;
2194 }
2195
2196 printfrr_ext_autoreg_p("TH", printfrr_thread);
2197 static ssize_t printfrr_thread(struct fbuf *buf, struct printfrr_eargs *ea,
2198                                const void *ptr)
2199 {
2200         const struct thread *thread = ptr;
2201         struct timespec remain = {};
2202
2203         if (ea->fmt[0] == 'D') {
2204                 ea->fmt++;
2205                 return printfrr_thread_dbg(buf, ea, thread);
2206         }
2207
2208         if (!thread) {
2209                 /* need to jump over time formatting flag characters in the
2210                  * input format string, i.e. adjust ea->fmt!
2211                  */
2212                 printfrr_time(buf, ea, &remain,
2213                               TIMEFMT_TIMER_DEADLINE | TIMEFMT_SKIP);
2214                 return bputch(buf, '-');
2215         }
2216
2217         TIMEVAL_TO_TIMESPEC(&thread->u.sands, &remain);
2218         return printfrr_time(buf, ea, &remain, TIMEFMT_TIMER_DEADLINE);
2219 }