]> git.proxmox.com Git - mirror_frr.git/blob - watchfrr/watchfrr.c
Merge pull request #912 from chiragshah6/mdev
[mirror_frr.git] / watchfrr / watchfrr.c
1 /*
2 * Monitor status of frr daemons and restart if necessary.
3 *
4 * Copyright (C) 2004 Andrew J. Schorr
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include <zebra.h>
22 #include <thread.h>
23 #include <log.h>
24 #include <network.h>
25 #include <sigevent.h>
26 #include <lib/version.h>
27 #include "command.h"
28 #include "memory_vty.h"
29 #include "libfrr.h"
30
31 #include <getopt.h>
32 #include <sys/un.h>
33 #include <sys/wait.h>
34 #include <memory.h>
35 #include <systemd.h>
36
37 #include "watchfrr.h"
38
39 #ifndef MIN
40 #define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
41 #endif
42
43 /* Macros to help randomize timers. */
44 #define JITTER(X) ((random() % ((X)+1))-((X)/2))
45 #define FUZZY(X) ((X)+JITTER((X)/20))
46
47 #define DEFAULT_PERIOD 5
48 #define DEFAULT_TIMEOUT 10
49 #define DEFAULT_RESTART_TIMEOUT 20
50 #define DEFAULT_LOGLEVEL LOG_INFO
51 #define DEFAULT_MIN_RESTART 60
52 #define DEFAULT_MAX_RESTART 600
53 #ifdef PATH_WATCHFRR_PID
54 #define DEFAULT_PIDFILE PATH_WATCHFRR_PID
55 #else
56 #define DEFAULT_PIDFILE STATEDIR "/watchfrr.pid"
57 #endif
58 #ifdef DAEMON_VTY_DIR
59 #define VTYDIR DAEMON_VTY_DIR
60 #else
61 #define VTYDIR STATEDIR
62 #endif
63
64 #define PING_TOKEN "PING"
65
66 /* Needs to be global, referenced somewhere inside libfrr. */
67 struct thread_master *master;
68
69 typedef enum {
70 MODE_MONITOR = 0,
71 MODE_GLOBAL_RESTART,
72 MODE_SEPARATE_RESTART,
73 MODE_PHASED_ZEBRA_RESTART,
74 MODE_PHASED_ALL_RESTART
75 } watch_mode_t;
76
77 static const char *mode_str[] = {
78 "monitor",
79 "global restart",
80 "individual daemon restart",
81 "phased zebra restart",
82 "phased global restart for any failure",
83 };
84
85 typedef enum {
86 PHASE_NONE = 0,
87 PHASE_STOPS_PENDING,
88 PHASE_WAITING_DOWN,
89 PHASE_ZEBRA_RESTART_PENDING,
90 PHASE_WAITING_ZEBRA_UP
91 } restart_phase_t;
92
/*
 * Human-readable names for the phased-restart state machine, indexed by
 * restart_phase_t.  Keep this table in step with the enum: exactly one
 * string per phase, in declaration order.
 */
static const char *phase_str[] = {
	"None",
	"Stop jobs running",
	"Waiting for other daemons to come down",
	"Zebra restart job running",
	"Waiting for zebra to come up",
};
101
102 #define PHASE_TIMEOUT (3*gs.restart_timeout)
103
/*
 * Bookkeeping for one background shell job (restart/start/stop command).
 * One instance lives in every struct daemon, plus one in the global state
 * for the "all" job.
 */
struct restart_info {
	/* Name substituted for "%s" in the command and used in log messages. */
	const char *name;
	/* Kind of job currently/last run (the cmdtype given to run_job). */
	const char *what;
	/* Pid of the running child, or 0 when no job is outstanding. */
	pid_t pid;
	/* When the job was last started; refreshed when the child is reaped. */
	struct timeval time;
	/* Minimum number of seconds that must pass before the next run. */
	long interval;
	/* Timer that fires restart_kill() if the child runs too long. */
	struct thread *t_kill;
	/* Number of signals already sent to the child by restart_kill(). */
	int kills;
};
113
/*
 * Process-wide configuration and runtime state of watchfrr.  The
 * initializer below supplies the defaults; main() overrides them from the
 * command line.
 */
static struct global_state {
	/* Selected operating mode (monitor only, or one of the restart modes). */
	watch_mode_t mode;
	/* Current step of a phased restart, PHASE_NONE when idle. */
	restart_phase_t phase;
	/* Timer armed by set_phase() that aborts a phase that hangs. */
	struct thread *t_phase_hanging;
	/* Directory holding the daemons' "<name>.vty" sockets. */
	const char *vtydir;
	/* Polling period; passed (fuzzed) to thread_add_timer_msec(). */
	long period;
	/* Seconds to wait for a connect or an echo reply. */
	long timeout;
	/* Seconds a background job may run before restart_kill() signals it. */
	long restart_timeout;
	/* Bounds (seconds) for the per-job restart back-off interval. */
	long min_restart_interval;
	long max_restart_interval;
	/* Non-zero: send periodic echo pings to daemons that are up. */
	int do_ping;
	/* Head of the singly linked list of monitored daemons. */
	struct daemon *daemons;
	/* Shell command templates handed to run_job(). */
	const char *restart_command;
	const char *start_command;
	const char *stop_command;
	/* Job state for the global ("all") restart command. */
	struct restart_info restart;
	/* Non-zero: an unresponsive daemon triggers a restart attempt. */
	int unresponsive_restart;
	/* Verbosity threshold compared against LOG_DEBUG to gate debug logs. */
	int loglevel;
	struct daemon *special; /* points to zebra when doing phased restart */
	int numdaemons;
	/* Number of background jobs currently running. */
	int numpids;
	int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
} gs = {
	.mode = MODE_MONITOR,
	.phase = PHASE_NONE,
	.vtydir = VTYDIR,
	.period = 1000 * DEFAULT_PERIOD,
	.timeout = DEFAULT_TIMEOUT,
	.restart_timeout = DEFAULT_RESTART_TIMEOUT,
	.loglevel = DEFAULT_LOGLEVEL,
	.min_restart_interval = DEFAULT_MIN_RESTART,
	.max_restart_interval = DEFAULT_MAX_RESTART,
	.do_ping = 1,
};
148
149 typedef enum {
150 DAEMON_INIT,
151 DAEMON_DOWN,
152 DAEMON_CONNECTING,
153 DAEMON_UP,
154 DAEMON_UNRESPONSIVE
155 } daemon_state_t;
156
157 #define IS_UP(DMN) \
158 (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
159
160 static const char *state_str[] = {
161 "Init", "Down", "Connecting", "Up", "Unresponsive",
162 };
163
/* Per-daemon monitoring state; instances are chained through ->next. */
struct daemon {
	/* Daemon name; also used to build the "<vtydir>/<name>.vty" path. */
	const char *name;
	/* Current position in the Init/Down/Connecting/Up/Unresponsive cycle. */
	daemon_state_t state;
	/* Connected vty socket, or -1 once daemon_down() has closed it. */
	int fd;
	/* Time the last echo was written; tv_sec == 0 means none outstanding. */
	struct timeval echo_sent;
	/* Connection attempts since the daemon was last marked up. */
	u_int connect_tries;
	/* Pending timer (reconnect, echo, no-answer or unresponsive wakeup). */
	struct thread *t_wakeup;
	/* Pending read event on fd (handle_read). */
	struct thread *t_read;
	/* Pending write event on fd (check_connect, during a delayed connect). */
	struct thread *t_write;
	/* Next daemon in the gs.daemons list. */
	struct daemon *next;
	/* Background job state for this daemon's restart/start commands. */
	struct restart_info restart;
};
176
177 #define OPTION_MINRESTART 2000
178 #define OPTION_MAXRESTART 2001
179
180 static const struct option longopts[] = {
181 {"daemon", no_argument, NULL, 'd'},
182 {"statedir", required_argument, NULL, 'S'},
183 {"no-echo", no_argument, NULL, 'e'},
184 {"loglevel", required_argument, NULL, 'l'},
185 {"interval", required_argument, NULL, 'i'},
186 {"timeout", required_argument, NULL, 't'},
187 {"restart-timeout", required_argument, NULL, 'T'},
188 {"restart", required_argument, NULL, 'r'},
189 {"start-command", required_argument, NULL, 's'},
190 {"kill-command", required_argument, NULL, 'k'},
191 {"restart-all", required_argument, NULL, 'R'},
192 {"all-restart", no_argument, NULL, 'a'},
193 {"always-all-restart", no_argument, NULL, 'A'},
194 {"unresponsive-restart", no_argument, NULL, 'z'},
195 {"min-restart-interval", required_argument, NULL, OPTION_MINRESTART},
196 {"max-restart-interval", required_argument, NULL, OPTION_MAXRESTART},
197 {"pid-file", required_argument, NULL, 'p'},
198 {"blank-string", required_argument, NULL, 'b'},
199 {"help", no_argument, NULL, 'h'},
200 {"version", no_argument, NULL, 'v'},
201 {NULL, 0, NULL, 0}};
202
203 static int try_connect(struct daemon *dmn);
204 static int wakeup_send_echo(struct thread *t_wakeup);
205 static void try_restart(struct daemon *dmn);
206 static void phase_check(void);
207
208 static const char *progname;
209 static void printhelp(FILE *target)
210 {
211 fprintf(target,
212 "Usage : %s [OPTION...] <daemon name> ...\n\n\
213 Watchdog program to monitor status of frr daemons and try to restart\n\
214 them if they are down or unresponsive. It determines whether a daemon is\n\
215 up based on whether it can connect to the daemon's vty unix stream socket.\n\
216 It then repeatedly sends echo commands over that socket to determine whether\n\
217 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
218 on the socket connection and know immediately that the daemon is down.\n\n\
219 The daemons to be monitored should be listed on the command line.\n\n\
220 This program can run in one of 5 modes:\n\n\
221 0. Mode: %s.\n\
222 Just monitor and report on status changes. Example:\n\
223 %s -d zebra ospfd bgpd\n\n\
224 1. Mode: %s.\n\
225 Whenever any daemon hangs or crashes, use the given command to restart\n\
226 them all. Example:\n\
227 %s -dz \\\n\
228 -R '/sbin/service zebra restart; /sbin/service ospfd restart' \\\n\
229 zebra ospfd\n\n\
230 2. Mode: %s.\n\
231 When any single daemon hangs or crashes, restart only the daemon that's\n\
232 in trouble using the supplied restart command. Example:\n\
233 %s -dz -r '/sbin/service %%s restart' zebra ospfd bgpd\n\n\
234 3. Mode: %s.\n\
235 The same as the previous mode, except that there is special treatment when\n\
236 the zebra daemon is in trouble. In that case, a phased restart approach\n\
237 is used: 1. stop all other daemons; 2. restart zebra; 3. start the other\n\
238 daemons. Example:\n\
239 %s -adz -r '/sbin/service %%s restart' \\\n\
240 -s '/sbin/service %%s start' \\\n\
241 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
242 4. Mode: %s.\n\
243 This is the same as the previous mode, except that the phased restart\n\
244 procedure is used whenever any of the daemons hangs or crashes. Example:\n\
245 %s -Adz -r '/sbin/service %%s restart' \\\n\
246 -s '/sbin/service %%s start' \\\n\
247 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
248 As of this writing, it is believed that mode 2 [%s]\n\
249 is not safe, and mode 3 [%s] may not be safe with some of the\n\
250 routing daemons.\n\n\
251 In order to avoid attempting to restart the daemons in a fast loop,\n\
252 the -m and -M options allow you to control the minimum delay between\n\
253 restart commands. The minimum restart delay is recalculated each time\n\
254 a restart is attempted: if the time since the last restart attempt exceeds\n\
255 twice the -M value, then the restart delay is set to the -m value.\n\
256 Otherwise, the interval is doubled (but capped at the -M value).\n\n",
257 progname, mode_str[0], progname, mode_str[1], progname,
258 mode_str[2], progname, mode_str[3], progname, mode_str[4],
259 progname, mode_str[2], mode_str[3]);
260
261 fprintf(target,
262 "Options:\n\
263 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
264 to syslog instead of stdout.\n\
265 -S, --statedir Set the vty socket directory (default is %s)\n\
266 -e, --no-echo Do not ping the daemons to test responsiveness (this\n\
267 option is necessary if the daemons do not support the\n\
268 echo command)\n\
269 -l, --loglevel Set the logging level (default is %d).\n\
270 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
271 but it can be set higher than %d if extra-verbose debugging\n\
272 messages are desired.\n\
273 --min-restart-interval\n\
274 Set the minimum seconds to wait between invocations of daemon\n\
275 restart commands (default is %d).\n\
276 --max-restart-interval\n\
277 Set the maximum seconds to wait between invocations of daemon\n\
278 restart commands (default is %d).\n\
279 -i, --interval Set the status polling interval in seconds (default is %d)\n\
280 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
281 -T, --restart-timeout\n\
282 Set the restart (kill) timeout in seconds (default is %d).\n\
283 If any background jobs are still running after this much\n\
284 time has elapsed, they will be killed.\n\
285 -r, --restart Supply a Bourne shell command to use to restart a single\n\
286 daemon. The command string should include '%%s' where the\n\
287 name of the daemon should be substituted.\n\
288 Note that -r and -R are incompatible.\n\
289 -s, --start-command\n\
290 Supply a Bourne shell to command to use to start a single\n\
291 daemon. The command string should include '%%s' where the\n\
292 name of the daemon should be substituted.\n\
293 -k, --kill-command\n\
294 Supply a Bourne shell to command to use to stop a single\n\
295 daemon. The command string should include '%%s' where the\n\
296 name of the daemon should be substituted.\n\
297 -R, --restart-all\n\
298 When one or more daemons is down, try to restart everything\n\
299 using the Bourne shell command supplied as the argument.\n\
300 Note that -r and -R are incompatible.\n\
301 -z, --unresponsive-restart\n\
302 When a daemon is unresponsive, treat it as being down for\n\
303 restart purposes.\n\
304 -a, --all-restart\n\
305 When zebra hangs or crashes, restart all daemons using\n\
306 this phased approach: 1. stop all other daemons; 2. restart\n\
307 zebra; 3. start other daemons. Requires -r, -s, and -k.\n\
308 -A, --always-all-restart\n\
309 When any daemon (not just zebra) hangs or crashes, use the\n\
310 same phased restart mechanism described above for -a.\n\
311 Requires -r, -s, and -k.\n\
312 -p, --pid-file Set process identifier file name\n\
313 (default is %s).\n\
314 -b, --blank-string\n\
315 When the supplied argument string is found in any of the\n\
316 various shell command arguments (-r, -s, -k, or -R), replace\n\
317 it with a space. This is an ugly hack to circumvent problems\n\
318 passing command-line arguments with embedded spaces.\n\
319 -v, --version Print program version\n\
320 -h, --help Display this help and exit\n",
321 VTYDIR, DEFAULT_LOGLEVEL, LOG_EMERG, LOG_DEBUG, LOG_DEBUG,
322 DEFAULT_MIN_RESTART, DEFAULT_MAX_RESTART, DEFAULT_PERIOD,
323 DEFAULT_TIMEOUT, DEFAULT_RESTART_TIMEOUT, DEFAULT_PIDFILE);
324 }
325
/*
 * Run shell_cmd asynchronously through "/bin/sh -c".
 *
 * The child moves itself into its own process group so that the whole job
 * can later be signalled through the negated pid (see restart_kill()).
 *
 * Returns the child's pid in the parent, or -1 when fork() fails.  The
 * child never returns from this function: it either execs the shell or
 * terminates with exit status 127.
 */
static pid_t run_background(char *shell_cmd)
{
	pid_t child = fork();

	if (child < 0) {
		zlog_err("fork failed, cannot run command [%s]: %s", shell_cmd,
			 safe_strerror(errno));
		return -1;
	}

	if (child == 0) {
		/* Child process. */
		char shell[] = "sh";
		char dashc[] = "-c";
		char *const argv[4] = {shell, dashc, shell_cmd, NULL};

		/* Use separate process group so child processes can be killed
		 * easily. */
		if (setpgid(0, 0) < 0)
			zlog_warn("warning: setpgid(0,0) failed: %s",
				  safe_strerror(errno));

		execv("/bin/sh", argv);
		/* Only reached when the exec itself failed. */
		zlog_err("execv(/bin/sh -c '%s') failed: %s", shell_cmd,
			 safe_strerror(errno));
		_exit(127);
	}

	/*
	 * Parent process: we will reap the child later.  A successful fork
	 * is not an error, so report it at notice level.
	 */
	zlog_notice("Forked background command [pid %d]: %s", (int)child,
		    shell_cmd);
	return child;
}
358
/*
 * Store in *result the wall-clock time that has passed since *start_time
 * and return result.  The microsecond field is normalised so that it is
 * never negative.
 */
static struct timeval *time_elapsed(struct timeval *result,
				    const struct timeval *start_time)
{
	gettimeofday(result, NULL);
	result->tv_sec = result->tv_sec - start_time->tv_sec;
	result->tv_usec = result->tv_usec - start_time->tv_usec;

	/* Borrow whole seconds until the microsecond part is non-negative. */
	for (; result->tv_usec < 0; result->tv_sec--)
		result->tv_usec += 1000000L;

	return result;
}
371
372 static int restart_kill(struct thread *t_kill)
373 {
374 struct restart_info *restart = THREAD_ARG(t_kill);
375 struct timeval delay;
376
377 time_elapsed(&delay, &restart->time);
378 zlog_warn(
379 "Warning: %s %s child process %d still running after "
380 "%ld seconds, sending signal %d",
381 restart->what, restart->name, (int)restart->pid,
382 (long)delay.tv_sec, (restart->kills ? SIGKILL : SIGTERM));
383 kill(-restart->pid, (restart->kills ? SIGKILL : SIGTERM));
384 restart->kills++;
385 restart->t_kill = NULL;
386 thread_add_timer(master, restart_kill, restart, gs.restart_timeout,
387 &restart->t_kill);
388 return 0;
389 }
390
391 static struct restart_info *find_child(pid_t child)
392 {
393 if (gs.mode == MODE_GLOBAL_RESTART) {
394 if (gs.restart.pid == child)
395 return &gs.restart;
396 } else {
397 struct daemon *dmn;
398 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
399 if (dmn->restart.pid == child)
400 return &dmn->restart;
401 }
402 }
403 return NULL;
404 }
405
406 static void sigchild(void)
407 {
408 pid_t child;
409 int status;
410 const char *name;
411 const char *what;
412 struct restart_info *restart;
413
414 switch (child = waitpid(-1, &status, WNOHANG)) {
415 case -1:
416 zlog_err("waitpid failed: %s", safe_strerror(errno));
417 return;
418 case 0:
419 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
420 return;
421 }
422
423 if (child == integrated_write_pid) {
424 integrated_write_sigchld(status);
425 return;
426 }
427
428 if ((restart = find_child(child)) != NULL) {
429 name = restart->name;
430 what = restart->what;
431 restart->pid = 0;
432 gs.numpids--;
433 thread_cancel(restart->t_kill);
434 restart->t_kill = NULL;
435 /* Update restart time to reflect the time the command
436 * completed. */
437 gettimeofday(&restart->time, NULL);
438 } else {
439 zlog_err(
440 "waitpid returned status for an unknown child process %d",
441 (int)child);
442 name = "(unknown)";
443 what = "background";
444 }
445 if (WIFSTOPPED(status))
446 zlog_warn("warning: %s %s process %d is stopped", what, name,
447 (int)child);
448 else if (WIFSIGNALED(status))
449 zlog_warn("%s %s process %d terminated due to signal %d", what,
450 name, (int)child, WTERMSIG(status));
451 else if (WIFEXITED(status)) {
452 if (WEXITSTATUS(status) != 0)
453 zlog_warn(
454 "%s %s process %d exited with non-zero status %d",
455 what, name, (int)child, WEXITSTATUS(status));
456 else
457 zlog_debug("%s %s process %d exited normally", what,
458 name, (int)child);
459 } else
460 zlog_err("cannot interpret %s %s process %d wait status 0x%x",
461 what, name, (int)child, status);
462 phase_check();
463 }
464
465 static int run_job(struct restart_info *restart, const char *cmdtype,
466 const char *command, int force, int update_interval)
467 {
468 struct timeval delay;
469
470 if (gs.loglevel > LOG_DEBUG + 1)
471 zlog_debug("attempting to %s %s", cmdtype, restart->name);
472
473 if (restart->pid) {
474 if (gs.loglevel > LOG_DEBUG + 1)
475 zlog_debug(
476 "cannot %s %s, previous pid %d still running",
477 cmdtype, restart->name, (int)restart->pid);
478 return -1;
479 }
480
481 /* Note: time_elapsed test must come before the force test, since we
482 need
483 to make sure that delay is initialized for use below in updating the
484 restart interval. */
485 if ((time_elapsed(&delay, &restart->time)->tv_sec < restart->interval)
486 && !force) {
487 if (gs.loglevel > LOG_DEBUG + 1)
488 zlog_debug(
489 "postponing %s %s: "
490 "elapsed time %ld < retry interval %ld",
491 cmdtype, restart->name, (long)delay.tv_sec,
492 restart->interval);
493 return -1;
494 }
495
496 gettimeofday(&restart->time, NULL);
497 restart->kills = 0;
498 {
499 char cmd[strlen(command) + strlen(restart->name) + 1];
500 snprintf(cmd, sizeof(cmd), command, restart->name);
501 if ((restart->pid = run_background(cmd)) > 0) {
502 restart->t_kill = NULL;
503 thread_add_timer(master, restart_kill, restart,
504 gs.restart_timeout, &restart->t_kill);
505 restart->what = cmdtype;
506 gs.numpids++;
507 } else
508 restart->pid = 0;
509 }
510
511 /* Calculate the new restart interval. */
512 if (update_interval) {
513 if (delay.tv_sec > 2 * gs.max_restart_interval)
514 restart->interval = gs.min_restart_interval;
515 else if ((restart->interval *= 2) > gs.max_restart_interval)
516 restart->interval = gs.max_restart_interval;
517 if (gs.loglevel > LOG_DEBUG + 1)
518 zlog_debug("restart %s interval is now %ld",
519 restart->name, restart->interval);
520 }
521 return restart->pid;
522 }
523
/*
 * Helpers that (re)arm the libfrr events of a daemon.  Each one clears the
 * stored thread pointer and then schedules the event, storing the new
 * thread back into the daemon.
 *
 * The macros expand to a single statement WITHOUT a trailing semicolon
 * (plain do { } while (0)), so the caller's own ';' completes it and the
 * macros remain safe inside unbraced if/else bodies.
 */

/* Watch the daemon's socket for readability (handle_read). */
#define SET_READ_HANDLER(DMN)                                                  \
	do {                                                                   \
		(DMN)->t_read = NULL;                                          \
		thread_add_read(master, handle_read, (DMN), (DMN)->fd,         \
				&(DMN)->t_read);                               \
	} while (0)

/* Retry connecting to a down daemon after one (fuzzed) polling period. */
#define SET_WAKEUP_DOWN(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_down, (DMN),              \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Re-check an unresponsive daemon after one (fuzzed) polling period. */
#define SET_WAKEUP_UNRESPONSIVE(DMN)                                           \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_unresponsive, (DMN),      \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Send the next echo ping after one (fuzzed) polling period. */
#define SET_WAKEUP_ECHO(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_send_echo, (DMN),         \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)
551
552 static int wakeup_down(struct thread *t_wakeup)
553 {
554 struct daemon *dmn = THREAD_ARG(t_wakeup);
555
556 dmn->t_wakeup = NULL;
557 if (try_connect(dmn) < 0)
558 SET_WAKEUP_DOWN(dmn);
559 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
560 try_restart(dmn);
561 return 0;
562 }
563
564 static int wakeup_init(struct thread *t_wakeup)
565 {
566 struct daemon *dmn = THREAD_ARG(t_wakeup);
567
568 dmn->t_wakeup = NULL;
569 if (try_connect(dmn) < 0) {
570 SET_WAKEUP_DOWN(dmn);
571 zlog_err("%s state -> down : initial connection attempt failed",
572 dmn->name);
573 dmn->state = DAEMON_DOWN;
574 }
575 return 0;
576 }
577
578 static void daemon_down(struct daemon *dmn, const char *why)
579 {
580 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
581 zlog_err("%s state -> down : %s", dmn->name, why);
582 else if (gs.loglevel > LOG_DEBUG)
583 zlog_debug("%s still down : %s", dmn->name, why);
584 if (IS_UP(dmn))
585 gs.numdown++;
586 dmn->state = DAEMON_DOWN;
587 if (dmn->fd >= 0) {
588 close(dmn->fd);
589 dmn->fd = -1;
590 }
591 THREAD_OFF(dmn->t_read);
592 THREAD_OFF(dmn->t_write);
593 THREAD_OFF(dmn->t_wakeup);
594 if (try_connect(dmn) < 0)
595 SET_WAKEUP_DOWN(dmn);
596 phase_check();
597 }
598
599 static int handle_read(struct thread *t_read)
600 {
601 struct daemon *dmn = THREAD_ARG(t_read);
602 static const char resp[sizeof(PING_TOKEN) + 4] = PING_TOKEN "\n";
603 char buf[sizeof(resp) + 100];
604 ssize_t rc;
605 struct timeval delay;
606
607 dmn->t_read = NULL;
608 if ((rc = read(dmn->fd, buf, sizeof(buf))) < 0) {
609 char why[100];
610
611 if (ERRNO_IO_RETRY(errno)) {
612 /* Pretend it never happened. */
613 SET_READ_HANDLER(dmn);
614 return 0;
615 }
616 snprintf(why, sizeof(why), "unexpected read error: %s",
617 safe_strerror(errno));
618 daemon_down(dmn, why);
619 return 0;
620 }
621 if (rc == 0) {
622 daemon_down(dmn, "read returned EOF");
623 return 0;
624 }
625 if (!dmn->echo_sent.tv_sec) {
626 char why[sizeof(buf) + 100];
627 snprintf(why, sizeof(why),
628 "unexpected read returns %d bytes: %.*s", (int)rc,
629 (int)rc, buf);
630 daemon_down(dmn, why);
631 return 0;
632 }
633
634 /* We are expecting an echo response: is there any chance that the
635 response would not be returned entirely in the first read? That
636 seems inconceivable... */
637 if ((rc != sizeof(resp)) || memcmp(buf, resp, sizeof(resp))) {
638 char why[100 + sizeof(buf)];
639 snprintf(why, sizeof(why),
640 "read returned bad echo response of %d bytes "
641 "(expecting %u): %.*s",
642 (int)rc, (u_int)sizeof(resp), (int)rc, buf);
643 daemon_down(dmn, why);
644 return 0;
645 }
646
647 time_elapsed(&delay, &dmn->echo_sent);
648 dmn->echo_sent.tv_sec = 0;
649 if (dmn->state == DAEMON_UNRESPONSIVE) {
650 if (delay.tv_sec < gs.timeout) {
651 dmn->state = DAEMON_UP;
652 zlog_warn(
653 "%s state -> up : echo response received after %ld.%06ld "
654 "seconds",
655 dmn->name, (long)delay.tv_sec,
656 (long)delay.tv_usec);
657 } else
658 zlog_warn(
659 "%s: slow echo response finally received after %ld.%06ld "
660 "seconds",
661 dmn->name, (long)delay.tv_sec,
662 (long)delay.tv_usec);
663 } else if (gs.loglevel > LOG_DEBUG + 1)
664 zlog_debug("%s: echo response received after %ld.%06ld seconds",
665 dmn->name, (long)delay.tv_sec, (long)delay.tv_usec);
666
667 SET_READ_HANDLER(dmn);
668 if (dmn->t_wakeup)
669 thread_cancel(dmn->t_wakeup);
670 SET_WAKEUP_ECHO(dmn);
671
672 return 0;
673 }
674
675 /*
676 * Wait till we notice that all daemons are ready before
677 * we send we are ready to systemd
678 */
679 static void daemon_send_ready(void)
680 {
681 static int sent = 0;
682 if (!sent && gs.numdown == 0) {
683 FILE *fp;
684
685 fp = fopen(DAEMON_VTY_DIR "/watchfrr.started", "w");
686 fclose(fp);
687 #if defined HAVE_SYSTEMD
688 zlog_notice(
689 "Watchfrr: Notifying Systemd we are up and running");
690 systemd_send_started(master, 0);
691 #endif
692 sent = 1;
693 }
694 }
695
696 static void daemon_up(struct daemon *dmn, const char *why)
697 {
698 dmn->state = DAEMON_UP;
699 gs.numdown--;
700 dmn->connect_tries = 0;
701 zlog_notice("%s state -> up : %s", dmn->name, why);
702 daemon_send_ready();
703 if (gs.do_ping)
704 SET_WAKEUP_ECHO(dmn);
705 phase_check();
706 }
707
708 static int check_connect(struct thread *t_write)
709 {
710 struct daemon *dmn = THREAD_ARG(t_write);
711 int sockerr;
712 socklen_t reslen = sizeof(sockerr);
713
714 dmn->t_write = NULL;
715 if (getsockopt(dmn->fd, SOL_SOCKET, SO_ERROR, (char *)&sockerr, &reslen)
716 < 0) {
717 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn->name,
718 safe_strerror(errno));
719 daemon_down(dmn,
720 "getsockopt failed checking connection success");
721 return 0;
722 }
723 if ((reslen == sizeof(sockerr)) && sockerr) {
724 char why[100];
725 snprintf(
726 why, sizeof(why),
727 "getsockopt reports that connection attempt failed: %s",
728 safe_strerror(sockerr));
729 daemon_down(dmn, why);
730 return 0;
731 }
732
733 daemon_up(dmn, "delayed connect succeeded");
734 return 0;
735 }
736
737 static int wakeup_connect_hanging(struct thread *t_wakeup)
738 {
739 struct daemon *dmn = THREAD_ARG(t_wakeup);
740 char why[100];
741
742 dmn->t_wakeup = NULL;
743 snprintf(why, sizeof(why),
744 "connection attempt timed out after %ld seconds", gs.timeout);
745 daemon_down(dmn, why);
746 return 0;
747 }
748
749 /* Making connection to protocol daemon. */
750 static int try_connect(struct daemon *dmn)
751 {
752 int sock;
753 struct sockaddr_un addr;
754 socklen_t len;
755
756 if (gs.loglevel > LOG_DEBUG + 1)
757 zlog_debug("%s: attempting to connect", dmn->name);
758 dmn->connect_tries++;
759
760 memset(&addr, 0, sizeof(struct sockaddr_un));
761 addr.sun_family = AF_UNIX;
762 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty", gs.vtydir,
763 dmn->name);
764 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
765 len = addr.sun_len = SUN_LEN(&addr);
766 #else
767 len = sizeof(addr.sun_family) + strlen(addr.sun_path);
768 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
769
770 /* Quick check to see if we might succeed before we go to the trouble
771 of creating a socket. */
772 if (access(addr.sun_path, W_OK) < 0) {
773 if (errno != ENOENT)
774 zlog_err("%s: access to socket %s denied: %s",
775 dmn->name, addr.sun_path,
776 safe_strerror(errno));
777 return -1;
778 }
779
780 if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
781 zlog_err("%s(%s): cannot make socket: %s", __func__,
782 addr.sun_path, safe_strerror(errno));
783 return -1;
784 }
785
786 if (set_nonblocking(sock) < 0 || set_cloexec(sock) < 0) {
787 zlog_err("%s(%s): set_nonblocking/cloexec(%d) failed", __func__,
788 addr.sun_path, sock);
789 close(sock);
790 return -1;
791 }
792
793 if (connect(sock, (struct sockaddr *)&addr, len) < 0) {
794 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK)) {
795 if (gs.loglevel > LOG_DEBUG)
796 zlog_debug("%s(%s): connect failed: %s",
797 __func__, addr.sun_path,
798 safe_strerror(errno));
799 close(sock);
800 return -1;
801 }
802 if (gs.loglevel > LOG_DEBUG)
803 zlog_debug("%s: connection in progress", dmn->name);
804 dmn->state = DAEMON_CONNECTING;
805 dmn->fd = sock;
806 dmn->t_write = NULL;
807 thread_add_write(master, check_connect, dmn, dmn->fd,
808 &dmn->t_write);
809 dmn->t_wakeup = NULL;
810 thread_add_timer(master, wakeup_connect_hanging, dmn,
811 gs.timeout, &dmn->t_wakeup);
812 SET_READ_HANDLER(dmn);
813 return 0;
814 }
815
816 dmn->fd = sock;
817 SET_READ_HANDLER(dmn);
818 daemon_up(dmn, "connect succeeded");
819 return 1;
820 }
821
822 static int phase_hanging(struct thread *t_hanging)
823 {
824 gs.t_phase_hanging = NULL;
825 zlog_err("Phase [%s] hanging for %ld seconds, aborting phased restart",
826 phase_str[gs.phase], PHASE_TIMEOUT);
827 gs.phase = PHASE_NONE;
828 return 0;
829 }
830
831 static void set_phase(restart_phase_t new_phase)
832 {
833 gs.phase = new_phase;
834 if (gs.t_phase_hanging)
835 thread_cancel(gs.t_phase_hanging);
836 gs.t_phase_hanging = NULL;
837 thread_add_timer(master, phase_hanging, NULL, PHASE_TIMEOUT,
838 &gs.t_phase_hanging);
839 }
840
841 static void phase_check(void)
842 {
843 switch (gs.phase) {
844 case PHASE_NONE:
845 break;
846 case PHASE_STOPS_PENDING:
847 if (gs.numpids)
848 break;
849 zlog_info(
850 "Phased restart: all routing daemon stop jobs have completed.");
851 set_phase(PHASE_WAITING_DOWN);
852
853 /*FALLTHRU*/
854 case PHASE_WAITING_DOWN:
855 if (gs.numdown + IS_UP(gs.special) < gs.numdaemons)
856 break;
857 zlog_info("Phased restart: all routing daemons now down.");
858 run_job(&gs.special->restart, "restart", gs.restart_command, 1,
859 1);
860 set_phase(PHASE_ZEBRA_RESTART_PENDING);
861
862 /*FALLTHRU*/
863 case PHASE_ZEBRA_RESTART_PENDING:
864 if (gs.special->restart.pid)
865 break;
866 zlog_info("Phased restart: %s restart job completed.",
867 gs.special->name);
868 set_phase(PHASE_WAITING_ZEBRA_UP);
869
870 /*FALLTHRU*/
871 case PHASE_WAITING_ZEBRA_UP:
872 if (!IS_UP(gs.special))
873 break;
874 zlog_info("Phased restart: %s is now up.", gs.special->name);
875 {
876 struct daemon *dmn;
877 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
878 if (dmn != gs.special)
879 run_job(&dmn->restart, "start",
880 gs.start_command, 1, 0);
881 }
882 }
883 gs.phase = PHASE_NONE;
884 THREAD_OFF(gs.t_phase_hanging);
885 zlog_notice("Phased global restart has completed.");
886 break;
887 }
888 }
889
890 static void try_restart(struct daemon *dmn)
891 {
892 switch (gs.mode) {
893 case MODE_MONITOR:
894 return;
895 case MODE_GLOBAL_RESTART:
896 run_job(&gs.restart, "restart", gs.restart_command, 0, 1);
897 break;
898 case MODE_SEPARATE_RESTART:
899 run_job(&dmn->restart, "restart", gs.restart_command, 0, 1);
900 break;
901 case MODE_PHASED_ZEBRA_RESTART:
902 if (dmn != gs.special) {
903 if ((gs.special->state == DAEMON_UP)
904 && (gs.phase == PHASE_NONE))
905 run_job(&dmn->restart, "restart",
906 gs.restart_command, 0, 1);
907 else
908 zlog_debug(
909 "%s: postponing restart attempt because master %s daemon "
910 "not up [%s], or phased restart in progress",
911 dmn->name, gs.special->name,
912 state_str[gs.special->state]);
913 break;
914 }
915
916 /*FALLTHRU*/
917 case MODE_PHASED_ALL_RESTART:
918 if ((gs.phase != PHASE_NONE) || gs.numpids) {
919 if (gs.loglevel > LOG_DEBUG + 1)
920 zlog_debug(
921 "postponing phased global restart: restart already in "
922 "progress [%s], or outstanding child processes [%d]",
923 phase_str[gs.phase], gs.numpids);
924 break;
925 }
926 /* Is it too soon for a restart? */
927 {
928 struct timeval delay;
929 if (time_elapsed(&delay, &gs.special->restart.time)
930 ->tv_sec
931 < gs.special->restart.interval) {
932 if (gs.loglevel > LOG_DEBUG + 1)
933 zlog_debug(
934 "postponing phased global restart: "
935 "elapsed time %ld < retry interval %ld",
936 (long)delay.tv_sec,
937 gs.special->restart.interval);
938 break;
939 }
940 }
941 run_job(&gs.restart, "restart", gs.restart_command, 0, 1);
942 break;
943 default:
944 zlog_err("error: unknown restart mode %d", gs.mode);
945 break;
946 }
947 }
948
949 static int wakeup_unresponsive(struct thread *t_wakeup)
950 {
951 struct daemon *dmn = THREAD_ARG(t_wakeup);
952
953 dmn->t_wakeup = NULL;
954 if (dmn->state != DAEMON_UNRESPONSIVE)
955 zlog_err(
956 "%s: no longer unresponsive (now %s), "
957 "wakeup should have been cancelled!",
958 dmn->name, state_str[dmn->state]);
959 else {
960 SET_WAKEUP_UNRESPONSIVE(dmn);
961 try_restart(dmn);
962 }
963 return 0;
964 }
965
966 static int wakeup_no_answer(struct thread *t_wakeup)
967 {
968 struct daemon *dmn = THREAD_ARG(t_wakeup);
969
970 dmn->t_wakeup = NULL;
971 dmn->state = DAEMON_UNRESPONSIVE;
972 zlog_err(
973 "%s state -> unresponsive : no response yet to ping "
974 "sent %ld seconds ago",
975 dmn->name, gs.timeout);
976 if (gs.unresponsive_restart) {
977 SET_WAKEUP_UNRESPONSIVE(dmn);
978 try_restart(dmn);
979 }
980 return 0;
981 }
982
983 static int wakeup_send_echo(struct thread *t_wakeup)
984 {
985 static const char echocmd[] = "echo " PING_TOKEN;
986 ssize_t rc;
987 struct daemon *dmn = THREAD_ARG(t_wakeup);
988
989 dmn->t_wakeup = NULL;
990 if (((rc = write(dmn->fd, echocmd, sizeof(echocmd))) < 0)
991 || ((size_t)rc != sizeof(echocmd))) {
992 char why[100 + sizeof(echocmd)];
993 snprintf(why, sizeof(why),
994 "write '%s' returned %d instead of %u", echocmd,
995 (int)rc, (u_int)sizeof(echocmd));
996 daemon_down(dmn, why);
997 } else {
998 gettimeofday(&dmn->echo_sent, NULL);
999 dmn->t_wakeup = NULL;
1000 thread_add_timer(master, wakeup_no_answer, dmn, gs.timeout,
1001 &dmn->t_wakeup);
1002 }
1003 return 0;
1004 }
1005
/*
 * Handler for SIGINT and SIGTERM: log the shutdown, tell systemd that we
 * are stopping and exit with status 0.
 */
static void sigint(void)
{
	zlog_notice("Terminating on signal");
	systemd_send_stopping();

	exit(EXIT_SUCCESS);
}
1012
/*
 * Check that cmd is usable as a command template: it must contain exactly
 * one '%' character, and that character must be immediately followed by
 * 's'.  Returns 1 if so, 0 otherwise.
 */
static int valid_command(const char *cmd)
{
	const char *percent = strchr(cmd, '%');

	if (percent == NULL)
		return 0;
	if (percent[1] != 's')
		return 0;

	/* No second '%' may follow the "%s". */
	return strchr(percent + 1, '%') == NULL;
}
1020
/*
 * This is an ugly hack to circumvent problems with passing command-line
 * arguments that contain spaces.  The fix is to use a configuration file.
 *
 * Returns a newly allocated copy of cmd in which every occurrence of
 * blankstr has been replaced by a single space.  Occurrences are matched
 * left to right and do not overlap; text produced by a replacement is not
 * scanned again, so the function terminates even when blankstr itself
 * contains a space.  An empty blankstr matches nothing and yields a plain
 * copy.  The process exits with status 1 if memory cannot be allocated.
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	char *res;
	char *p;
	size_t bslen = strlen(blankstr);

	if (!(res = strdup(cmd))) {
		perror("strdup");
		exit(1);
	}

	/* strstr() matches "" everywhere: nothing sensible to replace. */
	if (bslen == 0)
		return res;

	/* Resume just past the inserted space after each replacement. */
	for (p = res; (p = strstr(p, blankstr)) != NULL; p++) {
		*p = ' ';
		if (bslen != 1)
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
	}
	return res;
}
1040
1041 struct zebra_privs_t watchfrr_privs = {
1042 #ifdef VTY_GROUP
1043 .vty_group = VTY_GROUP,
1044 #endif
1045 };
1046
1047 static struct quagga_signal_t watchfrr_signals[] = {
1048 {
1049 .signal = SIGINT,
1050 .handler = sigint,
1051 },
1052 {
1053 .signal = SIGTERM,
1054 .handler = sigint,
1055 },
1056 {
1057 .signal = SIGCHLD,
1058 .handler = sigchild,
1059 },
1060 };
1061
1062 FRR_DAEMON_INFO(watchfrr, WATCHFRR,
1063 .flags = FRR_NO_PRIVSEP | FRR_NO_TCPVTY | FRR_LIMITED_CLI
1064 | FRR_NO_CFG_PID_DRY | FRR_NO_ZCLIENT,
1065
1066 .printhelp = printhelp,
1067 .copyright = "Copyright 2004 Andrew J. Schorr",
1068
1069 .signals = watchfrr_signals,
1070 .n_signals = array_size(watchfrr_signals),
1071
1072 .privs = &watchfrr_privs, )
1073
1074 int main(int argc, char **argv)
1075 {
1076 int opt;
1077 const char *pidfile = DEFAULT_PIDFILE;
1078 const char *special = "zebra";
1079 const char *blankstr = NULL;
1080
1081 frr_preinit(&watchfrr_di, argc, argv);
1082 progname = watchfrr_di.progname;
1083
1084 frr_opt_add("aAb:dek:l:i:p:r:R:S:s:t:T:z", longopts, "");
1085
1086 gs.restart.name = "all";
1087 while ((opt = frr_getopt(argc, argv, NULL)) != EOF) {
1088 switch (opt) {
1089 case 0:
1090 break;
1091 case 'a':
1092 if ((gs.mode != MODE_MONITOR)
1093 && (gs.mode != MODE_SEPARATE_RESTART)) {
1094 fputs("Ambiguous operating mode selected.\n",
1095 stderr);
1096 frr_help_exit(1);
1097 }
1098 gs.mode = MODE_PHASED_ZEBRA_RESTART;
1099 break;
1100 case 'A':
1101 if ((gs.mode != MODE_MONITOR)
1102 && (gs.mode != MODE_SEPARATE_RESTART)) {
1103 fputs("Ambiguous operating mode selected.\n",
1104 stderr);
1105 frr_help_exit(1);
1106 }
1107 gs.mode = MODE_PHASED_ALL_RESTART;
1108 break;
1109 case 'b':
1110 blankstr = optarg;
1111 break;
1112 case 'e':
1113 gs.do_ping = 0;
1114 break;
1115 case 'k':
1116 if (!valid_command(optarg)) {
1117 fprintf(stderr,
1118 "Invalid kill command, must contain '%%s': %s\n",
1119 optarg);
1120 frr_help_exit(1);
1121 }
1122 gs.stop_command = optarg;
1123 break;
1124 case 'l': {
1125 char garbage[3];
1126 if ((sscanf(optarg, "%d%1s", &gs.loglevel, garbage)
1127 != 1)
1128 || (gs.loglevel < LOG_EMERG)) {
1129 fprintf(stderr,
1130 "Invalid loglevel argument: %s\n",
1131 optarg);
1132 frr_help_exit(1);
1133 }
1134 } break;
1135 case OPTION_MINRESTART: {
1136 char garbage[3];
1137 if ((sscanf(optarg, "%ld%1s", &gs.min_restart_interval,
1138 garbage)
1139 != 1)
1140 || (gs.min_restart_interval < 0)) {
1141 fprintf(stderr,
1142 "Invalid min_restart_interval argument: %s\n",
1143 optarg);
1144 frr_help_exit(1);
1145 }
1146 } break;
1147 case OPTION_MAXRESTART: {
1148 char garbage[3];
1149 if ((sscanf(optarg, "%ld%1s", &gs.max_restart_interval,
1150 garbage)
1151 != 1)
1152 || (gs.max_restart_interval < 0)) {
1153 fprintf(stderr,
1154 "Invalid max_restart_interval argument: %s\n",
1155 optarg);
1156 frr_help_exit(1);
1157 }
1158 } break;
1159 case 'i': {
1160 char garbage[3];
1161 int period;
1162 if ((sscanf(optarg, "%d%1s", &period, garbage) != 1)
1163 || (gs.period < 1)) {
1164 fprintf(stderr,
1165 "Invalid interval argument: %s\n",
1166 optarg);
1167 frr_help_exit(1);
1168 }
1169 gs.period = 1000 * period;
1170 } break;
1171 case 'p':
1172 pidfile = optarg;
1173 break;
1174 case 'r':
1175 if ((gs.mode == MODE_GLOBAL_RESTART)
1176 || (gs.mode == MODE_SEPARATE_RESTART)) {
1177 fputs("Ambiguous operating mode selected.\n",
1178 stderr);
1179 frr_help_exit(1);
1180 }
1181 if (!valid_command(optarg)) {
1182 fprintf(stderr,
1183 "Invalid restart command, must contain '%%s': %s\n",
1184 optarg);
1185 frr_help_exit(1);
1186 }
1187 gs.restart_command = optarg;
1188 if (gs.mode == MODE_MONITOR)
1189 gs.mode = MODE_SEPARATE_RESTART;
1190 break;
1191 case 'R':
1192 if (gs.mode != MODE_MONITOR) {
1193 fputs("Ambiguous operating mode selected.\n",
1194 stderr);
1195 frr_help_exit(1);
1196 }
1197 if (strchr(optarg, '%')) {
1198 fprintf(stderr,
1199 "Invalid restart-all arg, must not contain '%%s': %s\n",
1200 optarg);
1201 frr_help_exit(1);
1202 }
1203 gs.restart_command = optarg;
1204 gs.mode = MODE_GLOBAL_RESTART;
1205 break;
1206 case 's':
1207 if (!valid_command(optarg)) {
1208 fprintf(stderr,
1209 "Invalid start command, must contain '%%s': %s\n",
1210 optarg);
1211 frr_help_exit(1);
1212 }
1213 gs.start_command = optarg;
1214 break;
1215 case 'S':
1216 gs.vtydir = optarg;
1217 break;
1218 case 't': {
1219 char garbage[3];
1220 if ((sscanf(optarg, "%ld%1s", &gs.timeout, garbage)
1221 != 1)
1222 || (gs.timeout < 1)) {
1223 fprintf(stderr,
1224 "Invalid timeout argument: %s\n",
1225 optarg);
1226 frr_help_exit(1);
1227 }
1228 } break;
1229 case 'T': {
1230 char garbage[3];
1231 if ((sscanf(optarg, "%ld%1s", &gs.restart_timeout,
1232 garbage)
1233 != 1)
1234 || (gs.restart_timeout < 1)) {
1235 fprintf(stderr,
1236 "Invalid restart timeout argument: %s\n",
1237 optarg);
1238 frr_help_exit(1);
1239 }
1240 } break;
1241 case 'z':
1242 gs.unresponsive_restart = 1;
1243 break;
1244 default:
1245 fputs("Invalid option.\n", stderr);
1246 frr_help_exit(1);
1247 }
1248 }
1249
1250 if (gs.unresponsive_restart && (gs.mode == MODE_MONITOR)) {
1251 fputs("Option -z requires a -r or -R restart option.\n",
1252 stderr);
1253 frr_help_exit(1);
1254 }
1255 switch (gs.mode) {
1256 case MODE_MONITOR:
1257 if (gs.restart_command || gs.start_command || gs.stop_command) {
1258 fprintf(stderr,
1259 "No kill/(re)start commands needed for %s mode.\n",
1260 mode_str[gs.mode]);
1261 frr_help_exit(1);
1262 }
1263 break;
1264 case MODE_GLOBAL_RESTART:
1265 case MODE_SEPARATE_RESTART:
1266 if (!gs.restart_command || gs.start_command
1267 || gs.stop_command) {
1268 fprintf(stderr,
1269 "No start/kill commands needed in [%s] mode.\n",
1270 mode_str[gs.mode]);
1271 frr_help_exit(1);
1272 }
1273 break;
1274 case MODE_PHASED_ZEBRA_RESTART:
1275 case MODE_PHASED_ALL_RESTART:
1276 if (!gs.restart_command || !gs.start_command
1277 || !gs.stop_command) {
1278 fprintf(stderr,
1279 "Need start, kill, and restart commands in [%s] mode.\n",
1280 mode_str[gs.mode]);
1281 frr_help_exit(1);
1282 }
1283 break;
1284 }
1285
1286 if (blankstr) {
1287 if (gs.restart_command)
1288 gs.restart_command =
1289 translate_blanks(gs.restart_command, blankstr);
1290 if (gs.start_command)
1291 gs.start_command =
1292 translate_blanks(gs.start_command, blankstr);
1293 if (gs.stop_command)
1294 gs.stop_command =
1295 translate_blanks(gs.stop_command, blankstr);
1296 }
1297
1298 gs.restart.interval = gs.min_restart_interval;
1299
1300 master = frr_init();
1301
1302 zlog_set_level(ZLOG_DEST_MONITOR, ZLOG_DISABLED);
1303 if (watchfrr_di.daemon_mode) {
1304 zlog_set_level(ZLOG_DEST_SYSLOG, MIN(gs.loglevel, LOG_DEBUG));
1305 if (daemon(0, 0) < 0) {
1306 fprintf(stderr, "Watchfrr daemon failed: %s",
1307 strerror(errno));
1308 exit(1);
1309 }
1310 } else
1311 zlog_set_level(ZLOG_DEST_STDOUT, MIN(gs.loglevel, LOG_DEBUG));
1312
1313 watchfrr_vty_init();
1314
1315 frr_vty_serv();
1316
1317 {
1318 int i;
1319 struct daemon *tail = NULL;
1320
1321 for (i = optind; i < argc; i++) {
1322 struct daemon *dmn;
1323
1324 if (!(dmn = (struct daemon *)calloc(1, sizeof(*dmn)))) {
1325 fprintf(stderr, "calloc(1,%u) failed: %s\n",
1326 (u_int)sizeof(*dmn),
1327 safe_strerror(errno));
1328 return 1;
1329 }
1330 dmn->name = dmn->restart.name = argv[i];
1331 dmn->state = DAEMON_INIT;
1332 gs.numdaemons++;
1333 gs.numdown++;
1334 dmn->fd = -1;
1335 dmn->t_wakeup = NULL;
1336 thread_add_timer_msec(master, wakeup_init, dmn,
1337 100 + (random() % 900),
1338 &dmn->t_wakeup);
1339 dmn->restart.interval = gs.min_restart_interval;
1340 if (tail)
1341 tail->next = dmn;
1342 else
1343 gs.daemons = dmn;
1344 tail = dmn;
1345
1346 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART)
1347 || (gs.mode == MODE_PHASED_ALL_RESTART))
1348 && !strcmp(dmn->name, special))
1349 gs.special = dmn;
1350 }
1351 }
1352 if (!gs.daemons) {
1353 fputs("Must specify one or more daemons to monitor.\n", stderr);
1354 frr_help_exit(1);
1355 }
1356 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART)
1357 || (gs.mode == MODE_PHASED_ALL_RESTART))
1358 && !gs.special) {
1359 fprintf(stderr,
1360 "In mode [%s], but cannot find master daemon %s\n",
1361 mode_str[gs.mode], special);
1362 frr_help_exit(1);
1363 }
1364
1365 /* Make sure we're not already running. */
1366 pid_output(pidfile);
1367
1368 /* Announce which daemons are being monitored. */
1369 {
1370 struct daemon *dmn;
1371 size_t len = 0;
1372
1373 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1374 len += strlen(dmn->name) + 1;
1375
1376 {
1377 char buf[len + 1];
1378 char *p = buf;
1379
1380 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1381 if (p != buf)
1382 *p++ = ' ';
1383 strcpy(p, dmn->name);
1384 p += strlen(p);
1385 }
1386 zlog_notice("%s %s watching [%s], mode [%s]", progname,
1387 FRR_VERSION, buf, mode_str[gs.mode]);
1388 }
1389 }
1390
1391 {
1392 struct thread thread;
1393
1394 while (thread_fetch(master, &thread))
1395 thread_call(&thread);
1396 }
1397
1398 systemd_send_stopping();
1399 /* Not reached. */
1400 return 0;
1401 }