]> git.proxmox.com Git - mirror_frr.git/blob - watchfrr/watchfrr.c
Merge remote-tracking branch 'origin/stable/2.0'
[mirror_frr.git] / watchfrr / watchfrr.c
1 /*
2 * Monitor status of frr daemons and restart if necessary.
3 *
4 * Copyright (C) 2004 Andrew J. Schorr
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include <zebra.h>
22 #include <thread.h>
23 #include <log.h>
24 #include <network.h>
25 #include <sigevent.h>
26 #include <lib/version.h>
27 #include "command.h"
28 #include "memory_vty.h"
29 #include "libfrr.h"
30
31 #include <getopt.h>
32 #include <sys/un.h>
33 #include <sys/wait.h>
34 #include <memory.h>
35 #include <systemd.h>
36
37 #include "watchfrr.h"
38
#ifndef MIN
/* Smaller of X and Y.  Each argument may be evaluated twice. */
#define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
#endif

/* Macros to help randomize timers. */
/* JITTER(X): for non-negative X, a random offset in [-(X)/2, (X)-(X)/2]. */
#define JITTER(X) ((random() % ((X)+1))-((X)/2))
/* FUZZY(X): X shifted by JITTER() of one twentieth of X. */
#define FUZZY(X) ((X)+JITTER((X)/20))

/*
 * Built-in defaults.  The help text printed by printhelp() describes the
 * time values as seconds; DEFAULT_PERIOD is multiplied by 1000 where it is
 * stored into gs.period.
 */
#define DEFAULT_PERIOD 5
#define DEFAULT_TIMEOUT 10
#define DEFAULT_RESTART_TIMEOUT 20
#define DEFAULT_LOGLEVEL LOG_INFO
#define DEFAULT_MIN_RESTART 60
#define DEFAULT_MAX_RESTART 600
/* Pid file location: build-time override if provided, else under STATEDIR. */
#ifdef PATH_WATCHFRR_PID
#define DEFAULT_PIDFILE PATH_WATCHFRR_PID
#else
#define DEFAULT_PIDFILE STATEDIR "/watchfrr.pid"
#endif
/* Directory holding the daemons' "<name>.vty" sockets (see try_connect). */
#ifdef DAEMON_VTY_DIR
#define VTYDIR DAEMON_VTY_DIR
#else
#define VTYDIR STATEDIR
#endif

/* Token sent with the "echo" command and expected back in the reply. */
#define PING_TOKEN "PING"
65
/*
 * Event-loop handle passed as the first argument to every thread_add_*()
 * call in this file.
 * Needs to be global, referenced somewhere inside libfrr.
 */
struct thread_master *master;
68
/*
 * Operating mode; selects what try_restart() does when a daemon is in
 * trouble.  The numeric values index mode_str[] and match the mode numbers
 * used in the help text.
 */
typedef enum {
	MODE_MONITOR = 0,		/* report only, never restart */
	MODE_GLOBAL_RESTART,		/* one job (gs.restart) for everything */
	MODE_SEPARATE_RESTART,		/* per-daemon restart job */
	MODE_PHASED_ZEBRA_RESTART,	/* special handling when gs.special fails */
	MODE_PHASED_ALL_RESTART		/* special handling for any failure */
} watch_mode_t;

/* Human-readable mode names, indexed by watch_mode_t. */
static const char *mode_str[] = {
	"monitor",
	"global restart",
	"individual daemon restart",
	"phased zebra restart",
	"phased global restart for any failure",
};
84
/*
 * Progress of a phased restart; phase_check() advances through these in
 * declaration order and returns to PHASE_NONE when done or on timeout.
 */
typedef enum {
	PHASE_NONE = 0,
	PHASE_STOPS_PENDING,
	PHASE_WAITING_DOWN,
	PHASE_ZEBRA_RESTART_PENDING,
	PHASE_WAITING_ZEBRA_UP
} restart_phase_t;

/*
 * Human-readable phase names, indexed by restart_phase_t.
 * NOTE(review): the table has one more entry ("Start jobs running") than
 * the enum has values; no visible code indexes that last slot.
 */
static const char *phase_str[] = {
	"None",
	"Stop jobs running",
	"Waiting for other daemons to come down",
	"Zebra restart job running",
	"Waiting for zebra to come up",
	"Start jobs running",
};

/* Longest time a single phase may last before phase_hanging() aborts it. */
#define PHASE_TIMEOUT (3*gs.restart_timeout)
103
/* Bookkeeping for one background shell job (see run_job / sigchild). */
struct restart_info {
	const char *name;	/* substituted into the command; logged */
	const char *what;	/* job type, set from run_job()'s cmdtype */
	pid_t pid;		/* running job's pid, 0 when none is running */
	struct timeval time;	/* job start time; reset when it is reaped */
	long interval;		/* compared against elapsed tv_sec in run_job */
	struct thread *t_kill;	/* timer that fires restart_kill() */
	int kills;		/* signals sent so far: 0 -> SIGTERM, else SIGKILL */
};
113
/* Process-wide configuration and runtime state. */
static struct global_state {
	watch_mode_t mode;
	restart_phase_t phase;
	struct thread *t_phase_hanging;	/* timer armed by set_phase() */
	const char *vtydir;		/* directory of the "<name>.vty" sockets */
	long period;			/* passed to thread_add_timer_msec() */
	long timeout;			/* passed to thread_add_timer() */
	long restart_timeout;		/* delay before restart_kill() fires */
	long min_restart_interval;
	long max_restart_interval;
	int do_ping;			/* non-zero: schedule echo pings */
	struct daemon *daemons;		/* head of list linked through ->next */
	const char *restart_command;
	const char *start_command;
	const char *stop_command;
	struct restart_info restart;	/* job record used in global mode */
	int unresponsive_restart;	/* non-zero: restart unresponsive daemons */
	int loglevel;
	struct daemon *special;	/* points to zebra when doing phased restart */
	int numdaemons;
	int numpids;			/* ++ in run_job(), -- in sigchild() */
	int numdown;		/* # of daemons that are not UP or UNRESPONSIVE */
} gs = {
	.mode = MODE_MONITOR,
	.phase = PHASE_NONE,
	.vtydir = VTYDIR,
	.period = 1000 * DEFAULT_PERIOD,
	.timeout = DEFAULT_TIMEOUT,
	.restart_timeout = DEFAULT_RESTART_TIMEOUT,
	.loglevel = DEFAULT_LOGLEVEL,
	.min_restart_interval = DEFAULT_MIN_RESTART,
	.max_restart_interval = DEFAULT_MAX_RESTART,
	.do_ping = 1,
};
144
/* Connection state of a monitored daemon; the values index state_str[]. */
typedef enum {
	DAEMON_INIT,
	DAEMON_DOWN,
	DAEMON_CONNECTING,
	DAEMON_UP,
	DAEMON_UNRESPONSIVE
} daemon_state_t;

/* True when the daemon counts as running: UP or UNRESPONSIVE. */
#define IS_UP(DMN) \
	(((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))

/* Human-readable state names, indexed by daemon_state_t. */
static const char *state_str[] = {
	"Init",
	"Down",
	"Connecting",
	"Up",
	"Unresponsive",
};
163
/* One monitored daemon; instances are chained from gs.daemons. */
struct daemon {
	const char *name;		/* used to build "<vtydir>/<name>.vty" */
	daemon_state_t state;
	int fd;				/* vty socket; daemon_down() resets to -1 */
	struct timeval echo_sent;	/* tv_sec == 0 means no echo outstanding */
	u_int connect_tries;		/* ++ per try_connect(), 0 in daemon_up() */
	struct thread *t_wakeup;	/* pending timer, if any */
	struct thread *t_read;		/* pending read event, if any */
	struct thread *t_write;		/* pending write event, if any */
	struct daemon *next;
	struct restart_info restart;	/* per-daemon background job record */
};
176
/* Option codes for long options that have no single-letter equivalent. */
#define OPTION_MINRESTART 2000
#define OPTION_MAXRESTART 2001

/*
 * Long command-line options.  With a NULL flag pointer the last field is
 * the value getopt_long() returns for the option.  The table is handed to
 * frr_opt_add() in main().
 */
static const struct option longopts[] = {
	{"daemon", no_argument, NULL, 'd'},
	{"statedir", required_argument, NULL, 'S'},
	{"no-echo", no_argument, NULL, 'e'},
	{"loglevel", required_argument, NULL, 'l'},
	{"interval", required_argument, NULL, 'i'},
	{"timeout", required_argument, NULL, 't'},
	{"restart-timeout", required_argument, NULL, 'T'},
	{"restart", required_argument, NULL, 'r'},
	{"start-command", required_argument, NULL, 's'},
	{"kill-command", required_argument, NULL, 'k'},
	{"restart-all", required_argument, NULL, 'R'},
	{"all-restart", no_argument, NULL, 'a'},
	{"always-all-restart", no_argument, NULL, 'A'},
	{"unresponsive-restart", no_argument, NULL, 'z'},
	{"min-restart-interval", required_argument, NULL, OPTION_MINRESTART },
	{"max-restart-interval", required_argument, NULL, OPTION_MAXRESTART },
	{"pid-file", required_argument, NULL, 'p'},
	{"blank-string", required_argument, NULL, 'b'},
	{"help", no_argument, NULL, 'h'},
	{"version", no_argument, NULL, 'v'},
	{NULL, 0, NULL, 0}	/* terminator */
};
203
/* Forward declarations for functions used before their definitions. */
static int try_connect(struct daemon *dmn);
static int wakeup_send_echo(struct thread *t_wakeup);
static void try_restart(struct daemon *dmn);
static void phase_check(void);

/* Program name shown in the help text; set in main() from watchfrr_di. */
static const char *progname;
210 static void printhelp(FILE *target)
211 {
212 fprintf(target, "Usage : %s [OPTION...] <daemon name> ...\n\n\
213 Watchdog program to monitor status of frr daemons and try to restart\n\
214 them if they are down or unresponsive. It determines whether a daemon is\n\
215 up based on whether it can connect to the daemon's vty unix stream socket.\n\
216 It then repeatedly sends echo commands over that socket to determine whether\n\
217 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
218 on the socket connection and know immediately that the daemon is down.\n\n\
219 The daemons to be monitored should be listed on the command line.\n\n\
220 This program can run in one of 5 modes:\n\n\
221 0. Mode: %s.\n\
222 Just monitor and report on status changes. Example:\n\
223 %s -d zebra ospfd bgpd\n\n\
224 1. Mode: %s.\n\
225 Whenever any daemon hangs or crashes, use the given command to restart\n\
226 them all. Example:\n\
227 %s -dz \\\n\
228 -R '/sbin/service zebra restart; /sbin/service ospfd restart' \\\n\
229 zebra ospfd\n\n\
230 2. Mode: %s.\n\
231 When any single daemon hangs or crashes, restart only the daemon that's\n\
232 in trouble using the supplied restart command. Example:\n\
233 %s -dz -r '/sbin/service %%s restart' zebra ospfd bgpd\n\n\
234 3. Mode: %s.\n\
235 The same as the previous mode, except that there is special treatment when\n\
236 the zebra daemon is in trouble. In that case, a phased restart approach\n\
237 is used: 1. stop all other daemons; 2. restart zebra; 3. start the other\n\
238 daemons. Example:\n\
239 %s -adz -r '/sbin/service %%s restart' \\\n\
240 -s '/sbin/service %%s start' \\\n\
241 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
242 4. Mode: %s.\n\
243 This is the same as the previous mode, except that the phased restart\n\
244 procedure is used whenever any of the daemons hangs or crashes. Example:\n\
245 %s -Adz -r '/sbin/service %%s restart' \\\n\
246 -s '/sbin/service %%s start' \\\n\
247 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
248 As of this writing, it is believed that mode 2 [%s]\n\
249 is not safe, and mode 3 [%s] may not be safe with some of the\n\
250 routing daemons.\n\n\
251 In order to avoid attempting to restart the daemons in a fast loop,\n\
252 the -m and -M options allow you to control the minimum delay between\n\
253 restart commands. The minimum restart delay is recalculated each time\n\
254 a restart is attempted: if the time since the last restart attempt exceeds\n\
255 twice the -M value, then the restart delay is set to the -m value.\n\
256 Otherwise, the interval is doubled (but capped at the -M value).\n\n", progname, mode_str[0], progname, mode_str[1], progname, mode_str[2], progname, mode_str[3], progname, mode_str[4], progname, mode_str[2], mode_str[3]);
257
258 fprintf(target, "Options:\n\
259 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
260 to syslog instead of stdout.\n\
261 -S, --statedir Set the vty socket directory (default is %s)\n\
262 -e, --no-echo Do not ping the daemons to test responsiveness (this\n\
263 option is necessary if the daemons do not support the\n\
264 echo command)\n\
265 -l, --loglevel Set the logging level (default is %d).\n\
266 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
267 but it can be set higher than %d if extra-verbose debugging\n\
268 messages are desired.\n\
269 --min-restart-interval\n\
270 Set the minimum seconds to wait between invocations of daemon\n\
271 restart commands (default is %d).\n\
272 --max-restart-interval\n\
273 Set the maximum seconds to wait between invocations of daemon\n\
274 restart commands (default is %d).\n\
275 -i, --interval Set the status polling interval in seconds (default is %d)\n\
276 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
277 -T, --restart-timeout\n\
278 Set the restart (kill) timeout in seconds (default is %d).\n\
279 If any background jobs are still running after this much\n\
280 time has elapsed, they will be killed.\n\
281 -r, --restart Supply a Bourne shell command to use to restart a single\n\
282 daemon. The command string should include '%%s' where the\n\
283 name of the daemon should be substituted.\n\
284 Note that -r and -R are incompatible.\n\
285 -s, --start-command\n\
286 Supply a Bourne shell to command to use to start a single\n\
287 daemon. The command string should include '%%s' where the\n\
288 name of the daemon should be substituted.\n\
289 -k, --kill-command\n\
290 Supply a Bourne shell to command to use to stop a single\n\
291 daemon. The command string should include '%%s' where the\n\
292 name of the daemon should be substituted.\n\
293 -R, --restart-all\n\
294 When one or more daemons is down, try to restart everything\n\
295 using the Bourne shell command supplied as the argument.\n\
296 Note that -r and -R are incompatible.\n\
297 -z, --unresponsive-restart\n\
298 When a daemon is unresponsive, treat it as being down for\n\
299 restart purposes.\n\
300 -a, --all-restart\n\
301 When zebra hangs or crashes, restart all daemons using\n\
302 this phased approach: 1. stop all other daemons; 2. restart\n\
303 zebra; 3. start other daemons. Requires -r, -s, and -k.\n\
304 -A, --always-all-restart\n\
305 When any daemon (not just zebra) hangs or crashes, use the\n\
306 same phased restart mechanism described above for -a.\n\
307 Requires -r, -s, and -k.\n\
308 -p, --pid-file Set process identifier file name\n\
309 (default is %s).\n\
310 -b, --blank-string\n\
311 When the supplied argument string is found in any of the\n\
312 various shell command arguments (-r, -s, -k, or -R), replace\n\
313 it with a space. This is an ugly hack to circumvent problems\n\
314 passing command-line arguments with embedded spaces.\n\
315 -v, --version Print program version\n\
316 -h, --help Display this help and exit\n", VTYDIR, DEFAULT_LOGLEVEL, LOG_EMERG, LOG_DEBUG, LOG_DEBUG, DEFAULT_MIN_RESTART, DEFAULT_MAX_RESTART, DEFAULT_PERIOD, DEFAULT_TIMEOUT, DEFAULT_RESTART_TIMEOUT, DEFAULT_PIDFILE);
317 }
318
/*
 * Fork and run `shell_cmd` through "/bin/sh -c" without waiting for it.
 *
 * Returns the child's pid in the parent, or -1 if fork() failed.  The child
 * is reaped later by sigchild().
 */
static pid_t run_background(char *shell_cmd)
{
	char shell[] = "sh";
	char dashc[] = "-c";
	char *const argv[4] = { shell, dashc, shell_cmd, NULL };
	pid_t child = fork();

	if (child == -1) {
		zlog_err("fork failed, cannot run command [%s]: %s",
			 shell_cmd, safe_strerror(errno));
		return -1;
	}

	if (child != 0) {
		/* Parent process: we will reap the child later. */
		zlog_err("Forked background command [pid %d]: %s", (int)child,
			 shell_cmd);
		return child;
	}

	/* Child process. */
	/* Use separate process group so child processes can be killed easily. */
	if (setpgid(0, 0) < 0) {
		zlog_warn("warning: setpgid(0,0) failed: %s",
			  safe_strerror(errno));
	}

	execv("/bin/sh", argv);
	/* Only reached when execv() itself failed. */
	zlog_err("execv(/bin/sh -c '%s') failed: %s",
		 shell_cmd, safe_strerror(errno));
	_exit(127);
}
350
351 static struct timeval *time_elapsed(struct timeval *result,
352 const struct timeval *start_time)
353 {
354 gettimeofday(result, NULL);
355 result->tv_sec -= start_time->tv_sec;
356 result->tv_usec -= start_time->tv_usec;
357 while (result->tv_usec < 0) {
358 result->tv_usec += 1000000L;
359 result->tv_sec--;
360 }
361 return result;
362 }
363
364 static int restart_kill(struct thread *t_kill)
365 {
366 struct restart_info *restart = THREAD_ARG(t_kill);
367 struct timeval delay;
368
369 time_elapsed(&delay, &restart->time);
370 zlog_warn("Warning: %s %s child process %d still running after "
371 "%ld seconds, sending signal %d",
372 restart->what, restart->name, (int)restart->pid,
373 (long)delay.tv_sec, (restart->kills ? SIGKILL : SIGTERM));
374 kill(-restart->pid, (restart->kills ? SIGKILL : SIGTERM));
375 restart->kills++;
376 restart->t_kill = NULL;
377 thread_add_timer(master, restart_kill, restart, gs.restart_timeout,
378 &restart->t_kill);
379 return 0;
380 }
381
382 static struct restart_info *find_child(pid_t child)
383 {
384 if (gs.mode == MODE_GLOBAL_RESTART) {
385 if (gs.restart.pid == child)
386 return &gs.restart;
387 } else {
388 struct daemon *dmn;
389 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
390 if (dmn->restart.pid == child)
391 return &dmn->restart;
392 }
393 }
394 return NULL;
395 }
396
397 static void sigchild(void)
398 {
399 pid_t child;
400 int status;
401 const char *name;
402 const char *what;
403 struct restart_info *restart;
404
405 switch (child = waitpid(-1, &status, WNOHANG)) {
406 case -1:
407 zlog_err("waitpid failed: %s", safe_strerror(errno));
408 return;
409 case 0:
410 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
411 return;
412 }
413
414 if (child == integrated_write_pid) {
415 integrated_write_sigchld(status);
416 return;
417 }
418
419 if ((restart = find_child(child)) != NULL) {
420 name = restart->name;
421 what = restart->what;
422 restart->pid = 0;
423 gs.numpids--;
424 thread_cancel(restart->t_kill);
425 restart->t_kill = NULL;
426 /* Update restart time to reflect the time the command completed. */
427 gettimeofday(&restart->time, NULL);
428 } else {
429 zlog_err
430 ("waitpid returned status for an unknown child process %d",
431 (int)child);
432 name = "(unknown)";
433 what = "background";
434 }
435 if (WIFSTOPPED(status))
436 zlog_warn("warning: %s %s process %d is stopped",
437 what, name, (int)child);
438 else if (WIFSIGNALED(status))
439 zlog_warn("%s %s process %d terminated due to signal %d",
440 what, name, (int)child, WTERMSIG(status));
441 else if (WIFEXITED(status)) {
442 if (WEXITSTATUS(status) != 0)
443 zlog_warn
444 ("%s %s process %d exited with non-zero status %d",
445 what, name, (int)child, WEXITSTATUS(status));
446 else
447 zlog_debug("%s %s process %d exited normally", what,
448 name, (int)child);
449 } else
450 zlog_err("cannot interpret %s %s process %d wait status 0x%x",
451 what, name, (int)child, status);
452 phase_check();
453 }
454
455 static int
456 run_job(struct restart_info *restart, const char *cmdtype, const char *command,
457 int force, int update_interval)
458 {
459 struct timeval delay;
460
461 if (gs.loglevel > LOG_DEBUG + 1)
462 zlog_debug("attempting to %s %s", cmdtype, restart->name);
463
464 if (restart->pid) {
465 if (gs.loglevel > LOG_DEBUG + 1)
466 zlog_debug
467 ("cannot %s %s, previous pid %d still running",
468 cmdtype, restart->name, (int)restart->pid);
469 return -1;
470 }
471
472 /* Note: time_elapsed test must come before the force test, since we need
473 to make sure that delay is initialized for use below in updating the
474 restart interval. */
475 if ((time_elapsed(&delay, &restart->time)->tv_sec < restart->interval)
476 && !force) {
477 if (gs.loglevel > LOG_DEBUG + 1)
478 zlog_debug("postponing %s %s: "
479 "elapsed time %ld < retry interval %ld",
480 cmdtype, restart->name, (long)delay.tv_sec,
481 restart->interval);
482 return -1;
483 }
484
485 gettimeofday(&restart->time, NULL);
486 restart->kills = 0;
487 {
488 char cmd[strlen(command) + strlen(restart->name) + 1];
489 snprintf(cmd, sizeof(cmd), command, restart->name);
490 if ((restart->pid = run_background(cmd)) > 0) {
491 restart->t_kill = NULL;
492 thread_add_timer(master, restart_kill, restart, gs.restart_timeout,
493 &restart->t_kill);
494 restart->what = cmdtype;
495 gs.numpids++;
496 } else
497 restart->pid = 0;
498 }
499
500 /* Calculate the new restart interval. */
501 if (update_interval) {
502 if (delay.tv_sec > 2 * gs.max_restart_interval)
503 restart->interval = gs.min_restart_interval;
504 else if ((restart->interval *= 2) > gs.max_restart_interval)
505 restart->interval = gs.max_restart_interval;
506 if (gs.loglevel > LOG_DEBUG + 1)
507 zlog_debug("restart %s interval is now %ld",
508 restart->name, restart->interval);
509 }
510 return restart->pid;
511 }
512
/*
 * Scheduling helpers.  Each one clears the daemon's stored thread pointer
 * and then registers a new event whose handle is written back to it.
 *
 * The expansions deliberately do NOT end in a semicolon: the caller's own
 * ';' completes the statement, so the macros can be used as the body of an
 * if that has an else branch.
 */

/* Wait for input on the daemon's socket; handled by handle_read(). */
#define SET_READ_HANDLER(DMN) \
	do { \
		(DMN)->t_read = NULL; \
		thread_add_read (master, handle_read, (DMN), (DMN)->fd, &(DMN)->t_read); \
	} while (0)

/* Run wakeup_down() after roughly gs.period (randomised by FUZZY). */
#define SET_WAKEUP_DOWN(DMN) \
	do { \
		(DMN)->t_wakeup = NULL; \
		thread_add_timer_msec (master, wakeup_down, (DMN), FUZZY(gs.period), \
				       &(DMN)->t_wakeup); \
	} while (0)

/* Run wakeup_unresponsive() after roughly gs.period. */
#define SET_WAKEUP_UNRESPONSIVE(DMN) \
	do { \
		(DMN)->t_wakeup = NULL; \
		thread_add_timer_msec (master, wakeup_unresponsive, (DMN), \
				       FUZZY(gs.period), &(DMN)->t_wakeup); \
	} while (0)

/* Run wakeup_send_echo() after roughly gs.period. */
#define SET_WAKEUP_ECHO(DMN) \
	do { \
		(DMN)->t_wakeup = NULL; \
		thread_add_timer_msec (master, wakeup_send_echo, (DMN), \
				       FUZZY(gs.period), &(DMN)->t_wakeup); \
	} while (0)
539
540 static int wakeup_down(struct thread *t_wakeup)
541 {
542 struct daemon *dmn = THREAD_ARG(t_wakeup);
543
544 dmn->t_wakeup = NULL;
545 if (try_connect(dmn) < 0)
546 SET_WAKEUP_DOWN(dmn);
547 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
548 try_restart(dmn);
549 return 0;
550 }
551
552 static int wakeup_init(struct thread *t_wakeup)
553 {
554 struct daemon *dmn = THREAD_ARG(t_wakeup);
555
556 dmn->t_wakeup = NULL;
557 if (try_connect(dmn) < 0) {
558 SET_WAKEUP_DOWN(dmn);
559 zlog_err("%s state -> down : initial connection attempt failed",
560 dmn->name);
561 dmn->state = DAEMON_DOWN;
562 }
563 return 0;
564 }
565
566 static void daemon_down(struct daemon *dmn, const char *why)
567 {
568 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
569 zlog_err("%s state -> down : %s", dmn->name, why);
570 else if (gs.loglevel > LOG_DEBUG)
571 zlog_debug("%s still down : %s", dmn->name, why);
572 if (IS_UP(dmn))
573 gs.numdown++;
574 dmn->state = DAEMON_DOWN;
575 if (dmn->fd >= 0) {
576 close(dmn->fd);
577 dmn->fd = -1;
578 }
579 THREAD_OFF(dmn->t_read);
580 THREAD_OFF(dmn->t_write);
581 THREAD_OFF(dmn->t_wakeup);
582 if (try_connect(dmn) < 0)
583 SET_WAKEUP_DOWN(dmn);
584 phase_check();
585 }
586
587 static int handle_read(struct thread *t_read)
588 {
589 struct daemon *dmn = THREAD_ARG(t_read);
590 static const char resp[sizeof(PING_TOKEN) + 4] = PING_TOKEN "\n";
591 char buf[sizeof(resp) + 100];
592 ssize_t rc;
593 struct timeval delay;
594
595 dmn->t_read = NULL;
596 if ((rc = read(dmn->fd, buf, sizeof(buf))) < 0) {
597 char why[100];
598
599 if (ERRNO_IO_RETRY(errno)) {
600 /* Pretend it never happened. */
601 SET_READ_HANDLER(dmn);
602 return 0;
603 }
604 snprintf(why, sizeof(why), "unexpected read error: %s",
605 safe_strerror(errno));
606 daemon_down(dmn, why);
607 return 0;
608 }
609 if (rc == 0) {
610 daemon_down(dmn, "read returned EOF");
611 return 0;
612 }
613 if (!dmn->echo_sent.tv_sec) {
614 char why[sizeof(buf) + 100];
615 snprintf(why, sizeof(why),
616 "unexpected read returns %d bytes: %.*s", (int)rc,
617 (int)rc, buf);
618 daemon_down(dmn, why);
619 return 0;
620 }
621
622 /* We are expecting an echo response: is there any chance that the
623 response would not be returned entirely in the first read? That
624 seems inconceivable... */
625 if ((rc != sizeof(resp)) || memcmp(buf, resp, sizeof(resp))) {
626 char why[100 + sizeof(buf)];
627 snprintf(why, sizeof(why),
628 "read returned bad echo response of %d bytes "
629 "(expecting %u): %.*s", (int)rc, (u_int) sizeof(resp),
630 (int)rc, buf);
631 daemon_down(dmn, why);
632 return 0;
633 }
634
635 time_elapsed(&delay, &dmn->echo_sent);
636 dmn->echo_sent.tv_sec = 0;
637 if (dmn->state == DAEMON_UNRESPONSIVE) {
638 if (delay.tv_sec < gs.timeout) {
639 dmn->state = DAEMON_UP;
640 zlog_warn
641 ("%s state -> up : echo response received after %ld.%06ld "
642 "seconds", dmn->name, (long)delay.tv_sec,
643 (long)delay.tv_usec);
644 } else
645 zlog_warn
646 ("%s: slow echo response finally received after %ld.%06ld "
647 "seconds", dmn->name, (long)delay.tv_sec,
648 (long)delay.tv_usec);
649 } else if (gs.loglevel > LOG_DEBUG + 1)
650 zlog_debug("%s: echo response received after %ld.%06ld seconds",
651 dmn->name, (long)delay.tv_sec, (long)delay.tv_usec);
652
653 SET_READ_HANDLER(dmn);
654 if (dmn->t_wakeup)
655 thread_cancel(dmn->t_wakeup);
656 SET_WAKEUP_ECHO(dmn);
657
658 return 0;
659 }
660
661 /*
662 * Wait till we notice that all daemons are ready before
663 * we send we are ready to systemd
664 */
665 static void daemon_send_ready(void)
666 {
667 static int sent = 0;
668 if (!sent && gs.numdown == 0) {
669 FILE *fp;
670
671 fp = fopen(DAEMON_VTY_DIR "/watchfrr.started", "w");
672 fclose(fp);
673 zlog_notice
674 ("Watchfrr: Notifying Systemd we are up and running");
675 systemd_send_started(master, 0);
676 sent = 1;
677 }
678 }
679
680 static void daemon_up(struct daemon *dmn, const char *why)
681 {
682 dmn->state = DAEMON_UP;
683 gs.numdown--;
684 dmn->connect_tries = 0;
685 zlog_notice("%s state -> up : %s", dmn->name, why);
686 daemon_send_ready();
687 if (gs.do_ping)
688 SET_WAKEUP_ECHO(dmn);
689 phase_check();
690 }
691
692 static int check_connect(struct thread *t_write)
693 {
694 struct daemon *dmn = THREAD_ARG(t_write);
695 int sockerr;
696 socklen_t reslen = sizeof(sockerr);
697
698 dmn->t_write = NULL;
699 if (getsockopt(dmn->fd, SOL_SOCKET, SO_ERROR, (char *)&sockerr, &reslen)
700 < 0) {
701 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn->name,
702 safe_strerror(errno));
703 daemon_down(dmn,
704 "getsockopt failed checking connection success");
705 return 0;
706 }
707 if ((reslen == sizeof(sockerr)) && sockerr) {
708 char why[100];
709 snprintf(why, sizeof(why),
710 "getsockopt reports that connection attempt failed: %s",
711 safe_strerror(sockerr));
712 daemon_down(dmn, why);
713 return 0;
714 }
715
716 daemon_up(dmn, "delayed connect succeeded");
717 return 0;
718 }
719
720 static int wakeup_connect_hanging(struct thread *t_wakeup)
721 {
722 struct daemon *dmn = THREAD_ARG(t_wakeup);
723 char why[100];
724
725 dmn->t_wakeup = NULL;
726 snprintf(why, sizeof(why),
727 "connection attempt timed out after %ld seconds", gs.timeout);
728 daemon_down(dmn, why);
729 return 0;
730 }
731
732 /* Making connection to protocol daemon. */
733 static int try_connect(struct daemon *dmn)
734 {
735 int sock;
736 struct sockaddr_un addr;
737 socklen_t len;
738
739 if (gs.loglevel > LOG_DEBUG + 1)
740 zlog_debug("%s: attempting to connect", dmn->name);
741 dmn->connect_tries++;
742
743 memset(&addr, 0, sizeof(struct sockaddr_un));
744 addr.sun_family = AF_UNIX;
745 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty",
746 gs.vtydir, dmn->name);
747 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
748 len = addr.sun_len = SUN_LEN(&addr);
749 #else
750 len = sizeof(addr.sun_family) + strlen(addr.sun_path);
751 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
752
753 /* Quick check to see if we might succeed before we go to the trouble
754 of creating a socket. */
755 if (access(addr.sun_path, W_OK) < 0) {
756 if (errno != ENOENT)
757 zlog_err("%s: access to socket %s denied: %s",
758 dmn->name, addr.sun_path,
759 safe_strerror(errno));
760 return -1;
761 }
762
763 if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
764 zlog_err("%s(%s): cannot make socket: %s",
765 __func__, addr.sun_path, safe_strerror(errno));
766 return -1;
767 }
768
769 if (set_nonblocking(sock) < 0 || set_cloexec(sock) < 0) {
770 zlog_err("%s(%s): set_nonblocking/cloexec(%d) failed",
771 __func__, addr.sun_path, sock);
772 close(sock);
773 return -1;
774 }
775
776 if (connect(sock, (struct sockaddr *)&addr, len) < 0) {
777 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK)) {
778 if (gs.loglevel > LOG_DEBUG)
779 zlog_debug("%s(%s): connect failed: %s",
780 __func__, addr.sun_path,
781 safe_strerror(errno));
782 close(sock);
783 return -1;
784 }
785 if (gs.loglevel > LOG_DEBUG)
786 zlog_debug("%s: connection in progress", dmn->name);
787 dmn->state = DAEMON_CONNECTING;
788 dmn->fd = sock;
789 dmn->t_write = NULL;
790 thread_add_write(master, check_connect, dmn, dmn->fd,
791 &dmn->t_write);dmn->t_wakeup = NULL;
792 thread_add_timer(master, wakeup_connect_hanging, dmn, gs.timeout,
793 &dmn->t_wakeup);
794 SET_READ_HANDLER(dmn);
795 return 0;
796 }
797
798 dmn->fd = sock;
799 SET_READ_HANDLER(dmn);
800 daemon_up(dmn, "connect succeeded");
801 return 1;
802 }
803
804 static int phase_hanging(struct thread *t_hanging)
805 {
806 gs.t_phase_hanging = NULL;
807 zlog_err("Phase [%s] hanging for %ld seconds, aborting phased restart",
808 phase_str[gs.phase], PHASE_TIMEOUT);
809 gs.phase = PHASE_NONE;
810 return 0;
811 }
812
813 static void set_phase(restart_phase_t new_phase)
814 {
815 gs.phase = new_phase;
816 if (gs.t_phase_hanging)
817 thread_cancel(gs.t_phase_hanging);
818 gs.t_phase_hanging = NULL;
819 thread_add_timer(master, phase_hanging, NULL, PHASE_TIMEOUT,
820 &gs.t_phase_hanging);
821 }
822
/*
 * Advance the phased-restart state machine as far as current conditions
 * allow.  Called whenever something it depends on may have changed (a child
 * was reaped, a daemon went up or down).
 *
 * Each case either breaks because its exit condition is not yet met, or
 * moves to the next phase and deliberately falls through so that several
 * phases can complete in a single call.
 */
static void phase_check(void)
{
	switch (gs.phase) {
	case PHASE_NONE:
		break;
	case PHASE_STOPS_PENDING:
		/* Wait until every background job has been reaped. */
		if (gs.numpids)
			break;
		zlog_info
		    ("Phased restart: all routing daemon stop jobs have completed.");
		set_phase(PHASE_WAITING_DOWN);

		/*FALLTHRU*/
	case PHASE_WAITING_DOWN:
		/* The special daemon is allowed to be up; all others must be
		   down before it is restarted. */
		if (gs.numdown + IS_UP(gs.special) < gs.numdaemons)
			break;
		zlog_info("Phased restart: all routing daemons now down.");
		run_job(&gs.special->restart, "restart", gs.restart_command, 1,
			1);
		set_phase(PHASE_ZEBRA_RESTART_PENDING);

		/*FALLTHRU*/
	case PHASE_ZEBRA_RESTART_PENDING:
		/* Wait for the special daemon's restart job to be reaped. */
		if (gs.special->restart.pid)
			break;
		zlog_info("Phased restart: %s restart job completed.",
			  gs.special->name);
		set_phase(PHASE_WAITING_ZEBRA_UP);

		/*FALLTHRU*/
	case PHASE_WAITING_ZEBRA_UP:
		if (!IS_UP(gs.special))
			break;
		zlog_info("Phased restart: %s is now up.", gs.special->name);
		{
			/* Start every other daemon, ignoring retry intervals
			   and leaving them unchanged. */
			struct daemon *dmn;
			for (dmn = gs.daemons; dmn; dmn = dmn->next) {
				if (dmn != gs.special)
					run_job(&dmn->restart, "start",
						gs.start_command, 1, 0);
			}
		}
		gs.phase = PHASE_NONE;
		THREAD_OFF(gs.t_phase_hanging);
		zlog_notice("Phased global restart has completed.");
		break;
	}
}
871
/*
 * Decide, according to gs.mode, what to do about `dmn` being down or
 * unresponsive.
 *
 * NOTE(review): in the phased modes the "too soon" test reads
 * gs.special->restart, while the job that is then run uses gs.restart with
 * gs.restart_command; nothing in the visible part of this file enters
 * PHASE_STOPS_PENDING or uses gs.stop_command.  Confirm this is intended.
 */
static void try_restart(struct daemon *dmn)
{
	switch (gs.mode) {
	case MODE_MONITOR:
		return;
	case MODE_GLOBAL_RESTART:
		run_job(&gs.restart, "restart", gs.restart_command, 0, 1);
		break;
	case MODE_SEPARATE_RESTART:
		run_job(&dmn->restart, "restart", gs.restart_command, 0, 1);
		break;
	case MODE_PHASED_ZEBRA_RESTART:
		if (dmn != gs.special) {
			/* An ordinary daemon is restarted on its own, but only
			   while the special daemon is up and no phased restart
			   is running. */
			if ((gs.special->state == DAEMON_UP)
			    && (gs.phase == PHASE_NONE))
				run_job(&dmn->restart, "restart",
					gs.restart_command, 0, 1);
			else
				zlog_debug
				    ("%s: postponing restart attempt because master %s daemon "
				     "not up [%s], or phased restart in progress",
				     dmn->name, gs.special->name,
				     state_str[gs.special->state]);
			break;
		}

		/* The special daemon itself is in trouble: handle it like
		   MODE_PHASED_ALL_RESTART. */
		/*FALLTHRU*/
	case MODE_PHASED_ALL_RESTART:
		if ((gs.phase != PHASE_NONE) || gs.numpids) {
			if (gs.loglevel > LOG_DEBUG + 1)
				zlog_debug
				    ("postponing phased global restart: restart already in "
				     "progress [%s], or outstanding child processes [%d]",
				     phase_str[gs.phase], gs.numpids);
			break;
		}
		/* Is it too soon for a restart? */
		{
			struct timeval delay;
			if (time_elapsed(&delay, &gs.special->restart.time)->
			    tv_sec < gs.special->restart.interval) {
				if (gs.loglevel > LOG_DEBUG + 1)
					zlog_debug
					    ("postponing phased global restart: "
					     "elapsed time %ld < retry interval %ld",
					     (long)delay.tv_sec,
					     gs.special->restart.interval);
				break;
			}
		}
		run_job(&gs.restart, "restart", gs.restart_command, 0, 1);
		break;
	default:
		zlog_err("error: unknown restart mode %d", gs.mode);
		break;
	}
}
929
930 static int wakeup_unresponsive(struct thread *t_wakeup)
931 {
932 struct daemon *dmn = THREAD_ARG(t_wakeup);
933
934 dmn->t_wakeup = NULL;
935 if (dmn->state != DAEMON_UNRESPONSIVE)
936 zlog_err("%s: no longer unresponsive (now %s), "
937 "wakeup should have been cancelled!",
938 dmn->name, state_str[dmn->state]);
939 else {
940 SET_WAKEUP_UNRESPONSIVE(dmn);
941 try_restart(dmn);
942 }
943 return 0;
944 }
945
946 static int wakeup_no_answer(struct thread *t_wakeup)
947 {
948 struct daemon *dmn = THREAD_ARG(t_wakeup);
949
950 dmn->t_wakeup = NULL;
951 dmn->state = DAEMON_UNRESPONSIVE;
952 zlog_err("%s state -> unresponsive : no response yet to ping "
953 "sent %ld seconds ago", dmn->name, gs.timeout);
954 if (gs.unresponsive_restart) {
955 SET_WAKEUP_UNRESPONSIVE(dmn);
956 try_restart(dmn);
957 }
958 return 0;
959 }
960
961 static int wakeup_send_echo(struct thread *t_wakeup)
962 {
963 static const char echocmd[] = "echo " PING_TOKEN;
964 ssize_t rc;
965 struct daemon *dmn = THREAD_ARG(t_wakeup);
966
967 dmn->t_wakeup = NULL;
968 if (((rc = write(dmn->fd, echocmd, sizeof(echocmd))) < 0) ||
969 ((size_t) rc != sizeof(echocmd))) {
970 char why[100 + sizeof(echocmd)];
971 snprintf(why, sizeof(why),
972 "write '%s' returned %d instead of %u", echocmd,
973 (int)rc, (u_int) sizeof(echocmd));
974 daemon_down(dmn, why);
975 } else {
976 gettimeofday(&dmn->echo_sent, NULL);
977 dmn->t_wakeup = NULL;
978 thread_add_timer(master, wakeup_no_answer, dmn, gs.timeout,
979 &dmn->t_wakeup);
980 }
981 return 0;
982 }
983
/*
 * Handler registered for both SIGINT and SIGTERM in watchfrr_signals[]:
 * log the shutdown, tell systemd we are stopping, and exit with status 0.
 */
static void sigint(void)
{
	zlog_notice("Terminating on signal");
	systemd_send_stopping();
	exit(0);
}
990
/*
 * Check that `cmd` is usable as a one-argument printf template: its first
 * '%' must be immediately followed by 's' and no further '%' may follow.
 * Returns 1 if so, 0 otherwise (including when there is no '%' at all).
 */
static int valid_command(const char *cmd)
{
	const char *pct = strchr(cmd, '%');

	if (pct == NULL)
		return 0;
	if (pct[1] != 's')
		return 0;
	return strchr(pct + 1, '%') == NULL;
}
998
/* This is an ugly hack to circumvent problems with passing command-line
   arguments that contain spaces.  The fix is to use a configuration file.

   Returns a newly allocated copy of `cmd` in which every occurrence of
   `blankstr` has been replaced by a single space.  The caller owns the
   result.  Exits the program if memory cannot be allocated.

   The input is scanned once from left to right and replaced text is never
   rescanned, so the function terminates for every `blankstr`, including a
   single space.  An empty `blankstr` matches nothing and yields an
   unmodified copy. */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	size_t bslen = strlen(blankstr);
	const char *in = cmd;
	char *res;
	char *out;

	/* Each replacement is at most as long as what it replaces, so the
	   result never needs more room than the input. */
	res = malloc(strlen(cmd) + 1);
	if (!res) {
		perror("malloc");
		exit(1);
	}

	out = res;
	while (*in != '\0') {
		if (bslen != 0 && strncmp(in, blankstr, bslen) == 0) {
			*out++ = ' ';
			in += bslen;
		} else {
			*out++ = *in++;
		}
	}
	*out = '\0';
	return res;
}
1018
/* Privilege settings referenced by the .privs field of FRR_DAEMON_INFO.
 * Only the VTY group is filled in, and only when the build defines
 * VTY_GROUP; every other member keeps its zero-initialized value. */
struct zebra_privs_t watchfrr_privs = {
#ifdef VTY_GROUP
	.vty_group = VTY_GROUP,
#endif
};
1024
/* Signal table referenced by FRR_DAEMON_INFO.  SIGINT and SIGTERM share
 * the sigint() handler; SIGCHLD is routed to sigchild(). */
static struct quagga_signal_t watchfrr_signals[] = {
	{
		.signal = SIGINT,
		.handler = sigint,
	},
	{
		.signal = SIGTERM,
		.handler = sigint,
	},
	{
		.signal = SIGCHLD,
		.handler = sigchild,
	},
};
1039
/* Daemon descriptor for watchfrr: startup flags, help callback, copyright
 * string, signal table and privilege settings.
 * NOTE(review): the macro presumably defines the watchfrr_di object that
 * main() passes to frr_preinit() -- confirm against libfrr.h. */
FRR_DAEMON_INFO(watchfrr, WATCHFRR,
	.flags = FRR_NO_PRIVSEP | FRR_NO_TCPVTY | FRR_LIMITED_CLI
		| FRR_NO_CFG_PID_DRY | FRR_NO_ZCLIENT,

	.printhelp = printhelp,
	.copyright = "Copyright 2004 Andrew J. Schorr",

	.signals = watchfrr_signals,
	.n_signals = array_size(watchfrr_signals),

	.privs = &watchfrr_privs,
)
1052
1053 int main(int argc, char **argv)
1054 {
1055 int opt;
1056 const char *pidfile = DEFAULT_PIDFILE;
1057 const char *special = "zebra";
1058 const char *blankstr = NULL;
1059
1060 frr_preinit(&watchfrr_di, argc, argv);
1061 progname = watchfrr_di.progname;
1062
1063 frr_opt_add("aAb:dek:l:i:p:r:R:S:s:t:T:z", longopts, "");
1064
1065 gs.restart.name = "all";
1066 while ((opt = frr_getopt(argc, argv, NULL)) != EOF) {
1067 switch (opt) {
1068 case 0:
1069 break;
1070 case 'a':
1071 if ((gs.mode != MODE_MONITOR)
1072 && (gs.mode != MODE_SEPARATE_RESTART)) {
1073 fputs("Ambiguous operating mode selected.\n",
1074 stderr);
1075 frr_help_exit(1);
1076 }
1077 gs.mode = MODE_PHASED_ZEBRA_RESTART;
1078 break;
1079 case 'A':
1080 if ((gs.mode != MODE_MONITOR)
1081 && (gs.mode != MODE_SEPARATE_RESTART)) {
1082 fputs("Ambiguous operating mode selected.\n",
1083 stderr);
1084 frr_help_exit(1);
1085 }
1086 gs.mode = MODE_PHASED_ALL_RESTART;
1087 break;
1088 case 'b':
1089 blankstr = optarg;
1090 break;
1091 case 'e':
1092 gs.do_ping = 0;
1093 break;
1094 case 'k':
1095 if (!valid_command(optarg)) {
1096 fprintf(stderr,
1097 "Invalid kill command, must contain '%%s': %s\n",
1098 optarg);
1099 frr_help_exit(1);
1100 }
1101 gs.stop_command = optarg;
1102 break;
1103 case 'l':
1104 {
1105 char garbage[3];
1106 if ((sscanf
1107 (optarg, "%d%1s", &gs.loglevel,
1108 garbage) != 1)
1109 || (gs.loglevel < LOG_EMERG)) {
1110 fprintf(stderr,
1111 "Invalid loglevel argument: %s\n",
1112 optarg);
1113 frr_help_exit(1);
1114 }
1115 }
1116 break;
1117 case OPTION_MINRESTART:
1118 {
1119 char garbage[3];
1120 if ((sscanf(optarg, "%ld%1s",
1121 &gs.min_restart_interval,
1122 garbage) != 1)
1123 || (gs.min_restart_interval < 0)) {
1124 fprintf(stderr,
1125 "Invalid min_restart_interval argument: %s\n",
1126 optarg);
1127 frr_help_exit(1);
1128 }
1129 }
1130 break;
1131 case OPTION_MAXRESTART:
1132 {
1133 char garbage[3];
1134 if ((sscanf(optarg, "%ld%1s",
1135 &gs.max_restart_interval,
1136 garbage) != 1)
1137 || (gs.max_restart_interval < 0)) {
1138 fprintf(stderr,
1139 "Invalid max_restart_interval argument: %s\n",
1140 optarg);
1141 frr_help_exit(1);
1142 }
1143 }
1144 break;
1145 case 'i':
1146 {
1147 char garbage[3];
1148 int period;
1149 if ((sscanf(optarg, "%d%1s", &period, garbage)
1150 != 1) || (gs.period < 1)) {
1151 fprintf(stderr,
1152 "Invalid interval argument: %s\n",
1153 optarg);
1154 frr_help_exit(1);
1155 }
1156 gs.period = 1000 * period;
1157 }
1158 break;
1159 case 'p':
1160 pidfile = optarg;
1161 break;
1162 case 'r':
1163 if ((gs.mode == MODE_GLOBAL_RESTART) ||
1164 (gs.mode == MODE_SEPARATE_RESTART)) {
1165 fputs("Ambiguous operating mode selected.\n",
1166 stderr);
1167 frr_help_exit(1);
1168 }
1169 if (!valid_command(optarg)) {
1170 fprintf(stderr,
1171 "Invalid restart command, must contain '%%s': %s\n",
1172 optarg);
1173 frr_help_exit(1);
1174 }
1175 gs.restart_command = optarg;
1176 if (gs.mode == MODE_MONITOR)
1177 gs.mode = MODE_SEPARATE_RESTART;
1178 break;
1179 case 'R':
1180 if (gs.mode != MODE_MONITOR) {
1181 fputs("Ambiguous operating mode selected.\n",
1182 stderr);
1183 frr_help_exit(1);
1184 }
1185 if (strchr(optarg, '%')) {
1186 fprintf(stderr,
1187 "Invalid restart-all arg, must not contain '%%s': %s\n",
1188 optarg);
1189 frr_help_exit(1);
1190 }
1191 gs.restart_command = optarg;
1192 gs.mode = MODE_GLOBAL_RESTART;
1193 break;
1194 case 's':
1195 if (!valid_command(optarg)) {
1196 fprintf(stderr,
1197 "Invalid start command, must contain '%%s': %s\n",
1198 optarg);
1199 frr_help_exit(1);
1200 }
1201 gs.start_command = optarg;
1202 break;
1203 case 'S':
1204 gs.vtydir = optarg;
1205 break;
1206 case 't':
1207 {
1208 char garbage[3];
1209 if ((sscanf
1210 (optarg, "%ld%1s", &gs.timeout,
1211 garbage) != 1) || (gs.timeout < 1)) {
1212 fprintf(stderr,
1213 "Invalid timeout argument: %s\n",
1214 optarg);
1215 frr_help_exit(1);
1216 }
1217 }
1218 break;
1219 case 'T':
1220 {
1221 char garbage[3];
1222 if ((sscanf
1223 (optarg, "%ld%1s", &gs.restart_timeout,
1224 garbage) != 1)
1225 || (gs.restart_timeout < 1)) {
1226 fprintf(stderr,
1227 "Invalid restart timeout argument: %s\n",
1228 optarg);
1229 frr_help_exit(1);
1230 }
1231 }
1232 break;
1233 case 'z':
1234 gs.unresponsive_restart = 1;
1235 break;
1236 default:
1237 fputs("Invalid option.\n", stderr);
1238 frr_help_exit(1);
1239 }
1240 }
1241
1242 if (gs.unresponsive_restart && (gs.mode == MODE_MONITOR)) {
1243 fputs("Option -z requires a -r or -R restart option.\n",
1244 stderr);
1245 frr_help_exit(1);
1246 }
1247 switch (gs.mode) {
1248 case MODE_MONITOR:
1249 if (gs.restart_command || gs.start_command || gs.stop_command) {
1250 fprintf(stderr,
1251 "No kill/(re)start commands needed for %s mode.\n",
1252 mode_str[gs.mode]);
1253 frr_help_exit(1);
1254 }
1255 break;
1256 case MODE_GLOBAL_RESTART:
1257 case MODE_SEPARATE_RESTART:
1258 if (!gs.restart_command || gs.start_command || gs.stop_command) {
1259 fprintf(stderr,
1260 "No start/kill commands needed in [%s] mode.\n",
1261 mode_str[gs.mode]);
1262 frr_help_exit(1);
1263 }
1264 break;
1265 case MODE_PHASED_ZEBRA_RESTART:
1266 case MODE_PHASED_ALL_RESTART:
1267 if (!gs.restart_command || !gs.start_command
1268 || !gs.stop_command) {
1269 fprintf(stderr,
1270 "Need start, kill, and restart commands in [%s] mode.\n",
1271 mode_str[gs.mode]);
1272 frr_help_exit(1);
1273 }
1274 break;
1275 }
1276
1277 if (blankstr) {
1278 if (gs.restart_command)
1279 gs.restart_command =
1280 translate_blanks(gs.restart_command, blankstr);
1281 if (gs.start_command)
1282 gs.start_command =
1283 translate_blanks(gs.start_command, blankstr);
1284 if (gs.stop_command)
1285 gs.stop_command =
1286 translate_blanks(gs.stop_command, blankstr);
1287 }
1288
1289 gs.restart.interval = gs.min_restart_interval;
1290
1291 master = frr_init();
1292
1293 zlog_set_level(ZLOG_DEST_MONITOR, ZLOG_DISABLED);
1294 if (watchfrr_di.daemon_mode) {
1295 zlog_set_level(ZLOG_DEST_SYSLOG, MIN(gs.loglevel, LOG_DEBUG));
1296 if (daemon (0, 0) < 0) {
1297 fprintf(stderr, "Watchquagga daemon failed: %s",
1298 strerror(errno));
1299 exit (1);
1300 }
1301 } else
1302 zlog_set_level(ZLOG_DEST_STDOUT, MIN(gs.loglevel, LOG_DEBUG));
1303
1304 watchfrr_vty_init();
1305
1306 frr_vty_serv();
1307
1308 {
1309 int i;
1310 struct daemon *tail = NULL;
1311
1312 for (i = optind; i < argc; i++) {
1313 struct daemon *dmn;
1314
1315 if (!(dmn = (struct daemon *)calloc(1, sizeof(*dmn)))) {
1316 fprintf(stderr, "calloc(1,%u) failed: %s\n",
1317 (u_int) sizeof(*dmn),
1318 safe_strerror(errno));
1319 return 1;
1320 }
1321 dmn->name = dmn->restart.name = argv[i];
1322 dmn->state = DAEMON_INIT;
1323 gs.numdaemons++;
1324 gs.numdown++;
1325 dmn->fd = -1;
1326 dmn->t_wakeup = NULL;
1327 thread_add_timer_msec(master, wakeup_init, dmn, 100 + (random() % 900),
1328 &dmn->t_wakeup);
1329 dmn->restart.interval = gs.min_restart_interval;
1330 if (tail)
1331 tail->next = dmn;
1332 else
1333 gs.daemons = dmn;
1334 tail = dmn;
1335
1336 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
1337 (gs.mode == MODE_PHASED_ALL_RESTART)) &&
1338 !strcmp(dmn->name, special))
1339 gs.special = dmn;
1340 }
1341 }
1342 if (!gs.daemons) {
1343 fputs("Must specify one or more daemons to monitor.\n", stderr);
1344 frr_help_exit(1);
1345 }
1346 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
1347 (gs.mode == MODE_PHASED_ALL_RESTART)) && !gs.special) {
1348 fprintf(stderr,
1349 "In mode [%s], but cannot find master daemon %s\n",
1350 mode_str[gs.mode], special);
1351 frr_help_exit(1);
1352 }
1353
1354 /* Make sure we're not already running. */
1355 pid_output(pidfile);
1356
1357 /* Announce which daemons are being monitored. */
1358 {
1359 struct daemon *dmn;
1360 size_t len = 0;
1361
1362 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1363 len += strlen(dmn->name) + 1;
1364
1365 {
1366 char buf[len + 1];
1367 char *p = buf;
1368
1369 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1370 if (p != buf)
1371 *p++ = ' ';
1372 strcpy(p, dmn->name);
1373 p += strlen(p);
1374 }
1375 zlog_notice("%s %s watching [%s], mode [%s]",
1376 progname, FRR_VERSION, buf,
1377 mode_str[gs.mode]);
1378 }
1379 }
1380
1381 {
1382 struct thread thread;
1383
1384 while (thread_fetch(master, &thread))
1385 thread_call(&thread);
1386 }
1387
1388 systemd_send_stopping();
1389 /* Not reached. */
1390 return 0;
1391 }