]> git.proxmox.com Git - mirror_frr.git/blame - watchfrr/watchfrr.c
*: make consistent & update GPLv2 file headers
[mirror_frr.git] / watchfrr / watchfrr.c
CommitLineData
8b886ca7 1/*
896014f4
DL
2 * Monitor status of frr daemons and restart if necessary.
3 *
4 * Copyright (C) 2004 Andrew J. Schorr
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
8b886ca7 19 */
20
a365534f 21#include <zebra.h>
8b886ca7 22#include <thread.h>
23#include <log.h>
52e66296 24#include <network.h>
8b886ca7 25#include <sigevent.h>
a365534f 26#include <lib/version.h>
95c4aff2 27#include "command.h"
87f44e2f 28#include "memory_vty.h"
4f04a76b 29#include "libfrr.h"
95c4aff2 30
6f594023 31#include <getopt.h>
a365534f 32#include <sys/un.h>
33#include <sys/wait.h>
837d16cc 34#include <memory.h>
651415bd 35#include <systemd.h>
8b886ca7 36
9473e340 37#include "watchfrr.h"
95c4aff2 38
#ifndef MIN
#define MIN(A, B) (((A) <= (B)) ? (A) : (B))
#endif

/*
 * Timer randomisation helpers.
 * JITTER(SPAN) yields random() % (SPAN + 1), shifted down by SPAN / 2.
 * FUZZY(BASE) perturbs BASE by JITTER(BASE / 20).
 */
#define JITTER(SPAN) ((random() % ((SPAN) + 1)) - ((SPAN) / 2))
#define FUZZY(BASE) ((BASE) + JITTER((BASE) / 20))
46
/*
 * Built-in defaults for the command-line tunables; the help text reports
 * the interval and timeout values in seconds.
 */
#define DEFAULT_PERIOD 5
#define DEFAULT_TIMEOUT 10
#define DEFAULT_RESTART_TIMEOUT 20
#define DEFAULT_LOGLEVEL LOG_INFO
#define DEFAULT_MIN_RESTART 60
#define DEFAULT_MAX_RESTART 600

/* Pid file location: build-time override if present, else under STATEDIR. */
#if defined(PATH_WATCHFRR_PID)
#define DEFAULT_PIDFILE PATH_WATCHFRR_PID
#else
#define DEFAULT_PIDFILE STATEDIR "/watchfrr.pid"
#endif

/* Directory holding the daemons' "<name>.vty" unix sockets. */
#if defined(DAEMON_VTY_DIR)
#define VTYDIR DAEMON_VTY_DIR
#else
#define VTYDIR STATEDIR
#endif

/* Token sent with the "echo" command and expected back in the reply. */
#define PING_TOKEN "PING"
65
/*
 * Event loop handle passed to every thread_add_*() call in this file.
 * It has external linkage because libfrr refers to it as well.
 */
struct thread_master *master;
68
/*
 * Restart policy.  The numeric values are also the indices into mode_str[]
 * and the mode numbers quoted in the help text.
 */
typedef enum {
	MODE_MONITOR = 0,
	MODE_GLOBAL_RESTART = 1,
	MODE_SEPARATE_RESTART = 2,
	MODE_PHASED_ZEBRA_RESTART = 3,
	MODE_PHASED_ALL_RESTART = 4
} watch_mode_t;

/* Human-readable description of each watch_mode_t value. */
static const char *mode_str[] = {
	[MODE_MONITOR] = "monitor",
	[MODE_GLOBAL_RESTART] = "global restart",
	[MODE_SEPARATE_RESTART] = "individual daemon restart",
	[MODE_PHASED_ZEBRA_RESTART] = "phased zebra restart",
	[MODE_PHASED_ALL_RESTART] = "phased global restart for any failure",
};
84
/* Steps of a phased restart; PHASE_NONE means no phased restart is active. */
typedef enum {
	PHASE_NONE = 0,
	PHASE_STOPS_PENDING = 1,
	PHASE_WAITING_DOWN = 2,
	PHASE_ZEBRA_RESTART_PENDING = 3,
	PHASE_WAITING_ZEBRA_UP = 4
} restart_phase_t;

/*
 * Log text for each restart_phase_t value.  The table carries one trailing
 * entry ("Start jobs running") that has no matching enumerator.
 */
static const char *phase_str[] = {
	[PHASE_NONE] = "None",
	[PHASE_STOPS_PENDING] = "Stop jobs running",
	[PHASE_WAITING_DOWN] = "Waiting for other daemons to come down",
	[PHASE_ZEBRA_RESTART_PENDING] = "Zebra restart job running",
	[PHASE_WAITING_ZEBRA_UP] = "Waiting for zebra to come up",
	"Start jobs running",
};

/* How long one phase may last before phase_hanging() aborts the restart. */
#define PHASE_TIMEOUT (3 * gs.restart_timeout)
103
/* Bookkeeping for one background (re)start/stop job. */
struct restart_info {
	const char *name;      /* substituted for %s in the shell command */
	const char *what;      /* job type used in log messages (run_job's cmdtype) */
	pid_t pid;             /* pid of the running job, 0 when none */
	struct timeval time;   /* when the job was last started or reaped */
	long interval;         /* minimum seconds between jobs (see run_job) */
	struct thread *t_kill; /* timer that signals a job still running */
	int kills;             /* signals sent so far; 0 -> SIGTERM, else SIGKILL */
};
113
a6810074
DL
114static struct global_state {
115 watch_mode_t mode;
116 restart_phase_t phase;
117 struct thread *t_phase_hanging;
118 const char *vtydir;
119 long period;
120 long timeout;
121 long restart_timeout;
122 long min_restart_interval;
123 long max_restart_interval;
124 int do_ping;
125 struct daemon *daemons;
126 const char *restart_command;
127 const char *start_command;
128 const char *stop_command;
129 struct restart_info restart;
130 int unresponsive_restart;
131 int loglevel;
132 struct daemon *special; /* points to zebra when doing phased restart */
133 int numdaemons;
134 int numpids;
135 int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
8b886ca7 136} gs = {
a6810074
DL
137.mode = MODE_MONITOR,.phase = PHASE_NONE,.vtydir = VTYDIR,.period =
138 1000 * DEFAULT_PERIOD,.timeout =
139 DEFAULT_TIMEOUT,.restart_timeout =
140 DEFAULT_RESTART_TIMEOUT,.loglevel =
141 DEFAULT_LOGLEVEL,.min_restart_interval =
142 DEFAULT_MIN_RESTART,.max_restart_interval =
143 DEFAULT_MAX_RESTART,.do_ping = 1,};
144
/* Connection state of a monitored daemon; also indexes state_str[]. */
typedef enum {
	DAEMON_INIT = 0,
	DAEMON_DOWN = 1,
	DAEMON_CONNECTING = 2,
	DAEMON_UP = 3,
	DAEMON_UNRESPONSIVE = 4
} daemon_state_t;

/* True when the daemon is connected, whether or not it answers pings. */
#define IS_UP(DMN)                                                             \
	(((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))

/* Log text for each daemon_state_t value. */
static const char *state_str[] = {
	[DAEMON_INIT] = "Init",
	[DAEMON_DOWN] = "Down",
	[DAEMON_CONNECTING] = "Connecting",
	[DAEMON_UP] = "Up",
	[DAEMON_UNRESPONSIVE] = "Unresponsive",
};
163
164struct daemon {
a6810074
DL
165 const char *name;
166 daemon_state_t state;
167 int fd;
168 struct timeval echo_sent;
169 u_int connect_tries;
170 struct thread *t_wakeup;
171 struct thread *t_read;
172 struct thread *t_write;
173 struct daemon *next;
174 struct restart_info restart;
8b886ca7 175};
176
/* getopt values for the long options that have no short form. */
#define OPTION_MINRESTART 2000
#define OPTION_MAXRESTART 2001

/* Long option table; the last field is the value reported for the option. */
static const struct option longopts[] = {
	{"daemon",               no_argument,       NULL, 'd'},
	{"statedir",             required_argument, NULL, 'S'},
	{"no-echo",              no_argument,       NULL, 'e'},
	{"loglevel",             required_argument, NULL, 'l'},
	{"interval",             required_argument, NULL, 'i'},
	{"timeout",              required_argument, NULL, 't'},
	{"restart-timeout",      required_argument, NULL, 'T'},
	{"restart",              required_argument, NULL, 'r'},
	{"start-command",        required_argument, NULL, 's'},
	{"kill-command",         required_argument, NULL, 'k'},
	{"restart-all",          required_argument, NULL, 'R'},
	{"all-restart",          no_argument,       NULL, 'a'},
	{"always-all-restart",   no_argument,       NULL, 'A'},
	{"unresponsive-restart", no_argument,       NULL, 'z'},
	{"min-restart-interval", required_argument, NULL, OPTION_MINRESTART},
	{"max-restart-interval", required_argument, NULL, OPTION_MAXRESTART},
	{"pid-file",             required_argument, NULL, 'p'},
	{"blank-string",         required_argument, NULL, 'b'},
	{"help",                 no_argument,       NULL, 'h'},
	{"version",              no_argument,       NULL, 'v'},
	{NULL, 0, NULL, 0}
};
203
204static int try_connect(struct daemon *dmn);
205static int wakeup_send_echo(struct thread *t_wakeup);
206static void try_restart(struct daemon *dmn);
207static void phase_check(void);
208
4f04a76b
DL
209static const char *progname;
210static void printhelp(FILE *target)
8b886ca7 211{
4f04a76b 212 fprintf(target, "Usage : %s [OPTION...] <daemon name> ...\n\n\
9473e340 213Watchdog program to monitor status of frr daemons and try to restart\n\
8b886ca7 214them if they are down or unresponsive. It determines whether a daemon is\n\
215up based on whether it can connect to the daemon's vty unix stream socket.\n\
216It then repeatedly sends echo commands over that socket to determine whether\n\
217the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
218on the socket connection and know immediately that the daemon is down.\n\n\
219The daemons to be monitored should be listed on the command line.\n\n\
220This program can run in one of 5 modes:\n\n\
2210. Mode: %s.\n\
222 Just monitor and report on status changes. Example:\n\
223 %s -d zebra ospfd bgpd\n\n\
2241. Mode: %s.\n\
225 Whenever any daemon hangs or crashes, use the given command to restart\n\
226 them all. Example:\n\
227 %s -dz \\\n\
228 -R '/sbin/service zebra restart; /sbin/service ospfd restart' \\\n\
229 zebra ospfd\n\n\
2302. Mode: %s.\n\
231 When any single daemon hangs or crashes, restart only the daemon that's\n\
232 in trouble using the supplied restart command. Example:\n\
233 %s -dz -r '/sbin/service %%s restart' zebra ospfd bgpd\n\n\
2343. Mode: %s.\n\
235 The same as the previous mode, except that there is special treatment when\n\
236 the zebra daemon is in trouble. In that case, a phased restart approach\n\
237 is used: 1. stop all other daemons; 2. restart zebra; 3. start the other\n\
238 daemons. Example:\n\
239 %s -adz -r '/sbin/service %%s restart' \\\n\
240 -s '/sbin/service %%s start' \\\n\
241 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
2424. Mode: %s.\n\
243 This is the same as the previous mode, except that the phased restart\n\
244 procedure is used whenever any of the daemons hangs or crashes. Example:\n\
245 %s -Adz -r '/sbin/service %%s restart' \\\n\
246 -s '/sbin/service %%s start' \\\n\
247 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
248As of this writing, it is believed that mode 2 [%s]\n\
249is not safe, and mode 3 [%s] may not be safe with some of the\n\
250routing daemons.\n\n\
251In order to avoid attempting to restart the daemons in a fast loop,\n\
252the -m and -M options allow you to control the minimum delay between\n\
253restart commands. The minimum restart delay is recalculated each time\n\
254a restart is attempted: if the time since the last restart attempt exceeds\n\
255twice the -M value, then the restart delay is set to the -m value.\n\
a6810074 256Otherwise, the interval is doubled (but capped at the -M value).\n\n", progname, mode_str[0], progname, mode_str[1], progname, mode_str[2], progname, mode_str[3], progname, mode_str[4], progname, mode_str[2], mode_str[3]);
e757c940 257
4f04a76b 258 fprintf(target, "Options:\n\
8b886ca7 259-d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
260 to syslog instead of stdout.\n\
261-S, --statedir Set the vty socket directory (default is %s)\n\
262-e, --no-echo Do not ping the daemons to test responsiveness (this\n\
263 option is necessary if the daemons do not support the\n\
264 echo command)\n\
265-l, --loglevel Set the logging level (default is %d).\n\
266 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
267 but it can be set higher than %d if extra-verbose debugging\n\
268 messages are desired.\n\
9272302b 269 --min-restart-interval\n\
8b886ca7 270 Set the minimum seconds to wait between invocations of daemon\n\
271 restart commands (default is %d).\n\
9272302b 272 --max-restart-interval\n\
8b886ca7 273 Set the maximum seconds to wait between invocations of daemon\n\
274 restart commands (default is %d).\n\
275-i, --interval Set the status polling interval in seconds (default is %d)\n\
276-t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
277-T, --restart-timeout\n\
278 Set the restart (kill) timeout in seconds (default is %d).\n\
279 If any background jobs are still running after this much\n\
280 time has elapsed, they will be killed.\n\
281-r, --restart Supply a Bourne shell command to use to restart a single\n\
282 daemon. The command string should include '%%s' where the\n\
283 name of the daemon should be substituted.\n\
284 Note that -r and -R are incompatible.\n\
285-s, --start-command\n\
286 Supply a Bourne shell to command to use to start a single\n\
287 daemon. The command string should include '%%s' where the\n\
288 name of the daemon should be substituted.\n\
289-k, --kill-command\n\
290 Supply a Bourne shell to command to use to stop a single\n\
291 daemon. The command string should include '%%s' where the\n\
292 name of the daemon should be substituted.\n\
293-R, --restart-all\n\
294 When one or more daemons is down, try to restart everything\n\
295 using the Bourne shell command supplied as the argument.\n\
296 Note that -r and -R are incompatible.\n\
297-z, --unresponsive-restart\n\
298 When a daemon is unresponsive, treat it as being down for\n\
299 restart purposes.\n\
300-a, --all-restart\n\
301 When zebra hangs or crashes, restart all daemons using\n\
302 this phased approach: 1. stop all other daemons; 2. restart\n\
303 zebra; 3. start other daemons. Requires -r, -s, and -k.\n\
304-A, --always-all-restart\n\
305 When any daemon (not just zebra) hangs or crashes, use the\n\
306 same phased restart mechanism described above for -a.\n\
307 Requires -r, -s, and -k.\n\
308-p, --pid-file Set process identifier file name\n\
309 (default is %s).\n\
c8b40f86 310-b, --blank-string\n\
311 When the supplied argument string is found in any of the\n\
312 various shell command arguments (-r, -s, -k, or -R), replace\n\
313 it with a space. This is an ugly hack to circumvent problems\n\
314 passing command-line arguments with embedded spaces.\n\
8b886ca7 315-v, --version Print program version\n\
a6810074 316-h, --help Display this help and exit\n", VTYDIR, DEFAULT_LOGLEVEL, LOG_EMERG, LOG_DEBUG, LOG_DEBUG, DEFAULT_MIN_RESTART, DEFAULT_MAX_RESTART, DEFAULT_PERIOD, DEFAULT_TIMEOUT, DEFAULT_RESTART_TIMEOUT, DEFAULT_PIDFILE);
8b886ca7 317}
318
/*
 * Fork a child that runs `shell_cmd` through "/bin/sh -c".
 *
 * Returns the child's pid to the parent, or -1 when fork() fails.  The
 * child is not waited for here.
 */
static pid_t run_background(char *shell_cmd)
{
	pid_t child = fork();

	if (child == -1) {
		zlog_err("fork failed, cannot run command [%s]: %s",
			 shell_cmd, safe_strerror(errno));
		return -1;
	}

	if (child == 0) {
		/* Child: own process group, so the whole job can be signalled
		   with kill(-pid, ...). */
		char shell[] = "sh";
		char dashc[] = "-c";
		char *const argv[4] = { shell, dashc, shell_cmd, NULL };

		if (setpgid(0, 0) < 0)
			zlog_warn("warning: setpgid(0,0) failed: %s",
				  safe_strerror(errno));

		execv("/bin/sh", argv);
		/* Only reached when execv() itself failed. */
		zlog_err("execv(/bin/sh -c '%s') failed: %s",
			 shell_cmd, safe_strerror(errno));
		_exit(127);
	}

	/* Parent: the child is reaped later. */
	zlog_err("Forked background command [pid %d]: %s", (int)child,
		 shell_cmd);
	return child;
}
350
/*
 * Store in *result the time elapsed between *start_time and now, as
 * reported by gettimeofday(), and return `result` so that the call can be
 * used inside an expression.
 */
static struct timeval *time_elapsed(struct timeval *result,
				    const struct timeval *start_time)
{
	gettimeofday(result, NULL);
	result->tv_sec -= start_time->tv_sec;
	result->tv_usec -= start_time->tv_usec;

	/* Borrow whole seconds until the microsecond part is non-negative. */
	for (; result->tv_usec < 0; result->tv_sec--)
		result->tv_usec += 1000000L;

	return result;
}
363
a6810074 364static int restart_kill(struct thread *t_kill)
8b886ca7 365{
a6810074
DL
366 struct restart_info *restart = THREAD_ARG(t_kill);
367 struct timeval delay;
368
369 time_elapsed(&delay, &restart->time);
370 zlog_warn("Warning: %s %s child process %d still running after "
371 "%ld seconds, sending signal %d",
372 restart->what, restart->name, (int)restart->pid,
373 (long)delay.tv_sec, (restart->kills ? SIGKILL : SIGTERM));
374 kill(-restart->pid, (restart->kills ? SIGKILL : SIGTERM));
375 restart->kills++;
66e78ae6
QY
376 restart->t_kill = NULL;
377 thread_add_timer(master, restart_kill, restart, gs.restart_timeout,
378 &restart->t_kill);
a6810074 379 return 0;
8b886ca7 380}
381
a6810074 382static struct restart_info *find_child(pid_t child)
8b886ca7 383{
a6810074
DL
384 if (gs.mode == MODE_GLOBAL_RESTART) {
385 if (gs.restart.pid == child)
386 return &gs.restart;
387 } else {
388 struct daemon *dmn;
389 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
390 if (dmn->restart.pid == child)
391 return &dmn->restart;
392 }
393 }
394 return NULL;
8b886ca7 395}
396
a6810074 397static void sigchild(void)
8b886ca7 398{
a6810074
DL
399 pid_t child;
400 int status;
401 const char *name;
402 const char *what;
403 struct restart_info *restart;
404
405 switch (child = waitpid(-1, &status, WNOHANG)) {
406 case -1:
407 zlog_err("waitpid failed: %s", safe_strerror(errno));
408 return;
409 case 0:
410 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
411 return;
412 }
413
414 if (child == integrated_write_pid) {
415 integrated_write_sigchld(status);
416 return;
417 }
418
419 if ((restart = find_child(child)) != NULL) {
420 name = restart->name;
421 what = restart->what;
422 restart->pid = 0;
423 gs.numpids--;
424 thread_cancel(restart->t_kill);
425 restart->t_kill = NULL;
426 /* Update restart time to reflect the time the command completed. */
427 gettimeofday(&restart->time, NULL);
428 } else {
429 zlog_err
430 ("waitpid returned status for an unknown child process %d",
431 (int)child);
432 name = "(unknown)";
433 what = "background";
434 }
435 if (WIFSTOPPED(status))
436 zlog_warn("warning: %s %s process %d is stopped",
437 what, name, (int)child);
438 else if (WIFSIGNALED(status))
439 zlog_warn("%s %s process %d terminated due to signal %d",
440 what, name, (int)child, WTERMSIG(status));
441 else if (WIFEXITED(status)) {
442 if (WEXITSTATUS(status) != 0)
443 zlog_warn
444 ("%s %s process %d exited with non-zero status %d",
445 what, name, (int)child, WEXITSTATUS(status));
446 else
447 zlog_debug("%s %s process %d exited normally", what,
448 name, (int)child);
449 } else
450 zlog_err("cannot interpret %s %s process %d wait status 0x%x",
451 what, name, (int)child, status);
452 phase_check();
8b886ca7 453}
454
455static int
456run_job(struct restart_info *restart, const char *cmdtype, const char *command,
457 int force, int update_interval)
458{
a6810074
DL
459 struct timeval delay;
460
461 if (gs.loglevel > LOG_DEBUG + 1)
462 zlog_debug("attempting to %s %s", cmdtype, restart->name);
463
464 if (restart->pid) {
465 if (gs.loglevel > LOG_DEBUG + 1)
466 zlog_debug
467 ("cannot %s %s, previous pid %d still running",
468 cmdtype, restart->name, (int)restart->pid);
469 return -1;
470 }
471
472 /* Note: time_elapsed test must come before the force test, since we need
473 to make sure that delay is initialized for use below in updating the
474 restart interval. */
475 if ((time_elapsed(&delay, &restart->time)->tv_sec < restart->interval)
476 && !force) {
477 if (gs.loglevel > LOG_DEBUG + 1)
478 zlog_debug("postponing %s %s: "
479 "elapsed time %ld < retry interval %ld",
480 cmdtype, restart->name, (long)delay.tv_sec,
481 restart->interval);
482 return -1;
483 }
484
485 gettimeofday(&restart->time, NULL);
486 restart->kills = 0;
487 {
488 char cmd[strlen(command) + strlen(restart->name) + 1];
489 snprintf(cmd, sizeof(cmd), command, restart->name);
490 if ((restart->pid = run_background(cmd)) > 0) {
66e78ae6
QY
491 restart->t_kill = NULL;
492 thread_add_timer(master, restart_kill, restart, gs.restart_timeout,
493 &restart->t_kill);
a6810074
DL
494 restart->what = cmdtype;
495 gs.numpids++;
496 } else
497 restart->pid = 0;
498 }
499
500 /* Calculate the new restart interval. */
501 if (update_interval) {
502 if (delay.tv_sec > 2 * gs.max_restart_interval)
503 restart->interval = gs.min_restart_interval;
504 else if ((restart->interval *= 2) > gs.max_restart_interval)
505 restart->interval = gs.max_restart_interval;
506 if (gs.loglevel > LOG_DEBUG + 1)
507 zlog_debug("restart %s interval is now %ld",
508 restart->name, restart->interval);
509 }
510 return restart->pid;
8b886ca7 511}
512
/*
 * Event (re)scheduling helpers for a struct daemon.  Each one clears the
 * stored thread pointer and then registers the event again.
 *
 * The expansions deliberately do NOT end in a semicolon: the caller's own
 * ';' completes the do/while(0), so each macro behaves as exactly one
 * statement and can sit in an unbraced if/else.
 */

/* Watch the daemon's socket for readability; handled by handle_read(). */
#define SET_READ_HANDLER(DMN) \
	do { \
		(DMN)->t_read = NULL; \
		thread_add_read(master, handle_read, (DMN), (DMN)->fd, \
				&(DMN)->t_read); \
	} while (0)

/* Schedule wakeup_down() after a randomised gs.period. */
#define SET_WAKEUP_DOWN(DMN) \
	do { \
		(DMN)->t_wakeup = NULL; \
		thread_add_timer_msec(master, wakeup_down, (DMN), \
				      FUZZY(gs.period), &(DMN)->t_wakeup); \
	} while (0)

/* Schedule wakeup_unresponsive() after a randomised gs.period. */
#define SET_WAKEUP_UNRESPONSIVE(DMN) \
	do { \
		(DMN)->t_wakeup = NULL; \
		thread_add_timer_msec(master, wakeup_unresponsive, (DMN), \
				      FUZZY(gs.period), &(DMN)->t_wakeup); \
	} while (0)

/* Schedule wakeup_send_echo() after a randomised gs.period. */
#define SET_WAKEUP_ECHO(DMN) \
	do { \
		(DMN)->t_wakeup = NULL; \
		thread_add_timer_msec(master, wakeup_send_echo, (DMN), \
				      FUZZY(gs.period), &(DMN)->t_wakeup); \
	} while (0)
8b886ca7 539
a6810074 540static int wakeup_down(struct thread *t_wakeup)
8b886ca7 541{
a6810074
DL
542 struct daemon *dmn = THREAD_ARG(t_wakeup);
543
544 dmn->t_wakeup = NULL;
545 if (try_connect(dmn) < 0)
546 SET_WAKEUP_DOWN(dmn);
547 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
548 try_restart(dmn);
549 return 0;
8b886ca7 550}
551
a6810074 552static int wakeup_init(struct thread *t_wakeup)
8b886ca7 553{
a6810074
DL
554 struct daemon *dmn = THREAD_ARG(t_wakeup);
555
556 dmn->t_wakeup = NULL;
557 if (try_connect(dmn) < 0) {
558 SET_WAKEUP_DOWN(dmn);
559 zlog_err("%s state -> down : initial connection attempt failed",
560 dmn->name);
561 dmn->state = DAEMON_DOWN;
562 }
563 return 0;
8b886ca7 564}
565
a6810074 566static void daemon_down(struct daemon *dmn, const char *why)
8b886ca7 567{
a6810074
DL
568 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
569 zlog_err("%s state -> down : %s", dmn->name, why);
570 else if (gs.loglevel > LOG_DEBUG)
571 zlog_debug("%s still down : %s", dmn->name, why);
572 if (IS_UP(dmn))
573 gs.numdown++;
574 dmn->state = DAEMON_DOWN;
575 if (dmn->fd >= 0) {
576 close(dmn->fd);
577 dmn->fd = -1;
578 }
579 THREAD_OFF(dmn->t_read);
580 THREAD_OFF(dmn->t_write);
581 THREAD_OFF(dmn->t_wakeup);
582 if (try_connect(dmn) < 0)
583 SET_WAKEUP_DOWN(dmn);
584 phase_check();
8b886ca7 585}
586
a6810074 587static int handle_read(struct thread *t_read)
8b886ca7 588{
a6810074
DL
589 struct daemon *dmn = THREAD_ARG(t_read);
590 static const char resp[sizeof(PING_TOKEN) + 4] = PING_TOKEN "\n";
591 char buf[sizeof(resp) + 100];
592 ssize_t rc;
593 struct timeval delay;
594
595 dmn->t_read = NULL;
596 if ((rc = read(dmn->fd, buf, sizeof(buf))) < 0) {
597 char why[100];
598
599 if (ERRNO_IO_RETRY(errno)) {
600 /* Pretend it never happened. */
601 SET_READ_HANDLER(dmn);
602 return 0;
603 }
604 snprintf(why, sizeof(why), "unexpected read error: %s",
605 safe_strerror(errno));
606 daemon_down(dmn, why);
607 return 0;
8b886ca7 608 }
a6810074
DL
609 if (rc == 0) {
610 daemon_down(dmn, "read returned EOF");
611 return 0;
612 }
613 if (!dmn->echo_sent.tv_sec) {
614 char why[sizeof(buf) + 100];
615 snprintf(why, sizeof(why),
616 "unexpected read returns %d bytes: %.*s", (int)rc,
617 (int)rc, buf);
618 daemon_down(dmn, why);
619 return 0;
8b886ca7 620 }
a6810074
DL
621
622 /* We are expecting an echo response: is there any chance that the
623 response would not be returned entirely in the first read? That
624 seems inconceivable... */
625 if ((rc != sizeof(resp)) || memcmp(buf, resp, sizeof(resp))) {
626 char why[100 + sizeof(buf)];
627 snprintf(why, sizeof(why),
628 "read returned bad echo response of %d bytes "
629 "(expecting %u): %.*s", (int)rc, (u_int) sizeof(resp),
630 (int)rc, buf);
631 daemon_down(dmn, why);
632 return 0;
633 }
634
635 time_elapsed(&delay, &dmn->echo_sent);
636 dmn->echo_sent.tv_sec = 0;
637 if (dmn->state == DAEMON_UNRESPONSIVE) {
638 if (delay.tv_sec < gs.timeout) {
639 dmn->state = DAEMON_UP;
640 zlog_warn
641 ("%s state -> up : echo response received after %ld.%06ld "
642 "seconds", dmn->name, (long)delay.tv_sec,
643 (long)delay.tv_usec);
644 } else
645 zlog_warn
646 ("%s: slow echo response finally received after %ld.%06ld "
647 "seconds", dmn->name, (long)delay.tv_sec,
648 (long)delay.tv_usec);
649 } else if (gs.loglevel > LOG_DEBUG + 1)
650 zlog_debug("%s: echo response received after %ld.%06ld seconds",
651 dmn->name, (long)delay.tv_sec, (long)delay.tv_usec);
652
653 SET_READ_HANDLER(dmn);
654 if (dmn->t_wakeup)
655 thread_cancel(dmn->t_wakeup);
656 SET_WAKEUP_ECHO(dmn);
657
658 return 0;
8b886ca7 659}
660
207e0d7a
DS
661/*
662 * Wait till we notice that all daemons are ready before
663 * we send we are ready to systemd
664 */
a6810074 665static void daemon_send_ready(void)
207e0d7a 666{
a6810074
DL
667 static int sent = 0;
668 if (!sent && gs.numdown == 0) {
207e0d7a 669#if defined (HAVE_CUMULUS)
a6810074 670 FILE *fp;
207e0d7a 671
a6810074
DL
672 fp = fopen(DAEMON_VTY_DIR "/watchfrr.started", "w");
673 fclose(fp);
207e0d7a 674#endif
a6810074
DL
675 zlog_notice
676 ("Watchfrr: Notifying Systemd we are up and running");
677 systemd_send_started(master, 0);
678 sent = 1;
679 }
207e0d7a
DS
680}
681
a6810074 682static void daemon_up(struct daemon *dmn, const char *why)
8b886ca7 683{
a6810074
DL
684 dmn->state = DAEMON_UP;
685 gs.numdown--;
686 dmn->connect_tries = 0;
687 zlog_notice("%s state -> up : %s", dmn->name, why);
688 daemon_send_ready();
689 if (gs.do_ping)
690 SET_WAKEUP_ECHO(dmn);
691 phase_check();
8b886ca7 692}
693
a6810074 694static int check_connect(struct thread *t_write)
8b886ca7 695{
a6810074
DL
696 struct daemon *dmn = THREAD_ARG(t_write);
697 int sockerr;
698 socklen_t reslen = sizeof(sockerr);
699
700 dmn->t_write = NULL;
701 if (getsockopt(dmn->fd, SOL_SOCKET, SO_ERROR, (char *)&sockerr, &reslen)
702 < 0) {
703 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn->name,
704 safe_strerror(errno));
705 daemon_down(dmn,
706 "getsockopt failed checking connection success");
707 return 0;
708 }
709 if ((reslen == sizeof(sockerr)) && sockerr) {
710 char why[100];
711 snprintf(why, sizeof(why),
712 "getsockopt reports that connection attempt failed: %s",
713 safe_strerror(sockerr));
714 daemon_down(dmn, why);
715 return 0;
716 }
717
718 daemon_up(dmn, "delayed connect succeeded");
719 return 0;
8b886ca7 720}
721
a6810074 722static int wakeup_connect_hanging(struct thread *t_wakeup)
8b886ca7 723{
a6810074
DL
724 struct daemon *dmn = THREAD_ARG(t_wakeup);
725 char why[100];
726
727 dmn->t_wakeup = NULL;
728 snprintf(why, sizeof(why),
729 "connection attempt timed out after %ld seconds", gs.timeout);
730 daemon_down(dmn, why);
731 return 0;
8b886ca7 732}
733
734/* Making connection to protocol daemon. */
a6810074 735static int try_connect(struct daemon *dmn)
8b886ca7 736{
a6810074
DL
737 int sock;
738 struct sockaddr_un addr;
739 socklen_t len;
740
741 if (gs.loglevel > LOG_DEBUG + 1)
742 zlog_debug("%s: attempting to connect", dmn->name);
743 dmn->connect_tries++;
744
745 memset(&addr, 0, sizeof(struct sockaddr_un));
746 addr.sun_family = AF_UNIX;
747 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty",
748 gs.vtydir, dmn->name);
6f0e3f6e 749#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
a6810074 750 len = addr.sun_len = SUN_LEN(&addr);
8b886ca7 751#else
a6810074
DL
752 len = sizeof(addr.sun_family) + strlen(addr.sun_path);
753#endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
754
755 /* Quick check to see if we might succeed before we go to the trouble
756 of creating a socket. */
757 if (access(addr.sun_path, W_OK) < 0) {
758 if (errno != ENOENT)
759 zlog_err("%s: access to socket %s denied: %s",
760 dmn->name, addr.sun_path,
761 safe_strerror(errno));
762 return -1;
763 }
764
765 if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
766 zlog_err("%s(%s): cannot make socket: %s",
767 __func__, addr.sun_path, safe_strerror(errno));
768 return -1;
769 }
770
771 if (set_nonblocking(sock) < 0 || set_cloexec(sock) < 0) {
772 zlog_err("%s(%s): set_nonblocking/cloexec(%d) failed",
773 __func__, addr.sun_path, sock);
774 close(sock);
775 return -1;
8b886ca7 776 }
a6810074
DL
777
778 if (connect(sock, (struct sockaddr *)&addr, len) < 0) {
779 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK)) {
780 if (gs.loglevel > LOG_DEBUG)
781 zlog_debug("%s(%s): connect failed: %s",
782 __func__, addr.sun_path,
783 safe_strerror(errno));
784 close(sock);
785 return -1;
786 }
787 if (gs.loglevel > LOG_DEBUG)
788 zlog_debug("%s: connection in progress", dmn->name);
789 dmn->state = DAEMON_CONNECTING;
790 dmn->fd = sock;
66e78ae6
QY
791 dmn->t_write = NULL;
792 thread_add_write(master, check_connect, dmn, dmn->fd,
793 &dmn->t_write);dmn->t_wakeup = NULL;
794 thread_add_timer(master, wakeup_connect_hanging, dmn, gs.timeout,
795 &dmn->t_wakeup);
a6810074
DL
796 SET_READ_HANDLER(dmn);
797 return 0;
798 }
799
800 dmn->fd = sock;
801 SET_READ_HANDLER(dmn);
802 daemon_up(dmn, "connect succeeded");
803 return 1;
8b886ca7 804}
805
a6810074 806static int phase_hanging(struct thread *t_hanging)
8b886ca7 807{
a6810074
DL
808 gs.t_phase_hanging = NULL;
809 zlog_err("Phase [%s] hanging for %ld seconds, aborting phased restart",
810 phase_str[gs.phase], PHASE_TIMEOUT);
811 gs.phase = PHASE_NONE;
812 return 0;
8b886ca7 813}
814
a6810074 815static void set_phase(restart_phase_t new_phase)
8b886ca7 816{
a6810074
DL
817 gs.phase = new_phase;
818 if (gs.t_phase_hanging)
819 thread_cancel(gs.t_phase_hanging);
66e78ae6
QY
820 gs.t_phase_hanging = NULL;
821 thread_add_timer(master, phase_hanging, NULL, PHASE_TIMEOUT,
822 &gs.t_phase_hanging);
8b886ca7 823}
824
a6810074 825static void phase_check(void)
8b886ca7 826{
a6810074
DL
827 switch (gs.phase) {
828 case PHASE_NONE:
829 break;
830 case PHASE_STOPS_PENDING:
831 if (gs.numpids)
832 break;
833 zlog_info
834 ("Phased restart: all routing daemon stop jobs have completed.");
835 set_phase(PHASE_WAITING_DOWN);
836
837 /*FALLTHRU*/
838 case PHASE_WAITING_DOWN:
839 if (gs.numdown + IS_UP(gs.special) < gs.numdaemons)
840 break;
841 zlog_info("Phased restart: all routing daemons now down.");
842 run_job(&gs.special->restart, "restart", gs.restart_command, 1,
843 1);
844 set_phase(PHASE_ZEBRA_RESTART_PENDING);
845
846 /*FALLTHRU*/
847 case PHASE_ZEBRA_RESTART_PENDING:
848 if (gs.special->restart.pid)
849 break;
850 zlog_info("Phased restart: %s restart job completed.",
851 gs.special->name);
852 set_phase(PHASE_WAITING_ZEBRA_UP);
853
854 /*FALLTHRU*/
855 case PHASE_WAITING_ZEBRA_UP:
856 if (!IS_UP(gs.special))
857 break;
858 zlog_info("Phased restart: %s is now up.", gs.special->name);
859 {
860 struct daemon *dmn;
861 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
862 if (dmn != gs.special)
863 run_job(&dmn->restart, "start",
864 gs.start_command, 1, 0);
865 }
866 }
867 gs.phase = PHASE_NONE;
868 THREAD_OFF(gs.t_phase_hanging);
869 zlog_notice("Phased global restart has completed.");
870 break;
871 }
8b886ca7 872}
873
a6810074 874static void try_restart(struct daemon *dmn)
8b886ca7 875{
a6810074
DL
876 switch (gs.mode) {
877 case MODE_MONITOR:
878 return;
879 case MODE_GLOBAL_RESTART:
880 run_job(&gs.restart, "restart", gs.restart_command, 0, 1);
881 break;
882 case MODE_SEPARATE_RESTART:
883 run_job(&dmn->restart, "restart", gs.restart_command, 0, 1);
884 break;
885 case MODE_PHASED_ZEBRA_RESTART:
886 if (dmn != gs.special) {
887 if ((gs.special->state == DAEMON_UP)
888 && (gs.phase == PHASE_NONE))
889 run_job(&dmn->restart, "restart",
890 gs.restart_command, 0, 1);
891 else
892 zlog_debug
893 ("%s: postponing restart attempt because master %s daemon "
894 "not up [%s], or phased restart in progress",
895 dmn->name, gs.special->name,
896 state_str[gs.special->state]);
897 break;
898 }
899
900 /*FALLTHRU*/
901 case MODE_PHASED_ALL_RESTART:
902 if ((gs.phase != PHASE_NONE) || gs.numpids) {
903 if (gs.loglevel > LOG_DEBUG + 1)
904 zlog_debug
905 ("postponing phased global restart: restart already in "
906 "progress [%s], or outstanding child processes [%d]",
907 phase_str[gs.phase], gs.numpids);
908 break;
909 }
910 /* Is it too soon for a restart? */
911 {
912 struct timeval delay;
913 if (time_elapsed(&delay, &gs.special->restart.time)->
914 tv_sec < gs.special->restart.interval) {
915 if (gs.loglevel > LOG_DEBUG + 1)
916 zlog_debug
917 ("postponing phased global restart: "
918 "elapsed time %ld < retry interval %ld",
919 (long)delay.tv_sec,
920 gs.special->restart.interval);
921 break;
922 }
923 }
924 run_job(&gs.restart, "restart", gs.restart_command, 0, 1);
925 break;
926 default:
927 zlog_err("error: unknown restart mode %d", gs.mode);
928 break;
8b886ca7 929 }
8b886ca7 930}
931
a6810074 932static int wakeup_unresponsive(struct thread *t_wakeup)
8b886ca7 933{
a6810074
DL
934 struct daemon *dmn = THREAD_ARG(t_wakeup);
935
936 dmn->t_wakeup = NULL;
937 if (dmn->state != DAEMON_UNRESPONSIVE)
938 zlog_err("%s: no longer unresponsive (now %s), "
939 "wakeup should have been cancelled!",
940 dmn->name, state_str[dmn->state]);
941 else {
942 SET_WAKEUP_UNRESPONSIVE(dmn);
943 try_restart(dmn);
944 }
945 return 0;
8b886ca7 946}
947
a6810074 948static int wakeup_no_answer(struct thread *t_wakeup)
8b886ca7 949{
a6810074
DL
950 struct daemon *dmn = THREAD_ARG(t_wakeup);
951
952 dmn->t_wakeup = NULL;
953 dmn->state = DAEMON_UNRESPONSIVE;
954 zlog_err("%s state -> unresponsive : no response yet to ping "
955 "sent %ld seconds ago", dmn->name, gs.timeout);
956 if (gs.unresponsive_restart) {
957 SET_WAKEUP_UNRESPONSIVE(dmn);
958 try_restart(dmn);
959 }
960 return 0;
8b886ca7 961}
962
a6810074 963static int wakeup_send_echo(struct thread *t_wakeup)
8b886ca7 964{
a6810074
DL
965 static const char echocmd[] = "echo " PING_TOKEN;
966 ssize_t rc;
967 struct daemon *dmn = THREAD_ARG(t_wakeup);
968
969 dmn->t_wakeup = NULL;
970 if (((rc = write(dmn->fd, echocmd, sizeof(echocmd))) < 0) ||
971 ((size_t) rc != sizeof(echocmd))) {
972 char why[100 + sizeof(echocmd)];
973 snprintf(why, sizeof(why),
974 "write '%s' returned %d instead of %u", echocmd,
975 (int)rc, (u_int) sizeof(echocmd));
976 daemon_down(dmn, why);
977 } else {
978 gettimeofday(&dmn->echo_sent, NULL);
66e78ae6
QY
979 dmn->t_wakeup = NULL;
980 thread_add_timer(master, wakeup_no_answer, dmn, gs.timeout,
981 &dmn->t_wakeup);
a6810074
DL
982 }
983 return 0;
8b886ca7 984}
985
/*
 * Termination handler, registered for both SIGINT and SIGTERM in
 * watchfrr_signals: log the shutdown, call systemd_send_stopping() and
 * exit with status 0.
 */
static void sigint(void)
{
	zlog_notice("Terminating on signal");
	systemd_send_stopping();
	exit(EXIT_SUCCESS);
}
992
/*
 * Check that a command template contains exactly one '%' and that it is
 * immediately followed by 's' (i.e. a single "%s" placeholder).
 *
 * Returns 1 when the template is acceptable, 0 otherwise.
 */
static int valid_command(const char *cmd)
{
	const char *first = strchr(cmd, '%');

	/* No placeholder at all. */
	if (first == NULL)
		return 0;

	/* The first '%' must introduce "%s". */
	if (first[1] != 's')
		return 0;

	/* Any further '%' makes the template invalid. */
	return strchr(first + 1, '%') == NULL;
}
1000
/*
 * This is an ugly hack to circumvent problems with passing command-line
 * arguments that contain spaces.  The fix is to use a configuration file.
 *
 * Returns a newly allocated copy of cmd in which every occurrence of
 * blankstr has been replaced by a single space character.  The caller owns
 * the returned string.  Exits the process if the copy cannot be allocated.
 *
 * An empty blankstr leaves the command unchanged: strstr() matches an
 * empty needle at every position, so it must not be searched for.
 *
 * The scan resumes just past each inserted space, so a blankstr that
 * itself begins with (or is) a space cannot match the replacement again
 * and the loop always terminates.
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	char *res;
	char *p;
	size_t bslen = strlen(blankstr);

	if (!(res = strdup(cmd))) {
		perror("strdup");
		exit(1);
	}

	/* Nothing to translate; avoids an endless match on "". */
	if (bslen == 0)
		return res;

	p = res;
	while ((p = strstr(p, blankstr)) != NULL) {
		*p = ' ';
		if (bslen != 1)
			/* Close the gap, moving the terminating NUL too. */
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
		/* Continue after the space that was just written. */
		p++;
	}
	return res;
}
1020
a6810074 1021struct zebra_privs_t watchfrr_privs = {
95c4aff2 1022#ifdef VTY_GROUP
a6810074 1023 .vty_group = VTY_GROUP,
95c4aff2
DL
1024#endif
1025};
1026
4f04a76b
DL
1027static struct quagga_signal_t watchfrr_signals[] = {
1028 {
1029 .signal = SIGINT,
1030 .handler = sigint,
1031 },
1032 {
1033 .signal = SIGTERM,
1034 .handler = sigint,
1035 },
1036 {
1037 .signal = SIGCHLD,
1038 .handler = sigchild,
1039 },
1040};
1041
1042FRR_DAEMON_INFO(watchfrr, WATCHFRR,
eb05883f
DL
1043 .flags = FRR_NO_PRIVSEP | FRR_NO_TCPVTY | FRR_LIMITED_CLI
1044 | FRR_NO_CFG_PID_DRY | FRR_NO_ZCLIENT,
4f04a76b
DL
1045
1046 .printhelp = printhelp,
1047 .copyright = "Copyright 2004 Andrew J. Schorr",
1048
1049 .signals = watchfrr_signals,
1050 .n_signals = array_size(watchfrr_signals),
1051
1052 .privs = &watchfrr_privs,
1053)
1054
a6810074 1055int main(int argc, char **argv)
8b886ca7 1056{
a6810074 1057 int opt;
a6810074
DL
1058 const char *pidfile = DEFAULT_PIDFILE;
1059 const char *special = "zebra";
1060 const char *blankstr = NULL;
a6810074 1061
4f04a76b
DL
1062 frr_preinit(&watchfrr_di, argc, argv);
1063 progname = watchfrr_di.progname;
1064
9272302b 1065 frr_opt_add("aAb:dek:l:i:p:r:R:S:s:t:T:z", longopts, "");
a6810074
DL
1066
1067 gs.restart.name = "all";
4f04a76b 1068 while ((opt = frr_getopt(argc, argv, NULL)) != EOF) {
a6810074
DL
1069 switch (opt) {
1070 case 0:
1071 break;
1072 case 'a':
1073 if ((gs.mode != MODE_MONITOR)
1074 && (gs.mode != MODE_SEPARATE_RESTART)) {
1075 fputs("Ambiguous operating mode selected.\n",
1076 stderr);
4f04a76b 1077 frr_help_exit(1);
a6810074
DL
1078 }
1079 gs.mode = MODE_PHASED_ZEBRA_RESTART;
1080 break;
1081 case 'A':
1082 if ((gs.mode != MODE_MONITOR)
1083 && (gs.mode != MODE_SEPARATE_RESTART)) {
1084 fputs("Ambiguous operating mode selected.\n",
1085 stderr);
4f04a76b 1086 frr_help_exit(1);
a6810074
DL
1087 }
1088 gs.mode = MODE_PHASED_ALL_RESTART;
1089 break;
1090 case 'b':
1091 blankstr = optarg;
1092 break;
a6810074
DL
1093 case 'e':
1094 gs.do_ping = 0;
1095 break;
1096 case 'k':
1097 if (!valid_command(optarg)) {
1098 fprintf(stderr,
1099 "Invalid kill command, must contain '%%s': %s\n",
1100 optarg);
4f04a76b 1101 frr_help_exit(1);
a6810074
DL
1102 }
1103 gs.stop_command = optarg;
1104 break;
1105 case 'l':
1106 {
1107 char garbage[3];
1108 if ((sscanf
1109 (optarg, "%d%1s", &gs.loglevel,
1110 garbage) != 1)
1111 || (gs.loglevel < LOG_EMERG)) {
1112 fprintf(stderr,
1113 "Invalid loglevel argument: %s\n",
1114 optarg);
4f04a76b 1115 frr_help_exit(1);
a6810074
DL
1116 }
1117 }
1118 break;
9272302b 1119 case OPTION_MINRESTART:
a6810074
DL
1120 {
1121 char garbage[3];
1122 if ((sscanf(optarg, "%ld%1s",
1123 &gs.min_restart_interval,
1124 garbage) != 1)
1125 || (gs.min_restart_interval < 0)) {
1126 fprintf(stderr,
1127 "Invalid min_restart_interval argument: %s\n",
1128 optarg);
4f04a76b 1129 frr_help_exit(1);
a6810074
DL
1130 }
1131 }
1132 break;
9272302b 1133 case OPTION_MAXRESTART:
a6810074
DL
1134 {
1135 char garbage[3];
1136 if ((sscanf(optarg, "%ld%1s",
1137 &gs.max_restart_interval,
1138 garbage) != 1)
1139 || (gs.max_restart_interval < 0)) {
1140 fprintf(stderr,
1141 "Invalid max_restart_interval argument: %s\n",
1142 optarg);
4f04a76b 1143 frr_help_exit(1);
a6810074
DL
1144 }
1145 }
1146 break;
1147 case 'i':
1148 {
1149 char garbage[3];
1150 int period;
1151 if ((sscanf(optarg, "%d%1s", &period, garbage)
1152 != 1) || (gs.period < 1)) {
1153 fprintf(stderr,
1154 "Invalid interval argument: %s\n",
1155 optarg);
4f04a76b 1156 frr_help_exit(1);
a6810074
DL
1157 }
1158 gs.period = 1000 * period;
1159 }
1160 break;
1161 case 'p':
1162 pidfile = optarg;
1163 break;
1164 case 'r':
1165 if ((gs.mode == MODE_GLOBAL_RESTART) ||
1166 (gs.mode == MODE_SEPARATE_RESTART)) {
1167 fputs("Ambiguous operating mode selected.\n",
1168 stderr);
4f04a76b 1169 frr_help_exit(1);
a6810074
DL
1170 }
1171 if (!valid_command(optarg)) {
1172 fprintf(stderr,
1173 "Invalid restart command, must contain '%%s': %s\n",
1174 optarg);
4f04a76b 1175 frr_help_exit(1);
a6810074
DL
1176 }
1177 gs.restart_command = optarg;
1178 if (gs.mode == MODE_MONITOR)
1179 gs.mode = MODE_SEPARATE_RESTART;
1180 break;
1181 case 'R':
1182 if (gs.mode != MODE_MONITOR) {
1183 fputs("Ambiguous operating mode selected.\n",
1184 stderr);
4f04a76b 1185 frr_help_exit(1);
a6810074
DL
1186 }
1187 if (strchr(optarg, '%')) {
1188 fprintf(stderr,
1189 "Invalid restart-all arg, must not contain '%%s': %s\n",
1190 optarg);
4f04a76b 1191 frr_help_exit(1);
a6810074
DL
1192 }
1193 gs.restart_command = optarg;
1194 gs.mode = MODE_GLOBAL_RESTART;
1195 break;
1196 case 's':
1197 if (!valid_command(optarg)) {
1198 fprintf(stderr,
1199 "Invalid start command, must contain '%%s': %s\n",
1200 optarg);
4f04a76b 1201 frr_help_exit(1);
a6810074
DL
1202 }
1203 gs.start_command = optarg;
1204 break;
1205 case 'S':
1206 gs.vtydir = optarg;
1207 break;
1208 case 't':
1209 {
1210 char garbage[3];
1211 if ((sscanf
1212 (optarg, "%ld%1s", &gs.timeout,
1213 garbage) != 1) || (gs.timeout < 1)) {
1214 fprintf(stderr,
1215 "Invalid timeout argument: %s\n",
1216 optarg);
4f04a76b 1217 frr_help_exit(1);
a6810074
DL
1218 }
1219 }
1220 break;
1221 case 'T':
1222 {
1223 char garbage[3];
1224 if ((sscanf
1225 (optarg, "%ld%1s", &gs.restart_timeout,
1226 garbage) != 1)
1227 || (gs.restart_timeout < 1)) {
1228 fprintf(stderr,
1229 "Invalid restart timeout argument: %s\n",
1230 optarg);
4f04a76b 1231 frr_help_exit(1);
a6810074
DL
1232 }
1233 }
1234 break;
1235 case 'z':
1236 gs.unresponsive_restart = 1;
1237 break;
a6810074
DL
1238 default:
1239 fputs("Invalid option.\n", stderr);
4f04a76b 1240 frr_help_exit(1);
a6810074 1241 }
8b886ca7 1242 }
a6810074
DL
1243
1244 if (gs.unresponsive_restart && (gs.mode == MODE_MONITOR)) {
1245 fputs("Option -z requires a -r or -R restart option.\n",
1246 stderr);
4f04a76b 1247 frr_help_exit(1);
8b886ca7 1248 }
a6810074
DL
1249 switch (gs.mode) {
1250 case MODE_MONITOR:
1251 if (gs.restart_command || gs.start_command || gs.stop_command) {
1252 fprintf(stderr,
1253 "No kill/(re)start commands needed for %s mode.\n",
1254 mode_str[gs.mode]);
4f04a76b 1255 frr_help_exit(1);
a6810074
DL
1256 }
1257 break;
1258 case MODE_GLOBAL_RESTART:
1259 case MODE_SEPARATE_RESTART:
1260 if (!gs.restart_command || gs.start_command || gs.stop_command) {
1261 fprintf(stderr,
1262 "No start/kill commands needed in [%s] mode.\n",
1263 mode_str[gs.mode]);
4f04a76b 1264 frr_help_exit(1);
a6810074
DL
1265 }
1266 break;
1267 case MODE_PHASED_ZEBRA_RESTART:
1268 case MODE_PHASED_ALL_RESTART:
1269 if (!gs.restart_command || !gs.start_command
1270 || !gs.stop_command) {
1271 fprintf(stderr,
1272 "Need start, kill, and restart commands in [%s] mode.\n",
1273 mode_str[gs.mode]);
4f04a76b 1274 frr_help_exit(1);
a6810074
DL
1275 }
1276 break;
8b886ca7 1277 }
8b886ca7 1278
a6810074
DL
1279 if (blankstr) {
1280 if (gs.restart_command)
1281 gs.restart_command =
1282 translate_blanks(gs.restart_command, blankstr);
1283 if (gs.start_command)
1284 gs.start_command =
1285 translate_blanks(gs.start_command, blankstr);
1286 if (gs.stop_command)
1287 gs.stop_command =
1288 translate_blanks(gs.stop_command, blankstr);
065de903 1289 }
8b886ca7 1290
a6810074 1291 gs.restart.interval = gs.min_restart_interval;
8b886ca7 1292
4f04a76b
DL
1293 master = frr_init();
1294
dd8376fe 1295 zlog_set_level(ZLOG_DEST_MONITOR, ZLOG_DISABLED);
eb05883f 1296 if (watchfrr_di.daemon_mode) {
dd8376fe 1297 zlog_set_level(ZLOG_DEST_SYSLOG, MIN(gs.loglevel, LOG_DEBUG));
4f04a76b
DL
1298 if (daemon (0, 0) < 0) {
1299 fprintf(stderr, "Watchquagga daemon failed: %s",
1300 strerror(errno));
1301 exit (1);
1302 }
1303 } else
dd8376fe 1304 zlog_set_level(ZLOG_DEST_STDOUT, MIN(gs.loglevel, LOG_DEBUG));
8b886ca7 1305
a6810074 1306 watchfrr_vty_init();
8b886ca7 1307
eb05883f 1308 frr_vty_serv();
8b886ca7 1309
8b886ca7 1310 {
a6810074
DL
1311 int i;
1312 struct daemon *tail = NULL;
1313
1314 for (i = optind; i < argc; i++) {
1315 struct daemon *dmn;
1316
1317 if (!(dmn = (struct daemon *)calloc(1, sizeof(*dmn)))) {
1318 fprintf(stderr, "calloc(1,%u) failed: %s\n",
1319 (u_int) sizeof(*dmn),
1320 safe_strerror(errno));
1321 return 1;
1322 }
1323 dmn->name = dmn->restart.name = argv[i];
1324 dmn->state = DAEMON_INIT;
1325 gs.numdaemons++;
1326 gs.numdown++;
1327 dmn->fd = -1;
66e78ae6
QY
1328 dmn->t_wakeup = NULL;
1329 thread_add_timer_msec(master, wakeup_init, dmn, 100 + (random() % 900),
1330 &dmn->t_wakeup);
a6810074
DL
1331 dmn->restart.interval = gs.min_restart_interval;
1332 if (tail)
1333 tail->next = dmn;
1334 else
1335 gs.daemons = dmn;
1336 tail = dmn;
1337
1338 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
1339 (gs.mode == MODE_PHASED_ALL_RESTART)) &&
1340 !strcmp(dmn->name, special))
1341 gs.special = dmn;
1342 }
1343 }
1344 if (!gs.daemons) {
1345 fputs("Must specify one or more daemons to monitor.\n", stderr);
4f04a76b 1346 frr_help_exit(1);
a6810074
DL
1347 }
1348 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
1349 (gs.mode == MODE_PHASED_ALL_RESTART)) && !gs.special) {
1350 fprintf(stderr,
1351 "In mode [%s], but cannot find master daemon %s\n",
1352 mode_str[gs.mode], special);
4f04a76b 1353 frr_help_exit(1);
8b886ca7 1354 }
8b886ca7 1355
a6810074
DL
1356 /* Make sure we're not already running. */
1357 pid_output(pidfile);
1358
1359 /* Announce which daemons are being monitored. */
1360 {
1361 struct daemon *dmn;
1362 size_t len = 0;
1363
1364 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1365 len += strlen(dmn->name) + 1;
1366
1367 {
1368 char buf[len + 1];
1369 char *p = buf;
1370
1371 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1372 if (p != buf)
1373 *p++ = ' ';
1374 strcpy(p, dmn->name);
1375 p += strlen(p);
1376 }
1377 zlog_notice("%s %s watching [%s], mode [%s]",
1378 progname, FRR_VERSION, buf,
1379 mode_str[gs.mode]);
1380 }
1381 }
8b886ca7 1382
a6810074
DL
1383 {
1384 struct thread thread;
1385
1386 while (thread_fetch(master, &thread))
1387 thread_call(&thread);
1388 }
8b886ca7 1389
a6810074
DL
1390 systemd_send_stopping();
1391 /* Not reached. */
1392 return 0;
8b886ca7 1393}