2 Monitor status of frr daemons and restart if necessary.
4 Copyright (C) 2004 Andrew J. Schorr
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
26 #include <lib/version.h>
28 #include "memory_vty.h"
/* Yields the smaller of X and Y.  Each argument appears twice in the
   expansion, so do not pass expressions with side effects. */
39 #define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
42 /* Macros to help randomize timers.  For non-negative X, JITTER(X) draws
   random() % (X+1) and subtracts X/2, giving an offset in
   [-(X/2), X-(X/2)]; FUZZY(X) is X shifted by JITTER(X/20). */
43 #define JITTER(X) ((random() % ((X)+1))-((X)/2))
44 #define FUZZY(X) ((X)+JITTER((X)/20))
46 #define DEFAULT_PERIOD 5
47 #define DEFAULT_TIMEOUT 10
48 #define DEFAULT_RESTART_TIMEOUT 20
49 #define DEFAULT_LOGLEVEL LOG_INFO
50 #define DEFAULT_MIN_RESTART 60
51 #define DEFAULT_MAX_RESTART 600
52 #ifdef PATH_WATCHFRR_PID
53 #define DEFAULT_PIDFILE PATH_WATCHFRR_PID
55 #define DEFAULT_PIDFILE STATEDIR "/watchfrr.pid"
58 #define VTYDIR DAEMON_VTY_DIR
60 #define VTYDIR STATEDIR
63 #define PING_TOKEN "PING"
65 /* Needs to be global, referenced somewhere inside libzebra. */
66 struct thread_master
*master
;
71 MODE_SEPARATE_RESTART
,
72 MODE_PHASED_ZEBRA_RESTART
,
73 MODE_PHASED_ALL_RESTART
76 static const char *mode_str
[] = {
79 "individual daemon restart",
80 "phased zebra restart",
81 "phased global restart for any failure",
88 PHASE_ZEBRA_RESTART_PENDING
,
89 PHASE_WAITING_ZEBRA_UP
92 static const char *phase_str
[] = {
95 "Waiting for other daemons to come down",
96 "Zebra restart job running",
97 "Waiting for zebra to come up",
101 #define PHASE_TIMEOUT (3*gs.restart_timeout)
103 struct restart_info
{
109 struct thread
*t_kill
;
113 static struct global_state
{
115 restart_phase_t phase
;
116 struct thread
*t_phase_hanging
;
120 long restart_timeout
;
121 long min_restart_interval
;
122 long max_restart_interval
;
124 struct daemon
*daemons
;
125 const char *restart_command
;
126 const char *start_command
;
127 const char *stop_command
;
128 struct restart_info restart
;
129 int unresponsive_restart
;
131 struct daemon
*special
; /* points to zebra when doing phased restart */
134 int numdown
; /* # of daemons that are not UP or UNRESPONSIVE */
136 .mode
= MODE_MONITOR
,.phase
= PHASE_NONE
,.vtydir
= VTYDIR
,.period
=
137 1000 * DEFAULT_PERIOD
,.timeout
=
138 DEFAULT_TIMEOUT
,.restart_timeout
=
139 DEFAULT_RESTART_TIMEOUT
,.loglevel
=
140 DEFAULT_LOGLEVEL
,.min_restart_interval
=
141 DEFAULT_MIN_RESTART
,.max_restart_interval
=
142 DEFAULT_MAX_RESTART
,.do_ping
= 1,};
153 (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
155 static const char *state_str
[] = {
165 daemon_state_t state
;
167 struct timeval echo_sent
;
169 struct thread
*t_wakeup
;
170 struct thread
*t_read
;
171 struct thread
*t_write
;
173 struct restart_info restart
;
176 static const struct option longopts
[] = {
177 {"daemon", no_argument
, NULL
, 'd'},
178 {"statedir", required_argument
, NULL
, 'S'},
179 {"no-echo", no_argument
, NULL
, 'e'},
180 {"loglevel", required_argument
, NULL
, 'l'},
181 {"interval", required_argument
, NULL
, 'i'},
182 {"timeout", required_argument
, NULL
, 't'},
183 {"restart-timeout", required_argument
, NULL
, 'T'},
184 {"restart", required_argument
, NULL
, 'r'},
185 {"start-command", required_argument
, NULL
, 's'},
186 {"kill-command", required_argument
, NULL
, 'k'},
187 {"restart-all", required_argument
, NULL
, 'R'},
188 {"all-restart", no_argument
, NULL
, 'a'},
189 {"always-all-restart", no_argument
, NULL
, 'A'},
190 {"unresponsive-restart", no_argument
, NULL
, 'z'},
191 {"min-restart-interval", required_argument
, NULL
, 'm'},
192 {"max-restart-interval", required_argument
, NULL
, 'M'},
193 {"pid-file", required_argument
, NULL
, 'p'},
194 {"blank-string", required_argument
, NULL
, 'b'},
195 {"help", no_argument
, NULL
, 'h'},
196 {"version", no_argument
, NULL
, 'v'},
200 static int try_connect(struct daemon
*dmn
);
201 static int wakeup_send_echo(struct thread
*t_wakeup
);
202 static void try_restart(struct daemon
*dmn
);
203 static void phase_check(void);
205 static int usage(const char *progname
, int status
)
208 fprintf(stderr
, "Try `%s --help' for more information.\n",
211 printf("Usage : %s [OPTION...] <daemon name> ...\n\n\
212 Watchdog program to monitor status of frr daemons and try to restart\n\
213 them if they are down or unresponsive. It determines whether a daemon is\n\
214 up based on whether it can connect to the daemon's vty unix stream socket.\n\
215 It then repeatedly sends echo commands over that socket to determine whether\n\
216 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
217 on the socket connection and know immediately that the daemon is down.\n\n\
218 The daemons to be monitored should be listed on the command line.\n\n\
219 This program can run in one of 5 modes:\n\n\
221 Just monitor and report on status changes. Example:\n\
222 %s -d zebra ospfd bgpd\n\n\
224 Whenever any daemon hangs or crashes, use the given command to restart\n\
225 them all. Example:\n\
227 -R '/sbin/service zebra restart; /sbin/service ospfd restart' \\\n\
230 When any single daemon hangs or crashes, restart only the daemon that's\n\
231 in trouble using the supplied restart command. Example:\n\
232 %s -dz -r '/sbin/service %%s restart' zebra ospfd bgpd\n\n\
234 The same as the previous mode, except that there is special treatment when\n\
235 the zebra daemon is in trouble. In that case, a phased restart approach\n\
236 is used: 1. stop all other daemons; 2. restart zebra; 3. start the other\n\
238 %s -adz -r '/sbin/service %%s restart' \\\n\
239 -s '/sbin/service %%s start' \\\n\
240 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
242 This is the same as the previous mode, except that the phased restart\n\
243 procedure is used whenever any of the daemons hangs or crashes. Example:\n\
244 %s -Adz -r '/sbin/service %%s restart' \\\n\
245 -s '/sbin/service %%s start' \\\n\
246 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
247 As of this writing, it is believed that mode 2 [%s]\n\
248 is not safe, and mode 3 [%s] may not be safe with some of the\n\
249 routing daemons.\n\n\
250 In order to avoid attempting to restart the daemons in a fast loop,\n\
251 the -m and -M options allow you to control the minimum delay between\n\
252 restart commands. The minimum restart delay is recalculated each time\n\
253 a restart is attempted: if the time since the last restart attempt exceeds\n\
254 twice the -M value, then the restart delay is set to the -m value.\n\
255 Otherwise, the interval is doubled (but capped at the -M value).\n\n", progname
, mode_str
[0], progname
, mode_str
[1], progname
, mode_str
[2], progname
, mode_str
[3], progname
, mode_str
[4], progname
, mode_str
[2], mode_str
[3]);
258 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
259 to syslog instead of stdout.\n\
260 -S, --statedir Set the vty socket directory (default is %s)\n\
261 -e, --no-echo Do not ping the daemons to test responsiveness (this\n\
262 option is necessary if the daemons do not support the\n\
264 -l, --loglevel Set the logging level (default is %d).\n\
265 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
266 but it can be set higher than %d if extra-verbose debugging\n\
267 messages are desired.\n\
268 -m, --min-restart-interval\n\
269 Set the minimum seconds to wait between invocations of daemon\n\
270 restart commands (default is %d).\n\
271 -M, --max-restart-interval\n\
272 Set the maximum seconds to wait between invocations of daemon\n\
273 restart commands (default is %d).\n\
274 -i, --interval Set the status polling interval in seconds (default is %d)\n\
275 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
276 -T, --restart-timeout\n\
277 Set the restart (kill) timeout in seconds (default is %d).\n\
278 If any background jobs are still running after this much\n\
279 time has elapsed, they will be killed.\n\
280 -r, --restart Supply a Bourne shell command to use to restart a single\n\
281 daemon. The command string should include '%%s' where the\n\
282 name of the daemon should be substituted.\n\
283 Note that -r and -R are incompatible.\n\
284 -s, --start-command\n\
285 Supply a Bourne shell to command to use to start a single\n\
286 daemon. The command string should include '%%s' where the\n\
287 name of the daemon should be substituted.\n\
288 -k, --kill-command\n\
289 Supply a Bourne shell to command to use to stop a single\n\
290 daemon. The command string should include '%%s' where the\n\
291 name of the daemon should be substituted.\n\
293 When one or more daemons is down, try to restart everything\n\
294 using the Bourne shell command supplied as the argument.\n\
295 Note that -r and -R are incompatible.\n\
296 -z, --unresponsive-restart\n\
297 When a daemon is unresponsive, treat it as being down for\n\
300 When zebra hangs or crashes, restart all daemons using\n\
301 this phased approach: 1. stop all other daemons; 2. restart\n\
302 zebra; 3. start other daemons. Requires -r, -s, and -k.\n\
303 -A, --always-all-restart\n\
304 When any daemon (not just zebra) hangs or crashes, use the\n\
305 same phased restart mechanism described above for -a.\n\
306 Requires -r, -s, and -k.\n\
307 -p, --pid-file Set process identifier file name\n\
309 -b, --blank-string\n\
310 When the supplied argument string is found in any of the\n\
311 various shell command arguments (-r, -s, -k, or -R), replace\n\
312 it with a space. This is an ugly hack to circumvent problems\n\
313 passing command-line arguments with embedded spaces.\n\
314 -v, --version Print program version\n\
315 -h, --help Display this help and exit\n", VTYDIR
, DEFAULT_LOGLEVEL
, LOG_EMERG
, LOG_DEBUG
, LOG_DEBUG
, DEFAULT_MIN_RESTART
, DEFAULT_MAX_RESTART
, DEFAULT_PERIOD
, DEFAULT_TIMEOUT
, DEFAULT_RESTART_TIMEOUT
, DEFAULT_PIDFILE
);
321 static pid_t
run_background(char *shell_cmd
)
325 switch (child
= fork()) {
327 zlog_err("fork failed, cannot run command [%s]: %s",
328 shell_cmd
, safe_strerror(errno
));
332 /* Use separate process group so child processes can be killed easily. */
333 if (setpgid(0, 0) < 0)
334 zlog_warn("warning: setpgid(0,0) failed: %s",
335 safe_strerror(errno
));
339 char *const argv
[4] = { shell
, dashc
, shell_cmd
, NULL
};
340 execv("/bin/sh", argv
);
341 zlog_err("execv(/bin/sh -c '%s') failed: %s",
342 shell_cmd
, safe_strerror(errno
));
346 /* Parent process: we will reap the child later. */
347 zlog_err("Forked background command [pid %d]: %s", (int)child
,
/*
 * Compute the wall-clock time that has passed since *start_time.
 *
 * result:     receives the elapsed interval (current gettimeofday() value
 *             minus *start_time), with tv_usec normalised to be
 *             non-negative by borrowing whole seconds from tv_sec.
 * start_time: the earlier timestamp; not modified.
 *
 * Returns result, so the call can be used directly inside an expression.
 */
static struct timeval *time_elapsed(struct timeval *result,
				    const struct timeval *start_time)
{
	gettimeofday(result, NULL);

	result->tv_sec -= start_time->tv_sec;
	result->tv_usec -= start_time->tv_usec;

	/* Borrow one second for every million microseconds we are short. */
	for (; result->tv_usec < 0; result->tv_sec--)
		result->tv_usec += 1000000L;

	return result;
}
366 static int restart_kill(struct thread
*t_kill
)
368 struct restart_info
*restart
= THREAD_ARG(t_kill
);
369 struct timeval delay
;
371 time_elapsed(&delay
, &restart
->time
);
372 zlog_warn("Warning: %s %s child process %d still running after "
373 "%ld seconds, sending signal %d",
374 restart
->what
, restart
->name
, (int)restart
->pid
,
375 (long)delay
.tv_sec
, (restart
->kills
? SIGKILL
: SIGTERM
));
376 kill(-restart
->pid
, (restart
->kills
? SIGKILL
: SIGTERM
));
378 restart
->t_kill
= thread_add_timer(master
, restart_kill
, restart
,
383 static struct restart_info
*find_child(pid_t child
)
385 if (gs
.mode
== MODE_GLOBAL_RESTART
) {
386 if (gs
.restart
.pid
== child
)
390 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
391 if (dmn
->restart
.pid
== child
)
392 return &dmn
->restart
;
398 static void sigchild(void)
404 struct restart_info
*restart
;
406 switch (child
= waitpid(-1, &status
, WNOHANG
)) {
408 zlog_err("waitpid failed: %s", safe_strerror(errno
));
411 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
415 if (child
== integrated_write_pid
) {
416 integrated_write_sigchld(status
);
420 if ((restart
= find_child(child
)) != NULL
) {
421 name
= restart
->name
;
422 what
= restart
->what
;
425 thread_cancel(restart
->t_kill
);
426 restart
->t_kill
= NULL
;
427 /* Update restart time to reflect the time the command completed. */
428 gettimeofday(&restart
->time
, NULL
);
431 ("waitpid returned status for an unknown child process %d",
436 if (WIFSTOPPED(status
))
437 zlog_warn("warning: %s %s process %d is stopped",
438 what
, name
, (int)child
);
439 else if (WIFSIGNALED(status
))
440 zlog_warn("%s %s process %d terminated due to signal %d",
441 what
, name
, (int)child
, WTERMSIG(status
));
442 else if (WIFEXITED(status
)) {
443 if (WEXITSTATUS(status
) != 0)
445 ("%s %s process %d exited with non-zero status %d",
446 what
, name
, (int)child
, WEXITSTATUS(status
));
448 zlog_debug("%s %s process %d exited normally", what
,
451 zlog_err("cannot interpret %s %s process %d wait status 0x%x",
452 what
, name
, (int)child
, status
);
457 run_job(struct restart_info
*restart
, const char *cmdtype
, const char *command
,
458 int force
, int update_interval
)
460 struct timeval delay
;
462 if (gs
.loglevel
> LOG_DEBUG
+ 1)
463 zlog_debug("attempting to %s %s", cmdtype
, restart
->name
);
466 if (gs
.loglevel
> LOG_DEBUG
+ 1)
468 ("cannot %s %s, previous pid %d still running",
469 cmdtype
, restart
->name
, (int)restart
->pid
);
473 /* Note: time_elapsed test must come before the force test, since we need
474 to make sure that delay is initialized for use below in updating the
476 if ((time_elapsed(&delay
, &restart
->time
)->tv_sec
< restart
->interval
)
478 if (gs
.loglevel
> LOG_DEBUG
+ 1)
479 zlog_debug("postponing %s %s: "
480 "elapsed time %ld < retry interval %ld",
481 cmdtype
, restart
->name
, (long)delay
.tv_sec
,
486 gettimeofday(&restart
->time
, NULL
);
489 char cmd
[strlen(command
) + strlen(restart
->name
) + 1];
490 snprintf(cmd
, sizeof(cmd
), command
, restart
->name
);
491 if ((restart
->pid
= run_background(cmd
)) > 0) {
493 thread_add_timer(master
, restart_kill
, restart
,
495 restart
->what
= cmdtype
;
501 /* Calculate the new restart interval. */
502 if (update_interval
) {
503 if (delay
.tv_sec
> 2 * gs
.max_restart_interval
)
504 restart
->interval
= gs
.min_restart_interval
;
505 else if ((restart
->interval
*= 2) > gs
.max_restart_interval
)
506 restart
->interval
= gs
.max_restart_interval
;
507 if (gs
.loglevel
> LOG_DEBUG
+ 1)
508 zlog_debug("restart %s interval is now %ld",
509 restart
->name
, restart
->interval
);
514 #define SET_READ_HANDLER(DMN) \
515 (DMN)->t_read = thread_add_read(master,handle_read,(DMN),(DMN)->fd)
517 #define SET_WAKEUP_DOWN(DMN) \
518 (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_down,(DMN), \
521 #define SET_WAKEUP_UNRESPONSIVE(DMN) \
522 (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_unresponsive,(DMN), \
525 #define SET_WAKEUP_ECHO(DMN) \
526 (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_send_echo,(DMN), \
529 static int wakeup_down(struct thread
*t_wakeup
)
531 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
533 dmn
->t_wakeup
= NULL
;
534 if (try_connect(dmn
) < 0)
535 SET_WAKEUP_DOWN(dmn
);
536 if ((dmn
->connect_tries
> 1) && (dmn
->state
!= DAEMON_UP
))
541 static int wakeup_init(struct thread
*t_wakeup
)
543 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
545 dmn
->t_wakeup
= NULL
;
546 if (try_connect(dmn
) < 0) {
547 SET_WAKEUP_DOWN(dmn
);
548 zlog_err("%s state -> down : initial connection attempt failed",
550 dmn
->state
= DAEMON_DOWN
;
555 static void daemon_down(struct daemon
*dmn
, const char *why
)
557 if (IS_UP(dmn
) || (dmn
->state
== DAEMON_INIT
))
558 zlog_err("%s state -> down : %s", dmn
->name
, why
);
559 else if (gs
.loglevel
> LOG_DEBUG
)
560 zlog_debug("%s still down : %s", dmn
->name
, why
);
563 dmn
->state
= DAEMON_DOWN
;
568 THREAD_OFF(dmn
->t_read
);
569 THREAD_OFF(dmn
->t_write
);
570 THREAD_OFF(dmn
->t_wakeup
);
571 if (try_connect(dmn
) < 0)
572 SET_WAKEUP_DOWN(dmn
);
576 static int handle_read(struct thread
*t_read
)
578 struct daemon
*dmn
= THREAD_ARG(t_read
);
579 static const char resp
[sizeof(PING_TOKEN
) + 4] = PING_TOKEN
"\n";
580 char buf
[sizeof(resp
) + 100];
582 struct timeval delay
;
585 if ((rc
= read(dmn
->fd
, buf
, sizeof(buf
))) < 0) {
588 if (ERRNO_IO_RETRY(errno
)) {
589 /* Pretend it never happened. */
590 SET_READ_HANDLER(dmn
);
593 snprintf(why
, sizeof(why
), "unexpected read error: %s",
594 safe_strerror(errno
));
595 daemon_down(dmn
, why
);
599 daemon_down(dmn
, "read returned EOF");
602 if (!dmn
->echo_sent
.tv_sec
) {
603 char why
[sizeof(buf
) + 100];
604 snprintf(why
, sizeof(why
),
605 "unexpected read returns %d bytes: %.*s", (int)rc
,
607 daemon_down(dmn
, why
);
611 /* We are expecting an echo response: is there any chance that the
612 response would not be returned entirely in the first read? That
613 seems inconceivable... */
614 if ((rc
!= sizeof(resp
)) || memcmp(buf
, resp
, sizeof(resp
))) {
615 char why
[100 + sizeof(buf
)];
616 snprintf(why
, sizeof(why
),
617 "read returned bad echo response of %d bytes "
618 "(expecting %u): %.*s", (int)rc
, (u_int
) sizeof(resp
),
620 daemon_down(dmn
, why
);
624 time_elapsed(&delay
, &dmn
->echo_sent
);
625 dmn
->echo_sent
.tv_sec
= 0;
626 if (dmn
->state
== DAEMON_UNRESPONSIVE
) {
627 if (delay
.tv_sec
< gs
.timeout
) {
628 dmn
->state
= DAEMON_UP
;
630 ("%s state -> up : echo response received after %ld.%06ld "
631 "seconds", dmn
->name
, (long)delay
.tv_sec
,
632 (long)delay
.tv_usec
);
635 ("%s: slow echo response finally received after %ld.%06ld "
636 "seconds", dmn
->name
, (long)delay
.tv_sec
,
637 (long)delay
.tv_usec
);
638 } else if (gs
.loglevel
> LOG_DEBUG
+ 1)
639 zlog_debug("%s: echo response received after %ld.%06ld seconds",
640 dmn
->name
, (long)delay
.tv_sec
, (long)delay
.tv_usec
);
642 SET_READ_HANDLER(dmn
);
644 thread_cancel(dmn
->t_wakeup
);
645 SET_WAKEUP_ECHO(dmn
);
651 * Wait until we notice that all daemons are up before
652 * notifying systemd that we are ready
654 static void daemon_send_ready(void)
657 if (!sent
&& gs
.numdown
== 0) {
658 #if defined (HAVE_CUMULUS)
661 fp
= fopen(DAEMON_VTY_DIR
"/watchfrr.started", "w");
665 ("Watchfrr: Notifying Systemd we are up and running");
666 systemd_send_started(master
, 0);
671 static void daemon_up(struct daemon
*dmn
, const char *why
)
673 dmn
->state
= DAEMON_UP
;
675 dmn
->connect_tries
= 0;
676 zlog_notice("%s state -> up : %s", dmn
->name
, why
);
679 SET_WAKEUP_ECHO(dmn
);
683 static int check_connect(struct thread
*t_write
)
685 struct daemon
*dmn
= THREAD_ARG(t_write
);
687 socklen_t reslen
= sizeof(sockerr
);
690 if (getsockopt(dmn
->fd
, SOL_SOCKET
, SO_ERROR
, (char *)&sockerr
, &reslen
)
692 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn
->name
,
693 safe_strerror(errno
));
695 "getsockopt failed checking connection success");
698 if ((reslen
== sizeof(sockerr
)) && sockerr
) {
700 snprintf(why
, sizeof(why
),
701 "getsockopt reports that connection attempt failed: %s",
702 safe_strerror(sockerr
));
703 daemon_down(dmn
, why
);
707 daemon_up(dmn
, "delayed connect succeeded");
711 static int wakeup_connect_hanging(struct thread
*t_wakeup
)
713 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
716 dmn
->t_wakeup
= NULL
;
717 snprintf(why
, sizeof(why
),
718 "connection attempt timed out after %ld seconds", gs
.timeout
);
719 daemon_down(dmn
, why
);
723 /* Making connection to protocol daemon. */
724 static int try_connect(struct daemon
*dmn
)
727 struct sockaddr_un addr
;
730 if (gs
.loglevel
> LOG_DEBUG
+ 1)
731 zlog_debug("%s: attempting to connect", dmn
->name
);
732 dmn
->connect_tries
++;
734 memset(&addr
, 0, sizeof(struct sockaddr_un
));
735 addr
.sun_family
= AF_UNIX
;
736 snprintf(addr
.sun_path
, sizeof(addr
.sun_path
), "%s/%s.vty",
737 gs
.vtydir
, dmn
->name
);
738 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
739 len
= addr
.sun_len
= SUN_LEN(&addr
);
741 len
= sizeof(addr
.sun_family
) + strlen(addr
.sun_path
);
742 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
744 /* Quick check to see if we might succeed before we go to the trouble
745 of creating a socket. */
746 if (access(addr
.sun_path
, W_OK
) < 0) {
748 zlog_err("%s: access to socket %s denied: %s",
749 dmn
->name
, addr
.sun_path
,
750 safe_strerror(errno
));
754 if ((sock
= socket(AF_UNIX
, SOCK_STREAM
, 0)) < 0) {
755 zlog_err("%s(%s): cannot make socket: %s",
756 __func__
, addr
.sun_path
, safe_strerror(errno
));
760 if (set_nonblocking(sock
) < 0 || set_cloexec(sock
) < 0) {
761 zlog_err("%s(%s): set_nonblocking/cloexec(%d) failed",
762 __func__
, addr
.sun_path
, sock
);
767 if (connect(sock
, (struct sockaddr
*)&addr
, len
) < 0) {
768 if ((errno
!= EINPROGRESS
) && (errno
!= EWOULDBLOCK
)) {
769 if (gs
.loglevel
> LOG_DEBUG
)
770 zlog_debug("%s(%s): connect failed: %s",
771 __func__
, addr
.sun_path
,
772 safe_strerror(errno
));
776 if (gs
.loglevel
> LOG_DEBUG
)
777 zlog_debug("%s: connection in progress", dmn
->name
);
778 dmn
->state
= DAEMON_CONNECTING
;
781 thread_add_write(master
, check_connect
, dmn
, dmn
->fd
);
783 thread_add_timer(master
, wakeup_connect_hanging
, dmn
,
785 SET_READ_HANDLER(dmn
);
790 SET_READ_HANDLER(dmn
);
791 daemon_up(dmn
, "connect succeeded");
795 static int phase_hanging(struct thread
*t_hanging
)
797 gs
.t_phase_hanging
= NULL
;
798 zlog_err("Phase [%s] hanging for %ld seconds, aborting phased restart",
799 phase_str
[gs
.phase
], PHASE_TIMEOUT
);
800 gs
.phase
= PHASE_NONE
;
804 static void set_phase(restart_phase_t new_phase
)
806 gs
.phase
= new_phase
;
807 if (gs
.t_phase_hanging
)
808 thread_cancel(gs
.t_phase_hanging
);
809 gs
.t_phase_hanging
= thread_add_timer(master
, phase_hanging
, NULL
,
813 static void phase_check(void)
818 case PHASE_STOPS_PENDING
:
822 ("Phased restart: all routing daemon stop jobs have completed.");
823 set_phase(PHASE_WAITING_DOWN
);
826 case PHASE_WAITING_DOWN
:
827 if (gs
.numdown
+ IS_UP(gs
.special
) < gs
.numdaemons
)
829 zlog_info("Phased restart: all routing daemons now down.");
830 run_job(&gs
.special
->restart
, "restart", gs
.restart_command
, 1,
832 set_phase(PHASE_ZEBRA_RESTART_PENDING
);
835 case PHASE_ZEBRA_RESTART_PENDING
:
836 if (gs
.special
->restart
.pid
)
838 zlog_info("Phased restart: %s restart job completed.",
840 set_phase(PHASE_WAITING_ZEBRA_UP
);
843 case PHASE_WAITING_ZEBRA_UP
:
844 if (!IS_UP(gs
.special
))
846 zlog_info("Phased restart: %s is now up.", gs
.special
->name
);
849 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
850 if (dmn
!= gs
.special
)
851 run_job(&dmn
->restart
, "start",
852 gs
.start_command
, 1, 0);
855 gs
.phase
= PHASE_NONE
;
856 THREAD_OFF(gs
.t_phase_hanging
);
857 zlog_notice("Phased global restart has completed.");
862 static void try_restart(struct daemon
*dmn
)
867 case MODE_GLOBAL_RESTART
:
868 run_job(&gs
.restart
, "restart", gs
.restart_command
, 0, 1);
870 case MODE_SEPARATE_RESTART
:
871 run_job(&dmn
->restart
, "restart", gs
.restart_command
, 0, 1);
873 case MODE_PHASED_ZEBRA_RESTART
:
874 if (dmn
!= gs
.special
) {
875 if ((gs
.special
->state
== DAEMON_UP
)
876 && (gs
.phase
== PHASE_NONE
))
877 run_job(&dmn
->restart
, "restart",
878 gs
.restart_command
, 0, 1);
881 ("%s: postponing restart attempt because master %s daemon "
882 "not up [%s], or phased restart in progress",
883 dmn
->name
, gs
.special
->name
,
884 state_str
[gs
.special
->state
]);
889 case MODE_PHASED_ALL_RESTART
:
890 if ((gs
.phase
!= PHASE_NONE
) || gs
.numpids
) {
891 if (gs
.loglevel
> LOG_DEBUG
+ 1)
893 ("postponing phased global restart: restart already in "
894 "progress [%s], or outstanding child processes [%d]",
895 phase_str
[gs
.phase
], gs
.numpids
);
898 /* Is it too soon for a restart? */
900 struct timeval delay
;
901 if (time_elapsed(&delay
, &gs
.special
->restart
.time
)->
902 tv_sec
< gs
.special
->restart
.interval
) {
903 if (gs
.loglevel
> LOG_DEBUG
+ 1)
905 ("postponing phased global restart: "
906 "elapsed time %ld < retry interval %ld",
908 gs
.special
->restart
.interval
);
912 run_job(&gs
.restart
, "restart", gs
.restart_command
, 0, 1);
915 zlog_err("error: unknown restart mode %d", gs
.mode
);
920 static int wakeup_unresponsive(struct thread
*t_wakeup
)
922 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
924 dmn
->t_wakeup
= NULL
;
925 if (dmn
->state
!= DAEMON_UNRESPONSIVE
)
926 zlog_err("%s: no longer unresponsive (now %s), "
927 "wakeup should have been cancelled!",
928 dmn
->name
, state_str
[dmn
->state
]);
930 SET_WAKEUP_UNRESPONSIVE(dmn
);
936 static int wakeup_no_answer(struct thread
*t_wakeup
)
938 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
940 dmn
->t_wakeup
= NULL
;
941 dmn
->state
= DAEMON_UNRESPONSIVE
;
942 zlog_err("%s state -> unresponsive : no response yet to ping "
943 "sent %ld seconds ago", dmn
->name
, gs
.timeout
);
944 if (gs
.unresponsive_restart
) {
945 SET_WAKEUP_UNRESPONSIVE(dmn
);
951 static int wakeup_send_echo(struct thread
*t_wakeup
)
953 static const char echocmd
[] = "echo " PING_TOKEN
;
955 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
957 dmn
->t_wakeup
= NULL
;
958 if (((rc
= write(dmn
->fd
, echocmd
, sizeof(echocmd
))) < 0) ||
959 ((size_t) rc
!= sizeof(echocmd
))) {
960 char why
[100 + sizeof(echocmd
)];
961 snprintf(why
, sizeof(why
),
962 "write '%s' returned %d instead of %u", echocmd
,
963 (int)rc
, (u_int
) sizeof(echocmd
));
964 daemon_down(dmn
, why
);
966 gettimeofday(&dmn
->echo_sent
, NULL
);
968 thread_add_timer(master
, wakeup_no_answer
, dmn
, gs
.timeout
);
973 static void sigint(void)
975 zlog_notice("Terminating on signal");
976 systemd_send_stopping();
/*
 * Check a per-daemon shell command template.
 *
 * cmd: the command string supplied on the command line.
 *
 * Returns 1 when cmd contains exactly one '%' character and that character
 * is immediately followed by 's'; returns 0 otherwise (no '%', a '%' not
 * followed by 's', or any further '%' later in the string).
 */
static int valid_command(const char *cmd)
{
	const char *percent = strchr(cmd, '%');

	if (percent == NULL)
		return 0;
	if (percent[1] != 's')
		return 0;

	/* Any additional '%' after the first one disqualifies the command. */
	return strchr(percent + 1, '%') == NULL;
}
988 /* This is an ugly hack to circumvent problems with passing command-line
989 arguments that contain spaces. The fix is to use a configuration file. */
/*
 * Return a freshly allocated copy of cmd in which every occurrence of
 * blankstr has been collapsed to a single space character.
 *
 * cmd:      shell command string from the command line; not modified.
 * blankstr: placeholder token that stands in for a space.
 *
 * An empty blankstr matches nothing and the copy is returned unchanged.
 * (Previously strstr() matched it at every position and the in-place shift
 * grew the string beyond its allocation without ever terminating.)
 * Scanning resumes just after each inserted space rather than from the
 * start of the string, so a blankstr that itself contains a space cannot be
 * matched again at the same spot forever.
 *
 * The caller owns the returned buffer.
 * NOTE(review): the allocation-failure branch was not visible in the
 * listing this was reviewed from; perror()+exit(1) is a reconstruction,
 * confirm against the original before merging.
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	char *res;
	char *p;
	size_t bslen = strlen(blankstr);

	if (!(res = strdup(cmd))) {
		perror("strdup");
		exit(1);
	}

	/* Nothing can sensibly be replaced by an empty placeholder. */
	if (bslen == 0)
		return res;

	p = res;
	while ((p = strstr(p, blankstr)) != NULL) {
		/* Overwrite the first placeholder byte, then step past it. */
		*p++ = ' ';
		/* Close the gap left by the remaining bslen-1 placeholder bytes
		   (moves the trailing NUL as well). */
		if (bslen != 1)
			memmove(p, p + bslen - 1, strlen(p + bslen - 1) + 1);
	}
	return res;
}
1008 struct zebra_privs_t watchfrr_privs
= {
1010 .vty_group
= VTY_GROUP
,
1014 int main(int argc
, char **argv
)
1016 const char *progname
;
1018 int daemon_mode
= 0;
1019 const char *pidfile
= DEFAULT_PIDFILE
;
1020 const char *special
= "zebra";
1021 const char *blankstr
= NULL
;
1022 static struct quagga_signal_t my_signals
[] = {
1033 .handler
= sigchild
,
1037 if ((progname
= strrchr(argv
[0], '/')) != NULL
)
1042 gs
.restart
.name
= "all";
1044 getopt_long(argc
, argv
, "aAb:dek:l:m:M:i:p:r:R:S:s:t:T:zvh",
1045 longopts
, 0)) != EOF
) {
1050 if ((gs
.mode
!= MODE_MONITOR
)
1051 && (gs
.mode
!= MODE_SEPARATE_RESTART
)) {
1052 fputs("Ambiguous operating mode selected.\n",
1054 return usage(progname
, 1);
1056 gs
.mode
= MODE_PHASED_ZEBRA_RESTART
;
1059 if ((gs
.mode
!= MODE_MONITOR
)
1060 && (gs
.mode
!= MODE_SEPARATE_RESTART
)) {
1061 fputs("Ambiguous operating mode selected.\n",
1063 return usage(progname
, 1);
1065 gs
.mode
= MODE_PHASED_ALL_RESTART
;
1077 if (!valid_command(optarg
)) {
1079 "Invalid kill command, must contain '%%s': %s\n",
1081 return usage(progname
, 1);
1083 gs
.stop_command
= optarg
;
1089 (optarg
, "%d%1s", &gs
.loglevel
,
1091 || (gs
.loglevel
< LOG_EMERG
)) {
1093 "Invalid loglevel argument: %s\n",
1095 return usage(progname
, 1);
1102 if ((sscanf(optarg
, "%ld%1s",
1103 &gs
.min_restart_interval
,
1105 || (gs
.min_restart_interval
< 0)) {
1107 "Invalid min_restart_interval argument: %s\n",
1109 return usage(progname
, 1);
1116 if ((sscanf(optarg
, "%ld%1s",
1117 &gs
.max_restart_interval
,
1119 || (gs
.max_restart_interval
< 0)) {
1121 "Invalid max_restart_interval argument: %s\n",
1123 return usage(progname
, 1);
1131 if ((sscanf(optarg
, "%d%1s", &period
, garbage
)
1132 != 1) || (gs
.period
< 1)) {
1134 "Invalid interval argument: %s\n",
1136 return usage(progname
, 1);
1138 gs
.period
= 1000 * period
;
1145 if ((gs
.mode
== MODE_GLOBAL_RESTART
) ||
1146 (gs
.mode
== MODE_SEPARATE_RESTART
)) {
1147 fputs("Ambiguous operating mode selected.\n",
1149 return usage(progname
, 1);
1151 if (!valid_command(optarg
)) {
1153 "Invalid restart command, must contain '%%s': %s\n",
1155 return usage(progname
, 1);
1157 gs
.restart_command
= optarg
;
1158 if (gs
.mode
== MODE_MONITOR
)
1159 gs
.mode
= MODE_SEPARATE_RESTART
;
1162 if (gs
.mode
!= MODE_MONITOR
) {
1163 fputs("Ambiguous operating mode selected.\n",
1165 return usage(progname
, 1);
1167 if (strchr(optarg
, '%')) {
1169 "Invalid restart-all arg, must not contain '%%s': %s\n",
1171 return usage(progname
, 1);
1173 gs
.restart_command
= optarg
;
1174 gs
.mode
= MODE_GLOBAL_RESTART
;
1177 if (!valid_command(optarg
)) {
1179 "Invalid start command, must contain '%%s': %s\n",
1181 return usage(progname
, 1);
1183 gs
.start_command
= optarg
;
1192 (optarg
, "%ld%1s", &gs
.timeout
,
1193 garbage
) != 1) || (gs
.timeout
< 1)) {
1195 "Invalid timeout argument: %s\n",
1197 return usage(progname
, 1);
1205 (optarg
, "%ld%1s", &gs
.restart_timeout
,
1207 || (gs
.restart_timeout
< 1)) {
1209 "Invalid restart timeout argument: %s\n",
1211 return usage(progname
, 1);
1216 gs
.unresponsive_restart
= 1;
1219 printf("%s version %s\n", progname
, FRR_VERSION
);
1220 puts("Copyright 2004 Andrew J. Schorr");
1223 return usage(progname
, 0);
1225 fputs("Invalid option.\n", stderr
);
1226 return usage(progname
, 1);
1230 if (gs
.unresponsive_restart
&& (gs
.mode
== MODE_MONITOR
)) {
1231 fputs("Option -z requires a -r or -R restart option.\n",
1233 return usage(progname
, 1);
1237 if (gs
.restart_command
|| gs
.start_command
|| gs
.stop_command
) {
1239 "No kill/(re)start commands needed for %s mode.\n",
1241 return usage(progname
, 1);
1244 case MODE_GLOBAL_RESTART
:
1245 case MODE_SEPARATE_RESTART
:
1246 if (!gs
.restart_command
|| gs
.start_command
|| gs
.stop_command
) {
1248 "No start/kill commands needed in [%s] mode.\n",
1250 return usage(progname
, 1);
1253 case MODE_PHASED_ZEBRA_RESTART
:
1254 case MODE_PHASED_ALL_RESTART
:
1255 if (!gs
.restart_command
|| !gs
.start_command
1256 || !gs
.stop_command
) {
1258 "Need start, kill, and restart commands in [%s] mode.\n",
1260 return usage(progname
, 1);
1266 if (gs
.restart_command
)
1267 gs
.restart_command
=
1268 translate_blanks(gs
.restart_command
, blankstr
);
1269 if (gs
.start_command
)
1271 translate_blanks(gs
.start_command
, blankstr
);
1272 if (gs
.stop_command
)
1274 translate_blanks(gs
.stop_command
, blankstr
);
1277 gs
.restart
.interval
= gs
.min_restart_interval
;
1279 zprivs_init(&watchfrr_privs
);
1281 master
= thread_master_create();
1285 watchfrr_vty_init();
1286 vty_serv_sock(NULL
, 0, WATCHFRR_VTYSH_PATH
);
1288 signal_init(master
, array_size(my_signals
), my_signals
);
1289 srandom(time(NULL
));
1293 struct daemon
*tail
= NULL
;
1295 for (i
= optind
; i
< argc
; i
++) {
1298 if (!(dmn
= (struct daemon
*)calloc(1, sizeof(*dmn
)))) {
1299 fprintf(stderr
, "calloc(1,%u) failed: %s\n",
1300 (u_int
) sizeof(*dmn
),
1301 safe_strerror(errno
));
1304 dmn
->name
= dmn
->restart
.name
= argv
[i
];
1305 dmn
->state
= DAEMON_INIT
;
1310 thread_add_timer_msec(master
, wakeup_init
, dmn
,
1311 100 + (random() % 900));
1312 dmn
->restart
.interval
= gs
.min_restart_interval
;
1319 if (((gs
.mode
== MODE_PHASED_ZEBRA_RESTART
) ||
1320 (gs
.mode
== MODE_PHASED_ALL_RESTART
)) &&
1321 !strcmp(dmn
->name
, special
))
1326 fputs("Must specify one or more daemons to monitor.\n", stderr
);
1327 return usage(progname
, 1);
1329 if (((gs
.mode
== MODE_PHASED_ZEBRA_RESTART
) ||
1330 (gs
.mode
== MODE_PHASED_ALL_RESTART
)) && !gs
.special
) {
1332 "In mode [%s], but cannot find master daemon %s\n",
1333 mode_str
[gs
.mode
], special
);
1334 return usage(progname
, 1);
1337 zlog_default
= openzlog(progname
, ZLOG_WATCHFRR
, 0,
1338 LOG_CONS
| LOG_NDELAY
| LOG_PID
, LOG_DAEMON
);
1339 zlog_set_level(NULL
, ZLOG_DEST_MONITOR
, ZLOG_DISABLED
);
1341 zlog_set_level(NULL
, ZLOG_DEST_SYSLOG
,
1342 MIN(gs
.loglevel
, LOG_DEBUG
));
1343 if (daemon(0, 0) < 0) {
1344 fprintf(stderr
, "Watchfrr daemon failed: %s",
1349 zlog_set_level(NULL
, ZLOG_DEST_STDOUT
,
1350 MIN(gs
.loglevel
, LOG_DEBUG
));
1352 /* Make sure we're not already running. */
1353 pid_output(pidfile
);
1355 /* Announce which daemons are being monitored. */
1360 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
1361 len
+= strlen(dmn
->name
) + 1;
1367 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
1370 strcpy(p
, dmn
->name
);
1373 zlog_notice("%s %s watching [%s], mode [%s]",
1374 progname
, FRR_VERSION
, buf
,
1380 struct thread thread
;
1382 while (thread_fetch(master
, &thread
))
1383 thread_call(&thread
);
1386 systemd_send_stopping();