2 * Monitor status of frr daemons and restart if necessary.
4 * Copyright (C) 2004 Andrew J. Schorr
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
26 #include <lib/version.h>
28 #include "memory_vty.h"
30 #include "lib_errors.h"
39 #include "watchfrr_errors.h"
42 #define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
45 /* Macros to help randomize timers. */
46 #define JITTER(X) ((random() % ((X)+1))-((X)/2))
47 #define FUZZY(X) ((X)+JITTER((X)/20))
49 #define DEFAULT_PERIOD 5
50 #define DEFAULT_TIMEOUT 90
51 #define DEFAULT_RESTART_TIMEOUT 20
52 #define DEFAULT_LOGLEVEL LOG_INFO
53 #define DEFAULT_MIN_RESTART 60
54 #define DEFAULT_MAX_RESTART 600
56 #define DEFAULT_RESTART_CMD WATCHFRR_SH_PATH " restart %s"
57 #define DEFAULT_START_CMD WATCHFRR_SH_PATH " start %s"
58 #define DEFAULT_STOP_CMD WATCHFRR_SH_PATH " stop %s"
60 #define PING_TOKEN "PING"
62 DEFINE_MGROUP(WATCHFRR
, "watchfrr")
63 DEFINE_MTYPE_STATIC(WATCHFRR
, WATCHFRR_DAEMON
, "watchfrr daemon entry")
65 /* Needs to be global, referenced somewhere inside libfrr. */
66 struct thread_master
*master
;
68 static bool watch_only
= false;
75 PHASE_ZEBRA_RESTART_PENDING
,
76 PHASE_WAITING_ZEBRA_UP
79 static const char *phase_str
[] = {
83 "Waiting for other daemons to come down",
84 "Zebra restart job running",
85 "Waiting for zebra to come up",
/*
 * Longest time, in seconds, that a single phase of a phased restart may
 * last.  Expands to an expression over gs.restart_timeout, so it is
 * re-evaluated at every use.
 */
#define PHASE_TIMEOUT (3 * gs.restart_timeout)
/*
 * Delay, in milliseconds, before the startup timeout timer fires.
 * Parenthesized so the macro stays correct inside larger expressions.
 */
#define STARTUP_TIMEOUT (55 * 1000)
98 struct thread
*t_kill
;
102 static struct global_state
{
103 restart_phase_t phase
;
104 struct thread
*t_phase_hanging
;
105 struct thread
*t_startup_timeout
;
109 long restart_timeout
;
110 long min_restart_interval
;
111 long max_restart_interval
;
112 struct daemon
*daemons
;
113 const char *restart_command
;
114 const char *start_command
;
115 const char *stop_command
;
116 struct restart_info restart
;
118 struct daemon
*special
; /* points to zebra when doing phased restart */
121 int numdown
; /* # of daemons that are not UP or UNRESPONSIVE */
124 .vtydir
= frr_vtydir
,
125 .period
= 1000 * DEFAULT_PERIOD
,
126 .timeout
= DEFAULT_TIMEOUT
,
127 .restart_timeout
= DEFAULT_RESTART_TIMEOUT
,
128 .loglevel
= DEFAULT_LOGLEVEL
,
129 .min_restart_interval
= DEFAULT_MIN_RESTART
,
130 .max_restart_interval
= DEFAULT_MAX_RESTART
,
131 .restart_command
= DEFAULT_RESTART_CMD
,
132 .start_command
= DEFAULT_START_CMD
,
133 .stop_command
= DEFAULT_STOP_CMD
,
145 (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
147 static const char *state_str
[] = {
148 "Init", "Down", "Connecting", "Up", "Unresponsive",
153 daemon_state_t state
;
155 struct timeval echo_sent
;
156 unsigned int connect_tries
;
157 struct thread
*t_wakeup
;
158 struct thread
*t_read
;
159 struct thread
*t_write
;
161 struct restart_info restart
;
164 * For a given daemon, if we've turned on ignore timeouts
165 * ignore the timeout value and assume everything is ok
166 * This is for daemon debugging w/ gdb after we have started
167 * FRR and realize we have something that needs to be looked
173 #define OPTION_MINRESTART 2000
174 #define OPTION_MAXRESTART 2001
175 #define OPTION_DRY 2002
177 static const struct option longopts
[] = {
178 {"daemon", no_argument
, NULL
, 'd'},
179 {"statedir", required_argument
, NULL
, 'S'},
180 {"loglevel", required_argument
, NULL
, 'l'},
181 {"interval", required_argument
, NULL
, 'i'},
182 {"timeout", required_argument
, NULL
, 't'},
183 {"restart-timeout", required_argument
, NULL
, 'T'},
184 {"restart", required_argument
, NULL
, 'r'},
185 {"start-command", required_argument
, NULL
, 's'},
186 {"kill-command", required_argument
, NULL
, 'k'},
187 {"dry", no_argument
, NULL
, OPTION_DRY
},
188 {"min-restart-interval", required_argument
, NULL
, OPTION_MINRESTART
},
189 {"max-restart-interval", required_argument
, NULL
, OPTION_MAXRESTART
},
190 {"pid-file", required_argument
, NULL
, 'p'},
191 {"blank-string", required_argument
, NULL
, 'b'},
192 {"help", no_argument
, NULL
, 'h'},
193 {"version", no_argument
, NULL
, 'v'},
196 static int try_connect(struct daemon
*dmn
);
197 static int wakeup_send_echo(struct thread
*t_wakeup
);
198 static void try_restart(struct daemon
*dmn
);
199 static void phase_check(void);
200 static void restart_done(struct daemon
*dmn
);
202 static const char *progname
;
204 void watchfrr_set_ignore_daemon(struct vty
*vty
, const char *dname
, bool ignore
)
208 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
209 if (strncmp(dmn
->name
, dname
, strlen(dmn
->name
)) == 0)
214 dmn
->ignore_timeout
= ignore
;
215 vty_out(vty
, "%s switching to %s\n", dmn
->name
,
216 ignore
? "ignore" : "watch");
218 vty_out(vty
, "%s is not configured for running at the moment",
222 static void printhelp(FILE *target
)
225 "Usage : %s [OPTION...] <daemon name> ...\n\n\
226 Watchdog program to monitor status of frr daemons and try to restart\n\
227 them if they are down or unresponsive. It determines whether a daemon is\n\
228 up based on whether it can connect to the daemon's vty unix stream socket.\n\
229 It then repeatedly sends echo commands over that socket to determine whether\n\
230 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
231 on the socket connection and know immediately that the daemon is down.\n\n\
232 The daemons to be monitored should be listed on the command line.\n\n\
233 In order to avoid attempting to restart the daemons in a fast loop,\n\
234 the -m and -M options allow you to control the minimum delay between\n\
235 restart commands. The minimum restart delay is recalculated each time\n\
236 a restart is attempted: if the time since the last restart attempt exceeds\n\
237 twice the -M value, then the restart delay is set to the -m value.\n\
238 Otherwise, the interval is doubled (but capped at the -M value).\n\n",
243 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
244 to syslog instead of stdout.\n\
245 -S, --statedir Set the vty socket directory (default is %s)\n\
246 -l, --loglevel Set the logging level (default is %d).\n\
247 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
248 but it can be set higher than %d if extra-verbose debugging\n\
249 messages are desired.\n\
250 --min-restart-interval\n\
251 Set the minimum seconds to wait between invocations of daemon\n\
252 restart commands (default is %d).\n\
253 --max-restart-interval\n\
254 Set the maximum seconds to wait between invocations of daemon\n\
255 restart commands (default is %d).\n\
256 -i, --interval Set the status polling interval in seconds (default is %d)\n\
257 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
258 -T, --restart-timeout\n\
259 Set the restart (kill) timeout in seconds (default is %d).\n\
260 If any background jobs are still running after this much\n\
261 time has elapsed, they will be killed.\n\
262 -r, --restart Supply a Bourne shell command to use to restart a single\n\
263 daemon. The command string should include '%%s' where the\n\
264 name of the daemon should be substituted.\n\
266 -s, --start-command\n\
267 Supply a Bourne shell to command to use to start a single\n\
268 daemon. The command string should include '%%s' where the\n\
269 name of the daemon should be substituted.\n\
271 -k, --kill-command\n\
272 Supply a Bourne shell to command to use to stop a single\n\
273 daemon. The command string should include '%%s' where the\n\
274 name of the daemon should be substituted.\n\
276 --dry Do not start or restart anything, just log.\n\
277 -p, --pid-file Set process identifier file name\n\
278 (default is %s/watchfrr.pid).\n\
279 -b, --blank-string\n\
280 When the supplied argument string is found in any of the\n\
281 various shell command arguments (-r, -s, or -k), replace\n\
282 it with a space. This is an ugly hack to circumvent problems\n\
283 passing command-line arguments with embedded spaces.\n\
284 -v, --version Print program version\n\
285 -h, --help Display this help and exit\n",
286 frr_vtydir
, DEFAULT_LOGLEVEL
, LOG_EMERG
, LOG_DEBUG
, LOG_DEBUG
,
287 DEFAULT_MIN_RESTART
, DEFAULT_MAX_RESTART
, DEFAULT_PERIOD
,
288 DEFAULT_TIMEOUT
, DEFAULT_RESTART_TIMEOUT
,
289 DEFAULT_RESTART_CMD
, DEFAULT_START_CMD
, DEFAULT_STOP_CMD
,
293 static pid_t
run_background(char *shell_cmd
)
297 switch (child
= fork()) {
299 flog_err_sys(EC_LIB_SYSTEM_CALL
,
300 "fork failed, cannot run command [%s]: %s",
301 shell_cmd
, safe_strerror(errno
));
305 /* Use separate process group so child processes can be killed
307 if (setpgid(0, 0) < 0)
308 zlog_warn("warning: setpgid(0,0) failed: %s",
309 safe_strerror(errno
));
313 char *const argv
[4] = {shell
, dashc
, shell_cmd
, NULL
};
314 execv("/bin/sh", argv
);
315 flog_err_sys(EC_LIB_SYSTEM_CALL
,
316 "execv(/bin/sh -c '%s') failed: %s",
317 shell_cmd
, safe_strerror(errno
));
321 /* Parent process: we will reap the child later. */
322 flog_err_sys(EC_LIB_SYSTEM_CALL
,
323 "Forked background command [pid %d]: %s",
324 (int)child
, shell_cmd
);
/*
 * Store in *result the wall-clock time elapsed since *start_time.
 *
 * result     - output; overwritten with (now - *start_time)
 * start_time - earlier timestamp obtained from gettimeofday()
 *
 * Returns result, so the call can be used directly in an expression.
 */
static struct timeval *time_elapsed(struct timeval *result,
				    const struct timeval *start_time)
{
	gettimeofday(result, NULL);
	result->tv_sec -= start_time->tv_sec;
	result->tv_usec -= start_time->tv_usec;
	/* Borrow from the seconds field until microseconds are non-negative. */
	while (result->tv_usec < 0) {
		result->tv_usec += 1000000L;
		result->tv_sec--;
	}
	return result;
}
342 static int restart_kill(struct thread
*t_kill
)
344 struct restart_info
*restart
= THREAD_ARG(t_kill
);
345 struct timeval delay
;
347 time_elapsed(&delay
, &restart
->time
);
349 "Warning: %s %s child process %d still running after "
350 "%ld seconds, sending signal %d",
351 restart
->what
, restart
->name
, (int)restart
->pid
,
352 (long)delay
.tv_sec
, (restart
->kills
? SIGKILL
: SIGTERM
));
353 kill(-restart
->pid
, (restart
->kills
? SIGKILL
: SIGTERM
));
355 restart
->t_kill
= NULL
;
356 thread_add_timer(master
, restart_kill
, restart
, gs
.restart_timeout
,
361 static struct restart_info
*find_child(pid_t child
)
364 if (gs
.restart
.pid
== child
)
367 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
368 if (dmn
->restart
.pid
== child
)
369 return &dmn
->restart
;
374 static void sigchild(void)
380 struct restart_info
*restart
;
383 switch (child
= waitpid(-1, &status
, WNOHANG
)) {
385 flog_err_sys(EC_LIB_SYSTEM_CALL
, "waitpid failed: %s",
386 safe_strerror(errno
));
389 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
393 if (child
== integrated_write_pid
) {
394 integrated_write_sigchld(status
);
398 if ((restart
= find_child(child
)) != NULL
) {
399 name
= restart
->name
;
400 what
= restart
->what
;
403 thread_cancel(restart
->t_kill
);
404 restart
->t_kill
= NULL
;
405 /* Update restart time to reflect the time the command
407 gettimeofday(&restart
->time
, NULL
);
411 "waitpid returned status for an unknown child process %d",
416 if (WIFSTOPPED(status
))
417 zlog_warn("warning: %s %s process %d is stopped", what
, name
,
419 else if (WIFSIGNALED(status
))
420 zlog_warn("%s %s process %d terminated due to signal %d", what
,
421 name
, (int)child
, WTERMSIG(status
));
422 else if (WIFEXITED(status
)) {
423 if (WEXITSTATUS(status
) != 0)
425 "%s %s process %d exited with non-zero status %d",
426 what
, name
, (int)child
, WEXITSTATUS(status
));
428 zlog_debug("%s %s process %d exited normally", what
,
431 if (restart
&& restart
!= &gs
.restart
) {
432 dmn
= container_of(restart
, struct daemon
,
436 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
442 "cannot interpret %s %s process %d wait status 0x%x",
443 what
, name
, (int)child
, status
);
447 static int run_job(struct restart_info
*restart
, const char *cmdtype
,
448 const char *command
, int force
, int update_interval
)
450 struct timeval delay
;
452 if (gs
.loglevel
> LOG_DEBUG
+ 1)
453 zlog_debug("attempting to %s %s", cmdtype
, restart
->name
);
456 if (gs
.loglevel
> LOG_DEBUG
+ 1)
458 "cannot %s %s, previous pid %d still running",
459 cmdtype
, restart
->name
, (int)restart
->pid
);
463 /* Note: time_elapsed test must come before the force test, since we
465 to make sure that delay is initialized for use below in updating the
467 if ((time_elapsed(&delay
, &restart
->time
)->tv_sec
< restart
->interval
)
469 if (gs
.loglevel
> LOG_DEBUG
+ 1)
472 "elapsed time %ld < retry interval %ld",
473 cmdtype
, restart
->name
, (long)delay
.tv_sec
,
478 gettimeofday(&restart
->time
, NULL
);
481 char cmd
[strlen(command
) + strlen(restart
->name
) + 1];
482 snprintf(cmd
, sizeof(cmd
), command
, restart
->name
);
483 if ((restart
->pid
= run_background(cmd
)) > 0) {
484 restart
->t_kill
= NULL
;
485 thread_add_timer(master
, restart_kill
, restart
,
486 gs
.restart_timeout
, &restart
->t_kill
);
487 restart
->what
= cmdtype
;
493 /* Calculate the new restart interval. */
494 if (update_interval
) {
495 if (delay
.tv_sec
> 2 * gs
.max_restart_interval
)
496 restart
->interval
= gs
.min_restart_interval
;
497 else if ((restart
->interval
*= 2) > gs
.max_restart_interval
)
498 restart
->interval
= gs
.max_restart_interval
;
499 if (gs
.loglevel
> LOG_DEBUG
+ 1)
500 zlog_debug("restart %s interval is now %ld",
501 restart
->name
, restart
->interval
);
506 #define SET_READ_HANDLER(DMN) \
508 (DMN)->t_read = NULL; \
509 thread_add_read(master, handle_read, (DMN), (DMN)->fd, \
513 #define SET_WAKEUP_DOWN(DMN) \
515 (DMN)->t_wakeup = NULL; \
516 thread_add_timer_msec(master, wakeup_down, (DMN), \
517 FUZZY(gs.period), &(DMN)->t_wakeup); \
520 #define SET_WAKEUP_UNRESPONSIVE(DMN) \
522 (DMN)->t_wakeup = NULL; \
523 thread_add_timer_msec(master, wakeup_unresponsive, (DMN), \
524 FUZZY(gs.period), &(DMN)->t_wakeup); \
527 #define SET_WAKEUP_ECHO(DMN) \
529 (DMN)->t_wakeup = NULL; \
530 thread_add_timer_msec(master, wakeup_send_echo, (DMN), \
531 FUZZY(gs.period), &(DMN)->t_wakeup); \
534 static int wakeup_down(struct thread
*t_wakeup
)
536 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
538 dmn
->t_wakeup
= NULL
;
539 if (try_connect(dmn
) < 0)
540 SET_WAKEUP_DOWN(dmn
);
541 if ((dmn
->connect_tries
> 1) && (dmn
->state
!= DAEMON_UP
))
546 static int wakeup_init(struct thread
*t_wakeup
)
548 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
550 dmn
->t_wakeup
= NULL
;
551 if (try_connect(dmn
) < 0) {
552 flog_err(EC_WATCHFRR_CONNECTION
,
553 "%s state -> down : initial connection attempt failed",
555 dmn
->state
= DAEMON_DOWN
;
561 static void restart_done(struct daemon
*dmn
)
563 if (dmn
->state
!= DAEMON_DOWN
) {
565 "Daemon: %s: is in %s state but expected it to be in DAEMON_DOWN state",
566 dmn
->name
, state_str
[dmn
->state
]);
570 THREAD_OFF(dmn
->t_wakeup
);
571 if (try_connect(dmn
) < 0)
572 SET_WAKEUP_DOWN(dmn
);
575 static void daemon_down(struct daemon
*dmn
, const char *why
)
577 if (IS_UP(dmn
) || (dmn
->state
== DAEMON_INIT
))
578 flog_err(EC_WATCHFRR_CONNECTION
, "%s state -> down : %s",
580 else if (gs
.loglevel
> LOG_DEBUG
)
581 zlog_debug("%s still down : %s", dmn
->name
, why
);
584 dmn
->state
= DAEMON_DOWN
;
589 THREAD_OFF(dmn
->t_read
);
590 THREAD_OFF(dmn
->t_write
);
591 THREAD_OFF(dmn
->t_wakeup
);
592 if (try_connect(dmn
) < 0)
593 SET_WAKEUP_DOWN(dmn
);
597 static int handle_read(struct thread
*t_read
)
599 struct daemon
*dmn
= THREAD_ARG(t_read
);
600 static const char resp
[sizeof(PING_TOKEN
) + 4] = PING_TOKEN
"\n";
601 char buf
[sizeof(resp
) + 100];
603 struct timeval delay
;
606 if ((rc
= read(dmn
->fd
, buf
, sizeof(buf
))) < 0) {
609 if (ERRNO_IO_RETRY(errno
)) {
610 /* Pretend it never happened. */
611 SET_READ_HANDLER(dmn
);
614 snprintf(why
, sizeof(why
), "unexpected read error: %s",
615 safe_strerror(errno
));
616 daemon_down(dmn
, why
);
620 daemon_down(dmn
, "read returned EOF");
623 if (!dmn
->echo_sent
.tv_sec
) {
624 char why
[sizeof(buf
) + 100];
625 snprintf(why
, sizeof(why
),
626 "unexpected read returns %d bytes: %.*s", (int)rc
,
628 daemon_down(dmn
, why
);
632 /* We are expecting an echo response: is there any chance that the
633 response would not be returned entirely in the first read? That
634 seems inconceivable... */
635 if ((rc
!= sizeof(resp
)) || memcmp(buf
, resp
, sizeof(resp
))) {
636 char why
[100 + sizeof(buf
)];
637 snprintf(why
, sizeof(why
),
638 "read returned bad echo response of %d bytes "
639 "(expecting %u): %.*s",
640 (int)rc
, (unsigned int)sizeof(resp
), (int)rc
, buf
);
641 daemon_down(dmn
, why
);
645 time_elapsed(&delay
, &dmn
->echo_sent
);
646 dmn
->echo_sent
.tv_sec
= 0;
647 if (dmn
->state
== DAEMON_UNRESPONSIVE
) {
648 if (delay
.tv_sec
< gs
.timeout
) {
649 dmn
->state
= DAEMON_UP
;
651 "%s state -> up : echo response received after %ld.%06ld "
653 dmn
->name
, (long)delay
.tv_sec
,
654 (long)delay
.tv_usec
);
657 "%s: slow echo response finally received after %ld.%06ld "
659 dmn
->name
, (long)delay
.tv_sec
,
660 (long)delay
.tv_usec
);
661 } else if (gs
.loglevel
> LOG_DEBUG
+ 1)
662 zlog_debug("%s: echo response received after %ld.%06ld seconds",
663 dmn
->name
, (long)delay
.tv_sec
, (long)delay
.tv_usec
);
665 SET_READ_HANDLER(dmn
);
667 thread_cancel(dmn
->t_wakeup
);
668 SET_WAKEUP_ECHO(dmn
);
674 * Wait till we notice that all daemons are ready before
675 * we send we are ready to systemd
677 static void daemon_send_ready(int exitcode
)
687 zlog_notice("all daemons up, doing startup-complete notify");
688 else if (gs
.numdown
< gs
.numdaemons
)
689 flog_err(EC_WATCHFRR_CONNECTION
,
690 "startup did not complete within timeout"
691 " (%d/%d daemons running)",
692 gs
.numdaemons
- gs
.numdown
, gs
.numdaemons
);
694 flog_err(EC_WATCHFRR_CONNECTION
,
695 "all configured daemons failed to start"
696 " -- exiting watchfrr");
703 snprintf(started
, sizeof(started
), "%s%s", frr_vtydir
,
705 fp
= fopen(started
, "w");
708 #if defined HAVE_SYSTEMD
709 systemd_send_started(master
, 0);
714 static void daemon_up(struct daemon
*dmn
, const char *why
)
716 dmn
->state
= DAEMON_UP
;
718 dmn
->connect_tries
= 0;
719 zlog_notice("%s state -> up : %s", dmn
->name
, why
);
721 daemon_send_ready(0);
722 SET_WAKEUP_ECHO(dmn
);
726 static int check_connect(struct thread
*t_write
)
728 struct daemon
*dmn
= THREAD_ARG(t_write
);
730 socklen_t reslen
= sizeof(sockerr
);
733 if (getsockopt(dmn
->fd
, SOL_SOCKET
, SO_ERROR
, (char *)&sockerr
, &reslen
)
735 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn
->name
,
736 safe_strerror(errno
));
738 "getsockopt failed checking connection success");
741 if ((reslen
== sizeof(sockerr
)) && sockerr
) {
745 "getsockopt reports that connection attempt failed: %s",
746 safe_strerror(sockerr
));
747 daemon_down(dmn
, why
);
751 daemon_up(dmn
, "delayed connect succeeded");
755 static int wakeup_connect_hanging(struct thread
*t_wakeup
)
757 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
760 dmn
->t_wakeup
= NULL
;
761 snprintf(why
, sizeof(why
),
762 "connection attempt timed out after %ld seconds", gs
.timeout
);
763 daemon_down(dmn
, why
);
767 /* Making connection to protocol daemon. */
768 static int try_connect(struct daemon
*dmn
)
771 struct sockaddr_un addr
;
774 if (gs
.loglevel
> LOG_DEBUG
+ 1)
775 zlog_debug("%s: attempting to connect", dmn
->name
);
776 dmn
->connect_tries
++;
778 memset(&addr
, 0, sizeof(struct sockaddr_un
));
779 addr
.sun_family
= AF_UNIX
;
780 snprintf(addr
.sun_path
, sizeof(addr
.sun_path
), "%s/%s.vty", gs
.vtydir
,
782 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
783 len
= addr
.sun_len
= SUN_LEN(&addr
);
785 len
= sizeof(addr
.sun_family
) + strlen(addr
.sun_path
);
786 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
788 /* Quick check to see if we might succeed before we go to the trouble
789 of creating a socket. */
790 if (access(addr
.sun_path
, W_OK
) < 0) {
792 flog_err_sys(EC_LIB_SYSTEM_CALL
,
793 "%s: access to socket %s denied: %s",
794 dmn
->name
, addr
.sun_path
,
795 safe_strerror(errno
));
799 if ((sock
= socket(AF_UNIX
, SOCK_STREAM
, 0)) < 0) {
800 flog_err_sys(EC_LIB_SOCKET
, "%s(%s): cannot make socket: %s",
801 __func__
, addr
.sun_path
, safe_strerror(errno
));
805 if (set_nonblocking(sock
) < 0 || set_cloexec(sock
) < 0) {
806 flog_err_sys(EC_LIB_SYSTEM_CALL
,
807 "%s(%s): set_nonblocking/cloexec(%d) failed",
808 __func__
, addr
.sun_path
, sock
);
813 if (connect(sock
, (struct sockaddr
*)&addr
, len
) < 0) {
814 if ((errno
!= EINPROGRESS
) && (errno
!= EWOULDBLOCK
)) {
815 if (gs
.loglevel
> LOG_DEBUG
)
816 zlog_debug("%s(%s): connect failed: %s",
817 __func__
, addr
.sun_path
,
818 safe_strerror(errno
));
822 if (gs
.loglevel
> LOG_DEBUG
)
823 zlog_debug("%s: connection in progress", dmn
->name
);
824 dmn
->state
= DAEMON_CONNECTING
;
827 thread_add_write(master
, check_connect
, dmn
, dmn
->fd
,
829 dmn
->t_wakeup
= NULL
;
830 thread_add_timer(master
, wakeup_connect_hanging
, dmn
,
831 gs
.timeout
, &dmn
->t_wakeup
);
832 SET_READ_HANDLER(dmn
);
837 SET_READ_HANDLER(dmn
);
838 daemon_up(dmn
, "connect succeeded");
842 static int phase_hanging(struct thread
*t_hanging
)
844 gs
.t_phase_hanging
= NULL
;
845 flog_err(EC_WATCHFRR_CONNECTION
,
846 "Phase [%s] hanging for %ld seconds, aborting phased restart",
847 phase_str
[gs
.phase
], PHASE_TIMEOUT
);
848 gs
.phase
= PHASE_NONE
;
852 static void set_phase(restart_phase_t new_phase
)
854 gs
.phase
= new_phase
;
855 if (gs
.t_phase_hanging
)
856 thread_cancel(gs
.t_phase_hanging
);
857 gs
.t_phase_hanging
= NULL
;
858 thread_add_timer(master
, phase_hanging
, NULL
, PHASE_TIMEOUT
,
859 &gs
.t_phase_hanging
);
862 static void phase_check(void)
871 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
872 if (dmn
->state
== DAEMON_INIT
)
875 /* startup complete, everything out of INIT */
876 gs
.phase
= PHASE_NONE
;
877 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
878 if (dmn
->state
== DAEMON_DOWN
) {
879 SET_WAKEUP_DOWN(dmn
);
883 case PHASE_STOPS_PENDING
:
887 "Phased restart: all routing daemon stop jobs have completed.");
888 set_phase(PHASE_WAITING_DOWN
);
891 case PHASE_WAITING_DOWN
:
892 if (gs
.numdown
+ IS_UP(gs
.special
) < gs
.numdaemons
)
894 zlog_info("Phased restart: all routing daemons now down.");
895 run_job(&gs
.special
->restart
, "restart", gs
.restart_command
, 1,
897 set_phase(PHASE_ZEBRA_RESTART_PENDING
);
900 case PHASE_ZEBRA_RESTART_PENDING
:
901 if (gs
.special
->restart
.pid
)
903 zlog_info("Phased restart: %s restart job completed.",
905 set_phase(PHASE_WAITING_ZEBRA_UP
);
908 case PHASE_WAITING_ZEBRA_UP
:
909 if (!IS_UP(gs
.special
))
911 zlog_info("Phased restart: %s is now up.", gs
.special
->name
);
914 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
915 if (dmn
!= gs
.special
)
916 run_job(&dmn
->restart
, "start",
917 gs
.start_command
, 1, 0);
920 gs
.phase
= PHASE_NONE
;
921 THREAD_OFF(gs
.t_phase_hanging
);
922 zlog_notice("Phased global restart has completed.");
927 static void try_restart(struct daemon
*dmn
)
932 if (dmn
!= gs
.special
) {
933 if ((gs
.special
->state
== DAEMON_UP
)
934 && (gs
.phase
== PHASE_NONE
))
935 run_job(&dmn
->restart
, "restart", gs
.restart_command
, 0,
939 "%s: postponing restart attempt because master %s daemon "
940 "not up [%s], or phased restart in progress",
941 dmn
->name
, gs
.special
->name
,
942 state_str
[gs
.special
->state
]);
946 if ((gs
.phase
!= PHASE_NONE
) || gs
.numpids
) {
947 if (gs
.loglevel
> LOG_DEBUG
+ 1)
949 "postponing phased global restart: restart already in "
950 "progress [%s], or outstanding child processes [%d]",
951 phase_str
[gs
.phase
], gs
.numpids
);
954 /* Is it too soon for a restart? */
956 struct timeval delay
;
957 if (time_elapsed(&delay
, &gs
.special
->restart
.time
)->tv_sec
958 < gs
.special
->restart
.interval
) {
959 if (gs
.loglevel
> LOG_DEBUG
+ 1)
961 "postponing phased global restart: "
962 "elapsed time %ld < retry interval %ld",
964 gs
.special
->restart
.interval
);
968 run_job(&gs
.restart
, "restart", gs
.restart_command
, 0, 1);
971 static int wakeup_unresponsive(struct thread
*t_wakeup
)
973 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
975 dmn
->t_wakeup
= NULL
;
976 if (dmn
->state
!= DAEMON_UNRESPONSIVE
)
977 flog_err(EC_WATCHFRR_CONNECTION
,
978 "%s: no longer unresponsive (now %s), "
979 "wakeup should have been cancelled!",
980 dmn
->name
, state_str
[dmn
->state
]);
982 SET_WAKEUP_UNRESPONSIVE(dmn
);
988 static int wakeup_no_answer(struct thread
*t_wakeup
)
990 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
992 dmn
->t_wakeup
= NULL
;
993 dmn
->state
= DAEMON_UNRESPONSIVE
;
994 if (dmn
->ignore_timeout
)
996 flog_err(EC_WATCHFRR_CONNECTION
,
997 "%s state -> unresponsive : no response yet to ping "
998 "sent %ld seconds ago",
999 dmn
->name
, gs
.timeout
);
1000 SET_WAKEUP_UNRESPONSIVE(dmn
);
1005 static int wakeup_send_echo(struct thread
*t_wakeup
)
1007 static const char echocmd
[] = "echo " PING_TOKEN
;
1009 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
1011 dmn
->t_wakeup
= NULL
;
1012 if (((rc
= write(dmn
->fd
, echocmd
, sizeof(echocmd
))) < 0)
1013 || ((size_t)rc
!= sizeof(echocmd
))) {
1014 char why
[100 + sizeof(echocmd
)];
1015 snprintf(why
, sizeof(why
),
1016 "write '%s' returned %d instead of %u", echocmd
,
1017 (int)rc
, (unsigned int)sizeof(echocmd
));
1018 daemon_down(dmn
, why
);
1020 gettimeofday(&dmn
->echo_sent
, NULL
);
1021 dmn
->t_wakeup
= NULL
;
1022 thread_add_timer(master
, wakeup_no_answer
, dmn
, gs
.timeout
,
1028 bool check_all_up(void)
1032 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
1033 if (dmn
->state
!= DAEMON_UP
)
1038 void watchfrr_status(struct vty
*vty
)
1041 struct timeval delay
;
1043 vty_out(vty
, "watchfrr global phase: %s\n", phase_str
[gs
.phase
]);
1045 vty_out(vty
, " global restart running, pid %ld\n",
1046 (long)gs
.restart
.pid
);
1048 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
1049 vty_out(vty
, " %-20s %s%s", dmn
->name
, state_str
[dmn
->state
],
1050 dmn
->ignore_timeout
? "/Ignoring Timeout\n" : "\n");
1051 if (dmn
->restart
.pid
)
1052 vty_out(vty
, " restart running, pid %ld\n",
1053 (long)dmn
->restart
.pid
);
1054 else if (dmn
->state
== DAEMON_DOWN
&&
1055 time_elapsed(&delay
, &dmn
->restart
.time
)->tv_sec
1056 < dmn
->restart
.interval
)
1057 vty_out(vty
, " restarting in %jd seconds"
1058 " (%jds backoff interval)\n",
1059 (intmax_t)dmn
->restart
.interval
1060 - (intmax_t)delay
.tv_sec
,
1061 (intmax_t)dmn
->restart
.interval
);
1065 static void sigint(void)
1067 zlog_notice("Terminating on signal");
1068 systemd_send_stopping();
/*
 * Check that cmd is usable as a command template: it must contain exactly
 * one '%' character, and that character must be followed by 's'.
 *
 * Returns 1 if cmd is valid, 0 otherwise.
 */
static int valid_command(const char *cmd)
{
	const char *p = strchr(cmd, '%');

	if (p == NULL)
		return 0;
	if (p[1] != 's')
		return 0;
	/* Any further '%' would be a second conversion or a stray escape. */
	return strchr(p + 1, '%') == NULL;
}
/*
 * This is an ugly hack to circumvent problems with passing command-line
 * arguments that contain spaces.  The fix is to use a configuration file.
 *
 * Returns a newly allocated copy of cmd in which every occurrence of
 * blankstr has been replaced by a single space.  The copy is scanned once
 * from left to right and replaced text is never rescanned, so the loop
 * terminates even when blankstr is " ".  An empty blankstr leaves the copy
 * unchanged.  The process exits if the copy cannot be allocated.
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	size_t bslen = strlen(blankstr);
	size_t cmdlen = strlen(cmd);
	char *res = malloc(cmdlen + 1);
	char *p;

	if (res == NULL) {
		perror("malloc");
		exit(1);
	}
	memcpy(res, cmd, cmdlen + 1);

	/* strstr() matches an empty needle everywhere; nothing to replace. */
	if (bslen == 0)
		return res;

	p = res;
	while ((p = strstr(p, blankstr)) != NULL) {
		*p = ' ';
		/* Close the gap left by the rest of the marker, NUL included. */
		if (bslen != 1)
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
		/* Resume after the space just written. */
		p++;
	}
	return res;
}
/*
 * Timer callback for the startup timeout: calls daemon_send_ready() with
 * exit code 1.
 */
static int startup_timeout(struct thread *t_wakeup)
{
	daemon_send_ready(1);
	return 0;
}
1106 static void watchfrr_init(int argc
, char **argv
)
1108 const char *special
= "zebra";
1110 struct daemon
*dmn
, **add
= &gs
.daemons
;
1111 char alldaemons
[512] = "", *p
= alldaemons
;
1113 thread_add_timer_msec(master
, startup_timeout
, NULL
, STARTUP_TIMEOUT
,
1114 &gs
.t_startup_timeout
);
1116 for (i
= optind
; i
< argc
; i
++) {
1117 dmn
= XCALLOC(MTYPE_WATCHFRR_DAEMON
, sizeof(*dmn
));
1119 dmn
->name
= dmn
->restart
.name
= argv
[i
];
1120 dmn
->state
= DAEMON_INIT
;
1124 dmn
->t_wakeup
= NULL
;
1125 thread_add_timer_msec(master
, wakeup_init
, dmn
, 0,
1127 dmn
->restart
.interval
= gs
.min_restart_interval
;
1131 if (!strcmp(dmn
->name
, special
))
1137 "Must specify one or more daemons to monitor.\n\n");
1140 if (!watch_only
&& !gs
.special
) {
1141 fprintf(stderr
, "\"%s\" daemon must be in daemon lists\n\n",
1146 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
1147 snprintf(p
, alldaemons
+ sizeof(alldaemons
) - p
, "%s%s",
1148 (p
== alldaemons
) ? "" : " ", dmn
->name
);
1151 zlog_notice("%s %s watching [%s]%s", progname
, FRR_VERSION
, alldaemons
,
1152 watch_only
? ", monitor mode" : "");
1155 struct zebra_privs_t watchfrr_privs
= {
1157 .vty_group
= VTY_GROUP
,
1161 static struct quagga_signal_t watchfrr_signals
[] = {
1172 .handler
= sigchild
,
1176 FRR_DAEMON_INFO(watchfrr
, WATCHFRR
,
1177 .flags
= FRR_NO_PRIVSEP
| FRR_NO_TCPVTY
| FRR_LIMITED_CLI
1178 | FRR_NO_CFG_PID_DRY
| FRR_NO_ZCLIENT
1181 .printhelp
= printhelp
,
1182 .copyright
= "Copyright 2004 Andrew J. Schorr",
1184 .signals
= watchfrr_signals
,
1185 .n_signals
= array_size(watchfrr_signals
),
1187 .privs
= &watchfrr_privs
, )
1189 #define DEPRECATED_OPTIONS "aAezR:"
1191 int main(int argc
, char **argv
)
1194 const char *blankstr
= NULL
;
1196 frr_preinit(&watchfrr_di
, argc
, argv
);
1197 progname
= watchfrr_di
.progname
;
1199 frr_opt_add("b:dk:l:i:p:r:S:s:t:T:" DEPRECATED_OPTIONS
, longopts
, "");
1201 gs
.restart
.name
= "all";
1202 while ((opt
= frr_getopt(argc
, argv
, NULL
)) != EOF
) {
1203 if (opt
&& opt
< 128 && strchr(DEPRECATED_OPTIONS
, opt
)) {
1205 "The -%c option no longer exists.\n"
1206 "Please refer to the watchfrr(8) man page.\n",
1221 if (!valid_command(optarg
)) {
1223 "Invalid kill command, must contain '%%s': %s\n",
1227 gs
.stop_command
= optarg
;
1231 if ((sscanf(optarg
, "%d%1s", &gs
.loglevel
, garbage
)
1233 || (gs
.loglevel
< LOG_EMERG
)) {
1235 "Invalid loglevel argument: %s\n",
1240 case OPTION_MINRESTART
: {
1242 if ((sscanf(optarg
, "%ld%1s", &gs
.min_restart_interval
,
1245 || (gs
.min_restart_interval
< 0)) {
1247 "Invalid min_restart_interval argument: %s\n",
1252 case OPTION_MAXRESTART
: {
1254 if ((sscanf(optarg
, "%ld%1s", &gs
.max_restart_interval
,
1257 || (gs
.max_restart_interval
< 0)) {
1259 "Invalid max_restart_interval argument: %s\n",
1267 if ((sscanf(optarg
, "%d%1s", &period
, garbage
) != 1)
1268 || (gs
.period
< 1)) {
1270 "Invalid interval argument: %s\n",
1274 gs
.period
= 1000 * period
;
1277 watchfrr_di
.pid_file
= optarg
;
1280 if (!valid_command(optarg
)) {
1282 "Invalid restart command, must contain '%%s': %s\n",
1286 gs
.restart_command
= optarg
;
1289 if (!valid_command(optarg
)) {
1291 "Invalid start command, must contain '%%s': %s\n",
1295 gs
.start_command
= optarg
;
1302 if ((sscanf(optarg
, "%ld%1s", &gs
.timeout
, garbage
)
1304 || (gs
.timeout
< 1)) {
1306 "Invalid timeout argument: %s\n",
1313 if ((sscanf(optarg
, "%ld%1s", &gs
.restart_timeout
,
1316 || (gs
.restart_timeout
< 1)) {
1318 "Invalid restart timeout argument: %s\n",
1324 fputs("Invalid option.\n", stderr
);
1330 && (gs
.start_command
|| gs
.stop_command
|| gs
.restart_command
)) {
1331 fputs("Options -r/-s/-k are not used when --dry is active.\n",
1335 && (!gs
.restart_command
|| !gs
.start_command
|| !gs
.stop_command
)) {
1337 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1342 if (gs
.restart_command
)
1343 gs
.restart_command
=
1344 translate_blanks(gs
.restart_command
, blankstr
);
1345 if (gs
.start_command
)
1347 translate_blanks(gs
.start_command
, blankstr
);
1348 if (gs
.stop_command
)
1350 translate_blanks(gs
.stop_command
, blankstr
);
1353 gs
.restart
.interval
= gs
.min_restart_interval
;
1355 master
= frr_init();
1356 watchfrr_error_init();
1357 watchfrr_init(argc
, argv
);
1358 watchfrr_vty_init();
1362 zlog_set_level(ZLOG_DEST_MONITOR
, ZLOG_DISABLED
);
1363 if (watchfrr_di
.daemon_mode
)
1364 zlog_set_level(ZLOG_DEST_SYSLOG
, MIN(gs
.loglevel
, LOG_DEBUG
));
1366 zlog_set_level(ZLOG_DEST_STDOUT
, MIN(gs
.loglevel
, LOG_DEBUG
));
1370 systemd_send_stopping();