2 * Monitor status of frr daemons and restart if necessary.
4 * Copyright (C) 2004 Andrew J. Schorr
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
26 #include <lib/version.h>
29 #include "lib_errors.h"
30 #include "zlog_targets.h"
40 #include "watchfrr_errors.h"
43 #define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
46 /* Macros to help randomize timers. */
47 #define JITTER(X) ((frr_weak_random() % ((X)+1))-((X)/2))
48 #define FUZZY(X) ((X)+JITTER((X)/20))
50 #define DEFAULT_PERIOD 5
51 #define DEFAULT_TIMEOUT 90
52 #define DEFAULT_RESTART_TIMEOUT 20
53 #define DEFAULT_LOGLEVEL LOG_INFO
54 #define DEFAULT_MIN_RESTART 60
55 #define DEFAULT_MAX_RESTART 600
57 #define DEFAULT_RESTART_CMD WATCHFRR_SH_PATH " restart %s"
58 #define DEFAULT_START_CMD WATCHFRR_SH_PATH " start %s"
59 #define DEFAULT_STOP_CMD WATCHFRR_SH_PATH " stop %s"
61 #define PING_TOKEN "PING"
63 DEFINE_MGROUP(WATCHFRR
, "watchfrr")
64 DEFINE_MTYPE_STATIC(WATCHFRR
, WATCHFRR_DAEMON
, "watchfrr daemon entry")
66 /* Needs to be global, referenced somewhere inside libfrr. */
67 struct thread_master
*master
;
69 static bool watch_only
= false;
76 PHASE_ZEBRA_RESTART_PENDING
,
77 PHASE_WAITING_ZEBRA_UP
80 static const char *const phase_str
[] = {
84 "Waiting for other daemons to come down",
85 "Zebra restart job running",
86 "Waiting for zebra to come up",
90 #define PHASE_TIMEOUT (3*gs.restart_timeout)
/*
 * Delay, in milliseconds, handed to thread_add_timer_msec() for the
 * startup_timeout timer.  Parenthesized so the macro expands as a single
 * value inside any surrounding expression.
 */
#define STARTUP_TIMEOUT (55 * 1000)
99 struct thread
*t_kill
;
103 static struct global_state
{
104 restart_phase_t phase
;
105 struct thread
*t_phase_hanging
;
106 struct thread
*t_startup_timeout
;
110 long restart_timeout
;
111 long min_restart_interval
;
112 long max_restart_interval
;
113 struct daemon
*daemons
;
114 const char *restart_command
;
115 const char *start_command
;
116 const char *stop_command
;
117 struct restart_info restart
;
119 struct daemon
*special
; /* points to zebra when doing phased restart */
122 int numdown
; /* # of daemons that are not UP or UNRESPONSIVE */
125 .vtydir
= frr_vtydir
,
126 .period
= 1000 * DEFAULT_PERIOD
,
127 .timeout
= DEFAULT_TIMEOUT
,
128 .restart_timeout
= DEFAULT_RESTART_TIMEOUT
,
129 .loglevel
= DEFAULT_LOGLEVEL
,
130 .min_restart_interval
= DEFAULT_MIN_RESTART
,
131 .max_restart_interval
= DEFAULT_MAX_RESTART
,
132 .restart_command
= DEFAULT_RESTART_CMD
,
133 .start_command
= DEFAULT_START_CMD
,
134 .stop_command
= DEFAULT_STOP_CMD
,
146 (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
148 static const char *const state_str
[] = {
149 "Init", "Down", "Connecting", "Up", "Unresponsive",
154 daemon_state_t state
;
156 struct timeval echo_sent
;
157 unsigned int connect_tries
;
158 struct thread
*t_wakeup
;
159 struct thread
*t_read
;
160 struct thread
*t_write
;
162 struct restart_info restart
;
165 * For a given daemon, if we've turned on ignore timeouts
166 * ignore the timeout value and assume everything is ok
167 * This is for daemon debugging w/ gdb after we have started
168 * FRR and realize we have something that needs to be looked
174 #define OPTION_MINRESTART 2000
175 #define OPTION_MAXRESTART 2001
176 #define OPTION_DRY 2002
178 static const struct option longopts
[] = {
179 {"daemon", no_argument
, NULL
, 'd'},
180 {"statedir", required_argument
, NULL
, 'S'},
181 {"loglevel", required_argument
, NULL
, 'l'},
182 {"interval", required_argument
, NULL
, 'i'},
183 {"timeout", required_argument
, NULL
, 't'},
184 {"restart-timeout", required_argument
, NULL
, 'T'},
185 {"restart", required_argument
, NULL
, 'r'},
186 {"start-command", required_argument
, NULL
, 's'},
187 {"kill-command", required_argument
, NULL
, 'k'},
188 {"dry", no_argument
, NULL
, OPTION_DRY
},
189 {"min-restart-interval", required_argument
, NULL
, OPTION_MINRESTART
},
190 {"max-restart-interval", required_argument
, NULL
, OPTION_MAXRESTART
},
191 {"pid-file", required_argument
, NULL
, 'p'},
192 {"blank-string", required_argument
, NULL
, 'b'},
193 {"help", no_argument
, NULL
, 'h'},
194 {"version", no_argument
, NULL
, 'v'},
197 static int try_connect(struct daemon
*dmn
);
198 static int wakeup_send_echo(struct thread
*t_wakeup
);
199 static void try_restart(struct daemon
*dmn
);
200 static void phase_check(void);
201 static void restart_done(struct daemon
*dmn
);
203 static const char *progname
;
205 void watchfrr_set_ignore_daemon(struct vty
*vty
, const char *dname
, bool ignore
)
209 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
210 if (strncmp(dmn
->name
, dname
, strlen(dmn
->name
)) == 0)
215 dmn
->ignore_timeout
= ignore
;
216 vty_out(vty
, "%s switching to %s\n", dmn
->name
,
217 ignore
? "ignore" : "watch");
219 vty_out(vty
, "%s is not configured for running at the moment",
223 static void printhelp(FILE *target
)
226 "Usage : %s [OPTION...] <daemon name> ...\n\n\
227 Watchdog program to monitor status of frr daemons and try to restart\n\
228 them if they are down or unresponsive. It determines whether a daemon is\n\
229 up based on whether it can connect to the daemon's vty unix stream socket.\n\
230 It then repeatedly sends echo commands over that socket to determine whether\n\
231 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
232 on the socket connection and know immediately that the daemon is down.\n\n\
233 The daemons to be monitored should be listed on the command line.\n\n\
234 In order to avoid attempting to restart the daemons in a fast loop,\n\
235 the -m and -M options allow you to control the minimum delay between\n\
236 restart commands. The minimum restart delay is recalculated each time\n\
237 a restart is attempted: if the time since the last restart attempt exceeds\n\
238 twice the -M value, then the restart delay is set to the -m value.\n\
239 Otherwise, the interval is doubled (but capped at the -M value).\n\n",
244 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
245 to syslog instead of stdout.\n\
246 -S, --statedir Set the vty socket directory (default is %s)\n\
247 -l, --loglevel Set the logging level (default is %d).\n\
248 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
249 but it can be set higher than %d if extra-verbose debugging\n\
250 messages are desired.\n\
251 --min-restart-interval\n\
252 Set the minimum seconds to wait between invocations of daemon\n\
253 restart commands (default is %d).\n\
254 --max-restart-interval\n\
255 Set the maximum seconds to wait between invocations of daemon\n\
256 restart commands (default is %d).\n\
257 -i, --interval Set the status polling interval in seconds (default is %d)\n\
258 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
259 -T, --restart-timeout\n\
260 Set the restart (kill) timeout in seconds (default is %d).\n\
261 If any background jobs are still running after this much\n\
262 time has elapsed, they will be killed.\n\
263 -r, --restart Supply a Bourne shell command to use to restart a single\n\
264 daemon. The command string should include '%%s' where the\n\
265 name of the daemon should be substituted.\n\
267 -s, --start-command\n\
268 Supply a Bourne shell to command to use to start a single\n\
269 daemon. The command string should include '%%s' where the\n\
270 name of the daemon should be substituted.\n\
272 -k, --kill-command\n\
273 Supply a Bourne shell to command to use to stop a single\n\
274 daemon. The command string should include '%%s' where the\n\
275 name of the daemon should be substituted.\n\
277 --dry Do not start or restart anything, just log.\n\
278 -p, --pid-file Set process identifier file name\n\
279 (default is %s/watchfrr.pid).\n\
280 -b, --blank-string\n\
281 When the supplied argument string is found in any of the\n\
282 various shell command arguments (-r, -s, or -k), replace\n\
283 it with a space. This is an ugly hack to circumvent problems\n\
284 passing command-line arguments with embedded spaces.\n\
285 -v, --version Print program version\n\
286 -h, --help Display this help and exit\n",
287 frr_vtydir
, DEFAULT_LOGLEVEL
, LOG_EMERG
, LOG_DEBUG
, LOG_DEBUG
,
288 DEFAULT_MIN_RESTART
, DEFAULT_MAX_RESTART
, DEFAULT_PERIOD
,
289 DEFAULT_TIMEOUT
, DEFAULT_RESTART_TIMEOUT
,
290 DEFAULT_RESTART_CMD
, DEFAULT_START_CMD
, DEFAULT_STOP_CMD
,
294 static pid_t
run_background(char *shell_cmd
)
298 switch (child
= fork()) {
300 flog_err_sys(EC_LIB_SYSTEM_CALL
,
301 "fork failed, cannot run command [%s]: %s",
302 shell_cmd
, safe_strerror(errno
));
306 /* Use separate process group so child processes can be killed
308 if (setpgid(0, 0) < 0)
309 zlog_warn("warning: setpgid(0,0) failed: %s",
310 safe_strerror(errno
));
314 char *const argv
[4] = {shell
, dashc
, shell_cmd
, NULL
};
315 execv("/bin/sh", argv
);
316 flog_err_sys(EC_LIB_SYSTEM_CALL
,
317 "execv(/bin/sh -c '%s') failed: %s",
318 shell_cmd
, safe_strerror(errno
));
322 /* Parent process: we will reap the child later. */
323 zlog_info("Forked background command [pid %d]: %s", (int)child
,
329 static struct timeval
*time_elapsed(struct timeval
*result
,
330 const struct timeval
*start_time
)
332 gettimeofday(result
, NULL
);
333 result
->tv_sec
-= start_time
->tv_sec
;
334 result
->tv_usec
-= start_time
->tv_usec
;
335 while (result
->tv_usec
< 0) {
336 result
->tv_usec
+= 1000000L;
342 static int restart_kill(struct thread
*t_kill
)
344 struct restart_info
*restart
= THREAD_ARG(t_kill
);
345 struct timeval delay
;
347 time_elapsed(&delay
, &restart
->time
);
349 "Warning: %s %s child process %d still running after "
350 "%ld seconds, sending signal %d",
351 restart
->what
, restart
->name
, (int)restart
->pid
,
352 (long)delay
.tv_sec
, (restart
->kills
? SIGKILL
: SIGTERM
));
353 kill(-restart
->pid
, (restart
->kills
? SIGKILL
: SIGTERM
));
355 restart
->t_kill
= NULL
;
356 thread_add_timer(master
, restart_kill
, restart
, gs
.restart_timeout
,
361 static struct restart_info
*find_child(pid_t child
)
364 if (gs
.restart
.pid
== child
)
367 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
368 if (dmn
->restart
.pid
== child
)
369 return &dmn
->restart
;
374 static void sigchild(void)
380 struct restart_info
*restart
;
383 switch (child
= waitpid(-1, &status
, WNOHANG
)) {
385 flog_err_sys(EC_LIB_SYSTEM_CALL
, "waitpid failed: %s",
386 safe_strerror(errno
));
389 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
393 if (child
== integrated_write_pid
) {
394 integrated_write_sigchld(status
);
398 if ((restart
= find_child(child
)) != NULL
) {
399 name
= restart
->name
;
400 what
= restart
->what
;
403 thread_cancel(restart
->t_kill
);
404 restart
->t_kill
= NULL
;
405 /* Update restart time to reflect the time the command
407 gettimeofday(&restart
->time
, NULL
);
411 "waitpid returned status for an unknown child process %d",
416 if (WIFSTOPPED(status
))
417 zlog_warn("warning: %s %s process %d is stopped", what
, name
,
419 else if (WIFSIGNALED(status
))
420 zlog_warn("%s %s process %d terminated due to signal %d", what
,
421 name
, (int)child
, WTERMSIG(status
));
422 else if (WIFEXITED(status
)) {
423 if (WEXITSTATUS(status
) != 0)
425 "%s %s process %d exited with non-zero status %d",
426 what
, name
, (int)child
, WEXITSTATUS(status
));
428 zlog_debug("%s %s process %d exited normally", what
,
431 if (restart
&& restart
!= &gs
.restart
) {
432 dmn
= container_of(restart
, struct daemon
,
436 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
442 "cannot interpret %s %s process %d wait status 0x%x",
443 what
, name
, (int)child
, status
);
447 static int run_job(struct restart_info
*restart
, const char *cmdtype
,
448 const char *command
, int force
, int update_interval
)
450 struct timeval delay
;
452 if (gs
.loglevel
> LOG_DEBUG
+ 1)
453 zlog_debug("attempting to %s %s", cmdtype
, restart
->name
);
456 if (gs
.loglevel
> LOG_DEBUG
+ 1)
458 "cannot %s %s, previous pid %d still running",
459 cmdtype
, restart
->name
, (int)restart
->pid
);
463 #if defined HAVE_SYSTEMD
466 snprintf(buffer
, sizeof(buffer
), "restarting %s", restart
->name
);
467 systemd_send_status(buffer
);
470 /* Note: time_elapsed test must come before the force test, since we
472 to make sure that delay is initialized for use below in updating the
474 if ((time_elapsed(&delay
, &restart
->time
)->tv_sec
< restart
->interval
)
477 if (gs
.loglevel
> LOG_DEBUG
+ 1)
480 "elapsed time %ld < retry interval %ld",
481 cmdtype
, restart
->name
, (long)delay
.tv_sec
,
486 gettimeofday(&restart
->time
, NULL
);
489 char cmd
[strlen(command
) + strlen(restart
->name
) + 1];
490 snprintf(cmd
, sizeof(cmd
), command
, restart
->name
);
491 if ((restart
->pid
= run_background(cmd
)) > 0) {
492 restart
->t_kill
= NULL
;
493 thread_add_timer(master
, restart_kill
, restart
,
494 gs
.restart_timeout
, &restart
->t_kill
);
495 restart
->what
= cmdtype
;
501 #if defined HAVE_SYSTEMD
502 systemd_send_status("FRR Operational");
504 /* Calculate the new restart interval. */
505 if (update_interval
) {
506 if (delay
.tv_sec
> 2 * gs
.max_restart_interval
)
507 restart
->interval
= gs
.min_restart_interval
;
508 else if ((restart
->interval
*= 2) > gs
.max_restart_interval
)
509 restart
->interval
= gs
.max_restart_interval
;
510 if (gs
.loglevel
> LOG_DEBUG
+ 1)
511 zlog_debug("restart %s interval is now %ld",
512 restart
->name
, restart
->interval
);
517 #define SET_READ_HANDLER(DMN) \
519 (DMN)->t_read = NULL; \
520 thread_add_read(master, handle_read, (DMN), (DMN)->fd, \
524 #define SET_WAKEUP_DOWN(DMN) \
526 (DMN)->t_wakeup = NULL; \
527 thread_add_timer_msec(master, wakeup_down, (DMN), \
528 FUZZY(gs.period), &(DMN)->t_wakeup); \
531 #define SET_WAKEUP_UNRESPONSIVE(DMN) \
533 (DMN)->t_wakeup = NULL; \
534 thread_add_timer_msec(master, wakeup_unresponsive, (DMN), \
535 FUZZY(gs.period), &(DMN)->t_wakeup); \
538 #define SET_WAKEUP_ECHO(DMN) \
540 (DMN)->t_wakeup = NULL; \
541 thread_add_timer_msec(master, wakeup_send_echo, (DMN), \
542 FUZZY(gs.period), &(DMN)->t_wakeup); \
545 static int wakeup_down(struct thread
*t_wakeup
)
547 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
549 dmn
->t_wakeup
= NULL
;
550 if (try_connect(dmn
) < 0)
551 SET_WAKEUP_DOWN(dmn
);
552 if ((dmn
->connect_tries
> 1) && (dmn
->state
!= DAEMON_UP
))
557 static int wakeup_init(struct thread
*t_wakeup
)
559 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
561 dmn
->t_wakeup
= NULL
;
562 if (try_connect(dmn
) < 0) {
564 "%s state -> down : initial connection attempt failed",
566 dmn
->state
= DAEMON_DOWN
;
572 static void restart_done(struct daemon
*dmn
)
574 if (dmn
->state
!= DAEMON_DOWN
) {
576 "Daemon: %s: is in %s state but expected it to be in DAEMON_DOWN state",
577 dmn
->name
, state_str
[dmn
->state
]);
581 THREAD_OFF(dmn
->t_wakeup
);
582 if (try_connect(dmn
) < 0)
583 SET_WAKEUP_DOWN(dmn
);
586 static void daemon_down(struct daemon
*dmn
, const char *why
)
588 if (IS_UP(dmn
) || (dmn
->state
== DAEMON_INIT
))
589 flog_err(EC_WATCHFRR_CONNECTION
, "%s state -> down : %s",
591 else if (gs
.loglevel
> LOG_DEBUG
)
592 zlog_debug("%s still down : %s", dmn
->name
, why
);
595 dmn
->state
= DAEMON_DOWN
;
600 THREAD_OFF(dmn
->t_read
);
601 THREAD_OFF(dmn
->t_write
);
602 THREAD_OFF(dmn
->t_wakeup
);
603 if (try_connect(dmn
) < 0)
604 SET_WAKEUP_DOWN(dmn
);
608 static int handle_read(struct thread
*t_read
)
610 struct daemon
*dmn
= THREAD_ARG(t_read
);
611 static const char resp
[sizeof(PING_TOKEN
) + 4] = PING_TOKEN
"\n";
612 char buf
[sizeof(resp
) + 100];
614 struct timeval delay
;
617 if ((rc
= read(dmn
->fd
, buf
, sizeof(buf
))) < 0) {
620 if (ERRNO_IO_RETRY(errno
)) {
621 /* Pretend it never happened. */
622 SET_READ_HANDLER(dmn
);
625 snprintf(why
, sizeof(why
), "unexpected read error: %s",
626 safe_strerror(errno
));
627 daemon_down(dmn
, why
);
631 daemon_down(dmn
, "read returned EOF");
634 if (!dmn
->echo_sent
.tv_sec
) {
635 char why
[sizeof(buf
) + 100];
636 snprintf(why
, sizeof(why
),
637 "unexpected read returns %d bytes: %.*s", (int)rc
,
639 daemon_down(dmn
, why
);
643 /* We are expecting an echo response: is there any chance that the
644 response would not be returned entirely in the first read? That
645 seems inconceivable... */
646 if ((rc
!= sizeof(resp
)) || memcmp(buf
, resp
, sizeof(resp
))) {
647 char why
[100 + sizeof(buf
)];
648 snprintf(why
, sizeof(why
),
649 "read returned bad echo response of %d bytes "
650 "(expecting %u): %.*s",
651 (int)rc
, (unsigned int)sizeof(resp
), (int)rc
, buf
);
652 daemon_down(dmn
, why
);
656 time_elapsed(&delay
, &dmn
->echo_sent
);
657 dmn
->echo_sent
.tv_sec
= 0;
658 if (dmn
->state
== DAEMON_UNRESPONSIVE
) {
659 if (delay
.tv_sec
< gs
.timeout
) {
660 dmn
->state
= DAEMON_UP
;
662 "%s state -> up : echo response received after %ld.%06ld "
664 dmn
->name
, (long)delay
.tv_sec
,
665 (long)delay
.tv_usec
);
668 "%s: slow echo response finally received after %ld.%06ld "
670 dmn
->name
, (long)delay
.tv_sec
,
671 (long)delay
.tv_usec
);
672 } else if (gs
.loglevel
> LOG_DEBUG
+ 1)
673 zlog_debug("%s: echo response received after %ld.%06ld seconds",
674 dmn
->name
, (long)delay
.tv_sec
, (long)delay
.tv_usec
);
676 SET_READ_HANDLER(dmn
);
678 thread_cancel(dmn
->t_wakeup
);
679 SET_WAKEUP_ECHO(dmn
);
685 * Wait until we notice that all daemons are ready before
686 * notifying systemd that we are ready
688 static void daemon_send_ready(int exitcode
)
698 zlog_notice("all daemons up, doing startup-complete notify");
699 else if (gs
.numdown
< gs
.numdaemons
)
700 flog_err(EC_WATCHFRR_CONNECTION
,
701 "startup did not complete within timeout"
702 " (%d/%d daemons running)",
703 gs
.numdaemons
- gs
.numdown
, gs
.numdaemons
);
705 flog_err(EC_WATCHFRR_CONNECTION
,
706 "all configured daemons failed to start"
707 " -- exiting watchfrr");
714 snprintf(started
, sizeof(started
), "%s%s", frr_vtydir
,
716 fp
= fopen(started
, "w");
719 #if defined HAVE_SYSTEMD
720 systemd_send_started(master
, 0);
721 systemd_send_status("FRR Operational");
726 static void daemon_up(struct daemon
*dmn
, const char *why
)
728 dmn
->state
= DAEMON_UP
;
730 dmn
->connect_tries
= 0;
731 zlog_notice("%s state -> up : %s", dmn
->name
, why
);
733 daemon_send_ready(0);
734 SET_WAKEUP_ECHO(dmn
);
738 static int check_connect(struct thread
*t_write
)
740 struct daemon
*dmn
= THREAD_ARG(t_write
);
742 socklen_t reslen
= sizeof(sockerr
);
745 if (getsockopt(dmn
->fd
, SOL_SOCKET
, SO_ERROR
, (char *)&sockerr
, &reslen
)
747 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn
->name
,
748 safe_strerror(errno
));
750 "getsockopt failed checking connection success");
753 if ((reslen
== sizeof(sockerr
)) && sockerr
) {
757 "getsockopt reports that connection attempt failed: %s",
758 safe_strerror(sockerr
));
759 daemon_down(dmn
, why
);
763 daemon_up(dmn
, "delayed connect succeeded");
767 static int wakeup_connect_hanging(struct thread
*t_wakeup
)
769 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
772 dmn
->t_wakeup
= NULL
;
773 snprintf(why
, sizeof(why
),
774 "connection attempt timed out after %ld seconds", gs
.timeout
);
775 daemon_down(dmn
, why
);
779 /* Making connection to protocol daemon. */
780 static int try_connect(struct daemon
*dmn
)
783 struct sockaddr_un addr
;
786 if (gs
.loglevel
> LOG_DEBUG
+ 1)
787 zlog_debug("%s: attempting to connect", dmn
->name
);
788 dmn
->connect_tries
++;
790 memset(&addr
, 0, sizeof(struct sockaddr_un
));
791 addr
.sun_family
= AF_UNIX
;
792 snprintf(addr
.sun_path
, sizeof(addr
.sun_path
), "%s/%s.vty", gs
.vtydir
,
794 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
795 len
= addr
.sun_len
= SUN_LEN(&addr
);
797 len
= sizeof(addr
.sun_family
) + strlen(addr
.sun_path
);
798 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
800 /* Quick check to see if we might succeed before we go to the trouble
801 of creating a socket. */
802 if (access(addr
.sun_path
, W_OK
) < 0) {
804 flog_err_sys(EC_LIB_SYSTEM_CALL
,
805 "%s: access to socket %s denied: %s",
806 dmn
->name
, addr
.sun_path
,
807 safe_strerror(errno
));
811 if ((sock
= socket(AF_UNIX
, SOCK_STREAM
, 0)) < 0) {
812 flog_err_sys(EC_LIB_SOCKET
, "%s(%s): cannot make socket: %s",
813 __func__
, addr
.sun_path
, safe_strerror(errno
));
817 if (set_nonblocking(sock
) < 0 || set_cloexec(sock
) < 0) {
818 flog_err_sys(EC_LIB_SYSTEM_CALL
,
819 "%s(%s): set_nonblocking/cloexec(%d) failed",
820 __func__
, addr
.sun_path
, sock
);
825 if (connect(sock
, (struct sockaddr
*)&addr
, len
) < 0) {
826 if ((errno
!= EINPROGRESS
) && (errno
!= EWOULDBLOCK
)) {
827 if (gs
.loglevel
> LOG_DEBUG
)
828 zlog_debug("%s(%s): connect failed: %s",
829 __func__
, addr
.sun_path
,
830 safe_strerror(errno
));
834 if (gs
.loglevel
> LOG_DEBUG
)
835 zlog_debug("%s: connection in progress", dmn
->name
);
836 dmn
->state
= DAEMON_CONNECTING
;
839 thread_add_write(master
, check_connect
, dmn
, dmn
->fd
,
841 dmn
->t_wakeup
= NULL
;
842 thread_add_timer(master
, wakeup_connect_hanging
, dmn
,
843 gs
.timeout
, &dmn
->t_wakeup
);
844 SET_READ_HANDLER(dmn
);
849 SET_READ_HANDLER(dmn
);
850 daemon_up(dmn
, "connect succeeded");
854 static int phase_hanging(struct thread
*t_hanging
)
856 gs
.t_phase_hanging
= NULL
;
857 flog_err(EC_WATCHFRR_CONNECTION
,
858 "Phase [%s] hanging for %ld seconds, aborting phased restart",
859 phase_str
[gs
.phase
], PHASE_TIMEOUT
);
860 gs
.phase
= PHASE_NONE
;
864 static void set_phase(restart_phase_t new_phase
)
866 gs
.phase
= new_phase
;
867 if (gs
.t_phase_hanging
)
868 thread_cancel(gs
.t_phase_hanging
);
869 gs
.t_phase_hanging
= NULL
;
870 thread_add_timer(master
, phase_hanging
, NULL
, PHASE_TIMEOUT
,
871 &gs
.t_phase_hanging
);
874 static void phase_check(void)
883 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
884 if (dmn
->state
== DAEMON_INIT
)
887 /* startup complete, everything out of INIT */
888 gs
.phase
= PHASE_NONE
;
889 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
890 if (dmn
->state
== DAEMON_DOWN
) {
891 SET_WAKEUP_DOWN(dmn
);
895 case PHASE_STOPS_PENDING
:
899 "Phased restart: all routing daemon stop jobs have completed.");
900 set_phase(PHASE_WAITING_DOWN
);
903 case PHASE_WAITING_DOWN
:
904 if (gs
.numdown
+ IS_UP(gs
.special
) < gs
.numdaemons
)
906 zlog_info("Phased restart: all routing daemons now down.");
907 run_job(&gs
.special
->restart
, "restart", gs
.restart_command
, 1,
909 set_phase(PHASE_ZEBRA_RESTART_PENDING
);
912 case PHASE_ZEBRA_RESTART_PENDING
:
913 if (gs
.special
->restart
.pid
)
915 zlog_info("Phased restart: %s restart job completed.",
917 set_phase(PHASE_WAITING_ZEBRA_UP
);
920 case PHASE_WAITING_ZEBRA_UP
:
921 if (!IS_UP(gs
.special
))
923 zlog_info("Phased restart: %s is now up.", gs
.special
->name
);
926 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
927 if (dmn
!= gs
.special
)
928 run_job(&dmn
->restart
, "start",
929 gs
.start_command
, 1, 0);
932 gs
.phase
= PHASE_NONE
;
933 THREAD_OFF(gs
.t_phase_hanging
);
934 zlog_notice("Phased global restart has completed.");
939 static void try_restart(struct daemon
*dmn
)
944 if (dmn
!= gs
.special
) {
945 if ((gs
.special
->state
== DAEMON_UP
)
946 && (gs
.phase
== PHASE_NONE
))
947 run_job(&dmn
->restart
, "restart", gs
.restart_command
, 0,
951 "%s: postponing restart attempt because master %s daemon "
952 "not up [%s], or phased restart in progress",
953 dmn
->name
, gs
.special
->name
,
954 state_str
[gs
.special
->state
]);
958 if ((gs
.phase
!= PHASE_NONE
) || gs
.numpids
) {
959 if (gs
.loglevel
> LOG_DEBUG
+ 1)
961 "postponing phased global restart: restart already in "
962 "progress [%s], or outstanding child processes [%d]",
963 phase_str
[gs
.phase
], gs
.numpids
);
966 /* Is it too soon for a restart? */
968 struct timeval delay
;
969 if (time_elapsed(&delay
, &gs
.special
->restart
.time
)->tv_sec
970 < gs
.special
->restart
.interval
) {
971 if (gs
.loglevel
> LOG_DEBUG
+ 1)
973 "postponing phased global restart: "
974 "elapsed time %ld < retry interval %ld",
976 gs
.special
->restart
.interval
);
980 run_job(&gs
.restart
, "restart", gs
.restart_command
, 0, 1);
983 static int wakeup_unresponsive(struct thread
*t_wakeup
)
985 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
987 dmn
->t_wakeup
= NULL
;
988 if (dmn
->state
!= DAEMON_UNRESPONSIVE
)
989 flog_err(EC_WATCHFRR_CONNECTION
,
990 "%s: no longer unresponsive (now %s), "
991 "wakeup should have been cancelled!",
992 dmn
->name
, state_str
[dmn
->state
]);
994 SET_WAKEUP_UNRESPONSIVE(dmn
);
1000 static int wakeup_no_answer(struct thread
*t_wakeup
)
1002 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
1004 dmn
->t_wakeup
= NULL
;
1005 dmn
->state
= DAEMON_UNRESPONSIVE
;
1006 if (dmn
->ignore_timeout
)
1008 flog_err(EC_WATCHFRR_CONNECTION
,
1009 "%s state -> unresponsive : no response yet to ping "
1010 "sent %ld seconds ago",
1011 dmn
->name
, gs
.timeout
);
1012 SET_WAKEUP_UNRESPONSIVE(dmn
);
1017 static int wakeup_send_echo(struct thread
*t_wakeup
)
1019 static const char echocmd
[] = "echo " PING_TOKEN
;
1021 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
1023 dmn
->t_wakeup
= NULL
;
1024 if (((rc
= write(dmn
->fd
, echocmd
, sizeof(echocmd
))) < 0)
1025 || ((size_t)rc
!= sizeof(echocmd
))) {
1026 char why
[100 + sizeof(echocmd
)];
1027 snprintf(why
, sizeof(why
),
1028 "write '%s' returned %d instead of %u", echocmd
,
1029 (int)rc
, (unsigned int)sizeof(echocmd
));
1030 daemon_down(dmn
, why
);
1032 gettimeofday(&dmn
->echo_sent
, NULL
);
1033 dmn
->t_wakeup
= NULL
;
1034 thread_add_timer(master
, wakeup_no_answer
, dmn
, gs
.timeout
,
1040 bool check_all_up(void)
1044 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
1045 if (dmn
->state
!= DAEMON_UP
)
1050 void watchfrr_status(struct vty
*vty
)
1053 struct timeval delay
;
1055 vty_out(vty
, "watchfrr global phase: %s\n", phase_str
[gs
.phase
]);
1057 vty_out(vty
, " global restart running, pid %ld\n",
1058 (long)gs
.restart
.pid
);
1060 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
1061 vty_out(vty
, " %-20s %s%s", dmn
->name
, state_str
[dmn
->state
],
1062 dmn
->ignore_timeout
? "/Ignoring Timeout\n" : "\n");
1063 if (dmn
->restart
.pid
)
1064 vty_out(vty
, " restart running, pid %ld\n",
1065 (long)dmn
->restart
.pid
);
1066 else if (dmn
->state
== DAEMON_DOWN
&&
1067 time_elapsed(&delay
, &dmn
->restart
.time
)->tv_sec
1068 < dmn
->restart
.interval
)
1069 vty_out(vty
, " restarting in %jd seconds"
1070 " (%jds backoff interval)\n",
1071 (intmax_t)dmn
->restart
.interval
1072 - (intmax_t)delay
.tv_sec
,
1073 (intmax_t)dmn
->restart
.interval
);
1077 static void sigint(void)
1079 zlog_notice("Terminating on signal");
1080 systemd_send_stopping();
/*
 * Check whether cmd is an acceptable command template.
 *
 * A template is accepted when it contains exactly one '%' character and
 * that character is immediately followed by 's'.
 *
 * Returns 1 if cmd is acceptable, 0 otherwise (including when cmd is NULL).
 */
static int valid_command(const char *cmd)
{
	const char *p;

	if (cmd == NULL)
		return 0;

	p = strchr(cmd, '%');
	if (p == NULL || p[1] != 's')
		return 0;

	/* Any further '%' would introduce a second conversion. */
	return strchr(p + 1, '%') == NULL;
}
1092 /* This is an ugly hack to circumvent problems with passing command-line
1093 arguments that contain spaces. The fix is to use a configuration file. */
/*
 * Return a newly allocated copy of cmd in which every occurrence of
 * blankstr has been replaced by a single space.
 *
 * cmd:      command string to translate; it is not modified.
 * blankstr: placeholder text that stands in for a space.
 *
 * The result is obtained from strdup() and belongs to the caller.  If the
 * allocation fails, an error is printed and the process exits with status 1.
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	char *res;
	char *p;
	size_t bslen = strlen(blankstr);

	res = strdup(cmd);
	if (!res) {
		perror("strdup");
		exit(1);
	}

	/*
	 * An empty placeholder matches at every position, and a placeholder
	 * that is itself one space is replaced by itself.  Either way the
	 * loop below would never make progress (and the empty case would
	 * also shift the string past the end of its allocation), so the
	 * copy is returned untouched.
	 */
	if (bslen == 0 || strcmp(blankstr, " ") == 0)
		return res;

	while ((p = strstr(res, blankstr)) != NULL) {
		*p = ' ';
		/* Close the gap left by a placeholder longer than one char. */
		if (bslen != 1)
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
	}
	return res;
}
1112 static int startup_timeout(struct thread
*t_wakeup
)
1114 daemon_send_ready(1);
1118 static void watchfrr_init(int argc
, char **argv
)
1120 const char *special
= "zebra";
1122 struct daemon
*dmn
, **add
= &gs
.daemons
;
1123 char alldaemons
[512] = "", *p
= alldaemons
;
1125 thread_add_timer_msec(master
, startup_timeout
, NULL
, STARTUP_TIMEOUT
,
1126 &gs
.t_startup_timeout
);
1128 for (i
= optind
; i
< argc
; i
++) {
1129 dmn
= XCALLOC(MTYPE_WATCHFRR_DAEMON
, sizeof(*dmn
));
1131 dmn
->name
= dmn
->restart
.name
= argv
[i
];
1132 dmn
->state
= DAEMON_INIT
;
1136 dmn
->t_wakeup
= NULL
;
1137 thread_add_timer_msec(master
, wakeup_init
, dmn
, 0,
1139 dmn
->restart
.interval
= gs
.min_restart_interval
;
1143 if (!strcmp(dmn
->name
, special
))
1149 "Must specify one or more daemons to monitor.\n\n");
1152 if (!watch_only
&& !gs
.special
) {
1153 fprintf(stderr
, "\"%s\" daemon must be in daemon lists\n\n",
1158 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
1159 snprintf(p
, alldaemons
+ sizeof(alldaemons
) - p
, "%s%s",
1160 (p
== alldaemons
) ? "" : " ", dmn
->name
);
1163 zlog_notice("%s %s watching [%s]%s", progname
, FRR_VERSION
, alldaemons
,
1164 watch_only
? ", monitor mode" : "");
1167 struct zebra_privs_t watchfrr_privs
= {
1169 .vty_group
= VTY_GROUP
,
1173 static struct quagga_signal_t watchfrr_signals
[] = {
1184 .handler
= sigchild
,
1188 FRR_DAEMON_INFO(watchfrr
, WATCHFRR
,
1189 .flags
= FRR_NO_PRIVSEP
| FRR_NO_TCPVTY
| FRR_LIMITED_CLI
1190 | FRR_NO_CFG_PID_DRY
| FRR_NO_ZCLIENT
1193 .printhelp
= printhelp
,
1194 .copyright
= "Copyright 2004 Andrew J. Schorr",
1196 .signals
= watchfrr_signals
,
1197 .n_signals
= array_size(watchfrr_signals
),
1199 .privs
= &watchfrr_privs
, )
1201 #define DEPRECATED_OPTIONS "aAezR:"
1203 int main(int argc
, char **argv
)
1206 const char *blankstr
= NULL
;
1208 frr_preinit(&watchfrr_di
, argc
, argv
);
1209 progname
= watchfrr_di
.progname
;
1211 frr_opt_add("b:dk:l:i:p:r:S:s:t:T:" DEPRECATED_OPTIONS
, longopts
, "");
1213 gs
.restart
.name
= "all";
1214 while ((opt
= frr_getopt(argc
, argv
, NULL
)) != EOF
) {
1215 if (opt
&& opt
< 128 && strchr(DEPRECATED_OPTIONS
, opt
)) {
1217 "The -%c option no longer exists.\n"
1218 "Please refer to the watchfrr(8) man page.\n",
1233 if (!valid_command(optarg
)) {
1235 "Invalid kill command, must contain '%%s': %s\n",
1239 gs
.stop_command
= optarg
;
1243 if ((sscanf(optarg
, "%d%1s", &gs
.loglevel
, garbage
)
1245 || (gs
.loglevel
< LOG_EMERG
)) {
1247 "Invalid loglevel argument: %s\n",
1252 case OPTION_MINRESTART
: {
1254 if ((sscanf(optarg
, "%ld%1s", &gs
.min_restart_interval
,
1257 || (gs
.min_restart_interval
< 0)) {
1259 "Invalid min_restart_interval argument: %s\n",
1264 case OPTION_MAXRESTART
: {
1266 if ((sscanf(optarg
, "%ld%1s", &gs
.max_restart_interval
,
1269 || (gs
.max_restart_interval
< 0)) {
1271 "Invalid max_restart_interval argument: %s\n",
1279 if ((sscanf(optarg
, "%d%1s", &period
, garbage
) != 1)
1280 || (gs
.period
< 1)) {
1282 "Invalid interval argument: %s\n",
1286 gs
.period
= 1000 * period
;
1289 watchfrr_di
.pid_file
= optarg
;
1292 if (!valid_command(optarg
)) {
1294 "Invalid restart command, must contain '%%s': %s\n",
1298 gs
.restart_command
= optarg
;
1301 if (!valid_command(optarg
)) {
1303 "Invalid start command, must contain '%%s': %s\n",
1307 gs
.start_command
= optarg
;
1314 if ((sscanf(optarg
, "%ld%1s", &gs
.timeout
, garbage
)
1316 || (gs
.timeout
< 1)) {
1318 "Invalid timeout argument: %s\n",
1325 if ((sscanf(optarg
, "%ld%1s", &gs
.restart_timeout
,
1328 || (gs
.restart_timeout
< 1)) {
1330 "Invalid restart timeout argument: %s\n",
1336 fputs("Invalid option.\n", stderr
);
1342 && (gs
.start_command
|| gs
.stop_command
|| gs
.restart_command
)) {
1343 fputs("Options -r/-s/-k are not used when --dry is active.\n",
1347 && (!gs
.restart_command
|| !gs
.start_command
|| !gs
.stop_command
)) {
1349 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1354 if (gs
.restart_command
)
1355 gs
.restart_command
=
1356 translate_blanks(gs
.restart_command
, blankstr
);
1357 if (gs
.start_command
)
1359 translate_blanks(gs
.start_command
, blankstr
);
1360 if (gs
.stop_command
)
1362 translate_blanks(gs
.stop_command
, blankstr
);
1365 gs
.restart
.interval
= gs
.min_restart_interval
;
1367 master
= frr_init();
1368 watchfrr_error_init();
1369 watchfrr_init(argc
, argv
);
1370 watchfrr_vty_init();
1374 if (watchfrr_di
.daemon_mode
)
1375 zlog_syslog_set_prio_min(MIN(gs
.loglevel
, LOG_DEBUG
));
1377 zlog_aux_init(NULL
, MIN(gs
.loglevel
, LOG_DEBUG
));
1381 systemd_send_stopping();