2 * Monitor status of frr daemons and restart if necessary.
4 * Copyright (C) 2004 Andrew J. Schorr
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
26 #include <lib/version.h>
29 #include "lib_errors.h"
38 #include "watchfrr_errors.h"
/* Smaller of X and Y. Either argument may be evaluated twice. */
#define MIN(X, Y) (((Y) < (X)) ? (Y) : (X))

/*
 * Timer randomization helpers.
 *   JITTER(X): random() % (X + 1), shifted down by X / 2.
 *   FUZZY(X):  X plus JITTER(X / 20).
 */
#define JITTER(X) ((random() % ((X) + 1)) - ((X) / 2))
#define FUZZY(X) ((X) + JITTER((X) / 20))
48 #define DEFAULT_PERIOD 5
49 #define DEFAULT_TIMEOUT 90
50 #define DEFAULT_RESTART_TIMEOUT 20
51 #define DEFAULT_LOGLEVEL LOG_INFO
52 #define DEFAULT_MIN_RESTART 60
53 #define DEFAULT_MAX_RESTART 600
55 #define DEFAULT_RESTART_CMD WATCHFRR_SH_PATH " restart %s"
56 #define DEFAULT_START_CMD WATCHFRR_SH_PATH " start %s"
57 #define DEFAULT_STOP_CMD WATCHFRR_SH_PATH " stop %s"
59 #define PING_TOKEN "PING"
61 DEFINE_MGROUP(WATCHFRR
, "watchfrr")
62 DEFINE_MTYPE_STATIC(WATCHFRR
, WATCHFRR_DAEMON
, "watchfrr daemon entry")
64 /* Needs to be global, referenced somewhere inside libfrr. */
65 struct thread_master
*master
;
67 static bool watch_only
= false;
74 PHASE_ZEBRA_RESTART_PENDING
,
75 PHASE_WAITING_ZEBRA_UP
78 static const char *const phase_str
[] = {
82 "Waiting for other daemons to come down",
83 "Zebra restart job running",
84 "Waiting for zebra to come up",
/* Limit, in seconds, handed to the phase_hanging timer for a restart phase. */
#define PHASE_TIMEOUT (3 * gs.restart_timeout)
/*
 * Startup deadline in milliseconds (passed to thread_add_timer_msec()).
 * Parenthesized so the macro behaves as a single value inside larger
 * expressions such as division or subtraction.
 */
#define STARTUP_TIMEOUT (55 * 1000)
97 struct thread
*t_kill
;
101 static struct global_state
{
102 restart_phase_t phase
;
103 struct thread
*t_phase_hanging
;
104 struct thread
*t_startup_timeout
;
108 long restart_timeout
;
109 long min_restart_interval
;
110 long max_restart_interval
;
111 struct daemon
*daemons
;
112 const char *restart_command
;
113 const char *start_command
;
114 const char *stop_command
;
115 struct restart_info restart
;
117 struct daemon
*special
; /* points to zebra when doing phased restart */
120 int numdown
; /* # of daemons that are not UP or UNRESPONSIVE */
123 .vtydir
= frr_vtydir
,
124 .period
= 1000 * DEFAULT_PERIOD
,
125 .timeout
= DEFAULT_TIMEOUT
,
126 .restart_timeout
= DEFAULT_RESTART_TIMEOUT
,
127 .loglevel
= DEFAULT_LOGLEVEL
,
128 .min_restart_interval
= DEFAULT_MIN_RESTART
,
129 .max_restart_interval
= DEFAULT_MAX_RESTART
,
130 .restart_command
= DEFAULT_RESTART_CMD
,
131 .start_command
= DEFAULT_START_CMD
,
132 .stop_command
= DEFAULT_STOP_CMD
,
144 (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
146 static const char *const state_str
[] = {
147 "Init", "Down", "Connecting", "Up", "Unresponsive",
152 daemon_state_t state
;
154 struct timeval echo_sent
;
155 unsigned int connect_tries
;
156 struct thread
*t_wakeup
;
157 struct thread
*t_read
;
158 struct thread
*t_write
;
160 struct restart_info restart
;
163 * For a given daemon, if we've turned on ignore timeouts
164 * ignore the timeout value and assume everything is ok
165 * This is for daemon debugging w/ gdb after we have started
166 * FRR and realize we have something that needs to be looked
172 #define OPTION_MINRESTART 2000
173 #define OPTION_MAXRESTART 2001
174 #define OPTION_DRY 2002
176 static const struct option longopts
[] = {
177 {"daemon", no_argument
, NULL
, 'd'},
178 {"statedir", required_argument
, NULL
, 'S'},
179 {"loglevel", required_argument
, NULL
, 'l'},
180 {"interval", required_argument
, NULL
, 'i'},
181 {"timeout", required_argument
, NULL
, 't'},
182 {"restart-timeout", required_argument
, NULL
, 'T'},
183 {"restart", required_argument
, NULL
, 'r'},
184 {"start-command", required_argument
, NULL
, 's'},
185 {"kill-command", required_argument
, NULL
, 'k'},
186 {"dry", no_argument
, NULL
, OPTION_DRY
},
187 {"min-restart-interval", required_argument
, NULL
, OPTION_MINRESTART
},
188 {"max-restart-interval", required_argument
, NULL
, OPTION_MAXRESTART
},
189 {"pid-file", required_argument
, NULL
, 'p'},
190 {"blank-string", required_argument
, NULL
, 'b'},
191 {"help", no_argument
, NULL
, 'h'},
192 {"version", no_argument
, NULL
, 'v'},
195 static int try_connect(struct daemon
*dmn
);
196 static int wakeup_send_echo(struct thread
*t_wakeup
);
197 static void try_restart(struct daemon
*dmn
);
198 static void phase_check(void);
199 static void restart_done(struct daemon
*dmn
);
201 static const char *progname
;
203 void watchfrr_set_ignore_daemon(struct vty
*vty
, const char *dname
, bool ignore
)
207 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
208 if (strncmp(dmn
->name
, dname
, strlen(dmn
->name
)) == 0)
213 dmn
->ignore_timeout
= ignore
;
214 vty_out(vty
, "%s switching to %s\n", dmn
->name
,
215 ignore
? "ignore" : "watch");
217 vty_out(vty
, "%s is not configured for running at the moment",
221 static void printhelp(FILE *target
)
224 "Usage : %s [OPTION...] <daemon name> ...\n\n\
225 Watchdog program to monitor status of frr daemons and try to restart\n\
226 them if they are down or unresponsive. It determines whether a daemon is\n\
227 up based on whether it can connect to the daemon's vty unix stream socket.\n\
228 It then repeatedly sends echo commands over that socket to determine whether\n\
229 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
230 on the socket connection and know immediately that the daemon is down.\n\n\
231 The daemons to be monitored should be listed on the command line.\n\n\
232 In order to avoid attempting to restart the daemons in a fast loop,\n\
233 the -m and -M options allow you to control the minimum delay between\n\
234 restart commands. The minimum restart delay is recalculated each time\n\
235 a restart is attempted: if the time since the last restart attempt exceeds\n\
236 twice the -M value, then the restart delay is set to the -m value.\n\
237 Otherwise, the interval is doubled (but capped at the -M value).\n\n",
242 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
243 to syslog instead of stdout.\n\
244 -S, --statedir Set the vty socket directory (default is %s)\n\
245 -l, --loglevel Set the logging level (default is %d).\n\
246 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
247 but it can be set higher than %d if extra-verbose debugging\n\
248 messages are desired.\n\
249 --min-restart-interval\n\
250 Set the minimum seconds to wait between invocations of daemon\n\
251 restart commands (default is %d).\n\
252 --max-restart-interval\n\
253 Set the maximum seconds to wait between invocations of daemon\n\
254 restart commands (default is %d).\n\
255 -i, --interval Set the status polling interval in seconds (default is %d)\n\
256 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
257 -T, --restart-timeout\n\
258 Set the restart (kill) timeout in seconds (default is %d).\n\
259 If any background jobs are still running after this much\n\
260 time has elapsed, they will be killed.\n\
261 -r, --restart Supply a Bourne shell command to use to restart a single\n\
262 daemon. The command string should include '%%s' where the\n\
263 name of the daemon should be substituted.\n\
265 -s, --start-command\n\
266 Supply a Bourne shell to command to use to start a single\n\
267 daemon. The command string should include '%%s' where the\n\
268 name of the daemon should be substituted.\n\
270 -k, --kill-command\n\
271 Supply a Bourne shell to command to use to stop a single\n\
272 daemon. The command string should include '%%s' where the\n\
273 name of the daemon should be substituted.\n\
275 --dry Do not start or restart anything, just log.\n\
276 -p, --pid-file Set process identifier file name\n\
277 (default is %s/watchfrr.pid).\n\
278 -b, --blank-string\n\
279 When the supplied argument string is found in any of the\n\
280 various shell command arguments (-r, -s, or -k), replace\n\
281 it with a space. This is an ugly hack to circumvent problems\n\
282 passing command-line arguments with embedded spaces.\n\
283 -v, --version Print program version\n\
284 -h, --help Display this help and exit\n",
285 frr_vtydir
, DEFAULT_LOGLEVEL
, LOG_EMERG
, LOG_DEBUG
, LOG_DEBUG
,
286 DEFAULT_MIN_RESTART
, DEFAULT_MAX_RESTART
, DEFAULT_PERIOD
,
287 DEFAULT_TIMEOUT
, DEFAULT_RESTART_TIMEOUT
,
288 DEFAULT_RESTART_CMD
, DEFAULT_START_CMD
, DEFAULT_STOP_CMD
,
292 static pid_t
run_background(char *shell_cmd
)
296 switch (child
= fork()) {
298 flog_err_sys(EC_LIB_SYSTEM_CALL
,
299 "fork failed, cannot run command [%s]: %s",
300 shell_cmd
, safe_strerror(errno
));
304 /* Use separate process group so child processes can be killed
306 if (setpgid(0, 0) < 0)
307 zlog_warn("warning: setpgid(0,0) failed: %s",
308 safe_strerror(errno
));
312 char *const argv
[4] = {shell
, dashc
, shell_cmd
, NULL
};
313 execv("/bin/sh", argv
);
314 flog_err_sys(EC_LIB_SYSTEM_CALL
,
315 "execv(/bin/sh -c '%s') failed: %s",
316 shell_cmd
, safe_strerror(errno
));
320 /* Parent process: we will reap the child later. */
321 flog_err_sys(EC_LIB_SYSTEM_CALL
,
322 "Forked background command [pid %d]: %s",
323 (int)child
, shell_cmd
);
/*
 * Store in *result the wall-clock time elapsed since *start_time
 * (current gettimeofday() value minus *start_time) and return result.
 *
 * The microsecond field is normalized to be non-negative by borrowing
 * whole seconds from tv_sec.
 */
static struct timeval *time_elapsed(struct timeval *result,
				    const struct timeval *start_time)
{
	gettimeofday(result, NULL);
	result->tv_sec -= start_time->tv_sec;
	result->tv_usec -= start_time->tv_usec;
	while (result->tv_usec < 0) {
		/* Borrow one second to bring tv_usec back into range. */
		result->tv_usec += 1000000L;
		result->tv_sec--;
	}
	return result;
}
341 static int restart_kill(struct thread
*t_kill
)
343 struct restart_info
*restart
= THREAD_ARG(t_kill
);
344 struct timeval delay
;
346 time_elapsed(&delay
, &restart
->time
);
348 "Warning: %s %s child process %d still running after "
349 "%ld seconds, sending signal %d",
350 restart
->what
, restart
->name
, (int)restart
->pid
,
351 (long)delay
.tv_sec
, (restart
->kills
? SIGKILL
: SIGTERM
));
352 kill(-restart
->pid
, (restart
->kills
? SIGKILL
: SIGTERM
));
354 restart
->t_kill
= NULL
;
355 thread_add_timer(master
, restart_kill
, restart
, gs
.restart_timeout
,
360 static struct restart_info
*find_child(pid_t child
)
363 if (gs
.restart
.pid
== child
)
366 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
367 if (dmn
->restart
.pid
== child
)
368 return &dmn
->restart
;
373 static void sigchild(void)
379 struct restart_info
*restart
;
382 switch (child
= waitpid(-1, &status
, WNOHANG
)) {
384 flog_err_sys(EC_LIB_SYSTEM_CALL
, "waitpid failed: %s",
385 safe_strerror(errno
));
388 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
392 if (child
== integrated_write_pid
) {
393 integrated_write_sigchld(status
);
397 if ((restart
= find_child(child
)) != NULL
) {
398 name
= restart
->name
;
399 what
= restart
->what
;
402 thread_cancel(restart
->t_kill
);
403 restart
->t_kill
= NULL
;
404 /* Update restart time to reflect the time the command
406 gettimeofday(&restart
->time
, NULL
);
410 "waitpid returned status for an unknown child process %d",
415 if (WIFSTOPPED(status
))
416 zlog_warn("warning: %s %s process %d is stopped", what
, name
,
418 else if (WIFSIGNALED(status
))
419 zlog_warn("%s %s process %d terminated due to signal %d", what
,
420 name
, (int)child
, WTERMSIG(status
));
421 else if (WIFEXITED(status
)) {
422 if (WEXITSTATUS(status
) != 0)
424 "%s %s process %d exited with non-zero status %d",
425 what
, name
, (int)child
, WEXITSTATUS(status
));
427 zlog_debug("%s %s process %d exited normally", what
,
430 if (restart
&& restart
!= &gs
.restart
) {
431 dmn
= container_of(restart
, struct daemon
,
435 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
441 "cannot interpret %s %s process %d wait status 0x%x",
442 what
, name
, (int)child
, status
);
446 static int run_job(struct restart_info
*restart
, const char *cmdtype
,
447 const char *command
, int force
, int update_interval
)
449 struct timeval delay
;
451 if (gs
.loglevel
> LOG_DEBUG
+ 1)
452 zlog_debug("attempting to %s %s", cmdtype
, restart
->name
);
455 if (gs
.loglevel
> LOG_DEBUG
+ 1)
457 "cannot %s %s, previous pid %d still running",
458 cmdtype
, restart
->name
, (int)restart
->pid
);
462 #if defined HAVE_SYSTEMD
465 snprintf(buffer
, sizeof(buffer
), "restarting %s", restart
->name
);
466 systemd_send_status(buffer
);
469 /* Note: time_elapsed test must come before the force test, since we
471 to make sure that delay is initialized for use below in updating the
473 if ((time_elapsed(&delay
, &restart
->time
)->tv_sec
< restart
->interval
)
476 if (gs
.loglevel
> LOG_DEBUG
+ 1)
479 "elapsed time %ld < retry interval %ld",
480 cmdtype
, restart
->name
, (long)delay
.tv_sec
,
485 gettimeofday(&restart
->time
, NULL
);
488 char cmd
[strlen(command
) + strlen(restart
->name
) + 1];
489 snprintf(cmd
, sizeof(cmd
), command
, restart
->name
);
490 if ((restart
->pid
= run_background(cmd
)) > 0) {
491 restart
->t_kill
= NULL
;
492 thread_add_timer(master
, restart_kill
, restart
,
493 gs
.restart_timeout
, &restart
->t_kill
);
494 restart
->what
= cmdtype
;
500 #if defined HAVE_SYSTEMD
501 systemd_send_status("FRR Operational");
503 /* Calculate the new restart interval. */
504 if (update_interval
) {
505 if (delay
.tv_sec
> 2 * gs
.max_restart_interval
)
506 restart
->interval
= gs
.min_restart_interval
;
507 else if ((restart
->interval
*= 2) > gs
.max_restart_interval
)
508 restart
->interval
= gs
.max_restart_interval
;
509 if (gs
.loglevel
> LOG_DEBUG
+ 1)
510 zlog_debug("restart %s interval is now %ld",
511 restart
->name
, restart
->interval
);
516 #define SET_READ_HANDLER(DMN) \
518 (DMN)->t_read = NULL; \
519 thread_add_read(master, handle_read, (DMN), (DMN)->fd, \
523 #define SET_WAKEUP_DOWN(DMN) \
525 (DMN)->t_wakeup = NULL; \
526 thread_add_timer_msec(master, wakeup_down, (DMN), \
527 FUZZY(gs.period), &(DMN)->t_wakeup); \
530 #define SET_WAKEUP_UNRESPONSIVE(DMN) \
532 (DMN)->t_wakeup = NULL; \
533 thread_add_timer_msec(master, wakeup_unresponsive, (DMN), \
534 FUZZY(gs.period), &(DMN)->t_wakeup); \
537 #define SET_WAKEUP_ECHO(DMN) \
539 (DMN)->t_wakeup = NULL; \
540 thread_add_timer_msec(master, wakeup_send_echo, (DMN), \
541 FUZZY(gs.period), &(DMN)->t_wakeup); \
544 static int wakeup_down(struct thread
*t_wakeup
)
546 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
548 dmn
->t_wakeup
= NULL
;
549 if (try_connect(dmn
) < 0)
550 SET_WAKEUP_DOWN(dmn
);
551 if ((dmn
->connect_tries
> 1) && (dmn
->state
!= DAEMON_UP
))
556 static int wakeup_init(struct thread
*t_wakeup
)
558 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
560 dmn
->t_wakeup
= NULL
;
561 if (try_connect(dmn
) < 0) {
562 flog_err(EC_WATCHFRR_CONNECTION
,
563 "%s state -> down : initial connection attempt failed",
565 dmn
->state
= DAEMON_DOWN
;
571 static void restart_done(struct daemon
*dmn
)
573 if (dmn
->state
!= DAEMON_DOWN
) {
575 "Daemon: %s: is in %s state but expected it to be in DAEMON_DOWN state",
576 dmn
->name
, state_str
[dmn
->state
]);
580 THREAD_OFF(dmn
->t_wakeup
);
581 if (try_connect(dmn
) < 0)
582 SET_WAKEUP_DOWN(dmn
);
585 static void daemon_down(struct daemon
*dmn
, const char *why
)
587 if (IS_UP(dmn
) || (dmn
->state
== DAEMON_INIT
))
588 flog_err(EC_WATCHFRR_CONNECTION
, "%s state -> down : %s",
590 else if (gs
.loglevel
> LOG_DEBUG
)
591 zlog_debug("%s still down : %s", dmn
->name
, why
);
594 dmn
->state
= DAEMON_DOWN
;
599 THREAD_OFF(dmn
->t_read
);
600 THREAD_OFF(dmn
->t_write
);
601 THREAD_OFF(dmn
->t_wakeup
);
602 if (try_connect(dmn
) < 0)
603 SET_WAKEUP_DOWN(dmn
);
607 static int handle_read(struct thread
*t_read
)
609 struct daemon
*dmn
= THREAD_ARG(t_read
);
610 static const char resp
[sizeof(PING_TOKEN
) + 4] = PING_TOKEN
"\n";
611 char buf
[sizeof(resp
) + 100];
613 struct timeval delay
;
616 if ((rc
= read(dmn
->fd
, buf
, sizeof(buf
))) < 0) {
619 if (ERRNO_IO_RETRY(errno
)) {
620 /* Pretend it never happened. */
621 SET_READ_HANDLER(dmn
);
624 snprintf(why
, sizeof(why
), "unexpected read error: %s",
625 safe_strerror(errno
));
626 daemon_down(dmn
, why
);
630 daemon_down(dmn
, "read returned EOF");
633 if (!dmn
->echo_sent
.tv_sec
) {
634 char why
[sizeof(buf
) + 100];
635 snprintf(why
, sizeof(why
),
636 "unexpected read returns %d bytes: %.*s", (int)rc
,
638 daemon_down(dmn
, why
);
642 /* We are expecting an echo response: is there any chance that the
643 response would not be returned entirely in the first read? That
644 seems inconceivable... */
645 if ((rc
!= sizeof(resp
)) || memcmp(buf
, resp
, sizeof(resp
))) {
646 char why
[100 + sizeof(buf
)];
647 snprintf(why
, sizeof(why
),
648 "read returned bad echo response of %d bytes "
649 "(expecting %u): %.*s",
650 (int)rc
, (unsigned int)sizeof(resp
), (int)rc
, buf
);
651 daemon_down(dmn
, why
);
655 time_elapsed(&delay
, &dmn
->echo_sent
);
656 dmn
->echo_sent
.tv_sec
= 0;
657 if (dmn
->state
== DAEMON_UNRESPONSIVE
) {
658 if (delay
.tv_sec
< gs
.timeout
) {
659 dmn
->state
= DAEMON_UP
;
661 "%s state -> up : echo response received after %ld.%06ld "
663 dmn
->name
, (long)delay
.tv_sec
,
664 (long)delay
.tv_usec
);
667 "%s: slow echo response finally received after %ld.%06ld "
669 dmn
->name
, (long)delay
.tv_sec
,
670 (long)delay
.tv_usec
);
671 } else if (gs
.loglevel
> LOG_DEBUG
+ 1)
672 zlog_debug("%s: echo response received after %ld.%06ld seconds",
673 dmn
->name
, (long)delay
.tv_sec
, (long)delay
.tv_usec
);
675 SET_READ_HANDLER(dmn
);
677 thread_cancel(dmn
->t_wakeup
);
678 SET_WAKEUP_ECHO(dmn
);
684 * Wait till we notice that all daemons are ready before
685 * we send we are ready to systemd
687 static void daemon_send_ready(int exitcode
)
697 zlog_notice("all daemons up, doing startup-complete notify");
698 else if (gs
.numdown
< gs
.numdaemons
)
699 flog_err(EC_WATCHFRR_CONNECTION
,
700 "startup did not complete within timeout"
701 " (%d/%d daemons running)",
702 gs
.numdaemons
- gs
.numdown
, gs
.numdaemons
);
704 flog_err(EC_WATCHFRR_CONNECTION
,
705 "all configured daemons failed to start"
706 " -- exiting watchfrr");
713 snprintf(started
, sizeof(started
), "%s%s", frr_vtydir
,
715 fp
= fopen(started
, "w");
718 #if defined HAVE_SYSTEMD
719 systemd_send_started(master
, 0);
720 systemd_send_status("FRR Operational");
725 static void daemon_up(struct daemon
*dmn
, const char *why
)
727 dmn
->state
= DAEMON_UP
;
729 dmn
->connect_tries
= 0;
730 zlog_notice("%s state -> up : %s", dmn
->name
, why
);
732 daemon_send_ready(0);
733 SET_WAKEUP_ECHO(dmn
);
737 static int check_connect(struct thread
*t_write
)
739 struct daemon
*dmn
= THREAD_ARG(t_write
);
741 socklen_t reslen
= sizeof(sockerr
);
744 if (getsockopt(dmn
->fd
, SOL_SOCKET
, SO_ERROR
, (char *)&sockerr
, &reslen
)
746 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn
->name
,
747 safe_strerror(errno
));
749 "getsockopt failed checking connection success");
752 if ((reslen
== sizeof(sockerr
)) && sockerr
) {
756 "getsockopt reports that connection attempt failed: %s",
757 safe_strerror(sockerr
));
758 daemon_down(dmn
, why
);
762 daemon_up(dmn
, "delayed connect succeeded");
766 static int wakeup_connect_hanging(struct thread
*t_wakeup
)
768 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
771 dmn
->t_wakeup
= NULL
;
772 snprintf(why
, sizeof(why
),
773 "connection attempt timed out after %ld seconds", gs
.timeout
);
774 daemon_down(dmn
, why
);
778 /* Making connection to protocol daemon. */
779 static int try_connect(struct daemon
*dmn
)
782 struct sockaddr_un addr
;
785 if (gs
.loglevel
> LOG_DEBUG
+ 1)
786 zlog_debug("%s: attempting to connect", dmn
->name
);
787 dmn
->connect_tries
++;
789 memset(&addr
, 0, sizeof(struct sockaddr_un
));
790 addr
.sun_family
= AF_UNIX
;
791 snprintf(addr
.sun_path
, sizeof(addr
.sun_path
), "%s/%s.vty", gs
.vtydir
,
793 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
794 len
= addr
.sun_len
= SUN_LEN(&addr
);
796 len
= sizeof(addr
.sun_family
) + strlen(addr
.sun_path
);
797 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
799 /* Quick check to see if we might succeed before we go to the trouble
800 of creating a socket. */
801 if (access(addr
.sun_path
, W_OK
) < 0) {
803 flog_err_sys(EC_LIB_SYSTEM_CALL
,
804 "%s: access to socket %s denied: %s",
805 dmn
->name
, addr
.sun_path
,
806 safe_strerror(errno
));
810 if ((sock
= socket(AF_UNIX
, SOCK_STREAM
, 0)) < 0) {
811 flog_err_sys(EC_LIB_SOCKET
, "%s(%s): cannot make socket: %s",
812 __func__
, addr
.sun_path
, safe_strerror(errno
));
816 if (set_nonblocking(sock
) < 0 || set_cloexec(sock
) < 0) {
817 flog_err_sys(EC_LIB_SYSTEM_CALL
,
818 "%s(%s): set_nonblocking/cloexec(%d) failed",
819 __func__
, addr
.sun_path
, sock
);
824 if (connect(sock
, (struct sockaddr
*)&addr
, len
) < 0) {
825 if ((errno
!= EINPROGRESS
) && (errno
!= EWOULDBLOCK
)) {
826 if (gs
.loglevel
> LOG_DEBUG
)
827 zlog_debug("%s(%s): connect failed: %s",
828 __func__
, addr
.sun_path
,
829 safe_strerror(errno
));
833 if (gs
.loglevel
> LOG_DEBUG
)
834 zlog_debug("%s: connection in progress", dmn
->name
);
835 dmn
->state
= DAEMON_CONNECTING
;
838 thread_add_write(master
, check_connect
, dmn
, dmn
->fd
,
840 dmn
->t_wakeup
= NULL
;
841 thread_add_timer(master
, wakeup_connect_hanging
, dmn
,
842 gs
.timeout
, &dmn
->t_wakeup
);
843 SET_READ_HANDLER(dmn
);
848 SET_READ_HANDLER(dmn
);
849 daemon_up(dmn
, "connect succeeded");
853 static int phase_hanging(struct thread
*t_hanging
)
855 gs
.t_phase_hanging
= NULL
;
856 flog_err(EC_WATCHFRR_CONNECTION
,
857 "Phase [%s] hanging for %ld seconds, aborting phased restart",
858 phase_str
[gs
.phase
], PHASE_TIMEOUT
);
859 gs
.phase
= PHASE_NONE
;
863 static void set_phase(restart_phase_t new_phase
)
865 gs
.phase
= new_phase
;
866 if (gs
.t_phase_hanging
)
867 thread_cancel(gs
.t_phase_hanging
);
868 gs
.t_phase_hanging
= NULL
;
869 thread_add_timer(master
, phase_hanging
, NULL
, PHASE_TIMEOUT
,
870 &gs
.t_phase_hanging
);
873 static void phase_check(void)
882 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
883 if (dmn
->state
== DAEMON_INIT
)
886 /* startup complete, everything out of INIT */
887 gs
.phase
= PHASE_NONE
;
888 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
889 if (dmn
->state
== DAEMON_DOWN
) {
890 SET_WAKEUP_DOWN(dmn
);
894 case PHASE_STOPS_PENDING
:
898 "Phased restart: all routing daemon stop jobs have completed.");
899 set_phase(PHASE_WAITING_DOWN
);
902 case PHASE_WAITING_DOWN
:
903 if (gs
.numdown
+ IS_UP(gs
.special
) < gs
.numdaemons
)
905 zlog_info("Phased restart: all routing daemons now down.");
906 run_job(&gs
.special
->restart
, "restart", gs
.restart_command
, 1,
908 set_phase(PHASE_ZEBRA_RESTART_PENDING
);
911 case PHASE_ZEBRA_RESTART_PENDING
:
912 if (gs
.special
->restart
.pid
)
914 zlog_info("Phased restart: %s restart job completed.",
916 set_phase(PHASE_WAITING_ZEBRA_UP
);
919 case PHASE_WAITING_ZEBRA_UP
:
920 if (!IS_UP(gs
.special
))
922 zlog_info("Phased restart: %s is now up.", gs
.special
->name
);
925 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
926 if (dmn
!= gs
.special
)
927 run_job(&dmn
->restart
, "start",
928 gs
.start_command
, 1, 0);
931 gs
.phase
= PHASE_NONE
;
932 THREAD_OFF(gs
.t_phase_hanging
);
933 zlog_notice("Phased global restart has completed.");
938 static void try_restart(struct daemon
*dmn
)
943 if (dmn
!= gs
.special
) {
944 if ((gs
.special
->state
== DAEMON_UP
)
945 && (gs
.phase
== PHASE_NONE
))
946 run_job(&dmn
->restart
, "restart", gs
.restart_command
, 0,
950 "%s: postponing restart attempt because master %s daemon "
951 "not up [%s], or phased restart in progress",
952 dmn
->name
, gs
.special
->name
,
953 state_str
[gs
.special
->state
]);
957 if ((gs
.phase
!= PHASE_NONE
) || gs
.numpids
) {
958 if (gs
.loglevel
> LOG_DEBUG
+ 1)
960 "postponing phased global restart: restart already in "
961 "progress [%s], or outstanding child processes [%d]",
962 phase_str
[gs
.phase
], gs
.numpids
);
965 /* Is it too soon for a restart? */
967 struct timeval delay
;
968 if (time_elapsed(&delay
, &gs
.special
->restart
.time
)->tv_sec
969 < gs
.special
->restart
.interval
) {
970 if (gs
.loglevel
> LOG_DEBUG
+ 1)
972 "postponing phased global restart: "
973 "elapsed time %ld < retry interval %ld",
975 gs
.special
->restart
.interval
);
979 run_job(&gs
.restart
, "restart", gs
.restart_command
, 0, 1);
982 static int wakeup_unresponsive(struct thread
*t_wakeup
)
984 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
986 dmn
->t_wakeup
= NULL
;
987 if (dmn
->state
!= DAEMON_UNRESPONSIVE
)
988 flog_err(EC_WATCHFRR_CONNECTION
,
989 "%s: no longer unresponsive (now %s), "
990 "wakeup should have been cancelled!",
991 dmn
->name
, state_str
[dmn
->state
]);
993 SET_WAKEUP_UNRESPONSIVE(dmn
);
999 static int wakeup_no_answer(struct thread
*t_wakeup
)
1001 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
1003 dmn
->t_wakeup
= NULL
;
1004 dmn
->state
= DAEMON_UNRESPONSIVE
;
1005 if (dmn
->ignore_timeout
)
1007 flog_err(EC_WATCHFRR_CONNECTION
,
1008 "%s state -> unresponsive : no response yet to ping "
1009 "sent %ld seconds ago",
1010 dmn
->name
, gs
.timeout
);
1011 SET_WAKEUP_UNRESPONSIVE(dmn
);
1016 static int wakeup_send_echo(struct thread
*t_wakeup
)
1018 static const char echocmd
[] = "echo " PING_TOKEN
;
1020 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
1022 dmn
->t_wakeup
= NULL
;
1023 if (((rc
= write(dmn
->fd
, echocmd
, sizeof(echocmd
))) < 0)
1024 || ((size_t)rc
!= sizeof(echocmd
))) {
1025 char why
[100 + sizeof(echocmd
)];
1026 snprintf(why
, sizeof(why
),
1027 "write '%s' returned %d instead of %u", echocmd
,
1028 (int)rc
, (unsigned int)sizeof(echocmd
));
1029 daemon_down(dmn
, why
);
1031 gettimeofday(&dmn
->echo_sent
, NULL
);
1032 dmn
->t_wakeup
= NULL
;
1033 thread_add_timer(master
, wakeup_no_answer
, dmn
, gs
.timeout
,
1039 bool check_all_up(void)
1043 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
1044 if (dmn
->state
!= DAEMON_UP
)
1049 void watchfrr_status(struct vty
*vty
)
1052 struct timeval delay
;
1054 vty_out(vty
, "watchfrr global phase: %s\n", phase_str
[gs
.phase
]);
1056 vty_out(vty
, " global restart running, pid %ld\n",
1057 (long)gs
.restart
.pid
);
1059 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
1060 vty_out(vty
, " %-20s %s%s", dmn
->name
, state_str
[dmn
->state
],
1061 dmn
->ignore_timeout
? "/Ignoring Timeout\n" : "\n");
1062 if (dmn
->restart
.pid
)
1063 vty_out(vty
, " restart running, pid %ld\n",
1064 (long)dmn
->restart
.pid
);
1065 else if (dmn
->state
== DAEMON_DOWN
&&
1066 time_elapsed(&delay
, &dmn
->restart
.time
)->tv_sec
1067 < dmn
->restart
.interval
)
1068 vty_out(vty
, " restarting in %jd seconds"
1069 " (%jds backoff interval)\n",
1070 (intmax_t)dmn
->restart
.interval
1071 - (intmax_t)delay
.tv_sec
,
1072 (intmax_t)dmn
->restart
.interval
);
1076 static void sigint(void)
1078 zlog_notice("Terminating on signal");
1079 systemd_send_stopping();
/*
 * Check that a shell command template is usable as a printf-style format
 * taking exactly one string argument.
 *
 * Returns non-zero only when cmd contains exactly one '%' character and
 * that character is immediately followed by 's'; returns 0 otherwise.
 */
static int valid_command(const char *cmd)
{
	const char *p = strchr(cmd, '%');

	return p != NULL && p[1] == 's' && strchr(p + 1, '%') == NULL;
}
/*
 * This is an ugly hack to circumvent problems with passing command-line
 * arguments that contain spaces. The fix is to use a configuration file.
 *
 * Returns a newly allocated copy of cmd in which every occurrence of
 * blankstr has been replaced by a single space. The process exits if the
 * copy cannot be allocated.
 *
 * An empty blankstr, or one that is exactly " ", leaves the text
 * unchanged. Either would otherwise match again after each replacement
 * and never terminate, and the empty one would also grow the string past
 * its allocation.
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	size_t bslen = strlen(blankstr);
	size_t cmdlen = strlen(cmd);
	char *res = malloc(cmdlen + 1);
	char *p;

	if (!res) {
		perror("malloc");
		exit(1);
	}
	memcpy(res, cmd, cmdlen + 1);

	if (bslen == 0 || strcmp(blankstr, " ") == 0)
		return res;

	while ((p = strstr(res, blankstr)) != NULL) {
		*p = ' ';
		/* Close the gap left by the rest of the marker. */
		if (bslen != 1)
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
	}
	return res;
}
/*
 * Timer callback for the startup deadline: calls daemon_send_ready()
 * with exit code 1. The thread argument is unused.
 */
static int startup_timeout(struct thread *t_wakeup)
{
	daemon_send_ready(1);
	return 0;
}
1117 static void watchfrr_init(int argc
, char **argv
)
1119 const char *special
= "zebra";
1121 struct daemon
*dmn
, **add
= &gs
.daemons
;
1122 char alldaemons
[512] = "", *p
= alldaemons
;
1124 thread_add_timer_msec(master
, startup_timeout
, NULL
, STARTUP_TIMEOUT
,
1125 &gs
.t_startup_timeout
);
1127 for (i
= optind
; i
< argc
; i
++) {
1128 dmn
= XCALLOC(MTYPE_WATCHFRR_DAEMON
, sizeof(*dmn
));
1130 dmn
->name
= dmn
->restart
.name
= argv
[i
];
1131 dmn
->state
= DAEMON_INIT
;
1135 dmn
->t_wakeup
= NULL
;
1136 thread_add_timer_msec(master
, wakeup_init
, dmn
, 0,
1138 dmn
->restart
.interval
= gs
.min_restart_interval
;
1142 if (!strcmp(dmn
->name
, special
))
1148 "Must specify one or more daemons to monitor.\n\n");
1151 if (!watch_only
&& !gs
.special
) {
1152 fprintf(stderr
, "\"%s\" daemon must be in daemon lists\n\n",
1157 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
1158 snprintf(p
, alldaemons
+ sizeof(alldaemons
) - p
, "%s%s",
1159 (p
== alldaemons
) ? "" : " ", dmn
->name
);
1162 zlog_notice("%s %s watching [%s]%s", progname
, FRR_VERSION
, alldaemons
,
1163 watch_only
? ", monitor mode" : "");
1166 struct zebra_privs_t watchfrr_privs
= {
1168 .vty_group
= VTY_GROUP
,
1172 static struct quagga_signal_t watchfrr_signals
[] = {
1183 .handler
= sigchild
,
1187 FRR_DAEMON_INFO(watchfrr
, WATCHFRR
,
1188 .flags
= FRR_NO_PRIVSEP
| FRR_NO_TCPVTY
| FRR_LIMITED_CLI
1189 | FRR_NO_CFG_PID_DRY
| FRR_NO_ZCLIENT
1192 .printhelp
= printhelp
,
1193 .copyright
= "Copyright 2004 Andrew J. Schorr",
1195 .signals
= watchfrr_signals
,
1196 .n_signals
= array_size(watchfrr_signals
),
1198 .privs
= &watchfrr_privs
, )
1200 #define DEPRECATED_OPTIONS "aAezR:"
1202 int main(int argc
, char **argv
)
1205 const char *blankstr
= NULL
;
1207 frr_preinit(&watchfrr_di
, argc
, argv
);
1208 progname
= watchfrr_di
.progname
;
1210 frr_opt_add("b:dk:l:i:p:r:S:s:t:T:" DEPRECATED_OPTIONS
, longopts
, "");
1212 gs
.restart
.name
= "all";
1213 while ((opt
= frr_getopt(argc
, argv
, NULL
)) != EOF
) {
1214 if (opt
&& opt
< 128 && strchr(DEPRECATED_OPTIONS
, opt
)) {
1216 "The -%c option no longer exists.\n"
1217 "Please refer to the watchfrr(8) man page.\n",
1232 if (!valid_command(optarg
)) {
1234 "Invalid kill command, must contain '%%s': %s\n",
1238 gs
.stop_command
= optarg
;
1242 if ((sscanf(optarg
, "%d%1s", &gs
.loglevel
, garbage
)
1244 || (gs
.loglevel
< LOG_EMERG
)) {
1246 "Invalid loglevel argument: %s\n",
1251 case OPTION_MINRESTART
: {
1253 if ((sscanf(optarg
, "%ld%1s", &gs
.min_restart_interval
,
1256 || (gs
.min_restart_interval
< 0)) {
1258 "Invalid min_restart_interval argument: %s\n",
1263 case OPTION_MAXRESTART
: {
1265 if ((sscanf(optarg
, "%ld%1s", &gs
.max_restart_interval
,
1268 || (gs
.max_restart_interval
< 0)) {
1270 "Invalid max_restart_interval argument: %s\n",
1278 if ((sscanf(optarg
, "%d%1s", &period
, garbage
) != 1)
1279 || (gs
.period
< 1)) {
1281 "Invalid interval argument: %s\n",
1285 gs
.period
= 1000 * period
;
1288 watchfrr_di
.pid_file
= optarg
;
1291 if (!valid_command(optarg
)) {
1293 "Invalid restart command, must contain '%%s': %s\n",
1297 gs
.restart_command
= optarg
;
1300 if (!valid_command(optarg
)) {
1302 "Invalid start command, must contain '%%s': %s\n",
1306 gs
.start_command
= optarg
;
1313 if ((sscanf(optarg
, "%ld%1s", &gs
.timeout
, garbage
)
1315 || (gs
.timeout
< 1)) {
1317 "Invalid timeout argument: %s\n",
1324 if ((sscanf(optarg
, "%ld%1s", &gs
.restart_timeout
,
1327 || (gs
.restart_timeout
< 1)) {
1329 "Invalid restart timeout argument: %s\n",
1335 fputs("Invalid option.\n", stderr
);
1341 && (gs
.start_command
|| gs
.stop_command
|| gs
.restart_command
)) {
1342 fputs("Options -r/-s/-k are not used when --dry is active.\n",
1346 && (!gs
.restart_command
|| !gs
.start_command
|| !gs
.stop_command
)) {
1348 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1353 if (gs
.restart_command
)
1354 gs
.restart_command
=
1355 translate_blanks(gs
.restart_command
, blankstr
);
1356 if (gs
.start_command
)
1358 translate_blanks(gs
.start_command
, blankstr
);
1359 if (gs
.stop_command
)
1361 translate_blanks(gs
.stop_command
, blankstr
);
1364 gs
.restart
.interval
= gs
.min_restart_interval
;
1366 master
= frr_init();
1367 watchfrr_error_init();
1368 watchfrr_init(argc
, argv
);
1369 watchfrr_vty_init();
1373 zlog_set_level(ZLOG_DEST_MONITOR
, ZLOG_DISABLED
);
1374 if (watchfrr_di
.daemon_mode
)
1375 zlog_set_level(ZLOG_DEST_SYSLOG
, MIN(gs
.loglevel
, LOG_DEBUG
));
1377 zlog_set_level(ZLOG_DEST_STDOUT
, MIN(gs
.loglevel
, LOG_DEBUG
));
1381 systemd_send_stopping();