2 * Monitor status of frr daemons and restart if necessary.
4 * Copyright (C) 2004 Andrew J. Schorr
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
26 #include <lib/version.h>
29 #include "lib_errors.h"
30 #include "zlog_targets.h"
40 #include "watchfrr_errors.h"
43 #define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
/*
 * Timer randomization helpers.
 *
 * JITTER(X) takes frr_weak_random() modulo (X + 1) and subtracts X / 2,
 * giving an offset centred near zero (assuming frr_weak_random() returns
 * a non-negative value -- TODO confirm against libfrr).
 *
 * FUZZY(X) is X plus a JITTER() drawn from X / 20, i.e. X perturbed by
 * a few percent.  Note that both macros evaluate X more than once.
 */
#define JITTER(X) ((frr_weak_random() % ((X) + 1)) - ((X) / 2))
#define FUZZY(X) ((X) + JITTER((X) / 20))
50 #define DEFAULT_PERIOD 5
51 #define DEFAULT_TIMEOUT 90
52 #define DEFAULT_RESTART_TIMEOUT 20
53 #define DEFAULT_LOGLEVEL LOG_INFO
54 #define DEFAULT_MIN_RESTART 60
55 #define DEFAULT_MAX_RESTART 600
57 #define DEFAULT_RESTART_CMD WATCHFRR_SH_PATH " restart %s"
58 #define DEFAULT_START_CMD WATCHFRR_SH_PATH " start %s"
59 #define DEFAULT_STOP_CMD WATCHFRR_SH_PATH " stop %s"
61 #define PING_TOKEN "PING"
63 DEFINE_MGROUP(WATCHFRR
, "watchfrr")
64 DEFINE_MTYPE_STATIC(WATCHFRR
, WATCHFRR_DAEMON
, "watchfrr daemon entry")
66 /* Needs to be global, referenced somewhere inside libfrr. */
67 struct thread_master
*master
;
69 static bool watch_only
= false;
76 PHASE_ZEBRA_RESTART_PENDING
,
77 PHASE_WAITING_ZEBRA_UP
80 static const char *const phase_str
[] = {
84 "Waiting for other daemons to come down",
85 "Zebra restart job running",
86 "Waiting for zebra to come up",
90 #define PHASE_TIMEOUT (3*gs.restart_timeout)
/*
 * Delay, in milliseconds, handed to thread_add_timer_msec() when arming
 * the startup_timeout timer.  The expansion is parenthesized so the
 * macro behaves as a single value inside any surrounding expression
 * (e.g. division or subtraction).
 */
#define STARTUP_TIMEOUT (55 * 1000)
99 struct thread
*t_kill
;
103 static struct global_state
{
104 restart_phase_t phase
;
105 struct thread
*t_phase_hanging
;
106 struct thread
*t_startup_timeout
;
110 long restart_timeout
;
111 long min_restart_interval
;
112 long max_restart_interval
;
113 struct daemon
*daemons
;
114 const char *restart_command
;
115 const char *start_command
;
116 const char *stop_command
;
117 struct restart_info restart
;
119 struct daemon
*special
; /* points to zebra when doing phased restart */
122 int numdown
; /* # of daemons that are not UP or UNRESPONSIVE */
125 .vtydir
= frr_vtydir
,
126 .period
= 1000 * DEFAULT_PERIOD
,
127 .timeout
= DEFAULT_TIMEOUT
,
128 .restart_timeout
= DEFAULT_RESTART_TIMEOUT
,
129 .loglevel
= DEFAULT_LOGLEVEL
,
130 .min_restart_interval
= DEFAULT_MIN_RESTART
,
131 .max_restart_interval
= DEFAULT_MAX_RESTART
,
132 .restart_command
= DEFAULT_RESTART_CMD
,
133 .start_command
= DEFAULT_START_CMD
,
134 .stop_command
= DEFAULT_STOP_CMD
,
146 (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
148 static const char *const state_str
[] = {
149 "Init", "Down", "Connecting", "Up", "Unresponsive",
154 daemon_state_t state
;
156 struct timeval echo_sent
;
157 unsigned int connect_tries
;
158 struct thread
*t_wakeup
;
159 struct thread
*t_read
;
160 struct thread
*t_write
;
162 struct restart_info restart
;
165 * For a given daemon, if we've turned on ignore timeouts
166 * ignore the timeout value and assume everything is ok
167 * This is for daemon debugging w/ gdb after we have started
168 * FRR and realize we have something that needs to be looked
174 #define OPTION_MINRESTART 2000
175 #define OPTION_MAXRESTART 2001
176 #define OPTION_DRY 2002
178 static const struct option longopts
[] = {
179 {"daemon", no_argument
, NULL
, 'd'},
180 {"statedir", required_argument
, NULL
, 'S'},
181 {"loglevel", required_argument
, NULL
, 'l'},
182 {"interval", required_argument
, NULL
, 'i'},
183 {"timeout", required_argument
, NULL
, 't'},
184 {"restart-timeout", required_argument
, NULL
, 'T'},
185 {"restart", required_argument
, NULL
, 'r'},
186 {"start-command", required_argument
, NULL
, 's'},
187 {"kill-command", required_argument
, NULL
, 'k'},
188 {"dry", no_argument
, NULL
, OPTION_DRY
},
189 {"min-restart-interval", required_argument
, NULL
, OPTION_MINRESTART
},
190 {"max-restart-interval", required_argument
, NULL
, OPTION_MAXRESTART
},
191 {"pid-file", required_argument
, NULL
, 'p'},
192 {"blank-string", required_argument
, NULL
, 'b'},
193 {"help", no_argument
, NULL
, 'h'},
194 {"version", no_argument
, NULL
, 'v'},
197 static int try_connect(struct daemon
*dmn
);
198 static int wakeup_send_echo(struct thread
*t_wakeup
);
199 static void try_restart(struct daemon
*dmn
);
200 static void phase_check(void);
201 static void restart_done(struct daemon
*dmn
);
203 static const char *progname
;
205 void watchfrr_set_ignore_daemon(struct vty
*vty
, const char *dname
, bool ignore
)
209 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
210 if (strncmp(dmn
->name
, dname
, strlen(dmn
->name
)) == 0)
215 dmn
->ignore_timeout
= ignore
;
216 vty_out(vty
, "%s switching to %s\n", dmn
->name
,
217 ignore
? "ignore" : "watch");
219 vty_out(vty
, "%s is not configured for running at the moment",
223 static void printhelp(FILE *target
)
226 "Usage : %s [OPTION...] <daemon name> ...\n\n\
227 Watchdog program to monitor status of frr daemons and try to restart\n\
228 them if they are down or unresponsive. It determines whether a daemon is\n\
229 up based on whether it can connect to the daemon's vty unix stream socket.\n\
230 It then repeatedly sends echo commands over that socket to determine whether\n\
231 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
232 on the socket connection and know immediately that the daemon is down.\n\n\
233 The daemons to be monitored should be listed on the command line.\n\n\
234 In order to avoid attempting to restart the daemons in a fast loop,\n\
235 the -m and -M options allow you to control the minimum delay between\n\
236 restart commands. The minimum restart delay is recalculated each time\n\
237 a restart is attempted: if the time since the last restart attempt exceeds\n\
238 twice the -M value, then the restart delay is set to the -m value.\n\
239 Otherwise, the interval is doubled (but capped at the -M value).\n\n",
244 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
245 to syslog instead of stdout.\n\
246 -S, --statedir Set the vty socket directory (default is %s)\n\
247 -l, --loglevel Set the logging level (default is %d).\n\
248 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
249 but it can be set higher than %d if extra-verbose debugging\n\
250 messages are desired.\n\
251 --min-restart-interval\n\
252 Set the minimum seconds to wait between invocations of daemon\n\
253 restart commands (default is %d).\n\
254 --max-restart-interval\n\
255 Set the maximum seconds to wait between invocations of daemon\n\
256 restart commands (default is %d).\n\
257 -i, --interval Set the status polling interval in seconds (default is %d)\n\
258 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
259 -T, --restart-timeout\n\
260 Set the restart (kill) timeout in seconds (default is %d).\n\
261 If any background jobs are still running after this much\n\
262 time has elapsed, they will be killed.\n\
263 -r, --restart Supply a Bourne shell command to use to restart a single\n\
264 daemon. The command string should include '%%s' where the\n\
265 name of the daemon should be substituted.\n\
267 -s, --start-command\n\
268 Supply a Bourne shell to command to use to start a single\n\
269 daemon. The command string should include '%%s' where the\n\
270 name of the daemon should be substituted.\n\
272 -k, --kill-command\n\
273 Supply a Bourne shell to command to use to stop a single\n\
274 daemon. The command string should include '%%s' where the\n\
275 name of the daemon should be substituted.\n\
277 --dry Do not start or restart anything, just log.\n\
278 -p, --pid-file Set process identifier file name\n\
279 (default is %s/watchfrr.pid).\n\
280 -b, --blank-string\n\
281 When the supplied argument string is found in any of the\n\
282 various shell command arguments (-r, -s, or -k), replace\n\
283 it with a space. This is an ugly hack to circumvent problems\n\
284 passing command-line arguments with embedded spaces.\n\
285 -v, --version Print program version\n\
286 -h, --help Display this help and exit\n",
287 frr_vtydir
, DEFAULT_LOGLEVEL
, LOG_EMERG
, LOG_DEBUG
, LOG_DEBUG
,
288 DEFAULT_MIN_RESTART
, DEFAULT_MAX_RESTART
, DEFAULT_PERIOD
,
289 DEFAULT_TIMEOUT
, DEFAULT_RESTART_TIMEOUT
,
290 DEFAULT_RESTART_CMD
, DEFAULT_START_CMD
, DEFAULT_STOP_CMD
,
294 static pid_t
run_background(char *shell_cmd
)
298 switch (child
= fork()) {
300 flog_err_sys(EC_LIB_SYSTEM_CALL
,
301 "fork failed, cannot run command [%s]: %s",
302 shell_cmd
, safe_strerror(errno
));
306 /* Use separate process group so child processes can be killed
308 if (setpgid(0, 0) < 0)
309 zlog_warn("warning: setpgid(0,0) failed: %s",
310 safe_strerror(errno
));
314 char *const argv
[4] = {shell
, dashc
, shell_cmd
, NULL
};
315 execv("/bin/sh", argv
);
316 flog_err_sys(EC_LIB_SYSTEM_CALL
,
317 "execv(/bin/sh -c '%s') failed: %s",
318 shell_cmd
, safe_strerror(errno
));
322 /* Parent process: we will reap the child later. */
323 zlog_info("Forked background command [pid %d]: %s", (int)child
,
/*
 * Compute the wall-clock time that has passed since *start_time.
 *
 * The current time is fetched with gettimeofday() directly into *result,
 * start_time is then subtracted from it field by field, and a negative
 * microsecond field is normalized by borrowing from the seconds field.
 *
 * Returns result, so the call can be used inline, e.g.
 * time_elapsed(&delay, &t)->tv_sec.
 */
static struct timeval *time_elapsed(struct timeval *result,
				    const struct timeval *start_time)
{
	gettimeofday(result, NULL);

	result->tv_sec = result->tv_sec - start_time->tv_sec;
	result->tv_usec = result->tv_usec - start_time->tv_usec;

	/* Borrow whole seconds until the microsecond part is non-negative. */
	for (; result->tv_usec < 0; result->tv_sec--)
		result->tv_usec += 1000000L;

	return result;
}
342 static int restart_kill(struct thread
*t_kill
)
344 struct restart_info
*restart
= THREAD_ARG(t_kill
);
345 struct timeval delay
;
347 time_elapsed(&delay
, &restart
->time
);
349 "Warning: %s %s child process %d still running after %ld seconds, sending signal %d",
350 restart
->what
, restart
->name
, (int)restart
->pid
,
351 (long)delay
.tv_sec
, (restart
->kills
? SIGKILL
: SIGTERM
));
352 kill(-restart
->pid
, (restart
->kills
? SIGKILL
: SIGTERM
));
354 restart
->t_kill
= NULL
;
355 thread_add_timer(master
, restart_kill
, restart
, gs
.restart_timeout
,
360 static struct restart_info
*find_child(pid_t child
)
363 if (gs
.restart
.pid
== child
)
366 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
367 if (dmn
->restart
.pid
== child
)
368 return &dmn
->restart
;
373 static void sigchild(void)
379 struct restart_info
*restart
;
382 switch (child
= waitpid(-1, &status
, WNOHANG
)) {
384 flog_err_sys(EC_LIB_SYSTEM_CALL
, "waitpid failed: %s",
385 safe_strerror(errno
));
388 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
392 if (child
== integrated_write_pid
) {
393 integrated_write_sigchld(status
);
397 if ((restart
= find_child(child
)) != NULL
) {
398 name
= restart
->name
;
399 what
= restart
->what
;
402 thread_cancel(restart
->t_kill
);
403 restart
->t_kill
= NULL
;
404 /* Update restart time to reflect the time the command
406 gettimeofday(&restart
->time
, NULL
);
410 "waitpid returned status for an unknown child process %d",
415 if (WIFSTOPPED(status
))
416 zlog_warn("warning: %s %s process %d is stopped", what
, name
,
418 else if (WIFSIGNALED(status
))
419 zlog_warn("%s %s process %d terminated due to signal %d", what
,
420 name
, (int)child
, WTERMSIG(status
));
421 else if (WIFEXITED(status
)) {
422 if (WEXITSTATUS(status
) != 0)
424 "%s %s process %d exited with non-zero status %d",
425 what
, name
, (int)child
, WEXITSTATUS(status
));
427 zlog_debug("%s %s process %d exited normally", what
,
430 if (restart
&& restart
!= &gs
.restart
) {
431 dmn
= container_of(restart
, struct daemon
,
435 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
441 "cannot interpret %s %s process %d wait status 0x%x",
442 what
, name
, (int)child
, status
);
446 static int run_job(struct restart_info
*restart
, const char *cmdtype
,
447 const char *command
, int force
, int update_interval
)
449 struct timeval delay
;
451 if (gs
.loglevel
> LOG_DEBUG
+ 1)
452 zlog_debug("attempting to %s %s", cmdtype
, restart
->name
);
455 if (gs
.loglevel
> LOG_DEBUG
+ 1)
457 "cannot %s %s, previous pid %d still running",
458 cmdtype
, restart
->name
, (int)restart
->pid
);
462 #if defined HAVE_SYSTEMD
465 snprintf(buffer
, sizeof(buffer
), "restarting %s", restart
->name
);
466 systemd_send_status(buffer
);
469 /* Note: time_elapsed test must come before the force test, since we
471 to make sure that delay is initialized for use below in updating the
473 if ((time_elapsed(&delay
, &restart
->time
)->tv_sec
< restart
->interval
)
476 if (gs
.loglevel
> LOG_DEBUG
+ 1)
478 "postponing %s %s: elapsed time %ld < retry interval %ld",
479 cmdtype
, restart
->name
, (long)delay
.tv_sec
,
484 gettimeofday(&restart
->time
, NULL
);
487 char cmd
[strlen(command
) + strlen(restart
->name
) + 1];
488 snprintf(cmd
, sizeof(cmd
), command
, restart
->name
);
489 if ((restart
->pid
= run_background(cmd
)) > 0) {
490 restart
->t_kill
= NULL
;
491 thread_add_timer(master
, restart_kill
, restart
,
492 gs
.restart_timeout
, &restart
->t_kill
);
493 restart
->what
= cmdtype
;
499 #if defined HAVE_SYSTEMD
500 systemd_send_status("FRR Operational");
502 /* Calculate the new restart interval. */
503 if (update_interval
) {
504 if (delay
.tv_sec
> 2 * gs
.max_restart_interval
)
505 restart
->interval
= gs
.min_restart_interval
;
506 else if ((restart
->interval
*= 2) > gs
.max_restart_interval
)
507 restart
->interval
= gs
.max_restart_interval
;
508 if (gs
.loglevel
> LOG_DEBUG
+ 1)
509 zlog_debug("restart %s interval is now %ld",
510 restart
->name
, restart
->interval
);
515 #define SET_READ_HANDLER(DMN) \
517 (DMN)->t_read = NULL; \
518 thread_add_read(master, handle_read, (DMN), (DMN)->fd, \
522 #define SET_WAKEUP_DOWN(DMN) \
524 (DMN)->t_wakeup = NULL; \
525 thread_add_timer_msec(master, wakeup_down, (DMN), \
526 FUZZY(gs.period), &(DMN)->t_wakeup); \
529 #define SET_WAKEUP_UNRESPONSIVE(DMN) \
531 (DMN)->t_wakeup = NULL; \
532 thread_add_timer_msec(master, wakeup_unresponsive, (DMN), \
533 FUZZY(gs.period), &(DMN)->t_wakeup); \
536 #define SET_WAKEUP_ECHO(DMN) \
538 (DMN)->t_wakeup = NULL; \
539 thread_add_timer_msec(master, wakeup_send_echo, (DMN), \
540 FUZZY(gs.period), &(DMN)->t_wakeup); \
543 static int wakeup_down(struct thread
*t_wakeup
)
545 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
547 dmn
->t_wakeup
= NULL
;
548 if (try_connect(dmn
) < 0)
549 SET_WAKEUP_DOWN(dmn
);
550 if ((dmn
->connect_tries
> 1) && (dmn
->state
!= DAEMON_UP
))
555 static int wakeup_init(struct thread
*t_wakeup
)
557 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
559 dmn
->t_wakeup
= NULL
;
560 if (try_connect(dmn
) < 0) {
562 "%s state -> down : initial connection attempt failed",
564 dmn
->state
= DAEMON_DOWN
;
570 static void restart_done(struct daemon
*dmn
)
572 if (dmn
->state
!= DAEMON_DOWN
) {
574 "Daemon: %s: is in %s state but expected it to be in DAEMON_DOWN state",
575 dmn
->name
, state_str
[dmn
->state
]);
579 THREAD_OFF(dmn
->t_wakeup
);
580 if (try_connect(dmn
) < 0)
581 SET_WAKEUP_DOWN(dmn
);
584 static void daemon_down(struct daemon
*dmn
, const char *why
)
586 if (IS_UP(dmn
) || (dmn
->state
== DAEMON_INIT
))
587 flog_err(EC_WATCHFRR_CONNECTION
, "%s state -> down : %s",
589 else if (gs
.loglevel
> LOG_DEBUG
)
590 zlog_debug("%s still down : %s", dmn
->name
, why
);
593 dmn
->state
= DAEMON_DOWN
;
598 THREAD_OFF(dmn
->t_read
);
599 THREAD_OFF(dmn
->t_write
);
600 THREAD_OFF(dmn
->t_wakeup
);
601 if (try_connect(dmn
) < 0)
602 SET_WAKEUP_DOWN(dmn
);
606 static int handle_read(struct thread
*t_read
)
608 struct daemon
*dmn
= THREAD_ARG(t_read
);
609 static const char resp
[sizeof(PING_TOKEN
) + 4] = PING_TOKEN
"\n";
610 char buf
[sizeof(resp
) + 100];
612 struct timeval delay
;
615 if ((rc
= read(dmn
->fd
, buf
, sizeof(buf
))) < 0) {
618 if (ERRNO_IO_RETRY(errno
)) {
619 /* Pretend it never happened. */
620 SET_READ_HANDLER(dmn
);
623 snprintf(why
, sizeof(why
), "unexpected read error: %s",
624 safe_strerror(errno
));
625 daemon_down(dmn
, why
);
629 daemon_down(dmn
, "read returned EOF");
632 if (!dmn
->echo_sent
.tv_sec
) {
633 char why
[sizeof(buf
) + 100];
634 snprintf(why
, sizeof(why
),
635 "unexpected read returns %d bytes: %.*s", (int)rc
,
637 daemon_down(dmn
, why
);
641 /* We are expecting an echo response: is there any chance that the
642 response would not be returned entirely in the first read? That
643 seems inconceivable... */
644 if ((rc
!= sizeof(resp
)) || memcmp(buf
, resp
, sizeof(resp
))) {
645 char why
[100 + sizeof(buf
)];
646 snprintf(why
, sizeof(why
),
647 "read returned bad echo response of %d bytes (expecting %u): %.*s",
648 (int)rc
, (unsigned int)sizeof(resp
), (int)rc
, buf
);
649 daemon_down(dmn
, why
);
653 time_elapsed(&delay
, &dmn
->echo_sent
);
654 dmn
->echo_sent
.tv_sec
= 0;
655 if (dmn
->state
== DAEMON_UNRESPONSIVE
) {
656 if (delay
.tv_sec
< gs
.timeout
) {
657 dmn
->state
= DAEMON_UP
;
659 "%s state -> up : echo response received after %ld.%06ld seconds",
660 dmn
->name
, (long)delay
.tv_sec
,
661 (long)delay
.tv_usec
);
664 "%s: slow echo response finally received after %ld.%06ld seconds",
665 dmn
->name
, (long)delay
.tv_sec
,
666 (long)delay
.tv_usec
);
667 } else if (gs
.loglevel
> LOG_DEBUG
+ 1)
668 zlog_debug("%s: echo response received after %ld.%06ld seconds",
669 dmn
->name
, (long)delay
.tv_sec
, (long)delay
.tv_usec
);
671 SET_READ_HANDLER(dmn
);
673 thread_cancel(dmn
->t_wakeup
);
674 SET_WAKEUP_ECHO(dmn
);
 * Wait until we notice that all daemons are ready before
 * notifying systemd that we are ready.
683 static void daemon_send_ready(int exitcode
)
693 zlog_notice("all daemons up, doing startup-complete notify");
694 else if (gs
.numdown
< gs
.numdaemons
)
695 flog_err(EC_WATCHFRR_CONNECTION
,
696 "startup did not complete within timeout (%d/%d daemons running)",
697 gs
.numdaemons
- gs
.numdown
, gs
.numdaemons
);
699 flog_err(EC_WATCHFRR_CONNECTION
,
700 "all configured daemons failed to start -- exiting watchfrr");
707 snprintf(started
, sizeof(started
), "%s%s", frr_vtydir
,
709 fp
= fopen(started
, "w");
712 #if defined HAVE_SYSTEMD
713 systemd_send_started(master
, 0);
714 systemd_send_status("FRR Operational");
719 static void daemon_up(struct daemon
*dmn
, const char *why
)
721 dmn
->state
= DAEMON_UP
;
723 dmn
->connect_tries
= 0;
724 zlog_notice("%s state -> up : %s", dmn
->name
, why
);
726 daemon_send_ready(0);
727 SET_WAKEUP_ECHO(dmn
);
731 static int check_connect(struct thread
*t_write
)
733 struct daemon
*dmn
= THREAD_ARG(t_write
);
735 socklen_t reslen
= sizeof(sockerr
);
738 if (getsockopt(dmn
->fd
, SOL_SOCKET
, SO_ERROR
, (char *)&sockerr
, &reslen
)
740 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn
->name
,
741 safe_strerror(errno
));
743 "getsockopt failed checking connection success");
746 if ((reslen
== sizeof(sockerr
)) && sockerr
) {
750 "getsockopt reports that connection attempt failed: %s",
751 safe_strerror(sockerr
));
752 daemon_down(dmn
, why
);
756 daemon_up(dmn
, "delayed connect succeeded");
760 static int wakeup_connect_hanging(struct thread
*t_wakeup
)
762 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
765 dmn
->t_wakeup
= NULL
;
766 snprintf(why
, sizeof(why
),
767 "connection attempt timed out after %ld seconds", gs
.timeout
);
768 daemon_down(dmn
, why
);
772 /* Making connection to protocol daemon. */
773 static int try_connect(struct daemon
*dmn
)
776 struct sockaddr_un addr
;
779 if (gs
.loglevel
> LOG_DEBUG
+ 1)
780 zlog_debug("%s: attempting to connect", dmn
->name
);
781 dmn
->connect_tries
++;
783 memset(&addr
, 0, sizeof(struct sockaddr_un
));
784 addr
.sun_family
= AF_UNIX
;
785 snprintf(addr
.sun_path
, sizeof(addr
.sun_path
), "%s/%s.vty", gs
.vtydir
,
787 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
788 len
= addr
.sun_len
= SUN_LEN(&addr
);
790 len
= sizeof(addr
.sun_family
) + strlen(addr
.sun_path
);
791 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
793 /* Quick check to see if we might succeed before we go to the trouble
794 of creating a socket. */
795 if (access(addr
.sun_path
, W_OK
) < 0) {
797 flog_err_sys(EC_LIB_SYSTEM_CALL
,
798 "%s: access to socket %s denied: %s",
799 dmn
->name
, addr
.sun_path
,
800 safe_strerror(errno
));
804 if ((sock
= socket(AF_UNIX
, SOCK_STREAM
, 0)) < 0) {
805 flog_err_sys(EC_LIB_SOCKET
, "%s(%s): cannot make socket: %s",
806 __func__
, addr
.sun_path
, safe_strerror(errno
));
810 if (set_nonblocking(sock
) < 0 || set_cloexec(sock
) < 0) {
811 flog_err_sys(EC_LIB_SYSTEM_CALL
,
812 "%s(%s): set_nonblocking/cloexec(%d) failed",
813 __func__
, addr
.sun_path
, sock
);
818 if (connect(sock
, (struct sockaddr
*)&addr
, len
) < 0) {
819 if ((errno
!= EINPROGRESS
) && (errno
!= EWOULDBLOCK
)) {
820 if (gs
.loglevel
> LOG_DEBUG
)
821 zlog_debug("%s(%s): connect failed: %s",
822 __func__
, addr
.sun_path
,
823 safe_strerror(errno
));
827 if (gs
.loglevel
> LOG_DEBUG
)
828 zlog_debug("%s: connection in progress", dmn
->name
);
829 dmn
->state
= DAEMON_CONNECTING
;
832 thread_add_write(master
, check_connect
, dmn
, dmn
->fd
,
834 dmn
->t_wakeup
= NULL
;
835 thread_add_timer(master
, wakeup_connect_hanging
, dmn
,
836 gs
.timeout
, &dmn
->t_wakeup
);
837 SET_READ_HANDLER(dmn
);
842 SET_READ_HANDLER(dmn
);
843 daemon_up(dmn
, "connect succeeded");
847 static int phase_hanging(struct thread
*t_hanging
)
849 gs
.t_phase_hanging
= NULL
;
850 flog_err(EC_WATCHFRR_CONNECTION
,
851 "Phase [%s] hanging for %ld seconds, aborting phased restart",
852 phase_str
[gs
.phase
], PHASE_TIMEOUT
);
853 gs
.phase
= PHASE_NONE
;
857 static void set_phase(restart_phase_t new_phase
)
859 gs
.phase
= new_phase
;
860 if (gs
.t_phase_hanging
)
861 thread_cancel(gs
.t_phase_hanging
);
862 gs
.t_phase_hanging
= NULL
;
863 thread_add_timer(master
, phase_hanging
, NULL
, PHASE_TIMEOUT
,
864 &gs
.t_phase_hanging
);
867 static void phase_check(void)
876 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
877 if (dmn
->state
== DAEMON_INIT
)
880 /* startup complete, everything out of INIT */
881 gs
.phase
= PHASE_NONE
;
882 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
883 if (dmn
->state
== DAEMON_DOWN
) {
884 SET_WAKEUP_DOWN(dmn
);
888 case PHASE_STOPS_PENDING
:
892 "Phased restart: all routing daemon stop jobs have completed.");
893 set_phase(PHASE_WAITING_DOWN
);
896 case PHASE_WAITING_DOWN
:
897 if (gs
.numdown
+ IS_UP(gs
.special
) < gs
.numdaemons
)
899 zlog_info("Phased restart: all routing daemons now down.");
900 run_job(&gs
.special
->restart
, "restart", gs
.restart_command
, 1,
902 set_phase(PHASE_ZEBRA_RESTART_PENDING
);
905 case PHASE_ZEBRA_RESTART_PENDING
:
906 if (gs
.special
->restart
.pid
)
908 zlog_info("Phased restart: %s restart job completed.",
910 set_phase(PHASE_WAITING_ZEBRA_UP
);
913 case PHASE_WAITING_ZEBRA_UP
:
914 if (!IS_UP(gs
.special
))
916 zlog_info("Phased restart: %s is now up.", gs
.special
->name
);
919 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
920 if (dmn
!= gs
.special
)
921 run_job(&dmn
->restart
, "start",
922 gs
.start_command
, 1, 0);
925 gs
.phase
= PHASE_NONE
;
926 THREAD_OFF(gs
.t_phase_hanging
);
927 zlog_notice("Phased global restart has completed.");
932 static void try_restart(struct daemon
*dmn
)
937 if (dmn
!= gs
.special
) {
938 if ((gs
.special
->state
== DAEMON_UP
)
939 && (gs
.phase
== PHASE_NONE
))
940 run_job(&dmn
->restart
, "restart", gs
.restart_command
, 0,
944 "%s: postponing restart attempt because master %s daemon not up [%s], or phased restart in progress",
945 dmn
->name
, gs
.special
->name
,
946 state_str
[gs
.special
->state
]);
950 if ((gs
.phase
!= PHASE_NONE
) || gs
.numpids
) {
951 if (gs
.loglevel
> LOG_DEBUG
+ 1)
953 "postponing phased global restart: restart already in progress [%s], or outstanding child processes [%d]",
954 phase_str
[gs
.phase
], gs
.numpids
);
957 /* Is it too soon for a restart? */
959 struct timeval delay
;
960 if (time_elapsed(&delay
, &gs
.special
->restart
.time
)->tv_sec
961 < gs
.special
->restart
.interval
) {
962 if (gs
.loglevel
> LOG_DEBUG
+ 1)
964 "postponing phased global restart: elapsed time %ld < retry interval %ld",
966 gs
.special
->restart
.interval
);
970 run_job(&gs
.restart
, "restart", gs
.restart_command
, 0, 1);
973 static int wakeup_unresponsive(struct thread
*t_wakeup
)
975 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
977 dmn
->t_wakeup
= NULL
;
978 if (dmn
->state
!= DAEMON_UNRESPONSIVE
)
979 flog_err(EC_WATCHFRR_CONNECTION
,
980 "%s: no longer unresponsive (now %s), wakeup should have been cancelled!",
981 dmn
->name
, state_str
[dmn
->state
]);
983 SET_WAKEUP_UNRESPONSIVE(dmn
);
989 static int wakeup_no_answer(struct thread
*t_wakeup
)
991 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
993 dmn
->t_wakeup
= NULL
;
994 dmn
->state
= DAEMON_UNRESPONSIVE
;
995 if (dmn
->ignore_timeout
)
997 flog_err(EC_WATCHFRR_CONNECTION
,
998 "%s state -> unresponsive : no response yet to ping sent %ld seconds ago",
999 dmn
->name
, gs
.timeout
);
1000 SET_WAKEUP_UNRESPONSIVE(dmn
);
1005 static int wakeup_send_echo(struct thread
*t_wakeup
)
1007 static const char echocmd
[] = "echo " PING_TOKEN
;
1009 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
1011 dmn
->t_wakeup
= NULL
;
1012 if (((rc
= write(dmn
->fd
, echocmd
, sizeof(echocmd
))) < 0)
1013 || ((size_t)rc
!= sizeof(echocmd
))) {
1014 char why
[100 + sizeof(echocmd
)];
1015 snprintf(why
, sizeof(why
),
1016 "write '%s' returned %d instead of %u", echocmd
,
1017 (int)rc
, (unsigned int)sizeof(echocmd
));
1018 daemon_down(dmn
, why
);
1020 gettimeofday(&dmn
->echo_sent
, NULL
);
1021 dmn
->t_wakeup
= NULL
;
1022 thread_add_timer(master
, wakeup_no_answer
, dmn
, gs
.timeout
,
1028 bool check_all_up(void)
1032 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
1033 if (dmn
->state
!= DAEMON_UP
)
1038 void watchfrr_status(struct vty
*vty
)
1041 struct timeval delay
;
1043 vty_out(vty
, "watchfrr global phase: %s\n", phase_str
[gs
.phase
]);
1045 vty_out(vty
, " global restart running, pid %ld\n",
1046 (long)gs
.restart
.pid
);
1048 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
1049 vty_out(vty
, " %-20s %s%s", dmn
->name
, state_str
[dmn
->state
],
1050 dmn
->ignore_timeout
? "/Ignoring Timeout\n" : "\n");
1051 if (dmn
->restart
.pid
)
1052 vty_out(vty
, " restart running, pid %ld\n",
1053 (long)dmn
->restart
.pid
);
1054 else if (dmn
->state
== DAEMON_DOWN
&&
1055 time_elapsed(&delay
, &dmn
->restart
.time
)->tv_sec
1056 < dmn
->restart
.interval
)
1057 vty_out(vty
, " restarting in %jd seconds (%jds backoff interval)\n",
1058 (intmax_t)dmn
->restart
.interval
1059 - (intmax_t)delay
.tv_sec
,
1060 (intmax_t)dmn
->restart
.interval
);
1064 static void sigint(void)
1066 zlog_notice("Terminating on signal");
1067 systemd_send_stopping();
/*
 * Validate a shell command template.
 *
 * A template is accepted only if its first '%' is immediately followed
 * by 's' and no other '%' appears after it, i.e. it contains exactly one
 * "%s" placeholder and no other '%' characters.
 *
 * Returns 1 if the template is acceptable, 0 otherwise.
 */
static int valid_command(const char *cmd)
{
	const char *percent = strchr(cmd, '%');

	if (percent == NULL)
		return 0;

	if (percent[1] != 's')
		return 0;

	/* No further '%' may follow the placeholder. */
	if (strchr(percent + 1, '%') != NULL)
		return 0;

	return 1;
}
/*
 * This is an ugly hack to circumvent problems with passing command-line
 * arguments that contain spaces.  The fix is to use a configuration file.
 *
 * Returns a newly allocated copy of cmd in which every non-overlapping
 * occurrence of blankstr (scanning left to right) has been replaced by a
 * single space.  The caller owns the returned string.  If the copy cannot
 * be allocated, an error is printed and the process exits with status 1.
 *
 * The search resumes just after each inserted space instead of
 * restarting at the beginning of the string, so a blankstr that itself
 * is (or starts with) a space cannot be matched again at the same spot
 * and loop forever.  An empty blankstr matches nothing and yields an
 * unmodified copy of cmd.
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	char *res;
	char *p;
	size_t bslen = strlen(blankstr);

	if (!(res = strdup(cmd))) {
		perror("strdup");
		exit(1);
	}

	/*
	 * strstr() finds an empty needle at every position; treating that
	 * as a match would grow the string past its allocation.
	 */
	if (bslen == 0)
		return res;

	p = res;
	while ((p = strstr(p, blankstr)) != NULL) {
		*p = ' ';
		/* Close the gap left by the remainder of the blank string. */
		if (bslen != 1)
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
		/* Continue after the space we just wrote. */
		p++;
	}
	return res;
}
1099 static int startup_timeout(struct thread
*t_wakeup
)
1101 daemon_send_ready(1);
1105 static void watchfrr_init(int argc
, char **argv
)
1107 const char *special
= "zebra";
1109 struct daemon
*dmn
, **add
= &gs
.daemons
;
1110 char alldaemons
[512] = "", *p
= alldaemons
;
1112 thread_add_timer_msec(master
, startup_timeout
, NULL
, STARTUP_TIMEOUT
,
1113 &gs
.t_startup_timeout
);
1115 for (i
= optind
; i
< argc
; i
++) {
1116 dmn
= XCALLOC(MTYPE_WATCHFRR_DAEMON
, sizeof(*dmn
));
1118 dmn
->name
= dmn
->restart
.name
= argv
[i
];
1119 dmn
->state
= DAEMON_INIT
;
1123 dmn
->t_wakeup
= NULL
;
1124 thread_add_timer_msec(master
, wakeup_init
, dmn
, 0,
1126 dmn
->restart
.interval
= gs
.min_restart_interval
;
1130 if (!strcmp(dmn
->name
, special
))
1136 "Must specify one or more daemons to monitor.\n\n");
1139 if (!watch_only
&& !gs
.special
) {
1140 fprintf(stderr
, "\"%s\" daemon must be in daemon lists\n\n",
1145 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
1146 snprintf(p
, alldaemons
+ sizeof(alldaemons
) - p
, "%s%s",
1147 (p
== alldaemons
) ? "" : " ", dmn
->name
);
1150 zlog_notice("%s %s watching [%s]%s", progname
, FRR_VERSION
, alldaemons
,
1151 watch_only
? ", monitor mode" : "");
1154 struct zebra_privs_t watchfrr_privs
= {
1156 .vty_group
= VTY_GROUP
,
1160 static struct quagga_signal_t watchfrr_signals
[] = {
1171 .handler
= sigchild
,
1175 FRR_DAEMON_INFO(watchfrr
, WATCHFRR
,
1176 .flags
= FRR_NO_PRIVSEP
| FRR_NO_TCPVTY
| FRR_LIMITED_CLI
1177 | FRR_NO_CFG_PID_DRY
| FRR_NO_ZCLIENT
1180 .printhelp
= printhelp
,
1181 .copyright
= "Copyright 2004 Andrew J. Schorr",
1183 .signals
= watchfrr_signals
,
1184 .n_signals
= array_size(watchfrr_signals
),
1186 .privs
= &watchfrr_privs
, )
1188 #define DEPRECATED_OPTIONS "aAezR:"
1190 int main(int argc
, char **argv
)
1193 const char *blankstr
= NULL
;
1195 frr_preinit(&watchfrr_di
, argc
, argv
);
1196 progname
= watchfrr_di
.progname
;
1198 frr_opt_add("b:dk:l:i:p:r:S:s:t:T:" DEPRECATED_OPTIONS
, longopts
, "");
1200 gs
.restart
.name
= "all";
1201 while ((opt
= frr_getopt(argc
, argv
, NULL
)) != EOF
) {
1202 if (opt
&& opt
< 128 && strchr(DEPRECATED_OPTIONS
, opt
)) {
1204 "The -%c option no longer exists.\n"
1205 "Please refer to the watchfrr(8) man page.\n",
1220 if (!valid_command(optarg
)) {
1222 "Invalid kill command, must contain '%%s': %s\n",
1226 gs
.stop_command
= optarg
;
1230 if ((sscanf(optarg
, "%d%1s", &gs
.loglevel
, garbage
)
1232 || (gs
.loglevel
< LOG_EMERG
)) {
1234 "Invalid loglevel argument: %s\n",
1239 case OPTION_MINRESTART
: {
1241 if ((sscanf(optarg
, "%ld%1s", &gs
.min_restart_interval
,
1244 || (gs
.min_restart_interval
< 0)) {
1246 "Invalid min_restart_interval argument: %s\n",
1251 case OPTION_MAXRESTART
: {
1253 if ((sscanf(optarg
, "%ld%1s", &gs
.max_restart_interval
,
1256 || (gs
.max_restart_interval
< 0)) {
1258 "Invalid max_restart_interval argument: %s\n",
1266 if ((sscanf(optarg
, "%d%1s", &period
, garbage
) != 1)
1267 || (gs
.period
< 1)) {
1269 "Invalid interval argument: %s\n",
1273 gs
.period
= 1000 * period
;
1276 watchfrr_di
.pid_file
= optarg
;
1279 if (!valid_command(optarg
)) {
1281 "Invalid restart command, must contain '%%s': %s\n",
1285 gs
.restart_command
= optarg
;
1288 if (!valid_command(optarg
)) {
1290 "Invalid start command, must contain '%%s': %s\n",
1294 gs
.start_command
= optarg
;
1301 if ((sscanf(optarg
, "%ld%1s", &gs
.timeout
, garbage
)
1303 || (gs
.timeout
< 1)) {
1305 "Invalid timeout argument: %s\n",
1312 if ((sscanf(optarg
, "%ld%1s", &gs
.restart_timeout
,
1315 || (gs
.restart_timeout
< 1)) {
1317 "Invalid restart timeout argument: %s\n",
1323 fputs("Invalid option.\n", stderr
);
1329 && (gs
.start_command
|| gs
.stop_command
|| gs
.restart_command
)) {
1330 fputs("Options -r/-s/-k are not used when --dry is active.\n",
1334 && (!gs
.restart_command
|| !gs
.start_command
|| !gs
.stop_command
)) {
1336 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1341 if (gs
.restart_command
)
1342 gs
.restart_command
=
1343 translate_blanks(gs
.restart_command
, blankstr
);
1344 if (gs
.start_command
)
1346 translate_blanks(gs
.start_command
, blankstr
);
1347 if (gs
.stop_command
)
1349 translate_blanks(gs
.stop_command
, blankstr
);
1352 gs
.restart
.interval
= gs
.min_restart_interval
;
1354 master
= frr_init();
1355 watchfrr_error_init();
1356 watchfrr_init(argc
, argv
);
1357 watchfrr_vty_init();
1361 if (watchfrr_di
.daemon_mode
)
1362 zlog_syslog_set_prio_min(MIN(gs
.loglevel
, LOG_DEBUG
));
1364 zlog_aux_init(NULL
, MIN(gs
.loglevel
, LOG_DEBUG
));
1368 systemd_send_stopping();