2 * Monitor status of frr daemons and restart if necessary.
4 * Copyright (C) 2004 Andrew J. Schorr
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
26 #include <lib/version.h>
28 #include "memory_vty.h"
30 #include "lib_errors.h"
39 #include "watchfrr_errors.h"
/*
 * Smaller of two values.  Each argument may be evaluated twice, so do not
 * pass expressions with side effects.  Guarded because some system headers
 * provide their own MIN.
 */
#ifndef MIN
#define MIN(X, Y) (((X) <= (Y)) ? (X) : (Y))
#endif
/*
 * Macros to help randomize timers.
 *
 * JITTER(X) yields a pseudo-random offset in [-(X)/2, (X) - (X)/2].
 * FUZZY(X) perturbs X by JITTER((X)/20), i.e. by roughly +/- 2.5%.
 * X must be non-negative; both macros evaluate X more than once.
 */
#define JITTER(X) ((random() % ((X) + 1)) - ((X) / 2))
#define FUZZY(X) ((X) + JITTER((X) / 20))
/* Built-in defaults; the time values are in seconds. */
#define DEFAULT_PERIOD 5           /* status polling interval */
#define DEFAULT_TIMEOUT 90         /* unresponsiveness timeout */
#define DEFAULT_RESTART_TIMEOUT 20 /* restart (kill) timeout for jobs */
#define DEFAULT_LOGLEVEL LOG_INFO
#define DEFAULT_MIN_RESTART 60     /* minimum delay between restart commands */
#define DEFAULT_MAX_RESTART 600    /* maximum delay between restart commands */

/* Shell command templates; "%s" is replaced by the daemon name. */
#define DEFAULT_RESTART_CMD WATCHFRR_SH_PATH " restart %s"
#define DEFAULT_START_CMD WATCHFRR_SH_PATH " start %s"
#define DEFAULT_STOP_CMD WATCHFRR_SH_PATH " stop %s"

/* Token sent with the "echo" command and expected back in the reply. */
#define PING_TOKEN "PING"
62 DEFINE_MGROUP(WATCHFRR
, "watchfrr")
63 DEFINE_MTYPE_STATIC(WATCHFRR
, WATCHFRR_DAEMON
, "watchfrr daemon entry")
65 /* Needs to be global, referenced somewhere inside libfrr. */
66 struct thread_master
*master
;
68 static bool watch_only
= false;
75 PHASE_ZEBRA_RESTART_PENDING
,
76 PHASE_WAITING_ZEBRA_UP
79 static const char *phase_str
[] = {
83 "Waiting for other daemons to come down",
84 "Zebra restart job running",
85 "Waiting for zebra to come up",
/* Seconds a phased-restart phase may last before it is declared hung. */
#define PHASE_TIMEOUT (3 * gs.restart_timeout)
/*
 * Milliseconds allowed for startup (passed to thread_add_timer_msec).
 * Parenthesized so the macro stays a single operand inside any expression.
 */
#define STARTUP_TIMEOUT (55 * 1000)
98 struct thread
*t_kill
;
102 static struct global_state
{
103 restart_phase_t phase
;
104 struct thread
*t_phase_hanging
;
105 struct thread
*t_startup_timeout
;
109 long restart_timeout
;
110 long min_restart_interval
;
111 long max_restart_interval
;
112 struct daemon
*daemons
;
113 const char *restart_command
;
114 const char *start_command
;
115 const char *stop_command
;
116 struct restart_info restart
;
118 struct daemon
*special
; /* points to zebra when doing phased restart */
121 int numdown
; /* # of daemons that are not UP or UNRESPONSIVE */
124 .vtydir
= frr_vtydir
,
125 .period
= 1000 * DEFAULT_PERIOD
,
126 .timeout
= DEFAULT_TIMEOUT
,
127 .restart_timeout
= DEFAULT_RESTART_TIMEOUT
,
128 .loglevel
= DEFAULT_LOGLEVEL
,
129 .min_restart_interval
= DEFAULT_MIN_RESTART
,
130 .max_restart_interval
= DEFAULT_MAX_RESTART
,
131 .restart_command
= DEFAULT_RESTART_CMD
,
132 .start_command
= DEFAULT_START_CMD
,
133 .stop_command
= DEFAULT_STOP_CMD
,
145 (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
/*
 * Printable names for daemon states, indexed by the daemon's state value
 * (used as state_str[dmn->state]).
 * NOTE(review): the order must match the daemon_state_t enumerators -- confirm
 * against the enum definition whenever a state is added.
 */
static const char *state_str[] = {
	"Init", "Down", "Connecting", "Up", "Unresponsive",
};
153 daemon_state_t state
;
155 struct timeval echo_sent
;
156 unsigned int connect_tries
;
157 struct thread
*t_wakeup
;
158 struct thread
*t_read
;
159 struct thread
*t_write
;
161 struct restart_info restart
;
/*
 * Option codes for long options that have no single-character form.  They
 * lie above the range of a char so they cannot collide with short options.
 */
#define OPTION_MINRESTART 2000
#define OPTION_MAXRESTART 2001
#define OPTION_DRY 2002

/*
 * Long command-line options and the value the option parser returns for
 * each.  The all-zero entry terminates the table.
 */
static const struct option longopts[] = {
	{"daemon", no_argument, NULL, 'd'},
	{"statedir", required_argument, NULL, 'S'},
	{"loglevel", required_argument, NULL, 'l'},
	{"interval", required_argument, NULL, 'i'},
	{"timeout", required_argument, NULL, 't'},
	{"restart-timeout", required_argument, NULL, 'T'},
	{"restart", required_argument, NULL, 'r'},
	{"start-command", required_argument, NULL, 's'},
	{"kill-command", required_argument, NULL, 'k'},
	{"dry", no_argument, NULL, OPTION_DRY},
	{"min-restart-interval", required_argument, NULL, OPTION_MINRESTART},
	{"max-restart-interval", required_argument, NULL, OPTION_MAXRESTART},
	{"pid-file", required_argument, NULL, 'p'},
	{"blank-string", required_argument, NULL, 'b'},
	{"help", no_argument, NULL, 'h'},
	{"version", no_argument, NULL, 'v'},
	{NULL, 0, NULL, 0},
};
/* Forward declarations for handlers that reference one another. */
static int try_connect(struct daemon *dmn);
static int wakeup_send_echo(struct thread *t_wakeup);
static void try_restart(struct daemon *dmn);
static void phase_check(void);
static void restart_done(struct daemon *dmn);

/* Program name; set in main() from watchfrr_di.progname. */
static const char *progname;
194 static void printhelp(FILE *target
)
197 "Usage : %s [OPTION...] <daemon name> ...\n\n\
198 Watchdog program to monitor status of frr daemons and try to restart\n\
199 them if they are down or unresponsive. It determines whether a daemon is\n\
200 up based on whether it can connect to the daemon's vty unix stream socket.\n\
201 It then repeatedly sends echo commands over that socket to determine whether\n\
202 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
203 on the socket connection and know immediately that the daemon is down.\n\n\
204 The daemons to be monitored should be listed on the command line.\n\n\
205 In order to avoid attempting to restart the daemons in a fast loop,\n\
206 the -m and -M options allow you to control the minimum delay between\n\
207 restart commands. The minimum restart delay is recalculated each time\n\
208 a restart is attempted: if the time since the last restart attempt exceeds\n\
209 twice the -M value, then the restart delay is set to the -m value.\n\
210 Otherwise, the interval is doubled (but capped at the -M value).\n\n",
215 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
216 to syslog instead of stdout.\n\
217 -S, --statedir Set the vty socket directory (default is %s)\n\
218 -l, --loglevel Set the logging level (default is %d).\n\
219 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
220 but it can be set higher than %d if extra-verbose debugging\n\
221 messages are desired.\n\
222 --min-restart-interval\n\
223 Set the minimum seconds to wait between invocations of daemon\n\
224 restart commands (default is %d).\n\
225 --max-restart-interval\n\
226 Set the maximum seconds to wait between invocations of daemon\n\
227 restart commands (default is %d).\n\
228 -i, --interval Set the status polling interval in seconds (default is %d)\n\
229 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
230 -T, --restart-timeout\n\
231 Set the restart (kill) timeout in seconds (default is %d).\n\
232 If any background jobs are still running after this much\n\
233 time has elapsed, they will be killed.\n\
234 -r, --restart Supply a Bourne shell command to use to restart a single\n\
235 daemon. The command string should include '%%s' where the\n\
236 name of the daemon should be substituted.\n\
238 -s, --start-command\n\
239 Supply a Bourne shell to command to use to start a single\n\
240 daemon. The command string should include '%%s' where the\n\
241 name of the daemon should be substituted.\n\
243 -k, --kill-command\n\
244 Supply a Bourne shell to command to use to stop a single\n\
245 daemon. The command string should include '%%s' where the\n\
246 name of the daemon should be substituted.\n\
248 --dry Do not start or restart anything, just log.\n\
249 -p, --pid-file Set process identifier file name\n\
250 (default is %s/watchfrr.pid).\n\
251 -b, --blank-string\n\
252 When the supplied argument string is found in any of the\n\
253 various shell command arguments (-r, -s, or -k), replace\n\
254 it with a space. This is an ugly hack to circumvent problems\n\
255 passing command-line arguments with embedded spaces.\n\
256 -v, --version Print program version\n\
257 -h, --help Display this help and exit\n",
258 frr_vtydir
, DEFAULT_LOGLEVEL
, LOG_EMERG
, LOG_DEBUG
, LOG_DEBUG
,
259 DEFAULT_MIN_RESTART
, DEFAULT_MAX_RESTART
, DEFAULT_PERIOD
,
260 DEFAULT_TIMEOUT
, DEFAULT_RESTART_TIMEOUT
,
261 DEFAULT_RESTART_CMD
, DEFAULT_START_CMD
, DEFAULT_STOP_CMD
,
265 static pid_t
run_background(char *shell_cmd
)
269 switch (child
= fork()) {
271 flog_err_sys(EC_LIB_SYSTEM_CALL
,
272 "fork failed, cannot run command [%s]: %s",
273 shell_cmd
, safe_strerror(errno
));
277 /* Use separate process group so child processes can be killed
279 if (setpgid(0, 0) < 0)
280 zlog_warn("warning: setpgid(0,0) failed: %s",
281 safe_strerror(errno
));
285 char *const argv
[4] = {shell
, dashc
, shell_cmd
, NULL
};
286 execv("/bin/sh", argv
);
287 flog_err_sys(EC_LIB_SYSTEM_CALL
,
288 "execv(/bin/sh -c '%s') failed: %s",
289 shell_cmd
, safe_strerror(errno
));
293 /* Parent process: we will reap the child later. */
294 flog_err_sys(EC_LIB_SYSTEM_CALL
,
295 "Forked background command [pid %d]: %s",
296 (int)child
, shell_cmd
);
301 static struct timeval
*time_elapsed(struct timeval
*result
,
302 const struct timeval
*start_time
)
304 gettimeofday(result
, NULL
);
305 result
->tv_sec
-= start_time
->tv_sec
;
306 result
->tv_usec
-= start_time
->tv_usec
;
307 while (result
->tv_usec
< 0) {
308 result
->tv_usec
+= 1000000L;
314 static int restart_kill(struct thread
*t_kill
)
316 struct restart_info
*restart
= THREAD_ARG(t_kill
);
317 struct timeval delay
;
319 time_elapsed(&delay
, &restart
->time
);
321 "Warning: %s %s child process %d still running after "
322 "%ld seconds, sending signal %d",
323 restart
->what
, restart
->name
, (int)restart
->pid
,
324 (long)delay
.tv_sec
, (restart
->kills
? SIGKILL
: SIGTERM
));
325 kill(-restart
->pid
, (restart
->kills
? SIGKILL
: SIGTERM
));
327 restart
->t_kill
= NULL
;
328 thread_add_timer(master
, restart_kill
, restart
, gs
.restart_timeout
,
333 static struct restart_info
*find_child(pid_t child
)
336 if (gs
.restart
.pid
== child
)
339 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
340 if (dmn
->restart
.pid
== child
)
341 return &dmn
->restart
;
346 static void sigchild(void)
352 struct restart_info
*restart
;
355 switch (child
= waitpid(-1, &status
, WNOHANG
)) {
357 flog_err_sys(EC_LIB_SYSTEM_CALL
, "waitpid failed: %s",
358 safe_strerror(errno
));
361 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
365 if (child
== integrated_write_pid
) {
366 integrated_write_sigchld(status
);
370 if ((restart
= find_child(child
)) != NULL
) {
371 name
= restart
->name
;
372 what
= restart
->what
;
375 thread_cancel(restart
->t_kill
);
376 restart
->t_kill
= NULL
;
377 /* Update restart time to reflect the time the command
379 gettimeofday(&restart
->time
, NULL
);
383 "waitpid returned status for an unknown child process %d",
388 if (WIFSTOPPED(status
))
389 zlog_warn("warning: %s %s process %d is stopped", what
, name
,
391 else if (WIFSIGNALED(status
))
392 zlog_warn("%s %s process %d terminated due to signal %d", what
,
393 name
, (int)child
, WTERMSIG(status
));
394 else if (WIFEXITED(status
)) {
395 if (WEXITSTATUS(status
) != 0)
397 "%s %s process %d exited with non-zero status %d",
398 what
, name
, (int)child
, WEXITSTATUS(status
));
400 zlog_debug("%s %s process %d exited normally", what
,
403 if (restart
&& restart
!= &gs
.restart
) {
404 dmn
= container_of(restart
, struct daemon
,
408 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
414 "cannot interpret %s %s process %d wait status 0x%x",
415 what
, name
, (int)child
, status
);
419 static int run_job(struct restart_info
*restart
, const char *cmdtype
,
420 const char *command
, int force
, int update_interval
)
422 struct timeval delay
;
424 if (gs
.loglevel
> LOG_DEBUG
+ 1)
425 zlog_debug("attempting to %s %s", cmdtype
, restart
->name
);
428 if (gs
.loglevel
> LOG_DEBUG
+ 1)
430 "cannot %s %s, previous pid %d still running",
431 cmdtype
, restart
->name
, (int)restart
->pid
);
435 /* Note: time_elapsed test must come before the force test, since we
437 to make sure that delay is initialized for use below in updating the
439 if ((time_elapsed(&delay
, &restart
->time
)->tv_sec
< restart
->interval
)
441 if (gs
.loglevel
> LOG_DEBUG
+ 1)
444 "elapsed time %ld < retry interval %ld",
445 cmdtype
, restart
->name
, (long)delay
.tv_sec
,
450 gettimeofday(&restart
->time
, NULL
);
453 char cmd
[strlen(command
) + strlen(restart
->name
) + 1];
454 snprintf(cmd
, sizeof(cmd
), command
, restart
->name
);
455 if ((restart
->pid
= run_background(cmd
)) > 0) {
456 restart
->t_kill
= NULL
;
457 thread_add_timer(master
, restart_kill
, restart
,
458 gs
.restart_timeout
, &restart
->t_kill
);
459 restart
->what
= cmdtype
;
465 /* Calculate the new restart interval. */
466 if (update_interval
) {
467 if (delay
.tv_sec
> 2 * gs
.max_restart_interval
)
468 restart
->interval
= gs
.min_restart_interval
;
469 else if ((restart
->interval
*= 2) > gs
.max_restart_interval
)
470 restart
->interval
= gs
.max_restart_interval
;
471 if (gs
.loglevel
> LOG_DEBUG
+ 1)
472 zlog_debug("restart %s interval is now %ld",
473 restart
->name
, restart
->interval
);
478 #define SET_READ_HANDLER(DMN) \
480 (DMN)->t_read = NULL; \
481 thread_add_read(master, handle_read, (DMN), (DMN)->fd, \
485 #define SET_WAKEUP_DOWN(DMN) \
487 (DMN)->t_wakeup = NULL; \
488 thread_add_timer_msec(master, wakeup_down, (DMN), \
489 FUZZY(gs.period), &(DMN)->t_wakeup); \
492 #define SET_WAKEUP_UNRESPONSIVE(DMN) \
494 (DMN)->t_wakeup = NULL; \
495 thread_add_timer_msec(master, wakeup_unresponsive, (DMN), \
496 FUZZY(gs.period), &(DMN)->t_wakeup); \
499 #define SET_WAKEUP_ECHO(DMN) \
501 (DMN)->t_wakeup = NULL; \
502 thread_add_timer_msec(master, wakeup_send_echo, (DMN), \
503 FUZZY(gs.period), &(DMN)->t_wakeup); \
506 static int wakeup_down(struct thread
*t_wakeup
)
508 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
510 dmn
->t_wakeup
= NULL
;
511 if (try_connect(dmn
) < 0)
512 SET_WAKEUP_DOWN(dmn
);
513 if ((dmn
->connect_tries
> 1) && (dmn
->state
!= DAEMON_UP
))
518 static int wakeup_init(struct thread
*t_wakeup
)
520 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
522 dmn
->t_wakeup
= NULL
;
523 if (try_connect(dmn
) < 0) {
524 flog_err(EC_WATCHFRR_CONNECTION
,
525 "%s state -> down : initial connection attempt failed",
527 dmn
->state
= DAEMON_DOWN
;
533 static void restart_done(struct daemon
*dmn
)
535 if (dmn
->state
!= DAEMON_DOWN
) {
540 THREAD_OFF(dmn
->t_wakeup
);
541 if (try_connect(dmn
) < 0)
542 SET_WAKEUP_DOWN(dmn
);
545 static void daemon_down(struct daemon
*dmn
, const char *why
)
547 if (IS_UP(dmn
) || (dmn
->state
== DAEMON_INIT
))
548 flog_err(EC_WATCHFRR_CONNECTION
, "%s state -> down : %s",
550 else if (gs
.loglevel
> LOG_DEBUG
)
551 zlog_debug("%s still down : %s", dmn
->name
, why
);
554 dmn
->state
= DAEMON_DOWN
;
559 THREAD_OFF(dmn
->t_read
);
560 THREAD_OFF(dmn
->t_write
);
561 THREAD_OFF(dmn
->t_wakeup
);
562 if (try_connect(dmn
) < 0)
563 SET_WAKEUP_DOWN(dmn
);
567 static int handle_read(struct thread
*t_read
)
569 struct daemon
*dmn
= THREAD_ARG(t_read
);
570 static const char resp
[sizeof(PING_TOKEN
) + 4] = PING_TOKEN
"\n";
571 char buf
[sizeof(resp
) + 100];
573 struct timeval delay
;
576 if ((rc
= read(dmn
->fd
, buf
, sizeof(buf
))) < 0) {
579 if (ERRNO_IO_RETRY(errno
)) {
580 /* Pretend it never happened. */
581 SET_READ_HANDLER(dmn
);
584 snprintf(why
, sizeof(why
), "unexpected read error: %s",
585 safe_strerror(errno
));
586 daemon_down(dmn
, why
);
590 daemon_down(dmn
, "read returned EOF");
593 if (!dmn
->echo_sent
.tv_sec
) {
594 char why
[sizeof(buf
) + 100];
595 snprintf(why
, sizeof(why
),
596 "unexpected read returns %d bytes: %.*s", (int)rc
,
598 daemon_down(dmn
, why
);
602 /* We are expecting an echo response: is there any chance that the
603 response would not be returned entirely in the first read? That
604 seems inconceivable... */
605 if ((rc
!= sizeof(resp
)) || memcmp(buf
, resp
, sizeof(resp
))) {
606 char why
[100 + sizeof(buf
)];
607 snprintf(why
, sizeof(why
),
608 "read returned bad echo response of %d bytes "
609 "(expecting %u): %.*s",
610 (int)rc
, (unsigned int)sizeof(resp
), (int)rc
, buf
);
611 daemon_down(dmn
, why
);
615 time_elapsed(&delay
, &dmn
->echo_sent
);
616 dmn
->echo_sent
.tv_sec
= 0;
617 if (dmn
->state
== DAEMON_UNRESPONSIVE
) {
618 if (delay
.tv_sec
< gs
.timeout
) {
619 dmn
->state
= DAEMON_UP
;
621 "%s state -> up : echo response received after %ld.%06ld "
623 dmn
->name
, (long)delay
.tv_sec
,
624 (long)delay
.tv_usec
);
627 "%s: slow echo response finally received after %ld.%06ld "
629 dmn
->name
, (long)delay
.tv_sec
,
630 (long)delay
.tv_usec
);
631 } else if (gs
.loglevel
> LOG_DEBUG
+ 1)
632 zlog_debug("%s: echo response received after %ld.%06ld seconds",
633 dmn
->name
, (long)delay
.tv_sec
, (long)delay
.tv_usec
);
635 SET_READ_HANDLER(dmn
);
637 thread_cancel(dmn
->t_wakeup
);
638 SET_WAKEUP_ECHO(dmn
);
 * Wait until we have seen every daemon come up before telling
 * systemd that we are ready.
647 static void daemon_send_ready(int exitcode
)
656 zlog_notice("all daemons up, doing startup-complete notify");
657 else if (gs
.numdown
< gs
.numdaemons
)
658 flog_err(EC_WATCHFRR_CONNECTION
,
659 "startup did not complete within timeout"
660 " (%d/%d daemons running)",
661 gs
.numdaemons
- gs
.numdown
, gs
.numdaemons
);
663 flog_err(EC_WATCHFRR_CONNECTION
,
664 "all configured daemons failed to start"
665 " -- exiting watchfrr");
672 fp
= fopen(DAEMON_VTY_DIR
"/watchfrr.started", "w");
675 #if defined HAVE_SYSTEMD
676 systemd_send_started(master
, 0);
681 static void daemon_up(struct daemon
*dmn
, const char *why
)
683 dmn
->state
= DAEMON_UP
;
685 dmn
->connect_tries
= 0;
686 zlog_notice("%s state -> up : %s", dmn
->name
, why
);
688 daemon_send_ready(0);
689 SET_WAKEUP_ECHO(dmn
);
693 static int check_connect(struct thread
*t_write
)
695 struct daemon
*dmn
= THREAD_ARG(t_write
);
697 socklen_t reslen
= sizeof(sockerr
);
700 if (getsockopt(dmn
->fd
, SOL_SOCKET
, SO_ERROR
, (char *)&sockerr
, &reslen
)
702 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn
->name
,
703 safe_strerror(errno
));
705 "getsockopt failed checking connection success");
708 if ((reslen
== sizeof(sockerr
)) && sockerr
) {
712 "getsockopt reports that connection attempt failed: %s",
713 safe_strerror(sockerr
));
714 daemon_down(dmn
, why
);
718 daemon_up(dmn
, "delayed connect succeeded");
722 static int wakeup_connect_hanging(struct thread
*t_wakeup
)
724 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
727 dmn
->t_wakeup
= NULL
;
728 snprintf(why
, sizeof(why
),
729 "connection attempt timed out after %ld seconds", gs
.timeout
);
730 daemon_down(dmn
, why
);
734 /* Making connection to protocol daemon. */
735 static int try_connect(struct daemon
*dmn
)
738 struct sockaddr_un addr
;
741 if (gs
.loglevel
> LOG_DEBUG
+ 1)
742 zlog_debug("%s: attempting to connect", dmn
->name
);
743 dmn
->connect_tries
++;
745 memset(&addr
, 0, sizeof(struct sockaddr_un
));
746 addr
.sun_family
= AF_UNIX
;
747 snprintf(addr
.sun_path
, sizeof(addr
.sun_path
), "%s/%s.vty", gs
.vtydir
,
749 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
750 len
= addr
.sun_len
= SUN_LEN(&addr
);
752 len
= sizeof(addr
.sun_family
) + strlen(addr
.sun_path
);
753 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
755 /* Quick check to see if we might succeed before we go to the trouble
756 of creating a socket. */
757 if (access(addr
.sun_path
, W_OK
) < 0) {
759 flog_err_sys(EC_LIB_SYSTEM_CALL
,
760 "%s: access to socket %s denied: %s",
761 dmn
->name
, addr
.sun_path
,
762 safe_strerror(errno
));
766 if ((sock
= socket(AF_UNIX
, SOCK_STREAM
, 0)) < 0) {
767 flog_err_sys(EC_LIB_SOCKET
, "%s(%s): cannot make socket: %s",
768 __func__
, addr
.sun_path
, safe_strerror(errno
));
772 if (set_nonblocking(sock
) < 0 || set_cloexec(sock
) < 0) {
773 flog_err_sys(EC_LIB_SYSTEM_CALL
,
774 "%s(%s): set_nonblocking/cloexec(%d) failed",
775 __func__
, addr
.sun_path
, sock
);
780 if (connect(sock
, (struct sockaddr
*)&addr
, len
) < 0) {
781 if ((errno
!= EINPROGRESS
) && (errno
!= EWOULDBLOCK
)) {
782 if (gs
.loglevel
> LOG_DEBUG
)
783 zlog_debug("%s(%s): connect failed: %s",
784 __func__
, addr
.sun_path
,
785 safe_strerror(errno
));
789 if (gs
.loglevel
> LOG_DEBUG
)
790 zlog_debug("%s: connection in progress", dmn
->name
);
791 dmn
->state
= DAEMON_CONNECTING
;
794 thread_add_write(master
, check_connect
, dmn
, dmn
->fd
,
796 dmn
->t_wakeup
= NULL
;
797 thread_add_timer(master
, wakeup_connect_hanging
, dmn
,
798 gs
.timeout
, &dmn
->t_wakeup
);
799 SET_READ_HANDLER(dmn
);
804 SET_READ_HANDLER(dmn
);
805 daemon_up(dmn
, "connect succeeded");
809 static int phase_hanging(struct thread
*t_hanging
)
811 gs
.t_phase_hanging
= NULL
;
812 flog_err(EC_WATCHFRR_CONNECTION
,
813 "Phase [%s] hanging for %ld seconds, aborting phased restart",
814 phase_str
[gs
.phase
], PHASE_TIMEOUT
);
815 gs
.phase
= PHASE_NONE
;
819 static void set_phase(restart_phase_t new_phase
)
821 gs
.phase
= new_phase
;
822 if (gs
.t_phase_hanging
)
823 thread_cancel(gs
.t_phase_hanging
);
824 gs
.t_phase_hanging
= NULL
;
825 thread_add_timer(master
, phase_hanging
, NULL
, PHASE_TIMEOUT
,
826 &gs
.t_phase_hanging
);
829 static void phase_check(void)
838 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
839 if (dmn
->state
== DAEMON_INIT
)
842 /* startup complete, everything out of INIT */
843 gs
.phase
= PHASE_NONE
;
844 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
845 if (dmn
->state
== DAEMON_DOWN
) {
846 SET_WAKEUP_DOWN(dmn
);
850 case PHASE_STOPS_PENDING
:
854 "Phased restart: all routing daemon stop jobs have completed.");
855 set_phase(PHASE_WAITING_DOWN
);
858 case PHASE_WAITING_DOWN
:
859 if (gs
.numdown
+ IS_UP(gs
.special
) < gs
.numdaemons
)
861 zlog_info("Phased restart: all routing daemons now down.");
862 run_job(&gs
.special
->restart
, "restart", gs
.restart_command
, 1,
864 set_phase(PHASE_ZEBRA_RESTART_PENDING
);
867 case PHASE_ZEBRA_RESTART_PENDING
:
868 if (gs
.special
->restart
.pid
)
870 zlog_info("Phased restart: %s restart job completed.",
872 set_phase(PHASE_WAITING_ZEBRA_UP
);
875 case PHASE_WAITING_ZEBRA_UP
:
876 if (!IS_UP(gs
.special
))
878 zlog_info("Phased restart: %s is now up.", gs
.special
->name
);
881 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
882 if (dmn
!= gs
.special
)
883 run_job(&dmn
->restart
, "start",
884 gs
.start_command
, 1, 0);
887 gs
.phase
= PHASE_NONE
;
888 THREAD_OFF(gs
.t_phase_hanging
);
889 zlog_notice("Phased global restart has completed.");
894 static void try_restart(struct daemon
*dmn
)
899 if (dmn
!= gs
.special
) {
900 if ((gs
.special
->state
== DAEMON_UP
)
901 && (gs
.phase
== PHASE_NONE
))
902 run_job(&dmn
->restart
, "restart", gs
.restart_command
, 0,
906 "%s: postponing restart attempt because master %s daemon "
907 "not up [%s], or phased restart in progress",
908 dmn
->name
, gs
.special
->name
,
909 state_str
[gs
.special
->state
]);
913 if ((gs
.phase
!= PHASE_NONE
) || gs
.numpids
) {
914 if (gs
.loglevel
> LOG_DEBUG
+ 1)
916 "postponing phased global restart: restart already in "
917 "progress [%s], or outstanding child processes [%d]",
918 phase_str
[gs
.phase
], gs
.numpids
);
921 /* Is it too soon for a restart? */
923 struct timeval delay
;
924 if (time_elapsed(&delay
, &gs
.special
->restart
.time
)->tv_sec
925 < gs
.special
->restart
.interval
) {
926 if (gs
.loglevel
> LOG_DEBUG
+ 1)
928 "postponing phased global restart: "
929 "elapsed time %ld < retry interval %ld",
931 gs
.special
->restart
.interval
);
935 run_job(&gs
.restart
, "restart", gs
.restart_command
, 0, 1);
938 static int wakeup_unresponsive(struct thread
*t_wakeup
)
940 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
942 dmn
->t_wakeup
= NULL
;
943 if (dmn
->state
!= DAEMON_UNRESPONSIVE
)
944 flog_err(EC_WATCHFRR_CONNECTION
,
945 "%s: no longer unresponsive (now %s), "
946 "wakeup should have been cancelled!",
947 dmn
->name
, state_str
[dmn
->state
]);
949 SET_WAKEUP_UNRESPONSIVE(dmn
);
955 static int wakeup_no_answer(struct thread
*t_wakeup
)
957 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
959 dmn
->t_wakeup
= NULL
;
960 dmn
->state
= DAEMON_UNRESPONSIVE
;
961 flog_err(EC_WATCHFRR_CONNECTION
,
962 "%s state -> unresponsive : no response yet to ping "
963 "sent %ld seconds ago",
964 dmn
->name
, gs
.timeout
);
965 SET_WAKEUP_UNRESPONSIVE(dmn
);
970 static int wakeup_send_echo(struct thread
*t_wakeup
)
972 static const char echocmd
[] = "echo " PING_TOKEN
;
974 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
976 dmn
->t_wakeup
= NULL
;
977 if (((rc
= write(dmn
->fd
, echocmd
, sizeof(echocmd
))) < 0)
978 || ((size_t)rc
!= sizeof(echocmd
))) {
979 char why
[100 + sizeof(echocmd
)];
980 snprintf(why
, sizeof(why
),
981 "write '%s' returned %d instead of %u", echocmd
,
982 (int)rc
, (unsigned int)sizeof(echocmd
));
983 daemon_down(dmn
, why
);
985 gettimeofday(&dmn
->echo_sent
, NULL
);
986 dmn
->t_wakeup
= NULL
;
987 thread_add_timer(master
, wakeup_no_answer
, dmn
, gs
.timeout
,
993 bool check_all_up(void)
997 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
998 if (dmn
->state
!= DAEMON_UP
)
1003 void watchfrr_status(struct vty
*vty
)
1006 struct timeval delay
;
1008 vty_out(vty
, "watchfrr global phase: %s\n", phase_str
[gs
.phase
]);
1010 vty_out(vty
, " global restart running, pid %ld\n",
1011 (long)gs
.restart
.pid
);
1013 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
1014 vty_out(vty
, " %-20s %s\n", dmn
->name
, state_str
[dmn
->state
]);
1015 if (dmn
->restart
.pid
)
1016 vty_out(vty
, " restart running, pid %ld\n",
1017 (long)dmn
->restart
.pid
);
1018 else if (dmn
->state
== DAEMON_DOWN
&&
1019 time_elapsed(&delay
, &dmn
->restart
.time
)->tv_sec
1020 < dmn
->restart
.interval
)
1021 vty_out(vty
, " restarting in %ld seconds"
1022 " (%lds backoff interval)\n",
1023 dmn
->restart
.interval
- delay
.tv_sec
,
1024 dmn
->restart
.interval
);
/*
 * Termination handler: log the shutdown, tell systemd we are stopping, and
 * exit with status 0.
 * NOTE(review): presumably registered in watchfrr_signals for the
 * termination signals -- confirm against that table.
 */
static void sigint(void)
{
	zlog_notice("Terminating on signal");
	systemd_send_stopping();
	exit(0);
}
/*
 * Check that a shell command template is safe to use as a format string
 * taking the daemon name: it must contain exactly one '%' and that '%' must
 * be immediately followed by 's'.
 *
 * Returns 1 if the template is acceptable, 0 otherwise (including when cmd
 * is NULL).
 */
static int valid_command(const char *cmd)
{
	const char *p;

	if (cmd == NULL)
		return 0;

	p = strchr(cmd, '%');
	return p != NULL && p[1] == 's' && strchr(p + 1, '%') == NULL;
}
/*
 * This is an ugly hack to circumvent problems with passing command-line
 * arguments that contain spaces.  The fix is to use a configuration file.
 *
 * Returns a newly allocated copy of cmd in which every occurrence of
 * blankstr has been replaced by a single space.  The caller owns the result.
 * On allocation failure the process exits with status 1.
 *
 * The scan resumes just past each inserted space, so a blankstr that is (or
 * contains) a space cannot match its own replacement forever, and an empty
 * blankstr -- which would match at every position -- leaves cmd unchanged.
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	size_t bslen = strlen(blankstr);
	size_t cmdlen = strlen(cmd);
	char *res;
	char *p;

	res = malloc(cmdlen + 1);
	if (!res) {
		perror("malloc");
		exit(1);
	}
	memcpy(res, cmd, cmdlen + 1);

	if (bslen == 0)
		return res;

	p = res;
	while ((p = strstr(p, blankstr)) != NULL) {
		*p = ' ';
		/* Close the gap left by the rest of the marker. */
		if (bslen != 1)
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
		p++;
	}
	return res;
}
/*
 * Timer callback armed in watchfrr_init() with STARTUP_TIMEOUT: startup took
 * too long, so report readiness anyway, passing exit code 1.
 */
static int startup_timeout(struct thread *t_wakeup)
{
	daemon_send_ready(1);
	return 0;
}
1069 static void watchfrr_init(int argc
, char **argv
)
1071 const char *special
= "zebra";
1073 struct daemon
*dmn
, **add
= &gs
.daemons
;
1074 char alldaemons
[512] = "", *p
= alldaemons
;
1076 thread_add_timer_msec(master
, startup_timeout
, NULL
, STARTUP_TIMEOUT
,
1077 &gs
.t_startup_timeout
);
1079 for (i
= optind
; i
< argc
; i
++) {
1080 dmn
= XCALLOC(MTYPE_WATCHFRR_DAEMON
, sizeof(*dmn
));
1082 dmn
->name
= dmn
->restart
.name
= argv
[i
];
1083 dmn
->state
= DAEMON_INIT
;
1087 dmn
->t_wakeup
= NULL
;
1088 thread_add_timer_msec(master
, wakeup_init
, dmn
, 0,
1090 dmn
->restart
.interval
= gs
.min_restart_interval
;
1094 if (!strcmp(dmn
->name
, special
))
1100 "Must specify one or more daemons to monitor.\n\n");
1103 if (!watch_only
&& !gs
.special
) {
1104 fprintf(stderr
, "\"%s\" daemon must be in daemon lists\n\n",
1109 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
1110 snprintf(p
, alldaemons
+ sizeof(alldaemons
) - p
, "%s%s",
1111 (p
== alldaemons
) ? "" : " ", dmn
->name
);
1114 zlog_notice("%s %s watching [%s]%s", progname
, FRR_VERSION
, alldaemons
,
1115 watch_only
? ", monitor mode" : "");
1118 struct zebra_privs_t watchfrr_privs
= {
1120 .vty_group
= VTY_GROUP
,
1124 static struct quagga_signal_t watchfrr_signals
[] = {
1135 .handler
= sigchild
,
1139 FRR_DAEMON_INFO(watchfrr
, WATCHFRR
,
1140 .flags
= FRR_NO_PRIVSEP
| FRR_NO_TCPVTY
| FRR_LIMITED_CLI
1141 | FRR_NO_CFG_PID_DRY
| FRR_NO_ZCLIENT
1144 .printhelp
= printhelp
,
1145 .copyright
= "Copyright 2004 Andrew J. Schorr",
1147 .signals
= watchfrr_signals
,
1148 .n_signals
= array_size(watchfrr_signals
),
1150 .privs
= &watchfrr_privs
, )
/*
 * Short options that are still accepted by the parser only so that main()
 * can reject them with a pointer to the man page.
 */
#define DEPRECATED_OPTIONS "aAezR:"
1154 int main(int argc
, char **argv
)
1157 const char *blankstr
= NULL
;
1159 frr_preinit(&watchfrr_di
, argc
, argv
);
1160 progname
= watchfrr_di
.progname
;
1162 frr_opt_add("b:dk:l:i:p:r:S:s:t:T:" DEPRECATED_OPTIONS
, longopts
, "");
1164 gs
.restart
.name
= "all";
1165 while ((opt
= frr_getopt(argc
, argv
, NULL
)) != EOF
) {
1166 if (opt
&& opt
< 128 && strchr(DEPRECATED_OPTIONS
, opt
)) {
1168 "The -%c option no longer exists.\n"
1169 "Please refer to the watchfrr(8) man page.\n",
1184 if (!valid_command(optarg
)) {
1186 "Invalid kill command, must contain '%%s': %s\n",
1190 gs
.stop_command
= optarg
;
1194 if ((sscanf(optarg
, "%d%1s", &gs
.loglevel
, garbage
)
1196 || (gs
.loglevel
< LOG_EMERG
)) {
1198 "Invalid loglevel argument: %s\n",
1203 case OPTION_MINRESTART
: {
1205 if ((sscanf(optarg
, "%ld%1s", &gs
.min_restart_interval
,
1208 || (gs
.min_restart_interval
< 0)) {
1210 "Invalid min_restart_interval argument: %s\n",
1215 case OPTION_MAXRESTART
: {
1217 if ((sscanf(optarg
, "%ld%1s", &gs
.max_restart_interval
,
1220 || (gs
.max_restart_interval
< 0)) {
1222 "Invalid max_restart_interval argument: %s\n",
1230 if ((sscanf(optarg
, "%d%1s", &period
, garbage
) != 1)
1231 || (gs
.period
< 1)) {
1233 "Invalid interval argument: %s\n",
1237 gs
.period
= 1000 * period
;
1240 watchfrr_di
.pid_file
= optarg
;
1243 if (!valid_command(optarg
)) {
1245 "Invalid restart command, must contain '%%s': %s\n",
1249 gs
.restart_command
= optarg
;
1252 if (!valid_command(optarg
)) {
1254 "Invalid start command, must contain '%%s': %s\n",
1258 gs
.start_command
= optarg
;
1265 if ((sscanf(optarg
, "%ld%1s", &gs
.timeout
, garbage
)
1267 || (gs
.timeout
< 1)) {
1269 "Invalid timeout argument: %s\n",
1276 if ((sscanf(optarg
, "%ld%1s", &gs
.restart_timeout
,
1279 || (gs
.restart_timeout
< 1)) {
1281 "Invalid restart timeout argument: %s\n",
1287 fputs("Invalid option.\n", stderr
);
1293 && (gs
.start_command
|| gs
.stop_command
|| gs
.restart_command
)) {
1294 fputs("Options -r/-s/-k are not used when --dry is active.\n",
1298 && (!gs
.restart_command
|| !gs
.start_command
|| !gs
.stop_command
)) {
1300 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1305 if (gs
.restart_command
)
1306 gs
.restart_command
=
1307 translate_blanks(gs
.restart_command
, blankstr
);
1308 if (gs
.start_command
)
1310 translate_blanks(gs
.start_command
, blankstr
);
1311 if (gs
.stop_command
)
1313 translate_blanks(gs
.stop_command
, blankstr
);
1316 gs
.restart
.interval
= gs
.min_restart_interval
;
1318 master
= frr_init();
1319 watchfrr_error_init();
1320 watchfrr_init(argc
, argv
);
1321 watchfrr_vty_init();
1325 zlog_set_level(ZLOG_DEST_MONITOR
, ZLOG_DISABLED
);
1326 if (watchfrr_di
.daemon_mode
)
1327 zlog_set_level(ZLOG_DEST_SYSLOG
, MIN(gs
.loglevel
, LOG_DEBUG
));
1329 zlog_set_level(ZLOG_DEST_STDOUT
, MIN(gs
.loglevel
, LOG_DEBUG
));
1333 systemd_send_stopping();