2 * Monitor status of frr daemons and restart if necessary.
4 * Copyright (C) 2004 Andrew J. Schorr
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
26 #include <lib/version.h>
28 #include "memory_vty.h"
/* Yield the smaller of X and Y.  Each argument can be expanded twice, so do
   not pass expressions that have side effects. */
40 #define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
43 /* Macros to help randomize timers.
   JITTER(X) takes random() modulo X+1 and subtracts X/2, giving an offset
   spread around zero.  FUZZY(X) adds JITTER(X/20) to X, i.e. it perturbs X
   by an offset derived from one twentieth of its value.  X is expanded more
   than once in both macros, so it should be free of side effects. */
44 #define JITTER(X) ((random() % ((X)+1))-((X)/2))
45 #define FUZZY(X) ((X)+JITTER((X)/20))
/* Built-in defaults.  printhelp() prints each of them as the default of the
   matching command-line option. */
/* Status polling interval in seconds (--interval); the global state stores
   it multiplied by 1000. */
47 #define DEFAULT_PERIOD 5
/* Unresponsiveness timeout in seconds (--timeout). */
48 #define DEFAULT_TIMEOUT 90
/* Restart (kill) timeout in seconds (--restart-timeout). */
49 #define DEFAULT_RESTART_TIMEOUT 20
/* Default logging level (--loglevel). */
50 #define DEFAULT_LOGLEVEL LOG_INFO
/* Minimum and maximum seconds to wait between invocations of daemon restart
   commands (--min-restart-interval, --max-restart-interval). */
51 #define DEFAULT_MIN_RESTART 60
52 #define DEFAULT_MAX_RESTART 600
/* Token sent in the echo command and expected back, followed by a newline,
   in the response read from the daemon. */
54 #define PING_TOKEN "PING"
56 /* Needs to be global, referenced somewhere inside libfrr. */
57 struct thread_master
*master
;
58 static char pidfile_default
[256];
60 static bool watch_only
= false;
66 PHASE_ZEBRA_RESTART_PENDING
,
67 PHASE_WAITING_ZEBRA_UP
70 static const char *phase_str
[] = {
73 "Waiting for other daemons to come down",
74 "Zebra restart job running",
75 "Waiting for zebra to come up",
/* Timer value used for the phase-hanging watchdog armed by set_phase():
   three times the configured restart timeout.  It reads gs.restart_timeout
   every time it is expanded. */
79 #define PHASE_TIMEOUT (3*gs.restart_timeout)
87 struct thread
*t_kill
;
91 static struct global_state
{
92 restart_phase_t phase
;
93 struct thread
*t_phase_hanging
;
98 long min_restart_interval
;
99 long max_restart_interval
;
100 struct daemon
*daemons
;
101 const char *restart_command
;
102 const char *start_command
;
103 const char *stop_command
;
104 struct restart_info restart
;
106 struct daemon
*special
; /* points to zebra when doing phased restart */
109 int numdown
; /* # of daemons that are not UP or UNRESPONSIVE */
112 .vtydir
= frr_vtydir
,
113 .period
= 1000 * DEFAULT_PERIOD
,
114 .timeout
= DEFAULT_TIMEOUT
,
115 .restart_timeout
= DEFAULT_RESTART_TIMEOUT
,
116 .loglevel
= DEFAULT_LOGLEVEL
,
117 .min_restart_interval
= DEFAULT_MIN_RESTART
,
118 .max_restart_interval
= DEFAULT_MAX_RESTART
,
130 (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
132 static const char *state_str
[] = {
133 "Init", "Down", "Connecting", "Up", "Unresponsive",
138 daemon_state_t state
;
140 struct timeval echo_sent
;
141 unsigned int connect_tries
;
142 struct thread
*t_wakeup
;
143 struct thread
*t_read
;
144 struct thread
*t_write
;
146 struct restart_info restart
;
/* Option codes for the options that exist only in long form in longopts:
   --min-restart-interval, --max-restart-interval and --dry. */
149 #define OPTION_MINRESTART 2000
150 #define OPTION_MAXRESTART 2001
151 #define OPTION_DRY 2002
153 static const struct option longopts
[] = {
154 {"daemon", no_argument
, NULL
, 'd'},
155 {"statedir", required_argument
, NULL
, 'S'},
156 {"loglevel", required_argument
, NULL
, 'l'},
157 {"interval", required_argument
, NULL
, 'i'},
158 {"timeout", required_argument
, NULL
, 't'},
159 {"restart-timeout", required_argument
, NULL
, 'T'},
160 {"restart", required_argument
, NULL
, 'r'},
161 {"start-command", required_argument
, NULL
, 's'},
162 {"kill-command", required_argument
, NULL
, 'k'},
163 {"dry", no_argument
, NULL
, OPTION_DRY
},
164 {"min-restart-interval", required_argument
, NULL
, OPTION_MINRESTART
},
165 {"max-restart-interval", required_argument
, NULL
, OPTION_MAXRESTART
},
166 {"pid-file", required_argument
, NULL
, 'p'},
167 {"blank-string", required_argument
, NULL
, 'b'},
168 {"help", no_argument
, NULL
, 'h'},
169 {"version", no_argument
, NULL
, 'v'},
172 static int try_connect(struct daemon
*dmn
);
173 static int wakeup_send_echo(struct thread
*t_wakeup
);
174 static void try_restart(struct daemon
*dmn
);
175 static void phase_check(void);
177 static const char *progname
;
178 static void printhelp(FILE *target
)
181 "Usage : %s [OPTION...] <daemon name> ...\n\n\
182 Watchdog program to monitor status of frr daemons and try to restart\n\
183 them if they are down or unresponsive. It determines whether a daemon is\n\
184 up based on whether it can connect to the daemon's vty unix stream socket.\n\
185 It then repeatedly sends echo commands over that socket to determine whether\n\
186 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
187 on the socket connection and know immediately that the daemon is down.\n\n\
188 The daemons to be monitored should be listed on the command line.\n\n\
189 In order to avoid attempting to restart the daemons in a fast loop,\n\
190 the -m and -M options allow you to control the minimum delay between\n\
191 restart commands. The minimum restart delay is recalculated each time\n\
192 a restart is attempted: if the time since the last restart attempt exceeds\n\
193 twice the -M value, then the restart delay is set to the -m value.\n\
194 Otherwise, the interval is doubled (but capped at the -M value).\n\n",
199 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
200 to syslog instead of stdout.\n\
201 -S, --statedir Set the vty socket directory (default is %s)\n\
202 -l, --loglevel Set the logging level (default is %d).\n\
203 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
204 but it can be set higher than %d if extra-verbose debugging\n\
205 messages are desired.\n\
206 --min-restart-interval\n\
207 Set the minimum seconds to wait between invocations of daemon\n\
208 restart commands (default is %d).\n\
209 --max-restart-interval\n\
210 Set the maximum seconds to wait between invocations of daemon\n\
211 restart commands (default is %d).\n\
212 -i, --interval Set the status polling interval in seconds (default is %d)\n\
213 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
214 -T, --restart-timeout\n\
215 Set the restart (kill) timeout in seconds (default is %d).\n\
216 If any background jobs are still running after this much\n\
217 time has elapsed, they will be killed.\n\
218 -r, --restart Supply a Bourne shell command to use to restart a single\n\
219 daemon. The command string should include '%%s' where the\n\
220 name of the daemon should be substituted.\n\
221 -s, --start-command\n\
222 Supply a Bourne shell to command to use to start a single\n\
223 daemon. The command string should include '%%s' where the\n\
224 name of the daemon should be substituted.\n\
225 -k, --kill-command\n\
226 Supply a Bourne shell to command to use to stop a single\n\
227 daemon. The command string should include '%%s' where the\n\
228 name of the daemon should be substituted.\n\
229 --dry Do not start or restart anything, just log.\n\
230 -p, --pid-file Set process identifier file name\n\
232 -b, --blank-string\n\
233 When the supplied argument string is found in any of the\n\
234 various shell command arguments (-r, -s, or -k), replace\n\
235 it with a space. This is an ugly hack to circumvent problems\n\
236 passing command-line arguments with embedded spaces.\n\
237 -v, --version Print program version\n\
238 -h, --help Display this help and exit\n",
239 frr_vtydir
, DEFAULT_LOGLEVEL
, LOG_EMERG
, LOG_DEBUG
, LOG_DEBUG
,
240 DEFAULT_MIN_RESTART
, DEFAULT_MAX_RESTART
, DEFAULT_PERIOD
,
241 DEFAULT_TIMEOUT
, DEFAULT_RESTART_TIMEOUT
, pidfile_default
);
244 static pid_t
run_background(char *shell_cmd
)
248 switch (child
= fork()) {
250 zlog_err("fork failed, cannot run command [%s]: %s", shell_cmd
,
251 safe_strerror(errno
));
255 /* Use separate process group so child processes can be killed
257 if (setpgid(0, 0) < 0)
258 zlog_warn("warning: setpgid(0,0) failed: %s",
259 safe_strerror(errno
));
263 char *const argv
[4] = {shell
, dashc
, shell_cmd
, NULL
};
264 execv("/bin/sh", argv
);
265 zlog_err("execv(/bin/sh -c '%s') failed: %s", shell_cmd
,
266 safe_strerror(errno
));
270 /* Parent process: we will reap the child later. */
271 zlog_err("Forked background command [pid %d]: %s", (int)child
,
/* Compute the time elapsed since *start_time and store it in *result.
   The current time is read with gettimeofday() into *result, then the
   tv_sec and tv_usec fields of *start_time are subtracted field by field.
   While tv_usec is negative, 1000000 is added to it.
   NOTE(review): the remainder of the loop body and the return statement are
   not visible in this extract.  Callers dereference the return value
   directly, so the function presumably returns result - confirm. */
277 static struct timeval
*time_elapsed(struct timeval
*result
,
278 const struct timeval
*start_time
)
280 gettimeofday(result
, NULL
);
281 result
->tv_sec
-= start_time
->tv_sec
;
282 result
->tv_usec
-= start_time
->tv_usec
;
283 while (result
->tv_usec
< 0) {
284 result
->tv_usec
+= 1000000L;
290 static int restart_kill(struct thread
*t_kill
)
292 struct restart_info
*restart
= THREAD_ARG(t_kill
);
293 struct timeval delay
;
295 time_elapsed(&delay
, &restart
->time
);
297 "Warning: %s %s child process %d still running after "
298 "%ld seconds, sending signal %d",
299 restart
->what
, restart
->name
, (int)restart
->pid
,
300 (long)delay
.tv_sec
, (restart
->kills
? SIGKILL
: SIGTERM
));
301 kill(-restart
->pid
, (restart
->kills
? SIGKILL
: SIGTERM
));
303 restart
->t_kill
= NULL
;
304 thread_add_timer(master
, restart_kill
, restart
, gs
.restart_timeout
,
/* Look up the restart record that belongs to child process id child.
   Walks the gs.daemons list and returns the address of the restart member
   of the first daemon whose restart.pid equals child.
   NOTE(review): the statements after the loop are missing from this extract.
   sigchild() compares the result against NULL, so a NULL return on no match
   is presumed - confirm. */
309 static struct restart_info
*find_child(pid_t child
)
312 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
313 if (dmn
->restart
.pid
== child
)
314 return &dmn
->restart
;
319 static void sigchild(void)
325 struct restart_info
*restart
;
327 switch (child
= waitpid(-1, &status
, WNOHANG
)) {
329 zlog_err("waitpid failed: %s", safe_strerror(errno
));
332 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
336 if (child
== integrated_write_pid
) {
337 integrated_write_sigchld(status
);
341 if ((restart
= find_child(child
)) != NULL
) {
342 name
= restart
->name
;
343 what
= restart
->what
;
346 thread_cancel(restart
->t_kill
);
347 restart
->t_kill
= NULL
;
348 /* Update restart time to reflect the time the command
350 gettimeofday(&restart
->time
, NULL
);
353 "waitpid returned status for an unknown child process %d",
358 if (WIFSTOPPED(status
))
359 zlog_warn("warning: %s %s process %d is stopped", what
, name
,
361 else if (WIFSIGNALED(status
))
362 zlog_warn("%s %s process %d terminated due to signal %d", what
,
363 name
, (int)child
, WTERMSIG(status
));
364 else if (WIFEXITED(status
)) {
365 if (WEXITSTATUS(status
) != 0)
367 "%s %s process %d exited with non-zero status %d",
368 what
, name
, (int)child
, WEXITSTATUS(status
));
370 zlog_debug("%s %s process %d exited normally", what
,
373 zlog_err("cannot interpret %s %s process %d wait status 0x%x",
374 what
, name
, (int)child
, status
);
378 static int run_job(struct restart_info
*restart
, const char *cmdtype
,
379 const char *command
, int force
, int update_interval
)
381 struct timeval delay
;
383 if (gs
.loglevel
> LOG_DEBUG
+ 1)
384 zlog_debug("attempting to %s %s", cmdtype
, restart
->name
);
387 if (gs
.loglevel
> LOG_DEBUG
+ 1)
389 "cannot %s %s, previous pid %d still running",
390 cmdtype
, restart
->name
, (int)restart
->pid
);
394 /* Note: time_elapsed test must come before the force test, since we
396 to make sure that delay is initialized for use below in updating the
398 if ((time_elapsed(&delay
, &restart
->time
)->tv_sec
< restart
->interval
)
400 if (gs
.loglevel
> LOG_DEBUG
+ 1)
403 "elapsed time %ld < retry interval %ld",
404 cmdtype
, restart
->name
, (long)delay
.tv_sec
,
409 gettimeofday(&restart
->time
, NULL
);
412 char cmd
[strlen(command
) + strlen(restart
->name
) + 1];
413 snprintf(cmd
, sizeof(cmd
), command
, restart
->name
);
414 if ((restart
->pid
= run_background(cmd
)) > 0) {
415 restart
->t_kill
= NULL
;
416 thread_add_timer(master
, restart_kill
, restart
,
417 gs
.restart_timeout
, &restart
->t_kill
);
418 restart
->what
= cmdtype
;
424 /* Calculate the new restart interval. */
425 if (update_interval
) {
426 if (delay
.tv_sec
> 2 * gs
.max_restart_interval
)
427 restart
->interval
= gs
.min_restart_interval
;
428 else if ((restart
->interval
*= 2) > gs
.max_restart_interval
)
429 restart
->interval
= gs
.max_restart_interval
;
430 if (gs
.loglevel
> LOG_DEBUG
+ 1)
431 zlog_debug("restart %s interval is now %ld",
432 restart
->name
, restart
->interval
);
437 #define SET_READ_HANDLER(DMN) \
439 (DMN)->t_read = NULL; \
440 thread_add_read(master, handle_read, (DMN), (DMN)->fd, \
444 #define SET_WAKEUP_DOWN(DMN) \
446 (DMN)->t_wakeup = NULL; \
447 thread_add_timer_msec(master, wakeup_down, (DMN), \
448 FUZZY(gs.period), &(DMN)->t_wakeup); \
451 #define SET_WAKEUP_UNRESPONSIVE(DMN) \
453 (DMN)->t_wakeup = NULL; \
454 thread_add_timer_msec(master, wakeup_unresponsive, (DMN), \
455 FUZZY(gs.period), &(DMN)->t_wakeup); \
458 #define SET_WAKEUP_ECHO(DMN) \
460 (DMN)->t_wakeup = NULL; \
461 thread_add_timer_msec(master, wakeup_send_echo, (DMN), \
462 FUZZY(gs.period), &(DMN)->t_wakeup); \
465 static int wakeup_down(struct thread
*t_wakeup
)
467 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
469 dmn
->t_wakeup
= NULL
;
470 if (try_connect(dmn
) < 0)
471 SET_WAKEUP_DOWN(dmn
);
472 if ((dmn
->connect_tries
> 1) && (dmn
->state
!= DAEMON_UP
))
477 static int wakeup_init(struct thread
*t_wakeup
)
479 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
481 dmn
->t_wakeup
= NULL
;
482 if (try_connect(dmn
) < 0) {
483 SET_WAKEUP_DOWN(dmn
);
484 zlog_err("%s state -> down : initial connection attempt failed",
486 dmn
->state
= DAEMON_DOWN
;
491 static void daemon_down(struct daemon
*dmn
, const char *why
)
493 if (IS_UP(dmn
) || (dmn
->state
== DAEMON_INIT
))
494 zlog_err("%s state -> down : %s", dmn
->name
, why
);
495 else if (gs
.loglevel
> LOG_DEBUG
)
496 zlog_debug("%s still down : %s", dmn
->name
, why
);
499 dmn
->state
= DAEMON_DOWN
;
504 THREAD_OFF(dmn
->t_read
);
505 THREAD_OFF(dmn
->t_write
);
506 THREAD_OFF(dmn
->t_wakeup
);
507 if (try_connect(dmn
) < 0)
508 SET_WAKEUP_DOWN(dmn
);
512 static int handle_read(struct thread
*t_read
)
514 struct daemon
*dmn
= THREAD_ARG(t_read
);
515 static const char resp
[sizeof(PING_TOKEN
) + 4] = PING_TOKEN
"\n";
516 char buf
[sizeof(resp
) + 100];
518 struct timeval delay
;
521 if ((rc
= read(dmn
->fd
, buf
, sizeof(buf
))) < 0) {
524 if (ERRNO_IO_RETRY(errno
)) {
525 /* Pretend it never happened. */
526 SET_READ_HANDLER(dmn
);
529 snprintf(why
, sizeof(why
), "unexpected read error: %s",
530 safe_strerror(errno
));
531 daemon_down(dmn
, why
);
535 daemon_down(dmn
, "read returned EOF");
538 if (!dmn
->echo_sent
.tv_sec
) {
539 char why
[sizeof(buf
) + 100];
540 snprintf(why
, sizeof(why
),
541 "unexpected read returns %d bytes: %.*s", (int)rc
,
543 daemon_down(dmn
, why
);
547 /* We are expecting an echo response: is there any chance that the
548 response would not be returned entirely in the first read? That
549 seems inconceivable... */
550 if ((rc
!= sizeof(resp
)) || memcmp(buf
, resp
, sizeof(resp
))) {
551 char why
[100 + sizeof(buf
)];
552 snprintf(why
, sizeof(why
),
553 "read returned bad echo response of %d bytes "
554 "(expecting %u): %.*s",
555 (int)rc
, (unsigned int)sizeof(resp
), (int)rc
, buf
);
556 daemon_down(dmn
, why
);
560 time_elapsed(&delay
, &dmn
->echo_sent
);
561 dmn
->echo_sent
.tv_sec
= 0;
562 if (dmn
->state
== DAEMON_UNRESPONSIVE
) {
563 if (delay
.tv_sec
< gs
.timeout
) {
564 dmn
->state
= DAEMON_UP
;
566 "%s state -> up : echo response received after %ld.%06ld "
568 dmn
->name
, (long)delay
.tv_sec
,
569 (long)delay
.tv_usec
);
572 "%s: slow echo response finally received after %ld.%06ld "
574 dmn
->name
, (long)delay
.tv_sec
,
575 (long)delay
.tv_usec
);
576 } else if (gs
.loglevel
> LOG_DEBUG
+ 1)
577 zlog_debug("%s: echo response received after %ld.%06ld seconds",
578 dmn
->name
, (long)delay
.tv_sec
, (long)delay
.tv_usec
);
580 SET_READ_HANDLER(dmn
);
582 thread_cancel(dmn
->t_wakeup
);
583 SET_WAKEUP_ECHO(dmn
);
589 * Wait until every monitored daemon is up before
590 * telling systemd that we are ready.
592 static void daemon_send_ready(void)
595 if (!sent
&& gs
.numdown
== 0) {
598 fp
= fopen(DAEMON_VTY_DIR
"/watchfrr.started", "w");
601 #if defined HAVE_SYSTEMD
603 "Watchfrr: Notifying Systemd we are up and running");
604 systemd_send_started(master
, 0);
610 static void daemon_up(struct daemon
*dmn
, const char *why
)
612 dmn
->state
= DAEMON_UP
;
614 dmn
->connect_tries
= 0;
615 zlog_notice("%s state -> up : %s", dmn
->name
, why
);
617 SET_WAKEUP_ECHO(dmn
);
621 static int check_connect(struct thread
*t_write
)
623 struct daemon
*dmn
= THREAD_ARG(t_write
);
625 socklen_t reslen
= sizeof(sockerr
);
628 if (getsockopt(dmn
->fd
, SOL_SOCKET
, SO_ERROR
, (char *)&sockerr
, &reslen
)
630 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn
->name
,
631 safe_strerror(errno
));
633 "getsockopt failed checking connection success");
636 if ((reslen
== sizeof(sockerr
)) && sockerr
) {
640 "getsockopt reports that connection attempt failed: %s",
641 safe_strerror(sockerr
));
642 daemon_down(dmn
, why
);
646 daemon_up(dmn
, "delayed connect succeeded");
650 static int wakeup_connect_hanging(struct thread
*t_wakeup
)
652 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
655 dmn
->t_wakeup
= NULL
;
656 snprintf(why
, sizeof(why
),
657 "connection attempt timed out after %ld seconds", gs
.timeout
);
658 daemon_down(dmn
, why
);
662 /* Making connection to protocol daemon. */
663 static int try_connect(struct daemon
*dmn
)
666 struct sockaddr_un addr
;
669 if (gs
.loglevel
> LOG_DEBUG
+ 1)
670 zlog_debug("%s: attempting to connect", dmn
->name
);
671 dmn
->connect_tries
++;
673 memset(&addr
, 0, sizeof(struct sockaddr_un
));
674 addr
.sun_family
= AF_UNIX
;
675 snprintf(addr
.sun_path
, sizeof(addr
.sun_path
), "%s/%s.vty", gs
.vtydir
,
677 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
678 len
= addr
.sun_len
= SUN_LEN(&addr
);
680 len
= sizeof(addr
.sun_family
) + strlen(addr
.sun_path
);
681 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
683 /* Quick check to see if we might succeed before we go to the trouble
684 of creating a socket. */
685 if (access(addr
.sun_path
, W_OK
) < 0) {
687 zlog_err("%s: access to socket %s denied: %s",
688 dmn
->name
, addr
.sun_path
,
689 safe_strerror(errno
));
693 if ((sock
= socket(AF_UNIX
, SOCK_STREAM
, 0)) < 0) {
694 zlog_err("%s(%s): cannot make socket: %s", __func__
,
695 addr
.sun_path
, safe_strerror(errno
));
699 if (set_nonblocking(sock
) < 0 || set_cloexec(sock
) < 0) {
700 zlog_err("%s(%s): set_nonblocking/cloexec(%d) failed", __func__
,
701 addr
.sun_path
, sock
);
706 if (connect(sock
, (struct sockaddr
*)&addr
, len
) < 0) {
707 if ((errno
!= EINPROGRESS
) && (errno
!= EWOULDBLOCK
)) {
708 if (gs
.loglevel
> LOG_DEBUG
)
709 zlog_debug("%s(%s): connect failed: %s",
710 __func__
, addr
.sun_path
,
711 safe_strerror(errno
));
715 if (gs
.loglevel
> LOG_DEBUG
)
716 zlog_debug("%s: connection in progress", dmn
->name
);
717 dmn
->state
= DAEMON_CONNECTING
;
720 thread_add_write(master
, check_connect
, dmn
, dmn
->fd
,
722 dmn
->t_wakeup
= NULL
;
723 thread_add_timer(master
, wakeup_connect_hanging
, dmn
,
724 gs
.timeout
, &dmn
->t_wakeup
);
725 SET_READ_HANDLER(dmn
);
730 SET_READ_HANDLER(dmn
);
731 daemon_up(dmn
, "connect succeeded");
735 static int phase_hanging(struct thread
*t_hanging
)
737 gs
.t_phase_hanging
= NULL
;
738 zlog_err("Phase [%s] hanging for %ld seconds, aborting phased restart",
739 phase_str
[gs
.phase
], PHASE_TIMEOUT
);
740 gs
.phase
= PHASE_NONE
;
744 static void set_phase(restart_phase_t new_phase
)
746 gs
.phase
= new_phase
;
747 if (gs
.t_phase_hanging
)
748 thread_cancel(gs
.t_phase_hanging
);
749 gs
.t_phase_hanging
= NULL
;
750 thread_add_timer(master
, phase_hanging
, NULL
, PHASE_TIMEOUT
,
751 &gs
.t_phase_hanging
);
754 static void phase_check(void)
759 case PHASE_STOPS_PENDING
:
763 "Phased restart: all routing daemon stop jobs have completed.");
764 set_phase(PHASE_WAITING_DOWN
);
767 case PHASE_WAITING_DOWN
:
768 if (gs
.numdown
+ IS_UP(gs
.special
) < gs
.numdaemons
)
770 zlog_info("Phased restart: all routing daemons now down.");
771 run_job(&gs
.special
->restart
, "restart", gs
.restart_command
, 1,
773 set_phase(PHASE_ZEBRA_RESTART_PENDING
);
776 case PHASE_ZEBRA_RESTART_PENDING
:
777 if (gs
.special
->restart
.pid
)
779 zlog_info("Phased restart: %s restart job completed.",
781 set_phase(PHASE_WAITING_ZEBRA_UP
);
784 case PHASE_WAITING_ZEBRA_UP
:
785 if (!IS_UP(gs
.special
))
787 zlog_info("Phased restart: %s is now up.", gs
.special
->name
);
790 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
791 if (dmn
!= gs
.special
)
792 run_job(&dmn
->restart
, "start",
793 gs
.start_command
, 1, 0);
796 gs
.phase
= PHASE_NONE
;
797 THREAD_OFF(gs
.t_phase_hanging
);
798 zlog_notice("Phased global restart has completed.");
803 static void try_restart(struct daemon
*dmn
)
808 if (dmn
!= gs
.special
) {
809 if ((gs
.special
->state
== DAEMON_UP
)
810 && (gs
.phase
== PHASE_NONE
))
811 run_job(&dmn
->restart
, "restart", gs
.restart_command
, 0,
815 "%s: postponing restart attempt because master %s daemon "
816 "not up [%s], or phased restart in progress",
817 dmn
->name
, gs
.special
->name
,
818 state_str
[gs
.special
->state
]);
822 if ((gs
.phase
!= PHASE_NONE
) || gs
.numpids
) {
823 if (gs
.loglevel
> LOG_DEBUG
+ 1)
825 "postponing phased global restart: restart already in "
826 "progress [%s], or outstanding child processes [%d]",
827 phase_str
[gs
.phase
], gs
.numpids
);
830 /* Is it too soon for a restart? */
832 struct timeval delay
;
833 if (time_elapsed(&delay
, &gs
.special
->restart
.time
)->tv_sec
834 < gs
.special
->restart
.interval
) {
835 if (gs
.loglevel
> LOG_DEBUG
+ 1)
837 "postponing phased global restart: "
838 "elapsed time %ld < retry interval %ld",
840 gs
.special
->restart
.interval
);
844 run_job(&gs
.restart
, "restart", gs
.restart_command
, 0, 1);
847 static int wakeup_unresponsive(struct thread
*t_wakeup
)
849 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
851 dmn
->t_wakeup
= NULL
;
852 if (dmn
->state
!= DAEMON_UNRESPONSIVE
)
854 "%s: no longer unresponsive (now %s), "
855 "wakeup should have been cancelled!",
856 dmn
->name
, state_str
[dmn
->state
]);
858 SET_WAKEUP_UNRESPONSIVE(dmn
);
864 static int wakeup_no_answer(struct thread
*t_wakeup
)
866 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
868 dmn
->t_wakeup
= NULL
;
869 dmn
->state
= DAEMON_UNRESPONSIVE
;
871 "%s state -> unresponsive : no response yet to ping "
872 "sent %ld seconds ago",
873 dmn
->name
, gs
.timeout
);
874 SET_WAKEUP_UNRESPONSIVE(dmn
);
879 static int wakeup_send_echo(struct thread
*t_wakeup
)
881 static const char echocmd
[] = "echo " PING_TOKEN
;
883 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
885 dmn
->t_wakeup
= NULL
;
886 if (((rc
= write(dmn
->fd
, echocmd
, sizeof(echocmd
))) < 0)
887 || ((size_t)rc
!= sizeof(echocmd
))) {
888 char why
[100 + sizeof(echocmd
)];
889 snprintf(why
, sizeof(why
),
890 "write '%s' returned %d instead of %u", echocmd
,
891 (int)rc
, (unsigned int)sizeof(echocmd
));
892 daemon_down(dmn
, why
);
894 gettimeofday(&dmn
->echo_sent
, NULL
);
895 dmn
->t_wakeup
= NULL
;
896 thread_add_timer(master
, wakeup_no_answer
, dmn
, gs
.timeout
,
902 bool check_all_up(void)
906 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
907 if (dmn
->state
!= DAEMON_UP
)
912 static void sigint(void)
914 zlog_notice("Terminating on signal");
915 systemd_send_stopping();
/* Check that cmd is usable as a command template.
   Returns non-zero when cmd contains exactly one percent sign and that
   percent sign is immediately followed by the letter s; returns zero
   otherwise.  run_job() later passes the command to snprintf() as the format
   string with the daemon name as its only argument. */
919 static int valid_command(const char *cmd
)
923 return ((p
= strchr(cmd
, '%')) != NULL
) && (*(p
+ 1) == 's')
924 && !strchr(p
+ 1, '%');
/*
 * This is an ugly hack to circumvent problems with passing command-line
 * arguments that contain spaces.  The fix is to use a configuration file.
 *
 * Return a freshly allocated copy of cmd in which every occurrence of
 * blankstr has been replaced by a single space.  The search restarts from
 * the beginning of the copy after each replacement.
 *
 * cmd      - command string to translate; it is not modified.
 * blankstr - placeholder that stands in for a space; must not be NULL.
 *
 * Returns the translated copy, which the caller owns.  If the copy cannot
 * be allocated the error is reported with perror() and the process exits
 * with status 1.
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	const size_t bslen = strlen(blankstr);
	const size_t cmdsize = strlen(cmd) + 1;
	char *res;
	char *p;

	res = malloc(cmdsize);
	if (!res) {
		perror("malloc");
		exit(1);
	}
	memcpy(res, cmd, cmdsize);

	/*
	 * An empty placeholder matches at every position, and a placeholder
	 * that is itself one space is replaced by an identical space.  In both
	 * cases the loop below would never make progress (and with an empty
	 * placeholder it would also grow the string past its allocation), so
	 * there is nothing to translate.
	 */
	if (bslen == 0 || (bslen == 1 && blankstr[0] == ' '))
		return res;

	while ((p = strstr(res, blankstr)) != NULL) {
		*p = ' ';
		/* Close the gap left by the rest of a multi-byte placeholder,
		   moving the terminating NUL along with the tail. */
		if (bslen != 1)
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
	}
	return res;
}
947 struct zebra_privs_t watchfrr_privs
= {
949 .vty_group
= VTY_GROUP
,
953 static struct quagga_signal_t watchfrr_signals
[] = {
968 FRR_DAEMON_INFO(watchfrr
, WATCHFRR
,
969 .flags
= FRR_NO_PRIVSEP
| FRR_NO_TCPVTY
| FRR_LIMITED_CLI
970 | FRR_NO_CFG_PID_DRY
| FRR_NO_ZCLIENT
,
972 .printhelp
= printhelp
,
973 .copyright
= "Copyright 2004 Andrew J. Schorr",
975 .signals
= watchfrr_signals
,
976 .n_signals
= array_size(watchfrr_signals
),
978 .privs
= &watchfrr_privs
, )
/* Option letters that are appended to the option string passed to
   frr_opt_add() so that main() can recognise them and report that the
   option no longer exists. */
980 #define DEPRECATED_OPTIONS "aAezR:"
982 int main(int argc
, char **argv
)
985 const char *pidfile
= pidfile_default
;
986 const char *special
= "zebra";
987 const char *blankstr
= NULL
;
989 snprintf(pidfile_default
, sizeof(pidfile_default
), "%s/watchfrr.pid",
992 frr_preinit(&watchfrr_di
, argc
, argv
);
993 progname
= watchfrr_di
.progname
;
995 frr_opt_add("b:dk:l:i:p:r:S:s:t:T:" DEPRECATED_OPTIONS
, longopts
, "");
997 gs
.restart
.name
= "all";
998 while ((opt
= frr_getopt(argc
, argv
, NULL
)) != EOF
) {
999 if (opt
&& opt
< 128 && strchr(DEPRECATED_OPTIONS
, opt
)) {
1001 "The -%c option no longer exists.\n"
1002 "Please refer to the watchfrr(8) man page.\n",
1017 if (!valid_command(optarg
)) {
1019 "Invalid kill command, must contain '%%s': %s\n",
1023 gs
.stop_command
= optarg
;
1027 if ((sscanf(optarg
, "%d%1s", &gs
.loglevel
, garbage
)
1029 || (gs
.loglevel
< LOG_EMERG
)) {
1031 "Invalid loglevel argument: %s\n",
1036 case OPTION_MINRESTART
: {
1038 if ((sscanf(optarg
, "%ld%1s", &gs
.min_restart_interval
,
1041 || (gs
.min_restart_interval
< 0)) {
1043 "Invalid min_restart_interval argument: %s\n",
1048 case OPTION_MAXRESTART
: {
1050 if ((sscanf(optarg
, "%ld%1s", &gs
.max_restart_interval
,
1053 || (gs
.max_restart_interval
< 0)) {
1055 "Invalid max_restart_interval argument: %s\n",
1063 if ((sscanf(optarg
, "%d%1s", &period
, garbage
) != 1)
1064 || (gs
.period
< 1)) {
1066 "Invalid interval argument: %s\n",
1070 gs
.period
= 1000 * period
;
1076 if (!valid_command(optarg
)) {
1078 "Invalid restart command, must contain '%%s': %s\n",
1082 gs
.restart_command
= optarg
;
1085 if (!valid_command(optarg
)) {
1087 "Invalid start command, must contain '%%s': %s\n",
1091 gs
.start_command
= optarg
;
1098 if ((sscanf(optarg
, "%ld%1s", &gs
.timeout
, garbage
)
1100 || (gs
.timeout
< 1)) {
1102 "Invalid timeout argument: %s\n",
1109 if ((sscanf(optarg
, "%ld%1s", &gs
.restart_timeout
,
1112 || (gs
.restart_timeout
< 1)) {
1114 "Invalid restart timeout argument: %s\n",
1120 fputs("Invalid option.\n", stderr
);
1126 && (gs
.start_command
|| gs
.stop_command
|| gs
.restart_command
)) {
1127 fputs("Options -r/-s/-k are not used when --dry is active.\n",
1131 && (!gs
.restart_command
|| !gs
.start_command
|| !gs
.stop_command
)) {
1133 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1138 if (gs
.restart_command
)
1139 gs
.restart_command
=
1140 translate_blanks(gs
.restart_command
, blankstr
);
1141 if (gs
.start_command
)
1143 translate_blanks(gs
.start_command
, blankstr
);
1144 if (gs
.stop_command
)
1146 translate_blanks(gs
.stop_command
, blankstr
);
1149 gs
.restart
.interval
= gs
.min_restart_interval
;
1151 master
= frr_init();
1153 zlog_set_level(ZLOG_DEST_MONITOR
, ZLOG_DISABLED
);
1154 if (watchfrr_di
.daemon_mode
) {
1155 zlog_set_level(ZLOG_DEST_SYSLOG
, MIN(gs
.loglevel
, LOG_DEBUG
));
1156 if (daemon(0, 0) < 0) {
1157 fprintf(stderr
, "Watchfrr daemon failed: %s",
1162 zlog_set_level(ZLOG_DEST_STDOUT
, MIN(gs
.loglevel
, LOG_DEBUG
));
1164 watchfrr_vty_init();
1170 struct daemon
*tail
= NULL
;
1172 for (i
= optind
; i
< argc
; i
++) {
1175 if (!(dmn
= (struct daemon
*)calloc(1, sizeof(*dmn
)))) {
1176 fprintf(stderr
, "calloc(1,%u) failed: %s\n",
1177 (unsigned int)sizeof(*dmn
),
1178 safe_strerror(errno
));
1181 dmn
->name
= dmn
->restart
.name
= argv
[i
];
1182 dmn
->state
= DAEMON_INIT
;
1186 dmn
->t_wakeup
= NULL
;
1187 thread_add_timer_msec(master
, wakeup_init
, dmn
,
1188 100 + (random() % 900),
1190 dmn
->restart
.interval
= gs
.min_restart_interval
;
1197 if (!strcmp(dmn
->name
, special
))
1202 fputs("Must specify one or more daemons to monitor.\n", stderr
);
1205 if (!watch_only
&& !gs
.special
) {
1206 fprintf(stderr
, "\"%s\" daemon must be in daemon list\n",
1211 /* Make sure we're not already running. */
1212 pid_output(pidfile
);
1214 /* Announce which daemons are being monitored. */
1219 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
1220 len
+= strlen(dmn
->name
) + 1;
1226 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
1229 strcpy(p
, dmn
->name
);
1232 zlog_notice("%s %s watching [%s]%s", progname
,
1234 watch_only
? ", monitor mode" : "");
1239 struct thread thread
;
1241 while (thread_fetch(master
, &thread
))
1242 thread_call(&thread
);
1245 systemd_send_stopping();