2 * Monitor status of frr daemons and restart if necessary.
4 * Copyright (C) 2004 Andrew J. Schorr
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
26 #include <lib/version.h>
28 #include "memory_vty.h"
30 #include "lib_errors.h"
39 #include "watchfrr_errors.h"
42 #define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
45 /* Macros to help randomize timers. */
46 #define JITTER(X) ((random() % ((X)+1))-((X)/2))
47 #define FUZZY(X) ((X)+JITTER((X)/20))
49 #define DEFAULT_PERIOD 5
50 #define DEFAULT_TIMEOUT 90
51 #define DEFAULT_RESTART_TIMEOUT 20
52 #define DEFAULT_LOGLEVEL LOG_INFO
53 #define DEFAULT_MIN_RESTART 60
54 #define DEFAULT_MAX_RESTART 600
56 #define PING_TOKEN "PING"
58 /* Needs to be global, referenced somewhere inside libfrr. */
59 struct thread_master
*master
;
60 static char pidfile_default
[256];
62 static bool watch_only
= false;
68 PHASE_ZEBRA_RESTART_PENDING
,
69 PHASE_WAITING_ZEBRA_UP
72 static const char *phase_str
[] = {
75 "Waiting for other daemons to come down",
76 "Zebra restart job running",
77 "Waiting for zebra to come up",
81 #define PHASE_TIMEOUT (3*gs.restart_timeout)
89 struct thread
*t_kill
;
93 static struct global_state
{
94 restart_phase_t phase
;
95 struct thread
*t_phase_hanging
;
100 long min_restart_interval
;
101 long max_restart_interval
;
102 struct daemon
*daemons
;
103 const char *restart_command
;
104 const char *start_command
;
105 const char *stop_command
;
106 struct restart_info restart
;
108 struct daemon
*special
; /* points to zebra when doing phased restart */
111 int numdown
; /* # of daemons that are not UP or UNRESPONSIVE */
114 .vtydir
= frr_vtydir
,
115 .period
= 1000 * DEFAULT_PERIOD
,
116 .timeout
= DEFAULT_TIMEOUT
,
117 .restart_timeout
= DEFAULT_RESTART_TIMEOUT
,
118 .loglevel
= DEFAULT_LOGLEVEL
,
119 .min_restart_interval
= DEFAULT_MIN_RESTART
,
120 .max_restart_interval
= DEFAULT_MAX_RESTART
,
132 (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
134 static const char *state_str
[] = {
135 "Init", "Down", "Connecting", "Up", "Unresponsive",
140 daemon_state_t state
;
142 struct timeval echo_sent
;
143 unsigned int connect_tries
;
144 struct thread
*t_wakeup
;
145 struct thread
*t_read
;
146 struct thread
*t_write
;
148 struct restart_info restart
;
151 #define OPTION_MINRESTART 2000
152 #define OPTION_MAXRESTART 2001
153 #define OPTION_DRY 2002
155 static const struct option longopts
[] = {
156 {"daemon", no_argument
, NULL
, 'd'},
157 {"statedir", required_argument
, NULL
, 'S'},
158 {"loglevel", required_argument
, NULL
, 'l'},
159 {"interval", required_argument
, NULL
, 'i'},
160 {"timeout", required_argument
, NULL
, 't'},
161 {"restart-timeout", required_argument
, NULL
, 'T'},
162 {"restart", required_argument
, NULL
, 'r'},
163 {"start-command", required_argument
, NULL
, 's'},
164 {"kill-command", required_argument
, NULL
, 'k'},
165 {"dry", no_argument
, NULL
, OPTION_DRY
},
166 {"min-restart-interval", required_argument
, NULL
, OPTION_MINRESTART
},
167 {"max-restart-interval", required_argument
, NULL
, OPTION_MAXRESTART
},
168 {"pid-file", required_argument
, NULL
, 'p'},
169 {"blank-string", required_argument
, NULL
, 'b'},
170 {"help", no_argument
, NULL
, 'h'},
171 {"version", no_argument
, NULL
, 'v'},
174 static int try_connect(struct daemon
*dmn
);
175 static int wakeup_send_echo(struct thread
*t_wakeup
);
176 static void try_restart(struct daemon
*dmn
);
177 static void phase_check(void);
179 static const char *progname
;
180 static void printhelp(FILE *target
)
183 "Usage : %s [OPTION...] <daemon name> ...\n\n\
184 Watchdog program to monitor status of frr daemons and try to restart\n\
185 them if they are down or unresponsive. It determines whether a daemon is\n\
186 up based on whether it can connect to the daemon's vty unix stream socket.\n\
187 It then repeatedly sends echo commands over that socket to determine whether\n\
188 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
189 on the socket connection and know immediately that the daemon is down.\n\n\
190 The daemons to be monitored should be listed on the command line.\n\n\
191 In order to avoid attempting to restart the daemons in a fast loop,\n\
192 the -m and -M options allow you to control the minimum delay between\n\
193 restart commands. The minimum restart delay is recalculated each time\n\
194 a restart is attempted: if the time since the last restart attempt exceeds\n\
195 twice the -M value, then the restart delay is set to the -m value.\n\
196 Otherwise, the interval is doubled (but capped at the -M value).\n\n",
201 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
202 to syslog instead of stdout.\n\
203 -S, --statedir Set the vty socket directory (default is %s)\n\
204 -l, --loglevel Set the logging level (default is %d).\n\
205 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
206 but it can be set higher than %d if extra-verbose debugging\n\
207 messages are desired.\n\
208 --min-restart-interval\n\
209 Set the minimum seconds to wait between invocations of daemon\n\
210 restart commands (default is %d).\n\
211 --max-restart-interval\n\
212 Set the maximum seconds to wait between invocations of daemon\n\
213 restart commands (default is %d).\n\
214 -i, --interval Set the status polling interval in seconds (default is %d)\n\
215 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
216 -T, --restart-timeout\n\
217 Set the restart (kill) timeout in seconds (default is %d).\n\
218 If any background jobs are still running after this much\n\
219 time has elapsed, they will be killed.\n\
220 -r, --restart Supply a Bourne shell command to use to restart a single\n\
221 daemon. The command string should include '%%s' where the\n\
222 name of the daemon should be substituted.\n\
223 -s, --start-command\n\
224 Supply a Bourne shell to command to use to start a single\n\
225 daemon. The command string should include '%%s' where the\n\
226 name of the daemon should be substituted.\n\
227 -k, --kill-command\n\
228 Supply a Bourne shell to command to use to stop a single\n\
229 daemon. The command string should include '%%s' where the\n\
230 name of the daemon should be substituted.\n\
231 --dry Do not start or restart anything, just log.\n\
232 -p, --pid-file Set process identifier file name\n\
234 -b, --blank-string\n\
235 When the supplied argument string is found in any of the\n\
236 various shell command arguments (-r, -s, or -k), replace\n\
237 it with a space. This is an ugly hack to circumvent problems\n\
238 passing command-line arguments with embedded spaces.\n\
239 -v, --version Print program version\n\
240 -h, --help Display this help and exit\n",
241 frr_vtydir
, DEFAULT_LOGLEVEL
, LOG_EMERG
, LOG_DEBUG
, LOG_DEBUG
,
242 DEFAULT_MIN_RESTART
, DEFAULT_MAX_RESTART
, DEFAULT_PERIOD
,
243 DEFAULT_TIMEOUT
, DEFAULT_RESTART_TIMEOUT
, pidfile_default
);
246 static pid_t
run_background(char *shell_cmd
)
250 switch (child
= fork()) {
252 flog_err_sys(EC_LIB_SYSTEM_CALL
,
253 "fork failed, cannot run command [%s]: %s",
254 shell_cmd
, safe_strerror(errno
));
258 /* Use separate process group so child processes can be killed
260 if (setpgid(0, 0) < 0)
261 zlog_warn("warning: setpgid(0,0) failed: %s",
262 safe_strerror(errno
));
266 char *const argv
[4] = {shell
, dashc
, shell_cmd
, NULL
};
267 execv("/bin/sh", argv
);
268 flog_err_sys(EC_LIB_SYSTEM_CALL
,
269 "execv(/bin/sh -c '%s') failed: %s",
270 shell_cmd
, safe_strerror(errno
));
274 /* Parent process: we will reap the child later. */
275 flog_err_sys(EC_LIB_SYSTEM_CALL
,
276 "Forked background command [pid %d]: %s",
277 (int)child
, shell_cmd
);
/*
 * Compute the wall-clock time elapsed since *start_time.
 *
 * The current time is read with gettimeofday() into *result, *start_time is
 * subtracted from it field by field, and the microsecond field is then
 * normalised to be non-negative by borrowing from the seconds field.
 *
 * Returns result, so the call can be used directly in an expression such as
 * time_elapsed(&delay, &t0)->tv_sec.
 */
static struct timeval *time_elapsed(struct timeval *result,
				    const struct timeval *start_time)
{
	gettimeofday(result, NULL);
	result->tv_sec -= start_time->tv_sec;
	result->tv_usec -= start_time->tv_usec;
	/* Borrow whole seconds until tv_usec is back in range. */
	while (result->tv_usec < 0) {
		result->tv_usec += 1000000L;
		result->tv_sec--;
	}
	return result;
}
295 static int restart_kill(struct thread
*t_kill
)
297 struct restart_info
*restart
= THREAD_ARG(t_kill
);
298 struct timeval delay
;
300 time_elapsed(&delay
, &restart
->time
);
302 "Warning: %s %s child process %d still running after "
303 "%ld seconds, sending signal %d",
304 restart
->what
, restart
->name
, (int)restart
->pid
,
305 (long)delay
.tv_sec
, (restart
->kills
? SIGKILL
: SIGTERM
));
306 kill(-restart
->pid
, (restart
->kills
? SIGKILL
: SIGTERM
));
308 restart
->t_kill
= NULL
;
309 thread_add_timer(master
, restart_kill
, restart
, gs
.restart_timeout
,
314 static struct restart_info
*find_child(pid_t child
)
317 if (gs
.restart
.pid
== child
)
320 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
321 if (dmn
->restart
.pid
== child
)
322 return &dmn
->restart
;
327 static void sigchild(void)
333 struct restart_info
*restart
;
335 switch (child
= waitpid(-1, &status
, WNOHANG
)) {
337 flog_err_sys(EC_LIB_SYSTEM_CALL
, "waitpid failed: %s",
338 safe_strerror(errno
));
341 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
345 if (child
== integrated_write_pid
) {
346 integrated_write_sigchld(status
);
350 if ((restart
= find_child(child
)) != NULL
) {
351 name
= restart
->name
;
352 what
= restart
->what
;
355 thread_cancel(restart
->t_kill
);
356 restart
->t_kill
= NULL
;
357 /* Update restart time to reflect the time the command
359 gettimeofday(&restart
->time
, NULL
);
363 "waitpid returned status for an unknown child process %d",
368 if (WIFSTOPPED(status
))
369 zlog_warn("warning: %s %s process %d is stopped", what
, name
,
371 else if (WIFSIGNALED(status
))
372 zlog_warn("%s %s process %d terminated due to signal %d", what
,
373 name
, (int)child
, WTERMSIG(status
));
374 else if (WIFEXITED(status
)) {
375 if (WEXITSTATUS(status
) != 0)
377 "%s %s process %d exited with non-zero status %d",
378 what
, name
, (int)child
, WEXITSTATUS(status
));
380 zlog_debug("%s %s process %d exited normally", what
,
385 "cannot interpret %s %s process %d wait status 0x%x",
386 what
, name
, (int)child
, status
);
390 static int run_job(struct restart_info
*restart
, const char *cmdtype
,
391 const char *command
, int force
, int update_interval
)
393 struct timeval delay
;
395 if (gs
.loglevel
> LOG_DEBUG
+ 1)
396 zlog_debug("attempting to %s %s", cmdtype
, restart
->name
);
399 if (gs
.loglevel
> LOG_DEBUG
+ 1)
401 "cannot %s %s, previous pid %d still running",
402 cmdtype
, restart
->name
, (int)restart
->pid
);
406 /* Note: time_elapsed test must come before the force test, since we
408 to make sure that delay is initialized for use below in updating the
410 if ((time_elapsed(&delay
, &restart
->time
)->tv_sec
< restart
->interval
)
412 if (gs
.loglevel
> LOG_DEBUG
+ 1)
415 "elapsed time %ld < retry interval %ld",
416 cmdtype
, restart
->name
, (long)delay
.tv_sec
,
421 gettimeofday(&restart
->time
, NULL
);
424 char cmd
[strlen(command
) + strlen(restart
->name
) + 1];
425 snprintf(cmd
, sizeof(cmd
), command
, restart
->name
);
426 if ((restart
->pid
= run_background(cmd
)) > 0) {
427 restart
->t_kill
= NULL
;
428 thread_add_timer(master
, restart_kill
, restart
,
429 gs
.restart_timeout
, &restart
->t_kill
);
430 restart
->what
= cmdtype
;
436 /* Calculate the new restart interval. */
437 if (update_interval
) {
438 if (delay
.tv_sec
> 2 * gs
.max_restart_interval
)
439 restart
->interval
= gs
.min_restart_interval
;
440 else if ((restart
->interval
*= 2) > gs
.max_restart_interval
)
441 restart
->interval
= gs
.max_restart_interval
;
442 if (gs
.loglevel
> LOG_DEBUG
+ 1)
443 zlog_debug("restart %s interval is now %ld",
444 restart
->name
, restart
->interval
);
449 #define SET_READ_HANDLER(DMN) \
451 (DMN)->t_read = NULL; \
452 thread_add_read(master, handle_read, (DMN), (DMN)->fd, \
456 #define SET_WAKEUP_DOWN(DMN) \
458 (DMN)->t_wakeup = NULL; \
459 thread_add_timer_msec(master, wakeup_down, (DMN), \
460 FUZZY(gs.period), &(DMN)->t_wakeup); \
463 #define SET_WAKEUP_UNRESPONSIVE(DMN) \
465 (DMN)->t_wakeup = NULL; \
466 thread_add_timer_msec(master, wakeup_unresponsive, (DMN), \
467 FUZZY(gs.period), &(DMN)->t_wakeup); \
470 #define SET_WAKEUP_ECHO(DMN) \
472 (DMN)->t_wakeup = NULL; \
473 thread_add_timer_msec(master, wakeup_send_echo, (DMN), \
474 FUZZY(gs.period), &(DMN)->t_wakeup); \
477 static int wakeup_down(struct thread
*t_wakeup
)
479 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
481 dmn
->t_wakeup
= NULL
;
482 if (try_connect(dmn
) < 0)
483 SET_WAKEUP_DOWN(dmn
);
484 if ((dmn
->connect_tries
> 1) && (dmn
->state
!= DAEMON_UP
))
489 static int wakeup_init(struct thread
*t_wakeup
)
491 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
493 dmn
->t_wakeup
= NULL
;
494 if (try_connect(dmn
) < 0) {
495 SET_WAKEUP_DOWN(dmn
);
496 flog_err(EC_WATCHFRR_CONNECTION
,
497 "%s state -> down : initial connection attempt failed",
499 dmn
->state
= DAEMON_DOWN
;
504 static void daemon_down(struct daemon
*dmn
, const char *why
)
506 if (IS_UP(dmn
) || (dmn
->state
== DAEMON_INIT
))
507 flog_err(EC_WATCHFRR_CONNECTION
, "%s state -> down : %s",
509 else if (gs
.loglevel
> LOG_DEBUG
)
510 zlog_debug("%s still down : %s", dmn
->name
, why
);
513 dmn
->state
= DAEMON_DOWN
;
518 THREAD_OFF(dmn
->t_read
);
519 THREAD_OFF(dmn
->t_write
);
520 THREAD_OFF(dmn
->t_wakeup
);
521 if (try_connect(dmn
) < 0)
522 SET_WAKEUP_DOWN(dmn
);
526 static int handle_read(struct thread
*t_read
)
528 struct daemon
*dmn
= THREAD_ARG(t_read
);
529 static const char resp
[sizeof(PING_TOKEN
) + 4] = PING_TOKEN
"\n";
530 char buf
[sizeof(resp
) + 100];
532 struct timeval delay
;
535 if ((rc
= read(dmn
->fd
, buf
, sizeof(buf
))) < 0) {
538 if (ERRNO_IO_RETRY(errno
)) {
539 /* Pretend it never happened. */
540 SET_READ_HANDLER(dmn
);
543 snprintf(why
, sizeof(why
), "unexpected read error: %s",
544 safe_strerror(errno
));
545 daemon_down(dmn
, why
);
549 daemon_down(dmn
, "read returned EOF");
552 if (!dmn
->echo_sent
.tv_sec
) {
553 char why
[sizeof(buf
) + 100];
554 snprintf(why
, sizeof(why
),
555 "unexpected read returns %d bytes: %.*s", (int)rc
,
557 daemon_down(dmn
, why
);
561 /* We are expecting an echo response: is there any chance that the
562 response would not be returned entirely in the first read? That
563 seems inconceivable... */
564 if ((rc
!= sizeof(resp
)) || memcmp(buf
, resp
, sizeof(resp
))) {
565 char why
[100 + sizeof(buf
)];
566 snprintf(why
, sizeof(why
),
567 "read returned bad echo response of %d bytes "
568 "(expecting %u): %.*s",
569 (int)rc
, (unsigned int)sizeof(resp
), (int)rc
, buf
);
570 daemon_down(dmn
, why
);
574 time_elapsed(&delay
, &dmn
->echo_sent
);
575 dmn
->echo_sent
.tv_sec
= 0;
576 if (dmn
->state
== DAEMON_UNRESPONSIVE
) {
577 if (delay
.tv_sec
< gs
.timeout
) {
578 dmn
->state
= DAEMON_UP
;
580 "%s state -> up : echo response received after %ld.%06ld "
582 dmn
->name
, (long)delay
.tv_sec
,
583 (long)delay
.tv_usec
);
586 "%s: slow echo response finally received after %ld.%06ld "
588 dmn
->name
, (long)delay
.tv_sec
,
589 (long)delay
.tv_usec
);
590 } else if (gs
.loglevel
> LOG_DEBUG
+ 1)
591 zlog_debug("%s: echo response received after %ld.%06ld seconds",
592 dmn
->name
, (long)delay
.tv_sec
, (long)delay
.tv_usec
);
594 SET_READ_HANDLER(dmn
);
596 thread_cancel(dmn
->t_wakeup
);
597 SET_WAKEUP_ECHO(dmn
);
603 * Wait until every monitored daemon has been seen up before
604 * telling systemd that watchfrr is ready.
606 static void daemon_send_ready(void)
609 if (!sent
&& gs
.numdown
== 0) {
612 fp
= fopen(DAEMON_VTY_DIR
"/watchfrr.started", "w");
615 #if defined HAVE_SYSTEMD
617 "Watchfrr: Notifying Systemd we are up and running");
618 systemd_send_started(master
, 0);
624 static void daemon_up(struct daemon
*dmn
, const char *why
)
626 dmn
->state
= DAEMON_UP
;
628 dmn
->connect_tries
= 0;
629 zlog_notice("%s state -> up : %s", dmn
->name
, why
);
631 SET_WAKEUP_ECHO(dmn
);
635 static int check_connect(struct thread
*t_write
)
637 struct daemon
*dmn
= THREAD_ARG(t_write
);
639 socklen_t reslen
= sizeof(sockerr
);
642 if (getsockopt(dmn
->fd
, SOL_SOCKET
, SO_ERROR
, (char *)&sockerr
, &reslen
)
644 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn
->name
,
645 safe_strerror(errno
));
647 "getsockopt failed checking connection success");
650 if ((reslen
== sizeof(sockerr
)) && sockerr
) {
654 "getsockopt reports that connection attempt failed: %s",
655 safe_strerror(sockerr
));
656 daemon_down(dmn
, why
);
660 daemon_up(dmn
, "delayed connect succeeded");
664 static int wakeup_connect_hanging(struct thread
*t_wakeup
)
666 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
669 dmn
->t_wakeup
= NULL
;
670 snprintf(why
, sizeof(why
),
671 "connection attempt timed out after %ld seconds", gs
.timeout
);
672 daemon_down(dmn
, why
);
676 /* Making connection to protocol daemon. */
677 static int try_connect(struct daemon
*dmn
)
680 struct sockaddr_un addr
;
683 if (gs
.loglevel
> LOG_DEBUG
+ 1)
684 zlog_debug("%s: attempting to connect", dmn
->name
);
685 dmn
->connect_tries
++;
687 memset(&addr
, 0, sizeof(struct sockaddr_un
));
688 addr
.sun_family
= AF_UNIX
;
689 snprintf(addr
.sun_path
, sizeof(addr
.sun_path
), "%s/%s.vty", gs
.vtydir
,
691 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
692 len
= addr
.sun_len
= SUN_LEN(&addr
);
694 len
= sizeof(addr
.sun_family
) + strlen(addr
.sun_path
);
695 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
697 /* Quick check to see if we might succeed before we go to the trouble
698 of creating a socket. */
699 if (access(addr
.sun_path
, W_OK
) < 0) {
701 flog_err_sys(EC_LIB_SYSTEM_CALL
,
702 "%s: access to socket %s denied: %s",
703 dmn
->name
, addr
.sun_path
,
704 safe_strerror(errno
));
708 if ((sock
= socket(AF_UNIX
, SOCK_STREAM
, 0)) < 0) {
709 flog_err_sys(EC_LIB_SOCKET
, "%s(%s): cannot make socket: %s",
710 __func__
, addr
.sun_path
, safe_strerror(errno
));
714 if (set_nonblocking(sock
) < 0 || set_cloexec(sock
) < 0) {
715 flog_err_sys(EC_LIB_SYSTEM_CALL
,
716 "%s(%s): set_nonblocking/cloexec(%d) failed",
717 __func__
, addr
.sun_path
, sock
);
722 if (connect(sock
, (struct sockaddr
*)&addr
, len
) < 0) {
723 if ((errno
!= EINPROGRESS
) && (errno
!= EWOULDBLOCK
)) {
724 if (gs
.loglevel
> LOG_DEBUG
)
725 zlog_debug("%s(%s): connect failed: %s",
726 __func__
, addr
.sun_path
,
727 safe_strerror(errno
));
731 if (gs
.loglevel
> LOG_DEBUG
)
732 zlog_debug("%s: connection in progress", dmn
->name
);
733 dmn
->state
= DAEMON_CONNECTING
;
736 thread_add_write(master
, check_connect
, dmn
, dmn
->fd
,
738 dmn
->t_wakeup
= NULL
;
739 thread_add_timer(master
, wakeup_connect_hanging
, dmn
,
740 gs
.timeout
, &dmn
->t_wakeup
);
741 SET_READ_HANDLER(dmn
);
746 SET_READ_HANDLER(dmn
);
747 daemon_up(dmn
, "connect succeeded");
751 static int phase_hanging(struct thread
*t_hanging
)
753 gs
.t_phase_hanging
= NULL
;
754 flog_err(EC_WATCHFRR_CONNECTION
,
755 "Phase [%s] hanging for %ld seconds, aborting phased restart",
756 phase_str
[gs
.phase
], PHASE_TIMEOUT
);
757 gs
.phase
= PHASE_NONE
;
761 static void set_phase(restart_phase_t new_phase
)
763 gs
.phase
= new_phase
;
764 if (gs
.t_phase_hanging
)
765 thread_cancel(gs
.t_phase_hanging
);
766 gs
.t_phase_hanging
= NULL
;
767 thread_add_timer(master
, phase_hanging
, NULL
, PHASE_TIMEOUT
,
768 &gs
.t_phase_hanging
);
771 static void phase_check(void)
776 case PHASE_STOPS_PENDING
:
780 "Phased restart: all routing daemon stop jobs have completed.");
781 set_phase(PHASE_WAITING_DOWN
);
784 case PHASE_WAITING_DOWN
:
785 if (gs
.numdown
+ IS_UP(gs
.special
) < gs
.numdaemons
)
787 zlog_info("Phased restart: all routing daemons now down.");
788 run_job(&gs
.special
->restart
, "restart", gs
.restart_command
, 1,
790 set_phase(PHASE_ZEBRA_RESTART_PENDING
);
793 case PHASE_ZEBRA_RESTART_PENDING
:
794 if (gs
.special
->restart
.pid
)
796 zlog_info("Phased restart: %s restart job completed.",
798 set_phase(PHASE_WAITING_ZEBRA_UP
);
801 case PHASE_WAITING_ZEBRA_UP
:
802 if (!IS_UP(gs
.special
))
804 zlog_info("Phased restart: %s is now up.", gs
.special
->name
);
807 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
808 if (dmn
!= gs
.special
)
809 run_job(&dmn
->restart
, "start",
810 gs
.start_command
, 1, 0);
813 gs
.phase
= PHASE_NONE
;
814 THREAD_OFF(gs
.t_phase_hanging
);
815 zlog_notice("Phased global restart has completed.");
820 static void try_restart(struct daemon
*dmn
)
825 if (dmn
!= gs
.special
) {
826 if ((gs
.special
->state
== DAEMON_UP
)
827 && (gs
.phase
== PHASE_NONE
))
828 run_job(&dmn
->restart
, "restart", gs
.restart_command
, 0,
832 "%s: postponing restart attempt because master %s daemon "
833 "not up [%s], or phased restart in progress",
834 dmn
->name
, gs
.special
->name
,
835 state_str
[gs
.special
->state
]);
839 if ((gs
.phase
!= PHASE_NONE
) || gs
.numpids
) {
840 if (gs
.loglevel
> LOG_DEBUG
+ 1)
842 "postponing phased global restart: restart already in "
843 "progress [%s], or outstanding child processes [%d]",
844 phase_str
[gs
.phase
], gs
.numpids
);
847 /* Is it too soon for a restart? */
849 struct timeval delay
;
850 if (time_elapsed(&delay
, &gs
.special
->restart
.time
)->tv_sec
851 < gs
.special
->restart
.interval
) {
852 if (gs
.loglevel
> LOG_DEBUG
+ 1)
854 "postponing phased global restart: "
855 "elapsed time %ld < retry interval %ld",
857 gs
.special
->restart
.interval
);
861 run_job(&gs
.restart
, "restart", gs
.restart_command
, 0, 1);
864 static int wakeup_unresponsive(struct thread
*t_wakeup
)
866 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
868 dmn
->t_wakeup
= NULL
;
869 if (dmn
->state
!= DAEMON_UNRESPONSIVE
)
870 flog_err(EC_WATCHFRR_CONNECTION
,
871 "%s: no longer unresponsive (now %s), "
872 "wakeup should have been cancelled!",
873 dmn
->name
, state_str
[dmn
->state
]);
875 SET_WAKEUP_UNRESPONSIVE(dmn
);
881 static int wakeup_no_answer(struct thread
*t_wakeup
)
883 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
885 dmn
->t_wakeup
= NULL
;
886 dmn
->state
= DAEMON_UNRESPONSIVE
;
887 flog_err(EC_WATCHFRR_CONNECTION
,
888 "%s state -> unresponsive : no response yet to ping "
889 "sent %ld seconds ago",
890 dmn
->name
, gs
.timeout
);
891 SET_WAKEUP_UNRESPONSIVE(dmn
);
896 static int wakeup_send_echo(struct thread
*t_wakeup
)
898 static const char echocmd
[] = "echo " PING_TOKEN
;
900 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
902 dmn
->t_wakeup
= NULL
;
903 if (((rc
= write(dmn
->fd
, echocmd
, sizeof(echocmd
))) < 0)
904 || ((size_t)rc
!= sizeof(echocmd
))) {
905 char why
[100 + sizeof(echocmd
)];
906 snprintf(why
, sizeof(why
),
907 "write '%s' returned %d instead of %u", echocmd
,
908 (int)rc
, (unsigned int)sizeof(echocmd
));
909 daemon_down(dmn
, why
);
911 gettimeofday(&dmn
->echo_sent
, NULL
);
912 dmn
->t_wakeup
= NULL
;
913 thread_add_timer(master
, wakeup_no_answer
, dmn
, gs
.timeout
,
919 bool check_all_up(void)
923 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
924 if (dmn
->state
!= DAEMON_UP
)
929 static void sigint(void)
931 zlog_notice("Terminating on signal");
932 systemd_send_stopping();
/*
 * Check that cmd can be used as a printf-style template taking exactly one
 * string argument (the daemon name): it must contain exactly one '%'
 * character, and that '%' must be immediately followed by 's'.
 *
 * Returns non-zero if cmd is acceptable, 0 otherwise (including when cmd is
 * NULL).
 */
static int valid_command(const char *cmd)
{
	const char *p;

	if (cmd == NULL)
		return 0;

	p = strchr(cmd, '%');
	if (p == NULL || p[1] != 's')
		return 0;

	/* Any further '%' would be a second conversion (or "%%"): reject. */
	return strchr(p + 1, '%') == NULL;
}
/*
 * Return a newly allocated copy of cmd in which every occurrence of
 * blankstr has been replaced by a single space.
 *
 * This is an ugly hack to circumvent problems with passing command-line
 * arguments that contain spaces.  The fix is to use a configuration file.
 *
 * An empty blankstr matches nothing, so cmd is copied unchanged.  After each
 * replacement the search resumes just past the inserted space rather than
 * from the start of the buffer; this guarantees termination even when
 * blankstr itself contains a space (e.g. " ").
 *
 * The caller owns the returned buffer.  Callers store the result without
 * checking it, so allocation failure is fatal: the process exits with
 * status 1.
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	size_t bslen = strlen(blankstr);
	size_t cmdlen = strlen(cmd);
	char *res;
	char *p;

	res = malloc(cmdlen + 1);
	if (!res) {
		perror("malloc");
		exit(1);
	}
	memcpy(res, cmd, cmdlen + 1);

	/* strstr() reports a match for "" at every position; treating that
	 * as "replace" would grow the string without bound. */
	if (bslen == 0)
		return res;

	p = res;
	while ((p = strstr(p, blankstr)) != NULL) {
		*p = ' ';
		/* Close the gap left by the rest of the marker (the regions
		 * overlap, hence memmove; the +1 carries the terminator). */
		if (bslen != 1)
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
		p++;
	}
	return res;
}
964 struct zebra_privs_t watchfrr_privs
= {
966 .vty_group
= VTY_GROUP
,
970 static struct quagga_signal_t watchfrr_signals
[] = {
985 FRR_DAEMON_INFO(watchfrr
, WATCHFRR
,
986 .flags
= FRR_NO_PRIVSEP
| FRR_NO_TCPVTY
| FRR_LIMITED_CLI
987 | FRR_NO_CFG_PID_DRY
| FRR_NO_ZCLIENT
,
989 .printhelp
= printhelp
,
990 .copyright
= "Copyright 2004 Andrew J. Schorr",
992 .signals
= watchfrr_signals
,
993 .n_signals
= array_size(watchfrr_signals
),
995 .privs
= &watchfrr_privs
, )
997 #define DEPRECATED_OPTIONS "aAezR:"
999 int main(int argc
, char **argv
)
1002 const char *pidfile
= pidfile_default
;
1003 const char *special
= "zebra";
1004 const char *blankstr
= NULL
;
1006 snprintf(pidfile_default
, sizeof(pidfile_default
), "%s/watchfrr.pid",
1009 frr_preinit(&watchfrr_di
, argc
, argv
);
1010 progname
= watchfrr_di
.progname
;
1012 frr_opt_add("b:dk:l:i:p:r:S:s:t:T:" DEPRECATED_OPTIONS
, longopts
, "");
1014 gs
.restart
.name
= "all";
1015 while ((opt
= frr_getopt(argc
, argv
, NULL
)) != EOF
) {
1016 if (opt
&& opt
< 128 && strchr(DEPRECATED_OPTIONS
, opt
)) {
1018 "The -%c option no longer exists.\n"
1019 "Please refer to the watchfrr(8) man page.\n",
1034 if (!valid_command(optarg
)) {
1036 "Invalid kill command, must contain '%%s': %s\n",
1040 gs
.stop_command
= optarg
;
1044 if ((sscanf(optarg
, "%d%1s", &gs
.loglevel
, garbage
)
1046 || (gs
.loglevel
< LOG_EMERG
)) {
1048 "Invalid loglevel argument: %s\n",
1053 case OPTION_MINRESTART
: {
1055 if ((sscanf(optarg
, "%ld%1s", &gs
.min_restart_interval
,
1058 || (gs
.min_restart_interval
< 0)) {
1060 "Invalid min_restart_interval argument: %s\n",
1065 case OPTION_MAXRESTART
: {
1067 if ((sscanf(optarg
, "%ld%1s", &gs
.max_restart_interval
,
1070 || (gs
.max_restart_interval
< 0)) {
1072 "Invalid max_restart_interval argument: %s\n",
/* Range-check the freshly parsed value, not gs.period: at this point
 * gs.period still holds the previous (default) setting, so testing it
 * let zero and negative intervals through unchallenged. */
1080 if ((sscanf(optarg
, "%d%1s", &period
, garbage
) != 1)
1081 || (period
< 1)) {
1083 "Invalid interval argument: %s\n",
1087 gs
.period
= 1000 * period
;
1093 if (!valid_command(optarg
)) {
1095 "Invalid restart command, must contain '%%s': %s\n",
1099 gs
.restart_command
= optarg
;
1102 if (!valid_command(optarg
)) {
1104 "Invalid start command, must contain '%%s': %s\n",
1108 gs
.start_command
= optarg
;
1115 if ((sscanf(optarg
, "%ld%1s", &gs
.timeout
, garbage
)
1117 || (gs
.timeout
< 1)) {
1119 "Invalid timeout argument: %s\n",
1126 if ((sscanf(optarg
, "%ld%1s", &gs
.restart_timeout
,
1129 || (gs
.restart_timeout
< 1)) {
1131 "Invalid restart timeout argument: %s\n",
1137 fputs("Invalid option.\n", stderr
);
1143 && (gs
.start_command
|| gs
.stop_command
|| gs
.restart_command
)) {
1144 fputs("Options -r/-s/-k are not used when --dry is active.\n",
1148 && (!gs
.restart_command
|| !gs
.start_command
|| !gs
.stop_command
)) {
1150 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1155 if (gs
.restart_command
)
1156 gs
.restart_command
=
1157 translate_blanks(gs
.restart_command
, blankstr
);
1158 if (gs
.start_command
)
1160 translate_blanks(gs
.start_command
, blankstr
);
1161 if (gs
.stop_command
)
1163 translate_blanks(gs
.stop_command
, blankstr
);
1166 gs
.restart
.interval
= gs
.min_restart_interval
;
1168 master
= frr_init();
1169 watchfrr_error_init();
1171 zlog_set_level(ZLOG_DEST_MONITOR
, ZLOG_DISABLED
);
1172 if (watchfrr_di
.daemon_mode
) {
1173 zlog_set_level(ZLOG_DEST_SYSLOG
, MIN(gs
.loglevel
, LOG_DEBUG
));
1174 if (daemon(0, 0) < 0) {
1175 fprintf(stderr
, "Watchfrr daemon failed: %s",
1180 zlog_set_level(ZLOG_DEST_STDOUT
, MIN(gs
.loglevel
, LOG_DEBUG
));
1182 watchfrr_vty_init();
1188 struct daemon
*tail
= NULL
;
1190 for (i
= optind
; i
< argc
; i
++) {
1193 if (!(dmn
= (struct daemon
*)calloc(1, sizeof(*dmn
)))) {
1194 fprintf(stderr
, "calloc(1,%u) failed: %s\n",
1195 (unsigned int)sizeof(*dmn
),
1196 safe_strerror(errno
));
1199 dmn
->name
= dmn
->restart
.name
= argv
[i
];
1200 dmn
->state
= DAEMON_INIT
;
1204 dmn
->t_wakeup
= NULL
;
1205 thread_add_timer_msec(master
, wakeup_init
, dmn
,
1206 100 + (random() % 900),
1208 dmn
->restart
.interval
= gs
.min_restart_interval
;
1215 if (!strcmp(dmn
->name
, special
))
1220 fputs("Must specify one or more daemons to monitor.\n", stderr
);
1223 if (!watch_only
&& !gs
.special
) {
1224 fprintf(stderr
, "\"%s\" daemon must be in daemon list\n",
1229 /* Make sure we're not already running. */
1230 pid_output(pidfile
);
1232 /* Announce which daemons are being monitored. */
1237 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
1238 len
+= strlen(dmn
->name
) + 1;
1244 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
1247 strcpy(p
, dmn
->name
);
1250 zlog_notice("%s %s watching [%s]%s", progname
,
1252 watch_only
? ", monitor mode" : "");
1257 struct thread thread
;
1259 while (thread_fetch(master
, &thread
))
1260 thread_call(&thread
);
1263 systemd_send_stopping();