2 * Monitor status of frr daemons and restart if necessary.
4 * Copyright (C) 2004 Andrew J. Schorr
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
26 #include <lib/version.h>
28 #include "memory_vty.h"
30 #include "lib_errors.h"
39 #include "watchfrr_errors.h"
42 #define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
45 /* Macros to help randomize timers. */
46 #define JITTER(X) ((random() % ((X)+1))-((X)/2))
47 #define FUZZY(X) ((X)+JITTER((X)/20))
49 #define DEFAULT_PERIOD 5
50 #define DEFAULT_TIMEOUT 90
51 #define DEFAULT_RESTART_TIMEOUT 20
52 #define DEFAULT_LOGLEVEL LOG_INFO
53 #define DEFAULT_MIN_RESTART 60
54 #define DEFAULT_MAX_RESTART 600
56 #define PING_TOKEN "PING"
58 /* Needs to be global, referenced somewhere inside libfrr. */
59 struct thread_master
*master
;
60 static char pidfile_default
[256];
62 static bool watch_only
= false;
68 PHASE_ZEBRA_RESTART_PENDING
,
69 PHASE_WAITING_ZEBRA_UP
72 static const char *phase_str
[] = {
75 "Waiting for other daemons to come down",
76 "Zebra restart job running",
77 "Waiting for zebra to come up",
81 #define PHASE_TIMEOUT (3*gs.restart_timeout)
89 struct thread
*t_kill
;
93 static struct global_state
{
94 restart_phase_t phase
;
95 struct thread
*t_phase_hanging
;
100 long min_restart_interval
;
101 long max_restart_interval
;
102 struct daemon
*daemons
;
103 const char *restart_command
;
104 const char *start_command
;
105 const char *stop_command
;
106 struct restart_info restart
;
108 struct daemon
*special
; /* points to zebra when doing phased restart */
111 int numdown
; /* # of daemons that are not UP or UNRESPONSIVE */
114 .vtydir
= frr_vtydir
,
115 .period
= 1000 * DEFAULT_PERIOD
,
116 .timeout
= DEFAULT_TIMEOUT
,
117 .restart_timeout
= DEFAULT_RESTART_TIMEOUT
,
118 .loglevel
= DEFAULT_LOGLEVEL
,
119 .min_restart_interval
= DEFAULT_MIN_RESTART
,
120 .max_restart_interval
= DEFAULT_MAX_RESTART
,
132 (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
134 static const char *state_str
[] = {
135 "Init", "Down", "Connecting", "Up", "Unresponsive",
140 daemon_state_t state
;
142 struct timeval echo_sent
;
143 unsigned int connect_tries
;
144 struct thread
*t_wakeup
;
145 struct thread
*t_read
;
146 struct thread
*t_write
;
148 struct restart_info restart
;
151 #define OPTION_MINRESTART 2000
152 #define OPTION_MAXRESTART 2001
153 #define OPTION_DRY 2002
155 static const struct option longopts
[] = {
156 {"daemon", no_argument
, NULL
, 'd'},
157 {"statedir", required_argument
, NULL
, 'S'},
158 {"loglevel", required_argument
, NULL
, 'l'},
159 {"interval", required_argument
, NULL
, 'i'},
160 {"timeout", required_argument
, NULL
, 't'},
161 {"restart-timeout", required_argument
, NULL
, 'T'},
162 {"restart", required_argument
, NULL
, 'r'},
163 {"start-command", required_argument
, NULL
, 's'},
164 {"kill-command", required_argument
, NULL
, 'k'},
165 {"dry", no_argument
, NULL
, OPTION_DRY
},
166 {"min-restart-interval", required_argument
, NULL
, OPTION_MINRESTART
},
167 {"max-restart-interval", required_argument
, NULL
, OPTION_MAXRESTART
},
168 {"pid-file", required_argument
, NULL
, 'p'},
169 {"blank-string", required_argument
, NULL
, 'b'},
170 {"help", no_argument
, NULL
, 'h'},
171 {"version", no_argument
, NULL
, 'v'},
174 static int try_connect(struct daemon
*dmn
);
175 static int wakeup_send_echo(struct thread
*t_wakeup
);
176 static void try_restart(struct daemon
*dmn
);
177 static void phase_check(void);
179 static const char *progname
;
180 static void printhelp(FILE *target
)
183 "Usage : %s [OPTION...] <daemon name> ...\n\n\
184 Watchdog program to monitor status of frr daemons and try to restart\n\
185 them if they are down or unresponsive. It determines whether a daemon is\n\
186 up based on whether it can connect to the daemon's vty unix stream socket.\n\
187 It then repeatedly sends echo commands over that socket to determine whether\n\
188 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
189 on the socket connection and know immediately that the daemon is down.\n\n\
190 The daemons to be monitored should be listed on the command line.\n\n\
191 In order to avoid attempting to restart the daemons in a fast loop,\n\
192 the -m and -M options allow you to control the minimum delay between\n\
193 restart commands. The minimum restart delay is recalculated each time\n\
194 a restart is attempted: if the time since the last restart attempt exceeds\n\
195 twice the -M value, then the restart delay is set to the -m value.\n\
196 Otherwise, the interval is doubled (but capped at the -M value).\n\n",
201 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
202 to syslog instead of stdout.\n\
203 -S, --statedir Set the vty socket directory (default is %s)\n\
204 -l, --loglevel Set the logging level (default is %d).\n\
205 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
206 but it can be set higher than %d if extra-verbose debugging\n\
207 messages are desired.\n\
208 --min-restart-interval\n\
209 Set the minimum seconds to wait between invocations of daemon\n\
210 restart commands (default is %d).\n\
211 --max-restart-interval\n\
212 Set the maximum seconds to wait between invocations of daemon\n\
213 restart commands (default is %d).\n\
214 -i, --interval Set the status polling interval in seconds (default is %d)\n\
215 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
216 -T, --restart-timeout\n\
217 Set the restart (kill) timeout in seconds (default is %d).\n\
218 If any background jobs are still running after this much\n\
219 time has elapsed, they will be killed.\n\
220 -r, --restart Supply a Bourne shell command to use to restart a single\n\
221 daemon. The command string should include '%%s' where the\n\
222 name of the daemon should be substituted.\n\
223 -s, --start-command\n\
224 Supply a Bourne shell to command to use to start a single\n\
225 daemon. The command string should include '%%s' where the\n\
226 name of the daemon should be substituted.\n\
227 -k, --kill-command\n\
228 Supply a Bourne shell to command to use to stop a single\n\
229 daemon. The command string should include '%%s' where the\n\
230 name of the daemon should be substituted.\n\
231 --dry Do not start or restart anything, just log.\n\
232 -p, --pid-file Set process identifier file name\n\
234 -b, --blank-string\n\
235 When the supplied argument string is found in any of the\n\
236 various shell command arguments (-r, -s, or -k), replace\n\
237 it with a space. This is an ugly hack to circumvent problems\n\
238 passing command-line arguments with embedded spaces.\n\
239 -v, --version Print program version\n\
240 -h, --help Display this help and exit\n",
241 frr_vtydir
, DEFAULT_LOGLEVEL
, LOG_EMERG
, LOG_DEBUG
, LOG_DEBUG
,
242 DEFAULT_MIN_RESTART
, DEFAULT_MAX_RESTART
, DEFAULT_PERIOD
,
243 DEFAULT_TIMEOUT
, DEFAULT_RESTART_TIMEOUT
, pidfile_default
);
246 static pid_t
run_background(char *shell_cmd
)
250 switch (child
= fork()) {
252 zlog_ferr(LIB_ERR_SYSTEM_CALL
,
253 "fork failed, cannot run command [%s]: %s", shell_cmd
,
254 safe_strerror(errno
));
258 /* Use separate process group so child processes can be killed
260 if (setpgid(0, 0) < 0)
261 zlog_warn("warning: setpgid(0,0) failed: %s",
262 safe_strerror(errno
));
266 char *const argv
[4] = {shell
, dashc
, shell_cmd
, NULL
};
267 execv("/bin/sh", argv
);
268 zlog_ferr(LIB_ERR_SYSTEM_CALL
,
269 "execv(/bin/sh -c '%s') failed: %s", shell_cmd
,
270 safe_strerror(errno
));
274 /* Parent process: we will reap the child later. */
275 zlog_ferr(LIB_ERR_SYSTEM_CALL
,
276 "Forked background command [pid %d]: %s", (int)child
,
/*
 * Compute the wall-clock time elapsed since *start_time.
 *
 * The current time is read straight into *result and *start_time is then
 * subtracted field by field, so result must not alias start_time.
 * Returns result, which lets callers write time_elapsed(&d, &t)->tv_sec.
 */
static struct timeval *time_elapsed(struct timeval *result,
				    const struct timeval *start_time)
{
	gettimeofday(result, NULL);

	result->tv_sec -= start_time->tv_sec;
	result->tv_usec -= start_time->tv_usec;

	/* Borrow whole seconds until the microsecond field is non-negative. */
	for (; result->tv_usec < 0; result->tv_sec--)
		result->tv_usec += 1000000L;

	return result;
}
295 static int restart_kill(struct thread
*t_kill
)
297 struct restart_info
*restart
= THREAD_ARG(t_kill
);
298 struct timeval delay
;
300 time_elapsed(&delay
, &restart
->time
);
302 "Warning: %s %s child process %d still running after "
303 "%ld seconds, sending signal %d",
304 restart
->what
, restart
->name
, (int)restart
->pid
,
305 (long)delay
.tv_sec
, (restart
->kills
? SIGKILL
: SIGTERM
));
306 kill(-restart
->pid
, (restart
->kills
? SIGKILL
: SIGTERM
));
308 restart
->t_kill
= NULL
;
309 thread_add_timer(master
, restart_kill
, restart
, gs
.restart_timeout
,
314 static struct restart_info
*find_child(pid_t child
)
317 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
318 if (dmn
->restart
.pid
== child
)
319 return &dmn
->restart
;
324 static void sigchild(void)
330 struct restart_info
*restart
;
332 switch (child
= waitpid(-1, &status
, WNOHANG
)) {
334 zlog_ferr(LIB_ERR_SYSTEM_CALL
,
335 "waitpid failed: %s", safe_strerror(errno
));
338 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
342 if (child
== integrated_write_pid
) {
343 integrated_write_sigchld(status
);
347 if ((restart
= find_child(child
)) != NULL
) {
348 name
= restart
->name
;
349 what
= restart
->what
;
352 thread_cancel(restart
->t_kill
);
353 restart
->t_kill
= NULL
;
354 /* Update restart time to reflect the time the command
356 gettimeofday(&restart
->time
, NULL
);
358 zlog_ferr(LIB_ERR_SYSTEM_CALL
,
359 "waitpid returned status for an unknown child process %d",
364 if (WIFSTOPPED(status
))
365 zlog_warn("warning: %s %s process %d is stopped", what
, name
,
367 else if (WIFSIGNALED(status
))
368 zlog_warn("%s %s process %d terminated due to signal %d", what
,
369 name
, (int)child
, WTERMSIG(status
));
370 else if (WIFEXITED(status
)) {
371 if (WEXITSTATUS(status
) != 0)
373 "%s %s process %d exited with non-zero status %d",
374 what
, name
, (int)child
, WEXITSTATUS(status
));
376 zlog_debug("%s %s process %d exited normally", what
,
379 zlog_ferr(LIB_ERR_SYSTEM_CALL
,
380 "cannot interpret %s %s process %d wait status 0x%x",
381 what
, name
, (int)child
, status
);
385 static int run_job(struct restart_info
*restart
, const char *cmdtype
,
386 const char *command
, int force
, int update_interval
)
388 struct timeval delay
;
390 if (gs
.loglevel
> LOG_DEBUG
+ 1)
391 zlog_debug("attempting to %s %s", cmdtype
, restart
->name
);
394 if (gs
.loglevel
> LOG_DEBUG
+ 1)
396 "cannot %s %s, previous pid %d still running",
397 cmdtype
, restart
->name
, (int)restart
->pid
);
401 /* Note: time_elapsed test must come before the force test, since we
403 to make sure that delay is initialized for use below in updating the
405 if ((time_elapsed(&delay
, &restart
->time
)->tv_sec
< restart
->interval
)
407 if (gs
.loglevel
> LOG_DEBUG
+ 1)
410 "elapsed time %ld < retry interval %ld",
411 cmdtype
, restart
->name
, (long)delay
.tv_sec
,
416 gettimeofday(&restart
->time
, NULL
);
419 char cmd
[strlen(command
) + strlen(restart
->name
) + 1];
420 snprintf(cmd
, sizeof(cmd
), command
, restart
->name
);
421 if ((restart
->pid
= run_background(cmd
)) > 0) {
422 restart
->t_kill
= NULL
;
423 thread_add_timer(master
, restart_kill
, restart
,
424 gs
.restart_timeout
, &restart
->t_kill
);
425 restart
->what
= cmdtype
;
431 /* Calculate the new restart interval. */
432 if (update_interval
) {
433 if (delay
.tv_sec
> 2 * gs
.max_restart_interval
)
434 restart
->interval
= gs
.min_restart_interval
;
435 else if ((restart
->interval
*= 2) > gs
.max_restart_interval
)
436 restart
->interval
= gs
.max_restart_interval
;
437 if (gs
.loglevel
> LOG_DEBUG
+ 1)
438 zlog_debug("restart %s interval is now %ld",
439 restart
->name
, restart
->interval
);
444 #define SET_READ_HANDLER(DMN) \
446 (DMN)->t_read = NULL; \
447 thread_add_read(master, handle_read, (DMN), (DMN)->fd, \
451 #define SET_WAKEUP_DOWN(DMN) \
453 (DMN)->t_wakeup = NULL; \
454 thread_add_timer_msec(master, wakeup_down, (DMN), \
455 FUZZY(gs.period), &(DMN)->t_wakeup); \
458 #define SET_WAKEUP_UNRESPONSIVE(DMN) \
460 (DMN)->t_wakeup = NULL; \
461 thread_add_timer_msec(master, wakeup_unresponsive, (DMN), \
462 FUZZY(gs.period), &(DMN)->t_wakeup); \
465 #define SET_WAKEUP_ECHO(DMN) \
467 (DMN)->t_wakeup = NULL; \
468 thread_add_timer_msec(master, wakeup_send_echo, (DMN), \
469 FUZZY(gs.period), &(DMN)->t_wakeup); \
472 static int wakeup_down(struct thread
*t_wakeup
)
474 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
476 dmn
->t_wakeup
= NULL
;
477 if (try_connect(dmn
) < 0)
478 SET_WAKEUP_DOWN(dmn
);
479 if ((dmn
->connect_tries
> 1) && (dmn
->state
!= DAEMON_UP
))
484 static int wakeup_init(struct thread
*t_wakeup
)
486 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
488 dmn
->t_wakeup
= NULL
;
489 if (try_connect(dmn
) < 0) {
490 SET_WAKEUP_DOWN(dmn
);
491 zlog_ferr(WATCHFRR_ERR_CONNECTION
,
492 "%s state -> down : initial connection attempt failed",
494 dmn
->state
= DAEMON_DOWN
;
499 static void daemon_down(struct daemon
*dmn
, const char *why
)
501 if (IS_UP(dmn
) || (dmn
->state
== DAEMON_INIT
))
502 zlog_ferr(WATCHFRR_ERR_CONNECTION
,
503 "%s state -> down : %s", dmn
->name
, why
);
504 else if (gs
.loglevel
> LOG_DEBUG
)
505 zlog_debug("%s still down : %s", dmn
->name
, why
);
508 dmn
->state
= DAEMON_DOWN
;
513 THREAD_OFF(dmn
->t_read
);
514 THREAD_OFF(dmn
->t_write
);
515 THREAD_OFF(dmn
->t_wakeup
);
516 if (try_connect(dmn
) < 0)
517 SET_WAKEUP_DOWN(dmn
);
521 static int handle_read(struct thread
*t_read
)
523 struct daemon
*dmn
= THREAD_ARG(t_read
);
524 static const char resp
[sizeof(PING_TOKEN
) + 4] = PING_TOKEN
"\n";
525 char buf
[sizeof(resp
) + 100];
527 struct timeval delay
;
530 if ((rc
= read(dmn
->fd
, buf
, sizeof(buf
))) < 0) {
533 if (ERRNO_IO_RETRY(errno
)) {
534 /* Pretend it never happened. */
535 SET_READ_HANDLER(dmn
);
538 snprintf(why
, sizeof(why
), "unexpected read error: %s",
539 safe_strerror(errno
));
540 daemon_down(dmn
, why
);
544 daemon_down(dmn
, "read returned EOF");
547 if (!dmn
->echo_sent
.tv_sec
) {
548 char why
[sizeof(buf
) + 100];
549 snprintf(why
, sizeof(why
),
550 "unexpected read returns %d bytes: %.*s", (int)rc
,
552 daemon_down(dmn
, why
);
556 /* We are expecting an echo response: is there any chance that the
557 response would not be returned entirely in the first read? That
558 seems inconceivable... */
559 if ((rc
!= sizeof(resp
)) || memcmp(buf
, resp
, sizeof(resp
))) {
560 char why
[100 + sizeof(buf
)];
561 snprintf(why
, sizeof(why
),
562 "read returned bad echo response of %d bytes "
563 "(expecting %u): %.*s",
564 (int)rc
, (unsigned int)sizeof(resp
), (int)rc
, buf
);
565 daemon_down(dmn
, why
);
569 time_elapsed(&delay
, &dmn
->echo_sent
);
570 dmn
->echo_sent
.tv_sec
= 0;
571 if (dmn
->state
== DAEMON_UNRESPONSIVE
) {
572 if (delay
.tv_sec
< gs
.timeout
) {
573 dmn
->state
= DAEMON_UP
;
575 "%s state -> up : echo response received after %ld.%06ld "
577 dmn
->name
, (long)delay
.tv_sec
,
578 (long)delay
.tv_usec
);
581 "%s: slow echo response finally received after %ld.%06ld "
583 dmn
->name
, (long)delay
.tv_sec
,
584 (long)delay
.tv_usec
);
585 } else if (gs
.loglevel
> LOG_DEBUG
+ 1)
586 zlog_debug("%s: echo response received after %ld.%06ld seconds",
587 dmn
->name
, (long)delay
.tv_sec
, (long)delay
.tv_usec
);
589 SET_READ_HANDLER(dmn
);
591 thread_cancel(dmn
->t_wakeup
);
592 SET_WAKEUP_ECHO(dmn
);
598 * Wait till we notice that all daemons are ready before
599 * we send we are ready to systemd
601 static void daemon_send_ready(void)
604 if (!sent
&& gs
.numdown
== 0) {
607 fp
= fopen(DAEMON_VTY_DIR
"/watchfrr.started", "w");
610 #if defined HAVE_SYSTEMD
612 "Watchfrr: Notifying Systemd we are up and running");
613 systemd_send_started(master
, 0);
619 static void daemon_up(struct daemon
*dmn
, const char *why
)
621 dmn
->state
= DAEMON_UP
;
623 dmn
->connect_tries
= 0;
624 zlog_notice("%s state -> up : %s", dmn
->name
, why
);
626 SET_WAKEUP_ECHO(dmn
);
630 static int check_connect(struct thread
*t_write
)
632 struct daemon
*dmn
= THREAD_ARG(t_write
);
634 socklen_t reslen
= sizeof(sockerr
);
637 if (getsockopt(dmn
->fd
, SOL_SOCKET
, SO_ERROR
, (char *)&sockerr
, &reslen
)
639 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn
->name
,
640 safe_strerror(errno
));
642 "getsockopt failed checking connection success");
645 if ((reslen
== sizeof(sockerr
)) && sockerr
) {
649 "getsockopt reports that connection attempt failed: %s",
650 safe_strerror(sockerr
));
651 daemon_down(dmn
, why
);
655 daemon_up(dmn
, "delayed connect succeeded");
659 static int wakeup_connect_hanging(struct thread
*t_wakeup
)
661 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
664 dmn
->t_wakeup
= NULL
;
665 snprintf(why
, sizeof(why
),
666 "connection attempt timed out after %ld seconds", gs
.timeout
);
667 daemon_down(dmn
, why
);
671 /* Making connection to protocol daemon. */
672 static int try_connect(struct daemon
*dmn
)
675 struct sockaddr_un addr
;
678 if (gs
.loglevel
> LOG_DEBUG
+ 1)
679 zlog_debug("%s: attempting to connect", dmn
->name
);
680 dmn
->connect_tries
++;
682 memset(&addr
, 0, sizeof(struct sockaddr_un
));
683 addr
.sun_family
= AF_UNIX
;
684 snprintf(addr
.sun_path
, sizeof(addr
.sun_path
), "%s/%s.vty", gs
.vtydir
,
686 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
687 len
= addr
.sun_len
= SUN_LEN(&addr
);
689 len
= sizeof(addr
.sun_family
) + strlen(addr
.sun_path
);
690 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
692 /* Quick check to see if we might succeed before we go to the trouble
693 of creating a socket. */
694 if (access(addr
.sun_path
, W_OK
) < 0) {
696 zlog_ferr(LIB_ERR_SYSTEM_CALL
,
697 "%s: access to socket %s denied: %s",
698 dmn
->name
, addr
.sun_path
,
699 safe_strerror(errno
));
703 if ((sock
= socket(AF_UNIX
, SOCK_STREAM
, 0)) < 0) {
704 zlog_ferr(LIB_ERR_SOCKET
,
705 "%s(%s): cannot make socket: %s", __func__
,
706 addr
.sun_path
, safe_strerror(errno
));
710 if (set_nonblocking(sock
) < 0 || set_cloexec(sock
) < 0) {
711 zlog_ferr(LIB_ERR_SYSTEM_CALL
,
712 "%s(%s): set_nonblocking/cloexec(%d) failed",
713 __func__
, addr
.sun_path
, sock
);
718 if (connect(sock
, (struct sockaddr
*)&addr
, len
) < 0) {
719 if ((errno
!= EINPROGRESS
) && (errno
!= EWOULDBLOCK
)) {
720 if (gs
.loglevel
> LOG_DEBUG
)
721 zlog_debug("%s(%s): connect failed: %s",
722 __func__
, addr
.sun_path
,
723 safe_strerror(errno
));
727 if (gs
.loglevel
> LOG_DEBUG
)
728 zlog_debug("%s: connection in progress", dmn
->name
);
729 dmn
->state
= DAEMON_CONNECTING
;
732 thread_add_write(master
, check_connect
, dmn
, dmn
->fd
,
734 dmn
->t_wakeup
= NULL
;
735 thread_add_timer(master
, wakeup_connect_hanging
, dmn
,
736 gs
.timeout
, &dmn
->t_wakeup
);
737 SET_READ_HANDLER(dmn
);
742 SET_READ_HANDLER(dmn
);
743 daemon_up(dmn
, "connect succeeded");
747 static int phase_hanging(struct thread
*t_hanging
)
749 gs
.t_phase_hanging
= NULL
;
750 zlog_ferr(WATCHFRR_ERR_CONNECTION
,
751 "Phase [%s] hanging for %ld seconds, aborting phased restart",
752 phase_str
[gs
.phase
], PHASE_TIMEOUT
);
753 gs
.phase
= PHASE_NONE
;
757 static void set_phase(restart_phase_t new_phase
)
759 gs
.phase
= new_phase
;
760 if (gs
.t_phase_hanging
)
761 thread_cancel(gs
.t_phase_hanging
);
762 gs
.t_phase_hanging
= NULL
;
763 thread_add_timer(master
, phase_hanging
, NULL
, PHASE_TIMEOUT
,
764 &gs
.t_phase_hanging
);
767 static void phase_check(void)
772 case PHASE_STOPS_PENDING
:
776 "Phased restart: all routing daemon stop jobs have completed.");
777 set_phase(PHASE_WAITING_DOWN
);
780 case PHASE_WAITING_DOWN
:
781 if (gs
.numdown
+ IS_UP(gs
.special
) < gs
.numdaemons
)
783 zlog_info("Phased restart: all routing daemons now down.");
784 run_job(&gs
.special
->restart
, "restart", gs
.restart_command
, 1,
786 set_phase(PHASE_ZEBRA_RESTART_PENDING
);
789 case PHASE_ZEBRA_RESTART_PENDING
:
790 if (gs
.special
->restart
.pid
)
792 zlog_info("Phased restart: %s restart job completed.",
794 set_phase(PHASE_WAITING_ZEBRA_UP
);
797 case PHASE_WAITING_ZEBRA_UP
:
798 if (!IS_UP(gs
.special
))
800 zlog_info("Phased restart: %s is now up.", gs
.special
->name
);
803 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
804 if (dmn
!= gs
.special
)
805 run_job(&dmn
->restart
, "start",
806 gs
.start_command
, 1, 0);
809 gs
.phase
= PHASE_NONE
;
810 THREAD_OFF(gs
.t_phase_hanging
);
811 zlog_notice("Phased global restart has completed.");
816 static void try_restart(struct daemon
*dmn
)
821 if (dmn
!= gs
.special
) {
822 if ((gs
.special
->state
== DAEMON_UP
)
823 && (gs
.phase
== PHASE_NONE
))
824 run_job(&dmn
->restart
, "restart", gs
.restart_command
, 0,
828 "%s: postponing restart attempt because master %s daemon "
829 "not up [%s], or phased restart in progress",
830 dmn
->name
, gs
.special
->name
,
831 state_str
[gs
.special
->state
]);
835 if ((gs
.phase
!= PHASE_NONE
) || gs
.numpids
) {
836 if (gs
.loglevel
> LOG_DEBUG
+ 1)
838 "postponing phased global restart: restart already in "
839 "progress [%s], or outstanding child processes [%d]",
840 phase_str
[gs
.phase
], gs
.numpids
);
843 /* Is it too soon for a restart? */
845 struct timeval delay
;
846 if (time_elapsed(&delay
, &gs
.special
->restart
.time
)->tv_sec
847 < gs
.special
->restart
.interval
) {
848 if (gs
.loglevel
> LOG_DEBUG
+ 1)
850 "postponing phased global restart: "
851 "elapsed time %ld < retry interval %ld",
853 gs
.special
->restart
.interval
);
857 run_job(&gs
.restart
, "restart", gs
.restart_command
, 0, 1);
860 static int wakeup_unresponsive(struct thread
*t_wakeup
)
862 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
864 dmn
->t_wakeup
= NULL
;
865 if (dmn
->state
!= DAEMON_UNRESPONSIVE
)
866 zlog_ferr(WATCHFRR_ERR_CONNECTION
,
867 "%s: no longer unresponsive (now %s), "
868 "wakeup should have been cancelled!",
869 dmn
->name
, state_str
[dmn
->state
]);
871 SET_WAKEUP_UNRESPONSIVE(dmn
);
877 static int wakeup_no_answer(struct thread
*t_wakeup
)
879 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
881 dmn
->t_wakeup
= NULL
;
882 dmn
->state
= DAEMON_UNRESPONSIVE
;
883 zlog_ferr(WATCHFRR_ERR_CONNECTION
,
884 "%s state -> unresponsive : no response yet to ping "
885 "sent %ld seconds ago",
886 dmn
->name
, gs
.timeout
);
887 SET_WAKEUP_UNRESPONSIVE(dmn
);
892 static int wakeup_send_echo(struct thread
*t_wakeup
)
894 static const char echocmd
[] = "echo " PING_TOKEN
;
896 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
898 dmn
->t_wakeup
= NULL
;
899 if (((rc
= write(dmn
->fd
, echocmd
, sizeof(echocmd
))) < 0)
900 || ((size_t)rc
!= sizeof(echocmd
))) {
901 char why
[100 + sizeof(echocmd
)];
902 snprintf(why
, sizeof(why
),
903 "write '%s' returned %d instead of %u", echocmd
,
904 (int)rc
, (unsigned int)sizeof(echocmd
));
905 daemon_down(dmn
, why
);
907 gettimeofday(&dmn
->echo_sent
, NULL
);
908 dmn
->t_wakeup
= NULL
;
909 thread_add_timer(master
, wakeup_no_answer
, dmn
, gs
.timeout
,
915 bool check_all_up(void)
919 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
920 if (dmn
->state
!= DAEMON_UP
)
925 static void sigint(void)
927 zlog_notice("Terminating on signal");
928 systemd_send_stopping();
/*
 * Check that a shell command template is safe to use as a printf-style
 * format with a single string argument: the first '%' must be followed
 * by 's' and no further '%' may appear after it.
 *
 * Returns 1 when the template is acceptable, 0 otherwise.
 */
static int valid_command(const char *cmd)
{
	const char *percent = strchr(cmd, '%');

	if (percent == NULL)
		return 0;
	if (percent[1] != 's')
		return 0;
	return strchr(percent + 1, '%') == NULL;
}
/*
 * This is an ugly hack to circumvent problems with passing command-line
 * arguments that contain spaces.  The fix is to use a configuration file.
 *
 * Returns a newly allocated copy of cmd in which every occurrence of
 * blankstr has been replaced by a single space.  The caller owns the
 * returned string.  On allocation failure the process exits with status 1.
 *
 * An empty blankstr, or a blankstr that is itself a single space, can
 * never make progress (the replacement would match again forever, and an
 * empty match would grow the string past its allocation), so in those
 * cases the copy is returned unchanged.
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	size_t bslen = strlen(blankstr);
	size_t cmdlen = strlen(cmd);
	char *res;
	char *p;

	res = malloc(cmdlen + 1);
	if (!res) {
		perror("malloc");
		exit(1);
	}
	memcpy(res, cmd, cmdlen + 1);

	if (bslen == 0 || strcmp(blankstr, " ") == 0)
		return res;

	/*
	 * Each replacement either shortens the string (bslen > 1) or removes
	 * one occurrence of a non-space character (bslen == 1), so the loop
	 * always terminates.
	 */
	while ((p = strstr(res, blankstr)) != NULL) {
		*p = ' ';
		if (bslen != 1)
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
	}
	return res;
}
960 struct zebra_privs_t watchfrr_privs
= {
962 .vty_group
= VTY_GROUP
,
966 static struct quagga_signal_t watchfrr_signals
[] = {
981 FRR_DAEMON_INFO(watchfrr
, WATCHFRR
,
982 .flags
= FRR_NO_PRIVSEP
| FRR_NO_TCPVTY
| FRR_LIMITED_CLI
983 | FRR_NO_CFG_PID_DRY
| FRR_NO_ZCLIENT
,
985 .printhelp
= printhelp
,
986 .copyright
= "Copyright 2004 Andrew J. Schorr",
988 .signals
= watchfrr_signals
,
989 .n_signals
= array_size(watchfrr_signals
),
991 .privs
= &watchfrr_privs
, )
993 #define DEPRECATED_OPTIONS "aAezR:"
995 int main(int argc
, char **argv
)
998 const char *pidfile
= pidfile_default
;
999 const char *special
= "zebra";
1000 const char *blankstr
= NULL
;
1002 snprintf(pidfile_default
, sizeof(pidfile_default
), "%s/watchfrr.pid",
1005 frr_preinit(&watchfrr_di
, argc
, argv
);
1006 progname
= watchfrr_di
.progname
;
1008 frr_opt_add("b:dk:l:i:p:r:S:s:t:T:" DEPRECATED_OPTIONS
, longopts
, "");
1010 gs
.restart
.name
= "all";
1011 while ((opt
= frr_getopt(argc
, argv
, NULL
)) != EOF
) {
1012 if (opt
&& opt
< 128 && strchr(DEPRECATED_OPTIONS
, opt
)) {
1014 "The -%c option no longer exists.\n"
1015 "Please refer to the watchfrr(8) man page.\n",
1030 if (!valid_command(optarg
)) {
1032 "Invalid kill command, must contain '%%s': %s\n",
1036 gs
.stop_command
= optarg
;
1040 if ((sscanf(optarg
, "%d%1s", &gs
.loglevel
, garbage
)
1042 || (gs
.loglevel
< LOG_EMERG
)) {
1044 "Invalid loglevel argument: %s\n",
1049 case OPTION_MINRESTART
: {
1051 if ((sscanf(optarg
, "%ld%1s", &gs
.min_restart_interval
,
1054 || (gs
.min_restart_interval
< 0)) {
1056 "Invalid min_restart_interval argument: %s\n",
1061 case OPTION_MAXRESTART
: {
1063 if ((sscanf(optarg
, "%ld%1s", &gs
.max_restart_interval
,
1066 || (gs
.max_restart_interval
< 0)) {
1068 "Invalid max_restart_interval argument: %s\n",
1076 if ((sscanf(optarg
, "%d%1s", &period
, garbage
) != 1)
1077 || (gs
.period
< 1)) {
1079 "Invalid interval argument: %s\n",
1083 gs
.period
= 1000 * period
;
1089 if (!valid_command(optarg
)) {
1091 "Invalid restart command, must contain '%%s': %s\n",
1095 gs
.restart_command
= optarg
;
1098 if (!valid_command(optarg
)) {
1100 "Invalid start command, must contain '%%s': %s\n",
1104 gs
.start_command
= optarg
;
1111 if ((sscanf(optarg
, "%ld%1s", &gs
.timeout
, garbage
)
1113 || (gs
.timeout
< 1)) {
1115 "Invalid timeout argument: %s\n",
1122 if ((sscanf(optarg
, "%ld%1s", &gs
.restart_timeout
,
1125 || (gs
.restart_timeout
< 1)) {
1127 "Invalid restart timeout argument: %s\n",
1133 fputs("Invalid option.\n", stderr
);
1139 && (gs
.start_command
|| gs
.stop_command
|| gs
.restart_command
)) {
1140 fputs("Options -r/-s/-k are not used when --dry is active.\n",
1144 && (!gs
.restart_command
|| !gs
.start_command
|| !gs
.stop_command
)) {
1146 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1151 if (gs
.restart_command
)
1152 gs
.restart_command
=
1153 translate_blanks(gs
.restart_command
, blankstr
);
1154 if (gs
.start_command
)
1156 translate_blanks(gs
.start_command
, blankstr
);
1157 if (gs
.stop_command
)
1159 translate_blanks(gs
.stop_command
, blankstr
);
1162 gs
.restart
.interval
= gs
.min_restart_interval
;
1164 master
= frr_init();
1165 watchfrr_error_init();
1167 zlog_set_level(ZLOG_DEST_MONITOR
, ZLOG_DISABLED
);
1168 if (watchfrr_di
.daemon_mode
) {
1169 zlog_set_level(ZLOG_DEST_SYSLOG
, MIN(gs
.loglevel
, LOG_DEBUG
));
1170 if (daemon(0, 0) < 0) {
1171 fprintf(stderr
, "Watchfrr daemon failed: %s",
1176 zlog_set_level(ZLOG_DEST_STDOUT
, MIN(gs
.loglevel
, LOG_DEBUG
));
1178 watchfrr_vty_init();
1184 struct daemon
*tail
= NULL
;
1186 for (i
= optind
; i
< argc
; i
++) {
1189 if (!(dmn
= (struct daemon
*)calloc(1, sizeof(*dmn
)))) {
1190 fprintf(stderr
, "calloc(1,%u) failed: %s\n",
1191 (unsigned int)sizeof(*dmn
),
1192 safe_strerror(errno
));
1195 dmn
->name
= dmn
->restart
.name
= argv
[i
];
1196 dmn
->state
= DAEMON_INIT
;
1200 dmn
->t_wakeup
= NULL
;
1201 thread_add_timer_msec(master
, wakeup_init
, dmn
,
1202 100 + (random() % 900),
1204 dmn
->restart
.interval
= gs
.min_restart_interval
;
1211 if (!strcmp(dmn
->name
, special
))
1216 fputs("Must specify one or more daemons to monitor.\n", stderr
);
1219 if (!watch_only
&& !gs
.special
) {
1220 fprintf(stderr
, "\"%s\" daemon must be in daemon list\n",
1225 /* Make sure we're not already running. */
1226 pid_output(pidfile
);
1228 /* Announce which daemons are being monitored. */
1233 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
1234 len
+= strlen(dmn
->name
) + 1;
1240 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
1243 strcpy(p
, dmn
->name
);
1246 zlog_notice("%s %s watching [%s]%s", progname
,
1248 watch_only
? ", monitor mode" : "");
1253 struct thread thread
;
1255 while (thread_fetch(master
, &thread
))
1256 thread_call(&thread
);
1259 systemd_send_stopping();