2 * Monitor status of frr daemons and restart if necessary.
4 * Copyright (C) 2004 Andrew J. Schorr
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
26 #include <lib/version.h>
28 #include "memory_vty.h"
30 #include "lib_errors.h"
39 #include "watchfrr_errors.h"
42 #define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
45 /* Macros to help randomize timers. */
46 #define JITTER(X) ((random() % ((X)+1))-((X)/2))
47 #define FUZZY(X) ((X)+JITTER((X)/20))
49 #define DEFAULT_PERIOD 5
50 #define DEFAULT_TIMEOUT 90
51 #define DEFAULT_RESTART_TIMEOUT 20
52 #define DEFAULT_LOGLEVEL LOG_INFO
53 #define DEFAULT_MIN_RESTART 60
54 #define DEFAULT_MAX_RESTART 600
56 #define PING_TOKEN "PING"
58 DEFINE_MGROUP(WATCHFRR
, "watchfrr")
59 DEFINE_MTYPE_STATIC(WATCHFRR
, WATCHFRR_DAEMON
, "watchfrr daemon entry")
61 /* Needs to be global, referenced somewhere inside libfrr. */
62 struct thread_master
*master
;
64 static bool watch_only
= false;
70 PHASE_ZEBRA_RESTART_PENDING
,
71 PHASE_WAITING_ZEBRA_UP
74 static const char *phase_str
[] = {
77 "Waiting for other daemons to come down",
78 "Zebra restart job running",
79 "Waiting for zebra to come up",
83 #define PHASE_TIMEOUT (3*gs.restart_timeout)
91 struct thread
*t_kill
;
95 static struct global_state
{
96 restart_phase_t phase
;
97 struct thread
*t_phase_hanging
;
101 long restart_timeout
;
102 long min_restart_interval
;
103 long max_restart_interval
;
104 struct daemon
*daemons
;
105 const char *restart_command
;
106 const char *start_command
;
107 const char *stop_command
;
108 struct restart_info restart
;
110 struct daemon
*special
; /* points to zebra when doing phased restart */
113 int numdown
; /* # of daemons that are not UP or UNRESPONSIVE */
116 .vtydir
= frr_vtydir
,
117 .period
= 1000 * DEFAULT_PERIOD
,
118 .timeout
= DEFAULT_TIMEOUT
,
119 .restart_timeout
= DEFAULT_RESTART_TIMEOUT
,
120 .loglevel
= DEFAULT_LOGLEVEL
,
121 .min_restart_interval
= DEFAULT_MIN_RESTART
,
122 .max_restart_interval
= DEFAULT_MAX_RESTART
,
134 (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
136 static const char *state_str
[] = {
137 "Init", "Down", "Connecting", "Up", "Unresponsive",
142 daemon_state_t state
;
144 struct timeval echo_sent
;
145 unsigned int connect_tries
;
146 struct thread
*t_wakeup
;
147 struct thread
*t_read
;
148 struct thread
*t_write
;
150 struct restart_info restart
;
153 #define OPTION_MINRESTART 2000
154 #define OPTION_MAXRESTART 2001
155 #define OPTION_DRY 2002
157 static const struct option longopts
[] = {
158 {"daemon", no_argument
, NULL
, 'd'},
159 {"statedir", required_argument
, NULL
, 'S'},
160 {"loglevel", required_argument
, NULL
, 'l'},
161 {"interval", required_argument
, NULL
, 'i'},
162 {"timeout", required_argument
, NULL
, 't'},
163 {"restart-timeout", required_argument
, NULL
, 'T'},
164 {"restart", required_argument
, NULL
, 'r'},
165 {"start-command", required_argument
, NULL
, 's'},
166 {"kill-command", required_argument
, NULL
, 'k'},
167 {"dry", no_argument
, NULL
, OPTION_DRY
},
168 {"min-restart-interval", required_argument
, NULL
, OPTION_MINRESTART
},
169 {"max-restart-interval", required_argument
, NULL
, OPTION_MAXRESTART
},
170 {"pid-file", required_argument
, NULL
, 'p'},
171 {"blank-string", required_argument
, NULL
, 'b'},
172 {"help", no_argument
, NULL
, 'h'},
173 {"version", no_argument
, NULL
, 'v'},
176 static int try_connect(struct daemon
*dmn
);
177 static int wakeup_send_echo(struct thread
*t_wakeup
);
178 static void try_restart(struct daemon
*dmn
);
179 static void phase_check(void);
181 static const char *progname
;
182 static void printhelp(FILE *target
)
185 "Usage : %s [OPTION...] <daemon name> ...\n\n\
186 Watchdog program to monitor status of frr daemons and try to restart\n\
187 them if they are down or unresponsive. It determines whether a daemon is\n\
188 up based on whether it can connect to the daemon's vty unix stream socket.\n\
189 It then repeatedly sends echo commands over that socket to determine whether\n\
190 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
191 on the socket connection and know immediately that the daemon is down.\n\n\
192 The daemons to be monitored should be listed on the command line.\n\n\
193 In order to avoid attempting to restart the daemons in a fast loop,\n\
194 the -m and -M options allow you to control the minimum delay between\n\
195 restart commands. The minimum restart delay is recalculated each time\n\
196 a restart is attempted: if the time since the last restart attempt exceeds\n\
197 twice the -M value, then the restart delay is set to the -m value.\n\
198 Otherwise, the interval is doubled (but capped at the -M value).\n\n",
203 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
204 to syslog instead of stdout.\n\
205 -S, --statedir Set the vty socket directory (default is %s)\n\
206 -l, --loglevel Set the logging level (default is %d).\n\
207 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
208 but it can be set higher than %d if extra-verbose debugging\n\
209 messages are desired.\n\
210 --min-restart-interval\n\
211 Set the minimum seconds to wait between invocations of daemon\n\
212 restart commands (default is %d).\n\
213 --max-restart-interval\n\
214 Set the maximum seconds to wait between invocations of daemon\n\
215 restart commands (default is %d).\n\
216 -i, --interval Set the status polling interval in seconds (default is %d)\n\
217 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
218 -T, --restart-timeout\n\
219 Set the restart (kill) timeout in seconds (default is %d).\n\
220 If any background jobs are still running after this much\n\
221 time has elapsed, they will be killed.\n\
222 -r, --restart Supply a Bourne shell command to use to restart a single\n\
223 daemon. The command string should include '%%s' where the\n\
224 name of the daemon should be substituted.\n\
225 -s, --start-command\n\
226 Supply a Bourne shell to command to use to start a single\n\
227 daemon. The command string should include '%%s' where the\n\
228 name of the daemon should be substituted.\n\
229 -k, --kill-command\n\
230 Supply a Bourne shell to command to use to stop a single\n\
231 daemon. The command string should include '%%s' where the\n\
232 name of the daemon should be substituted.\n\
233 --dry Do not start or restart anything, just log.\n\
234 -p, --pid-file Set process identifier file name\n\
235 (default is %s/watchfrr.pid).\n\
236 -b, --blank-string\n\
237 When the supplied argument string is found in any of the\n\
238 various shell command arguments (-r, -s, or -k), replace\n\
239 it with a space. This is an ugly hack to circumvent problems\n\
240 passing command-line arguments with embedded spaces.\n\
241 -v, --version Print program version\n\
242 -h, --help Display this help and exit\n",
243 frr_vtydir
, DEFAULT_LOGLEVEL
, LOG_EMERG
, LOG_DEBUG
, LOG_DEBUG
,
244 DEFAULT_MIN_RESTART
, DEFAULT_MAX_RESTART
, DEFAULT_PERIOD
,
245 DEFAULT_TIMEOUT
, DEFAULT_RESTART_TIMEOUT
, frr_vtydir
);
248 static pid_t
run_background(char *shell_cmd
)
252 switch (child
= fork()) {
254 flog_err_sys(EC_LIB_SYSTEM_CALL
,
255 "fork failed, cannot run command [%s]: %s",
256 shell_cmd
, safe_strerror(errno
));
260 /* Use separate process group so child processes can be killed
262 if (setpgid(0, 0) < 0)
263 zlog_warn("warning: setpgid(0,0) failed: %s",
264 safe_strerror(errno
));
268 char *const argv
[4] = {shell
, dashc
, shell_cmd
, NULL
};
269 execv("/bin/sh", argv
);
270 flog_err_sys(EC_LIB_SYSTEM_CALL
,
271 "execv(/bin/sh -c '%s') failed: %s",
272 shell_cmd
, safe_strerror(errno
));
276 /* Parent process: we will reap the child later. */
277 flog_err_sys(EC_LIB_SYSTEM_CALL
,
278 "Forked background command [pid %d]: %s",
279 (int)child
, shell_cmd
);
284 static struct timeval
*time_elapsed(struct timeval
*result
,
285 const struct timeval
*start_time
)
287 gettimeofday(result
, NULL
);
288 result
->tv_sec
-= start_time
->tv_sec
;
289 result
->tv_usec
-= start_time
->tv_usec
;
290 while (result
->tv_usec
< 0) {
291 result
->tv_usec
+= 1000000L;
297 static int restart_kill(struct thread
*t_kill
)
299 struct restart_info
*restart
= THREAD_ARG(t_kill
);
300 struct timeval delay
;
302 time_elapsed(&delay
, &restart
->time
);
304 "Warning: %s %s child process %d still running after "
305 "%ld seconds, sending signal %d",
306 restart
->what
, restart
->name
, (int)restart
->pid
,
307 (long)delay
.tv_sec
, (restart
->kills
? SIGKILL
: SIGTERM
));
308 kill(-restart
->pid
, (restart
->kills
? SIGKILL
: SIGTERM
));
310 restart
->t_kill
= NULL
;
311 thread_add_timer(master
, restart_kill
, restart
, gs
.restart_timeout
,
316 static struct restart_info
*find_child(pid_t child
)
319 if (gs
.restart
.pid
== child
)
322 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
323 if (dmn
->restart
.pid
== child
)
324 return &dmn
->restart
;
329 static void sigchild(void)
335 struct restart_info
*restart
;
337 switch (child
= waitpid(-1, &status
, WNOHANG
)) {
339 flog_err_sys(EC_LIB_SYSTEM_CALL
, "waitpid failed: %s",
340 safe_strerror(errno
));
343 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
347 if (child
== integrated_write_pid
) {
348 integrated_write_sigchld(status
);
352 if ((restart
= find_child(child
)) != NULL
) {
353 name
= restart
->name
;
354 what
= restart
->what
;
357 thread_cancel(restart
->t_kill
);
358 restart
->t_kill
= NULL
;
359 /* Update restart time to reflect the time the command
361 gettimeofday(&restart
->time
, NULL
);
365 "waitpid returned status for an unknown child process %d",
370 if (WIFSTOPPED(status
))
371 zlog_warn("warning: %s %s process %d is stopped", what
, name
,
373 else if (WIFSIGNALED(status
))
374 zlog_warn("%s %s process %d terminated due to signal %d", what
,
375 name
, (int)child
, WTERMSIG(status
));
376 else if (WIFEXITED(status
)) {
377 if (WEXITSTATUS(status
) != 0)
379 "%s %s process %d exited with non-zero status %d",
380 what
, name
, (int)child
, WEXITSTATUS(status
));
382 zlog_debug("%s %s process %d exited normally", what
,
387 "cannot interpret %s %s process %d wait status 0x%x",
388 what
, name
, (int)child
, status
);
392 static int run_job(struct restart_info
*restart
, const char *cmdtype
,
393 const char *command
, int force
, int update_interval
)
395 struct timeval delay
;
397 if (gs
.loglevel
> LOG_DEBUG
+ 1)
398 zlog_debug("attempting to %s %s", cmdtype
, restart
->name
);
401 if (gs
.loglevel
> LOG_DEBUG
+ 1)
403 "cannot %s %s, previous pid %d still running",
404 cmdtype
, restart
->name
, (int)restart
->pid
);
408 /* Note: time_elapsed test must come before the force test, since we
410 to make sure that delay is initialized for use below in updating the
412 if ((time_elapsed(&delay
, &restart
->time
)->tv_sec
< restart
->interval
)
414 if (gs
.loglevel
> LOG_DEBUG
+ 1)
417 "elapsed time %ld < retry interval %ld",
418 cmdtype
, restart
->name
, (long)delay
.tv_sec
,
423 gettimeofday(&restart
->time
, NULL
);
426 char cmd
[strlen(command
) + strlen(restart
->name
) + 1];
427 snprintf(cmd
, sizeof(cmd
), command
, restart
->name
);
428 if ((restart
->pid
= run_background(cmd
)) > 0) {
429 restart
->t_kill
= NULL
;
430 thread_add_timer(master
, restart_kill
, restart
,
431 gs
.restart_timeout
, &restart
->t_kill
);
432 restart
->what
= cmdtype
;
438 /* Calculate the new restart interval. */
439 if (update_interval
) {
440 if (delay
.tv_sec
> 2 * gs
.max_restart_interval
)
441 restart
->interval
= gs
.min_restart_interval
;
442 else if ((restart
->interval
*= 2) > gs
.max_restart_interval
)
443 restart
->interval
= gs
.max_restart_interval
;
444 if (gs
.loglevel
> LOG_DEBUG
+ 1)
445 zlog_debug("restart %s interval is now %ld",
446 restart
->name
, restart
->interval
);
451 #define SET_READ_HANDLER(DMN) \
453 (DMN)->t_read = NULL; \
454 thread_add_read(master, handle_read, (DMN), (DMN)->fd, \
458 #define SET_WAKEUP_DOWN(DMN) \
460 (DMN)->t_wakeup = NULL; \
461 thread_add_timer_msec(master, wakeup_down, (DMN), \
462 FUZZY(gs.period), &(DMN)->t_wakeup); \
465 #define SET_WAKEUP_UNRESPONSIVE(DMN) \
467 (DMN)->t_wakeup = NULL; \
468 thread_add_timer_msec(master, wakeup_unresponsive, (DMN), \
469 FUZZY(gs.period), &(DMN)->t_wakeup); \
472 #define SET_WAKEUP_ECHO(DMN) \
474 (DMN)->t_wakeup = NULL; \
475 thread_add_timer_msec(master, wakeup_send_echo, (DMN), \
476 FUZZY(gs.period), &(DMN)->t_wakeup); \
479 static int wakeup_down(struct thread
*t_wakeup
)
481 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
483 dmn
->t_wakeup
= NULL
;
484 if (try_connect(dmn
) < 0)
485 SET_WAKEUP_DOWN(dmn
);
486 if ((dmn
->connect_tries
> 1) && (dmn
->state
!= DAEMON_UP
))
491 static int wakeup_init(struct thread
*t_wakeup
)
493 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
495 dmn
->t_wakeup
= NULL
;
496 if (try_connect(dmn
) < 0) {
497 SET_WAKEUP_DOWN(dmn
);
498 flog_err(EC_WATCHFRR_CONNECTION
,
499 "%s state -> down : initial connection attempt failed",
501 dmn
->state
= DAEMON_DOWN
;
506 static void daemon_down(struct daemon
*dmn
, const char *why
)
508 if (IS_UP(dmn
) || (dmn
->state
== DAEMON_INIT
))
509 flog_err(EC_WATCHFRR_CONNECTION
, "%s state -> down : %s",
511 else if (gs
.loglevel
> LOG_DEBUG
)
512 zlog_debug("%s still down : %s", dmn
->name
, why
);
515 dmn
->state
= DAEMON_DOWN
;
520 THREAD_OFF(dmn
->t_read
);
521 THREAD_OFF(dmn
->t_write
);
522 THREAD_OFF(dmn
->t_wakeup
);
523 if (try_connect(dmn
) < 0)
524 SET_WAKEUP_DOWN(dmn
);
528 static int handle_read(struct thread
*t_read
)
530 struct daemon
*dmn
= THREAD_ARG(t_read
);
531 static const char resp
[sizeof(PING_TOKEN
) + 4] = PING_TOKEN
"\n";
532 char buf
[sizeof(resp
) + 100];
534 struct timeval delay
;
537 if ((rc
= read(dmn
->fd
, buf
, sizeof(buf
))) < 0) {
540 if (ERRNO_IO_RETRY(errno
)) {
541 /* Pretend it never happened. */
542 SET_READ_HANDLER(dmn
);
545 snprintf(why
, sizeof(why
), "unexpected read error: %s",
546 safe_strerror(errno
));
547 daemon_down(dmn
, why
);
551 daemon_down(dmn
, "read returned EOF");
554 if (!dmn
->echo_sent
.tv_sec
) {
555 char why
[sizeof(buf
) + 100];
556 snprintf(why
, sizeof(why
),
557 "unexpected read returns %d bytes: %.*s", (int)rc
,
559 daemon_down(dmn
, why
);
563 /* We are expecting an echo response: is there any chance that the
564 response would not be returned entirely in the first read? That
565 seems inconceivable... */
566 if ((rc
!= sizeof(resp
)) || memcmp(buf
, resp
, sizeof(resp
))) {
567 char why
[100 + sizeof(buf
)];
568 snprintf(why
, sizeof(why
),
569 "read returned bad echo response of %d bytes "
570 "(expecting %u): %.*s",
571 (int)rc
, (unsigned int)sizeof(resp
), (int)rc
, buf
);
572 daemon_down(dmn
, why
);
576 time_elapsed(&delay
, &dmn
->echo_sent
);
577 dmn
->echo_sent
.tv_sec
= 0;
578 if (dmn
->state
== DAEMON_UNRESPONSIVE
) {
579 if (delay
.tv_sec
< gs
.timeout
) {
580 dmn
->state
= DAEMON_UP
;
582 "%s state -> up : echo response received after %ld.%06ld "
584 dmn
->name
, (long)delay
.tv_sec
,
585 (long)delay
.tv_usec
);
588 "%s: slow echo response finally received after %ld.%06ld "
590 dmn
->name
, (long)delay
.tv_sec
,
591 (long)delay
.tv_usec
);
592 } else if (gs
.loglevel
> LOG_DEBUG
+ 1)
593 zlog_debug("%s: echo response received after %ld.%06ld seconds",
594 dmn
->name
, (long)delay
.tv_sec
, (long)delay
.tv_usec
);
596 SET_READ_HANDLER(dmn
);
598 thread_cancel(dmn
->t_wakeup
);
599 SET_WAKEUP_ECHO(dmn
);
605 * Wait till we notice that all daemons are ready before
606 * we send we are ready to systemd
608 static void daemon_send_ready(void)
611 if (!sent
&& gs
.numdown
== 0) {
614 zlog_notice("all daemons up, doing startup-complete notify");
617 fp
= fopen(DAEMON_VTY_DIR
"/watchfrr.started", "w");
620 #if defined HAVE_SYSTEMD
621 systemd_send_started(master
, 0);
627 static void daemon_up(struct daemon
*dmn
, const char *why
)
629 dmn
->state
= DAEMON_UP
;
631 dmn
->connect_tries
= 0;
632 zlog_notice("%s state -> up : %s", dmn
->name
, why
);
634 SET_WAKEUP_ECHO(dmn
);
638 static int check_connect(struct thread
*t_write
)
640 struct daemon
*dmn
= THREAD_ARG(t_write
);
642 socklen_t reslen
= sizeof(sockerr
);
645 if (getsockopt(dmn
->fd
, SOL_SOCKET
, SO_ERROR
, (char *)&sockerr
, &reslen
)
647 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn
->name
,
648 safe_strerror(errno
));
650 "getsockopt failed checking connection success");
653 if ((reslen
== sizeof(sockerr
)) && sockerr
) {
657 "getsockopt reports that connection attempt failed: %s",
658 safe_strerror(sockerr
));
659 daemon_down(dmn
, why
);
663 daemon_up(dmn
, "delayed connect succeeded");
667 static int wakeup_connect_hanging(struct thread
*t_wakeup
)
669 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
672 dmn
->t_wakeup
= NULL
;
673 snprintf(why
, sizeof(why
),
674 "connection attempt timed out after %ld seconds", gs
.timeout
);
675 daemon_down(dmn
, why
);
679 /* Making connection to protocol daemon. */
680 static int try_connect(struct daemon
*dmn
)
683 struct sockaddr_un addr
;
686 if (gs
.loglevel
> LOG_DEBUG
+ 1)
687 zlog_debug("%s: attempting to connect", dmn
->name
);
688 dmn
->connect_tries
++;
690 memset(&addr
, 0, sizeof(struct sockaddr_un
));
691 addr
.sun_family
= AF_UNIX
;
692 snprintf(addr
.sun_path
, sizeof(addr
.sun_path
), "%s/%s.vty", gs
.vtydir
,
694 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
695 len
= addr
.sun_len
= SUN_LEN(&addr
);
697 len
= sizeof(addr
.sun_family
) + strlen(addr
.sun_path
);
698 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
700 /* Quick check to see if we might succeed before we go to the trouble
701 of creating a socket. */
702 if (access(addr
.sun_path
, W_OK
) < 0) {
704 flog_err_sys(EC_LIB_SYSTEM_CALL
,
705 "%s: access to socket %s denied: %s",
706 dmn
->name
, addr
.sun_path
,
707 safe_strerror(errno
));
711 if ((sock
= socket(AF_UNIX
, SOCK_STREAM
, 0)) < 0) {
712 flog_err_sys(EC_LIB_SOCKET
, "%s(%s): cannot make socket: %s",
713 __func__
, addr
.sun_path
, safe_strerror(errno
));
717 if (set_nonblocking(sock
) < 0 || set_cloexec(sock
) < 0) {
718 flog_err_sys(EC_LIB_SYSTEM_CALL
,
719 "%s(%s): set_nonblocking/cloexec(%d) failed",
720 __func__
, addr
.sun_path
, sock
);
725 if (connect(sock
, (struct sockaddr
*)&addr
, len
) < 0) {
726 if ((errno
!= EINPROGRESS
) && (errno
!= EWOULDBLOCK
)) {
727 if (gs
.loglevel
> LOG_DEBUG
)
728 zlog_debug("%s(%s): connect failed: %s",
729 __func__
, addr
.sun_path
,
730 safe_strerror(errno
));
734 if (gs
.loglevel
> LOG_DEBUG
)
735 zlog_debug("%s: connection in progress", dmn
->name
);
736 dmn
->state
= DAEMON_CONNECTING
;
739 thread_add_write(master
, check_connect
, dmn
, dmn
->fd
,
741 dmn
->t_wakeup
= NULL
;
742 thread_add_timer(master
, wakeup_connect_hanging
, dmn
,
743 gs
.timeout
, &dmn
->t_wakeup
);
744 SET_READ_HANDLER(dmn
);
749 SET_READ_HANDLER(dmn
);
750 daemon_up(dmn
, "connect succeeded");
754 static int phase_hanging(struct thread
*t_hanging
)
756 gs
.t_phase_hanging
= NULL
;
757 flog_err(EC_WATCHFRR_CONNECTION
,
758 "Phase [%s] hanging for %ld seconds, aborting phased restart",
759 phase_str
[gs
.phase
], PHASE_TIMEOUT
);
760 gs
.phase
= PHASE_NONE
;
764 static void set_phase(restart_phase_t new_phase
)
766 gs
.phase
= new_phase
;
767 if (gs
.t_phase_hanging
)
768 thread_cancel(gs
.t_phase_hanging
);
769 gs
.t_phase_hanging
= NULL
;
770 thread_add_timer(master
, phase_hanging
, NULL
, PHASE_TIMEOUT
,
771 &gs
.t_phase_hanging
);
774 static void phase_check(void)
779 case PHASE_STOPS_PENDING
:
783 "Phased restart: all routing daemon stop jobs have completed.");
784 set_phase(PHASE_WAITING_DOWN
);
787 case PHASE_WAITING_DOWN
:
788 if (gs
.numdown
+ IS_UP(gs
.special
) < gs
.numdaemons
)
790 zlog_info("Phased restart: all routing daemons now down.");
791 run_job(&gs
.special
->restart
, "restart", gs
.restart_command
, 1,
793 set_phase(PHASE_ZEBRA_RESTART_PENDING
);
796 case PHASE_ZEBRA_RESTART_PENDING
:
797 if (gs
.special
->restart
.pid
)
799 zlog_info("Phased restart: %s restart job completed.",
801 set_phase(PHASE_WAITING_ZEBRA_UP
);
804 case PHASE_WAITING_ZEBRA_UP
:
805 if (!IS_UP(gs
.special
))
807 zlog_info("Phased restart: %s is now up.", gs
.special
->name
);
810 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
811 if (dmn
!= gs
.special
)
812 run_job(&dmn
->restart
, "start",
813 gs
.start_command
, 1, 0);
816 gs
.phase
= PHASE_NONE
;
817 THREAD_OFF(gs
.t_phase_hanging
);
818 zlog_notice("Phased global restart has completed.");
823 static void try_restart(struct daemon
*dmn
)
828 if (dmn
!= gs
.special
) {
829 if ((gs
.special
->state
== DAEMON_UP
)
830 && (gs
.phase
== PHASE_NONE
))
831 run_job(&dmn
->restart
, "restart", gs
.restart_command
, 0,
835 "%s: postponing restart attempt because master %s daemon "
836 "not up [%s], or phased restart in progress",
837 dmn
->name
, gs
.special
->name
,
838 state_str
[gs
.special
->state
]);
842 if ((gs
.phase
!= PHASE_NONE
) || gs
.numpids
) {
843 if (gs
.loglevel
> LOG_DEBUG
+ 1)
845 "postponing phased global restart: restart already in "
846 "progress [%s], or outstanding child processes [%d]",
847 phase_str
[gs
.phase
], gs
.numpids
);
850 /* Is it too soon for a restart? */
852 struct timeval delay
;
853 if (time_elapsed(&delay
, &gs
.special
->restart
.time
)->tv_sec
854 < gs
.special
->restart
.interval
) {
855 if (gs
.loglevel
> LOG_DEBUG
+ 1)
857 "postponing phased global restart: "
858 "elapsed time %ld < retry interval %ld",
860 gs
.special
->restart
.interval
);
864 run_job(&gs
.restart
, "restart", gs
.restart_command
, 0, 1);
867 static int wakeup_unresponsive(struct thread
*t_wakeup
)
869 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
871 dmn
->t_wakeup
= NULL
;
872 if (dmn
->state
!= DAEMON_UNRESPONSIVE
)
873 flog_err(EC_WATCHFRR_CONNECTION
,
874 "%s: no longer unresponsive (now %s), "
875 "wakeup should have been cancelled!",
876 dmn
->name
, state_str
[dmn
->state
]);
878 SET_WAKEUP_UNRESPONSIVE(dmn
);
884 static int wakeup_no_answer(struct thread
*t_wakeup
)
886 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
888 dmn
->t_wakeup
= NULL
;
889 dmn
->state
= DAEMON_UNRESPONSIVE
;
890 flog_err(EC_WATCHFRR_CONNECTION
,
891 "%s state -> unresponsive : no response yet to ping "
892 "sent %ld seconds ago",
893 dmn
->name
, gs
.timeout
);
894 SET_WAKEUP_UNRESPONSIVE(dmn
);
899 static int wakeup_send_echo(struct thread
*t_wakeup
)
901 static const char echocmd
[] = "echo " PING_TOKEN
;
903 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
905 dmn
->t_wakeup
= NULL
;
906 if (((rc
= write(dmn
->fd
, echocmd
, sizeof(echocmd
))) < 0)
907 || ((size_t)rc
!= sizeof(echocmd
))) {
908 char why
[100 + sizeof(echocmd
)];
909 snprintf(why
, sizeof(why
),
910 "write '%s' returned %d instead of %u", echocmd
,
911 (int)rc
, (unsigned int)sizeof(echocmd
));
912 daemon_down(dmn
, why
);
914 gettimeofday(&dmn
->echo_sent
, NULL
);
915 dmn
->t_wakeup
= NULL
;
916 thread_add_timer(master
, wakeup_no_answer
, dmn
, gs
.timeout
,
922 bool check_all_up(void)
926 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
927 if (dmn
->state
!= DAEMON_UP
)
932 static void sigint(void)
934 zlog_notice("Terminating on signal");
935 systemd_send_stopping();
939 static int valid_command(const char *cmd
)
943 return ((p
= strchr(cmd
, '%')) != NULL
) && (*(p
+ 1) == 's')
944 && !strchr(p
+ 1, '%');
947 /* This is an ugly hack to circumvent problems with passing command-line
948 arguments that contain spaces. The fix is to use a configuration file. */
949 static char *translate_blanks(const char *cmd
, const char *blankstr
)
953 size_t bslen
= strlen(blankstr
);
955 if (!(res
= strdup(cmd
))) {
959 while ((p
= strstr(res
, blankstr
)) != NULL
) {
962 memmove(p
+ 1, p
+ bslen
, strlen(p
+ bslen
) + 1);
967 static void watchfrr_init(int argc
, char **argv
)
969 const char *special
= "zebra";
971 struct daemon
*dmn
, **add
= &gs
.daemons
;
972 char alldaemons
[512] = "", *p
= alldaemons
;
974 for (i
= optind
; i
< argc
; i
++) {
975 dmn
= XCALLOC(MTYPE_WATCHFRR_DAEMON
, sizeof(*dmn
));
977 dmn
->name
= dmn
->restart
.name
= argv
[i
];
978 dmn
->state
= DAEMON_INIT
;
982 dmn
->t_wakeup
= NULL
;
983 thread_add_timer_msec(master
, wakeup_init
, dmn
,
984 100 + (random() % 900),
986 dmn
->restart
.interval
= gs
.min_restart_interval
;
990 if (!strcmp(dmn
->name
, special
))
996 "Must specify one or more daemons to monitor.\n\n");
999 if (!watch_only
&& !gs
.special
) {
1000 fprintf(stderr
, "\"%s\" daemon must be in daemon lists\n\n",
1005 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
1006 snprintf(p
, alldaemons
+ sizeof(alldaemons
) - p
, "%s%s",
1007 (p
== alldaemons
) ? "" : " ", dmn
->name
);
1010 zlog_notice("%s %s watching [%s]%s", progname
, FRR_VERSION
, alldaemons
,
1011 watch_only
? ", monitor mode" : "");
1014 struct zebra_privs_t watchfrr_privs
= {
1016 .vty_group
= VTY_GROUP
,
1020 static struct quagga_signal_t watchfrr_signals
[] = {
1031 .handler
= sigchild
,
1035 FRR_DAEMON_INFO(watchfrr
, WATCHFRR
,
1036 .flags
= FRR_NO_PRIVSEP
| FRR_NO_TCPVTY
| FRR_LIMITED_CLI
1037 | FRR_NO_CFG_PID_DRY
| FRR_NO_ZCLIENT
1040 .printhelp
= printhelp
,
1041 .copyright
= "Copyright 2004 Andrew J. Schorr",
1043 .signals
= watchfrr_signals
,
1044 .n_signals
= array_size(watchfrr_signals
),
1046 .privs
= &watchfrr_privs
, )
1048 #define DEPRECATED_OPTIONS "aAezR:"
1050 int main(int argc
, char **argv
)
1053 const char *blankstr
= NULL
;
1055 frr_preinit(&watchfrr_di
, argc
, argv
);
1056 progname
= watchfrr_di
.progname
;
1058 frr_opt_add("b:dk:l:i:p:r:S:s:t:T:" DEPRECATED_OPTIONS
, longopts
, "");
1060 gs
.restart
.name
= "all";
1061 while ((opt
= frr_getopt(argc
, argv
, NULL
)) != EOF
) {
1062 if (opt
&& opt
< 128 && strchr(DEPRECATED_OPTIONS
, opt
)) {
1064 "The -%c option no longer exists.\n"
1065 "Please refer to the watchfrr(8) man page.\n",
1080 if (!valid_command(optarg
)) {
1082 "Invalid kill command, must contain '%%s': %s\n",
1086 gs
.stop_command
= optarg
;
1090 if ((sscanf(optarg
, "%d%1s", &gs
.loglevel
, garbage
)
1092 || (gs
.loglevel
< LOG_EMERG
)) {
1094 "Invalid loglevel argument: %s\n",
1099 case OPTION_MINRESTART
: {
1101 if ((sscanf(optarg
, "%ld%1s", &gs
.min_restart_interval
,
1104 || (gs
.min_restart_interval
< 0)) {
1106 "Invalid min_restart_interval argument: %s\n",
1111 case OPTION_MAXRESTART
: {
1113 if ((sscanf(optarg
, "%ld%1s", &gs
.max_restart_interval
,
1116 || (gs
.max_restart_interval
< 0)) {
1118 "Invalid max_restart_interval argument: %s\n",
1126 if ((sscanf(optarg
, "%d%1s", &period
, garbage
) != 1)
1127 || (gs
.period
< 1)) {
1129 "Invalid interval argument: %s\n",
1133 gs
.period
= 1000 * period
;
1136 watchfrr_di
.pid_file
= optarg
;
1139 if (!valid_command(optarg
)) {
1141 "Invalid restart command, must contain '%%s': %s\n",
1145 gs
.restart_command
= optarg
;
1148 if (!valid_command(optarg
)) {
1150 "Invalid start command, must contain '%%s': %s\n",
1154 gs
.start_command
= optarg
;
1161 if ((sscanf(optarg
, "%ld%1s", &gs
.timeout
, garbage
)
1163 || (gs
.timeout
< 1)) {
1165 "Invalid timeout argument: %s\n",
1172 if ((sscanf(optarg
, "%ld%1s", &gs
.restart_timeout
,
1175 || (gs
.restart_timeout
< 1)) {
1177 "Invalid restart timeout argument: %s\n",
1183 fputs("Invalid option.\n", stderr
);
1189 && (gs
.start_command
|| gs
.stop_command
|| gs
.restart_command
)) {
1190 fputs("Options -r/-s/-k are not used when --dry is active.\n",
1194 && (!gs
.restart_command
|| !gs
.start_command
|| !gs
.stop_command
)) {
1196 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1201 if (gs
.restart_command
)
1202 gs
.restart_command
=
1203 translate_blanks(gs
.restart_command
, blankstr
);
1204 if (gs
.start_command
)
1206 translate_blanks(gs
.start_command
, blankstr
);
1207 if (gs
.stop_command
)
1209 translate_blanks(gs
.stop_command
, blankstr
);
1212 gs
.restart
.interval
= gs
.min_restart_interval
;
1214 master
= frr_init();
1215 watchfrr_error_init();
1216 watchfrr_init(argc
, argv
);
1217 watchfrr_vty_init();
1221 zlog_set_level(ZLOG_DEST_MONITOR
, ZLOG_DISABLED
);
1222 if (watchfrr_di
.daemon_mode
)
1223 zlog_set_level(ZLOG_DEST_SYSLOG
, MIN(gs
.loglevel
, LOG_DEBUG
));
1225 zlog_set_level(ZLOG_DEST_STDOUT
, MIN(gs
.loglevel
, LOG_DEBUG
));
1229 systemd_send_stopping();