2 * Monitor status of frr daemons and restart if necessary.
4 * Copyright (C) 2004 Andrew J. Schorr
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
26 #include <lib/version.h>
28 #include "memory_vty.h"
30 #include "lib_errors.h"
39 #include "watchfrr_errors.h"
42 #define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
45 /* Macros to help randomize timers. */
46 #define JITTER(X) ((random() % ((X)+1))-((X)/2))
47 #define FUZZY(X) ((X)+JITTER((X)/20))
49 #define DEFAULT_PERIOD 5
50 #define DEFAULT_TIMEOUT 90
51 #define DEFAULT_RESTART_TIMEOUT 20
52 #define DEFAULT_LOGLEVEL LOG_INFO
53 #define DEFAULT_MIN_RESTART 60
54 #define DEFAULT_MAX_RESTART 600
56 #define PING_TOKEN "PING"
58 DEFINE_MGROUP(WATCHFRR
, "watchfrr")
59 DEFINE_MTYPE_STATIC(WATCHFRR
, WATCHFRR_DAEMON
, "watchfrr daemon entry")
61 /* Needs to be global, referenced somewhere inside libfrr. */
62 struct thread_master
*master
;
64 static bool watch_only
= false;
71 PHASE_ZEBRA_RESTART_PENDING
,
72 PHASE_WAITING_ZEBRA_UP
75 static const char *phase_str
[] = {
79 "Waiting for other daemons to come down",
80 "Zebra restart job running",
81 "Waiting for zebra to come up",
85 #define PHASE_TIMEOUT (3*gs.restart_timeout)
93 struct thread
*t_kill
;
97 static struct global_state
{
98 restart_phase_t phase
;
99 struct thread
*t_phase_hanging
;
103 long restart_timeout
;
104 long min_restart_interval
;
105 long max_restart_interval
;
106 struct daemon
*daemons
;
107 const char *restart_command
;
108 const char *start_command
;
109 const char *stop_command
;
110 struct restart_info restart
;
112 struct daemon
*special
; /* points to zebra when doing phased restart */
115 int numdown
; /* # of daemons that are not UP or UNRESPONSIVE */
118 .vtydir
= frr_vtydir
,
119 .period
= 1000 * DEFAULT_PERIOD
,
120 .timeout
= DEFAULT_TIMEOUT
,
121 .restart_timeout
= DEFAULT_RESTART_TIMEOUT
,
122 .loglevel
= DEFAULT_LOGLEVEL
,
123 .min_restart_interval
= DEFAULT_MIN_RESTART
,
124 .max_restart_interval
= DEFAULT_MAX_RESTART
,
136 (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
138 static const char *state_str
[] = {
139 "Init", "Down", "Connecting", "Up", "Unresponsive",
144 daemon_state_t state
;
146 struct timeval echo_sent
;
147 unsigned int connect_tries
;
148 struct thread
*t_wakeup
;
149 struct thread
*t_read
;
150 struct thread
*t_write
;
152 struct restart_info restart
;
155 #define OPTION_MINRESTART 2000
156 #define OPTION_MAXRESTART 2001
157 #define OPTION_DRY 2002
159 static const struct option longopts
[] = {
160 {"daemon", no_argument
, NULL
, 'd'},
161 {"statedir", required_argument
, NULL
, 'S'},
162 {"loglevel", required_argument
, NULL
, 'l'},
163 {"interval", required_argument
, NULL
, 'i'},
164 {"timeout", required_argument
, NULL
, 't'},
165 {"restart-timeout", required_argument
, NULL
, 'T'},
166 {"restart", required_argument
, NULL
, 'r'},
167 {"start-command", required_argument
, NULL
, 's'},
168 {"kill-command", required_argument
, NULL
, 'k'},
169 {"dry", no_argument
, NULL
, OPTION_DRY
},
170 {"min-restart-interval", required_argument
, NULL
, OPTION_MINRESTART
},
171 {"max-restart-interval", required_argument
, NULL
, OPTION_MAXRESTART
},
172 {"pid-file", required_argument
, NULL
, 'p'},
173 {"blank-string", required_argument
, NULL
, 'b'},
174 {"help", no_argument
, NULL
, 'h'},
175 {"version", no_argument
, NULL
, 'v'},
178 static int try_connect(struct daemon
*dmn
);
179 static int wakeup_send_echo(struct thread
*t_wakeup
);
180 static void try_restart(struct daemon
*dmn
);
181 static void phase_check(void);
182 static void restart_done(struct daemon
*dmn
);
184 static const char *progname
;
185 static void printhelp(FILE *target
)
188 "Usage : %s [OPTION...] <daemon name> ...\n\n\
189 Watchdog program to monitor status of frr daemons and try to restart\n\
190 them if they are down or unresponsive. It determines whether a daemon is\n\
191 up based on whether it can connect to the daemon's vty unix stream socket.\n\
192 It then repeatedly sends echo commands over that socket to determine whether\n\
193 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
194 on the socket connection and know immediately that the daemon is down.\n\n\
195 The daemons to be monitored should be listed on the command line.\n\n\
196 In order to avoid attempting to restart the daemons in a fast loop,\n\
197 the -m and -M options allow you to control the minimum delay between\n\
198 restart commands. The minimum restart delay is recalculated each time\n\
199 a restart is attempted: if the time since the last restart attempt exceeds\n\
200 twice the -M value, then the restart delay is set to the -m value.\n\
201 Otherwise, the interval is doubled (but capped at the -M value).\n\n",
206 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
207 to syslog instead of stdout.\n\
208 -S, --statedir Set the vty socket directory (default is %s)\n\
209 -l, --loglevel Set the logging level (default is %d).\n\
210 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
211 but it can be set higher than %d if extra-verbose debugging\n\
212 messages are desired.\n\
213 --min-restart-interval\n\
214 Set the minimum seconds to wait between invocations of daemon\n\
215 restart commands (default is %d).\n\
216 --max-restart-interval\n\
217 Set the maximum seconds to wait between invocations of daemon\n\
218 restart commands (default is %d).\n\
219 -i, --interval Set the status polling interval in seconds (default is %d)\n\
220 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
221 -T, --restart-timeout\n\
222 Set the restart (kill) timeout in seconds (default is %d).\n\
223 If any background jobs are still running after this much\n\
224 time has elapsed, they will be killed.\n\
225 -r, --restart Supply a Bourne shell command to use to restart a single\n\
226 daemon. The command string should include '%%s' where the\n\
227 name of the daemon should be substituted.\n\
228 -s, --start-command\n\
229 Supply a Bourne shell to command to use to start a single\n\
230 daemon. The command string should include '%%s' where the\n\
231 name of the daemon should be substituted.\n\
232 -k, --kill-command\n\
233 Supply a Bourne shell to command to use to stop a single\n\
234 daemon. The command string should include '%%s' where the\n\
235 name of the daemon should be substituted.\n\
236 --dry Do not start or restart anything, just log.\n\
237 -p, --pid-file Set process identifier file name\n\
238 (default is %s/watchfrr.pid).\n\
239 -b, --blank-string\n\
240 When the supplied argument string is found in any of the\n\
241 various shell command arguments (-r, -s, or -k), replace\n\
242 it with a space. This is an ugly hack to circumvent problems\n\
243 passing command-line arguments with embedded spaces.\n\
244 -v, --version Print program version\n\
245 -h, --help Display this help and exit\n",
246 frr_vtydir
, DEFAULT_LOGLEVEL
, LOG_EMERG
, LOG_DEBUG
, LOG_DEBUG
,
247 DEFAULT_MIN_RESTART
, DEFAULT_MAX_RESTART
, DEFAULT_PERIOD
,
248 DEFAULT_TIMEOUT
, DEFAULT_RESTART_TIMEOUT
, frr_vtydir
);
251 static pid_t
run_background(char *shell_cmd
)
255 switch (child
= fork()) {
257 flog_err_sys(LIB_ERR_SYSTEM_CALL
,
258 "fork failed, cannot run command [%s]: %s",
259 shell_cmd
, safe_strerror(errno
));
263 /* Use separate process group so child processes can be killed
265 if (setpgid(0, 0) < 0)
266 zlog_warn("warning: setpgid(0,0) failed: %s",
267 safe_strerror(errno
));
271 char *const argv
[4] = {shell
, dashc
, shell_cmd
, NULL
};
272 execv("/bin/sh", argv
);
273 flog_err_sys(LIB_ERR_SYSTEM_CALL
,
274 "execv(/bin/sh -c '%s') failed: %s",
275 shell_cmd
, safe_strerror(errno
));
279 /* Parent process: we will reap the child later. */
280 flog_err_sys(LIB_ERR_SYSTEM_CALL
,
281 "Forked background command [pid %d]: %s",
282 (int)child
, shell_cmd
);
/*
 * Store in *result the wall-clock time that has passed since *start_time
 * and return result, so the call can be used inside an expression.
 */
static struct timeval *time_elapsed(struct timeval *result,
				    const struct timeval *start_time)
{
	gettimeofday(result, NULL);

	result->tv_usec -= start_time->tv_usec;
	result->tv_sec -= start_time->tv_sec;

	/* Borrow whole seconds until the microsecond field is non-negative. */
	for (; result->tv_usec < 0; result->tv_sec--)
		result->tv_usec += 1000000L;

	return result;
}
300 static int restart_kill(struct thread
*t_kill
)
302 struct restart_info
*restart
= THREAD_ARG(t_kill
);
303 struct timeval delay
;
305 time_elapsed(&delay
, &restart
->time
);
307 "Warning: %s %s child process %d still running after "
308 "%ld seconds, sending signal %d",
309 restart
->what
, restart
->name
, (int)restart
->pid
,
310 (long)delay
.tv_sec
, (restart
->kills
? SIGKILL
: SIGTERM
));
311 kill(-restart
->pid
, (restart
->kills
? SIGKILL
: SIGTERM
));
313 restart
->t_kill
= NULL
;
314 thread_add_timer(master
, restart_kill
, restart
, gs
.restart_timeout
,
319 static struct restart_info
*find_child(pid_t child
)
322 if (gs
.restart
.pid
== child
)
325 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
326 if (dmn
->restart
.pid
== child
)
327 return &dmn
->restart
;
332 static void sigchild(void)
338 struct restart_info
*restart
;
341 switch (child
= waitpid(-1, &status
, WNOHANG
)) {
343 flog_err_sys(LIB_ERR_SYSTEM_CALL
, "waitpid failed: %s",
344 safe_strerror(errno
));
347 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
351 if (child
== integrated_write_pid
) {
352 integrated_write_sigchld(status
);
356 if ((restart
= find_child(child
)) != NULL
) {
357 name
= restart
->name
;
358 what
= restart
->what
;
361 thread_cancel(restart
->t_kill
);
362 restart
->t_kill
= NULL
;
363 /* Update restart time to reflect the time the command
365 gettimeofday(&restart
->time
, NULL
);
369 "waitpid returned status for an unknown child process %d",
374 if (WIFSTOPPED(status
))
375 zlog_warn("warning: %s %s process %d is stopped", what
, name
,
377 else if (WIFSIGNALED(status
))
378 zlog_warn("%s %s process %d terminated due to signal %d", what
,
379 name
, (int)child
, WTERMSIG(status
));
380 else if (WIFEXITED(status
)) {
381 if (WEXITSTATUS(status
) != 0)
383 "%s %s process %d exited with non-zero status %d",
384 what
, name
, (int)child
, WEXITSTATUS(status
));
386 zlog_debug("%s %s process %d exited normally", what
,
389 if (restart
&& restart
!= &gs
.restart
) {
390 dmn
= container_of(restart
, struct daemon
,
394 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
400 "cannot interpret %s %s process %d wait status 0x%x",
401 what
, name
, (int)child
, status
);
405 static int run_job(struct restart_info
*restart
, const char *cmdtype
,
406 const char *command
, int force
, int update_interval
)
408 struct timeval delay
;
410 if (gs
.loglevel
> LOG_DEBUG
+ 1)
411 zlog_debug("attempting to %s %s", cmdtype
, restart
->name
);
414 if (gs
.loglevel
> LOG_DEBUG
+ 1)
416 "cannot %s %s, previous pid %d still running",
417 cmdtype
, restart
->name
, (int)restart
->pid
);
421 /* Note: time_elapsed test must come before the force test, since we
423 to make sure that delay is initialized for use below in updating the
425 if ((time_elapsed(&delay
, &restart
->time
)->tv_sec
< restart
->interval
)
427 if (gs
.loglevel
> LOG_DEBUG
+ 1)
430 "elapsed time %ld < retry interval %ld",
431 cmdtype
, restart
->name
, (long)delay
.tv_sec
,
436 gettimeofday(&restart
->time
, NULL
);
439 char cmd
[strlen(command
) + strlen(restart
->name
) + 1];
440 snprintf(cmd
, sizeof(cmd
), command
, restart
->name
);
441 if ((restart
->pid
= run_background(cmd
)) > 0) {
442 restart
->t_kill
= NULL
;
443 thread_add_timer(master
, restart_kill
, restart
,
444 gs
.restart_timeout
, &restart
->t_kill
);
445 restart
->what
= cmdtype
;
451 /* Calculate the new restart interval. */
452 if (update_interval
) {
453 if (delay
.tv_sec
> 2 * gs
.max_restart_interval
)
454 restart
->interval
= gs
.min_restart_interval
;
455 else if ((restart
->interval
*= 2) > gs
.max_restart_interval
)
456 restart
->interval
= gs
.max_restart_interval
;
457 if (gs
.loglevel
> LOG_DEBUG
+ 1)
458 zlog_debug("restart %s interval is now %ld",
459 restart
->name
, restart
->interval
);
464 #define SET_READ_HANDLER(DMN) \
466 (DMN)->t_read = NULL; \
467 thread_add_read(master, handle_read, (DMN), (DMN)->fd, \
471 #define SET_WAKEUP_DOWN(DMN) \
473 (DMN)->t_wakeup = NULL; \
474 thread_add_timer_msec(master, wakeup_down, (DMN), \
475 FUZZY(gs.period), &(DMN)->t_wakeup); \
478 #define SET_WAKEUP_UNRESPONSIVE(DMN) \
480 (DMN)->t_wakeup = NULL; \
481 thread_add_timer_msec(master, wakeup_unresponsive, (DMN), \
482 FUZZY(gs.period), &(DMN)->t_wakeup); \
485 #define SET_WAKEUP_ECHO(DMN) \
487 (DMN)->t_wakeup = NULL; \
488 thread_add_timer_msec(master, wakeup_send_echo, (DMN), \
489 FUZZY(gs.period), &(DMN)->t_wakeup); \
492 static int wakeup_down(struct thread
*t_wakeup
)
494 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
496 dmn
->t_wakeup
= NULL
;
497 if (try_connect(dmn
) < 0)
498 SET_WAKEUP_DOWN(dmn
);
499 if ((dmn
->connect_tries
> 1) && (dmn
->state
!= DAEMON_UP
))
504 static int wakeup_init(struct thread
*t_wakeup
)
506 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
508 dmn
->t_wakeup
= NULL
;
509 if (try_connect(dmn
) < 0) {
510 flog_err(WATCHFRR_ERR_CONNECTION
,
511 "%s state -> down : initial connection attempt failed",
513 dmn
->state
= DAEMON_DOWN
;
519 static void restart_done(struct daemon
*dmn
)
521 if (dmn
->state
!= DAEMON_DOWN
) {
526 THREAD_OFF(dmn
->t_wakeup
);
527 if (try_connect(dmn
) < 0)
528 SET_WAKEUP_DOWN(dmn
);
531 static void daemon_down(struct daemon
*dmn
, const char *why
)
533 if (IS_UP(dmn
) || (dmn
->state
== DAEMON_INIT
))
534 flog_err(WATCHFRR_ERR_CONNECTION
,
535 "%s state -> down : %s", dmn
->name
, why
);
536 else if (gs
.loglevel
> LOG_DEBUG
)
537 zlog_debug("%s still down : %s", dmn
->name
, why
);
540 dmn
->state
= DAEMON_DOWN
;
545 THREAD_OFF(dmn
->t_read
);
546 THREAD_OFF(dmn
->t_write
);
547 THREAD_OFF(dmn
->t_wakeup
);
548 if (try_connect(dmn
) < 0)
549 SET_WAKEUP_DOWN(dmn
);
553 static int handle_read(struct thread
*t_read
)
555 struct daemon
*dmn
= THREAD_ARG(t_read
);
556 static const char resp
[sizeof(PING_TOKEN
) + 4] = PING_TOKEN
"\n";
557 char buf
[sizeof(resp
) + 100];
559 struct timeval delay
;
562 if ((rc
= read(dmn
->fd
, buf
, sizeof(buf
))) < 0) {
565 if (ERRNO_IO_RETRY(errno
)) {
566 /* Pretend it never happened. */
567 SET_READ_HANDLER(dmn
);
570 snprintf(why
, sizeof(why
), "unexpected read error: %s",
571 safe_strerror(errno
));
572 daemon_down(dmn
, why
);
576 daemon_down(dmn
, "read returned EOF");
579 if (!dmn
->echo_sent
.tv_sec
) {
580 char why
[sizeof(buf
) + 100];
581 snprintf(why
, sizeof(why
),
582 "unexpected read returns %d bytes: %.*s", (int)rc
,
584 daemon_down(dmn
, why
);
588 /* We are expecting an echo response: is there any chance that the
589 response would not be returned entirely in the first read? That
590 seems inconceivable... */
591 if ((rc
!= sizeof(resp
)) || memcmp(buf
, resp
, sizeof(resp
))) {
592 char why
[100 + sizeof(buf
)];
593 snprintf(why
, sizeof(why
),
594 "read returned bad echo response of %d bytes "
595 "(expecting %u): %.*s",
596 (int)rc
, (unsigned int)sizeof(resp
), (int)rc
, buf
);
597 daemon_down(dmn
, why
);
601 time_elapsed(&delay
, &dmn
->echo_sent
);
602 dmn
->echo_sent
.tv_sec
= 0;
603 if (dmn
->state
== DAEMON_UNRESPONSIVE
) {
604 if (delay
.tv_sec
< gs
.timeout
) {
605 dmn
->state
= DAEMON_UP
;
607 "%s state -> up : echo response received after %ld.%06ld "
609 dmn
->name
, (long)delay
.tv_sec
,
610 (long)delay
.tv_usec
);
613 "%s: slow echo response finally received after %ld.%06ld "
615 dmn
->name
, (long)delay
.tv_sec
,
616 (long)delay
.tv_usec
);
617 } else if (gs
.loglevel
> LOG_DEBUG
+ 1)
618 zlog_debug("%s: echo response received after %ld.%06ld seconds",
619 dmn
->name
, (long)delay
.tv_sec
, (long)delay
.tv_usec
);
621 SET_READ_HANDLER(dmn
);
623 thread_cancel(dmn
->t_wakeup
);
624 SET_WAKEUP_ECHO(dmn
);
/*
 * Wait until every monitored daemon is up before telling systemd that
 * watchfrr itself has finished starting.
 */
633 static void daemon_send_ready(void)
636 if (!sent
&& gs
.numdown
== 0) {
639 zlog_notice("all daemons up, doing startup-complete notify");
642 fp
= fopen(DAEMON_VTY_DIR
"/watchfrr.started", "w");
645 #if defined HAVE_SYSTEMD
646 systemd_send_started(master
, 0);
652 static void daemon_up(struct daemon
*dmn
, const char *why
)
654 dmn
->state
= DAEMON_UP
;
656 dmn
->connect_tries
= 0;
657 zlog_notice("%s state -> up : %s", dmn
->name
, why
);
659 SET_WAKEUP_ECHO(dmn
);
663 static int check_connect(struct thread
*t_write
)
665 struct daemon
*dmn
= THREAD_ARG(t_write
);
667 socklen_t reslen
= sizeof(sockerr
);
670 if (getsockopt(dmn
->fd
, SOL_SOCKET
, SO_ERROR
, (char *)&sockerr
, &reslen
)
672 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn
->name
,
673 safe_strerror(errno
));
675 "getsockopt failed checking connection success");
678 if ((reslen
== sizeof(sockerr
)) && sockerr
) {
682 "getsockopt reports that connection attempt failed: %s",
683 safe_strerror(sockerr
));
684 daemon_down(dmn
, why
);
688 daemon_up(dmn
, "delayed connect succeeded");
692 static int wakeup_connect_hanging(struct thread
*t_wakeup
)
694 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
697 dmn
->t_wakeup
= NULL
;
698 snprintf(why
, sizeof(why
),
699 "connection attempt timed out after %ld seconds", gs
.timeout
);
700 daemon_down(dmn
, why
);
704 /* Making connection to protocol daemon. */
705 static int try_connect(struct daemon
*dmn
)
708 struct sockaddr_un addr
;
711 if (gs
.loglevel
> LOG_DEBUG
+ 1)
712 zlog_debug("%s: attempting to connect", dmn
->name
);
713 dmn
->connect_tries
++;
715 memset(&addr
, 0, sizeof(struct sockaddr_un
));
716 addr
.sun_family
= AF_UNIX
;
717 snprintf(addr
.sun_path
, sizeof(addr
.sun_path
), "%s/%s.vty", gs
.vtydir
,
719 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
720 len
= addr
.sun_len
= SUN_LEN(&addr
);
722 len
= sizeof(addr
.sun_family
) + strlen(addr
.sun_path
);
723 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
725 /* Quick check to see if we might succeed before we go to the trouble
726 of creating a socket. */
727 if (access(addr
.sun_path
, W_OK
) < 0) {
729 flog_err_sys(LIB_ERR_SYSTEM_CALL
,
730 "%s: access to socket %s denied: %s",
731 dmn
->name
, addr
.sun_path
,
732 safe_strerror(errno
));
736 if ((sock
= socket(AF_UNIX
, SOCK_STREAM
, 0)) < 0) {
737 flog_err_sys(LIB_ERR_SOCKET
, "%s(%s): cannot make socket: %s",
738 __func__
, addr
.sun_path
, safe_strerror(errno
));
742 if (set_nonblocking(sock
) < 0 || set_cloexec(sock
) < 0) {
743 flog_err_sys(LIB_ERR_SYSTEM_CALL
,
744 "%s(%s): set_nonblocking/cloexec(%d) failed",
745 __func__
, addr
.sun_path
, sock
);
750 if (connect(sock
, (struct sockaddr
*)&addr
, len
) < 0) {
751 if ((errno
!= EINPROGRESS
) && (errno
!= EWOULDBLOCK
)) {
752 if (gs
.loglevel
> LOG_DEBUG
)
753 zlog_debug("%s(%s): connect failed: %s",
754 __func__
, addr
.sun_path
,
755 safe_strerror(errno
));
759 if (gs
.loglevel
> LOG_DEBUG
)
760 zlog_debug("%s: connection in progress", dmn
->name
);
761 dmn
->state
= DAEMON_CONNECTING
;
764 thread_add_write(master
, check_connect
, dmn
, dmn
->fd
,
766 dmn
->t_wakeup
= NULL
;
767 thread_add_timer(master
, wakeup_connect_hanging
, dmn
,
768 gs
.timeout
, &dmn
->t_wakeup
);
769 SET_READ_HANDLER(dmn
);
774 SET_READ_HANDLER(dmn
);
775 daemon_up(dmn
, "connect succeeded");
779 static int phase_hanging(struct thread
*t_hanging
)
781 gs
.t_phase_hanging
= NULL
;
782 flog_err(WATCHFRR_ERR_CONNECTION
,
783 "Phase [%s] hanging for %ld seconds, aborting phased restart",
784 phase_str
[gs
.phase
], PHASE_TIMEOUT
);
785 gs
.phase
= PHASE_NONE
;
789 static void set_phase(restart_phase_t new_phase
)
791 gs
.phase
= new_phase
;
792 if (gs
.t_phase_hanging
)
793 thread_cancel(gs
.t_phase_hanging
);
794 gs
.t_phase_hanging
= NULL
;
795 thread_add_timer(master
, phase_hanging
, NULL
, PHASE_TIMEOUT
,
796 &gs
.t_phase_hanging
);
799 static void phase_check(void)
808 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
809 if (dmn
->state
== DAEMON_INIT
)
812 /* startup complete, everything out of INIT */
813 gs
.phase
= PHASE_NONE
;
814 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
815 if (dmn
->state
== DAEMON_DOWN
) {
816 SET_WAKEUP_DOWN(dmn
);
820 case PHASE_STOPS_PENDING
:
824 "Phased restart: all routing daemon stop jobs have completed.");
825 set_phase(PHASE_WAITING_DOWN
);
828 case PHASE_WAITING_DOWN
:
829 if (gs
.numdown
+ IS_UP(gs
.special
) < gs
.numdaemons
)
831 zlog_info("Phased restart: all routing daemons now down.");
832 run_job(&gs
.special
->restart
, "restart", gs
.restart_command
, 1,
834 set_phase(PHASE_ZEBRA_RESTART_PENDING
);
837 case PHASE_ZEBRA_RESTART_PENDING
:
838 if (gs
.special
->restart
.pid
)
840 zlog_info("Phased restart: %s restart job completed.",
842 set_phase(PHASE_WAITING_ZEBRA_UP
);
845 case PHASE_WAITING_ZEBRA_UP
:
846 if (!IS_UP(gs
.special
))
848 zlog_info("Phased restart: %s is now up.", gs
.special
->name
);
851 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
852 if (dmn
!= gs
.special
)
853 run_job(&dmn
->restart
, "start",
854 gs
.start_command
, 1, 0);
857 gs
.phase
= PHASE_NONE
;
858 THREAD_OFF(gs
.t_phase_hanging
);
859 zlog_notice("Phased global restart has completed.");
864 static void try_restart(struct daemon
*dmn
)
869 if (dmn
!= gs
.special
) {
870 if ((gs
.special
->state
== DAEMON_UP
)
871 && (gs
.phase
== PHASE_NONE
))
872 run_job(&dmn
->restart
, "restart", gs
.restart_command
, 0,
876 "%s: postponing restart attempt because master %s daemon "
877 "not up [%s], or phased restart in progress",
878 dmn
->name
, gs
.special
->name
,
879 state_str
[gs
.special
->state
]);
883 if ((gs
.phase
!= PHASE_NONE
) || gs
.numpids
) {
884 if (gs
.loglevel
> LOG_DEBUG
+ 1)
886 "postponing phased global restart: restart already in "
887 "progress [%s], or outstanding child processes [%d]",
888 phase_str
[gs
.phase
], gs
.numpids
);
891 /* Is it too soon for a restart? */
893 struct timeval delay
;
894 if (time_elapsed(&delay
, &gs
.special
->restart
.time
)->tv_sec
895 < gs
.special
->restart
.interval
) {
896 if (gs
.loglevel
> LOG_DEBUG
+ 1)
898 "postponing phased global restart: "
899 "elapsed time %ld < retry interval %ld",
901 gs
.special
->restart
.interval
);
905 run_job(&gs
.restart
, "restart", gs
.restart_command
, 0, 1);
908 static int wakeup_unresponsive(struct thread
*t_wakeup
)
910 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
912 dmn
->t_wakeup
= NULL
;
913 if (dmn
->state
!= DAEMON_UNRESPONSIVE
)
914 flog_err(WATCHFRR_ERR_CONNECTION
,
915 "%s: no longer unresponsive (now %s), "
916 "wakeup should have been cancelled!",
917 dmn
->name
, state_str
[dmn
->state
]);
919 SET_WAKEUP_UNRESPONSIVE(dmn
);
925 static int wakeup_no_answer(struct thread
*t_wakeup
)
927 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
929 dmn
->t_wakeup
= NULL
;
930 dmn
->state
= DAEMON_UNRESPONSIVE
;
931 flog_err(WATCHFRR_ERR_CONNECTION
,
932 "%s state -> unresponsive : no response yet to ping "
933 "sent %ld seconds ago",
934 dmn
->name
, gs
.timeout
);
935 SET_WAKEUP_UNRESPONSIVE(dmn
);
940 static int wakeup_send_echo(struct thread
*t_wakeup
)
942 static const char echocmd
[] = "echo " PING_TOKEN
;
944 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
946 dmn
->t_wakeup
= NULL
;
947 if (((rc
= write(dmn
->fd
, echocmd
, sizeof(echocmd
))) < 0)
948 || ((size_t)rc
!= sizeof(echocmd
))) {
949 char why
[100 + sizeof(echocmd
)];
950 snprintf(why
, sizeof(why
),
951 "write '%s' returned %d instead of %u", echocmd
,
952 (int)rc
, (unsigned int)sizeof(echocmd
));
953 daemon_down(dmn
, why
);
955 gettimeofday(&dmn
->echo_sent
, NULL
);
956 dmn
->t_wakeup
= NULL
;
957 thread_add_timer(master
, wakeup_no_answer
, dmn
, gs
.timeout
,
963 bool check_all_up(void)
967 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
968 if (dmn
->state
!= DAEMON_UP
)
/*
 * Termination handler: log the shutdown, notify via
 * systemd_send_stopping(), and exit successfully.
 */
static void sigint(void)
{
	zlog_notice("Terminating on signal");
	systemd_send_stopping();
	exit(EXIT_SUCCESS);
}
/*
 * Check that cmd is usable as a format string taking exactly one string
 * argument: it must contain a '%', the first '%' must be followed by 's',
 * and no further '%' may appear after it.
 *
 * Returns 1 if cmd is acceptable, 0 otherwise.
 */
static int valid_command(const char *cmd)
{
	const char *pct = strchr(cmd, '%');

	if (pct == NULL)
		return 0;
	if (pct[1] != 's')
		return 0;

	return strchr(pct + 1, '%') == NULL;
}
988 /* This is an ugly hack to circumvent problems with passing command-line
989 arguments that contain spaces. The fix is to use a configuration file. */
/*
 * Return a newly allocated copy of cmd in which every occurrence of
 * blankstr is replaced by a single space.
 *
 * Occurrences are matched left to right and do not overlap.  An empty
 * blankstr matches nothing, so cmd is copied unchanged; previously an
 * empty blankstr (or one consisting of a single space) made the
 * search-from-the-start loop spin forever, and the empty case also grew
 * the string past the end of its allocation.
 *
 * The caller owns the returned string.  The process exits if memory
 * cannot be allocated.
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	size_t bslen = strlen(blankstr);
	/* Each replacement is at most as long as what it replaces, so the
	 * result never needs more room than cmd itself. */
	char *res = malloc(strlen(cmd) + 1);
	char *out = res;

	if (!res) {
		perror("malloc");
		exit(1);
	}

	while (*cmd) {
		if (bslen && strncmp(cmd, blankstr, bslen) == 0) {
			*out++ = ' ';
			cmd += bslen;
		} else {
			*out++ = *cmd++;
		}
	}
	*out = '\0';

	return res;
}
1008 static void watchfrr_init(int argc
, char **argv
)
1010 const char *special
= "zebra";
1012 struct daemon
*dmn
, **add
= &gs
.daemons
;
1013 char alldaemons
[512] = "", *p
= alldaemons
;
1015 for (i
= optind
; i
< argc
; i
++) {
1016 dmn
= XCALLOC(MTYPE_WATCHFRR_DAEMON
, sizeof(*dmn
));
1018 dmn
->name
= dmn
->restart
.name
= argv
[i
];
1019 dmn
->state
= DAEMON_INIT
;
1023 dmn
->t_wakeup
= NULL
;
1024 thread_add_timer_msec(master
, wakeup_init
, dmn
, 0,
1026 dmn
->restart
.interval
= gs
.min_restart_interval
;
1030 if (!strcmp(dmn
->name
, special
))
1036 "Must specify one or more daemons to monitor.\n\n");
1039 if (!watch_only
&& !gs
.special
) {
1040 fprintf(stderr
, "\"%s\" daemon must be in daemon lists\n\n",
1045 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
1046 snprintf(p
, alldaemons
+ sizeof(alldaemons
) - p
, "%s%s",
1047 (p
== alldaemons
) ? "" : " ", dmn
->name
);
1050 zlog_notice("%s %s watching [%s]%s", progname
, FRR_VERSION
, alldaemons
,
1051 watch_only
? ", monitor mode" : "");
1054 struct zebra_privs_t watchfrr_privs
= {
1056 .vty_group
= VTY_GROUP
,
1060 static struct quagga_signal_t watchfrr_signals
[] = {
1071 .handler
= sigchild
,
1075 FRR_DAEMON_INFO(watchfrr
, WATCHFRR
,
1076 .flags
= FRR_NO_PRIVSEP
| FRR_NO_TCPVTY
| FRR_LIMITED_CLI
1077 | FRR_NO_CFG_PID_DRY
| FRR_NO_ZCLIENT
1080 .printhelp
= printhelp
,
1081 .copyright
= "Copyright 2004 Andrew J. Schorr",
1083 .signals
= watchfrr_signals
,
1084 .n_signals
= array_size(watchfrr_signals
),
1086 .privs
= &watchfrr_privs
, )
1088 #define DEPRECATED_OPTIONS "aAezR:"
1090 int main(int argc
, char **argv
)
1093 const char *blankstr
= NULL
;
1095 frr_preinit(&watchfrr_di
, argc
, argv
);
1096 progname
= watchfrr_di
.progname
;
1098 frr_opt_add("b:dk:l:i:p:r:S:s:t:T:" DEPRECATED_OPTIONS
, longopts
, "");
1100 gs
.restart
.name
= "all";
1101 while ((opt
= frr_getopt(argc
, argv
, NULL
)) != EOF
) {
1102 if (opt
&& opt
< 128 && strchr(DEPRECATED_OPTIONS
, opt
)) {
1104 "The -%c option no longer exists.\n"
1105 "Please refer to the watchfrr(8) man page.\n",
1120 if (!valid_command(optarg
)) {
1122 "Invalid kill command, must contain '%%s': %s\n",
1126 gs
.stop_command
= optarg
;
1130 if ((sscanf(optarg
, "%d%1s", &gs
.loglevel
, garbage
)
1132 || (gs
.loglevel
< LOG_EMERG
)) {
1134 "Invalid loglevel argument: %s\n",
1139 case OPTION_MINRESTART
: {
1141 if ((sscanf(optarg
, "%ld%1s", &gs
.min_restart_interval
,
1144 || (gs
.min_restart_interval
< 0)) {
1146 "Invalid min_restart_interval argument: %s\n",
1151 case OPTION_MAXRESTART
: {
1153 if ((sscanf(optarg
, "%ld%1s", &gs
.max_restart_interval
,
1156 || (gs
.max_restart_interval
< 0)) {
1158 "Invalid max_restart_interval argument: %s\n",
1166 if ((sscanf(optarg
, "%d%1s", &period
, garbage
) != 1)
1167 || (gs
.period
< 1)) {
1169 "Invalid interval argument: %s\n",
1173 gs
.period
= 1000 * period
;
1176 watchfrr_di
.pid_file
= optarg
;
1179 if (!valid_command(optarg
)) {
1181 "Invalid restart command, must contain '%%s': %s\n",
1185 gs
.restart_command
= optarg
;
1188 if (!valid_command(optarg
)) {
1190 "Invalid start command, must contain '%%s': %s\n",
1194 gs
.start_command
= optarg
;
1201 if ((sscanf(optarg
, "%ld%1s", &gs
.timeout
, garbage
)
1203 || (gs
.timeout
< 1)) {
1205 "Invalid timeout argument: %s\n",
1212 if ((sscanf(optarg
, "%ld%1s", &gs
.restart_timeout
,
1215 || (gs
.restart_timeout
< 1)) {
1217 "Invalid restart timeout argument: %s\n",
1223 fputs("Invalid option.\n", stderr
);
1229 && (gs
.start_command
|| gs
.stop_command
|| gs
.restart_command
)) {
1230 fputs("Options -r/-s/-k are not used when --dry is active.\n",
1234 && (!gs
.restart_command
|| !gs
.start_command
|| !gs
.stop_command
)) {
1236 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1241 if (gs
.restart_command
)
1242 gs
.restart_command
=
1243 translate_blanks(gs
.restart_command
, blankstr
);
1244 if (gs
.start_command
)
1246 translate_blanks(gs
.start_command
, blankstr
);
1247 if (gs
.stop_command
)
1249 translate_blanks(gs
.stop_command
, blankstr
);
1252 gs
.restart
.interval
= gs
.min_restart_interval
;
1254 master
= frr_init();
1255 watchfrr_error_init();
1256 watchfrr_init(argc
, argv
);
1257 watchfrr_vty_init();
1261 zlog_set_level(ZLOG_DEST_MONITOR
, ZLOG_DISABLED
);
1262 if (watchfrr_di
.daemon_mode
)
1263 zlog_set_level(ZLOG_DEST_SYSLOG
, MIN(gs
.loglevel
, LOG_DEBUG
));
1265 zlog_set_level(ZLOG_DEST_STDOUT
, MIN(gs
.loglevel
, LOG_DEBUG
));
1269 systemd_send_stopping();