2 * Monitor status of frr daemons and restart if necessary.
4 * Copyright (C) 2004 Andrew J. Schorr
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
26 #include <lib/version.h>
28 #include "memory_vty.h"
40 #define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
43 /* Macros to help randomize timers. */
44 #define JITTER(X) ((random() % ((X)+1))-((X)/2))
45 #define FUZZY(X) ((X)+JITTER((X)/20))
47 #define DEFAULT_PERIOD 5
48 #define DEFAULT_TIMEOUT 10
49 #define DEFAULT_RESTART_TIMEOUT 20
50 #define DEFAULT_LOGLEVEL LOG_INFO
51 #define DEFAULT_MIN_RESTART 60
52 #define DEFAULT_MAX_RESTART 600
53 #ifdef PATH_WATCHFRR_PID
54 #define DEFAULT_PIDFILE PATH_WATCHFRR_PID
56 #define DEFAULT_PIDFILE STATEDIR "/watchfrr.pid"
59 #define VTYDIR DAEMON_VTY_DIR
61 #define VTYDIR STATEDIR
64 #define PING_TOKEN "PING"
66 /* Needs to be global, referenced somewhere inside libfrr. */
67 struct thread_master
*master
;
72 MODE_SEPARATE_RESTART
,
73 MODE_PHASED_ZEBRA_RESTART
,
74 MODE_PHASED_ALL_RESTART
77 static const char *mode_str
[] = {
80 "individual daemon restart",
81 "phased zebra restart",
82 "phased global restart for any failure",
89 PHASE_ZEBRA_RESTART_PENDING
,
90 PHASE_WAITING_ZEBRA_UP
93 static const char *phase_str
[] = {
96 "Waiting for other daemons to come down",
97 "Zebra restart job running",
98 "Waiting for zebra to come up",
102 #define PHASE_TIMEOUT (3*gs.restart_timeout)
104 struct restart_info
{
110 struct thread
*t_kill
;
114 static struct global_state
{
116 restart_phase_t phase
;
117 struct thread
*t_phase_hanging
;
121 long restart_timeout
;
122 long min_restart_interval
;
123 long max_restart_interval
;
125 struct daemon
*daemons
;
126 const char *restart_command
;
127 const char *start_command
;
128 const char *stop_command
;
129 struct restart_info restart
;
130 int unresponsive_restart
;
132 struct daemon
*special
; /* points to zebra when doing phased restart */
135 int numdown
; /* # of daemons that are not UP or UNRESPONSIVE */
137 .mode
= MODE_MONITOR
,
140 .period
= 1000 * DEFAULT_PERIOD
,
141 .timeout
= DEFAULT_TIMEOUT
,
142 .restart_timeout
= DEFAULT_RESTART_TIMEOUT
,
143 .loglevel
= DEFAULT_LOGLEVEL
,
144 .min_restart_interval
= DEFAULT_MIN_RESTART
,
145 .max_restart_interval
= DEFAULT_MAX_RESTART
,
158 (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
160 static const char *state_str
[] = {
161 "Init", "Down", "Connecting", "Up", "Unresponsive",
166 daemon_state_t state
;
168 struct timeval echo_sent
;
170 struct thread
*t_wakeup
;
171 struct thread
*t_read
;
172 struct thread
*t_write
;
174 struct restart_info restart
;
177 #define OPTION_MINRESTART 2000
178 #define OPTION_MAXRESTART 2001
180 static const struct option longopts
[] = {
181 {"daemon", no_argument
, NULL
, 'd'},
182 {"statedir", required_argument
, NULL
, 'S'},
183 {"no-echo", no_argument
, NULL
, 'e'},
184 {"loglevel", required_argument
, NULL
, 'l'},
185 {"interval", required_argument
, NULL
, 'i'},
186 {"timeout", required_argument
, NULL
, 't'},
187 {"restart-timeout", required_argument
, NULL
, 'T'},
188 {"restart", required_argument
, NULL
, 'r'},
189 {"start-command", required_argument
, NULL
, 's'},
190 {"kill-command", required_argument
, NULL
, 'k'},
191 {"restart-all", required_argument
, NULL
, 'R'},
192 {"all-restart", no_argument
, NULL
, 'a'},
193 {"always-all-restart", no_argument
, NULL
, 'A'},
194 {"unresponsive-restart", no_argument
, NULL
, 'z'},
195 {"min-restart-interval", required_argument
, NULL
, OPTION_MINRESTART
},
196 {"max-restart-interval", required_argument
, NULL
, OPTION_MAXRESTART
},
197 {"pid-file", required_argument
, NULL
, 'p'},
198 {"blank-string", required_argument
, NULL
, 'b'},
199 {"help", no_argument
, NULL
, 'h'},
200 {"version", no_argument
, NULL
, 'v'},
203 static int try_connect(struct daemon
*dmn
);
204 static int wakeup_send_echo(struct thread
*t_wakeup
);
205 static void try_restart(struct daemon
*dmn
);
206 static void phase_check(void);
208 static const char *progname
;
209 static void printhelp(FILE *target
)
212 "Usage : %s [OPTION...] <daemon name> ...\n\n\
213 Watchdog program to monitor status of frr daemons and try to restart\n\
214 them if they are down or unresponsive. It determines whether a daemon is\n\
215 up based on whether it can connect to the daemon's vty unix stream socket.\n\
216 It then repeatedly sends echo commands over that socket to determine whether\n\
217 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
218 on the socket connection and know immediately that the daemon is down.\n\n\
219 The daemons to be monitored should be listed on the command line.\n\n\
220 This program can run in one of 5 modes:\n\n\
222 Just monitor and report on status changes. Example:\n\
223 %s -d zebra ospfd bgpd\n\n\
225 Whenever any daemon hangs or crashes, use the given command to restart\n\
226 them all. Example:\n\
228 -R '/sbin/service zebra restart; /sbin/service ospfd restart' \\\n\
231 When any single daemon hangs or crashes, restart only the daemon that's\n\
232 in trouble using the supplied restart command. Example:\n\
233 %s -dz -r '/sbin/service %%s restart' zebra ospfd bgpd\n\n\
235 The same as the previous mode, except that there is special treatment when\n\
236 the zebra daemon is in trouble. In that case, a phased restart approach\n\
237 is used: 1. stop all other daemons; 2. restart zebra; 3. start the other\n\
239 %s -adz -r '/sbin/service %%s restart' \\\n\
240 -s '/sbin/service %%s start' \\\n\
241 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
243 This is the same as the previous mode, except that the phased restart\n\
244 procedure is used whenever any of the daemons hangs or crashes. Example:\n\
245 %s -Adz -r '/sbin/service %%s restart' \\\n\
246 -s '/sbin/service %%s start' \\\n\
247 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
248 As of this writing, it is believed that mode 2 [%s]\n\
249 is not safe, and mode 3 [%s] may not be safe with some of the\n\
250 routing daemons.\n\n\
251 In order to avoid attempting to restart the daemons in a fast loop,\n\
252 the -m and -M options allow you to control the minimum delay between\n\
253 restart commands. The minimum restart delay is recalculated each time\n\
254 a restart is attempted: if the time since the last restart attempt exceeds\n\
255 twice the -M value, then the restart delay is set to the -m value.\n\
256 Otherwise, the interval is doubled (but capped at the -M value).\n\n",
257 progname
, mode_str
[0], progname
, mode_str
[1], progname
,
258 mode_str
[2], progname
, mode_str
[3], progname
, mode_str
[4],
259 progname
, mode_str
[2], mode_str
[3]);
263 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
264 to syslog instead of stdout.\n\
265 -S, --statedir Set the vty socket directory (default is %s)\n\
266 -e, --no-echo Do not ping the daemons to test responsiveness (this\n\
267 option is necessary if the daemons do not support the\n\
269 -l, --loglevel Set the logging level (default is %d).\n\
270 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
271 but it can be set higher than %d if extra-verbose debugging\n\
272 messages are desired.\n\
273 --min-restart-interval\n\
274 Set the minimum seconds to wait between invocations of daemon\n\
275 restart commands (default is %d).\n\
276 --max-restart-interval\n\
277 Set the maximum seconds to wait between invocations of daemon\n\
278 restart commands (default is %d).\n\
279 -i, --interval Set the status polling interval in seconds (default is %d)\n\
280 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
281 -T, --restart-timeout\n\
282 Set the restart (kill) timeout in seconds (default is %d).\n\
283 If any background jobs are still running after this much\n\
284 time has elapsed, they will be killed.\n\
285 -r, --restart Supply a Bourne shell command to use to restart a single\n\
286 daemon. The command string should include '%%s' where the\n\
287 name of the daemon should be substituted.\n\
288 Note that -r and -R are incompatible.\n\
289 -s, --start-command\n\
290 Supply a Bourne shell to command to use to start a single\n\
291 daemon. The command string should include '%%s' where the\n\
292 name of the daemon should be substituted.\n\
293 -k, --kill-command\n\
294 Supply a Bourne shell to command to use to stop a single\n\
295 daemon. The command string should include '%%s' where the\n\
296 name of the daemon should be substituted.\n\
298 When one or more daemons is down, try to restart everything\n\
299 using the Bourne shell command supplied as the argument.\n\
300 Note that -r and -R are incompatible.\n\
301 -z, --unresponsive-restart\n\
302 When a daemon is unresponsive, treat it as being down for\n\
305 When zebra hangs or crashes, restart all daemons using\n\
306 this phased approach: 1. stop all other daemons; 2. restart\n\
307 zebra; 3. start other daemons. Requires -r, -s, and -k.\n\
308 -A, --always-all-restart\n\
309 When any daemon (not just zebra) hangs or crashes, use the\n\
310 same phased restart mechanism described above for -a.\n\
311 Requires -r, -s, and -k.\n\
312 -p, --pid-file Set process identifier file name\n\
314 -b, --blank-string\n\
315 When the supplied argument string is found in any of the\n\
316 various shell command arguments (-r, -s, -k, or -R), replace\n\
317 it with a space. This is an ugly hack to circumvent problems\n\
318 passing command-line arguments with embedded spaces.\n\
319 -v, --version Print program version\n\
320 -h, --help Display this help and exit\n",
321 VTYDIR
, DEFAULT_LOGLEVEL
, LOG_EMERG
, LOG_DEBUG
, LOG_DEBUG
,
322 DEFAULT_MIN_RESTART
, DEFAULT_MAX_RESTART
, DEFAULT_PERIOD
,
323 DEFAULT_TIMEOUT
, DEFAULT_RESTART_TIMEOUT
, DEFAULT_PIDFILE
);
326 static pid_t
run_background(char *shell_cmd
)
330 switch (child
= fork()) {
332 zlog_err("fork failed, cannot run command [%s]: %s", shell_cmd
,
333 safe_strerror(errno
));
337 /* Use separate process group so child processes can be killed
339 if (setpgid(0, 0) < 0)
340 zlog_warn("warning: setpgid(0,0) failed: %s",
341 safe_strerror(errno
));
345 char *const argv
[4] = {shell
, dashc
, shell_cmd
, NULL
};
346 execv("/bin/sh", argv
);
347 zlog_err("execv(/bin/sh -c '%s') failed: %s", shell_cmd
,
348 safe_strerror(errno
));
352 /* Parent process: we will reap the child later. */
353 zlog_err("Forked background command [pid %d]: %s", (int)child
,
/*
 * Compute the wall-clock time elapsed since *start_time.
 *
 * result:     receives (now - *start_time); on return tv_usec is normalized
 *             into the range [0, 1000000).
 * start_time: an earlier timestamp obtained from gettimeofday().
 *
 * Returns result, so the call can be used directly inside an expression.
 */
static struct timeval *time_elapsed(struct timeval *result,
				    const struct timeval *start_time)
{
	gettimeofday(result, NULL);
	result->tv_sec -= start_time->tv_sec;
	result->tv_usec -= start_time->tv_usec;

	/* Borrow whole seconds until the microsecond field is non-negative. */
	while (result->tv_usec < 0) {
		result->tv_usec += 1000000L;
		result->tv_sec--;
	}
	return result;
}
372 static int restart_kill(struct thread
*t_kill
)
374 struct restart_info
*restart
= THREAD_ARG(t_kill
);
375 struct timeval delay
;
377 time_elapsed(&delay
, &restart
->time
);
379 "Warning: %s %s child process %d still running after "
380 "%ld seconds, sending signal %d",
381 restart
->what
, restart
->name
, (int)restart
->pid
,
382 (long)delay
.tv_sec
, (restart
->kills
? SIGKILL
: SIGTERM
));
383 kill(-restart
->pid
, (restart
->kills
? SIGKILL
: SIGTERM
));
385 restart
->t_kill
= NULL
;
386 thread_add_timer(master
, restart_kill
, restart
, gs
.restart_timeout
,
391 static struct restart_info
*find_child(pid_t child
)
393 if (gs
.mode
== MODE_GLOBAL_RESTART
) {
394 if (gs
.restart
.pid
== child
)
398 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
399 if (dmn
->restart
.pid
== child
)
400 return &dmn
->restart
;
406 static void sigchild(void)
412 struct restart_info
*restart
;
414 switch (child
= waitpid(-1, &status
, WNOHANG
)) {
416 zlog_err("waitpid failed: %s", safe_strerror(errno
));
419 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
423 if (child
== integrated_write_pid
) {
424 integrated_write_sigchld(status
);
428 if ((restart
= find_child(child
)) != NULL
) {
429 name
= restart
->name
;
430 what
= restart
->what
;
433 thread_cancel(restart
->t_kill
);
434 restart
->t_kill
= NULL
;
435 /* Update restart time to reflect the time the command
437 gettimeofday(&restart
->time
, NULL
);
440 "waitpid returned status for an unknown child process %d",
445 if (WIFSTOPPED(status
))
446 zlog_warn("warning: %s %s process %d is stopped", what
, name
,
448 else if (WIFSIGNALED(status
))
449 zlog_warn("%s %s process %d terminated due to signal %d", what
,
450 name
, (int)child
, WTERMSIG(status
));
451 else if (WIFEXITED(status
)) {
452 if (WEXITSTATUS(status
) != 0)
454 "%s %s process %d exited with non-zero status %d",
455 what
, name
, (int)child
, WEXITSTATUS(status
));
457 zlog_debug("%s %s process %d exited normally", what
,
460 zlog_err("cannot interpret %s %s process %d wait status 0x%x",
461 what
, name
, (int)child
, status
);
465 static int run_job(struct restart_info
*restart
, const char *cmdtype
,
466 const char *command
, int force
, int update_interval
)
468 struct timeval delay
;
470 if (gs
.loglevel
> LOG_DEBUG
+ 1)
471 zlog_debug("attempting to %s %s", cmdtype
, restart
->name
);
474 if (gs
.loglevel
> LOG_DEBUG
+ 1)
476 "cannot %s %s, previous pid %d still running",
477 cmdtype
, restart
->name
, (int)restart
->pid
);
481 /* Note: time_elapsed test must come before the force test, since we
483 to make sure that delay is initialized for use below in updating the
485 if ((time_elapsed(&delay
, &restart
->time
)->tv_sec
< restart
->interval
)
487 if (gs
.loglevel
> LOG_DEBUG
+ 1)
490 "elapsed time %ld < retry interval %ld",
491 cmdtype
, restart
->name
, (long)delay
.tv_sec
,
496 gettimeofday(&restart
->time
, NULL
);
499 char cmd
[strlen(command
) + strlen(restart
->name
) + 1];
500 snprintf(cmd
, sizeof(cmd
), command
, restart
->name
);
501 if ((restart
->pid
= run_background(cmd
)) > 0) {
502 restart
->t_kill
= NULL
;
503 thread_add_timer(master
, restart_kill
, restart
,
504 gs
.restart_timeout
, &restart
->t_kill
);
505 restart
->what
= cmdtype
;
511 /* Calculate the new restart interval. */
512 if (update_interval
) {
513 if (delay
.tv_sec
> 2 * gs
.max_restart_interval
)
514 restart
->interval
= gs
.min_restart_interval
;
515 else if ((restart
->interval
*= 2) > gs
.max_restart_interval
)
516 restart
->interval
= gs
.max_restart_interval
;
517 if (gs
.loglevel
> LOG_DEBUG
+ 1)
518 zlog_debug("restart %s interval is now %ld",
519 restart
->name
, restart
->interval
);
524 #define SET_READ_HANDLER(DMN) \
526 (DMN)->t_read = NULL; \
527 thread_add_read(master, handle_read, (DMN), (DMN)->fd, \
531 #define SET_WAKEUP_DOWN(DMN) \
533 (DMN)->t_wakeup = NULL; \
534 thread_add_timer_msec(master, wakeup_down, (DMN), \
535 FUZZY(gs.period), &(DMN)->t_wakeup); \
538 #define SET_WAKEUP_UNRESPONSIVE(DMN) \
540 (DMN)->t_wakeup = NULL; \
541 thread_add_timer_msec(master, wakeup_unresponsive, (DMN), \
542 FUZZY(gs.period), &(DMN)->t_wakeup); \
545 #define SET_WAKEUP_ECHO(DMN) \
547 (DMN)->t_wakeup = NULL; \
548 thread_add_timer_msec(master, wakeup_send_echo, (DMN), \
549 FUZZY(gs.period), &(DMN)->t_wakeup); \
552 static int wakeup_down(struct thread
*t_wakeup
)
554 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
556 dmn
->t_wakeup
= NULL
;
557 if (try_connect(dmn
) < 0)
558 SET_WAKEUP_DOWN(dmn
);
559 if ((dmn
->connect_tries
> 1) && (dmn
->state
!= DAEMON_UP
))
564 static int wakeup_init(struct thread
*t_wakeup
)
566 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
568 dmn
->t_wakeup
= NULL
;
569 if (try_connect(dmn
) < 0) {
570 SET_WAKEUP_DOWN(dmn
);
571 zlog_err("%s state -> down : initial connection attempt failed",
573 dmn
->state
= DAEMON_DOWN
;
578 static void daemon_down(struct daemon
*dmn
, const char *why
)
580 if (IS_UP(dmn
) || (dmn
->state
== DAEMON_INIT
))
581 zlog_err("%s state -> down : %s", dmn
->name
, why
);
582 else if (gs
.loglevel
> LOG_DEBUG
)
583 zlog_debug("%s still down : %s", dmn
->name
, why
);
586 dmn
->state
= DAEMON_DOWN
;
591 THREAD_OFF(dmn
->t_read
);
592 THREAD_OFF(dmn
->t_write
);
593 THREAD_OFF(dmn
->t_wakeup
);
594 if (try_connect(dmn
) < 0)
595 SET_WAKEUP_DOWN(dmn
);
599 static int handle_read(struct thread
*t_read
)
601 struct daemon
*dmn
= THREAD_ARG(t_read
);
602 static const char resp
[sizeof(PING_TOKEN
) + 4] = PING_TOKEN
"\n";
603 char buf
[sizeof(resp
) + 100];
605 struct timeval delay
;
608 if ((rc
= read(dmn
->fd
, buf
, sizeof(buf
))) < 0) {
611 if (ERRNO_IO_RETRY(errno
)) {
612 /* Pretend it never happened. */
613 SET_READ_HANDLER(dmn
);
616 snprintf(why
, sizeof(why
), "unexpected read error: %s",
617 safe_strerror(errno
));
618 daemon_down(dmn
, why
);
622 daemon_down(dmn
, "read returned EOF");
625 if (!dmn
->echo_sent
.tv_sec
) {
626 char why
[sizeof(buf
) + 100];
627 snprintf(why
, sizeof(why
),
628 "unexpected read returns %d bytes: %.*s", (int)rc
,
630 daemon_down(dmn
, why
);
634 /* We are expecting an echo response: is there any chance that the
635 response would not be returned entirely in the first read? That
636 seems inconceivable... */
637 if ((rc
!= sizeof(resp
)) || memcmp(buf
, resp
, sizeof(resp
))) {
638 char why
[100 + sizeof(buf
)];
639 snprintf(why
, sizeof(why
),
640 "read returned bad echo response of %d bytes "
641 "(expecting %u): %.*s",
642 (int)rc
, (u_int
)sizeof(resp
), (int)rc
, buf
);
643 daemon_down(dmn
, why
);
647 time_elapsed(&delay
, &dmn
->echo_sent
);
648 dmn
->echo_sent
.tv_sec
= 0;
649 if (dmn
->state
== DAEMON_UNRESPONSIVE
) {
650 if (delay
.tv_sec
< gs
.timeout
) {
651 dmn
->state
= DAEMON_UP
;
653 "%s state -> up : echo response received after %ld.%06ld "
655 dmn
->name
, (long)delay
.tv_sec
,
656 (long)delay
.tv_usec
);
659 "%s: slow echo response finally received after %ld.%06ld "
661 dmn
->name
, (long)delay
.tv_sec
,
662 (long)delay
.tv_usec
);
663 } else if (gs
.loglevel
> LOG_DEBUG
+ 1)
664 zlog_debug("%s: echo response received after %ld.%06ld seconds",
665 dmn
->name
, (long)delay
.tv_sec
, (long)delay
.tv_usec
);
667 SET_READ_HANDLER(dmn
);
669 thread_cancel(dmn
->t_wakeup
);
670 SET_WAKEUP_ECHO(dmn
);
676 * Wait until we see that every monitored daemon is up before
677 * notifying systemd that we are ready
679 static void daemon_send_ready(void)
682 if (!sent
&& gs
.numdown
== 0) {
685 fp
= fopen(DAEMON_VTY_DIR
"/watchfrr.started", "w");
688 "Watchfrr: Notifying Systemd we are up and running");
689 systemd_send_started(master
, 0);
694 static void daemon_up(struct daemon
*dmn
, const char *why
)
696 dmn
->state
= DAEMON_UP
;
698 dmn
->connect_tries
= 0;
699 zlog_notice("%s state -> up : %s", dmn
->name
, why
);
702 SET_WAKEUP_ECHO(dmn
);
706 static int check_connect(struct thread
*t_write
)
708 struct daemon
*dmn
= THREAD_ARG(t_write
);
710 socklen_t reslen
= sizeof(sockerr
);
713 if (getsockopt(dmn
->fd
, SOL_SOCKET
, SO_ERROR
, (char *)&sockerr
, &reslen
)
715 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn
->name
,
716 safe_strerror(errno
));
718 "getsockopt failed checking connection success");
721 if ((reslen
== sizeof(sockerr
)) && sockerr
) {
725 "getsockopt reports that connection attempt failed: %s",
726 safe_strerror(sockerr
));
727 daemon_down(dmn
, why
);
731 daemon_up(dmn
, "delayed connect succeeded");
735 static int wakeup_connect_hanging(struct thread
*t_wakeup
)
737 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
740 dmn
->t_wakeup
= NULL
;
741 snprintf(why
, sizeof(why
),
742 "connection attempt timed out after %ld seconds", gs
.timeout
);
743 daemon_down(dmn
, why
);
747 /* Making connection to protocol daemon. */
748 static int try_connect(struct daemon
*dmn
)
751 struct sockaddr_un addr
;
754 if (gs
.loglevel
> LOG_DEBUG
+ 1)
755 zlog_debug("%s: attempting to connect", dmn
->name
);
756 dmn
->connect_tries
++;
758 memset(&addr
, 0, sizeof(struct sockaddr_un
));
759 addr
.sun_family
= AF_UNIX
;
760 snprintf(addr
.sun_path
, sizeof(addr
.sun_path
), "%s/%s.vty", gs
.vtydir
,
762 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
763 len
= addr
.sun_len
= SUN_LEN(&addr
);
765 len
= sizeof(addr
.sun_family
) + strlen(addr
.sun_path
);
766 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
768 /* Quick check to see if we might succeed before we go to the trouble
769 of creating a socket. */
770 if (access(addr
.sun_path
, W_OK
) < 0) {
772 zlog_err("%s: access to socket %s denied: %s",
773 dmn
->name
, addr
.sun_path
,
774 safe_strerror(errno
));
778 if ((sock
= socket(AF_UNIX
, SOCK_STREAM
, 0)) < 0) {
779 zlog_err("%s(%s): cannot make socket: %s", __func__
,
780 addr
.sun_path
, safe_strerror(errno
));
784 if (set_nonblocking(sock
) < 0 || set_cloexec(sock
) < 0) {
785 zlog_err("%s(%s): set_nonblocking/cloexec(%d) failed", __func__
,
786 addr
.sun_path
, sock
);
791 if (connect(sock
, (struct sockaddr
*)&addr
, len
) < 0) {
792 if ((errno
!= EINPROGRESS
) && (errno
!= EWOULDBLOCK
)) {
793 if (gs
.loglevel
> LOG_DEBUG
)
794 zlog_debug("%s(%s): connect failed: %s",
795 __func__
, addr
.sun_path
,
796 safe_strerror(errno
));
800 if (gs
.loglevel
> LOG_DEBUG
)
801 zlog_debug("%s: connection in progress", dmn
->name
);
802 dmn
->state
= DAEMON_CONNECTING
;
805 thread_add_write(master
, check_connect
, dmn
, dmn
->fd
,
807 dmn
->t_wakeup
= NULL
;
808 thread_add_timer(master
, wakeup_connect_hanging
, dmn
,
809 gs
.timeout
, &dmn
->t_wakeup
);
810 SET_READ_HANDLER(dmn
);
815 SET_READ_HANDLER(dmn
);
816 daemon_up(dmn
, "connect succeeded");
820 static int phase_hanging(struct thread
*t_hanging
)
822 gs
.t_phase_hanging
= NULL
;
823 zlog_err("Phase [%s] hanging for %ld seconds, aborting phased restart",
824 phase_str
[gs
.phase
], PHASE_TIMEOUT
);
825 gs
.phase
= PHASE_NONE
;
829 static void set_phase(restart_phase_t new_phase
)
831 gs
.phase
= new_phase
;
832 if (gs
.t_phase_hanging
)
833 thread_cancel(gs
.t_phase_hanging
);
834 gs
.t_phase_hanging
= NULL
;
835 thread_add_timer(master
, phase_hanging
, NULL
, PHASE_TIMEOUT
,
836 &gs
.t_phase_hanging
);
839 static void phase_check(void)
844 case PHASE_STOPS_PENDING
:
848 "Phased restart: all routing daemon stop jobs have completed.");
849 set_phase(PHASE_WAITING_DOWN
);
852 case PHASE_WAITING_DOWN
:
853 if (gs
.numdown
+ IS_UP(gs
.special
) < gs
.numdaemons
)
855 zlog_info("Phased restart: all routing daemons now down.");
856 run_job(&gs
.special
->restart
, "restart", gs
.restart_command
, 1,
858 set_phase(PHASE_ZEBRA_RESTART_PENDING
);
861 case PHASE_ZEBRA_RESTART_PENDING
:
862 if (gs
.special
->restart
.pid
)
864 zlog_info("Phased restart: %s restart job completed.",
866 set_phase(PHASE_WAITING_ZEBRA_UP
);
869 case PHASE_WAITING_ZEBRA_UP
:
870 if (!IS_UP(gs
.special
))
872 zlog_info("Phased restart: %s is now up.", gs
.special
->name
);
875 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
876 if (dmn
!= gs
.special
)
877 run_job(&dmn
->restart
, "start",
878 gs
.start_command
, 1, 0);
881 gs
.phase
= PHASE_NONE
;
882 THREAD_OFF(gs
.t_phase_hanging
);
883 zlog_notice("Phased global restart has completed.");
888 static void try_restart(struct daemon
*dmn
)
893 case MODE_GLOBAL_RESTART
:
894 run_job(&gs
.restart
, "restart", gs
.restart_command
, 0, 1);
896 case MODE_SEPARATE_RESTART
:
897 run_job(&dmn
->restart
, "restart", gs
.restart_command
, 0, 1);
899 case MODE_PHASED_ZEBRA_RESTART
:
900 if (dmn
!= gs
.special
) {
901 if ((gs
.special
->state
== DAEMON_UP
)
902 && (gs
.phase
== PHASE_NONE
))
903 run_job(&dmn
->restart
, "restart",
904 gs
.restart_command
, 0, 1);
907 "%s: postponing restart attempt because master %s daemon "
908 "not up [%s], or phased restart in progress",
909 dmn
->name
, gs
.special
->name
,
910 state_str
[gs
.special
->state
]);
915 case MODE_PHASED_ALL_RESTART
:
916 if ((gs
.phase
!= PHASE_NONE
) || gs
.numpids
) {
917 if (gs
.loglevel
> LOG_DEBUG
+ 1)
919 "postponing phased global restart: restart already in "
920 "progress [%s], or outstanding child processes [%d]",
921 phase_str
[gs
.phase
], gs
.numpids
);
924 /* Is it too soon for a restart? */
926 struct timeval delay
;
927 if (time_elapsed(&delay
, &gs
.special
->restart
.time
)
929 < gs
.special
->restart
.interval
) {
930 if (gs
.loglevel
> LOG_DEBUG
+ 1)
932 "postponing phased global restart: "
933 "elapsed time %ld < retry interval %ld",
935 gs
.special
->restart
.interval
);
939 run_job(&gs
.restart
, "restart", gs
.restart_command
, 0, 1);
942 zlog_err("error: unknown restart mode %d", gs
.mode
);
947 static int wakeup_unresponsive(struct thread
*t_wakeup
)
949 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
951 dmn
->t_wakeup
= NULL
;
952 if (dmn
->state
!= DAEMON_UNRESPONSIVE
)
954 "%s: no longer unresponsive (now %s), "
955 "wakeup should have been cancelled!",
956 dmn
->name
, state_str
[dmn
->state
]);
958 SET_WAKEUP_UNRESPONSIVE(dmn
);
964 static int wakeup_no_answer(struct thread
*t_wakeup
)
966 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
968 dmn
->t_wakeup
= NULL
;
969 dmn
->state
= DAEMON_UNRESPONSIVE
;
971 "%s state -> unresponsive : no response yet to ping "
972 "sent %ld seconds ago",
973 dmn
->name
, gs
.timeout
);
974 if (gs
.unresponsive_restart
) {
975 SET_WAKEUP_UNRESPONSIVE(dmn
);
981 static int wakeup_send_echo(struct thread
*t_wakeup
)
983 static const char echocmd
[] = "echo " PING_TOKEN
;
985 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
987 dmn
->t_wakeup
= NULL
;
988 if (((rc
= write(dmn
->fd
, echocmd
, sizeof(echocmd
))) < 0)
989 || ((size_t)rc
!= sizeof(echocmd
))) {
990 char why
[100 + sizeof(echocmd
)];
991 snprintf(why
, sizeof(why
),
992 "write '%s' returned %d instead of %u", echocmd
,
993 (int)rc
, (u_int
)sizeof(echocmd
));
994 daemon_down(dmn
, why
);
996 gettimeofday(&dmn
->echo_sent
, NULL
);
997 dmn
->t_wakeup
= NULL
;
998 thread_add_timer(master
, wakeup_no_answer
, dmn
, gs
.timeout
,
1004 static void sigint(void)
1006 zlog_notice("Terminating on signal");
1007 systemd_send_stopping();
/*
 * Check that a shell command template is usable as a format string taking
 * exactly one string argument.
 *
 * cmd: NUL-terminated command template.
 *
 * Returns 1 when the first '%' in cmd is immediately followed by 's' and no
 * further '%' appears after it; returns 0 otherwise (including when cmd
 * contains no '%' at all).
 */
static int valid_command(const char *cmd)
{
	const char *pct = strchr(cmd, '%');

	/* Need a placeholder, and it must be exactly "%s". */
	if (pct == NULL || pct[1] != 's')
		return 0;

	/* Any additional conversion would consume a missing argument. */
	return strchr(pct + 1, '%') == NULL;
}
/*
 * This is an ugly hack to circumvent problems with passing command-line
 * arguments that contain spaces.  The fix is to use a configuration file.
 *
 * cmd:      NUL-terminated command string to translate.
 * blankstr: NUL-terminated token; each occurrence in cmd is replaced by a
 *           single space.
 *
 * Returns a newly allocated string; the caller owns it.  An empty blankstr,
 * or a blankstr that is itself a single space, leaves the copy unchanged.
 * On allocation failure the process reports the error and exits.
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	size_t bslen = strlen(blankstr);
	size_t cmdlen = strlen(cmd);
	char *res = malloc(cmdlen + 1);
	char *p;

	if (!res) {
		perror("malloc");
		exit(1);
	}
	memcpy(res, cmd, cmdlen + 1);

	/*
	 * Guard the degenerate tokens: "" matches at every position (the
	 * loop below would never end and would grow the string past its
	 * allocation), and " " would be replaced by itself forever.
	 */
	if (bslen == 0 || (bslen == 1 && blankstr[0] == ' '))
		return res;

	while ((p = strstr(res, blankstr)) != NULL) {
		*p = ' ';
		/* Close the gap left by the rest of a multi-byte token. */
		if (bslen != 1)
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
	}
	return res;
}
1039 struct zebra_privs_t watchfrr_privs
= {
1041 .vty_group
= VTY_GROUP
,
1045 static struct quagga_signal_t watchfrr_signals
[] = {
1056 .handler
= sigchild
,
1060 FRR_DAEMON_INFO(watchfrr
, WATCHFRR
,
1061 .flags
= FRR_NO_PRIVSEP
| FRR_NO_TCPVTY
| FRR_LIMITED_CLI
1062 | FRR_NO_CFG_PID_DRY
| FRR_NO_ZCLIENT
,
1064 .printhelp
= printhelp
,
1065 .copyright
= "Copyright 2004 Andrew J. Schorr",
1067 .signals
= watchfrr_signals
,
1068 .n_signals
= array_size(watchfrr_signals
),
1070 .privs
= &watchfrr_privs
, )
1072 int main(int argc
, char **argv
)
1075 const char *pidfile
= DEFAULT_PIDFILE
;
1076 const char *special
= "zebra";
1077 const char *blankstr
= NULL
;
1079 frr_preinit(&watchfrr_di
, argc
, argv
);
1080 progname
= watchfrr_di
.progname
;
1082 frr_opt_add("aAb:dek:l:i:p:r:R:S:s:t:T:z", longopts
, "");
1084 gs
.restart
.name
= "all";
1085 while ((opt
= frr_getopt(argc
, argv
, NULL
)) != EOF
) {
1090 if ((gs
.mode
!= MODE_MONITOR
)
1091 && (gs
.mode
!= MODE_SEPARATE_RESTART
)) {
1092 fputs("Ambiguous operating mode selected.\n",
1096 gs
.mode
= MODE_PHASED_ZEBRA_RESTART
;
1099 if ((gs
.mode
!= MODE_MONITOR
)
1100 && (gs
.mode
!= MODE_SEPARATE_RESTART
)) {
1101 fputs("Ambiguous operating mode selected.\n",
1105 gs
.mode
= MODE_PHASED_ALL_RESTART
;
1114 if (!valid_command(optarg
)) {
1116 "Invalid kill command, must contain '%%s': %s\n",
1120 gs
.stop_command
= optarg
;
1124 if ((sscanf(optarg
, "%d%1s", &gs
.loglevel
, garbage
)
1126 || (gs
.loglevel
< LOG_EMERG
)) {
1128 "Invalid loglevel argument: %s\n",
1133 case OPTION_MINRESTART
: {
1135 if ((sscanf(optarg
, "%ld%1s", &gs
.min_restart_interval
,
1138 || (gs
.min_restart_interval
< 0)) {
1140 "Invalid min_restart_interval argument: %s\n",
1145 case OPTION_MAXRESTART
: {
1147 if ((sscanf(optarg
, "%ld%1s", &gs
.max_restart_interval
,
1150 || (gs
.max_restart_interval
< 0)) {
1152 "Invalid max_restart_interval argument: %s\n",
1160 if ((sscanf(optarg
, "%d%1s", &period
, garbage
) != 1)
1161 || (gs
.period
< 1)) {
1163 "Invalid interval argument: %s\n",
1167 gs
.period
= 1000 * period
;
1173 if ((gs
.mode
== MODE_GLOBAL_RESTART
)
1174 || (gs
.mode
== MODE_SEPARATE_RESTART
)) {
1175 fputs("Ambiguous operating mode selected.\n",
1179 if (!valid_command(optarg
)) {
1181 "Invalid restart command, must contain '%%s': %s\n",
1185 gs
.restart_command
= optarg
;
1186 if (gs
.mode
== MODE_MONITOR
)
1187 gs
.mode
= MODE_SEPARATE_RESTART
;
1190 if (gs
.mode
!= MODE_MONITOR
) {
1191 fputs("Ambiguous operating mode selected.\n",
1195 if (strchr(optarg
, '%')) {
1197 "Invalid restart-all arg, must not contain '%%s': %s\n",
1201 gs
.restart_command
= optarg
;
1202 gs
.mode
= MODE_GLOBAL_RESTART
;
1205 if (!valid_command(optarg
)) {
1207 "Invalid start command, must contain '%%s': %s\n",
1211 gs
.start_command
= optarg
;
1218 if ((sscanf(optarg
, "%ld%1s", &gs
.timeout
, garbage
)
1220 || (gs
.timeout
< 1)) {
1222 "Invalid timeout argument: %s\n",
1229 if ((sscanf(optarg
, "%ld%1s", &gs
.restart_timeout
,
1232 || (gs
.restart_timeout
< 1)) {
1234 "Invalid restart timeout argument: %s\n",
1240 gs
.unresponsive_restart
= 1;
1243 fputs("Invalid option.\n", stderr
);
1248 if (gs
.unresponsive_restart
&& (gs
.mode
== MODE_MONITOR
)) {
1249 fputs("Option -z requires a -r or -R restart option.\n",
1255 if (gs
.restart_command
|| gs
.start_command
|| gs
.stop_command
) {
1257 "No kill/(re)start commands needed for %s mode.\n",
1262 case MODE_GLOBAL_RESTART
:
1263 case MODE_SEPARATE_RESTART
:
1264 if (!gs
.restart_command
|| gs
.start_command
1265 || gs
.stop_command
) {
1267 "No start/kill commands needed in [%s] mode.\n",
1272 case MODE_PHASED_ZEBRA_RESTART
:
1273 case MODE_PHASED_ALL_RESTART
:
1274 if (!gs
.restart_command
|| !gs
.start_command
1275 || !gs
.stop_command
) {
1277 "Need start, kill, and restart commands in [%s] mode.\n",
1285 if (gs
.restart_command
)
1286 gs
.restart_command
=
1287 translate_blanks(gs
.restart_command
, blankstr
);
1288 if (gs
.start_command
)
1290 translate_blanks(gs
.start_command
, blankstr
);
1291 if (gs
.stop_command
)
1293 translate_blanks(gs
.stop_command
, blankstr
);
1296 gs
.restart
.interval
= gs
.min_restart_interval
;
1298 master
= frr_init();
1300 zlog_set_level(ZLOG_DEST_MONITOR
, ZLOG_DISABLED
);
1301 if (watchfrr_di
.daemon_mode
) {
1302 zlog_set_level(ZLOG_DEST_SYSLOG
, MIN(gs
.loglevel
, LOG_DEBUG
));
1303 if (daemon(0, 0) < 0) {
1304 fprintf(stderr
, "Watchfrr daemon failed: %s",
1309 zlog_set_level(ZLOG_DEST_STDOUT
, MIN(gs
.loglevel
, LOG_DEBUG
));
1311 watchfrr_vty_init();
1317 struct daemon
*tail
= NULL
;
1319 for (i
= optind
; i
< argc
; i
++) {
1322 if (!(dmn
= (struct daemon
*)calloc(1, sizeof(*dmn
)))) {
1323 fprintf(stderr
, "calloc(1,%u) failed: %s\n",
1324 (u_int
)sizeof(*dmn
),
1325 safe_strerror(errno
));
1328 dmn
->name
= dmn
->restart
.name
= argv
[i
];
1329 dmn
->state
= DAEMON_INIT
;
1333 dmn
->t_wakeup
= NULL
;
1334 thread_add_timer_msec(master
, wakeup_init
, dmn
,
1335 100 + (random() % 900),
1337 dmn
->restart
.interval
= gs
.min_restart_interval
;
1344 if (((gs
.mode
== MODE_PHASED_ZEBRA_RESTART
)
1345 || (gs
.mode
== MODE_PHASED_ALL_RESTART
))
1346 && !strcmp(dmn
->name
, special
))
1351 fputs("Must specify one or more daemons to monitor.\n", stderr
);
1354 if (((gs
.mode
== MODE_PHASED_ZEBRA_RESTART
)
1355 || (gs
.mode
== MODE_PHASED_ALL_RESTART
))
1358 "In mode [%s], but cannot find master daemon %s\n",
1359 mode_str
[gs
.mode
], special
);
1363 /* Make sure we're not already running. */
1364 pid_output(pidfile
);
1366 /* Announce which daemons are being monitored. */
1371 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
1372 len
+= strlen(dmn
->name
) + 1;
1378 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
1381 strcpy(p
, dmn
->name
);
1384 zlog_notice("%s %s watching [%s], mode [%s]", progname
,
1385 FRR_VERSION
, buf
, mode_str
[gs
.mode
]);
1390 struct thread thread
;
1392 while (thread_fetch(master
, &thread
))
1393 thread_call(&thread
);
1396 systemd_send_stopping();