2 * Monitor status of frr daemons and restart if necessary.
4 * Copyright (C) 2004 Andrew J. Schorr
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
26 #include <lib/version.h>
28 #include "memory_vty.h"
40 #define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
43 /* Macros to help randomize timers. */
44 #define JITTER(X) ((random() % ((X)+1))-((X)/2))
45 #define FUZZY(X) ((X)+JITTER((X)/20))
47 #define DEFAULT_PERIOD 5
48 #define DEFAULT_TIMEOUT 10
49 #define DEFAULT_RESTART_TIMEOUT 20
50 #define DEFAULT_LOGLEVEL LOG_INFO
51 #define DEFAULT_MIN_RESTART 60
52 #define DEFAULT_MAX_RESTART 600
53 #ifdef PATH_WATCHFRR_PID
54 #define DEFAULT_PIDFILE PATH_WATCHFRR_PID
56 #define DEFAULT_PIDFILE STATEDIR "/watchfrr.pid"
59 #define VTYDIR DAEMON_VTY_DIR
61 #define VTYDIR STATEDIR
64 #define PING_TOKEN "PING"
66 /* Needs to be global, referenced somewhere inside libfrr. */
67 struct thread_master
*master
;
72 MODE_SEPARATE_RESTART
,
73 MODE_PHASED_ZEBRA_RESTART
,
74 MODE_PHASED_ALL_RESTART
77 static const char *mode_str
[] = {
80 "individual daemon restart",
81 "phased zebra restart",
82 "phased global restart for any failure",
89 PHASE_ZEBRA_RESTART_PENDING
,
90 PHASE_WAITING_ZEBRA_UP
93 static const char *phase_str
[] = {
96 "Waiting for other daemons to come down",
97 "Zebra restart job running",
98 "Waiting for zebra to come up",
102 #define PHASE_TIMEOUT (3*gs.restart_timeout)
104 struct restart_info
{
110 struct thread
*t_kill
;
114 static struct global_state
{
116 restart_phase_t phase
;
117 struct thread
*t_phase_hanging
;
121 long restart_timeout
;
122 long min_restart_interval
;
123 long max_restart_interval
;
125 struct daemon
*daemons
;
126 const char *restart_command
;
127 const char *start_command
;
128 const char *stop_command
;
129 struct restart_info restart
;
130 int unresponsive_restart
;
132 struct daemon
*special
; /* points to zebra when doing phased restart */
135 int numdown
; /* # of daemons that are not UP or UNRESPONSIVE */
137 .mode
= MODE_MONITOR
,.phase
= PHASE_NONE
,.vtydir
= VTYDIR
,.period
=
138 1000 * DEFAULT_PERIOD
,.timeout
=
139 DEFAULT_TIMEOUT
,.restart_timeout
=
140 DEFAULT_RESTART_TIMEOUT
,.loglevel
=
141 DEFAULT_LOGLEVEL
,.min_restart_interval
=
142 DEFAULT_MIN_RESTART
,.max_restart_interval
=
143 DEFAULT_MAX_RESTART
,.do_ping
= 1,};
154 (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
156 static const char *state_str
[] = {
166 daemon_state_t state
;
168 struct timeval echo_sent
;
170 struct thread
*t_wakeup
;
171 struct thread
*t_read
;
172 struct thread
*t_write
;
174 struct restart_info restart
;
177 #define OPTION_MINRESTART 2000
178 #define OPTION_MAXRESTART 2001
180 static const struct option longopts
[] = {
181 {"daemon", no_argument
, NULL
, 'd'},
182 {"statedir", required_argument
, NULL
, 'S'},
183 {"no-echo", no_argument
, NULL
, 'e'},
184 {"loglevel", required_argument
, NULL
, 'l'},
185 {"interval", required_argument
, NULL
, 'i'},
186 {"timeout", required_argument
, NULL
, 't'},
187 {"restart-timeout", required_argument
, NULL
, 'T'},
188 {"restart", required_argument
, NULL
, 'r'},
189 {"start-command", required_argument
, NULL
, 's'},
190 {"kill-command", required_argument
, NULL
, 'k'},
191 {"restart-all", required_argument
, NULL
, 'R'},
192 {"all-restart", no_argument
, NULL
, 'a'},
193 {"always-all-restart", no_argument
, NULL
, 'A'},
194 {"unresponsive-restart", no_argument
, NULL
, 'z'},
195 {"min-restart-interval", required_argument
, NULL
, OPTION_MINRESTART
},
196 {"max-restart-interval", required_argument
, NULL
, OPTION_MAXRESTART
},
197 {"pid-file", required_argument
, NULL
, 'p'},
198 {"blank-string", required_argument
, NULL
, 'b'},
199 {"help", no_argument
, NULL
, 'h'},
200 {"version", no_argument
, NULL
, 'v'},
204 static int try_connect(struct daemon
*dmn
);
205 static int wakeup_send_echo(struct thread
*t_wakeup
);
206 static void try_restart(struct daemon
*dmn
);
207 static void phase_check(void);
209 static const char *progname
;
210 static void printhelp(FILE *target
)
212 fprintf(target
, "Usage : %s [OPTION...] <daemon name> ...\n\n\
213 Watchdog program to monitor status of frr daemons and try to restart\n\
214 them if they are down or unresponsive. It determines whether a daemon is\n\
215 up based on whether it can connect to the daemon's vty unix stream socket.\n\
216 It then repeatedly sends echo commands over that socket to determine whether\n\
217 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
218 on the socket connection and know immediately that the daemon is down.\n\n\
219 The daemons to be monitored should be listed on the command line.\n\n\
220 This program can run in one of 5 modes:\n\n\
222 Just monitor and report on status changes. Example:\n\
223 %s -d zebra ospfd bgpd\n\n\
225 Whenever any daemon hangs or crashes, use the given command to restart\n\
226 them all. Example:\n\
228 -R '/sbin/service zebra restart; /sbin/service ospfd restart' \\\n\
231 When any single daemon hangs or crashes, restart only the daemon that's\n\
232 in trouble using the supplied restart command. Example:\n\
233 %s -dz -r '/sbin/service %%s restart' zebra ospfd bgpd\n\n\
235 The same as the previous mode, except that there is special treatment when\n\
236 the zebra daemon is in trouble. In that case, a phased restart approach\n\
237 is used: 1. stop all other daemons; 2. restart zebra; 3. start the other\n\
239 %s -adz -r '/sbin/service %%s restart' \\\n\
240 -s '/sbin/service %%s start' \\\n\
241 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
243 This is the same as the previous mode, except that the phased restart\n\
244 procedure is used whenever any of the daemons hangs or crashes. Example:\n\
245 %s -Adz -r '/sbin/service %%s restart' \\\n\
246 -s '/sbin/service %%s start' \\\n\
247 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
248 As of this writing, it is believed that mode 2 [%s]\n\
249 is not safe, and mode 3 [%s] may not be safe with some of the\n\
250 routing daemons.\n\n\
251 In order to avoid attempting to restart the daemons in a fast loop,\n\
252 the -m and -M options allow you to control the minimum delay between\n\
253 restart commands. The minimum restart delay is recalculated each time\n\
254 a restart is attempted: if the time since the last restart attempt exceeds\n\
255 twice the -M value, then the restart delay is set to the -m value.\n\
256 Otherwise, the interval is doubled (but capped at the -M value).\n\n", progname
, mode_str
[0], progname
, mode_str
[1], progname
, mode_str
[2], progname
, mode_str
[3], progname
, mode_str
[4], progname
, mode_str
[2], mode_str
[3]);
258 fprintf(target
, "Options:\n\
259 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
260 to syslog instead of stdout.\n\
261 -S, --statedir Set the vty socket directory (default is %s)\n\
262 -e, --no-echo Do not ping the daemons to test responsiveness (this\n\
263 option is necessary if the daemons do not support the\n\
265 -l, --loglevel Set the logging level (default is %d).\n\
266 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
267 but it can be set higher than %d if extra-verbose debugging\n\
268 messages are desired.\n\
269 --min-restart-interval\n\
270 Set the minimum seconds to wait between invocations of daemon\n\
271 restart commands (default is %d).\n\
272 --max-restart-interval\n\
273 Set the maximum seconds to wait between invocations of daemon\n\
274 restart commands (default is %d).\n\
275 -i, --interval Set the status polling interval in seconds (default is %d)\n\
276 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
277 -T, --restart-timeout\n\
278 Set the restart (kill) timeout in seconds (default is %d).\n\
279 If any background jobs are still running after this much\n\
280 time has elapsed, they will be killed.\n\
281 -r, --restart Supply a Bourne shell command to use to restart a single\n\
282 daemon. The command string should include '%%s' where the\n\
283 name of the daemon should be substituted.\n\
284 Note that -r and -R are incompatible.\n\
285 -s, --start-command\n\
286 Supply a Bourne shell to command to use to start a single\n\
287 daemon. The command string should include '%%s' where the\n\
288 name of the daemon should be substituted.\n\
289 -k, --kill-command\n\
290 Supply a Bourne shell to command to use to stop a single\n\
291 daemon. The command string should include '%%s' where the\n\
292 name of the daemon should be substituted.\n\
294 When one or more daemons is down, try to restart everything\n\
295 using the Bourne shell command supplied as the argument.\n\
296 Note that -r and -R are incompatible.\n\
297 -z, --unresponsive-restart\n\
298 When a daemon is unresponsive, treat it as being down for\n\
301 When zebra hangs or crashes, restart all daemons using\n\
302 this phased approach: 1. stop all other daemons; 2. restart\n\
303 zebra; 3. start other daemons. Requires -r, -s, and -k.\n\
304 -A, --always-all-restart\n\
305 When any daemon (not just zebra) hangs or crashes, use the\n\
306 same phased restart mechanism described above for -a.\n\
307 Requires -r, -s, and -k.\n\
308 -p, --pid-file Set process identifier file name\n\
310 -b, --blank-string\n\
311 When the supplied argument string is found in any of the\n\
312 various shell command arguments (-r, -s, -k, or -R), replace\n\
313 it with a space. This is an ugly hack to circumvent problems\n\
314 passing command-line arguments with embedded spaces.\n\
315 -v, --version Print program version\n\
316 -h, --help Display this help and exit\n", VTYDIR
, DEFAULT_LOGLEVEL
, LOG_EMERG
, LOG_DEBUG
, LOG_DEBUG
, DEFAULT_MIN_RESTART
, DEFAULT_MAX_RESTART
, DEFAULT_PERIOD
, DEFAULT_TIMEOUT
, DEFAULT_RESTART_TIMEOUT
, DEFAULT_PIDFILE
);
/*
 * Run shell_cmd in the background via "/bin/sh -c".
 *
 * The child moves itself into its own process group so that the whole
 * job can later be signalled as a group.  The parent does not wait here;
 * the child is reaped elsewhere.
 *
 * Returns the child's pid in the parent, or -1 if fork() failed.
 */
static pid_t run_background(char *shell_cmd)
{
	char arg_shell[] = "sh";
	char arg_dashc[] = "-c";
	char *const child_argv[4] = {arg_shell, arg_dashc, shell_cmd, NULL};
	pid_t pid = fork();

	if (pid == -1) {
		zlog_err("fork failed, cannot run command [%s]: %s",
			 shell_cmd, safe_strerror(errno));
		return -1;
	}

	if (pid != 0) {
		/* Parent process: we will reap the child later. */
		zlog_err("Forked background command [pid %d]: %s", (int)pid,
			 shell_cmd);
		return pid;
	}

	/* Child process from here on. */
	/* Use separate process group so child processes can be killed easily. */
	if (setpgid(0, 0) < 0)
		zlog_warn("warning: setpgid(0,0) failed: %s",
			  safe_strerror(errno));

	execv("/bin/sh", child_argv);

	/* Only reached when execv() itself failed. */
	zlog_err("execv(/bin/sh -c '%s') failed: %s",
		 shell_cmd, safe_strerror(errno));
	_exit(127);
}
/*
 * Store in *result the wall-clock time that has passed since *start_time
 * and return result, so the call can be used inside an expression.
 *
 * The current time is read straight into *result and start_time is then
 * subtracted field by field; a negative microsecond field is fixed up by
 * borrowing from the seconds field.
 */
static struct timeval *time_elapsed(struct timeval *result,
				    const struct timeval *start_time)
{
	const long usec_per_sec = 1000000L;

	gettimeofday(result, NULL);
	result->tv_usec -= start_time->tv_usec;
	result->tv_sec -= start_time->tv_sec;

	/* Borrow whole seconds until the microsecond part is non-negative. */
	for (; result->tv_usec < 0; result->tv_sec--)
		result->tv_usec += usec_per_sec;

	return result;
}
364 static int restart_kill(struct thread
*t_kill
)
366 struct restart_info
*restart
= THREAD_ARG(t_kill
);
367 struct timeval delay
;
369 time_elapsed(&delay
, &restart
->time
);
370 zlog_warn("Warning: %s %s child process %d still running after "
371 "%ld seconds, sending signal %d",
372 restart
->what
, restart
->name
, (int)restart
->pid
,
373 (long)delay
.tv_sec
, (restart
->kills
? SIGKILL
: SIGTERM
));
374 kill(-restart
->pid
, (restart
->kills
? SIGKILL
: SIGTERM
));
376 restart
->t_kill
= NULL
;
377 thread_add_timer(master
, restart_kill
, restart
, gs
.restart_timeout
,
382 static struct restart_info
*find_child(pid_t child
)
384 if (gs
.mode
== MODE_GLOBAL_RESTART
) {
385 if (gs
.restart
.pid
== child
)
389 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
390 if (dmn
->restart
.pid
== child
)
391 return &dmn
->restart
;
397 static void sigchild(void)
403 struct restart_info
*restart
;
405 switch (child
= waitpid(-1, &status
, WNOHANG
)) {
407 zlog_err("waitpid failed: %s", safe_strerror(errno
));
410 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
414 if (child
== integrated_write_pid
) {
415 integrated_write_sigchld(status
);
419 if ((restart
= find_child(child
)) != NULL
) {
420 name
= restart
->name
;
421 what
= restart
->what
;
424 thread_cancel(restart
->t_kill
);
425 restart
->t_kill
= NULL
;
426 /* Update restart time to reflect the time the command completed. */
427 gettimeofday(&restart
->time
, NULL
);
430 ("waitpid returned status for an unknown child process %d",
435 if (WIFSTOPPED(status
))
436 zlog_warn("warning: %s %s process %d is stopped",
437 what
, name
, (int)child
);
438 else if (WIFSIGNALED(status
))
439 zlog_warn("%s %s process %d terminated due to signal %d",
440 what
, name
, (int)child
, WTERMSIG(status
));
441 else if (WIFEXITED(status
)) {
442 if (WEXITSTATUS(status
) != 0)
444 ("%s %s process %d exited with non-zero status %d",
445 what
, name
, (int)child
, WEXITSTATUS(status
));
447 zlog_debug("%s %s process %d exited normally", what
,
450 zlog_err("cannot interpret %s %s process %d wait status 0x%x",
451 what
, name
, (int)child
, status
);
456 run_job(struct restart_info
*restart
, const char *cmdtype
, const char *command
,
457 int force
, int update_interval
)
459 struct timeval delay
;
461 if (gs
.loglevel
> LOG_DEBUG
+ 1)
462 zlog_debug("attempting to %s %s", cmdtype
, restart
->name
);
465 if (gs
.loglevel
> LOG_DEBUG
+ 1)
467 ("cannot %s %s, previous pid %d still running",
468 cmdtype
, restart
->name
, (int)restart
->pid
);
472 /* Note: time_elapsed test must come before the force test, since we need
473 to make sure that delay is initialized for use below in updating the
475 if ((time_elapsed(&delay
, &restart
->time
)->tv_sec
< restart
->interval
)
477 if (gs
.loglevel
> LOG_DEBUG
+ 1)
478 zlog_debug("postponing %s %s: "
479 "elapsed time %ld < retry interval %ld",
480 cmdtype
, restart
->name
, (long)delay
.tv_sec
,
485 gettimeofday(&restart
->time
, NULL
);
488 char cmd
[strlen(command
) + strlen(restart
->name
) + 1];
489 snprintf(cmd
, sizeof(cmd
), command
, restart
->name
);
490 if ((restart
->pid
= run_background(cmd
)) > 0) {
491 restart
->t_kill
= NULL
;
492 thread_add_timer(master
, restart_kill
, restart
, gs
.restart_timeout
,
494 restart
->what
= cmdtype
;
500 /* Calculate the new restart interval. */
501 if (update_interval
) {
502 if (delay
.tv_sec
> 2 * gs
.max_restart_interval
)
503 restart
->interval
= gs
.min_restart_interval
;
504 else if ((restart
->interval
*= 2) > gs
.max_restart_interval
)
505 restart
->interval
= gs
.max_restart_interval
;
506 if (gs
.loglevel
> LOG_DEBUG
+ 1)
507 zlog_debug("restart %s interval is now %ld",
508 restart
->name
, restart
->interval
);
513 #define SET_READ_HANDLER(DMN) \
515 (DMN)->t_read = NULL; \
516 thread_add_read (master, handle_read, (DMN), (DMN)->fd, &(DMN)->t_read); \
519 #define SET_WAKEUP_DOWN(DMN) \
521 (DMN)->t_wakeup = NULL; \
522 thread_add_timer_msec (master, wakeup_down, (DMN), FUZZY(gs.period), \
526 #define SET_WAKEUP_UNRESPONSIVE(DMN) \
528 (DMN)->t_wakeup = NULL; \
529 thread_add_timer_msec (master, wakeup_unresponsive, (DMN), \
530 FUZZY(gs.period), &(DMN)->t_wakeup); \
533 #define SET_WAKEUP_ECHO(DMN) \
535 (DMN)->t_wakeup = NULL; \
536 thread_add_timer_msec (master, wakeup_send_echo, (DMN), \
537 FUZZY(gs.period), &(DMN)->t_wakeup); \
540 static int wakeup_down(struct thread
*t_wakeup
)
542 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
544 dmn
->t_wakeup
= NULL
;
545 if (try_connect(dmn
) < 0)
546 SET_WAKEUP_DOWN(dmn
);
547 if ((dmn
->connect_tries
> 1) && (dmn
->state
!= DAEMON_UP
))
552 static int wakeup_init(struct thread
*t_wakeup
)
554 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
556 dmn
->t_wakeup
= NULL
;
557 if (try_connect(dmn
) < 0) {
558 SET_WAKEUP_DOWN(dmn
);
559 zlog_err("%s state -> down : initial connection attempt failed",
561 dmn
->state
= DAEMON_DOWN
;
566 static void daemon_down(struct daemon
*dmn
, const char *why
)
568 if (IS_UP(dmn
) || (dmn
->state
== DAEMON_INIT
))
569 zlog_err("%s state -> down : %s", dmn
->name
, why
);
570 else if (gs
.loglevel
> LOG_DEBUG
)
571 zlog_debug("%s still down : %s", dmn
->name
, why
);
574 dmn
->state
= DAEMON_DOWN
;
579 THREAD_OFF(dmn
->t_read
);
580 THREAD_OFF(dmn
->t_write
);
581 THREAD_OFF(dmn
->t_wakeup
);
582 if (try_connect(dmn
) < 0)
583 SET_WAKEUP_DOWN(dmn
);
587 static int handle_read(struct thread
*t_read
)
589 struct daemon
*dmn
= THREAD_ARG(t_read
);
590 static const char resp
[sizeof(PING_TOKEN
) + 4] = PING_TOKEN
"\n";
591 char buf
[sizeof(resp
) + 100];
593 struct timeval delay
;
596 if ((rc
= read(dmn
->fd
, buf
, sizeof(buf
))) < 0) {
599 if (ERRNO_IO_RETRY(errno
)) {
600 /* Pretend it never happened. */
601 SET_READ_HANDLER(dmn
);
604 snprintf(why
, sizeof(why
), "unexpected read error: %s",
605 safe_strerror(errno
));
606 daemon_down(dmn
, why
);
610 daemon_down(dmn
, "read returned EOF");
613 if (!dmn
->echo_sent
.tv_sec
) {
614 char why
[sizeof(buf
) + 100];
615 snprintf(why
, sizeof(why
),
616 "unexpected read returns %d bytes: %.*s", (int)rc
,
618 daemon_down(dmn
, why
);
622 /* We are expecting an echo response: is there any chance that the
623 response would not be returned entirely in the first read? That
624 seems inconceivable... */
625 if ((rc
!= sizeof(resp
)) || memcmp(buf
, resp
, sizeof(resp
))) {
626 char why
[100 + sizeof(buf
)];
627 snprintf(why
, sizeof(why
),
628 "read returned bad echo response of %d bytes "
629 "(expecting %u): %.*s", (int)rc
, (u_int
) sizeof(resp
),
631 daemon_down(dmn
, why
);
635 time_elapsed(&delay
, &dmn
->echo_sent
);
636 dmn
->echo_sent
.tv_sec
= 0;
637 if (dmn
->state
== DAEMON_UNRESPONSIVE
) {
638 if (delay
.tv_sec
< gs
.timeout
) {
639 dmn
->state
= DAEMON_UP
;
641 ("%s state -> up : echo response received after %ld.%06ld "
642 "seconds", dmn
->name
, (long)delay
.tv_sec
,
643 (long)delay
.tv_usec
);
646 ("%s: slow echo response finally received after %ld.%06ld "
647 "seconds", dmn
->name
, (long)delay
.tv_sec
,
648 (long)delay
.tv_usec
);
649 } else if (gs
.loglevel
> LOG_DEBUG
+ 1)
650 zlog_debug("%s: echo response received after %ld.%06ld seconds",
651 dmn
->name
, (long)delay
.tv_sec
, (long)delay
.tv_usec
);
653 SET_READ_HANDLER(dmn
);
655 thread_cancel(dmn
->t_wakeup
);
656 SET_WAKEUP_ECHO(dmn
);
662 * Wait until we have seen every daemon come up before
663 * notifying systemd that we are ready.
665 static void daemon_send_ready(void)
668 if (!sent
&& gs
.numdown
== 0) {
669 #if defined (HAVE_CUMULUS)
672 fp
= fopen(DAEMON_VTY_DIR
"/watchfrr.started", "w");
676 ("Watchfrr: Notifying Systemd we are up and running");
677 systemd_send_started(master
, 0);
682 static void daemon_up(struct daemon
*dmn
, const char *why
)
684 dmn
->state
= DAEMON_UP
;
686 dmn
->connect_tries
= 0;
687 zlog_notice("%s state -> up : %s", dmn
->name
, why
);
690 SET_WAKEUP_ECHO(dmn
);
694 static int check_connect(struct thread
*t_write
)
696 struct daemon
*dmn
= THREAD_ARG(t_write
);
698 socklen_t reslen
= sizeof(sockerr
);
701 if (getsockopt(dmn
->fd
, SOL_SOCKET
, SO_ERROR
, (char *)&sockerr
, &reslen
)
703 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn
->name
,
704 safe_strerror(errno
));
706 "getsockopt failed checking connection success");
709 if ((reslen
== sizeof(sockerr
)) && sockerr
) {
711 snprintf(why
, sizeof(why
),
712 "getsockopt reports that connection attempt failed: %s",
713 safe_strerror(sockerr
));
714 daemon_down(dmn
, why
);
718 daemon_up(dmn
, "delayed connect succeeded");
722 static int wakeup_connect_hanging(struct thread
*t_wakeup
)
724 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
727 dmn
->t_wakeup
= NULL
;
728 snprintf(why
, sizeof(why
),
729 "connection attempt timed out after %ld seconds", gs
.timeout
);
730 daemon_down(dmn
, why
);
734 /* Making connection to protocol daemon. */
735 static int try_connect(struct daemon
*dmn
)
738 struct sockaddr_un addr
;
741 if (gs
.loglevel
> LOG_DEBUG
+ 1)
742 zlog_debug("%s: attempting to connect", dmn
->name
);
743 dmn
->connect_tries
++;
745 memset(&addr
, 0, sizeof(struct sockaddr_un
));
746 addr
.sun_family
= AF_UNIX
;
747 snprintf(addr
.sun_path
, sizeof(addr
.sun_path
), "%s/%s.vty",
748 gs
.vtydir
, dmn
->name
);
749 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
750 len
= addr
.sun_len
= SUN_LEN(&addr
);
752 len
= sizeof(addr
.sun_family
) + strlen(addr
.sun_path
);
753 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
755 /* Quick check to see if we might succeed before we go to the trouble
756 of creating a socket. */
757 if (access(addr
.sun_path
, W_OK
) < 0) {
759 zlog_err("%s: access to socket %s denied: %s",
760 dmn
->name
, addr
.sun_path
,
761 safe_strerror(errno
));
765 if ((sock
= socket(AF_UNIX
, SOCK_STREAM
, 0)) < 0) {
766 zlog_err("%s(%s): cannot make socket: %s",
767 __func__
, addr
.sun_path
, safe_strerror(errno
));
771 if (set_nonblocking(sock
) < 0 || set_cloexec(sock
) < 0) {
772 zlog_err("%s(%s): set_nonblocking/cloexec(%d) failed",
773 __func__
, addr
.sun_path
, sock
);
778 if (connect(sock
, (struct sockaddr
*)&addr
, len
) < 0) {
779 if ((errno
!= EINPROGRESS
) && (errno
!= EWOULDBLOCK
)) {
780 if (gs
.loglevel
> LOG_DEBUG
)
781 zlog_debug("%s(%s): connect failed: %s",
782 __func__
, addr
.sun_path
,
783 safe_strerror(errno
));
787 if (gs
.loglevel
> LOG_DEBUG
)
788 zlog_debug("%s: connection in progress", dmn
->name
);
789 dmn
->state
= DAEMON_CONNECTING
;
792 thread_add_write(master
, check_connect
, dmn
, dmn
->fd
,
793 &dmn
->t_write
);dmn
->t_wakeup
= NULL
;
794 thread_add_timer(master
, wakeup_connect_hanging
, dmn
, gs
.timeout
,
796 SET_READ_HANDLER(dmn
);
801 SET_READ_HANDLER(dmn
);
802 daemon_up(dmn
, "connect succeeded");
806 static int phase_hanging(struct thread
*t_hanging
)
808 gs
.t_phase_hanging
= NULL
;
809 zlog_err("Phase [%s] hanging for %ld seconds, aborting phased restart",
810 phase_str
[gs
.phase
], PHASE_TIMEOUT
);
811 gs
.phase
= PHASE_NONE
;
815 static void set_phase(restart_phase_t new_phase
)
817 gs
.phase
= new_phase
;
818 if (gs
.t_phase_hanging
)
819 thread_cancel(gs
.t_phase_hanging
);
820 gs
.t_phase_hanging
= NULL
;
821 thread_add_timer(master
, phase_hanging
, NULL
, PHASE_TIMEOUT
,
822 &gs
.t_phase_hanging
);
825 static void phase_check(void)
830 case PHASE_STOPS_PENDING
:
834 ("Phased restart: all routing daemon stop jobs have completed.");
835 set_phase(PHASE_WAITING_DOWN
);
838 case PHASE_WAITING_DOWN
:
839 if (gs
.numdown
+ IS_UP(gs
.special
) < gs
.numdaemons
)
841 zlog_info("Phased restart: all routing daemons now down.");
842 run_job(&gs
.special
->restart
, "restart", gs
.restart_command
, 1,
844 set_phase(PHASE_ZEBRA_RESTART_PENDING
);
847 case PHASE_ZEBRA_RESTART_PENDING
:
848 if (gs
.special
->restart
.pid
)
850 zlog_info("Phased restart: %s restart job completed.",
852 set_phase(PHASE_WAITING_ZEBRA_UP
);
855 case PHASE_WAITING_ZEBRA_UP
:
856 if (!IS_UP(gs
.special
))
858 zlog_info("Phased restart: %s is now up.", gs
.special
->name
);
861 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
862 if (dmn
!= gs
.special
)
863 run_job(&dmn
->restart
, "start",
864 gs
.start_command
, 1, 0);
867 gs
.phase
= PHASE_NONE
;
868 THREAD_OFF(gs
.t_phase_hanging
);
869 zlog_notice("Phased global restart has completed.");
874 static void try_restart(struct daemon
*dmn
)
879 case MODE_GLOBAL_RESTART
:
880 run_job(&gs
.restart
, "restart", gs
.restart_command
, 0, 1);
882 case MODE_SEPARATE_RESTART
:
883 run_job(&dmn
->restart
, "restart", gs
.restart_command
, 0, 1);
885 case MODE_PHASED_ZEBRA_RESTART
:
886 if (dmn
!= gs
.special
) {
887 if ((gs
.special
->state
== DAEMON_UP
)
888 && (gs
.phase
== PHASE_NONE
))
889 run_job(&dmn
->restart
, "restart",
890 gs
.restart_command
, 0, 1);
893 ("%s: postponing restart attempt because master %s daemon "
894 "not up [%s], or phased restart in progress",
895 dmn
->name
, gs
.special
->name
,
896 state_str
[gs
.special
->state
]);
901 case MODE_PHASED_ALL_RESTART
:
902 if ((gs
.phase
!= PHASE_NONE
) || gs
.numpids
) {
903 if (gs
.loglevel
> LOG_DEBUG
+ 1)
905 ("postponing phased global restart: restart already in "
906 "progress [%s], or outstanding child processes [%d]",
907 phase_str
[gs
.phase
], gs
.numpids
);
910 /* Is it too soon for a restart? */
912 struct timeval delay
;
913 if (time_elapsed(&delay
, &gs
.special
->restart
.time
)->
914 tv_sec
< gs
.special
->restart
.interval
) {
915 if (gs
.loglevel
> LOG_DEBUG
+ 1)
917 ("postponing phased global restart: "
918 "elapsed time %ld < retry interval %ld",
920 gs
.special
->restart
.interval
);
924 run_job(&gs
.restart
, "restart", gs
.restart_command
, 0, 1);
927 zlog_err("error: unknown restart mode %d", gs
.mode
);
932 static int wakeup_unresponsive(struct thread
*t_wakeup
)
934 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
936 dmn
->t_wakeup
= NULL
;
937 if (dmn
->state
!= DAEMON_UNRESPONSIVE
)
938 zlog_err("%s: no longer unresponsive (now %s), "
939 "wakeup should have been cancelled!",
940 dmn
->name
, state_str
[dmn
->state
]);
942 SET_WAKEUP_UNRESPONSIVE(dmn
);
948 static int wakeup_no_answer(struct thread
*t_wakeup
)
950 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
952 dmn
->t_wakeup
= NULL
;
953 dmn
->state
= DAEMON_UNRESPONSIVE
;
954 zlog_err("%s state -> unresponsive : no response yet to ping "
955 "sent %ld seconds ago", dmn
->name
, gs
.timeout
);
956 if (gs
.unresponsive_restart
) {
957 SET_WAKEUP_UNRESPONSIVE(dmn
);
963 static int wakeup_send_echo(struct thread
*t_wakeup
)
965 static const char echocmd
[] = "echo " PING_TOKEN
;
967 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
969 dmn
->t_wakeup
= NULL
;
970 if (((rc
= write(dmn
->fd
, echocmd
, sizeof(echocmd
))) < 0) ||
971 ((size_t) rc
!= sizeof(echocmd
))) {
972 char why
[100 + sizeof(echocmd
)];
973 snprintf(why
, sizeof(why
),
974 "write '%s' returned %d instead of %u", echocmd
,
975 (int)rc
, (u_int
) sizeof(echocmd
));
976 daemon_down(dmn
, why
);
978 gettimeofday(&dmn
->echo_sent
, NULL
);
979 dmn
->t_wakeup
= NULL
;
980 thread_add_timer(master
, wakeup_no_answer
, dmn
, gs
.timeout
,
986 static void sigint(void)
988 zlog_notice("Terminating on signal");
989 systemd_send_stopping();
/*
 * Check that cmd is usable as a one-argument command template: it must
 * contain exactly one '%' character, and that '%' must be immediately
 * followed by 's'.
 *
 * Returns 1 if the template is acceptable, 0 otherwise.
 */
static int valid_command(const char *cmd)
{
	const char *percent = strchr(cmd, '%');

	if (percent == NULL)
		return 0;
	if (percent[1] != 's')
		return 0;

	/* Any further '%' would be a second conversion. */
	return strchr(percent + 1, '%') == NULL;
}
/*
 * This is an ugly hack to circumvent problems with passing command-line
 * arguments that contain spaces.  The fix is to use a configuration file.
 *
 * Returns a newly allocated copy of cmd in which every occurrence of
 * blankstr has been replaced by a single space.  The caller owns the
 * returned string.  The process exits if the copy cannot be allocated.
 *
 * An empty blankstr leaves the command unchanged.  Scanning resumes
 * after each inserted space, so a blankstr that itself contains a space
 * cannot be matched again at the same position; the loop therefore
 * always terminates and the result never grows beyond the copy of cmd.
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	size_t bslen = strlen(blankstr);
	size_t cmdlen = strlen(cmd);
	char *res = malloc(cmdlen + 1);
	char *p;

	if (res == NULL) {
		perror("malloc");
		exit(1);
	}
	memcpy(res, cmd, cmdlen + 1);

	/* strstr() matches an empty pattern everywhere: nothing to replace. */
	if (bslen == 0)
		return res;

	p = res;
	while ((p = strstr(p, blankstr)) != NULL) {
		char *tail = p + bslen;

		*p++ = ' ';
		/* Close the gap left by the rest of the matched pattern. */
		if (bslen > 1)
			memmove(p, tail, strlen(tail) + 1);
	}
	return res;
}
1021 struct zebra_privs_t watchfrr_privs
= {
1023 .vty_group
= VTY_GROUP
,
1027 static struct quagga_signal_t watchfrr_signals
[] = {
1038 .handler
= sigchild
,
1042 FRR_DAEMON_INFO(watchfrr
, WATCHFRR
,
1043 .flags
= FRR_NO_PRIVSEP
| FRR_NO_TCPVTY
| FRR_LIMITED_CLI
1044 | FRR_NO_CFG_PID_DRY
| FRR_NO_ZCLIENT
,
1046 .printhelp
= printhelp
,
1047 .copyright
= "Copyright 2004 Andrew J. Schorr",
1049 .signals
= watchfrr_signals
,
1050 .n_signals
= array_size(watchfrr_signals
),
1052 .privs
= &watchfrr_privs
,
1055 int main(int argc
, char **argv
)
1058 const char *pidfile
= DEFAULT_PIDFILE
;
1059 const char *special
= "zebra";
1060 const char *blankstr
= NULL
;
1062 frr_preinit(&watchfrr_di
, argc
, argv
);
1063 progname
= watchfrr_di
.progname
;
1065 frr_opt_add("aAb:dek:l:i:p:r:R:S:s:t:T:z", longopts
, "");
1067 gs
.restart
.name
= "all";
1068 while ((opt
= frr_getopt(argc
, argv
, NULL
)) != EOF
) {
1073 if ((gs
.mode
!= MODE_MONITOR
)
1074 && (gs
.mode
!= MODE_SEPARATE_RESTART
)) {
1075 fputs("Ambiguous operating mode selected.\n",
1079 gs
.mode
= MODE_PHASED_ZEBRA_RESTART
;
1082 if ((gs
.mode
!= MODE_MONITOR
)
1083 && (gs
.mode
!= MODE_SEPARATE_RESTART
)) {
1084 fputs("Ambiguous operating mode selected.\n",
1088 gs
.mode
= MODE_PHASED_ALL_RESTART
;
1097 if (!valid_command(optarg
)) {
1099 "Invalid kill command, must contain '%%s': %s\n",
1103 gs
.stop_command
= optarg
;
1109 (optarg
, "%d%1s", &gs
.loglevel
,
1111 || (gs
.loglevel
< LOG_EMERG
)) {
1113 "Invalid loglevel argument: %s\n",
1119 case OPTION_MINRESTART
:
1122 if ((sscanf(optarg
, "%ld%1s",
1123 &gs
.min_restart_interval
,
1125 || (gs
.min_restart_interval
< 0)) {
1127 "Invalid min_restart_interval argument: %s\n",
1133 case OPTION_MAXRESTART
:
1136 if ((sscanf(optarg
, "%ld%1s",
1137 &gs
.max_restart_interval
,
1139 || (gs
.max_restart_interval
< 0)) {
1141 "Invalid max_restart_interval argument: %s\n",
1151 if ((sscanf(optarg
, "%d%1s", &period
, garbage
)
1152 != 1) || (gs
.period
< 1)) {
1154 "Invalid interval argument: %s\n",
1158 gs
.period
= 1000 * period
;
1165 if ((gs
.mode
== MODE_GLOBAL_RESTART
) ||
1166 (gs
.mode
== MODE_SEPARATE_RESTART
)) {
1167 fputs("Ambiguous operating mode selected.\n",
1171 if (!valid_command(optarg
)) {
1173 "Invalid restart command, must contain '%%s': %s\n",
1177 gs
.restart_command
= optarg
;
1178 if (gs
.mode
== MODE_MONITOR
)
1179 gs
.mode
= MODE_SEPARATE_RESTART
;
1182 if (gs
.mode
!= MODE_MONITOR
) {
1183 fputs("Ambiguous operating mode selected.\n",
1187 if (strchr(optarg
, '%')) {
1189 "Invalid restart-all arg, must not contain '%%s': %s\n",
1193 gs
.restart_command
= optarg
;
1194 gs
.mode
= MODE_GLOBAL_RESTART
;
1197 if (!valid_command(optarg
)) {
1199 "Invalid start command, must contain '%%s': %s\n",
1203 gs
.start_command
= optarg
;
1212 (optarg
, "%ld%1s", &gs
.timeout
,
1213 garbage
) != 1) || (gs
.timeout
< 1)) {
1215 "Invalid timeout argument: %s\n",
1225 (optarg
, "%ld%1s", &gs
.restart_timeout
,
1227 || (gs
.restart_timeout
< 1)) {
1229 "Invalid restart timeout argument: %s\n",
1236 gs
.unresponsive_restart
= 1;
1239 fputs("Invalid option.\n", stderr
);
1244 if (gs
.unresponsive_restart
&& (gs
.mode
== MODE_MONITOR
)) {
1245 fputs("Option -z requires a -r or -R restart option.\n",
1251 if (gs
.restart_command
|| gs
.start_command
|| gs
.stop_command
) {
1253 "No kill/(re)start commands needed for %s mode.\n",
1258 case MODE_GLOBAL_RESTART
:
1259 case MODE_SEPARATE_RESTART
:
1260 if (!gs
.restart_command
|| gs
.start_command
|| gs
.stop_command
) {
1262 "No start/kill commands needed in [%s] mode.\n",
1267 case MODE_PHASED_ZEBRA_RESTART
:
1268 case MODE_PHASED_ALL_RESTART
:
1269 if (!gs
.restart_command
|| !gs
.start_command
1270 || !gs
.stop_command
) {
1272 "Need start, kill, and restart commands in [%s] mode.\n",
1280 if (gs
.restart_command
)
1281 gs
.restart_command
=
1282 translate_blanks(gs
.restart_command
, blankstr
);
1283 if (gs
.start_command
)
1285 translate_blanks(gs
.start_command
, blankstr
);
1286 if (gs
.stop_command
)
1288 translate_blanks(gs
.stop_command
, blankstr
);
1291 gs
.restart
.interval
= gs
.min_restart_interval
;
1293 master
= frr_init();
1295 zlog_set_level(ZLOG_DEST_MONITOR
, ZLOG_DISABLED
);
1296 if (watchfrr_di
.daemon_mode
) {
1297 zlog_set_level(ZLOG_DEST_SYSLOG
, MIN(gs
.loglevel
, LOG_DEBUG
));
1298 if (daemon (0, 0) < 0) {
1299 fprintf(stderr
, "Watchquagga daemon failed: %s",
1304 zlog_set_level(ZLOG_DEST_STDOUT
, MIN(gs
.loglevel
, LOG_DEBUG
));
1306 watchfrr_vty_init();
1312 struct daemon
*tail
= NULL
;
1314 for (i
= optind
; i
< argc
; i
++) {
1317 if (!(dmn
= (struct daemon
*)calloc(1, sizeof(*dmn
)))) {
1318 fprintf(stderr
, "calloc(1,%u) failed: %s\n",
1319 (u_int
) sizeof(*dmn
),
1320 safe_strerror(errno
));
1323 dmn
->name
= dmn
->restart
.name
= argv
[i
];
1324 dmn
->state
= DAEMON_INIT
;
1328 dmn
->t_wakeup
= NULL
;
1329 thread_add_timer_msec(master
, wakeup_init
, dmn
, 100 + (random() % 900),
1331 dmn
->restart
.interval
= gs
.min_restart_interval
;
1338 if (((gs
.mode
== MODE_PHASED_ZEBRA_RESTART
) ||
1339 (gs
.mode
== MODE_PHASED_ALL_RESTART
)) &&
1340 !strcmp(dmn
->name
, special
))
1345 fputs("Must specify one or more daemons to monitor.\n", stderr
);
1348 if (((gs
.mode
== MODE_PHASED_ZEBRA_RESTART
) ||
1349 (gs
.mode
== MODE_PHASED_ALL_RESTART
)) && !gs
.special
) {
1351 "In mode [%s], but cannot find master daemon %s\n",
1352 mode_str
[gs
.mode
], special
);
1356 /* Make sure we're not already running. */
1357 pid_output(pidfile
);
1359 /* Announce which daemons are being monitored. */
1364 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
1365 len
+= strlen(dmn
->name
) + 1;
1371 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
) {
1374 strcpy(p
, dmn
->name
);
1377 zlog_notice("%s %s watching [%s], mode [%s]",
1378 progname
, FRR_VERSION
, buf
,
1384 struct thread thread
;
1386 while (thread_fetch(master
, &thread
))
1387 thread_call(&thread
);
1390 systemd_send_stopping();