2 Monitor status of frr daemons and restart if necessary.
4 Copyright (C) 2004 Andrew J. Schorr
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
26 #include <lib/version.h>
28 #include "memory_vty.h"
/* Smaller of two values.  Each argument is evaluated twice, so avoid
   passing expressions with side effects. */
#define MIN(X,Y) (((Y) < (X)) ? (Y) : (X))
/*
 * Timer randomization helpers.
 *
 * JITTER(X): offset drawn from random(); for non-negative X it lies in
 *            the range [-(X)/2, (X) - (X)/2].
 * FUZZY(X):  X plus a jitter computed over one twentieth of X.
 *
 * Both macros evaluate their argument more than once.
 */
#define JITTER(X) (-((X) / 2) + (random() % (1 + (X))))
#define FUZZY(X) (JITTER((X) / 20) + (X))
46 #define DEFAULT_PERIOD 5
47 #define DEFAULT_TIMEOUT 10
48 #define DEFAULT_RESTART_TIMEOUT 20
49 #define DEFAULT_LOGLEVEL LOG_INFO
50 #define DEFAULT_MIN_RESTART 60
51 #define DEFAULT_MAX_RESTART 600
52 #ifdef PATH_WATCHFRR_PID
53 #define DEFAULT_PIDFILE PATH_WATCHFRR_PID
55 #define DEFAULT_PIDFILE STATEDIR "/watchfrr.pid"
58 #define VTYDIR DAEMON_VTY_DIR
60 #define VTYDIR STATEDIR
63 #define PING_TOKEN "PING"
65 /* Needs to be global, referenced somewhere inside libfrr. */
66 struct thread_master
*master
;
72 MODE_SEPARATE_RESTART
,
73 MODE_PHASED_ZEBRA_RESTART
,
74 MODE_PHASED_ALL_RESTART
77 static const char *mode_str
[] =
81 "individual daemon restart",
82 "phased zebra restart",
83 "phased global restart for any failure",
91 PHASE_ZEBRA_RESTART_PENDING
,
92 PHASE_WAITING_ZEBRA_UP
95 static const char *phase_str
[] =
99 "Waiting for other daemons to come down",
100 "Zebra restart job running",
101 "Waiting for zebra to come up",
102 "Start jobs running",
105 #define PHASE_TIMEOUT (3*gs.restart_timeout)
114 struct thread
*t_kill
;
118 static struct global_state
121 restart_phase_t phase
;
122 struct thread
*t_phase_hanging
;
126 long restart_timeout
;
127 long min_restart_interval
;
128 long max_restart_interval
;
130 struct daemon
*daemons
;
131 const char *restart_command
;
132 const char *start_command
;
133 const char *stop_command
;
134 struct restart_info restart
;
135 int unresponsive_restart
;
137 struct daemon
*special
; /* points to zebra when doing phased restart */
140 int numdown
; /* # of daemons that are not UP or UNRESPONSIVE */
142 .mode
= MODE_MONITOR
,
145 .period
= 1000*DEFAULT_PERIOD
,
146 .timeout
= DEFAULT_TIMEOUT
,
147 .restart_timeout
= DEFAULT_RESTART_TIMEOUT
,
148 .loglevel
= DEFAULT_LOGLEVEL
,
149 .min_restart_interval
= DEFAULT_MIN_RESTART
,
150 .max_restart_interval
= DEFAULT_MAX_RESTART
,
164 (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
166 static const char *state_str
[] =
177 daemon_state_t state
;
179 struct timeval echo_sent
;
181 struct thread
*t_wakeup
;
182 struct thread
*t_read
;
183 struct thread
*t_write
;
185 struct restart_info restart
;
188 static const struct option longopts
[] =
190 { "daemon", no_argument
, NULL
, 'd'},
191 { "statedir", required_argument
, NULL
, 'S'},
192 { "no-echo", no_argument
, NULL
, 'e'},
193 { "loglevel", required_argument
, NULL
, 'l'},
194 { "interval", required_argument
, NULL
, 'i'},
195 { "timeout", required_argument
, NULL
, 't'},
196 { "restart-timeout", required_argument
, NULL
, 'T'},
197 { "restart", required_argument
, NULL
, 'r'},
198 { "start-command", required_argument
, NULL
, 's'},
199 { "kill-command", required_argument
, NULL
, 'k'},
200 { "restart-all", required_argument
, NULL
, 'R'},
201 { "all-restart", no_argument
, NULL
, 'a'},
202 { "always-all-restart", no_argument
, NULL
, 'A'},
203 { "unresponsive-restart", no_argument
, NULL
, 'z'},
204 { "min-restart-interval", required_argument
, NULL
, 'm'},
205 { "max-restart-interval", required_argument
, NULL
, 'M'},
206 { "pid-file", required_argument
, NULL
, 'p'},
207 { "blank-string", required_argument
, NULL
, 'b'},
208 { "help", no_argument
, NULL
, 'h'},
209 { "version", no_argument
, NULL
, 'v'},
213 static int try_connect(struct daemon
*dmn
);
214 static int wakeup_send_echo(struct thread
*t_wakeup
);
215 static void try_restart(struct daemon
*dmn
);
216 static void phase_check(void);
219 usage(const char *progname
, int status
)
222 fprintf(stderr
, "Try `%s --help' for more information.\n", progname
);
225 printf("Usage : %s [OPTION...] <daemon name> ...\n\n\
226 Watchdog program to monitor status of frr daemons and try to restart\n\
227 them if they are down or unresponsive. It determines whether a daemon is\n\
228 up based on whether it can connect to the daemon's vty unix stream socket.\n\
229 It then repeatedly sends echo commands over that socket to determine whether\n\
230 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
231 on the socket connection and know immediately that the daemon is down.\n\n\
232 The daemons to be monitored should be listed on the command line.\n\n\
233 This program can run in one of 5 modes:\n\n\
235 Just monitor and report on status changes. Example:\n\
236 %s -d zebra ospfd bgpd\n\n\
238 Whenever any daemon hangs or crashes, use the given command to restart\n\
239 them all. Example:\n\
241 -R '/sbin/service zebra restart; /sbin/service ospfd restart' \\\n\
244 When any single daemon hangs or crashes, restart only the daemon that's\n\
245 in trouble using the supplied restart command. Example:\n\
246 %s -dz -r '/sbin/service %%s restart' zebra ospfd bgpd\n\n\
248 The same as the previous mode, except that there is special treatment when\n\
249 the zebra daemon is in trouble. In that case, a phased restart approach\n\
250 is used: 1. stop all other daemons; 2. restart zebra; 3. start the other\n\
252 %s -adz -r '/sbin/service %%s restart' \\\n\
253 -s '/sbin/service %%s start' \\\n\
254 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
256 This is the same as the previous mode, except that the phased restart\n\
257 procedure is used whenever any of the daemons hangs or crashes. Example:\n\
258 %s -Adz -r '/sbin/service %%s restart' \\\n\
259 -s '/sbin/service %%s start' \\\n\
260 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
261 As of this writing, it is believed that mode 2 [%s]\n\
262 is not safe, and mode 3 [%s] may not be safe with some of the\n\
263 routing daemons.\n\n\
264 In order to avoid attempting to restart the daemons in a fast loop,\n\
265 the -m and -M options allow you to control the minimum delay between\n\
266 restart commands. The minimum restart delay is recalculated each time\n\
267 a restart is attempted: if the time since the last restart attempt exceeds\n\
268 twice the -M value, then the restart delay is set to the -m value.\n\
269 Otherwise, the interval is doubled (but capped at the -M value).\n\n",
270 progname
,mode_str
[0],progname
,mode_str
[1],progname
,mode_str
[2],
271 progname
,mode_str
[3],progname
,mode_str
[4],progname
,mode_str
[2],
275 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
276 to syslog instead of stdout.\n\
277 -S, --statedir Set the vty socket directory (default is %s)\n\
278 -e, --no-echo Do not ping the daemons to test responsiveness (this\n\
279 option is necessary if the daemons do not support the\n\
281 -l, --loglevel Set the logging level (default is %d).\n\
282 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
283 but it can be set higher than %d if extra-verbose debugging\n\
284 messages are desired.\n\
285 -m, --min-restart-interval\n\
286 Set the minimum seconds to wait between invocations of daemon\n\
287 restart commands (default is %d).\n\
288 -M, --max-restart-interval\n\
289 Set the maximum seconds to wait between invocations of daemon\n\
290 restart commands (default is %d).\n\
291 -i, --interval Set the status polling interval in seconds (default is %d)\n\
292 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
293 -T, --restart-timeout\n\
294 Set the restart (kill) timeout in seconds (default is %d).\n\
295 If any background jobs are still running after this much\n\
296 time has elapsed, they will be killed.\n\
297 -r, --restart Supply a Bourne shell command to use to restart a single\n\
298 daemon. The command string should include '%%s' where the\n\
299 name of the daemon should be substituted.\n\
300 Note that -r and -R are incompatible.\n\
301 -s, --start-command\n\
302 Supply a Bourne shell to command to use to start a single\n\
303 daemon. The command string should include '%%s' where the\n\
304 name of the daemon should be substituted.\n\
305 -k, --kill-command\n\
306 Supply a Bourne shell to command to use to stop a single\n\
307 daemon. The command string should include '%%s' where the\n\
308 name of the daemon should be substituted.\n\
310 When one or more daemons is down, try to restart everything\n\
311 using the Bourne shell command supplied as the argument.\n\
312 Note that -r and -R are incompatible.\n\
313 -z, --unresponsive-restart\n\
314 When a daemon is unresponsive, treat it as being down for\n\
317 When zebra hangs or crashes, restart all daemons using\n\
318 this phased approach: 1. stop all other daemons; 2. restart\n\
319 zebra; 3. start other daemons. Requires -r, -s, and -k.\n\
320 -A, --always-all-restart\n\
321 When any daemon (not just zebra) hangs or crashes, use the\n\
322 same phased restart mechanism described above for -a.\n\
323 Requires -r, -s, and -k.\n\
324 -p, --pid-file Set process identifier file name\n\
326 -b, --blank-string\n\
327 When the supplied argument string is found in any of the\n\
328 various shell command arguments (-r, -s, -k, or -R), replace\n\
329 it with a space. This is an ugly hack to circumvent problems\n\
330 passing command-line arguments with embedded spaces.\n\
331 -v, --version Print program version\n\
332 -h, --help Display this help and exit\n",
333 VTYDIR
,DEFAULT_LOGLEVEL
,LOG_EMERG
,LOG_DEBUG
,LOG_DEBUG
,
334 DEFAULT_MIN_RESTART
,DEFAULT_MAX_RESTART
,
335 DEFAULT_PERIOD
,DEFAULT_TIMEOUT
,DEFAULT_RESTART_TIMEOUT
,
343 run_background(char *shell_cmd
)
347 switch (child
= fork())
350 zlog_err("fork failed, cannot run command [%s]: %s",
351 shell_cmd
,safe_strerror(errno
));
355 /* Use separate process group so child processes can be killed easily. */
356 if (setpgid(0,0) < 0)
357 zlog_warn("warning: setpgid(0,0) failed: %s",safe_strerror(errno
));
361 char * const argv
[4] = { shell
, dashc
, shell_cmd
, NULL
};
362 execv("/bin/sh", argv
);
363 zlog_err("execv(/bin/sh -c '%s') failed: %s",
364 shell_cmd
,safe_strerror(errno
));
368 /* Parent process: we will reap the child later. */
369 zlog_err("Forked background command [pid %d]: %s",(int)child
,shell_cmd
);
/*
 * Compute the wall-clock time that has passed since *start_time.
 *
 * The current time is fetched with gettimeofday() directly into *result,
 * then *start_time is subtracted field by field.  A negative microsecond
 * field is normalized by borrowing whole seconds, so on return tv_usec is
 * never negative.
 *
 * Returns result, which lets callers write time_elapsed(&d, &t)->tv_sec.
 */
static struct timeval *
time_elapsed(struct timeval *result, const struct timeval *start_time)
{
  gettimeofday(result, NULL);

  result->tv_sec = result->tv_sec - start_time->tv_sec;
  result->tv_usec = result->tv_usec - start_time->tv_usec;

  /* Borrow one second for each missing million microseconds. */
  for (; result->tv_usec < 0; result->tv_sec--)
    result->tv_usec += 1000000L;

  return result;
}
389 restart_kill(struct thread
*t_kill
)
391 struct restart_info
*restart
= THREAD_ARG(t_kill
);
392 struct timeval delay
;
394 time_elapsed(&delay
,&restart
->time
);
395 zlog_warn("Warning: %s %s child process %d still running after "
396 "%ld seconds, sending signal %d",
397 restart
->what
,restart
->name
,(int)restart
->pid
, (long)delay
.tv_sec
,
398 (restart
->kills
? SIGKILL
: SIGTERM
));
399 kill(-restart
->pid
,(restart
->kills
? SIGKILL
: SIGTERM
));
401 restart
->t_kill
= thread_add_timer(master
,restart_kill
,restart
,
406 static struct restart_info
*
407 find_child(pid_t child
)
409 if (gs
.mode
== MODE_GLOBAL_RESTART
)
411 if (gs
.restart
.pid
== child
)
417 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
419 if (dmn
->restart
.pid
== child
)
420 return &dmn
->restart
;
433 struct restart_info
*restart
;
435 switch (child
= waitpid(-1,&status
,WNOHANG
))
438 zlog_err("waitpid failed: %s",safe_strerror(errno
));
441 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
445 if (child
== integrated_write_pid
)
447 integrated_write_sigchld(status
);
451 if ((restart
= find_child(child
)) != NULL
)
453 name
= restart
->name
;
454 what
= restart
->what
;
457 thread_cancel(restart
->t_kill
);
458 restart
->t_kill
= NULL
;
459 /* Update restart time to reflect the time the command completed. */
460 gettimeofday(&restart
->time
,NULL
);
464 zlog_err("waitpid returned status for an unknown child process %d",
469 if (WIFSTOPPED(status
))
470 zlog_warn("warning: %s %s process %d is stopped",
471 what
,name
,(int)child
);
472 else if (WIFSIGNALED(status
))
473 zlog_warn("%s %s process %d terminated due to signal %d",
474 what
,name
,(int)child
,WTERMSIG(status
));
475 else if (WIFEXITED(status
))
477 if (WEXITSTATUS(status
) != 0)
478 zlog_warn("%s %s process %d exited with non-zero status %d",
479 what
,name
,(int)child
,WEXITSTATUS(status
));
481 zlog_debug("%s %s process %d exited normally",what
,name
,(int)child
);
484 zlog_err("cannot interpret %s %s process %d wait status 0x%x",
485 what
,name
,(int)child
,status
);
490 run_job(struct restart_info
*restart
, const char *cmdtype
, const char *command
,
491 int force
, int update_interval
)
493 struct timeval delay
;
495 if (gs
.loglevel
> LOG_DEBUG
+1)
496 zlog_debug("attempting to %s %s",cmdtype
,restart
->name
);
500 if (gs
.loglevel
> LOG_DEBUG
+1)
501 zlog_debug("cannot %s %s, previous pid %d still running",
502 cmdtype
,restart
->name
,(int)restart
->pid
);
506 /* Note: time_elapsed test must come before the force test, since we need
507 to make sure that delay is initialized for use below in updating the
509 if ((time_elapsed(&delay
,&restart
->time
)->tv_sec
< restart
->interval
) &&
512 if (gs
.loglevel
> LOG_DEBUG
+1)
513 zlog_debug("postponing %s %s: "
514 "elapsed time %ld < retry interval %ld",
515 cmdtype
,restart
->name
,(long)delay
.tv_sec
,restart
->interval
);
519 gettimeofday(&restart
->time
,NULL
);
522 char cmd
[strlen(command
)+strlen(restart
->name
)+1];
523 snprintf(cmd
,sizeof(cmd
),command
,restart
->name
);
524 if ((restart
->pid
= run_background(cmd
)) > 0)
526 restart
->t_kill
= thread_add_timer(master
,restart_kill
,restart
,
528 restart
->what
= cmdtype
;
535 /* Calculate the new restart interval. */
538 if (delay
.tv_sec
> 2*gs
.max_restart_interval
)
539 restart
->interval
= gs
.min_restart_interval
;
540 else if ((restart
->interval
*= 2) > gs
.max_restart_interval
)
541 restart
->interval
= gs
.max_restart_interval
;
542 if (gs
.loglevel
> LOG_DEBUG
+1)
543 zlog_debug("restart %s interval is now %ld",
544 restart
->name
,restart
->interval
);
/* Schedule handle_read on the daemon's fd and remember the resulting
   thread in t_read.  The argument is evaluated several times. */
#define SET_READ_HANDLER(D) \
  ((D)->t_read = thread_add_read(master, handle_read, (D), (D)->fd))
552 #define SET_WAKEUP_DOWN(DMN) \
553 (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_down,(DMN), \
556 #define SET_WAKEUP_UNRESPONSIVE(DMN) \
557 (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_unresponsive,(DMN), \
560 #define SET_WAKEUP_ECHO(DMN) \
561 (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_send_echo,(DMN), \
565 wakeup_down(struct thread
*t_wakeup
)
567 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
569 dmn
->t_wakeup
= NULL
;
570 if (try_connect(dmn
) < 0)
571 SET_WAKEUP_DOWN(dmn
);
572 if ((dmn
->connect_tries
> 1) && (dmn
->state
!= DAEMON_UP
))
578 wakeup_init(struct thread
*t_wakeup
)
580 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
582 dmn
->t_wakeup
= NULL
;
583 if (try_connect(dmn
) < 0)
585 SET_WAKEUP_DOWN(dmn
);
586 zlog_err("%s state -> down : initial connection attempt failed",
588 dmn
->state
= DAEMON_DOWN
;
594 daemon_down(struct daemon
*dmn
, const char *why
)
596 if (IS_UP(dmn
) || (dmn
->state
== DAEMON_INIT
))
597 zlog_err("%s state -> down : %s",dmn
->name
,why
);
598 else if (gs
.loglevel
> LOG_DEBUG
)
599 zlog_debug("%s still down : %s",dmn
->name
,why
);
602 dmn
->state
= DAEMON_DOWN
;
608 THREAD_OFF(dmn
->t_read
);
609 THREAD_OFF(dmn
->t_write
);
610 THREAD_OFF(dmn
->t_wakeup
);
611 if (try_connect(dmn
) < 0)
612 SET_WAKEUP_DOWN(dmn
);
617 handle_read(struct thread
*t_read
)
619 struct daemon
*dmn
= THREAD_ARG(t_read
);
620 static const char resp
[sizeof(PING_TOKEN
)+4] = PING_TOKEN
"\n";
621 char buf
[sizeof(resp
)+100];
623 struct timeval delay
;
626 if ((rc
= read(dmn
->fd
,buf
,sizeof(buf
))) < 0)
630 if (ERRNO_IO_RETRY(errno
))
632 /* Pretend it never happened. */
633 SET_READ_HANDLER(dmn
);
636 snprintf(why
,sizeof(why
),"unexpected read error: %s",
637 safe_strerror(errno
));
638 daemon_down(dmn
,why
);
643 daemon_down(dmn
,"read returned EOF");
646 if (!dmn
->echo_sent
.tv_sec
)
648 char why
[sizeof(buf
)+100];
649 snprintf(why
,sizeof(why
),"unexpected read returns %d bytes: %.*s",
650 (int)rc
,(int)rc
,buf
);
651 daemon_down(dmn
,why
);
655 /* We are expecting an echo response: is there any chance that the
656 response would not be returned entirely in the first read? That
657 seems inconceivable... */
658 if ((rc
!= sizeof(resp
)) || memcmp(buf
,resp
,sizeof(resp
)))
660 char why
[100+sizeof(buf
)];
661 snprintf(why
,sizeof(why
),"read returned bad echo response of %d bytes "
662 "(expecting %u): %.*s",
663 (int)rc
,(u_int
)sizeof(resp
),(int)rc
,buf
);
664 daemon_down(dmn
,why
);
668 time_elapsed(&delay
,&dmn
->echo_sent
);
669 dmn
->echo_sent
.tv_sec
= 0;
670 if (dmn
->state
== DAEMON_UNRESPONSIVE
)
672 if (delay
.tv_sec
< gs
.timeout
)
674 dmn
->state
= DAEMON_UP
;
675 zlog_warn("%s state -> up : echo response received after %ld.%06ld "
676 "seconds", dmn
->name
,
677 (long)delay
.tv_sec
, (long)delay
.tv_usec
);
680 zlog_warn("%s: slow echo response finally received after %ld.%06ld "
681 "seconds", dmn
->name
,
682 (long)delay
.tv_sec
, (long)delay
.tv_usec
);
684 else if (gs
.loglevel
> LOG_DEBUG
+1)
685 zlog_debug("%s: echo response received after %ld.%06ld seconds",
686 dmn
->name
, (long)delay
.tv_sec
, (long)delay
.tv_usec
);
688 SET_READ_HANDLER(dmn
);
690 thread_cancel(dmn
->t_wakeup
);
691 SET_WAKEUP_ECHO(dmn
);
 * Wait until we see that all monitored daemons are up before
 * telling systemd that we are ready.
701 daemon_send_ready (void)
704 if (!sent
&& gs
.numdown
== 0)
706 #if defined (HAVE_CUMULUS)
709 fp
= fopen(DAEMON_VTY_DIR
"/watchfrr.started", "w");
712 zlog_notice ("Watchfrr: Notifying Systemd we are up and running");
713 systemd_send_started(master
, 0);
719 daemon_up(struct daemon
*dmn
, const char *why
)
721 dmn
->state
= DAEMON_UP
;
723 dmn
->connect_tries
= 0;
724 zlog_notice("%s state -> up : %s",dmn
->name
,why
);
727 SET_WAKEUP_ECHO(dmn
);
732 check_connect(struct thread
*t_write
)
734 struct daemon
*dmn
= THREAD_ARG(t_write
);
736 socklen_t reslen
= sizeof(sockerr
);
739 if (getsockopt(dmn
->fd
,SOL_SOCKET
,SO_ERROR
,(char *)&sockerr
,&reslen
) < 0)
741 zlog_warn("%s: check_connect: getsockopt failed: %s",
742 dmn
->name
,safe_strerror(errno
));
743 daemon_down(dmn
,"getsockopt failed checking connection success");
746 if ((reslen
== sizeof(sockerr
)) && sockerr
)
749 snprintf(why
,sizeof(why
),
750 "getsockopt reports that connection attempt failed: %s",
751 safe_strerror(sockerr
));
752 daemon_down(dmn
,why
);
756 daemon_up(dmn
,"delayed connect succeeded");
761 wakeup_connect_hanging(struct thread
*t_wakeup
)
763 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
766 dmn
->t_wakeup
= NULL
;
767 snprintf(why
,sizeof(why
),"connection attempt timed out after %ld seconds",
769 daemon_down(dmn
,why
);
773 /* Making connection to protocol daemon. */
775 try_connect(struct daemon
*dmn
)
778 struct sockaddr_un addr
;
781 if (gs
.loglevel
> LOG_DEBUG
+1)
782 zlog_debug("%s: attempting to connect",dmn
->name
);
783 dmn
->connect_tries
++;
785 memset (&addr
, 0, sizeof (struct sockaddr_un
));
786 addr
.sun_family
= AF_UNIX
;
787 snprintf(addr
.sun_path
, sizeof(addr
.sun_path
), "%s/%s.vty",
788 gs
.vtydir
,dmn
->name
);
789 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
790 len
= addr
.sun_len
= SUN_LEN(&addr
);
792 len
= sizeof (addr
.sun_family
) + strlen (addr
.sun_path
);
793 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
795 /* Quick check to see if we might succeed before we go to the trouble
796 of creating a socket. */
797 if (access(addr
.sun_path
, W_OK
) < 0)
800 zlog_err("%s: access to socket %s denied: %s",
801 dmn
->name
,addr
.sun_path
,safe_strerror(errno
));
805 if ((sock
= socket (AF_UNIX
, SOCK_STREAM
, 0)) < 0)
807 zlog_err("%s(%s): cannot make socket: %s",
808 __func__
,addr
.sun_path
, safe_strerror(errno
));
812 if (set_nonblocking(sock
) < 0 || set_cloexec(sock
) < 0)
814 zlog_err("%s(%s): set_nonblocking/cloexec(%d) failed",
815 __func__
, addr
.sun_path
, sock
);
820 if (connect (sock
, (struct sockaddr
*) &addr
, len
) < 0)
822 if ((errno
!= EINPROGRESS
) && (errno
!= EWOULDBLOCK
))
824 if (gs
.loglevel
> LOG_DEBUG
)
825 zlog_debug("%s(%s): connect failed: %s",
826 __func__
,addr
.sun_path
, safe_strerror(errno
));
830 if (gs
.loglevel
> LOG_DEBUG
)
831 zlog_debug("%s: connection in progress",dmn
->name
);
832 dmn
->state
= DAEMON_CONNECTING
;
834 dmn
->t_write
= thread_add_write(master
,check_connect
,dmn
,dmn
->fd
);
835 dmn
->t_wakeup
= thread_add_timer(master
,wakeup_connect_hanging
,dmn
,
837 SET_READ_HANDLER(dmn
);
842 SET_READ_HANDLER(dmn
);
843 daemon_up(dmn
,"connect succeeded");
848 phase_hanging(struct thread
*t_hanging
)
850 gs
.t_phase_hanging
= NULL
;
851 zlog_err("Phase [%s] hanging for %ld seconds, aborting phased restart",
852 phase_str
[gs
.phase
],PHASE_TIMEOUT
);
853 gs
.phase
= PHASE_NONE
;
858 set_phase(restart_phase_t new_phase
)
860 gs
.phase
= new_phase
;
861 if (gs
.t_phase_hanging
)
862 thread_cancel(gs
.t_phase_hanging
);
863 gs
.t_phase_hanging
= thread_add_timer(master
,phase_hanging
,NULL
,
874 case PHASE_STOPS_PENDING
:
877 zlog_info("Phased restart: all routing daemon stop jobs have completed.");
878 set_phase(PHASE_WAITING_DOWN
);
880 case PHASE_WAITING_DOWN
:
881 if (gs
.numdown
+IS_UP(gs
.special
) < gs
.numdaemons
)
883 zlog_info("Phased restart: all routing daemons now down.");
884 run_job(&gs
.special
->restart
,"restart",gs
.restart_command
,1,1);
885 set_phase(PHASE_ZEBRA_RESTART_PENDING
);
887 case PHASE_ZEBRA_RESTART_PENDING
:
888 if (gs
.special
->restart
.pid
)
890 zlog_info("Phased restart: %s restart job completed.",gs
.special
->name
);
891 set_phase(PHASE_WAITING_ZEBRA_UP
);
893 case PHASE_WAITING_ZEBRA_UP
:
894 if (!IS_UP(gs
.special
))
896 zlog_info("Phased restart: %s is now up.",gs
.special
->name
);
899 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
901 if (dmn
!= gs
.special
)
902 run_job(&dmn
->restart
,"start",gs
.start_command
,1,0);
905 gs
.phase
= PHASE_NONE
;
906 THREAD_OFF(gs
.t_phase_hanging
);
907 zlog_notice("Phased global restart has completed.");
913 try_restart(struct daemon
*dmn
)
919 case MODE_GLOBAL_RESTART
:
920 run_job(&gs
.restart
,"restart",gs
.restart_command
,0,1);
922 case MODE_SEPARATE_RESTART
:
923 run_job(&dmn
->restart
,"restart",gs
.restart_command
,0,1);
925 case MODE_PHASED_ZEBRA_RESTART
:
926 if (dmn
!= gs
.special
)
928 if ((gs
.special
->state
== DAEMON_UP
) && (gs
.phase
== PHASE_NONE
))
929 run_job(&dmn
->restart
,"restart",gs
.restart_command
,0,1);
931 zlog_debug("%s: postponing restart attempt because master %s daemon "
932 "not up [%s], or phased restart in progress",
933 dmn
->name
,gs
.special
->name
,state_str
[gs
.special
->state
]);
937 case MODE_PHASED_ALL_RESTART
:
938 if ((gs
.phase
!= PHASE_NONE
) || gs
.numpids
)
940 if (gs
.loglevel
> LOG_DEBUG
+1)
941 zlog_debug("postponing phased global restart: restart already in "
942 "progress [%s], or outstanding child processes [%d]",
943 phase_str
[gs
.phase
],gs
.numpids
);
946 /* Is it too soon for a restart? */
948 struct timeval delay
;
949 if (time_elapsed(&delay
,&gs
.special
->restart
.time
)->tv_sec
<
950 gs
.special
->restart
.interval
)
952 if (gs
.loglevel
> LOG_DEBUG
+1)
953 zlog_debug("postponing phased global restart: "
954 "elapsed time %ld < retry interval %ld",
955 (long)delay
.tv_sec
,gs
.special
->restart
.interval
);
959 run_job(&gs
.restart
,"restart",gs
.restart_command
,0,1);
962 zlog_err("error: unknown restart mode %d",gs
.mode
);
968 wakeup_unresponsive(struct thread
*t_wakeup
)
970 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
972 dmn
->t_wakeup
= NULL
;
973 if (dmn
->state
!= DAEMON_UNRESPONSIVE
)
974 zlog_err("%s: no longer unresponsive (now %s), "
975 "wakeup should have been cancelled!",
976 dmn
->name
,state_str
[dmn
->state
]);
979 SET_WAKEUP_UNRESPONSIVE(dmn
);
986 wakeup_no_answer(struct thread
*t_wakeup
)
988 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
990 dmn
->t_wakeup
= NULL
;
991 dmn
->state
= DAEMON_UNRESPONSIVE
;
992 zlog_err("%s state -> unresponsive : no response yet to ping "
993 "sent %ld seconds ago",dmn
->name
,gs
.timeout
);
994 if (gs
.unresponsive_restart
)
996 SET_WAKEUP_UNRESPONSIVE(dmn
);
1003 wakeup_send_echo(struct thread
*t_wakeup
)
1005 static const char echocmd
[] = "echo " PING_TOKEN
;
1007 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
1009 dmn
->t_wakeup
= NULL
;
1010 if (((rc
= write(dmn
->fd
,echocmd
,sizeof(echocmd
))) < 0) ||
1011 ((size_t)rc
!= sizeof(echocmd
)))
1013 char why
[100+sizeof(echocmd
)];
1014 snprintf(why
,sizeof(why
),"write '%s' returned %d instead of %u",
1015 echocmd
,(int)rc
,(u_int
)sizeof(echocmd
));
1016 daemon_down(dmn
,why
);
1020 gettimeofday(&dmn
->echo_sent
,NULL
);
1021 dmn
->t_wakeup
= thread_add_timer(master
,wakeup_no_answer
,dmn
,gs
.timeout
);
1029 zlog_notice("Terminating on signal");
1030 systemd_send_stopping ();
/*
 * Check that a per-daemon command template is usable.
 *
 * A template is accepted only when it contains exactly one '%' character
 * and that character is immediately followed by 's'.
 *
 * Returns 1 when cmd is acceptable, 0 otherwise.
 */
static int
valid_command(const char *cmd)
{
  const char *percent = strchr(cmd, '%');

  /* No placeholder at all. */
  if (percent == NULL)
    return 0;

  /* The only conversion allowed is %s. */
  if (percent[1] != 's')
    return 0;

  /* Any further '%' makes the template invalid. */
  return strchr(percent + 1, '%') == NULL;
}
/*
 * This is an ugly hack to circumvent problems with passing command-line
 * arguments that contain spaces.  The fix is to use a configuration file.
 *
 * Returns a newly allocated copy of cmd in which every occurrence of
 * blankstr has been replaced by a single space.
 *
 * cmd:      command string to translate (not modified).
 * blankstr: marker that stands for a space.  An empty marker matches
 *           nothing, so the copy is returned unchanged.
 *
 * The search resumes just after each inserted space, so the inserted
 * character is never rescanned.  This guarantees termination even when
 * blankstr itself contains a space.
 *
 * The result is never NULL: the process exits if the copy cannot be
 * allocated.
 */
static char *
translate_blanks(const char *cmd, const char *blankstr)
{
  size_t bslen = strlen(blankstr);
  size_t cmdlen = strlen(cmd);
  char *res;
  char *p;

  if (!(res = malloc(cmdlen + 1)))
    {
      perror("malloc");
      exit(1);
    }
  memcpy(res, cmd, cmdlen + 1);

  /* strstr() matches an empty needle at every position; without this
     guard the loop below would never end and would write past res. */
  if (bslen == 0)
    return res;

  p = res;
  while ((p = strstr(p, blankstr)) != NULL)
    {
      *p++ = ' ';
      /* Close the gap left by the rest of the marker (NUL included). */
      if (bslen != 1)
        memmove(p, p + bslen - 1, strlen(p + bslen - 1) + 1);
    }
  return res;
}
1065 struct zebra_privs_t watchfrr_privs
=
1068 .vty_group
= VTY_GROUP
,
1073 main(int argc
, char **argv
)
1075 const char *progname
;
1077 int daemon_mode
= 0;
1078 const char *pidfile
= DEFAULT_PIDFILE
;
1079 const char *special
= "zebra";
1080 const char *blankstr
= NULL
;
1081 static struct quagga_signal_t my_signals
[] =
1093 .handler
= sigchild
,
1097 if ((progname
= strrchr (argv
[0], '/')) != NULL
)
1102 gs
.restart
.name
= "all";
1103 while ((opt
= getopt_long(argc
, argv
, "aAb:dek:l:m:M:i:p:r:R:S:s:t:T:zvh",
1104 longopts
, 0)) != EOF
)
1111 if ((gs
.mode
!= MODE_MONITOR
) && (gs
.mode
!= MODE_SEPARATE_RESTART
))
1113 fputs("Ambiguous operating mode selected.\n",stderr
);
1114 return usage(progname
,1);
1116 gs
.mode
= MODE_PHASED_ZEBRA_RESTART
;
1119 if ((gs
.mode
!= MODE_MONITOR
) && (gs
.mode
!= MODE_SEPARATE_RESTART
))
1121 fputs("Ambiguous operating mode selected.\n",stderr
);
1122 return usage(progname
,1);
1124 gs
.mode
= MODE_PHASED_ALL_RESTART
;
1136 if (!valid_command(optarg
))
1138 fprintf(stderr
,"Invalid kill command, must contain '%%s': %s\n",
1140 return usage(progname
,1);
1142 gs
.stop_command
= optarg
;
1147 if ((sscanf(optarg
,"%d%1s",&gs
.loglevel
,garbage
) != 1) ||
1148 (gs
.loglevel
< LOG_EMERG
))
1150 fprintf(stderr
,"Invalid loglevel argument: %s\n",optarg
);
1151 return usage(progname
,1);
1158 if ((sscanf(optarg
,"%ld%1s",
1159 &gs
.min_restart_interval
,garbage
) != 1) ||
1160 (gs
.min_restart_interval
< 0))
1162 fprintf(stderr
,"Invalid min_restart_interval argument: %s\n",
1164 return usage(progname
,1);
1171 if ((sscanf(optarg
,"%ld%1s",
1172 &gs
.max_restart_interval
,garbage
) != 1) ||
1173 (gs
.max_restart_interval
< 0))
1175 fprintf(stderr
,"Invalid max_restart_interval argument: %s\n",
1177 return usage(progname
,1);
1185 if ((sscanf(optarg
,"%d%1s",&period
,garbage
) != 1) ||
1188 fprintf(stderr
,"Invalid interval argument: %s\n",optarg
);
1189 return usage(progname
,1);
1191 gs
.period
= 1000*period
;
1198 if ((gs
.mode
== MODE_GLOBAL_RESTART
) ||
1199 (gs
.mode
== MODE_SEPARATE_RESTART
))
1201 fputs("Ambiguous operating mode selected.\n",stderr
);
1202 return usage(progname
,1);
1204 if (!valid_command(optarg
))
1207 "Invalid restart command, must contain '%%s': %s\n",
1209 return usage(progname
,1);
1211 gs
.restart_command
= optarg
;
1212 if (gs
.mode
== MODE_MONITOR
)
1213 gs
.mode
= MODE_SEPARATE_RESTART
;
1216 if (gs
.mode
!= MODE_MONITOR
)
1218 fputs("Ambiguous operating mode selected.\n",stderr
);
1219 return usage(progname
,1);
1221 if (strchr(optarg
,'%'))
1224 "Invalid restart-all arg, must not contain '%%s': %s\n",
1226 return usage(progname
,1);
1228 gs
.restart_command
= optarg
;
1229 gs
.mode
= MODE_GLOBAL_RESTART
;
1232 if (!valid_command(optarg
))
1234 fprintf(stderr
,"Invalid start command, must contain '%%s': %s\n",
1236 return usage(progname
,1);
1238 gs
.start_command
= optarg
;
1246 if ((sscanf(optarg
,"%ld%1s",&gs
.timeout
,garbage
) != 1) ||
1249 fprintf(stderr
,"Invalid timeout argument: %s\n",optarg
);
1250 return usage(progname
,1);
1257 if ((sscanf(optarg
,"%ld%1s",&gs
.restart_timeout
,garbage
) != 1) ||
1258 (gs
.restart_timeout
< 1))
1260 fprintf(stderr
,"Invalid restart timeout argument: %s\n",optarg
);
1261 return usage(progname
,1);
1266 gs
.unresponsive_restart
= 1;
1269 printf ("%s version %s\n", progname
, FRR_VERSION
);
1270 puts("Copyright 2004 Andrew J. Schorr");
1273 return usage(progname
,0);
1275 fputs("Invalid option.\n",stderr
);
1276 return usage(progname
,1);
1280 if (gs
.unresponsive_restart
&& (gs
.mode
== MODE_MONITOR
))
1282 fputs("Option -z requires a -r or -R restart option.\n",stderr
);
1283 return usage(progname
,1);
1288 if (gs
.restart_command
|| gs
.start_command
|| gs
.stop_command
)
1290 fprintf(stderr
,"No kill/(re)start commands needed for %s mode.\n",
1292 return usage(progname
,1);
1295 case MODE_GLOBAL_RESTART
:
1296 case MODE_SEPARATE_RESTART
:
1297 if (!gs
.restart_command
|| gs
.start_command
|| gs
.stop_command
)
1299 fprintf(stderr
,"No start/kill commands needed in [%s] mode.\n",
1301 return usage(progname
,1);
1304 case MODE_PHASED_ZEBRA_RESTART
:
1305 case MODE_PHASED_ALL_RESTART
:
1306 if (!gs
.restart_command
|| !gs
.start_command
|| !gs
.stop_command
)
1309 "Need start, kill, and restart commands in [%s] mode.\n",
1311 return usage(progname
,1);
1318 if (gs
.restart_command
)
1319 gs
.restart_command
= translate_blanks(gs
.restart_command
,blankstr
);
1320 if (gs
.start_command
)
1321 gs
.start_command
= translate_blanks(gs
.start_command
,blankstr
);
1322 if (gs
.stop_command
)
1323 gs
.stop_command
= translate_blanks(gs
.stop_command
,blankstr
);
1326 gs
.restart
.interval
= gs
.min_restart_interval
;
1328 zprivs_init (&watchfrr_privs
);
1330 master
= thread_master_create();
1334 watchfrr_vty_init();
1335 vty_serv_sock(NULL
, 0, WATCHFRR_VTYSH_PATH
);
1337 signal_init (master
, array_size(my_signals
), my_signals
);
1338 srandom(time(NULL
));
1342 struct daemon
*tail
= NULL
;
1344 for (i
= optind
; i
< argc
; i
++)
1348 if (!(dmn
= (struct daemon
*)calloc(1,sizeof(*dmn
))))
1350 fprintf(stderr
,"calloc(1,%u) failed: %s\n",
1351 (u_int
)sizeof(*dmn
), safe_strerror(errno
));
1354 dmn
->name
= dmn
->restart
.name
= argv
[i
];
1355 dmn
->state
= DAEMON_INIT
;
1359 dmn
->t_wakeup
= thread_add_timer_msec(master
,wakeup_init
,dmn
,
1360 100+(random() % 900));
1361 dmn
->restart
.interval
= gs
.min_restart_interval
;
1368 if (((gs
.mode
== MODE_PHASED_ZEBRA_RESTART
) ||
1369 (gs
.mode
== MODE_PHASED_ALL_RESTART
)) &&
1370 !strcmp(dmn
->name
,special
))
1376 fputs("Must specify one or more daemons to monitor.\n",stderr
);
1377 return usage(progname
,1);
1379 if (((gs
.mode
== MODE_PHASED_ZEBRA_RESTART
) ||
1380 (gs
.mode
== MODE_PHASED_ALL_RESTART
)) && !gs
.special
)
1382 fprintf(stderr
,"In mode [%s], but cannot find master daemon %s\n",
1383 mode_str
[gs
.mode
],special
);
1384 return usage(progname
,1);
1387 zlog_default
= openzlog(progname
, ZLOG_WATCHFRR
, 0,
1388 LOG_CONS
|LOG_NDELAY
|LOG_PID
, LOG_DAEMON
);
1389 zlog_set_level(NULL
, ZLOG_DEST_MONITOR
, ZLOG_DISABLED
);
1392 zlog_set_level(NULL
, ZLOG_DEST_SYSLOG
, MIN(gs
.loglevel
,LOG_DEBUG
));
1393 if (daemon (0, 0) < 0)
1395 fprintf(stderr
, "Watchfrr daemon failed: %s", strerror(errno
));
1400 zlog_set_level(NULL
, ZLOG_DEST_STDOUT
, MIN(gs
.loglevel
,LOG_DEBUG
));
1402 /* Make sure we're not already running. */
1403 pid_output (pidfile
);
1405 /* Announce which daemons are being monitored. */
1410 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
1411 len
+= strlen(dmn
->name
)+1;
1417 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
1421 strcpy(p
,dmn
->name
);
1424 zlog_notice("%s %s watching [%s], mode [%s]",
1425 progname
, FRR_VERSION
, buf
, mode_str
[gs
.mode
]);
1430 struct thread thread
;
1432 while (thread_fetch (master
, &thread
))
1433 thread_call (&thread
);
1436 systemd_send_stopping ();