2 Monitor status of quagga daemons and restart if necessary.
4 Copyright (C) 2004 Andrew J. Schorr
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
26 #include <lib/version.h>
34 #define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
37 /* Macros to help randomize timers. */
38 #define JITTER(X) ((random() % ((X)+1))-((X)/2))
39 #define FUZZY(X) ((X)+JITTER((X)/20))
41 #define DEFAULT_PERIOD 5
42 #define DEFAULT_TIMEOUT 10
43 #define DEFAULT_RESTART_TIMEOUT 20
44 #define DEFAULT_LOGLEVEL LOG_INFO
45 #define DEFAULT_MIN_RESTART 60
46 #define DEFAULT_MAX_RESTART 600
47 #ifdef PATH_WATCHQUAGGA_PID
48 #define DEFAULT_PIDFILE PATH_WATCHQUAGGA_PID
50 #define DEFAULT_PIDFILE STATEDIR "/watchquagga.pid"
53 #define VTYDIR DAEMON_VTY_DIR
55 #define VTYDIR STATEDIR
58 #define PING_TOKEN "PING"
60 /* Needs to be global, referenced somewhere inside libzebra. */
61 struct thread_master
*master
;
67 MODE_SEPARATE_RESTART
,
68 MODE_PHASED_ZEBRA_RESTART
,
69 MODE_PHASED_ALL_RESTART
72 static const char *mode_str
[] =
76 "individual daemon restart",
77 "phased zebra restart",
78 "phased global restart for any failure",
86 PHASE_ZEBRA_RESTART_PENDING
,
87 PHASE_WAITING_ZEBRA_UP
90 static const char *phase_str
[] =
94 "Waiting for other daemons to come down",
95 "Zebra restart job running",
96 "Waiting for zebra to come up",
100 #define PHASE_TIMEOUT (3*gs.restart_timeout)
109 struct thread
*t_kill
;
113 static struct global_state
116 restart_phase_t phase
;
117 struct thread
*t_phase_hanging
;
121 long restart_timeout
;
122 long min_restart_interval
;
123 long max_restart_interval
;
125 struct daemon
*daemons
;
126 const char *restart_command
;
127 const char *start_command
;
128 const char *stop_command
;
129 struct restart_info restart
;
130 int unresponsive_restart
;
132 struct daemon
*special
; /* points to zebra when doing phased restart */
135 int numdown
; /* # of daemons that are not UP or UNRESPONSIVE */
137 .mode
= MODE_MONITOR
,
140 .period
= 1000*DEFAULT_PERIOD
,
141 .timeout
= DEFAULT_TIMEOUT
,
142 .restart_timeout
= DEFAULT_RESTART_TIMEOUT
,
143 .loglevel
= DEFAULT_LOGLEVEL
,
144 .min_restart_interval
= DEFAULT_MIN_RESTART
,
145 .max_restart_interval
= DEFAULT_MAX_RESTART
,
159 (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
161 static const char *state_str
[] =
172 daemon_state_t state
;
174 struct timeval echo_sent
;
176 struct thread
*t_wakeup
;
177 struct thread
*t_read
;
178 struct thread
*t_write
;
180 struct restart_info restart
;
183 static const struct option longopts
[] =
185 { "daemon", no_argument
, NULL
, 'd'},
186 { "statedir", required_argument
, NULL
, 'S'},
187 { "no-echo", no_argument
, NULL
, 'e'},
188 { "loglevel", required_argument
, NULL
, 'l'},
189 { "interval", required_argument
, NULL
, 'i'},
190 { "timeout", required_argument
, NULL
, 't'},
191 { "restart-timeout", required_argument
, NULL
, 'T'},
192 { "restart", required_argument
, NULL
, 'r'},
193 { "start-command", required_argument
, NULL
, 's'},
194 { "kill-command", required_argument
, NULL
, 'k'},
195 { "restart-all", required_argument
, NULL
, 'R'},
196 { "all-restart", no_argument
, NULL
, 'a'},
197 { "always-all-restart", no_argument
, NULL
, 'A'},
198 { "unresponsive-restart", no_argument
, NULL
, 'z'},
199 { "min-restart-interval", required_argument
, NULL
, 'm'},
200 { "max-restart-interval", required_argument
, NULL
, 'M'},
201 { "pid-file", required_argument
, NULL
, 'p'},
202 { "blank-string", required_argument
, NULL
, 'b'},
203 { "help", no_argument
, NULL
, 'h'},
204 { "version", no_argument
, NULL
, 'v'},
208 static int try_connect(struct daemon
*dmn
);
209 static int wakeup_send_echo(struct thread
*t_wakeup
);
210 static void try_restart(struct daemon
*dmn
);
211 static void phase_check(void);
214 usage(const char *progname
, int status
)
217 fprintf(stderr
, "Try `%s --help' for more information.\n", progname
);
220 printf("Usage : %s [OPTION...] <daemon name> ...\n\n\
221 Watchdog program to monitor status of quagga daemons and try to restart\n\
222 them if they are down or unresponsive. It determines whether a daemon is\n\
223 up based on whether it can connect to the daemon's vty unix stream socket.\n\
224 It then repeatedly sends echo commands over that socket to determine whether\n\
225 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
226 on the socket connection and know immediately that the daemon is down.\n\n\
227 The daemons to be monitored should be listed on the command line.\n\n\
228 This program can run in one of 5 modes:\n\n\
230 Just monitor and report on status changes. Example:\n\
231 %s -d zebra ospfd bgpd\n\n\
233 Whenever any daemon hangs or crashes, use the given command to restart\n\
234 them all. Example:\n\
236 -R '/sbin/service zebra restart; /sbin/service ospfd restart' \\\n\
239 When any single daemon hangs or crashes, restart only the daemon that's\n\
240 in trouble using the supplied restart command. Example:\n\
241 %s -dz -r '/sbin/service %%s restart' zebra ospfd bgpd\n\n\
243 The same as the previous mode, except that there is special treatment when\n\
244 the zebra daemon is in trouble. In that case, a phased restart approach\n\
245 is used: 1. stop all other daemons; 2. restart zebra; 3. start the other\n\
247 %s -adz -r '/sbin/service %%s restart' \\\n\
248 -s '/sbin/service %%s start' \\\n\
249 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
251 This is the same as the previous mode, except that the phased restart\n\
252 procedure is used whenever any of the daemons hangs or crashes. Example:\n\
253 %s -Adz -r '/sbin/service %%s restart' \\\n\
254 -s '/sbin/service %%s start' \\\n\
255 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
256 As of this writing, it is believed that mode 2 [%s]\n\
257 is not safe, and mode 3 [%s] may not be safe with some of the\n\
258 routing daemons.\n\n\
259 In order to avoid attempting to restart the daemons in a fast loop,\n\
260 the -m and -M options allow you to control the minimum delay between\n\
261 restart commands. The minimum restart delay is recalculated each time\n\
262 a restart is attempted: if the time since the last restart attempt exceeds\n\
263 twice the -M value, then the restart delay is set to the -m value.\n\
264 Otherwise, the interval is doubled (but capped at the -M value).\n\n",
265 progname
,mode_str
[0],progname
,mode_str
[1],progname
,mode_str
[2],
266 progname
,mode_str
[3],progname
,mode_str
[4],progname
,mode_str
[2],
270 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
271 to syslog instead of stdout.\n\
272 -S, --statedir Set the vty socket directory (default is %s)\n\
273 -e, --no-echo Do not ping the daemons to test responsiveness (this\n\
274 option is necessary if the daemons do not support the\n\
276 -l, --loglevel Set the logging level (default is %d).\n\
277 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
278 but it can be set higher than %d if extra-verbose debugging\n\
279 messages are desired.\n\
280 -m, --min-restart-interval\n\
281 Set the minimum seconds to wait between invocations of daemon\n\
282 restart commands (default is %d).\n\
283 -M, --max-restart-interval\n\
284 Set the maximum seconds to wait between invocations of daemon\n\
285 restart commands (default is %d).\n\
286 -i, --interval Set the status polling interval in seconds (default is %d)\n\
287 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
288 -T, --restart-timeout\n\
289 Set the restart (kill) timeout in seconds (default is %d).\n\
290 If any background jobs are still running after this much\n\
291 time has elapsed, they will be killed.\n\
292 -r, --restart Supply a Bourne shell command to use to restart a single\n\
293 daemon. The command string should include '%%s' where the\n\
294 name of the daemon should be substituted.\n\
295 Note that -r and -R are incompatible.\n\
296 -s, --start-command\n\
297 Supply a Bourne shell to command to use to start a single\n\
298 daemon. The command string should include '%%s' where the\n\
299 name of the daemon should be substituted.\n\
300 -k, --kill-command\n\
301 Supply a Bourne shell to command to use to stop a single\n\
302 daemon. The command string should include '%%s' where the\n\
303 name of the daemon should be substituted.\n\
305 When one or more daemons is down, try to restart everything\n\
306 using the Bourne shell command supplied as the argument.\n\
307 Note that -r and -R are incompatible.\n\
308 -z, --unresponsive-restart\n\
309 When a daemon is unresponsive, treat it as being down for\n\
312 When zebra hangs or crashes, restart all daemons using\n\
313 this phased approach: 1. stop all other daemons; 2. restart\n\
314 zebra; 3. start other daemons. Requires -r, -s, and -k.\n\
315 -A, --always-all-restart\n\
316 When any daemon (not just zebra) hangs or crashes, use the\n\
317 same phased restart mechanism described above for -a.\n\
318 Requires -r, -s, and -k.\n\
319 -p, --pid-file Set process identifier file name\n\
321 -b, --blank-string\n\
322 When the supplied argument string is found in any of the\n\
323 various shell command arguments (-r, -s, -k, or -R), replace\n\
324 it with a space. This is an ugly hack to circumvent problems\n\
325 passing command-line arguments with embedded spaces.\n\
326 -v, --version Print program version\n\
327 -h, --help Display this help and exit\n",
328 VTYDIR
,DEFAULT_LOGLEVEL
,LOG_EMERG
,LOG_DEBUG
,LOG_DEBUG
,
329 DEFAULT_MIN_RESTART
,DEFAULT_MAX_RESTART
,
330 DEFAULT_PERIOD
,DEFAULT_TIMEOUT
,DEFAULT_RESTART_TIMEOUT
,
338 run_background(char *shell_cmd
)
342 switch (child
= fork())
345 zlog_err("fork failed, cannot run command [%s]: %s",
346 shell_cmd
,safe_strerror(errno
));
350 /* Use separate process group so child processes can be killed easily. */
351 if (setpgid(0,0) < 0)
352 zlog_warn("warning: setpgid(0,0) failed: %s",safe_strerror(errno
));
356 char * const argv
[4] = { shell
, dashc
, shell_cmd
, NULL
};
357 execv("/bin/sh", argv
);
358 zlog_err("execv(/bin/sh -c '%s') failed: %s",
359 shell_cmd
,safe_strerror(errno
));
363 /* Parent process: we will reap the child later. */
364 zlog_err("Forked background command [pid %d]: %s",(int)child
,shell_cmd
);
369 static struct timeval
*
370 time_elapsed(struct timeval
*result
, const struct timeval
*start_time
)
372 gettimeofday(result
,NULL
);
373 result
->tv_sec
-= start_time
->tv_sec
;
374 result
->tv_usec
-= start_time
->tv_usec
;
375 while (result
->tv_usec
< 0)
377 result
->tv_usec
+= 1000000L;
/*
 * Timer callback for a background job that is still running.
 *
 * The thread argument is the struct restart_info describing the job.
 * Logs a warning with the seconds elapsed since restart->time, then
 * signals the job's process group (negative pid): SIGTERM while
 * restart->kills is zero, SIGKILL once it is non-zero.  Finally the
 * callback re-registers itself with thread_add_timer so the job is
 * checked again later.
 *
 * NOTE(review): the update of restart->kills, the timeout passed to
 * thread_add_timer and the return statement are not visible in this
 * excerpt -- confirm against the complete file.
 */
384 restart_kill(struct thread
*t_kill
)
386 struct restart_info
*restart
= THREAD_ARG(t_kill
);
387 struct timeval delay
;
389 time_elapsed(&delay
,&restart
->time
);
390 zlog_warn("Warning: %s %s child process %d still running after "
391 "%ld seconds, sending signal %d",
392 restart
->what
,restart
->name
,(int)restart
->pid
, (long)delay
.tv_sec
,
393 (restart
->kills
? SIGKILL
: SIGTERM
));
/* Negative pid: deliver the signal to the whole process group. */
394 kill(-restart
->pid
,(restart
->kills
? SIGKILL
: SIGTERM
));
396 restart
->t_kill
= thread_add_timer(master
,restart_kill
,restart
,
/*
 * Look up the restart_info record whose pid equals the given child pid.
 *
 * In MODE_GLOBAL_RESTART only the single global record gs.restart is
 * compared.  In every other mode the gs.daemons list is walked and the
 * address of the first daemon's restart record with a matching pid is
 * returned.
 *
 * NOTE(review): the return for a global-record match and the value
 * returned when nothing matches are not visible in this excerpt; the
 * caller tests the result against NULL, so presumably NULL means
 * "unknown child" -- confirm.
 */
401 static struct restart_info
*
402 find_child(pid_t child
)
404 if (gs
.mode
== MODE_GLOBAL_RESTART
)
406 if (gs
.restart
.pid
== child
)
412 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
414 if (dmn
->restart
.pid
== child
)
415 return &dmn
->restart
;
428 struct restart_info
*restart
;
430 switch (child
= waitpid(-1,&status
,WNOHANG
))
433 zlog_err("waitpid failed: %s",safe_strerror(errno
));
436 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
440 if ((restart
= find_child(child
)) != NULL
)
442 name
= restart
->name
;
443 what
= restart
->what
;
446 thread_cancel(restart
->t_kill
);
447 restart
->t_kill
= NULL
;
448 /* Update restart time to reflect the time the command completed. */
449 gettimeofday(&restart
->time
,NULL
);
453 zlog_err("waitpid returned status for an unknown child process %d",
458 if (WIFSTOPPED(status
))
459 zlog_warn("warning: %s %s process %d is stopped",
460 what
,name
,(int)child
);
461 else if (WIFSIGNALED(status
))
462 zlog_warn("%s %s process %d terminated due to signal %d",
463 what
,name
,(int)child
,WTERMSIG(status
));
464 else if (WIFEXITED(status
))
466 if (WEXITSTATUS(status
) != 0)
467 zlog_warn("%s %s process %d exited with non-zero status %d",
468 what
,name
,(int)child
,WEXITSTATUS(status
));
470 zlog_debug("%s %s process %d exited normally",what
,name
,(int)child
);
473 zlog_err("cannot interpret %s %s process %d wait status 0x%x",
474 what
,name
,(int)child
,status
);
479 run_job(struct restart_info
*restart
, const char *cmdtype
, const char *command
,
480 int force
, int update_interval
)
482 struct timeval delay
;
484 if (gs
.loglevel
> LOG_DEBUG
+1)
485 zlog_debug("attempting to %s %s",cmdtype
,restart
->name
);
489 if (gs
.loglevel
> LOG_DEBUG
+1)
490 zlog_debug("cannot %s %s, previous pid %d still running",
491 cmdtype
,restart
->name
,(int)restart
->pid
);
495 /* Note: time_elapsed test must come before the force test, since we need
496 to make sure that delay is initialized for use below in updating the
498 if ((time_elapsed(&delay
,&restart
->time
)->tv_sec
< restart
->interval
) &&
501 if (gs
.loglevel
> LOG_DEBUG
+1)
502 zlog_debug("postponing %s %s: "
503 "elapsed time %ld < retry interval %ld",
504 cmdtype
,restart
->name
,(long)delay
.tv_sec
,restart
->interval
);
508 gettimeofday(&restart
->time
,NULL
);
511 char cmd
[strlen(command
)+strlen(restart
->name
)+1];
512 snprintf(cmd
,sizeof(cmd
),command
,restart
->name
);
513 if ((restart
->pid
= run_background(cmd
)) > 0)
515 restart
->t_kill
= thread_add_timer(master
,restart_kill
,restart
,
517 restart
->what
= cmdtype
;
524 /* Calculate the new restart interval. */
527 if (delay
.tv_sec
> 2*gs
.max_restart_interval
)
528 restart
->interval
= gs
.min_restart_interval
;
529 else if ((restart
->interval
*= 2) > gs
.max_restart_interval
)
530 restart
->interval
= gs
.max_restart_interval
;
531 if (gs
.loglevel
> LOG_DEBUG
+1)
532 zlog_debug("restart %s interval is now %ld",
533 restart
->name
,restart
->interval
);
538 #define SET_READ_HANDLER(DMN) \
539 (DMN)->t_read = thread_add_read(master,handle_read,(DMN),(DMN)->fd)
541 #define SET_WAKEUP_DOWN(DMN) \
542 (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_down,(DMN), \
545 #define SET_WAKEUP_UNRESPONSIVE(DMN) \
546 (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_unresponsive,(DMN), \
549 #define SET_WAKEUP_ECHO(DMN) \
550 (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_send_echo,(DMN), \
554 wakeup_down(struct thread
*t_wakeup
)
556 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
558 dmn
->t_wakeup
= NULL
;
559 if (try_connect(dmn
) < 0)
560 SET_WAKEUP_DOWN(dmn
);
561 if ((dmn
->connect_tries
> 1) && (dmn
->state
!= DAEMON_UP
))
567 wakeup_init(struct thread
*t_wakeup
)
569 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
571 dmn
->t_wakeup
= NULL
;
572 if (try_connect(dmn
) < 0)
574 SET_WAKEUP_DOWN(dmn
);
575 zlog_err("%s state -> down : initial connection attempt failed",
577 dmn
->state
= DAEMON_DOWN
;
583 daemon_down(struct daemon
*dmn
, const char *why
)
585 if (IS_UP(dmn
) || (dmn
->state
== DAEMON_INIT
))
586 zlog_err("%s state -> down : %s",dmn
->name
,why
);
587 else if (gs
.loglevel
> LOG_DEBUG
)
588 zlog_debug("%s still down : %s",dmn
->name
,why
);
591 dmn
->state
= DAEMON_DOWN
;
597 THREAD_OFF(dmn
->t_read
);
598 THREAD_OFF(dmn
->t_write
);
599 THREAD_OFF(dmn
->t_wakeup
);
600 if (try_connect(dmn
) < 0)
601 SET_WAKEUP_DOWN(dmn
);
606 handle_read(struct thread
*t_read
)
608 struct daemon
*dmn
= THREAD_ARG(t_read
);
609 static const char resp
[sizeof(PING_TOKEN
)+4] = PING_TOKEN
"\n";
610 char buf
[sizeof(resp
)+100];
612 struct timeval delay
;
615 if ((rc
= read(dmn
->fd
,buf
,sizeof(buf
))) < 0)
619 if (ERRNO_IO_RETRY(errno
))
621 /* Pretend it never happened. */
622 SET_READ_HANDLER(dmn
);
625 snprintf(why
,sizeof(why
),"unexpected read error: %s",
626 safe_strerror(errno
));
627 daemon_down(dmn
,why
);
632 daemon_down(dmn
,"read returned EOF");
635 if (!dmn
->echo_sent
.tv_sec
)
637 char why
[sizeof(buf
)+100];
638 snprintf(why
,sizeof(why
),"unexpected read returns %d bytes: %.*s",
639 (int)rc
,(int)rc
,buf
);
640 daemon_down(dmn
,why
);
644 /* We are expecting an echo response: is there any chance that the
645 response would not be returned entirely in the first read? That
646 seems inconceivable... */
647 if ((rc
!= sizeof(resp
)) || memcmp(buf
,resp
,sizeof(resp
)))
649 char why
[100+sizeof(buf
)];
650 snprintf(why
,sizeof(why
),"read returned bad echo response of %d bytes "
651 "(expecting %u): %.*s",
652 (int)rc
,(u_int
)sizeof(resp
),(int)rc
,buf
);
653 daemon_down(dmn
,why
);
657 time_elapsed(&delay
,&dmn
->echo_sent
);
658 dmn
->echo_sent
.tv_sec
= 0;
659 if (dmn
->state
== DAEMON_UNRESPONSIVE
)
661 if (delay
.tv_sec
< gs
.timeout
)
663 dmn
->state
= DAEMON_UP
;
664 zlog_warn("%s state -> up : echo response received after %ld.%06ld "
665 "seconds", dmn
->name
,
666 (long)delay
.tv_sec
, (long)delay
.tv_usec
);
669 zlog_warn("%s: slow echo response finally received after %ld.%06ld "
670 "seconds", dmn
->name
,
671 (long)delay
.tv_sec
, (long)delay
.tv_usec
);
673 else if (gs
.loglevel
> LOG_DEBUG
+1)
674 zlog_debug("%s: echo response received after %ld.%06ld seconds",
675 dmn
->name
, (long)delay
.tv_sec
, (long)delay
.tv_usec
);
677 SET_READ_HANDLER(dmn
);
679 thread_cancel(dmn
->t_wakeup
);
680 SET_WAKEUP_ECHO(dmn
);
/*
 * Record that a daemon is reachable.
 *
 * dmn: the daemon whose connection has been established.
 * why: short human-readable reason, included in the log message.
 *
 * Sets the state to DAEMON_UP, resets the connect_tries counter, logs
 * the transition at notice level and schedules the next echo wakeup
 * via SET_WAKEUP_ECHO.
 *
 * NOTE(review): some lines of this function are not visible in this
 * excerpt; only the visible statements are described.
 */
686 daemon_up(struct daemon
*dmn
, const char *why
)
688 dmn
->state
= DAEMON_UP
;
690 dmn
->connect_tries
= 0;
691 zlog_notice("%s state -> up : %s",dmn
->name
,why
);
693 SET_WAKEUP_ECHO(dmn
);
698 check_connect(struct thread
*t_write
)
700 struct daemon
*dmn
= THREAD_ARG(t_write
);
702 socklen_t reslen
= sizeof(sockerr
);
705 if (getsockopt(dmn
->fd
,SOL_SOCKET
,SO_ERROR
,(char *)&sockerr
,&reslen
) < 0)
707 zlog_warn("%s: check_connect: getsockopt failed: %s",
708 dmn
->name
,safe_strerror(errno
));
709 daemon_down(dmn
,"getsockopt failed checking connection success");
712 if ((reslen
== sizeof(sockerr
)) && sockerr
)
715 snprintf(why
,sizeof(why
),
716 "getsockopt reports that connection attempt failed: %s",
717 safe_strerror(sockerr
));
718 daemon_down(dmn
,why
);
722 daemon_up(dmn
,"delayed connect succeeded");
/*
 * Timer callback fired when a connection attempt has not completed in
 * time (it is registered from try_connect alongside the write handler).
 *
 * Clears the daemon's t_wakeup pointer, since this timer has now fired,
 * formats a "timed out" reason string and hands the daemon to
 * daemon_down().
 *
 * NOTE(review): the declaration of `why` and the seconds value passed
 * to snprintf are not visible in this excerpt.
 */
727 wakeup_connect_hanging(struct thread
*t_wakeup
)
729 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
732 dmn
->t_wakeup
= NULL
;
733 snprintf(why
,sizeof(why
),"connection attempt timed out after %ld seconds",
735 daemon_down(dmn
,why
);
739 /* Making connection to protocol daemon. */
741 try_connect(struct daemon
*dmn
)
744 struct sockaddr_un addr
;
747 if (gs
.loglevel
> LOG_DEBUG
+1)
748 zlog_debug("%s: attempting to connect",dmn
->name
);
749 dmn
->connect_tries
++;
751 memset (&addr
, 0, sizeof (struct sockaddr_un
));
752 addr
.sun_family
= AF_UNIX
;
753 snprintf(addr
.sun_path
, sizeof(addr
.sun_path
), "%s/%s.vty",
754 gs
.vtydir
,dmn
->name
);
755 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
756 len
= addr
.sun_len
= SUN_LEN(&addr
);
758 len
= sizeof (addr
.sun_family
) + strlen (addr
.sun_path
);
759 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
761 /* Quick check to see if we might succeed before we go to the trouble
762 of creating a socket. */
763 if (access(addr
.sun_path
, W_OK
) < 0)
766 zlog_err("%s: access to socket %s denied: %s",
767 dmn
->name
,addr
.sun_path
,safe_strerror(errno
));
771 if ((sock
= socket (AF_UNIX
, SOCK_STREAM
, 0)) < 0)
773 zlog_err("%s(%s): cannot make socket: %s",
774 __func__
,addr
.sun_path
, safe_strerror(errno
));
778 if (set_nonblocking(sock
) < 0)
780 zlog_err("%s(%s): set_nonblocking(%d) failed",
781 __func__
, addr
.sun_path
, sock
);
786 if (connect (sock
, (struct sockaddr
*) &addr
, len
) < 0)
788 if ((errno
!= EINPROGRESS
) && (errno
!= EWOULDBLOCK
))
790 if (gs
.loglevel
> LOG_DEBUG
)
791 zlog_debug("%s(%s): connect failed: %s",
792 __func__
,addr
.sun_path
, safe_strerror(errno
));
796 if (gs
.loglevel
> LOG_DEBUG
)
797 zlog_debug("%s: connection in progress",dmn
->name
);
798 dmn
->state
= DAEMON_CONNECTING
;
800 dmn
->t_write
= thread_add_write(master
,check_connect
,dmn
,dmn
->fd
);
801 dmn
->t_wakeup
= thread_add_timer(master
,wakeup_connect_hanging
,dmn
,
803 SET_READ_HANDLER(dmn
);
808 SET_READ_HANDLER(dmn
);
809 daemon_up(dmn
,"connect succeeded");
/*
 * Timer callback fired when the current phase of a phased restart has
 * lasted PHASE_TIMEOUT seconds (the timer is armed in set_phase).
 *
 * Clears gs.t_phase_hanging because this timer has fired, logs the
 * name of the stuck phase, and abandons the phased restart by setting
 * gs.phase back to PHASE_NONE.  The thread argument is not used by the
 * visible code.
 */
814 phase_hanging(struct thread
*t_hanging
)
816 gs
.t_phase_hanging
= NULL
;
817 zlog_err("Phase [%s] hanging for %ld seconds, aborting phased restart",
818 phase_str
[gs
.phase
],PHASE_TIMEOUT
);
819 gs
.phase
= PHASE_NONE
;
/*
 * Enter a new phase of the phased restart procedure.
 *
 * Stores new_phase in gs.phase, cancels any phase-hanging timer that is
 * still pending for the previous phase, and arms a fresh one with
 * phase_hanging as its callback.
 *
 * NOTE(review): the timeout argument of thread_add_timer is not visible
 * in this excerpt; phase_hanging reports PHASE_TIMEOUT, so presumably
 * that is the value used -- confirm.
 */
824 set_phase(restart_phase_t new_phase
)
826 gs
.phase
= new_phase
;
827 if (gs
.t_phase_hanging
)
828 thread_cancel(gs
.t_phase_hanging
);
829 gs
.t_phase_hanging
= thread_add_timer(master
,phase_hanging
,NULL
,
840 case PHASE_STOPS_PENDING
:
843 zlog_info("Phased restart: all routing daemon stop jobs have completed.");
844 set_phase(PHASE_WAITING_DOWN
);
846 case PHASE_WAITING_DOWN
:
847 if (gs
.numdown
+IS_UP(gs
.special
) < gs
.numdaemons
)
849 zlog_info("Phased restart: all routing daemons now down.");
850 run_job(&gs
.special
->restart
,"restart",gs
.restart_command
,1,1);
851 set_phase(PHASE_ZEBRA_RESTART_PENDING
);
853 case PHASE_ZEBRA_RESTART_PENDING
:
854 if (gs
.special
->restart
.pid
)
856 zlog_info("Phased restart: %s restart job completed.",gs
.special
->name
);
857 set_phase(PHASE_WAITING_ZEBRA_UP
);
859 case PHASE_WAITING_ZEBRA_UP
:
860 if (!IS_UP(gs
.special
))
862 zlog_info("Phased restart: %s is now up.",gs
.special
->name
);
865 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
867 if (dmn
!= gs
.special
)
868 run_job(&dmn
->restart
,"start",gs
.start_command
,1,0);
871 gs
.phase
= PHASE_NONE
;
872 THREAD_OFF(gs
.t_phase_hanging
);
873 zlog_notice("Phased global restart has completed.");
879 try_restart(struct daemon
*dmn
)
885 case MODE_GLOBAL_RESTART
:
886 run_job(&gs
.restart
,"restart",gs
.restart_command
,0,1);
888 case MODE_SEPARATE_RESTART
:
889 run_job(&dmn
->restart
,"restart",gs
.restart_command
,0,1);
891 case MODE_PHASED_ZEBRA_RESTART
:
892 if (dmn
!= gs
.special
)
894 if ((gs
.special
->state
== DAEMON_UP
) && (gs
.phase
== PHASE_NONE
))
895 run_job(&dmn
->restart
,"restart",gs
.restart_command
,0,1);
897 zlog_debug("%s: postponing restart attempt because master %s daemon "
898 "not up [%s], or phased restart in progress",
899 dmn
->name
,gs
.special
->name
,state_str
[gs
.special
->state
]);
903 case MODE_PHASED_ALL_RESTART
:
904 if ((gs
.phase
!= PHASE_NONE
) || gs
.numpids
)
906 if (gs
.loglevel
> LOG_DEBUG
+1)
907 zlog_debug("postponing phased global restart: restart already in "
908 "progress [%s], or outstanding child processes [%d]",
909 phase_str
[gs
.phase
],gs
.numpids
);
912 /* Is it too soon for a restart? */
914 struct timeval delay
;
915 if (time_elapsed(&delay
,&gs
.special
->restart
.time
)->tv_sec
<
916 gs
.special
->restart
.interval
)
918 if (gs
.loglevel
> LOG_DEBUG
+1)
919 zlog_debug("postponing phased global restart: "
920 "elapsed time %ld < retry interval %ld",
921 (long)delay
.tv_sec
,gs
.special
->restart
.interval
);
925 run_job(&gs
.restart
,"restart",gs
.restart_command
,0,1);
928 zlog_err("error: unknown restart mode %d",gs
.mode
);
934 wakeup_unresponsive(struct thread
*t_wakeup
)
936 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
938 dmn
->t_wakeup
= NULL
;
939 if (dmn
->state
!= DAEMON_UNRESPONSIVE
)
940 zlog_err("%s: no longer unresponsive (now %s), "
941 "wakeup should have been cancelled!",
942 dmn
->name
,state_str
[dmn
->state
]);
945 SET_WAKEUP_UNRESPONSIVE(dmn
);
/*
 * Timer callback fired when no echo reply arrived within gs.timeout
 * seconds of sending a ping (it is registered by wakeup_send_echo).
 *
 * Clears the daemon's t_wakeup pointer, marks the daemon
 * DAEMON_UNRESPONSIVE, logs the transition, and schedules the
 * unresponsive wakeup via SET_WAKEUP_UNRESPONSIVE.
 *
 * NOTE(review): the statement controlled by the
 * gs.unresponsive_restart test is not visible in this excerpt.
 */
952 wakeup_no_answer(struct thread
*t_wakeup
)
954 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
956 dmn
->t_wakeup
= NULL
;
957 dmn
->state
= DAEMON_UNRESPONSIVE
;
958 zlog_err("%s state -> unresponsive : no response yet to ping "
959 "sent %ld seconds ago",dmn
->name
,gs
.timeout
);
960 if (gs
.unresponsive_restart
)
962 SET_WAKEUP_UNRESPONSIVE(dmn
);
969 wakeup_send_echo(struct thread
*t_wakeup
)
971 static const char echocmd
[] = "echo " PING_TOKEN
;
973 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
975 dmn
->t_wakeup
= NULL
;
976 if (((rc
= write(dmn
->fd
,echocmd
,sizeof(echocmd
))) < 0) ||
977 ((size_t)rc
!= sizeof(echocmd
)))
979 char why
[100+sizeof(echocmd
)];
980 snprintf(why
,sizeof(why
),"write '%s' returned %d instead of %u",
981 echocmd
,(int)rc
,(u_int
)sizeof(echocmd
));
982 daemon_down(dmn
,why
);
986 gettimeofday(&dmn
->echo_sent
,NULL
);
987 dmn
->t_wakeup
= thread_add_timer(master
,wakeup_no_answer
,dmn
,gs
.timeout
);
995 zlog_notice("Terminating on signal");
996 systemd_send_stopping ();
/*
 * Check that a per-daemon shell command template is safe to use as a
 * format string with exactly one string argument (the daemon name).
 *
 * cmd: the command template supplied on the command line.
 *
 * Returns non-zero only when cmd contains exactly one '%' character and
 * that character is immediately followed by 's'.  Returns 0 otherwise,
 * including when cmd is NULL.
 */
static int
valid_command(const char *cmd)
{
  const char *p;

  if (cmd == NULL)
    return 0;
  p = strchr(cmd,'%');
  /* The first '%' must introduce "%s", and no further '%' may follow. */
  return (p != NULL) && (p[1] == 's') && (strchr(p+1,'%') == NULL);
}
/* This is an ugly hack to circumvent problems with passing command-line
   arguments that contain spaces.  The fix is to use a configuration file. */
/*
 * Return a newly allocated copy of cmd in which every occurrence of
 * blankstr has been replaced by a single space.
 *
 * cmd:      the shell command string as given on the command line.
 * blankstr: the placeholder token standing in for a space; must not be
 *           NULL.  An empty token leaves the command unchanged.
 *
 * The caller owns the returned string.  If the copy cannot be
 * allocated, an error is printed and the program exits with status 1.
 */
static char *
translate_blanks(const char *cmd, const char *blankstr)
{
  char *res;
  char *p;
  size_t bslen = strlen(blankstr);

  if (!(res = strdup(cmd)))
    {
      perror("strdup");
      exit(1);
    }
  /* An empty token matches at every position; replacing it would grow
     the string past the end of the copy, so treat it as "no token". */
  if (bslen == 0)
    return res;
  /* Continue each search just past the space that was written, rather
     than from the start of the string: otherwise a token that itself
     contains a space would be found again forever. */
  for (p = res; (p = strstr(p,blankstr)) != NULL; p++)
    {
      *p = ' ';
      if (bslen != 1)
        memmove(p+1,p+bslen,strlen(p+bslen)+1);
    }
  return res;
}
1032 main(int argc
, char **argv
)
1034 const char *progname
;
1036 int daemon_mode
= 0;
1037 const char *pidfile
= DEFAULT_PIDFILE
;
1038 const char *special
= "zebra";
1039 const char *blankstr
= NULL
;
1040 static struct quagga_signal_t my_signals
[] =
1052 .handler
= sigchild
,
1056 if ((progname
= strrchr (argv
[0], '/')) != NULL
)
1061 gs
.restart
.name
= "all";
1062 while ((opt
= getopt_long(argc
, argv
, "aAb:dek:l:m:M:i:p:r:R:S:s:t:T:zvh",
1063 longopts
, 0)) != EOF
)
1070 if ((gs
.mode
!= MODE_MONITOR
) && (gs
.mode
!= MODE_SEPARATE_RESTART
))
1072 fputs("Ambiguous operating mode selected.\n",stderr
);
1073 return usage(progname
,1);
1075 gs
.mode
= MODE_PHASED_ZEBRA_RESTART
;
1078 if ((gs
.mode
!= MODE_MONITOR
) && (gs
.mode
!= MODE_SEPARATE_RESTART
))
1080 fputs("Ambiguous operating mode selected.\n",stderr
);
1081 return usage(progname
,1);
1083 gs
.mode
= MODE_PHASED_ALL_RESTART
;
1095 if (!valid_command(optarg
))
1097 fprintf(stderr
,"Invalid kill command, must contain '%%s': %s\n",
1099 return usage(progname
,1);
1101 gs
.stop_command
= optarg
;
1106 if ((sscanf(optarg
,"%d%1s",&gs
.loglevel
,garbage
) != 1) ||
1107 (gs
.loglevel
< LOG_EMERG
))
1109 fprintf(stderr
,"Invalid loglevel argument: %s\n",optarg
);
1110 return usage(progname
,1);
1117 if ((sscanf(optarg
,"%ld%1s",
1118 &gs
.min_restart_interval
,garbage
) != 1) ||
1119 (gs
.min_restart_interval
< 0))
1121 fprintf(stderr
,"Invalid min_restart_interval argument: %s\n",
1123 return usage(progname
,1);
1130 if ((sscanf(optarg
,"%ld%1s",
1131 &gs
.max_restart_interval
,garbage
) != 1) ||
1132 (gs
.max_restart_interval
< 0))
1134 fprintf(stderr
,"Invalid max_restart_interval argument: %s\n",
1136 return usage(progname
,1);
1144 if ((sscanf(optarg
,"%d%1s",&period
,garbage
) != 1) ||
1147 fprintf(stderr
,"Invalid interval argument: %s\n",optarg
);
1148 return usage(progname
,1);
1150 gs
.period
= 1000*period
;
1157 if ((gs
.mode
== MODE_GLOBAL_RESTART
) ||
1158 (gs
.mode
== MODE_SEPARATE_RESTART
))
1160 fputs("Ambiguous operating mode selected.\n",stderr
);
1161 return usage(progname
,1);
1163 if (!valid_command(optarg
))
1166 "Invalid restart command, must contain '%%s': %s\n",
1168 return usage(progname
,1);
1170 gs
.restart_command
= optarg
;
1171 if (gs
.mode
== MODE_MONITOR
)
1172 gs
.mode
= MODE_SEPARATE_RESTART
;
1175 if (gs
.mode
!= MODE_MONITOR
)
1177 fputs("Ambiguous operating mode selected.\n",stderr
);
1178 return usage(progname
,1);
1180 if (strchr(optarg
,'%'))
1183 "Invalid restart-all arg, must not contain '%%s': %s\n",
1185 return usage(progname
,1);
1187 gs
.restart_command
= optarg
;
1188 gs
.mode
= MODE_GLOBAL_RESTART
;
1191 if (!valid_command(optarg
))
1193 fprintf(stderr
,"Invalid start command, must contain '%%s': %s\n",
1195 return usage(progname
,1);
1197 gs
.start_command
= optarg
;
1205 if ((sscanf(optarg
,"%ld%1s",&gs
.timeout
,garbage
) != 1) ||
1208 fprintf(stderr
,"Invalid timeout argument: %s\n",optarg
);
1209 return usage(progname
,1);
1216 if ((sscanf(optarg
,"%ld%1s",&gs
.restart_timeout
,garbage
) != 1) ||
1217 (gs
.restart_timeout
< 1))
1219 fprintf(stderr
,"Invalid restart timeout argument: %s\n",optarg
);
1220 return usage(progname
,1);
1225 gs
.unresponsive_restart
= 1;
1228 printf ("%s version %s\n", progname
, QUAGGA_VERSION
);
1229 puts("Copyright 2004 Andrew J. Schorr");
1232 return usage(progname
,0);
1234 fputs("Invalid option.\n",stderr
);
1235 return usage(progname
,1);
1239 if (gs
.unresponsive_restart
&& (gs
.mode
== MODE_MONITOR
))
1241 fputs("Option -z requires a -r or -R restart option.\n",stderr
);
1242 return usage(progname
,1);
1247 if (gs
.restart_command
|| gs
.start_command
|| gs
.stop_command
)
1249 fprintf(stderr
,"No kill/(re)start commands needed for %s mode.\n",
1251 return usage(progname
,1);
1254 case MODE_GLOBAL_RESTART
:
1255 case MODE_SEPARATE_RESTART
:
1256 if (!gs
.restart_command
|| gs
.start_command
|| gs
.stop_command
)
1258 fprintf(stderr
,"No start/kill commands needed in [%s] mode.\n",
1260 return usage(progname
,1);
1263 case MODE_PHASED_ZEBRA_RESTART
:
1264 case MODE_PHASED_ALL_RESTART
:
1265 if (!gs
.restart_command
|| !gs
.start_command
|| !gs
.stop_command
)
1268 "Need start, kill, and restart commands in [%s] mode.\n",
1270 return usage(progname
,1);
1277 if (gs
.restart_command
)
1278 gs
.restart_command
= translate_blanks(gs
.restart_command
,blankstr
);
1279 if (gs
.start_command
)
1280 gs
.start_command
= translate_blanks(gs
.start_command
,blankstr
);
1281 if (gs
.stop_command
)
1282 gs
.stop_command
= translate_blanks(gs
.stop_command
,blankstr
);
1285 gs
.restart
.interval
= gs
.min_restart_interval
;
1286 master
= thread_master_create();
1287 systemd_send_started (master
, 0);
1288 signal_init (master
, array_size(my_signals
), my_signals
);
1289 srandom(time(NULL
));
1293 struct daemon
*tail
= NULL
;
1295 for (i
= optind
; i
< argc
; i
++)
1299 if (!(dmn
= (struct daemon
*)calloc(1,sizeof(*dmn
))))
1301 fprintf(stderr
,"calloc(1,%u) failed: %s\n",
1302 (u_int
)sizeof(*dmn
), safe_strerror(errno
));
1305 dmn
->name
= dmn
->restart
.name
= argv
[i
];
1306 dmn
->state
= DAEMON_INIT
;
1310 dmn
->t_wakeup
= thread_add_timer_msec(master
,wakeup_init
,dmn
,
1311 100+(random() % 900));
1312 dmn
->restart
.interval
= gs
.min_restart_interval
;
1319 if (((gs
.mode
== MODE_PHASED_ZEBRA_RESTART
) ||
1320 (gs
.mode
== MODE_PHASED_ALL_RESTART
)) &&
1321 !strcmp(dmn
->name
,special
))
1327 fputs("Must specify one or more daemons to monitor.\n",stderr
);
1328 return usage(progname
,1);
1330 if (((gs
.mode
== MODE_PHASED_ZEBRA_RESTART
) ||
1331 (gs
.mode
== MODE_PHASED_ALL_RESTART
)) && !gs
.special
)
1333 fprintf(stderr
,"In mode [%s], but cannot find master daemon %s\n",
1334 mode_str
[gs
.mode
],special
);
1335 return usage(progname
,1);
1338 zlog_default
= openzlog(progname
, ZLOG_NONE
, 0,
1339 LOG_CONS
|LOG_NDELAY
|LOG_PID
, LOG_DAEMON
);
1340 zlog_set_level(NULL
, ZLOG_DEST_MONITOR
, ZLOG_DISABLED
);
1343 zlog_set_level(NULL
, ZLOG_DEST_SYSLOG
, MIN(gs
.loglevel
,LOG_DEBUG
));
1344 if (daemon (0, 0) < 0)
1346 fprintf(stderr
, "Watchquagga daemon failed: %s", strerror(errno
));
1351 zlog_set_level(NULL
, ZLOG_DEST_STDOUT
, MIN(gs
.loglevel
,LOG_DEBUG
));
1353 /* Make sure we're not already running. */
1354 pid_output (pidfile
);
1356 /* Announce which daemons are being monitored. */
1361 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
1362 len
+= strlen(dmn
->name
)+1;
1368 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
1372 strcpy(p
,dmn
->name
);
1375 zlog_notice("%s %s watching [%s], mode [%s]",
1376 progname
, QUAGGA_VERSION
, buf
, mode_str
[gs
.mode
]);
1381 struct thread thread
;
1383 while (thread_fetch (master
, &thread
))
1384 thread_call (&thread
);
1387 systemd_send_stopping ();