2 $Id: watchquagga.c,v 1.6 2004/12/23 19:35:56 paul Exp $
4 Monitor status of quagga daemons and restart if necessary.
6 Copyright (C) 2004 Andrew J. Schorr
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 #include <sys/types.h>
25 #include <sys/socket.h>
50 #define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
53 /* Macros to help randomize timers. */
54 #define JITTER(X) ((random() % ((X)+1))-((X)/2))
55 #define FUZZY(X) ((X)+JITTER((X)/20))
57 #define DEFAULT_PERIOD 5
58 #define DEFAULT_TIMEOUT 10
59 #define DEFAULT_RESTART_TIMEOUT 20
60 #define DEFAULT_LOGLEVEL LOG_INFO
61 #define DEFAULT_MIN_RESTART 60
62 #define DEFAULT_MAX_RESTART 600
63 #ifdef PATH_WATCHQUAGGA_PID
64 #define DEFAULT_PIDFILE PATH_WATCHQUAGGA_PID
66 #define DEFAULT_PIDFILE STATEDIR "/watchquagga.pid"
69 #define VTYDIR DAEMON_VTY_DIR
71 #define VTYDIR STATEDIR
74 #define PING_TOKEN "PING"
76 /* Needs to be global, referenced somewhere inside libzebra. */
77 struct thread_master
*master
;
83 MODE_SEPARATE_RESTART
,
84 MODE_PHASED_ZEBRA_RESTART
,
85 MODE_PHASED_ALL_RESTART
88 static const char *mode_str
[] =
92 "individual daemon restart",
93 "phased zebra restart",
94 "phased global restart for any failure",
102 PHASE_ZEBRA_RESTART_PENDING
,
103 PHASE_WAITING_ZEBRA_UP
106 static const char *phase_str
[] =
110 "Waiting for other daemons to come down",
111 "Zebra restart job running",
112 "Waiting for zebra to come up",
113 "Start jobs running",
116 #define PHASE_TIMEOUT (3*gs.restart_timeout)
125 struct thread
*t_kill
;
129 static struct global_state
132 restart_phase_t phase
;
133 struct thread
*t_phase_hanging
;
137 long restart_timeout
;
138 long min_restart_interval
;
139 long max_restart_interval
;
141 struct daemon
*daemons
;
142 const char *restart_command
;
143 const char *start_command
;
144 const char *stop_command
;
145 struct restart_info restart
;
146 int unresponsive_restart
;
148 struct daemon
*special
; /* points to zebra when doing phased restart */
151 int numdown
; /* # of daemons that are not UP or UNRESPONSIVE */
153 .mode
= MODE_MONITOR
,
156 .period
= 1000*DEFAULT_PERIOD
,
157 .timeout
= DEFAULT_TIMEOUT
,
158 .restart_timeout
= DEFAULT_RESTART_TIMEOUT
,
159 .loglevel
= DEFAULT_LOGLEVEL
,
160 .min_restart_interval
= DEFAULT_MIN_RESTART
,
161 .max_restart_interval
= DEFAULT_MAX_RESTART
,
175 (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
177 static const char *state_str
[] =
188 daemon_state_t state
;
190 struct timeval echo_sent
;
192 struct thread
*t_wakeup
;
193 struct thread
*t_read
;
194 struct thread
*t_write
;
196 struct restart_info restart
;
199 static const struct option longopts
[] =
201 { "daemon", no_argument
, NULL
, 'd'},
202 { "statedir", required_argument
, NULL
, 'S'},
203 { "no-echo", no_argument
, NULL
, 'e'},
204 { "loglevel", required_argument
, NULL
, 'l'},
205 { "interval", required_argument
, NULL
, 'i'},
206 { "timeout", required_argument
, NULL
, 't'},
207 { "restart-timeout", required_argument
, NULL
, 'T'},
208 { "restart", required_argument
, NULL
, 'r'},
209 { "start-command", required_argument
, NULL
, 's'},
210 { "kill-command", required_argument
, NULL
, 'k'},
211 { "restart-all", required_argument
, NULL
, 'R'},
212 { "all-restart", no_argument
, NULL
, 'a'},
213 { "always-all-restart", no_argument
, NULL
, 'A'},
214 { "unresponsive-restart", no_argument
, NULL
, 'z'},
215 { "min-restart-interval", required_argument
, NULL
, 'm'},
216 { "max-restart-interval", required_argument
, NULL
, 'M'},
217 { "pid-file", required_argument
, NULL
, 'p'},
218 { "blank-string", required_argument
, NULL
, 'b'},
219 { "help", no_argument
, NULL
, 'h'},
220 { "version", no_argument
, NULL
, 'v'},
224 static int try_connect(struct daemon
*dmn
);
225 static int wakeup_send_echo(struct thread
*t_wakeup
);
226 static void try_restart(struct daemon
*dmn
);
227 static void phase_check(void);
230 usage(const char *progname
, int status
)
233 fprintf(stderr
, "Try `%s --help' for more information.\n", progname
);
235 printf("Usage : %s [OPTION...] <daemon name> ...\n\n\
236 Watchdog program to monitor status of quagga daemons and try to restart\n\
237 them if they are down or unresponsive. It determines whether a daemon is\n\
238 up based on whether it can connect to the daemon's vty unix stream socket.\n\
239 It then repeatedly sends echo commands over that socket to determine whether\n\
240 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
241 on the socket connection and know immediately that the daemon is down.\n\n\
242 The daemons to be monitored should be listed on the command line.\n\n\
243 This program can run in one of 5 modes:\n\n\
245 Just monitor and report on status changes. Example:\n\
246 %s -d zebra ospfd bgpd\n\n\
248 Whenever any daemon hangs or crashes, use the given command to restart\n\
249 them all. Example:\n\
251 -R '/sbin/service zebra restart; /sbin/service ospfd restart' \\\n\
254 When any single daemon hangs or crashes, restart only the daemon that's\n\
255 in trouble using the supplied restart command. Example:\n\
256 %s -dz -r '/sbin/service %%s restart' zebra ospfd bgpd\n\n\
258 The same as the previous mode, except that there is special treatment when\n\
259 the zebra daemon is in trouble. In that case, a phased restart approach\n\
260 is used: 1. stop all other daemons; 2. restart zebra; 3. start the other\n\
262 %s -adz -r '/sbin/service %%s restart' \\\n\
263 -s '/sbin/service %%s start' \\\n\
264 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
266 This is the same as the previous mode, except that the phased restart\n\
267 procedure is used whenever any of the daemons hangs or crashes. Example:\n\
268 %s -Adz -r '/sbin/service %%s restart' \\\n\
269 -s '/sbin/service %%s start' \\\n\
270 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
271 As of this writing, it is believed that mode 2 [%s]\n\
272 is not safe, and mode 3 [%s] may not be safe with some of the\n\
273 routing daemons.\n\n\
274 In order to avoid attempting to restart the daemons in a fast loop,\n\
275 the -m and -M options allow you to control the minimum delay between\n\
276 restart commands. The minimum restart delay is recalculated each time\n\
277 a restart is attempted: if the time since the last restart attempt exceeds\n\
278 twice the -M value, then the restart delay is set to the -m value.\n\
279 Otherwise, the interval is doubled (but capped at the -M value).\n\n\
281 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
282 to syslog instead of stdout.\n\
283 -S, --statedir Set the vty socket directory (default is %s)\n\
284 -e, --no-echo Do not ping the daemons to test responsiveness (this\n\
285 option is necessary if the daemons do not support the\n\
287 -l, --loglevel Set the logging level (default is %d).\n\
288 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
289 but it can be set higher than %d if extra-verbose debugging\n\
290 messages are desired.\n\
291 -m, --min-restart-interval\n\
292 Set the minimum seconds to wait between invocations of daemon\n\
293 restart commands (default is %d).\n\
294 -M, --max-restart-interval\n\
295 Set the maximum seconds to wait between invocations of daemon\n\
296 restart commands (default is %d).\n\
297 -i, --interval Set the status polling interval in seconds (default is %d)\n\
298 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
299 -T, --restart-timeout\n\
300 Set the restart (kill) timeout in seconds (default is %d).\n\
301 If any background jobs are still running after this much\n\
302 time has elapsed, they will be killed.\n\
303 -r, --restart Supply a Bourne shell command to use to restart a single\n\
304 daemon. The command string should include '%%s' where the\n\
305 name of the daemon should be substituted.\n\
306 Note that -r and -R are incompatible.\n\
307 -s, --start-command\n\
308 Supply a Bourne shell to command to use to start a single\n\
309 daemon. The command string should include '%%s' where the\n\
310 name of the daemon should be substituted.\n\
311 -k, --kill-command\n\
312 Supply a Bourne shell to command to use to stop a single\n\
313 daemon. The command string should include '%%s' where the\n\
314 name of the daemon should be substituted.\n\
316 When one or more daemons is down, try to restart everything\n\
317 using the Bourne shell command supplied as the argument.\n\
318 Note that -r and -R are incompatible.\n\
319 -z, --unresponsive-restart\n\
320 When a daemon is unresponsive, treat it as being down for\n\
323 When zebra hangs or crashes, restart all daemons using\n\
324 this phased approach: 1. stop all other daemons; 2. restart\n\
325 zebra; 3. start other daemons. Requires -r, -s, and -k.\n\
326 -A, --always-all-restart\n\
327 When any daemon (not just zebra) hangs or crashes, use the\n\
328 same phased restart mechanism described above for -a.\n\
329 Requires -r, -s, and -k.\n\
330 -p, --pid-file Set process identifier file name\n\
332 -b, --blank-string\n\
333 When the supplied argument string is found in any of the\n\
334 various shell command arguments (-r, -s, -k, or -R), replace\n\
335 it with a space. This is an ugly hack to circumvent problems\n\
336 passing command-line arguments with embedded spaces.\n\
337 -v, --version Print program version\n\
338 -h, --help Display this help and exit\n\
339 ", progname
,mode_str
[0],progname
,mode_str
[1],progname
,mode_str
[2],
340 progname
,mode_str
[3],progname
,mode_str
[4],progname
,mode_str
[2],mode_str
[3],
341 VTYDIR
,DEFAULT_LOGLEVEL
,LOG_EMERG
,LOG_DEBUG
,LOG_DEBUG
,
342 DEFAULT_MIN_RESTART
,DEFAULT_MAX_RESTART
,
343 DEFAULT_PERIOD
,DEFAULT_TIMEOUT
,DEFAULT_RESTART_TIMEOUT
,DEFAULT_PIDFILE
);
349 run_background(const char *shell_cmd
)
353 switch (child
= fork())
356 zlog_err("fork failed, cannot run command [%s]: %s",
357 shell_cmd
,safe_strerror(errno
));
361 /* Use separate process group so child processes can be killed easily. */
362 if (setpgid(0,0) < 0)
363 zlog_warn("warning: setpgid(0,0) failed: %s",safe_strerror(errno
));
365 const char *argv
[4] = { "sh", "-c", shell_cmd
, NULL
};
366 execv("/bin/sh",(char *const *)argv
);
367 zlog_err("execv(/bin/sh -c '%s') failed: %s",
368 shell_cmd
,safe_strerror(errno
));
372 /* Parent process: we will reap the child later. */
373 zlog_err("Forked background command [pid %d]: %s",child
,shell_cmd
);
/* Store in *result the wall-clock time that has passed since *start_time
   and hand the same pointer back, so that a call can be used directly in
   an expression such as time_elapsed(&delay,&then)->tv_sec.
   The microsecond field of the result is normalised to be non-negative by
   borrowing whole seconds. */
static struct timeval *
time_elapsed(struct timeval *result, const struct timeval *start_time)
{
  /* Sample the current time straight into the output structure... */
  gettimeofday(result,NULL);

  /* ...and turn it into a difference, field by field. */
  result->tv_sec -= start_time->tv_sec;
  result->tv_usec -= start_time->tv_usec;

  /* A negative microsecond count means we must borrow from the seconds. */
  for (; result->tv_usec < 0; result->tv_sec--)
    result->tv_usec += 1000000L;

  return result;
}
393 restart_kill(struct thread
*t_kill
)
395 struct restart_info
*restart
= THREAD_ARG(t_kill
);
396 struct timeval delay
;
398 time_elapsed(&delay
,&restart
->time
);
399 zlog_warn("Warning: %s %s child process %d still running after "
400 "%ld seconds, sending signal %d",
401 restart
->what
,restart
->name
,restart
->pid
,delay
.tv_sec
,
402 (restart
->kills
? SIGKILL
: SIGTERM
));
403 kill(-restart
->pid
,(restart
->kills
? SIGKILL
: SIGTERM
));
405 restart
->t_kill
= thread_add_timer(master
,restart_kill
,restart
,
410 static struct restart_info
*
411 find_child(pid_t child
)
413 if (gs
.mode
== MODE_GLOBAL_RESTART
)
415 if (gs
.restart
.pid
== child
)
421 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
423 if (dmn
->restart
.pid
== child
)
424 return &dmn
->restart
;
437 struct restart_info
*restart
;
439 switch (child
= waitpid(-1,&status
,WNOHANG
))
442 zlog_err("waitpid failed: %s",safe_strerror(errno
));
445 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
449 if ((restart
= find_child(child
)) != NULL
)
451 name
= restart
->name
;
452 what
= restart
->what
;
455 thread_cancel(restart
->t_kill
);
456 restart
->t_kill
= NULL
;
457 /* Update restart time to reflect the time the command completed. */
458 gettimeofday(&restart
->time
,NULL
);
462 zlog_err("waitpid returned status for an unknown child process %d",
467 if (WIFSTOPPED(status
))
468 zlog_warn("warning: %s %s process %d is stopped",
470 else if (WIFSIGNALED(status
))
471 zlog_warn("%s %s process %d terminated due to signal %d",
472 what
,name
,child
,WTERMSIG(status
));
473 else if (WIFEXITED(status
))
475 if (WEXITSTATUS(status
) != 0)
476 zlog_warn("%s %s process %d exited with non-zero status %d",
477 what
,name
,child
,WEXITSTATUS(status
));
479 zlog_debug("%s %s process %d exited normally",what
,name
,child
);
482 zlog_err("cannot interpret %s %s process %d wait status 0x%x",
483 what
,name
,child
,status
);
488 run_job(struct restart_info
*restart
, const char *cmdtype
, const char *command
,
489 int force
, int update_interval
)
491 struct timeval delay
;
493 if (gs
.loglevel
> LOG_DEBUG
+1)
494 zlog_debug("attempting to %s %s",cmdtype
,restart
->name
);
498 if (gs
.loglevel
> LOG_DEBUG
+1)
499 zlog_debug("cannot %s %s, previous pid %d still running",
500 cmdtype
,restart
->name
,restart
->pid
);
505 (time_elapsed(&delay
,&restart
->time
)->tv_sec
< restart
->interval
))
507 if (gs
.loglevel
> LOG_DEBUG
+1)
508 zlog_debug("postponing %s %s: "
509 "elapsed time %ld < retry interval %ld",
510 cmdtype
,restart
->name
,(long)delay
.tv_sec
,restart
->interval
);
514 gettimeofday(&restart
->time
,NULL
);
517 char cmd
[strlen(command
)+strlen(restart
->name
)+1];
518 snprintf(cmd
,sizeof(cmd
),command
,restart
->name
);
519 if ((restart
->pid
= run_background(cmd
)) > 0)
521 restart
->t_kill
= thread_add_timer(master
,restart_kill
,restart
,
523 restart
->what
= cmdtype
;
530 /* Calculate the new restart interval. */
533 if (delay
.tv_sec
> 2*gs
.max_restart_interval
)
534 restart
->interval
= gs
.min_restart_interval
;
535 else if ((restart
->interval
*= 2) > gs
.max_restart_interval
)
536 restart
->interval
= gs
.max_restart_interval
;
537 if (gs
.loglevel
> LOG_DEBUG
+1)
538 zlog_debug("restart %s interval is now %ld",
539 restart
->name
,restart
->interval
);
544 #define SET_READ_HANDLER(DMN) \
545 (DMN)->t_read = thread_add_read(master,handle_read,(DMN),(DMN)->fd)
547 #define SET_WAKEUP_DOWN(DMN) \
548 (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_down,(DMN), \
551 #define SET_WAKEUP_UNRESPONSIVE(DMN) \
552 (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_unresponsive,(DMN), \
555 #define SET_WAKEUP_ECHO(DMN) \
556 (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_send_echo,(DMN), \
560 wakeup_down(struct thread
*t_wakeup
)
562 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
564 dmn
->t_wakeup
= NULL
;
565 if (try_connect(dmn
) < 0)
566 SET_WAKEUP_DOWN(dmn
);
567 if ((dmn
->connect_tries
> 1) && (dmn
->state
!= DAEMON_UP
))
573 wakeup_init(struct thread
*t_wakeup
)
575 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
577 dmn
->t_wakeup
= NULL
;
578 if (try_connect(dmn
) < 0)
580 SET_WAKEUP_DOWN(dmn
);
581 zlog_err("%s state -> down : initial connection attempt failed",
583 dmn
->state
= DAEMON_DOWN
;
589 daemon_down(struct daemon
*dmn
, const char *why
)
591 if (IS_UP(dmn
) || (dmn
->state
== DAEMON_INIT
))
592 zlog_err("%s state -> down : %s",dmn
->name
,why
);
593 else if (gs
.loglevel
> LOG_DEBUG
)
594 zlog_debug("%s still down : %s",dmn
->name
,why
);
597 dmn
->state
= DAEMON_DOWN
;
603 THREAD_OFF(dmn
->t_read
);
604 THREAD_OFF(dmn
->t_write
);
605 THREAD_OFF(dmn
->t_wakeup
);
606 if (try_connect(dmn
) < 0)
607 SET_WAKEUP_DOWN(dmn
);
612 handle_read(struct thread
*t_read
)
614 struct daemon
*dmn
= THREAD_ARG(t_read
);
615 static const char resp
[sizeof(PING_TOKEN
)+4] = PING_TOKEN
"\n";
616 char buf
[sizeof(resp
)+100];
618 struct timeval delay
;
621 if ((rc
= read(dmn
->fd
,buf
,sizeof(buf
))) < 0)
625 if ((errno
== EINTR
) || (errno
== EAGAIN
))
627 /* Pretend it never happened. */
628 SET_READ_HANDLER(dmn
);
631 snprintf(why
,sizeof(why
),"unexpected read error: %s",
632 safe_strerror(errno
));
633 daemon_down(dmn
,why
);
638 daemon_down(dmn
,"read returned EOF");
641 if (!dmn
->echo_sent
.tv_sec
)
643 char why
[sizeof(buf
)+100];
644 snprintf(why
,sizeof(why
),"unexpected read returns %d bytes: %.*s",
645 (int)rc
,(int)rc
,buf
);
646 daemon_down(dmn
,why
);
650 /* We are expecting an echo response: is there any chance that the
651 response would not be returned entirely in the first read? That
652 seems inconceivable... */
653 if ((rc
!= sizeof(resp
)) || memcmp(buf
,resp
,sizeof(resp
)))
655 char why
[100+sizeof(buf
)];
656 snprintf(why
,sizeof(why
),"read returned bad echo response of %d bytes "
657 "(expecting %u): %.*s",
658 (int)rc
,(u_int
)sizeof(resp
),(int)rc
,buf
);
659 daemon_down(dmn
,why
);
663 time_elapsed(&delay
,&dmn
->echo_sent
);
664 dmn
->echo_sent
.tv_sec
= 0;
665 if (dmn
->state
== DAEMON_UNRESPONSIVE
)
667 if (delay
.tv_sec
< gs
.timeout
)
669 dmn
->state
= DAEMON_UP
;
670 zlog_warn("%s state -> up : echo response received after %ld.%06ld "
671 "seconds", dmn
->name
,delay
.tv_sec
,delay
.tv_usec
);
674 zlog_warn("%s: slow echo response finally received after %ld.%06ld "
675 "seconds", dmn
->name
,delay
.tv_sec
,delay
.tv_usec
);
677 else if (gs
.loglevel
> LOG_DEBUG
+1)
678 zlog_debug("%s: echo response received after %ld.%06ld seconds",
679 dmn
->name
,delay
.tv_sec
,delay
.tv_usec
);
681 SET_READ_HANDLER(dmn
);
683 thread_cancel(dmn
->t_wakeup
);
684 SET_WAKEUP_ECHO(dmn
);
690 daemon_up(struct daemon
*dmn
, const char *why
)
692 dmn
->state
= DAEMON_UP
;
694 dmn
->connect_tries
= 0;
695 zlog_notice("%s state -> up : %s",dmn
->name
,why
);
697 SET_WAKEUP_ECHO(dmn
);
702 check_connect(struct thread
*t_write
)
704 struct daemon
*dmn
= THREAD_ARG(t_write
);
706 socklen_t reslen
= sizeof(sockerr
);
709 if (getsockopt(dmn
->fd
,SOL_SOCKET
,SO_ERROR
,(char *)&sockerr
,&reslen
) < 0)
711 zlog_warn("%s: check_connect: getsockopt failed: %s",
712 dmn
->name
,safe_strerror(errno
));
713 daemon_down(dmn
,"getsockopt failed checking connection success");
716 if ((reslen
== sizeof(sockerr
)) && sockerr
)
719 snprintf(why
,sizeof(why
),
720 "getsockopt reports that connection attempt failed: %s",
721 safe_strerror(sockerr
));
722 daemon_down(dmn
,why
);
726 daemon_up(dmn
,"delayed connect succeeded");
731 wakeup_connect_hanging(struct thread
*t_wakeup
)
733 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
736 dmn
->t_wakeup
= NULL
;
737 snprintf(why
,sizeof(why
),"connection attempt timed out after %ld seconds",
739 daemon_down(dmn
,why
);
743 /* Making connection to protocol daemon. */
745 try_connect(struct daemon
*dmn
)
748 struct sockaddr_un addr
;
752 if (gs
.loglevel
> LOG_DEBUG
+1)
753 zlog_debug("%s: attempting to connect",dmn
->name
);
754 dmn
->connect_tries
++;
756 memset (&addr
, 0, sizeof (struct sockaddr_un
));
757 addr
.sun_family
= AF_UNIX
;
758 snprintf(addr
.sun_path
, sizeof(addr
.sun_path
), "%s/%s.vty",
759 gs
.vtydir
,dmn
->name
);
761 len
= addr
.sun_len
= SUN_LEN(&addr
);
763 len
= sizeof (addr
.sun_family
) + strlen (addr
.sun_path
);
764 #endif /* HAVE_SUN_LEN */
766 /* Quick check to see if we might succeed before we go to the trouble
767 of creating a socket. */
768 if (access(addr
.sun_path
, W_OK
) < 0)
771 zlog_err("%s: access to socket %s denied: %s",
772 dmn
->name
,addr
.sun_path
,safe_strerror(errno
));
776 if ((sock
= socket (AF_UNIX
, SOCK_STREAM
, 0)) < 0)
778 zlog_err("%s(%s): cannot make socket: %s",
779 __func__
,addr
.sun_path
, safe_strerror(errno
));
783 /* Set non-blocking. */
784 if ((flags
= fcntl(sock
, F_GETFL
, 0)) < 0)
786 zlog_err("%s(%s): fcntl(F_GETFL) failed: %s",
787 __func__
,addr
.sun_path
, safe_strerror(errno
));
791 if (fcntl(sock
, F_SETFL
, (flags
|O_NONBLOCK
)) < 0)
793 zlog_err("%s(%s): fcntl(F_SETFL,O_NONBLOCK) failed: %s",
794 __func__
,addr
.sun_path
, safe_strerror(errno
));
799 if (connect (sock
, (struct sockaddr
*) &addr
, len
) < 0)
801 if ((errno
!= EINPROGRESS
) && (errno
!= EWOULDBLOCK
))
803 if (gs
.loglevel
> LOG_DEBUG
)
804 zlog_debug("%s(%s): connect failed: %s",
805 __func__
,addr
.sun_path
, safe_strerror(errno
));
809 if (gs
.loglevel
> LOG_DEBUG
)
810 zlog_debug("%s: connection in progress",dmn
->name
);
811 dmn
->state
= DAEMON_CONNECTING
;
813 dmn
->t_write
= thread_add_write(master
,check_connect
,dmn
,dmn
->fd
);
814 dmn
->t_wakeup
= thread_add_timer(master
,wakeup_connect_hanging
,dmn
,
816 SET_READ_HANDLER(dmn
);
821 SET_READ_HANDLER(dmn
);
822 daemon_up(dmn
,"connect succeeded");
827 phase_hanging(struct thread
*t_hanging
)
829 gs
.t_phase_hanging
= NULL
;
830 zlog_err("Phase [%s] hanging for %ld seconds, aborting phased restart",
831 phase_str
[gs
.phase
],PHASE_TIMEOUT
);
832 gs
.phase
= PHASE_NONE
;
837 set_phase(restart_phase_t new_phase
)
839 gs
.phase
= new_phase
;
840 if (gs
.t_phase_hanging
)
841 thread_cancel(gs
.t_phase_hanging
);
842 gs
.t_phase_hanging
= thread_add_timer(master
,phase_hanging
,NULL
,
853 case PHASE_STOPS_PENDING
:
856 zlog_info("Phased restart: all routing daemon stop jobs have completed.");
857 set_phase(PHASE_WAITING_DOWN
);
859 case PHASE_WAITING_DOWN
:
860 if (gs
.numdown
+IS_UP(gs
.special
) < gs
.numdaemons
)
862 zlog_info("Phased restart: all routing daemons now down.");
863 run_job(&gs
.special
->restart
,"restart",gs
.restart_command
,1,1);
864 set_phase(PHASE_ZEBRA_RESTART_PENDING
);
866 case PHASE_ZEBRA_RESTART_PENDING
:
867 if (gs
.special
->restart
.pid
)
869 zlog_info("Phased restart: %s restart job completed.",gs
.special
->name
);
870 set_phase(PHASE_WAITING_ZEBRA_UP
);
872 case PHASE_WAITING_ZEBRA_UP
:
873 if (!IS_UP(gs
.special
))
875 zlog_info("Phased restart: %s is now up.",gs
.special
->name
);
878 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
880 if (dmn
!= gs
.special
)
881 run_job(&dmn
->restart
,"start",gs
.start_command
,1,1);
884 gs
.phase
= PHASE_NONE
;
885 THREAD_OFF(gs
.t_phase_hanging
);
886 zlog_notice("Phased global restart has completed.");
892 try_restart(struct daemon
*dmn
)
898 case MODE_GLOBAL_RESTART
:
899 run_job(&gs
.restart
,"restart",gs
.restart_command
,0,1);
901 case MODE_SEPARATE_RESTART
:
902 run_job(&dmn
->restart
,"restart",gs
.restart_command
,0,1);
904 case MODE_PHASED_ZEBRA_RESTART
:
905 if (dmn
!= gs
.special
)
907 if ((gs
.special
->state
== DAEMON_UP
) && (gs
.phase
== PHASE_NONE
))
908 run_job(&dmn
->restart
,"restart",gs
.restart_command
,0,1);
910 zlog_debug("%s: postponing restart attempt because master %s daemon "
911 "not up [%s], or phased restart in progress",
912 dmn
->name
,gs
.special
->name
,state_str
[gs
.special
->state
]);
916 case MODE_PHASED_ALL_RESTART
:
917 if ((gs
.phase
!= PHASE_NONE
) || gs
.numpids
)
919 if (gs
.loglevel
> LOG_DEBUG
+1)
920 zlog_debug("postponing phased global restart: restart already in "
921 "progress [%s], or outstanding child processes [%d]",
922 phase_str
[gs
.phase
],gs
.numpids
);
925 /* Is it too soon for a restart? */
927 struct timeval delay
;
928 if (time_elapsed(&delay
,&gs
.special
->restart
.time
)->tv_sec
<
929 gs
.special
->restart
.interval
)
931 if (gs
.loglevel
> LOG_DEBUG
+1)
932 zlog_debug("postponing phased global restart: "
933 "elapsed time %ld < retry interval %ld",
934 (long)delay
.tv_sec
,gs
.special
->restart
.interval
);
938 zlog_info("Phased restart: stopping all routing daemons.");
939 /* First step: stop all other daemons. */
940 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
942 if (dmn
!= gs
.special
)
943 run_job(&dmn
->restart
,"stop",gs
.stop_command
,1,0);
945 set_phase(PHASE_STOPS_PENDING
);
948 zlog_err("error: unknown restart mode %d",gs
.mode
);
954 wakeup_unresponsive(struct thread
*t_wakeup
)
956 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
958 dmn
->t_wakeup
= NULL
;
959 if (dmn
->state
!= DAEMON_UNRESPONSIVE
)
960 zlog_err("%s: no longer unresponsive (now %s), "
961 "wakeup should have been cancelled!",
962 dmn
->name
,state_str
[dmn
->state
]);
965 SET_WAKEUP_UNRESPONSIVE(dmn
);
972 wakeup_no_answer(struct thread
*t_wakeup
)
974 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
976 dmn
->t_wakeup
= NULL
;
977 dmn
->state
= DAEMON_UNRESPONSIVE
;
978 zlog_err("%s state -> unresponsive : no response yet to ping "
979 "sent %ld seconds ago",dmn
->name
,gs
.timeout
);
980 if (gs
.unresponsive_restart
)
982 SET_WAKEUP_UNRESPONSIVE(dmn
);
989 wakeup_send_echo(struct thread
*t_wakeup
)
991 static const char echocmd
[] = "echo " PING_TOKEN
;
993 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
995 dmn
->t_wakeup
= NULL
;
996 if (((rc
= write(dmn
->fd
,echocmd
,sizeof(echocmd
))) < 0) ||
997 ((size_t)rc
!= sizeof(echocmd
)))
999 char why
[100+sizeof(echocmd
)];
1000 snprintf(why
,sizeof(why
),"write '%s' returned %d instead of %u",
1001 echocmd
,(int)rc
,(u_int
)sizeof(echocmd
));
1002 daemon_down(dmn
,why
);
1006 gettimeofday(&dmn
->echo_sent
,NULL
);
1007 dmn
->t_wakeup
= thread_add_timer(master
,wakeup_no_answer
,dmn
,gs
.timeout
);
1015 zlog_notice("Terminating on signal");
/* Check that a per-daemon shell command template is usable: it must contain
   exactly one '%' character, and that character must introduce "%s" (the
   place where the daemon name is substituted).
   Returns 1 if cmd is acceptable, 0 otherwise. */
static int
valid_command(const char *cmd)
{
  const char *pct = strchr(cmd,'%');

  /* No conversion at all: nowhere to put the daemon name. */
  if (pct == NULL)
    return 0;

  /* The one conversion we allow is %s. */
  if (pct[1] != 's')
    return 0;

  /* Any further '%' would be a second (unsupported) conversion. */
  return strchr(pct+1,'%') == NULL;
}
/* This is an ugly hack to circumvent problems with passing command-line
   arguments that contain spaces.  The fix is to use a configuration file.

   Returns a newly allocated copy of cmd in which every occurrence of
   blankstr has been collapsed to a single space character.  The scan
   restarts from the beginning of the copy after each replacement.  The
   copy is not freed here; it belongs to the caller.

   An empty blankstr, or a blankstr that is itself a single space, leaves
   the text unchanged.  Without that guard the replacement loop would never
   finish (a match is still present after each replacement), and an empty
   pattern would additionally write past the end of the copy.

   NOTE(review): the allocation-failure branch is not visible in this
   extract of the file; reporting the error and exiting with status 1 is
   assumed here -- confirm against the complete source. */
static char *
translate_blanks(const char *cmd, const char *blankstr)
{
  char *res;
  char *p;
  size_t bslen = strlen(blankstr);
  size_t cmdsize = strlen(cmd)+1;

  if (!(res = malloc(cmdsize)))
    {
      perror("malloc");
      exit(1);
    }
  memcpy(res,cmd,cmdsize);

  /* Nothing useful to translate, and looping on these would never end. */
  if ((bslen == 0) || !strcmp(blankstr," "))
    return res;

  while ((p = strstr(res,blankstr)) != NULL)
    {
      *p = ' ';
      /* Close the gap left by the rest of the token (the +1 moves the
         terminating NUL as well). */
      if (bslen != 1)
        memmove(p+1,p+bslen,strlen(p+bslen)+1);
    }
  return res;
}
1051 main(int argc
, char **argv
)
1053 const char *progname
;
1055 int daemon_mode
= 0;
1056 const char *pidfile
= DEFAULT_PIDFILE
;
1057 const char *special
= "zebra";
1058 const char *blankstr
= NULL
;
1059 static struct quagga_signal_t my_signals
[] =
1071 .handler
= sigchild
,
1075 if ((progname
= strrchr (argv
[0], '/')) != NULL
)
1080 gs
.restart
.name
= "all";
1081 while ((opt
= getopt_long(argc
, argv
, "aAb:dek:l:m:M:i:p:r:R:S:s:t:T:zvh",
1082 longopts
, 0)) != EOF
)
1089 if ((gs
.mode
!= MODE_MONITOR
) && (gs
.mode
!= MODE_SEPARATE_RESTART
))
1091 fputs("Ambiguous operating mode selected.\n",stderr
);
1092 return usage(progname
,1);
1094 gs
.mode
= MODE_PHASED_ZEBRA_RESTART
;
1097 if ((gs
.mode
!= MODE_MONITOR
) && (gs
.mode
!= MODE_SEPARATE_RESTART
))
1099 fputs("Ambiguous operating mode selected.\n",stderr
);
1100 return usage(progname
,1);
1102 gs
.mode
= MODE_PHASED_ALL_RESTART
;
1114 if (!valid_command(optarg
))
1116 fprintf(stderr
,"Invalid kill command, must contain '%%s': %s\n",
1118 return usage(progname
,1);
1120 gs
.stop_command
= optarg
;
1125 if ((sscanf(optarg
,"%d%1s",&gs
.loglevel
,garbage
) != 1) ||
1126 (gs
.loglevel
< LOG_EMERG
))
1128 fprintf(stderr
,"Invalid loglevel argument: %s\n",optarg
);
1129 return usage(progname
,1);
1136 if ((sscanf(optarg
,"%ld%1s",
1137 &gs
.min_restart_interval
,garbage
) != 1) ||
1138 (gs
.min_restart_interval
< 0))
1140 fprintf(stderr
,"Invalid min_restart_interval argument: %s\n",
1142 return usage(progname
,1);
1149 if ((sscanf(optarg
,"%ld%1s",
1150 &gs
.max_restart_interval
,garbage
) != 1) ||
1151 (gs
.max_restart_interval
< 0))
1153 fprintf(stderr
,"Invalid max_restart_interval argument: %s\n",
1155 return usage(progname
,1);
1163 if ((sscanf(optarg
,"%d%1s",&period
,garbage
) != 1) ||
1166 fprintf(stderr
,"Invalid interval argument: %s\n",optarg
);
1167 return usage(progname
,1);
1169 gs
.period
= 1000*period
;
1176 if ((gs
.mode
== MODE_GLOBAL_RESTART
) ||
1177 (gs
.mode
== MODE_SEPARATE_RESTART
))
1179 fputs("Ambiguous operating mode selected.\n",stderr
);
1180 return usage(progname
,1);
1182 if (!valid_command(optarg
))
1185 "Invalid restart command, must contain '%%s': %s\n",
1187 return usage(progname
,1);
1189 gs
.restart_command
= optarg
;
1190 if (gs
.mode
== MODE_MONITOR
)
1191 gs
.mode
= MODE_SEPARATE_RESTART
;
1194 if (gs
.mode
!= MODE_MONITOR
)
1196 fputs("Ambiguous operating mode selected.\n",stderr
);
1197 return usage(progname
,1);
1199 if (strchr(optarg
,'%'))
1202 "Invalid restart-all arg, must not contain '%%s': %s\n",
1204 return usage(progname
,1);
1206 gs
.restart_command
= optarg
;
1207 gs
.mode
= MODE_GLOBAL_RESTART
;
1210 if (!valid_command(optarg
))
1212 fprintf(stderr
,"Invalid start command, must contain '%%s': %s\n",
1214 return usage(progname
,1);
1216 gs
.start_command
= optarg
;
1224 if ((sscanf(optarg
,"%ld%1s",&gs
.timeout
,garbage
) != 1) ||
1227 fprintf(stderr
,"Invalid timeout argument: %s\n",optarg
);
1228 return usage(progname
,1);
1235 if ((sscanf(optarg
,"%ld%1s",&gs
.restart_timeout
,garbage
) != 1) ||
1236 (gs
.restart_timeout
< 1))
1238 fprintf(stderr
,"Invalid restart timeout argument: %s\n",optarg
);
1239 return usage(progname
,1);
1244 gs
.unresponsive_restart
= 1;
1247 printf ("%s version %s\n", progname
, QUAGGA_VERSION
);
1248 puts("Copyright 2004 Andrew J. Schorr");
1251 return usage(progname
,0);
1253 fputs("Invalid option.\n",stderr
);
1254 return usage(progname
,1);
1258 if (gs
.unresponsive_restart
&& (gs
.mode
== MODE_MONITOR
))
1260 fputs("Option -z requires a -r or -R restart option.\n",stderr
);
1261 return usage(progname
,1);
1266 if (gs
.restart_command
|| gs
.start_command
|| gs
.stop_command
)
1268 fprintf(stderr
,"No kill/(re)start commands needed for %s mode.\n",
1270 return usage(progname
,1);
1273 case MODE_GLOBAL_RESTART
:
1274 case MODE_SEPARATE_RESTART
:
1275 if (!gs
.restart_command
|| gs
.start_command
|| gs
.stop_command
)
1277 fprintf(stderr
,"No start/kill commands needed in [%s] mode.\n",
1279 return usage(progname
,1);
1282 case MODE_PHASED_ZEBRA_RESTART
:
1283 case MODE_PHASED_ALL_RESTART
:
1284 if (!gs
.restart_command
|| !gs
.start_command
|| !gs
.stop_command
)
1287 "Need start, kill, and restart commands in [%s] mode.\n",
1289 return usage(progname
,1);
1296 if (gs
.restart_command
)
1297 gs
.restart_command
= translate_blanks(gs
.restart_command
,blankstr
);
1298 if (gs
.start_command
)
1299 gs
.start_command
= translate_blanks(gs
.start_command
,blankstr
);
1300 if (gs
.stop_command
)
1301 gs
.stop_command
= translate_blanks(gs
.stop_command
,blankstr
);
1304 gs
.restart
.interval
= gs
.min_restart_interval
;
1305 master
= thread_master_create();
1306 signal_init (master
, Q_SIGC(my_signals
), my_signals
);
1307 srandom(time(NULL
));
1311 struct daemon
*tail
= NULL
;
1313 for (i
= optind
; i
< argc
; i
++)
1317 if (!(dmn
= (struct daemon
*)calloc(1,sizeof(*dmn
))))
1319 fprintf(stderr
,"calloc(1,%u) failed: %s\n",
1320 (u_int
)sizeof(*dmn
), safe_strerror(errno
));
1323 dmn
->name
= dmn
->restart
.name
= argv
[i
];
1324 dmn
->state
= DAEMON_INIT
;
1328 dmn
->t_wakeup
= thread_add_timer_msec(master
,wakeup_init
,dmn
,
1329 100+(random() % 900));
1330 dmn
->restart
.interval
= gs
.min_restart_interval
;
1337 if (((gs
.mode
== MODE_PHASED_ZEBRA_RESTART
) ||
1338 (gs
.mode
== MODE_PHASED_ALL_RESTART
)) &&
1339 !strcmp(dmn
->name
,special
))
1345 fputs("Must specify one or more daemons to monitor.\n",stderr
);
1346 return usage(progname
,1);
1348 if (((gs
.mode
== MODE_PHASED_ZEBRA_RESTART
) ||
1349 (gs
.mode
== MODE_PHASED_ALL_RESTART
)) && !gs
.special
)
1351 fprintf(stderr
,"In mode [%s], but cannot find master daemon %s\n",
1352 mode_str
[gs
.mode
],special
);
1353 return usage(progname
,1);
1355 if (gs
.special
&& (gs
.numdaemons
< 2))
1357 fprintf(stderr
,"Mode [%s] does not make sense with only 1 daemon "
1358 "to watch.\n",mode_str
[gs
.mode
]);
1359 return usage(progname
,1);
1362 zlog_default
= openzlog(progname
, ZLOG_NONE
,
1363 LOG_CONS
|LOG_NDELAY
|LOG_PID
, LOG_DAEMON
);
1364 zlog_set_level(NULL
, ZLOG_DEST_MONITOR
, ZLOG_DISABLED
);
1367 zlog_set_level(NULL
, ZLOG_DEST_SYSLOG
, MIN(gs
.loglevel
,LOG_DEBUG
));
1371 zlog_set_level(NULL
, ZLOG_DEST_STDOUT
, MIN(gs
.loglevel
,LOG_DEBUG
));
1373 /* Make sure we're not already running. */
1374 pid_output (pidfile
);
1376 /* Announce which daemons are being monitored. */
1381 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
1382 len
+= strlen(dmn
->name
)+1;
1388 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
1392 strcpy(p
,dmn
->name
);
1395 zlog_notice("%s %s watching [%s], mode [%s]",
1396 progname
, QUAGGA_VERSION
, buf
, mode_str
[gs
.mode
]);
1401 struct thread thread
;
1403 while (thread_fetch (master
, &thread
))
1404 thread_call (&thread
);