2 Monitor status of quagga daemons and restart if necessary.
4 Copyright (C) 2004 Andrew J. Schorr
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
26 #include <lib/version.h>
32 #define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
35 /* Macros to help randomize timers. */
/* JITTER(X): integer offset drawn with random().  random() % (X+1) lies in
   [0, X], so the result lies in [-(X/2), X-(X/2)] (integer division).
   X must be >= 0 (X == -1 would take a modulus by zero) and is expanded
   more than once, so only side-effect-free expressions should be passed. */
36 #define JITTER(X) ((random() % ((X)+1))-((X)/2))
/* FUZZY(X): X perturbed by JITTER(X/20), i.e. by at most about 2.5% of X in
   either direction. */
37 #define FUZZY(X) ((X)+JITTER((X)/20))
39 #define DEFAULT_PERIOD 5
40 #define DEFAULT_TIMEOUT 10
41 #define DEFAULT_RESTART_TIMEOUT 20
42 #define DEFAULT_LOGLEVEL LOG_INFO
43 #define DEFAULT_MIN_RESTART 60
44 #define DEFAULT_MAX_RESTART 600
45 #ifdef PATH_WATCHQUAGGA_PID
46 #define DEFAULT_PIDFILE PATH_WATCHQUAGGA_PID
48 #define DEFAULT_PIDFILE STATEDIR "/watchquagga.pid"
51 #define VTYDIR DAEMON_VTY_DIR
53 #define VTYDIR STATEDIR
56 #define PING_TOKEN "PING"
58 /* Needs to be global, referenced somewhere inside libzebra. */
59 struct thread_master
*master
;
65 MODE_SEPARATE_RESTART
,
66 MODE_PHASED_ZEBRA_RESTART
,
67 MODE_PHASED_ALL_RESTART
70 static const char *mode_str
[] =
74 "individual daemon restart",
75 "phased zebra restart",
76 "phased global restart for any failure",
84 PHASE_ZEBRA_RESTART_PENDING
,
85 PHASE_WAITING_ZEBRA_UP
88 static const char *phase_str
[] =
92 "Waiting for other daemons to come down",
93 "Zebra restart job running",
94 "Waiting for zebra to come up",
98 #define PHASE_TIMEOUT (3*gs.restart_timeout)
107 struct thread
*t_kill
;
111 static struct global_state
114 restart_phase_t phase
;
115 struct thread
*t_phase_hanging
;
119 long restart_timeout
;
120 long min_restart_interval
;
121 long max_restart_interval
;
123 struct daemon
*daemons
;
124 const char *restart_command
;
125 const char *start_command
;
126 const char *stop_command
;
127 struct restart_info restart
;
128 int unresponsive_restart
;
130 struct daemon
*special
; /* points to zebra when doing phased restart */
133 int numdown
; /* # of daemons that are not UP or UNRESPONSIVE */
135 .mode
= MODE_MONITOR
,
138 .period
= 1000*DEFAULT_PERIOD
,
139 .timeout
= DEFAULT_TIMEOUT
,
140 .restart_timeout
= DEFAULT_RESTART_TIMEOUT
,
141 .loglevel
= DEFAULT_LOGLEVEL
,
142 .min_restart_interval
= DEFAULT_MIN_RESTART
,
143 .max_restart_interval
= DEFAULT_MAX_RESTART
,
157 (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
159 static const char *state_str
[] =
170 daemon_state_t state
;
172 struct timeval echo_sent
;
174 struct thread
*t_wakeup
;
175 struct thread
*t_read
;
176 struct thread
*t_write
;
178 struct restart_info restart
;
181 static const struct option longopts
[] =
183 { "daemon", no_argument
, NULL
, 'd'},
184 { "statedir", required_argument
, NULL
, 'S'},
185 { "no-echo", no_argument
, NULL
, 'e'},
186 { "loglevel", required_argument
, NULL
, 'l'},
187 { "interval", required_argument
, NULL
, 'i'},
188 { "timeout", required_argument
, NULL
, 't'},
189 { "restart-timeout", required_argument
, NULL
, 'T'},
190 { "restart", required_argument
, NULL
, 'r'},
191 { "start-command", required_argument
, NULL
, 's'},
192 { "kill-command", required_argument
, NULL
, 'k'},
193 { "restart-all", required_argument
, NULL
, 'R'},
194 { "all-restart", no_argument
, NULL
, 'a'},
195 { "always-all-restart", no_argument
, NULL
, 'A'},
196 { "unresponsive-restart", no_argument
, NULL
, 'z'},
197 { "min-restart-interval", required_argument
, NULL
, 'm'},
198 { "max-restart-interval", required_argument
, NULL
, 'M'},
199 { "pid-file", required_argument
, NULL
, 'p'},
200 { "blank-string", required_argument
, NULL
, 'b'},
201 { "help", no_argument
, NULL
, 'h'},
202 { "version", no_argument
, NULL
, 'v'},
206 static int try_connect(struct daemon
*dmn
);
207 static int wakeup_send_echo(struct thread
*t_wakeup
);
208 static void try_restart(struct daemon
*dmn
);
209 static void phase_check(void);
212 usage(const char *progname
, int status
)
215 fprintf(stderr
, "Try `%s --help' for more information.\n", progname
);
217 printf("Usage : %s [OPTION...] <daemon name> ...\n\n\
218 Watchdog program to monitor status of quagga daemons and try to restart\n\
219 them if they are down or unresponsive. It determines whether a daemon is\n\
220 up based on whether it can connect to the daemon's vty unix stream socket.\n\
221 It then repeatedly sends echo commands over that socket to determine whether\n\
222 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
223 on the socket connection and know immediately that the daemon is down.\n\n\
224 The daemons to be monitored should be listed on the command line.\n\n\
225 This program can run in one of 5 modes:\n\n\
227 Just monitor and report on status changes. Example:\n\
228 %s -d zebra ospfd bgpd\n\n\
230 Whenever any daemon hangs or crashes, use the given command to restart\n\
231 them all. Example:\n\
233 -R '/sbin/service zebra restart; /sbin/service ospfd restart' \\\n\
236 When any single daemon hangs or crashes, restart only the daemon that's\n\
237 in trouble using the supplied restart command. Example:\n\
238 %s -dz -r '/sbin/service %%s restart' zebra ospfd bgpd\n\n\
240 The same as the previous mode, except that there is special treatment when\n\
241 the zebra daemon is in trouble. In that case, a phased restart approach\n\
242 is used: 1. stop all other daemons; 2. restart zebra; 3. start the other\n\
244 %s -adz -r '/sbin/service %%s restart' \\\n\
245 -s '/sbin/service %%s start' \\\n\
246 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
248 This is the same as the previous mode, except that the phased restart\n\
249 procedure is used whenever any of the daemons hangs or crashes. Example:\n\
250 %s -Adz -r '/sbin/service %%s restart' \\\n\
251 -s '/sbin/service %%s start' \\\n\
252 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
253 As of this writing, it is believed that mode 2 [%s]\n\
254 is not safe, and mode 3 [%s] may not be safe with some of the\n\
255 routing daemons.\n\n\
256 In order to avoid attempting to restart the daemons in a fast loop,\n\
257 the -m and -M options allow you to control the minimum delay between\n\
258 restart commands. The minimum restart delay is recalculated each time\n\
259 a restart is attempted: if the time since the last restart attempt exceeds\n\
260 twice the -M value, then the restart delay is set to the -m value.\n\
261 Otherwise, the interval is doubled (but capped at the -M value).\n\n\
263 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
264 to syslog instead of stdout.\n\
265 -S, --statedir Set the vty socket directory (default is %s)\n\
266 -e, --no-echo Do not ping the daemons to test responsiveness (this\n\
267 option is necessary if the daemons do not support the\n\
269 -l, --loglevel Set the logging level (default is %d).\n\
270 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
271 but it can be set higher than %d if extra-verbose debugging\n\
272 messages are desired.\n\
273 -m, --min-restart-interval\n\
274 Set the minimum seconds to wait between invocations of daemon\n\
275 restart commands (default is %d).\n\
276 -M, --max-restart-interval\n\
277 Set the maximum seconds to wait between invocations of daemon\n\
278 restart commands (default is %d).\n\
279 -i, --interval Set the status polling interval in seconds (default is %d)\n\
280 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
281 -T, --restart-timeout\n\
282 Set the restart (kill) timeout in seconds (default is %d).\n\
283 If any background jobs are still running after this much\n\
284 time has elapsed, they will be killed.\n\
285 -r, --restart Supply a Bourne shell command to use to restart a single\n\
286 daemon. The command string should include '%%s' where the\n\
287 name of the daemon should be substituted.\n\
288 Note that -r and -R are incompatible.\n\
289 -s, --start-command\n\
290 Supply a Bourne shell to command to use to start a single\n\
291 daemon. The command string should include '%%s' where the\n\
292 name of the daemon should be substituted.\n\
293 -k, --kill-command\n\
294 Supply a Bourne shell to command to use to stop a single\n\
295 daemon. The command string should include '%%s' where the\n\
296 name of the daemon should be substituted.\n\
298 When one or more daemons is down, try to restart everything\n\
299 using the Bourne shell command supplied as the argument.\n\
300 Note that -r and -R are incompatible.\n\
301 -z, --unresponsive-restart\n\
302 When a daemon is unresponsive, treat it as being down for\n\
305 When zebra hangs or crashes, restart all daemons using\n\
306 this phased approach: 1. stop all other daemons; 2. restart\n\
307 zebra; 3. start other daemons. Requires -r, -s, and -k.\n\
308 -A, --always-all-restart\n\
309 When any daemon (not just zebra) hangs or crashes, use the\n\
310 same phased restart mechanism described above for -a.\n\
311 Requires -r, -s, and -k.\n\
312 -p, --pid-file Set process identifier file name\n\
314 -b, --blank-string\n\
315 When the supplied argument string is found in any of the\n\
316 various shell command arguments (-r, -s, -k, or -R), replace\n\
317 it with a space. This is an ugly hack to circumvent problems\n\
318 passing command-line arguments with embedded spaces.\n\
319 -v, --version Print program version\n\
320 -h, --help Display this help and exit\n\
321 ", progname
,mode_str
[0],progname
,mode_str
[1],progname
,mode_str
[2],
322 progname
,mode_str
[3],progname
,mode_str
[4],progname
,mode_str
[2],mode_str
[3],
323 VTYDIR
,DEFAULT_LOGLEVEL
,LOG_EMERG
,LOG_DEBUG
,LOG_DEBUG
,
324 DEFAULT_MIN_RESTART
,DEFAULT_MAX_RESTART
,
325 DEFAULT_PERIOD
,DEFAULT_TIMEOUT
,DEFAULT_RESTART_TIMEOUT
,DEFAULT_PIDFILE
);
/* Launch shell_cmd as a background job.
   After fork(), the command is handed to "/bin/sh -c" through execv().
   setpgid(0,0) is called first so that the job gets its own process group
   and can later be signalled as a group (restart_kill() signals -pid).
   A failed fork(), a failed setpgid() and a failed execv() are each logged.
   run_job() stores the result as the job's pid and treats a value > 0 as
   success.
   NOTE(review): the "Forked background command" message reports success but
   is emitted through zlog_err(); confirm that this severity is intended. */
331 run_background(const char *shell_cmd
)
335 switch (child
= fork())
338 zlog_err("fork failed, cannot run command [%s]: %s",
339 shell_cmd
,safe_strerror(errno
));
343 /* Use separate process group so child processes can be killed easily. */
344 if (setpgid(0,0) < 0)
345 zlog_warn("warning: setpgid(0,0) failed: %s",safe_strerror(errno
));
347 const char *argv
[4] = { "sh", "-c", shell_cmd
, NULL
};
348 execv("/bin/sh",(char *const *)argv
);
349 zlog_err("execv(/bin/sh -c '%s') failed: %s",
350 shell_cmd
,safe_strerror(errno
));
354 /* Parent process: we will reap the child later. */
355 zlog_err("Forked background command [pid %d]: %s",(int)child
,shell_cmd
);
/* Store in *result the time elapsed since *start_time.
   The current time is read with gettimeofday() into *result, then tv_sec
   and tv_usec of *start_time are subtracted field by field; tv_usec is
   brought back into range by adding 1000000 for as long as it is negative.
   Callers dereference the returned pointer directly, e.g.
   time_elapsed(&delay,...)->tv_sec.
   NOTE(review): presumably result itself is returned and tv_sec is
   decremented once for each microsecond borrow; confirm. */
360 static struct timeval
*
361 time_elapsed(struct timeval
*result
, const struct timeval
*start_time
)
363 gettimeofday(result
,NULL
);
364 result
->tv_sec
-= start_time
->tv_sec
;
365 result
->tv_usec
-= start_time
->tv_usec
;
366 while (result
->tv_usec
< 0)
368 result
->tv_usec
+= 1000000L;
/* Timer callback for a background job that is still running; the thread
   argument is the job's struct restart_info.
   Logs how long the job has been running (measured from restart->time) and
   signals the job's whole process group (negative pid): SIGTERM while
   restart->kills is zero, SIGKILL once it is non-zero.  The callback then
   re-arms itself through thread_add_timer() with the same argument.
   NOTE(review): delay.tv_sec is passed to a %ld conversion without a cast
   to long; confirm time_t is long on every supported platform. */
375 restart_kill(struct thread
*t_kill
)
377 struct restart_info
*restart
= THREAD_ARG(t_kill
);
378 struct timeval delay
;
380 time_elapsed(&delay
,&restart
->time
);
381 zlog_warn("Warning: %s %s child process %d still running after "
382 "%ld seconds, sending signal %d",
383 restart
->what
,restart
->name
,(int)restart
->pid
,delay
.tv_sec
,
384 (restart
->kills
? SIGKILL
: SIGTERM
));
385 kill(-restart
->pid
,(restart
->kills
? SIGKILL
: SIGTERM
));
387 restart
->t_kill
= thread_add_timer(master
,restart_kill
,restart
,
/* Find the restart_info record that owns the background process `child`.
   In MODE_GLOBAL_RESTART the pid is compared against the single global
   record gs.restart.  The daemon list gs.daemons is walked comparing each
   dmn->restart.pid, and the address of the matching per-daemon record is
   returned.  The caller compares the result against NULL to detect an
   unknown child. */
392 static struct restart_info
*
393 find_child(pid_t child
)
395 if (gs
.mode
== MODE_GLOBAL_RESTART
)
397 if (gs
.restart
.pid
== child
)
403 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
405 if (dmn
->restart
.pid
== child
)
406 return &dmn
->restart
;
419 struct restart_info
*restart
;
421 switch (child
= waitpid(-1,&status
,WNOHANG
))
424 zlog_err("waitpid failed: %s",safe_strerror(errno
));
427 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
431 if ((restart
= find_child(child
)) != NULL
)
433 name
= restart
->name
;
434 what
= restart
->what
;
437 thread_cancel(restart
->t_kill
);
438 restart
->t_kill
= NULL
;
439 /* Update restart time to reflect the time the command completed. */
440 gettimeofday(&restart
->time
,NULL
);
444 zlog_err("waitpid returned status for an unknown child process %d",
449 if (WIFSTOPPED(status
))
450 zlog_warn("warning: %s %s process %d is stopped",
451 what
,name
,(int)child
);
452 else if (WIFSIGNALED(status
))
453 zlog_warn("%s %s process %d terminated due to signal %d",
454 what
,name
,(int)child
,WTERMSIG(status
));
455 else if (WIFEXITED(status
))
457 if (WEXITSTATUS(status
) != 0)
458 zlog_warn("%s %s process %d exited with non-zero status %d",
459 what
,name
,(int)child
,WEXITSTATUS(status
));
461 zlog_debug("%s %s process %d exited normally",what
,name
,(int)child
);
464 zlog_err("cannot interpret %s %s process %d wait status 0x%x",
465 what
,name
,(int)child
,status
);
470 run_job(struct restart_info
*restart
, const char *cmdtype
, const char *command
,
471 int force
, int update_interval
)
473 struct timeval delay
;
475 if (gs
.loglevel
> LOG_DEBUG
+1)
476 zlog_debug("attempting to %s %s",cmdtype
,restart
->name
);
480 if (gs
.loglevel
> LOG_DEBUG
+1)
481 zlog_debug("cannot %s %s, previous pid %d still running",
482 cmdtype
,restart
->name
,(int)restart
->pid
);
486 /* Note: time_elapsed test must come before the force test, since we need
487 to make sure that delay is initialized for use below in updating the
489 if ((time_elapsed(&delay
,&restart
->time
)->tv_sec
< restart
->interval
) &&
492 if (gs
.loglevel
> LOG_DEBUG
+1)
493 zlog_debug("postponing %s %s: "
494 "elapsed time %ld < retry interval %ld",
495 cmdtype
,restart
->name
,(long)delay
.tv_sec
,restart
->interval
);
499 gettimeofday(&restart
->time
,NULL
);
502 char cmd
[strlen(command
)+strlen(restart
->name
)+1];
503 snprintf(cmd
,sizeof(cmd
),command
,restart
->name
);
504 if ((restart
->pid
= run_background(cmd
)) > 0)
506 restart
->t_kill
= thread_add_timer(master
,restart_kill
,restart
,
508 restart
->what
= cmdtype
;
515 /* Calculate the new restart interval. */
518 if (delay
.tv_sec
> 2*gs
.max_restart_interval
)
519 restart
->interval
= gs
.min_restart_interval
;
520 else if ((restart
->interval
*= 2) > gs
.max_restart_interval
)
521 restart
->interval
= gs
.max_restart_interval
;
522 if (gs
.loglevel
> LOG_DEBUG
+1)
523 zlog_debug("restart %s interval is now %ld",
524 restart
->name
,restart
->interval
);
529 #define SET_READ_HANDLER(DMN) \
530 (DMN)->t_read = thread_add_read(master,handle_read,(DMN),(DMN)->fd)
532 #define SET_WAKEUP_DOWN(DMN) \
533 (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_down,(DMN), \
536 #define SET_WAKEUP_UNRESPONSIVE(DMN) \
537 (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_unresponsive,(DMN), \
540 #define SET_WAKEUP_ECHO(DMN) \
541 (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_send_echo,(DMN), \
/* Timer callback for a daemon (the thread argument) whose connection is
   down.  Clears the now-expired t_wakeup handle and retries the connection
   with try_connect(); when that fails, the SET_WAKEUP_DOWN retry timer is
   scheduled again.  The closing test (more than one connection attempt so
   far and state still not DAEMON_UP) gates a follow-up action.
   NOTE(review): presumably that action is try_restart(dmn); confirm. */
545 wakeup_down(struct thread
*t_wakeup
)
547 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
549 dmn
->t_wakeup
= NULL
;
550 if (try_connect(dmn
) < 0)
551 SET_WAKEUP_DOWN(dmn
);
552 if ((dmn
->connect_tries
> 1) && (dmn
->state
!= DAEMON_UP
))
/* First timer callback for each monitored daemon; main() schedules it with
   a random delay of 100-999 ms.  Clears t_wakeup and makes the initial
   connection attempt with try_connect().  On failure the SET_WAKEUP_DOWN
   retry timer is scheduled, the failure is logged and the daemon's state is
   set to DAEMON_DOWN. */
558 wakeup_init(struct thread
*t_wakeup
)
560 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
562 dmn
->t_wakeup
= NULL
;
563 if (try_connect(dmn
) < 0)
565 SET_WAKEUP_DOWN(dmn
);
566 zlog_err("%s state -> down : initial connection attempt failed",
568 dmn
->state
= DAEMON_DOWN
;
/* Record that daemon dmn is down; `why` is the reason included in the log.
   A "state -> down" error is logged when the daemon was up (IS_UP) or still
   in DAEMON_INIT; otherwise a "still down" debug message is logged, and only
   when gs.loglevel exceeds LOG_DEBUG.  The state becomes DAEMON_DOWN, the
   read, write and wakeup threads are turned off (THREAD_OFF), and a
   reconnect is attempted at once; if try_connect() fails, the
   SET_WAKEUP_DOWN retry timer is scheduled. */
574 daemon_down(struct daemon
*dmn
, const char *why
)
576 if (IS_UP(dmn
) || (dmn
->state
== DAEMON_INIT
))
577 zlog_err("%s state -> down : %s",dmn
->name
,why
);
578 else if (gs
.loglevel
> LOG_DEBUG
)
579 zlog_debug("%s still down : %s",dmn
->name
,why
);
582 dmn
->state
= DAEMON_DOWN
;
588 THREAD_OFF(dmn
->t_read
);
589 THREAD_OFF(dmn
->t_write
);
590 THREAD_OFF(dmn
->t_wakeup
);
591 if (try_connect(dmn
) < 0)
592 SET_WAKEUP_DOWN(dmn
);
597 handle_read(struct thread
*t_read
)
599 struct daemon
*dmn
= THREAD_ARG(t_read
);
600 static const char resp
[sizeof(PING_TOKEN
)+4] = PING_TOKEN
"\n";
601 char buf
[sizeof(resp
)+100];
603 struct timeval delay
;
606 if ((rc
= read(dmn
->fd
,buf
,sizeof(buf
))) < 0)
610 if (ERRNO_IO_RETRY(errno
))
612 /* Pretend it never happened. */
613 SET_READ_HANDLER(dmn
);
616 snprintf(why
,sizeof(why
),"unexpected read error: %s",
617 safe_strerror(errno
));
618 daemon_down(dmn
,why
);
623 daemon_down(dmn
,"read returned EOF");
626 if (!dmn
->echo_sent
.tv_sec
)
628 char why
[sizeof(buf
)+100];
629 snprintf(why
,sizeof(why
),"unexpected read returns %d bytes: %.*s",
630 (int)rc
,(int)rc
,buf
);
631 daemon_down(dmn
,why
);
635 /* We are expecting an echo response: is there any chance that the
636 response would not be returned entirely in the first read? That
637 seems inconceivable... */
638 if ((rc
!= sizeof(resp
)) || memcmp(buf
,resp
,sizeof(resp
)))
640 char why
[100+sizeof(buf
)];
641 snprintf(why
,sizeof(why
),"read returned bad echo response of %d bytes "
642 "(expecting %u): %.*s",
643 (int)rc
,(u_int
)sizeof(resp
),(int)rc
,buf
);
644 daemon_down(dmn
,why
);
648 time_elapsed(&delay
,&dmn
->echo_sent
);
649 dmn
->echo_sent
.tv_sec
= 0;
650 if (dmn
->state
== DAEMON_UNRESPONSIVE
)
652 if (delay
.tv_sec
< gs
.timeout
)
654 dmn
->state
= DAEMON_UP
;
655 zlog_warn("%s state -> up : echo response received after %ld.%06ld "
656 "seconds", dmn
->name
,delay
.tv_sec
,delay
.tv_usec
);
659 zlog_warn("%s: slow echo response finally received after %ld.%06ld "
660 "seconds", dmn
->name
,delay
.tv_sec
,delay
.tv_usec
);
662 else if (gs
.loglevel
> LOG_DEBUG
+1)
663 zlog_debug("%s: echo response received after %ld.%06ld seconds",
664 dmn
->name
,delay
.tv_sec
,delay
.tv_usec
);
666 SET_READ_HANDLER(dmn
);
668 thread_cancel(dmn
->t_wakeup
);
669 SET_WAKEUP_ECHO(dmn
);
/* Mark daemon dmn as up; `why` is the reason included in the log.
   Sets the state to DAEMON_UP, resets connect_tries to zero, logs the
   transition at notice level and schedules the echo wakeup through
   SET_WAKEUP_ECHO.
   NOTE(review): presumably the echo wakeup is scheduled only when pinging
   is enabled (see the --no-echo option); confirm. */
675 daemon_up(struct daemon
*dmn
, const char *why
)
677 dmn
->state
= DAEMON_UP
;
679 dmn
->connect_tries
= 0;
680 zlog_notice("%s state -> up : %s",dmn
->name
,why
);
682 SET_WAKEUP_ECHO(dmn
);
/* Write-ready callback for a connect() that was still in progress;
   try_connect() registers it with thread_add_write() on dmn->fd.
   The pending socket error is fetched with getsockopt(SO_ERROR).  If
   getsockopt() itself fails, or it reports a non-zero socket error, the
   daemon is declared down with an explanatory reason; the remaining path
   declares it up with "delayed connect succeeded". */
687 check_connect(struct thread
*t_write
)
689 struct daemon
*dmn
= THREAD_ARG(t_write
);
691 socklen_t reslen
= sizeof(sockerr
);
694 if (getsockopt(dmn
->fd
,SOL_SOCKET
,SO_ERROR
,(char *)&sockerr
,&reslen
) < 0)
696 zlog_warn("%s: check_connect: getsockopt failed: %s",
697 dmn
->name
,safe_strerror(errno
));
698 daemon_down(dmn
,"getsockopt failed checking connection success");
701 if ((reslen
== sizeof(sockerr
)) && sockerr
)
704 snprintf(why
,sizeof(why
),
705 "getsockopt reports that connection attempt failed: %s",
706 safe_strerror(sockerr
));
707 daemon_down(dmn
,why
);
711 daemon_up(dmn
,"delayed connect succeeded");
/* Timer callback armed by try_connect() next to a connect() that is still
   in progress.  If it fires, the connection attempt is considered to have
   timed out: t_wakeup is cleared and the daemon is declared down with a
   "connection attempt timed out" reason. */
716 wakeup_connect_hanging(struct thread
*t_wakeup
)
718 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
721 dmn
->t_wakeup
= NULL
;
722 snprintf(why
,sizeof(why
),"connection attempt timed out after %ld seconds",
724 daemon_down(dmn
,why
);
728 /* Making connection to protocol daemon. */
730 try_connect(struct daemon
*dmn
)
733 struct sockaddr_un addr
;
736 if (gs
.loglevel
> LOG_DEBUG
+1)
737 zlog_debug("%s: attempting to connect",dmn
->name
);
738 dmn
->connect_tries
++;
740 memset (&addr
, 0, sizeof (struct sockaddr_un
));
741 addr
.sun_family
= AF_UNIX
;
742 snprintf(addr
.sun_path
, sizeof(addr
.sun_path
), "%s/%s.vty",
743 gs
.vtydir
,dmn
->name
);
744 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
745 len
= addr
.sun_len
= SUN_LEN(&addr
);
747 len
= sizeof (addr
.sun_family
) + strlen (addr
.sun_path
);
748 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
750 /* Quick check to see if we might succeed before we go to the trouble
751 of creating a socket. */
752 if (access(addr
.sun_path
, W_OK
) < 0)
755 zlog_err("%s: access to socket %s denied: %s",
756 dmn
->name
,addr
.sun_path
,safe_strerror(errno
));
760 if ((sock
= socket (AF_UNIX
, SOCK_STREAM
, 0)) < 0)
762 zlog_err("%s(%s): cannot make socket: %s",
763 __func__
,addr
.sun_path
, safe_strerror(errno
));
767 if (set_nonblocking(sock
) < 0)
769 zlog_err("%s(%s): set_nonblocking(%d) failed",
770 __func__
, addr
.sun_path
, sock
);
775 if (connect (sock
, (struct sockaddr
*) &addr
, len
) < 0)
777 if ((errno
!= EINPROGRESS
) && (errno
!= EWOULDBLOCK
))
779 if (gs
.loglevel
> LOG_DEBUG
)
780 zlog_debug("%s(%s): connect failed: %s",
781 __func__
,addr
.sun_path
, safe_strerror(errno
));
785 if (gs
.loglevel
> LOG_DEBUG
)
786 zlog_debug("%s: connection in progress",dmn
->name
);
787 dmn
->state
= DAEMON_CONNECTING
;
789 dmn
->t_write
= thread_add_write(master
,check_connect
,dmn
,dmn
->fd
);
790 dmn
->t_wakeup
= thread_add_timer(master
,wakeup_connect_hanging
,dmn
,
792 SET_READ_HANDLER(dmn
);
797 SET_READ_HANDLER(dmn
);
798 daemon_up(dmn
,"connect succeeded");
/* Watchdog callback armed by set_phase(): the current phase of a phased
   restart did not complete in time.  Clears the timer handle, logs the
   phase as hanging for PHASE_TIMEOUT (3 * gs.restart_timeout) seconds, and
   aborts the phased restart by resetting gs.phase to PHASE_NONE. */
803 phase_hanging(struct thread
*t_hanging
)
805 gs
.t_phase_hanging
= NULL
;
806 zlog_err("Phase [%s] hanging for %ld seconds, aborting phased restart",
807 phase_str
[gs
.phase
],PHASE_TIMEOUT
);
808 gs
.phase
= PHASE_NONE
;
/* Enter restart phase new_phase: store it in gs.phase, cancel the phase
   watchdog timer if one is pending, and arm a fresh watchdog whose callback
   is phase_hanging(). */
813 set_phase(restart_phase_t new_phase
)
815 gs
.phase
= new_phase
;
816 if (gs
.t_phase_hanging
)
817 thread_cancel(gs
.t_phase_hanging
);
818 gs
.t_phase_hanging
= thread_add_timer(master
,phase_hanging
,NULL
,
829 case PHASE_STOPS_PENDING
:
832 zlog_info("Phased restart: all routing daemon stop jobs have completed.");
833 set_phase(PHASE_WAITING_DOWN
);
835 case PHASE_WAITING_DOWN
:
836 if (gs
.numdown
+IS_UP(gs
.special
) < gs
.numdaemons
)
838 zlog_info("Phased restart: all routing daemons now down.");
839 run_job(&gs
.special
->restart
,"restart",gs
.restart_command
,1,1);
840 set_phase(PHASE_ZEBRA_RESTART_PENDING
);
842 case PHASE_ZEBRA_RESTART_PENDING
:
843 if (gs
.special
->restart
.pid
)
845 zlog_info("Phased restart: %s restart job completed.",gs
.special
->name
);
846 set_phase(PHASE_WAITING_ZEBRA_UP
);
848 case PHASE_WAITING_ZEBRA_UP
:
849 if (!IS_UP(gs
.special
))
851 zlog_info("Phased restart: %s is now up.",gs
.special
->name
);
854 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
856 if (dmn
!= gs
.special
)
857 run_job(&dmn
->restart
,"start",gs
.start_command
,1,0);
860 gs
.phase
= PHASE_NONE
;
861 THREAD_OFF(gs
.t_phase_hanging
);
862 zlog_notice("Phased global restart has completed.");
868 try_restart(struct daemon
*dmn
)
874 case MODE_GLOBAL_RESTART
:
875 run_job(&gs
.restart
,"restart",gs
.restart_command
,0,1);
877 case MODE_SEPARATE_RESTART
:
878 run_job(&dmn
->restart
,"restart",gs
.restart_command
,0,1);
880 case MODE_PHASED_ZEBRA_RESTART
:
881 if (dmn
!= gs
.special
)
883 if ((gs
.special
->state
== DAEMON_UP
) && (gs
.phase
== PHASE_NONE
))
884 run_job(&dmn
->restart
,"restart",gs
.restart_command
,0,1);
886 zlog_debug("%s: postponing restart attempt because master %s daemon "
887 "not up [%s], or phased restart in progress",
888 dmn
->name
,gs
.special
->name
,state_str
[gs
.special
->state
]);
892 case MODE_PHASED_ALL_RESTART
:
893 if ((gs
.phase
!= PHASE_NONE
) || gs
.numpids
)
895 if (gs
.loglevel
> LOG_DEBUG
+1)
896 zlog_debug("postponing phased global restart: restart already in "
897 "progress [%s], or outstanding child processes [%d]",
898 phase_str
[gs
.phase
],gs
.numpids
);
901 /* Is it too soon for a restart? */
903 struct timeval delay
;
904 if (time_elapsed(&delay
,&gs
.special
->restart
.time
)->tv_sec
<
905 gs
.special
->restart
.interval
)
907 if (gs
.loglevel
> LOG_DEBUG
+1)
908 zlog_debug("postponing phased global restart: "
909 "elapsed time %ld < retry interval %ld",
910 (long)delay
.tv_sec
,gs
.special
->restart
.interval
);
914 zlog_info("Phased restart: stopping all routing daemons.");
915 /* First step: stop all other daemons. */
916 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
918 if (dmn
!= gs
.special
)
919 run_job(&dmn
->restart
,"stop",gs
.stop_command
,1,1);
921 set_phase(PHASE_STOPS_PENDING
);
924 zlog_err("error: unknown restart mode %d",gs
.mode
);
/* Timer callback for a daemon that was marked DAEMON_UNRESPONSIVE.
   Clears t_wakeup.  If the state is no longer DAEMON_UNRESPONSIVE an error
   is logged, because this timer should have been cancelled when the state
   changed.  SET_WAKEUP_UNRESPONSIVE re-arms the timer.
   NOTE(review): presumably the re-arm belongs to the still-unresponsive
   branch and is paired with a restart attempt; confirm. */
930 wakeup_unresponsive(struct thread
*t_wakeup
)
932 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
934 dmn
->t_wakeup
= NULL
;
935 if (dmn
->state
!= DAEMON_UNRESPONSIVE
)
936 zlog_err("%s: no longer unresponsive (now %s), "
937 "wakeup should have been cancelled!",
938 dmn
->name
,state_str
[dmn
->state
]);
941 SET_WAKEUP_UNRESPONSIVE(dmn
);
/* Timer callback armed by wakeup_send_echo(): no answer to the ping arrived
   before the gs.timeout timer expired.  Clears t_wakeup, marks the daemon
   DAEMON_UNRESPONSIVE and logs the transition.  When
   gs.unresponsive_restart is set, the SET_WAKEUP_UNRESPONSIVE timer is
   scheduled for the daemon. */
948 wakeup_no_answer(struct thread
*t_wakeup
)
950 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
952 dmn
->t_wakeup
= NULL
;
953 dmn
->state
= DAEMON_UNRESPONSIVE
;
954 zlog_err("%s state -> unresponsive : no response yet to ping "
955 "sent %ld seconds ago",dmn
->name
,gs
.timeout
);
956 if (gs
.unresponsive_restart
)
958 SET_WAKEUP_UNRESPONSIVE(dmn
);
/* Timer callback that pings a daemon over its connected socket (dmn->fd).
   The command "echo " PING_TOKEN is written with sizeof(echocmd) bytes, so
   the terminating NUL is sent as part of the command.  A write that fails
   or is short takes the daemon down with a descriptive reason.  The send
   time is recorded in dmn->echo_sent and wakeup_no_answer() is armed to
   fire after gs.timeout seconds.
   NOTE(review): presumably the timestamp and timer are set only when the
   write succeeded; confirm. */
965 wakeup_send_echo(struct thread
*t_wakeup
)
967 static const char echocmd
[] = "echo " PING_TOKEN
;
969 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
971 dmn
->t_wakeup
= NULL
;
972 if (((rc
= write(dmn
->fd
,echocmd
,sizeof(echocmd
))) < 0) ||
973 ((size_t)rc
!= sizeof(echocmd
)))
975 char why
[100+sizeof(echocmd
)];
976 snprintf(why
,sizeof(why
),"write '%s' returned %d instead of %u",
977 echocmd
,(int)rc
,(u_int
)sizeof(echocmd
));
978 daemon_down(dmn
,why
);
982 gettimeofday(&dmn
->echo_sent
,NULL
);
983 dmn
->t_wakeup
= thread_add_timer(master
,wakeup_no_answer
,dmn
,gs
.timeout
);
991 zlog_notice("Terminating on signal");
/* Return non-zero iff cmd contains a '%' that is immediately followed by
   's' and no further '%' occurs after it, i.e. the string holds exactly one
   conversion and that conversion is "%s".  main() uses this to validate the
   restart, start and kill command arguments before they are later used as
   format strings. */
996 valid_command(const char *cmd
)
1000 return ((p
= strchr(cmd
,'%')) != NULL
) && (*(p
+1) == 's') && !strchr(p
+1,'%');
1003 /* This is an ugly hack to circumvent problems with passing command-line
1004 arguments that contain spaces. The fix is to use a configuration file. */
/* Return a newly allocated copy of cmd in which every occurrence of
   blankstr has been replaced by a single space.

   cmd       command string to copy (not modified).
   blankstr  placeholder that stands for a blank.

   The result is heap-allocated and owned by the caller.  Allocation failure
   is fatal: an error is printed and the program exits with status 1.

   Two degenerate placeholders are returned as an unmodified copy, because
   substituting them can never terminate:
     - ""  : strstr() matches at every position, and each "replacement"
             would grow the string past the end of its allocation;
     - " " : the replacement is identical to the match, so rescanning would
             find the same space forever. */
static char *
translate_blanks(const char *cmd, const char *blankstr)
{
  char *res;
  char *p;
  size_t cmdlen = strlen(cmd);
  size_t bslen = strlen(blankstr);

  if (!(res = malloc(cmdlen+1)))
    {
      perror("malloc");
      exit(1);
    }
  memcpy(res,cmd,cmdlen+1);

  /* Nothing sensible to substitute; see the header comment. */
  if ((bslen == 0) || ((bslen == 1) && (*blankstr == ' ')))
    return res;

  /* Every pass either shortens the string (bslen > 1) or removes one
     occurrence of a non-space character (bslen == 1), so this terminates. */
  while ((p = strstr(res,blankstr)) != NULL)
    {
      *p = ' ';
      if (bslen != 1)
	/* Close the gap left by the rest of the placeholder, NUL included. */
	memmove(p+1,p+bslen,strlen(p+bslen)+1);
    }
  return res;
}
1027 main(int argc
, char **argv
)
1029 const char *progname
;
1031 int daemon_mode
= 0;
1032 const char *pidfile
= DEFAULT_PIDFILE
;
1033 const char *special
= "zebra";
1034 const char *blankstr
= NULL
;
1035 static struct quagga_signal_t my_signals
[] =
1047 .handler
= sigchild
,
1051 if ((progname
= strrchr (argv
[0], '/')) != NULL
)
1056 gs
.restart
.name
= "all";
1057 while ((opt
= getopt_long(argc
, argv
, "aAb:dek:l:m:M:i:p:r:R:S:s:t:T:zvh",
1058 longopts
, 0)) != EOF
)
1065 if ((gs
.mode
!= MODE_MONITOR
) && (gs
.mode
!= MODE_SEPARATE_RESTART
))
1067 fputs("Ambiguous operating mode selected.\n",stderr
);
1068 return usage(progname
,1);
1070 gs
.mode
= MODE_PHASED_ZEBRA_RESTART
;
1073 if ((gs
.mode
!= MODE_MONITOR
) && (gs
.mode
!= MODE_SEPARATE_RESTART
))
1075 fputs("Ambiguous operating mode selected.\n",stderr
);
1076 return usage(progname
,1);
1078 gs
.mode
= MODE_PHASED_ALL_RESTART
;
1090 if (!valid_command(optarg
))
1092 fprintf(stderr
,"Invalid kill command, must contain '%%s': %s\n",
1094 return usage(progname
,1);
1096 gs
.stop_command
= optarg
;
1101 if ((sscanf(optarg
,"%d%1s",&gs
.loglevel
,garbage
) != 1) ||
1102 (gs
.loglevel
< LOG_EMERG
))
1104 fprintf(stderr
,"Invalid loglevel argument: %s\n",optarg
);
1105 return usage(progname
,1);
1112 if ((sscanf(optarg
,"%ld%1s",
1113 &gs
.min_restart_interval
,garbage
) != 1) ||
1114 (gs
.min_restart_interval
< 0))
1116 fprintf(stderr
,"Invalid min_restart_interval argument: %s\n",
1118 return usage(progname
,1);
1125 if ((sscanf(optarg
,"%ld%1s",
1126 &gs
.max_restart_interval
,garbage
) != 1) ||
1127 (gs
.max_restart_interval
< 0))
1129 fprintf(stderr
,"Invalid max_restart_interval argument: %s\n",
1131 return usage(progname
,1);
1139 if ((sscanf(optarg
,"%d%1s",&period
,garbage
) != 1) ||
1142 fprintf(stderr
,"Invalid interval argument: %s\n",optarg
);
1143 return usage(progname
,1);
1145 gs
.period
= 1000*period
;
1152 if ((gs
.mode
== MODE_GLOBAL_RESTART
) ||
1153 (gs
.mode
== MODE_SEPARATE_RESTART
))
1155 fputs("Ambiguous operating mode selected.\n",stderr
);
1156 return usage(progname
,1);
1158 if (!valid_command(optarg
))
1161 "Invalid restart command, must contain '%%s': %s\n",
1163 return usage(progname
,1);
1165 gs
.restart_command
= optarg
;
1166 if (gs
.mode
== MODE_MONITOR
)
1167 gs
.mode
= MODE_SEPARATE_RESTART
;
1170 if (gs
.mode
!= MODE_MONITOR
)
1172 fputs("Ambiguous operating mode selected.\n",stderr
);
1173 return usage(progname
,1);
1175 if (strchr(optarg
,'%'))
1178 "Invalid restart-all arg, must not contain '%%s': %s\n",
1180 return usage(progname
,1);
1182 gs
.restart_command
= optarg
;
1183 gs
.mode
= MODE_GLOBAL_RESTART
;
1186 if (!valid_command(optarg
))
1188 fprintf(stderr
,"Invalid start command, must contain '%%s': %s\n",
1190 return usage(progname
,1);
1192 gs
.start_command
= optarg
;
1200 if ((sscanf(optarg
,"%ld%1s",&gs
.timeout
,garbage
) != 1) ||
1203 fprintf(stderr
,"Invalid timeout argument: %s\n",optarg
);
1204 return usage(progname
,1);
1211 if ((sscanf(optarg
,"%ld%1s",&gs
.restart_timeout
,garbage
) != 1) ||
1212 (gs
.restart_timeout
< 1))
1214 fprintf(stderr
,"Invalid restart timeout argument: %s\n",optarg
);
1215 return usage(progname
,1);
1220 gs
.unresponsive_restart
= 1;
1223 printf ("%s version %s\n", progname
, QUAGGA_VERSION
);
1224 puts("Copyright 2004 Andrew J. Schorr");
1227 return usage(progname
,0);
1229 fputs("Invalid option.\n",stderr
);
1230 return usage(progname
,1);
1234 if (gs
.unresponsive_restart
&& (gs
.mode
== MODE_MONITOR
))
1236 fputs("Option -z requires a -r or -R restart option.\n",stderr
);
1237 return usage(progname
,1);
1242 if (gs
.restart_command
|| gs
.start_command
|| gs
.stop_command
)
1244 fprintf(stderr
,"No kill/(re)start commands needed for %s mode.\n",
1246 return usage(progname
,1);
1249 case MODE_GLOBAL_RESTART
:
1250 case MODE_SEPARATE_RESTART
:
1251 if (!gs
.restart_command
|| gs
.start_command
|| gs
.stop_command
)
1253 fprintf(stderr
,"No start/kill commands needed in [%s] mode.\n",
1255 return usage(progname
,1);
1258 case MODE_PHASED_ZEBRA_RESTART
:
1259 case MODE_PHASED_ALL_RESTART
:
1260 if (!gs
.restart_command
|| !gs
.start_command
|| !gs
.stop_command
)
1263 "Need start, kill, and restart commands in [%s] mode.\n",
1265 return usage(progname
,1);
1272 if (gs
.restart_command
)
1273 gs
.restart_command
= translate_blanks(gs
.restart_command
,blankstr
);
1274 if (gs
.start_command
)
1275 gs
.start_command
= translate_blanks(gs
.start_command
,blankstr
);
1276 if (gs
.stop_command
)
1277 gs
.stop_command
= translate_blanks(gs
.stop_command
,blankstr
);
1280 gs
.restart
.interval
= gs
.min_restart_interval
;
1281 master
= thread_master_create();
1282 signal_init (master
, Q_SIGC(my_signals
), my_signals
);
1283 srandom(time(NULL
));
1287 struct daemon
*tail
= NULL
;
1289 for (i
= optind
; i
< argc
; i
++)
1293 if (!(dmn
= (struct daemon
*)calloc(1,sizeof(*dmn
))))
1295 fprintf(stderr
,"calloc(1,%u) failed: %s\n",
1296 (u_int
)sizeof(*dmn
), safe_strerror(errno
));
1299 dmn
->name
= dmn
->restart
.name
= argv
[i
];
1300 dmn
->state
= DAEMON_INIT
;
1304 dmn
->t_wakeup
= thread_add_timer_msec(master
,wakeup_init
,dmn
,
1305 100+(random() % 900));
1306 dmn
->restart
.interval
= gs
.min_restart_interval
;
1313 if (((gs
.mode
== MODE_PHASED_ZEBRA_RESTART
) ||
1314 (gs
.mode
== MODE_PHASED_ALL_RESTART
)) &&
1315 !strcmp(dmn
->name
,special
))
1321 fputs("Must specify one or more daemons to monitor.\n",stderr
);
1322 return usage(progname
,1);
1324 if (((gs
.mode
== MODE_PHASED_ZEBRA_RESTART
) ||
1325 (gs
.mode
== MODE_PHASED_ALL_RESTART
)) && !gs
.special
)
1327 fprintf(stderr
,"In mode [%s], but cannot find master daemon %s\n",
1328 mode_str
[gs
.mode
],special
);
1329 return usage(progname
,1);
1331 if (gs
.special
&& (gs
.numdaemons
< 2))
1333 fprintf(stderr
,"Mode [%s] does not make sense with only 1 daemon "
1334 "to watch.\n",mode_str
[gs
.mode
]);
1335 return usage(progname
,1);
1338 zlog_default
= openzlog(progname
, ZLOG_NONE
,
1339 LOG_CONS
|LOG_NDELAY
|LOG_PID
, LOG_DAEMON
);
1340 zlog_set_level(NULL
, ZLOG_DEST_MONITOR
, ZLOG_DISABLED
);
1343 zlog_set_level(NULL
, ZLOG_DEST_SYSLOG
, MIN(gs
.loglevel
,LOG_DEBUG
));
1344 if (daemon (0, 0) < 0)
1346 fprintf(stderr
, "Watchquagga daemon failed: %s", strerror(errno
));
1351 zlog_set_level(NULL
, ZLOG_DEST_STDOUT
, MIN(gs
.loglevel
,LOG_DEBUG
));
1353 /* Make sure we're not already running. */
1354 pid_output (pidfile
);
1356 /* Announce which daemons are being monitored. */
1361 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
1362 len
+= strlen(dmn
->name
)+1;
1368 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
1372 strcpy(p
,dmn
->name
);
1375 zlog_notice("%s %s watching [%s], mode [%s]",
1376 progname
, QUAGGA_VERSION
, buf
, mode_str
[gs
.mode
]);
1381 struct thread thread
;
1383 while (thread_fetch (master
, &thread
))
1384 thread_call (&thread
);