#include "command.h"
#include "memory_vty.h"
#include "libfrr.h"
+#include "lib_errors.h"
#include <getopt.h>
#include <sys/un.h>
#include <systemd.h>
#include "watchfrr.h"
+#include "watchfrr_errors.h"
#ifndef MIN
#define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
#define PING_TOKEN "PING"
+DEFINE_MGROUP(WATCHFRR, "watchfrr")
+DEFINE_MTYPE_STATIC(WATCHFRR, WATCHFRR_DAEMON, "watchfrr daemon entry")
+
/* Needs to be global, referenced somewhere inside libfrr. */
struct thread_master *master;
-static char pidfile_default[256];
static bool watch_only = false;
typedef enum {
PHASE_NONE = 0,
+ PHASE_INIT,
PHASE_STOPS_PENDING,
PHASE_WAITING_DOWN,
PHASE_ZEBRA_RESTART_PENDING,
} restart_phase_t;
static const char *phase_str[] = {
- "None",
+ "Idle",
+ "Startup",
"Stop jobs running",
"Waiting for other daemons to come down",
"Zebra restart job running",
int numpids;
int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
} gs = {
- .phase = PHASE_NONE,
+ .phase = PHASE_INIT,
.vtydir = frr_vtydir,
.period = 1000 * DEFAULT_PERIOD,
.timeout = DEFAULT_TIMEOUT,
daemon_state_t state;
int fd;
struct timeval echo_sent;
- u_int connect_tries;
+ unsigned int connect_tries;
struct thread *t_wakeup;
struct thread *t_read;
struct thread *t_write;
static int wakeup_send_echo(struct thread *t_wakeup);
static void try_restart(struct daemon *dmn);
static void phase_check(void);
+static void restart_done(struct daemon *dmn);
static const char *progname;
static void printhelp(FILE *target)
name of the daemon should be substituted.\n\
--dry Do not start or restart anything, just log.\n\
-p, --pid-file Set process identifier file name\n\
- (default is %s).\n\
+ (default is %s/watchfrr.pid).\n\
-b, --blank-string\n\
When the supplied argument string is found in any of the\n\
various shell command arguments (-r, -s, or -k), replace\n\
-h, --help Display this help and exit\n",
frr_vtydir, DEFAULT_LOGLEVEL, LOG_EMERG, LOG_DEBUG, LOG_DEBUG,
DEFAULT_MIN_RESTART, DEFAULT_MAX_RESTART, DEFAULT_PERIOD,
- DEFAULT_TIMEOUT, DEFAULT_RESTART_TIMEOUT, pidfile_default);
+ DEFAULT_TIMEOUT, DEFAULT_RESTART_TIMEOUT, frr_vtydir);
}
static pid_t run_background(char *shell_cmd)
switch (child = fork()) {
case -1:
- zlog_err("fork failed, cannot run command [%s]: %s", shell_cmd,
- safe_strerror(errno));
+ flog_err_sys(EC_LIB_SYSTEM_CALL,
+ "fork failed, cannot run command [%s]: %s",
+ shell_cmd, safe_strerror(errno));
return -1;
case 0:
/* Child process. */
char dashc[] = "-c";
char *const argv[4] = {shell, dashc, shell_cmd, NULL};
execv("/bin/sh", argv);
- zlog_err("execv(/bin/sh -c '%s') failed: %s", shell_cmd,
- safe_strerror(errno));
+ flog_err_sys(EC_LIB_SYSTEM_CALL,
+ "execv(/bin/sh -c '%s') failed: %s",
+ shell_cmd, safe_strerror(errno));
_exit(127);
}
default:
/* Parent process: we will reap the child later. */
- zlog_err("Forked background command [pid %d]: %s", (int)child,
- shell_cmd);
+ flog_err_sys(EC_LIB_SYSTEM_CALL,
+ "Forked background command [pid %d]: %s",
+ (int)child, shell_cmd);
return child;
}
}
static struct restart_info *find_child(pid_t child)
{
struct daemon *dmn;
+ if (gs.restart.pid == child)
+ return &gs.restart;
+
for (dmn = gs.daemons; dmn; dmn = dmn->next) {
if (dmn->restart.pid == child)
return &dmn->restart;
const char *name;
const char *what;
struct restart_info *restart;
+ struct daemon *dmn;
switch (child = waitpid(-1, &status, WNOHANG)) {
case -1:
- zlog_err("waitpid failed: %s", safe_strerror(errno));
+ flog_err_sys(EC_LIB_SYSTEM_CALL, "waitpid failed: %s",
+ safe_strerror(errno));
return;
case 0:
zlog_warn("SIGCHLD received, but waitpid did not reap a child");
* completed. */
gettimeofday(&restart->time, NULL);
} else {
- zlog_err(
+ flog_err_sys(
+ EC_LIB_SYSTEM_CALL,
"waitpid returned status for an unknown child process %d",
(int)child);
name = "(unknown)";
zlog_warn(
"%s %s process %d exited with non-zero status %d",
what, name, (int)child, WEXITSTATUS(status));
- else
+ else {
zlog_debug("%s %s process %d exited normally", what,
name, (int)child);
+
+ if (restart && restart != &gs.restart) {
+ dmn = container_of(restart, struct daemon,
+ restart);
+ restart_done(dmn);
+ } else if (restart)
+ for (dmn = gs.daemons; dmn; dmn = dmn->next)
+ restart_done(dmn);
+ }
} else
- zlog_err("cannot interpret %s %s process %d wait status 0x%x",
- what, name, (int)child, status);
+ flog_err_sys(
+ EC_LIB_SYSTEM_CALL,
+ "cannot interpret %s %s process %d wait status 0x%x",
+ what, name, (int)child, status);
phase_check();
}
dmn->t_wakeup = NULL;
if (try_connect(dmn) < 0) {
- SET_WAKEUP_DOWN(dmn);
- zlog_err("%s state -> down : initial connection attempt failed",
+ flog_err(EC_WATCHFRR_CONNECTION,
+ "%s state -> down : initial connection attempt failed",
dmn->name);
dmn->state = DAEMON_DOWN;
}
+ phase_check();
return 0;
}
+static void restart_done(struct daemon *dmn)
+{
+ if (dmn->state != DAEMON_DOWN) {
+ zlog_warn("wtf?");
+ return;
+ }
+ if (dmn->t_wakeup)
+ THREAD_OFF(dmn->t_wakeup);
+ if (try_connect(dmn) < 0)
+ SET_WAKEUP_DOWN(dmn);
+}
+
static void daemon_down(struct daemon *dmn, const char *why)
{
if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
- zlog_err("%s state -> down : %s", dmn->name, why);
+ flog_err(EC_WATCHFRR_CONNECTION, "%s state -> down : %s",
+ dmn->name, why);
else if (gs.loglevel > LOG_DEBUG)
zlog_debug("%s still down : %s", dmn->name, why);
if (IS_UP(dmn))
snprintf(why, sizeof(why),
"read returned bad echo response of %d bytes "
"(expecting %u): %.*s",
- (int)rc, (u_int)sizeof(resp), (int)rc, buf);
+ (int)rc, (unsigned int)sizeof(resp), (int)rc, buf);
daemon_down(dmn, why);
return 0;
}
if (!sent && gs.numdown == 0) {
FILE *fp;
+ zlog_notice("all daemons up, doing startup-complete notify");
+ frr_detach();
+
fp = fopen(DAEMON_VTY_DIR "/watchfrr.started", "w");
if (fp)
fclose(fp);
#if defined HAVE_SYSTEMD
- zlog_notice(
- "Watchfrr: Notifying Systemd we are up and running");
systemd_send_started(master, 0);
#endif
sent = 1;
of creating a socket. */
if (access(addr.sun_path, W_OK) < 0) {
if (errno != ENOENT)
- zlog_err("%s: access to socket %s denied: %s",
- dmn->name, addr.sun_path,
- safe_strerror(errno));
+ flog_err_sys(EC_LIB_SYSTEM_CALL,
+ "%s: access to socket %s denied: %s",
+ dmn->name, addr.sun_path,
+ safe_strerror(errno));
return -1;
}
if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
- zlog_err("%s(%s): cannot make socket: %s", __func__,
- addr.sun_path, safe_strerror(errno));
+ flog_err_sys(EC_LIB_SOCKET, "%s(%s): cannot make socket: %s",
+ __func__, addr.sun_path, safe_strerror(errno));
return -1;
}
if (set_nonblocking(sock) < 0 || set_cloexec(sock) < 0) {
- zlog_err("%s(%s): set_nonblocking/cloexec(%d) failed", __func__,
- addr.sun_path, sock);
+ flog_err_sys(EC_LIB_SYSTEM_CALL,
+ "%s(%s): set_nonblocking/cloexec(%d) failed",
+ __func__, addr.sun_path, sock);
close(sock);
return -1;
}
static int phase_hanging(struct thread *t_hanging)
{
gs.t_phase_hanging = NULL;
- zlog_err("Phase [%s] hanging for %ld seconds, aborting phased restart",
+ flog_err(EC_WATCHFRR_CONNECTION,
+ "Phase [%s] hanging for %ld seconds, aborting phased restart",
phase_str[gs.phase], PHASE_TIMEOUT);
gs.phase = PHASE_NONE;
return 0;
static void phase_check(void)
{
+ struct daemon *dmn;
+
switch (gs.phase) {
case PHASE_NONE:
break;
+
+ case PHASE_INIT:
+ for (dmn = gs.daemons; dmn; dmn = dmn->next)
+ if (dmn->state == DAEMON_INIT)
+ return;
+
+ /* startup complete, everything out of INIT */
+ gs.phase = PHASE_NONE;
+ for (dmn = gs.daemons; dmn; dmn = dmn->next)
+ if (dmn->state == DAEMON_DOWN) {
+ SET_WAKEUP_DOWN(dmn);
+ try_restart(dmn);
+ }
+ break;
case PHASE_STOPS_PENDING:
if (gs.numpids)
break;
dmn->t_wakeup = NULL;
if (dmn->state != DAEMON_UNRESPONSIVE)
- zlog_err(
- "%s: no longer unresponsive (now %s), "
- "wakeup should have been cancelled!",
- dmn->name, state_str[dmn->state]);
+ flog_err(EC_WATCHFRR_CONNECTION,
+ "%s: no longer unresponsive (now %s), "
+ "wakeup should have been cancelled!",
+ dmn->name, state_str[dmn->state]);
else {
SET_WAKEUP_UNRESPONSIVE(dmn);
try_restart(dmn);
dmn->t_wakeup = NULL;
dmn->state = DAEMON_UNRESPONSIVE;
- zlog_err(
- "%s state -> unresponsive : no response yet to ping "
- "sent %ld seconds ago",
- dmn->name, gs.timeout);
+ flog_err(EC_WATCHFRR_CONNECTION,
+ "%s state -> unresponsive : no response yet to ping "
+ "sent %ld seconds ago",
+ dmn->name, gs.timeout);
SET_WAKEUP_UNRESPONSIVE(dmn);
try_restart(dmn);
return 0;
char why[100 + sizeof(echocmd)];
snprintf(why, sizeof(why),
"write '%s' returned %d instead of %u", echocmd,
- (int)rc, (u_int)sizeof(echocmd));
+ (int)rc, (unsigned int)sizeof(echocmd));
daemon_down(dmn, why);
} else {
gettimeofday(&dmn->echo_sent, NULL);
return true;
}
+void watchfrr_status(struct vty *vty)
+{
+ struct daemon *dmn;
+ struct timeval delay;
+
+ vty_out(vty, "watchfrr global phase: %s\n", phase_str[gs.phase]);
+ if (gs.restart.pid)
+ vty_out(vty, " global restart running, pid %ld\n",
+ (long)gs.restart.pid);
+
+ for (dmn = gs.daemons; dmn; dmn = dmn->next) {
+ vty_out(vty, " %-20s %s\n", dmn->name, state_str[dmn->state]);
+ if (dmn->restart.pid)
+ vty_out(vty, " restart running, pid %ld\n",
+ (long)dmn->restart.pid);
+ else if (dmn->state == DAEMON_DOWN &&
+ time_elapsed(&delay, &dmn->restart.time)->tv_sec
+ < dmn->restart.interval)
+ vty_out(vty, " restarting in %ld seconds"
+ " (%lds backoff interval)\n",
+ dmn->restart.interval - delay.tv_sec,
+ dmn->restart.interval);
+ }
+}
+
static void sigint(void)
{
zlog_notice("Terminating on signal");
return res;
}
+static void watchfrr_init(int argc, char **argv)
+{
+ const char *special = "zebra";
+ int i;
+ struct daemon *dmn, **add = &gs.daemons;
+ char alldaemons[512] = "", *p = alldaemons;
+
+ for (i = optind; i < argc; i++) {
+ dmn = XCALLOC(MTYPE_WATCHFRR_DAEMON, sizeof(*dmn));
+
+ dmn->name = dmn->restart.name = argv[i];
+ dmn->state = DAEMON_INIT;
+ gs.numdaemons++;
+ gs.numdown++;
+ dmn->fd = -1;
+ dmn->t_wakeup = NULL;
+ thread_add_timer_msec(master, wakeup_init, dmn, 0,
+ &dmn->t_wakeup);
+ dmn->restart.interval = gs.min_restart_interval;
+ *add = dmn;
+ add = &dmn->next;
+
+ if (!strcmp(dmn->name, special))
+ gs.special = dmn;
+ }
+
+ if (!gs.daemons) {
+ fprintf(stderr,
+ "Must specify one or more daemons to monitor.\n\n");
+ frr_help_exit(1);
+ }
+ if (!watch_only && !gs.special) {
+ fprintf(stderr, "\"%s\" daemon must be in daemon lists\n\n",
+ special);
+ frr_help_exit(1);
+ }
+
+ for (dmn = gs.daemons; dmn; dmn = dmn->next) {
+ snprintf(p, alldaemons + sizeof(alldaemons) - p, "%s%s",
+ (p == alldaemons) ? "" : " ", dmn->name);
+ p += strlen(p);
+ }
+ zlog_notice("%s %s watching [%s]%s", progname, FRR_VERSION, alldaemons,
+ watch_only ? ", monitor mode" : "");
+}
+
struct zebra_privs_t watchfrr_privs = {
#ifdef VTY_GROUP
.vty_group = VTY_GROUP,
FRR_DAEMON_INFO(watchfrr, WATCHFRR,
.flags = FRR_NO_PRIVSEP | FRR_NO_TCPVTY | FRR_LIMITED_CLI
- | FRR_NO_CFG_PID_DRY | FRR_NO_ZCLIENT,
+ | FRR_NO_CFG_PID_DRY | FRR_NO_ZCLIENT
+ | FRR_DETACH_LATER,
.printhelp = printhelp,
.copyright = "Copyright 2004 Andrew J. Schorr",
int main(int argc, char **argv)
{
int opt;
- const char *pidfile = pidfile_default;
- const char *special = "zebra";
const char *blankstr = NULL;
- snprintf(pidfile_default, sizeof(pidfile_default), "%s/watchfrr.pid",
- frr_vtydir);
-
frr_preinit(&watchfrr_di, argc, argv);
progname = watchfrr_di.progname;
gs.period = 1000 * period;
} break;
case 'p':
- pidfile = optarg;
+ watchfrr_di.pid_file = optarg;
break;
case 'r':
if (!valid_command(optarg)) {
gs.restart.interval = gs.min_restart_interval;
master = frr_init();
+ watchfrr_error_init();
+ watchfrr_init(argc, argv);
+ watchfrr_vty_init();
+
+ frr_config_fork();
zlog_set_level(ZLOG_DEST_MONITOR, ZLOG_DISABLED);
- if (watchfrr_di.daemon_mode) {
+ if (watchfrr_di.daemon_mode)
zlog_set_level(ZLOG_DEST_SYSLOG, MIN(gs.loglevel, LOG_DEBUG));
- if (daemon(0, 0) < 0) {
- fprintf(stderr, "Watchfrr daemon failed: %s",
- strerror(errno));
- exit(1);
- }
- } else
+ else
zlog_set_level(ZLOG_DEST_STDOUT, MIN(gs.loglevel, LOG_DEBUG));
- watchfrr_vty_init();
-
- frr_vty_serv();
-
- {
- int i;
- struct daemon *tail = NULL;
-
- for (i = optind; i < argc; i++) {
- struct daemon *dmn;
-
- if (!(dmn = (struct daemon *)calloc(1, sizeof(*dmn)))) {
- fprintf(stderr, "calloc(1,%u) failed: %s\n",
- (u_int)sizeof(*dmn),
- safe_strerror(errno));
- return 1;
- }
- dmn->name = dmn->restart.name = argv[i];
- dmn->state = DAEMON_INIT;
- gs.numdaemons++;
- gs.numdown++;
- dmn->fd = -1;
- dmn->t_wakeup = NULL;
- thread_add_timer_msec(master, wakeup_init, dmn,
- 100 + (random() % 900),
- &dmn->t_wakeup);
- dmn->restart.interval = gs.min_restart_interval;
- if (tail)
- tail->next = dmn;
- else
- gs.daemons = dmn;
- tail = dmn;
-
- if (!strcmp(dmn->name, special))
- gs.special = dmn;
- }
- }
- if (!gs.daemons) {
- fputs("Must specify one or more daemons to monitor.\n", stderr);
- frr_help_exit(1);
- }
- if (!watch_only && !gs.special) {
- fprintf(stderr, "\"%s\" daemon must be in daemon list\n",
- special);
- frr_help_exit(1);
- }
-
- /* Make sure we're not already running. */
- pid_output(pidfile);
-
- /* Announce which daemons are being monitored. */
- {
- struct daemon *dmn;
- size_t len = 0;
-
- for (dmn = gs.daemons; dmn; dmn = dmn->next)
- len += strlen(dmn->name) + 1;
-
- {
- char buf[len + 1];
- char *p = buf;
-
- for (dmn = gs.daemons; dmn; dmn = dmn->next) {
- if (p != buf)
- *p++ = ' ';
- strcpy(p, dmn->name);
- p += strlen(p);
- }
- zlog_notice("%s %s watching [%s]%s", progname,
- FRR_VERSION, buf,
- watch_only ? ", monitor mode" : "");
- }
- }
-
- {
- struct thread thread;
-
- while (thread_fetch(master, &thread))
- thread_call(&thread);
- }
+ frr_run(master);
systemd_send_stopping();
/* Not reached. */