]> git.proxmox.com Git - mirror_frr.git/blame_incremental - watchfrr/watchfrr.c
lib: replace stderr with zlog in vty config load
[mirror_frr.git] / watchfrr / watchfrr.c
... / ...
CommitLineData
1/*
2 * Monitor status of frr daemons and restart if necessary.
3 *
4 * Copyright (C) 2004 Andrew J. Schorr
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#include <zebra.h>
22#include <thread.h>
23#include <log.h>
24#include <network.h>
25#include <sigevent.h>
26#include <lib/version.h>
27#include "command.h"
28#include "memory_vty.h"
29#include "libfrr.h"
30
31#include <getopt.h>
32#include <sys/un.h>
33#include <sys/wait.h>
34#include <memory.h>
35#include <systemd.h>
36
37#include "watchfrr.h"
38
/* Smaller of two values; note that each argument may be evaluated twice. */
#ifndef MIN
#define MIN(X, Y) (((X) <= (Y)) ? (X) : (Y))
#endif

/*
 * Timer randomization helpers.
 *
 * JITTER(X) yields a random offset in the range [-(X)/2, (X) - (X)/2].
 * FUZZY(X) adds JITTER of one twentieth of X to X, i.e. it perturbs X by
 * at most roughly 2.5% in either direction.
 */
#define JITTER(X) ((random() % ((X) + 1)) - ((X) / 2))
#define FUZZY(X) ((X) + JITTER((X) / 20))

/* Built-in defaults; the time values are the seconds shown by printhelp(). */
#define DEFAULT_PERIOD 5
#define DEFAULT_TIMEOUT 10
#define DEFAULT_RESTART_TIMEOUT 20
#define DEFAULT_LOGLEVEL LOG_INFO
#define DEFAULT_MIN_RESTART 60
#define DEFAULT_MAX_RESTART 600

/* Pid file location: honour a build-time override when one is provided. */
#ifdef PATH_WATCHFRR_PID
#define DEFAULT_PIDFILE PATH_WATCHFRR_PID
#else
#define DEFAULT_PIDFILE STATEDIR "/watchfrr.pid"
#endif

/* Directory holding the daemons' vty sockets. */
#ifdef DAEMON_VTY_DIR
#define VTYDIR DAEMON_VTY_DIR
#else
#define VTYDIR STATEDIR
#endif

/* Token sent with the "echo" command and expected back in the reply. */
#define PING_TOKEN "PING"
65
/* Event loop handle.  Must be global: libfrr refers to it as well. */
struct thread_master *master;

/* When true, daemons are only monitored and never (re)started. */
static bool watch_only = false;

/* Steps of a phased global restart, in the order phase_check() walks them. */
typedef enum {
	PHASE_NONE = 0,              /* no phased restart in progress */
	PHASE_STOPS_PENDING,         /* stop jobs are still running */
	PHASE_WAITING_DOWN,          /* waiting for the daemons to go down */
	PHASE_ZEBRA_RESTART_PENDING, /* restart job of the special daemon runs */
	PHASE_WAITING_ZEBRA_UP       /* waiting for the special daemon to be up */
} restart_phase_t;
78
/*
 * Human-readable phase names, indexed by restart_phase_t.  There must be
 * exactly one entry per enumerator, in enumerator order.  (A stale sixth
 * entry for a phase that no longer exists has been dropped.)
 */
static const char *phase_str[] = {
	"None",                                   /* PHASE_NONE */
	"Stop jobs running",                      /* PHASE_STOPS_PENDING */
	"Waiting for other daemons to come down", /* PHASE_WAITING_DOWN */
	"Zebra restart job running",              /* PHASE_ZEBRA_RESTART_PENDING */
	"Waiting for zebra to come up",           /* PHASE_WAITING_ZEBRA_UP */
};
87
/* A restart phase is declared hung after three restart timeouts. */
#define PHASE_TIMEOUT (3 * gs.restart_timeout)

/* Book-keeping for one background (start/stop/restart) job. */
struct restart_info {
	const char *name;       /* daemon name substituted into the command */
	const char *what;       /* job type, as passed to run_job() */
	pid_t pid;              /* pid of the running job, 0 when none */
	struct timeval time;    /* when the job was started or completed */
	long interval;          /* current minimum delay between jobs */
	struct thread *t_kill;  /* timer that signals an overdue job */
	int kills;              /* number of signals already sent to the job */
};
99
100static struct global_state {
101 restart_phase_t phase;
102 struct thread *t_phase_hanging;
103 const char *vtydir;
104 long period;
105 long timeout;
106 long restart_timeout;
107 long min_restart_interval;
108 long max_restart_interval;
109 struct daemon *daemons;
110 const char *restart_command;
111 const char *start_command;
112 const char *stop_command;
113 struct restart_info restart;
114 int loglevel;
115 struct daemon *special; /* points to zebra when doing phased restart */
116 int numdaemons;
117 int numpids;
118 int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
119} gs = {
120 .phase = PHASE_NONE,
121 .vtydir = VTYDIR,
122 .period = 1000 * DEFAULT_PERIOD,
123 .timeout = DEFAULT_TIMEOUT,
124 .restart_timeout = DEFAULT_RESTART_TIMEOUT,
125 .loglevel = DEFAULT_LOGLEVEL,
126 .min_restart_interval = DEFAULT_MIN_RESTART,
127 .max_restart_interval = DEFAULT_MAX_RESTART,
128};
129
/* Connection state of a monitored daemon. */
typedef enum {
	DAEMON_INIT,        /* initial state, no connection attempt made yet */
	DAEMON_DOWN,
	DAEMON_CONNECTING,  /* non-blocking connect still in progress */
	DAEMON_UP,
	DAEMON_UNRESPONSIVE /* connected, but an echo went unanswered */
} daemon_state_t;

/* A daemon counts as "up" while connected, responsive or not. */
#define IS_UP(DMN)                                                             \
	(((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))

/* Printable state names, indexed by daemon_state_t. */
static const char *state_str[] = {
	"Init",
	"Down",
	"Connecting",
	"Up",
	"Unresponsive",
};
144
145struct daemon {
146 const char *name;
147 daemon_state_t state;
148 int fd;
149 struct timeval echo_sent;
150 u_int connect_tries;
151 struct thread *t_wakeup;
152 struct thread *t_read;
153 struct thread *t_write;
154 struct daemon *next;
155 struct restart_info restart;
156};
157
/* Values returned for long options that have no short equivalent. */
#define OPTION_MINRESTART 2000
#define OPTION_MAXRESTART 2001
#define OPTION_DRY 2002

/* Long command-line options; terminated by an all-zero entry. */
static const struct option longopts[] = {
	{"daemon", no_argument, NULL, 'd'},
	{"statedir", required_argument, NULL, 'S'},
	{"loglevel", required_argument, NULL, 'l'},
	{"interval", required_argument, NULL, 'i'},
	{"timeout", required_argument, NULL, 't'},
	{"restart-timeout", required_argument, NULL, 'T'},
	{"restart", required_argument, NULL, 'r'},
	{"start-command", required_argument, NULL, 's'},
	{"kill-command", required_argument, NULL, 'k'},
	{"dry", no_argument, NULL, OPTION_DRY},
	{"min-restart-interval", required_argument, NULL, OPTION_MINRESTART},
	{"max-restart-interval", required_argument, NULL, OPTION_MAXRESTART},
	{"pid-file", required_argument, NULL, 'p'},
	{"blank-string", required_argument, NULL, 'b'},
	{"help", no_argument, NULL, 'h'},
	{"version", no_argument, NULL, 'v'},
	{NULL, 0, NULL, 0},
};
180
/* Forward declarations for functions used before their definition. */
static void phase_check(void);
static void try_restart(struct daemon *dmn);
static int try_connect(struct daemon *dmn);
static int wakeup_send_echo(struct thread *t_wakeup);

/* Program name shown in the usage text and the startup notice. */
static const char *progname;
187static void printhelp(FILE *target)
188{
189 fprintf(target,
190 "Usage : %s [OPTION...] <daemon name> ...\n\n\
191Watchdog program to monitor status of frr daemons and try to restart\n\
192them if they are down or unresponsive. It determines whether a daemon is\n\
193up based on whether it can connect to the daemon's vty unix stream socket.\n\
194It then repeatedly sends echo commands over that socket to determine whether\n\
195the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
196on the socket connection and know immediately that the daemon is down.\n\n\
197The daemons to be monitored should be listed on the command line.\n\n\
198In order to avoid attempting to restart the daemons in a fast loop,\n\
199the -m and -M options allow you to control the minimum delay between\n\
200restart commands. The minimum restart delay is recalculated each time\n\
201a restart is attempted: if the time since the last restart attempt exceeds\n\
202twice the -M value, then the restart delay is set to the -m value.\n\
203Otherwise, the interval is doubled (but capped at the -M value).\n\n",
204 progname);
205
206 fprintf(target,
207 "Options:\n\
208-d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
209 to syslog instead of stdout.\n\
210-S, --statedir Set the vty socket directory (default is %s)\n\
211-l, --loglevel Set the logging level (default is %d).\n\
212 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
213 but it can be set higher than %d if extra-verbose debugging\n\
214 messages are desired.\n\
215 --min-restart-interval\n\
216 Set the minimum seconds to wait between invocations of daemon\n\
217 restart commands (default is %d).\n\
218 --max-restart-interval\n\
219 Set the maximum seconds to wait between invocations of daemon\n\
220 restart commands (default is %d).\n\
221-i, --interval Set the status polling interval in seconds (default is %d)\n\
222-t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
223-T, --restart-timeout\n\
224 Set the restart (kill) timeout in seconds (default is %d).\n\
225 If any background jobs are still running after this much\n\
226 time has elapsed, they will be killed.\n\
227-r, --restart Supply a Bourne shell command to use to restart a single\n\
228 daemon. The command string should include '%%s' where the\n\
229 name of the daemon should be substituted.\n\
230-s, --start-command\n\
231 Supply a Bourne shell to command to use to start a single\n\
232 daemon. The command string should include '%%s' where the\n\
233 name of the daemon should be substituted.\n\
234-k, --kill-command\n\
235 Supply a Bourne shell to command to use to stop a single\n\
236 daemon. The command string should include '%%s' where the\n\
237 name of the daemon should be substituted.\n\
238 --dry Do not start or restart anything, just log.\n\
239-p, --pid-file Set process identifier file name\n\
240 (default is %s).\n\
241-b, --blank-string\n\
242 When the supplied argument string is found in any of the\n\
243 various shell command arguments (-r, -s, or -k), replace\n\
244 it with a space. This is an ugly hack to circumvent problems\n\
245 passing command-line arguments with embedded spaces.\n\
246-v, --version Print program version\n\
247-h, --help Display this help and exit\n",
248 VTYDIR, DEFAULT_LOGLEVEL, LOG_EMERG, LOG_DEBUG, LOG_DEBUG,
249 DEFAULT_MIN_RESTART, DEFAULT_MAX_RESTART, DEFAULT_PERIOD,
250 DEFAULT_TIMEOUT, DEFAULT_RESTART_TIMEOUT, DEFAULT_PIDFILE);
251}
252
/*
 * Run `shell_cmd` through "/bin/sh -c" in a forked child.
 *
 * Returns the child's pid in the parent, or -1 when fork() fails.  The
 * child is not waited for here; sigchild() reaps it later.
 */
static pid_t run_background(char *shell_cmd)
{
	pid_t child = fork();

	if (child == -1) {
		zlog_err("fork failed, cannot run command [%s]: %s", shell_cmd,
			 safe_strerror(errno));
		return -1;
	}

	if (child == 0) {
		/* Child process. */
		char shell[] = "sh";
		char dashc[] = "-c";
		char *const argv[4] = {shell, dashc, shell_cmd, NULL};

		/* Own process group, so the whole job can be signalled via
		 * kill(-pid, ...) in restart_kill(). */
		if (setpgid(0, 0) < 0)
			zlog_warn("warning: setpgid(0,0) failed: %s",
				  safe_strerror(errno));

		execv("/bin/sh", argv);
		/* Only reached when execv() itself failed. */
		zlog_err("execv(/bin/sh -c '%s') failed: %s", shell_cmd,
			 safe_strerror(errno));
		_exit(127);
	}

	/* Parent process: we will reap the child later. */
	/* NOTE(review): this normal-path message is logged at error severity;
	 * confirm that is intended. */
	zlog_err("Forked background command [pid %d]: %s", (int)child,
		 shell_cmd);
	return child;
}
285
/*
 * Store in `result` the wall-clock time elapsed since `start_time`
 * (current gettimeofday() value minus `start_time`), with tv_usec
 * normalised to be non-negative.  Returns `result`.
 */
static struct timeval *time_elapsed(struct timeval *result,
				    const struct timeval *start_time)
{
	gettimeofday(result, NULL);

	result->tv_usec -= start_time->tv_usec;
	result->tv_sec -= start_time->tv_sec;

	/* Borrow whole seconds until the microsecond part is >= 0. */
	for (; result->tv_usec < 0; result->tv_sec--)
		result->tv_usec += 1000000L;

	return result;
}
298
299static int restart_kill(struct thread *t_kill)
300{
301 struct restart_info *restart = THREAD_ARG(t_kill);
302 struct timeval delay;
303
304 time_elapsed(&delay, &restart->time);
305 zlog_warn(
306 "Warning: %s %s child process %d still running after "
307 "%ld seconds, sending signal %d",
308 restart->what, restart->name, (int)restart->pid,
309 (long)delay.tv_sec, (restart->kills ? SIGKILL : SIGTERM));
310 kill(-restart->pid, (restart->kills ? SIGKILL : SIGTERM));
311 restart->kills++;
312 restart->t_kill = NULL;
313 thread_add_timer(master, restart_kill, restart, gs.restart_timeout,
314 &restart->t_kill);
315 return 0;
316}
317
318static struct restart_info *find_child(pid_t child)
319{
320 struct daemon *dmn;
321 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
322 if (dmn->restart.pid == child)
323 return &dmn->restart;
324 }
325 return NULL;
326}
327
328static void sigchild(void)
329{
330 pid_t child;
331 int status;
332 const char *name;
333 const char *what;
334 struct restart_info *restart;
335
336 switch (child = waitpid(-1, &status, WNOHANG)) {
337 case -1:
338 zlog_err("waitpid failed: %s", safe_strerror(errno));
339 return;
340 case 0:
341 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
342 return;
343 }
344
345 if (child == integrated_write_pid) {
346 integrated_write_sigchld(status);
347 return;
348 }
349
350 if ((restart = find_child(child)) != NULL) {
351 name = restart->name;
352 what = restart->what;
353 restart->pid = 0;
354 gs.numpids--;
355 thread_cancel(restart->t_kill);
356 restart->t_kill = NULL;
357 /* Update restart time to reflect the time the command
358 * completed. */
359 gettimeofday(&restart->time, NULL);
360 } else {
361 zlog_err(
362 "waitpid returned status for an unknown child process %d",
363 (int)child);
364 name = "(unknown)";
365 what = "background";
366 }
367 if (WIFSTOPPED(status))
368 zlog_warn("warning: %s %s process %d is stopped", what, name,
369 (int)child);
370 else if (WIFSIGNALED(status))
371 zlog_warn("%s %s process %d terminated due to signal %d", what,
372 name, (int)child, WTERMSIG(status));
373 else if (WIFEXITED(status)) {
374 if (WEXITSTATUS(status) != 0)
375 zlog_warn(
376 "%s %s process %d exited with non-zero status %d",
377 what, name, (int)child, WEXITSTATUS(status));
378 else
379 zlog_debug("%s %s process %d exited normally", what,
380 name, (int)child);
381 } else
382 zlog_err("cannot interpret %s %s process %d wait status 0x%x",
383 what, name, (int)child, status);
384 phase_check();
385}
386
387static int run_job(struct restart_info *restart, const char *cmdtype,
388 const char *command, int force, int update_interval)
389{
390 struct timeval delay;
391
392 if (gs.loglevel > LOG_DEBUG + 1)
393 zlog_debug("attempting to %s %s", cmdtype, restart->name);
394
395 if (restart->pid) {
396 if (gs.loglevel > LOG_DEBUG + 1)
397 zlog_debug(
398 "cannot %s %s, previous pid %d still running",
399 cmdtype, restart->name, (int)restart->pid);
400 return -1;
401 }
402
403 /* Note: time_elapsed test must come before the force test, since we
404 need
405 to make sure that delay is initialized for use below in updating the
406 restart interval. */
407 if ((time_elapsed(&delay, &restart->time)->tv_sec < restart->interval)
408 && !force) {
409 if (gs.loglevel > LOG_DEBUG + 1)
410 zlog_debug(
411 "postponing %s %s: "
412 "elapsed time %ld < retry interval %ld",
413 cmdtype, restart->name, (long)delay.tv_sec,
414 restart->interval);
415 return -1;
416 }
417
418 gettimeofday(&restart->time, NULL);
419 restart->kills = 0;
420 {
421 char cmd[strlen(command) + strlen(restart->name) + 1];
422 snprintf(cmd, sizeof(cmd), command, restart->name);
423 if ((restart->pid = run_background(cmd)) > 0) {
424 restart->t_kill = NULL;
425 thread_add_timer(master, restart_kill, restart,
426 gs.restart_timeout, &restart->t_kill);
427 restart->what = cmdtype;
428 gs.numpids++;
429 } else
430 restart->pid = 0;
431 }
432
433 /* Calculate the new restart interval. */
434 if (update_interval) {
435 if (delay.tv_sec > 2 * gs.max_restart_interval)
436 restart->interval = gs.min_restart_interval;
437 else if ((restart->interval *= 2) > gs.max_restart_interval)
438 restart->interval = gs.max_restart_interval;
439 if (gs.loglevel > LOG_DEBUG + 1)
440 zlog_debug("restart %s interval is now %ld",
441 restart->name, restart->interval);
442 }
443 return restart->pid;
444}
445
/*
 * Event scheduling helpers.  Each one clears the daemon's event pointer
 * and then registers a new event that is stored back into that pointer.
 *
 * The expansions deliberately do NOT end in a semicolon: the caller
 * supplies it, so that "if (x) SET_...(dmn); else ..." parses as a single
 * statement, as the do/while(0) idiom intends.
 */

/* Wait for the daemon's socket to become readable (handle_read). */
#define SET_READ_HANDLER(DMN)                                                  \
	do {                                                                   \
		(DMN)->t_read = NULL;                                          \
		thread_add_read(master, handle_read, (DMN), (DMN)->fd,         \
				&(DMN)->t_read);                               \
	} while (0)

/* Retry the connection after a fuzzed polling period (wakeup_down). */
#define SET_WAKEUP_DOWN(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_down, (DMN),              \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Re-check an unresponsive daemon after a fuzzed polling period. */
#define SET_WAKEUP_UNRESPONSIVE(DMN)                                           \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_unresponsive, (DMN),      \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Send the next echo after a fuzzed polling period (wakeup_send_echo). */
#define SET_WAKEUP_ECHO(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_send_echo, (DMN),         \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)
473
474static int wakeup_down(struct thread *t_wakeup)
475{
476 struct daemon *dmn = THREAD_ARG(t_wakeup);
477
478 dmn->t_wakeup = NULL;
479 if (try_connect(dmn) < 0)
480 SET_WAKEUP_DOWN(dmn);
481 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
482 try_restart(dmn);
483 return 0;
484}
485
486static int wakeup_init(struct thread *t_wakeup)
487{
488 struct daemon *dmn = THREAD_ARG(t_wakeup);
489
490 dmn->t_wakeup = NULL;
491 if (try_connect(dmn) < 0) {
492 SET_WAKEUP_DOWN(dmn);
493 zlog_err("%s state -> down : initial connection attempt failed",
494 dmn->name);
495 dmn->state = DAEMON_DOWN;
496 }
497 return 0;
498}
499
500static void daemon_down(struct daemon *dmn, const char *why)
501{
502 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
503 zlog_err("%s state -> down : %s", dmn->name, why);
504 else if (gs.loglevel > LOG_DEBUG)
505 zlog_debug("%s still down : %s", dmn->name, why);
506 if (IS_UP(dmn))
507 gs.numdown++;
508 dmn->state = DAEMON_DOWN;
509 if (dmn->fd >= 0) {
510 close(dmn->fd);
511 dmn->fd = -1;
512 }
513 THREAD_OFF(dmn->t_read);
514 THREAD_OFF(dmn->t_write);
515 THREAD_OFF(dmn->t_wakeup);
516 if (try_connect(dmn) < 0)
517 SET_WAKEUP_DOWN(dmn);
518 phase_check();
519}
520
521static int handle_read(struct thread *t_read)
522{
523 struct daemon *dmn = THREAD_ARG(t_read);
524 static const char resp[sizeof(PING_TOKEN) + 4] = PING_TOKEN "\n";
525 char buf[sizeof(resp) + 100];
526 ssize_t rc;
527 struct timeval delay;
528
529 dmn->t_read = NULL;
530 if ((rc = read(dmn->fd, buf, sizeof(buf))) < 0) {
531 char why[100];
532
533 if (ERRNO_IO_RETRY(errno)) {
534 /* Pretend it never happened. */
535 SET_READ_HANDLER(dmn);
536 return 0;
537 }
538 snprintf(why, sizeof(why), "unexpected read error: %s",
539 safe_strerror(errno));
540 daemon_down(dmn, why);
541 return 0;
542 }
543 if (rc == 0) {
544 daemon_down(dmn, "read returned EOF");
545 return 0;
546 }
547 if (!dmn->echo_sent.tv_sec) {
548 char why[sizeof(buf) + 100];
549 snprintf(why, sizeof(why),
550 "unexpected read returns %d bytes: %.*s", (int)rc,
551 (int)rc, buf);
552 daemon_down(dmn, why);
553 return 0;
554 }
555
556 /* We are expecting an echo response: is there any chance that the
557 response would not be returned entirely in the first read? That
558 seems inconceivable... */
559 if ((rc != sizeof(resp)) || memcmp(buf, resp, sizeof(resp))) {
560 char why[100 + sizeof(buf)];
561 snprintf(why, sizeof(why),
562 "read returned bad echo response of %d bytes "
563 "(expecting %u): %.*s",
564 (int)rc, (u_int)sizeof(resp), (int)rc, buf);
565 daemon_down(dmn, why);
566 return 0;
567 }
568
569 time_elapsed(&delay, &dmn->echo_sent);
570 dmn->echo_sent.tv_sec = 0;
571 if (dmn->state == DAEMON_UNRESPONSIVE) {
572 if (delay.tv_sec < gs.timeout) {
573 dmn->state = DAEMON_UP;
574 zlog_warn(
575 "%s state -> up : echo response received after %ld.%06ld "
576 "seconds",
577 dmn->name, (long)delay.tv_sec,
578 (long)delay.tv_usec);
579 } else
580 zlog_warn(
581 "%s: slow echo response finally received after %ld.%06ld "
582 "seconds",
583 dmn->name, (long)delay.tv_sec,
584 (long)delay.tv_usec);
585 } else if (gs.loglevel > LOG_DEBUG + 1)
586 zlog_debug("%s: echo response received after %ld.%06ld seconds",
587 dmn->name, (long)delay.tv_sec, (long)delay.tv_usec);
588
589 SET_READ_HANDLER(dmn);
590 if (dmn->t_wakeup)
591 thread_cancel(dmn->t_wakeup);
592 SET_WAKEUP_ECHO(dmn);
593
594 return 0;
595}
596
597/*
598 * Wait till we notice that all daemons are ready before
599 * we send we are ready to systemd
600 */
601static void daemon_send_ready(void)
602{
603 static int sent = 0;
604 if (!sent && gs.numdown == 0) {
605 FILE *fp;
606
607 fp = fopen(DAEMON_VTY_DIR "/watchfrr.started", "w");
608 fclose(fp);
609 zlog_notice(
610 "Watchfrr: Notifying Systemd we are up and running");
611 systemd_send_started(master, 0);
612 sent = 1;
613 }
614}
615
616static void daemon_up(struct daemon *dmn, const char *why)
617{
618 dmn->state = DAEMON_UP;
619 gs.numdown--;
620 dmn->connect_tries = 0;
621 zlog_notice("%s state -> up : %s", dmn->name, why);
622 daemon_send_ready();
623 SET_WAKEUP_ECHO(dmn);
624 phase_check();
625}
626
627static int check_connect(struct thread *t_write)
628{
629 struct daemon *dmn = THREAD_ARG(t_write);
630 int sockerr;
631 socklen_t reslen = sizeof(sockerr);
632
633 dmn->t_write = NULL;
634 if (getsockopt(dmn->fd, SOL_SOCKET, SO_ERROR, (char *)&sockerr, &reslen)
635 < 0) {
636 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn->name,
637 safe_strerror(errno));
638 daemon_down(dmn,
639 "getsockopt failed checking connection success");
640 return 0;
641 }
642 if ((reslen == sizeof(sockerr)) && sockerr) {
643 char why[100];
644 snprintf(
645 why, sizeof(why),
646 "getsockopt reports that connection attempt failed: %s",
647 safe_strerror(sockerr));
648 daemon_down(dmn, why);
649 return 0;
650 }
651
652 daemon_up(dmn, "delayed connect succeeded");
653 return 0;
654}
655
656static int wakeup_connect_hanging(struct thread *t_wakeup)
657{
658 struct daemon *dmn = THREAD_ARG(t_wakeup);
659 char why[100];
660
661 dmn->t_wakeup = NULL;
662 snprintf(why, sizeof(why),
663 "connection attempt timed out after %ld seconds", gs.timeout);
664 daemon_down(dmn, why);
665 return 0;
666}
667
668/* Making connection to protocol daemon. */
669static int try_connect(struct daemon *dmn)
670{
671 int sock;
672 struct sockaddr_un addr;
673 socklen_t len;
674
675 if (gs.loglevel > LOG_DEBUG + 1)
676 zlog_debug("%s: attempting to connect", dmn->name);
677 dmn->connect_tries++;
678
679 memset(&addr, 0, sizeof(struct sockaddr_un));
680 addr.sun_family = AF_UNIX;
681 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty", gs.vtydir,
682 dmn->name);
683#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
684 len = addr.sun_len = SUN_LEN(&addr);
685#else
686 len = sizeof(addr.sun_family) + strlen(addr.sun_path);
687#endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
688
689 /* Quick check to see if we might succeed before we go to the trouble
690 of creating a socket. */
691 if (access(addr.sun_path, W_OK) < 0) {
692 if (errno != ENOENT)
693 zlog_err("%s: access to socket %s denied: %s",
694 dmn->name, addr.sun_path,
695 safe_strerror(errno));
696 return -1;
697 }
698
699 if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
700 zlog_err("%s(%s): cannot make socket: %s", __func__,
701 addr.sun_path, safe_strerror(errno));
702 return -1;
703 }
704
705 if (set_nonblocking(sock) < 0 || set_cloexec(sock) < 0) {
706 zlog_err("%s(%s): set_nonblocking/cloexec(%d) failed", __func__,
707 addr.sun_path, sock);
708 close(sock);
709 return -1;
710 }
711
712 if (connect(sock, (struct sockaddr *)&addr, len) < 0) {
713 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK)) {
714 if (gs.loglevel > LOG_DEBUG)
715 zlog_debug("%s(%s): connect failed: %s",
716 __func__, addr.sun_path,
717 safe_strerror(errno));
718 close(sock);
719 return -1;
720 }
721 if (gs.loglevel > LOG_DEBUG)
722 zlog_debug("%s: connection in progress", dmn->name);
723 dmn->state = DAEMON_CONNECTING;
724 dmn->fd = sock;
725 dmn->t_write = NULL;
726 thread_add_write(master, check_connect, dmn, dmn->fd,
727 &dmn->t_write);
728 dmn->t_wakeup = NULL;
729 thread_add_timer(master, wakeup_connect_hanging, dmn,
730 gs.timeout, &dmn->t_wakeup);
731 SET_READ_HANDLER(dmn);
732 return 0;
733 }
734
735 dmn->fd = sock;
736 SET_READ_HANDLER(dmn);
737 daemon_up(dmn, "connect succeeded");
738 return 1;
739}
740
741static int phase_hanging(struct thread *t_hanging)
742{
743 gs.t_phase_hanging = NULL;
744 zlog_err("Phase [%s] hanging for %ld seconds, aborting phased restart",
745 phase_str[gs.phase], PHASE_TIMEOUT);
746 gs.phase = PHASE_NONE;
747 return 0;
748}
749
750static void set_phase(restart_phase_t new_phase)
751{
752 gs.phase = new_phase;
753 if (gs.t_phase_hanging)
754 thread_cancel(gs.t_phase_hanging);
755 gs.t_phase_hanging = NULL;
756 thread_add_timer(master, phase_hanging, NULL, PHASE_TIMEOUT,
757 &gs.t_phase_hanging);
758}
759
760static void phase_check(void)
761{
762 switch (gs.phase) {
763 case PHASE_NONE:
764 break;
765 case PHASE_STOPS_PENDING:
766 if (gs.numpids)
767 break;
768 zlog_info(
769 "Phased restart: all routing daemon stop jobs have completed.");
770 set_phase(PHASE_WAITING_DOWN);
771
772 /*FALLTHRU*/
773 case PHASE_WAITING_DOWN:
774 if (gs.numdown + IS_UP(gs.special) < gs.numdaemons)
775 break;
776 zlog_info("Phased restart: all routing daemons now down.");
777 run_job(&gs.special->restart, "restart", gs.restart_command, 1,
778 1);
779 set_phase(PHASE_ZEBRA_RESTART_PENDING);
780
781 /*FALLTHRU*/
782 case PHASE_ZEBRA_RESTART_PENDING:
783 if (gs.special->restart.pid)
784 break;
785 zlog_info("Phased restart: %s restart job completed.",
786 gs.special->name);
787 set_phase(PHASE_WAITING_ZEBRA_UP);
788
789 /*FALLTHRU*/
790 case PHASE_WAITING_ZEBRA_UP:
791 if (!IS_UP(gs.special))
792 break;
793 zlog_info("Phased restart: %s is now up.", gs.special->name);
794 {
795 struct daemon *dmn;
796 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
797 if (dmn != gs.special)
798 run_job(&dmn->restart, "start",
799 gs.start_command, 1, 0);
800 }
801 }
802 gs.phase = PHASE_NONE;
803 THREAD_OFF(gs.t_phase_hanging);
804 zlog_notice("Phased global restart has completed.");
805 break;
806 }
807}
808
809static void try_restart(struct daemon *dmn)
810{
811 if (watch_only)
812 return;
813
814 if (dmn != gs.special) {
815 if ((gs.special->state == DAEMON_UP)
816 && (gs.phase == PHASE_NONE))
817 run_job(&dmn->restart, "restart", gs.restart_command, 0,
818 1);
819 else
820 zlog_debug(
821 "%s: postponing restart attempt because master %s daemon "
822 "not up [%s], or phased restart in progress",
823 dmn->name, gs.special->name,
824 state_str[gs.special->state]);
825 return;
826 }
827
828 if ((gs.phase != PHASE_NONE) || gs.numpids) {
829 if (gs.loglevel > LOG_DEBUG + 1)
830 zlog_debug(
831 "postponing phased global restart: restart already in "
832 "progress [%s], or outstanding child processes [%d]",
833 phase_str[gs.phase], gs.numpids);
834 return;
835 }
836 /* Is it too soon for a restart? */
837 {
838 struct timeval delay;
839 if (time_elapsed(&delay, &gs.special->restart.time)->tv_sec
840 < gs.special->restart.interval) {
841 if (gs.loglevel > LOG_DEBUG + 1)
842 zlog_debug(
843 "postponing phased global restart: "
844 "elapsed time %ld < retry interval %ld",
845 (long)delay.tv_sec,
846 gs.special->restart.interval);
847 return;
848 }
849 }
850 run_job(&gs.restart, "restart", gs.restart_command, 0, 1);
851}
852
853static int wakeup_unresponsive(struct thread *t_wakeup)
854{
855 struct daemon *dmn = THREAD_ARG(t_wakeup);
856
857 dmn->t_wakeup = NULL;
858 if (dmn->state != DAEMON_UNRESPONSIVE)
859 zlog_err(
860 "%s: no longer unresponsive (now %s), "
861 "wakeup should have been cancelled!",
862 dmn->name, state_str[dmn->state]);
863 else {
864 SET_WAKEUP_UNRESPONSIVE(dmn);
865 try_restart(dmn);
866 }
867 return 0;
868}
869
870static int wakeup_no_answer(struct thread *t_wakeup)
871{
872 struct daemon *dmn = THREAD_ARG(t_wakeup);
873
874 dmn->t_wakeup = NULL;
875 dmn->state = DAEMON_UNRESPONSIVE;
876 zlog_err(
877 "%s state -> unresponsive : no response yet to ping "
878 "sent %ld seconds ago",
879 dmn->name, gs.timeout);
880 SET_WAKEUP_UNRESPONSIVE(dmn);
881 try_restart(dmn);
882 return 0;
883}
884
885static int wakeup_send_echo(struct thread *t_wakeup)
886{
887 static const char echocmd[] = "echo " PING_TOKEN;
888 ssize_t rc;
889 struct daemon *dmn = THREAD_ARG(t_wakeup);
890
891 dmn->t_wakeup = NULL;
892 if (((rc = write(dmn->fd, echocmd, sizeof(echocmd))) < 0)
893 || ((size_t)rc != sizeof(echocmd))) {
894 char why[100 + sizeof(echocmd)];
895 snprintf(why, sizeof(why),
896 "write '%s' returned %d instead of %u", echocmd,
897 (int)rc, (u_int)sizeof(echocmd));
898 daemon_down(dmn, why);
899 } else {
900 gettimeofday(&dmn->echo_sent, NULL);
901 dmn->t_wakeup = NULL;
902 thread_add_timer(master, wakeup_no_answer, dmn, gs.timeout,
903 &dmn->t_wakeup);
904 }
905 return 0;
906}
907
/*
 * Termination handler (installed for SIGINT and SIGTERM): log, tell
 * systemd that we are stopping, and exit with status 0.
 */
static void sigint(void)
{
	zlog_notice("Terminating on signal");

	systemd_send_stopping();

	exit(0);
}
914
/*
 * Check a command template: it must contain exactly one '%' character,
 * and that character must be immediately followed by 's'.
 * Returns 1 when the template is acceptable, 0 otherwise.
 */
static int valid_command(const char *cmd)
{
	char *pct = strchr(cmd, '%');

	if (pct == NULL)
		return 0;
	if (pct[1] != 's')
		return 0;

	/* No further '%' may follow the "%s". */
	return strchr(pct + 1, '%') == NULL;
}
922
/*
 * This is an ugly hack to circumvent problems with passing command-line
 * arguments that contain spaces.  The fix is to use a configuration file.
 *
 * Returns a newly allocated copy of `cmd` in which every occurrence of
 * `blankstr` has been replaced by a single space.  The copy is made with
 * strdup(); the process exits with status 1 if that fails.
 *
 * The string is scanned once from left to right, resuming after each
 * inserted space, so the loop always terminates -- even when `blankstr`
 * is itself " ".  An empty `blankstr` matches nothing and yields an
 * unmodified copy (previously it looped forever and overran the buffer).
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	char *res;
	char *p;
	size_t bslen = strlen(blankstr);

	if (!(res = strdup(cmd))) {
		perror("strdup");
		exit(1);
	}

	/* strstr() matches an empty needle everywhere: nothing to do. */
	if (bslen == 0)
		return res;

	p = res;
	while ((p = strstr(p, blankstr)) != NULL) {
		*p = ' ';
		/* Close the gap left by the rest of the blank string,
		 * moving the terminating NUL along with the tail. */
		if (bslen != 1)
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
		/* Continue after the space that was just written. */
		p++;
	}
	return res;
}
942
943struct zebra_privs_t watchfrr_privs = {
944#ifdef VTY_GROUP
945 .vty_group = VTY_GROUP,
946#endif
947};
948
949static struct quagga_signal_t watchfrr_signals[] = {
950 {
951 .signal = SIGINT,
952 .handler = sigint,
953 },
954 {
955 .signal = SIGTERM,
956 .handler = sigint,
957 },
958 {
959 .signal = SIGCHLD,
960 .handler = sigchild,
961 },
962};
963
964FRR_DAEMON_INFO(watchfrr, WATCHFRR,
965 .flags = FRR_NO_PRIVSEP | FRR_NO_TCPVTY | FRR_LIMITED_CLI
966 | FRR_NO_CFG_PID_DRY | FRR_NO_ZCLIENT,
967
968 .printhelp = printhelp,
969 .copyright = "Copyright 2004 Andrew J. Schorr",
970
971 .signals = watchfrr_signals,
972 .n_signals = array_size(watchfrr_signals),
973
974 .privs = &watchfrr_privs, )
975
976int main(int argc, char **argv)
977{
978 int opt;
979 const char *pidfile = DEFAULT_PIDFILE;
980 const char *special = "zebra";
981 const char *blankstr = NULL;
982
983 frr_preinit(&watchfrr_di, argc, argv);
984 progname = watchfrr_di.progname;
985
986 frr_opt_add("b:dk:l:i:p:r:S:s:t:T:", longopts, "");
987
988 gs.restart.name = "all";
989 while ((opt = frr_getopt(argc, argv, NULL)) != EOF) {
990 switch (opt) {
991 case 0:
992 break;
993 case 'b':
994 blankstr = optarg;
995 break;
996 case OPTION_DRY:
997 watch_only = true;
998 break;
999 case 'k':
1000 if (!valid_command(optarg)) {
1001 fprintf(stderr,
1002 "Invalid kill command, must contain '%%s': %s\n",
1003 optarg);
1004 frr_help_exit(1);
1005 }
1006 gs.stop_command = optarg;
1007 break;
1008 case 'l': {
1009 char garbage[3];
1010 if ((sscanf(optarg, "%d%1s", &gs.loglevel, garbage)
1011 != 1)
1012 || (gs.loglevel < LOG_EMERG)) {
1013 fprintf(stderr,
1014 "Invalid loglevel argument: %s\n",
1015 optarg);
1016 frr_help_exit(1);
1017 }
1018 } break;
1019 case OPTION_MINRESTART: {
1020 char garbage[3];
1021 if ((sscanf(optarg, "%ld%1s", &gs.min_restart_interval,
1022 garbage)
1023 != 1)
1024 || (gs.min_restart_interval < 0)) {
1025 fprintf(stderr,
1026 "Invalid min_restart_interval argument: %s\n",
1027 optarg);
1028 frr_help_exit(1);
1029 }
1030 } break;
1031 case OPTION_MAXRESTART: {
1032 char garbage[3];
1033 if ((sscanf(optarg, "%ld%1s", &gs.max_restart_interval,
1034 garbage)
1035 != 1)
1036 || (gs.max_restart_interval < 0)) {
1037 fprintf(stderr,
1038 "Invalid max_restart_interval argument: %s\n",
1039 optarg);
1040 frr_help_exit(1);
1041 }
1042 } break;
1043 case 'i': {
1044 char garbage[3];
1045 int period;
1046 if ((sscanf(optarg, "%d%1s", &period, garbage) != 1)
1047 || (gs.period < 1)) {
1048 fprintf(stderr,
1049 "Invalid interval argument: %s\n",
1050 optarg);
1051 frr_help_exit(1);
1052 }
1053 gs.period = 1000 * period;
1054 } break;
1055 case 'p':
1056 pidfile = optarg;
1057 break;
1058 case 'r':
1059 if (!valid_command(optarg)) {
1060 fprintf(stderr,
1061 "Invalid restart command, must contain '%%s': %s\n",
1062 optarg);
1063 frr_help_exit(1);
1064 }
1065 gs.restart_command = optarg;
1066 break;
1067 case 's':
1068 if (!valid_command(optarg)) {
1069 fprintf(stderr,
1070 "Invalid start command, must contain '%%s': %s\n",
1071 optarg);
1072 frr_help_exit(1);
1073 }
1074 gs.start_command = optarg;
1075 break;
1076 case 'S':
1077 gs.vtydir = optarg;
1078 break;
1079 case 't': {
1080 char garbage[3];
1081 if ((sscanf(optarg, "%ld%1s", &gs.timeout, garbage)
1082 != 1)
1083 || (gs.timeout < 1)) {
1084 fprintf(stderr,
1085 "Invalid timeout argument: %s\n",
1086 optarg);
1087 frr_help_exit(1);
1088 }
1089 } break;
1090 case 'T': {
1091 char garbage[3];
1092 if ((sscanf(optarg, "%ld%1s", &gs.restart_timeout,
1093 garbage)
1094 != 1)
1095 || (gs.restart_timeout < 1)) {
1096 fprintf(stderr,
1097 "Invalid restart timeout argument: %s\n",
1098 optarg);
1099 frr_help_exit(1);
1100 }
1101 } break;
1102 default:
1103 fputs("Invalid option.\n", stderr);
1104 frr_help_exit(1);
1105 }
1106 }
1107
1108 if (watch_only
1109 && (gs.start_command || gs.stop_command || gs.restart_command)) {
1110 fputs("Options -r/-s/-k make no sense combined with -D.\n",
1111 stderr);
1112 frr_help_exit(1);
1113 }
1114 if (!watch_only
1115 && (!gs.restart_command || !gs.start_command || !gs.stop_command)) {
1116 fprintf(stderr,
1117 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1118 frr_help_exit(1);
1119 }
1120
1121 if (blankstr) {
1122 if (gs.restart_command)
1123 gs.restart_command =
1124 translate_blanks(gs.restart_command, blankstr);
1125 if (gs.start_command)
1126 gs.start_command =
1127 translate_blanks(gs.start_command, blankstr);
1128 if (gs.stop_command)
1129 gs.stop_command =
1130 translate_blanks(gs.stop_command, blankstr);
1131 }
1132
1133 gs.restart.interval = gs.min_restart_interval;
1134
1135 master = frr_init();
1136
1137 zlog_set_level(ZLOG_DEST_MONITOR, ZLOG_DISABLED);
1138 if (watchfrr_di.daemon_mode) {
1139 zlog_set_level(ZLOG_DEST_SYSLOG, MIN(gs.loglevel, LOG_DEBUG));
1140 if (daemon(0, 0) < 0) {
1141 fprintf(stderr, "Watchfrr daemon failed: %s",
1142 strerror(errno));
1143 exit(1);
1144 }
1145 } else
1146 zlog_set_level(ZLOG_DEST_STDOUT, MIN(gs.loglevel, LOG_DEBUG));
1147
1148 watchfrr_vty_init();
1149
1150 frr_vty_serv();
1151
1152 {
1153 int i;
1154 struct daemon *tail = NULL;
1155
1156 for (i = optind; i < argc; i++) {
1157 struct daemon *dmn;
1158
1159 if (!(dmn = (struct daemon *)calloc(1, sizeof(*dmn)))) {
1160 fprintf(stderr, "calloc(1,%u) failed: %s\n",
1161 (u_int)sizeof(*dmn),
1162 safe_strerror(errno));
1163 return 1;
1164 }
1165 dmn->name = dmn->restart.name = argv[i];
1166 dmn->state = DAEMON_INIT;
1167 gs.numdaemons++;
1168 gs.numdown++;
1169 dmn->fd = -1;
1170 dmn->t_wakeup = NULL;
1171 thread_add_timer_msec(master, wakeup_init, dmn,
1172 100 + (random() % 900),
1173 &dmn->t_wakeup);
1174 dmn->restart.interval = gs.min_restart_interval;
1175 if (tail)
1176 tail->next = dmn;
1177 else
1178 gs.daemons = dmn;
1179 tail = dmn;
1180
1181 if (!strcmp(dmn->name, special))
1182 gs.special = dmn;
1183 }
1184 }
1185 if (!gs.daemons) {
1186 fputs("Must specify one or more daemons to monitor.\n", stderr);
1187 frr_help_exit(1);
1188 }
1189 if (!watch_only && !gs.special) {
1190 fprintf(stderr, "\"%s\" daemon must be in daemon list\n",
1191 special);
1192 frr_help_exit(1);
1193 }
1194
1195 /* Make sure we're not already running. */
1196 pid_output(pidfile);
1197
1198 /* Announce which daemons are being monitored. */
1199 {
1200 struct daemon *dmn;
1201 size_t len = 0;
1202
1203 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1204 len += strlen(dmn->name) + 1;
1205
1206 {
1207 char buf[len + 1];
1208 char *p = buf;
1209
1210 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1211 if (p != buf)
1212 *p++ = ' ';
1213 strcpy(p, dmn->name);
1214 p += strlen(p);
1215 }
1216 zlog_notice("%s %s watching [%s]%s", progname,
1217 FRR_VERSION, buf,
1218 watch_only ? ", monitor mode" : "");
1219 }
1220 }
1221
1222 {
1223 struct thread thread;
1224
1225 while (thread_fetch(master, &thread))
1226 thread_call(&thread);
1227 }
1228
1229 systemd_send_stopping();
1230 /* Not reached. */
1231 return 0;
1232}