]> git.proxmox.com Git - mirror_frr.git/blob - watchfrr/watchfrr.c
watchfrr: remove -e option
[mirror_frr.git] / watchfrr / watchfrr.c
1 /*
2 * Monitor status of frr daemons and restart if necessary.
3 *
4 * Copyright (C) 2004 Andrew J. Schorr
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include <zebra.h>
22 #include <thread.h>
23 #include <log.h>
24 #include <network.h>
25 #include <sigevent.h>
26 #include <lib/version.h>
27 #include "command.h"
28 #include "memory_vty.h"
29 #include "libfrr.h"
30
31 #include <getopt.h>
32 #include <sys/un.h>
33 #include <sys/wait.h>
34 #include <memory.h>
35 #include <systemd.h>
36
37 #include "watchfrr.h"
38
/* Smaller of X and Y; being a macro, either argument may be evaluated twice. */
#ifndef MIN
#define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
#endif

/* Macros to help randomize timers. */
/* JITTER(X): random() reduced to the range [-((X)/2), (X)-((X)/2)]. */
#define JITTER(X) ((random() % ((X)+1))-((X)/2))
/* FUZZY(X): X plus a jitter whose span is (X)/20. */
#define FUZZY(X) ((X)+JITTER((X)/20))

/*
 * Built-in defaults.  The help text reports the period, timeouts and
 * restart intervals in seconds.
 */
#define DEFAULT_PERIOD 5
#define DEFAULT_TIMEOUT 10
#define DEFAULT_RESTART_TIMEOUT 20
#define DEFAULT_LOGLEVEL LOG_INFO
#define DEFAULT_MIN_RESTART 60
#define DEFAULT_MAX_RESTART 600
/* Pid file: the build-provided path if there is one, else under STATEDIR. */
#ifdef PATH_WATCHFRR_PID
#define DEFAULT_PIDFILE PATH_WATCHFRR_PID
#else
#define DEFAULT_PIDFILE STATEDIR "/watchfrr.pid"
#endif
/* Default directory searched for "<daemon>.vty" sockets. */
#ifdef DAEMON_VTY_DIR
#define VTYDIR DAEMON_VTY_DIR
#else
#define VTYDIR STATEDIR
#endif

/* Token sent with the "echo" command and expected back in the reply. */
#define PING_TOKEN "PING"

/* Needs to be global, referenced somewhere inside libfrr. */
struct thread_master *master;

/* Set by --dry: monitor and log only, never run start/restart commands. */
static bool watch_only = false;
70
/*
 * Stages of a phased global restart, listed in the order phase_check()
 * walks through them.
 */
typedef enum {
	PHASE_NONE = 0,
	PHASE_STOPS_PENDING = 1,
	PHASE_WAITING_DOWN = 2,
	PHASE_ZEBRA_RESTART_PENDING = 3,
	PHASE_WAITING_ZEBRA_UP = 4
} restart_phase_t;

/* Human-readable text for each restart_phase_t value, indexed by the enum. */
static const char *phase_str[] = {
	[PHASE_NONE] = "None",
	[PHASE_STOPS_PENDING] = "Stop jobs running",
	[PHASE_WAITING_DOWN] = "Waiting for other daemons to come down",
	[PHASE_ZEBRA_RESTART_PENDING] = "Zebra restart job running",
	[PHASE_WAITING_ZEBRA_UP] = "Waiting for zebra to come up",
	/* Trailing entry (index 5); no enumerator refers to it. */
	"Start jobs running",
};
87
/* Seconds a restart phase may last before phase_hanging() aborts it. */
#define PHASE_TIMEOUT (3*gs.restart_timeout)

/* Bookkeeping for one background start/stop/restart shell job. */
struct restart_info {
	const char *name;      /* daemon name, or "all" for the global job */
	const char *what;      /* job type label passed to run_job() */
	pid_t pid;             /* pid of the running job, 0 when none */
	struct timeval time;   /* when the job was last started or finished */
	long interval;         /* minimum seconds between unforced jobs */
	struct thread *t_kill; /* timer that runs restart_kill() on the job */
	int kills;             /* signals sent so far; 0 => SIGTERM next */
};
99
/* Process-wide configuration and run-time state of the watchdog. */
static struct global_state {
	restart_phase_t phase;          /* current stage of a phased restart */
	struct thread *t_phase_hanging; /* timer armed by set_phase() */
	const char *vtydir;             /* directory holding "<daemon>.vty" */
	long period;                    /* polling interval, milliseconds */
	long timeout;                   /* unresponsiveness timeout, seconds */
	long restart_timeout;           /* seconds before a job is signalled */
	long min_restart_interval;
	long max_restart_interval;
	struct daemon *daemons;         /* head of the monitored-daemon list */
	const char *restart_command;    /* shell templates containing one %s */
	const char *start_command;
	const char *stop_command;
	struct restart_info restart;    /* job state for the "all" restart */
	int unresponsive_restart;       /* set by -z */
	int loglevel;
	struct daemon *special; /* points to zebra when doing phased restart */
	int numdaemons;                 /* number of monitored daemons */
	int numpids;                    /* background jobs not yet reaped */
	int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
} gs = {
	.phase = PHASE_NONE,
	.vtydir = VTYDIR,
	.period = 1000 * DEFAULT_PERIOD,
	.timeout = DEFAULT_TIMEOUT,
	.restart_timeout = DEFAULT_RESTART_TIMEOUT,
	.loglevel = DEFAULT_LOGLEVEL,
	.min_restart_interval = DEFAULT_MIN_RESTART,
	.max_restart_interval = DEFAULT_MAX_RESTART,
};
130
/* Connection state of one monitored daemon. */
typedef enum {
	DAEMON_INIT = 0,
	DAEMON_DOWN = 1,
	DAEMON_CONNECTING = 2,
	DAEMON_UP = 3,
	DAEMON_UNRESPONSIVE = 4
} daemon_state_t;

/* True (1) when the daemon has a live connection, responsive or not. */
#define IS_UP(DMN)                                                             \
	((DMN)->state == DAEMON_UP || (DMN)->state == DAEMON_UNRESPONSIVE)

/* Human-readable text for each daemon_state_t value, indexed by the enum. */
static const char *state_str[] = {
	[DAEMON_INIT] = "Init",
	[DAEMON_DOWN] = "Down",
	[DAEMON_CONNECTING] = "Connecting",
	[DAEMON_UP] = "Up",
	[DAEMON_UNRESPONSIVE] = "Unresponsive",
};
145
/* One monitored daemon; the instances form a singly linked list. */
struct daemon {
	const char *name;           /* daemon name from the command line */
	daemon_state_t state;
	int fd;                     /* vty socket, -1 when not connected */
	struct timeval echo_sent;   /* when the pending echo was sent;
				       tv_sec == 0 means none is pending */
	u_int connect_tries;        /* attempts since the daemon was last up */
	struct thread *t_wakeup;    /* the single pending timer, if any */
	struct thread *t_read;      /* read handler on fd */
	struct thread *t_write;     /* write handler used during connect */
	struct daemon *next;
	struct restart_info restart; /* this daemon's background job state */
};
158
/* getopt return codes for options that have no single-letter form. */
#define OPTION_MINRESTART 2000
#define OPTION_MAXRESTART 2001
#define OPTION_DRY 2002

/* Long option table registered with frr_opt_add() in main(). */
static const struct option longopts[] = {
	{"daemon", no_argument, NULL, 'd'},
	{"statedir", required_argument, NULL, 'S'},
	{"loglevel", required_argument, NULL, 'l'},
	{"interval", required_argument, NULL, 'i'},
	{"timeout", required_argument, NULL, 't'},
	{"restart-timeout", required_argument, NULL, 'T'},
	{"restart", required_argument, NULL, 'r'},
	{"start-command", required_argument, NULL, 's'},
	{"kill-command", required_argument, NULL, 'k'},
	{"unresponsive-restart", no_argument, NULL, 'z'},
	{"dry", no_argument, NULL, OPTION_DRY},
	{"min-restart-interval", required_argument, NULL, OPTION_MINRESTART},
	{"max-restart-interval", required_argument, NULL, OPTION_MAXRESTART},
	{"pid-file", required_argument, NULL, 'p'},
	{"blank-string", required_argument, NULL, 'b'},
	{"help", no_argument, NULL, 'h'},
	{"version", no_argument, NULL, 'v'},
	{NULL, 0, NULL, 0}};
182
183 static int try_connect(struct daemon *dmn);
184 static int wakeup_send_echo(struct thread *t_wakeup);
185 static void try_restart(struct daemon *dmn);
186 static void phase_check(void);
187
188 static const char *progname;
189 static void printhelp(FILE *target)
190 {
191 fprintf(target,
192 "Usage : %s [OPTION...] <daemon name> ...\n\n\
193 Watchdog program to monitor status of frr daemons and try to restart\n\
194 them if they are down or unresponsive. It determines whether a daemon is\n\
195 up based on whether it can connect to the daemon's vty unix stream socket.\n\
196 It then repeatedly sends echo commands over that socket to determine whether\n\
197 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
198 on the socket connection and know immediately that the daemon is down.\n\n\
199 The daemons to be monitored should be listed on the command line.\n\n\
200 In order to avoid attempting to restart the daemons in a fast loop,\n\
201 the -m and -M options allow you to control the minimum delay between\n\
202 restart commands. The minimum restart delay is recalculated each time\n\
203 a restart is attempted: if the time since the last restart attempt exceeds\n\
204 twice the -M value, then the restart delay is set to the -m value.\n\
205 Otherwise, the interval is doubled (but capped at the -M value).\n\n",
206 progname);
207
208 fprintf(target,
209 "Options:\n\
210 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
211 to syslog instead of stdout.\n\
212 -S, --statedir Set the vty socket directory (default is %s)\n\
213 -l, --loglevel Set the logging level (default is %d).\n\
214 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
215 but it can be set higher than %d if extra-verbose debugging\n\
216 messages are desired.\n\
217 --min-restart-interval\n\
218 Set the minimum seconds to wait between invocations of daemon\n\
219 restart commands (default is %d).\n\
220 --max-restart-interval\n\
221 Set the maximum seconds to wait between invocations of daemon\n\
222 restart commands (default is %d).\n\
223 -i, --interval Set the status polling interval in seconds (default is %d)\n\
224 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
225 -T, --restart-timeout\n\
226 Set the restart (kill) timeout in seconds (default is %d).\n\
227 If any background jobs are still running after this much\n\
228 time has elapsed, they will be killed.\n\
229 -r, --restart Supply a Bourne shell command to use to restart a single\n\
230 daemon. The command string should include '%%s' where the\n\
231 name of the daemon should be substituted.\n\
232 -s, --start-command\n\
233 Supply a Bourne shell to command to use to start a single\n\
234 daemon. The command string should include '%%s' where the\n\
235 name of the daemon should be substituted.\n\
236 -k, --kill-command\n\
237 Supply a Bourne shell to command to use to stop a single\n\
238 daemon. The command string should include '%%s' where the\n\
239 name of the daemon should be substituted.\n\
240 -z, --unresponsive-restart\n\
241 When a daemon is unresponsive, treat it as being down for\n\
242 restart purposes.\n\
243 --dry Do not start or restart anything, just log.\n\
244 -p, --pid-file Set process identifier file name\n\
245 (default is %s).\n\
246 -b, --blank-string\n\
247 When the supplied argument string is found in any of the\n\
248 various shell command arguments (-r, -s, or -k), replace\n\
249 it with a space. This is an ugly hack to circumvent problems\n\
250 passing command-line arguments with embedded spaces.\n\
251 -v, --version Print program version\n\
252 -h, --help Display this help and exit\n",
253 VTYDIR, DEFAULT_LOGLEVEL, LOG_EMERG, LOG_DEBUG, LOG_DEBUG,
254 DEFAULT_MIN_RESTART, DEFAULT_MAX_RESTART, DEFAULT_PERIOD,
255 DEFAULT_TIMEOUT, DEFAULT_RESTART_TIMEOUT, DEFAULT_PIDFILE);
256 }
257
/*
 * Launch `shell_cmd` through "/bin/sh -c" in a forked child.
 *
 * Returns the child's pid in the parent, or -1 when fork() fails.  The child
 * never returns: it either becomes the shell or exits with status 127.
 */
static pid_t run_background(char *shell_cmd)
{
	pid_t pid = fork();

	if (pid == -1) {
		zlog_err("fork failed, cannot run command [%s]: %s", shell_cmd,
			 safe_strerror(errno));
		return -1;
	}

	if (pid != 0) {
		/* Parent process: we will reap the child later. */
		zlog_err("Forked background command [pid %d]: %s", (int)pid,
			 shell_cmd);
		return pid;
	}

	/*
	 * Child process.  Put it in its own process group so the job and
	 * everything it spawns can be signalled together.
	 */
	if (setpgid(0, 0) < 0)
		zlog_warn("warning: setpgid(0,0) failed: %s",
			  safe_strerror(errno));

	char arg0[] = "sh";
	char arg1[] = "-c";
	char *const args[4] = {arg0, arg1, shell_cmd, NULL};

	execv("/bin/sh", args);
	/* Only reached when execv() itself failed. */
	zlog_err("execv(/bin/sh -c '%s') failed: %s", shell_cmd,
		 safe_strerror(errno));
	_exit(127);
}
290
/*
 * Store in `result` the wall-clock time that has passed since `start_time`
 * and return `result`.  tv_usec is normalised so that it is not negative.
 */
static struct timeval *time_elapsed(struct timeval *result,
				    const struct timeval *start_time)
{
	gettimeofday(result, NULL);

	result->tv_sec -= start_time->tv_sec;
	result->tv_usec -= start_time->tv_usec;

	/* Borrow whole seconds until the microsecond part is non-negative. */
	for (; result->tv_usec < 0; result->tv_sec--)
		result->tv_usec += 1000000L;

	return result;
}
303
304 static int restart_kill(struct thread *t_kill)
305 {
306 struct restart_info *restart = THREAD_ARG(t_kill);
307 struct timeval delay;
308
309 time_elapsed(&delay, &restart->time);
310 zlog_warn(
311 "Warning: %s %s child process %d still running after "
312 "%ld seconds, sending signal %d",
313 restart->what, restart->name, (int)restart->pid,
314 (long)delay.tv_sec, (restart->kills ? SIGKILL : SIGTERM));
315 kill(-restart->pid, (restart->kills ? SIGKILL : SIGTERM));
316 restart->kills++;
317 restart->t_kill = NULL;
318 thread_add_timer(master, restart_kill, restart, gs.restart_timeout,
319 &restart->t_kill);
320 return 0;
321 }
322
323 static struct restart_info *find_child(pid_t child)
324 {
325 struct daemon *dmn;
326 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
327 if (dmn->restart.pid == child)
328 return &dmn->restart;
329 }
330 return NULL;
331 }
332
333 static void sigchild(void)
334 {
335 pid_t child;
336 int status;
337 const char *name;
338 const char *what;
339 struct restart_info *restart;
340
341 switch (child = waitpid(-1, &status, WNOHANG)) {
342 case -1:
343 zlog_err("waitpid failed: %s", safe_strerror(errno));
344 return;
345 case 0:
346 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
347 return;
348 }
349
350 if (child == integrated_write_pid) {
351 integrated_write_sigchld(status);
352 return;
353 }
354
355 if ((restart = find_child(child)) != NULL) {
356 name = restart->name;
357 what = restart->what;
358 restart->pid = 0;
359 gs.numpids--;
360 thread_cancel(restart->t_kill);
361 restart->t_kill = NULL;
362 /* Update restart time to reflect the time the command
363 * completed. */
364 gettimeofday(&restart->time, NULL);
365 } else {
366 zlog_err(
367 "waitpid returned status for an unknown child process %d",
368 (int)child);
369 name = "(unknown)";
370 what = "background";
371 }
372 if (WIFSTOPPED(status))
373 zlog_warn("warning: %s %s process %d is stopped", what, name,
374 (int)child);
375 else if (WIFSIGNALED(status))
376 zlog_warn("%s %s process %d terminated due to signal %d", what,
377 name, (int)child, WTERMSIG(status));
378 else if (WIFEXITED(status)) {
379 if (WEXITSTATUS(status) != 0)
380 zlog_warn(
381 "%s %s process %d exited with non-zero status %d",
382 what, name, (int)child, WEXITSTATUS(status));
383 else
384 zlog_debug("%s %s process %d exited normally", what,
385 name, (int)child);
386 } else
387 zlog_err("cannot interpret %s %s process %d wait status 0x%x",
388 what, name, (int)child, status);
389 phase_check();
390 }
391
392 static int run_job(struct restart_info *restart, const char *cmdtype,
393 const char *command, int force, int update_interval)
394 {
395 struct timeval delay;
396
397 if (gs.loglevel > LOG_DEBUG + 1)
398 zlog_debug("attempting to %s %s", cmdtype, restart->name);
399
400 if (restart->pid) {
401 if (gs.loglevel > LOG_DEBUG + 1)
402 zlog_debug(
403 "cannot %s %s, previous pid %d still running",
404 cmdtype, restart->name, (int)restart->pid);
405 return -1;
406 }
407
408 /* Note: time_elapsed test must come before the force test, since we
409 need
410 to make sure that delay is initialized for use below in updating the
411 restart interval. */
412 if ((time_elapsed(&delay, &restart->time)->tv_sec < restart->interval)
413 && !force) {
414 if (gs.loglevel > LOG_DEBUG + 1)
415 zlog_debug(
416 "postponing %s %s: "
417 "elapsed time %ld < retry interval %ld",
418 cmdtype, restart->name, (long)delay.tv_sec,
419 restart->interval);
420 return -1;
421 }
422
423 gettimeofday(&restart->time, NULL);
424 restart->kills = 0;
425 {
426 char cmd[strlen(command) + strlen(restart->name) + 1];
427 snprintf(cmd, sizeof(cmd), command, restart->name);
428 if ((restart->pid = run_background(cmd)) > 0) {
429 restart->t_kill = NULL;
430 thread_add_timer(master, restart_kill, restart,
431 gs.restart_timeout, &restart->t_kill);
432 restart->what = cmdtype;
433 gs.numpids++;
434 } else
435 restart->pid = 0;
436 }
437
438 /* Calculate the new restart interval. */
439 if (update_interval) {
440 if (delay.tv_sec > 2 * gs.max_restart_interval)
441 restart->interval = gs.min_restart_interval;
442 else if ((restart->interval *= 2) > gs.max_restart_interval)
443 restart->interval = gs.max_restart_interval;
444 if (gs.loglevel > LOG_DEBUG + 1)
445 zlog_debug("restart %s interval is now %ld",
446 restart->name, restart->interval);
447 }
448 return restart->pid;
449 }
450
/*
 * Scheduling helpers.  Each one clears the daemon's stored thread handle
 * before scheduling, and is written as do { } while (0) WITHOUT a trailing
 * semicolon so that a call followed by ';' forms exactly one statement
 * (and is therefore safe in an unbraced if/else).
 */

/* Watch the daemon's socket for input with handle_read(). */
#define SET_READ_HANDLER(DMN)                                                  \
	do {                                                                   \
		(DMN)->t_read = NULL;                                          \
		thread_add_read(master, handle_read, (DMN), (DMN)->fd,         \
				&(DMN)->t_read);                               \
	} while (0)

/* Retry the connection (wakeup_down) after one fuzzed polling period. */
#define SET_WAKEUP_DOWN(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_down, (DMN),              \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Run wakeup_unresponsive after one fuzzed polling period. */
#define SET_WAKEUP_UNRESPONSIVE(DMN)                                           \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_unresponsive, (DMN),      \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Send the next echo (wakeup_send_echo) after one fuzzed polling period. */
#define SET_WAKEUP_ECHO(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_send_echo, (DMN),         \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)
478
479 static int wakeup_down(struct thread *t_wakeup)
480 {
481 struct daemon *dmn = THREAD_ARG(t_wakeup);
482
483 dmn->t_wakeup = NULL;
484 if (try_connect(dmn) < 0)
485 SET_WAKEUP_DOWN(dmn);
486 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
487 try_restart(dmn);
488 return 0;
489 }
490
491 static int wakeup_init(struct thread *t_wakeup)
492 {
493 struct daemon *dmn = THREAD_ARG(t_wakeup);
494
495 dmn->t_wakeup = NULL;
496 if (try_connect(dmn) < 0) {
497 SET_WAKEUP_DOWN(dmn);
498 zlog_err("%s state -> down : initial connection attempt failed",
499 dmn->name);
500 dmn->state = DAEMON_DOWN;
501 }
502 return 0;
503 }
504
505 static void daemon_down(struct daemon *dmn, const char *why)
506 {
507 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
508 zlog_err("%s state -> down : %s", dmn->name, why);
509 else if (gs.loglevel > LOG_DEBUG)
510 zlog_debug("%s still down : %s", dmn->name, why);
511 if (IS_UP(dmn))
512 gs.numdown++;
513 dmn->state = DAEMON_DOWN;
514 if (dmn->fd >= 0) {
515 close(dmn->fd);
516 dmn->fd = -1;
517 }
518 THREAD_OFF(dmn->t_read);
519 THREAD_OFF(dmn->t_write);
520 THREAD_OFF(dmn->t_wakeup);
521 if (try_connect(dmn) < 0)
522 SET_WAKEUP_DOWN(dmn);
523 phase_check();
524 }
525
526 static int handle_read(struct thread *t_read)
527 {
528 struct daemon *dmn = THREAD_ARG(t_read);
529 static const char resp[sizeof(PING_TOKEN) + 4] = PING_TOKEN "\n";
530 char buf[sizeof(resp) + 100];
531 ssize_t rc;
532 struct timeval delay;
533
534 dmn->t_read = NULL;
535 if ((rc = read(dmn->fd, buf, sizeof(buf))) < 0) {
536 char why[100];
537
538 if (ERRNO_IO_RETRY(errno)) {
539 /* Pretend it never happened. */
540 SET_READ_HANDLER(dmn);
541 return 0;
542 }
543 snprintf(why, sizeof(why), "unexpected read error: %s",
544 safe_strerror(errno));
545 daemon_down(dmn, why);
546 return 0;
547 }
548 if (rc == 0) {
549 daemon_down(dmn, "read returned EOF");
550 return 0;
551 }
552 if (!dmn->echo_sent.tv_sec) {
553 char why[sizeof(buf) + 100];
554 snprintf(why, sizeof(why),
555 "unexpected read returns %d bytes: %.*s", (int)rc,
556 (int)rc, buf);
557 daemon_down(dmn, why);
558 return 0;
559 }
560
561 /* We are expecting an echo response: is there any chance that the
562 response would not be returned entirely in the first read? That
563 seems inconceivable... */
564 if ((rc != sizeof(resp)) || memcmp(buf, resp, sizeof(resp))) {
565 char why[100 + sizeof(buf)];
566 snprintf(why, sizeof(why),
567 "read returned bad echo response of %d bytes "
568 "(expecting %u): %.*s",
569 (int)rc, (u_int)sizeof(resp), (int)rc, buf);
570 daemon_down(dmn, why);
571 return 0;
572 }
573
574 time_elapsed(&delay, &dmn->echo_sent);
575 dmn->echo_sent.tv_sec = 0;
576 if (dmn->state == DAEMON_UNRESPONSIVE) {
577 if (delay.tv_sec < gs.timeout) {
578 dmn->state = DAEMON_UP;
579 zlog_warn(
580 "%s state -> up : echo response received after %ld.%06ld "
581 "seconds",
582 dmn->name, (long)delay.tv_sec,
583 (long)delay.tv_usec);
584 } else
585 zlog_warn(
586 "%s: slow echo response finally received after %ld.%06ld "
587 "seconds",
588 dmn->name, (long)delay.tv_sec,
589 (long)delay.tv_usec);
590 } else if (gs.loglevel > LOG_DEBUG + 1)
591 zlog_debug("%s: echo response received after %ld.%06ld seconds",
592 dmn->name, (long)delay.tv_sec, (long)delay.tv_usec);
593
594 SET_READ_HANDLER(dmn);
595 if (dmn->t_wakeup)
596 thread_cancel(dmn->t_wakeup);
597 SET_WAKEUP_ECHO(dmn);
598
599 return 0;
600 }
601
602 /*
603 * Wait till we notice that all daemons are ready before
604 * we send we are ready to systemd
605 */
606 static void daemon_send_ready(void)
607 {
608 static int sent = 0;
609 if (!sent && gs.numdown == 0) {
610 FILE *fp;
611
612 fp = fopen(DAEMON_VTY_DIR "/watchfrr.started", "w");
613 fclose(fp);
614 zlog_notice(
615 "Watchfrr: Notifying Systemd we are up and running");
616 systemd_send_started(master, 0);
617 sent = 1;
618 }
619 }
620
621 static void daemon_up(struct daemon *dmn, const char *why)
622 {
623 dmn->state = DAEMON_UP;
624 gs.numdown--;
625 dmn->connect_tries = 0;
626 zlog_notice("%s state -> up : %s", dmn->name, why);
627 daemon_send_ready();
628 SET_WAKEUP_ECHO(dmn);
629 phase_check();
630 }
631
632 static int check_connect(struct thread *t_write)
633 {
634 struct daemon *dmn = THREAD_ARG(t_write);
635 int sockerr;
636 socklen_t reslen = sizeof(sockerr);
637
638 dmn->t_write = NULL;
639 if (getsockopt(dmn->fd, SOL_SOCKET, SO_ERROR, (char *)&sockerr, &reslen)
640 < 0) {
641 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn->name,
642 safe_strerror(errno));
643 daemon_down(dmn,
644 "getsockopt failed checking connection success");
645 return 0;
646 }
647 if ((reslen == sizeof(sockerr)) && sockerr) {
648 char why[100];
649 snprintf(
650 why, sizeof(why),
651 "getsockopt reports that connection attempt failed: %s",
652 safe_strerror(sockerr));
653 daemon_down(dmn, why);
654 return 0;
655 }
656
657 daemon_up(dmn, "delayed connect succeeded");
658 return 0;
659 }
660
661 static int wakeup_connect_hanging(struct thread *t_wakeup)
662 {
663 struct daemon *dmn = THREAD_ARG(t_wakeup);
664 char why[100];
665
666 dmn->t_wakeup = NULL;
667 snprintf(why, sizeof(why),
668 "connection attempt timed out after %ld seconds", gs.timeout);
669 daemon_down(dmn, why);
670 return 0;
671 }
672
673 /* Making connection to protocol daemon. */
674 static int try_connect(struct daemon *dmn)
675 {
676 int sock;
677 struct sockaddr_un addr;
678 socklen_t len;
679
680 if (gs.loglevel > LOG_DEBUG + 1)
681 zlog_debug("%s: attempting to connect", dmn->name);
682 dmn->connect_tries++;
683
684 memset(&addr, 0, sizeof(struct sockaddr_un));
685 addr.sun_family = AF_UNIX;
686 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty", gs.vtydir,
687 dmn->name);
688 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
689 len = addr.sun_len = SUN_LEN(&addr);
690 #else
691 len = sizeof(addr.sun_family) + strlen(addr.sun_path);
692 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
693
694 /* Quick check to see if we might succeed before we go to the trouble
695 of creating a socket. */
696 if (access(addr.sun_path, W_OK) < 0) {
697 if (errno != ENOENT)
698 zlog_err("%s: access to socket %s denied: %s",
699 dmn->name, addr.sun_path,
700 safe_strerror(errno));
701 return -1;
702 }
703
704 if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
705 zlog_err("%s(%s): cannot make socket: %s", __func__,
706 addr.sun_path, safe_strerror(errno));
707 return -1;
708 }
709
710 if (set_nonblocking(sock) < 0 || set_cloexec(sock) < 0) {
711 zlog_err("%s(%s): set_nonblocking/cloexec(%d) failed", __func__,
712 addr.sun_path, sock);
713 close(sock);
714 return -1;
715 }
716
717 if (connect(sock, (struct sockaddr *)&addr, len) < 0) {
718 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK)) {
719 if (gs.loglevel > LOG_DEBUG)
720 zlog_debug("%s(%s): connect failed: %s",
721 __func__, addr.sun_path,
722 safe_strerror(errno));
723 close(sock);
724 return -1;
725 }
726 if (gs.loglevel > LOG_DEBUG)
727 zlog_debug("%s: connection in progress", dmn->name);
728 dmn->state = DAEMON_CONNECTING;
729 dmn->fd = sock;
730 dmn->t_write = NULL;
731 thread_add_write(master, check_connect, dmn, dmn->fd,
732 &dmn->t_write);
733 dmn->t_wakeup = NULL;
734 thread_add_timer(master, wakeup_connect_hanging, dmn,
735 gs.timeout, &dmn->t_wakeup);
736 SET_READ_HANDLER(dmn);
737 return 0;
738 }
739
740 dmn->fd = sock;
741 SET_READ_HANDLER(dmn);
742 daemon_up(dmn, "connect succeeded");
743 return 1;
744 }
745
746 static int phase_hanging(struct thread *t_hanging)
747 {
748 gs.t_phase_hanging = NULL;
749 zlog_err("Phase [%s] hanging for %ld seconds, aborting phased restart",
750 phase_str[gs.phase], PHASE_TIMEOUT);
751 gs.phase = PHASE_NONE;
752 return 0;
753 }
754
755 static void set_phase(restart_phase_t new_phase)
756 {
757 gs.phase = new_phase;
758 if (gs.t_phase_hanging)
759 thread_cancel(gs.t_phase_hanging);
760 gs.t_phase_hanging = NULL;
761 thread_add_timer(master, phase_hanging, NULL, PHASE_TIMEOUT,
762 &gs.t_phase_hanging);
763 }
764
765 static void phase_check(void)
766 {
767 switch (gs.phase) {
768 case PHASE_NONE:
769 break;
770 case PHASE_STOPS_PENDING:
771 if (gs.numpids)
772 break;
773 zlog_info(
774 "Phased restart: all routing daemon stop jobs have completed.");
775 set_phase(PHASE_WAITING_DOWN);
776
777 /*FALLTHRU*/
778 case PHASE_WAITING_DOWN:
779 if (gs.numdown + IS_UP(gs.special) < gs.numdaemons)
780 break;
781 zlog_info("Phased restart: all routing daemons now down.");
782 run_job(&gs.special->restart, "restart", gs.restart_command, 1,
783 1);
784 set_phase(PHASE_ZEBRA_RESTART_PENDING);
785
786 /*FALLTHRU*/
787 case PHASE_ZEBRA_RESTART_PENDING:
788 if (gs.special->restart.pid)
789 break;
790 zlog_info("Phased restart: %s restart job completed.",
791 gs.special->name);
792 set_phase(PHASE_WAITING_ZEBRA_UP);
793
794 /*FALLTHRU*/
795 case PHASE_WAITING_ZEBRA_UP:
796 if (!IS_UP(gs.special))
797 break;
798 zlog_info("Phased restart: %s is now up.", gs.special->name);
799 {
800 struct daemon *dmn;
801 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
802 if (dmn != gs.special)
803 run_job(&dmn->restart, "start",
804 gs.start_command, 1, 0);
805 }
806 }
807 gs.phase = PHASE_NONE;
808 THREAD_OFF(gs.t_phase_hanging);
809 zlog_notice("Phased global restart has completed.");
810 break;
811 }
812 }
813
814 static void try_restart(struct daemon *dmn)
815 {
816 if (watch_only)
817 return;
818
819 if (dmn != gs.special) {
820 if ((gs.special->state == DAEMON_UP)
821 && (gs.phase == PHASE_NONE))
822 run_job(&dmn->restart, "restart", gs.restart_command, 0,
823 1);
824 else
825 zlog_debug(
826 "%s: postponing restart attempt because master %s daemon "
827 "not up [%s], or phased restart in progress",
828 dmn->name, gs.special->name,
829 state_str[gs.special->state]);
830 return;
831 }
832
833 if ((gs.phase != PHASE_NONE) || gs.numpids) {
834 if (gs.loglevel > LOG_DEBUG + 1)
835 zlog_debug(
836 "postponing phased global restart: restart already in "
837 "progress [%s], or outstanding child processes [%d]",
838 phase_str[gs.phase], gs.numpids);
839 return;
840 }
841 /* Is it too soon for a restart? */
842 {
843 struct timeval delay;
844 if (time_elapsed(&delay, &gs.special->restart.time)->tv_sec
845 < gs.special->restart.interval) {
846 if (gs.loglevel > LOG_DEBUG + 1)
847 zlog_debug(
848 "postponing phased global restart: "
849 "elapsed time %ld < retry interval %ld",
850 (long)delay.tv_sec,
851 gs.special->restart.interval);
852 return;
853 }
854 }
855 run_job(&gs.restart, "restart", gs.restart_command, 0, 1);
856 }
857
858 static int wakeup_unresponsive(struct thread *t_wakeup)
859 {
860 struct daemon *dmn = THREAD_ARG(t_wakeup);
861
862 dmn->t_wakeup = NULL;
863 if (dmn->state != DAEMON_UNRESPONSIVE)
864 zlog_err(
865 "%s: no longer unresponsive (now %s), "
866 "wakeup should have been cancelled!",
867 dmn->name, state_str[dmn->state]);
868 else {
869 SET_WAKEUP_UNRESPONSIVE(dmn);
870 try_restart(dmn);
871 }
872 return 0;
873 }
874
875 static int wakeup_no_answer(struct thread *t_wakeup)
876 {
877 struct daemon *dmn = THREAD_ARG(t_wakeup);
878
879 dmn->t_wakeup = NULL;
880 dmn->state = DAEMON_UNRESPONSIVE;
881 zlog_err(
882 "%s state -> unresponsive : no response yet to ping "
883 "sent %ld seconds ago",
884 dmn->name, gs.timeout);
885 if (gs.unresponsive_restart) {
886 SET_WAKEUP_UNRESPONSIVE(dmn);
887 try_restart(dmn);
888 }
889 return 0;
890 }
891
892 static int wakeup_send_echo(struct thread *t_wakeup)
893 {
894 static const char echocmd[] = "echo " PING_TOKEN;
895 ssize_t rc;
896 struct daemon *dmn = THREAD_ARG(t_wakeup);
897
898 dmn->t_wakeup = NULL;
899 if (((rc = write(dmn->fd, echocmd, sizeof(echocmd))) < 0)
900 || ((size_t)rc != sizeof(echocmd))) {
901 char why[100 + sizeof(echocmd)];
902 snprintf(why, sizeof(why),
903 "write '%s' returned %d instead of %u", echocmd,
904 (int)rc, (u_int)sizeof(echocmd));
905 daemon_down(dmn, why);
906 } else {
907 gettimeofday(&dmn->echo_sent, NULL);
908 dmn->t_wakeup = NULL;
909 thread_add_timer(master, wakeup_no_answer, dmn, gs.timeout,
910 &dmn->t_wakeup);
911 }
912 return 0;
913 }
914
/*
 * Handler registered for SIGINT and SIGTERM: log, tell systemd that we are
 * stopping, and exit with status 0.
 */
static void sigint(void)
{
	zlog_notice("Terminating on signal");

	systemd_send_stopping();

	exit(0);
}
921
/*
 * Check that `cmd` is usable as a command template: it must contain
 * exactly one '%' character and that character must be followed by 's'.
 *
 * Returns 1 when the template is acceptable, 0 otherwise.
 */
static int valid_command(const char *cmd)
{
	const char *percent = strchr(cmd, '%');

	if (percent == NULL)
		return 0;
	if (percent[1] != 's')
		return 0;
	/* Any further '%' would be a second conversion. */
	return strchr(percent + 1, '%') == NULL;
}
929
/*
 * This is an ugly hack to circumvent problems with passing command-line
 * arguments that contain spaces.  The real fix is to use a configuration
 * file.
 *
 * Returns a newly allocated copy of `cmd` in which every occurrence of
 * `blankstr` has been replaced by a single space.  The copy is made with
 * strdup(); the process exits with status 1 if that fails.
 *
 * The scan is a single left-to-right pass that resumes just after each
 * inserted space, so it always terminates - including when `blankstr` is
 * itself " ".  An empty `blankstr` leaves the command unchanged (an empty
 * pattern would otherwise match at every position).
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	char *res;
	char *p;
	size_t bslen = strlen(blankstr);

	if (!(res = strdup(cmd))) {
		perror("strdup");
		exit(1);
	}

	if (bslen == 0)
		return res;

	p = res;
	while ((p = strstr(p, blankstr)) != NULL) {
		*p = ' ';
		/* Close the gap left by the rest of the pattern. */
		if (bslen != 1)
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
		p++;
	}
	return res;
}
949
/*
 * Privilege settings handed to libfrr through the daemon descriptor; only
 * the vty group is set, and only when VTY_GROUP is defined at build time.
 */
struct zebra_privs_t watchfrr_privs = {
#ifdef VTY_GROUP
	.vty_group = VTY_GROUP,
#endif
};
955
/*
 * Signal dispatch table: SIGINT and SIGTERM both terminate the program via
 * sigint(); SIGCHLD reaps a finished background job via sigchild().
 */
static struct quagga_signal_t watchfrr_signals[] = {
	{
		.signal = SIGINT,
		.handler = sigint,
	},
	{
		.signal = SIGTERM,
		.handler = sigint,
	},
	{
		.signal = SIGCHLD,
		.handler = sigchild,
	},
};
970
/*
 * Daemon descriptor for libfrr: wires in the help printer, the signal table
 * and the privilege settings.  NOTE(review): this macro presumably defines
 * the watchfrr_di object that main() passes to frr_preinit() - confirm
 * against libfrr.h.
 */
FRR_DAEMON_INFO(watchfrr, WATCHFRR,
		.flags = FRR_NO_PRIVSEP | FRR_NO_TCPVTY | FRR_LIMITED_CLI
			 | FRR_NO_CFG_PID_DRY | FRR_NO_ZCLIENT,

		.printhelp = printhelp,
		.copyright = "Copyright 2004 Andrew J. Schorr",

		.signals = watchfrr_signals,
		.n_signals = array_size(watchfrr_signals),

		.privs = &watchfrr_privs, )
982
983 int main(int argc, char **argv)
984 {
985 int opt;
986 const char *pidfile = DEFAULT_PIDFILE;
987 const char *special = "zebra";
988 const char *blankstr = NULL;
989
990 frr_preinit(&watchfrr_di, argc, argv);
991 progname = watchfrr_di.progname;
992
993 frr_opt_add("b:dk:l:i:p:r:S:s:t:T:z", longopts, "");
994
995 gs.restart.name = "all";
996 while ((opt = frr_getopt(argc, argv, NULL)) != EOF) {
997 switch (opt) {
998 case 0:
999 break;
1000 case 'b':
1001 blankstr = optarg;
1002 break;
1003 case OPTION_DRY:
1004 watch_only = true;
1005 break;
1006 case 'k':
1007 if (!valid_command(optarg)) {
1008 fprintf(stderr,
1009 "Invalid kill command, must contain '%%s': %s\n",
1010 optarg);
1011 frr_help_exit(1);
1012 }
1013 gs.stop_command = optarg;
1014 break;
1015 case 'l': {
1016 char garbage[3];
1017 if ((sscanf(optarg, "%d%1s", &gs.loglevel, garbage)
1018 != 1)
1019 || (gs.loglevel < LOG_EMERG)) {
1020 fprintf(stderr,
1021 "Invalid loglevel argument: %s\n",
1022 optarg);
1023 frr_help_exit(1);
1024 }
1025 } break;
1026 case OPTION_MINRESTART: {
1027 char garbage[3];
1028 if ((sscanf(optarg, "%ld%1s", &gs.min_restart_interval,
1029 garbage)
1030 != 1)
1031 || (gs.min_restart_interval < 0)) {
1032 fprintf(stderr,
1033 "Invalid min_restart_interval argument: %s\n",
1034 optarg);
1035 frr_help_exit(1);
1036 }
1037 } break;
1038 case OPTION_MAXRESTART: {
1039 char garbage[3];
1040 if ((sscanf(optarg, "%ld%1s", &gs.max_restart_interval,
1041 garbage)
1042 != 1)
1043 || (gs.max_restart_interval < 0)) {
1044 fprintf(stderr,
1045 "Invalid max_restart_interval argument: %s\n",
1046 optarg);
1047 frr_help_exit(1);
1048 }
1049 } break;
1050 case 'i': {
1051 char garbage[3];
1052 int period;
1053 if ((sscanf(optarg, "%d%1s", &period, garbage) != 1)
1054 || (gs.period < 1)) {
1055 fprintf(stderr,
1056 "Invalid interval argument: %s\n",
1057 optarg);
1058 frr_help_exit(1);
1059 }
1060 gs.period = 1000 * period;
1061 } break;
1062 case 'p':
1063 pidfile = optarg;
1064 break;
1065 case 'r':
1066 if (!valid_command(optarg)) {
1067 fprintf(stderr,
1068 "Invalid restart command, must contain '%%s': %s\n",
1069 optarg);
1070 frr_help_exit(1);
1071 }
1072 gs.restart_command = optarg;
1073 break;
1074 case 's':
1075 if (!valid_command(optarg)) {
1076 fprintf(stderr,
1077 "Invalid start command, must contain '%%s': %s\n",
1078 optarg);
1079 frr_help_exit(1);
1080 }
1081 gs.start_command = optarg;
1082 break;
1083 case 'S':
1084 gs.vtydir = optarg;
1085 break;
1086 case 't': {
1087 char garbage[3];
1088 if ((sscanf(optarg, "%ld%1s", &gs.timeout, garbage)
1089 != 1)
1090 || (gs.timeout < 1)) {
1091 fprintf(stderr,
1092 "Invalid timeout argument: %s\n",
1093 optarg);
1094 frr_help_exit(1);
1095 }
1096 } break;
1097 case 'T': {
1098 char garbage[3];
1099 if ((sscanf(optarg, "%ld%1s", &gs.restart_timeout,
1100 garbage)
1101 != 1)
1102 || (gs.restart_timeout < 1)) {
1103 fprintf(stderr,
1104 "Invalid restart timeout argument: %s\n",
1105 optarg);
1106 frr_help_exit(1);
1107 }
1108 } break;
1109 case 'z':
1110 gs.unresponsive_restart = 1;
1111 break;
1112 default:
1113 fputs("Invalid option.\n", stderr);
1114 frr_help_exit(1);
1115 }
1116 }
1117
1118 if (watch_only && (gs.unresponsive_restart || gs.start_command
1119 || gs.stop_command || gs.restart_command)) {
1120 fputs("Options -z/-r/-s/-k make no sense combined with -D.\n",
1121 stderr);
1122 frr_help_exit(1);
1123 }
1124 if (!watch_only
1125 && (!gs.restart_command || !gs.start_command || !gs.stop_command)) {
1126 fprintf(stderr,
1127 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1128 frr_help_exit(1);
1129 }
1130
1131 if (blankstr) {
1132 if (gs.restart_command)
1133 gs.restart_command =
1134 translate_blanks(gs.restart_command, blankstr);
1135 if (gs.start_command)
1136 gs.start_command =
1137 translate_blanks(gs.start_command, blankstr);
1138 if (gs.stop_command)
1139 gs.stop_command =
1140 translate_blanks(gs.stop_command, blankstr);
1141 }
1142
1143 gs.restart.interval = gs.min_restart_interval;
1144
1145 master = frr_init();
1146
1147 zlog_set_level(ZLOG_DEST_MONITOR, ZLOG_DISABLED);
1148 if (watchfrr_di.daemon_mode) {
1149 zlog_set_level(ZLOG_DEST_SYSLOG, MIN(gs.loglevel, LOG_DEBUG));
1150 if (daemon(0, 0) < 0) {
1151 fprintf(stderr, "Watchfrr daemon failed: %s",
1152 strerror(errno));
1153 exit(1);
1154 }
1155 } else
1156 zlog_set_level(ZLOG_DEST_STDOUT, MIN(gs.loglevel, LOG_DEBUG));
1157
1158 watchfrr_vty_init();
1159
1160 frr_vty_serv();
1161
1162 {
1163 int i;
1164 struct daemon *tail = NULL;
1165
1166 for (i = optind; i < argc; i++) {
1167 struct daemon *dmn;
1168
1169 if (!(dmn = (struct daemon *)calloc(1, sizeof(*dmn)))) {
1170 fprintf(stderr, "calloc(1,%u) failed: %s\n",
1171 (u_int)sizeof(*dmn),
1172 safe_strerror(errno));
1173 return 1;
1174 }
1175 dmn->name = dmn->restart.name = argv[i];
1176 dmn->state = DAEMON_INIT;
1177 gs.numdaemons++;
1178 gs.numdown++;
1179 dmn->fd = -1;
1180 dmn->t_wakeup = NULL;
1181 thread_add_timer_msec(master, wakeup_init, dmn,
1182 100 + (random() % 900),
1183 &dmn->t_wakeup);
1184 dmn->restart.interval = gs.min_restart_interval;
1185 if (tail)
1186 tail->next = dmn;
1187 else
1188 gs.daemons = dmn;
1189 tail = dmn;
1190
1191 if (!strcmp(dmn->name, special))
1192 gs.special = dmn;
1193 }
1194 }
1195 if (!gs.daemons) {
1196 fputs("Must specify one or more daemons to monitor.\n", stderr);
1197 frr_help_exit(1);
1198 }
1199 if (!watch_only && !gs.special) {
1200 fprintf(stderr, "\"%s\" daemon must be in daemon list\n",
1201 special);
1202 frr_help_exit(1);
1203 }
1204
1205 /* Make sure we're not already running. */
1206 pid_output(pidfile);
1207
1208 /* Announce which daemons are being monitored. */
1209 {
1210 struct daemon *dmn;
1211 size_t len = 0;
1212
1213 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1214 len += strlen(dmn->name) + 1;
1215
1216 {
1217 char buf[len + 1];
1218 char *p = buf;
1219
1220 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1221 if (p != buf)
1222 *p++ = ' ';
1223 strcpy(p, dmn->name);
1224 p += strlen(p);
1225 }
1226 zlog_notice("%s %s watching [%s]%s", progname,
1227 FRR_VERSION, buf,
1228 watch_only ? ", monitor mode" : "");
1229 }
1230 }
1231
1232 {
1233 struct thread thread;
1234
1235 while (thread_fetch(master, &thread))
1236 thread_call(&thread);
1237 }
1238
1239 systemd_send_stopping();
1240 /* Not reached. */
1241 return 0;
1242 }