]> git.proxmox.com Git - mirror_frr.git/blob - watchfrr/watchfrr.c
51e4f802c9a430cbc33d5d712286d3a7da87f15f
[mirror_frr.git] / watchfrr / watchfrr.c
1 /*
2 * Monitor status of frr daemons and restart if necessary.
3 *
4 * Copyright (C) 2004 Andrew J. Schorr
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include <zebra.h>
22 #include <thread.h>
23 #include <log.h>
24 #include <network.h>
25 #include <sigevent.h>
26 #include <lib/version.h>
27 #include "command.h"
28 #include "libfrr.h"
29 #include "lib_errors.h"
30 #include "zlog_targets.h"
31 #include "network.h"
32 #include "printfrr.h"
33
34 #include <getopt.h>
35 #include <sys/un.h>
36 #include <sys/wait.h>
37 #include <memory.h>
38 #include <systemd.h>
39
40 #include "watchfrr.h"
41 #include "watchfrr_errors.h"
42
/* Smaller of two values; note that each argument may be evaluated twice. */
#ifndef MIN
#define MIN(X, Y) (((X) <= (Y)) ? (X) : (Y))
#endif

/*
 * Timer randomisation helpers.
 *
 * JITTER(X) takes frr_weak_random() modulo (X + 1) and shifts it down by
 * X / 2.  FUZZY(X) adds JITTER(X / 20) to X.  Both expand X more than once.
 */
#define JITTER(X) ((frr_weak_random() % ((X) + 1)) - ((X) / 2))
#define FUZZY(X) ((X) + JITTER((X) / 20))
50
/*
 * Built-in defaults.  printhelp() advertises the numeric ones as the option
 * defaults and they seed the initial values of the global state.
 */
#define DEFAULT_PERIOD 5
#define DEFAULT_TIMEOUT 90
#define DEFAULT_RESTART_TIMEOUT 20
#define DEFAULT_LOGLEVEL LOG_INFO
#define DEFAULT_MIN_RESTART 60
#define DEFAULT_MAX_RESTART 600
#define DEFAULT_OPERATIONAL_TIMEOUT 60

/* Shell command templates; "%s" is replaced with the daemon name. */
#define DEFAULT_RESTART_CMD WATCHFRR_SH_PATH " restart %s"
#define DEFAULT_START_CMD WATCHFRR_SH_PATH " start %s"
#define DEFAULT_STOP_CMD WATCHFRR_SH_PATH " stop %s"

/* Token sent with the "echo" command and expected back in the reply. */
#define PING_TOKEN "PING"
64
65 DEFINE_MGROUP(WATCHFRR, "watchfrr");
66 DEFINE_MTYPE_STATIC(WATCHFRR, WATCHFRR_DAEMON, "watchfrr daemon entry");
67
68 /* Needs to be global, referenced somewhere inside libfrr. */
69 struct thread_master *master;
70
71 static bool watch_only = false;
72 const char *pathspace;
73
/*
 * Phases of the global state machine driven by phase_check().  PHASE_INIT
 * is the initial value; the remaining non-idle phases belong to the phased
 * restart sequence.
 */
enum restart_phase {
	PHASE_NONE = 0,
	PHASE_INIT,
	PHASE_STOPS_PENDING,
	PHASE_WAITING_DOWN,
	PHASE_ZEBRA_RESTART_PENDING,
	PHASE_WAITING_ZEBRA_UP
};

/*
 * Human readable phase names, indexed by enum restart_phase.  Designated
 * initialisers keep the table in lock-step with the enum: exactly one entry
 * per phase, with no leftover strings for phases that do not exist.
 */
static const char *const phase_str[] = {
	[PHASE_NONE] = "Idle",
	[PHASE_INIT] = "Startup",
	[PHASE_STOPS_PENDING] = "Stop jobs running",
	[PHASE_WAITING_DOWN] = "Waiting for other daemons to come down",
	[PHASE_ZEBRA_RESTART_PENDING] = "Zebra restart job running",
	[PHASE_WAITING_ZEBRA_UP] = "Waiting for zebra to come up",
};
92
/* Timeout armed by set_phase(): three times the configured restart timeout. */
#define PHASE_TIMEOUT (3 * gs.restart_timeout)
/*
 * Parenthesised so the product stays intact when the macro is used inside a
 * larger expression (e.g. as a divisor).
 */
#define STARTUP_TIMEOUT (55 * 1000)
95
/* Bookkeeping for one background shell job (see run_job()). */
struct restart_info {
	/* Substituted for "%s" in the command template and used in logs. */
	const char *name;
	/* Command type of the job that was launched last. */
	const char *what;
	/* Pid of the running job, 0 when none is outstanding. */
	pid_t pid;
	/* Stamped when a job is launched and again when it is reaped. */
	struct timeval time;
	/* Seconds that must elapse before an unforced job may run again. */
	long interval;
	/* Timer that fires restart_kill() for a job that is still running. */
	struct thread *t_kill;
	/* Number of signals restart_kill() has sent to the current job. */
	int kills;
};
105
106 static struct global_state {
107 enum restart_phase phase;
108 struct thread *t_phase_hanging;
109 struct thread *t_startup_timeout;
110 struct thread *t_operational;
111 const char *vtydir;
112 long period;
113 long timeout;
114 long restart_timeout;
115 long min_restart_interval;
116 long max_restart_interval;
117 long operational_timeout;
118 struct daemon *daemons;
119 const char *restart_command;
120 const char *start_command;
121 const char *stop_command;
122 struct restart_info restart;
123 int loglevel;
124 struct daemon *special; /* points to zebra when doing phased restart */
125 int numdaemons;
126 int numpids;
127 int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
128 } gs = {
129 .phase = PHASE_INIT,
130 .vtydir = frr_vtydir,
131 .period = 1000 * DEFAULT_PERIOD,
132 .timeout = DEFAULT_TIMEOUT,
133 .restart_timeout = DEFAULT_RESTART_TIMEOUT,
134 .loglevel = DEFAULT_LOGLEVEL,
135 .min_restart_interval = DEFAULT_MIN_RESTART,
136 .max_restart_interval = DEFAULT_MAX_RESTART,
137 .operational_timeout = DEFAULT_OPERATIONAL_TIMEOUT,
138 .restart_command = DEFAULT_RESTART_CMD,
139 .start_command = DEFAULT_START_CMD,
140 .stop_command = DEFAULT_STOP_CMD,
141 };
142
/* Connection state tracked for every watched daemon. */
enum daemon_state {
	DAEMON_INIT,
	DAEMON_DOWN,
	DAEMON_CONNECTING,
	DAEMON_UP,
	DAEMON_UNRESPONSIVE
};

/* True for the two states in which a connection to the daemon exists. */
#define IS_UP(DMN)                                                             \
	(((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))

/* Printable state names, indexed by enum daemon_state. */
static const char *const state_str[] = {
	"Init",
	"Down",
	"Connecting",
	"Up",
	"Unresponsive",
};
157
158 struct daemon {
159 const char *name;
160 enum daemon_state state;
161 int fd;
162 struct timeval echo_sent;
163 unsigned int connect_tries;
164 struct thread *t_wakeup;
165 struct thread *t_read;
166 struct thread *t_write;
167 struct daemon *next;
168 struct restart_info restart;
169
170 /*
171 * For a given daemon, if we've turned on ignore timeouts
172 * ignore the timeout value and assume everything is ok
173 * This is for daemon debugging w/ gdb after we have started
174 * FRR and realize we have something that needs to be looked
175 * at
176 */
177 bool ignore_timeout;
178 };
179
/* Return codes for the long options that have no single-letter form. */
#define OPTION_MINRESTART 2000
#define OPTION_MAXRESTART 2001
#define OPTION_DRY 2002
#define OPTION_NETNS 2003
#define OPTION_MAXOPERATIONAL 2004

/* Long option table in getopt_long() format. */
static const struct option longopts[] = {
	{"daemon", no_argument, NULL, 'd'},
	{"statedir", required_argument, NULL, 'S'},
	{"loglevel", required_argument, NULL, 'l'},
	{"interval", required_argument, NULL, 'i'},
	{"timeout", required_argument, NULL, 't'},
	{"restart-timeout", required_argument, NULL, 'T'},
	{"restart", required_argument, NULL, 'r'},
	{"start-command", required_argument, NULL, 's'},
	{"kill-command", required_argument, NULL, 'k'},
	{"dry", no_argument, NULL, OPTION_DRY},
	{"min-restart-interval", required_argument, NULL, OPTION_MINRESTART},
	{"max-restart-interval", required_argument, NULL, OPTION_MAXRESTART},
	{"operational-timeout", required_argument, NULL, OPTION_MAXOPERATIONAL},
	{"pid-file", required_argument, NULL, 'p'},
	{"blank-string", required_argument, NULL, 'b'},
#ifdef GNU_LINUX
	{"netns", optional_argument, NULL, OPTION_NETNS},
#endif
	{"help", no_argument, NULL, 'h'},
	{"version", no_argument, NULL, 'v'},
	{NULL, 0, NULL, 0},
};
208
/* Forward declarations for functions that are used before their definition. */
static void phase_check(void);
static int try_connect(struct daemon *dmn);
static void try_restart(struct daemon *dmn);
static void restart_done(struct daemon *dmn);
static void wakeup_send_echo(struct thread *t_wakeup);

/* Program name shown in the usage text. */
static const char *progname;
216
217 void watchfrr_set_ignore_daemon(struct vty *vty, const char *dname, bool ignore)
218 {
219 struct daemon *dmn;
220
221 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
222 if (strncmp(dmn->name, dname, strlen(dmn->name)) == 0)
223 break;
224 }
225
226 if (dmn) {
227 dmn->ignore_timeout = ignore;
228 vty_out(vty, "%s switching to %s\n", dmn->name,
229 ignore ? "ignore" : "watch");
230 } else
231 vty_out(vty, "%s is not configured for running at the moment",
232 dname);
233 }
234
/*
 * Print the usage summary followed by the option reference to the given
 * stream.  The first fprintf() is passed progname; the second one fills in
 * frr_vtydir plus the DEFAULT_* and LOG_* values as the advertised defaults.
 *
 * NOTE(review): the text refers to "-m" and "-M" options, while the longopts
 * table in this file only has --min-restart-interval and
 * --max-restart-interval -- confirm and update the wording.
 * NOTE(review): "Bourne shell to command" in the -s/-k entries looks like a
 * typo for "Bourne shell command".
 */
235 static void printhelp(FILE *target)
236 {
237 fprintf(target,
238 "Usage : %s [OPTION...] <daemon name> ...\n\n\
239 Watchdog program to monitor status of frr daemons and try to restart\n\
240 them if they are down or unresponsive. It determines whether a daemon is\n\
241 up based on whether it can connect to the daemon's vty unix stream socket.\n\
242 It then repeatedly sends echo commands over that socket to determine whether\n\
243 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
244 on the socket connection and know immediately that the daemon is down.\n\n\
245 The daemons to be monitored should be listed on the command line.\n\n\
246 In order to avoid attempting to restart the daemons in a fast loop,\n\
247 the -m and -M options allow you to control the minimum delay between\n\
248 restart commands. The minimum restart delay is recalculated each time\n\
249 a restart is attempted: if the time since the last restart attempt exceeds\n\
250 twice the -M value, then the restart delay is set to the -m value.\n\
251 Otherwise, the interval is doubled (but capped at the -M value).\n\n",
252 progname);
253
254 fprintf(target,
255 "Options:\n\
256 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
257 to syslog instead of stdout.\n\
258 -S, --statedir Set the vty socket directory (default is %s)\n\
259 -N, --pathspace Insert prefix into config & socket paths\n"
260 #ifdef GNU_LINUX
261 " --netns Create and/or use Linux network namespace. If no name is\n"
262 " given, uses the value from `-N`.\n"
263 #endif
264 "-l, --loglevel Set the logging level (default is %d).\n\
265 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
266 but it can be set higher than %d if extra-verbose debugging\n\
267 messages are desired.\n\
268 --min-restart-interval\n\
269 Set the minimum seconds to wait between invocations of daemon\n\
270 restart commands (default is %d).\n\
271 --max-restart-interval\n\
272 Set the maximum seconds to wait between invocations of daemon\n\
273 restart commands (default is %d).\n\
274 --operational-timeout\n\
275 Set the time before systemd is notified that we are considered\n\
276 operational again after a daemon restart (default is %d).\n\
277 -i, --interval Set the status polling interval in seconds (default is %d)\n\
278 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
279 -T, --restart-timeout\n\
280 Set the restart (kill) timeout in seconds (default is %d).\n\
281 If any background jobs are still running after this much\n\
282 time has elapsed, they will be killed.\n\
283 -r, --restart Supply a Bourne shell command to use to restart a single\n\
284 daemon. The command string should include '%%s' where the\n\
285 name of the daemon should be substituted.\n\
286 (default: '%s')\n\
287 -s, --start-command\n\
288 Supply a Bourne shell to command to use to start a single\n\
289 daemon. The command string should include '%%s' where the\n\
290 name of the daemon should be substituted.\n\
291 (default: '%s')\n\
292 -k, --kill-command\n\
293 Supply a Bourne shell to command to use to stop a single\n\
294 daemon. The command string should include '%%s' where the\n\
295 name of the daemon should be substituted.\n\
296 (default: '%s')\n\
297 --dry Do not start or restart anything, just log.\n\
298 -p, --pid-file Set process identifier file name\n\
299 (default is %s/watchfrr.pid).\n\
300 -b, --blank-string\n\
301 When the supplied argument string is found in any of the\n\
302 various shell command arguments (-r, -s, or -k), replace\n\
303 it with a space. This is an ugly hack to circumvent problems\n\
304 passing command-line arguments with embedded spaces.\n\
305 -v, --version Print program version\n\
306 -h, --help Display this help and exit\n",
307 frr_vtydir, DEFAULT_LOGLEVEL, LOG_EMERG, LOG_DEBUG, LOG_DEBUG,
308 DEFAULT_MIN_RESTART, DEFAULT_MAX_RESTART,
309 DEFAULT_OPERATIONAL_TIMEOUT, DEFAULT_PERIOD, DEFAULT_TIMEOUT,
310 DEFAULT_RESTART_TIMEOUT, DEFAULT_RESTART_CMD, DEFAULT_START_CMD,
311 DEFAULT_STOP_CMD, frr_vtydir);
312 }
313
314 static pid_t run_background(char *shell_cmd)
315 {
316 pid_t child;
317
318 switch (child = fork()) {
319 case -1:
320 flog_err_sys(EC_LIB_SYSTEM_CALL,
321 "fork failed, cannot run command [%s]: %s",
322 shell_cmd, safe_strerror(errno));
323 return -1;
324 case 0:
325 /* Child process. */
326 /* Use separate process group so child processes can be killed
327 * easily. */
328 if (setpgid(0, 0) < 0)
329 zlog_warn("setpgid(0,0) failed: %s",
330 safe_strerror(errno));
331 {
332 char shell[] = "sh";
333 char dashc[] = "-c";
334 char *const argv[4] = {shell, dashc, shell_cmd, NULL};
335 execv("/bin/sh", argv);
336 flog_err_sys(EC_LIB_SYSTEM_CALL,
337 "execv(/bin/sh -c '%s') failed: %s",
338 shell_cmd, safe_strerror(errno));
339 _exit(127);
340 }
341 default:
342 /* Parent process: we will reap the child later. */
343 zlog_info("Forked background command [pid %d]: %s", (int)child,
344 shell_cmd);
345 return child;
346 }
347 }
348
/*
 * Store the wall-clock time that has passed since *start_time in *result
 * and return result.  The microsecond field is brought back to a
 * non-negative value by borrowing from the seconds field.
 */
static struct timeval *time_elapsed(struct timeval *result,
				    const struct timeval *start_time)
{
	gettimeofday(result, NULL);

	result->tv_sec -= start_time->tv_sec;
	result->tv_usec -= start_time->tv_usec;

	/* Borrow one second at a time until tv_usec is no longer negative. */
	for (; result->tv_usec < 0; result->tv_sec--)
		result->tv_usec += 1000000L;

	return result;
}
361
362 static void restart_kill(struct thread *t_kill)
363 {
364 struct restart_info *restart = THREAD_ARG(t_kill);
365 struct timeval delay;
366
367 time_elapsed(&delay, &restart->time);
368 zlog_warn(
369 "%s %s child process %d still running after %ld seconds, sending signal %d",
370 restart->what, restart->name, (int)restart->pid,
371 (long)delay.tv_sec, (restart->kills ? SIGKILL : SIGTERM));
372 kill(-restart->pid, (restart->kills ? SIGKILL : SIGTERM));
373 restart->kills++;
374 thread_add_timer(master, restart_kill, restart, gs.restart_timeout,
375 &restart->t_kill);
376 }
377
378 static struct restart_info *find_child(pid_t child)
379 {
380 struct daemon *dmn;
381 if (gs.restart.pid == child)
382 return &gs.restart;
383
384 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
385 if (dmn->restart.pid == child)
386 return &dmn->restart;
387 }
388 return NULL;
389 }
390
391 static void sigchild(void)
392 {
393 pid_t child;
394 int status;
395 const char *name;
396 const char *what;
397 struct restart_info *restart;
398 struct daemon *dmn;
399
400 switch (child = waitpid(-1, &status, WNOHANG)) {
401 case -1:
402 flog_err_sys(EC_LIB_SYSTEM_CALL, "waitpid failed: %s",
403 safe_strerror(errno));
404 return;
405 case 0:
406 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
407 return;
408 }
409
410 if (child == integrated_write_pid) {
411 integrated_write_sigchld(status);
412 return;
413 }
414
415 if ((restart = find_child(child)) != NULL) {
416 name = restart->name;
417 what = restart->what;
418 restart->pid = 0;
419 gs.numpids--;
420 thread_cancel(&restart->t_kill);
421
422 /* Update restart time to reflect the time the command
423 * completed. */
424 gettimeofday(&restart->time, NULL);
425 } else {
426 flog_err_sys(
427 EC_LIB_SYSTEM_CALL,
428 "waitpid returned status for an unknown child process %d",
429 (int)child);
430 name = "(unknown)";
431 what = "background";
432 }
433 if (WIFSTOPPED(status))
434 zlog_warn("%s %s process %d is stopped", what, name,
435 (int)child);
436 else if (WIFSIGNALED(status))
437 zlog_warn("%s %s process %d terminated due to signal %d", what,
438 name, (int)child, WTERMSIG(status));
439 else if (WIFEXITED(status)) {
440 if (WEXITSTATUS(status) != 0)
441 zlog_warn(
442 "%s %s process %d exited with non-zero status %d",
443 what, name, (int)child, WEXITSTATUS(status));
444 else {
445 zlog_debug("%s %s process %d exited normally", what,
446 name, (int)child);
447
448 if (restart && restart != &gs.restart) {
449 dmn = container_of(restart, struct daemon,
450 restart);
451 restart_done(dmn);
452 } else if (restart)
453 for (dmn = gs.daemons; dmn; dmn = dmn->next)
454 restart_done(dmn);
455 }
456 } else
457 flog_err_sys(
458 EC_LIB_SYSTEM_CALL,
459 "cannot interpret %s %s process %d wait status 0x%x",
460 what, name, (int)child, status);
461 phase_check();
462 }
463
464 static int run_job(struct restart_info *restart, const char *cmdtype,
465 const char *command, int force, int update_interval)
466 {
467 struct timeval delay;
468
469 if (gs.loglevel > LOG_DEBUG + 1)
470 zlog_debug("attempting to %s %s", cmdtype, restart->name);
471
472 if (restart->pid) {
473 if (gs.loglevel > LOG_DEBUG + 1)
474 zlog_debug(
475 "cannot %s %s, previous pid %d still running",
476 cmdtype, restart->name, (int)restart->pid);
477 return -1;
478 }
479
480 char buffer[512];
481
482 snprintf(buffer, sizeof(buffer), "restarting %s", restart->name);
483 systemd_send_status(buffer);
484
485 /* Note: time_elapsed test must come before the force test, since we
486 need
487 to make sure that delay is initialized for use below in updating the
488 restart interval. */
489 if ((time_elapsed(&delay, &restart->time)->tv_sec < restart->interval)
490 && !force) {
491
492 if (gs.loglevel > LOG_DEBUG + 1)
493 zlog_debug(
494 "postponing %s %s: elapsed time %ld < retry interval %ld",
495 cmdtype, restart->name, (long)delay.tv_sec,
496 restart->interval);
497 return -1;
498 }
499
500 gettimeofday(&restart->time, NULL);
501 restart->kills = 0;
502 {
503 char cmd[strlen(command) + strlen(restart->name) + 1];
504 snprintf(cmd, sizeof(cmd), command, restart->name);
505 if ((restart->pid = run_background(cmd)) > 0) {
506 thread_add_timer(master, restart_kill, restart,
507 gs.restart_timeout, &restart->t_kill);
508 restart->what = cmdtype;
509 gs.numpids++;
510 } else
511 restart->pid = 0;
512 }
513
514 /* Calculate the new restart interval. */
515 if (update_interval) {
516 if (delay.tv_sec > 2 * gs.max_restart_interval)
517 restart->interval = gs.min_restart_interval;
518 else if ((restart->interval *= 2) > gs.max_restart_interval)
519 restart->interval = gs.max_restart_interval;
520 if (gs.loglevel > LOG_DEBUG + 1)
521 zlog_debug("restart %s interval is now %ld",
522 restart->name, restart->interval);
523 }
524 return restart->pid;
525 }
526
/*
 * Scheduling helpers.  Each one clears the daemon's thread reference and
 * then registers the callback again.  The do/while (0) wrappers carry no
 * trailing semicolon, so an invocation followed by ';' is exactly one
 * statement and can be used in an unbraced if/else.
 */

/* Watch the daemon's socket for readable data with handle_read(). */
#define SET_READ_HANDLER(DMN)                                                  \
	do {                                                                   \
		(DMN)->t_read = NULL;                                          \
		thread_add_read(master, handle_read, (DMN), (DMN)->fd,         \
				&(DMN)->t_read);                               \
	} while (0)

/* Run wakeup_down() after roughly one polling period. */
#define SET_WAKEUP_DOWN(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_down, (DMN),              \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Run wakeup_unresponsive() after roughly one polling period. */
#define SET_WAKEUP_UNRESPONSIVE(DMN)                                           \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_unresponsive, (DMN),      \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Run wakeup_send_echo() after roughly one polling period. */
#define SET_WAKEUP_ECHO(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_send_echo, (DMN),         \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)
554
555 static void wakeup_down(struct thread *t_wakeup)
556 {
557 struct daemon *dmn = THREAD_ARG(t_wakeup);
558
559 dmn->t_wakeup = NULL;
560 if (try_connect(dmn) < 0)
561 SET_WAKEUP_DOWN(dmn);
562 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
563 try_restart(dmn);
564 }
565
566 static void wakeup_init(struct thread *t_wakeup)
567 {
568 struct daemon *dmn = THREAD_ARG(t_wakeup);
569
570 dmn->t_wakeup = NULL;
571 if (try_connect(dmn) < 0) {
572 zlog_info(
573 "%s state -> down : initial connection attempt failed",
574 dmn->name);
575 dmn->state = DAEMON_DOWN;
576 }
577 phase_check();
578 }
579
580 static void restart_done(struct daemon *dmn)
581 {
582 if (dmn->state != DAEMON_DOWN) {
583 zlog_warn(
584 "Daemon: %s: is in %s state but expected it to be in DAEMON_DOWN state",
585 dmn->name, state_str[dmn->state]);
586 return;
587 }
588 THREAD_OFF(dmn->t_wakeup);
589
590 if (try_connect(dmn) < 0)
591 SET_WAKEUP_DOWN(dmn);
592 }
593
/*
 * Timer callback armed in daemon_up(): report the operational status to
 * systemd.  The thread argument is not used.
 */
static void daemon_restarting_operational(struct thread *thread)
{
	systemd_send_status("FRR Operational");
}
598
599 static void daemon_down(struct daemon *dmn, const char *why)
600 {
601 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
602 flog_err(EC_WATCHFRR_CONNECTION, "%s state -> down : %s",
603 dmn->name, why);
604 else if (gs.loglevel > LOG_DEBUG)
605 zlog_debug("%s still down : %s", dmn->name, why);
606 if (IS_UP(dmn))
607 gs.numdown++;
608 dmn->state = DAEMON_DOWN;
609 if (dmn->fd >= 0) {
610 close(dmn->fd);
611 dmn->fd = -1;
612 }
613 THREAD_OFF(dmn->t_read);
614 THREAD_OFF(dmn->t_write);
615 THREAD_OFF(dmn->t_wakeup);
616 if (try_connect(dmn) < 0)
617 SET_WAKEUP_DOWN(dmn);
618
619 systemd_send_status("FRR partially operational");
620 phase_check();
621 }
622
623 static void handle_read(struct thread *t_read)
624 {
625 struct daemon *dmn = THREAD_ARG(t_read);
626 static const char resp[sizeof(PING_TOKEN) + 4] = PING_TOKEN "\n";
627 char buf[sizeof(resp) + 100];
628 ssize_t rc;
629 struct timeval delay;
630
631 dmn->t_read = NULL;
632 if ((rc = read(dmn->fd, buf, sizeof(buf))) < 0) {
633 char why[100];
634
635 if (ERRNO_IO_RETRY(errno)) {
636 /* Pretend it never happened. */
637 SET_READ_HANDLER(dmn);
638 return;
639 }
640 snprintf(why, sizeof(why), "unexpected read error: %s",
641 safe_strerror(errno));
642 daemon_down(dmn, why);
643 return;
644 }
645 if (rc == 0) {
646 daemon_down(dmn, "read returned EOF");
647 return;
648 }
649 if (!dmn->echo_sent.tv_sec) {
650 char why[sizeof(buf) + 100];
651 snprintf(why, sizeof(why),
652 "unexpected read returns %d bytes: %.*s", (int)rc,
653 (int)rc, buf);
654 daemon_down(dmn, why);
655 return;
656 }
657
658 /* We are expecting an echo response: is there any chance that the
659 response would not be returned entirely in the first read? That
660 seems inconceivable... */
661 if ((rc != sizeof(resp)) || memcmp(buf, resp, sizeof(resp))) {
662 char why[100 + sizeof(buf)];
663 snprintf(why, sizeof(why),
664 "read returned bad echo response of %d bytes (expecting %u): %.*s",
665 (int)rc, (unsigned int)sizeof(resp), (int)rc, buf);
666 daemon_down(dmn, why);
667 return;
668 }
669
670 time_elapsed(&delay, &dmn->echo_sent);
671 dmn->echo_sent.tv_sec = 0;
672 if (dmn->state == DAEMON_UNRESPONSIVE) {
673 if (delay.tv_sec < gs.timeout) {
674 dmn->state = DAEMON_UP;
675 zlog_warn(
676 "%s state -> up : echo response received after %ld.%06ld seconds",
677 dmn->name, (long)delay.tv_sec,
678 (long)delay.tv_usec);
679 } else
680 zlog_warn(
681 "%s: slow echo response finally received after %ld.%06ld seconds",
682 dmn->name, (long)delay.tv_sec,
683 (long)delay.tv_usec);
684 } else if (gs.loglevel > LOG_DEBUG + 1)
685 zlog_debug("%s: echo response received after %ld.%06ld seconds",
686 dmn->name, (long)delay.tv_sec, (long)delay.tv_usec);
687
688 SET_READ_HANDLER(dmn);
689 thread_cancel(&dmn->t_wakeup);
690 SET_WAKEUP_ECHO(dmn);
691 }
692
693 /*
694 * Wait till we notice that all daemons are ready before
695 * we send we are ready to systemd
696 */
697 static void daemon_send_ready(int exitcode)
698 {
699 FILE *fp;
700 static int sent = 0;
701 char started[1024];
702
703 if (sent)
704 return;
705
706 if (exitcode == 0)
707 zlog_notice("all daemons up, doing startup-complete notify");
708 else if (gs.numdown < gs.numdaemons)
709 flog_err(EC_WATCHFRR_CONNECTION,
710 "startup did not complete within timeout (%d/%d daemons running)",
711 gs.numdaemons - gs.numdown, gs.numdaemons);
712 else {
713 flog_err(EC_WATCHFRR_CONNECTION,
714 "all configured daemons failed to start -- exiting watchfrr");
715 exit(exitcode);
716
717 }
718
719 frr_detach();
720
721 snprintf(started, sizeof(started), "%s/%s", frr_vtydir,
722 "watchfrr.started");
723 fp = fopen(started, "w");
724 if (fp)
725 fclose(fp);
726
727 systemd_send_started(master);
728 systemd_send_status("FRR Operational");
729 sent = 1;
730 }
731
732 static void daemon_up(struct daemon *dmn, const char *why)
733 {
734 dmn->state = DAEMON_UP;
735 gs.numdown--;
736 dmn->connect_tries = 0;
737 zlog_notice("%s state -> up : %s", dmn->name, why);
738 if (gs.numdown == 0) {
739 daemon_send_ready(0);
740
741 THREAD_OFF(gs.t_operational);
742
743 thread_add_timer(master, daemon_restarting_operational, NULL,
744 gs.operational_timeout, &gs.t_operational);
745 }
746
747 SET_WAKEUP_ECHO(dmn);
748 phase_check();
749 }
750
751 static void check_connect(struct thread *t_write)
752 {
753 struct daemon *dmn = THREAD_ARG(t_write);
754 int sockerr;
755 socklen_t reslen = sizeof(sockerr);
756
757 dmn->t_write = NULL;
758 if (getsockopt(dmn->fd, SOL_SOCKET, SO_ERROR, (char *)&sockerr, &reslen)
759 < 0) {
760 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn->name,
761 safe_strerror(errno));
762 daemon_down(dmn,
763 "getsockopt failed checking connection success");
764 return;
765 }
766 if ((reslen == sizeof(sockerr)) && sockerr) {
767 char why[100];
768 snprintf(
769 why, sizeof(why),
770 "getsockopt reports that connection attempt failed: %s",
771 safe_strerror(sockerr));
772 daemon_down(dmn, why);
773 return;
774 }
775
776 daemon_up(dmn, "delayed connect succeeded");
777 }
778
779 static void wakeup_connect_hanging(struct thread *t_wakeup)
780 {
781 struct daemon *dmn = THREAD_ARG(t_wakeup);
782 char why[100];
783
784 dmn->t_wakeup = NULL;
785 snprintf(why, sizeof(why),
786 "connection attempt timed out after %ld seconds", gs.timeout);
787 daemon_down(dmn, why);
788 }
789
790 /* Making connection to protocol daemon. */
791 static int try_connect(struct daemon *dmn)
792 {
793 int sock;
794 struct sockaddr_un addr;
795 socklen_t len;
796
797 if (gs.loglevel > LOG_DEBUG + 1)
798 zlog_debug("%s: attempting to connect", dmn->name);
799 dmn->connect_tries++;
800
801 memset(&addr, 0, sizeof(struct sockaddr_un));
802 addr.sun_family = AF_UNIX;
803 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty", gs.vtydir,
804 dmn->name);
805 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
806 len = addr.sun_len = SUN_LEN(&addr);
807 #else
808 len = sizeof(addr.sun_family) + strlen(addr.sun_path);
809 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
810
811 /* Quick check to see if we might succeed before we go to the trouble
812 of creating a socket. */
813 if (access(addr.sun_path, W_OK) < 0) {
814 if (errno != ENOENT)
815 flog_err_sys(EC_LIB_SYSTEM_CALL,
816 "%s: access to socket %s denied: %s",
817 dmn->name, addr.sun_path,
818 safe_strerror(errno));
819 return -1;
820 }
821
822 if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
823 flog_err_sys(EC_LIB_SOCKET, "%s(%s): cannot make socket: %s",
824 __func__, addr.sun_path, safe_strerror(errno));
825 return -1;
826 }
827
828 if (set_nonblocking(sock) < 0 || set_cloexec(sock) < 0) {
829 flog_err_sys(EC_LIB_SYSTEM_CALL,
830 "%s(%s): set_nonblocking/cloexec(%d) failed",
831 __func__, addr.sun_path, sock);
832 close(sock);
833 return -1;
834 }
835
836 if (connect(sock, (struct sockaddr *)&addr, len) < 0) {
837 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK)) {
838 if (gs.loglevel > LOG_DEBUG)
839 zlog_debug("%s(%s): connect failed: %s",
840 __func__, addr.sun_path,
841 safe_strerror(errno));
842 close(sock);
843 return -1;
844 }
845 if (gs.loglevel > LOG_DEBUG)
846 zlog_debug("%s: connection in progress", dmn->name);
847 dmn->state = DAEMON_CONNECTING;
848 dmn->fd = sock;
849 thread_add_write(master, check_connect, dmn, dmn->fd,
850 &dmn->t_write);
851 thread_add_timer(master, wakeup_connect_hanging, dmn,
852 gs.timeout, &dmn->t_wakeup);
853 SET_READ_HANDLER(dmn);
854 return 0;
855 }
856
857 dmn->fd = sock;
858 SET_READ_HANDLER(dmn);
859 daemon_up(dmn, "connect succeeded");
860 return 1;
861 }
862
863 static void phase_hanging(struct thread *t_hanging)
864 {
865 gs.t_phase_hanging = NULL;
866 flog_err(EC_WATCHFRR_CONNECTION,
867 "Phase [%s] hanging for %ld seconds, aborting phased restart",
868 phase_str[gs.phase], PHASE_TIMEOUT);
869 gs.phase = PHASE_NONE;
870 }
871
872 static void set_phase(enum restart_phase new_phase)
873 {
874 gs.phase = new_phase;
875 thread_cancel(&gs.t_phase_hanging);
876
877 thread_add_timer(master, phase_hanging, NULL, PHASE_TIMEOUT,
878 &gs.t_phase_hanging);
879 }
880
881 static void phase_check(void)
882 {
883 struct daemon *dmn;
884
885 switch (gs.phase) {
886 case PHASE_NONE:
887 break;
888
889 case PHASE_INIT:
890 for (dmn = gs.daemons; dmn; dmn = dmn->next)
891 if (dmn->state == DAEMON_INIT)
892 return;
893
894 /* startup complete, everything out of INIT */
895 gs.phase = PHASE_NONE;
896 for (dmn = gs.daemons; dmn; dmn = dmn->next)
897 if (dmn->state == DAEMON_DOWN) {
898 SET_WAKEUP_DOWN(dmn);
899 try_restart(dmn);
900 }
901 break;
902 case PHASE_STOPS_PENDING:
903 if (gs.numpids)
904 break;
905 zlog_info(
906 "Phased restart: all routing daemon stop jobs have completed.");
907 set_phase(PHASE_WAITING_DOWN);
908
909 /*FALLTHRU*/
910 case PHASE_WAITING_DOWN:
911 if (gs.numdown + IS_UP(gs.special) < gs.numdaemons)
912 break;
913 systemd_send_status("Phased Restart");
914 zlog_info("Phased restart: all routing daemons now down.");
915 run_job(&gs.special->restart, "restart", gs.restart_command, 1,
916 1);
917 set_phase(PHASE_ZEBRA_RESTART_PENDING);
918
919 /*FALLTHRU*/
920 case PHASE_ZEBRA_RESTART_PENDING:
921 if (gs.special->restart.pid)
922 break;
923 systemd_send_status("Zebra Restarting");
924 zlog_info("Phased restart: %s restart job completed.",
925 gs.special->name);
926 set_phase(PHASE_WAITING_ZEBRA_UP);
927
928 /*FALLTHRU*/
929 case PHASE_WAITING_ZEBRA_UP:
930 if (!IS_UP(gs.special))
931 break;
932 zlog_info("Phased restart: %s is now up.", gs.special->name);
933 {
934 struct daemon *dmn;
935 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
936 if (dmn != gs.special)
937 run_job(&dmn->restart, "start",
938 gs.start_command, 1, 0);
939 }
940 }
941 gs.phase = PHASE_NONE;
942 THREAD_OFF(gs.t_phase_hanging);
943 zlog_notice("Phased global restart has completed.");
944 break;
945 }
946 }
947
948 static void try_restart(struct daemon *dmn)
949 {
950 if (watch_only)
951 return;
952
953 if (dmn != gs.special) {
954 if ((gs.special->state == DAEMON_UP)
955 && (gs.phase == PHASE_NONE))
956 run_job(&dmn->restart, "restart", gs.restart_command, 0,
957 1);
958 else
959 zlog_debug(
960 "%s: postponing restart attempt because master %s daemon not up [%s], or phased restart in progress",
961 dmn->name, gs.special->name,
962 state_str[gs.special->state]);
963 return;
964 }
965
966 if ((gs.phase != PHASE_NONE) || gs.numpids) {
967 if (gs.loglevel > LOG_DEBUG + 1)
968 zlog_debug(
969 "postponing phased global restart: restart already in progress [%s], or outstanding child processes [%d]",
970 phase_str[gs.phase], gs.numpids);
971 return;
972 }
973 /* Is it too soon for a restart? */
974 {
975 struct timeval delay;
976 if (time_elapsed(&delay, &gs.special->restart.time)->tv_sec
977 < gs.special->restart.interval) {
978 if (gs.loglevel > LOG_DEBUG + 1)
979 zlog_debug(
980 "postponing phased global restart: elapsed time %ld < retry interval %ld",
981 (long)delay.tv_sec,
982 gs.special->restart.interval);
983 return;
984 }
985 }
986 run_job(&gs.restart, "restart", gs.restart_command, 0, 1);
987 }
988
989 static void wakeup_unresponsive(struct thread *t_wakeup)
990 {
991 struct daemon *dmn = THREAD_ARG(t_wakeup);
992
993 dmn->t_wakeup = NULL;
994 if (dmn->state != DAEMON_UNRESPONSIVE)
995 flog_err(EC_WATCHFRR_CONNECTION,
996 "%s: no longer unresponsive (now %s), wakeup should have been cancelled!",
997 dmn->name, state_str[dmn->state]);
998 else {
999 SET_WAKEUP_UNRESPONSIVE(dmn);
1000 try_restart(dmn);
1001 }
1002 }
1003
1004 static void wakeup_no_answer(struct thread *t_wakeup)
1005 {
1006 struct daemon *dmn = THREAD_ARG(t_wakeup);
1007
1008 dmn->t_wakeup = NULL;
1009 dmn->state = DAEMON_UNRESPONSIVE;
1010 if (dmn->ignore_timeout)
1011 return;
1012 flog_err(EC_WATCHFRR_CONNECTION,
1013 "%s state -> unresponsive : no response yet to ping sent %ld seconds ago",
1014 dmn->name, gs.timeout);
1015 SET_WAKEUP_UNRESPONSIVE(dmn);
1016 try_restart(dmn);
1017 }
1018
1019 static void wakeup_send_echo(struct thread *t_wakeup)
1020 {
1021 static const char echocmd[] = "echo " PING_TOKEN;
1022 ssize_t rc;
1023 struct daemon *dmn = THREAD_ARG(t_wakeup);
1024
1025 dmn->t_wakeup = NULL;
1026 if (((rc = write(dmn->fd, echocmd, sizeof(echocmd))) < 0)
1027 || ((size_t)rc != sizeof(echocmd))) {
1028 char why[100 + sizeof(echocmd)];
1029 snprintf(why, sizeof(why),
1030 "write '%s' returned %d instead of %u", echocmd,
1031 (int)rc, (unsigned int)sizeof(echocmd));
1032 daemon_down(dmn, why);
1033 } else {
1034 gettimeofday(&dmn->echo_sent, NULL);
1035 thread_add_timer(master, wakeup_no_answer, dmn, gs.timeout,
1036 &dmn->t_wakeup);
1037 }
1038 }
1039
1040 bool check_all_up(void)
1041 {
1042 struct daemon *dmn;
1043
1044 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1045 if (dmn->state != DAEMON_UP)
1046 return false;
1047 return true;
1048 }
1049
1050 void watchfrr_status(struct vty *vty)
1051 {
1052 struct daemon *dmn;
1053 struct timeval delay;
1054
1055 vty_out(vty, "watchfrr global phase: %s\n", phase_str[gs.phase]);
1056 vty_out(vty, " Restart Command: %pSQq\n", gs.restart_command);
1057 vty_out(vty, " Start Command: %pSQq\n", gs.start_command);
1058 vty_out(vty, " Stop Command: %pSQq\n", gs.stop_command);
1059 vty_out(vty, " Min Restart Interval: %ld\n", gs.min_restart_interval);
1060 vty_out(vty, " Max Restart Interval: %ld\n", gs.max_restart_interval);
1061 vty_out(vty, " Restart Timeout: %ld\n", gs.restart_timeout);
1062 if (gs.restart.pid)
1063 vty_out(vty, " global restart running, pid %ld\n",
1064 (long)gs.restart.pid);
1065
1066 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1067 vty_out(vty, " %-20s %s%s", dmn->name, state_str[dmn->state],
1068 dmn->ignore_timeout ? "/Ignoring Timeout\n" : "\n");
1069 if (dmn->restart.pid)
1070 vty_out(vty, " restart running, pid %ld\n",
1071 (long)dmn->restart.pid);
1072 else if (dmn->state == DAEMON_DOWN &&
1073 time_elapsed(&delay, &dmn->restart.time)->tv_sec
1074 < dmn->restart.interval)
1075 vty_out(vty, " restarting in %jd seconds (%jds backoff interval)\n",
1076 (intmax_t)dmn->restart.interval
1077 - (intmax_t)delay.tv_sec,
1078 (intmax_t)dmn->restart.interval);
1079 }
1080 }
1081
/*
 * Termination handler, registered for SIGINT and SIGTERM in
 * watchfrr_signals[]: log the event, send the systemd "stopping"
 * notification and exit with status 0.
 */
static void sigint(void)
{
	zlog_notice("Terminating on signal");
	systemd_send_stopping();
	exit(EXIT_SUCCESS);
}
1088
/*
 * Check that a command template is usable with a single "%s" substitution.
 *
 * Returns 1 when cmd contains exactly one '%' character and that character
 * is immediately followed by 's'; returns 0 otherwise, including when cmd
 * is NULL.
 */
static int valid_command(const char *cmd)
{
	const char *percent;

	if (cmd == NULL)
		return 0;

	/* Keep the pointer const: it points into the caller's const string. */
	percent = strchr(cmd, '%');
	if (percent == NULL)
		return 0;

	if (percent[1] != 's')
		return 0;

	/* Any further '%' would be an additional conversion; reject it. */
	return strchr(percent + 1, '%') == NULL;
}
1099
/*
 * This is an ugly hack to circumvent problems with passing command-line
 * arguments that contain spaces.  The fix is to use a configuration file.
 *
 * Returns a newly allocated copy of cmd in which every occurrence of
 * blankstr has been replaced by a single space.  The caller owns the
 * returned string.  Exits the process if the copy cannot be allocated.
 *
 * An empty blankstr, or a blankstr that is itself a single space, leaves
 * the copy unchanged.  Without that guard an empty pattern matches forever
 * while memmove() grows the string past its allocation, and " " keeps
 * matching the space it has just written.
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	char *res;
	char *p;
	size_t bslen = strlen(blankstr);

	res = strdup(cmd);
	if (res == NULL) {
		perror("strdup");
		exit(1);
	}

	/* Nothing to translate; also avoids the non-terminating cases. */
	if (bslen == 0 || strcmp(blankstr, " ") == 0)
		return res;

	while ((p = strstr(res, blankstr)) != NULL) {
		*p = ' ';
		/* Close the gap left by a multi-character pattern. */
		if (bslen != 1)
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
	}
	return res;
}
1119
/*
 * Timer callback armed by watchfrr_init() with STARTUP_TIMEOUT: calls
 * daemon_send_ready(1).  The thread argument is not used.
 * NOTE(review): the meaning of the argument 1 is defined by
 * daemon_send_ready(), which is outside this section.
 */
static void startup_timeout(struct thread *t_wakeup)
{
	(void)t_wakeup;

	daemon_send_ready(1);
}
1124
1125 #ifdef GNU_LINUX
1126
1127 #include <sys/mount.h>
1128 #include <sched.h>
1129
1130 #define NETNS_RUN_DIR "/var/run/netns"
1131
1132 static void netns_create(int dirfd, const char *nsname)
1133 {
1134 /* make /var/run/netns shared between mount namespaces
1135 * just like iproute2 sets it up
1136 */
1137 if (mount("", NETNS_RUN_DIR, "none", MS_SHARED | MS_REC, NULL)) {
1138 if (errno != EINVAL) {
1139 perror("mount");
1140 exit(1);
1141 }
1142
1143 if (mount(NETNS_RUN_DIR, NETNS_RUN_DIR, "none",
1144 MS_BIND | MS_REC, NULL)) {
1145 perror("mount");
1146 exit(1);
1147 }
1148
1149 if (mount("", NETNS_RUN_DIR, "none", MS_SHARED | MS_REC,
1150 NULL)) {
1151 perror("mount");
1152 exit(1);
1153 }
1154 }
1155
1156 /* need an empty file to mount on top of */
1157 int nsfd = openat(dirfd, nsname, O_CREAT | O_RDONLY | O_EXCL, 0);
1158
1159 if (nsfd < 0) {
1160 fprintf(stderr, "failed to create \"%s/%s\": %s\n",
1161 NETNS_RUN_DIR, nsname, strerror(errno));
1162 exit(1);
1163 }
1164 close(nsfd);
1165
1166 if (unshare(CLONE_NEWNET)) {
1167 perror("unshare");
1168 unlinkat(dirfd, nsname, 0);
1169 exit(1);
1170 }
1171
1172 char *dstpath = asprintfrr(MTYPE_TMP, "%s/%s", NETNS_RUN_DIR, nsname);
1173
1174 /* bind-mount so the namespace has a name and is persistent */
1175 if (mount("/proc/self/ns/net", dstpath, "none", MS_BIND, NULL) < 0) {
1176 fprintf(stderr, "failed to bind-mount netns to \"%s\": %s\n",
1177 dstpath, strerror(errno));
1178 unlinkat(dirfd, nsname, 0);
1179 exit(1);
1180 }
1181
1182 XFREE(MTYPE_TMP, dstpath);
1183 }
1184
1185 static void netns_setup(const char *nsname)
1186 {
1187 int dirfd, nsfd;
1188
1189 dirfd = open(NETNS_RUN_DIR, O_DIRECTORY | O_RDONLY);
1190 if (dirfd < 0) {
1191 if (errno == ENOTDIR) {
1192 fprintf(stderr, "error: \"%s\" is not a directory!\n",
1193 NETNS_RUN_DIR);
1194 exit(1);
1195 } else if (errno == ENOENT) {
1196 if (mkdir(NETNS_RUN_DIR, 0755)) {
1197 fprintf(stderr, "error: \"%s\": mkdir: %s\n",
1198 NETNS_RUN_DIR, strerror(errno));
1199 exit(1);
1200 }
1201 dirfd = open(NETNS_RUN_DIR, O_DIRECTORY | O_RDONLY);
1202 if (dirfd < 0) {
1203 fprintf(stderr, "error: \"%s\": opendir: %s\n",
1204 NETNS_RUN_DIR, strerror(errno));
1205 exit(1);
1206 }
1207 } else {
1208 fprintf(stderr, "error: \"%s\": %s\n",
1209 NETNS_RUN_DIR, strerror(errno));
1210 exit(1);
1211 }
1212 }
1213
1214 nsfd = openat(dirfd, nsname, O_RDONLY);
1215 if (nsfd < 0 && errno != ENOENT) {
1216 fprintf(stderr, "error: \"%s/%s\": %s\n",
1217 NETNS_RUN_DIR, nsname, strerror(errno));
1218 exit(1);
1219 }
1220 if (nsfd < 0)
1221 netns_create(dirfd, nsname);
1222 else {
1223 if (setns(nsfd, CLONE_NEWNET)) {
1224 perror("setns");
1225 exit(1);
1226 }
1227 close(nsfd);
1228 }
1229 close(dirfd);
1230
1231 /* make sure loopback is up... weird things happen otherwise.
1232 * ioctl is perfectly fine for this, don't need netlink...
1233 */
1234 int sockfd;
1235 struct ifreq ifr = { };
1236
1237 strlcpy(ifr.ifr_name, "lo", sizeof(ifr.ifr_name));
1238
1239 sockfd = socket(AF_INET, SOCK_DGRAM, 0);
1240 if (sockfd < 0) {
1241 perror("socket");
1242 exit(1);
1243 }
1244 if (ioctl(sockfd, SIOCGIFFLAGS, &ifr)) {
1245 perror("ioctl(SIOCGIFFLAGS, \"lo\")");
1246 exit(1);
1247 }
1248 if (!(ifr.ifr_flags & IFF_UP)) {
1249 ifr.ifr_flags |= IFF_UP;
1250 if (ioctl(sockfd, SIOCSIFFLAGS, &ifr)) {
1251 perror("ioctl(SIOCSIFFLAGS, \"lo\")");
1252 exit(1);
1253 }
1254 }
1255 close(sockfd);
1256 }
1257
1258 #else /* !GNU_LINUX */
1259
/*
 * Non-Linux stand-in for netns_setup(): network namespaces are not
 * available here, so report that on stderr and exit with status 1.
 */
static void netns_setup(const char *nsname)
{
	fputs("network namespaces are only available on Linux\n", stderr);
	exit(1);
}
1265 #endif
1266
1267 static void watchfrr_init(int argc, char **argv)
1268 {
1269 const char *special = "zebra";
1270 int i;
1271 struct daemon *dmn, **add = &gs.daemons;
1272 char alldaemons[512] = "", *p = alldaemons;
1273
1274 thread_add_timer_msec(master, startup_timeout, NULL, STARTUP_TIMEOUT,
1275 &gs.t_startup_timeout);
1276
1277 for (i = optind; i < argc; i++) {
1278 dmn = XCALLOC(MTYPE_WATCHFRR_DAEMON, sizeof(*dmn));
1279
1280 dmn->name = dmn->restart.name = argv[i];
1281 dmn->state = DAEMON_INIT;
1282 gs.numdaemons++;
1283 gs.numdown++;
1284 dmn->fd = -1;
1285 thread_add_timer_msec(master, wakeup_init, dmn, 0,
1286 &dmn->t_wakeup);
1287 dmn->restart.interval = gs.min_restart_interval;
1288 *add = dmn;
1289 add = &dmn->next;
1290
1291 if (!strcmp(dmn->name, special))
1292 gs.special = dmn;
1293 }
1294
1295 if (!gs.daemons) {
1296 fprintf(stderr,
1297 "Must specify one or more daemons to monitor.\n\n");
1298 frr_help_exit(1);
1299 }
1300 if (!watch_only && !gs.special) {
1301 fprintf(stderr, "\"%s\" daemon must be in daemon lists\n\n",
1302 special);
1303 frr_help_exit(1);
1304 }
1305
1306 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1307 snprintf(p, alldaemons + sizeof(alldaemons) - p, "%s%s",
1308 (p == alldaemons) ? "" : " ", dmn->name);
1309 p += strlen(p);
1310 }
1311 zlog_notice("%s %s watching [%s]%s", progname, FRR_VERSION, alldaemons,
1312 watch_only ? ", monitor mode" : "");
1313 }
1314
1315 struct zebra_privs_t watchfrr_privs = {
1316 #ifdef VTY_GROUP
1317 .vty_group = VTY_GROUP,
1318 #endif
1319 };
1320
1321 static struct frr_signal_t watchfrr_signals[] = {
1322 {
1323 .signal = SIGINT,
1324 .handler = sigint,
1325 },
1326 {
1327 .signal = SIGTERM,
1328 .handler = sigint,
1329 },
1330 {
1331 .signal = SIGCHLD,
1332 .handler = sigchild,
1333 },
1334 };
1335
1336 FRR_DAEMON_INFO(watchfrr, WATCHFRR,
1337 .flags = FRR_NO_PRIVSEP | FRR_NO_TCPVTY | FRR_LIMITED_CLI
1338 | FRR_NO_CFG_PID_DRY | FRR_NO_ZCLIENT
1339 | FRR_DETACH_LATER,
1340
1341 .printhelp = printhelp,
1342 .copyright = "Copyright 2004 Andrew J. Schorr",
1343
1344 .signals = watchfrr_signals,
1345 .n_signals = array_size(watchfrr_signals),
1346
1347 .privs = &watchfrr_privs,
1348 );
1349
1350 #define DEPRECATED_OPTIONS "aAezR:"
1351
1352 int main(int argc, char **argv)
1353 {
1354 int opt;
1355 const char *blankstr = NULL;
1356 const char *netns = NULL;
1357 bool netns_en = false;
1358
1359 frr_preinit(&watchfrr_di, argc, argv);
1360 progname = watchfrr_di.progname;
1361
1362 frr_opt_add("b:di:k:l:N:p:r:S:s:t:T:" DEPRECATED_OPTIONS, longopts, "");
1363
1364 gs.restart.name = "all";
1365 while ((opt = frr_getopt(argc, argv, NULL)) != EOF) {
1366 if (opt && opt < 128 && strchr(DEPRECATED_OPTIONS, opt)) {
1367 fprintf(stderr,
1368 "The -%c option no longer exists.\n"
1369 "Please refer to the watchfrr(8) man page.\n",
1370 opt);
1371 exit(1);
1372 }
1373
1374 switch (opt) {
1375 case 0:
1376 break;
1377 case 'b':
1378 blankstr = optarg;
1379 break;
1380 case OPTION_DRY:
1381 watch_only = true;
1382 break;
1383 case 'k':
1384 if (!valid_command(optarg)) {
1385 fprintf(stderr,
1386 "Invalid kill command, must contain '%%s': %s\n",
1387 optarg);
1388 frr_help_exit(1);
1389 }
1390 gs.stop_command = optarg;
1391 break;
1392 case 'l': {
1393 char garbage[3];
1394 if ((sscanf(optarg, "%d%1s", &gs.loglevel, garbage)
1395 != 1)
1396 || (gs.loglevel < LOG_EMERG)) {
1397 fprintf(stderr,
1398 "Invalid loglevel argument: %s\n",
1399 optarg);
1400 frr_help_exit(1);
1401 }
1402 } break;
1403 case OPTION_MINRESTART: {
1404 char garbage[3];
1405 if ((sscanf(optarg, "%ld%1s", &gs.min_restart_interval,
1406 garbage)
1407 != 1)
1408 || (gs.min_restart_interval < 0)) {
1409 fprintf(stderr,
1410 "Invalid min_restart_interval argument: %s\n",
1411 optarg);
1412 frr_help_exit(1);
1413 }
1414 } break;
1415 case OPTION_MAXRESTART: {
1416 char garbage[3];
1417 if ((sscanf(optarg, "%ld%1s", &gs.max_restart_interval,
1418 garbage)
1419 != 1)
1420 || (gs.max_restart_interval < 0)) {
1421 fprintf(stderr,
1422 "Invalid max_restart_interval argument: %s\n",
1423 optarg);
1424 frr_help_exit(1);
1425 }
1426 } break;
1427 case OPTION_MAXOPERATIONAL: {
1428 char garbage[3];
1429
1430 if ((sscanf(optarg, "%ld%1s", &gs.operational_timeout,
1431 garbage) != 1) ||
1432 (gs.max_restart_interval < 0)) {
1433 fprintf(stderr,
1434 "Invalid Operational_timeout argument: %s\n",
1435 optarg);
1436 frr_help_exit(1);
1437 }
1438 } break;
1439 case OPTION_NETNS:
1440 netns_en = true;
1441 if (optarg && strchr(optarg, '/')) {
1442 fprintf(stderr,
1443 "invalid network namespace name \"%s\" (may not contain slashes)\n",
1444 optarg);
1445 frr_help_exit(1);
1446 }
1447 netns = optarg;
1448 break;
1449 case 'i': {
1450 char garbage[3];
1451 int period;
1452 if ((sscanf(optarg, "%d%1s", &period, garbage) != 1)
1453 || (gs.period < 1)) {
1454 fprintf(stderr,
1455 "Invalid interval argument: %s\n",
1456 optarg);
1457 frr_help_exit(1);
1458 }
1459 gs.period = 1000 * period;
1460 } break;
1461 case 'p':
1462 watchfrr_di.pid_file = optarg;
1463 break;
1464 case 'r':
1465 if (!valid_command(optarg)) {
1466 fprintf(stderr,
1467 "Invalid restart command, must contain '%%s': %s\n",
1468 optarg);
1469 frr_help_exit(1);
1470 }
1471 gs.restart_command = optarg;
1472 break;
1473 case 's':
1474 if (!valid_command(optarg)) {
1475 fprintf(stderr,
1476 "Invalid start command, must contain '%%s': %s\n",
1477 optarg);
1478 frr_help_exit(1);
1479 }
1480 gs.start_command = optarg;
1481 break;
1482 case 'S':
1483 gs.vtydir = optarg;
1484 break;
1485 case 't': {
1486 char garbage[3];
1487 if ((sscanf(optarg, "%ld%1s", &gs.timeout, garbage)
1488 != 1)
1489 || (gs.timeout < 1)) {
1490 fprintf(stderr,
1491 "Invalid timeout argument: %s\n",
1492 optarg);
1493 frr_help_exit(1);
1494 }
1495 } break;
1496 case 'T': {
1497 char garbage[3];
1498 if ((sscanf(optarg, "%ld%1s", &gs.restart_timeout,
1499 garbage)
1500 != 1)
1501 || (gs.restart_timeout < 1)) {
1502 fprintf(stderr,
1503 "Invalid restart timeout argument: %s\n",
1504 optarg);
1505 frr_help_exit(1);
1506 }
1507 } break;
1508 default:
1509 fputs("Invalid option.\n", stderr);
1510 frr_help_exit(1);
1511 }
1512 }
1513
1514 if (watch_only
1515 && (gs.start_command || gs.stop_command || gs.restart_command)) {
1516 fputs("Options -r/-s/-k are not used when --dry is active.\n",
1517 stderr);
1518 }
1519 if (!watch_only
1520 && (!gs.restart_command || !gs.start_command || !gs.stop_command)) {
1521 fprintf(stderr,
1522 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1523 frr_help_exit(1);
1524 }
1525
1526 if (blankstr) {
1527 if (gs.restart_command)
1528 gs.restart_command =
1529 translate_blanks(gs.restart_command, blankstr);
1530 if (gs.start_command)
1531 gs.start_command =
1532 translate_blanks(gs.start_command, blankstr);
1533 if (gs.stop_command)
1534 gs.stop_command =
1535 translate_blanks(gs.stop_command, blankstr);
1536 }
1537
1538 gs.restart.interval = gs.min_restart_interval;
1539
1540 /* env variable for the processes that we start */
1541 if (watchfrr_di.pathspace)
1542 setenv("FRR_PATHSPACE", watchfrr_di.pathspace, 1);
1543 else
1544 unsetenv("FRR_PATHSPACE");
1545
1546 /*
1547 * when watchfrr_di.pathspace is read, if it is not specified
1548 * pathspace is NULL as expected
1549 */
1550 pathspace = watchfrr_di.pathspace;
1551
1552 if (netns_en && !netns)
1553 netns = watchfrr_di.pathspace;
1554
1555 if (netns_en && netns && netns[0])
1556 netns_setup(netns);
1557
1558 master = frr_init();
1559 watchfrr_error_init();
1560 watchfrr_init(argc, argv);
1561 watchfrr_vty_init();
1562
1563 frr_config_fork();
1564
1565 if (watchfrr_di.daemon_mode)
1566 zlog_syslog_set_prio_min(MIN(gs.loglevel, LOG_DEBUG));
1567 else
1568 zlog_aux_init(NULL, MIN(gs.loglevel, LOG_DEBUG));
1569
1570 frr_run(master);
1571
1572 systemd_send_stopping();
1573 /* Not reached. */
1574 return 0;
1575 }