]> git.proxmox.com Git - mirror_frr.git/blob - watchfrr/watchfrr.c
Merge pull request #12703 from donaldsharp/basic_babel
[mirror_frr.git] / watchfrr / watchfrr.c
1 /*
2 * Monitor status of frr daemons and restart if necessary.
3 *
4 * Copyright (C) 2004 Andrew J. Schorr
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include <zebra.h>
22 #include <thread.h>
23 #include <log.h>
24 #include <network.h>
25 #include <sigevent.h>
26 #include <lib/version.h>
27 #include "command.h"
28 #include "libfrr.h"
29 #include "lib_errors.h"
30 #include "zlog_targets.h"
31 #include "network.h"
32 #include "printfrr.h"
33
34 #include <getopt.h>
35 #include <sys/un.h>
36 #include <sys/wait.h>
37 #include <memory.h>
38 #include <systemd.h>
39
40 #include "watchfrr.h"
41 #include "watchfrr_errors.h"
42
#ifndef MIN
/* Smaller of two values; note that each argument may be evaluated twice. */
#define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
#endif

/*
 * Macros to help randomize timers.
 *
 * JITTER(X) reduces frr_weak_random() modulo X+1 and shifts it down by X/2;
 * FUZZY(X) adds JITTER(X/20) to X.
 */
#define JITTER(X) ((frr_weak_random() % ((X)+1))-((X)/2))
#define FUZZY(X) ((X)+JITTER((X)/20))

/* Built-in defaults; the help text reports all of them in seconds
 * except the log level. */
#define DEFAULT_PERIOD 5
#define DEFAULT_TIMEOUT 90
#define DEFAULT_RESTART_TIMEOUT 20
#define DEFAULT_LOGLEVEL LOG_INFO
#define DEFAULT_MIN_RESTART 60
#define DEFAULT_MAX_RESTART 600
#define DEFAULT_OPERATIONAL_TIMEOUT 60

/* Default shell command templates; "%s" is replaced by the daemon name. */
#define DEFAULT_RESTART_CMD WATCHFRR_SH_PATH " restart %s"
#define DEFAULT_START_CMD WATCHFRR_SH_PATH " start %s"
#define DEFAULT_STOP_CMD WATCHFRR_SH_PATH " stop %s"

/* Token sent with the "echo" command and expected back in the reply. */
#define PING_TOKEN "PING"
64
65 DEFINE_MGROUP(WATCHFRR, "watchfrr");
66 DEFINE_MTYPE_STATIC(WATCHFRR, WATCHFRR_DAEMON, "watchfrr daemon entry");
67
68 /* Needs to be global, referenced somewhere inside libfrr. */
69 struct thread_master *master;
70
71 static bool watch_only = false;
72 const char *pathspace;
73
/* Steps of the global ("phased") restart state machine, see phase_check(). */
enum restart_phase {
	PHASE_NONE = 0,
	PHASE_INIT,
	PHASE_STOPS_PENDING,
	PHASE_WAITING_DOWN,
	PHASE_ZEBRA_RESTART_PENDING,
	PHASE_WAITING_ZEBRA_UP
};

/*
 * Human readable phase names, indexed by enum restart_phase.
 *
 * Designated initializers bind each string to its enumerator, so the table
 * cannot silently drift when the enum changes.  The former trailing
 * "Start jobs running" entry had no matching enumerator and was dropped.
 */
static const char *const phase_str[] = {
	[PHASE_NONE] = "Idle",
	[PHASE_INIT] = "Startup",
	[PHASE_STOPS_PENDING] = "Stop jobs running",
	[PHASE_WAITING_DOWN] = "Waiting for other daemons to come down",
	[PHASE_ZEBRA_RESTART_PENDING] = "Zebra restart job running",
	[PHASE_WAITING_ZEBRA_UP] = "Waiting for zebra to come up",
};
92
/* Time a restart phase may last before phase_hanging() aborts it. */
#define PHASE_TIMEOUT (3*gs.restart_timeout)
/* Parenthesized so the product survives use inside larger expressions. */
#define STARTUP_TIMEOUT (55 * 1000)
95
/* Book-keeping for one background start/stop/restart shell job. */
struct restart_info {
	const char *name;    /* name substituted into the command template */
	const char *what;    /* job type string, set by run_job() */
	pid_t pid;	     /* running job's pid, 0 when none */
	struct timeval time; /* when the job was started or last reaped */
	long interval;	     /* minimum seconds between two jobs */
	struct thread *t_kill; /* timer that fires restart_kill() */
	int kills;	     /* signals sent to the job so far */
};
105
106 static struct global_state {
107 enum restart_phase phase;
108 struct thread *t_phase_hanging;
109 struct thread *t_startup_timeout;
110 struct thread *t_operational;
111 const char *vtydir;
112 long period;
113 long timeout;
114 long restart_timeout;
115 bool reading_configuration;
116 long min_restart_interval;
117 long max_restart_interval;
118 long operational_timeout;
119 struct daemon *daemons;
120 const char *restart_command;
121 const char *start_command;
122 const char *stop_command;
123 struct restart_info restart;
124 int loglevel;
125 struct daemon *special; /* points to zebra when doing phased restart */
126 int numdaemons;
127 int numpids;
128 int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
129 } gs = {
130 .phase = PHASE_INIT,
131 .vtydir = frr_vtydir,
132 .period = 1000 * DEFAULT_PERIOD,
133 .timeout = DEFAULT_TIMEOUT,
134 .restart_timeout = DEFAULT_RESTART_TIMEOUT,
135 .loglevel = DEFAULT_LOGLEVEL,
136 .min_restart_interval = DEFAULT_MIN_RESTART,
137 .max_restart_interval = DEFAULT_MAX_RESTART,
138 .operational_timeout = DEFAULT_OPERATIONAL_TIMEOUT,
139 .restart_command = DEFAULT_RESTART_CMD,
140 .start_command = DEFAULT_START_CMD,
141 .stop_command = DEFAULT_STOP_CMD,
142 };
143
/* Connection state of one monitored daemon. */
enum daemon_state {
	DAEMON_INIT,
	DAEMON_DOWN,
	DAEMON_CONNECTING,
	DAEMON_UP,
	DAEMON_UNRESPONSIVE
};

/* True while the daemon is connected, whether or not it answers pings. */
#define IS_UP(DMN) \
	(((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))

/* Display names, indexed by enum daemon_state. */
static const char *const state_str[] = {
	[DAEMON_INIT] = "Init",
	[DAEMON_DOWN] = "Down",
	[DAEMON_CONNECTING] = "Connecting",
	[DAEMON_UP] = "Up",
	[DAEMON_UNRESPONSIVE] = "Unresponsive",
};
158
159 struct daemon {
160 const char *name;
161 enum daemon_state state;
162 int fd;
163 struct timeval echo_sent;
164 unsigned int connect_tries;
165 struct thread *t_wakeup;
166 struct thread *t_read;
167 struct thread *t_write;
168 struct daemon *next;
169 struct restart_info restart;
170
171 /*
172 * For a given daemon, if we've turned on ignore timeouts
173 * ignore the timeout value and assume everything is ok
174 * This is for daemon debugging w/ gdb after we have started
175 * FRR and realize we have something that needs to be looked
176 * at
177 */
178 bool ignore_timeout;
179 };
180
/* getopt_long() return codes for options that have no short form. */
#define OPTION_MINRESTART 2000
#define OPTION_MAXRESTART 2001
#define OPTION_DRY 2002
#define OPTION_NETNS 2003
#define OPTION_MAXOPERATIONAL 2004

/* Long command line options; the all-zero entry terminates the table. */
static const struct option longopts[] = {
	{"daemon", no_argument, NULL, 'd'},
	{"statedir", required_argument, NULL, 'S'},
	{"loglevel", required_argument, NULL, 'l'},
	{"interval", required_argument, NULL, 'i'},
	{"timeout", required_argument, NULL, 't'},
	{"restart-timeout", required_argument, NULL, 'T'},
	{"restart", required_argument, NULL, 'r'},
	{"start-command", required_argument, NULL, 's'},
	{"kill-command", required_argument, NULL, 'k'},
	{"dry", no_argument, NULL, OPTION_DRY},
	{"min-restart-interval", required_argument, NULL, OPTION_MINRESTART},
	{"max-restart-interval", required_argument, NULL, OPTION_MAXRESTART},
	{"operational-timeout", required_argument, NULL,
	 OPTION_MAXOPERATIONAL},
	{"pid-file", required_argument, NULL, 'p'},
	{"blank-string", required_argument, NULL, 'b'},
#ifdef GNU_LINUX
	{"netns", optional_argument, NULL, OPTION_NETNS},
#endif
	{"help", no_argument, NULL, 'h'},
	{"version", no_argument, NULL, 'v'},
	{NULL, 0, NULL, 0},
};
209
/* Forward declarations for functions used before their definition. */
static int try_connect(struct daemon *dmn);
static void wakeup_send_echo(struct thread *t_wakeup);
static void try_restart(struct daemon *dmn);
static void phase_check(void);
static void restart_done(struct daemon *dmn);

/* Program name printed in the usage line. */
static const char *progname;
217
218 void watchfrr_set_ignore_daemon(struct vty *vty, const char *dname, bool ignore)
219 {
220 struct daemon *dmn;
221
222 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
223 if (strncmp(dmn->name, dname, strlen(dmn->name)) == 0)
224 break;
225 }
226
227 if (dmn) {
228 dmn->ignore_timeout = ignore;
229 vty_out(vty, "%s switching to %s\n", dmn->name,
230 ignore ? "ignore" : "watch");
231 } else
232 vty_out(vty, "%s is not configured for running at the moment",
233 dname);
234 }
235
236 static void printhelp(FILE *target)
237 {
238 fprintf(target,
239 "Usage : %s [OPTION...] <daemon name> ...\n\n\
240 Watchdog program to monitor status of frr daemons and try to restart\n\
241 them if they are down or unresponsive. It determines whether a daemon is\n\
242 up based on whether it can connect to the daemon's vty unix stream socket.\n\
243 It then repeatedly sends echo commands over that socket to determine whether\n\
244 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
245 on the socket connection and know immediately that the daemon is down.\n\n\
246 The daemons to be monitored should be listed on the command line.\n\n\
247 In order to avoid attempting to restart the daemons in a fast loop,\n\
248 the -m and -M options allow you to control the minimum delay between\n\
249 restart commands. The minimum restart delay is recalculated each time\n\
250 a restart is attempted: if the time since the last restart attempt exceeds\n\
251 twice the -M value, then the restart delay is set to the -m value.\n\
252 Otherwise, the interval is doubled (but capped at the -M value).\n\n",
253 progname);
254
255 fprintf(target,
256 "Options:\n\
257 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
258 to syslog instead of stdout.\n\
259 -S, --statedir Set the vty socket directory (default is %s)\n\
260 -N, --pathspace Insert prefix into config & socket paths\n"
261 #ifdef GNU_LINUX
262 " --netns Create and/or use Linux network namespace. If no name is\n"
263 " given, uses the value from `-N`.\n"
264 #endif
265 "-l, --loglevel Set the logging level (default is %d).\n\
266 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
267 but it can be set higher than %d if extra-verbose debugging\n\
268 messages are desired.\n\
269 --min-restart-interval\n\
270 Set the minimum seconds to wait between invocations of daemon\n\
271 restart commands (default is %d).\n\
272 --max-restart-interval\n\
273 Set the maximum seconds to wait between invocations of daemon\n\
274 restart commands (default is %d).\n\
275 --operational-timeout\n\
276 Set the time before systemd is notified that we are considered\n\
277 operational again after a daemon restart (default is %d).\n\
278 -i, --interval Set the status polling interval in seconds (default is %d)\n\
279 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
280 -T, --restart-timeout\n\
281 Set the restart (kill) timeout in seconds (default is %d).\n\
282 If any background jobs are still running after this much\n\
283 time has elapsed, they will be killed.\n\
284 -r, --restart Supply a Bourne shell command to use to restart a single\n\
285 daemon. The command string should include '%%s' where the\n\
286 name of the daemon should be substituted.\n\
287 (default: '%s')\n\
288 -s, --start-command\n\
289 Supply a Bourne shell to command to use to start a single\n\
290 daemon. The command string should include '%%s' where the\n\
291 name of the daemon should be substituted.\n\
292 (default: '%s')\n\
293 -k, --kill-command\n\
294 Supply a Bourne shell to command to use to stop a single\n\
295 daemon. The command string should include '%%s' where the\n\
296 name of the daemon should be substituted.\n\
297 (default: '%s')\n\
298 --dry Do not start or restart anything, just log.\n\
299 -p, --pid-file Set process identifier file name\n\
300 (default is %s/watchfrr.pid).\n\
301 -b, --blank-string\n\
302 When the supplied argument string is found in any of the\n\
303 various shell command arguments (-r, -s, or -k), replace\n\
304 it with a space. This is an ugly hack to circumvent problems\n\
305 passing command-line arguments with embedded spaces.\n\
306 -v, --version Print program version\n\
307 -h, --help Display this help and exit\n",
308 frr_vtydir, DEFAULT_LOGLEVEL, LOG_EMERG, LOG_DEBUG, LOG_DEBUG,
309 DEFAULT_MIN_RESTART, DEFAULT_MAX_RESTART,
310 DEFAULT_OPERATIONAL_TIMEOUT, DEFAULT_PERIOD, DEFAULT_TIMEOUT,
311 DEFAULT_RESTART_TIMEOUT, DEFAULT_RESTART_CMD, DEFAULT_START_CMD,
312 DEFAULT_STOP_CMD, frr_vtydir);
313 }
314
315 static pid_t run_background(char *shell_cmd)
316 {
317 pid_t child;
318
319 switch (child = fork()) {
320 case -1:
321 flog_err_sys(EC_LIB_SYSTEM_CALL,
322 "fork failed, cannot run command [%s]: %s",
323 shell_cmd, safe_strerror(errno));
324 return -1;
325 case 0:
326 /* Child process. */
327 /* Use separate process group so child processes can be killed
328 * easily. */
329 if (setpgid(0, 0) < 0)
330 zlog_warn("setpgid(0,0) failed: %s",
331 safe_strerror(errno));
332 {
333 char shell[] = "sh";
334 char dashc[] = "-c";
335 char *const argv[4] = {shell, dashc, shell_cmd, NULL};
336 execv("/bin/sh", argv);
337 flog_err_sys(EC_LIB_SYSTEM_CALL,
338 "execv(/bin/sh -c '%s') failed: %s",
339 shell_cmd, safe_strerror(errno));
340 _exit(127);
341 }
342 default:
343 /* Parent process: we will reap the child later. */
344 zlog_info("Forked background command [pid %d]: %s", (int)child,
345 shell_cmd);
346 return child;
347 }
348 }
349
/*
 * Store "now - start_time" in *result and return result.
 *
 * The current time is taken from gettimeofday(); the microsecond field of
 * the difference is normalised to be non-negative by borrowing from the
 * seconds field.
 */
static struct timeval *time_elapsed(struct timeval *result,
				    const struct timeval *start_time)
{
	gettimeofday(result, NULL);

	result->tv_sec -= start_time->tv_sec;
	result->tv_usec -= start_time->tv_usec;

	/* Borrow one second for as long as the microseconds are negative. */
	for (; result->tv_usec < 0; result->tv_sec--)
		result->tv_usec += 1000000L;

	return result;
}
362
363 static void restart_kill(struct thread *t_kill)
364 {
365 struct restart_info *restart = THREAD_ARG(t_kill);
366 struct timeval delay;
367
368 time_elapsed(&delay, &restart->time);
369
370 if (gs.reading_configuration) {
371 zlog_err(
372 "%s %s child process appears to still be reading configuration, delaying for another %lu time",
373 restart->what, restart->name, gs.restart_timeout);
374 thread_add_timer(master, restart_kill, restart,
375 gs.restart_timeout, &restart->t_kill);
376 return;
377 }
378
379 zlog_warn(
380 "%s %s child process %d still running after %ld seconds, sending signal %d",
381 restart->what, restart->name, (int)restart->pid,
382 (long)delay.tv_sec, (restart->kills ? SIGKILL : SIGTERM));
383 kill(-restart->pid, (restart->kills ? SIGKILL : SIGTERM));
384 restart->kills++;
385 thread_add_timer(master, restart_kill, restart, gs.restart_timeout,
386 &restart->t_kill);
387 }
388
389 static struct restart_info *find_child(pid_t child)
390 {
391 struct daemon *dmn;
392 if (gs.restart.pid == child)
393 return &gs.restart;
394
395 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
396 if (dmn->restart.pid == child)
397 return &dmn->restart;
398 }
399 return NULL;
400 }
401
402 static void sigchild(void)
403 {
404 pid_t child;
405 int status;
406 const char *name;
407 const char *what;
408 struct restart_info *restart;
409 struct daemon *dmn;
410
411 switch (child = waitpid(-1, &status, WNOHANG)) {
412 case -1:
413 flog_err_sys(EC_LIB_SYSTEM_CALL, "waitpid failed: %s",
414 safe_strerror(errno));
415 return;
416 case 0:
417 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
418 return;
419 }
420
421 if (child == integrated_write_pid) {
422 integrated_write_sigchld(status);
423 return;
424 }
425
426 if ((restart = find_child(child)) != NULL) {
427 name = restart->name;
428 what = restart->what;
429 restart->pid = 0;
430 gs.numpids--;
431 thread_cancel(&restart->t_kill);
432
433 /* Update restart time to reflect the time the command
434 * completed. */
435 gettimeofday(&restart->time, NULL);
436 } else {
437 flog_err_sys(
438 EC_LIB_SYSTEM_CALL,
439 "waitpid returned status for an unknown child process %d",
440 (int)child);
441 name = "(unknown)";
442 what = "background";
443 }
444 if (WIFSTOPPED(status))
445 zlog_warn("%s %s process %d is stopped", what, name,
446 (int)child);
447 else if (WIFSIGNALED(status))
448 zlog_warn("%s %s process %d terminated due to signal %d", what,
449 name, (int)child, WTERMSIG(status));
450 else if (WIFEXITED(status)) {
451 if (WEXITSTATUS(status) != 0)
452 zlog_warn(
453 "%s %s process %d exited with non-zero status %d",
454 what, name, (int)child, WEXITSTATUS(status));
455 else {
456 zlog_debug("%s %s process %d exited normally", what,
457 name, (int)child);
458
459 if (restart && restart != &gs.restart) {
460 dmn = container_of(restart, struct daemon,
461 restart);
462 restart_done(dmn);
463 } else if (restart)
464 for (dmn = gs.daemons; dmn; dmn = dmn->next)
465 restart_done(dmn);
466 }
467 } else
468 flog_err_sys(
469 EC_LIB_SYSTEM_CALL,
470 "cannot interpret %s %s process %d wait status 0x%x",
471 what, name, (int)child, status);
472 phase_check();
473 }
474
475 static int run_job(struct restart_info *restart, const char *cmdtype,
476 const char *command, int force, int update_interval)
477 {
478 struct timeval delay;
479
480 if (gs.loglevel > LOG_DEBUG + 1)
481 zlog_debug("attempting to %s %s", cmdtype, restart->name);
482
483 if (restart->pid) {
484 if (gs.loglevel > LOG_DEBUG + 1)
485 zlog_debug(
486 "cannot %s %s, previous pid %d still running",
487 cmdtype, restart->name, (int)restart->pid);
488 return -1;
489 }
490
491 char buffer[512];
492
493 snprintf(buffer, sizeof(buffer), "restarting %s", restart->name);
494 systemd_send_status(buffer);
495
496 /* Note: time_elapsed test must come before the force test, since we
497 need
498 to make sure that delay is initialized for use below in updating the
499 restart interval. */
500 if ((time_elapsed(&delay, &restart->time)->tv_sec < restart->interval)
501 && !force) {
502
503 if (gs.loglevel > LOG_DEBUG + 1)
504 zlog_debug(
505 "postponing %s %s: elapsed time %ld < retry interval %ld",
506 cmdtype, restart->name, (long)delay.tv_sec,
507 restart->interval);
508 return -1;
509 }
510
511 gettimeofday(&restart->time, NULL);
512 restart->kills = 0;
513 {
514 char cmd[strlen(command) + strlen(restart->name) + 1];
515 snprintf(cmd, sizeof(cmd), command, restart->name);
516 if ((restart->pid = run_background(cmd)) > 0) {
517 thread_add_timer(master, restart_kill, restart,
518 gs.restart_timeout, &restart->t_kill);
519 restart->what = cmdtype;
520 gs.numpids++;
521 } else
522 restart->pid = 0;
523 }
524
525 /* Calculate the new restart interval. */
526 if (update_interval) {
527 if (delay.tv_sec > 2 * gs.max_restart_interval)
528 restart->interval = gs.min_restart_interval;
529 else if ((restart->interval *= 2) > gs.max_restart_interval)
530 restart->interval = gs.max_restart_interval;
531 if (gs.loglevel > LOG_DEBUG + 1)
532 zlog_debug("restart %s interval is now %ld",
533 restart->name, restart->interval);
534 }
535 return restart->pid;
536 }
537
/*
 * Scheduling helpers for a struct daemon.
 *
 * Each macro clears the daemon's stored thread pointer and then registers the
 * named handler: SET_READ_HANDLER() for readability of ->fd, the SET_WAKEUP_*
 * variants as a millisecond timer of FUZZY(gs.period).
 *
 * The definitions deliberately do NOT end in a semicolon: the caller supplies
 * it, which keeps "if (x) SET_...(dmn); else ..." well-formed.
 */
#define SET_READ_HANDLER(DMN)                                                  \
	do {                                                                   \
		(DMN)->t_read = NULL;                                          \
		thread_add_read(master, handle_read, (DMN), (DMN)->fd,         \
				&(DMN)->t_read);                               \
	} while (0)

#define SET_WAKEUP_DOWN(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_down, (DMN),              \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

#define SET_WAKEUP_UNRESPONSIVE(DMN)                                           \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_unresponsive, (DMN),      \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

#define SET_WAKEUP_ECHO(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_send_echo, (DMN),         \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)
565
566 static void wakeup_down(struct thread *t_wakeup)
567 {
568 struct daemon *dmn = THREAD_ARG(t_wakeup);
569
570 dmn->t_wakeup = NULL;
571 if (try_connect(dmn) < 0)
572 SET_WAKEUP_DOWN(dmn);
573 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
574 try_restart(dmn);
575 }
576
577 static void wakeup_init(struct thread *t_wakeup)
578 {
579 struct daemon *dmn = THREAD_ARG(t_wakeup);
580
581 dmn->t_wakeup = NULL;
582 if (try_connect(dmn) < 0) {
583 zlog_info(
584 "%s state -> down : initial connection attempt failed",
585 dmn->name);
586 dmn->state = DAEMON_DOWN;
587 }
588 phase_check();
589 }
590
591 static void restart_done(struct daemon *dmn)
592 {
593 if (dmn->state != DAEMON_DOWN) {
594 zlog_warn(
595 "Daemon: %s: is in %s state but expected it to be in DAEMON_DOWN state",
596 dmn->name, state_str[dmn->state]);
597 return;
598 }
599 THREAD_OFF(dmn->t_wakeup);
600
601 if (try_connect(dmn) < 0)
602 SET_WAKEUP_DOWN(dmn);
603 }
604
/*
 * Timer callback armed by daemon_up(): report "FRR Operational" through
 * systemd_send_status().  The thread argument is not used.
 */
static void daemon_restarting_operational(struct thread *thread)
{
	static const char operational[] = "FRR Operational";

	systemd_send_status(operational);
}
609
610 static void daemon_down(struct daemon *dmn, const char *why)
611 {
612 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
613 flog_err(EC_WATCHFRR_CONNECTION, "%s state -> down : %s",
614 dmn->name, why);
615 else if (gs.loglevel > LOG_DEBUG)
616 zlog_debug("%s still down : %s", dmn->name, why);
617 if (IS_UP(dmn))
618 gs.numdown++;
619 dmn->state = DAEMON_DOWN;
620 if (dmn->fd >= 0) {
621 close(dmn->fd);
622 dmn->fd = -1;
623 }
624 THREAD_OFF(dmn->t_read);
625 THREAD_OFF(dmn->t_write);
626 THREAD_OFF(dmn->t_wakeup);
627 if (try_connect(dmn) < 0)
628 SET_WAKEUP_DOWN(dmn);
629
630 systemd_send_status("FRR partially operational");
631 phase_check();
632 }
633
634 static void handle_read(struct thread *t_read)
635 {
636 struct daemon *dmn = THREAD_ARG(t_read);
637 static const char resp[sizeof(PING_TOKEN) + 4] = PING_TOKEN "\n";
638 char buf[sizeof(resp) + 100];
639 ssize_t rc;
640 struct timeval delay;
641
642 dmn->t_read = NULL;
643 if ((rc = read(dmn->fd, buf, sizeof(buf))) < 0) {
644 char why[100];
645
646 if (ERRNO_IO_RETRY(errno)) {
647 /* Pretend it never happened. */
648 SET_READ_HANDLER(dmn);
649 return;
650 }
651 snprintf(why, sizeof(why), "unexpected read error: %s",
652 safe_strerror(errno));
653 daemon_down(dmn, why);
654 return;
655 }
656 if (rc == 0) {
657 daemon_down(dmn, "read returned EOF");
658 return;
659 }
660 if (!dmn->echo_sent.tv_sec) {
661 char why[sizeof(buf) + 100];
662 snprintf(why, sizeof(why),
663 "unexpected read returns %d bytes: %.*s", (int)rc,
664 (int)rc, buf);
665 daemon_down(dmn, why);
666 return;
667 }
668
669 /* We are expecting an echo response: is there any chance that the
670 response would not be returned entirely in the first read? That
671 seems inconceivable... */
672 if ((rc != sizeof(resp)) || memcmp(buf, resp, sizeof(resp))) {
673 char why[100 + sizeof(buf)];
674 snprintf(why, sizeof(why),
675 "read returned bad echo response of %d bytes (expecting %u): %.*s",
676 (int)rc, (unsigned int)sizeof(resp), (int)rc, buf);
677 daemon_down(dmn, why);
678 return;
679 }
680
681 time_elapsed(&delay, &dmn->echo_sent);
682 dmn->echo_sent.tv_sec = 0;
683 if (dmn->state == DAEMON_UNRESPONSIVE) {
684 if (delay.tv_sec < gs.timeout) {
685 dmn->state = DAEMON_UP;
686 zlog_warn(
687 "%s state -> up : echo response received after %ld.%06ld seconds",
688 dmn->name, (long)delay.tv_sec,
689 (long)delay.tv_usec);
690 } else
691 zlog_warn(
692 "%s: slow echo response finally received after %ld.%06ld seconds",
693 dmn->name, (long)delay.tv_sec,
694 (long)delay.tv_usec);
695 } else if (gs.loglevel > LOG_DEBUG + 1)
696 zlog_debug("%s: echo response received after %ld.%06ld seconds",
697 dmn->name, (long)delay.tv_sec, (long)delay.tv_usec);
698
699 SET_READ_HANDLER(dmn);
700 thread_cancel(&dmn->t_wakeup);
701 SET_WAKEUP_ECHO(dmn);
702 }
703
704 /*
705 * Wait till we notice that all daemons are ready before
706 * we send we are ready to systemd
707 */
708 static void daemon_send_ready(int exitcode)
709 {
710 FILE *fp;
711 static int sent = 0;
712 char started[1024];
713
714 if (sent)
715 return;
716
717 if (exitcode == 0)
718 zlog_notice("all daemons up, doing startup-complete notify");
719 else if (gs.numdown < gs.numdaemons)
720 flog_err(EC_WATCHFRR_CONNECTION,
721 "startup did not complete within timeout (%d/%d daemons running)",
722 gs.numdaemons - gs.numdown, gs.numdaemons);
723 else {
724 flog_err(EC_WATCHFRR_CONNECTION,
725 "all configured daemons failed to start -- exiting watchfrr");
726 exit(exitcode);
727
728 }
729
730 frr_detach();
731
732 snprintf(started, sizeof(started), "%s/%s", frr_vtydir,
733 "watchfrr.started");
734 fp = fopen(started, "w");
735 if (fp)
736 fclose(fp);
737
738 systemd_send_started(master);
739 systemd_send_status("FRR Operational");
740 sent = 1;
741 }
742
743 static void daemon_up(struct daemon *dmn, const char *why)
744 {
745 dmn->state = DAEMON_UP;
746 gs.numdown--;
747 dmn->connect_tries = 0;
748 zlog_notice("%s state -> up : %s", dmn->name, why);
749 if (gs.numdown == 0) {
750 daemon_send_ready(0);
751
752 THREAD_OFF(gs.t_operational);
753
754 thread_add_timer(master, daemon_restarting_operational, NULL,
755 gs.operational_timeout, &gs.t_operational);
756 }
757
758 SET_WAKEUP_ECHO(dmn);
759 phase_check();
760 }
761
762 static void check_connect(struct thread *t_write)
763 {
764 struct daemon *dmn = THREAD_ARG(t_write);
765 int sockerr;
766 socklen_t reslen = sizeof(sockerr);
767
768 dmn->t_write = NULL;
769 if (getsockopt(dmn->fd, SOL_SOCKET, SO_ERROR, (char *)&sockerr, &reslen)
770 < 0) {
771 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn->name,
772 safe_strerror(errno));
773 daemon_down(dmn,
774 "getsockopt failed checking connection success");
775 return;
776 }
777 if ((reslen == sizeof(sockerr)) && sockerr) {
778 char why[100];
779 snprintf(
780 why, sizeof(why),
781 "getsockopt reports that connection attempt failed: %s",
782 safe_strerror(sockerr));
783 daemon_down(dmn, why);
784 return;
785 }
786
787 daemon_up(dmn, "delayed connect succeeded");
788 }
789
790 static void wakeup_connect_hanging(struct thread *t_wakeup)
791 {
792 struct daemon *dmn = THREAD_ARG(t_wakeup);
793 char why[100];
794
795 dmn->t_wakeup = NULL;
796 snprintf(why, sizeof(why),
797 "connection attempt timed out after %ld seconds", gs.timeout);
798 daemon_down(dmn, why);
799 }
800
801 /* Making connection to protocol daemon. */
802 static int try_connect(struct daemon *dmn)
803 {
804 int sock;
805 struct sockaddr_un addr;
806 socklen_t len;
807
808 if (gs.loglevel > LOG_DEBUG + 1)
809 zlog_debug("%s: attempting to connect", dmn->name);
810 dmn->connect_tries++;
811
812 memset(&addr, 0, sizeof(addr));
813 addr.sun_family = AF_UNIX;
814 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty", gs.vtydir,
815 dmn->name);
816 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
817 len = addr.sun_len = SUN_LEN(&addr);
818 #else
819 len = sizeof(addr.sun_family) + strlen(addr.sun_path);
820 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
821
822 /* Quick check to see if we might succeed before we go to the trouble
823 of creating a socket. */
824 if (access(addr.sun_path, W_OK) < 0) {
825 if (errno != ENOENT)
826 flog_err_sys(EC_LIB_SYSTEM_CALL,
827 "%s: access to socket %s denied: %s",
828 dmn->name, addr.sun_path,
829 safe_strerror(errno));
830 return -1;
831 }
832
833 if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
834 flog_err_sys(EC_LIB_SOCKET, "%s(%s): cannot make socket: %s",
835 __func__, addr.sun_path, safe_strerror(errno));
836 return -1;
837 }
838
839 if (set_nonblocking(sock) < 0 || set_cloexec(sock) < 0) {
840 flog_err_sys(EC_LIB_SYSTEM_CALL,
841 "%s(%s): set_nonblocking/cloexec(%d) failed",
842 __func__, addr.sun_path, sock);
843 close(sock);
844 return -1;
845 }
846
847 if (connect(sock, (struct sockaddr *)&addr, len) < 0) {
848 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK)) {
849 if (gs.loglevel > LOG_DEBUG)
850 zlog_debug("%s(%s): connect failed: %s",
851 __func__, addr.sun_path,
852 safe_strerror(errno));
853 close(sock);
854 return -1;
855 }
856 if (gs.loglevel > LOG_DEBUG)
857 zlog_debug("%s: connection in progress", dmn->name);
858 dmn->state = DAEMON_CONNECTING;
859 dmn->fd = sock;
860 thread_add_write(master, check_connect, dmn, dmn->fd,
861 &dmn->t_write);
862 thread_add_timer(master, wakeup_connect_hanging, dmn,
863 gs.timeout, &dmn->t_wakeup);
864 SET_READ_HANDLER(dmn);
865 return 0;
866 }
867
868 dmn->fd = sock;
869 SET_READ_HANDLER(dmn);
870 daemon_up(dmn, "connect succeeded");
871 return 1;
872 }
873
874 static void phase_hanging(struct thread *t_hanging)
875 {
876 gs.t_phase_hanging = NULL;
877 flog_err(EC_WATCHFRR_CONNECTION,
878 "Phase [%s] hanging for %ld seconds, aborting phased restart",
879 phase_str[gs.phase], PHASE_TIMEOUT);
880 gs.phase = PHASE_NONE;
881 }
882
883 static void set_phase(enum restart_phase new_phase)
884 {
885 gs.phase = new_phase;
886 thread_cancel(&gs.t_phase_hanging);
887
888 thread_add_timer(master, phase_hanging, NULL, PHASE_TIMEOUT,
889 &gs.t_phase_hanging);
890 }
891
892 static void phase_check(void)
893 {
894 struct daemon *dmn;
895
896 switch (gs.phase) {
897 case PHASE_NONE:
898 break;
899
900 case PHASE_INIT:
901 for (dmn = gs.daemons; dmn; dmn = dmn->next)
902 if (dmn->state == DAEMON_INIT)
903 return;
904
905 /* startup complete, everything out of INIT */
906 gs.phase = PHASE_NONE;
907 for (dmn = gs.daemons; dmn; dmn = dmn->next)
908 if (dmn->state == DAEMON_DOWN) {
909 SET_WAKEUP_DOWN(dmn);
910 try_restart(dmn);
911 }
912 break;
913 case PHASE_STOPS_PENDING:
914 if (gs.numpids)
915 break;
916 zlog_info(
917 "Phased restart: all routing daemon stop jobs have completed.");
918 set_phase(PHASE_WAITING_DOWN);
919
920 /*FALLTHRU*/
921 case PHASE_WAITING_DOWN:
922 if (gs.numdown + IS_UP(gs.special) < gs.numdaemons)
923 break;
924 systemd_send_status("Phased Restart");
925 zlog_info("Phased restart: all routing daemons now down.");
926 run_job(&gs.special->restart, "restart", gs.restart_command, 1,
927 1);
928 set_phase(PHASE_ZEBRA_RESTART_PENDING);
929
930 /*FALLTHRU*/
931 case PHASE_ZEBRA_RESTART_PENDING:
932 if (gs.special->restart.pid)
933 break;
934 systemd_send_status("Zebra Restarting");
935 zlog_info("Phased restart: %s restart job completed.",
936 gs.special->name);
937 set_phase(PHASE_WAITING_ZEBRA_UP);
938
939 /*FALLTHRU*/
940 case PHASE_WAITING_ZEBRA_UP:
941 if (!IS_UP(gs.special))
942 break;
943 zlog_info("Phased restart: %s is now up.", gs.special->name);
944 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
945 if (dmn != gs.special)
946 run_job(&dmn->restart, "start",
947 gs.start_command, 1, 0);
948 }
949 gs.phase = PHASE_NONE;
950 THREAD_OFF(gs.t_phase_hanging);
951 zlog_notice("Phased global restart has completed.");
952 break;
953 }
954 }
955
956 static void try_restart(struct daemon *dmn)
957 {
958 if (watch_only)
959 return;
960
961 if (dmn != gs.special) {
962 if ((gs.special->state == DAEMON_UP)
963 && (gs.phase == PHASE_NONE))
964 run_job(&dmn->restart, "restart", gs.restart_command, 0,
965 1);
966 else
967 zlog_debug(
968 "%s: postponing restart attempt because master %s daemon not up [%s], or phased restart in progress",
969 dmn->name, gs.special->name,
970 state_str[gs.special->state]);
971 return;
972 }
973
974 if ((gs.phase != PHASE_NONE) || gs.numpids) {
975 if (gs.loglevel > LOG_DEBUG + 1)
976 zlog_debug(
977 "postponing phased global restart: restart already in progress [%s], or outstanding child processes [%d]",
978 phase_str[gs.phase], gs.numpids);
979 return;
980 }
981 /* Is it too soon for a restart? */
982 {
983 struct timeval delay;
984 if (time_elapsed(&delay, &gs.special->restart.time)->tv_sec
985 < gs.special->restart.interval) {
986 if (gs.loglevel > LOG_DEBUG + 1)
987 zlog_debug(
988 "postponing phased global restart: elapsed time %ld < retry interval %ld",
989 (long)delay.tv_sec,
990 gs.special->restart.interval);
991 return;
992 }
993 }
994 run_job(&gs.restart, "restart", gs.restart_command, 0, 1);
995 }
996
997 static void wakeup_unresponsive(struct thread *t_wakeup)
998 {
999 struct daemon *dmn = THREAD_ARG(t_wakeup);
1000
1001 dmn->t_wakeup = NULL;
1002 if (dmn->state != DAEMON_UNRESPONSIVE)
1003 flog_err(EC_WATCHFRR_CONNECTION,
1004 "%s: no longer unresponsive (now %s), wakeup should have been cancelled!",
1005 dmn->name, state_str[dmn->state]);
1006 else {
1007 SET_WAKEUP_UNRESPONSIVE(dmn);
1008 try_restart(dmn);
1009 }
1010 }
1011
1012 static void wakeup_no_answer(struct thread *t_wakeup)
1013 {
1014 struct daemon *dmn = THREAD_ARG(t_wakeup);
1015
1016 dmn->t_wakeup = NULL;
1017 dmn->state = DAEMON_UNRESPONSIVE;
1018 if (dmn->ignore_timeout)
1019 return;
1020 flog_err(EC_WATCHFRR_CONNECTION,
1021 "%s state -> unresponsive : no response yet to ping sent %ld seconds ago",
1022 dmn->name, gs.timeout);
1023 SET_WAKEUP_UNRESPONSIVE(dmn);
1024 try_restart(dmn);
1025 }
1026
1027 static void wakeup_send_echo(struct thread *t_wakeup)
1028 {
1029 static const char echocmd[] = "echo " PING_TOKEN;
1030 ssize_t rc;
1031 struct daemon *dmn = THREAD_ARG(t_wakeup);
1032
1033 dmn->t_wakeup = NULL;
1034 if (((rc = write(dmn->fd, echocmd, sizeof(echocmd))) < 0)
1035 || ((size_t)rc != sizeof(echocmd))) {
1036 char why[100 + sizeof(echocmd)];
1037 snprintf(why, sizeof(why),
1038 "write '%s' returned %d instead of %u", echocmd,
1039 (int)rc, (unsigned int)sizeof(echocmd));
1040 daemon_down(dmn, why);
1041 } else {
1042 gettimeofday(&dmn->echo_sent, NULL);
1043 thread_add_timer(master, wakeup_no_answer, dmn, gs.timeout,
1044 &dmn->t_wakeup);
1045 }
1046 }
1047
1048 bool check_all_up(void)
1049 {
1050 struct daemon *dmn;
1051
1052 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1053 if (dmn->state != DAEMON_UP)
1054 return false;
1055 return true;
1056 }
1057
1058 void watchfrr_status(struct vty *vty)
1059 {
1060 struct daemon *dmn;
1061 struct timeval delay;
1062
1063 vty_out(vty, "watchfrr global phase: %s\n", phase_str[gs.phase]);
1064 vty_out(vty, " Restart Command: %pSQq\n", gs.restart_command);
1065 vty_out(vty, " Start Command: %pSQq\n", gs.start_command);
1066 vty_out(vty, " Stop Command: %pSQq\n", gs.stop_command);
1067 vty_out(vty, " Min Restart Interval: %ld\n", gs.min_restart_interval);
1068 vty_out(vty, " Max Restart Interval: %ld\n", gs.max_restart_interval);
1069 vty_out(vty, " Restart Timeout: %ld\n", gs.restart_timeout);
1070 vty_out(vty, " Reading Configuration: %s\n",
1071 gs.reading_configuration ? "yes" : "no");
1072 if (gs.restart.pid)
1073 vty_out(vty, " global restart running, pid %ld\n",
1074 (long)gs.restart.pid);
1075
1076 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1077 vty_out(vty, " %-20s %s%s", dmn->name, state_str[dmn->state],
1078 dmn->ignore_timeout ? "/Ignoring Timeout\n" : "\n");
1079 if (dmn->restart.pid)
1080 vty_out(vty, " restart running, pid %ld\n",
1081 (long)dmn->restart.pid);
1082 else if (dmn->state == DAEMON_DOWN &&
1083 time_elapsed(&delay, &dmn->restart.time)->tv_sec
1084 < dmn->restart.interval)
1085 vty_out(vty, " restarting in %jd seconds (%jds backoff interval)\n",
1086 (intmax_t)dmn->restart.interval
1087 - (intmax_t)delay.tv_sec,
1088 (intmax_t)dmn->restart.interval);
1089 }
1090 }
1091
/*
 * Handler registered in watchfrr_signals for SIGINT and SIGTERM: log the
 * termination, call systemd_send_stopping() and exit with status 0.
 */
static void sigint(void)
{
	zlog_notice("Terminating on signal");

	systemd_send_stopping();

	exit(0);
}
1098
/*
 * Check that a command template is usable with a single string argument.
 *
 * Returns 1 when cmd is non-NULL, its first '%' is immediately followed by
 * 's', and no further '%' appears after that; returns 0 otherwise.
 */
static int valid_command(const char *cmd)
{
	const char *pct;

	if (!cmd)
		return 0;

	pct = strchr(cmd, '%');
	if (!pct)
		return 0;

	/* The first conversion must be "%s"... */
	if (pct[1] != 's')
		return 0;

	/* ...and it must be the only '%' in the template. */
	return strchr(pct + 1, '%') == NULL;
}
1109
/*
 * This is an ugly hack to circumvent problems with passing command-line
 * arguments that contain spaces.  The fix is to use a configuration file.
 *
 * Returns a newly allocated copy of cmd in which every occurrence of the
 * placeholder blankstr has been replaced by a single space.  The process
 * exits with status 1 if the copy cannot be allocated.
 *
 * An empty blankstr leaves the command unchanged.  The search resumes right
 * after each inserted space, so a placeholder that itself starts with a
 * space cannot be matched again at the same position.
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	size_t bslen = strlen(blankstr);
	char *res = strdup(cmd);
	char *p;

	if (!res) {
		perror("strdup");
		exit(1);
	}

	/*
	 * strstr() matches an empty needle at every position; replacing it
	 * would grow the string beyond its allocation and never terminate.
	 */
	if (bslen == 0)
		return res;

	p = res;
	while ((p = strstr(p, blankstr)) != NULL) {
		*p = ' ';
		/* Close the gap left by the rest of the placeholder. */
		if (bslen != 1)
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
		/* Continue after the blank that was just written. */
		p++;
	}
	return res;
}
1129
/*
 * Timer callback scheduled by watchfrr_init() with STARTUP_TIMEOUT.
 * It calls daemon_send_ready(1); the thread argument is not used.
 */
static void startup_timeout(struct thread *t_wakeup)
{
	(void)t_wakeup;

	daemon_send_ready(1);
}
1134
1135 #ifdef GNU_LINUX
1136
1137 #include <sys/mount.h>
1138 #include <sched.h>
1139
1140 #define NETNS_RUN_DIR "/var/run/netns"
1141
1142 static void netns_create(int dirfd, const char *nsname)
1143 {
1144 /* make /var/run/netns shared between mount namespaces
1145 * just like iproute2 sets it up
1146 */
1147 if (mount("", NETNS_RUN_DIR, "none", MS_SHARED | MS_REC, NULL)) {
1148 if (errno != EINVAL) {
1149 perror("mount");
1150 exit(1);
1151 }
1152
1153 if (mount(NETNS_RUN_DIR, NETNS_RUN_DIR, "none",
1154 MS_BIND | MS_REC, NULL)) {
1155 perror("mount");
1156 exit(1);
1157 }
1158
1159 if (mount("", NETNS_RUN_DIR, "none", MS_SHARED | MS_REC,
1160 NULL)) {
1161 perror("mount");
1162 exit(1);
1163 }
1164 }
1165
1166 /* need an empty file to mount on top of */
1167 int nsfd = openat(dirfd, nsname, O_CREAT | O_RDONLY | O_EXCL, 0);
1168
1169 if (nsfd < 0) {
1170 fprintf(stderr, "failed to create \"%s/%s\": %s\n",
1171 NETNS_RUN_DIR, nsname, strerror(errno));
1172 exit(1);
1173 }
1174 close(nsfd);
1175
1176 if (unshare(CLONE_NEWNET)) {
1177 perror("unshare");
1178 unlinkat(dirfd, nsname, 0);
1179 exit(1);
1180 }
1181
1182 char *dstpath = asprintfrr(MTYPE_TMP, "%s/%s", NETNS_RUN_DIR, nsname);
1183
1184 /* bind-mount so the namespace has a name and is persistent */
1185 if (mount("/proc/self/ns/net", dstpath, "none", MS_BIND, NULL) < 0) {
1186 fprintf(stderr, "failed to bind-mount netns to \"%s\": %s\n",
1187 dstpath, strerror(errno));
1188 unlinkat(dirfd, nsname, 0);
1189 exit(1);
1190 }
1191
1192 XFREE(MTYPE_TMP, dstpath);
1193 }
1194
1195 static void netns_setup(const char *nsname)
1196 {
1197 int dirfd, nsfd;
1198
1199 dirfd = open(NETNS_RUN_DIR, O_DIRECTORY | O_RDONLY);
1200 if (dirfd < 0) {
1201 if (errno == ENOTDIR) {
1202 fprintf(stderr, "error: \"%s\" is not a directory!\n",
1203 NETNS_RUN_DIR);
1204 exit(1);
1205 } else if (errno == ENOENT) {
1206 if (mkdir(NETNS_RUN_DIR, 0755)) {
1207 fprintf(stderr, "error: \"%s\": mkdir: %s\n",
1208 NETNS_RUN_DIR, strerror(errno));
1209 exit(1);
1210 }
1211 dirfd = open(NETNS_RUN_DIR, O_DIRECTORY | O_RDONLY);
1212 if (dirfd < 0) {
1213 fprintf(stderr, "error: \"%s\": opendir: %s\n",
1214 NETNS_RUN_DIR, strerror(errno));
1215 exit(1);
1216 }
1217 } else {
1218 fprintf(stderr, "error: \"%s\": %s\n",
1219 NETNS_RUN_DIR, strerror(errno));
1220 exit(1);
1221 }
1222 }
1223
1224 nsfd = openat(dirfd, nsname, O_RDONLY);
1225 if (nsfd < 0 && errno != ENOENT) {
1226 fprintf(stderr, "error: \"%s/%s\": %s\n",
1227 NETNS_RUN_DIR, nsname, strerror(errno));
1228 exit(1);
1229 }
1230 if (nsfd < 0)
1231 netns_create(dirfd, nsname);
1232 else {
1233 if (setns(nsfd, CLONE_NEWNET)) {
1234 perror("setns");
1235 exit(1);
1236 }
1237 close(nsfd);
1238 }
1239 close(dirfd);
1240
1241 /* make sure loopback is up... weird things happen otherwise.
1242 * ioctl is perfectly fine for this, don't need netlink...
1243 */
1244 int sockfd;
1245 struct ifreq ifr = { };
1246
1247 strlcpy(ifr.ifr_name, "lo", sizeof(ifr.ifr_name));
1248
1249 sockfd = socket(AF_INET, SOCK_DGRAM, 0);
1250 if (sockfd < 0) {
1251 perror("socket");
1252 exit(1);
1253 }
1254 if (ioctl(sockfd, SIOCGIFFLAGS, &ifr)) {
1255 perror("ioctl(SIOCGIFFLAGS, \"lo\")");
1256 exit(1);
1257 }
1258 if (!(ifr.ifr_flags & IFF_UP)) {
1259 ifr.ifr_flags |= IFF_UP;
1260 if (ioctl(sockfd, SIOCSIFFLAGS, &ifr)) {
1261 perror("ioctl(SIOCSIFFLAGS, \"lo\")");
1262 exit(1);
1263 }
1264 }
1265 close(sockfd);
1266 }
1267
1268 #else /* !GNU_LINUX */
1269
/*
 * Fallback for builds without GNU_LINUX: network namespaces are not
 * supported, so report that on stderr and exit with status 1.
 */
static void netns_setup(const char *nsname)
{
	fputs("network namespaces are only available on Linux\n", stderr);
	exit(1);
}
1275 #endif
1276
1277 static void watchfrr_start_config(void)
1278 {
1279 gs.reading_configuration = true;
1280 }
1281
1282 static void watchfrr_end_config(void)
1283 {
1284 gs.reading_configuration = false;
1285 }
1286
1287 static void watchfrr_init(int argc, char **argv)
1288 {
1289 const char *special = "zebra";
1290 int i;
1291 struct daemon *dmn, **add = &gs.daemons;
1292 char alldaemons[512] = "", *p = alldaemons;
1293
1294 thread_add_timer_msec(master, startup_timeout, NULL, STARTUP_TIMEOUT,
1295 &gs.t_startup_timeout);
1296
1297 for (i = optind; i < argc; i++) {
1298 dmn = XCALLOC(MTYPE_WATCHFRR_DAEMON, sizeof(*dmn));
1299
1300 dmn->name = dmn->restart.name = argv[i];
1301 dmn->state = DAEMON_INIT;
1302 gs.numdaemons++;
1303 gs.numdown++;
1304 dmn->fd = -1;
1305 thread_add_timer_msec(master, wakeup_init, dmn, 0,
1306 &dmn->t_wakeup);
1307 dmn->restart.interval = gs.min_restart_interval;
1308 *add = dmn;
1309 add = &dmn->next;
1310
1311 if (!strcmp(dmn->name, special))
1312 gs.special = dmn;
1313 }
1314
1315 if (!gs.daemons) {
1316 fprintf(stderr,
1317 "Must specify one or more daemons to monitor.\n\n");
1318 frr_help_exit(1);
1319 }
1320 if (!watch_only && !gs.special) {
1321 fprintf(stderr, "\"%s\" daemon must be in daemon lists\n\n",
1322 special);
1323 frr_help_exit(1);
1324 }
1325
1326 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1327 snprintf(p, alldaemons + sizeof(alldaemons) - p, "%s%s",
1328 (p == alldaemons) ? "" : " ", dmn->name);
1329 p += strlen(p);
1330 }
1331 zlog_notice("%s %s watching [%s]%s", progname, FRR_VERSION, alldaemons,
1332 watch_only ? ", monitor mode" : "");
1333 }
1334
1335 struct zebra_privs_t watchfrr_privs = {
1336 #ifdef VTY_GROUP
1337 .vty_group = VTY_GROUP,
1338 #endif
1339 };
1340
1341 static struct frr_signal_t watchfrr_signals[] = {
1342 {
1343 .signal = SIGINT,
1344 .handler = sigint,
1345 },
1346 {
1347 .signal = SIGTERM,
1348 .handler = sigint,
1349 },
1350 {
1351 .signal = SIGCHLD,
1352 .handler = sigchild,
1353 },
1354 };
1355
1356 FRR_DAEMON_INFO(watchfrr, WATCHFRR,
1357 .flags = FRR_NO_PRIVSEP | FRR_NO_TCPVTY | FRR_LIMITED_CLI
1358 | FRR_NO_CFG_PID_DRY | FRR_NO_ZCLIENT
1359 | FRR_DETACH_LATER,
1360
1361 .printhelp = printhelp,
1362 .copyright = "Copyright 2004 Andrew J. Schorr",
1363
1364 .signals = watchfrr_signals,
1365 .n_signals = array_size(watchfrr_signals),
1366
1367 .privs = &watchfrr_privs,
1368 );
1369
1370 #define DEPRECATED_OPTIONS "aAezR:"
1371
1372 int main(int argc, char **argv)
1373 {
1374 int opt;
1375 const char *blankstr = NULL;
1376 const char *netns = NULL;
1377 bool netns_en = false;
1378
1379 frr_preinit(&watchfrr_di, argc, argv);
1380 progname = watchfrr_di.progname;
1381
1382 frr_opt_add("b:di:k:l:N:p:r:S:s:t:T:" DEPRECATED_OPTIONS, longopts, "");
1383
1384 gs.restart.name = "all";
1385 while ((opt = frr_getopt(argc, argv, NULL)) != EOF) {
1386 if (opt && opt < 128 && strchr(DEPRECATED_OPTIONS, opt)) {
1387 fprintf(stderr,
1388 "The -%c option no longer exists.\n"
1389 "Please refer to the watchfrr(8) man page.\n",
1390 opt);
1391 exit(1);
1392 }
1393
1394 switch (opt) {
1395 case 0:
1396 break;
1397 case 'b':
1398 blankstr = optarg;
1399 break;
1400 case OPTION_DRY:
1401 watch_only = true;
1402 break;
1403 case 'k':
1404 if (!valid_command(optarg)) {
1405 fprintf(stderr,
1406 "Invalid kill command, must contain '%%s': %s\n",
1407 optarg);
1408 frr_help_exit(1);
1409 }
1410 gs.stop_command = optarg;
1411 break;
1412 case 'l': {
1413 char garbage[3];
1414 if ((sscanf(optarg, "%d%1s", &gs.loglevel, garbage)
1415 != 1)
1416 || (gs.loglevel < LOG_EMERG)) {
1417 fprintf(stderr,
1418 "Invalid loglevel argument: %s\n",
1419 optarg);
1420 frr_help_exit(1);
1421 }
1422 } break;
1423 case OPTION_MINRESTART: {
1424 char garbage[3];
1425 if ((sscanf(optarg, "%ld%1s", &gs.min_restart_interval,
1426 garbage)
1427 != 1)
1428 || (gs.min_restart_interval < 0)) {
1429 fprintf(stderr,
1430 "Invalid min_restart_interval argument: %s\n",
1431 optarg);
1432 frr_help_exit(1);
1433 }
1434 } break;
1435 case OPTION_MAXRESTART: {
1436 char garbage[3];
1437 if ((sscanf(optarg, "%ld%1s", &gs.max_restart_interval,
1438 garbage)
1439 != 1)
1440 || (gs.max_restart_interval < 0)) {
1441 fprintf(stderr,
1442 "Invalid max_restart_interval argument: %s\n",
1443 optarg);
1444 frr_help_exit(1);
1445 }
1446 } break;
1447 case OPTION_MAXOPERATIONAL: {
1448 char garbage[3];
1449
1450 if ((sscanf(optarg, "%ld%1s", &gs.operational_timeout,
1451 garbage) != 1) ||
1452 (gs.operational_timeout < 0)) {
1453 fprintf(stderr,
1454 "Invalid Operational_timeout argument: %s\n",
1455 optarg);
1456 frr_help_exit(1);
1457 }
1458 } break;
1459 case OPTION_NETNS:
1460 netns_en = true;
1461 if (optarg && strchr(optarg, '/')) {
1462 fprintf(stderr,
1463 "invalid network namespace name \"%s\" (may not contain slashes)\n",
1464 optarg);
1465 frr_help_exit(1);
1466 }
1467 netns = optarg;
1468 break;
1469 case 'i': {
1470 char garbage[3];
1471 int period;
1472 if ((sscanf(optarg, "%d%1s", &period, garbage) != 1)
1473 || (gs.period < 1)) {
1474 fprintf(stderr,
1475 "Invalid interval argument: %s\n",
1476 optarg);
1477 frr_help_exit(1);
1478 }
1479 gs.period = 1000 * period;
1480 } break;
1481 case 'p':
1482 watchfrr_di.pid_file = optarg;
1483 break;
1484 case 'r':
1485 if (!valid_command(optarg)) {
1486 fprintf(stderr,
1487 "Invalid restart command, must contain '%%s': %s\n",
1488 optarg);
1489 frr_help_exit(1);
1490 }
1491 gs.restart_command = optarg;
1492 break;
1493 case 's':
1494 if (!valid_command(optarg)) {
1495 fprintf(stderr,
1496 "Invalid start command, must contain '%%s': %s\n",
1497 optarg);
1498 frr_help_exit(1);
1499 }
1500 gs.start_command = optarg;
1501 break;
1502 case 'S':
1503 gs.vtydir = optarg;
1504 break;
1505 case 't': {
1506 char garbage[3];
1507 if ((sscanf(optarg, "%ld%1s", &gs.timeout, garbage)
1508 != 1)
1509 || (gs.timeout < 1)) {
1510 fprintf(stderr,
1511 "Invalid timeout argument: %s\n",
1512 optarg);
1513 frr_help_exit(1);
1514 }
1515 } break;
1516 case 'T': {
1517 char garbage[3];
1518 if ((sscanf(optarg, "%ld%1s", &gs.restart_timeout,
1519 garbage)
1520 != 1)
1521 || (gs.restart_timeout < 1)) {
1522 fprintf(stderr,
1523 "Invalid restart timeout argument: %s\n",
1524 optarg);
1525 frr_help_exit(1);
1526 }
1527 } break;
1528 default:
1529 fputs("Invalid option.\n", stderr);
1530 frr_help_exit(1);
1531 }
1532 }
1533
1534 if (watch_only
1535 && (gs.start_command || gs.stop_command || gs.restart_command)) {
1536 fputs("Options -r/-s/-k are not used when --dry is active.\n",
1537 stderr);
1538 }
1539 if (!watch_only
1540 && (!gs.restart_command || !gs.start_command || !gs.stop_command)) {
1541 fprintf(stderr,
1542 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1543 frr_help_exit(1);
1544 }
1545
1546 if (blankstr) {
1547 if (gs.restart_command)
1548 gs.restart_command =
1549 translate_blanks(gs.restart_command, blankstr);
1550 if (gs.start_command)
1551 gs.start_command =
1552 translate_blanks(gs.start_command, blankstr);
1553 if (gs.stop_command)
1554 gs.stop_command =
1555 translate_blanks(gs.stop_command, blankstr);
1556 }
1557
1558 gs.restart.interval = gs.min_restart_interval;
1559
1560 /* env variable for the processes that we start */
1561 if (watchfrr_di.pathspace)
1562 setenv("FRR_PATHSPACE", watchfrr_di.pathspace, 1);
1563 else
1564 unsetenv("FRR_PATHSPACE");
1565
1566 /*
1567 * when watchfrr_di.pathspace is read, if it is not specified
1568 * pathspace is NULL as expected
1569 */
1570 pathspace = watchfrr_di.pathspace;
1571
1572 if (netns_en && !netns)
1573 netns = watchfrr_di.pathspace;
1574
1575 if (netns_en && netns && netns[0])
1576 netns_setup(netns);
1577
1578 master = frr_init();
1579 watchfrr_error_init();
1580 watchfrr_init(argc, argv);
1581 cmd_init_config_callbacks(watchfrr_start_config, watchfrr_end_config);
1582 watchfrr_vty_init();
1583
1584 frr_config_fork();
1585
1586 if (watchfrr_di.daemon_mode)
1587 zlog_syslog_set_prio_min(MIN(gs.loglevel, LOG_DEBUG));
1588 else
1589 zlog_aux_init(NULL, MIN(gs.loglevel, LOG_DEBUG));
1590
1591 frr_run(master);
1592
1593 systemd_send_stopping();
1594 /* Not reached. */
1595 return 0;
1596 }