]> git.proxmox.com Git - mirror_frr.git/blob - watchfrr/watchfrr.c
*: auto-convert to SPDX License IDs
[mirror_frr.git] / watchfrr / watchfrr.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Monitor status of frr daemons and restart if necessary.
4 *
5 * Copyright (C) 2004 Andrew J. Schorr
6 */
7
8 #include <zebra.h>
9 #include <thread.h>
10 #include <log.h>
11 #include <network.h>
12 #include <sigevent.h>
13 #include <lib/version.h>
14 #include "command.h"
15 #include "libfrr.h"
16 #include "lib_errors.h"
17 #include "zlog_targets.h"
18 #include "network.h"
19 #include "printfrr.h"
20
21 #include <getopt.h>
22 #include <sys/un.h>
23 #include <sys/wait.h>
24 #include <memory.h>
25 #include <systemd.h>
26
27 #include "watchfrr.h"
28 #include "watchfrr_errors.h"
29
/* Smaller of two values. Note that each argument may be evaluated twice. */
#ifndef MIN
#define MIN(X, Y) (((X) <= (Y)) ? (X) : (Y))
#endif

/*
 * Timer randomization helpers.
 * JITTER(X) is (frr_weak_random() modulo X+1) minus X/2.
 * FUZZY(X) is X plus JITTER(X/20).
 */
#define JITTER(X) ((frr_weak_random() % ((X) + 1)) - ((X) / 2))
#define FUZZY(X) ((X) + JITTER((X) / 20))
37
/*
 * Built-in defaults; these are the values reported by the help text and
 * used to initialize the global state below.
 */
#define DEFAULT_PERIOD 5		/* status polling interval */
#define DEFAULT_TIMEOUT 90		/* unresponsiveness timeout */
#define DEFAULT_RESTART_TIMEOUT 20	/* restart (kill) timeout */
#define DEFAULT_LOGLEVEL LOG_INFO
#define DEFAULT_MIN_RESTART 60		/* minimum restart interval */
#define DEFAULT_MAX_RESTART 600		/* maximum restart interval */
#define DEFAULT_OPERATIONAL_TIMEOUT 60

/* Default shell commands; "%s" is replaced by the daemon name. */
#define DEFAULT_RESTART_CMD WATCHFRR_SH_PATH " restart %s"
#define DEFAULT_START_CMD WATCHFRR_SH_PATH " start %s"
#define DEFAULT_STOP_CMD WATCHFRR_SH_PATH " stop %s"

/* Token sent with the "echo" command and expected back from the daemon. */
#define PING_TOKEN "PING"
51
52 DEFINE_MGROUP(WATCHFRR, "watchfrr");
53 DEFINE_MTYPE_STATIC(WATCHFRR, WATCHFRR_DAEMON, "watchfrr daemon entry");
54
55 /* Needs to be global, referenced somewhere inside libfrr. */
56 struct thread_master *master;
57
58 static bool watch_only = false;
59 const char *pathspace;
60
/*
 * Steps of the global state machine driven by phase_check().
 * The numeric values index phase_str[], so the order must not change.
 */
enum restart_phase {
	PHASE_NONE = 0,			/* idle */
	PHASE_INIT,			/* startup */
	PHASE_STOPS_PENDING,		/* stop jobs running */
	PHASE_WAITING_DOWN,		/* waiting for other daemons to go down */
	PHASE_ZEBRA_RESTART_PENDING,	/* zebra restart job running */
	PHASE_WAITING_ZEBRA_UP		/* waiting for zebra to come up */
};
69
/*
 * Human-readable phase names, indexed by enum restart_phase.
 * The final entry has no matching enumerator in the visible enum.
 */
static const char *const phase_str[] = {
	"Idle",					 /* PHASE_NONE */
	"Startup",				 /* PHASE_INIT */
	"Stop jobs running",			 /* PHASE_STOPS_PENDING */
	"Waiting for other daemons to come down", /* PHASE_WAITING_DOWN */
	"Zebra restart job running",	 /* PHASE_ZEBRA_RESTART_PENDING */
	"Waiting for zebra to come up",		 /* PHASE_WAITING_ZEBRA_UP */
	"Start jobs running",
};
79
/* Time allowed for one restart phase before phase_hanging() aborts it. */
#define PHASE_TIMEOUT (3 * gs.restart_timeout)

/*
 * Startup timeout value. Parenthesized so the expansion stays a single
 * operand when used inside a larger expression (e.g. X / STARTUP_TIMEOUT).
 */
#define STARTUP_TIMEOUT (55 * 1000)
82
/* Bookkeeping for one background start/stop/restart job. */
struct restart_info {
	const char *name;	/* daemon name, used in log messages */
	const char *what;	/* kind of job last launched ("restart", ...) */
	pid_t pid;		/* running child's pid, 0 when none */
	struct timeval time;	/* when the job was launched or completed */
	long interval;		/* minimum seconds between job launches */
	struct thread *t_kill;	/* timer that fires restart_kill() */
	int kills;		/* signals sent so far to the current child */
};
92
93 static struct global_state {
94 enum restart_phase phase;
95 struct thread *t_phase_hanging;
96 struct thread *t_startup_timeout;
97 struct thread *t_operational;
98 const char *vtydir;
99 long period;
100 long timeout;
101 long restart_timeout;
102 bool reading_configuration;
103 long min_restart_interval;
104 long max_restart_interval;
105 long operational_timeout;
106 struct daemon *daemons;
107 const char *restart_command;
108 const char *start_command;
109 const char *stop_command;
110 struct restart_info restart;
111 int loglevel;
112 struct daemon *special; /* points to zebra when doing phased restart */
113 int numdaemons;
114 int numpids;
115 int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
116 } gs = {
117 .phase = PHASE_INIT,
118 .vtydir = frr_vtydir,
119 .period = 1000 * DEFAULT_PERIOD,
120 .timeout = DEFAULT_TIMEOUT,
121 .restart_timeout = DEFAULT_RESTART_TIMEOUT,
122 .loglevel = DEFAULT_LOGLEVEL,
123 .min_restart_interval = DEFAULT_MIN_RESTART,
124 .max_restart_interval = DEFAULT_MAX_RESTART,
125 .operational_timeout = DEFAULT_OPERATIONAL_TIMEOUT,
126 .restart_command = DEFAULT_RESTART_CMD,
127 .start_command = DEFAULT_START_CMD,
128 .stop_command = DEFAULT_STOP_CMD,
129 };
130
/*
 * Connection state of one monitored daemon.
 * The numeric values index state_str[], so the order must not change.
 */
enum daemon_state {
	DAEMON_INIT = 0,
	DAEMON_DOWN = 1,
	DAEMON_CONNECTING = 2,
	DAEMON_UP = 3,
	DAEMON_UNRESPONSIVE = 4
};
138
/*
 * True when the daemon is UP or UNRESPONSIVE.
 * DMN is evaluated twice.
 */
#define IS_UP(DMN)                                                             \
	(((DMN)->state == DAEMON_UP)                                           \
	 || ((DMN)->state == DAEMON_UNRESPONSIVE))
141
/* Human-readable daemon state names, indexed by enum daemon_state. */
static const char *const state_str[] = {
	"Init",
	"Down",
	"Connecting",
	"Up",
	"Unresponsive",
};
145
146 struct daemon {
147 const char *name;
148 enum daemon_state state;
149 int fd;
150 struct timeval echo_sent;
151 unsigned int connect_tries;
152 struct thread *t_wakeup;
153 struct thread *t_read;
154 struct thread *t_write;
155 struct daemon *next;
156 struct restart_info restart;
157
158 /*
159 * For a given daemon, if we've turned on ignore timeouts
160 * ignore the timeout value and assume everything is ok
161 * This is for daemon debugging w/ gdb after we have started
162 * FRR and realize we have something that needs to be looked
163 * at
164 */
165 bool ignore_timeout;
166 };
167
/* getopt_long() return codes for options that have no short letter. */
#define OPTION_MINRESTART	2000
#define OPTION_MAXRESTART	2001
#define OPTION_DRY		2002
#define OPTION_NETNS		2003
#define OPTION_MAXOPERATIONAL	2004
173
174 static const struct option longopts[] = {
175 {"daemon", no_argument, NULL, 'd'},
176 {"statedir", required_argument, NULL, 'S'},
177 {"loglevel", required_argument, NULL, 'l'},
178 {"interval", required_argument, NULL, 'i'},
179 {"timeout", required_argument, NULL, 't'},
180 {"restart-timeout", required_argument, NULL, 'T'},
181 {"restart", required_argument, NULL, 'r'},
182 {"start-command", required_argument, NULL, 's'},
183 {"kill-command", required_argument, NULL, 'k'},
184 {"dry", no_argument, NULL, OPTION_DRY},
185 {"min-restart-interval", required_argument, NULL, OPTION_MINRESTART},
186 {"max-restart-interval", required_argument, NULL, OPTION_MAXRESTART},
187 {"operational-timeout", required_argument, NULL, OPTION_MAXOPERATIONAL},
188 {"pid-file", required_argument, NULL, 'p'},
189 {"blank-string", required_argument, NULL, 'b'},
190 #ifdef GNU_LINUX
191 {"netns", optional_argument, NULL, OPTION_NETNS},
192 #endif
193 {"help", no_argument, NULL, 'h'},
194 {"version", no_argument, NULL, 'v'},
195 {NULL, 0, NULL, 0}};
196
/* Forward declarations for functions used before they are defined. */
static void phase_check(void);
static void restart_done(struct daemon *dmn);
static int try_connect(struct daemon *dmn);
static void try_restart(struct daemon *dmn);
static void wakeup_send_echo(struct thread *t_wakeup);

/* Program name printed in the usage line of printhelp(). */
static const char *progname;
204
205 void watchfrr_set_ignore_daemon(struct vty *vty, const char *dname, bool ignore)
206 {
207 struct daemon *dmn;
208
209 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
210 if (strncmp(dmn->name, dname, strlen(dmn->name)) == 0)
211 break;
212 }
213
214 if (dmn) {
215 dmn->ignore_timeout = ignore;
216 vty_out(vty, "%s switching to %s\n", dmn->name,
217 ignore ? "ignore" : "watch");
218 } else
219 vty_out(vty, "%s is not configured for running at the moment",
220 dname);
221 }
222
/*
 * Write the usage text and the option summary to the given stream.
 *
 * target - stdio stream that receives the help text
 *
 * NOTE(review): the first paragraph mentions "-m" and "-M" options, but the
 * visible longopts[] table has no such short letters; the matching long
 * options are --min-restart-interval and --max-restart-interval.
 */
223 static void printhelp(FILE *target)
224 {
225 fprintf(target,
226 "Usage : %s [OPTION...] <daemon name> ...\n\n\
227 Watchdog program to monitor status of frr daemons and try to restart\n\
228 them if they are down or unresponsive. It determines whether a daemon is\n\
229 up based on whether it can connect to the daemon's vty unix stream socket.\n\
230 It then repeatedly sends echo commands over that socket to determine whether\n\
231 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
232 on the socket connection and know immediately that the daemon is down.\n\n\
233 The daemons to be monitored should be listed on the command line.\n\n\
234 In order to avoid attempting to restart the daemons in a fast loop,\n\
235 the -m and -M options allow you to control the minimum delay between\n\
236 restart commands. The minimum restart delay is recalculated each time\n\
237 a restart is attempted: if the time since the last restart attempt exceeds\n\
238 twice the -M value, then the restart delay is set to the -m value.\n\
239 Otherwise, the interval is doubled (but capped at the -M value).\n\n",
240 progname);
241
/*
 * Option summary. The trailing arguments must stay in the same order as
 * the conversion specifiers in the format string.
 */
242 fprintf(target,
243 "Options:\n\
244 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
245 to syslog instead of stdout.\n\
246 -S, --statedir Set the vty socket directory (default is %s)\n\
247 -N, --pathspace Insert prefix into config & socket paths\n"
248 #ifdef GNU_LINUX
249 " --netns Create and/or use Linux network namespace. If no name is\n"
250 " given, uses the value from `-N`.\n"
251 #endif
252 "-l, --loglevel Set the logging level (default is %d).\n\
253 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
254 but it can be set higher than %d if extra-verbose debugging\n\
255 messages are desired.\n\
256 --min-restart-interval\n\
257 Set the minimum seconds to wait between invocations of daemon\n\
258 restart commands (default is %d).\n\
259 --max-restart-interval\n\
260 Set the maximum seconds to wait between invocations of daemon\n\
261 restart commands (default is %d).\n\
262 --operational-timeout\n\
263 Set the time before systemd is notified that we are considered\n\
264 operational again after a daemon restart (default is %d).\n\
265 -i, --interval Set the status polling interval in seconds (default is %d)\n\
266 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
267 -T, --restart-timeout\n\
268 Set the restart (kill) timeout in seconds (default is %d).\n\
269 If any background jobs are still running after this much\n\
270 time has elapsed, they will be killed.\n\
271 -r, --restart Supply a Bourne shell command to use to restart a single\n\
272 daemon. The command string should include '%%s' where the\n\
273 name of the daemon should be substituted.\n\
274 (default: '%s')\n\
275 -s, --start-command\n\
276 Supply a Bourne shell to command to use to start a single\n\
277 daemon. The command string should include '%%s' where the\n\
278 name of the daemon should be substituted.\n\
279 (default: '%s')\n\
280 -k, --kill-command\n\
281 Supply a Bourne shell to command to use to stop a single\n\
282 daemon. The command string should include '%%s' where the\n\
283 name of the daemon should be substituted.\n\
284 (default: '%s')\n\
285 --dry Do not start or restart anything, just log.\n\
286 -p, --pid-file Set process identifier file name\n\
287 (default is %s/watchfrr.pid).\n\
288 -b, --blank-string\n\
289 When the supplied argument string is found in any of the\n\
290 various shell command arguments (-r, -s, or -k), replace\n\
291 it with a space. This is an ugly hack to circumvent problems\n\
292 passing command-line arguments with embedded spaces.\n\
293 -v, --version Print program version\n\
294 -h, --help Display this help and exit\n",
295 frr_vtydir, DEFAULT_LOGLEVEL, LOG_EMERG, LOG_DEBUG, LOG_DEBUG,
296 DEFAULT_MIN_RESTART, DEFAULT_MAX_RESTART,
297 DEFAULT_OPERATIONAL_TIMEOUT, DEFAULT_PERIOD, DEFAULT_TIMEOUT,
298 DEFAULT_RESTART_TIMEOUT, DEFAULT_RESTART_CMD, DEFAULT_START_CMD,
299 DEFAULT_STOP_CMD, frr_vtydir);
300 }
301
302 static pid_t run_background(char *shell_cmd)
303 {
304 pid_t child;
305
306 switch (child = fork()) {
307 case -1:
308 flog_err_sys(EC_LIB_SYSTEM_CALL,
309 "fork failed, cannot run command [%s]: %s",
310 shell_cmd, safe_strerror(errno));
311 return -1;
312 case 0:
313 /* Child process. */
314 /* Use separate process group so child processes can be killed
315 * easily. */
316 if (setpgid(0, 0) < 0)
317 zlog_warn("setpgid(0,0) failed: %s",
318 safe_strerror(errno));
319 {
320 char shell[] = "sh";
321 char dashc[] = "-c";
322 char *const argv[4] = {shell, dashc, shell_cmd, NULL};
323 execv("/bin/sh", argv);
324 flog_err_sys(EC_LIB_SYSTEM_CALL,
325 "execv(/bin/sh -c '%s') failed: %s",
326 shell_cmd, safe_strerror(errno));
327 _exit(127);
328 }
329 default:
330 /* Parent process: we will reap the child later. */
331 zlog_info("Forked background command [pid %d]: %s", (int)child,
332 shell_cmd);
333 return child;
334 }
335 }
336
337 static struct timeval *time_elapsed(struct timeval *result,
338 const struct timeval *start_time)
339 {
340 gettimeofday(result, NULL);
341 result->tv_sec -= start_time->tv_sec;
342 result->tv_usec -= start_time->tv_usec;
343 while (result->tv_usec < 0) {
344 result->tv_usec += 1000000L;
345 result->tv_sec--;
346 }
347 return result;
348 }
349
350 static void restart_kill(struct thread *t_kill)
351 {
352 struct restart_info *restart = THREAD_ARG(t_kill);
353 struct timeval delay;
354
355 time_elapsed(&delay, &restart->time);
356
357 if (gs.reading_configuration) {
358 zlog_err(
359 "%s %s child process appears to still be reading configuration, delaying for another %lu time",
360 restart->what, restart->name, gs.restart_timeout);
361 thread_add_timer(master, restart_kill, restart,
362 gs.restart_timeout, &restart->t_kill);
363 return;
364 }
365
366 zlog_warn(
367 "%s %s child process %d still running after %ld seconds, sending signal %d",
368 restart->what, restart->name, (int)restart->pid,
369 (long)delay.tv_sec, (restart->kills ? SIGKILL : SIGTERM));
370 kill(-restart->pid, (restart->kills ? SIGKILL : SIGTERM));
371 restart->kills++;
372 thread_add_timer(master, restart_kill, restart, gs.restart_timeout,
373 &restart->t_kill);
374 }
375
376 static struct restart_info *find_child(pid_t child)
377 {
378 struct daemon *dmn;
379 if (gs.restart.pid == child)
380 return &gs.restart;
381
382 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
383 if (dmn->restart.pid == child)
384 return &dmn->restart;
385 }
386 return NULL;
387 }
388
389 static void sigchild(void)
390 {
391 pid_t child;
392 int status;
393 const char *name;
394 const char *what;
395 struct restart_info *restart;
396 struct daemon *dmn;
397
398 switch (child = waitpid(-1, &status, WNOHANG)) {
399 case -1:
400 flog_err_sys(EC_LIB_SYSTEM_CALL, "waitpid failed: %s",
401 safe_strerror(errno));
402 return;
403 case 0:
404 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
405 return;
406 }
407
408 if (child == integrated_write_pid) {
409 integrated_write_sigchld(status);
410 return;
411 }
412
413 if ((restart = find_child(child)) != NULL) {
414 name = restart->name;
415 what = restart->what;
416 restart->pid = 0;
417 gs.numpids--;
418 thread_cancel(&restart->t_kill);
419
420 /* Update restart time to reflect the time the command
421 * completed. */
422 gettimeofday(&restart->time, NULL);
423 } else {
424 flog_err_sys(
425 EC_LIB_SYSTEM_CALL,
426 "waitpid returned status for an unknown child process %d",
427 (int)child);
428 name = "(unknown)";
429 what = "background";
430 }
431 if (WIFSTOPPED(status))
432 zlog_warn("%s %s process %d is stopped", what, name,
433 (int)child);
434 else if (WIFSIGNALED(status))
435 zlog_warn("%s %s process %d terminated due to signal %d", what,
436 name, (int)child, WTERMSIG(status));
437 else if (WIFEXITED(status)) {
438 if (WEXITSTATUS(status) != 0)
439 zlog_warn(
440 "%s %s process %d exited with non-zero status %d",
441 what, name, (int)child, WEXITSTATUS(status));
442 else {
443 zlog_debug("%s %s process %d exited normally", what,
444 name, (int)child);
445
446 if (restart && restart != &gs.restart) {
447 dmn = container_of(restart, struct daemon,
448 restart);
449 restart_done(dmn);
450 } else if (restart)
451 for (dmn = gs.daemons; dmn; dmn = dmn->next)
452 restart_done(dmn);
453 }
454 } else
455 flog_err_sys(
456 EC_LIB_SYSTEM_CALL,
457 "cannot interpret %s %s process %d wait status 0x%x",
458 what, name, (int)child, status);
459 phase_check();
460 }
461
462 static int run_job(struct restart_info *restart, const char *cmdtype,
463 const char *command, int force, int update_interval)
464 {
465 struct timeval delay;
466
467 if (gs.loglevel > LOG_DEBUG + 1)
468 zlog_debug("attempting to %s %s", cmdtype, restart->name);
469
470 if (restart->pid) {
471 if (gs.loglevel > LOG_DEBUG + 1)
472 zlog_debug(
473 "cannot %s %s, previous pid %d still running",
474 cmdtype, restart->name, (int)restart->pid);
475 return -1;
476 }
477
478 char buffer[512];
479
480 snprintf(buffer, sizeof(buffer), "restarting %s", restart->name);
481 systemd_send_status(buffer);
482
483 /* Note: time_elapsed test must come before the force test, since we
484 need
485 to make sure that delay is initialized for use below in updating the
486 restart interval. */
487 if ((time_elapsed(&delay, &restart->time)->tv_sec < restart->interval)
488 && !force) {
489
490 if (gs.loglevel > LOG_DEBUG + 1)
491 zlog_debug(
492 "postponing %s %s: elapsed time %ld < retry interval %ld",
493 cmdtype, restart->name, (long)delay.tv_sec,
494 restart->interval);
495 return -1;
496 }
497
498 gettimeofday(&restart->time, NULL);
499 restart->kills = 0;
500 {
501 char cmd[strlen(command) + strlen(restart->name) + 1];
502 #pragma GCC diagnostic push
503 #pragma GCC diagnostic ignored "-Wformat-nonliteral"
504 /* user supplied command string has a %s for the daemon name */
505 snprintf(cmd, sizeof(cmd), command, restart->name);
506 #pragma GCC diagnostic pop
507 if ((restart->pid = run_background(cmd)) > 0) {
508 thread_add_timer(master, restart_kill, restart,
509 gs.restart_timeout, &restart->t_kill);
510 restart->what = cmdtype;
511 gs.numpids++;
512 } else
513 restart->pid = 0;
514 }
515
516 /* Calculate the new restart interval. */
517 if (update_interval) {
518 if (delay.tv_sec > 2 * gs.max_restart_interval)
519 restart->interval = gs.min_restart_interval;
520 else if ((restart->interval *= 2) > gs.max_restart_interval)
521 restart->interval = gs.max_restart_interval;
522 if (gs.loglevel > LOG_DEBUG + 1)
523 zlog_debug("restart %s interval is now %ld",
524 restart->name, restart->interval);
525 }
526 return restart->pid;
527 }
528
/*
 * Scheduling helpers for a struct daemon pointer.
 *
 * Each macro clears the daemon's thread slot and then registers a new
 * read handler or a fuzzed gs.period timer in that slot.
 *
 * The expansions deliberately end in "while (0)" WITHOUT a semicolon:
 * the caller's own ';' completes the statement, which keeps the macros
 * usable as the body of an unbraced if that has an else branch.
 */
#define SET_READ_HANDLER(DMN)                                                  \
	do {                                                                   \
		(DMN)->t_read = NULL;                                          \
		thread_add_read(master, handle_read, (DMN), (DMN)->fd,         \
				&(DMN)->t_read);                               \
	} while (0)

#define SET_WAKEUP_DOWN(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_down, (DMN),              \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

#define SET_WAKEUP_UNRESPONSIVE(DMN)                                           \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_unresponsive, (DMN),      \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

#define SET_WAKEUP_ECHO(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_send_echo, (DMN),         \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)
556
557 static void wakeup_down(struct thread *t_wakeup)
558 {
559 struct daemon *dmn = THREAD_ARG(t_wakeup);
560
561 dmn->t_wakeup = NULL;
562 if (try_connect(dmn) < 0)
563 SET_WAKEUP_DOWN(dmn);
564 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
565 try_restart(dmn);
566 }
567
568 static void wakeup_init(struct thread *t_wakeup)
569 {
570 struct daemon *dmn = THREAD_ARG(t_wakeup);
571
572 dmn->t_wakeup = NULL;
573 if (try_connect(dmn) < 0) {
574 zlog_info(
575 "%s state -> down : initial connection attempt failed",
576 dmn->name);
577 dmn->state = DAEMON_DOWN;
578 }
579 phase_check();
580 }
581
582 static void restart_done(struct daemon *dmn)
583 {
584 if (dmn->state != DAEMON_DOWN) {
585 zlog_warn(
586 "Daemon: %s: is in %s state but expected it to be in DAEMON_DOWN state",
587 dmn->name, state_str[dmn->state]);
588 return;
589 }
590 THREAD_OFF(dmn->t_wakeup);
591
592 if (try_connect(dmn) < 0)
593 SET_WAKEUP_DOWN(dmn);
594 }
595
/*
 * Timer callback (armed by daemon_up() via gs.t_operational): report the
 * operational status to systemd. The thread argument is not used.
 */
static void daemon_restarting_operational(struct thread *thread)
{
	const char *status = "FRR Operational";

	systemd_send_status(status);
}
600
601 static void daemon_down(struct daemon *dmn, const char *why)
602 {
603 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
604 flog_err(EC_WATCHFRR_CONNECTION, "%s state -> down : %s",
605 dmn->name, why);
606 else if (gs.loglevel > LOG_DEBUG)
607 zlog_debug("%s still down : %s", dmn->name, why);
608 if (IS_UP(dmn))
609 gs.numdown++;
610 dmn->state = DAEMON_DOWN;
611 if (dmn->fd >= 0) {
612 close(dmn->fd);
613 dmn->fd = -1;
614 }
615 THREAD_OFF(dmn->t_read);
616 THREAD_OFF(dmn->t_write);
617 THREAD_OFF(dmn->t_wakeup);
618 if (try_connect(dmn) < 0)
619 SET_WAKEUP_DOWN(dmn);
620
621 systemd_send_status("FRR partially operational");
622 phase_check();
623 }
624
625 static void handle_read(struct thread *t_read)
626 {
627 struct daemon *dmn = THREAD_ARG(t_read);
628 static const char resp[sizeof(PING_TOKEN) + 4] = PING_TOKEN "\n";
629 char buf[sizeof(resp) + 100];
630 ssize_t rc;
631 struct timeval delay;
632
633 dmn->t_read = NULL;
634 if ((rc = read(dmn->fd, buf, sizeof(buf))) < 0) {
635 char why[100];
636
637 if (ERRNO_IO_RETRY(errno)) {
638 /* Pretend it never happened. */
639 SET_READ_HANDLER(dmn);
640 return;
641 }
642 snprintf(why, sizeof(why), "unexpected read error: %s",
643 safe_strerror(errno));
644 daemon_down(dmn, why);
645 return;
646 }
647 if (rc == 0) {
648 daemon_down(dmn, "read returned EOF");
649 return;
650 }
651 if (!dmn->echo_sent.tv_sec) {
652 char why[sizeof(buf) + 100];
653 snprintf(why, sizeof(why),
654 "unexpected read returns %d bytes: %.*s", (int)rc,
655 (int)rc, buf);
656 daemon_down(dmn, why);
657 return;
658 }
659
660 /* We are expecting an echo response: is there any chance that the
661 response would not be returned entirely in the first read? That
662 seems inconceivable... */
663 if ((rc != sizeof(resp)) || memcmp(buf, resp, sizeof(resp))) {
664 char why[100 + sizeof(buf)];
665 snprintf(why, sizeof(why),
666 "read returned bad echo response of %d bytes (expecting %u): %.*s",
667 (int)rc, (unsigned int)sizeof(resp), (int)rc, buf);
668 daemon_down(dmn, why);
669 return;
670 }
671
672 time_elapsed(&delay, &dmn->echo_sent);
673 dmn->echo_sent.tv_sec = 0;
674 if (dmn->state == DAEMON_UNRESPONSIVE) {
675 if (delay.tv_sec < gs.timeout) {
676 dmn->state = DAEMON_UP;
677 zlog_warn(
678 "%s state -> up : echo response received after %ld.%06ld seconds",
679 dmn->name, (long)delay.tv_sec,
680 (long)delay.tv_usec);
681 } else
682 zlog_warn(
683 "%s: slow echo response finally received after %ld.%06ld seconds",
684 dmn->name, (long)delay.tv_sec,
685 (long)delay.tv_usec);
686 } else if (gs.loglevel > LOG_DEBUG + 1)
687 zlog_debug("%s: echo response received after %ld.%06ld seconds",
688 dmn->name, (long)delay.tv_sec, (long)delay.tv_usec);
689
690 SET_READ_HANDLER(dmn);
691 thread_cancel(&dmn->t_wakeup);
692 SET_WAKEUP_ECHO(dmn);
693 }
694
695 /*
696 * Wait till we notice that all daemons are ready before
697 * we send we are ready to systemd
698 */
699 static void daemon_send_ready(int exitcode)
700 {
701 FILE *fp;
702 static int sent = 0;
703 char started[1024];
704
705 if (sent)
706 return;
707
708 if (exitcode == 0)
709 zlog_notice("all daemons up, doing startup-complete notify");
710 else if (gs.numdown < gs.numdaemons)
711 flog_err(EC_WATCHFRR_CONNECTION,
712 "startup did not complete within timeout (%d/%d daemons running)",
713 gs.numdaemons - gs.numdown, gs.numdaemons);
714 else {
715 flog_err(EC_WATCHFRR_CONNECTION,
716 "all configured daemons failed to start -- exiting watchfrr");
717 exit(exitcode);
718
719 }
720
721 frr_detach();
722
723 snprintf(started, sizeof(started), "%s/%s", frr_vtydir,
724 "watchfrr.started");
725 fp = fopen(started, "w");
726 if (fp)
727 fclose(fp);
728
729 systemd_send_started(master);
730 systemd_send_status("FRR Operational");
731 sent = 1;
732 }
733
734 static void daemon_up(struct daemon *dmn, const char *why)
735 {
736 dmn->state = DAEMON_UP;
737 gs.numdown--;
738 dmn->connect_tries = 0;
739 zlog_notice("%s state -> up : %s", dmn->name, why);
740 if (gs.numdown == 0) {
741 daemon_send_ready(0);
742
743 THREAD_OFF(gs.t_operational);
744
745 thread_add_timer(master, daemon_restarting_operational, NULL,
746 gs.operational_timeout, &gs.t_operational);
747 }
748
749 SET_WAKEUP_ECHO(dmn);
750 phase_check();
751 }
752
753 static void check_connect(struct thread *t_write)
754 {
755 struct daemon *dmn = THREAD_ARG(t_write);
756 int sockerr;
757 socklen_t reslen = sizeof(sockerr);
758
759 dmn->t_write = NULL;
760 if (getsockopt(dmn->fd, SOL_SOCKET, SO_ERROR, (char *)&sockerr, &reslen)
761 < 0) {
762 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn->name,
763 safe_strerror(errno));
764 daemon_down(dmn,
765 "getsockopt failed checking connection success");
766 return;
767 }
768 if ((reslen == sizeof(sockerr)) && sockerr) {
769 char why[100];
770 snprintf(
771 why, sizeof(why),
772 "getsockopt reports that connection attempt failed: %s",
773 safe_strerror(sockerr));
774 daemon_down(dmn, why);
775 return;
776 }
777
778 daemon_up(dmn, "delayed connect succeeded");
779 }
780
781 static void wakeup_connect_hanging(struct thread *t_wakeup)
782 {
783 struct daemon *dmn = THREAD_ARG(t_wakeup);
784 char why[100];
785
786 dmn->t_wakeup = NULL;
787 snprintf(why, sizeof(why),
788 "connection attempt timed out after %ld seconds", gs.timeout);
789 daemon_down(dmn, why);
790 }
791
792 /* Making connection to protocol daemon. */
793 static int try_connect(struct daemon *dmn)
794 {
795 int sock;
796 struct sockaddr_un addr;
797 socklen_t len;
798
799 if (gs.loglevel > LOG_DEBUG + 1)
800 zlog_debug("%s: attempting to connect", dmn->name);
801 dmn->connect_tries++;
802
803 memset(&addr, 0, sizeof(addr));
804 addr.sun_family = AF_UNIX;
805 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty", gs.vtydir,
806 dmn->name);
807 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
808 len = addr.sun_len = SUN_LEN(&addr);
809 #else
810 len = sizeof(addr.sun_family) + strlen(addr.sun_path);
811 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
812
813 /* Quick check to see if we might succeed before we go to the trouble
814 of creating a socket. */
815 if (access(addr.sun_path, W_OK) < 0) {
816 if (errno != ENOENT)
817 flog_err_sys(EC_LIB_SYSTEM_CALL,
818 "%s: access to socket %s denied: %s",
819 dmn->name, addr.sun_path,
820 safe_strerror(errno));
821 return -1;
822 }
823
824 if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
825 flog_err_sys(EC_LIB_SOCKET, "%s(%s): cannot make socket: %s",
826 __func__, addr.sun_path, safe_strerror(errno));
827 return -1;
828 }
829
830 if (set_nonblocking(sock) < 0 || set_cloexec(sock) < 0) {
831 flog_err_sys(EC_LIB_SYSTEM_CALL,
832 "%s(%s): set_nonblocking/cloexec(%d) failed",
833 __func__, addr.sun_path, sock);
834 close(sock);
835 return -1;
836 }
837
838 if (connect(sock, (struct sockaddr *)&addr, len) < 0) {
839 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK)) {
840 if (gs.loglevel > LOG_DEBUG)
841 zlog_debug("%s(%s): connect failed: %s",
842 __func__, addr.sun_path,
843 safe_strerror(errno));
844 close(sock);
845 return -1;
846 }
847 if (gs.loglevel > LOG_DEBUG)
848 zlog_debug("%s: connection in progress", dmn->name);
849 dmn->state = DAEMON_CONNECTING;
850 dmn->fd = sock;
851 thread_add_write(master, check_connect, dmn, dmn->fd,
852 &dmn->t_write);
853 thread_add_timer(master, wakeup_connect_hanging, dmn,
854 gs.timeout, &dmn->t_wakeup);
855 SET_READ_HANDLER(dmn);
856 return 0;
857 }
858
859 dmn->fd = sock;
860 SET_READ_HANDLER(dmn);
861 daemon_up(dmn, "connect succeeded");
862 return 1;
863 }
864
865 static void phase_hanging(struct thread *t_hanging)
866 {
867 gs.t_phase_hanging = NULL;
868 flog_err(EC_WATCHFRR_CONNECTION,
869 "Phase [%s] hanging for %ld seconds, aborting phased restart",
870 phase_str[gs.phase], PHASE_TIMEOUT);
871 gs.phase = PHASE_NONE;
872 }
873
874 static void set_phase(enum restart_phase new_phase)
875 {
876 gs.phase = new_phase;
877 thread_cancel(&gs.t_phase_hanging);
878
879 thread_add_timer(master, phase_hanging, NULL, PHASE_TIMEOUT,
880 &gs.t_phase_hanging);
881 }
882
/*
 * Advance the global state machine as far as current conditions allow.
 *
 * Each case either breaks out because its exit condition is not met yet,
 * or moves to the next phase and deliberately falls through so that the
 * next phase's condition is evaluated in the same call.
 */
883 static void phase_check(void)
884 {
885 struct daemon *dmn;
886
887 switch (gs.phase) {
888 case PHASE_NONE:
889 break;
890
891 case PHASE_INIT:
/* Stay in startup until no daemon is left in DAEMON_INIT. */
892 for (dmn = gs.daemons; dmn; dmn = dmn->next)
893 if (dmn->state == DAEMON_INIT)
894 return;
895
896 /* startup complete, everything out of INIT */
897 gs.phase = PHASE_NONE;
/* Every daemon found down gets a retry timer and a restart attempt. */
898 for (dmn = gs.daemons; dmn; dmn = dmn->next)
899 if (dmn->state == DAEMON_DOWN) {
900 SET_WAKEUP_DOWN(dmn);
901 try_restart(dmn);
902 }
903 break;
904 case PHASE_STOPS_PENDING:
/* Wait until no background child process is outstanding. */
905 if (gs.numpids)
906 break;
907 zlog_info(
908 "Phased restart: all routing daemon stop jobs have completed.");
909 set_phase(PHASE_WAITING_DOWN);
910
911 /*FALLTHRU*/
912 case PHASE_WAITING_DOWN:
/* Proceed once every daemon other than gs.special is down. */
913 if (gs.numdown + IS_UP(gs.special) < gs.numdaemons)
914 break;
915 systemd_send_status("Phased Restart");
916 zlog_info("Phased restart: all routing daemons now down.");
/* Forced restart job for gs.special; its interval is updated. */
917 run_job(&gs.special->restart, "restart", gs.restart_command, 1,
918 1);
919 set_phase(PHASE_ZEBRA_RESTART_PENDING);
920
921 /*FALLTHRU*/
922 case PHASE_ZEBRA_RESTART_PENDING:
/* A non-zero pid means the restart job has not been reaped yet. */
923 if (gs.special->restart.pid)
924 break;
925 systemd_send_status("Zebra Restarting");
926 zlog_info("Phased restart: %s restart job completed.",
927 gs.special->name);
928 set_phase(PHASE_WAITING_ZEBRA_UP);
929
930 /*FALLTHRU*/
931 case PHASE_WAITING_ZEBRA_UP:
932 if (!IS_UP(gs.special))
933 break;
934 zlog_info("Phased restart: %s is now up.", gs.special->name);
/* Forced start of all other daemons; intervals are left untouched. */
935 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
936 if (dmn != gs.special)
937 run_job(&dmn->restart, "start",
938 gs.start_command, 1, 0);
939 }
/* Phased restart finished: back to idle, watchdog timer cancelled. */
940 gs.phase = PHASE_NONE;
941 THREAD_OFF(gs.t_phase_hanging);
942 zlog_notice("Phased global restart has completed.");
943 break;
944 }
945 }
946
947 static void try_restart(struct daemon *dmn)
948 {
949 if (watch_only)
950 return;
951
952 if (dmn != gs.special) {
953 if ((gs.special->state == DAEMON_UP)
954 && (gs.phase == PHASE_NONE))
955 run_job(&dmn->restart, "restart", gs.restart_command, 0,
956 1);
957 else
958 zlog_debug(
959 "%s: postponing restart attempt because master %s daemon not up [%s], or phased restart in progress",
960 dmn->name, gs.special->name,
961 state_str[gs.special->state]);
962 return;
963 }
964
965 if ((gs.phase != PHASE_NONE) || gs.numpids) {
966 if (gs.loglevel > LOG_DEBUG + 1)
967 zlog_debug(
968 "postponing phased global restart: restart already in progress [%s], or outstanding child processes [%d]",
969 phase_str[gs.phase], gs.numpids);
970 return;
971 }
972 /* Is it too soon for a restart? */
973 {
974 struct timeval delay;
975 if (time_elapsed(&delay, &gs.special->restart.time)->tv_sec
976 < gs.special->restart.interval) {
977 if (gs.loglevel > LOG_DEBUG + 1)
978 zlog_debug(
979 "postponing phased global restart: elapsed time %ld < retry interval %ld",
980 (long)delay.tv_sec,
981 gs.special->restart.interval);
982 return;
983 }
984 }
985 run_job(&gs.restart, "restart", gs.restart_command, 0, 1);
986 }
987
988 static void wakeup_unresponsive(struct thread *t_wakeup)
989 {
990 struct daemon *dmn = THREAD_ARG(t_wakeup);
991
992 dmn->t_wakeup = NULL;
993 if (dmn->state != DAEMON_UNRESPONSIVE)
994 flog_err(EC_WATCHFRR_CONNECTION,
995 "%s: no longer unresponsive (now %s), wakeup should have been cancelled!",
996 dmn->name, state_str[dmn->state]);
997 else {
998 SET_WAKEUP_UNRESPONSIVE(dmn);
999 try_restart(dmn);
1000 }
1001 }
1002
1003 static void wakeup_no_answer(struct thread *t_wakeup)
1004 {
1005 struct daemon *dmn = THREAD_ARG(t_wakeup);
1006
1007 dmn->t_wakeup = NULL;
1008 dmn->state = DAEMON_UNRESPONSIVE;
1009 if (dmn->ignore_timeout)
1010 return;
1011 flog_err(EC_WATCHFRR_CONNECTION,
1012 "%s state -> unresponsive : no response yet to ping sent %ld seconds ago",
1013 dmn->name, gs.timeout);
1014 SET_WAKEUP_UNRESPONSIVE(dmn);
1015 try_restart(dmn);
1016 }
1017
1018 static void wakeup_send_echo(struct thread *t_wakeup)
1019 {
1020 static const char echocmd[] = "echo " PING_TOKEN;
1021 ssize_t rc;
1022 struct daemon *dmn = THREAD_ARG(t_wakeup);
1023
1024 dmn->t_wakeup = NULL;
1025 if (((rc = write(dmn->fd, echocmd, sizeof(echocmd))) < 0)
1026 || ((size_t)rc != sizeof(echocmd))) {
1027 char why[100 + sizeof(echocmd)];
1028 snprintf(why, sizeof(why),
1029 "write '%s' returned %d instead of %u", echocmd,
1030 (int)rc, (unsigned int)sizeof(echocmd));
1031 daemon_down(dmn, why);
1032 } else {
1033 gettimeofday(&dmn->echo_sent, NULL);
1034 thread_add_timer(master, wakeup_no_answer, dmn, gs.timeout,
1035 &dmn->t_wakeup);
1036 }
1037 }
1038
1039 bool check_all_up(void)
1040 {
1041 struct daemon *dmn;
1042
1043 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1044 if (dmn->state != DAEMON_UP)
1045 return false;
1046 return true;
1047 }
1048
1049 void watchfrr_status(struct vty *vty)
1050 {
1051 struct daemon *dmn;
1052 struct timeval delay;
1053
1054 vty_out(vty, "watchfrr global phase: %s\n", phase_str[gs.phase]);
1055 vty_out(vty, " Restart Command: %pSQq\n", gs.restart_command);
1056 vty_out(vty, " Start Command: %pSQq\n", gs.start_command);
1057 vty_out(vty, " Stop Command: %pSQq\n", gs.stop_command);
1058 vty_out(vty, " Min Restart Interval: %ld\n", gs.min_restart_interval);
1059 vty_out(vty, " Max Restart Interval: %ld\n", gs.max_restart_interval);
1060 vty_out(vty, " Restart Timeout: %ld\n", gs.restart_timeout);
1061 vty_out(vty, " Reading Configuration: %s\n",
1062 gs.reading_configuration ? "yes" : "no");
1063 if (gs.restart.pid)
1064 vty_out(vty, " global restart running, pid %ld\n",
1065 (long)gs.restart.pid);
1066
1067 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1068 vty_out(vty, " %-20s %s%s", dmn->name, state_str[dmn->state],
1069 dmn->ignore_timeout ? "/Ignoring Timeout\n" : "\n");
1070 if (dmn->restart.pid)
1071 vty_out(vty, " restart running, pid %ld\n",
1072 (long)dmn->restart.pid);
1073 else if (dmn->state == DAEMON_DOWN &&
1074 time_elapsed(&delay, &dmn->restart.time)->tv_sec
1075 < dmn->restart.interval)
1076 vty_out(vty, " restarting in %jd seconds (%jds backoff interval)\n",
1077 (intmax_t)dmn->restart.interval
1078 - (intmax_t)delay.tv_sec,
1079 (intmax_t)dmn->restart.interval);
1080 }
1081 }
1082
/*
 * Signal handler registered for SIGINT and SIGTERM in watchfrr_signals:
 * log the termination, call systemd_send_stopping() and exit
 * successfully.
 */
static void sigint(void)
{
	zlog_notice("Terminating on signal");
	systemd_send_stopping();
	exit(EXIT_SUCCESS);
}
1089
/*
 * Check that a command template contains exactly one '%' character and
 * that it is immediately followed by 's' (i.e. a single "%s").
 *
 * Returns 1 when the template is acceptable, 0 otherwise (including when
 * cmd is NULL).
 */
static int valid_command(const char *cmd)
{
	const char *percent;

	if (!cmd)
		return 0;

	percent = strchr(cmd, '%');
	if (!percent)
		return 0;

	/* The first '%' must introduce "%s"... */
	if (percent[1] != 's')
		return 0;

	/* ...and no further '%' may follow it. */
	return strchr(percent + 1, '%') == NULL;
}
1100
/*
 * This is an ugly hack to circumvent problems with passing command-line
 * arguments that contain spaces.  The fix is to use a configuration file.
 *
 * Returns a newly allocated copy of cmd in which every occurrence of
 * blankstr has been replaced by a single space.  The input is scanned once
 * from left to right and matches do not overlap.  The caller owns the
 * returned string.  Exits the process if memory cannot be allocated.
 *
 * An empty blankstr leaves cmd unchanged.  (A rescan-from-the-start loop
 * would never terminate for an empty marker or for a marker of " ", since
 * the inserted space would be matched again; the single pass used here
 * cannot loop.)
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	size_t bslen = strlen(blankstr);
	size_t cmdlen = strlen(cmd);
	const char *src = cmd;
	char *res;
	char *dst;

	/*
	 * Each match consumes bslen >= 1 input characters and emits exactly
	 * one, so the result never exceeds the length of the input.
	 */
	res = malloc(cmdlen + 1);
	if (!res) {
		perror("malloc");
		exit(1);
	}

	if (bslen == 0) {
		/* Nothing can sensibly match an empty marker. */
		memcpy(res, cmd, cmdlen + 1);
		return res;
	}

	dst = res;
	while (*src) {
		if (strncmp(src, blankstr, bslen) == 0) {
			*dst++ = ' ';
			src += bslen;
		} else {
			*dst++ = *src++;
		}
	}
	*dst = '\0';

	return res;
}
1120
/*
 * Timer callback scheduled from watchfrr_init() with STARTUP_TIMEOUT;
 * when it fires it simply calls daemon_send_ready(1).  The thread
 * argument is not used.
 */
static void startup_timeout(struct thread *t_wakeup)
{
	(void)t_wakeup;

	daemon_send_ready(1);
}
1125
1126 #ifdef GNU_LINUX
1127
1128 #include <sys/mount.h>
1129 #include <sched.h>
1130
1131 #define NETNS_RUN_DIR "/var/run/netns"
1132
1133 static void netns_create(int dirfd, const char *nsname)
1134 {
1135 /* make /var/run/netns shared between mount namespaces
1136 * just like iproute2 sets it up
1137 */
1138 if (mount("", NETNS_RUN_DIR, "none", MS_SHARED | MS_REC, NULL)) {
1139 if (errno != EINVAL) {
1140 perror("mount");
1141 exit(1);
1142 }
1143
1144 if (mount(NETNS_RUN_DIR, NETNS_RUN_DIR, "none",
1145 MS_BIND | MS_REC, NULL)) {
1146 perror("mount");
1147 exit(1);
1148 }
1149
1150 if (mount("", NETNS_RUN_DIR, "none", MS_SHARED | MS_REC,
1151 NULL)) {
1152 perror("mount");
1153 exit(1);
1154 }
1155 }
1156
1157 /* need an empty file to mount on top of */
1158 int nsfd = openat(dirfd, nsname, O_CREAT | O_RDONLY | O_EXCL, 0);
1159
1160 if (nsfd < 0) {
1161 fprintf(stderr, "failed to create \"%s/%s\": %s\n",
1162 NETNS_RUN_DIR, nsname, strerror(errno));
1163 exit(1);
1164 }
1165 close(nsfd);
1166
1167 if (unshare(CLONE_NEWNET)) {
1168 perror("unshare");
1169 unlinkat(dirfd, nsname, 0);
1170 exit(1);
1171 }
1172
1173 char *dstpath = asprintfrr(MTYPE_TMP, "%s/%s", NETNS_RUN_DIR, nsname);
1174
1175 /* bind-mount so the namespace has a name and is persistent */
1176 if (mount("/proc/self/ns/net", dstpath, "none", MS_BIND, NULL) < 0) {
1177 fprintf(stderr, "failed to bind-mount netns to \"%s\": %s\n",
1178 dstpath, strerror(errno));
1179 unlinkat(dirfd, nsname, 0);
1180 exit(1);
1181 }
1182
1183 XFREE(MTYPE_TMP, dstpath);
1184 }
1185
1186 static void netns_setup(const char *nsname)
1187 {
1188 int dirfd, nsfd;
1189
1190 dirfd = open(NETNS_RUN_DIR, O_DIRECTORY | O_RDONLY);
1191 if (dirfd < 0) {
1192 if (errno == ENOTDIR) {
1193 fprintf(stderr, "error: \"%s\" is not a directory!\n",
1194 NETNS_RUN_DIR);
1195 exit(1);
1196 } else if (errno == ENOENT) {
1197 if (mkdir(NETNS_RUN_DIR, 0755)) {
1198 fprintf(stderr, "error: \"%s\": mkdir: %s\n",
1199 NETNS_RUN_DIR, strerror(errno));
1200 exit(1);
1201 }
1202 dirfd = open(NETNS_RUN_DIR, O_DIRECTORY | O_RDONLY);
1203 if (dirfd < 0) {
1204 fprintf(stderr, "error: \"%s\": opendir: %s\n",
1205 NETNS_RUN_DIR, strerror(errno));
1206 exit(1);
1207 }
1208 } else {
1209 fprintf(stderr, "error: \"%s\": %s\n",
1210 NETNS_RUN_DIR, strerror(errno));
1211 exit(1);
1212 }
1213 }
1214
1215 nsfd = openat(dirfd, nsname, O_RDONLY);
1216 if (nsfd < 0 && errno != ENOENT) {
1217 fprintf(stderr, "error: \"%s/%s\": %s\n",
1218 NETNS_RUN_DIR, nsname, strerror(errno));
1219 exit(1);
1220 }
1221 if (nsfd < 0)
1222 netns_create(dirfd, nsname);
1223 else {
1224 if (setns(nsfd, CLONE_NEWNET)) {
1225 perror("setns");
1226 exit(1);
1227 }
1228 close(nsfd);
1229 }
1230 close(dirfd);
1231
1232 /* make sure loopback is up... weird things happen otherwise.
1233 * ioctl is perfectly fine for this, don't need netlink...
1234 */
1235 int sockfd;
1236 struct ifreq ifr = { };
1237
1238 strlcpy(ifr.ifr_name, "lo", sizeof(ifr.ifr_name));
1239
1240 sockfd = socket(AF_INET, SOCK_DGRAM, 0);
1241 if (sockfd < 0) {
1242 perror("socket");
1243 exit(1);
1244 }
1245 if (ioctl(sockfd, SIOCGIFFLAGS, &ifr)) {
1246 perror("ioctl(SIOCGIFFLAGS, \"lo\")");
1247 exit(1);
1248 }
1249 if (!(ifr.ifr_flags & IFF_UP)) {
1250 ifr.ifr_flags |= IFF_UP;
1251 if (ioctl(sockfd, SIOCSIFFLAGS, &ifr)) {
1252 perror("ioctl(SIOCSIFFLAGS, \"lo\")");
1253 exit(1);
1254 }
1255 }
1256 close(sockfd);
1257 }
1258
1259 #else /* !GNU_LINUX */
1260
/*
 * Stand-in used when not building for GNU_LINUX: network namespaces are
 * unavailable, so report that on stderr and exit with status 1.
 */
static void netns_setup(const char *nsname)
{
	(void)nsname;

	fputs("network namespaces are only available on Linux\n", stderr);
	exit(1);
}
1266 #endif
1267
1268 static void watchfrr_start_config(void)
1269 {
1270 gs.reading_configuration = true;
1271 }
1272
1273 static void watchfrr_end_config(void)
1274 {
1275 gs.reading_configuration = false;
1276 }
1277
1278 static void watchfrr_init(int argc, char **argv)
1279 {
1280 const char *special = "zebra";
1281 int i;
1282 struct daemon *dmn, **add = &gs.daemons;
1283 char alldaemons[512] = "", *p = alldaemons;
1284
1285 thread_add_timer_msec(master, startup_timeout, NULL, STARTUP_TIMEOUT,
1286 &gs.t_startup_timeout);
1287
1288 for (i = optind; i < argc; i++) {
1289 dmn = XCALLOC(MTYPE_WATCHFRR_DAEMON, sizeof(*dmn));
1290
1291 dmn->name = dmn->restart.name = argv[i];
1292 dmn->state = DAEMON_INIT;
1293 gs.numdaemons++;
1294 gs.numdown++;
1295 dmn->fd = -1;
1296 thread_add_timer_msec(master, wakeup_init, dmn, 0,
1297 &dmn->t_wakeup);
1298 dmn->restart.interval = gs.min_restart_interval;
1299 *add = dmn;
1300 add = &dmn->next;
1301
1302 if (!strcmp(dmn->name, special))
1303 gs.special = dmn;
1304 }
1305
1306 if (!gs.daemons) {
1307 fprintf(stderr,
1308 "Must specify one or more daemons to monitor.\n\n");
1309 frr_help_exit(1);
1310 }
1311 if (!watch_only && !gs.special) {
1312 fprintf(stderr, "\"%s\" daemon must be in daemon lists\n\n",
1313 special);
1314 frr_help_exit(1);
1315 }
1316
1317 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1318 snprintf(p, alldaemons + sizeof(alldaemons) - p, "%s%s",
1319 (p == alldaemons) ? "" : " ", dmn->name);
1320 p += strlen(p);
1321 }
1322 zlog_notice("%s %s watching [%s]%s", progname, FRR_VERSION, alldaemons,
1323 watch_only ? ", monitor mode" : "");
1324 }
1325
1326 struct zebra_privs_t watchfrr_privs = {
1327 #ifdef VTY_GROUP
1328 .vty_group = VTY_GROUP,
1329 #endif
1330 };
1331
1332 static struct frr_signal_t watchfrr_signals[] = {
1333 {
1334 .signal = SIGINT,
1335 .handler = sigint,
1336 },
1337 {
1338 .signal = SIGTERM,
1339 .handler = sigint,
1340 },
1341 {
1342 .signal = SIGCHLD,
1343 .handler = sigchild,
1344 },
1345 };
1346
1347 FRR_DAEMON_INFO(watchfrr, WATCHFRR,
1348 .flags = FRR_NO_PRIVSEP | FRR_NO_TCPVTY | FRR_LIMITED_CLI
1349 | FRR_NO_CFG_PID_DRY | FRR_NO_ZCLIENT
1350 | FRR_DETACH_LATER,
1351
1352 .printhelp = printhelp,
1353 .copyright = "Copyright 2004 Andrew J. Schorr",
1354
1355 .signals = watchfrr_signals,
1356 .n_signals = array_size(watchfrr_signals),
1357
1358 .privs = &watchfrr_privs,
1359 );
1360
1361 #define DEPRECATED_OPTIONS "aAezR:"
1362
1363 int main(int argc, char **argv)
1364 {
1365 int opt;
1366 const char *blankstr = NULL;
1367 const char *netns = NULL;
1368 bool netns_en = false;
1369
1370 frr_preinit(&watchfrr_di, argc, argv);
1371 progname = watchfrr_di.progname;
1372
1373 frr_opt_add("b:di:k:l:N:p:r:S:s:t:T:" DEPRECATED_OPTIONS, longopts, "");
1374
1375 gs.restart.name = "all";
1376 while ((opt = frr_getopt(argc, argv, NULL)) != EOF) {
1377 if (opt && opt < 128 && strchr(DEPRECATED_OPTIONS, opt)) {
1378 fprintf(stderr,
1379 "The -%c option no longer exists.\n"
1380 "Please refer to the watchfrr(8) man page.\n",
1381 opt);
1382 exit(1);
1383 }
1384
1385 switch (opt) {
1386 case 0:
1387 break;
1388 case 'b':
1389 blankstr = optarg;
1390 break;
1391 case OPTION_DRY:
1392 watch_only = true;
1393 break;
1394 case 'k':
1395 if (!valid_command(optarg)) {
1396 fprintf(stderr,
1397 "Invalid kill command, must contain '%%s': %s\n",
1398 optarg);
1399 frr_help_exit(1);
1400 }
1401 gs.stop_command = optarg;
1402 break;
1403 case 'l': {
1404 char garbage[3];
1405 if ((sscanf(optarg, "%d%1s", &gs.loglevel, garbage)
1406 != 1)
1407 || (gs.loglevel < LOG_EMERG)) {
1408 fprintf(stderr,
1409 "Invalid loglevel argument: %s\n",
1410 optarg);
1411 frr_help_exit(1);
1412 }
1413 } break;
1414 case OPTION_MINRESTART: {
1415 char garbage[3];
1416 if ((sscanf(optarg, "%ld%1s", &gs.min_restart_interval,
1417 garbage)
1418 != 1)
1419 || (gs.min_restart_interval < 0)) {
1420 fprintf(stderr,
1421 "Invalid min_restart_interval argument: %s\n",
1422 optarg);
1423 frr_help_exit(1);
1424 }
1425 } break;
1426 case OPTION_MAXRESTART: {
1427 char garbage[3];
1428 if ((sscanf(optarg, "%ld%1s", &gs.max_restart_interval,
1429 garbage)
1430 != 1)
1431 || (gs.max_restart_interval < 0)) {
1432 fprintf(stderr,
1433 "Invalid max_restart_interval argument: %s\n",
1434 optarg);
1435 frr_help_exit(1);
1436 }
1437 } break;
1438 case OPTION_MAXOPERATIONAL: {
1439 char garbage[3];
1440
1441 if ((sscanf(optarg, "%ld%1s", &gs.operational_timeout,
1442 garbage) != 1) ||
1443 (gs.operational_timeout < 0)) {
1444 fprintf(stderr,
1445 "Invalid Operational_timeout argument: %s\n",
1446 optarg);
1447 frr_help_exit(1);
1448 }
1449 } break;
1450 case OPTION_NETNS:
1451 netns_en = true;
1452 if (optarg && strchr(optarg, '/')) {
1453 fprintf(stderr,
1454 "invalid network namespace name \"%s\" (may not contain slashes)\n",
1455 optarg);
1456 frr_help_exit(1);
1457 }
1458 netns = optarg;
1459 break;
1460 case 'i': {
1461 char garbage[3];
1462 int period;
1463 if ((sscanf(optarg, "%d%1s", &period, garbage) != 1)
1464 || (gs.period < 1)) {
1465 fprintf(stderr,
1466 "Invalid interval argument: %s\n",
1467 optarg);
1468 frr_help_exit(1);
1469 }
1470 gs.period = 1000 * period;
1471 } break;
1472 case 'p':
1473 watchfrr_di.pid_file = optarg;
1474 break;
1475 case 'r':
1476 if (!valid_command(optarg)) {
1477 fprintf(stderr,
1478 "Invalid restart command, must contain '%%s': %s\n",
1479 optarg);
1480 frr_help_exit(1);
1481 }
1482 gs.restart_command = optarg;
1483 break;
1484 case 's':
1485 if (!valid_command(optarg)) {
1486 fprintf(stderr,
1487 "Invalid start command, must contain '%%s': %s\n",
1488 optarg);
1489 frr_help_exit(1);
1490 }
1491 gs.start_command = optarg;
1492 break;
1493 case 'S':
1494 gs.vtydir = optarg;
1495 break;
1496 case 't': {
1497 char garbage[3];
1498 if ((sscanf(optarg, "%ld%1s", &gs.timeout, garbage)
1499 != 1)
1500 || (gs.timeout < 1)) {
1501 fprintf(stderr,
1502 "Invalid timeout argument: %s\n",
1503 optarg);
1504 frr_help_exit(1);
1505 }
1506 } break;
1507 case 'T': {
1508 char garbage[3];
1509 if ((sscanf(optarg, "%ld%1s", &gs.restart_timeout,
1510 garbage)
1511 != 1)
1512 || (gs.restart_timeout < 1)) {
1513 fprintf(stderr,
1514 "Invalid restart timeout argument: %s\n",
1515 optarg);
1516 frr_help_exit(1);
1517 }
1518 } break;
1519 default:
1520 fputs("Invalid option.\n", stderr);
1521 frr_help_exit(1);
1522 }
1523 }
1524
1525 if (watch_only
1526 && (gs.start_command || gs.stop_command || gs.restart_command)) {
1527 fputs("Options -r/-s/-k are not used when --dry is active.\n",
1528 stderr);
1529 }
1530 if (!watch_only
1531 && (!gs.restart_command || !gs.start_command || !gs.stop_command)) {
1532 fprintf(stderr,
1533 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1534 frr_help_exit(1);
1535 }
1536
1537 if (blankstr) {
1538 if (gs.restart_command)
1539 gs.restart_command =
1540 translate_blanks(gs.restart_command, blankstr);
1541 if (gs.start_command)
1542 gs.start_command =
1543 translate_blanks(gs.start_command, blankstr);
1544 if (gs.stop_command)
1545 gs.stop_command =
1546 translate_blanks(gs.stop_command, blankstr);
1547 }
1548
1549 gs.restart.interval = gs.min_restart_interval;
1550
1551 /* env variable for the processes that we start */
1552 if (watchfrr_di.pathspace)
1553 setenv("FRR_PATHSPACE", watchfrr_di.pathspace, 1);
1554 else
1555 unsetenv("FRR_PATHSPACE");
1556
1557 /*
1558 * when watchfrr_di.pathspace is read, if it is not specified
1559 * pathspace is NULL as expected
1560 */
1561 pathspace = watchfrr_di.pathspace;
1562
1563 if (netns_en && !netns)
1564 netns = watchfrr_di.pathspace;
1565
1566 if (netns_en && netns && netns[0])
1567 netns_setup(netns);
1568
1569 master = frr_init();
1570 watchfrr_error_init();
1571 watchfrr_init(argc, argv);
1572 cmd_init_config_callbacks(watchfrr_start_config, watchfrr_end_config);
1573 watchfrr_vty_init();
1574
1575 frr_config_fork();
1576
1577 if (watchfrr_di.daemon_mode)
1578 zlog_syslog_set_prio_min(MIN(gs.loglevel, LOG_DEBUG));
1579 else
1580 zlog_aux_init(NULL, MIN(gs.loglevel, LOG_DEBUG));
1581
1582 frr_run(master);
1583
1584 systemd_send_stopping();
1585 /* Not reached. */
1586 return 0;
1587 }