]> git.proxmox.com Git - mirror_frr.git/blob - watchfrr/watchfrr.c
Merge pull request #12142 from opensourcerouting/fix/sendholdtimer
[mirror_frr.git] / watchfrr / watchfrr.c
1 /*
2 * Monitor status of frr daemons and restart if necessary.
3 *
4 * Copyright (C) 2004 Andrew J. Schorr
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include <zebra.h>
22 #include <thread.h>
23 #include <log.h>
24 #include <network.h>
25 #include <sigevent.h>
26 #include <lib/version.h>
27 #include "command.h"
28 #include "libfrr.h"
29 #include "lib_errors.h"
30 #include "zlog_targets.h"
31 #include "network.h"
32 #include "printfrr.h"
33
34 #include <getopt.h>
35 #include <sys/un.h>
36 #include <sys/wait.h>
37 #include <memory.h>
38 #include <systemd.h>
39
40 #include "watchfrr.h"
41 #include "watchfrr_errors.h"
42
/* Fallback definition of MIN, used only when no header already provides one. */
#ifndef MIN
#define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
#endif

/*
 * Macros to help randomize timers.
 *
 * JITTER(X) takes a pseudo-random value modulo (X)+1 and shifts it down by
 * (X)/2, i.e. an offset roughly centered on zero (assuming
 * frr_weak_random() returns a non-negative value -- TODO confirm).
 * FUZZY(X) perturbs X by a jitter computed from (X)/20.
 * Both macros evaluate their argument more than once.
 */
#define JITTER(X) ((frr_weak_random() % ((X)+1))-((X)/2))
#define FUZZY(X) ((X)+JITTER((X)/20))
50
/*
 * Built-in defaults; they seed the corresponding fields of the global state
 * and are printed by the help text.  The help text describes all of the
 * time values as seconds.
 */
#define DEFAULT_PERIOD 5
#define DEFAULT_TIMEOUT 90
#define DEFAULT_RESTART_TIMEOUT 20
#define DEFAULT_LOGLEVEL LOG_INFO
#define DEFAULT_MIN_RESTART 60
#define DEFAULT_MAX_RESTART 600
#define DEFAULT_OPERATIONAL_TIMEOUT 60

/*
 * Default shell command templates.  The single "%s" is replaced with the
 * daemon name when a job is run.
 */
#define DEFAULT_RESTART_CMD WATCHFRR_SH_PATH " restart %s"
#define DEFAULT_START_CMD WATCHFRR_SH_PATH " start %s"
#define DEFAULT_STOP_CMD WATCHFRR_SH_PATH " stop %s"

/* Token sent with the "echo" command and expected back in the reply. */
#define PING_TOKEN "PING"
64
/* Memory group and memory type used for watchfrr's own allocations. */
DEFINE_MGROUP(WATCHFRR, "watchfrr");
DEFINE_MTYPE_STATIC(WATCHFRR, WATCHFRR_DAEMON, "watchfrr daemon entry");

/* Needs to be global, referenced somewhere inside libfrr. */
struct thread_master *master;

/* When set, try_restart() returns immediately and nothing is restarted. */
static bool watch_only = false;
/* NOTE(review): presumably set from the -N/--pathspace option -- confirm. */
const char *pathspace;
73
/*
 * Phases of the global state machine driven by phase_check().
 * The values index phase_str[], so the two must stay in the same order.
 */
enum restart_phase {
	/* No phased restart in progress. */
	PHASE_NONE = 0,
	/* Startup: waiting for every daemon to leave DAEMON_INIT. */
	PHASE_INIT,
	/* Waiting for outstanding background jobs (gs.numpids) to finish. */
	PHASE_STOPS_PENDING,
	/* Waiting for the non-special daemons to be observed down. */
	PHASE_WAITING_DOWN,
	/* Restart job for the special daemon is running. */
	PHASE_ZEBRA_RESTART_PENDING,
	/* Waiting for the special daemon to be up again. */
	PHASE_WAITING_ZEBRA_UP
};
82
/*
 * Human-readable phase names, indexed by enum restart_phase.
 * NOTE(review): the last entry ("Start jobs running") has no matching
 * enumerator in enum restart_phase as declared in this file.
 */
static const char *const phase_str[] = {
	"Idle",
	"Startup",
	"Stop jobs running",
	"Waiting for other daemons to come down",
	"Zebra restart job running",
	"Waiting for zebra to come up",
	"Start jobs running",
};
92
/*
 * Time after which a phase of a phased restart is considered hung; passed
 * to thread_add_timer() by set_phase().
 */
#define PHASE_TIMEOUT (3*gs.restart_timeout)
/*
 * Startup timeout (presumably in milliseconds -- confirm at the use site).
 * The expansion is parenthesized so that it stays a single operand when the
 * macro is used inside a larger expression such as "x / STARTUP_TIMEOUT".
 */
#define STARTUP_TIMEOUT (55 * 1000)
95
/*
 * Bookkeeping for one background start/stop/restart job.  There is one of
 * these per daemon plus one in the global state for global restarts.
 */
struct restart_info {
	/* Name substituted into the command template and used in logs. */
	const char *name;
	/* Kind of command currently/last run (the cmdtype given to run_job). */
	const char *what;
	/* Pid of the running background command, 0 when none is running. */
	pid_t pid;
	/* Time the job was started; updated again when the child is reaped. */
	struct timeval time;
	/* Minimum number of seconds required between two job invocations. */
	long interval;
	/* Timer that fires restart_kill() if the job runs too long. */
	struct thread *t_kill;
	/* Signals sent so far to this job: 0 -> SIGTERM next, else SIGKILL. */
	int kills;
};
105
/* Process-wide watchfrr state; a single instance, gs, is defined below. */
static struct global_state {
	/* Current phase of the startup / phased-restart state machine. */
	enum restart_phase phase;
	/* Timer armed by set_phase(); fires phase_hanging(). */
	struct thread *t_phase_hanging;
	struct thread *t_startup_timeout;
	/* Timer that reports "FRR Operational" once things have settled. */
	struct thread *t_operational;
	/* Directory containing the "<daemon>.vty" sockets. */
	const char *vtydir;
	/* Polling period in milliseconds (used with the *_msec timers). */
	long period;
	/* Echo/connect timeout, in seconds. */
	long timeout;
	/* Seconds a background job may run before restart_kill() signals it. */
	long restart_timeout;
	/* While true, restart_kill() postpones killing the job. */
	bool reading_configuration;
	/* Bounds for the per-job restart back-off interval, in seconds. */
	long min_restart_interval;
	long max_restart_interval;
	/* Delay before the "FRR Operational" status is sent again. */
	long operational_timeout;
	/* Head of the singly linked list of monitored daemons. */
	struct daemon *daemons;
	/* Shell command templates; "%s" is replaced by the daemon name. */
	const char *restart_command;
	const char *start_command;
	const char *stop_command;
	/* Job state for the global (all daemons) restart. */
	struct restart_info restart;
	/* Verbosity threshold consulted before emitting debug messages. */
	int loglevel;
	struct daemon *special; /* points to zebra when doing phased restart */
	/* Number of monitored daemons. */
	int numdaemons;
	/* Number of background jobs currently running. */
	int numpids;
	int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
} gs = {
	.phase = PHASE_INIT,
	.vtydir = frr_vtydir,
	/* DEFAULT_PERIOD is in seconds; the field holds milliseconds. */
	.period = 1000 * DEFAULT_PERIOD,
	.timeout = DEFAULT_TIMEOUT,
	.restart_timeout = DEFAULT_RESTART_TIMEOUT,
	.loglevel = DEFAULT_LOGLEVEL,
	.min_restart_interval = DEFAULT_MIN_RESTART,
	.max_restart_interval = DEFAULT_MAX_RESTART,
	.operational_timeout = DEFAULT_OPERATIONAL_TIMEOUT,
	.restart_command = DEFAULT_RESTART_CMD,
	.start_command = DEFAULT_START_CMD,
	.stop_command = DEFAULT_STOP_CMD,
};
143
/*
 * Connection state of a monitored daemon.  The values index state_str[],
 * so the two must stay in the same order.
 */
enum daemon_state {
	/* No connection attempt has completed yet. */
	DAEMON_INIT,
	/* Not connected. */
	DAEMON_DOWN,
	/* Non-blocking connect() is still in progress. */
	DAEMON_CONNECTING,
	/* Connected. */
	DAEMON_UP,
	/* Connected, but an echo request went unanswered within the timeout. */
	DAEMON_UNRESPONSIVE
};
151
/*
 * True when the daemon is connected, whether or not it is currently
 * answering echo requests.  Evaluates DMN twice.
 */
#define IS_UP(DMN)                                                             \
	(((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
154
/* Human-readable daemon state names, indexed by enum daemon_state. */
static const char *const state_str[] = {
	"Init", "Down", "Connecting", "Up", "Unresponsive",
};
158
/* One monitored daemon; entries are chained through ->next from gs.daemons. */
struct daemon {
	/* Daemon name; also used to build the "<name>.vty" socket path. */
	const char *name;
	enum daemon_state state;
	/* Socket connected to the daemon's vty; set to -1 when closed. */
	int fd;
	/* Time the last echo was sent; tv_sec == 0 means none outstanding. */
	struct timeval echo_sent;
	/* Connection attempts made since the daemon was last marked up. */
	unsigned int connect_tries;
	/* Pending timer (reconnect, echo, no-answer, connect-hanging, ...). */
	struct thread *t_wakeup;
	/* Pending read event on fd. */
	struct thread *t_read;
	/* Pending write event on fd, used to detect connect completion. */
	struct thread *t_write;
	struct daemon *next;
	/* Job state for this daemon's own start/stop/restart command. */
	struct restart_info restart;

	/*
	 * When set, a missed echo reply still marks the daemon unresponsive
	 * but no error is logged and no restart is attempted.  This allows a
	 * daemon to be debugged (e.g. stopped under gdb) after FRR has been
	 * started without watchfrr restarting it.
	 */
	bool ignore_timeout;
};
180
/*
 * getopt_long() return codes for options that have no short form; the
 * values are chosen outside the range of plain characters.
 */
#define OPTION_MINRESTART 2000
#define OPTION_MAXRESTART 2001
#define OPTION_DRY 2002
#define OPTION_NETNS 2003
#define OPTION_MAXOPERATIONAL 2004
186
/*
 * Long command-line options.  Entries map either to a short option
 * character or to one of the OPTION_* codes above; the table ends with the
 * all-zero sentinel required by getopt_long().
 */
static const struct option longopts[] = {
	{"daemon", no_argument, NULL, 'd'},
	{"statedir", required_argument, NULL, 'S'},
	{"loglevel", required_argument, NULL, 'l'},
	{"interval", required_argument, NULL, 'i'},
	{"timeout", required_argument, NULL, 't'},
	{"restart-timeout", required_argument, NULL, 'T'},
	{"restart", required_argument, NULL, 'r'},
	{"start-command", required_argument, NULL, 's'},
	{"kill-command", required_argument, NULL, 'k'},
	{"dry", no_argument, NULL, OPTION_DRY},
	{"min-restart-interval", required_argument, NULL, OPTION_MINRESTART},
	{"max-restart-interval", required_argument, NULL, OPTION_MAXRESTART},
	{"operational-timeout", required_argument, NULL, OPTION_MAXOPERATIONAL},
	{"pid-file", required_argument, NULL, 'p'},
	{"blank-string", required_argument, NULL, 'b'},
#ifdef GNU_LINUX
	/* Only offered on Linux builds. */
	{"netns", optional_argument, NULL, OPTION_NETNS},
#endif
	{"help", no_argument, NULL, 'h'},
	{"version", no_argument, NULL, 'v'},
	{NULL, 0, NULL, 0}};
209
/* Forward declarations for functions used before their definition. */
static int try_connect(struct daemon *dmn);
static void wakeup_send_echo(struct thread *t_wakeup);
static void try_restart(struct daemon *dmn);
static void phase_check(void);
static void restart_done(struct daemon *dmn);

/* Program name printed in the usage line of the help text. */
static const char *progname;
217
218 void watchfrr_set_ignore_daemon(struct vty *vty, const char *dname, bool ignore)
219 {
220 struct daemon *dmn;
221
222 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
223 if (strncmp(dmn->name, dname, strlen(dmn->name)) == 0)
224 break;
225 }
226
227 if (dmn) {
228 dmn->ignore_timeout = ignore;
229 vty_out(vty, "%s switching to %s\n", dmn->name,
230 ignore ? "ignore" : "watch");
231 } else
232 vty_out(vty, "%s is not configured for running at the moment",
233 dname);
234 }
235
236 static void printhelp(FILE *target)
237 {
238 fprintf(target,
239 "Usage : %s [OPTION...] <daemon name> ...\n\n\
240 Watchdog program to monitor status of frr daemons and try to restart\n\
241 them if they are down or unresponsive. It determines whether a daemon is\n\
242 up based on whether it can connect to the daemon's vty unix stream socket.\n\
243 It then repeatedly sends echo commands over that socket to determine whether\n\
244 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
245 on the socket connection and know immediately that the daemon is down.\n\n\
246 The daemons to be monitored should be listed on the command line.\n\n\
247 In order to avoid attempting to restart the daemons in a fast loop,\n\
248 the -m and -M options allow you to control the minimum delay between\n\
249 restart commands. The minimum restart delay is recalculated each time\n\
250 a restart is attempted: if the time since the last restart attempt exceeds\n\
251 twice the -M value, then the restart delay is set to the -m value.\n\
252 Otherwise, the interval is doubled (but capped at the -M value).\n\n",
253 progname);
254
255 fprintf(target,
256 "Options:\n\
257 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
258 to syslog instead of stdout.\n\
259 -S, --statedir Set the vty socket directory (default is %s)\n\
260 -N, --pathspace Insert prefix into config & socket paths\n"
261 #ifdef GNU_LINUX
262 " --netns Create and/or use Linux network namespace. If no name is\n"
263 " given, uses the value from `-N`.\n"
264 #endif
265 "-l, --loglevel Set the logging level (default is %d).\n\
266 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
267 but it can be set higher than %d if extra-verbose debugging\n\
268 messages are desired.\n\
269 --min-restart-interval\n\
270 Set the minimum seconds to wait between invocations of daemon\n\
271 restart commands (default is %d).\n\
272 --max-restart-interval\n\
273 Set the maximum seconds to wait between invocations of daemon\n\
274 restart commands (default is %d).\n\
275 --operational-timeout\n\
276 Set the time before systemd is notified that we are considered\n\
277 operational again after a daemon restart (default is %d).\n\
278 -i, --interval Set the status polling interval in seconds (default is %d)\n\
279 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
280 -T, --restart-timeout\n\
281 Set the restart (kill) timeout in seconds (default is %d).\n\
282 If any background jobs are still running after this much\n\
283 time has elapsed, they will be killed.\n\
284 -r, --restart Supply a Bourne shell command to use to restart a single\n\
285 daemon. The command string should include '%%s' where the\n\
286 name of the daemon should be substituted.\n\
287 (default: '%s')\n\
288 -s, --start-command\n\
289 Supply a Bourne shell to command to use to start a single\n\
290 daemon. The command string should include '%%s' where the\n\
291 name of the daemon should be substituted.\n\
292 (default: '%s')\n\
293 -k, --kill-command\n\
294 Supply a Bourne shell to command to use to stop a single\n\
295 daemon. The command string should include '%%s' where the\n\
296 name of the daemon should be substituted.\n\
297 (default: '%s')\n\
298 --dry Do not start or restart anything, just log.\n\
299 -p, --pid-file Set process identifier file name\n\
300 (default is %s/watchfrr.pid).\n\
301 -b, --blank-string\n\
302 When the supplied argument string is found in any of the\n\
303 various shell command arguments (-r, -s, or -k), replace\n\
304 it with a space. This is an ugly hack to circumvent problems\n\
305 passing command-line arguments with embedded spaces.\n\
306 -v, --version Print program version\n\
307 -h, --help Display this help and exit\n",
308 frr_vtydir, DEFAULT_LOGLEVEL, LOG_EMERG, LOG_DEBUG, LOG_DEBUG,
309 DEFAULT_MIN_RESTART, DEFAULT_MAX_RESTART,
310 DEFAULT_OPERATIONAL_TIMEOUT, DEFAULT_PERIOD, DEFAULT_TIMEOUT,
311 DEFAULT_RESTART_TIMEOUT, DEFAULT_RESTART_CMD, DEFAULT_START_CMD,
312 DEFAULT_STOP_CMD, frr_vtydir);
313 }
314
315 static pid_t run_background(char *shell_cmd)
316 {
317 pid_t child;
318
319 switch (child = fork()) {
320 case -1:
321 flog_err_sys(EC_LIB_SYSTEM_CALL,
322 "fork failed, cannot run command [%s]: %s",
323 shell_cmd, safe_strerror(errno));
324 return -1;
325 case 0:
326 /* Child process. */
327 /* Use separate process group so child processes can be killed
328 * easily. */
329 if (setpgid(0, 0) < 0)
330 zlog_warn("setpgid(0,0) failed: %s",
331 safe_strerror(errno));
332 {
333 char shell[] = "sh";
334 char dashc[] = "-c";
335 char *const argv[4] = {shell, dashc, shell_cmd, NULL};
336 execv("/bin/sh", argv);
337 flog_err_sys(EC_LIB_SYSTEM_CALL,
338 "execv(/bin/sh -c '%s') failed: %s",
339 shell_cmd, safe_strerror(errno));
340 _exit(127);
341 }
342 default:
343 /* Parent process: we will reap the child later. */
344 zlog_info("Forked background command [pid %d]: %s", (int)child,
345 shell_cmd);
346 return child;
347 }
348 }
349
350 static struct timeval *time_elapsed(struct timeval *result,
351 const struct timeval *start_time)
352 {
353 gettimeofday(result, NULL);
354 result->tv_sec -= start_time->tv_sec;
355 result->tv_usec -= start_time->tv_usec;
356 while (result->tv_usec < 0) {
357 result->tv_usec += 1000000L;
358 result->tv_sec--;
359 }
360 return result;
361 }
362
363 static void restart_kill(struct thread *t_kill)
364 {
365 struct restart_info *restart = THREAD_ARG(t_kill);
366 struct timeval delay;
367
368 time_elapsed(&delay, &restart->time);
369
370 if (gs.reading_configuration) {
371 zlog_err(
372 "%s %s child process appears to still be reading configuration, delaying for another %lu time",
373 restart->what, restart->name, gs.restart_timeout);
374 thread_add_timer(master, restart_kill, restart,
375 gs.restart_timeout, &restart->t_kill);
376 return;
377 }
378
379 zlog_warn(
380 "%s %s child process %d still running after %ld seconds, sending signal %d",
381 restart->what, restart->name, (int)restart->pid,
382 (long)delay.tv_sec, (restart->kills ? SIGKILL : SIGTERM));
383 kill(-restart->pid, (restart->kills ? SIGKILL : SIGTERM));
384 restart->kills++;
385 thread_add_timer(master, restart_kill, restart, gs.restart_timeout,
386 &restart->t_kill);
387 }
388
389 static struct restart_info *find_child(pid_t child)
390 {
391 struct daemon *dmn;
392 if (gs.restart.pid == child)
393 return &gs.restart;
394
395 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
396 if (dmn->restart.pid == child)
397 return &dmn->restart;
398 }
399 return NULL;
400 }
401
402 static void sigchild(void)
403 {
404 pid_t child;
405 int status;
406 const char *name;
407 const char *what;
408 struct restart_info *restart;
409 struct daemon *dmn;
410
411 switch (child = waitpid(-1, &status, WNOHANG)) {
412 case -1:
413 flog_err_sys(EC_LIB_SYSTEM_CALL, "waitpid failed: %s",
414 safe_strerror(errno));
415 return;
416 case 0:
417 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
418 return;
419 }
420
421 if (child == integrated_write_pid) {
422 integrated_write_sigchld(status);
423 return;
424 }
425
426 if ((restart = find_child(child)) != NULL) {
427 name = restart->name;
428 what = restart->what;
429 restart->pid = 0;
430 gs.numpids--;
431 thread_cancel(&restart->t_kill);
432
433 /* Update restart time to reflect the time the command
434 * completed. */
435 gettimeofday(&restart->time, NULL);
436 } else {
437 flog_err_sys(
438 EC_LIB_SYSTEM_CALL,
439 "waitpid returned status for an unknown child process %d",
440 (int)child);
441 name = "(unknown)";
442 what = "background";
443 }
444 if (WIFSTOPPED(status))
445 zlog_warn("%s %s process %d is stopped", what, name,
446 (int)child);
447 else if (WIFSIGNALED(status))
448 zlog_warn("%s %s process %d terminated due to signal %d", what,
449 name, (int)child, WTERMSIG(status));
450 else if (WIFEXITED(status)) {
451 if (WEXITSTATUS(status) != 0)
452 zlog_warn(
453 "%s %s process %d exited with non-zero status %d",
454 what, name, (int)child, WEXITSTATUS(status));
455 else {
456 zlog_debug("%s %s process %d exited normally", what,
457 name, (int)child);
458
459 if (restart && restart != &gs.restart) {
460 dmn = container_of(restart, struct daemon,
461 restart);
462 restart_done(dmn);
463 } else if (restart)
464 for (dmn = gs.daemons; dmn; dmn = dmn->next)
465 restart_done(dmn);
466 }
467 } else
468 flog_err_sys(
469 EC_LIB_SYSTEM_CALL,
470 "cannot interpret %s %s process %d wait status 0x%x",
471 what, name, (int)child, status);
472 phase_check();
473 }
474
475 static int run_job(struct restart_info *restart, const char *cmdtype,
476 const char *command, int force, int update_interval)
477 {
478 struct timeval delay;
479
480 if (gs.loglevel > LOG_DEBUG + 1)
481 zlog_debug("attempting to %s %s", cmdtype, restart->name);
482
483 if (restart->pid) {
484 if (gs.loglevel > LOG_DEBUG + 1)
485 zlog_debug(
486 "cannot %s %s, previous pid %d still running",
487 cmdtype, restart->name, (int)restart->pid);
488 return -1;
489 }
490
491 char buffer[512];
492
493 snprintf(buffer, sizeof(buffer), "restarting %s", restart->name);
494 systemd_send_status(buffer);
495
496 /* Note: time_elapsed test must come before the force test, since we
497 need
498 to make sure that delay is initialized for use below in updating the
499 restart interval. */
500 if ((time_elapsed(&delay, &restart->time)->tv_sec < restart->interval)
501 && !force) {
502
503 if (gs.loglevel > LOG_DEBUG + 1)
504 zlog_debug(
505 "postponing %s %s: elapsed time %ld < retry interval %ld",
506 cmdtype, restart->name, (long)delay.tv_sec,
507 restart->interval);
508 return -1;
509 }
510
511 gettimeofday(&restart->time, NULL);
512 restart->kills = 0;
513 {
514 char cmd[strlen(command) + strlen(restart->name) + 1];
515 snprintf(cmd, sizeof(cmd), command, restart->name);
516 if ((restart->pid = run_background(cmd)) > 0) {
517 thread_add_timer(master, restart_kill, restart,
518 gs.restart_timeout, &restart->t_kill);
519 restart->what = cmdtype;
520 gs.numpids++;
521 } else
522 restart->pid = 0;
523 }
524
525 /* Calculate the new restart interval. */
526 if (update_interval) {
527 if (delay.tv_sec > 2 * gs.max_restart_interval)
528 restart->interval = gs.min_restart_interval;
529 else if ((restart->interval *= 2) > gs.max_restart_interval)
530 restart->interval = gs.max_restart_interval;
531 if (gs.loglevel > LOG_DEBUG + 1)
532 zlog_debug("restart %s interval is now %ld",
533 restart->name, restart->interval);
534 }
535 return restart->pid;
536 }
537
/*
 * Helpers that (re)arm a daemon's read handler or wakeup timer.
 *
 * Each macro clears the stored thread pointer before scheduling; callers
 * that may still have an event pending must cancel it first.
 *
 * The do { } while (0) wrappers deliberately end WITHOUT a semicolon: the
 * caller's own ';' completes the statement, which keeps the macros usable
 * as the body of an unbraced if/else.
 */

/* Schedule handle_read() for when the daemon's socket becomes readable. */
#define SET_READ_HANDLER(DMN)                                                  \
	do {                                                                   \
		(DMN)->t_read = NULL;                                          \
		thread_add_read(master, handle_read, (DMN), (DMN)->fd,         \
				&(DMN)->t_read);                               \
	} while (0)

/* Schedule wakeup_down() after one jittered polling period. */
#define SET_WAKEUP_DOWN(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_down, (DMN),              \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Schedule wakeup_unresponsive() after one jittered polling period. */
#define SET_WAKEUP_UNRESPONSIVE(DMN)                                           \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_unresponsive, (DMN),      \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Schedule wakeup_send_echo() after one jittered polling period. */
#define SET_WAKEUP_ECHO(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_send_echo, (DMN),         \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)
565
566 static void wakeup_down(struct thread *t_wakeup)
567 {
568 struct daemon *dmn = THREAD_ARG(t_wakeup);
569
570 dmn->t_wakeup = NULL;
571 if (try_connect(dmn) < 0)
572 SET_WAKEUP_DOWN(dmn);
573 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
574 try_restart(dmn);
575 }
576
577 static void wakeup_init(struct thread *t_wakeup)
578 {
579 struct daemon *dmn = THREAD_ARG(t_wakeup);
580
581 dmn->t_wakeup = NULL;
582 if (try_connect(dmn) < 0) {
583 zlog_info(
584 "%s state -> down : initial connection attempt failed",
585 dmn->name);
586 dmn->state = DAEMON_DOWN;
587 }
588 phase_check();
589 }
590
591 static void restart_done(struct daemon *dmn)
592 {
593 if (dmn->state != DAEMON_DOWN) {
594 zlog_warn(
595 "Daemon: %s: is in %s state but expected it to be in DAEMON_DOWN state",
596 dmn->name, state_str[dmn->state]);
597 return;
598 }
599 THREAD_OFF(dmn->t_wakeup);
600
601 if (try_connect(dmn) < 0)
602 SET_WAKEUP_DOWN(dmn);
603 }
604
/*
 * Timer callback (armed through gs.t_operational): report to systemd that
 * FRR is operational again.
 */
static void daemon_restarting_operational(struct thread *thread)
{
	/* The event itself carries no information that is needed here. */
	(void)thread;

	systemd_send_status("FRR Operational");
}
609
610 static void daemon_down(struct daemon *dmn, const char *why)
611 {
612 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
613 flog_err(EC_WATCHFRR_CONNECTION, "%s state -> down : %s",
614 dmn->name, why);
615 else if (gs.loglevel > LOG_DEBUG)
616 zlog_debug("%s still down : %s", dmn->name, why);
617 if (IS_UP(dmn))
618 gs.numdown++;
619 dmn->state = DAEMON_DOWN;
620 if (dmn->fd >= 0) {
621 close(dmn->fd);
622 dmn->fd = -1;
623 }
624 THREAD_OFF(dmn->t_read);
625 THREAD_OFF(dmn->t_write);
626 THREAD_OFF(dmn->t_wakeup);
627 if (try_connect(dmn) < 0)
628 SET_WAKEUP_DOWN(dmn);
629
630 systemd_send_status("FRR partially operational");
631 phase_check();
632 }
633
634 static void handle_read(struct thread *t_read)
635 {
636 struct daemon *dmn = THREAD_ARG(t_read);
637 static const char resp[sizeof(PING_TOKEN) + 4] = PING_TOKEN "\n";
638 char buf[sizeof(resp) + 100];
639 ssize_t rc;
640 struct timeval delay;
641
642 dmn->t_read = NULL;
643 if ((rc = read(dmn->fd, buf, sizeof(buf))) < 0) {
644 char why[100];
645
646 if (ERRNO_IO_RETRY(errno)) {
647 /* Pretend it never happened. */
648 SET_READ_HANDLER(dmn);
649 return;
650 }
651 snprintf(why, sizeof(why), "unexpected read error: %s",
652 safe_strerror(errno));
653 daemon_down(dmn, why);
654 return;
655 }
656 if (rc == 0) {
657 daemon_down(dmn, "read returned EOF");
658 return;
659 }
660 if (!dmn->echo_sent.tv_sec) {
661 char why[sizeof(buf) + 100];
662 snprintf(why, sizeof(why),
663 "unexpected read returns %d bytes: %.*s", (int)rc,
664 (int)rc, buf);
665 daemon_down(dmn, why);
666 return;
667 }
668
669 /* We are expecting an echo response: is there any chance that the
670 response would not be returned entirely in the first read? That
671 seems inconceivable... */
672 if ((rc != sizeof(resp)) || memcmp(buf, resp, sizeof(resp))) {
673 char why[100 + sizeof(buf)];
674 snprintf(why, sizeof(why),
675 "read returned bad echo response of %d bytes (expecting %u): %.*s",
676 (int)rc, (unsigned int)sizeof(resp), (int)rc, buf);
677 daemon_down(dmn, why);
678 return;
679 }
680
681 time_elapsed(&delay, &dmn->echo_sent);
682 dmn->echo_sent.tv_sec = 0;
683 if (dmn->state == DAEMON_UNRESPONSIVE) {
684 if (delay.tv_sec < gs.timeout) {
685 dmn->state = DAEMON_UP;
686 zlog_warn(
687 "%s state -> up : echo response received after %ld.%06ld seconds",
688 dmn->name, (long)delay.tv_sec,
689 (long)delay.tv_usec);
690 } else
691 zlog_warn(
692 "%s: slow echo response finally received after %ld.%06ld seconds",
693 dmn->name, (long)delay.tv_sec,
694 (long)delay.tv_usec);
695 } else if (gs.loglevel > LOG_DEBUG + 1)
696 zlog_debug("%s: echo response received after %ld.%06ld seconds",
697 dmn->name, (long)delay.tv_sec, (long)delay.tv_usec);
698
699 SET_READ_HANDLER(dmn);
700 thread_cancel(&dmn->t_wakeup);
701 SET_WAKEUP_ECHO(dmn);
702 }
703
704 /*
705 * Wait till we notice that all daemons are ready before
706 * we send we are ready to systemd
707 */
708 static void daemon_send_ready(int exitcode)
709 {
710 FILE *fp;
711 static int sent = 0;
712 char started[1024];
713
714 if (sent)
715 return;
716
717 if (exitcode == 0)
718 zlog_notice("all daemons up, doing startup-complete notify");
719 else if (gs.numdown < gs.numdaemons)
720 flog_err(EC_WATCHFRR_CONNECTION,
721 "startup did not complete within timeout (%d/%d daemons running)",
722 gs.numdaemons - gs.numdown, gs.numdaemons);
723 else {
724 flog_err(EC_WATCHFRR_CONNECTION,
725 "all configured daemons failed to start -- exiting watchfrr");
726 exit(exitcode);
727
728 }
729
730 frr_detach();
731
732 snprintf(started, sizeof(started), "%s/%s", frr_vtydir,
733 "watchfrr.started");
734 fp = fopen(started, "w");
735 if (fp)
736 fclose(fp);
737
738 systemd_send_started(master);
739 systemd_send_status("FRR Operational");
740 sent = 1;
741 }
742
743 static void daemon_up(struct daemon *dmn, const char *why)
744 {
745 dmn->state = DAEMON_UP;
746 gs.numdown--;
747 dmn->connect_tries = 0;
748 zlog_notice("%s state -> up : %s", dmn->name, why);
749 if (gs.numdown == 0) {
750 daemon_send_ready(0);
751
752 THREAD_OFF(gs.t_operational);
753
754 thread_add_timer(master, daemon_restarting_operational, NULL,
755 gs.operational_timeout, &gs.t_operational);
756 }
757
758 SET_WAKEUP_ECHO(dmn);
759 phase_check();
760 }
761
762 static void check_connect(struct thread *t_write)
763 {
764 struct daemon *dmn = THREAD_ARG(t_write);
765 int sockerr;
766 socklen_t reslen = sizeof(sockerr);
767
768 dmn->t_write = NULL;
769 if (getsockopt(dmn->fd, SOL_SOCKET, SO_ERROR, (char *)&sockerr, &reslen)
770 < 0) {
771 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn->name,
772 safe_strerror(errno));
773 daemon_down(dmn,
774 "getsockopt failed checking connection success");
775 return;
776 }
777 if ((reslen == sizeof(sockerr)) && sockerr) {
778 char why[100];
779 snprintf(
780 why, sizeof(why),
781 "getsockopt reports that connection attempt failed: %s",
782 safe_strerror(sockerr));
783 daemon_down(dmn, why);
784 return;
785 }
786
787 daemon_up(dmn, "delayed connect succeeded");
788 }
789
790 static void wakeup_connect_hanging(struct thread *t_wakeup)
791 {
792 struct daemon *dmn = THREAD_ARG(t_wakeup);
793 char why[100];
794
795 dmn->t_wakeup = NULL;
796 snprintf(why, sizeof(why),
797 "connection attempt timed out after %ld seconds", gs.timeout);
798 daemon_down(dmn, why);
799 }
800
801 /* Making connection to protocol daemon. */
802 static int try_connect(struct daemon *dmn)
803 {
804 int sock;
805 struct sockaddr_un addr;
806 socklen_t len;
807
808 if (gs.loglevel > LOG_DEBUG + 1)
809 zlog_debug("%s: attempting to connect", dmn->name);
810 dmn->connect_tries++;
811
812 memset(&addr, 0, sizeof(addr));
813 addr.sun_family = AF_UNIX;
814 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty", gs.vtydir,
815 dmn->name);
816 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
817 len = addr.sun_len = SUN_LEN(&addr);
818 #else
819 len = sizeof(addr.sun_family) + strlen(addr.sun_path);
820 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
821
822 /* Quick check to see if we might succeed before we go to the trouble
823 of creating a socket. */
824 if (access(addr.sun_path, W_OK) < 0) {
825 if (errno != ENOENT)
826 flog_err_sys(EC_LIB_SYSTEM_CALL,
827 "%s: access to socket %s denied: %s",
828 dmn->name, addr.sun_path,
829 safe_strerror(errno));
830 return -1;
831 }
832
833 if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
834 flog_err_sys(EC_LIB_SOCKET, "%s(%s): cannot make socket: %s",
835 __func__, addr.sun_path, safe_strerror(errno));
836 return -1;
837 }
838
839 if (set_nonblocking(sock) < 0 || set_cloexec(sock) < 0) {
840 flog_err_sys(EC_LIB_SYSTEM_CALL,
841 "%s(%s): set_nonblocking/cloexec(%d) failed",
842 __func__, addr.sun_path, sock);
843 close(sock);
844 return -1;
845 }
846
847 if (connect(sock, (struct sockaddr *)&addr, len) < 0) {
848 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK)) {
849 if (gs.loglevel > LOG_DEBUG)
850 zlog_debug("%s(%s): connect failed: %s",
851 __func__, addr.sun_path,
852 safe_strerror(errno));
853 close(sock);
854 return -1;
855 }
856 if (gs.loglevel > LOG_DEBUG)
857 zlog_debug("%s: connection in progress", dmn->name);
858 dmn->state = DAEMON_CONNECTING;
859 dmn->fd = sock;
860 thread_add_write(master, check_connect, dmn, dmn->fd,
861 &dmn->t_write);
862 thread_add_timer(master, wakeup_connect_hanging, dmn,
863 gs.timeout, &dmn->t_wakeup);
864 SET_READ_HANDLER(dmn);
865 return 0;
866 }
867
868 dmn->fd = sock;
869 SET_READ_HANDLER(dmn);
870 daemon_up(dmn, "connect succeeded");
871 return 1;
872 }
873
874 static void phase_hanging(struct thread *t_hanging)
875 {
876 gs.t_phase_hanging = NULL;
877 flog_err(EC_WATCHFRR_CONNECTION,
878 "Phase [%s] hanging for %ld seconds, aborting phased restart",
879 phase_str[gs.phase], PHASE_TIMEOUT);
880 gs.phase = PHASE_NONE;
881 }
882
883 static void set_phase(enum restart_phase new_phase)
884 {
885 gs.phase = new_phase;
886 thread_cancel(&gs.t_phase_hanging);
887
888 thread_add_timer(master, phase_hanging, NULL, PHASE_TIMEOUT,
889 &gs.t_phase_hanging);
890 }
891
892 static void phase_check(void)
893 {
894 struct daemon *dmn;
895
896 switch (gs.phase) {
897 case PHASE_NONE:
898 break;
899
900 case PHASE_INIT:
901 for (dmn = gs.daemons; dmn; dmn = dmn->next)
902 if (dmn->state == DAEMON_INIT)
903 return;
904
905 /* startup complete, everything out of INIT */
906 gs.phase = PHASE_NONE;
907 for (dmn = gs.daemons; dmn; dmn = dmn->next)
908 if (dmn->state == DAEMON_DOWN) {
909 SET_WAKEUP_DOWN(dmn);
910 try_restart(dmn);
911 }
912 break;
913 case PHASE_STOPS_PENDING:
914 if (gs.numpids)
915 break;
916 zlog_info(
917 "Phased restart: all routing daemon stop jobs have completed.");
918 set_phase(PHASE_WAITING_DOWN);
919
920 /*FALLTHRU*/
921 case PHASE_WAITING_DOWN:
922 if (gs.numdown + IS_UP(gs.special) < gs.numdaemons)
923 break;
924 systemd_send_status("Phased Restart");
925 zlog_info("Phased restart: all routing daemons now down.");
926 run_job(&gs.special->restart, "restart", gs.restart_command, 1,
927 1);
928 set_phase(PHASE_ZEBRA_RESTART_PENDING);
929
930 /*FALLTHRU*/
931 case PHASE_ZEBRA_RESTART_PENDING:
932 if (gs.special->restart.pid)
933 break;
934 systemd_send_status("Zebra Restarting");
935 zlog_info("Phased restart: %s restart job completed.",
936 gs.special->name);
937 set_phase(PHASE_WAITING_ZEBRA_UP);
938
939 /*FALLTHRU*/
940 case PHASE_WAITING_ZEBRA_UP:
941 if (!IS_UP(gs.special))
942 break;
943 zlog_info("Phased restart: %s is now up.", gs.special->name);
944 {
945 struct daemon *dmn;
946 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
947 if (dmn != gs.special)
948 run_job(&dmn->restart, "start",
949 gs.start_command, 1, 0);
950 }
951 }
952 gs.phase = PHASE_NONE;
953 THREAD_OFF(gs.t_phase_hanging);
954 zlog_notice("Phased global restart has completed.");
955 break;
956 }
957 }
958
959 static void try_restart(struct daemon *dmn)
960 {
961 if (watch_only)
962 return;
963
964 if (dmn != gs.special) {
965 if ((gs.special->state == DAEMON_UP)
966 && (gs.phase == PHASE_NONE))
967 run_job(&dmn->restart, "restart", gs.restart_command, 0,
968 1);
969 else
970 zlog_debug(
971 "%s: postponing restart attempt because master %s daemon not up [%s], or phased restart in progress",
972 dmn->name, gs.special->name,
973 state_str[gs.special->state]);
974 return;
975 }
976
977 if ((gs.phase != PHASE_NONE) || gs.numpids) {
978 if (gs.loglevel > LOG_DEBUG + 1)
979 zlog_debug(
980 "postponing phased global restart: restart already in progress [%s], or outstanding child processes [%d]",
981 phase_str[gs.phase], gs.numpids);
982 return;
983 }
984 /* Is it too soon for a restart? */
985 {
986 struct timeval delay;
987 if (time_elapsed(&delay, &gs.special->restart.time)->tv_sec
988 < gs.special->restart.interval) {
989 if (gs.loglevel > LOG_DEBUG + 1)
990 zlog_debug(
991 "postponing phased global restart: elapsed time %ld < retry interval %ld",
992 (long)delay.tv_sec,
993 gs.special->restart.interval);
994 return;
995 }
996 }
997 run_job(&gs.restart, "restart", gs.restart_command, 0, 1);
998 }
999
1000 static void wakeup_unresponsive(struct thread *t_wakeup)
1001 {
1002 struct daemon *dmn = THREAD_ARG(t_wakeup);
1003
1004 dmn->t_wakeup = NULL;
1005 if (dmn->state != DAEMON_UNRESPONSIVE)
1006 flog_err(EC_WATCHFRR_CONNECTION,
1007 "%s: no longer unresponsive (now %s), wakeup should have been cancelled!",
1008 dmn->name, state_str[dmn->state]);
1009 else {
1010 SET_WAKEUP_UNRESPONSIVE(dmn);
1011 try_restart(dmn);
1012 }
1013 }
1014
1015 static void wakeup_no_answer(struct thread *t_wakeup)
1016 {
1017 struct daemon *dmn = THREAD_ARG(t_wakeup);
1018
1019 dmn->t_wakeup = NULL;
1020 dmn->state = DAEMON_UNRESPONSIVE;
1021 if (dmn->ignore_timeout)
1022 return;
1023 flog_err(EC_WATCHFRR_CONNECTION,
1024 "%s state -> unresponsive : no response yet to ping sent %ld seconds ago",
1025 dmn->name, gs.timeout);
1026 SET_WAKEUP_UNRESPONSIVE(dmn);
1027 try_restart(dmn);
1028 }
1029
1030 static void wakeup_send_echo(struct thread *t_wakeup)
1031 {
1032 static const char echocmd[] = "echo " PING_TOKEN;
1033 ssize_t rc;
1034 struct daemon *dmn = THREAD_ARG(t_wakeup);
1035
1036 dmn->t_wakeup = NULL;
1037 if (((rc = write(dmn->fd, echocmd, sizeof(echocmd))) < 0)
1038 || ((size_t)rc != sizeof(echocmd))) {
1039 char why[100 + sizeof(echocmd)];
1040 snprintf(why, sizeof(why),
1041 "write '%s' returned %d instead of %u", echocmd,
1042 (int)rc, (unsigned int)sizeof(echocmd));
1043 daemon_down(dmn, why);
1044 } else {
1045 gettimeofday(&dmn->echo_sent, NULL);
1046 thread_add_timer(master, wakeup_no_answer, dmn, gs.timeout,
1047 &dmn->t_wakeup);
1048 }
1049 }
1050
1051 bool check_all_up(void)
1052 {
1053 struct daemon *dmn;
1054
1055 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1056 if (dmn->state != DAEMON_UP)
1057 return false;
1058 return true;
1059 }
1060
1061 void watchfrr_status(struct vty *vty)
1062 {
1063 struct daemon *dmn;
1064 struct timeval delay;
1065
1066 vty_out(vty, "watchfrr global phase: %s\n", phase_str[gs.phase]);
1067 vty_out(vty, " Restart Command: %pSQq\n", gs.restart_command);
1068 vty_out(vty, " Start Command: %pSQq\n", gs.start_command);
1069 vty_out(vty, " Stop Command: %pSQq\n", gs.stop_command);
1070 vty_out(vty, " Min Restart Interval: %ld\n", gs.min_restart_interval);
1071 vty_out(vty, " Max Restart Interval: %ld\n", gs.max_restart_interval);
1072 vty_out(vty, " Restart Timeout: %ld\n", gs.restart_timeout);
1073 vty_out(vty, " Reading Configuration: %s\n",
1074 gs.reading_configuration ? "yes" : "no");
1075 if (gs.restart.pid)
1076 vty_out(vty, " global restart running, pid %ld\n",
1077 (long)gs.restart.pid);
1078
1079 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1080 vty_out(vty, " %-20s %s%s", dmn->name, state_str[dmn->state],
1081 dmn->ignore_timeout ? "/Ignoring Timeout\n" : "\n");
1082 if (dmn->restart.pid)
1083 vty_out(vty, " restart running, pid %ld\n",
1084 (long)dmn->restart.pid);
1085 else if (dmn->state == DAEMON_DOWN &&
1086 time_elapsed(&delay, &dmn->restart.time)->tv_sec
1087 < dmn->restart.interval)
1088 vty_out(vty, " restarting in %jd seconds (%jds backoff interval)\n",
1089 (intmax_t)dmn->restart.interval
1090 - (intmax_t)delay.tv_sec,
1091 (intmax_t)dmn->restart.interval);
1092 }
1093 }
1094
/*
 * Termination handler, registered for both SIGINT and SIGTERM in
 * watchfrr_signals[]: log the shutdown, call systemd_send_stopping() and
 * exit with status 0.
 */
static void sigint(void)
{
	zlog_notice("Terminating on signal");

	systemd_send_stopping();

	exit(0);
}
1101
/*
 * Check that a command template contains exactly one '%' and that it is
 * immediately followed by 's' (i.e. a single "%s" placeholder).
 *
 * Returns 1 when the template is acceptable, 0 otherwise (including when
 * cmd is NULL).
 */
static int valid_command(const char *cmd)
{
	const char *pct;

	if (!cmd)
		return 0;

	pct = strchr(cmd, '%');
	if (!pct)
		return 0;

	/* pct[1] is at worst the terminating NUL, so this read is in bounds. */
	if (pct[1] != 's')
		return 0;

	/* Any further '%' would introduce a second conversion. */
	return strchr(pct + 1, '%') == NULL;
}
1112
/* This is an ugly hack to circumvent problems with passing command-line
 * arguments that contain spaces.  The fix is to use a configuration file.
 *
 * Returns a freshly allocated copy of cmd in which every occurrence of
 * blankstr has been replaced by a single space.  The caller owns the result.
 * The process exits with status 1 if the copy cannot be allocated.
 *
 * An empty blankstr, or a blankstr that is itself a single space, leaves the
 * copy unchanged.  Without that guard the loop below would never finish: an
 * empty needle matches at the start of the string forever (and the memmove
 * would grow the string past its allocation), and a " " needle would keep
 * matching the space that was just written.
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	char *res;
	char *p;
	size_t bslen = strlen(blankstr);

	res = strdup(cmd);
	if (!res) {
		perror("strdup");
		exit(1);
	}

	/* Degenerate needles: nothing meaningful to translate. */
	if (bslen == 0 || strcmp(blankstr, " ") == 0)
		return res;

	while ((p = strstr(res, blankstr)) != NULL) {
		*p = ' ';
		/* Close the gap left by the remainder of the needle,
		 * carrying the terminating NUL along.
		 */
		if (bslen != 1)
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
	}
	return res;
}
1132
/*
 * Timer callback armed by watchfrr_init() with a delay of STARTUP_TIMEOUT
 * milliseconds.  It calls daemon_send_ready(1); the thread argument is not
 * used.
 */
static void startup_timeout(struct thread *t_wakeup)
{
	(void)t_wakeup;

	daemon_send_ready(1);
}
1137
1138 #ifdef GNU_LINUX
1139
1140 #include <sys/mount.h>
1141 #include <sched.h>
1142
1143 #define NETNS_RUN_DIR "/var/run/netns"
1144
1145 static void netns_create(int dirfd, const char *nsname)
1146 {
1147 /* make /var/run/netns shared between mount namespaces
1148 * just like iproute2 sets it up
1149 */
1150 if (mount("", NETNS_RUN_DIR, "none", MS_SHARED | MS_REC, NULL)) {
1151 if (errno != EINVAL) {
1152 perror("mount");
1153 exit(1);
1154 }
1155
1156 if (mount(NETNS_RUN_DIR, NETNS_RUN_DIR, "none",
1157 MS_BIND | MS_REC, NULL)) {
1158 perror("mount");
1159 exit(1);
1160 }
1161
1162 if (mount("", NETNS_RUN_DIR, "none", MS_SHARED | MS_REC,
1163 NULL)) {
1164 perror("mount");
1165 exit(1);
1166 }
1167 }
1168
1169 /* need an empty file to mount on top of */
1170 int nsfd = openat(dirfd, nsname, O_CREAT | O_RDONLY | O_EXCL, 0);
1171
1172 if (nsfd < 0) {
1173 fprintf(stderr, "failed to create \"%s/%s\": %s\n",
1174 NETNS_RUN_DIR, nsname, strerror(errno));
1175 exit(1);
1176 }
1177 close(nsfd);
1178
1179 if (unshare(CLONE_NEWNET)) {
1180 perror("unshare");
1181 unlinkat(dirfd, nsname, 0);
1182 exit(1);
1183 }
1184
1185 char *dstpath = asprintfrr(MTYPE_TMP, "%s/%s", NETNS_RUN_DIR, nsname);
1186
1187 /* bind-mount so the namespace has a name and is persistent */
1188 if (mount("/proc/self/ns/net", dstpath, "none", MS_BIND, NULL) < 0) {
1189 fprintf(stderr, "failed to bind-mount netns to \"%s\": %s\n",
1190 dstpath, strerror(errno));
1191 unlinkat(dirfd, nsname, 0);
1192 exit(1);
1193 }
1194
1195 XFREE(MTYPE_TMP, dstpath);
1196 }
1197
1198 static void netns_setup(const char *nsname)
1199 {
1200 int dirfd, nsfd;
1201
1202 dirfd = open(NETNS_RUN_DIR, O_DIRECTORY | O_RDONLY);
1203 if (dirfd < 0) {
1204 if (errno == ENOTDIR) {
1205 fprintf(stderr, "error: \"%s\" is not a directory!\n",
1206 NETNS_RUN_DIR);
1207 exit(1);
1208 } else if (errno == ENOENT) {
1209 if (mkdir(NETNS_RUN_DIR, 0755)) {
1210 fprintf(stderr, "error: \"%s\": mkdir: %s\n",
1211 NETNS_RUN_DIR, strerror(errno));
1212 exit(1);
1213 }
1214 dirfd = open(NETNS_RUN_DIR, O_DIRECTORY | O_RDONLY);
1215 if (dirfd < 0) {
1216 fprintf(stderr, "error: \"%s\": opendir: %s\n",
1217 NETNS_RUN_DIR, strerror(errno));
1218 exit(1);
1219 }
1220 } else {
1221 fprintf(stderr, "error: \"%s\": %s\n",
1222 NETNS_RUN_DIR, strerror(errno));
1223 exit(1);
1224 }
1225 }
1226
1227 nsfd = openat(dirfd, nsname, O_RDONLY);
1228 if (nsfd < 0 && errno != ENOENT) {
1229 fprintf(stderr, "error: \"%s/%s\": %s\n",
1230 NETNS_RUN_DIR, nsname, strerror(errno));
1231 exit(1);
1232 }
1233 if (nsfd < 0)
1234 netns_create(dirfd, nsname);
1235 else {
1236 if (setns(nsfd, CLONE_NEWNET)) {
1237 perror("setns");
1238 exit(1);
1239 }
1240 close(nsfd);
1241 }
1242 close(dirfd);
1243
1244 /* make sure loopback is up... weird things happen otherwise.
1245 * ioctl is perfectly fine for this, don't need netlink...
1246 */
1247 int sockfd;
1248 struct ifreq ifr = { };
1249
1250 strlcpy(ifr.ifr_name, "lo", sizeof(ifr.ifr_name));
1251
1252 sockfd = socket(AF_INET, SOCK_DGRAM, 0);
1253 if (sockfd < 0) {
1254 perror("socket");
1255 exit(1);
1256 }
1257 if (ioctl(sockfd, SIOCGIFFLAGS, &ifr)) {
1258 perror("ioctl(SIOCGIFFLAGS, \"lo\")");
1259 exit(1);
1260 }
1261 if (!(ifr.ifr_flags & IFF_UP)) {
1262 ifr.ifr_flags |= IFF_UP;
1263 if (ioctl(sockfd, SIOCSIFFLAGS, &ifr)) {
1264 perror("ioctl(SIOCSIFFLAGS, \"lo\")");
1265 exit(1);
1266 }
1267 }
1268 close(sockfd);
1269 }
1270
1271 #else /* !GNU_LINUX */
1272
/*
 * Stub for builds without GNU_LINUX: network namespaces cannot be used, so
 * print an explanation and exit with status 1.  nsname is ignored.
 */
static void netns_setup(const char *nsname)
{
	(void)nsname;

	fprintf(stderr, "network namespaces are only available on Linux\n");
	exit(1);
}
1278 #endif
1279
1280 static void watchfrr_start_config(void)
1281 {
1282 gs.reading_configuration = true;
1283 }
1284
1285 static void watchfrr_end_config(void)
1286 {
1287 gs.reading_configuration = false;
1288 }
1289
1290 static void watchfrr_init(int argc, char **argv)
1291 {
1292 const char *special = "zebra";
1293 int i;
1294 struct daemon *dmn, **add = &gs.daemons;
1295 char alldaemons[512] = "", *p = alldaemons;
1296
1297 thread_add_timer_msec(master, startup_timeout, NULL, STARTUP_TIMEOUT,
1298 &gs.t_startup_timeout);
1299
1300 for (i = optind; i < argc; i++) {
1301 dmn = XCALLOC(MTYPE_WATCHFRR_DAEMON, sizeof(*dmn));
1302
1303 dmn->name = dmn->restart.name = argv[i];
1304 dmn->state = DAEMON_INIT;
1305 gs.numdaemons++;
1306 gs.numdown++;
1307 dmn->fd = -1;
1308 thread_add_timer_msec(master, wakeup_init, dmn, 0,
1309 &dmn->t_wakeup);
1310 dmn->restart.interval = gs.min_restart_interval;
1311 *add = dmn;
1312 add = &dmn->next;
1313
1314 if (!strcmp(dmn->name, special))
1315 gs.special = dmn;
1316 }
1317
1318 if (!gs.daemons) {
1319 fprintf(stderr,
1320 "Must specify one or more daemons to monitor.\n\n");
1321 frr_help_exit(1);
1322 }
1323 if (!watch_only && !gs.special) {
1324 fprintf(stderr, "\"%s\" daemon must be in daemon lists\n\n",
1325 special);
1326 frr_help_exit(1);
1327 }
1328
1329 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1330 snprintf(p, alldaemons + sizeof(alldaemons) - p, "%s%s",
1331 (p == alldaemons) ? "" : " ", dmn->name);
1332 p += strlen(p);
1333 }
1334 zlog_notice("%s %s watching [%s]%s", progname, FRR_VERSION, alldaemons,
1335 watch_only ? ", monitor mode" : "");
1336 }
1337
1338 struct zebra_privs_t watchfrr_privs = {
1339 #ifdef VTY_GROUP
1340 .vty_group = VTY_GROUP,
1341 #endif
1342 };
1343
1344 static struct frr_signal_t watchfrr_signals[] = {
1345 {
1346 .signal = SIGINT,
1347 .handler = sigint,
1348 },
1349 {
1350 .signal = SIGTERM,
1351 .handler = sigint,
1352 },
1353 {
1354 .signal = SIGCHLD,
1355 .handler = sigchild,
1356 },
1357 };
1358
1359 FRR_DAEMON_INFO(watchfrr, WATCHFRR,
1360 .flags = FRR_NO_PRIVSEP | FRR_NO_TCPVTY | FRR_LIMITED_CLI
1361 | FRR_NO_CFG_PID_DRY | FRR_NO_ZCLIENT
1362 | FRR_DETACH_LATER,
1363
1364 .printhelp = printhelp,
1365 .copyright = "Copyright 2004 Andrew J. Schorr",
1366
1367 .signals = watchfrr_signals,
1368 .n_signals = array_size(watchfrr_signals),
1369
1370 .privs = &watchfrr_privs,
1371 );
1372
1373 #define DEPRECATED_OPTIONS "aAezR:"
1374
1375 int main(int argc, char **argv)
1376 {
1377 int opt;
1378 const char *blankstr = NULL;
1379 const char *netns = NULL;
1380 bool netns_en = false;
1381
1382 frr_preinit(&watchfrr_di, argc, argv);
1383 progname = watchfrr_di.progname;
1384
1385 frr_opt_add("b:di:k:l:N:p:r:S:s:t:T:" DEPRECATED_OPTIONS, longopts, "");
1386
1387 gs.restart.name = "all";
1388 while ((opt = frr_getopt(argc, argv, NULL)) != EOF) {
1389 if (opt && opt < 128 && strchr(DEPRECATED_OPTIONS, opt)) {
1390 fprintf(stderr,
1391 "The -%c option no longer exists.\n"
1392 "Please refer to the watchfrr(8) man page.\n",
1393 opt);
1394 exit(1);
1395 }
1396
1397 switch (opt) {
1398 case 0:
1399 break;
1400 case 'b':
1401 blankstr = optarg;
1402 break;
1403 case OPTION_DRY:
1404 watch_only = true;
1405 break;
1406 case 'k':
1407 if (!valid_command(optarg)) {
1408 fprintf(stderr,
1409 "Invalid kill command, must contain '%%s': %s\n",
1410 optarg);
1411 frr_help_exit(1);
1412 }
1413 gs.stop_command = optarg;
1414 break;
1415 case 'l': {
1416 char garbage[3];
1417 if ((sscanf(optarg, "%d%1s", &gs.loglevel, garbage)
1418 != 1)
1419 || (gs.loglevel < LOG_EMERG)) {
1420 fprintf(stderr,
1421 "Invalid loglevel argument: %s\n",
1422 optarg);
1423 frr_help_exit(1);
1424 }
1425 } break;
1426 case OPTION_MINRESTART: {
1427 char garbage[3];
1428 if ((sscanf(optarg, "%ld%1s", &gs.min_restart_interval,
1429 garbage)
1430 != 1)
1431 || (gs.min_restart_interval < 0)) {
1432 fprintf(stderr,
1433 "Invalid min_restart_interval argument: %s\n",
1434 optarg);
1435 frr_help_exit(1);
1436 }
1437 } break;
1438 case OPTION_MAXRESTART: {
1439 char garbage[3];
1440 if ((sscanf(optarg, "%ld%1s", &gs.max_restart_interval,
1441 garbage)
1442 != 1)
1443 || (gs.max_restart_interval < 0)) {
1444 fprintf(stderr,
1445 "Invalid max_restart_interval argument: %s\n",
1446 optarg);
1447 frr_help_exit(1);
1448 }
1449 } break;
1450 case OPTION_MAXOPERATIONAL: {
1451 char garbage[3];
1452
1453 if ((sscanf(optarg, "%ld%1s", &gs.operational_timeout,
1454 garbage) != 1) ||
1455 (gs.operational_timeout < 0)) {
1456 fprintf(stderr,
1457 "Invalid Operational_timeout argument: %s\n",
1458 optarg);
1459 frr_help_exit(1);
1460 }
1461 } break;
1462 case OPTION_NETNS:
1463 netns_en = true;
1464 if (optarg && strchr(optarg, '/')) {
1465 fprintf(stderr,
1466 "invalid network namespace name \"%s\" (may not contain slashes)\n",
1467 optarg);
1468 frr_help_exit(1);
1469 }
1470 netns = optarg;
1471 break;
1472 case 'i': {
1473 char garbage[3];
1474 int period;
1475 if ((sscanf(optarg, "%d%1s", &period, garbage) != 1)
1476 || (gs.period < 1)) {
1477 fprintf(stderr,
1478 "Invalid interval argument: %s\n",
1479 optarg);
1480 frr_help_exit(1);
1481 }
1482 gs.period = 1000 * period;
1483 } break;
1484 case 'p':
1485 watchfrr_di.pid_file = optarg;
1486 break;
1487 case 'r':
1488 if (!valid_command(optarg)) {
1489 fprintf(stderr,
1490 "Invalid restart command, must contain '%%s': %s\n",
1491 optarg);
1492 frr_help_exit(1);
1493 }
1494 gs.restart_command = optarg;
1495 break;
1496 case 's':
1497 if (!valid_command(optarg)) {
1498 fprintf(stderr,
1499 "Invalid start command, must contain '%%s': %s\n",
1500 optarg);
1501 frr_help_exit(1);
1502 }
1503 gs.start_command = optarg;
1504 break;
1505 case 'S':
1506 gs.vtydir = optarg;
1507 break;
1508 case 't': {
1509 char garbage[3];
1510 if ((sscanf(optarg, "%ld%1s", &gs.timeout, garbage)
1511 != 1)
1512 || (gs.timeout < 1)) {
1513 fprintf(stderr,
1514 "Invalid timeout argument: %s\n",
1515 optarg);
1516 frr_help_exit(1);
1517 }
1518 } break;
1519 case 'T': {
1520 char garbage[3];
1521 if ((sscanf(optarg, "%ld%1s", &gs.restart_timeout,
1522 garbage)
1523 != 1)
1524 || (gs.restart_timeout < 1)) {
1525 fprintf(stderr,
1526 "Invalid restart timeout argument: %s\n",
1527 optarg);
1528 frr_help_exit(1);
1529 }
1530 } break;
1531 default:
1532 fputs("Invalid option.\n", stderr);
1533 frr_help_exit(1);
1534 }
1535 }
1536
1537 if (watch_only
1538 && (gs.start_command || gs.stop_command || gs.restart_command)) {
1539 fputs("Options -r/-s/-k are not used when --dry is active.\n",
1540 stderr);
1541 }
1542 if (!watch_only
1543 && (!gs.restart_command || !gs.start_command || !gs.stop_command)) {
1544 fprintf(stderr,
1545 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1546 frr_help_exit(1);
1547 }
1548
1549 if (blankstr) {
1550 if (gs.restart_command)
1551 gs.restart_command =
1552 translate_blanks(gs.restart_command, blankstr);
1553 if (gs.start_command)
1554 gs.start_command =
1555 translate_blanks(gs.start_command, blankstr);
1556 if (gs.stop_command)
1557 gs.stop_command =
1558 translate_blanks(gs.stop_command, blankstr);
1559 }
1560
1561 gs.restart.interval = gs.min_restart_interval;
1562
1563 /* env variable for the processes that we start */
1564 if (watchfrr_di.pathspace)
1565 setenv("FRR_PATHSPACE", watchfrr_di.pathspace, 1);
1566 else
1567 unsetenv("FRR_PATHSPACE");
1568
1569 /*
1570 * when watchfrr_di.pathspace is read, if it is not specified
1571 * pathspace is NULL as expected
1572 */
1573 pathspace = watchfrr_di.pathspace;
1574
1575 if (netns_en && !netns)
1576 netns = watchfrr_di.pathspace;
1577
1578 if (netns_en && netns && netns[0])
1579 netns_setup(netns);
1580
1581 master = frr_init();
1582 watchfrr_error_init();
1583 watchfrr_init(argc, argv);
1584 cmd_init_config_callbacks(watchfrr_start_config, watchfrr_end_config);
1585 watchfrr_vty_init();
1586
1587 frr_config_fork();
1588
1589 if (watchfrr_di.daemon_mode)
1590 zlog_syslog_set_prio_min(MIN(gs.loglevel, LOG_DEBUG));
1591 else
1592 zlog_aux_init(NULL, MIN(gs.loglevel, LOG_DEBUG));
1593
1594 frr_run(master);
1595
1596 systemd_send_stopping();
1597 /* Not reached. */
1598 return 0;
1599 }