]> git.proxmox.com Git - mirror_frr.git/blob - watchfrr/watchfrr.c
Merge pull request #10978 from anlancs/bgpd-cleanup-6
[mirror_frr.git] / watchfrr / watchfrr.c
1 /*
2 * Monitor status of frr daemons and restart if necessary.
3 *
4 * Copyright (C) 2004 Andrew J. Schorr
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include <zebra.h>
22 #include <thread.h>
23 #include <log.h>
24 #include <network.h>
25 #include <sigevent.h>
26 #include <lib/version.h>
27 #include "command.h"
28 #include "libfrr.h"
29 #include "lib_errors.h"
30 #include "zlog_targets.h"
31 #include "network.h"
32 #include "printfrr.h"
33
34 #include <getopt.h>
35 #include <sys/un.h>
36 #include <sys/wait.h>
37 #include <memory.h>
38 #include <systemd.h>
39
40 #include "watchfrr.h"
41 #include "watchfrr_errors.h"
42
/* Fallback MIN() for builds where no earlier header has defined it. */
#ifndef MIN
#define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
#endif

/* Macros to help randomize timers. */
/*
 * JITTER(X): frr_weak_random() reduced modulo X+1, shifted down by X/2.
 * Assuming frr_weak_random() is non-negative (TODO confirm), the result lies
 * in [-(X)/2, X - (X)/2].  Note that X is evaluated twice.
 */
#define JITTER(X) ((frr_weak_random() % ((X)+1))-((X)/2))
/* FUZZY(X): X plus a jitter computed from X/20. */
#define FUZZY(X) ((X)+JITTER((X)/20))

/*
 * Built-in defaults; each one is reported by printhelp() as the default of
 * the corresponding command-line option (time values are in seconds).
 */
#define DEFAULT_PERIOD 5
#define DEFAULT_TIMEOUT 90
#define DEFAULT_RESTART_TIMEOUT 20
#define DEFAULT_LOGLEVEL LOG_INFO
#define DEFAULT_MIN_RESTART 60
#define DEFAULT_MAX_RESTART 600
#define DEFAULT_OPERATIONAL_TIMEOUT 60

/*
 * Default shell command templates; the single "%s" is replaced with the
 * daemon name by run_job().
 */
#define DEFAULT_RESTART_CMD WATCHFRR_SH_PATH " restart %s"
#define DEFAULT_START_CMD WATCHFRR_SH_PATH " start %s"
#define DEFAULT_STOP_CMD WATCHFRR_SH_PATH " stop %s"

/* Token sent with the vty "echo" command and expected back in the reply. */
#define PING_TOKEN "PING"
64
/* Memory group and memory type declared for watchfrr's daemon entries. */
DEFINE_MGROUP(WATCHFRR, "watchfrr");
DEFINE_MTYPE_STATIC(WATCHFRR, WATCHFRR_DAEMON, "watchfrr daemon entry");

/* Needs to be global, referenced somewhere inside libfrr. */
struct thread_master *master;

/* When true, try_restart() returns immediately without running any job. */
static bool watch_only = false;
/* NOTE(review): not used in the visible part of this file; see -N/--pathspace. */
const char *pathspace;
73
/*
 * Phases of the watchfrr state machine driven by phase_check().
 * PHASE_INIT is the initial value of gs.phase; the remaining non-NONE
 * phases are the successive steps of a phased (global) restart.
 */
enum restart_phase {
	PHASE_NONE = 0,
	PHASE_INIT,
	PHASE_STOPS_PENDING,
	PHASE_WAITING_DOWN,
	PHASE_ZEBRA_RESTART_PENDING,
	PHASE_WAITING_ZEBRA_UP
};

/*
 * Human-readable phase names, indexed by enum restart_phase.
 * Designated initializers keep every string tied to its enum value, so the
 * table cannot silently drift out of step with the enum.
 */
static const char *const phase_str[] = {
	[PHASE_NONE] = "Idle",
	[PHASE_INIT] = "Startup",
	[PHASE_STOPS_PENDING] = "Stop jobs running",
	[PHASE_WAITING_DOWN] = "Waiting for other daemons to come down",
	[PHASE_ZEBRA_RESTART_PENDING] = "Zebra restart job running",
	[PHASE_WAITING_ZEBRA_UP] = "Waiting for zebra to come up",
};
92
/* Time allowed for a single restart phase before phase_hanging() aborts it. */
#define PHASE_TIMEOUT (3*gs.restart_timeout)
/*
 * Startup timeout.  Parenthesized so the macro expands to a single operand
 * and cannot be re-associated by surrounding operators (e.g. x / STARTUP_TIMEOUT).
 */
#define STARTUP_TIMEOUT (55 * 1000)
95
/*
 * Bookkeeping for one background start/stop/restart job.  One instance is
 * embedded in every struct daemon and one in the global state (gs.restart).
 */
struct restart_info {
	const char *name; /* name substituted into the command and log messages */
	const char *what; /* job type string (cmdtype) recorded by run_job() */
	pid_t pid; /* pid of the running job, 0 when no job is outstanding */
	struct timeval time; /* time the job was started, then time it was reaped */
	long interval; /* minimum seconds between jobs; adjusted by run_job() */
	struct thread *t_kill; /* timer that fires restart_kill() */
	int kills; /* signals sent so far: first SIGTERM, afterwards SIGKILL */
};
105
/*
 * Process-wide watchfrr state: current phase, timers, tunables, the list of
 * monitored daemons and counters derived from it.
 */
static struct global_state {
	enum restart_phase phase; /* current phase, see phase_check() */
	struct thread *t_phase_hanging; /* timer armed by set_phase() */
	struct thread *t_startup_timeout;
	struct thread *t_operational; /* timer armed by daemon_up() */
	const char *vtydir; /* directory holding the "<daemon>.vty" sockets */
	long period; /* wakeup period, passed to msec timers via FUZZY() */
	long timeout; /* echo/connect timeout, passed to second timers */
	long restart_timeout; /* delay before restart_kill() signals a job */
	long min_restart_interval;
	long max_restart_interval;
	long operational_timeout;
	struct daemon *daemons; /* head of the singly linked daemon list */
	const char *restart_command;
	const char *start_command;
	const char *stop_command;
	struct restart_info restart; /* the global (phased) restart job */
	int loglevel;
	struct daemon *special; /* points to zebra when doing phased restart */
	int numdaemons;
	int numpids; /* background jobs forked by run_job() and not yet reaped */
	int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
} gs = {
	.phase = PHASE_INIT,
	.vtydir = frr_vtydir,
	.period = 1000 * DEFAULT_PERIOD,
	.timeout = DEFAULT_TIMEOUT,
	.restart_timeout = DEFAULT_RESTART_TIMEOUT,
	.loglevel = DEFAULT_LOGLEVEL,
	.min_restart_interval = DEFAULT_MIN_RESTART,
	.max_restart_interval = DEFAULT_MAX_RESTART,
	.operational_timeout = DEFAULT_OPERATIONAL_TIMEOUT,
	.restart_command = DEFAULT_RESTART_CMD,
	.start_command = DEFAULT_START_CMD,
	.stop_command = DEFAULT_STOP_CMD,
};
142
/* Connection state of a monitored daemon as tracked by watchfrr. */
enum daemon_state {
	DAEMON_INIT = 0,
	DAEMON_DOWN,
	DAEMON_CONNECTING,
	DAEMON_UP,
	DAEMON_UNRESPONSIVE
};

/* True (1) when the daemon is connected, whether or not it answers pings. */
#define IS_UP(DMN)                                                             \
	((DMN)->state == DAEMON_UP || (DMN)->state == DAEMON_UNRESPONSIVE)

/* Printable state names, indexed by enum daemon_state. */
static const char *const state_str[] = {
	[DAEMON_INIT] = "Init",
	[DAEMON_DOWN] = "Down",
	[DAEMON_CONNECTING] = "Connecting",
	[DAEMON_UP] = "Up",
	[DAEMON_UNRESPONSIVE] = "Unresponsive",
};
157
/* One monitored daemon; entries are chained through ->next from gs.daemons. */
struct daemon {
	const char *name; /* daemon name; also used to build "<name>.vty" */
	enum daemon_state state;
	int fd; /* socket connected to the daemon's vty, -1 after daemon_down() */
	struct timeval echo_sent; /* time of the outstanding ping; tv_sec 0 if none */
	unsigned int connect_tries; /* try_connect() calls since last daemon_up() */
	struct thread *t_wakeup; /* the single pending timer for this daemon */
	struct thread *t_read; /* read event running handle_read() */
	struct thread *t_write; /* write event running check_connect() */
	struct daemon *next;
	struct restart_info restart; /* per-daemon start/restart job */

	/*
	 * For a given daemon, if we've turned on ignore timeouts
	 * ignore the timeout value and assume everything is ok
	 * This is for daemon debugging w/ gdb after we have started
	 * FRR and realize we have something that needs to be looked
	 * at
	 */
	bool ignore_timeout;
};
179
/*
 * getopt_long() return codes for options that have no single-letter form.
 * NOTE(review): presumably chosen above the range of any option character.
 */
#define OPTION_MINRESTART 2000
#define OPTION_MAXRESTART 2001
#define OPTION_DRY 2002
#define OPTION_NETNS 2003
#define OPTION_MAXOPERATIONAL 2004

/* Long command-line options; the fourth field is the code handed back. */
static const struct option longopts[] = {
	{"daemon", no_argument, NULL, 'd'},
	{"statedir", required_argument, NULL, 'S'},
	{"loglevel", required_argument, NULL, 'l'},
	{"interval", required_argument, NULL, 'i'},
	{"timeout", required_argument, NULL, 't'},
	{"restart-timeout", required_argument, NULL, 'T'},
	{"restart", required_argument, NULL, 'r'},
	{"start-command", required_argument, NULL, 's'},
	{"kill-command", required_argument, NULL, 'k'},
	{"dry", no_argument, NULL, OPTION_DRY},
	{"min-restart-interval", required_argument, NULL, OPTION_MINRESTART},
	{"max-restart-interval", required_argument, NULL, OPTION_MAXRESTART},
	{"operational-timeout", required_argument, NULL, OPTION_MAXOPERATIONAL},
	{"pid-file", required_argument, NULL, 'p'},
	{"blank-string", required_argument, NULL, 'b'},
#ifdef GNU_LINUX
	/* Only offered on GNU/Linux builds; the namespace name is optional. */
	{"netns", optional_argument, NULL, OPTION_NETNS},
#endif
	{"help", no_argument, NULL, 'h'},
	{"version", no_argument, NULL, 'v'},
	{NULL, 0, NULL, 0}};
208
/* Forward declarations for functions used before their definitions. */
static int try_connect(struct daemon *dmn);
static void wakeup_send_echo(struct thread *t_wakeup);
static void try_restart(struct daemon *dmn);
static void phase_check(void);
static void restart_done(struct daemon *dmn);

/* Program name printed in the usage text by printhelp(). */
static const char *progname;
216
217 void watchfrr_set_ignore_daemon(struct vty *vty, const char *dname, bool ignore)
218 {
219 struct daemon *dmn;
220
221 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
222 if (strncmp(dmn->name, dname, strlen(dmn->name)) == 0)
223 break;
224 }
225
226 if (dmn) {
227 dmn->ignore_timeout = ignore;
228 vty_out(vty, "%s switching to %s\n", dmn->name,
229 ignore ? "ignore" : "watch");
230 } else
231 vty_out(vty, "%s is not configured for running at the moment",
232 dname);
233 }
234
235 static void printhelp(FILE *target)
236 {
237 fprintf(target,
238 "Usage : %s [OPTION...] <daemon name> ...\n\n\
239 Watchdog program to monitor status of frr daemons and try to restart\n\
240 them if they are down or unresponsive. It determines whether a daemon is\n\
241 up based on whether it can connect to the daemon's vty unix stream socket.\n\
242 It then repeatedly sends echo commands over that socket to determine whether\n\
243 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
244 on the socket connection and know immediately that the daemon is down.\n\n\
245 The daemons to be monitored should be listed on the command line.\n\n\
246 In order to avoid attempting to restart the daemons in a fast loop,\n\
247 the -m and -M options allow you to control the minimum delay between\n\
248 restart commands. The minimum restart delay is recalculated each time\n\
249 a restart is attempted: if the time since the last restart attempt exceeds\n\
250 twice the -M value, then the restart delay is set to the -m value.\n\
251 Otherwise, the interval is doubled (but capped at the -M value).\n\n",
252 progname);
253
254 fprintf(target,
255 "Options:\n\
256 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
257 to syslog instead of stdout.\n\
258 -S, --statedir Set the vty socket directory (default is %s)\n\
259 -N, --pathspace Insert prefix into config & socket paths\n"
260 #ifdef GNU_LINUX
261 " --netns Create and/or use Linux network namespace. If no name is\n"
262 " given, uses the value from `-N`.\n"
263 #endif
264 "-l, --loglevel Set the logging level (default is %d).\n\
265 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
266 but it can be set higher than %d if extra-verbose debugging\n\
267 messages are desired.\n\
268 --min-restart-interval\n\
269 Set the minimum seconds to wait between invocations of daemon\n\
270 restart commands (default is %d).\n\
271 --max-restart-interval\n\
272 Set the maximum seconds to wait between invocations of daemon\n\
273 restart commands (default is %d).\n\
274 --operational-timeout\n\
275 Set the time before systemd is notified that we are considered\n\
276 operational again after a daemon restart (default is %d).\n\
277 -i, --interval Set the status polling interval in seconds (default is %d)\n\
278 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
279 -T, --restart-timeout\n\
280 Set the restart (kill) timeout in seconds (default is %d).\n\
281 If any background jobs are still running after this much\n\
282 time has elapsed, they will be killed.\n\
283 -r, --restart Supply a Bourne shell command to use to restart a single\n\
284 daemon. The command string should include '%%s' where the\n\
285 name of the daemon should be substituted.\n\
286 (default: '%s')\n\
287 -s, --start-command\n\
288 Supply a Bourne shell to command to use to start a single\n\
289 daemon. The command string should include '%%s' where the\n\
290 name of the daemon should be substituted.\n\
291 (default: '%s')\n\
292 -k, --kill-command\n\
293 Supply a Bourne shell to command to use to stop a single\n\
294 daemon. The command string should include '%%s' where the\n\
295 name of the daemon should be substituted.\n\
296 (default: '%s')\n\
297 --dry Do not start or restart anything, just log.\n\
298 -p, --pid-file Set process identifier file name\n\
299 (default is %s/watchfrr.pid).\n\
300 -b, --blank-string\n\
301 When the supplied argument string is found in any of the\n\
302 various shell command arguments (-r, -s, or -k), replace\n\
303 it with a space. This is an ugly hack to circumvent problems\n\
304 passing command-line arguments with embedded spaces.\n\
305 -v, --version Print program version\n\
306 -h, --help Display this help and exit\n",
307 frr_vtydir, DEFAULT_LOGLEVEL, LOG_EMERG, LOG_DEBUG, LOG_DEBUG,
308 DEFAULT_MIN_RESTART, DEFAULT_MAX_RESTART,
309 DEFAULT_OPERATIONAL_TIMEOUT, DEFAULT_PERIOD, DEFAULT_TIMEOUT,
310 DEFAULT_RESTART_TIMEOUT, DEFAULT_RESTART_CMD, DEFAULT_START_CMD,
311 DEFAULT_STOP_CMD, frr_vtydir);
312 }
313
314 static pid_t run_background(char *shell_cmd)
315 {
316 pid_t child;
317
318 switch (child = fork()) {
319 case -1:
320 flog_err_sys(EC_LIB_SYSTEM_CALL,
321 "fork failed, cannot run command [%s]: %s",
322 shell_cmd, safe_strerror(errno));
323 return -1;
324 case 0:
325 /* Child process. */
326 /* Use separate process group so child processes can be killed
327 * easily. */
328 if (setpgid(0, 0) < 0)
329 zlog_warn("setpgid(0,0) failed: %s",
330 safe_strerror(errno));
331 {
332 char shell[] = "sh";
333 char dashc[] = "-c";
334 char *const argv[4] = {shell, dashc, shell_cmd, NULL};
335 execv("/bin/sh", argv);
336 flog_err_sys(EC_LIB_SYSTEM_CALL,
337 "execv(/bin/sh -c '%s') failed: %s",
338 shell_cmd, safe_strerror(errno));
339 _exit(127);
340 }
341 default:
342 /* Parent process: we will reap the child later. */
343 zlog_info("Forked background command [pid %d]: %s", (int)child,
344 shell_cmd);
345 return child;
346 }
347 }
348
/*
 * Store the wall-clock time elapsed since *start_time into *result.
 *
 * Returns result, so the call can be used directly inside an expression.
 */
static struct timeval *time_elapsed(struct timeval *result,
				    const struct timeval *start_time)
{
	gettimeofday(result, NULL);

	result->tv_sec = result->tv_sec - start_time->tv_sec;
	result->tv_usec = result->tv_usec - start_time->tv_usec;

	/* Borrow whole seconds until the microsecond part is non-negative. */
	for (; result->tv_usec < 0; result->tv_sec--)
		result->tv_usec += 1000000L;

	return result;
}
361
362 static void restart_kill(struct thread *t_kill)
363 {
364 struct restart_info *restart = THREAD_ARG(t_kill);
365 struct timeval delay;
366
367 time_elapsed(&delay, &restart->time);
368 zlog_warn(
369 "%s %s child process %d still running after %ld seconds, sending signal %d",
370 restart->what, restart->name, (int)restart->pid,
371 (long)delay.tv_sec, (restart->kills ? SIGKILL : SIGTERM));
372 kill(-restart->pid, (restart->kills ? SIGKILL : SIGTERM));
373 restart->kills++;
374 thread_add_timer(master, restart_kill, restart, gs.restart_timeout,
375 &restart->t_kill);
376 }
377
378 static struct restart_info *find_child(pid_t child)
379 {
380 struct daemon *dmn;
381 if (gs.restart.pid == child)
382 return &gs.restart;
383
384 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
385 if (dmn->restart.pid == child)
386 return &dmn->restart;
387 }
388 return NULL;
389 }
390
391 static void sigchild(void)
392 {
393 pid_t child;
394 int status;
395 const char *name;
396 const char *what;
397 struct restart_info *restart;
398 struct daemon *dmn;
399
400 switch (child = waitpid(-1, &status, WNOHANG)) {
401 case -1:
402 flog_err_sys(EC_LIB_SYSTEM_CALL, "waitpid failed: %s",
403 safe_strerror(errno));
404 return;
405 case 0:
406 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
407 return;
408 }
409
410 if (child == integrated_write_pid) {
411 integrated_write_sigchld(status);
412 return;
413 }
414
415 if ((restart = find_child(child)) != NULL) {
416 name = restart->name;
417 what = restart->what;
418 restart->pid = 0;
419 gs.numpids--;
420 thread_cancel(&restart->t_kill);
421
422 /* Update restart time to reflect the time the command
423 * completed. */
424 gettimeofday(&restart->time, NULL);
425 } else {
426 flog_err_sys(
427 EC_LIB_SYSTEM_CALL,
428 "waitpid returned status for an unknown child process %d",
429 (int)child);
430 name = "(unknown)";
431 what = "background";
432 }
433 if (WIFSTOPPED(status))
434 zlog_warn("%s %s process %d is stopped", what, name,
435 (int)child);
436 else if (WIFSIGNALED(status))
437 zlog_warn("%s %s process %d terminated due to signal %d", what,
438 name, (int)child, WTERMSIG(status));
439 else if (WIFEXITED(status)) {
440 if (WEXITSTATUS(status) != 0)
441 zlog_warn(
442 "%s %s process %d exited with non-zero status %d",
443 what, name, (int)child, WEXITSTATUS(status));
444 else {
445 zlog_debug("%s %s process %d exited normally", what,
446 name, (int)child);
447
448 if (restart && restart != &gs.restart) {
449 dmn = container_of(restart, struct daemon,
450 restart);
451 restart_done(dmn);
452 } else if (restart)
453 for (dmn = gs.daemons; dmn; dmn = dmn->next)
454 restart_done(dmn);
455 }
456 } else
457 flog_err_sys(
458 EC_LIB_SYSTEM_CALL,
459 "cannot interpret %s %s process %d wait status 0x%x",
460 what, name, (int)child, status);
461 phase_check();
462 }
463
464 static int run_job(struct restart_info *restart, const char *cmdtype,
465 const char *command, int force, int update_interval)
466 {
467 struct timeval delay;
468
469 if (gs.loglevel > LOG_DEBUG + 1)
470 zlog_debug("attempting to %s %s", cmdtype, restart->name);
471
472 if (restart->pid) {
473 if (gs.loglevel > LOG_DEBUG + 1)
474 zlog_debug(
475 "cannot %s %s, previous pid %d still running",
476 cmdtype, restart->name, (int)restart->pid);
477 return -1;
478 }
479
480 char buffer[512];
481
482 snprintf(buffer, sizeof(buffer), "restarting %s", restart->name);
483 systemd_send_status(buffer);
484
485 /* Note: time_elapsed test must come before the force test, since we
486 need
487 to make sure that delay is initialized for use below in updating the
488 restart interval. */
489 if ((time_elapsed(&delay, &restart->time)->tv_sec < restart->interval)
490 && !force) {
491
492 if (gs.loglevel > LOG_DEBUG + 1)
493 zlog_debug(
494 "postponing %s %s: elapsed time %ld < retry interval %ld",
495 cmdtype, restart->name, (long)delay.tv_sec,
496 restart->interval);
497 return -1;
498 }
499
500 gettimeofday(&restart->time, NULL);
501 restart->kills = 0;
502 {
503 char cmd[strlen(command) + strlen(restart->name) + 1];
504 snprintf(cmd, sizeof(cmd), command, restart->name);
505 if ((restart->pid = run_background(cmd)) > 0) {
506 thread_add_timer(master, restart_kill, restart,
507 gs.restart_timeout, &restart->t_kill);
508 restart->what = cmdtype;
509 gs.numpids++;
510 } else
511 restart->pid = 0;
512 }
513
514 /* Calculate the new restart interval. */
515 if (update_interval) {
516 if (delay.tv_sec > 2 * gs.max_restart_interval)
517 restart->interval = gs.min_restart_interval;
518 else if ((restart->interval *= 2) > gs.max_restart_interval)
519 restart->interval = gs.max_restart_interval;
520 if (gs.loglevel > LOG_DEBUG + 1)
521 zlog_debug("restart %s interval is now %ld",
522 restart->name, restart->interval);
523 }
524 return restart->pid;
525 }
526
/*
 * Helpers that (re)arm a daemon's read event or wakeup timer.  Each one
 * first clears the stored thread pointer and then registers the event.
 *
 * The bodies end in "while (0)" WITHOUT a trailing semicolon: the caller
 * supplies it, which lets the macros be used as a single statement inside
 * an unbraced if/else.
 */

/* Arm the read event that runs handle_read() on the daemon's socket. */
#define SET_READ_HANDLER(DMN)                                                  \
	do {                                                                   \
		(DMN)->t_read = NULL;                                          \
		thread_add_read(master, handle_read, (DMN), (DMN)->fd,         \
				&(DMN)->t_read);                               \
	} while (0)

/* Arm the timer that runs wakeup_down() after a fuzzed gs.period. */
#define SET_WAKEUP_DOWN(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_down, (DMN),              \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Arm the timer that runs wakeup_unresponsive() after a fuzzed gs.period. */
#define SET_WAKEUP_UNRESPONSIVE(DMN)                                           \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_unresponsive, (DMN),      \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Arm the timer that runs wakeup_send_echo() after a fuzzed gs.period. */
#define SET_WAKEUP_ECHO(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_send_echo, (DMN),         \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)
554
555 static void wakeup_down(struct thread *t_wakeup)
556 {
557 struct daemon *dmn = THREAD_ARG(t_wakeup);
558
559 dmn->t_wakeup = NULL;
560 if (try_connect(dmn) < 0)
561 SET_WAKEUP_DOWN(dmn);
562 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
563 try_restart(dmn);
564 }
565
566 static void wakeup_init(struct thread *t_wakeup)
567 {
568 struct daemon *dmn = THREAD_ARG(t_wakeup);
569
570 dmn->t_wakeup = NULL;
571 if (try_connect(dmn) < 0) {
572 zlog_info(
573 "%s state -> down : initial connection attempt failed",
574 dmn->name);
575 dmn->state = DAEMON_DOWN;
576 }
577 phase_check();
578 }
579
580 static void restart_done(struct daemon *dmn)
581 {
582 if (dmn->state != DAEMON_DOWN) {
583 zlog_warn(
584 "Daemon: %s: is in %s state but expected it to be in DAEMON_DOWN state",
585 dmn->name, state_str[dmn->state]);
586 return;
587 }
588 THREAD_OFF(dmn->t_wakeup);
589
590 if (try_connect(dmn) < 0)
591 SET_WAKEUP_DOWN(dmn);
592 }
593
/*
 * Timer callback armed by daemon_up(): report to systemd that FRR is
 * operational.  The thread argument is not used.
 */
static void daemon_restarting_operational(struct thread *thread)
{
	static const char status[] = "FRR Operational";

	systemd_send_status(status);
}
598
599 static void daemon_down(struct daemon *dmn, const char *why)
600 {
601 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
602 flog_err(EC_WATCHFRR_CONNECTION, "%s state -> down : %s",
603 dmn->name, why);
604 else if (gs.loglevel > LOG_DEBUG)
605 zlog_debug("%s still down : %s", dmn->name, why);
606 if (IS_UP(dmn))
607 gs.numdown++;
608 dmn->state = DAEMON_DOWN;
609 if (dmn->fd >= 0) {
610 close(dmn->fd);
611 dmn->fd = -1;
612 }
613 THREAD_OFF(dmn->t_read);
614 THREAD_OFF(dmn->t_write);
615 THREAD_OFF(dmn->t_wakeup);
616 if (try_connect(dmn) < 0)
617 SET_WAKEUP_DOWN(dmn);
618
619 systemd_send_status("FRR partially operational");
620 phase_check();
621 }
622
623 static void handle_read(struct thread *t_read)
624 {
625 struct daemon *dmn = THREAD_ARG(t_read);
626 static const char resp[sizeof(PING_TOKEN) + 4] = PING_TOKEN "\n";
627 char buf[sizeof(resp) + 100];
628 ssize_t rc;
629 struct timeval delay;
630
631 dmn->t_read = NULL;
632 if ((rc = read(dmn->fd, buf, sizeof(buf))) < 0) {
633 char why[100];
634
635 if (ERRNO_IO_RETRY(errno)) {
636 /* Pretend it never happened. */
637 SET_READ_HANDLER(dmn);
638 return;
639 }
640 snprintf(why, sizeof(why), "unexpected read error: %s",
641 safe_strerror(errno));
642 daemon_down(dmn, why);
643 return;
644 }
645 if (rc == 0) {
646 daemon_down(dmn, "read returned EOF");
647 return;
648 }
649 if (!dmn->echo_sent.tv_sec) {
650 char why[sizeof(buf) + 100];
651 snprintf(why, sizeof(why),
652 "unexpected read returns %d bytes: %.*s", (int)rc,
653 (int)rc, buf);
654 daemon_down(dmn, why);
655 return;
656 }
657
658 /* We are expecting an echo response: is there any chance that the
659 response would not be returned entirely in the first read? That
660 seems inconceivable... */
661 if ((rc != sizeof(resp)) || memcmp(buf, resp, sizeof(resp))) {
662 char why[100 + sizeof(buf)];
663 snprintf(why, sizeof(why),
664 "read returned bad echo response of %d bytes (expecting %u): %.*s",
665 (int)rc, (unsigned int)sizeof(resp), (int)rc, buf);
666 daemon_down(dmn, why);
667 return;
668 }
669
670 time_elapsed(&delay, &dmn->echo_sent);
671 dmn->echo_sent.tv_sec = 0;
672 if (dmn->state == DAEMON_UNRESPONSIVE) {
673 if (delay.tv_sec < gs.timeout) {
674 dmn->state = DAEMON_UP;
675 zlog_warn(
676 "%s state -> up : echo response received after %ld.%06ld seconds",
677 dmn->name, (long)delay.tv_sec,
678 (long)delay.tv_usec);
679 } else
680 zlog_warn(
681 "%s: slow echo response finally received after %ld.%06ld seconds",
682 dmn->name, (long)delay.tv_sec,
683 (long)delay.tv_usec);
684 } else if (gs.loglevel > LOG_DEBUG + 1)
685 zlog_debug("%s: echo response received after %ld.%06ld seconds",
686 dmn->name, (long)delay.tv_sec, (long)delay.tv_usec);
687
688 SET_READ_HANDLER(dmn);
689 thread_cancel(&dmn->t_wakeup);
690 SET_WAKEUP_ECHO(dmn);
691 }
692
693 /*
694 * Wait till we notice that all daemons are ready before
695 * we send we are ready to systemd
696 */
697 static void daemon_send_ready(int exitcode)
698 {
699 FILE *fp;
700 static int sent = 0;
701 char started[1024];
702
703 if (sent)
704 return;
705
706 if (exitcode == 0)
707 zlog_notice("all daemons up, doing startup-complete notify");
708 else if (gs.numdown < gs.numdaemons)
709 flog_err(EC_WATCHFRR_CONNECTION,
710 "startup did not complete within timeout (%d/%d daemons running)",
711 gs.numdaemons - gs.numdown, gs.numdaemons);
712 else {
713 flog_err(EC_WATCHFRR_CONNECTION,
714 "all configured daemons failed to start -- exiting watchfrr");
715 exit(exitcode);
716
717 }
718
719 frr_detach();
720
721 snprintf(started, sizeof(started), "%s/%s", frr_vtydir,
722 "watchfrr.started");
723 fp = fopen(started, "w");
724 if (fp)
725 fclose(fp);
726
727 systemd_send_started(master);
728 systemd_send_status("FRR Operational");
729 sent = 1;
730 }
731
732 static void daemon_up(struct daemon *dmn, const char *why)
733 {
734 dmn->state = DAEMON_UP;
735 gs.numdown--;
736 dmn->connect_tries = 0;
737 zlog_notice("%s state -> up : %s", dmn->name, why);
738 if (gs.numdown == 0) {
739 daemon_send_ready(0);
740
741 THREAD_OFF(gs.t_operational);
742
743 thread_add_timer(master, daemon_restarting_operational, NULL,
744 gs.operational_timeout, &gs.t_operational);
745 }
746
747 SET_WAKEUP_ECHO(dmn);
748 phase_check();
749 }
750
751 static void check_connect(struct thread *t_write)
752 {
753 struct daemon *dmn = THREAD_ARG(t_write);
754 int sockerr;
755 socklen_t reslen = sizeof(sockerr);
756
757 dmn->t_write = NULL;
758 if (getsockopt(dmn->fd, SOL_SOCKET, SO_ERROR, (char *)&sockerr, &reslen)
759 < 0) {
760 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn->name,
761 safe_strerror(errno));
762 daemon_down(dmn,
763 "getsockopt failed checking connection success");
764 return;
765 }
766 if ((reslen == sizeof(sockerr)) && sockerr) {
767 char why[100];
768 snprintf(
769 why, sizeof(why),
770 "getsockopt reports that connection attempt failed: %s",
771 safe_strerror(sockerr));
772 daemon_down(dmn, why);
773 return;
774 }
775
776 daemon_up(dmn, "delayed connect succeeded");
777 }
778
779 static void wakeup_connect_hanging(struct thread *t_wakeup)
780 {
781 struct daemon *dmn = THREAD_ARG(t_wakeup);
782 char why[100];
783
784 dmn->t_wakeup = NULL;
785 snprintf(why, sizeof(why),
786 "connection attempt timed out after %ld seconds", gs.timeout);
787 daemon_down(dmn, why);
788 }
789
790 /* Making connection to protocol daemon. */
791 static int try_connect(struct daemon *dmn)
792 {
793 int sock;
794 struct sockaddr_un addr;
795 socklen_t len;
796
797 if (gs.loglevel > LOG_DEBUG + 1)
798 zlog_debug("%s: attempting to connect", dmn->name);
799 dmn->connect_tries++;
800
801 memset(&addr, 0, sizeof(struct sockaddr_un));
802 addr.sun_family = AF_UNIX;
803 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty", gs.vtydir,
804 dmn->name);
805 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
806 len = addr.sun_len = SUN_LEN(&addr);
807 #else
808 len = sizeof(addr.sun_family) + strlen(addr.sun_path);
809 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
810
811 /* Quick check to see if we might succeed before we go to the trouble
812 of creating a socket. */
813 if (access(addr.sun_path, W_OK) < 0) {
814 if (errno != ENOENT)
815 flog_err_sys(EC_LIB_SYSTEM_CALL,
816 "%s: access to socket %s denied: %s",
817 dmn->name, addr.sun_path,
818 safe_strerror(errno));
819 return -1;
820 }
821
822 if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
823 flog_err_sys(EC_LIB_SOCKET, "%s(%s): cannot make socket: %s",
824 __func__, addr.sun_path, safe_strerror(errno));
825 return -1;
826 }
827
828 if (set_nonblocking(sock) < 0 || set_cloexec(sock) < 0) {
829 flog_err_sys(EC_LIB_SYSTEM_CALL,
830 "%s(%s): set_nonblocking/cloexec(%d) failed",
831 __func__, addr.sun_path, sock);
832 close(sock);
833 return -1;
834 }
835
836 if (connect(sock, (struct sockaddr *)&addr, len) < 0) {
837 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK)) {
838 if (gs.loglevel > LOG_DEBUG)
839 zlog_debug("%s(%s): connect failed: %s",
840 __func__, addr.sun_path,
841 safe_strerror(errno));
842 close(sock);
843 return -1;
844 }
845 if (gs.loglevel > LOG_DEBUG)
846 zlog_debug("%s: connection in progress", dmn->name);
847 dmn->state = DAEMON_CONNECTING;
848 dmn->fd = sock;
849 thread_add_write(master, check_connect, dmn, dmn->fd,
850 &dmn->t_write);
851 thread_add_timer(master, wakeup_connect_hanging, dmn,
852 gs.timeout, &dmn->t_wakeup);
853 SET_READ_HANDLER(dmn);
854 return 0;
855 }
856
857 dmn->fd = sock;
858 SET_READ_HANDLER(dmn);
859 daemon_up(dmn, "connect succeeded");
860 return 1;
861 }
862
863 static void phase_hanging(struct thread *t_hanging)
864 {
865 gs.t_phase_hanging = NULL;
866 flog_err(EC_WATCHFRR_CONNECTION,
867 "Phase [%s] hanging for %ld seconds, aborting phased restart",
868 phase_str[gs.phase], PHASE_TIMEOUT);
869 gs.phase = PHASE_NONE;
870 }
871
872 static void set_phase(enum restart_phase new_phase)
873 {
874 gs.phase = new_phase;
875 thread_cancel(&gs.t_phase_hanging);
876
877 thread_add_timer(master, phase_hanging, NULL, PHASE_TIMEOUT,
878 &gs.t_phase_hanging);
879 }
880
881 static void phase_check(void)
882 {
883 struct daemon *dmn;
884
885 switch (gs.phase) {
886 case PHASE_NONE:
887 break;
888
889 case PHASE_INIT:
890 for (dmn = gs.daemons; dmn; dmn = dmn->next)
891 if (dmn->state == DAEMON_INIT)
892 return;
893
894 /* startup complete, everything out of INIT */
895 gs.phase = PHASE_NONE;
896 for (dmn = gs.daemons; dmn; dmn = dmn->next)
897 if (dmn->state == DAEMON_DOWN) {
898 SET_WAKEUP_DOWN(dmn);
899 try_restart(dmn);
900 }
901 break;
902 case PHASE_STOPS_PENDING:
903 if (gs.numpids)
904 break;
905 zlog_info(
906 "Phased restart: all routing daemon stop jobs have completed.");
907 set_phase(PHASE_WAITING_DOWN);
908
909 /*FALLTHRU*/
910 case PHASE_WAITING_DOWN:
911 if (gs.numdown + IS_UP(gs.special) < gs.numdaemons)
912 break;
913 systemd_send_status("Phased Restart");
914 zlog_info("Phased restart: all routing daemons now down.");
915 run_job(&gs.special->restart, "restart", gs.restart_command, 1,
916 1);
917 set_phase(PHASE_ZEBRA_RESTART_PENDING);
918
919 /*FALLTHRU*/
920 case PHASE_ZEBRA_RESTART_PENDING:
921 if (gs.special->restart.pid)
922 break;
923 systemd_send_status("Zebra Restarting");
924 zlog_info("Phased restart: %s restart job completed.",
925 gs.special->name);
926 set_phase(PHASE_WAITING_ZEBRA_UP);
927
928 /*FALLTHRU*/
929 case PHASE_WAITING_ZEBRA_UP:
930 if (!IS_UP(gs.special))
931 break;
932 zlog_info("Phased restart: %s is now up.", gs.special->name);
933 {
934 struct daemon *dmn;
935 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
936 if (dmn != gs.special)
937 run_job(&dmn->restart, "start",
938 gs.start_command, 1, 0);
939 }
940 }
941 gs.phase = PHASE_NONE;
942 THREAD_OFF(gs.t_phase_hanging);
943 zlog_notice("Phased global restart has completed.");
944 break;
945 }
946 }
947
948 static void try_restart(struct daemon *dmn)
949 {
950 if (watch_only)
951 return;
952
953 if (dmn != gs.special) {
954 if ((gs.special->state == DAEMON_UP)
955 && (gs.phase == PHASE_NONE))
956 run_job(&dmn->restart, "restart", gs.restart_command, 0,
957 1);
958 else
959 zlog_debug(
960 "%s: postponing restart attempt because master %s daemon not up [%s], or phased restart in progress",
961 dmn->name, gs.special->name,
962 state_str[gs.special->state]);
963 return;
964 }
965
966 if ((gs.phase != PHASE_NONE) || gs.numpids) {
967 if (gs.loglevel > LOG_DEBUG + 1)
968 zlog_debug(
969 "postponing phased global restart: restart already in progress [%s], or outstanding child processes [%d]",
970 phase_str[gs.phase], gs.numpids);
971 return;
972 }
973 /* Is it too soon for a restart? */
974 {
975 struct timeval delay;
976 if (time_elapsed(&delay, &gs.special->restart.time)->tv_sec
977 < gs.special->restart.interval) {
978 if (gs.loglevel > LOG_DEBUG + 1)
979 zlog_debug(
980 "postponing phased global restart: elapsed time %ld < retry interval %ld",
981 (long)delay.tv_sec,
982 gs.special->restart.interval);
983 return;
984 }
985 }
986 run_job(&gs.restart, "restart", gs.restart_command, 0, 1);
987 }
988
989 static void wakeup_unresponsive(struct thread *t_wakeup)
990 {
991 struct daemon *dmn = THREAD_ARG(t_wakeup);
992
993 dmn->t_wakeup = NULL;
994 if (dmn->state != DAEMON_UNRESPONSIVE)
995 flog_err(EC_WATCHFRR_CONNECTION,
996 "%s: no longer unresponsive (now %s), wakeup should have been cancelled!",
997 dmn->name, state_str[dmn->state]);
998 else {
999 SET_WAKEUP_UNRESPONSIVE(dmn);
1000 try_restart(dmn);
1001 }
1002 }
1003
1004 static void wakeup_no_answer(struct thread *t_wakeup)
1005 {
1006 struct daemon *dmn = THREAD_ARG(t_wakeup);
1007
1008 dmn->t_wakeup = NULL;
1009 dmn->state = DAEMON_UNRESPONSIVE;
1010 if (dmn->ignore_timeout)
1011 return;
1012 flog_err(EC_WATCHFRR_CONNECTION,
1013 "%s state -> unresponsive : no response yet to ping sent %ld seconds ago",
1014 dmn->name, gs.timeout);
1015 SET_WAKEUP_UNRESPONSIVE(dmn);
1016 try_restart(dmn);
1017 }
1018
1019 static void wakeup_send_echo(struct thread *t_wakeup)
1020 {
1021 static const char echocmd[] = "echo " PING_TOKEN;
1022 ssize_t rc;
1023 struct daemon *dmn = THREAD_ARG(t_wakeup);
1024
1025 dmn->t_wakeup = NULL;
1026 if (((rc = write(dmn->fd, echocmd, sizeof(echocmd))) < 0)
1027 || ((size_t)rc != sizeof(echocmd))) {
1028 char why[100 + sizeof(echocmd)];
1029 snprintf(why, sizeof(why),
1030 "write '%s' returned %d instead of %u", echocmd,
1031 (int)rc, (unsigned int)sizeof(echocmd));
1032 daemon_down(dmn, why);
1033 } else {
1034 gettimeofday(&dmn->echo_sent, NULL);
1035 thread_add_timer(master, wakeup_no_answer, dmn, gs.timeout,
1036 &dmn->t_wakeup);
1037 }
1038 }
1039
1040 bool check_all_up(void)
1041 {
1042 struct daemon *dmn;
1043
1044 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1045 if (dmn->state != DAEMON_UP)
1046 return false;
1047 return true;
1048 }
1049
1050 void watchfrr_status(struct vty *vty)
1051 {
1052 struct daemon *dmn;
1053 struct timeval delay;
1054
1055 vty_out(vty, "watchfrr global phase: %s\n", phase_str[gs.phase]);
1056 if (gs.restart.pid)
1057 vty_out(vty, " global restart running, pid %ld\n",
1058 (long)gs.restart.pid);
1059
1060 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1061 vty_out(vty, " %-20s %s%s", dmn->name, state_str[dmn->state],
1062 dmn->ignore_timeout ? "/Ignoring Timeout\n" : "\n");
1063 if (dmn->restart.pid)
1064 vty_out(vty, " restart running, pid %ld\n",
1065 (long)dmn->restart.pid);
1066 else if (dmn->state == DAEMON_DOWN &&
1067 time_elapsed(&delay, &dmn->restart.time)->tv_sec
1068 < dmn->restart.interval)
1069 vty_out(vty, " restarting in %jd seconds (%jds backoff interval)\n",
1070 (intmax_t)dmn->restart.interval
1071 - (intmax_t)delay.tv_sec,
1072 (intmax_t)dmn->restart.interval);
1073 }
1074 }
1075
/*
 * Termination handler, registered for both SIGINT and SIGTERM in
 * watchfrr_signals: log the shutdown, notify systemd that the service is
 * stopping and exit successfully.
 */
static void sigint(void)
{
	zlog_notice("Terminating on signal");
	systemd_send_stopping();
	exit(EXIT_SUCCESS);
}
1082
/*
 * Check that a command template contains exactly one '%' and that it is
 * the start of a "%s" placeholder.
 *
 * cmd - command template; may be NULL
 *
 * Returns 1 when the template is acceptable, 0 otherwise (NULL, no '%',
 * a '%' not followed by 's', or more than one '%').
 */
static int valid_command(const char *cmd)
{
	const char *percent;

	if (!cmd)
		return 0;

	percent = strchr(cmd, '%');
	if (!percent)
		return 0;

	/* the first '%' has to introduce "%s" ... */
	if (percent[1] != 's')
		return 0;

	/* ... and no further '%' may follow it */
	return strchr(percent + 1, '%') == NULL;
}
1093
/*
 * This is an ugly hack to circumvent problems with passing command-line
 * arguments that contain spaces.  The fix is to use a configuration file.
 *
 * Returns a newly allocated copy of cmd in which every occurrence of
 * blankstr has been replaced by a single space.  The caller owns the
 * returned string.  The process exits with status 1 if the copy cannot be
 * allocated.
 *
 * cmd      - command string to translate (left untouched)
 * blankstr - token standing in for a blank; an empty token matches nothing
 *            and yields an unmodified copy
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	char *res;
	char *p;
	size_t bslen = strlen(blankstr);

	res = strdup(cmd);
	if (!res) {
		perror("strdup");
		exit(1);
	}

	/*
	 * strstr() matches an empty needle at every position; replacing such
	 * a "match" would grow the string past its allocation and never
	 * terminate.
	 */
	if (bslen == 0)
		return res;

	p = res;
	while ((p = strstr(p, blankstr)) != NULL) {
		*p = ' ';
		if (bslen != 1)
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
		/*
		 * Resume after the space just written: nothing before it can
		 * match any more, and a token that itself starts with a space
		 * must not be found again at the same spot.
		 */
		p++;
	}
	return res;
}
1113
/*
 * Timer callback scheduled by watchfrr_init() with STARTUP_TIMEOUT: calls
 * daemon_send_ready(1).  The timer argument is not used.
 */
static void startup_timeout(struct thread *t_wakeup)
{
	(void)t_wakeup;

	daemon_send_ready(1);
}
1118
1119 #ifdef GNU_LINUX
1120
1121 #include <sys/mount.h>
1122 #include <sched.h>
1123
1124 #define NETNS_RUN_DIR "/var/run/netns"
1125
1126 static void netns_create(int dirfd, const char *nsname)
1127 {
1128 /* make /var/run/netns shared between mount namespaces
1129 * just like iproute2 sets it up
1130 */
1131 if (mount("", NETNS_RUN_DIR, "none", MS_SHARED | MS_REC, NULL)) {
1132 if (errno != EINVAL) {
1133 perror("mount");
1134 exit(1);
1135 }
1136
1137 if (mount(NETNS_RUN_DIR, NETNS_RUN_DIR, "none",
1138 MS_BIND | MS_REC, NULL)) {
1139 perror("mount");
1140 exit(1);
1141 }
1142
1143 if (mount("", NETNS_RUN_DIR, "none", MS_SHARED | MS_REC,
1144 NULL)) {
1145 perror("mount");
1146 exit(1);
1147 }
1148 }
1149
1150 /* need an empty file to mount on top of */
1151 int nsfd = openat(dirfd, nsname, O_CREAT | O_RDONLY | O_EXCL, 0);
1152
1153 if (nsfd < 0) {
1154 fprintf(stderr, "failed to create \"%s/%s\": %s\n",
1155 NETNS_RUN_DIR, nsname, strerror(errno));
1156 exit(1);
1157 }
1158 close(nsfd);
1159
1160 if (unshare(CLONE_NEWNET)) {
1161 perror("unshare");
1162 unlinkat(dirfd, nsname, 0);
1163 exit(1);
1164 }
1165
1166 char *dstpath = asprintfrr(MTYPE_TMP, "%s/%s", NETNS_RUN_DIR, nsname);
1167
1168 /* bind-mount so the namespace has a name and is persistent */
1169 if (mount("/proc/self/ns/net", dstpath, "none", MS_BIND, NULL) < 0) {
1170 fprintf(stderr, "failed to bind-mount netns to \"%s\": %s\n",
1171 dstpath, strerror(errno));
1172 unlinkat(dirfd, nsname, 0);
1173 exit(1);
1174 }
1175
1176 XFREE(MTYPE_TMP, dstpath);
1177 }
1178
1179 static void netns_setup(const char *nsname)
1180 {
1181 int dirfd, nsfd;
1182
1183 dirfd = open(NETNS_RUN_DIR, O_DIRECTORY | O_RDONLY);
1184 if (dirfd < 0) {
1185 if (errno == ENOTDIR) {
1186 fprintf(stderr, "error: \"%s\" is not a directory!\n",
1187 NETNS_RUN_DIR);
1188 exit(1);
1189 } else if (errno == ENOENT) {
1190 if (mkdir(NETNS_RUN_DIR, 0755)) {
1191 fprintf(stderr, "error: \"%s\": mkdir: %s\n",
1192 NETNS_RUN_DIR, strerror(errno));
1193 exit(1);
1194 }
1195 dirfd = open(NETNS_RUN_DIR, O_DIRECTORY | O_RDONLY);
1196 if (dirfd < 0) {
1197 fprintf(stderr, "error: \"%s\": opendir: %s\n",
1198 NETNS_RUN_DIR, strerror(errno));
1199 exit(1);
1200 }
1201 } else {
1202 fprintf(stderr, "error: \"%s\": %s\n",
1203 NETNS_RUN_DIR, strerror(errno));
1204 exit(1);
1205 }
1206 }
1207
1208 nsfd = openat(dirfd, nsname, O_RDONLY);
1209 if (nsfd < 0 && errno != ENOENT) {
1210 fprintf(stderr, "error: \"%s/%s\": %s\n",
1211 NETNS_RUN_DIR, nsname, strerror(errno));
1212 exit(1);
1213 }
1214 if (nsfd < 0)
1215 netns_create(dirfd, nsname);
1216 else {
1217 if (setns(nsfd, CLONE_NEWNET)) {
1218 perror("setns");
1219 exit(1);
1220 }
1221 close(nsfd);
1222 }
1223 close(dirfd);
1224
1225 /* make sure loopback is up... weird things happen otherwise.
1226 * ioctl is perfectly fine for this, don't need netlink...
1227 */
1228 int sockfd;
1229 struct ifreq ifr = { };
1230
1231 strlcpy(ifr.ifr_name, "lo", sizeof(ifr.ifr_name));
1232
1233 sockfd = socket(AF_INET, SOCK_DGRAM, 0);
1234 if (sockfd < 0) {
1235 perror("socket");
1236 exit(1);
1237 }
1238 if (ioctl(sockfd, SIOCGIFFLAGS, &ifr)) {
1239 perror("ioctl(SIOCGIFFLAGS, \"lo\")");
1240 exit(1);
1241 }
1242 if (!(ifr.ifr_flags & IFF_UP)) {
1243 ifr.ifr_flags |= IFF_UP;
1244 if (ioctl(sockfd, SIOCSIFFLAGS, &ifr)) {
1245 perror("ioctl(SIOCSIFFLAGS, \"lo\")");
1246 exit(1);
1247 }
1248 }
1249 close(sockfd);
1250 }
1251
1252 #else /* !GNU_LINUX */
1253
/*
 * Fallback for builds without GNU_LINUX: network namespaces are not
 * supported, so report that on stderr and exit with status 1.  The
 * namespace name is not used.
 */
static void netns_setup(const char *nsname)
{
	(void)nsname;

	fputs("network namespaces are only available on Linux\n", stderr);
	exit(1);
}
1259 #endif
1260
1261 static void watchfrr_init(int argc, char **argv)
1262 {
1263 const char *special = "zebra";
1264 int i;
1265 struct daemon *dmn, **add = &gs.daemons;
1266 char alldaemons[512] = "", *p = alldaemons;
1267
1268 thread_add_timer_msec(master, startup_timeout, NULL, STARTUP_TIMEOUT,
1269 &gs.t_startup_timeout);
1270
1271 for (i = optind; i < argc; i++) {
1272 dmn = XCALLOC(MTYPE_WATCHFRR_DAEMON, sizeof(*dmn));
1273
1274 dmn->name = dmn->restart.name = argv[i];
1275 dmn->state = DAEMON_INIT;
1276 gs.numdaemons++;
1277 gs.numdown++;
1278 dmn->fd = -1;
1279 thread_add_timer_msec(master, wakeup_init, dmn, 0,
1280 &dmn->t_wakeup);
1281 dmn->restart.interval = gs.min_restart_interval;
1282 *add = dmn;
1283 add = &dmn->next;
1284
1285 if (!strcmp(dmn->name, special))
1286 gs.special = dmn;
1287 }
1288
1289 if (!gs.daemons) {
1290 fprintf(stderr,
1291 "Must specify one or more daemons to monitor.\n\n");
1292 frr_help_exit(1);
1293 }
1294 if (!watch_only && !gs.special) {
1295 fprintf(stderr, "\"%s\" daemon must be in daemon lists\n\n",
1296 special);
1297 frr_help_exit(1);
1298 }
1299
1300 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1301 snprintf(p, alldaemons + sizeof(alldaemons) - p, "%s%s",
1302 (p == alldaemons) ? "" : " ", dmn->name);
1303 p += strlen(p);
1304 }
1305 zlog_notice("%s %s watching [%s]%s", progname, FRR_VERSION, alldaemons,
1306 watch_only ? ", monitor mode" : "");
1307 }
1308
1309 struct zebra_privs_t watchfrr_privs = {
1310 #ifdef VTY_GROUP
1311 .vty_group = VTY_GROUP,
1312 #endif
1313 };
1314
1315 static struct frr_signal_t watchfrr_signals[] = {
1316 {
1317 .signal = SIGINT,
1318 .handler = sigint,
1319 },
1320 {
1321 .signal = SIGTERM,
1322 .handler = sigint,
1323 },
1324 {
1325 .signal = SIGCHLD,
1326 .handler = sigchild,
1327 },
1328 };
1329
1330 FRR_DAEMON_INFO(watchfrr, WATCHFRR,
1331 .flags = FRR_NO_PRIVSEP | FRR_NO_TCPVTY | FRR_LIMITED_CLI
1332 | FRR_NO_CFG_PID_DRY | FRR_NO_ZCLIENT
1333 | FRR_DETACH_LATER,
1334
1335 .printhelp = printhelp,
1336 .copyright = "Copyright 2004 Andrew J. Schorr",
1337
1338 .signals = watchfrr_signals,
1339 .n_signals = array_size(watchfrr_signals),
1340
1341 .privs = &watchfrr_privs,
1342 );
1343
1344 #define DEPRECATED_OPTIONS "aAezR:"
1345
1346 int main(int argc, char **argv)
1347 {
1348 int opt;
1349 const char *blankstr = NULL;
1350 const char *netns = NULL;
1351 bool netns_en = false;
1352
1353 frr_preinit(&watchfrr_di, argc, argv);
1354 progname = watchfrr_di.progname;
1355
1356 frr_opt_add("b:di:k:l:N:p:r:S:s:t:T:" DEPRECATED_OPTIONS, longopts, "");
1357
1358 gs.restart.name = "all";
1359 while ((opt = frr_getopt(argc, argv, NULL)) != EOF) {
1360 if (opt && opt < 128 && strchr(DEPRECATED_OPTIONS, opt)) {
1361 fprintf(stderr,
1362 "The -%c option no longer exists.\n"
1363 "Please refer to the watchfrr(8) man page.\n",
1364 opt);
1365 exit(1);
1366 }
1367
1368 switch (opt) {
1369 case 0:
1370 break;
1371 case 'b':
1372 blankstr = optarg;
1373 break;
1374 case OPTION_DRY:
1375 watch_only = true;
1376 break;
1377 case 'k':
1378 if (!valid_command(optarg)) {
1379 fprintf(stderr,
1380 "Invalid kill command, must contain '%%s': %s\n",
1381 optarg);
1382 frr_help_exit(1);
1383 }
1384 gs.stop_command = optarg;
1385 break;
1386 case 'l': {
1387 char garbage[3];
1388 if ((sscanf(optarg, "%d%1s", &gs.loglevel, garbage)
1389 != 1)
1390 || (gs.loglevel < LOG_EMERG)) {
1391 fprintf(stderr,
1392 "Invalid loglevel argument: %s\n",
1393 optarg);
1394 frr_help_exit(1);
1395 }
1396 } break;
1397 case OPTION_MINRESTART: {
1398 char garbage[3];
1399 if ((sscanf(optarg, "%ld%1s", &gs.min_restart_interval,
1400 garbage)
1401 != 1)
1402 || (gs.min_restart_interval < 0)) {
1403 fprintf(stderr,
1404 "Invalid min_restart_interval argument: %s\n",
1405 optarg);
1406 frr_help_exit(1);
1407 }
1408 } break;
1409 case OPTION_MAXRESTART: {
1410 char garbage[3];
1411 if ((sscanf(optarg, "%ld%1s", &gs.max_restart_interval,
1412 garbage)
1413 != 1)
1414 || (gs.max_restart_interval < 0)) {
1415 fprintf(stderr,
1416 "Invalid max_restart_interval argument: %s\n",
1417 optarg);
1418 frr_help_exit(1);
1419 }
1420 } break;
1421 case OPTION_MAXOPERATIONAL: {
1422 char garbage[3];
1423
1424 if ((sscanf(optarg, "%ld%1s", &gs.operational_timeout,
1425 garbage) != 1) ||
1426 (gs.max_restart_interval < 0)) {
1427 fprintf(stderr,
1428 "Invalid Operational_timeout argument: %s\n",
1429 optarg);
1430 frr_help_exit(1);
1431 }
1432 } break;
1433 case OPTION_NETNS:
1434 netns_en = true;
1435 if (optarg && strchr(optarg, '/')) {
1436 fprintf(stderr,
1437 "invalid network namespace name \"%s\" (may not contain slashes)\n",
1438 optarg);
1439 frr_help_exit(1);
1440 }
1441 netns = optarg;
1442 break;
1443 case 'i': {
1444 char garbage[3];
1445 int period;
1446 if ((sscanf(optarg, "%d%1s", &period, garbage) != 1)
1447 || (gs.period < 1)) {
1448 fprintf(stderr,
1449 "Invalid interval argument: %s\n",
1450 optarg);
1451 frr_help_exit(1);
1452 }
1453 gs.period = 1000 * period;
1454 } break;
1455 case 'p':
1456 watchfrr_di.pid_file = optarg;
1457 break;
1458 case 'r':
1459 if (!valid_command(optarg)) {
1460 fprintf(stderr,
1461 "Invalid restart command, must contain '%%s': %s\n",
1462 optarg);
1463 frr_help_exit(1);
1464 }
1465 gs.restart_command = optarg;
1466 break;
1467 case 's':
1468 if (!valid_command(optarg)) {
1469 fprintf(stderr,
1470 "Invalid start command, must contain '%%s': %s\n",
1471 optarg);
1472 frr_help_exit(1);
1473 }
1474 gs.start_command = optarg;
1475 break;
1476 case 'S':
1477 gs.vtydir = optarg;
1478 break;
1479 case 't': {
1480 char garbage[3];
1481 if ((sscanf(optarg, "%ld%1s", &gs.timeout, garbage)
1482 != 1)
1483 || (gs.timeout < 1)) {
1484 fprintf(stderr,
1485 "Invalid timeout argument: %s\n",
1486 optarg);
1487 frr_help_exit(1);
1488 }
1489 } break;
1490 case 'T': {
1491 char garbage[3];
1492 if ((sscanf(optarg, "%ld%1s", &gs.restart_timeout,
1493 garbage)
1494 != 1)
1495 || (gs.restart_timeout < 1)) {
1496 fprintf(stderr,
1497 "Invalid restart timeout argument: %s\n",
1498 optarg);
1499 frr_help_exit(1);
1500 }
1501 } break;
1502 default:
1503 fputs("Invalid option.\n", stderr);
1504 frr_help_exit(1);
1505 }
1506 }
1507
1508 if (watch_only
1509 && (gs.start_command || gs.stop_command || gs.restart_command)) {
1510 fputs("Options -r/-s/-k are not used when --dry is active.\n",
1511 stderr);
1512 }
1513 if (!watch_only
1514 && (!gs.restart_command || !gs.start_command || !gs.stop_command)) {
1515 fprintf(stderr,
1516 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1517 frr_help_exit(1);
1518 }
1519
1520 if (blankstr) {
1521 if (gs.restart_command)
1522 gs.restart_command =
1523 translate_blanks(gs.restart_command, blankstr);
1524 if (gs.start_command)
1525 gs.start_command =
1526 translate_blanks(gs.start_command, blankstr);
1527 if (gs.stop_command)
1528 gs.stop_command =
1529 translate_blanks(gs.stop_command, blankstr);
1530 }
1531
1532 gs.restart.interval = gs.min_restart_interval;
1533
1534 /* env variable for the processes that we start */
1535 if (watchfrr_di.pathspace)
1536 setenv("FRR_PATHSPACE", watchfrr_di.pathspace, 1);
1537 else
1538 unsetenv("FRR_PATHSPACE");
1539
1540 /*
1541 * when watchfrr_di.pathspace is read, if it is not specified
1542 * pathspace is NULL as expected
1543 */
1544 pathspace = watchfrr_di.pathspace;
1545
1546 if (netns_en && !netns)
1547 netns = watchfrr_di.pathspace;
1548
1549 if (netns_en && netns && netns[0])
1550 netns_setup(netns);
1551
1552 master = frr_init();
1553 watchfrr_error_init();
1554 watchfrr_init(argc, argv);
1555 watchfrr_vty_init();
1556
1557 frr_config_fork();
1558
1559 if (watchfrr_di.daemon_mode)
1560 zlog_syslog_set_prio_min(MIN(gs.loglevel, LOG_DEBUG));
1561 else
1562 zlog_aux_init(NULL, MIN(gs.loglevel, LOG_DEBUG));
1563
1564 frr_run(master);
1565
1566 systemd_send_stopping();
1567 /* Not reached. */
1568 return 0;
1569 }