]> git.proxmox.com Git - mirror_frr.git/blob - watchfrr/watchfrr.c
Merge pull request #6738 from deastoe/frr-reload-log-level
[mirror_frr.git] / watchfrr / watchfrr.c
1 /*
2 * Monitor status of frr daemons and restart if necessary.
3 *
4 * Copyright (C) 2004 Andrew J. Schorr
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include <zebra.h>
22 #include <thread.h>
23 #include <log.h>
24 #include <network.h>
25 #include <sigevent.h>
26 #include <lib/version.h>
27 #include "command.h"
28 #include "libfrr.h"
29 #include "lib_errors.h"
30 #include "zlog_targets.h"
31 #include "network.h"
32 #include "printfrr.h"
33
34 #include <getopt.h>
35 #include <sys/un.h>
36 #include <sys/wait.h>
37 #include <memory.h>
38 #include <systemd.h>
39
40 #include "watchfrr.h"
41 #include "watchfrr_errors.h"
42
/* Smaller of two values; each argument may be evaluated more than once. */
#ifndef MIN
#define MIN(X, Y) (((X) <= (Y)) ? (X) : (Y))
#endif

/*
 * Timer randomization helpers.
 *
 * JITTER(X) takes frr_weak_random() modulo (X + 1) and shifts the result
 * down by X / 2; FUZZY(X) adds the jitter of one twentieth of X to X.
 */
#define JITTER(X) ((frr_weak_random() % ((X) + 1)) - ((X) / 2))
#define FUZZY(X) ((X) + JITTER((X) / 20))
50
/*
 * Built-in defaults.  The numeric values are reported by the help text as
 * seconds (polling interval, unresponsiveness timeout, restart/kill timeout
 * and the minimum/maximum restart back-off).
 */
#define DEFAULT_PERIOD		5
#define DEFAULT_TIMEOUT		90
#define DEFAULT_RESTART_TIMEOUT	20
#define DEFAULT_LOGLEVEL	LOG_INFO
#define DEFAULT_MIN_RESTART	60
#define DEFAULT_MAX_RESTART	600

/* Default shell command templates; "%s" is replaced by the daemon name. */
#define DEFAULT_RESTART_CMD	WATCHFRR_SH_PATH " restart %s"
#define DEFAULT_START_CMD	WATCHFRR_SH_PATH " start %s"
#define DEFAULT_STOP_CMD	WATCHFRR_SH_PATH " stop %s"

/* Token sent with the "echo" command and expected back in the reply. */
#define PING_TOKEN "PING"
63
64 DEFINE_MGROUP(WATCHFRR, "watchfrr")
65 DEFINE_MTYPE_STATIC(WATCHFRR, WATCHFRR_DAEMON, "watchfrr daemon entry")
66
67 /* Needs to be global, referenced somewhere inside libfrr. */
68 struct thread_master *master;
69
70 static bool watch_only = false;
71
/*
 * Global state machine: idle, startup, and the successive steps of a phased
 * restart.  The enumerator values index phase_str[] below.
 */
typedef enum {
	PHASE_NONE = 0,
	PHASE_INIT,
	PHASE_STOPS_PENDING,
	PHASE_WAITING_DOWN,
	PHASE_ZEBRA_RESTART_PENDING,
	PHASE_WAITING_ZEBRA_UP
} restart_phase_t;

/* Human-readable label for each phase, indexed by restart_phase_t. */
static const char *const phase_str[] = {
	[PHASE_NONE] = "Idle",
	[PHASE_INIT] = "Startup",
	[PHASE_STOPS_PENDING] = "Stop jobs running",
	[PHASE_WAITING_DOWN] = "Waiting for other daemons to come down",
	[PHASE_ZEBRA_RESTART_PENDING] = "Zebra restart job running",
	[PHASE_WAITING_ZEBRA_UP] = "Waiting for zebra to come up",
	/* Extra trailing label; no enumerator above maps to it. */
	"Start jobs running",
};
90
/* Seconds a single restart phase may last before it is declared hanging. */
#define PHASE_TIMEOUT (3 * gs.restart_timeout)

/*
 * Startup deadline.  Parenthesized so the product stays one operand when the
 * macro is used inside a larger expression (e.g. x / STARTUP_TIMEOUT).
 * NOTE(review): presumably milliseconds (55 s) -- confirm at the call site.
 */
#define STARTUP_TIMEOUT (55 * 1000)
93
/* Bookkeeping for one background start/stop/restart job. */
struct restart_info {
	const char *name;	/* target name used in log messages */
	const char *what;	/* job type string, recorded by run_job() */
	pid_t pid;		/* pid of the running job, 0 when none */
	struct timeval time;	/* when the job was launched / last reaped */
	long interval;		/* minimum seconds between two attempts */
	struct thread *t_kill;	/* timer that signals an overdue job */
	int kills;		/* signals sent so far: 0 -> SIGTERM, else SIGKILL */
};
103
104 static struct global_state {
105 restart_phase_t phase;
106 struct thread *t_phase_hanging;
107 struct thread *t_startup_timeout;
108 const char *vtydir;
109 long period;
110 long timeout;
111 long restart_timeout;
112 long min_restart_interval;
113 long max_restart_interval;
114 struct daemon *daemons;
115 const char *restart_command;
116 const char *start_command;
117 const char *stop_command;
118 struct restart_info restart;
119 int loglevel;
120 struct daemon *special; /* points to zebra when doing phased restart */
121 int numdaemons;
122 int numpids;
123 int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
124 } gs = {
125 .phase = PHASE_INIT,
126 .vtydir = frr_vtydir,
127 .period = 1000 * DEFAULT_PERIOD,
128 .timeout = DEFAULT_TIMEOUT,
129 .restart_timeout = DEFAULT_RESTART_TIMEOUT,
130 .loglevel = DEFAULT_LOGLEVEL,
131 .min_restart_interval = DEFAULT_MIN_RESTART,
132 .max_restart_interval = DEFAULT_MAX_RESTART,
133 .restart_command = DEFAULT_RESTART_CMD,
134 .start_command = DEFAULT_START_CMD,
135 .stop_command = DEFAULT_STOP_CMD,
136 };
137
/* Connection state of one monitored daemon; values index state_str[]. */
typedef enum {
	DAEMON_INIT,
	DAEMON_DOWN,
	DAEMON_CONNECTING,
	DAEMON_UP,
	DAEMON_UNRESPONSIVE
} daemon_state_t;

/* True while a connection exists, whether or not echoes are answered. */
#define IS_UP(DMN)                                                             \
	(((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))

/* Display name of each daemon_state_t value. */
static const char *const state_str[] = {
	[DAEMON_INIT] = "Init",
	[DAEMON_DOWN] = "Down",
	[DAEMON_CONNECTING] = "Connecting",
	[DAEMON_UP] = "Up",
	[DAEMON_UNRESPONSIVE] = "Unresponsive",
};
152
153 struct daemon {
154 const char *name;
155 daemon_state_t state;
156 int fd;
157 struct timeval echo_sent;
158 unsigned int connect_tries;
159 struct thread *t_wakeup;
160 struct thread *t_read;
161 struct thread *t_write;
162 struct daemon *next;
163 struct restart_info restart;
164
165 /*
166 * For a given daemon, if we've turned on ignore timeouts
167 * ignore the timeout value and assume everything is ok
168 * This is for daemon debugging w/ gdb after we have started
169 * FRR and realize we have something that needs to be looked
170 * at
171 */
172 bool ignore_timeout;
173 };
174
/* getopt_long() return codes for the options that have no short form. */
#define OPTION_MINRESTART	2000
#define OPTION_MAXRESTART	2001
#define OPTION_DRY		2002
#define OPTION_NETNS		2003

/* Long command line options understood by watchfrr. */
static const struct option longopts[] = {
	{"daemon",               no_argument,       NULL, 'd'},
	{"statedir",             required_argument, NULL, 'S'},
	{"loglevel",             required_argument, NULL, 'l'},
	{"interval",             required_argument, NULL, 'i'},
	{"timeout",              required_argument, NULL, 't'},
	{"restart-timeout",      required_argument, NULL, 'T'},
	{"restart",              required_argument, NULL, 'r'},
	{"start-command",        required_argument, NULL, 's'},
	{"kill-command",         required_argument, NULL, 'k'},
	{"dry",                  no_argument,       NULL, OPTION_DRY},
	{"min-restart-interval", required_argument, NULL, OPTION_MINRESTART},
	{"max-restart-interval", required_argument, NULL, OPTION_MAXRESTART},
	{"pid-file",             required_argument, NULL, 'p'},
	{"blank-string",         required_argument, NULL, 'b'},
#ifdef GNU_LINUX
	{"netns",                optional_argument, NULL, OPTION_NETNS},
#endif
	{"help",                 no_argument,       NULL, 'h'},
	{"version",              no_argument,       NULL, 'v'},
	{NULL, 0, NULL, 0}, /* terminator */
};
201
/* Forward declarations for functions that are used before their definition. */
static int try_connect(struct daemon *dmn);
static int wakeup_send_echo(struct thread *t_wakeup);
static void try_restart(struct daemon *dmn);
static void phase_check(void);
static void restart_done(struct daemon *dmn);

/* Program name shown in the usage line of printhelp(). */
static const char *progname;
209
210 void watchfrr_set_ignore_daemon(struct vty *vty, const char *dname, bool ignore)
211 {
212 struct daemon *dmn;
213
214 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
215 if (strncmp(dmn->name, dname, strlen(dmn->name)) == 0)
216 break;
217 }
218
219 if (dmn) {
220 dmn->ignore_timeout = ignore;
221 vty_out(vty, "%s switching to %s\n", dmn->name,
222 ignore ? "ignore" : "watch");
223 } else
224 vty_out(vty, "%s is not configured for running at the moment",
225 dname);
226 }
227
228 static void printhelp(FILE *target)
229 {
230 fprintf(target,
231 "Usage : %s [OPTION...] <daemon name> ...\n\n\
232 Watchdog program to monitor status of frr daemons and try to restart\n\
233 them if they are down or unresponsive. It determines whether a daemon is\n\
234 up based on whether it can connect to the daemon's vty unix stream socket.\n\
235 It then repeatedly sends echo commands over that socket to determine whether\n\
236 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
237 on the socket connection and know immediately that the daemon is down.\n\n\
238 The daemons to be monitored should be listed on the command line.\n\n\
239 In order to avoid attempting to restart the daemons in a fast loop,\n\
240 the -m and -M options allow you to control the minimum delay between\n\
241 restart commands. The minimum restart delay is recalculated each time\n\
242 a restart is attempted: if the time since the last restart attempt exceeds\n\
243 twice the -M value, then the restart delay is set to the -m value.\n\
244 Otherwise, the interval is doubled (but capped at the -M value).\n\n",
245 progname);
246
247 fprintf(target,
248 "Options:\n\
249 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
250 to syslog instead of stdout.\n\
251 -S, --statedir Set the vty socket directory (default is %s)\n\
252 -N, --pathspace Insert prefix into config & socket paths\n"
253 #ifdef GNU_LINUX
254 " --netns Create and/or use Linux network namespace. If no name is\n"
255 " given, uses the value from `-N`.\n"
256 #endif
257 "-l, --loglevel Set the logging level (default is %d).\n\
258 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
259 but it can be set higher than %d if extra-verbose debugging\n\
260 messages are desired.\n\
261 --min-restart-interval\n\
262 Set the minimum seconds to wait between invocations of daemon\n\
263 restart commands (default is %d).\n\
264 --max-restart-interval\n\
265 Set the maximum seconds to wait between invocations of daemon\n\
266 restart commands (default is %d).\n\
267 -i, --interval Set the status polling interval in seconds (default is %d)\n\
268 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
269 -T, --restart-timeout\n\
270 Set the restart (kill) timeout in seconds (default is %d).\n\
271 If any background jobs are still running after this much\n\
272 time has elapsed, they will be killed.\n\
273 -r, --restart Supply a Bourne shell command to use to restart a single\n\
274 daemon. The command string should include '%%s' where the\n\
275 name of the daemon should be substituted.\n\
276 (default: '%s')\n\
277 -s, --start-command\n\
278 Supply a Bourne shell to command to use to start a single\n\
279 daemon. The command string should include '%%s' where the\n\
280 name of the daemon should be substituted.\n\
281 (default: '%s')\n\
282 -k, --kill-command\n\
283 Supply a Bourne shell to command to use to stop a single\n\
284 daemon. The command string should include '%%s' where the\n\
285 name of the daemon should be substituted.\n\
286 (default: '%s')\n\
287 --dry Do not start or restart anything, just log.\n\
288 -p, --pid-file Set process identifier file name\n\
289 (default is %s/watchfrr.pid).\n\
290 -b, --blank-string\n\
291 When the supplied argument string is found in any of the\n\
292 various shell command arguments (-r, -s, or -k), replace\n\
293 it with a space. This is an ugly hack to circumvent problems\n\
294 passing command-line arguments with embedded spaces.\n\
295 -v, --version Print program version\n\
296 -h, --help Display this help and exit\n",
297 frr_vtydir, DEFAULT_LOGLEVEL, LOG_EMERG, LOG_DEBUG, LOG_DEBUG,
298 DEFAULT_MIN_RESTART, DEFAULT_MAX_RESTART, DEFAULT_PERIOD,
299 DEFAULT_TIMEOUT, DEFAULT_RESTART_TIMEOUT,
300 DEFAULT_RESTART_CMD, DEFAULT_START_CMD, DEFAULT_STOP_CMD,
301 frr_vtydir);
302 }
303
304 static pid_t run_background(char *shell_cmd)
305 {
306 pid_t child;
307
308 switch (child = fork()) {
309 case -1:
310 flog_err_sys(EC_LIB_SYSTEM_CALL,
311 "fork failed, cannot run command [%s]: %s",
312 shell_cmd, safe_strerror(errno));
313 return -1;
314 case 0:
315 /* Child process. */
316 /* Use separate process group so child processes can be killed
317 * easily. */
318 if (setpgid(0, 0) < 0)
319 zlog_warn("warning: setpgid(0,0) failed: %s",
320 safe_strerror(errno));
321 {
322 char shell[] = "sh";
323 char dashc[] = "-c";
324 char *const argv[4] = {shell, dashc, shell_cmd, NULL};
325 execv("/bin/sh", argv);
326 flog_err_sys(EC_LIB_SYSTEM_CALL,
327 "execv(/bin/sh -c '%s') failed: %s",
328 shell_cmd, safe_strerror(errno));
329 _exit(127);
330 }
331 default:
332 /* Parent process: we will reap the child later. */
333 zlog_info("Forked background command [pid %d]: %s", (int)child,
334 shell_cmd);
335 return child;
336 }
337 }
338
/*
 * Store in *result the wall-clock time that has passed since *start_time
 * and return result.  The microsecond field of the answer is never negative.
 */
static struct timeval *time_elapsed(struct timeval *result,
				    const struct timeval *start_time)
{
	gettimeofday(result, NULL);

	result->tv_sec = result->tv_sec - start_time->tv_sec;
	result->tv_usec = result->tv_usec - start_time->tv_usec;

	/* Borrow whole seconds until the microseconds are non-negative. */
	for (; result->tv_usec < 0; result->tv_sec--)
		result->tv_usec += 1000000L;

	return result;
}
351
352 static int restart_kill(struct thread *t_kill)
353 {
354 struct restart_info *restart = THREAD_ARG(t_kill);
355 struct timeval delay;
356
357 time_elapsed(&delay, &restart->time);
358 zlog_warn(
359 "Warning: %s %s child process %d still running after %ld seconds, sending signal %d",
360 restart->what, restart->name, (int)restart->pid,
361 (long)delay.tv_sec, (restart->kills ? SIGKILL : SIGTERM));
362 kill(-restart->pid, (restart->kills ? SIGKILL : SIGTERM));
363 restart->kills++;
364 restart->t_kill = NULL;
365 thread_add_timer(master, restart_kill, restart, gs.restart_timeout,
366 &restart->t_kill);
367 return 0;
368 }
369
370 static struct restart_info *find_child(pid_t child)
371 {
372 struct daemon *dmn;
373 if (gs.restart.pid == child)
374 return &gs.restart;
375
376 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
377 if (dmn->restart.pid == child)
378 return &dmn->restart;
379 }
380 return NULL;
381 }
382
383 static void sigchild(void)
384 {
385 pid_t child;
386 int status;
387 const char *name;
388 const char *what;
389 struct restart_info *restart;
390 struct daemon *dmn;
391
392 switch (child = waitpid(-1, &status, WNOHANG)) {
393 case -1:
394 flog_err_sys(EC_LIB_SYSTEM_CALL, "waitpid failed: %s",
395 safe_strerror(errno));
396 return;
397 case 0:
398 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
399 return;
400 }
401
402 if (child == integrated_write_pid) {
403 integrated_write_sigchld(status);
404 return;
405 }
406
407 if ((restart = find_child(child)) != NULL) {
408 name = restart->name;
409 what = restart->what;
410 restart->pid = 0;
411 gs.numpids--;
412 thread_cancel(restart->t_kill);
413 restart->t_kill = NULL;
414 /* Update restart time to reflect the time the command
415 * completed. */
416 gettimeofday(&restart->time, NULL);
417 } else {
418 flog_err_sys(
419 EC_LIB_SYSTEM_CALL,
420 "waitpid returned status for an unknown child process %d",
421 (int)child);
422 name = "(unknown)";
423 what = "background";
424 }
425 if (WIFSTOPPED(status))
426 zlog_warn("warning: %s %s process %d is stopped", what, name,
427 (int)child);
428 else if (WIFSIGNALED(status))
429 zlog_warn("%s %s process %d terminated due to signal %d", what,
430 name, (int)child, WTERMSIG(status));
431 else if (WIFEXITED(status)) {
432 if (WEXITSTATUS(status) != 0)
433 zlog_warn(
434 "%s %s process %d exited with non-zero status %d",
435 what, name, (int)child, WEXITSTATUS(status));
436 else {
437 zlog_debug("%s %s process %d exited normally", what,
438 name, (int)child);
439
440 if (restart && restart != &gs.restart) {
441 dmn = container_of(restart, struct daemon,
442 restart);
443 restart_done(dmn);
444 } else if (restart)
445 for (dmn = gs.daemons; dmn; dmn = dmn->next)
446 restart_done(dmn);
447 }
448 } else
449 flog_err_sys(
450 EC_LIB_SYSTEM_CALL,
451 "cannot interpret %s %s process %d wait status 0x%x",
452 what, name, (int)child, status);
453 phase_check();
454 }
455
456 static int run_job(struct restart_info *restart, const char *cmdtype,
457 const char *command, int force, int update_interval)
458 {
459 struct timeval delay;
460
461 if (gs.loglevel > LOG_DEBUG + 1)
462 zlog_debug("attempting to %s %s", cmdtype, restart->name);
463
464 if (restart->pid) {
465 if (gs.loglevel > LOG_DEBUG + 1)
466 zlog_debug(
467 "cannot %s %s, previous pid %d still running",
468 cmdtype, restart->name, (int)restart->pid);
469 return -1;
470 }
471
472 #if defined HAVE_SYSTEMD
473 char buffer[512];
474
475 snprintf(buffer, sizeof(buffer), "restarting %s", restart->name);
476 systemd_send_status(buffer);
477 #endif
478
479 /* Note: time_elapsed test must come before the force test, since we
480 need
481 to make sure that delay is initialized for use below in updating the
482 restart interval. */
483 if ((time_elapsed(&delay, &restart->time)->tv_sec < restart->interval)
484 && !force) {
485
486 if (gs.loglevel > LOG_DEBUG + 1)
487 zlog_debug(
488 "postponing %s %s: elapsed time %ld < retry interval %ld",
489 cmdtype, restart->name, (long)delay.tv_sec,
490 restart->interval);
491 return -1;
492 }
493
494 gettimeofday(&restart->time, NULL);
495 restart->kills = 0;
496 {
497 char cmd[strlen(command) + strlen(restart->name) + 1];
498 snprintf(cmd, sizeof(cmd), command, restart->name);
499 if ((restart->pid = run_background(cmd)) > 0) {
500 restart->t_kill = NULL;
501 thread_add_timer(master, restart_kill, restart,
502 gs.restart_timeout, &restart->t_kill);
503 restart->what = cmdtype;
504 gs.numpids++;
505 } else
506 restart->pid = 0;
507 }
508
509 #if defined HAVE_SYSTEMD
510 systemd_send_status("FRR Operational");
511 #endif
512 /* Calculate the new restart interval. */
513 if (update_interval) {
514 if (delay.tv_sec > 2 * gs.max_restart_interval)
515 restart->interval = gs.min_restart_interval;
516 else if ((restart->interval *= 2) > gs.max_restart_interval)
517 restart->interval = gs.max_restart_interval;
518 if (gs.loglevel > LOG_DEBUG + 1)
519 zlog_debug("restart %s interval is now %ld",
520 restart->name, restart->interval);
521 }
522 return restart->pid;
523 }
524
/*
 * Scheduling helpers.  Each one clears the daemon's stored thread pointer and
 * registers a new event whose handle is written back into that pointer.
 *
 * The do { } while (0) wrappers deliberately carry no trailing semicolon:
 * the caller writes it, which keeps the macros usable as the body of an
 * if/else.
 */

/* Wait for input on the daemon's socket. */
#define SET_READ_HANDLER(DMN)                                                  \
	do {                                                                   \
		(DMN)->t_read = NULL;                                          \
		thread_add_read(master, handle_read, (DMN), (DMN)->fd,         \
				&(DMN)->t_read);                               \
	} while (0)

/* Run wakeup_down() after one randomized period. */
#define SET_WAKEUP_DOWN(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_down, (DMN),              \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Run wakeup_unresponsive() after one randomized period. */
#define SET_WAKEUP_UNRESPONSIVE(DMN)                                           \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_unresponsive, (DMN),      \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Run wakeup_send_echo() after one randomized period. */
#define SET_WAKEUP_ECHO(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_send_echo, (DMN),         \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)
552
553 static int wakeup_down(struct thread *t_wakeup)
554 {
555 struct daemon *dmn = THREAD_ARG(t_wakeup);
556
557 dmn->t_wakeup = NULL;
558 if (try_connect(dmn) < 0)
559 SET_WAKEUP_DOWN(dmn);
560 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
561 try_restart(dmn);
562 return 0;
563 }
564
565 static int wakeup_init(struct thread *t_wakeup)
566 {
567 struct daemon *dmn = THREAD_ARG(t_wakeup);
568
569 dmn->t_wakeup = NULL;
570 if (try_connect(dmn) < 0) {
571 zlog_info(
572 "%s state -> down : initial connection attempt failed",
573 dmn->name);
574 dmn->state = DAEMON_DOWN;
575 }
576 phase_check();
577 return 0;
578 }
579
580 static void restart_done(struct daemon *dmn)
581 {
582 if (dmn->state != DAEMON_DOWN) {
583 zlog_warn(
584 "Daemon: %s: is in %s state but expected it to be in DAEMON_DOWN state",
585 dmn->name, state_str[dmn->state]);
586 return;
587 }
588 if (dmn->t_wakeup)
589 THREAD_OFF(dmn->t_wakeup);
590 if (try_connect(dmn) < 0)
591 SET_WAKEUP_DOWN(dmn);
592 }
593
594 static void daemon_down(struct daemon *dmn, const char *why)
595 {
596 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
597 flog_err(EC_WATCHFRR_CONNECTION, "%s state -> down : %s",
598 dmn->name, why);
599 else if (gs.loglevel > LOG_DEBUG)
600 zlog_debug("%s still down : %s", dmn->name, why);
601 if (IS_UP(dmn))
602 gs.numdown++;
603 dmn->state = DAEMON_DOWN;
604 if (dmn->fd >= 0) {
605 close(dmn->fd);
606 dmn->fd = -1;
607 }
608 THREAD_OFF(dmn->t_read);
609 THREAD_OFF(dmn->t_write);
610 THREAD_OFF(dmn->t_wakeup);
611 if (try_connect(dmn) < 0)
612 SET_WAKEUP_DOWN(dmn);
613 phase_check();
614 }
615
616 static int handle_read(struct thread *t_read)
617 {
618 struct daemon *dmn = THREAD_ARG(t_read);
619 static const char resp[sizeof(PING_TOKEN) + 4] = PING_TOKEN "\n";
620 char buf[sizeof(resp) + 100];
621 ssize_t rc;
622 struct timeval delay;
623
624 dmn->t_read = NULL;
625 if ((rc = read(dmn->fd, buf, sizeof(buf))) < 0) {
626 char why[100];
627
628 if (ERRNO_IO_RETRY(errno)) {
629 /* Pretend it never happened. */
630 SET_READ_HANDLER(dmn);
631 return 0;
632 }
633 snprintf(why, sizeof(why), "unexpected read error: %s",
634 safe_strerror(errno));
635 daemon_down(dmn, why);
636 return 0;
637 }
638 if (rc == 0) {
639 daemon_down(dmn, "read returned EOF");
640 return 0;
641 }
642 if (!dmn->echo_sent.tv_sec) {
643 char why[sizeof(buf) + 100];
644 snprintf(why, sizeof(why),
645 "unexpected read returns %d bytes: %.*s", (int)rc,
646 (int)rc, buf);
647 daemon_down(dmn, why);
648 return 0;
649 }
650
651 /* We are expecting an echo response: is there any chance that the
652 response would not be returned entirely in the first read? That
653 seems inconceivable... */
654 if ((rc != sizeof(resp)) || memcmp(buf, resp, sizeof(resp))) {
655 char why[100 + sizeof(buf)];
656 snprintf(why, sizeof(why),
657 "read returned bad echo response of %d bytes (expecting %u): %.*s",
658 (int)rc, (unsigned int)sizeof(resp), (int)rc, buf);
659 daemon_down(dmn, why);
660 return 0;
661 }
662
663 time_elapsed(&delay, &dmn->echo_sent);
664 dmn->echo_sent.tv_sec = 0;
665 if (dmn->state == DAEMON_UNRESPONSIVE) {
666 if (delay.tv_sec < gs.timeout) {
667 dmn->state = DAEMON_UP;
668 zlog_warn(
669 "%s state -> up : echo response received after %ld.%06ld seconds",
670 dmn->name, (long)delay.tv_sec,
671 (long)delay.tv_usec);
672 } else
673 zlog_warn(
674 "%s: slow echo response finally received after %ld.%06ld seconds",
675 dmn->name, (long)delay.tv_sec,
676 (long)delay.tv_usec);
677 } else if (gs.loglevel > LOG_DEBUG + 1)
678 zlog_debug("%s: echo response received after %ld.%06ld seconds",
679 dmn->name, (long)delay.tv_sec, (long)delay.tv_usec);
680
681 SET_READ_HANDLER(dmn);
682 if (dmn->t_wakeup)
683 thread_cancel(dmn->t_wakeup);
684 SET_WAKEUP_ECHO(dmn);
685
686 return 0;
687 }
688
689 /*
690 * Wait till we notice that all daemons are ready before
691 * we send we are ready to systemd
692 */
693 static void daemon_send_ready(int exitcode)
694 {
695 FILE *fp;
696 static int sent = 0;
697 char started[1024];
698
699 if (sent)
700 return;
701
702 if (exitcode == 0)
703 zlog_notice("all daemons up, doing startup-complete notify");
704 else if (gs.numdown < gs.numdaemons)
705 flog_err(EC_WATCHFRR_CONNECTION,
706 "startup did not complete within timeout (%d/%d daemons running)",
707 gs.numdaemons - gs.numdown, gs.numdaemons);
708 else {
709 flog_err(EC_WATCHFRR_CONNECTION,
710 "all configured daemons failed to start -- exiting watchfrr");
711 exit(exitcode);
712
713 }
714
715 frr_detach();
716
717 snprintf(started, sizeof(started), "%s/%s", frr_vtydir,
718 "watchfrr.started");
719 fp = fopen(started, "w");
720 if (fp)
721 fclose(fp);
722 #if defined HAVE_SYSTEMD
723 systemd_send_started(master, 0);
724 systemd_send_status("FRR Operational");
725 #endif
726 sent = 1;
727 }
728
729 static void daemon_up(struct daemon *dmn, const char *why)
730 {
731 dmn->state = DAEMON_UP;
732 gs.numdown--;
733 dmn->connect_tries = 0;
734 zlog_notice("%s state -> up : %s", dmn->name, why);
735 if (gs.numdown == 0)
736 daemon_send_ready(0);
737 SET_WAKEUP_ECHO(dmn);
738 phase_check();
739 }
740
741 static int check_connect(struct thread *t_write)
742 {
743 struct daemon *dmn = THREAD_ARG(t_write);
744 int sockerr;
745 socklen_t reslen = sizeof(sockerr);
746
747 dmn->t_write = NULL;
748 if (getsockopt(dmn->fd, SOL_SOCKET, SO_ERROR, (char *)&sockerr, &reslen)
749 < 0) {
750 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn->name,
751 safe_strerror(errno));
752 daemon_down(dmn,
753 "getsockopt failed checking connection success");
754 return 0;
755 }
756 if ((reslen == sizeof(sockerr)) && sockerr) {
757 char why[100];
758 snprintf(
759 why, sizeof(why),
760 "getsockopt reports that connection attempt failed: %s",
761 safe_strerror(sockerr));
762 daemon_down(dmn, why);
763 return 0;
764 }
765
766 daemon_up(dmn, "delayed connect succeeded");
767 return 0;
768 }
769
770 static int wakeup_connect_hanging(struct thread *t_wakeup)
771 {
772 struct daemon *dmn = THREAD_ARG(t_wakeup);
773 char why[100];
774
775 dmn->t_wakeup = NULL;
776 snprintf(why, sizeof(why),
777 "connection attempt timed out after %ld seconds", gs.timeout);
778 daemon_down(dmn, why);
779 return 0;
780 }
781
782 /* Making connection to protocol daemon. */
783 static int try_connect(struct daemon *dmn)
784 {
785 int sock;
786 struct sockaddr_un addr;
787 socklen_t len;
788
789 if (gs.loglevel > LOG_DEBUG + 1)
790 zlog_debug("%s: attempting to connect", dmn->name);
791 dmn->connect_tries++;
792
793 memset(&addr, 0, sizeof(struct sockaddr_un));
794 addr.sun_family = AF_UNIX;
795 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty", gs.vtydir,
796 dmn->name);
797 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
798 len = addr.sun_len = SUN_LEN(&addr);
799 #else
800 len = sizeof(addr.sun_family) + strlen(addr.sun_path);
801 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
802
803 /* Quick check to see if we might succeed before we go to the trouble
804 of creating a socket. */
805 if (access(addr.sun_path, W_OK) < 0) {
806 if (errno != ENOENT)
807 flog_err_sys(EC_LIB_SYSTEM_CALL,
808 "%s: access to socket %s denied: %s",
809 dmn->name, addr.sun_path,
810 safe_strerror(errno));
811 return -1;
812 }
813
814 if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
815 flog_err_sys(EC_LIB_SOCKET, "%s(%s): cannot make socket: %s",
816 __func__, addr.sun_path, safe_strerror(errno));
817 return -1;
818 }
819
820 if (set_nonblocking(sock) < 0 || set_cloexec(sock) < 0) {
821 flog_err_sys(EC_LIB_SYSTEM_CALL,
822 "%s(%s): set_nonblocking/cloexec(%d) failed",
823 __func__, addr.sun_path, sock);
824 close(sock);
825 return -1;
826 }
827
828 if (connect(sock, (struct sockaddr *)&addr, len) < 0) {
829 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK)) {
830 if (gs.loglevel > LOG_DEBUG)
831 zlog_debug("%s(%s): connect failed: %s",
832 __func__, addr.sun_path,
833 safe_strerror(errno));
834 close(sock);
835 return -1;
836 }
837 if (gs.loglevel > LOG_DEBUG)
838 zlog_debug("%s: connection in progress", dmn->name);
839 dmn->state = DAEMON_CONNECTING;
840 dmn->fd = sock;
841 dmn->t_write = NULL;
842 thread_add_write(master, check_connect, dmn, dmn->fd,
843 &dmn->t_write);
844 dmn->t_wakeup = NULL;
845 thread_add_timer(master, wakeup_connect_hanging, dmn,
846 gs.timeout, &dmn->t_wakeup);
847 SET_READ_HANDLER(dmn);
848 return 0;
849 }
850
851 dmn->fd = sock;
852 SET_READ_HANDLER(dmn);
853 daemon_up(dmn, "connect succeeded");
854 return 1;
855 }
856
857 static int phase_hanging(struct thread *t_hanging)
858 {
859 gs.t_phase_hanging = NULL;
860 flog_err(EC_WATCHFRR_CONNECTION,
861 "Phase [%s] hanging for %ld seconds, aborting phased restart",
862 phase_str[gs.phase], PHASE_TIMEOUT);
863 gs.phase = PHASE_NONE;
864 return 0;
865 }
866
867 static void set_phase(restart_phase_t new_phase)
868 {
869 gs.phase = new_phase;
870 if (gs.t_phase_hanging)
871 thread_cancel(gs.t_phase_hanging);
872 gs.t_phase_hanging = NULL;
873 thread_add_timer(master, phase_hanging, NULL, PHASE_TIMEOUT,
874 &gs.t_phase_hanging);
875 }
876
877 static void phase_check(void)
878 {
879 struct daemon *dmn;
880
881 switch (gs.phase) {
882 case PHASE_NONE:
883 break;
884
885 case PHASE_INIT:
886 for (dmn = gs.daemons; dmn; dmn = dmn->next)
887 if (dmn->state == DAEMON_INIT)
888 return;
889
890 /* startup complete, everything out of INIT */
891 gs.phase = PHASE_NONE;
892 for (dmn = gs.daemons; dmn; dmn = dmn->next)
893 if (dmn->state == DAEMON_DOWN) {
894 SET_WAKEUP_DOWN(dmn);
895 try_restart(dmn);
896 }
897 break;
898 case PHASE_STOPS_PENDING:
899 if (gs.numpids)
900 break;
901 zlog_info(
902 "Phased restart: all routing daemon stop jobs have completed.");
903 set_phase(PHASE_WAITING_DOWN);
904
905 /*FALLTHRU*/
906 case PHASE_WAITING_DOWN:
907 if (gs.numdown + IS_UP(gs.special) < gs.numdaemons)
908 break;
909 zlog_info("Phased restart: all routing daemons now down.");
910 run_job(&gs.special->restart, "restart", gs.restart_command, 1,
911 1);
912 set_phase(PHASE_ZEBRA_RESTART_PENDING);
913
914 /*FALLTHRU*/
915 case PHASE_ZEBRA_RESTART_PENDING:
916 if (gs.special->restart.pid)
917 break;
918 zlog_info("Phased restart: %s restart job completed.",
919 gs.special->name);
920 set_phase(PHASE_WAITING_ZEBRA_UP);
921
922 /*FALLTHRU*/
923 case PHASE_WAITING_ZEBRA_UP:
924 if (!IS_UP(gs.special))
925 break;
926 zlog_info("Phased restart: %s is now up.", gs.special->name);
927 {
928 struct daemon *dmn;
929 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
930 if (dmn != gs.special)
931 run_job(&dmn->restart, "start",
932 gs.start_command, 1, 0);
933 }
934 }
935 gs.phase = PHASE_NONE;
936 THREAD_OFF(gs.t_phase_hanging);
937 zlog_notice("Phased global restart has completed.");
938 break;
939 }
940 }
941
942 static void try_restart(struct daemon *dmn)
943 {
944 if (watch_only)
945 return;
946
947 if (dmn != gs.special) {
948 if ((gs.special->state == DAEMON_UP)
949 && (gs.phase == PHASE_NONE))
950 run_job(&dmn->restart, "restart", gs.restart_command, 0,
951 1);
952 else
953 zlog_debug(
954 "%s: postponing restart attempt because master %s daemon not up [%s], or phased restart in progress",
955 dmn->name, gs.special->name,
956 state_str[gs.special->state]);
957 return;
958 }
959
960 if ((gs.phase != PHASE_NONE) || gs.numpids) {
961 if (gs.loglevel > LOG_DEBUG + 1)
962 zlog_debug(
963 "postponing phased global restart: restart already in progress [%s], or outstanding child processes [%d]",
964 phase_str[gs.phase], gs.numpids);
965 return;
966 }
967 /* Is it too soon for a restart? */
968 {
969 struct timeval delay;
970 if (time_elapsed(&delay, &gs.special->restart.time)->tv_sec
971 < gs.special->restart.interval) {
972 if (gs.loglevel > LOG_DEBUG + 1)
973 zlog_debug(
974 "postponing phased global restart: elapsed time %ld < retry interval %ld",
975 (long)delay.tv_sec,
976 gs.special->restart.interval);
977 return;
978 }
979 }
980 run_job(&gs.restart, "restart", gs.restart_command, 0, 1);
981 }
982
983 static int wakeup_unresponsive(struct thread *t_wakeup)
984 {
985 struct daemon *dmn = THREAD_ARG(t_wakeup);
986
987 dmn->t_wakeup = NULL;
988 if (dmn->state != DAEMON_UNRESPONSIVE)
989 flog_err(EC_WATCHFRR_CONNECTION,
990 "%s: no longer unresponsive (now %s), wakeup should have been cancelled!",
991 dmn->name, state_str[dmn->state]);
992 else {
993 SET_WAKEUP_UNRESPONSIVE(dmn);
994 try_restart(dmn);
995 }
996 return 0;
997 }
998
999 static int wakeup_no_answer(struct thread *t_wakeup)
1000 {
1001 struct daemon *dmn = THREAD_ARG(t_wakeup);
1002
1003 dmn->t_wakeup = NULL;
1004 dmn->state = DAEMON_UNRESPONSIVE;
1005 if (dmn->ignore_timeout)
1006 return 0;
1007 flog_err(EC_WATCHFRR_CONNECTION,
1008 "%s state -> unresponsive : no response yet to ping sent %ld seconds ago",
1009 dmn->name, gs.timeout);
1010 SET_WAKEUP_UNRESPONSIVE(dmn);
1011 try_restart(dmn);
1012 return 0;
1013 }
1014
1015 static int wakeup_send_echo(struct thread *t_wakeup)
1016 {
1017 static const char echocmd[] = "echo " PING_TOKEN;
1018 ssize_t rc;
1019 struct daemon *dmn = THREAD_ARG(t_wakeup);
1020
1021 dmn->t_wakeup = NULL;
1022 if (((rc = write(dmn->fd, echocmd, sizeof(echocmd))) < 0)
1023 || ((size_t)rc != sizeof(echocmd))) {
1024 char why[100 + sizeof(echocmd)];
1025 snprintf(why, sizeof(why),
1026 "write '%s' returned %d instead of %u", echocmd,
1027 (int)rc, (unsigned int)sizeof(echocmd));
1028 daemon_down(dmn, why);
1029 } else {
1030 gettimeofday(&dmn->echo_sent, NULL);
1031 dmn->t_wakeup = NULL;
1032 thread_add_timer(master, wakeup_no_answer, dmn, gs.timeout,
1033 &dmn->t_wakeup);
1034 }
1035 return 0;
1036 }
1037
1038 bool check_all_up(void)
1039 {
1040 struct daemon *dmn;
1041
1042 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1043 if (dmn->state != DAEMON_UP)
1044 return false;
1045 return true;
1046 }
1047
1048 void watchfrr_status(struct vty *vty)
1049 {
1050 struct daemon *dmn;
1051 struct timeval delay;
1052
1053 vty_out(vty, "watchfrr global phase: %s\n", phase_str[gs.phase]);
1054 if (gs.restart.pid)
1055 vty_out(vty, " global restart running, pid %ld\n",
1056 (long)gs.restart.pid);
1057
1058 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1059 vty_out(vty, " %-20s %s%s", dmn->name, state_str[dmn->state],
1060 dmn->ignore_timeout ? "/Ignoring Timeout\n" : "\n");
1061 if (dmn->restart.pid)
1062 vty_out(vty, " restart running, pid %ld\n",
1063 (long)dmn->restart.pid);
1064 else if (dmn->state == DAEMON_DOWN &&
1065 time_elapsed(&delay, &dmn->restart.time)->tv_sec
1066 < dmn->restart.interval)
1067 vty_out(vty, " restarting in %jd seconds (%jds backoff interval)\n",
1068 (intmax_t)dmn->restart.interval
1069 - (intmax_t)delay.tv_sec,
1070 (intmax_t)dmn->restart.interval);
1071 }
1072 }
1073
/*
 * Termination handler, registered in watchfrr_signals[] for both SIGINT
 * and SIGTERM.  Logs a notice, calls systemd_send_stopping() and exits
 * the process with status 0.  Does not return.
 */
static void sigint(void)
{
	zlog_notice("Terminating on signal");
	systemd_send_stopping();

	exit(0);
}
1080
/*
 * Check that a command template contains exactly one '%' character and
 * that this character is immediately followed by 's'.
 *
 * cmd: NUL-terminated command template.
 * Returns 1 when the template is acceptable, 0 otherwise.
 */
static int valid_command(const char *cmd)
{
	const char *pct = strchr(cmd, '%');

	/* No conversion at all. */
	if (pct == NULL)
		return 0;

	/* The first conversion must be "%s". */
	if (pct[1] != 's')
		return 0;

	/* No further '%' may follow. */
	return strchr(pct + 1, '%') == NULL;
}
1088
/* This is an ugly hack to circumvent problems with passing command-line
 * arguments that contain spaces.  The fix is to use a configuration file.
 *
 * Returns a newly allocated copy of cmd in which every occurrence of
 * blankstr has been replaced by a single space.  The caller owns the
 * returned string.  If the copy cannot be allocated the process exits
 * with status 1.
 *
 * cmd:      NUL-terminated command string to translate.
 * blankstr: NUL-terminated marker that stands for a blank.  An empty
 *           marker leaves the command unchanged.
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	char *res;
	char *p;
	size_t bslen = strlen(blankstr);

	res = strdup(cmd);
	if (!res) {
		perror("strdup");
		exit(1);
	}

	/* strstr() matches an empty needle at every position, which would
	 * never terminate and would shift the string past its allocation.
	 */
	if (bslen == 0)
		return res;

	p = res;
	while ((p = strstr(p, blankstr)) != NULL) {
		*p = ' ';
		if (bslen != 1)
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
		/* Resume after the blank just written: rescanning from the
		 * start would find it again when the marker itself is " ".
		 */
		p++;
	}
	return res;
}
1108
/*
 * Timer callback armed by watchfrr_init() with a delay of STARTUP_TIMEOUT
 * milliseconds.  It calls daemon_send_ready(1) and returns 0.
 *
 * t_wakeup: the expired timer (not used).
 */
static int startup_timeout(struct thread *t_wakeup)
{
	(void)t_wakeup;

	daemon_send_ready(1);

	return 0;
}
1114
1115 #ifdef GNU_LINUX
1116
1117 #include <sys/mount.h>
1118 #include <sched.h>
1119
1120 #define NETNS_RUN_DIR "/var/run/netns"
1121
1122 static void netns_create(int dirfd, const char *nsname)
1123 {
1124 /* make /var/run/netns shared between mount namespaces
1125 * just like iproute2 sets it up
1126 */
1127 if (mount("", NETNS_RUN_DIR, "none", MS_SHARED | MS_REC, NULL)) {
1128 if (errno != EINVAL) {
1129 perror("mount");
1130 exit(1);
1131 }
1132
1133 if (mount(NETNS_RUN_DIR, NETNS_RUN_DIR, "none",
1134 MS_BIND | MS_REC, NULL)) {
1135 perror("mount");
1136 exit(1);
1137 }
1138
1139 if (mount("", NETNS_RUN_DIR, "none", MS_SHARED | MS_REC,
1140 NULL)) {
1141 perror("mount");
1142 exit(1);
1143 }
1144 }
1145
1146 /* need an empty file to mount on top of */
1147 int nsfd = openat(dirfd, nsname, O_CREAT | O_RDONLY | O_EXCL, 0);
1148
1149 if (nsfd < 0) {
1150 fprintf(stderr, "failed to create \"%s/%s\": %s\n",
1151 NETNS_RUN_DIR, nsname, strerror(errno));
1152 exit(1);
1153 }
1154 close(nsfd);
1155
1156 if (unshare(CLONE_NEWNET)) {
1157 perror("unshare");
1158 unlinkat(dirfd, nsname, 0);
1159 exit(1);
1160 }
1161
1162 char *dstpath = asprintfrr(MTYPE_TMP, "%s/%s", NETNS_RUN_DIR, nsname);
1163
1164 /* bind-mount so the namespace has a name and is persistent */
1165 if (mount("/proc/self/ns/net", dstpath, "none", MS_BIND, NULL) < 0) {
1166 fprintf(stderr, "failed to bind-mount netns to \"%s\": %s\n",
1167 dstpath, strerror(errno));
1168 unlinkat(dirfd, nsname, 0);
1169 exit(1);
1170 }
1171
1172 XFREE(MTYPE_TMP, dstpath);
1173 }
1174
1175 static void netns_setup(const char *nsname)
1176 {
1177 int dirfd, nsfd;
1178
1179 dirfd = open(NETNS_RUN_DIR, O_DIRECTORY | O_RDONLY);
1180 if (dirfd < 0) {
1181 if (errno == ENOTDIR) {
1182 fprintf(stderr, "error: \"%s\" is not a directory!\n",
1183 NETNS_RUN_DIR);
1184 exit(1);
1185 } else if (errno == ENOENT) {
1186 if (mkdir(NETNS_RUN_DIR, 0755)) {
1187 fprintf(stderr, "error: \"%s\": mkdir: %s\n",
1188 NETNS_RUN_DIR, strerror(errno));
1189 exit(1);
1190 }
1191 dirfd = open(NETNS_RUN_DIR, O_DIRECTORY | O_RDONLY);
1192 if (dirfd < 0) {
1193 fprintf(stderr, "error: \"%s\": opendir: %s\n",
1194 NETNS_RUN_DIR, strerror(errno));
1195 exit(1);
1196 }
1197 } else {
1198 fprintf(stderr, "error: \"%s\": %s\n",
1199 NETNS_RUN_DIR, strerror(errno));
1200 exit(1);
1201 }
1202 }
1203
1204 nsfd = openat(dirfd, nsname, O_RDONLY);
1205 if (nsfd < 0 && errno != ENOENT) {
1206 fprintf(stderr, "error: \"%s/%s\": %s\n",
1207 NETNS_RUN_DIR, nsname, strerror(errno));
1208 exit(1);
1209 }
1210 if (nsfd < 0)
1211 netns_create(dirfd, nsname);
1212 else {
1213 if (setns(nsfd, CLONE_NEWNET)) {
1214 perror("setns");
1215 exit(1);
1216 }
1217 close(nsfd);
1218 }
1219 close(dirfd);
1220
1221 /* make sure loopback is up... weird things happen otherwise.
1222 * ioctl is perfectly fine for this, don't need netlink...
1223 */
1224 int sockfd;
1225 struct ifreq ifr = { };
1226
1227 strlcpy(ifr.ifr_name, "lo", sizeof(ifr.ifr_name));
1228
1229 sockfd = socket(AF_INET, SOCK_DGRAM, 0);
1230 if (sockfd < 0) {
1231 perror("socket");
1232 exit(1);
1233 }
1234 if (ioctl(sockfd, SIOCGIFFLAGS, &ifr)) {
1235 perror("ioctl(SIOCGIFFLAGS, \"lo\")");
1236 exit(1);
1237 }
1238 if (!(ifr.ifr_flags & IFF_UP)) {
1239 ifr.ifr_flags |= IFF_UP;
1240 if (ioctl(sockfd, SIOCSIFFLAGS, &ifr)) {
1241 perror("ioctl(SIOCSIFFLAGS, \"lo\")");
1242 exit(1);
1243 }
1244 }
1245 close(sockfd);
1246 }
1247
1248 #else /* !GNU_LINUX */
1249
/*
 * Stand-in used when GNU_LINUX is not defined: network namespaces cannot
 * be set up, so report that on stderr and exit with status 1.
 *
 * nsname: requested namespace name (not used).
 */
static void netns_setup(const char *nsname)
{
	(void)nsname;

	fputs("network namespaces are only available on Linux\n", stderr);
	exit(1);
}
1255 #endif
1256
1257 static void watchfrr_init(int argc, char **argv)
1258 {
1259 const char *special = "zebra";
1260 int i;
1261 struct daemon *dmn, **add = &gs.daemons;
1262 char alldaemons[512] = "", *p = alldaemons;
1263
1264 thread_add_timer_msec(master, startup_timeout, NULL, STARTUP_TIMEOUT,
1265 &gs.t_startup_timeout);
1266
1267 for (i = optind; i < argc; i++) {
1268 dmn = XCALLOC(MTYPE_WATCHFRR_DAEMON, sizeof(*dmn));
1269
1270 dmn->name = dmn->restart.name = argv[i];
1271 dmn->state = DAEMON_INIT;
1272 gs.numdaemons++;
1273 gs.numdown++;
1274 dmn->fd = -1;
1275 dmn->t_wakeup = NULL;
1276 thread_add_timer_msec(master, wakeup_init, dmn, 0,
1277 &dmn->t_wakeup);
1278 dmn->restart.interval = gs.min_restart_interval;
1279 *add = dmn;
1280 add = &dmn->next;
1281
1282 if (!strcmp(dmn->name, special))
1283 gs.special = dmn;
1284 }
1285
1286 if (!gs.daemons) {
1287 fprintf(stderr,
1288 "Must specify one or more daemons to monitor.\n\n");
1289 frr_help_exit(1);
1290 }
1291 if (!watch_only && !gs.special) {
1292 fprintf(stderr, "\"%s\" daemon must be in daemon lists\n\n",
1293 special);
1294 frr_help_exit(1);
1295 }
1296
1297 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1298 snprintf(p, alldaemons + sizeof(alldaemons) - p, "%s%s",
1299 (p == alldaemons) ? "" : " ", dmn->name);
1300 p += strlen(p);
1301 }
1302 zlog_notice("%s %s watching [%s]%s", progname, FRR_VERSION, alldaemons,
1303 watch_only ? ", monitor mode" : "");
1304 }
1305
1306 struct zebra_privs_t watchfrr_privs = {
1307 #ifdef VTY_GROUP
1308 .vty_group = VTY_GROUP,
1309 #endif
1310 };
1311
1312 static struct quagga_signal_t watchfrr_signals[] = {
1313 {
1314 .signal = SIGINT,
1315 .handler = sigint,
1316 },
1317 {
1318 .signal = SIGTERM,
1319 .handler = sigint,
1320 },
1321 {
1322 .signal = SIGCHLD,
1323 .handler = sigchild,
1324 },
1325 };
1326
1327 FRR_DAEMON_INFO(watchfrr, WATCHFRR,
1328 .flags = FRR_NO_PRIVSEP | FRR_NO_TCPVTY | FRR_LIMITED_CLI
1329 | FRR_NO_CFG_PID_DRY | FRR_NO_ZCLIENT
1330 | FRR_DETACH_LATER,
1331
1332 .printhelp = printhelp,
1333 .copyright = "Copyright 2004 Andrew J. Schorr",
1334
1335 .signals = watchfrr_signals,
1336 .n_signals = array_size(watchfrr_signals),
1337
1338 .privs = &watchfrr_privs, )
1339
1340 #define DEPRECATED_OPTIONS "aAezR:"
1341
1342 int main(int argc, char **argv)
1343 {
1344 int opt;
1345 const char *blankstr = NULL;
1346 const char *netns = NULL;
1347 bool netns_en = false;
1348
1349 frr_preinit(&watchfrr_di, argc, argv);
1350 progname = watchfrr_di.progname;
1351
1352 frr_opt_add("b:di:k:l:N:p:r:S:s:t:T:" DEPRECATED_OPTIONS, longopts, "");
1353
1354 gs.restart.name = "all";
1355 while ((opt = frr_getopt(argc, argv, NULL)) != EOF) {
1356 if (opt && opt < 128 && strchr(DEPRECATED_OPTIONS, opt)) {
1357 fprintf(stderr,
1358 "The -%c option no longer exists.\n"
1359 "Please refer to the watchfrr(8) man page.\n",
1360 opt);
1361 exit(1);
1362 }
1363
1364 switch (opt) {
1365 case 0:
1366 break;
1367 case 'b':
1368 blankstr = optarg;
1369 break;
1370 case OPTION_DRY:
1371 watch_only = true;
1372 break;
1373 case 'k':
1374 if (!valid_command(optarg)) {
1375 fprintf(stderr,
1376 "Invalid kill command, must contain '%%s': %s\n",
1377 optarg);
1378 frr_help_exit(1);
1379 }
1380 gs.stop_command = optarg;
1381 break;
1382 case 'l': {
1383 char garbage[3];
1384 if ((sscanf(optarg, "%d%1s", &gs.loglevel, garbage)
1385 != 1)
1386 || (gs.loglevel < LOG_EMERG)) {
1387 fprintf(stderr,
1388 "Invalid loglevel argument: %s\n",
1389 optarg);
1390 frr_help_exit(1);
1391 }
1392 } break;
1393 case OPTION_MINRESTART: {
1394 char garbage[3];
1395 if ((sscanf(optarg, "%ld%1s", &gs.min_restart_interval,
1396 garbage)
1397 != 1)
1398 || (gs.min_restart_interval < 0)) {
1399 fprintf(stderr,
1400 "Invalid min_restart_interval argument: %s\n",
1401 optarg);
1402 frr_help_exit(1);
1403 }
1404 } break;
1405 case OPTION_MAXRESTART: {
1406 char garbage[3];
1407 if ((sscanf(optarg, "%ld%1s", &gs.max_restart_interval,
1408 garbage)
1409 != 1)
1410 || (gs.max_restart_interval < 0)) {
1411 fprintf(stderr,
1412 "Invalid max_restart_interval argument: %s\n",
1413 optarg);
1414 frr_help_exit(1);
1415 }
1416 } break;
1417 case OPTION_NETNS:
1418 netns_en = true;
1419 if (strchr(optarg, '/')) {
1420 fprintf(stderr,
1421 "invalid network namespace name \"%s\" (may not contain slashes)\n",
1422 optarg);
1423 frr_help_exit(1);
1424 }
1425 netns = optarg;
1426 break;
1427 case 'i': {
1428 char garbage[3];
1429 int period;
1430 if ((sscanf(optarg, "%d%1s", &period, garbage) != 1)
1431 || (gs.period < 1)) {
1432 fprintf(stderr,
1433 "Invalid interval argument: %s\n",
1434 optarg);
1435 frr_help_exit(1);
1436 }
1437 gs.period = 1000 * period;
1438 } break;
1439 case 'p':
1440 watchfrr_di.pid_file = optarg;
1441 break;
1442 case 'r':
1443 if (!valid_command(optarg)) {
1444 fprintf(stderr,
1445 "Invalid restart command, must contain '%%s': %s\n",
1446 optarg);
1447 frr_help_exit(1);
1448 }
1449 gs.restart_command = optarg;
1450 break;
1451 case 's':
1452 if (!valid_command(optarg)) {
1453 fprintf(stderr,
1454 "Invalid start command, must contain '%%s': %s\n",
1455 optarg);
1456 frr_help_exit(1);
1457 }
1458 gs.start_command = optarg;
1459 break;
1460 case 'S':
1461 gs.vtydir = optarg;
1462 break;
1463 case 't': {
1464 char garbage[3];
1465 if ((sscanf(optarg, "%ld%1s", &gs.timeout, garbage)
1466 != 1)
1467 || (gs.timeout < 1)) {
1468 fprintf(stderr,
1469 "Invalid timeout argument: %s\n",
1470 optarg);
1471 frr_help_exit(1);
1472 }
1473 } break;
1474 case 'T': {
1475 char garbage[3];
1476 if ((sscanf(optarg, "%ld%1s", &gs.restart_timeout,
1477 garbage)
1478 != 1)
1479 || (gs.restart_timeout < 1)) {
1480 fprintf(stderr,
1481 "Invalid restart timeout argument: %s\n",
1482 optarg);
1483 frr_help_exit(1);
1484 }
1485 } break;
1486 default:
1487 fputs("Invalid option.\n", stderr);
1488 frr_help_exit(1);
1489 }
1490 }
1491
1492 if (watch_only
1493 && (gs.start_command || gs.stop_command || gs.restart_command)) {
1494 fputs("Options -r/-s/-k are not used when --dry is active.\n",
1495 stderr);
1496 }
1497 if (!watch_only
1498 && (!gs.restart_command || !gs.start_command || !gs.stop_command)) {
1499 fprintf(stderr,
1500 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1501 frr_help_exit(1);
1502 }
1503
1504 if (blankstr) {
1505 if (gs.restart_command)
1506 gs.restart_command =
1507 translate_blanks(gs.restart_command, blankstr);
1508 if (gs.start_command)
1509 gs.start_command =
1510 translate_blanks(gs.start_command, blankstr);
1511 if (gs.stop_command)
1512 gs.stop_command =
1513 translate_blanks(gs.stop_command, blankstr);
1514 }
1515
1516 gs.restart.interval = gs.min_restart_interval;
1517
1518 /* env variable for the processes that we start */
1519 if (watchfrr_di.pathspace)
1520 setenv("FRR_PATHSPACE", watchfrr_di.pathspace, 1);
1521 else
1522 unsetenv("FRR_PATHSPACE");
1523
1524 if (netns_en && !netns)
1525 netns = watchfrr_di.pathspace;
1526 if (netns_en && netns && netns[0])
1527 netns_setup(netns);
1528
1529 master = frr_init();
1530 watchfrr_error_init();
1531 watchfrr_init(argc, argv);
1532 watchfrr_vty_init();
1533
1534 frr_config_fork();
1535
1536 if (watchfrr_di.daemon_mode)
1537 zlog_syslog_set_prio_min(MIN(gs.loglevel, LOG_DEBUG));
1538 else
1539 zlog_aux_init(NULL, MIN(gs.loglevel, LOG_DEBUG));
1540
1541 frr_run(master);
1542
1543 systemd_send_stopping();
1544 /* Not reached. */
1545 return 0;
1546 }