]> git.proxmox.com Git - mirror_frr.git/blob - watchfrr/watchfrr.c
Merge pull request #6016 from sarav511/ppend
[mirror_frr.git] / watchfrr / watchfrr.c
1 /*
2 * Monitor status of frr daemons and restart if necessary.
3 *
4 * Copyright (C) 2004 Andrew J. Schorr
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include <zebra.h>
22 #include <thread.h>
23 #include <log.h>
24 #include <network.h>
25 #include <sigevent.h>
26 #include <lib/version.h>
27 #include "command.h"
28 #include "libfrr.h"
29 #include "lib_errors.h"
30 #include "zlog_targets.h"
31 #include "network.h"
32 #include "printfrr.h"
33
34 #include <getopt.h>
35 #include <sys/un.h>
36 #include <sys/wait.h>
37 #include <memory.h>
38 #include <systemd.h>
39
40 #include "watchfrr.h"
41 #include "watchfrr_errors.h"
42
/* Fallback two-argument minimum, used only when no MIN is already defined.
 * Each argument may be evaluated twice, so avoid side effects. */
#ifndef MIN
#define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
#endif

/* Macros to help randomize timers. */
/* JITTER(X): random offset; assuming frr_weak_random() is non-negative the
 * result lies in [-(X)/2, (X) - (X)/2]. */
#define JITTER(X) ((frr_weak_random() % ((X)+1))-((X)/2))
/* FUZZY(X): X plus a jitter drawn from one twentieth of X. */
#define FUZZY(X) ((X)+JITTER((X)/20))

/* Built-in defaults; the help text reports all of these in seconds,
 * except DEFAULT_LOGLEVEL which is a syslog priority. */
#define DEFAULT_PERIOD 5
#define DEFAULT_TIMEOUT 90
#define DEFAULT_RESTART_TIMEOUT 20
#define DEFAULT_LOGLEVEL LOG_INFO
#define DEFAULT_MIN_RESTART 60
#define DEFAULT_MAX_RESTART 600

/* Default shell command templates; "%s" is replaced by the daemon name. */
#define DEFAULT_RESTART_CMD WATCHFRR_SH_PATH " restart %s"
#define DEFAULT_START_CMD WATCHFRR_SH_PATH " start %s"
#define DEFAULT_STOP_CMD WATCHFRR_SH_PATH " stop %s"

/* Token sent with the vty "echo" command and expected back in the reply. */
#define PING_TOKEN "PING"
63
/* Memory group and memory type declarations for watchfrr allocations. */
DEFINE_MGROUP(WATCHFRR, "watchfrr")
DEFINE_MTYPE_STATIC(WATCHFRR, WATCHFRR_DAEMON, "watchfrr daemon entry")

/* Needs to be global, referenced somewhere inside libfrr. */
struct thread_master *master;

/* When true, try_restart() returns without running any restart job. */
static bool watch_only = false;
71
/*
 * Phases of the global state machine driven by phase_check().
 * The enumerator values index phase_str[] below, so the two must be kept
 * in the same order and have the same number of entries.
 */
typedef enum {
	PHASE_NONE = 0,
	PHASE_INIT,
	PHASE_STOPS_PENDING,
	PHASE_WAITING_DOWN,
	PHASE_ZEBRA_RESTART_PENDING,
	PHASE_WAITING_ZEBRA_UP
} restart_phase_t;

/* Human-readable phase names, indexed by restart_phase_t. */
static const char *const phase_str[] = {
	"Idle",
	"Startup",
	"Stop jobs running",
	"Waiting for other daemons to come down",
	"Zebra restart job running",
	"Waiting for zebra to come up",
};
90
/* Seconds a restart phase may last before phase_hanging() aborts it. */
#define PHASE_TIMEOUT (3 * gs.restart_timeout)
/* Startup deadline (presumably milliseconds -- confirm against the caller).
 * Parenthesized so the expansion is safe inside larger expressions. */
#define STARTUP_TIMEOUT (55 * 1000)
93
/* Bookkeeping for one background shell job (start/stop/restart command). */
struct restart_info {
	const char *name; /* daemon name substituted into the command */
	const char *what; /* command type of the last forked job (run_job's cmdtype) */
	pid_t pid; /* pid of the running job, 0 when none is running */
	struct timeval time; /* set when a job is launched and again when it is reaped */
	long interval; /* minimum seconds between unforced jobs */
	struct thread *t_kill; /* timer that runs restart_kill() */
	int kills; /* signals sent so far: 0 -> SIGTERM next, otherwise SIGKILL */
};
103
/* Process-wide watchfrr state, initialised with the built-in defaults. */
static struct global_state {
	restart_phase_t phase; /* current phase, see phase_check() */
	struct thread *t_phase_hanging; /* timer that runs phase_hanging() */
	struct thread *t_startup_timeout;
	const char *vtydir; /* directory holding the "<daemon>.vty" sockets */
	long period; /* polling period, passed to thread_add_timer_msec() */
	long timeout; /* echo / connect timeout, logged in seconds */
	long restart_timeout; /* seconds before restart_kill() signals a job */
	long min_restart_interval;
	long max_restart_interval;
	struct daemon *daemons; /* head of the singly linked daemon list */
	const char *restart_command;
	const char *start_command;
	const char *stop_command;
	struct restart_info restart; /* job state of the global restart */
	int loglevel;
	struct daemon *special; /* points to zebra when doing phased restart */
	int numdaemons;
	int numpids; /* background jobs forked by run_job() and not yet reaped */
	int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
} gs = {
	.phase = PHASE_INIT,
	.vtydir = frr_vtydir,
	.period = 1000 * DEFAULT_PERIOD,
	.timeout = DEFAULT_TIMEOUT,
	.restart_timeout = DEFAULT_RESTART_TIMEOUT,
	.loglevel = DEFAULT_LOGLEVEL,
	.min_restart_interval = DEFAULT_MIN_RESTART,
	.max_restart_interval = DEFAULT_MAX_RESTART,
	.restart_command = DEFAULT_RESTART_CMD,
	.start_command = DEFAULT_START_CMD,
	.stop_command = DEFAULT_STOP_CMD,
};
137
/* Connection state of one monitored daemon; values index state_str[]. */
typedef enum {
	DAEMON_INIT,
	DAEMON_DOWN,
	DAEMON_CONNECTING,
	DAEMON_UP,
	DAEMON_UNRESPONSIVE
} daemon_state_t;

/* True when the daemon is connected, whether or not it answers echoes. */
#define IS_UP(DMN) \
	(((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))

/* Human-readable state names, indexed by daemon_state_t. */
static const char *const state_str[] = {
	"Init", "Down", "Connecting", "Up", "Unresponsive",
};
152
/* One monitored daemon; entries are chained through 'next' from gs.daemons. */
struct daemon {
	const char *name; /* also used to build the "<name>.vty" socket path */
	daemon_state_t state;
	int fd; /* vty socket, -1 after daemon_down() closed it */
	struct timeval echo_sent; /* time of the outstanding echo; tv_sec == 0 when none */
	unsigned int connect_tries; /* bumped by try_connect(), reset by daemon_up() */
	struct thread *t_wakeup; /* the single pending timer for this daemon */
	struct thread *t_read; /* read handler on fd */
	struct thread *t_write; /* write handler used to finish a pending connect */
	struct daemon *next;
	struct restart_info restart; /* per-daemon background job state */

	/*
	 * For a given daemon, if we've turned on ignore timeouts
	 * ignore the timeout value and assume everything is ok
	 * This is for daemon debugging w/ gdb after we have started
	 * FRR and realize we have something that needs to be looked
	 * at
	 */
	bool ignore_timeout;
};
174
/* getopt return values for long options that have no short form. */
#define OPTION_MINRESTART 2000
#define OPTION_MAXRESTART 2001
#define OPTION_DRY 2002
#define OPTION_NETNS 2003

/* Long command-line options; the table ends with an all-zero entry. */
static const struct option longopts[] = {
	{"daemon", no_argument, NULL, 'd'},
	{"statedir", required_argument, NULL, 'S'},
	{"loglevel", required_argument, NULL, 'l'},
	{"interval", required_argument, NULL, 'i'},
	{"timeout", required_argument, NULL, 't'},
	{"restart-timeout", required_argument, NULL, 'T'},
	{"restart", required_argument, NULL, 'r'},
	{"start-command", required_argument, NULL, 's'},
	{"kill-command", required_argument, NULL, 'k'},
	{"dry", no_argument, NULL, OPTION_DRY},
	{"min-restart-interval", required_argument, NULL, OPTION_MINRESTART},
	{"max-restart-interval", required_argument, NULL, OPTION_MAXRESTART},
	{"pid-file", required_argument, NULL, 'p'},
	{"blank-string", required_argument, NULL, 'b'},
#ifdef GNU_LINUX
	/* --netns takes an optional namespace name */
	{"netns", optional_argument, NULL, OPTION_NETNS},
#endif
	{"help", no_argument, NULL, 'h'},
	{"version", no_argument, NULL, 'v'},
	{NULL, 0, NULL, 0}};
201
/* Forward declarations for functions that are used before their definition. */
static int try_connect(struct daemon *dmn);
static int wakeup_send_echo(struct thread *t_wakeup);
static void try_restart(struct daemon *dmn);
static void phase_check(void);
static void restart_done(struct daemon *dmn);

/* Program name printed in the usage text. */
static const char *progname;
209
210 void watchfrr_set_ignore_daemon(struct vty *vty, const char *dname, bool ignore)
211 {
212 struct daemon *dmn;
213
214 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
215 if (strncmp(dmn->name, dname, strlen(dmn->name)) == 0)
216 break;
217 }
218
219 if (dmn) {
220 dmn->ignore_timeout = ignore;
221 vty_out(vty, "%s switching to %s\n", dmn->name,
222 ignore ? "ignore" : "watch");
223 } else
224 vty_out(vty, "%s is not configured for running at the moment",
225 dname);
226 }
227
228 static void printhelp(FILE *target)
229 {
230 fprintf(target,
231 "Usage : %s [OPTION...] <daemon name> ...\n\n\
232 Watchdog program to monitor status of frr daemons and try to restart\n\
233 them if they are down or unresponsive. It determines whether a daemon is\n\
234 up based on whether it can connect to the daemon's vty unix stream socket.\n\
235 It then repeatedly sends echo commands over that socket to determine whether\n\
236 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
237 on the socket connection and know immediately that the daemon is down.\n\n\
238 The daemons to be monitored should be listed on the command line.\n\n\
239 In order to avoid attempting to restart the daemons in a fast loop,\n\
240 the -m and -M options allow you to control the minimum delay between\n\
241 restart commands. The minimum restart delay is recalculated each time\n\
242 a restart is attempted: if the time since the last restart attempt exceeds\n\
243 twice the -M value, then the restart delay is set to the -m value.\n\
244 Otherwise, the interval is doubled (but capped at the -M value).\n\n",
245 progname);
246
247 fprintf(target,
248 "Options:\n\
249 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
250 to syslog instead of stdout.\n\
251 -S, --statedir Set the vty socket directory (default is %s)\n\
252 -N, --pathspace Insert prefix into config & socket paths\n"
253 #ifdef GNU_LINUX
254 " --netns Create and/or use Linux network namespace. If no name is\n"
255 " given, uses the value from `-N`.\n"
256 #endif
257 "-l, --loglevel Set the logging level (default is %d).\n\
258 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
259 but it can be set higher than %d if extra-verbose debugging\n\
260 messages are desired.\n\
261 --min-restart-interval\n\
262 Set the minimum seconds to wait between invocations of daemon\n\
263 restart commands (default is %d).\n\
264 --max-restart-interval\n\
265 Set the maximum seconds to wait between invocations of daemon\n\
266 restart commands (default is %d).\n\
267 -i, --interval Set the status polling interval in seconds (default is %d)\n\
268 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
269 -T, --restart-timeout\n\
270 Set the restart (kill) timeout in seconds (default is %d).\n\
271 If any background jobs are still running after this much\n\
272 time has elapsed, they will be killed.\n\
273 -r, --restart Supply a Bourne shell command to use to restart a single\n\
274 daemon. The command string should include '%%s' where the\n\
275 name of the daemon should be substituted.\n\
276 (default: '%s')\n\
277 -s, --start-command\n\
278 Supply a Bourne shell to command to use to start a single\n\
279 daemon. The command string should include '%%s' where the\n\
280 name of the daemon should be substituted.\n\
281 (default: '%s')\n\
282 -k, --kill-command\n\
283 Supply a Bourne shell to command to use to stop a single\n\
284 daemon. The command string should include '%%s' where the\n\
285 name of the daemon should be substituted.\n\
286 (default: '%s')\n\
287 --dry Do not start or restart anything, just log.\n\
288 -p, --pid-file Set process identifier file name\n\
289 (default is %s/watchfrr.pid).\n\
290 -b, --blank-string\n\
291 When the supplied argument string is found in any of the\n\
292 various shell command arguments (-r, -s, or -k), replace\n\
293 it with a space. This is an ugly hack to circumvent problems\n\
294 passing command-line arguments with embedded spaces.\n\
295 -v, --version Print program version\n\
296 -h, --help Display this help and exit\n",
297 frr_vtydir, DEFAULT_LOGLEVEL, LOG_EMERG, LOG_DEBUG, LOG_DEBUG,
298 DEFAULT_MIN_RESTART, DEFAULT_MAX_RESTART, DEFAULT_PERIOD,
299 DEFAULT_TIMEOUT, DEFAULT_RESTART_TIMEOUT,
300 DEFAULT_RESTART_CMD, DEFAULT_START_CMD, DEFAULT_STOP_CMD,
301 frr_vtydir);
302 }
303
304 static pid_t run_background(char *shell_cmd)
305 {
306 pid_t child;
307
308 switch (child = fork()) {
309 case -1:
310 flog_err_sys(EC_LIB_SYSTEM_CALL,
311 "fork failed, cannot run command [%s]: %s",
312 shell_cmd, safe_strerror(errno));
313 return -1;
314 case 0:
315 /* Child process. */
316 /* Use separate process group so child processes can be killed
317 * easily. */
318 if (setpgid(0, 0) < 0)
319 zlog_warn("warning: setpgid(0,0) failed: %s",
320 safe_strerror(errno));
321 {
322 char shell[] = "sh";
323 char dashc[] = "-c";
324 char *const argv[4] = {shell, dashc, shell_cmd, NULL};
325 execv("/bin/sh", argv);
326 flog_err_sys(EC_LIB_SYSTEM_CALL,
327 "execv(/bin/sh -c '%s') failed: %s",
328 shell_cmd, safe_strerror(errno));
329 _exit(127);
330 }
331 default:
332 /* Parent process: we will reap the child later. */
333 zlog_info("Forked background command [pid %d]: %s", (int)child,
334 shell_cmd);
335 return child;
336 }
337 }
338
339 static struct timeval *time_elapsed(struct timeval *result,
340 const struct timeval *start_time)
341 {
342 gettimeofday(result, NULL);
343 result->tv_sec -= start_time->tv_sec;
344 result->tv_usec -= start_time->tv_usec;
345 while (result->tv_usec < 0) {
346 result->tv_usec += 1000000L;
347 result->tv_sec--;
348 }
349 return result;
350 }
351
352 static int restart_kill(struct thread *t_kill)
353 {
354 struct restart_info *restart = THREAD_ARG(t_kill);
355 struct timeval delay;
356
357 time_elapsed(&delay, &restart->time);
358 zlog_warn(
359 "Warning: %s %s child process %d still running after %ld seconds, sending signal %d",
360 restart->what, restart->name, (int)restart->pid,
361 (long)delay.tv_sec, (restart->kills ? SIGKILL : SIGTERM));
362 kill(-restart->pid, (restart->kills ? SIGKILL : SIGTERM));
363 restart->kills++;
364 restart->t_kill = NULL;
365 thread_add_timer(master, restart_kill, restart, gs.restart_timeout,
366 &restart->t_kill);
367 return 0;
368 }
369
370 static struct restart_info *find_child(pid_t child)
371 {
372 struct daemon *dmn;
373 if (gs.restart.pid == child)
374 return &gs.restart;
375
376 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
377 if (dmn->restart.pid == child)
378 return &dmn->restart;
379 }
380 return NULL;
381 }
382
383 static void sigchild(void)
384 {
385 pid_t child;
386 int status;
387 const char *name;
388 const char *what;
389 struct restart_info *restart;
390 struct daemon *dmn;
391
392 switch (child = waitpid(-1, &status, WNOHANG)) {
393 case -1:
394 flog_err_sys(EC_LIB_SYSTEM_CALL, "waitpid failed: %s",
395 safe_strerror(errno));
396 return;
397 case 0:
398 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
399 return;
400 }
401
402 if (child == integrated_write_pid) {
403 integrated_write_sigchld(status);
404 return;
405 }
406
407 if ((restart = find_child(child)) != NULL) {
408 name = restart->name;
409 what = restart->what;
410 restart->pid = 0;
411 gs.numpids--;
412 thread_cancel(restart->t_kill);
413 restart->t_kill = NULL;
414 /* Update restart time to reflect the time the command
415 * completed. */
416 gettimeofday(&restart->time, NULL);
417 } else {
418 flog_err_sys(
419 EC_LIB_SYSTEM_CALL,
420 "waitpid returned status for an unknown child process %d",
421 (int)child);
422 name = "(unknown)";
423 what = "background";
424 }
425 if (WIFSTOPPED(status))
426 zlog_warn("warning: %s %s process %d is stopped", what, name,
427 (int)child);
428 else if (WIFSIGNALED(status))
429 zlog_warn("%s %s process %d terminated due to signal %d", what,
430 name, (int)child, WTERMSIG(status));
431 else if (WIFEXITED(status)) {
432 if (WEXITSTATUS(status) != 0)
433 zlog_warn(
434 "%s %s process %d exited with non-zero status %d",
435 what, name, (int)child, WEXITSTATUS(status));
436 else {
437 zlog_debug("%s %s process %d exited normally", what,
438 name, (int)child);
439
440 if (restart && restart != &gs.restart) {
441 dmn = container_of(restart, struct daemon,
442 restart);
443 restart_done(dmn);
444 } else if (restart)
445 for (dmn = gs.daemons; dmn; dmn = dmn->next)
446 restart_done(dmn);
447 }
448 } else
449 flog_err_sys(
450 EC_LIB_SYSTEM_CALL,
451 "cannot interpret %s %s process %d wait status 0x%x",
452 what, name, (int)child, status);
453 phase_check();
454 }
455
456 static int run_job(struct restart_info *restart, const char *cmdtype,
457 const char *command, int force, int update_interval)
458 {
459 struct timeval delay;
460
461 if (gs.loglevel > LOG_DEBUG + 1)
462 zlog_debug("attempting to %s %s", cmdtype, restart->name);
463
464 if (restart->pid) {
465 if (gs.loglevel > LOG_DEBUG + 1)
466 zlog_debug(
467 "cannot %s %s, previous pid %d still running",
468 cmdtype, restart->name, (int)restart->pid);
469 return -1;
470 }
471
472 #if defined HAVE_SYSTEMD
473 char buffer[512];
474
475 snprintf(buffer, sizeof(buffer), "restarting %s", restart->name);
476 systemd_send_status(buffer);
477 #endif
478
479 /* Note: time_elapsed test must come before the force test, since we
480 need
481 to make sure that delay is initialized for use below in updating the
482 restart interval. */
483 if ((time_elapsed(&delay, &restart->time)->tv_sec < restart->interval)
484 && !force) {
485
486 if (gs.loglevel > LOG_DEBUG + 1)
487 zlog_debug(
488 "postponing %s %s: elapsed time %ld < retry interval %ld",
489 cmdtype, restart->name, (long)delay.tv_sec,
490 restart->interval);
491 return -1;
492 }
493
494 gettimeofday(&restart->time, NULL);
495 restart->kills = 0;
496 {
497 char cmd[strlen(command) + strlen(restart->name) + 1];
498 snprintf(cmd, sizeof(cmd), command, restart->name);
499 if ((restart->pid = run_background(cmd)) > 0) {
500 restart->t_kill = NULL;
501 thread_add_timer(master, restart_kill, restart,
502 gs.restart_timeout, &restart->t_kill);
503 restart->what = cmdtype;
504 gs.numpids++;
505 } else
506 restart->pid = 0;
507 }
508
509 #if defined HAVE_SYSTEMD
510 systemd_send_status("FRR Operational");
511 #endif
512 /* Calculate the new restart interval. */
513 if (update_interval) {
514 if (delay.tv_sec > 2 * gs.max_restart_interval)
515 restart->interval = gs.min_restart_interval;
516 else if ((restart->interval *= 2) > gs.max_restart_interval)
517 restart->interval = gs.max_restart_interval;
518 if (gs.loglevel > LOG_DEBUG + 1)
519 zlog_debug("restart %s interval is now %ld",
520 restart->name, restart->interval);
521 }
522 return restart->pid;
523 }
524
/*
 * Scheduling helpers.  Each one clears the daemon's thread pointer and then
 * registers the handler again.  They expand to a single statement without a
 * trailing semicolon, so "SET_X(dmn);" is safe in unbraced if/else bodies.
 */

/* Register handle_read() on the daemon's socket. */
#define SET_READ_HANDLER(DMN)                                                  \
	do {                                                                   \
		(DMN)->t_read = NULL;                                          \
		thread_add_read(master, handle_read, (DMN), (DMN)->fd,         \
				&(DMN)->t_read);                               \
	} while (0)

/* Run wakeup_down() after one fuzzed polling period. */
#define SET_WAKEUP_DOWN(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_down, (DMN),              \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Run wakeup_unresponsive() after one fuzzed polling period. */
#define SET_WAKEUP_UNRESPONSIVE(DMN)                                           \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_unresponsive, (DMN),      \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Run wakeup_send_echo() after one fuzzed polling period. */
#define SET_WAKEUP_ECHO(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_send_echo, (DMN),         \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)
552
553 static int wakeup_down(struct thread *t_wakeup)
554 {
555 struct daemon *dmn = THREAD_ARG(t_wakeup);
556
557 dmn->t_wakeup = NULL;
558 if (try_connect(dmn) < 0)
559 SET_WAKEUP_DOWN(dmn);
560 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
561 try_restart(dmn);
562 return 0;
563 }
564
565 static int wakeup_init(struct thread *t_wakeup)
566 {
567 struct daemon *dmn = THREAD_ARG(t_wakeup);
568
569 dmn->t_wakeup = NULL;
570 if (try_connect(dmn) < 0) {
571 zlog_info(
572 "%s state -> down : initial connection attempt failed",
573 dmn->name);
574 dmn->state = DAEMON_DOWN;
575 }
576 phase_check();
577 return 0;
578 }
579
580 static void restart_done(struct daemon *dmn)
581 {
582 if (dmn->state != DAEMON_DOWN) {
583 zlog_warn(
584 "Daemon: %s: is in %s state but expected it to be in DAEMON_DOWN state",
585 dmn->name, state_str[dmn->state]);
586 return;
587 }
588 THREAD_OFF(dmn->t_wakeup);
589 if (try_connect(dmn) < 0)
590 SET_WAKEUP_DOWN(dmn);
591 }
592
593 static void daemon_down(struct daemon *dmn, const char *why)
594 {
595 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
596 flog_err(EC_WATCHFRR_CONNECTION, "%s state -> down : %s",
597 dmn->name, why);
598 else if (gs.loglevel > LOG_DEBUG)
599 zlog_debug("%s still down : %s", dmn->name, why);
600 if (IS_UP(dmn))
601 gs.numdown++;
602 dmn->state = DAEMON_DOWN;
603 if (dmn->fd >= 0) {
604 close(dmn->fd);
605 dmn->fd = -1;
606 }
607 THREAD_OFF(dmn->t_read);
608 THREAD_OFF(dmn->t_write);
609 THREAD_OFF(dmn->t_wakeup);
610 if (try_connect(dmn) < 0)
611 SET_WAKEUP_DOWN(dmn);
612 phase_check();
613 }
614
615 static int handle_read(struct thread *t_read)
616 {
617 struct daemon *dmn = THREAD_ARG(t_read);
618 static const char resp[sizeof(PING_TOKEN) + 4] = PING_TOKEN "\n";
619 char buf[sizeof(resp) + 100];
620 ssize_t rc;
621 struct timeval delay;
622
623 dmn->t_read = NULL;
624 if ((rc = read(dmn->fd, buf, sizeof(buf))) < 0) {
625 char why[100];
626
627 if (ERRNO_IO_RETRY(errno)) {
628 /* Pretend it never happened. */
629 SET_READ_HANDLER(dmn);
630 return 0;
631 }
632 snprintf(why, sizeof(why), "unexpected read error: %s",
633 safe_strerror(errno));
634 daemon_down(dmn, why);
635 return 0;
636 }
637 if (rc == 0) {
638 daemon_down(dmn, "read returned EOF");
639 return 0;
640 }
641 if (!dmn->echo_sent.tv_sec) {
642 char why[sizeof(buf) + 100];
643 snprintf(why, sizeof(why),
644 "unexpected read returns %d bytes: %.*s", (int)rc,
645 (int)rc, buf);
646 daemon_down(dmn, why);
647 return 0;
648 }
649
650 /* We are expecting an echo response: is there any chance that the
651 response would not be returned entirely in the first read? That
652 seems inconceivable... */
653 if ((rc != sizeof(resp)) || memcmp(buf, resp, sizeof(resp))) {
654 char why[100 + sizeof(buf)];
655 snprintf(why, sizeof(why),
656 "read returned bad echo response of %d bytes (expecting %u): %.*s",
657 (int)rc, (unsigned int)sizeof(resp), (int)rc, buf);
658 daemon_down(dmn, why);
659 return 0;
660 }
661
662 time_elapsed(&delay, &dmn->echo_sent);
663 dmn->echo_sent.tv_sec = 0;
664 if (dmn->state == DAEMON_UNRESPONSIVE) {
665 if (delay.tv_sec < gs.timeout) {
666 dmn->state = DAEMON_UP;
667 zlog_warn(
668 "%s state -> up : echo response received after %ld.%06ld seconds",
669 dmn->name, (long)delay.tv_sec,
670 (long)delay.tv_usec);
671 } else
672 zlog_warn(
673 "%s: slow echo response finally received after %ld.%06ld seconds",
674 dmn->name, (long)delay.tv_sec,
675 (long)delay.tv_usec);
676 } else if (gs.loglevel > LOG_DEBUG + 1)
677 zlog_debug("%s: echo response received after %ld.%06ld seconds",
678 dmn->name, (long)delay.tv_sec, (long)delay.tv_usec);
679
680 SET_READ_HANDLER(dmn);
681 if (dmn->t_wakeup)
682 thread_cancel(dmn->t_wakeup);
683 SET_WAKEUP_ECHO(dmn);
684
685 return 0;
686 }
687
688 /*
689 * Wait till we notice that all daemons are ready before
690 * we send we are ready to systemd
691 */
692 static void daemon_send_ready(int exitcode)
693 {
694 FILE *fp;
695 static int sent = 0;
696 char started[1024];
697
698 if (sent)
699 return;
700
701 if (exitcode == 0)
702 zlog_notice("all daemons up, doing startup-complete notify");
703 else if (gs.numdown < gs.numdaemons)
704 flog_err(EC_WATCHFRR_CONNECTION,
705 "startup did not complete within timeout (%d/%d daemons running)",
706 gs.numdaemons - gs.numdown, gs.numdaemons);
707 else {
708 flog_err(EC_WATCHFRR_CONNECTION,
709 "all configured daemons failed to start -- exiting watchfrr");
710 exit(exitcode);
711
712 }
713
714 frr_detach();
715
716 snprintf(started, sizeof(started), "%s/%s", frr_vtydir,
717 "watchfrr.started");
718 fp = fopen(started, "w");
719 if (fp)
720 fclose(fp);
721 #if defined HAVE_SYSTEMD
722 systemd_send_started(master, 0);
723 systemd_send_status("FRR Operational");
724 #endif
725 sent = 1;
726 }
727
728 static void daemon_up(struct daemon *dmn, const char *why)
729 {
730 dmn->state = DAEMON_UP;
731 gs.numdown--;
732 dmn->connect_tries = 0;
733 zlog_notice("%s state -> up : %s", dmn->name, why);
734 if (gs.numdown == 0)
735 daemon_send_ready(0);
736 SET_WAKEUP_ECHO(dmn);
737 phase_check();
738 }
739
740 static int check_connect(struct thread *t_write)
741 {
742 struct daemon *dmn = THREAD_ARG(t_write);
743 int sockerr;
744 socklen_t reslen = sizeof(sockerr);
745
746 dmn->t_write = NULL;
747 if (getsockopt(dmn->fd, SOL_SOCKET, SO_ERROR, (char *)&sockerr, &reslen)
748 < 0) {
749 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn->name,
750 safe_strerror(errno));
751 daemon_down(dmn,
752 "getsockopt failed checking connection success");
753 return 0;
754 }
755 if ((reslen == sizeof(sockerr)) && sockerr) {
756 char why[100];
757 snprintf(
758 why, sizeof(why),
759 "getsockopt reports that connection attempt failed: %s",
760 safe_strerror(sockerr));
761 daemon_down(dmn, why);
762 return 0;
763 }
764
765 daemon_up(dmn, "delayed connect succeeded");
766 return 0;
767 }
768
769 static int wakeup_connect_hanging(struct thread *t_wakeup)
770 {
771 struct daemon *dmn = THREAD_ARG(t_wakeup);
772 char why[100];
773
774 dmn->t_wakeup = NULL;
775 snprintf(why, sizeof(why),
776 "connection attempt timed out after %ld seconds", gs.timeout);
777 daemon_down(dmn, why);
778 return 0;
779 }
780
781 /* Making connection to protocol daemon. */
782 static int try_connect(struct daemon *dmn)
783 {
784 int sock;
785 struct sockaddr_un addr;
786 socklen_t len;
787
788 if (gs.loglevel > LOG_DEBUG + 1)
789 zlog_debug("%s: attempting to connect", dmn->name);
790 dmn->connect_tries++;
791
792 memset(&addr, 0, sizeof(struct sockaddr_un));
793 addr.sun_family = AF_UNIX;
794 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty", gs.vtydir,
795 dmn->name);
796 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
797 len = addr.sun_len = SUN_LEN(&addr);
798 #else
799 len = sizeof(addr.sun_family) + strlen(addr.sun_path);
800 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
801
802 /* Quick check to see if we might succeed before we go to the trouble
803 of creating a socket. */
804 if (access(addr.sun_path, W_OK) < 0) {
805 if (errno != ENOENT)
806 flog_err_sys(EC_LIB_SYSTEM_CALL,
807 "%s: access to socket %s denied: %s",
808 dmn->name, addr.sun_path,
809 safe_strerror(errno));
810 return -1;
811 }
812
813 if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
814 flog_err_sys(EC_LIB_SOCKET, "%s(%s): cannot make socket: %s",
815 __func__, addr.sun_path, safe_strerror(errno));
816 return -1;
817 }
818
819 if (set_nonblocking(sock) < 0 || set_cloexec(sock) < 0) {
820 flog_err_sys(EC_LIB_SYSTEM_CALL,
821 "%s(%s): set_nonblocking/cloexec(%d) failed",
822 __func__, addr.sun_path, sock);
823 close(sock);
824 return -1;
825 }
826
827 if (connect(sock, (struct sockaddr *)&addr, len) < 0) {
828 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK)) {
829 if (gs.loglevel > LOG_DEBUG)
830 zlog_debug("%s(%s): connect failed: %s",
831 __func__, addr.sun_path,
832 safe_strerror(errno));
833 close(sock);
834 return -1;
835 }
836 if (gs.loglevel > LOG_DEBUG)
837 zlog_debug("%s: connection in progress", dmn->name);
838 dmn->state = DAEMON_CONNECTING;
839 dmn->fd = sock;
840 dmn->t_write = NULL;
841 thread_add_write(master, check_connect, dmn, dmn->fd,
842 &dmn->t_write);
843 dmn->t_wakeup = NULL;
844 thread_add_timer(master, wakeup_connect_hanging, dmn,
845 gs.timeout, &dmn->t_wakeup);
846 SET_READ_HANDLER(dmn);
847 return 0;
848 }
849
850 dmn->fd = sock;
851 SET_READ_HANDLER(dmn);
852 daemon_up(dmn, "connect succeeded");
853 return 1;
854 }
855
856 static int phase_hanging(struct thread *t_hanging)
857 {
858 gs.t_phase_hanging = NULL;
859 flog_err(EC_WATCHFRR_CONNECTION,
860 "Phase [%s] hanging for %ld seconds, aborting phased restart",
861 phase_str[gs.phase], PHASE_TIMEOUT);
862 gs.phase = PHASE_NONE;
863 return 0;
864 }
865
866 static void set_phase(restart_phase_t new_phase)
867 {
868 gs.phase = new_phase;
869 if (gs.t_phase_hanging)
870 thread_cancel(gs.t_phase_hanging);
871 gs.t_phase_hanging = NULL;
872 thread_add_timer(master, phase_hanging, NULL, PHASE_TIMEOUT,
873 &gs.t_phase_hanging);
874 }
875
/*
 * Advance the global phase state machine as far as current conditions allow.
 *
 * The restart phases deliberately fall through from one case to the next, so
 * a single call can pass through several phases; each case breaks out as
 * soon as its condition for moving on is not met.
 */
static void phase_check(void)
{
	struct daemon *dmn;

	switch (gs.phase) {
	case PHASE_NONE:
		break;

	case PHASE_INIT:
		/* Stay in startup while any daemon is still in DAEMON_INIT. */
		for (dmn = gs.daemons; dmn; dmn = dmn->next)
			if (dmn->state == DAEMON_INIT)
				return;

		/* startup complete, everything out of INIT */
		gs.phase = PHASE_NONE;
		/* Daemons found down get a wakeup timer and a restart attempt. */
		for (dmn = gs.daemons; dmn; dmn = dmn->next)
			if (dmn->state == DAEMON_DOWN) {
				SET_WAKEUP_DOWN(dmn);
				try_restart(dmn);
			}
		break;
	case PHASE_STOPS_PENDING:
		/* Wait until no background job is outstanding. */
		if (gs.numpids)
			break;
		zlog_info(
			"Phased restart: all routing daemon stop jobs have completed.");
		set_phase(PHASE_WAITING_DOWN);

	/*FALLTHRU*/
	case PHASE_WAITING_DOWN:
		/* Proceed once every daemon other than gs.special is down;
		 * gs.special itself may still be up. */
		if (gs.numdown + IS_UP(gs.special) < gs.numdaemons)
			break;
		zlog_info("Phased restart: all routing daemons now down.");
		/* Forced restart job for gs.special, updating its interval. */
		run_job(&gs.special->restart, "restart", gs.restart_command, 1,
			1);
		set_phase(PHASE_ZEBRA_RESTART_PENDING);

	/*FALLTHRU*/
	case PHASE_ZEBRA_RESTART_PENDING:
		/* Wait for that restart job to be reaped. */
		if (gs.special->restart.pid)
			break;
		zlog_info("Phased restart: %s restart job completed.",
			  gs.special->name);
		set_phase(PHASE_WAITING_ZEBRA_UP);

	/*FALLTHRU*/
	case PHASE_WAITING_ZEBRA_UP:
		if (!IS_UP(gs.special))
			break;
		zlog_info("Phased restart: %s is now up.", gs.special->name);
		{
			/* Forced start job for every other daemon; note that
			 * this dmn shadows the outer declaration. */
			struct daemon *dmn;
			for (dmn = gs.daemons; dmn; dmn = dmn->next) {
				if (dmn != gs.special)
					run_job(&dmn->restart, "start",
						gs.start_command, 1, 0);
			}
		}
		gs.phase = PHASE_NONE;
		THREAD_OFF(gs.t_phase_hanging);
		zlog_notice("Phased global restart has completed.");
		break;
	}
}
940
941 static void try_restart(struct daemon *dmn)
942 {
943 if (watch_only)
944 return;
945
946 if (dmn != gs.special) {
947 if ((gs.special->state == DAEMON_UP)
948 && (gs.phase == PHASE_NONE))
949 run_job(&dmn->restart, "restart", gs.restart_command, 0,
950 1);
951 else
952 zlog_debug(
953 "%s: postponing restart attempt because master %s daemon not up [%s], or phased restart in progress",
954 dmn->name, gs.special->name,
955 state_str[gs.special->state]);
956 return;
957 }
958
959 if ((gs.phase != PHASE_NONE) || gs.numpids) {
960 if (gs.loglevel > LOG_DEBUG + 1)
961 zlog_debug(
962 "postponing phased global restart: restart already in progress [%s], or outstanding child processes [%d]",
963 phase_str[gs.phase], gs.numpids);
964 return;
965 }
966 /* Is it too soon for a restart? */
967 {
968 struct timeval delay;
969 if (time_elapsed(&delay, &gs.special->restart.time)->tv_sec
970 < gs.special->restart.interval) {
971 if (gs.loglevel > LOG_DEBUG + 1)
972 zlog_debug(
973 "postponing phased global restart: elapsed time %ld < retry interval %ld",
974 (long)delay.tv_sec,
975 gs.special->restart.interval);
976 return;
977 }
978 }
979 run_job(&gs.restart, "restart", gs.restart_command, 0, 1);
980 }
981
982 static int wakeup_unresponsive(struct thread *t_wakeup)
983 {
984 struct daemon *dmn = THREAD_ARG(t_wakeup);
985
986 dmn->t_wakeup = NULL;
987 if (dmn->state != DAEMON_UNRESPONSIVE)
988 flog_err(EC_WATCHFRR_CONNECTION,
989 "%s: no longer unresponsive (now %s), wakeup should have been cancelled!",
990 dmn->name, state_str[dmn->state]);
991 else {
992 SET_WAKEUP_UNRESPONSIVE(dmn);
993 try_restart(dmn);
994 }
995 return 0;
996 }
997
998 static int wakeup_no_answer(struct thread *t_wakeup)
999 {
1000 struct daemon *dmn = THREAD_ARG(t_wakeup);
1001
1002 dmn->t_wakeup = NULL;
1003 dmn->state = DAEMON_UNRESPONSIVE;
1004 if (dmn->ignore_timeout)
1005 return 0;
1006 flog_err(EC_WATCHFRR_CONNECTION,
1007 "%s state -> unresponsive : no response yet to ping sent %ld seconds ago",
1008 dmn->name, gs.timeout);
1009 SET_WAKEUP_UNRESPONSIVE(dmn);
1010 try_restart(dmn);
1011 return 0;
1012 }
1013
1014 static int wakeup_send_echo(struct thread *t_wakeup)
1015 {
1016 static const char echocmd[] = "echo " PING_TOKEN;
1017 ssize_t rc;
1018 struct daemon *dmn = THREAD_ARG(t_wakeup);
1019
1020 dmn->t_wakeup = NULL;
1021 if (((rc = write(dmn->fd, echocmd, sizeof(echocmd))) < 0)
1022 || ((size_t)rc != sizeof(echocmd))) {
1023 char why[100 + sizeof(echocmd)];
1024 snprintf(why, sizeof(why),
1025 "write '%s' returned %d instead of %u", echocmd,
1026 (int)rc, (unsigned int)sizeof(echocmd));
1027 daemon_down(dmn, why);
1028 } else {
1029 gettimeofday(&dmn->echo_sent, NULL);
1030 dmn->t_wakeup = NULL;
1031 thread_add_timer(master, wakeup_no_answer, dmn, gs.timeout,
1032 &dmn->t_wakeup);
1033 }
1034 return 0;
1035 }
1036
1037 bool check_all_up(void)
1038 {
1039 struct daemon *dmn;
1040
1041 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1042 if (dmn->state != DAEMON_UP)
1043 return false;
1044 return true;
1045 }
1046
1047 void watchfrr_status(struct vty *vty)
1048 {
1049 struct daemon *dmn;
1050 struct timeval delay;
1051
1052 vty_out(vty, "watchfrr global phase: %s\n", phase_str[gs.phase]);
1053 if (gs.restart.pid)
1054 vty_out(vty, " global restart running, pid %ld\n",
1055 (long)gs.restart.pid);
1056
1057 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1058 vty_out(vty, " %-20s %s%s", dmn->name, state_str[dmn->state],
1059 dmn->ignore_timeout ? "/Ignoring Timeout\n" : "\n");
1060 if (dmn->restart.pid)
1061 vty_out(vty, " restart running, pid %ld\n",
1062 (long)dmn->restart.pid);
1063 else if (dmn->state == DAEMON_DOWN &&
1064 time_elapsed(&delay, &dmn->restart.time)->tv_sec
1065 < dmn->restart.interval)
1066 vty_out(vty, " restarting in %jd seconds (%jds backoff interval)\n",
1067 (intmax_t)dmn->restart.interval
1068 - (intmax_t)delay.tv_sec,
1069 (intmax_t)dmn->restart.interval);
1070 }
1071 }
1072
/*
 * Termination handler: logs a notice, sends the systemd "stopping"
 * notification and exits with status 0.
 */
static void sigint(void)
{
	zlog_notice("Terminating on signal");
	systemd_send_stopping();
	exit(0);
}
1079
/*
 * Check that a command template contains exactly one '%' and that it is
 * immediately followed by 's'.
 *
 * Returns 1 when the template is acceptable, 0 otherwise.
 */
static int valid_command(const char *cmd)
{
	const char *pct = strchr(cmd, '%');

	/* No conversion at all. */
	if (pct == NULL)
		return 0;

	/* The only conversion allowed is "%s". */
	if (pct[1] != 's')
		return 0;

	/* Any further '%' would make the format ambiguous. */
	return strchr(pct + 1, '%') == NULL;
}
1087
/*
 * This is an ugly hack to circumvent problems with passing command-line
 * arguments that contain spaces.  The fix is to use a configuration file.
 *
 * Returns a newly allocated copy of cmd in which every occurrence of
 * blankstr has been replaced by a single space.  The caller owns the
 * returned string.  The process exits if the copy cannot be allocated.
 *
 * An empty blankstr, or a blankstr that is itself a single space, leaves
 * the command unchanged.
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	char *res;
	char *p;
	size_t bslen = strlen(blankstr);

	res = strdup(cmd);
	if (!res) {
		perror("strdup");
		exit(1);
	}

	/*
	 * strstr() matches an empty needle at offset 0 every time, so the
	 * loop below would never finish and the memmove() would keep growing
	 * the string beyond its allocation.  A needle of " " is re-matched on
	 * the very space that was just written, which also never terminates.
	 * In both cases there is nothing to translate.
	 */
	if (bslen == 0 || strcmp(blankstr, " ") == 0)
		return res;

	while ((p = strstr(res, blankstr)) != NULL) {
		*p = ' ';
		/* Close the gap left by the rest of the marker, NUL included. */
		if (bslen != 1)
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
	}
	return res;
}
1107
/*
 * Timer callback scheduled from watchfrr_init() with STARTUP_TIMEOUT.
 * Calls daemon_send_ready(1); the thread argument is unused.
 * Always returns 0.
 */
static int startup_timeout(struct thread *t_wakeup)
{
	daemon_send_ready(1);
	return 0;
}
1113
1114 #ifdef GNU_LINUX
1115
1116 #include <sys/mount.h>
1117 #include <sched.h>
1118
1119 #define NETNS_RUN_DIR "/var/run/netns"
1120
1121 static void netns_create(int dirfd, const char *nsname)
1122 {
1123 /* make /var/run/netns shared between mount namespaces
1124 * just like iproute2 sets it up
1125 */
1126 if (mount("", NETNS_RUN_DIR, "none", MS_SHARED | MS_REC, NULL)) {
1127 if (errno != EINVAL) {
1128 perror("mount");
1129 exit(1);
1130 }
1131
1132 if (mount(NETNS_RUN_DIR, NETNS_RUN_DIR, "none",
1133 MS_BIND | MS_REC, NULL)) {
1134 perror("mount");
1135 exit(1);
1136 }
1137
1138 if (mount("", NETNS_RUN_DIR, "none", MS_SHARED | MS_REC,
1139 NULL)) {
1140 perror("mount");
1141 exit(1);
1142 }
1143 }
1144
1145 /* need an empty file to mount on top of */
1146 int nsfd = openat(dirfd, nsname, O_CREAT | O_RDONLY | O_EXCL, 0);
1147
1148 if (nsfd < 0) {
1149 fprintf(stderr, "failed to create \"%s/%s\": %s\n",
1150 NETNS_RUN_DIR, nsname, strerror(errno));
1151 exit(1);
1152 }
1153 close(nsfd);
1154
1155 if (unshare(CLONE_NEWNET)) {
1156 perror("unshare");
1157 unlinkat(dirfd, nsname, 0);
1158 exit(1);
1159 }
1160
1161 char *dstpath = asprintfrr(MTYPE_TMP, "%s/%s", NETNS_RUN_DIR, nsname);
1162
1163 /* bind-mount so the namespace has a name and is persistent */
1164 if (mount("/proc/self/ns/net", dstpath, "none", MS_BIND, NULL) < 0) {
1165 fprintf(stderr, "failed to bind-mount netns to \"%s\": %s\n",
1166 dstpath, strerror(errno));
1167 unlinkat(dirfd, nsname, 0);
1168 exit(1);
1169 }
1170
1171 XFREE(MTYPE_TMP, dstpath);
1172 }
1173
1174 static void netns_setup(const char *nsname)
1175 {
1176 int dirfd, nsfd;
1177
1178 dirfd = open(NETNS_RUN_DIR, O_DIRECTORY | O_RDONLY);
1179 if (dirfd < 0) {
1180 if (errno == ENOTDIR) {
1181 fprintf(stderr, "error: \"%s\" is not a directory!\n",
1182 NETNS_RUN_DIR);
1183 exit(1);
1184 } else if (errno == ENOENT) {
1185 if (mkdir(NETNS_RUN_DIR, 0755)) {
1186 fprintf(stderr, "error: \"%s\": mkdir: %s\n",
1187 NETNS_RUN_DIR, strerror(errno));
1188 exit(1);
1189 }
1190 dirfd = open(NETNS_RUN_DIR, O_DIRECTORY | O_RDONLY);
1191 if (dirfd < 0) {
1192 fprintf(stderr, "error: \"%s\": opendir: %s\n",
1193 NETNS_RUN_DIR, strerror(errno));
1194 exit(1);
1195 }
1196 } else {
1197 fprintf(stderr, "error: \"%s\": %s\n",
1198 NETNS_RUN_DIR, strerror(errno));
1199 exit(1);
1200 }
1201 }
1202
1203 nsfd = openat(dirfd, nsname, O_RDONLY);
1204 if (nsfd < 0 && errno != ENOENT) {
1205 fprintf(stderr, "error: \"%s/%s\": %s\n",
1206 NETNS_RUN_DIR, nsname, strerror(errno));
1207 exit(1);
1208 }
1209 if (nsfd < 0)
1210 netns_create(dirfd, nsname);
1211 else {
1212 if (setns(nsfd, CLONE_NEWNET)) {
1213 perror("setns");
1214 exit(1);
1215 }
1216 close(nsfd);
1217 }
1218 close(dirfd);
1219
1220 /* make sure loopback is up... weird things happen otherwise.
1221 * ioctl is perfectly fine for this, don't need netlink...
1222 */
1223 int sockfd;
1224 struct ifreq ifr = { };
1225
1226 strlcpy(ifr.ifr_name, "lo", sizeof(ifr.ifr_name));
1227
1228 sockfd = socket(AF_INET, SOCK_DGRAM, 0);
1229 if (sockfd < 0) {
1230 perror("socket");
1231 exit(1);
1232 }
1233 if (ioctl(sockfd, SIOCGIFFLAGS, &ifr)) {
1234 perror("ioctl(SIOCGIFFLAGS, \"lo\")");
1235 exit(1);
1236 }
1237 if (!(ifr.ifr_flags & IFF_UP)) {
1238 ifr.ifr_flags |= IFF_UP;
1239 if (ioctl(sockfd, SIOCSIFFLAGS, &ifr)) {
1240 perror("ioctl(SIOCSIFFLAGS, \"lo\")");
1241 exit(1);
1242 }
1243 }
1244 close(sockfd);
1245 }
1246
1247 #else /* !GNU_LINUX */
1248
/*
 * Non-Linux build: network namespaces are not supported, so requesting one
 * prints an error and exits with status 1.  nsname is unused.
 */
static void netns_setup(const char *nsname)
{
	fprintf(stderr, "network namespaces are only available on Linux\n");
	exit(1);
}
1254 #endif
1255
1256 static void watchfrr_init(int argc, char **argv)
1257 {
1258 const char *special = "zebra";
1259 int i;
1260 struct daemon *dmn, **add = &gs.daemons;
1261 char alldaemons[512] = "", *p = alldaemons;
1262
1263 thread_add_timer_msec(master, startup_timeout, NULL, STARTUP_TIMEOUT,
1264 &gs.t_startup_timeout);
1265
1266 for (i = optind; i < argc; i++) {
1267 dmn = XCALLOC(MTYPE_WATCHFRR_DAEMON, sizeof(*dmn));
1268
1269 dmn->name = dmn->restart.name = argv[i];
1270 dmn->state = DAEMON_INIT;
1271 gs.numdaemons++;
1272 gs.numdown++;
1273 dmn->fd = -1;
1274 dmn->t_wakeup = NULL;
1275 thread_add_timer_msec(master, wakeup_init, dmn, 0,
1276 &dmn->t_wakeup);
1277 dmn->restart.interval = gs.min_restart_interval;
1278 *add = dmn;
1279 add = &dmn->next;
1280
1281 if (!strcmp(dmn->name, special))
1282 gs.special = dmn;
1283 }
1284
1285 if (!gs.daemons) {
1286 fprintf(stderr,
1287 "Must specify one or more daemons to monitor.\n\n");
1288 frr_help_exit(1);
1289 }
1290 if (!watch_only && !gs.special) {
1291 fprintf(stderr, "\"%s\" daemon must be in daemon lists\n\n",
1292 special);
1293 frr_help_exit(1);
1294 }
1295
1296 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1297 snprintf(p, alldaemons + sizeof(alldaemons) - p, "%s%s",
1298 (p == alldaemons) ? "" : " ", dmn->name);
1299 p += strlen(p);
1300 }
1301 zlog_notice("%s %s watching [%s]%s", progname, FRR_VERSION, alldaemons,
1302 watch_only ? ", monitor mode" : "");
1303 }
1304
/*
 * Privilege settings referenced by the daemon descriptor (.privs).
 * Only the VTY group is set, and only when VTY_GROUP is defined at build
 * time; every other field is left zero-initialised.
 */
struct zebra_privs_t watchfrr_privs = {
#ifdef VTY_GROUP
	.vty_group = VTY_GROUP,
#endif
};
1310
/*
 * Signal table referenced by the daemon descriptor: SIGINT and SIGTERM both
 * go to sigint(), SIGCHLD goes to sigchild().
 */
static struct quagga_signal_t watchfrr_signals[] = {
	{
		.signal = SIGINT,
		.handler = sigint,
	},
	{
		.signal = SIGTERM,
		.handler = sigint,
	},
	{
		.signal = SIGCHLD,
		.handler = sigchild,
	},
};
1325
/*
 * Daemon descriptor for watchfrr: library feature flags, the help printer,
 * the copyright line, the signal table and the privilege settings.
 */
FRR_DAEMON_INFO(watchfrr, WATCHFRR,
		.flags = FRR_NO_PRIVSEP | FRR_NO_TCPVTY | FRR_LIMITED_CLI
			 | FRR_NO_CFG_PID_DRY | FRR_NO_ZCLIENT
			 | FRR_DETACH_LATER,

		.printhelp = printhelp,
		.copyright = "Copyright 2004 Andrew J. Schorr",

		.signals = watchfrr_signals,
		.n_signals = array_size(watchfrr_signals),

		.privs = &watchfrr_privs, )
1338
1339 #define DEPRECATED_OPTIONS "aAezR:"
1340
1341 int main(int argc, char **argv)
1342 {
1343 int opt;
1344 const char *blankstr = NULL;
1345 const char *netns = NULL;
1346 bool netns_en = false;
1347
1348 frr_preinit(&watchfrr_di, argc, argv);
1349 progname = watchfrr_di.progname;
1350
1351 frr_opt_add("b:di:k:l:N:p:r:S:s:t:T:" DEPRECATED_OPTIONS, longopts, "");
1352
1353 gs.restart.name = "all";
1354 while ((opt = frr_getopt(argc, argv, NULL)) != EOF) {
1355 if (opt && opt < 128 && strchr(DEPRECATED_OPTIONS, opt)) {
1356 fprintf(stderr,
1357 "The -%c option no longer exists.\n"
1358 "Please refer to the watchfrr(8) man page.\n",
1359 opt);
1360 exit(1);
1361 }
1362
1363 switch (opt) {
1364 case 0:
1365 break;
1366 case 'b':
1367 blankstr = optarg;
1368 break;
1369 case OPTION_DRY:
1370 watch_only = true;
1371 break;
1372 case 'k':
1373 if (!valid_command(optarg)) {
1374 fprintf(stderr,
1375 "Invalid kill command, must contain '%%s': %s\n",
1376 optarg);
1377 frr_help_exit(1);
1378 }
1379 gs.stop_command = optarg;
1380 break;
1381 case 'l': {
1382 char garbage[3];
1383 if ((sscanf(optarg, "%d%1s", &gs.loglevel, garbage)
1384 != 1)
1385 || (gs.loglevel < LOG_EMERG)) {
1386 fprintf(stderr,
1387 "Invalid loglevel argument: %s\n",
1388 optarg);
1389 frr_help_exit(1);
1390 }
1391 } break;
1392 case OPTION_MINRESTART: {
1393 char garbage[3];
1394 if ((sscanf(optarg, "%ld%1s", &gs.min_restart_interval,
1395 garbage)
1396 != 1)
1397 || (gs.min_restart_interval < 0)) {
1398 fprintf(stderr,
1399 "Invalid min_restart_interval argument: %s\n",
1400 optarg);
1401 frr_help_exit(1);
1402 }
1403 } break;
1404 case OPTION_MAXRESTART: {
1405 char garbage[3];
1406 if ((sscanf(optarg, "%ld%1s", &gs.max_restart_interval,
1407 garbage)
1408 != 1)
1409 || (gs.max_restart_interval < 0)) {
1410 fprintf(stderr,
1411 "Invalid max_restart_interval argument: %s\n",
1412 optarg);
1413 frr_help_exit(1);
1414 }
1415 } break;
1416 case OPTION_NETNS:
1417 netns_en = true;
1418 if (strchr(optarg, '/')) {
1419 fprintf(stderr,
1420 "invalid network namespace name \"%s\" (may not contain slashes)\n",
1421 optarg);
1422 frr_help_exit(1);
1423 }
1424 netns = optarg;
1425 break;
1426 case 'i': {
1427 char garbage[3];
1428 int period;
1429 if ((sscanf(optarg, "%d%1s", &period, garbage) != 1)
1430 || (gs.period < 1)) {
1431 fprintf(stderr,
1432 "Invalid interval argument: %s\n",
1433 optarg);
1434 frr_help_exit(1);
1435 }
1436 gs.period = 1000 * period;
1437 } break;
1438 case 'p':
1439 watchfrr_di.pid_file = optarg;
1440 break;
1441 case 'r':
1442 if (!valid_command(optarg)) {
1443 fprintf(stderr,
1444 "Invalid restart command, must contain '%%s': %s\n",
1445 optarg);
1446 frr_help_exit(1);
1447 }
1448 gs.restart_command = optarg;
1449 break;
1450 case 's':
1451 if (!valid_command(optarg)) {
1452 fprintf(stderr,
1453 "Invalid start command, must contain '%%s': %s\n",
1454 optarg);
1455 frr_help_exit(1);
1456 }
1457 gs.start_command = optarg;
1458 break;
1459 case 'S':
1460 gs.vtydir = optarg;
1461 break;
1462 case 't': {
1463 char garbage[3];
1464 if ((sscanf(optarg, "%ld%1s", &gs.timeout, garbage)
1465 != 1)
1466 || (gs.timeout < 1)) {
1467 fprintf(stderr,
1468 "Invalid timeout argument: %s\n",
1469 optarg);
1470 frr_help_exit(1);
1471 }
1472 } break;
1473 case 'T': {
1474 char garbage[3];
1475 if ((sscanf(optarg, "%ld%1s", &gs.restart_timeout,
1476 garbage)
1477 != 1)
1478 || (gs.restart_timeout < 1)) {
1479 fprintf(stderr,
1480 "Invalid restart timeout argument: %s\n",
1481 optarg);
1482 frr_help_exit(1);
1483 }
1484 } break;
1485 default:
1486 fputs("Invalid option.\n", stderr);
1487 frr_help_exit(1);
1488 }
1489 }
1490
1491 if (watch_only
1492 && (gs.start_command || gs.stop_command || gs.restart_command)) {
1493 fputs("Options -r/-s/-k are not used when --dry is active.\n",
1494 stderr);
1495 }
1496 if (!watch_only
1497 && (!gs.restart_command || !gs.start_command || !gs.stop_command)) {
1498 fprintf(stderr,
1499 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1500 frr_help_exit(1);
1501 }
1502
1503 if (blankstr) {
1504 if (gs.restart_command)
1505 gs.restart_command =
1506 translate_blanks(gs.restart_command, blankstr);
1507 if (gs.start_command)
1508 gs.start_command =
1509 translate_blanks(gs.start_command, blankstr);
1510 if (gs.stop_command)
1511 gs.stop_command =
1512 translate_blanks(gs.stop_command, blankstr);
1513 }
1514
1515 gs.restart.interval = gs.min_restart_interval;
1516
1517 /* env variable for the processes that we start */
1518 if (watchfrr_di.pathspace)
1519 setenv("FRR_PATHSPACE", watchfrr_di.pathspace, 1);
1520 else
1521 unsetenv("FRR_PATHSPACE");
1522
1523 if (netns_en && !netns)
1524 netns = watchfrr_di.pathspace;
1525 if (netns_en && netns && netns[0])
1526 netns_setup(netns);
1527
1528 master = frr_init();
1529 watchfrr_error_init();
1530 watchfrr_init(argc, argv);
1531 watchfrr_vty_init();
1532
1533 frr_config_fork();
1534
1535 if (watchfrr_di.daemon_mode)
1536 zlog_syslog_set_prio_min(MIN(gs.loglevel, LOG_DEBUG));
1537 else
1538 zlog_aux_init(NULL, MIN(gs.loglevel, LOG_DEBUG));
1539
1540 frr_run(master);
1541
1542 systemd_send_stopping();
1543 /* Not reached. */
1544 return 0;
1545 }