]> git.proxmox.com Git - mirror_frr.git/blob - watchfrr/watchfrr.c
Merge pull request #10989 from opensourcerouting/pim-options-remove
[mirror_frr.git] / watchfrr / watchfrr.c
1 /*
2 * Monitor status of frr daemons and restart if necessary.
3 *
4 * Copyright (C) 2004 Andrew J. Schorr
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include <zebra.h>
22 #include <thread.h>
23 #include <log.h>
24 #include <network.h>
25 #include <sigevent.h>
26 #include <lib/version.h>
27 #include "command.h"
28 #include "libfrr.h"
29 #include "lib_errors.h"
30 #include "zlog_targets.h"
31 #include "network.h"
32 #include "printfrr.h"
33
34 #include <getopt.h>
35 #include <sys/un.h>
36 #include <sys/wait.h>
37 #include <memory.h>
38 #include <systemd.h>
39
40 #include "watchfrr.h"
41 #include "watchfrr_errors.h"
42
/* Smaller of X and Y.  Each argument may be evaluated more than once. */
#ifndef MIN
#define MIN(X, Y) (((X) <= (Y)) ? (X) : (Y))
#endif

/*
 * Timer randomization helpers.
 *
 * JITTER(X) takes frr_weak_random() modulo (X)+1 and shifts the result down
 * by (X)/2, giving an offset centred roughly on zero (presumably
 * frr_weak_random() is non-negative -- confirm against libfrr).
 * FUZZY(X) adds to X a jitter derived from one twentieth of X.
 */
#define JITTER(X) ((frr_weak_random() % ((X) + 1)) - ((X) / 2))
#define FUZZY(X) ((X) + JITTER((X) / 20))

/* Built-in defaults; the usage text reports the time values in seconds. */
#define DEFAULT_PERIOD 5
#define DEFAULT_TIMEOUT 90
#define DEFAULT_RESTART_TIMEOUT 20
#define DEFAULT_LOGLEVEL LOG_INFO
#define DEFAULT_MIN_RESTART 60
#define DEFAULT_MAX_RESTART 600

/* Default shell command templates; "%s" is replaced by the daemon name. */
#define DEFAULT_RESTART_CMD WATCHFRR_SH_PATH " restart %s"
#define DEFAULT_START_CMD WATCHFRR_SH_PATH " start %s"
#define DEFAULT_STOP_CMD WATCHFRR_SH_PATH " stop %s"

/* Token sent with the "echo" command and expected back in the reply. */
#define PING_TOKEN "PING"
63
64 DEFINE_MGROUP(WATCHFRR, "watchfrr");
65 DEFINE_MTYPE_STATIC(WATCHFRR, WATCHFRR_DAEMON, "watchfrr daemon entry");
66
67 /* Needs to be global, referenced somewhere inside libfrr. */
68 struct thread_master *master;
69
70 static bool watch_only = false;
71 const char *pathspace;
72
/*
 * Steps of the startup / phased-restart state machine driven by
 * phase_check().  PHASE_NONE means no phased operation is in progress.
 */
enum restart_phase {
	PHASE_NONE = 0,
	PHASE_INIT,
	PHASE_STOPS_PENDING,
	PHASE_WAITING_DOWN,
	PHASE_ZEBRA_RESTART_PENDING,
	PHASE_WAITING_ZEBRA_UP
};

/* Human-readable phase names, indexed by enum restart_phase. */
static const char *const phase_str[] = {
	[PHASE_NONE] = "Idle",
	[PHASE_INIT] = "Startup",
	[PHASE_STOPS_PENDING] = "Stop jobs running",
	[PHASE_WAITING_DOWN] = "Waiting for other daemons to come down",
	[PHASE_ZEBRA_RESTART_PENDING] = "Zebra restart job running",
	[PHASE_WAITING_ZEBRA_UP] = "Waiting for zebra to come up",
	/* Trailing entry with no matching enumerator above. */
	"Start jobs running",
};
91
/* Seconds a restart phase may last before phase_hanging() aborts it. */
#define PHASE_TIMEOUT (3 * gs.restart_timeout)

/*
 * Startup deadline (presumably milliseconds -- confirm at the use site).
 * Parenthesized so the product stays intact when the macro is expanded
 * inside a larger expression such as a division or subtraction.
 */
#define STARTUP_TIMEOUT (55 * 1000)
94
/* Bookkeeping for one background start/stop/restart job. */
struct restart_info {
	/* Name printed in log messages and substituted into the command. */
	const char *name;
	/* Kind of job currently (or last) run, e.g. "restart" or "start". */
	const char *what;
	/* Pid of the running job; 0 when no job is outstanding. */
	pid_t pid;
	/* Set when a job is launched and again when it is reaped. */
	struct timeval time;
	/* Minimum number of seconds between two job launches. */
	long interval;
	/* Timer that fires restart_kill() for a job that runs too long. */
	struct thread *t_kill;
	/* Number of signals already sent to the running job. */
	int kills;
};
104
105 static struct global_state {
106 enum restart_phase phase;
107 struct thread *t_phase_hanging;
108 struct thread *t_startup_timeout;
109 const char *vtydir;
110 long period;
111 long timeout;
112 long restart_timeout;
113 long min_restart_interval;
114 long max_restart_interval;
115 struct daemon *daemons;
116 const char *restart_command;
117 const char *start_command;
118 const char *stop_command;
119 struct restart_info restart;
120 int loglevel;
121 struct daemon *special; /* points to zebra when doing phased restart */
122 int numdaemons;
123 int numpids;
124 int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
125 } gs = {
126 .phase = PHASE_INIT,
127 .vtydir = frr_vtydir,
128 .period = 1000 * DEFAULT_PERIOD,
129 .timeout = DEFAULT_TIMEOUT,
130 .restart_timeout = DEFAULT_RESTART_TIMEOUT,
131 .loglevel = DEFAULT_LOGLEVEL,
132 .min_restart_interval = DEFAULT_MIN_RESTART,
133 .max_restart_interval = DEFAULT_MAX_RESTART,
134 .restart_command = DEFAULT_RESTART_CMD,
135 .start_command = DEFAULT_START_CMD,
136 .stop_command = DEFAULT_STOP_CMD,
137 };
138
/* Connection state of one monitored daemon. */
enum daemon_state {
	DAEMON_INIT,
	DAEMON_DOWN,
	DAEMON_CONNECTING,
	DAEMON_UP,
	DAEMON_UNRESPONSIVE
};

/* True when the daemon is connected, whether or not it answers pings. */
#define IS_UP(DMN)                                                             \
	(((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))

/* Human-readable state names, indexed by enum daemon_state. */
static const char *const state_str[] = {
	[DAEMON_INIT] = "Init",
	[DAEMON_DOWN] = "Down",
	[DAEMON_CONNECTING] = "Connecting",
	[DAEMON_UP] = "Up",
	[DAEMON_UNRESPONSIVE] = "Unresponsive",
};
153
154 struct daemon {
155 const char *name;
156 enum daemon_state state;
157 int fd;
158 struct timeval echo_sent;
159 unsigned int connect_tries;
160 struct thread *t_wakeup;
161 struct thread *t_read;
162 struct thread *t_write;
163 struct daemon *next;
164 struct restart_info restart;
165
166 /*
167 * For a given daemon, if we've turned on ignore timeouts
168 * ignore the timeout value and assume everything is ok
169 * This is for daemon debugging w/ gdb after we have started
170 * FRR and realize we have something that needs to be looked
171 * at
172 */
173 bool ignore_timeout;
174 };
175
/* Return codes for long options that have no single-letter form. */
#define OPTION_MINRESTART 2000
#define OPTION_MAXRESTART 2001
#define OPTION_DRY 2002
#define OPTION_NETNS 2003

/* Long command-line options; the table ends with an all-zero entry. */
static const struct option longopts[] = {
	{"daemon", no_argument, NULL, 'd'},
	{"statedir", required_argument, NULL, 'S'},
	{"loglevel", required_argument, NULL, 'l'},
	{"interval", required_argument, NULL, 'i'},
	{"timeout", required_argument, NULL, 't'},
	{"restart-timeout", required_argument, NULL, 'T'},
	{"restart", required_argument, NULL, 'r'},
	{"start-command", required_argument, NULL, 's'},
	{"kill-command", required_argument, NULL, 'k'},
	{"dry", no_argument, NULL, OPTION_DRY},
	{"min-restart-interval", required_argument, NULL, OPTION_MINRESTART},
	{"max-restart-interval", required_argument, NULL, OPTION_MAXRESTART},
	{"pid-file", required_argument, NULL, 'p'},
	{"blank-string", required_argument, NULL, 'b'},
#ifdef GNU_LINUX
	/* The namespace name is optional. */
	{"netns", optional_argument, NULL, OPTION_NETNS},
#endif
	{"help", no_argument, NULL, 'h'},
	{"version", no_argument, NULL, 'v'},
	{NULL, 0, NULL, 0},
};
202
/* Forward declarations for functions that are used before they are defined. */
static void phase_check(void);
static void restart_done(struct daemon *dmn);
static int try_connect(struct daemon *dmn);
static void try_restart(struct daemon *dmn);
static void wakeup_send_echo(struct thread *t_wakeup);

/* Program name printed in the usage text. */
static const char *progname;
210
211 void watchfrr_set_ignore_daemon(struct vty *vty, const char *dname, bool ignore)
212 {
213 struct daemon *dmn;
214
215 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
216 if (strncmp(dmn->name, dname, strlen(dmn->name)) == 0)
217 break;
218 }
219
220 if (dmn) {
221 dmn->ignore_timeout = ignore;
222 vty_out(vty, "%s switching to %s\n", dmn->name,
223 ignore ? "ignore" : "watch");
224 } else
225 vty_out(vty, "%s is not configured for running at the moment",
226 dname);
227 }
228
229 static void printhelp(FILE *target)
230 {
231 fprintf(target,
232 "Usage : %s [OPTION...] <daemon name> ...\n\n\
233 Watchdog program to monitor status of frr daemons and try to restart\n\
234 them if they are down or unresponsive. It determines whether a daemon is\n\
235 up based on whether it can connect to the daemon's vty unix stream socket.\n\
236 It then repeatedly sends echo commands over that socket to determine whether\n\
237 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
238 on the socket connection and know immediately that the daemon is down.\n\n\
239 The daemons to be monitored should be listed on the command line.\n\n\
240 In order to avoid attempting to restart the daemons in a fast loop,\n\
241 the -m and -M options allow you to control the minimum delay between\n\
242 restart commands. The minimum restart delay is recalculated each time\n\
243 a restart is attempted: if the time since the last restart attempt exceeds\n\
244 twice the -M value, then the restart delay is set to the -m value.\n\
245 Otherwise, the interval is doubled (but capped at the -M value).\n\n",
246 progname);
247
248 fprintf(target,
249 "Options:\n\
250 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
251 to syslog instead of stdout.\n\
252 -S, --statedir Set the vty socket directory (default is %s)\n\
253 -N, --pathspace Insert prefix into config & socket paths\n"
254 #ifdef GNU_LINUX
255 " --netns Create and/or use Linux network namespace. If no name is\n"
256 " given, uses the value from `-N`.\n"
257 #endif
258 "-l, --loglevel Set the logging level (default is %d).\n\
259 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
260 but it can be set higher than %d if extra-verbose debugging\n\
261 messages are desired.\n\
262 --min-restart-interval\n\
263 Set the minimum seconds to wait between invocations of daemon\n\
264 restart commands (default is %d).\n\
265 --max-restart-interval\n\
266 Set the maximum seconds to wait between invocations of daemon\n\
267 restart commands (default is %d).\n\
268 -i, --interval Set the status polling interval in seconds (default is %d)\n\
269 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
270 -T, --restart-timeout\n\
271 Set the restart (kill) timeout in seconds (default is %d).\n\
272 If any background jobs are still running after this much\n\
273 time has elapsed, they will be killed.\n\
274 -r, --restart Supply a Bourne shell command to use to restart a single\n\
275 daemon. The command string should include '%%s' where the\n\
276 name of the daemon should be substituted.\n\
277 (default: '%s')\n\
278 -s, --start-command\n\
279 Supply a Bourne shell to command to use to start a single\n\
280 daemon. The command string should include '%%s' where the\n\
281 name of the daemon should be substituted.\n\
282 (default: '%s')\n\
283 -k, --kill-command\n\
284 Supply a Bourne shell to command to use to stop a single\n\
285 daemon. The command string should include '%%s' where the\n\
286 name of the daemon should be substituted.\n\
287 (default: '%s')\n\
288 --dry Do not start or restart anything, just log.\n\
289 -p, --pid-file Set process identifier file name\n\
290 (default is %s/watchfrr.pid).\n\
291 -b, --blank-string\n\
292 When the supplied argument string is found in any of the\n\
293 various shell command arguments (-r, -s, or -k), replace\n\
294 it with a space. This is an ugly hack to circumvent problems\n\
295 passing command-line arguments with embedded spaces.\n\
296 -v, --version Print program version\n\
297 -h, --help Display this help and exit\n",
298 frr_vtydir, DEFAULT_LOGLEVEL, LOG_EMERG, LOG_DEBUG, LOG_DEBUG,
299 DEFAULT_MIN_RESTART, DEFAULT_MAX_RESTART, DEFAULT_PERIOD,
300 DEFAULT_TIMEOUT, DEFAULT_RESTART_TIMEOUT,
301 DEFAULT_RESTART_CMD, DEFAULT_START_CMD, DEFAULT_STOP_CMD,
302 frr_vtydir);
303 }
304
305 static pid_t run_background(char *shell_cmd)
306 {
307 pid_t child;
308
309 switch (child = fork()) {
310 case -1:
311 flog_err_sys(EC_LIB_SYSTEM_CALL,
312 "fork failed, cannot run command [%s]: %s",
313 shell_cmd, safe_strerror(errno));
314 return -1;
315 case 0:
316 /* Child process. */
317 /* Use separate process group so child processes can be killed
318 * easily. */
319 if (setpgid(0, 0) < 0)
320 zlog_warn("setpgid(0,0) failed: %s",
321 safe_strerror(errno));
322 {
323 char shell[] = "sh";
324 char dashc[] = "-c";
325 char *const argv[4] = {shell, dashc, shell_cmd, NULL};
326 execv("/bin/sh", argv);
327 flog_err_sys(EC_LIB_SYSTEM_CALL,
328 "execv(/bin/sh -c '%s') failed: %s",
329 shell_cmd, safe_strerror(errno));
330 _exit(127);
331 }
332 default:
333 /* Parent process: we will reap the child later. */
334 zlog_info("Forked background command [pid %d]: %s", (int)child,
335 shell_cmd);
336 return child;
337 }
338 }
339
/*
 * Store in `result` the wall-clock time that has passed since `start_time`
 * and return `result`, so the call can be used inside an expression.
 */
static struct timeval *time_elapsed(struct timeval *result,
				    const struct timeval *start_time)
{
	gettimeofday(result, NULL);

	result->tv_sec -= start_time->tv_sec;
	result->tv_usec -= start_time->tv_usec;

	/* Borrow whole seconds until the microsecond part is non-negative. */
	for (; result->tv_usec < 0; result->tv_sec--)
		result->tv_usec += 1000000L;

	return result;
}
352
353 static void restart_kill(struct thread *t_kill)
354 {
355 struct restart_info *restart = THREAD_ARG(t_kill);
356 struct timeval delay;
357
358 time_elapsed(&delay, &restart->time);
359 zlog_warn(
360 "%s %s child process %d still running after %ld seconds, sending signal %d",
361 restart->what, restart->name, (int)restart->pid,
362 (long)delay.tv_sec, (restart->kills ? SIGKILL : SIGTERM));
363 kill(-restart->pid, (restart->kills ? SIGKILL : SIGTERM));
364 restart->kills++;
365 thread_add_timer(master, restart_kill, restart, gs.restart_timeout,
366 &restart->t_kill);
367 }
368
369 static struct restart_info *find_child(pid_t child)
370 {
371 struct daemon *dmn;
372 if (gs.restart.pid == child)
373 return &gs.restart;
374
375 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
376 if (dmn->restart.pid == child)
377 return &dmn->restart;
378 }
379 return NULL;
380 }
381
382 static void sigchild(void)
383 {
384 pid_t child;
385 int status;
386 const char *name;
387 const char *what;
388 struct restart_info *restart;
389 struct daemon *dmn;
390
391 switch (child = waitpid(-1, &status, WNOHANG)) {
392 case -1:
393 flog_err_sys(EC_LIB_SYSTEM_CALL, "waitpid failed: %s",
394 safe_strerror(errno));
395 return;
396 case 0:
397 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
398 return;
399 }
400
401 if (child == integrated_write_pid) {
402 integrated_write_sigchld(status);
403 return;
404 }
405
406 if ((restart = find_child(child)) != NULL) {
407 name = restart->name;
408 what = restart->what;
409 restart->pid = 0;
410 gs.numpids--;
411 thread_cancel(&restart->t_kill);
412
413 /* Update restart time to reflect the time the command
414 * completed. */
415 gettimeofday(&restart->time, NULL);
416 } else {
417 flog_err_sys(
418 EC_LIB_SYSTEM_CALL,
419 "waitpid returned status for an unknown child process %d",
420 (int)child);
421 name = "(unknown)";
422 what = "background";
423 }
424 if (WIFSTOPPED(status))
425 zlog_warn("%s %s process %d is stopped", what, name,
426 (int)child);
427 else if (WIFSIGNALED(status))
428 zlog_warn("%s %s process %d terminated due to signal %d", what,
429 name, (int)child, WTERMSIG(status));
430 else if (WIFEXITED(status)) {
431 if (WEXITSTATUS(status) != 0)
432 zlog_warn(
433 "%s %s process %d exited with non-zero status %d",
434 what, name, (int)child, WEXITSTATUS(status));
435 else {
436 zlog_debug("%s %s process %d exited normally", what,
437 name, (int)child);
438
439 if (restart && restart != &gs.restart) {
440 dmn = container_of(restart, struct daemon,
441 restart);
442 restart_done(dmn);
443 } else if (restart)
444 for (dmn = gs.daemons; dmn; dmn = dmn->next)
445 restart_done(dmn);
446 }
447 } else
448 flog_err_sys(
449 EC_LIB_SYSTEM_CALL,
450 "cannot interpret %s %s process %d wait status 0x%x",
451 what, name, (int)child, status);
452 phase_check();
453 }
454
455 static int run_job(struct restart_info *restart, const char *cmdtype,
456 const char *command, int force, int update_interval)
457 {
458 struct timeval delay;
459
460 if (gs.loglevel > LOG_DEBUG + 1)
461 zlog_debug("attempting to %s %s", cmdtype, restart->name);
462
463 if (restart->pid) {
464 if (gs.loglevel > LOG_DEBUG + 1)
465 zlog_debug(
466 "cannot %s %s, previous pid %d still running",
467 cmdtype, restart->name, (int)restart->pid);
468 return -1;
469 }
470
471 char buffer[512];
472
473 snprintf(buffer, sizeof(buffer), "restarting %s", restart->name);
474 systemd_send_status(buffer);
475
476 /* Note: time_elapsed test must come before the force test, since we
477 need
478 to make sure that delay is initialized for use below in updating the
479 restart interval. */
480 if ((time_elapsed(&delay, &restart->time)->tv_sec < restart->interval)
481 && !force) {
482
483 if (gs.loglevel > LOG_DEBUG + 1)
484 zlog_debug(
485 "postponing %s %s: elapsed time %ld < retry interval %ld",
486 cmdtype, restart->name, (long)delay.tv_sec,
487 restart->interval);
488 return -1;
489 }
490
491 gettimeofday(&restart->time, NULL);
492 restart->kills = 0;
493 {
494 char cmd[strlen(command) + strlen(restart->name) + 1];
495 snprintf(cmd, sizeof(cmd), command, restart->name);
496 if ((restart->pid = run_background(cmd)) > 0) {
497 thread_add_timer(master, restart_kill, restart,
498 gs.restart_timeout, &restart->t_kill);
499 restart->what = cmdtype;
500 gs.numpids++;
501 } else
502 restart->pid = 0;
503 }
504
505 systemd_send_status("FRR Operational");
506
507 /* Calculate the new restart interval. */
508 if (update_interval) {
509 if (delay.tv_sec > 2 * gs.max_restart_interval)
510 restart->interval = gs.min_restart_interval;
511 else if ((restart->interval *= 2) > gs.max_restart_interval)
512 restart->interval = gs.max_restart_interval;
513 if (gs.loglevel > LOG_DEBUG + 1)
514 zlog_debug("restart %s interval is now %ld",
515 restart->name, restart->interval);
516 }
517 return restart->pid;
518 }
519
/*
 * Scheduling helpers.  Each one clears the daemon's stored thread pointer and
 * then registers a new event with the event loop:
 *
 *   SET_READ_HANDLER        - handle_read() when the daemon's fd is readable
 *   SET_WAKEUP_DOWN         - wakeup_down() after a randomized gs.period
 *   SET_WAKEUP_UNRESPONSIVE - wakeup_unresponsive() after a randomized period
 *   SET_WAKEUP_ECHO         - wakeup_send_echo() after a randomized period
 *
 * The do { } while (0) wrapper carries no trailing semicolon, so that an
 * invocation followed by ';' is exactly one statement and may be used as the
 * body of an unbraced if/else.
 */
#define SET_READ_HANDLER(DMN)                                                  \
	do {                                                                   \
		(DMN)->t_read = NULL;                                          \
		thread_add_read(master, handle_read, (DMN), (DMN)->fd,         \
				&(DMN)->t_read);                               \
	} while (0)

#define SET_WAKEUP_DOWN(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_down, (DMN),              \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

#define SET_WAKEUP_UNRESPONSIVE(DMN)                                           \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_unresponsive, (DMN),      \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

#define SET_WAKEUP_ECHO(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_send_echo, (DMN),         \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)
547
548 static void wakeup_down(struct thread *t_wakeup)
549 {
550 struct daemon *dmn = THREAD_ARG(t_wakeup);
551
552 dmn->t_wakeup = NULL;
553 if (try_connect(dmn) < 0)
554 SET_WAKEUP_DOWN(dmn);
555 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
556 try_restart(dmn);
557 }
558
559 static void wakeup_init(struct thread *t_wakeup)
560 {
561 struct daemon *dmn = THREAD_ARG(t_wakeup);
562
563 dmn->t_wakeup = NULL;
564 if (try_connect(dmn) < 0) {
565 zlog_info(
566 "%s state -> down : initial connection attempt failed",
567 dmn->name);
568 dmn->state = DAEMON_DOWN;
569 }
570 phase_check();
571 }
572
573 static void restart_done(struct daemon *dmn)
574 {
575 if (dmn->state != DAEMON_DOWN) {
576 zlog_warn(
577 "Daemon: %s: is in %s state but expected it to be in DAEMON_DOWN state",
578 dmn->name, state_str[dmn->state]);
579 return;
580 }
581 THREAD_OFF(dmn->t_wakeup);
582
583 if (try_connect(dmn) < 0)
584 SET_WAKEUP_DOWN(dmn);
585 }
586
587 static void daemon_down(struct daemon *dmn, const char *why)
588 {
589 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
590 flog_err(EC_WATCHFRR_CONNECTION, "%s state -> down : %s",
591 dmn->name, why);
592 else if (gs.loglevel > LOG_DEBUG)
593 zlog_debug("%s still down : %s", dmn->name, why);
594 if (IS_UP(dmn))
595 gs.numdown++;
596 dmn->state = DAEMON_DOWN;
597 if (dmn->fd >= 0) {
598 close(dmn->fd);
599 dmn->fd = -1;
600 }
601 THREAD_OFF(dmn->t_read);
602 THREAD_OFF(dmn->t_write);
603 THREAD_OFF(dmn->t_wakeup);
604 if (try_connect(dmn) < 0)
605 SET_WAKEUP_DOWN(dmn);
606 phase_check();
607 }
608
609 static void handle_read(struct thread *t_read)
610 {
611 struct daemon *dmn = THREAD_ARG(t_read);
612 static const char resp[sizeof(PING_TOKEN) + 4] = PING_TOKEN "\n";
613 char buf[sizeof(resp) + 100];
614 ssize_t rc;
615 struct timeval delay;
616
617 dmn->t_read = NULL;
618 if ((rc = read(dmn->fd, buf, sizeof(buf))) < 0) {
619 char why[100];
620
621 if (ERRNO_IO_RETRY(errno)) {
622 /* Pretend it never happened. */
623 SET_READ_HANDLER(dmn);
624 return;
625 }
626 snprintf(why, sizeof(why), "unexpected read error: %s",
627 safe_strerror(errno));
628 daemon_down(dmn, why);
629 return;
630 }
631 if (rc == 0) {
632 daemon_down(dmn, "read returned EOF");
633 return;
634 }
635 if (!dmn->echo_sent.tv_sec) {
636 char why[sizeof(buf) + 100];
637 snprintf(why, sizeof(why),
638 "unexpected read returns %d bytes: %.*s", (int)rc,
639 (int)rc, buf);
640 daemon_down(dmn, why);
641 return;
642 }
643
644 /* We are expecting an echo response: is there any chance that the
645 response would not be returned entirely in the first read? That
646 seems inconceivable... */
647 if ((rc != sizeof(resp)) || memcmp(buf, resp, sizeof(resp))) {
648 char why[100 + sizeof(buf)];
649 snprintf(why, sizeof(why),
650 "read returned bad echo response of %d bytes (expecting %u): %.*s",
651 (int)rc, (unsigned int)sizeof(resp), (int)rc, buf);
652 daemon_down(dmn, why);
653 return;
654 }
655
656 time_elapsed(&delay, &dmn->echo_sent);
657 dmn->echo_sent.tv_sec = 0;
658 if (dmn->state == DAEMON_UNRESPONSIVE) {
659 if (delay.tv_sec < gs.timeout) {
660 dmn->state = DAEMON_UP;
661 zlog_warn(
662 "%s state -> up : echo response received after %ld.%06ld seconds",
663 dmn->name, (long)delay.tv_sec,
664 (long)delay.tv_usec);
665 } else
666 zlog_warn(
667 "%s: slow echo response finally received after %ld.%06ld seconds",
668 dmn->name, (long)delay.tv_sec,
669 (long)delay.tv_usec);
670 } else if (gs.loglevel > LOG_DEBUG + 1)
671 zlog_debug("%s: echo response received after %ld.%06ld seconds",
672 dmn->name, (long)delay.tv_sec, (long)delay.tv_usec);
673
674 SET_READ_HANDLER(dmn);
675 thread_cancel(&dmn->t_wakeup);
676 SET_WAKEUP_ECHO(dmn);
677 }
678
679 /*
680 * Wait till we notice that all daemons are ready before
681 * we send we are ready to systemd
682 */
683 static void daemon_send_ready(int exitcode)
684 {
685 FILE *fp;
686 static int sent = 0;
687 char started[1024];
688
689 if (sent)
690 return;
691
692 if (exitcode == 0)
693 zlog_notice("all daemons up, doing startup-complete notify");
694 else if (gs.numdown < gs.numdaemons)
695 flog_err(EC_WATCHFRR_CONNECTION,
696 "startup did not complete within timeout (%d/%d daemons running)",
697 gs.numdaemons - gs.numdown, gs.numdaemons);
698 else {
699 flog_err(EC_WATCHFRR_CONNECTION,
700 "all configured daemons failed to start -- exiting watchfrr");
701 exit(exitcode);
702
703 }
704
705 frr_detach();
706
707 snprintf(started, sizeof(started), "%s/%s", frr_vtydir,
708 "watchfrr.started");
709 fp = fopen(started, "w");
710 if (fp)
711 fclose(fp);
712
713 systemd_send_started(master);
714 systemd_send_status("FRR Operational");
715 sent = 1;
716 }
717
718 static void daemon_up(struct daemon *dmn, const char *why)
719 {
720 dmn->state = DAEMON_UP;
721 gs.numdown--;
722 dmn->connect_tries = 0;
723 zlog_notice("%s state -> up : %s", dmn->name, why);
724 if (gs.numdown == 0)
725 daemon_send_ready(0);
726 SET_WAKEUP_ECHO(dmn);
727 phase_check();
728 }
729
730 static void check_connect(struct thread *t_write)
731 {
732 struct daemon *dmn = THREAD_ARG(t_write);
733 int sockerr;
734 socklen_t reslen = sizeof(sockerr);
735
736 dmn->t_write = NULL;
737 if (getsockopt(dmn->fd, SOL_SOCKET, SO_ERROR, (char *)&sockerr, &reslen)
738 < 0) {
739 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn->name,
740 safe_strerror(errno));
741 daemon_down(dmn,
742 "getsockopt failed checking connection success");
743 return;
744 }
745 if ((reslen == sizeof(sockerr)) && sockerr) {
746 char why[100];
747 snprintf(
748 why, sizeof(why),
749 "getsockopt reports that connection attempt failed: %s",
750 safe_strerror(sockerr));
751 daemon_down(dmn, why);
752 return;
753 }
754
755 daemon_up(dmn, "delayed connect succeeded");
756 }
757
758 static void wakeup_connect_hanging(struct thread *t_wakeup)
759 {
760 struct daemon *dmn = THREAD_ARG(t_wakeup);
761 char why[100];
762
763 dmn->t_wakeup = NULL;
764 snprintf(why, sizeof(why),
765 "connection attempt timed out after %ld seconds", gs.timeout);
766 daemon_down(dmn, why);
767 }
768
769 /* Making connection to protocol daemon. */
770 static int try_connect(struct daemon *dmn)
771 {
772 int sock;
773 struct sockaddr_un addr;
774 socklen_t len;
775
776 if (gs.loglevel > LOG_DEBUG + 1)
777 zlog_debug("%s: attempting to connect", dmn->name);
778 dmn->connect_tries++;
779
780 memset(&addr, 0, sizeof(struct sockaddr_un));
781 addr.sun_family = AF_UNIX;
782 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty", gs.vtydir,
783 dmn->name);
784 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
785 len = addr.sun_len = SUN_LEN(&addr);
786 #else
787 len = sizeof(addr.sun_family) + strlen(addr.sun_path);
788 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
789
790 /* Quick check to see if we might succeed before we go to the trouble
791 of creating a socket. */
792 if (access(addr.sun_path, W_OK) < 0) {
793 if (errno != ENOENT)
794 flog_err_sys(EC_LIB_SYSTEM_CALL,
795 "%s: access to socket %s denied: %s",
796 dmn->name, addr.sun_path,
797 safe_strerror(errno));
798 return -1;
799 }
800
801 if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
802 flog_err_sys(EC_LIB_SOCKET, "%s(%s): cannot make socket: %s",
803 __func__, addr.sun_path, safe_strerror(errno));
804 return -1;
805 }
806
807 if (set_nonblocking(sock) < 0 || set_cloexec(sock) < 0) {
808 flog_err_sys(EC_LIB_SYSTEM_CALL,
809 "%s(%s): set_nonblocking/cloexec(%d) failed",
810 __func__, addr.sun_path, sock);
811 close(sock);
812 return -1;
813 }
814
815 if (connect(sock, (struct sockaddr *)&addr, len) < 0) {
816 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK)) {
817 if (gs.loglevel > LOG_DEBUG)
818 zlog_debug("%s(%s): connect failed: %s",
819 __func__, addr.sun_path,
820 safe_strerror(errno));
821 close(sock);
822 return -1;
823 }
824 if (gs.loglevel > LOG_DEBUG)
825 zlog_debug("%s: connection in progress", dmn->name);
826 dmn->state = DAEMON_CONNECTING;
827 dmn->fd = sock;
828 thread_add_write(master, check_connect, dmn, dmn->fd,
829 &dmn->t_write);
830 thread_add_timer(master, wakeup_connect_hanging, dmn,
831 gs.timeout, &dmn->t_wakeup);
832 SET_READ_HANDLER(dmn);
833 return 0;
834 }
835
836 dmn->fd = sock;
837 SET_READ_HANDLER(dmn);
838 daemon_up(dmn, "connect succeeded");
839 return 1;
840 }
841
842 static void phase_hanging(struct thread *t_hanging)
843 {
844 gs.t_phase_hanging = NULL;
845 flog_err(EC_WATCHFRR_CONNECTION,
846 "Phase [%s] hanging for %ld seconds, aborting phased restart",
847 phase_str[gs.phase], PHASE_TIMEOUT);
848 gs.phase = PHASE_NONE;
849 }
850
851 static void set_phase(enum restart_phase new_phase)
852 {
853 gs.phase = new_phase;
854 thread_cancel(&gs.t_phase_hanging);
855
856 thread_add_timer(master, phase_hanging, NULL, PHASE_TIMEOUT,
857 &gs.t_phase_hanging);
858 }
859
860 static void phase_check(void)
861 {
862 struct daemon *dmn;
863
864 switch (gs.phase) {
865 case PHASE_NONE:
866 break;
867
868 case PHASE_INIT:
869 for (dmn = gs.daemons; dmn; dmn = dmn->next)
870 if (dmn->state == DAEMON_INIT)
871 return;
872
873 /* startup complete, everything out of INIT */
874 gs.phase = PHASE_NONE;
875 for (dmn = gs.daemons; dmn; dmn = dmn->next)
876 if (dmn->state == DAEMON_DOWN) {
877 SET_WAKEUP_DOWN(dmn);
878 try_restart(dmn);
879 }
880 break;
881 case PHASE_STOPS_PENDING:
882 if (gs.numpids)
883 break;
884 zlog_info(
885 "Phased restart: all routing daemon stop jobs have completed.");
886 set_phase(PHASE_WAITING_DOWN);
887
888 /*FALLTHRU*/
889 case PHASE_WAITING_DOWN:
890 if (gs.numdown + IS_UP(gs.special) < gs.numdaemons)
891 break;
892 zlog_info("Phased restart: all routing daemons now down.");
893 run_job(&gs.special->restart, "restart", gs.restart_command, 1,
894 1);
895 set_phase(PHASE_ZEBRA_RESTART_PENDING);
896
897 /*FALLTHRU*/
898 case PHASE_ZEBRA_RESTART_PENDING:
899 if (gs.special->restart.pid)
900 break;
901 zlog_info("Phased restart: %s restart job completed.",
902 gs.special->name);
903 set_phase(PHASE_WAITING_ZEBRA_UP);
904
905 /*FALLTHRU*/
906 case PHASE_WAITING_ZEBRA_UP:
907 if (!IS_UP(gs.special))
908 break;
909 zlog_info("Phased restart: %s is now up.", gs.special->name);
910 {
911 struct daemon *dmn;
912 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
913 if (dmn != gs.special)
914 run_job(&dmn->restart, "start",
915 gs.start_command, 1, 0);
916 }
917 }
918 gs.phase = PHASE_NONE;
919 THREAD_OFF(gs.t_phase_hanging);
920 zlog_notice("Phased global restart has completed.");
921 break;
922 }
923 }
924
925 static void try_restart(struct daemon *dmn)
926 {
927 if (watch_only)
928 return;
929
930 if (dmn != gs.special) {
931 if ((gs.special->state == DAEMON_UP)
932 && (gs.phase == PHASE_NONE))
933 run_job(&dmn->restart, "restart", gs.restart_command, 0,
934 1);
935 else
936 zlog_debug(
937 "%s: postponing restart attempt because master %s daemon not up [%s], or phased restart in progress",
938 dmn->name, gs.special->name,
939 state_str[gs.special->state]);
940 return;
941 }
942
943 if ((gs.phase != PHASE_NONE) || gs.numpids) {
944 if (gs.loglevel > LOG_DEBUG + 1)
945 zlog_debug(
946 "postponing phased global restart: restart already in progress [%s], or outstanding child processes [%d]",
947 phase_str[gs.phase], gs.numpids);
948 return;
949 }
950 /* Is it too soon for a restart? */
951 {
952 struct timeval delay;
953 if (time_elapsed(&delay, &gs.special->restart.time)->tv_sec
954 < gs.special->restart.interval) {
955 if (gs.loglevel > LOG_DEBUG + 1)
956 zlog_debug(
957 "postponing phased global restart: elapsed time %ld < retry interval %ld",
958 (long)delay.tv_sec,
959 gs.special->restart.interval);
960 return;
961 }
962 }
963 run_job(&gs.restart, "restart", gs.restart_command, 0, 1);
964 }
965
966 static void wakeup_unresponsive(struct thread *t_wakeup)
967 {
968 struct daemon *dmn = THREAD_ARG(t_wakeup);
969
970 dmn->t_wakeup = NULL;
971 if (dmn->state != DAEMON_UNRESPONSIVE)
972 flog_err(EC_WATCHFRR_CONNECTION,
973 "%s: no longer unresponsive (now %s), wakeup should have been cancelled!",
974 dmn->name, state_str[dmn->state]);
975 else {
976 SET_WAKEUP_UNRESPONSIVE(dmn);
977 try_restart(dmn);
978 }
979 }
980
981 static void wakeup_no_answer(struct thread *t_wakeup)
982 {
983 struct daemon *dmn = THREAD_ARG(t_wakeup);
984
985 dmn->t_wakeup = NULL;
986 dmn->state = DAEMON_UNRESPONSIVE;
987 if (dmn->ignore_timeout)
988 return;
989 flog_err(EC_WATCHFRR_CONNECTION,
990 "%s state -> unresponsive : no response yet to ping sent %ld seconds ago",
991 dmn->name, gs.timeout);
992 SET_WAKEUP_UNRESPONSIVE(dmn);
993 try_restart(dmn);
994 }
995
996 static void wakeup_send_echo(struct thread *t_wakeup)
997 {
998 static const char echocmd[] = "echo " PING_TOKEN;
999 ssize_t rc;
1000 struct daemon *dmn = THREAD_ARG(t_wakeup);
1001
1002 dmn->t_wakeup = NULL;
1003 if (((rc = write(dmn->fd, echocmd, sizeof(echocmd))) < 0)
1004 || ((size_t)rc != sizeof(echocmd))) {
1005 char why[100 + sizeof(echocmd)];
1006 snprintf(why, sizeof(why),
1007 "write '%s' returned %d instead of %u", echocmd,
1008 (int)rc, (unsigned int)sizeof(echocmd));
1009 daemon_down(dmn, why);
1010 } else {
1011 gettimeofday(&dmn->echo_sent, NULL);
1012 thread_add_timer(master, wakeup_no_answer, dmn, gs.timeout,
1013 &dmn->t_wakeup);
1014 }
1015 }
1016
1017 bool check_all_up(void)
1018 {
1019 struct daemon *dmn;
1020
1021 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1022 if (dmn->state != DAEMON_UP)
1023 return false;
1024 return true;
1025 }
1026
1027 void watchfrr_status(struct vty *vty)
1028 {
1029 struct daemon *dmn;
1030 struct timeval delay;
1031
1032 vty_out(vty, "watchfrr global phase: %s\n", phase_str[gs.phase]);
1033 if (gs.restart.pid)
1034 vty_out(vty, " global restart running, pid %ld\n",
1035 (long)gs.restart.pid);
1036
1037 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1038 vty_out(vty, " %-20s %s%s", dmn->name, state_str[dmn->state],
1039 dmn->ignore_timeout ? "/Ignoring Timeout\n" : "\n");
1040 if (dmn->restart.pid)
1041 vty_out(vty, " restart running, pid %ld\n",
1042 (long)dmn->restart.pid);
1043 else if (dmn->state == DAEMON_DOWN &&
1044 time_elapsed(&delay, &dmn->restart.time)->tv_sec
1045 < dmn->restart.interval)
1046 vty_out(vty, " restarting in %jd seconds (%jds backoff interval)\n",
1047 (intmax_t)dmn->restart.interval
1048 - (intmax_t)delay.tv_sec,
1049 (intmax_t)dmn->restart.interval);
1050 }
1051 }
1052
/*
 * Termination handler, installed for both SIGINT and SIGTERM in
 * watchfrr_signals[]: log the shutdown, report "stopping" via
 * systemd_send_stopping() and exit with status 0.
 */
static void sigint(void)
{
	zlog_notice("Terminating on signal");
	systemd_send_stopping();

	exit(EXIT_SUCCESS);
}
1059
/*
 * Validate a command template.
 *
 * A template is accepted when it contains exactly one '%' character and
 * that character is immediately followed by 's'.
 *
 * Returns 1 for a valid template, 0 otherwise (including cmd == NULL).
 */
static int valid_command(const char *cmd)
{
	const char *pct;

	if (!cmd)
		return 0;

	pct = strchr(cmd, '%');
	if (!pct)
		return 0;

	if (pct[1] != 's')
		return 0;

	/* no further '%' may follow the "%s" */
	return strchr(pct + 1, '%') == NULL;
}
1070
/*
 * This is an ugly hack to circumvent problems with passing command-line
 * arguments that contain spaces.  The fix is to use a configuration file.
 *
 * Returns a newly strdup()'d copy of cmd in which every non-overlapping
 * occurrence of blankstr (scanning left to right) is replaced by a single
 * space.  The process exits with status 1 if the copy cannot be allocated.
 *
 * An empty blankstr leaves the copy unchanged: strstr() matches an empty
 * needle at every position, so there is nothing meaningful to substitute,
 * and attempting it would never terminate and would grow the string past
 * its allocation.
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	char *res;
	char *p;
	size_t bslen = strlen(blankstr);

	if (!(res = strdup(cmd))) {
		perror("strdup");
		exit(1);
	}

	if (bslen == 0)
		return res;

	p = res;
	while ((p = strstr(p, blankstr)) != NULL) {
		*p = ' ';
		if (bslen != 1)
			/* close the gap left by the rest of the token,
			 * dragging the terminating NUL along
			 */
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
		/*
		 * Resume after the space just written; rescanning from the
		 * start would loop forever when blankstr itself contains a
		 * space.
		 */
		p++;
	}
	return res;
}
1090
/*
 * Timer callback scheduled by watchfrr_init() STARTUP_TIMEOUT milliseconds
 * after start; it calls daemon_send_ready() with argument 1.
 */
static void startup_timeout(struct thread *t_wakeup)
{
	(void)t_wakeup; /* callback argument is not needed here */

	daemon_send_ready(1);
}
1095
1096 #ifdef GNU_LINUX
1097
1098 #include <sys/mount.h>
1099 #include <sched.h>
1100
1101 #define NETNS_RUN_DIR "/var/run/netns"
1102
1103 static void netns_create(int dirfd, const char *nsname)
1104 {
1105 /* make /var/run/netns shared between mount namespaces
1106 * just like iproute2 sets it up
1107 */
1108 if (mount("", NETNS_RUN_DIR, "none", MS_SHARED | MS_REC, NULL)) {
1109 if (errno != EINVAL) {
1110 perror("mount");
1111 exit(1);
1112 }
1113
1114 if (mount(NETNS_RUN_DIR, NETNS_RUN_DIR, "none",
1115 MS_BIND | MS_REC, NULL)) {
1116 perror("mount");
1117 exit(1);
1118 }
1119
1120 if (mount("", NETNS_RUN_DIR, "none", MS_SHARED | MS_REC,
1121 NULL)) {
1122 perror("mount");
1123 exit(1);
1124 }
1125 }
1126
1127 /* need an empty file to mount on top of */
1128 int nsfd = openat(dirfd, nsname, O_CREAT | O_RDONLY | O_EXCL, 0);
1129
1130 if (nsfd < 0) {
1131 fprintf(stderr, "failed to create \"%s/%s\": %s\n",
1132 NETNS_RUN_DIR, nsname, strerror(errno));
1133 exit(1);
1134 }
1135 close(nsfd);
1136
1137 if (unshare(CLONE_NEWNET)) {
1138 perror("unshare");
1139 unlinkat(dirfd, nsname, 0);
1140 exit(1);
1141 }
1142
1143 char *dstpath = asprintfrr(MTYPE_TMP, "%s/%s", NETNS_RUN_DIR, nsname);
1144
1145 /* bind-mount so the namespace has a name and is persistent */
1146 if (mount("/proc/self/ns/net", dstpath, "none", MS_BIND, NULL) < 0) {
1147 fprintf(stderr, "failed to bind-mount netns to \"%s\": %s\n",
1148 dstpath, strerror(errno));
1149 unlinkat(dirfd, nsname, 0);
1150 exit(1);
1151 }
1152
1153 XFREE(MTYPE_TMP, dstpath);
1154 }
1155
1156 static void netns_setup(const char *nsname)
1157 {
1158 int dirfd, nsfd;
1159
1160 dirfd = open(NETNS_RUN_DIR, O_DIRECTORY | O_RDONLY);
1161 if (dirfd < 0) {
1162 if (errno == ENOTDIR) {
1163 fprintf(stderr, "error: \"%s\" is not a directory!\n",
1164 NETNS_RUN_DIR);
1165 exit(1);
1166 } else if (errno == ENOENT) {
1167 if (mkdir(NETNS_RUN_DIR, 0755)) {
1168 fprintf(stderr, "error: \"%s\": mkdir: %s\n",
1169 NETNS_RUN_DIR, strerror(errno));
1170 exit(1);
1171 }
1172 dirfd = open(NETNS_RUN_DIR, O_DIRECTORY | O_RDONLY);
1173 if (dirfd < 0) {
1174 fprintf(stderr, "error: \"%s\": opendir: %s\n",
1175 NETNS_RUN_DIR, strerror(errno));
1176 exit(1);
1177 }
1178 } else {
1179 fprintf(stderr, "error: \"%s\": %s\n",
1180 NETNS_RUN_DIR, strerror(errno));
1181 exit(1);
1182 }
1183 }
1184
1185 nsfd = openat(dirfd, nsname, O_RDONLY);
1186 if (nsfd < 0 && errno != ENOENT) {
1187 fprintf(stderr, "error: \"%s/%s\": %s\n",
1188 NETNS_RUN_DIR, nsname, strerror(errno));
1189 exit(1);
1190 }
1191 if (nsfd < 0)
1192 netns_create(dirfd, nsname);
1193 else {
1194 if (setns(nsfd, CLONE_NEWNET)) {
1195 perror("setns");
1196 exit(1);
1197 }
1198 close(nsfd);
1199 }
1200 close(dirfd);
1201
1202 /* make sure loopback is up... weird things happen otherwise.
1203 * ioctl is perfectly fine for this, don't need netlink...
1204 */
1205 int sockfd;
1206 struct ifreq ifr = { };
1207
1208 strlcpy(ifr.ifr_name, "lo", sizeof(ifr.ifr_name));
1209
1210 sockfd = socket(AF_INET, SOCK_DGRAM, 0);
1211 if (sockfd < 0) {
1212 perror("socket");
1213 exit(1);
1214 }
1215 if (ioctl(sockfd, SIOCGIFFLAGS, &ifr)) {
1216 perror("ioctl(SIOCGIFFLAGS, \"lo\")");
1217 exit(1);
1218 }
1219 if (!(ifr.ifr_flags & IFF_UP)) {
1220 ifr.ifr_flags |= IFF_UP;
1221 if (ioctl(sockfd, SIOCSIFFLAGS, &ifr)) {
1222 perror("ioctl(SIOCSIFFLAGS, \"lo\")");
1223 exit(1);
1224 }
1225 }
1226 close(sockfd);
1227 }
1228
1229 #else /* !GNU_LINUX */
1230
/*
 * Stub used when GNU_LINUX is not defined: report that network namespaces
 * are unavailable and exit with status 1.
 */
static void netns_setup(const char *nsname)
{
	fputs("network namespaces are only available on Linux\n", stderr);
	exit(1);
}
1236 #endif
1237
1238 static void watchfrr_init(int argc, char **argv)
1239 {
1240 const char *special = "zebra";
1241 int i;
1242 struct daemon *dmn, **add = &gs.daemons;
1243 char alldaemons[512] = "", *p = alldaemons;
1244
1245 thread_add_timer_msec(master, startup_timeout, NULL, STARTUP_TIMEOUT,
1246 &gs.t_startup_timeout);
1247
1248 for (i = optind; i < argc; i++) {
1249 dmn = XCALLOC(MTYPE_WATCHFRR_DAEMON, sizeof(*dmn));
1250
1251 dmn->name = dmn->restart.name = argv[i];
1252 dmn->state = DAEMON_INIT;
1253 gs.numdaemons++;
1254 gs.numdown++;
1255 dmn->fd = -1;
1256 thread_add_timer_msec(master, wakeup_init, dmn, 0,
1257 &dmn->t_wakeup);
1258 dmn->restart.interval = gs.min_restart_interval;
1259 *add = dmn;
1260 add = &dmn->next;
1261
1262 if (!strcmp(dmn->name, special))
1263 gs.special = dmn;
1264 }
1265
1266 if (!gs.daemons) {
1267 fprintf(stderr,
1268 "Must specify one or more daemons to monitor.\n\n");
1269 frr_help_exit(1);
1270 }
1271 if (!watch_only && !gs.special) {
1272 fprintf(stderr, "\"%s\" daemon must be in daemon lists\n\n",
1273 special);
1274 frr_help_exit(1);
1275 }
1276
1277 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1278 snprintf(p, alldaemons + sizeof(alldaemons) - p, "%s%s",
1279 (p == alldaemons) ? "" : " ", dmn->name);
1280 p += strlen(p);
1281 }
1282 zlog_notice("%s %s watching [%s]%s", progname, FRR_VERSION, alldaemons,
1283 watch_only ? ", monitor mode" : "");
1284 }
1285
1286 struct zebra_privs_t watchfrr_privs = {
1287 #ifdef VTY_GROUP
1288 .vty_group = VTY_GROUP,
1289 #endif
1290 };
1291
1292 static struct frr_signal_t watchfrr_signals[] = {
1293 {
1294 .signal = SIGINT,
1295 .handler = sigint,
1296 },
1297 {
1298 .signal = SIGTERM,
1299 .handler = sigint,
1300 },
1301 {
1302 .signal = SIGCHLD,
1303 .handler = sigchild,
1304 },
1305 };
1306
1307 FRR_DAEMON_INFO(watchfrr, WATCHFRR,
1308 .flags = FRR_NO_PRIVSEP | FRR_NO_TCPVTY | FRR_LIMITED_CLI
1309 | FRR_NO_CFG_PID_DRY | FRR_NO_ZCLIENT
1310 | FRR_DETACH_LATER,
1311
1312 .printhelp = printhelp,
1313 .copyright = "Copyright 2004 Andrew J. Schorr",
1314
1315 .signals = watchfrr_signals,
1316 .n_signals = array_size(watchfrr_signals),
1317
1318 .privs = &watchfrr_privs,
1319 );
1320
1321 #define DEPRECATED_OPTIONS "aAezR:"
1322
1323 int main(int argc, char **argv)
1324 {
1325 int opt;
1326 const char *blankstr = NULL;
1327 const char *netns = NULL;
1328 bool netns_en = false;
1329
1330 frr_preinit(&watchfrr_di, argc, argv);
1331 progname = watchfrr_di.progname;
1332
1333 frr_opt_add("b:di:k:l:N:p:r:S:s:t:T:" DEPRECATED_OPTIONS, longopts, "");
1334
1335 gs.restart.name = "all";
1336 while ((opt = frr_getopt(argc, argv, NULL)) != EOF) {
1337 if (opt && opt < 128 && strchr(DEPRECATED_OPTIONS, opt)) {
1338 fprintf(stderr,
1339 "The -%c option no longer exists.\n"
1340 "Please refer to the watchfrr(8) man page.\n",
1341 opt);
1342 exit(1);
1343 }
1344
1345 switch (opt) {
1346 case 0:
1347 break;
1348 case 'b':
1349 blankstr = optarg;
1350 break;
1351 case OPTION_DRY:
1352 watch_only = true;
1353 break;
1354 case 'k':
1355 if (!valid_command(optarg)) {
1356 fprintf(stderr,
1357 "Invalid kill command, must contain '%%s': %s\n",
1358 optarg);
1359 frr_help_exit(1);
1360 }
1361 gs.stop_command = optarg;
1362 break;
1363 case 'l': {
1364 char garbage[3];
1365 if ((sscanf(optarg, "%d%1s", &gs.loglevel, garbage)
1366 != 1)
1367 || (gs.loglevel < LOG_EMERG)) {
1368 fprintf(stderr,
1369 "Invalid loglevel argument: %s\n",
1370 optarg);
1371 frr_help_exit(1);
1372 }
1373 } break;
1374 case OPTION_MINRESTART: {
1375 char garbage[3];
1376 if ((sscanf(optarg, "%ld%1s", &gs.min_restart_interval,
1377 garbage)
1378 != 1)
1379 || (gs.min_restart_interval < 0)) {
1380 fprintf(stderr,
1381 "Invalid min_restart_interval argument: %s\n",
1382 optarg);
1383 frr_help_exit(1);
1384 }
1385 } break;
1386 case OPTION_MAXRESTART: {
1387 char garbage[3];
1388 if ((sscanf(optarg, "%ld%1s", &gs.max_restart_interval,
1389 garbage)
1390 != 1)
1391 || (gs.max_restart_interval < 0)) {
1392 fprintf(stderr,
1393 "Invalid max_restart_interval argument: %s\n",
1394 optarg);
1395 frr_help_exit(1);
1396 }
1397 } break;
1398 case OPTION_NETNS:
1399 netns_en = true;
1400 if (optarg && strchr(optarg, '/')) {
1401 fprintf(stderr,
1402 "invalid network namespace name \"%s\" (may not contain slashes)\n",
1403 optarg);
1404 frr_help_exit(1);
1405 }
1406 netns = optarg;
1407 break;
1408 case 'i': {
1409 char garbage[3];
1410 int period;
1411 if ((sscanf(optarg, "%d%1s", &period, garbage) != 1)
1412 || (gs.period < 1)) {
1413 fprintf(stderr,
1414 "Invalid interval argument: %s\n",
1415 optarg);
1416 frr_help_exit(1);
1417 }
1418 gs.period = 1000 * period;
1419 } break;
1420 case 'p':
1421 watchfrr_di.pid_file = optarg;
1422 break;
1423 case 'r':
1424 if (!valid_command(optarg)) {
1425 fprintf(stderr,
1426 "Invalid restart command, must contain '%%s': %s\n",
1427 optarg);
1428 frr_help_exit(1);
1429 }
1430 gs.restart_command = optarg;
1431 break;
1432 case 's':
1433 if (!valid_command(optarg)) {
1434 fprintf(stderr,
1435 "Invalid start command, must contain '%%s': %s\n",
1436 optarg);
1437 frr_help_exit(1);
1438 }
1439 gs.start_command = optarg;
1440 break;
1441 case 'S':
1442 gs.vtydir = optarg;
1443 break;
1444 case 't': {
1445 char garbage[3];
1446 if ((sscanf(optarg, "%ld%1s", &gs.timeout, garbage)
1447 != 1)
1448 || (gs.timeout < 1)) {
1449 fprintf(stderr,
1450 "Invalid timeout argument: %s\n",
1451 optarg);
1452 frr_help_exit(1);
1453 }
1454 } break;
1455 case 'T': {
1456 char garbage[3];
1457 if ((sscanf(optarg, "%ld%1s", &gs.restart_timeout,
1458 garbage)
1459 != 1)
1460 || (gs.restart_timeout < 1)) {
1461 fprintf(stderr,
1462 "Invalid restart timeout argument: %s\n",
1463 optarg);
1464 frr_help_exit(1);
1465 }
1466 } break;
1467 default:
1468 fputs("Invalid option.\n", stderr);
1469 frr_help_exit(1);
1470 }
1471 }
1472
1473 if (watch_only
1474 && (gs.start_command || gs.stop_command || gs.restart_command)) {
1475 fputs("Options -r/-s/-k are not used when --dry is active.\n",
1476 stderr);
1477 }
1478 if (!watch_only
1479 && (!gs.restart_command || !gs.start_command || !gs.stop_command)) {
1480 fprintf(stderr,
1481 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1482 frr_help_exit(1);
1483 }
1484
1485 if (blankstr) {
1486 if (gs.restart_command)
1487 gs.restart_command =
1488 translate_blanks(gs.restart_command, blankstr);
1489 if (gs.start_command)
1490 gs.start_command =
1491 translate_blanks(gs.start_command, blankstr);
1492 if (gs.stop_command)
1493 gs.stop_command =
1494 translate_blanks(gs.stop_command, blankstr);
1495 }
1496
1497 gs.restart.interval = gs.min_restart_interval;
1498
1499 /* env variable for the processes that we start */
1500 if (watchfrr_di.pathspace)
1501 setenv("FRR_PATHSPACE", watchfrr_di.pathspace, 1);
1502 else
1503 unsetenv("FRR_PATHSPACE");
1504
1505 /*
1506 * when watchfrr_di.pathspace is read, if it is not specified
1507 * pathspace is NULL as expected
1508 */
1509 pathspace = watchfrr_di.pathspace;
1510
1511 if (netns_en && !netns)
1512 netns = watchfrr_di.pathspace;
1513
1514 if (netns_en && netns && netns[0])
1515 netns_setup(netns);
1516
1517 master = frr_init();
1518 watchfrr_error_init();
1519 watchfrr_init(argc, argv);
1520 watchfrr_vty_init();
1521
1522 frr_config_fork();
1523
1524 if (watchfrr_di.daemon_mode)
1525 zlog_syslog_set_prio_min(MIN(gs.loglevel, LOG_DEBUG));
1526 else
1527 zlog_aux_init(NULL, MIN(gs.loglevel, LOG_DEBUG));
1528
1529 frr_run(master);
1530
1531 systemd_send_stopping();
1532 /* Not reached. */
1533 return 0;
1534 }