]> git.proxmox.com Git - mirror_frr.git/blame - watchfrr/watchfrr.c
zebra: Do not accept illegal safi's for route installation (#5679)
[mirror_frr.git] / watchfrr / watchfrr.c
CommitLineData
8b886ca7 1/*
896014f4
DL
2 * Monitor status of frr daemons and restart if necessary.
3 *
4 * Copyright (C) 2004 Andrew J. Schorr
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
8b886ca7 19 */
20
a365534f 21#include <zebra.h>
8b886ca7 22#include <thread.h>
23#include <log.h>
52e66296 24#include <network.h>
8b886ca7 25#include <sigevent.h>
a365534f 26#include <lib/version.h>
95c4aff2 27#include "command.h"
4f04a76b 28#include "libfrr.h"
b647dc2a 29#include "lib_errors.h"
95c4aff2 30
6f594023 31#include <getopt.h>
a365534f 32#include <sys/un.h>
33#include <sys/wait.h>
837d16cc 34#include <memory.h>
651415bd 35#include <systemd.h>
8b886ca7 36
9473e340 37#include "watchfrr.h"
b647dc2a 38#include "watchfrr_errors.h"
95c4aff2 39
#ifndef MIN
#define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
#endif

/*
 * Timer randomization helpers.
 * JITTER(X) yields an offset in the range [-(X)/2, (X) - (X)/2];
 * FUZZY(X) perturbs X by JITTER((X)/20), i.e. by a few percent.
 */
#define JITTER(X) ((random() % ((X)+1))-((X)/2))
#define FUZZY(X) ((X)+JITTER((X)/20))

/* Built-in defaults; the intervals below are expressed in seconds. */
#define DEFAULT_PERIOD 5		/* status polling interval */
#define DEFAULT_TIMEOUT 90		/* unresponsiveness timeout */
#define DEFAULT_RESTART_TIMEOUT 20	/* restart (kill) timeout */
#define DEFAULT_LOGLEVEL LOG_INFO
#define DEFAULT_MIN_RESTART 60		/* shortest gap between restarts */
#define DEFAULT_MAX_RESTART 600		/* cap for the restart back-off */

/* Default shell command templates; "%s" is replaced by the daemon name. */
#define DEFAULT_RESTART_CMD WATCHFRR_SH_PATH " restart %s"
#define DEFAULT_START_CMD WATCHFRR_SH_PATH " start %s"
#define DEFAULT_STOP_CMD WATCHFRR_SH_PATH " stop %s"

/* Token expected back (see handle_read) when a daemon answers an echo. */
#define PING_TOKEN "PING"
60
0a7c7856
DL
61DEFINE_MGROUP(WATCHFRR, "watchfrr")
62DEFINE_MTYPE_STATIC(WATCHFRR, WATCHFRR_DAEMON, "watchfrr daemon entry")
63
55c72803 64/* Needs to be global, referenced somewhere inside libfrr. */
8b886ca7 65struct thread_master *master;
66
f168b713 67static bool watch_only = false;
8b886ca7 68
a6810074
DL
/*
 * Steps of the startup sequence and of a phased restart, as driven by
 * phase_check().  The numeric values index phase_str[].
 */
typedef enum {
	/* Nothing in progress. */
	PHASE_NONE = 0,
	/* Startup: at least one daemon is still in DAEMON_INIT. */
	PHASE_INIT,
	/* Waiting for outstanding child jobs (gs.numpids) to finish. */
	PHASE_STOPS_PENDING,
	/* Waiting for the daemons other than gs.special to be down. */
	PHASE_WAITING_DOWN,
	/* Restart job for gs.special is still running. */
	PHASE_ZEBRA_RESTART_PENDING,
	/* Waiting for gs.special to be up again. */
	PHASE_WAITING_ZEBRA_UP
} restart_phase_t;
77
/*
 * Human-readable phase names, indexed by restart_phase_t.  The final
 * entry has no matching enumerator in restart_phase_t.
 */
static const char *const phase_str[] = {
	"Idle",					  /* PHASE_NONE */
	"Startup",				  /* PHASE_INIT */
	"Stop jobs running",			  /* PHASE_STOPS_PENDING */
	"Waiting for other daemons to come down", /* PHASE_WAITING_DOWN */
	"Zebra restart job running",	/* PHASE_ZEBRA_RESTART_PENDING */
	"Waiting for zebra to come up",	/* PHASE_WAITING_ZEBRA_UP */
	"Start jobs running",
};
87
/* Longest time one restart phase may last before phase_hanging() fires. */
#define PHASE_TIMEOUT (3*gs.restart_timeout)
/*
 * Startup timeout; presumably milliseconds (55 s) -- confirm at the use
 * site.  Parenthesized so the macro expands safely inside any expression.
 */
#define STARTUP_TIMEOUT (55 * 1000)
8b886ca7 90
a6810074
DL
/* Bookkeeping for one background start/stop/restart job. */
struct restart_info {
	/* Name substituted into the command and used in log messages. */
	const char *name;
	/* Kind of command last launched (the cmdtype given to run_job). */
	const char *what;
	/* PID of the running job, 0 when none is outstanding. */
	pid_t pid;
	/* Time the job was launched, later the time it was reaped. */
	struct timeval time;
	/* Minimum number of seconds between two job launches. */
	long interval;
	/* Timer that signals the job if it runs for too long. */
	struct thread *t_kill;
	/* Signals sent so far: the first is SIGTERM, later ones SIGKILL. */
	int kills;
};
100
a6810074 101static struct global_state {
a6810074
DL
102 restart_phase_t phase;
103 struct thread *t_phase_hanging;
5c9d1c83 104 struct thread *t_startup_timeout;
a6810074
DL
105 const char *vtydir;
106 long period;
107 long timeout;
108 long restart_timeout;
109 long min_restart_interval;
110 long max_restart_interval;
a6810074
DL
111 struct daemon *daemons;
112 const char *restart_command;
113 const char *start_command;
114 const char *stop_command;
115 struct restart_info restart;
a6810074 116 int loglevel;
d62a17ae 117 struct daemon *special; /* points to zebra when doing phased restart */
a6810074
DL
118 int numdaemons;
119 int numpids;
d62a17ae 120 int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
8b886ca7 121} gs = {
c0e5cb52 122 .phase = PHASE_INIT,
64a249ad 123 .vtydir = frr_vtydir,
d62a17ae 124 .period = 1000 * DEFAULT_PERIOD,
125 .timeout = DEFAULT_TIMEOUT,
126 .restart_timeout = DEFAULT_RESTART_TIMEOUT,
127 .loglevel = DEFAULT_LOGLEVEL,
128 .min_restart_interval = DEFAULT_MIN_RESTART,
129 .max_restart_interval = DEFAULT_MAX_RESTART,
3ec95567
DL
130 .restart_command = DEFAULT_RESTART_CMD,
131 .start_command = DEFAULT_START_CMD,
132 .stop_command = DEFAULT_STOP_CMD,
d62a17ae 133};
a6810074
DL
134
/* Connection state of a watched daemon; the values index state_str[]. */
typedef enum {
	DAEMON_INIT,	     /* no connection attempt finished yet */
	DAEMON_DOWN,	     /* not connected */
	DAEMON_CONNECTING,   /* non-blocking connect still in progress */
	DAEMON_UP,	     /* connected */
	DAEMON_UNRESPONSIVE  /* connected, but an echo went unanswered */
} daemon_state_t;

/* True while the daemon is connected, whether or not it answers echoes. */
#define IS_UP(DMN)                                                             \
	(((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
8b886ca7 145
/* Human-readable daemon state names, indexed by daemon_state_t. */
static const char *const state_str[] = {
	"Init",
	"Down",
	"Connecting",
	"Up",
	"Unresponsive",
};
149
150struct daemon {
a6810074
DL
151 const char *name;
152 daemon_state_t state;
153 int fd;
154 struct timeval echo_sent;
d7c0a89a 155 unsigned int connect_tries;
a6810074
DL
156 struct thread *t_wakeup;
157 struct thread *t_read;
158 struct thread *t_write;
159 struct daemon *next;
160 struct restart_info restart;
cc53b605
DS
161
162 /*
163 * For a given daemon, if we've turned on ignore timeouts
164 * ignore the timeout value and assume everything is ok
165 * This is for daemon debugging w/ gdb after we have started
166 * FRR and realize we have something that needs to be looked
167 * at
168 */
169 bool ignore_timeout;
8b886ca7 170};
171
9272302b
DL
/* getopt return values for the long options that have no short letter. */
#define OPTION_MINRESTART 2000
#define OPTION_MAXRESTART 2001
#define OPTION_DRY 2002

/* Long command line options; printhelp() describes each of them. */
static const struct option longopts[] = {
	{"daemon", no_argument, NULL, 'd'},
	{"statedir", required_argument, NULL, 'S'},
	{"loglevel", required_argument, NULL, 'l'},
	{"interval", required_argument, NULL, 'i'},
	{"timeout", required_argument, NULL, 't'},
	{"restart-timeout", required_argument, NULL, 'T'},
	{"restart", required_argument, NULL, 'r'},
	{"start-command", required_argument, NULL, 's'},
	{"kill-command", required_argument, NULL, 'k'},
	{"dry", no_argument, NULL, OPTION_DRY},
	{"min-restart-interval", required_argument, NULL, OPTION_MINRESTART},
	{"max-restart-interval", required_argument, NULL, OPTION_MAXRESTART},
	{"pid-file", required_argument, NULL, 'p'},
	{"blank-string", required_argument, NULL, 'b'},
	{"help", no_argument, NULL, 'h'},
	{"version", no_argument, NULL, 'v'},
	/* End-of-table marker. */
	{NULL, 0, NULL, 0},
};
8b886ca7 194
195static int try_connect(struct daemon *dmn);
196static int wakeup_send_echo(struct thread *t_wakeup);
197static void try_restart(struct daemon *dmn);
198static void phase_check(void);
75f8b0e4 199static void restart_done(struct daemon *dmn);
8b886ca7 200
4f04a76b 201static const char *progname;
cc53b605
DS
202
203void watchfrr_set_ignore_daemon(struct vty *vty, const char *dname, bool ignore)
204{
205 struct daemon *dmn;
206
207 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
208 if (strncmp(dmn->name, dname, strlen(dmn->name)) == 0)
209 break;
210 }
211
212 if (dmn) {
213 dmn->ignore_timeout = ignore;
214 vty_out(vty, "%s switching to %s\n", dmn->name,
215 ignore ? "ignore" : "watch");
216 } else
217 vty_out(vty, "%s is not configured for running at the moment",
218 dname);
219}
220
4f04a76b 221static void printhelp(FILE *target)
8b886ca7 222{
d62a17ae 223 fprintf(target,
224 "Usage : %s [OPTION...] <daemon name> ...\n\n\
9473e340 225Watchdog program to monitor status of frr daemons and try to restart\n\
8b886ca7 226them if they are down or unresponsive. It determines whether a daemon is\n\
227up based on whether it can connect to the daemon's vty unix stream socket.\n\
228It then repeatedly sends echo commands over that socket to determine whether\n\
229the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
230on the socket connection and know immediately that the daemon is down.\n\n\
231The daemons to be monitored should be listed on the command line.\n\n\
8b886ca7 232In order to avoid attempting to restart the daemons in a fast loop,\n\
233the -m and -M options allow you to control the minimum delay between\n\
234restart commands. The minimum restart delay is recalculated each time\n\
235a restart is attempted: if the time since the last restart attempt exceeds\n\
236twice the -M value, then the restart delay is set to the -m value.\n\
d62a17ae 237Otherwise, the interval is doubled (but capped at the -M value).\n\n",
f168b713 238 progname);
e757c940 239
d62a17ae 240 fprintf(target,
241 "Options:\n\
8b886ca7 242-d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
243 to syslog instead of stdout.\n\
244-S, --statedir Set the vty socket directory (default is %s)\n\
8b886ca7 245-l, --loglevel Set the logging level (default is %d).\n\
246 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
247 but it can be set higher than %d if extra-verbose debugging\n\
248 messages are desired.\n\
9272302b 249 --min-restart-interval\n\
8b886ca7 250 Set the minimum seconds to wait between invocations of daemon\n\
251 restart commands (default is %d).\n\
9272302b 252 --max-restart-interval\n\
8b886ca7 253 Set the maximum seconds to wait between invocations of daemon\n\
254 restart commands (default is %d).\n\
255-i, --interval Set the status polling interval in seconds (default is %d)\n\
256-t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
257-T, --restart-timeout\n\
258 Set the restart (kill) timeout in seconds (default is %d).\n\
259 If any background jobs are still running after this much\n\
260 time has elapsed, they will be killed.\n\
261-r, --restart Supply a Bourne shell command to use to restart a single\n\
262 daemon. The command string should include '%%s' where the\n\
263 name of the daemon should be substituted.\n\
3ec95567 264 (default: '%s')\n\
8b886ca7 265-s, --start-command\n\
266 Supply a Bourne shell to command to use to start a single\n\
267 daemon. The command string should include '%%s' where the\n\
268 name of the daemon should be substituted.\n\
3ec95567 269 (default: '%s')\n\
8b886ca7 270-k, --kill-command\n\
271 Supply a Bourne shell to command to use to stop a single\n\
272 daemon. The command string should include '%%s' where the\n\
273 name of the daemon should be substituted.\n\
3ec95567 274 (default: '%s')\n\
f168b713 275 --dry Do not start or restart anything, just log.\n\
8b886ca7 276-p, --pid-file Set process identifier file name\n\
0a7c7856 277 (default is %s/watchfrr.pid).\n\
c8b40f86 278-b, --blank-string\n\
279 When the supplied argument string is found in any of the\n\
f168b713 280 various shell command arguments (-r, -s, or -k), replace\n\
c8b40f86 281 it with a space. This is an ugly hack to circumvent problems\n\
282 passing command-line arguments with embedded spaces.\n\
8b886ca7 283-v, --version Print program version\n\
d62a17ae 284-h, --help Display this help and exit\n",
64a249ad 285 frr_vtydir, DEFAULT_LOGLEVEL, LOG_EMERG, LOG_DEBUG, LOG_DEBUG,
d62a17ae 286 DEFAULT_MIN_RESTART, DEFAULT_MAX_RESTART, DEFAULT_PERIOD,
3ec95567
DL
287 DEFAULT_TIMEOUT, DEFAULT_RESTART_TIMEOUT,
288 DEFAULT_RESTART_CMD, DEFAULT_START_CMD, DEFAULT_STOP_CMD,
289 frr_vtydir);
8b886ca7 290}
291
a6810074 292static pid_t run_background(char *shell_cmd)
8b886ca7 293{
a6810074
DL
294 pid_t child;
295
296 switch (child = fork()) {
297 case -1:
450971aa 298 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
299 "fork failed, cannot run command [%s]: %s",
300 shell_cmd, safe_strerror(errno));
a6810074
DL
301 return -1;
302 case 0:
303 /* Child process. */
d62a17ae 304 /* Use separate process group so child processes can be killed
305 * easily. */
a6810074
DL
306 if (setpgid(0, 0) < 0)
307 zlog_warn("warning: setpgid(0,0) failed: %s",
308 safe_strerror(errno));
309 {
310 char shell[] = "sh";
311 char dashc[] = "-c";
d62a17ae 312 char *const argv[4] = {shell, dashc, shell_cmd, NULL};
a6810074 313 execv("/bin/sh", argv);
450971aa 314 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
315 "execv(/bin/sh -c '%s') failed: %s",
316 shell_cmd, safe_strerror(errno));
a6810074
DL
317 _exit(127);
318 }
319 default:
320 /* Parent process: we will reap the child later. */
450971aa 321 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
322 "Forked background command [pid %d]: %s",
323 (int)child, shell_cmd);
a6810074
DL
324 return child;
325 }
8b886ca7 326}
327
a6810074
DL
328static struct timeval *time_elapsed(struct timeval *result,
329 const struct timeval *start_time)
8b886ca7 330{
a6810074
DL
331 gettimeofday(result, NULL);
332 result->tv_sec -= start_time->tv_sec;
333 result->tv_usec -= start_time->tv_usec;
334 while (result->tv_usec < 0) {
335 result->tv_usec += 1000000L;
336 result->tv_sec--;
337 }
338 return result;
8b886ca7 339}
340
a6810074 341static int restart_kill(struct thread *t_kill)
8b886ca7 342{
a6810074
DL
343 struct restart_info *restart = THREAD_ARG(t_kill);
344 struct timeval delay;
345
346 time_elapsed(&delay, &restart->time);
d62a17ae 347 zlog_warn(
348 "Warning: %s %s child process %d still running after "
349 "%ld seconds, sending signal %d",
350 restart->what, restart->name, (int)restart->pid,
351 (long)delay.tv_sec, (restart->kills ? SIGKILL : SIGTERM));
a6810074
DL
352 kill(-restart->pid, (restart->kills ? SIGKILL : SIGTERM));
353 restart->kills++;
66e78ae6
QY
354 restart->t_kill = NULL;
355 thread_add_timer(master, restart_kill, restart, gs.restart_timeout,
356 &restart->t_kill);
a6810074 357 return 0;
8b886ca7 358}
359
a6810074 360static struct restart_info *find_child(pid_t child)
8b886ca7 361{
f168b713 362 struct daemon *dmn;
7c265f7d
CF
363 if (gs.restart.pid == child)
364 return &gs.restart;
365
f168b713
DL
366 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
367 if (dmn->restart.pid == child)
368 return &dmn->restart;
a6810074
DL
369 }
370 return NULL;
8b886ca7 371}
372
a6810074 373static void sigchild(void)
8b886ca7 374{
a6810074
DL
375 pid_t child;
376 int status;
377 const char *name;
378 const char *what;
379 struct restart_info *restart;
75f8b0e4 380 struct daemon *dmn;
a6810074
DL
381
382 switch (child = waitpid(-1, &status, WNOHANG)) {
383 case -1:
450971aa 384 flog_err_sys(EC_LIB_SYSTEM_CALL, "waitpid failed: %s",
09c866e3 385 safe_strerror(errno));
a6810074
DL
386 return;
387 case 0:
388 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
389 return;
390 }
391
392 if (child == integrated_write_pid) {
393 integrated_write_sigchld(status);
394 return;
395 }
396
397 if ((restart = find_child(child)) != NULL) {
398 name = restart->name;
399 what = restart->what;
400 restart->pid = 0;
401 gs.numpids--;
402 thread_cancel(restart->t_kill);
403 restart->t_kill = NULL;
d62a17ae 404 /* Update restart time to reflect the time the command
405 * completed. */
a6810074
DL
406 gettimeofday(&restart->time, NULL);
407 } else {
09c866e3 408 flog_err_sys(
450971aa 409 EC_LIB_SYSTEM_CALL,
09c866e3
QY
410 "waitpid returned status for an unknown child process %d",
411 (int)child);
a6810074
DL
412 name = "(unknown)";
413 what = "background";
414 }
415 if (WIFSTOPPED(status))
d62a17ae 416 zlog_warn("warning: %s %s process %d is stopped", what, name,
417 (int)child);
a6810074 418 else if (WIFSIGNALED(status))
d62a17ae 419 zlog_warn("%s %s process %d terminated due to signal %d", what,
420 name, (int)child, WTERMSIG(status));
a6810074
DL
421 else if (WIFEXITED(status)) {
422 if (WEXITSTATUS(status) != 0)
d62a17ae 423 zlog_warn(
424 "%s %s process %d exited with non-zero status %d",
425 what, name, (int)child, WEXITSTATUS(status));
75f8b0e4 426 else {
a6810074
DL
427 zlog_debug("%s %s process %d exited normally", what,
428 name, (int)child);
75f8b0e4
DL
429
430 if (restart && restart != &gs.restart) {
431 dmn = container_of(restart, struct daemon,
432 restart);
433 restart_done(dmn);
434 } else if (restart)
435 for (dmn = gs.daemons; dmn; dmn = dmn->next)
436 restart_done(dmn);
437 }
a6810074 438 } else
09c866e3 439 flog_err_sys(
450971aa 440 EC_LIB_SYSTEM_CALL,
09c866e3
QY
441 "cannot interpret %s %s process %d wait status 0x%x",
442 what, name, (int)child, status);
a6810074 443 phase_check();
8b886ca7 444}
445
d62a17ae 446static int run_job(struct restart_info *restart, const char *cmdtype,
447 const char *command, int force, int update_interval)
8b886ca7 448{
a6810074
DL
449 struct timeval delay;
450
451 if (gs.loglevel > LOG_DEBUG + 1)
452 zlog_debug("attempting to %s %s", cmdtype, restart->name);
453
454 if (restart->pid) {
455 if (gs.loglevel > LOG_DEBUG + 1)
d62a17ae 456 zlog_debug(
457 "cannot %s %s, previous pid %d still running",
458 cmdtype, restart->name, (int)restart->pid);
a6810074
DL
459 return -1;
460 }
461
b3ee8bcc
DS
462#if defined HAVE_SYSTEMD
463 char buffer[512];
464
465 snprintf(buffer, sizeof(buffer), "restarting %s", restart->name);
466 systemd_send_status(buffer);
467#endif
468
d62a17ae 469 /* Note: time_elapsed test must come before the force test, since we
470 need
a6810074
DL
471 to make sure that delay is initialized for use below in updating the
472 restart interval. */
473 if ((time_elapsed(&delay, &restart->time)->tv_sec < restart->interval)
474 && !force) {
b3ee8bcc 475
a6810074 476 if (gs.loglevel > LOG_DEBUG + 1)
d62a17ae 477 zlog_debug(
478 "postponing %s %s: "
479 "elapsed time %ld < retry interval %ld",
480 cmdtype, restart->name, (long)delay.tv_sec,
481 restart->interval);
a6810074
DL
482 return -1;
483 }
484
485 gettimeofday(&restart->time, NULL);
486 restart->kills = 0;
487 {
488 char cmd[strlen(command) + strlen(restart->name) + 1];
489 snprintf(cmd, sizeof(cmd), command, restart->name);
490 if ((restart->pid = run_background(cmd)) > 0) {
66e78ae6 491 restart->t_kill = NULL;
d62a17ae 492 thread_add_timer(master, restart_kill, restart,
493 gs.restart_timeout, &restart->t_kill);
a6810074
DL
494 restart->what = cmdtype;
495 gs.numpids++;
496 } else
497 restart->pid = 0;
498 }
499
b3ee8bcc
DS
500#if defined HAVE_SYSTEMD
501 systemd_send_status("FRR Operational");
502#endif
a6810074
DL
503 /* Calculate the new restart interval. */
504 if (update_interval) {
505 if (delay.tv_sec > 2 * gs.max_restart_interval)
506 restart->interval = gs.min_restart_interval;
507 else if ((restart->interval *= 2) > gs.max_restart_interval)
508 restart->interval = gs.max_restart_interval;
509 if (gs.loglevel > LOG_DEBUG + 1)
510 zlog_debug("restart %s interval is now %ld",
511 restart->name, restart->interval);
512 }
513 return restart->pid;
8b886ca7 514}
515
/*
 * Scheduling helpers.  Each one clears the daemon's thread pointer and
 * registers a fresh handler or timer for it.  The wakeup timers fire after
 * a randomized FUZZY(gs.period) milliseconds.
 *
 * The macros expand to a single statement without a trailing semicolon,
 * so "SET_...(dmn);" behaves like a function call in every context,
 * including an unbraced if/else.
 */

/* Wait for input from the daemon's socket. */
#define SET_READ_HANDLER(DMN)                                                  \
	do {                                                                   \
		(DMN)->t_read = NULL;                                          \
		thread_add_read(master, handle_read, (DMN), (DMN)->fd,         \
				&(DMN)->t_read);                               \
	} while (0)

/* Retry connecting to a daemon that is down. */
#define SET_WAKEUP_DOWN(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_down, (DMN),              \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Re-check a daemon that is unresponsive. */
#define SET_WAKEUP_UNRESPONSIVE(DMN)                                           \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_unresponsive, (DMN),      \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Send the next echo request to a daemon that is up. */
#define SET_WAKEUP_ECHO(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_send_echo, (DMN),         \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)
8b886ca7 543
a6810074 544static int wakeup_down(struct thread *t_wakeup)
8b886ca7 545{
a6810074
DL
546 struct daemon *dmn = THREAD_ARG(t_wakeup);
547
548 dmn->t_wakeup = NULL;
549 if (try_connect(dmn) < 0)
550 SET_WAKEUP_DOWN(dmn);
551 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
552 try_restart(dmn);
553 return 0;
8b886ca7 554}
555
a6810074 556static int wakeup_init(struct thread *t_wakeup)
8b886ca7 557{
a6810074
DL
558 struct daemon *dmn = THREAD_ARG(t_wakeup);
559
560 dmn->t_wakeup = NULL;
561 if (try_connect(dmn) < 0) {
f74ae2bb 562 flog_err(EC_WATCHFRR_CONNECTION,
1c50c1c0
QY
563 "%s state -> down : initial connection attempt failed",
564 dmn->name);
a6810074
DL
565 dmn->state = DAEMON_DOWN;
566 }
c0e5cb52 567 phase_check();
a6810074 568 return 0;
8b886ca7 569}
570
75f8b0e4
DL
571static void restart_done(struct daemon *dmn)
572{
573 if (dmn->state != DAEMON_DOWN) {
3f391bec
DS
574 zlog_warn(
575 "Daemon: %s: is in %s state but expected it to be in DAEMON_DOWN state",
576 dmn->name, state_str[dmn->state]);
75f8b0e4
DL
577 return;
578 }
579 if (dmn->t_wakeup)
580 THREAD_OFF(dmn->t_wakeup);
581 if (try_connect(dmn) < 0)
582 SET_WAKEUP_DOWN(dmn);
583}
584
a6810074 585static void daemon_down(struct daemon *dmn, const char *why)
8b886ca7 586{
a6810074 587 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
1c50c1c0
QY
588 flog_err(EC_WATCHFRR_CONNECTION, "%s state -> down : %s",
589 dmn->name, why);
a6810074
DL
590 else if (gs.loglevel > LOG_DEBUG)
591 zlog_debug("%s still down : %s", dmn->name, why);
592 if (IS_UP(dmn))
593 gs.numdown++;
594 dmn->state = DAEMON_DOWN;
595 if (dmn->fd >= 0) {
596 close(dmn->fd);
597 dmn->fd = -1;
598 }
599 THREAD_OFF(dmn->t_read);
600 THREAD_OFF(dmn->t_write);
601 THREAD_OFF(dmn->t_wakeup);
602 if (try_connect(dmn) < 0)
603 SET_WAKEUP_DOWN(dmn);
604 phase_check();
8b886ca7 605}
606
a6810074 607static int handle_read(struct thread *t_read)
8b886ca7 608{
a6810074
DL
609 struct daemon *dmn = THREAD_ARG(t_read);
610 static const char resp[sizeof(PING_TOKEN) + 4] = PING_TOKEN "\n";
611 char buf[sizeof(resp) + 100];
612 ssize_t rc;
613 struct timeval delay;
614
615 dmn->t_read = NULL;
616 if ((rc = read(dmn->fd, buf, sizeof(buf))) < 0) {
617 char why[100];
618
619 if (ERRNO_IO_RETRY(errno)) {
620 /* Pretend it never happened. */
621 SET_READ_HANDLER(dmn);
622 return 0;
623 }
624 snprintf(why, sizeof(why), "unexpected read error: %s",
625 safe_strerror(errno));
626 daemon_down(dmn, why);
627 return 0;
8b886ca7 628 }
a6810074
DL
629 if (rc == 0) {
630 daemon_down(dmn, "read returned EOF");
631 return 0;
632 }
633 if (!dmn->echo_sent.tv_sec) {
634 char why[sizeof(buf) + 100];
635 snprintf(why, sizeof(why),
636 "unexpected read returns %d bytes: %.*s", (int)rc,
637 (int)rc, buf);
638 daemon_down(dmn, why);
639 return 0;
8b886ca7 640 }
a6810074
DL
641
642 /* We are expecting an echo response: is there any chance that the
643 response would not be returned entirely in the first read? That
644 seems inconceivable... */
645 if ((rc != sizeof(resp)) || memcmp(buf, resp, sizeof(resp))) {
646 char why[100 + sizeof(buf)];
647 snprintf(why, sizeof(why),
648 "read returned bad echo response of %d bytes "
d62a17ae 649 "(expecting %u): %.*s",
d7c0a89a 650 (int)rc, (unsigned int)sizeof(resp), (int)rc, buf);
a6810074
DL
651 daemon_down(dmn, why);
652 return 0;
653 }
654
655 time_elapsed(&delay, &dmn->echo_sent);
656 dmn->echo_sent.tv_sec = 0;
657 if (dmn->state == DAEMON_UNRESPONSIVE) {
658 if (delay.tv_sec < gs.timeout) {
659 dmn->state = DAEMON_UP;
d62a17ae 660 zlog_warn(
661 "%s state -> up : echo response received after %ld.%06ld "
662 "seconds",
663 dmn->name, (long)delay.tv_sec,
664 (long)delay.tv_usec);
a6810074 665 } else
d62a17ae 666 zlog_warn(
667 "%s: slow echo response finally received after %ld.%06ld "
668 "seconds",
669 dmn->name, (long)delay.tv_sec,
670 (long)delay.tv_usec);
a6810074
DL
671 } else if (gs.loglevel > LOG_DEBUG + 1)
672 zlog_debug("%s: echo response received after %ld.%06ld seconds",
673 dmn->name, (long)delay.tv_sec, (long)delay.tv_usec);
674
675 SET_READ_HANDLER(dmn);
676 if (dmn->t_wakeup)
677 thread_cancel(dmn->t_wakeup);
678 SET_WAKEUP_ECHO(dmn);
679
680 return 0;
8b886ca7 681}
682
207e0d7a
DS
683/*
684 * Wait till we notice that all daemons are ready before
685 * we send we are ready to systemd
686 */
5c9d1c83 687static void daemon_send_ready(int exitcode)
207e0d7a 688{
5c9d1c83 689 FILE *fp;
a6810074 690 static int sent = 0;
43e587c1 691 char started[1024];
207e0d7a 692
5c9d1c83
DL
693 if (sent)
694 return;
695
696 if (exitcode == 0)
0a7c7856 697 zlog_notice("all daemons up, doing startup-complete notify");
5c9d1c83
DL
698 else if (gs.numdown < gs.numdaemons)
699 flog_err(EC_WATCHFRR_CONNECTION,
700 "startup did not complete within timeout"
701 " (%d/%d daemons running)",
702 gs.numdaemons - gs.numdown, gs.numdaemons);
703 else {
704 flog_err(EC_WATCHFRR_CONNECTION,
705 "all configured daemons failed to start"
706 " -- exiting watchfrr");
707 exit(exitcode);
708
709 }
0a7c7856 710
5c9d1c83
DL
711 frr_detach();
712
3c649c71
DS
713 snprintf(started, sizeof(started), "%s%s", frr_vtydir,
714 "watchfrr.started");
715 fp = fopen(started, "w");
5c9d1c83
DL
716 if (fp)
717 fclose(fp);
60bd2534 718#if defined HAVE_SYSTEMD
5c9d1c83 719 systemd_send_started(master, 0);
b3ee8bcc 720 systemd_send_status("FRR Operational");
60bd2534 721#endif
5c9d1c83 722 sent = 1;
207e0d7a
DS
723}
724
a6810074 725static void daemon_up(struct daemon *dmn, const char *why)
8b886ca7 726{
a6810074
DL
727 dmn->state = DAEMON_UP;
728 gs.numdown--;
729 dmn->connect_tries = 0;
730 zlog_notice("%s state -> up : %s", dmn->name, why);
5c9d1c83
DL
731 if (gs.numdown == 0)
732 daemon_send_ready(0);
a8cbb8b3 733 SET_WAKEUP_ECHO(dmn);
a6810074 734 phase_check();
8b886ca7 735}
736
a6810074 737static int check_connect(struct thread *t_write)
8b886ca7 738{
a6810074
DL
739 struct daemon *dmn = THREAD_ARG(t_write);
740 int sockerr;
741 socklen_t reslen = sizeof(sockerr);
742
743 dmn->t_write = NULL;
744 if (getsockopt(dmn->fd, SOL_SOCKET, SO_ERROR, (char *)&sockerr, &reslen)
745 < 0) {
746 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn->name,
747 safe_strerror(errno));
748 daemon_down(dmn,
749 "getsockopt failed checking connection success");
750 return 0;
751 }
752 if ((reslen == sizeof(sockerr)) && sockerr) {
753 char why[100];
d62a17ae 754 snprintf(
755 why, sizeof(why),
756 "getsockopt reports that connection attempt failed: %s",
757 safe_strerror(sockerr));
a6810074
DL
758 daemon_down(dmn, why);
759 return 0;
760 }
761
762 daemon_up(dmn, "delayed connect succeeded");
763 return 0;
8b886ca7 764}
765
a6810074 766static int wakeup_connect_hanging(struct thread *t_wakeup)
8b886ca7 767{
a6810074
DL
768 struct daemon *dmn = THREAD_ARG(t_wakeup);
769 char why[100];
770
771 dmn->t_wakeup = NULL;
772 snprintf(why, sizeof(why),
773 "connection attempt timed out after %ld seconds", gs.timeout);
774 daemon_down(dmn, why);
775 return 0;
8b886ca7 776}
777
778/* Making connection to protocol daemon. */
a6810074 779static int try_connect(struct daemon *dmn)
8b886ca7 780{
a6810074
DL
781 int sock;
782 struct sockaddr_un addr;
783 socklen_t len;
784
785 if (gs.loglevel > LOG_DEBUG + 1)
786 zlog_debug("%s: attempting to connect", dmn->name);
787 dmn->connect_tries++;
788
789 memset(&addr, 0, sizeof(struct sockaddr_un));
790 addr.sun_family = AF_UNIX;
d62a17ae 791 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty", gs.vtydir,
792 dmn->name);
6f0e3f6e 793#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
a6810074 794 len = addr.sun_len = SUN_LEN(&addr);
8b886ca7 795#else
a6810074 796 len = sizeof(addr.sun_family) + strlen(addr.sun_path);
d62a17ae 797#endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
a6810074
DL
798
799 /* Quick check to see if we might succeed before we go to the trouble
800 of creating a socket. */
801 if (access(addr.sun_path, W_OK) < 0) {
802 if (errno != ENOENT)
450971aa 803 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
804 "%s: access to socket %s denied: %s",
805 dmn->name, addr.sun_path,
806 safe_strerror(errno));
a6810074
DL
807 return -1;
808 }
809
810 if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
450971aa 811 flog_err_sys(EC_LIB_SOCKET, "%s(%s): cannot make socket: %s",
09c866e3 812 __func__, addr.sun_path, safe_strerror(errno));
a6810074
DL
813 return -1;
814 }
815
816 if (set_nonblocking(sock) < 0 || set_cloexec(sock) < 0) {
450971aa 817 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
818 "%s(%s): set_nonblocking/cloexec(%d) failed",
819 __func__, addr.sun_path, sock);
a6810074
DL
820 close(sock);
821 return -1;
8b886ca7 822 }
a6810074
DL
823
824 if (connect(sock, (struct sockaddr *)&addr, len) < 0) {
825 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK)) {
826 if (gs.loglevel > LOG_DEBUG)
827 zlog_debug("%s(%s): connect failed: %s",
828 __func__, addr.sun_path,
829 safe_strerror(errno));
830 close(sock);
831 return -1;
832 }
833 if (gs.loglevel > LOG_DEBUG)
834 zlog_debug("%s: connection in progress", dmn->name);
835 dmn->state = DAEMON_CONNECTING;
836 dmn->fd = sock;
66e78ae6
QY
837 dmn->t_write = NULL;
838 thread_add_write(master, check_connect, dmn, dmn->fd,
d62a17ae 839 &dmn->t_write);
840 dmn->t_wakeup = NULL;
841 thread_add_timer(master, wakeup_connect_hanging, dmn,
842 gs.timeout, &dmn->t_wakeup);
a6810074
DL
843 SET_READ_HANDLER(dmn);
844 return 0;
845 }
846
847 dmn->fd = sock;
848 SET_READ_HANDLER(dmn);
849 daemon_up(dmn, "connect succeeded");
850 return 1;
8b886ca7 851}
852
a6810074 853static int phase_hanging(struct thread *t_hanging)
8b886ca7 854{
a6810074 855 gs.t_phase_hanging = NULL;
f74ae2bb 856 flog_err(EC_WATCHFRR_CONNECTION,
1c50c1c0
QY
857 "Phase [%s] hanging for %ld seconds, aborting phased restart",
858 phase_str[gs.phase], PHASE_TIMEOUT);
a6810074
DL
859 gs.phase = PHASE_NONE;
860 return 0;
8b886ca7 861}
862
a6810074 863static void set_phase(restart_phase_t new_phase)
8b886ca7 864{
a6810074
DL
865 gs.phase = new_phase;
866 if (gs.t_phase_hanging)
867 thread_cancel(gs.t_phase_hanging);
66e78ae6
QY
868 gs.t_phase_hanging = NULL;
869 thread_add_timer(master, phase_hanging, NULL, PHASE_TIMEOUT,
870 &gs.t_phase_hanging);
8b886ca7 871}
872
a6810074 873static void phase_check(void)
8b886ca7 874{
c0e5cb52
DL
875 struct daemon *dmn;
876
a6810074
DL
877 switch (gs.phase) {
878 case PHASE_NONE:
879 break;
c0e5cb52
DL
880
881 case PHASE_INIT:
882 for (dmn = gs.daemons; dmn; dmn = dmn->next)
883 if (dmn->state == DAEMON_INIT)
884 return;
885
886 /* startup complete, everything out of INIT */
887 gs.phase = PHASE_NONE;
888 for (dmn = gs.daemons; dmn; dmn = dmn->next)
889 if (dmn->state == DAEMON_DOWN) {
890 SET_WAKEUP_DOWN(dmn);
891 try_restart(dmn);
892 }
893 break;
a6810074
DL
894 case PHASE_STOPS_PENDING:
895 if (gs.numpids)
896 break;
d62a17ae 897 zlog_info(
898 "Phased restart: all routing daemon stop jobs have completed.");
a6810074
DL
899 set_phase(PHASE_WAITING_DOWN);
900
d62a17ae 901 /*FALLTHRU*/
a6810074
DL
902 case PHASE_WAITING_DOWN:
903 if (gs.numdown + IS_UP(gs.special) < gs.numdaemons)
904 break;
905 zlog_info("Phased restart: all routing daemons now down.");
906 run_job(&gs.special->restart, "restart", gs.restart_command, 1,
907 1);
908 set_phase(PHASE_ZEBRA_RESTART_PENDING);
909
d62a17ae 910 /*FALLTHRU*/
a6810074
DL
911 case PHASE_ZEBRA_RESTART_PENDING:
912 if (gs.special->restart.pid)
913 break;
914 zlog_info("Phased restart: %s restart job completed.",
915 gs.special->name);
916 set_phase(PHASE_WAITING_ZEBRA_UP);
917
d62a17ae 918 /*FALLTHRU*/
a6810074
DL
919 case PHASE_WAITING_ZEBRA_UP:
920 if (!IS_UP(gs.special))
921 break;
922 zlog_info("Phased restart: %s is now up.", gs.special->name);
923 {
924 struct daemon *dmn;
925 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
926 if (dmn != gs.special)
927 run_job(&dmn->restart, "start",
928 gs.start_command, 1, 0);
929 }
930 }
931 gs.phase = PHASE_NONE;
932 THREAD_OFF(gs.t_phase_hanging);
933 zlog_notice("Phased global restart has completed.");
934 break;
935 }
8b886ca7 936}
937
a6810074 938static void try_restart(struct daemon *dmn)
8b886ca7 939{
f168b713 940 if (watch_only)
a6810074 941 return;
a6810074 942
f168b713
DL
943 if (dmn != gs.special) {
944 if ((gs.special->state == DAEMON_UP)
945 && (gs.phase == PHASE_NONE))
946 run_job(&dmn->restart, "restart", gs.restart_command, 0,
947 1);
948 else
949 zlog_debug(
950 "%s: postponing restart attempt because master %s daemon "
951 "not up [%s], or phased restart in progress",
952 dmn->name, gs.special->name,
953 state_str[gs.special->state]);
954 return;
955 }
956
957 if ((gs.phase != PHASE_NONE) || gs.numpids) {
958 if (gs.loglevel > LOG_DEBUG + 1)
959 zlog_debug(
960 "postponing phased global restart: restart already in "
961 "progress [%s], or outstanding child processes [%d]",
962 phase_str[gs.phase], gs.numpids);
963 return;
964 }
965 /* Is it too soon for a restart? */
966 {
967 struct timeval delay;
968 if (time_elapsed(&delay, &gs.special->restart.time)->tv_sec
969 < gs.special->restart.interval) {
a6810074 970 if (gs.loglevel > LOG_DEBUG + 1)
d62a17ae 971 zlog_debug(
f168b713
DL
972 "postponing phased global restart: "
973 "elapsed time %ld < retry interval %ld",
974 (long)delay.tv_sec,
975 gs.special->restart.interval);
976 return;
a6810074 977 }
8b886ca7 978 }
f168b713 979 run_job(&gs.restart, "restart", gs.restart_command, 0, 1);
8b886ca7 980}
981
a6810074 982static int wakeup_unresponsive(struct thread *t_wakeup)
8b886ca7 983{
a6810074
DL
984 struct daemon *dmn = THREAD_ARG(t_wakeup);
985
986 dmn->t_wakeup = NULL;
987 if (dmn->state != DAEMON_UNRESPONSIVE)
f74ae2bb 988 flog_err(EC_WATCHFRR_CONNECTION,
1c50c1c0
QY
989 "%s: no longer unresponsive (now %s), "
990 "wakeup should have been cancelled!",
991 dmn->name, state_str[dmn->state]);
a6810074
DL
992 else {
993 SET_WAKEUP_UNRESPONSIVE(dmn);
994 try_restart(dmn);
995 }
996 return 0;
8b886ca7 997}
998
a6810074 999static int wakeup_no_answer(struct thread *t_wakeup)
8b886ca7 1000{
a6810074
DL
1001 struct daemon *dmn = THREAD_ARG(t_wakeup);
1002
1003 dmn->t_wakeup = NULL;
1004 dmn->state = DAEMON_UNRESPONSIVE;
cc53b605
DS
1005 if (dmn->ignore_timeout)
1006 return 0;
f74ae2bb 1007 flog_err(EC_WATCHFRR_CONNECTION,
1c50c1c0
QY
1008 "%s state -> unresponsive : no response yet to ping "
1009 "sent %ld seconds ago",
1010 dmn->name, gs.timeout);
71e7975a
DL
1011 SET_WAKEUP_UNRESPONSIVE(dmn);
1012 try_restart(dmn);
a6810074 1013 return 0;
8b886ca7 1014}
1015
a6810074 1016static int wakeup_send_echo(struct thread *t_wakeup)
8b886ca7 1017{
a6810074
DL
1018 static const char echocmd[] = "echo " PING_TOKEN;
1019 ssize_t rc;
1020 struct daemon *dmn = THREAD_ARG(t_wakeup);
1021
1022 dmn->t_wakeup = NULL;
d62a17ae 1023 if (((rc = write(dmn->fd, echocmd, sizeof(echocmd))) < 0)
1024 || ((size_t)rc != sizeof(echocmd))) {
a6810074
DL
1025 char why[100 + sizeof(echocmd)];
1026 snprintf(why, sizeof(why),
1027 "write '%s' returned %d instead of %u", echocmd,
d7c0a89a 1028 (int)rc, (unsigned int)sizeof(echocmd));
a6810074
DL
1029 daemon_down(dmn, why);
1030 } else {
1031 gettimeofday(&dmn->echo_sent, NULL);
66e78ae6
QY
1032 dmn->t_wakeup = NULL;
1033 thread_add_timer(master, wakeup_no_answer, dmn, gs.timeout,
1034 &dmn->t_wakeup);
a6810074
DL
1035 }
1036 return 0;
8b886ca7 1037}
1038
470bc619
QY
1039bool check_all_up(void)
1040{
1041 struct daemon *dmn;
1042
1043 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1044 if (dmn->state != DAEMON_UP)
1045 return false;
1046 return true;
1047}
1048
af568444
DL
/*
 * Print the watchdog's current status to a vty: the global phase, the
 * pid of a running global restart job (if any), and one line per
 * monitored daemon with its state, followed by either the pid of its
 * running restart job or, for a daemon that is down and still inside its
 * restart interval, the time left until the next restart.
 */
1049void watchfrr_status(struct vty *vty)
1050{
1051 struct daemon *dmn;
1052 struct timeval delay;
1053
1054 vty_out(vty, "watchfrr global phase: %s\n", phase_str[gs.phase]);
1055 if (gs.restart.pid)
1056 vty_out(vty, " global restart running, pid %ld\n",
1057 (long)gs.restart.pid);
1058
1059 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
cc53b605
DS
/* The line terminator is part of the ignore_timeout suffix, so exactly one newline is printed either way. */
1060 vty_out(vty, " %-20s %s%s", dmn->name, state_str[dmn->state],
1061 dmn->ignore_timeout ? "/Ignoring Timeout\n" : "\n");
af568444
DL
1062 if (dmn->restart.pid)
1063 vty_out(vty, " restart running, pid %ld\n",
1064 (long)dmn->restart.pid);
/* time_elapsed() stores its result in delay, which the message below reuses. */
1065 else if (dmn->state == DAEMON_DOWN &&
1066 time_elapsed(&delay, &dmn->restart.time)->tv_sec
1067 < dmn->restart.interval)
051a0be4
DL
/* Remaining wait = restart interval minus the seconds already elapsed. */
1068 vty_out(vty, " restarting in %jd seconds"
1069 " (%jds backoff interval)\n",
1070 (intmax_t)dmn->restart.interval
1071 - (intmax_t)delay.tv_sec,
1072 (intmax_t)dmn->restart.interval);
af568444
DL
1073 }
1074}
1075
/*
 * Termination handler, registered for both SIGINT and SIGTERM in
 * watchfrr_signals[]: log the shutdown, notify systemd that the service
 * is stopping, and exit successfully.
 */
static void sigint(void)
{
	zlog_notice("Terminating on signal");
	systemd_send_stopping();
	exit(EXIT_SUCCESS);
}
1082
/*
 * Check that a command template contains exactly one '%' and that it
 * introduces a "%s" conversion.
 *
 * Returns 1 when cmd is acceptable, 0 otherwise (no '%' at all, a '%'
 * followed by anything other than 's', or more than one '%').
 */
static int valid_command(const char *cmd)
{
	const char *pct = strchr(cmd, '%');

	if (pct == NULL)
		return 0;

	/* The first '%' must start a "%s". */
	if (pct[1] != 's')
		return 0;

	/* No further '%' may follow. */
	return strchr(pct + 1, '%') == NULL;
}
1090
/*
 * This is an ugly hack to circumvent problems with passing command-line
 * arguments that contain spaces.  The real fix is to use a configuration
 * file.
 *
 * Returns a newly allocated copy of cmd in which every occurrence of
 * blankstr has been replaced by a single space character.  The caller
 * owns the returned string.  Exits the process if the copy cannot be
 * allocated.
 *
 * An empty blankstr leaves the command unchanged.  Scanning resumes
 * after each inserted blank, so a marker that itself contains a space
 * (e.g. " ") cannot match its own replacement over and over.
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	char *res;
	char *p;
	size_t bslen = strlen(blankstr);

	if (!(res = strdup(cmd))) {
		perror("strdup");
		exit(1);
	}

	/*
	 * strstr() matches an empty needle at offset 0 forever, and the
	 * memmove below would then shift the string right, past the end of
	 * the allocation.
	 */
	if (bslen == 0)
		return res;

	p = res;
	while ((p = strstr(p, blankstr)) != NULL) {
		*p = ' ';
		/* Close the gap left by the rest of a multi-byte marker. */
		if (bslen != 1)
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
		/* Continue after the blank we just wrote. */
		p++;
	}
	return res;
}
1110
/*
 * Timer callback armed by watchfrr_init() with a STARTUP_TIMEOUT delay:
 * calls daemon_send_ready(1).  The thread argument is not used.
 * Always returns 0.
 */
static int startup_timeout(struct thread *t_wakeup)
{
	(void)t_wakeup;

	daemon_send_ready(1);
	return 0;
}
1116
0a7c7856
DL
1117static void watchfrr_init(int argc, char **argv)
1118{
1119 const char *special = "zebra";
1120 int i;
1121 struct daemon *dmn, **add = &gs.daemons;
1122 char alldaemons[512] = "", *p = alldaemons;
1123
5c9d1c83
DL
1124 thread_add_timer_msec(master, startup_timeout, NULL, STARTUP_TIMEOUT,
1125 &gs.t_startup_timeout);
1126
0a7c7856
DL
1127 for (i = optind; i < argc; i++) {
1128 dmn = XCALLOC(MTYPE_WATCHFRR_DAEMON, sizeof(*dmn));
1129
1130 dmn->name = dmn->restart.name = argv[i];
1131 dmn->state = DAEMON_INIT;
1132 gs.numdaemons++;
1133 gs.numdown++;
1134 dmn->fd = -1;
1135 dmn->t_wakeup = NULL;
c0e5cb52 1136 thread_add_timer_msec(master, wakeup_init, dmn, 0,
0a7c7856
DL
1137 &dmn->t_wakeup);
1138 dmn->restart.interval = gs.min_restart_interval;
1139 *add = dmn;
1140 add = &dmn->next;
1141
1142 if (!strcmp(dmn->name, special))
1143 gs.special = dmn;
1144 }
1145
1146 if (!gs.daemons) {
1147 fprintf(stderr,
1148 "Must specify one or more daemons to monitor.\n\n");
1149 frr_help_exit(1);
1150 }
1151 if (!watch_only && !gs.special) {
1152 fprintf(stderr, "\"%s\" daemon must be in daemon lists\n\n",
1153 special);
1154 frr_help_exit(1);
1155 }
1156
1157 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1158 snprintf(p, alldaemons + sizeof(alldaemons) - p, "%s%s",
1159 (p == alldaemons) ? "" : " ", dmn->name);
1160 p += strlen(p);
1161 }
1162 zlog_notice("%s %s watching [%s]%s", progname, FRR_VERSION, alldaemons,
1163 watch_only ? ", monitor mode" : "");
1164}
1165
/*
 * Privilege settings handed to libfrr through the .privs member of the
 * FRR_DAEMON_INFO() invocation in this file.  Only vty_group is set, and
 * only when VTY_GROUP is defined at build time.
 */
a6810074 1166struct zebra_privs_t watchfrr_privs = {
95c4aff2 1167#ifdef VTY_GROUP
a6810074 1168 .vty_group = VTY_GROUP,
95c4aff2
DL
1169#endif
1170};
1171
4f04a76b
DL
/*
 * Signal dispatch table registered through FRR_DAEMON_INFO(.signals):
 * SIGINT and SIGTERM both terminate via sigint(); SIGCHLD is routed to
 * sigchild().
 */
1172static struct quagga_signal_t watchfrr_signals[] = {
1173 {
1174 .signal = SIGINT,
1175 .handler = sigint,
1176 },
1177 {
1178 .signal = SIGTERM,
1179 .handler = sigint,
1180 },
1181 {
1182 .signal = SIGCHLD,
1183 .handler = sigchild,
1184 },
1185};
1186
/*
 * libfrr daemon descriptor for watchfrr: startup flags, help printer,
 * copyright string, the signal table above and the privilege settings.
 * NOTE(review): main() refers to this descriptor as watchfrr_di, which is
 * presumably the name the macro derives from its first argument -
 * confirm against libfrr.h.
 */
1187FRR_DAEMON_INFO(watchfrr, WATCHFRR,
d62a17ae 1188 .flags = FRR_NO_PRIVSEP | FRR_NO_TCPVTY | FRR_LIMITED_CLI
0a7c7856
DL
1189 | FRR_NO_CFG_PID_DRY | FRR_NO_ZCLIENT
1190 | FRR_DETACH_LATER,
4f04a76b 1191
d62a17ae 1192 .printhelp = printhelp,
1193 .copyright = "Copyright 2004 Andrew J. Schorr",
4f04a76b 1194
d62a17ae 1195 .signals = watchfrr_signals,
1196 .n_signals = array_size(watchfrr_signals),
4f04a76b 1197
d62a17ae 1198 .privs = &watchfrr_privs, )
4f04a76b 1199
999f153e
DL
/*
 * getopt-style option letters that main() still registers with
 * frr_opt_add() solely so that using one of them produces an explicit
 * "option no longer exists" error instead of a generic failure.
 */
1200#define DEPRECATED_OPTIONS "aAezR:"
1201
a6810074 1202int main(int argc, char **argv)
8b886ca7 1203{
a6810074 1204 int opt;
a6810074 1205 const char *blankstr = NULL;
a6810074 1206
4f04a76b
DL
1207 frr_preinit(&watchfrr_di, argc, argv);
1208 progname = watchfrr_di.progname;
1209
999f153e 1210 frr_opt_add("b:dk:l:i:p:r:S:s:t:T:" DEPRECATED_OPTIONS, longopts, "");
a6810074
DL
1211
1212 gs.restart.name = "all";
4f04a76b 1213 while ((opt = frr_getopt(argc, argv, NULL)) != EOF) {
999f153e
DL
1214 if (opt && opt < 128 && strchr(DEPRECATED_OPTIONS, opt)) {
1215 fprintf(stderr,
1216 "The -%c option no longer exists.\n"
1217 "Please refer to the watchfrr(8) man page.\n",
1218 opt);
1219 exit(1);
1220 }
1221
a6810074
DL
1222 switch (opt) {
1223 case 0:
1224 break;
a6810074
DL
1225 case 'b':
1226 blankstr = optarg;
1227 break;
f168b713
DL
1228 case OPTION_DRY:
1229 watch_only = true;
a6810074
DL
1230 break;
1231 case 'k':
1232 if (!valid_command(optarg)) {
1233 fprintf(stderr,
1234 "Invalid kill command, must contain '%%s': %s\n",
1235 optarg);
4f04a76b 1236 frr_help_exit(1);
a6810074
DL
1237 }
1238 gs.stop_command = optarg;
1239 break;
d62a17ae 1240 case 'l': {
1241 char garbage[3];
1242 if ((sscanf(optarg, "%d%1s", &gs.loglevel, garbage)
1243 != 1)
1244 || (gs.loglevel < LOG_EMERG)) {
1245 fprintf(stderr,
1246 "Invalid loglevel argument: %s\n",
1247 optarg);
1248 frr_help_exit(1);
a6810074 1249 }
d62a17ae 1250 } break;
1251 case OPTION_MINRESTART: {
1252 char garbage[3];
1253 if ((sscanf(optarg, "%ld%1s", &gs.min_restart_interval,
1254 garbage)
1255 != 1)
1256 || (gs.min_restart_interval < 0)) {
1257 fprintf(stderr,
1258 "Invalid min_restart_interval argument: %s\n",
1259 optarg);
1260 frr_help_exit(1);
a6810074 1261 }
d62a17ae 1262 } break;
1263 case OPTION_MAXRESTART: {
1264 char garbage[3];
1265 if ((sscanf(optarg, "%ld%1s", &gs.max_restart_interval,
1266 garbage)
1267 != 1)
1268 || (gs.max_restart_interval < 0)) {
1269 fprintf(stderr,
1270 "Invalid max_restart_interval argument: %s\n",
1271 optarg);
1272 frr_help_exit(1);
a6810074 1273 }
d62a17ae 1274 } break;
1275 case 'i': {
1276 char garbage[3];
1277 int period;
1278 if ((sscanf(optarg, "%d%1s", &period, garbage) != 1)
1279 || (gs.period < 1)) {
1280 fprintf(stderr,
1281 "Invalid interval argument: %s\n",
1282 optarg);
1283 frr_help_exit(1);
a6810074 1284 }
d62a17ae 1285 gs.period = 1000 * period;
1286 } break;
a6810074 1287 case 'p':
0a7c7856 1288 watchfrr_di.pid_file = optarg;
a6810074
DL
1289 break;
1290 case 'r':
a6810074
DL
1291 if (!valid_command(optarg)) {
1292 fprintf(stderr,
1293 "Invalid restart command, must contain '%%s': %s\n",
1294 optarg);
4f04a76b 1295 frr_help_exit(1);
a6810074
DL
1296 }
1297 gs.restart_command = optarg;
a6810074
DL
1298 break;
1299 case 's':
1300 if (!valid_command(optarg)) {
1301 fprintf(stderr,
1302 "Invalid start command, must contain '%%s': %s\n",
1303 optarg);
4f04a76b 1304 frr_help_exit(1);
a6810074
DL
1305 }
1306 gs.start_command = optarg;
1307 break;
1308 case 'S':
1309 gs.vtydir = optarg;
1310 break;
d62a17ae 1311 case 't': {
1312 char garbage[3];
1313 if ((sscanf(optarg, "%ld%1s", &gs.timeout, garbage)
1314 != 1)
1315 || (gs.timeout < 1)) {
1316 fprintf(stderr,
1317 "Invalid timeout argument: %s\n",
1318 optarg);
1319 frr_help_exit(1);
a6810074 1320 }
d62a17ae 1321 } break;
1322 case 'T': {
1323 char garbage[3];
1324 if ((sscanf(optarg, "%ld%1s", &gs.restart_timeout,
1325 garbage)
1326 != 1)
1327 || (gs.restart_timeout < 1)) {
1328 fprintf(stderr,
1329 "Invalid restart timeout argument: %s\n",
1330 optarg);
1331 frr_help_exit(1);
a6810074 1332 }
d62a17ae 1333 } break;
a6810074
DL
1334 default:
1335 fputs("Invalid option.\n", stderr);
4f04a76b 1336 frr_help_exit(1);
a6810074 1337 }
8b886ca7 1338 }
a6810074 1339
71e7975a
DL
1340 if (watch_only
1341 && (gs.start_command || gs.stop_command || gs.restart_command)) {
d87ae5cc 1342 fputs("Options -r/-s/-k are not used when --dry is active.\n",
a6810074 1343 stderr);
8b886ca7 1344 }
f168b713
DL
1345 if (!watch_only
1346 && (!gs.restart_command || !gs.start_command || !gs.stop_command)) {
1347 fprintf(stderr,
1348 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1349 frr_help_exit(1);
8b886ca7 1350 }
8b886ca7 1351
a6810074
DL
1352 if (blankstr) {
1353 if (gs.restart_command)
1354 gs.restart_command =
d62a17ae 1355 translate_blanks(gs.restart_command, blankstr);
a6810074
DL
1356 if (gs.start_command)
1357 gs.start_command =
d62a17ae 1358 translate_blanks(gs.start_command, blankstr);
a6810074
DL
1359 if (gs.stop_command)
1360 gs.stop_command =
d62a17ae 1361 translate_blanks(gs.stop_command, blankstr);
065de903 1362 }
8b886ca7 1363
a6810074 1364 gs.restart.interval = gs.min_restart_interval;
8b886ca7 1365
4f04a76b 1366 master = frr_init();
b647dc2a 1367 watchfrr_error_init();
0a7c7856
DL
1368 watchfrr_init(argc, argv);
1369 watchfrr_vty_init();
1370
1371 frr_config_fork();
4f04a76b 1372
dd8376fe 1373 zlog_set_level(ZLOG_DEST_MONITOR, ZLOG_DISABLED);
0a7c7856 1374 if (watchfrr_di.daemon_mode)
dd8376fe 1375 zlog_set_level(ZLOG_DEST_SYSLOG, MIN(gs.loglevel, LOG_DEBUG));
0a7c7856 1376 else
dd8376fe 1377 zlog_set_level(ZLOG_DEST_STDOUT, MIN(gs.loglevel, LOG_DEBUG));
8b886ca7 1378
0a7c7856 1379 frr_run(master);
8b886ca7 1380
a6810074
DL
1381 systemd_send_stopping();
1382 /* Not reached. */
1383 return 0;
8b886ca7 1384}