]> git.proxmox.com Git - mirror_frr.git/blame - watchfrr/watchfrr.c
watchfrr: don't wait around pointlessly at startup
[mirror_frr.git] / watchfrr / watchfrr.c
CommitLineData
8b886ca7 1/*
896014f4
DL
2 * Monitor status of frr daemons and restart if necessary.
3 *
4 * Copyright (C) 2004 Andrew J. Schorr
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
8b886ca7 19 */
20
a365534f 21#include <zebra.h>
8b886ca7 22#include <thread.h>
23#include <log.h>
52e66296 24#include <network.h>
8b886ca7 25#include <sigevent.h>
a365534f 26#include <lib/version.h>
95c4aff2 27#include "command.h"
87f44e2f 28#include "memory_vty.h"
4f04a76b 29#include "libfrr.h"
b647dc2a 30#include "lib_errors.h"
95c4aff2 31
6f594023 32#include <getopt.h>
a365534f 33#include <sys/un.h>
34#include <sys/wait.h>
837d16cc 35#include <memory.h>
651415bd 36#include <systemd.h>
8b886ca7 37
9473e340 38#include "watchfrr.h"
b647dc2a 39#include "watchfrr_errors.h"
95c4aff2 40
8b886ca7 41#ifndef MIN
42#define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
43#endif
44
45/* Macros to help randomize timers. */
46#define JITTER(X) ((random() % ((X)+1))-((X)/2))
47#define FUZZY(X) ((X)+JITTER((X)/20))
48
49#define DEFAULT_PERIOD 5
0a64aff6 50#define DEFAULT_TIMEOUT 90
8b886ca7 51#define DEFAULT_RESTART_TIMEOUT 20
52#define DEFAULT_LOGLEVEL LOG_INFO
53#define DEFAULT_MIN_RESTART 60
54#define DEFAULT_MAX_RESTART 600
8b886ca7 55
56#define PING_TOKEN "PING"
57
0a7c7856
DL
58DEFINE_MGROUP(WATCHFRR, "watchfrr")
59DEFINE_MTYPE_STATIC(WATCHFRR, WATCHFRR_DAEMON, "watchfrr daemon entry")
60
55c72803 61/* Needs to be global, referenced somewhere inside libfrr. */
8b886ca7 62struct thread_master *master;
63
f168b713 64static bool watch_only = false;
8b886ca7 65
a6810074
DL
66typedef enum {
67 PHASE_NONE = 0,
c0e5cb52 68 PHASE_INIT,
a6810074
DL
69 PHASE_STOPS_PENDING,
70 PHASE_WAITING_DOWN,
71 PHASE_ZEBRA_RESTART_PENDING,
72 PHASE_WAITING_ZEBRA_UP
8b886ca7 73} restart_phase_t;
74
a6810074
DL
75static const char *phase_str[] = {
76 "None",
c0e5cb52 77 "Startup",
a6810074
DL
78 "Stop jobs running",
79 "Waiting for other daemons to come down",
80 "Zebra restart job running",
81 "Waiting for zebra to come up",
82 "Start jobs running",
8b886ca7 83};
84
85#define PHASE_TIMEOUT (3*gs.restart_timeout)
86
a6810074
DL
87struct restart_info {
88 const char *name;
89 const char *what;
90 pid_t pid;
91 struct timeval time;
92 long interval;
93 struct thread *t_kill;
94 int kills;
098e240f 95};
96
a6810074 97static struct global_state {
a6810074
DL
98 restart_phase_t phase;
99 struct thread *t_phase_hanging;
100 const char *vtydir;
101 long period;
102 long timeout;
103 long restart_timeout;
104 long min_restart_interval;
105 long max_restart_interval;
a6810074
DL
106 struct daemon *daemons;
107 const char *restart_command;
108 const char *start_command;
109 const char *stop_command;
110 struct restart_info restart;
a6810074 111 int loglevel;
d62a17ae 112 struct daemon *special; /* points to zebra when doing phased restart */
a6810074
DL
113 int numdaemons;
114 int numpids;
d62a17ae 115 int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
8b886ca7 116} gs = {
c0e5cb52 117 .phase = PHASE_INIT,
64a249ad 118 .vtydir = frr_vtydir,
d62a17ae 119 .period = 1000 * DEFAULT_PERIOD,
120 .timeout = DEFAULT_TIMEOUT,
121 .restart_timeout = DEFAULT_RESTART_TIMEOUT,
122 .loglevel = DEFAULT_LOGLEVEL,
123 .min_restart_interval = DEFAULT_MIN_RESTART,
124 .max_restart_interval = DEFAULT_MAX_RESTART,
d62a17ae 125};
a6810074
DL
126
127typedef enum {
128 DAEMON_INIT,
129 DAEMON_DOWN,
130 DAEMON_CONNECTING,
131 DAEMON_UP,
132 DAEMON_UNRESPONSIVE
8b886ca7 133} daemon_state_t;
134
d62a17ae 135#define IS_UP(DMN) \
136 (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
8b886ca7 137
a6810074 138static const char *state_str[] = {
d62a17ae 139 "Init", "Down", "Connecting", "Up", "Unresponsive",
8b886ca7 140};
141
142struct daemon {
a6810074
DL
143 const char *name;
144 daemon_state_t state;
145 int fd;
146 struct timeval echo_sent;
d7c0a89a 147 unsigned int connect_tries;
a6810074
DL
148 struct thread *t_wakeup;
149 struct thread *t_read;
150 struct thread *t_write;
151 struct daemon *next;
152 struct restart_info restart;
8b886ca7 153};
154
9272302b
DL
155#define OPTION_MINRESTART 2000
156#define OPTION_MAXRESTART 2001
f168b713 157#define OPTION_DRY 2002
9272302b 158
a6810074
DL
159static const struct option longopts[] = {
160 {"daemon", no_argument, NULL, 'd'},
161 {"statedir", required_argument, NULL, 'S'},
a6810074
DL
162 {"loglevel", required_argument, NULL, 'l'},
163 {"interval", required_argument, NULL, 'i'},
164 {"timeout", required_argument, NULL, 't'},
165 {"restart-timeout", required_argument, NULL, 'T'},
166 {"restart", required_argument, NULL, 'r'},
167 {"start-command", required_argument, NULL, 's'},
168 {"kill-command", required_argument, NULL, 'k'},
f168b713 169 {"dry", no_argument, NULL, OPTION_DRY},
d62a17ae 170 {"min-restart-interval", required_argument, NULL, OPTION_MINRESTART},
171 {"max-restart-interval", required_argument, NULL, OPTION_MAXRESTART},
a6810074
DL
172 {"pid-file", required_argument, NULL, 'p'},
173 {"blank-string", required_argument, NULL, 'b'},
174 {"help", no_argument, NULL, 'h'},
175 {"version", no_argument, NULL, 'v'},
d62a17ae 176 {NULL, 0, NULL, 0}};
8b886ca7 177
178static int try_connect(struct daemon *dmn);
179static int wakeup_send_echo(struct thread *t_wakeup);
180static void try_restart(struct daemon *dmn);
181static void phase_check(void);
182
4f04a76b
DL
183static const char *progname;
184static void printhelp(FILE *target)
8b886ca7 185{
d62a17ae 186 fprintf(target,
187 "Usage : %s [OPTION...] <daemon name> ...\n\n\
9473e340 188Watchdog program to monitor status of frr daemons and try to restart\n\
8b886ca7 189them if they are down or unresponsive. It determines whether a daemon is\n\
190up based on whether it can connect to the daemon's vty unix stream socket.\n\
191It then repeatedly sends echo commands over that socket to determine whether\n\
192the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
193on the socket connection and know immediately that the daemon is down.\n\n\
194The daemons to be monitored should be listed on the command line.\n\n\
8b886ca7 195In order to avoid attempting to restart the daemons in a fast loop,\n\
196the -m and -M options allow you to control the minimum delay between\n\
197restart commands. The minimum restart delay is recalculated each time\n\
198a restart is attempted: if the time since the last restart attempt exceeds\n\
199twice the -M value, then the restart delay is set to the -m value.\n\
d62a17ae 200Otherwise, the interval is doubled (but capped at the -M value).\n\n",
f168b713 201 progname);
e757c940 202
d62a17ae 203 fprintf(target,
204 "Options:\n\
8b886ca7 205-d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
206 to syslog instead of stdout.\n\
207-S, --statedir Set the vty socket directory (default is %s)\n\
8b886ca7 208-l, --loglevel Set the logging level (default is %d).\n\
209 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
210 but it can be set higher than %d if extra-verbose debugging\n\
211 messages are desired.\n\
9272302b 212 --min-restart-interval\n\
8b886ca7 213 Set the minimum seconds to wait between invocations of daemon\n\
214 restart commands (default is %d).\n\
9272302b 215 --max-restart-interval\n\
8b886ca7 216 Set the maximum seconds to wait between invocations of daemon\n\
217 restart commands (default is %d).\n\
218-i, --interval Set the status polling interval in seconds (default is %d)\n\
219-t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
220-T, --restart-timeout\n\
221 Set the restart (kill) timeout in seconds (default is %d).\n\
222 If any background jobs are still running after this much\n\
223 time has elapsed, they will be killed.\n\
224-r, --restart Supply a Bourne shell command to use to restart a single\n\
225 daemon. The command string should include '%%s' where the\n\
226 name of the daemon should be substituted.\n\
8b886ca7 227-s, --start-command\n\
228 Supply a Bourne shell to command to use to start a single\n\
229 daemon. The command string should include '%%s' where the\n\
230 name of the daemon should be substituted.\n\
231-k, --kill-command\n\
232 Supply a Bourne shell to command to use to stop a single\n\
233 daemon. The command string should include '%%s' where the\n\
234 name of the daemon should be substituted.\n\
f168b713 235 --dry Do not start or restart anything, just log.\n\
8b886ca7 236-p, --pid-file Set process identifier file name\n\
0a7c7856 237 (default is %s/watchfrr.pid).\n\
c8b40f86 238-b, --blank-string\n\
239 When the supplied argument string is found in any of the\n\
f168b713 240 various shell command arguments (-r, -s, or -k), replace\n\
c8b40f86 241 it with a space. This is an ugly hack to circumvent problems\n\
242 passing command-line arguments with embedded spaces.\n\
8b886ca7 243-v, --version Print program version\n\
d62a17ae 244-h, --help Display this help and exit\n",
64a249ad 245 frr_vtydir, DEFAULT_LOGLEVEL, LOG_EMERG, LOG_DEBUG, LOG_DEBUG,
d62a17ae 246 DEFAULT_MIN_RESTART, DEFAULT_MAX_RESTART, DEFAULT_PERIOD,
0a7c7856 247 DEFAULT_TIMEOUT, DEFAULT_RESTART_TIMEOUT, frr_vtydir);
8b886ca7 248}
249
a6810074 250static pid_t run_background(char *shell_cmd)
8b886ca7 251{
a6810074
DL
252 pid_t child;
253
254 switch (child = fork()) {
255 case -1:
450971aa 256 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
257 "fork failed, cannot run command [%s]: %s",
258 shell_cmd, safe_strerror(errno));
a6810074
DL
259 return -1;
260 case 0:
261 /* Child process. */
d62a17ae 262 /* Use separate process group so child processes can be killed
263 * easily. */
a6810074
DL
264 if (setpgid(0, 0) < 0)
265 zlog_warn("warning: setpgid(0,0) failed: %s",
266 safe_strerror(errno));
267 {
268 char shell[] = "sh";
269 char dashc[] = "-c";
d62a17ae 270 char *const argv[4] = {shell, dashc, shell_cmd, NULL};
a6810074 271 execv("/bin/sh", argv);
450971aa 272 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
273 "execv(/bin/sh -c '%s') failed: %s",
274 shell_cmd, safe_strerror(errno));
a6810074
DL
275 _exit(127);
276 }
277 default:
278 /* Parent process: we will reap the child later. */
450971aa 279 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
280 "Forked background command [pid %d]: %s",
281 (int)child, shell_cmd);
a6810074
DL
282 return child;
283 }
8b886ca7 284}
285
a6810074
DL
/*
 * Store in *result the wall-clock time elapsed since *start_time and
 * return result.  The microsecond field is normalized to be
 * non-negative by borrowing from the seconds field.
 */
static struct timeval *time_elapsed(struct timeval *result,
				    const struct timeval *start_time)
{
	/* Start from "now", then subtract the reference point in place. */
	gettimeofday(result, NULL);
	result->tv_usec -= start_time->tv_usec;
	result->tv_sec -= start_time->tv_sec;

	/* Borrow whole seconds until the microsecond part is >= 0. */
	for (; result->tv_usec < 0; result->tv_sec--)
		result->tv_usec += 1000000L;

	return result;
}
298
a6810074 299static int restart_kill(struct thread *t_kill)
8b886ca7 300{
a6810074
DL
301 struct restart_info *restart = THREAD_ARG(t_kill);
302 struct timeval delay;
303
304 time_elapsed(&delay, &restart->time);
d62a17ae 305 zlog_warn(
306 "Warning: %s %s child process %d still running after "
307 "%ld seconds, sending signal %d",
308 restart->what, restart->name, (int)restart->pid,
309 (long)delay.tv_sec, (restart->kills ? SIGKILL : SIGTERM));
a6810074
DL
310 kill(-restart->pid, (restart->kills ? SIGKILL : SIGTERM));
311 restart->kills++;
66e78ae6
QY
312 restart->t_kill = NULL;
313 thread_add_timer(master, restart_kill, restart, gs.restart_timeout,
314 &restart->t_kill);
a6810074 315 return 0;
8b886ca7 316}
317
a6810074 318static struct restart_info *find_child(pid_t child)
8b886ca7 319{
f168b713 320 struct daemon *dmn;
7c265f7d
CF
321 if (gs.restart.pid == child)
322 return &gs.restart;
323
f168b713
DL
324 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
325 if (dmn->restart.pid == child)
326 return &dmn->restart;
a6810074
DL
327 }
328 return NULL;
8b886ca7 329}
330
a6810074 331static void sigchild(void)
8b886ca7 332{
a6810074
DL
333 pid_t child;
334 int status;
335 const char *name;
336 const char *what;
337 struct restart_info *restart;
338
339 switch (child = waitpid(-1, &status, WNOHANG)) {
340 case -1:
450971aa 341 flog_err_sys(EC_LIB_SYSTEM_CALL, "waitpid failed: %s",
09c866e3 342 safe_strerror(errno));
a6810074
DL
343 return;
344 case 0:
345 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
346 return;
347 }
348
349 if (child == integrated_write_pid) {
350 integrated_write_sigchld(status);
351 return;
352 }
353
354 if ((restart = find_child(child)) != NULL) {
355 name = restart->name;
356 what = restart->what;
357 restart->pid = 0;
358 gs.numpids--;
359 thread_cancel(restart->t_kill);
360 restart->t_kill = NULL;
d62a17ae 361 /* Update restart time to reflect the time the command
362 * completed. */
a6810074
DL
363 gettimeofday(&restart->time, NULL);
364 } else {
09c866e3 365 flog_err_sys(
450971aa 366 EC_LIB_SYSTEM_CALL,
09c866e3
QY
367 "waitpid returned status for an unknown child process %d",
368 (int)child);
a6810074
DL
369 name = "(unknown)";
370 what = "background";
371 }
372 if (WIFSTOPPED(status))
d62a17ae 373 zlog_warn("warning: %s %s process %d is stopped", what, name,
374 (int)child);
a6810074 375 else if (WIFSIGNALED(status))
d62a17ae 376 zlog_warn("%s %s process %d terminated due to signal %d", what,
377 name, (int)child, WTERMSIG(status));
a6810074
DL
378 else if (WIFEXITED(status)) {
379 if (WEXITSTATUS(status) != 0)
d62a17ae 380 zlog_warn(
381 "%s %s process %d exited with non-zero status %d",
382 what, name, (int)child, WEXITSTATUS(status));
a6810074
DL
383 else
384 zlog_debug("%s %s process %d exited normally", what,
385 name, (int)child);
386 } else
09c866e3 387 flog_err_sys(
450971aa 388 EC_LIB_SYSTEM_CALL,
09c866e3
QY
389 "cannot interpret %s %s process %d wait status 0x%x",
390 what, name, (int)child, status);
a6810074 391 phase_check();
8b886ca7 392}
393
d62a17ae 394static int run_job(struct restart_info *restart, const char *cmdtype,
395 const char *command, int force, int update_interval)
8b886ca7 396{
a6810074
DL
397 struct timeval delay;
398
399 if (gs.loglevel > LOG_DEBUG + 1)
400 zlog_debug("attempting to %s %s", cmdtype, restart->name);
401
402 if (restart->pid) {
403 if (gs.loglevel > LOG_DEBUG + 1)
d62a17ae 404 zlog_debug(
405 "cannot %s %s, previous pid %d still running",
406 cmdtype, restart->name, (int)restart->pid);
a6810074
DL
407 return -1;
408 }
409
d62a17ae 410 /* Note: time_elapsed test must come before the force test, since we
411 need
a6810074
DL
412 to make sure that delay is initialized for use below in updating the
413 restart interval. */
414 if ((time_elapsed(&delay, &restart->time)->tv_sec < restart->interval)
415 && !force) {
416 if (gs.loglevel > LOG_DEBUG + 1)
d62a17ae 417 zlog_debug(
418 "postponing %s %s: "
419 "elapsed time %ld < retry interval %ld",
420 cmdtype, restart->name, (long)delay.tv_sec,
421 restart->interval);
a6810074
DL
422 return -1;
423 }
424
425 gettimeofday(&restart->time, NULL);
426 restart->kills = 0;
427 {
428 char cmd[strlen(command) + strlen(restart->name) + 1];
429 snprintf(cmd, sizeof(cmd), command, restart->name);
430 if ((restart->pid = run_background(cmd)) > 0) {
66e78ae6 431 restart->t_kill = NULL;
d62a17ae 432 thread_add_timer(master, restart_kill, restart,
433 gs.restart_timeout, &restart->t_kill);
a6810074
DL
434 restart->what = cmdtype;
435 gs.numpids++;
436 } else
437 restart->pid = 0;
438 }
439
440 /* Calculate the new restart interval. */
441 if (update_interval) {
442 if (delay.tv_sec > 2 * gs.max_restart_interval)
443 restart->interval = gs.min_restart_interval;
444 else if ((restart->interval *= 2) > gs.max_restart_interval)
445 restart->interval = gs.max_restart_interval;
446 if (gs.loglevel > LOG_DEBUG + 1)
447 zlog_debug("restart %s interval is now %ld",
448 restart->name, restart->interval);
449 }
450 return restart->pid;
8b886ca7 451}
452
/*
 * Scheduling helpers.  Each macro clears the daemon's stored thread
 * pointer and then registers a new event with the thread master:
 *
 *   SET_READ_HANDLER         - handle_read() when (DMN)->fd is readable
 *   SET_WAKEUP_DOWN          - wakeup_down() timer
 *   SET_WAKEUP_UNRESPONSIVE  - wakeup_unresponsive() timer
 *   SET_WAKEUP_ECHO          - wakeup_send_echo() timer
 *
 * The timers fire after FUZZY(gs.period), i.e. the polling period with
 * a small random jitter applied.
 *
 * The macros deliberately do NOT end in a semicolon: the call site
 * supplies it, so each use is exactly one statement and is safe inside
 * an unbraced if/else.
 */
#define SET_READ_HANDLER(DMN)                                                  \
	do {                                                                   \
		(DMN)->t_read = NULL;                                          \
		thread_add_read(master, handle_read, (DMN), (DMN)->fd,         \
				&(DMN)->t_read);                               \
	} while (0)

#define SET_WAKEUP_DOWN(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_down, (DMN),              \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

#define SET_WAKEUP_UNRESPONSIVE(DMN)                                           \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_unresponsive, (DMN),      \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

#define SET_WAKEUP_ECHO(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_send_echo, (DMN),         \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)
8b886ca7 480
a6810074 481static int wakeup_down(struct thread *t_wakeup)
8b886ca7 482{
a6810074
DL
483 struct daemon *dmn = THREAD_ARG(t_wakeup);
484
485 dmn->t_wakeup = NULL;
486 if (try_connect(dmn) < 0)
487 SET_WAKEUP_DOWN(dmn);
488 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
489 try_restart(dmn);
490 return 0;
8b886ca7 491}
492
a6810074 493static int wakeup_init(struct thread *t_wakeup)
8b886ca7 494{
a6810074
DL
495 struct daemon *dmn = THREAD_ARG(t_wakeup);
496
497 dmn->t_wakeup = NULL;
498 if (try_connect(dmn) < 0) {
f74ae2bb 499 flog_err(EC_WATCHFRR_CONNECTION,
1c50c1c0
QY
500 "%s state -> down : initial connection attempt failed",
501 dmn->name);
a6810074
DL
502 dmn->state = DAEMON_DOWN;
503 }
c0e5cb52 504 phase_check();
a6810074 505 return 0;
8b886ca7 506}
507
a6810074 508static void daemon_down(struct daemon *dmn, const char *why)
8b886ca7 509{
a6810074 510 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
1c50c1c0
QY
511 flog_err(EC_WATCHFRR_CONNECTION, "%s state -> down : %s",
512 dmn->name, why);
a6810074
DL
513 else if (gs.loglevel > LOG_DEBUG)
514 zlog_debug("%s still down : %s", dmn->name, why);
515 if (IS_UP(dmn))
516 gs.numdown++;
517 dmn->state = DAEMON_DOWN;
518 if (dmn->fd >= 0) {
519 close(dmn->fd);
520 dmn->fd = -1;
521 }
522 THREAD_OFF(dmn->t_read);
523 THREAD_OFF(dmn->t_write);
524 THREAD_OFF(dmn->t_wakeup);
525 if (try_connect(dmn) < 0)
526 SET_WAKEUP_DOWN(dmn);
527 phase_check();
8b886ca7 528}
529
a6810074 530static int handle_read(struct thread *t_read)
8b886ca7 531{
a6810074
DL
532 struct daemon *dmn = THREAD_ARG(t_read);
533 static const char resp[sizeof(PING_TOKEN) + 4] = PING_TOKEN "\n";
534 char buf[sizeof(resp) + 100];
535 ssize_t rc;
536 struct timeval delay;
537
538 dmn->t_read = NULL;
539 if ((rc = read(dmn->fd, buf, sizeof(buf))) < 0) {
540 char why[100];
541
542 if (ERRNO_IO_RETRY(errno)) {
543 /* Pretend it never happened. */
544 SET_READ_HANDLER(dmn);
545 return 0;
546 }
547 snprintf(why, sizeof(why), "unexpected read error: %s",
548 safe_strerror(errno));
549 daemon_down(dmn, why);
550 return 0;
8b886ca7 551 }
a6810074
DL
552 if (rc == 0) {
553 daemon_down(dmn, "read returned EOF");
554 return 0;
555 }
556 if (!dmn->echo_sent.tv_sec) {
557 char why[sizeof(buf) + 100];
558 snprintf(why, sizeof(why),
559 "unexpected read returns %d bytes: %.*s", (int)rc,
560 (int)rc, buf);
561 daemon_down(dmn, why);
562 return 0;
8b886ca7 563 }
a6810074
DL
564
565 /* We are expecting an echo response: is there any chance that the
566 response would not be returned entirely in the first read? That
567 seems inconceivable... */
568 if ((rc != sizeof(resp)) || memcmp(buf, resp, sizeof(resp))) {
569 char why[100 + sizeof(buf)];
570 snprintf(why, sizeof(why),
571 "read returned bad echo response of %d bytes "
d62a17ae 572 "(expecting %u): %.*s",
d7c0a89a 573 (int)rc, (unsigned int)sizeof(resp), (int)rc, buf);
a6810074
DL
574 daemon_down(dmn, why);
575 return 0;
576 }
577
578 time_elapsed(&delay, &dmn->echo_sent);
579 dmn->echo_sent.tv_sec = 0;
580 if (dmn->state == DAEMON_UNRESPONSIVE) {
581 if (delay.tv_sec < gs.timeout) {
582 dmn->state = DAEMON_UP;
d62a17ae 583 zlog_warn(
584 "%s state -> up : echo response received after %ld.%06ld "
585 "seconds",
586 dmn->name, (long)delay.tv_sec,
587 (long)delay.tv_usec);
a6810074 588 } else
d62a17ae 589 zlog_warn(
590 "%s: slow echo response finally received after %ld.%06ld "
591 "seconds",
592 dmn->name, (long)delay.tv_sec,
593 (long)delay.tv_usec);
a6810074
DL
594 } else if (gs.loglevel > LOG_DEBUG + 1)
595 zlog_debug("%s: echo response received after %ld.%06ld seconds",
596 dmn->name, (long)delay.tv_sec, (long)delay.tv_usec);
597
598 SET_READ_HANDLER(dmn);
599 if (dmn->t_wakeup)
600 thread_cancel(dmn->t_wakeup);
601 SET_WAKEUP_ECHO(dmn);
602
603 return 0;
8b886ca7 604}
605
207e0d7a
DS
606/*
607 * Wait till we notice that all daemons are ready before
608 * we send we are ready to systemd
609 */
a6810074 610static void daemon_send_ready(void)
207e0d7a 611{
a6810074
DL
612 static int sent = 0;
613 if (!sent && gs.numdown == 0) {
a6810074 614 FILE *fp;
207e0d7a 615
0a7c7856
DL
616 zlog_notice("all daemons up, doing startup-complete notify");
617 frr_detach();
618
a6810074 619 fp = fopen(DAEMON_VTY_DIR "/watchfrr.started", "w");
f5ba21fc
DS
620 if (fp)
621 fclose(fp);
60bd2534 622#if defined HAVE_SYSTEMD
a6810074 623 systemd_send_started(master, 0);
60bd2534 624#endif
a6810074
DL
625 sent = 1;
626 }
207e0d7a
DS
627}
628
a6810074 629static void daemon_up(struct daemon *dmn, const char *why)
8b886ca7 630{
a6810074
DL
631 dmn->state = DAEMON_UP;
632 gs.numdown--;
633 dmn->connect_tries = 0;
634 zlog_notice("%s state -> up : %s", dmn->name, why);
635 daemon_send_ready();
a8cbb8b3 636 SET_WAKEUP_ECHO(dmn);
a6810074 637 phase_check();
8b886ca7 638}
639
a6810074 640static int check_connect(struct thread *t_write)
8b886ca7 641{
a6810074
DL
642 struct daemon *dmn = THREAD_ARG(t_write);
643 int sockerr;
644 socklen_t reslen = sizeof(sockerr);
645
646 dmn->t_write = NULL;
647 if (getsockopt(dmn->fd, SOL_SOCKET, SO_ERROR, (char *)&sockerr, &reslen)
648 < 0) {
649 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn->name,
650 safe_strerror(errno));
651 daemon_down(dmn,
652 "getsockopt failed checking connection success");
653 return 0;
654 }
655 if ((reslen == sizeof(sockerr)) && sockerr) {
656 char why[100];
d62a17ae 657 snprintf(
658 why, sizeof(why),
659 "getsockopt reports that connection attempt failed: %s",
660 safe_strerror(sockerr));
a6810074
DL
661 daemon_down(dmn, why);
662 return 0;
663 }
664
665 daemon_up(dmn, "delayed connect succeeded");
666 return 0;
8b886ca7 667}
668
a6810074 669static int wakeup_connect_hanging(struct thread *t_wakeup)
8b886ca7 670{
a6810074
DL
671 struct daemon *dmn = THREAD_ARG(t_wakeup);
672 char why[100];
673
674 dmn->t_wakeup = NULL;
675 snprintf(why, sizeof(why),
676 "connection attempt timed out after %ld seconds", gs.timeout);
677 daemon_down(dmn, why);
678 return 0;
8b886ca7 679}
680
681/* Making connection to protocol daemon. */
a6810074 682static int try_connect(struct daemon *dmn)
8b886ca7 683{
a6810074
DL
684 int sock;
685 struct sockaddr_un addr;
686 socklen_t len;
687
688 if (gs.loglevel > LOG_DEBUG + 1)
689 zlog_debug("%s: attempting to connect", dmn->name);
690 dmn->connect_tries++;
691
692 memset(&addr, 0, sizeof(struct sockaddr_un));
693 addr.sun_family = AF_UNIX;
d62a17ae 694 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty", gs.vtydir,
695 dmn->name);
6f0e3f6e 696#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
a6810074 697 len = addr.sun_len = SUN_LEN(&addr);
8b886ca7 698#else
a6810074 699 len = sizeof(addr.sun_family) + strlen(addr.sun_path);
d62a17ae 700#endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
a6810074
DL
701
702 /* Quick check to see if we might succeed before we go to the trouble
703 of creating a socket. */
704 if (access(addr.sun_path, W_OK) < 0) {
705 if (errno != ENOENT)
450971aa 706 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
707 "%s: access to socket %s denied: %s",
708 dmn->name, addr.sun_path,
709 safe_strerror(errno));
a6810074
DL
710 return -1;
711 }
712
713 if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
450971aa 714 flog_err_sys(EC_LIB_SOCKET, "%s(%s): cannot make socket: %s",
09c866e3 715 __func__, addr.sun_path, safe_strerror(errno));
a6810074
DL
716 return -1;
717 }
718
719 if (set_nonblocking(sock) < 0 || set_cloexec(sock) < 0) {
450971aa 720 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
721 "%s(%s): set_nonblocking/cloexec(%d) failed",
722 __func__, addr.sun_path, sock);
a6810074
DL
723 close(sock);
724 return -1;
8b886ca7 725 }
a6810074
DL
726
727 if (connect(sock, (struct sockaddr *)&addr, len) < 0) {
728 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK)) {
729 if (gs.loglevel > LOG_DEBUG)
730 zlog_debug("%s(%s): connect failed: %s",
731 __func__, addr.sun_path,
732 safe_strerror(errno));
733 close(sock);
734 return -1;
735 }
736 if (gs.loglevel > LOG_DEBUG)
737 zlog_debug("%s: connection in progress", dmn->name);
738 dmn->state = DAEMON_CONNECTING;
739 dmn->fd = sock;
66e78ae6
QY
740 dmn->t_write = NULL;
741 thread_add_write(master, check_connect, dmn, dmn->fd,
d62a17ae 742 &dmn->t_write);
743 dmn->t_wakeup = NULL;
744 thread_add_timer(master, wakeup_connect_hanging, dmn,
745 gs.timeout, &dmn->t_wakeup);
a6810074
DL
746 SET_READ_HANDLER(dmn);
747 return 0;
748 }
749
750 dmn->fd = sock;
751 SET_READ_HANDLER(dmn);
752 daemon_up(dmn, "connect succeeded");
753 return 1;
8b886ca7 754}
755
a6810074 756static int phase_hanging(struct thread *t_hanging)
8b886ca7 757{
a6810074 758 gs.t_phase_hanging = NULL;
f74ae2bb 759 flog_err(EC_WATCHFRR_CONNECTION,
1c50c1c0
QY
760 "Phase [%s] hanging for %ld seconds, aborting phased restart",
761 phase_str[gs.phase], PHASE_TIMEOUT);
a6810074
DL
762 gs.phase = PHASE_NONE;
763 return 0;
8b886ca7 764}
765
a6810074 766static void set_phase(restart_phase_t new_phase)
8b886ca7 767{
a6810074
DL
768 gs.phase = new_phase;
769 if (gs.t_phase_hanging)
770 thread_cancel(gs.t_phase_hanging);
66e78ae6
QY
771 gs.t_phase_hanging = NULL;
772 thread_add_timer(master, phase_hanging, NULL, PHASE_TIMEOUT,
773 &gs.t_phase_hanging);
8b886ca7 774}
775
a6810074 776static void phase_check(void)
8b886ca7 777{
c0e5cb52
DL
778 struct daemon *dmn;
779
a6810074
DL
780 switch (gs.phase) {
781 case PHASE_NONE:
782 break;
c0e5cb52
DL
783
784 case PHASE_INIT:
785 for (dmn = gs.daemons; dmn; dmn = dmn->next)
786 if (dmn->state == DAEMON_INIT)
787 return;
788
789 /* startup complete, everything out of INIT */
790 gs.phase = PHASE_NONE;
791 for (dmn = gs.daemons; dmn; dmn = dmn->next)
792 if (dmn->state == DAEMON_DOWN) {
793 SET_WAKEUP_DOWN(dmn);
794 try_restart(dmn);
795 }
796 break;
a6810074
DL
797 case PHASE_STOPS_PENDING:
798 if (gs.numpids)
799 break;
d62a17ae 800 zlog_info(
801 "Phased restart: all routing daemon stop jobs have completed.");
a6810074
DL
802 set_phase(PHASE_WAITING_DOWN);
803
d62a17ae 804 /*FALLTHRU*/
a6810074
DL
805 case PHASE_WAITING_DOWN:
806 if (gs.numdown + IS_UP(gs.special) < gs.numdaemons)
807 break;
808 zlog_info("Phased restart: all routing daemons now down.");
809 run_job(&gs.special->restart, "restart", gs.restart_command, 1,
810 1);
811 set_phase(PHASE_ZEBRA_RESTART_PENDING);
812
d62a17ae 813 /*FALLTHRU*/
a6810074
DL
814 case PHASE_ZEBRA_RESTART_PENDING:
815 if (gs.special->restart.pid)
816 break;
817 zlog_info("Phased restart: %s restart job completed.",
818 gs.special->name);
819 set_phase(PHASE_WAITING_ZEBRA_UP);
820
d62a17ae 821 /*FALLTHRU*/
a6810074
DL
822 case PHASE_WAITING_ZEBRA_UP:
823 if (!IS_UP(gs.special))
824 break;
825 zlog_info("Phased restart: %s is now up.", gs.special->name);
826 {
827 struct daemon *dmn;
828 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
829 if (dmn != gs.special)
830 run_job(&dmn->restart, "start",
831 gs.start_command, 1, 0);
832 }
833 }
834 gs.phase = PHASE_NONE;
835 THREAD_OFF(gs.t_phase_hanging);
836 zlog_notice("Phased global restart has completed.");
837 break;
838 }
8b886ca7 839}
840
a6810074 841static void try_restart(struct daemon *dmn)
8b886ca7 842{
f168b713 843 if (watch_only)
a6810074 844 return;
a6810074 845
f168b713
DL
846 if (dmn != gs.special) {
847 if ((gs.special->state == DAEMON_UP)
848 && (gs.phase == PHASE_NONE))
849 run_job(&dmn->restart, "restart", gs.restart_command, 0,
850 1);
851 else
852 zlog_debug(
853 "%s: postponing restart attempt because master %s daemon "
854 "not up [%s], or phased restart in progress",
855 dmn->name, gs.special->name,
856 state_str[gs.special->state]);
857 return;
858 }
859
860 if ((gs.phase != PHASE_NONE) || gs.numpids) {
861 if (gs.loglevel > LOG_DEBUG + 1)
862 zlog_debug(
863 "postponing phased global restart: restart already in "
864 "progress [%s], or outstanding child processes [%d]",
865 phase_str[gs.phase], gs.numpids);
866 return;
867 }
868 /* Is it too soon for a restart? */
869 {
870 struct timeval delay;
871 if (time_elapsed(&delay, &gs.special->restart.time)->tv_sec
872 < gs.special->restart.interval) {
a6810074 873 if (gs.loglevel > LOG_DEBUG + 1)
d62a17ae 874 zlog_debug(
f168b713
DL
875 "postponing phased global restart: "
876 "elapsed time %ld < retry interval %ld",
877 (long)delay.tv_sec,
878 gs.special->restart.interval);
879 return;
a6810074 880 }
8b886ca7 881 }
f168b713 882 run_job(&gs.restart, "restart", gs.restart_command, 0, 1);
8b886ca7 883}
884
a6810074 885static int wakeup_unresponsive(struct thread *t_wakeup)
8b886ca7 886{
a6810074
DL
887 struct daemon *dmn = THREAD_ARG(t_wakeup);
888
889 dmn->t_wakeup = NULL;
890 if (dmn->state != DAEMON_UNRESPONSIVE)
f74ae2bb 891 flog_err(EC_WATCHFRR_CONNECTION,
1c50c1c0
QY
892 "%s: no longer unresponsive (now %s), "
893 "wakeup should have been cancelled!",
894 dmn->name, state_str[dmn->state]);
a6810074
DL
895 else {
896 SET_WAKEUP_UNRESPONSIVE(dmn);
897 try_restart(dmn);
898 }
899 return 0;
8b886ca7 900}
901
a6810074 902static int wakeup_no_answer(struct thread *t_wakeup)
8b886ca7 903{
a6810074
DL
904 struct daemon *dmn = THREAD_ARG(t_wakeup);
905
906 dmn->t_wakeup = NULL;
907 dmn->state = DAEMON_UNRESPONSIVE;
f74ae2bb 908 flog_err(EC_WATCHFRR_CONNECTION,
1c50c1c0
QY
909 "%s state -> unresponsive : no response yet to ping "
910 "sent %ld seconds ago",
911 dmn->name, gs.timeout);
71e7975a
DL
912 SET_WAKEUP_UNRESPONSIVE(dmn);
913 try_restart(dmn);
a6810074 914 return 0;
8b886ca7 915}
916
a6810074 917static int wakeup_send_echo(struct thread *t_wakeup)
8b886ca7 918{
a6810074
DL
919 static const char echocmd[] = "echo " PING_TOKEN;
920 ssize_t rc;
921 struct daemon *dmn = THREAD_ARG(t_wakeup);
922
923 dmn->t_wakeup = NULL;
d62a17ae 924 if (((rc = write(dmn->fd, echocmd, sizeof(echocmd))) < 0)
925 || ((size_t)rc != sizeof(echocmd))) {
a6810074
DL
926 char why[100 + sizeof(echocmd)];
927 snprintf(why, sizeof(why),
928 "write '%s' returned %d instead of %u", echocmd,
d7c0a89a 929 (int)rc, (unsigned int)sizeof(echocmd));
a6810074
DL
930 daemon_down(dmn, why);
931 } else {
932 gettimeofday(&dmn->echo_sent, NULL);
66e78ae6
QY
933 dmn->t_wakeup = NULL;
934 thread_add_timer(master, wakeup_no_answer, dmn, gs.timeout,
935 &dmn->t_wakeup);
a6810074
DL
936 }
937 return 0;
8b886ca7 938}
939
470bc619
QY
940bool check_all_up(void)
941{
942 struct daemon *dmn;
943
944 for (dmn = gs.daemons; dmn; dmn = dmn->next)
945 if (dmn->state != DAEMON_UP)
946 return false;
947 return true;
948}
949
/*
 * Termination signal handler: log the shutdown, tell systemd that we
 * are stopping, and exit with status 0.
 */
static void sigint(void)
{
	zlog_notice("Terminating on signal");
	systemd_send_stopping();

	exit(0);
}
956
/*
 * Check that a command template is usable as a printf format taking a
 * single string argument: it must contain exactly one '%' character,
 * immediately followed by 's'.  Returns 1 if valid, 0 otherwise.
 */
static int valid_command(const char *cmd)
{
	const char *pct = strchr(cmd, '%');

	if (pct == NULL)
		return 0;
	if (pct[1] != 's')
		return 0;

	/* No further '%' may follow the "%s". */
	return strchr(pct + 1, '%') == NULL;
}
964
/*
 * This is an ugly hack to circumvent problems with passing command-line
 * arguments that contain spaces.  The fix is to use a configuration file.
 *
 * Returns a newly allocated copy of cmd in which every occurrence of
 * blankstr has been replaced by a single space.  The caller owns the
 * returned string.  Exits the process if the copy cannot be allocated.
 *
 * An empty blankstr leaves the command unchanged.
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	char *res;
	char *p;
	size_t bslen = strlen(blankstr);

	res = strdup(cmd);
	if (!res) {
		perror("strdup");
		exit(1);
	}

	/*
	 * strstr() matches an empty needle at every position; replacing it
	 * would grow the string past its allocation and never terminate.
	 */
	if (bslen == 0)
		return res;

	p = res;
	while ((p = strstr(p, blankstr)) != NULL) {
		*p = ' ';
		if (bslen != 1)
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
		/*
		 * Resume after the blank just written so that a token which
		 * itself contains a space cannot be matched again forever.
		 */
		p++;
	}
	return res;
}
984
0a7c7856
DL
985static void watchfrr_init(int argc, char **argv)
986{
987 const char *special = "zebra";
988 int i;
989 struct daemon *dmn, **add = &gs.daemons;
990 char alldaemons[512] = "", *p = alldaemons;
991
992 for (i = optind; i < argc; i++) {
993 dmn = XCALLOC(MTYPE_WATCHFRR_DAEMON, sizeof(*dmn));
994
995 dmn->name = dmn->restart.name = argv[i];
996 dmn->state = DAEMON_INIT;
997 gs.numdaemons++;
998 gs.numdown++;
999 dmn->fd = -1;
1000 dmn->t_wakeup = NULL;
c0e5cb52 1001 thread_add_timer_msec(master, wakeup_init, dmn, 0,
0a7c7856
DL
1002 &dmn->t_wakeup);
1003 dmn->restart.interval = gs.min_restart_interval;
1004 *add = dmn;
1005 add = &dmn->next;
1006
1007 if (!strcmp(dmn->name, special))
1008 gs.special = dmn;
1009 }
1010
1011 if (!gs.daemons) {
1012 fprintf(stderr,
1013 "Must specify one or more daemons to monitor.\n\n");
1014 frr_help_exit(1);
1015 }
1016 if (!watch_only && !gs.special) {
1017 fprintf(stderr, "\"%s\" daemon must be in daemon lists\n\n",
1018 special);
1019 frr_help_exit(1);
1020 }
1021
1022 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1023 snprintf(p, alldaemons + sizeof(alldaemons) - p, "%s%s",
1024 (p == alldaemons) ? "" : " ", dmn->name);
1025 p += strlen(p);
1026 }
1027 zlog_notice("%s %s watching [%s]%s", progname, FRR_VERSION, alldaemons,
1028 watch_only ? ", monitor mode" : "");
1029}
1030
a6810074 1031struct zebra_privs_t watchfrr_privs = {
95c4aff2 1032#ifdef VTY_GROUP
a6810074 1033 .vty_group = VTY_GROUP,
95c4aff2
DL
1034#endif
1035};
1036
4f04a76b
DL
1037static struct quagga_signal_t watchfrr_signals[] = {
1038 {
1039 .signal = SIGINT,
1040 .handler = sigint,
1041 },
1042 {
1043 .signal = SIGTERM,
1044 .handler = sigint,
1045 },
1046 {
1047 .signal = SIGCHLD,
1048 .handler = sigchild,
1049 },
1050};
1051
1052FRR_DAEMON_INFO(watchfrr, WATCHFRR,
d62a17ae 1053 .flags = FRR_NO_PRIVSEP | FRR_NO_TCPVTY | FRR_LIMITED_CLI
0a7c7856
DL
1054 | FRR_NO_CFG_PID_DRY | FRR_NO_ZCLIENT
1055 | FRR_DETACH_LATER,
4f04a76b 1056
d62a17ae 1057 .printhelp = printhelp,
1058 .copyright = "Copyright 2004 Andrew J. Schorr",
4f04a76b 1059
d62a17ae 1060 .signals = watchfrr_signals,
1061 .n_signals = array_size(watchfrr_signals),
4f04a76b 1062
d62a17ae 1063 .privs = &watchfrr_privs, )
4f04a76b 1064
999f153e
DL
/*
 * Option letters that are still registered with the option parser only so
 * that main() can reject them with a pointer to the man page.
 */
#define DEPRECATED_OPTIONS "aAezR:"
1066
a6810074 1067int main(int argc, char **argv)
8b886ca7 1068{
a6810074 1069 int opt;
a6810074 1070 const char *blankstr = NULL;
a6810074 1071
4f04a76b
DL
1072 frr_preinit(&watchfrr_di, argc, argv);
1073 progname = watchfrr_di.progname;
1074
999f153e 1075 frr_opt_add("b:dk:l:i:p:r:S:s:t:T:" DEPRECATED_OPTIONS, longopts, "");
a6810074
DL
1076
1077 gs.restart.name = "all";
4f04a76b 1078 while ((opt = frr_getopt(argc, argv, NULL)) != EOF) {
999f153e
DL
1079 if (opt && opt < 128 && strchr(DEPRECATED_OPTIONS, opt)) {
1080 fprintf(stderr,
1081 "The -%c option no longer exists.\n"
1082 "Please refer to the watchfrr(8) man page.\n",
1083 opt);
1084 exit(1);
1085 }
1086
a6810074
DL
1087 switch (opt) {
1088 case 0:
1089 break;
a6810074
DL
1090 case 'b':
1091 blankstr = optarg;
1092 break;
f168b713
DL
1093 case OPTION_DRY:
1094 watch_only = true;
a6810074
DL
1095 break;
1096 case 'k':
1097 if (!valid_command(optarg)) {
1098 fprintf(stderr,
1099 "Invalid kill command, must contain '%%s': %s\n",
1100 optarg);
4f04a76b 1101 frr_help_exit(1);
a6810074
DL
1102 }
1103 gs.stop_command = optarg;
1104 break;
d62a17ae 1105 case 'l': {
1106 char garbage[3];
1107 if ((sscanf(optarg, "%d%1s", &gs.loglevel, garbage)
1108 != 1)
1109 || (gs.loglevel < LOG_EMERG)) {
1110 fprintf(stderr,
1111 "Invalid loglevel argument: %s\n",
1112 optarg);
1113 frr_help_exit(1);
a6810074 1114 }
d62a17ae 1115 } break;
1116 case OPTION_MINRESTART: {
1117 char garbage[3];
1118 if ((sscanf(optarg, "%ld%1s", &gs.min_restart_interval,
1119 garbage)
1120 != 1)
1121 || (gs.min_restart_interval < 0)) {
1122 fprintf(stderr,
1123 "Invalid min_restart_interval argument: %s\n",
1124 optarg);
1125 frr_help_exit(1);
a6810074 1126 }
d62a17ae 1127 } break;
1128 case OPTION_MAXRESTART: {
1129 char garbage[3];
1130 if ((sscanf(optarg, "%ld%1s", &gs.max_restart_interval,
1131 garbage)
1132 != 1)
1133 || (gs.max_restart_interval < 0)) {
1134 fprintf(stderr,
1135 "Invalid max_restart_interval argument: %s\n",
1136 optarg);
1137 frr_help_exit(1);
a6810074 1138 }
d62a17ae 1139 } break;
1140 case 'i': {
1141 char garbage[3];
1142 int period;
1143 if ((sscanf(optarg, "%d%1s", &period, garbage) != 1)
1144 || (gs.period < 1)) {
1145 fprintf(stderr,
1146 "Invalid interval argument: %s\n",
1147 optarg);
1148 frr_help_exit(1);
a6810074 1149 }
d62a17ae 1150 gs.period = 1000 * period;
1151 } break;
a6810074 1152 case 'p':
0a7c7856 1153 watchfrr_di.pid_file = optarg;
a6810074
DL
1154 break;
1155 case 'r':
a6810074
DL
1156 if (!valid_command(optarg)) {
1157 fprintf(stderr,
1158 "Invalid restart command, must contain '%%s': %s\n",
1159 optarg);
4f04a76b 1160 frr_help_exit(1);
a6810074
DL
1161 }
1162 gs.restart_command = optarg;
a6810074
DL
1163 break;
1164 case 's':
1165 if (!valid_command(optarg)) {
1166 fprintf(stderr,
1167 "Invalid start command, must contain '%%s': %s\n",
1168 optarg);
4f04a76b 1169 frr_help_exit(1);
a6810074
DL
1170 }
1171 gs.start_command = optarg;
1172 break;
1173 case 'S':
1174 gs.vtydir = optarg;
1175 break;
d62a17ae 1176 case 't': {
1177 char garbage[3];
1178 if ((sscanf(optarg, "%ld%1s", &gs.timeout, garbage)
1179 != 1)
1180 || (gs.timeout < 1)) {
1181 fprintf(stderr,
1182 "Invalid timeout argument: %s\n",
1183 optarg);
1184 frr_help_exit(1);
a6810074 1185 }
d62a17ae 1186 } break;
1187 case 'T': {
1188 char garbage[3];
1189 if ((sscanf(optarg, "%ld%1s", &gs.restart_timeout,
1190 garbage)
1191 != 1)
1192 || (gs.restart_timeout < 1)) {
1193 fprintf(stderr,
1194 "Invalid restart timeout argument: %s\n",
1195 optarg);
1196 frr_help_exit(1);
a6810074 1197 }
d62a17ae 1198 } break;
a6810074
DL
1199 default:
1200 fputs("Invalid option.\n", stderr);
4f04a76b 1201 frr_help_exit(1);
a6810074 1202 }
8b886ca7 1203 }
a6810074 1204
71e7975a
DL
1205 if (watch_only
1206 && (gs.start_command || gs.stop_command || gs.restart_command)) {
d87ae5cc 1207 fputs("Options -r/-s/-k are not used when --dry is active.\n",
a6810074 1208 stderr);
8b886ca7 1209 }
f168b713
DL
1210 if (!watch_only
1211 && (!gs.restart_command || !gs.start_command || !gs.stop_command)) {
1212 fprintf(stderr,
1213 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1214 frr_help_exit(1);
8b886ca7 1215 }
8b886ca7 1216
a6810074
DL
1217 if (blankstr) {
1218 if (gs.restart_command)
1219 gs.restart_command =
d62a17ae 1220 translate_blanks(gs.restart_command, blankstr);
a6810074
DL
1221 if (gs.start_command)
1222 gs.start_command =
d62a17ae 1223 translate_blanks(gs.start_command, blankstr);
a6810074
DL
1224 if (gs.stop_command)
1225 gs.stop_command =
d62a17ae 1226 translate_blanks(gs.stop_command, blankstr);
065de903 1227 }
8b886ca7 1228
a6810074 1229 gs.restart.interval = gs.min_restart_interval;
8b886ca7 1230
4f04a76b 1231 master = frr_init();
b647dc2a 1232 watchfrr_error_init();
0a7c7856
DL
1233 watchfrr_init(argc, argv);
1234 watchfrr_vty_init();
1235
1236 frr_config_fork();
4f04a76b 1237
dd8376fe 1238 zlog_set_level(ZLOG_DEST_MONITOR, ZLOG_DISABLED);
0a7c7856 1239 if (watchfrr_di.daemon_mode)
dd8376fe 1240 zlog_set_level(ZLOG_DEST_SYSLOG, MIN(gs.loglevel, LOG_DEBUG));
0a7c7856 1241 else
dd8376fe 1242 zlog_set_level(ZLOG_DEST_STDOUT, MIN(gs.loglevel, LOG_DEBUG));
8b886ca7 1243
0a7c7856 1244 frr_run(master);
8b886ca7 1245
a6810074
DL
1246 systemd_send_stopping();
1247 /* Not reached. */
1248 return 0;
8b886ca7 1249}