]> git.proxmox.com Git - mirror_frr.git/blame - watchfrr/watchfrr.c
Merge pull request #4458 from karamalla0406/frr4123
[mirror_frr.git] / watchfrr / watchfrr.c
CommitLineData
8b886ca7 1/*
896014f4
DL
2 * Monitor status of frr daemons and restart if necessary.
3 *
4 * Copyright (C) 2004 Andrew J. Schorr
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
8b886ca7 19 */
20
a365534f 21#include <zebra.h>
8b886ca7 22#include <thread.h>
23#include <log.h>
52e66296 24#include <network.h>
8b886ca7 25#include <sigevent.h>
a365534f 26#include <lib/version.h>
95c4aff2 27#include "command.h"
87f44e2f 28#include "memory_vty.h"
4f04a76b 29#include "libfrr.h"
b647dc2a 30#include "lib_errors.h"
95c4aff2 31
6f594023 32#include <getopt.h>
a365534f 33#include <sys/un.h>
34#include <sys/wait.h>
837d16cc 35#include <memory.h>
651415bd 36#include <systemd.h>
8b886ca7 37
9473e340 38#include "watchfrr.h"
b647dc2a 39#include "watchfrr_errors.h"
95c4aff2 40
/* Smaller of two values. Each argument is evaluated twice. */
#ifndef MIN
#define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
#endif

/* Macros to help randomize timers. */
/* JITTER(X): pseudo-random offset in the range [-(X)/2, (X) - (X)/2]. */
#define JITTER(X) ((random() % ((X)+1))-((X)/2))
/* FUZZY(X): X shifted by a JITTER of one twentieth of X. */
#define FUZZY(X) ((X)+JITTER((X)/20))

/*
 * Built-in defaults. The help text in printhelp() documents the period,
 * the timeouts and the restart intervals as being expressed in seconds.
 */
#define DEFAULT_PERIOD 5
#define DEFAULT_TIMEOUT 90
#define DEFAULT_RESTART_TIMEOUT 20
#define DEFAULT_LOGLEVEL LOG_INFO
#define DEFAULT_MIN_RESTART 60
#define DEFAULT_MAX_RESTART 600

/*
 * Default shell command templates. run_job() uses the selected template
 * as a printf-style format, substituting the daemon name for "%s".
 */
#define DEFAULT_RESTART_CMD WATCHFRR_SH_PATH " restart %s"
#define DEFAULT_START_CMD WATCHFRR_SH_PATH " start %s"
#define DEFAULT_STOP_CMD WATCHFRR_SH_PATH " stop %s"

/* Token sent with the "echo" command and expected back from the daemon. */
#define PING_TOKEN "PING"
61
0a7c7856
DL
/*
 * Memory group and memory type for this program.
 * NOTE(review): the type is presumably used when allocating struct daemon
 * entries; the allocation site is not in this part of the file.
 */
DEFINE_MGROUP(WATCHFRR, "watchfrr")
DEFINE_MTYPE_STATIC(WATCHFRR, WATCHFRR_DAEMON, "watchfrr daemon entry")

/* Needs to be global, referenced somewhere inside libfrr. */
struct thread_master *master;

/* When true, try_restart() returns without running any job. */
static bool watch_only = false;
8b886ca7 69
a6810074
DL
/*
 * Steps of the global state machine driven by phase_check().
 *
 * PHASE_NONE means no phased restart is in progress; PHASE_INIT is the
 * initial value of gs.phase and lasts until no daemon is left in
 * DAEMON_INIT. The remaining values are the successive steps of a phased
 * global restart.
 */
typedef enum {
	PHASE_NONE = 0,
	PHASE_INIT,
	PHASE_STOPS_PENDING,
	PHASE_WAITING_DOWN,
	PHASE_ZEBRA_RESTART_PENDING,
	PHASE_WAITING_ZEBRA_UP
} restart_phase_t;

/*
 * Printable phase names, indexed by restart_phase_t. There must be exactly
 * one entry per enumerator, in the same order.
 */
static const char *phase_str[] = {
	"Idle",
	"Startup",
	"Stop jobs running",
	"Waiting for other daemons to come down",
	"Zebra restart job running",
	"Waiting for zebra to come up",
};
88
/*
 * Number of seconds a restart phase may last before phase_hanging()
 * aborts the phased restart.
 */
#define PHASE_TIMEOUT (3*gs.restart_timeout)
/*
 * Startup deadline. Parenthesized so the product stays a single operand
 * when the macro is used inside a larger expression.
 * NOTE(review): presumably milliseconds; no user is visible in this part
 * of the file.
 */
#define STARTUP_TIMEOUT (55 * 1000)
8b886ca7 91
a6810074
DL
/* Bookkeeping for one background shell job (start, stop or restart). */
struct restart_info {
	/* Name substituted into the command template and used in logs. */
	const char *name;
	/* Job type label; run_job() stores its cmdtype argument here. */
	const char *what;
	/* Pid of the running job, 0 when none is running. */
	pid_t pid;
	/* When the job was last started, or when it completed. */
	struct timeval time;
	/* Minimum number of seconds between jobs; adjusted by run_job(). */
	long interval;
	/* Timer that runs restart_kill() while the job is running. */
	struct thread *t_kill;
	/* Signals sent so far: restart_kill() sends SIGTERM when this is 0,
	 * SIGKILL otherwise. */
	int kills;
};
101
/* Program-wide configuration and run-time state. */
static struct global_state {
	/* Current step of the global state machine, see phase_check(). */
	restart_phase_t phase;
	/* Timer armed by set_phase(); runs phase_hanging() on expiry. */
	struct thread *t_phase_hanging;
	struct thread *t_startup_timeout;
	/* Directory in which try_connect() looks for "<name>.vty". */
	const char *vtydir;
	/* Base delay of the FUZZY() wakeup timers, in milliseconds. */
	long period;
	/* Seconds to wait for a connect or an echo reply. */
	long timeout;
	/* Seconds between the signals restart_kill() sends to a job. */
	long restart_timeout;
	/* Bounds, in seconds, of restart_info.interval (see run_job()). */
	long min_restart_interval;
	long max_restart_interval;
	/* Head of the singly linked list of monitored daemons. */
	struct daemon *daemons;
	/* Command templates; "%s" is replaced by the daemon name. */
	const char *restart_command;
	const char *start_command;
	const char *stop_command;
	/* Job state of the global restart started by try_restart(). */
	struct restart_info restart;
	int loglevel;
	struct daemon *special; /* points to zebra when doing phased restart */
	int numdaemons;
	/* Number of background jobs currently running. */
	int numpids;
	int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
} gs = {
	.phase = PHASE_INIT,
	.vtydir = frr_vtydir,
	/* DEFAULT_PERIOD is in seconds, gs.period in milliseconds. */
	.period = 1000 * DEFAULT_PERIOD,
	.timeout = DEFAULT_TIMEOUT,
	.restart_timeout = DEFAULT_RESTART_TIMEOUT,
	.loglevel = DEFAULT_LOGLEVEL,
	.min_restart_interval = DEFAULT_MIN_RESTART,
	.max_restart_interval = DEFAULT_MAX_RESTART,
	.restart_command = DEFAULT_RESTART_CMD,
	.start_command = DEFAULT_START_CMD,
	.stop_command = DEFAULT_STOP_CMD,
};
a6810074
DL
135
/* Connection state of a monitored daemon. */
typedef enum {
	DAEMON_INIT,
	DAEMON_DOWN,
	DAEMON_CONNECTING,
	DAEMON_UP,
	DAEMON_UNRESPONSIVE
} daemon_state_t;

/* True when the daemon is connected, whether or not it answers pings. */
#define IS_UP(DMN) \
	(((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))

/* Printable state names, indexed by daemon_state_t. */
static const char *state_str[] = {
	"Init", "Down", "Connecting", "Up", "Unresponsive",
};
150
/* One monitored daemon; entries are chained through 'next'. */
struct daemon {
	/* Daemon name; also the base name of its "<name>.vty" socket. */
	const char *name;
	daemon_state_t state;
	/* Socket connected to the daemon; daemon_down() resets it to -1. */
	int fd;
	/* When the outstanding echo was sent; tv_sec is 0 when no reply is
	 * expected. */
	struct timeval echo_sent;
	/* Connection attempts since the daemon was last up. */
	unsigned int connect_tries;
	/* Pending timer: reconnect, echo, no-answer or unresponsive. */
	struct thread *t_wakeup;
	/* Read handler on fd (handle_read). */
	struct thread *t_read;
	/* Write handler on fd while a connect is in progress. */
	struct thread *t_write;
	struct daemon *next;
	/* Job state of this daemon's own start/restart commands. */
	struct restart_info restart;
};
163
9272302b
DL
/* Codes stored in longopts[] for options that have no single-letter form. */
#define OPTION_MINRESTART 2000
#define OPTION_MAXRESTART 2001
#define OPTION_DRY 2002

/*
 * Long command-line options. The last field of each entry is the code
 * associated with the option; the table ends with an all-zero entry.
 */
static const struct option longopts[] = {
	{"daemon", no_argument, NULL, 'd'},
	{"statedir", required_argument, NULL, 'S'},
	{"loglevel", required_argument, NULL, 'l'},
	{"interval", required_argument, NULL, 'i'},
	{"timeout", required_argument, NULL, 't'},
	{"restart-timeout", required_argument, NULL, 'T'},
	{"restart", required_argument, NULL, 'r'},
	{"start-command", required_argument, NULL, 's'},
	{"kill-command", required_argument, NULL, 'k'},
	{"dry", no_argument, NULL, OPTION_DRY},
	{"min-restart-interval", required_argument, NULL, OPTION_MINRESTART},
	{"max-restart-interval", required_argument, NULL, OPTION_MAXRESTART},
	{"pid-file", required_argument, NULL, 'p'},
	{"blank-string", required_argument, NULL, 'b'},
	{"help", no_argument, NULL, 'h'},
	{"version", no_argument, NULL, 'v'},
	{NULL, 0, NULL, 0}};
8b886ca7 186
/* Forward declarations of functions that are used before they are defined. */
static int try_connect(struct daemon *dmn);
static int wakeup_send_echo(struct thread *t_wakeup);
static void try_restart(struct daemon *dmn);
static void phase_check(void);
static void restart_done(struct daemon *dmn);
8b886ca7 192
4f04a76b
DL
193static const char *progname;
194static void printhelp(FILE *target)
8b886ca7 195{
d62a17ae 196 fprintf(target,
197 "Usage : %s [OPTION...] <daemon name> ...\n\n\
9473e340 198Watchdog program to monitor status of frr daemons and try to restart\n\
8b886ca7 199them if they are down or unresponsive. It determines whether a daemon is\n\
200up based on whether it can connect to the daemon's vty unix stream socket.\n\
201It then repeatedly sends echo commands over that socket to determine whether\n\
202the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
203on the socket connection and know immediately that the daemon is down.\n\n\
204The daemons to be monitored should be listed on the command line.\n\n\
8b886ca7 205In order to avoid attempting to restart the daemons in a fast loop,\n\
206the -m and -M options allow you to control the minimum delay between\n\
207restart commands. The minimum restart delay is recalculated each time\n\
208a restart is attempted: if the time since the last restart attempt exceeds\n\
209twice the -M value, then the restart delay is set to the -m value.\n\
d62a17ae 210Otherwise, the interval is doubled (but capped at the -M value).\n\n",
f168b713 211 progname);
e757c940 212
d62a17ae 213 fprintf(target,
214 "Options:\n\
8b886ca7 215-d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
216 to syslog instead of stdout.\n\
217-S, --statedir Set the vty socket directory (default is %s)\n\
8b886ca7 218-l, --loglevel Set the logging level (default is %d).\n\
219 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
220 but it can be set higher than %d if extra-verbose debugging\n\
221 messages are desired.\n\
9272302b 222 --min-restart-interval\n\
8b886ca7 223 Set the minimum seconds to wait between invocations of daemon\n\
224 restart commands (default is %d).\n\
9272302b 225 --max-restart-interval\n\
8b886ca7 226 Set the maximum seconds to wait between invocations of daemon\n\
227 restart commands (default is %d).\n\
228-i, --interval Set the status polling interval in seconds (default is %d)\n\
229-t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
230-T, --restart-timeout\n\
231 Set the restart (kill) timeout in seconds (default is %d).\n\
232 If any background jobs are still running after this much\n\
233 time has elapsed, they will be killed.\n\
234-r, --restart Supply a Bourne shell command to use to restart a single\n\
235 daemon. The command string should include '%%s' where the\n\
236 name of the daemon should be substituted.\n\
3ec95567 237 (default: '%s')\n\
8b886ca7 238-s, --start-command\n\
239 Supply a Bourne shell to command to use to start a single\n\
240 daemon. The command string should include '%%s' where the\n\
241 name of the daemon should be substituted.\n\
3ec95567 242 (default: '%s')\n\
8b886ca7 243-k, --kill-command\n\
244 Supply a Bourne shell to command to use to stop a single\n\
245 daemon. The command string should include '%%s' where the\n\
246 name of the daemon should be substituted.\n\
3ec95567 247 (default: '%s')\n\
f168b713 248 --dry Do not start or restart anything, just log.\n\
8b886ca7 249-p, --pid-file Set process identifier file name\n\
0a7c7856 250 (default is %s/watchfrr.pid).\n\
c8b40f86 251-b, --blank-string\n\
252 When the supplied argument string is found in any of the\n\
f168b713 253 various shell command arguments (-r, -s, or -k), replace\n\
c8b40f86 254 it with a space. This is an ugly hack to circumvent problems\n\
255 passing command-line arguments with embedded spaces.\n\
8b886ca7 256-v, --version Print program version\n\
d62a17ae 257-h, --help Display this help and exit\n",
64a249ad 258 frr_vtydir, DEFAULT_LOGLEVEL, LOG_EMERG, LOG_DEBUG, LOG_DEBUG,
d62a17ae 259 DEFAULT_MIN_RESTART, DEFAULT_MAX_RESTART, DEFAULT_PERIOD,
3ec95567
DL
260 DEFAULT_TIMEOUT, DEFAULT_RESTART_TIMEOUT,
261 DEFAULT_RESTART_CMD, DEFAULT_START_CMD, DEFAULT_STOP_CMD,
262 frr_vtydir);
8b886ca7 263}
264
a6810074 265static pid_t run_background(char *shell_cmd)
8b886ca7 266{
a6810074
DL
267 pid_t child;
268
269 switch (child = fork()) {
270 case -1:
450971aa 271 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
272 "fork failed, cannot run command [%s]: %s",
273 shell_cmd, safe_strerror(errno));
a6810074
DL
274 return -1;
275 case 0:
276 /* Child process. */
d62a17ae 277 /* Use separate process group so child processes can be killed
278 * easily. */
a6810074
DL
279 if (setpgid(0, 0) < 0)
280 zlog_warn("warning: setpgid(0,0) failed: %s",
281 safe_strerror(errno));
282 {
283 char shell[] = "sh";
284 char dashc[] = "-c";
d62a17ae 285 char *const argv[4] = {shell, dashc, shell_cmd, NULL};
a6810074 286 execv("/bin/sh", argv);
450971aa 287 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
288 "execv(/bin/sh -c '%s') failed: %s",
289 shell_cmd, safe_strerror(errno));
a6810074
DL
290 _exit(127);
291 }
292 default:
293 /* Parent process: we will reap the child later. */
450971aa 294 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
295 "Forked background command [pid %d]: %s",
296 (int)child, shell_cmd);
a6810074
DL
297 return child;
298 }
8b886ca7 299}
300
a6810074
DL
/*
 * Store in *result the time elapsed between *start_time and now.
 *
 * The microsecond field of the result is brought back to a non-negative
 * value by borrowing whole seconds. Returns 'result' so the call can be
 * used inside an expression.
 */
static struct timeval *time_elapsed(struct timeval *result,
				    const struct timeval *start_time)
{
	const long usec_per_sec = 1000000L;

	gettimeofday(result, NULL);
	result->tv_usec -= start_time->tv_usec;
	result->tv_sec -= start_time->tv_sec;

	for (; result->tv_usec < 0; result->tv_sec--)
		result->tv_usec += usec_per_sec;

	return result;
}
313
a6810074 314static int restart_kill(struct thread *t_kill)
8b886ca7 315{
a6810074
DL
316 struct restart_info *restart = THREAD_ARG(t_kill);
317 struct timeval delay;
318
319 time_elapsed(&delay, &restart->time);
d62a17ae 320 zlog_warn(
321 "Warning: %s %s child process %d still running after "
322 "%ld seconds, sending signal %d",
323 restart->what, restart->name, (int)restart->pid,
324 (long)delay.tv_sec, (restart->kills ? SIGKILL : SIGTERM));
a6810074
DL
325 kill(-restart->pid, (restart->kills ? SIGKILL : SIGTERM));
326 restart->kills++;
66e78ae6
QY
327 restart->t_kill = NULL;
328 thread_add_timer(master, restart_kill, restart, gs.restart_timeout,
329 &restart->t_kill);
a6810074 330 return 0;
8b886ca7 331}
332
a6810074 333static struct restart_info *find_child(pid_t child)
8b886ca7 334{
f168b713 335 struct daemon *dmn;
7c265f7d
CF
336 if (gs.restart.pid == child)
337 return &gs.restart;
338
f168b713
DL
339 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
340 if (dmn->restart.pid == child)
341 return &dmn->restart;
a6810074
DL
342 }
343 return NULL;
8b886ca7 344}
345
/*
 * Reap one terminated child process and update the job it belonged to.
 *
 * A single non-blocking waitpid() is made, so at most one child is
 * collected per call. When the child is a known job that exited with
 * status 0, reconnection is attempted through restart_done(): for that
 * daemon alone, or for every daemon if it was the global restart job.
 * phase_check() runs afterwards for every reaped child except the
 * integrated-config writer.
 *
 * NOTE(review): presumably installed as the SIGCHLD handler; the
 * registration is not in this part of the file.
 */
static void sigchild(void)
{
	pid_t child;
	int status;
	const char *name;
	const char *what;
	struct restart_info *restart;
	struct daemon *dmn;

	switch (child = waitpid(-1, &status, WNOHANG)) {
	case -1:
		flog_err_sys(EC_LIB_SYSTEM_CALL, "waitpid failed: %s",
			     safe_strerror(errno));
		return;
	case 0:
		zlog_warn("SIGCHLD received, but waitpid did not reap a child");
		return;
	}

	/* This child is handled by integrated_write_sigchld(), not here. */
	if (child == integrated_write_pid) {
		integrated_write_sigchld(status);
		return;
	}

	if ((restart = find_child(child)) != NULL) {
		name = restart->name;
		what = restart->what;
		/* The job is over: release its slot and stop the kill timer. */
		restart->pid = 0;
		gs.numpids--;
		thread_cancel(restart->t_kill);
		restart->t_kill = NULL;
		/* Update restart time to reflect the time the command
		 * completed. */
		gettimeofday(&restart->time, NULL);
	} else {
		flog_err_sys(
			EC_LIB_SYSTEM_CALL,
			"waitpid returned status for an unknown child process %d",
			(int)child);
		name = "(unknown)";
		what = "background";
	}
	if (WIFSTOPPED(status))
		zlog_warn("warning: %s %s process %d is stopped", what, name,
			  (int)child);
	else if (WIFSIGNALED(status))
		zlog_warn("%s %s process %d terminated due to signal %d", what,
			  name, (int)child, WTERMSIG(status));
	else if (WIFEXITED(status)) {
		if (WEXITSTATUS(status) != 0)
			zlog_warn(
				"%s %s process %d exited with non-zero status %d",
				what, name, (int)child, WEXITSTATUS(status));
		else {
			zlog_debug("%s %s process %d exited normally", what,
				   name, (int)child);

			/* A per-daemon job is embedded in its struct daemon;
			 * the global job concerns every daemon. */
			if (restart && restart != &gs.restart) {
				dmn = container_of(restart, struct daemon,
						   restart);
				restart_done(dmn);
			} else if (restart)
				for (dmn = gs.daemons; dmn; dmn = dmn->next)
					restart_done(dmn);
		}
	} else
		flog_err_sys(
			EC_LIB_SYSTEM_CALL,
			"cannot interpret %s %s process %d wait status 0x%x",
			what, name, (int)child, status);
	phase_check();
}
418
/*
 * Start a background shell job for 'restart' unless one is already
 * running or the retry interval has not elapsed yet.
 *
 * restart          job bookkeeping to use and update
 * cmdtype          label used in logs and stored in restart->what
 * command          printf-style template; restart->name is substituted
 * force            non-zero skips the retry-interval check
 * update_interval  non-zero recomputes restart->interval afterwards
 *
 * Returns -1 when the job was not attempted, otherwise restart->pid: the
 * pid of the new child, or 0 when run_background() failed.
 */
static int run_job(struct restart_info *restart, const char *cmdtype,
		   const char *command, int force, int update_interval)
{
	struct timeval delay;

	if (gs.loglevel > LOG_DEBUG + 1)
		zlog_debug("attempting to %s %s", cmdtype, restart->name);

	if (restart->pid) {
		if (gs.loglevel > LOG_DEBUG + 1)
			zlog_debug(
				"cannot %s %s, previous pid %d still running",
				cmdtype, restart->name, (int)restart->pid);
		return -1;
	}

	/* Note: time_elapsed test must come before the force test, since we
	   need
	   to make sure that delay is initialized for use below in updating the
	   restart interval. */
	if ((time_elapsed(&delay, &restart->time)->tv_sec < restart->interval)
	    && !force) {
		if (gs.loglevel > LOG_DEBUG + 1)
			zlog_debug(
				"postponing %s %s: "
				"elapsed time %ld < retry interval %ld",
				cmdtype, restart->name, (long)delay.tv_sec,
				restart->interval);
		return -1;
	}

	gettimeofday(&restart->time, NULL);
	restart->kills = 0;
	{
		/* The expanded command cannot be longer than the template
		 * plus the name as long as the template contains a single
		 * "%s" conversion. */
		char cmd[strlen(command) + strlen(restart->name) + 1];
		snprintf(cmd, sizeof(cmd), command, restart->name);
		if ((restart->pid = run_background(cmd)) > 0) {
			/* Arm the timer that signals a job running too long. */
			restart->t_kill = NULL;
			thread_add_timer(master, restart_kill, restart,
					 gs.restart_timeout, &restart->t_kill);
			restart->what = cmdtype;
			gs.numpids++;
		} else
			restart->pid = 0;
	}

	/* Calculate the new restart interval. */
	if (update_interval) {
		/* Quiet for long enough: start again from the minimum.
		 * Otherwise double the interval, capped at the maximum. */
		if (delay.tv_sec > 2 * gs.max_restart_interval)
			restart->interval = gs.min_restart_interval;
		else if ((restart->interval *= 2) > gs.max_restart_interval)
			restart->interval = gs.max_restart_interval;
		if (gs.loglevel > LOG_DEBUG + 1)
			zlog_debug("restart %s interval is now %ld",
				   restart->name, restart->interval);
	}
	return restart->pid;
}
477
/*
 * Scheduling helpers. Each one clears the daemon's thread reference and
 * then registers a new event for it.
 *
 * The definitions deliberately end without a semicolon: the caller writes
 * it, so that an invocation is exactly one statement and can be followed
 * by "else".
 */

/* Run handle_read() when the daemon's socket becomes readable. */
#define SET_READ_HANDLER(DMN)                                                  \
	do {                                                                   \
		(DMN)->t_read = NULL;                                          \
		thread_add_read(master, handle_read, (DMN), (DMN)->fd,         \
				&(DMN)->t_read);                               \
	} while (0)

/* Run wakeup_down() after a randomized polling period. */
#define SET_WAKEUP_DOWN(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_down, (DMN),              \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Run wakeup_unresponsive() after a randomized polling period. */
#define SET_WAKEUP_UNRESPONSIVE(DMN)                                           \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_unresponsive, (DMN),      \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Run wakeup_send_echo() after a randomized polling period. */
#define SET_WAKEUP_ECHO(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_send_echo, (DMN),         \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)
8b886ca7 505
a6810074 506static int wakeup_down(struct thread *t_wakeup)
8b886ca7 507{
a6810074
DL
508 struct daemon *dmn = THREAD_ARG(t_wakeup);
509
510 dmn->t_wakeup = NULL;
511 if (try_connect(dmn) < 0)
512 SET_WAKEUP_DOWN(dmn);
513 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
514 try_restart(dmn);
515 return 0;
8b886ca7 516}
517
a6810074 518static int wakeup_init(struct thread *t_wakeup)
8b886ca7 519{
a6810074
DL
520 struct daemon *dmn = THREAD_ARG(t_wakeup);
521
522 dmn->t_wakeup = NULL;
523 if (try_connect(dmn) < 0) {
f74ae2bb 524 flog_err(EC_WATCHFRR_CONNECTION,
1c50c1c0
QY
525 "%s state -> down : initial connection attempt failed",
526 dmn->name);
a6810074
DL
527 dmn->state = DAEMON_DOWN;
528 }
c0e5cb52 529 phase_check();
a6810074 530 return 0;
8b886ca7 531}
532
75f8b0e4
DL
533static void restart_done(struct daemon *dmn)
534{
535 if (dmn->state != DAEMON_DOWN) {
536 zlog_warn("wtf?");
537 return;
538 }
539 if (dmn->t_wakeup)
540 THREAD_OFF(dmn->t_wakeup);
541 if (try_connect(dmn) < 0)
542 SET_WAKEUP_DOWN(dmn);
543}
544
a6810074 545static void daemon_down(struct daemon *dmn, const char *why)
8b886ca7 546{
a6810074 547 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
1c50c1c0
QY
548 flog_err(EC_WATCHFRR_CONNECTION, "%s state -> down : %s",
549 dmn->name, why);
a6810074
DL
550 else if (gs.loglevel > LOG_DEBUG)
551 zlog_debug("%s still down : %s", dmn->name, why);
552 if (IS_UP(dmn))
553 gs.numdown++;
554 dmn->state = DAEMON_DOWN;
555 if (dmn->fd >= 0) {
556 close(dmn->fd);
557 dmn->fd = -1;
558 }
559 THREAD_OFF(dmn->t_read);
560 THREAD_OFF(dmn->t_write);
561 THREAD_OFF(dmn->t_wakeup);
562 if (try_connect(dmn) < 0)
563 SET_WAKEUP_DOWN(dmn);
564 phase_check();
8b886ca7 565}
566
/*
 * Read callback for a daemon socket.
 *
 * Anything other than the expected echo reply takes the daemon down:
 * a read error, EOF, data arriving while no echo is outstanding, or a
 * reply that differs from 'resp'. A valid reply clears the outstanding
 * echo, may bring an unresponsive daemon back to DAEMON_UP, and
 * reschedules both the read handler and the next echo. Always returns 0.
 */
static int handle_read(struct thread *t_read)
{
	struct daemon *dmn = THREAD_ARG(t_read);
	/* Expected reply: the token and a newline; the rest of the array is
	 * zero-filled and is part of the comparison. */
	static const char resp[sizeof(PING_TOKEN) + 4] = PING_TOKEN "\n";
	char buf[sizeof(resp) + 100];
	ssize_t rc;
	struct timeval delay;

	dmn->t_read = NULL;
	if ((rc = read(dmn->fd, buf, sizeof(buf))) < 0) {
		char why[100];

		if (ERRNO_IO_RETRY(errno)) {
			/* Pretend it never happened. */
			SET_READ_HANDLER(dmn);
			return 0;
		}
		snprintf(why, sizeof(why), "unexpected read error: %s",
			 safe_strerror(errno));
		daemon_down(dmn, why);
		return 0;
	}
	if (rc == 0) {
		daemon_down(dmn, "read returned EOF");
		return 0;
	}
	/* No echo is outstanding, so no data was expected. */
	if (!dmn->echo_sent.tv_sec) {
		char why[sizeof(buf) + 100];
		/* buf is not NUL-terminated; the precision bounds the print. */
		snprintf(why, sizeof(why),
			 "unexpected read returns %d bytes: %.*s", (int)rc,
			 (int)rc, buf);
		daemon_down(dmn, why);
		return 0;
	}

	/* We are expecting an echo response: is there any chance that the
	   response would not be returned entirely in the first read?  That
	   seems inconceivable... */
	if ((rc != sizeof(resp)) || memcmp(buf, resp, sizeof(resp))) {
		char why[100 + sizeof(buf)];
		snprintf(why, sizeof(why),
			 "read returned bad echo response of %d bytes "
			 "(expecting %u): %.*s",
			 (int)rc, (unsigned int)sizeof(resp), (int)rc, buf);
		daemon_down(dmn, why);
		return 0;
	}

	time_elapsed(&delay, &dmn->echo_sent);
	/* Mark the echo as answered. */
	dmn->echo_sent.tv_sec = 0;
	if (dmn->state == DAEMON_UNRESPONSIVE) {
		/* A late reply only restores DAEMON_UP when it arrived
		 * within the timeout. */
		if (delay.tv_sec < gs.timeout) {
			dmn->state = DAEMON_UP;
			zlog_warn(
				"%s state -> up : echo response received after %ld.%06ld "
				"seconds",
				dmn->name, (long)delay.tv_sec,
				(long)delay.tv_usec);
		} else
			zlog_warn(
				"%s: slow echo response finally received after %ld.%06ld "
				"seconds",
				dmn->name, (long)delay.tv_sec,
				(long)delay.tv_usec);
	} else if (gs.loglevel > LOG_DEBUG + 1)
		zlog_debug("%s: echo response received after %ld.%06ld seconds",
			   dmn->name, (long)delay.tv_sec, (long)delay.tv_usec);

	SET_READ_HANDLER(dmn);
	/* Replace the pending timer with the timer for the next echo. */
	if (dmn->t_wakeup)
		thread_cancel(dmn->t_wakeup);
	SET_WAKEUP_ECHO(dmn);

	return 0;
}
642
207e0d7a
DS
643/*
644 * Wait till we notice that all daemons are ready before
645 * we send we are ready to systemd
646 */
5c9d1c83 647static void daemon_send_ready(int exitcode)
207e0d7a 648{
5c9d1c83 649 FILE *fp;
a6810074 650 static int sent = 0;
43e587c1 651 char started[1024];
207e0d7a 652
5c9d1c83
DL
653 if (sent)
654 return;
655
656 if (exitcode == 0)
0a7c7856 657 zlog_notice("all daemons up, doing startup-complete notify");
5c9d1c83
DL
658 else if (gs.numdown < gs.numdaemons)
659 flog_err(EC_WATCHFRR_CONNECTION,
660 "startup did not complete within timeout"
661 " (%d/%d daemons running)",
662 gs.numdaemons - gs.numdown, gs.numdaemons);
663 else {
664 flog_err(EC_WATCHFRR_CONNECTION,
665 "all configured daemons failed to start"
666 " -- exiting watchfrr");
667 exit(exitcode);
668
669 }
0a7c7856 670
5c9d1c83
DL
671 frr_detach();
672
3c649c71
DS
673 snprintf(started, sizeof(started), "%s%s", frr_vtydir,
674 "watchfrr.started");
675 fp = fopen(started, "w");
5c9d1c83
DL
676 if (fp)
677 fclose(fp);
60bd2534 678#if defined HAVE_SYSTEMD
5c9d1c83 679 systemd_send_started(master, 0);
60bd2534 680#endif
5c9d1c83 681 sent = 1;
207e0d7a
DS
682}
683
a6810074 684static void daemon_up(struct daemon *dmn, const char *why)
8b886ca7 685{
a6810074
DL
686 dmn->state = DAEMON_UP;
687 gs.numdown--;
688 dmn->connect_tries = 0;
689 zlog_notice("%s state -> up : %s", dmn->name, why);
5c9d1c83
DL
690 if (gs.numdown == 0)
691 daemon_send_ready(0);
a8cbb8b3 692 SET_WAKEUP_ECHO(dmn);
a6810074 693 phase_check();
8b886ca7 694}
695
a6810074 696static int check_connect(struct thread *t_write)
8b886ca7 697{
a6810074
DL
698 struct daemon *dmn = THREAD_ARG(t_write);
699 int sockerr;
700 socklen_t reslen = sizeof(sockerr);
701
702 dmn->t_write = NULL;
703 if (getsockopt(dmn->fd, SOL_SOCKET, SO_ERROR, (char *)&sockerr, &reslen)
704 < 0) {
705 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn->name,
706 safe_strerror(errno));
707 daemon_down(dmn,
708 "getsockopt failed checking connection success");
709 return 0;
710 }
711 if ((reslen == sizeof(sockerr)) && sockerr) {
712 char why[100];
d62a17ae 713 snprintf(
714 why, sizeof(why),
715 "getsockopt reports that connection attempt failed: %s",
716 safe_strerror(sockerr));
a6810074
DL
717 daemon_down(dmn, why);
718 return 0;
719 }
720
721 daemon_up(dmn, "delayed connect succeeded");
722 return 0;
8b886ca7 723}
724
a6810074 725static int wakeup_connect_hanging(struct thread *t_wakeup)
8b886ca7 726{
a6810074
DL
727 struct daemon *dmn = THREAD_ARG(t_wakeup);
728 char why[100];
729
730 dmn->t_wakeup = NULL;
731 snprintf(why, sizeof(why),
732 "connection attempt timed out after %ld seconds", gs.timeout);
733 daemon_down(dmn, why);
734 return 0;
8b886ca7 735}
736
/*
 * Making connection to protocol daemon.
 *
 * Connects a non-blocking unix stream socket to "<vtydir>/<name>.vty".
 *
 * Returns 1 when the connection is established at once (the daemon is
 * then up), 0 when the connect is still in progress (the daemon is in
 * DAEMON_CONNECTING and completion is checked by check_connect()), and
 * -1 on failure. Every call increments dmn->connect_tries.
 */
static int try_connect(struct daemon *dmn)
{
	int sock;
	struct sockaddr_un addr;
	socklen_t len;

	if (gs.loglevel > LOG_DEBUG + 1)
		zlog_debug("%s: attempting to connect", dmn->name);
	dmn->connect_tries++;

	memset(&addr, 0, sizeof(struct sockaddr_un));
	addr.sun_family = AF_UNIX;
	snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty", gs.vtydir,
		 dmn->name);
#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
	len = addr.sun_len = SUN_LEN(&addr);
#else
	len = sizeof(addr.sun_family) + strlen(addr.sun_path);
#endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */

	/* Quick check to see if we might succeed before we go to the trouble
	   of creating a socket. */
	if (access(addr.sun_path, W_OK) < 0) {
		/* A missing socket file is not logged here. */
		if (errno != ENOENT)
			flog_err_sys(EC_LIB_SYSTEM_CALL,
				     "%s: access to socket %s denied: %s",
				     dmn->name, addr.sun_path,
				     safe_strerror(errno));
		return -1;
	}

	if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
		flog_err_sys(EC_LIB_SOCKET, "%s(%s): cannot make socket: %s",
			     __func__, addr.sun_path, safe_strerror(errno));
		return -1;
	}

	if (set_nonblocking(sock) < 0 || set_cloexec(sock) < 0) {
		flog_err_sys(EC_LIB_SYSTEM_CALL,
			     "%s(%s): set_nonblocking/cloexec(%d) failed",
			     __func__, addr.sun_path, sock);
		close(sock);
		return -1;
	}

	if (connect(sock, (struct sockaddr *)&addr, len) < 0) {
		if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK)) {
			if (gs.loglevel > LOG_DEBUG)
				zlog_debug("%s(%s): connect failed: %s",
					   __func__, addr.sun_path,
					   safe_strerror(errno));
			close(sock);
			return -1;
		}
		if (gs.loglevel > LOG_DEBUG)
			zlog_debug("%s: connection in progress", dmn->name);
		dmn->state = DAEMON_CONNECTING;
		dmn->fd = sock;
		/* check_connect() runs once the socket becomes writable;
		 * wakeup_connect_hanging() gives up after gs.timeout seconds. */
		dmn->t_write = NULL;
		thread_add_write(master, check_connect, dmn, dmn->fd,
				 &dmn->t_write);
		dmn->t_wakeup = NULL;
		thread_add_timer(master, wakeup_connect_hanging, dmn,
				 gs.timeout, &dmn->t_wakeup);
		SET_READ_HANDLER(dmn);
		return 0;
	}

	/* Connected at once. */
	dmn->fd = sock;
	SET_READ_HANDLER(dmn);
	daemon_up(dmn, "connect succeeded");
	return 1;
}
811
a6810074 812static int phase_hanging(struct thread *t_hanging)
8b886ca7 813{
a6810074 814 gs.t_phase_hanging = NULL;
f74ae2bb 815 flog_err(EC_WATCHFRR_CONNECTION,
1c50c1c0
QY
816 "Phase [%s] hanging for %ld seconds, aborting phased restart",
817 phase_str[gs.phase], PHASE_TIMEOUT);
a6810074
DL
818 gs.phase = PHASE_NONE;
819 return 0;
8b886ca7 820}
821
a6810074 822static void set_phase(restart_phase_t new_phase)
8b886ca7 823{
a6810074
DL
824 gs.phase = new_phase;
825 if (gs.t_phase_hanging)
826 thread_cancel(gs.t_phase_hanging);
66e78ae6
QY
827 gs.t_phase_hanging = NULL;
828 thread_add_timer(master, phase_hanging, NULL, PHASE_TIMEOUT,
829 &gs.t_phase_hanging);
8b886ca7 830}
831
a6810074 832static void phase_check(void)
8b886ca7 833{
c0e5cb52
DL
834 struct daemon *dmn;
835
a6810074
DL
836 switch (gs.phase) {
837 case PHASE_NONE:
838 break;
c0e5cb52
DL
839
840 case PHASE_INIT:
841 for (dmn = gs.daemons; dmn; dmn = dmn->next)
842 if (dmn->state == DAEMON_INIT)
843 return;
844
845 /* startup complete, everything out of INIT */
846 gs.phase = PHASE_NONE;
847 for (dmn = gs.daemons; dmn; dmn = dmn->next)
848 if (dmn->state == DAEMON_DOWN) {
849 SET_WAKEUP_DOWN(dmn);
850 try_restart(dmn);
851 }
852 break;
a6810074
DL
853 case PHASE_STOPS_PENDING:
854 if (gs.numpids)
855 break;
d62a17ae 856 zlog_info(
857 "Phased restart: all routing daemon stop jobs have completed.");
a6810074
DL
858 set_phase(PHASE_WAITING_DOWN);
859
d62a17ae 860 /*FALLTHRU*/
a6810074
DL
861 case PHASE_WAITING_DOWN:
862 if (gs.numdown + IS_UP(gs.special) < gs.numdaemons)
863 break;
864 zlog_info("Phased restart: all routing daemons now down.");
865 run_job(&gs.special->restart, "restart", gs.restart_command, 1,
866 1);
867 set_phase(PHASE_ZEBRA_RESTART_PENDING);
868
d62a17ae 869 /*FALLTHRU*/
a6810074
DL
870 case PHASE_ZEBRA_RESTART_PENDING:
871 if (gs.special->restart.pid)
872 break;
873 zlog_info("Phased restart: %s restart job completed.",
874 gs.special->name);
875 set_phase(PHASE_WAITING_ZEBRA_UP);
876
d62a17ae 877 /*FALLTHRU*/
a6810074
DL
878 case PHASE_WAITING_ZEBRA_UP:
879 if (!IS_UP(gs.special))
880 break;
881 zlog_info("Phased restart: %s is now up.", gs.special->name);
882 {
883 struct daemon *dmn;
884 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
885 if (dmn != gs.special)
886 run_job(&dmn->restart, "start",
887 gs.start_command, 1, 0);
888 }
889 }
890 gs.phase = PHASE_NONE;
891 THREAD_OFF(gs.t_phase_hanging);
892 zlog_notice("Phased global restart has completed.");
893 break;
894 }
8b886ca7 895}
896
/*
 * Request a restart on behalf of 'dmn'.
 *
 * Nothing is done in watch-only mode. For an ordinary daemon, its own
 * restart job is run, but only while the special daemon is up and no
 * phase is in progress. For the special daemon itself, the global
 * restart job is run unless a phase is in progress, a background job is
 * still running, or the retry interval of the special daemon has not
 * elapsed.
 */
static void try_restart(struct daemon *dmn)
{
	if (watch_only)
		return;

	if (dmn != gs.special) {
		if ((gs.special->state == DAEMON_UP)
		    && (gs.phase == PHASE_NONE))
			run_job(&dmn->restart, "restart", gs.restart_command, 0,
				1);
		else
			zlog_debug(
				"%s: postponing restart attempt because master %s daemon "
				"not up [%s], or phased restart in progress",
				dmn->name, gs.special->name,
				state_str[gs.special->state]);
		return;
	}

	/* From here on 'dmn' is the special daemon. */
	if ((gs.phase != PHASE_NONE) || gs.numpids) {
		if (gs.loglevel > LOG_DEBUG + 1)
			zlog_debug(
				"postponing phased global restart: restart already in "
				"progress [%s], or outstanding child processes [%d]",
				phase_str[gs.phase], gs.numpids);
		return;
	}
	/* Is it too soon for a restart? */
	{
		struct timeval delay;
		if (time_elapsed(&delay, &gs.special->restart.time)->tv_sec
		    < gs.special->restart.interval) {
			if (gs.loglevel > LOG_DEBUG + 1)
				zlog_debug(
					"postponing phased global restart: "
					"elapsed time %ld < retry interval %ld",
					(long)delay.tv_sec,
					gs.special->restart.interval);
			return;
		}
	}
	/* The global job has its own bookkeeping in gs.restart. */
	run_job(&gs.restart, "restart", gs.restart_command, 0, 1);
}
940
a6810074 941static int wakeup_unresponsive(struct thread *t_wakeup)
8b886ca7 942{
a6810074
DL
943 struct daemon *dmn = THREAD_ARG(t_wakeup);
944
945 dmn->t_wakeup = NULL;
946 if (dmn->state != DAEMON_UNRESPONSIVE)
f74ae2bb 947 flog_err(EC_WATCHFRR_CONNECTION,
1c50c1c0
QY
948 "%s: no longer unresponsive (now %s), "
949 "wakeup should have been cancelled!",
950 dmn->name, state_str[dmn->state]);
a6810074
DL
951 else {
952 SET_WAKEUP_UNRESPONSIVE(dmn);
953 try_restart(dmn);
954 }
955 return 0;
8b886ca7 956}
957
a6810074 958static int wakeup_no_answer(struct thread *t_wakeup)
8b886ca7 959{
a6810074
DL
960 struct daemon *dmn = THREAD_ARG(t_wakeup);
961
962 dmn->t_wakeup = NULL;
963 dmn->state = DAEMON_UNRESPONSIVE;
f74ae2bb 964 flog_err(EC_WATCHFRR_CONNECTION,
1c50c1c0
QY
965 "%s state -> unresponsive : no response yet to ping "
966 "sent %ld seconds ago",
967 dmn->name, gs.timeout);
71e7975a
DL
968 SET_WAKEUP_UNRESPONSIVE(dmn);
969 try_restart(dmn);
a6810074 970 return 0;
8b886ca7 971}
972
a6810074 973static int wakeup_send_echo(struct thread *t_wakeup)
8b886ca7 974{
a6810074
DL
975 static const char echocmd[] = "echo " PING_TOKEN;
976 ssize_t rc;
977 struct daemon *dmn = THREAD_ARG(t_wakeup);
978
979 dmn->t_wakeup = NULL;
d62a17ae 980 if (((rc = write(dmn->fd, echocmd, sizeof(echocmd))) < 0)
981 || ((size_t)rc != sizeof(echocmd))) {
a6810074
DL
982 char why[100 + sizeof(echocmd)];
983 snprintf(why, sizeof(why),
984 "write '%s' returned %d instead of %u", echocmd,
d7c0a89a 985 (int)rc, (unsigned int)sizeof(echocmd));
a6810074
DL
986 daemon_down(dmn, why);
987 } else {
988 gettimeofday(&dmn->echo_sent, NULL);
66e78ae6
QY
989 dmn->t_wakeup = NULL;
990 thread_add_timer(master, wakeup_no_answer, dmn, gs.timeout,
991 &dmn->t_wakeup);
a6810074
DL
992 }
993 return 0;
8b886ca7 994}
995
470bc619
QY
996bool check_all_up(void)
997{
998 struct daemon *dmn;
999
1000 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1001 if (dmn->state != DAEMON_UP)
1002 return false;
1003 return true;
1004}
1005
af568444
DL
1006void watchfrr_status(struct vty *vty)
1007{
1008 struct daemon *dmn;
1009 struct timeval delay;
1010
1011 vty_out(vty, "watchfrr global phase: %s\n", phase_str[gs.phase]);
1012 if (gs.restart.pid)
1013 vty_out(vty, " global restart running, pid %ld\n",
1014 (long)gs.restart.pid);
1015
1016 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1017 vty_out(vty, " %-20s %s\n", dmn->name, state_str[dmn->state]);
1018 if (dmn->restart.pid)
1019 vty_out(vty, " restart running, pid %ld\n",
1020 (long)dmn->restart.pid);
1021 else if (dmn->state == DAEMON_DOWN &&
1022 time_elapsed(&delay, &dmn->restart.time)->tv_sec
1023 < dmn->restart.interval)
051a0be4
DL
1024 vty_out(vty, " restarting in %jd seconds"
1025 " (%jds backoff interval)\n",
1026 (intmax_t)dmn->restart.interval
1027 - (intmax_t)delay.tv_sec,
1028 (intmax_t)dmn->restart.interval);
af568444
DL
1029 }
1030}
1031
/*
 * Handler registered for SIGINT and SIGTERM: log the termination, send
 * the systemd "stopping" notification and exit with status 0.
 */
static void sigint(void)
{
	zlog_notice("Terminating on signal");

	systemd_send_stopping();

	exit(0);
}
1038
/*
 * Validate a command template: it must contain exactly one '%' and that
 * '%' must be immediately followed by 's'.
 *
 * Returns 1 when the template is valid, 0 otherwise.
 */
static int valid_command(const char *cmd)
{
	const char *pct = strchr(cmd, '%');

	if (pct == NULL)
		return 0;
	if (pct[1] != 's')
		return 0;

	/* Any further '%' after the first one makes the template invalid. */
	return strchr(pct + 1, '%') == NULL;
}
1046
/*
 * This is an ugly hack to circumvent problems with passing command-line
 * arguments that contain spaces.  The fix is to use a configuration file.
 *
 * Returns a newly allocated copy of cmd in which every occurrence of
 * blankstr has been replaced by a single space.  The caller owns the
 * returned string.  If the copy cannot be allocated, an error is printed
 * and the process exits with status 1.
 *
 * An empty blankstr leaves the command unchanged.
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	char *res;
	char *p;
	size_t bslen = strlen(blankstr);

	if (!(res = strdup(cmd))) {
		perror("strdup");
		exit(1);
	}

	/*
	 * strstr() matches an empty needle at every position; translating
	 * it would grow the string past its allocation and never finish.
	 */
	if (bslen == 0)
		return res;

	/*
	 * Continue searching just past each inserted space instead of from
	 * the start of the string.  Otherwise a token that is (or begins
	 * with) a space would be found again at the same spot forever.
	 */
	p = res;
	while ((p = strstr(p, blankstr)) != NULL) {
		*p = ' ';
		if (bslen != 1)
			/* Close the gap; regions overlap, hence memmove. */
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
		p++;
	}
	return res;
}
1066
/*
 * Timer callback for the startup timeout armed in watchfrr_init().
 * Calls daemon_send_ready(1); the thread argument is not used.
 * Always returns 0.
 */
static int startup_timeout(struct thread *t_wakeup)
{
	daemon_send_ready(1);

	return 0;
}
1072
0a7c7856
DL
1073static void watchfrr_init(int argc, char **argv)
1074{
1075 const char *special = "zebra";
1076 int i;
1077 struct daemon *dmn, **add = &gs.daemons;
1078 char alldaemons[512] = "", *p = alldaemons;
1079
5c9d1c83
DL
1080 thread_add_timer_msec(master, startup_timeout, NULL, STARTUP_TIMEOUT,
1081 &gs.t_startup_timeout);
1082
0a7c7856
DL
1083 for (i = optind; i < argc; i++) {
1084 dmn = XCALLOC(MTYPE_WATCHFRR_DAEMON, sizeof(*dmn));
1085
1086 dmn->name = dmn->restart.name = argv[i];
1087 dmn->state = DAEMON_INIT;
1088 gs.numdaemons++;
1089 gs.numdown++;
1090 dmn->fd = -1;
1091 dmn->t_wakeup = NULL;
c0e5cb52 1092 thread_add_timer_msec(master, wakeup_init, dmn, 0,
0a7c7856
DL
1093 &dmn->t_wakeup);
1094 dmn->restart.interval = gs.min_restart_interval;
1095 *add = dmn;
1096 add = &dmn->next;
1097
1098 if (!strcmp(dmn->name, special))
1099 gs.special = dmn;
1100 }
1101
1102 if (!gs.daemons) {
1103 fprintf(stderr,
1104 "Must specify one or more daemons to monitor.\n\n");
1105 frr_help_exit(1);
1106 }
1107 if (!watch_only && !gs.special) {
1108 fprintf(stderr, "\"%s\" daemon must be in daemon lists\n\n",
1109 special);
1110 frr_help_exit(1);
1111 }
1112
1113 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1114 snprintf(p, alldaemons + sizeof(alldaemons) - p, "%s%s",
1115 (p == alldaemons) ? "" : " ", dmn->name);
1116 p += strlen(p);
1117 }
1118 zlog_notice("%s %s watching [%s]%s", progname, FRR_VERSION, alldaemons,
1119 watch_only ? ", monitor mode" : "");
1120}
1121
a6810074 1122struct zebra_privs_t watchfrr_privs = {
95c4aff2 1123#ifdef VTY_GROUP
a6810074 1124 .vty_group = VTY_GROUP,
95c4aff2
DL
1125#endif
1126};
1127
4f04a76b
DL
1128static struct quagga_signal_t watchfrr_signals[] = {
1129 {
1130 .signal = SIGINT,
1131 .handler = sigint,
1132 },
1133 {
1134 .signal = SIGTERM,
1135 .handler = sigint,
1136 },
1137 {
1138 .signal = SIGCHLD,
1139 .handler = sigchild,
1140 },
1141};
1142
1143FRR_DAEMON_INFO(watchfrr, WATCHFRR,
d62a17ae 1144 .flags = FRR_NO_PRIVSEP | FRR_NO_TCPVTY | FRR_LIMITED_CLI
0a7c7856
DL
1145 | FRR_NO_CFG_PID_DRY | FRR_NO_ZCLIENT
1146 | FRR_DETACH_LATER,
4f04a76b 1147
d62a17ae 1148 .printhelp = printhelp,
1149 .copyright = "Copyright 2004 Andrew J. Schorr",
4f04a76b 1150
d62a17ae 1151 .signals = watchfrr_signals,
1152 .n_signals = array_size(watchfrr_signals),
4f04a76b 1153
d62a17ae 1154 .privs = &watchfrr_privs, )
4f04a76b 1155
999f153e
DL
1156#define DEPRECATED_OPTIONS "aAezR:"
1157
a6810074 1158int main(int argc, char **argv)
8b886ca7 1159{
a6810074 1160 int opt;
a6810074 1161 const char *blankstr = NULL;
a6810074 1162
4f04a76b
DL
1163 frr_preinit(&watchfrr_di, argc, argv);
1164 progname = watchfrr_di.progname;
1165
999f153e 1166 frr_opt_add("b:dk:l:i:p:r:S:s:t:T:" DEPRECATED_OPTIONS, longopts, "");
a6810074
DL
1167
1168 gs.restart.name = "all";
4f04a76b 1169 while ((opt = frr_getopt(argc, argv, NULL)) != EOF) {
999f153e
DL
1170 if (opt && opt < 128 && strchr(DEPRECATED_OPTIONS, opt)) {
1171 fprintf(stderr,
1172 "The -%c option no longer exists.\n"
1173 "Please refer to the watchfrr(8) man page.\n",
1174 opt);
1175 exit(1);
1176 }
1177
a6810074
DL
1178 switch (opt) {
1179 case 0:
1180 break;
a6810074
DL
1181 case 'b':
1182 blankstr = optarg;
1183 break;
f168b713
DL
1184 case OPTION_DRY:
1185 watch_only = true;
a6810074
DL
1186 break;
1187 case 'k':
1188 if (!valid_command(optarg)) {
1189 fprintf(stderr,
1190 "Invalid kill command, must contain '%%s': %s\n",
1191 optarg);
4f04a76b 1192 frr_help_exit(1);
a6810074
DL
1193 }
1194 gs.stop_command = optarg;
1195 break;
d62a17ae 1196 case 'l': {
1197 char garbage[3];
1198 if ((sscanf(optarg, "%d%1s", &gs.loglevel, garbage)
1199 != 1)
1200 || (gs.loglevel < LOG_EMERG)) {
1201 fprintf(stderr,
1202 "Invalid loglevel argument: %s\n",
1203 optarg);
1204 frr_help_exit(1);
a6810074 1205 }
d62a17ae 1206 } break;
1207 case OPTION_MINRESTART: {
1208 char garbage[3];
1209 if ((sscanf(optarg, "%ld%1s", &gs.min_restart_interval,
1210 garbage)
1211 != 1)
1212 || (gs.min_restart_interval < 0)) {
1213 fprintf(stderr,
1214 "Invalid min_restart_interval argument: %s\n",
1215 optarg);
1216 frr_help_exit(1);
a6810074 1217 }
d62a17ae 1218 } break;
1219 case OPTION_MAXRESTART: {
1220 char garbage[3];
1221 if ((sscanf(optarg, "%ld%1s", &gs.max_restart_interval,
1222 garbage)
1223 != 1)
1224 || (gs.max_restart_interval < 0)) {
1225 fprintf(stderr,
1226 "Invalid max_restart_interval argument: %s\n",
1227 optarg);
1228 frr_help_exit(1);
a6810074 1229 }
d62a17ae 1230 } break;
1231 case 'i': {
1232 char garbage[3];
1233 int period;
1234 if ((sscanf(optarg, "%d%1s", &period, garbage) != 1)
1235 || (gs.period < 1)) {
1236 fprintf(stderr,
1237 "Invalid interval argument: %s\n",
1238 optarg);
1239 frr_help_exit(1);
a6810074 1240 }
d62a17ae 1241 gs.period = 1000 * period;
1242 } break;
a6810074 1243 case 'p':
0a7c7856 1244 watchfrr_di.pid_file = optarg;
a6810074
DL
1245 break;
1246 case 'r':
a6810074
DL
1247 if (!valid_command(optarg)) {
1248 fprintf(stderr,
1249 "Invalid restart command, must contain '%%s': %s\n",
1250 optarg);
4f04a76b 1251 frr_help_exit(1);
a6810074
DL
1252 }
1253 gs.restart_command = optarg;
a6810074
DL
1254 break;
1255 case 's':
1256 if (!valid_command(optarg)) {
1257 fprintf(stderr,
1258 "Invalid start command, must contain '%%s': %s\n",
1259 optarg);
4f04a76b 1260 frr_help_exit(1);
a6810074
DL
1261 }
1262 gs.start_command = optarg;
1263 break;
1264 case 'S':
1265 gs.vtydir = optarg;
1266 break;
d62a17ae 1267 case 't': {
1268 char garbage[3];
1269 if ((sscanf(optarg, "%ld%1s", &gs.timeout, garbage)
1270 != 1)
1271 || (gs.timeout < 1)) {
1272 fprintf(stderr,
1273 "Invalid timeout argument: %s\n",
1274 optarg);
1275 frr_help_exit(1);
a6810074 1276 }
d62a17ae 1277 } break;
1278 case 'T': {
1279 char garbage[3];
1280 if ((sscanf(optarg, "%ld%1s", &gs.restart_timeout,
1281 garbage)
1282 != 1)
1283 || (gs.restart_timeout < 1)) {
1284 fprintf(stderr,
1285 "Invalid restart timeout argument: %s\n",
1286 optarg);
1287 frr_help_exit(1);
a6810074 1288 }
d62a17ae 1289 } break;
a6810074
DL
1290 default:
1291 fputs("Invalid option.\n", stderr);
4f04a76b 1292 frr_help_exit(1);
a6810074 1293 }
8b886ca7 1294 }
a6810074 1295
71e7975a
DL
1296 if (watch_only
1297 && (gs.start_command || gs.stop_command || gs.restart_command)) {
d87ae5cc 1298 fputs("Options -r/-s/-k are not used when --dry is active.\n",
a6810074 1299 stderr);
8b886ca7 1300 }
f168b713
DL
1301 if (!watch_only
1302 && (!gs.restart_command || !gs.start_command || !gs.stop_command)) {
1303 fprintf(stderr,
1304 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1305 frr_help_exit(1);
8b886ca7 1306 }
8b886ca7 1307
a6810074
DL
1308 if (blankstr) {
1309 if (gs.restart_command)
1310 gs.restart_command =
d62a17ae 1311 translate_blanks(gs.restart_command, blankstr);
a6810074
DL
1312 if (gs.start_command)
1313 gs.start_command =
d62a17ae 1314 translate_blanks(gs.start_command, blankstr);
a6810074
DL
1315 if (gs.stop_command)
1316 gs.stop_command =
d62a17ae 1317 translate_blanks(gs.stop_command, blankstr);
065de903 1318 }
8b886ca7 1319
a6810074 1320 gs.restart.interval = gs.min_restart_interval;
8b886ca7 1321
4f04a76b 1322 master = frr_init();
b647dc2a 1323 watchfrr_error_init();
0a7c7856
DL
1324 watchfrr_init(argc, argv);
1325 watchfrr_vty_init();
1326
1327 frr_config_fork();
4f04a76b 1328
dd8376fe 1329 zlog_set_level(ZLOG_DEST_MONITOR, ZLOG_DISABLED);
0a7c7856 1330 if (watchfrr_di.daemon_mode)
dd8376fe 1331 zlog_set_level(ZLOG_DEST_SYSLOG, MIN(gs.loglevel, LOG_DEBUG));
0a7c7856 1332 else
dd8376fe 1333 zlog_set_level(ZLOG_DEST_STDOUT, MIN(gs.loglevel, LOG_DEBUG));
8b886ca7 1334
0a7c7856 1335 frr_run(master);
8b886ca7 1336
a6810074
DL
1337 systemd_send_stopping();
1338 /* Not reached. */
1339 return 0;
8b886ca7 1340}