]> git.proxmox.com Git - mirror_frr.git/blame - watchfrr/watchfrr.c
Merge pull request #5005 from Frankkkkk/dockerfile
[mirror_frr.git] / watchfrr / watchfrr.c
CommitLineData
8b886ca7 1/*
896014f4
DL
2 * Monitor status of frr daemons and restart if necessary.
3 *
4 * Copyright (C) 2004 Andrew J. Schorr
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
8b886ca7 19 */
20
a365534f 21#include <zebra.h>
8b886ca7 22#include <thread.h>
23#include <log.h>
52e66296 24#include <network.h>
8b886ca7 25#include <sigevent.h>
a365534f 26#include <lib/version.h>
95c4aff2 27#include "command.h"
87f44e2f 28#include "memory_vty.h"
4f04a76b 29#include "libfrr.h"
b647dc2a 30#include "lib_errors.h"
95c4aff2 31
6f594023 32#include <getopt.h>
a365534f 33#include <sys/un.h>
34#include <sys/wait.h>
837d16cc 35#include <memory.h>
651415bd 36#include <systemd.h>
8b886ca7 37
9473e340 38#include "watchfrr.h"
b647dc2a 39#include "watchfrr_errors.h"
95c4aff2 40
/* Fallback definition, only used when no header has supplied MIN(). */
#ifndef MIN
#define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
#endif

/*
 * Timer randomization helpers.
 * JITTER(X) evaluates to a pseudo-random offset in [-(X)/2, (X) - (X)/2];
 * FUZZY(X) adds such an offset, computed from (X)/20, to X itself.
 */
#define JITTER(X) ((random() % ((X)+1))-((X)/2))
#define FUZZY(X) ((X)+JITTER((X)/20))

/* Built-in defaults; the usage text reports each of them. */
#define DEFAULT_PERIOD 5
#define DEFAULT_TIMEOUT 90
#define DEFAULT_RESTART_TIMEOUT 20
#define DEFAULT_LOGLEVEL LOG_INFO
#define DEFAULT_MIN_RESTART 60
#define DEFAULT_MAX_RESTART 600

/* Default job commands; "%s" is replaced by the daemon name. */
#define DEFAULT_RESTART_CMD WATCHFRR_SH_PATH " restart %s"
#define DEFAULT_START_CMD WATCHFRR_SH_PATH " start %s"
#define DEFAULT_STOP_CMD WATCHFRR_SH_PATH " stop %s"

/* Token sent with the "echo" command and expected back in the reply. */
#define PING_TOKEN "PING"
61
0a7c7856
DL
62DEFINE_MGROUP(WATCHFRR, "watchfrr")
63DEFINE_MTYPE_STATIC(WATCHFRR, WATCHFRR_DAEMON, "watchfrr daemon entry")
64
55c72803 65/* Needs to be global, referenced somewhere inside libfrr. */
8b886ca7 66struct thread_master *master;
67
f168b713 68static bool watch_only = false;
8b886ca7 69
a6810074
DL
/*
 * Steps of the startup / phased-restart state machine driven by
 * phase_check().  The values index phase_str[], so the two lists must be
 * kept in the same order and of the same length.
 */
typedef enum {
	PHASE_NONE = 0,
	PHASE_INIT,
	PHASE_STOPS_PENDING,
	PHASE_WAITING_DOWN,
	PHASE_ZEBRA_RESTART_PENDING,
	PHASE_WAITING_ZEBRA_UP
} restart_phase_t;

/* Log text for each restart_phase_t value, one entry per enumerator. */
static const char *phase_str[] = {
	"Idle",					  /* PHASE_NONE */
	"Startup",				  /* PHASE_INIT */
	"Stop jobs running",			  /* PHASE_STOPS_PENDING */
	"Waiting for other daemons to come down", /* PHASE_WAITING_DOWN */
	"Zebra restart job running",	/* PHASE_ZEBRA_RESTART_PENDING */
	"Waiting for zebra to come up", /* PHASE_WAITING_ZEBRA_UP */
};
88
/* Seconds a restart phase may last before phase_hanging() aborts it. */
#define PHASE_TIMEOUT (3 * gs.restart_timeout)
/*
 * Startup timeout; parenthesized so the macro is safe inside any larger
 * expression.  NOTE(review): the "* 1000" suggests milliseconds -- confirm
 * against the code that arms t_startup_timeout.
 */
#define STARTUP_TIMEOUT (55 * 1000)
8b886ca7 91
a6810074
DL
/* Bookkeeping for one background job (start/stop/restart command). */
struct restart_info {
	const char *name;      /* name substituted into the command and logs */
	const char *what;      /* job type string used in log messages */
	pid_t pid;	       /* child running the job; 0 when none is running */
	struct timeval time;   /* when the job was launched or last completed */
	long interval;	       /* minimum seconds between unforced jobs */
	struct thread *t_kill; /* timer that runs restart_kill() */
	int kills;	       /* number of signals restart_kill() has sent */
};
101
a6810074 102static struct global_state {
a6810074
DL
103 restart_phase_t phase;
104 struct thread *t_phase_hanging;
5c9d1c83 105 struct thread *t_startup_timeout;
a6810074
DL
106 const char *vtydir;
107 long period;
108 long timeout;
109 long restart_timeout;
110 long min_restart_interval;
111 long max_restart_interval;
a6810074
DL
112 struct daemon *daemons;
113 const char *restart_command;
114 const char *start_command;
115 const char *stop_command;
116 struct restart_info restart;
a6810074 117 int loglevel;
d62a17ae 118 struct daemon *special; /* points to zebra when doing phased restart */
a6810074
DL
119 int numdaemons;
120 int numpids;
d62a17ae 121 int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
8b886ca7 122} gs = {
c0e5cb52 123 .phase = PHASE_INIT,
64a249ad 124 .vtydir = frr_vtydir,
d62a17ae 125 .period = 1000 * DEFAULT_PERIOD,
126 .timeout = DEFAULT_TIMEOUT,
127 .restart_timeout = DEFAULT_RESTART_TIMEOUT,
128 .loglevel = DEFAULT_LOGLEVEL,
129 .min_restart_interval = DEFAULT_MIN_RESTART,
130 .max_restart_interval = DEFAULT_MAX_RESTART,
3ec95567
DL
131 .restart_command = DEFAULT_RESTART_CMD,
132 .start_command = DEFAULT_START_CMD,
133 .stop_command = DEFAULT_STOP_CMD,
d62a17ae 134};
a6810074
DL
135
/* Connection state of a monitored daemon; values index state_str[]. */
typedef enum {
	DAEMON_INIT,
	DAEMON_DOWN,
	DAEMON_CONNECTING,
	DAEMON_UP,
	DAEMON_UNRESPONSIVE
} daemon_state_t;

/* True while a connection exists, whether or not echoes are answered. */
#define IS_UP(DMN)                                                             \
	(((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))

/* Log text for each daemon_state_t value, in enumerator order. */
static const char *state_str[] = {
	"Init",
	"Down",
	"Connecting",
	"Up",
	"Unresponsive",
};
150
151struct daemon {
a6810074
DL
152 const char *name;
153 daemon_state_t state;
154 int fd;
155 struct timeval echo_sent;
d7c0a89a 156 unsigned int connect_tries;
a6810074
DL
157 struct thread *t_wakeup;
158 struct thread *t_read;
159 struct thread *t_write;
160 struct daemon *next;
161 struct restart_info restart;
cc53b605
DS
162
163 /*
164 * For a given daemon, if we've turned on ignore timeouts
165 * ignore the timeout value and assume everything is ok
166 * This is for daemon debugging w/ gdb after we have started
167 * FRR and realize we have something that needs to be looked
168 * at
169 */
170 bool ignore_timeout;
8b886ca7 171};
172
9272302b
DL
/* getopt_long() return codes for options that have no short form. */
#define OPTION_MINRESTART 2000
#define OPTION_MAXRESTART 2001
#define OPTION_DRY 2002

/* Long command-line options; terminated by the all-zero entry. */
static const struct option longopts[] = {
	{"daemon", no_argument, NULL, 'd'},
	{"statedir", required_argument, NULL, 'S'},
	{"loglevel", required_argument, NULL, 'l'},
	{"interval", required_argument, NULL, 'i'},
	{"timeout", required_argument, NULL, 't'},
	{"restart-timeout", required_argument, NULL, 'T'},
	{"restart", required_argument, NULL, 'r'},
	{"start-command", required_argument, NULL, 's'},
	{"kill-command", required_argument, NULL, 'k'},
	{"dry", no_argument, NULL, OPTION_DRY},
	{"min-restart-interval", required_argument, NULL, OPTION_MINRESTART},
	{"max-restart-interval", required_argument, NULL, OPTION_MAXRESTART},
	{"pid-file", required_argument, NULL, 'p'},
	{"blank-string", required_argument, NULL, 'b'},
	{"help", no_argument, NULL, 'h'},
	{"version", no_argument, NULL, 'v'},
	{NULL, 0, NULL, 0},
};
8b886ca7 195
196static int try_connect(struct daemon *dmn);
197static int wakeup_send_echo(struct thread *t_wakeup);
198static void try_restart(struct daemon *dmn);
199static void phase_check(void);
75f8b0e4 200static void restart_done(struct daemon *dmn);
8b886ca7 201
4f04a76b 202static const char *progname;
cc53b605
DS
203
204void watchfrr_set_ignore_daemon(struct vty *vty, const char *dname, bool ignore)
205{
206 struct daemon *dmn;
207
208 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
209 if (strncmp(dmn->name, dname, strlen(dmn->name)) == 0)
210 break;
211 }
212
213 if (dmn) {
214 dmn->ignore_timeout = ignore;
215 vty_out(vty, "%s switching to %s\n", dmn->name,
216 ignore ? "ignore" : "watch");
217 } else
218 vty_out(vty, "%s is not configured for running at the moment",
219 dname);
220}
221
4f04a76b 222static void printhelp(FILE *target)
8b886ca7 223{
d62a17ae 224 fprintf(target,
225 "Usage : %s [OPTION...] <daemon name> ...\n\n\
9473e340 226Watchdog program to monitor status of frr daemons and try to restart\n\
8b886ca7 227them if they are down or unresponsive. It determines whether a daemon is\n\
228up based on whether it can connect to the daemon's vty unix stream socket.\n\
229It then repeatedly sends echo commands over that socket to determine whether\n\
230the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
231on the socket connection and know immediately that the daemon is down.\n\n\
232The daemons to be monitored should be listed on the command line.\n\n\
8b886ca7 233In order to avoid attempting to restart the daemons in a fast loop,\n\
234the -m and -M options allow you to control the minimum delay between\n\
235restart commands. The minimum restart delay is recalculated each time\n\
236a restart is attempted: if the time since the last restart attempt exceeds\n\
237twice the -M value, then the restart delay is set to the -m value.\n\
d62a17ae 238Otherwise, the interval is doubled (but capped at the -M value).\n\n",
f168b713 239 progname);
e757c940 240
d62a17ae 241 fprintf(target,
242 "Options:\n\
8b886ca7 243-d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
244 to syslog instead of stdout.\n\
245-S, --statedir Set the vty socket directory (default is %s)\n\
8b886ca7 246-l, --loglevel Set the logging level (default is %d).\n\
247 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
248 but it can be set higher than %d if extra-verbose debugging\n\
249 messages are desired.\n\
9272302b 250 --min-restart-interval\n\
8b886ca7 251 Set the minimum seconds to wait between invocations of daemon\n\
252 restart commands (default is %d).\n\
9272302b 253 --max-restart-interval\n\
8b886ca7 254 Set the maximum seconds to wait between invocations of daemon\n\
255 restart commands (default is %d).\n\
256-i, --interval Set the status polling interval in seconds (default is %d)\n\
257-t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
258-T, --restart-timeout\n\
259 Set the restart (kill) timeout in seconds (default is %d).\n\
260 If any background jobs are still running after this much\n\
261 time has elapsed, they will be killed.\n\
262-r, --restart Supply a Bourne shell command to use to restart a single\n\
263 daemon. The command string should include '%%s' where the\n\
264 name of the daemon should be substituted.\n\
3ec95567 265 (default: '%s')\n\
8b886ca7 266-s, --start-command\n\
267 Supply a Bourne shell to command to use to start a single\n\
268 daemon. The command string should include '%%s' where the\n\
269 name of the daemon should be substituted.\n\
3ec95567 270 (default: '%s')\n\
8b886ca7 271-k, --kill-command\n\
272 Supply a Bourne shell to command to use to stop a single\n\
273 daemon. The command string should include '%%s' where the\n\
274 name of the daemon should be substituted.\n\
3ec95567 275 (default: '%s')\n\
f168b713 276 --dry Do not start or restart anything, just log.\n\
8b886ca7 277-p, --pid-file Set process identifier file name\n\
0a7c7856 278 (default is %s/watchfrr.pid).\n\
c8b40f86 279-b, --blank-string\n\
280 When the supplied argument string is found in any of the\n\
f168b713 281 various shell command arguments (-r, -s, or -k), replace\n\
c8b40f86 282 it with a space. This is an ugly hack to circumvent problems\n\
283 passing command-line arguments with embedded spaces.\n\
8b886ca7 284-v, --version Print program version\n\
d62a17ae 285-h, --help Display this help and exit\n",
64a249ad 286 frr_vtydir, DEFAULT_LOGLEVEL, LOG_EMERG, LOG_DEBUG, LOG_DEBUG,
d62a17ae 287 DEFAULT_MIN_RESTART, DEFAULT_MAX_RESTART, DEFAULT_PERIOD,
3ec95567
DL
288 DEFAULT_TIMEOUT, DEFAULT_RESTART_TIMEOUT,
289 DEFAULT_RESTART_CMD, DEFAULT_START_CMD, DEFAULT_STOP_CMD,
290 frr_vtydir);
8b886ca7 291}
292
a6810074 293static pid_t run_background(char *shell_cmd)
8b886ca7 294{
a6810074
DL
295 pid_t child;
296
297 switch (child = fork()) {
298 case -1:
450971aa 299 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
300 "fork failed, cannot run command [%s]: %s",
301 shell_cmd, safe_strerror(errno));
a6810074
DL
302 return -1;
303 case 0:
304 /* Child process. */
d62a17ae 305 /* Use separate process group so child processes can be killed
306 * easily. */
a6810074
DL
307 if (setpgid(0, 0) < 0)
308 zlog_warn("warning: setpgid(0,0) failed: %s",
309 safe_strerror(errno));
310 {
311 char shell[] = "sh";
312 char dashc[] = "-c";
d62a17ae 313 char *const argv[4] = {shell, dashc, shell_cmd, NULL};
a6810074 314 execv("/bin/sh", argv);
450971aa 315 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
316 "execv(/bin/sh -c '%s') failed: %s",
317 shell_cmd, safe_strerror(errno));
a6810074
DL
318 _exit(127);
319 }
320 default:
321 /* Parent process: we will reap the child later. */
450971aa 322 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
323 "Forked background command [pid %d]: %s",
324 (int)child, shell_cmd);
a6810074
DL
325 return child;
326 }
8b886ca7 327}
328
a6810074
DL
/*
 * Store (current time - *start_time) in *result and return `result`.
 * tv_usec of the result is normalized so that it is never negative.
 */
static struct timeval *time_elapsed(struct timeval *result,
				    const struct timeval *start_time)
{
	gettimeofday(result, NULL);
	result->tv_sec -= start_time->tv_sec;
	result->tv_usec -= start_time->tv_usec;

	/* Borrow whole seconds until the microsecond part is non-negative. */
	for (; result->tv_usec < 0; result->tv_sec--)
		result->tv_usec += 1000000L;

	return result;
}
341
a6810074 342static int restart_kill(struct thread *t_kill)
8b886ca7 343{
a6810074
DL
344 struct restart_info *restart = THREAD_ARG(t_kill);
345 struct timeval delay;
346
347 time_elapsed(&delay, &restart->time);
d62a17ae 348 zlog_warn(
349 "Warning: %s %s child process %d still running after "
350 "%ld seconds, sending signal %d",
351 restart->what, restart->name, (int)restart->pid,
352 (long)delay.tv_sec, (restart->kills ? SIGKILL : SIGTERM));
a6810074
DL
353 kill(-restart->pid, (restart->kills ? SIGKILL : SIGTERM));
354 restart->kills++;
66e78ae6
QY
355 restart->t_kill = NULL;
356 thread_add_timer(master, restart_kill, restart, gs.restart_timeout,
357 &restart->t_kill);
a6810074 358 return 0;
8b886ca7 359}
360
a6810074 361static struct restart_info *find_child(pid_t child)
8b886ca7 362{
f168b713 363 struct daemon *dmn;
7c265f7d
CF
364 if (gs.restart.pid == child)
365 return &gs.restart;
366
f168b713
DL
367 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
368 if (dmn->restart.pid == child)
369 return &dmn->restart;
a6810074
DL
370 }
371 return NULL;
8b886ca7 372}
373
a6810074 374static void sigchild(void)
8b886ca7 375{
a6810074
DL
376 pid_t child;
377 int status;
378 const char *name;
379 const char *what;
380 struct restart_info *restart;
75f8b0e4 381 struct daemon *dmn;
a6810074
DL
382
383 switch (child = waitpid(-1, &status, WNOHANG)) {
384 case -1:
450971aa 385 flog_err_sys(EC_LIB_SYSTEM_CALL, "waitpid failed: %s",
09c866e3 386 safe_strerror(errno));
a6810074
DL
387 return;
388 case 0:
389 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
390 return;
391 }
392
393 if (child == integrated_write_pid) {
394 integrated_write_sigchld(status);
395 return;
396 }
397
398 if ((restart = find_child(child)) != NULL) {
399 name = restart->name;
400 what = restart->what;
401 restart->pid = 0;
402 gs.numpids--;
403 thread_cancel(restart->t_kill);
404 restart->t_kill = NULL;
d62a17ae 405 /* Update restart time to reflect the time the command
406 * completed. */
a6810074
DL
407 gettimeofday(&restart->time, NULL);
408 } else {
09c866e3 409 flog_err_sys(
450971aa 410 EC_LIB_SYSTEM_CALL,
09c866e3
QY
411 "waitpid returned status for an unknown child process %d",
412 (int)child);
a6810074
DL
413 name = "(unknown)";
414 what = "background";
415 }
416 if (WIFSTOPPED(status))
d62a17ae 417 zlog_warn("warning: %s %s process %d is stopped", what, name,
418 (int)child);
a6810074 419 else if (WIFSIGNALED(status))
d62a17ae 420 zlog_warn("%s %s process %d terminated due to signal %d", what,
421 name, (int)child, WTERMSIG(status));
a6810074
DL
422 else if (WIFEXITED(status)) {
423 if (WEXITSTATUS(status) != 0)
d62a17ae 424 zlog_warn(
425 "%s %s process %d exited with non-zero status %d",
426 what, name, (int)child, WEXITSTATUS(status));
75f8b0e4 427 else {
a6810074
DL
428 zlog_debug("%s %s process %d exited normally", what,
429 name, (int)child);
75f8b0e4
DL
430
431 if (restart && restart != &gs.restart) {
432 dmn = container_of(restart, struct daemon,
433 restart);
434 restart_done(dmn);
435 } else if (restart)
436 for (dmn = gs.daemons; dmn; dmn = dmn->next)
437 restart_done(dmn);
438 }
a6810074 439 } else
09c866e3 440 flog_err_sys(
450971aa 441 EC_LIB_SYSTEM_CALL,
09c866e3
QY
442 "cannot interpret %s %s process %d wait status 0x%x",
443 what, name, (int)child, status);
a6810074 444 phase_check();
8b886ca7 445}
446
d62a17ae 447static int run_job(struct restart_info *restart, const char *cmdtype,
448 const char *command, int force, int update_interval)
8b886ca7 449{
a6810074
DL
450 struct timeval delay;
451
452 if (gs.loglevel > LOG_DEBUG + 1)
453 zlog_debug("attempting to %s %s", cmdtype, restart->name);
454
455 if (restart->pid) {
456 if (gs.loglevel > LOG_DEBUG + 1)
d62a17ae 457 zlog_debug(
458 "cannot %s %s, previous pid %d still running",
459 cmdtype, restart->name, (int)restart->pid);
a6810074
DL
460 return -1;
461 }
462
d62a17ae 463 /* Note: time_elapsed test must come before the force test, since we
464 need
a6810074
DL
465 to make sure that delay is initialized for use below in updating the
466 restart interval. */
467 if ((time_elapsed(&delay, &restart->time)->tv_sec < restart->interval)
468 && !force) {
469 if (gs.loglevel > LOG_DEBUG + 1)
d62a17ae 470 zlog_debug(
471 "postponing %s %s: "
472 "elapsed time %ld < retry interval %ld",
473 cmdtype, restart->name, (long)delay.tv_sec,
474 restart->interval);
a6810074
DL
475 return -1;
476 }
477
478 gettimeofday(&restart->time, NULL);
479 restart->kills = 0;
480 {
481 char cmd[strlen(command) + strlen(restart->name) + 1];
482 snprintf(cmd, sizeof(cmd), command, restart->name);
483 if ((restart->pid = run_background(cmd)) > 0) {
66e78ae6 484 restart->t_kill = NULL;
d62a17ae 485 thread_add_timer(master, restart_kill, restart,
486 gs.restart_timeout, &restart->t_kill);
a6810074
DL
487 restart->what = cmdtype;
488 gs.numpids++;
489 } else
490 restart->pid = 0;
491 }
492
493 /* Calculate the new restart interval. */
494 if (update_interval) {
495 if (delay.tv_sec > 2 * gs.max_restart_interval)
496 restart->interval = gs.min_restart_interval;
497 else if ((restart->interval *= 2) > gs.max_restart_interval)
498 restart->interval = gs.max_restart_interval;
499 if (gs.loglevel > LOG_DEBUG + 1)
500 zlog_debug("restart %s interval is now %ld",
501 restart->name, restart->interval);
502 }
503 return restart->pid;
8b886ca7 504}
505
/*
 * Scheduling helpers.  Each one clears the daemon's thread pointer and then
 * registers the matching callback.  The do { } while (0) bodies end without
 * a semicolon so that an invocation followed by ';' is exactly one
 * statement and can be used safely in an unbraced if/else.
 */

/* Run handle_read() when the daemon's socket becomes readable. */
#define SET_READ_HANDLER(DMN)                                                  \
	do {                                                                   \
		(DMN)->t_read = NULL;                                          \
		thread_add_read(master, handle_read, (DMN), (DMN)->fd,         \
				&(DMN)->t_read);                               \
	} while (0)

/* Run wakeup_down() after a fuzzed gs.period (milliseconds). */
#define SET_WAKEUP_DOWN(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_down, (DMN),              \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Run wakeup_unresponsive() after a fuzzed gs.period (milliseconds). */
#define SET_WAKEUP_UNRESPONSIVE(DMN)                                           \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_unresponsive, (DMN),      \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Run wakeup_send_echo() after a fuzzed gs.period (milliseconds). */
#define SET_WAKEUP_ECHO(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_send_echo, (DMN),         \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)
8b886ca7 533
a6810074 534static int wakeup_down(struct thread *t_wakeup)
8b886ca7 535{
a6810074
DL
536 struct daemon *dmn = THREAD_ARG(t_wakeup);
537
538 dmn->t_wakeup = NULL;
539 if (try_connect(dmn) < 0)
540 SET_WAKEUP_DOWN(dmn);
541 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
542 try_restart(dmn);
543 return 0;
8b886ca7 544}
545
a6810074 546static int wakeup_init(struct thread *t_wakeup)
8b886ca7 547{
a6810074
DL
548 struct daemon *dmn = THREAD_ARG(t_wakeup);
549
550 dmn->t_wakeup = NULL;
551 if (try_connect(dmn) < 0) {
f74ae2bb 552 flog_err(EC_WATCHFRR_CONNECTION,
1c50c1c0
QY
553 "%s state -> down : initial connection attempt failed",
554 dmn->name);
a6810074
DL
555 dmn->state = DAEMON_DOWN;
556 }
c0e5cb52 557 phase_check();
a6810074 558 return 0;
8b886ca7 559}
560
75f8b0e4
DL
561static void restart_done(struct daemon *dmn)
562{
563 if (dmn->state != DAEMON_DOWN) {
3f391bec
DS
564 zlog_warn(
565 "Daemon: %s: is in %s state but expected it to be in DAEMON_DOWN state",
566 dmn->name, state_str[dmn->state]);
75f8b0e4
DL
567 return;
568 }
569 if (dmn->t_wakeup)
570 THREAD_OFF(dmn->t_wakeup);
571 if (try_connect(dmn) < 0)
572 SET_WAKEUP_DOWN(dmn);
573}
574
a6810074 575static void daemon_down(struct daemon *dmn, const char *why)
8b886ca7 576{
a6810074 577 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
1c50c1c0
QY
578 flog_err(EC_WATCHFRR_CONNECTION, "%s state -> down : %s",
579 dmn->name, why);
a6810074
DL
580 else if (gs.loglevel > LOG_DEBUG)
581 zlog_debug("%s still down : %s", dmn->name, why);
582 if (IS_UP(dmn))
583 gs.numdown++;
584 dmn->state = DAEMON_DOWN;
585 if (dmn->fd >= 0) {
586 close(dmn->fd);
587 dmn->fd = -1;
588 }
589 THREAD_OFF(dmn->t_read);
590 THREAD_OFF(dmn->t_write);
591 THREAD_OFF(dmn->t_wakeup);
592 if (try_connect(dmn) < 0)
593 SET_WAKEUP_DOWN(dmn);
594 phase_check();
8b886ca7 595}
596
a6810074 597static int handle_read(struct thread *t_read)
8b886ca7 598{
a6810074
DL
599 struct daemon *dmn = THREAD_ARG(t_read);
600 static const char resp[sizeof(PING_TOKEN) + 4] = PING_TOKEN "\n";
601 char buf[sizeof(resp) + 100];
602 ssize_t rc;
603 struct timeval delay;
604
605 dmn->t_read = NULL;
606 if ((rc = read(dmn->fd, buf, sizeof(buf))) < 0) {
607 char why[100];
608
609 if (ERRNO_IO_RETRY(errno)) {
610 /* Pretend it never happened. */
611 SET_READ_HANDLER(dmn);
612 return 0;
613 }
614 snprintf(why, sizeof(why), "unexpected read error: %s",
615 safe_strerror(errno));
616 daemon_down(dmn, why);
617 return 0;
8b886ca7 618 }
a6810074
DL
619 if (rc == 0) {
620 daemon_down(dmn, "read returned EOF");
621 return 0;
622 }
623 if (!dmn->echo_sent.tv_sec) {
624 char why[sizeof(buf) + 100];
625 snprintf(why, sizeof(why),
626 "unexpected read returns %d bytes: %.*s", (int)rc,
627 (int)rc, buf);
628 daemon_down(dmn, why);
629 return 0;
8b886ca7 630 }
a6810074
DL
631
632 /* We are expecting an echo response: is there any chance that the
633 response would not be returned entirely in the first read? That
634 seems inconceivable... */
635 if ((rc != sizeof(resp)) || memcmp(buf, resp, sizeof(resp))) {
636 char why[100 + sizeof(buf)];
637 snprintf(why, sizeof(why),
638 "read returned bad echo response of %d bytes "
d62a17ae 639 "(expecting %u): %.*s",
d7c0a89a 640 (int)rc, (unsigned int)sizeof(resp), (int)rc, buf);
a6810074
DL
641 daemon_down(dmn, why);
642 return 0;
643 }
644
645 time_elapsed(&delay, &dmn->echo_sent);
646 dmn->echo_sent.tv_sec = 0;
647 if (dmn->state == DAEMON_UNRESPONSIVE) {
648 if (delay.tv_sec < gs.timeout) {
649 dmn->state = DAEMON_UP;
d62a17ae 650 zlog_warn(
651 "%s state -> up : echo response received after %ld.%06ld "
652 "seconds",
653 dmn->name, (long)delay.tv_sec,
654 (long)delay.tv_usec);
a6810074 655 } else
d62a17ae 656 zlog_warn(
657 "%s: slow echo response finally received after %ld.%06ld "
658 "seconds",
659 dmn->name, (long)delay.tv_sec,
660 (long)delay.tv_usec);
a6810074
DL
661 } else if (gs.loglevel > LOG_DEBUG + 1)
662 zlog_debug("%s: echo response received after %ld.%06ld seconds",
663 dmn->name, (long)delay.tv_sec, (long)delay.tv_usec);
664
665 SET_READ_HANDLER(dmn);
666 if (dmn->t_wakeup)
667 thread_cancel(dmn->t_wakeup);
668 SET_WAKEUP_ECHO(dmn);
669
670 return 0;
8b886ca7 671}
672
207e0d7a
DS
673/*
674 * Wait till we notice that all daemons are ready before
675 * we send we are ready to systemd
676 */
5c9d1c83 677static void daemon_send_ready(int exitcode)
207e0d7a 678{
5c9d1c83 679 FILE *fp;
a6810074 680 static int sent = 0;
43e587c1 681 char started[1024];
207e0d7a 682
5c9d1c83
DL
683 if (sent)
684 return;
685
686 if (exitcode == 0)
0a7c7856 687 zlog_notice("all daemons up, doing startup-complete notify");
5c9d1c83
DL
688 else if (gs.numdown < gs.numdaemons)
689 flog_err(EC_WATCHFRR_CONNECTION,
690 "startup did not complete within timeout"
691 " (%d/%d daemons running)",
692 gs.numdaemons - gs.numdown, gs.numdaemons);
693 else {
694 flog_err(EC_WATCHFRR_CONNECTION,
695 "all configured daemons failed to start"
696 " -- exiting watchfrr");
697 exit(exitcode);
698
699 }
0a7c7856 700
5c9d1c83
DL
701 frr_detach();
702
3c649c71
DS
703 snprintf(started, sizeof(started), "%s%s", frr_vtydir,
704 "watchfrr.started");
705 fp = fopen(started, "w");
5c9d1c83
DL
706 if (fp)
707 fclose(fp);
60bd2534 708#if defined HAVE_SYSTEMD
5c9d1c83 709 systemd_send_started(master, 0);
60bd2534 710#endif
5c9d1c83 711 sent = 1;
207e0d7a
DS
712}
713
a6810074 714static void daemon_up(struct daemon *dmn, const char *why)
8b886ca7 715{
a6810074
DL
716 dmn->state = DAEMON_UP;
717 gs.numdown--;
718 dmn->connect_tries = 0;
719 zlog_notice("%s state -> up : %s", dmn->name, why);
5c9d1c83
DL
720 if (gs.numdown == 0)
721 daemon_send_ready(0);
a8cbb8b3 722 SET_WAKEUP_ECHO(dmn);
a6810074 723 phase_check();
8b886ca7 724}
725
a6810074 726static int check_connect(struct thread *t_write)
8b886ca7 727{
a6810074
DL
728 struct daemon *dmn = THREAD_ARG(t_write);
729 int sockerr;
730 socklen_t reslen = sizeof(sockerr);
731
732 dmn->t_write = NULL;
733 if (getsockopt(dmn->fd, SOL_SOCKET, SO_ERROR, (char *)&sockerr, &reslen)
734 < 0) {
735 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn->name,
736 safe_strerror(errno));
737 daemon_down(dmn,
738 "getsockopt failed checking connection success");
739 return 0;
740 }
741 if ((reslen == sizeof(sockerr)) && sockerr) {
742 char why[100];
d62a17ae 743 snprintf(
744 why, sizeof(why),
745 "getsockopt reports that connection attempt failed: %s",
746 safe_strerror(sockerr));
a6810074
DL
747 daemon_down(dmn, why);
748 return 0;
749 }
750
751 daemon_up(dmn, "delayed connect succeeded");
752 return 0;
8b886ca7 753}
754
a6810074 755static int wakeup_connect_hanging(struct thread *t_wakeup)
8b886ca7 756{
a6810074
DL
757 struct daemon *dmn = THREAD_ARG(t_wakeup);
758 char why[100];
759
760 dmn->t_wakeup = NULL;
761 snprintf(why, sizeof(why),
762 "connection attempt timed out after %ld seconds", gs.timeout);
763 daemon_down(dmn, why);
764 return 0;
8b886ca7 765}
766
767/* Making connection to protocol daemon. */
a6810074 768static int try_connect(struct daemon *dmn)
8b886ca7 769{
a6810074
DL
770 int sock;
771 struct sockaddr_un addr;
772 socklen_t len;
773
774 if (gs.loglevel > LOG_DEBUG + 1)
775 zlog_debug("%s: attempting to connect", dmn->name);
776 dmn->connect_tries++;
777
778 memset(&addr, 0, sizeof(struct sockaddr_un));
779 addr.sun_family = AF_UNIX;
d62a17ae 780 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty", gs.vtydir,
781 dmn->name);
6f0e3f6e 782#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
a6810074 783 len = addr.sun_len = SUN_LEN(&addr);
8b886ca7 784#else
a6810074 785 len = sizeof(addr.sun_family) + strlen(addr.sun_path);
d62a17ae 786#endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
a6810074
DL
787
788 /* Quick check to see if we might succeed before we go to the trouble
789 of creating a socket. */
790 if (access(addr.sun_path, W_OK) < 0) {
791 if (errno != ENOENT)
450971aa 792 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
793 "%s: access to socket %s denied: %s",
794 dmn->name, addr.sun_path,
795 safe_strerror(errno));
a6810074
DL
796 return -1;
797 }
798
799 if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
450971aa 800 flog_err_sys(EC_LIB_SOCKET, "%s(%s): cannot make socket: %s",
09c866e3 801 __func__, addr.sun_path, safe_strerror(errno));
a6810074
DL
802 return -1;
803 }
804
805 if (set_nonblocking(sock) < 0 || set_cloexec(sock) < 0) {
450971aa 806 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
807 "%s(%s): set_nonblocking/cloexec(%d) failed",
808 __func__, addr.sun_path, sock);
a6810074
DL
809 close(sock);
810 return -1;
8b886ca7 811 }
a6810074
DL
812
813 if (connect(sock, (struct sockaddr *)&addr, len) < 0) {
814 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK)) {
815 if (gs.loglevel > LOG_DEBUG)
816 zlog_debug("%s(%s): connect failed: %s",
817 __func__, addr.sun_path,
818 safe_strerror(errno));
819 close(sock);
820 return -1;
821 }
822 if (gs.loglevel > LOG_DEBUG)
823 zlog_debug("%s: connection in progress", dmn->name);
824 dmn->state = DAEMON_CONNECTING;
825 dmn->fd = sock;
66e78ae6
QY
826 dmn->t_write = NULL;
827 thread_add_write(master, check_connect, dmn, dmn->fd,
d62a17ae 828 &dmn->t_write);
829 dmn->t_wakeup = NULL;
830 thread_add_timer(master, wakeup_connect_hanging, dmn,
831 gs.timeout, &dmn->t_wakeup);
a6810074
DL
832 SET_READ_HANDLER(dmn);
833 return 0;
834 }
835
836 dmn->fd = sock;
837 SET_READ_HANDLER(dmn);
838 daemon_up(dmn, "connect succeeded");
839 return 1;
8b886ca7 840}
841
a6810074 842static int phase_hanging(struct thread *t_hanging)
8b886ca7 843{
a6810074 844 gs.t_phase_hanging = NULL;
f74ae2bb 845 flog_err(EC_WATCHFRR_CONNECTION,
1c50c1c0
QY
846 "Phase [%s] hanging for %ld seconds, aborting phased restart",
847 phase_str[gs.phase], PHASE_TIMEOUT);
a6810074
DL
848 gs.phase = PHASE_NONE;
849 return 0;
8b886ca7 850}
851
a6810074 852static void set_phase(restart_phase_t new_phase)
8b886ca7 853{
a6810074
DL
854 gs.phase = new_phase;
855 if (gs.t_phase_hanging)
856 thread_cancel(gs.t_phase_hanging);
66e78ae6
QY
857 gs.t_phase_hanging = NULL;
858 thread_add_timer(master, phase_hanging, NULL, PHASE_TIMEOUT,
859 &gs.t_phase_hanging);
8b886ca7 860}
861
a6810074 862static void phase_check(void)
8b886ca7 863{
c0e5cb52
DL
864 struct daemon *dmn;
865
a6810074
DL
866 switch (gs.phase) {
867 case PHASE_NONE:
868 break;
c0e5cb52
DL
869
870 case PHASE_INIT:
871 for (dmn = gs.daemons; dmn; dmn = dmn->next)
872 if (dmn->state == DAEMON_INIT)
873 return;
874
875 /* startup complete, everything out of INIT */
876 gs.phase = PHASE_NONE;
877 for (dmn = gs.daemons; dmn; dmn = dmn->next)
878 if (dmn->state == DAEMON_DOWN) {
879 SET_WAKEUP_DOWN(dmn);
880 try_restart(dmn);
881 }
882 break;
a6810074
DL
883 case PHASE_STOPS_PENDING:
884 if (gs.numpids)
885 break;
d62a17ae 886 zlog_info(
887 "Phased restart: all routing daemon stop jobs have completed.");
a6810074
DL
888 set_phase(PHASE_WAITING_DOWN);
889
d62a17ae 890 /*FALLTHRU*/
a6810074
DL
891 case PHASE_WAITING_DOWN:
892 if (gs.numdown + IS_UP(gs.special) < gs.numdaemons)
893 break;
894 zlog_info("Phased restart: all routing daemons now down.");
895 run_job(&gs.special->restart, "restart", gs.restart_command, 1,
896 1);
897 set_phase(PHASE_ZEBRA_RESTART_PENDING);
898
d62a17ae 899 /*FALLTHRU*/
a6810074
DL
900 case PHASE_ZEBRA_RESTART_PENDING:
901 if (gs.special->restart.pid)
902 break;
903 zlog_info("Phased restart: %s restart job completed.",
904 gs.special->name);
905 set_phase(PHASE_WAITING_ZEBRA_UP);
906
d62a17ae 907 /*FALLTHRU*/
a6810074
DL
908 case PHASE_WAITING_ZEBRA_UP:
909 if (!IS_UP(gs.special))
910 break;
911 zlog_info("Phased restart: %s is now up.", gs.special->name);
912 {
913 struct daemon *dmn;
914 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
915 if (dmn != gs.special)
916 run_job(&dmn->restart, "start",
917 gs.start_command, 1, 0);
918 }
919 }
920 gs.phase = PHASE_NONE;
921 THREAD_OFF(gs.t_phase_hanging);
922 zlog_notice("Phased global restart has completed.");
923 break;
924 }
8b886ca7 925}
926
a6810074 927static void try_restart(struct daemon *dmn)
8b886ca7 928{
f168b713 929 if (watch_only)
a6810074 930 return;
a6810074 931
f168b713
DL
932 if (dmn != gs.special) {
933 if ((gs.special->state == DAEMON_UP)
934 && (gs.phase == PHASE_NONE))
935 run_job(&dmn->restart, "restart", gs.restart_command, 0,
936 1);
937 else
938 zlog_debug(
939 "%s: postponing restart attempt because master %s daemon "
940 "not up [%s], or phased restart in progress",
941 dmn->name, gs.special->name,
942 state_str[gs.special->state]);
943 return;
944 }
945
946 if ((gs.phase != PHASE_NONE) || gs.numpids) {
947 if (gs.loglevel > LOG_DEBUG + 1)
948 zlog_debug(
949 "postponing phased global restart: restart already in "
950 "progress [%s], or outstanding child processes [%d]",
951 phase_str[gs.phase], gs.numpids);
952 return;
953 }
954 /* Is it too soon for a restart? */
955 {
956 struct timeval delay;
957 if (time_elapsed(&delay, &gs.special->restart.time)->tv_sec
958 < gs.special->restart.interval) {
a6810074 959 if (gs.loglevel > LOG_DEBUG + 1)
d62a17ae 960 zlog_debug(
f168b713
DL
961 "postponing phased global restart: "
962 "elapsed time %ld < retry interval %ld",
963 (long)delay.tv_sec,
964 gs.special->restart.interval);
965 return;
a6810074 966 }
8b886ca7 967 }
f168b713 968 run_job(&gs.restart, "restart", gs.restart_command, 0, 1);
8b886ca7 969}
970
a6810074 971static int wakeup_unresponsive(struct thread *t_wakeup)
8b886ca7 972{
a6810074
DL
973 struct daemon *dmn = THREAD_ARG(t_wakeup);
974
975 dmn->t_wakeup = NULL;
976 if (dmn->state != DAEMON_UNRESPONSIVE)
f74ae2bb 977 flog_err(EC_WATCHFRR_CONNECTION,
1c50c1c0
QY
978 "%s: no longer unresponsive (now %s), "
979 "wakeup should have been cancelled!",
980 dmn->name, state_str[dmn->state]);
a6810074
DL
981 else {
982 SET_WAKEUP_UNRESPONSIVE(dmn);
983 try_restart(dmn);
984 }
985 return 0;
8b886ca7 986}
987
a6810074 988static int wakeup_no_answer(struct thread *t_wakeup)
8b886ca7 989{
a6810074
DL
990 struct daemon *dmn = THREAD_ARG(t_wakeup);
991
992 dmn->t_wakeup = NULL;
993 dmn->state = DAEMON_UNRESPONSIVE;
cc53b605
DS
994 if (dmn->ignore_timeout)
995 return 0;
f74ae2bb 996 flog_err(EC_WATCHFRR_CONNECTION,
1c50c1c0
QY
997 "%s state -> unresponsive : no response yet to ping "
998 "sent %ld seconds ago",
999 dmn->name, gs.timeout);
71e7975a
DL
1000 SET_WAKEUP_UNRESPONSIVE(dmn);
1001 try_restart(dmn);
a6810074 1002 return 0;
8b886ca7 1003}
1004
a6810074 1005static int wakeup_send_echo(struct thread *t_wakeup)
8b886ca7 1006{
a6810074
DL
1007 static const char echocmd[] = "echo " PING_TOKEN;
1008 ssize_t rc;
1009 struct daemon *dmn = THREAD_ARG(t_wakeup);
1010
1011 dmn->t_wakeup = NULL;
d62a17ae 1012 if (((rc = write(dmn->fd, echocmd, sizeof(echocmd))) < 0)
1013 || ((size_t)rc != sizeof(echocmd))) {
a6810074
DL
1014 char why[100 + sizeof(echocmd)];
1015 snprintf(why, sizeof(why),
1016 "write '%s' returned %d instead of %u", echocmd,
d7c0a89a 1017 (int)rc, (unsigned int)sizeof(echocmd));
a6810074
DL
1018 daemon_down(dmn, why);
1019 } else {
1020 gettimeofday(&dmn->echo_sent, NULL);
66e78ae6
QY
1021 dmn->t_wakeup = NULL;
1022 thread_add_timer(master, wakeup_no_answer, dmn, gs.timeout,
1023 &dmn->t_wakeup);
a6810074
DL
1024 }
1025 return 0;
8b886ca7 1026}
1027
470bc619
QY
1028bool check_all_up(void)
1029{
1030 struct daemon *dmn;
1031
1032 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1033 if (dmn->state != DAEMON_UP)
1034 return false;
1035 return true;
1036}
1037
af568444
DL
1038void watchfrr_status(struct vty *vty)
1039{
1040 struct daemon *dmn;
1041 struct timeval delay;
1042
1043 vty_out(vty, "watchfrr global phase: %s\n", phase_str[gs.phase]);
1044 if (gs.restart.pid)
1045 vty_out(vty, " global restart running, pid %ld\n",
1046 (long)gs.restart.pid);
1047
1048 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
cc53b605
DS
1049 vty_out(vty, " %-20s %s%s", dmn->name, state_str[dmn->state],
1050 dmn->ignore_timeout ? "/Ignoring Timeout\n" : "\n");
af568444
DL
1051 if (dmn->restart.pid)
1052 vty_out(vty, " restart running, pid %ld\n",
1053 (long)dmn->restart.pid);
1054 else if (dmn->state == DAEMON_DOWN &&
1055 time_elapsed(&delay, &dmn->restart.time)->tv_sec
1056 < dmn->restart.interval)
051a0be4
DL
1057 vty_out(vty, " restarting in %jd seconds"
1058 " (%jds backoff interval)\n",
1059 (intmax_t)dmn->restart.interval
1060 - (intmax_t)delay.tv_sec,
1061 (intmax_t)dmn->restart.interval);
af568444
DL
1062 }
1063}
1064
/*
 * Termination handler, installed for SIGINT and SIGTERM in watchfrr_signals:
 * logs the shutdown, tells systemd we are stopping, and exits successfully.
 */
static void sigint(void)
{
	zlog_notice("Terminating on signal");
	systemd_send_stopping();
	exit(EXIT_SUCCESS);
}
1071
/*
 * Check that a command template contains exactly one '%' character and that
 * it is immediately followed by 's'.
 *
 * Returns 1 when the template is acceptable, 0 otherwise.
 */
static int valid_command(const char *cmd)
{
	const char *percent = strchr(cmd, '%');

	/* No placeholder at all. */
	if (percent == NULL)
		return 0;

	/* The first '%' must introduce "%s". */
	if (percent[1] != 's')
		return 0;

	/* No further '%' may follow. */
	return strchr(percent + 1, '%') == NULL;
}
1079
c8b40f86 1080/* This is an ugly hack to circumvent problems with passing command-line
1081 arguments that contain spaces. The fix is to use a configuration file. */
/*
 * Return a newly allocated copy of cmd in which every non-overlapping
 * occurrence of blankstr, scanning left to right, is replaced by a single
 * space.
 *
 * An empty blankstr leaves the copy unchanged.  The search resumes just after
 * each inserted space, so a blankstr that itself starts with a space cannot
 * re-match the replacement and loop forever.
 *
 * The caller owns the returned string.  If the copy cannot be allocated, an
 * error is printed with perror() and the process exits with status 1.
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	char *res;
	char *p;
	size_t bslen = strlen(blankstr);

	res = strdup(cmd);
	if (!res) {
		perror("strdup");
		exit(1);
	}

	/*
	 * strstr() matches an empty needle at every position; replacing it
	 * would never terminate and would grow the string past its allocation.
	 */
	if (bslen == 0)
		return res;

	p = res;
	while ((p = strstr(p, blankstr)) != NULL) {
		*p = ' ';
		/* Close the gap left by the rest of the marker (NUL included). */
		if (bslen != 1)
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
		/* Continue after the blank we just wrote. */
		p++;
	}
	return res;
}
1099
5c9d1c83
DL
/*
 * Timer callback armed in watchfrr_init() for STARTUP_TIMEOUT milliseconds.
 * It calls daemon_send_ready(1) and returns 0; the thread argument is unused.
 */
static int startup_timeout(struct thread *t_wakeup)
{
	(void)t_wakeup;

	daemon_send_ready(1);
	return 0;
}
1105
0a7c7856
DL
1106static void watchfrr_init(int argc, char **argv)
1107{
1108 const char *special = "zebra";
1109 int i;
1110 struct daemon *dmn, **add = &gs.daemons;
1111 char alldaemons[512] = "", *p = alldaemons;
1112
5c9d1c83
DL
1113 thread_add_timer_msec(master, startup_timeout, NULL, STARTUP_TIMEOUT,
1114 &gs.t_startup_timeout);
1115
0a7c7856
DL
1116 for (i = optind; i < argc; i++) {
1117 dmn = XCALLOC(MTYPE_WATCHFRR_DAEMON, sizeof(*dmn));
1118
1119 dmn->name = dmn->restart.name = argv[i];
1120 dmn->state = DAEMON_INIT;
1121 gs.numdaemons++;
1122 gs.numdown++;
1123 dmn->fd = -1;
1124 dmn->t_wakeup = NULL;
c0e5cb52 1125 thread_add_timer_msec(master, wakeup_init, dmn, 0,
0a7c7856
DL
1126 &dmn->t_wakeup);
1127 dmn->restart.interval = gs.min_restart_interval;
1128 *add = dmn;
1129 add = &dmn->next;
1130
1131 if (!strcmp(dmn->name, special))
1132 gs.special = dmn;
1133 }
1134
1135 if (!gs.daemons) {
1136 fprintf(stderr,
1137 "Must specify one or more daemons to monitor.\n\n");
1138 frr_help_exit(1);
1139 }
1140 if (!watch_only && !gs.special) {
1141 fprintf(stderr, "\"%s\" daemon must be in daemon lists\n\n",
1142 special);
1143 frr_help_exit(1);
1144 }
1145
1146 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1147 snprintf(p, alldaemons + sizeof(alldaemons) - p, "%s%s",
1148 (p == alldaemons) ? "" : " ", dmn->name);
1149 p += strlen(p);
1150 }
1151 zlog_notice("%s %s watching [%s]%s", progname, FRR_VERSION, alldaemons,
1152 watch_only ? ", monitor mode" : "");
1153}
1154
a6810074 1155struct zebra_privs_t watchfrr_privs = {
95c4aff2 1156#ifdef VTY_GROUP
a6810074 1157 .vty_group = VTY_GROUP,
95c4aff2
DL
1158#endif
1159};
1160
4f04a76b
DL
1161static struct quagga_signal_t watchfrr_signals[] = {
1162 {
1163 .signal = SIGINT,
1164 .handler = sigint,
1165 },
1166 {
1167 .signal = SIGTERM,
1168 .handler = sigint,
1169 },
1170 {
1171 .signal = SIGCHLD,
1172 .handler = sigchild,
1173 },
1174};
1175
1176FRR_DAEMON_INFO(watchfrr, WATCHFRR,
d62a17ae 1177 .flags = FRR_NO_PRIVSEP | FRR_NO_TCPVTY | FRR_LIMITED_CLI
0a7c7856
DL
1178 | FRR_NO_CFG_PID_DRY | FRR_NO_ZCLIENT
1179 | FRR_DETACH_LATER,
4f04a76b 1180
d62a17ae 1181 .printhelp = printhelp,
1182 .copyright = "Copyright 2004 Andrew J. Schorr",
4f04a76b 1183
d62a17ae 1184 .signals = watchfrr_signals,
1185 .n_signals = array_size(watchfrr_signals),
4f04a76b 1186
d62a17ae 1187 .privs = &watchfrr_privs, )
4f04a76b 1188
999f153e
DL
1189#define DEPRECATED_OPTIONS "aAezR:"
1190
a6810074 1191int main(int argc, char **argv)
8b886ca7 1192{
a6810074 1193 int opt;
a6810074 1194 const char *blankstr = NULL;
a6810074 1195
4f04a76b
DL
1196 frr_preinit(&watchfrr_di, argc, argv);
1197 progname = watchfrr_di.progname;
1198
999f153e 1199 frr_opt_add("b:dk:l:i:p:r:S:s:t:T:" DEPRECATED_OPTIONS, longopts, "");
a6810074
DL
1200
1201 gs.restart.name = "all";
4f04a76b 1202 while ((opt = frr_getopt(argc, argv, NULL)) != EOF) {
999f153e
DL
1203 if (opt && opt < 128 && strchr(DEPRECATED_OPTIONS, opt)) {
1204 fprintf(stderr,
1205 "The -%c option no longer exists.\n"
1206 "Please refer to the watchfrr(8) man page.\n",
1207 opt);
1208 exit(1);
1209 }
1210
a6810074
DL
1211 switch (opt) {
1212 case 0:
1213 break;
a6810074
DL
1214 case 'b':
1215 blankstr = optarg;
1216 break;
f168b713
DL
1217 case OPTION_DRY:
1218 watch_only = true;
a6810074
DL
1219 break;
1220 case 'k':
1221 if (!valid_command(optarg)) {
1222 fprintf(stderr,
1223 "Invalid kill command, must contain '%%s': %s\n",
1224 optarg);
4f04a76b 1225 frr_help_exit(1);
a6810074
DL
1226 }
1227 gs.stop_command = optarg;
1228 break;
d62a17ae 1229 case 'l': {
1230 char garbage[3];
1231 if ((sscanf(optarg, "%d%1s", &gs.loglevel, garbage)
1232 != 1)
1233 || (gs.loglevel < LOG_EMERG)) {
1234 fprintf(stderr,
1235 "Invalid loglevel argument: %s\n",
1236 optarg);
1237 frr_help_exit(1);
a6810074 1238 }
d62a17ae 1239 } break;
1240 case OPTION_MINRESTART: {
1241 char garbage[3];
1242 if ((sscanf(optarg, "%ld%1s", &gs.min_restart_interval,
1243 garbage)
1244 != 1)
1245 || (gs.min_restart_interval < 0)) {
1246 fprintf(stderr,
1247 "Invalid min_restart_interval argument: %s\n",
1248 optarg);
1249 frr_help_exit(1);
a6810074 1250 }
d62a17ae 1251 } break;
1252 case OPTION_MAXRESTART: {
1253 char garbage[3];
1254 if ((sscanf(optarg, "%ld%1s", &gs.max_restart_interval,
1255 garbage)
1256 != 1)
1257 || (gs.max_restart_interval < 0)) {
1258 fprintf(stderr,
1259 "Invalid max_restart_interval argument: %s\n",
1260 optarg);
1261 frr_help_exit(1);
a6810074 1262 }
d62a17ae 1263 } break;
1264 case 'i': {
1265 char garbage[3];
1266 int period;
1267 if ((sscanf(optarg, "%d%1s", &period, garbage) != 1)
1268 || (gs.period < 1)) {
1269 fprintf(stderr,
1270 "Invalid interval argument: %s\n",
1271 optarg);
1272 frr_help_exit(1);
a6810074 1273 }
d62a17ae 1274 gs.period = 1000 * period;
1275 } break;
a6810074 1276 case 'p':
0a7c7856 1277 watchfrr_di.pid_file = optarg;
a6810074
DL
1278 break;
1279 case 'r':
a6810074
DL
1280 if (!valid_command(optarg)) {
1281 fprintf(stderr,
1282 "Invalid restart command, must contain '%%s': %s\n",
1283 optarg);
4f04a76b 1284 frr_help_exit(1);
a6810074
DL
1285 }
1286 gs.restart_command = optarg;
a6810074
DL
1287 break;
1288 case 's':
1289 if (!valid_command(optarg)) {
1290 fprintf(stderr,
1291 "Invalid start command, must contain '%%s': %s\n",
1292 optarg);
4f04a76b 1293 frr_help_exit(1);
a6810074
DL
1294 }
1295 gs.start_command = optarg;
1296 break;
1297 case 'S':
1298 gs.vtydir = optarg;
1299 break;
d62a17ae 1300 case 't': {
1301 char garbage[3];
1302 if ((sscanf(optarg, "%ld%1s", &gs.timeout, garbage)
1303 != 1)
1304 || (gs.timeout < 1)) {
1305 fprintf(stderr,
1306 "Invalid timeout argument: %s\n",
1307 optarg);
1308 frr_help_exit(1);
a6810074 1309 }
d62a17ae 1310 } break;
1311 case 'T': {
1312 char garbage[3];
1313 if ((sscanf(optarg, "%ld%1s", &gs.restart_timeout,
1314 garbage)
1315 != 1)
1316 || (gs.restart_timeout < 1)) {
1317 fprintf(stderr,
1318 "Invalid restart timeout argument: %s\n",
1319 optarg);
1320 frr_help_exit(1);
a6810074 1321 }
d62a17ae 1322 } break;
a6810074
DL
1323 default:
1324 fputs("Invalid option.\n", stderr);
4f04a76b 1325 frr_help_exit(1);
a6810074 1326 }
8b886ca7 1327 }
a6810074 1328
71e7975a
DL
1329 if (watch_only
1330 && (gs.start_command || gs.stop_command || gs.restart_command)) {
d87ae5cc 1331 fputs("Options -r/-s/-k are not used when --dry is active.\n",
a6810074 1332 stderr);
8b886ca7 1333 }
f168b713
DL
1334 if (!watch_only
1335 && (!gs.restart_command || !gs.start_command || !gs.stop_command)) {
1336 fprintf(stderr,
1337 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1338 frr_help_exit(1);
8b886ca7 1339 }
8b886ca7 1340
a6810074
DL
1341 if (blankstr) {
1342 if (gs.restart_command)
1343 gs.restart_command =
d62a17ae 1344 translate_blanks(gs.restart_command, blankstr);
a6810074
DL
1345 if (gs.start_command)
1346 gs.start_command =
d62a17ae 1347 translate_blanks(gs.start_command, blankstr);
a6810074
DL
1348 if (gs.stop_command)
1349 gs.stop_command =
d62a17ae 1350 translate_blanks(gs.stop_command, blankstr);
065de903 1351 }
8b886ca7 1352
a6810074 1353 gs.restart.interval = gs.min_restart_interval;
8b886ca7 1354
4f04a76b 1355 master = frr_init();
b647dc2a 1356 watchfrr_error_init();
0a7c7856
DL
1357 watchfrr_init(argc, argv);
1358 watchfrr_vty_init();
1359
1360 frr_config_fork();
4f04a76b 1361
dd8376fe 1362 zlog_set_level(ZLOG_DEST_MONITOR, ZLOG_DISABLED);
0a7c7856 1363 if (watchfrr_di.daemon_mode)
dd8376fe 1364 zlog_set_level(ZLOG_DEST_SYSLOG, MIN(gs.loglevel, LOG_DEBUG));
0a7c7856 1365 else
dd8376fe 1366 zlog_set_level(ZLOG_DEST_STDOUT, MIN(gs.loglevel, LOG_DEBUG));
8b886ca7 1367
0a7c7856 1368 frr_run(master);
8b886ca7 1369
a6810074
DL
1370 systemd_send_stopping();
1371 /* Not reached. */
1372 return 0;
8b886ca7 1373}