]> git.proxmox.com Git - mirror_frr.git/blame - watchfrr/watchfrr.c
lib, watchfrr: Add some additional status messages to systemd
[mirror_frr.git] / watchfrr / watchfrr.c
CommitLineData
8b886ca7 1/*
896014f4
DL
2 * Monitor status of frr daemons and restart if necessary.
3 *
4 * Copyright (C) 2004 Andrew J. Schorr
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
8b886ca7 19 */
20
a365534f 21#include <zebra.h>
8b886ca7 22#include <thread.h>
23#include <log.h>
52e66296 24#include <network.h>
8b886ca7 25#include <sigevent.h>
a365534f 26#include <lib/version.h>
95c4aff2 27#include "command.h"
87f44e2f 28#include "memory_vty.h"
4f04a76b 29#include "libfrr.h"
b647dc2a 30#include "lib_errors.h"
95c4aff2 31
6f594023 32#include <getopt.h>
a365534f 33#include <sys/un.h>
34#include <sys/wait.h>
837d16cc 35#include <memory.h>
651415bd 36#include <systemd.h>
8b886ca7 37
9473e340 38#include "watchfrr.h"
b647dc2a 39#include "watchfrr_errors.h"
95c4aff2 40
/* Smaller of two values; both arguments may be evaluated twice. */
#ifndef MIN
#define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
#endif

/* Macros to help randomize timers. */
/* JITTER(X): pseudo-random offset in the range [-(X/2), X - (X/2)]. */
#define JITTER(X) ((random() % ((X)+1))-((X)/2))
/* FUZZY(X): X perturbed by JITTER(X/20), i.e. by at most a few percent. */
#define FUZZY(X) ((X)+JITTER((X)/20))

/*
 * Built-in defaults.  According to the help text printed by printhelp(),
 * the period, timeout and restart values below are all in seconds.
 */
#define DEFAULT_PERIOD 5
#define DEFAULT_TIMEOUT 90
#define DEFAULT_RESTART_TIMEOUT 20
#define DEFAULT_LOGLEVEL LOG_INFO
#define DEFAULT_MIN_RESTART 60
#define DEFAULT_MAX_RESTART 600

/*
 * Default shell command templates; run_job() substitutes the daemon name
 * for the "%s".
 */
#define DEFAULT_RESTART_CMD WATCHFRR_SH_PATH " restart %s"
#define DEFAULT_START_CMD WATCHFRR_SH_PATH " start %s"
#define DEFAULT_STOP_CMD WATCHFRR_SH_PATH " stop %s"

/* Token that handle_read() expects at the start of an echo response. */
#define PING_TOKEN "PING"
61
0a7c7856
DL
/* Memory group and memory type declared for watchfrr. */
DEFINE_MGROUP(WATCHFRR, "watchfrr")
DEFINE_MTYPE_STATIC(WATCHFRR, WATCHFRR_DAEMON, "watchfrr daemon entry")

/* Needs to be global, referenced somewhere inside libfrr. */
struct thread_master *master;

/*
 * When true, try_restart() returns immediately and never launches a job.
 * NOTE(review): presumably set by the --dry option; the option parser is
 * not visible here.
 */
static bool watch_only = false;
8b886ca7 69
a6810074
DL
/*
 * Phases of the global state machine driven by phase_check().
 * PHASE_INIT covers startup (until every daemon has left DAEMON_INIT);
 * the remaining non-idle phases are the successive steps of a phased
 * global restart.
 */
typedef enum {
	PHASE_NONE = 0,
	PHASE_INIT,
	PHASE_STOPS_PENDING,
	PHASE_WAITING_DOWN,
	PHASE_ZEBRA_RESTART_PENDING,
	PHASE_WAITING_ZEBRA_UP
} restart_phase_t;

/*
 * Human-readable phase names, indexed by restart_phase_t.  Designated
 * initializers keep every string bound to its enum value, so the table
 * cannot silently drift out of step with the enum.
 */
static const char *phase_str[] = {
	[PHASE_NONE] = "Idle",
	[PHASE_INIT] = "Startup",
	[PHASE_STOPS_PENDING] = "Stop jobs running",
	[PHASE_WAITING_DOWN] = "Waiting for other daemons to come down",
	[PHASE_ZEBRA_RESTART_PENDING] = "Zebra restart job running",
	[PHASE_WAITING_ZEBRA_UP] = "Waiting for zebra to come up",
};
88
/*
 * Seconds a phased-restart phase may last before phase_hanging() aborts
 * it: three times the configured restart timeout.
 */
#define PHASE_TIMEOUT (3 * gs.restart_timeout)
/*
 * Startup timeout.  Parenthesized so the macro expands safely inside a
 * larger expression.
 * NOTE(review): presumably milliseconds (55 s); the user is not visible here.
 */
#define STARTUP_TIMEOUT (55 * 1000)
8b886ca7 91
a6810074
DL
/*
 * Bookkeeping for one background (re)start/stop job.  One instance lives
 * in each struct daemon and one in the global state for global restarts.
 */
struct restart_info {
	/* Name substituted into the command template and used in logs. */
	const char *name;
	/* Kind of job currently/last run (the cmdtype given to run_job()). */
	const char *what;
	/* Pid of the running child, or 0 when no job is outstanding. */
	pid_t pid;
	/* Set when a job is launched and again when sigchild() reaps it. */
	struct timeval time;
	/* Minimum number of seconds between job launches (see run_job()). */
	long interval;
	/* Timer that fires restart_kill() if the child runs too long. */
	struct thread *t_kill;
	/* Signals sent so far: the first is SIGTERM, later ones SIGKILL. */
	int kills;
};
101
/*
 * Process-wide watchfrr state, initialised with the built-in defaults.
 */
static struct global_state {
	/* Current step of the startup / phased-restart state machine. */
	restart_phase_t phase;
	/* Timer armed by set_phase(); fires phase_hanging(). */
	struct thread *t_phase_hanging;
	struct thread *t_startup_timeout;
	/* Directory holding the daemons' "<name>.vty" sockets. */
	const char *vtydir;
	/* Polling period in milliseconds (used with thread_add_timer_msec). */
	long period;
	/* Connect / echo-response timeout in seconds. */
	long timeout;
	/* Seconds a job may run before restart_kill() signals it. */
	long restart_timeout;
	/* Bounds, in seconds, for the per-job restart back-off interval. */
	long min_restart_interval;
	long max_restart_interval;
	/* Singly linked list of monitored daemons. */
	struct daemon *daemons;
	/* Command templates; "%s" is replaced by the daemon name. */
	const char *restart_command;
	const char *start_command;
	const char *stop_command;
	/* Job state for the global restart launched by try_restart(). */
	struct restart_info restart;
	int loglevel;
	struct daemon *special; /* points to zebra when doing phased restart */
	int numdaemons;
	/* Number of outstanding child jobs (run_job() ++, sigchild() --). */
	int numpids;
	int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
} gs = {
	.phase = PHASE_INIT,
	.vtydir = frr_vtydir,
	.period = 1000 * DEFAULT_PERIOD,
	.timeout = DEFAULT_TIMEOUT,
	.restart_timeout = DEFAULT_RESTART_TIMEOUT,
	.loglevel = DEFAULT_LOGLEVEL,
	.min_restart_interval = DEFAULT_MIN_RESTART,
	.max_restart_interval = DEFAULT_MAX_RESTART,
	.restart_command = DEFAULT_RESTART_CMD,
	.start_command = DEFAULT_START_CMD,
	.stop_command = DEFAULT_STOP_CMD,
};
a6810074
DL
135
/* Connection state of a monitored daemon. */
typedef enum {
	DAEMON_INIT,
	DAEMON_DOWN,
	DAEMON_CONNECTING,
	DAEMON_UP,
	DAEMON_UNRESPONSIVE
} daemon_state_t;

/*
 * Evaluates to 1 when the daemon is connected (responsive or not), else 0.
 * The argument is evaluated twice.
 */
#define IS_UP(DMN)                                                             \
	((DMN)->state == DAEMON_UP || (DMN)->state == DAEMON_UNRESPONSIVE)

/* Printable names, indexed by daemon_state_t. */
static const char *state_str[] = {
	[DAEMON_INIT] = "Init",
	[DAEMON_DOWN] = "Down",
	[DAEMON_CONNECTING] = "Connecting",
	[DAEMON_UP] = "Up",
	[DAEMON_UNRESPONSIVE] = "Unresponsive",
};
150
/* One monitored daemon; entries are chained through 'next' from gs.daemons. */
struct daemon {
	/* Daemon name; also the basename of its "<name>.vty" socket. */
	const char *name;
	daemon_state_t state;
	/* Socket connected to the daemon's vty, or -1 when closed. */
	int fd;
	/* handle_read() treats tv_sec == 0 as "no echo outstanding". */
	struct timeval echo_sent;
	/* Incremented by try_connect(), reset to 0 by daemon_up(). */
	unsigned int connect_tries;
	/* Pending wakeup timer, read handler and connect-completion handler. */
	struct thread *t_wakeup;
	struct thread *t_read;
	struct thread *t_write;
	struct daemon *next;
	/* State of this daemon's own (re)start job. */
	struct restart_info restart;

	/*
	 * For a given daemon, if we've turned on ignore timeouts
	 * ignore the timeout value and assume everything is ok
	 * This is for daemon debugging w/ gdb after we have started
	 * FRR and realize we have something that needs to be looked
	 * at
	 */
	bool ignore_timeout;
};
172
9272302b
DL
/*
 * getopt_long() return values for options that have no single-letter
 * form; chosen well outside the range of any character code.
 */
#define OPTION_MINRESTART 2000
#define OPTION_MAXRESTART 2001
#define OPTION_DRY 2002

/* Long command-line options; the list ends with an all-zero entry. */
static const struct option longopts[] = {
	{"daemon", no_argument, NULL, 'd'},
	{"statedir", required_argument, NULL, 'S'},
	{"loglevel", required_argument, NULL, 'l'},
	{"interval", required_argument, NULL, 'i'},
	{"timeout", required_argument, NULL, 't'},
	{"restart-timeout", required_argument, NULL, 'T'},
	{"restart", required_argument, NULL, 'r'},
	{"start-command", required_argument, NULL, 's'},
	{"kill-command", required_argument, NULL, 'k'},
	{"dry", no_argument, NULL, OPTION_DRY},
	{"min-restart-interval", required_argument, NULL, OPTION_MINRESTART},
	{"max-restart-interval", required_argument, NULL, OPTION_MAXRESTART},
	{"pid-file", required_argument, NULL, 'p'},
	{"blank-string", required_argument, NULL, 'b'},
	{"help", no_argument, NULL, 'h'},
	{"version", no_argument, NULL, 'v'},
	{NULL, 0, NULL, 0}};
8b886ca7 195
/* Forward declarations for functions that are used before they are defined. */
static int try_connect(struct daemon *dmn);
static int wakeup_send_echo(struct thread *t_wakeup);
static void try_restart(struct daemon *dmn);
static void phase_check(void);
static void restart_done(struct daemon *dmn);

/* Program name shown in the usage line printed by printhelp(). */
static const char *progname;
cc53b605
DS
203
204void watchfrr_set_ignore_daemon(struct vty *vty, const char *dname, bool ignore)
205{
206 struct daemon *dmn;
207
208 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
209 if (strncmp(dmn->name, dname, strlen(dmn->name)) == 0)
210 break;
211 }
212
213 if (dmn) {
214 dmn->ignore_timeout = ignore;
215 vty_out(vty, "%s switching to %s\n", dmn->name,
216 ignore ? "ignore" : "watch");
217 } else
218 vty_out(vty, "%s is not configured for running at the moment",
219 dname);
220}
221
4f04a76b 222static void printhelp(FILE *target)
8b886ca7 223{
d62a17ae 224 fprintf(target,
225 "Usage : %s [OPTION...] <daemon name> ...\n\n\
9473e340 226Watchdog program to monitor status of frr daemons and try to restart\n\
8b886ca7 227them if they are down or unresponsive. It determines whether a daemon is\n\
228up based on whether it can connect to the daemon's vty unix stream socket.\n\
229It then repeatedly sends echo commands over that socket to determine whether\n\
230the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
231on the socket connection and know immediately that the daemon is down.\n\n\
232The daemons to be monitored should be listed on the command line.\n\n\
8b886ca7 233In order to avoid attempting to restart the daemons in a fast loop,\n\
234the -m and -M options allow you to control the minimum delay between\n\
235restart commands. The minimum restart delay is recalculated each time\n\
236a restart is attempted: if the time since the last restart attempt exceeds\n\
237twice the -M value, then the restart delay is set to the -m value.\n\
d62a17ae 238Otherwise, the interval is doubled (but capped at the -M value).\n\n",
f168b713 239 progname);
e757c940 240
d62a17ae 241 fprintf(target,
242 "Options:\n\
8b886ca7 243-d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
244 to syslog instead of stdout.\n\
245-S, --statedir Set the vty socket directory (default is %s)\n\
8b886ca7 246-l, --loglevel Set the logging level (default is %d).\n\
247 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
248 but it can be set higher than %d if extra-verbose debugging\n\
249 messages are desired.\n\
9272302b 250 --min-restart-interval\n\
8b886ca7 251 Set the minimum seconds to wait between invocations of daemon\n\
252 restart commands (default is %d).\n\
9272302b 253 --max-restart-interval\n\
8b886ca7 254 Set the maximum seconds to wait between invocations of daemon\n\
255 restart commands (default is %d).\n\
256-i, --interval Set the status polling interval in seconds (default is %d)\n\
257-t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
258-T, --restart-timeout\n\
259 Set the restart (kill) timeout in seconds (default is %d).\n\
260 If any background jobs are still running after this much\n\
261 time has elapsed, they will be killed.\n\
262-r, --restart Supply a Bourne shell command to use to restart a single\n\
263 daemon. The command string should include '%%s' where the\n\
264 name of the daemon should be substituted.\n\
3ec95567 265 (default: '%s')\n\
8b886ca7 266-s, --start-command\n\
267 Supply a Bourne shell to command to use to start a single\n\
268 daemon. The command string should include '%%s' where the\n\
269 name of the daemon should be substituted.\n\
3ec95567 270 (default: '%s')\n\
8b886ca7 271-k, --kill-command\n\
272 Supply a Bourne shell to command to use to stop a single\n\
273 daemon. The command string should include '%%s' where the\n\
274 name of the daemon should be substituted.\n\
3ec95567 275 (default: '%s')\n\
f168b713 276 --dry Do not start or restart anything, just log.\n\
8b886ca7 277-p, --pid-file Set process identifier file name\n\
0a7c7856 278 (default is %s/watchfrr.pid).\n\
c8b40f86 279-b, --blank-string\n\
280 When the supplied argument string is found in any of the\n\
f168b713 281 various shell command arguments (-r, -s, or -k), replace\n\
c8b40f86 282 it with a space. This is an ugly hack to circumvent problems\n\
283 passing command-line arguments with embedded spaces.\n\
8b886ca7 284-v, --version Print program version\n\
d62a17ae 285-h, --help Display this help and exit\n",
64a249ad 286 frr_vtydir, DEFAULT_LOGLEVEL, LOG_EMERG, LOG_DEBUG, LOG_DEBUG,
d62a17ae 287 DEFAULT_MIN_RESTART, DEFAULT_MAX_RESTART, DEFAULT_PERIOD,
3ec95567
DL
288 DEFAULT_TIMEOUT, DEFAULT_RESTART_TIMEOUT,
289 DEFAULT_RESTART_CMD, DEFAULT_START_CMD, DEFAULT_STOP_CMD,
290 frr_vtydir);
8b886ca7 291}
292
a6810074 293static pid_t run_background(char *shell_cmd)
8b886ca7 294{
a6810074
DL
295 pid_t child;
296
297 switch (child = fork()) {
298 case -1:
450971aa 299 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
300 "fork failed, cannot run command [%s]: %s",
301 shell_cmd, safe_strerror(errno));
a6810074
DL
302 return -1;
303 case 0:
304 /* Child process. */
d62a17ae 305 /* Use separate process group so child processes can be killed
306 * easily. */
a6810074
DL
307 if (setpgid(0, 0) < 0)
308 zlog_warn("warning: setpgid(0,0) failed: %s",
309 safe_strerror(errno));
310 {
311 char shell[] = "sh";
312 char dashc[] = "-c";
d62a17ae 313 char *const argv[4] = {shell, dashc, shell_cmd, NULL};
a6810074 314 execv("/bin/sh", argv);
450971aa 315 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
316 "execv(/bin/sh -c '%s') failed: %s",
317 shell_cmd, safe_strerror(errno));
a6810074
DL
318 _exit(127);
319 }
320 default:
321 /* Parent process: we will reap the child later. */
450971aa 322 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
323 "Forked background command [pid %d]: %s",
324 (int)child, shell_cmd);
a6810074
DL
325 return child;
326 }
8b886ca7 327}
328
a6810074
DL
329static struct timeval *time_elapsed(struct timeval *result,
330 const struct timeval *start_time)
8b886ca7 331{
a6810074
DL
332 gettimeofday(result, NULL);
333 result->tv_sec -= start_time->tv_sec;
334 result->tv_usec -= start_time->tv_usec;
335 while (result->tv_usec < 0) {
336 result->tv_usec += 1000000L;
337 result->tv_sec--;
338 }
339 return result;
8b886ca7 340}
341
a6810074 342static int restart_kill(struct thread *t_kill)
8b886ca7 343{
a6810074
DL
344 struct restart_info *restart = THREAD_ARG(t_kill);
345 struct timeval delay;
346
347 time_elapsed(&delay, &restart->time);
d62a17ae 348 zlog_warn(
349 "Warning: %s %s child process %d still running after "
350 "%ld seconds, sending signal %d",
351 restart->what, restart->name, (int)restart->pid,
352 (long)delay.tv_sec, (restart->kills ? SIGKILL : SIGTERM));
a6810074
DL
353 kill(-restart->pid, (restart->kills ? SIGKILL : SIGTERM));
354 restart->kills++;
66e78ae6
QY
355 restart->t_kill = NULL;
356 thread_add_timer(master, restart_kill, restart, gs.restart_timeout,
357 &restart->t_kill);
a6810074 358 return 0;
8b886ca7 359}
360
a6810074 361static struct restart_info *find_child(pid_t child)
8b886ca7 362{
f168b713 363 struct daemon *dmn;
7c265f7d
CF
364 if (gs.restart.pid == child)
365 return &gs.restart;
366
f168b713
DL
367 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
368 if (dmn->restart.pid == child)
369 return &dmn->restart;
a6810074
DL
370 }
371 return NULL;
8b886ca7 372}
373
/*
 * SIGCHLD handling: reap one terminated child, update the bookkeeping of
 * the job it belonged to, log how it ended, and advance the phase state
 * machine.  Only a single child is reaped per invocation.
 */
static void sigchild(void)
{
	pid_t child;
	int status;
	const char *name;
	const char *what;
	struct restart_info *restart;
	struct daemon *dmn;

	/* Non-blocking reap of any one child. */
	switch (child = waitpid(-1, &status, WNOHANG)) {
	case -1:
		flog_err_sys(EC_LIB_SYSTEM_CALL, "waitpid failed: %s",
			     safe_strerror(errno));
		return;
	case 0:
		zlog_warn("SIGCHLD received, but waitpid did not reap a child");
		return;
	}

	/* Children of the integrated config write are handled elsewhere. */
	if (child == integrated_write_pid) {
		integrated_write_sigchld(status);
		return;
	}

	if ((restart = find_child(child)) != NULL) {
		name = restart->name;
		what = restart->what;
		/* The job is finished: clear its pid and cancel the kill timer. */
		restart->pid = 0;
		gs.numpids--;
		thread_cancel(restart->t_kill);
		restart->t_kill = NULL;
		/* Update restart time to reflect the time the command
		 * completed. */
		gettimeofday(&restart->time, NULL);
	} else {
		flog_err_sys(
			EC_LIB_SYSTEM_CALL,
			"waitpid returned status for an unknown child process %d",
			(int)child);
		name = "(unknown)";
		what = "background";
	}
	if (WIFSTOPPED(status))
		zlog_warn("warning: %s %s process %d is stopped", what, name,
			  (int)child);
	else if (WIFSIGNALED(status))
		zlog_warn("%s %s process %d terminated due to signal %d", what,
			  name, (int)child, WTERMSIG(status));
	else if (WIFEXITED(status)) {
		if (WEXITSTATUS(status) != 0)
			zlog_warn(
				"%s %s process %d exited with non-zero status %d",
				what, name, (int)child, WEXITSTATUS(status));
		else {
			zlog_debug("%s %s process %d exited normally", what,
				   name, (int)child);

			/*
			 * A per-daemon job notifies only its own daemon; the
			 * global restart job notifies every daemon.
			 */
			if (restart && restart != &gs.restart) {
				dmn = container_of(restart, struct daemon,
						   restart);
				restart_done(dmn);
			} else if (restart)
				for (dmn = gs.daemons; dmn; dmn = dmn->next)
					restart_done(dmn);
		}
	} else
		flog_err_sys(
			EC_LIB_SYSTEM_CALL,
			"cannot interpret %s %s process %d wait status 0x%x",
			what, name, (int)child, status);
	phase_check();
}
446
d62a17ae 447static int run_job(struct restart_info *restart, const char *cmdtype,
448 const char *command, int force, int update_interval)
8b886ca7 449{
a6810074
DL
450 struct timeval delay;
451
452 if (gs.loglevel > LOG_DEBUG + 1)
453 zlog_debug("attempting to %s %s", cmdtype, restart->name);
454
455 if (restart->pid) {
456 if (gs.loglevel > LOG_DEBUG + 1)
d62a17ae 457 zlog_debug(
458 "cannot %s %s, previous pid %d still running",
459 cmdtype, restart->name, (int)restart->pid);
a6810074
DL
460 return -1;
461 }
462
b3ee8bcc
DS
463#if defined HAVE_SYSTEMD
464 char buffer[512];
465
466 snprintf(buffer, sizeof(buffer), "restarting %s", restart->name);
467 systemd_send_status(buffer);
468#endif
469
d62a17ae 470 /* Note: time_elapsed test must come before the force test, since we
471 need
a6810074
DL
472 to make sure that delay is initialized for use below in updating the
473 restart interval. */
474 if ((time_elapsed(&delay, &restart->time)->tv_sec < restart->interval)
475 && !force) {
b3ee8bcc 476
a6810074 477 if (gs.loglevel > LOG_DEBUG + 1)
d62a17ae 478 zlog_debug(
479 "postponing %s %s: "
480 "elapsed time %ld < retry interval %ld",
481 cmdtype, restart->name, (long)delay.tv_sec,
482 restart->interval);
a6810074
DL
483 return -1;
484 }
485
486 gettimeofday(&restart->time, NULL);
487 restart->kills = 0;
488 {
489 char cmd[strlen(command) + strlen(restart->name) + 1];
490 snprintf(cmd, sizeof(cmd), command, restart->name);
491 if ((restart->pid = run_background(cmd)) > 0) {
66e78ae6 492 restart->t_kill = NULL;
d62a17ae 493 thread_add_timer(master, restart_kill, restart,
494 gs.restart_timeout, &restart->t_kill);
a6810074
DL
495 restart->what = cmdtype;
496 gs.numpids++;
497 } else
498 restart->pid = 0;
499 }
500
b3ee8bcc
DS
501#if defined HAVE_SYSTEMD
502 systemd_send_status("FRR Operational");
503#endif
a6810074
DL
504 /* Calculate the new restart interval. */
505 if (update_interval) {
506 if (delay.tv_sec > 2 * gs.max_restart_interval)
507 restart->interval = gs.min_restart_interval;
508 else if ((restart->interval *= 2) > gs.max_restart_interval)
509 restart->interval = gs.max_restart_interval;
510 if (gs.loglevel > LOG_DEBUG + 1)
511 zlog_debug("restart %s interval is now %ld",
512 restart->name, restart->interval);
513 }
514 return restart->pid;
8b886ca7 515}
516
/*
 * Helpers that (re)arm a daemon's read handler or wakeup timer.  Each
 * clears the stored thread pointer before scheduling.  They expand to a
 * single statement WITHOUT a trailing semicolon, so the caller supplies
 * the ';' and they are safe in unbraced if/else bodies.
 */

/* Schedule handle_read() for when the daemon's socket becomes readable. */
#define SET_READ_HANDLER(DMN)                                                  \
	do {                                                                   \
		(DMN)->t_read = NULL;                                          \
		thread_add_read(master, handle_read, (DMN), (DMN)->fd,         \
				&(DMN)->t_read);                               \
	} while (0)

/* Schedule wakeup_down() after roughly one polling period. */
#define SET_WAKEUP_DOWN(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_down, (DMN),              \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Schedule wakeup_unresponsive() after roughly one polling period. */
#define SET_WAKEUP_UNRESPONSIVE(DMN)                                           \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_unresponsive, (DMN),      \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Schedule wakeup_send_echo() after roughly one polling period. */
#define SET_WAKEUP_ECHO(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_send_echo, (DMN),         \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)
8b886ca7 544
a6810074 545static int wakeup_down(struct thread *t_wakeup)
8b886ca7 546{
a6810074
DL
547 struct daemon *dmn = THREAD_ARG(t_wakeup);
548
549 dmn->t_wakeup = NULL;
550 if (try_connect(dmn) < 0)
551 SET_WAKEUP_DOWN(dmn);
552 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
553 try_restart(dmn);
554 return 0;
8b886ca7 555}
556
a6810074 557static int wakeup_init(struct thread *t_wakeup)
8b886ca7 558{
a6810074
DL
559 struct daemon *dmn = THREAD_ARG(t_wakeup);
560
561 dmn->t_wakeup = NULL;
562 if (try_connect(dmn) < 0) {
f74ae2bb 563 flog_err(EC_WATCHFRR_CONNECTION,
1c50c1c0
QY
564 "%s state -> down : initial connection attempt failed",
565 dmn->name);
a6810074
DL
566 dmn->state = DAEMON_DOWN;
567 }
c0e5cb52 568 phase_check();
a6810074 569 return 0;
8b886ca7 570}
571
75f8b0e4
DL
572static void restart_done(struct daemon *dmn)
573{
574 if (dmn->state != DAEMON_DOWN) {
3f391bec
DS
575 zlog_warn(
576 "Daemon: %s: is in %s state but expected it to be in DAEMON_DOWN state",
577 dmn->name, state_str[dmn->state]);
75f8b0e4
DL
578 return;
579 }
580 if (dmn->t_wakeup)
581 THREAD_OFF(dmn->t_wakeup);
582 if (try_connect(dmn) < 0)
583 SET_WAKEUP_DOWN(dmn);
584}
585
a6810074 586static void daemon_down(struct daemon *dmn, const char *why)
8b886ca7 587{
a6810074 588 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
1c50c1c0
QY
589 flog_err(EC_WATCHFRR_CONNECTION, "%s state -> down : %s",
590 dmn->name, why);
a6810074
DL
591 else if (gs.loglevel > LOG_DEBUG)
592 zlog_debug("%s still down : %s", dmn->name, why);
593 if (IS_UP(dmn))
594 gs.numdown++;
595 dmn->state = DAEMON_DOWN;
596 if (dmn->fd >= 0) {
597 close(dmn->fd);
598 dmn->fd = -1;
599 }
600 THREAD_OFF(dmn->t_read);
601 THREAD_OFF(dmn->t_write);
602 THREAD_OFF(dmn->t_wakeup);
603 if (try_connect(dmn) < 0)
604 SET_WAKEUP_DOWN(dmn);
605 phase_check();
8b886ca7 606}
607
/*
 * Read-ready callback for a daemon's socket.
 *
 * Any read error (other than a retryable one), EOF, data arriving while
 * no echo is outstanding, or a malformed echo response takes the daemon
 * down via daemon_down().  A well-formed response clears the outstanding
 * echo, may bring an unresponsive daemon back to DAEMON_UP, and re-arms
 * both the read handler and the echo timer.  Always returns 0.
 */
static int handle_read(struct thread *t_read)
{
	struct daemon *dmn = THREAD_ARG(t_read);
	/*
	 * Expected response: PING_TOKEN, a newline, then four zero bytes
	 * (the array is sizeof(PING_TOKEN) + 4 bytes and the remainder of
	 * the initializer is zero-filled).
	 */
	static const char resp[sizeof(PING_TOKEN) + 4] = PING_TOKEN "\n";
	char buf[sizeof(resp) + 100];
	ssize_t rc;
	struct timeval delay;

	dmn->t_read = NULL;
	if ((rc = read(dmn->fd, buf, sizeof(buf))) < 0) {
		char why[100];

		if (ERRNO_IO_RETRY(errno)) {
			/* Pretend it never happened. */
			SET_READ_HANDLER(dmn);
			return 0;
		}
		snprintf(why, sizeof(why), "unexpected read error: %s",
			 safe_strerror(errno));
		daemon_down(dmn, why);
		return 0;
	}
	if (rc == 0) {
		daemon_down(dmn, "read returned EOF");
		return 0;
	}
	/* Data arrived although no echo request is outstanding. */
	if (!dmn->echo_sent.tv_sec) {
		char why[sizeof(buf) + 100];
		snprintf(why, sizeof(why),
			 "unexpected read returns %d bytes: %.*s", (int)rc,
			 (int)rc, buf);
		daemon_down(dmn, why);
		return 0;
	}

	/* We are expecting an echo response: is there any chance that the
	   response would not be returned entirely in the first read?  That
	   seems inconceivable... */
	if ((rc != sizeof(resp)) || memcmp(buf, resp, sizeof(resp))) {
		char why[100 + sizeof(buf)];
		snprintf(why, sizeof(why),
			 "read returned bad echo response of %d bytes "
			 "(expecting %u): %.*s",
			 (int)rc, (unsigned int)sizeof(resp), (int)rc, buf);
		daemon_down(dmn, why);
		return 0;
	}

	/* Valid response: measure the round trip and clear the pending echo. */
	time_elapsed(&delay, &dmn->echo_sent);
	dmn->echo_sent.tv_sec = 0;
	if (dmn->state == DAEMON_UNRESPONSIVE) {
		/* Only a reply faster than the timeout restores DAEMON_UP. */
		if (delay.tv_sec < gs.timeout) {
			dmn->state = DAEMON_UP;
			zlog_warn(
				"%s state -> up : echo response received after %ld.%06ld "
				"seconds",
				dmn->name, (long)delay.tv_sec,
				(long)delay.tv_usec);
		} else
			zlog_warn(
				"%s: slow echo response finally received after %ld.%06ld "
				"seconds",
				dmn->name, (long)delay.tv_sec,
				(long)delay.tv_usec);
	} else if (gs.loglevel > LOG_DEBUG + 1)
		zlog_debug("%s: echo response received after %ld.%06ld seconds",
			   dmn->name, (long)delay.tv_sec, (long)delay.tv_usec);

	/* Keep listening and replace any pending wakeup with the echo timer. */
	SET_READ_HANDLER(dmn);
	if (dmn->t_wakeup)
		thread_cancel(dmn->t_wakeup);
	SET_WAKEUP_ECHO(dmn);

	return 0;
}
683
207e0d7a
DS
684/*
685 * Wait till we notice that all daemons are ready before
686 * we send we are ready to systemd
687 */
5c9d1c83 688static void daemon_send_ready(int exitcode)
207e0d7a 689{
5c9d1c83 690 FILE *fp;
a6810074 691 static int sent = 0;
43e587c1 692 char started[1024];
207e0d7a 693
5c9d1c83
DL
694 if (sent)
695 return;
696
697 if (exitcode == 0)
0a7c7856 698 zlog_notice("all daemons up, doing startup-complete notify");
5c9d1c83
DL
699 else if (gs.numdown < gs.numdaemons)
700 flog_err(EC_WATCHFRR_CONNECTION,
701 "startup did not complete within timeout"
702 " (%d/%d daemons running)",
703 gs.numdaemons - gs.numdown, gs.numdaemons);
704 else {
705 flog_err(EC_WATCHFRR_CONNECTION,
706 "all configured daemons failed to start"
707 " -- exiting watchfrr");
708 exit(exitcode);
709
710 }
0a7c7856 711
5c9d1c83
DL
712 frr_detach();
713
3c649c71
DS
714 snprintf(started, sizeof(started), "%s%s", frr_vtydir,
715 "watchfrr.started");
716 fp = fopen(started, "w");
5c9d1c83
DL
717 if (fp)
718 fclose(fp);
60bd2534 719#if defined HAVE_SYSTEMD
5c9d1c83 720 systemd_send_started(master, 0);
b3ee8bcc 721 systemd_send_status("FRR Operational");
60bd2534 722#endif
5c9d1c83 723 sent = 1;
207e0d7a
DS
724}
725
a6810074 726static void daemon_up(struct daemon *dmn, const char *why)
8b886ca7 727{
a6810074
DL
728 dmn->state = DAEMON_UP;
729 gs.numdown--;
730 dmn->connect_tries = 0;
731 zlog_notice("%s state -> up : %s", dmn->name, why);
5c9d1c83
DL
732 if (gs.numdown == 0)
733 daemon_send_ready(0);
a8cbb8b3 734 SET_WAKEUP_ECHO(dmn);
a6810074 735 phase_check();
8b886ca7 736}
737
a6810074 738static int check_connect(struct thread *t_write)
8b886ca7 739{
a6810074
DL
740 struct daemon *dmn = THREAD_ARG(t_write);
741 int sockerr;
742 socklen_t reslen = sizeof(sockerr);
743
744 dmn->t_write = NULL;
745 if (getsockopt(dmn->fd, SOL_SOCKET, SO_ERROR, (char *)&sockerr, &reslen)
746 < 0) {
747 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn->name,
748 safe_strerror(errno));
749 daemon_down(dmn,
750 "getsockopt failed checking connection success");
751 return 0;
752 }
753 if ((reslen == sizeof(sockerr)) && sockerr) {
754 char why[100];
d62a17ae 755 snprintf(
756 why, sizeof(why),
757 "getsockopt reports that connection attempt failed: %s",
758 safe_strerror(sockerr));
a6810074
DL
759 daemon_down(dmn, why);
760 return 0;
761 }
762
763 daemon_up(dmn, "delayed connect succeeded");
764 return 0;
8b886ca7 765}
766
a6810074 767static int wakeup_connect_hanging(struct thread *t_wakeup)
8b886ca7 768{
a6810074
DL
769 struct daemon *dmn = THREAD_ARG(t_wakeup);
770 char why[100];
771
772 dmn->t_wakeup = NULL;
773 snprintf(why, sizeof(why),
774 "connection attempt timed out after %ld seconds", gs.timeout);
775 daemon_down(dmn, why);
776 return 0;
8b886ca7 777}
778
/* Making connection to protocol daemon. */
/*
 * Attempts a non-blocking connect to "<vtydir>/<name>.vty" and counts
 * the attempt in dmn->connect_tries.
 *
 * Returns  1 when the connection is established at once (daemon_up() is
 *            called),
 *          0 when the connection is in progress (completion is checked
 *            by check_connect(), with wakeup_connect_hanging() as the
 *            timeout),
 *         -1 on failure; in that case the daemon's state is left
 *            unchanged.
 */
static int try_connect(struct daemon *dmn)
{
	int sock;
	struct sockaddr_un addr;
	socklen_t len;

	if (gs.loglevel > LOG_DEBUG + 1)
		zlog_debug("%s: attempting to connect", dmn->name);
	dmn->connect_tries++;

	memset(&addr, 0, sizeof(struct sockaddr_un));
	addr.sun_family = AF_UNIX;
	snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty", gs.vtydir,
		 dmn->name);
#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
	len = addr.sun_len = SUN_LEN(&addr);
#else
	len = sizeof(addr.sun_family) + strlen(addr.sun_path);
#endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */

	/* Quick check to see if we might succeed before we go to the trouble
	   of creating a socket. */
	if (access(addr.sun_path, W_OK) < 0) {
		/* A missing socket file is not logged as an error. */
		if (errno != ENOENT)
			flog_err_sys(EC_LIB_SYSTEM_CALL,
				     "%s: access to socket %s denied: %s",
				     dmn->name, addr.sun_path,
				     safe_strerror(errno));
		return -1;
	}

	if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
		flog_err_sys(EC_LIB_SOCKET, "%s(%s): cannot make socket: %s",
			     __func__, addr.sun_path, safe_strerror(errno));
		return -1;
	}

	if (set_nonblocking(sock) < 0 || set_cloexec(sock) < 0) {
		flog_err_sys(EC_LIB_SYSTEM_CALL,
			     "%s(%s): set_nonblocking/cloexec(%d) failed",
			     __func__, addr.sun_path, sock);
		close(sock);
		return -1;
	}

	if (connect(sock, (struct sockaddr *)&addr, len) < 0) {
		if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK)) {
			if (gs.loglevel > LOG_DEBUG)
				zlog_debug("%s(%s): connect failed: %s",
					   __func__, addr.sun_path,
					   safe_strerror(errno));
			close(sock);
			return -1;
		}
		/*
		 * Connection still in progress: wait for the socket to become
		 * writable, and arm a timeout in case it never does.
		 */
		if (gs.loglevel > LOG_DEBUG)
			zlog_debug("%s: connection in progress", dmn->name);
		dmn->state = DAEMON_CONNECTING;
		dmn->fd = sock;
		dmn->t_write = NULL;
		thread_add_write(master, check_connect, dmn, dmn->fd,
				 &dmn->t_write);
		dmn->t_wakeup = NULL;
		thread_add_timer(master, wakeup_connect_hanging, dmn,
				 gs.timeout, &dmn->t_wakeup);
		SET_READ_HANDLER(dmn);
		return 0;
	}

	/* Connected immediately. */
	dmn->fd = sock;
	SET_READ_HANDLER(dmn);
	daemon_up(dmn, "connect succeeded");
	return 1;
}
853
a6810074 854static int phase_hanging(struct thread *t_hanging)
8b886ca7 855{
a6810074 856 gs.t_phase_hanging = NULL;
f74ae2bb 857 flog_err(EC_WATCHFRR_CONNECTION,
1c50c1c0
QY
858 "Phase [%s] hanging for %ld seconds, aborting phased restart",
859 phase_str[gs.phase], PHASE_TIMEOUT);
a6810074
DL
860 gs.phase = PHASE_NONE;
861 return 0;
8b886ca7 862}
863
a6810074 864static void set_phase(restart_phase_t new_phase)
8b886ca7 865{
a6810074
DL
866 gs.phase = new_phase;
867 if (gs.t_phase_hanging)
868 thread_cancel(gs.t_phase_hanging);
66e78ae6
QY
869 gs.t_phase_hanging = NULL;
870 thread_add_timer(master, phase_hanging, NULL, PHASE_TIMEOUT,
871 &gs.t_phase_hanging);
8b886ca7 872}
873
a6810074 874static void phase_check(void)
8b886ca7 875{
c0e5cb52
DL
876 struct daemon *dmn;
877
a6810074
DL
878 switch (gs.phase) {
879 case PHASE_NONE:
880 break;
c0e5cb52
DL
881
882 case PHASE_INIT:
883 for (dmn = gs.daemons; dmn; dmn = dmn->next)
884 if (dmn->state == DAEMON_INIT)
885 return;
886
887 /* startup complete, everything out of INIT */
888 gs.phase = PHASE_NONE;
889 for (dmn = gs.daemons; dmn; dmn = dmn->next)
890 if (dmn->state == DAEMON_DOWN) {
891 SET_WAKEUP_DOWN(dmn);
892 try_restart(dmn);
893 }
894 break;
a6810074
DL
895 case PHASE_STOPS_PENDING:
896 if (gs.numpids)
897 break;
d62a17ae 898 zlog_info(
899 "Phased restart: all routing daemon stop jobs have completed.");
a6810074
DL
900 set_phase(PHASE_WAITING_DOWN);
901
d62a17ae 902 /*FALLTHRU*/
a6810074
DL
903 case PHASE_WAITING_DOWN:
904 if (gs.numdown + IS_UP(gs.special) < gs.numdaemons)
905 break;
906 zlog_info("Phased restart: all routing daemons now down.");
907 run_job(&gs.special->restart, "restart", gs.restart_command, 1,
908 1);
909 set_phase(PHASE_ZEBRA_RESTART_PENDING);
910
d62a17ae 911 /*FALLTHRU*/
a6810074
DL
912 case PHASE_ZEBRA_RESTART_PENDING:
913 if (gs.special->restart.pid)
914 break;
915 zlog_info("Phased restart: %s restart job completed.",
916 gs.special->name);
917 set_phase(PHASE_WAITING_ZEBRA_UP);
918
d62a17ae 919 /*FALLTHRU*/
a6810074
DL
920 case PHASE_WAITING_ZEBRA_UP:
921 if (!IS_UP(gs.special))
922 break;
923 zlog_info("Phased restart: %s is now up.", gs.special->name);
924 {
925 struct daemon *dmn;
926 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
927 if (dmn != gs.special)
928 run_job(&dmn->restart, "start",
929 gs.start_command, 1, 0);
930 }
931 }
932 gs.phase = PHASE_NONE;
933 THREAD_OFF(gs.t_phase_hanging);
934 zlog_notice("Phased global restart has completed.");
935 break;
936 }
8b886ca7 937}
938
a6810074 939static void try_restart(struct daemon *dmn)
8b886ca7 940{
f168b713 941 if (watch_only)
a6810074 942 return;
a6810074 943
f168b713
DL
944 if (dmn != gs.special) {
945 if ((gs.special->state == DAEMON_UP)
946 && (gs.phase == PHASE_NONE))
947 run_job(&dmn->restart, "restart", gs.restart_command, 0,
948 1);
949 else
950 zlog_debug(
951 "%s: postponing restart attempt because master %s daemon "
952 "not up [%s], or phased restart in progress",
953 dmn->name, gs.special->name,
954 state_str[gs.special->state]);
955 return;
956 }
957
958 if ((gs.phase != PHASE_NONE) || gs.numpids) {
959 if (gs.loglevel > LOG_DEBUG + 1)
960 zlog_debug(
961 "postponing phased global restart: restart already in "
962 "progress [%s], or outstanding child processes [%d]",
963 phase_str[gs.phase], gs.numpids);
964 return;
965 }
966 /* Is it too soon for a restart? */
967 {
968 struct timeval delay;
969 if (time_elapsed(&delay, &gs.special->restart.time)->tv_sec
970 < gs.special->restart.interval) {
a6810074 971 if (gs.loglevel > LOG_DEBUG + 1)
d62a17ae 972 zlog_debug(
f168b713
DL
973 "postponing phased global restart: "
974 "elapsed time %ld < retry interval %ld",
975 (long)delay.tv_sec,
976 gs.special->restart.interval);
977 return;
a6810074 978 }
8b886ca7 979 }
f168b713 980 run_job(&gs.restart, "restart", gs.restart_command, 0, 1);
8b886ca7 981}
982
a6810074 983static int wakeup_unresponsive(struct thread *t_wakeup)
8b886ca7 984{
a6810074
DL
985 struct daemon *dmn = THREAD_ARG(t_wakeup);
986
987 dmn->t_wakeup = NULL;
988 if (dmn->state != DAEMON_UNRESPONSIVE)
f74ae2bb 989 flog_err(EC_WATCHFRR_CONNECTION,
1c50c1c0
QY
990 "%s: no longer unresponsive (now %s), "
991 "wakeup should have been cancelled!",
992 dmn->name, state_str[dmn->state]);
a6810074
DL
993 else {
994 SET_WAKEUP_UNRESPONSIVE(dmn);
995 try_restart(dmn);
996 }
997 return 0;
8b886ca7 998}
999
a6810074 1000static int wakeup_no_answer(struct thread *t_wakeup)
8b886ca7 1001{
a6810074
DL
1002 struct daemon *dmn = THREAD_ARG(t_wakeup);
1003
1004 dmn->t_wakeup = NULL;
1005 dmn->state = DAEMON_UNRESPONSIVE;
cc53b605
DS
1006 if (dmn->ignore_timeout)
1007 return 0;
f74ae2bb 1008 flog_err(EC_WATCHFRR_CONNECTION,
1c50c1c0
QY
1009 "%s state -> unresponsive : no response yet to ping "
1010 "sent %ld seconds ago",
1011 dmn->name, gs.timeout);
71e7975a
DL
1012 SET_WAKEUP_UNRESPONSIVE(dmn);
1013 try_restart(dmn);
a6810074 1014 return 0;
8b886ca7 1015}
1016
a6810074 1017static int wakeup_send_echo(struct thread *t_wakeup)
8b886ca7 1018{
a6810074
DL
1019 static const char echocmd[] = "echo " PING_TOKEN;
1020 ssize_t rc;
1021 struct daemon *dmn = THREAD_ARG(t_wakeup);
1022
1023 dmn->t_wakeup = NULL;
d62a17ae 1024 if (((rc = write(dmn->fd, echocmd, sizeof(echocmd))) < 0)
1025 || ((size_t)rc != sizeof(echocmd))) {
a6810074
DL
1026 char why[100 + sizeof(echocmd)];
1027 snprintf(why, sizeof(why),
1028 "write '%s' returned %d instead of %u", echocmd,
d7c0a89a 1029 (int)rc, (unsigned int)sizeof(echocmd));
a6810074
DL
1030 daemon_down(dmn, why);
1031 } else {
1032 gettimeofday(&dmn->echo_sent, NULL);
66e78ae6
QY
1033 dmn->t_wakeup = NULL;
1034 thread_add_timer(master, wakeup_no_answer, dmn, gs.timeout,
1035 &dmn->t_wakeup);
a6810074
DL
1036 }
1037 return 0;
8b886ca7 1038}
1039
470bc619
QY
1040bool check_all_up(void)
1041{
1042 struct daemon *dmn;
1043
1044 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1045 if (dmn->state != DAEMON_UP)
1046 return false;
1047 return true;
1048}
1049
af568444
DL
1050void watchfrr_status(struct vty *vty)
1051{
1052 struct daemon *dmn;
1053 struct timeval delay;
1054
1055 vty_out(vty, "watchfrr global phase: %s\n", phase_str[gs.phase]);
1056 if (gs.restart.pid)
1057 vty_out(vty, " global restart running, pid %ld\n",
1058 (long)gs.restart.pid);
1059
1060 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
cc53b605
DS
1061 vty_out(vty, " %-20s %s%s", dmn->name, state_str[dmn->state],
1062 dmn->ignore_timeout ? "/Ignoring Timeout\n" : "\n");
af568444
DL
1063 if (dmn->restart.pid)
1064 vty_out(vty, " restart running, pid %ld\n",
1065 (long)dmn->restart.pid);
1066 else if (dmn->state == DAEMON_DOWN &&
1067 time_elapsed(&delay, &dmn->restart.time)->tv_sec
1068 < dmn->restart.interval)
051a0be4
DL
1069 vty_out(vty, " restarting in %jd seconds"
1070 " (%jds backoff interval)\n",
1071 (intmax_t)dmn->restart.interval
1072 - (intmax_t)delay.tv_sec,
1073 (intmax_t)dmn->restart.interval);
af568444
DL
1074 }
1075}
1076
/*
 * Handler registered for SIGINT and SIGTERM in watchfrr_signals: log the
 * termination, call systemd_send_stopping() and exit successfully.
 */
static void sigint(void)
{
	zlog_notice("Terminating on signal");
	systemd_send_stopping();
	exit(EXIT_SUCCESS);
}
1083
/*
 * Check that cmd is usable as a command template: it must contain exactly
 * one '%' character, and that character must be immediately followed by 's'.
 *
 * Returns 1 when the template is valid, 0 otherwise.
 */
static int valid_command(const char *cmd)
{
	const char *pct = strchr(cmd, '%');

	if (pct == NULL)
		return 0;
	if (pct[1] != 's')
		return 0;

	/* Any further '%' would be a second conversion; reject it. */
	return strchr(pct + 1, '%') == NULL;
}
1091
/*
 * This is an ugly hack to circumvent problems with passing command-line
 * arguments that contain spaces.  The fix is to use a configuration file.
 *
 * Returns a freshly allocated copy of cmd in which every occurrence of
 * blankstr has been replaced by a single space.  The caller owns the
 * returned string.  The process exits with status 1 if the copy cannot be
 * allocated.
 *
 * An empty blankstr leaves the command unchanged.  Scanning always resumes
 * after the space just written, so a blankstr that itself contains a space
 * cannot match its own replacement again and the loop always terminates.
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	char *res;
	char *p;
	size_t bslen = strlen(blankstr);

	res = strdup(cmd);
	if (!res) {
		perror("strdup");
		exit(1);
	}

	/* An empty token matches at every position; nothing to translate. */
	if (bslen == 0)
		return res;

	for (p = res; (p = strstr(p, blankstr)) != NULL; p++) {
		*p = ' ';
		/* Close the gap left by the rest of a multi-character token. */
		if (bslen != 1)
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
	}
	return res;
}
1111
5c9d1c83
DL
/*
 * Timer callback scheduled by watchfrr_init() with STARTUP_TIMEOUT.
 * It calls daemon_send_ready(1) and returns 0; the thread argument is
 * not used.
 */
static int startup_timeout(struct thread *t_wakeup)
{
	(void)t_wakeup;

	daemon_send_ready(1);
	return 0;
}
1117
0a7c7856
DL
1118static void watchfrr_init(int argc, char **argv)
1119{
1120 const char *special = "zebra";
1121 int i;
1122 struct daemon *dmn, **add = &gs.daemons;
1123 char alldaemons[512] = "", *p = alldaemons;
1124
5c9d1c83
DL
1125 thread_add_timer_msec(master, startup_timeout, NULL, STARTUP_TIMEOUT,
1126 &gs.t_startup_timeout);
1127
0a7c7856
DL
1128 for (i = optind; i < argc; i++) {
1129 dmn = XCALLOC(MTYPE_WATCHFRR_DAEMON, sizeof(*dmn));
1130
1131 dmn->name = dmn->restart.name = argv[i];
1132 dmn->state = DAEMON_INIT;
1133 gs.numdaemons++;
1134 gs.numdown++;
1135 dmn->fd = -1;
1136 dmn->t_wakeup = NULL;
c0e5cb52 1137 thread_add_timer_msec(master, wakeup_init, dmn, 0,
0a7c7856
DL
1138 &dmn->t_wakeup);
1139 dmn->restart.interval = gs.min_restart_interval;
1140 *add = dmn;
1141 add = &dmn->next;
1142
1143 if (!strcmp(dmn->name, special))
1144 gs.special = dmn;
1145 }
1146
1147 if (!gs.daemons) {
1148 fprintf(stderr,
1149 "Must specify one or more daemons to monitor.\n\n");
1150 frr_help_exit(1);
1151 }
1152 if (!watch_only && !gs.special) {
1153 fprintf(stderr, "\"%s\" daemon must be in daemon lists\n\n",
1154 special);
1155 frr_help_exit(1);
1156 }
1157
1158 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1159 snprintf(p, alldaemons + sizeof(alldaemons) - p, "%s%s",
1160 (p == alldaemons) ? "" : " ", dmn->name);
1161 p += strlen(p);
1162 }
1163 zlog_notice("%s %s watching [%s]%s", progname, FRR_VERSION, alldaemons,
1164 watch_only ? ", monitor mode" : "");
1165}
1166
a6810074 1167struct zebra_privs_t watchfrr_privs = {
95c4aff2 1168#ifdef VTY_GROUP
a6810074 1169 .vty_group = VTY_GROUP,
95c4aff2
DL
1170#endif
1171};
1172
4f04a76b
DL
1173static struct quagga_signal_t watchfrr_signals[] = {
1174 {
1175 .signal = SIGINT,
1176 .handler = sigint,
1177 },
1178 {
1179 .signal = SIGTERM,
1180 .handler = sigint,
1181 },
1182 {
1183 .signal = SIGCHLD,
1184 .handler = sigchild,
1185 },
1186};
1187
1188FRR_DAEMON_INFO(watchfrr, WATCHFRR,
d62a17ae 1189 .flags = FRR_NO_PRIVSEP | FRR_NO_TCPVTY | FRR_LIMITED_CLI
0a7c7856
DL
1190 | FRR_NO_CFG_PID_DRY | FRR_NO_ZCLIENT
1191 | FRR_DETACH_LATER,
4f04a76b 1192
d62a17ae 1193 .printhelp = printhelp,
1194 .copyright = "Copyright 2004 Andrew J. Schorr",
4f04a76b 1195
d62a17ae 1196 .signals = watchfrr_signals,
1197 .n_signals = array_size(watchfrr_signals),
4f04a76b 1198
d62a17ae 1199 .privs = &watchfrr_privs, )
4f04a76b 1200
999f153e
DL
1201#define DEPRECATED_OPTIONS "aAezR:"
1202
a6810074 1203int main(int argc, char **argv)
8b886ca7 1204{
a6810074 1205 int opt;
a6810074 1206 const char *blankstr = NULL;
a6810074 1207
4f04a76b
DL
1208 frr_preinit(&watchfrr_di, argc, argv);
1209 progname = watchfrr_di.progname;
1210
999f153e 1211 frr_opt_add("b:dk:l:i:p:r:S:s:t:T:" DEPRECATED_OPTIONS, longopts, "");
a6810074
DL
1212
1213 gs.restart.name = "all";
4f04a76b 1214 while ((opt = frr_getopt(argc, argv, NULL)) != EOF) {
999f153e
DL
1215 if (opt && opt < 128 && strchr(DEPRECATED_OPTIONS, opt)) {
1216 fprintf(stderr,
1217 "The -%c option no longer exists.\n"
1218 "Please refer to the watchfrr(8) man page.\n",
1219 opt);
1220 exit(1);
1221 }
1222
a6810074
DL
1223 switch (opt) {
1224 case 0:
1225 break;
a6810074
DL
1226 case 'b':
1227 blankstr = optarg;
1228 break;
f168b713
DL
1229 case OPTION_DRY:
1230 watch_only = true;
a6810074
DL
1231 break;
1232 case 'k':
1233 if (!valid_command(optarg)) {
1234 fprintf(stderr,
1235 "Invalid kill command, must contain '%%s': %s\n",
1236 optarg);
4f04a76b 1237 frr_help_exit(1);
a6810074
DL
1238 }
1239 gs.stop_command = optarg;
1240 break;
d62a17ae 1241 case 'l': {
1242 char garbage[3];
1243 if ((sscanf(optarg, "%d%1s", &gs.loglevel, garbage)
1244 != 1)
1245 || (gs.loglevel < LOG_EMERG)) {
1246 fprintf(stderr,
1247 "Invalid loglevel argument: %s\n",
1248 optarg);
1249 frr_help_exit(1);
a6810074 1250 }
d62a17ae 1251 } break;
1252 case OPTION_MINRESTART: {
1253 char garbage[3];
1254 if ((sscanf(optarg, "%ld%1s", &gs.min_restart_interval,
1255 garbage)
1256 != 1)
1257 || (gs.min_restart_interval < 0)) {
1258 fprintf(stderr,
1259 "Invalid min_restart_interval argument: %s\n",
1260 optarg);
1261 frr_help_exit(1);
a6810074 1262 }
d62a17ae 1263 } break;
1264 case OPTION_MAXRESTART: {
1265 char garbage[3];
1266 if ((sscanf(optarg, "%ld%1s", &gs.max_restart_interval,
1267 garbage)
1268 != 1)
1269 || (gs.max_restart_interval < 0)) {
1270 fprintf(stderr,
1271 "Invalid max_restart_interval argument: %s\n",
1272 optarg);
1273 frr_help_exit(1);
a6810074 1274 }
d62a17ae 1275 } break;
1276 case 'i': {
1277 char garbage[3];
1278 int period;
1279 if ((sscanf(optarg, "%d%1s", &period, garbage) != 1)
1280 || (gs.period < 1)) {
1281 fprintf(stderr,
1282 "Invalid interval argument: %s\n",
1283 optarg);
1284 frr_help_exit(1);
a6810074 1285 }
d62a17ae 1286 gs.period = 1000 * period;
1287 } break;
a6810074 1288 case 'p':
0a7c7856 1289 watchfrr_di.pid_file = optarg;
a6810074
DL
1290 break;
1291 case 'r':
a6810074
DL
1292 if (!valid_command(optarg)) {
1293 fprintf(stderr,
1294 "Invalid restart command, must contain '%%s': %s\n",
1295 optarg);
4f04a76b 1296 frr_help_exit(1);
a6810074
DL
1297 }
1298 gs.restart_command = optarg;
a6810074
DL
1299 break;
1300 case 's':
1301 if (!valid_command(optarg)) {
1302 fprintf(stderr,
1303 "Invalid start command, must contain '%%s': %s\n",
1304 optarg);
4f04a76b 1305 frr_help_exit(1);
a6810074
DL
1306 }
1307 gs.start_command = optarg;
1308 break;
1309 case 'S':
1310 gs.vtydir = optarg;
1311 break;
d62a17ae 1312 case 't': {
1313 char garbage[3];
1314 if ((sscanf(optarg, "%ld%1s", &gs.timeout, garbage)
1315 != 1)
1316 || (gs.timeout < 1)) {
1317 fprintf(stderr,
1318 "Invalid timeout argument: %s\n",
1319 optarg);
1320 frr_help_exit(1);
a6810074 1321 }
d62a17ae 1322 } break;
1323 case 'T': {
1324 char garbage[3];
1325 if ((sscanf(optarg, "%ld%1s", &gs.restart_timeout,
1326 garbage)
1327 != 1)
1328 || (gs.restart_timeout < 1)) {
1329 fprintf(stderr,
1330 "Invalid restart timeout argument: %s\n",
1331 optarg);
1332 frr_help_exit(1);
a6810074 1333 }
d62a17ae 1334 } break;
a6810074
DL
1335 default:
1336 fputs("Invalid option.\n", stderr);
4f04a76b 1337 frr_help_exit(1);
a6810074 1338 }
8b886ca7 1339 }
a6810074 1340
71e7975a
DL
1341 if (watch_only
1342 && (gs.start_command || gs.stop_command || gs.restart_command)) {
d87ae5cc 1343 fputs("Options -r/-s/-k are not used when --dry is active.\n",
a6810074 1344 stderr);
8b886ca7 1345 }
f168b713
DL
1346 if (!watch_only
1347 && (!gs.restart_command || !gs.start_command || !gs.stop_command)) {
1348 fprintf(stderr,
1349 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1350 frr_help_exit(1);
8b886ca7 1351 }
8b886ca7 1352
a6810074
DL
1353 if (blankstr) {
1354 if (gs.restart_command)
1355 gs.restart_command =
d62a17ae 1356 translate_blanks(gs.restart_command, blankstr);
a6810074
DL
1357 if (gs.start_command)
1358 gs.start_command =
d62a17ae 1359 translate_blanks(gs.start_command, blankstr);
a6810074
DL
1360 if (gs.stop_command)
1361 gs.stop_command =
d62a17ae 1362 translate_blanks(gs.stop_command, blankstr);
065de903 1363 }
8b886ca7 1364
a6810074 1365 gs.restart.interval = gs.min_restart_interval;
8b886ca7 1366
4f04a76b 1367 master = frr_init();
b647dc2a 1368 watchfrr_error_init();
0a7c7856
DL
1369 watchfrr_init(argc, argv);
1370 watchfrr_vty_init();
1371
1372 frr_config_fork();
4f04a76b 1373
dd8376fe 1374 zlog_set_level(ZLOG_DEST_MONITOR, ZLOG_DISABLED);
0a7c7856 1375 if (watchfrr_di.daemon_mode)
dd8376fe 1376 zlog_set_level(ZLOG_DEST_SYSLOG, MIN(gs.loglevel, LOG_DEBUG));
0a7c7856 1377 else
dd8376fe 1378 zlog_set_level(ZLOG_DEST_STDOUT, MIN(gs.loglevel, LOG_DEBUG));
8b886ca7 1379
0a7c7856 1380 frr_run(master);
8b886ca7 1381
a6810074
DL
1382 systemd_send_stopping();
1383 /* Not reached. */
1384 return 0;
8b886ca7 1385}