]> git.proxmox.com Git - mirror_frr.git/blame - watchfrr/watchfrr.c
*: Convert `struct event_master` to `struct event_loop`
[mirror_frr.git] / watchfrr / watchfrr.c
CommitLineData
acddc0ed 1// SPDX-License-Identifier: GPL-2.0-or-later
8b886ca7 2/*
896014f4
DL
3 * Monitor status of frr daemons and restart if necessary.
4 *
5 * Copyright (C) 2004 Andrew J. Schorr
8b886ca7 6 */
7
a365534f 8#include <zebra.h>
cb37cb33 9#include "event.h"
8b886ca7 10#include <log.h>
52e66296 11#include <network.h>
8b886ca7 12#include <sigevent.h>
a365534f 13#include <lib/version.h>
95c4aff2 14#include "command.h"
4f04a76b 15#include "libfrr.h"
b647dc2a 16#include "lib_errors.h"
0bdeb5e5 17#include "zlog_targets.h"
5920b3eb 18#include "network.h"
33606a15 19#include "printfrr.h"
95c4aff2 20
6f594023 21#include <getopt.h>
a365534f 22#include <sys/un.h>
23#include <sys/wait.h>
837d16cc 24#include <memory.h>
651415bd 25#include <systemd.h>
8b886ca7 26
9473e340 27#include "watchfrr.h"
b647dc2a 28#include "watchfrr_errors.h"
95c4aff2 29
8b886ca7 30#ifndef MIN
31#define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
32#endif
33
34/* Macros to help randomize timers. */
5920b3eb 35#define JITTER(X) ((frr_weak_random() % ((X)+1))-((X)/2))
8b886ca7 36#define FUZZY(X) ((X)+JITTER((X)/20))
37
38#define DEFAULT_PERIOD 5
0a64aff6 39#define DEFAULT_TIMEOUT 90
8b886ca7 40#define DEFAULT_RESTART_TIMEOUT 20
41#define DEFAULT_LOGLEVEL LOG_INFO
42#define DEFAULT_MIN_RESTART 60
43#define DEFAULT_MAX_RESTART 600
6d0fa5c2 44#define DEFAULT_OPERATIONAL_TIMEOUT 60
8b886ca7 45
3ec95567
DL
46#define DEFAULT_RESTART_CMD WATCHFRR_SH_PATH " restart %s"
47#define DEFAULT_START_CMD WATCHFRR_SH_PATH " start %s"
48#define DEFAULT_STOP_CMD WATCHFRR_SH_PATH " stop %s"
49
8b886ca7 50#define PING_TOKEN "PING"
51
bf8d3d6a
DL
52DEFINE_MGROUP(WATCHFRR, "watchfrr");
53DEFINE_MTYPE_STATIC(WATCHFRR, WATCHFRR_DAEMON, "watchfrr daemon entry");
0a7c7856 54
55c72803 55/* Needs to be global, referenced somewhere inside libfrr. */
cd9d0537 56struct event_loop *master;
8b886ca7 57
f168b713 58static bool watch_only = false;
a91f5417 59const char *pathspace;
8b886ca7 60
/*
 * Steps of the restart state machine.  gs.phase holds the current step and
 * phase_check() advances it; phase_str[] is indexed by these values when
 * logging.
 */
enum restart_phase {
	/* No phased restart in progress. */
	PHASE_NONE = 0,
	/* Initial value of gs.phase; left once no daemon is in DAEMON_INIT. */
	PHASE_INIT,
	/* Left once gs.numpids drops to zero. */
	PHASE_STOPS_PENDING,
	/* Left once enough daemons are counted as down (see phase_check()). */
	PHASE_WAITING_DOWN,
	/* Left once gs.special->restart.pid is zero again. */
	PHASE_ZEBRA_RESTART_PENDING,
	/* Left once gs.special is UP or UNRESPONSIVE. */
	PHASE_WAITING_ZEBRA_UP
};
8b886ca7 69
/* Human-readable names for enum restart_phase, indexed by phase value. */
static const char *const phase_str[] = {
	"Idle",
	"Startup",
	"Stop jobs running",
	"Waiting for other daemons to come down",
	"Zebra restart job running",
	"Waiting for zebra to come up",
	/*
	 * NOTE(review): enum restart_phase has no value matching this last
	 * entry -- confirm whether it is a leftover.
	 */
	"Start jobs running",
};
79
/*
 * Seconds a single step of a phased restart may last before phase_hanging()
 * abandons the restart.
 */
#define PHASE_TIMEOUT (3 * gs.restart_timeout)
/*
 * Startup timeout (presumably milliseconds -- confirm at the point of use).
 * Parenthesized so the expansion stays a single operand in any surrounding
 * expression, e.g. "x / STARTUP_TIMEOUT".
 */
#define STARTUP_TIMEOUT (55 * 1000)
8b886ca7 82
a6810074
DL
/* Bookkeeping for one background start/stop/restart shell job. */
struct restart_info {
	/* Daemon name: logged and substituted into the command by run_job(). */
	const char *name;
	/* Job type label; run_job() stores its cmdtype argument here. */
	const char *what;
	/* Pid of the running job; 0 when no job is outstanding. */
	pid_t pid;
	/* Set when a job is started and again by sigchild() when it ends. */
	struct timeval time;
	/* Minimum seconds between jobs; run_job() compares it to elapsed time. */
	long interval;
	/* Timer that runs restart_kill() on a job that has not exited. */
	struct event *t_kill;
	/* Signals sent to this job so far: 0 means SIGTERM is next, else SIGKILL. */
	int kills;
};
92
/* Process-wide watchfrr state; a single instance, gs, with built-in defaults. */
static struct global_state {
	/* Current step of the restart state machine (see phase_check()). */
	enum restart_phase phase;
	/* Timer armed by set_phase(); fires phase_hanging(). */
	struct event *t_phase_hanging;
	struct event *t_startup_timeout;
	/* Timer armed by daemon_up(); fires daemon_restarting_operational(). */
	struct event *t_operational;
	/* Directory holding the "<daemon>.vty" sockets (see try_connect()). */
	const char *vtydir;
	/* Polling period; passed to event_add_timer_msec() via FUZZY(). */
	long period;
	/* Compared against elapsed seconds for echo replies and connects. */
	long timeout;
	/* Delay handed to event_add_timer() before restart_kill() runs. */
	long restart_timeout;
	/* While set, restart_kill() re-arms itself instead of signalling. */
	bool reading_configuration;
	/* Bounds for restart_info.interval (see run_job()). */
	long min_restart_interval;
	long max_restart_interval;
	/* Delay before daemon_restarting_operational() reports to systemd. */
	long operational_timeout;
	/* Head of the singly linked list of monitored daemons. */
	struct daemon *daemons;
	/* Shell command templates; run_job() formats them with the daemon name. */
	const char *restart_command;
	const char *start_command;
	const char *stop_command;
	/* Job record not tied to one daemon (checked first by find_child()). */
	struct restart_info restart;
	int loglevel;
	struct daemon *special; /* points to zebra when doing phased restart */
	int numdaemons;
	/* Outstanding background jobs: ++ in run_job(), -- in sigchild(). */
	int numpids;
	int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
} gs = {
	.phase = PHASE_INIT,
	.vtydir = frr_vtydir,
	.period = 1000 * DEFAULT_PERIOD,
	.timeout = DEFAULT_TIMEOUT,
	.restart_timeout = DEFAULT_RESTART_TIMEOUT,
	.loglevel = DEFAULT_LOGLEVEL,
	.min_restart_interval = DEFAULT_MIN_RESTART,
	.max_restart_interval = DEFAULT_MAX_RESTART,
	.operational_timeout = DEFAULT_OPERATIONAL_TIMEOUT,
	.restart_command = DEFAULT_RESTART_CMD,
	.start_command = DEFAULT_START_CMD,
	.stop_command = DEFAULT_STOP_CMD,
};
a6810074 130
/*
 * Connection state of one monitored daemon; state_str[] is indexed by these
 * values when logging.
 */
enum daemon_state {
	/* Left by wakeup_init() when the first connection attempt fails. */
	DAEMON_INIT,
	/* Set by daemon_down(). */
	DAEMON_DOWN,
	/* Set by try_connect() while a non-blocking connect() is in progress. */
	DAEMON_CONNECTING,
	/* Set by daemon_up(), and by handle_read() on a timely echo reply. */
	DAEMON_UP,
	DAEMON_UNRESPONSIVE
};
8b886ca7 138
/* True when DMN's state is DAEMON_UP or DAEMON_UNRESPONSIVE. */
#define IS_UP(DMN)                                                             \
	(((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
8b886ca7 141
/* Human-readable names for enum daemon_state, indexed by state value. */
static const char *const state_str[] = {
	"Init", "Down", "Connecting", "Up", "Unresponsive",
};
145
/* One monitored daemon; entries are chained through 'next' from gs.daemons. */
struct daemon {
	/* Daemon name; also used to build its "<name>.vty" socket path. */
	const char *name;
	enum daemon_state state;
	/* Socket to the daemon's vty; daemon_down() resets it to -1. */
	int fd;
	/* handle_read() treats tv_sec == 0 as "no echo outstanding". */
	struct timeval echo_sent;
	/* Incremented per try_connect() call, reset to 0 by daemon_up(). */
	unsigned int connect_tries;
	/* Pending timer, read and write events for this daemon. */
	struct event *t_wakeup;
	struct event *t_read;
	struct event *t_write;
	struct daemon *next;
	/* Background job record for this daemon. */
	struct restart_info restart;

	/*
	 * For a given daemon, if we've turned on ignore timeouts
	 * ignore the timeout value and assume everything is ok
	 * This is for daemon debugging w/ gdb after we have started
	 * FRR and realize we have something that needs to be looked
	 * at
	 */
	bool ignore_timeout;
};
167
9272302b
DL
168#define OPTION_MINRESTART 2000
169#define OPTION_MAXRESTART 2001
f168b713 170#define OPTION_DRY 2002
33606a15 171#define OPTION_NETNS 2003
6d0fa5c2 172#define OPTION_MAXOPERATIONAL 2004
9272302b 173
a6810074
DL
/*
 * Long command-line options.  Every entry has a NULL flag pointer, so the
 * last field is the code reported for the option: either a short-option
 * character or one of the OPTION_* values for options without a short form.
 */
static const struct option longopts[] = {
	{"daemon", no_argument, NULL, 'd'},
	{"statedir", required_argument, NULL, 'S'},
	{"loglevel", required_argument, NULL, 'l'},
	{"interval", required_argument, NULL, 'i'},
	{"timeout", required_argument, NULL, 't'},
	{"restart-timeout", required_argument, NULL, 'T'},
	{"restart", required_argument, NULL, 'r'},
	{"start-command", required_argument, NULL, 's'},
	{"kill-command", required_argument, NULL, 'k'},
	{"dry", no_argument, NULL, OPTION_DRY},
	{"min-restart-interval", required_argument, NULL, OPTION_MINRESTART},
	{"max-restart-interval", required_argument, NULL, OPTION_MAXRESTART},
	{"operational-timeout", required_argument, NULL, OPTION_MAXOPERATIONAL},
	{"pid-file", required_argument, NULL, 'p'},
	{"blank-string", required_argument, NULL, 'b'},
#ifdef GNU_LINUX
	/* Only offered on Linux builds; the argument is optional. */
	{"netns", optional_argument, NULL, OPTION_NETNS},
#endif
	{"help", no_argument, NULL, 'h'},
	{"version", no_argument, NULL, 'v'},
	/* Terminating all-zero entry. */
	{NULL, 0, NULL, 0}};
8b886ca7 196
197static int try_connect(struct daemon *dmn);
e6685141 198static void wakeup_send_echo(struct event *t_wakeup);
8b886ca7 199static void try_restart(struct daemon *dmn);
200static void phase_check(void);
75f8b0e4 201static void restart_done(struct daemon *dmn);
8b886ca7 202
4f04a76b 203static const char *progname;
cc53b605
DS
204
205void watchfrr_set_ignore_daemon(struct vty *vty, const char *dname, bool ignore)
206{
207 struct daemon *dmn;
208
209 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
210 if (strncmp(dmn->name, dname, strlen(dmn->name)) == 0)
211 break;
212 }
213
214 if (dmn) {
215 dmn->ignore_timeout = ignore;
216 vty_out(vty, "%s switching to %s\n", dmn->name,
217 ignore ? "ignore" : "watch");
218 } else
219 vty_out(vty, "%s is not configured for running at the moment",
220 dname);
221}
222
/*
 * Print the usage summary followed by the option reference to 'target'.
 *
 * The second format string is assembled from adjacent string literals so
 * that the --netns lines are only present when GNU_LINUX is defined; the
 * trailing arguments must stay in the same order as its conversions.
 *
 * NOTE(review): the usage text refers to "-m" and "-M", but the option
 * table visible in this file only has --min-restart-interval and
 * --max-restart-interval without short forms -- confirm and update the
 * wording if those short options no longer exist.
 */
static void printhelp(FILE *target)
{
	fprintf(target,
		"Usage : %s [OPTION...] <daemon name> ...\n\n\
Watchdog program to monitor status of frr daemons and try to restart\n\
them if they are down or unresponsive. It determines whether a daemon is\n\
up based on whether it can connect to the daemon's vty unix stream socket.\n\
It then repeatedly sends echo commands over that socket to determine whether\n\
the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
on the socket connection and know immediately that the daemon is down.\n\n\
The daemons to be monitored should be listed on the command line.\n\n\
In order to avoid attempting to restart the daemons in a fast loop,\n\
the -m and -M options allow you to control the minimum delay between\n\
restart commands. The minimum restart delay is recalculated each time\n\
a restart is attempted: if the time since the last restart attempt exceeds\n\
twice the -M value, then the restart delay is set to the -m value.\n\
Otherwise, the interval is doubled (but capped at the -M value).\n\n",
		progname);

	fprintf(target,
		"Options:\n\
-d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
 to syslog instead of stdout.\n\
-S, --statedir Set the vty socket directory (default is %s)\n\
-N, --pathspace Insert prefix into config & socket paths\n"
#ifdef GNU_LINUX
" --netns Create and/or use Linux network namespace. If no name is\n"
" given, uses the value from `-N`.\n"
#endif
"-l, --loglevel Set the logging level (default is %d).\n\
 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
 but it can be set higher than %d if extra-verbose debugging\n\
 messages are desired.\n\
 --min-restart-interval\n\
 Set the minimum seconds to wait between invocations of daemon\n\
 restart commands (default is %d).\n\
 --max-restart-interval\n\
 Set the maximum seconds to wait between invocations of daemon\n\
 restart commands (default is %d).\n\
 --operational-timeout\n\
 Set the time before systemd is notified that we are considered\n\
 operational again after a daemon restart (default is %d).\n\
-i, --interval Set the status polling interval in seconds (default is %d)\n\
-t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
-T, --restart-timeout\n\
 Set the restart (kill) timeout in seconds (default is %d).\n\
 If any background jobs are still running after this much\n\
 time has elapsed, they will be killed.\n\
-r, --restart Supply a Bourne shell command to use to restart a single\n\
 daemon. The command string should include '%%s' where the\n\
 name of the daemon should be substituted.\n\
 (default: '%s')\n\
-s, --start-command\n\
 Supply a Bourne shell to command to use to start a single\n\
 daemon. The command string should include '%%s' where the\n\
 name of the daemon should be substituted.\n\
 (default: '%s')\n\
-k, --kill-command\n\
 Supply a Bourne shell to command to use to stop a single\n\
 daemon. The command string should include '%%s' where the\n\
 name of the daemon should be substituted.\n\
 (default: '%s')\n\
 --dry Do not start or restart anything, just log.\n\
-p, --pid-file Set process identifier file name\n\
 (default is %s/watchfrr.pid).\n\
-b, --blank-string\n\
 When the supplied argument string is found in any of the\n\
 various shell command arguments (-r, -s, or -k), replace\n\
 it with a space. This is an ugly hack to circumvent problems\n\
 passing command-line arguments with embedded spaces.\n\
-v, --version Print program version\n\
-h, --help Display this help and exit\n",
		frr_vtydir, DEFAULT_LOGLEVEL, LOG_EMERG, LOG_DEBUG, LOG_DEBUG,
		DEFAULT_MIN_RESTART, DEFAULT_MAX_RESTART,
		DEFAULT_OPERATIONAL_TIMEOUT, DEFAULT_PERIOD, DEFAULT_TIMEOUT,
		DEFAULT_RESTART_TIMEOUT, DEFAULT_RESTART_CMD, DEFAULT_START_CMD,
		DEFAULT_STOP_CMD, frr_vtydir);
}
301
a6810074 302static pid_t run_background(char *shell_cmd)
8b886ca7 303{
a6810074
DL
304 pid_t child;
305
306 switch (child = fork()) {
307 case -1:
450971aa 308 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
309 "fork failed, cannot run command [%s]: %s",
310 shell_cmd, safe_strerror(errno));
a6810074
DL
311 return -1;
312 case 0:
313 /* Child process. */
d62a17ae 314 /* Use separate process group so child processes can be killed
315 * easily. */
a6810074 316 if (setpgid(0, 0) < 0)
957cfa24 317 zlog_warn("setpgid(0,0) failed: %s",
a6810074
DL
318 safe_strerror(errno));
319 {
320 char shell[] = "sh";
321 char dashc[] = "-c";
d62a17ae 322 char *const argv[4] = {shell, dashc, shell_cmd, NULL};
a6810074 323 execv("/bin/sh", argv);
450971aa 324 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
325 "execv(/bin/sh -c '%s') failed: %s",
326 shell_cmd, safe_strerror(errno));
a6810074
DL
327 _exit(127);
328 }
329 default:
330 /* Parent process: we will reap the child later. */
c3f65458
QY
331 zlog_info("Forked background command [pid %d]: %s", (int)child,
332 shell_cmd);
a6810074
DL
333 return child;
334 }
8b886ca7 335}
336
a6810074
DL
/*
 * Store in *result the wall-clock time that has passed since *start_time,
 * with tv_usec normalised to be non-negative.  Returns 'result' so the
 * call can be used inside an expression.
 */
static struct timeval *time_elapsed(struct timeval *result,
				    const struct timeval *start_time)
{
	const long usec_per_sec = 1000000L;

	gettimeofday(result, NULL);
	result->tv_sec -= start_time->tv_sec;
	result->tv_usec -= start_time->tv_usec;

	/* Borrow whole seconds until the microsecond part is non-negative. */
	for (; result->tv_usec < 0; result->tv_sec--)
		result->tv_usec += usec_per_sec;

	return result;
}
349
e6685141 350static void restart_kill(struct event *t_kill)
8b886ca7 351{
e16d030c 352 struct restart_info *restart = EVENT_ARG(t_kill);
a6810074
DL
353 struct timeval delay;
354
355 time_elapsed(&delay, &restart->time);
2ab760f0
DA
356
357 if (gs.reading_configuration) {
358 zlog_err(
359 "%s %s child process appears to still be reading configuration, delaying for another %lu time",
360 restart->what, restart->name, gs.restart_timeout);
907a2395
DS
361 event_add_timer(master, restart_kill, restart,
362 gs.restart_timeout, &restart->t_kill);
2ab760f0
DA
363 return;
364 }
365
d62a17ae 366 zlog_warn(
957cfa24 367 "%s %s child process %d still running after %ld seconds, sending signal %d",
d62a17ae 368 restart->what, restart->name, (int)restart->pid,
369 (long)delay.tv_sec, (restart->kills ? SIGKILL : SIGTERM));
a6810074
DL
370 kill(-restart->pid, (restart->kills ? SIGKILL : SIGTERM));
371 restart->kills++;
907a2395
DS
372 event_add_timer(master, restart_kill, restart, gs.restart_timeout,
373 &restart->t_kill);
8b886ca7 374}
375
a6810074 376static struct restart_info *find_child(pid_t child)
8b886ca7 377{
f168b713 378 struct daemon *dmn;
7c265f7d
CF
379 if (gs.restart.pid == child)
380 return &gs.restart;
381
f168b713
DL
382 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
383 if (dmn->restart.pid == child)
384 return &dmn->restart;
a6810074
DL
385 }
386 return NULL;
8b886ca7 387}
388
a6810074 389static void sigchild(void)
8b886ca7 390{
a6810074
DL
391 pid_t child;
392 int status;
393 const char *name;
394 const char *what;
395 struct restart_info *restart;
75f8b0e4 396 struct daemon *dmn;
a6810074
DL
397
398 switch (child = waitpid(-1, &status, WNOHANG)) {
399 case -1:
450971aa 400 flog_err_sys(EC_LIB_SYSTEM_CALL, "waitpid failed: %s",
09c866e3 401 safe_strerror(errno));
a6810074
DL
402 return;
403 case 0:
404 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
405 return;
406 }
407
408 if (child == integrated_write_pid) {
409 integrated_write_sigchld(status);
410 return;
411 }
412
413 if ((restart = find_child(child)) != NULL) {
414 name = restart->name;
415 what = restart->what;
416 restart->pid = 0;
417 gs.numpids--;
332beb64 418 event_cancel(&restart->t_kill);
b3d6bc6e 419
d62a17ae 420 /* Update restart time to reflect the time the command
421 * completed. */
a6810074
DL
422 gettimeofday(&restart->time, NULL);
423 } else {
09c866e3 424 flog_err_sys(
450971aa 425 EC_LIB_SYSTEM_CALL,
09c866e3
QY
426 "waitpid returned status for an unknown child process %d",
427 (int)child);
a6810074
DL
428 name = "(unknown)";
429 what = "background";
430 }
431 if (WIFSTOPPED(status))
957cfa24 432 zlog_warn("%s %s process %d is stopped", what, name,
d62a17ae 433 (int)child);
a6810074 434 else if (WIFSIGNALED(status))
d62a17ae 435 zlog_warn("%s %s process %d terminated due to signal %d", what,
436 name, (int)child, WTERMSIG(status));
a6810074
DL
437 else if (WIFEXITED(status)) {
438 if (WEXITSTATUS(status) != 0)
d62a17ae 439 zlog_warn(
440 "%s %s process %d exited with non-zero status %d",
441 what, name, (int)child, WEXITSTATUS(status));
75f8b0e4 442 else {
a6810074
DL
443 zlog_debug("%s %s process %d exited normally", what,
444 name, (int)child);
75f8b0e4
DL
445
446 if (restart && restart != &gs.restart) {
447 dmn = container_of(restart, struct daemon,
448 restart);
449 restart_done(dmn);
450 } else if (restart)
451 for (dmn = gs.daemons; dmn; dmn = dmn->next)
452 restart_done(dmn);
453 }
a6810074 454 } else
09c866e3 455 flog_err_sys(
450971aa 456 EC_LIB_SYSTEM_CALL,
09c866e3
QY
457 "cannot interpret %s %s process %d wait status 0x%x",
458 what, name, (int)child, status);
a6810074 459 phase_check();
8b886ca7 460}
461
/*
 * Start a background shell job (start/stop/restart) for one job record.
 *
 * restart:         job record to use; must not already have a job running
 * cmdtype:         label for logs, stored in restart->what on success
 * command:         printf-style template containing one %s, which is
 *                  replaced by restart->name
 * force:           non-zero to run even if restart->interval has not yet
 *                  elapsed since restart->time
 * update_interval: non-zero to recompute restart->interval afterwards
 *
 * Returns -1 when nothing was attempted (a job is still running, or the
 * attempt was postponed), 0 when the command could not be launched, and
 * otherwise the pid of the new job.
 */
static int run_job(struct restart_info *restart, const char *cmdtype,
		   const char *command, int force, int update_interval)
{
	struct timeval delay;

	if (gs.loglevel > LOG_DEBUG + 1)
		zlog_debug("attempting to %s %s", cmdtype, restart->name);

	if (restart->pid) {
		if (gs.loglevel > LOG_DEBUG + 1)
			zlog_debug(
				"cannot %s %s, previous pid %d still running",
				cmdtype, restart->name, (int)restart->pid);
		return -1;
	}

	char buffer[512];

	/* The status is sent before the postpone check just below. */
	snprintf(buffer, sizeof(buffer), "restarting %s", restart->name);
	systemd_send_status(buffer);

	/* Note: the time_elapsed test must come before the force test, since
	   we need to make sure that delay is initialized for use below in
	   updating the restart interval. */
	if ((time_elapsed(&delay, &restart->time)->tv_sec < restart->interval)
	    && !force) {

		if (gs.loglevel > LOG_DEBUG + 1)
			zlog_debug(
				"postponing %s %s: elapsed time %ld < retry interval %ld",
				cmdtype, restart->name, (long)delay.tv_sec,
				restart->interval);
		return -1;
	}

	gettimeofday(&restart->time, NULL);
	restart->kills = 0;
	{
		/*
		 * Sized from the template plus the name, which is enough for a
		 * template with a single "%s" (the name replaces two bytes).
		 */
		char cmd[strlen(command) + strlen(restart->name) + 1];
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wformat-nonliteral"
		/* user supplied command string has a %s for the daemon name */
		snprintf(cmd, sizeof(cmd), command, restart->name);
#pragma GCC diagnostic pop
		if ((restart->pid = run_background(cmd)) > 0) {
			/* Arm the timer that signals the job if it hangs. */
			event_add_timer(master, restart_kill, restart,
					gs.restart_timeout, &restart->t_kill);
			restart->what = cmdtype;
			gs.numpids++;
		} else
			restart->pid = 0;
	}

	/* Calculate the new restart interval. */
	if (update_interval) {
		/*
		 * A long quiet period resets the interval to the minimum;
		 * otherwise it doubles, capped at the maximum.
		 */
		if (delay.tv_sec > 2 * gs.max_restart_interval)
			restart->interval = gs.min_restart_interval;
		else if ((restart->interval *= 2) > gs.max_restart_interval)
			restart->interval = gs.max_restart_interval;
		if (gs.loglevel > LOG_DEBUG + 1)
			zlog_debug("restart %s interval is now %ld",
				   restart->name, restart->interval);
	}
	return restart->pid;
}
528
/*
 * Event (re)arming helpers for one daemon.
 *
 * Each macro clears the daemon's event pointer and then schedules a new
 * event into it.  The expansions deliberately end in "while (0)" with no
 * semicolon: the caller's own ';' completes the statement, which keeps
 * "if (cond) SET_X(dmn); else ..." well-formed.
 */

/* Schedule handle_read() for when (DMN)->fd becomes readable. */
#define SET_READ_HANDLER(DMN) \
	do { \
		(DMN)->t_read = NULL; \
		event_add_read(master, handle_read, (DMN), (DMN)->fd, \
			       &(DMN)->t_read); \
	} while (0)

/* Schedule wakeup_down() after one jittered polling period. */
#define SET_WAKEUP_DOWN(DMN) \
	do { \
		(DMN)->t_wakeup = NULL; \
		event_add_timer_msec(master, wakeup_down, (DMN), \
				     FUZZY(gs.period), &(DMN)->t_wakeup); \
	} while (0)

/* Schedule wakeup_unresponsive() after one jittered polling period. */
#define SET_WAKEUP_UNRESPONSIVE(DMN) \
	do { \
		(DMN)->t_wakeup = NULL; \
		event_add_timer_msec(master, wakeup_unresponsive, (DMN), \
				     FUZZY(gs.period), &(DMN)->t_wakeup); \
	} while (0)

/* Schedule wakeup_send_echo() after one jittered polling period. */
#define SET_WAKEUP_ECHO(DMN) \
	do { \
		(DMN)->t_wakeup = NULL; \
		event_add_timer_msec(master, wakeup_send_echo, (DMN), \
				     FUZZY(gs.period), &(DMN)->t_wakeup); \
	} while (0)
8b886ca7 556
e6685141 557static void wakeup_down(struct event *t_wakeup)
8b886ca7 558{
e16d030c 559 struct daemon *dmn = EVENT_ARG(t_wakeup);
a6810074
DL
560
561 dmn->t_wakeup = NULL;
562 if (try_connect(dmn) < 0)
563 SET_WAKEUP_DOWN(dmn);
564 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
565 try_restart(dmn);
8b886ca7 566}
567
e6685141 568static void wakeup_init(struct event *t_wakeup)
8b886ca7 569{
e16d030c 570 struct daemon *dmn = EVENT_ARG(t_wakeup);
a6810074
DL
571
572 dmn->t_wakeup = NULL;
573 if (try_connect(dmn) < 0) {
c3f65458
QY
574 zlog_info(
575 "%s state -> down : initial connection attempt failed",
576 dmn->name);
a6810074
DL
577 dmn->state = DAEMON_DOWN;
578 }
c0e5cb52 579 phase_check();
8b886ca7 580}
581
75f8b0e4
DL
582static void restart_done(struct daemon *dmn)
583{
584 if (dmn->state != DAEMON_DOWN) {
3f391bec
DS
585 zlog_warn(
586 "Daemon: %s: is in %s state but expected it to be in DAEMON_DOWN state",
587 dmn->name, state_str[dmn->state]);
75f8b0e4
DL
588 return;
589 }
e16d030c 590 EVENT_OFF(dmn->t_wakeup);
50478845 591
75f8b0e4
DL
592 if (try_connect(dmn) < 0)
593 SET_WAKEUP_DOWN(dmn);
594}
595
/*
 * Timer callback armed by daemon_up(): report to systemd that FRR is
 * operational.  The event argument is not used.
 */
static void daemon_restarting_operational(struct event *thread)
{
	const char *status = "FRR Operational";

	systemd_send_status(status);
}
600
a6810074 601static void daemon_down(struct daemon *dmn, const char *why)
8b886ca7 602{
a6810074 603 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
1c50c1c0
QY
604 flog_err(EC_WATCHFRR_CONNECTION, "%s state -> down : %s",
605 dmn->name, why);
a6810074
DL
606 else if (gs.loglevel > LOG_DEBUG)
607 zlog_debug("%s still down : %s", dmn->name, why);
608 if (IS_UP(dmn))
609 gs.numdown++;
610 dmn->state = DAEMON_DOWN;
611 if (dmn->fd >= 0) {
612 close(dmn->fd);
613 dmn->fd = -1;
614 }
e16d030c
DS
615 EVENT_OFF(dmn->t_read);
616 EVENT_OFF(dmn->t_write);
617 EVENT_OFF(dmn->t_wakeup);
a6810074
DL
618 if (try_connect(dmn) < 0)
619 SET_WAKEUP_DOWN(dmn);
6d0fa5c2
DS
620
621 systemd_send_status("FRR partially operational");
a6810074 622 phase_check();
8b886ca7 623}
624
e6685141 625static void handle_read(struct event *t_read)
8b886ca7 626{
e16d030c 627 struct daemon *dmn = EVENT_ARG(t_read);
a6810074
DL
628 static const char resp[sizeof(PING_TOKEN) + 4] = PING_TOKEN "\n";
629 char buf[sizeof(resp) + 100];
630 ssize_t rc;
631 struct timeval delay;
632
633 dmn->t_read = NULL;
634 if ((rc = read(dmn->fd, buf, sizeof(buf))) < 0) {
635 char why[100];
636
637 if (ERRNO_IO_RETRY(errno)) {
638 /* Pretend it never happened. */
639 SET_READ_HANDLER(dmn);
cc9f21da 640 return;
a6810074
DL
641 }
642 snprintf(why, sizeof(why), "unexpected read error: %s",
643 safe_strerror(errno));
644 daemon_down(dmn, why);
cc9f21da 645 return;
8b886ca7 646 }
a6810074
DL
647 if (rc == 0) {
648 daemon_down(dmn, "read returned EOF");
cc9f21da 649 return;
a6810074
DL
650 }
651 if (!dmn->echo_sent.tv_sec) {
652 char why[sizeof(buf) + 100];
653 snprintf(why, sizeof(why),
654 "unexpected read returns %d bytes: %.*s", (int)rc,
655 (int)rc, buf);
656 daemon_down(dmn, why);
cc9f21da 657 return;
8b886ca7 658 }
a6810074
DL
659
660 /* We are expecting an echo response: is there any chance that the
661 response would not be returned entirely in the first read? That
662 seems inconceivable... */
663 if ((rc != sizeof(resp)) || memcmp(buf, resp, sizeof(resp))) {
664 char why[100 + sizeof(buf)];
665 snprintf(why, sizeof(why),
3efd0893 666 "read returned bad echo response of %d bytes (expecting %u): %.*s",
d7c0a89a 667 (int)rc, (unsigned int)sizeof(resp), (int)rc, buf);
a6810074 668 daemon_down(dmn, why);
cc9f21da 669 return;
a6810074
DL
670 }
671
672 time_elapsed(&delay, &dmn->echo_sent);
673 dmn->echo_sent.tv_sec = 0;
674 if (dmn->state == DAEMON_UNRESPONSIVE) {
675 if (delay.tv_sec < gs.timeout) {
676 dmn->state = DAEMON_UP;
d62a17ae 677 zlog_warn(
3efd0893 678 "%s state -> up : echo response received after %ld.%06ld seconds",
d62a17ae 679 dmn->name, (long)delay.tv_sec,
680 (long)delay.tv_usec);
a6810074 681 } else
d62a17ae 682 zlog_warn(
3efd0893 683 "%s: slow echo response finally received after %ld.%06ld seconds",
d62a17ae 684 dmn->name, (long)delay.tv_sec,
685 (long)delay.tv_usec);
a6810074
DL
686 } else if (gs.loglevel > LOG_DEBUG + 1)
687 zlog_debug("%s: echo response received after %ld.%06ld seconds",
688 dmn->name, (long)delay.tv_sec, (long)delay.tv_usec);
689
690 SET_READ_HANDLER(dmn);
332beb64 691 event_cancel(&dmn->t_wakeup);
a6810074 692 SET_WAKEUP_ECHO(dmn);
8b886ca7 693}
694
207e0d7a
DS
695/*
696 * Wait till we notice that all daemons are ready before
697 * we send we are ready to systemd
698 */
5c9d1c83 699static void daemon_send_ready(int exitcode)
207e0d7a 700{
5c9d1c83 701 FILE *fp;
a6810074 702 static int sent = 0;
43e587c1 703 char started[1024];
207e0d7a 704
5c9d1c83
DL
705 if (sent)
706 return;
707
708 if (exitcode == 0)
0a7c7856 709 zlog_notice("all daemons up, doing startup-complete notify");
5c9d1c83
DL
710 else if (gs.numdown < gs.numdaemons)
711 flog_err(EC_WATCHFRR_CONNECTION,
3efd0893 712 "startup did not complete within timeout (%d/%d daemons running)",
5c9d1c83
DL
713 gs.numdaemons - gs.numdown, gs.numdaemons);
714 else {
715 flog_err(EC_WATCHFRR_CONNECTION,
3efd0893 716 "all configured daemons failed to start -- exiting watchfrr");
5c9d1c83
DL
717 exit(exitcode);
718
719 }
0a7c7856 720
5c9d1c83
DL
721 frr_detach();
722
33606a15 723 snprintf(started, sizeof(started), "%s/%s", frr_vtydir,
3c649c71
DS
724 "watchfrr.started");
725 fp = fopen(started, "w");
5c9d1c83
DL
726 if (fp)
727 fclose(fp);
247898d5
DL
728
729 systemd_send_started(master);
b3ee8bcc 730 systemd_send_status("FRR Operational");
5c9d1c83 731 sent = 1;
207e0d7a
DS
732}
733
a6810074 734static void daemon_up(struct daemon *dmn, const char *why)
8b886ca7 735{
a6810074
DL
736 dmn->state = DAEMON_UP;
737 gs.numdown--;
738 dmn->connect_tries = 0;
739 zlog_notice("%s state -> up : %s", dmn->name, why);
6d0fa5c2 740 if (gs.numdown == 0) {
5c9d1c83 741 daemon_send_ready(0);
6d0fa5c2 742
e16d030c 743 EVENT_OFF(gs.t_operational);
6d0fa5c2 744
907a2395
DS
745 event_add_timer(master, daemon_restarting_operational, NULL,
746 gs.operational_timeout, &gs.t_operational);
6d0fa5c2
DS
747 }
748
a8cbb8b3 749 SET_WAKEUP_ECHO(dmn);
a6810074 750 phase_check();
8b886ca7 751}
752
e6685141 753static void check_connect(struct event *t_write)
8b886ca7 754{
e16d030c 755 struct daemon *dmn = EVENT_ARG(t_write);
a6810074
DL
756 int sockerr;
757 socklen_t reslen = sizeof(sockerr);
758
759 dmn->t_write = NULL;
760 if (getsockopt(dmn->fd, SOL_SOCKET, SO_ERROR, (char *)&sockerr, &reslen)
761 < 0) {
762 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn->name,
763 safe_strerror(errno));
764 daemon_down(dmn,
765 "getsockopt failed checking connection success");
cc9f21da 766 return;
a6810074
DL
767 }
768 if ((reslen == sizeof(sockerr)) && sockerr) {
769 char why[100];
d62a17ae 770 snprintf(
771 why, sizeof(why),
772 "getsockopt reports that connection attempt failed: %s",
773 safe_strerror(sockerr));
a6810074 774 daemon_down(dmn, why);
cc9f21da 775 return;
a6810074
DL
776 }
777
778 daemon_up(dmn, "delayed connect succeeded");
8b886ca7 779}
780
e6685141 781static void wakeup_connect_hanging(struct event *t_wakeup)
8b886ca7 782{
e16d030c 783 struct daemon *dmn = EVENT_ARG(t_wakeup);
a6810074
DL
784 char why[100];
785
786 dmn->t_wakeup = NULL;
787 snprintf(why, sizeof(why),
788 "connection attempt timed out after %ld seconds", gs.timeout);
789 daemon_down(dmn, why);
8b886ca7 790}
791
792/* Making connection to protocol daemon. */
a6810074 793static int try_connect(struct daemon *dmn)
8b886ca7 794{
a6810074
DL
795 int sock;
796 struct sockaddr_un addr;
797 socklen_t len;
798
799 if (gs.loglevel > LOG_DEBUG + 1)
800 zlog_debug("%s: attempting to connect", dmn->name);
801 dmn->connect_tries++;
802
6006b807 803 memset(&addr, 0, sizeof(addr));
a6810074 804 addr.sun_family = AF_UNIX;
d62a17ae 805 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty", gs.vtydir,
806 dmn->name);
6f0e3f6e 807#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
a6810074 808 len = addr.sun_len = SUN_LEN(&addr);
8b886ca7 809#else
a6810074 810 len = sizeof(addr.sun_family) + strlen(addr.sun_path);
d62a17ae 811#endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
a6810074
DL
812
813 /* Quick check to see if we might succeed before we go to the trouble
814 of creating a socket. */
815 if (access(addr.sun_path, W_OK) < 0) {
816 if (errno != ENOENT)
450971aa 817 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
818 "%s: access to socket %s denied: %s",
819 dmn->name, addr.sun_path,
820 safe_strerror(errno));
a6810074
DL
821 return -1;
822 }
823
824 if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
450971aa 825 flog_err_sys(EC_LIB_SOCKET, "%s(%s): cannot make socket: %s",
09c866e3 826 __func__, addr.sun_path, safe_strerror(errno));
a6810074
DL
827 return -1;
828 }
829
830 if (set_nonblocking(sock) < 0 || set_cloexec(sock) < 0) {
450971aa 831 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
832 "%s(%s): set_nonblocking/cloexec(%d) failed",
833 __func__, addr.sun_path, sock);
a6810074
DL
834 close(sock);
835 return -1;
8b886ca7 836 }
a6810074
DL
837
838 if (connect(sock, (struct sockaddr *)&addr, len) < 0) {
839 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK)) {
840 if (gs.loglevel > LOG_DEBUG)
841 zlog_debug("%s(%s): connect failed: %s",
842 __func__, addr.sun_path,
843 safe_strerror(errno));
844 close(sock);
845 return -1;
846 }
847 if (gs.loglevel > LOG_DEBUG)
848 zlog_debug("%s: connection in progress", dmn->name);
849 dmn->state = DAEMON_CONNECTING;
850 dmn->fd = sock;
907a2395
DS
851 event_add_write(master, check_connect, dmn, dmn->fd,
852 &dmn->t_write);
853 event_add_timer(master, wakeup_connect_hanging, dmn, gs.timeout,
854 &dmn->t_wakeup);
a6810074
DL
855 SET_READ_HANDLER(dmn);
856 return 0;
857 }
858
859 dmn->fd = sock;
860 SET_READ_HANDLER(dmn);
861 daemon_up(dmn, "connect succeeded");
862 return 1;
8b886ca7 863}
864
e6685141 865static void phase_hanging(struct event *t_hanging)
8b886ca7 866{
a6810074 867 gs.t_phase_hanging = NULL;
f74ae2bb 868 flog_err(EC_WATCHFRR_CONNECTION,
1c50c1c0
QY
869 "Phase [%s] hanging for %ld seconds, aborting phased restart",
870 phase_str[gs.phase], PHASE_TIMEOUT);
a6810074 871 gs.phase = PHASE_NONE;
8b886ca7 872}
873
f1692c51 874static void set_phase(enum restart_phase new_phase)
8b886ca7 875{
a6810074 876 gs.phase = new_phase;
332beb64 877 event_cancel(&gs.t_phase_hanging);
b3d6bc6e 878
907a2395
DS
879 event_add_timer(master, phase_hanging, NULL, PHASE_TIMEOUT,
880 &gs.t_phase_hanging);
8b886ca7 881}
882
/*
 * Advance the restart state machine as far as current conditions allow.
 *
 * Called whenever something that a phase waits on may have changed (a
 * daemon going up or down, a background job exiting).  Each case either
 * breaks out because its condition is not yet met, or moves to the next
 * phase and deliberately falls through to test that one straight away.
 */
static void phase_check(void)
{
	struct daemon *dmn;

	switch (gs.phase) {
	case PHASE_NONE:
		break;

	case PHASE_INIT:
		/* Wait until every daemon has had its first connect attempt. */
		for (dmn = gs.daemons; dmn; dmn = dmn->next)
			if (dmn->state == DAEMON_INIT)
				return;

		/* startup complete, everything out of INIT */
		gs.phase = PHASE_NONE;
		for (dmn = gs.daemons; dmn; dmn = dmn->next)
			if (dmn->state == DAEMON_DOWN) {
				SET_WAKEUP_DOWN(dmn);
				try_restart(dmn);
			}
		break;
	case PHASE_STOPS_PENDING:
		/* Still waiting for outstanding background jobs to exit. */
		if (gs.numpids)
			break;
		zlog_info(
			"Phased restart: all routing daemon stop jobs have completed.");
		set_phase(PHASE_WAITING_DOWN);

		/*FALLTHRU*/
	case PHASE_WAITING_DOWN:
		/*
		 * gs.special is allowed to still be up here; every other
		 * daemon must be counted as down before continuing.
		 */
		if (gs.numdown + IS_UP(gs.special) < gs.numdaemons)
			break;
		systemd_send_status("Phased Restart");
		zlog_info("Phased restart: all routing daemons now down.");
		run_job(&gs.special->restart, "restart", gs.restart_command, 1,
			1);
		set_phase(PHASE_ZEBRA_RESTART_PENDING);

		/*FALLTHRU*/
	case PHASE_ZEBRA_RESTART_PENDING:
		/* The restart job for gs.special has not exited yet. */
		if (gs.special->restart.pid)
			break;
		systemd_send_status("Zebra Restarting");
		zlog_info("Phased restart: %s restart job completed.",
			  gs.special->name);
		set_phase(PHASE_WAITING_ZEBRA_UP);

		/*FALLTHRU*/
	case PHASE_WAITING_ZEBRA_UP:
		if (!IS_UP(gs.special))
			break;
		zlog_info("Phased restart: %s is now up.", gs.special->name);
		/* Start every other daemon; force the job, keep its interval. */
		for (dmn = gs.daemons; dmn; dmn = dmn->next) {
			if (dmn != gs.special)
				run_job(&dmn->restart, "start",
					gs.start_command, 1, 0);
		}
		gs.phase = PHASE_NONE;
		EVENT_OFF(gs.t_phase_hanging);
		zlog_notice("Phased global restart has completed.");
		break;
	}
}
946
a6810074 947static void try_restart(struct daemon *dmn)
8b886ca7 948{
f168b713 949 if (watch_only)
a6810074 950 return;
a6810074 951
f168b713
DL
952 if (dmn != gs.special) {
953 if ((gs.special->state == DAEMON_UP)
954 && (gs.phase == PHASE_NONE))
955 run_job(&dmn->restart, "restart", gs.restart_command, 0,
956 1);
957 else
958 zlog_debug(
3efd0893 959 "%s: postponing restart attempt because master %s daemon not up [%s], or phased restart in progress",
f168b713
DL
960 dmn->name, gs.special->name,
961 state_str[gs.special->state]);
962 return;
963 }
964
965 if ((gs.phase != PHASE_NONE) || gs.numpids) {
966 if (gs.loglevel > LOG_DEBUG + 1)
967 zlog_debug(
3efd0893 968 "postponing phased global restart: restart already in progress [%s], or outstanding child processes [%d]",
f168b713
DL
969 phase_str[gs.phase], gs.numpids);
970 return;
971 }
972 /* Is it too soon for a restart? */
973 {
974 struct timeval delay;
975 if (time_elapsed(&delay, &gs.special->restart.time)->tv_sec
976 < gs.special->restart.interval) {
a6810074 977 if (gs.loglevel > LOG_DEBUG + 1)
d62a17ae 978 zlog_debug(
3efd0893 979 "postponing phased global restart: elapsed time %ld < retry interval %ld",
f168b713
DL
980 (long)delay.tv_sec,
981 gs.special->restart.interval);
982 return;
a6810074 983 }
8b886ca7 984 }
f168b713 985 run_job(&gs.restart, "restart", gs.restart_command, 0, 1);
8b886ca7 986}
987
e6685141 988static void wakeup_unresponsive(struct event *t_wakeup)
8b886ca7 989{
e16d030c 990 struct daemon *dmn = EVENT_ARG(t_wakeup);
a6810074
DL
991
992 dmn->t_wakeup = NULL;
993 if (dmn->state != DAEMON_UNRESPONSIVE)
f74ae2bb 994 flog_err(EC_WATCHFRR_CONNECTION,
3efd0893 995 "%s: no longer unresponsive (now %s), wakeup should have been cancelled!",
1c50c1c0 996 dmn->name, state_str[dmn->state]);
a6810074
DL
997 else {
998 SET_WAKEUP_UNRESPONSIVE(dmn);
999 try_restart(dmn);
1000 }
8b886ca7 1001}
1002
e6685141 1003static void wakeup_no_answer(struct event *t_wakeup)
8b886ca7 1004{
e16d030c 1005 struct daemon *dmn = EVENT_ARG(t_wakeup);
a6810074
DL
1006
1007 dmn->t_wakeup = NULL;
1008 dmn->state = DAEMON_UNRESPONSIVE;
cc53b605 1009 if (dmn->ignore_timeout)
cc9f21da 1010 return;
f74ae2bb 1011 flog_err(EC_WATCHFRR_CONNECTION,
3efd0893 1012 "%s state -> unresponsive : no response yet to ping sent %ld seconds ago",
1c50c1c0 1013 dmn->name, gs.timeout);
71e7975a
DL
1014 SET_WAKEUP_UNRESPONSIVE(dmn);
1015 try_restart(dmn);
8b886ca7 1016}
1017
e6685141 1018static void wakeup_send_echo(struct event *t_wakeup)
8b886ca7 1019{
a6810074
DL
1020 static const char echocmd[] = "echo " PING_TOKEN;
1021 ssize_t rc;
e16d030c 1022 struct daemon *dmn = EVENT_ARG(t_wakeup);
a6810074
DL
1023
1024 dmn->t_wakeup = NULL;
d62a17ae 1025 if (((rc = write(dmn->fd, echocmd, sizeof(echocmd))) < 0)
1026 || ((size_t)rc != sizeof(echocmd))) {
a6810074
DL
1027 char why[100 + sizeof(echocmd)];
1028 snprintf(why, sizeof(why),
1029 "write '%s' returned %d instead of %u", echocmd,
d7c0a89a 1030 (int)rc, (unsigned int)sizeof(echocmd));
a6810074
DL
1031 daemon_down(dmn, why);
1032 } else {
1033 gettimeofday(&dmn->echo_sent, NULL);
907a2395
DS
1034 event_add_timer(master, wakeup_no_answer, dmn, gs.timeout,
1035 &dmn->t_wakeup);
a6810074 1036 }
8b886ca7 1037}
1038
470bc619
QY
1039bool check_all_up(void)
1040{
1041 struct daemon *dmn;
1042
1043 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1044 if (dmn->state != DAEMON_UP)
1045 return false;
1046 return true;
1047}
1048
af568444
DL
1049void watchfrr_status(struct vty *vty)
1050{
1051 struct daemon *dmn;
1052 struct timeval delay;
1053
1054 vty_out(vty, "watchfrr global phase: %s\n", phase_str[gs.phase]);
603fef0e
DS
1055 vty_out(vty, " Restart Command: %pSQq\n", gs.restart_command);
1056 vty_out(vty, " Start Command: %pSQq\n", gs.start_command);
1057 vty_out(vty, " Stop Command: %pSQq\n", gs.stop_command);
1058 vty_out(vty, " Min Restart Interval: %ld\n", gs.min_restart_interval);
1059 vty_out(vty, " Max Restart Interval: %ld\n", gs.max_restart_interval);
1060 vty_out(vty, " Restart Timeout: %ld\n", gs.restart_timeout);
2ab760f0
DA
1061 vty_out(vty, " Reading Configuration: %s\n",
1062 gs.reading_configuration ? "yes" : "no");
af568444
DL
1063 if (gs.restart.pid)
1064 vty_out(vty, " global restart running, pid %ld\n",
1065 (long)gs.restart.pid);
1066
1067 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
cc53b605
DS
1068 vty_out(vty, " %-20s %s%s", dmn->name, state_str[dmn->state],
1069 dmn->ignore_timeout ? "/Ignoring Timeout\n" : "\n");
af568444
DL
1070 if (dmn->restart.pid)
1071 vty_out(vty, " restart running, pid %ld\n",
1072 (long)dmn->restart.pid);
1073 else if (dmn->state == DAEMON_DOWN &&
1074 time_elapsed(&delay, &dmn->restart.time)->tv_sec
1075 < dmn->restart.interval)
3efd0893 1076 vty_out(vty, " restarting in %jd seconds (%jds backoff interval)\n",
051a0be4
DL
1077 (intmax_t)dmn->restart.interval
1078 - (intmax_t)delay.tv_sec,
1079 (intmax_t)dmn->restart.interval);
af568444
DL
1080 }
1081}
1082
/*
 * Handler registered for SIGINT and SIGTERM in watchfrr_signals: log the
 * termination, tell systemd we are stopping, and exit with status 0.
 */
static void sigint(void)
{
	zlog_notice("Terminating on signal");

	systemd_send_stopping();

	exit(0);
}
1089
/*
 * Check that `cmd` is usable as a command template: it must contain exactly
 * one '%' character, and that character must be immediately followed by 's'.
 *
 * Returns 1 if the template is valid, 0 otherwise (including cmd == NULL).
 */
static int valid_command(const char *cmd)
{
	const char *pct;

	if (cmd == NULL)
		return 0;

	/* Locate the first (and required) conversion. */
	pct = strchr(cmd, '%');
	if (pct == NULL)
		return 0;

	if (pct[1] != 's')
		return 0;

	/* No further '%' may follow. */
	return strchr(pct + 1, '%') == NULL;
}
1100
/* This is an ugly hack to circumvent problems with passing command-line
   arguments that contain spaces.  The fix is to use a configuration file.

   Returns a newly allocated copy of `cmd` in which every occurrence of the
   placeholder `blankstr` has been replaced by a single space.  The caller
   owns the returned string.  Exits the process if the copy cannot be
   allocated.

   An empty `blankstr` leaves the copy unchanged.  The search resumes right
   after each inserted space, so a placeholder that itself contains a space
   (e.g. " ") terminates instead of matching its own replacement forever. */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	char *res;
	char *p;
	size_t bslen = strlen(blankstr);

	if (!(res = strdup(cmd))) {
		perror("strdup");
		exit(1);
	}

	/* strstr() matches "" at every position; nothing to translate. */
	if (bslen == 0)
		return res;

	p = res;
	while ((p = strstr(p, blankstr)) != NULL) {
		*p = ' ';
		/* Close the gap left by a multi-character placeholder;
		 * the +1 carries the terminating NUL along.
		 */
		if (bslen != 1)
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
		/* Continue after the blank we just wrote. */
		p++;
	}
	return res;
}
1120
/*
 * Timer callback armed in watchfrr_init(): once STARTUP_TIMEOUT has expired,
 * call daemon_send_ready(1).  The event argument is unused.
 */
static void startup_timeout(struct event *t_wakeup)
{
	(void)t_wakeup;

	daemon_send_ready(1);
}
1125
33606a15
DL
1126#ifdef GNU_LINUX
1127
1128#include <sys/mount.h>
1129#include <sched.h>
1130
1131#define NETNS_RUN_DIR "/var/run/netns"
1132
1133static void netns_create(int dirfd, const char *nsname)
1134{
1135 /* make /var/run/netns shared between mount namespaces
1136 * just like iproute2 sets it up
1137 */
1138 if (mount("", NETNS_RUN_DIR, "none", MS_SHARED | MS_REC, NULL)) {
1139 if (errno != EINVAL) {
1140 perror("mount");
1141 exit(1);
1142 }
1143
1144 if (mount(NETNS_RUN_DIR, NETNS_RUN_DIR, "none",
1145 MS_BIND | MS_REC, NULL)) {
1146 perror("mount");
1147 exit(1);
1148 }
1149
1150 if (mount("", NETNS_RUN_DIR, "none", MS_SHARED | MS_REC,
1151 NULL)) {
1152 perror("mount");
1153 exit(1);
1154 }
1155 }
1156
1157 /* need an empty file to mount on top of */
1158 int nsfd = openat(dirfd, nsname, O_CREAT | O_RDONLY | O_EXCL, 0);
1159
1160 if (nsfd < 0) {
1161 fprintf(stderr, "failed to create \"%s/%s\": %s\n",
1162 NETNS_RUN_DIR, nsname, strerror(errno));
1163 exit(1);
1164 }
1165 close(nsfd);
1166
1167 if (unshare(CLONE_NEWNET)) {
1168 perror("unshare");
1169 unlinkat(dirfd, nsname, 0);
1170 exit(1);
1171 }
1172
1173 char *dstpath = asprintfrr(MTYPE_TMP, "%s/%s", NETNS_RUN_DIR, nsname);
1174
1175 /* bind-mount so the namespace has a name and is persistent */
1176 if (mount("/proc/self/ns/net", dstpath, "none", MS_BIND, NULL) < 0) {
1177 fprintf(stderr, "failed to bind-mount netns to \"%s\": %s\n",
1178 dstpath, strerror(errno));
1179 unlinkat(dirfd, nsname, 0);
1180 exit(1);
1181 }
1182
1183 XFREE(MTYPE_TMP, dstpath);
1184}
1185
1186static void netns_setup(const char *nsname)
1187{
1188 int dirfd, nsfd;
1189
1190 dirfd = open(NETNS_RUN_DIR, O_DIRECTORY | O_RDONLY);
1191 if (dirfd < 0) {
1192 if (errno == ENOTDIR) {
1193 fprintf(stderr, "error: \"%s\" is not a directory!\n",
1194 NETNS_RUN_DIR);
1195 exit(1);
1196 } else if (errno == ENOENT) {
1197 if (mkdir(NETNS_RUN_DIR, 0755)) {
1198 fprintf(stderr, "error: \"%s\": mkdir: %s\n",
1199 NETNS_RUN_DIR, strerror(errno));
1200 exit(1);
1201 }
1202 dirfd = open(NETNS_RUN_DIR, O_DIRECTORY | O_RDONLY);
1203 if (dirfd < 0) {
1204 fprintf(stderr, "error: \"%s\": opendir: %s\n",
1205 NETNS_RUN_DIR, strerror(errno));
1206 exit(1);
1207 }
1208 } else {
1209 fprintf(stderr, "error: \"%s\": %s\n",
1210 NETNS_RUN_DIR, strerror(errno));
1211 exit(1);
1212 }
1213 }
1214
1215 nsfd = openat(dirfd, nsname, O_RDONLY);
1216 if (nsfd < 0 && errno != ENOENT) {
1217 fprintf(stderr, "error: \"%s/%s\": %s\n",
1218 NETNS_RUN_DIR, nsname, strerror(errno));
1219 exit(1);
1220 }
1221 if (nsfd < 0)
1222 netns_create(dirfd, nsname);
1223 else {
1224 if (setns(nsfd, CLONE_NEWNET)) {
1225 perror("setns");
1226 exit(1);
1227 }
1228 close(nsfd);
1229 }
1230 close(dirfd);
1231
1232 /* make sure loopback is up... weird things happen otherwise.
1233 * ioctl is perfectly fine for this, don't need netlink...
1234 */
1235 int sockfd;
1236 struct ifreq ifr = { };
1237
1238 strlcpy(ifr.ifr_name, "lo", sizeof(ifr.ifr_name));
1239
1240 sockfd = socket(AF_INET, SOCK_DGRAM, 0);
1241 if (sockfd < 0) {
1242 perror("socket");
1243 exit(1);
1244 }
1245 if (ioctl(sockfd, SIOCGIFFLAGS, &ifr)) {
1246 perror("ioctl(SIOCGIFFLAGS, \"lo\")");
1247 exit(1);
1248 }
1249 if (!(ifr.ifr_flags & IFF_UP)) {
1250 ifr.ifr_flags |= IFF_UP;
1251 if (ioctl(sockfd, SIOCSIFFLAGS, &ifr)) {
1252 perror("ioctl(SIOCSIFFLAGS, \"lo\")");
1253 exit(1);
1254 }
1255 }
1256 close(sockfd);
1257}
1258
1259#else /* !GNU_LINUX */
1260
/*
 * Stub used when GNU_LINUX is not defined: report that network namespaces
 * are unsupported and exit with status 1.  `nsname` is unused.
 */
static void netns_setup(const char *nsname)
{
	(void)nsname;

	fprintf(stderr, "network namespaces are only available on Linux\n");
	exit(1);
}
1266#endif
1267
2ab760f0
DA
1268static void watchfrr_start_config(void)
1269{
1270 gs.reading_configuration = true;
1271}
1272
1273static void watchfrr_end_config(void)
1274{
1275 gs.reading_configuration = false;
1276}
1277
0a7c7856
DL
1278static void watchfrr_init(int argc, char **argv)
1279{
1280 const char *special = "zebra";
1281 int i;
1282 struct daemon *dmn, **add = &gs.daemons;
1283 char alldaemons[512] = "", *p = alldaemons;
1284
907a2395
DS
1285 event_add_timer_msec(master, startup_timeout, NULL, STARTUP_TIMEOUT,
1286 &gs.t_startup_timeout);
5c9d1c83 1287
0a7c7856
DL
1288 for (i = optind; i < argc; i++) {
1289 dmn = XCALLOC(MTYPE_WATCHFRR_DAEMON, sizeof(*dmn));
1290
1291 dmn->name = dmn->restart.name = argv[i];
1292 dmn->state = DAEMON_INIT;
1293 gs.numdaemons++;
1294 gs.numdown++;
1295 dmn->fd = -1;
907a2395
DS
1296 event_add_timer_msec(master, wakeup_init, dmn, 0,
1297 &dmn->t_wakeup);
0a7c7856
DL
1298 dmn->restart.interval = gs.min_restart_interval;
1299 *add = dmn;
1300 add = &dmn->next;
1301
1302 if (!strcmp(dmn->name, special))
1303 gs.special = dmn;
1304 }
1305
1306 if (!gs.daemons) {
1307 fprintf(stderr,
1308 "Must specify one or more daemons to monitor.\n\n");
1309 frr_help_exit(1);
1310 }
1311 if (!watch_only && !gs.special) {
1312 fprintf(stderr, "\"%s\" daemon must be in daemon lists\n\n",
1313 special);
1314 frr_help_exit(1);
1315 }
1316
1317 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1318 snprintf(p, alldaemons + sizeof(alldaemons) - p, "%s%s",
1319 (p == alldaemons) ? "" : " ", dmn->name);
1320 p += strlen(p);
1321 }
1322 zlog_notice("%s %s watching [%s]%s", progname, FRR_VERSION, alldaemons,
1323 watch_only ? ", monitor mode" : "");
1324}
1325
a6810074 1326struct zebra_privs_t watchfrr_privs = {
95c4aff2 1327#ifdef VTY_GROUP
a6810074 1328 .vty_group = VTY_GROUP,
95c4aff2
DL
1329#endif
1330};
1331
7cc91e67 1332static struct frr_signal_t watchfrr_signals[] = {
4f04a76b
DL
1333 {
1334 .signal = SIGINT,
1335 .handler = sigint,
1336 },
1337 {
1338 .signal = SIGTERM,
1339 .handler = sigint,
1340 },
1341 {
1342 .signal = SIGCHLD,
1343 .handler = sigchild,
1344 },
1345};
1346
1347FRR_DAEMON_INFO(watchfrr, WATCHFRR,
d62a17ae 1348 .flags = FRR_NO_PRIVSEP | FRR_NO_TCPVTY | FRR_LIMITED_CLI
0a7c7856
DL
1349 | FRR_NO_CFG_PID_DRY | FRR_NO_ZCLIENT
1350 | FRR_DETACH_LATER,
4f04a76b 1351
d62a17ae 1352 .printhelp = printhelp,
1353 .copyright = "Copyright 2004 Andrew J. Schorr",
4f04a76b 1354
d62a17ae 1355 .signals = watchfrr_signals,
1356 .n_signals = array_size(watchfrr_signals),
4f04a76b 1357
80413c20
DL
1358 .privs = &watchfrr_privs,
1359);
4f04a76b 1360
999f153e
DL
1361#define DEPRECATED_OPTIONS "aAezR:"
1362
a6810074 1363int main(int argc, char **argv)
8b886ca7 1364{
a6810074 1365 int opt;
a6810074 1366 const char *blankstr = NULL;
33606a15
DL
1367 const char *netns = NULL;
1368 bool netns_en = false;
a6810074 1369
4f04a76b
DL
1370 frr_preinit(&watchfrr_di, argc, argv);
1371 progname = watchfrr_di.progname;
1372
33606a15 1373 frr_opt_add("b:di:k:l:N:p:r:S:s:t:T:" DEPRECATED_OPTIONS, longopts, "");
a6810074
DL
1374
1375 gs.restart.name = "all";
4f04a76b 1376 while ((opt = frr_getopt(argc, argv, NULL)) != EOF) {
999f153e
DL
1377 if (opt && opt < 128 && strchr(DEPRECATED_OPTIONS, opt)) {
1378 fprintf(stderr,
1379 "The -%c option no longer exists.\n"
1380 "Please refer to the watchfrr(8) man page.\n",
1381 opt);
1382 exit(1);
1383 }
1384
a6810074
DL
1385 switch (opt) {
1386 case 0:
1387 break;
a6810074
DL
1388 case 'b':
1389 blankstr = optarg;
1390 break;
f168b713
DL
1391 case OPTION_DRY:
1392 watch_only = true;
a6810074
DL
1393 break;
1394 case 'k':
1395 if (!valid_command(optarg)) {
1396 fprintf(stderr,
1397 "Invalid kill command, must contain '%%s': %s\n",
1398 optarg);
4f04a76b 1399 frr_help_exit(1);
a6810074
DL
1400 }
1401 gs.stop_command = optarg;
1402 break;
d62a17ae 1403 case 'l': {
1404 char garbage[3];
1405 if ((sscanf(optarg, "%d%1s", &gs.loglevel, garbage)
1406 != 1)
1407 || (gs.loglevel < LOG_EMERG)) {
1408 fprintf(stderr,
1409 "Invalid loglevel argument: %s\n",
1410 optarg);
1411 frr_help_exit(1);
a6810074 1412 }
d62a17ae 1413 } break;
1414 case OPTION_MINRESTART: {
1415 char garbage[3];
1416 if ((sscanf(optarg, "%ld%1s", &gs.min_restart_interval,
1417 garbage)
1418 != 1)
1419 || (gs.min_restart_interval < 0)) {
1420 fprintf(stderr,
1421 "Invalid min_restart_interval argument: %s\n",
1422 optarg);
1423 frr_help_exit(1);
a6810074 1424 }
d62a17ae 1425 } break;
1426 case OPTION_MAXRESTART: {
1427 char garbage[3];
1428 if ((sscanf(optarg, "%ld%1s", &gs.max_restart_interval,
1429 garbage)
1430 != 1)
1431 || (gs.max_restart_interval < 0)) {
1432 fprintf(stderr,
1433 "Invalid max_restart_interval argument: %s\n",
1434 optarg);
1435 frr_help_exit(1);
a6810074 1436 }
d62a17ae 1437 } break;
6d0fa5c2
DS
1438 case OPTION_MAXOPERATIONAL: {
1439 char garbage[3];
1440
1441 if ((sscanf(optarg, "%ld%1s", &gs.operational_timeout,
1442 garbage) != 1) ||
7a8120da 1443 (gs.operational_timeout < 0)) {
6d0fa5c2
DS
1444 fprintf(stderr,
1445 "Invalid Operational_timeout argument: %s\n",
1446 optarg);
1447 frr_help_exit(1);
1448 }
1449 } break;
33606a15
DL
1450 case OPTION_NETNS:
1451 netns_en = true;
b12bc77c 1452 if (optarg && strchr(optarg, '/')) {
33606a15
DL
1453 fprintf(stderr,
1454 "invalid network namespace name \"%s\" (may not contain slashes)\n",
1455 optarg);
1456 frr_help_exit(1);
1457 }
1458 netns = optarg;
1459 break;
d62a17ae 1460 case 'i': {
1461 char garbage[3];
1462 int period;
1463 if ((sscanf(optarg, "%d%1s", &period, garbage) != 1)
1464 || (gs.period < 1)) {
1465 fprintf(stderr,
1466 "Invalid interval argument: %s\n",
1467 optarg);
1468 frr_help_exit(1);
a6810074 1469 }
d62a17ae 1470 gs.period = 1000 * period;
1471 } break;
a6810074 1472 case 'p':
0a7c7856 1473 watchfrr_di.pid_file = optarg;
a6810074
DL
1474 break;
1475 case 'r':
a6810074
DL
1476 if (!valid_command(optarg)) {
1477 fprintf(stderr,
1478 "Invalid restart command, must contain '%%s': %s\n",
1479 optarg);
4f04a76b 1480 frr_help_exit(1);
a6810074
DL
1481 }
1482 gs.restart_command = optarg;
a6810074
DL
1483 break;
1484 case 's':
1485 if (!valid_command(optarg)) {
1486 fprintf(stderr,
1487 "Invalid start command, must contain '%%s': %s\n",
1488 optarg);
4f04a76b 1489 frr_help_exit(1);
a6810074
DL
1490 }
1491 gs.start_command = optarg;
1492 break;
1493 case 'S':
1494 gs.vtydir = optarg;
1495 break;
d62a17ae 1496 case 't': {
1497 char garbage[3];
1498 if ((sscanf(optarg, "%ld%1s", &gs.timeout, garbage)
1499 != 1)
1500 || (gs.timeout < 1)) {
1501 fprintf(stderr,
1502 "Invalid timeout argument: %s\n",
1503 optarg);
1504 frr_help_exit(1);
a6810074 1505 }
d62a17ae 1506 } break;
1507 case 'T': {
1508 char garbage[3];
1509 if ((sscanf(optarg, "%ld%1s", &gs.restart_timeout,
1510 garbage)
1511 != 1)
1512 || (gs.restart_timeout < 1)) {
1513 fprintf(stderr,
1514 "Invalid restart timeout argument: %s\n",
1515 optarg);
1516 frr_help_exit(1);
a6810074 1517 }
d62a17ae 1518 } break;
a6810074
DL
1519 default:
1520 fputs("Invalid option.\n", stderr);
4f04a76b 1521 frr_help_exit(1);
a6810074 1522 }
8b886ca7 1523 }
a6810074 1524
71e7975a
DL
1525 if (watch_only
1526 && (gs.start_command || gs.stop_command || gs.restart_command)) {
d87ae5cc 1527 fputs("Options -r/-s/-k are not used when --dry is active.\n",
a6810074 1528 stderr);
8b886ca7 1529 }
f168b713
DL
1530 if (!watch_only
1531 && (!gs.restart_command || !gs.start_command || !gs.stop_command)) {
1532 fprintf(stderr,
1533 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1534 frr_help_exit(1);
8b886ca7 1535 }
8b886ca7 1536
a6810074
DL
1537 if (blankstr) {
1538 if (gs.restart_command)
1539 gs.restart_command =
d62a17ae 1540 translate_blanks(gs.restart_command, blankstr);
a6810074
DL
1541 if (gs.start_command)
1542 gs.start_command =
d62a17ae 1543 translate_blanks(gs.start_command, blankstr);
a6810074
DL
1544 if (gs.stop_command)
1545 gs.stop_command =
d62a17ae 1546 translate_blanks(gs.stop_command, blankstr);
065de903 1547 }
8b886ca7 1548
a6810074 1549 gs.restart.interval = gs.min_restart_interval;
8b886ca7 1550
33606a15
DL
1551 /* env variable for the processes that we start */
1552 if (watchfrr_di.pathspace)
1553 setenv("FRR_PATHSPACE", watchfrr_di.pathspace, 1);
1554 else
1555 unsetenv("FRR_PATHSPACE");
1556
a91f5417
DS
1557 /*
1558 * when watchfrr_di.pathspace is read, if it is not specified
1559 * pathspace is NULL as expected
1560 */
1561 pathspace = watchfrr_di.pathspace;
1562
33606a15
DL
1563 if (netns_en && !netns)
1564 netns = watchfrr_di.pathspace;
a91f5417 1565
33606a15
DL
1566 if (netns_en && netns && netns[0])
1567 netns_setup(netns);
1568
4f04a76b 1569 master = frr_init();
b647dc2a 1570 watchfrr_error_init();
0a7c7856 1571 watchfrr_init(argc, argv);
2ab760f0 1572 cmd_init_config_callbacks(watchfrr_start_config, watchfrr_end_config);
0a7c7856
DL
1573 watchfrr_vty_init();
1574
1575 frr_config_fork();
4f04a76b 1576
0a7c7856 1577 if (watchfrr_di.daemon_mode)
0bdeb5e5 1578 zlog_syslog_set_prio_min(MIN(gs.loglevel, LOG_DEBUG));
0a7c7856 1579 else
0bdeb5e5 1580 zlog_aux_init(NULL, MIN(gs.loglevel, LOG_DEBUG));
8b886ca7 1581
0a7c7856 1582 frr_run(master);
8b886ca7 1583
a6810074
DL
1584 systemd_send_stopping();
1585 /* Not reached. */
1586 return 0;
8b886ca7 1587}