]> git.proxmox.com Git - mirror_frr.git/blame - watchfrr/watchfrr.c
zebra: i declaration shadows other i declared
[mirror_frr.git] / watchfrr / watchfrr.c
CommitLineData
8b886ca7 1/*
896014f4
DL
2 * Monitor status of frr daemons and restart if necessary.
3 *
4 * Copyright (C) 2004 Andrew J. Schorr
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
8b886ca7 19 */
20
a365534f 21#include <zebra.h>
8b886ca7 22#include <thread.h>
23#include <log.h>
52e66296 24#include <network.h>
8b886ca7 25#include <sigevent.h>
a365534f 26#include <lib/version.h>
95c4aff2 27#include "command.h"
4f04a76b 28#include "libfrr.h"
b647dc2a 29#include "lib_errors.h"
0bdeb5e5 30#include "zlog_targets.h"
5920b3eb 31#include "network.h"
33606a15 32#include "printfrr.h"
95c4aff2 33
6f594023 34#include <getopt.h>
a365534f 35#include <sys/un.h>
36#include <sys/wait.h>
837d16cc 37#include <memory.h>
651415bd 38#include <systemd.h>
8b886ca7 39
9473e340 40#include "watchfrr.h"
b647dc2a 41#include "watchfrr_errors.h"
95c4aff2 42
/* Smaller of two values. Each argument may be evaluated more than once. */
#ifndef MIN
#define MIN(X, Y) (((X) <= (Y)) ? (X) : (Y))
#endif

/*
 * Macros to help randomize timers.
 *
 * JITTER(X) yields an offset between -(X)/2 and (X) - (X)/2, assuming
 * frr_weak_random() never returns a negative value.
 * FUZZY(X) is X shifted by a jitter of one twentieth of X.
 */
#define JITTER(X) ((frr_weak_random() % ((X)+1))-((X)/2))
#define FUZZY(X) ((X)+JITTER((X)/20))

/* Compiled-in defaults; printhelp() reports every one of them. */
#define DEFAULT_PERIOD 5
#define DEFAULT_TIMEOUT 90
#define DEFAULT_RESTART_TIMEOUT 20
#define DEFAULT_LOGLEVEL LOG_INFO
#define DEFAULT_MIN_RESTART 60
#define DEFAULT_MAX_RESTART 600
#define DEFAULT_OPERATIONAL_TIMEOUT 60

/* Default job command templates; "%s" is replaced by the daemon name. */
#define DEFAULT_RESTART_CMD WATCHFRR_SH_PATH " restart %s"
#define DEFAULT_START_CMD WATCHFRR_SH_PATH " start %s"
#define DEFAULT_STOP_CMD WATCHFRR_SH_PATH " stop %s"

/* Text handle_read() expects at the start of an echo reply. */
#define PING_TOKEN "PING"
64
bf8d3d6a
DL
65DEFINE_MGROUP(WATCHFRR, "watchfrr");
66DEFINE_MTYPE_STATIC(WATCHFRR, WATCHFRR_DAEMON, "watchfrr daemon entry");
0a7c7856 67
55c72803 68/* Needs to be global, referenced somewhere inside libfrr. */
8b886ca7 69struct thread_master *master;
70
f168b713 71static bool watch_only = false;
a91f5417 72const char *pathspace;
8b886ca7 73
/*
 * Phases of startup and of the phased global restart.  phase_check()
 * walks the restart phases in the order they are declared here.
 */
enum restart_phase {
	PHASE_NONE = 0,
	PHASE_INIT,
	PHASE_STOPS_PENDING,
	PHASE_WAITING_DOWN,
	PHASE_ZEBRA_RESTART_PENDING,
	PHASE_WAITING_ZEBRA_UP
};

/*
 * Human-readable description of each phase, indexed by enum restart_phase.
 * Designated initializers keep every string attached to its enumerator, so
 * the table cannot silently drift out of step with the enum (it used to
 * carry a trailing entry that no enumerator referred to).
 */
static const char *const phase_str[] = {
	[PHASE_NONE] = "Idle",
	[PHASE_INIT] = "Startup",
	[PHASE_STOPS_PENDING] = "Stop jobs running",
	[PHASE_WAITING_DOWN] = "Waiting for other daemons to come down",
	[PHASE_ZEBRA_RESTART_PENDING] = "Zebra restart job running",
	[PHASE_WAITING_ZEBRA_UP] = "Waiting for zebra to come up",
};
92
/*
 * Longest time a single restart phase may last before phase_hanging()
 * aborts the phased restart; three times the configured restart timeout.
 */
#define PHASE_TIMEOUT (3*gs.restart_timeout)
/*
 * Parenthesized so the expansion behaves as one operand inside larger
 * expressions (e.g. x / STARTUP_TIMEOUT).  Presumably milliseconds -- the
 * user of this macro is not visible here; TODO confirm.
 */
#define STARTUP_TIMEOUT (55 * 1000)
8b886ca7 95
a6810074
DL
/* Bookkeeping for one background job (start/stop/restart command). */
struct restart_info {
	/* Name substituted into the command template and used in logs. */
	const char *name;
	/* Job type string recorded by run_job() ("restart", "start", ...). */
	const char *what;
	/* Pid of the running job; 0 when no job is outstanding. */
	pid_t pid;
	/* Set when a job is launched and again when sigchild() reaps it. */
	struct timeval time;
	/* Minimum number of seconds run_job() waits between unforced jobs. */
	long interval;
	/* Timer that fires restart_kill() if the job runs too long. */
	struct thread *t_kill;
	/* Signals sent to the job so far: SIGTERM first, SIGKILL afterwards. */
	int kills;
};
105
a6810074 106static struct global_state {
f1692c51 107 enum restart_phase phase;
a6810074 108 struct thread *t_phase_hanging;
5c9d1c83 109 struct thread *t_startup_timeout;
6d0fa5c2 110 struct thread *t_operational;
a6810074
DL
111 const char *vtydir;
112 long period;
113 long timeout;
114 long restart_timeout;
2ab760f0 115 bool reading_configuration;
a6810074
DL
116 long min_restart_interval;
117 long max_restart_interval;
6d0fa5c2 118 long operational_timeout;
a6810074
DL
119 struct daemon *daemons;
120 const char *restart_command;
121 const char *start_command;
122 const char *stop_command;
123 struct restart_info restart;
a6810074 124 int loglevel;
d62a17ae 125 struct daemon *special; /* points to zebra when doing phased restart */
a6810074
DL
126 int numdaemons;
127 int numpids;
d62a17ae 128 int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
8b886ca7 129} gs = {
c0e5cb52 130 .phase = PHASE_INIT,
64a249ad 131 .vtydir = frr_vtydir,
d62a17ae 132 .period = 1000 * DEFAULT_PERIOD,
133 .timeout = DEFAULT_TIMEOUT,
134 .restart_timeout = DEFAULT_RESTART_TIMEOUT,
135 .loglevel = DEFAULT_LOGLEVEL,
136 .min_restart_interval = DEFAULT_MIN_RESTART,
137 .max_restart_interval = DEFAULT_MAX_RESTART,
6d0fa5c2 138 .operational_timeout = DEFAULT_OPERATIONAL_TIMEOUT,
3ec95567
DL
139 .restart_command = DEFAULT_RESTART_CMD,
140 .start_command = DEFAULT_START_CMD,
141 .stop_command = DEFAULT_STOP_CMD,
d62a17ae 142};
a6810074 143
/* Connection state watchfrr tracks for each monitored daemon. */
enum daemon_state {
	DAEMON_INIT,
	DAEMON_DOWN,
	DAEMON_CONNECTING,
	DAEMON_UP,
	DAEMON_UNRESPONSIVE
};

/* True when the daemon is connected, whether or not it answers echoes. */
#define IS_UP(DMN)                                                             \
	(((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))

/* Printable name of each daemon_state, indexed by the enumerator. */
static const char *const state_str[] = {
	[DAEMON_INIT] = "Init",
	[DAEMON_DOWN] = "Down",
	[DAEMON_CONNECTING] = "Connecting",
	[DAEMON_UP] = "Up",
	[DAEMON_UNRESPONSIVE] = "Unresponsive",
};
158
159struct daemon {
a6810074 160 const char *name;
c3db4ca8 161 enum daemon_state state;
a6810074
DL
162 int fd;
163 struct timeval echo_sent;
d7c0a89a 164 unsigned int connect_tries;
a6810074
DL
165 struct thread *t_wakeup;
166 struct thread *t_read;
167 struct thread *t_write;
168 struct daemon *next;
169 struct restart_info restart;
cc53b605
DS
170
171 /*
172 * For a given daemon, if we've turned on ignore timeouts
173 * ignore the timeout value and assume everything is ok
174 * This is for daemon debugging w/ gdb after we have started
175 * FRR and realize we have something that needs to be looked
176 * at
177 */
178 bool ignore_timeout;
8b886ca7 179};
180
9272302b
DL
/* Values returned by option parsing for options without a short letter. */
#define OPTION_MINRESTART 2000
#define OPTION_MAXRESTART 2001
#define OPTION_DRY 2002
#define OPTION_NETNS 2003
#define OPTION_MAXOPERATIONAL 2004

/* Long command-line options, terminated by an all-zero entry. */
static const struct option longopts[] = {
	{"daemon", no_argument, NULL, 'd'},
	{"statedir", required_argument, NULL, 'S'},
	{"loglevel", required_argument, NULL, 'l'},
	{"interval", required_argument, NULL, 'i'},
	{"timeout", required_argument, NULL, 't'},
	{"restart-timeout", required_argument, NULL, 'T'},
	{"restart", required_argument, NULL, 'r'},
	{"start-command", required_argument, NULL, 's'},
	{"kill-command", required_argument, NULL, 'k'},
	{"dry", no_argument, NULL, OPTION_DRY},
	{"min-restart-interval", required_argument, NULL, OPTION_MINRESTART},
	{"max-restart-interval", required_argument, NULL, OPTION_MAXRESTART},
	{"operational-timeout", required_argument, NULL,
	 OPTION_MAXOPERATIONAL},
	{"pid-file", required_argument, NULL, 'p'},
	{"blank-string", required_argument, NULL, 'b'},
#ifdef GNU_LINUX
	{"netns", optional_argument, NULL, OPTION_NETNS},
#endif
	{"help", no_argument, NULL, 'h'},
	{"version", no_argument, NULL, 'v'},
	{NULL, 0, NULL, 0},
};
8b886ca7 209
210static int try_connect(struct daemon *dmn);
cc9f21da 211static void wakeup_send_echo(struct thread *t_wakeup);
8b886ca7 212static void try_restart(struct daemon *dmn);
213static void phase_check(void);
75f8b0e4 214static void restart_done(struct daemon *dmn);
8b886ca7 215
4f04a76b 216static const char *progname;
cc53b605
DS
217
218void watchfrr_set_ignore_daemon(struct vty *vty, const char *dname, bool ignore)
219{
220 struct daemon *dmn;
221
222 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
223 if (strncmp(dmn->name, dname, strlen(dmn->name)) == 0)
224 break;
225 }
226
227 if (dmn) {
228 dmn->ignore_timeout = ignore;
229 vty_out(vty, "%s switching to %s\n", dmn->name,
230 ignore ? "ignore" : "watch");
231 } else
232 vty_out(vty, "%s is not configured for running at the moment",
233 dname);
234}
235
/*
 * Write the usage summary to `target`: first a description of what the
 * program does, then the option list.  The option list is filled in with
 * the compiled-in defaults (frr_vtydir and the DEFAULT_* macros) and the
 * LOG_EMERG / LOG_DEBUG level numbers.
 */
4f04a76b 236static void printhelp(FILE *target)
8b886ca7 237{
d62a17ae 238 fprintf(target,
239 "Usage : %s [OPTION...] <daemon name> ...\n\n\
9473e340 240Watchdog program to monitor status of frr daemons and try to restart\n\
8b886ca7 241them if they are down or unresponsive. It determines whether a daemon is\n\
242up based on whether it can connect to the daemon's vty unix stream socket.\n\
243It then repeatedly sends echo commands over that socket to determine whether\n\
244the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
245on the socket connection and know immediately that the daemon is down.\n\n\
246The daemons to be monitored should be listed on the command line.\n\n\
8b886ca7 247In order to avoid attempting to restart the daemons in a fast loop,\n\
248the -m and -M options allow you to control the minimum delay between\n\
249restart commands. The minimum restart delay is recalculated each time\n\
250a restart is attempted: if the time since the last restart attempt exceeds\n\
251twice the -M value, then the restart delay is set to the -m value.\n\
d62a17ae 252Otherwise, the interval is doubled (but capped at the -M value).\n\n",
f168b713 253 progname);
e757c940 254
d62a17ae 255 fprintf(target,
256 "Options:\n\
8b886ca7 257-d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
258 to syslog instead of stdout.\n\
259-S, --statedir Set the vty socket directory (default is %s)\n\
33606a15
DL
260-N, --pathspace Insert prefix into config & socket paths\n"
261#ifdef GNU_LINUX
262" --netns Create and/or use Linux network namespace. If no name is\n"
263" given, uses the value from `-N`.\n"
264#endif
265"-l, --loglevel Set the logging level (default is %d).\n\
8b886ca7 266 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
267 but it can be set higher than %d if extra-verbose debugging\n\
268 messages are desired.\n\
9272302b 269 --min-restart-interval\n\
8b886ca7 270 Set the minimum seconds to wait between invocations of daemon\n\
271 restart commands (default is %d).\n\
9272302b 272 --max-restart-interval\n\
8b886ca7 273 Set the maximum seconds to wait between invocations of daemon\n\
274 restart commands (default is %d).\n\
6d0fa5c2
DS
275 --operational-timeout\n\
276 Set the time before systemd is notified that we are considered\n\
277 operational again after a daemon restart (default is %d).\n\
8b886ca7 278-i, --interval Set the status polling interval in seconds (default is %d)\n\
279-t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
280-T, --restart-timeout\n\
281 Set the restart (kill) timeout in seconds (default is %d).\n\
282 If any background jobs are still running after this much\n\
283 time has elapsed, they will be killed.\n\
284-r, --restart Supply a Bourne shell command to use to restart a single\n\
285 daemon. The command string should include '%%s' where the\n\
286 name of the daemon should be substituted.\n\
3ec95567 287 (default: '%s')\n\
8b886ca7 288-s, --start-command\n\
289 Supply a Bourne shell to command to use to start a single\n\
290 daemon. The command string should include '%%s' where the\n\
291 name of the daemon should be substituted.\n\
3ec95567 292 (default: '%s')\n\
8b886ca7 293-k, --kill-command\n\
294 Supply a Bourne shell to command to use to stop a single\n\
295 daemon. The command string should include '%%s' where the\n\
296 name of the daemon should be substituted.\n\
3ec95567 297 (default: '%s')\n\
f168b713 298 --dry Do not start or restart anything, just log.\n\
8b886ca7 299-p, --pid-file Set process identifier file name\n\
0a7c7856 300 (default is %s/watchfrr.pid).\n\
c8b40f86 301-b, --blank-string\n\
302 When the supplied argument string is found in any of the\n\
f168b713 303 various shell command arguments (-r, -s, or -k), replace\n\
c8b40f86 304 it with a space. This is an ugly hack to circumvent problems\n\
305 passing command-line arguments with embedded spaces.\n\
8b886ca7 306-v, --version Print program version\n\
d62a17ae 307-h, --help Display this help and exit\n",
64a249ad 308 frr_vtydir, DEFAULT_LOGLEVEL, LOG_EMERG, LOG_DEBUG, LOG_DEBUG,
6d0fa5c2
DS
309 DEFAULT_MIN_RESTART, DEFAULT_MAX_RESTART,
310 DEFAULT_OPERATIONAL_TIMEOUT, DEFAULT_PERIOD, DEFAULT_TIMEOUT,
311 DEFAULT_RESTART_TIMEOUT, DEFAULT_RESTART_CMD, DEFAULT_START_CMD,
312 DEFAULT_STOP_CMD, frr_vtydir);
8b886ca7 313}
314
a6810074 315static pid_t run_background(char *shell_cmd)
8b886ca7 316{
a6810074
DL
317 pid_t child;
318
319 switch (child = fork()) {
320 case -1:
450971aa 321 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
322 "fork failed, cannot run command [%s]: %s",
323 shell_cmd, safe_strerror(errno));
a6810074
DL
324 return -1;
325 case 0:
326 /* Child process. */
d62a17ae 327 /* Use separate process group so child processes can be killed
328 * easily. */
a6810074 329 if (setpgid(0, 0) < 0)
957cfa24 330 zlog_warn("setpgid(0,0) failed: %s",
a6810074
DL
331 safe_strerror(errno));
332 {
333 char shell[] = "sh";
334 char dashc[] = "-c";
d62a17ae 335 char *const argv[4] = {shell, dashc, shell_cmd, NULL};
a6810074 336 execv("/bin/sh", argv);
450971aa 337 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
338 "execv(/bin/sh -c '%s') failed: %s",
339 shell_cmd, safe_strerror(errno));
a6810074
DL
340 _exit(127);
341 }
342 default:
343 /* Parent process: we will reap the child later. */
c3f65458
QY
344 zlog_info("Forked background command [pid %d]: %s", (int)child,
345 shell_cmd);
a6810074
DL
346 return child;
347 }
8b886ca7 348}
349
a6810074
DL
350static struct timeval *time_elapsed(struct timeval *result,
351 const struct timeval *start_time)
8b886ca7 352{
a6810074
DL
353 gettimeofday(result, NULL);
354 result->tv_sec -= start_time->tv_sec;
355 result->tv_usec -= start_time->tv_usec;
356 while (result->tv_usec < 0) {
357 result->tv_usec += 1000000L;
358 result->tv_sec--;
359 }
360 return result;
8b886ca7 361}
362
cc9f21da 363static void restart_kill(struct thread *t_kill)
8b886ca7 364{
a6810074
DL
365 struct restart_info *restart = THREAD_ARG(t_kill);
366 struct timeval delay;
367
368 time_elapsed(&delay, &restart->time);
2ab760f0
DA
369
370 if (gs.reading_configuration) {
371 zlog_err(
372 "%s %s child process appears to still be reading configuration, delaying for another %lu time",
373 restart->what, restart->name, gs.restart_timeout);
374 thread_add_timer(master, restart_kill, restart,
375 gs.restart_timeout, &restart->t_kill);
376 return;
377 }
378
d62a17ae 379 zlog_warn(
957cfa24 380 "%s %s child process %d still running after %ld seconds, sending signal %d",
d62a17ae 381 restart->what, restart->name, (int)restart->pid,
382 (long)delay.tv_sec, (restart->kills ? SIGKILL : SIGTERM));
a6810074
DL
383 kill(-restart->pid, (restart->kills ? SIGKILL : SIGTERM));
384 restart->kills++;
66e78ae6
QY
385 thread_add_timer(master, restart_kill, restart, gs.restart_timeout,
386 &restart->t_kill);
8b886ca7 387}
388
a6810074 389static struct restart_info *find_child(pid_t child)
8b886ca7 390{
f168b713 391 struct daemon *dmn;
7c265f7d
CF
392 if (gs.restart.pid == child)
393 return &gs.restart;
394
f168b713
DL
395 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
396 if (dmn->restart.pid == child)
397 return &dmn->restart;
a6810074
DL
398 }
399 return NULL;
8b886ca7 400}
401
a6810074 402static void sigchild(void)
8b886ca7 403{
a6810074
DL
404 pid_t child;
405 int status;
406 const char *name;
407 const char *what;
408 struct restart_info *restart;
75f8b0e4 409 struct daemon *dmn;
a6810074
DL
410
411 switch (child = waitpid(-1, &status, WNOHANG)) {
412 case -1:
450971aa 413 flog_err_sys(EC_LIB_SYSTEM_CALL, "waitpid failed: %s",
09c866e3 414 safe_strerror(errno));
a6810074
DL
415 return;
416 case 0:
417 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
418 return;
419 }
420
421 if (child == integrated_write_pid) {
422 integrated_write_sigchld(status);
423 return;
424 }
425
426 if ((restart = find_child(child)) != NULL) {
427 name = restart->name;
428 what = restart->what;
429 restart->pid = 0;
430 gs.numpids--;
b3d6bc6e
MS
431 thread_cancel(&restart->t_kill);
432
d62a17ae 433 /* Update restart time to reflect the time the command
434 * completed. */
a6810074
DL
435 gettimeofday(&restart->time, NULL);
436 } else {
09c866e3 437 flog_err_sys(
450971aa 438 EC_LIB_SYSTEM_CALL,
09c866e3
QY
439 "waitpid returned status for an unknown child process %d",
440 (int)child);
a6810074
DL
441 name = "(unknown)";
442 what = "background";
443 }
444 if (WIFSTOPPED(status))
957cfa24 445 zlog_warn("%s %s process %d is stopped", what, name,
d62a17ae 446 (int)child);
a6810074 447 else if (WIFSIGNALED(status))
d62a17ae 448 zlog_warn("%s %s process %d terminated due to signal %d", what,
449 name, (int)child, WTERMSIG(status));
a6810074
DL
450 else if (WIFEXITED(status)) {
451 if (WEXITSTATUS(status) != 0)
d62a17ae 452 zlog_warn(
453 "%s %s process %d exited with non-zero status %d",
454 what, name, (int)child, WEXITSTATUS(status));
75f8b0e4 455 else {
a6810074
DL
456 zlog_debug("%s %s process %d exited normally", what,
457 name, (int)child);
75f8b0e4
DL
458
459 if (restart && restart != &gs.restart) {
460 dmn = container_of(restart, struct daemon,
461 restart);
462 restart_done(dmn);
463 } else if (restart)
464 for (dmn = gs.daemons; dmn; dmn = dmn->next)
465 restart_done(dmn);
466 }
a6810074 467 } else
09c866e3 468 flog_err_sys(
450971aa 469 EC_LIB_SYSTEM_CALL,
09c866e3
QY
470 "cannot interpret %s %s process %d wait status 0x%x",
471 what, name, (int)child, status);
a6810074 472 phase_check();
8b886ca7 473}
474
d62a17ae 475static int run_job(struct restart_info *restart, const char *cmdtype,
476 const char *command, int force, int update_interval)
8b886ca7 477{
a6810074
DL
478 struct timeval delay;
479
480 if (gs.loglevel > LOG_DEBUG + 1)
481 zlog_debug("attempting to %s %s", cmdtype, restart->name);
482
483 if (restart->pid) {
484 if (gs.loglevel > LOG_DEBUG + 1)
d62a17ae 485 zlog_debug(
486 "cannot %s %s, previous pid %d still running",
487 cmdtype, restart->name, (int)restart->pid);
a6810074
DL
488 return -1;
489 }
490
b3ee8bcc
DS
491 char buffer[512];
492
493 snprintf(buffer, sizeof(buffer), "restarting %s", restart->name);
494 systemd_send_status(buffer);
b3ee8bcc 495
d62a17ae 496 /* Note: time_elapsed test must come before the force test, since we
497 need
a6810074
DL
498 to make sure that delay is initialized for use below in updating the
499 restart interval. */
500 if ((time_elapsed(&delay, &restart->time)->tv_sec < restart->interval)
501 && !force) {
b3ee8bcc 502
a6810074 503 if (gs.loglevel > LOG_DEBUG + 1)
d62a17ae 504 zlog_debug(
3efd0893 505 "postponing %s %s: elapsed time %ld < retry interval %ld",
d62a17ae 506 cmdtype, restart->name, (long)delay.tv_sec,
507 restart->interval);
a6810074
DL
508 return -1;
509 }
510
511 gettimeofday(&restart->time, NULL);
512 restart->kills = 0;
513 {
514 char cmd[strlen(command) + strlen(restart->name) + 1];
515 snprintf(cmd, sizeof(cmd), command, restart->name);
516 if ((restart->pid = run_background(cmd)) > 0) {
d62a17ae 517 thread_add_timer(master, restart_kill, restart,
518 gs.restart_timeout, &restart->t_kill);
a6810074
DL
519 restart->what = cmdtype;
520 gs.numpids++;
521 } else
522 restart->pid = 0;
523 }
524
525 /* Calculate the new restart interval. */
526 if (update_interval) {
527 if (delay.tv_sec > 2 * gs.max_restart_interval)
528 restart->interval = gs.min_restart_interval;
529 else if ((restart->interval *= 2) > gs.max_restart_interval)
530 restart->interval = gs.max_restart_interval;
531 if (gs.loglevel > LOG_DEBUG + 1)
532 zlog_debug("restart %s interval is now %ld",
533 restart->name, restart->interval);
534 }
535 return restart->pid;
8b886ca7 536}
537
/*
 * Helpers that (re)arm one event of a daemon.  Each clears the stored
 * thread pointer first and then registers the new event.
 *
 * The expansions deliberately end in `while (0)` WITHOUT a semicolon: the
 * caller's own `;` completes the statement, which keeps
 * `if (x) SET_...(dmn); else ...` valid C.
 */

/* Watch (DMN)->fd for input; handle_read() runs when it is readable. */
#define SET_READ_HANDLER(DMN)                                                  \
	do {                                                                   \
		(DMN)->t_read = NULL;                                          \
		thread_add_read(master, handle_read, (DMN), (DMN)->fd,         \
				&(DMN)->t_read);                               \
	} while (0)

/* Run wakeup_down() after roughly gs.period milliseconds. */
#define SET_WAKEUP_DOWN(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_down, (DMN),              \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Run wakeup_unresponsive() after roughly gs.period milliseconds. */
#define SET_WAKEUP_UNRESPONSIVE(DMN)                                           \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_unresponsive, (DMN),      \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Run wakeup_send_echo() after roughly gs.period milliseconds. */
#define SET_WAKEUP_ECHO(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_send_echo, (DMN),         \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)
8b886ca7 565
cc9f21da 566static void wakeup_down(struct thread *t_wakeup)
8b886ca7 567{
a6810074
DL
568 struct daemon *dmn = THREAD_ARG(t_wakeup);
569
570 dmn->t_wakeup = NULL;
571 if (try_connect(dmn) < 0)
572 SET_WAKEUP_DOWN(dmn);
573 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
574 try_restart(dmn);
8b886ca7 575}
576
cc9f21da 577static void wakeup_init(struct thread *t_wakeup)
8b886ca7 578{
a6810074
DL
579 struct daemon *dmn = THREAD_ARG(t_wakeup);
580
581 dmn->t_wakeup = NULL;
582 if (try_connect(dmn) < 0) {
c3f65458
QY
583 zlog_info(
584 "%s state -> down : initial connection attempt failed",
585 dmn->name);
a6810074
DL
586 dmn->state = DAEMON_DOWN;
587 }
c0e5cb52 588 phase_check();
8b886ca7 589}
590
75f8b0e4
DL
591static void restart_done(struct daemon *dmn)
592{
593 if (dmn->state != DAEMON_DOWN) {
3f391bec
DS
594 zlog_warn(
595 "Daemon: %s: is in %s state but expected it to be in DAEMON_DOWN state",
596 dmn->name, state_str[dmn->state]);
75f8b0e4
DL
597 return;
598 }
28ef0ee1 599 THREAD_OFF(dmn->t_wakeup);
50478845 600
75f8b0e4
DL
601 if (try_connect(dmn) < 0)
602 SET_WAKEUP_DOWN(dmn);
603}
604
6d0fa5c2
DS
/*
 * Timer callback armed by daemon_up(): report to systemd that FRR is
 * operational.  The thread argument is not used.
 */
static void daemon_restarting_operational(struct thread *thread)
{
	static const char status[] = "FRR Operational";

	systemd_send_status(status);
}
609
a6810074 610static void daemon_down(struct daemon *dmn, const char *why)
8b886ca7 611{
a6810074 612 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
1c50c1c0
QY
613 flog_err(EC_WATCHFRR_CONNECTION, "%s state -> down : %s",
614 dmn->name, why);
a6810074
DL
615 else if (gs.loglevel > LOG_DEBUG)
616 zlog_debug("%s still down : %s", dmn->name, why);
617 if (IS_UP(dmn))
618 gs.numdown++;
619 dmn->state = DAEMON_DOWN;
620 if (dmn->fd >= 0) {
621 close(dmn->fd);
622 dmn->fd = -1;
623 }
624 THREAD_OFF(dmn->t_read);
625 THREAD_OFF(dmn->t_write);
626 THREAD_OFF(dmn->t_wakeup);
627 if (try_connect(dmn) < 0)
628 SET_WAKEUP_DOWN(dmn);
6d0fa5c2
DS
629
630 systemd_send_status("FRR partially operational");
a6810074 631 phase_check();
8b886ca7 632}
633
cc9f21da 634static void handle_read(struct thread *t_read)
8b886ca7 635{
a6810074
DL
636 struct daemon *dmn = THREAD_ARG(t_read);
637 static const char resp[sizeof(PING_TOKEN) + 4] = PING_TOKEN "\n";
638 char buf[sizeof(resp) + 100];
639 ssize_t rc;
640 struct timeval delay;
641
642 dmn->t_read = NULL;
643 if ((rc = read(dmn->fd, buf, sizeof(buf))) < 0) {
644 char why[100];
645
646 if (ERRNO_IO_RETRY(errno)) {
647 /* Pretend it never happened. */
648 SET_READ_HANDLER(dmn);
cc9f21da 649 return;
a6810074
DL
650 }
651 snprintf(why, sizeof(why), "unexpected read error: %s",
652 safe_strerror(errno));
653 daemon_down(dmn, why);
cc9f21da 654 return;
8b886ca7 655 }
a6810074
DL
656 if (rc == 0) {
657 daemon_down(dmn, "read returned EOF");
cc9f21da 658 return;
a6810074
DL
659 }
660 if (!dmn->echo_sent.tv_sec) {
661 char why[sizeof(buf) + 100];
662 snprintf(why, sizeof(why),
663 "unexpected read returns %d bytes: %.*s", (int)rc,
664 (int)rc, buf);
665 daemon_down(dmn, why);
cc9f21da 666 return;
8b886ca7 667 }
a6810074
DL
668
669 /* We are expecting an echo response: is there any chance that the
670 response would not be returned entirely in the first read? That
671 seems inconceivable... */
672 if ((rc != sizeof(resp)) || memcmp(buf, resp, sizeof(resp))) {
673 char why[100 + sizeof(buf)];
674 snprintf(why, sizeof(why),
3efd0893 675 "read returned bad echo response of %d bytes (expecting %u): %.*s",
d7c0a89a 676 (int)rc, (unsigned int)sizeof(resp), (int)rc, buf);
a6810074 677 daemon_down(dmn, why);
cc9f21da 678 return;
a6810074
DL
679 }
680
681 time_elapsed(&delay, &dmn->echo_sent);
682 dmn->echo_sent.tv_sec = 0;
683 if (dmn->state == DAEMON_UNRESPONSIVE) {
684 if (delay.tv_sec < gs.timeout) {
685 dmn->state = DAEMON_UP;
d62a17ae 686 zlog_warn(
3efd0893 687 "%s state -> up : echo response received after %ld.%06ld seconds",
d62a17ae 688 dmn->name, (long)delay.tv_sec,
689 (long)delay.tv_usec);
a6810074 690 } else
d62a17ae 691 zlog_warn(
3efd0893 692 "%s: slow echo response finally received after %ld.%06ld seconds",
d62a17ae 693 dmn->name, (long)delay.tv_sec,
694 (long)delay.tv_usec);
a6810074
DL
695 } else if (gs.loglevel > LOG_DEBUG + 1)
696 zlog_debug("%s: echo response received after %ld.%06ld seconds",
697 dmn->name, (long)delay.tv_sec, (long)delay.tv_usec);
698
699 SET_READ_HANDLER(dmn);
b3d6bc6e 700 thread_cancel(&dmn->t_wakeup);
a6810074 701 SET_WAKEUP_ECHO(dmn);
8b886ca7 702}
703
207e0d7a
DS
704/*
705 * Wait till we notice that all daemons are ready before
706 * we send we are ready to systemd
707 */
5c9d1c83 708static void daemon_send_ready(int exitcode)
207e0d7a 709{
5c9d1c83 710 FILE *fp;
a6810074 711 static int sent = 0;
43e587c1 712 char started[1024];
207e0d7a 713
5c9d1c83
DL
714 if (sent)
715 return;
716
717 if (exitcode == 0)
0a7c7856 718 zlog_notice("all daemons up, doing startup-complete notify");
5c9d1c83
DL
719 else if (gs.numdown < gs.numdaemons)
720 flog_err(EC_WATCHFRR_CONNECTION,
3efd0893 721 "startup did not complete within timeout (%d/%d daemons running)",
5c9d1c83
DL
722 gs.numdaemons - gs.numdown, gs.numdaemons);
723 else {
724 flog_err(EC_WATCHFRR_CONNECTION,
3efd0893 725 "all configured daemons failed to start -- exiting watchfrr");
5c9d1c83
DL
726 exit(exitcode);
727
728 }
0a7c7856 729
5c9d1c83
DL
730 frr_detach();
731
33606a15 732 snprintf(started, sizeof(started), "%s/%s", frr_vtydir,
3c649c71
DS
733 "watchfrr.started");
734 fp = fopen(started, "w");
5c9d1c83
DL
735 if (fp)
736 fclose(fp);
247898d5
DL
737
738 systemd_send_started(master);
b3ee8bcc 739 systemd_send_status("FRR Operational");
5c9d1c83 740 sent = 1;
207e0d7a
DS
741}
742
a6810074 743static void daemon_up(struct daemon *dmn, const char *why)
8b886ca7 744{
a6810074
DL
745 dmn->state = DAEMON_UP;
746 gs.numdown--;
747 dmn->connect_tries = 0;
748 zlog_notice("%s state -> up : %s", dmn->name, why);
6d0fa5c2 749 if (gs.numdown == 0) {
5c9d1c83 750 daemon_send_ready(0);
6d0fa5c2
DS
751
752 THREAD_OFF(gs.t_operational);
753
754 thread_add_timer(master, daemon_restarting_operational, NULL,
755 gs.operational_timeout, &gs.t_operational);
756 }
757
a8cbb8b3 758 SET_WAKEUP_ECHO(dmn);
a6810074 759 phase_check();
8b886ca7 760}
761
cc9f21da 762static void check_connect(struct thread *t_write)
8b886ca7 763{
a6810074
DL
764 struct daemon *dmn = THREAD_ARG(t_write);
765 int sockerr;
766 socklen_t reslen = sizeof(sockerr);
767
768 dmn->t_write = NULL;
769 if (getsockopt(dmn->fd, SOL_SOCKET, SO_ERROR, (char *)&sockerr, &reslen)
770 < 0) {
771 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn->name,
772 safe_strerror(errno));
773 daemon_down(dmn,
774 "getsockopt failed checking connection success");
cc9f21da 775 return;
a6810074
DL
776 }
777 if ((reslen == sizeof(sockerr)) && sockerr) {
778 char why[100];
d62a17ae 779 snprintf(
780 why, sizeof(why),
781 "getsockopt reports that connection attempt failed: %s",
782 safe_strerror(sockerr));
a6810074 783 daemon_down(dmn, why);
cc9f21da 784 return;
a6810074
DL
785 }
786
787 daemon_up(dmn, "delayed connect succeeded");
8b886ca7 788}
789
cc9f21da 790static void wakeup_connect_hanging(struct thread *t_wakeup)
8b886ca7 791{
a6810074
DL
792 struct daemon *dmn = THREAD_ARG(t_wakeup);
793 char why[100];
794
795 dmn->t_wakeup = NULL;
796 snprintf(why, sizeof(why),
797 "connection attempt timed out after %ld seconds", gs.timeout);
798 daemon_down(dmn, why);
8b886ca7 799}
800
801/* Making connection to protocol daemon. */
a6810074 802static int try_connect(struct daemon *dmn)
8b886ca7 803{
a6810074
DL
804 int sock;
805 struct sockaddr_un addr;
806 socklen_t len;
807
808 if (gs.loglevel > LOG_DEBUG + 1)
809 zlog_debug("%s: attempting to connect", dmn->name);
810 dmn->connect_tries++;
811
6006b807 812 memset(&addr, 0, sizeof(addr));
a6810074 813 addr.sun_family = AF_UNIX;
d62a17ae 814 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty", gs.vtydir,
815 dmn->name);
6f0e3f6e 816#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
a6810074 817 len = addr.sun_len = SUN_LEN(&addr);
8b886ca7 818#else
a6810074 819 len = sizeof(addr.sun_family) + strlen(addr.sun_path);
d62a17ae 820#endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
a6810074
DL
821
822 /* Quick check to see if we might succeed before we go to the trouble
823 of creating a socket. */
824 if (access(addr.sun_path, W_OK) < 0) {
825 if (errno != ENOENT)
450971aa 826 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
827 "%s: access to socket %s denied: %s",
828 dmn->name, addr.sun_path,
829 safe_strerror(errno));
a6810074
DL
830 return -1;
831 }
832
833 if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
450971aa 834 flog_err_sys(EC_LIB_SOCKET, "%s(%s): cannot make socket: %s",
09c866e3 835 __func__, addr.sun_path, safe_strerror(errno));
a6810074
DL
836 return -1;
837 }
838
839 if (set_nonblocking(sock) < 0 || set_cloexec(sock) < 0) {
450971aa 840 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
841 "%s(%s): set_nonblocking/cloexec(%d) failed",
842 __func__, addr.sun_path, sock);
a6810074
DL
843 close(sock);
844 return -1;
8b886ca7 845 }
a6810074
DL
846
847 if (connect(sock, (struct sockaddr *)&addr, len) < 0) {
848 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK)) {
849 if (gs.loglevel > LOG_DEBUG)
850 zlog_debug("%s(%s): connect failed: %s",
851 __func__, addr.sun_path,
852 safe_strerror(errno));
853 close(sock);
854 return -1;
855 }
856 if (gs.loglevel > LOG_DEBUG)
857 zlog_debug("%s: connection in progress", dmn->name);
858 dmn->state = DAEMON_CONNECTING;
859 dmn->fd = sock;
66e78ae6 860 thread_add_write(master, check_connect, dmn, dmn->fd,
d62a17ae 861 &dmn->t_write);
d62a17ae 862 thread_add_timer(master, wakeup_connect_hanging, dmn,
863 gs.timeout, &dmn->t_wakeup);
a6810074
DL
864 SET_READ_HANDLER(dmn);
865 return 0;
866 }
867
868 dmn->fd = sock;
869 SET_READ_HANDLER(dmn);
870 daemon_up(dmn, "connect succeeded");
871 return 1;
8b886ca7 872}
873
cc9f21da 874static void phase_hanging(struct thread *t_hanging)
8b886ca7 875{
a6810074 876 gs.t_phase_hanging = NULL;
f74ae2bb 877 flog_err(EC_WATCHFRR_CONNECTION,
1c50c1c0
QY
878 "Phase [%s] hanging for %ld seconds, aborting phased restart",
879 phase_str[gs.phase], PHASE_TIMEOUT);
a6810074 880 gs.phase = PHASE_NONE;
8b886ca7 881}
882
f1692c51 883static void set_phase(enum restart_phase new_phase)
8b886ca7 884{
a6810074 885 gs.phase = new_phase;
b3d6bc6e
MS
886 thread_cancel(&gs.t_phase_hanging);
887
66e78ae6
QY
888 thread_add_timer(master, phase_hanging, NULL, PHASE_TIMEOUT,
889 &gs.t_phase_hanging);
8b886ca7 890}
891
a6810074 892static void phase_check(void)
8b886ca7 893{
c0e5cb52
DL
894 struct daemon *dmn;
895
a6810074
DL
896 switch (gs.phase) {
897 case PHASE_NONE:
898 break;
c0e5cb52
DL
899
900 case PHASE_INIT:
901 for (dmn = gs.daemons; dmn; dmn = dmn->next)
902 if (dmn->state == DAEMON_INIT)
903 return;
904
905 /* startup complete, everything out of INIT */
906 gs.phase = PHASE_NONE;
907 for (dmn = gs.daemons; dmn; dmn = dmn->next)
908 if (dmn->state == DAEMON_DOWN) {
909 SET_WAKEUP_DOWN(dmn);
910 try_restart(dmn);
911 }
912 break;
a6810074
DL
913 case PHASE_STOPS_PENDING:
914 if (gs.numpids)
915 break;
d62a17ae 916 zlog_info(
917 "Phased restart: all routing daemon stop jobs have completed.");
a6810074
DL
918 set_phase(PHASE_WAITING_DOWN);
919
d62a17ae 920 /*FALLTHRU*/
a6810074
DL
921 case PHASE_WAITING_DOWN:
922 if (gs.numdown + IS_UP(gs.special) < gs.numdaemons)
923 break;
6d0fa5c2 924 systemd_send_status("Phased Restart");
a6810074
DL
925 zlog_info("Phased restart: all routing daemons now down.");
926 run_job(&gs.special->restart, "restart", gs.restart_command, 1,
927 1);
928 set_phase(PHASE_ZEBRA_RESTART_PENDING);
929
d62a17ae 930 /*FALLTHRU*/
a6810074
DL
931 case PHASE_ZEBRA_RESTART_PENDING:
932 if (gs.special->restart.pid)
933 break;
6d0fa5c2 934 systemd_send_status("Zebra Restarting");
a6810074
DL
935 zlog_info("Phased restart: %s restart job completed.",
936 gs.special->name);
937 set_phase(PHASE_WAITING_ZEBRA_UP);
938
d62a17ae 939 /*FALLTHRU*/
a6810074
DL
940 case PHASE_WAITING_ZEBRA_UP:
941 if (!IS_UP(gs.special))
942 break;
943 zlog_info("Phased restart: %s is now up.", gs.special->name);
944 {
945 struct daemon *dmn;
946 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
947 if (dmn != gs.special)
948 run_job(&dmn->restart, "start",
949 gs.start_command, 1, 0);
950 }
951 }
952 gs.phase = PHASE_NONE;
953 THREAD_OFF(gs.t_phase_hanging);
954 zlog_notice("Phased global restart has completed.");
955 break;
956 }
8b886ca7 957}
958
a6810074 959static void try_restart(struct daemon *dmn)
8b886ca7 960{
f168b713 961 if (watch_only)
a6810074 962 return;
a6810074 963
f168b713
DL
964 if (dmn != gs.special) {
965 if ((gs.special->state == DAEMON_UP)
966 && (gs.phase == PHASE_NONE))
967 run_job(&dmn->restart, "restart", gs.restart_command, 0,
968 1);
969 else
970 zlog_debug(
3efd0893 971 "%s: postponing restart attempt because master %s daemon not up [%s], or phased restart in progress",
f168b713
DL
972 dmn->name, gs.special->name,
973 state_str[gs.special->state]);
974 return;
975 }
976
977 if ((gs.phase != PHASE_NONE) || gs.numpids) {
978 if (gs.loglevel > LOG_DEBUG + 1)
979 zlog_debug(
3efd0893 980 "postponing phased global restart: restart already in progress [%s], or outstanding child processes [%d]",
f168b713
DL
981 phase_str[gs.phase], gs.numpids);
982 return;
983 }
984 /* Is it too soon for a restart? */
985 {
986 struct timeval delay;
987 if (time_elapsed(&delay, &gs.special->restart.time)->tv_sec
988 < gs.special->restart.interval) {
a6810074 989 if (gs.loglevel > LOG_DEBUG + 1)
d62a17ae 990 zlog_debug(
3efd0893 991 "postponing phased global restart: elapsed time %ld < retry interval %ld",
f168b713
DL
992 (long)delay.tv_sec,
993 gs.special->restart.interval);
994 return;
a6810074 995 }
8b886ca7 996 }
f168b713 997 run_job(&gs.restart, "restart", gs.restart_command, 0, 1);
8b886ca7 998}
999
cc9f21da 1000static void wakeup_unresponsive(struct thread *t_wakeup)
8b886ca7 1001{
a6810074
DL
1002 struct daemon *dmn = THREAD_ARG(t_wakeup);
1003
1004 dmn->t_wakeup = NULL;
1005 if (dmn->state != DAEMON_UNRESPONSIVE)
f74ae2bb 1006 flog_err(EC_WATCHFRR_CONNECTION,
3efd0893 1007 "%s: no longer unresponsive (now %s), wakeup should have been cancelled!",
1c50c1c0 1008 dmn->name, state_str[dmn->state]);
a6810074
DL
1009 else {
1010 SET_WAKEUP_UNRESPONSIVE(dmn);
1011 try_restart(dmn);
1012 }
8b886ca7 1013}
1014
cc9f21da 1015static void wakeup_no_answer(struct thread *t_wakeup)
8b886ca7 1016{
a6810074
DL
1017 struct daemon *dmn = THREAD_ARG(t_wakeup);
1018
1019 dmn->t_wakeup = NULL;
1020 dmn->state = DAEMON_UNRESPONSIVE;
cc53b605 1021 if (dmn->ignore_timeout)
cc9f21da 1022 return;
f74ae2bb 1023 flog_err(EC_WATCHFRR_CONNECTION,
3efd0893 1024 "%s state -> unresponsive : no response yet to ping sent %ld seconds ago",
1c50c1c0 1025 dmn->name, gs.timeout);
71e7975a
DL
1026 SET_WAKEUP_UNRESPONSIVE(dmn);
1027 try_restart(dmn);
8b886ca7 1028}
1029
cc9f21da 1030static void wakeup_send_echo(struct thread *t_wakeup)
8b886ca7 1031{
a6810074
DL
1032 static const char echocmd[] = "echo " PING_TOKEN;
1033 ssize_t rc;
1034 struct daemon *dmn = THREAD_ARG(t_wakeup);
1035
1036 dmn->t_wakeup = NULL;
d62a17ae 1037 if (((rc = write(dmn->fd, echocmd, sizeof(echocmd))) < 0)
1038 || ((size_t)rc != sizeof(echocmd))) {
a6810074
DL
1039 char why[100 + sizeof(echocmd)];
1040 snprintf(why, sizeof(why),
1041 "write '%s' returned %d instead of %u", echocmd,
d7c0a89a 1042 (int)rc, (unsigned int)sizeof(echocmd));
a6810074
DL
1043 daemon_down(dmn, why);
1044 } else {
1045 gettimeofday(&dmn->echo_sent, NULL);
66e78ae6
QY
1046 thread_add_timer(master, wakeup_no_answer, dmn, gs.timeout,
1047 &dmn->t_wakeup);
a6810074 1048 }
8b886ca7 1049}
1050
470bc619
QY
1051bool check_all_up(void)
1052{
1053 struct daemon *dmn;
1054
1055 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1056 if (dmn->state != DAEMON_UP)
1057 return false;
1058 return true;
1059}
1060
af568444
DL
1061void watchfrr_status(struct vty *vty)
1062{
1063 struct daemon *dmn;
1064 struct timeval delay;
1065
1066 vty_out(vty, "watchfrr global phase: %s\n", phase_str[gs.phase]);
603fef0e
DS
1067 vty_out(vty, " Restart Command: %pSQq\n", gs.restart_command);
1068 vty_out(vty, " Start Command: %pSQq\n", gs.start_command);
1069 vty_out(vty, " Stop Command: %pSQq\n", gs.stop_command);
1070 vty_out(vty, " Min Restart Interval: %ld\n", gs.min_restart_interval);
1071 vty_out(vty, " Max Restart Interval: %ld\n", gs.max_restart_interval);
1072 vty_out(vty, " Restart Timeout: %ld\n", gs.restart_timeout);
2ab760f0
DA
1073 vty_out(vty, " Reading Configuration: %s\n",
1074 gs.reading_configuration ? "yes" : "no");
af568444
DL
1075 if (gs.restart.pid)
1076 vty_out(vty, " global restart running, pid %ld\n",
1077 (long)gs.restart.pid);
1078
1079 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
cc53b605
DS
1080 vty_out(vty, " %-20s %s%s", dmn->name, state_str[dmn->state],
1081 dmn->ignore_timeout ? "/Ignoring Timeout\n" : "\n");
af568444
DL
1082 if (dmn->restart.pid)
1083 vty_out(vty, " restart running, pid %ld\n",
1084 (long)dmn->restart.pid);
1085 else if (dmn->state == DAEMON_DOWN &&
1086 time_elapsed(&delay, &dmn->restart.time)->tv_sec
1087 < dmn->restart.interval)
3efd0893 1088 vty_out(vty, " restarting in %jd seconds (%jds backoff interval)\n",
051a0be4
DL
1089 (intmax_t)dmn->restart.interval
1090 - (intmax_t)delay.tv_sec,
1091 (intmax_t)dmn->restart.interval);
af568444
DL
1092 }
1093}
1094
/*
 * Termination handler (registered for SIGINT and SIGTERM): log, tell
 * systemd we are stopping, and exit successfully.
 */
static void sigint(void)
{
	zlog_notice("Terminating on signal");
	systemd_send_stopping();

	exit(0);
}
1101
/*
 * Check a command template given on the command line.
 *
 * Returns 1 when cmd contains exactly one '%' character and that character
 * is immediately followed by 's'; returns 0 otherwise, including for NULL.
 */
static int valid_command(const char *cmd)
{
	const char *pct;

	if (!cmd)
		return 0;

	pct = strchr(cmd, '%');
	if (!pct)
		return 0;

	/* the one and only conversion must be "%s" */
	if (pct[1] != 's')
		return 0;

	return strchr(pct + 1, '%') == NULL;
}
1112
/*
 * This is an ugly hack to circumvent problems with passing command-line
 * arguments that contain spaces.  The fix is to use a configuration file.
 *
 * Returns a newly allocated copy of cmd in which every occurrence of the
 * placeholder blankstr is replaced by a single space.  The caller owns the
 * returned string.  Exits the process if the copy cannot be allocated.
 *
 * An empty blankstr leaves the command untouched.  The scan resumes after
 * each inserted space, so a placeholder that itself contains a space cannot
 * re-match the replacement; this guarantees termination.
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	char *res;
	char *p;
	size_t bslen = strlen(blankstr);

	res = strdup(cmd);
	if (!res) {
		perror("strdup");
		exit(1);
	}

	/*
	 * strstr() matches an empty needle at every position; replacing it
	 * would never finish and would grow the string past its allocation.
	 */
	if (bslen == 0)
		return res;

	p = res;
	while ((p = strstr(p, blankstr)) != NULL) {
		*p = ' ';
		/* close the gap left by the rest of the placeholder */
		if (bslen != 1)
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
		p++;
	}
	return res;
}
1132
/*
 * Timer callback scheduled by watchfrr_init() after STARTUP_TIMEOUT;
 * it calls daemon_send_ready() with argument 1.
 */
static void startup_timeout(struct thread *t_wakeup)
{
	daemon_send_ready(1);
}
1137
33606a15
DL
1138#ifdef GNU_LINUX
1139
1140#include <sys/mount.h>
1141#include <sched.h>
1142
1143#define NETNS_RUN_DIR "/var/run/netns"
1144
1145static void netns_create(int dirfd, const char *nsname)
1146{
1147 /* make /var/run/netns shared between mount namespaces
1148 * just like iproute2 sets it up
1149 */
1150 if (mount("", NETNS_RUN_DIR, "none", MS_SHARED | MS_REC, NULL)) {
1151 if (errno != EINVAL) {
1152 perror("mount");
1153 exit(1);
1154 }
1155
1156 if (mount(NETNS_RUN_DIR, NETNS_RUN_DIR, "none",
1157 MS_BIND | MS_REC, NULL)) {
1158 perror("mount");
1159 exit(1);
1160 }
1161
1162 if (mount("", NETNS_RUN_DIR, "none", MS_SHARED | MS_REC,
1163 NULL)) {
1164 perror("mount");
1165 exit(1);
1166 }
1167 }
1168
1169 /* need an empty file to mount on top of */
1170 int nsfd = openat(dirfd, nsname, O_CREAT | O_RDONLY | O_EXCL, 0);
1171
1172 if (nsfd < 0) {
1173 fprintf(stderr, "failed to create \"%s/%s\": %s\n",
1174 NETNS_RUN_DIR, nsname, strerror(errno));
1175 exit(1);
1176 }
1177 close(nsfd);
1178
1179 if (unshare(CLONE_NEWNET)) {
1180 perror("unshare");
1181 unlinkat(dirfd, nsname, 0);
1182 exit(1);
1183 }
1184
1185 char *dstpath = asprintfrr(MTYPE_TMP, "%s/%s", NETNS_RUN_DIR, nsname);
1186
1187 /* bind-mount so the namespace has a name and is persistent */
1188 if (mount("/proc/self/ns/net", dstpath, "none", MS_BIND, NULL) < 0) {
1189 fprintf(stderr, "failed to bind-mount netns to \"%s\": %s\n",
1190 dstpath, strerror(errno));
1191 unlinkat(dirfd, nsname, 0);
1192 exit(1);
1193 }
1194
1195 XFREE(MTYPE_TMP, dstpath);
1196}
1197
1198static void netns_setup(const char *nsname)
1199{
1200 int dirfd, nsfd;
1201
1202 dirfd = open(NETNS_RUN_DIR, O_DIRECTORY | O_RDONLY);
1203 if (dirfd < 0) {
1204 if (errno == ENOTDIR) {
1205 fprintf(stderr, "error: \"%s\" is not a directory!\n",
1206 NETNS_RUN_DIR);
1207 exit(1);
1208 } else if (errno == ENOENT) {
1209 if (mkdir(NETNS_RUN_DIR, 0755)) {
1210 fprintf(stderr, "error: \"%s\": mkdir: %s\n",
1211 NETNS_RUN_DIR, strerror(errno));
1212 exit(1);
1213 }
1214 dirfd = open(NETNS_RUN_DIR, O_DIRECTORY | O_RDONLY);
1215 if (dirfd < 0) {
1216 fprintf(stderr, "error: \"%s\": opendir: %s\n",
1217 NETNS_RUN_DIR, strerror(errno));
1218 exit(1);
1219 }
1220 } else {
1221 fprintf(stderr, "error: \"%s\": %s\n",
1222 NETNS_RUN_DIR, strerror(errno));
1223 exit(1);
1224 }
1225 }
1226
1227 nsfd = openat(dirfd, nsname, O_RDONLY);
1228 if (nsfd < 0 && errno != ENOENT) {
1229 fprintf(stderr, "error: \"%s/%s\": %s\n",
1230 NETNS_RUN_DIR, nsname, strerror(errno));
1231 exit(1);
1232 }
1233 if (nsfd < 0)
1234 netns_create(dirfd, nsname);
1235 else {
1236 if (setns(nsfd, CLONE_NEWNET)) {
1237 perror("setns");
1238 exit(1);
1239 }
1240 close(nsfd);
1241 }
1242 close(dirfd);
1243
1244 /* make sure loopback is up... weird things happen otherwise.
1245 * ioctl is perfectly fine for this, don't need netlink...
1246 */
1247 int sockfd;
1248 struct ifreq ifr = { };
1249
1250 strlcpy(ifr.ifr_name, "lo", sizeof(ifr.ifr_name));
1251
1252 sockfd = socket(AF_INET, SOCK_DGRAM, 0);
1253 if (sockfd < 0) {
1254 perror("socket");
1255 exit(1);
1256 }
1257 if (ioctl(sockfd, SIOCGIFFLAGS, &ifr)) {
1258 perror("ioctl(SIOCGIFFLAGS, \"lo\")");
1259 exit(1);
1260 }
1261 if (!(ifr.ifr_flags & IFF_UP)) {
1262 ifr.ifr_flags |= IFF_UP;
1263 if (ioctl(sockfd, SIOCSIFFLAGS, &ifr)) {
1264 perror("ioctl(SIOCSIFFLAGS, \"lo\")");
1265 exit(1);
1266 }
1267 }
1268 close(sockfd);
1269}
1270
1271#else /* !GNU_LINUX */
1272
/* Stub for builds without GNU_LINUX: report the limitation and exit. */
static void netns_setup(const char *nsname)
{
	fputs("network namespaces are only available on Linux\n", stderr);
	exit(1);
}
1278#endif
1279
2ab760f0
DA
1280static void watchfrr_start_config(void)
1281{
1282 gs.reading_configuration = true;
1283}
1284
1285static void watchfrr_end_config(void)
1286{
1287 gs.reading_configuration = false;
1288}
1289
0a7c7856
DL
1290static void watchfrr_init(int argc, char **argv)
1291{
1292 const char *special = "zebra";
1293 int i;
1294 struct daemon *dmn, **add = &gs.daemons;
1295 char alldaemons[512] = "", *p = alldaemons;
1296
5c9d1c83
DL
1297 thread_add_timer_msec(master, startup_timeout, NULL, STARTUP_TIMEOUT,
1298 &gs.t_startup_timeout);
1299
0a7c7856
DL
1300 for (i = optind; i < argc; i++) {
1301 dmn = XCALLOC(MTYPE_WATCHFRR_DAEMON, sizeof(*dmn));
1302
1303 dmn->name = dmn->restart.name = argv[i];
1304 dmn->state = DAEMON_INIT;
1305 gs.numdaemons++;
1306 gs.numdown++;
1307 dmn->fd = -1;
c0e5cb52 1308 thread_add_timer_msec(master, wakeup_init, dmn, 0,
0a7c7856
DL
1309 &dmn->t_wakeup);
1310 dmn->restart.interval = gs.min_restart_interval;
1311 *add = dmn;
1312 add = &dmn->next;
1313
1314 if (!strcmp(dmn->name, special))
1315 gs.special = dmn;
1316 }
1317
1318 if (!gs.daemons) {
1319 fprintf(stderr,
1320 "Must specify one or more daemons to monitor.\n\n");
1321 frr_help_exit(1);
1322 }
1323 if (!watch_only && !gs.special) {
1324 fprintf(stderr, "\"%s\" daemon must be in daemon lists\n\n",
1325 special);
1326 frr_help_exit(1);
1327 }
1328
1329 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1330 snprintf(p, alldaemons + sizeof(alldaemons) - p, "%s%s",
1331 (p == alldaemons) ? "" : " ", dmn->name);
1332 p += strlen(p);
1333 }
1334 zlog_notice("%s %s watching [%s]%s", progname, FRR_VERSION, alldaemons,
1335 watch_only ? ", monitor mode" : "");
1336}
1337
a6810074 1338struct zebra_privs_t watchfrr_privs = {
95c4aff2 1339#ifdef VTY_GROUP
a6810074 1340 .vty_group = VTY_GROUP,
95c4aff2
DL
1341#endif
1342};
1343
7cc91e67 1344static struct frr_signal_t watchfrr_signals[] = {
4f04a76b
DL
1345 {
1346 .signal = SIGINT,
1347 .handler = sigint,
1348 },
1349 {
1350 .signal = SIGTERM,
1351 .handler = sigint,
1352 },
1353 {
1354 .signal = SIGCHLD,
1355 .handler = sigchild,
1356 },
1357};
1358
1359FRR_DAEMON_INFO(watchfrr, WATCHFRR,
d62a17ae 1360 .flags = FRR_NO_PRIVSEP | FRR_NO_TCPVTY | FRR_LIMITED_CLI
0a7c7856
DL
1361 | FRR_NO_CFG_PID_DRY | FRR_NO_ZCLIENT
1362 | FRR_DETACH_LATER,
4f04a76b 1363
d62a17ae 1364 .printhelp = printhelp,
1365 .copyright = "Copyright 2004 Andrew J. Schorr",
4f04a76b 1366
d62a17ae 1367 .signals = watchfrr_signals,
1368 .n_signals = array_size(watchfrr_signals),
4f04a76b 1369
80413c20
DL
1370 .privs = &watchfrr_privs,
1371);
4f04a76b 1372
999f153e
DL
1373#define DEPRECATED_OPTIONS "aAezR:"
1374
a6810074 1375int main(int argc, char **argv)
8b886ca7 1376{
a6810074 1377 int opt;
a6810074 1378 const char *blankstr = NULL;
33606a15
DL
1379 const char *netns = NULL;
1380 bool netns_en = false;
a6810074 1381
4f04a76b
DL
1382 frr_preinit(&watchfrr_di, argc, argv);
1383 progname = watchfrr_di.progname;
1384
33606a15 1385 frr_opt_add("b:di:k:l:N:p:r:S:s:t:T:" DEPRECATED_OPTIONS, longopts, "");
a6810074
DL
1386
1387 gs.restart.name = "all";
4f04a76b 1388 while ((opt = frr_getopt(argc, argv, NULL)) != EOF) {
999f153e
DL
1389 if (opt && opt < 128 && strchr(DEPRECATED_OPTIONS, opt)) {
1390 fprintf(stderr,
1391 "The -%c option no longer exists.\n"
1392 "Please refer to the watchfrr(8) man page.\n",
1393 opt);
1394 exit(1);
1395 }
1396
a6810074
DL
1397 switch (opt) {
1398 case 0:
1399 break;
a6810074
DL
1400 case 'b':
1401 blankstr = optarg;
1402 break;
f168b713
DL
1403 case OPTION_DRY:
1404 watch_only = true;
a6810074
DL
1405 break;
1406 case 'k':
1407 if (!valid_command(optarg)) {
1408 fprintf(stderr,
1409 "Invalid kill command, must contain '%%s': %s\n",
1410 optarg);
4f04a76b 1411 frr_help_exit(1);
a6810074
DL
1412 }
1413 gs.stop_command = optarg;
1414 break;
d62a17ae 1415 case 'l': {
1416 char garbage[3];
1417 if ((sscanf(optarg, "%d%1s", &gs.loglevel, garbage)
1418 != 1)
1419 || (gs.loglevel < LOG_EMERG)) {
1420 fprintf(stderr,
1421 "Invalid loglevel argument: %s\n",
1422 optarg);
1423 frr_help_exit(1);
a6810074 1424 }
d62a17ae 1425 } break;
1426 case OPTION_MINRESTART: {
1427 char garbage[3];
1428 if ((sscanf(optarg, "%ld%1s", &gs.min_restart_interval,
1429 garbage)
1430 != 1)
1431 || (gs.min_restart_interval < 0)) {
1432 fprintf(stderr,
1433 "Invalid min_restart_interval argument: %s\n",
1434 optarg);
1435 frr_help_exit(1);
a6810074 1436 }
d62a17ae 1437 } break;
1438 case OPTION_MAXRESTART: {
1439 char garbage[3];
1440 if ((sscanf(optarg, "%ld%1s", &gs.max_restart_interval,
1441 garbage)
1442 != 1)
1443 || (gs.max_restart_interval < 0)) {
1444 fprintf(stderr,
1445 "Invalid max_restart_interval argument: %s\n",
1446 optarg);
1447 frr_help_exit(1);
a6810074 1448 }
d62a17ae 1449 } break;
6d0fa5c2
DS
1450 case OPTION_MAXOPERATIONAL: {
1451 char garbage[3];
1452
1453 if ((sscanf(optarg, "%ld%1s", &gs.operational_timeout,
1454 garbage) != 1) ||
7a8120da 1455 (gs.operational_timeout < 0)) {
6d0fa5c2
DS
1456 fprintf(stderr,
1457 "Invalid Operational_timeout argument: %s\n",
1458 optarg);
1459 frr_help_exit(1);
1460 }
1461 } break;
33606a15
DL
1462 case OPTION_NETNS:
1463 netns_en = true;
b12bc77c 1464 if (optarg && strchr(optarg, '/')) {
33606a15
DL
1465 fprintf(stderr,
1466 "invalid network namespace name \"%s\" (may not contain slashes)\n",
1467 optarg);
1468 frr_help_exit(1);
1469 }
1470 netns = optarg;
1471 break;
d62a17ae 1472 case 'i': {
1473 char garbage[3];
1474 int period;
1475 if ((sscanf(optarg, "%d%1s", &period, garbage) != 1)
1476 || (gs.period < 1)) {
1477 fprintf(stderr,
1478 "Invalid interval argument: %s\n",
1479 optarg);
1480 frr_help_exit(1);
a6810074 1481 }
d62a17ae 1482 gs.period = 1000 * period;
1483 } break;
a6810074 1484 case 'p':
0a7c7856 1485 watchfrr_di.pid_file = optarg;
a6810074
DL
1486 break;
1487 case 'r':
a6810074
DL
1488 if (!valid_command(optarg)) {
1489 fprintf(stderr,
1490 "Invalid restart command, must contain '%%s': %s\n",
1491 optarg);
4f04a76b 1492 frr_help_exit(1);
a6810074
DL
1493 }
1494 gs.restart_command = optarg;
a6810074
DL
1495 break;
1496 case 's':
1497 if (!valid_command(optarg)) {
1498 fprintf(stderr,
1499 "Invalid start command, must contain '%%s': %s\n",
1500 optarg);
4f04a76b 1501 frr_help_exit(1);
a6810074
DL
1502 }
1503 gs.start_command = optarg;
1504 break;
1505 case 'S':
1506 gs.vtydir = optarg;
1507 break;
d62a17ae 1508 case 't': {
1509 char garbage[3];
1510 if ((sscanf(optarg, "%ld%1s", &gs.timeout, garbage)
1511 != 1)
1512 || (gs.timeout < 1)) {
1513 fprintf(stderr,
1514 "Invalid timeout argument: %s\n",
1515 optarg);
1516 frr_help_exit(1);
a6810074 1517 }
d62a17ae 1518 } break;
1519 case 'T': {
1520 char garbage[3];
1521 if ((sscanf(optarg, "%ld%1s", &gs.restart_timeout,
1522 garbage)
1523 != 1)
1524 || (gs.restart_timeout < 1)) {
1525 fprintf(stderr,
1526 "Invalid restart timeout argument: %s\n",
1527 optarg);
1528 frr_help_exit(1);
a6810074 1529 }
d62a17ae 1530 } break;
a6810074
DL
1531 default:
1532 fputs("Invalid option.\n", stderr);
4f04a76b 1533 frr_help_exit(1);
a6810074 1534 }
8b886ca7 1535 }
a6810074 1536
71e7975a
DL
1537 if (watch_only
1538 && (gs.start_command || gs.stop_command || gs.restart_command)) {
d87ae5cc 1539 fputs("Options -r/-s/-k are not used when --dry is active.\n",
a6810074 1540 stderr);
8b886ca7 1541 }
f168b713
DL
1542 if (!watch_only
1543 && (!gs.restart_command || !gs.start_command || !gs.stop_command)) {
1544 fprintf(stderr,
1545 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1546 frr_help_exit(1);
8b886ca7 1547 }
8b886ca7 1548
a6810074
DL
1549 if (blankstr) {
1550 if (gs.restart_command)
1551 gs.restart_command =
d62a17ae 1552 translate_blanks(gs.restart_command, blankstr);
a6810074
DL
1553 if (gs.start_command)
1554 gs.start_command =
d62a17ae 1555 translate_blanks(gs.start_command, blankstr);
a6810074
DL
1556 if (gs.stop_command)
1557 gs.stop_command =
d62a17ae 1558 translate_blanks(gs.stop_command, blankstr);
065de903 1559 }
8b886ca7 1560
a6810074 1561 gs.restart.interval = gs.min_restart_interval;
8b886ca7 1562
33606a15
DL
1563 /* env variable for the processes that we start */
1564 if (watchfrr_di.pathspace)
1565 setenv("FRR_PATHSPACE", watchfrr_di.pathspace, 1);
1566 else
1567 unsetenv("FRR_PATHSPACE");
1568
a91f5417
DS
1569 /*
1570 * when watchfrr_di.pathspace is read, if it is not specified
1571 * pathspace is NULL as expected
1572 */
1573 pathspace = watchfrr_di.pathspace;
1574
33606a15
DL
1575 if (netns_en && !netns)
1576 netns = watchfrr_di.pathspace;
a91f5417 1577
33606a15
DL
1578 if (netns_en && netns && netns[0])
1579 netns_setup(netns);
1580
4f04a76b 1581 master = frr_init();
b647dc2a 1582 watchfrr_error_init();
0a7c7856 1583 watchfrr_init(argc, argv);
2ab760f0 1584 cmd_init_config_callbacks(watchfrr_start_config, watchfrr_end_config);
0a7c7856
DL
1585 watchfrr_vty_init();
1586
1587 frr_config_fork();
4f04a76b 1588
0a7c7856 1589 if (watchfrr_di.daemon_mode)
0bdeb5e5 1590 zlog_syslog_set_prio_min(MIN(gs.loglevel, LOG_DEBUG));
0a7c7856 1591 else
0bdeb5e5 1592 zlog_aux_init(NULL, MIN(gs.loglevel, LOG_DEBUG));
8b886ca7 1593
0a7c7856 1594 frr_run(master);
8b886ca7 1595
a6810074
DL
1596 systemd_send_stopping();
1597 /* Not reached. */
1598 return 0;
8b886ca7 1599}