]> git.proxmox.com Git - mirror_frr.git/blame - watchfrr/watchfrr.c
pbrd: VTY_GET_CONTEXT can fail
[mirror_frr.git] / watchfrr / watchfrr.c
CommitLineData
8b886ca7 1/*
896014f4
DL
2 * Monitor status of frr daemons and restart if necessary.
3 *
4 * Copyright (C) 2004 Andrew J. Schorr
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
8b886ca7 19 */
20
a365534f 21#include <zebra.h>
8b886ca7 22#include <thread.h>
23#include <log.h>
52e66296 24#include <network.h>
8b886ca7 25#include <sigevent.h>
a365534f 26#include <lib/version.h>
95c4aff2 27#include "command.h"
4f04a76b 28#include "libfrr.h"
b647dc2a 29#include "lib_errors.h"
0bdeb5e5 30#include "zlog_targets.h"
5920b3eb 31#include "network.h"
33606a15 32#include "printfrr.h"
95c4aff2 33
6f594023 34#include <getopt.h>
a365534f 35#include <sys/un.h>
36#include <sys/wait.h>
837d16cc 37#include <memory.h>
651415bd 38#include <systemd.h>
8b886ca7 39
9473e340 40#include "watchfrr.h"
b647dc2a 41#include "watchfrr_errors.h"
95c4aff2 42
/* Smaller of two values; note that each argument is evaluated twice. */
#ifndef MIN
#define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
#endif

/* Macros to help randomize timers. */
/*
 * JITTER(X): a pseudo-random offset taken from frr_weak_random() modulo
 * (X + 1) and then shifted down by X / 2, i.e. roughly centred on zero
 * (assuming frr_weak_random() returns a non-negative value -- TODO confirm).
 */
#define JITTER(X) ((frr_weak_random() % ((X)+1))-((X)/2))
/* FUZZY(X): X plus a jitter whose span is one twentieth of X. */
#define FUZZY(X) ((X)+JITTER((X)/20))
50
/*
 * Built-in defaults for the command-line tunables.  They are printed by
 * printhelp() and used to initialise the global state "gs".  According to
 * the help text the period, timeout, restart timeout and the min/max
 * restart intervals are expressed in seconds.
 */
#define DEFAULT_PERIOD 5
#define DEFAULT_TIMEOUT 90
#define DEFAULT_RESTART_TIMEOUT 20
#define DEFAULT_LOGLEVEL LOG_INFO
#define DEFAULT_MIN_RESTART 60
#define DEFAULT_MAX_RESTART 600
#define DEFAULT_OPERATIONAL_TIMEOUT 60

/*
 * Default shell command templates; the "%s" is replaced with the daemon
 * name when a job is launched (see run_job()).
 */
#define DEFAULT_RESTART_CMD WATCHFRR_SH_PATH " restart %s"
#define DEFAULT_START_CMD WATCHFRR_SH_PATH " start %s"
#define DEFAULT_STOP_CMD WATCHFRR_SH_PATH " stop %s"

/* Token that the echo reply checked in handle_read() must start with. */
#define PING_TOKEN "PING"
64
bf8d3d6a
DL
/* Memory group and memory type used for watchfrr's own allocations. */
DEFINE_MGROUP(WATCHFRR, "watchfrr");
DEFINE_MTYPE_STATIC(WATCHFRR, WATCHFRR_DAEMON, "watchfrr daemon entry");

/* Needs to be global, referenced somewhere inside libfrr. */
struct thread_master *master;

/* When true, try_restart() returns without launching any job. */
static bool watch_only = false;
/* Path prefix; see the -N/--pathspace entry in the help text. */
const char *pathspace;
8b886ca7 73
/*
 * State of the global ("phased") restart state machine driven by
 * phase_check().  The values are also used as indices into phase_str[].
 */
enum restart_phase {
	PHASE_NONE = 0,			/* no phased restart in progress */
	PHASE_INIT,			/* startup: daemons still in DAEMON_INIT */
	PHASE_STOPS_PENDING,		/* waiting for outstanding child jobs */
	PHASE_WAITING_DOWN,		/* waiting for the daemons to be down */
	PHASE_ZEBRA_RESTART_PENDING,	/* restart job of gs.special running */
	PHASE_WAITING_ZEBRA_UP		/* waiting for gs.special to come up */
};
8b886ca7 82
/*
 * Human-readable names for enum restart_phase, indexed by the enum value.
 * NOTE(review): the last entry ("Start jobs running") has no matching
 * enumerator in the enum shown above.
 */
static const char *const phase_str[] = {
	"Idle",
	"Startup",
	"Stop jobs running",
	"Waiting for other daemons to come down",
	"Zebra restart job running",
	"Waiting for zebra to come up",
	"Start jobs running",
};
92
/* Time allowed for one phase of a phased restart before it is aborted. */
#define PHASE_TIMEOUT (3*gs.restart_timeout)
/*
 * Startup timeout value.  The expansion is parenthesized so that the
 * macro behaves as a single operand inside larger expressions
 * (e.g. "x / STARTUP_TIMEOUT").
 */
#define STARTUP_TIMEOUT (55 * 1000)
8b886ca7 95
a6810074
DL
/*
 * Bookkeeping for one background shell job (start/stop/restart command).
 * One instance is embedded in every struct daemon and one in the global
 * state (gs.restart).
 */
struct restart_info {
	const char *name;	/* name substituted into the command */
	const char *what;	/* job type string, set by run_job() */
	pid_t pid;		/* pid of the running job, 0 when none */
	struct timeval time;	/* when the job was started / last completed */
	long interval;		/* minimum seconds between job launches */
	struct thread *t_kill;	/* timer that fires restart_kill() */
	int kills;		/* signals sent so far by restart_kill() */
};
105
/*
 * Process-wide state of watchfrr: configuration tunables, the list of
 * monitored daemons and the phased-restart state machine.
 */
static struct global_state {
	enum restart_phase phase;	/* current phased-restart state */
	struct thread *t_phase_hanging; /* armed by set_phase() */
	struct thread *t_startup_timeout; /* not armed in the code shown here */
	struct thread *t_operational;	/* armed by daemon_up() */
	const char *vtydir;	/* directory holding the "<daemon>.vty" sockets */
	long period;		/* polling period, in milliseconds */
	long timeout;		/* unresponsiveness/connect timeout, seconds */
	long restart_timeout;	/* delay before restart_kill() signals a job */
	long min_restart_interval;
	long max_restart_interval;
	long operational_timeout;
	struct daemon *daemons; /* head of the singly linked daemon list */
	const char *restart_command;	/* templates containing "%s" */
	const char *start_command;
	const char *stop_command;
	struct restart_info restart;	/* job not tied to a single daemon */
	int loglevel;
	struct daemon *special; /* points to zebra when doing phased restart */
	int numdaemons;
	int numpids;		/* number of background jobs not yet reaped */
	int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
} gs = {
	.phase = PHASE_INIT,
	.vtydir = frr_vtydir,
	.period = 1000 * DEFAULT_PERIOD,
	.timeout = DEFAULT_TIMEOUT,
	.restart_timeout = DEFAULT_RESTART_TIMEOUT,
	.loglevel = DEFAULT_LOGLEVEL,
	.min_restart_interval = DEFAULT_MIN_RESTART,
	.max_restart_interval = DEFAULT_MAX_RESTART,
	.operational_timeout = DEFAULT_OPERATIONAL_TIMEOUT,
	.restart_command = DEFAULT_RESTART_CMD,
	.start_command = DEFAULT_START_CMD,
	.stop_command = DEFAULT_STOP_CMD,
};
a6810074 142
/*
 * Connection state of a monitored daemon.  The values are also used as
 * indices into state_str[].
 */
enum daemon_state {
	DAEMON_INIT,
	DAEMON_DOWN,
	DAEMON_CONNECTING,
	DAEMON_UP,
	DAEMON_UNRESPONSIVE
};

/* True while a connection to the daemon exists (responsive or not). */
#define IS_UP(DMN)                                                             \
	(((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))

/* Human-readable names for enum daemon_state, indexed by the enum value. */
static const char *const state_str[] = {
	"Init", "Down", "Connecting", "Up", "Unresponsive",
};
157
/* One monitored daemon; instances are chained through "next". */
struct daemon {
	const char *name;	/* also names the "<name>.vty" socket */
	enum daemon_state state;
	int fd;			/* vty socket, -1 after daemon_down() */
	struct timeval echo_sent; /* tv_sec is 0 when no echo is outstanding */
	unsigned int connect_tries; /* reset to 0 by daemon_up() */
	struct thread *t_wakeup;
	struct thread *t_read;
	struct thread *t_write;
	struct daemon *next;
	struct restart_info restart; /* per-daemon background job */

	/*
	 * For a given daemon, if we've turned on ignore timeouts
	 * ignore the timeout value and assume everything is ok
	 * This is for daemon debugging w/ gdb after we have started
	 * FRR and realize we have something that needs to be looked
	 * at
	 */
	bool ignore_timeout;
};
179
9272302b
DL
/* getopt "val" codes for long options that have no one-letter form. */
#define OPTION_MINRESTART  2000
#define OPTION_MAXRESTART  2001
#define OPTION_DRY         2002
#define OPTION_NETNS       2003
#define OPTION_MAXOPERATIONAL 2004

/* Long option table; the list is terminated by the all-zero entry. */
static const struct option longopts[] = {
	{"daemon", no_argument, NULL, 'd'},
	{"statedir", required_argument, NULL, 'S'},
	{"loglevel", required_argument, NULL, 'l'},
	{"interval", required_argument, NULL, 'i'},
	{"timeout", required_argument, NULL, 't'},
	{"restart-timeout", required_argument, NULL, 'T'},
	{"restart", required_argument, NULL, 'r'},
	{"start-command", required_argument, NULL, 's'},
	{"kill-command", required_argument, NULL, 'k'},
	{"dry", no_argument, NULL, OPTION_DRY},
	{"min-restart-interval", required_argument, NULL, OPTION_MINRESTART},
	{"max-restart-interval", required_argument, NULL, OPTION_MAXRESTART},
	{"operational-timeout", required_argument, NULL, OPTION_MAXOPERATIONAL},
	{"pid-file", required_argument, NULL, 'p'},
	{"blank-string", required_argument, NULL, 'b'},
#ifdef GNU_LINUX
	/* Only offered when built with GNU_LINUX defined. */
	{"netns", optional_argument, NULL, OPTION_NETNS},
#endif
	{"help", no_argument, NULL, 'h'},
	{"version", no_argument, NULL, 'v'},
	{NULL, 0, NULL, 0}};
8b886ca7 208
209static int try_connect(struct daemon *dmn);
cc9f21da 210static void wakeup_send_echo(struct thread *t_wakeup);
8b886ca7 211static void try_restart(struct daemon *dmn);
212static void phase_check(void);
75f8b0e4 213static void restart_done(struct daemon *dmn);
8b886ca7 214
4f04a76b 215static const char *progname;
cc53b605
DS
216
217void watchfrr_set_ignore_daemon(struct vty *vty, const char *dname, bool ignore)
218{
219 struct daemon *dmn;
220
221 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
222 if (strncmp(dmn->name, dname, strlen(dmn->name)) == 0)
223 break;
224 }
225
226 if (dmn) {
227 dmn->ignore_timeout = ignore;
228 vty_out(vty, "%s switching to %s\n", dmn->name,
229 ignore ? "ignore" : "watch");
230 } else
231 vty_out(vty, "%s is not configured for running at the moment",
232 dname);
233}
234
4f04a76b 235static void printhelp(FILE *target)
8b886ca7 236{
d62a17ae 237 fprintf(target,
238 "Usage : %s [OPTION...] <daemon name> ...\n\n\
9473e340 239Watchdog program to monitor status of frr daemons and try to restart\n\
8b886ca7 240them if they are down or unresponsive. It determines whether a daemon is\n\
241up based on whether it can connect to the daemon's vty unix stream socket.\n\
242It then repeatedly sends echo commands over that socket to determine whether\n\
243the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
244on the socket connection and know immediately that the daemon is down.\n\n\
245The daemons to be monitored should be listed on the command line.\n\n\
8b886ca7 246In order to avoid attempting to restart the daemons in a fast loop,\n\
247the -m and -M options allow you to control the minimum delay between\n\
248restart commands. The minimum restart delay is recalculated each time\n\
249a restart is attempted: if the time since the last restart attempt exceeds\n\
250twice the -M value, then the restart delay is set to the -m value.\n\
d62a17ae 251Otherwise, the interval is doubled (but capped at the -M value).\n\n",
f168b713 252 progname);
e757c940 253
d62a17ae 254 fprintf(target,
255 "Options:\n\
8b886ca7 256-d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
257 to syslog instead of stdout.\n\
258-S, --statedir Set the vty socket directory (default is %s)\n\
33606a15
DL
259-N, --pathspace Insert prefix into config & socket paths\n"
260#ifdef GNU_LINUX
261" --netns Create and/or use Linux network namespace. If no name is\n"
262" given, uses the value from `-N`.\n"
263#endif
264"-l, --loglevel Set the logging level (default is %d).\n\
8b886ca7 265 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
266 but it can be set higher than %d if extra-verbose debugging\n\
267 messages are desired.\n\
9272302b 268 --min-restart-interval\n\
8b886ca7 269 Set the minimum seconds to wait between invocations of daemon\n\
270 restart commands (default is %d).\n\
9272302b 271 --max-restart-interval\n\
8b886ca7 272 Set the maximum seconds to wait between invocations of daemon\n\
273 restart commands (default is %d).\n\
6d0fa5c2
DS
274 --operational-timeout\n\
275 Set the time before systemd is notified that we are considered\n\
276 operational again after a daemon restart (default is %d).\n\
8b886ca7 277-i, --interval Set the status polling interval in seconds (default is %d)\n\
278-t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
279-T, --restart-timeout\n\
280 Set the restart (kill) timeout in seconds (default is %d).\n\
281 If any background jobs are still running after this much\n\
282 time has elapsed, they will be killed.\n\
283-r, --restart Supply a Bourne shell command to use to restart a single\n\
284 daemon. The command string should include '%%s' where the\n\
285 name of the daemon should be substituted.\n\
3ec95567 286 (default: '%s')\n\
8b886ca7 287-s, --start-command\n\
288 Supply a Bourne shell to command to use to start a single\n\
289 daemon. The command string should include '%%s' where the\n\
290 name of the daemon should be substituted.\n\
3ec95567 291 (default: '%s')\n\
8b886ca7 292-k, --kill-command\n\
293 Supply a Bourne shell to command to use to stop a single\n\
294 daemon. The command string should include '%%s' where the\n\
295 name of the daemon should be substituted.\n\
3ec95567 296 (default: '%s')\n\
f168b713 297 --dry Do not start or restart anything, just log.\n\
8b886ca7 298-p, --pid-file Set process identifier file name\n\
0a7c7856 299 (default is %s/watchfrr.pid).\n\
c8b40f86 300-b, --blank-string\n\
301 When the supplied argument string is found in any of the\n\
f168b713 302 various shell command arguments (-r, -s, or -k), replace\n\
c8b40f86 303 it with a space. This is an ugly hack to circumvent problems\n\
304 passing command-line arguments with embedded spaces.\n\
8b886ca7 305-v, --version Print program version\n\
d62a17ae 306-h, --help Display this help and exit\n",
64a249ad 307 frr_vtydir, DEFAULT_LOGLEVEL, LOG_EMERG, LOG_DEBUG, LOG_DEBUG,
6d0fa5c2
DS
308 DEFAULT_MIN_RESTART, DEFAULT_MAX_RESTART,
309 DEFAULT_OPERATIONAL_TIMEOUT, DEFAULT_PERIOD, DEFAULT_TIMEOUT,
310 DEFAULT_RESTART_TIMEOUT, DEFAULT_RESTART_CMD, DEFAULT_START_CMD,
311 DEFAULT_STOP_CMD, frr_vtydir);
8b886ca7 312}
313
a6810074 314static pid_t run_background(char *shell_cmd)
8b886ca7 315{
a6810074
DL
316 pid_t child;
317
318 switch (child = fork()) {
319 case -1:
450971aa 320 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
321 "fork failed, cannot run command [%s]: %s",
322 shell_cmd, safe_strerror(errno));
a6810074
DL
323 return -1;
324 case 0:
325 /* Child process. */
d62a17ae 326 /* Use separate process group so child processes can be killed
327 * easily. */
a6810074 328 if (setpgid(0, 0) < 0)
957cfa24 329 zlog_warn("setpgid(0,0) failed: %s",
a6810074
DL
330 safe_strerror(errno));
331 {
332 char shell[] = "sh";
333 char dashc[] = "-c";
d62a17ae 334 char *const argv[4] = {shell, dashc, shell_cmd, NULL};
a6810074 335 execv("/bin/sh", argv);
450971aa 336 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
337 "execv(/bin/sh -c '%s') failed: %s",
338 shell_cmd, safe_strerror(errno));
a6810074
DL
339 _exit(127);
340 }
341 default:
342 /* Parent process: we will reap the child later. */
c3f65458
QY
343 zlog_info("Forked background command [pid %d]: %s", (int)child,
344 shell_cmd);
a6810074
DL
345 return child;
346 }
8b886ca7 347}
348
a6810074
DL
/*
 * Store in *result the wall-clock time that has passed since *start_time
 * and return result, so the call can be used inside an expression.
 */
static struct timeval *time_elapsed(struct timeval *result,
				    const struct timeval *start_time)
{
	gettimeofday(result, NULL);

	result->tv_usec -= start_time->tv_usec;
	result->tv_sec -= start_time->tv_sec;

	/* Borrow from the seconds until the microseconds are non-negative. */
	for (; result->tv_usec < 0; result->tv_sec--)
		result->tv_usec += 1000000L;

	return result;
}
361
cc9f21da 362static void restart_kill(struct thread *t_kill)
8b886ca7 363{
a6810074
DL
364 struct restart_info *restart = THREAD_ARG(t_kill);
365 struct timeval delay;
366
367 time_elapsed(&delay, &restart->time);
d62a17ae 368 zlog_warn(
957cfa24 369 "%s %s child process %d still running after %ld seconds, sending signal %d",
d62a17ae 370 restart->what, restart->name, (int)restart->pid,
371 (long)delay.tv_sec, (restart->kills ? SIGKILL : SIGTERM));
a6810074
DL
372 kill(-restart->pid, (restart->kills ? SIGKILL : SIGTERM));
373 restart->kills++;
66e78ae6
QY
374 thread_add_timer(master, restart_kill, restart, gs.restart_timeout,
375 &restart->t_kill);
8b886ca7 376}
377
a6810074 378static struct restart_info *find_child(pid_t child)
8b886ca7 379{
f168b713 380 struct daemon *dmn;
7c265f7d
CF
381 if (gs.restart.pid == child)
382 return &gs.restart;
383
f168b713
DL
384 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
385 if (dmn->restart.pid == child)
386 return &dmn->restart;
a6810074
DL
387 }
388 return NULL;
8b886ca7 389}
390
a6810074 391static void sigchild(void)
8b886ca7 392{
a6810074
DL
393 pid_t child;
394 int status;
395 const char *name;
396 const char *what;
397 struct restart_info *restart;
75f8b0e4 398 struct daemon *dmn;
a6810074
DL
399
400 switch (child = waitpid(-1, &status, WNOHANG)) {
401 case -1:
450971aa 402 flog_err_sys(EC_LIB_SYSTEM_CALL, "waitpid failed: %s",
09c866e3 403 safe_strerror(errno));
a6810074
DL
404 return;
405 case 0:
406 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
407 return;
408 }
409
410 if (child == integrated_write_pid) {
411 integrated_write_sigchld(status);
412 return;
413 }
414
415 if ((restart = find_child(child)) != NULL) {
416 name = restart->name;
417 what = restart->what;
418 restart->pid = 0;
419 gs.numpids--;
b3d6bc6e
MS
420 thread_cancel(&restart->t_kill);
421
d62a17ae 422 /* Update restart time to reflect the time the command
423 * completed. */
a6810074
DL
424 gettimeofday(&restart->time, NULL);
425 } else {
09c866e3 426 flog_err_sys(
450971aa 427 EC_LIB_SYSTEM_CALL,
09c866e3
QY
428 "waitpid returned status for an unknown child process %d",
429 (int)child);
a6810074
DL
430 name = "(unknown)";
431 what = "background";
432 }
433 if (WIFSTOPPED(status))
957cfa24 434 zlog_warn("%s %s process %d is stopped", what, name,
d62a17ae 435 (int)child);
a6810074 436 else if (WIFSIGNALED(status))
d62a17ae 437 zlog_warn("%s %s process %d terminated due to signal %d", what,
438 name, (int)child, WTERMSIG(status));
a6810074
DL
439 else if (WIFEXITED(status)) {
440 if (WEXITSTATUS(status) != 0)
d62a17ae 441 zlog_warn(
442 "%s %s process %d exited with non-zero status %d",
443 what, name, (int)child, WEXITSTATUS(status));
75f8b0e4 444 else {
a6810074
DL
445 zlog_debug("%s %s process %d exited normally", what,
446 name, (int)child);
75f8b0e4
DL
447
448 if (restart && restart != &gs.restart) {
449 dmn = container_of(restart, struct daemon,
450 restart);
451 restart_done(dmn);
452 } else if (restart)
453 for (dmn = gs.daemons; dmn; dmn = dmn->next)
454 restart_done(dmn);
455 }
a6810074 456 } else
09c866e3 457 flog_err_sys(
450971aa 458 EC_LIB_SYSTEM_CALL,
09c866e3
QY
459 "cannot interpret %s %s process %d wait status 0x%x",
460 what, name, (int)child, status);
a6810074 461 phase_check();
8b886ca7 462}
463
d62a17ae 464static int run_job(struct restart_info *restart, const char *cmdtype,
465 const char *command, int force, int update_interval)
8b886ca7 466{
a6810074
DL
467 struct timeval delay;
468
469 if (gs.loglevel > LOG_DEBUG + 1)
470 zlog_debug("attempting to %s %s", cmdtype, restart->name);
471
472 if (restart->pid) {
473 if (gs.loglevel > LOG_DEBUG + 1)
d62a17ae 474 zlog_debug(
475 "cannot %s %s, previous pid %d still running",
476 cmdtype, restart->name, (int)restart->pid);
a6810074
DL
477 return -1;
478 }
479
b3ee8bcc
DS
480 char buffer[512];
481
482 snprintf(buffer, sizeof(buffer), "restarting %s", restart->name);
483 systemd_send_status(buffer);
b3ee8bcc 484
d62a17ae 485 /* Note: time_elapsed test must come before the force test, since we
486 need
a6810074
DL
487 to make sure that delay is initialized for use below in updating the
488 restart interval. */
489 if ((time_elapsed(&delay, &restart->time)->tv_sec < restart->interval)
490 && !force) {
b3ee8bcc 491
a6810074 492 if (gs.loglevel > LOG_DEBUG + 1)
d62a17ae 493 zlog_debug(
3efd0893 494 "postponing %s %s: elapsed time %ld < retry interval %ld",
d62a17ae 495 cmdtype, restart->name, (long)delay.tv_sec,
496 restart->interval);
a6810074
DL
497 return -1;
498 }
499
500 gettimeofday(&restart->time, NULL);
501 restart->kills = 0;
502 {
503 char cmd[strlen(command) + strlen(restart->name) + 1];
504 snprintf(cmd, sizeof(cmd), command, restart->name);
505 if ((restart->pid = run_background(cmd)) > 0) {
d62a17ae 506 thread_add_timer(master, restart_kill, restart,
507 gs.restart_timeout, &restart->t_kill);
a6810074
DL
508 restart->what = cmdtype;
509 gs.numpids++;
510 } else
511 restart->pid = 0;
512 }
513
514 /* Calculate the new restart interval. */
515 if (update_interval) {
516 if (delay.tv_sec > 2 * gs.max_restart_interval)
517 restart->interval = gs.min_restart_interval;
518 else if ((restart->interval *= 2) > gs.max_restart_interval)
519 restart->interval = gs.max_restart_interval;
520 if (gs.loglevel > LOG_DEBUG + 1)
521 zlog_debug("restart %s interval is now %ld",
522 restart->name, restart->interval);
523 }
524 return restart->pid;
8b886ca7 525}
526
/*
 * Helpers that (re)arm a daemon's event-loop tasks.  Each one clears the
 * stored task reference and then schedules the task again.
 *
 * The definitions deliberately do NOT end in a semicolon: the caller's own
 * ';' completes the do/while statement, which keeps the macros usable as
 * the body of an if/else.
 */

/* Schedule handle_read() for the daemon's socket. */
#define SET_READ_HANDLER(DMN)                                                  \
	do {                                                                   \
		(DMN)->t_read = NULL;                                          \
		thread_add_read(master, handle_read, (DMN), (DMN)->fd,         \
				&(DMN)->t_read);                               \
	} while (0)

/* Schedule wakeup_down() after a fuzzed polling period. */
#define SET_WAKEUP_DOWN(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_down, (DMN),              \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Schedule wakeup_unresponsive() after a fuzzed polling period. */
#define SET_WAKEUP_UNRESPONSIVE(DMN)                                           \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_unresponsive, (DMN),      \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Schedule wakeup_send_echo() after a fuzzed polling period. */
#define SET_WAKEUP_ECHO(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_send_echo, (DMN),         \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)
8b886ca7 554
cc9f21da 555static void wakeup_down(struct thread *t_wakeup)
8b886ca7 556{
a6810074
DL
557 struct daemon *dmn = THREAD_ARG(t_wakeup);
558
559 dmn->t_wakeup = NULL;
560 if (try_connect(dmn) < 0)
561 SET_WAKEUP_DOWN(dmn);
562 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
563 try_restart(dmn);
8b886ca7 564}
565
cc9f21da 566static void wakeup_init(struct thread *t_wakeup)
8b886ca7 567{
a6810074
DL
568 struct daemon *dmn = THREAD_ARG(t_wakeup);
569
570 dmn->t_wakeup = NULL;
571 if (try_connect(dmn) < 0) {
c3f65458
QY
572 zlog_info(
573 "%s state -> down : initial connection attempt failed",
574 dmn->name);
a6810074
DL
575 dmn->state = DAEMON_DOWN;
576 }
c0e5cb52 577 phase_check();
8b886ca7 578}
579
75f8b0e4
DL
580static void restart_done(struct daemon *dmn)
581{
582 if (dmn->state != DAEMON_DOWN) {
3f391bec
DS
583 zlog_warn(
584 "Daemon: %s: is in %s state but expected it to be in DAEMON_DOWN state",
585 dmn->name, state_str[dmn->state]);
75f8b0e4
DL
586 return;
587 }
28ef0ee1 588 THREAD_OFF(dmn->t_wakeup);
50478845 589
75f8b0e4
DL
590 if (try_connect(dmn) < 0)
591 SET_WAKEUP_DOWN(dmn);
592}
593
6d0fa5c2
DS
/*
 * Timer callback scheduled by daemon_up(): report the "FRR Operational"
 * status to systemd.  The thread argument is not used.
 */
static void daemon_restarting_operational(struct thread *thread)
{
	static const char status[] = "FRR Operational";

	systemd_send_status(status);
}
598
a6810074 599static void daemon_down(struct daemon *dmn, const char *why)
8b886ca7 600{
a6810074 601 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
1c50c1c0
QY
602 flog_err(EC_WATCHFRR_CONNECTION, "%s state -> down : %s",
603 dmn->name, why);
a6810074
DL
604 else if (gs.loglevel > LOG_DEBUG)
605 zlog_debug("%s still down : %s", dmn->name, why);
606 if (IS_UP(dmn))
607 gs.numdown++;
608 dmn->state = DAEMON_DOWN;
609 if (dmn->fd >= 0) {
610 close(dmn->fd);
611 dmn->fd = -1;
612 }
613 THREAD_OFF(dmn->t_read);
614 THREAD_OFF(dmn->t_write);
615 THREAD_OFF(dmn->t_wakeup);
616 if (try_connect(dmn) < 0)
617 SET_WAKEUP_DOWN(dmn);
6d0fa5c2
DS
618
619 systemd_send_status("FRR partially operational");
a6810074 620 phase_check();
8b886ca7 621}
622
cc9f21da 623static void handle_read(struct thread *t_read)
8b886ca7 624{
a6810074
DL
625 struct daemon *dmn = THREAD_ARG(t_read);
626 static const char resp[sizeof(PING_TOKEN) + 4] = PING_TOKEN "\n";
627 char buf[sizeof(resp) + 100];
628 ssize_t rc;
629 struct timeval delay;
630
631 dmn->t_read = NULL;
632 if ((rc = read(dmn->fd, buf, sizeof(buf))) < 0) {
633 char why[100];
634
635 if (ERRNO_IO_RETRY(errno)) {
636 /* Pretend it never happened. */
637 SET_READ_HANDLER(dmn);
cc9f21da 638 return;
a6810074
DL
639 }
640 snprintf(why, sizeof(why), "unexpected read error: %s",
641 safe_strerror(errno));
642 daemon_down(dmn, why);
cc9f21da 643 return;
8b886ca7 644 }
a6810074
DL
645 if (rc == 0) {
646 daemon_down(dmn, "read returned EOF");
cc9f21da 647 return;
a6810074
DL
648 }
649 if (!dmn->echo_sent.tv_sec) {
650 char why[sizeof(buf) + 100];
651 snprintf(why, sizeof(why),
652 "unexpected read returns %d bytes: %.*s", (int)rc,
653 (int)rc, buf);
654 daemon_down(dmn, why);
cc9f21da 655 return;
8b886ca7 656 }
a6810074
DL
657
658 /* We are expecting an echo response: is there any chance that the
659 response would not be returned entirely in the first read? That
660 seems inconceivable... */
661 if ((rc != sizeof(resp)) || memcmp(buf, resp, sizeof(resp))) {
662 char why[100 + sizeof(buf)];
663 snprintf(why, sizeof(why),
3efd0893 664 "read returned bad echo response of %d bytes (expecting %u): %.*s",
d7c0a89a 665 (int)rc, (unsigned int)sizeof(resp), (int)rc, buf);
a6810074 666 daemon_down(dmn, why);
cc9f21da 667 return;
a6810074
DL
668 }
669
670 time_elapsed(&delay, &dmn->echo_sent);
671 dmn->echo_sent.tv_sec = 0;
672 if (dmn->state == DAEMON_UNRESPONSIVE) {
673 if (delay.tv_sec < gs.timeout) {
674 dmn->state = DAEMON_UP;
d62a17ae 675 zlog_warn(
3efd0893 676 "%s state -> up : echo response received after %ld.%06ld seconds",
d62a17ae 677 dmn->name, (long)delay.tv_sec,
678 (long)delay.tv_usec);
a6810074 679 } else
d62a17ae 680 zlog_warn(
3efd0893 681 "%s: slow echo response finally received after %ld.%06ld seconds",
d62a17ae 682 dmn->name, (long)delay.tv_sec,
683 (long)delay.tv_usec);
a6810074
DL
684 } else if (gs.loglevel > LOG_DEBUG + 1)
685 zlog_debug("%s: echo response received after %ld.%06ld seconds",
686 dmn->name, (long)delay.tv_sec, (long)delay.tv_usec);
687
688 SET_READ_HANDLER(dmn);
b3d6bc6e 689 thread_cancel(&dmn->t_wakeup);
a6810074 690 SET_WAKEUP_ECHO(dmn);
8b886ca7 691}
692
207e0d7a
DS
693/*
694 * Wait till we notice that all daemons are ready before
695 * we send we are ready to systemd
696 */
5c9d1c83 697static void daemon_send_ready(int exitcode)
207e0d7a 698{
5c9d1c83 699 FILE *fp;
a6810074 700 static int sent = 0;
43e587c1 701 char started[1024];
207e0d7a 702
5c9d1c83
DL
703 if (sent)
704 return;
705
706 if (exitcode == 0)
0a7c7856 707 zlog_notice("all daemons up, doing startup-complete notify");
5c9d1c83
DL
708 else if (gs.numdown < gs.numdaemons)
709 flog_err(EC_WATCHFRR_CONNECTION,
3efd0893 710 "startup did not complete within timeout (%d/%d daemons running)",
5c9d1c83
DL
711 gs.numdaemons - gs.numdown, gs.numdaemons);
712 else {
713 flog_err(EC_WATCHFRR_CONNECTION,
3efd0893 714 "all configured daemons failed to start -- exiting watchfrr");
5c9d1c83
DL
715 exit(exitcode);
716
717 }
0a7c7856 718
5c9d1c83
DL
719 frr_detach();
720
33606a15 721 snprintf(started, sizeof(started), "%s/%s", frr_vtydir,
3c649c71
DS
722 "watchfrr.started");
723 fp = fopen(started, "w");
5c9d1c83
DL
724 if (fp)
725 fclose(fp);
247898d5
DL
726
727 systemd_send_started(master);
b3ee8bcc 728 systemd_send_status("FRR Operational");
5c9d1c83 729 sent = 1;
207e0d7a
DS
730}
731
a6810074 732static void daemon_up(struct daemon *dmn, const char *why)
8b886ca7 733{
a6810074
DL
734 dmn->state = DAEMON_UP;
735 gs.numdown--;
736 dmn->connect_tries = 0;
737 zlog_notice("%s state -> up : %s", dmn->name, why);
6d0fa5c2 738 if (gs.numdown == 0) {
5c9d1c83 739 daemon_send_ready(0);
6d0fa5c2
DS
740
741 THREAD_OFF(gs.t_operational);
742
743 thread_add_timer(master, daemon_restarting_operational, NULL,
744 gs.operational_timeout, &gs.t_operational);
745 }
746
a8cbb8b3 747 SET_WAKEUP_ECHO(dmn);
a6810074 748 phase_check();
8b886ca7 749}
750
cc9f21da 751static void check_connect(struct thread *t_write)
8b886ca7 752{
a6810074
DL
753 struct daemon *dmn = THREAD_ARG(t_write);
754 int sockerr;
755 socklen_t reslen = sizeof(sockerr);
756
757 dmn->t_write = NULL;
758 if (getsockopt(dmn->fd, SOL_SOCKET, SO_ERROR, (char *)&sockerr, &reslen)
759 < 0) {
760 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn->name,
761 safe_strerror(errno));
762 daemon_down(dmn,
763 "getsockopt failed checking connection success");
cc9f21da 764 return;
a6810074
DL
765 }
766 if ((reslen == sizeof(sockerr)) && sockerr) {
767 char why[100];
d62a17ae 768 snprintf(
769 why, sizeof(why),
770 "getsockopt reports that connection attempt failed: %s",
771 safe_strerror(sockerr));
a6810074 772 daemon_down(dmn, why);
cc9f21da 773 return;
a6810074
DL
774 }
775
776 daemon_up(dmn, "delayed connect succeeded");
8b886ca7 777}
778
cc9f21da 779static void wakeup_connect_hanging(struct thread *t_wakeup)
8b886ca7 780{
a6810074
DL
781 struct daemon *dmn = THREAD_ARG(t_wakeup);
782 char why[100];
783
784 dmn->t_wakeup = NULL;
785 snprintf(why, sizeof(why),
786 "connection attempt timed out after %ld seconds", gs.timeout);
787 daemon_down(dmn, why);
8b886ca7 788}
789
790/* Making connection to protocol daemon. */
a6810074 791static int try_connect(struct daemon *dmn)
8b886ca7 792{
a6810074
DL
793 int sock;
794 struct sockaddr_un addr;
795 socklen_t len;
796
797 if (gs.loglevel > LOG_DEBUG + 1)
798 zlog_debug("%s: attempting to connect", dmn->name);
799 dmn->connect_tries++;
800
6006b807 801 memset(&addr, 0, sizeof(addr));
a6810074 802 addr.sun_family = AF_UNIX;
d62a17ae 803 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty", gs.vtydir,
804 dmn->name);
6f0e3f6e 805#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
a6810074 806 len = addr.sun_len = SUN_LEN(&addr);
8b886ca7 807#else
a6810074 808 len = sizeof(addr.sun_family) + strlen(addr.sun_path);
d62a17ae 809#endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
a6810074
DL
810
811 /* Quick check to see if we might succeed before we go to the trouble
812 of creating a socket. */
813 if (access(addr.sun_path, W_OK) < 0) {
814 if (errno != ENOENT)
450971aa 815 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
816 "%s: access to socket %s denied: %s",
817 dmn->name, addr.sun_path,
818 safe_strerror(errno));
a6810074
DL
819 return -1;
820 }
821
822 if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
450971aa 823 flog_err_sys(EC_LIB_SOCKET, "%s(%s): cannot make socket: %s",
09c866e3 824 __func__, addr.sun_path, safe_strerror(errno));
a6810074
DL
825 return -1;
826 }
827
828 if (set_nonblocking(sock) < 0 || set_cloexec(sock) < 0) {
450971aa 829 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
830 "%s(%s): set_nonblocking/cloexec(%d) failed",
831 __func__, addr.sun_path, sock);
a6810074
DL
832 close(sock);
833 return -1;
8b886ca7 834 }
a6810074
DL
835
836 if (connect(sock, (struct sockaddr *)&addr, len) < 0) {
837 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK)) {
838 if (gs.loglevel > LOG_DEBUG)
839 zlog_debug("%s(%s): connect failed: %s",
840 __func__, addr.sun_path,
841 safe_strerror(errno));
842 close(sock);
843 return -1;
844 }
845 if (gs.loglevel > LOG_DEBUG)
846 zlog_debug("%s: connection in progress", dmn->name);
847 dmn->state = DAEMON_CONNECTING;
848 dmn->fd = sock;
66e78ae6 849 thread_add_write(master, check_connect, dmn, dmn->fd,
d62a17ae 850 &dmn->t_write);
d62a17ae 851 thread_add_timer(master, wakeup_connect_hanging, dmn,
852 gs.timeout, &dmn->t_wakeup);
a6810074
DL
853 SET_READ_HANDLER(dmn);
854 return 0;
855 }
856
857 dmn->fd = sock;
858 SET_READ_HANDLER(dmn);
859 daemon_up(dmn, "connect succeeded");
860 return 1;
8b886ca7 861}
862
cc9f21da 863static void phase_hanging(struct thread *t_hanging)
8b886ca7 864{
a6810074 865 gs.t_phase_hanging = NULL;
f74ae2bb 866 flog_err(EC_WATCHFRR_CONNECTION,
1c50c1c0
QY
867 "Phase [%s] hanging for %ld seconds, aborting phased restart",
868 phase_str[gs.phase], PHASE_TIMEOUT);
a6810074 869 gs.phase = PHASE_NONE;
8b886ca7 870}
871
f1692c51 872static void set_phase(enum restart_phase new_phase)
8b886ca7 873{
a6810074 874 gs.phase = new_phase;
b3d6bc6e
MS
875 thread_cancel(&gs.t_phase_hanging);
876
66e78ae6
QY
877 thread_add_timer(master, phase_hanging, NULL, PHASE_TIMEOUT,
878 &gs.t_phase_hanging);
8b886ca7 879}
880
a6810074 881static void phase_check(void)
8b886ca7 882{
c0e5cb52
DL
883 struct daemon *dmn;
884
a6810074
DL
885 switch (gs.phase) {
886 case PHASE_NONE:
887 break;
c0e5cb52
DL
888
889 case PHASE_INIT:
890 for (dmn = gs.daemons; dmn; dmn = dmn->next)
891 if (dmn->state == DAEMON_INIT)
892 return;
893
894 /* startup complete, everything out of INIT */
895 gs.phase = PHASE_NONE;
896 for (dmn = gs.daemons; dmn; dmn = dmn->next)
897 if (dmn->state == DAEMON_DOWN) {
898 SET_WAKEUP_DOWN(dmn);
899 try_restart(dmn);
900 }
901 break;
a6810074
DL
902 case PHASE_STOPS_PENDING:
903 if (gs.numpids)
904 break;
d62a17ae 905 zlog_info(
906 "Phased restart: all routing daemon stop jobs have completed.");
a6810074
DL
907 set_phase(PHASE_WAITING_DOWN);
908
d62a17ae 909 /*FALLTHRU*/
a6810074
DL
910 case PHASE_WAITING_DOWN:
911 if (gs.numdown + IS_UP(gs.special) < gs.numdaemons)
912 break;
6d0fa5c2 913 systemd_send_status("Phased Restart");
a6810074
DL
914 zlog_info("Phased restart: all routing daemons now down.");
915 run_job(&gs.special->restart, "restart", gs.restart_command, 1,
916 1);
917 set_phase(PHASE_ZEBRA_RESTART_PENDING);
918
d62a17ae 919 /*FALLTHRU*/
a6810074
DL
920 case PHASE_ZEBRA_RESTART_PENDING:
921 if (gs.special->restart.pid)
922 break;
6d0fa5c2 923 systemd_send_status("Zebra Restarting");
a6810074
DL
924 zlog_info("Phased restart: %s restart job completed.",
925 gs.special->name);
926 set_phase(PHASE_WAITING_ZEBRA_UP);
927
d62a17ae 928 /*FALLTHRU*/
a6810074
DL
929 case PHASE_WAITING_ZEBRA_UP:
930 if (!IS_UP(gs.special))
931 break;
932 zlog_info("Phased restart: %s is now up.", gs.special->name);
933 {
934 struct daemon *dmn;
935 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
936 if (dmn != gs.special)
937 run_job(&dmn->restart, "start",
938 gs.start_command, 1, 0);
939 }
940 }
941 gs.phase = PHASE_NONE;
942 THREAD_OFF(gs.t_phase_hanging);
943 zlog_notice("Phased global restart has completed.");
944 break;
945 }
8b886ca7 946}
947
a6810074 948static void try_restart(struct daemon *dmn)
8b886ca7 949{
f168b713 950 if (watch_only)
a6810074 951 return;
a6810074 952
f168b713
DL
953 if (dmn != gs.special) {
954 if ((gs.special->state == DAEMON_UP)
955 && (gs.phase == PHASE_NONE))
956 run_job(&dmn->restart, "restart", gs.restart_command, 0,
957 1);
958 else
959 zlog_debug(
3efd0893 960 "%s: postponing restart attempt because master %s daemon not up [%s], or phased restart in progress",
f168b713
DL
961 dmn->name, gs.special->name,
962 state_str[gs.special->state]);
963 return;
964 }
965
966 if ((gs.phase != PHASE_NONE) || gs.numpids) {
967 if (gs.loglevel > LOG_DEBUG + 1)
968 zlog_debug(
3efd0893 969 "postponing phased global restart: restart already in progress [%s], or outstanding child processes [%d]",
f168b713
DL
970 phase_str[gs.phase], gs.numpids);
971 return;
972 }
973 /* Is it too soon for a restart? */
974 {
975 struct timeval delay;
976 if (time_elapsed(&delay, &gs.special->restart.time)->tv_sec
977 < gs.special->restart.interval) {
a6810074 978 if (gs.loglevel > LOG_DEBUG + 1)
d62a17ae 979 zlog_debug(
3efd0893 980 "postponing phased global restart: elapsed time %ld < retry interval %ld",
f168b713
DL
981 (long)delay.tv_sec,
982 gs.special->restart.interval);
983 return;
a6810074 984 }
8b886ca7 985 }
f168b713 986 run_job(&gs.restart, "restart", gs.restart_command, 0, 1);
8b886ca7 987}
988
cc9f21da 989static void wakeup_unresponsive(struct thread *t_wakeup)
8b886ca7 990{
a6810074
DL
991 struct daemon *dmn = THREAD_ARG(t_wakeup);
992
993 dmn->t_wakeup = NULL;
994 if (dmn->state != DAEMON_UNRESPONSIVE)
f74ae2bb 995 flog_err(EC_WATCHFRR_CONNECTION,
3efd0893 996 "%s: no longer unresponsive (now %s), wakeup should have been cancelled!",
1c50c1c0 997 dmn->name, state_str[dmn->state]);
a6810074
DL
998 else {
999 SET_WAKEUP_UNRESPONSIVE(dmn);
1000 try_restart(dmn);
1001 }
8b886ca7 1002}
1003
cc9f21da 1004static void wakeup_no_answer(struct thread *t_wakeup)
8b886ca7 1005{
a6810074
DL
1006 struct daemon *dmn = THREAD_ARG(t_wakeup);
1007
1008 dmn->t_wakeup = NULL;
1009 dmn->state = DAEMON_UNRESPONSIVE;
cc53b605 1010 if (dmn->ignore_timeout)
cc9f21da 1011 return;
f74ae2bb 1012 flog_err(EC_WATCHFRR_CONNECTION,
3efd0893 1013 "%s state -> unresponsive : no response yet to ping sent %ld seconds ago",
1c50c1c0 1014 dmn->name, gs.timeout);
71e7975a
DL
1015 SET_WAKEUP_UNRESPONSIVE(dmn);
1016 try_restart(dmn);
8b886ca7 1017}
1018
cc9f21da 1019static void wakeup_send_echo(struct thread *t_wakeup)
8b886ca7 1020{
a6810074
DL
1021 static const char echocmd[] = "echo " PING_TOKEN;
1022 ssize_t rc;
1023 struct daemon *dmn = THREAD_ARG(t_wakeup);
1024
1025 dmn->t_wakeup = NULL;
d62a17ae 1026 if (((rc = write(dmn->fd, echocmd, sizeof(echocmd))) < 0)
1027 || ((size_t)rc != sizeof(echocmd))) {
a6810074
DL
1028 char why[100 + sizeof(echocmd)];
1029 snprintf(why, sizeof(why),
1030 "write '%s' returned %d instead of %u", echocmd,
d7c0a89a 1031 (int)rc, (unsigned int)sizeof(echocmd));
a6810074
DL
1032 daemon_down(dmn, why);
1033 } else {
1034 gettimeofday(&dmn->echo_sent, NULL);
66e78ae6
QY
1035 thread_add_timer(master, wakeup_no_answer, dmn, gs.timeout,
1036 &dmn->t_wakeup);
a6810074 1037 }
8b886ca7 1038}
1039
470bc619
QY
1040bool check_all_up(void)
1041{
1042 struct daemon *dmn;
1043
1044 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1045 if (dmn->state != DAEMON_UP)
1046 return false;
1047 return true;
1048}
1049
af568444
DL
1050void watchfrr_status(struct vty *vty)
1051{
1052 struct daemon *dmn;
1053 struct timeval delay;
1054
1055 vty_out(vty, "watchfrr global phase: %s\n", phase_str[gs.phase]);
603fef0e
DS
1056 vty_out(vty, " Restart Command: %pSQq\n", gs.restart_command);
1057 vty_out(vty, " Start Command: %pSQq\n", gs.start_command);
1058 vty_out(vty, " Stop Command: %pSQq\n", gs.stop_command);
1059 vty_out(vty, " Min Restart Interval: %ld\n", gs.min_restart_interval);
1060 vty_out(vty, " Max Restart Interval: %ld\n", gs.max_restart_interval);
1061 vty_out(vty, " Restart Timeout: %ld\n", gs.restart_timeout);
af568444
DL
1062 if (gs.restart.pid)
1063 vty_out(vty, " global restart running, pid %ld\n",
1064 (long)gs.restart.pid);
1065
1066 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
cc53b605
DS
1067 vty_out(vty, " %-20s %s%s", dmn->name, state_str[dmn->state],
1068 dmn->ignore_timeout ? "/Ignoring Timeout\n" : "\n");
af568444
DL
1069 if (dmn->restart.pid)
1070 vty_out(vty, " restart running, pid %ld\n",
1071 (long)dmn->restart.pid);
1072 else if (dmn->state == DAEMON_DOWN &&
1073 time_elapsed(&delay, &dmn->restart.time)->tv_sec
1074 < dmn->restart.interval)
3efd0893 1075 vty_out(vty, " restarting in %jd seconds (%jds backoff interval)\n",
051a0be4
DL
1076 (intmax_t)dmn->restart.interval
1077 - (intmax_t)delay.tv_sec,
1078 (intmax_t)dmn->restart.interval);
af568444
DL
1079 }
1080}
1081
/*
 * Termination handler, registered for both SIGINT and SIGTERM in
 * watchfrr_signals: log, notify systemd that we are stopping, and exit
 * with status 0.
 */
static void sigint(void)
{
	zlog_notice("Terminating on signal");

	systemd_send_stopping();

	exit(0);
}
1088
/*
 * Check that cmd is usable as a command template: it must contain exactly
 * one '%' character, and that character must be immediately followed by
 * 's'.  Returns 1 if so, 0 otherwise (including when cmd is NULL).
 */
static int valid_command(const char *cmd)
{
	const char *pct;

	if (!cmd)
		return 0;

	pct = strchr(cmd, '%');
	if (!pct)
		return 0;

	if (pct[1] != 's')
		return 0;

	/* No further '%' may appear anywhere after the first one. */
	return strchr(pct + 1, '%') == NULL;
}
1099
/* This is an ugly hack to circumvent problems with passing command-line
   arguments that contain spaces.  The fix is to use a configuration file.

   Returns a newly allocated copy of cmd in which every occurrence of
   blankstr is replaced by a single space.  The caller owns the result.
   The process exits with status 1 if memory cannot be allocated.

   The input is scanned once, left to right, and replaced text is never
   rescanned.  An empty blankstr matches nothing and yields an unchanged
   copy.  The previous in-place version restarted its search at the start
   of the buffer after each replacement: it looped forever when blankstr
   was " ", and with an empty blankstr it also grew the string past its
   allocation. */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	size_t bslen = strlen(blankstr);
	const char *src = cmd;
	const char *hit;
	char *res;
	char *dst;

	/* A replacement never lengthens the text (bslen >= 1 whenever one
	   happens), so a buffer the size of the input always suffices. */
	res = malloc(strlen(cmd) + 1);
	if (!res) {
		perror("malloc");
		exit(1);
	}
	dst = res;

	if (bslen != 0) {
		while ((hit = strstr(src, blankstr)) != NULL) {
			size_t prefix = (size_t)(hit - src);

			memcpy(dst, src, prefix);
			dst += prefix;
			*dst++ = ' ';
			src = hit + bslen;
		}
	}

	/* Copy the remaining tail together with its terminating NUL. */
	memcpy(dst, src, strlen(src) + 1);
	return res;
}
1119
/*
 * Timer callback armed in watchfrr_init() with STARTUP_TIMEOUT: calls
 * daemon_send_ready() with argument 1.
 */
static void startup_timeout(struct thread *t_wakeup)
{
	daemon_send_ready(1);
}
1124
33606a15
DL
1125#ifdef GNU_LINUX
1126
1127#include <sys/mount.h>
1128#include <sched.h>
1129
1130#define NETNS_RUN_DIR "/var/run/netns"
1131
1132static void netns_create(int dirfd, const char *nsname)
1133{
1134 /* make /var/run/netns shared between mount namespaces
1135 * just like iproute2 sets it up
1136 */
1137 if (mount("", NETNS_RUN_DIR, "none", MS_SHARED | MS_REC, NULL)) {
1138 if (errno != EINVAL) {
1139 perror("mount");
1140 exit(1);
1141 }
1142
1143 if (mount(NETNS_RUN_DIR, NETNS_RUN_DIR, "none",
1144 MS_BIND | MS_REC, NULL)) {
1145 perror("mount");
1146 exit(1);
1147 }
1148
1149 if (mount("", NETNS_RUN_DIR, "none", MS_SHARED | MS_REC,
1150 NULL)) {
1151 perror("mount");
1152 exit(1);
1153 }
1154 }
1155
1156 /* need an empty file to mount on top of */
1157 int nsfd = openat(dirfd, nsname, O_CREAT | O_RDONLY | O_EXCL, 0);
1158
1159 if (nsfd < 0) {
1160 fprintf(stderr, "failed to create \"%s/%s\": %s\n",
1161 NETNS_RUN_DIR, nsname, strerror(errno));
1162 exit(1);
1163 }
1164 close(nsfd);
1165
1166 if (unshare(CLONE_NEWNET)) {
1167 perror("unshare");
1168 unlinkat(dirfd, nsname, 0);
1169 exit(1);
1170 }
1171
1172 char *dstpath = asprintfrr(MTYPE_TMP, "%s/%s", NETNS_RUN_DIR, nsname);
1173
1174 /* bind-mount so the namespace has a name and is persistent */
1175 if (mount("/proc/self/ns/net", dstpath, "none", MS_BIND, NULL) < 0) {
1176 fprintf(stderr, "failed to bind-mount netns to \"%s\": %s\n",
1177 dstpath, strerror(errno));
1178 unlinkat(dirfd, nsname, 0);
1179 exit(1);
1180 }
1181
1182 XFREE(MTYPE_TMP, dstpath);
1183}
1184
1185static void netns_setup(const char *nsname)
1186{
1187 int dirfd, nsfd;
1188
1189 dirfd = open(NETNS_RUN_DIR, O_DIRECTORY | O_RDONLY);
1190 if (dirfd < 0) {
1191 if (errno == ENOTDIR) {
1192 fprintf(stderr, "error: \"%s\" is not a directory!\n",
1193 NETNS_RUN_DIR);
1194 exit(1);
1195 } else if (errno == ENOENT) {
1196 if (mkdir(NETNS_RUN_DIR, 0755)) {
1197 fprintf(stderr, "error: \"%s\": mkdir: %s\n",
1198 NETNS_RUN_DIR, strerror(errno));
1199 exit(1);
1200 }
1201 dirfd = open(NETNS_RUN_DIR, O_DIRECTORY | O_RDONLY);
1202 if (dirfd < 0) {
1203 fprintf(stderr, "error: \"%s\": opendir: %s\n",
1204 NETNS_RUN_DIR, strerror(errno));
1205 exit(1);
1206 }
1207 } else {
1208 fprintf(stderr, "error: \"%s\": %s\n",
1209 NETNS_RUN_DIR, strerror(errno));
1210 exit(1);
1211 }
1212 }
1213
1214 nsfd = openat(dirfd, nsname, O_RDONLY);
1215 if (nsfd < 0 && errno != ENOENT) {
1216 fprintf(stderr, "error: \"%s/%s\": %s\n",
1217 NETNS_RUN_DIR, nsname, strerror(errno));
1218 exit(1);
1219 }
1220 if (nsfd < 0)
1221 netns_create(dirfd, nsname);
1222 else {
1223 if (setns(nsfd, CLONE_NEWNET)) {
1224 perror("setns");
1225 exit(1);
1226 }
1227 close(nsfd);
1228 }
1229 close(dirfd);
1230
1231 /* make sure loopback is up... weird things happen otherwise.
1232 * ioctl is perfectly fine for this, don't need netlink...
1233 */
1234 int sockfd;
1235 struct ifreq ifr = { };
1236
1237 strlcpy(ifr.ifr_name, "lo", sizeof(ifr.ifr_name));
1238
1239 sockfd = socket(AF_INET, SOCK_DGRAM, 0);
1240 if (sockfd < 0) {
1241 perror("socket");
1242 exit(1);
1243 }
1244 if (ioctl(sockfd, SIOCGIFFLAGS, &ifr)) {
1245 perror("ioctl(SIOCGIFFLAGS, \"lo\")");
1246 exit(1);
1247 }
1248 if (!(ifr.ifr_flags & IFF_UP)) {
1249 ifr.ifr_flags |= IFF_UP;
1250 if (ioctl(sockfd, SIOCSIFFLAGS, &ifr)) {
1251 perror("ioctl(SIOCSIFFLAGS, \"lo\")");
1252 exit(1);
1253 }
1254 }
1255 close(sockfd);
1256}
1257
1258#else /* !GNU_LINUX */
1259
/*
 * Stub for builds without GNU_LINUX: network namespaces are not
 * supported, so report that on stderr and exit with status 1.
 */
static void netns_setup(const char *nsname)
{
	fprintf(stderr, "network namespaces are only available on Linux\n");

	exit(1);
}
1265#endif
1266
0a7c7856
DL
1267static void watchfrr_init(int argc, char **argv)
1268{
1269 const char *special = "zebra";
1270 int i;
1271 struct daemon *dmn, **add = &gs.daemons;
1272 char alldaemons[512] = "", *p = alldaemons;
1273
5c9d1c83
DL
1274 thread_add_timer_msec(master, startup_timeout, NULL, STARTUP_TIMEOUT,
1275 &gs.t_startup_timeout);
1276
0a7c7856
DL
1277 for (i = optind; i < argc; i++) {
1278 dmn = XCALLOC(MTYPE_WATCHFRR_DAEMON, sizeof(*dmn));
1279
1280 dmn->name = dmn->restart.name = argv[i];
1281 dmn->state = DAEMON_INIT;
1282 gs.numdaemons++;
1283 gs.numdown++;
1284 dmn->fd = -1;
c0e5cb52 1285 thread_add_timer_msec(master, wakeup_init, dmn, 0,
0a7c7856
DL
1286 &dmn->t_wakeup);
1287 dmn->restart.interval = gs.min_restart_interval;
1288 *add = dmn;
1289 add = &dmn->next;
1290
1291 if (!strcmp(dmn->name, special))
1292 gs.special = dmn;
1293 }
1294
1295 if (!gs.daemons) {
1296 fprintf(stderr,
1297 "Must specify one or more daemons to monitor.\n\n");
1298 frr_help_exit(1);
1299 }
1300 if (!watch_only && !gs.special) {
1301 fprintf(stderr, "\"%s\" daemon must be in daemon lists\n\n",
1302 special);
1303 frr_help_exit(1);
1304 }
1305
1306 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1307 snprintf(p, alldaemons + sizeof(alldaemons) - p, "%s%s",
1308 (p == alldaemons) ? "" : " ", dmn->name);
1309 p += strlen(p);
1310 }
1311 zlog_notice("%s %s watching [%s]%s", progname, FRR_VERSION, alldaemons,
1312 watch_only ? ", monitor mode" : "");
1313}
1314
a6810074 1315struct zebra_privs_t watchfrr_privs = {
95c4aff2 1316#ifdef VTY_GROUP
a6810074 1317 .vty_group = VTY_GROUP,
95c4aff2
DL
1318#endif
1319};
1320
7cc91e67 1321static struct frr_signal_t watchfrr_signals[] = {
4f04a76b
DL
1322 {
1323 .signal = SIGINT,
1324 .handler = sigint,
1325 },
1326 {
1327 .signal = SIGTERM,
1328 .handler = sigint,
1329 },
1330 {
1331 .signal = SIGCHLD,
1332 .handler = sigchild,
1333 },
1334};
1335
1336FRR_DAEMON_INFO(watchfrr, WATCHFRR,
d62a17ae 1337 .flags = FRR_NO_PRIVSEP | FRR_NO_TCPVTY | FRR_LIMITED_CLI
0a7c7856
DL
1338 | FRR_NO_CFG_PID_DRY | FRR_NO_ZCLIENT
1339 | FRR_DETACH_LATER,
4f04a76b 1340
d62a17ae 1341 .printhelp = printhelp,
1342 .copyright = "Copyright 2004 Andrew J. Schorr",
4f04a76b 1343
d62a17ae 1344 .signals = watchfrr_signals,
1345 .n_signals = array_size(watchfrr_signals),
4f04a76b 1346
80413c20
DL
1347 .privs = &watchfrr_privs,
1348);
4f04a76b 1349
999f153e
DL
1350#define DEPRECATED_OPTIONS "aAezR:"
1351
a6810074 1352int main(int argc, char **argv)
8b886ca7 1353{
a6810074 1354 int opt;
a6810074 1355 const char *blankstr = NULL;
33606a15
DL
1356 const char *netns = NULL;
1357 bool netns_en = false;
a6810074 1358
4f04a76b
DL
1359 frr_preinit(&watchfrr_di, argc, argv);
1360 progname = watchfrr_di.progname;
1361
33606a15 1362 frr_opt_add("b:di:k:l:N:p:r:S:s:t:T:" DEPRECATED_OPTIONS, longopts, "");
a6810074
DL
1363
1364 gs.restart.name = "all";
4f04a76b 1365 while ((opt = frr_getopt(argc, argv, NULL)) != EOF) {
999f153e
DL
1366 if (opt && opt < 128 && strchr(DEPRECATED_OPTIONS, opt)) {
1367 fprintf(stderr,
1368 "The -%c option no longer exists.\n"
1369 "Please refer to the watchfrr(8) man page.\n",
1370 opt);
1371 exit(1);
1372 }
1373
a6810074
DL
1374 switch (opt) {
1375 case 0:
1376 break;
a6810074
DL
1377 case 'b':
1378 blankstr = optarg;
1379 break;
f168b713
DL
1380 case OPTION_DRY:
1381 watch_only = true;
a6810074
DL
1382 break;
1383 case 'k':
1384 if (!valid_command(optarg)) {
1385 fprintf(stderr,
1386 "Invalid kill command, must contain '%%s': %s\n",
1387 optarg);
4f04a76b 1388 frr_help_exit(1);
a6810074
DL
1389 }
1390 gs.stop_command = optarg;
1391 break;
d62a17ae 1392 case 'l': {
1393 char garbage[3];
1394 if ((sscanf(optarg, "%d%1s", &gs.loglevel, garbage)
1395 != 1)
1396 || (gs.loglevel < LOG_EMERG)) {
1397 fprintf(stderr,
1398 "Invalid loglevel argument: %s\n",
1399 optarg);
1400 frr_help_exit(1);
a6810074 1401 }
d62a17ae 1402 } break;
1403 case OPTION_MINRESTART: {
1404 char garbage[3];
1405 if ((sscanf(optarg, "%ld%1s", &gs.min_restart_interval,
1406 garbage)
1407 != 1)
1408 || (gs.min_restart_interval < 0)) {
1409 fprintf(stderr,
1410 "Invalid min_restart_interval argument: %s\n",
1411 optarg);
1412 frr_help_exit(1);
a6810074 1413 }
d62a17ae 1414 } break;
1415 case OPTION_MAXRESTART: {
1416 char garbage[3];
1417 if ((sscanf(optarg, "%ld%1s", &gs.max_restart_interval,
1418 garbage)
1419 != 1)
1420 || (gs.max_restart_interval < 0)) {
1421 fprintf(stderr,
1422 "Invalid max_restart_interval argument: %s\n",
1423 optarg);
1424 frr_help_exit(1);
a6810074 1425 }
d62a17ae 1426 } break;
6d0fa5c2
DS
1427 case OPTION_MAXOPERATIONAL: {
1428 char garbage[3];
1429
1430 if ((sscanf(optarg, "%ld%1s", &gs.operational_timeout,
1431 garbage) != 1) ||
1432 (gs.max_restart_interval < 0)) {
1433 fprintf(stderr,
1434 "Invalid Operational_timeout argument: %s\n",
1435 optarg);
1436 frr_help_exit(1);
1437 }
1438 } break;
33606a15
DL
1439 case OPTION_NETNS:
1440 netns_en = true;
b12bc77c 1441 if (optarg && strchr(optarg, '/')) {
33606a15
DL
1442 fprintf(stderr,
1443 "invalid network namespace name \"%s\" (may not contain slashes)\n",
1444 optarg);
1445 frr_help_exit(1);
1446 }
1447 netns = optarg;
1448 break;
d62a17ae 1449 case 'i': {
1450 char garbage[3];
1451 int period;
1452 if ((sscanf(optarg, "%d%1s", &period, garbage) != 1)
1453 || (gs.period < 1)) {
1454 fprintf(stderr,
1455 "Invalid interval argument: %s\n",
1456 optarg);
1457 frr_help_exit(1);
a6810074 1458 }
d62a17ae 1459 gs.period = 1000 * period;
1460 } break;
a6810074 1461 case 'p':
0a7c7856 1462 watchfrr_di.pid_file = optarg;
a6810074
DL
1463 break;
1464 case 'r':
a6810074
DL
1465 if (!valid_command(optarg)) {
1466 fprintf(stderr,
1467 "Invalid restart command, must contain '%%s': %s\n",
1468 optarg);
4f04a76b 1469 frr_help_exit(1);
a6810074
DL
1470 }
1471 gs.restart_command = optarg;
a6810074
DL
1472 break;
1473 case 's':
1474 if (!valid_command(optarg)) {
1475 fprintf(stderr,
1476 "Invalid start command, must contain '%%s': %s\n",
1477 optarg);
4f04a76b 1478 frr_help_exit(1);
a6810074
DL
1479 }
1480 gs.start_command = optarg;
1481 break;
1482 case 'S':
1483 gs.vtydir = optarg;
1484 break;
d62a17ae 1485 case 't': {
1486 char garbage[3];
1487 if ((sscanf(optarg, "%ld%1s", &gs.timeout, garbage)
1488 != 1)
1489 || (gs.timeout < 1)) {
1490 fprintf(stderr,
1491 "Invalid timeout argument: %s\n",
1492 optarg);
1493 frr_help_exit(1);
a6810074 1494 }
d62a17ae 1495 } break;
1496 case 'T': {
1497 char garbage[3];
1498 if ((sscanf(optarg, "%ld%1s", &gs.restart_timeout,
1499 garbage)
1500 != 1)
1501 || (gs.restart_timeout < 1)) {
1502 fprintf(stderr,
1503 "Invalid restart timeout argument: %s\n",
1504 optarg);
1505 frr_help_exit(1);
a6810074 1506 }
d62a17ae 1507 } break;
a6810074
DL
1508 default:
1509 fputs("Invalid option.\n", stderr);
4f04a76b 1510 frr_help_exit(1);
a6810074 1511 }
8b886ca7 1512 }
a6810074 1513
71e7975a
DL
1514 if (watch_only
1515 && (gs.start_command || gs.stop_command || gs.restart_command)) {
d87ae5cc 1516 fputs("Options -r/-s/-k are not used when --dry is active.\n",
a6810074 1517 stderr);
8b886ca7 1518 }
f168b713
DL
1519 if (!watch_only
1520 && (!gs.restart_command || !gs.start_command || !gs.stop_command)) {
1521 fprintf(stderr,
1522 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1523 frr_help_exit(1);
8b886ca7 1524 }
8b886ca7 1525
a6810074
DL
1526 if (blankstr) {
1527 if (gs.restart_command)
1528 gs.restart_command =
d62a17ae 1529 translate_blanks(gs.restart_command, blankstr);
a6810074
DL
1530 if (gs.start_command)
1531 gs.start_command =
d62a17ae 1532 translate_blanks(gs.start_command, blankstr);
a6810074
DL
1533 if (gs.stop_command)
1534 gs.stop_command =
d62a17ae 1535 translate_blanks(gs.stop_command, blankstr);
065de903 1536 }
8b886ca7 1537
a6810074 1538 gs.restart.interval = gs.min_restart_interval;
8b886ca7 1539
33606a15
DL
1540 /* env variable for the processes that we start */
1541 if (watchfrr_di.pathspace)
1542 setenv("FRR_PATHSPACE", watchfrr_di.pathspace, 1);
1543 else
1544 unsetenv("FRR_PATHSPACE");
1545
a91f5417
DS
1546 /*
1547 * when watchfrr_di.pathspace is read, if it is not specified
1548 * pathspace is NULL as expected
1549 */
1550 pathspace = watchfrr_di.pathspace;
1551
33606a15
DL
1552 if (netns_en && !netns)
1553 netns = watchfrr_di.pathspace;
a91f5417 1554
33606a15
DL
1555 if (netns_en && netns && netns[0])
1556 netns_setup(netns);
1557
4f04a76b 1558 master = frr_init();
b647dc2a 1559 watchfrr_error_init();
0a7c7856
DL
1560 watchfrr_init(argc, argv);
1561 watchfrr_vty_init();
1562
1563 frr_config_fork();
4f04a76b 1564
0a7c7856 1565 if (watchfrr_di.daemon_mode)
0bdeb5e5 1566 zlog_syslog_set_prio_min(MIN(gs.loglevel, LOG_DEBUG));
0a7c7856 1567 else
0bdeb5e5 1568 zlog_aux_init(NULL, MIN(gs.loglevel, LOG_DEBUG));
8b886ca7 1569
0a7c7856 1570 frr_run(master);
8b886ca7 1571
a6810074
DL
1572 systemd_send_stopping();
1573 /* Not reached. */
1574 return 0;
8b886ca7 1575}