]> git.proxmox.com Git - mirror_frr.git/blame - watchfrr/watchfrr.c
Merge pull request #8144 from LabNConsulting/chopps/ly2
[mirror_frr.git] / watchfrr / watchfrr.c
CommitLineData
8b886ca7 1/*
896014f4
DL
2 * Monitor status of frr daemons and restart if necessary.
3 *
4 * Copyright (C) 2004 Andrew J. Schorr
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
8b886ca7 19 */
20
a365534f 21#include <zebra.h>
8b886ca7 22#include <thread.h>
23#include <log.h>
52e66296 24#include <network.h>
8b886ca7 25#include <sigevent.h>
a365534f 26#include <lib/version.h>
95c4aff2 27#include "command.h"
4f04a76b 28#include "libfrr.h"
b647dc2a 29#include "lib_errors.h"
0bdeb5e5 30#include "zlog_targets.h"
5920b3eb 31#include "network.h"
33606a15 32#include "printfrr.h"
95c4aff2 33
6f594023 34#include <getopt.h>
a365534f 35#include <sys/un.h>
36#include <sys/wait.h>
837d16cc 37#include <memory.h>
651415bd 38#include <systemd.h>
8b886ca7 39
9473e340 40#include "watchfrr.h"
b647dc2a 41#include "watchfrr_errors.h"
95c4aff2 42
8b886ca7 43#ifndef MIN
44#define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
45#endif
46
47/* Macros to help randomize timers. */
5920b3eb 48#define JITTER(X) ((frr_weak_random() % ((X)+1))-((X)/2))
8b886ca7 49#define FUZZY(X) ((X)+JITTER((X)/20))
50
51#define DEFAULT_PERIOD 5
0a64aff6 52#define DEFAULT_TIMEOUT 90
8b886ca7 53#define DEFAULT_RESTART_TIMEOUT 20
54#define DEFAULT_LOGLEVEL LOG_INFO
55#define DEFAULT_MIN_RESTART 60
56#define DEFAULT_MAX_RESTART 600
8b886ca7 57
3ec95567
DL
58#define DEFAULT_RESTART_CMD WATCHFRR_SH_PATH " restart %s"
59#define DEFAULT_START_CMD WATCHFRR_SH_PATH " start %s"
60#define DEFAULT_STOP_CMD WATCHFRR_SH_PATH " stop %s"
61
8b886ca7 62#define PING_TOKEN "PING"
63
bf8d3d6a
DL
64DEFINE_MGROUP(WATCHFRR, "watchfrr");
65DEFINE_MTYPE_STATIC(WATCHFRR, WATCHFRR_DAEMON, "watchfrr daemon entry");
0a7c7856 66
55c72803 67/* Needs to be global, referenced somewhere inside libfrr. */
8b886ca7 68struct thread_master *master;
69
f168b713 70static bool watch_only = false;
8b886ca7 71
a6810074
DL
/*
 * Phases of watchfrr's startup / phased global restart state machine.
 * The enumerator values double as indices into phase_str[].
 */
typedef enum {
	PHASE_NONE = 0,
	PHASE_INIT,
	PHASE_STOPS_PENDING,
	PHASE_WAITING_DOWN,
	PHASE_ZEBRA_RESTART_PENDING,
	PHASE_WAITING_ZEBRA_UP
} restart_phase_t;

/*
 * Log-friendly name of each phase.  Designated initializers tie every string
 * to its enumerator so the table cannot drift out of step with the enum (the
 * former trailing "Start jobs running" entry had no matching phase).
 */
static const char *const phase_str[] = {
	[PHASE_NONE] = "Idle",
	[PHASE_INIT] = "Startup",
	[PHASE_STOPS_PENDING] = "Stop jobs running",
	[PHASE_WAITING_DOWN] = "Waiting for other daemons to come down",
	[PHASE_ZEBRA_RESTART_PENDING] = "Zebra restart job running",
	[PHASE_WAITING_ZEBRA_UP] = "Waiting for zebra to come up",
};
90
91#define PHASE_TIMEOUT (3*gs.restart_timeout)
/* Parenthesized so the product stays intact inside larger expressions. */
#define STARTUP_TIMEOUT (55 * 1000)
8b886ca7 93
a6810074
DL
/* Bookkeeping for one background start/stop/restart shell job. */
struct restart_info {
	/* Name substituted into the command and used in log messages. */
	const char *name;
	/* Kind of job last launched (the cmdtype given to run_job()). */
	const char *what;
	/* Pid of the running job; 0 while no job is outstanding. */
	pid_t pid;
	/* When the job was last started, or when it was last reaped. */
	struct timeval time;
	/* Minimum number of seconds between non-forced jobs. */
	long interval;
	/* Timer that fires restart_kill() while the job is running. */
	struct thread *t_kill;
	/* How many times restart_kill() has signalled the current job. */
	int kills;
};
103
a6810074 104static struct global_state {
a6810074
DL
105 restart_phase_t phase;
106 struct thread *t_phase_hanging;
5c9d1c83 107 struct thread *t_startup_timeout;
a6810074
DL
108 const char *vtydir;
109 long period;
110 long timeout;
111 long restart_timeout;
112 long min_restart_interval;
113 long max_restart_interval;
a6810074
DL
114 struct daemon *daemons;
115 const char *restart_command;
116 const char *start_command;
117 const char *stop_command;
118 struct restart_info restart;
a6810074 119 int loglevel;
d62a17ae 120 struct daemon *special; /* points to zebra when doing phased restart */
a6810074
DL
121 int numdaemons;
122 int numpids;
d62a17ae 123 int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
8b886ca7 124} gs = {
c0e5cb52 125 .phase = PHASE_INIT,
64a249ad 126 .vtydir = frr_vtydir,
d62a17ae 127 .period = 1000 * DEFAULT_PERIOD,
128 .timeout = DEFAULT_TIMEOUT,
129 .restart_timeout = DEFAULT_RESTART_TIMEOUT,
130 .loglevel = DEFAULT_LOGLEVEL,
131 .min_restart_interval = DEFAULT_MIN_RESTART,
132 .max_restart_interval = DEFAULT_MAX_RESTART,
3ec95567
DL
133 .restart_command = DEFAULT_RESTART_CMD,
134 .start_command = DEFAULT_START_CMD,
135 .stop_command = DEFAULT_STOP_CMD,
d62a17ae 136};
a6810074
DL
137
/* Connection state of a monitored daemon; values index state_str[]. */
typedef enum {
	DAEMON_INIT,
	DAEMON_DOWN,
	DAEMON_CONNECTING,
	DAEMON_UP,
	DAEMON_UNRESPONSIVE
} daemon_state_t;

/* True when the daemon is connected, whether or not it answers echoes. */
#define IS_UP(DMN)                                                             \
	((DMN)->state == DAEMON_UP || (DMN)->state == DAEMON_UNRESPONSIVE)

/* Log-friendly name of each daemon_state_t value. */
static const char *const state_str[] = {
	[DAEMON_INIT] = "Init",
	[DAEMON_DOWN] = "Down",
	[DAEMON_CONNECTING] = "Connecting",
	[DAEMON_UP] = "Up",
	[DAEMON_UNRESPONSIVE] = "Unresponsive",
};
152
153struct daemon {
a6810074
DL
154 const char *name;
155 daemon_state_t state;
156 int fd;
157 struct timeval echo_sent;
d7c0a89a 158 unsigned int connect_tries;
a6810074
DL
159 struct thread *t_wakeup;
160 struct thread *t_read;
161 struct thread *t_write;
162 struct daemon *next;
163 struct restart_info restart;
cc53b605
DS
164
165 /*
166 * For a given daemon, if we've turned on ignore timeouts
167 * ignore the timeout value and assume everything is ok
168 * This is for daemon debugging w/ gdb after we have started
169 * FRR and realize we have something that needs to be looked
170 * at
171 */
172 bool ignore_timeout;
8b886ca7 173};
174
9272302b
DL
/* getopt_long() return codes for options that have no short form. */
#define OPTION_MINRESTART 2000
#define OPTION_MAXRESTART 2001
#define OPTION_DRY 2002
#define OPTION_NETNS 2003

/* Long command-line options accepted by watchfrr. */
static const struct option longopts[] = {
	/* run mode and paths */
	{"daemon", no_argument, NULL, 'd'},
	{"statedir", required_argument, NULL, 'S'},

	/* logging and timers */
	{"loglevel", required_argument, NULL, 'l'},
	{"interval", required_argument, NULL, 'i'},
	{"timeout", required_argument, NULL, 't'},
	{"restart-timeout", required_argument, NULL, 'T'},

	/* shell command templates */
	{"restart", required_argument, NULL, 'r'},
	{"start-command", required_argument, NULL, 's'},
	{"kill-command", required_argument, NULL, 'k'},

	{"dry", no_argument, NULL, OPTION_DRY},
	{"min-restart-interval", required_argument, NULL, OPTION_MINRESTART},
	{"max-restart-interval", required_argument, NULL, OPTION_MAXRESTART},
	{"pid-file", required_argument, NULL, 'p'},
	{"blank-string", required_argument, NULL, 'b'},
#ifdef GNU_LINUX
	{"netns", optional_argument, NULL, OPTION_NETNS},
#endif
	{"help", no_argument, NULL, 'h'},
	{"version", no_argument, NULL, 'v'},

	/* terminator required by getopt_long() */
	{NULL, 0, NULL, 0},
};
8b886ca7 201
202static int try_connect(struct daemon *dmn);
203static int wakeup_send_echo(struct thread *t_wakeup);
204static void try_restart(struct daemon *dmn);
205static void phase_check(void);
75f8b0e4 206static void restart_done(struct daemon *dmn);
8b886ca7 207
4f04a76b 208static const char *progname;
cc53b605
DS
209
210void watchfrr_set_ignore_daemon(struct vty *vty, const char *dname, bool ignore)
211{
212 struct daemon *dmn;
213
214 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
215 if (strncmp(dmn->name, dname, strlen(dmn->name)) == 0)
216 break;
217 }
218
219 if (dmn) {
220 dmn->ignore_timeout = ignore;
221 vty_out(vty, "%s switching to %s\n", dmn->name,
222 ignore ? "ignore" : "watch");
223 } else
224 vty_out(vty, "%s is not configured for running at the moment",
225 dname);
226}
227
4f04a76b 228static void printhelp(FILE *target)
8b886ca7 229{
d62a17ae 230 fprintf(target,
231 "Usage : %s [OPTION...] <daemon name> ...\n\n\
9473e340 232Watchdog program to monitor status of frr daemons and try to restart\n\
8b886ca7 233them if they are down or unresponsive. It determines whether a daemon is\n\
234up based on whether it can connect to the daemon's vty unix stream socket.\n\
235It then repeatedly sends echo commands over that socket to determine whether\n\
236the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
237on the socket connection and know immediately that the daemon is down.\n\n\
238The daemons to be monitored should be listed on the command line.\n\n\
8b886ca7 239In order to avoid attempting to restart the daemons in a fast loop,\n\
240the -m and -M options allow you to control the minimum delay between\n\
241restart commands. The minimum restart delay is recalculated each time\n\
242a restart is attempted: if the time since the last restart attempt exceeds\n\
243twice the -M value, then the restart delay is set to the -m value.\n\
d62a17ae 244Otherwise, the interval is doubled (but capped at the -M value).\n\n",
f168b713 245 progname);
e757c940 246
d62a17ae 247 fprintf(target,
248 "Options:\n\
8b886ca7 249-d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
250 to syslog instead of stdout.\n\
251-S, --statedir Set the vty socket directory (default is %s)\n\
33606a15
DL
252-N, --pathspace Insert prefix into config & socket paths\n"
253#ifdef GNU_LINUX
254" --netns Create and/or use Linux network namespace. If no name is\n"
255" given, uses the value from `-N`.\n"
256#endif
257"-l, --loglevel Set the logging level (default is %d).\n\
8b886ca7 258 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
259 but it can be set higher than %d if extra-verbose debugging\n\
260 messages are desired.\n\
9272302b 261 --min-restart-interval\n\
8b886ca7 262 Set the minimum seconds to wait between invocations of daemon\n\
263 restart commands (default is %d).\n\
9272302b 264 --max-restart-interval\n\
8b886ca7 265 Set the maximum seconds to wait between invocations of daemon\n\
266 restart commands (default is %d).\n\
267-i, --interval Set the status polling interval in seconds (default is %d)\n\
268-t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
269-T, --restart-timeout\n\
270 Set the restart (kill) timeout in seconds (default is %d).\n\
271 If any background jobs are still running after this much\n\
272 time has elapsed, they will be killed.\n\
273-r, --restart Supply a Bourne shell command to use to restart a single\n\
274 daemon. The command string should include '%%s' where the\n\
275 name of the daemon should be substituted.\n\
3ec95567 276 (default: '%s')\n\
8b886ca7 277-s, --start-command\n\
278 Supply a Bourne shell to command to use to start a single\n\
279 daemon. The command string should include '%%s' where the\n\
280 name of the daemon should be substituted.\n\
3ec95567 281 (default: '%s')\n\
8b886ca7 282-k, --kill-command\n\
283 Supply a Bourne shell to command to use to stop a single\n\
284 daemon. The command string should include '%%s' where the\n\
285 name of the daemon should be substituted.\n\
3ec95567 286 (default: '%s')\n\
f168b713 287 --dry Do not start or restart anything, just log.\n\
8b886ca7 288-p, --pid-file Set process identifier file name\n\
0a7c7856 289 (default is %s/watchfrr.pid).\n\
c8b40f86 290-b, --blank-string\n\
291 When the supplied argument string is found in any of the\n\
f168b713 292 various shell command arguments (-r, -s, or -k), replace\n\
c8b40f86 293 it with a space. This is an ugly hack to circumvent problems\n\
294 passing command-line arguments with embedded spaces.\n\
8b886ca7 295-v, --version Print program version\n\
d62a17ae 296-h, --help Display this help and exit\n",
64a249ad 297 frr_vtydir, DEFAULT_LOGLEVEL, LOG_EMERG, LOG_DEBUG, LOG_DEBUG,
d62a17ae 298 DEFAULT_MIN_RESTART, DEFAULT_MAX_RESTART, DEFAULT_PERIOD,
3ec95567
DL
299 DEFAULT_TIMEOUT, DEFAULT_RESTART_TIMEOUT,
300 DEFAULT_RESTART_CMD, DEFAULT_START_CMD, DEFAULT_STOP_CMD,
301 frr_vtydir);
8b886ca7 302}
303
a6810074 304static pid_t run_background(char *shell_cmd)
8b886ca7 305{
a6810074
DL
306 pid_t child;
307
308 switch (child = fork()) {
309 case -1:
450971aa 310 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
311 "fork failed, cannot run command [%s]: %s",
312 shell_cmd, safe_strerror(errno));
a6810074
DL
313 return -1;
314 case 0:
315 /* Child process. */
d62a17ae 316 /* Use separate process group so child processes can be killed
317 * easily. */
a6810074 318 if (setpgid(0, 0) < 0)
957cfa24 319 zlog_warn("setpgid(0,0) failed: %s",
a6810074
DL
320 safe_strerror(errno));
321 {
322 char shell[] = "sh";
323 char dashc[] = "-c";
d62a17ae 324 char *const argv[4] = {shell, dashc, shell_cmd, NULL};
a6810074 325 execv("/bin/sh", argv);
450971aa 326 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
327 "execv(/bin/sh -c '%s') failed: %s",
328 shell_cmd, safe_strerror(errno));
a6810074
DL
329 _exit(127);
330 }
331 default:
332 /* Parent process: we will reap the child later. */
c3f65458
QY
333 zlog_info("Forked background command [pid %d]: %s", (int)child,
334 shell_cmd);
a6810074
DL
335 return child;
336 }
8b886ca7 337}
338
a6810074
DL
339static struct timeval *time_elapsed(struct timeval *result,
340 const struct timeval *start_time)
8b886ca7 341{
a6810074
DL
342 gettimeofday(result, NULL);
343 result->tv_sec -= start_time->tv_sec;
344 result->tv_usec -= start_time->tv_usec;
345 while (result->tv_usec < 0) {
346 result->tv_usec += 1000000L;
347 result->tv_sec--;
348 }
349 return result;
8b886ca7 350}
351
a6810074 352static int restart_kill(struct thread *t_kill)
8b886ca7 353{
a6810074
DL
354 struct restart_info *restart = THREAD_ARG(t_kill);
355 struct timeval delay;
356
357 time_elapsed(&delay, &restart->time);
d62a17ae 358 zlog_warn(
957cfa24 359 "%s %s child process %d still running after %ld seconds, sending signal %d",
d62a17ae 360 restart->what, restart->name, (int)restart->pid,
361 (long)delay.tv_sec, (restart->kills ? SIGKILL : SIGTERM));
a6810074
DL
362 kill(-restart->pid, (restart->kills ? SIGKILL : SIGTERM));
363 restart->kills++;
66e78ae6
QY
364 restart->t_kill = NULL;
365 thread_add_timer(master, restart_kill, restart, gs.restart_timeout,
366 &restart->t_kill);
a6810074 367 return 0;
8b886ca7 368}
369
a6810074 370static struct restart_info *find_child(pid_t child)
8b886ca7 371{
f168b713 372 struct daemon *dmn;
7c265f7d
CF
373 if (gs.restart.pid == child)
374 return &gs.restart;
375
f168b713
DL
376 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
377 if (dmn->restart.pid == child)
378 return &dmn->restart;
a6810074
DL
379 }
380 return NULL;
8b886ca7 381}
382
a6810074 383static void sigchild(void)
8b886ca7 384{
a6810074
DL
385 pid_t child;
386 int status;
387 const char *name;
388 const char *what;
389 struct restart_info *restart;
75f8b0e4 390 struct daemon *dmn;
a6810074
DL
391
392 switch (child = waitpid(-1, &status, WNOHANG)) {
393 case -1:
450971aa 394 flog_err_sys(EC_LIB_SYSTEM_CALL, "waitpid failed: %s",
09c866e3 395 safe_strerror(errno));
a6810074
DL
396 return;
397 case 0:
398 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
399 return;
400 }
401
402 if (child == integrated_write_pid) {
403 integrated_write_sigchld(status);
404 return;
405 }
406
407 if ((restart = find_child(child)) != NULL) {
408 name = restart->name;
409 what = restart->what;
410 restart->pid = 0;
411 gs.numpids--;
b3d6bc6e
MS
412 thread_cancel(&restart->t_kill);
413
d62a17ae 414 /* Update restart time to reflect the time the command
415 * completed. */
a6810074
DL
416 gettimeofday(&restart->time, NULL);
417 } else {
09c866e3 418 flog_err_sys(
450971aa 419 EC_LIB_SYSTEM_CALL,
09c866e3
QY
420 "waitpid returned status for an unknown child process %d",
421 (int)child);
a6810074
DL
422 name = "(unknown)";
423 what = "background";
424 }
425 if (WIFSTOPPED(status))
957cfa24 426 zlog_warn("%s %s process %d is stopped", what, name,
d62a17ae 427 (int)child);
a6810074 428 else if (WIFSIGNALED(status))
d62a17ae 429 zlog_warn("%s %s process %d terminated due to signal %d", what,
430 name, (int)child, WTERMSIG(status));
a6810074
DL
431 else if (WIFEXITED(status)) {
432 if (WEXITSTATUS(status) != 0)
d62a17ae 433 zlog_warn(
434 "%s %s process %d exited with non-zero status %d",
435 what, name, (int)child, WEXITSTATUS(status));
75f8b0e4 436 else {
a6810074
DL
437 zlog_debug("%s %s process %d exited normally", what,
438 name, (int)child);
75f8b0e4
DL
439
440 if (restart && restart != &gs.restart) {
441 dmn = container_of(restart, struct daemon,
442 restart);
443 restart_done(dmn);
444 } else if (restart)
445 for (dmn = gs.daemons; dmn; dmn = dmn->next)
446 restart_done(dmn);
447 }
a6810074 448 } else
09c866e3 449 flog_err_sys(
450971aa 450 EC_LIB_SYSTEM_CALL,
09c866e3
QY
451 "cannot interpret %s %s process %d wait status 0x%x",
452 what, name, (int)child, status);
a6810074 453 phase_check();
8b886ca7 454}
455
d62a17ae 456static int run_job(struct restart_info *restart, const char *cmdtype,
457 const char *command, int force, int update_interval)
8b886ca7 458{
a6810074
DL
459 struct timeval delay;
460
461 if (gs.loglevel > LOG_DEBUG + 1)
462 zlog_debug("attempting to %s %s", cmdtype, restart->name);
463
464 if (restart->pid) {
465 if (gs.loglevel > LOG_DEBUG + 1)
d62a17ae 466 zlog_debug(
467 "cannot %s %s, previous pid %d still running",
468 cmdtype, restart->name, (int)restart->pid);
a6810074
DL
469 return -1;
470 }
471
b3ee8bcc
DS
472#if defined HAVE_SYSTEMD
473 char buffer[512];
474
475 snprintf(buffer, sizeof(buffer), "restarting %s", restart->name);
476 systemd_send_status(buffer);
477#endif
478
d62a17ae 479 /* Note: time_elapsed test must come before the force test, since we
480 need
a6810074
DL
481 to make sure that delay is initialized for use below in updating the
482 restart interval. */
483 if ((time_elapsed(&delay, &restart->time)->tv_sec < restart->interval)
484 && !force) {
b3ee8bcc 485
a6810074 486 if (gs.loglevel > LOG_DEBUG + 1)
d62a17ae 487 zlog_debug(
3efd0893 488 "postponing %s %s: elapsed time %ld < retry interval %ld",
d62a17ae 489 cmdtype, restart->name, (long)delay.tv_sec,
490 restart->interval);
a6810074
DL
491 return -1;
492 }
493
494 gettimeofday(&restart->time, NULL);
495 restart->kills = 0;
496 {
497 char cmd[strlen(command) + strlen(restart->name) + 1];
498 snprintf(cmd, sizeof(cmd), command, restart->name);
499 if ((restart->pid = run_background(cmd)) > 0) {
66e78ae6 500 restart->t_kill = NULL;
d62a17ae 501 thread_add_timer(master, restart_kill, restart,
502 gs.restart_timeout, &restart->t_kill);
a6810074
DL
503 restart->what = cmdtype;
504 gs.numpids++;
505 } else
506 restart->pid = 0;
507 }
508
b3ee8bcc
DS
509#if defined HAVE_SYSTEMD
510 systemd_send_status("FRR Operational");
511#endif
a6810074
DL
512 /* Calculate the new restart interval. */
513 if (update_interval) {
514 if (delay.tv_sec > 2 * gs.max_restart_interval)
515 restart->interval = gs.min_restart_interval;
516 else if ((restart->interval *= 2) > gs.max_restart_interval)
517 restart->interval = gs.max_restart_interval;
518 if (gs.loglevel > LOG_DEBUG + 1)
519 zlog_debug("restart %s interval is now %ld",
520 restart->name, restart->interval);
521 }
522 return restart->pid;
8b886ca7 523}
524
/*
 * Helpers that (re)arm one of a daemon's events.  Each clears the stored
 * thread pointer first and then schedules the event.
 *
 * The do { } while (0) wrappers deliberately carry no trailing semicolon: the
 * caller supplies it, which keeps "if (x) MACRO(d); else ..." well-formed.
 */

/* Schedule handle_read() for when the daemon's socket becomes readable. */
#define SET_READ_HANDLER(DMN)                                                  \
	do {                                                                   \
		(DMN)->t_read = NULL;                                          \
		thread_add_read(master, handle_read, (DMN), (DMN)->fd,         \
				&(DMN)->t_read);                               \
	} while (0)

/* Schedule wakeup_down() after one fuzzed polling period. */
#define SET_WAKEUP_DOWN(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_down, (DMN),              \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Schedule wakeup_unresponsive() after one fuzzed polling period. */
#define SET_WAKEUP_UNRESPONSIVE(DMN)                                           \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_unresponsive, (DMN),      \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Schedule wakeup_send_echo() after one fuzzed polling period. */
#define SET_WAKEUP_ECHO(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_send_echo, (DMN),         \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)
8b886ca7 552
a6810074 553static int wakeup_down(struct thread *t_wakeup)
8b886ca7 554{
a6810074
DL
555 struct daemon *dmn = THREAD_ARG(t_wakeup);
556
557 dmn->t_wakeup = NULL;
558 if (try_connect(dmn) < 0)
559 SET_WAKEUP_DOWN(dmn);
560 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
561 try_restart(dmn);
562 return 0;
8b886ca7 563}
564
a6810074 565static int wakeup_init(struct thread *t_wakeup)
8b886ca7 566{
a6810074
DL
567 struct daemon *dmn = THREAD_ARG(t_wakeup);
568
569 dmn->t_wakeup = NULL;
570 if (try_connect(dmn) < 0) {
c3f65458
QY
571 zlog_info(
572 "%s state -> down : initial connection attempt failed",
573 dmn->name);
a6810074
DL
574 dmn->state = DAEMON_DOWN;
575 }
c0e5cb52 576 phase_check();
a6810074 577 return 0;
8b886ca7 578}
579
75f8b0e4
DL
580static void restart_done(struct daemon *dmn)
581{
582 if (dmn->state != DAEMON_DOWN) {
3f391bec
DS
583 zlog_warn(
584 "Daemon: %s: is in %s state but expected it to be in DAEMON_DOWN state",
585 dmn->name, state_str[dmn->state]);
75f8b0e4
DL
586 return;
587 }
28ef0ee1 588 THREAD_OFF(dmn->t_wakeup);
50478845 589
75f8b0e4
DL
590 if (try_connect(dmn) < 0)
591 SET_WAKEUP_DOWN(dmn);
592}
593
a6810074 594static void daemon_down(struct daemon *dmn, const char *why)
8b886ca7 595{
a6810074 596 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
1c50c1c0
QY
597 flog_err(EC_WATCHFRR_CONNECTION, "%s state -> down : %s",
598 dmn->name, why);
a6810074
DL
599 else if (gs.loglevel > LOG_DEBUG)
600 zlog_debug("%s still down : %s", dmn->name, why);
601 if (IS_UP(dmn))
602 gs.numdown++;
603 dmn->state = DAEMON_DOWN;
604 if (dmn->fd >= 0) {
605 close(dmn->fd);
606 dmn->fd = -1;
607 }
608 THREAD_OFF(dmn->t_read);
609 THREAD_OFF(dmn->t_write);
610 THREAD_OFF(dmn->t_wakeup);
611 if (try_connect(dmn) < 0)
612 SET_WAKEUP_DOWN(dmn);
613 phase_check();
8b886ca7 614}
615
a6810074 616static int handle_read(struct thread *t_read)
8b886ca7 617{
a6810074
DL
618 struct daemon *dmn = THREAD_ARG(t_read);
619 static const char resp[sizeof(PING_TOKEN) + 4] = PING_TOKEN "\n";
620 char buf[sizeof(resp) + 100];
621 ssize_t rc;
622 struct timeval delay;
623
624 dmn->t_read = NULL;
625 if ((rc = read(dmn->fd, buf, sizeof(buf))) < 0) {
626 char why[100];
627
628 if (ERRNO_IO_RETRY(errno)) {
629 /* Pretend it never happened. */
630 SET_READ_HANDLER(dmn);
631 return 0;
632 }
633 snprintf(why, sizeof(why), "unexpected read error: %s",
634 safe_strerror(errno));
635 daemon_down(dmn, why);
636 return 0;
8b886ca7 637 }
a6810074
DL
638 if (rc == 0) {
639 daemon_down(dmn, "read returned EOF");
640 return 0;
641 }
642 if (!dmn->echo_sent.tv_sec) {
643 char why[sizeof(buf) + 100];
644 snprintf(why, sizeof(why),
645 "unexpected read returns %d bytes: %.*s", (int)rc,
646 (int)rc, buf);
647 daemon_down(dmn, why);
648 return 0;
8b886ca7 649 }
a6810074
DL
650
651 /* We are expecting an echo response: is there any chance that the
652 response would not be returned entirely in the first read? That
653 seems inconceivable... */
654 if ((rc != sizeof(resp)) || memcmp(buf, resp, sizeof(resp))) {
655 char why[100 + sizeof(buf)];
656 snprintf(why, sizeof(why),
3efd0893 657 "read returned bad echo response of %d bytes (expecting %u): %.*s",
d7c0a89a 658 (int)rc, (unsigned int)sizeof(resp), (int)rc, buf);
a6810074
DL
659 daemon_down(dmn, why);
660 return 0;
661 }
662
663 time_elapsed(&delay, &dmn->echo_sent);
664 dmn->echo_sent.tv_sec = 0;
665 if (dmn->state == DAEMON_UNRESPONSIVE) {
666 if (delay.tv_sec < gs.timeout) {
667 dmn->state = DAEMON_UP;
d62a17ae 668 zlog_warn(
3efd0893 669 "%s state -> up : echo response received after %ld.%06ld seconds",
d62a17ae 670 dmn->name, (long)delay.tv_sec,
671 (long)delay.tv_usec);
a6810074 672 } else
d62a17ae 673 zlog_warn(
3efd0893 674 "%s: slow echo response finally received after %ld.%06ld seconds",
d62a17ae 675 dmn->name, (long)delay.tv_sec,
676 (long)delay.tv_usec);
a6810074
DL
677 } else if (gs.loglevel > LOG_DEBUG + 1)
678 zlog_debug("%s: echo response received after %ld.%06ld seconds",
679 dmn->name, (long)delay.tv_sec, (long)delay.tv_usec);
680
681 SET_READ_HANDLER(dmn);
b3d6bc6e 682 thread_cancel(&dmn->t_wakeup);
a6810074
DL
683 SET_WAKEUP_ECHO(dmn);
684
685 return 0;
8b886ca7 686}
687
207e0d7a
DS
688/*
689 * Wait till we notice that all daemons are ready before
690 * we send we are ready to systemd
691 */
5c9d1c83 692static void daemon_send_ready(int exitcode)
207e0d7a 693{
5c9d1c83 694 FILE *fp;
a6810074 695 static int sent = 0;
43e587c1 696 char started[1024];
207e0d7a 697
5c9d1c83
DL
698 if (sent)
699 return;
700
701 if (exitcode == 0)
0a7c7856 702 zlog_notice("all daemons up, doing startup-complete notify");
5c9d1c83
DL
703 else if (gs.numdown < gs.numdaemons)
704 flog_err(EC_WATCHFRR_CONNECTION,
3efd0893 705 "startup did not complete within timeout (%d/%d daemons running)",
5c9d1c83
DL
706 gs.numdaemons - gs.numdown, gs.numdaemons);
707 else {
708 flog_err(EC_WATCHFRR_CONNECTION,
3efd0893 709 "all configured daemons failed to start -- exiting watchfrr");
5c9d1c83
DL
710 exit(exitcode);
711
712 }
0a7c7856 713
5c9d1c83
DL
714 frr_detach();
715
33606a15 716 snprintf(started, sizeof(started), "%s/%s", frr_vtydir,
3c649c71
DS
717 "watchfrr.started");
718 fp = fopen(started, "w");
5c9d1c83
DL
719 if (fp)
720 fclose(fp);
60bd2534 721#if defined HAVE_SYSTEMD
5c9d1c83 722 systemd_send_started(master, 0);
b3ee8bcc 723 systemd_send_status("FRR Operational");
60bd2534 724#endif
5c9d1c83 725 sent = 1;
207e0d7a
DS
726}
727
a6810074 728static void daemon_up(struct daemon *dmn, const char *why)
8b886ca7 729{
a6810074
DL
730 dmn->state = DAEMON_UP;
731 gs.numdown--;
732 dmn->connect_tries = 0;
733 zlog_notice("%s state -> up : %s", dmn->name, why);
5c9d1c83
DL
734 if (gs.numdown == 0)
735 daemon_send_ready(0);
a8cbb8b3 736 SET_WAKEUP_ECHO(dmn);
a6810074 737 phase_check();
8b886ca7 738}
739
a6810074 740static int check_connect(struct thread *t_write)
8b886ca7 741{
a6810074
DL
742 struct daemon *dmn = THREAD_ARG(t_write);
743 int sockerr;
744 socklen_t reslen = sizeof(sockerr);
745
746 dmn->t_write = NULL;
747 if (getsockopt(dmn->fd, SOL_SOCKET, SO_ERROR, (char *)&sockerr, &reslen)
748 < 0) {
749 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn->name,
750 safe_strerror(errno));
751 daemon_down(dmn,
752 "getsockopt failed checking connection success");
753 return 0;
754 }
755 if ((reslen == sizeof(sockerr)) && sockerr) {
756 char why[100];
d62a17ae 757 snprintf(
758 why, sizeof(why),
759 "getsockopt reports that connection attempt failed: %s",
760 safe_strerror(sockerr));
a6810074
DL
761 daemon_down(dmn, why);
762 return 0;
763 }
764
765 daemon_up(dmn, "delayed connect succeeded");
766 return 0;
8b886ca7 767}
768
a6810074 769static int wakeup_connect_hanging(struct thread *t_wakeup)
8b886ca7 770{
a6810074
DL
771 struct daemon *dmn = THREAD_ARG(t_wakeup);
772 char why[100];
773
774 dmn->t_wakeup = NULL;
775 snprintf(why, sizeof(why),
776 "connection attempt timed out after %ld seconds", gs.timeout);
777 daemon_down(dmn, why);
778 return 0;
8b886ca7 779}
780
781/* Making connection to protocol daemon. */
a6810074 782static int try_connect(struct daemon *dmn)
8b886ca7 783{
a6810074
DL
784 int sock;
785 struct sockaddr_un addr;
786 socklen_t len;
787
788 if (gs.loglevel > LOG_DEBUG + 1)
789 zlog_debug("%s: attempting to connect", dmn->name);
790 dmn->connect_tries++;
791
792 memset(&addr, 0, sizeof(struct sockaddr_un));
793 addr.sun_family = AF_UNIX;
d62a17ae 794 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty", gs.vtydir,
795 dmn->name);
6f0e3f6e 796#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
a6810074 797 len = addr.sun_len = SUN_LEN(&addr);
8b886ca7 798#else
a6810074 799 len = sizeof(addr.sun_family) + strlen(addr.sun_path);
d62a17ae 800#endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
a6810074
DL
801
802 /* Quick check to see if we might succeed before we go to the trouble
803 of creating a socket. */
804 if (access(addr.sun_path, W_OK) < 0) {
805 if (errno != ENOENT)
450971aa 806 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
807 "%s: access to socket %s denied: %s",
808 dmn->name, addr.sun_path,
809 safe_strerror(errno));
a6810074
DL
810 return -1;
811 }
812
813 if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
450971aa 814 flog_err_sys(EC_LIB_SOCKET, "%s(%s): cannot make socket: %s",
09c866e3 815 __func__, addr.sun_path, safe_strerror(errno));
a6810074
DL
816 return -1;
817 }
818
819 if (set_nonblocking(sock) < 0 || set_cloexec(sock) < 0) {
450971aa 820 flog_err_sys(EC_LIB_SYSTEM_CALL,
09c866e3
QY
821 "%s(%s): set_nonblocking/cloexec(%d) failed",
822 __func__, addr.sun_path, sock);
a6810074
DL
823 close(sock);
824 return -1;
8b886ca7 825 }
a6810074
DL
826
827 if (connect(sock, (struct sockaddr *)&addr, len) < 0) {
828 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK)) {
829 if (gs.loglevel > LOG_DEBUG)
830 zlog_debug("%s(%s): connect failed: %s",
831 __func__, addr.sun_path,
832 safe_strerror(errno));
833 close(sock);
834 return -1;
835 }
836 if (gs.loglevel > LOG_DEBUG)
837 zlog_debug("%s: connection in progress", dmn->name);
838 dmn->state = DAEMON_CONNECTING;
839 dmn->fd = sock;
66e78ae6
QY
840 dmn->t_write = NULL;
841 thread_add_write(master, check_connect, dmn, dmn->fd,
d62a17ae 842 &dmn->t_write);
843 dmn->t_wakeup = NULL;
844 thread_add_timer(master, wakeup_connect_hanging, dmn,
845 gs.timeout, &dmn->t_wakeup);
a6810074
DL
846 SET_READ_HANDLER(dmn);
847 return 0;
848 }
849
850 dmn->fd = sock;
851 SET_READ_HANDLER(dmn);
852 daemon_up(dmn, "connect succeeded");
853 return 1;
8b886ca7 854}
855
a6810074 856static int phase_hanging(struct thread *t_hanging)
8b886ca7 857{
a6810074 858 gs.t_phase_hanging = NULL;
f74ae2bb 859 flog_err(EC_WATCHFRR_CONNECTION,
1c50c1c0
QY
860 "Phase [%s] hanging for %ld seconds, aborting phased restart",
861 phase_str[gs.phase], PHASE_TIMEOUT);
a6810074
DL
862 gs.phase = PHASE_NONE;
863 return 0;
8b886ca7 864}
865
a6810074 866static void set_phase(restart_phase_t new_phase)
8b886ca7 867{
a6810074 868 gs.phase = new_phase;
b3d6bc6e
MS
869 thread_cancel(&gs.t_phase_hanging);
870
66e78ae6
QY
871 thread_add_timer(master, phase_hanging, NULL, PHASE_TIMEOUT,
872 &gs.t_phase_hanging);
8b886ca7 873}
874
a6810074 875static void phase_check(void)
8b886ca7 876{
c0e5cb52
DL
877 struct daemon *dmn;
878
a6810074
DL
879 switch (gs.phase) {
880 case PHASE_NONE:
881 break;
c0e5cb52
DL
882
883 case PHASE_INIT:
884 for (dmn = gs.daemons; dmn; dmn = dmn->next)
885 if (dmn->state == DAEMON_INIT)
886 return;
887
888 /* startup complete, everything out of INIT */
889 gs.phase = PHASE_NONE;
890 for (dmn = gs.daemons; dmn; dmn = dmn->next)
891 if (dmn->state == DAEMON_DOWN) {
892 SET_WAKEUP_DOWN(dmn);
893 try_restart(dmn);
894 }
895 break;
a6810074
DL
896 case PHASE_STOPS_PENDING:
897 if (gs.numpids)
898 break;
d62a17ae 899 zlog_info(
900 "Phased restart: all routing daemon stop jobs have completed.");
a6810074
DL
901 set_phase(PHASE_WAITING_DOWN);
902
d62a17ae 903 /*FALLTHRU*/
a6810074
DL
904 case PHASE_WAITING_DOWN:
905 if (gs.numdown + IS_UP(gs.special) < gs.numdaemons)
906 break;
907 zlog_info("Phased restart: all routing daemons now down.");
908 run_job(&gs.special->restart, "restart", gs.restart_command, 1,
909 1);
910 set_phase(PHASE_ZEBRA_RESTART_PENDING);
911
d62a17ae 912 /*FALLTHRU*/
a6810074
DL
913 case PHASE_ZEBRA_RESTART_PENDING:
914 if (gs.special->restart.pid)
915 break;
916 zlog_info("Phased restart: %s restart job completed.",
917 gs.special->name);
918 set_phase(PHASE_WAITING_ZEBRA_UP);
919
d62a17ae 920 /*FALLTHRU*/
a6810074
DL
921 case PHASE_WAITING_ZEBRA_UP:
922 if (!IS_UP(gs.special))
923 break;
924 zlog_info("Phased restart: %s is now up.", gs.special->name);
925 {
926 struct daemon *dmn;
927 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
928 if (dmn != gs.special)
929 run_job(&dmn->restart, "start",
930 gs.start_command, 1, 0);
931 }
932 }
933 gs.phase = PHASE_NONE;
934 THREAD_OFF(gs.t_phase_hanging);
935 zlog_notice("Phased global restart has completed.");
936 break;
937 }
8b886ca7 938}
939
a6810074 940static void try_restart(struct daemon *dmn)
8b886ca7 941{
f168b713 942 if (watch_only)
a6810074 943 return;
a6810074 944
f168b713
DL
945 if (dmn != gs.special) {
946 if ((gs.special->state == DAEMON_UP)
947 && (gs.phase == PHASE_NONE))
948 run_job(&dmn->restart, "restart", gs.restart_command, 0,
949 1);
950 else
951 zlog_debug(
3efd0893 952 "%s: postponing restart attempt because master %s daemon not up [%s], or phased restart in progress",
f168b713
DL
953 dmn->name, gs.special->name,
954 state_str[gs.special->state]);
955 return;
956 }
957
958 if ((gs.phase != PHASE_NONE) || gs.numpids) {
959 if (gs.loglevel > LOG_DEBUG + 1)
960 zlog_debug(
3efd0893 961 "postponing phased global restart: restart already in progress [%s], or outstanding child processes [%d]",
f168b713
DL
962 phase_str[gs.phase], gs.numpids);
963 return;
964 }
965 /* Is it too soon for a restart? */
966 {
967 struct timeval delay;
968 if (time_elapsed(&delay, &gs.special->restart.time)->tv_sec
969 < gs.special->restart.interval) {
a6810074 970 if (gs.loglevel > LOG_DEBUG + 1)
d62a17ae 971 zlog_debug(
3efd0893 972 "postponing phased global restart: elapsed time %ld < retry interval %ld",
f168b713
DL
973 (long)delay.tv_sec,
974 gs.special->restart.interval);
975 return;
a6810074 976 }
8b886ca7 977 }
f168b713 978 run_job(&gs.restart, "restart", gs.restart_command, 0, 1);
8b886ca7 979}
980
a6810074 981static int wakeup_unresponsive(struct thread *t_wakeup)
8b886ca7 982{
a6810074
DL
983 struct daemon *dmn = THREAD_ARG(t_wakeup);
984
985 dmn->t_wakeup = NULL;
986 if (dmn->state != DAEMON_UNRESPONSIVE)
f74ae2bb 987 flog_err(EC_WATCHFRR_CONNECTION,
3efd0893 988 "%s: no longer unresponsive (now %s), wakeup should have been cancelled!",
1c50c1c0 989 dmn->name, state_str[dmn->state]);
a6810074
DL
990 else {
991 SET_WAKEUP_UNRESPONSIVE(dmn);
992 try_restart(dmn);
993 }
994 return 0;
8b886ca7 995}
996
a6810074 997static int wakeup_no_answer(struct thread *t_wakeup)
8b886ca7 998{
a6810074
DL
999 struct daemon *dmn = THREAD_ARG(t_wakeup);
1000
1001 dmn->t_wakeup = NULL;
1002 dmn->state = DAEMON_UNRESPONSIVE;
cc53b605
DS
1003 if (dmn->ignore_timeout)
1004 return 0;
f74ae2bb 1005 flog_err(EC_WATCHFRR_CONNECTION,
3efd0893 1006 "%s state -> unresponsive : no response yet to ping sent %ld seconds ago",
1c50c1c0 1007 dmn->name, gs.timeout);
71e7975a
DL
1008 SET_WAKEUP_UNRESPONSIVE(dmn);
1009 try_restart(dmn);
a6810074 1010 return 0;
8b886ca7 1011}
1012
a6810074 1013static int wakeup_send_echo(struct thread *t_wakeup)
8b886ca7 1014{
a6810074
DL
1015 static const char echocmd[] = "echo " PING_TOKEN;
1016 ssize_t rc;
1017 struct daemon *dmn = THREAD_ARG(t_wakeup);
1018
1019 dmn->t_wakeup = NULL;
d62a17ae 1020 if (((rc = write(dmn->fd, echocmd, sizeof(echocmd))) < 0)
1021 || ((size_t)rc != sizeof(echocmd))) {
a6810074
DL
1022 char why[100 + sizeof(echocmd)];
1023 snprintf(why, sizeof(why),
1024 "write '%s' returned %d instead of %u", echocmd,
d7c0a89a 1025 (int)rc, (unsigned int)sizeof(echocmd));
a6810074
DL
1026 daemon_down(dmn, why);
1027 } else {
1028 gettimeofday(&dmn->echo_sent, NULL);
66e78ae6
QY
1029 dmn->t_wakeup = NULL;
1030 thread_add_timer(master, wakeup_no_answer, dmn, gs.timeout,
1031 &dmn->t_wakeup);
a6810074
DL
1032 }
1033 return 0;
8b886ca7 1034}
1035
470bc619
QY
1036bool check_all_up(void)
1037{
1038 struct daemon *dmn;
1039
1040 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1041 if (dmn->state != DAEMON_UP)
1042 return false;
1043 return true;
1044}
1045
af568444
DL
1046void watchfrr_status(struct vty *vty)
1047{
1048 struct daemon *dmn;
1049 struct timeval delay;
1050
1051 vty_out(vty, "watchfrr global phase: %s\n", phase_str[gs.phase]);
1052 if (gs.restart.pid)
1053 vty_out(vty, " global restart running, pid %ld\n",
1054 (long)gs.restart.pid);
1055
1056 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
cc53b605
DS
1057 vty_out(vty, " %-20s %s%s", dmn->name, state_str[dmn->state],
1058 dmn->ignore_timeout ? "/Ignoring Timeout\n" : "\n");
af568444
DL
1059 if (dmn->restart.pid)
1060 vty_out(vty, " restart running, pid %ld\n",
1061 (long)dmn->restart.pid);
1062 else if (dmn->state == DAEMON_DOWN &&
1063 time_elapsed(&delay, &dmn->restart.time)->tv_sec
1064 < dmn->restart.interval)
3efd0893 1065 vty_out(vty, " restarting in %jd seconds (%jds backoff interval)\n",
051a0be4
DL
1066 (intmax_t)dmn->restart.interval
1067 - (intmax_t)delay.tv_sec,
1068 (intmax_t)dmn->restart.interval);
af568444
DL
1069 }
1070}
1071
/*
 * Termination handler (registered for SIGINT and SIGTERM in
 * watchfrr_signals): log, send the systemd "stopping" notification and
 * exit with status 0.
 */
static void sigint(void)
{
	zlog_notice("Terminating on signal");
	systemd_send_stopping();
	exit(0);
}
1078
/*
 * Check that a command template contains exactly one '%' character and that
 * it is immediately followed by 's'.
 *
 * Returns 1 if the template is valid, 0 otherwise (including cmd == NULL).
 */
static int valid_command(const char *cmd)
{
	const char *pct;

	if (cmd == NULL)
		return 0;

	pct = strchr(cmd, '%');
	if (pct == NULL)
		return 0;
	if (pct[1] != 's')
		return 0;

	/* Any further '%' after the first one makes the template invalid. */
	return strchr(pct + 1, '%') == NULL;
}
1089
/* This is an ugly hack to circumvent problems with passing command-line
   arguments that contain spaces.  The fix is to use a configuration file.

   Returns a newly allocated copy of cmd (obtained from strdup; the caller
   owns it) in which every occurrence of blankstr has been replaced by a
   single space.  Exits the process if the copy cannot be allocated.

   The scan resumes just after each replaced position, so a blankstr that
   itself contains a space cannot re-match its own replacement, and an empty
   blankstr (which would match everywhere) leaves the copy unchanged. */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	char *res;
	char *p;
	size_t bslen = strlen(blankstr);

	res = strdup(cmd);
	if (!res) {
		perror("strdup");
		exit(1);
	}

	/* strstr() matches an empty needle at every position: nothing to do. */
	if (bslen == 0)
		return res;

	p = res;
	while ((p = strstr(p, blankstr)) != NULL) {
		*p = ' ';
		/* Close the gap left by a multi-character token (NUL included). */
		if (bslen != 1)
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
		/* Continue after the inserted blank, never before it. */
		p++;
	}
	return res;
}
5c9d1c83
DL
/*
 * Timer callback scheduled from watchfrr_init() with STARTUP_TIMEOUT:
 * calls daemon_send_ready(1).  The thread argument is not used.
 * Always returns 0.
 */
static int startup_timeout(struct thread *t_wakeup)
{
	daemon_send_ready(1);

	return 0;
}
1115
33606a15
DL
1116#ifdef GNU_LINUX
1117
1118#include <sys/mount.h>
1119#include <sched.h>
1120
1121#define NETNS_RUN_DIR "/var/run/netns"
1122
1123static void netns_create(int dirfd, const char *nsname)
1124{
1125 /* make /var/run/netns shared between mount namespaces
1126 * just like iproute2 sets it up
1127 */
1128 if (mount("", NETNS_RUN_DIR, "none", MS_SHARED | MS_REC, NULL)) {
1129 if (errno != EINVAL) {
1130 perror("mount");
1131 exit(1);
1132 }
1133
1134 if (mount(NETNS_RUN_DIR, NETNS_RUN_DIR, "none",
1135 MS_BIND | MS_REC, NULL)) {
1136 perror("mount");
1137 exit(1);
1138 }
1139
1140 if (mount("", NETNS_RUN_DIR, "none", MS_SHARED | MS_REC,
1141 NULL)) {
1142 perror("mount");
1143 exit(1);
1144 }
1145 }
1146
1147 /* need an empty file to mount on top of */
1148 int nsfd = openat(dirfd, nsname, O_CREAT | O_RDONLY | O_EXCL, 0);
1149
1150 if (nsfd < 0) {
1151 fprintf(stderr, "failed to create \"%s/%s\": %s\n",
1152 NETNS_RUN_DIR, nsname, strerror(errno));
1153 exit(1);
1154 }
1155 close(nsfd);
1156
1157 if (unshare(CLONE_NEWNET)) {
1158 perror("unshare");
1159 unlinkat(dirfd, nsname, 0);
1160 exit(1);
1161 }
1162
1163 char *dstpath = asprintfrr(MTYPE_TMP, "%s/%s", NETNS_RUN_DIR, nsname);
1164
1165 /* bind-mount so the namespace has a name and is persistent */
1166 if (mount("/proc/self/ns/net", dstpath, "none", MS_BIND, NULL) < 0) {
1167 fprintf(stderr, "failed to bind-mount netns to \"%s\": %s\n",
1168 dstpath, strerror(errno));
1169 unlinkat(dirfd, nsname, 0);
1170 exit(1);
1171 }
1172
1173 XFREE(MTYPE_TMP, dstpath);
1174}
1175
1176static void netns_setup(const char *nsname)
1177{
1178 int dirfd, nsfd;
1179
1180 dirfd = open(NETNS_RUN_DIR, O_DIRECTORY | O_RDONLY);
1181 if (dirfd < 0) {
1182 if (errno == ENOTDIR) {
1183 fprintf(stderr, "error: \"%s\" is not a directory!\n",
1184 NETNS_RUN_DIR);
1185 exit(1);
1186 } else if (errno == ENOENT) {
1187 if (mkdir(NETNS_RUN_DIR, 0755)) {
1188 fprintf(stderr, "error: \"%s\": mkdir: %s\n",
1189 NETNS_RUN_DIR, strerror(errno));
1190 exit(1);
1191 }
1192 dirfd = open(NETNS_RUN_DIR, O_DIRECTORY | O_RDONLY);
1193 if (dirfd < 0) {
1194 fprintf(stderr, "error: \"%s\": opendir: %s\n",
1195 NETNS_RUN_DIR, strerror(errno));
1196 exit(1);
1197 }
1198 } else {
1199 fprintf(stderr, "error: \"%s\": %s\n",
1200 NETNS_RUN_DIR, strerror(errno));
1201 exit(1);
1202 }
1203 }
1204
1205 nsfd = openat(dirfd, nsname, O_RDONLY);
1206 if (nsfd < 0 && errno != ENOENT) {
1207 fprintf(stderr, "error: \"%s/%s\": %s\n",
1208 NETNS_RUN_DIR, nsname, strerror(errno));
1209 exit(1);
1210 }
1211 if (nsfd < 0)
1212 netns_create(dirfd, nsname);
1213 else {
1214 if (setns(nsfd, CLONE_NEWNET)) {
1215 perror("setns");
1216 exit(1);
1217 }
1218 close(nsfd);
1219 }
1220 close(dirfd);
1221
1222 /* make sure loopback is up... weird things happen otherwise.
1223 * ioctl is perfectly fine for this, don't need netlink...
1224 */
1225 int sockfd;
1226 struct ifreq ifr = { };
1227
1228 strlcpy(ifr.ifr_name, "lo", sizeof(ifr.ifr_name));
1229
1230 sockfd = socket(AF_INET, SOCK_DGRAM, 0);
1231 if (sockfd < 0) {
1232 perror("socket");
1233 exit(1);
1234 }
1235 if (ioctl(sockfd, SIOCGIFFLAGS, &ifr)) {
1236 perror("ioctl(SIOCGIFFLAGS, \"lo\")");
1237 exit(1);
1238 }
1239 if (!(ifr.ifr_flags & IFF_UP)) {
1240 ifr.ifr_flags |= IFF_UP;
1241 if (ioctl(sockfd, SIOCSIFFLAGS, &ifr)) {
1242 perror("ioctl(SIOCSIFFLAGS, \"lo\")");
1243 exit(1);
1244 }
1245 }
1246 close(sockfd);
1247}
1248
1249#else /* !GNU_LINUX */
1250
/*
 * Fallback used when GNU_LINUX is not defined: report that network
 * namespaces are unavailable and exit with status 1.
 */
static void netns_setup(const char *nsname)
{
	fputs("network namespaces are only available on Linux\n", stderr);
	exit(1);
}
1256#endif
1257
0a7c7856
DL
1258static void watchfrr_init(int argc, char **argv)
1259{
1260 const char *special = "zebra";
1261 int i;
1262 struct daemon *dmn, **add = &gs.daemons;
1263 char alldaemons[512] = "", *p = alldaemons;
1264
5c9d1c83
DL
1265 thread_add_timer_msec(master, startup_timeout, NULL, STARTUP_TIMEOUT,
1266 &gs.t_startup_timeout);
1267
0a7c7856
DL
1268 for (i = optind; i < argc; i++) {
1269 dmn = XCALLOC(MTYPE_WATCHFRR_DAEMON, sizeof(*dmn));
1270
1271 dmn->name = dmn->restart.name = argv[i];
1272 dmn->state = DAEMON_INIT;
1273 gs.numdaemons++;
1274 gs.numdown++;
1275 dmn->fd = -1;
1276 dmn->t_wakeup = NULL;
c0e5cb52 1277 thread_add_timer_msec(master, wakeup_init, dmn, 0,
0a7c7856
DL
1278 &dmn->t_wakeup);
1279 dmn->restart.interval = gs.min_restart_interval;
1280 *add = dmn;
1281 add = &dmn->next;
1282
1283 if (!strcmp(dmn->name, special))
1284 gs.special = dmn;
1285 }
1286
1287 if (!gs.daemons) {
1288 fprintf(stderr,
1289 "Must specify one or more daemons to monitor.\n\n");
1290 frr_help_exit(1);
1291 }
1292 if (!watch_only && !gs.special) {
1293 fprintf(stderr, "\"%s\" daemon must be in daemon lists\n\n",
1294 special);
1295 frr_help_exit(1);
1296 }
1297
1298 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1299 snprintf(p, alldaemons + sizeof(alldaemons) - p, "%s%s",
1300 (p == alldaemons) ? "" : " ", dmn->name);
1301 p += strlen(p);
1302 }
1303 zlog_notice("%s %s watching [%s]%s", progname, FRR_VERSION, alldaemons,
1304 watch_only ? ", monitor mode" : "");
1305}
1306
a6810074 1307struct zebra_privs_t watchfrr_privs = {
95c4aff2 1308#ifdef VTY_GROUP
a6810074 1309 .vty_group = VTY_GROUP,
95c4aff2
DL
1310#endif
1311};
1312
4f04a76b
DL
1313static struct quagga_signal_t watchfrr_signals[] = {
1314 {
1315 .signal = SIGINT,
1316 .handler = sigint,
1317 },
1318 {
1319 .signal = SIGTERM,
1320 .handler = sigint,
1321 },
1322 {
1323 .signal = SIGCHLD,
1324 .handler = sigchild,
1325 },
1326};
1327
1328FRR_DAEMON_INFO(watchfrr, WATCHFRR,
d62a17ae 1329 .flags = FRR_NO_PRIVSEP | FRR_NO_TCPVTY | FRR_LIMITED_CLI
0a7c7856
DL
1330 | FRR_NO_CFG_PID_DRY | FRR_NO_ZCLIENT
1331 | FRR_DETACH_LATER,
4f04a76b 1332
d62a17ae 1333 .printhelp = printhelp,
1334 .copyright = "Copyright 2004 Andrew J. Schorr",
4f04a76b 1335
d62a17ae 1336 .signals = watchfrr_signals,
1337 .n_signals = array_size(watchfrr_signals),
4f04a76b 1338
80413c20
DL
1339 .privs = &watchfrr_privs,
1340);
4f04a76b 1341
999f153e
DL
1342#define DEPRECATED_OPTIONS "aAezR:"
1343
a6810074 1344int main(int argc, char **argv)
8b886ca7 1345{
a6810074 1346 int opt;
a6810074 1347 const char *blankstr = NULL;
33606a15
DL
1348 const char *netns = NULL;
1349 bool netns_en = false;
a6810074 1350
4f04a76b
DL
1351 frr_preinit(&watchfrr_di, argc, argv);
1352 progname = watchfrr_di.progname;
1353
33606a15 1354 frr_opt_add("b:di:k:l:N:p:r:S:s:t:T:" DEPRECATED_OPTIONS, longopts, "");
a6810074
DL
1355
1356 gs.restart.name = "all";
4f04a76b 1357 while ((opt = frr_getopt(argc, argv, NULL)) != EOF) {
999f153e
DL
1358 if (opt && opt < 128 && strchr(DEPRECATED_OPTIONS, opt)) {
1359 fprintf(stderr,
1360 "The -%c option no longer exists.\n"
1361 "Please refer to the watchfrr(8) man page.\n",
1362 opt);
1363 exit(1);
1364 }
1365
a6810074
DL
1366 switch (opt) {
1367 case 0:
1368 break;
a6810074
DL
1369 case 'b':
1370 blankstr = optarg;
1371 break;
f168b713
DL
1372 case OPTION_DRY:
1373 watch_only = true;
a6810074
DL
1374 break;
1375 case 'k':
1376 if (!valid_command(optarg)) {
1377 fprintf(stderr,
1378 "Invalid kill command, must contain '%%s': %s\n",
1379 optarg);
4f04a76b 1380 frr_help_exit(1);
a6810074
DL
1381 }
1382 gs.stop_command = optarg;
1383 break;
d62a17ae 1384 case 'l': {
1385 char garbage[3];
1386 if ((sscanf(optarg, "%d%1s", &gs.loglevel, garbage)
1387 != 1)
1388 || (gs.loglevel < LOG_EMERG)) {
1389 fprintf(stderr,
1390 "Invalid loglevel argument: %s\n",
1391 optarg);
1392 frr_help_exit(1);
a6810074 1393 }
d62a17ae 1394 } break;
1395 case OPTION_MINRESTART: {
1396 char garbage[3];
1397 if ((sscanf(optarg, "%ld%1s", &gs.min_restart_interval,
1398 garbage)
1399 != 1)
1400 || (gs.min_restart_interval < 0)) {
1401 fprintf(stderr,
1402 "Invalid min_restart_interval argument: %s\n",
1403 optarg);
1404 frr_help_exit(1);
a6810074 1405 }
d62a17ae 1406 } break;
1407 case OPTION_MAXRESTART: {
1408 char garbage[3];
1409 if ((sscanf(optarg, "%ld%1s", &gs.max_restart_interval,
1410 garbage)
1411 != 1)
1412 || (gs.max_restart_interval < 0)) {
1413 fprintf(stderr,
1414 "Invalid max_restart_interval argument: %s\n",
1415 optarg);
1416 frr_help_exit(1);
a6810074 1417 }
d62a17ae 1418 } break;
33606a15
DL
1419 case OPTION_NETNS:
1420 netns_en = true;
b12bc77c 1421 if (optarg && strchr(optarg, '/')) {
33606a15
DL
1422 fprintf(stderr,
1423 "invalid network namespace name \"%s\" (may not contain slashes)\n",
1424 optarg);
1425 frr_help_exit(1);
1426 }
1427 netns = optarg;
1428 break;
d62a17ae 1429 case 'i': {
1430 char garbage[3];
1431 int period;
1432 if ((sscanf(optarg, "%d%1s", &period, garbage) != 1)
1433 || (gs.period < 1)) {
1434 fprintf(stderr,
1435 "Invalid interval argument: %s\n",
1436 optarg);
1437 frr_help_exit(1);
a6810074 1438 }
d62a17ae 1439 gs.period = 1000 * period;
1440 } break;
a6810074 1441 case 'p':
0a7c7856 1442 watchfrr_di.pid_file = optarg;
a6810074
DL
1443 break;
1444 case 'r':
a6810074
DL
1445 if (!valid_command(optarg)) {
1446 fprintf(stderr,
1447 "Invalid restart command, must contain '%%s': %s\n",
1448 optarg);
4f04a76b 1449 frr_help_exit(1);
a6810074
DL
1450 }
1451 gs.restart_command = optarg;
a6810074
DL
1452 break;
1453 case 's':
1454 if (!valid_command(optarg)) {
1455 fprintf(stderr,
1456 "Invalid start command, must contain '%%s': %s\n",
1457 optarg);
4f04a76b 1458 frr_help_exit(1);
a6810074
DL
1459 }
1460 gs.start_command = optarg;
1461 break;
1462 case 'S':
1463 gs.vtydir = optarg;
1464 break;
d62a17ae 1465 case 't': {
1466 char garbage[3];
1467 if ((sscanf(optarg, "%ld%1s", &gs.timeout, garbage)
1468 != 1)
1469 || (gs.timeout < 1)) {
1470 fprintf(stderr,
1471 "Invalid timeout argument: %s\n",
1472 optarg);
1473 frr_help_exit(1);
a6810074 1474 }
d62a17ae 1475 } break;
1476 case 'T': {
1477 char garbage[3];
1478 if ((sscanf(optarg, "%ld%1s", &gs.restart_timeout,
1479 garbage)
1480 != 1)
1481 || (gs.restart_timeout < 1)) {
1482 fprintf(stderr,
1483 "Invalid restart timeout argument: %s\n",
1484 optarg);
1485 frr_help_exit(1);
a6810074 1486 }
d62a17ae 1487 } break;
a6810074
DL
1488 default:
1489 fputs("Invalid option.\n", stderr);
4f04a76b 1490 frr_help_exit(1);
a6810074 1491 }
8b886ca7 1492 }
a6810074 1493
71e7975a
DL
1494 if (watch_only
1495 && (gs.start_command || gs.stop_command || gs.restart_command)) {
d87ae5cc 1496 fputs("Options -r/-s/-k are not used when --dry is active.\n",
a6810074 1497 stderr);
8b886ca7 1498 }
f168b713
DL
1499 if (!watch_only
1500 && (!gs.restart_command || !gs.start_command || !gs.stop_command)) {
1501 fprintf(stderr,
1502 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1503 frr_help_exit(1);
8b886ca7 1504 }
8b886ca7 1505
a6810074
DL
1506 if (blankstr) {
1507 if (gs.restart_command)
1508 gs.restart_command =
d62a17ae 1509 translate_blanks(gs.restart_command, blankstr);
a6810074
DL
1510 if (gs.start_command)
1511 gs.start_command =
d62a17ae 1512 translate_blanks(gs.start_command, blankstr);
a6810074
DL
1513 if (gs.stop_command)
1514 gs.stop_command =
d62a17ae 1515 translate_blanks(gs.stop_command, blankstr);
065de903 1516 }
8b886ca7 1517
a6810074 1518 gs.restart.interval = gs.min_restart_interval;
8b886ca7 1519
33606a15
DL
1520 /* env variable for the processes that we start */
1521 if (watchfrr_di.pathspace)
1522 setenv("FRR_PATHSPACE", watchfrr_di.pathspace, 1);
1523 else
1524 unsetenv("FRR_PATHSPACE");
1525
1526 if (netns_en && !netns)
1527 netns = watchfrr_di.pathspace;
1528 if (netns_en && netns && netns[0])
1529 netns_setup(netns);
1530
4f04a76b 1531 master = frr_init();
b647dc2a 1532 watchfrr_error_init();
0a7c7856
DL
1533 watchfrr_init(argc, argv);
1534 watchfrr_vty_init();
1535
1536 frr_config_fork();
4f04a76b 1537
0a7c7856 1538 if (watchfrr_di.daemon_mode)
0bdeb5e5 1539 zlog_syslog_set_prio_min(MIN(gs.loglevel, LOG_DEBUG));
0a7c7856 1540 else
0bdeb5e5 1541 zlog_aux_init(NULL, MIN(gs.loglevel, LOG_DEBUG));
8b886ca7 1542
0a7c7856 1543 frr_run(master);
8b886ca7 1544
a6810074
DL
1545 systemd_send_stopping();
1546 /* Not reached. */
1547 return 0;
8b886ca7 1548}