]> git.proxmox.com Git - mirror_frr.git/blame - watchfrr/watchfrr.c
Merge pull request #2909 from netravnen/feature/git-pl-template
[mirror_frr.git] / watchfrr / watchfrr.c
CommitLineData
/*
 * Monitor status of frr daemons and restart if necessary.
 *
 * Copyright (C) 2004 Andrew J. Schorr
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; see the file COPYING; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
20
a365534f 21#include <zebra.h>
8b886ca7 22#include <thread.h>
23#include <log.h>
52e66296 24#include <network.h>
8b886ca7 25#include <sigevent.h>
a365534f 26#include <lib/version.h>
95c4aff2 27#include "command.h"
87f44e2f 28#include "memory_vty.h"
4f04a76b 29#include "libfrr.h"
b647dc2a 30#include "lib_errors.h"
95c4aff2 31
6f594023 32#include <getopt.h>
a365534f 33#include <sys/un.h>
34#include <sys/wait.h>
837d16cc 35#include <memory.h>
651415bd 36#include <systemd.h>
8b886ca7 37
9473e340 38#include "watchfrr.h"
b647dc2a 39#include "watchfrr_errors.h"
95c4aff2 40
/* Smaller of two values.  Each argument may be evaluated more than once. */
#ifndef MIN
#define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
#endif

/*
 * Timer randomization helpers.  JITTER(X) is random() modulo X+1, shifted
 * down by X/2; FUZZY(X) is X plus the jitter of one twentieth of X.
 */
#define JITTER(X) ((random() % ((X)+1))-((X)/2))
#define FUZZY(X) ((X)+JITTER((X)/20))

/* Built-in defaults, reported by printhelp() and used to initialize gs. */
#define DEFAULT_PERIOD 5
#define DEFAULT_TIMEOUT 90
#define DEFAULT_RESTART_TIMEOUT 20
#define DEFAULT_LOGLEVEL LOG_INFO
#define DEFAULT_MIN_RESTART 60
#define DEFAULT_MAX_RESTART 600

/* Token carried by the echo command sent to each daemon's vty socket. */
#define PING_TOKEN "PING"

/* Needs to be global, referenced somewhere inside libfrr. */
struct thread_master *master;

/* Default pid file path; filled in by main() before options are parsed. */
static char pidfile_default[256];

/* When set, try_restart() returns without running any command. */
static bool watch_only = false;
8b886ca7 63
a6810074
DL
/* Steps of a phased global restart; PHASE_NONE means none is in progress. */
typedef enum {
	PHASE_NONE = 0,
	PHASE_STOPS_PENDING,
	PHASE_WAITING_DOWN,
	PHASE_ZEBRA_RESTART_PENDING,
	PHASE_WAITING_ZEBRA_UP
} restart_phase_t;

/* Log-friendly phase names, indexed by restart_phase_t. */
static const char *phase_str[] = {
	[PHASE_NONE] = "None",
	[PHASE_STOPS_PENDING] = "Stop jobs running",
	[PHASE_WAITING_DOWN] = "Waiting for other daemons to come down",
	[PHASE_ZEBRA_RESTART_PENDING] = "Zebra restart job running",
	[PHASE_WAITING_ZEBRA_UP] = "Waiting for zebra to come up",
	/* Trailing entry with no matching enumerator. */
	"Start jobs running",
};

/* Seconds a phase may last before phase_hanging() aborts the restart. */
#define PHASE_TIMEOUT (3*gs.restart_timeout)
82
a6810074
DL
/* Bookkeeping for one background shell job (see run_job()). */
struct restart_info {
	/* Name substituted into the command string and used in log messages. */
	const char *name;
	/* Job label ("restart", "start", ...) recorded when the job is forked. */
	const char *what;
	/* Pid of the running job; 0 when no job is outstanding. */
	pid_t pid;
	/* When the job was launched; refreshed by sigchild() on completion. */
	struct timeval time;
	/* Minimum number of seconds required between two jobs. */
	long interval;
	/* Timer that runs restart_kill() if the job is still alive. */
	struct thread *t_kill;
	/* Signals sent so far by restart_kill(): SIGTERM first, then SIGKILL. */
	int kills;
};
92
a6810074 93static struct global_state {
a6810074
DL
94 restart_phase_t phase;
95 struct thread *t_phase_hanging;
96 const char *vtydir;
97 long period;
98 long timeout;
99 long restart_timeout;
100 long min_restart_interval;
101 long max_restart_interval;
a6810074
DL
102 struct daemon *daemons;
103 const char *restart_command;
104 const char *start_command;
105 const char *stop_command;
106 struct restart_info restart;
a6810074 107 int loglevel;
d62a17ae 108 struct daemon *special; /* points to zebra when doing phased restart */
a6810074
DL
109 int numdaemons;
110 int numpids;
d62a17ae 111 int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
8b886ca7 112} gs = {
d62a17ae 113 .phase = PHASE_NONE,
64a249ad 114 .vtydir = frr_vtydir,
d62a17ae 115 .period = 1000 * DEFAULT_PERIOD,
116 .timeout = DEFAULT_TIMEOUT,
117 .restart_timeout = DEFAULT_RESTART_TIMEOUT,
118 .loglevel = DEFAULT_LOGLEVEL,
119 .min_restart_interval = DEFAULT_MIN_RESTART,
120 .max_restart_interval = DEFAULT_MAX_RESTART,
d62a17ae 121};
a6810074
DL
122
/* Connection state tracked for each monitored daemon. */
typedef enum {
	DAEMON_INIT,
	DAEMON_DOWN,
	DAEMON_CONNECTING,
	DAEMON_UP,
	DAEMON_UNRESPONSIVE
} daemon_state_t;

/* True while the daemon is connected, whether or not it answers pings. */
#define IS_UP(DMN)                                                             \
	(((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))

/* Log-friendly state names, indexed by daemon_state_t. */
static const char *state_str[] = {
	[DAEMON_INIT] = "Init",
	[DAEMON_DOWN] = "Down",
	[DAEMON_CONNECTING] = "Connecting",
	[DAEMON_UP] = "Up",
	[DAEMON_UNRESPONSIVE] = "Unresponsive",
};
137
138struct daemon {
a6810074
DL
139 const char *name;
140 daemon_state_t state;
141 int fd;
142 struct timeval echo_sent;
d7c0a89a 143 unsigned int connect_tries;
a6810074
DL
144 struct thread *t_wakeup;
145 struct thread *t_read;
146 struct thread *t_write;
147 struct daemon *next;
148 struct restart_info restart;
8b886ca7 149};
150
9272302b
DL
/* getopt_long() return codes for options that have no short form. */
#define OPTION_MINRESTART 2000
#define OPTION_MAXRESTART 2001
#define OPTION_DRY        2002

/* Long option table; the all-zero entry terminates it. */
static const struct option longopts[] = {
	{.name = "daemon", .has_arg = no_argument, .val = 'd'},
	{.name = "statedir", .has_arg = required_argument, .val = 'S'},
	{.name = "loglevel", .has_arg = required_argument, .val = 'l'},
	{.name = "interval", .has_arg = required_argument, .val = 'i'},
	{.name = "timeout", .has_arg = required_argument, .val = 't'},
	{.name = "restart-timeout", .has_arg = required_argument, .val = 'T'},
	{.name = "restart", .has_arg = required_argument, .val = 'r'},
	{.name = "start-command", .has_arg = required_argument, .val = 's'},
	{.name = "kill-command", .has_arg = required_argument, .val = 'k'},
	{.name = "dry", .has_arg = no_argument, .val = OPTION_DRY},
	{.name = "min-restart-interval",
	 .has_arg = required_argument,
	 .val = OPTION_MINRESTART},
	{.name = "max-restart-interval",
	 .has_arg = required_argument,
	 .val = OPTION_MAXRESTART},
	{.name = "pid-file", .has_arg = required_argument, .val = 'p'},
	{.name = "blank-string", .has_arg = required_argument, .val = 'b'},
	{.name = "help", .has_arg = no_argument, .val = 'h'},
	{.name = "version", .has_arg = no_argument, .val = 'v'},
	{.name = NULL, .has_arg = 0, .flag = NULL, .val = 0}};
8b886ca7 173
174static int try_connect(struct daemon *dmn);
175static int wakeup_send_echo(struct thread *t_wakeup);
176static void try_restart(struct daemon *dmn);
177static void phase_check(void);
178
4f04a76b
DL
179static const char *progname;
180static void printhelp(FILE *target)
8b886ca7 181{
d62a17ae 182 fprintf(target,
183 "Usage : %s [OPTION...] <daemon name> ...\n\n\
9473e340 184Watchdog program to monitor status of frr daemons and try to restart\n\
8b886ca7 185them if they are down or unresponsive. It determines whether a daemon is\n\
186up based on whether it can connect to the daemon's vty unix stream socket.\n\
187It then repeatedly sends echo commands over that socket to determine whether\n\
188the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
189on the socket connection and know immediately that the daemon is down.\n\n\
190The daemons to be monitored should be listed on the command line.\n\n\
8b886ca7 191In order to avoid attempting to restart the daemons in a fast loop,\n\
192the -m and -M options allow you to control the minimum delay between\n\
193restart commands. The minimum restart delay is recalculated each time\n\
194a restart is attempted: if the time since the last restart attempt exceeds\n\
195twice the -M value, then the restart delay is set to the -m value.\n\
d62a17ae 196Otherwise, the interval is doubled (but capped at the -M value).\n\n",
f168b713 197 progname);
e757c940 198
d62a17ae 199 fprintf(target,
200 "Options:\n\
8b886ca7 201-d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
202 to syslog instead of stdout.\n\
203-S, --statedir Set the vty socket directory (default is %s)\n\
8b886ca7 204-l, --loglevel Set the logging level (default is %d).\n\
205 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
206 but it can be set higher than %d if extra-verbose debugging\n\
207 messages are desired.\n\
9272302b 208 --min-restart-interval\n\
8b886ca7 209 Set the minimum seconds to wait between invocations of daemon\n\
210 restart commands (default is %d).\n\
9272302b 211 --max-restart-interval\n\
8b886ca7 212 Set the maximum seconds to wait between invocations of daemon\n\
213 restart commands (default is %d).\n\
214-i, --interval Set the status polling interval in seconds (default is %d)\n\
215-t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
216-T, --restart-timeout\n\
217 Set the restart (kill) timeout in seconds (default is %d).\n\
218 If any background jobs are still running after this much\n\
219 time has elapsed, they will be killed.\n\
220-r, --restart Supply a Bourne shell command to use to restart a single\n\
221 daemon. The command string should include '%%s' where the\n\
222 name of the daemon should be substituted.\n\
8b886ca7 223-s, --start-command\n\
224 Supply a Bourne shell to command to use to start a single\n\
225 daemon. The command string should include '%%s' where the\n\
226 name of the daemon should be substituted.\n\
227-k, --kill-command\n\
228 Supply a Bourne shell to command to use to stop a single\n\
229 daemon. The command string should include '%%s' where the\n\
230 name of the daemon should be substituted.\n\
f168b713 231 --dry Do not start or restart anything, just log.\n\
8b886ca7 232-p, --pid-file Set process identifier file name\n\
233 (default is %s).\n\
c8b40f86 234-b, --blank-string\n\
235 When the supplied argument string is found in any of the\n\
f168b713 236 various shell command arguments (-r, -s, or -k), replace\n\
c8b40f86 237 it with a space. This is an ugly hack to circumvent problems\n\
238 passing command-line arguments with embedded spaces.\n\
8b886ca7 239-v, --version Print program version\n\
d62a17ae 240-h, --help Display this help and exit\n",
64a249ad 241 frr_vtydir, DEFAULT_LOGLEVEL, LOG_EMERG, LOG_DEBUG, LOG_DEBUG,
d62a17ae 242 DEFAULT_MIN_RESTART, DEFAULT_MAX_RESTART, DEFAULT_PERIOD,
64a249ad 243 DEFAULT_TIMEOUT, DEFAULT_RESTART_TIMEOUT, pidfile_default);
8b886ca7 244}
245
a6810074 246static pid_t run_background(char *shell_cmd)
8b886ca7 247{
a6810074
DL
248 pid_t child;
249
250 switch (child = fork()) {
251 case -1:
09c866e3
QY
252 flog_err_sys(LIB_ERR_SYSTEM_CALL,
253 "fork failed, cannot run command [%s]: %s",
254 shell_cmd, safe_strerror(errno));
a6810074
DL
255 return -1;
256 case 0:
257 /* Child process. */
d62a17ae 258 /* Use separate process group so child processes can be killed
259 * easily. */
a6810074
DL
260 if (setpgid(0, 0) < 0)
261 zlog_warn("warning: setpgid(0,0) failed: %s",
262 safe_strerror(errno));
263 {
264 char shell[] = "sh";
265 char dashc[] = "-c";
d62a17ae 266 char *const argv[4] = {shell, dashc, shell_cmd, NULL};
a6810074 267 execv("/bin/sh", argv);
09c866e3
QY
268 flog_err_sys(LIB_ERR_SYSTEM_CALL,
269 "execv(/bin/sh -c '%s') failed: %s",
270 shell_cmd, safe_strerror(errno));
a6810074
DL
271 _exit(127);
272 }
273 default:
274 /* Parent process: we will reap the child later. */
09c866e3
QY
275 flog_err_sys(LIB_ERR_SYSTEM_CALL,
276 "Forked background command [pid %d]: %s",
277 (int)child, shell_cmd);
a6810074
DL
278 return child;
279 }
8b886ca7 280}
281
a6810074
DL
282static struct timeval *time_elapsed(struct timeval *result,
283 const struct timeval *start_time)
8b886ca7 284{
a6810074
DL
285 gettimeofday(result, NULL);
286 result->tv_sec -= start_time->tv_sec;
287 result->tv_usec -= start_time->tv_usec;
288 while (result->tv_usec < 0) {
289 result->tv_usec += 1000000L;
290 result->tv_sec--;
291 }
292 return result;
8b886ca7 293}
294
a6810074 295static int restart_kill(struct thread *t_kill)
8b886ca7 296{
a6810074
DL
297 struct restart_info *restart = THREAD_ARG(t_kill);
298 struct timeval delay;
299
300 time_elapsed(&delay, &restart->time);
d62a17ae 301 zlog_warn(
302 "Warning: %s %s child process %d still running after "
303 "%ld seconds, sending signal %d",
304 restart->what, restart->name, (int)restart->pid,
305 (long)delay.tv_sec, (restart->kills ? SIGKILL : SIGTERM));
a6810074
DL
306 kill(-restart->pid, (restart->kills ? SIGKILL : SIGTERM));
307 restart->kills++;
66e78ae6
QY
308 restart->t_kill = NULL;
309 thread_add_timer(master, restart_kill, restart, gs.restart_timeout,
310 &restart->t_kill);
a6810074 311 return 0;
8b886ca7 312}
313
a6810074 314static struct restart_info *find_child(pid_t child)
8b886ca7 315{
f168b713 316 struct daemon *dmn;
7c265f7d
CF
317 if (gs.restart.pid == child)
318 return &gs.restart;
319
f168b713
DL
320 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
321 if (dmn->restart.pid == child)
322 return &dmn->restart;
a6810074
DL
323 }
324 return NULL;
8b886ca7 325}
326
a6810074 327static void sigchild(void)
8b886ca7 328{
a6810074
DL
329 pid_t child;
330 int status;
331 const char *name;
332 const char *what;
333 struct restart_info *restart;
334
335 switch (child = waitpid(-1, &status, WNOHANG)) {
336 case -1:
09c866e3
QY
337 flog_err_sys(LIB_ERR_SYSTEM_CALL, "waitpid failed: %s",
338 safe_strerror(errno));
a6810074
DL
339 return;
340 case 0:
341 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
342 return;
343 }
344
345 if (child == integrated_write_pid) {
346 integrated_write_sigchld(status);
347 return;
348 }
349
350 if ((restart = find_child(child)) != NULL) {
351 name = restart->name;
352 what = restart->what;
353 restart->pid = 0;
354 gs.numpids--;
355 thread_cancel(restart->t_kill);
356 restart->t_kill = NULL;
d62a17ae 357 /* Update restart time to reflect the time the command
358 * completed. */
a6810074
DL
359 gettimeofday(&restart->time, NULL);
360 } else {
09c866e3
QY
361 flog_err_sys(
362 LIB_ERR_SYSTEM_CALL,
363 "waitpid returned status for an unknown child process %d",
364 (int)child);
a6810074
DL
365 name = "(unknown)";
366 what = "background";
367 }
368 if (WIFSTOPPED(status))
d62a17ae 369 zlog_warn("warning: %s %s process %d is stopped", what, name,
370 (int)child);
a6810074 371 else if (WIFSIGNALED(status))
d62a17ae 372 zlog_warn("%s %s process %d terminated due to signal %d", what,
373 name, (int)child, WTERMSIG(status));
a6810074
DL
374 else if (WIFEXITED(status)) {
375 if (WEXITSTATUS(status) != 0)
d62a17ae 376 zlog_warn(
377 "%s %s process %d exited with non-zero status %d",
378 what, name, (int)child, WEXITSTATUS(status));
a6810074
DL
379 else
380 zlog_debug("%s %s process %d exited normally", what,
381 name, (int)child);
382 } else
09c866e3
QY
383 flog_err_sys(
384 LIB_ERR_SYSTEM_CALL,
385 "cannot interpret %s %s process %d wait status 0x%x",
386 what, name, (int)child, status);
a6810074 387 phase_check();
8b886ca7 388}
389
d62a17ae 390static int run_job(struct restart_info *restart, const char *cmdtype,
391 const char *command, int force, int update_interval)
8b886ca7 392{
a6810074
DL
393 struct timeval delay;
394
395 if (gs.loglevel > LOG_DEBUG + 1)
396 zlog_debug("attempting to %s %s", cmdtype, restart->name);
397
398 if (restart->pid) {
399 if (gs.loglevel > LOG_DEBUG + 1)
d62a17ae 400 zlog_debug(
401 "cannot %s %s, previous pid %d still running",
402 cmdtype, restart->name, (int)restart->pid);
a6810074
DL
403 return -1;
404 }
405
d62a17ae 406 /* Note: time_elapsed test must come before the force test, since we
407 need
a6810074
DL
408 to make sure that delay is initialized for use below in updating the
409 restart interval. */
410 if ((time_elapsed(&delay, &restart->time)->tv_sec < restart->interval)
411 && !force) {
412 if (gs.loglevel > LOG_DEBUG + 1)
d62a17ae 413 zlog_debug(
414 "postponing %s %s: "
415 "elapsed time %ld < retry interval %ld",
416 cmdtype, restart->name, (long)delay.tv_sec,
417 restart->interval);
a6810074
DL
418 return -1;
419 }
420
421 gettimeofday(&restart->time, NULL);
422 restart->kills = 0;
423 {
424 char cmd[strlen(command) + strlen(restart->name) + 1];
425 snprintf(cmd, sizeof(cmd), command, restart->name);
426 if ((restart->pid = run_background(cmd)) > 0) {
66e78ae6 427 restart->t_kill = NULL;
d62a17ae 428 thread_add_timer(master, restart_kill, restart,
429 gs.restart_timeout, &restart->t_kill);
a6810074
DL
430 restart->what = cmdtype;
431 gs.numpids++;
432 } else
433 restart->pid = 0;
434 }
435
436 /* Calculate the new restart interval. */
437 if (update_interval) {
438 if (delay.tv_sec > 2 * gs.max_restart_interval)
439 restart->interval = gs.min_restart_interval;
440 else if ((restart->interval *= 2) > gs.max_restart_interval)
441 restart->interval = gs.max_restart_interval;
442 if (gs.loglevel > LOG_DEBUG + 1)
443 zlog_debug("restart %s interval is now %ld",
444 restart->name, restart->interval);
445 }
446 return restart->pid;
8b886ca7 447}
448
/*
 * Event (re)scheduling helpers for a struct daemon.
 *
 * Each macro clears the daemon's event slot and then registers a new event
 * whose back-reference is stored in that slot.  The definitions deliberately
 * do NOT end with a semicolon: the caller supplies it, which keeps
 * "if (x) SET_...(dmn); else ..." well-formed.
 */

/* Watch the daemon's socket for readability; handled by handle_read(). */
#define SET_READ_HANDLER(DMN)                                                  \
	do {                                                                   \
		(DMN)->t_read = NULL;                                          \
		thread_add_read(master, handle_read, (DMN), (DMN)->fd,         \
				&(DMN)->t_read);                               \
	} while (0)

/* Schedule wakeup_down() after one fuzzed polling period. */
#define SET_WAKEUP_DOWN(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_down, (DMN),              \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Schedule wakeup_unresponsive() after one fuzzed polling period. */
#define SET_WAKEUP_UNRESPONSIVE(DMN)                                           \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_unresponsive, (DMN),      \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Schedule wakeup_send_echo() after one fuzzed polling period. */
#define SET_WAKEUP_ECHO(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_send_echo, (DMN),         \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)
8b886ca7 476
a6810074 477static int wakeup_down(struct thread *t_wakeup)
8b886ca7 478{
a6810074
DL
479 struct daemon *dmn = THREAD_ARG(t_wakeup);
480
481 dmn->t_wakeup = NULL;
482 if (try_connect(dmn) < 0)
483 SET_WAKEUP_DOWN(dmn);
484 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
485 try_restart(dmn);
486 return 0;
8b886ca7 487}
488
a6810074 489static int wakeup_init(struct thread *t_wakeup)
8b886ca7 490{
a6810074
DL
491 struct daemon *dmn = THREAD_ARG(t_wakeup);
492
493 dmn->t_wakeup = NULL;
494 if (try_connect(dmn) < 0) {
495 SET_WAKEUP_DOWN(dmn);
af4c2728 496 flog_err(WATCHFRR_ERR_CONNECTION,
b647dc2a
DS
497 "%s state -> down : initial connection attempt failed",
498 dmn->name);
a6810074
DL
499 dmn->state = DAEMON_DOWN;
500 }
501 return 0;
8b886ca7 502}
503
a6810074 504static void daemon_down(struct daemon *dmn, const char *why)
8b886ca7 505{
a6810074 506 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
af4c2728 507 flog_err(WATCHFRR_ERR_CONNECTION,
b647dc2a 508 "%s state -> down : %s", dmn->name, why);
a6810074
DL
509 else if (gs.loglevel > LOG_DEBUG)
510 zlog_debug("%s still down : %s", dmn->name, why);
511 if (IS_UP(dmn))
512 gs.numdown++;
513 dmn->state = DAEMON_DOWN;
514 if (dmn->fd >= 0) {
515 close(dmn->fd);
516 dmn->fd = -1;
517 }
518 THREAD_OFF(dmn->t_read);
519 THREAD_OFF(dmn->t_write);
520 THREAD_OFF(dmn->t_wakeup);
521 if (try_connect(dmn) < 0)
522 SET_WAKEUP_DOWN(dmn);
523 phase_check();
8b886ca7 524}
525
a6810074 526static int handle_read(struct thread *t_read)
8b886ca7 527{
a6810074
DL
528 struct daemon *dmn = THREAD_ARG(t_read);
529 static const char resp[sizeof(PING_TOKEN) + 4] = PING_TOKEN "\n";
530 char buf[sizeof(resp) + 100];
531 ssize_t rc;
532 struct timeval delay;
533
534 dmn->t_read = NULL;
535 if ((rc = read(dmn->fd, buf, sizeof(buf))) < 0) {
536 char why[100];
537
538 if (ERRNO_IO_RETRY(errno)) {
539 /* Pretend it never happened. */
540 SET_READ_HANDLER(dmn);
541 return 0;
542 }
543 snprintf(why, sizeof(why), "unexpected read error: %s",
544 safe_strerror(errno));
545 daemon_down(dmn, why);
546 return 0;
8b886ca7 547 }
a6810074
DL
548 if (rc == 0) {
549 daemon_down(dmn, "read returned EOF");
550 return 0;
551 }
552 if (!dmn->echo_sent.tv_sec) {
553 char why[sizeof(buf) + 100];
554 snprintf(why, sizeof(why),
555 "unexpected read returns %d bytes: %.*s", (int)rc,
556 (int)rc, buf);
557 daemon_down(dmn, why);
558 return 0;
8b886ca7 559 }
a6810074
DL
560
561 /* We are expecting an echo response: is there any chance that the
562 response would not be returned entirely in the first read? That
563 seems inconceivable... */
564 if ((rc != sizeof(resp)) || memcmp(buf, resp, sizeof(resp))) {
565 char why[100 + sizeof(buf)];
566 snprintf(why, sizeof(why),
567 "read returned bad echo response of %d bytes "
d62a17ae 568 "(expecting %u): %.*s",
d7c0a89a 569 (int)rc, (unsigned int)sizeof(resp), (int)rc, buf);
a6810074
DL
570 daemon_down(dmn, why);
571 return 0;
572 }
573
574 time_elapsed(&delay, &dmn->echo_sent);
575 dmn->echo_sent.tv_sec = 0;
576 if (dmn->state == DAEMON_UNRESPONSIVE) {
577 if (delay.tv_sec < gs.timeout) {
578 dmn->state = DAEMON_UP;
d62a17ae 579 zlog_warn(
580 "%s state -> up : echo response received after %ld.%06ld "
581 "seconds",
582 dmn->name, (long)delay.tv_sec,
583 (long)delay.tv_usec);
a6810074 584 } else
d62a17ae 585 zlog_warn(
586 "%s: slow echo response finally received after %ld.%06ld "
587 "seconds",
588 dmn->name, (long)delay.tv_sec,
589 (long)delay.tv_usec);
a6810074
DL
590 } else if (gs.loglevel > LOG_DEBUG + 1)
591 zlog_debug("%s: echo response received after %ld.%06ld seconds",
592 dmn->name, (long)delay.tv_sec, (long)delay.tv_usec);
593
594 SET_READ_HANDLER(dmn);
595 if (dmn->t_wakeup)
596 thread_cancel(dmn->t_wakeup);
597 SET_WAKEUP_ECHO(dmn);
598
599 return 0;
8b886ca7 600}
601
207e0d7a
DS
602/*
603 * Wait till we notice that all daemons are ready before
604 * we send we are ready to systemd
605 */
a6810074 606static void daemon_send_ready(void)
207e0d7a 607{
a6810074
DL
608 static int sent = 0;
609 if (!sent && gs.numdown == 0) {
a6810074 610 FILE *fp;
207e0d7a 611
a6810074 612 fp = fopen(DAEMON_VTY_DIR "/watchfrr.started", "w");
f5ba21fc
DS
613 if (fp)
614 fclose(fp);
60bd2534 615#if defined HAVE_SYSTEMD
d62a17ae 616 zlog_notice(
617 "Watchfrr: Notifying Systemd we are up and running");
a6810074 618 systemd_send_started(master, 0);
60bd2534 619#endif
a6810074
DL
620 sent = 1;
621 }
207e0d7a
DS
622}
623
a6810074 624static void daemon_up(struct daemon *dmn, const char *why)
8b886ca7 625{
a6810074
DL
626 dmn->state = DAEMON_UP;
627 gs.numdown--;
628 dmn->connect_tries = 0;
629 zlog_notice("%s state -> up : %s", dmn->name, why);
630 daemon_send_ready();
a8cbb8b3 631 SET_WAKEUP_ECHO(dmn);
a6810074 632 phase_check();
8b886ca7 633}
634
a6810074 635static int check_connect(struct thread *t_write)
8b886ca7 636{
a6810074
DL
637 struct daemon *dmn = THREAD_ARG(t_write);
638 int sockerr;
639 socklen_t reslen = sizeof(sockerr);
640
641 dmn->t_write = NULL;
642 if (getsockopt(dmn->fd, SOL_SOCKET, SO_ERROR, (char *)&sockerr, &reslen)
643 < 0) {
644 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn->name,
645 safe_strerror(errno));
646 daemon_down(dmn,
647 "getsockopt failed checking connection success");
648 return 0;
649 }
650 if ((reslen == sizeof(sockerr)) && sockerr) {
651 char why[100];
d62a17ae 652 snprintf(
653 why, sizeof(why),
654 "getsockopt reports that connection attempt failed: %s",
655 safe_strerror(sockerr));
a6810074
DL
656 daemon_down(dmn, why);
657 return 0;
658 }
659
660 daemon_up(dmn, "delayed connect succeeded");
661 return 0;
8b886ca7 662}
663
a6810074 664static int wakeup_connect_hanging(struct thread *t_wakeup)
8b886ca7 665{
a6810074
DL
666 struct daemon *dmn = THREAD_ARG(t_wakeup);
667 char why[100];
668
669 dmn->t_wakeup = NULL;
670 snprintf(why, sizeof(why),
671 "connection attempt timed out after %ld seconds", gs.timeout);
672 daemon_down(dmn, why);
673 return 0;
8b886ca7 674}
675
676/* Making connection to protocol daemon. */
a6810074 677static int try_connect(struct daemon *dmn)
8b886ca7 678{
a6810074
DL
679 int sock;
680 struct sockaddr_un addr;
681 socklen_t len;
682
683 if (gs.loglevel > LOG_DEBUG + 1)
684 zlog_debug("%s: attempting to connect", dmn->name);
685 dmn->connect_tries++;
686
687 memset(&addr, 0, sizeof(struct sockaddr_un));
688 addr.sun_family = AF_UNIX;
d62a17ae 689 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty", gs.vtydir,
690 dmn->name);
6f0e3f6e 691#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
a6810074 692 len = addr.sun_len = SUN_LEN(&addr);
8b886ca7 693#else
a6810074 694 len = sizeof(addr.sun_family) + strlen(addr.sun_path);
d62a17ae 695#endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
a6810074
DL
696
697 /* Quick check to see if we might succeed before we go to the trouble
698 of creating a socket. */
699 if (access(addr.sun_path, W_OK) < 0) {
700 if (errno != ENOENT)
09c866e3
QY
701 flog_err_sys(LIB_ERR_SYSTEM_CALL,
702 "%s: access to socket %s denied: %s",
703 dmn->name, addr.sun_path,
704 safe_strerror(errno));
a6810074
DL
705 return -1;
706 }
707
708 if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
09c866e3
QY
709 flog_err_sys(LIB_ERR_SOCKET, "%s(%s): cannot make socket: %s",
710 __func__, addr.sun_path, safe_strerror(errno));
a6810074
DL
711 return -1;
712 }
713
714 if (set_nonblocking(sock) < 0 || set_cloexec(sock) < 0) {
09c866e3
QY
715 flog_err_sys(LIB_ERR_SYSTEM_CALL,
716 "%s(%s): set_nonblocking/cloexec(%d) failed",
717 __func__, addr.sun_path, sock);
a6810074
DL
718 close(sock);
719 return -1;
8b886ca7 720 }
a6810074
DL
721
722 if (connect(sock, (struct sockaddr *)&addr, len) < 0) {
723 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK)) {
724 if (gs.loglevel > LOG_DEBUG)
725 zlog_debug("%s(%s): connect failed: %s",
726 __func__, addr.sun_path,
727 safe_strerror(errno));
728 close(sock);
729 return -1;
730 }
731 if (gs.loglevel > LOG_DEBUG)
732 zlog_debug("%s: connection in progress", dmn->name);
733 dmn->state = DAEMON_CONNECTING;
734 dmn->fd = sock;
66e78ae6
QY
735 dmn->t_write = NULL;
736 thread_add_write(master, check_connect, dmn, dmn->fd,
d62a17ae 737 &dmn->t_write);
738 dmn->t_wakeup = NULL;
739 thread_add_timer(master, wakeup_connect_hanging, dmn,
740 gs.timeout, &dmn->t_wakeup);
a6810074
DL
741 SET_READ_HANDLER(dmn);
742 return 0;
743 }
744
745 dmn->fd = sock;
746 SET_READ_HANDLER(dmn);
747 daemon_up(dmn, "connect succeeded");
748 return 1;
8b886ca7 749}
750
a6810074 751static int phase_hanging(struct thread *t_hanging)
8b886ca7 752{
a6810074 753 gs.t_phase_hanging = NULL;
af4c2728 754 flog_err(WATCHFRR_ERR_CONNECTION,
b647dc2a
DS
755 "Phase [%s] hanging for %ld seconds, aborting phased restart",
756 phase_str[gs.phase], PHASE_TIMEOUT);
a6810074
DL
757 gs.phase = PHASE_NONE;
758 return 0;
8b886ca7 759}
760
a6810074 761static void set_phase(restart_phase_t new_phase)
8b886ca7 762{
a6810074
DL
763 gs.phase = new_phase;
764 if (gs.t_phase_hanging)
765 thread_cancel(gs.t_phase_hanging);
66e78ae6
QY
766 gs.t_phase_hanging = NULL;
767 thread_add_timer(master, phase_hanging, NULL, PHASE_TIMEOUT,
768 &gs.t_phase_hanging);
8b886ca7 769}
770
a6810074 771static void phase_check(void)
8b886ca7 772{
a6810074
DL
773 switch (gs.phase) {
774 case PHASE_NONE:
775 break;
776 case PHASE_STOPS_PENDING:
777 if (gs.numpids)
778 break;
d62a17ae 779 zlog_info(
780 "Phased restart: all routing daemon stop jobs have completed.");
a6810074
DL
781 set_phase(PHASE_WAITING_DOWN);
782
d62a17ae 783 /*FALLTHRU*/
a6810074
DL
784 case PHASE_WAITING_DOWN:
785 if (gs.numdown + IS_UP(gs.special) < gs.numdaemons)
786 break;
787 zlog_info("Phased restart: all routing daemons now down.");
788 run_job(&gs.special->restart, "restart", gs.restart_command, 1,
789 1);
790 set_phase(PHASE_ZEBRA_RESTART_PENDING);
791
d62a17ae 792 /*FALLTHRU*/
a6810074
DL
793 case PHASE_ZEBRA_RESTART_PENDING:
794 if (gs.special->restart.pid)
795 break;
796 zlog_info("Phased restart: %s restart job completed.",
797 gs.special->name);
798 set_phase(PHASE_WAITING_ZEBRA_UP);
799
d62a17ae 800 /*FALLTHRU*/
a6810074
DL
801 case PHASE_WAITING_ZEBRA_UP:
802 if (!IS_UP(gs.special))
803 break;
804 zlog_info("Phased restart: %s is now up.", gs.special->name);
805 {
806 struct daemon *dmn;
807 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
808 if (dmn != gs.special)
809 run_job(&dmn->restart, "start",
810 gs.start_command, 1, 0);
811 }
812 }
813 gs.phase = PHASE_NONE;
814 THREAD_OFF(gs.t_phase_hanging);
815 zlog_notice("Phased global restart has completed.");
816 break;
817 }
8b886ca7 818}
819
a6810074 820static void try_restart(struct daemon *dmn)
8b886ca7 821{
f168b713 822 if (watch_only)
a6810074 823 return;
a6810074 824
f168b713
DL
825 if (dmn != gs.special) {
826 if ((gs.special->state == DAEMON_UP)
827 && (gs.phase == PHASE_NONE))
828 run_job(&dmn->restart, "restart", gs.restart_command, 0,
829 1);
830 else
831 zlog_debug(
832 "%s: postponing restart attempt because master %s daemon "
833 "not up [%s], or phased restart in progress",
834 dmn->name, gs.special->name,
835 state_str[gs.special->state]);
836 return;
837 }
838
839 if ((gs.phase != PHASE_NONE) || gs.numpids) {
840 if (gs.loglevel > LOG_DEBUG + 1)
841 zlog_debug(
842 "postponing phased global restart: restart already in "
843 "progress [%s], or outstanding child processes [%d]",
844 phase_str[gs.phase], gs.numpids);
845 return;
846 }
847 /* Is it too soon for a restart? */
848 {
849 struct timeval delay;
850 if (time_elapsed(&delay, &gs.special->restart.time)->tv_sec
851 < gs.special->restart.interval) {
a6810074 852 if (gs.loglevel > LOG_DEBUG + 1)
d62a17ae 853 zlog_debug(
f168b713
DL
854 "postponing phased global restart: "
855 "elapsed time %ld < retry interval %ld",
856 (long)delay.tv_sec,
857 gs.special->restart.interval);
858 return;
a6810074 859 }
8b886ca7 860 }
f168b713 861 run_job(&gs.restart, "restart", gs.restart_command, 0, 1);
8b886ca7 862}
863
a6810074 864static int wakeup_unresponsive(struct thread *t_wakeup)
8b886ca7 865{
a6810074
DL
866 struct daemon *dmn = THREAD_ARG(t_wakeup);
867
868 dmn->t_wakeup = NULL;
869 if (dmn->state != DAEMON_UNRESPONSIVE)
af4c2728 870 flog_err(WATCHFRR_ERR_CONNECTION,
b647dc2a
DS
871 "%s: no longer unresponsive (now %s), "
872 "wakeup should have been cancelled!",
873 dmn->name, state_str[dmn->state]);
a6810074
DL
874 else {
875 SET_WAKEUP_UNRESPONSIVE(dmn);
876 try_restart(dmn);
877 }
878 return 0;
8b886ca7 879}
880
a6810074 881static int wakeup_no_answer(struct thread *t_wakeup)
8b886ca7 882{
a6810074
DL
883 struct daemon *dmn = THREAD_ARG(t_wakeup);
884
885 dmn->t_wakeup = NULL;
886 dmn->state = DAEMON_UNRESPONSIVE;
af4c2728 887 flog_err(WATCHFRR_ERR_CONNECTION,
b647dc2a
DS
888 "%s state -> unresponsive : no response yet to ping "
889 "sent %ld seconds ago",
890 dmn->name, gs.timeout);
71e7975a
DL
891 SET_WAKEUP_UNRESPONSIVE(dmn);
892 try_restart(dmn);
a6810074 893 return 0;
8b886ca7 894}
895
a6810074 896static int wakeup_send_echo(struct thread *t_wakeup)
8b886ca7 897{
a6810074
DL
898 static const char echocmd[] = "echo " PING_TOKEN;
899 ssize_t rc;
900 struct daemon *dmn = THREAD_ARG(t_wakeup);
901
902 dmn->t_wakeup = NULL;
d62a17ae 903 if (((rc = write(dmn->fd, echocmd, sizeof(echocmd))) < 0)
904 || ((size_t)rc != sizeof(echocmd))) {
a6810074
DL
905 char why[100 + sizeof(echocmd)];
906 snprintf(why, sizeof(why),
907 "write '%s' returned %d instead of %u", echocmd,
d7c0a89a 908 (int)rc, (unsigned int)sizeof(echocmd));
a6810074
DL
909 daemon_down(dmn, why);
910 } else {
911 gettimeofday(&dmn->echo_sent, NULL);
66e78ae6
QY
912 dmn->t_wakeup = NULL;
913 thread_add_timer(master, wakeup_no_answer, dmn, gs.timeout,
914 &dmn->t_wakeup);
a6810074
DL
915 }
916 return 0;
8b886ca7 917}
918
470bc619
QY
919bool check_all_up(void)
920{
921 struct daemon *dmn;
922
923 for (dmn = gs.daemons; dmn; dmn = dmn->next)
924 if (dmn->state != DAEMON_UP)
925 return false;
926 return true;
927}
928
/*
 * Termination handler (installed for SIGINT and SIGTERM): log, report
 * "stopping" through the systemd helper, and exit successfully.
 */
static void sigint(void)
{
	zlog_notice("Terminating on signal");

	systemd_send_stopping();

	exit(0);
}
935
/*
 * A command template is valid when it contains exactly one '%' and that
 * '%' is immediately followed by 's'.  Returns 1 if valid, 0 otherwise.
 */
static int valid_command(const char *cmd)
{
	const char *pct = strchr(cmd, '%');

	if (pct == NULL)
		return 0;
	if (pct[1] != 's')
		return 0;

	/* No further conversion (or literal '%') may follow. */
	return strchr(pct + 1, '%') == NULL;
}
943
/* This is an ugly hack to circumvent problems with passing command-line
   arguments that contain spaces.  The fix is to use a configuration file.

   Returns a newly allocated copy of cmd in which every occurrence of
   blankstr has been replaced by a single space; the caller owns the result.
   The program exits if the copy cannot be allocated.

   An empty blankstr, or a blankstr that is already a single space, leaves
   nothing to translate and the plain copy is returned.  Without that guard
   the loop below would never terminate for those inputs, and for the empty
   string it would also grow the copy past the end of its allocation. */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	char *res;
	char *p;
	size_t bslen = strlen(blankstr);

	if (!(res = strdup(cmd))) {
		perror("strdup");
		exit(1);
	}

	if (bslen == 0 || strcmp(blankstr, " ") == 0)
		return res;

	/* Each pass either shortens the string or turns a non-space byte
	   into a space, so the loop terminates. */
	while ((p = strstr(res, blankstr)) != NULL) {
		*p = ' ';
		if (bslen != 1)
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
	}
	return res;
}
963
a6810074 964struct zebra_privs_t watchfrr_privs = {
95c4aff2 965#ifdef VTY_GROUP
a6810074 966 .vty_group = VTY_GROUP,
95c4aff2
DL
967#endif
968};
969
4f04a76b
DL
970static struct quagga_signal_t watchfrr_signals[] = {
971 {
972 .signal = SIGINT,
973 .handler = sigint,
974 },
975 {
976 .signal = SIGTERM,
977 .handler = sigint,
978 },
979 {
980 .signal = SIGCHLD,
981 .handler = sigchild,
982 },
983};
984
985FRR_DAEMON_INFO(watchfrr, WATCHFRR,
d62a17ae 986 .flags = FRR_NO_PRIVSEP | FRR_NO_TCPVTY | FRR_LIMITED_CLI
987 | FRR_NO_CFG_PID_DRY | FRR_NO_ZCLIENT,
4f04a76b 988
d62a17ae 989 .printhelp = printhelp,
990 .copyright = "Copyright 2004 Andrew J. Schorr",
4f04a76b 991
d62a17ae 992 .signals = watchfrr_signals,
993 .n_signals = array_size(watchfrr_signals),
4f04a76b 994
d62a17ae 995 .privs = &watchfrr_privs, )
4f04a76b 996
999f153e
DL
997#define DEPRECATED_OPTIONS "aAezR:"
998
a6810074 999int main(int argc, char **argv)
8b886ca7 1000{
a6810074 1001 int opt;
64a249ad 1002 const char *pidfile = pidfile_default;
a6810074
DL
1003 const char *special = "zebra";
1004 const char *blankstr = NULL;
a6810074 1005
64a249ad
DL
1006 snprintf(pidfile_default, sizeof(pidfile_default), "%s/watchfrr.pid",
1007 frr_vtydir);
1008
4f04a76b
DL
1009 frr_preinit(&watchfrr_di, argc, argv);
1010 progname = watchfrr_di.progname;
1011
999f153e 1012 frr_opt_add("b:dk:l:i:p:r:S:s:t:T:" DEPRECATED_OPTIONS, longopts, "");
a6810074
DL
1013
1014 gs.restart.name = "all";
4f04a76b 1015 while ((opt = frr_getopt(argc, argv, NULL)) != EOF) {
999f153e
DL
1016 if (opt && opt < 128 && strchr(DEPRECATED_OPTIONS, opt)) {
1017 fprintf(stderr,
1018 "The -%c option no longer exists.\n"
1019 "Please refer to the watchfrr(8) man page.\n",
1020 opt);
1021 exit(1);
1022 }
1023
a6810074
DL
1024 switch (opt) {
1025 case 0:
1026 break;
a6810074
DL
1027 case 'b':
1028 blankstr = optarg;
1029 break;
f168b713
DL
1030 case OPTION_DRY:
1031 watch_only = true;
a6810074
DL
1032 break;
1033 case 'k':
1034 if (!valid_command(optarg)) {
1035 fprintf(stderr,
1036 "Invalid kill command, must contain '%%s': %s\n",
1037 optarg);
4f04a76b 1038 frr_help_exit(1);
a6810074
DL
1039 }
1040 gs.stop_command = optarg;
1041 break;
d62a17ae 1042 case 'l': {
1043 char garbage[3];
1044 if ((sscanf(optarg, "%d%1s", &gs.loglevel, garbage)
1045 != 1)
1046 || (gs.loglevel < LOG_EMERG)) {
1047 fprintf(stderr,
1048 "Invalid loglevel argument: %s\n",
1049 optarg);
1050 frr_help_exit(1);
a6810074 1051 }
d62a17ae 1052 } break;
1053 case OPTION_MINRESTART: {
1054 char garbage[3];
1055 if ((sscanf(optarg, "%ld%1s", &gs.min_restart_interval,
1056 garbage)
1057 != 1)
1058 || (gs.min_restart_interval < 0)) {
1059 fprintf(stderr,
1060 "Invalid min_restart_interval argument: %s\n",
1061 optarg);
1062 frr_help_exit(1);
a6810074 1063 }
d62a17ae 1064 } break;
1065 case OPTION_MAXRESTART: {
1066 char garbage[3];
1067 if ((sscanf(optarg, "%ld%1s", &gs.max_restart_interval,
1068 garbage)
1069 != 1)
1070 || (gs.max_restart_interval < 0)) {
1071 fprintf(stderr,
1072 "Invalid max_restart_interval argument: %s\n",
1073 optarg);
1074 frr_help_exit(1);
a6810074 1075 }
d62a17ae 1076 } break;
1077 case 'i': {
1078 char garbage[3];
1079 int period;
1080 if ((sscanf(optarg, "%d%1s", &period, garbage) != 1)
1081 || (gs.period < 1)) {
1082 fprintf(stderr,
1083 "Invalid interval argument: %s\n",
1084 optarg);
1085 frr_help_exit(1);
a6810074 1086 }
d62a17ae 1087 gs.period = 1000 * period;
1088 } break;
a6810074
DL
1089 case 'p':
1090 pidfile = optarg;
1091 break;
1092 case 'r':
a6810074
DL
1093 if (!valid_command(optarg)) {
1094 fprintf(stderr,
1095 "Invalid restart command, must contain '%%s': %s\n",
1096 optarg);
4f04a76b 1097 frr_help_exit(1);
a6810074
DL
1098 }
1099 gs.restart_command = optarg;
a6810074
DL
1100 break;
1101 case 's':
1102 if (!valid_command(optarg)) {
1103 fprintf(stderr,
1104 "Invalid start command, must contain '%%s': %s\n",
1105 optarg);
4f04a76b 1106 frr_help_exit(1);
a6810074
DL
1107 }
1108 gs.start_command = optarg;
1109 break;
1110 case 'S':
1111 gs.vtydir = optarg;
1112 break;
d62a17ae 1113 case 't': {
1114 char garbage[3];
1115 if ((sscanf(optarg, "%ld%1s", &gs.timeout, garbage)
1116 != 1)
1117 || (gs.timeout < 1)) {
1118 fprintf(stderr,
1119 "Invalid timeout argument: %s\n",
1120 optarg);
1121 frr_help_exit(1);
a6810074 1122 }
d62a17ae 1123 } break;
1124 case 'T': {
1125 char garbage[3];
1126 if ((sscanf(optarg, "%ld%1s", &gs.restart_timeout,
1127 garbage)
1128 != 1)
1129 || (gs.restart_timeout < 1)) {
1130 fprintf(stderr,
1131 "Invalid restart timeout argument: %s\n",
1132 optarg);
1133 frr_help_exit(1);
a6810074 1134 }
d62a17ae 1135 } break;
a6810074
DL
1136 default:
1137 fputs("Invalid option.\n", stderr);
4f04a76b 1138 frr_help_exit(1);
a6810074 1139 }
8b886ca7 1140 }
a6810074 1141
71e7975a
DL
1142 if (watch_only
1143 && (gs.start_command || gs.stop_command || gs.restart_command)) {
d87ae5cc 1144 fputs("Options -r/-s/-k are not used when --dry is active.\n",
a6810074 1145 stderr);
8b886ca7 1146 }
f168b713
DL
1147 if (!watch_only
1148 && (!gs.restart_command || !gs.start_command || !gs.stop_command)) {
1149 fprintf(stderr,
1150 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1151 frr_help_exit(1);
8b886ca7 1152 }
8b886ca7 1153
a6810074
DL
1154 if (blankstr) {
1155 if (gs.restart_command)
1156 gs.restart_command =
d62a17ae 1157 translate_blanks(gs.restart_command, blankstr);
a6810074
DL
1158 if (gs.start_command)
1159 gs.start_command =
d62a17ae 1160 translate_blanks(gs.start_command, blankstr);
a6810074
DL
1161 if (gs.stop_command)
1162 gs.stop_command =
d62a17ae 1163 translate_blanks(gs.stop_command, blankstr);
065de903 1164 }
8b886ca7 1165
a6810074 1166 gs.restart.interval = gs.min_restart_interval;
8b886ca7 1167
4f04a76b 1168 master = frr_init();
b647dc2a 1169 watchfrr_error_init();
4f04a76b 1170
dd8376fe 1171 zlog_set_level(ZLOG_DEST_MONITOR, ZLOG_DISABLED);
eb05883f 1172 if (watchfrr_di.daemon_mode) {
dd8376fe 1173 zlog_set_level(ZLOG_DEST_SYSLOG, MIN(gs.loglevel, LOG_DEBUG));
d62a17ae 1174 if (daemon(0, 0) < 0) {
2f4f11fa 1175 fprintf(stderr, "Watchfrr daemon failed: %s",
d62a17ae 1176 strerror(errno));
1177 exit(1);
4f04a76b
DL
1178 }
1179 } else
dd8376fe 1180 zlog_set_level(ZLOG_DEST_STDOUT, MIN(gs.loglevel, LOG_DEBUG));
8b886ca7 1181
a6810074 1182 watchfrr_vty_init();
8b886ca7 1183
eb05883f 1184 frr_vty_serv();
8b886ca7 1185
8b886ca7 1186 {
a6810074
DL
1187 int i;
1188 struct daemon *tail = NULL;
1189
1190 for (i = optind; i < argc; i++) {
1191 struct daemon *dmn;
1192
1193 if (!(dmn = (struct daemon *)calloc(1, sizeof(*dmn)))) {
1194 fprintf(stderr, "calloc(1,%u) failed: %s\n",
d7c0a89a 1195 (unsigned int)sizeof(*dmn),
a6810074
DL
1196 safe_strerror(errno));
1197 return 1;
1198 }
1199 dmn->name = dmn->restart.name = argv[i];
1200 dmn->state = DAEMON_INIT;
1201 gs.numdaemons++;
1202 gs.numdown++;
1203 dmn->fd = -1;
66e78ae6 1204 dmn->t_wakeup = NULL;
d62a17ae 1205 thread_add_timer_msec(master, wakeup_init, dmn,
1206 100 + (random() % 900),
66e78ae6 1207 &dmn->t_wakeup);
a6810074
DL
1208 dmn->restart.interval = gs.min_restart_interval;
1209 if (tail)
1210 tail->next = dmn;
1211 else
1212 gs.daemons = dmn;
1213 tail = dmn;
1214
f168b713 1215 if (!strcmp(dmn->name, special))
a6810074
DL
1216 gs.special = dmn;
1217 }
1218 }
1219 if (!gs.daemons) {
1220 fputs("Must specify one or more daemons to monitor.\n", stderr);
4f04a76b 1221 frr_help_exit(1);
a6810074 1222 }
f168b713
DL
1223 if (!watch_only && !gs.special) {
1224 fprintf(stderr, "\"%s\" daemon must be in daemon list\n",
1225 special);
4f04a76b 1226 frr_help_exit(1);
8b886ca7 1227 }
8b886ca7 1228
a6810074
DL
1229 /* Make sure we're not already running. */
1230 pid_output(pidfile);
1231
1232 /* Announce which daemons are being monitored. */
1233 {
1234 struct daemon *dmn;
1235 size_t len = 0;
1236
1237 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1238 len += strlen(dmn->name) + 1;
1239
1240 {
1241 char buf[len + 1];
1242 char *p = buf;
1243
1244 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1245 if (p != buf)
1246 *p++ = ' ';
1247 strcpy(p, dmn->name);
1248 p += strlen(p);
1249 }
f168b713
DL
1250 zlog_notice("%s %s watching [%s]%s", progname,
1251 FRR_VERSION, buf,
1252 watch_only ? ", monitor mode" : "");
a6810074
DL
1253 }
1254 }
8b886ca7 1255
a6810074
DL
1256 {
1257 struct thread thread;
1258
1259 while (thread_fetch(master, &thread))
1260 thread_call(&thread);
1261 }
8b886ca7 1262
a6810074
DL
1263 systemd_send_stopping();
1264 /* Not reached. */
1265 return 0;
8b886ca7 1266}