]> git.proxmox.com Git - mirror_frr.git/blame - watchfrr/watchfrr.c
*: rename ferr_ref -> log_ref
[mirror_frr.git] / watchfrr / watchfrr.c
CommitLineData
8b886ca7 1/*
896014f4
DL
2 * Monitor status of frr daemons and restart if necessary.
3 *
4 * Copyright (C) 2004 Andrew J. Schorr
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
8b886ca7 19 */
20
a365534f 21#include <zebra.h>
8b886ca7 22#include <thread.h>
23#include <log.h>
52e66296 24#include <network.h>
8b886ca7 25#include <sigevent.h>
a365534f 26#include <lib/version.h>
95c4aff2 27#include "command.h"
87f44e2f 28#include "memory_vty.h"
4f04a76b 29#include "libfrr.h"
b647dc2a 30#include "lib_errors.h"
95c4aff2 31
6f594023 32#include <getopt.h>
a365534f 33#include <sys/un.h>
34#include <sys/wait.h>
837d16cc 35#include <memory.h>
651415bd 36#include <systemd.h>
8b886ca7 37
9473e340 38#include "watchfrr.h"
b647dc2a 39#include "watchfrr_errors.h"
95c4aff2 40
/* Smaller of two values.  Each argument may be evaluated more than once. */
#ifndef MIN
#define MIN(X, Y) (((X) <= (Y)) ? (X) : (Y))
#endif

/*
 * Macros to help randomize timers.
 * JITTER(X) is (random() % (X + 1)) - X / 2, and FUZZY(X) is X plus
 * JITTER(X / 20).
 */
#define JITTER(X) ((random() % ((X) + 1)) - ((X) / 2))
#define FUZZY(X) ((X) + JITTER((X) / 20))

/* Built-in defaults; the help text reports the time values in seconds. */
#define DEFAULT_PERIOD 5
#define DEFAULT_TIMEOUT 90
#define DEFAULT_RESTART_TIMEOUT 20
#define DEFAULT_LOGLEVEL LOG_INFO
#define DEFAULT_MIN_RESTART 60
#define DEFAULT_MAX_RESTART 600

/* Token sent with the "echo" command; the daemon is expected to echo it. */
#define PING_TOKEN "PING"
57
/* Needs to be global, referenced somewhere inside libfrr. */
struct thread_master *master;

/* Default pid file path; main() fills it in from frr_vtydir. */
static char pidfile_default[256];

/* Set by --dry: try_restart() then returns without launching any job. */
static bool watch_only = false;
8b886ca7 63
a6810074
DL
/* Steps of a phased restart; phase_check() advances through them in order. */
typedef enum {
	/* No phased restart in progress. */
	PHASE_NONE = 0,
	/* Waiting until no child jobs remain (gs.numpids == 0). */
	PHASE_STOPS_PENDING,
	/* Waiting until the daemons are seen to be down. */
	PHASE_WAITING_DOWN,
	/* Waiting for the restart job of the special daemon to finish. */
	PHASE_ZEBRA_RESTART_PENDING,
	/* Waiting for the special daemon to be up again. */
	PHASE_WAITING_ZEBRA_UP
} restart_phase_t;
71
a6810074
DL
/*
 * Human-readable phase names, indexed by restart_phase_t.  There must be
 * exactly one entry per enumerator, in enumerator order.
 */
static const char *phase_str[] = {
	"None",
	"Stop jobs running",
	"Waiting for other daemons to come down",
	"Zebra restart job running",
	"Waiting for zebra to come up",
};
80
/* Longest a single restart phase may last: three times the restart timeout. */
#define PHASE_TIMEOUT (3 * gs.restart_timeout)
82
a6810074
DL
/* Book-keeping for one background shell job (start, stop or restart). */
struct restart_info {
	/* Name substituted into the command ("all" for the global job). */
	const char *name;
	/* Kind of command last launched; set by run_job() from its cmdtype. */
	const char *what;
	/* Pid of the running job, or 0 when no job is outstanding. */
	pid_t pid;
	/* When the job was last launched or, once reaped, when it completed. */
	struct timeval time;
	/* Minimum number of seconds between two launches of this job. */
	long interval;
	/* Timer that signals the job if it is still running (restart_kill). */
	struct thread *t_kill;
	/* Number of signals restart_kill() has sent to the current job. */
	int kills;
};
92
a6810074 93static struct global_state {
a6810074
DL
94 restart_phase_t phase;
95 struct thread *t_phase_hanging;
96 const char *vtydir;
97 long period;
98 long timeout;
99 long restart_timeout;
100 long min_restart_interval;
101 long max_restart_interval;
a6810074
DL
102 struct daemon *daemons;
103 const char *restart_command;
104 const char *start_command;
105 const char *stop_command;
106 struct restart_info restart;
a6810074 107 int loglevel;
d62a17ae 108 struct daemon *special; /* points to zebra when doing phased restart */
a6810074
DL
109 int numdaemons;
110 int numpids;
d62a17ae 111 int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
8b886ca7 112} gs = {
d62a17ae 113 .phase = PHASE_NONE,
64a249ad 114 .vtydir = frr_vtydir,
d62a17ae 115 .period = 1000 * DEFAULT_PERIOD,
116 .timeout = DEFAULT_TIMEOUT,
117 .restart_timeout = DEFAULT_RESTART_TIMEOUT,
118 .loglevel = DEFAULT_LOGLEVEL,
119 .min_restart_interval = DEFAULT_MIN_RESTART,
120 .max_restart_interval = DEFAULT_MAX_RESTART,
d62a17ae 121};
a6810074
DL
122
/* Connection state of a monitored daemon; state_str[] holds the names. */
typedef enum {
	DAEMON_INIT,
	DAEMON_DOWN,
	/* A non-blocking connect() is still in progress. */
	DAEMON_CONNECTING,
	DAEMON_UP,
	/* Connected, but an echo request was not answered within the timeout. */
	DAEMON_UNRESPONSIVE
} daemon_state_t;
130
/*
 * 1 if the daemon holds an established connection (up or unresponsive),
 * otherwise 0.  DMN is evaluated more than once.
 */
#define IS_UP(DMN)                                                             \
	((DMN)->state == DAEMON_UP || (DMN)->state == DAEMON_UNRESPONSIVE)
8b886ca7 133
/* Human-readable daemon state names, indexed by daemon_state_t. */
static const char *state_str[] = {
	"Init",
	"Down",
	"Connecting",
	"Up",
	"Unresponsive",
};
137
138struct daemon {
a6810074
DL
139 const char *name;
140 daemon_state_t state;
141 int fd;
142 struct timeval echo_sent;
d7c0a89a 143 unsigned int connect_tries;
a6810074
DL
144 struct thread *t_wakeup;
145 struct thread *t_read;
146 struct thread *t_write;
147 struct daemon *next;
148 struct restart_info restart;
8b886ca7 149};
150
9272302b
DL
/* getopt return values for the long options that have no short form. */
#define OPTION_MINRESTART 2000
#define OPTION_MAXRESTART 2001
#define OPTION_DRY 2002
9272302b 154
a6810074
DL
155static const struct option longopts[] = {
156 {"daemon", no_argument, NULL, 'd'},
157 {"statedir", required_argument, NULL, 'S'},
a6810074
DL
158 {"loglevel", required_argument, NULL, 'l'},
159 {"interval", required_argument, NULL, 'i'},
160 {"timeout", required_argument, NULL, 't'},
161 {"restart-timeout", required_argument, NULL, 'T'},
162 {"restart", required_argument, NULL, 'r'},
163 {"start-command", required_argument, NULL, 's'},
164 {"kill-command", required_argument, NULL, 'k'},
f168b713 165 {"dry", no_argument, NULL, OPTION_DRY},
d62a17ae 166 {"min-restart-interval", required_argument, NULL, OPTION_MINRESTART},
167 {"max-restart-interval", required_argument, NULL, OPTION_MAXRESTART},
a6810074
DL
168 {"pid-file", required_argument, NULL, 'p'},
169 {"blank-string", required_argument, NULL, 'b'},
170 {"help", no_argument, NULL, 'h'},
171 {"version", no_argument, NULL, 'v'},
d62a17ae 172 {NULL, 0, NULL, 0}};
8b886ca7 173
/* Forward declarations for functions that are used before their definition. */
static void phase_check(void);
static void try_restart(struct daemon *dmn);
static int try_connect(struct daemon *dmn);
static int wakeup_send_echo(struct thread *t_wakeup);
178
4f04a76b
DL
179static const char *progname;
180static void printhelp(FILE *target)
8b886ca7 181{
d62a17ae 182 fprintf(target,
183 "Usage : %s [OPTION...] <daemon name> ...\n\n\
9473e340 184Watchdog program to monitor status of frr daemons and try to restart\n\
8b886ca7 185them if they are down or unresponsive. It determines whether a daemon is\n\
186up based on whether it can connect to the daemon's vty unix stream socket.\n\
187It then repeatedly sends echo commands over that socket to determine whether\n\
188the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
189on the socket connection and know immediately that the daemon is down.\n\n\
190The daemons to be monitored should be listed on the command line.\n\n\
8b886ca7 191In order to avoid attempting to restart the daemons in a fast loop,\n\
192the -m and -M options allow you to control the minimum delay between\n\
193restart commands. The minimum restart delay is recalculated each time\n\
194a restart is attempted: if the time since the last restart attempt exceeds\n\
195twice the -M value, then the restart delay is set to the -m value.\n\
d62a17ae 196Otherwise, the interval is doubled (but capped at the -M value).\n\n",
f168b713 197 progname);
e757c940 198
d62a17ae 199 fprintf(target,
200 "Options:\n\
8b886ca7 201-d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
202 to syslog instead of stdout.\n\
203-S, --statedir Set the vty socket directory (default is %s)\n\
8b886ca7 204-l, --loglevel Set the logging level (default is %d).\n\
205 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
206 but it can be set higher than %d if extra-verbose debugging\n\
207 messages are desired.\n\
9272302b 208 --min-restart-interval\n\
8b886ca7 209 Set the minimum seconds to wait between invocations of daemon\n\
210 restart commands (default is %d).\n\
9272302b 211 --max-restart-interval\n\
8b886ca7 212 Set the maximum seconds to wait between invocations of daemon\n\
213 restart commands (default is %d).\n\
214-i, --interval Set the status polling interval in seconds (default is %d)\n\
215-t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
216-T, --restart-timeout\n\
217 Set the restart (kill) timeout in seconds (default is %d).\n\
218 If any background jobs are still running after this much\n\
219 time has elapsed, they will be killed.\n\
220-r, --restart Supply a Bourne shell command to use to restart a single\n\
221 daemon. The command string should include '%%s' where the\n\
222 name of the daemon should be substituted.\n\
8b886ca7 223-s, --start-command\n\
224 Supply a Bourne shell to command to use to start a single\n\
225 daemon. The command string should include '%%s' where the\n\
226 name of the daemon should be substituted.\n\
227-k, --kill-command\n\
228 Supply a Bourne shell to command to use to stop a single\n\
229 daemon. The command string should include '%%s' where the\n\
230 name of the daemon should be substituted.\n\
f168b713 231 --dry Do not start or restart anything, just log.\n\
8b886ca7 232-p, --pid-file Set process identifier file name\n\
233 (default is %s).\n\
c8b40f86 234-b, --blank-string\n\
235 When the supplied argument string is found in any of the\n\
f168b713 236 various shell command arguments (-r, -s, or -k), replace\n\
c8b40f86 237 it with a space. This is an ugly hack to circumvent problems\n\
238 passing command-line arguments with embedded spaces.\n\
8b886ca7 239-v, --version Print program version\n\
d62a17ae 240-h, --help Display this help and exit\n",
64a249ad 241 frr_vtydir, DEFAULT_LOGLEVEL, LOG_EMERG, LOG_DEBUG, LOG_DEBUG,
d62a17ae 242 DEFAULT_MIN_RESTART, DEFAULT_MAX_RESTART, DEFAULT_PERIOD,
64a249ad 243 DEFAULT_TIMEOUT, DEFAULT_RESTART_TIMEOUT, pidfile_default);
8b886ca7 244}
245
a6810074 246static pid_t run_background(char *shell_cmd)
8b886ca7 247{
a6810074
DL
248 pid_t child;
249
250 switch (child = fork()) {
251 case -1:
af4c2728 252 flog_err(LIB_ERR_SYSTEM_CALL,
b647dc2a
DS
253 "fork failed, cannot run command [%s]: %s", shell_cmd,
254 safe_strerror(errno));
a6810074
DL
255 return -1;
256 case 0:
257 /* Child process. */
d62a17ae 258 /* Use separate process group so child processes can be killed
259 * easily. */
a6810074
DL
260 if (setpgid(0, 0) < 0)
261 zlog_warn("warning: setpgid(0,0) failed: %s",
262 safe_strerror(errno));
263 {
264 char shell[] = "sh";
265 char dashc[] = "-c";
d62a17ae 266 char *const argv[4] = {shell, dashc, shell_cmd, NULL};
a6810074 267 execv("/bin/sh", argv);
af4c2728 268 flog_err(LIB_ERR_SYSTEM_CALL,
b647dc2a
DS
269 "execv(/bin/sh -c '%s') failed: %s", shell_cmd,
270 safe_strerror(errno));
a6810074
DL
271 _exit(127);
272 }
273 default:
274 /* Parent process: we will reap the child later. */
af4c2728 275 flog_err(LIB_ERR_SYSTEM_CALL,
b647dc2a
DS
276 "Forked background command [pid %d]: %s", (int)child,
277 shell_cmd);
a6810074
DL
278 return child;
279 }
8b886ca7 280}
281
a6810074
DL
/*
 * Store in '*result' the time elapsed between '*start_time' and now.
 *
 * The microsecond field is brought back to a non-negative value by
 * borrowing whole seconds.  Returns 'result' so that the call can be used
 * inside an expression.
 */
static struct timeval *time_elapsed(struct timeval *result,
				    const struct timeval *start_time)
{
	gettimeofday(result, NULL);
	result->tv_sec = result->tv_sec - start_time->tv_sec;
	result->tv_usec = result->tv_usec - start_time->tv_usec;

	for (; result->tv_usec < 0; result->tv_sec--)
		result->tv_usec += 1000000L;

	return result;
}
294
a6810074 295static int restart_kill(struct thread *t_kill)
8b886ca7 296{
a6810074
DL
297 struct restart_info *restart = THREAD_ARG(t_kill);
298 struct timeval delay;
299
300 time_elapsed(&delay, &restart->time);
d62a17ae 301 zlog_warn(
302 "Warning: %s %s child process %d still running after "
303 "%ld seconds, sending signal %d",
304 restart->what, restart->name, (int)restart->pid,
305 (long)delay.tv_sec, (restart->kills ? SIGKILL : SIGTERM));
a6810074
DL
306 kill(-restart->pid, (restart->kills ? SIGKILL : SIGTERM));
307 restart->kills++;
66e78ae6
QY
308 restart->t_kill = NULL;
309 thread_add_timer(master, restart_kill, restart, gs.restart_timeout,
310 &restart->t_kill);
a6810074 311 return 0;
8b886ca7 312}
313
a6810074 314static struct restart_info *find_child(pid_t child)
8b886ca7 315{
f168b713
DL
316 struct daemon *dmn;
317 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
318 if (dmn->restart.pid == child)
319 return &dmn->restart;
a6810074
DL
320 }
321 return NULL;
8b886ca7 322}
323
a6810074 324static void sigchild(void)
8b886ca7 325{
a6810074
DL
326 pid_t child;
327 int status;
328 const char *name;
329 const char *what;
330 struct restart_info *restart;
331
332 switch (child = waitpid(-1, &status, WNOHANG)) {
333 case -1:
af4c2728 334 flog_err(LIB_ERR_SYSTEM_CALL,
b647dc2a 335 "waitpid failed: %s", safe_strerror(errno));
a6810074
DL
336 return;
337 case 0:
338 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
339 return;
340 }
341
342 if (child == integrated_write_pid) {
343 integrated_write_sigchld(status);
344 return;
345 }
346
347 if ((restart = find_child(child)) != NULL) {
348 name = restart->name;
349 what = restart->what;
350 restart->pid = 0;
351 gs.numpids--;
352 thread_cancel(restart->t_kill);
353 restart->t_kill = NULL;
d62a17ae 354 /* Update restart time to reflect the time the command
355 * completed. */
a6810074
DL
356 gettimeofday(&restart->time, NULL);
357 } else {
af4c2728 358 flog_err(LIB_ERR_SYSTEM_CALL,
b647dc2a
DS
359 "waitpid returned status for an unknown child process %d",
360 (int)child);
a6810074
DL
361 name = "(unknown)";
362 what = "background";
363 }
364 if (WIFSTOPPED(status))
d62a17ae 365 zlog_warn("warning: %s %s process %d is stopped", what, name,
366 (int)child);
a6810074 367 else if (WIFSIGNALED(status))
d62a17ae 368 zlog_warn("%s %s process %d terminated due to signal %d", what,
369 name, (int)child, WTERMSIG(status));
a6810074
DL
370 else if (WIFEXITED(status)) {
371 if (WEXITSTATUS(status) != 0)
d62a17ae 372 zlog_warn(
373 "%s %s process %d exited with non-zero status %d",
374 what, name, (int)child, WEXITSTATUS(status));
a6810074
DL
375 else
376 zlog_debug("%s %s process %d exited normally", what,
377 name, (int)child);
378 } else
af4c2728 379 flog_err(LIB_ERR_SYSTEM_CALL,
b647dc2a
DS
380 "cannot interpret %s %s process %d wait status 0x%x",
381 what, name, (int)child, status);
a6810074 382 phase_check();
8b886ca7 383}
384
d62a17ae 385static int run_job(struct restart_info *restart, const char *cmdtype,
386 const char *command, int force, int update_interval)
8b886ca7 387{
a6810074
DL
388 struct timeval delay;
389
390 if (gs.loglevel > LOG_DEBUG + 1)
391 zlog_debug("attempting to %s %s", cmdtype, restart->name);
392
393 if (restart->pid) {
394 if (gs.loglevel > LOG_DEBUG + 1)
d62a17ae 395 zlog_debug(
396 "cannot %s %s, previous pid %d still running",
397 cmdtype, restart->name, (int)restart->pid);
a6810074
DL
398 return -1;
399 }
400
d62a17ae 401 /* Note: time_elapsed test must come before the force test, since we
402 need
a6810074
DL
403 to make sure that delay is initialized for use below in updating the
404 restart interval. */
405 if ((time_elapsed(&delay, &restart->time)->tv_sec < restart->interval)
406 && !force) {
407 if (gs.loglevel > LOG_DEBUG + 1)
d62a17ae 408 zlog_debug(
409 "postponing %s %s: "
410 "elapsed time %ld < retry interval %ld",
411 cmdtype, restart->name, (long)delay.tv_sec,
412 restart->interval);
a6810074
DL
413 return -1;
414 }
415
416 gettimeofday(&restart->time, NULL);
417 restart->kills = 0;
418 {
419 char cmd[strlen(command) + strlen(restart->name) + 1];
420 snprintf(cmd, sizeof(cmd), command, restart->name);
421 if ((restart->pid = run_background(cmd)) > 0) {
66e78ae6 422 restart->t_kill = NULL;
d62a17ae 423 thread_add_timer(master, restart_kill, restart,
424 gs.restart_timeout, &restart->t_kill);
a6810074
DL
425 restart->what = cmdtype;
426 gs.numpids++;
427 } else
428 restart->pid = 0;
429 }
430
431 /* Calculate the new restart interval. */
432 if (update_interval) {
433 if (delay.tv_sec > 2 * gs.max_restart_interval)
434 restart->interval = gs.min_restart_interval;
435 else if ((restart->interval *= 2) > gs.max_restart_interval)
436 restart->interval = gs.max_restart_interval;
437 if (gs.loglevel > LOG_DEBUG + 1)
438 zlog_debug("restart %s interval is now %ld",
439 restart->name, restart->interval);
440 }
441 return restart->pid;
8b886ca7 442}
443
/*
 * Helpers that (re)arm one of a daemon's events.  Each expands to a single
 * statement WITHOUT a trailing semicolon, so that "SET_xxx(dmn);" can be
 * used anywhere a statement can, including an unbraced if/else.
 * DMN is evaluated more than once.
 */

/* Wait for input on the daemon's socket; handle_read() processes it. */
#define SET_READ_HANDLER(DMN)                                                  \
	do {                                                                   \
		(DMN)->t_read = NULL;                                          \
		thread_add_read(master, handle_read, (DMN), (DMN)->fd,         \
				&(DMN)->t_read);                               \
	} while (0)

/* Retry the connection after roughly one polling period. */
#define SET_WAKEUP_DOWN(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_down, (DMN),              \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Re-check an unresponsive daemon after roughly one polling period. */
#define SET_WAKEUP_UNRESPONSIVE(DMN)                                           \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_unresponsive, (DMN),      \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)

/* Send the next echo request after roughly one polling period. */
#define SET_WAKEUP_ECHO(DMN)                                                   \
	do {                                                                   \
		(DMN)->t_wakeup = NULL;                                        \
		thread_add_timer_msec(master, wakeup_send_echo, (DMN),         \
				      FUZZY(gs.period), &(DMN)->t_wakeup);     \
	} while (0)
8b886ca7 471
a6810074 472static int wakeup_down(struct thread *t_wakeup)
8b886ca7 473{
a6810074
DL
474 struct daemon *dmn = THREAD_ARG(t_wakeup);
475
476 dmn->t_wakeup = NULL;
477 if (try_connect(dmn) < 0)
478 SET_WAKEUP_DOWN(dmn);
479 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
480 try_restart(dmn);
481 return 0;
8b886ca7 482}
483
a6810074 484static int wakeup_init(struct thread *t_wakeup)
8b886ca7 485{
a6810074
DL
486 struct daemon *dmn = THREAD_ARG(t_wakeup);
487
488 dmn->t_wakeup = NULL;
489 if (try_connect(dmn) < 0) {
490 SET_WAKEUP_DOWN(dmn);
af4c2728 491 flog_err(WATCHFRR_ERR_CONNECTION,
b647dc2a
DS
492 "%s state -> down : initial connection attempt failed",
493 dmn->name);
a6810074
DL
494 dmn->state = DAEMON_DOWN;
495 }
496 return 0;
8b886ca7 497}
498
a6810074 499static void daemon_down(struct daemon *dmn, const char *why)
8b886ca7 500{
a6810074 501 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
af4c2728 502 flog_err(WATCHFRR_ERR_CONNECTION,
b647dc2a 503 "%s state -> down : %s", dmn->name, why);
a6810074
DL
504 else if (gs.loglevel > LOG_DEBUG)
505 zlog_debug("%s still down : %s", dmn->name, why);
506 if (IS_UP(dmn))
507 gs.numdown++;
508 dmn->state = DAEMON_DOWN;
509 if (dmn->fd >= 0) {
510 close(dmn->fd);
511 dmn->fd = -1;
512 }
513 THREAD_OFF(dmn->t_read);
514 THREAD_OFF(dmn->t_write);
515 THREAD_OFF(dmn->t_wakeup);
516 if (try_connect(dmn) < 0)
517 SET_WAKEUP_DOWN(dmn);
518 phase_check();
8b886ca7 519}
520
a6810074 521static int handle_read(struct thread *t_read)
8b886ca7 522{
a6810074
DL
523 struct daemon *dmn = THREAD_ARG(t_read);
524 static const char resp[sizeof(PING_TOKEN) + 4] = PING_TOKEN "\n";
525 char buf[sizeof(resp) + 100];
526 ssize_t rc;
527 struct timeval delay;
528
529 dmn->t_read = NULL;
530 if ((rc = read(dmn->fd, buf, sizeof(buf))) < 0) {
531 char why[100];
532
533 if (ERRNO_IO_RETRY(errno)) {
534 /* Pretend it never happened. */
535 SET_READ_HANDLER(dmn);
536 return 0;
537 }
538 snprintf(why, sizeof(why), "unexpected read error: %s",
539 safe_strerror(errno));
540 daemon_down(dmn, why);
541 return 0;
8b886ca7 542 }
a6810074
DL
543 if (rc == 0) {
544 daemon_down(dmn, "read returned EOF");
545 return 0;
546 }
547 if (!dmn->echo_sent.tv_sec) {
548 char why[sizeof(buf) + 100];
549 snprintf(why, sizeof(why),
550 "unexpected read returns %d bytes: %.*s", (int)rc,
551 (int)rc, buf);
552 daemon_down(dmn, why);
553 return 0;
8b886ca7 554 }
a6810074
DL
555
556 /* We are expecting an echo response: is there any chance that the
557 response would not be returned entirely in the first read? That
558 seems inconceivable... */
559 if ((rc != sizeof(resp)) || memcmp(buf, resp, sizeof(resp))) {
560 char why[100 + sizeof(buf)];
561 snprintf(why, sizeof(why),
562 "read returned bad echo response of %d bytes "
d62a17ae 563 "(expecting %u): %.*s",
d7c0a89a 564 (int)rc, (unsigned int)sizeof(resp), (int)rc, buf);
a6810074
DL
565 daemon_down(dmn, why);
566 return 0;
567 }
568
569 time_elapsed(&delay, &dmn->echo_sent);
570 dmn->echo_sent.tv_sec = 0;
571 if (dmn->state == DAEMON_UNRESPONSIVE) {
572 if (delay.tv_sec < gs.timeout) {
573 dmn->state = DAEMON_UP;
d62a17ae 574 zlog_warn(
575 "%s state -> up : echo response received after %ld.%06ld "
576 "seconds",
577 dmn->name, (long)delay.tv_sec,
578 (long)delay.tv_usec);
a6810074 579 } else
d62a17ae 580 zlog_warn(
581 "%s: slow echo response finally received after %ld.%06ld "
582 "seconds",
583 dmn->name, (long)delay.tv_sec,
584 (long)delay.tv_usec);
a6810074
DL
585 } else if (gs.loglevel > LOG_DEBUG + 1)
586 zlog_debug("%s: echo response received after %ld.%06ld seconds",
587 dmn->name, (long)delay.tv_sec, (long)delay.tv_usec);
588
589 SET_READ_HANDLER(dmn);
590 if (dmn->t_wakeup)
591 thread_cancel(dmn->t_wakeup);
592 SET_WAKEUP_ECHO(dmn);
593
594 return 0;
8b886ca7 595}
596
207e0d7a
DS
597/*
598 * Wait till we notice that all daemons are ready before
599 * we send we are ready to systemd
600 */
a6810074 601static void daemon_send_ready(void)
207e0d7a 602{
a6810074
DL
603 static int sent = 0;
604 if (!sent && gs.numdown == 0) {
a6810074 605 FILE *fp;
207e0d7a 606
a6810074 607 fp = fopen(DAEMON_VTY_DIR "/watchfrr.started", "w");
f5ba21fc
DS
608 if (fp)
609 fclose(fp);
60bd2534 610#if defined HAVE_SYSTEMD
d62a17ae 611 zlog_notice(
612 "Watchfrr: Notifying Systemd we are up and running");
a6810074 613 systemd_send_started(master, 0);
60bd2534 614#endif
a6810074
DL
615 sent = 1;
616 }
207e0d7a
DS
617}
618
a6810074 619static void daemon_up(struct daemon *dmn, const char *why)
8b886ca7 620{
a6810074
DL
621 dmn->state = DAEMON_UP;
622 gs.numdown--;
623 dmn->connect_tries = 0;
624 zlog_notice("%s state -> up : %s", dmn->name, why);
625 daemon_send_ready();
a8cbb8b3 626 SET_WAKEUP_ECHO(dmn);
a6810074 627 phase_check();
8b886ca7 628}
629
a6810074 630static int check_connect(struct thread *t_write)
8b886ca7 631{
a6810074
DL
632 struct daemon *dmn = THREAD_ARG(t_write);
633 int sockerr;
634 socklen_t reslen = sizeof(sockerr);
635
636 dmn->t_write = NULL;
637 if (getsockopt(dmn->fd, SOL_SOCKET, SO_ERROR, (char *)&sockerr, &reslen)
638 < 0) {
639 zlog_warn("%s: check_connect: getsockopt failed: %s", dmn->name,
640 safe_strerror(errno));
641 daemon_down(dmn,
642 "getsockopt failed checking connection success");
643 return 0;
644 }
645 if ((reslen == sizeof(sockerr)) && sockerr) {
646 char why[100];
d62a17ae 647 snprintf(
648 why, sizeof(why),
649 "getsockopt reports that connection attempt failed: %s",
650 safe_strerror(sockerr));
a6810074
DL
651 daemon_down(dmn, why);
652 return 0;
653 }
654
655 daemon_up(dmn, "delayed connect succeeded");
656 return 0;
8b886ca7 657}
658
a6810074 659static int wakeup_connect_hanging(struct thread *t_wakeup)
8b886ca7 660{
a6810074
DL
661 struct daemon *dmn = THREAD_ARG(t_wakeup);
662 char why[100];
663
664 dmn->t_wakeup = NULL;
665 snprintf(why, sizeof(why),
666 "connection attempt timed out after %ld seconds", gs.timeout);
667 daemon_down(dmn, why);
668 return 0;
8b886ca7 669}
670
671/* Making connection to protocol daemon. */
a6810074 672static int try_connect(struct daemon *dmn)
8b886ca7 673{
a6810074
DL
674 int sock;
675 struct sockaddr_un addr;
676 socklen_t len;
677
678 if (gs.loglevel > LOG_DEBUG + 1)
679 zlog_debug("%s: attempting to connect", dmn->name);
680 dmn->connect_tries++;
681
682 memset(&addr, 0, sizeof(struct sockaddr_un));
683 addr.sun_family = AF_UNIX;
d62a17ae 684 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty", gs.vtydir,
685 dmn->name);
6f0e3f6e 686#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
a6810074 687 len = addr.sun_len = SUN_LEN(&addr);
8b886ca7 688#else
a6810074 689 len = sizeof(addr.sun_family) + strlen(addr.sun_path);
d62a17ae 690#endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
a6810074
DL
691
692 /* Quick check to see if we might succeed before we go to the trouble
693 of creating a socket. */
694 if (access(addr.sun_path, W_OK) < 0) {
695 if (errno != ENOENT)
af4c2728 696 flog_err(LIB_ERR_SYSTEM_CALL,
b647dc2a
DS
697 "%s: access to socket %s denied: %s",
698 dmn->name, addr.sun_path,
699 safe_strerror(errno));
a6810074
DL
700 return -1;
701 }
702
703 if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
af4c2728 704 flog_err(LIB_ERR_SOCKET,
b647dc2a
DS
705 "%s(%s): cannot make socket: %s", __func__,
706 addr.sun_path, safe_strerror(errno));
a6810074
DL
707 return -1;
708 }
709
710 if (set_nonblocking(sock) < 0 || set_cloexec(sock) < 0) {
af4c2728 711 flog_err(LIB_ERR_SYSTEM_CALL,
b647dc2a
DS
712 "%s(%s): set_nonblocking/cloexec(%d) failed",
713 __func__, addr.sun_path, sock);
a6810074
DL
714 close(sock);
715 return -1;
8b886ca7 716 }
a6810074
DL
717
718 if (connect(sock, (struct sockaddr *)&addr, len) < 0) {
719 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK)) {
720 if (gs.loglevel > LOG_DEBUG)
721 zlog_debug("%s(%s): connect failed: %s",
722 __func__, addr.sun_path,
723 safe_strerror(errno));
724 close(sock);
725 return -1;
726 }
727 if (gs.loglevel > LOG_DEBUG)
728 zlog_debug("%s: connection in progress", dmn->name);
729 dmn->state = DAEMON_CONNECTING;
730 dmn->fd = sock;
66e78ae6
QY
731 dmn->t_write = NULL;
732 thread_add_write(master, check_connect, dmn, dmn->fd,
d62a17ae 733 &dmn->t_write);
734 dmn->t_wakeup = NULL;
735 thread_add_timer(master, wakeup_connect_hanging, dmn,
736 gs.timeout, &dmn->t_wakeup);
a6810074
DL
737 SET_READ_HANDLER(dmn);
738 return 0;
739 }
740
741 dmn->fd = sock;
742 SET_READ_HANDLER(dmn);
743 daemon_up(dmn, "connect succeeded");
744 return 1;
8b886ca7 745}
746
a6810074 747static int phase_hanging(struct thread *t_hanging)
8b886ca7 748{
a6810074 749 gs.t_phase_hanging = NULL;
af4c2728 750 flog_err(WATCHFRR_ERR_CONNECTION,
b647dc2a
DS
751 "Phase [%s] hanging for %ld seconds, aborting phased restart",
752 phase_str[gs.phase], PHASE_TIMEOUT);
a6810074
DL
753 gs.phase = PHASE_NONE;
754 return 0;
8b886ca7 755}
756
a6810074 757static void set_phase(restart_phase_t new_phase)
8b886ca7 758{
a6810074
DL
759 gs.phase = new_phase;
760 if (gs.t_phase_hanging)
761 thread_cancel(gs.t_phase_hanging);
66e78ae6
QY
762 gs.t_phase_hanging = NULL;
763 thread_add_timer(master, phase_hanging, NULL, PHASE_TIMEOUT,
764 &gs.t_phase_hanging);
8b886ca7 765}
766
a6810074 767static void phase_check(void)
8b886ca7 768{
a6810074
DL
769 switch (gs.phase) {
770 case PHASE_NONE:
771 break;
772 case PHASE_STOPS_PENDING:
773 if (gs.numpids)
774 break;
d62a17ae 775 zlog_info(
776 "Phased restart: all routing daemon stop jobs have completed.");
a6810074
DL
777 set_phase(PHASE_WAITING_DOWN);
778
d62a17ae 779 /*FALLTHRU*/
a6810074
DL
780 case PHASE_WAITING_DOWN:
781 if (gs.numdown + IS_UP(gs.special) < gs.numdaemons)
782 break;
783 zlog_info("Phased restart: all routing daemons now down.");
784 run_job(&gs.special->restart, "restart", gs.restart_command, 1,
785 1);
786 set_phase(PHASE_ZEBRA_RESTART_PENDING);
787
d62a17ae 788 /*FALLTHRU*/
a6810074
DL
789 case PHASE_ZEBRA_RESTART_PENDING:
790 if (gs.special->restart.pid)
791 break;
792 zlog_info("Phased restart: %s restart job completed.",
793 gs.special->name);
794 set_phase(PHASE_WAITING_ZEBRA_UP);
795
d62a17ae 796 /*FALLTHRU*/
a6810074
DL
797 case PHASE_WAITING_ZEBRA_UP:
798 if (!IS_UP(gs.special))
799 break;
800 zlog_info("Phased restart: %s is now up.", gs.special->name);
801 {
802 struct daemon *dmn;
803 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
804 if (dmn != gs.special)
805 run_job(&dmn->restart, "start",
806 gs.start_command, 1, 0);
807 }
808 }
809 gs.phase = PHASE_NONE;
810 THREAD_OFF(gs.t_phase_hanging);
811 zlog_notice("Phased global restart has completed.");
812 break;
813 }
8b886ca7 814}
815
a6810074 816static void try_restart(struct daemon *dmn)
8b886ca7 817{
f168b713 818 if (watch_only)
a6810074 819 return;
a6810074 820
f168b713
DL
821 if (dmn != gs.special) {
822 if ((gs.special->state == DAEMON_UP)
823 && (gs.phase == PHASE_NONE))
824 run_job(&dmn->restart, "restart", gs.restart_command, 0,
825 1);
826 else
827 zlog_debug(
828 "%s: postponing restart attempt because master %s daemon "
829 "not up [%s], or phased restart in progress",
830 dmn->name, gs.special->name,
831 state_str[gs.special->state]);
832 return;
833 }
834
835 if ((gs.phase != PHASE_NONE) || gs.numpids) {
836 if (gs.loglevel > LOG_DEBUG + 1)
837 zlog_debug(
838 "postponing phased global restart: restart already in "
839 "progress [%s], or outstanding child processes [%d]",
840 phase_str[gs.phase], gs.numpids);
841 return;
842 }
843 /* Is it too soon for a restart? */
844 {
845 struct timeval delay;
846 if (time_elapsed(&delay, &gs.special->restart.time)->tv_sec
847 < gs.special->restart.interval) {
a6810074 848 if (gs.loglevel > LOG_DEBUG + 1)
d62a17ae 849 zlog_debug(
f168b713
DL
850 "postponing phased global restart: "
851 "elapsed time %ld < retry interval %ld",
852 (long)delay.tv_sec,
853 gs.special->restart.interval);
854 return;
a6810074 855 }
8b886ca7 856 }
f168b713 857 run_job(&gs.restart, "restart", gs.restart_command, 0, 1);
8b886ca7 858}
859
a6810074 860static int wakeup_unresponsive(struct thread *t_wakeup)
8b886ca7 861{
a6810074
DL
862 struct daemon *dmn = THREAD_ARG(t_wakeup);
863
864 dmn->t_wakeup = NULL;
865 if (dmn->state != DAEMON_UNRESPONSIVE)
af4c2728 866 flog_err(WATCHFRR_ERR_CONNECTION,
b647dc2a
DS
867 "%s: no longer unresponsive (now %s), "
868 "wakeup should have been cancelled!",
869 dmn->name, state_str[dmn->state]);
a6810074
DL
870 else {
871 SET_WAKEUP_UNRESPONSIVE(dmn);
872 try_restart(dmn);
873 }
874 return 0;
8b886ca7 875}
876
a6810074 877static int wakeup_no_answer(struct thread *t_wakeup)
8b886ca7 878{
a6810074
DL
879 struct daemon *dmn = THREAD_ARG(t_wakeup);
880
881 dmn->t_wakeup = NULL;
882 dmn->state = DAEMON_UNRESPONSIVE;
af4c2728 883 flog_err(WATCHFRR_ERR_CONNECTION,
b647dc2a
DS
884 "%s state -> unresponsive : no response yet to ping "
885 "sent %ld seconds ago",
886 dmn->name, gs.timeout);
71e7975a
DL
887 SET_WAKEUP_UNRESPONSIVE(dmn);
888 try_restart(dmn);
a6810074 889 return 0;
8b886ca7 890}
891
a6810074 892static int wakeup_send_echo(struct thread *t_wakeup)
8b886ca7 893{
a6810074
DL
894 static const char echocmd[] = "echo " PING_TOKEN;
895 ssize_t rc;
896 struct daemon *dmn = THREAD_ARG(t_wakeup);
897
898 dmn->t_wakeup = NULL;
d62a17ae 899 if (((rc = write(dmn->fd, echocmd, sizeof(echocmd))) < 0)
900 || ((size_t)rc != sizeof(echocmd))) {
a6810074
DL
901 char why[100 + sizeof(echocmd)];
902 snprintf(why, sizeof(why),
903 "write '%s' returned %d instead of %u", echocmd,
d7c0a89a 904 (int)rc, (unsigned int)sizeof(echocmd));
a6810074
DL
905 daemon_down(dmn, why);
906 } else {
907 gettimeofday(&dmn->echo_sent, NULL);
66e78ae6
QY
908 dmn->t_wakeup = NULL;
909 thread_add_timer(master, wakeup_no_answer, dmn, gs.timeout,
910 &dmn->t_wakeup);
a6810074
DL
911 }
912 return 0;
8b886ca7 913}
914
470bc619
QY
915bool check_all_up(void)
916{
917 struct daemon *dmn;
918
919 for (dmn = gs.daemons; dmn; dmn = dmn->next)
920 if (dmn->state != DAEMON_UP)
921 return false;
922 return true;
923}
924
/*
 * Handler for SIGINT and SIGTERM: log the termination, call
 * systemd_send_stopping() and exit with status 0.
 */
static void sigint(void)
{
	zlog_notice("Terminating on signal");
	systemd_send_stopping();

	exit(0);
}
931
/*
 * Check a shell command template: it must contain exactly one '%'
 * character and that character must be followed by 's'.
 * Returns 1 if the template is acceptable, 0 otherwise.
 */
static int valid_command(const char *cmd)
{
	const char *percent = strchr(cmd, '%');

	if (percent == NULL)
		return 0;
	if (percent[1] != 's')
		return 0;

	/* No further conversion may follow the "%s". */
	return strchr(percent + 1, '%') == NULL;
}
939
/*
 * This is an ugly hack to circumvent problems with passing command-line
 * arguments that contain spaces.  The fix is to use a configuration file.
 *
 * Returns a newly allocated copy of 'cmd' in which every occurrence of
 * 'blankstr' has been replaced by a single space; the caller owns the
 * result.  The process exits if the copy cannot be allocated.
 *
 * An empty 'blankstr', or one that is just a space, leaves the copy
 * unchanged.  Without that check an empty string would match at every
 * position (growing the text past the end of the buffer) and a lone space
 * would be replaced by itself forever.
 */
static char *translate_blanks(const char *cmd, const char *blankstr)
{
	char *res;
	char *p;
	size_t bslen = strlen(blankstr);

	if (!(res = strdup(cmd))) {
		perror("strdup");
		exit(1);
	}

	if (bslen == 0 || strcmp(blankstr, " ") == 0)
		return res;

	/* Every replacement either shortens the text or removes one
	 * non-space character, so the loop terminates. */
	while ((p = strstr(res, blankstr)) != NULL) {
		*p = ' ';
		if (bslen != 1)
			memmove(p + 1, p + bslen, strlen(p + bslen) + 1);
	}
	return res;
}
959
a6810074 960struct zebra_privs_t watchfrr_privs = {
95c4aff2 961#ifdef VTY_GROUP
a6810074 962 .vty_group = VTY_GROUP,
95c4aff2
DL
963#endif
964};
965
4f04a76b
DL
966static struct quagga_signal_t watchfrr_signals[] = {
967 {
968 .signal = SIGINT,
969 .handler = sigint,
970 },
971 {
972 .signal = SIGTERM,
973 .handler = sigint,
974 },
975 {
976 .signal = SIGCHLD,
977 .handler = sigchild,
978 },
979};
980
981FRR_DAEMON_INFO(watchfrr, WATCHFRR,
d62a17ae 982 .flags = FRR_NO_PRIVSEP | FRR_NO_TCPVTY | FRR_LIMITED_CLI
983 | FRR_NO_CFG_PID_DRY | FRR_NO_ZCLIENT,
4f04a76b 984
d62a17ae 985 .printhelp = printhelp,
986 .copyright = "Copyright 2004 Andrew J. Schorr",
4f04a76b 987
d62a17ae 988 .signals = watchfrr_signals,
989 .n_signals = array_size(watchfrr_signals),
4f04a76b 990
d62a17ae 991 .privs = &watchfrr_privs, )
4f04a76b 992
/*
 * getopt option letters that are still parsed, but only so that main() can
 * print a "no longer exists" message and exit when one of them is used.
 */
#define DEPRECATED_OPTIONS "aAezR:"
994
a6810074 995int main(int argc, char **argv)
8b886ca7 996{
a6810074 997 int opt;
64a249ad 998 const char *pidfile = pidfile_default;
a6810074
DL
999 const char *special = "zebra";
1000 const char *blankstr = NULL;
a6810074 1001
64a249ad
DL
1002 snprintf(pidfile_default, sizeof(pidfile_default), "%s/watchfrr.pid",
1003 frr_vtydir);
1004
4f04a76b
DL
1005 frr_preinit(&watchfrr_di, argc, argv);
1006 progname = watchfrr_di.progname;
1007
999f153e 1008 frr_opt_add("b:dk:l:i:p:r:S:s:t:T:" DEPRECATED_OPTIONS, longopts, "");
a6810074
DL
1009
1010 gs.restart.name = "all";
4f04a76b 1011 while ((opt = frr_getopt(argc, argv, NULL)) != EOF) {
999f153e
DL
1012 if (opt && opt < 128 && strchr(DEPRECATED_OPTIONS, opt)) {
1013 fprintf(stderr,
1014 "The -%c option no longer exists.\n"
1015 "Please refer to the watchfrr(8) man page.\n",
1016 opt);
1017 exit(1);
1018 }
1019
a6810074
DL
1020 switch (opt) {
1021 case 0:
1022 break;
a6810074
DL
1023 case 'b':
1024 blankstr = optarg;
1025 break;
f168b713
DL
1026 case OPTION_DRY:
1027 watch_only = true;
a6810074
DL
1028 break;
1029 case 'k':
1030 if (!valid_command(optarg)) {
1031 fprintf(stderr,
1032 "Invalid kill command, must contain '%%s': %s\n",
1033 optarg);
4f04a76b 1034 frr_help_exit(1);
a6810074
DL
1035 }
1036 gs.stop_command = optarg;
1037 break;
d62a17ae 1038 case 'l': {
1039 char garbage[3];
1040 if ((sscanf(optarg, "%d%1s", &gs.loglevel, garbage)
1041 != 1)
1042 || (gs.loglevel < LOG_EMERG)) {
1043 fprintf(stderr,
1044 "Invalid loglevel argument: %s\n",
1045 optarg);
1046 frr_help_exit(1);
a6810074 1047 }
d62a17ae 1048 } break;
1049 case OPTION_MINRESTART: {
1050 char garbage[3];
1051 if ((sscanf(optarg, "%ld%1s", &gs.min_restart_interval,
1052 garbage)
1053 != 1)
1054 || (gs.min_restart_interval < 0)) {
1055 fprintf(stderr,
1056 "Invalid min_restart_interval argument: %s\n",
1057 optarg);
1058 frr_help_exit(1);
a6810074 1059 }
d62a17ae 1060 } break;
1061 case OPTION_MAXRESTART: {
1062 char garbage[3];
1063 if ((sscanf(optarg, "%ld%1s", &gs.max_restart_interval,
1064 garbage)
1065 != 1)
1066 || (gs.max_restart_interval < 0)) {
1067 fprintf(stderr,
1068 "Invalid max_restart_interval argument: %s\n",
1069 optarg);
1070 frr_help_exit(1);
a6810074 1071 }
d62a17ae 1072 } break;
1073 case 'i': {
1074 char garbage[3];
1075 int period;
1076 if ((sscanf(optarg, "%d%1s", &period, garbage) != 1)
1077 || (gs.period < 1)) {
1078 fprintf(stderr,
1079 "Invalid interval argument: %s\n",
1080 optarg);
1081 frr_help_exit(1);
a6810074 1082 }
d62a17ae 1083 gs.period = 1000 * period;
1084 } break;
a6810074
DL
1085 case 'p':
1086 pidfile = optarg;
1087 break;
1088 case 'r':
a6810074
DL
1089 if (!valid_command(optarg)) {
1090 fprintf(stderr,
1091 "Invalid restart command, must contain '%%s': %s\n",
1092 optarg);
4f04a76b 1093 frr_help_exit(1);
a6810074
DL
1094 }
1095 gs.restart_command = optarg;
a6810074
DL
1096 break;
1097 case 's':
1098 if (!valid_command(optarg)) {
1099 fprintf(stderr,
1100 "Invalid start command, must contain '%%s': %s\n",
1101 optarg);
4f04a76b 1102 frr_help_exit(1);
a6810074
DL
1103 }
1104 gs.start_command = optarg;
1105 break;
1106 case 'S':
1107 gs.vtydir = optarg;
1108 break;
d62a17ae 1109 case 't': {
1110 char garbage[3];
1111 if ((sscanf(optarg, "%ld%1s", &gs.timeout, garbage)
1112 != 1)
1113 || (gs.timeout < 1)) {
1114 fprintf(stderr,
1115 "Invalid timeout argument: %s\n",
1116 optarg);
1117 frr_help_exit(1);
a6810074 1118 }
d62a17ae 1119 } break;
1120 case 'T': {
1121 char garbage[3];
1122 if ((sscanf(optarg, "%ld%1s", &gs.restart_timeout,
1123 garbage)
1124 != 1)
1125 || (gs.restart_timeout < 1)) {
1126 fprintf(stderr,
1127 "Invalid restart timeout argument: %s\n",
1128 optarg);
1129 frr_help_exit(1);
a6810074 1130 }
d62a17ae 1131 } break;
a6810074
DL
1132 default:
1133 fputs("Invalid option.\n", stderr);
4f04a76b 1134 frr_help_exit(1);
a6810074 1135 }
8b886ca7 1136 }
a6810074 1137
71e7975a
DL
1138 if (watch_only
1139 && (gs.start_command || gs.stop_command || gs.restart_command)) {
d87ae5cc 1140 fputs("Options -r/-s/-k are not used when --dry is active.\n",
a6810074 1141 stderr);
8b886ca7 1142 }
f168b713
DL
1143 if (!watch_only
1144 && (!gs.restart_command || !gs.start_command || !gs.stop_command)) {
1145 fprintf(stderr,
1146 "Options -s (start), -k (kill), and -r (restart) are required.\n");
1147 frr_help_exit(1);
8b886ca7 1148 }
8b886ca7 1149
a6810074
DL
1150 if (blankstr) {
1151 if (gs.restart_command)
1152 gs.restart_command =
d62a17ae 1153 translate_blanks(gs.restart_command, blankstr);
a6810074
DL
1154 if (gs.start_command)
1155 gs.start_command =
d62a17ae 1156 translate_blanks(gs.start_command, blankstr);
a6810074
DL
1157 if (gs.stop_command)
1158 gs.stop_command =
d62a17ae 1159 translate_blanks(gs.stop_command, blankstr);
065de903 1160 }
8b886ca7 1161
a6810074 1162 gs.restart.interval = gs.min_restart_interval;
8b886ca7 1163
4f04a76b 1164 master = frr_init();
b647dc2a 1165 watchfrr_error_init();
4f04a76b 1166
dd8376fe 1167 zlog_set_level(ZLOG_DEST_MONITOR, ZLOG_DISABLED);
eb05883f 1168 if (watchfrr_di.daemon_mode) {
dd8376fe 1169 zlog_set_level(ZLOG_DEST_SYSLOG, MIN(gs.loglevel, LOG_DEBUG));
d62a17ae 1170 if (daemon(0, 0) < 0) {
2f4f11fa 1171 fprintf(stderr, "Watchfrr daemon failed: %s",
d62a17ae 1172 strerror(errno));
1173 exit(1);
4f04a76b
DL
1174 }
1175 } else
dd8376fe 1176 zlog_set_level(ZLOG_DEST_STDOUT, MIN(gs.loglevel, LOG_DEBUG));
8b886ca7 1177
a6810074 1178 watchfrr_vty_init();
8b886ca7 1179
eb05883f 1180 frr_vty_serv();
8b886ca7 1181
8b886ca7 1182 {
a6810074
DL
1183 int i;
1184 struct daemon *tail = NULL;
1185
1186 for (i = optind; i < argc; i++) {
1187 struct daemon *dmn;
1188
1189 if (!(dmn = (struct daemon *)calloc(1, sizeof(*dmn)))) {
1190 fprintf(stderr, "calloc(1,%u) failed: %s\n",
d7c0a89a 1191 (unsigned int)sizeof(*dmn),
a6810074
DL
1192 safe_strerror(errno));
1193 return 1;
1194 }
1195 dmn->name = dmn->restart.name = argv[i];
1196 dmn->state = DAEMON_INIT;
1197 gs.numdaemons++;
1198 gs.numdown++;
1199 dmn->fd = -1;
66e78ae6 1200 dmn->t_wakeup = NULL;
d62a17ae 1201 thread_add_timer_msec(master, wakeup_init, dmn,
1202 100 + (random() % 900),
66e78ae6 1203 &dmn->t_wakeup);
a6810074
DL
1204 dmn->restart.interval = gs.min_restart_interval;
1205 if (tail)
1206 tail->next = dmn;
1207 else
1208 gs.daemons = dmn;
1209 tail = dmn;
1210
f168b713 1211 if (!strcmp(dmn->name, special))
a6810074
DL
1212 gs.special = dmn;
1213 }
1214 }
1215 if (!gs.daemons) {
1216 fputs("Must specify one or more daemons to monitor.\n", stderr);
4f04a76b 1217 frr_help_exit(1);
a6810074 1218 }
f168b713
DL
1219 if (!watch_only && !gs.special) {
1220 fprintf(stderr, "\"%s\" daemon must be in daemon list\n",
1221 special);
4f04a76b 1222 frr_help_exit(1);
8b886ca7 1223 }
8b886ca7 1224
a6810074
DL
1225 /* Make sure we're not already running. */
1226 pid_output(pidfile);
1227
1228 /* Announce which daemons are being monitored. */
1229 {
1230 struct daemon *dmn;
1231 size_t len = 0;
1232
1233 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1234 len += strlen(dmn->name) + 1;
1235
1236 {
1237 char buf[len + 1];
1238 char *p = buf;
1239
1240 for (dmn = gs.daemons; dmn; dmn = dmn->next) {
1241 if (p != buf)
1242 *p++ = ' ';
1243 strcpy(p, dmn->name);
1244 p += strlen(p);
1245 }
f168b713
DL
1246 zlog_notice("%s %s watching [%s]%s", progname,
1247 FRR_VERSION, buf,
1248 watch_only ? ", monitor mode" : "");
a6810074
DL
1249 }
1250 }
8b886ca7 1251
a6810074
DL
1252 {
1253 struct thread thread;
1254
1255 while (thread_fetch(master, &thread))
1256 thread_call(&thread);
1257 }
8b886ca7 1258
a6810074
DL
1259 systemd_send_stopping();
1260 /* Not reached. */
1261 return 0;
8b886ca7 1262}