]> git.proxmox.com Git - pve-ha-manager.git/blob - src/watchdog-mux.c
b4bcc0c4108c2ba376acfba8a6663475f2a3e252
[pve-ha-manager.git] / src / watchdog-mux.c
1 #define _GNU_SOURCE
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <unistd.h>
5 #include <fcntl.h>
6 #include <string.h>
7 #include <errno.h>
8 #include <time.h>
9 #include <sys/ioctl.h>
10 #include <sys/types.h>
11 #include <sys/stat.h>
12 #include <sys/socket.h>
13 #include <sys/un.h>
14 #include <sys/epoll.h>
15 #include <signal.h>
16 #include <sys/signalfd.h>
17
18 #include <linux/types.h>
19 #include <linux/watchdog.h>
20
/* UNIX socket clients connect to, and marker directory kept while clients are active */
#define WD_SOCK_PATH "/run/watchdog-mux.sock"
#define WD_ACTIVE_MARKER "/run/watchdog-mux.active"

/* backlog passed to listen() on the client socket */
#define LISTEN_BACKLOG 32

/* maximum number of events fetched by a single epoll_wait() call */
#define MAX_EVENTS 10

/* watchdog device that gets multiplexed */
#define WATCHDOG_DEV "/dev/watchdog"

/* binary executed to sync the journal before an expected reset */
#define JOURNALCTL_BIN "/bin/journalctl"

/*
 * Process-wide state. Everything is only used inside this translation unit,
 * so it gets internal linkage like the helper functions below.
 */

/* descriptor of the opened WATCHDOG_DEV, -1 while not open */
static int watchdog_fd = -1;
/* value handed to the WDIOC_SETTIMEOUT ioctl */
static int watchdog_timeout = 10;
/* a client that sent nothing for more than this many seconds counts as expired */
static int client_watchdog_timeout = 60;
/* cleared once a client expired or vanished without magic close; stops keep-alives */
static int update_watchdog = 1;
36
/*
 * Book-keeping entry for one descriptor registered with epoll.
 * A pointer to the entry is stored as the epoll event payload.
 */
typedef struct {
    int fd;          /* descriptor; 0 marks an unused slot */
    time_t time;     /* time of the last received data; 0 for the listen socket and signalfd entries */
    int magic_close; /* 1 if the most recently received byte was 'V', otherwise 0 */
} wd_client_t;

/* capacity of the statically allocated client table */
#define MAX_CLIENTS 100

/* zero-initialised table, so every slot starts out unused */
static wd_client_t client_list[MAX_CLIENTS];
46
47 static wd_client_t *
48 alloc_client(int fd, time_t time)
49 {
50 int i;
51
52 for (i = 0; i < MAX_CLIENTS; i++) {
53 if (client_list[i].fd == 0) {
54 client_list[i].fd = fd;
55 client_list[i].time = time;
56 client_list[i].magic_close = 0;
57 return &client_list[i];
58 }
59 }
60
61 return NULL;
62 }
63
64 static void
65 free_client(wd_client_t *wd_client)
66 {
67 if (!wd_client)
68 return;
69
70 wd_client->time = 0;
71 wd_client->fd = 0;
72 wd_client->magic_close = 0;
73 }
74
75 static int
76 active_client_count(void)
77 {
78 int i, count = 0;
79
80 for (i = 0; i < MAX_CLIENTS; i++) {
81 if (client_list[i].fd != 0 && client_list[i].time != 0) {
82 count++;
83 }
84 }
85
86 return count;
87 }
88
89 static void
90 watchdog_close(void)
91 {
92 if (watchdog_fd != -1) {
93 if (write(watchdog_fd, "V", 1) == -1) {
94 perror("write magic watchdog close");
95 }
96 if (close(watchdog_fd) == -1) {
97 perror("write magic watchdog close");
98 }
99 }
100
101 watchdog_fd = -1;
102 }
103
104 static void
105 sync_journal_unsafe(void)
106 {
107
108 pid_t child = fork();
109
110 // do not care about fork error or collecting the childs exit status,
111 // we are resetting soon anyway and just want to sync out the journal
112 if (child == 0) {
113 execl(JOURNALCTL_BIN, JOURNALCTL_BIN, "--sync", NULL);
114 exit(-1);
115 }
116 }
117
118 int
119 main(void)
120 {
121 struct sockaddr_un my_addr, peer_addr;
122 socklen_t peer_addr_size;
123 struct epoll_event ev, events[MAX_EVENTS];
124 int listen_sock, nfds, epollfd, sigfd;
125
126 struct stat fs;
127
128 if (stat(WD_ACTIVE_MARKER, &fs) == 0) {
129 fprintf(stderr, "watchdog active - unable to restart watchdog-mux\n");
130 exit(EXIT_FAILURE);
131 }
132
133 /* if you want to debug, set options in /lib/modprobe.d/aliases.conf
134 * options softdog soft_noboot=1
135 */
136 if (stat(WATCHDOG_DEV, &fs) == -1) {
137 char *wd_module = getenv("WATCHDOG_MODULE");
138 if (wd_module) {
139 char *cmd = NULL;
140 if ((asprintf(&cmd, "modprobe -q %s", wd_module) == -1)) {
141 perror("assemble modprobe command failed");
142 exit(EXIT_FAILURE);
143 }
144 fprintf(stderr, "Loading watchdog module '%s'\n", wd_module);
145 system(cmd);
146 free(cmd);
147 } else {
148 system("modprobe -q softdog"); // load softdog by default
149 }
150 }
151
152 if ((watchdog_fd = open(WATCHDOG_DEV, O_WRONLY)) == -1) {
153 perror("watchdog open");
154 exit(EXIT_FAILURE);
155 }
156
157 if (ioctl(watchdog_fd, WDIOC_SETTIMEOUT, &watchdog_timeout) == -1) {
158 perror("watchdog set timeout");
159 watchdog_close();
160 exit(EXIT_FAILURE);
161 }
162
163 /* read and log watchdog identity */
164 struct watchdog_info wdinfo;
165 if (ioctl(watchdog_fd, WDIOC_GETSUPPORT, &wdinfo) == -1) {
166 perror("read watchdog info");
167 watchdog_close();
168 exit(EXIT_FAILURE);
169 }
170
171 wdinfo.identity[sizeof(wdinfo.identity) - 1] = 0; // just to be sure
172 fprintf(stderr, "Watchdog driver '%s', version %x\n", wdinfo.identity, wdinfo.firmware_version);
173
174 /* always unlink socket path then create socket */
175 unlink(WD_SOCK_PATH);
176
177 listen_sock = socket(AF_UNIX, SOCK_STREAM, 0);
178 if (listen_sock == -1) {
179 perror("socket create");
180 exit(EXIT_FAILURE);
181 }
182 memset(&my_addr, 0, sizeof(struct sockaddr_un));
183 my_addr.sun_family = AF_UNIX;
184 strncpy(my_addr.sun_path, WD_SOCK_PATH, sizeof(my_addr.sun_path) - 1);
185
186 if (bind(listen_sock, (struct sockaddr *) &my_addr, sizeof(struct sockaddr_un)) == -1) {
187 perror("socket bind");
188 exit(EXIT_FAILURE);
189 }
190
191 if (listen(listen_sock, LISTEN_BACKLOG) == -1) {
192 perror("socket listen");
193 goto err;
194 }
195
196 epollfd = epoll_create(10);
197 if (epollfd == -1) {
198 perror("epoll_create");
199 goto err;
200 }
201
202 ev.events = EPOLLIN;
203 ev.data.ptr = alloc_client(listen_sock, 0);
204 if (epoll_ctl(epollfd, EPOLL_CTL_ADD, listen_sock, &ev) == -1) {
205 perror("epoll_ctl add listen_sock");
206 goto err;
207 }
208
209 sigset_t mask;
210 sigemptyset(&mask);
211 sigaddset(&mask, SIGINT);
212 sigaddset(&mask, SIGTERM);
213 sigaddset(&mask, SIGHUP);
214
215 sigprocmask(SIG_BLOCK, &mask, NULL);
216
217 if ((sigfd = signalfd(-1, &mask, SFD_NONBLOCK)) < 0) {
218 perror("unable to open signalfd");
219 goto err;
220 }
221
222 ev.events = EPOLLIN;
223 ev.data.ptr = alloc_client(sigfd, 0);
224 if (epoll_ctl(epollfd, EPOLL_CTL_ADD, sigfd, &ev) == -1) {
225 perror("epoll_ctl add sigfd");
226 goto err;
227 }
228
229 for (;;) {
230 nfds = epoll_wait(epollfd, events, MAX_EVENTS, 1000);
231 if (nfds == -1) {
232 if (errno == EINTR)
233 continue;
234
235 perror("epoll_pwait");
236 goto err;
237 }
238
239 if (nfds == 0) { // timeout
240
241 // check for timeouts
242 if (update_watchdog) {
243 int i;
244 time_t ctime = time(NULL);
245 for (i = 0; i < MAX_CLIENTS; i++) {
246 if (
247 client_list[i].fd != 0
248 && client_list[i].time != 0
249 && ((ctime - client_list[i].time) > client_watchdog_timeout)
250 ) {
251 update_watchdog = 0;
252 fprintf(stderr, "client watchdog expired - disable watchdog updates\n");
253 }
254 }
255 }
256
257 if (update_watchdog) {
258 if (ioctl(watchdog_fd, WDIOC_KEEPALIVE, 0) == -1) {
259 perror("watchdog update failed");
260 }
261 }
262
263 continue;
264 }
265
266 if (!update_watchdog)
267 break;
268
269 int terminate = 0;
270
271 int n;
272 for (n = 0; n < nfds; ++n) {
273 wd_client_t *wd_client = events[n].data.ptr;
274 if (wd_client->fd == listen_sock) {
275 int conn_sock = accept(listen_sock, (struct sockaddr *) &peer_addr, &peer_addr_size);
276 if (conn_sock == -1) {
277 perror("accept");
278 goto err; // fixme
279 }
280 if (fcntl(conn_sock, F_SETFL, O_NONBLOCK) == -1) {
281 perror("setnonblocking");
282 goto err; // fixme
283 }
284
285 wd_client_t *new_client = alloc_client(conn_sock, time(NULL));
286 if (new_client == NULL) {
287 fprintf(stderr, "unable to alloc wd_client structure\n");
288 goto err; // fixme;
289 }
290
291 mkdir(WD_ACTIVE_MARKER, 0600);
292
293 ev.events = EPOLLIN;
294 ev.data.ptr = new_client;
295 if (epoll_ctl(epollfd, EPOLL_CTL_ADD, conn_sock, &ev) == -1) {
296 perror("epoll_ctl: add conn_sock");
297 goto err; // fixme
298 }
299 } else if (wd_client->fd == sigfd) {
300
301 /* signal handling */
302
303 int rv = 0;
304 struct signalfd_siginfo si;
305
306 if ((rv = read(sigfd, &si, sizeof(si))) && rv >= 0) {
307 if (si.ssi_signo == SIGHUP) {
308 perror("got SIGHUP - ignored");
309 } else {
310 terminate = 1;
311 fprintf(stderr, "got terminate request\n");
312 }
313 }
314
315 } else {
316 char buf[4096];
317 int cfd = wd_client->fd;
318
319 ssize_t bytes = read(cfd, buf, sizeof(buf));
320 if (bytes == -1) {
321 perror("read");
322 goto err; // fixme
323 } else if (bytes > 0) {
324 int i;
325 for (i = 0; i < bytes; i++) {
326 if (buf[i] == 'V') {
327 wd_client->magic_close = 1;
328 } else {
329 wd_client->magic_close = 0;
330 }
331 }
332 wd_client->time = time(NULL);
333 } else {
334 if (events[n].events & EPOLLHUP || events[n].events & EPOLLERR) {
335 //printf("GOT %016x event\n", events[n].events);
336 if (epoll_ctl(epollfd, EPOLL_CTL_DEL, cfd, NULL) == -1) {
337 perror("epoll_ctl: del conn_sock");
338 goto err; // fixme
339 }
340 if (close(cfd) == -1) {
341 perror("close conn_sock");
342 goto err; // fixme
343 }
344
345 if (!wd_client->magic_close) {
346 fprintf(stderr, "client did not stop watchdog - disable watchdog updates\n");
347 sync_journal_unsafe();
348 update_watchdog = 0;
349 } else {
350 free_client(wd_client);
351 }
352
353 if (!active_client_count()) {
354 rmdir(WD_ACTIVE_MARKER);
355 }
356 }
357 }
358 }
359 }
360 if (terminate)
361 break;
362 }
363
364 int active_count = active_client_count();
365 if (active_count > 0) {
366 fprintf(stderr, "exit watchdog-mux with active connections\n");
367 sync_journal_unsafe();
368 } else {
369 fprintf(stderr, "clean exit\n");
370 watchdog_close();
371 }
372
373 unlink(WD_SOCK_PATH);
374
375 exit(EXIT_SUCCESS);
376
377 err:
378 unlink(WD_SOCK_PATH);
379
380 exit(EXIT_FAILURE);
381 }