]> git.proxmox.com Git - pve-ha-manager.git/blob - src/watchdog-mux.c
set backlog for watchdog-mux.socket
[pve-ha-manager.git] / src / watchdog-mux.c
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <unistd.h>
4 #include <fcntl.h>
5 #include <string.h>
6 #include <errno.h>
7 #include <time.h>
8 #include <sys/ioctl.h>
9 #include <sys/types.h>
10 #include <sys/stat.h>
11 #include <sys/socket.h>
12 #include <sys/un.h>
13 #include <sys/epoll.h>
14 #include <signal.h>
15 #include <sys/signalfd.h>
16
17 #include <linux/types.h>
18 #include <linux/watchdog.h>
19
20 #include <systemd/sd-daemon.h>
21
/* Hardware (or softdog) watchdog device driven by this daemon. */
#define WATCHDOG_DEV "/dev/watchdog"

/* Unix socket clients connect to; only created by the daemon itself when
 * systemd did not pass in a listening socket. */
#define MY_SOCK_PATH "/run/watchdog-mux.sock"

/* Directory created when a client connects and removed once no active
 * client is left; if it exists at startup the daemon refuses to run. */
#define WD_ACTIVE_MARKER "/run/watchdog-mux.active"

#define LISTEN_BACKLOG 32 /* set same value in watchdog-mux.socket */

/* Upper bound of events fetched by a single epoll_wait() call. */
#define MAX_EVENTS 10

/* Descriptor of the opened watchdog device, -1 while it is not open. */
int watchdog_fd = -1;

/* Timeout (seconds) programmed into the watchdog device. */
int watchdog_timeout = 10;

/* Seconds a client may stay silent before watchdog updates are stopped. */
int client_watchdog_timeout = 60;

/* Cleared once a client expired or vanished without a magic close; the
 * watchdog device is no longer kept alive afterwards. */
int update_watchdog = 1;
35
/*
 * One slot of the descriptor table.
 *
 * Besides real clients, the listening socket and the signalfd also occupy
 * a slot each (with time == 0), so that every epoll event can carry a
 * pointer to its slot.
 */
typedef struct {
    int fd;          /* descriptor stored in this slot; 0 marks a free slot */
    time_t time;     /* time of the last client update; 0 for internal fds */
    int magic_close; /* 1 if the last byte received from the client was 'V' */
} wd_client_t;

/* Capacity of the slot table (clients plus the two internal descriptors). */
#define MAX_CLIENTS 100

/* Statically allocated, zero-initialised slot table: all slots start free. */
static wd_client_t client_list[MAX_CLIENTS];
45
46 static wd_client_t *
47 alloc_client(int fd, time_t time)
48 {
49 int i;
50
51 for (i = 0; i < MAX_CLIENTS; i++) {
52 if (client_list[i].fd == 0) {
53 client_list[i].fd = fd;
54 client_list[i].time = time;
55 client_list[i].magic_close = 0;
56 return &client_list[i];
57 }
58 }
59
60 return NULL;
61 }
62
63 static void
64 free_client(wd_client_t *wd_client)
65 {
66 if (!wd_client)
67 return;
68
69 wd_client->time = 0;
70 wd_client->fd = 0;
71 wd_client->magic_close = 0;
72 }
73
74 static int
75 active_client_count(void)
76 {
77 int i, count = 0;
78
79 for (i = 0; i < MAX_CLIENTS; i++) {
80 if (client_list[i].fd != 0 && client_list[i].time != 0) {
81 count++;
82 }
83 }
84
85 return count;
86 }
87
88 static void
89 watchdog_close(void)
90 {
91 if (watchdog_fd != -1) {
92 if (write(watchdog_fd, "V", 1) == -1) {
93 perror("write magic watchdog close");
94 }
95 if (close(watchdog_fd) == -1) {
96 perror("write magic watchdog close");
97 }
98 }
99
100 watchdog_fd = -1;
101 }
102
103 int
104 main(void)
105 {
106 struct sockaddr_un my_addr, peer_addr;
107 socklen_t peer_addr_size;
108 struct epoll_event ev, events[MAX_EVENTS];
109 int socket_count, listen_sock, nfds, epollfd, sigfd;
110
111
112 struct stat fs;
113
114 if (stat(WD_ACTIVE_MARKER, &fs) == 0) {
115 fprintf(stderr, "watchdog active - unable to restart watchdog-mux\n");
116 exit(EXIT_FAILURE);
117 }
118
119 if (stat(WATCHDOG_DEV, &fs) == -1) {
120 system("modprobe -q softdog soft_noboot=1"); // fixme
121 }
122
123 if ((watchdog_fd = open(WATCHDOG_DEV, O_WRONLY)) == -1) {
124 perror("watchdog open");
125 exit(EXIT_FAILURE);
126 }
127
128 if (ioctl(watchdog_fd, WDIOC_SETTIMEOUT, &watchdog_timeout) == -1) {
129 perror("watchdog set timeout");
130 watchdog_close();
131 exit(EXIT_FAILURE);
132 }
133
134 /* read and log watchdog identity */
135 struct watchdog_info wdinfo;
136 if (ioctl(watchdog_fd, WDIOC_GETSUPPORT, &wdinfo) == -1) {
137 perror("read watchdog info");
138 watchdog_close();
139 exit(EXIT_FAILURE);
140 }
141
142 wdinfo.identity[sizeof(wdinfo.identity) - 1] = 0; // just to be sure
143 fprintf(stderr, "Watchdog driver '%s', version %x\n",
144 wdinfo.identity, wdinfo.firmware_version);
145
146 socket_count = sd_listen_fds(0);
147
148 if (socket_count > 1) {
149
150 perror("too many file descriptors received.\n");
151 goto err;
152
153 } else if (socket_count == 1) {
154
155 listen_sock = SD_LISTEN_FDS_START + 0;
156
157 } else {
158
159 unlink(MY_SOCK_PATH);
160
161 listen_sock = socket(AF_UNIX, SOCK_STREAM, 0);
162 if (listen_sock == -1) {
163 perror("socket create");
164 exit(EXIT_FAILURE);
165 }
166
167 memset(&my_addr, 0, sizeof(struct sockaddr_un));
168 my_addr.sun_family = AF_UNIX;
169 strncpy(my_addr.sun_path, MY_SOCK_PATH, sizeof(my_addr.sun_path) - 1);
170
171 if (bind(listen_sock, (struct sockaddr *) &my_addr,
172 sizeof(struct sockaddr_un)) == -1) {
173 perror("socket bind");
174 exit(EXIT_FAILURE);
175 }
176
177 if (listen(listen_sock, LISTEN_BACKLOG) == -1) {
178 perror("socket listen");
179 goto err;
180 }
181 }
182
183 epollfd = epoll_create(10);
184 if (epollfd == -1) {
185 perror("epoll_create");
186 goto err;
187 }
188
189 ev.events = EPOLLIN;
190 ev.data.ptr = alloc_client(listen_sock, 0);
191 if (epoll_ctl(epollfd, EPOLL_CTL_ADD, listen_sock, &ev) == -1) {
192 perror("epoll_ctl add listen_sock");
193 goto err;
194 }
195
196 sigset_t mask;
197 sigemptyset(&mask);
198 sigaddset(&mask, SIGINT);
199 sigaddset(&mask, SIGTERM);
200 sigaddset(&mask, SIGHUP);
201
202 sigprocmask(SIG_BLOCK, &mask, NULL);
203
204 if ((sigfd = signalfd(-1, &mask, SFD_NONBLOCK)) < 0) {
205 perror("unable to open signalfd");
206 goto err;
207 }
208
209 ev.events = EPOLLIN;
210 ev.data.ptr = alloc_client(sigfd, 0);
211 if (epoll_ctl(epollfd, EPOLL_CTL_ADD, sigfd, &ev) == -1) {
212 perror("epoll_ctl add sigfd");
213 goto err;
214 }
215
216 for (;;) {
217 nfds = epoll_wait(epollfd, events, MAX_EVENTS, 1000);
218 if (nfds == -1) {
219 if (errno == EINTR)
220 continue;
221
222 perror("epoll_pwait");
223 goto err;
224 }
225
226 if (nfds == 0) { // timeout
227
228 // check for timeouts
229 if (update_watchdog) {
230 int i;
231 time_t ctime = time(NULL);
232 for (i = 0; i < MAX_CLIENTS; i++) {
233 if (client_list[i].fd != 0 && client_list[i].time != 0 &&
234 ((ctime - client_list[i].time) > client_watchdog_timeout)) {
235 update_watchdog = 0;
236 fprintf(stderr, "client watchdog expired - disable watchdog updates\n");
237 }
238 }
239 }
240
241 if (update_watchdog) {
242 if (ioctl(watchdog_fd, WDIOC_KEEPALIVE, 0) == -1) {
243 perror("watchdog update failed");
244 }
245 }
246
247 continue;
248 }
249
250 if (!update_watchdog)
251 break;
252
253 int terminate = 0;
254
255 int n;
256 for (n = 0; n < nfds; ++n) {
257 wd_client_t *wd_client = events[n].data.ptr;
258 if (wd_client->fd == listen_sock) {
259 int conn_sock = accept(listen_sock, (struct sockaddr *) &peer_addr, &peer_addr_size);
260 if (conn_sock == -1) {
261 perror("accept");
262 goto err; // fixme
263 }
264 if (fcntl(conn_sock, F_SETFL, O_NONBLOCK) == -1) {
265 perror("setnonblocking");
266 goto err; // fixme
267 }
268
269 wd_client_t *new_client = alloc_client(conn_sock, time(NULL));
270 if (new_client == NULL) {
271 fprintf(stderr, "unable to alloc wd_client structure\n");
272 goto err; // fixme;
273 }
274
275 mkdir(WD_ACTIVE_MARKER, 0600);
276
277 ev.events = EPOLLIN;
278 ev.data.ptr = new_client;
279 if (epoll_ctl(epollfd, EPOLL_CTL_ADD, conn_sock, &ev) == -1) {
280 perror("epoll_ctl: add conn_sock");
281 goto err; // fixme
282 }
283 } else if (wd_client->fd == sigfd) {
284
285 /* signal handling */
286
287 int rv = 0;
288 struct signalfd_siginfo si;
289
290 if ((rv = read(sigfd, &si, sizeof(si))) && rv >= 0) {
291 if (si.ssi_signo == SIGHUP) {
292 perror("got SIGHUP - ignored");
293 } else {
294 terminate = 1;
295 fprintf(stderr, "got terminate request\n");
296 }
297 }
298
299 } else {
300 char buf[4096];
301 int cfd = wd_client->fd;
302
303 ssize_t bytes = read(cfd, buf, sizeof(buf));
304 if (bytes == -1) {
305 perror("read");
306 goto err; // fixme
307 } else if (bytes > 0) {
308 int i;
309 for (i = 0; i < bytes; i++) {
310 if (buf[i] == 'V') {
311 wd_client->magic_close = 1;
312 } else {
313 wd_client->magic_close = 0;
314 }
315 }
316 wd_client->time = time(NULL);
317 } else {
318 if (events[n].events & EPOLLHUP || events[n].events & EPOLLERR) {
319 //printf("GOT %016x event\n", events[n].events);
320 if (epoll_ctl(epollfd, EPOLL_CTL_DEL, cfd, NULL) == -1) {
321 perror("epoll_ctl: del conn_sock");
322 goto err; // fixme
323 }
324 if (close(cfd) == -1) {
325 perror("close conn_sock");
326 goto err; // fixme
327 }
328
329 if (!wd_client->magic_close) {
330 fprintf(stderr, "client did not stop watchdog - disable watchdog updates\n");
331 update_watchdog = 0;
332 } else {
333 free_client(wd_client);
334 }
335
336 if (!active_client_count()) {
337 rmdir(WD_ACTIVE_MARKER);
338 }
339 }
340 }
341 }
342 }
343 if (terminate)
344 break;
345 }
346
347 int active_count = active_client_count();
348 if (active_count > 0) {
349 fprintf(stderr, "exit watchdog-mux with active connections\n");
350 } else {
351 fprintf(stderr, "clean exit\n");
352 watchdog_close();
353 }
354
355 unlink(MY_SOCK_PATH);
356 exit(EXIT_SUCCESS);
357
358 err:
359 unlink(MY_SOCK_PATH);
360 exit(EXIT_FAILURE);
361 }