/*
 * git.proxmox.com Git - pve-ha-manager.git/blob - src/watchdog-mux.c
 * implement watchdog update logic
 * [pve-ha-manager.git] / src / watchdog-mux.c
 */
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <unistd.h>
4 #include <fcntl.h>
5 #include <string.h>
6 #include <errno.h>
7 #include <time.h>
8 #include <sys/ioctl.h>
9 #include <sys/types.h>
10 #include <sys/stat.h>
11 #include <sys/socket.h>
12 #include <sys/un.h>
13 #include <sys/epoll.h>
14 #include <signal.h>
15 #include <sys/signalfd.h>
16
17 #include <linux/types.h>
18 #include <linux/watchdog.h>
19
20 #include <systemd/sd-daemon.h>
21
/* Unix socket the daemon binds itself when no listening socket is passed in. */
#define MY_SOCK_PATH        "/run/watchdog-mux.sock"
/*
 * Marker directory: created when a client connects, removed once no active
 * client is left.  The daemon refuses to start while it exists.
 */
#define WD_ACTIVE_MARKER    "/run/watchdog-mux.active"

/* Backlog handed to listen() for the self-created socket. */
#define LISTEN_BACKLOG      50
/* Upper bound of events fetched by a single epoll_wait() call. */
#define MAX_EVENTS          10

/* Watchdog device node that gets opened and kept alive. */
#define WATCHDOG_DEV        "/dev/watchdog"
29
/* Descriptor of the opened watchdog device; -1 while it is not open. */
int watchdog_fd = -1;
/* Value handed to the WDIOC_SETTIMEOUT ioctl on the watchdog device. */
int watchdog_timeout = 10;
/* Maximum age (in time() units, i.e. seconds) of a client's last update. */
int client_watchdog_timeout = 60;
/*
 * While non-zero the main loop keeps sending keepalives to the device.
 * It is cleared when a client expires or disconnects without magic close.
 */
int update_watchdog = 1;
34
/*
 * Bookkeeping entry for one descriptor registered with epoll.
 * An entry whose fd is 0 is treated as unused by the helpers below.
 */
typedef struct {
    int fd;           /* descriptor owned by this entry, 0 = entry unused */
    time_t time;      /* time of the last update; 0 for non-client entries */
    int magic_close;  /* set when the last byte received was 'V' */
} wd_client_t;

/* Capacity of the static entry table. */
#define MAX_CLIENTS 100

/* Fixed-size table of entries; zero-initialized, so all entries start unused. */
static wd_client_t client_list[MAX_CLIENTS];
44
45 static wd_client_t *
46 alloc_client(int fd, time_t time)
47 {
48 int i;
49
50 for (i = 0; i < MAX_CLIENTS; i++) {
51 if (client_list[i].fd == 0) {
52 client_list[i].fd = fd;
53 client_list[i].time = time;
54 client_list[i].magic_close = 0;
55 return &client_list[i];
56 }
57 }
58
59 return NULL;
60 }
61
62 static void
63 free_client(wd_client_t *wd_client)
64 {
65 if (!wd_client)
66 return;
67
68 wd_client->time = 0;
69 wd_client->fd = 0;
70 wd_client->magic_close = 0;
71 }
72
73 static int
74 active_client_count(void)
75 {
76 int i, count = 0;
77
78 for (i = 0; i < MAX_CLIENTS; i++) {
79 if (client_list[i].fd != 0 && client_list[i].time != 0) {
80 count++;
81 }
82 }
83
84 return count;
85 }
86
87 static void
88 watchdog_close(void)
89 {
90 if (watchdog_fd != -1) {
91 if (write(watchdog_fd, "V", 1) == -1) {
92 perror("write magic watchdog close");
93 }
94 if (close(watchdog_fd) == -1) {
95 perror("write magic watchdog close");
96 }
97 }
98
99 watchdog_fd = -1;
100 }
101
102 int
103 main(void)
104 {
105 struct sockaddr_un my_addr, peer_addr;
106 socklen_t peer_addr_size;
107 struct epoll_event ev, events[MAX_EVENTS];
108 int socket_count, listen_sock, nfds, epollfd, sigfd;
109
110
111 struct stat fs;
112
113 if (stat(WD_ACTIVE_MARKER, &fs) == 0) {
114 fprintf(stderr, "watchdog active - unable to restart watchdog-mux\n");
115 exit(EXIT_FAILURE);
116 }
117
118 if (stat(WATCHDOG_DEV, &fs) == -1) {
119 system("modprobe -q softdog soft_noboot=1"); // fixme
120 }
121
122 if ((watchdog_fd = open(WATCHDOG_DEV, O_WRONLY)) == -1) {
123 perror("watchdog open");
124 exit(EXIT_FAILURE);
125 }
126
127 if (ioctl(watchdog_fd, WDIOC_SETTIMEOUT, &watchdog_timeout) == -1) {
128 perror("watchdog set timeout");
129 watchdog_close();
130 exit(EXIT_FAILURE);
131 }
132
133 /* read and log watchdog identity */
134 struct watchdog_info wdinfo;
135 if (ioctl(watchdog_fd, WDIOC_GETSUPPORT, &wdinfo) == -1) {
136 perror("read watchdog info");
137 watchdog_close();
138 exit(EXIT_FAILURE);
139 }
140
141 wdinfo.identity[sizeof(wdinfo.identity) - 1] = 0; // just to be sure
142 fprintf(stderr, "Watchdog driver '%s', version %x\n",
143 wdinfo.identity, wdinfo.firmware_version);
144
145 socket_count = sd_listen_fds(0);
146
147 if (socket_count > 1) {
148
149 perror("too many file descriptors received.\n");
150 goto err;
151
152 } else if (socket_count == 1) {
153
154 listen_sock = SD_LISTEN_FDS_START + 0;
155
156 } else {
157
158 unlink(MY_SOCK_PATH);
159
160 listen_sock = socket(AF_UNIX, SOCK_STREAM, 0);
161 if (listen_sock == -1) {
162 perror("socket create");
163 exit(EXIT_FAILURE);
164 }
165
166 memset(&my_addr, 0, sizeof(struct sockaddr_un));
167 my_addr.sun_family = AF_UNIX;
168 strncpy(my_addr.sun_path, MY_SOCK_PATH, sizeof(my_addr.sun_path) - 1);
169
170 if (bind(listen_sock, (struct sockaddr *) &my_addr,
171 sizeof(struct sockaddr_un)) == -1) {
172 perror("socket bind");
173 exit(EXIT_FAILURE);
174 }
175
176 if (listen(listen_sock, LISTEN_BACKLOG) == -1) {
177 perror("socket listen");
178 goto err;
179 }
180 }
181
182 epollfd = epoll_create(10);
183 if (epollfd == -1) {
184 perror("epoll_create");
185 goto err;
186 }
187
188 ev.events = EPOLLIN;
189 ev.data.ptr = alloc_client(listen_sock, 0);
190 if (epoll_ctl(epollfd, EPOLL_CTL_ADD, listen_sock, &ev) == -1) {
191 perror("epoll_ctl add listen_sock");
192 goto err;
193 }
194
195 sigset_t mask;
196 sigemptyset(&mask);
197 sigaddset(&mask, SIGINT);
198 sigaddset(&mask, SIGTERM);
199 sigaddset(&mask, SIGHUP);
200
201 sigprocmask(SIG_BLOCK, &mask, NULL);
202
203 if ((sigfd = signalfd(-1, &mask, SFD_NONBLOCK)) < 0) {
204 perror("unable to open signalfd");
205 goto err;
206 }
207
208 ev.events = EPOLLIN;
209 ev.data.ptr = alloc_client(sigfd, 0);
210 if (epoll_ctl(epollfd, EPOLL_CTL_ADD, sigfd, &ev) == -1) {
211 perror("epoll_ctl add sigfd");
212 goto err;
213 }
214
215 for (;;) {
216 nfds = epoll_wait(epollfd, events, MAX_EVENTS, 1000);
217 if (nfds == -1) {
218 if (errno == EINTR)
219 continue;
220
221 perror("epoll_pwait");
222 goto err;
223 }
224
225 if (nfds == 0) { // timeout
226
227 // check for timeouts
228 if (update_watchdog) {
229 int i;
230 time_t ctime = time(NULL);
231 for (i = 0; i < MAX_CLIENTS; i++) {
232 if (client_list[i].fd != 0 && client_list[i].time != 0 &&
233 ((ctime - client_list[i].time) > client_watchdog_timeout)) {
234 update_watchdog = 0;
235 fprintf(stderr, "client watchdog expired - disable watchdog updates\n");
236 }
237 }
238 }
239
240 if (update_watchdog) {
241 if (ioctl(watchdog_fd, WDIOC_KEEPALIVE, 0) == -1) {
242 perror("watchdog update failed");
243 }
244 }
245
246 continue;
247 }
248
249 int terminate = 0;
250
251 int n;
252 for (n = 0; n < nfds; ++n) {
253 wd_client_t *wd_client = events[n].data.ptr;
254 if (wd_client->fd == listen_sock) {
255 int conn_sock = accept(listen_sock, (struct sockaddr *) &peer_addr, &peer_addr_size);
256 if (conn_sock == -1) {
257 perror("accept");
258 goto err; // fixme
259 }
260 if (fcntl(conn_sock, F_SETFL, O_NONBLOCK) == -1) {
261 perror("setnonblocking");
262 goto err; // fixme
263 }
264
265 wd_client_t *new_client = alloc_client(conn_sock, time(NULL));
266 if (new_client == NULL) {
267 fprintf(stderr, "unable to alloc wd_client structure\n");
268 goto err; // fixme;
269 }
270
271 mkdir(WD_ACTIVE_MARKER, 0600);
272
273 ev.events = EPOLLIN;
274 ev.data.ptr = new_client;
275 if (epoll_ctl(epollfd, EPOLL_CTL_ADD, conn_sock, &ev) == -1) {
276 perror("epoll_ctl: add conn_sock");
277 goto err; // fixme
278 }
279 } else if (wd_client->fd == sigfd) {
280
281 /* signal handling */
282
283 int rv = 0;
284 struct signalfd_siginfo si;
285
286 if ((rv = read(sigfd, &si, sizeof(si))) && rv >= 0) {
287 if (si.ssi_signo == SIGHUP) {
288 perror("got SIGHUP - ignored");
289 } else {
290 terminate = 1;
291 fprintf(stderr, "got terminate request\n");
292 }
293 }
294
295 } else {
296 char buf[4096];
297 int cfd = wd_client->fd;
298
299 ssize_t bytes = read(cfd, buf, sizeof(buf));
300 if (bytes == -1) {
301 perror("read");
302 goto err; // fixme
303 } else if (bytes > 0) {
304 int i;
305 for (i = 0; i < bytes; i++) {
306 if (buf[i] == 'V') {
307 wd_client->magic_close = 1;
308 } else {
309 wd_client->magic_close = 0;
310 }
311 }
312 wd_client->time = time(NULL);
313 } else {
314 if (events[n].events & EPOLLHUP || events[n].events & EPOLLERR) {
315 //printf("GOT %016x event\n", events[n].events);
316 if (epoll_ctl(epollfd, EPOLL_CTL_DEL, cfd, NULL) == -1) {
317 perror("epoll_ctl: del conn_sock");
318 goto err; // fixme
319 }
320 if (close(cfd) == -1) {
321 perror("close conn_sock");
322 goto err; // fixme
323 }
324
325 if (!wd_client->magic_close) {
326 fprintf(stderr, "client did not stop watchdog - disable watchdog updates\n");
327 update_watchdog = 0;
328 } else {
329 free_client(wd_client);
330 }
331
332 if (!active_client_count()) {
333 rmdir(WD_ACTIVE_MARKER);
334 }
335 }
336 }
337 }
338 }
339 if (terminate)
340 break;
341 }
342
343 int active_count = active_client_count();
344 if (active_count > 0) {
345 fprintf(stderr, "exit watchdog-mux with active connections\n");
346 } else {
347 fprintf(stderr, "clean exit\n");
348 watchdog_close();
349 }
350
351 unlink(MY_SOCK_PATH);
352 exit(EXIT_SUCCESS);
353
354 err:
355 unlink(MY_SOCK_PATH);
356 exit(EXIT_FAILURE);
357 }