]> git.proxmox.com Git - systemd.git/blob - src/journal/mmap-cache.c
Enable seccomp support on powerpc, ppc64el, and s390x
[systemd.git] / src / journal / mmap-cache.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2012 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <stdlib.h>
24 #include <sys/mman.h>
25
26 #include "alloc-util.h"
27 #include "fd-util.h"
28 #include "hashmap.h"
29 #include "list.h"
30 #include "log.h"
31 #include "macro.h"
32 #include "mmap-cache.h"
33 #include "sigbus.h"
34 #include "util.h"
35
36 typedef struct Window Window;
37 typedef struct Context Context;
38 typedef struct FileDescriptor FileDescriptor;
39
/* A single mmap()ed view of some file region. Windows are shared
 * between contexts and kept on an "unused" LRU list once no context
 * references them anymore. */
struct Window {
        MMapCache *cache;

        bool invalidated;        /* replaced by anonymous pages after SIGBUS */
        bool keep_always;        /* never recycle this window */
        bool in_unused;          /* currently linked into cache->unused LRU */

        int prot;                /* mmap() protection flags of this mapping */
        void *ptr;               /* start of the mapping */
        uint64_t offset;         /* file offset the mapping begins at */
        size_t size;             /* length of the mapping in bytes */

        FileDescriptor *fd;      /* back-pointer to the owning fd entry */

        LIST_FIELDS(Window, by_fd);    /* all windows of the same fd */
        LIST_FIELDS(Window, unused);   /* LRU list of unreferenced windows */

        LIST_HEAD(Context, contexts);  /* contexts currently using this window */
};
59
/* A cache "context": a slot that remembers the window it used last, so
 * that repeated accesses with the same context id hit the same mapping. */
struct Context {
        MMapCache *cache;
        unsigned id;             /* index into cache->contexts[] */
        Window *window;          /* window this context currently points at, or NULL */

        LIST_FIELDS(Context, by_window);  /* linked off window->contexts */
};
67
/* Per-file-descriptor state: all windows mapped from this fd, plus a
 * sticky flag recording whether we ever caught a SIGBUS on it. */
struct FileDescriptor {
        MMapCache *cache;
        int fd;
        bool sigbus;             /* a SIGBUS hit one of our windows; fd is poisoned */
        LIST_HEAD(Window, windows);
};
74
/* The cache object itself: reference-counted, tracks all fds and all
 * contexts, and keeps an LRU list of windows no context uses anymore. */
struct MMapCache {
        int n_ref;
        unsigned n_windows;      /* total number of live Window objects */

        unsigned n_hit, n_missed;  /* cache statistics */


        Hashmap *fds;            /* int fd → FileDescriptor* */
        Context *contexts[MMAP_CACHE_MAX_CONTEXTS];

        LIST_HEAD(Window, unused);  /* most recently released first */
        Window *last_unused;        /* tail of the list: best recycling candidate */
};
88
/* Keep at least this many windows around before recycling old ones. */
#define WINDOWS_MIN 64

#ifdef ENABLE_DEBUG_MMAP_CACHE
/* Tiny windows increase mmap activity and the chance of exposing unsafe use. */
# define WINDOW_SIZE (page_size())
#else
# define WINDOW_SIZE (8ULL*1024ULL*1024ULL)
#endif
97
98 MMapCache* mmap_cache_new(void) {
99 MMapCache *m;
100
101 m = new0(MMapCache, 1);
102 if (!m)
103 return NULL;
104
105 m->n_ref = 1;
106 return m;
107 }
108
109 MMapCache* mmap_cache_ref(MMapCache *m) {
110 assert(m);
111 assert(m->n_ref > 0);
112
113 m->n_ref ++;
114 return m;
115 }
116
/* Detaches a window from everything that references it: unmaps the
 * memory, removes it from its fd's window list and from the unused LRU
 * list, and clears the back-pointers of all contexts using it. Does NOT
 * free the Window object itself (see window_free()). */
static void window_unlink(Window *w) {
        Context *c;

        assert(w);

        if (w->ptr)
                munmap(w->ptr, w->size);

        if (w->fd)
                LIST_REMOVE(by_fd, w->fd->windows, w);

        if (w->in_unused) {
                /* Keep last_unused pointing at the list tail: if we were
                 * the tail, our predecessor becomes the new tail. */
                if (w->cache->last_unused == w)
                        w->cache->last_unused = w->unused_prev;

                LIST_REMOVE(unused, w->cache->unused, w);
        }

        /* Any context still pointing here must forget us, so nobody
         * dereferences a recycled/freed window. */
        LIST_FOREACH(by_window, c, w->contexts) {
                assert(c->window == w);
                c->window = NULL;
        }
}
140
/* Replaces the window's file-backed pages with anonymous zero pages,
 * in place, so that touching this address range can never raise a
 * SIGBUS again. Idempotent. */
static void window_invalidate(Window *w) {
        assert(w);

        if (w->invalidated)
                return;

        /* Replace the window with anonymous pages. This is useful
         * when we hit a SIGBUS and want to make sure the file cannot
         * trigger any further SIGBUS, possibly overrunning the sigbus
         * queue. */

        /* MAP_FIXED at the same address atomically swaps the mapping;
         * failure here would leave us unable to guarantee the above,
         * hence assert_se(). */
        assert_se(mmap(w->ptr, w->size, w->prot, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) == w->ptr);
        w->invalidated = true;
}
155
156 static void window_free(Window *w) {
157 assert(w);
158
159 window_unlink(w);
160 w->cache->n_windows--;
161 free(w);
162 }
163
164 _pure_ static bool window_matches(Window *w, int fd, int prot, uint64_t offset, size_t size) {
165 assert(w);
166 assert(fd >= 0);
167 assert(size > 0);
168
169 return
170 w->fd &&
171 fd == w->fd->fd &&
172 prot == w->prot &&
173 offset >= w->offset &&
174 offset + size <= w->offset + w->size;
175 }
176
177 static Window *window_add(MMapCache *m) {
178 Window *w;
179
180 assert(m);
181
182 if (!m->last_unused || m->n_windows <= WINDOWS_MIN) {
183
184 /* Allocate a new window */
185 w = new0(Window, 1);
186 if (!w)
187 return NULL;
188 m->n_windows++;
189 } else {
190
191 /* Reuse an existing one */
192 w = m->last_unused;
193 window_unlink(w);
194 zero(*w);
195 }
196
197 w->cache = m;
198 return w;
199 }
200
/* Breaks the link between a context and its current window (if any).
 * If that leaves the window with no users and it is not pinned via
 * keep_always, the window is parked on the unused LRU list (or, in
 * debug builds, unmapped immediately). */
static void context_detach_window(Context *c) {
        Window *w;

        assert(c);

        if (!c->window)
                return;

        w = c->window;
        c->window = NULL;
        LIST_REMOVE(by_window, w->contexts, c);

        if (!w->contexts && !w->keep_always) {
                /* Not used anymore? */
#ifdef ENABLE_DEBUG_MMAP_CACHE
                /* Unmap unused windows immediately to expose use-after-unmap
                 * by SIGSEGV. */
                window_free(w);
#else
                /* Prepend: the unused list is ordered most recently
                 * released first, last_unused is the recycling tail. */
                LIST_PREPEND(unused, c->cache->unused, w);
                if (!c->cache->last_unused)
                        c->cache->last_unused = w;

                w->in_unused = true;
#endif
        }
}
228
/* Points a context at a window: detaches the context from its previous
 * window first, and pulls the new window off the unused LRU list if it
 * was parked there. No-op if the context already uses this window. */
static void context_attach_window(Context *c, Window *w) {
        assert(c);
        assert(w);

        if (c->window == w)
                return;

        context_detach_window(c);

        if (w->in_unused) {
                /* Used again? */
                LIST_REMOVE(unused, c->cache->unused, w);
                /* Maintain the tail pointer if we just removed the tail */
                if (c->cache->last_unused == w)
                        c->cache->last_unused = w->unused_prev;

                w->in_unused = false;
        }

        c->window = w;
        LIST_PREPEND(by_window, w->contexts, c);
}
250
251 static Context *context_add(MMapCache *m, unsigned id) {
252 Context *c;
253
254 assert(m);
255
256 c = m->contexts[id];
257 if (c)
258 return c;
259
260 c = new0(Context, 1);
261 if (!c)
262 return NULL;
263
264 c->cache = m;
265 c->id = id;
266
267 assert(!m->contexts[id]);
268 m->contexts[id] = c;
269
270 return c;
271 }
272
273 static void context_free(Context *c) {
274 assert(c);
275
276 context_detach_window(c);
277
278 if (c->cache) {
279 assert(c->cache->contexts[c->id] == c);
280 c->cache->contexts[c->id] = NULL;
281 }
282
283 free(c);
284 }
285
286 static void fd_free(FileDescriptor *f) {
287 assert(f);
288
289 while (f->windows)
290 window_free(f->windows);
291
292 if (f->cache)
293 assert_se(hashmap_remove(f->cache->fds, FD_TO_PTR(f->fd)));
294
295 free(f);
296 }
297
298 static FileDescriptor* fd_add(MMapCache *m, int fd) {
299 FileDescriptor *f;
300 int r;
301
302 assert(m);
303 assert(fd >= 0);
304
305 f = hashmap_get(m->fds, FD_TO_PTR(fd));
306 if (f)
307 return f;
308
309 r = hashmap_ensure_allocated(&m->fds, NULL);
310 if (r < 0)
311 return NULL;
312
313 f = new0(FileDescriptor, 1);
314 if (!f)
315 return NULL;
316
317 f->cache = m;
318 f->fd = fd;
319
320 r = hashmap_put(m->fds, FD_TO_PTR(fd), f);
321 if (r < 0) {
322 free(f);
323 return NULL;
324 }
325
326 return f;
327 }
328
329 static void mmap_cache_free(MMapCache *m) {
330 FileDescriptor *f;
331 int i;
332
333 assert(m);
334
335 for (i = 0; i < MMAP_CACHE_MAX_CONTEXTS; i++)
336 if (m->contexts[i])
337 context_free(m->contexts[i]);
338
339 while ((f = hashmap_first(m->fds)))
340 fd_free(f);
341
342 hashmap_free(m->fds);
343
344 while (m->unused)
345 window_free(m->unused);
346
347 free(m);
348 }
349
350 MMapCache* mmap_cache_unref(MMapCache *m) {
351 assert(m);
352 assert(m->n_ref > 0);
353
354 m->n_ref --;
355 if (m->n_ref == 0)
356 mmap_cache_free(m);
357
358 return NULL;
359 }
360
361 static int make_room(MMapCache *m) {
362 assert(m);
363
364 if (!m->last_unused)
365 return 0;
366
367 window_free(m->last_unused);
368 return 1;
369 }
370
/* Fast path: checks whether the window the context used last already
 * covers the requested range. Returns 1 and sets *ret on a hit, 0 on a
 * miss (detaching the stale window so it can be recycled), or -EIO if
 * the underlying file was poisoned by a SIGBUS. */
static int try_context(
                MMapCache *m,
                int fd,
                int prot,
                unsigned context,
                bool keep_always,
                uint64_t offset,
                size_t size,
                void **ret) {

        Context *c;

        assert(m);
        assert(m->n_ref > 0);
        assert(fd >= 0);
        assert(size > 0);
        assert(ret);

        c = m->contexts[context];
        if (!c)
                return 0;

        assert(c->id == context);

        if (!c->window)
                return 0;

        if (!window_matches(c->window, fd, prot, offset, size)) {

                /* Drop the reference to the window, since it's unnecessary now */
                context_detach_window(c);
                return 0;
        }

        if (c->window->fd->sigbus)
                return -EIO;

        /* Pinning is sticky: once requested, the window stays pinned */
        c->window->keep_always |= keep_always;

        *ret = (uint8_t*) c->window->ptr + (offset - c->window->offset);
        return 1;
}
413
414 static int find_mmap(
415 MMapCache *m,
416 int fd,
417 int prot,
418 unsigned context,
419 bool keep_always,
420 uint64_t offset,
421 size_t size,
422 void **ret) {
423
424 FileDescriptor *f;
425 Window *w;
426 Context *c;
427
428 assert(m);
429 assert(m->n_ref > 0);
430 assert(fd >= 0);
431 assert(size > 0);
432
433 f = hashmap_get(m->fds, FD_TO_PTR(fd));
434 if (!f)
435 return 0;
436
437 assert(f->fd == fd);
438
439 if (f->sigbus)
440 return -EIO;
441
442 LIST_FOREACH(by_fd, w, f->windows)
443 if (window_matches(w, fd, prot, offset, size))
444 break;
445
446 if (!w)
447 return 0;
448
449 c = context_add(m, context);
450 if (!c)
451 return -ENOMEM;
452
453 context_attach_window(c, w);
454 w->keep_always += keep_always;
455
456 *ret = (uint8_t*) w->ptr + (offset - w->offset);
457 return 1;
458 }
459
/* Miss path: creates a brand new mapping covering the requested range.
 * The window is grown to at least WINDOW_SIZE (centered around the
 * request when possible) and clamped to the file size when *st is
 * given. Returns 1 and sets *ret on success, a negative errno-style
 * error otherwise. */
static int add_mmap(
                MMapCache *m,
                int fd,
                int prot,
                unsigned context,
                bool keep_always,
                uint64_t offset,
                size_t size,
                struct stat *st,
                void **ret) {

        uint64_t woffset, wsize;
        Context *c;
        FileDescriptor *f;
        Window *w;
        void *d;
        int r;

        assert(m);
        assert(m->n_ref > 0);
        assert(fd >= 0);
        assert(size > 0);
        assert(ret);

        /* Round the start down to a page boundary and the length up, as
         * mmap() requires page granularity */
        woffset = offset & ~((uint64_t) page_size() - 1ULL);
        wsize = size + (offset - woffset);
        wsize = PAGE_ALIGN(wsize);

        if (wsize < WINDOW_SIZE) {
                uint64_t delta;

                /* Grow the window to WINDOW_SIZE, placing roughly half
                 * of the extra space before the requested offset */
                delta = PAGE_ALIGN((WINDOW_SIZE - wsize) / 2);

                if (delta > offset)
                        woffset = 0;
                else
                        woffset -= delta;

                wsize = WINDOW_SIZE;
        }

        if (st) {
                /* Memory maps that are larger than the files
                   underneath have undefined behavior. Hence, clamp
                   things to the file size if we know it */

                if (woffset >= (uint64_t) st->st_size)
                        return -EADDRNOTAVAIL;

                if (woffset + wsize > (uint64_t) st->st_size)
                        wsize = PAGE_ALIGN(st->st_size - woffset);
        }

        /* On ENOMEM, free unused windows one by one and retry until the
         * mapping succeeds or there is nothing left to free */
        for (;;) {
                d = mmap(NULL, wsize, prot, MAP_SHARED, fd, woffset);
                if (d != MAP_FAILED)
                        break;
                if (errno != ENOMEM)
                        return -errno;

                r = make_room(m);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -ENOMEM;
        }

        c = context_add(m, context);
        if (!c)
                goto outofmem;

        f = fd_add(m, fd);
        if (!f)
                goto outofmem;

        w = window_add(m);
        if (!w)
                goto outofmem;

        w->keep_always = keep_always;
        w->ptr = d;
        w->offset = woffset;
        w->prot = prot;
        w->size = wsize;
        w->fd = f;

        LIST_PREPEND(by_fd, f->windows, w);

        /* Attach the context directly; the window is brand new, so no
         * need for context_attach_window()'s unused-list handling */
        context_detach_window(c);
        c->window = w;
        LIST_PREPEND(by_window, w->contexts, c);

        *ret = (uint8_t*) w->ptr + (offset - w->offset);
        return 1;

outofmem:
        /* The mapping exists but no Window owns it yet — unmap by hand */
        munmap(d, wsize);
        return -ENOMEM;
}
559
560 int mmap_cache_get(
561 MMapCache *m,
562 int fd,
563 int prot,
564 unsigned context,
565 bool keep_always,
566 uint64_t offset,
567 size_t size,
568 struct stat *st,
569 void **ret) {
570
571 int r;
572
573 assert(m);
574 assert(m->n_ref > 0);
575 assert(fd >= 0);
576 assert(size > 0);
577 assert(ret);
578 assert(context < MMAP_CACHE_MAX_CONTEXTS);
579
580 /* Check whether the current context is the right one already */
581 r = try_context(m, fd, prot, context, keep_always, offset, size, ret);
582 if (r != 0) {
583 m->n_hit ++;
584 return r;
585 }
586
587 /* Search for a matching mmap */
588 r = find_mmap(m, fd, prot, context, keep_always, offset, size, ret);
589 if (r != 0) {
590 m->n_hit ++;
591 return r;
592 }
593
594 m->n_missed++;
595
596 /* Create a new mmap */
597 return add_mmap(m, fd, prot, context, keep_always, offset, size, st, ret);
598 }
599
600 unsigned mmap_cache_get_hit(MMapCache *m) {
601 assert(m);
602
603 return m->n_hit;
604 }
605
606 unsigned mmap_cache_get_missed(MMapCache *m) {
607 assert(m);
608
609 return m->n_missed;
610 }
611
/* Drains the queue of addresses that raised SIGBUS (filled by the
 * sigbus handler, see sigbus.h), marks the owning files as poisoned,
 * and remaps all their windows to anonymous pages so no further SIGBUS
 * can be raised by them. Aborts if an address cannot be attributed to
 * any window, or if the queue overflowed — in both cases we can no
 * longer guarantee safe access. */
static void mmap_cache_process_sigbus(MMapCache *m) {
        bool found = false;
        FileDescriptor *f;
        Iterator i;
        int r;

        assert(m);

        /* Iterate through all triggered pages and mark their files as
         * invalidated */
        for (;;) {
                bool ours;
                void *addr;

                /* r == 0: queue empty; r < 0: queue overflowed */
                r = sigbus_pop(&addr);
                if (_likely_(r == 0))
                        break;
                if (r < 0) {
                        log_error_errno(r, "SIGBUS handling failed: %m");
                        abort();
                }

                /* Find the window containing the faulting address */
                ours = false;
                HASHMAP_FOREACH(f, m->fds, i) {
                        Window *w;

                        LIST_FOREACH(by_fd, w, f->windows) {
                                if ((uint8_t*) addr >= (uint8_t*) w->ptr &&
                                    (uint8_t*) addr < (uint8_t*) w->ptr + w->size) {
                                        found = ours = f->sigbus = true;
                                        break;
                                }
                        }

                        if (ours)
                                break;
                }

                /* Didn't find a matching window, give up */
                if (!ours) {
                        log_error("Unknown SIGBUS page, aborting.");
                        abort();
                }
        }

        /* The list of triggered pages is now empty. Now, let's remap
         * all windows of the triggered file to anonymous maps, so
         * that no page of the file in question is triggered again, so
         * that we can be sure not to hit the queue size limit. */
        if (_likely_(!found))
                return;

        HASHMAP_FOREACH(f, m->fds, i) {
                Window *w;

                if (!f->sigbus)
                        continue;

                LIST_FOREACH(by_fd, w, f->windows)
                        window_invalidate(w);
        }
}
674
675 bool mmap_cache_got_sigbus(MMapCache *m, int fd) {
676 FileDescriptor *f;
677
678 assert(m);
679 assert(fd >= 0);
680
681 mmap_cache_process_sigbus(m);
682
683 f = hashmap_get(m->fds, FD_TO_PTR(fd));
684 if (!f)
685 return false;
686
687 return f->sigbus;
688 }
689
690 void mmap_cache_close_fd(MMapCache *m, int fd) {
691 FileDescriptor *f;
692
693 assert(m);
694 assert(fd >= 0);
695
696 /* Make sure that any queued SIGBUS are first dispatched, so
697 * that we don't end up with a SIGBUS entry we cannot relate
698 * to any existing memory map */
699
700 mmap_cache_process_sigbus(m);
701
702 f = hashmap_get(m->fds, FD_TO_PTR(fd));
703 if (!f)
704 return;
705
706 fd_free(f);
707 }