1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <linux/limits.h>
30 #include <sys/select.h>
32 #include <sys/types.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
40 #include <linux/fiemap.h>
41 #include <sys/ioctl.h>
44 #include <sys/inotify.h>
46 #include <systemd/sd-daemon.h>
52 #include "readahead-common.h"
57 * - detect ssd on btrfs/lvm...
58 * - read ahead directories
61 * - handle files where nothing is in mincore
62 * - does ioprio_set work with fadvise()?
/* Shared-memory state (type from readahead-common.h): collect()
 * reads shared->replay to skip accesses made by the replay process,
 * and main_collect() publishes our pid in shared->collect. */
65 static ReadaheadShared
*shared
= NULL
;
/* Hashmap values store a disk sector number disguised as a pointer;
 * the +1/-1 shift keeps sector 0 distinct from a NULL value. */
67 /* Avoid collisions with the NULL pointer */
68 #define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
69 #define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
/* Ask btrfs to defragment the file referred to by fd, so that its
 * extents end up laid out contiguously on disk.
 *
 * Returns the ioctl()'s return value directly: 0 on success,
 * -1 with errno set on failure. */
static int btrfs_defrag(int fd) {
        /* Designated initializer zeroes the whole argument struct;
         * the visible chunk passed it to the kernel uninitialized. */
        struct btrfs_ioctl_vol_args data = { .fd = fd };

        return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
}
/* pack_file() — append one record for file 'fn' to the pack stream
 * 'pack': the inode number, followed by [begin, end) page-range
 * pairs for every run of pages currently resident in the page cache
 * (probed with mincore()), terminated by a repeated end marker.
 *
 * NOTE(review): this chunk is an incomplete extraction (the embedded
 * original line numbers jump), so several statements are missing
 * here: declarations of st/l/vec/b/c/mapped/pages/inode, the
 * emission of the file name, most error paths, the btrfs defrag
 * call and munmap(). Comments below describe only what is visible. */
80 static int pack_file(FILE *pack
, const char *fn
, bool on_btrfs
) {
82 void *start
= MAP_FAILED
;
88 int r
= 0, fd
= -1, k
;
/* Open read-only; refuse symlinks and ttys, avoid atime updates. */
93 fd
= open(fn
, O_RDONLY
|O_CLOEXEC
|O_NOATIME
|O_NOCTTY
|O_NOFOLLOW
);
/* NOTE(review): EPERM/EACCES is presumably the O_NOATIME failure on
 * files we don't own; the retry without O_NOATIME is not visible in
 * this chunk — confirm against the full source. */
99 if (errno
== EPERM
|| errno
== EACCES
)
102 log_warning("open(%s) failed: %m", fn
);
/* Validate the file (presumably rejecting anything larger than
 * arg_file_size_max — see readahead-common.h) and stat it into st. */
107 k
= file_verify(fd
, fn
, arg_file_size_max
, &st
);
/* Map the whole (page-aligned) file so mincore() can be queried. */
116 l
= PAGE_ALIGN(st
.st_size
);
117 start
= mmap(NULL
, l
, PROT_READ
, MAP_SHARED
, fd
, 0);
118 if (start
== MAP_FAILED
) {
119 log_warning("mmap(%s) failed: %m", fn
);
/* One vector byte per page; bit 0 set = page resident in core. */
124 pages
= l
/ page_size();
126 memset(vec
, 0, pages
);
127 if (mincore(start
, l
, vec
) < 0) {
128 log_warning("mincore(%s) failed: %m", fn
);
136 /* Store the inode, so that we notice when the file is deleted */
137 inode
= (uint64_t) st
.st_ino
;
138 fwrite(&inode
, sizeof(inode
), 1, pack
);
/* Walk the mincore() vector; each transition resident->absent emits
 * one [b, c) range pair (b set on the absent->resident transition,
 * whose statement is not visible in this chunk). */
141 for (c
= 0; c
< pages
; c
++) {
142 bool new_mapped
= !!(vec
[c
] & 1);
144 if (!mapped
&& new_mapped
)
146 else if (mapped
&& !new_mapped
) {
147 fwrite(&b
, sizeof(b
), 1, pack
);
148 fwrite(&c
, sizeof(c
), 1, pack
);
150 log_debug("%s: page %u to %u", fn
, b
, c
);
156 /* We don't write any range data if we should read the entire file */
157 if (mapped
&& b
> 0) {
158 fwrite(&b
, sizeof(b
), 1, pack
);
159 fwrite(&c
, sizeof(c
), 1, pack
);
161 log_debug("%s: page %u to %u", fn
, b
, c
);
/* Two identical values presumably terminate the record —
 * TODO(review): confirm against the replay-side parser. */
166 fwrite(&b
, sizeof(b
), 1, pack
);
167 fwrite(&b
, sizeof(b
), 1, pack
);
/* Cleanup: unmap (munmap call not visible here) and close the fd. */
170 if (start
!= MAP_FAILED
)
174 close_nointr_nofail(fd
);
/* Return the physical on-disk byte offset of the first extent of
 * the file behind fd, queried via the FS_IOC_FIEMAP ioctl. Used as
 * a sort key so files can be read back in disk order.
 *
 * Returns 0 when the position cannot be determined (ioctl failure,
 * no mapped extents, or the extent location is unknown) — 0 doubles
 * as the "don't know" marker for the caller. */
static unsigned long fd_first_block(int fd) {
        /* struct fiemap carries a flexible extent array; reserve room
         * for exactly one extent right after it. */
        struct {
                struct fiemap fiemap;
                struct fiemap_extent extent;
        } data;

        memset(&data, 0, sizeof(data));
        /* Whole file, but we only care about the first extent. */
        data.fiemap.fm_length = ~0ULL;
        data.fiemap.fm_extent_count = 1;

        if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)
                return 0;

        if (data.fiemap.fm_mapped_extents <= 0)
                return 0;

        if (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN)
                return 0;

        return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
}
206 static int qsort_compare(const void *a
, const void *b
) {
207 const struct item
*i
, *j
;
212 if (i
->block
< j
->block
)
214 if (i
->block
> j
->block
)
217 return strcmp(i
->path
, j
->path
);
/* collect() — watch file accesses below 'root' with fanotify for a
 * bounded time window, remember every accessed file together with
 * its first on-disk block, then write the result (via pack_file())
 * to "<root>/.readahead" for later replay at boot.
 *
 * NOTE(review): this chunk is an incomplete extraction (the embedded
 * original line numbers jump): the enum header, the main for(;;)
 * loop header, many error/cleanup paths and several declarations
 * (mask, not_after, t, h, n, data, fn, p, q, i, pack, sfs, bytes,
 * my_pid, step, ul) are not visible here. Comments describe only
 * what is visible. */
220 static int collect(const char *root
) {
222 FD_FANOTIFY
, /* Get the actual fs events */
224 FD_INOTIFY
, /* We get notifications to quit early via this fd */
227 struct pollfd pollfd
[_FD_MAX
];
228 int fanotify_fd
= -1, signal_fd
= -1, inotify_fd
= -1, r
= 0;
230 Hashmap
*files
= NULL
;
235 char *pack_fn_new
= NULL
, *pack_fn
= NULL
;
236 bool on_ssd
, on_btrfs
;
239 uint64_t previous_block_readahead
;
240 bool previous_block_readahead_set
= false;
/* Final pack file lives directly below the watched root. */
244 if (asprintf(&pack_fn
, "%s/.readahead", root
) < 0) {
249 /* If there's no pack file yet we lower the kernel readahead
250 * so that mincore() is accurate. If there is a pack file
251 * already we assume it is accurate enough so that kernel
252 * readahead is never triggered. */
253 previous_block_readahead_set
=
254 access(pack_fn
, F_OK
) < 0 &&
255 block_get_readahead(root
, &previous_block_readahead
) >= 0 &&
256 block_set_readahead(root
, 8*1024) >= 0;
/* Run at idle I/O priority so collection does not slow down boot. */
258 if (ioprio_set(IOPRIO_WHO_PROCESS
, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE
, 0)) < 0)
259 log_warning("Failed to set IDLE IO priority class: %m");
/* Block SIGINT/SIGTERM and receive them through signalfd() so they
 * can be handled inside the poll loop below. */
261 assert_se(sigemptyset(&mask
) == 0);
262 sigset_add_many(&mask
, SIGINT
, SIGTERM
, -1);
263 assert_se(sigprocmask(SIG_SETMASK
, &mask
, NULL
) == 0);
265 if ((signal_fd
= signalfd(-1, &mask
, SFD_NONBLOCK
|SFD_CLOEXEC
)) < 0) {
266 log_error("signalfd(): %m");
/* Maps accessed path -> SECTOR_TO_PTR(first disk block). */
271 if (!(files
= hashmap_new(string_hash_func
, string_compare_func
))) {
272 log_error("Failed to allocate set.");
/* Get an event (with an open fd) for every file opened anywhere on
 * the mount tree containing root. */
277 if ((fanotify_fd
= fanotify_init(FAN_CLOEXEC
|FAN_NONBLOCK
, O_RDONLY
|O_LARGEFILE
|O_CLOEXEC
|O_NOATIME
)) < 0) {
278 log_error("Failed to create fanotify object: %m");
283 if (fanotify_mark(fanotify_fd
, FAN_MARK_ADD
|FAN_MARK_MOUNT
, FAN_OPEN
, AT_FDCWD
, root
) < 0) {
284 log_error("Failed to mark %s: %m", root
);
289 if ((inotify_fd
= open_inotify()) < 0) {
/* Hard deadline for the collection phase. */
294 not_after
= now(CLOCK_MONOTONIC
) + arg_timeout
;
/* Poll on all three event sources at once. */
299 pollfd
[FD_FANOTIFY
].fd
= fanotify_fd
;
300 pollfd
[FD_FANOTIFY
].events
= POLLIN
;
301 pollfd
[FD_SIGNAL
].fd
= signal_fd
;
302 pollfd
[FD_SIGNAL
].events
= POLLIN
;
303 pollfd
[FD_INOTIFY
].fd
= inotify_fd
;
304 pollfd
[FD_INOTIFY
].events
= POLLIN
;
308 "STATUS=Collecting readahead data");
310 log_debug("Collecting...");
/* Flag files let other readahead processes cancel or finish the
 * collection early. */
312 if (access("/run/systemd/readahead/cancel", F_OK
) >= 0) {
313 log_debug("Collection canceled");
318 if (access("/run/systemd/readahead/done", F_OK
) >= 0) {
319 log_debug("Got termination request");
325 struct fanotify_event_metadata metadata
;
329 struct fanotify_event_metadata
*m
;
/* Stop when enough files were collected... */
333 if (hashmap_size(files
) > arg_files_max
) {
334 log_debug("Reached maximum number of read ahead files, ending collection.");
/* ...or when the deadline has passed. */
338 t
= now(CLOCK_MONOTONIC
);
339 if (t
>= not_after
) {
340 log_debug("Reached maximum collection time, ending collection.");
/* Wait for events, at most until the deadline. */
344 if ((h
= poll(pollfd
, _FD_MAX
, (int) ((not_after
- t
) / USEC_PER_MSEC
))) < 0) {
349 log_error("poll(): %m");
355 log_debug("Reached maximum collection time, ending collection.");
359 if (pollfd
[FD_SIGNAL
].revents
) {
360 log_debug("Got signal.");
/* Drain the inotify fd: creation of the "cancel"/"done" flag files
 * ends collection early. */
364 if (pollfd
[FD_INOTIFY
].revents
) {
365 uint8_t inotify_buffer
[sizeof(struct inotify_event
) + FILENAME_MAX
];
366 struct inotify_event
*e
;
368 if ((n
= read(inotify_fd
, &inotify_buffer
, sizeof(inotify_buffer
))) < 0) {
369 if (errno
== EINTR
|| errno
== EAGAIN
)
372 log_error("Failed to read inotify event: %m");
377 e
= (struct inotify_event
*) inotify_buffer
;
381 if ((e
->mask
& IN_CREATE
) && streq(e
->name
, "cancel")) {
382 log_debug("Collection canceled");
387 if ((e
->mask
& IN_CREATE
) && streq(e
->name
, "done")) {
388 log_debug("Got termination request");
/* Advance to the next (variable-length) inotify record. */
392 step
= sizeof(struct inotify_event
) + e
->len
;
393 assert(step
<= (size_t) n
);
395 e
= (struct inotify_event
*) ((uint8_t*) e
+ step
);
/* Read a batch of fanotify events. */
400 if ((n
= read(fanotify_fd
, &data
, sizeof(data
))) < 0) {
402 if (errno
== EINTR
|| errno
== EAGAIN
)
405 /* fanotify sometimes returns EACCES on read()
406 * where it shouldn't. For now let's just
407 * ignore it here (which is safe), but
408 * eventually this should be
409 * dropped when the kernel is fixed.
411 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
415 log_error("Failed to read event: %m");
/* Walk every event in the batch. */
420 for (m
= &data
.metadata
; FAN_EVENT_OK(m
, n
); m
= FAN_EVENT_NEXT(m
, n
)) {
/* Ignore opens caused by ourselves or the replay process. */
427 if (m
->pid
== my_pid
)
430 __sync_synchronize();
431 if (m
->pid
== shared
->replay
)
/* Resolve the event's fd to a path via /proc/self/fd. */
434 snprintf(fn
, sizeof(fn
), "/proc/self/fd/%i", m
->fd
);
437 if ((k
= readlink_malloc(fn
, &p
)) >= 0) {
438 if (startswith(p
, "/tmp") ||
439 endswith(p
, " (deleted)") ||
440 hashmap_get(files
, p
))
441 /* Not interesting, or
/* Remember the file's first disk block for later sorting. */
447 ul
= fd_first_block(m
->fd
);
449 if ((k
= hashmap_put(files
, p
, SECTOR_TO_PTR(ul
))) < 0) {
450 log_warning("set_put() failed: %s", strerror(-k
));
456 log_warning("readlink(%s) failed: %s", fn
, strerror(-k
));
/* fanotify hands us an open fd per event; always close it. */
460 close_nointr_nofail(m
->fd
);
/* Collection finished: stop fanotify before writing the pack file,
 * so our own writes don't generate events. */
465 if (fanotify_fd
>= 0) {
466 close_nointr_nofail(fanotify_fd
);
470 log_debug("Writing Pack File...");
/* The medium/filesystem decides the pack ordering below. */
472 on_ssd
= fs_on_ssd(root
) > 0;
473 log_debug("On SSD: %s", yes_no(on_ssd
));
475 on_btrfs
= statfs(root
, &sfs
) >= 0 && (long) sfs
.f_type
== (long) BTRFS_SUPER_MAGIC
;
476 log_debug("On btrfs: %s", yes_no(on_btrfs
));
/* Write to a temporary file first; renamed into place below. */
478 if (asprintf(&pack_fn_new
, "%s/.readahead.new", root
) < 0) {
483 pack
= fopen(pack_fn_new
, "we");
485 log_error("Failed to open pack file: %m");
/* Header: host/version magic plus an S(SD)/R(otating) tag. */
490 fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION
, pack
);
491 putc(on_ssd
? 'S' : 'R', pack
);
493 if (on_ssd
|| on_btrfs
) {
495 /* On SSD or on btrfs, just write things out in the
496 * order the files were accessed. */
498 HASHMAP_FOREACH_KEY(q
, p
, files
, i
)
499 pack_file(pack
, p
, on_btrfs
);
501 struct item
*ordered
, *j
;
504 /* On rotating media, order things by the block
507 log_debug("Ordering...");
/* Copy the hashmap into a flat array, sort by disk block, then
 * pack the files in that order. */
509 n
= hashmap_size(files
);
510 if (!(ordered
= new(struct item
, n
))) {
516 HASHMAP_FOREACH_KEY(q
, p
, files
, i
) {
518 j
->block
= PTR_TO_SECTOR(q
);
522 assert(ordered
+ n
== j
);
524 qsort(ordered
, n
, sizeof(struct item
), qsort_compare
);
526 for (k
= 0; k
< n
; k
++)
527 pack_file(pack
, ordered
[k
].path
, on_btrfs
);
532 log_debug("Finalizing...");
537 log_error("Failed to write pack file.");
/* Atomically replace any previous pack file. */
542 if (rename(pack_fn_new
, pack_fn
) < 0) {
543 log_error("Failed to rename readahead file: %m");
/* Cleanup path: close whatever is still open. */
554 if (fanotify_fd
>= 0)
555 close_nointr_nofail(fanotify_fd
);
558 close_nointr_nofail(signal_fd
);
561 close_nointr_nofail(inotify_fd
);
/* Free the collected path strings owned by the hashmap. */
570 while ((p
= hashmap_steal_first_key(files
)))
575 if (previous_block_readahead_set
) {
578 /* Restore the original kernel readahead setting if we
579 * changed it, and nobody has overwritten it since
581 if (block_get_readahead(root
, &bytes
) >= 0 && bytes
== 8*1024)
582 block_set_readahead(root
, previous_block_readahead
);
588 int main_collect(const char *root
) {
593 /* Skip this step on read-only media. Note that we check the
594 * underlying block device here, not he read-only flag of the
595 * file system on top, since that one is most likely mounted
596 * read-only anyway at boot, even if the underlying block
597 * device is theoretically writable. */
598 if (fs_on_read_only(root
) > 0) {
599 log_info("Disabling readahead collector due to read-only media.");
604 log_info("Disabling readahead collector due to low memory.");
608 shared
= shared_get();
612 shared
->collect
= getpid();
613 __sync_synchronize();
615 if (collect(root
) < 0)