]>
Commit | Line | Data |
---|---|---|
0e9b5cd6 AG |
1 | /* |
2 | * Linux UFFD-WP support | |
3 | * | |
4 | * Copyright Virtuozzo GmbH, 2020 | |
5 | * | |
6 | * Authors: | |
7 | * Andrey Gruzdev <andrey.gruzdev@virtuozzo.com> | |
8 | * | |
9 | * This work is licensed under the terms of the GNU GPL, version 2 or | |
10 | * later. See the COPYING file in the top-level directory. | |
11 | */ | |
12 | ||
13 | #include "qemu/osdep.h" | |
14 | #include "qemu/bitops.h" | |
15 | #include "qemu/error-report.h" | |
16 | #include "qemu/userfaultfd.h" | |
17 | #include "trace.h" | |
18 | #include <poll.h> | |
19 | #include <sys/syscall.h> | |
20 | #include <sys/ioctl.h> | |
c40c0463 PX |
21 | #include <fcntl.h> |
22 | ||
/*
 * Strategy used by uffd_open() to create userfaultfd descriptors.
 * The zero value marks the state before the strategy has been detected.
 */
typedef enum {
    /* No detection has been done yet */
    UFFD_UNINITIALIZED = 0,
    /* Descriptors are created through the /dev/userfaultfd device */
    UFFD_USE_DEV_PATH = 1,
    /* Descriptors are created through the userfaultfd system call */
    UFFD_USE_SYSCALL = 2,
} uffd_open_mode;
0e9b5cd6 | 28 | |
d5890ea0 PX |
29 | int uffd_open(int flags) |
30 | { | |
31 | #if defined(__NR_userfaultfd) | |
c40c0463 PX |
32 | static uffd_open_mode open_mode; |
33 | static int uffd_dev; | |
34 | ||
35 | /* Detect how to generate uffd desc when run the 1st time */ | |
36 | if (open_mode == UFFD_UNINITIALIZED) { | |
37 | /* | |
38 | * Make /dev/userfaultfd the default approach because it has better | |
39 | * permission controls, meanwhile allows kernel faults without any | |
40 | * privilege requirement (e.g. SYS_CAP_PTRACE). | |
41 | */ | |
42 | uffd_dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); | |
43 | if (uffd_dev >= 0) { | |
44 | open_mode = UFFD_USE_DEV_PATH; | |
45 | } else { | |
46 | /* Fallback to the system call */ | |
47 | open_mode = UFFD_USE_SYSCALL; | |
48 | } | |
49 | trace_uffd_detect_open_mode(open_mode); | |
50 | } | |
51 | ||
52 | if (open_mode == UFFD_USE_DEV_PATH) { | |
53 | assert(uffd_dev >= 0); | |
54 | return ioctl(uffd_dev, USERFAULTFD_IOC_NEW, flags); | |
55 | } | |
56 | ||
d5890ea0 PX |
57 | return syscall(__NR_userfaultfd, flags); |
58 | #else | |
59 | return -EINVAL; | |
60 | #endif | |
61 | } | |
62 | ||
0e9b5cd6 AG |
/**
 * uffd_query_features: query UFFD features
 *
 * Opens a temporary UFFD descriptor, performs the UFFDIO_API handshake
 * with no features requested, stores the feature mask filled in by the
 * kernel and closes the descriptor again.
 *
 * Returns: 0 on success, negative value in case of an error
 *
 * @features: parameter to receive 'uffdio_api.features'
 */
int uffd_query_features(uint64_t *features)
{
    struct uffdio_api api = { .api = UFFD_API, .features = 0 };
    int fd = uffd_open(O_CLOEXEC);

    if (fd < 0) {
        trace_uffd_query_features_nosys(errno);
        return -1;
    }

    if (ioctl(fd, UFFDIO_API, &api) != 0) {
        trace_uffd_query_features_api_failed(errno);
        close(fd);
        return -1;
    }

    *features = api.features;
    close(fd);
    return 0;
}
96 | ||
/**
 * uffd_create_fd: create UFFD file descriptor
 *
 * Opens a UFFD descriptor, negotiates the requested features through
 * UFFDIO_API and verifies that the kernel reports both the REGISTER and
 * UNREGISTER ioctls as available. The descriptor is closed again if any
 * of these steps fails.
 *
 * Returns non-negative file descriptor or negative value in case of an error
 *
 * @features: UFFD features to request
 * @non_blocking: create UFFD file descriptor for non-blocking operation
 */
int uffd_create_fd(uint64_t features, bool non_blocking)
{
    const uint64_t required = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER);
    struct uffdio_api api = { .api = UFFD_API, .features = features };
    int open_flags = O_CLOEXEC;
    int fd;

    if (non_blocking) {
        open_flags |= O_NONBLOCK;
    }

    fd = uffd_open(open_flags);
    if (fd < 0) {
        trace_uffd_create_fd_nosys(errno);
        return -1;
    }

    if (ioctl(fd, UFFDIO_API, &api) != 0) {
        trace_uffd_create_fd_api_failed(errno);
        close(fd);
        return -1;
    }

    /* Every bit of the required mask must be present in the reply */
    if ((api.ioctls & required) != required) {
        trace_uffd_create_fd_api_noioctl(required, api.ioctls);
        close(fd);
        return -1;
    }

    return fd;
}
136 | ||
/**
 * uffd_close_fd: close UFFD file descriptor
 *
 * Passing a negative descriptor is treated as a programming error and
 * trips an assertion.
 *
 * @uffd_fd: UFFD file descriptor
 */
void uffd_close_fd(int uffd_fd)
{
    /* Catch callers that hand in a descriptor which was never valid */
    assert(uffd_fd >= 0);

    close(uffd_fd);
}
147 | ||
/**
 * uffd_register_memory: register memory range via UFFD-IO
 *
 * Issues UFFDIO_REGISTER for the range and, when requested, hands back
 * the ioctl mask reported for it.
 *
 * Returns 0 in case of success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 * @mode: UFFD register mode (UFFDIO_REGISTER_MODE_MISSING, ...)
 * @ioctls: optional pointer to receive supported IOCTL mask
 */
int uffd_register_memory(int uffd_fd, void *addr, uint64_t length,
                         uint64_t mode, uint64_t *ioctls)
{
    struct uffdio_register reg = {
        .range = { .start = (uintptr_t) addr, .len = length },
        .mode = mode,
    };

    if (ioctl(uffd_fd, UFFDIO_REGISTER, &reg) != 0) {
        trace_uffd_register_memory_failed(addr, length, mode, errno);
        return -1;
    }

    /* The mask is only reported to callers that asked for it */
    if (ioctls != NULL) {
        *ioctls = reg.ioctls;
    }

    return 0;
}
178 | ||
/**
 * uffd_unregister_memory: un-register memory range with UFFD-IO
 *
 * Issues UFFDIO_UNREGISTER for the given range.
 *
 * Returns 0 in case of success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 */
int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length)
{
    struct uffdio_range range = {
        .start = (uintptr_t) addr,
        .len = length,
    };

    if (ioctl(uffd_fd, UFFDIO_UNREGISTER, &range) != 0) {
        trace_uffd_unregister_memory_failed(addr, length, errno);
        return -1;
    }

    return 0;
}
202 | ||
/**
 * uffd_change_protection: protect/un-protect memory range for writes via UFFD-IO
 *
 * Issues UFFDIO_WRITEPROTECT for the range. A failure is reported through
 * error_report() together with the request parameters and errno.
 *
 * Returns 0 on success, negative value in case of error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 * @wp: write-protect/unprotect
 * @dont_wake: do not wake threads waiting on wr-protected page
 */
int uffd_change_protection(int uffd_fd, void *addr, uint64_t length,
                           bool wp, bool dont_wake)
{
    struct uffdio_writeprotect req = {
        .range = { .start = (uintptr_t) addr, .len = length },
    };

    if (wp) {
        /* @dont_wake is ignored when protection is being applied */
        req.mode = UFFDIO_WRITEPROTECT_MODE_WP;
    } else if (dont_wake) {
        /* DONTWAKE is meaningful only on protection release */
        req.mode = UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
    } else {
        req.mode = 0;
    }

    if (ioctl(uffd_fd, UFFDIO_WRITEPROTECT, &req) != 0) {
        error_report("uffd_change_protection() failed: addr=%p len=%" PRIu64
                     " mode=%" PRIx64 " errno=%i", addr, length,
                     (uint64_t) req.mode, errno);
        return -1;
    }

    return 0;
}
237 | ||
/**
 * uffd_copy_page: copy range of pages to destination via UFFD-IO
 *
 * Copy range of source pages to the destination to resolve
 * missing page fault somewhere in the destination range.
 * A failure is reported through error_report() together with the
 * request parameters and errno.
 *
 * Returns 0 on success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @dst_addr: destination base address
 * @src_addr: source base address
 * @length: length of the range to copy
 * @dont_wake: do not wake threads waiting on missing page
 */
int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr,
                   uint64_t length, bool dont_wake)
{
    struct uffdio_copy req = {
        .dst = (uintptr_t) dst_addr,
        .src = (uintptr_t) src_addr,
        .len = length,
        .mode = 0,
    };

    if (dont_wake) {
        req.mode = UFFDIO_COPY_MODE_DONTWAKE;
    }

    if (ioctl(uffd_fd, UFFDIO_COPY, &req) != 0) {
        error_report("uffd_copy_page() failed: dst_addr=%p src_addr=%p length=%" PRIu64
                     " mode=%" PRIx64 " errno=%i", dst_addr, src_addr,
                     length, (uint64_t) req.mode, errno);
        return -1;
    }

    return 0;
}
271 | ||
/**
 * uffd_zero_page: fill range of pages with zeroes via UFFD-IO
 *
 * Fill range pages with zeroes to resolve missing page fault within the range.
 * A failure is reported through error_report() together with the
 * request parameters and errno.
 *
 * Returns 0 on success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address
 * @length: length of the range to fill with zeroes
 * @dont_wake: do not wake threads waiting on missing page
 */
int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake)
{
    struct uffdio_zeropage req = {
        .range = { .start = (uintptr_t) addr, .len = length },
        .mode = 0,
    };

    if (dont_wake) {
        req.mode = UFFDIO_ZEROPAGE_MODE_DONTWAKE;
    }

    if (ioctl(uffd_fd, UFFDIO_ZEROPAGE, &req) != 0) {
        error_report("uffd_zero_page() failed: addr=%p length=%" PRIu64
                     " mode=%" PRIx64 " errno=%i", addr, length,
                     (uint64_t) req.mode, errno);
        return -1;
    }

    return 0;
}
301 | ||
/**
 * uffd_wakeup: wake up threads waiting on page UFFD-managed page fault resolution
 *
 * Wake up threads waiting on any page/pages from the designated range.
 * The main use case is when during some period, page faults are resolved
 * via UFFD-IO IOCTLs with MODE_DONTWAKE flag set, then after that all waits
 * for the whole memory range are satisfied in a single call to uffd_wakeup().
 * A failure is reported through error_report() together with errno.
 *
 * Returns 0 on success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address
 * @length: length of the range
 */
int uffd_wakeup(int uffd_fd, void *addr, uint64_t length)
{
    struct uffdio_range range = {
        .start = (uintptr_t) addr,
        .len = length,
    };

    if (ioctl(uffd_fd, UFFDIO_WAKE, &range) != 0) {
        error_report("uffd_wakeup() failed: addr=%p length=%" PRIu64 " errno=%i",
                     addr, length, errno);
        return -1;
    }

    return 0;
}
331 | ||
/**
 * uffd_read_events: read pending UFFD events
 *
 * The read is restarted when it is interrupted by a signal (EINTR).
 * EAGAIN is not treated as an error: it is reported as "no messages".
 *
 * Returns number of fetched messages, 0 if none is available or
 * negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @msgs: pointer to message buffer
 * @count: number of messages that can fit in the buffer
 */
int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count)
{
    ssize_t res;

    /*
     * A negative count would be converted to a huge unsigned byte size
     * below and let the kernel write beyond the caller's buffer.
     */
    if (count < 0) {
        errno = EINVAL;
        error_report("uffd_read_events() failed: errno=%i", errno);
        return -1;
    }

    do {
        res = read(uffd_fd, msgs, (size_t) count * sizeof(struct uffd_msg));
    } while (res < 0 && errno == EINTR);

    if ((res < 0 && errno == EAGAIN)) {
        return 0;
    }
    if (res < 0) {
        error_report("uffd_read_events() failed: errno=%i", errno);
        return -1;
    }

    /* Convert the number of bytes read into a number of whole messages */
    return (int) (res / sizeof(struct uffd_msg));
}
359 | ||
/**
 * uffd_poll_events: poll UFFD file descriptor for read
 *
 * The poll is restarted when it is interrupted by a signal (EINTR).
 * Any other poll failure is reported through error_report() and
 * yields false, the same result as a timeout.
 *
 * Returns true if events are available for read, false otherwise
 *
 * @uffd_fd: UFFD file descriptor
 * @tmo: timeout value
 */
bool uffd_poll_events(int uffd_fd, int tmo)
{
    struct pollfd pfd = { .fd = uffd_fd, .events = POLLIN, .revents = 0 };
    int nready;

    for (;;) {
        nready = poll(&pfd, 1, tmo);
        if (nready >= 0 || errno != EINTR) {
            break;
        }
    }

    if (nready < 0) {
        error_report("uffd_poll_events() failed: errno=%i", errno);
        return false;
    }
    if (nready == 0) {
        /* Timed out without any event */
        return false;
    }

    return (pfd.revents & POLLIN) != 0;
}