]>
Commit | Line | Data |
---|---|---|
b2441318 | 1 | // SPDX-License-Identifier: GPL-2.0 |
87b2d440 DH |
2 | /* |
3 | * memfd GUP test-case | |
4 | * This tests memfd interactions with get_user_pages(). We require the | |
5 | * fuse_mnt.c program to provide a fake direct-IO FUSE mount-point for us. This | |
6 | * file-system delays _all_ reads by 1s and forces direct-IO. This means, any | |
7 | * read() on files in that file-system will pin the receive-buffer pages for at | |
8 | * least 1s via get_user_pages(). | |
9 | * | |
10 | * We use this trick to race ADD_SEALS against a write on a memfd object. The | |
11 | * ADD_SEALS must fail if the memfd pages are still pinned. Note that we use | |
12 | * the read() syscall with our memory-mapped memfd object as receive buffer to | |
13 | * force the kernel to write into our memfd object. | |
14 | */ | |
15 | ||
16 | #define _GNU_SOURCE | |
17 | #define __EXPORTED_HEADERS__ | |
18 | ||
19 | #include <errno.h> | |
20 | #include <inttypes.h> | |
21 | #include <limits.h> | |
22 | #include <linux/falloc.h> | |
23 | #include <linux/fcntl.h> | |
24 | #include <linux/memfd.h> | |
25 | #include <sched.h> | |
26 | #include <stdio.h> | |
27 | #include <stdlib.h> | |
28 | #include <signal.h> | |
29 | #include <string.h> | |
30 | #include <sys/mman.h> | |
31 | #include <sys/stat.h> | |
32 | #include <sys/syscall.h> | |
33 | #include <sys/wait.h> | |
34 | #include <unistd.h> | |
35 | ||
36 | #define MFD_DEF_SIZE 8192 | |
0e64f1d7 | 37 | #define STACK_SIZE 65536 |
87b2d440 DH |
38 | |
/*
 * Invoke the memfd_create system call directly through syscall(),
 * forwarding @name and @flags unchanged.
 *
 * Returns the syscall() result narrowed to int; callers in this file treat
 * a negative value as failure.
 */
static int sys_memfd_create(const char *name,
			    unsigned int flags)
{
	long ret = syscall(__NR_memfd_create, name, flags);

	return ret;
}
44 | ||
/*
 * Create a new memfd called @name with @flags and resize it to @sz bytes.
 *
 * Any failure (creation or ftruncate) prints a diagnostic and abort()s, so
 * the returned file descriptor is always valid.
 */
static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags)
{
	int mfd = sys_memfd_create(name, flags);

	if (mfd < 0) {
		printf("memfd_create(\"%s\", %u) failed: %m\n",
		       name, flags);
		abort();
	}

	/* a fresh memfd is resized to the requested length */
	if (ftruncate(mfd, sz) < 0) {
		printf("ftruncate(%llu) failed: %m\n", (unsigned long long)sz);
		abort();
	}

	return mfd;
}
64 | ||
/*
 * Query the seals currently set on @fd via fcntl(F_GET_SEALS).
 *
 * Returns the seal bitmask. If the query fails, a diagnostic is printed and
 * the process abort()s.
 */
static __u64 mfd_assert_get_seals(int fd)
{
	long seals = fcntl(fd, F_GET_SEALS);

	if (seals >= 0)
		return seals;

	printf("GET_SEALS(%d) failed: %m\n", fd);
	abort();
}
77 | ||
/*
 * Verify that the seals set on @fd are exactly @seals (not a subset or a
 * superset). On mismatch, print expected and actual values and abort().
 */
static void mfd_assert_has_seals(int fd, __u64 seals)
{
	__u64 current = mfd_assert_get_seals(fd);

	if (current == seals)
		return;

	printf("%llu != %llu = GET_SEALS(%d)\n",
	       (unsigned long long)seals, (unsigned long long)current, fd);
	abort();
}
89 | ||
/*
 * Add @seals to @fd via fcntl(F_ADD_SEALS) and abort() with a diagnostic if
 * the kernel refuses.
 *
 * The seals present before the call are fetched first so that the failure
 * message can show the attempted transition.
 */
static void mfd_assert_add_seals(int fd, __u64 seals)
{
	long r;
	__u64 s;

	s = mfd_assert_get_seals(fd);

	/*
	 * fcntl() is variadic and F_ADD_SEALS takes an int-sized argument.
	 * Passing a 64-bit value through the ellipsis is a type mismatch
	 * (and picks the wrong word on 32-bit big-endian targets), so narrow
	 * it explicitly.
	 */
	r = fcntl(fd, F_ADD_SEALS, (unsigned int)seals);
	if (r < 0) {
		printf("ADD_SEALS(%d, %llu -> %llu) failed: %m\n",
		       fd, (unsigned long long)s, (unsigned long long)seals);
		abort();
	}
}
103 | ||
/*
 * Try to add @seals to @fd, tolerating an EBUSY failure.
 *
 * Returns the fcntl(F_ADD_SEALS) result: non-negative if the seals were
 * added, negative if the call failed with EBUSY. Any other failure prints a
 * diagnostic and abort()s.
 */
static int mfd_busy_add_seals(int fd, __u64 seals)
{
	long r;
	__u64 s;

	/* current seals are only used for the diagnostic; 0 if unavailable */
	r = fcntl(fd, F_GET_SEALS);
	if (r < 0)
		s = 0;
	else
		s = r;

	/*
	 * fcntl() is variadic and F_ADD_SEALS takes an int-sized argument.
	 * Passing a 64-bit value through the ellipsis is a type mismatch
	 * (and picks the wrong word on 32-bit big-endian targets), so narrow
	 * it explicitly.
	 */
	r = fcntl(fd, F_ADD_SEALS, (unsigned int)seals);
	if (r < 0 && errno != EBUSY) {
		printf("ADD_SEALS(%d, %llu -> %llu) didn't fail as expected with EBUSY: %m\n",
		       fd, (unsigned long long)s, (unsigned long long)seals);
		abort();
	}

	return r;
}
124 | ||
125 | static void *mfd_assert_mmap_shared(int fd) | |
126 | { | |
127 | void *p; | |
128 | ||
129 | p = mmap(NULL, | |
130 | MFD_DEF_SIZE, | |
131 | PROT_READ | PROT_WRITE, | |
132 | MAP_SHARED, | |
133 | fd, | |
134 | 0); | |
135 | if (p == MAP_FAILED) { | |
136 | printf("mmap() failed: %m\n"); | |
137 | abort(); | |
138 | } | |
139 | ||
140 | return p; | |
141 | } | |
142 | ||
143 | static void *mfd_assert_mmap_private(int fd) | |
144 | { | |
145 | void *p; | |
146 | ||
147 | p = mmap(NULL, | |
148 | MFD_DEF_SIZE, | |
149 | PROT_READ | PROT_WRITE, | |
150 | MAP_PRIVATE, | |
151 | fd, | |
152 | 0); | |
153 | if (p == MAP_FAILED) { | |
154 | printf("mmap() failed: %m\n"); | |
155 | abort(); | |
156 | } | |
157 | ||
158 | return p; | |
159 | } | |
160 | ||
/*
 * State handed from main() to the sealing thread: the memfd to seal and the
 * shared mapping of it that the thread unmaps before sealing.
 */
static int global_mfd = -1;	/* -1 until main() stores the memfd */
static void *global_p;		/* static storage: starts out as NULL */
163 | ||
164 | static int sealing_thread_fn(void *arg) | |
165 | { | |
166 | int sig, r; | |
167 | ||
168 | /* | |
169 | * This thread first waits 200ms so any pending operation in the parent | |
170 | * is correctly started. After that, it tries to seal @global_mfd as | |
171 | * SEAL_WRITE. This _must_ fail as the parent thread has a read() into | |
172 | * that memory mapped object still ongoing. | |
173 | * We then wait one more second and try sealing again. This time it | |
174 | * must succeed as there shouldn't be anyone else pinning the pages. | |
175 | */ | |
176 | ||
177 | /* wait 200ms for FUSE-request to be active */ | |
178 | usleep(200000); | |
179 | ||
180 | /* unmount mapping before sealing to avoid i_mmap_writable failures */ | |
181 | munmap(global_p, MFD_DEF_SIZE); | |
182 | ||
183 | /* Try sealing the global file; expect EBUSY or success. Current | |
184 | * kernels will never succeed, but in the future, kernels might | |
185 | * implement page-replacements or other fancy ways to avoid racing | |
186 | * writes. */ | |
187 | r = mfd_busy_add_seals(global_mfd, F_SEAL_WRITE); | |
188 | if (r >= 0) { | |
189 | printf("HURRAY! This kernel fixed GUP races!\n"); | |
190 | } else { | |
191 | /* wait 1s more so the FUSE-request is done */ | |
192 | sleep(1); | |
193 | ||
194 | /* try sealing the global file again */ | |
195 | mfd_assert_add_seals(global_mfd, F_SEAL_WRITE); | |
196 | } | |
197 | ||
198 | return 0; | |
199 | } | |
200 | ||
201 | static pid_t spawn_sealing_thread(void) | |
202 | { | |
203 | uint8_t *stack; | |
204 | pid_t pid; | |
205 | ||
206 | stack = malloc(STACK_SIZE); | |
207 | if (!stack) { | |
208 | printf("malloc(STACK_SIZE) failed: %m\n"); | |
209 | abort(); | |
210 | } | |
211 | ||
212 | pid = clone(sealing_thread_fn, | |
213 | stack + STACK_SIZE, | |
214 | SIGCHLD | CLONE_FILES | CLONE_FS | CLONE_VM, | |
215 | NULL); | |
216 | if (pid < 0) { | |
217 | printf("clone() failed: %m\n"); | |
218 | abort(); | |
219 | } | |
220 | ||
221 | return pid; | |
222 | } | |
223 | ||
/*
 * Wait for the sealing thread @pid to terminate and reap it.
 *
 * The wait is restarted if it is interrupted by a signal. If waitpid()
 * fails, or the child did not exit normally with status 0 (for example
 * because one of its assertions abort()ed), a diagnostic is printed and the
 * process abort()s rather than continuing with an unverified result.
 */
static void join_sealing_thread(pid_t pid)
{
	int status = 0;
	pid_t r;

	do {
		r = waitpid(pid, &status, 0);
	} while (r < 0 && errno == EINTR);

	if (r < 0) {
		printf("waitpid(%d) failed: %m\n", (int)pid);
		abort();
	}

	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
		printf("sealing thread terminated abnormally (wait status %d)\n",
		       status);
		abort();
	}
}
228 | ||
229 | int main(int argc, char **argv) | |
230 | { | |
231 | static const char zero[MFD_DEF_SIZE]; | |
232 | int fd, mfd, r; | |
233 | void *p; | |
234 | int was_sealed; | |
235 | pid_t pid; | |
236 | ||
237 | if (argc < 2) { | |
238 | printf("error: please pass path to file in fuse_mnt mount-point\n"); | |
239 | abort(); | |
240 | } | |
241 | ||
242 | /* open FUSE memfd file for GUP testing */ | |
243 | printf("opening: %s\n", argv[1]); | |
244 | fd = open(argv[1], O_RDONLY | O_CLOEXEC); | |
245 | if (fd < 0) { | |
246 | printf("cannot open(\"%s\"): %m\n", argv[1]); | |
247 | abort(); | |
248 | } | |
249 | ||
250 | /* create new memfd-object */ | |
251 | mfd = mfd_assert_new("kern_memfd_fuse", | |
252 | MFD_DEF_SIZE, | |
253 | MFD_CLOEXEC | MFD_ALLOW_SEALING); | |
254 | ||
255 | /* mmap memfd-object for writing */ | |
256 | p = mfd_assert_mmap_shared(mfd); | |
257 | ||
258 | /* pass mfd+mapping to a separate sealing-thread which tries to seal | |
259 | * the memfd objects with SEAL_WRITE while we write into it */ | |
260 | global_mfd = mfd; | |
261 | global_p = p; | |
262 | pid = spawn_sealing_thread(); | |
263 | ||
264 | /* Use read() on the FUSE file to read into our memory-mapped memfd | |
265 | * object. This races the other thread which tries to seal the | |
266 | * memfd-object. | |
267 | * If @fd is on the memfd-fake-FUSE-FS, the read() is delayed by 1s. | |
268 | * This guarantees that the receive-buffer is pinned for 1s until the | |
269 | * data is written into it. The racing ADD_SEALS should thus fail as | |
270 | * the pages are still pinned. */ | |
271 | r = read(fd, p, MFD_DEF_SIZE); | |
272 | if (r < 0) { | |
273 | printf("read() failed: %m\n"); | |
274 | abort(); | |
275 | } else if (!r) { | |
276 | printf("unexpected EOF on read()\n"); | |
277 | abort(); | |
278 | } | |
279 | ||
280 | was_sealed = mfd_assert_get_seals(mfd) & F_SEAL_WRITE; | |
281 | ||
282 | /* Wait for sealing-thread to finish and verify that it | |
283 | * successfully sealed the file after the second try. */ | |
284 | join_sealing_thread(pid); | |
285 | mfd_assert_has_seals(mfd, F_SEAL_WRITE); | |
286 | ||
287 | /* *IF* the memfd-object was sealed at the time our read() returned, | |
288 | * then the kernel did a page-replacement or canceled the read() (or | |
289 | * whatever magic it did..). In that case, the memfd object is still | |
290 | * all zero. | |
291 | * In case the memfd-object was *not* sealed, the read() was successfull | |
292 | * and the memfd object must *not* be all zero. | |
293 | * Note that in real scenarios, there might be a mixture of both, but | |
294 | * in this test-cases, we have explicit 200ms delays which should be | |
295 | * enough to avoid any in-flight writes. */ | |
296 | ||
297 | p = mfd_assert_mmap_private(mfd); | |
298 | if (was_sealed && memcmp(p, zero, MFD_DEF_SIZE)) { | |
299 | printf("memfd sealed during read() but data not discarded\n"); | |
300 | abort(); | |
301 | } else if (!was_sealed && !memcmp(p, zero, MFD_DEF_SIZE)) { | |
302 | printf("memfd sealed after read() but data discarded\n"); | |
303 | abort(); | |
304 | } | |
305 | ||
306 | close(mfd); | |
307 | close(fd); | |
308 | ||
309 | printf("fuse: DONE\n"); | |
310 | ||
311 | return 0; | |
312 | } |