]> git.proxmox.com Git - zfsonlinux.git/blob - zfs-patches/0007-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch
Backport deadlock fix for issue ZOL#7939
[zfsonlinux.git] / zfs-patches / 0007-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch
1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: ilbsmart <wgqimut@gmail.com>
3 Date: Wed, 17 Oct 2018 02:11:24 +0800
4 Subject: [PATCH] deadlock between mm_sem and tx assign in zfs_write() and page
5 fault
6
7 The bug time sequence:
8 1. thread #1, `zfs_write` assign a txg "n".
9 2. In a same process, thread #2, mmap page fault (which means the
10 `mm_sem` is hold) occurred, `zfs_dirty_inode` open a txg failed,
11 and wait previous txg "n" completed.
12 3. thread #1 call `uiomove` to write, however page fault is occurred
13 in `uiomove`, which means it need `mm_sem`, but `mm_sem` is hold by
14 thread #2, so it stuck and can't complete, then txg "n" will
15 not complete.
16
17 So thread #1 and thread #2 are deadlocked.
18
19 Reviewed-by: Chunwei Chen <tuxoko@gmail.com>
20 Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
21 Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
22 Signed-off-by: Grady Wong <grady.w@xtaotech.com>
23 Closes #7939
24
25 (backported from: zfs-upstream 779a6c0bf6df76e0dd92c1ccf81f48512b835bb0)
26 Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
27 ---
28 include/sys/uio_impl.h | 2 +-
29 module/zcommon/zfs_uio.c | 31 ++++-
30 module/zfs/zfs_vnops.c | 24 +++-
31 tests/zfs-tests/cmd/mmapwrite/mmapwrite.c | 140 +++++++++++++++------
32 .../tests/functional/mmap/mmap_write_001_pos.ksh | 8 +-
33 5 files changed, 151 insertions(+), 54 deletions(-)
34
35 diff --git a/include/sys/uio_impl.h b/include/sys/uio_impl.h
36 index 37e283da..cfef0b95 100644
37 --- a/include/sys/uio_impl.h
38 +++ b/include/sys/uio_impl.h
39 @@ -42,7 +42,7 @@
40 #include <sys/uio.h>
41
42 extern int uiomove(void *, size_t, enum uio_rw, uio_t *);
43 -extern void uio_prefaultpages(ssize_t, uio_t *);
44 +extern int uio_prefaultpages(ssize_t, uio_t *);
45 extern int uiocopy(void *, size_t, enum uio_rw, uio_t *, size_t *);
46 extern void uioskip(uio_t *, size_t);
47
48 diff --git a/module/zcommon/zfs_uio.c b/module/zcommon/zfs_uio.c
49 index 7b4175bb..8e969bbc 100644
50 --- a/module/zcommon/zfs_uio.c
51 +++ b/module/zcommon/zfs_uio.c
52 @@ -50,6 +50,7 @@
53 #include <sys/types.h>
54 #include <sys/uio_impl.h>
55 #include <linux/kmap_compat.h>
56 +#include <linux/uaccess.h>
57
58 /*
59 * Move "n" bytes at byte address "p"; "rw" indicates the direction
60 @@ -77,8 +78,24 @@ uiomove_iov(void *p, size_t n, enum uio_rw rw, struct uio *uio)
61 if (copy_to_user(iov->iov_base+skip, p, cnt))
62 return (EFAULT);
63 } else {
64 - if (copy_from_user(p, iov->iov_base+skip, cnt))
65 - return (EFAULT);
66 + if (uio->uio_fault_disable) {
67 + if (!access_ok(VERIFY_READ,
68 + (iov->iov_base + skip), cnt)) {
69 + return (EFAULT);
70 + }
71 +
72 + pagefault_disable();
73 + if (__copy_from_user_inatomic(p,
74 + (iov->iov_base + skip), cnt)) {
75 + pagefault_enable();
76 + return (EFAULT);
77 + }
78 + pagefault_enable();
79 + } else {
80 + if (copy_from_user(p,
81 + (iov->iov_base + skip), cnt))
82 + return (EFAULT);
83 + }
84 }
85 break;
86 case UIO_SYSSPACE:
87 @@ -156,7 +173,7 @@ EXPORT_SYMBOL(uiomove);
88 * error will terminate the process as this is only a best attempt to get
89 * the pages resident.
90 */
91 -void
92 +int
93 uio_prefaultpages(ssize_t n, struct uio *uio)
94 {
95 const struct iovec *iov;
96 @@ -170,7 +187,7 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
97 switch (uio->uio_segflg) {
98 case UIO_SYSSPACE:
99 case UIO_BVEC:
100 - return;
101 + return (0);
102 case UIO_USERSPACE:
103 case UIO_USERISPACE:
104 break;
105 @@ -194,7 +211,7 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
106 p = iov->iov_base + skip;
107 while (cnt) {
108 if (fuword8((uint8_t *)p, &tmp))
109 - return;
110 + return (EFAULT);
111 incr = MIN(cnt, PAGESIZE);
112 p += incr;
113 cnt -= incr;
114 @@ -204,8 +221,10 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
115 */
116 p--;
117 if (fuword8((uint8_t *)p, &tmp))
118 - return;
119 + return (EFAULT);
120 }
121 +
122 + return (0);
123 }
124 EXPORT_SYMBOL(uio_prefaultpages);
125
126 diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
127 index 5a2e55eb..c866352d 100644
128 --- a/module/zfs/zfs_vnops.c
129 +++ b/module/zfs/zfs_vnops.c
130 @@ -675,7 +675,10 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
131 xuio = (xuio_t *)uio;
132 else
133 #endif
134 - uio_prefaultpages(MIN(n, max_blksz), uio);
135 + if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
136 + ZFS_EXIT(zfsvfs);
137 + return (SET_ERROR(EFAULT));
138 + }
139
140 /*
141 * If in append mode, set the io offset pointer to eof.
142 @@ -820,8 +823,19 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
143
144 if (abuf == NULL) {
145 tx_bytes = uio->uio_resid;
146 + uio->uio_fault_disable = B_TRUE;
147 error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
148 uio, nbytes, tx);
149 + if (error == EFAULT) {
150 + dmu_tx_commit(tx);
151 + if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
152 + break;
153 + }
154 + continue;
155 + } else if (error != 0) {
156 + dmu_tx_commit(tx);
157 + break;
158 + }
159 tx_bytes -= uio->uio_resid;
160 } else {
161 tx_bytes = nbytes;
162 @@ -921,8 +935,12 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
163 ASSERT(tx_bytes == nbytes);
164 n -= nbytes;
165
166 - if (!xuio && n > 0)
167 - uio_prefaultpages(MIN(n, max_blksz), uio);
168 + if (!xuio && n > 0) {
169 + if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
170 + error = EFAULT;
171 + break;
172 + }
173 + }
174 }
175
176 zfs_inode_update(zp);
177 diff --git a/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c b/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c
178 index 190d31af..b9915d5d 100644
179 --- a/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c
180 +++ b/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c
181 @@ -31,74 +31,132 @@
182 #include <string.h>
183 #include <sys/mman.h>
184 #include <pthread.h>
185 +#include <errno.h>
186 +#include <err.h>
187
188 /*
189 * --------------------------------------------------------------------
190 - * Bug Id: 5032643
191 + * Bug Issue Id: #7512
192 + * The bug time sequence:
193 + * 1. context #1, zfs_write assign a txg "n".
194 + * 2. In the same process, context #2, mmap page fault (which means the mm_sem
195 + * is hold) occurred, zfs_dirty_inode open a txg failed, and wait previous
196 + * txg "n" completed.
197 + * 3. context #1 call uiomove to write, however page fault is occurred in
198 + * uiomove, which means it need mm_sem, but mm_sem is hold by
199 + * context #2, so it stuck and can't complete, then txg "n" will not
200 + * complete.
201 *
202 - * Simply writing to a file and mmaping that file at the same time can
203 - * result in deadlock. Nothing perverse like writing from the file's
204 - * own mapping is required.
205 + * So context #1 and context #2 trap into the "dead lock".
206 * --------------------------------------------------------------------
207 */
208
209 +#define NORMAL_WRITE_TH_NUM 2
210 +
211 static void *
212 -mapper(void *fdp)
213 +normal_writer(void *filename)
214 {
215 - void *addr;
216 - int fd = *(int *)fdp;
217 + char *file_path = filename;
218 + int fd = -1;
219 + ssize_t write_num = 0;
220 + int page_size = getpagesize();
221
222 - if ((addr =
223 - mmap(0, 8192, PROT_READ, MAP_SHARED, fd, 0)) == MAP_FAILED) {
224 - perror("mmap");
225 - exit(1);
226 + fd = open(file_path, O_RDWR | O_CREAT, 0777);
227 + if (fd == -1) {
228 + err(1, "failed to open %s", file_path);
229 }
230 - for (;;) {
231 - if (mmap(addr, 8192, PROT_READ,
232 - MAP_SHARED|MAP_FIXED, fd, 0) == MAP_FAILED) {
233 - perror("mmap");
234 - exit(1);
235 +
236 + char *buf = malloc(1);
237 + while (1) {
238 + write_num = write(fd, buf, 1);
239 + if (write_num == 0) {
240 + err(1, "write failed!");
241 + break;
242 }
243 + lseek(fd, page_size, SEEK_CUR);
244 + }
245 +
246 + if (buf) {
247 + free(buf);
248 }
249 - /* NOTREACHED */
250 - return ((void *)1);
251 }
252
253 -int
254 -main(int argc, char **argv)
255 +static void *
256 +map_writer(void *filename)
257 {
258 - int fd;
259 - char buf[1024];
260 - pthread_t tid;
261 + int fd = -1;
262 + int ret = 0;
263 + char *buf = NULL;
264 + int page_size = getpagesize();
265 + int op_errno = 0;
266 + char *file_path = filename;
267
268 - memset(buf, 'a', sizeof (buf));
269 + while (1) {
270 + ret = access(file_path, F_OK);
271 + if (ret) {
272 + op_errno = errno;
273 + if (op_errno == ENOENT) {
274 + fd = open(file_path, O_RDWR | O_CREAT, 0777);
275 + if (fd == -1) {
276 + err(1, "open file failed");
277 + }
278
279 - if (argc != 2) {
280 - (void) printf("usage: %s <file name>\n", argv[0]);
281 - exit(1);
282 - }
283 + ret = ftruncate(fd, page_size);
284 + if (ret == -1) {
285 + err(1, "truncate file failed");
286 + }
287 + } else {
288 + err(1, "access file failed!");
289 + }
290 + } else {
291 + fd = open(file_path, O_RDWR, 0777);
292 + if (fd == -1) {
293 + err(1, "open file failed");
294 + }
295 + }
296
297 - if ((fd = open(argv[1], O_RDWR|O_CREAT|O_TRUNC, 0666)) == -1) {
298 - perror("open");
299 - exit(1);
300 + if ((buf = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
301 + MAP_SHARED, fd, 0)) == MAP_FAILED) {
302 + err(1, "map file failed");
303 + }
304 +
305 + if (fd != -1)
306 + close(fd);
307 +
308 + char s[10] = {0, };
309 + memcpy(buf, s, 10);
310 + ret = munmap(buf, page_size);
311 + if (ret != 0) {
312 + err(1, "unmap file failed");
313 + }
314 }
315 +}
316
317 - (void) pthread_setconcurrency(2);
318 - if (pthread_create(&tid, NULL, mapper, &fd) != 0) {
319 - perror("pthread_create");
320 - close(fd);
321 +int
322 +main(int argc, char **argv)
323 +{
324 + pthread_t map_write_tid;
325 + pthread_t normal_write_tid[NORMAL_WRITE_TH_NUM];
326 + int i = 0;
327 +
328 + if (argc != 3) {
329 + (void) printf("usage: %s <normal write file name>"
330 + "<map write file name>\n", argv[0]);
331 exit(1);
332 }
333 - for (;;) {
334 - if (write(fd, buf, sizeof (buf)) == -1) {
335 - perror("write");
336 - close(fd);
337 - exit(1);
338 +
339 + for (i = 0; i < NORMAL_WRITE_TH_NUM; i++) {
340 + if (pthread_create(&normal_write_tid[i], NULL, normal_writer,
341 + argv[1])) {
342 + err(1, "pthread_create normal_writer failed.");
343 }
344 }
345
346 - close(fd);
347 + if (pthread_create(&map_write_tid, NULL, map_writer, argv[2])) {
348 + err(1, "pthread_create map_writer failed.");
349 + }
350
351 /* NOTREACHED */
352 + pthread_join(map_write_tid, NULL);
353 return (0);
354 }
355 diff --git a/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh b/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh
356 index 1eda9710..24150b82 100755
357 --- a/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh
358 +++ b/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh
359 @@ -53,12 +53,14 @@ if ! is_mp; then
360 fi
361
362 log_must chmod 777 $TESTDIR
363 -mmapwrite $TESTDIR/test-write-file &
364 +mmapwrite $TESTDIR/normal_write_file $TESTDIR/map_write_file &
365 PID_MMAPWRITE=$!
366 -log_note "mmapwrite $TESTDIR/test-write-file pid: $PID_MMAPWRITE"
367 +log_note "mmapwrite $TESTDIR/normal_write_file $TESTDIR/map_write_file"\
368 + "pid: $PID_MMAPWRITE"
369 log_must sleep 30
370
371 log_must kill -9 $PID_MMAPWRITE
372 -log_must ls -l $TESTDIR/test-write-file
373 +log_must ls -l $TESTDIR/normal_write_file
374 +log_must ls -l $TESTDIR/map_write_file
375
376 log_pass "write(2) a mmap(2)'ing file succeeded."