]> git.proxmox.com Git - zfsonlinux.git/blob - zfs/debian/patches/0011-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch
update ZFS to 0.7.13
[zfsonlinux.git] / zfs / debian / patches / 0011-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch
1 From: ilbsmart <wgqimut@gmail.com>
2 Date: Wed, 17 Oct 2018 02:11:24 +0800
3 Subject: deadlock between mm_sem and tx assign in zfs_write() and page fault
4
5 The bug time sequence:
6 1. Thread #1: `zfs_write` assigns a txg "n".
7 2. In the same process, thread #2 takes an mmap page fault (which means
8 `mm_sem` is held); `zfs_dirty_inode` fails to open a txg
9 and waits for the previous txg "n" to complete.
10 3. Thread #1 calls `uiomove` to write, but a page fault occurs
11 in `uiomove`, which means it needs `mm_sem`; `mm_sem` is held by
12 thread #2, so thread #1 is stuck and can't complete, and therefore
13 txg "n" will not complete.
14
15 So thread #1 and thread #2 are deadlocked.
16
17 Reviewed-by: Chunwei Chen <tuxoko@gmail.com>
18 Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
19 Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
20 Signed-off-by: Grady Wong <grady.w@xtaotech.com>
21 Closes #7939
22
23 (backported from: zfs-upstream 779a6c0bf6df76e0dd92c1ccf81f48512b835bb0)
24 Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
25 ---
26 include/sys/uio_impl.h | 2 +-
27 module/zcommon/zfs_uio.c | 31 ++++-
28 module/zfs/zfs_vnops.c | 24 +++-
29 tests/zfs-tests/cmd/mmapwrite/mmapwrite.c | 140 +++++++++++++++------
30 .../tests/functional/mmap/mmap_write_001_pos.ksh | 8 +-
31 5 files changed, 151 insertions(+), 54 deletions(-)
32
33 diff --git a/include/sys/uio_impl.h b/include/sys/uio_impl.h
34 index 37e283d..cfef0b9 100644
35 --- a/include/sys/uio_impl.h
36 +++ b/include/sys/uio_impl.h
37 @@ -42,7 +42,7 @@
38 #include <sys/uio.h>
39
40 extern int uiomove(void *, size_t, enum uio_rw, uio_t *);
41 -extern void uio_prefaultpages(ssize_t, uio_t *);
42 +extern int uio_prefaultpages(ssize_t, uio_t *);
43 extern int uiocopy(void *, size_t, enum uio_rw, uio_t *, size_t *);
44 extern void uioskip(uio_t *, size_t);
45
46 diff --git a/module/zcommon/zfs_uio.c b/module/zcommon/zfs_uio.c
47 index 7b4175b..8e969bb 100644
48 --- a/module/zcommon/zfs_uio.c
49 +++ b/module/zcommon/zfs_uio.c
50 @@ -50,6 +50,7 @@
51 #include <sys/types.h>
52 #include <sys/uio_impl.h>
53 #include <linux/kmap_compat.h>
54 +#include <linux/uaccess.h>
55
56 /*
57 * Move "n" bytes at byte address "p"; "rw" indicates the direction
58 @@ -77,8 +78,24 @@ uiomove_iov(void *p, size_t n, enum uio_rw rw, struct uio *uio)
59 if (copy_to_user(iov->iov_base+skip, p, cnt))
60 return (EFAULT);
61 } else {
62 - if (copy_from_user(p, iov->iov_base+skip, cnt))
63 - return (EFAULT);
64 + if (uio->uio_fault_disable) {
65 + if (!access_ok(VERIFY_READ,
66 + (iov->iov_base + skip), cnt)) {
67 + return (EFAULT);
68 + }
69 +
70 + pagefault_disable();
71 + if (__copy_from_user_inatomic(p,
72 + (iov->iov_base + skip), cnt)) {
73 + pagefault_enable();
74 + return (EFAULT);
75 + }
76 + pagefault_enable();
77 + } else {
78 + if (copy_from_user(p,
79 + (iov->iov_base + skip), cnt))
80 + return (EFAULT);
81 + }
82 }
83 break;
84 case UIO_SYSSPACE:
85 @@ -156,7 +173,7 @@ EXPORT_SYMBOL(uiomove);
86 * error will terminate the process as this is only a best attempt to get
87 * the pages resident.
88 */
89 -void
90 +int
91 uio_prefaultpages(ssize_t n, struct uio *uio)
92 {
93 const struct iovec *iov;
94 @@ -170,7 +187,7 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
95 switch (uio->uio_segflg) {
96 case UIO_SYSSPACE:
97 case UIO_BVEC:
98 - return;
99 + return (0);
100 case UIO_USERSPACE:
101 case UIO_USERISPACE:
102 break;
103 @@ -194,7 +211,7 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
104 p = iov->iov_base + skip;
105 while (cnt) {
106 if (fuword8((uint8_t *)p, &tmp))
107 - return;
108 + return (EFAULT);
109 incr = MIN(cnt, PAGESIZE);
110 p += incr;
111 cnt -= incr;
112 @@ -204,8 +221,10 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
113 */
114 p--;
115 if (fuword8((uint8_t *)p, &tmp))
116 - return;
117 + return (EFAULT);
118 }
119 +
120 + return (0);
121 }
122 EXPORT_SYMBOL(uio_prefaultpages);
123
124 diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
125 index 5a2e55e..c866352 100644
126 --- a/module/zfs/zfs_vnops.c
127 +++ b/module/zfs/zfs_vnops.c
128 @@ -675,7 +675,10 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
129 xuio = (xuio_t *)uio;
130 else
131 #endif
132 - uio_prefaultpages(MIN(n, max_blksz), uio);
133 + if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
134 + ZFS_EXIT(zfsvfs);
135 + return (SET_ERROR(EFAULT));
136 + }
137
138 /*
139 * If in append mode, set the io offset pointer to eof.
140 @@ -820,8 +823,19 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
141
142 if (abuf == NULL) {
143 tx_bytes = uio->uio_resid;
144 + uio->uio_fault_disable = B_TRUE;
145 error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
146 uio, nbytes, tx);
147 + if (error == EFAULT) {
148 + dmu_tx_commit(tx);
149 + if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
150 + break;
151 + }
152 + continue;
153 + } else if (error != 0) {
154 + dmu_tx_commit(tx);
155 + break;
156 + }
157 tx_bytes -= uio->uio_resid;
158 } else {
159 tx_bytes = nbytes;
160 @@ -921,8 +935,12 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
161 ASSERT(tx_bytes == nbytes);
162 n -= nbytes;
163
164 - if (!xuio && n > 0)
165 - uio_prefaultpages(MIN(n, max_blksz), uio);
166 + if (!xuio && n > 0) {
167 + if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
168 + error = EFAULT;
169 + break;
170 + }
171 + }
172 }
173
174 zfs_inode_update(zp);
175 diff --git a/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c b/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c
176 index 190d31a..b9915d5 100644
177 --- a/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c
178 +++ b/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c
179 @@ -31,74 +31,132 @@
180 #include <string.h>
181 #include <sys/mman.h>
182 #include <pthread.h>
183 +#include <errno.h>
184 +#include <err.h>
185
186 /*
187 * --------------------------------------------------------------------
188 - * Bug Id: 5032643
189 + * Bug Issue Id: #7512
190 + * The bug time sequence:
191 + * 1. context #1, zfs_write assign a txg "n".
192 + * 2. In the same process, context #2, mmap page fault (which means the mm_sem
193 + * is hold) occurred, zfs_dirty_inode open a txg failed, and wait previous
194 + * txg "n" completed.
195 + * 3. context #1 call uiomove to write, however page fault is occurred in
196 + * uiomove, which means it need mm_sem, but mm_sem is hold by
197 + * context #2, so it stuck and can't complete, then txg "n" will not
198 + * complete.
199 *
200 - * Simply writing to a file and mmaping that file at the same time can
201 - * result in deadlock. Nothing perverse like writing from the file's
202 - * own mapping is required.
203 + * So context #1 and context #2 trap into the "dead lock".
204 * --------------------------------------------------------------------
205 */
206
207 +#define NORMAL_WRITE_TH_NUM 2
208 +
209 static void *
210 -mapper(void *fdp)
211 +normal_writer(void *filename)
212 {
213 - void *addr;
214 - int fd = *(int *)fdp;
215 + char *file_path = filename;
216 + int fd = -1;
217 + ssize_t write_num = 0;
218 + int page_size = getpagesize();
219
220 - if ((addr =
221 - mmap(0, 8192, PROT_READ, MAP_SHARED, fd, 0)) == MAP_FAILED) {
222 - perror("mmap");
223 - exit(1);
224 + fd = open(file_path, O_RDWR | O_CREAT, 0777);
225 + if (fd == -1) {
226 + err(1, "failed to open %s", file_path);
227 }
228 - for (;;) {
229 - if (mmap(addr, 8192, PROT_READ,
230 - MAP_SHARED|MAP_FIXED, fd, 0) == MAP_FAILED) {
231 - perror("mmap");
232 - exit(1);
233 +
234 + char *buf = malloc(1);
235 + while (1) {
236 + write_num = write(fd, buf, 1);
237 + if (write_num == 0) {
238 + err(1, "write failed!");
239 + break;
240 }
241 + lseek(fd, page_size, SEEK_CUR);
242 + }
243 +
244 + if (buf) {
245 + free(buf);
246 }
247 - /* NOTREACHED */
248 - return ((void *)1);
249 }
250
251 -int
252 -main(int argc, char **argv)
253 +static void *
254 +map_writer(void *filename)
255 {
256 - int fd;
257 - char buf[1024];
258 - pthread_t tid;
259 + int fd = -1;
260 + int ret = 0;
261 + char *buf = NULL;
262 + int page_size = getpagesize();
263 + int op_errno = 0;
264 + char *file_path = filename;
265
266 - memset(buf, 'a', sizeof (buf));
267 + while (1) {
268 + ret = access(file_path, F_OK);
269 + if (ret) {
270 + op_errno = errno;
271 + if (op_errno == ENOENT) {
272 + fd = open(file_path, O_RDWR | O_CREAT, 0777);
273 + if (fd == -1) {
274 + err(1, "open file failed");
275 + }
276
277 - if (argc != 2) {
278 - (void) printf("usage: %s <file name>\n", argv[0]);
279 - exit(1);
280 - }
281 + ret = ftruncate(fd, page_size);
282 + if (ret == -1) {
283 + err(1, "truncate file failed");
284 + }
285 + } else {
286 + err(1, "access file failed!");
287 + }
288 + } else {
289 + fd = open(file_path, O_RDWR, 0777);
290 + if (fd == -1) {
291 + err(1, "open file failed");
292 + }
293 + }
294
295 - if ((fd = open(argv[1], O_RDWR|O_CREAT|O_TRUNC, 0666)) == -1) {
296 - perror("open");
297 - exit(1);
298 + if ((buf = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
299 + MAP_SHARED, fd, 0)) == MAP_FAILED) {
300 + err(1, "map file failed");
301 + }
302 +
303 + if (fd != -1)
304 + close(fd);
305 +
306 + char s[10] = {0, };
307 + memcpy(buf, s, 10);
308 + ret = munmap(buf, page_size);
309 + if (ret != 0) {
310 + err(1, "unmap file failed");
311 + }
312 }
313 +}
314
315 - (void) pthread_setconcurrency(2);
316 - if (pthread_create(&tid, NULL, mapper, &fd) != 0) {
317 - perror("pthread_create");
318 - close(fd);
319 +int
320 +main(int argc, char **argv)
321 +{
322 + pthread_t map_write_tid;
323 + pthread_t normal_write_tid[NORMAL_WRITE_TH_NUM];
324 + int i = 0;
325 +
326 + if (argc != 3) {
327 + (void) printf("usage: %s <normal write file name>"
328 + "<map write file name>\n", argv[0]);
329 exit(1);
330 }
331 - for (;;) {
332 - if (write(fd, buf, sizeof (buf)) == -1) {
333 - perror("write");
334 - close(fd);
335 - exit(1);
336 +
337 + for (i = 0; i < NORMAL_WRITE_TH_NUM; i++) {
338 + if (pthread_create(&normal_write_tid[i], NULL, normal_writer,
339 + argv[1])) {
340 + err(1, "pthread_create normal_writer failed.");
341 }
342 }
343
344 - close(fd);
345 + if (pthread_create(&map_write_tid, NULL, map_writer, argv[2])) {
346 + err(1, "pthread_create map_writer failed.");
347 + }
348
349 /* NOTREACHED */
350 + pthread_join(map_write_tid, NULL);
351 return (0);
352 }
353 diff --git a/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh b/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh
354 index 1eda971..24150b8 100755
355 --- a/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh
356 +++ b/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh
357 @@ -53,12 +53,14 @@ if ! is_mp; then
358 fi
359
360 log_must chmod 777 $TESTDIR
361 -mmapwrite $TESTDIR/test-write-file &
362 +mmapwrite $TESTDIR/normal_write_file $TESTDIR/map_write_file &
363 PID_MMAPWRITE=$!
364 -log_note "mmapwrite $TESTDIR/test-write-file pid: $PID_MMAPWRITE"
365 +log_note "mmapwrite $TESTDIR/normal_write_file $TESTDIR/map_write_file"\
366 + "pid: $PID_MMAPWRITE"
367 log_must sleep 30
368
369 log_must kill -9 $PID_MMAPWRITE
370 -log_must ls -l $TESTDIR/test-write-file
371 +log_must ls -l $TESTDIR/normal_write_file
372 +log_must ls -l $TESTDIR/map_write_file
373
374 log_pass "write(2) a mmap(2)'ing file succeeded."