]> git.proxmox.com Git - zfsonlinux.git/commitdiff
Backport deadlock fix for issue ZOL#7939
authorStoiko Ivanov <s.ivanov@proxmox.com>
Tue, 30 Oct 2018 10:14:27 +0000 (11:14 +0100)
committerThomas Lamprecht <t.lamprecht@proxmox.com>
Tue, 30 Oct 2018 13:24:19 +0000 (14:24 +0100)
A deadlock issue got fixed upstream and merged into master [0].
This patch backports the fix by splitting it into the parts for SPL and ZFS.

[0] https://github.com/zfsonlinux/zfs/pull/7939

Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
spl-patches/0002-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch [new file with mode: 0644]
spl-patches/series
zfs-patches/0007-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch [new file with mode: 0644]
zfs-patches/series

diff --git a/spl-patches/0002-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch b/spl-patches/0002-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch
new file mode 100644 (file)
index 0000000..2906ef8
--- /dev/null
@@ -0,0 +1,42 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: ilbsmart <wgqimut@gmail.com>
+Date: Wed, 17 Oct 2018 02:11:24 +0800
+Subject: [PATCH] deadlock between mm_sem and tx assign in zfs_write() and page
+ fault
+
+The bug time sequence:
+1. thread #1, `zfs_write` assign a txg "n".
+2. In a same process, thread #2, mmap page fault (which means the
+   `mm_sem` is hold) occurred, `zfs_dirty_inode` open a txg failed,
+   and wait previous txg "n" completed.
+3. thread #1 call `uiomove` to write, however page fault is occurred
+   in `uiomove`, which means it need `mm_sem`, but `mm_sem` is hold by
+   thread #2, so it stuck and can't complete,  then txg "n" will
+   not complete.
+
+So thread #1 and thread #2 are deadlocked.
+
+Reviewed-by: Chunwei Chen <tuxoko@gmail.com>
+Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
+Signed-off-by: Grady Wong <grady.w@xtaotech.com>
+Closes #7939
+
+(backported from: zfs-upstream 779a6c0bf6df76e0dd92c1ccf81f48512b835bb0)
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ include/sys/uio.h | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/include/sys/uio.h b/include/sys/uio.h
+index 764beb9..2895690 100644
+--- a/include/sys/uio.h
++++ b/include/sys/uio.h
+@@ -53,6 +53,7 @@ typedef struct uio {
+       int             uio_iovcnt;
+       offset_t        uio_loffset;
+       uio_seg_t       uio_segflg;
++      boolean_t       uio_fault_disable;
+       uint16_t        uio_fmode;
+       uint16_t        uio_extflg;
+       offset_t        uio_limit;
index 20724b7034ca69eabaff292e9bfcc9df0dca464a..5bcfa6a3546a2401737af8536d01eadc974d37a9 100644 (file)
@@ -1 +1,2 @@
 0001-remove-DKMS-and-module-build.patch
+0002-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch
diff --git a/zfs-patches/0007-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch b/zfs-patches/0007-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch
new file mode 100644 (file)
index 0000000..6872d8b
--- /dev/null
@@ -0,0 +1,376 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: ilbsmart <wgqimut@gmail.com>
+Date: Wed, 17 Oct 2018 02:11:24 +0800
+Subject: [PATCH] deadlock between mm_sem and tx assign in zfs_write() and page
+ fault
+
+The bug time sequence:
+1. thread #1, `zfs_write` assign a txg "n".
+2. In a same process, thread #2, mmap page fault (which means the
+   `mm_sem` is hold) occurred, `zfs_dirty_inode` open a txg failed,
+   and wait previous txg "n" completed.
+3. thread #1 call `uiomove` to write, however page fault is occurred
+   in `uiomove`, which means it need `mm_sem`, but `mm_sem` is hold by
+   thread #2, so it stuck and can't complete,  then txg "n" will
+   not complete.
+
+So thread #1 and thread #2 are deadlocked.
+
+Reviewed-by: Chunwei Chen <tuxoko@gmail.com>
+Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
+Signed-off-by: Grady Wong <grady.w@xtaotech.com>
+Closes #7939
+
+(backported from: zfs-upstream 779a6c0bf6df76e0dd92c1ccf81f48512b835bb0)
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ include/sys/uio_impl.h                             |   2 +-
+ module/zcommon/zfs_uio.c                           |  31 ++++-
+ module/zfs/zfs_vnops.c                             |  24 +++-
+ tests/zfs-tests/cmd/mmapwrite/mmapwrite.c          | 140 +++++++++++++++------
+ .../tests/functional/mmap/mmap_write_001_pos.ksh   |   8 +-
+ 5 files changed, 151 insertions(+), 54 deletions(-)
+
+diff --git a/include/sys/uio_impl.h b/include/sys/uio_impl.h
+index 37e283da..cfef0b95 100644
+--- a/include/sys/uio_impl.h
++++ b/include/sys/uio_impl.h
+@@ -42,7 +42,7 @@
+ #include <sys/uio.h>
+ extern int uiomove(void *, size_t, enum uio_rw, uio_t *);
+-extern void uio_prefaultpages(ssize_t, uio_t *);
++extern int uio_prefaultpages(ssize_t, uio_t *);
+ extern int uiocopy(void *, size_t, enum uio_rw, uio_t *, size_t *);
+ extern void uioskip(uio_t *, size_t);
+diff --git a/module/zcommon/zfs_uio.c b/module/zcommon/zfs_uio.c
+index 7b4175bb..8e969bbc 100644
+--- a/module/zcommon/zfs_uio.c
++++ b/module/zcommon/zfs_uio.c
+@@ -50,6 +50,7 @@
+ #include <sys/types.h>
+ #include <sys/uio_impl.h>
+ #include <linux/kmap_compat.h>
++#include <linux/uaccess.h>
+ /*
+  * Move "n" bytes at byte address "p"; "rw" indicates the direction
+@@ -77,8 +78,24 @@ uiomove_iov(void *p, size_t n, enum uio_rw rw, struct uio *uio)
+                               if (copy_to_user(iov->iov_base+skip, p, cnt))
+                                       return (EFAULT);
+                       } else {
+-                              if (copy_from_user(p, iov->iov_base+skip, cnt))
+-                                      return (EFAULT);
++                              if (uio->uio_fault_disable) {
++                                      if (!access_ok(VERIFY_READ,
++                                          (iov->iov_base + skip), cnt)) {
++                                              return (EFAULT);
++                                      }
++
++                                      pagefault_disable();
++                                      if (__copy_from_user_inatomic(p,
++                                          (iov->iov_base + skip), cnt)) {
++                                              pagefault_enable();
++                                              return (EFAULT);
++                                      }
++                                      pagefault_enable();
++                              } else {
++                                      if (copy_from_user(p,
++                                          (iov->iov_base + skip), cnt))
++                                              return (EFAULT);
++                              }
+                       }
+                       break;
+               case UIO_SYSSPACE:
+@@ -156,7 +173,7 @@ EXPORT_SYMBOL(uiomove);
+  * error will terminate the process as this is only a best attempt to get
+  * the pages resident.
+  */
+-void
++int
+ uio_prefaultpages(ssize_t n, struct uio *uio)
+ {
+       const struct iovec *iov;
+@@ -170,7 +187,7 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
+       switch (uio->uio_segflg) {
+               case UIO_SYSSPACE:
+               case UIO_BVEC:
+-                      return;
++                      return (0);
+               case UIO_USERSPACE:
+               case UIO_USERISPACE:
+                       break;
+@@ -194,7 +211,7 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
+               p = iov->iov_base + skip;
+               while (cnt) {
+                       if (fuword8((uint8_t *)p, &tmp))
+-                              return;
++                              return (EFAULT);
+                       incr = MIN(cnt, PAGESIZE);
+                       p += incr;
+                       cnt -= incr;
+@@ -204,8 +221,10 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
+                */
+               p--;
+               if (fuword8((uint8_t *)p, &tmp))
+-                      return;
++                      return (EFAULT);
+       }
++
++      return (0);
+ }
+ EXPORT_SYMBOL(uio_prefaultpages);
+diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
+index 5a2e55eb..c866352d 100644
+--- a/module/zfs/zfs_vnops.c
++++ b/module/zfs/zfs_vnops.c
+@@ -675,7 +675,10 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
+               xuio = (xuio_t *)uio;
+       else
+ #endif
+-              uio_prefaultpages(MIN(n, max_blksz), uio);
++              if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
++                      ZFS_EXIT(zfsvfs);
++                      return (SET_ERROR(EFAULT));
++              }
+       /*
+        * If in append mode, set the io offset pointer to eof.
+@@ -820,8 +823,19 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
+               if (abuf == NULL) {
+                       tx_bytes = uio->uio_resid;
++                      uio->uio_fault_disable = B_TRUE;
+                       error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+                           uio, nbytes, tx);
++                      if (error == EFAULT) {
++                              dmu_tx_commit(tx);
++                              if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
++                                      break;
++                              }
++                              continue;
++                      } else if (error != 0) {
++                              dmu_tx_commit(tx);
++                              break;
++                      }
+                       tx_bytes -= uio->uio_resid;
+               } else {
+                       tx_bytes = nbytes;
+@@ -921,8 +935,12 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
+               ASSERT(tx_bytes == nbytes);
+               n -= nbytes;
+-              if (!xuio && n > 0)
+-                      uio_prefaultpages(MIN(n, max_blksz), uio);
++              if (!xuio && n > 0) {
++                      if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
++                              error = EFAULT;
++                              break;
++                      }
++              }
+       }
+       zfs_inode_update(zp);
+diff --git a/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c b/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c
+index 190d31af..b9915d5d 100644
+--- a/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c
++++ b/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c
+@@ -31,74 +31,132 @@
+ #include <string.h>
+ #include <sys/mman.h>
+ #include <pthread.h>
++#include <errno.h>
++#include <err.h>
+ /*
+  * --------------------------------------------------------------------
+- * Bug Id: 5032643
++ * Bug Issue Id: #7512
++ * The bug time sequence:
++ * 1. context #1, zfs_write assign a txg "n".
++ * 2. In the same process, context #2, mmap page fault (which means the mm_sem
++ *    is hold) occurred, zfs_dirty_inode open a txg failed, and wait previous
++ *    txg "n" completed.
++ * 3. context #1 call uiomove to write, however page fault is occurred in
++ *    uiomove, which means it need mm_sem, but mm_sem is hold by
++ *    context #2, so it stuck and can't complete, then txg "n" will not
++ *    complete.
+  *
+- * Simply writing to a file and mmaping that file at the same time can
+- * result in deadlock.  Nothing perverse like writing from the file's
+- * own mapping is required.
++ * So context #1 and context #2 trap into the "dead lock".
+  * --------------------------------------------------------------------
+  */
++#define       NORMAL_WRITE_TH_NUM     2
++
+ static void *
+-mapper(void *fdp)
++normal_writer(void *filename)
+ {
+-      void *addr;
+-      int fd = *(int *)fdp;
++      char *file_path = filename;
++      int fd = -1;
++      ssize_t write_num = 0;
++      int page_size = getpagesize();
+-      if ((addr =
+-          mmap(0, 8192, PROT_READ, MAP_SHARED, fd, 0)) == MAP_FAILED) {
+-              perror("mmap");
+-              exit(1);
++      fd = open(file_path, O_RDWR | O_CREAT, 0777);
++      if (fd == -1) {
++              err(1, "failed to open %s", file_path);
+       }
+-      for (;;) {
+-              if (mmap(addr, 8192, PROT_READ,
+-                  MAP_SHARED|MAP_FIXED, fd, 0) == MAP_FAILED) {
+-                      perror("mmap");
+-                      exit(1);
++
++      char *buf = malloc(1);
++      while (1) {
++              write_num = write(fd, buf, 1);
++              if (write_num == 0) {
++                      err(1, "write failed!");
++                      break;
+               }
++              lseek(fd, page_size, SEEK_CUR);
++      }
++
++      if (buf) {
++              free(buf);
+       }
+-      /* NOTREACHED */
+-      return ((void *)1);
+ }
+-int
+-main(int argc, char **argv)
++static void *
++map_writer(void *filename)
+ {
+-      int fd;
+-      char buf[1024];
+-      pthread_t tid;
++      int fd = -1;
++      int ret = 0;
++      char *buf = NULL;
++      int page_size = getpagesize();
++      int op_errno = 0;
++      char *file_path = filename;
+-      memset(buf, 'a', sizeof (buf));
++      while (1) {
++              ret = access(file_path, F_OK);
++              if (ret) {
++                      op_errno = errno;
++                      if (op_errno == ENOENT) {
++                              fd = open(file_path, O_RDWR | O_CREAT, 0777);
++                              if (fd == -1) {
++                                      err(1, "open file failed");
++                              }
+-      if (argc != 2) {
+-              (void) printf("usage: %s <file name>\n", argv[0]);
+-              exit(1);
+-      }
++                              ret = ftruncate(fd, page_size);
++                              if (ret == -1) {
++                                      err(1, "truncate file failed");
++                              }
++                      } else {
++                              err(1, "access file failed!");
++                      }
++              } else {
++                      fd = open(file_path, O_RDWR, 0777);
++                      if (fd == -1) {
++                              err(1, "open file failed");
++                      }
++              }
+-      if ((fd = open(argv[1], O_RDWR|O_CREAT|O_TRUNC, 0666)) == -1) {
+-              perror("open");
+-              exit(1);
++              if ((buf = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
++                  MAP_SHARED, fd, 0)) == MAP_FAILED) {
++                      err(1, "map file failed");
++              }
++
++              if (fd != -1)
++                      close(fd);
++
++              char s[10] = {0, };
++              memcpy(buf, s, 10);
++              ret = munmap(buf, page_size);
++              if (ret != 0) {
++                      err(1, "unmap file failed");
++              }
+       }
++}
+-      (void) pthread_setconcurrency(2);
+-      if (pthread_create(&tid, NULL, mapper, &fd) != 0) {
+-              perror("pthread_create");
+-              close(fd);
++int
++main(int argc, char **argv)
++{
++      pthread_t map_write_tid;
++      pthread_t normal_write_tid[NORMAL_WRITE_TH_NUM];
++      int i = 0;
++
++      if (argc != 3) {
++              (void) printf("usage: %s <normal write file name>"
++                  "<map write file name>\n", argv[0]);
+               exit(1);
+       }
+-      for (;;) {
+-              if (write(fd, buf, sizeof (buf)) == -1) {
+-                      perror("write");
+-                      close(fd);
+-                      exit(1);
++
++      for (i = 0; i < NORMAL_WRITE_TH_NUM; i++) {
++              if (pthread_create(&normal_write_tid[i], NULL, normal_writer,
++                  argv[1])) {
++                      err(1, "pthread_create normal_writer failed.");
+               }
+       }
+-      close(fd);
++      if (pthread_create(&map_write_tid, NULL, map_writer, argv[2])) {
++              err(1, "pthread_create map_writer failed.");
++      }
+       /* NOTREACHED */
++      pthread_join(map_write_tid, NULL);
+       return (0);
+ }
+diff --git a/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh b/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh
+index 1eda9710..24150b82 100755
+--- a/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh
++++ b/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh
+@@ -53,12 +53,14 @@ if ! is_mp; then
+ fi
+ log_must chmod 777 $TESTDIR
+-mmapwrite $TESTDIR/test-write-file &
++mmapwrite $TESTDIR/normal_write_file $TESTDIR/map_write_file &
+ PID_MMAPWRITE=$!
+-log_note "mmapwrite $TESTDIR/test-write-file pid: $PID_MMAPWRITE"
++log_note "mmapwrite $TESTDIR/normal_write_file $TESTDIR/map_write_file"\
++       "pid: $PID_MMAPWRITE"
+ log_must sleep 30
+ log_must kill -9 $PID_MMAPWRITE
+-log_must ls -l $TESTDIR/test-write-file
++log_must ls -l $TESTDIR/normal_write_file
++log_must ls -l $TESTDIR/map_write_file
+ log_pass "write(2) a mmap(2)'ing file succeeded."
index 3b4626fb7adc447056fe991bcf8dc0bfe2d1ed62..1146f65b2c487e31cab3af7d444be912a4e35241 100644 (file)
@@ -4,3 +4,4 @@
 0004-Fix-deadlock-between-zfs-umount-snapentry_expire.patch
 0005-Fix-race-in-dnode_check_slots_free.patch
 0006-Reduce-taskq-and-context-switch-cost-of-zio-pipe.patch
+0007-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch