From 1bb9f57dcf20ce121c8c5e7dc2584d7e0820920d Mon Sep 17 00:00:00 2001 From: Stoiko Ivanov Date: Tue, 30 Oct 2018 11:14:27 +0100 Subject: [PATCH] Backport deadlock fix for issue ZOL#7939 A deadlock issue got fixed upstream and merged into master [0]. This patch backports the fix by splitting it into the parts for SPL and ZFS. [0] https://github.com/zfsonlinux/zfs/pull/7939 Signed-off-by: Stoiko Ivanov --- ...-mm_sem-and-tx-assign-in-zfs_write-a.patch | 42 ++ spl-patches/series | 1 + ...-mm_sem-and-tx-assign-in-zfs_write-a.patch | 376 ++++++++++++++++++ zfs-patches/series | 1 + 4 files changed, 420 insertions(+) create mode 100644 spl-patches/0002-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch create mode 100644 zfs-patches/0007-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch diff --git a/spl-patches/0002-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch b/spl-patches/0002-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch new file mode 100644 index 0000000..2906ef8 --- /dev/null +++ b/spl-patches/0002-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch @@ -0,0 +1,42 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: ilbsmart +Date: Wed, 17 Oct 2018 02:11:24 +0800 +Subject: [PATCH] deadlock between mm_sem and tx assign in zfs_write() and page + fault + +The bug time sequence: +1. thread #1, `zfs_write` assign a txg "n". +2. In a same process, thread #2, mmap page fault (which means the + `mm_sem` is hold) occurred, `zfs_dirty_inode` open a txg failed, + and wait previous txg "n" completed. +3. thread #1 call `uiomove` to write, however page fault is occurred + in `uiomove`, which means it need `mm_sem`, but `mm_sem` is hold by + thread #2, so it stuck and can't complete, then txg "n" will + not complete. + +So thread #1 and thread #2 are deadlocked. + +Reviewed-by: Chunwei Chen +Reviewed-by: Brian Behlendorf +Reviewed-by: Matthew Ahrens +Signed-off-by: Grady Wong +Closes #7939 + +(backported from: zfs-upstream 779a6c0bf6df76e0dd92c1ccf81f48512b835bb0) +Signed-off-by: Stoiko Ivanov +--- + include/sys/uio.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/include/sys/uio.h b/include/sys/uio.h +index 764beb9..2895690 100644 +--- a/include/sys/uio.h ++++ b/include/sys/uio.h +@@ -53,6 +53,7 @@ typedef struct uio { + int uio_iovcnt; + offset_t uio_loffset; + uio_seg_t uio_segflg; ++ boolean_t uio_fault_disable; + uint16_t uio_fmode; + uint16_t uio_extflg; + offset_t uio_limit; diff --git a/spl-patches/series b/spl-patches/series index 20724b7..5bcfa6a 100644 --- a/spl-patches/series +++ b/spl-patches/series @@ -1 +1,2 @@ 0001-remove-DKMS-and-module-build.patch +0002-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch diff --git a/zfs-patches/0007-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch b/zfs-patches/0007-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch new file mode 100644 index 0000000..6872d8b --- /dev/null +++ b/zfs-patches/0007-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch @@ -0,0 +1,376 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: ilbsmart +Date: Wed, 17 Oct 2018 02:11:24 +0800 +Subject: [PATCH] deadlock between mm_sem and tx assign in zfs_write() and page + fault + +The bug time sequence: +1. thread #1, `zfs_write` assign a txg "n". +2. In a same process, thread #2, mmap page fault (which means the + `mm_sem` is hold) occurred, `zfs_dirty_inode` open a txg failed, + and wait previous txg "n" completed. +3. thread #1 call `uiomove` to write, however page fault is occurred + in `uiomove`, which means it need `mm_sem`, but `mm_sem` is hold by + thread #2, so it stuck and can't complete, then txg "n" will + not complete. + +So thread #1 and thread #2 are deadlocked. + +Reviewed-by: Chunwei Chen +Reviewed-by: Brian Behlendorf +Reviewed-by: Matthew Ahrens +Signed-off-by: Grady Wong +Closes #7939 + +(backported from: zfs-upstream 779a6c0bf6df76e0dd92c1ccf81f48512b835bb0) +Signed-off-by: Stoiko Ivanov +--- + include/sys/uio_impl.h | 2 +- + module/zcommon/zfs_uio.c | 31 ++++- + module/zfs/zfs_vnops.c | 24 +++- + tests/zfs-tests/cmd/mmapwrite/mmapwrite.c | 140 +++++++++++++++------ + .../tests/functional/mmap/mmap_write_001_pos.ksh | 8 +- + 5 files changed, 151 insertions(+), 54 deletions(-) + +diff --git a/include/sys/uio_impl.h b/include/sys/uio_impl.h +index 37e283da..cfef0b95 100644 +--- a/include/sys/uio_impl.h ++++ b/include/sys/uio_impl.h +@@ -42,7 +42,7 @@ + #include + + extern int uiomove(void *, size_t, enum uio_rw, uio_t *); +-extern void uio_prefaultpages(ssize_t, uio_t *); ++extern int uio_prefaultpages(ssize_t, uio_t *); + extern int uiocopy(void *, size_t, enum uio_rw, uio_t *, size_t *); + extern void uioskip(uio_t *, size_t); + +diff --git a/module/zcommon/zfs_uio.c b/module/zcommon/zfs_uio.c +index 7b4175bb..8e969bbc 100644 +--- a/module/zcommon/zfs_uio.c ++++ b/module/zcommon/zfs_uio.c +@@ -50,6 +50,7 @@ + #include + #include + #include ++#include + + /* + * Move "n" bytes at byte address "p"; "rw" indicates the direction +@@ -77,8 +78,24 @@ uiomove_iov(void *p, size_t n, enum uio_rw rw, struct uio *uio) + if (copy_to_user(iov->iov_base+skip, p, cnt)) + return (EFAULT); + } else { +- if (copy_from_user(p, iov->iov_base+skip, cnt)) +- return (EFAULT); ++ if (uio->uio_fault_disable) { ++ if (!access_ok(VERIFY_READ, ++ (iov->iov_base + skip), cnt)) { ++ return (EFAULT); ++ } ++ ++ pagefault_disable(); ++ if (__copy_from_user_inatomic(p, ++ (iov->iov_base + skip), cnt)) { ++ pagefault_enable(); ++ return (EFAULT); ++ } ++ pagefault_enable(); ++ } else { ++ if (copy_from_user(p, ++ (iov->iov_base + skip), cnt)) ++ return (EFAULT); ++ } + } + break; + case UIO_SYSSPACE: +@@ -156,7 +173,7 @@ EXPORT_SYMBOL(uiomove); + * error will terminate the process as this is only a best attempt to get + * the pages resident. + */ +-void ++int + uio_prefaultpages(ssize_t n, struct uio *uio) + { + const struct iovec *iov; +@@ -170,7 +187,7 @@ uio_prefaultpages(ssize_t n, struct uio *uio) + switch (uio->uio_segflg) { + case UIO_SYSSPACE: + case UIO_BVEC: +- return; ++ return (0); + case UIO_USERSPACE: + case UIO_USERISPACE: + break; +@@ -194,7 +211,7 @@ uio_prefaultpages(ssize_t n, struct uio *uio) + p = iov->iov_base + skip; + while (cnt) { + if (fuword8((uint8_t *)p, &tmp)) +- return; ++ return (EFAULT); + incr = MIN(cnt, PAGESIZE); + p += incr; + cnt -= incr; +@@ -204,8 +221,10 @@ uio_prefaultpages(ssize_t n, struct uio *uio) + */ + p--; + if (fuword8((uint8_t *)p, &tmp)) +- return; ++ return (EFAULT); + } ++ ++ return (0); + } + EXPORT_SYMBOL(uio_prefaultpages); + +diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c +index 5a2e55eb..c866352d 100644 +--- a/module/zfs/zfs_vnops.c ++++ b/module/zfs/zfs_vnops.c +@@ -675,7 +675,10 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) + xuio = (xuio_t *)uio; + else + #endif +- uio_prefaultpages(MIN(n, max_blksz), uio); ++ if (uio_prefaultpages(MIN(n, max_blksz), uio)) { ++ ZFS_EXIT(zfsvfs); ++ return (SET_ERROR(EFAULT)); ++ } + + /* + * If in append mode, set the io offset pointer to eof. +@@ -820,8 +823,19 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) + + if (abuf == NULL) { + tx_bytes = uio->uio_resid; ++ uio->uio_fault_disable = B_TRUE; + error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, nbytes, tx); ++ if (error == EFAULT) { ++ dmu_tx_commit(tx); ++ if (uio_prefaultpages(MIN(n, max_blksz), uio)) { ++ break; ++ } ++ continue; ++ } else if (error != 0) { ++ dmu_tx_commit(tx); ++ break; ++ } + tx_bytes -= uio->uio_resid; + } else { + tx_bytes = nbytes; +@@ -921,8 +935,12 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) + ASSERT(tx_bytes == nbytes); + n -= nbytes; + +- if (!xuio && n > 0) +- uio_prefaultpages(MIN(n, max_blksz), uio); ++ if (!xuio && n > 0) { ++ if (uio_prefaultpages(MIN(n, max_blksz), uio)) { ++ error = EFAULT; ++ break; ++ } ++ } + } + + zfs_inode_update(zp); +diff --git a/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c b/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c +index 190d31af..b9915d5d 100644 +--- a/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c ++++ b/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c +@@ -31,74 +31,132 @@ + #include + #include + #include ++#include ++#include + + /* + * -------------------------------------------------------------------- +- * Bug Id: 5032643 ++ * Bug Issue Id: #7512 ++ * The bug time sequence: ++ * 1. context #1, zfs_write assign a txg "n". ++ * 2. In the same process, context #2, mmap page fault (which means the mm_sem ++ * is hold) occurred, zfs_dirty_inode open a txg failed, and wait previous ++ * txg "n" completed. ++ * 3. context #1 call uiomove to write, however page fault is occurred in ++ * uiomove, which means it need mm_sem, but mm_sem is hold by ++ * context #2, so it stuck and can't complete, then txg "n" will not ++ * complete. + * +- * Simply writing to a file and mmaping that file at the same time can +- * result in deadlock. Nothing perverse like writing from the file's +- * own mapping is required. ++ * So context #1 and context #2 trap into the "dead lock". + * -------------------------------------------------------------------- + */ + ++#define NORMAL_WRITE_TH_NUM 2 ++ + static void * +-mapper(void *fdp) ++normal_writer(void *filename) + { +- void *addr; +- int fd = *(int *)fdp; ++ char *file_path = filename; ++ int fd = -1; ++ ssize_t write_num = 0; ++ int page_size = getpagesize(); + +- if ((addr = +- mmap(0, 8192, PROT_READ, MAP_SHARED, fd, 0)) == MAP_FAILED) { +- perror("mmap"); +- exit(1); ++ fd = open(file_path, O_RDWR | O_CREAT, 0777); ++ if (fd == -1) { ++ err(1, "failed to open %s", file_path); + } +- for (;;) { +- if (mmap(addr, 8192, PROT_READ, +- MAP_SHARED|MAP_FIXED, fd, 0) == MAP_FAILED) { +- perror("mmap"); +- exit(1); ++ ++ char *buf = malloc(1); ++ while (1) { ++ write_num = write(fd, buf, 1); ++ if (write_num == 0) { ++ err(1, "write failed!"); ++ break; + } ++ lseek(fd, page_size, SEEK_CUR); ++ } ++ ++ if (buf) { ++ free(buf); + } +- /* NOTREACHED */ +- return ((void *)1); + } + +-int +-main(int argc, char **argv) ++static void * ++map_writer(void *filename) + { +- int fd; +- char buf[1024]; +- pthread_t tid; ++ int fd = -1; ++ int ret = 0; ++ char *buf = NULL; ++ int page_size = getpagesize(); ++ int op_errno = 0; ++ char *file_path = filename; + +- memset(buf, 'a', sizeof (buf)); ++ while (1) { ++ ret = access(file_path, F_OK); ++ if (ret) { ++ op_errno = errno; ++ if (op_errno == ENOENT) { ++ fd = open(file_path, O_RDWR | O_CREAT, 0777); ++ if (fd == -1) { ++ err(1, "open file failed"); ++ } + +- if (argc != 2) { +- (void) printf("usage: %s \n", argv[0]); +- exit(1); +- } ++ ret = ftruncate(fd, page_size); ++ if (ret == -1) { ++ err(1, "truncate file failed"); ++ } ++ } else { ++ err(1, "access file failed!"); ++ } ++ } else { ++ fd = open(file_path, O_RDWR, 0777); ++ if (fd == -1) { ++ err(1, "open file failed"); ++ } ++ } + +- if ((fd = open(argv[1], O_RDWR|O_CREAT|O_TRUNC, 0666)) == -1) { +- perror("open"); +- exit(1); ++ if ((buf = mmap(NULL, page_size, PROT_READ | PROT_WRITE, ++ MAP_SHARED, fd, 0)) == MAP_FAILED) { ++ err(1, "map file failed"); ++ } ++ ++ if (fd != -1) ++ close(fd); ++ ++ char s[10] = {0, }; ++ memcpy(buf, s, 10); ++ ret = munmap(buf, page_size); ++ if (ret != 0) { ++ err(1, "unmap file failed"); ++ } + } ++} + +- (void) pthread_setconcurrency(2); +- if (pthread_create(&tid, NULL, mapper, &fd) != 0) { +- perror("pthread_create"); +- close(fd); ++int ++main(int argc, char **argv) ++{ ++ pthread_t map_write_tid; ++ pthread_t normal_write_tid[NORMAL_WRITE_TH_NUM]; ++ int i = 0; ++ ++ if (argc != 3) { ++ (void) printf("usage: %s " ++ "\n", argv[0]); + exit(1); + } +- for (;;) { +- if (write(fd, buf, sizeof (buf)) == -1) { +- perror("write"); +- close(fd); +- exit(1); ++ ++ for (i = 0; i < NORMAL_WRITE_TH_NUM; i++) { ++ if (pthread_create(&normal_write_tid[i], NULL, normal_writer, ++ argv[1])) { ++ err(1, "pthread_create normal_writer failed."); + } + } + +- close(fd); ++ if (pthread_create(&map_write_tid, NULL, map_writer, argv[2])) { ++ err(1, "pthread_create map_writer failed."); ++ } + + /* NOTREACHED */ ++ pthread_join(map_write_tid, NULL); + return (0); + } +diff --git a/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh b/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh +index 1eda9710..24150b82 100755 +--- a/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh ++++ b/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh +@@ -53,12 +53,14 @@ if ! is_mp; then + fi + + log_must chmod 777 $TESTDIR +-mmapwrite $TESTDIR/test-write-file & ++mmapwrite $TESTDIR/normal_write_file $TESTDIR/map_write_file & + PID_MMAPWRITE=$! +-log_note "mmapwrite $TESTDIR/test-write-file pid: $PID_MMAPWRITE" ++log_note "mmapwrite $TESTDIR/normal_write_file $TESTDIR/map_write_file"\ ++ "pid: $PID_MMAPWRITE" + log_must sleep 30 + + log_must kill -9 $PID_MMAPWRITE +-log_must ls -l $TESTDIR/test-write-file ++log_must ls -l $TESTDIR/normal_write_file ++log_must ls -l $TESTDIR/map_write_file + + log_pass "write(2) a mmap(2)'ing file succeeded." diff --git a/zfs-patches/series b/zfs-patches/series index 3b4626f..1146f65 100644 --- a/zfs-patches/series +++ b/zfs-patches/series @@ -4,3 +4,4 @@ 0004-Fix-deadlock-between-zfs-umount-snapentry_expire.patch 0005-Fix-race-in-dnode_check_slots_free.patch 0006-Reduce-taskq-and-context-switch-cost-of-zio-pipe.patch +0007-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch -- 2.39.2