update/rebase to zfs-0.7.10 with patches from ZOL

author Stoiko Ivanov <s.ivanov@proxmox.com>

Tue, 11 Sep 2018 09:43:41 +0000 (11:43 +0200)

committer Thomas Lamprecht <t.lamprecht@proxmox.com>

Thu, 13 Sep 2018 06:54:51 +0000 (08:54 +0200)
author Stoiko Ivanov <s.ivanov@proxmox.com>
Tue, 11 Sep 2018 09:43:41 +0000 (11:43 +0200)
committer Thomas Lamprecht <t.lamprecht@proxmox.com>
Thu, 13 Sep 2018 06:54:51 +0000 (08:54 +0200)
diff --git a/zfs-patches/0004-Fix-deadlock-between-zfs-umount-snapentry_expire.patch b/zfs-patches/0004-Fix-deadlock-between-zfs-umount-snapentry_expire.patch

new file mode 100644 (file)

index 0000000..5c090d3
--- /dev/null
+++ b/zfs-patches/0004-Fix-deadlock-between-zfs-umount-snapentry_expire.patch
@@ -0,0 +1,59 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Rohan Puri <rohan.puri15@gmail.com>
+Date: Sat, 28 Jul 2018 18:32:12 +0530
+Subject: [PATCH] Fix deadlock between zfs umount & snapentry_expire
+
+zfs umount -> zfsctl_destroy() takes the zfs_snapshot_lock as a
+writer and calls zfsctl_snapshot_unmount_cancel(), which waits
+for snapentry_expire() if present (when snap is automounted).
+This snapentry_expire() itself then waits for zfs_snapshot_lock
+as a reader, resulting in a deadlock.
+
+The fix is to only hold the zfs_snapshot_lock over the tree
+lookup and removal.  After a successful lookup the lock can
+be dropped and zfs_snapentry_t will remain valid until the
+reference taken by the lookup is released.
+
+Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Signed-off-by: Rohan Puri <rohan.puri15@gmail.com>
+Closes #7751
+Closes #7752
+
+(Cherry-picked from fd7265c646f40e364396af5014bbb83e809e124a)
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ module/zfs/zfs_ctldir.c | 11 +++++------
+ 1 file changed, 5 insertions(+), 6 deletions(-)
+
+diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c
+index 3b5fb196..14af55c4 100644
+--- a/module/zfs/zfs_ctldir.c
++++ b/module/zfs/zfs_ctldir.c
+@@ -358,8 +358,6 @@ snapentry_expire(void *data)
+ static void
+ zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se)
+ {
+-      ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock));
+-
+       if (taskq_cancel_id(system_delay_taskq, se->se_taskqid) == 0) {
+               se->se_taskqid = TASKQID_INVALID;
+               zfsctl_snapshot_rele(se);
+@@ -570,13 +568,14 @@ zfsctl_destroy(zfsvfs_t *zfsvfs)
+               uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
+ 
+               rw_enter(&zfs_snapshot_lock, RW_WRITER);
+-              if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid))
+-                  != NULL) {
+-                      zfsctl_snapshot_unmount_cancel(se);
++              se = zfsctl_snapshot_find_by_objsetid(spa, objsetid);
++              if (se != NULL)
+                       zfsctl_snapshot_remove(se);
++              rw_exit(&zfs_snapshot_lock);
++              if (se != NULL) {
++                      zfsctl_snapshot_unmount_cancel(se);
+                       zfsctl_snapshot_rele(se);
+               }
+-              rw_exit(&zfs_snapshot_lock);
+       } else if (zfsvfs->z_ctldir) {
+               iput(zfsvfs->z_ctldir);
+               zfsvfs->z_ctldir = NULL;
diff --git a/zfs-patches/0004-Fix-zpl_mount-deadlock.patch b/zfs-patches/0004-Fix-zpl_mount-deadlock.patch

deleted file mode 100644 (file)

index 8947309..0000000
--- a/zfs-patches/0004-Fix-zpl_mount-deadlock.patch
+++ /dev/null
@@ -1,92 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Brian Behlendorf <behlendorf1@llnl.gov>
-Date: Wed, 11 Jul 2018 15:49:10 -0700
-Subject: [PATCH] Fix zpl_mount() deadlock
-
-Commit 93b43af10 inadvertently introduced the following scenario which
-can result in a deadlock.  This issue was most easily reproduced by
-LXD containers using a ZFS storage backend but should be reproducible
-under any workload which is frequently mounting and unmounting.
-
--- THREAD A --
-spa_sync()
-  spa_sync_upgrades()
-    rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); <- Waiting on B
-
--- THREAD B --
-mount_fs()
-  zpl_mount()
-    zpl_mount_impl()
-      dmu_objset_hold()
-        dmu_objset_hold_flags()
-          dsl_pool_hold()
-            dsl_pool_config_enter()
-              rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
-    sget()
-      sget_userns()
-        grab_super()
-          down_write(&s->s_umount); <- Waiting on C
-
--- THREAD C --
-cleanup_mnt()
-  deactivate_super()
-    down_write(&s->s_umount);
-    deactivate_locked_super()
-      zpl_kill_sb()
-        kill_anon_super()
-          generic_shutdown_super()
-            sync_filesystem()
-              zpl_sync_fs()
-                zfs_sync()
-                  zil_commit()
-                    txg_wait_synced() <- Waiting on A
-
-Reviewed by: Alek Pinchuk <apinchuk@datto.com>
-Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
-Closes #7598
-Closes #7659
-Closes #7691
-Closes #7693
-
-(Cherry-picked from ac09630d8b0bf6c92084a30fdaefd03fd0adbdc1)
-Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
----
- include/sys/zfs_vfsops.h |  1 +
- module/zfs/zpl_super.c   | 11 ++++++++++-
- 2 files changed, 11 insertions(+), 1 deletion(-)
-
-diff --git a/include/sys/zfs_vfsops.h b/include/sys/zfs_vfsops.h
-index 2326da42..927153b2 100644
---- a/include/sys/zfs_vfsops.h
-+++ b/include/sys/zfs_vfsops.h
-@@ -32,6 +32,7 @@
- #include <sys/zil.h>
- #include <sys/sa.h>
- #include <sys/rrwlock.h>
-+#include <sys/dsl_dataset.h>
- #include <sys/zfs_ioctl.h>
- 
- #ifdef        __cplusplus
-diff --git a/module/zfs/zpl_super.c b/module/zfs/zpl_super.c
-index fc10271b..5c426b0a 100644
---- a/module/zfs/zpl_super.c
-+++ b/module/zfs/zpl_super.c
-@@ -271,8 +271,17 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
-       if (err)
-               return (ERR_PTR(-err));
- 
-+      /*
-+       * The dsl pool lock must be released prior to calling sget().
-+       * It is possible sget() may block on the lock in grab_super()
-+       * while deactivate_super() holds that same lock and waits for
-+       * a txg sync.  If the dsl_pool lock is held over over sget()
-+       * this can prevent the pool sync and cause a deadlock.
-+       */
-+      dsl_pool_rele(dmu_objset_pool(os), FTAG);
-       s = zpl_sget(fs_type, zpl_test_super, set_anon_super, flags, os);
--      dmu_objset_rele(os, FTAG);
-+      dsl_dataset_rele(dmu_objset_ds(os), FTAG);
-+
-       if (IS_ERR(s))
-               return (ERR_CAST(s));
- 
diff --git a/zfs-patches/0005-Fix-deadlock-between-zfs-umount-snapentry_expire.patch b/zfs-patches/0005-Fix-deadlock-between-zfs-umount-snapentry_expire.patch

deleted file mode 100644 (file)

index 5c090d3..0000000
--- a/zfs-patches/0005-Fix-deadlock-between-zfs-umount-snapentry_expire.patch
+++ /dev/null
@@ -1,59 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Rohan Puri <rohan.puri15@gmail.com>
-Date: Sat, 28 Jul 2018 18:32:12 +0530
-Subject: [PATCH] Fix deadlock between zfs umount & snapentry_expire
-
-zfs umount -> zfsctl_destroy() takes the zfs_snapshot_lock as a
-writer and calls zfsctl_snapshot_unmount_cancel(), which waits
-for snapentry_expire() if present (when snap is automounted).
-This snapentry_expire() itself then waits for zfs_snapshot_lock
-as a reader, resulting in a deadlock.
-
-The fix is to only hold the zfs_snapshot_lock over the tree
-lookup and removal.  After a successful lookup the lock can
-be dropped and zfs_snapentry_t will remain valid until the
-reference taken by the lookup is released.
-
-Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
-Signed-off-by: Rohan Puri <rohan.puri15@gmail.com>
-Closes #7751
-Closes #7752
-
-(Cherry-picked from fd7265c646f40e364396af5014bbb83e809e124a)
-Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
----
- module/zfs/zfs_ctldir.c | 11 +++++------
- 1 file changed, 5 insertions(+), 6 deletions(-)
-
-diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c
-index 3b5fb196..14af55c4 100644
---- a/module/zfs/zfs_ctldir.c
-+++ b/module/zfs/zfs_ctldir.c
-@@ -358,8 +358,6 @@ snapentry_expire(void *data)
- static void
- zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se)
- {
--      ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock));
--
-       if (taskq_cancel_id(system_delay_taskq, se->se_taskqid) == 0) {
-               se->se_taskqid = TASKQID_INVALID;
-               zfsctl_snapshot_rele(se);
-@@ -570,13 +568,14 @@ zfsctl_destroy(zfsvfs_t *zfsvfs)
-               uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
- 
-               rw_enter(&zfs_snapshot_lock, RW_WRITER);
--              if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid))
--                  != NULL) {
--                      zfsctl_snapshot_unmount_cancel(se);
-+              se = zfsctl_snapshot_find_by_objsetid(spa, objsetid);
-+              if (se != NULL)
-                       zfsctl_snapshot_remove(se);
-+              rw_exit(&zfs_snapshot_lock);
-+              if (se != NULL) {
-+                      zfsctl_snapshot_unmount_cancel(se);
-                       zfsctl_snapshot_rele(se);
-               }
--              rw_exit(&zfs_snapshot_lock);
-       } else if (zfsvfs->z_ctldir) {
-               iput(zfsvfs->z_ctldir);
-               zfsvfs->z_ctldir = NULL;
diff --git a/zfs-patches/0005-zv_suspend_lock-in-zvol_open-zvol_release.patch b/zfs-patches/0005-zv_suspend_lock-in-zvol_open-zvol_release.patch

new file mode 100644 (file)

index 0000000..6a61f1a
--- /dev/null
+++ b/zfs-patches/0005-zv_suspend_lock-in-zvol_open-zvol_release.patch
@@ -0,0 +1,124 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Boris Protopopov <bprotopopov@users.noreply.github.com>
+Date: Wed, 9 Aug 2017 14:10:47 -0400
+Subject: [PATCH] zv_suspend_lock in zvol_open()/zvol_release()
+
+Acquire zv_suspend_lock on first open and last close only.
+
+Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Signed-off-by: Boris Protopopov <boris.protopopov@actifio.com>
+Closes #6342
+
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ module/zfs/zvol.c | 64 +++++++++++++++++++++++++++++++++++--------------------
+ 1 file changed, 41 insertions(+), 23 deletions(-)
+
+diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
+index 3e7059b3..ffa5fac7 100644
+--- a/module/zfs/zvol.c
++++ b/module/zfs/zvol.c
+@@ -1347,9 +1347,9 @@ zvol_open(struct block_device *bdev, fmode_t flag)
+ {
+       zvol_state_t *zv;
+       int error = 0;
+-      boolean_t drop_suspend = B_FALSE;
++      boolean_t drop_suspend = B_TRUE;
+ 
+-      ASSERT(!mutex_owned(&zvol_state_lock));
++      ASSERT(!MUTEX_HELD(&zvol_state_lock));
+ 
+       mutex_enter(&zvol_state_lock);
+       /*
+@@ -1364,23 +1364,31 @@ zvol_open(struct block_device *bdev, fmode_t flag)
+               return (SET_ERROR(-ENXIO));
+       }
+ 
+-      /* take zv_suspend_lock before zv_state_lock */
+-      rw_enter(&zv->zv_suspend_lock, RW_READER);
+-
+       mutex_enter(&zv->zv_state_lock);
+-
+       /*
+        * make sure zvol is not suspended during first open
+-       * (hold zv_suspend_lock), otherwise, drop the lock
++       * (hold zv_suspend_lock) and respect proper lock acquisition
++       * ordering - zv_suspend_lock before zv_state_lock
+        */
+       if (zv->zv_open_count == 0) {
+-              drop_suspend = B_TRUE;
++              if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
++                      mutex_exit(&zv->zv_state_lock);
++                      rw_enter(&zv->zv_suspend_lock, RW_READER);
++                      mutex_enter(&zv->zv_state_lock);
++                      /* check to see if zv_suspend_lock is needed */
++                      if (zv->zv_open_count != 0) {
++                              rw_exit(&zv->zv_suspend_lock);
++                              drop_suspend = B_FALSE;
++                      }
++              }
+       } else {
+-              rw_exit(&zv->zv_suspend_lock);
++              drop_suspend = B_FALSE;
+       }
+-
+       mutex_exit(&zvol_state_lock);
+ 
++      ASSERT(MUTEX_HELD(&zv->zv_state_lock));
++      ASSERT(zv->zv_open_count != 0 || RW_READ_HELD(&zv->zv_suspend_lock));
++
+       if (zv->zv_open_count == 0) {
+               error = zvol_first_open(zv);
+               if (error)
+@@ -1417,28 +1425,38 @@ static int
+ zvol_release(struct gendisk *disk, fmode_t mode)
+ {
+       zvol_state_t *zv;
+-      boolean_t drop_suspend = B_FALSE;
++      boolean_t drop_suspend = B_TRUE;
+ 
+-      ASSERT(!mutex_owned(&zvol_state_lock));
++      ASSERT(!MUTEX_HELD(&zvol_state_lock));
+ 
+       mutex_enter(&zvol_state_lock);
+       zv = disk->private_data;
+-      ASSERT(zv && zv->zv_open_count > 0);
+-
+-      /* take zv_suspend_lock before zv_state_lock */
+-      rw_enter(&zv->zv_suspend_lock, RW_READER);
+ 
+       mutex_enter(&zv->zv_state_lock);
+-      mutex_exit(&zvol_state_lock);
+-
++      ASSERT(zv->zv_open_count > 0);
+       /*
+        * make sure zvol is not suspended during last close
+-       * (hold zv_suspend_lock), otherwise, drop the lock
++       * (hold zv_suspend_lock) and respect proper lock acquisition
++       * ordering - zv_suspend_lock before zv_state_lock
+        */
+-      if (zv->zv_open_count == 1)
+-              drop_suspend = B_TRUE;
+-      else
+-              rw_exit(&zv->zv_suspend_lock);
++      if (zv->zv_open_count == 1) {
++              if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
++                      mutex_exit(&zv->zv_state_lock);
++                      rw_enter(&zv->zv_suspend_lock, RW_READER);
++                      mutex_enter(&zv->zv_state_lock);
++                      /* check to see if zv_suspend_lock is needed */
++                      if (zv->zv_open_count != 1) {
++                              rw_exit(&zv->zv_suspend_lock);
++                              drop_suspend = B_FALSE;
++                      }
++              }
++      } else {
++              drop_suspend = B_FALSE;
++      }
++      mutex_exit(&zvol_state_lock);
++
++      ASSERT(MUTEX_HELD(&zv->zv_state_lock));
++      ASSERT(zv->zv_open_count != 1 || RW_READ_HELD(&zv->zv_suspend_lock));
+ 
+       zv->zv_open_count--;
+       if (zv->zv_open_count == 0)
diff --git a/zfs-patches/0006-Linux-4.18-compat-inode-timespec-timespec64.patch b/zfs-patches/0006-Linux-4.18-compat-inode-timespec-timespec64.patch

new file mode 100644 (file)

index 0000000..5738b0c
--- /dev/null
+++ b/zfs-patches/0006-Linux-4.18-compat-inode-timespec-timespec64.patch
@@ -0,0 +1,560 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Brian Behlendorf <behlendorf1@llnl.gov>
+Date: Sun, 12 Aug 2018 18:22:03 -0400
+Subject: [PATCH] Linux 4.18 compat: inode timespec -> timespec64
+
+Commit torvalds/linux@95582b0 changes the inode i_atime, i_mtime,
+and i_ctime members from timespec's to timespec64's to make them
+2038 safe.  As part of this change the current_time() function was
+also updated to return the timespec64 type.
+
+Resolve this issue by introducing a new inode_timespec_t type which
+is defined to match the timespec type used by the inode.  It should
+be used when working with inode timestamps to ensure matching types.
+
+The timestruc_t type under Illumos was used in a similar fashion but
+was specified to always be a timespec_t.  Rather than incorrectly
+define this type all timespec_t types have been replaced by the new
+inode_timespec_t type.
+
+Finally, the kernel and user space 'sys/time.h' headers were aligned
+with each other.  They define as appropriate for the context several
+constants as macros and include static inline implementation of
+gethrestime(), gethrestime_sec(), and gethrtime().
+
+Reviewed-by: Chunwei Chen <tuxoko@gmail.com>
+Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Closes #7643
+Backported-by: Richard Yao <ryao@gentoo.org>
+
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ config/kernel-current-time.m4 |  7 +++----
+ include/sys/dmu.h             |  2 +-
+ include/sys/dmu_objset.h      |  2 +-
+ include/sys/dsl_dir.h         |  4 ++--
+ include/sys/spa_impl.h        |  2 +-
+ include/sys/xvattr.h          |  2 +-
+ include/sys/zfs_context.h     |  9 +--------
+ include/sys/zfs_znode.h       | 33 +++++++++++++++++++++++--------
+ include/sys/zpl.h             |  9 +++++++++
+ lib/libspl/Makefile.am        |  2 --
+ lib/libspl/gethrestime.c      | 38 ------------------------------------
+ lib/libspl/gethrtime.c        | 45 -------------------------------------------
+ lib/libspl/include/sys/time.h | 37 +++++++++++++++++++++++++++--------
+ lib/libzpool/kernel.c         |  4 ++--
+ module/zfs/dmu_objset.c       |  2 +-
+ module/zfs/dsl_dir.c          |  6 +++---
+ module/zfs/fm.c               |  2 +-
+ module/zfs/zfs_ctldir.c       |  2 +-
+ module/zfs/zfs_vnops.c        |  4 ++--
+ module/zfs/zfs_znode.c        |  4 ++--
+ module/zfs/zpl_inode.c        |  5 +++--
+ 21 files changed, 88 insertions(+), 133 deletions(-)
+ delete mode 100644 lib/libspl/gethrestime.c
+ delete mode 100644 lib/libspl/gethrtime.c
+
+diff --git a/config/kernel-current-time.m4 b/config/kernel-current-time.m4
+index 2ede9ff3..c7d5c9b5 100644
+--- a/config/kernel-current-time.m4
++++ b/config/kernel-current-time.m4
+@@ -1,15 +1,14 @@
+ dnl #
+ dnl # 4.9, current_time() added
++dnl # 4.18, return type changed from timespec to timespec64
+ dnl #
+ AC_DEFUN([ZFS_AC_KERNEL_CURRENT_TIME],
+       [AC_MSG_CHECKING([whether current_time() exists])
+       ZFS_LINUX_TRY_COMPILE_SYMBOL([
+               #include <linux/fs.h>
+       ], [
+-              struct inode ip;
+-              struct timespec now __attribute__ ((unused));
+-
+-              now = current_time(&ip);
++              struct inode ip __attribute__ ((unused));
++              ip.i_atime = current_time(&ip);
+       ], [current_time], [fs/inode.c], [
+               AC_MSG_RESULT(yes)
+               AC_DEFINE(HAVE_CURRENT_TIME, 1, [current_time() exists])
+diff --git a/include/sys/dmu.h b/include/sys/dmu.h
+index bcdf7d64..755a9056 100644
+--- a/include/sys/dmu.h
++++ b/include/sys/dmu.h
+@@ -891,7 +891,7 @@ uint64_t dmu_objset_fsid_guid(objset_t *os);
+ /*
+  * Get the [cm]time for an objset's snapshot dir
+  */
+-timestruc_t dmu_objset_snap_cmtime(objset_t *os);
++inode_timespec_t dmu_objset_snap_cmtime(objset_t *os);
+ 
+ int dmu_objset_is_snapshot(objset_t *os);
+ 
+diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h
+index a836e037..531e81d4 100644
+--- a/include/sys/dmu_objset.h
++++ b/include/sys/dmu_objset.h
+@@ -179,7 +179,7 @@ int dmu_objset_find_dp(struct dsl_pool *dp, uint64_t ddobj,
+     int func(struct dsl_pool *, struct dsl_dataset *, void *),
+     void *arg, int flags);
+ void dmu_objset_evict_dbufs(objset_t *os);
+-timestruc_t dmu_objset_snap_cmtime(objset_t *os);
++inode_timespec_t dmu_objset_snap_cmtime(objset_t *os);
+ 
+ /* called from dsl */
+ void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx);
+diff --git a/include/sys/dsl_dir.h b/include/sys/dsl_dir.h
+index 69b0b6a5..80e83fdc 100644
+--- a/include/sys/dsl_dir.h
++++ b/include/sys/dsl_dir.h
+@@ -103,7 +103,7 @@ struct dsl_dir {
+       /* Protected by dd_lock */
+       kmutex_t dd_lock;
+       list_t dd_props; /* list of dsl_prop_record_t's */
+-      timestruc_t dd_snap_cmtime; /* last time snapshot namespace changed */
++      inode_timespec_t dd_snap_cmtime; /* last snapshot namespace change */
+       uint64_t dd_origin_txg;
+ 
+       /* gross estimate of space used by in-flight tx's */
+@@ -159,7 +159,7 @@ boolean_t dsl_dir_is_clone(dsl_dir_t *dd);
+ void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds,
+     uint64_t reservation, cred_t *cr, dmu_tx_t *tx);
+ void dsl_dir_snap_cmtime_update(dsl_dir_t *dd);
+-timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd);
++inode_timespec_t dsl_dir_snap_cmtime(dsl_dir_t *dd);
+ void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value,
+     dmu_tx_t *tx);
+ void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx);
+diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
+index b1e78c1d..fa7490ac 100644
+--- a/include/sys/spa_impl.h
++++ b/include/sys/spa_impl.h
+@@ -153,7 +153,7 @@ struct spa {
+       uint64_t        spa_freeze_txg;         /* freeze pool at this txg */
+       uint64_t        spa_load_max_txg;       /* best initial ub_txg */
+       uint64_t        spa_claim_max_txg;      /* highest claimed birth txg */
+-      timespec_t      spa_loaded_ts;          /* 1st successful open time */
++      inode_timespec_t spa_loaded_ts;         /* 1st successful open time */
+       objset_t        *spa_meta_objset;       /* copy of dp->dp_meta_objset */
+       kmutex_t        spa_evicting_os_lock;   /* Evicting objset list lock */
+       list_t          spa_evicting_os_list;   /* Objsets being evicted. */
+diff --git a/include/sys/xvattr.h b/include/sys/xvattr.h
+index 4779b632..5d38927c 100644
+--- a/include/sys/xvattr.h
++++ b/include/sys/xvattr.h
+@@ -47,7 +47,7 @@
+  * Structure of all optional attributes.
+  */
+ typedef struct xoptattr {
+-      timestruc_t     xoa_createtime; /* Create time of file */
++      inode_timespec_t xoa_createtime;        /* Create time of file */
+       uint8_t         xoa_archive;
+       uint8_t         xoa_system;
+       uint8_t         xoa_readonly;
+diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
+index 4fe35342..68c58f95 100644
+--- a/include/sys/zfs_context.h
++++ b/include/sys/zfs_context.h
+@@ -527,7 +527,7 @@ extern char *vn_dumpdir;
+ #define       AV_SCANSTAMP_SZ 32              /* length of anti-virus scanstamp */
+ 
+ typedef struct xoptattr {
+-      timestruc_t     xoa_createtime; /* Create time of file */
++      inode_timespec_t xoa_createtime;        /* Create time of file */
+       uint8_t         xoa_archive;
+       uint8_t         xoa_system;
+       uint8_t         xoa_readonly;
+@@ -640,13 +640,6 @@ extern void delay(clock_t ticks);
+ #define       USEC_TO_TICK(usec)      ((usec) / (MICROSEC / hz))
+ #define       NSEC_TO_TICK(usec)      ((usec) / (NANOSEC / hz))
+ 
+-#define       gethrestime_sec() time(NULL)
+-#define       gethrestime(t) \
+-      do {\
+-              (t)->tv_sec = gethrestime_sec();\
+-              (t)->tv_nsec = 0;\
+-      } while (0);
+-
+ #define       max_ncpus       64
+ #define       boot_ncpus      (sysconf(_SC_NPROCESSORS_ONLN))
+ 
+diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h
+index c292f037..26d1eb37 100644
+--- a/include/sys/zfs_znode.h
++++ b/include/sys/zfs_znode.h
+@@ -270,19 +270,36 @@ typedef struct znode_hold {
+ 
+ extern unsigned int zfs_object_mutex_size;
+ 
+-/* Encode ZFS stored time values from a struct timespec */
++/*
++ * Encode ZFS stored time values from a struct timespec / struct timespec64.
++ */
+ #define       ZFS_TIME_ENCODE(tp, stmp)               \
+-{                                             \
++do {                                          \
+       (stmp)[0] = (uint64_t)(tp)->tv_sec;     \
+       (stmp)[1] = (uint64_t)(tp)->tv_nsec;    \
+-}
++} while (0)
+ 
+-/* Decode ZFS stored time values to a struct timespec */
++#if defined(HAVE_INODE_TIMESPEC64_TIMES)
++/*
++ * Decode ZFS stored time values to a struct timespec64
++ * 4.18 and newer kernels.
++ */
+ #define       ZFS_TIME_DECODE(tp, stmp)               \
+-{                                             \
+-      (tp)->tv_sec = (time_t)(stmp)[0];               \
+-      (tp)->tv_nsec = (long)(stmp)[1];                \
+-}
++do {                                          \
++      (tp)->tv_sec = (time64_t)(stmp)[0];     \
++      (tp)->tv_nsec = (long)(stmp)[1];        \
++} while (0)
++#else
++/*
++ * Decode ZFS stored time values to a struct timespec
++ * 4.17 and older kernels.
++ */
++#define       ZFS_TIME_DECODE(tp, stmp)               \
++do {                                          \
++      (tp)->tv_sec = (time_t)(stmp)[0];       \
++      (tp)->tv_nsec = (long)(stmp)[1];        \
++} while (0)
++#endif /* HAVE_INODE_TIMESPEC64_TIMES */
+ 
+ /*
+  * Timestamp defines
+diff --git a/include/sys/zpl.h b/include/sys/zpl.h
+index 65ed4313..e433fbc6 100644
+--- a/include/sys/zpl.h
++++ b/include/sys/zpl.h
+@@ -189,4 +189,13 @@ zpl_dir_emit_dots(struct file *file, zpl_dir_context_t *ctx)
+ }
+ #endif /* HAVE_VFS_ITERATE */
+ 
++/*
++ * Linux 4.18, inode times converted from timespec to timespec64.
++ */
++#if defined(HAVE_INODE_TIMESPEC64_TIMES)
++#define       zpl_inode_timespec_trunc(ts, gran)      timespec64_trunc(ts, gran)
++#else
++#define       zpl_inode_timespec_trunc(ts, gran)      timespec_trunc(ts, gran)
++#endif
++
+ #endif        /* _SYS_ZPL_H */
+diff --git a/lib/libspl/Makefile.am b/lib/libspl/Makefile.am
+index 59bc8ffb..a6e63cb8 100644
+--- a/lib/libspl/Makefile.am
++++ b/lib/libspl/Makefile.am
+@@ -19,8 +19,6 @@ noinst_LTLIBRARIES = libspl.la
+ 
+ USER_C = \
+       getexecname.c \
+-      gethrtime.c \
+-      gethrestime.c \
+       getmntany.c \
+       list.c \
+       mkdirp.c \
+diff --git a/lib/libspl/gethrestime.c b/lib/libspl/gethrestime.c
+deleted file mode 100644
+index d37cc2d5..00000000
+--- a/lib/libspl/gethrestime.c
++++ /dev/null
+@@ -1,38 +0,0 @@
+-/*
+- * CDDL HEADER START
+- *
+- * The contents of this file are subject to the terms of the
+- * Common Development and Distribution License (the "License").
+- * You may not use this file except in compliance with the License.
+- *
+- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+- * or http://www.opensolaris.org/os/licensing.
+- * See the License for the specific language governing permissions
+- * and limitations under the License.
+- *
+- * When distributing Covered Code, include this CDDL HEADER in each
+- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+- * If applicable, add the following below this CDDL HEADER, with the
+- * fields enclosed by brackets "[]" replaced with your own identifying
+- * information: Portions Copyright [yyyy] [name of copyright owner]
+- *
+- * CDDL HEADER END
+- */
+-
+-/*
+- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+- * Use is subject to license terms.
+- */
+-
+-#include <time.h>
+-#include <sys/time.h>
+-
+-void
+-gethrestime(timestruc_t *ts)
+-{
+-      struct timeval tv;
+-
+-      gettimeofday(&tv, NULL);
+-      ts->tv_sec = tv.tv_sec;
+-      ts->tv_nsec = tv.tv_usec * NSEC_PER_USEC;
+-}
+diff --git a/lib/libspl/gethrtime.c b/lib/libspl/gethrtime.c
+deleted file mode 100644
+index 95ceb18e..00000000
+--- a/lib/libspl/gethrtime.c
++++ /dev/null
+@@ -1,45 +0,0 @@
+-/*
+- * CDDL HEADER START
+- *
+- * The contents of this file are subject to the terms of the
+- * Common Development and Distribution License (the "License").
+- * You may not use this file except in compliance with the License.
+- *
+- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+- * or http://www.opensolaris.org/os/licensing.
+- * See the License for the specific language governing permissions
+- * and limitations under the License.
+- *
+- * When distributing Covered Code, include this CDDL HEADER in each
+- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+- * If applicable, add the following below this CDDL HEADER, with the
+- * fields enclosed by brackets "[]" replaced with your own identifying
+- * information: Portions Copyright [yyyy] [name of copyright owner]
+- *
+- * CDDL HEADER END
+- */
+-
+-/*
+- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+- * Use is subject to license terms.
+- */
+-
+-#include <time.h>
+-#include <sys/time.h>
+-#include <stdlib.h>
+-#include <stdio.h>
+-
+-hrtime_t
+-gethrtime(void)
+-{
+-      struct timespec ts;
+-      int rc;
+-
+-      rc = clock_gettime(CLOCK_MONOTONIC, &ts);
+-      if (rc) {
+-              fprintf(stderr, "Error: clock_gettime() = %d\n", rc);
+-              abort();
+-      }
+-
+-      return ((((u_int64_t)ts.tv_sec) * NANOSEC) + ts.tv_nsec);
+-}
+diff --git a/lib/libspl/include/sys/time.h b/lib/libspl/include/sys/time.h
+index dc645fa5..04b3ba87 100644
+--- a/lib/libspl/include/sys/time.h
++++ b/lib/libspl/include/sys/time.h
+@@ -27,8 +27,9 @@
+ #ifndef _LIBSPL_SYS_TIME_H
+ #define       _LIBSPL_SYS_TIME_H
+ 
+-#include_next <sys/time.h>
++#include <time.h>
+ #include <sys/types.h>
++#include_next <sys/time.h>
+ 
+ #ifndef SEC
+ #define       SEC             1
+@@ -70,13 +71,33 @@
+ #define       SEC2NSEC(m)     ((hrtime_t)(m) * (NANOSEC / SEC))
+ #endif
+ 
+-
+ typedef       long long               hrtime_t;
+-typedef       struct  timespec        timestruc_t;
+-typedef       struct  timespec        timespec_t;
+-
+-
+-extern hrtime_t gethrtime(void);
+-extern void gethrestime(timestruc_t *);
++typedef       struct timespec         timespec_t;
++typedef struct timespec               inode_timespec_t;
++
++static inline void
++gethrestime(inode_timespec_t *ts)
++{
++      struct timeval tv;
++      (void) gettimeofday(&tv, NULL);
++      ts->tv_sec = tv.tv_sec;
++      ts->tv_nsec = tv.tv_usec * NSEC_PER_USEC;
++}
++
++static inline time_t
++gethrestime_sec(void)
++{
++      struct timeval tv;
++      (void) gettimeofday(&tv, NULL);
++      return (tv.tv_sec);
++}
++
++static inline hrtime_t
++gethrtime(void)
++{
++      struct timespec ts;
++      (void) clock_gettime(CLOCK_MONOTONIC, &ts);
++      return ((((u_int64_t)ts.tv_sec) * NANOSEC) + ts.tv_nsec);
++}
+ 
+ #endif /* _LIBSPL_SYS_TIME_H */
+diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c
+index e67d13c9..3ea8778b 100644
+--- a/lib/libzpool/kernel.c
++++ b/lib/libzpool/kernel.c
+@@ -498,7 +498,7 @@ cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime)
+ {
+       int error;
+       struct timeval tv;
+-      timestruc_t ts;
++      struct timespec ts;
+       clock_t delta;
+ 
+       ASSERT3U(cv->cv_magic, ==, CV_MAGIC);
+@@ -536,7 +536,7 @@ cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res,
+ {
+       int error;
+       struct timeval tv;
+-      timestruc_t ts;
++      struct timespec ts;
+       hrtime_t delta;
+ 
+       ASSERT(flag == 0 || flag == CALLOUT_FLAG_ABSOLUTE);
+diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
+index 3425d542..449ebedf 100644
+--- a/module/zfs/dmu_objset.c
++++ b/module/zfs/dmu_objset.c
+@@ -860,7 +860,7 @@ dmu_objset_evict_done(objset_t *os)
+       kmem_free(os, sizeof (objset_t));
+ }
+ 
+-timestruc_t
++inode_timespec_t
+ dmu_objset_snap_cmtime(objset_t *os)
+ {
+       return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir));
+diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c
+index a3ef5896..deecf6bc 100644
+--- a/module/zfs/dsl_dir.c
++++ b/module/zfs/dsl_dir.c
+@@ -1975,10 +1975,10 @@ dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
+       return (0);
+ }
+ 
+-timestruc_t
++inode_timespec_t
+ dsl_dir_snap_cmtime(dsl_dir_t *dd)
+ {
+-      timestruc_t t;
++      inode_timespec_t t;
+ 
+       mutex_enter(&dd->dd_lock);
+       t = dd->dd_snap_cmtime;
+@@ -1990,7 +1990,7 @@ dsl_dir_snap_cmtime(dsl_dir_t *dd)
+ void
+ dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
+ {
+-      timestruc_t t;
++      inode_timespec_t t;
+ 
+       gethrestime(&t);
+       mutex_enter(&dd->dd_lock);
+diff --git a/module/zfs/fm.c b/module/zfs/fm.c
+index cb148149..9d26cc99 100644
+--- a/module/zfs/fm.c
++++ b/module/zfs/fm.c
+@@ -508,8 +508,8 @@ zfs_zevent_insert(zevent_t *ev)
+ int
+ zfs_zevent_post(nvlist_t *nvl, nvlist_t *detector, zevent_cb_t *cb)
+ {
++      inode_timespec_t tv;
+       int64_t tv_array[2];
+-      timestruc_t tv;
+       uint64_t eid;
+       size_t nvl_size = 0;
+       zevent_t *ev;
+diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c
+index 14af55c4..25edea78 100644
+--- a/module/zfs/zfs_ctldir.c
++++ b/module/zfs/zfs_ctldir.c
+@@ -449,7 +449,7 @@ static struct inode *
+ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
+     const struct file_operations *fops, const struct inode_operations *ops)
+ {
+-      struct timespec now;
++      inode_timespec_t now;
+       struct inode *ip;
+       znode_t *zp;
+ 
+diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
+index 0d2b61a1..34ea751c 100644
+--- a/module/zfs/zfs_vnops.c
++++ b/module/zfs/zfs_vnops.c
+@@ -3158,7 +3158,7 @@ top:
+ 
+       if (mask & (ATTR_MTIME | ATTR_SIZE)) {
+               ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
+-              ZTOI(zp)->i_mtime = timespec_trunc(vap->va_mtime,
++              ZTOI(zp)->i_mtime = zpl_inode_timespec_trunc(vap->va_mtime,
+                   ZTOI(zp)->i_sb->s_time_gran);
+ 
+               SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+@@ -3167,7 +3167,7 @@ top:
+ 
+       if (mask & (ATTR_CTIME | ATTR_SIZE)) {
+               ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
+-              ZTOI(zp)->i_ctime = timespec_trunc(vap->va_ctime,
++              ZTOI(zp)->i_ctime = zpl_inode_timespec_trunc(vap->va_ctime,
+                   ZTOI(zp)->i_sb->s_time_gran);
+               SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+                   ctime, sizeof (ctime));
+diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c
+index f508a248..e222c791 100644
+--- a/module/zfs/zfs_znode.c
++++ b/module/zfs/zfs_znode.c
+@@ -700,7 +700,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
+       uint64_t        rdev = 0;
+       zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
+       dmu_buf_t       *db;
+-      timestruc_t     now;
++      inode_timespec_t now;
+       uint64_t        gen, obj;
+       int             bonuslen;
+       int             dnodesize;
+@@ -1349,7 +1349,7 @@ void
+ zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
+     uint64_t ctime[2])
+ {
+-      timestruc_t     now;
++      inode_timespec_t now;
+ 
+       gethrestime(&now);
+ 
+diff --git a/module/zfs/zpl_inode.c b/module/zfs/zpl_inode.c
+index 3b5643d0..41b91cab 100644
+--- a/module/zfs/zpl_inode.c
++++ b/module/zfs/zpl_inode.c
+@@ -384,9 +384,10 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia)
+       vap->va_mtime = ia->ia_mtime;
+       vap->va_ctime = ia->ia_ctime;
+ 
+-      if (vap->va_mask & ATTR_ATIME)
+-              ip->i_atime = timespec_trunc(ia->ia_atime,
++      if (vap->va_mask & ATTR_ATIME) {
++              ip->i_atime = zpl_inode_timespec_trunc(ia->ia_atime,
+                   ip->i_sb->s_time_gran);
++      }
+ 
+       cookie = spl_fstrans_mark();
+       error = -zfs_setattr(ip, vap, 0, cr);
diff --git a/zfs-patches/0007-Linux-compat-4.18-check_disk_size_change.patch b/zfs-patches/0007-Linux-compat-4.18-check_disk_size_change.patch

new file mode 100644 (file)

index 0000000..e75a02c
--- /dev/null
+++ b/zfs-patches/0007-Linux-compat-4.18-check_disk_size_change.patch
@@ -0,0 +1,808 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Brian Behlendorf <behlendorf1@llnl.gov>
+Date: Fri, 15 Jun 2018 15:05:21 -0700
+Subject: [PATCH] Linux compat 4.18: check_disk_size_change()
+
+Added support for the bops->check_events() interface which was
+added in the 2.6.38 kernel to replace bops->media_changed().
+Fully implementing this functionality allows the volume resize
+code to rely on revalidate_disk(), which is the preferred
+mechanism, and removes the need to use check_disk_size_change().
+
+In order for bops->check_events() to lookup the zvol_state_t
+stored in the disk->private_data the zvol_state_lock needs to
+be held.  Since the check events interface may poll, the mutex
+has been converted to a rwlock for better concurrency.  The
+rwlock need only be taken as a writer in the zvol_free() path
+when disk->private_data is set to NULL.
+
+The configure checks for the block_device_operations structure
+were consolidated in a single kernel-block-device-operations.m4
+file.
+
+The ZFS_AC_KERNEL_BDEV_BLOCK_DEVICE_OPERATIONS configure checks
+and associated dead code were removed.  This interface was added
+to the 2.6.28 kernel which predates the oldest supported 2.6.32
+kernel and will therefore always be available.
+
+Updated maximum Linux version in META file.  The 4.17 kernel
+was released on 2018-06-03 and ZoL is compatible with the
+finalized kernel.
+
+Reviewed-by: Boris Protopopov <boris.protopopov@actifio.com>
+Reviewed-by: Sara Hartse <sara.hartse@delphix.com>
+Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Closes #7611
+
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ config/kernel-bdev-block-device-operations.m4      |  34 ---
+ .../kernel-block-device-operations-release-void.m4 |  29 ---
+ config/kernel-block-device-operations.m4           |  57 +++++
+ config/kernel.m4                                   |   2 +-
+ include/linux/blkdev_compat.h                      |   1 +
+ module/zfs/zvol.c                                  | 259 +++++++++------------
+ 6 files changed, 174 insertions(+), 208 deletions(-)
+ delete mode 100644 config/kernel-bdev-block-device-operations.m4
+ delete mode 100644 config/kernel-block-device-operations-release-void.m4
+ create mode 100644 config/kernel-block-device-operations.m4
+
+diff --git a/config/kernel-bdev-block-device-operations.m4 b/config/kernel-bdev-block-device-operations.m4
+deleted file mode 100644
+index faacc195..00000000
+--- a/config/kernel-bdev-block-device-operations.m4
++++ /dev/null
+@@ -1,34 +0,0 @@
+-dnl #
+-dnl # 2.6.x API change
+-dnl #
+-AC_DEFUN([ZFS_AC_KERNEL_BDEV_BLOCK_DEVICE_OPERATIONS], [
+-      AC_MSG_CHECKING([block device operation prototypes])
+-      tmp_flags="$EXTRA_KCFLAGS"
+-      EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+-      ZFS_LINUX_TRY_COMPILE([
+-              #include <linux/blkdev.h>
+-
+-              int blk_open(struct block_device *bdev, fmode_t mode)
+-                  { return 0; }
+-              int blk_ioctl(struct block_device *bdev, fmode_t mode,
+-                  unsigned x, unsigned long y) { return 0; }
+-              int blk_compat_ioctl(struct block_device * bdev, fmode_t mode,
+-                  unsigned x, unsigned long y) { return 0; }
+-
+-              static const struct block_device_operations
+-                  bops __attribute__ ((unused)) = {
+-                      .open           = blk_open,
+-                      .release        = NULL,
+-                      .ioctl          = blk_ioctl,
+-                      .compat_ioctl   = blk_compat_ioctl,
+-              };
+-      ],[
+-      ],[
+-              AC_MSG_RESULT(struct block_device)
+-              AC_DEFINE(HAVE_BDEV_BLOCK_DEVICE_OPERATIONS, 1,
+-                        [struct block_device_operations use bdevs])
+-      ],[
+-              AC_MSG_RESULT(struct inode)
+-      ])
+-      EXTRA_KCFLAGS="$tmp_flags"
+-])
+diff --git a/config/kernel-block-device-operations-release-void.m4 b/config/kernel-block-device-operations-release-void.m4
+deleted file mode 100644
+index a73f8587..00000000
+--- a/config/kernel-block-device-operations-release-void.m4
++++ /dev/null
+@@ -1,29 +0,0 @@
+-dnl #
+-dnl # 3.10.x API change
+-dnl #
+-AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID], [
+-      AC_MSG_CHECKING([whether block_device_operations.release is void])
+-      tmp_flags="$EXTRA_KCFLAGS"
+-      EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+-      ZFS_LINUX_TRY_COMPILE([
+-              #include <linux/blkdev.h>
+-
+-              void blk_release(struct gendisk *g, fmode_t mode) { return; }
+-
+-              static const struct block_device_operations
+-                  bops __attribute__ ((unused)) = {
+-                      .open           = NULL,
+-                      .release        = blk_release,
+-                      .ioctl          = NULL,
+-                      .compat_ioctl   = NULL,
+-              };
+-      ],[
+-      ],[
+-              AC_MSG_RESULT(void)
+-              AC_DEFINE(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID, 1,
+-                        [struct block_device_operations.release returns void])
+-      ],[
+-              AC_MSG_RESULT(int)
+-      ])
+-      EXTRA_KCFLAGS="$tmp_flags"
+-])
+diff --git a/config/kernel-block-device-operations.m4 b/config/kernel-block-device-operations.m4
+new file mode 100644
+index 00000000..5f2811c1
+--- /dev/null
++++ b/config/kernel-block-device-operations.m4
+@@ -0,0 +1,57 @@
++dnl #
++dnl # 2.6.38 API change
++dnl #
++AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS], [
++      AC_MSG_CHECKING([whether bops->check_events() exists])
++      tmp_flags="$EXTRA_KCFLAGS"
++      EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
++      ZFS_LINUX_TRY_COMPILE([
++              #include <linux/blkdev.h>
++
++              unsigned int blk_check_events(struct gendisk *disk,
++                  unsigned int clearing) { return (0); }
++
++              static const struct block_device_operations
++                  bops __attribute__ ((unused)) = {
++                      .check_events   = blk_check_events,
++              };
++      ],[
++      ],[
++              AC_MSG_RESULT(yes)
++              AC_DEFINE(HAVE_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS, 1,
++                  [bops->check_events() exists])
++      ],[
++              AC_MSG_RESULT(no)
++      ])
++      EXTRA_KCFLAGS="$tmp_flags"
++])
++
++dnl #
++dnl # 3.10.x API change
++dnl #
++AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID], [
++      AC_MSG_CHECKING([whether bops->release() is void])
++      tmp_flags="$EXTRA_KCFLAGS"
++      EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
++      ZFS_LINUX_TRY_COMPILE([
++              #include <linux/blkdev.h>
++
++              void blk_release(struct gendisk *g, fmode_t mode) { return; }
++
++              static const struct block_device_operations
++                  bops __attribute__ ((unused)) = {
++                      .open           = NULL,
++                      .release        = blk_release,
++                      .ioctl          = NULL,
++                      .compat_ioctl   = NULL,
++              };
++      ],[
++      ],[
++              AC_MSG_RESULT(void)
++              AC_DEFINE(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID, 1,
++                        [bops->release() returns void])
++      ],[
++              AC_MSG_RESULT(int)
++      ])
++      EXTRA_KCFLAGS="$tmp_flags"
++])
+diff --git a/config/kernel.m4 b/config/kernel.m4
+index 375e4b79..c7ca260c 100644
+--- a/config/kernel.m4
++++ b/config/kernel.m4
+@@ -12,7 +12,7 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
+       ZFS_AC_KERNEL_CURRENT_BIO_TAIL
+       ZFS_AC_KERNEL_SUPER_USER_NS
+       ZFS_AC_KERNEL_SUBMIT_BIO
+-      ZFS_AC_KERNEL_BDEV_BLOCK_DEVICE_OPERATIONS
++      ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS
+       ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
+       ZFS_AC_KERNEL_TYPE_FMODE_T
+       ZFS_AC_KERNEL_3ARG_BLKDEV_GET
+diff --git a/include/linux/blkdev_compat.h b/include/linux/blkdev_compat.h
+index f99980ab..27f05662 100644
+--- a/include/linux/blkdev_compat.h
++++ b/include/linux/blkdev_compat.h
+@@ -32,6 +32,7 @@
+ #include <linux/blkdev.h>
+ #include <linux/elevator.h>
+ #include <linux/backing-dev.h>
++#include <linux/msdos_fs.h>   /* for SECTOR_* */
+ 
+ #ifndef HAVE_FMODE_T
+ typedef unsigned __bitwise__ fmode_t;
+diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
+index ffa5fac7..03f95630 100644
+--- a/module/zfs/zvol.c
++++ b/module/zfs/zvol.c
+@@ -99,7 +99,7 @@ unsigned long zvol_max_discard_blocks = 16384;
+ unsigned int zvol_volmode = ZFS_VOLMODE_GEOM;
+ 
+ static taskq_t *zvol_taskq;
+-static kmutex_t zvol_state_lock;
++static krwlock_t zvol_state_lock;
+ static list_t zvol_state_list;
+ 
+ #define       ZVOL_HT_SIZE    1024
+@@ -176,17 +176,17 @@ zvol_find_by_dev(dev_t dev)
+ {
+       zvol_state_t *zv;
+ 
+-      mutex_enter(&zvol_state_lock);
++      rw_enter(&zvol_state_lock, RW_READER);
+       for (zv = list_head(&zvol_state_list); zv != NULL;
+           zv = list_next(&zvol_state_list, zv)) {
+               mutex_enter(&zv->zv_state_lock);
+               if (zv->zv_dev == dev) {
+-                      mutex_exit(&zvol_state_lock);
++                      rw_exit(&zvol_state_lock);
+                       return (zv);
+               }
+               mutex_exit(&zv->zv_state_lock);
+       }
+-      mutex_exit(&zvol_state_lock);
++      rw_exit(&zvol_state_lock);
+ 
+       return (NULL);
+ }
+@@ -204,7 +204,7 @@ zvol_find_by_name_hash(const char *name, uint64_t hash, int mode)
+       zvol_state_t *zv;
+       struct hlist_node *p = NULL;
+ 
+-      mutex_enter(&zvol_state_lock);
++      rw_enter(&zvol_state_lock, RW_READER);
+       hlist_for_each(p, ZVOL_HT_HEAD(hash)) {
+               zv = hlist_entry(p, zvol_state_t, zv_hlink);
+               mutex_enter(&zv->zv_state_lock);
+@@ -227,12 +227,12 @@ zvol_find_by_name_hash(const char *name, uint64_t hash, int mode)
+                                   strncmp(zv->zv_name, name, MAXNAMELEN)
+                                   == 0);
+                       }
+-                      mutex_exit(&zvol_state_lock);
++                      rw_exit(&zvol_state_lock);
+                       return (zv);
+               }
+               mutex_exit(&zv->zv_state_lock);
+       }
+-      mutex_exit(&zvol_state_lock);
++      rw_exit(&zvol_state_lock);
+ 
+       return (NULL);
+ }
+@@ -339,24 +339,6 @@ zvol_get_stats(objset_t *os, nvlist_t *nv)
+       return (SET_ERROR(error));
+ }
+ 
+-static void
+-zvol_size_changed(zvol_state_t *zv, uint64_t volsize)
+-{
+-      struct block_device *bdev;
+-
+-      ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+-
+-      bdev = bdget_disk(zv->zv_disk, 0);
+-      if (bdev == NULL)
+-              return;
+-
+-      set_capacity(zv->zv_disk, volsize >> 9);
+-      zv->zv_volsize = volsize;
+-      check_disk_size_change(zv->zv_disk, bdev);
+-
+-      bdput(bdev);
+-}
+-
+ /*
+  * Sanity check volume size.
+  */
+@@ -409,31 +391,17 @@ zvol_update_volsize(uint64_t volsize, objset_t *os)
+       return (error);
+ }
+ 
+-static int
+-zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize)
+-{
+-      zvol_size_changed(zv, volsize);
+-
+-      /*
+-       * We should post a event here describing the expansion.  However,
+-       * the zfs_ereport_post() interface doesn't nicely support posting
+-       * events for zvols, it assumes events relate to vdevs or zios.
+-       */
+-
+-      return (0);
+-}
+-
+ /*
+- * Set ZFS_PROP_VOLSIZE set entry point.
++ * Set ZFS_PROP_VOLSIZE set entry point.  Note that modifying the volume
++ * size will result in a udev "change" event being generated.
+  */
+ int
+ zvol_set_volsize(const char *name, uint64_t volsize)
+ {
+-      zvol_state_t *zv = NULL;
+       objset_t *os = NULL;
+-      int error;
+-      dmu_object_info_t *doi;
++      struct gendisk *disk = NULL;
+       uint64_t readonly;
++      int error;
+       boolean_t owned = B_FALSE;
+ 
+       error = dsl_prop_get_integer(name,
+@@ -443,7 +411,7 @@ zvol_set_volsize(const char *name, uint64_t volsize)
+       if (readonly)
+               return (SET_ERROR(EROFS));
+ 
+-      zv = zvol_find_by_name(name, RW_READER);
++      zvol_state_t *zv = zvol_find_by_name(name, RW_READER);
+ 
+       ASSERT(zv == NULL || (MUTEX_HELD(&zv->zv_state_lock) &&
+           RW_READ_HELD(&zv->zv_suspend_lock)));
+@@ -464,16 +432,18 @@ zvol_set_volsize(const char *name, uint64_t volsize)
+               os = zv->zv_objset;
+       }
+ 
+-      doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
++      dmu_object_info_t *doi = kmem_alloc(sizeof (*doi), KM_SLEEP);
+ 
+       if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) ||
+           (error = zvol_check_volsize(volsize, doi->doi_data_block_size)))
+               goto out;
+ 
+       error = zvol_update_volsize(volsize, os);
+-
+-      if (error == 0 && zv != NULL)
+-              error = zvol_update_live_volsize(zv, volsize);
++      if (error == 0 && zv != NULL) {
++              zv->zv_volsize = volsize;
++              zv->zv_changed = 1;
++              disk = zv->zv_disk;
++      }
+ out:
+       kmem_free(doi, sizeof (dmu_object_info_t));
+ 
+@@ -488,6 +458,9 @@ out:
+       if (zv != NULL)
+               mutex_exit(&zv->zv_state_lock);
+ 
++      if (disk != NULL)
++              revalidate_disk(disk);
++
+       return (SET_ERROR(error));
+ }
+ 
+@@ -543,8 +516,8 @@ zvol_set_volblocksize(const char *name, uint64_t volblocksize)
+       if (zv == NULL)
+               return (SET_ERROR(ENXIO));
+ 
+-      ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
+-          RW_READ_HELD(&zv->zv_suspend_lock));
++      ASSERT(MUTEX_HELD(&zv->zv_state_lock));
++      ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
+ 
+       if (zv->zv_flags & ZVOL_RDONLY) {
+               mutex_exit(&zv->zv_state_lock);
+@@ -1120,7 +1093,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
+ static void
+ zvol_insert(zvol_state_t *zv)
+ {
+-      ASSERT(MUTEX_HELD(&zvol_state_lock));
++      ASSERT(RW_WRITE_HELD(&zvol_state_lock));
+       ASSERT3U(MINOR(zv->zv_dev) & ZVOL_MINOR_MASK, ==, 0);
+       list_insert_head(&zvol_state_list, zv);
+       hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
+@@ -1132,7 +1105,7 @@ zvol_insert(zvol_state_t *zv)
+ static void
+ zvol_remove(zvol_state_t *zv)
+ {
+-      ASSERT(MUTEX_HELD(&zvol_state_lock));
++      ASSERT(RW_WRITE_HELD(&zvol_state_lock));
+       list_remove(&zvol_state_list, zv);
+       hlist_del(&zv->zv_hlink);
+ }
+@@ -1148,8 +1121,8 @@ zvol_setup_zv(zvol_state_t *zv)
+       uint64_t ro;
+       objset_t *os = zv->zv_objset;
+ 
+-      ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
+-          RW_LOCK_HELD(&zv->zv_suspend_lock));
++      ASSERT(MUTEX_HELD(&zv->zv_state_lock));
++      ASSERT(RW_LOCK_HELD(&zv->zv_suspend_lock));
+ 
+       error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
+       if (error)
+@@ -1227,8 +1200,8 @@ zvol_suspend(const char *name)
+               return (NULL);
+ 
+       /* block all I/O, release in zvol_resume. */
+-      ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
+-          RW_WRITE_HELD(&zv->zv_suspend_lock));
++      ASSERT(MUTEX_HELD(&zv->zv_state_lock));
++      ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
+ 
+       atomic_inc(&zv->zv_suspend_ref);
+ 
+@@ -1349,9 +1322,7 @@ zvol_open(struct block_device *bdev, fmode_t flag)
+       int error = 0;
+       boolean_t drop_suspend = B_TRUE;
+ 
+-      ASSERT(!MUTEX_HELD(&zvol_state_lock));
+-
+-      mutex_enter(&zvol_state_lock);
++      rw_enter(&zvol_state_lock, RW_READER);
+       /*
+        * Obtain a copy of private_data under the zvol_state_lock to make
+        * sure that either the result of zvol free code path setting
+@@ -1360,7 +1331,7 @@ zvol_open(struct block_device *bdev, fmode_t flag)
+        */
+       zv = bdev->bd_disk->private_data;
+       if (zv == NULL) {
+-              mutex_exit(&zvol_state_lock);
++              rw_exit(&zvol_state_lock);
+               return (SET_ERROR(-ENXIO));
+       }
+ 
+@@ -1384,7 +1355,7 @@ zvol_open(struct block_device *bdev, fmode_t flag)
+       } else {
+               drop_suspend = B_FALSE;
+       }
+-      mutex_exit(&zvol_state_lock);
++      rw_exit(&zvol_state_lock);
+ 
+       ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+       ASSERT(zv->zv_open_count != 0 || RW_READ_HELD(&zv->zv_suspend_lock));
+@@ -1402,11 +1373,18 @@ zvol_open(struct block_device *bdev, fmode_t flag)
+ 
+       zv->zv_open_count++;
+ 
++      mutex_exit(&zv->zv_state_lock);
++      if (drop_suspend)
++              rw_exit(&zv->zv_suspend_lock);
++
+       check_disk_change(bdev);
+ 
++      return (0);
++
+ out_open_count:
+       if (zv->zv_open_count == 0)
+               zvol_last_close(zv);
++
+ out_mutex:
+       mutex_exit(&zv->zv_state_lock);
+       if (drop_suspend)
+@@ -1427,9 +1405,7 @@ zvol_release(struct gendisk *disk, fmode_t mode)
+       zvol_state_t *zv;
+       boolean_t drop_suspend = B_TRUE;
+ 
+-      ASSERT(!MUTEX_HELD(&zvol_state_lock));
+-
+-      mutex_enter(&zvol_state_lock);
++      rw_enter(&zvol_state_lock, RW_READER);
+       zv = disk->private_data;
+ 
+       mutex_enter(&zv->zv_state_lock);
+@@ -1453,7 +1429,7 @@ zvol_release(struct gendisk *disk, fmode_t mode)
+       } else {
+               drop_suspend = B_FALSE;
+       }
+-      mutex_exit(&zvol_state_lock);
++      rw_exit(&zvol_state_lock);
+ 
+       ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+       ASSERT(zv->zv_open_count != 1 || RW_READ_HELD(&zv->zv_suspend_lock));
+@@ -1479,7 +1455,7 @@ zvol_ioctl(struct block_device *bdev, fmode_t mode,
+       zvol_state_t *zv = bdev->bd_disk->private_data;
+       int error = 0;
+ 
+-      ASSERT(zv && zv->zv_open_count > 0);
++      ASSERT3U(zv->zv_open_count, >, 0);
+ 
+       switch (cmd) {
+       case BLKFLSBUF:
+@@ -1519,23 +1495,62 @@ zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
+ #define       zvol_compat_ioctl       NULL
+ #endif
+ 
++/*
++ * Linux 2.6.38 preferred interface.
++ */
++#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS
++static unsigned int
++zvol_check_events(struct gendisk *disk, unsigned int clearing)
++{
++      unsigned int mask = 0;
++
++      rw_enter(&zvol_state_lock, RW_READER);
++
++      zvol_state_t *zv = disk->private_data;
++      if (zv != NULL) {
++              mutex_enter(&zv->zv_state_lock);
++              mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
++              zv->zv_changed = 0;
++              mutex_exit(&zv->zv_state_lock);
++      }
++
++      rw_exit(&zvol_state_lock);
++
++      return (mask);
++}
++#else
+ static int zvol_media_changed(struct gendisk *disk)
+ {
++      int changed = 0;
++
++      rw_enter(&zvol_state_lock, RW_READER);
++
+       zvol_state_t *zv = disk->private_data;
++      if (zv != NULL) {
++              mutex_enter(&zv->zv_state_lock);
++              changed = zv->zv_changed;
++              zv->zv_changed = 0;
++              mutex_exit(&zv->zv_state_lock);
++      }
+ 
+-      ASSERT(zv && zv->zv_open_count > 0);
++      rw_exit(&zvol_state_lock);
+ 
+-      return (zv->zv_changed);
++      return (changed);
+ }
++#endif
+ 
+ static int zvol_revalidate_disk(struct gendisk *disk)
+ {
+-      zvol_state_t *zv = disk->private_data;
++      rw_enter(&zvol_state_lock, RW_READER);
+ 
+-      ASSERT(zv && zv->zv_open_count > 0);
++      zvol_state_t *zv = disk->private_data;
++      if (zv != NULL) {
++              mutex_enter(&zv->zv_state_lock);
++              set_capacity(zv->zv_disk, zv->zv_volsize >> SECTOR_BITS);
++              mutex_exit(&zv->zv_state_lock);
++      }
+ 
+-      zv->zv_changed = 0;
+-      set_capacity(zv->zv_disk, zv->zv_volsize >> 9);
++      rw_exit(&zvol_state_lock);
+ 
+       return (0);
+ }
+@@ -1552,7 +1567,7 @@ zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+       zvol_state_t *zv = bdev->bd_disk->private_data;
+       sector_t sectors;
+ 
+-      ASSERT(zv && zv->zv_open_count > 0);
++      ASSERT3U(zv->zv_open_count, >, 0);
+ 
+       sectors = get_capacity(zv->zv_disk);
+ 
+@@ -1585,68 +1600,20 @@ zvol_probe(dev_t dev, int *part, void *arg)
+       return (kobj);
+ }
+ 
+-#ifdef HAVE_BDEV_BLOCK_DEVICE_OPERATIONS
+ static struct block_device_operations zvol_ops = {
+       .open                   = zvol_open,
+       .release                = zvol_release,
+       .ioctl                  = zvol_ioctl,
+       .compat_ioctl           = zvol_compat_ioctl,
+-      .media_changed          = zvol_media_changed,
+-      .revalidate_disk        = zvol_revalidate_disk,
+-      .getgeo                 = zvol_getgeo,
+-      .owner                  = THIS_MODULE,
+-};
+-
+-#else /* HAVE_BDEV_BLOCK_DEVICE_OPERATIONS */
+-
+-static int
+-zvol_open_by_inode(struct inode *inode, struct file *file)
+-{
+-      return (zvol_open(inode->i_bdev, file->f_mode));
+-}
+-
+-static int
+-zvol_release_by_inode(struct inode *inode, struct file *file)
+-{
+-      return (zvol_release(inode->i_bdev->bd_disk, file->f_mode));
+-}
+-
+-static int
+-zvol_ioctl_by_inode(struct inode *inode, struct file *file,
+-    unsigned int cmd, unsigned long arg)
+-{
+-      if (file == NULL || inode == NULL)
+-              return (SET_ERROR(-EINVAL));
+-
+-      return (zvol_ioctl(inode->i_bdev, file->f_mode, cmd, arg));
+-}
+-
+-#ifdef CONFIG_COMPAT
+-static long
+-zvol_compat_ioctl_by_inode(struct file *file,
+-    unsigned int cmd, unsigned long arg)
+-{
+-      if (file == NULL)
+-              return (SET_ERROR(-EINVAL));
+-
+-      return (zvol_compat_ioctl(file->f_dentry->d_inode->i_bdev,
+-          file->f_mode, cmd, arg));
+-}
++#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS
++      .check_events           = zvol_check_events,
+ #else
+-#define       zvol_compat_ioctl_by_inode      NULL
+-#endif
+-
+-static struct block_device_operations zvol_ops = {
+-      .open                   = zvol_open_by_inode,
+-      .release                = zvol_release_by_inode,
+-      .ioctl                  = zvol_ioctl_by_inode,
+-      .compat_ioctl           = zvol_compat_ioctl_by_inode,
+       .media_changed          = zvol_media_changed,
++#endif
+       .revalidate_disk        = zvol_revalidate_disk,
+       .getgeo                 = zvol_getgeo,
+       .owner                  = THIS_MODULE,
+ };
+-#endif /* HAVE_BDEV_BLOCK_DEVICE_OPERATIONS */
+ 
+ /*
+  * Allocate memory for a new zvol_state_t and setup the required
+@@ -1699,6 +1666,10 @@ zvol_alloc(dev_t dev, const char *name)
+       rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
+ 
+       zv->zv_disk->major = zvol_major;
++#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS
++      zv->zv_disk->events = DISK_EVENT_MEDIA_CHANGE;
++#endif
++
+       if (volmode == ZFS_VOLMODE_DEV) {
+               /*
+                * ZFS_VOLMODE_DEV disable partitioning on ZVOL devices: set
+@@ -1743,7 +1714,6 @@ zvol_free(void *arg)
+ {
+       zvol_state_t *zv = arg;
+ 
+-      ASSERT(!MUTEX_HELD(&zvol_state_lock));
+       ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
+       ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
+       ASSERT(zv->zv_open_count == 0);
+@@ -1870,9 +1840,9 @@ out_doi:
+       kmem_free(doi, sizeof (dmu_object_info_t));
+ 
+       if (error == 0) {
+-              mutex_enter(&zvol_state_lock);
++              rw_enter(&zvol_state_lock, RW_WRITER);
+               zvol_insert(zv);
+-              mutex_exit(&zvol_state_lock);
++              rw_exit(&zvol_state_lock);
+               add_disk(zv->zv_disk);
+       } else {
+               ida_simple_remove(&zvol_ida, idx);
+@@ -1889,7 +1859,7 @@ zvol_rename_minor(zvol_state_t *zv, const char *newname)
+ {
+       int readonly = get_disk_ro(zv->zv_disk);
+ 
+-      ASSERT(MUTEX_HELD(&zvol_state_lock));
++      ASSERT(RW_LOCK_HELD(&zvol_state_lock));
+       ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ 
+       strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
+@@ -2129,7 +2099,7 @@ zvol_remove_minors_impl(const char *name)
+       list_create(&free_list, sizeof (zvol_state_t),
+           offsetof(zvol_state_t, zv_next));
+ 
+-      mutex_enter(&zvol_state_lock);
++      rw_enter(&zvol_state_lock, RW_WRITER);
+ 
+       for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
+               zv_next = list_next(&zvol_state_list, zv);
+@@ -2154,15 +2124,15 @@ zvol_remove_minors_impl(const char *name)
+                       zvol_remove(zv);
+ 
+                       /*
+-                       * clear this while holding zvol_state_lock so
+-                       * zvol_open won't open it
++                       * Cleared while holding zvol_state_lock as a writer
++                       * which will prevent zvol_open() from opening it.
+                        */
+                       zv->zv_disk->private_data = NULL;
+ 
+                       /* Drop zv_state_lock before zvol_free() */
+                       mutex_exit(&zv->zv_state_lock);
+ 
+-                      /* try parallel zv_free, if failed do it in place */
++                      /* Try parallel zv_free, if failed do it in place */
+                       t = taskq_dispatch(system_taskq, zvol_free, zv,
+                           TQ_SLEEP);
+                       if (t == TASKQID_INVALID)
+@@ -2173,11 +2143,9 @@ zvol_remove_minors_impl(const char *name)
+                       mutex_exit(&zv->zv_state_lock);
+               }
+       }
+-      mutex_exit(&zvol_state_lock);
++      rw_exit(&zvol_state_lock);
+ 
+-      /*
+-       * Drop zvol_state_lock before calling zvol_free()
+-       */
++      /* Drop zvol_state_lock before calling zvol_free() */
+       while ((zv = list_head(&free_list)) != NULL) {
+               list_remove(&free_list, zv);
+               zvol_free(zv);
+@@ -2196,7 +2164,7 @@ zvol_remove_minor_impl(const char *name)
+       if (zvol_inhibit_dev)
+               return;
+ 
+-      mutex_enter(&zvol_state_lock);
++      rw_enter(&zvol_state_lock, RW_WRITER);
+ 
+       for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
+               zv_next = list_next(&zvol_state_list, zv);
+@@ -2216,7 +2184,10 @@ zvol_remove_minor_impl(const char *name)
+                       }
+                       zvol_remove(zv);
+ 
+-                      /* clear this so zvol_open won't open it */
++                      /*
++                       * Cleared while holding zvol_state_lock as a writer
++                       * which will prevent zvol_open() from opening it.
++                       */
+                       zv->zv_disk->private_data = NULL;
+ 
+                       mutex_exit(&zv->zv_state_lock);
+@@ -2227,7 +2198,7 @@ zvol_remove_minor_impl(const char *name)
+       }
+ 
+       /* Drop zvol_state_lock before calling zvol_free() */
+-      mutex_exit(&zvol_state_lock);
++      rw_exit(&zvol_state_lock);
+ 
+       if (zv != NULL)
+               zvol_free(zv);
+@@ -2248,7 +2219,7 @@ zvol_rename_minors_impl(const char *oldname, const char *newname)
+       oldnamelen = strlen(oldname);
+       newnamelen = strlen(newname);
+ 
+-      mutex_enter(&zvol_state_lock);
++      rw_enter(&zvol_state_lock, RW_READER);
+ 
+       for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
+               zv_next = list_next(&zvol_state_list, zv);
+@@ -2276,7 +2247,7 @@ zvol_rename_minors_impl(const char *oldname, const char *newname)
+               mutex_exit(&zv->zv_state_lock);
+       }
+ 
+-      mutex_exit(&zvol_state_lock);
++      rw_exit(&zvol_state_lock);
+ }
+ 
+ typedef struct zvol_snapdev_cb_arg {
+@@ -2653,7 +2624,7 @@ zvol_init(void)
+ 
+       list_create(&zvol_state_list, sizeof (zvol_state_t),
+           offsetof(zvol_state_t, zv_next));
+-      mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
++      rw_init(&zvol_state_lock, NULL, RW_DEFAULT, NULL);
+       ida_init(&zvol_ida);
+ 
+       zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri,
+@@ -2690,7 +2661,7 @@ out_taskq:
+       taskq_destroy(zvol_taskq);
+ out:
+       ida_destroy(&zvol_ida);
+-      mutex_destroy(&zvol_state_lock);
++      rw_destroy(&zvol_state_lock);
+       list_destroy(&zvol_state_list);
+ 
+       return (SET_ERROR(error));
+@@ -2707,7 +2678,7 @@ zvol_fini(void)
+ 
+       taskq_destroy(zvol_taskq);
+       list_destroy(&zvol_state_list);
+-      mutex_destroy(&zvol_state_lock);
++      rw_destroy(&zvol_state_lock);
+ 
+       ida_destroy(&zvol_ida);
+ }
diff --git a/zfs-patches/0008-OpenZFS-8997-ztest-assertion-failure-in-zil_lwb_writ.patch b/zfs-patches/0008-OpenZFS-8997-ztest-assertion-failure-in-zil_lwb_writ.patch

new file mode 100644 (file)

index 0000000..f6498c9
--- /dev/null
+++ b/zfs-patches/0008-OpenZFS-8997-ztest-assertion-failure-in-zil_lwb_writ.patch
@@ -0,0 +1,368 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Prakash Surya <prakash.surya@delphix.com>
+Date: Mon, 8 Jan 2018 13:45:53 -0800
+Subject: [PATCH] OpenZFS 8997 - ztest assertion failure in zil_lwb_write_issue
+
+PROBLEM
+=======
+
+When `dmu_tx_assign` is called from `zil_lwb_write_issue`, it's possible
+for either `ERESTART` or `EIO` to be returned.
+
+If `ERESTART` is returned, this will cause an assertion to fail directly
+in `zil_lwb_write_issue`, where the code assumes the return value is
+`EIO` if `dmu_tx_assign` returns a non-zero value. This can occur if the
+SPA is suspended when `dmu_tx_assign` is called, and most often occurs
+when running `zloop`.
+
+If `EIO` is returned, this can cause assertions to fail elsewhere in the
+ZIL code. For example, `zil_commit_waiter_timeout` contains the
+following logic:
+
+    lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
+    ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
+
+In this case, if `dmu_tx_assign` returned `EIO` from within
+`zil_lwb_write_issue`, the `lwb` variable passed in will not be issued
+to disk. Thus, its `lwb_state` field will remain `LWB_STATE_OPENED` and
+this assertion will fail. `zil_commit_waiter_timeout` assumes that after
+it calls `zil_lwb_write_issue`, the `lwb` will be issued to disk, and
+doesn't handle the case where this is not true; i.e. it doesn't handle
+the case where `dmu_tx_assign` returns `EIO`.
+
+SOLUTION
+========
+
+This change modifies the `dmu_tx_assign` function such that `txg_how` is
+a bitmask, rather than of the `txg_how_t` enum type. Now, the previous
+`TXG_WAITED` semantics can be used via `TXG_NOTHROTTLE`, along with
+specifying either `TXG_NOWAIT` or `TXG_WAIT` semantics.
+
+Previously, when `TXG_WAITED` was specified, `TXG_NOWAIT` semantics was
+automatically invoked. This was not ideal when using `TXG_WAITED` within
+`zil_lwb_write_issue`, leading to the problem described above. Rather, we
+want to achieve the semantics of `TXG_WAIT`, while also preventing the
+`tx` from being penalized via the dirty delay throttling.
+
+With this change, `zil_lwb_write_issue` can achieve the semantics that
+it requires by passing in the value `TXG_WAIT | TXG_NOTHROTTLE` to
+`dmu_tx_assign`.
+
+Further, consumers of `dmu_tx_assign` wishing to achieve the old
+`TXG_WAITED` semantics can pass in the value `TXG_NOWAIT | TXG_NOTHROTTLE`.
+
+Authored by: Prakash Surya <prakash.surya@delphix.com>
+Approved by: Robert Mustacchi <rm@joyent.com>
+Reviewed by: Matt Ahrens <mahrens@delphix.com>
+Reviewed by: Andriy Gapon <avg@FreeBSD.org>
+Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
+
+Porting Notes:
+- Additionally updated `zfs_tmpfile` to use `TXG_NOTHROTTLE`
+
+OpenZFS-issue: https://www.illumos.org/issues/8997
+OpenZFS-commit: https://github.com/openzfs/openzfs/commit/19ea6cb0f9
+Closes #7084
+
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ include/sys/dmu.h      | 15 +++++++------
+ include/sys/dmu_tx.h   |  8 +++----
+ module/zfs/dmu_tx.c    | 57 ++++++++++++++++++++++++++------------------------
+ module/zfs/zfs_vnops.c | 21 ++++++++++---------
+ module/zfs/zil.c       | 10 ++++++++-
+ 5 files changed, 63 insertions(+), 48 deletions(-)
+
+diff --git a/include/sys/dmu.h b/include/sys/dmu.h
+index 755a9056..5b355afb 100644
+--- a/include/sys/dmu.h
++++ b/include/sys/dmu.h
+@@ -227,11 +227,14 @@ typedef enum dmu_object_type {
+       DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE),
+ } dmu_object_type_t;
+ 
+-typedef enum txg_how {
+-      TXG_WAIT = 1,
+-      TXG_NOWAIT,
+-      TXG_WAITED,
+-} txg_how_t;
++/*
++ * These flags are intended to be used to specify the "txg_how"
++ * parameter when calling the dmu_tx_assign() function. See the comment
++ * above dmu_tx_assign() for more details on the meaning of these flags.
++ */
++#define       TXG_NOWAIT      (0ULL)
++#define       TXG_WAIT        (1ULL<<0)
++#define       TXG_NOTHROTTLE  (1ULL<<1)
+ 
+ void byteswap_uint64_array(void *buf, size_t size);
+ void byteswap_uint32_array(void *buf, size_t size);
+@@ -694,7 +697,7 @@ void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object);
+ void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
+ void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
+ void dmu_tx_abort(dmu_tx_t *tx);
+-int dmu_tx_assign(dmu_tx_t *tx, enum txg_how txg_how);
++int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
+ void dmu_tx_wait(dmu_tx_t *tx);
+ void dmu_tx_commit(dmu_tx_t *tx);
+ void dmu_tx_mark_netfree(dmu_tx_t *tx);
+diff --git a/include/sys/dmu_tx.h b/include/sys/dmu_tx.h
+index d82a7931..74b7e111 100644
+--- a/include/sys/dmu_tx.h
++++ b/include/sys/dmu_tx.h
+@@ -67,9 +67,6 @@ struct dmu_tx {
+       /* placeholder for syncing context, doesn't need specific holds */
+       boolean_t tx_anyobj;
+ 
+-      /* has this transaction already been delayed? */
+-      boolean_t tx_waited;
+-
+       /* transaction is marked as being a "net free" of space */
+       boolean_t tx_netfree;
+ 
+@@ -79,6 +76,9 @@ struct dmu_tx {
+       /* need to wait for sufficient dirty space */
+       boolean_t tx_wait_dirty;
+ 
++      /* has this transaction already been delayed? */
++      boolean_t tx_dirty_delayed;
++
+       int tx_err;
+ };
+ 
+@@ -138,7 +138,7 @@ extern dmu_tx_stats_t dmu_tx_stats;
+  * These routines are defined in dmu.h, and are called by the user.
+  */
+ dmu_tx_t *dmu_tx_create(objset_t *dd);
+-int dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how);
++int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
+ void dmu_tx_commit(dmu_tx_t *tx);
+ void dmu_tx_abort(dmu_tx_t *tx);
+ uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
+diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
+index c3cc03a6..6ebff267 100644
+--- a/module/zfs/dmu_tx.c
++++ b/module/zfs/dmu_tx.c
+@@ -854,7 +854,7 @@ dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
+  * decreasing performance.
+  */
+ static int
+-dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
++dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
+ {
+       spa_t *spa = tx->tx_pool->dp_spa;
+ 
+@@ -878,13 +878,13 @@ dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
+                * of the failuremode setting.
+                */
+               if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
+-                  txg_how != TXG_WAIT)
++                  !(txg_how & TXG_WAIT))
+                       return (SET_ERROR(EIO));
+ 
+               return (SET_ERROR(ERESTART));
+       }
+ 
+-      if (!tx->tx_waited &&
++      if (!tx->tx_dirty_delayed &&
+           dsl_pool_need_dirty_delay(tx->tx_pool)) {
+               tx->tx_wait_dirty = B_TRUE;
+               DMU_TX_STAT_BUMP(dmu_tx_dirty_delay);
+@@ -976,41 +976,44 @@ dmu_tx_unassign(dmu_tx_t *tx)
+ }
+ 
+ /*
+- * Assign tx to a transaction group.  txg_how can be one of:
++ * Assign tx to a transaction group; txg_how is a bitmask:
+  *
+- * (1)        TXG_WAIT.  If the current open txg is full, waits until there's
+- *    a new one.  This should be used when you're not holding locks.
+- *    It will only fail if we're truly out of space (or over quota).
++ * If TXG_WAIT is set and the currently open txg is full, this function
++ * will wait until there's a new txg. This should be used when no locks
++ * are being held. With this bit set, this function will only fail if
++ * we're truly out of space (or over quota).
+  *
+- * (2)        TXG_NOWAIT.  If we can't assign into the current open txg without
+- *    blocking, returns immediately with ERESTART.  This should be used
+- *    whenever you're holding locks.  On an ERESTART error, the caller
+- *    should drop locks, do a dmu_tx_wait(tx), and try again.
++ * If TXG_WAIT is *not* set and we can't assign into the currently open
++ * txg without blocking, this function will return immediately with
++ * ERESTART. This should be used whenever locks are being held.  On an
++ * ERESTART error, the caller should drop all locks, call dmu_tx_wait(),
++ * and try again.
+  *
+- * (3)        TXG_WAITED.  Like TXG_NOWAIT, but indicates that dmu_tx_wait()
+- *    has already been called on behalf of this operation (though
+- *    most likely on a different tx).
++ * If TXG_NOTHROTTLE is set, this indicates that this tx should not be
++ * delayed due on the ZFS Write Throttle (see comments in dsl_pool.c for
++ * details on the throttle). This is used by the VFS operations, after
++ * they have already called dmu_tx_wait() (though most likely on a
++ * different tx).
+  */
+ int
+-dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
++dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
+ {
+       int err;
+ 
+       ASSERT(tx->tx_txg == 0);
+-      ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
+-          txg_how == TXG_WAITED);
++      ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE));
+       ASSERT(!dsl_pool_sync_context(tx->tx_pool));
+ 
+-      if (txg_how == TXG_WAITED)
+-              tx->tx_waited = B_TRUE;
+-
+       /* If we might wait, we must not hold the config lock. */
+-      ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
++      IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool));
++
++      if ((txg_how & TXG_NOTHROTTLE))
++              tx->tx_dirty_delayed = B_TRUE;
+ 
+       while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
+               dmu_tx_unassign(tx);
+ 
+-              if (err != ERESTART || txg_how != TXG_WAIT)
++              if (err != ERESTART || !(txg_how & TXG_WAIT))
+                       return (err);
+ 
+               dmu_tx_wait(tx);
+@@ -1054,12 +1057,12 @@ dmu_tx_wait(dmu_tx_t *tx)
+               tx->tx_wait_dirty = B_FALSE;
+ 
+               /*
+-               * Note: setting tx_waited only has effect if the caller
+-               * used TX_WAIT.  Otherwise they are going to destroy
+-               * this tx and try again.  The common case, zfs_write(),
+-               * uses TX_WAIT.
++               * Note: setting tx_dirty_delayed only has effect if the
++               * caller used TX_WAIT.  Otherwise they are going to
++               * destroy this tx and try again.  The common case,
++               * zfs_write(), uses TX_WAIT.
+                */
+-              tx->tx_waited = B_TRUE;
++              tx->tx_dirty_delayed = B_TRUE;
+       } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
+               /*
+                * If the pool is suspended we need to wait until it
+diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
+index 34ea751c..4805f897 100644
+--- a/module/zfs/zfs_vnops.c
++++ b/module/zfs/zfs_vnops.c
+@@ -129,7 +129,7 @@
+  *
+  *    If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
+  *    then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
+- *    calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
++ *    calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
+  *    to indicate that this operation has already called dmu_tx_wait().
+  *    This will ensure that we don't retry forever, waiting a short bit
+  *    each time.
+@@ -154,7 +154,7 @@
+  *    rw_enter(...);                  // grab any other locks you need
+  *    tx = dmu_tx_create(...);        // get DMU tx
+  *    dmu_tx_hold_*();                // hold each object you might modify
+- *    error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
++ *    error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+  *    if (error) {
+  *            rw_exit(...);           // drop locks
+  *            zfs_dirent_unlock(dl);  // unlock directory entry
+@@ -1427,7 +1427,8 @@ top:
+                       dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+                           0, acl_ids.z_aclp->z_acl_bytes);
+               }
+-              error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
++              error = dmu_tx_assign(tx,
++                  (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+               if (error) {
+                       zfs_dirent_unlock(dl);
+                       if (error == ERESTART) {
+@@ -1602,7 +1603,7 @@ top:
+               dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+                   0, acl_ids.z_aclp->z_acl_bytes);
+       }
+-      error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
++      error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+       if (error) {
+               if (error == ERESTART) {
+                       waited = B_TRUE;
+@@ -1775,7 +1776,7 @@ top:
+        */
+       dmu_tx_mark_netfree(tx);
+ 
+-      error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
++      error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+       if (error) {
+               zfs_dirent_unlock(dl);
+               if (error == ERESTART) {
+@@ -2017,7 +2018,7 @@ top:
+       dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+           ZFS_SA_BASE_ATTR_SIZE);
+ 
+-      error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
++      error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+       if (error) {
+               zfs_dirent_unlock(dl);
+               if (error == ERESTART) {
+@@ -2156,7 +2157,7 @@ top:
+       zfs_sa_upgrade_txholds(tx, zp);
+       zfs_sa_upgrade_txholds(tx, dzp);
+       dmu_tx_mark_netfree(tx);
+-      error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
++      error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+       if (error) {
+               rw_exit(&zp->z_parent_lock);
+               rw_exit(&zp->z_name_lock);
+@@ -3623,7 +3624,7 @@ top:
+ 
+       zfs_sa_upgrade_txholds(tx, szp);
+       dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+-      error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
++      error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+       if (error) {
+               if (zl != NULL)
+                       zfs_rename_unlock(&zl);
+@@ -3815,7 +3816,7 @@ top:
+       }
+       if (fuid_dirtied)
+               zfs_fuid_txhold(zfsvfs, tx);
+-      error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
++      error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+       if (error) {
+               zfs_dirent_unlock(dl);
+               if (error == ERESTART) {
+@@ -4041,7 +4042,7 @@ top:
+ 
+       zfs_sa_upgrade_txholds(tx, szp);
+       zfs_sa_upgrade_txholds(tx, dzp);
+-      error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
++      error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+       if (error) {
+               zfs_dirent_unlock(dl);
+               if (error == ERESTART) {
+diff --git a/module/zfs/zil.c b/module/zfs/zil.c
+index 645b1d4d..a2bbdcb9 100644
+--- a/module/zfs/zil.c
++++ b/module/zfs/zil.c
+@@ -1009,7 +1009,15 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
+        * to clean up in the event of allocation failure or I/O failure.
+        */
+       tx = dmu_tx_create(zilog->zl_os);
+-      VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
++
++      /*
++       * Since we are not going to create any new dirty data, and we
++       * can even help with clearing the existing dirty data, we
++       * should not be subject to the dirty data based delays. We
++       * use TXG_NOTHROTTLE to bypass the delay mechanism.
++       */
++      VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
++
+       dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+       txg = dmu_tx_get_txg(tx);
+ 
diff --git a/zfs-patches/0009-Fix-divide-by-zero-in-mmp_delay_update.patch b/zfs-patches/0009-Fix-divide-by-zero-in-mmp_delay_update.patch

new file mode 100644 (file)

index 0000000..d39118b
--- /dev/null
+++ b/zfs-patches/0009-Fix-divide-by-zero-in-mmp_delay_update.patch
@@ -0,0 +1,34 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Olaf Faaland <faaland1@llnl.gov>
+Date: Fri, 6 Apr 2018 13:29:11 -0700
+Subject: [PATCH] Fix divide-by-zero in mmp_delay_update()
+
+vdev_count_leaves() in the denominator may return 0, caught by Coverity.
+Introduced by
+
+* 533ea04 Update mmp_delay on sync or skipped, failed write
+
+Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
+Reviewed-by: George Melikov <mail@gmelikov.ru>
+Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
+Closes #7391
+
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ module/zfs/mmp.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c
+index 1ae5f31f..3b74a6b6 100644
+--- a/module/zfs/mmp.c
++++ b/module/zfs/mmp.c
+@@ -327,7 +327,7 @@ mmp_delay_update(spa_t *spa, boolean_t write_completed)
+        */
+       if (delay < mts->mmp_delay) {
+               hrtime_t min_delay = MSEC2NSEC(zfs_multihost_interval) /
+-                  vdev_count_leaves(spa);
++                  MAX(1, vdev_count_leaves(spa));
+               mts->mmp_delay = MAX(((delay + mts->mmp_delay * 127) / 128),
+                   min_delay);
+       }
diff --git a/zfs-patches/0010-Fix-ENOSPC-in-Handle-zap_add-failures-in.patch b/zfs-patches/0010-Fix-ENOSPC-in-Handle-zap_add-failures-in.patch

new file mode 100644 (file)

index 0000000..910f4b8
--- /dev/null
+++ b/zfs-patches/0010-Fix-ENOSPC-in-Handle-zap_add-failures-in.patch
@@ -0,0 +1,867 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Chunwei Chen <tuxoko@gmail.com>
+Date: Wed, 18 Apr 2018 14:19:50 -0700
+Subject: [PATCH] Fix ENOSPC in "Handle zap_add() failures in ..."
+
+Commit cc63068 caused ENOSPC errors when copying a large number of files
+between two directories. The reason is that the patch limits zap leaf
+expansion to 2 retries, and returns ENOSPC when they fail.
+
+The intent for limiting retries is to prevent pointlessly growing table
+to max size when adding a block full of entries with same name in
+different case in mixed mode. However, it turns out we cannot use any
+limit on the retry. When we copy files from one directory in readdir
+order, we are copying in hash order, one leaf block at a time. Which
+means that if the leaf block in source directory has expanded 6 times,
+and you copy those entries in that block, by the time you need to expand
+the leaf in destination directory, you need to expand it 6 times in one
+go. So any limit on the retry will result in error where it shouldn't.
+
+Note that while we do use different salt for different directories, it
+seems that the salt/hash function doesn't provide enough randomization
+to the hash distance to prevent this from happening.
+
+Since cc63068 has already been reverted, this patch adds it back and
+removes the retry limit.
+
+Also, as it turns out, failing on zap_add() has a serious side effect for
+mzap_upgrade(). When upgrading from micro zap to fat zap, it will
+call zap_add() to transfer entries one at a time. If it hits any error
+halfway through, the remaining entries will be lost, causing those files
+to become orphans. This patch adds a VERIFY to catch it.
+
+Reviewed-by: Sanjeev Bagewadi <sanjeev.bagewadi@gmail.com>
+Reviewed-by: Richard Yao <ryao@gentoo.org>
+Reviewed-by: Tony Hutter <hutter2@llnl.gov>
+Reviewed-by: Albert Lee <trisk@forkgnu.org>
+Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Reviewed by: Matthew Ahrens <mahrens@delphix.com>
+Signed-off-by: Chunwei Chen <david.chen@nutanix.com>
+Closes #7401
+Closes #7421
+
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ configure.ac                                       |   1 +
+ include/sys/zap_leaf.h                             |  15 ++-
+ module/zfs/zap.c                                   |  10 +-
+ module/zfs/zap_leaf.c                              |   2 +-
+ module/zfs/zap_micro.c                             |  47 ++++++-
+ module/zfs/zfs_dir.c                               |  29 ++++-
+ module/zfs/zfs_vnops.c                             |  74 ++++++++---
+ tests/runfiles/linux.run                           |   6 +-
+ tests/zfs-tests/tests/functional/Makefile.am       |   1 +
+ .../tests/functional/casenorm/Makefile.am          |   1 +
+ .../functional/casenorm/mixed_create_failure.ksh   | 136 +++++++++++++++++++++
+ .../zfs-tests/tests/functional/cp_files/.gitignore |   1 +
+ .../tests/functional/cp_files/Makefile.am          |  13 ++
+ .../tests/functional/cp_files/cleanup.ksh          |  34 ++++++
+ .../zfs-tests/tests/functional/cp_files/cp_files.c |  58 +++++++++
+ .../tests/functional/cp_files/cp_files_001_pos.ksh |  74 +++++++++++
+ .../zfs-tests/tests/functional/cp_files/setup.ksh  |  35 ++++++
+ 17 files changed, 500 insertions(+), 37 deletions(-)
+ create mode 100755 tests/zfs-tests/tests/functional/casenorm/mixed_create_failure.ksh
+ create mode 100644 tests/zfs-tests/tests/functional/cp_files/.gitignore
+ create mode 100644 tests/zfs-tests/tests/functional/cp_files/Makefile.am
+ create mode 100755 tests/zfs-tests/tests/functional/cp_files/cleanup.ksh
+ create mode 100644 tests/zfs-tests/tests/functional/cp_files/cp_files.c
+ create mode 100755 tests/zfs-tests/tests/functional/cp_files/cp_files_001_pos.ksh
+ create mode 100755 tests/zfs-tests/tests/functional/cp_files/setup.ksh
+
+diff --git a/configure.ac b/configure.ac
+index d9441a0f..3f4925c3 100644
+--- a/configure.ac
++++ b/configure.ac
+@@ -238,6 +238,7 @@ AC_CONFIG_FILES([
+       tests/zfs-tests/tests/functional/cli_user/zpool_iostat/Makefile
+       tests/zfs-tests/tests/functional/cli_user/zpool_list/Makefile
+       tests/zfs-tests/tests/functional/compression/Makefile
++      tests/zfs-tests/tests/functional/cp_files/Makefile
+       tests/zfs-tests/tests/functional/ctime/Makefile
+       tests/zfs-tests/tests/functional/delegate/Makefile
+       tests/zfs-tests/tests/functional/devices/Makefile
+diff --git a/include/sys/zap_leaf.h b/include/sys/zap_leaf.h
+index e784c596..a3da1036 100644
+--- a/include/sys/zap_leaf.h
++++ b/include/sys/zap_leaf.h
+@@ -46,10 +46,15 @@ struct zap_stats;
+  * block size (1<<l->l_bs) - hash entry size (2) * number of hash
+  * entries - header space (2*chunksize)
+  */
+-#define       ZAP_LEAF_NUMCHUNKS(l) \
+-      (((1<<(l)->l_bs) - 2*ZAP_LEAF_HASH_NUMENTRIES(l)) / \
++#define       ZAP_LEAF_NUMCHUNKS_BS(bs) \
++      (((1<<(bs)) - 2*ZAP_LEAF_HASH_NUMENTRIES_BS(bs)) / \
+       ZAP_LEAF_CHUNKSIZE - 2)
+ 
++#define       ZAP_LEAF_NUMCHUNKS(l) (ZAP_LEAF_NUMCHUNKS_BS(((l)->l_bs)))
++
++#define       ZAP_LEAF_NUMCHUNKS_DEF \
++      (ZAP_LEAF_NUMCHUNKS_BS(fzap_default_block_shift))
++
+ /*
+  * The amount of space within the chunk available for the array is:
+  * chunk size - space for type (1) - space for next pointer (2)
+@@ -74,8 +79,10 @@ struct zap_stats;
+  * which is less than block size / CHUNKSIZE (24) / minimum number of
+  * chunks per entry (3).
+  */
+-#define       ZAP_LEAF_HASH_SHIFT(l) ((l)->l_bs - 5)
+-#define       ZAP_LEAF_HASH_NUMENTRIES(l) (1 << ZAP_LEAF_HASH_SHIFT(l))
++#define       ZAP_LEAF_HASH_SHIFT_BS(bs) ((bs) - 5)
++#define       ZAP_LEAF_HASH_NUMENTRIES_BS(bs) (1 << ZAP_LEAF_HASH_SHIFT_BS(bs))
++#define       ZAP_LEAF_HASH_SHIFT(l) (ZAP_LEAF_HASH_SHIFT_BS(((l)->l_bs)))
++#define       ZAP_LEAF_HASH_NUMENTRIES(l) (ZAP_LEAF_HASH_NUMENTRIES_BS(((l)->l_bs)))
+ 
+ /*
+  * The chunks start immediately after the hash table.  The end of the
+diff --git a/module/zfs/zap.c b/module/zfs/zap.c
+index ee9962bf..47b4c1ab 100644
+--- a/module/zfs/zap.c
++++ b/module/zfs/zap.c
+@@ -853,8 +853,16 @@ retry:
+       } else if (err == EAGAIN) {
+               err = zap_expand_leaf(zn, l, tag, tx, &l);
+               zap = zn->zn_zap;       /* zap_expand_leaf() may change zap */
+-              if (err == 0)
++              if (err == 0) {
+                       goto retry;
++              } else if (err == ENOSPC) {
++                      /*
++                       * If we failed to expand the leaf, then bailout
++                       * as there is no point trying
++                       * zap_put_leaf_maybe_grow_ptrtbl().
++                       */
++                      return (err);
++              }
+       }
+ 
+ out:
+diff --git a/module/zfs/zap_leaf.c b/module/zfs/zap_leaf.c
+index c342695c..526e4660 100644
+--- a/module/zfs/zap_leaf.c
++++ b/module/zfs/zap_leaf.c
+@@ -53,7 +53,7 @@ static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
+       ((h) >> \
+       (64 - ZAP_LEAF_HASH_SHIFT(l) - zap_leaf_phys(l)->l_hdr.lh_prefix_len)))
+ 
+-#define       LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)])
++#define       LEAF_HASH_ENTPTR(l, h)  (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)])
+ 
+ extern inline zap_leaf_phys_t *zap_leaf_phys(zap_leaf_t *l);
+ 
+diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c
+index 3ebf995c..60e193ef 100644
+--- a/module/zfs/zap_micro.c
++++ b/module/zfs/zap_micro.c
+@@ -363,6 +363,41 @@ mze_find_unused_cd(zap_t *zap, uint64_t hash)
+       return (cd);
+ }
+ 
++/*
++ * Each mzap entry requires at max : 4 chunks
++ * 3 chunks for names + 1 chunk for value.
++ */
++#define       MZAP_ENT_CHUNKS (1 + ZAP_LEAF_ARRAY_NCHUNKS(MZAP_NAME_LEN) + \
++      ZAP_LEAF_ARRAY_NCHUNKS(sizeof (uint64_t)))
++
++/*
++ * Check if the current entry keeps the colliding entries under the fatzap leaf
++ * size.
++ */
++static boolean_t
++mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash)
++{
++      zap_t *zap = zn->zn_zap;
++      mzap_ent_t mze_tofind;
++      mzap_ent_t *mze;
++      avl_index_t idx;
++      avl_tree_t *avl = &zap->zap_m.zap_avl;
++      uint32_t mzap_ents = 0;
++
++      mze_tofind.mze_hash = hash;
++      mze_tofind.mze_cd = 0;
++
++      for (mze = avl_find(avl, &mze_tofind, &idx);
++          mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
++              mzap_ents++;
++      }
++
++      /* Include the new entry being added */
++      mzap_ents++;
++
++      return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS));
++}
++
+ static void
+ mze_remove(zap_t *zap, mzap_ent_t *mze)
+ {
+@@ -639,16 +674,15 @@ mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags)
+               dprintf("adding %s=%llu\n",
+                   mze->mze_name, mze->mze_value);
+               zn = zap_name_alloc(zap, mze->mze_name, 0);
+-              err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
+-                  tag, tx);
++              /* If we fail here, we would end up losing entries */
++              VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
++                  tag, tx));
+               zap = zn->zn_zap;       /* fzap_add_cd() may change zap */
+               zap_name_free(zn);
+-              if (err)
+-                      break;
+       }
+       vmem_free(mzp, sz);
+       *zapp = zap;
+-      return (err);
++      return (0);
+ }
+ 
+ /*
+@@ -1191,7 +1225,8 @@ zap_add_impl(zap_t *zap, const char *key,
+               err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
+               zap = zn->zn_zap;       /* fzap_add() may change zap */
+       } else if (integer_size != 8 || num_integers != 1 ||
+-          strlen(key) >= MZAP_NAME_LEN) {
++          strlen(key) >= MZAP_NAME_LEN ||
++          !mze_canfit_fzap_leaf(zn, zn->zn_hash)) {
+               err = mzap_upgrade(&zn->zn_zap, tag, tx, 0);
+               if (err == 0) {
+                       err = fzap_add(zn, integer_size, num_integers, val,
+diff --git a/module/zfs/zfs_dir.c b/module/zfs/zfs_dir.c
+index 9a8bbccd..6398a1d1 100644
+--- a/module/zfs/zfs_dir.c
++++ b/module/zfs/zfs_dir.c
+@@ -742,7 +742,11 @@ zfs_dirent(znode_t *zp, uint64_t mode)
+ }
+ 
+ /*
+- * Link zp into dl.  Can only fail if zp has been unlinked.
++ * Link zp into dl.  Can fail in the following cases :
++ * - if zp has been unlinked.
++ * - if the number of entries with the same hash (aka. colliding entries)
++ *    exceed the capacity of a leaf-block of fatzap and splitting of the
++ *    leaf-block does not help.
+  */
+ int
+ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
+@@ -776,6 +780,24 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
+                           NULL, &links, sizeof (links));
+               }
+       }
++
++      value = zfs_dirent(zp, zp->z_mode);
++      error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, 8, 1,
++          &value, tx);
++
++      /*
++       * zap_add could fail to add the entry if it exceeds the capacity of the
++       * leaf-block and zap_leaf_split() failed to help.
++       * The caller of this routine is responsible for failing the transaction
++       * which will rollback the SA updates done above.
++       */
++      if (error != 0) {
++              if (!(flag & ZRENAMING) && !(flag & ZNEW))
++                      drop_nlink(ZTOI(zp));
++              mutex_exit(&zp->z_lock);
++              return (error);
++      }
++
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
+           &dzp->z_id, sizeof (dzp->z_id));
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+@@ -813,11 +835,6 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
+       ASSERT(error == 0);
+       mutex_exit(&dzp->z_lock);
+ 
+-      value = zfs_dirent(zp, zp->z_mode);
+-      error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name,
+-          8, 1, &value, tx);
+-      ASSERT(error == 0);
+-
+       return (0);
+ }
+ 
+diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
+index 4805f897..5a2e55eb 100644
+--- a/module/zfs/zfs_vnops.c
++++ b/module/zfs/zfs_vnops.c
+@@ -1427,6 +1427,7 @@ top:
+                       dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+                           0, acl_ids.z_aclp->z_acl_bytes);
+               }
++
+               error = dmu_tx_assign(tx,
+                   (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+               if (error) {
+@@ -1444,10 +1445,22 @@ top:
+               }
+               zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+ 
++              error = zfs_link_create(dl, zp, tx, ZNEW);
++              if (error != 0) {
++                      /*
++                       * Since, we failed to add the directory entry for it,
++                       * delete the newly created dnode.
++                       */
++                      zfs_znode_delete(zp, tx);
++                      remove_inode_hash(ZTOI(zp));
++                      zfs_acl_ids_free(&acl_ids);
++                      dmu_tx_commit(tx);
++                      goto out;
++              }
++
+               if (fuid_dirtied)
+                       zfs_fuid_sync(zfsvfs, tx);
+ 
+-              (void) zfs_link_create(dl, zp, tx, ZNEW);
+               txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
+               if (flag & FIGNORECASE)
+                       txtype |= TX_CI;
+@@ -2038,13 +2051,18 @@ top:
+        */
+       zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+ 
+-      if (fuid_dirtied)
+-              zfs_fuid_sync(zfsvfs, tx);
+-
+       /*
+        * Now put new name in parent dir.
+        */
+-      (void) zfs_link_create(dl, zp, tx, ZNEW);
++      error = zfs_link_create(dl, zp, tx, ZNEW);
++      if (error != 0) {
++              zfs_znode_delete(zp, tx);
++              remove_inode_hash(ZTOI(zp));
++              goto out;
++      }
++
++      if (fuid_dirtied)
++              zfs_fuid_sync(zfsvfs, tx);
+ 
+       *ipp = ZTOI(zp);
+ 
+@@ -2054,6 +2072,7 @@ top:
+       zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
+           acl_ids.z_fuidp, vap);
+ 
++out:
+       zfs_acl_ids_free(&acl_ids);
+ 
+       dmu_tx_commit(tx);
+@@ -2063,10 +2082,14 @@ top:
+       if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+               zil_commit(zilog, 0);
+ 
+-      zfs_inode_update(dzp);
+-      zfs_inode_update(zp);
++      if (error != 0) {
++              iput(ZTOI(zp));
++      } else {
++              zfs_inode_update(dzp);
++              zfs_inode_update(zp);
++      }
+       ZFS_EXIT(zfsvfs);
+-      return (0);
++      return (error);
+ }
+ 
+ /*
+@@ -3684,6 +3707,13 @@ top:
+                               VERIFY3U(zfs_link_destroy(tdl, szp, tx,
+                                   ZRENAMING, NULL), ==, 0);
+                       }
++              } else {
++                      /*
++                       * If we had removed the existing target, subsequent
++                       * call to zfs_link_create() to add back the same entry
++                       * but with the new dnode (szp), should not fail.
++                       */
++                      ASSERT(tzp == NULL);
+               }
+       }
+ 
+@@ -3854,14 +3884,18 @@ top:
+       /*
+        * Insert the new object into the directory.
+        */
+-      (void) zfs_link_create(dl, zp, tx, ZNEW);
+-
+-      if (flags & FIGNORECASE)
+-              txtype |= TX_CI;
+-      zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
++      error = zfs_link_create(dl, zp, tx, ZNEW);
++      if (error != 0) {
++              zfs_znode_delete(zp, tx);
++              remove_inode_hash(ZTOI(zp));
++      } else {
++              if (flags & FIGNORECASE)
++                      txtype |= TX_CI;
++              zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
+ 
+-      zfs_inode_update(dzp);
+-      zfs_inode_update(zp);
++              zfs_inode_update(dzp);
++              zfs_inode_update(zp);
++      }
+ 
+       zfs_acl_ids_free(&acl_ids);
+ 
+@@ -3869,10 +3903,14 @@ top:
+ 
+       zfs_dirent_unlock(dl);
+ 
+-      *ipp = ZTOI(zp);
++      if (error == 0) {
++              *ipp = ZTOI(zp);
+ 
+-      if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+-              zil_commit(zilog, 0);
++              if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
++                      zil_commit(zilog, 0);
++      } else {
++              iput(ZTOI(zp));
++      }
+ 
+       ZFS_EXIT(zfsvfs);
+       return (error);
+diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
+index 272c8c77..379c9f73 100644
+--- a/tests/runfiles/linux.run
++++ b/tests/runfiles/linux.run
+@@ -55,7 +55,7 @@ tags = ['functional', 'cachefile']
+ # 'mixed_none_lookup', 'mixed_none_lookup_ci', 'mixed_none_delete',
+ # 'mixed_formd_lookup', 'mixed_formd_lookup_ci', 'mixed_formd_delete']
+ [tests/functional/casenorm]
+-tests = ['case_all_values', 'norm_all_values']
++tests = ['case_all_values', 'norm_all_values', 'mixed_create_failure']
+ tags = ['functional', 'casenorm']
+ 
+ [tests/functional/chattr]
+@@ -394,6 +394,10 @@ tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos',
+     'compress_004_pos']
+ tags = ['functional', 'compression']
+ 
++[tests/functional/cp_files]
++tests = ['cp_files_001_pos']
++tags = ['functional', 'cp_files']
++
+ [tests/functional/ctime]
+ tests = ['ctime_001_pos' ]
+ tags = ['functional', 'ctime']
+diff --git a/tests/zfs-tests/tests/functional/Makefile.am b/tests/zfs-tests/tests/functional/Makefile.am
+index cd60324f..ea52205a 100644
+--- a/tests/zfs-tests/tests/functional/Makefile.am
++++ b/tests/zfs-tests/tests/functional/Makefile.am
+@@ -11,6 +11,7 @@ SUBDIRS = \
+       cli_root \
+       cli_user \
+       compression \
++      cp_files \
+       ctime \
+       delegate \
+       devices \
+diff --git a/tests/zfs-tests/tests/functional/casenorm/Makefile.am b/tests/zfs-tests/tests/functional/casenorm/Makefile.am
+index 65dd156e..b284a256 100644
+--- a/tests/zfs-tests/tests/functional/casenorm/Makefile.am
++++ b/tests/zfs-tests/tests/functional/casenorm/Makefile.am
+@@ -7,6 +7,7 @@ dist_pkgdata_SCRIPTS = \
+       insensitive_formd_lookup.ksh \
+       insensitive_none_delete.ksh \
+       insensitive_none_lookup.ksh \
++      mixed_create_failure.ksh \
+       mixed_formd_delete.ksh \
+       mixed_formd_lookup_ci.ksh \
+       mixed_formd_lookup.ksh \
+diff --git a/tests/zfs-tests/tests/functional/casenorm/mixed_create_failure.ksh b/tests/zfs-tests/tests/functional/casenorm/mixed_create_failure.ksh
+new file mode 100755
+index 00000000..51b5bb3f
+--- /dev/null
++++ b/tests/zfs-tests/tests/functional/casenorm/mixed_create_failure.ksh
+@@ -0,0 +1,136 @@
++#!/bin/ksh -p
++#
++#
++# This file and its contents are supplied under the terms of the
++# Common Development and Distribution License ("CDDL"), version 1.0.
++# You may only use this file in accordance with the terms of version
++# 1.0 of the CDDL.
++#
++# A full copy of the text of the CDDL should have accompanied this
++# source.  A copy of the CDDL is also available via the Internet at
++# http://www.illumos.org/license/CDDL.
++#
++#
++# Copyright 2018 Nutanix Inc.  All rights reserved.
++#
++
++. $STF_SUITE/tests/functional/casenorm/casenorm.kshlib
++
++# DESCRIPTION:
++# For the filesystem with casesensitivity=mixed, normalization=none,
++# when multiple files with the same name (differing only in case) are created,
++# the number of files is limited to what can fit in a fatzap leaf-block.
++# And beyond that, it fails with ENOSPC.
++#
++# Ensure that the create/rename operations fail gracefully and do not trigger
++# an ASSERT.
++#
++# STRATEGY:
++# Repeat the below steps for objects: files, directories, symlinks and hardlinks
++# 1. Create objects with same name but varying in case.
++#    E.g. 'abcdefghijklmnop', 'Abcdefghijklmnop', 'ABcdefghijklmnop' etc.
++#    The create should fail with ENOSPC.
++# 2. Create an object with name 'tmp_obj' and try to rename it to name that we
++#    failed to add in step 1 above.
++#    This should fail as well.
++
++verify_runnable "global"
++
++function cleanup
++{
++        destroy_testfs
++}
++
++log_onexit cleanup
++log_assert "With mixed mode: ensure create fails with ENOSPC beyond a certain limit"
++
++create_testfs "-o casesensitivity=mixed -o normalization=none"
++
++# Different object types
++obj_type=('file' 'dir' 'symlink' 'hardlink')
++
++# Commands to create different object types
++typeset -A ops
++ops['file']='touch'
++ops['dir']='mkdir'
++ops['symlink']='ln -s'
++ops['hardlink']='ln'
++
++# This function tests the following for a given object type :
++# - Create multiple objects with the same name (varying only in case).
++#   Ensure that it eventually fails once the leaf-block limit is exceeded.
++# - Create another object with a different name and attempt to rename it to the
++#   name (for which the create had failed in the previous step).
++#   This should fail as well.
++# Args :
++#   $1 - object type (file/dir/symlink/hardlink)
++#   $2 - test directory
++#
++function test_ops
++{
++      typeset obj_type=$1
++      typeset testdir=$2
++
++      target_obj='target-file'
++
++      op="${ops[$obj_type]}"
++
++      log_note "The op : $op"
++      log_note "testdir=$testdir obj_type=$obj_type"
++
++      test_path="$testdir/$obj_type"
++      mkdir $test_path
++      log_note "Created test dir $test_path"
++
++      if [[ $obj_type = "symlink" || $obj_type = "hardlink" ]]; then
++              touch $test_path/$target_obj
++              log_note "Created target: $test_path/$target_obj"
++              op="$op $test_path/$target_obj"
++      fi
++
++      log_note "op : $op"
++      names='{a,A}{b,B}{c,C}{d,D}{e,E}{f,F}{g,G}{h,H}{i,I}{j,J}{k,K}{l,L}'
++      for name in $names; do
++              cmd="$op $test_path/$name"
++              out=$($cmd 2>&1)
++              ret=$?
++              log_note "cmd: $cmd ret: $ret out=$out"
++              if (($ret != 0)); then
++                      if [[ $out = *@(No space left on device)* ]]; then
++                              save_name="$test_path/$name"
++                              break;
++                      else
++                              log_err "$cmd failed with unexpected error : $out"
++                      fi
++              fi
++      done
++
++      log_note 'Test rename \"sample_name\" rename'
++      TMP_OBJ="$test_path/tmp_obj"
++      cmd="$op $TMP_OBJ"
++      out=$($cmd 2>&1)
++      ret=$?
++      if (($ret != 0)); then
++              log_err "cmd:$cmd failed out:$out"
++      fi
++
++      # Now, try to rename the tmp_obj to the name which we failed to add earlier.
++      # This should fail as well.
++      out=$(mv $TMP_OBJ $save_name 2>&1)
++      ret=$?
++      if (($ret != 0)); then
++              if [[ $out = *@(No space left on device)* ]]; then
++                      log_note "$cmd failed as expected : $out"
++              else
++                      log_err "$cmd failed with : $out"
++              fi
++      fi
++}
++
++for obj_type in ${obj_type[*]};
++do
++      log_note "Testing create of $obj_type"
++      test_ops $obj_type $TESTDIR
++done
++
++log_pass "Mixed mode FS: Ops on large number of colliding names fail gracefully"
+diff --git a/tests/zfs-tests/tests/functional/cp_files/.gitignore b/tests/zfs-tests/tests/functional/cp_files/.gitignore
+new file mode 100644
+index 00000000..eac05e15
+--- /dev/null
++++ b/tests/zfs-tests/tests/functional/cp_files/.gitignore
+@@ -0,0 +1 @@
++/cp_files
+diff --git a/tests/zfs-tests/tests/functional/cp_files/Makefile.am b/tests/zfs-tests/tests/functional/cp_files/Makefile.am
+new file mode 100644
+index 00000000..06c31f5f
+--- /dev/null
++++ b/tests/zfs-tests/tests/functional/cp_files/Makefile.am
+@@ -0,0 +1,13 @@
++include $(top_srcdir)/config/Rules.am
++
++pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cp_files
++
++dist_pkgdata_SCRIPTS = \
++      cp_files_001_pos.ksh \
++      cleanup.ksh \
++      setup.ksh
++
++pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cp_files
++
++pkgexec_PROGRAMS = cp_files
++cp_files_SOURCES= cp_files.c
+diff --git a/tests/zfs-tests/tests/functional/cp_files/cleanup.ksh b/tests/zfs-tests/tests/functional/cp_files/cleanup.ksh
+new file mode 100755
+index 00000000..3166bd6e
+--- /dev/null
++++ b/tests/zfs-tests/tests/functional/cp_files/cleanup.ksh
+@@ -0,0 +1,34 @@
++#!/bin/ksh -p
++#
++# CDDL HEADER START
++#
++# The contents of this file are subject to the terms of the
++# Common Development and Distribution License (the "License").
++# You may not use this file except in compliance with the License.
++#
++# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
++# or http://www.opensolaris.org/os/licensing.
++# See the License for the specific language governing permissions
++# and limitations under the License.
++#
++# When distributing Covered Code, include this CDDL HEADER in each
++# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
++# If applicable, add the following below this CDDL HEADER, with the
++# fields enclosed by brackets "[]" replaced with your own identifying
++# information: Portions Copyright [yyyy] [name of copyright owner]
++#
++# CDDL HEADER END
++#
++
++#
++# Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
++# Use is subject to license terms.
++#
++
++#
++# Copyright (c) 2013 by Delphix. All rights reserved.
++#
++
++. $STF_SUITE/include/libtest.shlib
++
++default_cleanup
+diff --git a/tests/zfs-tests/tests/functional/cp_files/cp_files.c b/tests/zfs-tests/tests/functional/cp_files/cp_files.c
+new file mode 100644
+index 00000000..9af64a11
+--- /dev/null
++++ b/tests/zfs-tests/tests/functional/cp_files/cp_files.c
+@@ -0,0 +1,58 @@
++#include <stdio.h>
++#include <stdlib.h>
++#include <unistd.h>
++#include <sys/types.h>
++#include <sys/stat.h>
++#include <fcntl.h>
++#include <dirent.h>
++#include <errno.h>
++#include <string.h>
++
++int
++main(int argc, char *argv[])
++{
++      int tfd;
++      DIR *sdir;
++      struct dirent *dirent;
++
++      if (argc != 3) {
++              fprintf(stderr, "Usage: %s SRC DST\n", argv[0]);
++              exit(1);
++      }
++
++      sdir = opendir(argv[1]);
++      if (sdir == NULL) {
++              fprintf(stderr, "Failed to open %s: %s\n",
++                  argv[1], strerror(errno));
++              exit(2);
++      }
++
++      tfd = open(argv[2], O_DIRECTORY);
++      if (tfd < 0) {
++              fprintf(stderr, "Failed to open %s: %s\n",
++                  argv[2], strerror(errno));
++              closedir(sdir);
++              exit(3);
++      }
++
++      while ((dirent = readdir(sdir)) != NULL) {
++              if (dirent->d_name[0] == '.' &&
++                  (dirent->d_name[1] == '.' || dirent->d_name[1] == '\0'))
++                      continue;
++
++              int fd = openat(tfd, dirent->d_name, O_CREAT|O_WRONLY, 0666);
++              if (fd < 0) {
++                      fprintf(stderr, "Failed to create %s/%s: %s\n",
++                          argv[2], dirent->d_name, strerror(errno));
++                      closedir(sdir);
++                      close(tfd);
++                      exit(4);
++              }
++              close(fd);
++      }
++
++      closedir(sdir);
++      close(tfd);
++
++      return (0);
++}
+diff --git a/tests/zfs-tests/tests/functional/cp_files/cp_files_001_pos.ksh b/tests/zfs-tests/tests/functional/cp_files/cp_files_001_pos.ksh
+new file mode 100755
+index 00000000..3e138cfc
+--- /dev/null
++++ b/tests/zfs-tests/tests/functional/cp_files/cp_files_001_pos.ksh
+@@ -0,0 +1,74 @@
++#! /bin/ksh -p
++#
++# CDDL HEADER START
++#
++# The contents of this file are subject to the terms of the
++# Common Development and Distribution License (the "License").
++# You may not use this file except in compliance with the License.
++#
++# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
++# or http://www.opensolaris.org/os/licensing.
++# See the License for the specific language governing permissions
++# and limitations under the License.
++#
++# When distributing Covered Code, include this CDDL HEADER in each
++# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
++# If applicable, add the following below this CDDL HEADER, with the
++# fields enclosed by brackets "[]" replaced with your own identifying
++# information: Portions Copyright [yyyy] [name of copyright owner]
++#
++# CDDL HEADER END
++#
++
++#
++# Copyright (c) 2018 by Nutanix. All rights reserved.
++#
++
++. $STF_SUITE/include/libtest.shlib
++
++#
++# DESCRIPTION:
++# Copying a large number of files between 2 directories
++# within a zfs filesystem works without errors.
++# This makes sure zap upgrading and expanding work.
++#
++# STRATEGY:
++#
++# 1. Create NR_FILES files in directory src
++# 2. Check the number of files is correct
++# 3. Copy files from src to dst in readdir order
++# 4. Check the number of files is correct
++#
++
++verify_runnable "global"
++
++function cleanup
++{
++      rm -rf $TESTDIR/src $TESTDIR/dst
++}
++
++log_assert "Copy a large number of files between 2 directories" \
++      "within a zfs filesystem works without errors"
++
++log_onexit cleanup
++
++NR_FILES=60000
++BATCH=1000
++
++log_must mkdir $TESTDIR/src
++log_must mkdir $TESTDIR/dst
++
++WD=$(pwd)
++cd $TESTDIR/src
++# create NR_FILES in BATCH at a time to prevent overflowing argument buffer
++for i in $(seq $(($NR_FILES/$BATCH))); do touch $(seq $((($i-1)*$BATCH+1)) $(($i*$BATCH))); done
++cd $WD
++
++log_must test $NR_FILES -eq $(ls -U $TESTDIR/src | wc -l)
++
++# copy files from src to dst, use cp_files to make sure we copy in readdir order
++log_must $STF_SUITE/tests/functional/cp_files/cp_files $TESTDIR/src $TESTDIR/dst
++
++log_must test $NR_FILES -eq $(ls -U $TESTDIR/dst | wc -l)
++
++log_pass
+diff --git a/tests/zfs-tests/tests/functional/cp_files/setup.ksh b/tests/zfs-tests/tests/functional/cp_files/setup.ksh
+new file mode 100755
+index 00000000..fc5cec30
+--- /dev/null
++++ b/tests/zfs-tests/tests/functional/cp_files/setup.ksh
+@@ -0,0 +1,35 @@
++#!/bin/ksh -p
++#
++# CDDL HEADER START
++#
++# The contents of this file are subject to the terms of the
++# Common Development and Distribution License (the "License").
++# You may not use this file except in compliance with the License.
++#
++# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
++# or http://www.opensolaris.org/os/licensing.
++# See the License for the specific language governing permissions
++# and limitations under the License.
++#
++# When distributing Covered Code, include this CDDL HEADER in each
++# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
++# If applicable, add the following below this CDDL HEADER, with the
++# fields enclosed by brackets "[]" replaced with your own identifying
++# information: Portions Copyright [yyyy] [name of copyright owner]
++#
++# CDDL HEADER END
++#
++
++#
++# Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
++# Use is subject to license terms.
++#
++
++#
++# Copyright (c) 2013 by Delphix. All rights reserved.
++#
++
++. $STF_SUITE/include/libtest.shlib
++
++DISK=${DISKS%% *}
++default_setup $DISK
diff --git a/zfs-patches/0011-Trim-new-line-from-zfs_vdev_scheduler.patch b/zfs-patches/0011-Trim-new-line-from-zfs_vdev_scheduler.patch

new file mode 100644 (file)

index 0000000..09b797e
--- /dev/null
+++ b/zfs-patches/0011-Trim-new-line-from-zfs_vdev_scheduler.patch
@@ -0,0 +1,155 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Brian Behlendorf <behlendorf1@llnl.gov>
+Date: Tue, 5 Sep 2017 13:41:32 -0700
+Subject: [PATCH] Trim new line from zfs_vdev_scheduler
+
+Add a helper function to trim the trailing new line.  While we're
+here use this new hook to immediately apply the new scheduler.
+
+Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
+Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Closes #3356
+Closes #6573
+
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ module/zfs/vdev_disk.c | 71 +++++++++++++++++++++++++++++++++++++-------------
+ 1 file changed, 53 insertions(+), 18 deletions(-)
+
+diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
+index 5ae50a31..d6212835 100644
+--- a/module/zfs/vdev_disk.c
++++ b/module/zfs/vdev_disk.c
+@@ -27,13 +27,14 @@
+  */
+ 
+ #include <sys/zfs_context.h>
+-#include <sys/spa.h>
++#include <sys/spa_impl.h>
+ #include <sys/vdev_disk.h>
+ #include <sys/vdev_impl.h>
+ #include <sys/abd.h>
+ #include <sys/fs/zfs.h>
+ #include <sys/zio.h>
+ #include <sys/sunldi.h>
++#include <linux/mod_compat.h>
+ 
+ char *zfs_vdev_scheduler = VDEV_SCHEDULER;
+ static void *zfs_vdev_holder = VDEV_HOLDER;
+@@ -113,15 +114,23 @@ vdev_disk_error(zio_t *zio)
+  * physical device.  This yields the largest possible requests for
+  * the device with the lowest total overhead.
+  */
+-static int
++static void
+ vdev_elevator_switch(vdev_t *v, char *elevator)
+ {
+       vdev_disk_t *vd = v->vdev_tsd;
+-      struct block_device *bdev = vd->vd_bdev;
+-      struct request_queue *q = bdev_get_queue(bdev);
+-      char *device = bdev->bd_disk->disk_name;
++      struct request_queue *q;
++      char *device;
+       int error;
+ 
++      for (int c = 0; c < v->vdev_children; c++)
++              vdev_elevator_switch(v->vdev_child[c], elevator);
++
++      if (!v->vdev_ops->vdev_op_leaf || vd->vd_bdev == NULL)
++              return;
++
++      q = bdev_get_queue(vd->vd_bdev);
++      device = vd->vd_bdev->bd_disk->disk_name;
++
+       /*
+        * Skip devices which are not whole disks (partitions).
+        * Device-mapper devices are excepted since they may be whole
+@@ -131,15 +140,15 @@ vdev_elevator_switch(vdev_t *v, char *elevator)
+        * "Skip devices without schedulers" check below will fail.
+        */
+       if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
+-              return (0);
++              return;
+ 
+       /* Skip devices without schedulers (loop, ram, dm, etc) */
+       if (!q->elevator || !blk_queue_stackable(q))
+-              return (0);
++              return;
+ 
+       /* Leave existing scheduler when set to "none" */
+       if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4))
+-              return (0);
++              return;
+ 
+ #ifdef HAVE_ELEVATOR_CHANGE
+       error = elevator_change(q, elevator);
+@@ -156,20 +165,16 @@ vdev_elevator_switch(vdev_t *v, char *elevator)
+       "     2>/dev/null; " \
+       "echo %s"
+ 
+-      {
+-              char *argv[] = { "/bin/sh", "-c", NULL, NULL };
+-              char *envp[] = { NULL };
++      char *argv[] = { "/bin/sh", "-c", NULL, NULL };
++      char *envp[] = { NULL };
+ 
+-              argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
+-              error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+-              strfree(argv[2]);
+-      }
++      argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
++      error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
++      strfree(argv[2]);
+ #endif /* HAVE_ELEVATOR_CHANGE */
+       if (error)
+               printk("ZFS: Unable to set \"%s\" scheduler for %s (%s): %d\n",
+                   elevator, v->vdev_path, device, error);
+-
+-      return (error);
+ }
+ 
+ /*
+@@ -798,6 +803,35 @@ vdev_disk_rele(vdev_t *vd)
+       /* XXX: Implement me as a vnode rele for the device */
+ }
+ 
++static int
++param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
++{
++      spa_t *spa = NULL;
++      char *p;
++
++      if (val == NULL)
++              return (SET_ERROR(-EINVAL));
++
++      if ((p = strchr(val, '\n')) != NULL)
++              *p = '\0';
++
++      mutex_enter(&spa_namespace_lock);
++      while ((spa = spa_next(spa)) != NULL) {
++              if (spa_state(spa) != POOL_STATE_ACTIVE ||
++                  !spa_writeable(spa) || spa_suspended(spa))
++                      continue;
++
++              spa_open_ref(spa, FTAG);
++              mutex_exit(&spa_namespace_lock);
++              vdev_elevator_switch(spa->spa_root_vdev, (char *)val);
++              mutex_enter(&spa_namespace_lock);
++              spa_close(spa, FTAG);
++      }
++      mutex_exit(&spa_namespace_lock);
++
++      return (param_set_charp(val, kp));
++}
++
+ vdev_ops_t vdev_disk_ops = {
+       vdev_disk_open,
+       vdev_disk_close,
+@@ -812,5 +846,6 @@ vdev_ops_t vdev_disk_ops = {
+       B_TRUE                  /* leaf vdev */
+ };
+ 
+-module_param(zfs_vdev_scheduler, charp, 0644);
++module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
++    param_get_charp, &zfs_vdev_scheduler, 0644);
+ MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");
diff --git a/zfs-patches/0012-module-param-callbacks-check-for-initialized-spa.patch b/zfs-patches/0012-module-param-callbacks-check-for-initialized-spa.patch

new file mode 100644 (file)

index 0000000..bfb22f5
--- /dev/null
+++ b/zfs-patches/0012-module-param-callbacks-check-for-initialized-spa.patch
@@ -0,0 +1,84 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Olaf Faaland <faaland1@llnl.gov>
+Date: Fri, 11 May 2018 12:46:07 -0700
+Subject: [PATCH] module param callbacks check for initialized spa
+
+Callbacks provided for module parameters are executed both
+after the module is loaded, when a user alters it via sysfs, e.g
+       echo bar > /sys/module/zfs/parameters/foo
+
+as well as when the module is loaded with an argument, e.g.
+       modprobe zfs foo=bar
+
+In the latter case, the init functions likely have not run yet,
+including spa_init() which initializes the namespace lock so it is safe
+to use.
+
+Instead of immediately taking the namespace lock and attempting to
+iterate over initialized spa structures, check whether spa_mode_global
+is nonzero.  This is set by spa_init() after it has initialized the
+namespace lock.
+
+Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Reviewed-by: Tim Chase <tim@chase2k.com>
+Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
+Closes #7496
+Closes #7521
+
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ module/zfs/mmp.c       |  3 ++-
+ module/zfs/vdev_disk.c | 24 +++++++++++++-----------
+ 2 files changed, 15 insertions(+), 12 deletions(-)
+
+diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c
+index 3b74a6b6..7523310c 100644
+--- a/module/zfs/mmp.c
++++ b/module/zfs/mmp.c
+@@ -607,7 +607,8 @@ param_set_multihost_interval(const char *val, zfs_kernel_param_t *kp)
+       if (ret < 0)
+               return (ret);
+ 
+-      mmp_signal_all_threads();
++      if (spa_mode_global != 0)
++              mmp_signal_all_threads();
+ 
+       return (ret);
+ }
+diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
+index d6212835..6761e755 100644
+--- a/module/zfs/vdev_disk.c
++++ b/module/zfs/vdev_disk.c
+@@ -815,19 +815,21 @@ param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
+       if ((p = strchr(val, '\n')) != NULL)
+               *p = '\0';
+ 
+-      mutex_enter(&spa_namespace_lock);
+-      while ((spa = spa_next(spa)) != NULL) {
+-              if (spa_state(spa) != POOL_STATE_ACTIVE ||
+-                  !spa_writeable(spa) || spa_suspended(spa))
+-                      continue;
+-
+-              spa_open_ref(spa, FTAG);
+-              mutex_exit(&spa_namespace_lock);
+-              vdev_elevator_switch(spa->spa_root_vdev, (char *)val);
++      if (spa_mode_global != 0) {
+               mutex_enter(&spa_namespace_lock);
+-              spa_close(spa, FTAG);
++              while ((spa = spa_next(spa)) != NULL) {
++                      if (spa_state(spa) != POOL_STATE_ACTIVE ||
++                          !spa_writeable(spa) || spa_suspended(spa))
++                              continue;
++
++                      spa_open_ref(spa, FTAG);
++                      mutex_exit(&spa_namespace_lock);
++                      vdev_elevator_switch(spa->spa_root_vdev, (char *)val);
++                      mutex_enter(&spa_namespace_lock);
++                      spa_close(spa, FTAG);
++              }
++              mutex_exit(&spa_namespace_lock);
+       }
+-      mutex_exit(&spa_namespace_lock);
+ 
+       return (param_set_charp(val, kp));
+ }
diff --git a/zfs-patches/0013-Support-Debian-DKMS-builds.patch b/zfs-patches/0013-Support-Debian-DKMS-builds.patch

new file mode 100644 (file)

index 0000000..17e7151
--- /dev/null
+++ b/zfs-patches/0013-Support-Debian-DKMS-builds.patch
@@ -0,0 +1,52 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Antonio Russo <antonio.e.russo@gmail.com>
+Date: Sat, 26 May 2018 13:56:24 -0400
+Subject: [PATCH] Support Debian DKMS builds
+
+scripts/dkms.mkconf calls configure with
+`--with-linux=${kernel_source_dir}`, but Debian puts its kernel source at
+`/lib/modules/<version>/source`. This patch adds the same logic to the
+DKMS file produced by `scripts/dkms.mkconf` that Debian has shipped in
+its official ZFS packaging: at DKMS build time, it checks if the system
+is a Debian system, and adjusts the path accordingly.
+
+Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Reviewed-by: George Melikov <mail@gmelikov.ru>
+Signed-off-by: Antonio Russo <antonio.e.russo@gmail.com>
+Closes #7358
+Closes #7540
+Closes #7554
+
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ scripts/dkms.mkconf | 17 ++++++++++++++++-
+ 1 file changed, 16 insertions(+), 1 deletion(-)
+
+diff --git a/scripts/dkms.mkconf b/scripts/dkms.mkconf
+index 880510ab..88c28938 100755
+--- a/scripts/dkms.mkconf
++++ b/scripts/dkms.mkconf
+@@ -25,7 +25,22 @@ PACKAGE_CONFIG="${pkgcfg}"
+ PRE_BUILD="configure
+   --prefix=/usr
+   --with-config=kernel
+-  --with-linux=\${kernel_source_dir}
++  --with-linux=\$(
++    case \`lsb_release -is\` in
++      (Debian|Devuan)
++        if [[ -e \${kernel_source_dir/%build/source} ]]
++        then
++          echo \${kernel_source_dir/%build/source}
++        else
++          # A kpkg exception for Proxmox 2.0
++          echo \${kernel_source_dir}
++        fi
++      ;;
++      (*)
++        echo \${kernel_source_dir}
++      ;;
++    esac
++  )
+   --with-linux-obj=\${kernel_source_dir}
+   --with-spl=\${source_tree}/spl-\${PACKAGE_VERSION}
+   --with-spl-obj=\${dkms_tree}/spl/\${PACKAGE_VERSION}/\${kernelver}/\${arch}
diff --git a/zfs-patches/0014-zpool-reopen-should-detect-expanded-devices.patch b/zfs-patches/0014-zpool-reopen-should-detect-expanded-devices.patch

new file mode 100644 (file)

index 0000000..f11577f
--- /dev/null
+++ b/zfs-patches/0014-zpool-reopen-should-detect-expanded-devices.patch
@@ -0,0 +1,376 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Sara Hartse <sara.hartse@gmail.com>
+Date: Thu, 31 May 2018 10:36:37 -0700
+Subject: [PATCH] zpool reopen should detect expanded devices
+
+Update bdev_capacity to have wholedisk vdevs query the
+size of the underlying block device (correcting for the size
+of the EFI partition and partition alignment) and therefore detect
+expanded space.
+
+Correct vdev_get_stats_ex so that the expandsize is aligned
+to metaslab size and new space is only reported if it is large
+enough for a new metaslab.
+
+Reviewed by: Don Brady <don.brady@delphix.com>
+Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Reviewed by: George Wilson <george.wilson@delphix.com>
+Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
+Reviewed by: John Wren Kennedy <jwk404@gmail.com>
+Signed-off-by: sara hartse <sara.hartse@delphix.com>
+External-issue: LX-165
+Closes #7546
+Issue #7582
+
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ include/sys/vdev_disk.h                            | 12 +++++
+ lib/libefi/rdwr_efi.c                              | 20 +++++++-
+ lib/libzfs/libzfs_pool.c                           | 14 +-----
+ module/zfs/vdev.c                                  |  3 +-
+ module/zfs/vdev_disk.c                             | 46 +++++++++++++-----
+ .../cli_root/zpool_expand/zpool_expand_002_pos.ksh | 54 +++++++++++++++-------
+ 6 files changed, 107 insertions(+), 42 deletions(-)
+
+diff --git a/include/sys/vdev_disk.h b/include/sys/vdev_disk.h
+index 15570b10..b8a32b31 100644
+--- a/include/sys/vdev_disk.h
++++ b/include/sys/vdev_disk.h
+@@ -23,11 +23,23 @@
+  * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+  * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+  * LLNL-CODE-403049.
++ * Copyright (c) 2018 by Delphix. All rights reserved.
+  */
+ 
+ #ifndef _SYS_VDEV_DISK_H
+ #define       _SYS_VDEV_DISK_H
+ 
++/*
++ * Don't start the slice at the default block of 34; many storage
++ * devices will use a stripe width of 128k, other vendors prefer a 1m
++ * alignment.  It is best to play it safe and ensure a 1m alignment
++ * given 512B blocks.  When the block size is larger by a power of 2
++ * we will still be 1m aligned.  Some devices are sensitive to the
++ * partition ending alignment as well.
++ */
++#define       NEW_START_BLOCK         2048
++#define       PARTITION_END_ALIGNMENT 2048
++
+ #ifdef _KERNEL
+ #include <sys/vdev.h>
+ 
+diff --git a/lib/libefi/rdwr_efi.c b/lib/libefi/rdwr_efi.c
+index 7935047e..19cb17e5 100644
+--- a/lib/libefi/rdwr_efi.c
++++ b/lib/libefi/rdwr_efi.c
+@@ -22,6 +22,7 @@
+ /*
+  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+  * Copyright 2012 Nexenta Systems, Inc.  All rights reserved.
++ * Copyright (c) 2018 by Delphix. All rights reserved.
+  */
+ 
+ #include <stdio.h>
+@@ -1153,7 +1154,7 @@ efi_use_whole_disk(int fd)
+ 
+       /*
+        * Find the last physically non-zero partition.
+-       * This is the reserved partition.
++       * This should be the reserved partition.
+        */
+       for (i = 0; i < efi_label->efi_nparts; i ++) {
+               if (resv_start < efi_label->efi_parts[i].p_start) {
+@@ -1163,6 +1164,23 @@ efi_use_whole_disk(int fd)
+       }
+ 
+       /*
++       * Verify that we've found the reserved partition by checking
++       * that it looks the way it did when we created it in zpool_label_disk.
++       * If we've found the incorrect partition, then we know that this
++       * device was reformatted and no longer is soley used by ZFS.
++       */
++      if ((efi_label->efi_parts[resv_index].p_size != EFI_MIN_RESV_SIZE) ||
++          (efi_label->efi_parts[resv_index].p_tag != V_RESERVED) ||
++          (resv_index != 8)) {
++              if (efi_debug) {
++                      (void) fprintf(stderr,
++                          "efi_use_whole_disk: wholedisk not available\n");
++              }
++              efi_free(efi_label);
++              return (VT_ENOSPC);
++      }
++
++      /*
+        * Find the last physically non-zero partition before that.
+        * This is the data partition.
+        */
+diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
+index e00d5f51..53bc5034 100644
+--- a/lib/libzfs/libzfs_pool.c
++++ b/lib/libzfs/libzfs_pool.c
+@@ -22,7 +22,7 @@
+ /*
+  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
+  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+- * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
++ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+  * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
+  * Copyright (c) 2017 Datto Inc.
+  */
+@@ -42,6 +42,7 @@
+ #include <sys/efi_partition.h>
+ #include <sys/vtoc.h>
+ #include <sys/zfs_ioctl.h>
++#include <sys/vdev_disk.h>
+ #include <dlfcn.h>
+ 
+ #include "zfs_namecheck.h"
+@@ -913,17 +914,6 @@ zpool_prop_get_feature(zpool_handle_t *zhp, const char *propname, char *buf,
+ }
+ 
+ /*
+- * Don't start the slice at the default block of 34; many storage
+- * devices will use a stripe width of 128k, other vendors prefer a 1m
+- * alignment.  It is best to play it safe and ensure a 1m alignment
+- * given 512B blocks.  When the block size is larger by a power of 2
+- * we will still be 1m aligned.  Some devices are sensitive to the
+- * partition ending alignment as well.
+- */
+-#define       NEW_START_BLOCK         2048
+-#define       PARTITION_END_ALIGNMENT 2048
+-
+-/*
+  * Validate the given pool name, optionally putting an extended error message in
+  * 'buf'.
+  */
+diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
+index acac2a97..b643bd35 100644
+--- a/module/zfs/vdev.c
++++ b/module/zfs/vdev.c
+@@ -21,7 +21,7 @@
+ 
+ /*
+  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
++ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+  * Copyright 2017 Nexenta Systems, Inc.
+  * Copyright (c) 2014 Integros [integros.com]
+  * Copyright 2016 Toomas Soome <tsoome@me.com>
+@@ -3039,7 +3039,6 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
+                           vd->vdev_max_asize - vd->vdev_asize,
+                           1ULL << tvd->vdev_ms_shift);
+               }
+-              vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize;
+               if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
+                   !vd->vdev_ishole) {
+                       vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
+diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
+index 6761e755..6dc0544f 100644
+--- a/module/zfs/vdev_disk.c
++++ b/module/zfs/vdev_disk.c
+@@ -23,7 +23,7 @@
+  * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+  * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
+  * LLNL-CODE-403049.
+- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
++ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+  */
+ 
+ #include <sys/zfs_context.h>
+@@ -35,10 +35,14 @@
+ #include <sys/zio.h>
+ #include <sys/sunldi.h>
+ #include <linux/mod_compat.h>
++#include <linux/msdos_fs.h>
+ 
+ char *zfs_vdev_scheduler = VDEV_SCHEDULER;
+ static void *zfs_vdev_holder = VDEV_HOLDER;
+ 
++/* size of the "reserved" partition, in blocks */
++#define       EFI_MIN_RESV_SIZE       (16 * 1024)
++
+ /*
+  * Virtual device vector for disks.
+  */
+@@ -82,17 +86,39 @@ vdev_bdev_mode(int smode)
+ }
+ #endif /* HAVE_OPEN_BDEV_EXCLUSIVE */
+ 
++/* The capacity (in bytes) of a bdev that is available to be used by a vdev */
+ static uint64_t
+-bdev_capacity(struct block_device *bdev)
++bdev_capacity(struct block_device *bdev, boolean_t wholedisk)
+ {
+       struct hd_struct *part = bdev->bd_part;
++      uint64_t sectors = get_capacity(bdev->bd_disk);
++      /* If there are no paritions, return the entire device capacity */
++      if (part == NULL)
++              return (sectors << SECTOR_BITS);
+ 
+-      /* The partition capacity referenced by the block device */
+-      if (part)
+-              return (part->nr_sects << 9);
+-
+-      /* Otherwise assume the full device capacity */
+-      return (get_capacity(bdev->bd_disk) << 9);
++      /*
++       * If there are partitions, decide if we are using a `wholedisk`
++       * layout (composed of part1 and part9) or just a single partition.
++       */
++      if (wholedisk) {
++              /* Verify the expected device layout */
++              ASSERT3P(bdev, !=, bdev->bd_contains);
++              /*
++               * Sectors used by the EFI partition (part9) as well as
++               * partion alignment.
++               */
++              uint64_t used = EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
++                  PARTITION_END_ALIGNMENT;
++
++              /* Space available to the vdev, i.e. the size of part1 */
++              if (sectors <= used)
++                      return (0);
++              uint64_t available = sectors - used;
++              return (available << SECTOR_BITS);
++      } else {
++              /* The partition capacity referenced by the block device */
++              return (part->nr_sects << SECTOR_BITS);
++      }
+ }
+ 
+ static void
+@@ -328,9 +354,7 @@ skip_open:
+       v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));
+ 
+       /* Physical volume size in bytes */
+-      *psize = bdev_capacity(vd->vd_bdev);
+-
+-      /* TODO: report possible expansion size */
++      *psize = bdev_capacity(vd->vd_bdev, v->vdev_wholedisk);
+       *max_psize = *psize;
+ 
+       /* Based on the minimum sector size set the block size */
+diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh
+index d578ae60..66b6969d 100755
+--- a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh
++++ b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh
+@@ -26,7 +26,7 @@
+ #
+ 
+ #
+-# Copyright (c) 2012, 2016 by Delphix. All rights reserved.
++# Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ # Copyright (c) 2017 Lawrence Livermore National Security, LLC.
+ #
+ 
+@@ -43,8 +43,9 @@
+ # 1) Create 3 files
+ # 2) Create a pool backed by the files
+ # 3) Expand the files' size with truncate
+-# 4) Use zpool online -e to online the vdevs
+-# 5) Check that the pool size was expanded
++# 4) Use zpool reopen to check the expandsize
++# 5) Use zpool online -e to online the vdevs
++# 6) Check that the pool size was expanded
+ #
+ 
+ verify_runnable "global"
+@@ -64,8 +65,8 @@ log_onexit cleanup
+ 
+ log_assert "zpool can expand after zpool online -e zvol vdevs on LUN expansion"
+ 
+-
+ for type in " " mirror raidz raidz2; do
++      # Initialize the file devices and the pool
+       for i in 1 2 3; do
+               log_must truncate -s $org_size ${TEMPFILE}.$i
+       done
+@@ -80,13 +81,35 @@ for type in " " mirror raidz raidz2; do
+                   "$autoexp"
+       fi
+       typeset prev_size=$(get_pool_prop size $TESTPOOL1)
+-      typeset zfs_prev_size=$(zfs get -p avail $TESTPOOL1 | tail -1 | \
+-          awk '{print $3}')
++      typeset zfs_prev_size=$(get_prop avail $TESTPOOL1)
+ 
++      # Increase the size of the file devices
+       for i in 1 2 3; do
+               log_must truncate -s $exp_size ${TEMPFILE}.$i
+       done
+ 
++      # Reopen the pool and check that the `expandsize` property is set
++      log_must zpool reopen $TESTPOOL1
++      typeset zpool_expandsize=$(get_pool_prop expandsize $TESTPOOL1)
++
++      if [[ $type == "mirror" ]]; then
++              typeset expected_zpool_expandsize=$(($exp_size-$org_size))
++      else
++              typeset expected_zpool_expandsize=$((3*($exp_size-$org_size)))
++      fi
++
++      if [[ "$zpool_expandsize" = "-" ]]; then
++              log_fail "pool $TESTPOOL1 did not detect any " \
++                  "expandsize after reopen"
++      fi
++
++      if [[ $zpool_expandsize -ne $expected_zpool_expandsize ]]; then
++              log_fail "pool $TESTPOOL1 did not detect correct " \
++                  "expandsize after reopen: found $zpool_expandsize," \
++                  "expected $expected_zpool_expandsize"
++      fi
++
++      # Online the devices to add the new space to the pool
+       for i in 1 2 3; do
+               log_must zpool online -e $TESTPOOL1 ${TEMPFILE}.$i
+       done
+@@ -96,8 +119,7 @@ for type in " " mirror raidz raidz2; do
+       sync
+ 
+       typeset expand_size=$(get_pool_prop size $TESTPOOL1)
+-      typeset zfs_expand_size=$(zfs get -p avail $TESTPOOL1 | tail -1 | \
+-          awk '{print $3}')
++      typeset zfs_expand_size=$(get_prop avail $TESTPOOL1)
+       log_note "$TESTPOOL1 $type has previous size: $prev_size and " \
+           "expanded size: $expand_size"
+ 
+@@ -112,8 +134,8 @@ for type in " " mirror raidz raidz2; do
+                           grep "(+${expansion_size}" | wc -l)
+ 
+                       if [[ $size_addition -ne $i ]]; then
+-                              log_fail "pool $TESTPOOL1 is not autoexpand " \
+-                                  "after LUN expansion"
++                              log_fail "pool $TESTPOOL1 did not expand " \
++                                  "after LUN expansion and zpool online -e"
+                       fi
+               elif [[ $type == "mirror" ]]; then
+                       typeset expansion_size=$(($exp_size-$org_size))
+@@ -123,8 +145,8 @@ for type in " " mirror raidz raidz2; do
+                           grep "(+${expansion_size})" >/dev/null 2>&1
+ 
+                       if [[ $? -ne 0 ]]; then
+-                              log_fail "pool $TESTPOOL1 is not autoexpand " \
+-                                  "after LUN expansion"
++                              log_fail "pool $TESTPOOL1 did not expand " \
++                                  "after LUN expansion and zpool online -e"
+                       fi
+               else
+                       typeset expansion_size=$((3*($exp_size-$org_size)))
+@@ -134,13 +156,13 @@ for type in " " mirror raidz raidz2; do
+                           grep "(+${expansion_size})" >/dev/null 2>&1
+ 
+                       if [[ $? -ne 0 ]] ; then
+-                              log_fail "pool $TESTPOOL1 is not autoexpand " \
+-                                  "after LUN expansion"
++                              log_fail "pool $TESTPOOL1 did not expand " \
++                                  "after LUN expansion and zpool online -e"
+                       fi
+               fi
+       else
+-              log_fail "pool $TESTPOOL1 is not autoexpanded after LUN " \
+-                  "expansion"
++              log_fail "pool $TESTPOOL1 did not expand after LUN expansion " \
++                  "and zpool online -e"
+       fi
+       log_must zpool destroy $TESTPOOL1
+ done
diff --git a/zfs-patches/0015-Add-pool-state-proc-entry-SUSPENDED-pools.patch b/zfs-patches/0015-Add-pool-state-proc-entry-SUSPENDED-pools.patch

new file mode 100644 (file)

index 0000000..e1e0b9d
--- /dev/null
+++ b/zfs-patches/0015-Add-pool-state-proc-entry-SUSPENDED-pools.patch
@@ -0,0 +1,686 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Tony Hutter <hutter2@llnl.gov>
+Date: Wed, 6 Jun 2018 09:33:54 -0700
+Subject: [PATCH] Add pool state /proc entry, "SUSPENDED" pools
+
+1. Add a proc entry to display the pool's state:
+
+$ cat /proc/spl/kstat/zfs/tank/state
+ONLINE
+
+This is done without using the spa config locks, so it will
+never hang.
+
+2. Fix 'zpool status' and 'zpool list -o health' output to print
+"SUSPENDED" instead of "ONLINE" for suspended pools.
+
+Reviewed-by: Olaf Faaland <faaland1@llnl.gov>
+Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
+Signed-off-by: Tony Hutter <hutter2@llnl.gov>
+Closes #7331
+Closes #7563
+
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ cmd/zpool/zpool_main.c                             |   3 +-
+ configure.ac                                       |   1 +
+ include/libzfs.h                                   |   2 +
+ include/sys/spa.h                                  |   3 +
+ lib/libspl/include/sys/kstat.h                     |   2 +
+ lib/libzfs/libzfs_pool.c                           |  46 +++++--
+ lib/libzfs/libzfs_status.c                         |  12 +-
+ module/zfs/spa_misc.c                              |  40 ++++++
+ module/zfs/spa_stats.c                             |  62 +++++++++
+ tests/runfiles/linux.run                           |   4 +
+ tests/zfs-tests/include/libtest.shlib              |  38 ++++++
+ tests/zfs-tests/tests/functional/Makefile.am       |   1 +
+ tests/zfs-tests/tests/functional/kstat/Makefile.am |   5 +
+ tests/zfs-tests/tests/functional/kstat/cleanup.ksh |  28 ++++
+ tests/zfs-tests/tests/functional/kstat/setup.ksh   |  34 +++++
+ tests/zfs-tests/tests/functional/kstat/state.ksh   | 144 +++++++++++++++++++++
+ 16 files changed, 406 insertions(+), 19 deletions(-)
+ create mode 100644 tests/zfs-tests/tests/functional/kstat/Makefile.am
+ create mode 100755 tests/zfs-tests/tests/functional/kstat/cleanup.ksh
+ create mode 100755 tests/zfs-tests/tests/functional/kstat/setup.ksh
+ create mode 100755 tests/zfs-tests/tests/functional/kstat/state.ksh
+
+diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
+index b0756938..97697011 100644
+--- a/cmd/zpool/zpool_main.c
++++ b/cmd/zpool/zpool_main.c
+@@ -6226,7 +6226,8 @@ status_callback(zpool_handle_t *zhp, void *data)
+           &nvroot) == 0);
+       verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
+           (uint64_t **)&vs, &c) == 0);
+-      health = zpool_state_to_name(vs->vs_state, vs->vs_aux);
++
++      health = zpool_get_state_str(zhp);
+ 
+       (void) printf(gettext("  pool: %s\n"), zpool_get_name(zhp));
+       (void) printf(gettext(" state: %s\n"), health);
+diff --git a/configure.ac b/configure.ac
+index 3f4925c3..42cfc1a3 100644
+--- a/configure.ac
++++ b/configure.ac
+@@ -253,6 +253,7 @@ AC_CONFIG_FILES([
+       tests/zfs-tests/tests/functional/history/Makefile
+       tests/zfs-tests/tests/functional/inheritance/Makefile
+       tests/zfs-tests/tests/functional/inuse/Makefile
++      tests/zfs-tests/tests/functional/kstat/Makefile
+       tests/zfs-tests/tests/functional/large_files/Makefile
+       tests/zfs-tests/tests/functional/largest_pool/Makefile
+       tests/zfs-tests/tests/functional/link_count/Makefile
+diff --git a/include/libzfs.h b/include/libzfs.h
+index 945bd5b8..fea2fee4 100644
+--- a/include/libzfs.h
++++ b/include/libzfs.h
+@@ -296,6 +296,8 @@ int zfs_dev_is_whole_disk(char *dev_name);
+ char *zfs_get_underlying_path(char *dev_name);
+ char *zfs_get_enclosure_sysfs_path(char *dev_name);
+ 
++const char *zpool_get_state_str(zpool_handle_t *);
++
+ /*
+  * Functions to manage pool properties
+  */
+diff --git a/include/sys/spa.h b/include/sys/spa.h
+index 3b268419..810999c9 100644
+--- a/include/sys/spa.h
++++ b/include/sys/spa.h
+@@ -730,6 +730,7 @@ typedef struct spa_stats {
+       spa_stats_history_t     tx_assign_histogram;
+       spa_stats_history_t     io_history;
+       spa_stats_history_t     mmp_history;
++      spa_stats_history_t     state;          /* pool state */
+ } spa_stats_t;
+ 
+ typedef enum txg_state {
+@@ -889,6 +890,8 @@ extern void spa_history_log_internal_ds(struct dsl_dataset *ds, const char *op,
+ extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation,
+     dmu_tx_t *tx, const char *fmt, ...);
+ 
++extern const char *spa_state_to_name(spa_t *spa);
++
+ /* error handling */
+ struct zbookmark_phys;
+ extern void spa_log_error(spa_t *spa, zio_t *zio);
+diff --git a/lib/libspl/include/sys/kstat.h b/lib/libspl/include/sys/kstat.h
+index fcd3ed98..84c3d7ca 100644
+--- a/lib/libspl/include/sys/kstat.h
++++ b/lib/libspl/include/sys/kstat.h
+@@ -304,6 +304,8 @@ typedef struct kstat32 {
+ #define       KSTAT_FLAG_PERSISTENT           0x08
+ #define       KSTAT_FLAG_DORMANT              0x10
+ #define       KSTAT_FLAG_INVALID              0x20
++#define       KSTAT_FLAG_LONGSTRINGS          0x40
++#define       KSTAT_FLAG_NO_HEADERS           0x80
+ 
+ /*
+  * Dynamic update support
+diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
+index 53bc5034..315ba954 100644
+--- a/lib/libzfs/libzfs_pool.c
++++ b/lib/libzfs/libzfs_pool.c
+@@ -240,6 +240,38 @@ zpool_pool_state_to_name(pool_state_t state)
+ }
+ 
+ /*
++ * Given a pool handle, return the pool health string ("ONLINE", "DEGRADED",
++ * "SUSPENDED", etc).
++ */
++const char *
++zpool_get_state_str(zpool_handle_t *zhp)
++{
++      zpool_errata_t errata;
++      zpool_status_t status;
++      nvlist_t *nvroot;
++      vdev_stat_t *vs;
++      uint_t vsc;
++      const char *str;
++
++      status = zpool_get_status(zhp, NULL, &errata);
++
++      if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
++              str = gettext("FAULTED");
++      } else if (status == ZPOOL_STATUS_IO_FAILURE_WAIT ||
++          status == ZPOOL_STATUS_IO_FAILURE_MMP) {
++              str = gettext("SUSPENDED");
++      } else {
++              verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
++                  ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
++              verify(nvlist_lookup_uint64_array(nvroot,
++                  ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
++                  == 0);
++              str = zpool_state_to_name(vs->vs_state, vs->vs_aux);
++      }
++      return (str);
++}
++
++/*
+  * Get a zpool property value for 'prop' and return the value in
+  * a pre-allocated buffer.
+  */
+@@ -250,9 +282,6 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf,
+       uint64_t intval;
+       const char *strval;
+       zprop_source_t src = ZPROP_SRC_NONE;
+-      nvlist_t *nvroot;
+-      vdev_stat_t *vs;
+-      uint_t vsc;
+ 
+       if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
+               switch (prop) {
+@@ -261,7 +290,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf,
+                       break;
+ 
+               case ZPOOL_PROP_HEALTH:
+-                      (void) strlcpy(buf, "FAULTED", len);
++                      (void) strlcpy(buf, zpool_get_state_str(zhp), len);
+                       break;
+ 
+               case ZPOOL_PROP_GUID:
+@@ -362,14 +391,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf,
+                       break;
+ 
+               case ZPOOL_PROP_HEALTH:
+-                      verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
+-                          ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
+-                      verify(nvlist_lookup_uint64_array(nvroot,
+-                          ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
+-                          == 0);
+-
+-                      (void) strlcpy(buf, zpool_state_to_name(intval,
+-                          vs->vs_aux), len);
++                      (void) strlcpy(buf, zpool_get_state_str(zhp), len);
+                       break;
+               case ZPOOL_PROP_VERSION:
+                       if (intval >= SPA_VERSION_FEATURES) {
+diff --git a/lib/libzfs/libzfs_status.c b/lib/libzfs/libzfs_status.c
+index 6cdcd382..5e423f3a 100644
+--- a/lib/libzfs/libzfs_status.c
++++ b/lib/libzfs/libzfs_status.c
+@@ -403,12 +403,12 @@ zpool_status_t
+ zpool_get_status(zpool_handle_t *zhp, char **msgid, zpool_errata_t *errata)
+ {
+       zpool_status_t ret = check_status(zhp->zpool_config, B_FALSE, errata);
+-
+-      if (ret >= NMSGID)
+-              *msgid = NULL;
+-      else
+-              *msgid = zfs_msgid_table[ret];
+-
++      if (msgid != NULL) {
++              if (ret >= NMSGID)
++                      *msgid = NULL;
++              else
++                      *msgid = zfs_msgid_table[ret];
++      }
+       return (ret);
+ }
+ 
+diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
+index e92c3948..cc1c641d 100644
+--- a/module/zfs/spa_misc.c
++++ b/module/zfs/spa_misc.c
+@@ -2100,6 +2100,45 @@ spa_get_hostid(void)
+       return (myhostid);
+ }
+ 
++/*
++ * Return the pool state string ("ONLINE", "DEGRADED", "SUSPENDED", etc).
++ */
++const char *
++spa_state_to_name(spa_t *spa)
++{
++      vdev_state_t state = spa->spa_root_vdev->vdev_state;
++      vdev_aux_t aux = spa->spa_root_vdev->vdev_stat.vs_aux;
++
++      if (spa_suspended(spa) &&
++          (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE))
++              return ("SUSPENDED");
++
++      switch (state) {
++      case VDEV_STATE_CLOSED:
++      case VDEV_STATE_OFFLINE:
++              return ("OFFLINE");
++      case VDEV_STATE_REMOVED:
++              return ("REMOVED");
++      case VDEV_STATE_CANT_OPEN:
++              if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG)
++                      return ("FAULTED");
++              else if (aux == VDEV_AUX_SPLIT_POOL)
++                      return ("SPLIT");
++              else
++                      return ("UNAVAIL");
++      case VDEV_STATE_FAULTED:
++              return ("FAULTED");
++      case VDEV_STATE_DEGRADED:
++              return ("DEGRADED");
++      case VDEV_STATE_HEALTHY:
++              return ("ONLINE");
++      default:
++              break;
++      }
++
++      return ("UNKNOWN");
++}
++
+ #if defined(_KERNEL) && defined(HAVE_SPL)
+ /* Namespace manipulation */
+ EXPORT_SYMBOL(spa_lookup);
+@@ -2178,6 +2217,7 @@ EXPORT_SYMBOL(spa_is_root);
+ EXPORT_SYMBOL(spa_writeable);
+ EXPORT_SYMBOL(spa_mode);
+ EXPORT_SYMBOL(spa_namespace_lock);
++EXPORT_SYMBOL(spa_state_to_name);
+ 
+ /* BEGIN CSTYLED */
+ module_param(zfs_flags, uint, 0644);
+diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c
+index 8950d9c5..ca3d0be7 100644
+--- a/module/zfs/spa_stats.c
++++ b/module/zfs/spa_stats.c
+@@ -22,6 +22,8 @@
+ #include <sys/zfs_context.h>
+ #include <sys/spa_impl.h>
+ #include <sys/vdev_impl.h>
++#include <sys/spa.h>
++#include <zfs_comutil.h>
+ 
+ /*
+  * Keeps stats on last N reads per spa_t, disabled by default.
+@@ -992,6 +994,64 @@ spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
+       return ((void *)smh);
+ }
+ 
++static void *
++spa_state_addr(kstat_t *ksp, loff_t n)
++{
++      return (ksp->ks_private);       /* return the spa_t */
++}
++
++static int
++spa_state_data(char *buf, size_t size, void *data)
++{
++      spa_t *spa = (spa_t *)data;
++      (void) snprintf(buf, size, "%s\n", spa_state_to_name(spa));
++      return (0);
++}
++
++/*
++ * Return the state of the pool in /proc/spl/kstat/zfs/<pool>/state.
++ *
++ * This is a lock-less read of the pool's state (unlike using 'zpool', which
++ * can potentially block for seconds).  Because it doesn't block, it can useful
++ * as a pool heartbeat value.
++ */
++static void
++spa_state_init(spa_t *spa)
++{
++      spa_stats_history_t *ssh = &spa->spa_stats.state;
++      char *name;
++      kstat_t *ksp;
++
++      mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL);
++
++      name = kmem_asprintf("zfs/%s", spa_name(spa));
++      ksp = kstat_create(name, 0, "state", "misc",
++          KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
++
++      ssh->kstat = ksp;
++      if (ksp) {
++              ksp->ks_lock = &ssh->lock;
++              ksp->ks_data = NULL;
++              ksp->ks_private = spa;
++              ksp->ks_flags |= KSTAT_FLAG_NO_HEADERS;
++              kstat_set_raw_ops(ksp, NULL, spa_state_data, spa_state_addr);
++              kstat_install(ksp);
++      }
++
++      strfree(name);
++}
++
++static void
++spa_health_destroy(spa_t *spa)
++{
++      spa_stats_history_t *ssh = &spa->spa_stats.state;
++      kstat_t *ksp = ssh->kstat;
++      if (ksp)
++              kstat_delete(ksp);
++
++      mutex_destroy(&ssh->lock);
++}
++
+ void
+ spa_stats_init(spa_t *spa)
+ {
+@@ -1000,11 +1060,13 @@ spa_stats_init(spa_t *spa)
+       spa_tx_assign_init(spa);
+       spa_io_history_init(spa);
+       spa_mmp_history_init(spa);
++      spa_state_init(spa);
+ }
+ 
+ void
+ spa_stats_destroy(spa_t *spa)
+ {
++      spa_health_destroy(spa);
+       spa_tx_assign_destroy(spa);
+       spa_txg_history_destroy(spa);
+       spa_read_history_destroy(spa);
+diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
+index 379c9f73..69e9eb26 100644
+--- a/tests/runfiles/linux.run
++++ b/tests/runfiles/linux.run
+@@ -467,6 +467,10 @@ tests = ['inuse_001_pos', 'inuse_003_pos', 'inuse_004_pos',
+ post =
+ tags = ['functional', 'inuse']
+ 
++[tests/functional/kstat]
++tests = ['state']
++tags = ['functional', 'kstat']
++
+ [tests/functional/large_files]
+ tests = ['large_files_001_pos', 'large_files_002_pos']
+ tags = ['functional', 'large_files']
+diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib
+index 13c85912..86dae6ea 100644
+--- a/tests/zfs-tests/include/libtest.shlib
++++ b/tests/zfs-tests/include/libtest.shlib
+@@ -26,6 +26,7 @@
+ # Copyright 2016 Nexenta Systems, Inc.
+ # Copyright (c) 2017 Lawrence Livermore National Security, LLC.
+ # Copyright (c) 2017 Datto Inc.
++# Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+ #
+ 
+ . ${STF_TOOLS}/include/logapi.shlib
+@@ -3718,3 +3719,40 @@ function get_pool_devices #testpool #devdir
+       fi
+       echo $out
+ }
++
++#
++# Get scsi_debug device name.
++# Returns basename of scsi_debug device (for example "sdb").
++#
++function get_debug_device
++{
++      for i in {1..10} ; do
++              val=$(lsscsi | nawk '/scsi_debug/ {print $6; exit}' | cut -d / -f3)
++
++              # lsscsi can take time to settle
++              if [ "$val" != "-" ] ; then
++                      break
++              fi
++              sleep 1
++      done
++      echo "$val"
++}
++
++#
++# Returns SCSI host number for the given disk
++#
++function get_scsi_host #disk
++{
++      typeset disk=$1
++      ls /sys/block/${disk}/device/scsi_device | cut -d : -f 1
++}
++
++#
++# Simulate disk removal
++#
++function remove_disk #disk
++{
++      typeset disk=$1
++      on_off_disk $disk "offline"
++      block_device_wait
++}
+diff --git a/tests/zfs-tests/tests/functional/Makefile.am b/tests/zfs-tests/tests/functional/Makefile.am
+index ea52205a..bbbf3ba0 100644
+--- a/tests/zfs-tests/tests/functional/Makefile.am
++++ b/tests/zfs-tests/tests/functional/Makefile.am
+@@ -24,6 +24,7 @@ SUBDIRS = \
+       history \
+       inheritance \
+       inuse \
++      kstat \
+       large_files \
+       largest_pool \
+       libzfs \
+diff --git a/tests/zfs-tests/tests/functional/kstat/Makefile.am b/tests/zfs-tests/tests/functional/kstat/Makefile.am
+new file mode 100644
+index 00000000..8ad83ec3
+--- /dev/null
++++ b/tests/zfs-tests/tests/functional/kstat/Makefile.am
+@@ -0,0 +1,5 @@
++pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/kstat
++dist_pkgdata_SCRIPTS = \
++      setup.ksh \
++      cleanup.ksh \
++      state.ksh
+diff --git a/tests/zfs-tests/tests/functional/kstat/cleanup.ksh b/tests/zfs-tests/tests/functional/kstat/cleanup.ksh
+new file mode 100755
+index 00000000..8a212ce3
+--- /dev/null
++++ b/tests/zfs-tests/tests/functional/kstat/cleanup.ksh
+@@ -0,0 +1,28 @@
++#!/bin/ksh -p
++#
++# CDDL HEADER START
++#
++# The contents of this file are subject to the terms of the
++# Common Development and Distribution License (the "License").
++# You may not use this file except in compliance with the License.
++#
++# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
++# or http://www.opensolaris.org/os/licensing.
++# See the License for the specific language governing permissions
++# and limitations under the License.
++#
++# When distributing Covered Code, include this CDDL HEADER in each
++# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
++# If applicable, add the following below this CDDL HEADER, with the
++# fields enclosed by brackets "[]" replaced with your own identifying
++# information: Portions Copyright [yyyy] [name of copyright owner]
++#
++# CDDL HEADER END
++#
++#
++# Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
++#
++
++. $STF_SUITE/include/libtest.shlib
++
++default_cleanup
+diff --git a/tests/zfs-tests/tests/functional/kstat/setup.ksh b/tests/zfs-tests/tests/functional/kstat/setup.ksh
+new file mode 100755
+index 00000000..57717a09
+--- /dev/null
++++ b/tests/zfs-tests/tests/functional/kstat/setup.ksh
+@@ -0,0 +1,34 @@
++#!/bin/ksh -p
++#
++# CDDL HEADER START
++#
++# The contents of this file are subject to the terms of the
++# Common Development and Distribution License (the "License").
++# You may not use this file except in compliance with the License.
++#
++# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
++# or http://www.opensolaris.org/os/licensing.
++# See the License for the specific language governing permissions
++# and limitations under the License.
++#
++# When distributing Covered Code, include this CDDL HEADER in each
++# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
++# If applicable, add the following below this CDDL HEADER, with the
++# fields enclosed by brackets "[]" replaced with your own identifying
++# information: Portions Copyright [yyyy] [name of copyright owner]
++#
++# CDDL HEADER END
++#
++#
++# Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
++#
++
++. $STF_SUITE/include/libtest.shlib
++
++if ! is_linux ; then
++      log_unsupported "/proc/spl/kstat/<pool>/health only supported on Linux"
++fi
++
++default_mirror_setup $DISKS
++
++log_pass
+diff --git a/tests/zfs-tests/tests/functional/kstat/state.ksh b/tests/zfs-tests/tests/functional/kstat/state.ksh
+new file mode 100755
+index 00000000..bf0b6e31
+--- /dev/null
++++ b/tests/zfs-tests/tests/functional/kstat/state.ksh
+@@ -0,0 +1,144 @@
++#!/bin/ksh -p
++#
++# CDDL HEADER START
++#
++# The contents of this file are subject to the terms of the
++# Common Development and Distribution License (the "License").
++# You may not use this file except in compliance with the License.
++#
++# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
++# or http://www.opensolaris.org/os/licensing.
++# See the License for the specific language governing permissions
++# and limitations under the License.
++#
++# When distributing Covered Code, include this CDDL HEADER in each
++# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
++# If applicable, add the following below this CDDL HEADER, with the
++# fields enclosed by brackets "[]" replaced with your own identifying
++# information: Portions Copyright [yyyy] [name of copyright owner]
++#
++# CDDL HEADER END
++
++#
++# Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
++#
++
++#
++# DESCRIPTION:
++# Test /proc/spl/kstat/zfs/<pool>/state kstat
++#
++# STRATEGY:
++# 1. Create a mirrored pool
++# 2. Check that pool is ONLINE
++# 3. Fault one disk
++# 4. Check that pool is DEGRADED
++# 5. Create a new pool with a single scsi_debug disk
++# 6. Remove the disk
++# 7. Check that pool is SUSPENDED
++# 8. Add the disk back in
++# 9. Clear errors and destroy the pools
++
++. $STF_SUITE/include/libtest.shlib
++
++verify_runnable "both"
++
++function cleanup
++{
++      # Destroy the scsi_debug pool
++      if [ -n "$TESTPOOL2" ] ; then
++              if  [ -n "$host" ] ; then
++                      # Re-enable the disk
++                      scan_scsi_hosts $host
++
++                      # Device may have changed names after being inserted
++                      SDISK=$(get_debug_device)
++                      log_must ln $DEV_RDSKDIR/$SDISK $REALDISK
++              fi
++
++              # Restore our working pool image
++              if [ -n "$BACKUP" ] ; then
++                      gunzip -c $BACKUP > $REALDISK
++                      log_must rm -f $BACKUP
++              fi
++
++              # Our disk is back.  Now we can clear errors and destroy the
++              # pool cleanly.
++              log_must zpool clear $TESTPOOL2
++
++              # Now that the disk is back and errors cleared, wait for our
++              # hung 'zpool scrub' to finish.
++              wait
++
++              destroy_pool $TESTPOOL2
++              log_must rm $REALDISK
++              unload_scsi_debug
++      fi
++}
++
++# Check that our pool state values match what's expected
++#
++# $1: pool name
++# $2: expected state ("ONLINE", "DEGRADED", "SUSPENDED", etc)
++function check_all
++{
++      pool=$1
++      expected=$2
++
++      state1=$(zpool status $pool | awk '/state: /{print $2}');
++      state2=$(zpool list -H -o health $pool)
++      state3=$(cat /proc/spl/kstat/zfs/$pool/state)
++      log_note "Checking $expected = $state1 = $state2 = $state3"
++      if [[ "$expected" == "$state1" &&  "$expected" == "$state2" && \
++          "$expected" == "$state3" ]] ; then
++              true
++      else
++              false
++      fi
++}
++
++log_onexit cleanup
++
++log_assert "Testing /proc/spl/kstat/zfs/<pool>/state kstat"
++
++# Test that the initial pool is healthy
++check_all $TESTPOOL "ONLINE"
++
++# Fault one of the disks, and check that pool is degraded
++DISK1=$(echo "$DISKS" | awk '{print $2}')
++zpool offline -tf $TESTPOOL $DISK1
++check_all $TESTPOOL "DEGRADED"
++
++# Create a new pool out of a scsi_debug disk
++TESTPOOL2=testpool2
++MINVDEVSIZE_MB=$((MINVDEVSIZE / 1048576))
++load_scsi_debug $MINVDEVSIZE_MB 1 1 1 '512b'
++
++SDISK=$(get_debug_device)
++host=$(get_scsi_host $SDISK)
++
++# Use $REALDISK instead of $SDISK in our pool because $SDISK can change names
++# as we remove/add the disk (i.e. /dev/sdf -> /dev/sdg).
++REALDISK=/dev/kstat-state-realdisk
++log_must [ ! -e $REALDISK ]
++ln $DEV_RDSKDIR/$SDISK $REALDISK
++
++log_must zpool create $TESTPOOL2 $REALDISK
++
++# Backup the contents of the disk image
++BACKUP=/tmp/kstat-state-realdisk.gz
++log_must [ ! -e $BACKUP ]
++gzip -c $REALDISK > $BACKUP
++
++# Yank out the disk from under the pool
++log_must rm $REALDISK
++remove_disk $SDISK
++
++# Run a 'zpool scrub' in the background to suspend the pool.  We run it in the
++# background since the command will hang when the pool gets suspended.  The
++# command will resume and exit after we restore the missing disk later on.
++zpool scrub $TESTPOOL2 &
++sleep 1               # Give the scrub some time to run before we check if it fails
++
++log_must check_all $TESTPOOL2 "SUSPENDED"
++
++log_pass "/proc/spl/kstat/zfs/<pool>/state test successful"
diff --git a/zfs-patches/0016-Linux-4.14-compat-blk_queue_stackable.patch b/zfs-patches/0016-Linux-4.14-compat-blk_queue_stackable.patch

new file mode 100644 (file)

index 0000000..82219a7
--- /dev/null
+++ b/zfs-patches/0016-Linux-4.14-compat-blk_queue_stackable.patch
@@ -0,0 +1,115 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Brian Behlendorf <behlendorf1@llnl.gov>
+Date: Tue, 19 Jun 2018 21:52:45 -0700
+Subject: [PATCH] Linux 4.14 compat: blk_queue_stackable()
+
+The blk_queue_stackable() function was replaced in the 4.14 kernel
+by queue_is_rq_based(), commit torvalds/linux@5fdee212.  This change
+resulted in the default elevator being used which can negatively
+impact performance.
+
+Rather than adding additional compatibility code to detect the
+new interface, unconditionally attempt to set the elevator.  Since
+we expect this to fail for block devices without an elevator, the
+error message has been moved into zfs_dbgmsg().
+
+Finally, it was observed that elevator_change() was removed
+from the 4.12 kernel, commit torvalds/linux@c033269.  Update the
+comment to clearly specify which kernels are expected to export the
+elevator_change() symbol.
+
+Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
+Reviewed-by: Tony Hutter <hutter2@llnl.gov>
+Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Closes #7645
+
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ config/kernel-elevator-change.m4 |  4 ++--
+ include/linux/blkdev_compat.h    | 11 -----------
+ module/zfs/vdev_disk.c           | 22 ++++++++++------------
+ 3 files changed, 12 insertions(+), 25 deletions(-)
+
+diff --git a/config/kernel-elevator-change.m4 b/config/kernel-elevator-change.m4
+index ace5aa82..eba25257 100644
+--- a/config/kernel-elevator-change.m4
++++ b/config/kernel-elevator-change.m4
+@@ -1,6 +1,6 @@
+ dnl #
+-dnl # 2.6.36 API change
+-dnl # Verify the elevator_change() symbol is available.
++dnl # 2.6.36 API, exported elevator_change() symbol
++dnl # 4.12 API, removed elevator_change() symbol
+ dnl #
+ AC_DEFUN([ZFS_AC_KERNEL_ELEVATOR_CHANGE], [
+       AC_MSG_CHECKING([whether elevator_change() is available])
+diff --git a/include/linux/blkdev_compat.h b/include/linux/blkdev_compat.h
+index 27f05662..c8cdf38e 100644
+--- a/include/linux/blkdev_compat.h
++++ b/include/linux/blkdev_compat.h
+@@ -106,17 +106,6 @@ blk_queue_set_write_cache(struct request_queue *q, bool wc, bool fua)
+ #endif
+ 
+ /*
+- * 2.6.27 API change,
+- * The blk_queue_stackable() queue flag was added in 2.6.27 to handle dm
+- * stacking drivers.  Prior to this request stacking drivers were detected
+- * by checking (q->request_fn == NULL), for earlier kernels we revert to
+- * this legacy behavior.
+- */
+-#ifndef blk_queue_stackable
+-#define       blk_queue_stackable(q)  ((q)->request_fn == NULL)
+-#endif
+-
+-/*
+  * 2.6.34 API change,
+  * The blk_queue_max_hw_sectors() function replaces blk_queue_max_sectors().
+  */
+diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
+index 6dc0544f..c5708cb2 100644
+--- a/module/zfs/vdev_disk.c
++++ b/module/zfs/vdev_disk.c
+@@ -168,23 +168,20 @@ vdev_elevator_switch(vdev_t *v, char *elevator)
+       if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
+               return;
+ 
+-      /* Skip devices without schedulers (loop, ram, dm, etc) */
+-      if (!q->elevator || !blk_queue_stackable(q))
+-              return;
+-
+       /* Leave existing scheduler when set to "none" */
+       if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4))
+               return;
+ 
++      /*
++       * The elevator_change() function was available in kernels from
++       * 2.6.36 to 4.11.  When not available fall back to using the user
++       * mode helper functionality to set the elevator via sysfs.  This
++       * requires /bin/echo and sysfs to be mounted which may not be true
++       * early in the boot process.
++       */
+ #ifdef HAVE_ELEVATOR_CHANGE
+       error = elevator_change(q, elevator);
+ #else
+-      /*
+-       * For pre-2.6.36 kernels elevator_change() is not available.
+-       * Therefore we fall back to using a usermodehelper to echo the
+-       * elevator into sysfs;  This requires /bin/echo and sysfs to be
+-       * mounted which may not be true early in the boot process.
+-       */
+ #define       SET_SCHEDULER_CMD \
+       "exec 0</dev/null " \
+       "     1>/sys/block/%s/queue/scheduler " \
+@@ -198,9 +195,10 @@ vdev_elevator_switch(vdev_t *v, char *elevator)
+       error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+       strfree(argv[2]);
+ #endif /* HAVE_ELEVATOR_CHANGE */
+-      if (error)
+-              printk("ZFS: Unable to set \"%s\" scheduler for %s (%s): %d\n",
++      if (error) {
++              zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d\n",
+                   elevator, v->vdev_path, device, error);
++      }
+ }
+ 
+ /*
diff --git a/zfs-patches/0017-Default-ashift-for-Amazon-EC2-NVMe-devices.patch b/zfs-patches/0017-Default-ashift-for-Amazon-EC2-NVMe-devices.patch

new file mode 100644 (file)

index 0000000..c1ec08a
--- /dev/null
+++ b/zfs-patches/0017-Default-ashift-for-Amazon-EC2-NVMe-devices.patch
@@ -0,0 +1,54 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Troels=20N=C3=B8rgaard?= <tnn@tradeshift.com>
+Date: Sat, 7 Jul 2018 01:15:19 +0200
+Subject: [PATCH] Default ashift for Amazon EC2 NVMe devices
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Add a default 4 KiB ashift for Amazon EC2 NVMe devices on instances with
+NVMe ephemeral devices, such as the types c5d, f1, i3 and m5d.
+As per the official documentation [1] a 4096 byte blocksize should be
+used to match the underlying hardware.
+
+The string was identified via:
+
+$ sudo sginfo -M /dev/nvme0n1
+INQUIRY response (cmd: 0x12)
+----------------------------
+Device Type                        0
+Vendor:                    NVMe
+Product:                   Amazon EC2 NVMe
+Revision level:
+
+$ lsblk -io KNAME,TYPE,SIZE,MODEL
+KNAME   TYPE    SIZE MODEL
+nvme0n1 disk  442.4G Amazon EC2 NVMe Instance Storage
+
+[1] https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/
+    storage-optimized-instances.html
+    Retrieved 2018-07-03
+
+Reviewed-by: George Melikov <mail@gmelikov.ru>
+Reviewed-by: Giuseppe Di Natale <guss80@gmail.com>
+Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Signed-off-by: Troels Nørgaard <tnn@tradeshift.com>
+Closes #7676
+
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ cmd/zpool/zpool_vdev.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c
+index fd6bd9e7..69ff7ff6 100644
+--- a/cmd/zpool/zpool_vdev.c
++++ b/cmd/zpool/zpool_vdev.c
+@@ -191,6 +191,7 @@ static vdev_disk_db_entry_t vdev_disk_database[] = {
+       {"ATA     INTEL SSDSC2BP24", 4096},
+       {"ATA     INTEL SSDSC2BP48", 4096},
+       {"NA      SmrtStorSDLKAE9W", 4096},
++      {"NVMe    Amazon EC2 NVMe ", 4096},
+       /* Imported from Open Solaris */
+       {"ATA     MARVELL SD88SA02", 4096},
+       /* Advanced format Hard drives */
diff --git a/zfs-patches/0018-Fix-kernel-unaligned-access-on-sparc64.patch b/zfs-patches/0018-Fix-kernel-unaligned-access-on-sparc64.patch

new file mode 100644 (file)

index 0000000..f8870a6
--- /dev/null
+++ b/zfs-patches/0018-Fix-kernel-unaligned-access-on-sparc64.patch
@@ -0,0 +1,123 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Brian Behlendorf <behlendorf1@llnl.gov>
+Date: Wed, 11 Jul 2018 13:10:40 -0700
+Subject: [PATCH] Fix kernel unaligned access on sparc64
+
+Update the SA_COPY_DATA macro to check if the architecture supports
+efficient unaligned memory accesses at compile time.  Otherwise
+fall back to using the sa_copy_data() function.
+
+The kernel provided CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is
+used to determine availability in kernel space.  In user space
+the x86_64, x86, powerpc, and sometimes arm architectures will
+define the HAVE_EFFICIENT_UNALIGNED_ACCESS macro.
+
+Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Closes #7642
+Closes #7684
+
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ lib/libspl/include/sys/isa_defs.h |  7 +++++++
+ module/icp/algs/modes/ccm.c       |  2 +-
+ module/zfs/sa.c                   | 35 ++++++++++++++++++++---------------
+ 3 files changed, 28 insertions(+), 16 deletions(-)
+
+diff --git a/lib/libspl/include/sys/isa_defs.h b/lib/libspl/include/sys/isa_defs.h
+index a5bea039..7a90e077 100644
+--- a/lib/libspl/include/sys/isa_defs.h
++++ b/lib/libspl/include/sys/isa_defs.h
+@@ -55,6 +55,7 @@ extern "C" {
+ #endif
+ 
+ #define       _SUNOS_VTOC_16
++#define       HAVE_EFFICIENT_UNALIGNED_ACCESS
+ 
+ /* i386 arch specific defines */
+ #elif defined(__i386) || defined(__i386__)
+@@ -76,6 +77,7 @@ extern "C" {
+ #endif
+ 
+ #define       _SUNOS_VTOC_16
++#define       HAVE_EFFICIENT_UNALIGNED_ACCESS
+ 
+ /* powerpc arch specific defines */
+ #elif defined(__powerpc) || defined(__powerpc__) || defined(__powerpc64__)
+@@ -99,6 +101,7 @@ extern "C" {
+ #endif
+ 
+ #define       _SUNOS_VTOC_16
++#define       HAVE_EFFICIENT_UNALIGNED_ACCESS
+ 
+ /* arm arch specific defines */
+ #elif defined(__arm) || defined(__arm__) || defined(__aarch64__)
+@@ -129,6 +132,10 @@ extern "C" {
+ 
+ #define       _SUNOS_VTOC_16
+ 
++#if defined(__ARM_FEATURE_UNALIGNED)
++#define       HAVE_EFFICIENT_UNALIGNED_ACCESS
++#endif
++
+ /* sparc arch specific defines */
+ #elif defined(__sparc) || defined(__sparc__)
+ 
+diff --git a/module/icp/algs/modes/ccm.c b/module/icp/algs/modes/ccm.c
+index 22aeb0a6..fb41194f 100644
+--- a/module/icp/algs/modes/ccm.c
++++ b/module/icp/algs/modes/ccm.c
+@@ -28,7 +28,7 @@
+ #include <sys/crypto/common.h>
+ #include <sys/crypto/impl.h>
+ 
+-#if defined(__i386) || defined(__amd64)
++#ifdef HAVE_EFFICIENT_UNALIGNED_ACCESS
+ #include <sys/byteorder.h>
+ #define       UNALIGNED_POINTERS_PERMITTED
+ #endif
+diff --git a/module/zfs/sa.c b/module/zfs/sa.c
+index 8046dbde..1fb1a8b5 100644
+--- a/module/zfs/sa.c
++++ b/module/zfs/sa.c
+@@ -147,21 +147,26 @@ arc_byteswap_func_t sa_bswap_table[] = {
+       zfs_acl_byteswap,
+ };
+ 
+-#define       SA_COPY_DATA(f, s, t, l) \
+-      { \
+-              if (f == NULL) { \
+-                      if (l == 8) { \
+-                              *(uint64_t *)t = *(uint64_t *)s; \
+-                      } else if (l == 16) { \
+-                              *(uint64_t *)t = *(uint64_t *)s; \
+-                              *(uint64_t *)((uintptr_t)t + 8) = \
+-                                  *(uint64_t *)((uintptr_t)s + 8); \
+-                      } else { \
+-                              bcopy(s, t, l); \
+-                      } \
+-              } else \
+-                      sa_copy_data(f, s, t, l); \
+-      }
++#ifdef HAVE_EFFICIENT_UNALIGNED_ACCESS
++#define       SA_COPY_DATA(f, s, t, l)                                \
++do {                                                          \
++      if (f == NULL) {                                        \
++              if (l == 8) {                                   \
++                      *(uint64_t *)t = *(uint64_t *)s;        \
++              } else if (l == 16) {                           \
++                      *(uint64_t *)t = *(uint64_t *)s;        \
++                      *(uint64_t *)((uintptr_t)t + 8) =       \
++                          *(uint64_t *)((uintptr_t)s + 8);    \
++              } else {                                        \
++                      bcopy(s, t, l);                         \
++              }                                               \
++      } else {                                                \
++              sa_copy_data(f, s, t, l);                       \
++      }                                                       \
++} while (0)
++#else
++#define       SA_COPY_DATA(f, s, t, l)        sa_copy_data(f, s, t, l)
++#endif
+ 
+ /*
+  * This table is fixed and cannot be changed.  Its purpose is to
diff --git a/zfs-patches/0019-Fix-zpl_mount-deadlock.patch b/zfs-patches/0019-Fix-zpl_mount-deadlock.patch

new file mode 100644 (file)

index 0000000..93ee7ce
--- /dev/null
+++ b/zfs-patches/0019-Fix-zpl_mount-deadlock.patch
@@ -0,0 +1,91 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Brian Behlendorf <behlendorf1@llnl.gov>
+Date: Wed, 11 Jul 2018 15:49:10 -0700
+Subject: [PATCH] Fix zpl_mount() deadlock
+
+Commit 93b43af10 inadvertently introduced the following scenario which
+can result in a deadlock.  This issue was most easily reproduced by
+LXD containers using a ZFS storage backend but should be reproducible
+under any workload which is frequently mounting and unmounting.
+
+-- THREAD A --
+spa_sync()
+  spa_sync_upgrades()
+    rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); <- Waiting on B
+
+-- THREAD B --
+mount_fs()
+  zpl_mount()
+    zpl_mount_impl()
+      dmu_objset_hold()
+        dmu_objset_hold_flags()
+          dsl_pool_hold()
+            dsl_pool_config_enter()
+              rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
+    sget()
+      sget_userns()
+        grab_super()
+          down_write(&s->s_umount); <- Waiting on C
+
+-- THREAD C --
+cleanup_mnt()
+  deactivate_super()
+    down_write(&s->s_umount);
+    deactivate_locked_super()
+      zpl_kill_sb()
+        kill_anon_super()
+          generic_shutdown_super()
+            sync_filesystem()
+              zpl_sync_fs()
+                zfs_sync()
+                  zil_commit()
+                    txg_wait_synced() <- Waiting on A
+
+Reviewed by: Alek Pinchuk <apinchuk@datto.com>
+Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Closes #7598
+Closes #7659
+Closes #7691
+Closes #7693
+
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ include/sys/zfs_vfsops.h |  1 +
+ module/zfs/zpl_super.c   | 11 ++++++++++-
+ 2 files changed, 11 insertions(+), 1 deletion(-)
+
+diff --git a/include/sys/zfs_vfsops.h b/include/sys/zfs_vfsops.h
+index 2326da42..927153b2 100644
+--- a/include/sys/zfs_vfsops.h
++++ b/include/sys/zfs_vfsops.h
+@@ -32,6 +32,7 @@
+ #include <sys/zil.h>
+ #include <sys/sa.h>
+ #include <sys/rrwlock.h>
++#include <sys/dsl_dataset.h>
+ #include <sys/zfs_ioctl.h>
+ 
+ #ifdef        __cplusplus
+diff --git a/module/zfs/zpl_super.c b/module/zfs/zpl_super.c
+index fc10271b..5c426b0a 100644
+--- a/module/zfs/zpl_super.c
++++ b/module/zfs/zpl_super.c
+@@ -271,8 +271,17 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
+       if (err)
+               return (ERR_PTR(-err));
+ 
++      /*
++       * The dsl pool lock must be released prior to calling sget().
++       * It is possible sget() may block on the lock in grab_super()
++       * while deactivate_super() holds that same lock and waits for
++       * a txg sync.  If the dsl_pool lock is held over sget()
++       * this can prevent the pool sync and cause a deadlock.
++       */
++      dsl_pool_rele(dmu_objset_pool(os), FTAG);
+       s = zpl_sget(fs_type, zpl_test_super, set_anon_super, flags, os);
+-      dmu_objset_rele(os, FTAG);
++      dsl_dataset_rele(dmu_objset_ds(os), FTAG);
++
+       if (IS_ERR(s))
+               return (ERR_CAST(s));
+ 
diff --git a/zfs-patches/0020-OpenZFS-8906-uts-illumos-rootfs-should-support-salte.patch b/zfs-patches/0020-OpenZFS-8906-uts-illumos-rootfs-should-support-salte.patch

new file mode 100644 (file)

index 0000000..395554b
--- /dev/null
+++ b/zfs-patches/0020-OpenZFS-8906-uts-illumos-rootfs-should-support-salte.patch
@@ -0,0 +1,133 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Toomas Soome <tsoome@me.com>
+Date: Wed, 1 Jun 2016 19:18:10 +0300
+Subject: [PATCH] OpenZFS 8906 - uts: illumos rootfs should support salted
+ cksum
+
+Porting notes:
+* As of grub-2.02 these checksums are not supported.  However, as
+  pointed out in #6501 there are alternatives such as EFISTUB which
+  work and have no such restriction.  A warning was added to the
+  checksum property section of the zfs.8 man page.
+
+Authored by: Toomas Soome <tsoome@me.com>
+Reviewed by: C Fraire <cfraire@me.com>
+Reviewed by: Robert Mustacchi <rm@joyent.com>
+Reviewed by: Yuri Pankov <yuripv@yuripv.net>
+Approved by: Dan McDonald <danmcd@joyent.com>
+Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
+
+OpenZFS-issue: https://illumos.org/issues/8906
+OpenZFS-commit: https://github.com/openzfs/openzfs/commit/7dec52f
+Closes #6501
+Closes #7714
+
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ man/man5/zpool-features.5 | 18 +++++++-----------
+ man/man8/zfs.8            |  5 ++++-
+ module/zfs/zfs_ioctl.c    | 11 +----------
+ 3 files changed, 12 insertions(+), 22 deletions(-)
+
+diff --git a/man/man5/zpool-features.5 b/man/man5/zpool-features.5
+index 78ea559f..140ce269 100644
+--- a/man/man5/zpool-features.5
++++ b/man/man5/zpool-features.5
+@@ -14,7 +14,7 @@
+ .\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your
+ .\" own identifying information:
+ .\" Portions Copyright [yyyy] [name of copyright owner]
+-.TH ZPOOL-FEATURES 5 "Aug 27, 2013"
++.TH ZPOOL-FEATURES 5 "Jun 8, 2018"
+ .SH NAME
+ zpool\-features \- ZFS pool feature descriptions
+ .SH DESCRIPTION
+@@ -248,8 +248,9 @@ immediately activate the \fBlz4_compress\fR feature on the underlying
+ pool using the \fBzfs\fR(1M) command. Also, all newly written metadata
+ will be compressed with \fBlz4\fR algorithm. Since this feature is not
+ read-only compatible, this operation will render the pool unimportable
+-on systems without support for the \fBlz4_compress\fR feature. Booting
+-off of \fBlz4\fR-compressed root pools is supported.
++on systems without support for the \fBlz4_compress\fR feature.
++
++Booting off of \fBlz4\fR-compressed root pools is supported.
+ 
+ This feature becomes \fBactive\fR as soon as it is enabled and will
+ never return to being \fBenabled\fB.
+@@ -510,8 +511,7 @@ can turn on the \fBsha512\fR checksum on any dataset using the
+ and will return to being \fBenabled\fR once all filesystems that have
+ ever had their checksum set to \fBsha512\fR are destroyed.
+ 
+-Booting off of pools utilizing SHA-512/256 is supported (provided that
+-the updated GRUB stage2 module is installed).
++Booting off of pools utilizing SHA-512/256 is supported.
+ 
+ .RE
+ 
+@@ -545,9 +545,7 @@ can turn on the \fBskein\fR checksum on any dataset using the
+ and will return to being \fBenabled\fR once all filesystems that have
+ ever had their checksum set to \fBskein\fR are destroyed.
+ 
+-Booting off of pools using \fBskein\fR is \fBNOT\fR supported
+--- any attempt to enable \fBskein\fR on a root pool will fail with an
+-error.
++Booting off of pools using \fBskein\fR is supported.
+ 
+ .RE
+ 
+@@ -587,9 +585,7 @@ can turn on the \fBedonr\fR checksum on any dataset using the
+ and will return to being \fBenabled\fR once all filesystems that have
+ ever had their checksum set to \fBedonr\fR are destroyed.
+ 
+-Booting off of pools using \fBedonr\fR is \fBNOT\fR supported
+--- any attempt to enable \fBedonr\fR on a root pool will fail with an
+-error.
++Booting off of pools using \fBedonr\fR is supported.
+ 
+ .RE
+ 
+diff --git a/man/man8/zfs.8 b/man/man8/zfs.8
+index 48a5e6ea..bb3b46e3 100644
+--- a/man/man8/zfs.8
++++ b/man/man8/zfs.8
+@@ -29,7 +29,7 @@
+ .\" Copyright 2016 Nexenta Systems, Inc.
+ .\" Copyright 2016 Richard Laager. All rights reserved.
+ .\"
+-.Dd June 28, 2017
++.Dd July 13, 2018
+ .Dt ZFS 8 SMM
+ .Os Linux
+ .Sh NAME
+@@ -1049,6 +1049,9 @@ The
+ and
+ .Sy edonr
+ checksum algorithms require enabling the appropriate features on the pool.
++These algorithms are not supported by GRUB and should not be set on the
++.Sy bootfs
++filesystem when using GRUB to boot the system.
+ Please see
+ .Xr zpool-features 5
+ for more information on these algorithms.
+diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
+index f4f509a7..6516f646 100644
+--- a/module/zfs/zfs_ioctl.c
++++ b/module/zfs/zfs_ioctl.c
+@@ -3985,16 +3985,7 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
+ 
+               if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+                       return (err);
+-              /*
+-               * Salted checksums are not supported on root pools.
+-               */
+-              if (spa_bootfs(spa) != 0 &&
+-                  intval < ZIO_CHECKSUM_FUNCTIONS &&
+-                  (zio_checksum_table[intval].ci_flags &
+-                  ZCHECKSUM_FLAG_SALTED)) {
+-                      spa_close(spa, FTAG);
+-                      return (SET_ERROR(ERANGE));
+-              }
++
+               if (!spa_feature_is_enabled(spa, feature)) {
+                       spa_close(spa, FTAG);
+                       return (SET_ERROR(ENOTSUP));
diff --git a/zfs-patches/0021-Fix-zfs-incremental-send-remove-o-properties.patch b/zfs-patches/0021-Fix-zfs-incremental-send-remove-o-properties.patch

new file mode 100644 (file)

index 0000000..17c7d43
--- /dev/null
+++ b/zfs-patches/0021-Fix-zfs-incremental-send-remove-o-properties.patch
@@ -0,0 +1,108 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: LOLi <loli10K@users.noreply.github.com>
+Date: Tue, 1 May 2018 05:58:29 +0200
+Subject: [PATCH] Fix zfs incremental send remove '-o' properties
+
+When receiving an incremental send stream with intermediary snapshots
+zfs_receive_one() does not correctly identify the top-level dataset:
+consequently we restore said snapshots as if they were children
+datasets in the hierarchy, forcing inheritance of any property received
+with 'zfs send -o' and effectively removing any locally set value.
+
+The test case did not correctly verify this situation because it uses
+adjacent snapshots, basically testing 'zfs send -i' instead of
+'zfs send -I': this commit adds an additional intermediary snapshot to
+the test script.
+
+Reviewed-by: Paul Dagnelie <pcd@delphix.com>
+Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Signed-off-by: loli10K <ezomori.nozomu@gmail.com>
+Closes #7478
+
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ lib/libzfs/libzfs_sendrecv.c                       |  2 +-
+ .../zfs_receive/receive-o-x_props_override.ksh     | 22 +++++++++++++---------
+ 2 files changed, 14 insertions(+), 10 deletions(-)
+
+diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c
+index 5490581a..c5acd21a 100644
+--- a/lib/libzfs/libzfs_sendrecv.c
++++ b/lib/libzfs/libzfs_sendrecv.c
+@@ -3592,7 +3592,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
+               goto out;
+       }
+ 
+-      if (top_zfs && *top_zfs == NULL)
++      if (top_zfs && (*top_zfs == NULL || strcmp(*top_zfs, name) == 0))
+               toplevel = B_TRUE;
+       if (drrb->drr_type == DMU_OST_ZVOL) {
+               type = ZFS_TYPE_VOLUME;
+diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/receive-o-x_props_override.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/receive-o-x_props_override.ksh
+index e4e69851..4e3a5393 100755
+--- a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/receive-o-x_props_override.ksh
++++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/receive-o-x_props_override.ksh
+@@ -212,16 +212,17 @@ log_must eval "zfs send -R $orig@snap1 > $streamfile_repl"
+ log_must eval "zfs recv $dest < $streamfile_repl"
+ # Fill the datasets with properties and create an incremental replication stream
+ log_must zfs snapshot -r $orig@snap2
++log_must zfs snapshot -r $orig@snap3
+ log_must eval "zfs set copies=2 $orig"
+ log_must eval "zfs set '$userprop:orig'='$userval' $orig"
+ log_must eval "zfs set '$userprop:orig'='$userval' $origsub"
+ log_must eval "zfs set '$userprop:snap'='$userval' $orig@snap1"
+-log_must eval "zfs set '$userprop:snap'='$userval' $origsub@snap2"
+-log_must eval "zfs send -R -I $orig@snap1 $orig@snap2 > $streamfile_incr"
++log_must eval "zfs set '$userprop:snap'='$userval' $origsub@snap3"
++log_must eval "zfs send -R -I $orig@snap1 $orig@snap3 > $streamfile_incr"
+ # Sets various combination of override and exclude options
+ log_must eval "zfs recv -F -o atime=off -o '$userprop:dest2'='$userval' "\
+       "-o quota=123456789 -x compression -x '$userprop:orig' " \
+-      "-x '$userprop:snap2' $dest < $streamfile_incr"
++      "-x '$userprop:snap3' $dest < $streamfile_incr"
+ # Verify we can correctly override and exclude properties
+ log_must eval "check_prop_source $dest copies 2 received"
+ log_must eval "check_prop_source $dest atime off local"
+@@ -237,9 +238,9 @@ log_must eval "check_prop_missing $destsub '$userprop:orig'"
+ log_must eval "check_prop_source " \
+       "$dest@snap1 '$userprop:snap' '$userval' received"
+ log_must eval "check_prop_source " \
+-      "$destsub@snap2 '$userprop:snap' '$userval' received"
+-log_must eval "check_prop_missing $dest@snap2 '$userprop:snap2'"
+-log_must eval "check_prop_missing $destsub@snap2 '$userprop:snap2'"
++      "$destsub@snap3 '$userprop:snap' '$userval' received"
++log_must eval "check_prop_missing $dest@snap3 '$userprop:snap3'"
++log_must eval "check_prop_missing $destsub@snap3 '$userprop:snap3'"
+ # Cleanup
+ log_must zfs destroy -r -f $orig
+ log_must zfs destroy -r -f $dest
+@@ -270,7 +271,8 @@ log_must eval "zfs set compression=gzip $dest"
+ log_must eval "zfs set '$userprop:dest'='localval' $dest"
+ # Receive the new stream, verify we preserve locally set properties
+ log_must zfs snapshot -r $orig@snap2
+-log_must eval "zfs send -R -I $orig@snap1 $orig@snap2 > $streamfile_incr"
++log_must zfs snapshot -r $orig@snap3
++log_must eval "zfs send -R -I $orig@snap1 $orig@snap3 > $streamfile_incr"
+ log_must eval "zfs recv -F -x copies -x compression -x '$userprop:orig' " \
+       "-x '$userprop:dest' $dest < $streamfile_incr"
+ log_must eval "check_prop_source $dest '$userprop:dest' 'localval' local"
+@@ -305,7 +307,8 @@ log_must eval "check_prop_source $destsub quota 0 default"
+ log_must eval "zfs set quota=123456789 $dest"
+ log_must eval "zfs set canmount=off $destsub"
+ log_must zfs snapshot -r $orig@snap2
+-log_must eval "zfs send -R -I $orig@snap1 $orig@snap2 > $streamfile_incr"
++log_must zfs snapshot -r $orig@snap3
++log_must eval "zfs send -R -I $orig@snap1 $orig@snap3 > $streamfile_incr"
+ log_must eval "zfs recv -F -x quota -x canmount $dest < $streamfile_incr"
+ log_must eval "check_prop_source $dest quota 123456789 local"
+ log_must eval "check_prop_source $destsub quota 0 default"
+@@ -332,7 +335,8 @@ log_must eval "zfs set '$userprop:origsub'='$userval' $destsub"
+ mntpnt=$(get_prop mountpoint $orig)
+ log_must eval "dd if=/dev/urandom of=$mntpnt/file bs=1024k count=10"
+ log_must zfs snapshot -r $orig@snap2
+-log_must eval "zfs send -R -I $orig@snap1 $orig@snap2 > $streamfile_incr"
++log_must zfs snapshot -r $orig@snap3
++log_must eval "zfs send -R -I $orig@snap1 $orig@snap3 > $streamfile_incr"
+ log_must eval "dd if=$streamfile_incr of=$streamfile_trun bs=1024k count=9"
+ # Receive the truncated stream, verify original properties are kept
+ log_mustnot eval "zfs recv -F -o copies=3 -o quota=987654321 "\
diff --git a/zfs-patches/0022-Allow-inherited-properties-in-zfs_check_settable.patch b/zfs-patches/0022-Allow-inherited-properties-in-zfs_check_settable.patch

new file mode 100644 (file)

index 0000000..7e70804
--- /dev/null
+++ b/zfs-patches/0022-Allow-inherited-properties-in-zfs_check_settable.patch
@@ -0,0 +1,95 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: LOLi <loli10K@users.noreply.github.com>
+Date: Fri, 3 Aug 2018 23:56:25 +0200
+Subject: [PATCH] Allow inherited properties in zfs_check_settable()
+
+This change modifies how 'checksum' and 'dedup' properties are verified
+in zfs_check_settable() handling the case where they are explicitly
+inherited in the dataset hierarchy when receiving a recursive send
+stream.
+
+Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Reviewed-by: Tom Caputi <tcaputi@datto.com>
+Signed-off-by: loli10K <ezomori.nozomu@gmail.com>
+Closes #7755
+Closes #7576
+Closes #7757
+
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ module/zfs/zfs_ioctl.c                             | 26 +++++++++++-----------
+ .../zfs_receive/receive-o-x_props_override.ksh     |  6 +++--
+ 2 files changed, 17 insertions(+), 15 deletions(-)
+
+diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
+index 6516f646..b8783e54 100644
+--- a/module/zfs/zfs_ioctl.c
++++ b/module/zfs/zfs_ioctl.c
+@@ -3967,7 +3967,6 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
+       {
+               spa_feature_t feature;
+               spa_t *spa;
+-              uint64_t intval;
+               int err;
+ 
+               /* dedup feature version checks */
+@@ -3975,22 +3974,23 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
+                   zfs_earlier_version(dsname, SPA_VERSION_DEDUP))
+                       return (SET_ERROR(ENOTSUP));
+ 
+-              if (nvpair_value_uint64(pair, &intval) != 0)
+-                      return (SET_ERROR(EINVAL));
+-
+-              /* check prop value is enabled in features */
+-              feature = zio_checksum_to_feature(intval & ZIO_CHECKSUM_MASK);
+-              if (feature == SPA_FEATURE_NONE)
+-                      break;
++              if (nvpair_type(pair) == DATA_TYPE_UINT64 &&
++                  nvpair_value_uint64(pair, &intval) == 0) {
++                      /* check prop value is enabled in features */
++                      feature = zio_checksum_to_feature(
++                          intval & ZIO_CHECKSUM_MASK);
++                      if (feature == SPA_FEATURE_NONE)
++                              break;
+ 
+-              if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+-                      return (err);
++                      if ((err = spa_open(dsname, &spa, FTAG)) != 0)
++                              return (err);
+ 
+-              if (!spa_feature_is_enabled(spa, feature)) {
++                      if (!spa_feature_is_enabled(spa, feature)) {
++                              spa_close(spa, FTAG);
++                              return (SET_ERROR(ENOTSUP));
++                      }
+                       spa_close(spa, FTAG);
+-                      return (SET_ERROR(ENOTSUP));
+               }
+-              spa_close(spa, FTAG);
+               break;
+       }
+ 
+diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/receive-o-x_props_override.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/receive-o-x_props_override.ksh
+index 4e3a5393..583d8eb1 100755
+--- a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/receive-o-x_props_override.ksh
++++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/receive-o-x_props_override.ksh
+@@ -221,15 +221,17 @@ log_must eval "zfs set '$userprop:snap'='$userval' $origsub@snap3"
+ log_must eval "zfs send -R -I $orig@snap1 $orig@snap3 > $streamfile_incr"
+ # Sets various combination of override and exclude options
+ log_must eval "zfs recv -F -o atime=off -o '$userprop:dest2'='$userval' "\
+-      "-o quota=123456789 -x compression -x '$userprop:orig' " \
+-      "-x '$userprop:snap3' $dest < $streamfile_incr"
++      "-o quota=123456789 -o checksum=sha512 -x compression "\
++        "-x '$userprop:orig' -x '$userprop:snap3' $dest < $streamfile_incr"
+ # Verify we can correctly override and exclude properties
+ log_must eval "check_prop_source $dest copies 2 received"
+ log_must eval "check_prop_source $dest atime off local"
+ log_must eval "check_prop_source $dest '$userprop:dest2' '$userval' local"
+ log_must eval "check_prop_source $dest quota 123456789 local"
++log_must eval "check_prop_source $dest checksum sha512 local"
+ log_must eval "check_prop_inherit $destsub copies $dest"
+ log_must eval "check_prop_inherit $destsub atime $dest"
++log_must eval "check_prop_inherit $destsub checksum $dest"
+ log_must eval "check_prop_inherit $destsub '$userprop:dest2' $dest"
+ log_must eval "check_prop_source $destsub quota 0 default"
+ log_must eval "check_prop_source $destsub compression off default"
diff --git a/zfs-patches/0023-Fix-arcstat.py-handling-of-unsupported-options.patch b/zfs-patches/0023-Fix-arcstat.py-handling-of-unsupported-options.patch

new file mode 100644 (file)

index 0000000..f5e0832
--- /dev/null
+++ b/zfs-patches/0023-Fix-arcstat.py-handling-of-unsupported-options.patch
@@ -0,0 +1,33 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: LOLi <loli10K@users.noreply.github.com>
+Date: Sat, 18 Aug 2018 22:10:36 +0200
+Subject: [PATCH] Fix arcstat.py handling of unsupported options
+
+This change allows the arcstat.py script to handle unsupported options
+gracefully and print both error and usage messages when one such option
+is provided.
+
+Reviewed-by: Giuseppe Di Natale <guss80@gmail.com>
+Reviewed-by: George Melikov <mail@gmelikov.ru>
+Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Signed-off-by: loli10K <ezomori.nozomu@gmail.com>
+Closes #7799
+
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ cmd/arcstat/arcstat.py | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/cmd/arcstat/arcstat.py b/cmd/arcstat/arcstat.py
+index 85c83ccc..b52a8c29 100755
+--- a/cmd/arcstat/arcstat.py
++++ b/cmd/arcstat/arcstat.py
+@@ -285,7 +285,7 @@ def init():
+             ]
+         )
+     except getopt.error as msg:
+-        sys.stderr.write(msg)
++        sys.stderr.write("Error: %s\n" % str(msg))
+         usage()
+         opts = None
+ 
diff --git a/zfs-patches/0024-Don-t-modify-argv-in-user-tools.patch b/zfs-patches/0024-Don-t-modify-argv-in-user-tools.patch

new file mode 100644 (file)

index 0000000..2162a70
--- /dev/null
+++ b/zfs-patches/0024-Don-t-modify-argv-in-user-tools.patch
@@ -0,0 +1,123 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: DeHackEd <DeHackEd@users.noreply.github.com>
+Date: Mon, 20 Aug 2018 12:55:18 -0400
+Subject: [PATCH] Don't modify argv[] in user tools
+
+argv[] gets modified during string parsing for input arguments. This
+is reflected in the live process listing. Don't do that.
+
+Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
+Reviewed-by: loli10K <ezomori.nozomu@gmail.com>
+Reviewed-by: Giuseppe Di Natale <guss80@gmail.com>
+Reviewed-by: George Melikov <mail@gmelikov.ru>
+Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Signed-off-by: DHE <git@dehacked.net>
+Closes #7760
+
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ cmd/zfs/zfs_main.c     | 18 ++++++++++++++++--
+ cmd/zpool/zpool_main.c | 18 ++++++++++++++++--
+ 2 files changed, 32 insertions(+), 4 deletions(-)
+
+diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c
+index f57df858..275d9c89 100644
+--- a/cmd/zfs/zfs_main.c
++++ b/cmd/zfs/zfs_main.c
+@@ -7041,6 +7041,7 @@ main(int argc, char **argv)
+       int ret = 0;
+       int i = 0;
+       char *cmdname;
++      char **newargv;
+ 
+       (void) setlocale(LC_ALL, "");
+       (void) textdomain(TEXT_DOMAIN);
+@@ -7096,16 +7097,25 @@ main(int argc, char **argv)
+       libzfs_print_on_error(g_zfs, B_TRUE);
+ 
+       /*
++       * Many commands modify input strings for string parsing reasons.
++       * We create a copy to protect the original argv.
++       */
++      newargv = malloc((argc + 1) * sizeof (newargv[0]));
++      for (i = 0; i < argc; i++)
++              newargv[i] = strdup(argv[i]);
++      newargv[argc] = NULL;
++
++      /*
+        * Run the appropriate command.
+        */
+       libzfs_mnttab_cache(g_zfs, B_TRUE);
+       if (find_command_idx(cmdname, &i) == 0) {
+               current_command = &command_table[i];
+-              ret = command_table[i].func(argc - 1, argv + 1);
++              ret = command_table[i].func(argc - 1, newargv + 1);
+       } else if (strchr(cmdname, '=') != NULL) {
+               verify(find_command_idx("set", &i) == 0);
+               current_command = &command_table[i];
+-              ret = command_table[i].func(argc, argv);
++              ret = command_table[i].func(argc, newargv);
+       } else {
+               (void) fprintf(stderr, gettext("unrecognized "
+                   "command '%s'\n"), cmdname);
+@@ -7113,6 +7123,10 @@ main(int argc, char **argv)
+               ret = 1;
+       }
+ 
++      for (i = 0; i < argc; i++)
++              free(newargv[i]);
++      free(newargv);
++
+       if (ret == 0 && log_history)
+               (void) zpool_log_history(g_zfs, history_str);
+ 
+diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
+index 97697011..a4fd0321 100644
+--- a/cmd/zpool/zpool_main.c
++++ b/cmd/zpool/zpool_main.c
+@@ -7971,6 +7971,7 @@ main(int argc, char **argv)
+       int ret = 0;
+       int i = 0;
+       char *cmdname;
++      char **newargv;
+ 
+       (void) setlocale(LC_ALL, "");
+       (void) textdomain(TEXT_DOMAIN);
+@@ -8006,15 +8007,24 @@ main(int argc, char **argv)
+       zfs_save_arguments(argc, argv, history_str, sizeof (history_str));
+ 
+       /*
++       * Many commands modify input strings for string parsing reasons.
++       * We create a copy to protect the original argv.
++       */
++      newargv = malloc((argc + 1) * sizeof (newargv[0]));
++      for (i = 0; i < argc; i++)
++              newargv[i] = strdup(argv[i]);
++      newargv[argc] = NULL;
++
++      /*
+        * Run the appropriate command.
+        */
+       if (find_command_idx(cmdname, &i) == 0) {
+               current_command = &command_table[i];
+-              ret = command_table[i].func(argc - 1, argv + 1);
++              ret = command_table[i].func(argc - 1, newargv + 1);
+       } else if (strchr(cmdname, '=')) {
+               verify(find_command_idx("set", &i) == 0);
+               current_command = &command_table[i];
+-              ret = command_table[i].func(argc, argv);
++              ret = command_table[i].func(argc, newargv);
+       } else if (strcmp(cmdname, "freeze") == 0 && argc == 3) {
+               /*
+                * 'freeze' is a vile debugging abomination, so we treat
+@@ -8031,6 +8041,10 @@ main(int argc, char **argv)
+               ret = 1;
+       }
+ 
++      for (i = 0; i < argc; i++)
++              free(newargv[i]);
++      free(newargv);
++
+       if (ret == 0 && log_history)
+               (void) zpool_log_history(g_zfs, history_str);
+ 
diff --git a/zfs-patches/0025-Add-missing-zfs-dracut-RPM-dependencies.patch b/zfs-patches/0025-Add-missing-zfs-dracut-RPM-dependencies.patch

new file mode 100644 (file)

index 0000000..fbcc19a
--- /dev/null
+++ b/zfs-patches/0025-Add-missing-zfs-dracut-RPM-dependencies.patch
@@ -0,0 +1,42 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Brian Behlendorf <behlendorf1@llnl.gov>
+Date: Tue, 31 Jul 2018 10:17:44 -0700
+Subject: [PATCH] Add missing zfs-dracut RPM dependencies
+
+The zfs-dracut package requires the hostid, basename, head, awk,
+and grep utilities be installed.  The first three are provided by
+coreutils but additional dependencies are required for awk and grep.
+
+Reviewed-by: Manuel Amador (Rudd-O) <rudd-o@rudd-o.com>
+Reviewed-by: Tony Hutter <hutter2@llnl.gov>
+Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Closes #7729
+Closes #7747
+
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ rpm/generic/zfs.spec.in | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in
+index 5b89db02..398221c6 100644
+--- a/rpm/generic/zfs.spec.in
++++ b/rpm/generic/zfs.spec.in
+@@ -196,7 +196,7 @@ Requires:       acl
+ Requires:       sudo
+ Requires:       sysstat
+ Requires:       rng-tools
+-Requires:     libaio
++Requires:       libaio
+ AutoReqProv:    no
+ 
+ %description test
+@@ -208,6 +208,8 @@ Summary:        Dracut module
+ Group:          System Environment/Kernel
+ Requires:       %{name}%{?_isa} = %{version}-%{release}
+ Requires:       dracut
++Requires:       /usr/bin/awk
++Requires:       grep
+ 
+ %description dracut
+ This package contains a dracut module used to construct an initramfs
diff --git a/zfs-patches/0026-Add-libaio-devel-BuildRequires.patch b/zfs-patches/0026-Add-libaio-devel-BuildRequires.patch

new file mode 100644 (file)

index 0000000..a68148a
--- /dev/null
+++ b/zfs-patches/0026-Add-libaio-devel-BuildRequires.patch
@@ -0,0 +1,31 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Brian Behlendorf <behlendorf1@llnl.gov>
+Date: Thu, 23 Aug 2018 09:34:34 -0700
+Subject: [PATCH] Add libaio-devel BuildRequires
+
+The zfs-test package needs a build requirement on the libaio-devel
+package.  Without it ./configure will correctly determine that
+mmap_libaio cannot be built and it will be skipped.
+
+Reviewed-by: George Melikov <mail@gmelikov.ru>
+Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Closes #7821
+Closes #7824
+
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ rpm/generic/zfs.spec.in | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in
+index 398221c6..16c5780b 100644
+--- a/rpm/generic/zfs.spec.in
++++ b/rpm/generic/zfs.spec.in
+@@ -197,6 +197,7 @@ Requires:       sudo
+ Requires:       sysstat
+ Requires:       rng-tools
+ Requires:       libaio
++BuildRequires:  libaio-devel
+ AutoReqProv:    no
+ 
+ %description test
diff --git a/zfs-patches/0027-Fix-libaio-devel-requirement-for-Debian-based-distri.patch b/zfs-patches/0027-Fix-libaio-devel-requirement-for-Debian-based-distri.patch

new file mode 100644 (file)

index 0000000..88e0c37
--- /dev/null
+++ b/zfs-patches/0027-Fix-libaio-devel-requirement-for-Debian-based-distri.patch
@@ -0,0 +1,36 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: LOLi <loli10K@users.noreply.github.com>
+Date: Sun, 26 Aug 2018 21:43:27 +0200
+Subject: [PATCH] Fix libaio-devel requirement for Debian-based distributions
+
+BuildRequires tags for "-devel" packages in the RPM spec file do not
+work when building on Debian-based distributions.
+
+Fix this issue by making this requirement conditional to RPM-based
+distributions.
+
+Reviewed-by: George Melikov <mail@gmelikov.ru>
+Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Signed-off-by: loli10K <ezomori.nozomu@gmail.com>
+Closes #7829
+Closes #7831
+
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ rpm/generic/zfs.spec.in | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in
+index 16c5780b..22565725 100644
+--- a/rpm/generic/zfs.spec.in
++++ b/rpm/generic/zfs.spec.in
+@@ -197,7 +197,9 @@ Requires:       sudo
+ Requires:       sysstat
+ Requires:       rng-tools
+ Requires:       libaio
++%if 0%{?rhel}%{?fedora}%{?suse_version}
+ BuildRequires:  libaio-devel
++%endif
+ AutoReqProv:    no
+ 
+ %description test
diff --git a/zfs-patches/0028-Fedora-28-Fix-misc-bounds-check-compiler-warnings.patch b/zfs-patches/0028-Fedora-28-Fix-misc-bounds-check-compiler-warnings.patch

new file mode 100644 (file)

index 0000000..e49a99b
--- /dev/null
+++ b/zfs-patches/0028-Fedora-28-Fix-misc-bounds-check-compiler-warnings.patch
@@ -0,0 +1,61 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Joao Carlos Mendes Luis <dioni21@users.noreply.github.com>
+Date: Sun, 26 Aug 2018 16:55:44 -0300
+Subject: [PATCH] Fedora 28: Fix misc bounds check compiler warnings
+
+Fix a bunch of truncation compiler warnings that show up
+on Fedora 28 (GCC 8.0.1).
+
+Reviewed-by: Giuseppe Di Natale <guss80@gmail.com>
+Reviewed-by: George Melikov <mail@gmelikov.ru>
+Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Issue #7368
+Closes #7826
+Closes #7830
+
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ lib/libshare/smb.c                             | 2 +-
+ module/icp/core/kcf_mech_tabs.c                | 2 +-
+ tests/zfs-tests/tests/functional/ctime/ctime.c | 2 +-
+ 3 files changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/lib/libshare/smb.c b/lib/libshare/smb.c
+index 76145bd9..91d4decb 100644
+--- a/lib/libshare/smb.c
++++ b/lib/libshare/smb.c
+@@ -218,7 +218,7 @@ smb_enable_share_one(const char *sharename, const char *sharepath)
+       int rc;
+ 
+       /* Support ZFS share name regexp '[[:alnum:]_-.: ]' */
+-      strncpy(name, sharename, sizeof (name));
++      strlcpy(name, sharename, sizeof (name));
+       name [sizeof (name)-1] = '\0';
+ 
+       pos = name;
+diff --git a/module/icp/core/kcf_mech_tabs.c b/module/icp/core/kcf_mech_tabs.c
+index 723bfdb6..741dae7a 100644
+--- a/module/icp/core/kcf_mech_tabs.c
++++ b/module/icp/core/kcf_mech_tabs.c
+@@ -321,7 +321,7 @@ kcf_create_mech_entry(kcf_ops_class_t class, char *mechname)
+               mutex_enter(&(me_tab[i].me_mutex));
+               if (me_tab[i].me_name[0] == 0) {
+                       /* Found an empty spot */
+-                      (void) strncpy(me_tab[i].me_name, mechname,
++                      (void) strlcpy(me_tab[i].me_name, mechname,
+                           CRYPTO_MAX_MECH_NAME);
+                       me_tab[i].me_name[CRYPTO_MAX_MECH_NAME-1] = '\0';
+                       me_tab[i].me_mechid = KCF_MECHID(class, i);
+diff --git a/tests/zfs-tests/tests/functional/ctime/ctime.c b/tests/zfs-tests/tests/functional/ctime/ctime.c
+index ba8af15f..1cd18323 100644
+--- a/tests/zfs-tests/tests/functional/ctime/ctime.c
++++ b/tests/zfs-tests/tests/functional/ctime/ctime.c
+@@ -155,7 +155,7 @@ do_link(const char *pfile)
+               return (-1);
+       }
+ 
+-      strncpy(pfile_copy, pfile, sizeof (pfile_copy));
++      strncpy(pfile_copy, pfile, sizeof (pfile_copy)-1);
+       pfile_copy[sizeof (pfile_copy) - 1] = '\0';
+       /*
+        * Figure out source file directory name, and create
diff --git a/zfs-patches/0029-Fix-problems-receiving-reallocated-dnodes.patch b/zfs-patches/0029-Fix-problems-receiving-reallocated-dnodes.patch

new file mode 100644 (file)

index 0000000..a504099
--- /dev/null
+++ b/zfs-patches/0029-Fix-problems-receiving-reallocated-dnodes.patch
@@ -0,0 +1,556 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Tim Chase <tim@chase2k.com>
+Date: Mon, 27 Aug 2018 10:28:32 -0400
+Subject: [PATCH] Fix problems receiving reallocated dnodes
+
+This is a port of 047116ac - Raw sends must be able to decrease nlevels,
+to the zfs-0.7-stable branch.  It includes the various fixes to the
+problem of receiving incremental streams which include reallocated dnodes
+in which the number of dnode slots has changed but excludes the parts
+which are related to raw streams.
+
+From 047116ac:
+
+    Currently, when a raw zfs send file includes a
+    DRR_OBJECT record that would decrease the number of
+    levels of an existing object, the object is reallocated
+    with dmu_object_reclaim() which creates the new dnode
+    using the old object's nlevels. For non-raw sends this
+    doesn't really matter, but raw sends require that
+    nlevels on the receive side match that of the send
+    side so that the checksum-of-MAC tree can be properly
+    maintained. This patch corrects the issue by freeing
+    the object completely before allocating it again in
+    this case.
+
+    This patch also corrects several issues with
+    dnode_hold_impl() and related functions that prevented
+    dnodes (particularly multi-slot dnodes) from being
+    reallocated properly due to the fact that existing
+    dnodes were not being fully cleaned up when they
+    were freed.
+
+    This patch adds a test to make sure that zfs recv
+    functions properly with incremental streams containing
+    dnodes of different sizes.
+
+This also includes a one-liner fix from loli10K to fix a test failure:
+https://github.com/zfsonlinux/zfs/pull/7792#discussion_r212769264
+
+Authored-by: Tom Caputi <tcaputi@datto.com>
+Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
+Reviewed-by: Jorgen Lundman <lundman@lundman.net>
+Signed-off-by: Tom Caputi <tcaputi@datto.com>
+Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Signed-off-by: Tim Chase <tim@chase2k.com>
+Ported-by: Tim Chase <tim@chase2k.com>
+
+Closes #6821
+Closes #6864
+
+NOTE: This is the first of the port of 3 related patches to the
+zfs-0.7-release branch of ZoL.  The other two patches should immediately
+follow this one.
+
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ cmd/ztest/ztest.c                                  | 25 +++++-
+ include/sys/dnode.h                                |  6 ++
+ lib/libzfs/libzfs_sendrecv.c                       |  1 +
+ module/zfs/dmu_object.c                            |  1 -
+ module/zfs/dmu_send.c                              | 51 +++++++++--
+ module/zfs/dnode.c                                 | 84 +++++++++++++++++--
+ module/zfs/dnode_sync.c                            |  2 +
+ tests/runfiles/linux.run                           |  2 +-
+ tests/zfs-tests/tests/functional/rsend/Makefile.am |  3 +-
+ .../functional/rsend/send_realloc_dnode_size.ksh   | 98 ++++++++++++++++++++++
+ 10 files changed, 258 insertions(+), 15 deletions(-)
+ create mode 100644 tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh
+
+diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
+index 1a320b03..a410eeef 100644
+--- a/cmd/ztest/ztest.c
++++ b/cmd/ztest/ztest.c
+@@ -197,7 +197,8 @@ extern uint64_t metaslab_gang_bang;
+ extern uint64_t metaslab_df_alloc_threshold;
+ extern int metaslab_preload_limit;
+ extern boolean_t zfs_compressed_arc_enabled;
+-extern int  zfs_abd_scatter_enabled;
++extern int zfs_abd_scatter_enabled;
++extern int dmu_object_alloc_chunk_shift;
+ 
+ static ztest_shared_opts_t *ztest_shared_opts;
+ static ztest_shared_opts_t ztest_opts;
+@@ -310,6 +311,7 @@ static ztest_shared_callstate_t *ztest_shared_callstate;
+ ztest_func_t ztest_dmu_read_write;
+ ztest_func_t ztest_dmu_write_parallel;
+ ztest_func_t ztest_dmu_object_alloc_free;
++ztest_func_t ztest_dmu_object_next_chunk;
+ ztest_func_t ztest_dmu_commit_callbacks;
+ ztest_func_t ztest_zap;
+ ztest_func_t ztest_zap_parallel;
+@@ -357,6 +359,7 @@ ztest_info_t ztest_info[] = {
+       ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always),
+       ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always),
+       ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always),
++      ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes),
+       ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always),
+       ZTI_INIT(ztest_zap, 30, &zopt_always),
+       ZTI_INIT(ztest_zap_parallel, 100, &zopt_always),
+@@ -3927,6 +3930,26 @@ ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
+       umem_free(od, size);
+ }
+ 
++/*
++ * Rewind the global allocator to verify object allocation backfilling.
++ */
++void
++ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id)
++{
++      objset_t *os = zd->zd_os;
++      int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
++      uint64_t object;
++
++      /*
++       * Rewind the global allocator randomly back to a lower object number
++       * to force backfilling and reclamation of recently freed dnodes.
++       */
++      mutex_enter(&os->os_obj_lock);
++      object = ztest_random(os->os_obj_next_chunk);
++      os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk);
++      mutex_exit(&os->os_obj_lock);
++}
++
+ #undef OD_ARRAY_SIZE
+ #define       OD_ARRAY_SIZE   2
+ 
+diff --git a/include/sys/dnode.h b/include/sys/dnode.h
+index c7efe559..ea7defe1 100644
+--- a/include/sys/dnode.h
++++ b/include/sys/dnode.h
+@@ -360,6 +360,7 @@ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off,
+     int minlvl, uint64_t blkfill, uint64_t txg);
+ void dnode_evict_dbufs(dnode_t *dn);
+ void dnode_evict_bonus(dnode_t *dn);
++void dnode_free_interior_slots(dnode_t *dn);
+ 
+ #define       DNODE_IS_CACHEABLE(_dn)                                         \
+       ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL ||         \
+@@ -454,6 +455,11 @@ typedef struct dnode_stats {
+        */
+       kstat_named_t dnode_hold_free_txg;
+       /*
++       * Number of times dnode_free_interior_slots() needed to retry
++       * acquiring a slot zrl lock due to contention.
++       */
++      kstat_named_t dnode_free_interior_lock_retry;
++      /*
+        * Number of new dnodes allocated by dnode_allocate().
+        */
+       kstat_named_t dnode_allocate;
+diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c
+index c5acd21a..cadf16cc 100644
+--- a/lib/libzfs/libzfs_sendrecv.c
++++ b/lib/libzfs/libzfs_sendrecv.c
+@@ -3577,6 +3577,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
+               }
+ 
+               newfs = B_TRUE;
++              *cp = '/';
+       }
+ 
+       if (flags->verbose) {
+diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c
+index e7412b75..f53da407 100644
+--- a/module/zfs/dmu_object.c
++++ b/module/zfs/dmu_object.c
+@@ -275,7 +275,6 @@ dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
+       return (err);
+ }
+ 
+-
+ int
+ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
+ {
+diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
+index cdbc1cd1..148b5ff8 100644
+--- a/module/zfs/dmu_send.c
++++ b/module/zfs/dmu_send.c
+@@ -2156,10 +2156,8 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
+       }
+ 
+       err = dmu_object_info(rwa->os, drro->drr_object, &doi);
+-
+-      if (err != 0 && err != ENOENT)
++      if (err != 0 && err != ENOENT && err != EEXIST)
+               return (SET_ERROR(EINVAL));
+-      object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT;
+ 
+       if (drro->drr_object > rwa->max_object)
+               rwa->max_object = drro->drr_object;
+@@ -2175,13 +2173,56 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
+               nblkptr = deduce_nblkptr(drro->drr_bonustype,
+                   drro->drr_bonuslen);
+ 
++              object = drro->drr_object;
++
+               if (drro->drr_blksz != doi.doi_data_block_size ||
+-                  nblkptr < doi.doi_nblkptr) {
++                  nblkptr < doi.doi_nblkptr ||
++                  drro->drr_dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) {
+                       err = dmu_free_long_range(rwa->os, drro->drr_object,
+                           0, DMU_OBJECT_END);
+                       if (err != 0)
+                               return (SET_ERROR(EINVAL));
+               }
++      } else if (err == EEXIST) {
++              /*
++               * The object requested is currently an interior slot of a
++               * multi-slot dnode. This will be resolved when the next txg
++               * is synced out, since the send stream will have told us
++               * to free this slot when we freed the associated dnode
++               * earlier in the stream.
++               */
++              txg_wait_synced(dmu_objset_pool(rwa->os), 0);
++              object = drro->drr_object;
++      } else {
++              /* object is free and we are about to allocate a new one */
++              object = DMU_NEW_OBJECT;
++      }
++
++      /*
++       * If this is a multi-slot dnode there is a chance that this
++       * object will expand into a slot that is already used by
++       * another object from the previous snapshot. We must free
++       * these objects before we attempt to allocate the new dnode.
++       */
++      if (drro->drr_dn_slots > 1) {
++              for (uint64_t slot = drro->drr_object + 1;
++                  slot < drro->drr_object + drro->drr_dn_slots;
++                  slot++) {
++                      dmu_object_info_t slot_doi;
++
++                      err = dmu_object_info(rwa->os, slot, &slot_doi);
++                      if (err == ENOENT || err == EEXIST)
++                              continue;
++                      else if (err != 0)
++                              return (err);
++
++                      err = dmu_free_long_object(rwa->os, slot);
++
++                      if (err != 0)
++                              return (err);
++              }
++
++              txg_wait_synced(dmu_objset_pool(rwa->os), 0);
+       }
+ 
+       tx = dmu_tx_create(rwa->os);
+@@ -2732,7 +2773,7 @@ receive_read_record(struct receive_arg *ra)
+                * See receive_read_prefetch for an explanation why we're
+                * storing this object in the ignore_obj_list.
+                */
+-              if (err == ENOENT ||
++              if (err == ENOENT || err == EEXIST ||
+                   (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
+                       objlist_insert(&ra->ignore_objlist, drro->drr_object);
+                       err = 0;
+diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
+index e05a4d0a..df6a4872 100644
+--- a/module/zfs/dnode.c
++++ b/module/zfs/dnode.c
+@@ -55,6 +55,7 @@ dnode_stats_t dnode_stats = {
+       { "dnode_hold_free_overflow",           KSTAT_DATA_UINT64 },
+       { "dnode_hold_free_refcount",           KSTAT_DATA_UINT64 },
+       { "dnode_hold_free_txg",                KSTAT_DATA_UINT64 },
++      { "dnode_free_interior_lock_retry",     KSTAT_DATA_UINT64 },
+       { "dnode_allocate",                     KSTAT_DATA_UINT64 },
+       { "dnode_reallocate",                   KSTAT_DATA_UINT64 },
+       { "dnode_buf_evict",                    KSTAT_DATA_UINT64 },
+@@ -516,7 +517,8 @@ dnode_destroy(dnode_t *dn)
+       mutex_exit(&os->os_lock);
+ 
+       /* the dnode can no longer move, so we can release the handle */
+-      zrl_remove(&dn->dn_handle->dnh_zrlock);
++      if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock))
++              zrl_remove(&dn->dn_handle->dnh_zrlock);
+ 
+       dn->dn_allocated_txg = 0;
+       dn->dn_free_txg = 0;
+@@ -662,6 +664,8 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
+           DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
+ 
+       dn_slots = dn_slots > 0 ? dn_slots : DNODE_MIN_SLOTS;
++
++      dnode_free_interior_slots(dn);
+       DNODE_STAT_BUMP(dnode_reallocate);
+ 
+       /* clean up any unreferenced dbufs */
+@@ -1062,19 +1066,73 @@ dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr)
+ }
+ 
+ static boolean_t
+-dnode_check_slots(dnode_children_t *children, int idx, int slots, void *ptr)
++dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
+ {
+       ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+ 
+       for (int i = idx; i < idx + slots; i++) {
+               dnode_handle_t *dnh = &children->dnc_children[i];
+-              if (dnh->dnh_dnode != ptr)
++              dnode_t *dn = dnh->dnh_dnode;
++
++              if (dn == DN_SLOT_FREE) {
++                      continue;
++              } else if (DN_SLOT_IS_PTR(dn)) {
++                      mutex_enter(&dn->dn_mtx);
++                      dmu_object_type_t type = dn->dn_type;
++                      mutex_exit(&dn->dn_mtx);
++
++                      if (type != DMU_OT_NONE)
++                              return (B_FALSE);
++
++                      continue;
++              } else {
+                       return (B_FALSE);
++              }
++
++              return (B_FALSE);
+       }
+ 
+       return (B_TRUE);
+ }
+ 
++static void
++dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
++{
++      ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
++
++      for (int i = idx; i < idx + slots; i++) {
++              dnode_handle_t *dnh = &children->dnc_children[i];
++
++              ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
++
++              if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
++                      ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
++                      dnode_destroy(dnh->dnh_dnode);
++                      dnh->dnh_dnode = DN_SLOT_FREE;
++              }
++      }
++}
++
++void
++dnode_free_interior_slots(dnode_t *dn)
++{
++      dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db);
++      int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT;
++      int idx = (dn->dn_object & (epb - 1)) + 1;
++      int slots = dn->dn_num_slots - 1;
++
++      if (slots == 0)
++              return;
++
++      ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
++
++      while (!dnode_slots_tryenter(children, idx, slots))
++              DNODE_STAT_BUMP(dnode_free_interior_lock_retry);
++
++      dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
++      dnode_slots_rele(children, idx, slots);
++}
++
+ void
+ dnode_special_close(dnode_handle_t *dnh)
+ {
+@@ -1355,7 +1413,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
+               while (dn == DN_SLOT_UNINIT) {
+                       dnode_slots_hold(dnc, idx, slots);
+ 
+-                      if (!dnode_check_slots(dnc, idx, slots, DN_SLOT_FREE)) {
++                      if (!dnode_check_slots_free(dnc, idx, slots)) {
+                               DNODE_STAT_BUMP(dnode_hold_free_misses);
+                               dnode_slots_rele(dnc, idx, slots);
+                               dbuf_rele(db, FTAG);
+@@ -1368,15 +1426,29 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
+                               continue;
+                       }
+ 
+-                      if (!dnode_check_slots(dnc, idx, slots, DN_SLOT_FREE)) {
++                      if (!dnode_check_slots_free(dnc, idx, slots)) {
+                               DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
+                               dnode_slots_rele(dnc, idx, slots);
+                               dbuf_rele(db, FTAG);
+                               return (SET_ERROR(ENOSPC));
+                       }
+ 
++                      /*
++                       * Allocated but otherwise free dnodes which would
++                       * be in the interior of a multi-slot dnode need
++                       * to be freed.  Single slot dnodes can be safely
++                       * re-purposed as a performance optimization.
++                       */
++                      if (slots > 1)
++                              dnode_reclaim_slots(dnc, idx + 1, slots - 1);
++
+                       dnh = &dnc->dnc_children[idx];
+-                      dn = dnode_create(os, dn_block + idx, db, object, dnh);
++                      if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
++                              dn = dnh->dnh_dnode;
++                      } else {
++                              dn = dnode_create(os, dn_block + idx, db,
++                                  object, dnh);
++                      }
+               }
+ 
+               mutex_enter(&dn->dn_mtx);
+diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c
+index 742d962b..8d65e385 100644
+--- a/module/zfs/dnode_sync.c
++++ b/module/zfs/dnode_sync.c
+@@ -533,6 +533,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
+       if (dn->dn_allocated_txg != dn->dn_free_txg)
+               dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
+       bzero(dn->dn_phys, sizeof (dnode_phys_t) * dn->dn_num_slots);
++      dnode_free_interior_slots(dn);
+ 
+       mutex_enter(&dn->dn_mtx);
+       dn->dn_type = DMU_OT_NONE;
+@@ -540,6 +541,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
+       dn->dn_allocated_txg = 0;
+       dn->dn_free_txg = 0;
+       dn->dn_have_spill = B_FALSE;
++      dn->dn_num_slots = 1;
+       mutex_exit(&dn->dn_mtx);
+ 
+       ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
+index 69e9eb26..d8fe6f3a 100644
+--- a/tests/runfiles/linux.run
++++ b/tests/runfiles/linux.run
+@@ -605,7 +605,7 @@ tests = ['rsend_001_pos', 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos',
+     'send-c_lz4_disabled', 'send-c_recv_lz4_disabled',
+     'send-c_mixed_compression', 'send-c_stream_size_estimate', 'send-cD',
+     'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize',
+-    'send-c_recv_dedup', 'send_freeobjects']
++    'send-c_recv_dedup', 'send_freeobjects', 'send_realloc_dnode_size']
+ tags = ['functional', 'rsend']
+ 
+ [tests/functional/scrub_mirror]
+diff --git a/tests/zfs-tests/tests/functional/rsend/Makefile.am b/tests/zfs-tests/tests/functional/rsend/Makefile.am
+index 6b1aa8b3..a2837d1a 100644
+--- a/tests/zfs-tests/tests/functional/rsend/Makefile.am
++++ b/tests/zfs-tests/tests/functional/rsend/Makefile.am
+@@ -36,7 +36,8 @@ dist_pkgdata_SCRIPTS = \
+       send-c_volume.ksh \
+       send-c_zstreamdump.ksh \
+       send-cpL_varied_recsize.ksh \
+-      send_freeobjects.ksh
++      send_freeobjects.ksh \
++      send_realloc_dnode_size.ksh
+ 
+ dist_pkgdata_DATA = \
+       rsend.cfg \
+diff --git a/tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh b/tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh
+new file mode 100644
+index 00000000..20676394
+--- /dev/null
++++ b/tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh
+@@ -0,0 +1,98 @@
++#!/bin/ksh
++
++#
++# This file and its contents are supplied under the terms of the
++# Common Development and Distribution License ("CDDL"), version 1.0.
++# You may only use this file in accordance with the terms of version
++# 1.0 of the CDDL.
++#
++# A full copy of the text of the CDDL should have accompanied this
++# source.  A copy of the CDDL is also available via the Internet at
++# http://www.illumos.org/license/CDDL.
++#
++
++#
++# Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
++#
++
++. $STF_SUITE/include/libtest.shlib
++. $STF_SUITE/tests/functional/rsend/rsend.kshlib
++
++#
++# Description:
++# Verify incremental receive properly handles objects with changed
++# dnode slot count.
++#
++# Strategy:
++# 1. Populate a dataset with 1k byte dnodes and snapshot
++# 2. Remove objects, set dnodesize=legacy, and remount dataset so new objects
++#    get recycled numbers and formerly "interior" dnode slots get assigned
++#    to new objects
++# 3. Remove objects, set dnodesize=2k, and remount dataset so new objects
++#    overlap with recently recycled and formerly "normal" dnode slots get
++#    assigned to new objects
++# 4. Generate initial and incremental streams
++# 5. Verify initial and incremental streams can be received
++#
++
++verify_runnable "both"
++
++log_assert "Verify incremental receive handles objects with changed dnode size"
++
++function cleanup
++{
++      rm -f $BACKDIR/fs-dn-legacy
++      rm -f $BACKDIR/fs-dn-1k
++      rm -f $BACKDIR/fs-dn-2k
++
++      if datasetexists $POOL/fs ; then
++              log_must zfs destroy -rR $POOL/fs
++      fi
++
++      if datasetexists $POOL/newfs ; then
++              log_must zfs destroy -rR $POOL/newfs
++      fi
++}
++
++log_onexit cleanup
++
++# 1. Populate a dataset with 1k byte dnodes and snapshot
++log_must zfs create -o dnodesize=1k $POOL/fs
++log_must mk_files 200 262144 0 $POOL/fs
++log_must zfs snapshot $POOL/fs@a
++
++# 2. Remove objects, set dnodesize=legacy, and remount dataset so new objects
++#    get recycled numbers and formerly "interior" dnode slots get assigned
++#    to new objects
++rm /$POOL/fs/*
++
++log_must zfs unmount $POOL/fs
++log_must zfs set dnodesize=legacy $POOL/fs
++log_must zfs mount $POOL/fs
++
++log_must mk_files 200 262144 0 $POOL/fs
++log_must zfs snapshot $POOL/fs@b
++
++# 3. Remove objects, set dnodesize=2k, and remount dataset so new objects
++#    overlap with recently recycled and formerly "normal" dnode slots get
++#    assigned to new objects
++rm /$POOL/fs/*
++
++log_must zfs unmount $POOL/fs
++log_must zfs set dnodesize=2k $POOL/fs
++log_must zfs mount $POOL/fs
++
++mk_files 200 262144 0 $POOL/fs
++log_must zfs snapshot $POOL/fs@c
++
++# 4. Generate initial and incremental streams
++log_must eval "zfs send $POOL/fs@a > $BACKDIR/fs-dn-1k"
++log_must eval "zfs send -i $POOL/fs@a $POOL/fs@b > $BACKDIR/fs-dn-legacy"
++log_must eval "zfs send -i $POOL/fs@b $POOL/fs@c > $BACKDIR/fs-dn-2k"
++
++# 5. Verify initial and incremental streams can be received
++log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs-dn-1k"
++log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs-dn-legacy"
++log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs-dn-2k"
++
++log_pass "Verify incremental receive handles objects with changed dnode size"
diff --git a/zfs-patches/0030-Fix-object-reclaim-when-using-large-dnodes.patch b/zfs-patches/0030-Fix-object-reclaim-when-using-large-dnodes.patch

new file mode 100644 (file)

index 0000000..fd5abd1
--- /dev/null
+++ b/zfs-patches/0030-Fix-object-reclaim-when-using-large-dnodes.patch
@@ -0,0 +1,134 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Tom Caputi <tcaputi@datto.com>
+Date: Tue, 17 Apr 2018 14:13:57 -0400
+Subject: [PATCH] Fix object reclaim when using large dnodes
+
+Currently, when the receive_object() code wants to reclaim an
+object, it always assumes that the dnode is the legacy 512 bytes,
+even when the incoming bonus buffer exceeds this length. This
+causes a buffer overflow if --enable-debug is not provided and
+triggers an ASSERT if it is. This patch resolves this issue and
+adds an ASSERT to ensure this can't happen again.
+
+Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Signed-off-by: Tom Caputi <tcaputi@datto.com>
+Closes #7097
+Closes #7433
+
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ module/zfs/dmu_object.c                             |  2 +-
+ module/zfs/dmu_send.c                               |  5 +++--
+ module/zfs/dnode.c                                  |  3 +--
+ .../functional/rsend/send_realloc_dnode_size.ksh    | 21 +++++++++++++++++----
+ 4 files changed, 22 insertions(+), 9 deletions(-)
+ mode change 100644 => 100755 tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh
+
+diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c
+index f53da407..1fc71d10 100644
+--- a/module/zfs/dmu_object.c
++++ b/module/zfs/dmu_object.c
+@@ -249,7 +249,7 @@ dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+     int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+ {
+       return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
+-          bonuslen, 0, tx));
++          bonuslen, DNODE_MIN_SIZE, tx));
+ }
+ 
+ int
+diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
+index 148b5ff8..1de0f316 100644
+--- a/module/zfs/dmu_send.c
++++ b/module/zfs/dmu_send.c
+@@ -2244,9 +2244,10 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
+           drro->drr_bonustype != doi.doi_bonus_type ||
+           drro->drr_bonuslen != doi.doi_bonus_size) {
+               /* currently allocated, but with different properties */
+-              err = dmu_object_reclaim(rwa->os, drro->drr_object,
++              err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object,
+                   drro->drr_type, drro->drr_blksz,
+-                  drro->drr_bonustype, drro->drr_bonuslen, tx);
++                  drro->drr_bonustype, drro->drr_bonuslen,
++                  drro->drr_dn_slots << DNODE_SHIFT, tx);
+       }
+       if (err != 0) {
+               dmu_tx_commit(tx);
+diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
+index df6a4872..d465b545 100644
+--- a/module/zfs/dnode.c
++++ b/module/zfs/dnode.c
+@@ -662,8 +662,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
+       ASSERT(DMU_OT_IS_VALID(bonustype));
+       ASSERT3U(bonuslen, <=,
+           DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
+-
+-      dn_slots = dn_slots > 0 ? dn_slots : DNODE_MIN_SLOTS;
++      ASSERT3U(bonuslen, <=, DN_BONUS_SIZE(dn_slots << DNODE_SHIFT));
+ 
+       dnode_free_interior_slots(dn);
+       DNODE_STAT_BUMP(dnode_reallocate);
+diff --git a/tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh b/tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh
+old mode 100644
+new mode 100755
+index 20676394..12a72fa0
+--- a/tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh
++++ b/tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh
+@@ -13,6 +13,7 @@
+ 
+ #
+ # Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
++# Copyright (c) 2018 Datto Inc.
+ #
+ 
+ . $STF_SUITE/include/libtest.shlib
+@@ -31,8 +32,10 @@
+ # 3. Remove objects, set dnodesize=2k, and remount dataset so new objects
+ #    overlap with recently recycled and formerly "normal" dnode slots get
+ #    assigned to new objects
+-# 4. Generate initial and incremental streams
+-# 5. Verify initial and incremental streams can be received
++# 4. Create an empty file and add xattrs to it to exercise reclaiming a
++#    dnode that requires more than 1 slot for its bonus buffer (Zol #7433)
++# 5. Generate initial and incremental streams
++# 6. Verify initial and incremental streams can be received
+ #
+ 
+ verify_runnable "both"
+@@ -44,6 +47,7 @@ function cleanup
+       rm -f $BACKDIR/fs-dn-legacy
+       rm -f $BACKDIR/fs-dn-1k
+       rm -f $BACKDIR/fs-dn-2k
++      rm -f $BACKDIR/fs-attr
+ 
+       if datasetexists $POOL/fs ; then
+               log_must zfs destroy -rR $POOL/fs
+@@ -82,17 +86,26 @@ log_must zfs unmount $POOL/fs
+ log_must zfs set dnodesize=2k $POOL/fs
+ log_must zfs mount $POOL/fs
+ 
++log_must touch /$POOL/fs/attrs
+ mk_files 200 262144 0 $POOL/fs
+ log_must zfs snapshot $POOL/fs@c
+ 
+-# 4. Generate initial and incremental streams
++# 4. Create an empty file and add xattrs to it to exercise reclaiming a
++#    dnode that requires more than 1 slot for its bonus buffer (Zol #7433)
++log_must zfs set compression=on xattr=sa $POOL/fs
++log_must eval "python -c 'print \"a\" * 512' | attr -s bigval /$POOL/fs/attrs"
++log_must zfs snapshot $POOL/fs@d
++
++# 5. Generate initial and incremental streams
+ log_must eval "zfs send $POOL/fs@a > $BACKDIR/fs-dn-1k"
+ log_must eval "zfs send -i $POOL/fs@a $POOL/fs@b > $BACKDIR/fs-dn-legacy"
+ log_must eval "zfs send -i $POOL/fs@b $POOL/fs@c > $BACKDIR/fs-dn-2k"
++log_must eval "zfs send -i $POOL/fs@c $POOL/fs@d > $BACKDIR/fs-attr"
+ 
+-# 5. Verify initial and incremental streams can be received
++# 6. Verify initial and incremental streams can be received
+ log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs-dn-1k"
+ log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs-dn-legacy"
+ log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs-dn-2k"
++log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs-attr"
+ 
+ log_pass "Verify incremental receive handles objects with changed dnode size"
diff --git a/zfs-patches/0031-Fix-zfs-recv-of-non-large_dnode-send-streams.patch b/zfs-patches/0031-Fix-zfs-recv-of-non-large_dnode-send-streams.patch

new file mode 100644 (file)

index 0000000..c5a749e
--- /dev/null
+++ b/zfs-patches/0031-Fix-zfs-recv-of-non-large_dnode-send-streams.patch
@@ -0,0 +1,124 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Tom Caputi <tcaputi@datto.com>
+Date: Thu, 28 Jun 2018 17:55:11 -0400
+Subject: [PATCH] Fix 'zfs recv' of non large_dnode send streams
+
+Currently, there is a bug where older send streams without the
+DMU_BACKUP_FEATURE_LARGE_DNODE flag are not handled correctly.
+The code in receive_object() fails to handle cases where
+drro->drr_dn_slots is set to 0, which is always the case when the
+sending code does not support this feature flag. This patch fixes
+the issue by ensuring that a value of 0 is treated as
+DNODE_MIN_SLOTS.
+
+Tested-by:  DHE <git@dehacked.net>
+Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Signed-off-by: Tom Caputi <tcaputi@datto.com>
+Closes #7617
+Closes #7662
+
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ module/zfs/dmu_object.c |  3 +++
+ module/zfs/dmu_send.c   | 33 +++++++++++++++++++++++++++------
+ 2 files changed, 30 insertions(+), 6 deletions(-)
+
+diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c
+index 1fc71d10..40c25362 100644
+--- a/module/zfs/dmu_object.c
++++ b/module/zfs/dmu_object.c
+@@ -261,6 +261,9 @@ dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
+       int dn_slots = dnodesize >> DNODE_SHIFT;
+       int err;
+ 
++      if (dn_slots == 0)
++              dn_slots = DNODE_MIN_SLOTS;
++
+       if (object == DMU_META_DNODE_OBJECT)
+               return (SET_ERROR(EBADF));
+ 
+diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
+index 1de0f316..13aae960 100644
+--- a/module/zfs/dmu_send.c
++++ b/module/zfs/dmu_send.c
+@@ -2139,6 +2139,8 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
+       dmu_tx_t *tx;
+       uint64_t object;
+       int err;
++      uint8_t dn_slots = drro->drr_dn_slots != 0 ?
++          drro->drr_dn_slots : DNODE_MIN_SLOTS;
+ 
+       if (drro->drr_type == DMU_OT_NONE ||
+           !DMU_OT_IS_VALID(drro->drr_type) ||
+@@ -2150,7 +2152,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
+           drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) ||
+           drro->drr_bonuslen >
+           DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os))) ||
+-          drro->drr_dn_slots >
++          dn_slots >
+           (spa_maxdnodesize(dmu_objset_spa(rwa->os)) >> DNODE_SHIFT))  {
+               return (SET_ERROR(EINVAL));
+       }
+@@ -2177,12 +2179,31 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
+ 
+               if (drro->drr_blksz != doi.doi_data_block_size ||
+                   nblkptr < doi.doi_nblkptr ||
+-                  drro->drr_dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) {
++                  dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) {
+                       err = dmu_free_long_range(rwa->os, drro->drr_object,
+                           0, DMU_OBJECT_END);
+                       if (err != 0)
+                               return (SET_ERROR(EINVAL));
+               }
++
++              /*
++               * The dmu does not currently support decreasing nlevels
++               * on an object. For non-raw sends, this does not matter
++               * and the new object can just use the previous one's nlevels.
++               * For raw sends, however, the structure of the received dnode
++               * (including nlevels) must match that of the send side.
++               * Therefore, instead of using dmu_object_reclaim(), we must
++               * free the object completely and call dmu_object_claim_dnsize()
++               * instead.
++               */
++              if (dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) {
++                      err = dmu_free_long_object(rwa->os, drro->drr_object);
++                      if (err != 0)
++                              return (SET_ERROR(EINVAL));
++
++                      txg_wait_synced(dmu_objset_pool(rwa->os), 0);
++                      object = DMU_NEW_OBJECT;
++              }
+       } else if (err == EEXIST) {
+               /*
+                * The object requested is currently an interior slot of a
+@@ -2204,9 +2225,9 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
+        * another object from the previous snapshot. We must free
+        * these objects before we attempt to allocate the new dnode.
+        */
+-      if (drro->drr_dn_slots > 1) {
++      if (dn_slots > 1) {
+               for (uint64_t slot = drro->drr_object + 1;
+-                  slot < drro->drr_object + drro->drr_dn_slots;
++                  slot < drro->drr_object + dn_slots;
+                   slot++) {
+                       dmu_object_info_t slot_doi;
+ 
+@@ -2238,7 +2259,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
+               err = dmu_object_claim_dnsize(rwa->os, drro->drr_object,
+                   drro->drr_type, drro->drr_blksz,
+                   drro->drr_bonustype, drro->drr_bonuslen,
+-                  drro->drr_dn_slots << DNODE_SHIFT, tx);
++                  dn_slots << DNODE_SHIFT, tx);
+       } else if (drro->drr_type != doi.doi_type ||
+           drro->drr_blksz != doi.doi_data_block_size ||
+           drro->drr_bonustype != doi.doi_bonus_type ||
+@@ -2247,7 +2268,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
+               err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object,
+                   drro->drr_type, drro->drr_blksz,
+                   drro->drr_bonustype, drro->drr_bonuslen,
+-                  drro->drr_dn_slots << DNODE_SHIFT, tx);
++                  dn_slots << DNODE_SHIFT, tx);
+       }
+       if (err != 0) {
+               dmu_tx_commit(tx);
diff --git a/zfs-patches/0032-Fix-build-with-CONFIG_GCC_PLUGIN_RANDSTRUCT.patch b/zfs-patches/0032-Fix-build-with-CONFIG_GCC_PLUGIN_RANDSTRUCT.patch

new file mode 100644 (file)

index 0000000..462cdbb
--- /dev/null
+++ b/zfs-patches/0032-Fix-build-with-CONFIG_GCC_PLUGIN_RANDSTRUCT.patch
@@ -0,0 +1,42 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Georgy Yakovlev <ya@sysdump.net>
+Date: Thu, 10 May 2018 23:00:18 -0700
+Subject: [PATCH] Fix build with CONFIG_GCC_PLUGIN_RANDSTRUCT
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+fs/zfs/zfs/metaslab.c:1055:2: error: positional initialization of field
+in ‘struct’ declared with ‘designated_init’ attribute
+[-Werror=designated-init]
+  metaslab_rt_remove,
+
+Signed-off-by: Georgy Yakovlev <ya@sysdump.net>
+Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
+Closes: #7069
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ module/zfs/metaslab.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
+index 5e413c06..ee24850d 100644
+--- a/module/zfs/metaslab.c
++++ b/module/zfs/metaslab.c
+@@ -1049,11 +1049,11 @@ metaslab_rt_vacate(range_tree_t *rt, void *arg)
+ }
+ 
+ static range_tree_ops_t metaslab_rt_ops = {
+-      metaslab_rt_create,
+-      metaslab_rt_destroy,
+-      metaslab_rt_add,
+-      metaslab_rt_remove,
+-      metaslab_rt_vacate
++      .rtop_create = metaslab_rt_create,
++      .rtop_destroy = metaslab_rt_destroy,
++      .rtop_add = metaslab_rt_add,
++      .rtop_remove = metaslab_rt_remove,
++      .rtop_vacate = metaslab_rt_vacate
+ };
+ 
+ /*
diff --git a/zfs-patches/0033-Correctly-handle-errors-from-kern_path.patch b/zfs-patches/0033-Correctly-handle-errors-from-kern_path.patch

new file mode 100644 (file)

index 0000000..5df65fc
--- /dev/null
+++ b/zfs-patches/0033-Correctly-handle-errors-from-kern_path.patch
@@ -0,0 +1,35 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Chris Siebenmann <cks.github@cs.toronto.edu>
+Date: Wed, 5 Sep 2018 01:26:56 -0400
+Subject: [PATCH] Correctly handle errors from kern_path
+
+As a regular kernel function, kern_path() returns errors as negative
+errnos, such as -ELOOP. zfsctl_snapdir_vget() must convert these into
+the positive errnos used throughout the ZFS code when it returns them
+to other ZFS functions so that the ZFS code properly sees them as
+errors.
+
+Reviewed-by: George Melikov <mail@gmelikov.ru>
+Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Signed-off-by: Chris Siebenmann <cks.git01@cs.toronto.edu>
+Closes #7764
+Closes #7864
+
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ module/zfs/zfs_ctldir.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c
+index 25edea78..0ab5b4f0 100644
+--- a/module/zfs/zfs_ctldir.c
++++ b/module/zfs/zfs_ctldir.c
+@@ -1180,7 +1180,7 @@ zfsctl_snapdir_vget(struct super_block *sb, uint64_t objsetid, int gen,
+               goto out;
+ 
+       /* Trigger automount */
+-      error = kern_path(mnt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &path);
++      error = -kern_path(mnt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &path);
+       if (error)
+               goto out;
+ 
diff --git a/zfs-patches/0034-Tag-zfs-0.7.10.patch b/zfs-patches/0034-Tag-zfs-0.7.10.patch

new file mode 100644 (file)

index 0000000..5c2f865
--- /dev/null
+++ b/zfs-patches/0034-Tag-zfs-0.7.10.patch
@@ -0,0 +1,56 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Tony Hutter <hutter2@llnl.gov>
+Date: Wed, 5 Sep 2018 10:37:32 -0700
+Subject: [PATCH] Tag zfs-0.7.10
+
+META file and changelog updated.
+
+Signed-off-by: Tony Hutter <hutter2@llnl.gov>
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ META                         | 2 +-
+ rpm/generic/zfs-kmod.spec.in | 3 +++
+ rpm/generic/zfs.spec.in      | 3 +++
+ 3 files changed, 7 insertions(+), 1 deletion(-)
+
+diff --git a/META b/META
+index fbada64e..89525ac8 100644
+--- a/META
++++ b/META
+@@ -1,7 +1,7 @@
+ Meta:         1
+ Name:         zfs
+ Branch:       1.0
+-Version:      0.7.9
++Version:      0.7.10
+ Release:      1
+ Release-Tags: relext
+ License:      CDDL
+diff --git a/rpm/generic/zfs-kmod.spec.in b/rpm/generic/zfs-kmod.spec.in
+index a3678681..8306ea76 100644
+--- a/rpm/generic/zfs-kmod.spec.in
++++ b/rpm/generic/zfs-kmod.spec.in
+@@ -191,6 +191,9 @@ chmod u+x ${RPM_BUILD_ROOT}%{kmodinstdir_prefix}/*/extra/*/*/*
+ rm -rf $RPM_BUILD_ROOT
+ 
+ %changelog
++* Wed Sep 05 2018 Tony Hutter <hutter2@llnl.gov> - 0.7.10-1
++- Released 0.7.10-1, detailed release notes are available at:
++- https://github.com/zfsonlinux/zfs/releases/tag/zfs-0.7.10
+ * Tue May 08 2018 Tony Hutter <hutter2@llnl.gov> - 0.7.9-1
+ - Released 0.7.9-1, detailed release notes are available at:
+ - https://github.com/zfsonlinux/zfs/releases/tag/zfs-0.7.9
+diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in
+index 22565725..76953aec 100644
+--- a/rpm/generic/zfs.spec.in
++++ b/rpm/generic/zfs.spec.in
+@@ -371,6 +371,9 @@ systemctl --system daemon-reload >/dev/null || true
+ %endif
+ 
+ %changelog
++* Wed Sep 05 2018 Tony Hutter <hutter2@llnl.gov> - 0.7.10-1
++- Released 0.7.10-1, detailed release notes are available at:
++- https://github.com/zfsonlinux/zfs/releases/tag/zfs-0.7.10
+ * Tue May 08 2018 Tony Hutter <hutter2@llnl.gov> - 0.7.9-1
+ - Released 0.7.9-1, detailed release notes are available at:
+ - https://github.com/zfsonlinux/zfs/releases/tag/zfs-0.7.9
diff --git a/zfs-patches/series b/zfs-patches/series

index 5d154db446f7469534e1180b460fcc16b24b69d2..18cec2ab656bfa0718b386fa4da5359528de884e 100644 (file)
--- a/zfs-patches/series
+++ b/zfs-patches/series
@@ -1,5 +1,34 @@
  0001-remove-DKMS-modules-and-dracut-build.patch
  0002-import-with-d-dev-disk-by-id-in-scan-service.patch
  0003-always-load-ZFS-module-on-boot.patch
-0004-Fix-zpl_mount-deadlock.patch
-0005-Fix-deadlock-between-zfs-umount-snapentry_expire.patch
+0004-Fix-deadlock-between-zfs-umount-snapentry_expire.patch
+0005-zv_suspend_lock-in-zvol_open-zvol_release.patch
+0006-Linux-4.18-compat-inode-timespec-timespec64.patch
+0007-Linux-compat-4.18-check_disk_size_change.patch
+0008-OpenZFS-8997-ztest-assertion-failure-in-zil_lwb_writ.patch
+0009-Fix-divide-by-zero-in-mmp_delay_update.patch
+0010-Fix-ENOSPC-in-Handle-zap_add-failures-in.patch
+0011-Trim-new-line-from-zfs_vdev_scheduler.patch
+0012-module-param-callbacks-check-for-initialized-spa.patch
+0013-Support-Debian-DKMS-builds.patch
+0014-zpool-reopen-should-detect-expanded-devices.patch
+0015-Add-pool-state-proc-entry-SUSPENDED-pools.patch
+0016-Linux-4.14-compat-blk_queue_stackable.patch
+0017-Default-ashift-for-Amazon-EC2-NVMe-devices.patch
+0018-Fix-kernel-unaligned-access-on-sparc64.patch
+0019-Fix-zpl_mount-deadlock.patch
+0020-OpenZFS-8906-uts-illumos-rootfs-should-support-salte.patch
+0021-Fix-zfs-incremental-send-remove-o-properties.patch
+0022-Allow-inherited-properties-in-zfs_check_settable.patch
+0023-Fix-arcstat.py-handling-of-unsupported-options.patch
+0024-Don-t-modify-argv-in-user-tools.patch
+0025-Add-missing-zfs-dracut-RPM-dependencies.patch
+0026-Add-libaio-devel-BuildRequires.patch
+0027-Fix-libaio-devel-requirement-for-Debian-based-distri.patch
+0028-Fedora-28-Fix-misc-bounds-check-compiler-warnings.patch
+0029-Fix-problems-receiving-reallocated-dnodes.patch
+0030-Fix-object-reclaim-when-using-large-dnodes.patch
+0031-Fix-zfs-recv-of-non-large_dnode-send-streams.patch
+0032-Fix-build-with-CONFIG_GCC_PLUGIN_RANDSTRUCT.patch
+0033-Correctly-handle-errors-from-kern_path.patch
+0034-Tag-zfs-0.7.10.patch
author	Stoiko Ivanov <s.ivanov@proxmox.com>
	Tue, 11 Sep 2018 09:43:41 +0000 (11:43 +0200)
committer	Thomas Lamprecht <t.lamprecht@proxmox.com>
	Thu, 13 Sep 2018 06:54:51 +0000 (08:54 +0200)
zfs-patches/0004-Fix-deadlock-between-zfs-umount-snapentry_expire.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0004-Fix-zpl_mount-deadlock.patch	[deleted file]	patch \| blob \| blame \| history
zfs-patches/0005-Fix-deadlock-between-zfs-umount-snapentry_expire.patch	[deleted file]	patch \| blob \| blame \| history
zfs-patches/0005-zv_suspend_lock-in-zvol_open-zvol_release.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0006-Linux-4.18-compat-inode-timespec-timespec64.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0007-Linux-compat-4.18-check_disk_size_change.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0008-OpenZFS-8997-ztest-assertion-failure-in-zil_lwb_writ.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0009-Fix-divide-by-zero-in-mmp_delay_update.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0010-Fix-ENOSPC-in-Handle-zap_add-failures-in.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0011-Trim-new-line-from-zfs_vdev_scheduler.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0012-module-param-callbacks-check-for-initialized-spa.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0013-Support-Debian-DKMS-builds.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0014-zpool-reopen-should-detect-expanded-devices.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0015-Add-pool-state-proc-entry-SUSPENDED-pools.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0016-Linux-4.14-compat-blk_queue_stackable.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0017-Default-ashift-for-Amazon-EC2-NVMe-devices.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0018-Fix-kernel-unaligned-access-on-sparc64.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0019-Fix-zpl_mount-deadlock.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0020-OpenZFS-8906-uts-illumos-rootfs-should-support-salte.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0021-Fix-zfs-incremental-send-remove-o-properties.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0022-Allow-inherited-properties-in-zfs_check_settable.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0023-Fix-arcstat.py-handling-of-unsupported-options.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0024-Don-t-modify-argv-in-user-tools.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0025-Add-missing-zfs-dracut-RPM-dependencies.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0026-Add-libaio-devel-BuildRequires.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0027-Fix-libaio-devel-requirement-for-Debian-based-distri.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0028-Fedora-28-Fix-misc-bounds-check-compiler-warnings.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0029-Fix-problems-receiving-reallocated-dnodes.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0030-Fix-object-reclaim-when-using-large-dnodes.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0031-Fix-zfs-recv-of-non-large_dnode-send-streams.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0032-Fix-build-with-CONFIG_GCC_PLUGIN_RANDSTRUCT.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0033-Correctly-handle-errors-from-kern_path.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/0034-Tag-zfs-0.7.10.patch	[new file with mode: 0644]	patch \| blob
zfs-patches/series		patch \| blob \| blame \| history