]> git.proxmox.com Git - zfsonlinux.git/commitdiff
cherry-pick lock-inversion patch for zvol_open
authorStoiko Ivanov <s.ivanov@proxmox.com>
Tue, 11 Jan 2022 15:39:36 +0000 (16:39 +0100)
committerThomas Lamprecht <t.lamprecht@proxmox.com>
Tue, 11 Jan 2022 15:41:50 +0000 (16:41 +0100)
the changes to zvol_open added to 2.1.2 (for coping with kernel
changes in 5.13) seem to have introduced a lock order inversion [0].

(noticed while reviewing the 2.0.6->2.0.7 changes (the patch was
applied after 2.1.2 was already tagged)

[0] https://github.com/openzfs/zfs/pull/12863
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
debian/patches/0012-Fix-zvol_open-lock-inversion.patch [new file with mode: 0644]
debian/patches/series

diff --git a/debian/patches/0012-Fix-zvol_open-lock-inversion.patch b/debian/patches/0012-Fix-zvol_open-lock-inversion.patch
new file mode 100644 (file)
index 0000000..eb74550
--- /dev/null
@@ -0,0 +1,212 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Brian Behlendorf <behlendorf1@llnl.gov>
+Date: Fri, 17 Dec 2021 09:52:13 -0800
+Subject: [PATCH] Fix zvol_open() lock inversion
+
+When restructuring the zvol_open() logic for the Linux 5.13 kernel
+a lock inversion was accidentally introduced.  In the updated code
+the spa_namespace_lock is now taken before the zv_suspend_lock
+allowing the following scenario to occur:
+
+    down_read <=== waiting for zv_suspend_lock
+    zvol_open <=== holds spa_namespace_lock
+    __blkdev_get
+    blkdev_get_by_dev
+    blkdev_open
+    ...
+
+     mutex_lock <== waiting for spa_namespace_lock
+     spa_open_common
+     spa_open
+     dsl_pool_hold
+     dmu_objset_hold_flags
+     dmu_objset_hold
+     dsl_prop_get
+     dsl_prop_get_integer
+     zvol_create_minor
+     dmu_recv_end
+     zfs_ioc_recv_impl <=== holds zv_suspend_lock via zvol_suspend()
+     zfs_ioc_recv
+     ...
+
+This commit resolves the issue by moving the acquisition of the
+spa_namespace_lock back to after the zv_suspend_lock which restores
+the original ordering.
+
+Additionally, as part of this change the error exit paths were
+simplified where possible.
+
+Reviewed-by: Tony Hutter <hutter2@llnl.gov>
+Reviewed-by: Rich Ercolani <rincebrain@gmail.com>
+Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Closes #12863
+(cherry picked from commit 8a02d01e85556bbe3a1c6947bc11b8ef028d4023)
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ module/os/linux/zfs/zvol_os.c | 121 ++++++++++++++++------------------
+ 1 file changed, 58 insertions(+), 63 deletions(-)
+
+diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
+index 44caadd58..69479b3f7 100644
+--- a/module/os/linux/zfs/zvol_os.c
++++ b/module/os/linux/zfs/zvol_os.c
+@@ -496,8 +496,7 @@ zvol_open(struct block_device *bdev, fmode_t flag)
+ {
+       zvol_state_t *zv;
+       int error = 0;
+-      boolean_t drop_suspend = B_TRUE;
+-      boolean_t drop_namespace = B_FALSE;
++      boolean_t drop_suspend = B_FALSE;
+ #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
+       hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms);
+       hrtime_t start = gethrtime();
+@@ -517,7 +516,36 @@ retry:
+               return (SET_ERROR(-ENXIO));
+       }
+-      if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
++      mutex_enter(&zv->zv_state_lock);
++      /*
++       * Make sure zvol is not suspended during first open
++       * (hold zv_suspend_lock) and respect proper lock acquisition
++       * ordering - zv_suspend_lock before zv_state_lock
++       */
++      if (zv->zv_open_count == 0) {
++              if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
++                      mutex_exit(&zv->zv_state_lock);
++                      rw_enter(&zv->zv_suspend_lock, RW_READER);
++                      mutex_enter(&zv->zv_state_lock);
++                      /* check to see if zv_suspend_lock is needed */
++                      if (zv->zv_open_count != 0) {
++                              rw_exit(&zv->zv_suspend_lock);
++                      } else {
++                              drop_suspend = B_TRUE;
++                      }
++              } else {
++                      drop_suspend = B_TRUE;
++              }
++      }
++      rw_exit(&zvol_state_lock);
++
++      ASSERT(MUTEX_HELD(&zv->zv_state_lock));
++
++      if (zv->zv_open_count == 0) {
++              boolean_t drop_namespace = B_FALSE;
++
++              ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
++
+               /*
+                * In all other call paths the spa_namespace_lock is taken
+                * before the bdev->bd_mutex lock.  However, on open(2)
+@@ -542,84 +570,51 @@ retry:
+                * the kernel so the only option is to return the error for
+                * the caller to handle it.
+                */
+-              if (!mutex_tryenter(&spa_namespace_lock)) {
+-                      rw_exit(&zvol_state_lock);
++              if (!mutex_owned(&spa_namespace_lock)) {
++                      if (!mutex_tryenter(&spa_namespace_lock)) {
++                              mutex_exit(&zv->zv_state_lock);
++                              rw_exit(&zv->zv_suspend_lock);
+ #ifdef HAVE_BLKDEV_GET_ERESTARTSYS
+-                      schedule();
+-                      return (SET_ERROR(-ERESTARTSYS));
+-#else
+-                      if ((gethrtime() - start) > timeout)
++                              schedule();
+                               return (SET_ERROR(-ERESTARTSYS));
++#else
++                              if ((gethrtime() - start) > timeout)
++                                      return (SET_ERROR(-ERESTARTSYS));
+-                      schedule_timeout(MSEC_TO_TICK(10));
+-                      goto retry;
++                              schedule_timeout(MSEC_TO_TICK(10));
++                              goto retry;
+ #endif
+-              } else {
+-                      drop_namespace = B_TRUE;
+-              }
+-      }
+-
+-      mutex_enter(&zv->zv_state_lock);
+-      /*
+-       * make sure zvol is not suspended during first open
+-       * (hold zv_suspend_lock) and respect proper lock acquisition
+-       * ordering - zv_suspend_lock before zv_state_lock
+-       */
+-      if (zv->zv_open_count == 0) {
+-              if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
+-                      mutex_exit(&zv->zv_state_lock);
+-                      rw_enter(&zv->zv_suspend_lock, RW_READER);
+-                      mutex_enter(&zv->zv_state_lock);
+-                      /* check to see if zv_suspend_lock is needed */
+-                      if (zv->zv_open_count != 0) {
+-                              rw_exit(&zv->zv_suspend_lock);
+-                              drop_suspend = B_FALSE;
++                      } else {
++                              drop_namespace = B_TRUE;
+                       }
+               }
+-      } else {
+-              drop_suspend = B_FALSE;
+-      }
+-      rw_exit(&zvol_state_lock);
+-
+-      ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+-      if (zv->zv_open_count == 0) {
+-              ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
+               error = -zvol_first_open(zv, !(flag & FMODE_WRITE));
+-              if (error)
+-                      goto out_mutex;
+-      }
+-      if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
+-              error = -EROFS;
+-              goto out_open_count;
++              if (drop_namespace)
++                      mutex_exit(&spa_namespace_lock);
+       }
+-      zv->zv_open_count++;
+-
+-      mutex_exit(&zv->zv_state_lock);
+-      if (drop_namespace)
+-              mutex_exit(&spa_namespace_lock);
+-      if (drop_suspend)
+-              rw_exit(&zv->zv_suspend_lock);
+-
+-      zfs_check_media_change(bdev);
+-
+-      return (0);
++      if (error == 0) {
++              if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
++                      if (zv->zv_open_count == 0)
++                              zvol_last_close(zv);
+-out_open_count:
+-      if (zv->zv_open_count == 0)
+-              zvol_last_close(zv);
++                      error = SET_ERROR(-EROFS);
++              } else {
++                      zv->zv_open_count++;
++              }
++      }
+-out_mutex:
+       mutex_exit(&zv->zv_state_lock);
+-      if (drop_namespace)
+-              mutex_exit(&spa_namespace_lock);
+       if (drop_suspend)
+               rw_exit(&zv->zv_suspend_lock);
+-      return (SET_ERROR(error));
++      if (error == 0)
++              zfs_check_media_change(bdev);
++
++      return (error);
+ }
+ static void
index d2770d39486cc3c6a38b1b6128ff4aba0a03c9f1..8166db9133df31b9293a04ed6699861b88a2b2a0 100644 (file)
@@ -9,3 +9,4 @@
 0009-Patch-move-manpage-arcstat-1-to-arcstat-8.patch
 0010-arcstat-Fix-integer-division-with-python3.patch
 0011-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch
+0012-Fix-zvol_open-lock-inversion.patch