--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Brian Behlendorf <behlendorf1@llnl.gov>
+Date: Fri, 17 Dec 2021 09:52:13 -0800
+Subject: [PATCH] Fix zvol_open() lock inversion
+
+When restructuring the zvol_open() logic for the Linux 5.13 kernel
+a lock inversion was accidentally introduced. In the updated code
+the spa_namespace_lock is now taken before the zv_suspend_lock
+allowing the following scenario to occur:
+
+ down_read <=== waiting for zv_suspend_lock
+ zvol_open <=== holds spa_namespace_lock
+ __blkdev_get
+ blkdev_get_by_dev
+ blkdev_open
+ ...
+
+ mutex_lock <== waiting for spa_namespace_lock
+ spa_open_common
+ spa_open
+ dsl_pool_hold
+ dmu_objset_hold_flags
+ dmu_objset_hold
+ dsl_prop_get
+ dsl_prop_get_integer
+ zvol_create_minor
+ dmu_recv_end
+ zfs_ioc_recv_impl <=== holds zv_suspend_lock via zvol_suspend()
+ zfs_ioc_recv
+ ...
+
+This commit resolves the issue by moving the acquisition of the
+spa_namespace_lock back to after the zv_suspend_lock which restores
+the original ordering.
+
+Additionally, as part of this change the error exit paths were
+simplified where possible.
+
+Reviewed-by: Tony Hutter <hutter2@llnl.gov>
+Reviewed-by: Rich Ercolani <rincebrain@gmail.com>
+Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Closes #12863
+(cherry picked from commit 8a02d01e85556bbe3a1c6947bc11b8ef028d4023)
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ module/os/linux/zfs/zvol_os.c | 121 ++++++++++++++++------------------
+ 1 file changed, 58 insertions(+), 63 deletions(-)
+
+diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
+index 44caadd58..69479b3f7 100644
+--- a/module/os/linux/zfs/zvol_os.c
++++ b/module/os/linux/zfs/zvol_os.c
+@@ -496,8 +496,7 @@ zvol_open(struct block_device *bdev, fmode_t flag)
+ {
+ zvol_state_t *zv;
+ int error = 0;
+- boolean_t drop_suspend = B_TRUE;
+- boolean_t drop_namespace = B_FALSE;
++ boolean_t drop_suspend = B_FALSE;
+ #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
+ hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms);
+ hrtime_t start = gethrtime();
+@@ -517,7 +516,36 @@ retry:
+ return (SET_ERROR(-ENXIO));
+ }
+
+- if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
++ mutex_enter(&zv->zv_state_lock);
++ /*
++ * Make sure zvol is not suspended during first open
++ * (hold zv_suspend_lock) and respect proper lock acquisition
++ * ordering - zv_suspend_lock before zv_state_lock
++ */
++ if (zv->zv_open_count == 0) {
++ if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
++ mutex_exit(&zv->zv_state_lock);
++ rw_enter(&zv->zv_suspend_lock, RW_READER);
++ mutex_enter(&zv->zv_state_lock);
++ /* check to see if zv_suspend_lock is needed */
++ if (zv->zv_open_count != 0) {
++ rw_exit(&zv->zv_suspend_lock);
++ } else {
++ drop_suspend = B_TRUE;
++ }
++ } else {
++ drop_suspend = B_TRUE;
++ }
++ }
++ rw_exit(&zvol_state_lock);
++
++ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
++
++ if (zv->zv_open_count == 0) {
++ boolean_t drop_namespace = B_FALSE;
++
++ ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
++
+ /*
+ * In all other call paths the spa_namespace_lock is taken
+ * before the bdev->bd_mutex lock. However, on open(2)
+@@ -542,84 +570,51 @@ retry:
+ * the kernel so the only option is to return the error for
+ * the caller to handle it.
+ */
+- if (!mutex_tryenter(&spa_namespace_lock)) {
+- rw_exit(&zvol_state_lock);
++ if (!mutex_owned(&spa_namespace_lock)) {
++ if (!mutex_tryenter(&spa_namespace_lock)) {
++ mutex_exit(&zv->zv_state_lock);
++ rw_exit(&zv->zv_suspend_lock);
+
+ #ifdef HAVE_BLKDEV_GET_ERESTARTSYS
+- schedule();
+- return (SET_ERROR(-ERESTARTSYS));
+-#else
+- if ((gethrtime() - start) > timeout)
++ schedule();
+ return (SET_ERROR(-ERESTARTSYS));
++#else
++ if ((gethrtime() - start) > timeout)
++ return (SET_ERROR(-ERESTARTSYS));
+
+- schedule_timeout(MSEC_TO_TICK(10));
+- goto retry;
++ schedule_timeout(MSEC_TO_TICK(10));
++ goto retry;
+ #endif
+- } else {
+- drop_namespace = B_TRUE;
+- }
+- }
+-
+- mutex_enter(&zv->zv_state_lock);
+- /*
+- * make sure zvol is not suspended during first open
+- * (hold zv_suspend_lock) and respect proper lock acquisition
+- * ordering - zv_suspend_lock before zv_state_lock
+- */
+- if (zv->zv_open_count == 0) {
+- if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
+- mutex_exit(&zv->zv_state_lock);
+- rw_enter(&zv->zv_suspend_lock, RW_READER);
+- mutex_enter(&zv->zv_state_lock);
+- /* check to see if zv_suspend_lock is needed */
+- if (zv->zv_open_count != 0) {
+- rw_exit(&zv->zv_suspend_lock);
+- drop_suspend = B_FALSE;
++ } else {
++ drop_namespace = B_TRUE;
+ }
+ }
+- } else {
+- drop_suspend = B_FALSE;
+- }
+- rw_exit(&zvol_state_lock);
+-
+- ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+- if (zv->zv_open_count == 0) {
+- ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
+ error = -zvol_first_open(zv, !(flag & FMODE_WRITE));
+- if (error)
+- goto out_mutex;
+- }
+
+- if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
+- error = -EROFS;
+- goto out_open_count;
++ if (drop_namespace)
++ mutex_exit(&spa_namespace_lock);
+ }
+
+- zv->zv_open_count++;
+-
+- mutex_exit(&zv->zv_state_lock);
+- if (drop_namespace)
+- mutex_exit(&spa_namespace_lock);
+- if (drop_suspend)
+- rw_exit(&zv->zv_suspend_lock);
+-
+- zfs_check_media_change(bdev);
+-
+- return (0);
++ if (error == 0) {
++ if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
++ if (zv->zv_open_count == 0)
++ zvol_last_close(zv);
+
+-out_open_count:
+- if (zv->zv_open_count == 0)
+- zvol_last_close(zv);
++ error = SET_ERROR(-EROFS);
++ } else {
++ zv->zv_open_count++;
++ }
++ }
+
+-out_mutex:
+ mutex_exit(&zv->zv_state_lock);
+- if (drop_namespace)
+- mutex_exit(&spa_namespace_lock);
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+
+- return (SET_ERROR(error));
++ if (error == 0)
++ zfs_check_media_change(bdev);
++
++ return (error);
+ }
+
+ static void