zvol: Support blk-mq for better performance

author Tony Hutter <hutter2@llnl.gov>

Thu, 9 Jun 2022 14:10:38 +0000 (07:10 -0700)

committer GitHub <noreply@github.com>

Thu, 9 Jun 2022 14:10:38 +0000 (08:10 -0600)
author Tony Hutter <hutter2@llnl.gov>
Thu, 9 Jun 2022 14:10:38 +0000 (07:10 -0700)
committer GitHub <noreply@github.com>
Thu, 9 Jun 2022 14:10:38 +0000 (08:10 -0600)
diff --git a/config/kernel-blk-queue.m4 b/config/kernel-blk-queue.m4

index 6f42b98125cd2c19bf004173a2a69a7c10e004e2..29b0a28290ab24cc0c231d9a94e6ebef1c461fd2 100644 (file)
--- a/config/kernel-blk-queue.m4
+++ b/config/kernel-blk-queue.m4
@@ -359,6 +359,36 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS], [
         ])
  ])
  
+dnl #
+dnl # See if kernel supports block multi-queue and blk_status_t.
+dnl # blk_status_t represents the new status codes introduced in the 4.13
+dnl # kernel patch:
+dnl #
+dnl #  block: introduce new block status code type
+dnl #
+dnl # We do not currently support the "old" block multi-queue interfaces from
+dnl # prior kernels.
+dnl #
+dnl # The test source below references struct blk_mq_tag_set,
+dnl # blk_mq_alloc_tag_set() and BLK_STS_OK, so it only builds on kernels
+dnl # that provide all three.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_MQ], [
+       ZFS_LINUX_TEST_SRC([blk_mq], [
+               #include <linux/blk-mq.h>
+       ], [
+               struct blk_mq_tag_set tag_set __attribute__ ((unused)) = {0};
+               (void) blk_mq_alloc_tag_set(&tag_set);
+               return BLK_STS_OK;
+       ], [])
+])
+
+dnl #
+dnl # Report the outcome of the blk_mq test and define HAVE_BLK_MQ when the
+dnl # test succeeded.  Nothing is defined when it failed.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_BLK_MQ], [
+       AC_MSG_CHECKING([whether block multiqueue with blk_status_t is available])
+       ZFS_LINUX_TEST_RESULT([blk_mq], [
+               AC_MSG_RESULT(yes)
+               AC_DEFINE(HAVE_BLK_MQ, 1, [block multiqueue is available])
+       ], [
+               AC_MSG_RESULT(no)
+       ])
+])
+
  AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [
         ZFS_AC_KERNEL_SRC_BLK_QUEUE_PLUG
         ZFS_AC_KERNEL_SRC_BLK_QUEUE_BDI
@@ -370,6 +400,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [
         ZFS_AC_KERNEL_SRC_BLK_QUEUE_FLUSH
         ZFS_AC_KERNEL_SRC_BLK_QUEUE_MAX_HW_SECTORS
         ZFS_AC_KERNEL_SRC_BLK_QUEUE_MAX_SEGMENTS
+       ZFS_AC_KERNEL_SRC_BLK_MQ
  ])
  
  AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE], [
@@ -383,4 +414,5 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE], [
         ZFS_AC_KERNEL_BLK_QUEUE_FLUSH
         ZFS_AC_KERNEL_BLK_QUEUE_MAX_HW_SECTORS
         ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS
+       ZFS_AC_KERNEL_BLK_MQ
  ])
diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h

index fd91560a3cc4e7e07f8e38db0662308eb75eab9a..7964937a0f4db50e3f124c07723b6d9b8d55268f 100644 (file)
--- a/include/os/linux/kernel/linux/blkdev_compat.h
+++ b/include/os/linux/kernel/linux/blkdev_compat.h
@@ -34,6 +34,11 @@
  #include <linux/hdreg.h>
  #include <linux/major.h>
  #include <linux/msdos_fs.h>    /* for SECTOR_* */
+#include <linux/bio.h>
+
+#ifdef HAVE_BLK_MQ
+#include <linux/blk-mq.h>
+#endif
  
  #ifndef HAVE_BLK_QUEUE_FLAG_SET
  static inline void
@@ -608,4 +613,110 @@ blk_generic_alloc_queue(make_request_fn make_request, int node_id)
  }
  #endif /* !HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
  
+/*
+ * All the io_*() helper functions below can operate on a bio, or a rq, but
+ * not both.  The older submit_bio() codepath will pass a bio, and the
+ * newer blk-mq codepath will pass a rq.
+ */
+/*
+ * Return the data direction (READ or WRITE) of the I/O.  The direction is
+ * taken from 'rq' when one is supplied, otherwise from 'bio'.
+ */
+static inline int
+io_data_dir(struct bio *bio, struct request *rq)
+{
+#ifndef HAVE_BLK_MQ
+       /* Requests only exist on the blk-mq path */
+       ASSERT3P(rq, ==, NULL);
+       return (bio_data_dir(bio));
+#else
+       if (rq == NULL)
+               return (bio_data_dir(bio));
+
+       return (op_is_write(req_op(rq)) ? WRITE : READ);
+#endif
+}
+
+/*
+ * Return non-zero when the I/O is a flush.  For a request this means its
+ * operation is REQ_OP_FLUSH; for a bio the answer comes from bio_is_flush().
+ */
+static inline int
+io_is_flush(struct bio *bio, struct request *rq)
+{
+#ifndef HAVE_BLK_MQ
+       ASSERT3P(rq, ==, NULL);
+       return (bio_is_flush(bio));
+#else
+       if (rq == NULL)
+               return (bio_is_flush(bio));
+
+       return (req_op(rq) == REQ_OP_FLUSH);
+#endif
+}
+
+/*
+ * Return non-zero when the I/O is a discard.  For a request this means its
+ * operation is REQ_OP_DISCARD; for a bio the answer comes from
+ * bio_is_discard().
+ */
+static inline int
+io_is_discard(struct bio *bio, struct request *rq)
+{
+#ifndef HAVE_BLK_MQ
+       ASSERT3P(rq, ==, NULL);
+       return (bio_is_discard(bio));
+#else
+       if (rq == NULL)
+               return (bio_is_discard(bio));
+
+       return (req_op(rq) == REQ_OP_DISCARD);
+#endif
+}
+
+/*
+ * Return non-zero when the I/O is a secure erase.  For a request this means
+ * its operation is REQ_OP_SECURE_ERASE; for a bio the answer comes from
+ * bio_is_secure_erase().
+ */
+static inline int
+io_is_secure_erase(struct bio *bio, struct request *rq)
+{
+#ifndef HAVE_BLK_MQ
+       ASSERT3P(rq, ==, NULL);
+       return (bio_is_secure_erase(bio));
+#else
+       if (rq == NULL)
+               return (bio_is_secure_erase(bio));
+
+       return (req_op(rq) == REQ_OP_SECURE_ERASE);
+#endif
+}
+
+/*
+ * Return non-zero when the I/O carries the FUA flag.  For a request the
+ * REQ_FUA bit of rq->cmd_flags is returned as-is (not normalized to 0/1);
+ * for a bio the answer comes from bio_is_fua().
+ */
+static inline int
+io_is_fua(struct bio *bio, struct request *rq)
+{
+#ifndef HAVE_BLK_MQ
+       ASSERT3P(rq, ==, NULL);
+       return (bio_is_fua(bio));
+#else
+       if (rq == NULL)
+               return (bio_is_fua(bio));
+
+       return (rq->cmd_flags & REQ_FUA);
+#endif
+}
+
+
+/*
+ * Return the starting offset of the I/O: its sector position shifted left
+ * by 9 (i.e. multiplied by 512).  The position comes from blk_rq_pos() for
+ * a request and from BIO_BI_SECTOR() for a bio.
+ */
+static inline uint64_t
+io_offset(struct bio *bio, struct request *rq)
+{
+#ifndef HAVE_BLK_MQ
+       ASSERT3P(rq, ==, NULL);
+       return (BIO_BI_SECTOR(bio) << 9);
+#else
+       if (rq == NULL)
+               return (BIO_BI_SECTOR(bio) << 9);
+
+       return (blk_rq_pos(rq) << 9);
+#endif
+}
+
+/*
+ * Return the size of the I/O, taken from blk_rq_bytes() for a request and
+ * from BIO_BI_SIZE() for a bio.
+ */
+static inline uint64_t
+io_size(struct bio *bio, struct request *rq)
+{
+#ifndef HAVE_BLK_MQ
+       ASSERT3P(rq, ==, NULL);
+       return (BIO_BI_SIZE(bio));
+#else
+       if (rq == NULL)
+               return (BIO_BI_SIZE(bio));
+
+       return (blk_rq_bytes(rq));
+#endif
+}
+
+/*
+ * Return the result of bio_has_data() for the I/O.  For a request the check
+ * is made on rq->bio; otherwise it is made on 'bio' itself.
+ */
+static inline int
+io_has_data(struct bio *bio, struct request *rq)
+{
+#ifndef HAVE_BLK_MQ
+       ASSERT3P(rq, ==, NULL);
+       return (bio_has_data(bio));
+#else
+       if (rq == NULL)
+               return (bio_has_data(bio));
+
+       return (bio_has_data(rq->bio));
+#endif
+}
  #endif /* _ZFS_BLKDEV_H */
diff --git a/include/os/linux/spl/sys/uio.h b/include/os/linux/spl/sys/uio.h

index 439eec986236d24e7453ef0e948ce705ab58fa1f..fe2b5c07a018530c5285d5bce835f7bade8af667 100644 (file)
--- a/include/os/linux/spl/sys/uio.h
+++ b/include/os/linux/spl/sys/uio.h
@@ -69,9 +69,20 @@ typedef struct zfs_uio {
         uint16_t        uio_fmode;
         uint16_t        uio_extflg;
         ssize_t         uio_resid;
+
         size_t          uio_skip;
+
+       struct request  *rq;
+
+       /*
+        * Used for saving rq_for_each_segment() state between calls
+        * to zfs_uiomove_bvec_rq().
+        */
+       struct req_iterator iter;
+       struct bio_vec bv;
  } zfs_uio_t;
  
+
  #define        zfs_uio_segflg(u)               (u)->uio_segflg
  #define        zfs_uio_offset(u)               (u)->uio_loffset
  #define        zfs_uio_resid(u)                (u)->uio_resid
@@ -116,17 +127,33 @@ zfs_uio_iovec_init(zfs_uio_t *uio, const struct iovec *iov,
  }
  
  static inline void
-zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio)
+zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio, struct request *rq)
  {
-       uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)];
-       uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio);
-       uio->uio_loffset = BIO_BI_SECTOR(bio) << 9;
+       /* Either bio or rq will be set, but not both */
+       ASSERT3P(uio, !=, bio);
+
+       if (bio) {
+               uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio);
+               uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)];
+       } else {
+               uio->uio_bvec = NULL;
+               uio->uio_iovcnt = 0;
+               memset(&uio->iter, 0, sizeof (uio->iter));
+       }
+
+       uio->uio_loffset = io_offset(bio, rq);
         uio->uio_segflg = UIO_BVEC;
         uio->uio_fault_disable = B_FALSE;
         uio->uio_fmode = 0;
         uio->uio_extflg = 0;
-       uio->uio_resid = BIO_BI_SIZE(bio);
-       uio->uio_skip = BIO_BI_SKIP(bio);
+       uio->uio_resid = io_size(bio, rq);
+       if (bio) {
+               uio->uio_skip = BIO_BI_SKIP(bio);
+       } else {
+               uio->uio_skip = 0;
+       }
+
+       uio->rq = rq;
  }
  
  #if defined(HAVE_VFS_IOV_ITER)
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4

index a086e1a5d56cec4f78f096555ba6a848575679b8..a7e5408e5e377490da07a9c97b85c3a770892a5e 100644 (file)
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -2248,9 +2248,74 @@ for each I/O submitter.
  When unset, requests are handled asynchronously by a thread pool.
  The number of requests which can be handled concurrently is controlled by
  .Sy zvol_threads .
+.Sy zvol_request_sync
+is ignored when running on a kernel that supports block multiqueue
+.Pq Li blk-mq .
  .
-.It Sy zvol_threads Ns = Ns Sy 32 Pq uint
-Max number of threads which can handle zvol I/O requests concurrently.
+.It Sy zvol_threads Ns = Ns Sy 0 Pq uint
+The number of system wide threads to use for processing zvol block IOs.
+If
+.Sy 0
+(the default) then internally set
+.Sy zvol_threads
+to the number of CPUs present or 32 (whichever is greater).
+.
+.It Sy zvol_blk_mq_threads Ns = Ns Sy 0 Pq uint
+The number of threads per zvol to use for queuing IO requests.
+This parameter will only appear if your kernel supports
+.Li blk-mq
+and is only read and assigned to a zvol at zvol load time.
+If
+.Sy 0
+(the default) then internally set
+.Sy zvol_blk_mq_threads
+to the number of CPUs present.
+.
+.It Sy zvol_use_blk_mq Ns = Ns Sy 0 Ns | Ns 1 Pq uint
+Set to
+.Sy 1
+to use the
+.Li blk-mq
+API for zvols.
+Set to
+.Sy 0
+(the default) to use the legacy zvol APIs.
+This setting can give better or worse zvol performance depending on
+the workload.
+This parameter will only appear if your kernel supports
+.Li blk-mq
+and is only read and assigned to a zvol at zvol load time.
+.
+.It Sy zvol_blk_mq_blocks_per_thread Ns = Ns Sy 8 Pq uint
+If
+.Sy zvol_use_blk_mq
+is enabled, then process this number of
+.Sy volblocksize Ns -sized blocks per zvol thread.
+This tunable can be used to favor better performance for zvol reads (lower
+values) or writes (higher values).
+If set to
+.Sy 0 ,
+then the zvol layer will process the maximum number of blocks
+per thread that it can.
+This parameter will only appear if your kernel supports
+.Li blk-mq
+and is only applied at each zvol's load time.
+.
+.It Sy zvol_blk_mq_queue_depth Ns = Ns Sy 0 Pq uint
+The queue_depth value for the zvol
+.Li blk-mq
+interface.
+This parameter will only appear if your kernel supports
+.Li blk-mq
+and is only applied at each zvol's load time.
+If
+.Sy 0
+(the default) then use the kernel's default queue depth.
+Values are clamped to the kernel's
+.Dv BLKDEV_MIN_RQ
+and
+.Dv BLKDEV_MAX_RQ Ns / Ns Dv BLKDEV_DEFAULT_RQ
+limits.
  .
  .It Sy zvol_volmode Ns = Ns Sy 1 Pq uint
  Defines zvol block devices behaviour when
diff --git a/module/os/linux/zfs/zfs_uio.c b/module/os/linux/zfs/zfs_uio.c

index 4f31bcb5959d291411c547533ffaa44e2dff009f..abb6dbe67cdf1f00169f58721fc41abf9018703c 100644 (file)
--- a/module/os/linux/zfs/zfs_uio.c
+++ b/module/os/linux/zfs/zfs_uio.c
@@ -126,7 +126,7 @@ zfs_uiomove_iov(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
  }
  
  static int
-zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
+zfs_uiomove_bvec_impl(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
  {
         const struct bio_vec *bv = uio->uio_bvec;
         size_t skip = uio->uio_skip;
@@ -137,10 +137,13 @@ zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
                 cnt = MIN(bv->bv_len - skip, n);
  
                 paddr = zfs_kmap_atomic(bv->bv_page);
-               if (rw == UIO_READ)
+               if (rw == UIO_READ) {
+                       /* Copy from buffer 'p' to the bvec data */
                         memcpy(paddr + bv->bv_offset + skip, p, cnt);
-               else
+               } else {
+                       /* Copy from bvec data to buffer 'p' */
                         memcpy(p, paddr + bv->bv_offset + skip, cnt);
+               }
                 zfs_kunmap_atomic(paddr);
  
                 skip += cnt;
@@ -158,6 +161,141 @@ zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
         return (0);
  }
  
+#ifdef HAVE_BLK_MQ
+/*
+ * Copy 'cnt' bytes between the buffer 'p' and the page referenced by 'bv',
+ * starting 'skip' bytes past bv->bv_offset within that page.  UIO_READ
+ * copies from 'p' into the bvec data; any other 'rw' value copies from the
+ * bvec data into 'p'.  The page is mapped only for the duration of the copy.
+ */
+static void
+zfs_copy_bvec(void *p, size_t skip, size_t cnt, zfs_uio_rw_t rw,
+    struct bio_vec *bv)
+{
+       void *paddr = zfs_kmap_atomic(bv->bv_page);
+       void *seg = paddr + bv->bv_offset + skip;
+
+       if (rw == UIO_READ)
+               memcpy(seg, p, cnt);
+       else
+               memcpy(p, seg, cnt);
+
+       zfs_kunmap_atomic(paddr);
+}
+
+/*
+ * Copy 'n' bytes of data between the buffer p[] and the data represented
+ * by the request in the uio.
+ *
+ * The copy starts at logical offset uio->uio_loffset.  Every call walks the
+ * request's segments from the first one and skips those that end before
+ * uio->uio_loffset.  No iterator state is carried over between calls (the
+ * uio's 'iter' and 'bv' fields are deliberately not consulted): resuming
+ * from a saved iterator skipped the unread tail of a segment whenever the
+ * previous call stopped in the middle of it, and relied on the request's
+ * bios being unchanged between calls.
+ *
+ * uio->uio_resid and uio->uio_loffset are advanced by the number of bytes
+ * copied.  If no segment contains uio->uio_loffset, nothing is copied and
+ * uio->uio_resid is set to 0.  Always returns 0.
+ */
+static int
+zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
+{
+       struct request *rq = uio->rq;
+       struct bio_vec bv;
+       struct req_iterator iter;
+       uint64_t this_seg_start;        /* logical offset */
+       uint64_t this_seg_end;          /* logical offset */
+       size_t skip_in_seg;
+       size_t copy_from_seg;
+       int copied = 0;
+
+       /*
+        * Start at the original logical offset of this entire request
+        * (uio->uio_loffset cannot be used for this because it is modified
+        * over time).
+        */
+       this_seg_start = io_offset(NULL, rq);
+
+       rq_for_each_segment(bv, rq, iter) {
+               /*
+                * Lookup what the logical offset of the last byte of this
+                * segment is.
+                */
+               this_seg_end = this_seg_start + bv.bv_len - 1;
+
+               /*
+                * We only need to operate on segments that have data we're
+                * copying.
+                */
+               if (uio->uio_loffset >= this_seg_start &&
+                   uio->uio_loffset <= this_seg_end) {
+                       /*
+                        * We may not be copying from the first byte in the
+                        * segment.  Figure out how many bytes to skip copying
+                        * from the beginning of this segment.
+                        */
+                       skip_in_seg = uio->uio_loffset - this_seg_start;
+
+                       /*
+                        * Calculate the total number of bytes from this
+                        * segment that we will be copying.
+                        */
+                       copy_from_seg = MIN(bv.bv_len - skip_in_seg, n);
+
+                       /* Copy the bytes */
+                       zfs_copy_bvec(p, skip_in_seg, copy_from_seg, rw, &bv);
+                       p = ((char *)p) + copy_from_seg;
+
+                       n -= copy_from_seg;
+                       uio->uio_resid -= copy_from_seg;
+                       uio->uio_loffset += copy_from_seg;
+                       copied = 1;     /* We copied some data */
+               }
+
+               if (n == 0) {
+                       /* All done copying. */
+                       return (0);
+               }
+
+               this_seg_start = this_seg_end + 1;
+       }
+
+       if (!copied) {
+               /* Didn't copy anything */
+               uio->uio_resid = 0;
+       }
+       return (0);
+}
+#endif
+
+/*
+ * Move 'n' bytes between 'p' and the uio.  A uio that carries a struct
+ * request is handled by zfs_uiomove_bvec_rq(); every other uio goes to
+ * zfs_uiomove_bvec_impl().
+ */
+static int
+zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
+{
+#ifndef HAVE_BLK_MQ
+       ASSERT3P(uio->rq, ==, NULL);
+       return (zfs_uiomove_bvec_impl(p, n, rw, uio));
+#else
+       if (uio->rq == NULL)
+               return (zfs_uiomove_bvec_impl(p, n, rw, uio));
+
+       return (zfs_uiomove_bvec_rq(p, n, rw, uio));
+#endif
+}
+
  #if defined(HAVE_VFS_IOV_ITER)
  static int
  zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio,
@@ -300,8 +438,14 @@ zfs_uioskip(zfs_uio_t *uio, size_t n)
  {
         if (n > uio->uio_resid)
                 return;
-
-       if (uio->uio_segflg == UIO_BVEC) {
+       /*
+        * When using a uio with a struct request, we simply
+        * use uio_loffset as a pointer to the next logical byte to
+        * copy in the request.  We don't have to do any fancy
+        * accounting with uio_bvec/uio_iovcnt since we don't use
+        * them.
+        */
+       if (uio->uio_segflg == UIO_BVEC && uio->rq == NULL) {
                 uio->uio_skip += n;
                 while (uio->uio_iovcnt &&
                     uio->uio_skip >= uio->uio_bvec->bv_len) {
diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c

index 39441700ae8c340ac04fef0a9e431eecd23bd0ea..acbab55d03eff7b368474e223eb76a425cd0ff18 100644 (file)
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -41,20 +41,77 @@
  #include <linux/blkdev_compat.h>
  #include <linux/task_io_accounting_ops.h>
  
+#ifdef HAVE_BLK_MQ
+#include <linux/blk-mq.h>
+#endif
+
+static void zvol_request_impl(zvol_state_t *zv, struct bio *bio,
+    struct request *rq, boolean_t force_sync);
+
  static unsigned int zvol_major = ZVOL_MAJOR;
  static unsigned int zvol_request_sync = 0;
  static unsigned int zvol_prefetch_bytes = (128 * 1024);
  static unsigned long zvol_max_discard_blocks = 16384;
-static unsigned int zvol_threads = 32;
  
  #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
  static const unsigned int zvol_open_timeout_ms = 1000;
  #endif
  
+static unsigned int zvol_threads = 0;
+#ifdef HAVE_BLK_MQ
+static unsigned int zvol_blk_mq_threads = 0;
+static unsigned int zvol_blk_mq_actual_threads;
+static boolean_t zvol_use_blk_mq = B_FALSE;
+
+/*
+ * The maximum number of volblocksize blocks to process per thread.  Typically,
+ * write heavy workloads perform better with higher values here, and read
+ * heavy workloads perform better with lower values, but that's not a hard
+ * and fast rule.  It's basically a knob to tune between "less overhead with
+ * less parallelism" and "more overhead, but more parallelism".
+ *
+ * '8' was chosen as a reasonable, balanced, default based off of sequential
+ * read and write tests to a zvol in an NVMe pool (with 16 CPUs).
+ */
+static unsigned int zvol_blk_mq_blocks_per_thread = 8;
+#endif
+
+#ifndef        BLKDEV_DEFAULT_RQ
+/* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
+#define        BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ
+#endif
+
+/*
+ * Finalize our BIO or request.
+ *
+ * With HAVE_BLK_MQ, a non-NULL 'bio' is completed with BIO_END_IO();
+ * otherwise 'rq' is completed with blk_mq_end_request().  Without
+ * HAVE_BLK_MQ only the bio form exists and 'rq' is ignored.  'zv' is not
+ * used by either form.  Callers in this file pass 0 or a negated errno as
+ * 'error'.
+ */
+#ifdef HAVE_BLK_MQ
+#define        END_IO(zv, bio, rq, error)  do { \
+       if (bio) { \
+               BIO_END_IO(bio, error); \
+       } else { \
+               blk_mq_end_request(rq, errno_to_bi_status(error)); \
+       } \
+} while (0)
+#else
+#define        END_IO(zv, bio, rq, error)      BIO_END_IO(bio, error)
+#endif
+
+#ifdef HAVE_BLK_MQ
+static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
+static unsigned int zvol_actual_blk_mq_queue_depth;
+#endif
+
  struct zvol_state_os {
         struct gendisk          *zvo_disk;      /* generic disk */
         struct request_queue    *zvo_queue;     /* request queue */
         dev_t                   zvo_dev;        /* device id */
+
+#ifdef HAVE_BLK_MQ
+       /* Filled in by zvol_blk_mq_alloc_tag_set() */
+       struct blk_mq_tag_set tag_set;
+#endif
+
+       /* Set from the global 'zvol_use_blk_mq' at zvol load */
+       boolean_t use_blk_mq;
  };
  
  taskq_t *zvol_taskq;
@@ -63,8 +120,14 @@ static struct ida zvol_ida;
  typedef struct zv_request_stack {
         zvol_state_t    *zv;
         struct bio      *bio;
+       struct request *rq;     /* blk-mq request (bio is NULL then) */
  } zv_request_t;
  
+/*
+ * Pairs a struct request pointer with a kernel work item.
+ * NOTE(review): nothing in the visible part of this change uses zv_work_t;
+ * confirm it is still needed.
+ */
+typedef struct zv_work {
+       struct request  *rq;
+       struct work_struct work;
+} zv_work_t;
+
  typedef struct zv_request_task {
         zv_request_t zvr;
         taskq_ent_t     ent;
@@ -86,6 +149,62 @@ zv_request_task_free(zv_request_task_t *task)
         kmem_free(task, sizeof (*task));
  }
  
+#ifdef HAVE_BLK_MQ
+
+/*
+ * This is called when a new block multiqueue request comes in.  A request
+ * contains one or more BIOs.
+ *
+ * Passthrough (non filesystem) requests are rejected by returning
+ * BLK_STS_IOERR.  Every other request is handed to zvol_request_impl()
+ * with force_sync disabled and acknowledged with BLK_STS_OK; such a request
+ * is completed later through END_IO().
+ */
+static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
+    const struct blk_mq_queue_data *bd)
+{
+       struct request *rq = bd->rq;
+       zvol_state_t *zv = rq->q->queuedata;
+
+       /* Tell the kernel that we are starting to process this request */
+       blk_mq_start_request(rq);
+
+       if (blk_rq_is_passthrough(rq)) {
+               /*
+                * Skip non filesystem request.  Do not call
+                * blk_mq_end_request() here: the blk-mq core ends the
+                * request itself when ->queue_rq() returns an error status,
+                * so ending it here as well would complete it twice.
+                */
+               return (BLK_STS_IOERR);
+       }
+
+       zvol_request_impl(zv, NULL, rq, B_FALSE);
+
+       /* Acknowledge to the kernel that we got this request */
+       return (BLK_STS_OK);
+}
+
+/*
+ * blk-mq operations for zvols.  Only ->queue_rq is provided; it is installed
+ * into each zvol's tag set by zvol_blk_mq_alloc_tag_set().
+ */
+static struct blk_mq_ops zvol_blk_mq_queue_ops = {
+       .queue_rq = zvol_mq_queue_rq,
+};
+
+/*
+ * Initialize our blk-mq struct.
+ *
+ * Clears zv's tag set, fills it in from the module-wide
+ * zvol_blk_mq_actual_threads and zvol_actual_blk_mq_queue_depth values, and
+ * hands it to blk_mq_alloc_tag_set().  Returns that function's result.
+ */
+static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv)
+{
+       struct blk_mq_tag_set *set = &zv->zv_zso->tag_set;
+
+       memset(set, 0, sizeof (*set));
+
+       set->ops = &zvol_blk_mq_queue_ops;
+       set->driver_data = zv;
+       set->numa_node = NUMA_NO_NODE;
+       set->cmd_size = 0;
+       set->nr_hw_queues = zvol_blk_mq_actual_threads;
+       set->queue_depth = zvol_actual_blk_mq_queue_depth;
+
+       /*
+        * BLK_MQ_F_BLOCKING is required because zvol_request_impl() makes
+        * blocking calls.
+        */
+       set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
+
+       return (blk_mq_alloc_tag_set(set));
+}
+#endif /* HAVE_BLK_MQ */
+
  /*
   * Given a path, return TRUE if path is a ZVOL.
   */
@@ -107,38 +226,51 @@ static void
  zvol_write(zv_request_t *zvr)
  {
         struct bio *bio = zvr->bio;
+       struct request *rq = zvr->rq;
         int error = 0;
         zfs_uio_t uio;
-
-       zfs_uio_bvec_init(&uio, bio);
-
         zvol_state_t *zv = zvr->zv;
+       struct request_queue *q;
+       struct gendisk *disk;
+       unsigned long start_time = 0;
+       boolean_t acct = B_FALSE;
+
         ASSERT3P(zv, !=, NULL);
         ASSERT3U(zv->zv_open_count, >, 0);
         ASSERT3P(zv->zv_zilog, !=, NULL);
  
+       q = zv->zv_zso->zvo_queue;
+       disk = zv->zv_zso->zvo_disk;
+
         /* bio marked as FLUSH need to flush before write */
-       if (bio_is_flush(bio))
+       if (io_is_flush(bio, rq))
                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
  
         /* Some requests are just for flush and nothing else. */
-       if (uio.uio_resid == 0) {
+       if (io_size(bio, rq) == 0) {
                 rw_exit(&zv->zv_suspend_lock);
-               BIO_END_IO(bio, 0);
+               END_IO(zv, bio, rq, 0);
                 return;
         }
  
-       struct request_queue *q = zv->zv_zso->zvo_queue;
-       struct gendisk *disk = zv->zv_zso->zvo_disk;
+       zfs_uio_bvec_init(&uio, bio, rq);
+
         ssize_t start_resid = uio.uio_resid;
-       unsigned long start_time;
  
-       boolean_t acct = blk_queue_io_stat(q);
-       if (acct)
-               start_time = blk_generic_start_io_acct(q, disk, WRITE, bio);
+       /*
+        * With use_blk_mq, accounting is done by blk_mq_start_request()
+        * and blk_mq_end_request(), so we can skip it here.
+        */
+       if (bio) {
+               acct = blk_queue_io_stat(q);
+               if (acct) {
+                       start_time = blk_generic_start_io_acct(q, disk, WRITE,
+                           bio);
+               }
+       }
  
         boolean_t sync =
-           bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+           io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
  
         zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
             uio.uio_loffset, uio.uio_resid, RL_WRITER);
@@ -180,10 +312,11 @@ zvol_write(zv_request_t *zvr)
  
         rw_exit(&zv->zv_suspend_lock);
  
-       if (acct)
+       if (bio && acct) {
                 blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
+       }
  
-       BIO_END_IO(bio, -error);
+       END_IO(zv, bio, rq, -error);
  }
  
  static void
@@ -198,27 +331,33 @@ static void
  zvol_discard(zv_request_t *zvr)
  {
         struct bio *bio = zvr->bio;
+       struct request *rq = zvr->rq;
         zvol_state_t *zv = zvr->zv;
-       uint64_t start = BIO_BI_SECTOR(bio) << 9;
-       uint64_t size = BIO_BI_SIZE(bio);
+       uint64_t start = io_offset(bio, rq);
+       uint64_t size = io_size(bio, rq);
         uint64_t end = start + size;
         boolean_t sync;
         int error = 0;
         dmu_tx_t *tx;
+       struct request_queue *q = zv->zv_zso->zvo_queue;
+       struct gendisk *disk = zv->zv_zso->zvo_disk;
+       unsigned long start_time = 0;
+
+       boolean_t acct = blk_queue_io_stat(q);
  
         ASSERT3P(zv, !=, NULL);
         ASSERT3U(zv->zv_open_count, >, 0);
         ASSERT3P(zv->zv_zilog, !=, NULL);
  
-       struct request_queue *q = zv->zv_zso->zvo_queue;
-       struct gendisk *disk = zv->zv_zso->zvo_disk;
-       unsigned long start_time;
-
-       boolean_t acct = blk_queue_io_stat(q);
-       if (acct)
-               start_time = blk_generic_start_io_acct(q, disk, WRITE, bio);
+       if (bio) {
+               acct = blk_queue_io_stat(q);
+               if (acct) {
+                       start_time = blk_generic_start_io_acct(q, disk, WRITE,
+                           bio);
+               }
+       }
  
-       sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+       sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
  
         if (end > zv->zv_volsize) {
                 error = SET_ERROR(EIO);
@@ -231,7 +370,7 @@ zvol_discard(zv_request_t *zvr)
          * the unaligned parts which is slow (read-modify-write) and useless
          * since we are not freeing any space by doing so.
          */
-       if (!bio_is_secure_erase(bio)) {
+       if (!io_is_secure_erase(bio, rq)) {
                 start = P2ROUNDUP(start, zv->zv_volblocksize);
                 end = P2ALIGN(end, zv->zv_volblocksize);
                 size = end - start;
@@ -262,10 +401,12 @@ zvol_discard(zv_request_t *zvr)
  unlock:
         rw_exit(&zv->zv_suspend_lock);
  
-       if (acct)
-               blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
+       if (bio && acct) {
+               blk_generic_end_io_acct(q, disk, WRITE, bio,
+                   start_time);
+       }
  
-       BIO_END_IO(bio, -error);
+       END_IO(zv, bio, rq, -error);
  }
  
  static void
@@ -280,28 +421,41 @@ static void
  zvol_read(zv_request_t *zvr)
  {
         struct bio *bio = zvr->bio;
+       struct request *rq = zvr->rq;
         int error = 0;
         zfs_uio_t uio;
-
-       zfs_uio_bvec_init(&uio, bio);
-
+       boolean_t acct = B_FALSE;
         zvol_state_t *zv = zvr->zv;
+       struct request_queue *q;
+       struct gendisk *disk;
+       unsigned long start_time = 0;
+
         ASSERT3P(zv, !=, NULL);
         ASSERT3U(zv->zv_open_count, >, 0);
  
-       struct request_queue *q = zv->zv_zso->zvo_queue;
-       struct gendisk *disk = zv->zv_zso->zvo_disk;
+       zfs_uio_bvec_init(&uio, bio, rq);
+
+       q = zv->zv_zso->zvo_queue;
+       disk = zv->zv_zso->zvo_disk;
+
         ssize_t start_resid = uio.uio_resid;
-       unsigned long start_time;
  
-       boolean_t acct = blk_queue_io_stat(q);
-       if (acct)
-               start_time = blk_generic_start_io_acct(q, disk, READ, bio);
+       /*
+        * When blk-mq is being used, accounting is done by
+        * blk_mq_start_request() and blk_mq_end_request().
+        */
+       if (bio) {
+               acct = blk_queue_io_stat(q);
+               if (acct)
+                       start_time = blk_generic_start_io_acct(q, disk, READ,
+                           bio);
+       }
  
         zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
             uio.uio_loffset, uio.uio_resid, RL_READER);
  
         uint64_t volsize = zv->zv_volsize;
+
         while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
                 uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
  
@@ -325,10 +479,11 @@ zvol_read(zv_request_t *zvr)
  
         rw_exit(&zv->zv_suspend_lock);
  
-       if (acct)
+       if (bio && acct) {
                 blk_generic_end_io_acct(q, disk, READ, bio, start_time);
+       }
  
-       BIO_END_IO(bio, -error);
+       END_IO(zv, bio, rq, -error);
  }
  
  static void
@@ -339,52 +494,49 @@ zvol_read_task(void *arg)
         zv_request_task_free(task);
  }
  
-#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
-#ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID
+
+/*
+ * Process a BIO or request
+ *
+ * Either 'bio' or 'rq' should be set depending on if we are processing a
+ * bio or a request (both should not be set).
+ *
+ * force_sync: Set to 0 to defer processing to a background taskq
+ *                     Set to 1 to process data synchronously
+ */
  static void
-zvol_submit_bio(struct bio *bio)
-#else
-static blk_qc_t
-zvol_submit_bio(struct bio *bio)
-#endif
-#else
-static MAKE_REQUEST_FN_RET
-zvol_request(struct request_queue *q, struct bio *bio)
-#endif
+zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
+    boolean_t force_sync)
  {
-#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
-#if defined(HAVE_BIO_BDEV_DISK)
-       struct request_queue *q = bio->bi_bdev->bd_disk->queue;
-#else
-       struct request_queue *q = bio->bi_disk->queue;
-#endif
-#endif
-       zvol_state_t *zv = q->queuedata;
         fstrans_cookie_t cookie = spl_fstrans_mark();
-       uint64_t offset = BIO_BI_SECTOR(bio) << 9;
-       uint64_t size = BIO_BI_SIZE(bio);
-       int rw = bio_data_dir(bio);
+       uint64_t offset = io_offset(bio, rq);
+       uint64_t size = io_size(bio, rq);
+       int rw = io_data_dir(bio, rq);
  
-       if (bio_has_data(bio) && offset + size > zv->zv_volsize) {
-               printk(KERN_INFO
-                   "%s: bad access: offset=%llu, size=%lu\n",
+       if (zvol_request_sync)
+               force_sync = 1;
+
+       zv_request_t zvr = {
+               .zv = zv,
+               .bio = bio,
+               .rq = rq,
+       };
+
+       if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) {
+               printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n",
                     zv->zv_zso->zvo_disk->disk_name,
                     (long long unsigned)offset,
                     (long unsigned)size);
  
-               BIO_END_IO(bio, -SET_ERROR(EIO));
+               END_IO(zv, bio, rq, -SET_ERROR(EIO));
                 goto out;
         }
  
-       zv_request_t zvr = {
-               .zv = zv,
-               .bio = bio,
-       };
         zv_request_task_t *task;
  
         if (rw == WRITE) {
                 if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
-                       BIO_END_IO(bio, -SET_ERROR(EROFS));
+                       END_IO(zv, bio, rq, -SET_ERROR(EROFS));
                         goto out;
                 }
  
@@ -421,7 +573,7 @@ zvol_request(struct request_queue *q, struct bio *bio)
                  * i/o may be a ZIL write (via zil_commit()), or a read of an
                  * indirect block, or a read of a data block (if this is a
                  * partial-block write).  We will indicate that the i/o is
-                * complete by calling BIO_END_IO() from the taskq callback.
+                * complete by calling END_IO() from the taskq callback.
                  *
                  * This design allows the calling thread to continue and
                  * initiate more concurrent operations by calling
@@ -441,12 +593,12 @@ zvol_request(struct request_queue *q, struct bio *bio)
                  * of one i/o at a time per zvol.  However, an even better
                  * design would be for zvol_request() to initiate the zio
                  * directly, and then be notified by the zio_done callback,
-                * which would call BIO_END_IO().  Unfortunately, the DMU/ZIL
+                * which would call END_IO().  Unfortunately, the DMU/ZIL
                  * interfaces lack this functionality (they block waiting for
                  * the i/o to complete).
                  */
-               if (bio_is_discard(bio) || bio_is_secure_erase(bio)) {
-                       if (zvol_request_sync) {
+               if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) {
+                       if (force_sync) {
                                 zvol_discard(&zvr);
                         } else {
                                 task = zv_request_task_create(zvr);
@@ -454,7 +606,7 @@ zvol_request(struct request_queue *q, struct bio *bio)
                                     zvol_discard_task, task, 0, &task->ent);
                         }
                 } else {
-                       if (zvol_request_sync) {
+                       if (force_sync) {
                                 zvol_write(&zvr);
                         } else {
                                 task = zv_request_task_create(zvr);
@@ -469,14 +621,14 @@ zvol_request(struct request_queue *q, struct bio *bio)
                  * data and require no additional handling.
                  */
                 if (size == 0) {
-                       BIO_END_IO(bio, 0);
+                       END_IO(zv, bio, rq, 0);
                         goto out;
                 }
  
                 rw_enter(&zv->zv_suspend_lock, RW_READER);
  
                 /* See comment in WRITE case above. */
-               if (zvol_request_sync) {
+               if (force_sync) {
                         zvol_read(&zvr);
                 } else {
                         task = zv_request_task_create(zvr);
@@ -487,8 +639,33 @@ zvol_request(struct request_queue *q, struct bio *bio)
  
  out:
         spl_fstrans_unmark(cookie);
-#if (defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
-       defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)) && \
+}
+
+#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
+#ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID
+static void
+zvol_submit_bio(struct bio *bio)
+#else
+static blk_qc_t
+zvol_submit_bio(struct bio *bio)
+#endif
+#else
+static MAKE_REQUEST_FN_RET
+zvol_request(struct request_queue *q, struct bio *bio)
+#endif
+{
+#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
+#if defined(HAVE_BIO_BDEV_DISK)
+       struct request_queue *q = bio->bi_bdev->bd_disk->queue;
+#else
+       struct request_queue *q = bio->bi_disk->queue;
+#endif
+#endif
+       zvol_state_t *zv = q->queuedata;
+
+       zvol_request_impl(zv, bio, NULL, 0);
+#if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
+       defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
         !defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID)
         return (BLK_QC_T_NONE);
  #endif
@@ -805,6 +982,27 @@ zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
         return (0);
  }
  
+/*
+ * Why have two separate block_device_operations structs?
+ *
+ * Normally we'd just have one, and assign 'submit_bio' as needed.  However,
+ * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we
+ * can't just change submit_bio dynamically at runtime.  So just create two
+ * separate structs to get around this.
+ */
+static const struct block_device_operations zvol_ops_blk_mq = {
+       .open                   = zvol_open,
+       .release                = zvol_release,
+       .ioctl                  = zvol_ioctl,
+       .compat_ioctl           = zvol_compat_ioctl,
+       .check_events           = zvol_check_events,
+#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
+       .revalidate_disk        = zvol_revalidate_disk,
+#endif
+       .getgeo                 = zvol_getgeo,
+       .owner                  = THIS_MODULE,
+};
+
  static const struct block_device_operations zvol_ops = {
         .open                   = zvol_open,
         .release                = zvol_release,
@@ -821,6 +1019,87 @@ static const struct block_device_operations zvol_ops = {
  #endif
  };
  
+static int
+zvol_alloc_non_blk_mq(struct zvol_state_os *zso)
+{
+#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
+#if defined(HAVE_BLK_ALLOC_DISK)
+       zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE);
+       if (zso->zvo_disk == NULL)
+               return (1);
+
+       zso->zvo_disk->minors = ZVOL_MINORS;
+       zso->zvo_queue = zso->zvo_disk->queue;
+#else
+       zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
+       if (zso->zvo_queue == NULL)
+               return (1);
+
+       zso->zvo_disk = alloc_disk(ZVOL_MINORS);
+       if (zso->zvo_disk == NULL) {
+               blk_cleanup_queue(zso->zvo_queue);
+               return (1);
+       }
+
+       zso->zvo_disk->queue = zso->zvo_queue;
+#endif /* HAVE_BLK_ALLOC_DISK */
+#else
+       zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
+       if (zso->zvo_queue == NULL)
+               return (1);
+
+       zso->zvo_disk = alloc_disk(ZVOL_MINORS);
+       if (zso->zvo_disk == NULL) {
+               blk_cleanup_queue(zso->zvo_queue);
+               return (1);
+       }
+
+       zso->zvo_disk->queue = zso->zvo_queue;
+#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
+       return (0);
+
+}
+
+/* Set up tag_set, disk and queue for a blk-mq zvol; 0 on success, else 1. */
+static int
+zvol_alloc_blk_mq(zvol_state_t *zv)
+{
+#ifdef HAVE_BLK_MQ
+       struct zvol_state_os *zso = zv->zv_zso;
+
+       /* Allocate our blk-mq tag_set */
+       if (zvol_blk_mq_alloc_tag_set(zv) != 0)
+               return (1);
+#if defined(HAVE_BLK_ALLOC_DISK)
+       /* blk_mq_alloc_disk() signals failure with an ERR_PTR(), not NULL */
+       zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv);
+       if (zso->zvo_disk == NULL || IS_ERR(zso->zvo_disk)) {
+               zso->zvo_disk = NULL;
+               blk_mq_free_tag_set(&zso->tag_set);
+               return (1);
+       }
+       zso->zvo_queue = zso->zvo_disk->queue;
+       zso->zvo_disk->minors = ZVOL_MINORS;
+#else
+       zso->zvo_disk = alloc_disk(ZVOL_MINORS);
+       if (zso->zvo_disk == NULL) {
+               /* No queue exists yet, only the tag_set needs freeing */
+               blk_mq_free_tag_set(&zso->tag_set);
+               return (1);
+       }
+       zso->zvo_queue = blk_mq_init_queue(&zso->tag_set);
+       if (IS_ERR(zso->zvo_queue)) {
+               put_disk(zso->zvo_disk);
+               blk_mq_free_tag_set(&zso->tag_set);
+               return (1);
+       }
+       /* Our queue is now created, assign it to our disk */
+       zso->zvo_disk->queue = zso->zvo_queue;
+#endif
+#endif
+       return (0);
+}
+
  /*
   * Allocate memory for a new zvol_state_t and setup the required
   * request queue and generic disk structures for the block device.
@@ -831,6 +1110,7 @@ zvol_alloc(dev_t dev, const char *name)
         zvol_state_t *zv;
         struct zvol_state_os *zso;
         uint64_t volmode;
+       int ret;
  
         if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
                 return (NULL);
@@ -849,48 +1129,44 @@ zvol_alloc(dev_t dev, const char *name)
         list_link_init(&zv->zv_next);
         mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
  
-#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
-#ifdef HAVE_BLK_ALLOC_DISK
-       zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE);
-       if (zso->zvo_disk == NULL)
-               goto out_kmem;
-
-       zso->zvo_disk->minors = ZVOL_MINORS;
-       zso->zvo_queue = zso->zvo_disk->queue;
-#else
-       zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
-       if (zso->zvo_queue == NULL)
-               goto out_kmem;
+#ifdef HAVE_BLK_MQ
+       zv->zv_zso->use_blk_mq = zvol_use_blk_mq;
+#endif
  
-       zso->zvo_disk = alloc_disk(ZVOL_MINORS);
-       if (zso->zvo_disk == NULL) {
-               blk_cleanup_queue(zso->zvo_queue);
-               goto out_kmem;
+       /*
+        * The block layer has 3 interfaces for getting BIOs:
+        *
+        * 1. blk-mq request queues (new)
+        * 2. submit_bio() (oldest)
+        * 3. regular request queues (old).
+        *
+        * Each of those interfaces has two permutations:
+        *
+        * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates
+        *    both the disk and its queue (5.14 kernel or newer)
+        *
+        * b) We don't have blk_*alloc_disk(), and have to allocate the
+        *    disk and the queue separately. (5.13 kernel or older)
+        */
+       if (zv->zv_zso->use_blk_mq) {
+               ret = zvol_alloc_blk_mq(zv);
+               zso->zvo_disk->fops = &zvol_ops_blk_mq;
+       } else {
+               ret = zvol_alloc_non_blk_mq(zso);
+               zso->zvo_disk->fops = &zvol_ops;
         }
-
-       zso->zvo_disk->queue = zso->zvo_queue;
-#endif /* HAVE_BLK_ALLOC_DISK */
-#else
-       zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
-       if (zso->zvo_queue == NULL)
+       if (ret != 0)
                 goto out_kmem;
  
-       zso->zvo_disk = alloc_disk(ZVOL_MINORS);
-       if (zso->zvo_disk == NULL) {
-               blk_cleanup_queue(zso->zvo_queue);
-               goto out_kmem;
-       }
-
-       zso->zvo_disk->queue = zso->zvo_queue;
-#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
-
         blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE);
  
         /* Limit read-ahead to a single page to prevent over-prefetching. */
         blk_queue_set_read_ahead(zso->zvo_queue, 1);
  
-       /* Disable write merging in favor of the ZIO pipeline. */
-       blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
+       if (!zv->zv_zso->use_blk_mq) {
+               /* Disable write merging in favor of the ZIO pipeline. */
+               blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
+       }
  
         /* Enable /proc/diskstats */
         blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue);
@@ -918,7 +1194,6 @@ zvol_alloc(dev_t dev, const char *name)
         }
  
         zso->zvo_disk->first_minor = (dev & MINORMASK);
-       zso->zvo_disk->fops = &zvol_ops;
         zso->zvo_disk->private_data = zv;
         snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
             ZVOL_DEV_NAME, (dev & MINORMASK));
@@ -963,6 +1238,11 @@ zvol_os_free(zvol_state_t *zv)
         put_disk(zv->zv_zso->zvo_disk);
  #endif
  
+#ifdef HAVE_BLK_MQ
+       if (zv->zv_zso->use_blk_mq)
+               blk_mq_free_tag_set(&zv->zv_zso->tag_set);
+#endif
+
         ida_simple_remove(&zvol_ida,
             MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);
  
@@ -1044,8 +1324,69 @@ zvol_os_create_minor(const char *name)
  
         blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue,
             (DMU_MAX_ACCESS / 4) >> 9);
-       blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX);
-       blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX);
+
+       if (zv->zv_zso->use_blk_mq) {
+               /*
+                * IO requests can be really big (1MB).  When an IO request
+                * comes in, it is passed off to zvol_read() or zvol_write()
+                * in a new thread, where it is chunked up into 'volblocksize'
+                * sized pieces and processed.  So for example, if the request
+                * is a 1MB write and your volblocksize is 128k, one zvol_write
+                * thread will take that request and sequentially do eight 128k
+                * IOs.  This is due to the fact that the thread needs to lock
+                * each volblocksize sized block.  So you might be wondering:
+                * "instead of passing the whole 1MB request to one thread,
+                * why not pass eight separate 128k chunks to eight threads and
+                * process the whole write in parallel?"  The short answer is
+                * that there's a sweet spot number of chunks that balances
+                * the greater parallelism with the added overhead of more
+                * threads. The sweet spot can differ depending on whether you
+                * have a read- or write-heavy workload.  Writes typically want
+                * high chunk counts while reads typically want lower ones.  On
+                * a test pool with 6 NVMe drives in a 3x 2-disk mirror
+                * configuration, with volblocksize=8k, the sweet spot for good
+                * sequential reads and writes was at 8 chunks.
+                */
+
+               /*
+                * Below we tell the kernel how big we want our requests
+                * to be.  You would think that blk_queue_io_opt() would be
+                * used to do this since it is used to "set optimal request
+                * size for the queue", but that doesn't seem to do
+                * anything - the kernel still gives you huge requests
+                * with tons of little PAGE_SIZE segments contained within it.
+                *
+                * Knowing that the kernel will just give you PAGE_SIZE segments
+                * no matter what, you can say "ok, I want PAGE_SIZE byte
+                * segments, and I want 'N' of them per request", where N is
+                * the correct number of segments for the volblocksize and
+                * number of chunks you want.
+                */
+#ifdef HAVE_BLK_MQ
+               if (zvol_blk_mq_blocks_per_thread != 0) {
+                       unsigned int chunks;
+                       chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);
+
+                       blk_queue_max_segment_size(zv->zv_zso->zvo_queue,
+                           PAGE_SIZE);
+                       blk_queue_max_segments(zv->zv_zso->zvo_queue,
+                           (zv->zv_volblocksize * chunks) / PAGE_SIZE);
+               } else {
+                       /*
+                        * Special case: zvol_blk_mq_blocks_per_thread = 0
+                        * Max everything out.
+                        */
+                       blk_queue_max_segments(zv->zv_zso->zvo_queue,
+                           UINT16_MAX);
+                       blk_queue_max_segment_size(zv->zv_zso->zvo_queue,
+                           UINT_MAX);
+               }
+#endif
+       } else {
+               blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX);
+               blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX);
+       }
+
         blk_queue_physical_block_size(zv->zv_zso->zvo_queue,
             zv->zv_volblocksize);
         blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize);
@@ -1167,19 +1508,54 @@ int
  zvol_init(void)
  {
         int error;
-       int threads = MIN(MAX(zvol_threads, 1), 1024);
+
+       /*
+        * zvol_threads is the module param the user passes in.
+        *
+        * zvol_actual_threads is what we use internally, since the user can
+        * pass zvol_threads = 0 to mean "use all the CPUs" (the default).
+        */
+       static unsigned int zvol_actual_threads;
+
+       if (zvol_threads == 0) {
+               /*
+                * See dde9380a1 for why 32 was chosen here.  This should
+                * probably be refined to be some multiple of the number
+                * of CPUs.
+                */
+               zvol_actual_threads = MAX(num_online_cpus(), 32);
+       } else {
+               zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024);
+       }
  
         error = register_blkdev(zvol_major, ZVOL_DRIVER);
         if (error) {
                 printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
                 return (error);
         }
-       zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri,
-           threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+
+#ifdef HAVE_BLK_MQ
+       if (zvol_blk_mq_queue_depth == 0) {
+               zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
+       } else {
+               zvol_actual_blk_mq_queue_depth =
+                   MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ);
+       }
+
+       if (zvol_blk_mq_threads == 0) {
+               zvol_blk_mq_actual_threads = num_online_cpus();
+       } else {
+               zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1),
+                   1024);
+       }
+#endif
+       zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_actual_threads, maxclsyspri,
+           zvol_actual_threads, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
         if (zvol_taskq == NULL) {
                 unregister_blkdev(zvol_major, ZVOL_DRIVER);
                 return (-ENOMEM);
         }
+
         zvol_init_impl();
         ida_init(&zvol_ida);
         return (0);
@@ -1202,7 +1578,8 @@ module_param(zvol_major, uint, 0444);
  MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
  
  module_param(zvol_threads, uint, 0444);
-MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests");
+MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. Set"
+    " to 0 to use all active CPUs");
  
  module_param(zvol_request_sync, uint, 0644);
  MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");
@@ -1215,4 +1592,17 @@ MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
  
  module_param(zvol_volmode, uint, 0644);
  MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
+
+#ifdef HAVE_BLK_MQ
+module_param(zvol_blk_mq_queue_depth, uint, 0644);
+MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth");
+
+module_param(zvol_use_blk_mq, uint, 0644);
+MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols");
+
+module_param(zvol_blk_mq_blocks_per_thread, uint, 0644);
+MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread,
+    "Process volblocksize blocks per thread");
+#endif
+
  /* END CSTYLED */
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run

index 243221598d09f3ff7894628218f33b733b3e3968..89ee0d3cb7b63bb424499069b948a316250f4e2b 100644 (file)
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -937,9 +937,13 @@ tags = ['functional', 'zvol', 'zvol_cli']
  
  [tests/functional/zvol/zvol_misc]
  tests = ['zvol_misc_002_pos', 'zvol_misc_hierarchy', 'zvol_misc_rename_inuse',
-    'zvol_misc_snapdev', 'zvol_misc_volmode', 'zvol_misc_zil']
+    'zvol_misc_snapdev', 'zvol_misc_trim', 'zvol_misc_volmode', 'zvol_misc_zil']
  tags = ['functional', 'zvol', 'zvol_misc']
  
+[tests/functional/zvol/zvol_stress]
+tests = ['zvol_stress']
+tags = ['functional', 'zvol', 'zvol_stress']
+
  [tests/functional/zvol/zvol_swap]
  tests = ['zvol_swap_001_pos', 'zvol_swap_002_pos', 'zvol_swap_004_pos']
  tags = ['functional', 'zvol', 'zvol_swap']
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run

index 3985da146044c2773e0f075aa57cdda29bde0373..fa71f412ba6c5f9fc9821ce7089277efd4376fa6 100644 (file)
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -184,3 +184,8 @@ tags = ['functional', 'user_namespace']
  tests = ['groupspace_001_pos', 'groupspace_002_pos', 'groupspace_003_pos',
      'userquota_013_pos', 'userspace_003_pos']
  tags = ['functional', 'userquota']
+
+[tests/functional/zvol/zvol_misc:Linux]
+tests = ['zvol_misc_fua']
+tags = ['functional', 'zvol', 'zvol_misc']
+
diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg

index 99430bc103241ca8d9b73034da2217cb4db5ccb7..1ee786d131d74f61f5fade3f7a3def816a9bf576 100644 (file)
--- a/tests/zfs-tests/include/commands.cfg
+++ b/tests/zfs-tests/include/commands.cfg
@@ -120,10 +120,12 @@ export SYSTEM_FILES_FREEBSD='chflags
      showmount
      swapctl
      sysctl
+    trim
      uncompress'
  
  export SYSTEM_FILES_LINUX='attr
      blkid
+    blkdiscard
      blockdev
      chattr
      exportfs
diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib

index 51d4e225f10fabf9d68576e79bd0a9f95fdf69ca..cb20318f44c56cdd947f282dabf225657aa38a63 100644 (file)
--- a/tests/zfs-tests/include/libtest.shlib
+++ b/tests/zfs-tests/include/libtest.shlib
@@ -2770,20 +2770,22 @@ function is_te_enabled
         svcs -H -o state labeld 2>/dev/null | grep -q "enabled"
  }
  
+# Return the number of CPUs (cross-platform)
+function get_num_cpus
+{
+       if is_linux ; then
+               grep -c '^processor' /proc/cpuinfo
+       elif is_freebsd; then
+               sysctl -n kern.smp.cpus
+       else
+               psrinfo | wc -l
+       fi
+}
+
  # Utility function to determine if a system has multiple cpus.
  function is_mp
  {
-       case "$UNAME" in
-               Linux)
-                       (($(grep -c '^processor' /proc/cpuinfo) > 1))
-                       ;;
-               FreeBSD)
-                       sysctl -n kern.smp.cpus
-                       ;;
-               *)
-                       (($(psrinfo | wc -l) > 1))
-                       ;;
-       esac
+       [[ $(get_num_cpus) -gt 1 ]]
  }
  
  function get_cpu_freq
@@ -3320,14 +3322,23 @@ function get_tunable_impl
  {
         typeset name="$1"
         typeset module="${2:-zfs}"
+       typeset check_only="$3"
  
         eval "typeset tunable=\$$name"
         case "$tunable" in
         UNSUPPORTED)
-               log_unsupported "Tunable '$name' is unsupported on $UNAME"
+               if [ -z "$check_only" ] ; then
+                       log_unsupported "Tunable '$name' is unsupported on $UNAME"
+               else
+                       return 1
+               fi
                 ;;
         "")
-               log_fail "Tunable '$name' must be added to tunables.cfg"
+               if [ -z "$check_only" ] ; then
+                       log_fail "Tunable '$name' must be added to tunables.cfg"
+               else
+                       return 1
+               fi
                 ;;
         *)
                 ;;
@@ -3347,6 +3358,14 @@ function get_tunable_impl
         esac
  }
  
+# Does a tunable exist?
+#
+# $1: Tunable name
+function tunable_exists
+{
+       get_tunable_impl $1 "zfs" 1
+}
+
  #
  # Compute MD5 digest for given file or stdin if no file given.
  # Note: file path must not contain spaces
diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg

index d3838cb7c8ed3297db8d46546b69023d481786e5..d6a2fe5db7c60a333d226a5c5ae5a29e1c4cd2eb 100644 (file)
--- a/tests/zfs-tests/include/tunables.cfg
+++ b/tests/zfs-tests/include/tunables.cfg
@@ -87,6 +87,7 @@ VDEV_VALIDATE_SKIP            vdev.validate_skip              vdev_validate_skip
  VOL_INHIBIT_DEV                        UNSUPPORTED                     zvol_inhibit_dev
  VOL_MODE                       vol.mode                        zvol_volmode
  VOL_RECURSIVE                  vol.recursive                   UNSUPPORTED
+VOL_USE_BLK_MQ                 UNSUPPORTED                     zvol_use_blk_mq
  XATTR_COMPAT                   xattr_compat                    zfs_xattr_compat
  ZEVENT_LEN_MAX                 zevent.len_max                  zfs_zevent_len_max
  ZEVENT_RETAIN_MAX              zevent.retain_max               zfs_zevent_retain_max
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am

index ffc087351e3800e00b965a1f1355a4822690f21a..d759e51968cd2e8e65a0d1dedab38149eab3f8a2 100644 (file)
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1966,11 +1966,16 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
         functional/zvol/zvol_misc/zvol_misc_004_pos.ksh \
         functional/zvol/zvol_misc/zvol_misc_005_neg.ksh \
         functional/zvol/zvol_misc/zvol_misc_006_pos.ksh \
+       functional/zvol/zvol_misc/zvol_misc_fua.ksh \
         functional/zvol/zvol_misc/zvol_misc_hierarchy.ksh \
         functional/zvol/zvol_misc/zvol_misc_rename_inuse.ksh \
         functional/zvol/zvol_misc/zvol_misc_snapdev.ksh \
+       functional/zvol/zvol_misc/zvol_misc_trim.ksh \
         functional/zvol/zvol_misc/zvol_misc_volmode.ksh \
         functional/zvol/zvol_misc/zvol_misc_zil.ksh \
+       functional/zvol/zvol_stress/cleanup.ksh \
+       functional/zvol/zvol_stress/setup.ksh \
+       functional/zvol/zvol_stress/zvol_stress.ksh \
         functional/zvol/zvol_swap/cleanup.ksh \
         functional/zvol/zvol_swap/setup.ksh \
         functional/zvol/zvol_swap/zvol_swap_001_pos.ksh \
diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_common.shlib b/tests/zfs-tests/tests/functional/zvol/zvol_common.shlib

index c0fd90f58eaf7680608d09dd0326f2bb1b4da104..c04559fe337bf16e1f040afb99baaf70879cd032 100644 (file)
--- a/tests/zfs-tests/tests/functional/zvol/zvol_common.shlib
+++ b/tests/zfs-tests/tests/functional/zvol/zvol_common.shlib
@@ -128,3 +128,14 @@ function is_zvol_dumpified
  
         zdb -dddd $volume 2 | grep -q "dumpsize"
  }
+
+# enable/disable blk-mq (if available)
+#
+# $1: 1 = enable, 0 = disable
+function set_blk_mq
+{
+       # Not all kernels support blk-mq
+       if tunable_exists VOL_USE_BLK_MQ ; then
+               log_must set_tunable32 VOL_USE_BLK_MQ $1
+       fi
+}
diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh

new file mode 100755 (executable)

index 0000000..e441070
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh
@@ -0,0 +1,96 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2022 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/zvol/zvol_common.shlib
+
+#
+# DESCRIPTION:
+#      Verify that a zvol Force Unit Access (FUA) write works.
+#
+# STRATEGY:
+# 1. dd write 5MB of data with "oflag=dsync,direct" to a zvol.  Those flags
+#    together do a FUA write.
+# 2. Verify the data is correct.
+# 3. Repeat 1-2 for both the blk-mq and non-blk-mq cases.
+
+verify_runnable "global"
+
+if ! is_physical_device $DISKS; then
+       log_unsupported "This directory cannot be run on raw files."
+fi
+
+if ! is_linux ; then
+       log_unsupported "Only linux supports dd with oflag=dsync for FUA writes"
+fi
+
+typeset datafile1="$(mktemp zvol_misc_fua1.XXXXXX)"
+typeset datafile2="$(mktemp zvol_misc_fua2.XXXXXX)"
+typeset zvolpath=${ZVOL_DEVDIR}/$TESTPOOL/$TESTVOL
+
+function cleanup
+{
+       rm "$datafile1" "$datafile2"
+}
+
+function do_test {
+       # Wait for udev to create symlinks to our zvol
+       block_device_wait $zvolpath
+
+       # Create a data file
+       log_must dd if=/dev/urandom of="$datafile1" bs=1M count=5
+
+       # Write the data to our zvol using FUA
+       log_must dd if=$datafile1 of=$zvolpath oflag=dsync,direct bs=1M count=5
+
+       # Extract data from our zvol
+       log_must dd if=$zvolpath of="$datafile2" bs=1M count=5
+
+       # Compare the data we expect with what's on our zvol.  diff will return
+       # non-zero if they differ.
+       log_must diff $datafile1 $datafile2
+
+       log_must rm $datafile1 $datafile2
+}
+
+log_assert "Verify that a ZFS volume can do Force Unit Access (FUA)"
+log_onexit cleanup
+
+log_must zfs set compression=off $TESTPOOL/$TESTVOL
+
+log_note "Testing without blk-mq"
+
+set_blk_mq 0
+log_must zpool export $TESTPOOL
+log_must zpool import $TESTPOOL
+do_test
+
+set_blk_mq 1
+log_must zpool export $TESTPOOL
+log_must zpool import $TESTPOOL
+do_test
+
+log_pass "ZFS volume FUA works"
diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh

new file mode 100755 (executable)

index 0000000..2e417a0
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh
@@ -0,0 +1,136 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2022 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/include/math.shlib
+. $STF_SUITE/tests/functional/zvol/zvol_common.shlib
+
+#
+# DESCRIPTION:
+#      Verify we can TRIM a zvol
+#
+# STRATEGY:
+# 1. TRIM the entire zvol to remove data from older tests
+# 2. Create a 5MB data file
+# 3. Write the file to the zvol
+# 4. Observe 5MB of used space on the zvol
+# 5. TRIM the first 1MB and last 2MB of the 5MB block of data.
+# 6. Observe 2MB of used space on the zvol
+# 7. Verify the trimmed regions are zero'd on the zvol
+
+verify_runnable "global"
+
+if is_linux ; then
+       # We need '--force' here since the prior tests may leave a filesystem
+       # on the zvol, and blkdiscard will see that filesystem and print a
+       # warning unless you force it.
+       #
+       # Only blkdiscard >= v2.36 supports --force, so we need to
+       # check for it.
+       if blkdiscard --help | grep -q '\-\-force' ; then
+               trimcmd='blkdiscard --force'
+       else
+               trimcmd='blkdiscard'
+       fi
+else
+       # By default, FreeBSD 'trim' always does a dry-run.  '-f' makes
+       # it perform the actual operation.
+       trimcmd='trim -f'
+fi
+
+if ! is_physical_device $DISKS; then
+       log_unsupported "This directory cannot be run on raw files."
+fi
+
+typeset datafile1="$(mktemp zvol_misc_flags1.XXXXXX)"
+typeset datafile2="$(mktemp zvol_misc_flags2.XXXXXX)"
+typeset zvolpath=${ZVOL_DEVDIR}/$TESTPOOL/$TESTVOL
+
+function cleanup
+{
+       rm "$datafile1" "$datafile2"
+}
+
+function do_test {
+       # Wait for udev to create symlinks to our zvol
+       block_device_wait $zvolpath
+
+       # Create a data file
+       log_must dd if=/dev/urandom of="$datafile1" bs=1M count=5
+       
+       # Write to zvol
+       log_must dd if=$datafile1 of=$zvolpath conv=fsync
+
+       # Record how much space we've used (should be 5MB, with 128k
+       # of tolerance).
+       before="$(get_prop refer $TESTPOOL/$TESTVOL)"
+       log_must within_tolerance $before 5242880 131072
+
+       # We currently have 5MB of random data on the zvol.
+       # Trim the first 1MB and also trim 2MB at offset 3MB.
+       log_must $trimcmd -l $((1 * 1048576)) $zvolpath
+       log_must $trimcmd -o $((3 * 1048576)) -l $((2 * 1048576)) $zvolpath
+       sync_pool
+
+       # After trimming 3MB, the zvol should have 2MB of data (with 128k of
+       # tolerance).
+       after="$(get_prop refer $TESTPOOL/$TESTVOL)"
+       log_must within_tolerance $after 2097152 131072
+
+       # Make the same holes in our test data
+       log_must dd if=/dev/zero of="$datafile1" bs=1M count=1 conv=notrunc
+       log_must dd if=/dev/zero of="$datafile1" bs=1M count=2 seek=3 conv=notrunc
+
+       # Extract data from our zvol
+       log_must dd if=$zvolpath of="$datafile2" bs=1M count=5
+
+       # Compare the data we expect with what's on our zvol.  diff will return
+       # non-zero if they differ.
+       log_must diff $datafile1 $datafile2
+
+       log_must rm $datafile1 $datafile2
+}
+
+log_assert "Verify that a ZFS volume can be TRIMed"
+log_onexit cleanup
+
+log_must zfs set compression=off $TESTPOOL/$TESTVOL
+
+# Remove old data from previous tests
+log_must $trimcmd $zvolpath
+
+
+set_blk_mq 1
+log_must zpool export $TESTPOOL
+log_must zpool import $TESTPOOL
+do_test
+
+set_blk_mq 0
+log_must zpool export $TESTPOOL
+log_must zpool import $TESTPOOL
+do_test
+
+log_pass "ZFS volumes can be trimmed"
diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_stress/cleanup.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_stress/cleanup.ksh

new file mode 100755 (executable)

index 0000000..b81a372
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/zvol/zvol_stress/cleanup.ksh
@@ -0,0 +1,36 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2013 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "global"
+
+default_cleanup
diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_stress/setup.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_stress/setup.ksh

new file mode 100755 (executable)

index 0000000..9e70fc4
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/zvol/zvol_stress/setup.ksh
@@ -0,0 +1,36 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2013 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "global"
+
+default_setup "$DISKS"
diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh

new file mode 100755 (executable)

index 0000000..c1aadca
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh
@@ -0,0 +1,169 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright (c) 2022 by Lawrence Livermore National Security, LLC.
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/reservation/reservation.shlib
+. $STF_SUITE/tests/functional/zvol/zvol_common.shlib
+
+#
+# DESCRIPTION:
+# Stress test multithreaded transfers to multiple zvols.  Also verify
+# zvol errors show up in zpool status.
+#
+# STRATEGY:
+#
+# For both the normal submit_bio() codepath and the blk-mq codepath, do
+# the following:
+#
+# 1. Create one zvol per CPU
+# 2. In parallel, spawn an fio "write and verify" for each zvol
+# 3. Inject write errors
+# 4. Write to one of the zvols with dd and verify the errors
+#
+
+verify_runnable "global"
+
+num_zvols=$(get_num_cpus)
+
+# If we were making one big zvol from all the pool space, it would
+# be this big:
+biggest_zvol_size_possible=$(largest_volsize_from_pool $TESTPOOL)
+
+# Crude calculation: take the biggest zvol size we could possibly
+# create, knock 10% off it (for overhead) and divide by the number
+# of ZVOLs we want to make.
+#
+# Round the value down to a whole number with floor()
+typeset -f each_zvol_size=$(( floor($biggest_zvol_size_possible * 0.9 / \
+       $num_zvols )))
+
+typeset tmpdir="$(mktemp -d zvol_stress_fio_state.XXXXXX)"
+
+function create_zvols
+{
+       log_note "Creating $num_zvols zvols that are ${each_zvol_size}B each"
+       for i in $(seq $num_zvols) ; do
+               log_must zfs create -V $each_zvol_size $TESTPOOL/testvol$i
+               block_device_wait "$ZVOL_DEVDIR/$TESTPOOL/testvol$i"
+       done
+}
+
+function destroy_zvols
+{
+       for i in $(seq $num_zvols) ; do
+               log_must_busy zfs destroy $TESTPOOL/testvol$i
+       done
+}
+
+function do_zvol_stress
+{
+       # Write 10% of each zvol, or 50MB, whichever is less
+       zvol_write_size=$((each_zvol_size / 10))
+       if [ $zvol_write_size -gt $((50 * 1048576)) ] ; then
+               zvol_write_size=$((50 * 1048576))
+       fi
+       zvol_write_size_mb=$(($zvol_write_size / 1048576))
+
+       if is_linux ; then
+               engine=libaio
+       else
+               engine=psync
+       fi
+
+       # Spawn off one fio per zvol in parallel
+       pids=""
+       for i in $(seq $num_zvols) ; do
+               # Spawn one fio per zvol as its own process
+               fio --ioengine=$engine --name=zvol_stress$i --direct=0 \
+                       --filename="$ZVOL_DEVDIR/$TESTPOOL/testvol$i" --bs=1048576 \
+                       --iodepth=10 --readwrite=randwrite --size=${zvol_write_size} \
+                       --verify_async=2 --numjobs=1 --verify=sha1 \
+                       --verify_fatal=1 \
+                       --continue_on_error=none \
+                       --error_dump=1 \
+                       --exitall_on_error \
+                       --aux-path="$tmpdir" --do_verify=1 &
+               pids="$pids $!"
+       done
+
+       # Wait for all the spawned fios to finish and look for errors
+       fail=""
+       i=0
+       for pid in $pids ; do
+               log_note "$s waiting on $pid"
+               if ! wait $pid ; then
+                       log_fail "fio error on $TESTPOOL/testvol$i"
+               fi
+               i=$(($i + 1))
+       done
+}
+
+#
+# Exit handler registered with log_onexit: cancel the zinject handlers,
+# clear the pool's error state, destroy the test zvols, switch blk-mq back
+# off and remove the fio state directory.
+#
+function cleanup
+{
+       log_must zinject -c all
+       log_must zpool clear $TESTPOOL
+       destroy_zvols
+       set_blk_mq 0
+
+       # Nothing left to remove when $tmpdir is empty
+       [ -n "$tmpdir" ] || return 0
+
+       # Remove all fio's leftover state files, then the directory itself
+       log_must rm -fd "$tmpdir"/*.state "$tmpdir"
+}
+
+log_onexit cleanup
+
+log_assert "Stress test zvols"
+
+set_blk_mq 0
+create_zvols
+# Do some fio write/verifies in parallel
+do_zvol_stress
+destroy_zvols
+
+# Enable blk-mq (block multi-queue), and re-run the same test
+set_blk_mq 1
+create_zvols
+do_zvol_stress
+
+# Inject some errors, and verify we see some IO errors in zpool status
+for DISK in $DISKS ; do
+       log_must zinject -d $DISK -f 10 -e io -T write $TESTPOOL
+done
+log_must dd if=/dev/zero of=$ZVOL_DEVDIR/$TESTPOOL/testvol1 bs=512 count=50
+log_must zinject -c all
+
+# We should see write errors
+typeset -i write_errors=$(zpool status -p | awk '
+       !NF { isvdev = 0 }
+       isvdev { errors += $4 }
+       /CKSUM$/ { isvdev = 1 }
+       END { print errors }
+')
+
+if [ $write_errors -eq 0 ] ; then
+       log_fail "Expected to see some write errors"
+else
+       log_note "Correctly saw $write_errors write errors"
+fi
+log_pass "Done with zvol_stress"
author	Tony Hutter <hutter2@llnl.gov>
	Thu, 9 Jun 2022 14:10:38 +0000 (07:10 -0700)
committer	GitHub <noreply@github.com>
	Thu, 9 Jun 2022 14:10:38 +0000 (08:10 -0600)
config/kernel-blk-queue.m4		patch \| blob \| blame \| history
include/os/linux/kernel/linux/blkdev_compat.h		patch \| blob \| blame \| history
include/os/linux/spl/sys/uio.h		patch \| blob \| blame \| history
man/man4/zfs.4		patch \| blob \| blame \| history
module/os/linux/zfs/zfs_uio.c		patch \| blob \| blame \| history
module/os/linux/zfs/zvol_os.c		patch \| blob \| blame \| history
tests/runfiles/common.run		patch \| blob \| blame \| history
tests/runfiles/linux.run		patch \| blob \| blame \| history
tests/zfs-tests/include/commands.cfg		patch \| blob \| blame \| history
tests/zfs-tests/include/libtest.shlib		patch \| blob \| blame \| history
tests/zfs-tests/include/tunables.cfg		patch \| blob \| blame \| history
tests/zfs-tests/tests/Makefile.am		patch \| blob \| blame \| history
tests/zfs-tests/tests/functional/zvol/zvol_common.shlib		patch \| blob \| blame \| history
tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh	[new file with mode: 0755]	patch \| blob
tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh	[new file with mode: 0755]	patch \| blob
tests/zfs-tests/tests/functional/zvol/zvol_stress/cleanup.ksh	[new file with mode: 0755]	patch \| blob
tests/zfs-tests/tests/functional/zvol/zvol_stress/setup.ksh	[new file with mode: 0755]	patch \| blob
tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh	[new file with mode: 0755]	patch \| blob