zvol processing should use struct bio

author Richard Yao <ryao@gentoo.org>

Fri, 4 Jul 2014 22:43:47 +0000 (18:43 -0400)

committer Richard Yao <ryao@gentoo.org>

Fri, 4 Sep 2015 19:30:24 +0000 (15:30 -0400)
author Richard Yao <ryao@gentoo.org>
Fri, 4 Jul 2014 22:43:47 +0000 (18:43 -0400)
committer Richard Yao <ryao@gentoo.org>
Fri, 4 Sep 2015 19:30:24 +0000 (15:30 -0400)
diff --git a/config/kernel-bio-rw-barrier.m4 b/config/kernel-bio-rw-barrier.m4

new file mode 100644 (file)

index 0000000..bcf0f7e
--- /dev/null
+++ b/config/kernel-bio-rw-barrier.m4
@@ -0,0 +1,25 @@
+dnl #
+dnl # Interface for issuing a barrier bio:
+dnl # 2.6.28-2.6.35: BIO_RW_BARRIER
+dnl # 2.6.36-3.x:    REQ_BARRIER
+dnl #
+
+dnl # Since REQ_BARRIER is a preprocessor definition, there is no need for an
+dnl # autotools check for it. Also, REQ_BARRIER existed in the request layer
+dnl # until torvalds/linux@7b6d91daee5cac6402186ff224c3af39d79f4a0e unified the
+dnl # request layer and bio layer flags, so it would be wrong to assume that
+dnl # the APIs are mutually exclusive contrary to the typical case.
+AC_DEFUN([ZFS_AC_KERNEL_BIO_RW_BARRIER], [
+       AC_MSG_CHECKING([whether BIO_RW_BARRIER is defined])
+       ZFS_LINUX_TRY_COMPILE([
+               #include <linux/bio.h>
+       ],[
+               int flags __attribute__ ((unused));
+               flags = BIO_RW_BARRIER;
+       ],[
+               AC_MSG_RESULT(yes)
+               AC_DEFINE(HAVE_BIO_RW_BARRIER, 1, [BIO_RW_BARRIER is defined])
+       ],[
+               AC_MSG_RESULT(no)
+       ])
+])
diff --git a/config/kernel-bio-rw-discard.m4 b/config/kernel-bio-rw-discard.m4

new file mode 100644 (file)

index 0000000..0554b9a
--- /dev/null
+++ b/config/kernel-bio-rw-discard.m4
@@ -0,0 +1,25 @@
+dnl #
+dnl # Interface for issuing a discard bio:
+dnl # 2.6.28-2.6.35: BIO_RW_DISCARD
+dnl # 2.6.36-3.x:    REQ_DISCARD
+dnl #
+
+dnl # Since REQ_DISCARD is a preprocessor definition, there is no need for an
+dnl # autotools check for it. Also, REQ_DISCARD existed in the request layer
+dnl # until torvalds/linux@7b6d91daee5cac6402186ff224c3af39d79f4a0e unified the
+dnl # request layer and bio layer flags, so it would be wrong to assume that
+dnl # the APIs are mutually exclusive contrary to the typical case.
+AC_DEFUN([ZFS_AC_KERNEL_BIO_RW_DISCARD], [
+       AC_MSG_CHECKING([whether BIO_RW_DISCARD is defined])
+       ZFS_LINUX_TRY_COMPILE([
+               #include <linux/bio.h>
+       ],[
+               int flags __attribute__ ((unused));
+               flags = BIO_RW_DISCARD;
+       ],[
+               AC_MSG_RESULT(yes)
+               AC_DEFINE(HAVE_BIO_RW_DISCARD, 1, [BIO_RW_DISCARD is defined])
+       ],[
+               AC_MSG_RESULT(no)
+       ])
+])
diff --git a/config/kernel-current_bio_tail.m4 b/config/kernel-current_bio_tail.m4

new file mode 100644 (file)

index 0000000..b72f21e
--- /dev/null
+++ b/config/kernel-current_bio_tail.m4
@@ -0,0 +1,33 @@
+dnl #
+dnl # 2.6.34 API change
+dnl # current->bio_tail and current->bio_list were struct bio pointers prior to
+dnl # Linux 2.6.34. They were refactored into a struct bio_list pointer called
+dnl # current->bio_list in Linux 2.6.34.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_CURRENT_BIO_TAIL], [
+       AC_MSG_CHECKING([whether current->bio_tail exists])
+       ZFS_LINUX_TRY_COMPILE([
+               #include <linux/sched.h>
+       ],[
+               current->bio_tail = (struct bio **) NULL;
+       ],[
+               AC_MSG_RESULT(yes)
+               AC_DEFINE(HAVE_CURRENT_BIO_TAIL, 1,
+                   [current->bio_tail exists])
+       ],[
+               AC_MSG_RESULT(no)
+               AC_MSG_CHECKING([whether current->bio_list exists])
+               ZFS_LINUX_TRY_COMPILE([
+                       #include <linux/sched.h>
+               ],[
+                       current->bio_list = (struct bio_list *) NULL;
+               ],[
+                       AC_MSG_RESULT(yes)
+                       AC_DEFINE(HAVE_CURRENT_BIO_LIST, 1,
+                           [current->bio_list exists])
+               ],[
+                       AC_MSG_ERROR(no - Please file a bug report at
+                           https://github.com/zfsonlinux/zfs/issues/new)
+               ])
+       ])
+])
diff --git a/config/kernel-mk-request-fn.m4 b/config/kernel-mk-request-fn.m4

new file mode 100644 (file)

index 0000000..88ee2eb
--- /dev/null
+++ b/config/kernel-mk-request-fn.m4
@@ -0,0 +1,43 @@
+dnl #
+dnl # Linux 3.2 API Change
+dnl # make_request_fn returns void instead of int.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_MAKE_REQUEST_FN], [
+       AC_MSG_CHECKING([whether make_request_fn() returns int])
+       ZFS_LINUX_TRY_COMPILE([
+               #include <linux/blkdev.h>
+
+               int make_request(struct request_queue *q, struct bio *bio)
+               {
+                       return (0);
+               }
+       ],[
+               blk_queue_make_request(NULL, &make_request);
+       ],[
+               AC_MSG_RESULT(yes)
+               AC_DEFINE(MAKE_REQUEST_FN_RET, int,
+                   [make_request_fn() returns int])
+               AC_DEFINE(HAVE_MAKE_REQUEST_FN_RET_INT, 1,
+                   [Noting that make_request_fn() returns int])
+       ],[
+               AC_MSG_RESULT(no)
+               AC_MSG_CHECKING([whether make_request_fn() returns void])
+               ZFS_LINUX_TRY_COMPILE([
+                       #include <linux/blkdev.h>
+
+                       void make_request(struct request_queue *q, struct bio *bio)
+                       {
+                               return;
+                       }
+               ],[
+                       blk_queue_make_request(NULL, &make_request);
+               ],[
+                       AC_MSG_RESULT(yes)
+                       AC_DEFINE(MAKE_REQUEST_FN_RET, void,
+                           [make_request_fn() returns void])
+               ],[
+                       AC_MSG_ERROR(no - Please file a bug report at
+                           https://github.com/zfsonlinux/zfs/issues/new)
+               ])
+       ])
+])
diff --git a/config/kernel.m4 b/config/kernel.m4

index 09d8003f12a04f5b53f7ed061836c1039ba4adc0..b3dd7232c226d57c102e64eaf04ff26631cacae5 100644 (file)
--- a/config/kernel.m4
+++ b/config/kernel.m4
@@ -7,6 +7,7 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
         ZFS_AC_TEST_MODULE
         ZFS_AC_KERNEL_CONFIG
         ZFS_AC_KERNEL_DECLARE_EVENT_CLASS
+       ZFS_AC_KERNEL_CURRENT_BIO_TAIL
         ZFS_AC_KERNEL_BDEV_BLOCK_DEVICE_OPERATIONS
         ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
         ZFS_AC_KERNEL_TYPE_FMODE_T
@@ -22,6 +23,8 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
         ZFS_AC_KERNEL_BIO_FAILFAST_DTD
         ZFS_AC_KERNEL_REQ_FAILFAST_MASK
         ZFS_AC_KERNEL_BIO_END_IO_T_ARGS
+       ZFS_AC_KERNEL_BIO_RW_BARRIER
+       ZFS_AC_KERNEL_BIO_RW_DISCARD
         ZFS_AC_KERNEL_BIO_RW_SYNC
         ZFS_AC_KERNEL_BIO_RW_SYNCIO
         ZFS_AC_KERNEL_REQ_SYNC
@@ -100,6 +103,7 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
         ZFS_AC_KERNEL_VFS_RW_ITERATE
         ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS
         ZFS_AC_KERNEL_FOLLOW_DOWN_ONE
+       ZFS_AC_KERNEL_MAKE_REQUEST_FN
  
         AS_IF([test "$LINUX_OBJ" != "$LINUX"], [
                 KERNELMAKE_PARAMS="$KERNELMAKE_PARAMS O=$LINUX_OBJ"
diff --git a/include/linux/blkdev_compat.h b/include/linux/blkdev_compat.h

index cdcf12c420904684067fa1ba2a961efea13e219a..0b5c8af429e1748c95c6c83170c859c20eca433a 100644 (file)
--- a/include/linux/blkdev_compat.h
+++ b/include/linux/blkdev_compat.h
@@ -295,10 +295,16 @@ struct req_iterator {
  #define        BIO_BI_SECTOR(bio)      (bio)->bi_iter.bi_sector
  #define        BIO_BI_SIZE(bio)        (bio)->bi_iter.bi_size
  #define        BIO_BI_IDX(bio)         (bio)->bi_iter.bi_idx
+#define        bio_for_each_segment4(bv, bvp, b, i)    \
+       bio_for_each_segment((bv), (b), (i))
+typedef struct bvec_iter bvec_iterator_t;
  #else
  #define        BIO_BI_SECTOR(bio)      (bio)->bi_sector
  #define        BIO_BI_SIZE(bio)        (bio)->bi_size
  #define        BIO_BI_IDX(bio)         (bio)->bi_idx
+#define        bio_for_each_segment4(bv, bvp, b, i)    \
+       bio_for_each_segment((bvp), (b), (i))
+typedef int bvec_iterator_t;
  #endif
  
  /*
@@ -457,17 +463,30 @@ bio_set_flags_failfast(struct block_device *bdev, int *flags)
  #define        VDEV_REQ_FUA                    REQ_FUA
  #else
  #define        VDEV_WRITE_FLUSH_FUA            WRITE_BARRIER
+#ifdef HAVE_BIO_RW_BARRIER
+#define        VDEV_REQ_FLUSH                  (1 << BIO_RW_BARRIER)
+#define        VDEV_REQ_FUA                    (1 << BIO_RW_BARRIER)
+#else
  #define        VDEV_REQ_FLUSH                  REQ_HARDBARRIER
  #define        VDEV_REQ_FUA                    REQ_FUA
  #endif
+#endif
  
  /*
   * 2.6.32 API change
   * Use the normal I/O patch for discards.
   */
-#ifdef REQ_DISCARD
+#ifdef QUEUE_FLAG_DISCARD
+#ifdef HAVE_BIO_RW_DISCARD
+#define        VDEV_REQ_DISCARD                (1 << BIO_RW_DISCARD)
+#else
  #define        VDEV_REQ_DISCARD                REQ_DISCARD
  #endif
+#else
+#error "Allowing the build will cause discard requests to become writes " \
+       "potentially triggering the DMU_MAX_ACCESS assertion. Please file " \
+       "an issue report at: https://github.com/zfsonlinux/zfs/issues/new"
+#endif
  
  /*
   * 2.6.33 API change
diff --git a/include/sys/dmu.h b/include/sys/dmu.h

index 4ad496ae0dc34fff225887c7f5d3ea1354c76ea0..d9434db463831df9b04f2e4214c07cc084e01732 100644 (file)
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -710,8 +710,8 @@ void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
         dmu_tx_t *tx);
  #ifdef _KERNEL
  #include <linux/blkdev_compat.h>
-int dmu_read_req(objset_t *os, uint64_t object, struct request *req);
-int dmu_write_req(objset_t *os, uint64_t object, struct request *req,
+int dmu_read_bio(objset_t *os, uint64_t object, struct bio *bio);
+int dmu_write_bio(objset_t *os, uint64_t object, struct bio *bio,
         dmu_tx_t *tx);
  int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
  int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size);
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5

index 35ea9d9f6fbb85eecd24eb502174fbb5f14c436c..2ceb6551961c01a9f579a263317921abdda18256 100644 (file)
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -1591,17 +1591,6 @@ Max number of blocks to discard at once
  Default value: \fB16,384\fR.
  .RE
  
-.sp
-.ne 2
-.na
-\fBzvol_threads\fR (uint)
-.ad
-.RS 12n
-Max number of threads to handle zvol I/O requests
-.sp
-Default value: \fB32\fR.
-.RE
-
  .SH ZFS I/O SCHEDULER
  ZFS issues I/O operations to leaf vdevs to satisfy and complete I/Os.
  The I/O scheduler determines when and in what order those operations are
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c

index ac7499d0176ef1a383d66e553118b86268a63364..5e2a1db601b49fc85d59bf064cfcfa8a150f814f 100644 (file)
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -1049,15 +1049,16 @@ xuio_stat_wbuf_nocopy()
   * return value is the number of bytes successfully copied to arg_buf.
   */
  static int
-dmu_req_copy(void *arg_buf, int size, struct request *req, size_t req_offset)
+dmu_bio_copy(void *arg_buf, int size, struct bio *bio, size_t bio_offset)
  {
-       struct bio_vec bv, *bvp;
-       struct req_iterator iter;
+       struct bio_vec bv, *bvp = &bv;
+       bvec_iterator_t iter;
         char *bv_buf;
         int tocpy, bv_len, bv_offset;
         int offset = 0;
  
-       rq_for_each_segment4(bv, bvp, req, iter) {
+       bio_for_each_segment4(bv, bvp, bio, iter) {
+
                 /*
                  * Fully consumed the passed arg_buf. We use goto here because
                  * rq_for_each_segment is a double loop
@@ -1066,23 +1067,23 @@ dmu_req_copy(void *arg_buf, int size, struct request *req, size_t req_offset)
                 if (size == offset)
                         goto out;
  
-               /* Skip already copied bv */
-               if (req_offset >=  bv.bv_len) {
-                       req_offset -= bv.bv_len;
+               /* Skip already copied bvp */
+               if (bio_offset >= bvp->bv_len) {
+                       bio_offset -= bvp->bv_len;
                         continue;
                 }
  
-               bv_len = bv.bv_len - req_offset;
-               bv_offset = bv.bv_offset + req_offset;
-               req_offset = 0;
+               bv_len = bvp->bv_len - bio_offset;
+               bv_offset = bvp->bv_offset + bio_offset;
+               bio_offset = 0;
  
                 tocpy = MIN(bv_len, size - offset);
                 ASSERT3S(tocpy, >=, 0);
  
-               bv_buf = page_address(bv.bv_page) + bv_offset;
+               bv_buf = page_address(bvp->bv_page) + bv_offset;
                 ASSERT3P(bv_buf, !=, NULL);
  
-               if (rq_data_dir(req) == WRITE)
+               if (bio_data_dir(bio) == WRITE)
                         memcpy(arg_buf + offset, bv_buf, tocpy);
                 else
                         memcpy(bv_buf, arg_buf + offset, tocpy);
@@ -1094,13 +1095,13 @@ out:
  }
  
  int
-dmu_read_req(objset_t *os, uint64_t object, struct request *req)
+dmu_read_bio(objset_t *os, uint64_t object, struct bio *bio)
  {
-       uint64_t size = blk_rq_bytes(req);
-       uint64_t offset = blk_rq_pos(req) << 9;
+       uint64_t offset = BIO_BI_SECTOR(bio) << 9;
+       uint64_t size = BIO_BI_SIZE(bio);
         dmu_buf_t **dbp;
         int numbufs, i, err;
-       size_t req_offset;
+       size_t bio_offset;
  
         /*
          * NB: we could do this block-at-a-time, but it's nice
@@ -1111,7 +1112,7 @@ dmu_read_req(objset_t *os, uint64_t object, struct request *req)
         if (err)
                 return (err);
  
-       req_offset = 0;
+       bio_offset = 0;
         for (i = 0; i < numbufs; i++) {
                 uint64_t tocpy;
                 int64_t bufoff;
@@ -1125,8 +1126,8 @@ dmu_read_req(objset_t *os, uint64_t object, struct request *req)
                 if (tocpy == 0)
                         break;
  
-               didcpy = dmu_req_copy(db->db_data + bufoff, tocpy, req,
-                   req_offset);
+               didcpy = dmu_bio_copy(db->db_data + bufoff, tocpy, bio,
+                   bio_offset);
  
                 if (didcpy < tocpy)
                         err = EIO;
@@ -1136,7 +1137,7 @@ dmu_read_req(objset_t *os, uint64_t object, struct request *req)
  
                 size -= tocpy;
                 offset += didcpy;
-               req_offset += didcpy;
+               bio_offset += didcpy;
                 err = 0;
         }
         dmu_buf_rele_array(dbp, numbufs, FTAG);
@@ -1145,13 +1146,13 @@ dmu_read_req(objset_t *os, uint64_t object, struct request *req)
  }
  
  int
-dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx)
+dmu_write_bio(objset_t *os, uint64_t object, struct bio *bio, dmu_tx_t *tx)
  {
-       uint64_t size = blk_rq_bytes(req);
-       uint64_t offset = blk_rq_pos(req) << 9;
+       uint64_t offset = BIO_BI_SECTOR(bio) << 9;
+       uint64_t size = BIO_BI_SIZE(bio);
         dmu_buf_t **dbp;
         int numbufs, i, err;
-       size_t req_offset;
+       size_t bio_offset;
  
         if (size == 0)
                 return (0);
@@ -1161,7 +1162,7 @@ dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx)
         if (err)
                 return (err);
  
-       req_offset = 0;
+       bio_offset = 0;
         for (i = 0; i < numbufs; i++) {
                 uint64_t tocpy;
                 int64_t bufoff;
@@ -1182,8 +1183,8 @@ dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx)
                 else
                         dmu_buf_will_dirty(db, tx);
  
-               didcpy = dmu_req_copy(db->db_data + bufoff, tocpy, req,
-                   req_offset);
+               didcpy = dmu_bio_copy(db->db_data + bufoff, tocpy, bio,
+                   bio_offset);
  
                 if (tocpy == db->db_size)
                         dmu_buf_fill_done(db, tx);
@@ -1196,7 +1197,7 @@ dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx)
  
                 size -= tocpy;
                 offset += didcpy;
-               req_offset += didcpy;
+               bio_offset += didcpy;
                 err = 0;
         }
  
diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c

index 380ede35b517cf640d7e3a6b4e9e299f02c4b8f3..e7e2b3b93f407a0eb2cb2e766081c91dcf1a21e1 100644 (file)
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@@ -496,6 +496,22 @@ bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
         return (bio_size);
  }
  
+static inline void
+vdev_submit_bio(int rw, struct bio *bio)
+{
+#ifdef HAVE_CURRENT_BIO_TAIL
+       struct bio **bio_tail = current->bio_tail;
+       current->bio_tail = NULL;
+       submit_bio(rw, bio);
+       current->bio_tail = bio_tail;
+#else
+       struct bio_list *bio_list = current->bio_list;
+       current->bio_list = NULL;
+       submit_bio(rw, bio);
+       current->bio_list = bio_list;
+#endif
+}
+
  static int
  __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr,
      size_t kbuf_size, uint64_t kbuf_offset, int flags)
@@ -571,7 +587,7 @@ retry:
                 bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
         }
  
-       /* Extra reference to protect dio_request during submit_bio */
+       /* Extra reference to protect dio_request during vdev_submit_bio */
         vdev_disk_dio_get(dr);
         if (zio)
                 zio->io_delay = jiffies_64;
@@ -579,7 +595,7 @@ retry:
         /* Submit all bio's associated with this dio */
         for (i = 0; i < dr->dr_bio_count; i++)
                 if (dr->dr_bio[i])
-                       submit_bio(dr->dr_rw, dr->dr_bio[i]);
+                       vdev_submit_bio(dr->dr_rw, dr->dr_bio[i]);
  
         /*
          * On synchronous blocking requests we wait for all bio the completion
@@ -645,7 +661,7 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
         bio->bi_private = zio;
         bio->bi_bdev = bdev;
         zio->io_delay = jiffies_64;
-       submit_bio(VDEV_WRITE_FLUSH_FUA, bio);
+       vdev_submit_bio(VDEV_WRITE_FLUSH_FUA, bio);
         invalidate_bdev(bdev);
  
         return (0);
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c

index 0c6cddef4205890d9b9ef56c3b1dda6f805c864f..074ec51e6f9ec18b1572c2690ab44f67f976e9c0 100644 (file)
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -50,10 +50,8 @@
  
  unsigned int zvol_inhibit_dev = 0;
  unsigned int zvol_major = ZVOL_MAJOR;
-unsigned int zvol_threads = 32;
  unsigned long zvol_max_discard_blocks = 16384;
  
-static taskq_t *zvol_taskq;
  static kmutex_t zvol_state_lock;
  static list_t zvol_state_list;
  static char *zvol_tag = "zvol_tag";
@@ -590,34 +588,24 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
         }
  }
  
-/*
- * Common write path running under the zvol taskq context.  This function
- * is responsible for copying the request structure data in to the DMU and
- * signaling the request queue with the result of the copy.
- */
-static void
-zvol_write(void *arg)
+static int
+zvol_write(struct bio *bio)
  {
-       struct request *req = (struct request *)arg;
-       struct request_queue *q = req->q;
-       zvol_state_t *zv = q->queuedata;
-       fstrans_cookie_t cookie = spl_fstrans_mark();
-       uint64_t offset = blk_rq_pos(req) << 9;
-       uint64_t size = blk_rq_bytes(req);
+       zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
+       uint64_t offset = BIO_BI_SECTOR(bio) << 9;
+       uint64_t size = BIO_BI_SIZE(bio);
         int error = 0;
         dmu_tx_t *tx;
         rl_t *rl;
  
-       if (req->cmd_flags & VDEV_REQ_FLUSH)
+       if (bio->bi_rw & VDEV_REQ_FLUSH)
                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
  
         /*
          * Some requests are just for flush and nothing else.
          */
-       if (size == 0) {
-               error = 0;
+       if (size == 0)
                 goto out;
-       }
  
         rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER);
  
@@ -632,96 +620,77 @@ zvol_write(void *arg)
                 goto out;
         }
  
-       error = dmu_write_req(zv->zv_objset, ZVOL_OBJ, req, tx);
+       error = dmu_write_bio(zv->zv_objset, ZVOL_OBJ, bio, tx);
         if (error == 0)
                 zvol_log_write(zv, tx, offset, size,
-                   req->cmd_flags & VDEV_REQ_FUA);
+                   !!(bio->bi_rw & VDEV_REQ_FUA));
  
         dmu_tx_commit(tx);
         zfs_range_unlock(rl);
  
-       if ((req->cmd_flags & VDEV_REQ_FUA) ||
+       if ((bio->bi_rw & VDEV_REQ_FUA) ||
             zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)
                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
  
  out:
-       blk_end_request(req, -error, size);
-       spl_fstrans_unmark(cookie);
+       return (error);
  }
  
-#ifdef HAVE_BLK_QUEUE_DISCARD
-static void
-zvol_discard(void *arg)
+static int
+zvol_discard(struct bio *bio)
  {
-       struct request *req = (struct request *)arg;
-       struct request_queue *q = req->q;
-       zvol_state_t *zv = q->queuedata;
-       fstrans_cookie_t cookie = spl_fstrans_mark();
-       uint64_t start = blk_rq_pos(req) << 9;
-       uint64_t end = start + blk_rq_bytes(req);
+       zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
+       uint64_t start = BIO_BI_SECTOR(bio) << 9;
+       uint64_t size = BIO_BI_SIZE(bio);
+       uint64_t end = start + size;
         int error;
         rl_t *rl;
  
-       if (end > zv->zv_volsize) {
-               error = EIO;
-               goto out;
-       }
+       if (end > zv->zv_volsize)
+               return (SET_ERROR(EIO));
  
         /*
          * Align the request to volume block boundaries. If we don't,
          * then this will force dnode_free_range() to zero out the
          * unaligned parts, which is slow (read-modify-write) and
          * useless since we are not freeing any space by doing so.
+        * XXX: We should handle secure discard by zeroing out unaligned parts.
          */
         start = P2ROUNDUP(start, zv->zv_volblocksize);
         end = P2ALIGN(end, zv->zv_volblocksize);
  
-       if (start >= end) {
-               error = 0;
-               goto out;
-       }
+       if (start >= end)
+               return (0);
  
         rl = zfs_range_lock(&zv->zv_znode, start, end - start, RL_WRITER);
  
         error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, end-start);
  
         /*
          * TODO: maybe we should add the operation to the log.
          */
  
         zfs_range_unlock(rl);
-out:
-       blk_end_request(req, -error, blk_rq_bytes(req));
-       spl_fstrans_unmark(cookie);
+
+       return (error);
  }
-#endif /* HAVE_BLK_QUEUE_DISCARD */
  
-/*
- * Common read path running under the zvol taskq context.  This function
- * is responsible for copying the requested data out of the DMU and in to
- * a linux request structure.  It then must signal the request queue with
- * an error code describing the result of the copy.
- */
-static void
-zvol_read(void *arg)
+static int
+zvol_read(struct bio *bio)
  {
-       struct request *req = (struct request *)arg;
-       struct request_queue *q = req->q;
-       zvol_state_t *zv = q->queuedata;
-       fstrans_cookie_t cookie = spl_fstrans_mark();
-       uint64_t offset = blk_rq_pos(req) << 9;
-       uint64_t size = blk_rq_bytes(req);
+       zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
+       uint64_t offset = BIO_BI_SECTOR(bio) << 9;
+       uint64_t len = BIO_BI_SIZE(bio);
         int error;
         rl_t *rl;
  
-       if (size == 0) {
-               error = 0;
-               goto out;
-       }
+       if (len == 0)
+               return (0);
+
  
-       rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
+       rl = zfs_range_lock(&zv->zv_znode, offset, len, RL_READER);
  
-       error = dmu_read_req(zv->zv_objset, ZVOL_OBJ, req);
+       error = dmu_read_bio(zv->zv_objset, ZVOL_OBJ, bio);
  
         zfs_range_unlock(rl);
  
@@ -729,91 +698,50 @@ zvol_read(void *arg)
         if (error == ECKSUM)
                 error = SET_ERROR(EIO);
  
-out:
-       blk_end_request(req, -error, size);
-       spl_fstrans_unmark(cookie);
-}
-
-/*
- * Request will be added back to the request queue and retried if
- * it cannot be immediately dispatched to the taskq for handling
- */
-static inline void
-zvol_dispatch(task_func_t func, struct request *req)
-{
-       if (!taskq_dispatch(zvol_taskq, func, (void *)req, TQ_NOSLEEP))
-               blk_requeue_request(req->q, req);
+       return (error);
  }
  
-/*
- * Common request path.  Rather than registering a custom make_request()
- * function we use the generic Linux version.  This is done because it allows
- * us to easily merge read requests which would otherwise we performed
- * synchronously by the DMU.  This is less critical in write case where the
- * DMU will perform the correct merging within a transaction group.  Using
- * the generic make_request() also let's use leverage the fact that the
- * elevator with ensure correct ordering in regards to barrior IOs.  On
- * the downside it means that in the write case we end up doing request
- * merging twice once in the elevator and once in the DMU.
- *
- * The request handler is called under a spin lock so all the real work
- * is handed off to be done in the context of the zvol taskq.  This function
- * simply performs basic request sanity checking and hands off the request.
- */
-static void
-zvol_request(struct request_queue *q)
+static MAKE_REQUEST_FN_RET
+zvol_request(struct request_queue *q, struct bio *bio)
  {
         zvol_state_t *zv = q->queuedata;
-       struct request *req;
-       unsigned int size;
-
-       while ((req = blk_fetch_request(q)) != NULL) {
-               size = blk_rq_bytes(req);
-
-               if (size != 0 && blk_rq_pos(req) + blk_rq_sectors(req) >
-                   get_capacity(zv->zv_disk)) {
-                       printk(KERN_INFO
-                           "%s: bad access: block=%llu, count=%lu\n",
-                           req->rq_disk->disk_name,
-                           (long long unsigned)blk_rq_pos(req),
-                           (long unsigned)blk_rq_sectors(req));
-                       __blk_end_request(req, -EIO, size);
-                       continue;
-               }
+       fstrans_cookie_t cookie = spl_fstrans_mark();
+       uint64_t offset = BIO_BI_SECTOR(bio);
+       unsigned int sectors = bio_sectors(bio);
+       int error = 0;
  
-               if (!blk_fs_request(req)) {
-                       printk(KERN_INFO "%s: non-fs cmd\n",
-                           req->rq_disk->disk_name);
-                       __blk_end_request(req, -EIO, size);
-                       continue;
+       if (bio_has_data(bio) && offset + sectors >
+           get_capacity(zv->zv_disk)) {
+               printk(KERN_INFO
+                   "%s: bad access: block=%llu, count=%lu\n",
+                   zv->zv_disk->disk_name,
+                   (long long unsigned)offset,
+                   (long unsigned)sectors);
+               error = SET_ERROR(EIO);
+               goto out;
+       }
+
+       if (bio_data_dir(bio) == WRITE) {
+               if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
+                       error = SET_ERROR(EROFS);
+                       goto out;
                 }
  
-               switch ((int)rq_data_dir(req)) {
-               case READ:
-                       zvol_dispatch(zvol_read, req);
-                       break;
-               case WRITE:
-                       if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
-                               __blk_end_request(req, -EROFS, size);
-                               break;
-                       }
+               if (bio->bi_rw & VDEV_REQ_DISCARD) {
+                       error = zvol_discard(bio);
+                       goto out;
+               }
  
-#ifdef HAVE_BLK_QUEUE_DISCARD
-                       if (req->cmd_flags & VDEV_REQ_DISCARD) {
-                               zvol_dispatch(zvol_discard, req);
-                               break;
-                       }
-#endif /* HAVE_BLK_QUEUE_DISCARD */
+               error = zvol_write(bio);
+       } else
+               error = zvol_read(bio);
  
-                       zvol_dispatch(zvol_write, req);
-                       break;
-               default:
-                       printk(KERN_INFO "%s: unknown cmd: %d\n",
-                           req->rq_disk->disk_name, (int)rq_data_dir(req));
-                       __blk_end_request(req, -EIO, size);
-                       break;
-               }
-       }
+out:
+       bio_endio(bio, -error);
+       spl_fstrans_unmark(cookie);
+#ifdef HAVE_MAKE_REQUEST_FN_RET_INT
+       return (0);
+#endif
  }
  
  static void
@@ -1259,25 +1187,17 @@ static zvol_state_t *
  zvol_alloc(dev_t dev, const char *name)
  {
         zvol_state_t *zv;
-       int error = 0;
  
         zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
  
         spin_lock_init(&zv->zv_lock);
         list_link_init(&zv->zv_next);
  
-       zv->zv_queue = blk_init_queue(zvol_request, &zv->zv_lock);
+       zv->zv_queue = blk_alloc_queue(GFP_ATOMIC);
         if (zv->zv_queue == NULL)
                 goto out_kmem;
  
-#ifdef HAVE_ELEVATOR_CHANGE
-       error = elevator_change(zv->zv_queue, "noop");
-#endif /* HAVE_ELEVATOR_CHANGE */
-       if (error) {
-               printk("ZFS: Unable to set \"%s\" scheduler for zvol %s: %d\n",
-                   "noop", name, error);
-               goto out_queue;
-       }
+       blk_queue_make_request(zv->zv_queue, zvol_request);
  
  #ifdef HAVE_BLK_QUEUE_FLUSH
         blk_queue_flush(zv->zv_queue, VDEV_REQ_FLUSH | VDEV_REQ_FUA);
@@ -1418,13 +1338,11 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev)
         blk_queue_max_segment_size(zv->zv_queue, UINT_MAX);
         blk_queue_physical_block_size(zv->zv_queue, zv->zv_volblocksize);
         blk_queue_io_opt(zv->zv_queue, zv->zv_volblocksize);
-#ifdef HAVE_BLK_QUEUE_DISCARD
         blk_queue_max_discard_sectors(zv->zv_queue,
             (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
         blk_queue_discard_granularity(zv->zv_queue, zv->zv_volblocksize);
         queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zv->zv_queue);
-#endif
-#ifdef HAVE_BLK_QUEUE_NONROT
+#ifdef QUEUE_FLAG_NONROT
         queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zv->zv_queue);
  #endif
  #ifdef QUEUE_FLAG_ADD_RANDOM
@@ -1651,7 +1569,6 @@ zvol_set_snapdev(const char *dsname, uint64_t snapdev) {
  int
  zvol_init(void)
  {
-       int threads = MIN(MAX(zvol_threads, 1), 1024);
         int error;
  
         list_create(&zvol_state_list, sizeof (zvol_state_t),
@@ -1659,18 +1576,10 @@ zvol_init(void)
  
         mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
  
-       zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri,
-           threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
-       if (zvol_taskq == NULL) {
-               printk(KERN_INFO "ZFS: taskq_create() failed\n");
-               error = -ENOMEM;
-               goto out1;
-       }
-
         error = register_blkdev(zvol_major, ZVOL_DRIVER);
         if (error) {
                 printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
-               goto out2;
+               goto out;
         }
  
         blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS,
@@ -1678,9 +1587,7 @@ zvol_init(void)
  
         return (0);
  
-out2:
-       taskq_destroy(zvol_taskq);
-out1:
+out:
         mutex_destroy(&zvol_state_lock);
         list_destroy(&zvol_state_list);
  
@@ -1693,7 +1600,6 @@ zvol_fini(void)
         zvol_remove_minors(NULL);
         blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS);
         unregister_blkdev(zvol_major, ZVOL_DRIVER);
-       taskq_destroy(zvol_taskq);
         mutex_destroy(&zvol_state_lock);
         list_destroy(&zvol_state_list);
  }
@@ -1704,8 +1610,5 @@ MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");
  module_param(zvol_major, uint, 0444);
  MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
  
-module_param(zvol_threads, uint, 0444);
-MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests");
-
  module_param(zvol_max_discard_blocks, ulong, 0444);
  MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");
author	Richard Yao <ryao@gentoo.org>
	Fri, 4 Jul 2014 22:43:47 +0000 (18:43 -0400)
committer	Richard Yao <ryao@gentoo.org>
	Fri, 4 Sep 2015 19:30:24 +0000 (15:30 -0400)
config/kernel-bio-rw-barrier.m4	[new file with mode: 0644]	patch \| blob
config/kernel-bio-rw-discard.m4	[new file with mode: 0644]	patch \| blob
config/kernel-current_bio_tail.m4	[new file with mode: 0644]	patch \| blob
config/kernel-mk-request-fn.m4	[new file with mode: 0644]	patch \| blob
config/kernel.m4		patch \| blob \| blame \| history
include/linux/blkdev_compat.h		patch \| blob \| blame \| history
include/sys/dmu.h		patch \| blob \| blame \| history
man/man5/zfs-module-parameters.5		patch \| blob \| blame \| history
module/zfs/dmu.c		patch \| blob \| blame \| history
module/zfs/vdev_disk.c		patch \| blob \| blame \| history
module/zfs/zvol.c		patch \| blob \| blame \| history