Imported Upstream version 0.6.5.3

diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index d180b5b5b76f468ba67aa6b71e5f35d094727c11..c81f02a3907b9fc77644e8eb34427c09d58b0ac0 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -40,6 +40,7 @@
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/zap.h>
+#include <sys/zfeature.h>
 #include <sys/zil_impl.h>
 #include <sys/zio.h>
 #include <sys/zfs_rlock.h>
 
 unsigned int zvol_inhibit_dev = 0;
 unsigned int zvol_major = ZVOL_MAJOR;
-unsigned int zvol_threads = 32;
+unsigned int zvol_prefetch_bytes = (128 * 1024);
 unsigned long zvol_max_discard_blocks = 16384;
 
-static taskq_t *zvol_taskq;
 static kmutex_t zvol_state_lock;
 static list_t zvol_state_list;
 static char *zvol_tag = "zvol_tag";
@@ -380,8 +380,31 @@ out:
  * Sanity check volume block size.
  */
 int
-zvol_check_volblocksize(uint64_t volblocksize)
+zvol_check_volblocksize(const char *name, uint64_t volblocksize)
 {
+       /* Record sizes above 128k need the feature to be enabled */
+       if (volblocksize > SPA_OLD_MAXBLOCKSIZE) {
+               spa_t *spa;
+               int error;
+
+               if ((error = spa_open(name, &spa, FTAG)) != 0)
+                       return (error);
+
+               if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
+                       spa_close(spa, FTAG);
+                       return (SET_ERROR(ENOTSUP));
+               }
+
+               /*
+                * We don't allow setting the property above 1MB,
+                * unless the tunable has been changed.
+                */
+               if (volblocksize > zfs_max_recordsize)
+                       return (SET_ERROR(EDOM));
+
+               spa_close(spa, FTAG);
+       }
+
        if (volblocksize < SPA_MINBLOCKSIZE ||
            volblocksize > SPA_MAXBLOCKSIZE ||
            !ISP2(volblocksize))
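
For reference, the checks this hunk introduces can be sketched in user space as below. The constant values, the default for zfs_max_recordsize, and the helper name check_volblocksize are illustrative assumptions, not code from this commit: a volblocksize above 128 KiB is refused with ENOTSUP unless the pool's large_blocks feature is enabled, refused with EDOM if it exceeds the zfs_max_recordsize tunable, and in every case must be a power of two within the supported range.

    /*
     * User-space sketch of the volblocksize checks; constants and the
     * helper name are assumptions for illustration only.
     */
    #include <stdint.h>
    #include <stdio.h>

    #define SPA_MINBLOCKSIZE      (1ULL << 9)    /* 512 bytes */
    #define SPA_OLD_MAXBLOCKSIZE  (1ULL << 17)   /* 128 KiB */
    #define SPA_MAXBLOCKSIZE      (1ULL << 24)   /* 16 MiB */
    #define ISP2(x)               (((x) & ((x) - 1)) == 0)

    static uint64_t zfs_max_recordsize = 1024 * 1024;  /* assumed default */

    static int
    check_volblocksize(uint64_t vbs, int large_blocks_enabled)
    {
            if (vbs > SPA_OLD_MAXBLOCKSIZE) {
                    if (!large_blocks_enabled)
                            return (-1);  /* ENOTSUP: feature not enabled */
                    if (vbs > zfs_max_recordsize)
                            return (-2);  /* EDOM: above the tunable */
            }

            if (vbs < SPA_MINBLOCKSIZE || vbs > SPA_MAXBLOCKSIZE || !ISP2(vbs))
                    return (-3);  /* EDOM: out of range or not a power of 2 */

            return (0);
    }

    int
    main(void)
    {
            printf("%d\n", check_volblocksize(64 * 1024, 0));   /* 0  */
            printf("%d\n", check_volblocksize(256 * 1024, 0));  /* -1 */
            printf("%d\n", check_volblocksize(256 * 1024, 1));  /* 0  */
            return (0);
    }
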
@@ -566,34 +589,24 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
        }
 }
 
-/*
- * Common write path running under the zvol taskq context.  This function
- * is responsible for copying the request structure data in to the DMU and
- * signaling the request queue with the result of the copy.
- */
-static void
-zvol_write(void *arg)
+static int
+zvol_write(struct bio *bio)
 {
-       struct request *req = (struct request *)arg;
-       struct request_queue *q = req->q;
-       zvol_state_t *zv = q->queuedata;
-       fstrans_cookie_t cookie = spl_fstrans_mark();
-       uint64_t offset = blk_rq_pos(req) << 9;
-       uint64_t size = blk_rq_bytes(req);
+       zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
+       uint64_t offset = BIO_BI_SECTOR(bio) << 9;
+       uint64_t size = BIO_BI_SIZE(bio);
        int error = 0;
        dmu_tx_t *tx;
        rl_t *rl;
 
-       if (req->cmd_flags & VDEV_REQ_FLUSH)
+       if (bio->bi_rw & VDEV_REQ_FLUSH)
                zil_commit(zv->zv_zilog, ZVOL_OBJ);
 
        /*
         * Some requests are just for flush and nothing else.
         */
-       if (size == 0) {
-               error = 0;
+       if (size == 0)
                goto out;
-       }
 
        rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER);
 
@@ -608,96 +621,83 @@ zvol_write(void *arg)
                goto out;
        }
 
-       error = dmu_write_req(zv->zv_objset, ZVOL_OBJ, req, tx);
+       error = dmu_write_bio(zv->zv_objset, ZVOL_OBJ, bio, tx);
        if (error == 0)
                zvol_log_write(zv, tx, offset, size,
-                   req->cmd_flags & VDEV_REQ_FUA);
+                   !!(bio->bi_rw & VDEV_REQ_FUA));
 
        dmu_tx_commit(tx);
        zfs_range_unlock(rl);
 
-       if ((req->cmd_flags & VDEV_REQ_FUA) ||
+       if ((bio->bi_rw & VDEV_REQ_FUA) ||
            zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)
                zil_commit(zv->zv_zilog, ZVOL_OBJ);
 
 out:
-       blk_end_request(req, -error, size);
-       spl_fstrans_unmark(cookie);
+       return (error);
 }
 
-#ifdef HAVE_BLK_QUEUE_DISCARD
-static void
-zvol_discard(void *arg)
+static int
+zvol_discard(struct bio *bio)
 {
-       struct request *req = (struct request *)arg;
-       struct request_queue *q = req->q;
-       zvol_state_t *zv = q->queuedata;
-       fstrans_cookie_t cookie = spl_fstrans_mark();
-       uint64_t start = blk_rq_pos(req) << 9;
-       uint64_t end = start + blk_rq_bytes(req);
+       zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
+       uint64_t start = BIO_BI_SECTOR(bio) << 9;
+       uint64_t size = BIO_BI_SIZE(bio);
+       uint64_t end = start + size;
        int error;
        rl_t *rl;
 
-       if (end > zv->zv_volsize) {
-               error = EIO;
-               goto out;
-       }
+       if (end > zv->zv_volsize)
+               return (SET_ERROR(EIO));
 
        /*
-        * Align the request to volume block boundaries. If we don't,
-        * then this will force dnode_free_range() to zero out the
-        * unaligned parts, which is slow (read-modify-write) and
-        * useless since we are not freeing any space by doing so.
+        * Align the request to volume block boundaries when REQ_SECURE is
+        * available, but not requested. If we don't, then this will force
+        * dnode_free_range() to zero out the unaligned parts, which is slow
+        * (read-modify-write) and useless since we are not freeing any space
+        * by doing so. Kernels that do not support REQ_SECURE (2.6.32 through
+        * 2.6.35) will not receive this optimization.
         */
-       start = P2ROUNDUP(start, zv->zv_volblocksize);
-       end = P2ALIGN(end, zv->zv_volblocksize);
-
-       if (start >= end) {
-               error = 0;
-               goto out;
+#ifdef REQ_SECURE
+       if (!(bio->bi_rw & REQ_SECURE)) {
+               start = P2ROUNDUP(start, zv->zv_volblocksize);
+               end = P2ALIGN(end, zv->zv_volblocksize);
+               size = end - start;
        }
+#endif
 
-       rl = zfs_range_lock(&zv->zv_znode, start, end - start, RL_WRITER);
+       if (start >= end)
+               return (0);
+
+       rl = zfs_range_lock(&zv->zv_znode, start, size, RL_WRITER);
 
-       error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, end-start);
+       error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, size);
 
        /*
         * TODO: maybe we should add the operation to the log.
         */
 
        zfs_range_unlock(rl);
-out:
-       blk_end_request(req, -error, blk_rq_bytes(req));
-       spl_fstrans_unmark(cookie);
+
+       return (error);
 }
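
The block-boundary alignment performed above can be illustrated with the P2 macros spelled out. This is a stand-alone sketch; the macro bodies mirror the usual sysmacros.h forms, and the 8 KiB volblocksize and byte offsets are made-up values. The point is that an unaligned discard is shrunk to whole volume blocks, so dnode_free_range() never has to read-modify-write a partial block.

    /* Stand-alone illustration of the discard alignment; values are made up. */
    #include <stdint.h>
    #include <stdio.h>

    #define P2ALIGN(x, align)    ((x) & -(align))        /* round down */
    #define P2ROUNDUP(x, align)  (-(-(x) & -(align)))    /* round up */

    int
    main(void)
    {
            uint64_t volblocksize = 8192;   /* 8 KiB volume block */
            uint64_t start = 10240;         /* unaligned discard start */
            uint64_t end = 94208;           /* unaligned discard end */

            start = P2ROUNDUP(start, volblocksize);  /* -> 16384 */
            end = P2ALIGN(end, volblocksize);        /* -> 90112 */

            if (start >= end)
                    printf("no whole blocks to free\n");
            else
                    printf("free [%llu, %llu)\n",
                        (unsigned long long)start, (unsigned long long)end);
            return (0);
    }
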
-#endif /* HAVE_BLK_QUEUE_DISCARD */
 
-/*
- * Common read path running under the zvol taskq context.  This function
- * is responsible for copying the requested data out of the DMU and in to
- * a linux request structure.  It then must signal the request queue with
- * an error code describing the result of the copy.
- */
-static void
-zvol_read(void *arg)
+static int
+zvol_read(struct bio *bio)
 {
-       struct request *req = (struct request *)arg;
-       struct request_queue *q = req->q;
-       zvol_state_t *zv = q->queuedata;
-       fstrans_cookie_t cookie = spl_fstrans_mark();
-       uint64_t offset = blk_rq_pos(req) << 9;
-       uint64_t size = blk_rq_bytes(req);
+       zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
+       uint64_t offset = BIO_BI_SECTOR(bio) << 9;
+       uint64_t len = BIO_BI_SIZE(bio);
        int error;
        rl_t *rl;
 
-       if (size == 0) {
-               error = 0;
-               goto out;
-       }
+       if (len == 0)
+               return (0);
 
-       rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
 
-       error = dmu_read_req(zv->zv_objset, ZVOL_OBJ, req);
+       rl = zfs_range_lock(&zv->zv_znode, offset, len, RL_READER);
+
+       error = dmu_read_bio(zv->zv_objset, ZVOL_OBJ, bio);
 
        zfs_range_unlock(rl);
 
@@ -705,91 +705,58 @@ zvol_read(void *arg)
        if (error == ECKSUM)
                error = SET_ERROR(EIO);
 
-out:
-       blk_end_request(req, -error, size);
-       spl_fstrans_unmark(cookie);
-}
-
-/*
- * Request will be added back to the request queue and retried if
- * it cannot be immediately dispatched to the taskq for handling
- */
-static inline void
-zvol_dispatch(task_func_t func, struct request *req)
-{
-       if (!taskq_dispatch(zvol_taskq, func, (void *)req, TQ_NOSLEEP))
-               blk_requeue_request(req->q, req);
+       return (error);
 }
 
-/*
- * Common request path.  Rather than registering a custom make_request()
- * function we use the generic Linux version.  This is done because it allows
- * us to easily merge read requests which would otherwise we performed
- * synchronously by the DMU.  This is less critical in write case where the
- * DMU will perform the correct merging within a transaction group.  Using
- * the generic make_request() also let's use leverage the fact that the
- * elevator with ensure correct ordering in regards to barrior IOs.  On
- * the downside it means that in the write case we end up doing request
- * merging twice once in the elevator and once in the DMU.
- *
- * The request handler is called under a spin lock so all the real work
- * is handed off to be done in the context of the zvol taskq.  This function
- * simply performs basic request sanity checking and hands off the request.
- */
-static void
-zvol_request(struct request_queue *q)
+static MAKE_REQUEST_FN_RET
+zvol_request(struct request_queue *q, struct bio *bio)
 {
        zvol_state_t *zv = q->queuedata;
-       struct request *req;
-       unsigned int size;
-
-       while ((req = blk_fetch_request(q)) != NULL) {
-               size = blk_rq_bytes(req);
-
-               if (size != 0 && blk_rq_pos(req) + blk_rq_sectors(req) >
-                   get_capacity(zv->zv_disk)) {
-                       printk(KERN_INFO
-                           "%s: bad access: block=%llu, count=%lu\n",
-                           req->rq_disk->disk_name,
-                           (long long unsigned)blk_rq_pos(req),
-                           (long unsigned)blk_rq_sectors(req));
-                       __blk_end_request(req, -EIO, size);
-                       continue;
-               }
+       fstrans_cookie_t cookie = spl_fstrans_mark();
+       uint64_t offset = BIO_BI_SECTOR(bio);
+       unsigned int sectors = bio_sectors(bio);
+       int rw = bio_data_dir(bio);
+#ifdef HAVE_GENERIC_IO_ACCT
+       unsigned long start = jiffies;
+#endif
+       int error = 0;
 
-               if (!blk_fs_request(req)) {
-                       printk(KERN_INFO "%s: non-fs cmd\n",
-                           req->rq_disk->disk_name);
-                       __blk_end_request(req, -EIO, size);
-                       continue;
-               }
+       if (bio_has_data(bio) && offset + sectors >
+           get_capacity(zv->zv_disk)) {
+               printk(KERN_INFO
+                   "%s: bad access: block=%llu, count=%lu\n",
+                   zv->zv_disk->disk_name,
+                   (long long unsigned)offset,
+                   (long unsigned)sectors);
+               error = SET_ERROR(EIO);
+               goto out1;
+       }
 
-               switch (rq_data_dir(req)) {
-               case READ:
-                       zvol_dispatch(zvol_read, req);
-                       break;
-               case WRITE:
-                       if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
-                               __blk_end_request(req, -EROFS, size);
-                               break;
-                       }
+       generic_start_io_acct(rw, sectors, &zv->zv_disk->part0);
 
-#ifdef HAVE_BLK_QUEUE_DISCARD
-                       if (req->cmd_flags & VDEV_REQ_DISCARD) {
-                               zvol_dispatch(zvol_discard, req);
-                               break;
-                       }
-#endif /* HAVE_BLK_QUEUE_DISCARD */
+       if (rw == WRITE) {
+               if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
+                       error = SET_ERROR(EROFS);
+                       goto out2;
+               }
 
-                       zvol_dispatch(zvol_write, req);
-                       break;
-               default:
-                       printk(KERN_INFO "%s: unknown cmd: %d\n",
-                           req->rq_disk->disk_name, (int)rq_data_dir(req));
-                       __blk_end_request(req, -EIO, size);
-                       break;
+               if (bio->bi_rw & VDEV_REQ_DISCARD) {
+                       error = zvol_discard(bio);
+                       goto out2;
                }
-       }
+
+               error = zvol_write(bio);
+       } else
+               error = zvol_read(bio);
+
+out2:
+       generic_end_io_acct(rw, &zv->zv_disk->part0, start);
+out1:
+       BIO_END_IO(bio, -error);
+       spl_fstrans_unmark(cookie);
+#ifdef HAVE_MAKE_REQUEST_FN_RET_INT
+       return (0);
+#endif
 }
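
The new bio-based path leans on a handful of compatibility macros (BIO_BI_SECTOR, BIO_BI_SIZE, BIO_END_IO, MAKE_REQUEST_FN_RET) whose definitions are not part of this diff. The sketch below shows roughly what they expand to; the configure-test symbols and exact bodies are assumptions for illustration, the real definitions live in the ZFS blkdev compat headers and are selected per kernel version.

    /* Illustrative only; not the actual compat header. */

    #ifdef HAVE_BIO_BVEC_ITER                   /* kernels with struct bvec_iter */
    #define BIO_BI_SECTOR(bio)  (bio)->bi_iter.bi_sector
    #define BIO_BI_SIZE(bio)    (bio)->bi_iter.bi_size
    #else                                       /* older split bio fields */
    #define BIO_BI_SECTOR(bio)  (bio)->bi_sector
    #define BIO_BI_SIZE(bio)    (bio)->bi_size
    #endif

    #ifdef HAVE_MAKE_REQUEST_FN_RET_INT         /* make_request_fn returns int */
    #define MAKE_REQUEST_FN_RET int
    #else                                       /* void on later kernels */
    #define MAKE_REQUEST_FN_RET void
    #endif

    /* bio_endio() lost its error argument on later kernels. */
    #ifdef HAVE_2ARGS_BIO_END_IO
    #define BIO_END_IO(bio, error)  bio_endio(bio, error)
    #else
    #define BIO_END_IO(bio, error)  \
            do { (bio)->bi_error = (error); bio_endio(bio); } while (0)
    #endif
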
 
 static void
@@ -1235,25 +1202,17 @@ static zvol_state_t *
 zvol_alloc(dev_t dev, const char *name)
 {
        zvol_state_t *zv;
-       int error = 0;
 
        zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
 
        spin_lock_init(&zv->zv_lock);
        list_link_init(&zv->zv_next);
 
-       zv->zv_queue = blk_init_queue(zvol_request, &zv->zv_lock);
+       zv->zv_queue = blk_alloc_queue(GFP_ATOMIC);
        if (zv->zv_queue == NULL)
                goto out_kmem;
 
-#ifdef HAVE_ELEVATOR_CHANGE
-       error = elevator_change(zv->zv_queue, "noop");
-#endif /* HAVE_ELEVATOR_CHANGE */
-       if (error) {
-               printk("ZFS: Unable to set \"%s\" scheduler for zvol %s: %d\n",
-                   "noop", name, error);
-               goto out_queue;
-       }
+       blk_queue_make_request(zv->zv_queue, zvol_request);
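
blk_alloc_queue() plus blk_queue_make_request() is the standard pattern for a bio-based driver on kernels of this era: the queue has no request_fn and no elevator, which is why the elevator_change("noop") call removed above is no longer needed. A minimal stand-alone sketch, assuming the void make_request signature used by roughly 3.2 through 4.3 kernels; mydev_request and mydev_alloc_queue are placeholder names:

    #include <linux/blkdev.h>

    static void
    mydev_request(struct request_queue *q, struct bio *bio)
    {
            /*
             * A real driver does its I/O here (compare zvol_request() above);
             * this stub completes every bio immediately with success.
             */
            bio_endio(bio, 0);
    }

    static struct request_queue *
    mydev_alloc_queue(void)
    {
            struct request_queue *q;

            q = blk_alloc_queue(GFP_KERNEL);
            if (q == NULL)
                    return (NULL);

            /* Route bios directly to mydev_request(); no elevator involved. */
            blk_queue_make_request(q, mydev_request);
            return (q);
    }
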
 
 #ifdef HAVE_BLK_QUEUE_FLUSH
        blk_queue_flush(zv->zv_queue, VDEV_REQ_FLUSH | VDEV_REQ_FUA);
@@ -1339,6 +1298,7 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev)
        objset_t *os;
        dmu_object_info_t *doi;
        uint64_t volsize;
+       uint64_t len;
        unsigned minor = 0;
        int error = 0;
 
@@ -1389,20 +1349,21 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev)
 
        set_capacity(zv->zv_disk, zv->zv_volsize >> 9);
 
-       blk_queue_max_hw_sectors(zv->zv_queue, DMU_MAX_ACCESS / 512);
+       blk_queue_max_hw_sectors(zv->zv_queue, (DMU_MAX_ACCESS / 4) >> 9);
        blk_queue_max_segments(zv->zv_queue, UINT16_MAX);
        blk_queue_max_segment_size(zv->zv_queue, UINT_MAX);
        blk_queue_physical_block_size(zv->zv_queue, zv->zv_volblocksize);
        blk_queue_io_opt(zv->zv_queue, zv->zv_volblocksize);
-#ifdef HAVE_BLK_QUEUE_DISCARD
        blk_queue_max_discard_sectors(zv->zv_queue,
            (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
        blk_queue_discard_granularity(zv->zv_queue, zv->zv_volblocksize);
        queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zv->zv_queue);
-#endif
-#ifdef HAVE_BLK_QUEUE_NONROT
+#ifdef QUEUE_FLAG_NONROT
        queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zv->zv_queue);
 #endif
+#ifdef QUEUE_FLAG_ADD_RANDOM
+       queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zv->zv_queue);
+#endif
 
        if (spa_writeable(dmu_objset_spa(os))) {
                if (zil_replay_disable)
@@ -1411,6 +1372,18 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev)
                        zil_replay(os, zv, zvol_replay_vector);
        }
 
+       /*
+        * When udev detects the addition of the device it will immediately
+        * invoke blkid(8) to determine the type of content on the device.
+        * Prefetching the blocks commonly scanned by blkid(8) will speed
+        * up this process.
+        */
+       len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE);
+       if (len > 0) {
+               dmu_prefetch(os, ZVOL_OBJ, 0, len);
+               dmu_prefetch(os, ZVOL_OBJ, volsize - len, len);
+       }
+
        zv->zv_objset = NULL;
 out_dmu_objset_disown:
        dmu_objset_disown(os, zvol_tag);
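
The prefetch length above is clamped to SPA_MAXBLOCKSIZE, so with the default zvol_prefetch_bytes of 128 KiB both ends of the volume are warmed with one prefetch each, covering the regions blkid(8) reads when looking for filesystem signatures and the backup GPT header. A small sketch of the clamp follows; the 16 MiB SPA_MAXBLOCKSIZE value is an assumption mirroring spa.h, and the 1 GiB volsize is a made-up example.

    #include <stdint.h>
    #include <stdio.h>

    #define SPA_MAXBLOCKSIZE  (1ULL << 24)              /* 16 MiB, assumed */
    #define MIN(a, b)         ((a) < (b) ? (a) : (b))

    int
    main(void)
    {
            uint64_t settings[] = { 0, 128 * 1024, 64ULL << 20 };
            uint64_t volsize = 1ULL << 30;              /* example 1 GiB zvol */
            int i;

            for (i = 0; i < 3; i++) {
                    uint64_t len = MIN(settings[i], SPA_MAXBLOCKSIZE);

                    if (len == 0) {
                            printf("zvol_prefetch_bytes=0: prefetch disabled\n");
                            continue;
                    }
                    printf("zvol_prefetch_bytes=%llu: prefetch [0, %llu) "
                        "and [%llu, %llu)\n",
                        (unsigned long long)settings[i],
                        (unsigned long long)len,
                        (unsigned long long)(volsize - len),
                        (unsigned long long)volsize);
            }
            return (0);
    }

Because the parameter is registered with mode 0644 (see the module_param hunk at the end of this diff), it can also be adjusted at runtime through /sys/module/zfs/parameters/zvol_prefetch_bytes.
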
@@ -1631,18 +1604,10 @@ zvol_init(void)
 
        mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
 
-       zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_threads, maxclsyspri,
-           zvol_threads, INT_MAX, TASKQ_PREPOPULATE);
-       if (zvol_taskq == NULL) {
-               printk(KERN_INFO "ZFS: taskq_create() failed\n");
-               error = -ENOMEM;
-               goto out1;
-       }
-
        error = register_blkdev(zvol_major, ZVOL_DRIVER);
        if (error) {
                printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
-               goto out2;
+               goto out;
        }
 
        blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS,
@@ -1650,9 +1615,7 @@ zvol_init(void)
 
        return (0);
 
-out2:
-       taskq_destroy(zvol_taskq);
-out1:
+out:
        mutex_destroy(&zvol_state_lock);
        list_destroy(&zvol_state_list);
 
@@ -1665,7 +1628,6 @@ zvol_fini(void)
        zvol_remove_minors(NULL);
        blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS);
        unregister_blkdev(zvol_major, ZVOL_DRIVER);
-       taskq_destroy(zvol_taskq);
        mutex_destroy(&zvol_state_lock);
        list_destroy(&zvol_state_list);
 }
@@ -1676,8 +1638,8 @@ MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");
 module_param(zvol_major, uint, 0444);
 MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
 
-module_param(zvol_threads, uint, 0444);
-MODULE_PARM_DESC(zvol_threads, "Number of threads for zvol device");
-
 module_param(zvol_max_discard_blocks, ulong, 0444);
 MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");
+
+module_param(zvol_prefetch_bytes, uint, 0644);
+MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");