module/os/linux/zfs/zvol_os.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or https://opensource.org/licenses/CDDL-1.0.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
  23  */
  24
  25 #include <sys/dataset_kstats.h>
  26 #include <sys/dbuf.h>
  27 #include <sys/dmu_traverse.h>
  28 #include <sys/dsl_dataset.h>
  29 #include <sys/dsl_prop.h>
  30 #include <sys/dsl_dir.h>
  31 #include <sys/zap.h>
  32 #include <sys/zfeature.h>
  33 #include <sys/zil_impl.h>
  34 #include <sys/dmu_tx.h>
  35 #include <sys/zio.h>
  36 #include <sys/zfs_rlock.h>
  37 #include <sys/spa_impl.h>
  38 #include <sys/zvol.h>
  39 #include <sys/zvol_impl.h>
  40 #include <cityhash.h>
  41
  42 #include <linux/blkdev_compat.h>
  43 #include <linux/task_io_accounting_ops.h>
  44
  45 #ifdef HAVE_BLK_MQ
  46 #include <linux/blk-mq.h>
  47 #endif
  48
  49 static void zvol_request_impl(zvol_state_t *zv, struct bio *bio,
  50     struct request *rq, boolean_t force_sync);
  51
  52 static unsigned int zvol_major = ZVOL_MAJOR;
  53 static unsigned int zvol_request_sync = 0;
  54 static unsigned int zvol_prefetch_bytes = (128 * 1024);
  55 static unsigned long zvol_max_discard_blocks = 16384;
  56
  57 /*
  58  * Switch taskq at multiple of 512 MB offset. This can be set to a lower value
  59  * to utilize more threads for small files but may affect prefetch hits.
  60  */
  61 #define ZVOL_TASKQ_OFFSET_SHIFT 29
  62
  63 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
  64 static unsigned int zvol_open_timeout_ms = 1000;
  65 #endif
  66
  67 static unsigned int zvol_threads = 0;
  68 #ifdef HAVE_BLK_MQ
  69 static unsigned int zvol_blk_mq_threads = 0;
  70 static unsigned int zvol_blk_mq_actual_threads;
  71 static boolean_t zvol_use_blk_mq = B_FALSE;
  72
  73 /*
  74  * The maximum number of volblocksize blocks to process per thread.  Typically,
  75  * write heavy workloads preform better with higher values here, and read
  76  * heavy workloads preform better with lower values, but that's not a hard
  77  * and fast rule.  It's basically a knob to tune between "less overhead with
  78  * less parallelism" and "more overhead, but more parallelism".
  79  *
  80  * '8' was chosen as a reasonable, balanced, default based off of sequential
  81  * read and write tests to a zvol in an NVMe pool (with 16 CPUs).
  82  */
  83 static unsigned int zvol_blk_mq_blocks_per_thread = 8;
  84 #endif
  85
  86 static unsigned int zvol_num_taskqs = 0;
  87
  88 #ifndef BLKDEV_DEFAULT_RQ
  89 /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
  90 #define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ
  91 #endif
  92
  93 /*
  94  * Finalize our BIO or request.
  95  */
  96 #ifdef  HAVE_BLK_MQ
  97 #define END_IO(zv, bio, rq, error)  do { \
  98         if (bio) { \
  99                 BIO_END_IO(bio, error); \
 100         } else { \
 101                 blk_mq_end_request(rq, errno_to_bi_status(error)); \
 102         } \
 103 } while (0)
 104 #else
 105 #define END_IO(zv, bio, rq, error)      BIO_END_IO(bio, error)
 106 #endif
 107
 108 #ifdef HAVE_BLK_MQ
 109 static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
 110 static unsigned int zvol_actual_blk_mq_queue_depth;
 111 #endif
 112
 113 struct zvol_state_os {
 114         struct gendisk          *zvo_disk;      /* generic disk */
 115         struct request_queue    *zvo_queue;     /* request queue */
 116         dev_t                   zvo_dev;        /* device id */
 117
 118 #ifdef HAVE_BLK_MQ
 119         struct blk_mq_tag_set tag_set;
 120 #endif
 121
 122         /* Set from the global 'zvol_use_blk_mq' at zvol load */
 123         boolean_t use_blk_mq;
 124 };
 125
 126 typedef struct zv_taskq {
 127         uint_t tqs_cnt;
 128         taskq_t **tqs_taskq;
 129 } zv_taskq_t;
 130 static zv_taskq_t zvol_taskqs;
 131 static struct ida zvol_ida;
 132
 133 typedef struct zv_request_stack {
 134         zvol_state_t    *zv;
 135         struct bio      *bio;
 136         struct request *rq;
 137 } zv_request_t;
 138
 139 typedef struct zv_work {
 140         struct request  *rq;
 141         struct work_struct work;
 142 } zv_work_t;
 143
 144 typedef struct zv_request_task {
 145         zv_request_t zvr;
 146         taskq_ent_t     ent;
 147 } zv_request_task_t;
 148
 149 static zv_request_task_t *
 150 zv_request_task_create(zv_request_t zvr)
 151 {
 152         zv_request_task_t *task;
 153         task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP);
 154         taskq_init_ent(&task->ent);
 155         task->zvr = zvr;
 156         return (task);
 157 }
 158
 159 static void
 160 zv_request_task_free(zv_request_task_t *task)
 161 {
 162         kmem_free(task, sizeof (*task));
 163 }
 164
 165 #ifdef HAVE_BLK_MQ
 166
 167 /*
 168  * This is called when a new block multiqueue request comes in.  A request
 169  * contains one or more BIOs.
 170  */
 171 static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 172     const struct blk_mq_queue_data *bd)
 173 {
 174         struct request *rq = bd->rq;
 175         zvol_state_t *zv = rq->q->queuedata;
 176
 177         /* Tell the kernel that we are starting to process this request */
 178         blk_mq_start_request(rq);
 179
 180         if (blk_rq_is_passthrough(rq)) {
 181                 /* Skip non filesystem request */
 182                 blk_mq_end_request(rq, BLK_STS_IOERR);
 183                 return (BLK_STS_IOERR);
 184         }
 185
 186         zvol_request_impl(zv, NULL, rq, 0);
 187
 188         /* Acknowledge to the kernel that we got this request */
 189         return (BLK_STS_OK);
 190 }
 191
 192 static struct blk_mq_ops zvol_blk_mq_queue_ops = {
 193         .queue_rq = zvol_mq_queue_rq,
 194 };
 195
 196 /* Initialize our blk-mq struct */
 197 static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv)
 198 {
 199         struct zvol_state_os *zso = zv->zv_zso;
 200
 201         memset(&zso->tag_set, 0, sizeof (zso->tag_set));
 202
 203         /* Initialize tag set. */
 204         zso->tag_set.ops = &zvol_blk_mq_queue_ops;
 205         zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads;
 206         zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth;
 207         zso->tag_set.numa_node = NUMA_NO_NODE;
 208         zso->tag_set.cmd_size = 0;
 209
 210         /*
 211          * We need BLK_MQ_F_BLOCKING here since we do blocking calls in
 212          * zvol_request_impl()
 213          */
 214         zso->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
 215         zso->tag_set.driver_data = zv;
 216
 217         return (blk_mq_alloc_tag_set(&zso->tag_set));
 218 }
 219 #endif /* HAVE_BLK_MQ */
 220
 221 /*
 222  * Given a path, return TRUE if path is a ZVOL.
 223  */
 224 boolean_t
 225 zvol_os_is_zvol(const char *path)
 226 {
 227         dev_t dev = 0;
 228
 229         if (vdev_lookup_bdev(path, &dev) != 0)
 230                 return (B_FALSE);
 231
 232         if (MAJOR(dev) == zvol_major)
 233                 return (B_TRUE);
 234
 235         return (B_FALSE);
 236 }
 237
 238 static void
 239 zvol_write(zv_request_t *zvr)
 240 {
 241         struct bio *bio = zvr->bio;
 242         struct request *rq = zvr->rq;
 243         int error = 0;
 244         zfs_uio_t uio;
 245         zvol_state_t *zv = zvr->zv;
 246         struct request_queue *q;
 247         struct gendisk *disk;
 248         unsigned long start_time = 0;
 249         boolean_t acct = B_FALSE;
 250
 251         ASSERT3P(zv, !=, NULL);
 252         ASSERT3U(zv->zv_open_count, >, 0);
 253         ASSERT3P(zv->zv_zilog, !=, NULL);
 254
 255         q = zv->zv_zso->zvo_queue;
 256         disk = zv->zv_zso->zvo_disk;
 257
 258         /* bio marked as FLUSH need to flush before write */
 259         if (io_is_flush(bio, rq))
 260                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
 261
 262         /* Some requests are just for flush and nothing else. */
 263         if (io_size(bio, rq) == 0) {
 264                 rw_exit(&zv->zv_suspend_lock);
 265                 END_IO(zv, bio, rq, 0);
 266                 return;
 267         }
 268
 269         zfs_uio_bvec_init(&uio, bio, rq);
 270
 271         ssize_t start_resid = uio.uio_resid;
 272
 273         /*
 274          * With use_blk_mq, accounting is done by blk_mq_start_request()
 275          * and blk_mq_end_request(), so we can skip it here.
 276          */
 277         if (bio) {
 278                 acct = blk_queue_io_stat(q);
 279                 if (acct) {
 280                         start_time = blk_generic_start_io_acct(q, disk, WRITE,
 281                             bio);
 282                 }
 283         }
 284
 285         boolean_t sync =
 286             io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
 287
 288         zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
 289             uio.uio_loffset, uio.uio_resid, RL_WRITER);
 290
 291         uint64_t volsize = zv->zv_volsize;
 292         while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
 293                 uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
 294                 uint64_t off = uio.uio_loffset;
 295                 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
 296
 297                 if (bytes > volsize - off)      /* don't write past the end */
 298                         bytes = volsize - off;
 299
 300                 dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
 301
 302                 /* This will only fail for ENOSPC */
 303                 error = dmu_tx_assign(tx, TXG_WAIT);
 304                 if (error) {
 305                         dmu_tx_abort(tx);
 306                         break;
 307                 }
 308                 error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
 309                 if (error == 0) {
 310                         zvol_log_write(zv, tx, off, bytes, sync);
 311                 }
 312                 dmu_tx_commit(tx);
 313
 314                 if (error)
 315                         break;
 316         }
 317         zfs_rangelock_exit(lr);
 318
 319         int64_t nwritten = start_resid - uio.uio_resid;
 320         dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
 321         task_io_account_write(nwritten);
 322
 323         if (sync)
 324                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
 325
 326         rw_exit(&zv->zv_suspend_lock);
 327
 328         if (bio && acct) {
 329                 blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
 330         }
 331
 332         END_IO(zv, bio, rq, -error);
 333 }
 334
 335 static void
 336 zvol_write_task(void *arg)
 337 {
 338         zv_request_task_t *task = arg;
 339         zvol_write(&task->zvr);
 340         zv_request_task_free(task);
 341 }
 342
 343 static void
 344 zvol_discard(zv_request_t *zvr)
 345 {
 346         struct bio *bio = zvr->bio;
 347         struct request *rq = zvr->rq;
 348         zvol_state_t *zv = zvr->zv;
 349         uint64_t start = io_offset(bio, rq);
 350         uint64_t size = io_size(bio, rq);
 351         uint64_t end = start + size;
 352         boolean_t sync;
 353         int error = 0;
 354         dmu_tx_t *tx;
 355         struct request_queue *q = zv->zv_zso->zvo_queue;
 356         struct gendisk *disk = zv->zv_zso->zvo_disk;
 357         unsigned long start_time = 0;
 358         boolean_t acct = B_FALSE;
 359
 360         ASSERT3P(zv, !=, NULL);
 361         ASSERT3U(zv->zv_open_count, >, 0);
 362         ASSERT3P(zv->zv_zilog, !=, NULL);
 363
 364         if (bio) {
 365                 acct = blk_queue_io_stat(q);
 366                 if (acct) {
 367                         start_time = blk_generic_start_io_acct(q, disk, WRITE,
 368                             bio);
 369                 }
 370         }
 371
 372         sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
 373
 374         if (end > zv->zv_volsize) {
 375                 error = SET_ERROR(EIO);
 376                 goto unlock;
 377         }
 378
 379         /*
 380          * Align the request to volume block boundaries when a secure erase is
 381          * not required.  This will prevent dnode_free_range() from zeroing out
 382          * the unaligned parts which is slow (read-modify-write) and useless
 383          * since we are not freeing any space by doing so.
 384          */
 385         if (!io_is_secure_erase(bio, rq)) {
 386                 start = P2ROUNDUP(start, zv->zv_volblocksize);
 387                 end = P2ALIGN(end, zv->zv_volblocksize);
 388                 size = end - start;
 389         }
 390
 391         if (start >= end)
 392                 goto unlock;
 393
 394         zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
 395             start, size, RL_WRITER);
 396
 397         tx = dmu_tx_create(zv->zv_objset);
 398         dmu_tx_mark_netfree(tx);
 399         error = dmu_tx_assign(tx, TXG_WAIT);
 400         if (error != 0) {
 401                 dmu_tx_abort(tx);
 402         } else {
 403                 zvol_log_truncate(zv, tx, start, size);
 404                 dmu_tx_commit(tx);
 405                 error = dmu_free_long_range(zv->zv_objset,
 406                     ZVOL_OBJ, start, size);
 407         }
 408         zfs_rangelock_exit(lr);
 409
 410         if (error == 0 && sync)
 411                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
 412
 413 unlock:
 414         rw_exit(&zv->zv_suspend_lock);
 415
 416         if (bio && acct) {
 417                 blk_generic_end_io_acct(q, disk, WRITE, bio,
 418                     start_time);
 419         }
 420
 421         END_IO(zv, bio, rq, -error);
 422 }
 423
 424 static void
 425 zvol_discard_task(void *arg)
 426 {
 427         zv_request_task_t *task = arg;
 428         zvol_discard(&task->zvr);
 429         zv_request_task_free(task);
 430 }
 431
 432 static void
 433 zvol_read(zv_request_t *zvr)
 434 {
 435         struct bio *bio = zvr->bio;
 436         struct request *rq = zvr->rq;
 437         int error = 0;
 438         zfs_uio_t uio;
 439         boolean_t acct = B_FALSE;
 440         zvol_state_t *zv = zvr->zv;
 441         struct request_queue *q;
 442         struct gendisk *disk;
 443         unsigned long start_time = 0;
 444
 445         ASSERT3P(zv, !=, NULL);
 446         ASSERT3U(zv->zv_open_count, >, 0);
 447
 448         zfs_uio_bvec_init(&uio, bio, rq);
 449
 450         q = zv->zv_zso->zvo_queue;
 451         disk = zv->zv_zso->zvo_disk;
 452
 453         ssize_t start_resid = uio.uio_resid;
 454
 455         /*
 456          * When blk-mq is being used, accounting is done by
 457          * blk_mq_start_request() and blk_mq_end_request().
 458          */
 459         if (bio) {
 460                 acct = blk_queue_io_stat(q);
 461                 if (acct)
 462                         start_time = blk_generic_start_io_acct(q, disk, READ,
 463                             bio);
 464         }
 465
 466         zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
 467             uio.uio_loffset, uio.uio_resid, RL_READER);
 468
 469         uint64_t volsize = zv->zv_volsize;
 470
 471         while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
 472                 uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
 473
 474                 /* don't read past the end */
 475                 if (bytes > volsize - uio.uio_loffset)
 476                         bytes = volsize - uio.uio_loffset;
 477
 478                 error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
 479                 if (error) {
 480                         /* convert checksum errors into IO errors */
 481                         if (error == ECKSUM)
 482                                 error = SET_ERROR(EIO);
 483                         break;
 484                 }
 485         }
 486         zfs_rangelock_exit(lr);
 487
 488         int64_t nread = start_resid - uio.uio_resid;
 489         dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
 490         task_io_account_read(nread);
 491
 492         rw_exit(&zv->zv_suspend_lock);
 493
 494         if (bio && acct) {
 495                 blk_generic_end_io_acct(q, disk, READ, bio, start_time);
 496         }
 497
 498         END_IO(zv, bio, rq, -error);
 499 }
 500
 501 static void
 502 zvol_read_task(void *arg)
 503 {
 504         zv_request_task_t *task = arg;
 505         zvol_read(&task->zvr);
 506         zv_request_task_free(task);
 507 }
 508
 509
 510 /*
 511  * Process a BIO or request
 512  *
 513  * Either 'bio' or 'rq' should be set depending on if we are processing a
 514  * bio or a request (both should not be set).
 515  *
 516  * force_sync:  Set to 0 to defer processing to a background taskq
 517  *                      Set to 1 to process data synchronously
 518  */
 519 static void
 520 zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
 521     boolean_t force_sync)
 522 {
 523         fstrans_cookie_t cookie = spl_fstrans_mark();
 524         uint64_t offset = io_offset(bio, rq);
 525         uint64_t size = io_size(bio, rq);
 526         int rw = io_data_dir(bio, rq);
 527
 528         if (zvol_request_sync || zv->zv_threading == B_FALSE)
 529                 force_sync = 1;
 530
 531         zv_request_t zvr = {
 532                 .zv = zv,
 533                 .bio = bio,
 534                 .rq = rq,
 535         };
 536
 537         if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) {
 538                 printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n",
 539                     zv->zv_zso->zvo_disk->disk_name,
 540                     (long long unsigned)offset,
 541                     (long unsigned)size);
 542
 543                 END_IO(zv, bio, rq, -SET_ERROR(EIO));
 544                 goto out;
 545         }
 546
 547         zv_request_task_t *task;
 548         zv_taskq_t *ztqs = &zvol_taskqs;
 549         uint_t blk_mq_hw_queue = 0;
 550         uint_t tq_idx;
 551         uint_t taskq_hash;
 552 #ifdef HAVE_BLK_MQ
 553         if (rq)
 554                 blk_mq_hw_queue = rq->mq_hctx->queue_num;
 555 #endif
 556         taskq_hash = cityhash4((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
 557             blk_mq_hw_queue, 0);
 558         tq_idx = taskq_hash % ztqs->tqs_cnt;
 559
 560         if (rw == WRITE) {
 561                 if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
 562                         END_IO(zv, bio, rq, -SET_ERROR(EROFS));
 563                         goto out;
 564                 }
 565
 566                 /*
 567                  * Prevents the zvol from being suspended, or the ZIL being
 568                  * concurrently opened.  Will be released after the i/o
 569                  * completes.
 570                  */
 571                 rw_enter(&zv->zv_suspend_lock, RW_READER);
 572
 573                 /*
 574                  * Open a ZIL if this is the first time we have written to this
 575                  * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
 576                  * than zv_state_lock so that we don't need to acquire an
 577                  * additional lock in this path.
 578                  */
 579                 if (zv->zv_zilog == NULL) {
 580                         rw_exit(&zv->zv_suspend_lock);
 581                         rw_enter(&zv->zv_suspend_lock, RW_WRITER);
 582                         if (zv->zv_zilog == NULL) {
 583                                 zv->zv_zilog = zil_open(zv->zv_objset,
 584                                     zvol_get_data, &zv->zv_kstat.dk_zil_sums);
 585                                 zv->zv_flags |= ZVOL_WRITTEN_TO;
 586                                 /* replay / destroy done in zvol_create_minor */
 587                                 VERIFY0((zv->zv_zilog->zl_header->zh_flags &
 588                                     ZIL_REPLAY_NEEDED));
 589                         }
 590                         rw_downgrade(&zv->zv_suspend_lock);
 591                 }
 592
 593                 /*
 594                  * We don't want this thread to be blocked waiting for i/o to
 595                  * complete, so we instead wait from a taskq callback. The
 596                  * i/o may be a ZIL write (via zil_commit()), or a read of an
 597                  * indirect block, or a read of a data block (if this is a
 598                  * partial-block write).  We will indicate that the i/o is
 599                  * complete by calling END_IO() from the taskq callback.
 600                  *
 601                  * This design allows the calling thread to continue and
 602                  * initiate more concurrent operations by calling
 603                  * zvol_request() again. There are typically only a small
 604                  * number of threads available to call zvol_request() (e.g.
 605                  * one per iSCSI target), so keeping the latency of
 606                  * zvol_request() low is important for performance.
 607                  *
 608                  * The zvol_request_sync module parameter allows this
 609                  * behavior to be altered, for performance evaluation
 610                  * purposes.  If the callback blocks, setting
 611                  * zvol_request_sync=1 will result in much worse performance.
 612                  *
 613                  * We can have up to zvol_threads concurrent i/o's being
 614                  * processed for all zvols on the system.  This is typically
 615                  * a vast improvement over the zvol_request_sync=1 behavior
 616                  * of one i/o at a time per zvol.  However, an even better
 617                  * design would be for zvol_request() to initiate the zio
 618                  * directly, and then be notified by the zio_done callback,
 619                  * which would call END_IO().  Unfortunately, the DMU/ZIL
 620                  * interfaces lack this functionality (they block waiting for
 621                  * the i/o to complete).
 622                  */
 623                 if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) {
 624                         if (force_sync) {
 625                                 zvol_discard(&zvr);
 626                         } else {
 627                                 task = zv_request_task_create(zvr);
 628                                 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
 629                                     zvol_discard_task, task, 0, &task->ent);
 630                         }
 631                 } else {
 632                         if (force_sync) {
 633                                 zvol_write(&zvr);
 634                         } else {
 635                                 task = zv_request_task_create(zvr);
 636                                 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
 637                                     zvol_write_task, task, 0, &task->ent);
 638                         }
 639                 }
 640         } else {
 641                 /*
 642                  * The SCST driver, and possibly others, may issue READ I/Os
 643                  * with a length of zero bytes.  These empty I/Os contain no
 644                  * data and require no additional handling.
 645                  */
 646                 if (size == 0) {
 647                         END_IO(zv, bio, rq, 0);
 648                         goto out;
 649                 }
 650
 651                 rw_enter(&zv->zv_suspend_lock, RW_READER);
 652
 653                 /* See comment in WRITE case above. */
 654                 if (force_sync) {
 655                         zvol_read(&zvr);
 656                 } else {
 657                         task = zv_request_task_create(zvr);
 658                         taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
 659                             zvol_read_task, task, 0, &task->ent);
 660                 }
 661         }
 662
 663 out:
 664         spl_fstrans_unmark(cookie);
 665 }
 666
 667 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
 668 #ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID
 669 static void
 670 zvol_submit_bio(struct bio *bio)
 671 #else
 672 static blk_qc_t
 673 zvol_submit_bio(struct bio *bio)
 674 #endif
 675 #else
 676 static MAKE_REQUEST_FN_RET
 677 zvol_request(struct request_queue *q, struct bio *bio)
 678 #endif
 679 {
 680 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
 681 #if defined(HAVE_BIO_BDEV_DISK)
 682         struct request_queue *q = bio->bi_bdev->bd_disk->queue;
 683 #else
 684         struct request_queue *q = bio->bi_disk->queue;
 685 #endif
 686 #endif
 687         zvol_state_t *zv = q->queuedata;
 688
 689         zvol_request_impl(zv, bio, NULL, 0);
 690 #if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
 691         defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
 692         !defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID)
 693         return (BLK_QC_T_NONE);
 694 #endif
 695 }
 696
 697 static int
 698 #ifdef HAVE_BLK_MODE_T
 699 zvol_open(struct gendisk *disk, blk_mode_t flag)
 700 #else
 701 zvol_open(struct block_device *bdev, fmode_t flag)
 702 #endif
 703 {
 704         zvol_state_t *zv;
 705         int error = 0;
 706         boolean_t drop_suspend = B_FALSE;
 707 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
 708         hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms);
 709         hrtime_t start = gethrtime();
 710
 711 retry:
 712 #endif
 713         rw_enter(&zvol_state_lock, RW_READER);
 714         /*
 715          * Obtain a copy of private_data under the zvol_state_lock to make
 716          * sure that either the result of zvol free code path setting
 717          * disk->private_data to NULL is observed, or zvol_os_free()
 718          * is not called on this zv because of the positive zv_open_count.
 719          */
 720 #ifdef HAVE_BLK_MODE_T
 721         zv = disk->private_data;
 722 #else
 723         zv = bdev->bd_disk->private_data;
 724 #endif
 725         if (zv == NULL) {
 726                 rw_exit(&zvol_state_lock);
 727                 return (SET_ERROR(-ENXIO));
 728         }
 729
 730         mutex_enter(&zv->zv_state_lock);
 731         /*
 732          * Make sure zvol is not suspended during first open
 733          * (hold zv_suspend_lock) and respect proper lock acquisition
 734          * ordering - zv_suspend_lock before zv_state_lock
 735          */
 736         if (zv->zv_open_count == 0) {
 737                 if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
 738                         mutex_exit(&zv->zv_state_lock);
 739                         rw_enter(&zv->zv_suspend_lock, RW_READER);
 740                         mutex_enter(&zv->zv_state_lock);
 741                         /* check to see if zv_suspend_lock is needed */
 742                         if (zv->zv_open_count != 0) {
 743                                 rw_exit(&zv->zv_suspend_lock);
 744                         } else {
 745                                 drop_suspend = B_TRUE;
 746                         }
 747                 } else {
 748                         drop_suspend = B_TRUE;
 749                 }
 750         }
 751         rw_exit(&zvol_state_lock);
 752
 753         ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 754
 755         if (zv->zv_open_count == 0) {
 756                 boolean_t drop_namespace = B_FALSE;
 757
 758                 ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
 759
 760                 /*
 761                  * In all other call paths the spa_namespace_lock is taken
 762                  * before the bdev->bd_mutex lock.  However, on open(2)
 763                  * the __blkdev_get() function calls fops->open() with the
 764                  * bdev->bd_mutex lock held.  This can result in a deadlock
 765                  * when zvols from one pool are used as vdevs in another.
 766                  *
 767                  * To prevent a lock inversion deadlock we preemptively
 768                  * take the spa_namespace_lock.  Normally the lock will not
 769                  * be contended and this is safe because spa_open_common()
 770                  * handles the case where the caller already holds the
 771                  * spa_namespace_lock.
 772                  *
 773                  * When the lock cannot be aquired after multiple retries
 774                  * this must be the vdev on zvol deadlock case and we have
 775                  * no choice but to return an error.  For 5.12 and older
 776                  * kernels returning -ERESTARTSYS will result in the
 777                  * bdev->bd_mutex being dropped, then reacquired, and
 778                  * fops->open() being called again.  This process can be
 779                  * repeated safely until both locks are acquired.  For 5.13
 780                  * and newer the -ERESTARTSYS retry logic was removed from
 781                  * the kernel so the only option is to return the error for
 782                  * the caller to handle it.
 783                  */
 784                 if (!mutex_owned(&spa_namespace_lock)) {
 785                         if (!mutex_tryenter(&spa_namespace_lock)) {
 786                                 mutex_exit(&zv->zv_state_lock);
 787                                 rw_exit(&zv->zv_suspend_lock);
 788
 789 #ifdef HAVE_BLKDEV_GET_ERESTARTSYS
 790                                 schedule();
 791                                 return (SET_ERROR(-ERESTARTSYS));
 792 #else
 793                                 if ((gethrtime() - start) > timeout)
 794                                         return (SET_ERROR(-ERESTARTSYS));
 795
 796                                 schedule_timeout(MSEC_TO_TICK(10));
 797                                 goto retry;
 798 #endif
 799                         } else {
 800                                 drop_namespace = B_TRUE;
 801                         }
 802                 }
 803
 804                 error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag)));
 805
 806                 if (drop_namespace)
 807                         mutex_exit(&spa_namespace_lock);
 808         }
 809
 810         if (error == 0) {
 811                 if ((blk_mode_is_open_write(flag)) &&
 812                     (zv->zv_flags & ZVOL_RDONLY)) {
 813                         if (zv->zv_open_count == 0)
 814                                 zvol_last_close(zv);
 815
 816                         error = SET_ERROR(-EROFS);
 817                 } else {
 818                         zv->zv_open_count++;
 819                 }
 820         }
 821
 822         mutex_exit(&zv->zv_state_lock);
 823         if (drop_suspend)
 824                 rw_exit(&zv->zv_suspend_lock);
 825
 826         if (error == 0)
 827 #ifdef HAVE_BLK_MODE_T
 828                 disk_check_media_change(disk);
 829 #else
 830                 zfs_check_media_change(bdev);
 831 #endif
 832
 833         return (error);
 834 }
 835
/*
 * Block device release (close) handler for a zvol.
 *
 * Drops one reference from zv_open_count.  If that was the final
 * reference, zvol_last_close() is called while zv_suspend_lock is held
 * as a reader in addition to zv_state_lock.
 */
static void
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG
zvol_release(struct gendisk *disk)
#else
zvol_release(struct gendisk *disk, fmode_t unused)
#endif
{
#if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG)
        (void) unused;
#endif
        zvol_state_t *zv;
        /* B_TRUE while zv_suspend_lock must still be released on exit. */
        boolean_t drop_suspend = B_TRUE;

        rw_enter(&zvol_state_lock, RW_READER);
        zv = disk->private_data;

        mutex_enter(&zv->zv_state_lock);
        ASSERT3U(zv->zv_open_count, >, 0);
        /*
         * make sure zvol is not suspended during last close
         * (hold zv_suspend_lock) and respect proper lock acquisition
         * ordering - zv_suspend_lock before zv_state_lock
         */
        if (zv->zv_open_count == 1) {
                if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
                        /*
                         * The non-blocking attempt failed, so drop
                         * zv_state_lock and re-acquire both locks in
                         * the required order.
                         */
                        mutex_exit(&zv->zv_state_lock);
                        rw_enter(&zv->zv_suspend_lock, RW_READER);
                        mutex_enter(&zv->zv_state_lock);
                        /* check to see if zv_suspend_lock is needed */
                        if (zv->zv_open_count != 1) {
                                rw_exit(&zv->zv_suspend_lock);
                                drop_suspend = B_FALSE;
                        }
                }
        } else {
                /* Not the last close: zv_suspend_lock was never taken. */
                drop_suspend = B_FALSE;
        }
        rw_exit(&zvol_state_lock);

        ASSERT(MUTEX_HELD(&zv->zv_state_lock));

        zv->zv_open_count--;
        if (zv->zv_open_count == 0) {
                ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
                zvol_last_close(zv);
        }

        mutex_exit(&zv->zv_state_lock);

        if (drop_suspend)
                rw_exit(&zv->zv_suspend_lock);
}
 888
 889 static int
 890 zvol_ioctl(struct block_device *bdev, fmode_t mode,
 891     unsigned int cmd, unsigned long arg)
 892 {
 893         zvol_state_t *zv = bdev->bd_disk->private_data;
 894         int error = 0;
 895
 896         ASSERT3U(zv->zv_open_count, >, 0);
 897
 898         switch (cmd) {
 899         case BLKFLSBUF:
 900 #ifdef HAVE_FSYNC_BDEV
 901                 fsync_bdev(bdev);
 902 #elif defined(HAVE_SYNC_BLOCKDEV)
 903                 sync_blockdev(bdev);
 904 #else
 905 #error "Neither fsync_bdev() nor sync_blockdev() found"
 906 #endif
 907                 invalidate_bdev(bdev);
 908                 rw_enter(&zv->zv_suspend_lock, RW_READER);
 909
 910                 if (!(zv->zv_flags & ZVOL_RDONLY))
 911                         txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
 912
 913                 rw_exit(&zv->zv_suspend_lock);
 914                 break;
 915
 916         case BLKZNAME:
 917                 mutex_enter(&zv->zv_state_lock);
 918                 error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
 919                 mutex_exit(&zv->zv_state_lock);
 920                 break;
 921
 922         default:
 923                 error = -ENOTTY;
 924                 break;
 925         }
 926
 927         return (SET_ERROR(error));
 928 }
 929
#ifdef CONFIG_COMPAT
/*
 * compat_ioctl entry point: forwards all arguments unchanged to
 * zvol_ioctl() and returns its result.
 */
static int
zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned cmd, unsigned long arg)
{
        return (zvol_ioctl(bdev, mode, cmd, arg));
}
#else
/* Without CONFIG_COMPAT the name expands to NULL (no handler). */
#define zvol_compat_ioctl       NULL
#endif
 940
 941 static unsigned int
 942 zvol_check_events(struct gendisk *disk, unsigned int clearing)
 943 {
 944         unsigned int mask = 0;
 945
 946         rw_enter(&zvol_state_lock, RW_READER);
 947
 948         zvol_state_t *zv = disk->private_data;
 949         if (zv != NULL) {
 950                 mutex_enter(&zv->zv_state_lock);
 951                 mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
 952                 zv->zv_changed = 0;
 953                 mutex_exit(&zv->zv_state_lock);
 954         }
 955
 956         rw_exit(&zvol_state_lock);
 957
 958         return (mask);
 959 }
 960
 961 static int
 962 zvol_revalidate_disk(struct gendisk *disk)
 963 {
 964         rw_enter(&zvol_state_lock, RW_READER);
 965
 966         zvol_state_t *zv = disk->private_data;
 967         if (zv != NULL) {
 968                 mutex_enter(&zv->zv_state_lock);
 969                 set_capacity(zv->zv_zso->zvo_disk,
 970                     zv->zv_volsize >> SECTOR_BITS);
 971                 mutex_exit(&zv->zv_state_lock);
 972         }
 973
 974         rw_exit(&zvol_state_lock);
 975
 976         return (0);
 977 }
 978
 979 int
 980 zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
 981 {
 982         struct gendisk *disk = zv->zv_zso->zvo_disk;
 983
 984 #if defined(HAVE_REVALIDATE_DISK_SIZE)
 985         revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0);
 986 #elif defined(HAVE_REVALIDATE_DISK)
 987         revalidate_disk(disk);
 988 #else
 989         zvol_revalidate_disk(disk);
 990 #endif
 991         return (0);
 992 }
 993
 994 void
 995 zvol_os_clear_private(zvol_state_t *zv)
 996 {
 997         /*
 998          * Cleared while holding zvol_state_lock as a writer
 999          * which will prevent zvol_open() from opening it.
1000          */
1001         zv->zv_zso->zvo_disk->private_data = NULL;
1002 }
1003
1004 /*
1005  * Provide a simple virtual geometry for legacy compatibility.  For devices
1006  * smaller than 1 MiB a small head and sector count is used to allow very
1007  * tiny devices.  For devices over 1 Mib a standard head and sector count
1008  * is used to keep the cylinders count reasonable.
1009  */
1010 static int
1011 zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
1012 {
1013         zvol_state_t *zv = bdev->bd_disk->private_data;
1014         sector_t sectors;
1015
1016         ASSERT3U(zv->zv_open_count, >, 0);
1017
1018         sectors = get_capacity(zv->zv_zso->zvo_disk);
1019
1020         if (sectors > 2048) {
1021                 geo->heads = 16;
1022                 geo->sectors = 63;
1023         } else {
1024                 geo->heads = 2;
1025                 geo->sectors = 4;
1026         }
1027
1028         geo->start = 0;
1029         geo->cylinders = sectors / (geo->heads * geo->sectors);
1030
1031         return (0);
1032 }
1033
/*
 * Why have two separate block_device_operations structs?
 *
 * Normally we'd just have one, and assign 'submit_bio' as needed.  However,
 * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we
 * can't just change submit_bio dynamically at runtime.  So just create two
 * separate structs to get around this.
 */
/*
 * Operations table installed by zvol_alloc() when use_blk_mq is set.
 * It has no .submit_bio member.
 */
static const struct block_device_operations zvol_ops_blk_mq = {
        .open                   = zvol_open,
        .release                = zvol_release,
        .ioctl                  = zvol_ioctl,
        .compat_ioctl           = zvol_compat_ioctl,
        .check_events           = zvol_check_events,
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
        .revalidate_disk        = zvol_revalidate_disk,
#endif
        .getgeo                 = zvol_getgeo,
        .owner                  = THIS_MODULE,
};
1054
/*
 * Operations table installed by zvol_alloc() when use_blk_mq is not set.
 * Identical to zvol_ops_blk_mq except that .submit_bio is provided when
 * the kernel's block_device_operations has that member.
 */
static const struct block_device_operations zvol_ops = {
        .open                   = zvol_open,
        .release                = zvol_release,
        .ioctl                  = zvol_ioctl,
        .compat_ioctl           = zvol_compat_ioctl,
        .check_events           = zvol_check_events,
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
        .revalidate_disk        = zvol_revalidate_disk,
#endif
        .getgeo                 = zvol_getgeo,
        .owner                  = THIS_MODULE,
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
        .submit_bio             = zvol_submit_bio,
#endif
};
1070
/*
 * Allocate the gendisk and request queue for the non-blk-mq path and
 * store them in zso->zvo_disk / zso->zvo_queue.  Which kernel interface
 * is used depends on the HAVE_* feature macros.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
zvol_alloc_non_blk_mq(struct zvol_state_os *zso)
{
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
#if defined(HAVE_BLK_ALLOC_DISK)
        /* Disk and queue are allocated together. */
        zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE);
        if (zso->zvo_disk == NULL)
                return (1);

        zso->zvo_disk->minors = ZVOL_MINORS;
        zso->zvo_queue = zso->zvo_disk->queue;
#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
        /* Two-argument variant; failure is tested with IS_ERR(). */
        struct gendisk *disk = blk_alloc_disk(NULL, NUMA_NO_NODE);
        if (IS_ERR(disk)) {
                zso->zvo_disk = NULL;
                return (1);
        }

        zso->zvo_disk = disk;
        zso->zvo_disk->minors = ZVOL_MINORS;
        zso->zvo_queue = zso->zvo_disk->queue;
#else
        /* Queue and disk are allocated separately, then linked. */
        zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
        if (zso->zvo_queue == NULL)
                return (1);

        zso->zvo_disk = alloc_disk(ZVOL_MINORS);
        if (zso->zvo_disk == NULL) {
                blk_cleanup_queue(zso->zvo_queue);
                return (1);
        }

        zso->zvo_disk->queue = zso->zvo_queue;
#endif /* HAVE_BLK_ALLOC_DISK */
#else
        /* No submit_bio in block_device_operations: pass zvol_request. */
        zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
        if (zso->zvo_queue == NULL)
                return (1);

        zso->zvo_disk = alloc_disk(ZVOL_MINORS);
        if (zso->zvo_disk == NULL) {
                blk_cleanup_queue(zso->zvo_queue);
                return (1);
        }

        zso->zvo_disk->queue = zso->zvo_queue;
#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
        return (0);

}
1121
1122 static int
1123 zvol_alloc_blk_mq(zvol_state_t *zv)
1124 {
1125 #ifdef HAVE_BLK_MQ
1126         struct zvol_state_os *zso = zv->zv_zso;
1127
1128         /* Allocate our blk-mq tag_set */
1129         if (zvol_blk_mq_alloc_tag_set(zv) != 0)
1130                 return (1);
1131
1132 #if defined(HAVE_BLK_ALLOC_DISK)
1133         zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv);
1134         if (zso->zvo_disk == NULL) {
1135                 blk_mq_free_tag_set(&zso->tag_set);
1136                 return (1);
1137         }
1138         zso->zvo_queue = zso->zvo_disk->queue;
1139         zso->zvo_disk->minors = ZVOL_MINORS;
1140 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
1141         struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, NULL, zv);
1142         if (IS_ERR(disk)) {
1143                 zso->zvo_disk = NULL;
1144                 blk_mq_free_tag_set(&zso->tag_set);
1145                 return (1);
1146         }
1147
1148         zso->zvo_disk = disk;
1149         zso->zvo_queue = zso->zvo_disk->queue;
1150         zso->zvo_disk->minors = ZVOL_MINORS;
1151 #else
1152         zso->zvo_disk = alloc_disk(ZVOL_MINORS);
1153         if (zso->zvo_disk == NULL) {
1154                 blk_cleanup_queue(zso->zvo_queue);
1155                 blk_mq_free_tag_set(&zso->tag_set);
1156                 return (1);
1157         }
1158         /* Allocate queue */
1159         zso->zvo_queue = blk_mq_init_queue(&zso->tag_set);
1160         if (IS_ERR(zso->zvo_queue)) {
1161                 blk_mq_free_tag_set(&zso->tag_set);
1162                 return (1);
1163         }
1164
1165         /* Our queue is now created, assign it to our disk */
1166         zso->zvo_disk->queue = zso->zvo_queue;
1167
1168 #endif
1169 #endif
1170         return (0);
1171 }
1172
1173 /*
1174  * Allocate memory for a new zvol_state_t and setup the required
1175  * request queue and generic disk structures for the block device.
1176  */
1177 static zvol_state_t *
1178 zvol_alloc(dev_t dev, const char *name)
1179 {
1180         zvol_state_t *zv;
1181         struct zvol_state_os *zso;
1182         uint64_t volmode;
1183         int ret;
1184
1185         if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
1186                 return (NULL);
1187
1188         if (volmode == ZFS_VOLMODE_DEFAULT)
1189                 volmode = zvol_volmode;
1190
1191         if (volmode == ZFS_VOLMODE_NONE)
1192                 return (NULL);
1193
1194         zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
1195         zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
1196         zv->zv_zso = zso;
1197         zv->zv_volmode = volmode;
1198
1199         list_link_init(&zv->zv_next);
1200         mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
1201
1202 #ifdef HAVE_BLK_MQ
1203         zv->zv_zso->use_blk_mq = zvol_use_blk_mq;
1204 #endif
1205
1206         /*
1207          * The block layer has 3 interfaces for getting BIOs:
1208          *
1209          * 1. blk-mq request queues (new)
1210          * 2. submit_bio() (oldest)
1211          * 3. regular request queues (old).
1212          *
1213          * Each of those interfaces has two permutations:
1214          *
1215          * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates
1216          *    both the disk and its queue (5.14 kernel or newer)
1217          *
1218          * b) We don't have blk_*alloc_disk(), and have to allocate the
1219          *    disk and the queue separately. (5.13 kernel or older)
1220          */
1221         if (zv->zv_zso->use_blk_mq) {
1222                 ret = zvol_alloc_blk_mq(zv);
1223                 zso->zvo_disk->fops = &zvol_ops_blk_mq;
1224         } else {
1225                 ret = zvol_alloc_non_blk_mq(zso);
1226                 zso->zvo_disk->fops = &zvol_ops;
1227         }
1228         if (ret != 0)
1229                 goto out_kmem;
1230
1231         blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE);
1232
1233         /* Limit read-ahead to a single page to prevent over-prefetching. */
1234         blk_queue_set_read_ahead(zso->zvo_queue, 1);
1235
1236         if (!zv->zv_zso->use_blk_mq) {
1237                 /* Disable write merging in favor of the ZIO pipeline. */
1238                 blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
1239         }
1240
1241         /* Enable /proc/diskstats */
1242         blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue);
1243
1244         zso->zvo_queue->queuedata = zv;
1245         zso->zvo_dev = dev;
1246         zv->zv_open_count = 0;
1247         strlcpy(zv->zv_name, name, sizeof (zv->zv_name));
1248
1249         zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
1250         rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
1251
1252         zso->zvo_disk->major = zvol_major;
1253         zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE;
1254
1255         /*
1256          * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices.
1257          * This is accomplished by limiting the number of minors for the
1258          * device to one and explicitly disabling partition scanning.
1259          */
1260         if (volmode == ZFS_VOLMODE_DEV) {
1261                 zso->zvo_disk->minors = 1;
1262                 zso->zvo_disk->flags &= ~ZFS_GENHD_FL_EXT_DEVT;
1263                 zso->zvo_disk->flags |= ZFS_GENHD_FL_NO_PART;
1264         }
1265
1266         zso->zvo_disk->first_minor = (dev & MINORMASK);
1267         zso->zvo_disk->private_data = zv;
1268         snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
1269             ZVOL_DEV_NAME, (dev & MINORMASK));
1270
1271         return (zv);
1272
1273 out_kmem:
1274         kmem_free(zso, sizeof (struct zvol_state_os));
1275         kmem_free(zv, sizeof (zvol_state_t));
1276         return (NULL);
1277 }
1278
/*
 * Cleanup then free a zvol_state_t which was created by zvol_alloc().
 * At this time, the structure is not opened by anyone, is taken off
 * the zvol_state_list, and has its private data set to NULL.
 * The zvol_state_lock is dropped.
 *
 * This function may take many milliseconds to complete (e.g. we've seen
 * it take over 256ms), due to the calls to "blk_cleanup_queue" and
 * "del_gendisk". Thus, consumers need to be careful to account for this
 * latency when calling this function.
 */
void
zvol_os_free(zvol_state_t *zv)
{

        /* Preconditions: no locks held, no openers, disk already detached. */
        ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
        ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
        ASSERT0(zv->zv_open_count);
        ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);

        rw_destroy(&zv->zv_suspend_lock);
        zfs_rangelock_fini(&zv->zv_rangelock);

        del_gendisk(zv->zv_zso->zvo_disk);
        /*
         * Release the disk (and queue) with the interface matching the
         * kernel feature macros in effect.
         */
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
        (defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
#if defined(HAVE_BLK_CLEANUP_DISK)
        blk_cleanup_disk(zv->zv_zso->zvo_disk);
#else
        put_disk(zv->zv_zso->zvo_disk);
#endif
#else
        blk_cleanup_queue(zv->zv_zso->zvo_queue);
        put_disk(zv->zv_zso->zvo_disk);
#endif

#ifdef HAVE_BLK_MQ
        /* The tag set exists only for blk-mq zvols. */
        if (zv->zv_zso->use_blk_mq)
                blk_mq_free_tag_set(&zv->zv_zso->tag_set);
#endif

        /* Return the ida index that zvo_dev's minor number was built from. */
        ida_simple_remove(&zvol_ida,
            MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);

        mutex_destroy(&zv->zv_state_lock);
        dataset_kstats_destroy(&zv->zv_kstat);

        kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
        kmem_free(zv, sizeof (zvol_state_t));
}
1329
/*
 * No-op in this implementation: the body is empty and 'zv' is unused.
 */
void
zvol_wait_close(zvol_state_t *zv)
{
}
1334
1335 /*
1336  * Create a block device minor node and setup the linkage between it
1337  * and the specified volume.  Once this function returns the block
1338  * device is live and ready for use.
1339  */
1340 int
1341 zvol_os_create_minor(const char *name)
1342 {
1343         zvol_state_t *zv;
1344         objset_t *os;
1345         dmu_object_info_t *doi;
1346         uint64_t volsize;
1347         uint64_t len;
1348         unsigned minor = 0;
1349         int error = 0;
1350         int idx;
1351         uint64_t hash = zvol_name_hash(name);
1352         uint64_t volthreading;
1353         bool replayed_zil = B_FALSE;
1354
1355         if (zvol_inhibit_dev)
1356                 return (0);
1357
1358         idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
1359         if (idx < 0)
1360                 return (SET_ERROR(-idx));
1361         minor = idx << ZVOL_MINOR_BITS;
1362         if (MINOR(minor) != minor) {
1363                 /* too many partitions can cause an overflow */
1364                 zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u",
1365                     name, minor, MINOR(minor));
1366                 ida_simple_remove(&zvol_ida, idx);
1367                 return (SET_ERROR(EINVAL));
1368         }
1369
1370         zv = zvol_find_by_name_hash(name, hash, RW_NONE);
1371         if (zv) {
1372                 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1373                 mutex_exit(&zv->zv_state_lock);
1374                 ida_simple_remove(&zvol_ida, idx);
1375                 return (SET_ERROR(EEXIST));
1376         }
1377
1378         doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
1379
1380         error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
1381         if (error)
1382                 goto out_doi;
1383
1384         error = dmu_object_info(os, ZVOL_OBJ, doi);
1385         if (error)
1386                 goto out_dmu_objset_disown;
1387
1388         error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
1389         if (error)
1390                 goto out_dmu_objset_disown;
1391
1392         zv = zvol_alloc(MKDEV(zvol_major, minor), name);
1393         if (zv == NULL) {
1394                 error = SET_ERROR(EAGAIN);
1395                 goto out_dmu_objset_disown;
1396         }
1397         zv->zv_hash = hash;
1398
1399         if (dmu_objset_is_snapshot(os))
1400                 zv->zv_flags |= ZVOL_RDONLY;
1401
1402         zv->zv_volblocksize = doi->doi_data_block_size;
1403         zv->zv_volsize = volsize;
1404         zv->zv_objset = os;
1405
1406         /* Default */
1407         zv->zv_threading = B_TRUE;
1408         if (dsl_prop_get_integer(name, "volthreading", &volthreading, NULL)
1409             == 0)
1410                 zv->zv_threading = volthreading;
1411
1412         set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);
1413
1414         blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue,
1415             (DMU_MAX_ACCESS / 4) >> 9);
1416
1417         if (zv->zv_zso->use_blk_mq) {
1418                 /*
1419                  * IO requests can be really big (1MB).  When an IO request
1420                  * comes in, it is passed off to zvol_read() or zvol_write()
1421                  * in a new thread, where it is chunked up into 'volblocksize'
1422                  * sized pieces and processed.  So for example, if the request
1423                  * is a 1MB write and your volblocksize is 128k, one zvol_write
1424                  * thread will take that request and sequentially do ten 128k
1425                  * IOs.  This is due to the fact that the thread needs to lock
1426                  * each volblocksize sized block.  So you might be wondering:
1427                  * "instead of passing the whole 1MB request to one thread,
1428                  * why not pass ten individual 128k chunks to ten threads and
1429                  * process the whole write in parallel?"  The short answer is
1430                  * that there's a sweet spot number of chunks that balances
1431                  * the greater parallelism with the added overhead of more
1432                  * threads. The sweet spot can be different depending on if you
1433                  * have a read or write  heavy workload.  Writes typically want
1434                  * high chunk counts while reads typically want lower ones.  On
1435                  * a test pool with 6 NVMe drives in a 3x 2-disk mirror
1436                  * configuration, with volblocksize=8k, the sweet spot for good
1437                  * sequential reads and writes was at 8 chunks.
1438                  */
1439
1440                 /*
1441                  * Below we tell the kernel how big we want our requests
1442                  * to be.  You would think that blk_queue_io_opt() would be
1443                  * used to do this since it is used to "set optimal request
1444                  * size for the queue", but that doesn't seem to do
1445                  * anything - the kernel still gives you huge requests
1446                  * with tons of little PAGE_SIZE segments contained within it.
1447                  *
1448                  * Knowing that the kernel will just give you PAGE_SIZE segments
1449                  * no matter what, you can say "ok, I want PAGE_SIZE byte
1450                  * segments, and I want 'N' of them per request", where N is
1451                  * the correct number of segments for the volblocksize and
1452                  * number of chunks you want.
1453                  */
1454 #ifdef HAVE_BLK_MQ
1455                 if (zvol_blk_mq_blocks_per_thread != 0) {
1456                         unsigned int chunks;
1457                         chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);
1458
1459                         blk_queue_max_segment_size(zv->zv_zso->zvo_queue,
1460                             PAGE_SIZE);
1461                         blk_queue_max_segments(zv->zv_zso->zvo_queue,
1462                             (zv->zv_volblocksize * chunks) / PAGE_SIZE);
1463                 } else {
1464                         /*
1465                          * Special case: zvol_blk_mq_blocks_per_thread = 0
1466                          * Max everything out.
1467                          */
1468                         blk_queue_max_segments(zv->zv_zso->zvo_queue,
1469                             UINT16_MAX);
1470                         blk_queue_max_segment_size(zv->zv_zso->zvo_queue,
1471                             UINT_MAX);
1472                 }
1473 #endif
1474         } else {
1475                 blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX);
1476                 blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX);
1477         }
1478
1479         blk_queue_physical_block_size(zv->zv_zso->zvo_queue,
1480             zv->zv_volblocksize);
1481         blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize);
1482         blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue,
1483             (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
1484         blk_queue_discard_granularity(zv->zv_zso->zvo_queue,
1485             zv->zv_volblocksize);
1486 #ifdef QUEUE_FLAG_DISCARD
1487         blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
1488 #endif
1489 #ifdef QUEUE_FLAG_NONROT
1490         blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue);
1491 #endif
1492 #ifdef QUEUE_FLAG_ADD_RANDOM
1493         blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue);
1494 #endif
1495         /* This flag was introduced in kernel version 4.12. */
1496 #ifdef QUEUE_FLAG_SCSI_PASSTHROUGH
1497         blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
1498 #endif
1499
1500         ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
1501         error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
1502         if (error)
1503                 goto out_dmu_objset_disown;
1504         ASSERT3P(zv->zv_zilog, ==, NULL);
1505         zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
1506         if (spa_writeable(dmu_objset_spa(os))) {
1507                 if (zil_replay_disable)
1508                         replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
1509                 else
1510                         replayed_zil = zil_replay(os, zv, zvol_replay_vector);
1511         }
1512         if (replayed_zil)
1513                 zil_close(zv->zv_zilog);
1514         zv->zv_zilog = NULL;
1515
1516         /*
1517          * When udev detects the addition of the device it will immediately
1518          * invoke blkid(8) to determine the type of content on the device.
1519          * Prefetching the blocks commonly scanned by blkid(8) will speed
1520          * up this process.
1521          */
1522         len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE);
1523         if (len > 0) {
1524                 dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
1525                 dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
1526                     ZIO_PRIORITY_SYNC_READ);
1527         }
1528
1529         zv->zv_objset = NULL;
1530 out_dmu_objset_disown:
1531         dmu_objset_disown(os, B_TRUE, FTAG);
1532 out_doi:
1533         kmem_free(doi, sizeof (dmu_object_info_t));
1534
1535         /*
1536          * Keep in mind that once add_disk() is called, the zvol is
1537          * announced to the world, and zvol_open()/zvol_release() can
1538          * be called at any time. Incidentally, add_disk() itself calls
1539          * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
1540          * directly as well.
1541          */
1542         if (error == 0) {
1543                 rw_enter(&zvol_state_lock, RW_WRITER);
1544                 zvol_insert(zv);
1545                 rw_exit(&zvol_state_lock);
1546 #ifdef HAVE_ADD_DISK_RET
1547                 error = add_disk(zv->zv_zso->zvo_disk);
1548 #else
1549                 add_disk(zv->zv_zso->zvo_disk);
1550 #endif
1551         } else {
1552                 ida_simple_remove(&zvol_ida, idx);
1553         }
1554
1555         return (error);
1556 }
1557
1558 void
1559 zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
1560 {
1561         int readonly = get_disk_ro(zv->zv_zso->zvo_disk);
1562
1563         ASSERT(RW_LOCK_HELD(&zvol_state_lock));
1564         ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1565
1566         strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
1567
1568         /* move to new hashtable entry  */
1569         zv->zv_hash = zvol_name_hash(zv->zv_name);
1570         hlist_del(&zv->zv_hlink);
1571         hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
1572
1573         /*
1574          * The block device's read-only state is briefly changed causing
1575          * a KOBJ_CHANGE uevent to be issued.  This ensures udev detects
1576          * the name change and fixes the symlinks.  This does not change
1577          * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
1578          * changes.  This would normally be done using kobject_uevent() but
1579          * that is a GPL-only symbol which is why we need this workaround.
1580          */
1581         set_disk_ro(zv->zv_zso->zvo_disk, !readonly);
1582         set_disk_ro(zv->zv_zso->zvo_disk, readonly);
1583
1584         dataset_kstats_rename(&zv->zv_kstat, newname);
1585 }
1586
/*
 * Forward 'flags' to set_disk_ro() for the zvol's gendisk.
 */
void
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
{

        set_disk_ro(zv->zv_zso->zvo_disk, flags);
}
1593
/*
 * Forward 'capacity' to set_capacity() for the zvol's gendisk.  The value
 * is passed through unchanged (no byte/sector conversion happens here).
 */
void
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
{

        set_capacity(zv->zv_zso->zvo_disk, capacity);
}
1600
1601 int
1602 zvol_init(void)
1603 {
1604         int error;
1605
1606         /*
1607          * zvol_threads is the module param the user passes in.
1608          *
1609          * zvol_actual_threads is what we use internally, since the user can
1610          * pass zvol_thread = 0 to mean "use all the CPUs" (the default).
1611          */
1612         static unsigned int zvol_actual_threads;
1613
1614         if (zvol_threads == 0) {
1615                 /*
1616                  * See dde9380a1 for why 32 was chosen here.  This should
1617                  * probably be refined to be some multiple of the number
1618                  * of CPUs.
1619                  */
1620                 zvol_actual_threads = MAX(num_online_cpus(), 32);
1621         } else {
1622                 zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024);
1623         }
1624
1625         /*
1626          * Use atleast 32 zvol_threads but for many core system,
1627          * prefer 6 threads per taskq, but no more taskqs
1628          * than threads in them on large systems.
1629          *
1630          *                 taskq   total
1631          * cpus    taskqs  threads threads
1632          * ------- ------- ------- -------
1633          * 1       1       32       32
1634          * 2       1       32       32
1635          * 4       1       32       32
1636          * 8       2       16       32
1637          * 16      3       11       33
1638          * 32      5       7        35
1639          * 64      8       8        64
1640          * 128     11      12       132
1641          * 256     16      16       256
1642          */
1643         zv_taskq_t *ztqs = &zvol_taskqs;
1644         uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs);
1645         if (num_tqs == 0) {
1646                 num_tqs = 1 + num_online_cpus() / 6;
1647                 while (num_tqs * num_tqs > zvol_actual_threads)
1648                         num_tqs--;
1649         }
1650         uint_t per_tq_thread = zvol_actual_threads / num_tqs;
1651         if (per_tq_thread * num_tqs < zvol_actual_threads)
1652                 per_tq_thread++;
1653         ztqs->tqs_cnt = num_tqs;
1654         ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP);
1655         error = register_blkdev(zvol_major, ZVOL_DRIVER);
1656         if (error) {
1657                 kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *));
1658                 ztqs->tqs_taskq = NULL;
1659                 printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
1660                 return (error);
1661         }
1662
1663 #ifdef HAVE_BLK_MQ
1664         if (zvol_blk_mq_queue_depth == 0) {
1665                 zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
1666         } else {
1667                 zvol_actual_blk_mq_queue_depth =
1668                     MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ);
1669         }
1670
1671         if (zvol_blk_mq_threads == 0) {
1672                 zvol_blk_mq_actual_threads = num_online_cpus();
1673         } else {
1674                 zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1),
1675                     1024);
1676         }
1677 #endif
1678         for (uint_t i = 0; i < num_tqs; i++) {
1679                 char name[32];
1680                 (void) snprintf(name, sizeof (name), "%s_tq-%u",
1681                     ZVOL_DRIVER, i);
1682                 ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread,
1683                     maxclsyspri, per_tq_thread, INT_MAX,
1684                     TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
1685                 if (ztqs->tqs_taskq[i] == NULL) {
1686                         for (int j = i - 1; j >= 0; j--)
1687                                 taskq_destroy(ztqs->tqs_taskq[j]);
1688                         unregister_blkdev(zvol_major, ZVOL_DRIVER);
1689                         kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
1690                             sizeof (taskq_t *));
1691                         ztqs->tqs_taskq = NULL;
1692                         return (-ENOMEM);
1693                 }
1694         }
1695
1696         zvol_init_impl();
1697         ida_init(&zvol_ida);
1698         return (0);
1699 }
1700
1701 void
1702 zvol_fini(void)
1703 {
1704         zv_taskq_t *ztqs = &zvol_taskqs;
1705         zvol_fini_impl();
1706         unregister_blkdev(zvol_major, ZVOL_DRIVER);
1707
1708         if (ztqs->tqs_taskq == NULL) {
1709                 ASSERT3U(ztqs->tqs_cnt, ==, 0);
1710         } else {
1711                 for (uint_t i = 0; i < ztqs->tqs_cnt; i++) {
1712                         ASSERT3P(ztqs->tqs_taskq[i], !=, NULL);
1713                         taskq_destroy(ztqs->tqs_taskq[i]);
1714                 }
1715                 kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
1716                     sizeof (taskq_t *));
1717                 ztqs->tqs_taskq = NULL;
1718         }
1719
1720         ida_destroy(&zvol_ida);
1721 }
1722
1723 /* BEGIN CSTYLED */
1724 module_param(zvol_inhibit_dev, uint, 0644);
1725 MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");
1726
1727 module_param(zvol_major, uint, 0444);
1728 MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
1729
1730 module_param(zvol_threads, uint, 0444);
1731 MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. Set"
1732     "to 0 to use all active CPUs");
1733
1734 module_param(zvol_request_sync, uint, 0644);
1735 MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");
1736
1737 module_param(zvol_max_discard_blocks, ulong, 0444);
1738 MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");
1739
1740 module_param(zvol_num_taskqs, uint, 0444);
1741 MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs");
1742
1743 module_param(zvol_prefetch_bytes, uint, 0644);
1744 MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
1745
1746 module_param(zvol_volmode, uint, 0644);
1747 MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
1748
1749 #ifdef HAVE_BLK_MQ
1750 module_param(zvol_blk_mq_queue_depth, uint, 0644);
1751 MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth");
1752
1753 module_param(zvol_use_blk_mq, uint, 0644);
1754 MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols");
1755
1756 module_param(zvol_blk_mq_blocks_per_thread, uint, 0644);
1757 MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread,
1758     "Process volblocksize blocks per thread");
1759 #endif
1760
1761 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
1762 module_param(zvol_open_timeout_ms, uint, 0644);
1763 MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries");
1764 #endif
1765
1766 /* END CSTYLED */