module/os/linux/zfs/zvol_os.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or https://opensource.org/licenses/CDDL-1.0.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
  23  */
  24
  25 #include <sys/dataset_kstats.h>
  26 #include <sys/dbuf.h>
  27 #include <sys/dmu_traverse.h>
  28 #include <sys/dsl_dataset.h>
  29 #include <sys/dsl_prop.h>
  30 #include <sys/dsl_dir.h>
  31 #include <sys/zap.h>
  32 #include <sys/zfeature.h>
  33 #include <sys/zil_impl.h>
  34 #include <sys/dmu_tx.h>
  35 #include <sys/zio.h>
  36 #include <sys/zfs_rlock.h>
  37 #include <sys/spa_impl.h>
  38 #include <sys/zvol.h>
  39 #include <sys/zvol_impl.h>
  40 #include <cityhash.h>
  41
  42 #include <linux/blkdev_compat.h>
  43 #include <linux/task_io_accounting_ops.h>
  44
  45 #ifdef HAVE_BLK_MQ
  46 #include <linux/blk-mq.h>
  47 #endif
  48
  49 static void zvol_request_impl(zvol_state_t *zv, struct bio *bio,
  50     struct request *rq, boolean_t force_sync);
  51
  52 static unsigned int zvol_major = ZVOL_MAJOR;
  53 static unsigned int zvol_request_sync = 0;
  54 static unsigned int zvol_prefetch_bytes = (128 * 1024);
  55 static unsigned long zvol_max_discard_blocks = 16384;
  56
  57 /*
  58  * Switch taskq at multiple of 512 MB offset. This can be set to a lower value
  59  * to utilize more threads for small files but may affect prefetch hits.
  60  */
  61 #define ZVOL_TASKQ_OFFSET_SHIFT 29
  62
  63 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
  64 static unsigned int zvol_open_timeout_ms = 1000;
  65 #endif
  66
  67 static unsigned int zvol_threads = 0;
  68 #ifdef HAVE_BLK_MQ
  69 static unsigned int zvol_blk_mq_threads = 0;
  70 static unsigned int zvol_blk_mq_actual_threads;
  71 static boolean_t zvol_use_blk_mq = B_FALSE;
  72
  73 /*
  74  * The maximum number of volblocksize blocks to process per thread.  Typically,
  75  * write heavy workloads preform better with higher values here, and read
  76  * heavy workloads preform better with lower values, but that's not a hard
  77  * and fast rule.  It's basically a knob to tune between "less overhead with
  78  * less parallelism" and "more overhead, but more parallelism".
  79  *
  80  * '8' was chosen as a reasonable, balanced, default based off of sequential
  81  * read and write tests to a zvol in an NVMe pool (with 16 CPUs).
  82  */
  83 static unsigned int zvol_blk_mq_blocks_per_thread = 8;
  84 #endif
  85
  86 static unsigned int zvol_num_taskqs = 0;
  87
  88 #ifndef BLKDEV_DEFAULT_RQ
  89 /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
  90 #define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ
  91 #endif
  92
  93 /*
  94  * Finalize our BIO or request.
  95  */
  96 #ifdef  HAVE_BLK_MQ
  97 #define END_IO(zv, bio, rq, error)  do { \
  98         if (bio) { \
  99                 BIO_END_IO(bio, error); \
 100         } else { \
 101                 blk_mq_end_request(rq, errno_to_bi_status(error)); \
 102         } \
 103 } while (0)
 104 #else
 105 #define END_IO(zv, bio, rq, error)      BIO_END_IO(bio, error)
 106 #endif
 107
 108 #ifdef HAVE_BLK_MQ
 109 static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
 110 static unsigned int zvol_actual_blk_mq_queue_depth;
 111 #endif
 112
 113 struct zvol_state_os {
 114         struct gendisk          *zvo_disk;      /* generic disk */
 115         struct request_queue    *zvo_queue;     /* request queue */
 116         dev_t                   zvo_dev;        /* device id */
 117
 118 #ifdef HAVE_BLK_MQ
 119         struct blk_mq_tag_set tag_set;
 120 #endif
 121
 122         /* Set from the global 'zvol_use_blk_mq' at zvol load */
 123         boolean_t use_blk_mq;
 124 };
 125
 126 typedef struct zv_taskq {
 127         uint_t tqs_cnt;
 128         taskq_t **tqs_taskq;
 129 } zv_taskq_t;
 130 static zv_taskq_t zvol_taskqs;
 131 static struct ida zvol_ida;
 132
 133 typedef struct zv_request_stack {
 134         zvol_state_t    *zv;
 135         struct bio      *bio;
 136         struct request *rq;
 137 } zv_request_t;
 138
 139 typedef struct zv_work {
 140         struct request  *rq;
 141         struct work_struct work;
 142 } zv_work_t;
 143
 144 typedef struct zv_request_task {
 145         zv_request_t zvr;
 146         taskq_ent_t     ent;
 147 } zv_request_task_t;
 148
 149 static zv_request_task_t *
 150 zv_request_task_create(zv_request_t zvr)
 151 {
 152         zv_request_task_t *task;
 153         task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP);
 154         taskq_init_ent(&task->ent);
 155         task->zvr = zvr;
 156         return (task);
 157 }
 158
 159 static void
 160 zv_request_task_free(zv_request_task_t *task)
 161 {
 162         kmem_free(task, sizeof (*task));
 163 }
 164
 165 #ifdef HAVE_BLK_MQ
 166
 167 /*
 168  * This is called when a new block multiqueue request comes in.  A request
 169  * contains one or more BIOs.
 170  */
 171 static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 172     const struct blk_mq_queue_data *bd)
 173 {
 174         struct request *rq = bd->rq;
 175         zvol_state_t *zv = rq->q->queuedata;
 176
 177         /* Tell the kernel that we are starting to process this request */
 178         blk_mq_start_request(rq);
 179
 180         if (blk_rq_is_passthrough(rq)) {
 181                 /* Skip non filesystem request */
 182                 blk_mq_end_request(rq, BLK_STS_IOERR);
 183                 return (BLK_STS_IOERR);
 184         }
 185
 186         zvol_request_impl(zv, NULL, rq, 0);
 187
 188         /* Acknowledge to the kernel that we got this request */
 189         return (BLK_STS_OK);
 190 }
 191
 192 static struct blk_mq_ops zvol_blk_mq_queue_ops = {
 193         .queue_rq = zvol_mq_queue_rq,
 194 };
 195
 196 /* Initialize our blk-mq struct */
 197 static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv)
 198 {
 199         struct zvol_state_os *zso = zv->zv_zso;
 200
 201         memset(&zso->tag_set, 0, sizeof (zso->tag_set));
 202
 203         /* Initialize tag set. */
 204         zso->tag_set.ops = &zvol_blk_mq_queue_ops;
 205         zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads;
 206         zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth;
 207         zso->tag_set.numa_node = NUMA_NO_NODE;
 208         zso->tag_set.cmd_size = 0;
 209
 210         /*
 211          * We need BLK_MQ_F_BLOCKING here since we do blocking calls in
 212          * zvol_request_impl()
 213          */
 214         zso->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
 215         zso->tag_set.driver_data = zv;
 216
 217         return (blk_mq_alloc_tag_set(&zso->tag_set));
 218 }
 219 #endif /* HAVE_BLK_MQ */
 220
 221 /*
 222  * Given a path, return TRUE if path is a ZVOL.
 223  */
 224 boolean_t
 225 zvol_os_is_zvol(const char *path)
 226 {
 227         dev_t dev = 0;
 228
 229         if (vdev_lookup_bdev(path, &dev) != 0)
 230                 return (B_FALSE);
 231
 232         if (MAJOR(dev) == zvol_major)
 233                 return (B_TRUE);
 234
 235         return (B_FALSE);
 236 }
 237
 238 static void
 239 zvol_write(zv_request_t *zvr)
 240 {
 241         struct bio *bio = zvr->bio;
 242         struct request *rq = zvr->rq;
 243         int error = 0;
 244         zfs_uio_t uio;
 245         zvol_state_t *zv = zvr->zv;
 246         struct request_queue *q;
 247         struct gendisk *disk;
 248         unsigned long start_time = 0;
 249         boolean_t acct = B_FALSE;
 250
 251         ASSERT3P(zv, !=, NULL);
 252         ASSERT3U(zv->zv_open_count, >, 0);
 253         ASSERT3P(zv->zv_zilog, !=, NULL);
 254
 255         q = zv->zv_zso->zvo_queue;
 256         disk = zv->zv_zso->zvo_disk;
 257
 258         /* bio marked as FLUSH need to flush before write */
 259         if (io_is_flush(bio, rq))
 260                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
 261
 262         /* Some requests are just for flush and nothing else. */
 263         if (io_size(bio, rq) == 0) {
 264                 rw_exit(&zv->zv_suspend_lock);
 265                 END_IO(zv, bio, rq, 0);
 266                 return;
 267         }
 268
 269         zfs_uio_bvec_init(&uio, bio, rq);
 270
 271         ssize_t start_resid = uio.uio_resid;
 272
 273         /*
 274          * With use_blk_mq, accounting is done by blk_mq_start_request()
 275          * and blk_mq_end_request(), so we can skip it here.
 276          */
 277         if (bio) {
 278                 acct = blk_queue_io_stat(q);
 279                 if (acct) {
 280                         start_time = blk_generic_start_io_acct(q, disk, WRITE,
 281                             bio);
 282                 }
 283         }
 284
 285         boolean_t sync =
 286             io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
 287
 288         zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
 289             uio.uio_loffset, uio.uio_resid, RL_WRITER);
 290
 291         uint64_t volsize = zv->zv_volsize;
 292         while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
 293                 uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
 294                 uint64_t off = uio.uio_loffset;
 295                 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
 296
 297                 if (bytes > volsize - off)      /* don't write past the end */
 298                         bytes = volsize - off;
 299
 300                 dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
 301
 302                 /* This will only fail for ENOSPC */
 303                 error = dmu_tx_assign(tx, TXG_WAIT);
 304                 if (error) {
 305                         dmu_tx_abort(tx);
 306                         break;
 307                 }
 308                 error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
 309                 if (error == 0) {
 310                         zvol_log_write(zv, tx, off, bytes, sync);
 311                 }
 312                 dmu_tx_commit(tx);
 313
 314                 if (error)
 315                         break;
 316         }
 317         zfs_rangelock_exit(lr);
 318
 319         int64_t nwritten = start_resid - uio.uio_resid;
 320         dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
 321         task_io_account_write(nwritten);
 322
 323         if (sync)
 324                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
 325
 326         rw_exit(&zv->zv_suspend_lock);
 327
 328         if (bio && acct) {
 329                 blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
 330         }
 331
 332         END_IO(zv, bio, rq, -error);
 333 }
 334
 335 static void
 336 zvol_write_task(void *arg)
 337 {
 338         zv_request_task_t *task = arg;
 339         zvol_write(&task->zvr);
 340         zv_request_task_free(task);
 341 }
 342
 343 static void
 344 zvol_discard(zv_request_t *zvr)
 345 {
 346         struct bio *bio = zvr->bio;
 347         struct request *rq = zvr->rq;
 348         zvol_state_t *zv = zvr->zv;
 349         uint64_t start = io_offset(bio, rq);
 350         uint64_t size = io_size(bio, rq);
 351         uint64_t end = start + size;
 352         boolean_t sync;
 353         int error = 0;
 354         dmu_tx_t *tx;
 355         struct request_queue *q = zv->zv_zso->zvo_queue;
 356         struct gendisk *disk = zv->zv_zso->zvo_disk;
 357         unsigned long start_time = 0;
 358         boolean_t acct = B_FALSE;
 359
 360         ASSERT3P(zv, !=, NULL);
 361         ASSERT3U(zv->zv_open_count, >, 0);
 362         ASSERT3P(zv->zv_zilog, !=, NULL);
 363
 364         if (bio) {
 365                 acct = blk_queue_io_stat(q);
 366                 if (acct) {
 367                         start_time = blk_generic_start_io_acct(q, disk, WRITE,
 368                             bio);
 369                 }
 370         }
 371
 372         sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
 373
 374         if (end > zv->zv_volsize) {
 375                 error = SET_ERROR(EIO);
 376                 goto unlock;
 377         }
 378
 379         /*
 380          * Align the request to volume block boundaries when a secure erase is
 381          * not required.  This will prevent dnode_free_range() from zeroing out
 382          * the unaligned parts which is slow (read-modify-write) and useless
 383          * since we are not freeing any space by doing so.
 384          */
 385         if (!io_is_secure_erase(bio, rq)) {
 386                 start = P2ROUNDUP(start, zv->zv_volblocksize);
 387                 end = P2ALIGN(end, zv->zv_volblocksize);
 388                 size = end - start;
 389         }
 390
 391         if (start >= end)
 392                 goto unlock;
 393
 394         zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
 395             start, size, RL_WRITER);
 396
 397         tx = dmu_tx_create(zv->zv_objset);
 398         dmu_tx_mark_netfree(tx);
 399         error = dmu_tx_assign(tx, TXG_WAIT);
 400         if (error != 0) {
 401                 dmu_tx_abort(tx);
 402         } else {
 403                 zvol_log_truncate(zv, tx, start, size);
 404                 dmu_tx_commit(tx);
 405                 error = dmu_free_long_range(zv->zv_objset,
 406                     ZVOL_OBJ, start, size);
 407         }
 408         zfs_rangelock_exit(lr);
 409
 410         if (error == 0 && sync)
 411                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
 412
 413 unlock:
 414         rw_exit(&zv->zv_suspend_lock);
 415
 416         if (bio && acct) {
 417                 blk_generic_end_io_acct(q, disk, WRITE, bio,
 418                     start_time);
 419         }
 420
 421         END_IO(zv, bio, rq, -error);
 422 }
 423
 424 static void
 425 zvol_discard_task(void *arg)
 426 {
 427         zv_request_task_t *task = arg;
 428         zvol_discard(&task->zvr);
 429         zv_request_task_free(task);
 430 }
 431
 432 static void
 433 zvol_read(zv_request_t *zvr)
 434 {
 435         struct bio *bio = zvr->bio;
 436         struct request *rq = zvr->rq;
 437         int error = 0;
 438         zfs_uio_t uio;
 439         boolean_t acct = B_FALSE;
 440         zvol_state_t *zv = zvr->zv;
 441         struct request_queue *q;
 442         struct gendisk *disk;
 443         unsigned long start_time = 0;
 444
 445         ASSERT3P(zv, !=, NULL);
 446         ASSERT3U(zv->zv_open_count, >, 0);
 447
 448         zfs_uio_bvec_init(&uio, bio, rq);
 449
 450         q = zv->zv_zso->zvo_queue;
 451         disk = zv->zv_zso->zvo_disk;
 452
 453         ssize_t start_resid = uio.uio_resid;
 454
 455         /*
 456          * When blk-mq is being used, accounting is done by
 457          * blk_mq_start_request() and blk_mq_end_request().
 458          */
 459         if (bio) {
 460                 acct = blk_queue_io_stat(q);
 461                 if (acct)
 462                         start_time = blk_generic_start_io_acct(q, disk, READ,
 463                             bio);
 464         }
 465
 466         zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
 467             uio.uio_loffset, uio.uio_resid, RL_READER);
 468
 469         uint64_t volsize = zv->zv_volsize;
 470
 471         while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
 472                 uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
 473
 474                 /* don't read past the end */
 475                 if (bytes > volsize - uio.uio_loffset)
 476                         bytes = volsize - uio.uio_loffset;
 477
 478                 error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
 479                 if (error) {
 480                         /* convert checksum errors into IO errors */
 481                         if (error == ECKSUM)
 482                                 error = SET_ERROR(EIO);
 483                         break;
 484                 }
 485         }
 486         zfs_rangelock_exit(lr);
 487
 488         int64_t nread = start_resid - uio.uio_resid;
 489         dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
 490         task_io_account_read(nread);
 491
 492         rw_exit(&zv->zv_suspend_lock);
 493
 494         if (bio && acct) {
 495                 blk_generic_end_io_acct(q, disk, READ, bio, start_time);
 496         }
 497
 498         END_IO(zv, bio, rq, -error);
 499 }
 500
 501 static void
 502 zvol_read_task(void *arg)
 503 {
 504         zv_request_task_t *task = arg;
 505         zvol_read(&task->zvr);
 506         zv_request_task_free(task);
 507 }
 508
 509
 510 /*
 511  * Process a BIO or request
 512  *
 513  * Either 'bio' or 'rq' should be set depending on if we are processing a
 514  * bio or a request (both should not be set).
 515  *
 516  * force_sync:  Set to 0 to defer processing to a background taskq
 517  *                      Set to 1 to process data synchronously
 518  */
 519 static void
 520 zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
 521     boolean_t force_sync)
 522 {
 523         fstrans_cookie_t cookie = spl_fstrans_mark();
 524         uint64_t offset = io_offset(bio, rq);
 525         uint64_t size = io_size(bio, rq);
 526         int rw = io_data_dir(bio, rq);
 527
 528         if (zvol_request_sync || zv->zv_threading == B_FALSE)
 529                 force_sync = 1;
 530
 531         zv_request_t zvr = {
 532                 .zv = zv,
 533                 .bio = bio,
 534                 .rq = rq,
 535         };
 536
 537         if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) {
 538                 printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n",
 539                     zv->zv_zso->zvo_disk->disk_name,
 540                     (long long unsigned)offset,
 541                     (long unsigned)size);
 542
 543                 END_IO(zv, bio, rq, -SET_ERROR(EIO));
 544                 goto out;
 545         }
 546
 547         zv_request_task_t *task;
 548         zv_taskq_t *ztqs = &zvol_taskqs;
 549         uint_t blk_mq_hw_queue = 0;
 550         uint_t tq_idx;
 551         uint_t taskq_hash;
 552 #ifdef HAVE_BLK_MQ
 553         if (rq)
 554 #ifdef HAVE_BLK_MQ_RQ_HCTX
 555                 blk_mq_hw_queue = rq->mq_hctx->queue_num;
 556 #else
 557                 blk_mq_hw_queue =
 558                     rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num;
 559 #endif
 560 #endif
 561         taskq_hash = cityhash4((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
 562             blk_mq_hw_queue, 0);
 563         tq_idx = taskq_hash % ztqs->tqs_cnt;
 564
 565         if (rw == WRITE) {
 566                 if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
 567                         END_IO(zv, bio, rq, -SET_ERROR(EROFS));
 568                         goto out;
 569                 }
 570
 571                 /*
 572                  * Prevents the zvol from being suspended, or the ZIL being
 573                  * concurrently opened.  Will be released after the i/o
 574                  * completes.
 575                  */
 576                 rw_enter(&zv->zv_suspend_lock, RW_READER);
 577
 578                 /*
 579                  * Open a ZIL if this is the first time we have written to this
 580                  * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
 581                  * than zv_state_lock so that we don't need to acquire an
 582                  * additional lock in this path.
 583                  */
 584                 if (zv->zv_zilog == NULL) {
 585                         rw_exit(&zv->zv_suspend_lock);
 586                         rw_enter(&zv->zv_suspend_lock, RW_WRITER);
 587                         if (zv->zv_zilog == NULL) {
 588                                 zv->zv_zilog = zil_open(zv->zv_objset,
 589                                     zvol_get_data, &zv->zv_kstat.dk_zil_sums);
 590                                 zv->zv_flags |= ZVOL_WRITTEN_TO;
 591                                 /* replay / destroy done in zvol_create_minor */
 592                                 VERIFY0((zv->zv_zilog->zl_header->zh_flags &
 593                                     ZIL_REPLAY_NEEDED));
 594                         }
 595                         rw_downgrade(&zv->zv_suspend_lock);
 596                 }
 597
 598                 /*
 599                  * We don't want this thread to be blocked waiting for i/o to
 600                  * complete, so we instead wait from a taskq callback. The
 601                  * i/o may be a ZIL write (via zil_commit()), or a read of an
 602                  * indirect block, or a read of a data block (if this is a
 603                  * partial-block write).  We will indicate that the i/o is
 604                  * complete by calling END_IO() from the taskq callback.
 605                  *
 606                  * This design allows the calling thread to continue and
 607                  * initiate more concurrent operations by calling
 608                  * zvol_request() again. There are typically only a small
 609                  * number of threads available to call zvol_request() (e.g.
 610                  * one per iSCSI target), so keeping the latency of
 611                  * zvol_request() low is important for performance.
 612                  *
 613                  * The zvol_request_sync module parameter allows this
 614                  * behavior to be altered, for performance evaluation
 615                  * purposes.  If the callback blocks, setting
 616                  * zvol_request_sync=1 will result in much worse performance.
 617                  *
 618                  * We can have up to zvol_threads concurrent i/o's being
 619                  * processed for all zvols on the system.  This is typically
 620                  * a vast improvement over the zvol_request_sync=1 behavior
 621                  * of one i/o at a time per zvol.  However, an even better
 622                  * design would be for zvol_request() to initiate the zio
 623                  * directly, and then be notified by the zio_done callback,
 624                  * which would call END_IO().  Unfortunately, the DMU/ZIL
 625                  * interfaces lack this functionality (they block waiting for
 626                  * the i/o to complete).
 627                  */
 628                 if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) {
 629                         if (force_sync) {
 630                                 zvol_discard(&zvr);
 631                         } else {
 632                                 task = zv_request_task_create(zvr);
 633                                 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
 634                                     zvol_discard_task, task, 0, &task->ent);
 635                         }
 636                 } else {
 637                         if (force_sync) {
 638                                 zvol_write(&zvr);
 639                         } else {
 640                                 task = zv_request_task_create(zvr);
 641                                 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
 642                                     zvol_write_task, task, 0, &task->ent);
 643                         }
 644                 }
 645         } else {
 646                 /*
 647                  * The SCST driver, and possibly others, may issue READ I/Os
 648                  * with a length of zero bytes.  These empty I/Os contain no
 649                  * data and require no additional handling.
 650                  */
 651                 if (size == 0) {
 652                         END_IO(zv, bio, rq, 0);
 653                         goto out;
 654                 }
 655
 656                 rw_enter(&zv->zv_suspend_lock, RW_READER);
 657
 658                 /* See comment in WRITE case above. */
 659                 if (force_sync) {
 660                         zvol_read(&zvr);
 661                 } else {
 662                         task = zv_request_task_create(zvr);
 663                         taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
 664                             zvol_read_task, task, 0, &task->ent);
 665                 }
 666         }
 667
 668 out:
 669         spl_fstrans_unmark(cookie);
 670 }
 671
 672 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
 673 #ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID
 674 static void
 675 zvol_submit_bio(struct bio *bio)
 676 #else
 677 static blk_qc_t
 678 zvol_submit_bio(struct bio *bio)
 679 #endif
 680 #else
 681 static MAKE_REQUEST_FN_RET
 682 zvol_request(struct request_queue *q, struct bio *bio)
 683 #endif
 684 {
 685 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
 686 #if defined(HAVE_BIO_BDEV_DISK)
 687         struct request_queue *q = bio->bi_bdev->bd_disk->queue;
 688 #else
 689         struct request_queue *q = bio->bi_disk->queue;
 690 #endif
 691 #endif
 692         zvol_state_t *zv = q->queuedata;
 693
 694         zvol_request_impl(zv, bio, NULL, 0);
 695 #if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
 696         defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
 697         !defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID)
 698         return (BLK_QC_T_NONE);
 699 #endif
 700 }
 701
 702 static int
 703 #ifdef HAVE_BLK_MODE_T
 704 zvol_open(struct gendisk *disk, blk_mode_t flag)
 705 #else
 706 zvol_open(struct block_device *bdev, fmode_t flag)
 707 #endif
 708 {
 709         zvol_state_t *zv;
 710         int error = 0;
 711         boolean_t drop_suspend = B_FALSE;
 712 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
 713         hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms);
 714         hrtime_t start = gethrtime();
 715
 716 retry:
 717 #endif
 718         rw_enter(&zvol_state_lock, RW_READER);
 719         /*
 720          * Obtain a copy of private_data under the zvol_state_lock to make
 721          * sure that either the result of zvol free code path setting
 722          * disk->private_data to NULL is observed, or zvol_os_free()
 723          * is not called on this zv because of the positive zv_open_count.
 724          */
 725 #ifdef HAVE_BLK_MODE_T
 726         zv = disk->private_data;
 727 #else
 728         zv = bdev->bd_disk->private_data;
 729 #endif
 730         if (zv == NULL) {
 731                 rw_exit(&zvol_state_lock);
 732                 return (SET_ERROR(-ENXIO));
 733         }
 734
 735         mutex_enter(&zv->zv_state_lock);
 736         /*
 737          * Make sure zvol is not suspended during first open
 738          * (hold zv_suspend_lock) and respect proper lock acquisition
 739          * ordering - zv_suspend_lock before zv_state_lock
 740          */
 741         if (zv->zv_open_count == 0) {
 742                 if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
 743                         mutex_exit(&zv->zv_state_lock);
 744                         rw_enter(&zv->zv_suspend_lock, RW_READER);
 745                         mutex_enter(&zv->zv_state_lock);
 746                         /* check to see if zv_suspend_lock is needed */
 747                         if (zv->zv_open_count != 0) {
 748                                 rw_exit(&zv->zv_suspend_lock);
 749                         } else {
 750                                 drop_suspend = B_TRUE;
 751                         }
 752                 } else {
 753                         drop_suspend = B_TRUE;
 754                 }
 755         }
 756         rw_exit(&zvol_state_lock);
 757
 758         ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 759
 760         if (zv->zv_open_count == 0) {
 761                 boolean_t drop_namespace = B_FALSE;
 762
 763                 ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
 764
 765                 /*
 766                  * In all other call paths the spa_namespace_lock is taken
 767                  * before the bdev->bd_mutex lock.  However, on open(2)
 768                  * the __blkdev_get() function calls fops->open() with the
 769                  * bdev->bd_mutex lock held.  This can result in a deadlock
 770                  * when zvols from one pool are used as vdevs in another.
 771                  *
 772                  * To prevent a lock inversion deadlock we preemptively
 773                  * take the spa_namespace_lock.  Normally the lock will not
 774                  * be contended and this is safe because spa_open_common()
 775                  * handles the case where the caller already holds the
 776                  * spa_namespace_lock.
 777                  *
 778                  * When the lock cannot be aquired after multiple retries
 779                  * this must be the vdev on zvol deadlock case and we have
 780                  * no choice but to return an error.  For 5.12 and older
 781                  * kernels returning -ERESTARTSYS will result in the
 782                  * bdev->bd_mutex being dropped, then reacquired, and
 783                  * fops->open() being called again.  This process can be
 784                  * repeated safely until both locks are acquired.  For 5.13
 785                  * and newer the -ERESTARTSYS retry logic was removed from
 786                  * the kernel so the only option is to return the error for
 787                  * the caller to handle it.
 788                  */
 789                 if (!mutex_owned(&spa_namespace_lock)) {
 790                         if (!mutex_tryenter(&spa_namespace_lock)) {
 791                                 mutex_exit(&zv->zv_state_lock);
 792                                 rw_exit(&zv->zv_suspend_lock);
 793
 794 #ifdef HAVE_BLKDEV_GET_ERESTARTSYS
 795                                 schedule();
 796                                 return (SET_ERROR(-ERESTARTSYS));
 797 #else
 798                                 if ((gethrtime() - start) > timeout)
 799                                         return (SET_ERROR(-ERESTARTSYS));
 800
 801                                 schedule_timeout(MSEC_TO_TICK(10));
 802                                 goto retry;
 803 #endif
 804                         } else {
 805                                 drop_namespace = B_TRUE;
 806                         }
 807                 }
 808
 809                 error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag)));
 810
 811                 if (drop_namespace)
 812                         mutex_exit(&spa_namespace_lock);
 813         }
 814
 815         if (error == 0) {
 816                 if ((blk_mode_is_open_write(flag)) &&
 817                     (zv->zv_flags & ZVOL_RDONLY)) {
 818                         if (zv->zv_open_count == 0)
 819                                 zvol_last_close(zv);
 820
 821                         error = SET_ERROR(-EROFS);
 822                 } else {
 823                         zv->zv_open_count++;
 824                 }
 825         }
 826
 827         mutex_exit(&zv->zv_state_lock);
 828         if (drop_suspend)
 829                 rw_exit(&zv->zv_suspend_lock);
 830
 831         if (error == 0)
 832 #ifdef HAVE_BLK_MODE_T
 833                 disk_check_media_change(disk);
 834 #else
 835                 zfs_check_media_change(bdev);
 836 #endif
 837
 838         return (error);
 839 }
 840
 841 static void
 842 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG
 843 zvol_release(struct gendisk *disk)
 844 #else
 845 zvol_release(struct gendisk *disk, fmode_t unused)
 846 #endif
 847 {
 848 #if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG)
 849         (void) unused;
 850 #endif
 851         zvol_state_t *zv;
 852         boolean_t drop_suspend = B_TRUE;
 853
 854         rw_enter(&zvol_state_lock, RW_READER);
 855         zv = disk->private_data;
 856
 857         mutex_enter(&zv->zv_state_lock);
 858         ASSERT3U(zv->zv_open_count, >, 0);
 859         /*
 860          * make sure zvol is not suspended during last close
 861          * (hold zv_suspend_lock) and respect proper lock acquisition
 862          * ordering - zv_suspend_lock before zv_state_lock
 863          */
 864         if (zv->zv_open_count == 1) {
 865                 if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
 866                         mutex_exit(&zv->zv_state_lock);
 867                         rw_enter(&zv->zv_suspend_lock, RW_READER);
 868                         mutex_enter(&zv->zv_state_lock);
 869                         /* check to see if zv_suspend_lock is needed */
 870                         if (zv->zv_open_count != 1) {
 871                                 rw_exit(&zv->zv_suspend_lock);
 872                                 drop_suspend = B_FALSE;
 873                         }
 874                 }
 875         } else {
 876                 drop_suspend = B_FALSE;
 877         }
 878         rw_exit(&zvol_state_lock);
 879
 880         ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 881
 882         zv->zv_open_count--;
 883         if (zv->zv_open_count == 0) {
 884                 ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
 885                 zvol_last_close(zv);
 886         }
 887
 888         mutex_exit(&zv->zv_state_lock);
 889
 890         if (drop_suspend)
 891                 rw_exit(&zv->zv_suspend_lock);
 892 }
 893
 894 static int
 895 zvol_ioctl(struct block_device *bdev, fmode_t mode,
 896     unsigned int cmd, unsigned long arg)
 897 {
 898         zvol_state_t *zv = bdev->bd_disk->private_data;
 899         int error = 0;
 900
 901         ASSERT3U(zv->zv_open_count, >, 0);
 902
 903         switch (cmd) {
 904         case BLKFLSBUF:
 905 #ifdef HAVE_FSYNC_BDEV
 906                 fsync_bdev(bdev);
 907 #elif defined(HAVE_SYNC_BLOCKDEV)
 908                 sync_blockdev(bdev);
 909 #else
 910 #error "Neither fsync_bdev() nor sync_blockdev() found"
 911 #endif
 912                 invalidate_bdev(bdev);
 913                 rw_enter(&zv->zv_suspend_lock, RW_READER);
 914
 915                 if (!(zv->zv_flags & ZVOL_RDONLY))
 916                         txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
 917
 918                 rw_exit(&zv->zv_suspend_lock);
 919                 break;
 920
 921         case BLKZNAME:
 922                 mutex_enter(&zv->zv_state_lock);
 923                 error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
 924                 mutex_exit(&zv->zv_state_lock);
 925                 break;
 926
 927         default:
 928                 error = -ENOTTY;
 929                 break;
 930         }
 931
 932         return (SET_ERROR(error));
 933 }
 934
 935 #ifdef CONFIG_COMPAT
 936 static int
 937 zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
 938     unsigned cmd, unsigned long arg)
 939 {
 940         return (zvol_ioctl(bdev, mode, cmd, arg));
 941 }
 942 #else
 943 #define zvol_compat_ioctl       NULL
 944 #endif
 945
 946 static unsigned int
 947 zvol_check_events(struct gendisk *disk, unsigned int clearing)
 948 {
 949         unsigned int mask = 0;
 950
 951         rw_enter(&zvol_state_lock, RW_READER);
 952
 953         zvol_state_t *zv = disk->private_data;
 954         if (zv != NULL) {
 955                 mutex_enter(&zv->zv_state_lock);
 956                 mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
 957                 zv->zv_changed = 0;
 958                 mutex_exit(&zv->zv_state_lock);
 959         }
 960
 961         rw_exit(&zvol_state_lock);
 962
 963         return (mask);
 964 }
 965
 966 static int
 967 zvol_revalidate_disk(struct gendisk *disk)
 968 {
 969         rw_enter(&zvol_state_lock, RW_READER);
 970
 971         zvol_state_t *zv = disk->private_data;
 972         if (zv != NULL) {
 973                 mutex_enter(&zv->zv_state_lock);
 974                 set_capacity(zv->zv_zso->zvo_disk,
 975                     zv->zv_volsize >> SECTOR_BITS);
 976                 mutex_exit(&zv->zv_state_lock);
 977         }
 978
 979         rw_exit(&zvol_state_lock);
 980
 981         return (0);
 982 }
 983
 984 int
 985 zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
 986 {
 987         struct gendisk *disk = zv->zv_zso->zvo_disk;
 988
 989 #if defined(HAVE_REVALIDATE_DISK_SIZE)
 990         revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0);
 991 #elif defined(HAVE_REVALIDATE_DISK)
 992         revalidate_disk(disk);
 993 #else
 994         zvol_revalidate_disk(disk);
 995 #endif
 996         return (0);
 997 }
 998
 999 void
1000 zvol_os_clear_private(zvol_state_t *zv)
1001 {
1002         /*
1003          * Cleared while holding zvol_state_lock as a writer
1004          * which will prevent zvol_open() from opening it.
1005          */
1006         zv->zv_zso->zvo_disk->private_data = NULL;
1007 }
1008
1009 /*
1010  * Provide a simple virtual geometry for legacy compatibility.  For devices
1011  * smaller than 1 MiB a small head and sector count is used to allow very
1012  * tiny devices.  For devices over 1 Mib a standard head and sector count
1013  * is used to keep the cylinders count reasonable.
1014  */
1015 static int
1016 zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
1017 {
1018         zvol_state_t *zv = bdev->bd_disk->private_data;
1019         sector_t sectors;
1020
1021         ASSERT3U(zv->zv_open_count, >, 0);
1022
1023         sectors = get_capacity(zv->zv_zso->zvo_disk);
1024
1025         if (sectors > 2048) {
1026                 geo->heads = 16;
1027                 geo->sectors = 63;
1028         } else {
1029                 geo->heads = 2;
1030                 geo->sectors = 4;
1031         }
1032
1033         geo->start = 0;
1034         geo->cylinders = sectors / (geo->heads * geo->sectors);
1035
1036         return (0);
1037 }
1038
1039 /*
1040  * Why have two separate block_device_operations structs?
1041  *
1042  * Normally we'd just have one, and assign 'submit_bio' as needed.  However,
1043  * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we
1044  * can't just change submit_bio dynamically at runtime.  So just create two
1045  * separate structs to get around this.
1046  */
1047 static const struct block_device_operations zvol_ops_blk_mq = {
1048         .open                   = zvol_open,
1049         .release                = zvol_release,
1050         .ioctl                  = zvol_ioctl,
1051         .compat_ioctl           = zvol_compat_ioctl,
1052         .check_events           = zvol_check_events,
1053 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
1054         .revalidate_disk        = zvol_revalidate_disk,
1055 #endif
1056         .getgeo                 = zvol_getgeo,
1057         .owner                  = THIS_MODULE,
1058 };
1059
1060 static const struct block_device_operations zvol_ops = {
1061         .open                   = zvol_open,
1062         .release                = zvol_release,
1063         .ioctl                  = zvol_ioctl,
1064         .compat_ioctl           = zvol_compat_ioctl,
1065         .check_events           = zvol_check_events,
1066 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
1067         .revalidate_disk        = zvol_revalidate_disk,
1068 #endif
1069         .getgeo                 = zvol_getgeo,
1070         .owner                  = THIS_MODULE,
1071 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
1072         .submit_bio             = zvol_submit_bio,
1073 #endif
1074 };
1075
1076 static int
1077 zvol_alloc_non_blk_mq(struct zvol_state_os *zso)
1078 {
1079 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
1080 #if defined(HAVE_BLK_ALLOC_DISK)
1081         zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE);
1082         if (zso->zvo_disk == NULL)
1083                 return (1);
1084
1085         zso->zvo_disk->minors = ZVOL_MINORS;
1086         zso->zvo_queue = zso->zvo_disk->queue;
1087 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
1088         struct gendisk *disk = blk_alloc_disk(NULL, NUMA_NO_NODE);
1089         if (IS_ERR(disk)) {
1090                 zso->zvo_disk = NULL;
1091                 return (1);
1092         }
1093
1094         zso->zvo_disk = disk;
1095         zso->zvo_disk->minors = ZVOL_MINORS;
1096         zso->zvo_queue = zso->zvo_disk->queue;
1097 #else
1098         zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
1099         if (zso->zvo_queue == NULL)
1100                 return (1);
1101
1102         zso->zvo_disk = alloc_disk(ZVOL_MINORS);
1103         if (zso->zvo_disk == NULL) {
1104                 blk_cleanup_queue(zso->zvo_queue);
1105                 return (1);
1106         }
1107
1108         zso->zvo_disk->queue = zso->zvo_queue;
1109 #endif /* HAVE_BLK_ALLOC_DISK */
1110 #else
1111         zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
1112         if (zso->zvo_queue == NULL)
1113                 return (1);
1114
1115         zso->zvo_disk = alloc_disk(ZVOL_MINORS);
1116         if (zso->zvo_disk == NULL) {
1117                 blk_cleanup_queue(zso->zvo_queue);
1118                 return (1);
1119         }
1120
1121         zso->zvo_disk->queue = zso->zvo_queue;
1122 #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
1123         return (0);
1124
1125 }
1126
1127 static int
1128 zvol_alloc_blk_mq(zvol_state_t *zv)
1129 {
1130 #ifdef HAVE_BLK_MQ
1131         struct zvol_state_os *zso = zv->zv_zso;
1132
1133         /* Allocate our blk-mq tag_set */
1134         if (zvol_blk_mq_alloc_tag_set(zv) != 0)
1135                 return (1);
1136
1137 #if defined(HAVE_BLK_ALLOC_DISK)
1138         zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv);
1139         if (zso->zvo_disk == NULL) {
1140                 blk_mq_free_tag_set(&zso->tag_set);
1141                 return (1);
1142         }
1143         zso->zvo_queue = zso->zvo_disk->queue;
1144         zso->zvo_disk->minors = ZVOL_MINORS;
1145 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
1146         struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, NULL, zv);
1147         if (IS_ERR(disk)) {
1148                 zso->zvo_disk = NULL;
1149                 blk_mq_free_tag_set(&zso->tag_set);
1150                 return (1);
1151         }
1152
1153         zso->zvo_disk = disk;
1154         zso->zvo_queue = zso->zvo_disk->queue;
1155         zso->zvo_disk->minors = ZVOL_MINORS;
1156 #else
1157         zso->zvo_disk = alloc_disk(ZVOL_MINORS);
1158         if (zso->zvo_disk == NULL) {
1159                 blk_cleanup_queue(zso->zvo_queue);
1160                 blk_mq_free_tag_set(&zso->tag_set);
1161                 return (1);
1162         }
1163         /* Allocate queue */
1164         zso->zvo_queue = blk_mq_init_queue(&zso->tag_set);
1165         if (IS_ERR(zso->zvo_queue)) {
1166                 blk_mq_free_tag_set(&zso->tag_set);
1167                 return (1);
1168         }
1169
1170         /* Our queue is now created, assign it to our disk */
1171         zso->zvo_disk->queue = zso->zvo_queue;
1172
1173 #endif
1174 #endif
1175         return (0);
1176 }
1177
1178 /*
1179  * Allocate memory for a new zvol_state_t and setup the required
1180  * request queue and generic disk structures for the block device.
1181  */
1182 static zvol_state_t *
1183 zvol_alloc(dev_t dev, const char *name)
1184 {
1185         zvol_state_t *zv;
1186         struct zvol_state_os *zso;
1187         uint64_t volmode;
1188         int ret;
1189
1190         if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
1191                 return (NULL);
1192
1193         if (volmode == ZFS_VOLMODE_DEFAULT)
1194                 volmode = zvol_volmode;
1195
1196         if (volmode == ZFS_VOLMODE_NONE)
1197                 return (NULL);
1198
1199         zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
1200         zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
1201         zv->zv_zso = zso;
1202         zv->zv_volmode = volmode;
1203
1204         list_link_init(&zv->zv_next);
1205         mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
1206
1207 #ifdef HAVE_BLK_MQ
1208         zv->zv_zso->use_blk_mq = zvol_use_blk_mq;
1209 #endif
1210
1211         /*
1212          * The block layer has 3 interfaces for getting BIOs:
1213          *
1214          * 1. blk-mq request queues (new)
1215          * 2. submit_bio() (oldest)
1216          * 3. regular request queues (old).
1217          *
1218          * Each of those interfaces has two permutations:
1219          *
1220          * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates
1221          *    both the disk and its queue (5.14 kernel or newer)
1222          *
1223          * b) We don't have blk_*alloc_disk(), and have to allocate the
1224          *    disk and the queue separately. (5.13 kernel or older)
1225          */
1226         if (zv->zv_zso->use_blk_mq) {
1227                 ret = zvol_alloc_blk_mq(zv);
1228                 zso->zvo_disk->fops = &zvol_ops_blk_mq;
1229         } else {
1230                 ret = zvol_alloc_non_blk_mq(zso);
1231                 zso->zvo_disk->fops = &zvol_ops;
1232         }
1233         if (ret != 0)
1234                 goto out_kmem;
1235
1236         blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE);
1237
1238         /* Limit read-ahead to a single page to prevent over-prefetching. */
1239         blk_queue_set_read_ahead(zso->zvo_queue, 1);
1240
1241         if (!zv->zv_zso->use_blk_mq) {
1242                 /* Disable write merging in favor of the ZIO pipeline. */
1243                 blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
1244         }
1245
1246         /* Enable /proc/diskstats */
1247         blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue);
1248
1249         zso->zvo_queue->queuedata = zv;
1250         zso->zvo_dev = dev;
1251         zv->zv_open_count = 0;
1252         strlcpy(zv->zv_name, name, sizeof (zv->zv_name));
1253
1254         zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
1255         rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
1256
1257         zso->zvo_disk->major = zvol_major;
1258         zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE;
1259
1260         /*
1261          * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices.
1262          * This is accomplished by limiting the number of minors for the
1263          * device to one and explicitly disabling partition scanning.
1264          */
1265         if (volmode == ZFS_VOLMODE_DEV) {
1266                 zso->zvo_disk->minors = 1;
1267                 zso->zvo_disk->flags &= ~ZFS_GENHD_FL_EXT_DEVT;
1268                 zso->zvo_disk->flags |= ZFS_GENHD_FL_NO_PART;
1269         }
1270
1271         zso->zvo_disk->first_minor = (dev & MINORMASK);
1272         zso->zvo_disk->private_data = zv;
1273         snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
1274             ZVOL_DEV_NAME, (dev & MINORMASK));
1275
1276         return (zv);
1277
1278 out_kmem:
1279         kmem_free(zso, sizeof (struct zvol_state_os));
1280         kmem_free(zv, sizeof (zvol_state_t));
1281         return (NULL);
1282 }
1283
1284 /*
1285  * Cleanup then free a zvol_state_t which was created by zvol_alloc().
1286  * At this time, the structure is not opened by anyone, is taken off
1287  * the zvol_state_list, and has its private data set to NULL.
1288  * The zvol_state_lock is dropped.
1289  *
1290  * This function may take many milliseconds to complete (e.g. we've seen
1291  * it take over 256ms), due to the calls to "blk_cleanup_queue" and
1292  * "del_gendisk". Thus, consumers need to be careful to account for this
1293  * latency when calling this function.
1294  */
1295 void
1296 zvol_os_free(zvol_state_t *zv)
1297 {
1298
1299         ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
1300         ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
1301         ASSERT0(zv->zv_open_count);
1302         ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);
1303
1304         rw_destroy(&zv->zv_suspend_lock);
1305         zfs_rangelock_fini(&zv->zv_rangelock);
1306
1307         del_gendisk(zv->zv_zso->zvo_disk);
1308 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
1309         (defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
1310 #if defined(HAVE_BLK_CLEANUP_DISK)
1311         blk_cleanup_disk(zv->zv_zso->zvo_disk);
1312 #else
1313         put_disk(zv->zv_zso->zvo_disk);
1314 #endif
1315 #else
1316         blk_cleanup_queue(zv->zv_zso->zvo_queue);
1317         put_disk(zv->zv_zso->zvo_disk);
1318 #endif
1319
1320 #ifdef HAVE_BLK_MQ
1321         if (zv->zv_zso->use_blk_mq)
1322                 blk_mq_free_tag_set(&zv->zv_zso->tag_set);
1323 #endif
1324
1325         ida_simple_remove(&zvol_ida,
1326             MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);
1327
1328         mutex_destroy(&zv->zv_state_lock);
1329         dataset_kstats_destroy(&zv->zv_kstat);
1330
1331         kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
1332         kmem_free(zv, sizeof (zvol_state_t));
1333 }
1334
1335 void
1336 zvol_wait_close(zvol_state_t *zv)
1337 {
1338 }
1339
1340 /*
1341  * Create a block device minor node and setup the linkage between it
1342  * and the specified volume.  Once this function returns the block
1343  * device is live and ready for use.
1344  */
1345 int
1346 zvol_os_create_minor(const char *name)
1347 {
1348         zvol_state_t *zv;
1349         objset_t *os;
1350         dmu_object_info_t *doi;
1351         uint64_t volsize;
1352         uint64_t len;
1353         unsigned minor = 0;
1354         int error = 0;
1355         int idx;
1356         uint64_t hash = zvol_name_hash(name);
1357         uint64_t volthreading;
1358         bool replayed_zil = B_FALSE;
1359
1360         if (zvol_inhibit_dev)
1361                 return (0);
1362
1363         idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
1364         if (idx < 0)
1365                 return (SET_ERROR(-idx));
1366         minor = idx << ZVOL_MINOR_BITS;
1367         if (MINOR(minor) != minor) {
1368                 /* too many partitions can cause an overflow */
1369                 zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u",
1370                     name, minor, MINOR(minor));
1371                 ida_simple_remove(&zvol_ida, idx);
1372                 return (SET_ERROR(EINVAL));
1373         }
1374
1375         zv = zvol_find_by_name_hash(name, hash, RW_NONE);
1376         if (zv) {
1377                 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1378                 mutex_exit(&zv->zv_state_lock);
1379                 ida_simple_remove(&zvol_ida, idx);
1380                 return (SET_ERROR(EEXIST));
1381         }
1382
1383         doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
1384
1385         error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
1386         if (error)
1387                 goto out_doi;
1388
1389         error = dmu_object_info(os, ZVOL_OBJ, doi);
1390         if (error)
1391                 goto out_dmu_objset_disown;
1392
1393         error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
1394         if (error)
1395                 goto out_dmu_objset_disown;
1396
1397         zv = zvol_alloc(MKDEV(zvol_major, minor), name);
1398         if (zv == NULL) {
1399                 error = SET_ERROR(EAGAIN);
1400                 goto out_dmu_objset_disown;
1401         }
1402         zv->zv_hash = hash;
1403
1404         if (dmu_objset_is_snapshot(os))
1405                 zv->zv_flags |= ZVOL_RDONLY;
1406
1407         zv->zv_volblocksize = doi->doi_data_block_size;
1408         zv->zv_volsize = volsize;
1409         zv->zv_objset = os;
1410
1411         /* Default */
1412         zv->zv_threading = B_TRUE;
1413         if (dsl_prop_get_integer(name, "volthreading", &volthreading, NULL)
1414             == 0)
1415                 zv->zv_threading = volthreading;
1416
1417         set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);
1418
1419         blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue,
1420             (DMU_MAX_ACCESS / 4) >> 9);
1421
1422         if (zv->zv_zso->use_blk_mq) {
1423                 /*
1424                  * IO requests can be really big (1MB).  When an IO request
1425                  * comes in, it is passed off to zvol_read() or zvol_write()
1426                  * in a new thread, where it is chunked up into 'volblocksize'
1427                  * sized pieces and processed.  So for example, if the request
1428                  * is a 1MB write and your volblocksize is 128k, one zvol_write
1429                  * thread will take that request and sequentially do ten 128k
1430                  * IOs.  This is due to the fact that the thread needs to lock
1431                  * each volblocksize sized block.  So you might be wondering:
1432                  * "instead of passing the whole 1MB request to one thread,
1433                  * why not pass ten individual 128k chunks to ten threads and
1434                  * process the whole write in parallel?"  The short answer is
1435                  * that there's a sweet spot number of chunks that balances
1436                  * the greater parallelism with the added overhead of more
1437                  * threads. The sweet spot can be different depending on if you
1438                  * have a read or write  heavy workload.  Writes typically want
1439                  * high chunk counts while reads typically want lower ones.  On
1440                  * a test pool with 6 NVMe drives in a 3x 2-disk mirror
1441                  * configuration, with volblocksize=8k, the sweet spot for good
1442                  * sequential reads and writes was at 8 chunks.
1443                  */
1444
1445                 /*
1446                  * Below we tell the kernel how big we want our requests
1447                  * to be.  You would think that blk_queue_io_opt() would be
1448                  * used to do this since it is used to "set optimal request
1449                  * size for the queue", but that doesn't seem to do
1450                  * anything - the kernel still gives you huge requests
1451                  * with tons of little PAGE_SIZE segments contained within it.
1452                  *
1453                  * Knowing that the kernel will just give you PAGE_SIZE segments
1454                  * no matter what, you can say "ok, I want PAGE_SIZE byte
1455                  * segments, and I want 'N' of them per request", where N is
1456                  * the correct number of segments for the volblocksize and
1457                  * number of chunks you want.
1458                  */
1459 #ifdef HAVE_BLK_MQ
1460                 if (zvol_blk_mq_blocks_per_thread != 0) {
1461                         unsigned int chunks;
1462                         chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);
1463
1464                         blk_queue_max_segment_size(zv->zv_zso->zvo_queue,
1465                             PAGE_SIZE);
1466                         blk_queue_max_segments(zv->zv_zso->zvo_queue,
1467                             (zv->zv_volblocksize * chunks) / PAGE_SIZE);
1468                 } else {
1469                         /*
1470                          * Special case: zvol_blk_mq_blocks_per_thread = 0
1471                          * Max everything out.
1472                          */
1473                         blk_queue_max_segments(zv->zv_zso->zvo_queue,
1474                             UINT16_MAX);
1475                         blk_queue_max_segment_size(zv->zv_zso->zvo_queue,
1476                             UINT_MAX);
1477                 }
1478 #endif
1479         } else {
1480                 blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX);
1481                 blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX);
1482         }
1483
1484         blk_queue_physical_block_size(zv->zv_zso->zvo_queue,
1485             zv->zv_volblocksize);
1486         blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize);
1487         blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue,
1488             (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
1489         blk_queue_discard_granularity(zv->zv_zso->zvo_queue,
1490             zv->zv_volblocksize);
1491 #ifdef QUEUE_FLAG_DISCARD
1492         blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
1493 #endif
1494 #ifdef QUEUE_FLAG_NONROT
1495         blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue);
1496 #endif
1497 #ifdef QUEUE_FLAG_ADD_RANDOM
1498         blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue);
1499 #endif
1500         /* This flag was introduced in kernel version 4.12. */
1501 #ifdef QUEUE_FLAG_SCSI_PASSTHROUGH
1502         blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
1503 #endif
1504
1505         ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
1506         error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
1507         if (error)
1508                 goto out_dmu_objset_disown;
1509         ASSERT3P(zv->zv_zilog, ==, NULL);
1510         zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
1511         if (spa_writeable(dmu_objset_spa(os))) {
1512                 if (zil_replay_disable)
1513                         replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
1514                 else
1515                         replayed_zil = zil_replay(os, zv, zvol_replay_vector);
1516         }
1517         if (replayed_zil)
1518                 zil_close(zv->zv_zilog);
1519         zv->zv_zilog = NULL;
1520
1521         /*
1522          * When udev detects the addition of the device it will immediately
1523          * invoke blkid(8) to determine the type of content on the device.
1524          * Prefetching the blocks commonly scanned by blkid(8) will speed
1525          * up this process.
1526          */
1527         len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE);
1528         if (len > 0) {
1529                 dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
1530                 dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
1531                     ZIO_PRIORITY_SYNC_READ);
1532         }
1533
1534         zv->zv_objset = NULL;
1535 out_dmu_objset_disown:
1536         dmu_objset_disown(os, B_TRUE, FTAG);
1537 out_doi:
1538         kmem_free(doi, sizeof (dmu_object_info_t));
1539
1540         /*
1541          * Keep in mind that once add_disk() is called, the zvol is
1542          * announced to the world, and zvol_open()/zvol_release() can
1543          * be called at any time. Incidentally, add_disk() itself calls
1544          * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
1545          * directly as well.
1546          */
1547         if (error == 0) {
1548                 rw_enter(&zvol_state_lock, RW_WRITER);
1549                 zvol_insert(zv);
1550                 rw_exit(&zvol_state_lock);
1551 #ifdef HAVE_ADD_DISK_RET
1552                 error = add_disk(zv->zv_zso->zvo_disk);
1553 #else
1554                 add_disk(zv->zv_zso->zvo_disk);
1555 #endif
1556         } else {
1557                 ida_simple_remove(&zvol_ida, idx);
1558         }
1559
1560         return (error);
1561 }
1562
1563 void
1564 zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
1565 {
1566         int readonly = get_disk_ro(zv->zv_zso->zvo_disk);
1567
1568         ASSERT(RW_LOCK_HELD(&zvol_state_lock));
1569         ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1570
1571         strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
1572
1573         /* move to new hashtable entry  */
1574         zv->zv_hash = zvol_name_hash(zv->zv_name);
1575         hlist_del(&zv->zv_hlink);
1576         hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
1577
1578         /*
1579          * The block device's read-only state is briefly changed causing
1580          * a KOBJ_CHANGE uevent to be issued.  This ensures udev detects
1581          * the name change and fixes the symlinks.  This does not change
1582          * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
1583          * changes.  This would normally be done using kobject_uevent() but
1584          * that is a GPL-only symbol which is why we need this workaround.
1585          */
1586         set_disk_ro(zv->zv_zso->zvo_disk, !readonly);
1587         set_disk_ro(zv->zv_zso->zvo_disk, readonly);
1588
1589         dataset_kstats_rename(&zv->zv_kstat, newname);
1590 }
1591
1592 void
1593 zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
1594 {
1595
1596         set_disk_ro(zv->zv_zso->zvo_disk, flags);
1597 }
1598
1599 void
1600 zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
1601 {
1602
1603         set_capacity(zv->zv_zso->zvo_disk, capacity);
1604 }
1605
1606 int
1607 zvol_init(void)
1608 {
1609         int error;
1610
1611         /*
1612          * zvol_threads is the module param the user passes in.
1613          *
1614          * zvol_actual_threads is what we use internally, since the user can
1615          * pass zvol_thread = 0 to mean "use all the CPUs" (the default).
1616          */
1617         static unsigned int zvol_actual_threads;
1618
1619         if (zvol_threads == 0) {
1620                 /*
1621                  * See dde9380a1 for why 32 was chosen here.  This should
1622                  * probably be refined to be some multiple of the number
1623                  * of CPUs.
1624                  */
1625                 zvol_actual_threads = MAX(num_online_cpus(), 32);
1626         } else {
1627                 zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024);
1628         }
1629
1630         /*
1631          * Use atleast 32 zvol_threads but for many core system,
1632          * prefer 6 threads per taskq, but no more taskqs
1633          * than threads in them on large systems.
1634          *
1635          *                 taskq   total
1636          * cpus    taskqs  threads threads
1637          * ------- ------- ------- -------
1638          * 1       1       32       32
1639          * 2       1       32       32
1640          * 4       1       32       32
1641          * 8       2       16       32
1642          * 16      3       11       33
1643          * 32      5       7        35
1644          * 64      8       8        64
1645          * 128     11      12       132
1646          * 256     16      16       256
1647          */
1648         zv_taskq_t *ztqs = &zvol_taskqs;
1649         uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs);
1650         if (num_tqs == 0) {
1651                 num_tqs = 1 + num_online_cpus() / 6;
1652                 while (num_tqs * num_tqs > zvol_actual_threads)
1653                         num_tqs--;
1654         }
1655         uint_t per_tq_thread = zvol_actual_threads / num_tqs;
1656         if (per_tq_thread * num_tqs < zvol_actual_threads)
1657                 per_tq_thread++;
1658         ztqs->tqs_cnt = num_tqs;
1659         ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP);
1660         error = register_blkdev(zvol_major, ZVOL_DRIVER);
1661         if (error) {
1662                 kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *));
1663                 ztqs->tqs_taskq = NULL;
1664                 printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
1665                 return (error);
1666         }
1667
1668 #ifdef HAVE_BLK_MQ
1669         if (zvol_blk_mq_queue_depth == 0) {
1670                 zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
1671         } else {
1672                 zvol_actual_blk_mq_queue_depth =
1673                     MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ);
1674         }
1675
1676         if (zvol_blk_mq_threads == 0) {
1677                 zvol_blk_mq_actual_threads = num_online_cpus();
1678         } else {
1679                 zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1),
1680                     1024);
1681         }
1682 #endif
1683         for (uint_t i = 0; i < num_tqs; i++) {
1684                 char name[32];
1685                 (void) snprintf(name, sizeof (name), "%s_tq-%u",
1686                     ZVOL_DRIVER, i);
1687                 ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread,
1688                     maxclsyspri, per_tq_thread, INT_MAX,
1689                     TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
1690                 if (ztqs->tqs_taskq[i] == NULL) {
1691                         for (int j = i - 1; j >= 0; j--)
1692                                 taskq_destroy(ztqs->tqs_taskq[j]);
1693                         unregister_blkdev(zvol_major, ZVOL_DRIVER);
1694                         kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
1695                             sizeof (taskq_t *));
1696                         ztqs->tqs_taskq = NULL;
1697                         return (-ENOMEM);
1698                 }
1699         }
1700
1701         zvol_init_impl();
1702         ida_init(&zvol_ida);
1703         return (0);
1704 }
1705
1706 void
1707 zvol_fini(void)
1708 {
1709         zv_taskq_t *ztqs = &zvol_taskqs;
1710         zvol_fini_impl();
1711         unregister_blkdev(zvol_major, ZVOL_DRIVER);
1712
1713         if (ztqs->tqs_taskq == NULL) {
1714                 ASSERT3U(ztqs->tqs_cnt, ==, 0);
1715         } else {
1716                 for (uint_t i = 0; i < ztqs->tqs_cnt; i++) {
1717                         ASSERT3P(ztqs->tqs_taskq[i], !=, NULL);
1718                         taskq_destroy(ztqs->tqs_taskq[i]);
1719                 }
1720                 kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
1721                     sizeof (taskq_t *));
1722                 ztqs->tqs_taskq = NULL;
1723         }
1724
1725         ida_destroy(&zvol_ida);
1726 }
1727
1728 /* BEGIN CSTYLED */
1729 module_param(zvol_inhibit_dev, uint, 0644);
1730 MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");
1731
1732 module_param(zvol_major, uint, 0444);
1733 MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
1734
1735 module_param(zvol_threads, uint, 0444);
1736 MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. Set"
1737     "to 0 to use all active CPUs");
1738
1739 module_param(zvol_request_sync, uint, 0644);
1740 MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");
1741
1742 module_param(zvol_max_discard_blocks, ulong, 0444);
1743 MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");
1744
1745 module_param(zvol_num_taskqs, uint, 0444);
1746 MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs");
1747
1748 module_param(zvol_prefetch_bytes, uint, 0644);
1749 MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
1750
1751 module_param(zvol_volmode, uint, 0644);
1752 MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
1753
1754 #ifdef HAVE_BLK_MQ
1755 module_param(zvol_blk_mq_queue_depth, uint, 0644);
1756 MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth");
1757
1758 module_param(zvol_use_blk_mq, uint, 0644);
1759 MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols");
1760
1761 module_param(zvol_blk_mq_blocks_per_thread, uint, 0644);
1762 MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread,
1763     "Process volblocksize blocks per thread");
1764 #endif
1765
1766 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
1767 module_param(zvol_open_timeout_ms, uint, 0644);
1768 MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries");
1769 #endif
1770
1771 /* END CSTYLED */