/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 */

#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
#include <sys/dmu_tx.h>
#include <sys/zio.h>
#include <sys/zfs_rlock.h>
#include <sys/spa_impl.h>
#include <sys/zvol.h>
#include <sys/zvol_impl.h>

#include <linux/blkdev_compat.h>
#include <linux/task_io_accounting_ops.h>

unsigned int zvol_major = ZVOL_MAJOR;
unsigned int zvol_request_sync = 0;
unsigned int zvol_prefetch_bytes = (128 * 1024);
unsigned long zvol_max_discard_blocks = 16384;
unsigned int zvol_threads = 32;

struct zvol_state_os {
	struct gendisk		*zvo_disk;	/* generic disk */
	struct request_queue	*zvo_queue;	/* request queue */
	dev_t			zvo_dev;	/* device id */
};

taskq_t *zvol_taskq;
static struct ida zvol_ida;

typedef struct zv_request {
	zvol_state_t	*zv;
	struct bio	*bio;
	taskq_ent_t	ent;
} zv_request_t;

/*
 * Given a path, return TRUE if path is a ZVOL.
 */
static boolean_t
zvol_is_zvol_impl(const char *device)
{
	struct block_device *bdev;
	unsigned int major;

	bdev = vdev_lookup_bdev(device);
	if (IS_ERR(bdev))
		return (B_FALSE);

	major = MAJOR(bdev->bd_dev);
	bdput(bdev);

	if (major == zvol_major)
		return (B_TRUE);

	return (B_FALSE);
}

static void
uio_from_bio(uio_t *uio, struct bio *bio)
{
	uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)];
	uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio);
	uio->uio_loffset = BIO_BI_SECTOR(bio) << 9;
	uio->uio_segflg = UIO_BVEC;
	uio->uio_resid = BIO_BI_SIZE(bio);
	uio->uio_skip = BIO_BI_SKIP(bio);
}
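
/*
 * For illustration: a bio positioned at 512-byte sector 2048 with 16 KiB
 * remaining maps to uio_loffset = 2048 << 9 = 1048576 bytes and
 * uio_resid = 16384; BIO_BI_SKIP() carries any partial progress into the
 * first bio_vec segment.
 */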

static void
zvol_write(void *arg)
{
	int error = 0;

	zv_request_t *zvr = arg;
	struct bio *bio = zvr->bio;
	uio_t uio = { { 0 }, 0 };
	uio_from_bio(&uio, bio);

	zvol_state_t *zv = zvr->zv;
	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);
	ASSERT3P(zv->zv_zilog, !=, NULL);

	/* Bios marked as FLUSH need the ZIL committed before the write. */
	if (bio_is_flush(bio))
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	/* Some requests are just for flush and nothing else. */
	if (uio.uio_resid == 0) {
		rw_exit(&zv->zv_suspend_lock);
		BIO_END_IO(bio, 0);
		kmem_free(zvr, sizeof (zv_request_t));
		return;
	}

	ssize_t start_resid = uio.uio_resid;
	unsigned long start_jif = jiffies;
	blk_generic_start_io_acct(zv->zv_zso->zvo_queue, WRITE,
	    bio_sectors(bio), &zv->zv_zso->zvo_disk->part0);

	boolean_t sync =
	    bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    uio.uio_loffset, uio.uio_resid, RL_WRITER);

	uint64_t volsize = zv->zv_volsize;
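	/*
	 * Writes are clipped to the volume size and issued in chunks of at
	 * most DMU_MAX_ACCESS / 2 per transaction (32 MiB with the stock
	 * 64 MiB DMU_MAX_ACCESS), bounding the dirty data held by any one tx.
	 */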
	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
		uint64_t off = uio.uio_loffset;
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);

		/* This will only fail for ENOSPC */
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0) {
			zvol_log_write(zv, tx, off, bytes, sync);
		}
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);

	int64_t nwritten = start_resid - uio.uio_resid;
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	task_io_account_write(nwritten);

	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	rw_exit(&zv->zv_suspend_lock);
	blk_generic_end_io_acct(zv->zv_zso->zvo_queue,
	    WRITE, &zv->zv_zso->zvo_disk->part0, start_jif);
	BIO_END_IO(bio, -error);
	kmem_free(zvr, sizeof (zv_request_t));
}

static void
zvol_discard(void *arg)
{
	zv_request_t *zvr = arg;
	struct bio *bio = zvr->bio;
	zvol_state_t *zv = zvr->zv;
	uint64_t start = BIO_BI_SECTOR(bio) << 9;
	uint64_t size = BIO_BI_SIZE(bio);
	uint64_t end = start + size;
	boolean_t sync;
	int error = 0;
	dmu_tx_t *tx;
	unsigned long start_jif;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);
	ASSERT3P(zv->zv_zilog, !=, NULL);

	start_jif = jiffies;
	blk_generic_start_io_acct(zv->zv_zso->zvo_queue, WRITE,
	    bio_sectors(bio), &zv->zv_zso->zvo_disk->part0);

	sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	if (end > zv->zv_volsize) {
		error = SET_ERROR(EIO);
		goto unlock;
	}

	/*
	 * Align the request to volume block boundaries when a secure erase is
	 * not required. This will prevent dnode_free_range() from zeroing out
	 * the unaligned parts, which is slow (read-modify-write) and useless
	 * since we are not freeing any space by doing so.
	 */
	if (!bio_is_secure_erase(bio)) {
		start = P2ROUNDUP(start, zv->zv_volblocksize);
		end = P2ALIGN(end, zv->zv_volblocksize);
		size = end - start;
	}
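
	/*
	 * For illustration: with an 8 KiB volblocksize a discard of bytes
	 * [4096, 20480) is trimmed to the whole blocks in [8192, 16384),
	 * and a discard smaller than one block trims to an empty range,
	 * which the check below skips.
	 */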

	if (start >= end)
		goto unlock;

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    start, size, RL_WRITER);

	tx = dmu_tx_create(zv->zv_objset);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
	} else {
		zvol_log_truncate(zv, tx, start, size, B_TRUE);
		dmu_tx_commit(tx);
		error = dmu_free_long_range(zv->zv_objset,
		    ZVOL_OBJ, start, size);
	}
	zfs_rangelock_exit(lr);

	if (error == 0 && sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

unlock:
	rw_exit(&zv->zv_suspend_lock);
	blk_generic_end_io_acct(zv->zv_zso->zvo_queue, WRITE,
	    &zv->zv_zso->zvo_disk->part0, start_jif);
	BIO_END_IO(bio, -error);
	kmem_free(zvr, sizeof (zv_request_t));
}

static void
zvol_read(void *arg)
{
	int error = 0;

	zv_request_t *zvr = arg;
	struct bio *bio = zvr->bio;
	uio_t uio = { { 0 }, 0 };
	uio_from_bio(&uio, bio);

	zvol_state_t *zv = zvr->zv;
	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);

	ssize_t start_resid = uio.uio_resid;
	unsigned long start_jif = jiffies;
	blk_generic_start_io_acct(zv->zv_zso->zvo_queue, READ, bio_sectors(bio),
	    &zv->zv_zso->zvo_disk->part0);

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    uio.uio_loffset, uio.uio_resid, RL_READER);

	uint64_t volsize = zv->zv_volsize;
	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);

		/* don't read past the end */
		if (bytes > volsize - uio.uio_loffset)
			bytes = volsize - uio.uio_loffset;

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);

	int64_t nread = start_resid - uio.uio_resid;
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
	task_io_account_read(nread);

	rw_exit(&zv->zv_suspend_lock);
	blk_generic_end_io_acct(zv->zv_zso->zvo_queue, READ,
	    &zv->zv_zso->zvo_disk->part0, start_jif);
	BIO_END_IO(bio, -error);
	kmem_free(zvr, sizeof (zv_request_t));
}

#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
static blk_qc_t
zvol_submit_bio(struct bio *bio)
#else
static MAKE_REQUEST_FN_RET
zvol_request(struct request_queue *q, struct bio *bio)
#endif
{
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
	struct request_queue *q = bio->bi_disk->queue;
#endif
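	/*
	 * Note: on newer kernels bios arrive via the block_device_operations
	 * ->submit_bio() hook; older kernels instead call the make_request
	 * function registered when the queue was allocated. The HAVE_*
	 * macro above comes from the OpenZFS configure-time kernel checks.
	 */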
	zvol_state_t *zv = q->queuedata;
	fstrans_cookie_t cookie = spl_fstrans_mark();
	uint64_t offset = BIO_BI_SECTOR(bio) << 9;
	uint64_t size = BIO_BI_SIZE(bio);
	int rw = bio_data_dir(bio);
	zv_request_t *zvr;

	if (bio_has_data(bio) && offset + size > zv->zv_volsize) {
		printk(KERN_INFO
		    "%s: bad access: offset=%llu, size=%lu\n",
		    zv->zv_zso->zvo_disk->disk_name,
		    (long long unsigned)offset,
		    (long unsigned)size);

		BIO_END_IO(bio, -SET_ERROR(EIO));
		goto out;
	}

	if (rw == WRITE) {
		if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
			BIO_END_IO(bio, -SET_ERROR(EROFS));
			goto out;
		}

		/*
		 * Prevents the zvol from being suspended, or the ZIL being
		 * concurrently opened. Will be released after the i/o
		 * completes.
		 */
		rw_enter(&zv->zv_suspend_lock, RW_READER);

		/*
		 * Open a ZIL if this is the first time we have written to this
		 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
		 * than zv_state_lock so that we don't need to acquire an
		 * additional lock in this path.
		 */
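		/*
		 * A reader hold on an rwlock cannot be upgraded atomically,
		 * so the lock is dropped and retaken as a writer below, and
		 * zv_zilog must be re-checked in case another thread opened
		 * the ZIL in the unlocked window; rw_downgrade() then returns
		 * the hold to reader mode for the i/o itself.
		 */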
		if (zv->zv_zilog == NULL) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
			if (zv->zv_zilog == NULL) {
				zv->zv_zilog = zil_open(zv->zv_objset,
				    zvol_get_data);
				zv->zv_flags |= ZVOL_WRITTEN_TO;
			}
			rw_downgrade(&zv->zv_suspend_lock);
		}

		zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP);
		zvr->zv = zv;
		zvr->bio = bio;
		taskq_init_ent(&zvr->ent);

		/*
		 * We don't want this thread to be blocked waiting for i/o to
		 * complete, so we instead wait from a taskq callback. The
		 * i/o may be a ZIL write (via zil_commit()), or a read of an
		 * indirect block, or a read of a data block (if this is a
		 * partial-block write). We will indicate that the i/o is
		 * complete by calling BIO_END_IO() from the taskq callback.
		 *
		 * This design allows the calling thread to continue and
		 * initiate more concurrent operations by calling
		 * zvol_request() again. There are typically only a small
		 * number of threads available to call zvol_request() (e.g.
		 * one per iSCSI target), so keeping the latency of
		 * zvol_request() low is important for performance.
		 *
		 * The zvol_request_sync module parameter allows this
		 * behavior to be altered, for performance evaluation
		 * purposes. If the callback blocks, setting
		 * zvol_request_sync=1 will result in much worse performance.
		 *
		 * We can have up to zvol_threads concurrent i/o's being
		 * processed for all zvols on the system. This is typically
		 * a vast improvement over the zvol_request_sync=1 behavior
		 * of one i/o at a time per zvol. However, an even better
		 * design would be for zvol_request() to initiate the zio
		 * directly, and then be notified by the zio_done callback,
		 * which would call BIO_END_IO(). Unfortunately, the DMU/ZIL
		 * interfaces lack this functionality (they block waiting for
		 * the i/o to complete).
		 */
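		/*
		 * Because taskq_init_ent() pre-initialized the entry embedded
		 * in the request, taskq_dispatch_ent() below queues the work
		 * without allocating and cannot fail.
		 */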
		if (bio_is_discard(bio) || bio_is_secure_erase(bio)) {
			if (zvol_request_sync) {
				zvol_discard(zvr);
			} else {
				taskq_dispatch_ent(zvol_taskq,
				    zvol_discard, zvr, 0, &zvr->ent);
			}
		} else {
			if (zvol_request_sync) {
				zvol_write(zvr);
			} else {
				taskq_dispatch_ent(zvol_taskq,
				    zvol_write, zvr, 0, &zvr->ent);
			}
		}
	} else {
		/*
		 * The SCST driver, and possibly others, may issue READ I/Os
		 * with a length of zero bytes. These empty I/Os contain no
		 * data and require no additional handling.
		 */
		if (size == 0) {
			BIO_END_IO(bio, 0);
			goto out;
		}

		zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP);
		zvr->zv = zv;
		zvr->bio = bio;
		taskq_init_ent(&zvr->ent);

		rw_enter(&zv->zv_suspend_lock, RW_READER);

		/* See comment in WRITE case above. */
		if (zvol_request_sync) {
			zvol_read(zvr);
		} else {
			taskq_dispatch_ent(zvol_taskq,
			    zvol_read, zvr, 0, &zvr->ent);
		}
	}

out:
	spl_fstrans_unmark(cookie);
#if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
	defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
	return (BLK_QC_T_NONE);
#endif
}

static int
zvol_open(struct block_device *bdev, fmode_t flag)
{
	zvol_state_t *zv;
	int error = 0;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, RW_READER);
	/*
	 * Obtain a copy of private_data under the zvol_state_lock to make
	 * sure that either the result of zvol free code path setting
	 * bdev->bd_disk->private_data to NULL is observed, or zvol_free()
	 * is not called on this zv because of the positive zv_open_count.
	 */
	zv = bdev->bd_disk->private_data;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(-ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	/*
	 * make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
		error = -zvol_first_open(zv, !(flag & FMODE_WRITE));
		if (error)
			goto out_mutex;
	}

	if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		error = -EROFS;
		goto out_open_count;
	}

	zv->zv_open_count++;

	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);

	zfs_check_media_change(bdev);

	return (0);

out_open_count:
	if (zv->zv_open_count == 0)
		zvol_last_close(zv);

out_mutex:
	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	if (error == -EINTR) {
		error = -ERESTARTSYS;
		schedule();
	}
	return (SET_ERROR(error));
}

static void
zvol_release(struct gendisk *disk, fmode_t mode)
{
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, RW_READER);
	zv = disk->private_data;

	mutex_enter(&zv->zv_state_lock);
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	zv->zv_open_count--;
	if (zv->zv_open_count == 0) {
		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
}

static int
zvol_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned int cmd, unsigned long arg)
{
	zvol_state_t *zv = bdev->bd_disk->private_data;
	int error = 0;

	ASSERT3U(zv->zv_open_count, >, 0);

	switch (cmd) {
	case BLKFLSBUF:
		fsync_bdev(bdev);
		invalidate_bdev(bdev);
		rw_enter(&zv->zv_suspend_lock, RW_READER);

		if (!(zv->zv_flags & ZVOL_RDONLY))
			txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);

		rw_exit(&zv->zv_suspend_lock);
		break;

	case BLKZNAME:
		mutex_enter(&zv->zv_state_lock);
		error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
		mutex_exit(&zv->zv_state_lock);
		break;

	default:
		error = -ENOTTY;
		break;
	}

	return (SET_ERROR(error));
}

#ifdef CONFIG_COMPAT
static int
zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned cmd, unsigned long arg)
{
	return (zvol_ioctl(bdev, mode, cmd, arg));
}
#else
#define	zvol_compat_ioctl	NULL
#endif

static unsigned int
zvol_check_events(struct gendisk *disk, unsigned int clearing)
{
	unsigned int mask = 0;

	rw_enter(&zvol_state_lock, RW_READER);

	zvol_state_t *zv = disk->private_data;
	if (zv != NULL) {
		mutex_enter(&zv->zv_state_lock);
		mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
		zv->zv_changed = 0;
		mutex_exit(&zv->zv_state_lock);
	}

	rw_exit(&zvol_state_lock);

	return (mask);
}

static int
zvol_revalidate_disk(struct gendisk *disk)
{
	rw_enter(&zvol_state_lock, RW_READER);

	zvol_state_t *zv = disk->private_data;
	if (zv != NULL) {
		mutex_enter(&zv->zv_state_lock);
		set_capacity(zv->zv_zso->zvo_disk,
		    zv->zv_volsize >> SECTOR_BITS);
		mutex_exit(&zv->zv_state_lock);
	}

	rw_exit(&zvol_state_lock);

	return (0);
}

static int
zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
{

#ifdef HAVE_REVALIDATE_DISK_SIZE
	revalidate_disk_size(zv->zv_zso->zvo_disk, false);
#else
	revalidate_disk(zv->zv_zso->zvo_disk);
#endif
	return (0);
}

static void
zvol_clear_private(zvol_state_t *zv)
{
	/*
	 * Cleared while holding zvol_state_lock as a writer
	 * which will prevent zvol_open() from opening it.
	 */
	zv->zv_zso->zvo_disk->private_data = NULL;
}

/*
 * Provide a simple virtual geometry for legacy compatibility. For devices
 * smaller than 1 MiB a small head and sector count is used to allow very
 * tiny devices. For devices over 1 MiB a standard head and sector count
 * is used to keep the cylinders count reasonable.
 */
static int
zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	zvol_state_t *zv = bdev->bd_disk->private_data;
	sector_t sectors;

	ASSERT3U(zv->zv_open_count, >, 0);

	sectors = get_capacity(zv->zv_zso->zvo_disk);

	if (sectors > 2048) {
		geo->heads = 16;
		geo->sectors = 63;
	} else {
		geo->heads = 2;
		geo->sectors = 4;
	}

	geo->start = 0;
	geo->cylinders = sectors / (geo->heads * geo->sectors);

	return (0);
}
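
/*
 * For illustration: a 1 GiB zvol reports 2097152 sectors, giving
 * heads = 16, sectors = 63, and cylinders = 2097152 / (16 * 63) = 2080.
 * The 2048-sector threshold above corresponds to the 1 MiB boundary.
 */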

/*
 * Find a zvol_state_t given the full major+minor dev_t. If found,
 * return with zv_state_lock taken, otherwise, return (NULL) without
 * taking zv_state_lock.
 */
static zvol_state_t *
zvol_find_by_dev(dev_t dev)
{
	zvol_state_t *zv;

	rw_enter(&zvol_state_lock, RW_READER);
	for (zv = list_head(&zvol_state_list); zv != NULL;
	    zv = list_next(&zvol_state_list, zv)) {
		mutex_enter(&zv->zv_state_lock);
		if (zv->zv_zso->zvo_dev == dev) {
			rw_exit(&zvol_state_lock);
			return (zv);
		}
		mutex_exit(&zv->zv_state_lock);
	}
	rw_exit(&zvol_state_lock);

	return (NULL);
}

static struct kobject *
zvol_probe(dev_t dev, int *part, void *arg)
{
	zvol_state_t *zv;
	struct kobject *kobj;

	zv = zvol_find_by_dev(dev);
	kobj = zv ? get_disk_and_module(zv->zv_zso->zvo_disk) : NULL;
	ASSERT(zv == NULL || MUTEX_HELD(&zv->zv_state_lock));
	if (zv)
		mutex_exit(&zv->zv_state_lock);

	return (kobj);
}

static struct block_device_operations zvol_ops = {
	.open = zvol_open,
	.release = zvol_release,
	.ioctl = zvol_ioctl,
	.compat_ioctl = zvol_compat_ioctl,
	.check_events = zvol_check_events,
	.revalidate_disk = zvol_revalidate_disk,
	.getgeo = zvol_getgeo,
	.owner = THIS_MODULE,
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
	.submit_bio = zvol_submit_bio,
#endif
};

/*
 * Allocate memory for a new zvol_state_t and setup the required
 * request queue and generic disk structures for the block device.
 */
static zvol_state_t *
zvol_alloc(dev_t dev, const char *name)
{
	zvol_state_t *zv;
	struct zvol_state_os *zso;
	uint64_t volmode;

	if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
		return (NULL);

	if (volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;

	if (volmode == ZFS_VOLMODE_NONE)
		return (NULL);

	zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
	zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_zso = zso;
	zv->zv_volmode = volmode;

	list_link_init(&zv->zv_next);
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);

#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
	zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
#else
	zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
#endif
	if (zso->zvo_queue == NULL)
		goto out_kmem;

	blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE);

	/* Limit read-ahead to a single page to prevent over-prefetching. */
	blk_queue_set_read_ahead(zso->zvo_queue, 1);

	/* Disable write merging in favor of the ZIO pipeline. */
	blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);

	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
	if (zso->zvo_disk == NULL)
		goto out_queue;

	zso->zvo_queue->queuedata = zv;
	zso->zvo_dev = dev;
	zv->zv_open_count = 0;
	strlcpy(zv->zv_name, name, MAXNAMELEN);

	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);

	zso->zvo_disk->major = zvol_major;
	zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE;

	if (volmode == ZFS_VOLMODE_DEV) {
		/*
		 * ZFS_VOLMODE_DEV disables partitioning on ZVOL devices: set
		 * gendisk->minors = 1 as noted in include/linux/genhd.h.
		 * Also disable extended partition numbers (GENHD_FL_EXT_DEVT)
		 * and suppress partition scanning (GENHD_FL_NO_PART_SCAN) by
		 * setting gendisk->flags accordingly.
		 */
		zso->zvo_disk->minors = 1;
#if defined(GENHD_FL_EXT_DEVT)
		zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT;
#endif
#if defined(GENHD_FL_NO_PART_SCAN)
		zso->zvo_disk->flags |= GENHD_FL_NO_PART_SCAN;
#endif
	}
	zso->zvo_disk->first_minor = (dev & MINORMASK);
	zso->zvo_disk->fops = &zvol_ops;
	zso->zvo_disk->private_data = zv;
	zso->zvo_disk->queue = zso->zvo_queue;
	snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
	    ZVOL_DEV_NAME, (dev & MINORMASK));

	return (zv);

out_queue:
	blk_cleanup_queue(zso->zvo_queue);
out_kmem:
	kmem_free(zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	return (NULL);
}

/*
 * Cleanup then free a zvol_state_t which was created by zvol_alloc().
 * At this time, the structure is not opened by anyone, is taken off
 * the zvol_state_list, and has its private data set to NULL.
 * The zvol_state_lock is dropped.
 *
 * This function may take many milliseconds to complete (e.g. we've seen
 * it take over 256ms), due to the calls to "blk_cleanup_queue" and
 * "del_gendisk". Thus, consumers need to be careful to account for this
 * latency when calling this function.
 */
static void
zvol_free(zvol_state_t *zv)
{

	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);
	ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	del_gendisk(zv->zv_zso->zvo_disk);
	blk_cleanup_queue(zv->zv_zso->zvo_queue);
	put_disk(zv->zv_zso->zvo_disk);

	ida_simple_remove(&zvol_ida,
	    MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);

	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);

	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
}

void
zvol_wait_close(zvol_state_t *zv)
{
}

/*
 * Create a block device minor node and setup the linkage between it
 * and the specified volume. Once this function returns the block
 * device is live and ready for use.
 */
static int
zvol_os_create_minor(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t len;
	unsigned minor = 0;
	int error = 0;
	int idx;
	uint64_t hash = zvol_name_hash(name);

	if (zvol_inhibit_dev)
		return (0);

	idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
	if (idx < 0)
		return (SET_ERROR(-idx));
	minor = idx << ZVOL_MINOR_BITS;

	zv = zvol_find_by_name_hash(name, hash, RW_NONE);
	if (zv) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		ida_simple_remove(&zvol_ida, idx);
		return (SET_ERROR(EEXIST));
	}

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	zv = zvol_alloc(MKDEV(zvol_major, minor), name);
	if (zv == NULL) {
		error = SET_ERROR(EAGAIN);
		goto out_dmu_objset_disown;
	}
	zv->zv_hash = hash;

	if (dmu_objset_is_snapshot(os))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);

	blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue,
	    (DMU_MAX_ACCESS / 4) >> 9);
	blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX);
	blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX);
	blk_queue_physical_block_size(zv->zv_zso->zvo_queue,
	    zv->zv_volblocksize);
	blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize);
	blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue,
	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
	blk_queue_discard_granularity(zv->zv_zso->zvo_queue,
	    zv->zv_volblocksize);
	blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
#ifdef QUEUE_FLAG_NONROT
	blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue);
#endif
#ifdef QUEUE_FLAG_ADD_RANDOM
	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue);
#endif
	/* This flag was introduced in kernel version 4.12. */
#ifdef QUEUE_FLAG_SCSI_PASSTHROUGH
	blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
#endif

	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			zil_destroy(dmu_objset_zil(os), B_FALSE);
		else
			zil_replay(os, zv, zvol_replay_vector);
	}
	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);

	/*
	 * When udev detects the addition of the device it will immediately
	 * invoke blkid(8) to determine the type of content on the device.
	 * Prefetching the blocks commonly scanned by blkid(8) will speed
	 * up this process.
	 */
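	/*
	 * With the default zvol_prefetch_bytes of 128 KiB, the first and
	 * last 128 KiB of the volume are prefetched; those are the regions
	 * blkid(8) probes for filesystem and partition signatures.
	 */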
	len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE);
	if (len > 0) {
		dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
		dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
		    ZIO_PRIORITY_SYNC_READ);
	}

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));

	/*
	 * Keep in mind that once add_disk() is called, the zvol is
	 * announced to the world, and zvol_open()/zvol_release() can
	 * be called at any time. Incidentally, add_disk() itself calls
	 * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
	 * directly as well.
	 */
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		rw_exit(&zvol_state_lock);
		add_disk(zv->zv_zso->zvo_disk);
	} else {
		ida_simple_remove(&zvol_ida, idx);
	}

	return (error);
}

static void
zvol_rename_minor(zvol_state_t *zv, const char *newname)
{
	int readonly = get_disk_ro(zv->zv_zso->zvo_disk);

	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));

	/* move to new hashtable entry */
	zv->zv_hash = zvol_name_hash(zv->zv_name);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	/*
	 * The block device's read-only state is briefly changed causing
	 * a KOBJ_CHANGE uevent to be issued. This ensures udev detects
	 * the name change and fixes the symlinks. This does not change
	 * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
	 * changes. This would normally be done using kobject_uevent() but
	 * that is a GPL-only symbol which is why we need this workaround.
	 */
	set_disk_ro(zv->zv_zso->zvo_disk, !readonly);
	set_disk_ro(zv->zv_zso->zvo_disk, readonly);
}

static void
zvol_set_disk_ro_impl(zvol_state_t *zv, int flags)
{

	set_disk_ro(zv->zv_zso->zvo_disk, flags);
}

static void
zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity)
{

	set_capacity(zv->zv_zso->zvo_disk, capacity);
}

static const zvol_platform_ops_t zvol_linux_ops = {
	.zv_free = zvol_free,
	.zv_rename_minor = zvol_rename_minor,
	.zv_create_minor = zvol_os_create_minor,
	.zv_update_volsize = zvol_update_volsize,
	.zv_clear_private = zvol_clear_private,
	.zv_is_zvol = zvol_is_zvol_impl,
	.zv_set_disk_ro = zvol_set_disk_ro_impl,
	.zv_set_capacity = zvol_set_capacity_impl,
};

int
zvol_init(void)
{
	int error;
	int threads = MIN(MAX(zvol_threads, 1), 1024);

	error = register_blkdev(zvol_major, ZVOL_DRIVER);
	if (error) {
		printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
		return (error);
	}
	zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri,
	    threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
	if (zvol_taskq == NULL) {
		unregister_blkdev(zvol_major, ZVOL_DRIVER);
		return (-ENOMEM);
	}
	zvol_init_impl();
	blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS,
	    THIS_MODULE, zvol_probe, NULL, NULL);

	ida_init(&zvol_ida);
	zvol_register_ops(&zvol_linux_ops);
	return (0);
}

void
zvol_fini(void)
{
	zvol_fini_impl();
	blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS);
	unregister_blkdev(zvol_major, ZVOL_DRIVER);
	taskq_destroy(zvol_taskq);
	ida_destroy(&zvol_ida);
}

/* BEGIN CSTYLED */
module_param(zvol_inhibit_dev, uint, 0644);
MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");

module_param(zvol_major, uint, 0444);
MODULE_PARM_DESC(zvol_major, "Major number for zvol device");

module_param(zvol_threads, uint, 0444);
MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests");

module_param(zvol_request_sync, uint, 0644);
MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");

module_param(zvol_max_discard_blocks, ulong, 0444);
MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");

module_param(zvol_prefetch_bytes, uint, 0644);
MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");

module_param(zvol_volmode, uint, 0644);
MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
/* END CSTYLED */
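
/*
 * These parameters can be set at module load time, e.g.
 * "modprobe zfs zvol_threads=64", and the ones with writable (0644)
 * permissions can also be changed at runtime through
 * /sys/module/zfs/parameters/<name>; zvol_major, zvol_threads, and
 * zvol_max_discard_blocks are read-only (0444) once the module is loaded.
 */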