/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 */

#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
#include <sys/dmu_tx.h>
#include <sys/zio.h>
#include <sys/zfs_rlock.h>
#include <sys/spa_impl.h>
#include <sys/zvol.h>
#include <sys/zvol_impl.h>

#include <linux/blkdev_compat.h>
#include <linux/task_io_accounting_ops.h>

unsigned int zvol_major = ZVOL_MAJOR;
unsigned int zvol_request_sync = 0;
unsigned int zvol_prefetch_bytes = (128 * 1024);
unsigned long zvol_max_discard_blocks = 16384;
unsigned int zvol_threads = 32;

struct zvol_state_os {
	struct gendisk		*zvo_disk;	/* generic disk */
	struct request_queue	*zvo_queue;	/* request queue */
	dev_t			zvo_dev;	/* device id */
};

taskq_t *zvol_taskq;
static struct ida zvol_ida;
typedef struct zv_request {
	zvol_state_t	*zv;
	struct bio	*bio;
	taskq_ent_t	ent;
} zv_request_t;

/*
 * Given a path, return TRUE if path is a ZVOL.
 */
static boolean_t
zvol_is_zvol_impl(const char *path)
{
	dev_t dev = 0;

	if (vdev_lookup_bdev(path, &dev) != 0)
		return (B_FALSE);

	if (MAJOR(dev) == zvol_major)
		return (B_TRUE);

	return (B_FALSE);
}

static void
zvol_write(void *arg)
{
	zv_request_t *zvr = arg;
	struct bio *bio = zvr->bio;
	int error = 0;
	uio_t uio;

	uio_bvec_init(&uio, bio);

	zvol_state_t *zv = zvr->zv;
	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);
	ASSERT3P(zv->zv_zilog, !=, NULL);

	/* Bios marked as FLUSH need to flush before the write. */
	if (bio_is_flush(bio))
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	/* Some requests are just for flush and nothing else. */
	if (uio.uio_resid == 0) {
		rw_exit(&zv->zv_suspend_lock);
		BIO_END_IO(bio, 0);
		kmem_free(zvr, sizeof (zv_request_t));
		return;
	}

	ssize_t start_resid = uio.uio_resid;
	unsigned long start_jif = jiffies;
	blk_generic_start_io_acct(zv->zv_zso->zvo_queue, WRITE,
	    bio_sectors(bio), &zv->zv_zso->zvo_disk->part0);

	boolean_t sync =
	    bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    uio.uio_loffset, uio.uio_resid, RL_WRITER);

	uint64_t volsize = zv->zv_volsize;
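	/*
	 * Split the write into chunks of at most DMU_MAX_ACCESS / 2
	 * bytes, each in its own transaction, so no single dmu_tx_t
	 * holds an excessive amount of dirty data.
	 */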
	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
		uint64_t off = uio.uio_loffset;
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);

		/* This will only fail for ENOSPC */
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0) {
			zvol_log_write(zv, tx, off, bytes, sync);
		}
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);

	int64_t nwritten = start_resid - uio.uio_resid;
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	task_io_account_write(nwritten);

	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	rw_exit(&zv->zv_suspend_lock);
	blk_generic_end_io_acct(zv->zv_zso->zvo_queue,
	    WRITE, &zv->zv_zso->zvo_disk->part0, start_jif);
	BIO_END_IO(bio, -error);
	kmem_free(zvr, sizeof (zv_request_t));
}

static void
zvol_discard(void *arg)
{
	zv_request_t *zvr = arg;
	struct bio *bio = zvr->bio;
	zvol_state_t *zv = zvr->zv;
	uint64_t start = BIO_BI_SECTOR(bio) << 9;
	uint64_t size = BIO_BI_SIZE(bio);
	uint64_t end = start + size;
	boolean_t sync;
	int error = 0;
	dmu_tx_t *tx;
	unsigned long start_jif;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);
	ASSERT3P(zv->zv_zilog, !=, NULL);

	start_jif = jiffies;
	blk_generic_start_io_acct(zv->zv_zso->zvo_queue, WRITE,
	    bio_sectors(bio), &zv->zv_zso->zvo_disk->part0);

	sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	if (end > zv->zv_volsize) {
		error = SET_ERROR(EIO);
		goto unlock;
	}

	/*
	 * Align the request to volume block boundaries when a secure erase
	 * is not required. This will prevent dnode_free_range() from zeroing
	 * out the unaligned parts, which is slow (read-modify-write) and
	 * useless since we are not freeing any space by doing so.
	 */
	if (!bio_is_secure_erase(bio)) {
		start = P2ROUNDUP(start, zv->zv_volblocksize);
		end = P2ALIGN(end, zv->zv_volblocksize);
		size = end - start;
	}

	if (start >= end)
		goto unlock;

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    start, size, RL_WRITER);
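
	/*
	 * The transaction only needs to cover logging the truncate;
	 * dmu_tx_mark_netfree() notes that the operation frees space,
	 * allowing the assignment to succeed even on a nearly full pool.
	 */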
	tx = dmu_tx_create(zv->zv_objset);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
	} else {
		zvol_log_truncate(zv, tx, start, size, B_TRUE);
		dmu_tx_commit(tx);
		error = dmu_free_long_range(zv->zv_objset,
		    ZVOL_OBJ, start, size);
	}
	zfs_rangelock_exit(lr);

	if (error == 0 && sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

unlock:
	rw_exit(&zv->zv_suspend_lock);
	blk_generic_end_io_acct(zv->zv_zso->zvo_queue, WRITE,
	    &zv->zv_zso->zvo_disk->part0, start_jif);
	BIO_END_IO(bio, -error);
	kmem_free(zvr, sizeof (zv_request_t));
}

static void
zvol_read(void *arg)
{
	zv_request_t *zvr = arg;
	struct bio *bio = zvr->bio;
	int error = 0;
	uio_t uio;

	uio_bvec_init(&uio, bio);

	zvol_state_t *zv = zvr->zv;
	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);

	ssize_t start_resid = uio.uio_resid;
	unsigned long start_jif = jiffies;
	blk_generic_start_io_acct(zv->zv_zso->zvo_queue, READ, bio_sectors(bio),
	    &zv->zv_zso->zvo_disk->part0);

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    uio.uio_loffset, uio.uio_resid, RL_READER);

	uint64_t volsize = zv->zv_volsize;
	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);

		/* don't read past the end */
		if (bytes > volsize - uio.uio_loffset)
			bytes = volsize - uio.uio_loffset;

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);

	int64_t nread = start_resid - uio.uio_resid;
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
	task_io_account_read(nread);

	rw_exit(&zv->zv_suspend_lock);
	blk_generic_end_io_acct(zv->zv_zso->zvo_queue, READ,
	    &zv->zv_zso->zvo_disk->part0, start_jif);
	BIO_END_IO(bio, -error);
	kmem_free(zvr, sizeof (zv_request_t));
}

#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
static blk_qc_t
zvol_submit_bio(struct bio *bio)
#else
static MAKE_REQUEST_FN_RET
zvol_request(struct request_queue *q, struct bio *bio)
#endif
{
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
	struct request_queue *q = bio->bi_disk->queue;
#endif
	zvol_state_t *zv = q->queuedata;
	fstrans_cookie_t cookie = spl_fstrans_mark();
	uint64_t offset = BIO_BI_SECTOR(bio) << 9;
	uint64_t size = BIO_BI_SIZE(bio);
	int rw = bio_data_dir(bio);
	zv_request_t *zvr;

	if (bio_has_data(bio) && offset + size > zv->zv_volsize) {
		printk(KERN_INFO
		    "%s: bad access: offset=%llu, size=%lu\n",
		    zv->zv_zso->zvo_disk->disk_name,
		    (long long unsigned)offset,
		    (long unsigned)size);

		BIO_END_IO(bio, -SET_ERROR(EIO));
		goto out;
	}

	if (rw == WRITE) {
		if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
			BIO_END_IO(bio, -SET_ERROR(EROFS));
			goto out;
		}

		/*
		 * Prevents the zvol from being suspended, or the ZIL being
		 * concurrently opened. Will be released after the i/o
		 * completes.
		 */
		rw_enter(&zv->zv_suspend_lock, RW_READER);

		/*
		 * Open a ZIL if this is the first time we have written to this
		 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
		 * than zv_state_lock so that we don't need to acquire an
		 * additional lock in this path.
		 */
		if (zv->zv_zilog == NULL) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
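			/*
			 * Re-check under the write lock: another thread may
			 * have opened the ZIL while zv_suspend_lock was
			 * dropped for the upgrade.
			 */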
			if (zv->zv_zilog == NULL) {
				zv->zv_zilog = zil_open(zv->zv_objset,
				    zvol_get_data);
				zv->zv_flags |= ZVOL_WRITTEN_TO;
			}
			rw_downgrade(&zv->zv_suspend_lock);
		}

		zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP);
		zvr->zv = zv;
		zvr->bio = bio;
		taskq_init_ent(&zvr->ent);

		/*
		 * We don't want this thread to be blocked waiting for i/o to
		 * complete, so we instead wait from a taskq callback. The
		 * i/o may be a ZIL write (via zil_commit()), or a read of an
		 * indirect block, or a read of a data block (if this is a
		 * partial-block write). We will indicate that the i/o is
		 * complete by calling BIO_END_IO() from the taskq callback.
		 *
		 * This design allows the calling thread to continue and
		 * initiate more concurrent operations by calling
		 * zvol_request() again. There are typically only a small
		 * number of threads available to call zvol_request() (e.g.
		 * one per iSCSI target), so keeping the latency of
		 * zvol_request() low is important for performance.
		 *
		 * The zvol_request_sync module parameter allows this
		 * behavior to be altered, for performance evaluation
		 * purposes. If the callback blocks, setting
		 * zvol_request_sync=1 will result in much worse performance.
		 *
		 * We can have up to zvol_threads concurrent i/o's being
		 * processed for all zvols on the system. This is typically
		 * a vast improvement over the zvol_request_sync=1 behavior
		 * of one i/o at a time per zvol. However, an even better
		 * design would be for zvol_request() to initiate the zio
		 * directly, and then be notified by the zio_done callback,
		 * which would call BIO_END_IO(). Unfortunately, the DMU/ZIL
		 * interfaces lack this functionality (they block waiting for
		 * the i/o to complete).
		 */
		if (bio_is_discard(bio) || bio_is_secure_erase(bio)) {
			if (zvol_request_sync) {
				zvol_discard(zvr);
			} else {
				taskq_dispatch_ent(zvol_taskq,
				    zvol_discard, zvr, 0, &zvr->ent);
			}
		} else {
			if (zvol_request_sync) {
				zvol_write(zvr);
			} else {
				taskq_dispatch_ent(zvol_taskq,
				    zvol_write, zvr, 0, &zvr->ent);
			}
		}
	} else {
		/*
		 * The SCST driver, and possibly others, may issue READ I/Os
		 * with a length of zero bytes. These empty I/Os contain no
		 * data and require no additional handling.
		 */
		if (size == 0) {
			BIO_END_IO(bio, 0);
			goto out;
		}

		zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP);
		zvr->zv = zv;
		zvr->bio = bio;
		taskq_init_ent(&zvr->ent);

		rw_enter(&zv->zv_suspend_lock, RW_READER);

		/* See comment in WRITE case above. */
		if (zvol_request_sync) {
			zvol_read(zvr);
		} else {
			taskq_dispatch_ent(zvol_taskq,
			    zvol_read, zvr, 0, &zvr->ent);
		}
	}

out:
	spl_fstrans_unmark(cookie);
#if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
	defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
	return (BLK_QC_T_NONE);
#endif
}

static int
zvol_open(struct block_device *bdev, fmode_t flag)
{
	zvol_state_t *zv;
	int error = 0;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, RW_READER);
	/*
	 * Obtain a copy of private_data under the zvol_state_lock to make
	 * sure that either the result of the zvol free code path setting
	 * bdev->bd_disk->private_data to NULL is observed, or zvol_free()
	 * is not called on this zv because of the positive zv_open_count.
	 */
	zv = bdev->bd_disk->private_data;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(-ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	/*
	 * Make sure the zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
		error = -zvol_first_open(zv, !(flag & FMODE_WRITE));
		if (error)
			goto out_mutex;
	}

	if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		error = -EROFS;
		goto out_open_count;
	}

	zv->zv_open_count++;

	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);

	zfs_check_media_change(bdev);

	return (0);

out_open_count:
	if (zv->zv_open_count == 0)
		zvol_last_close(zv);

out_mutex:
	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
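	/*
	 * If the open was interrupted by a signal, report -ERESTARTSYS so
	 * the call can be transparently restarted, and call schedule() to
	 * give the thread holding the contended lock a chance to run.
	 */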
	if (error == -EINTR) {
		error = -ERESTARTSYS;
		schedule();
	}
	return (SET_ERROR(error));
}

static void
zvol_release(struct gendisk *disk, fmode_t mode)
{
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, RW_READER);
	zv = disk->private_data;

	mutex_enter(&zv->zv_state_lock);
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * Make sure the zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	zv->zv_open_count--;
	if (zv->zv_open_count == 0) {
		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
}

static int
zvol_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned int cmd, unsigned long arg)
{
	zvol_state_t *zv = bdev->bd_disk->private_data;
	int error = 0;

	ASSERT3U(zv->zv_open_count, >, 0);

	switch (cmd) {
	case BLKFLSBUF:
		fsync_bdev(bdev);
		invalidate_bdev(bdev);
		rw_enter(&zv->zv_suspend_lock, RW_READER);

		if (!(zv->zv_flags & ZVOL_RDONLY))
			txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);

		rw_exit(&zv->zv_suspend_lock);
		break;
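
	/*
	 * BLKZNAME is a ZFS-private ioctl which returns the name of the
	 * dataset backing this device; the zvol_id udev helper uses it
	 * to create the /dev/zvol/<pool>/<volume> symlinks.
	 */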
	case BLKZNAME:
		mutex_enter(&zv->zv_state_lock);
		error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
		mutex_exit(&zv->zv_state_lock);
		break;

	default:
		error = -ENOTTY;
		break;
	}

	return (SET_ERROR(error));
}

#ifdef CONFIG_COMPAT
static int
zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned cmd, unsigned long arg)
{
	return (zvol_ioctl(bdev, mode, cmd, arg));
}
#else
#define	zvol_compat_ioctl	NULL
#endif

static unsigned int
zvol_check_events(struct gendisk *disk, unsigned int clearing)
{
	unsigned int mask = 0;

	rw_enter(&zvol_state_lock, RW_READER);

	zvol_state_t *zv = disk->private_data;
	if (zv != NULL) {
		mutex_enter(&zv->zv_state_lock);
		mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
		zv->zv_changed = 0;
		mutex_exit(&zv->zv_state_lock);
	}

	rw_exit(&zvol_state_lock);

	return (mask);
}

static int
zvol_revalidate_disk(struct gendisk *disk)
{
	rw_enter(&zvol_state_lock, RW_READER);

	zvol_state_t *zv = disk->private_data;
	if (zv != NULL) {
		mutex_enter(&zv->zv_state_lock);
		set_capacity(zv->zv_zso->zvo_disk,
		    zv->zv_volsize >> SECTOR_BITS);
		mutex_exit(&zv->zv_state_lock);
	}

	rw_exit(&zvol_state_lock);

	return (0);
}

static int
zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	struct gendisk *disk = zv->zv_zso->zvo_disk;
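
	/*
	 * Newer kernels dropped revalidate_disk() in favor of
	 * revalidate_disk_size(); the HAVE_REVALIDATE_DISK_SIZE configure
	 * check selects whichever interface this kernel provides.
	 */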
#ifdef HAVE_REVALIDATE_DISK_SIZE
	revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0);
#else
	revalidate_disk(disk);
#endif
	return (0);
}

static void
zvol_clear_private(zvol_state_t *zv)
{
	/*
	 * Cleared while holding zvol_state_lock as a writer
	 * which will prevent zvol_open() from opening it.
	 */
	zv->zv_zso->zvo_disk->private_data = NULL;
}

/*
 * Provide a simple virtual geometry for legacy compatibility. For devices
 * smaller than 1 MiB a small head and sector count is used to allow very
 * tiny devices. For devices over 1 MiB a standard head and sector count
 * is used to keep the cylinders count reasonable.
 */
static int
zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	zvol_state_t *zv = bdev->bd_disk->private_data;
	sector_t sectors;

	ASSERT3U(zv->zv_open_count, >, 0);

	sectors = get_capacity(zv->zv_zso->zvo_disk);

	if (sectors > 2048) {
		geo->heads = 16;
		geo->sectors = 63;
	} else {
		geo->heads = 2;
		geo->sectors = 4;
	}

	geo->start = 0;
	geo->cylinders = sectors / (geo->heads * geo->sectors);

	return (0);
}

/*
 * Find a zvol_state_t given the full major+minor dev_t. If found,
 * return with zv_state_lock taken, otherwise, return (NULL) without
 * taking zv_state_lock.
 */
static zvol_state_t *
zvol_find_by_dev(dev_t dev)
{
	zvol_state_t *zv;

	rw_enter(&zvol_state_lock, RW_READER);
	for (zv = list_head(&zvol_state_list); zv != NULL;
	    zv = list_next(&zvol_state_list, zv)) {
		mutex_enter(&zv->zv_state_lock);
		if (zv->zv_zso->zvo_dev == dev) {
			rw_exit(&zvol_state_lock);
			return (zv);
		}
		mutex_exit(&zv->zv_state_lock);
	}
	rw_exit(&zvol_state_lock);

	return (NULL);
}

static struct kobject *
zvol_probe(dev_t dev, int *part, void *arg)
{
	zvol_state_t *zv;
	struct kobject *kobj;

	zv = zvol_find_by_dev(dev);
	kobj = zv ? get_disk_and_module(zv->zv_zso->zvo_disk) : NULL;
	ASSERT(zv == NULL || MUTEX_HELD(&zv->zv_state_lock));
	if (zv)
		mutex_exit(&zv->zv_state_lock);

	return (kobj);
}

static struct block_device_operations zvol_ops = {
	.open = zvol_open,
	.release = zvol_release,
	.ioctl = zvol_ioctl,
	.compat_ioctl = zvol_compat_ioctl,
	.check_events = zvol_check_events,
	.revalidate_disk = zvol_revalidate_disk,
	.getgeo = zvol_getgeo,
	.owner = THIS_MODULE,
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
	.submit_bio = zvol_submit_bio,
#endif
};

/*
 * Allocate memory for a new zvol_state_t and setup the required
 * request queue and generic disk structures for the block device.
 */
static zvol_state_t *
zvol_alloc(dev_t dev, const char *name)
{
	zvol_state_t *zv;
	struct zvol_state_os *zso;
	uint64_t volmode;

	if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
		return (NULL);

	if (volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;

	if (volmode == ZFS_VOLMODE_NONE)
		return (NULL);

	zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
	zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_zso = zso;
	zv->zv_volmode = volmode;

	list_link_init(&zv->zv_next);
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);

#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
	zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
#else
	zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
#endif
	if (zso->zvo_queue == NULL)
		goto out_kmem;

	blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE);

	/* Limit read-ahead to a single page to prevent over-prefetching. */
	blk_queue_set_read_ahead(zso->zvo_queue, 1);

	/* Disable write merging in favor of the ZIO pipeline. */
	blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);

	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
	if (zso->zvo_disk == NULL)
		goto out_queue;

	zso->zvo_queue->queuedata = zv;
	zso->zvo_dev = dev;
	zv->zv_open_count = 0;
	strlcpy(zv->zv_name, name, MAXNAMELEN);

	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);

	zso->zvo_disk->major = zvol_major;
	zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE;

	if (volmode == ZFS_VOLMODE_DEV) {
		/*
		 * ZFS_VOLMODE_DEV disables partitioning on ZVOL devices: set
		 * gendisk->minors = 1 as noted in include/linux/genhd.h.
		 * Also disable extended partition numbers (GENHD_FL_EXT_DEVT)
		 * and suppress partition scanning (GENHD_FL_NO_PART_SCAN) by
		 * setting gendisk->flags accordingly.
		 */
		zso->zvo_disk->minors = 1;
#if defined(GENHD_FL_EXT_DEVT)
		zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT;
#endif
#if defined(GENHD_FL_NO_PART_SCAN)
		zso->zvo_disk->flags |= GENHD_FL_NO_PART_SCAN;
#endif
	}
	zso->zvo_disk->first_minor = (dev & MINORMASK);
	zso->zvo_disk->fops = &zvol_ops;
	zso->zvo_disk->private_data = zv;
	zso->zvo_disk->queue = zso->zvo_queue;
	snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
	    ZVOL_DEV_NAME, (dev & MINORMASK));

	return (zv);

out_queue:
	blk_cleanup_queue(zso->zvo_queue);
out_kmem:
	kmem_free(zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	return (NULL);
}

/*
 * Cleanup then free a zvol_state_t which was created by zvol_alloc().
 * At this time, the structure is not opened by anyone, is taken off
 * the zvol_state_list, and has its private data set to NULL.
 * The zvol_state_lock is dropped.
 *
 * This function may take many milliseconds to complete (e.g. we've seen
 * it take over 256ms), due to the calls to "blk_cleanup_queue" and
 * "del_gendisk". Thus, consumers need to be careful to account for this
 * latency when calling this function.
 */
static void
zvol_free(zvol_state_t *zv)
{

	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);
	ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	del_gendisk(zv->zv_zso->zvo_disk);
	blk_cleanup_queue(zv->zv_zso->zvo_queue);
	put_disk(zv->zv_zso->zvo_disk);

	ida_simple_remove(&zvol_ida,
	    MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);

	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);

	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
}

void
zvol_wait_close(zvol_state_t *zv)
{
}

/*
 * Create a block device minor node and setup the linkage between it
 * and the specified volume. Once this function returns the block
 * device is live and ready for use.
 */
static int
zvol_os_create_minor(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t len;
	unsigned minor = 0;
	int error = 0;
	int idx;
	uint64_t hash = zvol_name_hash(name);

	if (zvol_inhibit_dev)
		return (0);
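
	/*
	 * Each zvol claims a block of ZVOL_MINORS minor numbers (the whole
	 * device plus its possible partitions), so the allocated ida index
	 * is shifted by ZVOL_MINOR_BITS to obtain the first minor.
	 */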
	idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
	if (idx < 0)
		return (SET_ERROR(-idx));
	minor = idx << ZVOL_MINOR_BITS;

	zv = zvol_find_by_name_hash(name, hash, RW_NONE);
	if (zv) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		ida_simple_remove(&zvol_ida, idx);
		return (SET_ERROR(EEXIST));
	}

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	zv = zvol_alloc(MKDEV(zvol_major, minor), name);
	if (zv == NULL) {
		error = SET_ERROR(EAGAIN);
		goto out_dmu_objset_disown;
	}
	zv->zv_hash = hash;

	if (dmu_objset_is_snapshot(os))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);
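
	/*
	 * Configure the block device queue limits to match the zvol's
	 * block size and the DMU's maximum request size.
	 */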
	blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue,
	    (DMU_MAX_ACCESS / 4) >> 9);
	blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX);
	blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX);
	blk_queue_physical_block_size(zv->zv_zso->zvo_queue,
	    zv->zv_volblocksize);
	blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize);
	blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue,
	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
	blk_queue_discard_granularity(zv->zv_zso->zvo_queue,
	    zv->zv_volblocksize);
	blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
#ifdef QUEUE_FLAG_NONROT
	blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue);
#endif
#ifdef QUEUE_FLAG_ADD_RANDOM
	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue);
#endif
	/* This flag was introduced in kernel version 4.12. */
#ifdef QUEUE_FLAG_SCSI_PASSTHROUGH
	blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
#endif

	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			zil_destroy(dmu_objset_zil(os), B_FALSE);
		else
			zil_replay(os, zv, zvol_replay_vector);
	}
	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);

	/*
	 * When udev detects the addition of the device it will immediately
	 * invoke blkid(8) to determine the type of content on the device.
	 * Prefetching the blocks commonly scanned by blkid(8) will speed
	 * up this process.
	 */
	len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE);
	if (len > 0) {
		dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
		dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
		    ZIO_PRIORITY_SYNC_READ);
	}

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));

	/*
	 * Keep in mind that once add_disk() is called, the zvol is
	 * announced to the world, and zvol_open()/zvol_release() can
	 * be called at any time. Incidentally, add_disk() itself calls
	 * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
	 * directly as well.
	 */
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		rw_exit(&zvol_state_lock);
		add_disk(zv->zv_zso->zvo_disk);
	} else {
		ida_simple_remove(&zvol_ida, idx);
	}

	return (error);
}

static void
zvol_rename_minor(zvol_state_t *zv, const char *newname)
{
	int readonly = get_disk_ro(zv->zv_zso->zvo_disk);

	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));

	/* move to new hashtable entry */
	zv->zv_hash = zvol_name_hash(zv->zv_name);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	/*
	 * The block device's read-only state is briefly changed causing
	 * a KOBJ_CHANGE uevent to be issued. This ensures udev detects
	 * the name change and fixes the symlinks. This does not change
	 * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
	 * changes. This would normally be done using kobject_uevent() but
	 * that is a GPL-only symbol which is why we need this workaround.
	 */
	set_disk_ro(zv->zv_zso->zvo_disk, !readonly);
	set_disk_ro(zv->zv_zso->zvo_disk, readonly);
}

static void
zvol_set_disk_ro_impl(zvol_state_t *zv, int flags)
{

	set_disk_ro(zv->zv_zso->zvo_disk, flags);
}

static void
zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity)
{

	set_capacity(zv->zv_zso->zvo_disk, capacity);
}

static const zvol_platform_ops_t zvol_linux_ops = {
	.zv_free = zvol_free,
	.zv_rename_minor = zvol_rename_minor,
	.zv_create_minor = zvol_os_create_minor,
	.zv_update_volsize = zvol_update_volsize,
	.zv_clear_private = zvol_clear_private,
	.zv_is_zvol = zvol_is_zvol_impl,
	.zv_set_disk_ro = zvol_set_disk_ro_impl,
	.zv_set_capacity = zvol_set_capacity_impl,
};

int
zvol_init(void)
{
	int error;
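	/* Clamp the zvol_threads tunable to a sane range. */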
	int threads = MIN(MAX(zvol_threads, 1), 1024);

	error = register_blkdev(zvol_major, ZVOL_DRIVER);
	if (error) {
		printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
		return (error);
	}
	zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri,
	    threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
	if (zvol_taskq == NULL) {
		unregister_blkdev(zvol_major, ZVOL_DRIVER);
		return (-ENOMEM);
	}
	zvol_init_impl();
	blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS,
	    THIS_MODULE, zvol_probe, NULL, NULL);

	ida_init(&zvol_ida);
	zvol_register_ops(&zvol_linux_ops);
	return (0);
}

void
zvol_fini(void)
{
	zvol_fini_impl();
	blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS);
	unregister_blkdev(zvol_major, ZVOL_DRIVER);
	taskq_destroy(zvol_taskq);
	ida_destroy(&zvol_ida);
}

/* BEGIN CSTYLED */
module_param(zvol_inhibit_dev, uint, 0644);
MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");

module_param(zvol_major, uint, 0444);
MODULE_PARM_DESC(zvol_major, "Major number for zvol device");

module_param(zvol_threads, uint, 0444);
MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests");

module_param(zvol_request_sync, uint, 0644);
MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");

module_param(zvol_max_discard_blocks, ulong, 0444);
MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");

module_param(zvol_prefetch_bytes, uint, 0644);
MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");

module_param(zvol_volmode, uint, 0644);
MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
/* END CSTYLED */