/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 *
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 * /dev/<pool_name>/<dataset_name>
 *
 * Volumes are persistent through reboot and module load. No user command
 * needs to be run before opening and using a device.
 *
 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/dbuf.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
#include <sys/dmu_tx.h>
#include <sys/zio.h>
#include <sys/zfs_rlock.h>
#include <sys/zfs_znode.h>
#include <sys/zvol.h>
#include <linux/blkdev_compat.h>

unsigned int zvol_inhibit_dev = 0;
unsigned int zvol_major = ZVOL_MAJOR;
unsigned int zvol_prefetch_bytes = (128 * 1024);
unsigned long zvol_max_discard_blocks = 16384;

static kmutex_t zvol_state_lock;
static list_t zvol_state_list;
static char *zvol_tag = "zvol_tag";

/*
 * The in-core state of each volume.
 */
typedef struct zvol_state {
	char zv_name[MAXNAMELEN];	/* name */
	uint64_t zv_volsize;		/* advertised space */
	uint64_t zv_volblocksize;	/* volume block size */
	objset_t *zv_objset;		/* objset handle */
	uint32_t zv_flags;		/* ZVOL_* flags */
	uint32_t zv_open_count;		/* open counts */
	uint32_t zv_changed;		/* disk changed */
	zilog_t *zv_zilog;		/* ZIL handle */
	znode_t zv_znode;		/* for range locking */
	dmu_buf_t *zv_dbuf;		/* bonus handle */
	dev_t zv_dev;			/* device id */
	struct gendisk *zv_disk;	/* generic disk */
	struct request_queue *zv_queue;	/* request queue */
	list_node_t zv_next;		/* next zvol_state_t linkage */
} zvol_state_t;

#define	ZVOL_RDONLY	0x1

/*
 * Find the next available range of ZVOL_MINORS minor numbers. The
 * zvol_state_list is kept in ascending minor order so we simply need
 * to scan the list for the first gap in the sequence. This allows us
 * to recycle minor numbers as devices are created and removed.
 */
static int
zvol_find_minor(unsigned *minor)
{
	zvol_state_t *zv;

	*minor = 0;
	ASSERT(MUTEX_HELD(&zvol_state_lock));
	for (zv = list_head(&zvol_state_list); zv != NULL;
	    zv = list_next(&zvol_state_list, zv), *minor += ZVOL_MINORS) {
		if (MINOR(zv->zv_dev) != MINOR(*minor))
			break;
	}

	/* All minors are in use */
	if (*minor >= (1 << MINORBITS))
		return (SET_ERROR(ENXIO));

	return (0);
}
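
/*
 * Illustrative sketch (not compiled): how a minor number found above is
 * typically turned into a device number. Each zvol reserves a block of
 * ZVOL_MINORS minors for partitions, and the first minor of the block
 * is combined with zvol_major via MKDEV(), exactly as
 * __zvol_create_minor() does later in this file.
 */
#if 0
	unsigned minor = 0;

	mutex_enter(&zvol_state_lock);
	if (zvol_find_minor(&minor) == 0) {
		dev_t dev = MKDEV(zvol_major, minor);
		/* ... pass 'dev' to zvol_alloc(dev, name) ... */
	}
	mutex_exit(&zvol_state_lock);
#endif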

/*
 * Find a zvol_state_t given the full major+minor dev_t.
 */
static zvol_state_t *
zvol_find_by_dev(dev_t dev)
{
	zvol_state_t *zv;

	ASSERT(MUTEX_HELD(&zvol_state_lock));
	for (zv = list_head(&zvol_state_list); zv != NULL;
	    zv = list_next(&zvol_state_list, zv)) {
		if (zv->zv_dev == dev)
			return (zv);
	}

	return (NULL);
}

/*
 * Find a zvol_state_t given the name provided at zvol_alloc() time.
 */
static zvol_state_t *
zvol_find_by_name(const char *name)
{
	zvol_state_t *zv;

	ASSERT(MUTEX_HELD(&zvol_state_lock));
	for (zv = list_head(&zvol_state_list); zv != NULL;
	    zv = list_next(&zvol_state_list, zv)) {
		if (strncmp(zv->zv_name, name, MAXNAMELEN) == 0)
			return (zv);
	}

	return (NULL);
}

/*
 * Given a path, return TRUE if path is a ZVOL.
 */
boolean_t
zvol_is_zvol(const char *device)
{
	struct block_device *bdev;
	unsigned int major;

	bdev = lookup_bdev(device);
	if (IS_ERR(bdev))
		return (B_FALSE);

	major = MAJOR(bdev->bd_dev);
	bdput(bdev);

	if (major == zvol_major)
		return (B_TRUE);

	return (B_FALSE);
}

/*
 * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation.
 */
void
zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
{
	zfs_creat_t *zct = arg;
	nvlist_t *nvprops = zct->zct_props;
	int error;
	uint64_t volblocksize, volsize;

	VERIFY(nvlist_lookup_uint64(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
	if (nvlist_lookup_uint64(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
		volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);

	/*
	 * These properties must be removed from the list so the generic
	 * property setting step won't apply to them.
	 */
	VERIFY(nvlist_remove_all(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
	(void) nvlist_remove_all(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));

	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
	ASSERT(error == 0);
}
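
/*
 * Illustrative sketch (not compiled): the volume size recorded by
 * zvol_create_cb() above is read back from the same ZAP entry with the
 * matching zap_lookup() call used by zvol_get_stats() and
 * zvol_first_open() elsewhere in this file.
 */
#if 0
	uint64_t volsize;

	/* "size" is stored as a single 8-byte integer in ZVOL_ZAP_OBJ */
	int error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
#endif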

/*
 * ZFS_IOC_OBJSET_STATS entry point.
 */
int
zvol_get_stats(objset_t *os, nvlist_t *nv)
{
	int error;
	dmu_object_info_t *doi;
	uint64_t val;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
	if (error)
		return (SET_ERROR(error));

	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
	error = dmu_object_info(os, ZVOL_OBJ, doi);

	if (error == 0) {
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
		    doi->doi_data_block_size);
	}

	kmem_free(doi, sizeof (dmu_object_info_t));

	return (SET_ERROR(error));
}

static void
zvol_size_changed(zvol_state_t *zv, uint64_t volsize)
{
	struct block_device *bdev;

	bdev = bdget_disk(zv->zv_disk, 0);
	if (bdev == NULL)
		return;
	set_capacity(zv->zv_disk, volsize >> 9);
	zv->zv_volsize = volsize;
	check_disk_size_change(zv->zv_disk, bdev);

	bdput(bdev);
}

/*
 * Sanity check volume size.
 */
int
zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
{
	if (volsize == 0)
		return (SET_ERROR(EINVAL));

	if (volsize % blocksize != 0)
		return (SET_ERROR(EINVAL));

#ifdef _ILP32
	if (volsize - 1 > MAXOFFSET_T)
		return (SET_ERROR(EOVERFLOW));
#endif
	return (0);
}
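
/*
 * Worked example of the check above: a 1 GiB volume with an 8 KiB block
 * size is accepted because 1073741824 % 8192 == 0, while a volsize of
 * 1073741824 + 512 is rejected with EINVAL because it is not a whole
 * number of volume blocks.
 */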

/*
 * Ensure the zap is flushed then inform the VFS of the capacity change.
 */
static int
zvol_update_volsize(uint64_t volsize, objset_t *os)
{
	dmu_tx_t *tx;
	int error;

	ASSERT(MUTEX_HELD(&zvol_state_lock));

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (SET_ERROR(error));
	}

	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
	    &volsize, tx);
	dmu_tx_commit(tx);

	if (error == 0)
		error = dmu_free_long_range(os,
		    ZVOL_OBJ, volsize, DMU_OBJECT_END);

	return (error);
}

static int
zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize)
{
	zvol_size_changed(zv, volsize);

	/*
	 * We should post an event here describing the expansion. However,
	 * the zfs_ereport_post() interface doesn't nicely support posting
	 * events for zvols; it assumes events relate to vdevs or zios.
	 */

	return (0);
}

/*
 * ZFS_PROP_VOLSIZE set entry point.
 */
int
zvol_set_volsize(const char *name, uint64_t volsize)
{
	zvol_state_t *zv = NULL;
	objset_t *os = NULL;
	int error;
	dmu_object_info_t *doi;
	uint64_t readonly;
	boolean_t owned = B_FALSE;

	error = dsl_prop_get_integer(name,
	    zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
	if (error != 0)
		return (SET_ERROR(error));
	if (readonly)
		return (SET_ERROR(EROFS));

	mutex_enter(&zvol_state_lock);
	zv = zvol_find_by_name(name);

	if (zv == NULL || zv->zv_objset == NULL) {
		if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE,
		    FTAG, &os)) != 0) {
			mutex_exit(&zvol_state_lock);
			return (SET_ERROR(error));
		}
		owned = B_TRUE;
		if (zv != NULL)
			zv->zv_objset = os;
	} else {
		os = zv->zv_objset;
	}

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) ||
	    (error = zvol_check_volsize(volsize, doi->doi_data_block_size)))
		goto out;

	error = zvol_update_volsize(volsize, os);

	if (error == 0 && zv != NULL)
		error = zvol_update_live_volsize(zv, volsize);
out:
	/* doi is allocated on every path that can reach this label */
	kmem_free(doi, sizeof (dmu_object_info_t));
	if (owned) {
		dmu_objset_disown(os, FTAG);
		if (zv != NULL)
			zv->zv_objset = NULL;
	}
	mutex_exit(&zvol_state_lock);
	return (error);
}

/*
 * Sanity check volume block size.
 */
int
zvol_check_volblocksize(const char *name, uint64_t volblocksize)
{
	/* Record sizes above 128k need the feature to be enabled */
	if (volblocksize > SPA_OLD_MAXBLOCKSIZE) {
		spa_t *spa;
		int error;

		if ((error = spa_open(name, &spa, FTAG)) != 0)
			return (error);

		if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
			spa_close(spa, FTAG);
			return (SET_ERROR(ENOTSUP));
		}

		/*
		 * We don't allow setting the property above 1MB,
		 * unless the tunable has been changed.
		 */
		if (volblocksize > zfs_max_recordsize) {
			/* drop the spa reference before bailing out */
			spa_close(spa, FTAG);
			return (SET_ERROR(EDOM));
		}

		spa_close(spa, FTAG);
	}

	if (volblocksize < SPA_MINBLOCKSIZE ||
	    volblocksize > SPA_MAXBLOCKSIZE ||
	    !ISP2(volblocksize))
		return (SET_ERROR(EDOM));

	return (0);
}

/*
 * ZFS_PROP_VOLBLOCKSIZE set entry point.
 */
int
zvol_set_volblocksize(const char *name, uint64_t volblocksize)
{
	zvol_state_t *zv;
	dmu_tx_t *tx;
	int error;

	mutex_enter(&zvol_state_lock);

	zv = zvol_find_by_name(name);
	if (zv == NULL) {
		error = SET_ERROR(ENXIO);
		goto out;
	}

	if (zv->zv_flags & ZVOL_RDONLY) {
		error = SET_ERROR(EROFS);
		goto out;
	}

	tx = dmu_tx_create(zv->zv_objset);
	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
	} else {
		error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ,
		    volblocksize, 0, tx);
		if (error == ENOTSUP)
			error = SET_ERROR(EBUSY);
		dmu_tx_commit(tx);
		if (error == 0)
			zv->zv_volblocksize = volblocksize;
	}
out:
	mutex_exit(&zvol_state_lock);

	return (SET_ERROR(error));
}

/*
 * Replay a TX_TRUNCATE ZIL transaction if asked. TX_TRUNCATE is how we
 * implement DKIOCFREE/free-long-range.
 */
static int
zvol_replay_truncate(zvol_state_t *zv, lr_truncate_t *lr, boolean_t byteswap)
{
	uint64_t offset, length;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	offset = lr->lr_offset;
	length = lr->lr_length;

	return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length));
}

/*
 * Replay a TX_WRITE ZIL transaction that didn't get committed
 * after a system failure.
 */
static int
zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
{
	objset_t *os = zv->zv_objset;
	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
	uint64_t off, len;
	dmu_tx_t *tx;
	int error;

	/* Byteswap the record before reading any of its fields. */
	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	off = lr->lr_offset;
	len = lr->lr_length;

	tx = dmu_tx_create(os);
	dmu_tx_hold_write(tx, ZVOL_OBJ, off, len);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
	} else {
		dmu_write(os, ZVOL_OBJ, off, len, data, tx);
		dmu_tx_commit(tx);
	}

	return (SET_ERROR(error));
}

static int
zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
{
	return (SET_ERROR(ENOTSUP));
}

/*
 * Callback vectors for replaying records.
 * Only TX_WRITE and TX_TRUNCATE are needed for zvol.
 */
zil_replay_func_t zvol_replay_vector[TX_MAX_TYPE] = {
	(zil_replay_func_t)zvol_replay_err,	/* no such transaction type */
	(zil_replay_func_t)zvol_replay_err,	/* TX_CREATE */
	(zil_replay_func_t)zvol_replay_err,	/* TX_MKDIR */
	(zil_replay_func_t)zvol_replay_err,	/* TX_MKXATTR */
	(zil_replay_func_t)zvol_replay_err,	/* TX_SYMLINK */
	(zil_replay_func_t)zvol_replay_err,	/* TX_REMOVE */
	(zil_replay_func_t)zvol_replay_err,	/* TX_RMDIR */
	(zil_replay_func_t)zvol_replay_err,	/* TX_LINK */
	(zil_replay_func_t)zvol_replay_err,	/* TX_RENAME */
	(zil_replay_func_t)zvol_replay_write,	/* TX_WRITE */
	(zil_replay_func_t)zvol_replay_truncate, /* TX_TRUNCATE */
	(zil_replay_func_t)zvol_replay_err,	/* TX_SETATTR */
	(zil_replay_func_t)zvol_replay_err,	/* TX_ACL */
};
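
/*
 * Illustrative sketch (not compiled): zil_replay() dispatches each log
 * record by its transaction type, so indexing the table above with
 * TX_WRITE or TX_TRUNCATE yields the two real handlers while every
 * other slot resolves to zvol_replay_err().
 */
#if 0
	zil_replay_func_t replay_write_fn = zvol_replay_vector[TX_WRITE];
	zil_replay_func_t replay_trunc_fn = zvol_replay_vector[TX_TRUNCATE];
#endif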

/*
 * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
 *
 * We store data in the log buffers if it's small enough.
 * Otherwise we will later flush the data out via dmu_sync().
 */
ssize_t zvol_immediate_write_sz = 32768;

static void
zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
    uint64_t size, int sync)
{
	uint32_t blocksize = zv->zv_volblocksize;
	zilog_t *zilog = zv->zv_zilog;
	boolean_t slogging;
	ssize_t immediate_write_sz;

	if (zil_replaying(zilog, tx))
		return;

	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
	    ? 0 : zvol_immediate_write_sz;
	slogging = spa_has_slogs(zilog->zl_spa) &&
	    (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);

	while (size) {
		itx_t *itx;
		lr_write_t *lr;
		ssize_t len;
		itx_wr_state_t write_state;

		/*
		 * Unlike zfs_log_write() we can be called with
		 * up to DMU_MAX_ACCESS/2 (5MB) writes.
		 */
		if (blocksize > immediate_write_sz && !slogging &&
		    size >= blocksize && offset % blocksize == 0) {
			write_state = WR_INDIRECT; /* uses dmu_sync */
			len = blocksize;
		} else if (sync) {
			write_state = WR_COPIED;
			len = MIN(ZIL_MAX_LOG_DATA, size);
		} else {
			write_state = WR_NEED_COPY;
			len = MIN(ZIL_MAX_LOG_DATA, size);
		}

		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
		    (write_state == WR_COPIED ? len : 0));
		lr = (lr_write_t *)&itx->itx_lr;
		if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
		    ZVOL_OBJ, offset, len, lr + 1,
		    DMU_READ_NO_PREFETCH) != 0) {
			zil_itx_destroy(itx);
			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
			lr = (lr_write_t *)&itx->itx_lr;
			write_state = WR_NEED_COPY;
		}

		itx->itx_wr_state = write_state;
		if (write_state == WR_NEED_COPY)
			itx->itx_sod += len;
		lr->lr_foid = ZVOL_OBJ;
		lr->lr_offset = offset;
		lr->lr_length = len;
		lr->lr_blkoff = 0;
		BP_ZERO(&lr->lr_blkptr);

		itx->itx_private = zv;
		itx->itx_sync = sync;

		(void) zil_itx_assign(zilog, itx, tx);

		offset += len;
		size -= len;
	}
}
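
/*
 * Worked example of the write_state selection above, assuming the
 * default zvol_immediate_write_sz of 32768 and no separate log device:
 * with a 128 KiB volblocksize, a large block-aligned synchronous write
 * satisfies "blocksize > immediate_write_sz && !slogging", so it is
 * logged as a series of WR_INDIRECT itxs of one block each and the data
 * itself is flushed later via dmu_sync(). The same write with an 8 KiB
 * volblocksize falls through to WR_COPIED chunks of up to
 * ZIL_MAX_LOG_DATA bytes, with the data copied into the log record.
 */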

static int
zvol_write(struct bio *bio)
{
	zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
	uint64_t offset = BIO_BI_SECTOR(bio) << 9;
	uint64_t size = BIO_BI_SIZE(bio);
	int error = 0;
	dmu_tx_t *tx;
	rl_t *rl;
	uio_t uio;

	if (bio->bi_rw & VDEV_REQ_FLUSH)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	/*
	 * Some requests are just for flush and nothing else.
	 */
	if (size == 0)
		goto out;

	uio.uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)];
	uio.uio_skip = BIO_BI_SKIP(bio);
	uio.uio_resid = size;
	uio.uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio);
	uio.uio_loffset = offset;
	uio.uio_limit = MAXOFFSET_T;
	uio.uio_segflg = UIO_BVEC;

	rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER);

	tx = dmu_tx_create(zv->zv_objset);
	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, size);

	/* This will only fail for ENOSPC */
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		zfs_range_unlock(rl);
		goto out;
	}

	error = dmu_write_uio(zv->zv_objset, ZVOL_OBJ, &uio, size, tx);
	if (error == 0)
		zvol_log_write(zv, tx, offset, size,
		    !!(bio->bi_rw & VDEV_REQ_FUA));

	dmu_tx_commit(tx);
	zfs_range_unlock(rl);

	if ((bio->bi_rw & VDEV_REQ_FUA) ||
	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

out:
	return (error);
}

/*
 * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
 */
static void
zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
    boolean_t sync)
{
	itx_t *itx;
	lr_truncate_t *lr;
	zilog_t *zilog = zv->zv_zilog;

	if (zil_replaying(zilog, tx))
		return;

	itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
	lr = (lr_truncate_t *)&itx->itx_lr;
	lr->lr_foid = ZVOL_OBJ;
	lr->lr_offset = off;
	lr->lr_length = len;

	itx->itx_sync = sync;
	zil_itx_assign(zilog, itx, tx);
}

static int
zvol_discard(struct bio *bio)
{
	zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
	uint64_t start = BIO_BI_SECTOR(bio) << 9;
	uint64_t size = BIO_BI_SIZE(bio);
	uint64_t end = start + size;
	int error;
	rl_t *rl;
	dmu_tx_t *tx;

	if (end > zv->zv_volsize)
		return (SET_ERROR(EIO));

	/*
	 * Align the request to volume block boundaries when REQ_SECURE is
	 * available, but not requested. If we don't, then this will force
	 * dnode_free_range() to zero out the unaligned parts, which is slow
	 * (read-modify-write) and useless since we are not freeing any space
	 * by doing so. Kernels that do not support REQ_SECURE (2.6.32 through
	 * 2.6.35) will not receive this optimization.
	 */
#ifdef REQ_SECURE
	if (!(bio->bi_rw & REQ_SECURE)) {
		start = P2ROUNDUP(start, zv->zv_volblocksize);
		end = P2ALIGN(end, zv->zv_volblocksize);
		size = end - start;
	}
#endif

	if (start >= end)
		return (0);

	rl = zfs_range_lock(&zv->zv_znode, start, size, RL_WRITER);
	tx = dmu_tx_create(zv->zv_objset);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
	} else {
		zvol_log_truncate(zv, tx, start, size, B_TRUE);
		dmu_tx_commit(tx);
		error = dmu_free_long_range(zv->zv_objset,
		    ZVOL_OBJ, start, size);
	}

	zfs_range_unlock(rl);

	return (error);
}
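
/*
 * Worked example of the alignment above: with an 8 KiB volblocksize, a
 * non-secure discard of bytes [4096, 20480) becomes
 * start = P2ROUNDUP(4096, 8192) = 8192 and
 * end = P2ALIGN(20480, 8192) = 16384, so only the one fully-covered
 * block [8192, 16384) is freed and the partial blocks at either end are
 * left untouched rather than zeroed by dnode_free_range().
 */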

static int
zvol_read(struct bio *bio)
{
	zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
	uint64_t offset = BIO_BI_SECTOR(bio) << 9;
	uint64_t size = BIO_BI_SIZE(bio);
	int error;
	rl_t *rl;
	uio_t uio;

	if (size == 0)
		return (0);

	uio.uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)];
	uio.uio_skip = BIO_BI_SKIP(bio);
	uio.uio_resid = size;
	uio.uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio);
	uio.uio_loffset = offset;
	uio.uio_limit = MAXOFFSET_T;
	uio.uio_segflg = UIO_BVEC;

	rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);

	error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, &uio, size);

	zfs_range_unlock(rl);

	/* convert checksum errors into IO errors */
	if (error == ECKSUM)
		error = SET_ERROR(EIO);

	return (error);
}

static MAKE_REQUEST_FN_RET
zvol_request(struct request_queue *q, struct bio *bio)
{
	zvol_state_t *zv = q->queuedata;
	fstrans_cookie_t cookie = spl_fstrans_mark();
	uint64_t offset = BIO_BI_SECTOR(bio);
	unsigned int sectors = bio_sectors(bio);
	int rw = bio_data_dir(bio);
#ifdef HAVE_GENERIC_IO_ACCT
	unsigned long start = jiffies;
#endif
	int error = 0;

	if (bio_has_data(bio) && offset + sectors >
	    get_capacity(zv->zv_disk)) {
		printk(KERN_INFO
		    "%s: bad access: block=%llu, count=%lu\n",
		    zv->zv_disk->disk_name,
		    (long long unsigned)offset,
		    (long unsigned)sectors);
		error = SET_ERROR(EIO);
		goto out1;
	}

	generic_start_io_acct(rw, sectors, &zv->zv_disk->part0);

	if (rw == WRITE) {
		if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
			error = SET_ERROR(EROFS);
			goto out2;
		}

		if (bio->bi_rw & VDEV_REQ_DISCARD) {
			error = zvol_discard(bio);
			goto out2;
		}

		error = zvol_write(bio);
	} else
		error = zvol_read(bio);

out2:
	generic_end_io_acct(rw, &zv->zv_disk->part0, start);
out1:
	BIO_END_IO(bio, -error);
	spl_fstrans_unmark(cookie);
#ifdef HAVE_MAKE_REQUEST_FN_RET_INT
	return (0);
#elif defined(HAVE_MAKE_REQUEST_FN_RET_QC)
	return (BLK_QC_T_NONE);
#endif
}

static void
zvol_get_done(zgd_t *zgd, int error)
{
	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_range_unlock(zgd->zgd_rl);

	if (error == 0 && zgd->zgd_bp)
		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);

	kmem_free(zgd, sizeof (zgd_t));
}

/*
 * Get data to generate a TX_WRITE intent log record.
 */
static int
zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zvol_state_t *zv = arg;
	objset_t *os = zv->zv_objset;
	uint64_t object = ZVOL_OBJ;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	blkptr_t *bp = &lr->lr_blkptr;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error;

	ASSERT(zio != NULL);
	ASSERT(size != 0);

	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_zilog = zv->zv_zilog;
	zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		error = dmu_read(os, object, offset, size, buf,
		    DMU_READ_NO_PREFETCH);
	} else {
		size = zv->zv_volblocksize;
		offset = P2ALIGN_TYPED(offset, size, uint64_t);
		error = dmu_buf_hold(os, object, offset, zgd, &db,
		    DMU_READ_NO_PREFETCH);
		if (error == 0) {
			blkptr_t *obp = dmu_buf_get_blkptr(db);
			if (obp) {
				ASSERT(BP_IS_HOLE(bp));
				*bp = *obp;
			}

			zgd->zgd_db = db;
			zgd->zgd_bp = &lr->lr_blkptr;

			ASSERT(db != NULL);
			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zvol_get_done, zgd);

			if (error == 0)
				return (0);
		}
	}

	zvol_get_done(zgd, error);

	return (SET_ERROR(error));
}

/*
 * The zvol_state_t's are inserted in increasing MINOR(dev_t) order.
 */
static void
zvol_insert(zvol_state_t *zv_insert)
{
	zvol_state_t *zv = NULL;

	ASSERT(MUTEX_HELD(&zvol_state_lock));
	ASSERT3U(MINOR(zv_insert->zv_dev) & ZVOL_MINOR_MASK, ==, 0);
	for (zv = list_head(&zvol_state_list); zv != NULL;
	    zv = list_next(&zvol_state_list, zv)) {
		if (MINOR(zv->zv_dev) > MINOR(zv_insert->zv_dev))
			break;
	}

	list_insert_before(&zvol_state_list, zv, zv_insert);
}

/*
 * Remove the zvol from the list of zvols.
 */
static void
zvol_remove(zvol_state_t *zv_remove)
{
	ASSERT(MUTEX_HELD(&zvol_state_lock));
	list_remove(&zvol_state_list, zv_remove);
}

static int
zvol_first_open(zvol_state_t *zv)
{
	objset_t *os;
	uint64_t volsize;
	int locked = 0;
	int error;
	uint64_t ro;

	/*
	 * In all other cases the spa_namespace_lock is taken before the
	 * bdev->bd_mutex lock. But in this case the Linux __blkdev_get()
	 * function calls fops->open() with the bdev->bd_mutex lock held.
	 *
	 * To avoid a potential lock inversion deadlock we preemptively
	 * try to take the spa_namespace_lock(). Normally it will not
	 * be contended and this is safe because spa_open_common() handles
	 * the case where the caller already holds the spa_namespace_lock.
	 *
	 * When it is contended we risk a lock inversion if we were to
	 * block waiting for the lock. Luckily, the __blkdev_get()
	 * function allows us to return -ERESTARTSYS which will result in
	 * bdev->bd_mutex being dropped, reacquired, and fops->open() being
	 * called again. This process can be repeated safely until both
	 * locks are acquired.
	 */
	if (!mutex_owned(&spa_namespace_lock)) {
		locked = mutex_tryenter(&spa_namespace_lock);
		if (!locked)
			return (-SET_ERROR(ERESTARTSYS));
	}

	error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
	if (error)
		goto out_mutex;

	/* lie and say we're read-only */
	error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, zvol_tag, &os);
	if (error)
		goto out_mutex;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error) {
		dmu_objset_disown(os, zvol_tag);
		goto out_mutex;
	}

	zv->zv_objset = os;
	error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
	if (error) {
		dmu_objset_disown(os, zvol_tag);
		goto out_mutex;
	}

	set_capacity(zv->zv_disk, volsize >> 9);
	zv->zv_volsize = volsize;
	zv->zv_zilog = zil_open(os, zvol_get_data);

	if (ro || dmu_objset_is_snapshot(os) ||
	    !spa_writeable(dmu_objset_spa(os))) {
		set_disk_ro(zv->zv_disk, 1);
		zv->zv_flags |= ZVOL_RDONLY;
	} else {
		set_disk_ro(zv->zv_disk, 0);
		zv->zv_flags &= ~ZVOL_RDONLY;
	}

out_mutex:
	if (locked)
		mutex_exit(&spa_namespace_lock);

	return (SET_ERROR(-error));
}

static void
zvol_last_close(zvol_state_t *zv)
{
	zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;

	dmu_buf_rele(zv->zv_dbuf, zvol_tag);
	zv->zv_dbuf = NULL;

	/*
	 * Evict cached data
	 */
	if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
	    !(zv->zv_flags & ZVOL_RDONLY))
		txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
	(void) dmu_objset_evict_dbufs(zv->zv_objset);

	dmu_objset_disown(zv->zv_objset, zvol_tag);
	zv->zv_objset = NULL;
}

static int
zvol_open(struct block_device *bdev, fmode_t flag)
{
	zvol_state_t *zv = bdev->bd_disk->private_data;
	int error = 0, drop_mutex = 0;

	/*
	 * If the caller is already holding the mutex do not take it
	 * again, this will happen as part of zvol_create_minor().
	 * Once add_disk() is called the device is live and the kernel
	 * will attempt to open it to read the partition information.
	 */
	if (!mutex_owned(&zvol_state_lock)) {
		mutex_enter(&zvol_state_lock);
		drop_mutex = 1;
	}

	ASSERT3P(zv, !=, NULL);

	if (zv->zv_open_count == 0) {
		error = zvol_first_open(zv);
		if (error)
			goto out_mutex;
	}

	if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		error = -EROFS;
		goto out_open_count;
	}

	zv->zv_open_count++;

out_open_count:
	if (zv->zv_open_count == 0)
		zvol_last_close(zv);

out_mutex:
	if (drop_mutex)
		mutex_exit(&zvol_state_lock);

	check_disk_change(bdev);

	return (SET_ERROR(error));
}

#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
static void
#else
static int
#endif
zvol_release(struct gendisk *disk, fmode_t mode)
{
	zvol_state_t *zv = disk->private_data;
	int drop_mutex = 0;

	if (!mutex_owned(&zvol_state_lock)) {
		mutex_enter(&zvol_state_lock);
		drop_mutex = 1;
	}

	if (zv->zv_open_count > 0) {
		zv->zv_open_count--;
		if (zv->zv_open_count == 0)
			zvol_last_close(zv);
	}

	if (drop_mutex)
		mutex_exit(&zvol_state_lock);

#ifndef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
	return (0);
#endif
}

static int
zvol_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned int cmd, unsigned long arg)
{
	zvol_state_t *zv = bdev->bd_disk->private_data;
	int error = 0;

	if (zv == NULL)
		return (SET_ERROR(-ENXIO));

	switch (cmd) {
	case BLKFLSBUF:
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
		break;
	case BLKZNAME:
		error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
		break;
	default:
		error = -ENOTTY;
		break;
	}

	return (SET_ERROR(error));
}

#ifdef CONFIG_COMPAT
static int
zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned cmd, unsigned long arg)
{
	return (zvol_ioctl(bdev, mode, cmd, arg));
}
#else
#define	zvol_compat_ioctl	NULL
#endif

static int
zvol_media_changed(struct gendisk *disk)
{
	zvol_state_t *zv = disk->private_data;

	return (zv->zv_changed);
}

static int
zvol_revalidate_disk(struct gendisk *disk)
{
	zvol_state_t *zv = disk->private_data;

	zv->zv_changed = 0;
	set_capacity(zv->zv_disk, zv->zv_volsize >> 9);

	return (0);
}

/*
 * Provide a simple virtual geometry for legacy compatibility. For devices
 * smaller than 1 MiB a small head and sector count is used to allow very
 * tiny devices. For devices over 1 MiB a standard head and sector count
 * is used to keep the cylinder count reasonable.
 */
static int
zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	zvol_state_t *zv = bdev->bd_disk->private_data;
	sector_t sectors = get_capacity(zv->zv_disk);

	if (sectors > 2048) {
		geo->heads = 16;
		geo->sectors = 63;
	} else {
		geo->heads = 2;
		geo->sectors = 4;
	}

	geo->start = 0;
	geo->cylinders = sectors / (geo->heads * geo->sectors);

	return (0);
}
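
/*
 * Worked example: a 1 GiB zvol is 2097152 sectors, which is above the
 * 2048-sector (1 MiB) threshold, so the geometry is reported as 16
 * heads and 63 sectors/track, giving 2097152 / (16 * 63) = 2080
 * cylinders.
 */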

static struct kobject *
zvol_probe(dev_t dev, int *part, void *arg)
{
	zvol_state_t *zv;
	struct kobject *kobj;

	mutex_enter(&zvol_state_lock);
	zv = zvol_find_by_dev(dev);
	kobj = zv ? get_disk(zv->zv_disk) : NULL;
	mutex_exit(&zvol_state_lock);

	return (kobj);
}

#ifdef HAVE_BDEV_BLOCK_DEVICE_OPERATIONS
static struct block_device_operations zvol_ops = {
	.open = zvol_open,
	.release = zvol_release,
	.ioctl = zvol_ioctl,
	.compat_ioctl = zvol_compat_ioctl,
	.media_changed = zvol_media_changed,
	.revalidate_disk = zvol_revalidate_disk,
	.getgeo = zvol_getgeo,
	.owner = THIS_MODULE,
};

#else /* HAVE_BDEV_BLOCK_DEVICE_OPERATIONS */

static int
zvol_open_by_inode(struct inode *inode, struct file *file)
{
	return (zvol_open(inode->i_bdev, file->f_mode));
}

static int
zvol_release_by_inode(struct inode *inode, struct file *file)
{
	return (zvol_release(inode->i_bdev->bd_disk, file->f_mode));
}

static int
zvol_ioctl_by_inode(struct inode *inode, struct file *file,
    unsigned int cmd, unsigned long arg)
{
	if (file == NULL || inode == NULL)
		return (SET_ERROR(-EINVAL));

	return (zvol_ioctl(inode->i_bdev, file->f_mode, cmd, arg));
}

#ifdef CONFIG_COMPAT
static long
zvol_compat_ioctl_by_inode(struct file *file,
    unsigned int cmd, unsigned long arg)
{
	if (file == NULL)
		return (SET_ERROR(-EINVAL));

	return (zvol_compat_ioctl(file->f_dentry->d_inode->i_bdev,
	    file->f_mode, cmd, arg));
}
#else
#define	zvol_compat_ioctl_by_inode	NULL
#endif

static struct block_device_operations zvol_ops = {
	.open = zvol_open_by_inode,
	.release = zvol_release_by_inode,
	.ioctl = zvol_ioctl_by_inode,
	.compat_ioctl = zvol_compat_ioctl_by_inode,
	.media_changed = zvol_media_changed,
	.revalidate_disk = zvol_revalidate_disk,
	.getgeo = zvol_getgeo,
	.owner = THIS_MODULE,
};
#endif /* HAVE_BDEV_BLOCK_DEVICE_OPERATIONS */

/*
 * Allocate memory for a new zvol_state_t and setup the required
 * request queue and generic disk structures for the block device.
 */
static zvol_state_t *
zvol_alloc(dev_t dev, const char *name)
{
	zvol_state_t *zv;

	zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);

	list_link_init(&zv->zv_next);

	zv->zv_queue = blk_alloc_queue(GFP_ATOMIC);
	if (zv->zv_queue == NULL)
		goto out_kmem;

	blk_queue_make_request(zv->zv_queue, zvol_request);

#ifdef HAVE_BLK_QUEUE_FLUSH
	blk_queue_flush(zv->zv_queue, VDEV_REQ_FLUSH | VDEV_REQ_FUA);
#else
	blk_queue_ordered(zv->zv_queue, QUEUE_ORDERED_DRAIN, NULL);
#endif /* HAVE_BLK_QUEUE_FLUSH */

	zv->zv_disk = alloc_disk(ZVOL_MINORS);
	if (zv->zv_disk == NULL)
		goto out_queue;

	zv->zv_queue->queuedata = zv;
	zv->zv_dev = dev;
	zv->zv_open_count = 0;
	strlcpy(zv->zv_name, name, MAXNAMELEN);

	mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
	    sizeof (rl_t), offsetof(rl_t, r_node));
	zv->zv_znode.z_is_zvol = TRUE;

	zv->zv_disk->major = zvol_major;
	zv->zv_disk->first_minor = (dev & MINORMASK);
	zv->zv_disk->fops = &zvol_ops;
	zv->zv_disk->private_data = zv;
	zv->zv_disk->queue = zv->zv_queue;
	snprintf(zv->zv_disk->disk_name, DISK_NAME_LEN, "%s%d",
	    ZVOL_DEV_NAME, (dev & MINORMASK));

	return (zv);

out_queue:
	blk_cleanup_queue(zv->zv_queue);
out_kmem:
	kmem_free(zv, sizeof (zvol_state_t));

	return (NULL);
}

/*
 * Cleanup then free a zvol_state_t which was created by zvol_alloc().
 */
static void
zvol_free(zvol_state_t *zv)
{
	avl_destroy(&zv->zv_znode.z_range_avl);
	mutex_destroy(&zv->zv_znode.z_range_lock);

	del_gendisk(zv->zv_disk);
	blk_cleanup_queue(zv->zv_queue);
	put_disk(zv->zv_disk);

	kmem_free(zv, sizeof (zvol_state_t));
}

static int
__zvol_snapdev_hidden(const char *name)
{
	uint64_t snapdev;
	char *parent;
	char *atp;
	int error = 0;

	parent = kmem_alloc(MAXPATHLEN, KM_SLEEP);
	(void) strlcpy(parent, name, MAXPATHLEN);

	if ((atp = strrchr(parent, '@')) != NULL) {
		*atp = '\0';
		error = dsl_prop_get_integer(parent, "snapdev",
		    &snapdev, NULL);
		if ((error == 0) && (snapdev == ZFS_SNAPDEV_HIDDEN))
			error = SET_ERROR(ENODEV);
	}

	kmem_free(parent, MAXPATHLEN);

	return (SET_ERROR(error));
}

static int
__zvol_create_minor(const char *name, boolean_t ignore_snapdev)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t len;
	unsigned minor = 0;
	int error = 0;

	ASSERT(MUTEX_HELD(&zvol_state_lock));

	zv = zvol_find_by_name(name);
	if (zv) {
		error = SET_ERROR(EEXIST);
		goto out;
	}

	if (ignore_snapdev == B_FALSE) {
		error = __zvol_snapdev_hidden(name);
		if (error)
			goto out;
	}

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, zvol_tag, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	error = zvol_find_minor(&minor);
	if (error)
		goto out_dmu_objset_disown;

	zv = zvol_alloc(MKDEV(zvol_major, minor), name);
	if (zv == NULL) {
		error = SET_ERROR(EAGAIN);
		goto out_dmu_objset_disown;
	}

	if (dmu_objset_is_snapshot(os))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	set_capacity(zv->zv_disk, zv->zv_volsize >> 9);

	blk_queue_max_hw_sectors(zv->zv_queue, (DMU_MAX_ACCESS / 4) >> 9);
	blk_queue_max_segments(zv->zv_queue, UINT16_MAX);
	blk_queue_max_segment_size(zv->zv_queue, UINT_MAX);
	blk_queue_physical_block_size(zv->zv_queue, zv->zv_volblocksize);
	blk_queue_io_opt(zv->zv_queue, zv->zv_volblocksize);
	blk_queue_max_discard_sectors(zv->zv_queue,
	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
	blk_queue_discard_granularity(zv->zv_queue, zv->zv_volblocksize);
	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zv->zv_queue);
#ifdef QUEUE_FLAG_NONROT
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zv->zv_queue);
#endif
#ifdef QUEUE_FLAG_ADD_RANDOM
	queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zv->zv_queue);
#endif

	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			zil_destroy(dmu_objset_zil(os), B_FALSE);
		else
			zil_replay(os, zv, zvol_replay_vector);
	}

	/*
	 * When udev detects the addition of the device it will immediately
	 * invoke blkid(8) to determine the type of content on the device.
	 * Prefetching the blocks commonly scanned by blkid(8) will speed
	 * up this process.
	 */
	len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE);
	if (len > 0) {
		dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
		dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
		    ZIO_PRIORITY_SYNC_READ);
	}

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, zvol_tag);
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));
out:
	if (error == 0) {
		zvol_insert(zv);
		add_disk(zv->zv_disk);
	}

	return (SET_ERROR(error));
}

/*
 * Create a block device minor node and setup the linkage between it
 * and the specified volume. Once this function returns the block
 * device is live and ready for use.
 */
int
zvol_create_minor(const char *name)
{
	int error;

	mutex_enter(&zvol_state_lock);
	error = __zvol_create_minor(name, B_FALSE);
	mutex_exit(&zvol_state_lock);

	return (SET_ERROR(error));
}

static int
__zvol_remove_minor(const char *name)
{
	zvol_state_t *zv;

	ASSERT(MUTEX_HELD(&zvol_state_lock));

	zv = zvol_find_by_name(name);
	if (zv == NULL)
		return (SET_ERROR(ENXIO));

	if (zv->zv_open_count > 0)
		return (SET_ERROR(EBUSY));

	zvol_remove(zv);
	zvol_free(zv);

	return (0);
}

/*
 * Remove a block device minor node for the specified volume.
 */
int
zvol_remove_minor(const char *name)
{
	int error;

	mutex_enter(&zvol_state_lock);
	error = __zvol_remove_minor(name);
	mutex_exit(&zvol_state_lock);

	return (SET_ERROR(error));
}

/*
 * Rename a block device minor node for the specified volume.
 */
static void
__zvol_rename_minor(zvol_state_t *zv, const char *newname)
{
	int readonly = get_disk_ro(zv->zv_disk);

	ASSERT(MUTEX_HELD(&zvol_state_lock));

	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));

	/*
	 * The block device's read-only state is briefly changed causing
	 * a KOBJ_CHANGE uevent to be issued. This ensures udev detects
	 * the name change and fixes the symlinks. This does not change
	 * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
	 * changes. This would normally be done using kobject_uevent() but
	 * that is a GPL-only symbol which is why we need this workaround.
	 */
	set_disk_ro(zv->zv_disk, !readonly);
	set_disk_ro(zv->zv_disk, readonly);
}

static int
zvol_create_minors_cb(const char *dsname, void *arg)
{
	(void) zvol_create_minor(dsname);

	return (0);
}

/*
 * Create minors for specified dataset including children and snapshots.
 */
int
zvol_create_minors(const char *name)
{
	int error = 0;

	if (!zvol_inhibit_dev)
		error = dmu_objset_find((char *)name, zvol_create_minors_cb,
		    NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);

	return (SET_ERROR(error));
}

/*
 * Remove minors for specified dataset including children and snapshots.
 */
void
zvol_remove_minors(const char *name)
{
	zvol_state_t *zv, *zv_next;
	int namelen = ((name) ? strlen(name) : 0);

	if (zvol_inhibit_dev)
		return;

	mutex_enter(&zvol_state_lock);

	for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
		zv_next = list_next(&zvol_state_list, zv);

		if (name == NULL || strcmp(zv->zv_name, name) == 0 ||
		    (strncmp(zv->zv_name, name, namelen) == 0 &&
		    zv->zv_name[namelen] == '/')) {
			zvol_remove(zv);
			zvol_free(zv);
		}
	}

	mutex_exit(&zvol_state_lock);
}

/*
 * Rename minors for specified dataset including children and snapshots.
 */
void
zvol_rename_minors(const char *oldname, const char *newname)
{
	zvol_state_t *zv, *zv_next;
	int oldnamelen, newnamelen;
	char *name;

	if (zvol_inhibit_dev)
		return;

	oldnamelen = strlen(oldname);
	newnamelen = strlen(newname);
	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);

	mutex_enter(&zvol_state_lock);

	for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
		zv_next = list_next(&zvol_state_list, zv);

		if (strcmp(zv->zv_name, oldname) == 0) {
			__zvol_rename_minor(zv, newname);
		} else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
		    (zv->zv_name[oldnamelen] == '/' ||
		    zv->zv_name[oldnamelen] == '@')) {
			snprintf(name, MAXNAMELEN, "%s%c%s", newname,
			    zv->zv_name[oldnamelen],
			    zv->zv_name + oldnamelen + 1);
			__zvol_rename_minor(zv, name);
		}
	}

	mutex_exit(&zvol_state_lock);

	kmem_free(name, MAXNAMELEN);
}
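
/*
 * Example of the mapping above (hypothetical dataset names): renaming
 * "tank/vol" to "tank/newvol" rewrites a child minor named
 * "tank/vol/child@snap" as "tank/newvol/child@snap"; the '/' or '@'
 * separator and the suffix after oldname are preserved by the
 * snprintf() format "%s%c%s".
 */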

static int
snapdev_snapshot_changed_cb(const char *dsname, void *arg)
{
	uint64_t snapdev = *(uint64_t *)arg;

	if (strchr(dsname, '@') == NULL)
		return (0);

	switch (snapdev) {
	case ZFS_SNAPDEV_VISIBLE:
		mutex_enter(&zvol_state_lock);
		(void) __zvol_create_minor(dsname, B_TRUE);
		mutex_exit(&zvol_state_lock);
		break;
	case ZFS_SNAPDEV_HIDDEN:
		(void) zvol_remove_minor(dsname);
		break;
	}

	return (0);
}

int
zvol_set_snapdev(const char *dsname, uint64_t snapdev)
{
	(void) dmu_objset_find((char *)dsname, snapdev_snapshot_changed_cb,
	    &snapdev, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);

	/* caller should continue to modify snapdev property */
	return (-1);
}

int
zvol_init(void)
{
	int error;

	list_create(&zvol_state_list, sizeof (zvol_state_t),
	    offsetof(zvol_state_t, zv_next));

	mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);

	error = register_blkdev(zvol_major, ZVOL_DRIVER);
	if (error) {
		printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
		goto out;
	}

	blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS,
	    THIS_MODULE, zvol_probe, NULL, NULL);

	return (0);

out:
	mutex_destroy(&zvol_state_lock);
	list_destroy(&zvol_state_list);

	return (SET_ERROR(error));
}

void
zvol_fini(void)
{
	zvol_remove_minors(NULL);
	blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS);
	unregister_blkdev(zvol_major, ZVOL_DRIVER);
	mutex_destroy(&zvol_state_lock);
	list_destroy(&zvol_state_list);
}

module_param(zvol_inhibit_dev, uint, 0644);
MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");

module_param(zvol_major, uint, 0444);
MODULE_PARM_DESC(zvol_major, "Major number for zvol device");

module_param(zvol_max_discard_blocks, ulong, 0444);
MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");

module_param(zvol_prefetch_bytes, uint, 0644);
MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
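
/*
 * Usage example (assuming the standard sysfs path for zfs module
 * parameters): parameters declared with mode 0644 above, such as
 * zvol_prefetch_bytes, can be inspected and tuned at runtime, e.g.:
 *
 *   cat /sys/module/zfs/parameters/zvol_prefetch_bytes
 *   echo 262144 > /sys/module/zfs/parameters/zvol_prefetch_bytes
 *
 * Read-only parameters such as zvol_major (mode 0444) must instead be
 * set at module load time, e.g. modprobe zfs zvol_major=230.
 */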