/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright 2010 Robert Milkowski
 *
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */

/*
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 * /dev/zvol/<pool_name>/<dataset_name>
 *
 * Volumes are persistent through reboot.  No user command needs to be
 * run before opening and using a device.
 *
 * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
 * in the system.  Except when they're simply character devices (volmode=dev).
 */
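
/*
 * For example, a volume created with "zfs create -V 1g tank/vol" (dataset
 * name illustrative) shows up as /dev/zvol/tank/vol.  With volmode=geom it
 * is additionally a GEOM provider usable by any GEOM consumer; with
 * volmode=dev only the character device node exists.
 */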

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/spa_impl.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/byteorder.h>
#include <sys/sunddi.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/queue.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/zil_impl.h>
#include <sys/dataset_kstats.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/filio.h>

#include <geom/geom.h>

#include <sys/zvol_impl.h>

#include "zfs_namecheck.h"

#define	ZVOL_DIR		"/dev/zvol/"
#define	ZVOL_DUMPSIZE		"dumpsize"

#ifdef ZVOL_LOCK_DEBUG
#define	ZVOL_RW_READER		RW_WRITER
#define	ZVOL_RW_READ_HELD	RW_WRITE_HELD
#else
#define	ZVOL_RW_READER		RW_READER
#define	ZVOL_RW_READ_HELD	RW_READ_HELD
#endif
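
/*
 * With ZVOL_LOCK_DEBUG defined, every ZVOL_RW_READER acquisition of
 * zv_suspend_lock is promoted to a writer acquisition, so the
 * ZVOL_RW_READ_HELD assertions become exact writer-held checks.  This
 * trades concurrency for stricter lock-usage checking while debugging.
 */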

enum zvol_geom_state {
	ZVOL_GEOM_UNINIT,
	ZVOL_GEOM_STOPPED,
	ZVOL_GEOM_RUNNING,
};

struct zvol_state_os {
	int zso_volmode;
#define	zso_dev		_zso_state._zso_dev
#define	zso_geom	_zso_state._zso_geom
	union {
		/* volmode=dev */
		struct zvol_state_dev {
			struct cdev *zsd_cdev;
			uint64_t zsd_sync_cnt;
		} _zso_dev;

		/* volmode=geom */
		struct zvol_state_geom {
			struct g_provider *zsg_provider;
			struct bio_queue_head zsg_queue;
			struct mtx zsg_queue_mtx;
			enum zvol_geom_state zsg_state;
		} _zso_geom;
	} _zso_state;
};

struct proc *zfsproc;

static uint32_t zvol_minors;
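
/*
 * Locking overview (as used throughout this file): zvol_state_lock guards
 * the global name/minor lookup structures; zv_state_lock guards a single
 * volume's state; zv_suspend_lock serializes I/O against suspend/resume.
 * On first open and last close, zv_suspend_lock is taken before
 * zv_state_lock, and the code below re-checks zv_open_count after any
 * lock-order dance to decide whether the suspend lock is still needed.
 */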

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
    "Expose as GEOM providers (1), device files (2) or neither");
static boolean_t zpool_on_zvol = B_FALSE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
    "Allow zpools to use zvols as vdevs (DANGEROUS)");

/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;

SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
    &zvol_unmap_enabled, 0, "Enable UNMAP functionality");

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS / 2;
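
/*
 * zvol_maxphys bounds the per-iteration copy size in the strategy loop
 * below (see MIN(resid, zvol_maxphys)), keeping each dmu_tx to at most
 * half of DMU_MAX_ACCESS.
 */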

static void zvol_ensure_zilog(zvol_state_t *zv);

static d_open_t		zvol_cdev_open;
static d_close_t	zvol_cdev_close;
static d_ioctl_t	zvol_cdev_ioctl;
static d_read_t		zvol_cdev_read;
static d_write_t	zvol_cdev_write;
static d_strategy_t	zvol_geom_bio_strategy;

static struct cdevsw zvol_cdevsw = {
	.d_name =	"zvol",
	.d_version =	D_VERSION,
	.d_flags =	D_DISK | D_TRACKCLOSE,
	.d_open =	zvol_cdev_open,
	.d_close =	zvol_cdev_close,
	.d_ioctl =	zvol_cdev_ioctl,
	.d_read =	zvol_cdev_read,
	.d_write =	zvol_cdev_write,
	.d_strategy =	zvol_geom_bio_strategy,
};

extern uint_t zfs_geom_probe_vdev_key;

struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_worker(void *arg);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
/* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */

/*
 * GEOM mode implementation
 */

static int
zvol_geom_open(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_TRUE;

	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
		/*
		 * if zfs_geom_probe_vdev_key is set, that means that zfs is
		 * attempting to probe geom providers while looking for a
		 * replacement for a missing VDEV.  In this case, the
		 * spa_namespace_lock will not be held, but it is still illegal
		 * to use a zvol as a vdev.  Deadlocks can result if another
		 * thread has spa_namespace_lock.
		 */
		return (SET_ERROR(EOPNOTSUPP));
	}

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);

	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);

	/*
	 * make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		err = zvol_first_open(zv, !(flag & FWRITE));
		if (err)
			goto out_mutex;
		pp->mediasize = zv->zv_volsize;
		pp->stripeoffset = 0;
		pp->stripesize = zv->zv_volblocksize;
	}

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
		err = SET_ERROR(EROFS);
		goto out_open_count;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_open_count;
	}
	if (flag & FEXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_open_count;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count += count;
	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);

out_open_count:
	if (zv->zv_open_count == 0)
		zvol_last_close(zv);
out_mutex:
	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (SET_ERROR(err));
}
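
/*
 * Note: "count" above is the number of access references GEOM is adding in
 * this call, so zv_open_count tracks an aggregate reference count rather
 * than a 0/1 open flag; zvol_geom_close() subtracts the same way.
 */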

static int
zvol_geom_close(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT(zv->zv_open_count == 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT(zv->zv_open_count > 0);

	/*
	 * make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if ((zv->zv_open_count - count) == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count -= count;

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

static void
zvol_geom_run(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);

	g_error_provider(pp, 0);

	kproc_kthread_add(zvol_geom_worker, zv, &zfsproc, NULL, 0, 0,
	    "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
}

static void
zvol_geom_destroy(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);

	g_topology_assert();

	mutex_enter(&zv->zv_state_lock);
	VERIFY(zsg->zsg_state == ZVOL_GEOM_RUNNING);
	mutex_exit(&zv->zv_state_lock);
	zsg->zsg_provider = NULL;
	pp->private = NULL;
	g_wither_geom(pp->geom, ENXIO);
}

static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
	int count, error, flags;

	g_topology_assert();

	/*
	 * To make it easier we expect either open or close, but not both
	 * at the same time.
	 */
	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
	    (acr <= 0 && acw <= 0 && ace <= 0),
	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
	    pp->name, acr, acw, ace));

	if (pp->private == NULL) {
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		return (pp->error);
	}

	/*
	 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
	 * ace != 0, because GEOM already handles that and handles it a bit
	 * differently.  GEOM allows for multiple read/exclusive consumers and
	 * ZFS allows only one exclusive consumer, no matter if it is reader or
	 * writer.  I like better the way GEOM works so I'll leave it for GEOM
	 * to decide what to do.
	 */

	count = acr + acw + ace;
	if (count == 0)
		return (0);

	flags = 0;
	if (acr != 0 || ace != 0)
		flags |= FREAD;
	if (acw != 0)
		flags |= FWRITE;

	g_topology_unlock();
	if (count > 0)
		error = zvol_geom_open(pp, flags, count);
	else
		error = zvol_geom_close(pp, flags, -count);
	g_topology_lock();
	return (error);
}
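
/*
 * Worked example: a consumer asking for read+write access passes deltas
 * acr=1, acw=1, ace=0, so count is 2 and the call becomes
 * zvol_geom_open(pp, FREAD | FWRITE, 2); dropping that access later
 * arrives as acr=-1, acw=-1, giving count=-2 and a matching
 * zvol_geom_close(pp, FREAD | FWRITE, 2).
 */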

static void
zvol_geom_worker(void *arg)
{
	zvol_state_t *zv = arg;
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct bio *bp;

	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);

	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	for (;;) {
		mtx_lock(&zsg->zsg_queue_mtx);
		bp = bioq_takefirst(&zsg->zsg_queue);
		if (bp == NULL) {
			if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
				zsg->zsg_state = ZVOL_GEOM_RUNNING;
				wakeup(&zsg->zsg_state);
				mtx_unlock(&zsg->zsg_queue_mtx);
				kthread_exit();
			}
			msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
			    PRIBIO | PDROP, "zvol:io", 0);
			continue;
		}
		mtx_unlock(&zsg->zsg_queue_mtx);
		zvol_geom_bio_strategy(bp);
	}
}

static void
zvol_geom_bio_start(struct bio *bp)
{
	zvol_state_t *zv = bp->bio_to->private;
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	boolean_t first;

	if (bp->bio_cmd == BIO_GETATTR) {
		if (zvol_geom_bio_getattr(bp))
			g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

	if (!THREAD_CAN_SLEEP()) {
		mtx_lock(&zsg->zsg_queue_mtx);
		first = (bioq_first(&zsg->zsg_queue) == NULL);
		bioq_insert_tail(&zsg->zsg_queue, bp);
		mtx_unlock(&zsg->zsg_queue_mtx);
		if (first)
			wakeup_one(&zsg->zsg_queue);
		return;
	}

	zvol_geom_bio_strategy(bp);
}
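
/*
 * ZVOL I/O may need to sleep (range locks, dmu_tx_assign with TXG_WAIT),
 * so bios arriving in a non-sleepable context are queued above and handed
 * to zvol_geom_worker() instead of being serviced inline; the worker is
 * only woken when the queue transitions from empty to non-empty.
 */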

static int
zvol_geom_bio_getattr(struct bio *bp)
{
	zvol_state_t *zv;

	zv = bp->bio_to->private;
	ASSERT(zv != NULL);

	spa_t *spa = dmu_objset_spa(zv->zv_objset);
	uint64_t refd, avail, usedobjs, availobjs;

	if (g_handleattr_int(bp, "GEOM::candelete", 1))
		return (0);
	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
		avail = metaslab_class_get_space(spa_normal_class(spa));
		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksavail",
		    avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
		refd = metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
			return (0);
	}
	return (1);
}

static void
zvol_geom_bio_strategy(struct bio *bp)
{
	zvol_state_t *zv;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t doread = B_FALSE;
	boolean_t is_dumpified;
	boolean_t sync;

	if (bp->bio_to)
		zv = bp->bio_to->private;
	else
		zv = bp->bio_dev->si_drv2;

	if (zv == NULL) {
		error = SET_ERROR(ENXIO);
		goto out;
	}

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);

	switch (bp->bio_cmd) {
	case BIO_READ:
		doread = B_TRUE;
		break;
	case BIO_WRITE:
	case BIO_FLUSH:
	case BIO_DELETE:
		if (zv->zv_flags & ZVOL_RDONLY) {
			error = SET_ERROR(EROFS);
			goto resume;
		}
		zvol_ensure_zilog(zv);
		if (bp->bio_cmd == BIO_FLUSH)
			goto sync;
		break;
	default:
		error = SET_ERROR(EOPNOTSUPP);
		goto resume;
	}

	off = bp->bio_offset;
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT(os != NULL);

	addr = bp->bio_data;
	resid = bp->bio_length;

	if (resid > 0 && (off < 0 || off >= volsize)) {
		error = SET_ERROR(EIO);
		goto resume;
	}

	is_dumpified = B_FALSE;
	sync = !doread && !is_dumpified &&
	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
	    doread ? RL_READER : RL_WRITER);

	if (bp->bio_cmd == BIO_DELETE) {
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			zvol_log_truncate(zv, tx, off, resid, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    off, resid);
			resid = 0;
		}
		goto unlock;
	}
	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
		if (doread) {
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, sync);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
unlock:
	zfs_rangelock_exit(lr);

	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length && off > volsize)
		error = SET_ERROR(EINVAL);

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		break;
	case BIO_READ:
		dataset_kstats_update_read_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_WRITE:
		dataset_kstats_update_write_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_DELETE:
		break;
	default:
		break;
	}

	if (sync) {
sync:
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	}
resume:
	rw_exit(&zv->zv_suspend_lock);
out:
	if (bp->bio_to)
		g_io_deliver(bp, error);
	else
		biofinish(bp, NULL, error);
}
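
/*
 * Label flow above: BIO_FLUSH jumps straight to "sync" to commit the ZIL
 * without moving data, "resume" drops zv_suspend_lock, and "out" completes
 * the bio via g_io_deliver() for GEOM requests or biofinish() for requests
 * that came in through the character device path.
 */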

/*
 * Character device mode implementation
 */

static int
zvol_cdev_read(struct cdev *dev, struct uio *uio, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;
	/*
	 * uio_loffset == volsize isn't an error as
	 * it's required for EOF processing.
	 */
	if (uio->uio_resid > 0 &&
	    (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
		return (SET_ERROR(EIO));

	lr = zfs_rangelock_enter(&zv->zv_rangelock, uio->uio_loffset,
	    uio->uio_resid, RL_READER);
	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);

		/* don't read past the end */
		if (bytes > volsize - uio->uio_loffset)
			bytes = volsize - uio->uio_loffset;

		error = dmu_read_uio_dnode(zv->zv_dn, uio, bytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);

	return (error);
}

static int
zvol_cdev_write(struct cdev *dev, struct uio *uio, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t sync;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;

	if (uio->uio_resid > 0 &&
	    (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
		return (SET_ERROR(EIO));

	sync = (ioflag & IO_SYNC) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	zvol_ensure_zilog(zv);

	lr = zfs_rangelock_enter(&zv->zv_rangelock, uio->uio_loffset,
	    uio->uio_resid, RL_WRITER);
	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
		uint64_t off = uio->uio_loffset;
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, uio, bytes, tx);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, sync);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);
	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	rw_exit(&zv->zv_suspend_lock);
	return (error);
}
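
/*
 * Each loop iteration above logs its chunk with zvol_log_write(); for
 * synchronous writes (O_SYNC or sync=always) the intent log is committed
 * once, after the range lock is dropped, rather than per chunk.
 */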

static int
zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	int err = 0;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);

	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV);

	/*
	 * make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		err = zvol_first_open(zv, !(flags & FWRITE));
		if (err)
			goto out_mutex;
	}

	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = SET_ERROR(EROFS);
		goto out_open_count;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_open_count;
	}
	if (flags & FEXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_open_count;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count++;
	if (flags & (FSYNC | FDSYNC)) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt++;
		if (zsd->zsd_sync_cnt == 1)
			zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
	}

	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);

out_open_count:
	if (zv->zv_open_count == 0)
		zvol_last_close(zv);
out_mutex:
	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (SET_ERROR(err));
}

static int
zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT(zv->zv_open_count == 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT(zv->zv_open_count > 0);
	/*
	 * make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count--;
	if (flags & (FSYNC | FDSYNC)) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt--;
	}

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

static int
zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
    int fflag, struct thread *td)
{
	zvol_state_t *zv;
	zfs_locked_range_t *lr;
	off_t offset, length;
	int i, error;
	boolean_t sync;

	zv = dev->si_drv2;

	error = 0;
	KASSERT(zv->zv_open_count > 0,
	    ("Device with zero access count in %s", __func__));

	i = IOCPARM_LEN(cmd);
	switch (cmd) {
	case DIOCGSECTORSIZE:
		*(uint32_t *)data = DEV_BSIZE;
		break;
	case DIOCGMEDIASIZE:
		*(off_t *)data = zv->zv_volsize;
		break;
	case DIOCGFLUSH:
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		if (zv->zv_zilog != NULL)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGDELETE:
		if (!zvol_unmap_enabled)
			break;

		offset = ((off_t *)data)[0];
		length = ((off_t *)data)[1];
		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
		    offset < 0 || offset >= zv->zv_volsize ||
		    length <= 0) {
			printf("%s: offset=%jd length=%jd\n", __func__, offset,
			    length);
			error = SET_ERROR(EINVAL);
			break;
		}
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		zvol_ensure_zilog(zv);
		lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
		    RL_WRITER);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			sync = B_FALSE;
			dmu_tx_abort(tx);
		} else {
			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
			zvol_log_truncate(zv, tx, offset, length, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    offset, length);
		}
		zfs_rangelock_exit(lr);
		if (sync)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGSTRIPESIZE:
		*(off_t *)data = zv->zv_volblocksize;
		break;
	case DIOCGSTRIPEOFFSET:
		*(off_t *)data = 0;
		break;
	case DIOCGATTR: {
		spa_t *spa = dmu_objset_spa(zv->zv_objset);
		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
		uint64_t refd, avail, usedobjs, availobjs;

		if (strcmp(arg->name, "GEOM::candelete") == 0)
			arg->value.i = 1;
		else if (strcmp(arg->name, "blocksavail") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "blocksused") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = refd / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
			avail = metaslab_class_get_space(spa_normal_class(spa));
			avail -= metaslab_class_get_alloc(
			    spa_normal_class(spa));
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksused") == 0) {
			refd = metaslab_class_get_alloc(spa_normal_class(spa));
			arg->value.off = refd / DEV_BSIZE;
		} else
			error = SET_ERROR(ENOIOCTL);
		break;
	}
	case FIOSEEKHOLE:
	case FIOSEEKDATA: {
		off_t *off = (off_t *)data;
		uint64_t noff;
		boolean_t hole;

		hole = (cmd == FIOSEEKHOLE);
		noff = *off;
		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
		*off = noff;
		break;
	}
	default:
		error = SET_ERROR(ENOIOCTL);
	}

	return (error);
}
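
/*
 * DIOCGDELETE consumes two off_t values packed into "data": the byte
 * offset and byte length of the region to unmap, both DEV_BSIZE-aligned;
 * the region is punched out with dmu_free_long_range() and logged as a
 * truncate record in the ZIL.
 */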

static void
zvol_ensure_zilog(zvol_state_t *zv)
{
	ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

	/*
	 * Open a ZIL if this is the first time we have written to this
	 * zvol.  We protect zv->zv_zilog with zv_suspend_lock rather
	 * than zv_state_lock so that we don't need to acquire an
	 * additional lock in this path.
	 */
	if (zv->zv_zilog == NULL) {
		if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
		}
		if (zv->zv_zilog == NULL) {
			zv->zv_zilog = zil_open(zv->zv_objset,
			    zvol_get_data);
			zv->zv_flags |= ZVOL_WRITTEN_TO;
		}
		rw_downgrade(&zv->zv_suspend_lock);
	}
}
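
/*
 * Classic double-checked pattern: if the reader-to-writer upgrade fails,
 * the lock is dropped and retaken as writer, so another thread may have
 * opened the ZIL in that window; hence zv_zilog is tested again under the
 * write lock before zil_open().
 */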

static boolean_t
zvol_is_zvol_impl(const char *device)
{
	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
}

static void
zvol_rename_minor(zvol_state_t *zv, const char *newname)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/* move to new hashtable entry */
	zv->zv_hash = zvol_name_hash(zv->zv_name);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;
		struct g_geom *gp;

		g_topology_lock();
		gp = pp->geom;
		ASSERT(gp != NULL);

		zsg->zsg_provider = NULL;
		g_wither_provider(pp, ENXIO);

		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = zv->zv_volsize;
		pp->private = zv;
		zsg->zsg_provider = pp;
		g_error_provider(pp, 0);
		g_topology_unlock();
	} else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		dev = zsd->zsd_cdev;
		if (dev != NULL) {
			destroy_dev(dev);
			dev = zsd->zsd_cdev = NULL;
			if (zv->zv_open_count > 0) {
				zv->zv_flags &= ~ZVOL_EXCL;
				zv->zv_open_count = 0;
				/* XXX need suspend lock but lock order */
				zvol_last_close(zv);
			}
		}

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
		    == 0) {
			dev->si_iosize_max = MAXPHYS;
			zsd->zsd_cdev = dev;
		}
	}
	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
}
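
/*
 * Renaming is implemented as destroy-and-recreate: the old GEOM provider
 * is withered (or the old cdev destroyed) and a new one is created under
 * the new name.  In dev mode any existing opens are force-dropped first,
 * since the cdev they reference is going away.
 */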

/*
 * Remove minor node for the specified volume.
 */
static void
zvol_free(zvol_state_t *zv)
{
	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT(zv->zv_open_count == 0);

	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;

		g_topology_lock();
		zvol_geom_destroy(zv);
		g_topology_unlock();
		mtx_destroy(&zsg->zsg_queue_mtx);
	} else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL)
			destroy_dev(dev);
	}

	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);
	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	zvol_minors--;
}

/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 */
static int
zvol_create_minor_impl(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t volmode, hash;
	int error;

	ZFS_LOG(1, "Creating ZVOL %s...", name);

	hash = zvol_name_hash(name);
	if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		return (SET_ERROR(EEXIST));
	}

	/* lie and say we're read-only */
	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	error = dsl_prop_get_integer(name,
	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
	if (error != 0 || volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;

	/*
	 * zvol_alloc equivalent ...
	 */
	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
	zv->zv_hash = hash;
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_zso->zso_volmode = volmode;
	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp;
		struct g_geom *gp;

		zsg->zsg_state = ZVOL_GEOM_UNINIT;
		mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);

		g_topology_lock();
		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
		gp->start = zvol_geom_bio_start;
		gp->access = zvol_geom_access;
		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
		/* TODO: NULL check? */
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = 0;
		pp->private = zv;

		zsg->zsg_provider = pp;
		bioq_init(&zsg->zsg_queue);
	} else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name);
		if (error) {
			mutex_destroy(&zv->zv_state_lock);
			kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
			kmem_free(zv, sizeof (*zv));
			dmu_objset_disown(os, B_TRUE, FTAG);
			goto out_doi;
		}
		dev->si_iosize_max = MAXPHYS;
		zsd->zsd_cdev = dev;
	}
	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);

	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			zil_destroy(dmu_objset_zil(os), B_FALSE);
		else
			zil_replay(os, zv, zvol_replay_vector);
	}
	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);

	/* XXX do prefetch */

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);

	/* zv is still NULL if we bailed out before allocating it */
	if (zv != NULL && zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
		if (error == 0)
			zvol_geom_run(zv);
		g_topology_unlock();
	}
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		zvol_minors++;
		rw_exit(&zvol_state_lock);
	}
	ZFS_LOG(1, "ZVOL %s created.", name);
	return (error);
}
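
/*
 * Creation in short: own the objset (read-only), read the volume size from
 * the ZAP, pick the effective volmode, materialize either a GEOM provider
 * or a character device, optionally replay the ZIL, then disown the objset
 * and publish the zvol in the name hash.  zv_objset is reset to NULL and
 * the dataset is re-owned on first open.
 */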

static void
zvol_clear_private(zvol_state_t *zv)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		if (pp == NULL) /* XXX when? */
			return;

		mtx_lock(&zsg->zsg_queue_mtx);
		zsg->zsg_state = ZVOL_GEOM_STOPPED;
		pp->private = NULL;
		wakeup_one(&zsg->zsg_queue);
		while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
			msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
			    0, "zvol:w", 0);
		mtx_unlock(&zsg->zsg_queue_mtx);
		ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	}
}

static void
zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	zv->zv_volsize = volsize;
	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		if (pp == NULL) /* XXX when? */
			return;

		g_topology_lock();

		/*
		 * Do not invoke resize event when initial size was zero.
		 * ZVOL initializes the size on first open, this is not
		 * real resizing.
		 */
		if (pp->mediasize == 0)
			pp->mediasize = zv->zv_volsize;
		else
			g_resize_provider(pp, zv->zv_volsize);

		g_topology_unlock();
	}
}

static void
zvol_set_disk_ro_impl(zvol_state_t *zv, int flags)
{
	// XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags);
}

static void
zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity)
{
	// XXX? set_capacity(zv->zv_zso->zvo_disk, capacity);
}

const static zvol_platform_ops_t zvol_freebsd_ops = {
	.zv_free = zvol_free,
	.zv_rename_minor = zvol_rename_minor,
	.zv_create_minor = zvol_create_minor_impl,
	.zv_update_volsize = zvol_update_volsize,
	.zv_clear_private = zvol_clear_private,
	.zv_is_zvol = zvol_is_zvol_impl,
	.zv_set_disk_ro = zvol_set_disk_ro_impl,
	.zv_set_capacity = zvol_set_capacity_impl,
};

/*
 * Public interfaces
 */

int
zvol_busy(void)
{
	return (zvol_minors != 0);
}

int
zvol_init(void)
{
	zvol_register_ops(&zvol_freebsd_ops);
	return (0);
}