/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright 2010 Robert Milkowski
 *
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */

/*
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 *	/dev/zvol/<pool_name>/<dataset_name>
 *
 * Volumes are persistent through reboot.  No user command needs to be
 * run before opening and using a device.
 *
 * On FreeBSD, ZVOLs are GEOM providers like any other storage device
 * in the system, except when volmode=dev, in which case they are exposed
 * as plain character devices instead.
 */

51
52 #include <sys/types.h>
53 #include <sys/param.h>
54 #include <sys/kernel.h>
55 #include <sys/errno.h>
56 #include <sys/uio.h>
57 #include <sys/bio.h>
58 #include <sys/buf.h>
59 #include <sys/kmem.h>
60 #include <sys/conf.h>
61 #include <sys/cmn_err.h>
62 #include <sys/stat.h>
63 #include <sys/zap.h>
64 #include <sys/spa.h>
65 #include <sys/spa_impl.h>
66 #include <sys/zio.h>
67 #include <sys/disk.h>
68 #include <sys/dmu_traverse.h>
69 #include <sys/dnode.h>
70 #include <sys/dsl_dataset.h>
71 #include <sys/dsl_prop.h>
72 #include <sys/dsl_dir.h>
73 #include <sys/byteorder.h>
74 #include <sys/sunddi.h>
75 #include <sys/dirent.h>
76 #include <sys/policy.h>
77 #include <sys/queue.h>
78 #include <sys/fs/zfs.h>
79 #include <sys/zfs_ioctl.h>
80 #include <sys/zil.h>
81 #include <sys/zfs_znode.h>
82 #include <sys/zfs_rlock.h>
83 #include <sys/vdev_impl.h>
84 #include <sys/vdev_raidz.h>
85 #include <sys/zvol.h>
86 #include <sys/zil_impl.h>
87 #include <sys/dataset_kstats.h>
88 #include <sys/dbuf.h>
89 #include <sys/dmu_tx.h>
90 #include <sys/zfeature.h>
91 #include <sys/zio_checksum.h>
92 #include <sys/zil_impl.h>
93 #include <sys/filio.h>
94
95 #include <geom/geom.h>
96 #include <sys/zvol.h>
97 #include <sys/zvol_impl.h>
98
99 #include "zfs_namecheck.h"
100
#define	ZVOL_DIR		"/dev/zvol/"
#define	ZVOL_DUMPSIZE		"dumpsize"

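/*
 * With ZVOL_LOCK_DEBUG defined, every "reader" acquisition of
 * zv_suspend_lock is silently promoted to a writer acquisition, so
 * lock-ordering and recursion problems surface immediately in testing.
 */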
#ifdef ZVOL_LOCK_DEBUG
#define	ZVOL_RW_READER		RW_WRITER
#define	ZVOL_RW_READ_HELD	RW_WRITE_HELD
#else
#define	ZVOL_RW_READER		RW_READER
#define	ZVOL_RW_READ_HELD	RW_READ_HELD
#endif

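/*
 * GEOM worker state: the state starts as ZVOL_GEOM_UNINIT when the
 * provider is created.  zvol_clear_private() sets ZVOL_GEOM_STOPPED to
 * request that the worker exit; the worker acknowledges by setting the
 * state back to ZVOL_GEOM_RUNNING just before calling kthread_exit(),
 * which is what the waiter in zvol_clear_private() sleeps on.
 */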
enum zvol_geom_state {
	ZVOL_GEOM_UNINIT,
	ZVOL_GEOM_STOPPED,
	ZVOL_GEOM_RUNNING,
};

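/*
 * Per-OS zvol state: a union discriminated by zso_volmode, holding
 * either the character device state (volmode=dev) or the GEOM provider
 * state (volmode=geom).
 */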
struct zvol_state_os {
	int zso_volmode;
#define	zso_dev		_zso_state._zso_dev
#define	zso_geom	_zso_state._zso_geom
	union {
		/* volmode=dev */
		struct zvol_state_dev {
			struct cdev *zsd_cdev;
			uint64_t zsd_sync_cnt;
		} _zso_dev;

		/* volmode=geom */
		struct zvol_state_geom {
			struct g_provider *zsg_provider;
			struct bio_queue_head zsg_queue;
			struct mtx zsg_queue_mtx;
			enum zvol_geom_state zsg_state;
		} _zso_geom;
	} _zso_state;
};

struct proc *zfsproc;

static uint32_t zvol_minors;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
	"Expose as GEOM providers (1), device files (2) or neither");
static boolean_t zpool_on_zvol = B_FALSE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
	"Allow zpools to use zvols as vdevs (DANGEROUS)");

/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;

SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
	&zvol_unmap_enabled, 0, "Enable UNMAP functionality");

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS / 2;

static void zvol_ensure_zilog(zvol_state_t *zv);

static d_open_t zvol_cdev_open;
static d_close_t zvol_cdev_close;
static d_ioctl_t zvol_cdev_ioctl;
static d_read_t zvol_cdev_read;
static d_write_t zvol_cdev_write;
static d_strategy_t zvol_geom_bio_strategy;

static struct cdevsw zvol_cdevsw = {
	.d_name =	"zvol",
	.d_version =	D_VERSION,
	.d_flags =	D_DISK | D_TRACKCLOSE,
	.d_open =	zvol_cdev_open,
	.d_close =	zvol_cdev_close,
	.d_ioctl =	zvol_cdev_ioctl,
	.d_read =	zvol_cdev_read,
	.d_write =	zvol_cdev_write,
	.d_strategy =	zvol_geom_bio_strategy,
};
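
/*
 * Note that even in volmode=dev, block-style I/O on the character
 * device is funneled through zvol_geom_bio_strategy(), which accepts
 * bios from both GEOM (bp->bio_to) and the cdev path (bp->bio_dev).
 */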

extern uint_t zfs_geom_probe_vdev_key;

struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_worker(void *arg);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
/* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */

/*
 * GEOM mode implementation
 */

/*ARGSUSED*/
static int
zvol_geom_open(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_TRUE;

	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
		/*
		 * If zfs_geom_probe_vdev_key is set, ZFS is attempting to
		 * probe GEOM providers while looking for a replacement for
		 * a missing vdev.  In this case spa_namespace_lock will not
		 * be held, but it is still illegal to use a zvol as a vdev:
		 * deadlocks can result if another thread holds
		 * spa_namespace_lock.
		 */
		return (SET_ERROR(EOPNOTSUPP));
	}

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);

	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);

	/*
	 * Make sure the zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.  Since we
	 * already hold zv_state_lock, we can only try-enter here; if
	 * that fails, drop zv_state_lock, take both locks in the
	 * correct order, and re-check whether this is still the first
	 * open.
	 */
	if (zv->zv_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		err = zvol_first_open(zv, !(flag & FWRITE));
		if (err)
			goto out_mutex;
		pp->mediasize = zv->zv_volsize;
		pp->stripeoffset = 0;
		pp->stripesize = zv->zv_volblocksize;
	}

	/*
	 * Check for a bad on-disk format version now, since we lied
	 * about being read-only when we first owned the dataset.
	 */
	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
		err = EROFS;
		goto out_open_count;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = EBUSY;
		goto out_open_count;
	}
#ifdef FEXCL
	if (flag & FEXCL) {
		if (zv->zv_open_count != 0) {
			err = EBUSY;
			goto out_open_count;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}
#endif

	zv->zv_open_count += count;
	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);

out_open_count:
	if (zv->zv_open_count == 0)
		zvol_last_close(zv);
out_mutex:
	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (SET_ERROR(err));
}

/*ARGSUSED*/
static int
zvol_geom_close(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT(zv->zv_open_count == 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT(zv->zv_open_count > 0);

	/*
	 * Make sure the zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if ((zv->zv_open_count - count) == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count -= count;

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

static void
zvol_geom_run(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);

	g_error_provider(pp, 0);

	kproc_kthread_add(zvol_geom_worker, zv, &zfsproc, NULL, 0, 0,
	    "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
}

static void
zvol_geom_destroy(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);

	g_topology_assert();

	mutex_enter(&zv->zv_state_lock);
	VERIFY(zsg->zsg_state == ZVOL_GEOM_RUNNING);
	mutex_exit(&zv->zv_state_lock);
	zsg->zsg_provider = NULL;
	pp->private = NULL;
	g_wither_geom(pp->geom, ENXIO);
}

static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
	int count, error, flags;

	g_topology_assert();

	/*
	 * To make it easier, we expect either open or close, but not
	 * both at the same time.
	 */
	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
	    (acr <= 0 && acw <= 0 && ace <= 0),
	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
	    pp->name, acr, acw, ace));

	if (pp->private == NULL) {
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		return (pp->error);
	}

	/*
	 * We don't pass the FEXCL flag to zvol_geom_open()/zvol_geom_close()
	 * if ace != 0, because GEOM already handles that and handles it a bit
	 * differently.  GEOM allows for multiple read/exclusive consumers,
	 * while ZFS allows only one exclusive consumer, no matter if it is a
	 * reader or a writer.  I prefer the way GEOM works, so I'll leave it
	 * to GEOM to decide what to do.
	 */

	count = acr + acw + ace;
	if (count == 0)
		return (0);

	flags = 0;
	if (acr != 0 || ace != 0)
		flags |= FREAD;
	if (acw != 0)
		flags |= FWRITE;

	g_topology_unlock();
	if (count > 0)
		error = zvol_geom_open(pp, flags, count);
	else
		error = zvol_geom_close(pp, flags, -count);
	g_topology_lock();
	return (error);
}

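/*
 * Worker thread servicing bios that zvol_geom_bio_start() could not
 * handle inline because the calling context was not allowed to sleep.
 * One such thread is created per GEOM-mode zvol by zvol_geom_run().
 */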
static void
zvol_geom_worker(void *arg)
{
	zvol_state_t *zv = arg;
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct bio *bp;

	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);

	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	for (;;) {
		mtx_lock(&zsg->zsg_queue_mtx);
		bp = bioq_takefirst(&zsg->zsg_queue);
		if (bp == NULL) {
			if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
				zsg->zsg_state = ZVOL_GEOM_RUNNING;
				wakeup(&zsg->zsg_state);
				mtx_unlock(&zsg->zsg_queue_mtx);
				kthread_exit();
			}
			msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
			    PRIBIO | PDROP, "zvol:io", 0);
			continue;
		}
		mtx_unlock(&zsg->zsg_queue_mtx);
		zvol_geom_bio_strategy(bp);
	}
}

static void
zvol_geom_bio_start(struct bio *bp)
{
	zvol_state_t *zv = bp->bio_to->private;
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	boolean_t first;

	if (bp->bio_cmd == BIO_GETATTR) {
		if (zvol_geom_bio_getattr(bp))
			g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

	if (!THREAD_CAN_SLEEP()) {
		mtx_lock(&zsg->zsg_queue_mtx);
		first = (bioq_first(&zsg->zsg_queue) == NULL);
		bioq_insert_tail(&zsg->zsg_queue, bp);
		mtx_unlock(&zsg->zsg_queue_mtx);
		if (first)
			wakeup_one(&zsg->zsg_queue);
		return;
	}

	zvol_geom_bio_strategy(bp);
}

static int
zvol_geom_bio_getattr(struct bio *bp)
{
	zvol_state_t *zv;

	zv = bp->bio_to->private;
	ASSERT(zv != NULL);

	spa_t *spa = dmu_objset_spa(zv->zv_objset);
	uint64_t refd, avail, usedobjs, availobjs;

	if (g_handleattr_int(bp, "GEOM::candelete", 1))
		return (0);
	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
		avail = metaslab_class_get_space(spa_normal_class(spa));
		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksavail",
		    avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
		refd = metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
			return (0);
	}
	return (1);
}

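/*
 * Common bio handler for both GEOM providers and character devices.
 * Reads and writes are bounded by the volume size and split into
 * chunks of at most zvol_maxphys bytes; BIO_DELETE is translated to
 * dmu_free_long_range() and BIO_FLUSH to a ZIL commit.
 */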
static void
zvol_geom_bio_strategy(struct bio *bp)
{
	zvol_state_t *zv;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t doread = B_FALSE;
	boolean_t is_dumpified;
	boolean_t sync;

	if (bp->bio_to)
		zv = bp->bio_to->private;
	else
		zv = bp->bio_dev->si_drv2;

	if (zv == NULL) {
		error = SET_ERROR(ENXIO);
		goto out;
	}

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);

	switch (bp->bio_cmd) {
	case BIO_READ:
		doread = B_TRUE;
		break;
	case BIO_WRITE:
	case BIO_FLUSH:
	case BIO_DELETE:
		if (zv->zv_flags & ZVOL_RDONLY) {
			error = SET_ERROR(EROFS);
			goto resume;
		}
		zvol_ensure_zilog(zv);
		if (bp->bio_cmd == BIO_FLUSH)
			goto sync;
		break;
	default:
		error = EOPNOTSUPP;
		goto resume;
	}

	off = bp->bio_offset;
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT(os != NULL);

	addr = bp->bio_data;
	resid = bp->bio_length;

	if (resid > 0 && (off < 0 || off >= volsize)) {
		error = SET_ERROR(EIO);
		goto resume;
	}

	is_dumpified = B_FALSE;
	sync = !doread && !is_dumpified &&
	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
	    doread ? RL_READER : RL_WRITER);

	if (bp->bio_cmd == BIO_DELETE) {
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			zvol_log_truncate(zv, tx, off, resid, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    off, resid);
			resid = 0;
		}
		goto unlock;
	}
	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
		if (doread) {
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, sync);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
unlock:
	zfs_rangelock_exit(lr);

	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length && off > volsize)
		error = EINVAL;

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		break;
	case BIO_READ:
		dataset_kstats_update_read_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_WRITE:
		dataset_kstats_update_write_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_DELETE:
		break;
	default:
		break;
	}

	if (sync) {
sync:
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	}
resume:
	rw_exit(&zv->zv_suspend_lock);
out:
	if (bp->bio_to)
		g_io_deliver(bp, error);
	else
		biofinish(bp, NULL, error);
}

/*
 * Character device mode implementation
 */

static int
zvol_cdev_read(struct cdev *dev, struct uio *uio, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;
	/*
	 * uio_loffset == volsize isn't an error as
	 * it's required for EOF processing.
	 */
	if (uio->uio_resid > 0 &&
	    (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
		return (SET_ERROR(EIO));

	lr = zfs_rangelock_enter(&zv->zv_rangelock, uio->uio_loffset,
	    uio->uio_resid, RL_READER);
	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);

		/* don't read past the end */
		if (bytes > volsize - uio->uio_loffset)
			bytes = volsize - uio->uio_loffset;

		error = dmu_read_uio_dnode(zv->zv_dn, uio, bytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);

	return (error);
}

static int
zvol_cdev_write(struct cdev *dev, struct uio *uio, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t sync;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;

	if (uio->uio_resid > 0 &&
	    (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
		return (SET_ERROR(EIO));

	sync = (ioflag & IO_SYNC) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	zvol_ensure_zilog(zv);

	lr = zfs_rangelock_enter(&zv->zv_rangelock, uio->uio_loffset,
	    uio->uio_resid, RL_WRITER);
	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
		uint64_t off = uio->uio_loffset;
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, uio, bytes, tx);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, sync);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);
	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	rw_exit(&zv->zv_suspend_lock);
	return (error);
}

static int
zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	int err = 0;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);

	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV);

	/*
	 * Make sure the zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		err = zvol_first_open(zv, !(flags & FWRITE));
		if (err)
			goto out_locked;
	}

	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = EROFS;
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = EBUSY;
		goto out_opened;
	}
#ifdef FEXCL
	if (flags & FEXCL) {
		if (zv->zv_open_count != 0) {
			err = EBUSY;
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}
#endif

	zv->zv_open_count++;
	if (flags & (FSYNC | FDSYNC)) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt++;
		if (zsd->zsd_sync_cnt == 1)
			zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
	}

	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);

out_opened:
	if (zv->zv_open_count == 0)
		zvol_last_close(zv);
out_locked:
	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (SET_ERROR(err));
}

static int
zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT(zv->zv_open_count == 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT(zv->zv_open_count > 0);
	/*
	 * Make sure the zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count--;
	if (flags & (FSYNC | FDSYNC)) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt--;
	}

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

static int
zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
    int fflag, struct thread *td)
{
	zvol_state_t *zv;
	zfs_locked_range_t *lr;
	off_t offset, length;
	int i, error;
	boolean_t sync;

	zv = dev->si_drv2;

	error = 0;
	KASSERT(zv->zv_open_count > 0,
	    ("Device with zero access count in %s", __func__));

	i = IOCPARM_LEN(cmd);
	switch (cmd) {
	case DIOCGSECTORSIZE:
		*(uint32_t *)data = DEV_BSIZE;
		break;
	case DIOCGMEDIASIZE:
		*(off_t *)data = zv->zv_volsize;
		break;
	case DIOCGFLUSH:
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		if (zv->zv_zilog != NULL)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGDELETE:
		if (!zvol_unmap_enabled)
			break;

		offset = ((off_t *)data)[0];
		length = ((off_t *)data)[1];
		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
		    offset < 0 || offset >= zv->zv_volsize ||
		    length <= 0) {
			printf("%s: offset=%jd length=%jd\n", __func__, offset,
			    length);
			error = EINVAL;
			break;
		}
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		zvol_ensure_zilog(zv);
		lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
		    RL_WRITER);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			sync = B_FALSE;
			dmu_tx_abort(tx);
		} else {
			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
			zvol_log_truncate(zv, tx, offset, length, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    offset, length);
		}
		zfs_rangelock_exit(lr);
		if (sync)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGSTRIPESIZE:
		*(off_t *)data = zv->zv_volblocksize;
		break;
	case DIOCGSTRIPEOFFSET:
		*(off_t *)data = 0;
		break;
	case DIOCGATTR: {
		spa_t *spa = dmu_objset_spa(zv->zv_objset);
		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
		uint64_t refd, avail, usedobjs, availobjs;

		if (strcmp(arg->name, "GEOM::candelete") == 0)
			arg->value.i = 1;
		else if (strcmp(arg->name, "blocksavail") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "blocksused") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = refd / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
			avail = metaslab_class_get_space(
			    spa_normal_class(spa));
			avail -= metaslab_class_get_alloc(
			    spa_normal_class(spa));
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksused") == 0) {
			refd = metaslab_class_get_alloc(spa_normal_class(spa));
			arg->value.off = refd / DEV_BSIZE;
		} else
			error = ENOIOCTL;
		break;
	}
	case FIOSEEKHOLE:
	case FIOSEEKDATA: {
		off_t *off = (off_t *)data;
		uint64_t noff;
		boolean_t hole;

		hole = (cmd == FIOSEEKHOLE);
		noff = *off;
		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
		*off = noff;
		break;
	}
	default:
		error = ENOIOCTL;
	}

	return (error);
}
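
/*
 * Illustrative sketch (not part of this file's interfaces): from
 * userland, a TRIM of a volmode=dev zvol can be requested through
 * DIOCGDELETE with a two-element off_t array, e.g.:
 *
 *	off_t args[2] = { offset, length };	(both DEV_BSIZE-aligned)
 *	if (ioctl(fd, DIOCGDELETE, args) == -1)
 *		warn("DIOCGDELETE");
 */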

/*
 * Misc. helpers
 */

static void
zvol_ensure_zilog(zvol_state_t *zv)
{
	ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

	/*
	 * Open a ZIL if this is the first time we have written to this
	 * zvol.  We protect zv->zv_zilog with zv_suspend_lock rather
	 * than zv_state_lock so that we don't need to acquire an
	 * additional lock in this path.
	 */
	if (zv->zv_zilog == NULL) {
		if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
		}
		if (zv->zv_zilog == NULL) {
			zv->zv_zilog = zil_open(zv->zv_objset,
			    zvol_get_data);
			zv->zv_flags |= ZVOL_WRITTEN_TO;
		}
		rw_downgrade(&zv->zv_suspend_lock);
	}
}

static boolean_t
zvol_is_zvol_impl(const char *device)
{
	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
}

static void
zvol_rename_minor(zvol_state_t *zv, const char *newname)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/* move to new hashtable entry */
	zv->zv_hash = zvol_name_hash(zv->zv_name);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;
		struct g_geom *gp;

		g_topology_lock();
		gp = pp->geom;
		ASSERT(gp != NULL);

		zsg->zsg_provider = NULL;
		g_wither_provider(pp, ENXIO);

		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = zv->zv_volsize;
		pp->private = zv;
		zsg->zsg_provider = pp;
		g_error_provider(pp, 0);
		g_topology_unlock();
	} else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		dev = zsd->zsd_cdev;
		if (dev != NULL) {
			destroy_dev(dev);
			dev = zsd->zsd_cdev = NULL;
			if (zv->zv_open_count > 0) {
				zv->zv_flags &= ~ZVOL_EXCL;
				zv->zv_open_count = 0;
				/* XXX need suspend lock but lock order */
				zvol_last_close(zv);
			}
		}

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
		    == 0) {
			dev->si_iosize_max = MAXPHYS;
			zsd->zsd_cdev = dev;
		}
	}
	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
}

/*
 * Remove minor node for the specified volume.
 */
static void
zvol_free(zvol_state_t *zv)
{
	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT(zv->zv_open_count == 0);

	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;

		g_topology_lock();
		zvol_geom_destroy(zv);
		g_topology_unlock();
		mtx_destroy(&zsg->zsg_queue_mtx);
	} else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL)
			destroy_dev(dev);
	}

	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);
	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	zvol_minors--;
}

/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 */
static int
zvol_create_minor_impl(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t volmode, hash;
	int error;

	ZFS_LOG(1, "Creating ZVOL %s...", name);

	hash = zvol_name_hash(name);
	if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		return (SET_ERROR(EEXIST));
	}

	DROP_GIANT();
	/* lie and say we're read-only */
	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	error = dsl_prop_get_integer(name,
	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
	if (error != 0 || volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;
	/*
	 * zvol_alloc equivalent ...
	 */
	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
	zv->zv_hash = hash;
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_zso->zso_volmode = volmode;
	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp;
		struct g_geom *gp;

		zsg->zsg_state = ZVOL_GEOM_UNINIT;
		mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);

		g_topology_lock();
		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
		gp->start = zvol_geom_bio_start;
		gp->access = zvol_geom_access;
		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
		/* TODO: NULL check? */
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = 0;
		pp->private = zv;

		zsg->zsg_provider = pp;
		bioq_init(&zsg->zsg_queue);
	} else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name);
		if (error != 0) {
			mutex_destroy(&zv->zv_state_lock);
			kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
			kmem_free(zv, sizeof (*zv));
			dmu_objset_disown(os, B_TRUE, FTAG);
			goto out_giant;
		}
		dev->si_iosize_max = MAXPHYS;
		zsd->zsd_cdev = dev;
	}
	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);

	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			zil_destroy(dmu_objset_zil(os), B_FALSE);
		else
			zil_replay(os, zv, zvol_replay_vector);
	}
	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);

	/* XXX do prefetch */

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);

	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
		if (error == 0)
			zvol_geom_run(zv);
		g_topology_unlock();
	}
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		zvol_minors++;
		rw_exit(&zvol_state_lock);
	}
	ZFS_LOG(1, "ZVOL %s created.", name);
out_giant:
	PICKUP_GIANT();
	return (error);
}

static void
zvol_clear_private(zvol_state_t *zv)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		if (pp == NULL) /* XXX when? */
			return;

		mtx_lock(&zsg->zsg_queue_mtx);
		zsg->zsg_state = ZVOL_GEOM_STOPPED;
		pp->private = NULL;
		wakeup_one(&zsg->zsg_queue);
		while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
			msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
			    0, "zvol:w", 0);
		mtx_unlock(&zsg->zsg_queue_mtx);
		ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	}
}

static int
zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	zv->zv_volsize = volsize;
	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		if (pp == NULL) /* XXX when? */
			return (0);

		g_topology_lock();

		/*
		 * Do not invoke a resize event when the initial size was
		 * zero.  The zvol initializes its size on first open; that
		 * is not a real resize.
		 */
		if (pp->mediasize == 0)
			pp->mediasize = zv->zv_volsize;
		else
			g_resize_provider(pp, zv->zv_volsize);

		g_topology_unlock();
	}
	return (0);
}

static void
zvol_set_disk_ro_impl(zvol_state_t *zv, int flags)
{
	/* XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags); */
}

static void
zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity)
{
	/* XXX? set_capacity(zv->zv_zso->zvo_disk, capacity); */
}

static const zvol_platform_ops_t zvol_freebsd_ops = {
	.zv_free = zvol_free,
	.zv_rename_minor = zvol_rename_minor,
	.zv_create_minor = zvol_create_minor_impl,
	.zv_update_volsize = zvol_update_volsize,
	.zv_clear_private = zvol_clear_private,
	.zv_is_zvol = zvol_is_zvol_impl,
	.zv_set_disk_ro = zvol_set_disk_ro_impl,
	.zv_set_capacity = zvol_set_capacity_impl,
};
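
/*
 * These platform ops are handed to the common zvol code via
 * zvol_register_ops() in zvol_init() below.
 */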

/*
 * Public interfaces
 */

int
zvol_busy(void)
{
	return (zvol_minors != 0);
}

int
zvol_init(void)
{
	zvol_init_impl();
	zvol_register_ops(&zvol_freebsd_ops);
	return (0);
}

void
zvol_fini(void)
{
	zvol_fini_impl();
}