/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 *
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 * /dev/<pool_name>/<dataset_name>
 *
 * Volumes are persistent through reboot and module load. No user command
 * needs to be run before opening and using a device.
 */

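/*
 * Illustrative example (not part of this file; assumes a pool named "tank"):
 * "zfs create -V 8G tank/vol" creates the volume, the kernel exposes it as
 * /dev/zd<minor>, and udev rules then add the /dev/tank/vol symlink
 * described above.
 */
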
#include <sys/dbuf.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
#include <sys/zio.h>
#include <sys/zfs_rlock.h>
#include <sys/zfs_znode.h>
#include <sys/zvol.h>
#include <linux/blkdev_compat.h>

unsigned int zvol_inhibit_dev = 0;
unsigned int zvol_major = ZVOL_MAJOR;
unsigned int zvol_prefetch_bytes = (128 * 1024);
unsigned long zvol_max_discard_blocks = 16384;

static kmutex_t zvol_state_lock;
static list_t zvol_state_list;
static char *zvol_tag = "zvol_tag";

/*
 * The in-core state of each volume.
 */
typedef struct zvol_state {
        char zv_name[MAXNAMELEN];       /* name */
        uint64_t zv_volsize;            /* advertised space */
        uint64_t zv_volblocksize;       /* volume block size */
        objset_t *zv_objset;            /* objset handle */
        uint32_t zv_flags;              /* ZVOL_* flags */
        uint32_t zv_open_count;         /* open counts */
        uint32_t zv_changed;            /* disk changed */
        zilog_t *zv_zilog;              /* ZIL handle */
        znode_t zv_znode;               /* for range locking */
        dmu_buf_t *zv_dbuf;             /* bonus handle */
        dev_t zv_dev;                   /* device id */
        struct gendisk *zv_disk;        /* generic disk */
        struct request_queue *zv_queue; /* request queue */
        spinlock_t zv_lock;             /* request queue lock */
        list_node_t zv_next;            /* next zvol_state_t linkage */
} zvol_state_t;

#define ZVOL_RDONLY 0x1

/*
 * Find the next available range of ZVOL_MINORS minor numbers. The
 * zvol_state_list is kept in ascending minor order so we simply need
 * to scan the list for the first gap in the sequence. This allows us
 * to recycle minor numbers as devices are created and removed.
 */
static int
zvol_find_minor(unsigned *minor)
{
        zvol_state_t *zv;

        *minor = 0;
        ASSERT(MUTEX_HELD(&zvol_state_lock));
        for (zv = list_head(&zvol_state_list); zv != NULL;
            zv = list_next(&zvol_state_list, zv), *minor += ZVOL_MINORS) {
                if (MINOR(zv->zv_dev) != MINOR(*minor))
                        break;
        }

        /* All minors are in use */
        if (*minor >= (1 << MINORBITS))
                return (SET_ERROR(ENXIO));

        return (0);
}
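
/*
 * Example (assuming ZVOL_MINORS is 16, its usual value in zvol.h): the
 * first three volumes receive minors 0, 16 and 32. If the second volume
 * is destroyed, the scan above stops at the gap and the next volume
 * created reuses minor 16.
 */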

/*
 * Find a zvol_state_t given the full major+minor dev_t.
 */
static zvol_state_t *
zvol_find_by_dev(dev_t dev)
{
        zvol_state_t *zv;

        ASSERT(MUTEX_HELD(&zvol_state_lock));
        for (zv = list_head(&zvol_state_list); zv != NULL;
            zv = list_next(&zvol_state_list, zv)) {
                if (zv->zv_dev == dev)
                        return (zv);
        }

        return (NULL);
}

/*
 * Find a zvol_state_t given the name provided at zvol_alloc() time.
 */
static zvol_state_t *
zvol_find_by_name(const char *name)
{
        zvol_state_t *zv;

        ASSERT(MUTEX_HELD(&zvol_state_lock));
        for (zv = list_head(&zvol_state_list); zv != NULL;
            zv = list_next(&zvol_state_list, zv)) {
                if (strncmp(zv->zv_name, name, MAXNAMELEN) == 0)
                        return (zv);
        }

        return (NULL);
}

/*
 * Given a path, return TRUE if path is a ZVOL.
 */
boolean_t
zvol_is_zvol(const char *device)
{
        struct block_device *bdev;
        unsigned int major;

        bdev = lookup_bdev(device);
        if (IS_ERR(bdev))
                return (B_FALSE);

        major = MAJOR(bdev->bd_dev);
        bdput(bdev);

        if (major == zvol_major)
                return (B_TRUE);

        return (B_FALSE);
}

/*
 * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation.
 */
void
zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
{
        zfs_creat_t *zct = arg;
        nvlist_t *nvprops = zct->zct_props;
        int error;
        uint64_t volblocksize, volsize;

        VERIFY(nvlist_lookup_uint64(nvprops,
            zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
        if (nvlist_lookup_uint64(nvprops,
            zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
                volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);

        /*
         * These properties must be removed from the list so the generic
         * property setting step won't apply to them.
         */
        VERIFY(nvlist_remove_all(nvprops,
            zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
        (void) nvlist_remove_all(nvprops,
            zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));

        error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
            DMU_OT_NONE, 0, tx);
        ASSERT(error == 0);

        error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
            DMU_OT_NONE, 0, tx);
        ASSERT(error == 0);

        error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
        ASSERT(error == 0);
}

/*
 * ZFS_IOC_OBJSET_STATS entry point.
 */
int
zvol_get_stats(objset_t *os, nvlist_t *nv)
{
        int error;
        dmu_object_info_t *doi;
        uint64_t val;

        error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
        if (error)
                return (SET_ERROR(error));

        dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
        doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
        error = dmu_object_info(os, ZVOL_OBJ, doi);

        if (error == 0) {
                dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
                    doi->doi_data_block_size);
        }

        kmem_free(doi, sizeof (dmu_object_info_t));

        return (SET_ERROR(error));
}

static void
zvol_size_changed(zvol_state_t *zv, uint64_t volsize)
{
        struct block_device *bdev;

        bdev = bdget_disk(zv->zv_disk, 0);
        if (bdev == NULL)
                return;
/*
 * 2.6.28 API change
 * Added check_disk_size_change() helper function.
 */
#ifdef HAVE_CHECK_DISK_SIZE_CHANGE
        set_capacity(zv->zv_disk, volsize >> 9);
        zv->zv_volsize = volsize;
        check_disk_size_change(zv->zv_disk, bdev);
#else
        zv->zv_volsize = volsize;
        zv->zv_changed = 1;
        (void) check_disk_change(bdev);
#endif /* HAVE_CHECK_DISK_SIZE_CHANGE */

        bdput(bdev);
}

/*
 * Sanity check volume size.
 */
int
zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
{
        if (volsize == 0)
                return (SET_ERROR(EINVAL));

        if (volsize % blocksize != 0)
                return (SET_ERROR(EINVAL));

#ifdef _ILP32
        if (volsize - 1 > MAXOFFSET_T)
                return (SET_ERROR(EOVERFLOW));
#endif
        return (0);
}

/*
 * Ensure the zap is flushed then inform the VFS of the capacity change.
 */
static int
zvol_update_volsize(uint64_t volsize, objset_t *os)
{
        dmu_tx_t *tx;
        int error;

        ASSERT(MUTEX_HELD(&zvol_state_lock));

        tx = dmu_tx_create(os);
        dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
        error = dmu_tx_assign(tx, TXG_WAIT);
        if (error) {
                dmu_tx_abort(tx);
                return (SET_ERROR(error));
        }

        error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
            &volsize, tx);
        dmu_tx_commit(tx);

        if (error == 0)
                error = dmu_free_long_range(os,
                    ZVOL_OBJ, volsize, DMU_OBJECT_END);

        return (error);
}

static int
zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize)
{
        zvol_size_changed(zv, volsize);

        /*
         * We should post an event here describing the expansion. However,
         * the zfs_ereport_post() interface doesn't nicely support posting
         * events for zvols, it assumes events relate to vdevs or zios.
         */

        return (0);
}

/*
 * Set ZFS_PROP_VOLSIZE entry point.
 */
int
zvol_set_volsize(const char *name, uint64_t volsize)
{
        zvol_state_t *zv = NULL;
        objset_t *os = NULL;
        int error;
        dmu_object_info_t *doi;
        uint64_t readonly;
        boolean_t owned = B_FALSE;

        error = dsl_prop_get_integer(name,
            zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
        if (error != 0)
                return (SET_ERROR(error));
        if (readonly)
                return (SET_ERROR(EROFS));

        mutex_enter(&zvol_state_lock);
        zv = zvol_find_by_name(name);

        if (zv == NULL || zv->zv_objset == NULL) {
                if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE,
                    FTAG, &os)) != 0) {
                        mutex_exit(&zvol_state_lock);
                        return (SET_ERROR(error));
                }
                owned = B_TRUE;
                if (zv != NULL)
                        zv->zv_objset = os;
        } else {
                os = zv->zv_objset;
        }

        doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

        if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) ||
            (error = zvol_check_volsize(volsize, doi->doi_data_block_size)))
                goto out;

        error = zvol_update_volsize(volsize, os);

        if (error == 0 && zv != NULL)
                error = zvol_update_live_volsize(zv, volsize);
out:
        /* Freeing doi here also covers the error goto above. */
        kmem_free(doi, sizeof (dmu_object_info_t));
        if (owned) {
                dmu_objset_disown(os, FTAG);
                if (zv != NULL)
                        zv->zv_objset = NULL;
        }
        mutex_exit(&zvol_state_lock);
        return (error);
}

/*
 * Sanity check volume block size.
 */
int
zvol_check_volblocksize(const char *name, uint64_t volblocksize)
{
        /* Record sizes above 128k need the feature to be enabled */
        if (volblocksize > SPA_OLD_MAXBLOCKSIZE) {
                spa_t *spa;
                int error;

                if ((error = spa_open(name, &spa, FTAG)) != 0)
                        return (error);

                if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
                        spa_close(spa, FTAG);
                        return (SET_ERROR(ENOTSUP));
                }

                /*
                 * We don't allow setting the property above 1MB,
                 * unless the tunable has been changed.
                 */
                if (volblocksize > zfs_max_recordsize) {
                        spa_close(spa, FTAG);
                        return (SET_ERROR(EDOM));
                }

                spa_close(spa, FTAG);
        }

        if (volblocksize < SPA_MINBLOCKSIZE ||
            volblocksize > SPA_MAXBLOCKSIZE ||
            !ISP2(volblocksize))
                return (SET_ERROR(EDOM));

        return (0);
}
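
/*
 * For example, with the default zfs_max_recordsize of 1 MiB, the valid
 * volblocksize values are the powers of two from SPA_MINBLOCKSIZE (512)
 * up to 128 KiB on any pool, and up to 1 MiB on pools with the
 * large_blocks feature enabled.
 */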

/*
 * Set ZFS_PROP_VOLBLOCKSIZE entry point.
 */
int
zvol_set_volblocksize(const char *name, uint64_t volblocksize)
{
        zvol_state_t *zv;
        dmu_tx_t *tx;
        int error;

        mutex_enter(&zvol_state_lock);

        zv = zvol_find_by_name(name);
        if (zv == NULL) {
                error = SET_ERROR(ENXIO);
                goto out;
        }

        if (zv->zv_flags & ZVOL_RDONLY) {
                error = SET_ERROR(EROFS);
                goto out;
        }

        tx = dmu_tx_create(zv->zv_objset);
        dmu_tx_hold_bonus(tx, ZVOL_OBJ);
        error = dmu_tx_assign(tx, TXG_WAIT);
        if (error) {
                dmu_tx_abort(tx);
        } else {
                error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ,
                    volblocksize, 0, tx);
                if (error == ENOTSUP)
                        error = SET_ERROR(EBUSY);
                dmu_tx_commit(tx);
                if (error == 0)
                        zv->zv_volblocksize = volblocksize;
        }
out:
        mutex_exit(&zvol_state_lock);

        return (SET_ERROR(error));
}

/*
 * Replay a TX_WRITE ZIL transaction that didn't get committed
 * after a system failure.
 */
static int
zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
{
        objset_t *os = zv->zv_objset;
        char *data = (char *)(lr + 1);  /* data follows lr_write_t */
        uint64_t off = lr->lr_offset;
        uint64_t len = lr->lr_length;
        dmu_tx_t *tx;
        int error;

        if (byteswap)
                byteswap_uint64_array(lr, sizeof (*lr));

        tx = dmu_tx_create(os);
        dmu_tx_hold_write(tx, ZVOL_OBJ, off, len);
        error = dmu_tx_assign(tx, TXG_WAIT);
        if (error) {
                dmu_tx_abort(tx);
        } else {
                dmu_write(os, ZVOL_OBJ, off, len, data, tx);
                dmu_tx_commit(tx);
        }

        return (SET_ERROR(error));
}

static int
zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
{
        return (SET_ERROR(ENOTSUP));
}

/*
 * Callback vectors for replaying records.
 * Only TX_WRITE is needed for zvol.
 */
zil_replay_func_t zvol_replay_vector[TX_MAX_TYPE] = {
        (zil_replay_func_t)zvol_replay_err,     /* no such transaction type */
        (zil_replay_func_t)zvol_replay_err,     /* TX_CREATE */
        (zil_replay_func_t)zvol_replay_err,     /* TX_MKDIR */
        (zil_replay_func_t)zvol_replay_err,     /* TX_MKXATTR */
        (zil_replay_func_t)zvol_replay_err,     /* TX_SYMLINK */
        (zil_replay_func_t)zvol_replay_err,     /* TX_REMOVE */
        (zil_replay_func_t)zvol_replay_err,     /* TX_RMDIR */
        (zil_replay_func_t)zvol_replay_err,     /* TX_LINK */
        (zil_replay_func_t)zvol_replay_err,     /* TX_RENAME */
        (zil_replay_func_t)zvol_replay_write,   /* TX_WRITE */
        (zil_replay_func_t)zvol_replay_err,     /* TX_TRUNCATE */
        (zil_replay_func_t)zvol_replay_err,     /* TX_SETATTR */
        (zil_replay_func_t)zvol_replay_err,     /* TX_ACL */
};

/*
 * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
 *
 * We store data in the log buffers if it's small enough.
 * Otherwise we will later flush the data out via dmu_sync().
 */
ssize_t zvol_immediate_write_sz = 32768;

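/*
 * A descriptive summary of the itx write states chosen below:
 *
 *      WR_INDIRECT   - block-aligned, whole-block writes whose volblocksize
 *                      exceeds the effective immediate write size (zero for
 *                      logbias=throughput) when no separate log device is in
 *                      use; the data is synced via dmu_sync() and only a
 *                      block pointer is logged.
 *      WR_COPIED     - other synchronous writes; the data is copied into the
 *                      log record itself.
 *      WR_NEED_COPY  - asynchronous writes; the data is copied later, only
 *                      if the itx actually needs to be committed.
 */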
static void
zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
    uint64_t size, int sync)
{
        uint32_t blocksize = zv->zv_volblocksize;
        zilog_t *zilog = zv->zv_zilog;
        boolean_t slogging;
        ssize_t immediate_write_sz;

        if (zil_replaying(zilog, tx))
                return;

        immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
            ? 0 : zvol_immediate_write_sz;
        slogging = spa_has_slogs(zilog->zl_spa) &&
            (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);

        while (size) {
                itx_t *itx;
                lr_write_t *lr;
                ssize_t len;
                itx_wr_state_t write_state;

                /*
                 * Unlike zfs_log_write() we can be called with
                 * up to DMU_MAX_ACCESS/2 (5MB) writes.
                 */
                if (blocksize > immediate_write_sz && !slogging &&
                    size >= blocksize && offset % blocksize == 0) {
                        write_state = WR_INDIRECT; /* uses dmu_sync */
                        len = blocksize;
                } else if (sync) {
                        write_state = WR_COPIED;
                        len = MIN(ZIL_MAX_LOG_DATA, size);
                } else {
                        write_state = WR_NEED_COPY;
                        len = MIN(ZIL_MAX_LOG_DATA, size);
                }

                itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
                    (write_state == WR_COPIED ? len : 0));
                lr = (lr_write_t *)&itx->itx_lr;
                if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
                    ZVOL_OBJ, offset, len, lr+1, DMU_READ_NO_PREFETCH) != 0) {
                        zil_itx_destroy(itx);
                        itx = zil_itx_create(TX_WRITE, sizeof (*lr));
                        lr = (lr_write_t *)&itx->itx_lr;
                        write_state = WR_NEED_COPY;
                }

                itx->itx_wr_state = write_state;
                if (write_state == WR_NEED_COPY)
                        itx->itx_sod += len;
                lr->lr_foid = ZVOL_OBJ;
                lr->lr_offset = offset;
                lr->lr_length = len;
                lr->lr_blkoff = 0;
                BP_ZERO(&lr->lr_blkptr);

                itx->itx_private = zv;
                itx->itx_sync = sync;

                (void) zil_itx_assign(zilog, itx, tx);

                offset += len;
                size -= len;
        }
}

static int
zvol_write(struct bio *bio)
{
        zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
        uint64_t offset = BIO_BI_SECTOR(bio) << 9;
        uint64_t size = BIO_BI_SIZE(bio);
        int error = 0;
        dmu_tx_t *tx;
        rl_t *rl;

        if (bio->bi_rw & VDEV_REQ_FLUSH)
                zil_commit(zv->zv_zilog, ZVOL_OBJ);

        /*
         * Some requests are just for flush and nothing else.
         */
        if (size == 0)
                goto out;

        rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER);

        tx = dmu_tx_create(zv->zv_objset);
        dmu_tx_hold_write(tx, ZVOL_OBJ, offset, size);

        /* This will only fail for ENOSPC */
        error = dmu_tx_assign(tx, TXG_WAIT);
        if (error) {
                dmu_tx_abort(tx);
                zfs_range_unlock(rl);
                goto out;
        }

        error = dmu_write_bio(zv->zv_objset, ZVOL_OBJ, bio, tx);
        if (error == 0)
                zvol_log_write(zv, tx, offset, size,
                    !!(bio->bi_rw & VDEV_REQ_FUA));

        dmu_tx_commit(tx);
        zfs_range_unlock(rl);

        if ((bio->bi_rw & VDEV_REQ_FUA) ||
            zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)
                zil_commit(zv->zv_zilog, ZVOL_OBJ);

out:
        return (error);
}

static int
zvol_discard(struct bio *bio)
{
        zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
        uint64_t start = BIO_BI_SECTOR(bio) << 9;
        uint64_t size = BIO_BI_SIZE(bio);
        uint64_t end = start + size;
        int error;
        rl_t *rl;

        if (end > zv->zv_volsize)
                return (SET_ERROR(EIO));

        /*
         * Align the request to volume block boundaries when REQ_SECURE is
         * available, but not requested. If we don't, then this will force
         * dnode_free_range() to zero out the unaligned parts, which is slow
         * (read-modify-write) and useless since we are not freeing any space
         * by doing so. Kernels that do not support REQ_SECURE (2.6.32 through
         * 2.6.35) will not receive this optimization.
         */
#ifdef REQ_SECURE
        if (!(bio->bi_rw & REQ_SECURE)) {
                start = P2ROUNDUP(start, zv->zv_volblocksize);
                end = P2ALIGN(end, zv->zv_volblocksize);
                size = end - start;
        }
#endif
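        /*
         * Example of the alignment above (assuming an 8 KiB volblocksize):
         * a discard of bytes [4096, 20480) becomes [8192, 16384); the
         * unaligned head and tail are left untouched rather than zeroed.
         */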

        if (start >= end)
                return (0);

        rl = zfs_range_lock(&zv->zv_znode, start, size, RL_WRITER);

        error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, size);

        /*
         * TODO: maybe we should add the operation to the log.
         */

        zfs_range_unlock(rl);

        return (error);
}

static int
zvol_read(struct bio *bio)
{
        zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
        uint64_t offset = BIO_BI_SECTOR(bio) << 9;
        uint64_t len = BIO_BI_SIZE(bio);
        int error;
        rl_t *rl;

        if (len == 0)
                return (0);

        rl = zfs_range_lock(&zv->zv_znode, offset, len, RL_READER);

        error = dmu_read_bio(zv->zv_objset, ZVOL_OBJ, bio);

        zfs_range_unlock(rl);

        /* convert checksum errors into IO errors */
        if (error == ECKSUM)
                error = SET_ERROR(EIO);

        return (error);
}

static MAKE_REQUEST_FN_RET
zvol_request(struct request_queue *q, struct bio *bio)
{
        zvol_state_t *zv = q->queuedata;
        fstrans_cookie_t cookie = spl_fstrans_mark();
        uint64_t offset = BIO_BI_SECTOR(bio);
        unsigned int sectors = bio_sectors(bio);
        int rw = bio_data_dir(bio);
#ifdef HAVE_GENERIC_IO_ACCT
        unsigned long start = jiffies;
#endif
        int error = 0;

        if (bio_has_data(bio) && offset + sectors >
            get_capacity(zv->zv_disk)) {
                printk(KERN_INFO
                    "%s: bad access: block=%llu, count=%lu\n",
                    zv->zv_disk->disk_name,
                    (long long unsigned)offset,
                    (long unsigned)sectors);
                error = SET_ERROR(EIO);
                goto out1;
        }

        generic_start_io_acct(rw, sectors, &zv->zv_disk->part0);

        if (rw == WRITE) {
                if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
                        error = SET_ERROR(EROFS);
                        goto out2;
                }

                if (bio->bi_rw & VDEV_REQ_DISCARD) {
                        error = zvol_discard(bio);
                        goto out2;
                }

                error = zvol_write(bio);
        } else
                error = zvol_read(bio);

out2:
        generic_end_io_acct(rw, &zv->zv_disk->part0, start);
out1:
        BIO_END_IO(bio, -error);
        spl_fstrans_unmark(cookie);
#ifdef HAVE_MAKE_REQUEST_FN_RET_INT
        return (0);
#elif defined(HAVE_MAKE_REQUEST_FN_RET_QC)
        return (BLK_QC_T_NONE);
#endif
}

static void
zvol_get_done(zgd_t *zgd, int error)
{
        if (zgd->zgd_db)
                dmu_buf_rele(zgd->zgd_db, zgd);

        zfs_range_unlock(zgd->zgd_rl);

        if (error == 0 && zgd->zgd_bp)
                zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);

        kmem_free(zgd, sizeof (zgd_t));
}

777
778/*
779 * Get data to generate a TX_WRITE intent log record.
780 */
781static int
782zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
783{
784 zvol_state_t *zv = arg;
785 objset_t *os = zv->zv_objset;
a08ee875 786 uint64_t object = ZVOL_OBJ;
60101509
BB
787 uint64_t offset = lr->lr_offset;
788 uint64_t size = lr->lr_length;
a08ee875 789 blkptr_t *bp = &lr->lr_blkptr;
60101509
BB
790 dmu_buf_t *db;
791 zgd_t *zgd;
792 int error;
793
794 ASSERT(zio != NULL);
795 ASSERT(size != 0);
796
ea04106b 797 zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
60101509
BB
798 zgd->zgd_zilog = zv->zv_zilog;
799 zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
800
801 /*
802 * Write records come in two flavors: immediate and indirect.
803 * For small writes it's cheaper to store the data with the
804 * log record (immediate); for large writes it's cheaper to
805 * sync the data and get a pointer to it (indirect) so that
806 * we don't have to write the data twice.
807 */
808 if (buf != NULL) { /* immediate write */
a08ee875 809 error = dmu_read(os, object, offset, size, buf,
60101509
BB
810 DMU_READ_NO_PREFETCH);
811 } else {
812 size = zv->zv_volblocksize;
813 offset = P2ALIGN_TYPED(offset, size, uint64_t);
a08ee875 814 error = dmu_buf_hold(os, object, offset, zgd, &db,
60101509
BB
815 DMU_READ_NO_PREFETCH);
816 if (error == 0) {
a08ee875
LG
817 blkptr_t *obp = dmu_buf_get_blkptr(db);
818 if (obp) {
819 ASSERT(BP_IS_HOLE(bp));
820 *bp = *obp;
821 }
822
60101509
BB
823 zgd->zgd_db = db;
824 zgd->zgd_bp = &lr->lr_blkptr;
825
826 ASSERT(db != NULL);
827 ASSERT(db->db_offset == offset);
828 ASSERT(db->db_size == size);
829
830 error = dmu_sync(zio, lr->lr_common.lrc_txg,
831 zvol_get_done, zgd);
832
833 if (error == 0)
834 return (0);
835 }
836 }
837
838 zvol_get_done(zgd, error);
839
a08ee875 840 return (SET_ERROR(error));
60101509
BB
841}
842
843/*
844 * The zvol_state_t's are inserted in increasing MINOR(dev_t) order.
845 */
846static void
847zvol_insert(zvol_state_t *zv_insert)
848{
849 zvol_state_t *zv = NULL;
850
851 ASSERT(MUTEX_HELD(&zvol_state_lock));
852 ASSERT3U(MINOR(zv_insert->zv_dev) & ZVOL_MINOR_MASK, ==, 0);
853 for (zv = list_head(&zvol_state_list); zv != NULL;
a08ee875 854 zv = list_next(&zvol_state_list, zv)) {
60101509
BB
855 if (MINOR(zv->zv_dev) > MINOR(zv_insert->zv_dev))
856 break;
857 }
858
859 list_insert_before(&zvol_state_list, zv, zv_insert);
860}
861
862/*
863 * Simply remove the zvol from to list of zvols.
864 */
865static void
866zvol_remove(zvol_state_t *zv_remove)
867{
868 ASSERT(MUTEX_HELD(&zvol_state_lock));
869 list_remove(&zvol_state_list, zv_remove);
870}
871
static int
zvol_first_open(zvol_state_t *zv)
{
        objset_t *os;
        uint64_t volsize;
        int locked = 0;
        int error;
        uint64_t ro;

        /*
         * In all other cases the spa_namespace_lock is taken before the
         * bdev->bd_mutex lock. But in this case the Linux __blkdev_get()
         * function calls fops->open() with the bdev->bd_mutex lock held.
         *
         * To avoid a potential lock inversion deadlock we preemptively
         * try to take the spa_namespace_lock. Normally it will not
         * be contended and this is safe because spa_open_common() handles
         * the case where the caller already holds the spa_namespace_lock.
         *
         * When it is contended we risk a lock inversion if we were to
         * block waiting for the lock. Luckily, the __blkdev_get()
         * function allows us to return -ERESTARTSYS which will result in
         * bdev->bd_mutex being dropped, reacquired, and fops->open() being
         * called again. This process can be repeated safely until both
         * locks are acquired.
         */
        if (!mutex_owned(&spa_namespace_lock)) {
                locked = mutex_tryenter(&spa_namespace_lock);
                if (!locked)
                        return (-SET_ERROR(ERESTARTSYS));
        }

        error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
        if (error)
                goto out_mutex;

        /* lie and say we're read-only */
        error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, zvol_tag, &os);
        if (error)
                goto out_mutex;

        error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
        if (error) {
                dmu_objset_disown(os, zvol_tag);
                goto out_mutex;
        }

        zv->zv_objset = os;
        error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
        if (error) {
                dmu_objset_disown(os, zvol_tag);
                goto out_mutex;
        }

        set_capacity(zv->zv_disk, volsize >> 9);
        zv->zv_volsize = volsize;
        zv->zv_zilog = zil_open(os, zvol_get_data);

        if (ro || dmu_objset_is_snapshot(os) ||
            !spa_writeable(dmu_objset_spa(os))) {
                set_disk_ro(zv->zv_disk, 1);
                zv->zv_flags |= ZVOL_RDONLY;
        } else {
                set_disk_ro(zv->zv_disk, 0);
                zv->zv_flags &= ~ZVOL_RDONLY;
        }

out_mutex:
        if (locked)
                mutex_exit(&spa_namespace_lock);

        return (SET_ERROR(-error));
}

static void
zvol_last_close(zvol_state_t *zv)
{
        zil_close(zv->zv_zilog);
        zv->zv_zilog = NULL;

        dmu_buf_rele(zv->zv_dbuf, zvol_tag);
        zv->zv_dbuf = NULL;

        /*
         * Evict cached data
         */
        if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
            !(zv->zv_flags & ZVOL_RDONLY))
                txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
        (void) dmu_objset_evict_dbufs(zv->zv_objset);

        dmu_objset_disown(zv->zv_objset, zvol_tag);
        zv->zv_objset = NULL;
}

static int
zvol_open(struct block_device *bdev, fmode_t flag)
{
        zvol_state_t *zv = bdev->bd_disk->private_data;
        int error = 0, drop_mutex = 0;

        /*
         * If the caller is already holding the mutex do not take it
         * again; this happens as part of zvol_create_minor().
         * Once add_disk() is called the device is live and the kernel
         * will attempt to open it to read the partition information.
         */
        if (!mutex_owned(&zvol_state_lock)) {
                mutex_enter(&zvol_state_lock);
                drop_mutex = 1;
        }

        ASSERT3P(zv, !=, NULL);

        if (zv->zv_open_count == 0) {
                error = zvol_first_open(zv);
                if (error)
                        goto out_mutex;
        }

        if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
                error = -EROFS;
                goto out_open_count;
        }

        zv->zv_open_count++;

out_open_count:
        if (zv->zv_open_count == 0)
                zvol_last_close(zv);

out_mutex:
        if (drop_mutex)
                mutex_exit(&zvol_state_lock);

        check_disk_change(bdev);

        return (SET_ERROR(error));
}

#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
static void
#else
static int
#endif
zvol_release(struct gendisk *disk, fmode_t mode)
{
        zvol_state_t *zv = disk->private_data;
        int drop_mutex = 0;

        if (!mutex_owned(&zvol_state_lock)) {
                mutex_enter(&zvol_state_lock);
                drop_mutex = 1;
        }

        if (zv->zv_open_count > 0) {
                zv->zv_open_count--;
                if (zv->zv_open_count == 0)
                        zvol_last_close(zv);
        }

        if (drop_mutex)
                mutex_exit(&zvol_state_lock);

#ifndef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
        return (0);
#endif
}

static int
zvol_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned int cmd, unsigned long arg)
{
        zvol_state_t *zv = bdev->bd_disk->private_data;
        int error = 0;

        if (zv == NULL)
                return (SET_ERROR(-ENXIO));

        switch (cmd) {
        case BLKFLSBUF:
                zil_commit(zv->zv_zilog, ZVOL_OBJ);
                break;
        case BLKZNAME:
                error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
                break;

        default:
                error = -ENOTTY;
                break;
        }

        return (SET_ERROR(error));
}

#ifdef CONFIG_COMPAT
static int
zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned cmd, unsigned long arg)
{
        return (zvol_ioctl(bdev, mode, cmd, arg));
}
#else
#define zvol_compat_ioctl NULL
#endif

static int
zvol_media_changed(struct gendisk *disk)
{
        zvol_state_t *zv = disk->private_data;

        return (zv->zv_changed);
}

static int
zvol_revalidate_disk(struct gendisk *disk)
{
        zvol_state_t *zv = disk->private_data;

        zv->zv_changed = 0;
        set_capacity(zv->zv_disk, zv->zv_volsize >> 9);

        return (0);
}

/*
 * Provide a simple virtual geometry for legacy compatibility. For devices
 * smaller than 1 MiB a small head and sector count is used to allow very
 * tiny devices. For devices over 1 MiB a standard head and sector count
 * is used to keep the cylinders count reasonable.
 */
static int
zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
        zvol_state_t *zv = bdev->bd_disk->private_data;
        sector_t sectors = get_capacity(zv->zv_disk);

        if (sectors > 2048) {
                geo->heads = 16;
                geo->sectors = 63;
        } else {
                geo->heads = 2;
                geo->sectors = 4;
        }

        geo->start = 0;
        geo->cylinders = sectors / (geo->heads * geo->sectors);

        return (0);
}
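
/*
 * Worked example: an 8 GiB zvol exposes 16777216 512-byte sectors, so the
 * reported geometry is 16 heads, 63 sectors/track and
 * 16777216 / (16 * 63) = 16644 cylinders.
 */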

static struct kobject *
zvol_probe(dev_t dev, int *part, void *arg)
{
        zvol_state_t *zv;
        struct kobject *kobj;

        mutex_enter(&zvol_state_lock);
        zv = zvol_find_by_dev(dev);
        kobj = zv ? get_disk(zv->zv_disk) : NULL;
        mutex_exit(&zvol_state_lock);

        return (kobj);
}

#ifdef HAVE_BDEV_BLOCK_DEVICE_OPERATIONS
static struct block_device_operations zvol_ops = {
        .open                   = zvol_open,
        .release                = zvol_release,
        .ioctl                  = zvol_ioctl,
        .compat_ioctl           = zvol_compat_ioctl,
        .media_changed          = zvol_media_changed,
        .revalidate_disk        = zvol_revalidate_disk,
        .getgeo                 = zvol_getgeo,
        .owner                  = THIS_MODULE,
};

#else /* HAVE_BDEV_BLOCK_DEVICE_OPERATIONS */

static int
zvol_open_by_inode(struct inode *inode, struct file *file)
{
        return (zvol_open(inode->i_bdev, file->f_mode));
}

static int
zvol_release_by_inode(struct inode *inode, struct file *file)
{
        return (zvol_release(inode->i_bdev->bd_disk, file->f_mode));
}

static int
zvol_ioctl_by_inode(struct inode *inode, struct file *file,
    unsigned int cmd, unsigned long arg)
{
        if (file == NULL || inode == NULL)
                return (SET_ERROR(-EINVAL));

        return (zvol_ioctl(inode->i_bdev, file->f_mode, cmd, arg));
}

#ifdef CONFIG_COMPAT
static long
zvol_compat_ioctl_by_inode(struct file *file,
    unsigned int cmd, unsigned long arg)
{
        if (file == NULL)
                return (SET_ERROR(-EINVAL));

        return (zvol_compat_ioctl(file->f_dentry->d_inode->i_bdev,
            file->f_mode, cmd, arg));
}
#else
#define zvol_compat_ioctl_by_inode NULL
#endif

static struct block_device_operations zvol_ops = {
        .open                   = zvol_open_by_inode,
        .release                = zvol_release_by_inode,
        .ioctl                  = zvol_ioctl_by_inode,
        .compat_ioctl           = zvol_compat_ioctl_by_inode,
        .media_changed          = zvol_media_changed,
        .revalidate_disk        = zvol_revalidate_disk,
        .getgeo                 = zvol_getgeo,
        .owner                  = THIS_MODULE,
};
#endif /* HAVE_BDEV_BLOCK_DEVICE_OPERATIONS */

/*
 * Allocate memory for a new zvol_state_t and setup the required
 * request queue and generic disk structures for the block device.
 */
static zvol_state_t *
zvol_alloc(dev_t dev, const char *name)
{
        zvol_state_t *zv;

        zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);

        spin_lock_init(&zv->zv_lock);
        list_link_init(&zv->zv_next);

        zv->zv_queue = blk_alloc_queue(GFP_ATOMIC);
        if (zv->zv_queue == NULL)
                goto out_kmem;

        blk_queue_make_request(zv->zv_queue, zvol_request);

#ifdef HAVE_BLK_QUEUE_FLUSH
        blk_queue_flush(zv->zv_queue, VDEV_REQ_FLUSH | VDEV_REQ_FUA);
#else
        blk_queue_ordered(zv->zv_queue, QUEUE_ORDERED_DRAIN, NULL);
#endif /* HAVE_BLK_QUEUE_FLUSH */

        zv->zv_disk = alloc_disk(ZVOL_MINORS);
        if (zv->zv_disk == NULL)
                goto out_queue;

        zv->zv_queue->queuedata = zv;
        zv->zv_dev = dev;
        zv->zv_open_count = 0;
        strlcpy(zv->zv_name, name, MAXNAMELEN);

        mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
        avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
            sizeof (rl_t), offsetof(rl_t, r_node));
        zv->zv_znode.z_is_zvol = TRUE;

        zv->zv_disk->major = zvol_major;
        zv->zv_disk->first_minor = (dev & MINORMASK);
        zv->zv_disk->fops = &zvol_ops;
        zv->zv_disk->private_data = zv;
        zv->zv_disk->queue = zv->zv_queue;
        snprintf(zv->zv_disk->disk_name, DISK_NAME_LEN, "%s%d",
            ZVOL_DEV_NAME, (dev & MINORMASK));

        return (zv);

out_queue:
        blk_cleanup_queue(zv->zv_queue);
out_kmem:
        kmem_free(zv, sizeof (zvol_state_t));

        return (NULL);
}

/*
 * Cleanup then free a zvol_state_t which was created by zvol_alloc().
 */
static void
zvol_free(zvol_state_t *zv)
{
        avl_destroy(&zv->zv_znode.z_range_avl);
        mutex_destroy(&zv->zv_znode.z_range_lock);

        del_gendisk(zv->zv_disk);
        blk_cleanup_queue(zv->zv_queue);
        put_disk(zv->zv_disk);

        kmem_free(zv, sizeof (zvol_state_t));
}

static int
__zvol_snapdev_hidden(const char *name)
{
        uint64_t snapdev;
        char *parent;
        char *atp;
        int error = 0;

        parent = kmem_alloc(MAXPATHLEN, KM_SLEEP);
        (void) strlcpy(parent, name, MAXPATHLEN);

        if ((atp = strrchr(parent, '@')) != NULL) {
                *atp = '\0';
                error = dsl_prop_get_integer(parent, "snapdev", &snapdev, NULL);
                if ((error == 0) && (snapdev == ZFS_SNAPDEV_HIDDEN))
                        error = SET_ERROR(ENODEV);
        }

        kmem_free(parent, MAXPATHLEN);

        return (SET_ERROR(error));
}

static int
__zvol_create_minor(const char *name, boolean_t ignore_snapdev)
{
        zvol_state_t *zv;
        objset_t *os;
        dmu_object_info_t *doi;
        uint64_t volsize;
        uint64_t len;
        unsigned minor = 0;
        int error = 0;

        ASSERT(MUTEX_HELD(&zvol_state_lock));

        zv = zvol_find_by_name(name);
        if (zv) {
                error = SET_ERROR(EEXIST);
                goto out;
        }

        if (ignore_snapdev == B_FALSE) {
                error = __zvol_snapdev_hidden(name);
                if (error)
                        goto out;
        }

        doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

        error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, zvol_tag, &os);
        if (error)
                goto out_doi;

        error = dmu_object_info(os, ZVOL_OBJ, doi);
        if (error)
                goto out_dmu_objset_disown;

        error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
        if (error)
                goto out_dmu_objset_disown;

        error = zvol_find_minor(&minor);
        if (error)
                goto out_dmu_objset_disown;

        zv = zvol_alloc(MKDEV(zvol_major, minor), name);
        if (zv == NULL) {
                error = SET_ERROR(EAGAIN);
                goto out_dmu_objset_disown;
        }

        if (dmu_objset_is_snapshot(os))
                zv->zv_flags |= ZVOL_RDONLY;

        zv->zv_volblocksize = doi->doi_data_block_size;
        zv->zv_volsize = volsize;
        zv->zv_objset = os;

        set_capacity(zv->zv_disk, zv->zv_volsize >> 9);

        blk_queue_max_hw_sectors(zv->zv_queue, (DMU_MAX_ACCESS / 4) >> 9);
        blk_queue_max_segments(zv->zv_queue, UINT16_MAX);
        blk_queue_max_segment_size(zv->zv_queue, UINT_MAX);
        blk_queue_physical_block_size(zv->zv_queue, zv->zv_volblocksize);
        blk_queue_io_opt(zv->zv_queue, zv->zv_volblocksize);
        blk_queue_max_discard_sectors(zv->zv_queue,
            (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
        blk_queue_discard_granularity(zv->zv_queue, zv->zv_volblocksize);
        queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zv->zv_queue);
#ifdef QUEUE_FLAG_NONROT
        queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zv->zv_queue);
#endif
#ifdef QUEUE_FLAG_ADD_RANDOM
        queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zv->zv_queue);
#endif

        if (spa_writeable(dmu_objset_spa(os))) {
                if (zil_replay_disable)
                        zil_destroy(dmu_objset_zil(os), B_FALSE);
                else
                        zil_replay(os, zv, zvol_replay_vector);
        }

        /*
         * When udev detects the addition of the device it will immediately
         * invoke blkid(8) to determine the type of content on the device.
         * Prefetching the blocks commonly scanned by blkid(8) will speed
         * up this process.
         */
        len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE);
        if (len > 0) {
                dmu_prefetch(os, ZVOL_OBJ, 0, len);
                dmu_prefetch(os, ZVOL_OBJ, volsize - len, len);
        }

        zv->zv_objset = NULL;
out_dmu_objset_disown:
        dmu_objset_disown(os, zvol_tag);
out_doi:
        kmem_free(doi, sizeof (dmu_object_info_t));
out:

        if (error == 0) {
                zvol_insert(zv);
                add_disk(zv->zv_disk);
        }

        return (SET_ERROR(error));
}

/*
 * Create a block device minor node and setup the linkage between it
 * and the specified volume. Once this function returns the block
 * device is live and ready for use.
 */
int
zvol_create_minor(const char *name)
{
        int error;

        mutex_enter(&zvol_state_lock);
        error = __zvol_create_minor(name, B_FALSE);
        mutex_exit(&zvol_state_lock);

        return (SET_ERROR(error));
}

static int
__zvol_remove_minor(const char *name)
{
        zvol_state_t *zv;

        ASSERT(MUTEX_HELD(&zvol_state_lock));

        zv = zvol_find_by_name(name);
        if (zv == NULL)
                return (SET_ERROR(ENXIO));

        if (zv->zv_open_count > 0)
                return (SET_ERROR(EBUSY));

        zvol_remove(zv);
        zvol_free(zv);

        return (0);
}

/*
 * Remove a block device minor node for the specified volume.
 */
int
zvol_remove_minor(const char *name)
{
        int error;

        mutex_enter(&zvol_state_lock);
        error = __zvol_remove_minor(name);
        mutex_exit(&zvol_state_lock);

        return (SET_ERROR(error));
}

/*
 * Rename a block device minor node for the specified volume.
 */
static void
__zvol_rename_minor(zvol_state_t *zv, const char *newname)
{
        int readonly = get_disk_ro(zv->zv_disk);

        ASSERT(MUTEX_HELD(&zvol_state_lock));

        strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));

        /*
         * The block device's read-only state is briefly changed causing
         * a KOBJ_CHANGE uevent to be issued. This ensures udev detects
         * the name change and fixes the symlinks. This does not change
         * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
         * changes. This would normally be done using kobject_uevent() but
         * that is a GPL-only symbol which is why we need this workaround.
         */
        set_disk_ro(zv->zv_disk, !readonly);
        set_disk_ro(zv->zv_disk, readonly);
}

static int
zvol_create_minors_cb(const char *dsname, void *arg)
{
        (void) zvol_create_minor(dsname);

        return (0);
}

/*
 * Create minors for specified dataset including children and snapshots.
 */
int
zvol_create_minors(const char *name)
{
        int error = 0;

        if (!zvol_inhibit_dev)
                error = dmu_objset_find((char *)name, zvol_create_minors_cb,
                    NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);

        return (SET_ERROR(error));
}

/*
 * Remove minors for specified dataset including children and snapshots.
 */
void
zvol_remove_minors(const char *name)
{
        zvol_state_t *zv, *zv_next;
        int namelen = ((name) ? strlen(name) : 0);

        if (zvol_inhibit_dev)
                return;

        mutex_enter(&zvol_state_lock);

        for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
                zv_next = list_next(&zvol_state_list, zv);

                if (name == NULL || strcmp(zv->zv_name, name) == 0 ||
                    (strncmp(zv->zv_name, name, namelen) == 0 &&
                    zv->zv_name[namelen] == '/')) {
                        zvol_remove(zv);
                        zvol_free(zv);
                }
        }

        mutex_exit(&zvol_state_lock);
}

/*
 * Rename minors for specified dataset including children and snapshots.
 */
void
zvol_rename_minors(const char *oldname, const char *newname)
{
        zvol_state_t *zv, *zv_next;
        int oldnamelen, newnamelen;
        char *name;

        if (zvol_inhibit_dev)
                return;

        oldnamelen = strlen(oldname);
        newnamelen = strlen(newname);
        name = kmem_alloc(MAXNAMELEN, KM_SLEEP);

        mutex_enter(&zvol_state_lock);

        for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
                zv_next = list_next(&zvol_state_list, zv);

                if (strcmp(zv->zv_name, oldname) == 0) {
                        __zvol_rename_minor(zv, newname);
                } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
                    (zv->zv_name[oldnamelen] == '/' ||
                    zv->zv_name[oldnamelen] == '@')) {
                        snprintf(name, MAXNAMELEN, "%s%c%s", newname,
                            zv->zv_name[oldnamelen],
                            zv->zv_name + oldnamelen + 1);
                        __zvol_rename_minor(zv, name);
                }
        }

        mutex_exit(&zvol_state_lock);

        kmem_free(name, MAXNAMELEN);
}

static int
snapdev_snapshot_changed_cb(const char *dsname, void *arg)
{
        uint64_t snapdev = *(uint64_t *)arg;

        if (strchr(dsname, '@') == NULL)
                return (0);

        switch (snapdev) {
        case ZFS_SNAPDEV_VISIBLE:
                mutex_enter(&zvol_state_lock);
                (void) __zvol_create_minor(dsname, B_TRUE);
                mutex_exit(&zvol_state_lock);
                break;
        case ZFS_SNAPDEV_HIDDEN:
                (void) zvol_remove_minor(dsname);
                break;
        }

        return (0);
}

int
zvol_set_snapdev(const char *dsname, uint64_t snapdev)
{
        (void) dmu_objset_find((char *)dsname, snapdev_snapshot_changed_cb,
            &snapdev, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
        /* caller should continue to modify snapdev property */
        return (-1);
}

int
zvol_init(void)
{
        int error;

        list_create(&zvol_state_list, sizeof (zvol_state_t),
            offsetof(zvol_state_t, zv_next));

        mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);

        error = register_blkdev(zvol_major, ZVOL_DRIVER);
        if (error) {
                printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
                goto out;
        }

        blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS,
            THIS_MODULE, zvol_probe, NULL, NULL);

        return (0);

out:
        mutex_destroy(&zvol_state_lock);
        list_destroy(&zvol_state_list);

        return (SET_ERROR(error));
}

void
zvol_fini(void)
{
        zvol_remove_minors(NULL);
        blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS);
        unregister_blkdev(zvol_major, ZVOL_DRIVER);
        mutex_destroy(&zvol_state_lock);
        list_destroy(&zvol_state_list);
}

module_param(zvol_inhibit_dev, uint, 0644);
MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");

module_param(zvol_major, uint, 0444);
MODULE_PARM_DESC(zvol_major, "Major number for zvol device");

module_param(zvol_max_discard_blocks, ulong, 0444);
MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");

module_param(zvol_prefetch_bytes, uint, 0644);
MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");