*
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2016 Actifio, Inc. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
*/
/*
* and zvol_release()->zvol_last_close() directly as well.
*/
+#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
#include <sys/zvol.h>
+
#include <linux/blkdev_compat.h>
+#include <linux/task_io_accounting_ops.h>
unsigned int zvol_inhibit_dev = 0;
unsigned int zvol_major = ZVOL_MAJOR;
unsigned int zvol_volmode = ZFS_VOLMODE_GEOM;
static taskq_t *zvol_taskq;
-static kmutex_t zvol_state_lock;
+static krwlock_t zvol_state_lock;
static list_t zvol_state_list;
#define ZVOL_HT_SIZE 1024
dev_t zv_dev; /* device id */
struct gendisk *zv_disk; /* generic disk */
struct request_queue *zv_queue; /* request queue */
+ dataset_kstats_t zv_kstat; /* zvol kstats */
list_node_t zv_next; /* next zvol_state_t linkage */
uint64_t zv_hash; /* name hash */
struct hlist_node zv_hlink; /* hash link */
{
zvol_state_t *zv;
- mutex_enter(&zvol_state_lock);
+ rw_enter(&zvol_state_lock, RW_READER);
for (zv = list_head(&zvol_state_list); zv != NULL;
zv = list_next(&zvol_state_list, zv)) {
mutex_enter(&zv->zv_state_lock);
if (zv->zv_dev == dev) {
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
return (zv);
}
mutex_exit(&zv->zv_state_lock);
}
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
return (NULL);
}
zvol_find_by_name_hash(const char *name, uint64_t hash, int mode)
{
zvol_state_t *zv;
- struct hlist_node *p;
+ struct hlist_node *p = NULL;
- mutex_enter(&zvol_state_lock);
+ rw_enter(&zvol_state_lock, RW_READER);
hlist_for_each(p, ZVOL_HT_HEAD(hash)) {
zv = hlist_entry(p, zvol_state_t, zv_hlink);
mutex_enter(&zv->zv_state_lock);
strncmp(zv->zv_name, name, MAXNAMELEN)
== 0);
}
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
return (zv);
}
mutex_exit(&zv->zv_state_lock);
}
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
return (NULL);
}
return (SET_ERROR(error));
}
-static void
-zvol_size_changed(zvol_state_t *zv, uint64_t volsize)
-{
- struct block_device *bdev;
-
- ASSERT(MUTEX_HELD(&zv->zv_state_lock));
-
- bdev = bdget_disk(zv->zv_disk, 0);
- if (bdev == NULL)
- return;
-
- set_capacity(zv->zv_disk, volsize >> 9);
- zv->zv_volsize = volsize;
- check_disk_size_change(zv->zv_disk, bdev);
-
- bdput(bdev);
-}
-
/*
* Sanity check volume size.
*/
return (error);
}
-static int
-zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize)
-{
- zvol_size_changed(zv, volsize);
-
- /*
- * We should post a event here describing the expansion. However,
- * the zfs_ereport_post() interface doesn't nicely support posting
- * events for zvols, it assumes events relate to vdevs or zios.
- */
-
- return (0);
-}
-
/*
- * Set ZFS_PROP_VOLSIZE set entry point.
+ * Set ZFS_PROP_VOLSIZE set entry point. Note that modifying the volume
+ * size will result in a udev "change" event being generated.
*/
int
zvol_set_volsize(const char *name, uint64_t volsize)
{
- zvol_state_t *zv = NULL;
objset_t *os = NULL;
- int error;
- dmu_object_info_t *doi;
+ struct gendisk *disk = NULL;
uint64_t readonly;
+ int error;
boolean_t owned = B_FALSE;
error = dsl_prop_get_integer(name,
if (readonly)
return (SET_ERROR(EROFS));
- zv = zvol_find_by_name(name, RW_READER);
+ zvol_state_t *zv = zvol_find_by_name(name, RW_READER);
ASSERT(zv == NULL || (MUTEX_HELD(&zv->zv_state_lock) &&
RW_READ_HELD(&zv->zv_suspend_lock)));
if (zv == NULL || zv->zv_objset == NULL) {
if (zv != NULL)
rw_exit(&zv->zv_suspend_lock);
- if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE,
+ if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, B_TRUE,
FTAG, &os)) != 0) {
if (zv != NULL)
mutex_exit(&zv->zv_state_lock);
os = zv->zv_objset;
}
- doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
+ dmu_object_info_t *doi = kmem_alloc(sizeof (*doi), KM_SLEEP);
if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) ||
(error = zvol_check_volsize(volsize, doi->doi_data_block_size)))
goto out;
error = zvol_update_volsize(volsize, os);
-
- if (error == 0 && zv != NULL)
- error = zvol_update_live_volsize(zv, volsize);
+ if (error == 0 && zv != NULL) {
+ zv->zv_volsize = volsize;
+ zv->zv_changed = 1;
+ disk = zv->zv_disk;
+ }
out:
kmem_free(doi, sizeof (dmu_object_info_t));
if (owned) {
- dmu_objset_disown(os, FTAG);
+ dmu_objset_disown(os, B_TRUE, FTAG);
if (zv != NULL)
zv->zv_objset = NULL;
} else {
if (zv != NULL)
mutex_exit(&zv->zv_state_lock);
+ if (disk != NULL)
+ revalidate_disk(disk);
+
return (SET_ERROR(error));
}
if (zv == NULL)
return (SET_ERROR(ENXIO));
- ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
- RW_READ_HELD(&zv->zv_suspend_lock));
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
if (zv->zv_flags & ZVOL_RDONLY) {
mutex_exit(&zv->zv_state_lock);
* implement DKIOCFREE/free-long-range.
*/
static int
-zvol_replay_truncate(zvol_state_t *zv, lr_truncate_t *lr, boolean_t byteswap)
+zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
{
+ zvol_state_t *zv = arg1;
+ lr_truncate_t *lr = arg2;
uint64_t offset, length;
if (byteswap)
* after a system failure
*/
static int
-zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
+zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap)
{
+ zvol_state_t *zv = arg1;
+ lr_write_t *lr = arg2;
objset_t *os = zv->zv_objset;
- char *data = (char *)(lr + 1); /* data follows lr_write_t */
- uint64_t off = lr->lr_offset;
- uint64_t len = lr->lr_length;
+ char *data = (char *)(lr + 1); /* data follows lr_write_t */
+ uint64_t offset, length;
dmu_tx_t *tx;
int error;
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
+ offset = lr->lr_offset;
+ length = lr->lr_length;
+
+ /* If it's a dmu_sync() block, write the whole block */
+ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+ uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
+ if (length < blocksize) {
+ offset -= offset % blocksize;
+ length = blocksize;
+ }
+ }
+
tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZVOL_OBJ, off, len);
+ dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
} else {
- dmu_write(os, ZVOL_OBJ, off, len, data, tx);
+ dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
dmu_tx_commit(tx);
}
- return (SET_ERROR(error));
+ return (error);
}
static int
-zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
+zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap)
{
return (SET_ERROR(ENOTSUP));
}
* Callback vectors for replaying records.
* Only TX_WRITE and TX_TRUNCATE are needed for zvol.
*/
-zil_replay_func_t zvol_replay_vector[TX_MAX_TYPE] = {
- (zil_replay_func_t)zvol_replay_err, /* no such transaction type */
- (zil_replay_func_t)zvol_replay_err, /* TX_CREATE */
- (zil_replay_func_t)zvol_replay_err, /* TX_MKDIR */
- (zil_replay_func_t)zvol_replay_err, /* TX_MKXATTR */
- (zil_replay_func_t)zvol_replay_err, /* TX_SYMLINK */
- (zil_replay_func_t)zvol_replay_err, /* TX_REMOVE */
- (zil_replay_func_t)zvol_replay_err, /* TX_RMDIR */
- (zil_replay_func_t)zvol_replay_err, /* TX_LINK */
- (zil_replay_func_t)zvol_replay_err, /* TX_RENAME */
- (zil_replay_func_t)zvol_replay_write, /* TX_WRITE */
- (zil_replay_func_t)zvol_replay_truncate, /* TX_TRUNCATE */
- (zil_replay_func_t)zvol_replay_err, /* TX_SETATTR */
- (zil_replay_func_t)zvol_replay_err, /* TX_ACL */
+zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
+ zvol_replay_err, /* no such transaction type */
+ zvol_replay_err, /* TX_CREATE */
+ zvol_replay_err, /* TX_MKDIR */
+ zvol_replay_err, /* TX_MKXATTR */
+ zvol_replay_err, /* TX_SYMLINK */
+ zvol_replay_err, /* TX_REMOVE */
+ zvol_replay_err, /* TX_RMDIR */
+ zvol_replay_err, /* TX_LINK */
+ zvol_replay_err, /* TX_RENAME */
+ zvol_replay_write, /* TX_WRITE */
+ zvol_replay_truncate, /* TX_TRUNCATE */
+ zvol_replay_err, /* TX_SETATTR */
+ zvol_replay_err, /* TX_ACL */
+ zvol_replay_err, /* TX_CREATE_ATTR */
+ zvol_replay_err, /* TX_CREATE_ACL_ATTR */
+ zvol_replay_err, /* TX_MKDIR_ACL */
+ zvol_replay_err, /* TX_MKDIR_ATTR */
+ zvol_replay_err, /* TX_MKDIR_ACL_ATTR */
+ zvol_replay_err, /* TX_WRITE2 */
};
/*
static void
zvol_write(void *arg)
{
+ int error = 0;
+
zv_request_t *zvr = arg;
struct bio *bio = zvr->bio;
uio_t uio;
- zvol_state_t *zv = zvr->zv;
- uint64_t volsize = zv->zv_volsize;
- boolean_t sync;
- int error = 0;
- unsigned long start_jif;
-
uio_from_bio(&uio, bio);
+ zvol_state_t *zv = zvr->zv;
ASSERT(zv && zv->zv_open_count > 0);
- start_jif = jiffies;
- generic_start_io_acct(WRITE, bio_sectors(bio), &zv->zv_disk->part0);
+ ssize_t start_resid = uio.uio_resid;
+ unsigned long start_jif = jiffies;
+ blk_generic_start_io_acct(zv->zv_queue, WRITE, bio_sectors(bio),
+ &zv->zv_disk->part0);
- sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+ boolean_t sync =
+ bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+ uint64_t volsize = zv->zv_volsize;
while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
uint64_t off = uio.uio_loffset;
break;
}
error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
- if (error == 0)
+ if (error == 0) {
zvol_log_write(zv, tx, off, bytes, sync);
+ }
dmu_tx_commit(tx);
if (error)
break;
}
zfs_range_unlock(zvr->rl);
+
+ int64_t nwritten = start_resid - uio.uio_resid;
+ dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
+ task_io_account_write(nwritten);
+
if (sync)
zil_commit(zv->zv_zilog, ZVOL_OBJ);
rw_exit(&zv->zv_suspend_lock);
- generic_end_io_acct(WRITE, &zv->zv_disk->part0, start_jif);
+ blk_generic_end_io_acct(zv->zv_queue, WRITE, &zv->zv_disk->part0,
+ start_jif);
BIO_END_IO(bio, -error);
kmem_free(zvr, sizeof (zv_request_t));
}
uint64_t start = BIO_BI_SECTOR(bio) << 9;
uint64_t size = BIO_BI_SIZE(bio);
uint64_t end = start + size;
+ boolean_t sync;
int error = 0;
dmu_tx_t *tx;
unsigned long start_jif;
ASSERT(zv && zv->zv_open_count > 0);
start_jif = jiffies;
- generic_start_io_acct(WRITE, bio_sectors(bio), &zv->zv_disk->part0);
+ blk_generic_start_io_acct(zv->zv_queue, WRITE, bio_sectors(bio),
+ &zv->zv_disk->part0);
+
+ sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
if (end > zv->zv_volsize) {
error = SET_ERROR(EIO);
- goto out;
+ goto unlock;
}
/*
}
if (start >= end)
- goto out;
+ goto unlock;
tx = dmu_tx_create(zv->zv_objset);
dmu_tx_mark_netfree(tx);
error = dmu_free_long_range(zv->zv_objset,
ZVOL_OBJ, start, size);
}
-
-out:
+unlock:
zfs_range_unlock(zvr->rl);
+ if (error == 0 && sync)
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+
rw_exit(&zv->zv_suspend_lock);
- generic_end_io_acct(WRITE, &zv->zv_disk->part0, start_jif);
+ blk_generic_end_io_acct(zv->zv_queue, WRITE, &zv->zv_disk->part0,
+ start_jif);
BIO_END_IO(bio, -error);
kmem_free(zvr, sizeof (zv_request_t));
}
static void
zvol_read(void *arg)
{
+ int error = 0;
+
zv_request_t *zvr = arg;
struct bio *bio = zvr->bio;
uio_t uio;
- zvol_state_t *zv = zvr->zv;
- uint64_t volsize = zv->zv_volsize;
- int error = 0;
- unsigned long start_jif;
-
uio_from_bio(&uio, bio);
+ zvol_state_t *zv = zvr->zv;
ASSERT(zv && zv->zv_open_count > 0);
- start_jif = jiffies;
- generic_start_io_acct(READ, bio_sectors(bio), &zv->zv_disk->part0);
+ ssize_t start_resid = uio.uio_resid;
+ unsigned long start_jif = jiffies;
+ blk_generic_start_io_acct(zv->zv_queue, READ, bio_sectors(bio),
+ &zv->zv_disk->part0);
+ uint64_t volsize = zv->zv_volsize;
while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
}
zfs_range_unlock(zvr->rl);
+ int64_t nread = start_resid - uio.uio_resid;
+ dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
+ task_io_account_read(nread);
+
rw_exit(&zv->zv_suspend_lock);
- generic_end_io_acct(READ, &zv->zv_disk->part0, start_jif);
+ blk_generic_end_io_acct(zv->zv_queue, READ, &zv->zv_disk->part0,
+ start_jif);
BIO_END_IO(bio, -error);
kmem_free(zvr, sizeof (zv_request_t));
}
}
if (rw == WRITE) {
+ boolean_t need_sync = B_FALSE;
+
if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
BIO_END_IO(bio, -SET_ERROR(EROFS));
goto out;
*/
zvr->rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
RL_WRITER);
+ /*
+ * Sync writes and discards execute zil_commit() which may need
+ * to take a RL_READER lock on the whole block being modified
+ * via its zillog->zl_get_data(): to avoid circular dependency
+ * issues with taskq threads execute these requests
+ * synchronously here in zvol_request().
+ */
+ need_sync = bio_is_fua(bio) ||
+ zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
if (bio_is_discard(bio) || bio_is_secure_erase(bio)) {
- if (zvol_request_sync || taskq_dispatch(zvol_taskq,
- zvol_discard, zvr, TQ_SLEEP) == TASKQID_INVALID)
+ if (zvol_request_sync || need_sync ||
+ taskq_dispatch(zvol_taskq, zvol_discard, zvr,
+ TQ_SLEEP) == TASKQID_INVALID)
zvol_discard(zvr);
} else {
- if (zvol_request_sync || taskq_dispatch(zvol_taskq,
- zvol_write, zvr, TQ_SLEEP) == TASKQID_INVALID)
+ if (zvol_request_sync || need_sync ||
+ taskq_dispatch(zvol_taskq, zvol_write, zvr,
+ TQ_SLEEP) == TASKQID_INVALID)
zvol_write(zvr);
}
} else {
zfs_range_unlock(zgd->zgd_rl);
if (error == 0 && zgd->zgd_bp)
- zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
+ zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
kmem_free(zgd, sizeof (zgd_t));
}
* Get data to generate a TX_WRITE intent log record.
*/
static int
-zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
+zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
{
zvol_state_t *zv = arg;
uint64_t offset = lr->lr_offset;
zgd_t *zgd;
int error;
- ASSERT(zio != NULL);
- ASSERT(size != 0);
+ ASSERT3P(lwb, !=, NULL);
+ ASSERT3P(zio, !=, NULL);
+ ASSERT3U(size, !=, 0);
zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
- zgd->zgd_zilog = zv->zv_zilog;
- zgd->zgd_rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
- RL_READER);
+ zgd->zgd_lwb = lwb;
/*
* Write records come in two flavors: immediate and indirect.
* we don't have to write the data twice.
*/
if (buf != NULL) { /* immediate write */
+ zgd->zgd_rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
+ RL_READER);
error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
DMU_READ_NO_PREFETCH);
- } else {
+ } else { /* indirect write */
+ /*
+ * Have to lock the whole block to ensure when it's written out
+ * and its checksum is being calculated that no one can change
+ * the data. Contrarily to zfs_get_data we need not re-check
+ * blocksize after we get the lock because it cannot be changed.
+ */
size = zv->zv_volblocksize;
offset = P2ALIGN_TYPED(offset, size, uint64_t);
+ zgd->zgd_rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
+ RL_READER);
error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
DMU_READ_NO_PREFETCH);
if (error == 0) {
static void
zvol_insert(zvol_state_t *zv)
{
- ASSERT(MUTEX_HELD(&zvol_state_lock));
+ ASSERT(RW_WRITE_HELD(&zvol_state_lock));
ASSERT3U(MINOR(zv->zv_dev) & ZVOL_MINOR_MASK, ==, 0);
list_insert_head(&zvol_state_list, zv);
hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
static void
zvol_remove(zvol_state_t *zv)
{
- ASSERT(MUTEX_HELD(&zvol_state_lock));
+ ASSERT(RW_WRITE_HELD(&zvol_state_lock));
list_remove(&zvol_state_list, zv);
hlist_del(&zv->zv_hlink);
}
uint64_t ro;
objset_t *os = zv->zv_objset;
- ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
- RW_LOCK_HELD(&zv->zv_suspend_lock));
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(RW_LOCK_HELD(&zv->zv_suspend_lock));
error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
if (error)
return (NULL);
/* block all I/O, release in zvol_resume. */
- ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
- RW_WRITE_HELD(&zv->zv_suspend_lock));
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
atomic_inc(&zv->zv_suspend_ref);
}
static int
-zvol_first_open(zvol_state_t *zv)
+zvol_first_open(zvol_state_t *zv, boolean_t readonly)
{
objset_t *os;
int error, locked = 0;
+ boolean_t ro;
ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
return (-SET_ERROR(ERESTARTSYS));
}
- /* lie and say we're read-only */
- error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, zv, &os);
+ ro = (readonly || (strchr(zv->zv_name, '@') != NULL));
+ error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, ro, B_TRUE, zv, &os);
if (error)
goto out_mutex;
error = zvol_setup_zv(zv);
if (error) {
- dmu_objset_disown(os, zv);
+ dmu_objset_disown(os, 1, zv);
zv->zv_objset = NULL;
}
zvol_shutdown_zv(zv);
- dmu_objset_disown(zv->zv_objset, zv);
+ dmu_objset_disown(zv->zv_objset, 1, zv);
zv->zv_objset = NULL;
}
{
zvol_state_t *zv;
int error = 0;
- boolean_t drop_suspend = B_FALSE;
-
- ASSERT(!mutex_owned(&zvol_state_lock));
+ boolean_t drop_suspend = B_TRUE;
- mutex_enter(&zvol_state_lock);
+ rw_enter(&zvol_state_lock, RW_READER);
/*
* Obtain a copy of private_data under the zvol_state_lock to make
* sure that either the result of zvol free code path setting
*/
zv = bdev->bd_disk->private_data;
if (zv == NULL) {
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
return (SET_ERROR(-ENXIO));
}
- /* take zv_suspend_lock before zv_state_lock */
- rw_enter(&zv->zv_suspend_lock, RW_READER);
-
mutex_enter(&zv->zv_state_lock);
-
/*
* make sure zvol is not suspended during first open
- * (hold zv_suspend_lock), otherwise, drop the lock
+ * (hold zv_suspend_lock) and respect proper lock acquisition
+ * ordering - zv_suspend_lock before zv_state_lock
*/
if (zv->zv_open_count == 0) {
- drop_suspend = B_TRUE;
+ if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
+ mutex_enter(&zv->zv_state_lock);
+ /* check to see if zv_suspend_lock is needed */
+ if (zv->zv_open_count != 0) {
+ rw_exit(&zv->zv_suspend_lock);
+ drop_suspend = B_FALSE;
+ }
+ }
} else {
- rw_exit(&zv->zv_suspend_lock);
+ drop_suspend = B_FALSE;
}
+ rw_exit(&zvol_state_lock);
- mutex_exit(&zvol_state_lock);
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(zv->zv_open_count != 0 || RW_READ_HELD(&zv->zv_suspend_lock));
if (zv->zv_open_count == 0) {
- error = zvol_first_open(zv);
+ error = zvol_first_open(zv, !(flag & FMODE_WRITE));
if (error)
goto out_mutex;
}
zv->zv_open_count++;
+ mutex_exit(&zv->zv_state_lock);
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+
check_disk_change(bdev);
+ return (0);
+
out_open_count:
if (zv->zv_open_count == 0)
zvol_last_close(zv);
+
out_mutex:
mutex_exit(&zv->zv_state_lock);
if (drop_suspend)
zvol_release(struct gendisk *disk, fmode_t mode)
{
zvol_state_t *zv;
- boolean_t drop_suspend = B_FALSE;
-
- ASSERT(!mutex_owned(&zvol_state_lock));
+ boolean_t drop_suspend = B_TRUE;
- mutex_enter(&zvol_state_lock);
+ rw_enter(&zvol_state_lock, RW_READER);
zv = disk->private_data;
- ASSERT(zv && zv->zv_open_count > 0);
-
- /* take zv_suspend_lock before zv_state_lock */
- rw_enter(&zv->zv_suspend_lock, RW_READER);
mutex_enter(&zv->zv_state_lock);
- mutex_exit(&zvol_state_lock);
-
+ ASSERT(zv->zv_open_count > 0);
/*
* make sure zvol is not suspended during last close
- * (hold zv_suspend_lock), otherwise, drop the lock
+ * (hold zv_suspend_lock) and respect proper lock acquisition
+ * ordering - zv_suspend_lock before zv_state_lock
*/
- if (zv->zv_open_count == 1)
- drop_suspend = B_TRUE;
- else
- rw_exit(&zv->zv_suspend_lock);
+ if (zv->zv_open_count == 1) {
+ if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
+ mutex_enter(&zv->zv_state_lock);
+ /* check to see if zv_suspend_lock is needed */
+ if (zv->zv_open_count != 1) {
+ rw_exit(&zv->zv_suspend_lock);
+ drop_suspend = B_FALSE;
+ }
+ }
+ } else {
+ drop_suspend = B_FALSE;
+ }
+ rw_exit(&zvol_state_lock);
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(zv->zv_open_count != 1 || RW_READ_HELD(&zv->zv_suspend_lock));
zv->zv_open_count--;
if (zv->zv_open_count == 0)
zvol_state_t *zv = bdev->bd_disk->private_data;
int error = 0;
- ASSERT(zv && zv->zv_open_count > 0);
+ ASSERT3U(zv->zv_open_count, >, 0);
switch (cmd) {
case BLKFLSBUF:
#define zvol_compat_ioctl NULL
#endif
+/*
+ * Linux 2.6.38 preferred interface.
+ */
+#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS
+static unsigned int
+zvol_check_events(struct gendisk *disk, unsigned int clearing)
+{
+ unsigned int mask = 0;
+
+ rw_enter(&zvol_state_lock, RW_READER);
+
+ zvol_state_t *zv = disk->private_data;
+ if (zv != NULL) {
+ mutex_enter(&zv->zv_state_lock);
+ mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
+ zv->zv_changed = 0;
+ mutex_exit(&zv->zv_state_lock);
+ }
+
+ rw_exit(&zvol_state_lock);
+
+ return (mask);
+}
+#else
static int zvol_media_changed(struct gendisk *disk)
{
+ int changed = 0;
+
+ rw_enter(&zvol_state_lock, RW_READER);
+
zvol_state_t *zv = disk->private_data;
+ if (zv != NULL) {
+ mutex_enter(&zv->zv_state_lock);
+ changed = zv->zv_changed;
+ zv->zv_changed = 0;
+ mutex_exit(&zv->zv_state_lock);
+ }
- ASSERT(zv && zv->zv_open_count > 0);
+ rw_exit(&zvol_state_lock);
- return (zv->zv_changed);
+ return (changed);
}
+#endif
static int zvol_revalidate_disk(struct gendisk *disk)
{
- zvol_state_t *zv = disk->private_data;
+ rw_enter(&zvol_state_lock, RW_READER);
- ASSERT(zv && zv->zv_open_count > 0);
+ zvol_state_t *zv = disk->private_data;
+ if (zv != NULL) {
+ mutex_enter(&zv->zv_state_lock);
+ set_capacity(zv->zv_disk, zv->zv_volsize >> SECTOR_BITS);
+ mutex_exit(&zv->zv_state_lock);
+ }
- zv->zv_changed = 0;
- set_capacity(zv->zv_disk, zv->zv_volsize >> 9);
+ rw_exit(&zvol_state_lock);
return (0);
}
zvol_state_t *zv = bdev->bd_disk->private_data;
sector_t sectors;
- ASSERT(zv && zv->zv_open_count > 0);
+ ASSERT3U(zv->zv_open_count, >, 0);
sectors = get_capacity(zv->zv_disk);
struct kobject *kobj;
zv = zvol_find_by_dev(dev);
- kobj = zv ? get_disk(zv->zv_disk) : NULL;
+ kobj = zv ? get_disk_and_module(zv->zv_disk) : NULL;
ASSERT(zv == NULL || MUTEX_HELD(&zv->zv_state_lock));
if (zv)
mutex_exit(&zv->zv_state_lock);
return (kobj);
}
-#ifdef HAVE_BDEV_BLOCK_DEVICE_OPERATIONS
static struct block_device_operations zvol_ops = {
.open = zvol_open,
.release = zvol_release,
.ioctl = zvol_ioctl,
.compat_ioctl = zvol_compat_ioctl,
- .media_changed = zvol_media_changed,
- .revalidate_disk = zvol_revalidate_disk,
- .getgeo = zvol_getgeo,
- .owner = THIS_MODULE,
-};
-
-#else /* HAVE_BDEV_BLOCK_DEVICE_OPERATIONS */
-
-static int
-zvol_open_by_inode(struct inode *inode, struct file *file)
-{
- return (zvol_open(inode->i_bdev, file->f_mode));
-}
-
-static int
-zvol_release_by_inode(struct inode *inode, struct file *file)
-{
- return (zvol_release(inode->i_bdev->bd_disk, file->f_mode));
-}
-
-static int
-zvol_ioctl_by_inode(struct inode *inode, struct file *file,
- unsigned int cmd, unsigned long arg)
-{
- if (file == NULL || inode == NULL)
- return (SET_ERROR(-EINVAL));
-
- return (zvol_ioctl(inode->i_bdev, file->f_mode, cmd, arg));
-}
-
-#ifdef CONFIG_COMPAT
-static long
-zvol_compat_ioctl_by_inode(struct file *file,
- unsigned int cmd, unsigned long arg)
-{
- if (file == NULL)
- return (SET_ERROR(-EINVAL));
-
- return (zvol_compat_ioctl(file->f_dentry->d_inode->i_bdev,
- file->f_mode, cmd, arg));
-}
+#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS
+ .check_events = zvol_check_events,
#else
-#define zvol_compat_ioctl_by_inode NULL
-#endif
-
-static struct block_device_operations zvol_ops = {
- .open = zvol_open_by_inode,
- .release = zvol_release_by_inode,
- .ioctl = zvol_ioctl_by_inode,
- .compat_ioctl = zvol_compat_ioctl_by_inode,
.media_changed = zvol_media_changed,
+#endif
.revalidate_disk = zvol_revalidate_disk,
.getgeo = zvol_getgeo,
.owner = THIS_MODULE,
};
-#endif /* HAVE_BDEV_BLOCK_DEVICE_OPERATIONS */
/*
* Allocate memory for a new zvol_state_t and setup the required
if (volmode == ZFS_VOLMODE_DEFAULT)
volmode = zvol_volmode;
+ if (volmode == ZFS_VOLMODE_NONE)
+ return (NULL);
+
zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
list_link_init(&zv->zv_next);
blk_queue_set_read_ahead(zv->zv_queue, 1);
/* Disable write merging in favor of the ZIO pipeline. */
- queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, zv->zv_queue);
+ blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zv->zv_queue);
zv->zv_disk = alloc_disk(ZVOL_MINORS);
if (zv->zv_disk == NULL)
rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
zv->zv_disk->major = zvol_major;
+#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS
+ zv->zv_disk->events = DISK_EVENT_MEDIA_CHANGE;
+#endif
+
if (volmode == ZFS_VOLMODE_DEV) {
/*
* ZFS_VOLMODE_DEV disable partitioning on ZVOL devices: set
{
zvol_state_t *zv = arg;
- ASSERT(!MUTEX_HELD(&zvol_state_lock));
ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
ASSERT(zv->zv_open_count == 0);
ida_simple_remove(&zvol_ida, MINOR(zv->zv_dev) >> ZVOL_MINOR_BITS);
mutex_destroy(&zv->zv_state_lock);
+ dataset_kstats_destroy(&zv->zv_kstat);
kmem_free(zv, sizeof (zvol_state_t));
}
doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
- error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);
+ error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
if (error)
goto out_doi;
blk_queue_max_discard_sectors(zv->zv_queue,
(zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
blk_queue_discard_granularity(zv->zv_queue, zv->zv_volblocksize);
- queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zv->zv_queue);
+ blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_queue);
#ifdef QUEUE_FLAG_NONROT
- queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zv->zv_queue);
+ blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_queue);
#endif
#ifdef QUEUE_FLAG_ADD_RANDOM
- queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zv->zv_queue);
+ blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_queue);
#endif
if (spa_writeable(dmu_objset_spa(os))) {
else
zil_replay(os, zv, zvol_replay_vector);
}
+ ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
+ dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
/*
* When udev detects the addition of the device it will immediately
zv->zv_objset = NULL;
out_dmu_objset_disown:
- dmu_objset_disown(os, FTAG);
+ dmu_objset_disown(os, B_TRUE, FTAG);
out_doi:
kmem_free(doi, sizeof (dmu_object_info_t));
if (error == 0) {
- mutex_enter(&zvol_state_lock);
+ rw_enter(&zvol_state_lock, RW_WRITER);
zvol_insert(zv);
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
add_disk(zv->zv_disk);
} else {
ida_simple_remove(&zvol_ida, idx);
{
int readonly = get_disk_ro(zv->zv_disk);
- ASSERT(MUTEX_HELD(&zvol_state_lock));
+ ASSERT(RW_LOCK_HELD(&zvol_state_lock));
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
char *dsname = job->name;
objset_t *os = NULL;
- job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, FTAG,
- &os);
+ job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE,
+ FTAG, &os);
if (job->error == 0) {
dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ);
- dmu_objset_disown(os, FTAG);
+ dmu_objset_disown(os, B_TRUE, FTAG);
}
}
list_create(&free_list, sizeof (zvol_state_t),
offsetof(zvol_state_t, zv_next));
- mutex_enter(&zvol_state_lock);
+ rw_enter(&zvol_state_lock, RW_WRITER);
for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
zv_next = list_next(&zvol_state_list, zv);
zvol_remove(zv);
/*
- * clear this while holding zvol_state_lock so
- * zvol_open won't open it
+ * Cleared while holding zvol_state_lock as a writer
+ * which will prevent zvol_open() from opening it.
*/
zv->zv_disk->private_data = NULL;
/* Drop zv_state_lock before zvol_free() */
mutex_exit(&zv->zv_state_lock);
- /* try parallel zv_free, if failed do it in place */
+ /* Try parallel zv_free, if failed do it in place */
t = taskq_dispatch(system_taskq, zvol_free, zv,
TQ_SLEEP);
if (t == TASKQID_INVALID)
mutex_exit(&zv->zv_state_lock);
}
}
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
- /*
- * Drop zvol_state_lock before calling zvol_free()
- */
+ /* Drop zvol_state_lock before calling zvol_free() */
while ((zv = list_head(&free_list)) != NULL) {
list_remove(&free_list, zv);
zvol_free(zv);
if (zvol_inhibit_dev)
return;
- mutex_enter(&zvol_state_lock);
+ rw_enter(&zvol_state_lock, RW_WRITER);
for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
zv_next = list_next(&zvol_state_list, zv);
}
zvol_remove(zv);
- /* clear this so zvol_open won't open it */
+ /*
+ * Cleared while holding zvol_state_lock as a writer
+ * which will prevent zvol_open() from opening it.
+ */
zv->zv_disk->private_data = NULL;
mutex_exit(&zv->zv_state_lock);
}
/* Drop zvol_state_lock before calling zvol_free() */
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
if (zv != NULL)
zvol_free(zv);
oldnamelen = strlen(oldname);
newnamelen = strlen(newname);
- mutex_enter(&zvol_state_lock);
+ rw_enter(&zvol_state_lock, RW_READER);
for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
zv_next = list_next(&zvol_state_list, zv);
mutex_exit(&zv->zv_state_lock);
}
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
}
typedef struct zvol_snapdev_cb_arg {
list_create(&zvol_state_list, sizeof (zvol_state_t),
offsetof(zvol_state_t, zv_next));
- mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
+ rw_init(&zvol_state_lock, NULL, RW_DEFAULT, NULL);
ida_init(&zvol_ida);
zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri,
taskq_destroy(zvol_taskq);
out:
ida_destroy(&zvol_ida);
- mutex_destroy(&zvol_state_lock);
+ rw_destroy(&zvol_state_lock);
list_destroy(&zvol_state_list);
return (SET_ERROR(error));
taskq_destroy(zvol_taskq);
list_destroy(&zvol_state_list);
- mutex_destroy(&zvol_state_lock);
+ rw_destroy(&zvol_state_lock);
ida_destroy(&zvol_ida);
}