*
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2016 Actifio, Inc. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
*/
/*
* and zvol_release()->zvol_last_close() directly as well.
*/
+#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
#include <sys/zvol.h>
+
#include <linux/blkdev_compat.h>
+#include <linux/task_io_accounting_ops.h>
unsigned int zvol_inhibit_dev = 0;
unsigned int zvol_major = ZVOL_MAJOR;
unsigned int zvol_volmode = ZFS_VOLMODE_GEOM;
static taskq_t *zvol_taskq;
-static kmutex_t zvol_state_lock;
+static krwlock_t zvol_state_lock;
static list_t zvol_state_list;
#define ZVOL_HT_SIZE 1024
dev_t zv_dev; /* device id */
struct gendisk *zv_disk; /* generic disk */
struct request_queue *zv_queue; /* request queue */
+ dataset_kstats_t zv_kstat; /* zvol kstats */
list_node_t zv_next; /* next zvol_state_t linkage */
uint64_t zv_hash; /* name hash */
struct hlist_node zv_hlink; /* hash link */
{
zvol_state_t *zv;
- mutex_enter(&zvol_state_lock);
+ rw_enter(&zvol_state_lock, RW_READER);
for (zv = list_head(&zvol_state_list); zv != NULL;
zv = list_next(&zvol_state_list, zv)) {
mutex_enter(&zv->zv_state_lock);
if (zv->zv_dev == dev) {
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
return (zv);
}
mutex_exit(&zv->zv_state_lock);
}
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
return (NULL);
}
zvol_state_t *zv;
struct hlist_node *p = NULL;
- mutex_enter(&zvol_state_lock);
+ rw_enter(&zvol_state_lock, RW_READER);
hlist_for_each(p, ZVOL_HT_HEAD(hash)) {
zv = hlist_entry(p, zvol_state_t, zv_hlink);
mutex_enter(&zv->zv_state_lock);
strncmp(zv->zv_name, name, MAXNAMELEN)
== 0);
}
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
return (zv);
}
mutex_exit(&zv->zv_state_lock);
}
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
return (NULL);
}
return (SET_ERROR(error));
}
-static void
-zvol_size_changed(zvol_state_t *zv, uint64_t volsize)
-{
- struct block_device *bdev;
-
- ASSERT(MUTEX_HELD(&zv->zv_state_lock));
-
- bdev = bdget_disk(zv->zv_disk, 0);
- if (bdev == NULL)
- return;
-
- set_capacity(zv->zv_disk, volsize >> 9);
- zv->zv_volsize = volsize;
- check_disk_size_change(zv->zv_disk, bdev);
-
- bdput(bdev);
-}
-
/*
* Sanity check volume size.
*/
return (error);
}
-static int
-zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize)
-{
- zvol_size_changed(zv, volsize);
-
- /*
- * We should post a event here describing the expansion. However,
- * the zfs_ereport_post() interface doesn't nicely support posting
- * events for zvols, it assumes events relate to vdevs or zios.
- */
-
- return (0);
-}
-
/*
- * Set ZFS_PROP_VOLSIZE set entry point.
+ * Set ZFS_PROP_VOLSIZE set entry point. Note that modifying the volume
+ * size will result in a udev "change" event being generated.
*/
int
zvol_set_volsize(const char *name, uint64_t volsize)
{
- zvol_state_t *zv = NULL;
objset_t *os = NULL;
- int error;
- dmu_object_info_t *doi;
+ struct gendisk *disk = NULL;
uint64_t readonly;
+ int error;
boolean_t owned = B_FALSE;
error = dsl_prop_get_integer(name,
if (readonly)
return (SET_ERROR(EROFS));
- zv = zvol_find_by_name(name, RW_READER);
+ zvol_state_t *zv = zvol_find_by_name(name, RW_READER);
ASSERT(zv == NULL || (MUTEX_HELD(&zv->zv_state_lock) &&
RW_READ_HELD(&zv->zv_suspend_lock)));
os = zv->zv_objset;
}
- doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
+ dmu_object_info_t *doi = kmem_alloc(sizeof (*doi), KM_SLEEP);
if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) ||
(error = zvol_check_volsize(volsize, doi->doi_data_block_size)))
goto out;
error = zvol_update_volsize(volsize, os);
-
- if (error == 0 && zv != NULL)
- error = zvol_update_live_volsize(zv, volsize);
+ if (error == 0 && zv != NULL) {
+ zv->zv_volsize = volsize;
+ zv->zv_changed = 1;
+ disk = zv->zv_disk;
+ }
out:
kmem_free(doi, sizeof (dmu_object_info_t));
if (zv != NULL)
mutex_exit(&zv->zv_state_lock);
+ if (disk != NULL)
+ revalidate_disk(disk);
+
return (SET_ERROR(error));
}
if (zv == NULL)
return (SET_ERROR(ENXIO));
- ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
- RW_READ_HELD(&zv->zv_suspend_lock));
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
if (zv->zv_flags & ZVOL_RDONLY) {
mutex_exit(&zv->zv_state_lock);
static void
zvol_write(void *arg)
{
+ int error = 0;
+
zv_request_t *zvr = arg;
struct bio *bio = zvr->bio;
uio_t uio;
- zvol_state_t *zv = zvr->zv;
- uint64_t volsize = zv->zv_volsize;
- boolean_t sync;
- int error = 0;
- unsigned long start_jif;
-
uio_from_bio(&uio, bio);
+ zvol_state_t *zv = zvr->zv;
ASSERT(zv && zv->zv_open_count > 0);
- start_jif = jiffies;
+ ssize_t start_resid = uio.uio_resid;
+ unsigned long start_jif = jiffies;
blk_generic_start_io_acct(zv->zv_queue, WRITE, bio_sectors(bio),
&zv->zv_disk->part0);
- sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+ boolean_t sync =
+ bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+ uint64_t volsize = zv->zv_volsize;
while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
uint64_t off = uio.uio_loffset;
break;
}
error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
- if (error == 0)
+ if (error == 0) {
zvol_log_write(zv, tx, off, bytes, sync);
+ }
dmu_tx_commit(tx);
if (error)
break;
}
zfs_range_unlock(zvr->rl);
+
+ int64_t nwritten = start_resid - uio.uio_resid;
+ dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
+ task_io_account_write(nwritten);
+
if (sync)
zil_commit(zv->zv_zilog, ZVOL_OBJ);
static void
zvol_read(void *arg)
{
+ int error = 0;
+
zv_request_t *zvr = arg;
struct bio *bio = zvr->bio;
uio_t uio;
- zvol_state_t *zv = zvr->zv;
- uint64_t volsize = zv->zv_volsize;
- int error = 0;
- unsigned long start_jif;
-
uio_from_bio(&uio, bio);
+ zvol_state_t *zv = zvr->zv;
ASSERT(zv && zv->zv_open_count > 0);
- start_jif = jiffies;
+ ssize_t start_resid = uio.uio_resid;
+ unsigned long start_jif = jiffies;
blk_generic_start_io_acct(zv->zv_queue, READ, bio_sectors(bio),
&zv->zv_disk->part0);
+ uint64_t volsize = zv->zv_volsize;
while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
}
zfs_range_unlock(zvr->rl);
+ int64_t nread = start_resid - uio.uio_resid;
+ dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
+ task_io_account_read(nread);
+
rw_exit(&zv->zv_suspend_lock);
blk_generic_end_io_acct(zv->zv_queue, READ, &zv->zv_disk->part0,
start_jif);
static void
zvol_insert(zvol_state_t *zv)
{
- ASSERT(MUTEX_HELD(&zvol_state_lock));
+ ASSERT(RW_WRITE_HELD(&zvol_state_lock));
ASSERT3U(MINOR(zv->zv_dev) & ZVOL_MINOR_MASK, ==, 0);
list_insert_head(&zvol_state_list, zv);
hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
static void
zvol_remove(zvol_state_t *zv)
{
- ASSERT(MUTEX_HELD(&zvol_state_lock));
+ ASSERT(RW_WRITE_HELD(&zvol_state_lock));
list_remove(&zvol_state_list, zv);
hlist_del(&zv->zv_hlink);
}
uint64_t ro;
objset_t *os = zv->zv_objset;
- ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
- RW_LOCK_HELD(&zv->zv_suspend_lock));
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(RW_LOCK_HELD(&zv->zv_suspend_lock));
error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
if (error)
return (NULL);
/* block all I/O, release in zvol_resume. */
- ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
- RW_WRITE_HELD(&zv->zv_suspend_lock));
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
atomic_inc(&zv->zv_suspend_ref);
int error = 0;
boolean_t drop_suspend = B_TRUE;
- ASSERT(!MUTEX_HELD(&zvol_state_lock));
-
- mutex_enter(&zvol_state_lock);
+ rw_enter(&zvol_state_lock, RW_READER);
/*
* Obtain a copy of private_data under the zvol_state_lock to make
* sure that either the result of zvol free code path setting
*/
zv = bdev->bd_disk->private_data;
if (zv == NULL) {
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
return (SET_ERROR(-ENXIO));
}
} else {
drop_suspend = B_FALSE;
}
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
ASSERT(zv->zv_open_count != 0 || RW_READ_HELD(&zv->zv_suspend_lock));
zv->zv_open_count++;
+ mutex_exit(&zv->zv_state_lock);
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+
check_disk_change(bdev);
+ return (0);
+
out_open_count:
if (zv->zv_open_count == 0)
zvol_last_close(zv);
+
out_mutex:
mutex_exit(&zv->zv_state_lock);
if (drop_suspend)
zvol_state_t *zv;
boolean_t drop_suspend = B_TRUE;
- ASSERT(!MUTEX_HELD(&zvol_state_lock));
-
- mutex_enter(&zvol_state_lock);
+ rw_enter(&zvol_state_lock, RW_READER);
zv = disk->private_data;
mutex_enter(&zv->zv_state_lock);
} else {
drop_suspend = B_FALSE;
}
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
ASSERT(zv->zv_open_count != 1 || RW_READ_HELD(&zv->zv_suspend_lock));
zvol_state_t *zv = bdev->bd_disk->private_data;
int error = 0;
- ASSERT(zv && zv->zv_open_count > 0);
+ ASSERT3U(zv->zv_open_count, >, 0);
switch (cmd) {
case BLKFLSBUF:
#define zvol_compat_ioctl NULL
#endif
+/*
+ * Linux 2.6.38 preferred interface.
+ */
+#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS
+static unsigned int
+zvol_check_events(struct gendisk *disk, unsigned int clearing)
+{
+ unsigned int mask = 0;
+
+ rw_enter(&zvol_state_lock, RW_READER);
+
+ zvol_state_t *zv = disk->private_data;
+ if (zv != NULL) {
+ mutex_enter(&zv->zv_state_lock);
+ mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
+ zv->zv_changed = 0;
+ mutex_exit(&zv->zv_state_lock);
+ }
+
+ rw_exit(&zvol_state_lock);
+
+ return (mask);
+}
+#else
static int zvol_media_changed(struct gendisk *disk)
{
+ int changed = 0;
+
+ rw_enter(&zvol_state_lock, RW_READER);
+
zvol_state_t *zv = disk->private_data;
+ if (zv != NULL) {
+ mutex_enter(&zv->zv_state_lock);
+ changed = zv->zv_changed;
+ zv->zv_changed = 0;
+ mutex_exit(&zv->zv_state_lock);
+ }
- ASSERT(zv && zv->zv_open_count > 0);
+ rw_exit(&zvol_state_lock);
- return (zv->zv_changed);
+ return (changed);
}
+#endif
static int zvol_revalidate_disk(struct gendisk *disk)
{
- zvol_state_t *zv = disk->private_data;
+ rw_enter(&zvol_state_lock, RW_READER);
- ASSERT(zv && zv->zv_open_count > 0);
+ zvol_state_t *zv = disk->private_data;
+ if (zv != NULL) {
+ mutex_enter(&zv->zv_state_lock);
+ set_capacity(zv->zv_disk, zv->zv_volsize >> SECTOR_BITS);
+ mutex_exit(&zv->zv_state_lock);
+ }
- zv->zv_changed = 0;
- set_capacity(zv->zv_disk, zv->zv_volsize >> 9);
+ rw_exit(&zvol_state_lock);
return (0);
}
zvol_state_t *zv = bdev->bd_disk->private_data;
sector_t sectors;
- ASSERT(zv && zv->zv_open_count > 0);
+ ASSERT3U(zv->zv_open_count, >, 0);
sectors = get_capacity(zv->zv_disk);
return (kobj);
}
-#ifdef HAVE_BDEV_BLOCK_DEVICE_OPERATIONS
static struct block_device_operations zvol_ops = {
.open = zvol_open,
.release = zvol_release,
.ioctl = zvol_ioctl,
.compat_ioctl = zvol_compat_ioctl,
- .media_changed = zvol_media_changed,
- .revalidate_disk = zvol_revalidate_disk,
- .getgeo = zvol_getgeo,
- .owner = THIS_MODULE,
-};
-
-#else /* HAVE_BDEV_BLOCK_DEVICE_OPERATIONS */
-
-static int
-zvol_open_by_inode(struct inode *inode, struct file *file)
-{
- return (zvol_open(inode->i_bdev, file->f_mode));
-}
-
-static int
-zvol_release_by_inode(struct inode *inode, struct file *file)
-{
- return (zvol_release(inode->i_bdev->bd_disk, file->f_mode));
-}
-
-static int
-zvol_ioctl_by_inode(struct inode *inode, struct file *file,
- unsigned int cmd, unsigned long arg)
-{
- if (file == NULL || inode == NULL)
- return (SET_ERROR(-EINVAL));
-
- return (zvol_ioctl(inode->i_bdev, file->f_mode, cmd, arg));
-}
-
-#ifdef CONFIG_COMPAT
-static long
-zvol_compat_ioctl_by_inode(struct file *file,
- unsigned int cmd, unsigned long arg)
-{
- if (file == NULL)
- return (SET_ERROR(-EINVAL));
-
- return (zvol_compat_ioctl(file->f_dentry->d_inode->i_bdev,
- file->f_mode, cmd, arg));
-}
+#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS
+ .check_events = zvol_check_events,
#else
-#define zvol_compat_ioctl_by_inode NULL
-#endif
-
-static struct block_device_operations zvol_ops = {
- .open = zvol_open_by_inode,
- .release = zvol_release_by_inode,
- .ioctl = zvol_ioctl_by_inode,
- .compat_ioctl = zvol_compat_ioctl_by_inode,
.media_changed = zvol_media_changed,
+#endif
.revalidate_disk = zvol_revalidate_disk,
.getgeo = zvol_getgeo,
.owner = THIS_MODULE,
};
-#endif /* HAVE_BDEV_BLOCK_DEVICE_OPERATIONS */
/*
* Allocate memory for a new zvol_state_t and setup the required
rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
zv->zv_disk->major = zvol_major;
+#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS
+ zv->zv_disk->events = DISK_EVENT_MEDIA_CHANGE;
+#endif
+
if (volmode == ZFS_VOLMODE_DEV) {
/*
* ZFS_VOLMODE_DEV disable partitioning on ZVOL devices: set
{
zvol_state_t *zv = arg;
- ASSERT(!MUTEX_HELD(&zvol_state_lock));
ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
ASSERT(zv->zv_open_count == 0);
ida_simple_remove(&zvol_ida, MINOR(zv->zv_dev) >> ZVOL_MINOR_BITS);
mutex_destroy(&zv->zv_state_lock);
+ dataset_kstats_destroy(&zv->zv_kstat);
kmem_free(zv, sizeof (zvol_state_t));
}
else
zil_replay(os, zv, zvol_replay_vector);
}
+ ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
+ dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
/*
* When udev detects the addition of the device it will immediately
kmem_free(doi, sizeof (dmu_object_info_t));
if (error == 0) {
- mutex_enter(&zvol_state_lock);
+ rw_enter(&zvol_state_lock, RW_WRITER);
zvol_insert(zv);
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
add_disk(zv->zv_disk);
} else {
ida_simple_remove(&zvol_ida, idx);
{
int readonly = get_disk_ro(zv->zv_disk);
- ASSERT(MUTEX_HELD(&zvol_state_lock));
+ ASSERT(RW_LOCK_HELD(&zvol_state_lock));
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
list_create(&free_list, sizeof (zvol_state_t),
offsetof(zvol_state_t, zv_next));
- mutex_enter(&zvol_state_lock);
+ rw_enter(&zvol_state_lock, RW_WRITER);
for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
zv_next = list_next(&zvol_state_list, zv);
zvol_remove(zv);
/*
- * clear this while holding zvol_state_lock so
- * zvol_open won't open it
+ * Cleared while holding zvol_state_lock as a writer
+ * which will prevent zvol_open() from opening it.
*/
zv->zv_disk->private_data = NULL;
/* Drop zv_state_lock before zvol_free() */
mutex_exit(&zv->zv_state_lock);
- /* try parallel zv_free, if failed do it in place */
+ /* Try parallel zv_free, if failed do it in place */
t = taskq_dispatch(system_taskq, zvol_free, zv,
TQ_SLEEP);
if (t == TASKQID_INVALID)
mutex_exit(&zv->zv_state_lock);
}
}
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
- /*
- * Drop zvol_state_lock before calling zvol_free()
- */
+ /* Drop zvol_state_lock before calling zvol_free() */
while ((zv = list_head(&free_list)) != NULL) {
list_remove(&free_list, zv);
zvol_free(zv);
if (zvol_inhibit_dev)
return;
- mutex_enter(&zvol_state_lock);
+ rw_enter(&zvol_state_lock, RW_WRITER);
for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
zv_next = list_next(&zvol_state_list, zv);
}
zvol_remove(zv);
- /* clear this so zvol_open won't open it */
+ /*
+ * Cleared while holding zvol_state_lock as a writer
+ * which will prevent zvol_open() from opening it.
+ */
zv->zv_disk->private_data = NULL;
mutex_exit(&zv->zv_state_lock);
}
/* Drop zvol_state_lock before calling zvol_free() */
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
if (zv != NULL)
zvol_free(zv);
oldnamelen = strlen(oldname);
newnamelen = strlen(newname);
- mutex_enter(&zvol_state_lock);
+ rw_enter(&zvol_state_lock, RW_READER);
for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
zv_next = list_next(&zvol_state_list, zv);
mutex_exit(&zv->zv_state_lock);
}
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
}
typedef struct zvol_snapdev_cb_arg {
list_create(&zvol_state_list, sizeof (zvol_state_t),
offsetof(zvol_state_t, zv_next));
- mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
+ rw_init(&zvol_state_lock, NULL, RW_DEFAULT, NULL);
ida_init(&zvol_ida);
zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri,
taskq_destroy(zvol_taskq);
out:
ida_destroy(&zvol_ida);
- mutex_destroy(&zvol_state_lock);
+ rw_destroy(&zvol_state_lock);
list_destroy(&zvol_state_list);
return (SET_ERROR(error));
taskq_destroy(zvol_taskq);
list_destroy(&zvol_state_list);
- mutex_destroy(&zvol_state_lock);
+ rw_destroy(&zvol_state_lock);
ida_destroy(&zvol_ida);
}