#include "block/thread-pool.h"
#include "qemu/iov.h"
#include "raw-aio.h"
+#include "qapi/util.h"
#if defined(__APPLE__) && (__MACH__)
#include <paths.h>
#include <linux/cdrom.h>
#include <linux/fd.h>
#include <linux/fs.h>
+#ifndef FS_NOCOW_FL
+#define FS_NOCOW_FL 0x00800000 /* Do not cow file */
+#endif
#endif
#ifdef CONFIG_FIEMAP
#include <linux/fiemap.h>
}
#endif
-static void raw_probe_alignment(BlockDriverState *bs)
+static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
{
BDRVRawState *s = bs->opaque;
char *buf;
s->buf_align = 0;
#ifdef BLKSSZGET
- if (ioctl(s->fd, BLKSSZGET, §or_size) >= 0) {
+ if (ioctl(fd, BLKSSZGET, §or_size) >= 0) {
bs->request_alignment = sector_size;
}
#endif
#ifdef DKIOCGETBLOCKSIZE
- if (ioctl(s->fd, DKIOCGETBLOCKSIZE, §or_size) >= 0) {
+ if (ioctl(fd, DKIOCGETBLOCKSIZE, §or_size) >= 0) {
bs->request_alignment = sector_size;
}
#endif
#ifdef DIOCGSECTORSIZE
- if (ioctl(s->fd, DIOCGSECTORSIZE, §or_size) >= 0) {
+ if (ioctl(fd, DIOCGSECTORSIZE, §or_size) >= 0) {
bs->request_alignment = sector_size;
}
#endif
#ifdef CONFIG_XFS
if (s->is_xfs) {
struct dioattr da;
- if (xfsctl(NULL, s->fd, XFS_IOC_DIOINFO, &da) >= 0) {
+ if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) {
bs->request_alignment = da.d_miniosz;
/* The kernel returns wrong information for d_mem */
/* s->buf_align = da.d_mem; */
size_t align;
buf = qemu_memalign(MAX_BLOCKSIZE, 2 * MAX_BLOCKSIZE);
for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) {
- if (pread(s->fd, buf + align, MAX_BLOCKSIZE, 0) >= 0) {
+ if (pread(fd, buf + align, MAX_BLOCKSIZE, 0) >= 0) {
s->buf_align = align;
break;
}
size_t align;
buf = qemu_memalign(s->buf_align, MAX_BLOCKSIZE);
for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) {
- if (pread(s->fd, buf, align, 0) >= 0) {
+ if (pread(fd, buf, align, 0) >= 0) {
bs->request_alignment = align;
break;
}
}
qemu_vfree(buf);
}
+
+ if (!s->buf_align || !bs->request_alignment) {
+ error_setg(errp, "Could not find working O_DIRECT alignment. "
+ "Try cache.direct=off.");
+ }
}
static void raw_parse_flags(int bdrv_flags, int *open_flags)
}
}
+static void raw_detach_aio_context(BlockDriverState *bs)
+{
+#ifdef CONFIG_LINUX_AIO
+ BDRVRawState *s = bs->opaque;
+
+ if (s->use_aio) {
+ laio_detach_aio_context(s->aio_ctx, bdrv_get_aio_context(bs));
+ }
+#endif
+}
+
+static void raw_attach_aio_context(BlockDriverState *bs,
+ AioContext *new_context)
+{
+#ifdef CONFIG_LINUX_AIO
+ BDRVRawState *s = bs->opaque;
+
+ if (s->use_aio) {
+ laio_attach_aio_context(s->aio_ctx, new_context);
+ }
+#endif
+}
+
#ifdef CONFIG_LINUX_AIO
static int raw_set_aio(void **aio_ctx, int *use_aio, int bdrv_flags)
{
}
#endif
+ raw_attach_aio_context(bs, bdrv_get_aio_context(bs));
+
ret = 0;
fail:
if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
BDRVRawState *s;
BDRVRawReopenState *raw_s;
int ret = 0;
+ Error *local_err = NULL;
assert(state != NULL);
assert(state->bs != NULL);
s = state->bs->opaque;
- state->opaque = g_malloc0(sizeof(BDRVRawReopenState));
+ state->opaque = g_new0(BDRVRawReopenState, 1);
raw_s = state->opaque;
#ifdef CONFIG_LINUX_AIO
ret = -1;
}
}
+
+ /* Fail already reopen_prepare() if we can't get a working O_DIRECT
+ * alignment with the new fd. */
+ if (raw_s->fd != -1) {
+ raw_probe_alignment(state->bs, raw_s->fd, &local_err);
+ if (local_err) {
+ qemu_close(raw_s->fd);
+ raw_s->fd = -1;
+ error_propagate(errp, local_err);
+ ret = -EINVAL;
+ }
+ }
+
return ret;
}
state->opaque = NULL;
}
-static int raw_refresh_limits(BlockDriverState *bs)
+static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
{
BDRVRawState *s = bs->opaque;
- raw_probe_alignment(bs);
+ raw_probe_alignment(bs, s->fd, errp);
bs->bl.opt_mem_alignment = s->buf_align;
-
- return 0;
}
static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb)
}
if (len == -1 && errno == EINTR) {
continue;
+ } else if (len == -1 && errno == EINVAL &&
+ (aiocb->bs->open_flags & BDRV_O_NOCACHE) &&
+ !(aiocb->aio_type & QEMU_AIO_WRITE) &&
+ offset > 0) {
+ /* O_DIRECT pread() may fail with EINVAL when offset is unaligned
+ * after a short read. Assume that O_DIRECT short reads only occur
+ * at EOF. Therefore this is a short read, not an I/O error.
+ */
+ break;
} else if (len == -1) {
offset = -errno;
break;
* Ok, we have to do it the hard way, copy all segments into
* a single aligned buffer.
*/
- buf = qemu_blockalign(aiocb->bs, aiocb->aio_nbytes);
+ buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes);
+ if (buf == NULL) {
+ return -ENOMEM;
+ }
+
if (aiocb->aio_type & QEMU_AIO_WRITE) {
char *p = buf;
int i;
memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len);
p += aiocb->aio_iov[i].iov_len;
}
+ assert(p - buf == aiocb->aio_nbytes);
}
nbytes = handle_aiocb_rw_linear(aiocb, buf);
copy = aiocb->aio_iov[i].iov_len;
}
memcpy(aiocb->aio_iov[i].iov_base, p, copy);
+ assert(count >= copy);
p += copy;
count -= copy;
}
+ assert(count == 0);
}
qemu_vfree(buf);
acb->aio_type = type;
acb->aio_fildes = fd;
+ acb->aio_nbytes = nb_sectors * BDRV_SECTOR_SIZE;
+ acb->aio_offset = sector_num * BDRV_SECTOR_SIZE;
+
if (qiov) {
acb->aio_iov = qiov->iov;
acb->aio_niov = qiov->niov;
+ assert(qiov->size == acb->aio_nbytes);
}
- acb->aio_nbytes = nb_sectors * 512;
- acb->aio_offset = sector_num * 512;
trace_paio_submit_co(sector_num, nb_sectors, type);
pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
acb->aio_type = type;
acb->aio_fildes = fd;
+ acb->aio_nbytes = nb_sectors * BDRV_SECTOR_SIZE;
+ acb->aio_offset = sector_num * BDRV_SECTOR_SIZE;
+
if (qiov) {
acb->aio_iov = qiov->iov;
acb->aio_niov = qiov->niov;
+ assert(qiov->size == acb->aio_nbytes);
}
- acb->aio_nbytes = nb_sectors * 512;
- acb->aio_offset = sector_num * 512;
trace_paio_submit(acb, opaque, sector_num, nb_sectors, type);
pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
cb, opaque, type);
}
+static void raw_aio_plug(BlockDriverState *bs)
+{
+#ifdef CONFIG_LINUX_AIO
+ BDRVRawState *s = bs->opaque;
+ if (s->use_aio) {
+ laio_io_plug(bs, s->aio_ctx);
+ }
+#endif
+}
+
+static void raw_aio_unplug(BlockDriverState *bs)
+{
+#ifdef CONFIG_LINUX_AIO
+ BDRVRawState *s = bs->opaque;
+ if (s->use_aio) {
+ laio_io_unplug(bs, s->aio_ctx, true);
+ }
+#endif
+}
+
+static void raw_aio_flush_io_queue(BlockDriverState *bs)
+{
+#ifdef CONFIG_LINUX_AIO
+ BDRVRawState *s = bs->opaque;
+ if (s->use_aio) {
+ laio_io_unplug(bs, s->aio_ctx, false);
+ }
+#endif
+}
+
static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs,
int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
BlockDriverCompletionFunc *cb, void *opaque)
static void raw_close(BlockDriverState *bs)
{
BDRVRawState *s = bs->opaque;
+
+ raw_detach_aio_context(bs);
+
+#ifdef CONFIG_LINUX_AIO
+ if (s->use_aio) {
+ laio_cleanup(s->aio_ctx);
+ }
+#endif
if (s->fd >= 0) {
qemu_close(s->fd);
s->fd = -1;
struct stat st;
if (fstat(fd, &st))
- return -1;
+ return -errno;
if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
struct disklabel dl;
if (ioctl(fd, DIOCGDINFO, &dl))
- return -1;
+ return -errno;
return (uint64_t)dl.d_secsize *
dl.d_partitions[DISKPART(st.st_rdev)].p_size;
} else
struct stat st;
if (fstat(fd, &st))
- return -1;
+ return -errno;
if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
struct dkwedge_info dkw;
struct disklabel dl;
if (ioctl(fd, DIOCGDINFO, &dl))
- return -1;
+ return -errno;
return (uint64_t)dl.d_secsize *
dl.d_partitions[DISKPART(st.st_rdev)].p_size;
}
BDRVRawState *s = bs->opaque;
struct dk_minfo minfo;
int ret;
+ int64_t size;
ret = fd_open(bs);
if (ret < 0) {
* There are reports that lseek on some devices fails, but
* irc discussion said that contingency on contingency was overkill.
*/
- return lseek(s->fd, 0, SEEK_END);
+ size = lseek(s->fd, 0, SEEK_END);
+ if (size < 0) {
+ return -errno;
+ }
+ return size;
}
#elif defined(CONFIG_BSD)
static int64_t raw_getlength(BlockDriverState *bs)
if (size == 0)
#endif
#if defined(__APPLE__) && defined(__MACH__)
- size = LONG_LONG_MAX;
+ size = LLONG_MAX;
#else
size = lseek(fd, 0LL, SEEK_END);
+ if (size < 0) {
+ return -errno;
+ }
#endif
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
switch(s->type) {
#endif
} else {
size = lseek(fd, 0, SEEK_END);
+ if (size < 0) {
+ return -errno;
+ }
}
return size;
}
{
BDRVRawState *s = bs->opaque;
int ret;
+ int64_t size;
ret = fd_open(bs);
if (ret < 0) {
return ret;
}
- return lseek(s->fd, 0, SEEK_END);
+ size = lseek(s->fd, 0, SEEK_END);
+ if (size < 0) {
+ return -errno;
+ }
+ return size;
}
#endif
return (int64_t)st.st_blocks * 512;
}
-static int raw_create(const char *filename, QEMUOptionParameter *options,
- Error **errp)
+static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
{
int fd;
int result = 0;
int64_t total_size = 0;
+ bool nocow = false;
+ PreallocMode prealloc;
+ char *buf = NULL;
+ Error *local_err = NULL;
strstart(filename, "file:", &filename);
/* Read out options */
- while (options && options->name) {
- if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
- total_size = options->value.n / BDRV_SECTOR_SIZE;
- }
- options++;
+ total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+ BDRV_SECTOR_SIZE);
+ nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
+ buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
+ prealloc = qapi_enum_parse(PreallocMode_lookup, buf,
+ PREALLOC_MODE_MAX, PREALLOC_MODE_OFF,
+ &local_err);
+ g_free(buf);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ result = -EINVAL;
+ goto out;
}
fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
if (fd < 0) {
result = -errno;
error_setg_errno(errp, -result, "Could not create file");
- } else {
- if (ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) {
- result = -errno;
- error_setg_errno(errp, -result, "Could not resize file");
+ goto out;
+ }
+
+ if (nocow) {
+#ifdef __linux__
+ /* Set NOCOW flag to solve performance issue on fs like btrfs.
+ * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value
+ * will be ignored since any failure of this operation should not
+ * block the left work.
+ */
+ int attr;
+ if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
+ attr |= FS_NOCOW_FL;
+ ioctl(fd, FS_IOC_SETFLAGS, &attr);
}
- if (qemu_close(fd) != 0) {
- result = -errno;
- error_setg_errno(errp, -result, "Could not close the new file");
+#endif
+ }
+
+ if (ftruncate(fd, total_size) != 0) {
+ result = -errno;
+ error_setg_errno(errp, -result, "Could not resize file");
+ goto out_close;
+ }
+
+ if (prealloc == PREALLOC_MODE_FALLOC) {
+ /* posix_fallocate() doesn't set errno. */
+ result = -posix_fallocate(fd, 0, total_size);
+ if (result != 0) {
+ error_setg_errno(errp, -result,
+ "Could not preallocate data for the new file");
+ }
+ } else if (prealloc == PREALLOC_MODE_FULL) {
+ buf = g_malloc0(65536);
+ int64_t num = 0, left = total_size;
+
+ while (left > 0) {
+ num = MIN(left, 65536);
+ result = write(fd, buf, num);
+ if (result < 0) {
+ result = -errno;
+ error_setg_errno(errp, -result,
+ "Could not write to the new file");
+ break;
+ }
+ left -= num;
}
+ fsync(fd);
+ g_free(buf);
+ } else if (prealloc != PREALLOC_MODE_OFF) {
+ result = -EINVAL;
+ error_setg(errp, "Unsupported preallocation mode: %s",
+ PreallocMode_lookup[prealloc]);
}
+
+out_close:
+ if (qemu_close(fd) != 0 && result == 0) {
+ result = -errno;
+ error_setg_errno(errp, -result, "Could not close the new file");
+ }
+out:
return result;
}
return 0;
}
-static QEMUOptionParameter raw_create_options[] = {
- {
- .name = BLOCK_OPT_SIZE,
- .type = OPT_SIZE,
- .help = "Virtual disk size"
- },
- { NULL }
+static QemuOptsList raw_create_opts = {
+ .name = "raw-create-opts",
+ .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head),
+ .desc = {
+ {
+ .name = BLOCK_OPT_SIZE,
+ .type = QEMU_OPT_SIZE,
+ .help = "Virtual disk size"
+ },
+ {
+ .name = BLOCK_OPT_NOCOW,
+ .type = QEMU_OPT_BOOL,
+ .help = "Turn off copy-on-write (valid only on btrfs)"
+ },
+ {
+ .name = BLOCK_OPT_PREALLOC,
+ .type = QEMU_OPT_STRING,
+ .help = "Preallocation mode (allowed values: off, falloc, full)"
+ },
+ { /* end of list */ }
+ }
};
static BlockDriver bdrv_file = {
.bdrv_aio_flush = raw_aio_flush,
.bdrv_aio_discard = raw_aio_discard,
.bdrv_refresh_limits = raw_refresh_limits,
+ .bdrv_io_plug = raw_aio_plug,
+ .bdrv_io_unplug = raw_aio_unplug,
+ .bdrv_flush_io_queue = raw_aio_flush_io_queue,
.bdrv_truncate = raw_truncate,
.bdrv_getlength = raw_getlength,
.bdrv_get_allocated_file_size
= raw_get_allocated_file_size,
- .create_options = raw_create_options,
+ .bdrv_detach_aio_context = raw_detach_aio_context,
+ .bdrv_attach_aio_context = raw_attach_aio_context,
+
+ .create_opts = &raw_create_opts,
};
/***********************************************/
return -ENOTSUP;
}
-static int hdev_create(const char *filename, QEMUOptionParameter *options,
+static int hdev_create(const char *filename, QemuOpts *opts,
Error **errp)
{
int fd;
(void)has_prefix;
/* Read out options */
- while (options && options->name) {
- if (!strcmp(options->name, "size")) {
- total_size = options->value.n / BDRV_SECTOR_SIZE;
- }
- options++;
- }
+ total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+ BDRV_SECTOR_SIZE);
fd = qemu_open(filename, O_WRONLY | O_BINARY);
if (fd < 0) {
error_setg(errp,
"The given file is neither a block nor a character device");
ret = -ENODEV;
- } else if (lseek(fd, 0, SEEK_END) < total_size * BDRV_SECTOR_SIZE) {
+ } else if (lseek(fd, 0, SEEK_END) < total_size) {
error_setg(errp, "Device is too small");
ret = -ENOSPC;
}
.bdrv_reopen_prepare = raw_reopen_prepare,
.bdrv_reopen_commit = raw_reopen_commit,
.bdrv_reopen_abort = raw_reopen_abort,
- .bdrv_create = hdev_create,
- .create_options = raw_create_options,
+ .bdrv_create = hdev_create,
+ .create_opts = &raw_create_opts,
.bdrv_co_write_zeroes = hdev_co_write_zeroes,
.bdrv_aio_readv = raw_aio_readv,
.bdrv_aio_flush = raw_aio_flush,
.bdrv_aio_discard = hdev_aio_discard,
.bdrv_refresh_limits = raw_refresh_limits,
+ .bdrv_io_plug = raw_aio_plug,
+ .bdrv_io_unplug = raw_aio_unplug,
+ .bdrv_flush_io_queue = raw_aio_flush_io_queue,
.bdrv_truncate = raw_truncate,
.bdrv_getlength = raw_getlength,
.bdrv_get_allocated_file_size
= raw_get_allocated_file_size,
+ .bdrv_detach_aio_context = raw_detach_aio_context,
+ .bdrv_attach_aio_context = raw_attach_aio_context,
+
/* generic scsi device */
#ifdef __linux__
.bdrv_ioctl = hdev_ioctl,
.bdrv_reopen_prepare = raw_reopen_prepare,
.bdrv_reopen_commit = raw_reopen_commit,
.bdrv_reopen_abort = raw_reopen_abort,
- .bdrv_create = hdev_create,
- .create_options = raw_create_options,
+ .bdrv_create = hdev_create,
+ .create_opts = &raw_create_opts,
.bdrv_aio_readv = raw_aio_readv,
.bdrv_aio_writev = raw_aio_writev,
.bdrv_aio_flush = raw_aio_flush,
.bdrv_refresh_limits = raw_refresh_limits,
+ .bdrv_io_plug = raw_aio_plug,
+ .bdrv_io_unplug = raw_aio_unplug,
+ .bdrv_flush_io_queue = raw_aio_flush_io_queue,
.bdrv_truncate = raw_truncate,
.bdrv_getlength = raw_getlength,
.bdrv_get_allocated_file_size
= raw_get_allocated_file_size,
+ .bdrv_detach_aio_context = raw_detach_aio_context,
+ .bdrv_attach_aio_context = raw_attach_aio_context,
+
/* removable device support */
.bdrv_is_inserted = floppy_is_inserted,
.bdrv_media_changed = floppy_media_changed,
.bdrv_reopen_prepare = raw_reopen_prepare,
.bdrv_reopen_commit = raw_reopen_commit,
.bdrv_reopen_abort = raw_reopen_abort,
- .bdrv_create = hdev_create,
- .create_options = raw_create_options,
+ .bdrv_create = hdev_create,
+ .create_opts = &raw_create_opts,
.bdrv_aio_readv = raw_aio_readv,
.bdrv_aio_writev = raw_aio_writev,
.bdrv_aio_flush = raw_aio_flush,
.bdrv_refresh_limits = raw_refresh_limits,
+ .bdrv_io_plug = raw_aio_plug,
+ .bdrv_io_unplug = raw_aio_unplug,
+ .bdrv_flush_io_queue = raw_aio_flush_io_queue,
.bdrv_truncate = raw_truncate,
.bdrv_getlength = raw_getlength,
.bdrv_get_allocated_file_size
= raw_get_allocated_file_size,
+ .bdrv_detach_aio_context = raw_detach_aio_context,
+ .bdrv_attach_aio_context = raw_attach_aio_context,
+
/* removable device support */
.bdrv_is_inserted = cdrom_is_inserted,
.bdrv_eject = cdrom_eject,
.bdrv_reopen_commit = raw_reopen_commit,
.bdrv_reopen_abort = raw_reopen_abort,
.bdrv_create = hdev_create,
- .create_options = raw_create_options,
+ .create_opts = &raw_create_opts,
.bdrv_aio_readv = raw_aio_readv,
.bdrv_aio_writev = raw_aio_writev,
.bdrv_aio_flush = raw_aio_flush,
.bdrv_refresh_limits = raw_refresh_limits,
+ .bdrv_io_plug = raw_aio_plug,
+ .bdrv_io_unplug = raw_aio_unplug,
+ .bdrv_flush_io_queue = raw_aio_flush_io_queue,
.bdrv_truncate = raw_truncate,
.bdrv_getlength = raw_getlength,
.bdrv_get_allocated_file_size
= raw_get_allocated_file_size,
+ .bdrv_detach_aio_context = raw_detach_aio_context,
+ .bdrv_attach_aio_context = raw_attach_aio_context,
+
/* removable device support */
.bdrv_is_inserted = cdrom_is_inserted,
.bdrv_eject = cdrom_eject,
};
#endif /* __FreeBSD__ */
-#ifdef CONFIG_LINUX_AIO
-/**
- * Return the file descriptor for Linux AIO
- *
- * This function is a layering violation and should be removed when it becomes
- * possible to call the block layer outside the global mutex. It allows the
- * caller to hijack the file descriptor so I/O can be performed outside the
- * block layer.
- */
-int raw_get_aio_fd(BlockDriverState *bs)
-{
- BDRVRawState *s;
-
- if (!bs->drv) {
- return -ENOMEDIUM;
- }
-
- if (bs->drv == bdrv_find_format("raw")) {
- bs = bs->file;
- }
-
- /* raw-posix has several protocols so just check for raw_aio_readv */
- if (bs->drv->bdrv_aio_readv != raw_aio_readv) {
- return -ENOTSUP;
- }
-
- s = bs->opaque;
- if (!s->use_aio) {
- return -ENOTSUP;
- }
- return s->fd;
-}
-#endif /* CONFIG_LINUX_AIO */
-
static void bdrv_file_init(void)
{
/*