#include "qemu/module.h"
#include "qemu/option.h"
#include "qemu/units.h"
+#include "qemu/memalign.h"
#include "trace.h"
#include "block/thread-pool.h"
#include "qemu/iov.h"
#include "scsi/constants.h"
#if defined(__APPLE__) && (__MACH__)
+#include <sys/ioctl.h>
+#if defined(HAVE_HOST_BLOCK_DEVICE)
#include <paths.h>
#include <sys/param.h>
+#include <sys/mount.h>
#include <IOKit/IOKitLib.h>
#include <IOKit/IOBSD.h>
#include <IOKit/storage/IOMediaBSDClient.h>
//#include <IOKit/storage/IOCDTypes.h>
#include <IOKit/storage/IODVDMedia.h>
#include <CoreFoundation/CoreFoundation.h>
+#endif /* defined(HAVE_HOST_BLOCK_DEVICE) */
#endif
#ifdef __sun__
#include <sys/diskslice.h>
#endif
-#ifdef CONFIG_XFS
-#include <xfs/xfs.h>
-#endif
-
-#include "trace.h"
-
/* OS X does not have O_DSYNC */
#ifndef O_DSYNC
#ifdef O_SYNC
uint64_t locked_perm;
uint64_t locked_shared_perm;
+ uint64_t aio_max_batch;
+
int perm_change_fd;
int perm_change_flags;
BDRVReopenState *reopen_state;
-#ifdef CONFIG_XFS
- bool is_xfs:1;
-#endif
bool has_discard:1;
bool has_write_zeroes:1;
bool discard_zeroes:1;
int page_cache_inconsistent; /* errno from fdatasync failure */
bool has_fallocate;
bool needs_alignment;
+ bool force_alignment;
bool drop_cache;
bool check_cache_dropped;
struct {
bool check_cache_dropped;
} BDRVRawReopenState;
-static int fd_open(BlockDriverState *bs);
+static int fd_open(BlockDriverState *bs)
+{
+ BDRVRawState *s = bs->opaque;
+
+ /* this is just to ensure s->fd is sane (its called by io ops) */
+ if (s->fd >= 0) {
+ return 0;
+ }
+ return -EIO;
+}
+
static int64_t raw_getlength(BlockDriverState *bs);
typedef struct RawPosixAIOData {
return false;
}
+static bool raw_needs_alignment(BlockDriverState *bs)
+{
+ BDRVRawState *s = bs->opaque;
+
+ if ((bs->open_flags & BDRV_O_NOCACHE) != 0 && !dio_byte_aligned(s->fd)) {
+ return true;
+ }
+
+ return s->force_alignment;
+}
+
/* Check if read is allowed with given memory buffer and length.
*
* This function is used to check O_DIRECT memory buffer and request alignment.
if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) {
bs->bl.request_alignment = 0;
}
-#ifdef CONFIG_XFS
- if (s->is_xfs) {
- struct dioattr da;
- if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) {
- bs->bl.request_alignment = da.d_miniosz;
- /* The kernel returns wrong information for d_mem */
- /* s->buf_align = da.d_mem; */
- }
+
+#ifdef __linux__
+ /*
+ * The XFS ioctl definitions are shipped in extra packages that might
+ * not always be available. Since we just need the XFS_IOC_DIOINFO ioctl
+ * here, we simply use our own definition instead:
+ */
+ struct xfs_dioattr {
+ uint32_t d_mem;
+ uint32_t d_miniosz;
+ uint32_t d_maxiosz;
+ } da;
+ if (ioctl(fd, _IOR('X', 30, struct xfs_dioattr), &da) >= 0) {
+ bs->bl.request_alignment = da.d_miniosz;
+ /* The kernel returns wrong information for d_mem */
+ /* s->buf_align = da.d_mem; */
}
#endif
.type = QEMU_OPT_STRING,
.help = "host AIO implementation (threads, native, io_uring)",
},
+ {
+ .name = "aio-max-batch",
+ .type = QEMU_OPT_NUMBER,
+ .help = "AIO max batch size (0 = auto handled by AIO backend, default: 0)",
+ },
{
.name = "locking",
.type = QEMU_OPT_STRING,
s->use_linux_io_uring = (aio == BLOCKDEV_AIO_OPTIONS_IO_URING);
#endif
+ s->aio_max_batch = qemu_opt_get_number(opts, "aio-max-batch", 0);
+
locking = qapi_enum_parse(&OnOffAuto_lookup,
qemu_opt_get(opts, "locking"),
ON_OFF_AUTO_AUTO, &local_err);
s->has_discard = true;
s->has_write_zeroes = true;
- if ((bs->open_flags & BDRV_O_NOCACHE) != 0 && !dio_byte_aligned(s->fd)) {
- s->needs_alignment = true;
- }
if (fstat(s->fd, &st) < 0) {
ret = -errno;
* so QEMU makes sure all IO operations on the device are aligned
* to sector size, or else FreeBSD will reject them with EINVAL.
*/
- s->needs_alignment = true;
- }
-#endif
-
-#ifdef CONFIG_XFS
- if (platform_test_xfs_fd(s->fd)) {
- s->is_xfs = true;
+ s->force_alignment = true;
}
#endif
+ s->needs_alignment = raw_needs_alignment(bs);
bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
if (S_ISREG(st.st_mode)) {
s->reopen_state = NULL;
}
-static int sg_get_max_transfer_length(int fd)
+static int hdev_get_max_hw_transfer(int fd, struct stat *st)
{
#ifdef BLKSECTGET
- int max_bytes = 0;
-
- if (ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
- return max_bytes;
+ if (S_ISBLK(st->st_mode)) {
+ unsigned short max_sectors = 0;
+ if (ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
+ return max_sectors * 512;
+ }
} else {
- return -errno;
+ int max_bytes = 0;
+ if (ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
+ return max_bytes;
+ }
}
+ return -errno;
#else
return -ENOSYS;
#endif
}
-static int sg_get_max_segments(int fd)
+static int hdev_get_max_segments(int fd, struct stat *st)
{
#ifdef CONFIG_LINUX
char buf[32];
int ret;
int sysfd = -1;
long max_segments;
- struct stat st;
- if (fstat(fd, &st)) {
- ret = -errno;
- goto out;
+ if (S_ISCHR(st->st_mode)) {
+ if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
+ return ret;
+ }
+ return -ENOTSUP;
+ }
+
+ if (!S_ISBLK(st->st_mode)) {
+ return -ENOTSUP;
}
sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
- major(st.st_rdev), minor(st.st_rdev));
+ major(st->st_rdev), minor(st->st_rdev));
sysfd = open(sysfspath, O_RDONLY);
if (sysfd == -1) {
ret = -errno;
static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
{
BDRVRawState *s = bs->opaque;
+ struct stat st;
+
+ s->needs_alignment = raw_needs_alignment(bs);
+ raw_probe_alignment(bs, s->fd, errp);
+
+ bs->bl.min_mem_alignment = s->buf_align;
+ bs->bl.opt_mem_alignment = MAX(s->buf_align, qemu_real_host_page_size);
+
+ /*
+ * Maximum transfers are best effort, so it is okay to ignore any
+ * errors. That said, based on the man page errors in fstat would be
+ * very much unexpected; the only possible case seems to be ENOMEM.
+ */
+ if (fstat(s->fd, &st)) {
+ return;
+ }
- if (bs->sg) {
- int ret = sg_get_max_transfer_length(s->fd);
+#if defined(__APPLE__) && (__MACH__)
+ struct statfs buf;
+
+ if (!fstatfs(s->fd, &buf)) {
+ bs->bl.opt_transfer = buf.f_iosize;
+ bs->bl.pdiscard_alignment = buf.f_bsize;
+ }
+#endif
+
+ if (bs->sg || S_ISBLK(st.st_mode)) {
+ int ret = hdev_get_max_hw_transfer(s->fd, &st);
if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
- bs->bl.max_transfer = pow2floor(ret);
+ bs->bl.max_hw_transfer = ret;
}
- ret = sg_get_max_segments(s->fd);
+ ret = hdev_get_max_segments(s->fd, &st);
if (ret > 0) {
- bs->bl.max_transfer = MIN(bs->bl.max_transfer,
- ret * qemu_real_host_page_size);
+ bs->bl.max_hw_iov = ret;
}
}
-
- raw_probe_alignment(bs, s->fd, errp);
- bs->bl.min_mem_alignment = s->buf_align;
- bs->bl.opt_mem_alignment = MAX(s->buf_align, qemu_real_host_page_size);
}
static int check_for_dasd(int fd)
RawPosixAIOData *aiocb = opaque;
int ret;
- ret = ioctl(aiocb->aio_fildes, aiocb->ioctl.cmd, aiocb->ioctl.buf);
+ do {
+ ret = ioctl(aiocb->aio_fildes, aiocb->ioctl.cmd, aiocb->ioctl.buf);
+ } while (ret == -1 && errno == EINTR);
if (ret == -1) {
return -errno;
}
}
}
+#if defined(CONFIG_FALLOCATE) || defined(BLKZEROOUT) || defined(BLKDISCARD)
static int translate_err(int err)
{
if (err == -ENODEV || err == -ENOSYS || err == -EOPNOTSUPP ||
}
return err;
}
+#endif
#ifdef CONFIG_FALLOCATE
static int do_fallocate(int fd, int mode, off_t offset, off_t len)
*/
warn_report_once("Your file system is misbehaving: "
"fallocate(FALLOC_FL_PUNCH_HOLE) returned EINVAL. "
- "Please report this bug to your file sytem "
+ "Please report this bug to your file system "
"vendor.");
} else if (ret != -ENOTSUP) {
return ret;
static int handle_aiocb_discard(void *opaque)
{
RawPosixAIOData *aiocb = opaque;
- int ret = -EOPNOTSUPP;
+ int ret = -ENOTSUP;
BDRVRawState *s = aiocb->bs->opaque;
if (!s->has_discard) {
}
} while (errno == EINTR);
- ret = -errno;
+ ret = translate_err(-errno);
#endif
} else {
#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
aiocb->aio_offset, aiocb->aio_nbytes);
+ ret = translate_err(ret);
+#elif defined(__APPLE__) && (__MACH__)
+ fpunchhole_t fpunchhole;
+ fpunchhole.fp_flags = 0;
+ fpunchhole.reserved = 0;
+ fpunchhole.fp_offset = aiocb->aio_offset;
+ fpunchhole.fp_length = aiocb->aio_nbytes;
+ if (fcntl(s->fd, F_PUNCHHOLE, &fpunchhole) == -1) {
+ ret = errno == ENODEV ? -ENOTSUP : -errno;
+ } else {
+ ret = 0;
+ }
#endif
}
- ret = translate_err(ret);
if (ret == -ENOTSUP) {
s->has_discard = false;
}
} else if (s->use_linux_aio) {
LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
assert(qiov->size == bytes);
- return laio_co_submit(bs, aio, s->fd, offset, qiov, type);
+ return laio_co_submit(bs, aio, s->fd, offset, qiov, type,
+ s->aio_max_batch);
#endif
}
return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
}
-static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
- uint64_t bytes, QEMUIOVector *qiov,
- int flags)
+static int coroutine_fn raw_co_preadv(BlockDriverState *bs, int64_t offset,
+ int64_t bytes, QEMUIOVector *qiov,
+ BdrvRequestFlags flags)
{
return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ);
}
-static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
- uint64_t bytes, QEMUIOVector *qiov,
- int flags)
+static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset,
+ int64_t bytes, QEMUIOVector *qiov,
+ BdrvRequestFlags flags)
{
assert(flags == 0);
return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
#ifdef CONFIG_LINUX_AIO
if (s->use_linux_aio) {
LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
- laio_io_unplug(bs, aio);
+ laio_io_unplug(bs, aio, s->aio_max_batch);
}
#endif
#ifdef CONFIG_LINUX_IO_URING
again:
#endif
if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
+ size = 0;
#ifdef DIOCGMEDIASIZE
- if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size))
-#elif defined(DIOCGPART)
- {
- struct partinfo pi;
- if (ioctl(fd, DIOCGPART, &pi) == 0)
- size = pi.media_size;
- else
- size = 0;
+ if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size)) {
+ size = 0;
}
- if (size == 0)
#endif
-#if defined(__APPLE__) && defined(__MACH__)
- {
+#ifdef DIOCGPART
+ if (size == 0) {
+ struct partinfo pi;
+ if (ioctl(fd, DIOCGPART, &pi) == 0) {
+ size = pi.media_size;
+ }
+ }
+#endif
+#if defined(DKIOCGETBLOCKCOUNT) && defined(DKIOCGETBLOCKSIZE)
+ if (size == 0) {
uint64_t sectors = 0;
uint32_t sector_size = 0;
if (ioctl(fd, DKIOCGETBLOCKCOUNT, §ors) == 0
&& ioctl(fd, DKIOCGETBLOCKSIZE, §or_size) == 0) {
size = sectors * sector_size;
- } else {
- size = lseek(fd, 0LL, SEEK_END);
- if (size < 0) {
- return -errno;
- }
}
}
-#else
- size = lseek(fd, 0LL, SEEK_END);
+#endif
+ if (size == 0) {
+ size = lseek(fd, 0LL, SEEK_END);
+ }
if (size < 0) {
return -errno;
}
-#endif
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
switch(s->type) {
case FTYPE_CD:
* the specified offset) that are known to be in the same
* allocated/unallocated state.
*
- * 'bytes' is the max value 'pnum' should be set to.
+ * 'bytes' is a soft cap for 'pnum'. If the information is free, 'pnum' may
+ * well exceed it.
*/
static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
bool want_zero,
} else if (data == offset) {
/* On a data extent, compute bytes to the end of the extent,
* possibly including a partial sector at EOF. */
- *pnum = MIN(bytes, hole - offset);
+ *pnum = hole - offset;
/*
* We are not allowed to return partial sectors, though, so
} else {
/* On a hole, compute bytes to the beginning of the next extent. */
assert(hole == offset);
- *pnum = MIN(bytes, data - offset);
+ *pnum = data - offset;
ret = BDRV_BLOCK_ZERO;
}
*map = offset;
}
static coroutine_fn int
-raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int bytes, bool blkdev)
+raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
+ bool blkdev)
{
BDRVRawState *s = bs->opaque;
RawPosixAIOData acb;
}
static coroutine_fn int
-raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
+raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
{
return raw_do_pdiscard(bs, offset, bytes, false);
}
static int coroutine_fn
-raw_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int bytes,
+raw_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
BdrvRequestFlags flags, bool blkdev)
{
BDRVRawState *s = bs->opaque;
static int coroutine_fn raw_co_pwrite_zeroes(
BlockDriverState *bs, int64_t offset,
- int bytes, BdrvRequestFlags flags)
+ int64_t bytes, BdrvRequestFlags flags)
{
return raw_do_pwrite_zeroes(bs, offset, bytes, flags, false);
}
return stats;
}
+#if defined(HAVE_HOST_BLOCK_DEVICE)
static BlockStatsSpecific *hdev_get_specific_stats(BlockDriverState *bs)
{
BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
return stats;
}
+#endif /* HAVE_HOST_BLOCK_DEVICE */
static QemuOptsList raw_create_opts = {
.name = "raw-create-opts",
}
static int coroutine_fn raw_co_copy_range_from(
- BlockDriverState *bs, BdrvChild *src, uint64_t src_offset,
- BdrvChild *dst, uint64_t dst_offset, uint64_t bytes,
+ BlockDriverState *bs, BdrvChild *src, int64_t src_offset,
+ BdrvChild *dst, int64_t dst_offset, int64_t bytes,
BdrvRequestFlags read_flags, BdrvRequestFlags write_flags)
{
return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs,
BdrvChild *src,
- uint64_t src_offset,
+ int64_t src_offset,
BdrvChild *dst,
- uint64_t dst_offset,
- uint64_t bytes,
+ int64_t dst_offset,
+ int64_t bytes,
BdrvRequestFlags read_flags,
BdrvRequestFlags write_flags)
{
/***********************************************/
/* host device */
+#if defined(HAVE_HOST_BLOCK_DEVICE)
+
#if defined(__APPLE__) && defined(__MACH__)
static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
CFIndex maxPathSize, int flags);
}
#endif /* linux */
-static int fd_open(BlockDriverState *bs)
-{
- BDRVRawState *s = bs->opaque;
-
- /* this is just to ensure s->fd is sane (its called by io ops) */
- if (s->fd >= 0)
- return 0;
- return -EIO;
-}
-
static coroutine_fn int
-hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
+hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
{
BDRVRawState *s = bs->opaque;
int ret;
}
static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs,
- int64_t offset, int bytes, BdrvRequestFlags flags)
+ int64_t offset, int64_t bytes, BdrvRequestFlags flags)
{
int rc;
};
#endif /* __FreeBSD__ */
+#endif /* HAVE_HOST_BLOCK_DEVICE */
+
static void bdrv_file_init(void)
{
/*
* registered last will get probed first.
*/
bdrv_register(&bdrv_file);
+#if defined(HAVE_HOST_BLOCK_DEVICE)
bdrv_register(&bdrv_host_device);
#ifdef __linux__
bdrv_register(&bdrv_host_cdrom);
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
bdrv_register(&bdrv_host_cdrom);
#endif
+#endif /* HAVE_HOST_BLOCK_DEVICE */
}
block_init(bdrv_file_init);