uint64_t locked_perm;
uint64_t locked_shared_perm;
+ int perm_change_fd;
BDRVReopenState *reopen_state;
#ifdef CONFIG_XFS
bool page_cache_inconsistent:1;
bool has_fallocate;
bool needs_alignment;
+ bool drop_cache;
bool check_cache_dropped;
PRManager *pr_mgr;
typedef struct BDRVRawReopenState {
int fd;
int open_flags;
+ bool drop_cache;
bool check_cache_dropped;
} BDRVRawReopenState;
}
}
-static void raw_parse_flags(int bdrv_flags, int *open_flags)
+static void raw_parse_flags(int bdrv_flags, int *open_flags, bool has_writers)
{
+ bool read_write = false;
assert(open_flags != NULL);
*open_flags |= O_BINARY;
*open_flags &= ~O_ACCMODE;
- if (bdrv_flags & BDRV_O_RDWR) {
+
+ if (bdrv_flags & BDRV_O_AUTO_RDONLY) {
+ read_write = has_writers;
+ } else if (bdrv_flags & BDRV_O_RDWR) {
+ read_write = true;
+ }
+
+ if (read_write) {
*open_flags |= O_RDWR;
} else {
*open_flags |= O_RDONLY;
.type = QEMU_OPT_STRING,
.help = "id of persistent reservation manager object (default: none)",
},
+#if defined(__linux__)
+ {
+ .name = "drop-cache",
+ .type = QEMU_OPT_BOOL,
+ .help = "invalidate page cache during live migration (default: on)",
+ },
+#endif
{
.name = "x-check-cache-dropped",
.type = QEMU_OPT_BOOL,
},
};
+static const char *const mutable_opts[] = { "x-check-cache-dropped", NULL };
+
static int raw_open_common(BlockDriverState *bs, QDict *options,
int bdrv_flags, int open_flags,
bool device, Error **errp)
}
}
+ s->drop_cache = qemu_opt_get_bool(opts, "drop-cache", true);
s->check_cache_dropped = qemu_opt_get_bool(opts, "x-check-cache-dropped",
false);
s->open_flags = open_flags;
- raw_parse_flags(bdrv_flags, &s->open_flags);
+ raw_parse_flags(bdrv_flags, &s->open_flags, false);
s->fd = -1;
fd = qemu_open(filename, s->open_flags, 0644);
ret = fd < 0 ? -errno : 0;
- if (ret == -EACCES || ret == -EROFS) {
- /* Try to degrade to read-only, but if it doesn't work, still use the
- * normal error message. */
- if (bdrv_apply_auto_read_only(bs, NULL, NULL) == 0) {
- bdrv_flags &= ~BDRV_O_RDWR;
- raw_parse_flags(bdrv_flags, &s->open_flags);
- assert(!(s->open_flags & O_CREAT));
- fd = qemu_open(filename, s->open_flags);
- ret = fd < 0 ? -errno : 0;
- }
- }
-
if (ret < 0) {
error_setg_errno(errp, -ret, "Could not open '%s'", filename);
if (ret == -EROFS) {
}
#endif
- bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;
+ bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
ret = 0;
fail:
if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
switch (op) {
case RAW_PL_PREPARE:
+ if ((s->perm | new_perm) == s->perm &&
+ (s->shared_perm & new_shared) == s->shared_perm)
+ {
+ /*
+ * We are going to unlock bytes, it should not fail. If it fail due
+ * to some fs-dependent permission-unrelated reasons (which occurs
+ * sometimes on NFS and leads to abort in bdrv_replace_child) we
+ * can't prevent such errors by any check here. And we ignore them
+ * anyway in ABORT and COMMIT.
+ */
+ return 0;
+ }
ret = raw_apply_lock_bytes(s, s->fd, s->perm | new_perm,
~s->shared_perm | ~new_shared,
false, errp);
}
static int raw_reconfigure_getfd(BlockDriverState *bs, int flags,
- int *open_flags, Error **errp)
+ int *open_flags, uint64_t perm, bool force_dup,
+ Error **errp)
{
BDRVRawState *s = bs->opaque;
int fd = -1;
int ret;
+ bool has_writers = perm &
+ (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED | BLK_PERM_RESIZE);
int fcntl_flags = O_APPEND | O_NONBLOCK;
#ifdef O_NOATIME
fcntl_flags |= O_NOATIME;
*open_flags |= O_NONBLOCK;
}
- raw_parse_flags(flags, open_flags);
+ raw_parse_flags(flags, open_flags, has_writers);
#ifdef O_ASYNC
/* Not all operating systems have O_ASYNC, and those that don't
assert((s->open_flags & O_ASYNC) == 0);
#endif
+ if (!force_dup && *open_flags == s->open_flags) {
+ /* We're lucky, the existing fd is fine */
+ return s->fd;
+ }
+
if ((*open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
/* dup the original fd */
fd = qemu_dup(s->fd);
goto out;
}
+ rs->drop_cache = qemu_opt_get_bool_del(opts, "drop-cache", true);
rs->check_cache_dropped =
qemu_opt_get_bool_del(opts, "x-check-cache-dropped", false);
qemu_opts_to_qdict(opts, state->options);
rs->fd = raw_reconfigure_getfd(state->bs, state->flags, &rs->open_flags,
- &local_err);
+ state->perm, true, &local_err);
if (local_err) {
error_propagate(errp, local_err);
ret = -1;
ret = -EINVAL;
goto out_fd;
}
-
- /* Copy locks to the new fd */
- ret = raw_apply_lock_bytes(NULL, rs->fd, s->locked_perm,
- s->locked_shared_perm, false, errp);
- if (ret < 0) {
- ret = -EINVAL;
- goto out_fd;
- }
}
s->reopen_state = state;
BDRVRawReopenState *rs = state->opaque;
BDRVRawState *s = state->bs->opaque;
+ s->drop_cache = rs->drop_cache;
s->check_cache_dropped = rs->check_cache_dropped;
s->open_flags = rs->open_flags;
#ifdef CONFIG_XFS
static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes)
{
+ int64_t len;
struct xfs_flock64 fl;
int err;
+ len = lseek(s->fd, 0, SEEK_END);
+ if (len < 0) {
+ return -errno;
+ }
+
+ if (offset + bytes > len) {
+ /* XFS_IOC_ZERO_RANGE does not increase the file length */
+ if (ftruncate(s->fd, offset + bytes) < 0) {
+ return -errno;
+ }
+ }
+
memset(&fl, 0, sizeof(fl));
fl.l_whence = SEEK_SET;
fl.l_start = offset;
}
#ifdef BLKZEROOUT
- do {
- uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
- if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
- return 0;
- }
- } while (errno == EINTR);
+ /* The BLKZEROOUT implementation in the kernel doesn't set
+ * BLKDEV_ZERO_NOFALLBACK, so we can't call this if we have to avoid slow
+ * fallbacks. */
+ if (!(aiocb->aio_type & QEMU_AIO_NO_FALLBACK)) {
+ do {
+ uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
+ if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
+ return 0;
+ }
+ } while (errno == EINTR);
- ret = translate_err(-errno);
+ ret = translate_err(-errno);
+ }
#endif
if (ret == -ENOTSUP) {
off_t data = 0, hole = 0;
int ret;
+ assert(QEMU_IS_ALIGNED(offset | bytes, bs->bl.request_alignment));
+
ret = fd_open(bs);
if (ret < 0) {
return ret;
/* On a data extent, compute bytes to the end of the extent,
* possibly including a partial sector at EOF. */
*pnum = MIN(bytes, hole - offset);
+
+ /*
+ * We are not allowed to return partial sectors, though, so
+ * round up if necessary.
+ */
+ if (!QEMU_IS_ALIGNED(*pnum, bs->bl.request_alignment)) {
+ int64_t file_length = raw_getlength(bs);
+ if (file_length > 0) {
+ /* Ignore errors, this is just a safeguard */
+ assert(hole == file_length);
+ }
+ *pnum = ROUND_UP(*pnum, bs->bl.request_alignment);
+ }
+
ret = BDRV_BLOCK_DATA;
} else {
/* On a hole, compute bytes to the beginning of the next extent. */
return;
}
+ if (!s->drop_cache) {
+ return;
+ }
+
if (s->open_flags & O_DIRECT) {
return; /* No host kernel page cache */
}
if (blkdev) {
acb.aio_type |= QEMU_AIO_BLKDEV;
}
+ if (flags & BDRV_REQ_NO_FALLBACK) {
+ acb.aio_type |= QEMU_AIO_NO_FALLBACK;
+ }
if (flags & BDRV_REQ_MAY_UNMAP) {
acb.aio_type |= QEMU_AIO_DISCARD;
static int raw_check_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared,
Error **errp)
{
- return raw_handle_perm_lock(bs, RAW_PL_PREPARE, perm, shared, errp);
+ BDRVRawState *s = bs->opaque;
+ BDRVRawReopenState *rs = NULL;
+ int open_flags;
+ int ret;
+
+ if (s->perm_change_fd) {
+ /*
+ * In the context of reopen, this function may be called several times
+ * (directly and recursively while change permissions of the parent).
+ * This is even true for children that don't inherit from the original
+ * reopen node, so s->reopen_state is not set.
+ *
+ * Ignore all but the first call.
+ */
+ return 0;
+ }
+
+ if (s->reopen_state) {
+ /* We already have a new file descriptor to set permissions for */
+ assert(s->reopen_state->perm == perm);
+ assert(s->reopen_state->shared_perm == shared);
+ rs = s->reopen_state->opaque;
+ s->perm_change_fd = rs->fd;
+ } else {
+ /* We may need a new fd if auto-read-only switches the mode */
+ ret = raw_reconfigure_getfd(bs, bs->open_flags, &open_flags, perm,
+ false, errp);
+ if (ret < 0) {
+ return ret;
+ } else if (ret != s->fd) {
+ s->perm_change_fd = ret;
+ }
+ }
+
+ /* Prepare permissions on old fd to avoid conflicts between old and new,
+ * but keep everything locked that new will need. */
+ ret = raw_handle_perm_lock(bs, RAW_PL_PREPARE, perm, shared, errp);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* Copy locks to the new fd */
+ if (s->perm_change_fd) {
+ ret = raw_apply_lock_bytes(NULL, s->perm_change_fd, perm, ~shared,
+ false, errp);
+ if (ret < 0) {
+ raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
+ goto fail;
+ }
+ }
+ return 0;
+
+fail:
+ if (s->perm_change_fd && !s->reopen_state) {
+ qemu_close(s->perm_change_fd);
+ }
+ s->perm_change_fd = 0;
+ return ret;
}
static void raw_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared)
{
BDRVRawState *s = bs->opaque;
+
+ /* For reopen, we have already switched to the new fd (.bdrv_set_perm is
+ * called after .bdrv_reopen_commit) */
+ if (s->perm_change_fd && s->fd != s->perm_change_fd) {
+ qemu_close(s->fd);
+ s->fd = s->perm_change_fd;
+ }
+ s->perm_change_fd = 0;
+
raw_handle_perm_lock(bs, RAW_PL_COMMIT, perm, shared, NULL);
s->perm = perm;
s->shared_perm = shared;
static void raw_abort_perm_update(BlockDriverState *bs)
{
+ BDRVRawState *s = bs->opaque;
+
+ /* For reopen, .bdrv_reopen_abort is called afterwards and will close
+ * the file descriptor. */
+ if (s->perm_change_fd && !s->reopen_state) {
+ qemu_close(s->perm_change_fd);
+ }
+ s->perm_change_fd = 0;
+
raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
}
.bdrv_set_perm = raw_set_perm,
.bdrv_abort_perm_update = raw_abort_perm_update,
.create_opts = &raw_create_opts,
+ .mutable_opts = mutable_opts,
};
/***********************************************/
.bdrv_reopen_abort = raw_reopen_abort,
.bdrv_co_create_opts = hdev_co_create_opts,
.create_opts = &raw_create_opts,
+ .mutable_opts = mutable_opts,
.bdrv_co_invalidate_cache = raw_co_invalidate_cache,
.bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
.bdrv_reopen_abort = raw_reopen_abort,
.bdrv_co_create_opts = hdev_co_create_opts,
.create_opts = &raw_create_opts,
+ .mutable_opts = mutable_opts,
.bdrv_co_invalidate_cache = raw_co_invalidate_cache,
.bdrv_reopen_abort = raw_reopen_abort,
.bdrv_co_create_opts = hdev_co_create_opts,
.create_opts = &raw_create_opts,
+ .mutable_opts = mutable_opts,
.bdrv_co_preadv = raw_co_preadv,
.bdrv_co_pwritev = raw_co_pwritev,