X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=block%2Fcopy-before-write.c;h=4ffabc5ca239ebec3b4531721119d7f2d29d5e23;hb=e582b629f0b50c10137ba47c4ca7fe30b3357e3d;hp=4858dcf8ff53d4a5e3fd7ee254b342ff5bc5a6a3;hpb=5a507426740b4d7bdd2b333cfa7f5da3e60ebad8;p=mirror_qemu.git diff --git a/block/copy-before-write.c b/block/copy-before-write.c index 4858dcf8ff..4ffabc5ca2 100644 --- a/block/copy-before-write.c +++ b/block/copy-before-write.c @@ -24,6 +24,7 @@ */ #include "qemu/osdep.h" +#include "qapi/qmp/qjson.h" #include "sysemu/block-backend.h" #include "qemu/cutils.h" @@ -31,25 +32,80 @@ #include "block/block_int.h" #include "block/qdict.h" #include "block/block-copy.h" +#include "block/dirty-bitmap.h" #include "block/copy-before-write.h" +#include "block/reqlist.h" + +#include "qapi/qapi-visit-block-core.h" typedef struct BDRVCopyBeforeWriteState { BlockCopyState *bcs; BdrvChild *target; + OnCbwError on_cbw_error; + uint32_t cbw_timeout_ns; + + /* + * @lock: protects access to @access_bitmap, @done_bitmap and + * @frozen_read_reqs + */ + CoMutex lock; + + /* + * @access_bitmap: represents areas allowed for reading by fleecing user. + * Reading from non-dirty areas leads to -EACCES. + */ + BdrvDirtyBitmap *access_bitmap; + + /* + * @done_bitmap: represents areas that was successfully copied to @target by + * copy-before-write operations. + */ + BdrvDirtyBitmap *done_bitmap; + + /* + * @frozen_read_reqs: current read requests for fleecing user in bs->file + * node. These areas must not be rewritten by guest. + */ + BlockReqList frozen_read_reqs; + + /* + * @snapshot_error is normally zero. But on first copy-before-write failure + * when @on_cbw_error == ON_CBW_ERROR_BREAK_SNAPSHOT, @snapshot_error takes + * value of this error (<0). After that all in-flight and further + * snapshot-API requests will fail with that error. + */ + int snapshot_error; } BDRVCopyBeforeWriteState; -static coroutine_fn int cbw_co_preadv( - BlockDriverState *bs, uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +static int coroutine_fn GRAPH_RDLOCK +cbw_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags) { return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags); } +static void block_copy_cb(void *opaque) +{ + BlockDriverState *bs = opaque; + + bdrv_dec_in_flight(bs); +} + +/* + * Do copy-before-write operation. + * + * On failure guest request must be failed too. + * + * On success, we also wait for all in-flight fleecing read requests in source + * node, and it's guaranteed that after cbw_do_copy_before_write() successful + * return there are no such requests and they will never appear. + */ static coroutine_fn int cbw_do_copy_before_write(BlockDriverState *bs, uint64_t offset, uint64_t bytes, BdrvRequestFlags flags) { BDRVCopyBeforeWriteState *s = bs->opaque; + int ret; uint64_t off, end; int64_t cluster_size = block_copy_cluster_size(s->bcs); @@ -57,14 +113,44 @@ static coroutine_fn int cbw_do_copy_before_write(BlockDriverState *bs, return 0; } + if (s->snapshot_error) { + return 0; + } + off = QEMU_ALIGN_DOWN(offset, cluster_size); end = QEMU_ALIGN_UP(offset + bytes, cluster_size); - return block_copy(s->bcs, off, end - off, true); + /* + * Increase in_flight, so that in case of timed-out block-copy, the + * remaining background block_copy() request (which can't be immediately + * cancelled by timeout) is presented in bs->in_flight. This way we are + * sure that on bs close() we'll previously wait for all timed-out but yet + * running block_copy calls. + */ + bdrv_inc_in_flight(bs); + ret = block_copy(s->bcs, off, end - off, true, s->cbw_timeout_ns, + block_copy_cb, bs); + if (ret < 0 && s->on_cbw_error == ON_CBW_ERROR_BREAK_GUEST_WRITE) { + return ret; + } + + WITH_QEMU_LOCK_GUARD(&s->lock) { + if (ret < 0) { + assert(s->on_cbw_error == ON_CBW_ERROR_BREAK_SNAPSHOT); + if (!s->snapshot_error) { + s->snapshot_error = ret; + } + } else { + bdrv_set_dirty_bitmap(s->done_bitmap, off, end - off); + } + reqlist_wait_all(&s->frozen_read_reqs, off, end - off, &s->lock); + } + + return 0; } -static int coroutine_fn cbw_co_pdiscard(BlockDriverState *bs, - int64_t offset, int bytes) +static int coroutine_fn GRAPH_RDLOCK +cbw_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes) { int ret = cbw_do_copy_before_write(bs, offset, bytes, 0); if (ret < 0) { @@ -74,8 +160,9 @@ static int coroutine_fn cbw_co_pdiscard(BlockDriverState *bs, return bdrv_co_pdiscard(bs->file, offset, bytes); } -static int coroutine_fn cbw_co_pwrite_zeroes(BlockDriverState *bs, - int64_t offset, int bytes, BdrvRequestFlags flags) +static int coroutine_fn GRAPH_RDLOCK +cbw_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes, + BdrvRequestFlags flags) { int ret = cbw_do_copy_before_write(bs, offset, bytes, flags); if (ret < 0) { @@ -85,10 +172,9 @@ static int coroutine_fn cbw_co_pwrite_zeroes(BlockDriverState *bs, return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags); } -static coroutine_fn int cbw_co_pwritev(BlockDriverState *bs, - uint64_t offset, - uint64_t bytes, - QEMUIOVector *qiov, int flags) +static coroutine_fn GRAPH_RDLOCK +int cbw_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags) { int ret = cbw_do_copy_before_write(bs, offset, bytes, flags); if (ret < 0) { @@ -98,7 +184,7 @@ static coroutine_fn int cbw_co_pwritev(BlockDriverState *bs, return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); } -static int coroutine_fn cbw_co_flush(BlockDriverState *bs) +static int coroutine_fn GRAPH_RDLOCK cbw_co_flush(BlockDriverState *bs) { if (!bs->file) { return 0; @@ -107,17 +193,159 @@ static int coroutine_fn cbw_co_flush(BlockDriverState *bs) return bdrv_co_flush(bs->file->bs); } +/* + * If @offset not accessible - return NULL. + * + * Otherwise, set @pnum to some bytes that accessible from @file (@file is set + * to bs->file or to s->target). Return newly allocated BlockReq object that + * should be than passed to cbw_snapshot_read_unlock(). + * + * It's guaranteed that guest writes will not interact in the region until + * cbw_snapshot_read_unlock() called. + */ +static coroutine_fn BlockReq * +cbw_snapshot_read_lock(BlockDriverState *bs, int64_t offset, int64_t bytes, + int64_t *pnum, BdrvChild **file) +{ + BDRVCopyBeforeWriteState *s = bs->opaque; + BlockReq *req = g_new(BlockReq, 1); + bool done; + + QEMU_LOCK_GUARD(&s->lock); + + if (s->snapshot_error) { + g_free(req); + return NULL; + } + + if (bdrv_dirty_bitmap_next_zero(s->access_bitmap, offset, bytes) != -1) { + g_free(req); + return NULL; + } + + done = bdrv_dirty_bitmap_status(s->done_bitmap, offset, bytes, pnum); + if (done) { + /* + * Special invalid BlockReq, that is handled in + * cbw_snapshot_read_unlock(). We don't need to lock something to read + * from s->target. + */ + *req = (BlockReq) {.offset = -1, .bytes = -1}; + *file = s->target; + } else { + reqlist_init_req(&s->frozen_read_reqs, req, offset, bytes); + *file = bs->file; + } + + return req; +} + +static coroutine_fn void +cbw_snapshot_read_unlock(BlockDriverState *bs, BlockReq *req) +{ + BDRVCopyBeforeWriteState *s = bs->opaque; + + if (req->offset == -1 && req->bytes == -1) { + g_free(req); + return; + } + + QEMU_LOCK_GUARD(&s->lock); + + reqlist_remove_req(req); + g_free(req); +} + +static int coroutine_fn GRAPH_RDLOCK +cbw_co_preadv_snapshot(BlockDriverState *bs, int64_t offset, int64_t bytes, + QEMUIOVector *qiov, size_t qiov_offset) +{ + BlockReq *req; + BdrvChild *file; + int ret; + + /* TODO: upgrade to async loop using AioTask */ + while (bytes) { + int64_t cur_bytes; + + req = cbw_snapshot_read_lock(bs, offset, bytes, &cur_bytes, &file); + if (!req) { + return -EACCES; + } + + ret = bdrv_co_preadv_part(file, offset, cur_bytes, + qiov, qiov_offset, 0); + cbw_snapshot_read_unlock(bs, req); + if (ret < 0) { + return ret; + } + + bytes -= cur_bytes; + offset += cur_bytes; + qiov_offset += cur_bytes; + } + + return 0; +} + +static int coroutine_fn GRAPH_RDLOCK +cbw_co_snapshot_block_status(BlockDriverState *bs, + bool want_zero, int64_t offset, int64_t bytes, + int64_t *pnum, int64_t *map, + BlockDriverState **file) +{ + BDRVCopyBeforeWriteState *s = bs->opaque; + BlockReq *req; + int ret; + int64_t cur_bytes; + BdrvChild *child; + + req = cbw_snapshot_read_lock(bs, offset, bytes, &cur_bytes, &child); + if (!req) { + return -EACCES; + } + + ret = bdrv_co_block_status(child->bs, offset, cur_bytes, pnum, map, file); + if (child == s->target) { + /* + * We refer to s->target only for areas that we've written to it. + * And we can not report unallocated blocks in s->target: this will + * break generic block-status-above logic, that will go to + * copy-before-write filtered child in this case. + */ + assert(ret & BDRV_BLOCK_ALLOCATED); + } + + cbw_snapshot_read_unlock(bs, req); + + return ret; +} + +static int coroutine_fn GRAPH_RDLOCK +cbw_co_pdiscard_snapshot(BlockDriverState *bs, int64_t offset, int64_t bytes) +{ + BDRVCopyBeforeWriteState *s = bs->opaque; + + WITH_QEMU_LOCK_GUARD(&s->lock) { + bdrv_reset_dirty_bitmap(s->access_bitmap, offset, bytes); + } + + block_copy_reset(s->bcs, offset, bytes); + + return bdrv_co_pdiscard(s->target, offset, bytes); +} + static void cbw_refresh_filename(BlockDriverState *bs) { pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->file->bs->filename); } -static void cbw_child_perm(BlockDriverState *bs, BdrvChild *c, - BdrvChildRole role, - BlockReopenQueue *reopen_queue, - uint64_t perm, uint64_t shared, - uint64_t *nperm, uint64_t *nshared) +static void GRAPH_RDLOCK +cbw_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role, + BlockReopenQueue *reopen_queue, + uint64_t perm, uint64_t shared, + uint64_t *nperm, uint64_t *nshared) { if (!(role & BDRV_CHILD_FILTERED)) { /* @@ -144,28 +372,83 @@ static void cbw_child_perm(BlockDriverState *bs, BdrvChild *c, } } -static int cbw_init(BlockDriverState *bs, BlockDriverState *source, - BlockDriverState *target, bool compress, Error **errp) +static BlockdevOptions *cbw_parse_options(QDict *options, Error **errp) +{ + BlockdevOptions *opts = NULL; + Visitor *v = NULL; + + qdict_put_str(options, "driver", "copy-before-write"); + + v = qobject_input_visitor_new_flat_confused(options, errp); + if (!v) { + goto out; + } + + visit_type_BlockdevOptions(v, NULL, &opts, errp); + if (!opts) { + goto out; + } + + /* + * Delete options which we are going to parse through BlockdevOptions + * object for original options. + */ + qdict_extract_subqdict(options, NULL, "bitmap"); + qdict_del(options, "on-cbw-error"); + qdict_del(options, "cbw-timeout"); + +out: + visit_free(v); + qdict_del(options, "driver"); + + return opts; +} + +static int cbw_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVCopyBeforeWriteState *s = bs->opaque; + BdrvDirtyBitmap *bitmap = NULL; + int64_t cluster_size; + g_autoptr(BlockdevOptions) full_opts = NULL; + BlockdevOptionsCbw *opts; + AioContext *ctx; + int ret; - bdrv_ref(target); - s->target = bdrv_attach_child(bs, target, "target", &child_of_bds, - BDRV_CHILD_DATA, errp); - if (!s->target) { - error_prepend(errp, "Cannot attach target child: "); + full_opts = cbw_parse_options(options, errp); + if (!full_opts) { return -EINVAL; } + assert(full_opts->driver == BLOCKDEV_DRIVER_COPY_BEFORE_WRITE); + opts = &full_opts->u.copy_before_write; - bdrv_ref(source); - bs->file = bdrv_attach_child(bs, source, "file", &child_of_bds, - BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, - errp); - if (!bs->file) { - error_prepend(errp, "Cannot attach file child: "); + ret = bdrv_open_file_child(NULL, options, "file", bs, errp); + if (ret < 0) { + return ret; + } + + s->target = bdrv_open_child(NULL, options, "target", bs, &child_of_bds, + BDRV_CHILD_DATA, false, errp); + if (!s->target) { return -EINVAL; } + ctx = bdrv_get_aio_context(bs); + aio_context_acquire(ctx); + + if (opts->bitmap) { + bitmap = block_dirty_bitmap_lookup(opts->bitmap->node, + opts->bitmap->name, NULL, errp); + if (!bitmap) { + ret = -EINVAL; + goto out; + } + } + s->on_cbw_error = opts->has_on_cbw_error ? opts->on_cbw_error : + ON_CBW_ERROR_BREAK_GUEST_WRITE; + s->cbw_timeout_ns = opts->has_cbw_timeout ? + opts->cbw_timeout * NANOSECONDS_PER_SECOND : 0; + bs->total_sectors = bs->file->bs->total_sectors; bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED | (BDRV_REQ_FUA & bs->file->bs->supported_write_flags); @@ -173,25 +456,70 @@ static int cbw_init(BlockDriverState *bs, BlockDriverState *source, ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & bs->file->bs->supported_zero_flags); - s->bcs = block_copy_state_new(bs->file, s->target, false, compress, errp); + s->bcs = block_copy_state_new(bs->file, s->target, bitmap, errp); if (!s->bcs) { error_prepend(errp, "Cannot create block-copy-state: "); - return -EINVAL; + ret = -EINVAL; + goto out; } - return 0; + cluster_size = block_copy_cluster_size(s->bcs); + + s->done_bitmap = bdrv_create_dirty_bitmap(bs, cluster_size, NULL, errp); + if (!s->done_bitmap) { + ret = -EINVAL; + goto out; + } + bdrv_disable_dirty_bitmap(s->done_bitmap); + + /* s->access_bitmap starts equal to bcs bitmap */ + s->access_bitmap = bdrv_create_dirty_bitmap(bs, cluster_size, NULL, errp); + if (!s->access_bitmap) { + ret = -EINVAL; + goto out; + } + bdrv_disable_dirty_bitmap(s->access_bitmap); + bdrv_dirty_bitmap_merge_internal(s->access_bitmap, + block_copy_dirty_bitmap(s->bcs), NULL, + true); + + qemu_co_mutex_init(&s->lock); + QLIST_INIT(&s->frozen_read_reqs); + + ret = 0; +out: + aio_context_release(ctx); + return ret; +} + +static void cbw_close(BlockDriverState *bs) +{ + BDRVCopyBeforeWriteState *s = bs->opaque; + + bdrv_release_dirty_bitmap(s->access_bitmap); + bdrv_release_dirty_bitmap(s->done_bitmap); + + block_copy_state_free(s->bcs); + s->bcs = NULL; } -BlockDriver bdrv_cbw_filter = { +static BlockDriver bdrv_cbw_filter = { .format_name = "copy-before-write", .instance_size = sizeof(BDRVCopyBeforeWriteState), + .bdrv_open = cbw_open, + .bdrv_close = cbw_close, + .bdrv_co_preadv = cbw_co_preadv, .bdrv_co_pwritev = cbw_co_pwritev, .bdrv_co_pwrite_zeroes = cbw_co_pwrite_zeroes, .bdrv_co_pdiscard = cbw_co_pdiscard, .bdrv_co_flush = cbw_co_flush, + .bdrv_co_preadv_snapshot = cbw_co_preadv_snapshot, + .bdrv_co_pdiscard_snapshot = cbw_co_pdiscard_snapshot, + .bdrv_co_snapshot_block_status = cbw_co_snapshot_block_status, + .bdrv_refresh_filename = cbw_refresh_filename, .bdrv_child_perm = cbw_child_perm, @@ -202,55 +530,45 @@ BlockDriver bdrv_cbw_filter = { BlockDriverState *bdrv_cbw_append(BlockDriverState *source, BlockDriverState *target, const char *filter_node_name, - bool compress, BlockCopyState **bcs, Error **errp) { - ERRP_GUARD(); - int ret; BDRVCopyBeforeWriteState *state; BlockDriverState *top; + QDict *opts; assert(source->total_sectors == target->total_sectors); + GLOBAL_STATE_CODE(); - top = bdrv_new_open_driver(&bdrv_cbw_filter, filter_node_name, - BDRV_O_RDWR, errp); - if (!top) { - error_prepend(errp, "Cannot open driver: "); - return NULL; + opts = qdict_new(); + qdict_put_str(opts, "driver", "copy-before-write"); + if (filter_node_name) { + qdict_put_str(opts, "node-name", filter_node_name); } - state = top->opaque; + qdict_put_str(opts, "file", bdrv_get_node_name(source)); + qdict_put_str(opts, "target", bdrv_get_node_name(target)); - ret = cbw_init(top, source, target, compress, errp); - if (ret < 0) { - goto fail; - } - - bdrv_drained_begin(source); - ret = bdrv_replace_node(source, top, errp); - bdrv_drained_end(source); - if (ret < 0) { - error_prepend(errp, "Cannot append copy-before-write filter: "); - goto fail; + top = bdrv_insert_node(source, opts, BDRV_O_RDWR, errp); + if (!top) { + return NULL; } + state = top->opaque; *bcs = state->bcs; return top; - -fail: - block_copy_state_free(state->bcs); - bdrv_unref(top); - return NULL; } void bdrv_cbw_drop(BlockDriverState *bs) { - BDRVCopyBeforeWriteState *s = bs->opaque; - + GLOBAL_STATE_CODE(); bdrv_drop_filter(bs, &error_abort); - - block_copy_state_free(s->bcs); - bdrv_unref(bs); } + +static void cbw_init(void) +{ + bdrv_register(&bdrv_cbw_filter); +} + +block_init(cbw_init);