X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=block.c;h=854ebd60d448a2a2b18ca21045a772b160512007;hb=d7331bed11f5e65b3b640aab59ab22bc61a4e77d;hp=7547051ec2abd8cf84e495d097a83358bc9cc49e;hpb=272d28ecf2a2158c4bd89b42e5f45eb9e6201fde;p=mirror_qemu.git diff --git a/block.c b/block.c index 7547051ec2..854ebd60d4 100644 --- a/block.c +++ b/block.c @@ -26,8 +26,11 @@ #include "trace.h" #include "monitor.h" #include "block_int.h" +#include "blockjob.h" #include "module.h" #include "qjson.h" +#include "sysemu.h" +#include "notify.h" #include "qemu-coroutine.h" #include "qmp-commands.h" #include "qemu-timer.h" @@ -310,9 +313,16 @@ BlockDriverState *bdrv_new(const char *device_name) QTAILQ_INSERT_TAIL(&bdrv_states, bs, list); } bdrv_iostatus_disable(bs); + notifier_list_init(&bs->close_notifiers); + return bs; } +void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify) +{ + notifier_list_add(&bs->close_notifiers, notify); +} + BlockDriver *bdrv_find_format(const char *format_name) { BlockDriver *drv1; @@ -377,7 +387,8 @@ int bdrv_create(BlockDriver *drv, const char* filename, }; if (!drv->bdrv_create) { - return -ENOTSUP; + ret = -ENOTSUP; + goto out; } if (qemu_in_coroutine()) { @@ -392,8 +403,9 @@ int bdrv_create(BlockDriver *drv, const char* filename, } ret = cco.ret; - g_free(cco.filename); +out: + g_free(cco.filename); return ret; } @@ -433,7 +445,11 @@ int get_tmp_filename(char *filename, int size) return -EOVERFLOW; } fd = mkstemp(filename); - if (fd < 0 || close(fd)) { + if (fd < 0) { + return -errno; + } + if (close(fd) != 0) { + unlink(filename); return -errno; } return 0; @@ -649,12 +665,13 @@ static int bdrv_open_common(BlockDriverState *bs, const char *filename, bs->opaque = g_malloc0(drv->instance_size); bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB); + open_flags = flags | BDRV_O_CACHE_WB; /* * Clear flags that are internal to the block layer before opening the * image. */ - open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING); + open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING); /* * Snapshots should be writable. @@ -663,7 +680,7 @@ static int bdrv_open_common(BlockDriverState *bs, const char *filename, open_flags |= BDRV_O_RDWR; } - bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR); + bs->read_only = !(open_flags & BDRV_O_RDWR); /* Open the image, either directly or using a protocol */ if (drv->bdrv_file_open) { @@ -727,6 +744,42 @@ int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags) return 0; } +int bdrv_open_backing_file(BlockDriverState *bs) +{ + char backing_filename[PATH_MAX]; + int back_flags, ret; + BlockDriver *back_drv = NULL; + + if (bs->backing_hd != NULL) { + return 0; + } + + bs->open_flags &= ~BDRV_O_NO_BACKING; + if (bs->backing_file[0] == '\0') { + return 0; + } + + bs->backing_hd = bdrv_new(""); + bdrv_get_full_backing_filename(bs, backing_filename, + sizeof(backing_filename)); + + if (bs->backing_format[0] != '\0') { + back_drv = bdrv_find_format(bs->backing_format); + } + + /* backing files always opened read-only */ + back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT); + + ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv); + if (ret < 0) { + bdrv_delete(bs->backing_hd); + bs->backing_hd = NULL; + bs->open_flags |= BDRV_O_NO_BACKING; + return ret; + } + return 0; +} + /* * Opens a disk image (raw, qcow2, vmdk, ...) 
*/ @@ -803,6 +856,10 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags, goto unlink_and_fail; } + if (flags & BDRV_O_RDWR) { + flags |= BDRV_O_ALLOW_RDWR; + } + /* Open the image */ ret = bdrv_open_common(bs, filename, flags, drv); if (ret < 0) { @@ -810,34 +867,12 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags, } /* If there is a backing file, use it */ - if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') { - char backing_filename[PATH_MAX]; - int back_flags; - BlockDriver *back_drv = NULL; - - bs->backing_hd = bdrv_new(""); - bdrv_get_full_backing_filename(bs, backing_filename, - sizeof(backing_filename)); - - if (bs->backing_format[0] != '\0') { - back_drv = bdrv_find_format(bs->backing_format); - } - - /* backing files always opened read-only */ - back_flags = - flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING); - - ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv); + if ((flags & BDRV_O_NO_BACKING) == 0) { + ret = bdrv_open_backing_file(bs); if (ret < 0) { bdrv_close(bs); return ret; } - if (bs->is_temporary) { - bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR); - } else { - /* base image inherits from "parent" */ - bs->backing_hd->keep_read_only = bs->keep_read_only; - } } if (!bdrv_key_required(bs)) { @@ -858,15 +893,248 @@ unlink_and_fail: return ret; } +typedef struct BlockReopenQueueEntry { + bool prepared; + BDRVReopenState state; + QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry; +} BlockReopenQueueEntry; + +/* + * Adds a BlockDriverState to a simple queue for an atomic, transactional + * reopen of multiple devices. + * + * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT + * already performed, or alternatively may be NULL a new BlockReopenQueue will + * be created and initialized. This newly created BlockReopenQueue should be + * passed back in for subsequent calls that are intended to be of the same + * atomic 'set'. + * + * bs is the BlockDriverState to add to the reopen queue. + * + * flags contains the open flags for the associated bs + * + * returns a pointer to bs_queue, which is either the newly allocated + * bs_queue, or the existing bs_queue being used. + * + */ +BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, + BlockDriverState *bs, int flags) +{ + assert(bs != NULL); + + BlockReopenQueueEntry *bs_entry; + if (bs_queue == NULL) { + bs_queue = g_new0(BlockReopenQueue, 1); + QSIMPLEQ_INIT(bs_queue); + } + + if (bs->file) { + bdrv_reopen_queue(bs_queue, bs->file, flags); + } + + bs_entry = g_new0(BlockReopenQueueEntry, 1); + QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry); + + bs_entry->state.bs = bs; + bs_entry->state.flags = flags; + + return bs_queue; +} + +/* + * Reopen multiple BlockDriverStates atomically & transactionally. + * + * The queue passed in (bs_queue) must have been built up previous + * via bdrv_reopen_queue(). + * + * Reopens all BDS specified in the queue, with the appropriate + * flags. All devices are prepared for reopen, and failure of any + * device will cause all device changes to be abandonded, and intermediate + * data cleaned up. + * + * If all devices prepare successfully, then the changes are committed + * to all devices. 
+ * + */ +int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp) +{ + int ret = -1; + BlockReopenQueueEntry *bs_entry, *next; + Error *local_err = NULL; + + assert(bs_queue != NULL); + + bdrv_drain_all(); + + QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) { + if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) { + error_propagate(errp, local_err); + goto cleanup; + } + bs_entry->prepared = true; + } + + /* If we reach this point, we have success and just need to apply the + * changes + */ + QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) { + bdrv_reopen_commit(&bs_entry->state); + } + + ret = 0; + +cleanup: + QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) { + if (ret && bs_entry->prepared) { + bdrv_reopen_abort(&bs_entry->state); + } + g_free(bs_entry); + } + g_free(bs_queue); + return ret; +} + + +/* Reopen a single BlockDriverState with the specified flags. */ +int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp) +{ + int ret = -1; + Error *local_err = NULL; + BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags); + + ret = bdrv_reopen_multiple(queue, &local_err); + if (local_err != NULL) { + error_propagate(errp, local_err); + } + return ret; +} + + +/* + * Prepares a BlockDriverState for reopen. All changes are staged in the + * 'opaque' field of the BDRVReopenState, which is used and allocated by + * the block driver layer .bdrv_reopen_prepare() + * + * bs is the BlockDriverState to reopen + * flags are the new open flags + * queue is the reopen queue + * + * Returns 0 on success, non-zero on error. On error errp will be set + * as well. + * + * On failure, bdrv_reopen_abort() will be called to clean up any data. + * It is the responsibility of the caller to then call the abort() or + * commit() for any other BDS that have been left in a prepare() state + * + */ +int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue, + Error **errp) +{ + int ret = -1; + Error *local_err = NULL; + BlockDriver *drv; + + assert(reopen_state != NULL); + assert(reopen_state->bs->drv != NULL); + drv = reopen_state->bs->drv; + + /* if we are to stay read-only, do not allow permission change + * to r/w */ + if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) && + reopen_state->flags & BDRV_O_RDWR) { + error_set(errp, QERR_DEVICE_IS_READ_ONLY, + reopen_state->bs->device_name); + goto error; + } + + + ret = bdrv_flush(reopen_state->bs); + if (ret) { + error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive", + strerror(-ret)); + goto error; + } + + if (drv->bdrv_reopen_prepare) { + ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err); + if (ret) { + if (local_err != NULL) { + error_propagate(errp, local_err); + } else { + error_set(errp, QERR_OPEN_FILE_FAILED, + reopen_state->bs->filename); + } + goto error; + } + } else { + /* It is currently mandatory to have a bdrv_reopen_prepare() + * handler for each supported drv. */ + error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, + drv->format_name, reopen_state->bs->device_name, + "reopening of file"); + ret = -1; + goto error; + } + + ret = 0; + +error: + return ret; +} + +/* + * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and + * makes them final by swapping the staging BlockDriverState contents into + * the active BlockDriverState contents. 
+ */ +void bdrv_reopen_commit(BDRVReopenState *reopen_state) +{ + BlockDriver *drv; + + assert(reopen_state != NULL); + drv = reopen_state->bs->drv; + assert(drv != NULL); + + /* If there are any driver level actions to take */ + if (drv->bdrv_reopen_commit) { + drv->bdrv_reopen_commit(reopen_state); + } + + /* set BDS specific flags now */ + reopen_state->bs->open_flags = reopen_state->flags; + reopen_state->bs->enable_write_cache = !!(reopen_state->flags & + BDRV_O_CACHE_WB); + reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR); +} + +/* + * Abort the reopen, and delete and free the staged changes in + * reopen_state + */ +void bdrv_reopen_abort(BDRVReopenState *reopen_state) +{ + BlockDriver *drv; + + assert(reopen_state != NULL); + drv = reopen_state->bs->drv; + assert(drv != NULL); + + if (drv->bdrv_reopen_abort) { + drv->bdrv_reopen_abort(reopen_state); + } +} + + void bdrv_close(BlockDriverState *bs) { bdrv_flush(bs); - if (bs->drv) { - if (bs->job) { - block_job_cancel_sync(bs->job); - } - bdrv_drain_all(); + if (bs->job) { + block_job_cancel_sync(bs->job); + } + bdrv_drain_all(); + notifier_list_notify(&bs->close_notifiers, bs); + if (bs->drv) { if (bs == bs_snapshots) { bs_snapshots = NULL; } @@ -896,10 +1164,10 @@ void bdrv_close(BlockDriverState *bs) bdrv_delete(bs->file); bs->file = NULL; } - - bdrv_dev_change_media_cb(bs, false); } + bdrv_dev_change_media_cb(bs, false); + /*throttling disk I/O limits*/ if (bs->io_limits_enabled) { bdrv_io_limits_disable(bs); @@ -970,98 +1238,124 @@ static void bdrv_rebind(BlockDriverState *bs) } } +static void bdrv_move_feature_fields(BlockDriverState *bs_dest, + BlockDriverState *bs_src) +{ + /* move some fields that need to stay attached to the device */ + bs_dest->open_flags = bs_src->open_flags; + + /* dev info */ + bs_dest->dev_ops = bs_src->dev_ops; + bs_dest->dev_opaque = bs_src->dev_opaque; + bs_dest->dev = bs_src->dev; + bs_dest->buffer_alignment = bs_src->buffer_alignment; + bs_dest->copy_on_read = bs_src->copy_on_read; + + bs_dest->enable_write_cache = bs_src->enable_write_cache; + + /* i/o timing parameters */ + bs_dest->slice_time = bs_src->slice_time; + bs_dest->slice_start = bs_src->slice_start; + bs_dest->slice_end = bs_src->slice_end; + bs_dest->io_limits = bs_src->io_limits; + bs_dest->io_base = bs_src->io_base; + bs_dest->throttled_reqs = bs_src->throttled_reqs; + bs_dest->block_timer = bs_src->block_timer; + bs_dest->io_limits_enabled = bs_src->io_limits_enabled; + + /* r/w error */ + bs_dest->on_read_error = bs_src->on_read_error; + bs_dest->on_write_error = bs_src->on_write_error; + + /* i/o status */ + bs_dest->iostatus_enabled = bs_src->iostatus_enabled; + bs_dest->iostatus = bs_src->iostatus; + + /* dirty bitmap */ + bs_dest->dirty_count = bs_src->dirty_count; + bs_dest->dirty_bitmap = bs_src->dirty_bitmap; + + /* job */ + bs_dest->in_use = bs_src->in_use; + bs_dest->job = bs_src->job; + + /* keep the same entry in bdrv_states */ + pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name), + bs_src->device_name); + bs_dest->list = bs_src->list; +} + /* - * Add new bs contents at the top of an image chain while the chain is - * live, while keeping required fields on the top layer. + * Swap bs contents for two image chains while they are live, + * while keeping required fields on the BlockDriverState that is + * actually attached to a device. * * This will modify the BlockDriverState fields, and swap contents - * between bs_new and bs_top. Both bs_new and bs_top are modified. 
+ * between bs_new and bs_old. Both bs_new and bs_old are modified. * * bs_new is required to be anonymous. * * This function does not create any image files. */ -void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top) +void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old) { BlockDriverState tmp; - /* bs_new must be anonymous */ + /* bs_new must be anonymous and shouldn't have anything fancy enabled */ assert(bs_new->device_name[0] == '\0'); + assert(bs_new->dirty_bitmap == NULL); + assert(bs_new->job == NULL); + assert(bs_new->dev == NULL); + assert(bs_new->in_use == 0); + assert(bs_new->io_limits_enabled == false); + assert(bs_new->block_timer == NULL); tmp = *bs_new; + *bs_new = *bs_old; + *bs_old = tmp; - /* there are some fields that need to stay on the top layer: */ - tmp.open_flags = bs_top->open_flags; + /* there are some fields that should not be swapped, move them back */ + bdrv_move_feature_fields(&tmp, bs_old); + bdrv_move_feature_fields(bs_old, bs_new); + bdrv_move_feature_fields(bs_new, &tmp); - /* dev info */ - tmp.dev_ops = bs_top->dev_ops; - tmp.dev_opaque = bs_top->dev_opaque; - tmp.dev = bs_top->dev; - tmp.buffer_alignment = bs_top->buffer_alignment; - tmp.copy_on_read = bs_top->copy_on_read; + /* bs_new shouldn't be in bdrv_states even after the swap! */ + assert(bs_new->device_name[0] == '\0'); - /* i/o timing parameters */ - tmp.slice_time = bs_top->slice_time; - tmp.slice_start = bs_top->slice_start; - tmp.slice_end = bs_top->slice_end; - tmp.io_limits = bs_top->io_limits; - tmp.io_base = bs_top->io_base; - tmp.throttled_reqs = bs_top->throttled_reqs; - tmp.block_timer = bs_top->block_timer; - tmp.io_limits_enabled = bs_top->io_limits_enabled; - - /* geometry */ - tmp.cyls = bs_top->cyls; - tmp.heads = bs_top->heads; - tmp.secs = bs_top->secs; - tmp.translation = bs_top->translation; + /* Check a few fields that should remain attached to the device */ + assert(bs_new->dev == NULL); + assert(bs_new->job == NULL); + assert(bs_new->in_use == 0); + assert(bs_new->io_limits_enabled == false); + assert(bs_new->block_timer == NULL); - /* r/w error */ - tmp.on_read_error = bs_top->on_read_error; - tmp.on_write_error = bs_top->on_write_error; - - /* i/o status */ - tmp.iostatus_enabled = bs_top->iostatus_enabled; - tmp.iostatus = bs_top->iostatus; + bdrv_rebind(bs_new); + bdrv_rebind(bs_old); +} - /* keep the same entry in bdrv_states */ - pstrcpy(tmp.device_name, sizeof(tmp.device_name), bs_top->device_name); - tmp.list = bs_top->list; +/* + * Add new bs contents at the top of an image chain while the chain is + * live, while keeping required fields on the top layer. + * + * This will modify the BlockDriverState fields, and swap contents + * between bs_new and bs_top. Both bs_new and bs_top are modified. + * + * bs_new is required to be anonymous. + * + * This function does not create any image files. + */ +void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top) +{ + bdrv_swap(bs_new, bs_top); /* The contents of 'tmp' will become bs_top, as we are * swapping bs_new and bs_top contents. */ - tmp.backing_hd = bs_new; - pstrcpy(tmp.backing_file, sizeof(tmp.backing_file), bs_top->filename); - bdrv_get_format(bs_top, tmp.backing_format, sizeof(tmp.backing_format)); - - /* swap contents of the fixed new bs and the current top */ - *bs_new = *bs_top; - *bs_top = tmp; - - /* device_name[] was carried over from the old bs_top. 
bs_new - * shouldn't be in bdrv_states, so we need to make device_name[] - * reflect the anonymity of bs_new - */ - bs_new->device_name[0] = '\0'; - - /* clear the copied fields in the new backing file */ - bdrv_detach_dev(bs_new, bs_new->dev); - - qemu_co_queue_init(&bs_new->throttled_reqs); - memset(&bs_new->io_base, 0, sizeof(bs_new->io_base)); - memset(&bs_new->io_limits, 0, sizeof(bs_new->io_limits)); - bdrv_iostatus_disable(bs_new); - - /* we don't use bdrv_io_limits_disable() for this, because we don't want - * to affect or delete the block_timer, as it has been moved to bs_top */ - bs_new->io_limits_enabled = false; - bs_new->block_timer = NULL; - bs_new->slice_time = 0; - bs_new->slice_start = 0; - bs_new->slice_end = 0; - - bdrv_rebind(bs_new); - bdrv_rebind(bs_top); + bs_top->backing_hd = bs_new; + bs_top->open_flags &= ~BDRV_O_NO_BACKING; + pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file), + bs_new->filename); + pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format), + bs_new->drv ? bs_new->drv->format_name : ""); } void bdrv_delete(BlockDriverState *bs) @@ -1125,7 +1419,8 @@ void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops, } void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv, - BlockQMPEventAction action, int is_read) + enum MonitorEvent ev, + BlockErrorAction action, bool is_read) { QObject *data; const char *action_str; @@ -1148,7 +1443,7 @@ void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv, bdrv->device_name, action_str, is_read ? "read" : "write"); - monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data); + monitor_protocol_event(ev, data); qobject_decref(data); } @@ -1222,14 +1517,14 @@ bool bdrv_dev_is_medium_locked(BlockDriverState *bs) * free of errors) or -errno when an internal error occurred. The results of the * check are stored in res. */ -int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res) +int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix) { if (bs->drv->bdrv_check == NULL) { return -ENOTSUP; } memset(res, 0, sizeof(*res)); - return bs->drv->bdrv_check(bs, res); + return bs->drv->bdrv_check(bs, res, fix); } #define COMMIT_BUF_SECTORS 2048 @@ -1238,13 +1533,11 @@ int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res) int bdrv_commit(BlockDriverState *bs) { BlockDriver *drv = bs->drv; - BlockDriver *backing_drv; int64_t sector, total_sectors; int n, ro, open_flags; - int ret = 0, rw_ret = 0; + int ret = 0; uint8_t *buf; - char filename[1024]; - BlockDriverState *bs_rw, *bs_ro; + char filename[PATH_MAX]; if (!drv) return -ENOMEDIUM; @@ -1253,42 +1546,19 @@ int bdrv_commit(BlockDriverState *bs) return -ENOTSUP; } - if (bs->backing_hd->keep_read_only) { - return -EACCES; - } - if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) { return -EBUSY; } - backing_drv = bs->backing_hd->drv; ro = bs->backing_hd->read_only; - strncpy(filename, bs->backing_hd->filename, sizeof(filename)); + /* Use pstrcpy (not strncpy): filename must be NUL-terminated. 
*/ + pstrcpy(filename, sizeof(filename), bs->backing_hd->filename); open_flags = bs->backing_hd->open_flags; if (ro) { - /* re-open as RW */ - bdrv_delete(bs->backing_hd); - bs->backing_hd = NULL; - bs_rw = bdrv_new(""); - rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR, - backing_drv); - if (rw_ret < 0) { - bdrv_delete(bs_rw); - /* try to re-open read-only */ - bs_ro = bdrv_new(""); - ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR, - backing_drv); - if (ret < 0) { - bdrv_delete(bs_ro); - /* drive not functional anymore */ - bs->drv = NULL; - return ret; - } - bs->backing_hd = bs_ro; - return rw_ret; + if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) { + return -EACCES; } - bs->backing_hd = bs_rw; } total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS; @@ -1325,20 +1595,8 @@ ro_cleanup: g_free(buf); if (ro) { - /* re-open as RO */ - bdrv_delete(bs->backing_hd); - bs->backing_hd = NULL; - bs_ro = bdrv_new(""); - ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR, - backing_drv); - if (ret < 0) { - bdrv_delete(bs_ro); - /* drive not functional anymore */ - bs->drv = NULL; - return ret; - } - bs->backing_hd = bs_ro; - bs->backing_hd->keep_read_only = 0; + /* ignoring error return here */ + bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL); } return ret; @@ -1494,13 +1752,156 @@ int bdrv_change_backing_file(BlockDriverState *bs, ret = -ENOTSUP; } - if (ret == 0) { - pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: ""); - pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: ""); + if (ret == 0) { + pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: ""); + pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: ""); + } + return ret; +} + +/* + * Finds the image layer in the chain that has 'bs' as its backing file. + * + * active is the current topmost image. + * + * Returns NULL if bs is not found in active's image chain, + * or if active == bs. + */ +BlockDriverState *bdrv_find_overlay(BlockDriverState *active, + BlockDriverState *bs) +{ + BlockDriverState *overlay = NULL; + BlockDriverState *intermediate; + + assert(active != NULL); + assert(bs != NULL); + + /* if bs is the same as active, then by definition it has no overlay + */ + if (active == bs) { + return NULL; + } + + intermediate = active; + while (intermediate->backing_hd) { + if (intermediate->backing_hd == bs) { + overlay = intermediate; + break; + } + intermediate = intermediate->backing_hd; + } + + return overlay; +} + +typedef struct BlkIntermediateStates { + BlockDriverState *bs; + QSIMPLEQ_ENTRY(BlkIntermediateStates) entry; +} BlkIntermediateStates; + + +/* + * Drops images above 'base' up to and including 'top', and sets the image + * above 'top' to have base as its backing file. + * + * Requires that the overlay to 'top' is opened r/w, so that the backing file + * information in 'bs' can be properly updated. 
+ * + * E.g., this will convert the following chain: + * bottom <- base <- intermediate <- top <- active + * + * to + * + * bottom <- base <- active + * + * It is allowed for bottom==base, in which case it converts: + * + * base <- intermediate <- top <- active + * + * to + * + * base <- active + * + * Error conditions: + * if active == top, that is considered an error + * + */ +int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top, + BlockDriverState *base) +{ + BlockDriverState *intermediate; + BlockDriverState *base_bs = NULL; + BlockDriverState *new_top_bs = NULL; + BlkIntermediateStates *intermediate_state, *next; + int ret = -EIO; + + QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete; + QSIMPLEQ_INIT(&states_to_delete); + + if (!top->drv || !base->drv) { + goto exit; + } + + new_top_bs = bdrv_find_overlay(active, top); + + if (new_top_bs == NULL) { + /* we could not find the image above 'top', this is an error */ + goto exit; + } + + /* special case of new_top_bs->backing_hd already pointing to base - nothing + * to do, no intermediate images */ + if (new_top_bs->backing_hd == base) { + ret = 0; + goto exit; + } + + intermediate = top; + + /* now we will go down through the list, and add each BDS we find + * into our deletion queue, until we hit the 'base' + */ + while (intermediate) { + intermediate_state = g_malloc0(sizeof(BlkIntermediateStates)); + intermediate_state->bs = intermediate; + QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry); + + if (intermediate->backing_hd == base) { + base_bs = intermediate->backing_hd; + break; + } + intermediate = intermediate->backing_hd; + } + if (base_bs == NULL) { + /* something went wrong, we did not end at the base. safely + * unravel everything, and exit with error */ + goto exit; + } + + /* success - we can delete the intermediate states, and link top->base */ + ret = bdrv_change_backing_file(new_top_bs, base_bs->filename, + base_bs->drv ? 
base_bs->drv->format_name : ""); + if (ret) { + goto exit; + } + new_top_bs->backing_hd = base_bs; + + + QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) { + /* so that bdrv_close() does not recursively close the chain */ + intermediate_state->bs->backing_hd = NULL; + bdrv_delete(intermediate_state->bs); + } + ret = 0; + +exit: + QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) { + g_free(intermediate_state); } return ret; } + static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, size_t size) { @@ -1606,6 +2007,20 @@ int bdrv_read(BlockDriverState *bs, int64_t sector_num, return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false); } +/* Just like bdrv_read(), but with I/O throttling temporarily disabled */ +int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + bool enabled; + int ret; + + enabled = bs->io_limits_enabled; + bs->io_limits_enabled = false; + ret = bdrv_read(bs, 0, buf, 1); + bs->io_limits_enabled = enabled; + return ret; +} + #define BITS_PER_LONG (sizeof(unsigned long) * 8) static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num, @@ -1758,8 +2173,8 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, return ret; } - /* No flush needed for cache modes that use O_DSYNC */ - if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) { + /* No flush needed for cache modes that already do it */ + if (bs->enable_write_cache) { bdrv_flush(bs); } @@ -1808,6 +2223,9 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, cluster_nb_sectors); } else { + /* This does not change the data on the disk, it is not necessary + * to flush even in cache=writethrough mode. + */ ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, &bounce_qiov); } @@ -1821,8 +2239,8 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, } skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; - qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes, - nb_sectors * BDRV_SECTOR_SIZE); + qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, + nb_sectors * BDRV_SECTOR_SIZE); err: qemu_vfree(bounce_buffer); @@ -1977,8 +2395,12 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); } + if (ret == 0 && !bs->enable_write_cache) { + ret = bdrv_co_flush(bs); + } + if (bs->dirty_bitmap) { - set_dirty_bitmap(bs, sector_num, nb_sectors, 1); + bdrv_set_dirty(bs, sector_num, nb_sectors); } if (bs->wr_highest_sector < sector_num + nb_sectors - 1) { @@ -2078,152 +2500,6 @@ void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr) *nb_sectors_ptr = length; } -struct partition { - uint8_t boot_ind; /* 0x80 - active */ - uint8_t head; /* starting head */ - uint8_t sector; /* starting sector */ - uint8_t cyl; /* starting cylinder */ - uint8_t sys_ind; /* What partition type */ - uint8_t end_head; /* end head */ - uint8_t end_sector; /* end sector */ - uint8_t end_cyl; /* end cylinder */ - uint32_t start_sect; /* starting sector counting from 0 */ - uint32_t nr_sects; /* nr of sectors in partition */ -} QEMU_PACKED; - -/* try to guess the disk logical geometry from the MSDOS partition table. 
Return 0 if OK, -1 if could not guess */ -static int guess_disk_lchs(BlockDriverState *bs, - int *pcylinders, int *pheads, int *psectors) -{ - uint8_t buf[BDRV_SECTOR_SIZE]; - int ret, i, heads, sectors, cylinders; - struct partition *p; - uint32_t nr_sects; - uint64_t nb_sectors; - bool enabled; - - bdrv_get_geometry(bs, &nb_sectors); - - /** - * The function will be invoked during startup not only in sync I/O mode, - * but also in async I/O mode. So the I/O throttling function has to - * be disabled temporarily here, not permanently. - */ - enabled = bs->io_limits_enabled; - bs->io_limits_enabled = false; - ret = bdrv_read(bs, 0, buf, 1); - bs->io_limits_enabled = enabled; - if (ret < 0) - return -1; - /* test msdos magic */ - if (buf[510] != 0x55 || buf[511] != 0xaa) - return -1; - for(i = 0; i < 4; i++) { - p = ((struct partition *)(buf + 0x1be)) + i; - nr_sects = le32_to_cpu(p->nr_sects); - if (nr_sects && p->end_head) { - /* We make the assumption that the partition terminates on - a cylinder boundary */ - heads = p->end_head + 1; - sectors = p->end_sector & 63; - if (sectors == 0) - continue; - cylinders = nb_sectors / (heads * sectors); - if (cylinders < 1 || cylinders > 16383) - continue; - *pheads = heads; - *psectors = sectors; - *pcylinders = cylinders; -#if 0 - printf("guessed geometry: LCHS=%d %d %d\n", - cylinders, heads, sectors); -#endif - return 0; - } - } - return -1; -} - -void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs) -{ - int translation, lba_detected = 0; - int cylinders, heads, secs; - uint64_t nb_sectors; - - /* if a geometry hint is available, use it */ - bdrv_get_geometry(bs, &nb_sectors); - bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs); - translation = bdrv_get_translation_hint(bs); - if (cylinders != 0) { - *pcyls = cylinders; - *pheads = heads; - *psecs = secs; - } else { - if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) { - if (heads > 16) { - /* if heads > 16, it means that a BIOS LBA - translation was active, so the default - hardware geometry is OK */ - lba_detected = 1; - goto default_geometry; - } else { - *pcyls = cylinders; - *pheads = heads; - *psecs = secs; - /* disable any translation to be in sync with - the logical geometry */ - if (translation == BIOS_ATA_TRANSLATION_AUTO) { - bdrv_set_translation_hint(bs, - BIOS_ATA_TRANSLATION_NONE); - } - } - } else { - default_geometry: - /* if no geometry, use a standard physical disk geometry */ - cylinders = nb_sectors / (16 * 63); - - if (cylinders > 16383) - cylinders = 16383; - else if (cylinders < 2) - cylinders = 2; - *pcyls = cylinders; - *pheads = 16; - *psecs = 63; - if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) { - if ((*pcyls * *pheads) <= 131072) { - bdrv_set_translation_hint(bs, - BIOS_ATA_TRANSLATION_LARGE); - } else { - bdrv_set_translation_hint(bs, - BIOS_ATA_TRANSLATION_LBA); - } - } - } - bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs); - } -} - -void bdrv_set_geometry_hint(BlockDriverState *bs, - int cyls, int heads, int secs) -{ - bs->cyls = cyls; - bs->heads = heads; - bs->secs = secs; -} - -void bdrv_set_translation_hint(BlockDriverState *bs, int translation) -{ - bs->translation = translation; -} - -void bdrv_get_geometry_hint(BlockDriverState *bs, - int *pcyls, int *pheads, int *psecs) -{ - *pcyls = bs->cyls; - *pheads = bs->heads; - *psecs = bs->secs; -} - /* throttling disk io limits */ void bdrv_set_io_limits(BlockDriverState *bs, BlockIOLimit *io_limits) @@ -2232,128 +2508,49 @@ void 
bdrv_set_io_limits(BlockDriverState *bs, bs->io_limits_enabled = bdrv_io_limits_enabled(bs); } -/* Recognize floppy formats */ -typedef struct FDFormat { - FDriveType drive; - uint8_t last_sect; - uint8_t max_track; - uint8_t max_head; - FDriveRate rate; -} FDFormat; - -static const FDFormat fd_formats[] = { - /* First entry is default format */ - /* 1.44 MB 3"1/2 floppy disks */ - { FDRIVE_DRV_144, 18, 80, 1, FDRIVE_RATE_500K, }, - { FDRIVE_DRV_144, 20, 80, 1, FDRIVE_RATE_500K, }, - { FDRIVE_DRV_144, 21, 80, 1, FDRIVE_RATE_500K, }, - { FDRIVE_DRV_144, 21, 82, 1, FDRIVE_RATE_500K, }, - { FDRIVE_DRV_144, 21, 83, 1, FDRIVE_RATE_500K, }, - { FDRIVE_DRV_144, 22, 80, 1, FDRIVE_RATE_500K, }, - { FDRIVE_DRV_144, 23, 80, 1, FDRIVE_RATE_500K, }, - { FDRIVE_DRV_144, 24, 80, 1, FDRIVE_RATE_500K, }, - /* 2.88 MB 3"1/2 floppy disks */ - { FDRIVE_DRV_288, 36, 80, 1, FDRIVE_RATE_1M, }, - { FDRIVE_DRV_288, 39, 80, 1, FDRIVE_RATE_1M, }, - { FDRIVE_DRV_288, 40, 80, 1, FDRIVE_RATE_1M, }, - { FDRIVE_DRV_288, 44, 80, 1, FDRIVE_RATE_1M, }, - { FDRIVE_DRV_288, 48, 80, 1, FDRIVE_RATE_1M, }, - /* 720 kB 3"1/2 floppy disks */ - { FDRIVE_DRV_144, 9, 80, 1, FDRIVE_RATE_250K, }, - { FDRIVE_DRV_144, 10, 80, 1, FDRIVE_RATE_250K, }, - { FDRIVE_DRV_144, 10, 82, 1, FDRIVE_RATE_250K, }, - { FDRIVE_DRV_144, 10, 83, 1, FDRIVE_RATE_250K, }, - { FDRIVE_DRV_144, 13, 80, 1, FDRIVE_RATE_250K, }, - { FDRIVE_DRV_144, 14, 80, 1, FDRIVE_RATE_250K, }, - /* 1.2 MB 5"1/4 floppy disks */ - { FDRIVE_DRV_120, 15, 80, 1, FDRIVE_RATE_500K, }, - { FDRIVE_DRV_120, 18, 80, 1, FDRIVE_RATE_500K, }, - { FDRIVE_DRV_120, 18, 82, 1, FDRIVE_RATE_500K, }, - { FDRIVE_DRV_120, 18, 83, 1, FDRIVE_RATE_500K, }, - { FDRIVE_DRV_120, 20, 80, 1, FDRIVE_RATE_500K, }, - /* 720 kB 5"1/4 floppy disks */ - { FDRIVE_DRV_120, 9, 80, 1, FDRIVE_RATE_250K, }, - { FDRIVE_DRV_120, 11, 80, 1, FDRIVE_RATE_250K, }, - /* 360 kB 5"1/4 floppy disks */ - { FDRIVE_DRV_120, 9, 40, 1, FDRIVE_RATE_300K, }, - { FDRIVE_DRV_120, 9, 40, 0, FDRIVE_RATE_300K, }, - { FDRIVE_DRV_120, 10, 41, 1, FDRIVE_RATE_300K, }, - { FDRIVE_DRV_120, 10, 42, 1, FDRIVE_RATE_300K, }, - /* 320 kB 5"1/4 floppy disks */ - { FDRIVE_DRV_120, 8, 40, 1, FDRIVE_RATE_250K, }, - { FDRIVE_DRV_120, 8, 40, 0, FDRIVE_RATE_250K, }, - /* 360 kB must match 5"1/4 better than 3"1/2... 
*/ - { FDRIVE_DRV_144, 9, 80, 0, FDRIVE_RATE_250K, }, - /* end */ - { FDRIVE_DRV_NONE, -1, -1, 0, 0, }, -}; - -void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads, - int *max_track, int *last_sect, - FDriveType drive_in, FDriveType *drive, - FDriveRate *rate) +void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error, + BlockdevOnError on_write_error) { - const FDFormat *parse; - uint64_t nb_sectors, size; - int i, first_match, match; - - bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect); - if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) { - /* User defined disk */ - *rate = FDRIVE_RATE_500K; - } else { - bdrv_get_geometry(bs, &nb_sectors); - match = -1; - first_match = -1; - for (i = 0; ; i++) { - parse = &fd_formats[i]; - if (parse->drive == FDRIVE_DRV_NONE) { - break; - } - if (drive_in == parse->drive || - drive_in == FDRIVE_DRV_NONE) { - size = (parse->max_head + 1) * parse->max_track * - parse->last_sect; - if (nb_sectors == size) { - match = i; - break; - } - if (first_match == -1) { - first_match = i; - } - } - } - if (match == -1) { - if (first_match == -1) { - match = 1; - } else { - match = first_match; - } - parse = &fd_formats[match]; - } - *nb_heads = parse->max_head + 1; - *max_track = parse->max_track; - *last_sect = parse->last_sect; - *drive = parse->drive; - *rate = parse->rate; - } + bs->on_read_error = on_read_error; + bs->on_write_error = on_write_error; } -int bdrv_get_translation_hint(BlockDriverState *bs) +BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read) { - return bs->translation; + return is_read ? bs->on_read_error : bs->on_write_error; } -void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error, - BlockErrorAction on_write_error) +BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error) { - bs->on_read_error = on_read_error; - bs->on_write_error = on_write_error; + BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error; + + switch (on_err) { + case BLOCKDEV_ON_ERROR_ENOSPC: + return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT; + case BLOCKDEV_ON_ERROR_STOP: + return BDRV_ACTION_STOP; + case BLOCKDEV_ON_ERROR_REPORT: + return BDRV_ACTION_REPORT; + case BLOCKDEV_ON_ERROR_IGNORE: + return BDRV_ACTION_IGNORE; + default: + abort(); + } } -BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read) +/* This is done by device models because, while the block layer knows + * about the error, it does not know whether an operation comes from + * the device or the block layer (from a job, for example). + */ +void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action, + bool is_read, int error) { - return is_read ? 
bs->on_read_error : bs->on_write_error; + assert(error >= 0); + bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read); + if (action == BDRV_ACTION_STOP) { + vm_stop(RUN_STATE_IO_ERROR); + bdrv_iostatus_set_err(bs, error); + } } int bdrv_is_read_only(BlockDriverState *bs) @@ -2371,6 +2568,18 @@ int bdrv_enable_write_cache(BlockDriverState *bs) return bs->enable_write_cache; } +void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce) +{ + bs->enable_write_cache = wce; + + /* so a reopen() will preserve wce */ + if (wce) { + bs->open_flags |= BDRV_O_CACHE_WB; + } else { + bs->open_flags &= ~BDRV_O_CACHE_WB; + } +} + int bdrv_is_encrypted(BlockDriverState *bs) { if (bs->backing_hd && bs->backing_hd->encrypted) @@ -2413,13 +2622,9 @@ int bdrv_set_key(BlockDriverState *bs, const char *key) return ret; } -void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size) +const char *bdrv_get_format_name(BlockDriverState *bs) { - if (!bs->drv) { - buf[0] = '\0'; - } else { - pstrcpy(buf, buf_size, bs->drv->format_name); - } + return bs->drv ? bs->drv->format_name : NULL; } void bdrv_iterate_format(void (*it)(void *opaque, const char *name), @@ -2466,6 +2671,11 @@ const char *bdrv_get_device_name(BlockDriverState *bs) return bs->device_name; } +int bdrv_get_flags(BlockDriverState *bs) +{ + return bs->open_flags; +} + void bdrv_flush_all(void) { BlockDriverState *bs; @@ -2569,72 +2779,131 @@ int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, return data.ret; } -BlockInfoList *qmp_query_block(Error **errp) +/* + * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] + * + * Return true if the given sector is allocated in any image between + * BASE and TOP (inclusive). BASE can be NULL to check if the given + * sector is allocated in any image of the chain. Return false otherwise. + * + * 'pnum' is set to the number of sectors (including and immediately following + * the specified sector) that are known to be in the same + * allocated/unallocated state. + * + */ +int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top, + BlockDriverState *base, + int64_t sector_num, + int nb_sectors, int *pnum) { - BlockInfoList *head = NULL, *cur_item = NULL; - BlockDriverState *bs; - - QTAILQ_FOREACH(bs, &bdrv_states, list) { - BlockInfoList *info = g_malloc0(sizeof(*info)); + BlockDriverState *intermediate; + int ret, n = nb_sectors; - info->value = g_malloc0(sizeof(*info->value)); - info->value->device = g_strdup(bs->device_name); - info->value->type = g_strdup("unknown"); - info->value->locked = bdrv_dev_is_medium_locked(bs); - info->value->removable = bdrv_dev_has_removable_media(bs); - - if (bdrv_dev_has_removable_media(bs)) { - info->value->has_tray_open = true; - info->value->tray_open = bdrv_dev_is_tray_open(bs); + intermediate = top; + while (intermediate && intermediate != base) { + int pnum_inter; + ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors, + &pnum_inter); + if (ret < 0) { + return ret; + } else if (ret) { + *pnum = pnum_inter; + return 1; } - if (bdrv_iostatus_is_enabled(bs)) { - info->value->has_io_status = true; - info->value->io_status = bs->iostatus; + /* + * [sector_num, nb_sectors] is unallocated on top but intermediate + * might have + * + * [sector_num+x, nr_sectors] allocated. 
+ */ + if (n > pnum_inter) { + n = pnum_inter; } - if (bs->drv) { - info->value->has_inserted = true; - info->value->inserted = g_malloc0(sizeof(*info->value->inserted)); - info->value->inserted->file = g_strdup(bs->filename); - info->value->inserted->ro = bs->read_only; - info->value->inserted->drv = g_strdup(bs->drv->format_name); - info->value->inserted->encrypted = bs->encrypted; - if (bs->backing_file[0]) { - info->value->inserted->has_backing_file = true; - info->value->inserted->backing_file = g_strdup(bs->backing_file); - } + intermediate = intermediate->backing_hd; + } - if (bs->io_limits_enabled) { - info->value->inserted->bps = - bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]; - info->value->inserted->bps_rd = - bs->io_limits.bps[BLOCK_IO_LIMIT_READ]; - info->value->inserted->bps_wr = - bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE]; - info->value->inserted->iops = - bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]; - info->value->inserted->iops_rd = - bs->io_limits.iops[BLOCK_IO_LIMIT_READ]; - info->value->inserted->iops_wr = - bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE]; - } + *pnum = n; + return 0; +} + +BlockInfo *bdrv_query_info(BlockDriverState *bs) +{ + BlockInfo *info = g_malloc0(sizeof(*info)); + info->device = g_strdup(bs->device_name); + info->type = g_strdup("unknown"); + info->locked = bdrv_dev_is_medium_locked(bs); + info->removable = bdrv_dev_has_removable_media(bs); + + if (bdrv_dev_has_removable_media(bs)) { + info->has_tray_open = true; + info->tray_open = bdrv_dev_is_tray_open(bs); + } + + if (bdrv_iostatus_is_enabled(bs)) { + info->has_io_status = true; + info->io_status = bs->iostatus; + } + + if (bs->dirty_bitmap) { + info->has_dirty = true; + info->dirty = g_malloc0(sizeof(*info->dirty)); + info->dirty->count = bdrv_get_dirty_count(bs) * + BDRV_SECTORS_PER_DIRTY_CHUNK * BDRV_SECTOR_SIZE; + } + + if (bs->drv) { + info->has_inserted = true; + info->inserted = g_malloc0(sizeof(*info->inserted)); + info->inserted->file = g_strdup(bs->filename); + info->inserted->ro = bs->read_only; + info->inserted->drv = g_strdup(bs->drv->format_name); + info->inserted->encrypted = bs->encrypted; + info->inserted->encryption_key_missing = bdrv_key_required(bs); + + if (bs->backing_file[0]) { + info->inserted->has_backing_file = true; + info->inserted->backing_file = g_strdup(bs->backing_file); } - /* XXX: waiting for the qapi to support GSList */ - if (!cur_item) { - head = cur_item = info; - } else { - cur_item->next = info; - cur_item = info; + info->inserted->backing_file_depth = bdrv_get_backing_file_depth(bs); + + if (bs->io_limits_enabled) { + info->inserted->bps = + bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]; + info->inserted->bps_rd = + bs->io_limits.bps[BLOCK_IO_LIMIT_READ]; + info->inserted->bps_wr = + bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE]; + info->inserted->iops = + bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]; + info->inserted->iops_rd = + bs->io_limits.iops[BLOCK_IO_LIMIT_READ]; + info->inserted->iops_wr = + bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE]; } } + return info; +} + +BlockInfoList *qmp_query_block(Error **errp) +{ + BlockInfoList *head = NULL, **p_next = &head; + BlockDriverState *bs; + + QTAILQ_FOREACH(bs, &bdrv_states, list) { + BlockInfoList *info = g_malloc0(sizeof(*info)); + info->value = bdrv_query_info(bs); + + *p_next = info; + p_next = &info->next; + } return head; } -/* Consider exposing this as a full fledged QMP command */ -static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp) +BlockStats *bdrv_query_stats(const BlockDriverState *bs) { BlockStats 
*s; @@ -2658,7 +2927,7 @@ static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp) if (bs->file) { s->has_parent = true; - s->parent = qmp_query_blockstat(bs->file, NULL); + s->parent = bdrv_query_stats(bs->file); } return s; @@ -2666,20 +2935,15 @@ static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp) BlockStatsList *qmp_query_blockstats(Error **errp) { - BlockStatsList *head = NULL, *cur_item = NULL; + BlockStatsList *head = NULL, **p_next = &head; BlockDriverState *bs; QTAILQ_FOREACH(bs, &bdrv_states, list) { BlockStatsList *info = g_malloc0(sizeof(*info)); - info->value = qmp_query_blockstat(bs, NULL); + info->value = bdrv_query_stats(bs); - /* XXX: waiting for the qapi to support GSList */ - if (!cur_item) { - head = cur_item = info; - } else { - cur_item->next = info; - cur_item = info; - } + *p_next = info; + p_next = &info->next; } return head; @@ -2712,9 +2976,7 @@ int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, if (bdrv_check_request(bs, sector_num, nb_sectors)) return -EIO; - if (bs->dirty_bitmap) { - set_dirty_bitmap(bs, sector_num, nb_sectors, 1); - } + assert(!bs->dirty_bitmap); return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); } @@ -2764,7 +3026,7 @@ void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event) return; } - return drv->bdrv_debug_event(bs, event); + drv->bdrv_debug_event(bs, event); } @@ -2891,22 +3153,99 @@ int bdrv_snapshot_load_tmp(BlockDriverState *bs, return -ENOTSUP; } +/* backing_file can either be relative, or absolute, or a protocol. If it is + * relative, it must be relative to the chain. So, passing in bs->filename + * from a BDS as backing_file should not be done, as that may be relative to + * the CWD rather than the chain. 
*/ BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, const char *backing_file) { - if (!bs->drv) { + char *filename_full = NULL; + char *backing_file_full = NULL; + char *filename_tmp = NULL; + int is_protocol = 0; + BlockDriverState *curr_bs = NULL; + BlockDriverState *retval = NULL; + + if (!bs || !bs->drv || !backing_file) { return NULL; } - if (bs->backing_hd) { - if (strcmp(bs->backing_file, backing_file) == 0) { - return bs->backing_hd; + filename_full = g_malloc(PATH_MAX); + backing_file_full = g_malloc(PATH_MAX); + filename_tmp = g_malloc(PATH_MAX); + + is_protocol = path_has_protocol(backing_file); + + for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) { + + /* If either of the filename paths is actually a protocol, then + * compare unmodified paths; otherwise make paths relative */ + if (is_protocol || path_has_protocol(curr_bs->backing_file)) { + if (strcmp(backing_file, curr_bs->backing_file) == 0) { + retval = curr_bs->backing_hd; + break; + } } else { - return bdrv_find_backing_image(bs->backing_hd, backing_file); + /* If not an absolute filename path, make it relative to the current + * image's filename path */ + path_combine(filename_tmp, PATH_MAX, curr_bs->filename, + backing_file); + + /* We are going to compare absolute pathnames */ + if (!realpath(filename_tmp, filename_full)) { + continue; + } + + /* We need to make sure the backing filename we are comparing against + * is relative to the current image filename (or absolute) */ + path_combine(filename_tmp, PATH_MAX, curr_bs->filename, + curr_bs->backing_file); + + if (!realpath(filename_tmp, backing_file_full)) { + continue; + } + + if (strcmp(backing_file_full, filename_full) == 0) { + retval = curr_bs->backing_hd; + break; + } } } - return NULL; + g_free(filename_full); + g_free(backing_file_full); + g_free(filename_tmp); + return retval; +} + +int bdrv_get_backing_file_depth(BlockDriverState *bs) +{ + if (!bs->drv) { + return 0; + } + + if (!bs->backing_hd) { + return 0; + } + + return 1 + bdrv_get_backing_file_depth(bs->backing_hd); +} + +BlockDriverState *bdrv_find_base(BlockDriverState *bs) +{ + BlockDriverState *curr_bs = NULL; + + if (!bs) { + return NULL; + } + + curr_bs = bs; + + while (curr_bs->backing_hd) { + curr_bs = curr_bs->backing_hd; + } + return curr_bs; } #define NB_SUFFIXES 4 @@ -3101,13 +3440,13 @@ static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, // Add the first request to the merged one. If the requests are // overlapping, drop the last sectors of the first request. 
size = (reqs[i].sector - reqs[outidx].sector) << 9; - qemu_iovec_concat(qiov, reqs[outidx].qiov, size); + qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size); // We should need to add any zeros between the two requests assert (reqs[i].sector <= oldreq_last); // Add the second request - qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size); + qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size); reqs[outidx].nb_sectors = qiov->size >> 9; reqs[outidx].qiov = qiov; @@ -3182,7 +3521,7 @@ int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) void bdrv_aio_cancel(BlockDriverAIOCB *acb) { - acb->pool->cancel(acb); + acb->aiocb_info->cancel(acb); } /* block I/O throttling */ @@ -3372,7 +3711,7 @@ static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb) qemu_aio_release(acb); } -static AIOPool bdrv_em_aio_pool = { +static const AIOCBInfo bdrv_em_aiocb_info = { .aiocb_size = sizeof(BlockDriverAIOCBSync), .cancel = bdrv_aio_cancel_em, }; @@ -3382,7 +3721,7 @@ static void bdrv_aio_bh_cb(void *opaque) BlockDriverAIOCBSync *acb = opaque; if (!acb->is_write) - qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size); + qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); qemu_vfree(acb->bounce); acb->common.cb(acb->common.opaque, acb->ret); qemu_bh_delete(acb->bh); @@ -3401,14 +3740,14 @@ static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, { BlockDriverAIOCBSync *acb; - acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque); + acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); acb->is_write = is_write; acb->qiov = qiov; acb->bounce = qemu_blockalign(bs, qiov->size); acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb); if (is_write) { - qemu_iovec_to_buffer(acb->qiov, acb->bounce); + qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); } else { acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); @@ -3446,7 +3785,7 @@ static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb) qemu_aio_flush(); } -static AIOPool bdrv_em_co_aio_pool = { +static const AIOCBInfo bdrv_em_co_aiocb_info = { .aiocb_size = sizeof(BlockDriverAIOCBCoroutine), .cancel = bdrv_aio_co_cancel_em, }; @@ -3489,7 +3828,7 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, Coroutine *co; BlockDriverAIOCBCoroutine *acb; - acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque); + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); acb->req.sector = sector_num; acb->req.nb_sectors = nb_sectors; acb->req.qiov = qiov; @@ -3519,7 +3858,7 @@ BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs, Coroutine *co; BlockDriverAIOCBCoroutine *acb; - acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque); + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); co = qemu_coroutine_create(bdrv_aio_flush_co_entry); qemu_coroutine_enter(co, acb); @@ -3545,7 +3884,7 @@ BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs, trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); - acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque); + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); acb->req.sector = sector_num; acb->req.nb_sectors = nb_sectors; co = qemu_coroutine_create(bdrv_aio_discard_co_entry); @@ -3565,18 +3904,13 @@ void bdrv_init_with_whitelist(void) bdrv_init(); } -void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs, +void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, BlockDriverCompletionFunc *cb, void 
*opaque) { BlockDriverAIOCB *acb; - if (pool->free_aiocb) { - acb = pool->free_aiocb; - pool->free_aiocb = acb->next; - } else { - acb = g_malloc0(pool->aiocb_size); - acb->pool = pool; - } + acb = g_slice_alloc(aiocb_info->aiocb_size); + acb->aiocb_info = aiocb_info; acb->bs = bs; acb->cb = cb; acb->opaque = opaque; @@ -3585,10 +3919,8 @@ void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs, void qemu_aio_release(void *p) { - BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p; - AIOPool *pool = acb->pool; - acb->next = pool->free_aiocb; - pool->free_aiocb = acb; + BlockDriverAIOCB *acb = p; + g_slice_free1(acb->aiocb_info->aiocb_size, acb); } /**************************************************************/ @@ -3672,7 +4004,7 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) /* But don't actually force it to the disk with cache=unsafe */ if (bs->open_flags & BDRV_O_NO_FLUSH) { - return 0; + goto flush_parent; } if (bs->drv->bdrv_co_flush_to_disk) { @@ -3711,6 +4043,7 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH * in the case of cache=unsafe, so there are no useless flushes. */ +flush_parent: return bdrv_co_flush(bs->file); } @@ -3943,13 +4276,54 @@ int bdrv_get_dirty(BlockDriverState *bs, int64_t sector) if (bs->dirty_bitmap && (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) { - return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] & - (1UL << (chunk % (sizeof(unsigned long) * 8)))); + return !!(bs->dirty_bitmap[chunk / BITS_PER_LONG] & + (1UL << (chunk % BITS_PER_LONG))); } else { return 0; } } +int64_t bdrv_get_next_dirty(BlockDriverState *bs, int64_t sector) +{ + int64_t chunk; + int bit, elem; + + /* Avoid an infinite loop. */ + assert(bs->dirty_count > 0); + + sector = (sector | (BDRV_SECTORS_PER_DIRTY_CHUNK - 1)) + 1; + chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK; + + QEMU_BUILD_BUG_ON(sizeof(bs->dirty_bitmap[0]) * 8 != BITS_PER_LONG); + elem = chunk / BITS_PER_LONG; + bit = chunk % BITS_PER_LONG; + for (;;) { + if (sector >= bs->total_sectors) { + sector = 0; + bit = elem = 0; + } + if (bit == 0 && bs->dirty_bitmap[elem] == 0) { + sector += BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG; + elem++; + } else { + if (bs->dirty_bitmap[elem] & (1UL << bit)) { + return sector; + } + sector += BDRV_SECTORS_PER_DIRTY_CHUNK; + if (++bit == BITS_PER_LONG) { + bit = 0; + elem++; + } + } + } +} + +void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, + int nr_sectors) +{ + set_dirty_bitmap(bs, cur_sector, nr_sectors, 1); +} + void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors) { @@ -3983,9 +4357,9 @@ void bdrv_iostatus_enable(BlockDriverState *bs) bool bdrv_iostatus_is_enabled(const BlockDriverState *bs) { return (bs->iostatus_enabled && - (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC || - bs->on_write_error == BLOCK_ERR_STOP_ANY || - bs->on_read_error == BLOCK_ERR_STOP_ANY)); + (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC || + bs->on_write_error == BLOCKDEV_ON_ERROR_STOP || + bs->on_read_error == BLOCKDEV_ON_ERROR_STOP)); } void bdrv_iostatus_disable(BlockDriverState *bs) @@ -3997,17 +4371,16 @@ void bdrv_iostatus_reset(BlockDriverState *bs) { if (bdrv_iostatus_is_enabled(bs)) { bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK; + if (bs->job) { + block_job_iostatus_reset(bs->job); + } } } -/* XXX: Today this is set by device models because it makes the implementation - quite simple. 
However, the block layer knows about the error, so it's - possible to implement this without device models being involved */ void bdrv_iostatus_set_err(BlockDriverState *bs, int error) { - if (bdrv_iostatus_is_enabled(bs) && - bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) { - assert(error >= 0); + assert(bdrv_iostatus_is_enabled(bs)); + if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) { bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE : BLOCK_DEVICE_IO_STATUS_FAILED; } @@ -4181,130 +4554,3 @@ out: return ret; } - -void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs, - int64_t speed, BlockDriverCompletionFunc *cb, - void *opaque, Error **errp) -{ - BlockJob *job; - - if (bs->job || bdrv_in_use(bs)) { - error_set(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs)); - return NULL; - } - bdrv_set_in_use(bs, 1); - - job = g_malloc0(job_type->instance_size); - job->job_type = job_type; - job->bs = bs; - job->cb = cb; - job->opaque = opaque; - job->busy = true; - bs->job = job; - - /* Only set speed when necessary to avoid NotSupported error */ - if (speed != 0) { - Error *local_err = NULL; - - block_job_set_speed(job, speed, &local_err); - if (error_is_set(&local_err)) { - bs->job = NULL; - g_free(job); - bdrv_set_in_use(bs, 0); - error_propagate(errp, local_err); - return NULL; - } - } - return job; -} - -void block_job_complete(BlockJob *job, int ret) -{ - BlockDriverState *bs = job->bs; - - assert(bs->job == job); - job->cb(job->opaque, ret); - bs->job = NULL; - g_free(job); - bdrv_set_in_use(bs, 0); -} - -void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp) -{ - Error *local_err = NULL; - - if (!job->job_type->set_speed) { - error_set(errp, QERR_NOT_SUPPORTED); - return; - } - job->job_type->set_speed(job, speed, &local_err); - if (error_is_set(&local_err)) { - error_propagate(errp, local_err); - return; - } - - job->speed = speed; -} - -void block_job_cancel(BlockJob *job) -{ - job->cancelled = true; - if (job->co && !job->busy) { - qemu_coroutine_enter(job->co, NULL); - } -} - -bool block_job_is_cancelled(BlockJob *job) -{ - return job->cancelled; -} - -struct BlockCancelData { - BlockJob *job; - BlockDriverCompletionFunc *cb; - void *opaque; - bool cancelled; - int ret; -}; - -static void block_job_cancel_cb(void *opaque, int ret) -{ - struct BlockCancelData *data = opaque; - - data->cancelled = block_job_is_cancelled(data->job); - data->ret = ret; - data->cb(data->opaque, ret); -} - -int block_job_cancel_sync(BlockJob *job) -{ - struct BlockCancelData data; - BlockDriverState *bs = job->bs; - - assert(bs->job == job); - - /* Set up our own callback to store the result and chain to - * the original callback. - */ - data.job = job; - data.cb = job->cb; - data.opaque = job->opaque; - data.ret = -EINPROGRESS; - job->cb = block_job_cancel_cb; - job->opaque = &data; - block_job_cancel(job); - while (data.ret == -EINPROGRESS) { - qemu_aio_wait(); - } - return (data.cancelled && data.ret == 0) ? -ECANCELED : data.ret; -} - -void block_job_sleep_ns(BlockJob *job, QEMUClock *clock, int64_t ns) -{ - /* Check cancellation *before* setting busy = false, too! */ - if (!block_job_is_cancelled(job)) { - job->busy = false; - co_sleep_ns(clock, ns); - job->busy = true; - } -}
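
The new caller-facing API in this patch is the transactional reopen machinery: bdrv_reopen_queue() collects devices, bdrv_reopen_multiple() prepares every entry and then either commits all of them or aborts the ones already prepared, and bdrv_reopen() wraps the single-device case. Below is a minimal caller-side sketch of the intended usage, assuming the QEMU-internal "block.h" of this era declares the reopen API; the helper name demote_to_read_only() and its two BlockDriverState arguments are hypothetical and not part of the patch.

#include "block.h"   /* assumed in-tree header declaring the reopen API */

/* Drop write access on two drives atomically: either both end up
 * reopened read-only, or neither is changed. */
static int demote_to_read_only(BlockDriverState *bs_a,
                               BlockDriverState *bs_b, Error **errp)
{
    BlockReopenQueue *queue;
    int flags_a = bdrv_get_flags(bs_a) & ~BDRV_O_RDWR;
    int flags_b = bdrv_get_flags(bs_b) & ~BDRV_O_RDWR;

    /* The first call allocates and initializes the queue, later calls
     * append to it.  bdrv_reopen_queue() also queues bs->file itself,
     * so the protocol layer is reopened with the same flags. */
    queue = bdrv_reopen_queue(NULL, bs_a, flags_a);
    queue = bdrv_reopen_queue(queue, bs_b, flags_b);

    /* Prepares every entry, then commits all or aborts the already
     * prepared ones; the queue and its entries are freed internally. */
    return bdrv_reopen_multiple(queue, errp);
}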
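
The error-handling rework stores BlockdevOnError policies in the BlockDriverState and splits the decision into bdrv_get_error_action(), which only maps the configured policy and errno to an action, and bdrv_error_action(), which emits the QMP event and stops the VM for BDRV_ACTION_STOP. A hedged sketch of the call pattern a device model's completion path is expected to follow; handle_rw_error() and its return convention are illustrative only.

static int handle_rw_error(BlockDriverState *bs, bool is_read, int error)
{
    /* error is a positive errno value, e.g. ENOSPC */
    BlockErrorAction action = bdrv_get_error_action(bs, is_read, error);

    /* Emits QEVENT_BLOCK_IO_ERROR for every action; for
     * BDRV_ACTION_STOP it also pauses the VM and records the error
     * in the device's iostatus. */
    bdrv_error_action(bs, action, is_read, error);

    /* A real device would queue the request for retry on STOP; here we
     * simply report success for IGNORE and failure otherwise. */
    return action == BDRV_ACTION_IGNORE ? 0 : -error;
}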
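
The patch also gives BlockDriverState a close-notifier list: bdrv_new() initializes it and bdrv_close() notifies it before tearing down the driver state. A small sketch of registering one, assuming the Notifier callback convention of this era's "notify.h" (the callback receives the notifier plus a data pointer, which bdrv_close() sets to the BlockDriverState being closed); my_close_cb(), my_close_notifier and watch_close() are made-up names.

static Notifier my_close_notifier;

static void my_close_cb(Notifier *notifier, void *data)
{
    BlockDriverState *closing_bs = data;   /* passed by bdrv_close() */

    /* Release any per-image state tied to closing_bs here. */
    (void)notifier;
    (void)closing_bs;
}

static void watch_close(BlockDriverState *bs)
{
    my_close_notifier.notify = my_close_cb;
    bdrv_add_close_notifier(bs, &my_close_notifier);
}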