#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
-static BlockAIOCB *bdrv_co_aio_rw_vector(BdrvChild *child,
- int64_t sector_num,
- QEMUIOVector *qiov,
- int nb_sectors,
- BdrvRequestFlags flags,
- BlockCompletionFunc *cb,
- void *opaque,
- bool is_write);
+static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child,
+ int64_t offset,
+ QEMUIOVector *qiov,
+ BdrvRequestFlags flags,
+ BlockCompletionFunc *cb,
+ void *opaque,
+ bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
int64_t offset, int count, BdrvRequestFlags flags);
int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
bs->bl.request_alignment);
- assert(is_power_of_2(alignment));
- head = offset & (alignment - 1);
- tail = (offset + count) & (alignment - 1);
- max_write_zeroes &= ~(alignment - 1);
+ assert(alignment % bs->bl.request_alignment == 0);
+ head = offset % alignment;
+ tail = (offset + count) % alignment;
+ max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
+ assert(max_write_zeroes >= bs->bl.request_alignment);
while (count > 0 && !ret) {
int num = count;
}
/*
- * Forwards an already correctly aligned write request to the BlockDriver.
+ * Forwards an already correctly aligned write request to the BlockDriver,
+ * after possibly fragmenting it.
*/
static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
int64_t start_sector = offset >> BDRV_SECTOR_BITS;
int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
+ uint64_t bytes_remaining = bytes;
+ int max_transfer;
assert(is_power_of_2(align));
assert((offset & (align - 1)) == 0);
assert(!qiov || bytes == qiov->size);
assert((bs->open_flags & BDRV_O_NO_IO) == 0);
assert(!(flags & ~BDRV_REQ_MASK));
+ max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
+ align);
waited = wait_serialising_requests(req);
assert(!waited || !req->serialising);
} else if (flags & BDRV_REQ_ZERO_WRITE) {
bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
- } else {
+ } else if (bytes <= max_transfer) {
bdrv_debug_event(bs, BLKDBG_PWRITEV);
ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
+ } else {
+ bdrv_debug_event(bs, BLKDBG_PWRITEV);
+ while (bytes_remaining) {
+ int num = MIN(bytes_remaining, max_transfer);
+ QEMUIOVector local_qiov;
+ int local_flags = flags;
+
+ assert(num);
+ if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
+ !(bs->supported_write_flags & BDRV_REQ_FUA)) {
+ /* If FUA is going to be emulated by flush, we only
+ * need to flush on the last iteration */
+ local_flags &= ~BDRV_REQ_FUA;
+ }
+ qemu_iovec_init(&local_qiov, qiov->niov);
+ qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
+
+ ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
+ num, &local_qiov, local_flags);
+ qemu_iovec_destroy(&local_qiov);
+ if (ret < 0) {
+ break;
+ }
+ bytes_remaining -= num;
+ }
}
bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
if (ret >= 0) {
bs->total_sectors = MAX(bs->total_sectors, end_sector);
+ ret = 0;
}
return ret;
{
trace_bdrv_aio_readv(child->bs, sector_num, nb_sectors, opaque);
- return bdrv_co_aio_rw_vector(child, sector_num, qiov, nb_sectors, 0,
- cb, opaque, false);
+ assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size);
+ return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov,
+ 0, cb, opaque, false);
}
BlockAIOCB *bdrv_aio_writev(BdrvChild *child, int64_t sector_num,
{
trace_bdrv_aio_writev(child->bs, sector_num, nb_sectors, opaque);
- return bdrv_co_aio_rw_vector(child, sector_num, qiov, nb_sectors, 0,
- cb, opaque, true);
+ assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size);
+ return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov,
+ 0, cb, opaque, true);
}
void bdrv_aio_cancel(BlockAIOCB *acb)
union {
/* Used during read, write, trim */
struct {
- int64_t sector;
- int nb_sectors;
+ int64_t offset;
+ int bytes;
int flags;
QEMUIOVector *qiov;
};
BlockAIOCBCoroutine *acb = opaque;
if (!acb->is_write) {
- acb->req.error = bdrv_co_do_readv(acb->child, acb->req.sector,
- acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
+ acb->req.error = bdrv_co_preadv(acb->child, acb->req.offset,
+ acb->req.qiov->size, acb->req.qiov, acb->req.flags);
} else {
- acb->req.error = bdrv_co_do_writev(acb->child, acb->req.sector,
- acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
+ acb->req.error = bdrv_co_pwritev(acb->child, acb->req.offset,
+ acb->req.qiov->size, acb->req.qiov, acb->req.flags);
}
bdrv_co_complete(acb);
}
-static BlockAIOCB *bdrv_co_aio_rw_vector(BdrvChild *child,
- int64_t sector_num,
- QEMUIOVector *qiov,
- int nb_sectors,
- BdrvRequestFlags flags,
- BlockCompletionFunc *cb,
- void *opaque,
- bool is_write)
+static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child,
+ int64_t offset,
+ QEMUIOVector *qiov,
+ BdrvRequestFlags flags,
+ BlockCompletionFunc *cb,
+ void *opaque,
+ bool is_write)
{
Coroutine *co;
BlockAIOCBCoroutine *acb;
acb->child = child;
acb->need_bh = true;
acb->req.error = -EINPROGRESS;
- acb->req.sector = sector_num;
- acb->req.nb_sectors = nb_sectors;
+ acb->req.offset = offset;
acb->req.qiov = qiov;
acb->req.flags = flags;
acb->is_write = is_write;
return &acb->common;
}
-static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
+static void coroutine_fn bdrv_aio_pdiscard_co_entry(void *opaque)
{
BlockAIOCBCoroutine *acb = opaque;
BlockDriverState *bs = acb->common.bs;
- acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
+ acb->req.error = bdrv_co_pdiscard(bs, acb->req.offset, acb->req.bytes);
bdrv_co_complete(acb);
}
-BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
- int64_t sector_num, int nb_sectors,
- BlockCompletionFunc *cb, void *opaque)
+BlockAIOCB *bdrv_aio_pdiscard(BlockDriverState *bs, int64_t offset, int count,
+ BlockCompletionFunc *cb, void *opaque)
{
Coroutine *co;
BlockAIOCBCoroutine *acb;
- trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
+ trace_bdrv_aio_pdiscard(bs, offset, count, opaque);
acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
acb->need_bh = true;
acb->req.error = -EINPROGRESS;
- acb->req.sector = sector_num;
- acb->req.nb_sectors = nb_sectors;
- co = qemu_coroutine_create(bdrv_aio_discard_co_entry, acb);
+ acb->req.offset = offset;
+ acb->req.bytes = count;
+ co = qemu_coroutine_create(bdrv_aio_pdiscard_co_entry, acb);
qemu_coroutine_enter(co);
bdrv_co_maybe_schedule_bh(acb);
int current_gen = bs->write_gen;
/* Wait until any previous flushes are completed */
- while (bs->flush_started_gen != bs->flushed_gen) {
+ while (bs->active_flush_req != NULL) {
qemu_co_queue_wait(&bs->flush_queue);
}
- bs->flush_started_gen = current_gen;
+ bs->active_flush_req = &req;
/* Write back all layers by calling one driver function */
if (bs->drv->bdrv_co_flush) {
out:
/* Notify any pending flushes that we have completed */
bs->flushed_gen = current_gen;
- qemu_co_queue_restart_all(&bs->flush_queue);
+ bs->active_flush_req = NULL;
+ /* Return value is ignored - it's ok if wait queue is empty */
+ qemu_co_queue_next(&bs->flush_queue);
tracked_request_end(&req);
return ret;
typedef struct DiscardCo {
BlockDriverState *bs;
- int64_t sector_num;
- int nb_sectors;
+ int64_t offset;
+ int count;
int ret;
} DiscardCo;
-static void coroutine_fn bdrv_discard_co_entry(void *opaque)
+static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
{
DiscardCo *rwco = opaque;
- rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
+ rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->count);
}
-int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
- int nb_sectors)
+int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
+ int count)
{
BdrvTrackedRequest req;
- int max_discard, ret;
+ int max_pdiscard, ret;
+ int head, align;
if (!bs->drv) {
return -ENOMEDIUM;
}
- ret = bdrv_check_request(bs, sector_num, nb_sectors);
+ ret = bdrv_check_byte_request(bs, offset, count);
if (ret < 0) {
return ret;
} else if (bs->read_only) {
return 0;
}
- if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
+ if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
+ return 0;
+ }
+
+ /* Discard is advisory, so ignore any unaligned head or tail */
+ align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
+ assert(align % bs->bl.request_alignment == 0);
+ head = offset % align;
+ if (head) {
+ head = MIN(count, align - head);
+ count -= head;
+ offset += head;
+ }
+ count = QEMU_ALIGN_DOWN(count, align);
+ if (!count) {
return 0;
}
- tracked_request_begin(&req, bs, sector_num << BDRV_SECTOR_BITS,
- nb_sectors << BDRV_SECTOR_BITS, BDRV_TRACKED_DISCARD);
+ tracked_request_begin(&req, bs, offset, count, BDRV_TRACKED_DISCARD);
ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
if (ret < 0) {
goto out;
}
- max_discard = MIN_NON_ZERO(bs->bl.max_pdiscard >> BDRV_SECTOR_BITS,
- BDRV_REQUEST_MAX_SECTORS);
- while (nb_sectors > 0) {
- int ret;
- int num = nb_sectors;
- int discard_alignment = bs->bl.pdiscard_alignment >> BDRV_SECTOR_BITS;
-
- /* align request */
- if (discard_alignment &&
- num >= discard_alignment &&
- sector_num % discard_alignment) {
- if (num > discard_alignment) {
- num = discard_alignment;
- }
- num -= sector_num % discard_alignment;
- }
+ max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
+ align);
+ assert(max_pdiscard);
- /* limit request size */
- if (num > max_discard) {
- num = max_discard;
- }
+ while (count > 0) {
+ int ret;
+ int num = MIN(count, max_pdiscard);
- if (bs->drv->bdrv_co_discard) {
- ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
+ if (bs->drv->bdrv_co_pdiscard) {
+ ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
} else {
BlockAIOCB *acb;
CoroutineIOCompletion co = {
.coroutine = qemu_coroutine_self(),
};
- acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
- bdrv_co_io_em_complete, &co);
+ acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
+ bdrv_co_io_em_complete, &co);
if (acb == NULL) {
ret = -EIO;
goto out;
goto out;
}
- sector_num += num;
- nb_sectors -= num;
+ offset += num;
+ count -= num;
}
ret = 0;
out:
return ret;
}
-int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
+int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count)
{
Coroutine *co;
DiscardCo rwco = {
.bs = bs,
- .sector_num = sector_num,
- .nb_sectors = nb_sectors,
+ .offset = offset,
+ .count = count,
.ret = NOT_DONE,
};
if (qemu_in_coroutine()) {
/* Fast-path if already in coroutine context */
- bdrv_discard_co_entry(&rwco);
+ bdrv_pdiscard_co_entry(&rwco);
} else {
AioContext *aio_context = bdrv_get_aio_context(bs);
- co = qemu_coroutine_create(bdrv_discard_co_entry, &rwco);
+ co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
qemu_coroutine_enter(co);
while (rwco.ret == NOT_DONE) {
aio_poll(aio_context, true);