/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "trace.h"
#include "sysemu/block-backend.h"
#include "block/aio-wait.h"
#include "block/blockjob.h"
#include "block/blockjob_int.h"
#include "block/block_int.h"
#include "qemu/cutils.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
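/* Added note (not in the original file): with BDRV_SECTOR_BITS == 9 this is
 * 32768 * 512 bytes, i.e. bounce buffers are capped at 16 MiB. */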
static AioWait drain_all_aio_wait;

static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags);
void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
                               bool ignore_bds_parents)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
            continue;
        }
        bdrv_parent_drained_begin_single(c, false);
    }
}

void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
                             bool ignore_bds_parents)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
            continue;
        }
        if (c->role->drained_end) {
            c->role->drained_end(c);
        }
    }
}

static bool bdrv_parent_drained_poll_single(BdrvChild *c)
{
    if (c->role->drained_poll) {
        return c->role->drained_poll(c);
    }
    return false;
}

static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
                                     bool ignore_bds_parents)
{
    BdrvChild *c, *next;
    bool busy = false;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
            continue;
        }
        busy |= bdrv_parent_drained_poll_single(c);
    }

    return busy;
}

void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
{
    if (c->role->drained_begin) {
        c->role->drained_begin(c);
    }
    if (poll) {
        BDRV_POLL_WHILE(c->bs, bdrv_parent_drained_poll_single(c));
    }
}
static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
{
    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
                                 src->opt_mem_alignment);
    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
                                 src->min_mem_alignment);
    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
}
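/*
 * Worked example (added for illustration, not in the original file):
 * merging is pessimistic in both directions.  MAX() picks the stricter
 * alignment, MIN_NON_ZERO() picks the tighter non-zero cap:
 *
 *   dst->max_transfer = 0 (unlimited), src->max_transfer = 65536
 *       => MIN_NON_ZERO(0, 65536) == 65536  (the child's cap wins)
 *   dst->opt_mem_alignment = 512, src->opt_mem_alignment = 4096
 *       => MAX(512, 4096) == 4096           (the stricter alignment wins)
 */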
void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Default alignment based on whether driver has byte interface */
    bs->bl.request_alignment = (drv->bdrv_co_preadv ||
                                drv->bdrv_aio_preadv) ? 1 : 512;

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
    } else {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = getpagesize();

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    if (bs->backing) {
        bdrv_refresh_limits(bs->backing->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    atomic_inc(&bs->copy_on_read);
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    int old = atomic_fetch_dec(&bs->copy_on_read);
    assert(old >= 1);
}
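/*
 * Illustrative usage (added note, not part of the original file): users pair
 * the two calls around the span where copy-on-read semantics are needed:
 *
 *     bdrv_enable_copy_on_read(bs);
 *     ... reads here may populate the top layer from the backing file ...
 *     bdrv_disable_copy_on_read(bs);
 *
 * Because the flag is a counter, concurrent users cannot clobber each other;
 * the feature only turns off once every user has disabled it.
 */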
typedef struct {
    Coroutine *co;
    BlockDriverState *bs;
    bool done;
    bool begin;
    bool recursive;
    bool poll;
    BdrvChild *parent;
    bool ignore_bds_parents;
} BdrvCoDrainData;
static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    BlockDriverState *bs = data->bs;

    if (data->begin) {
        bs->drv->bdrv_co_drain_begin(bs);
    } else {
        bs->drv->bdrv_co_drain_end(bs);
    }

    /* Set data->done before reading bs->wakeup. */
    atomic_mb_set(&data->done, true);
    bdrv_dec_in_flight(bs);

    if (data->begin) {
        g_free(data);
    }
}

/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
{
    BdrvCoDrainData *data;

    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
            (!begin && !bs->drv->bdrv_co_drain_end)) {
        return;
    }

    data = g_new(BdrvCoDrainData, 1);
    *data = (BdrvCoDrainData) {
        .bs = bs,
        .done = false,
        .begin = begin
    };

    /* Make sure the driver callback completes during the polling phase for
     * drain_begin. */
    bdrv_inc_in_flight(bs);
    data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
    aio_co_schedule(bdrv_get_aio_context(bs), data->co);

    if (!begin) {
        BDRV_POLL_WHILE(bs, !data->done);
        g_free(data);
    }
}
/* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
                     BdrvChild *ignore_parent, bool ignore_bds_parents)
{
    BdrvChild *child, *next;

    if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
        return true;
    }

    if (atomic_read(&bs->in_flight)) {
        return true;
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            if (bdrv_drain_poll(child->bs, recursive, child, false)) {
                return true;
            }
        }
    }

    return false;
}

static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
                                      BdrvChild *ignore_parent)
{
    /* Execute pending BHs first and check everything else only after the BHs
     * have executed. */
    while (aio_poll(bs->aio_context, false));

    return bdrv_drain_poll(bs, recursive, ignore_parent, false);
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll);
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents);
static void bdrv_co_drain_bh_cb(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    Coroutine *co = data->co;
    BlockDriverState *bs = data->bs;

    if (bs) {
        bdrv_dec_in_flight(bs);
        if (data->begin) {
            bdrv_do_drained_begin(bs, data->recursive, data->parent,
                                  data->ignore_bds_parents, data->poll);
        } else {
            bdrv_do_drained_end(bs, data->recursive, data->parent,
                                data->ignore_bds_parents);
        }
    } else {
        assert(data->begin);
        bdrv_drain_all_begin();
    }

    data->done = true;
    aio_co_wake(co);
}

static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
                                                bool begin, bool recursive,
                                                BdrvChild *parent,
                                                bool ignore_bds_parents,
                                                bool poll)
{
    BdrvCoDrainData data;

    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
     * other coroutines run if they were queued by aio_co_enter(). */

    assert(qemu_in_coroutine());
    data = (BdrvCoDrainData) {
        .co = qemu_coroutine_self(),
        .bs = bs,
        .done = false,
        .begin = begin,
        .recursive = recursive,
        .parent = parent,
        .ignore_bds_parents = ignore_bds_parents,
        .poll = poll,
    };
    if (bs) {
        bdrv_inc_in_flight(bs);
    }
    aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
                            bdrv_co_drain_bh_cb, &data);

    qemu_coroutine_yield();
    /* If we are resumed from some other event (such as an aio completion or a
     * timer callback), it is a bug in the caller that should be fixed. */
    assert(data.done);
}
void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
                                   BdrvChild *parent, bool ignore_bds_parents)
{
    assert(!qemu_in_coroutine());

    /* Stop things in parent-to-child order */
    if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
        aio_disable_external(bdrv_get_aio_context(bs));
    }

    bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
    bdrv_drain_invoke(bs, true);
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll)
{
    BdrvChild *child, *next;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
                               poll);
        return;
    }

    bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter++;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
                                  false);
        }
    }

    /*
     * Wait for drained requests to finish.
     *
     * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
     * call is needed so things in this AioContext can make progress even
     * though we don't return to the main AioContext loop - this automatically
     * includes other nodes in the same AioContext and therefore all child
     * nodes.
     */
    if (poll) {
        assert(!ignore_bds_parents);
        BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
    }
}
void bdrv_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, false, NULL, false, true);
}

void bdrv_subtree_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, true, NULL, false, true);
}

static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents)
{
    BdrvChild *child, *next;
    int old_quiesce_counter;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents,
                               false);
        return;
    }
    assert(bs->quiesce_counter > 0);
    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);

    /* Re-enable things in child-to-parent order */
    bdrv_drain_invoke(bs, false);
    bdrv_parent_drained_end(bs, parent, ignore_bds_parents);
    if (old_quiesce_counter == 1) {
        aio_enable_external(bdrv_get_aio_context(bs));
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter--;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents);
        }
    }
}

void bdrv_drained_end(BlockDriverState *bs)
{
    bdrv_do_drained_end(bs, false, NULL, false);
}

void bdrv_subtree_drained_end(BlockDriverState *bs)
{
    bdrv_do_drained_end(bs, true, NULL, false);
}

void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
{
    int i;

    for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_begin(child->bs, true, child, false, true);
    }
}

void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
{
    int i;

    for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_end(child->bs, true, child, false);
    }
}
/*
 * Wait for pending requests to complete on a single BlockDriverState subtree,
 * and suspend block driver's internal I/O until next request arrives.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 */
void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
{
    assert(qemu_in_coroutine());
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

void bdrv_drain(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}
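/*
 * Illustrative pattern (added note, not part of the original file): callers
 * that need a quiescent node for a sequence of operations use a drained
 * section instead of the one-shot bdrv_drain():
 */
#if 0 /* example only; modify_graph_or_state() is a hypothetical caller */
    bdrv_drained_begin(bs);     /* stop parents, wait for in-flight requests */
    modify_graph_or_state(bs);  /* safe: no new requests can arrive */
    bdrv_drained_end(bs);       /* resume request processing */
#endif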
static void bdrv_drain_assert_idle(BlockDriverState *bs)
{
    BdrvChild *child, *next;

    assert(atomic_read(&bs->in_flight) == 0);
    QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
        bdrv_drain_assert_idle(child->bs);
    }
}

unsigned int bdrv_drain_all_count = 0;

static bool bdrv_drain_all_poll(void)
{
    BlockDriverState *bs = NULL;
    bool result = false;

    /* Execute pending BHs first (may modify the graph) and check everything
     * else only after the BHs have executed. */
    while (aio_poll(qemu_get_aio_context(), false));

    /* bdrv_drain_poll() can't make changes to the graph and we are holding the
     * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        aio_context_acquire(aio_context);
        result |= bdrv_drain_poll(bs, false, NULL, true);
        aio_context_release(aio_context);
    }

    return result;
}
/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * This pauses all block jobs and disables external clients. It must
 * be paired with bdrv_drain_all_end().
 *
 * NOTE: no new block jobs or BlockDriverStates can be created between
 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 */
void bdrv_drain_all_begin(void)
{
    BlockDriverState *bs = NULL;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true);
        return;
    }

    /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
     * loop AioContext, so make sure we're in the main context. */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    assert(bdrv_drain_all_count < INT_MAX);
    bdrv_drain_all_count++;

    /* Quiesce all nodes, without polling in-flight requests yet. The graph
     * cannot change during this loop. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_begin(bs, false, NULL, true, false);
        aio_context_release(aio_context);
    }

    /* Now poll the in-flight requests */
    AIO_WAIT_WHILE(&drain_all_aio_wait, NULL, bdrv_drain_all_poll());

    while ((bs = bdrv_next_all_states(bs))) {
        bdrv_drain_assert_idle(bs);
    }
}
void bdrv_drain_all_end(void)
{
    BlockDriverState *bs = NULL;

    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_end(bs, false, NULL, true);
        aio_context_release(aio_context);
    }

    assert(bdrv_drain_all_count > 0);
    bdrv_drain_all_count--;
}

void bdrv_drain_all(void)
{
    bdrv_drain_all_begin();
    bdrv_drain_all_end();
}
/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        atomic_dec(&req->bs->serialising_in_flight);
    }

    qemu_co_mutex_lock(&req->bs->reqs_lock);
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
    qemu_co_mutex_unlock(&req->bs->reqs_lock);
}
/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  unsigned int bytes,
                                  enum BdrvTrackedRequestType type)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset         = offset,
        .bytes          = bytes,
        .type           = type,
        .co             = qemu_coroutine_self(),
        .serialising    = false,
        .overlap_offset = offset,
        .overlap_bytes  = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    qemu_co_mutex_lock(&bs->reqs_lock);
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
    qemu_co_mutex_unlock(&bs->reqs_lock);
}
static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                               - overlap_offset;

    if (!req->serialising) {
        atomic_inc(&req->bs->serialising_in_flight);
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}
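/*
 * Worked example (added for illustration, not in the original file):
 * with req->offset = 4097, req->bytes = 100 and align = 512:
 *
 *   overlap_offset = 4097 & ~(512 - 1)          = 4096
 *   overlap_bytes  = ROUND_UP(4197, 512) - 4096 = 4608 - 4096 = 512
 *
 * i.e. the serialised window is widened to the aligned block the request
 * touches, so overlapping requests in that block are ordered against it.
 */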
static bool is_request_serialising_and_aligned(BdrvTrackedRequest *req)
{
    /*
     * If the request is serialising, overlap_offset and overlap_bytes are set,
     * so we can check if the request is aligned. Otherwise, don't care and
     * return false.
     */

    return req->serialising && (req->offset == req->overlap_offset) &&
           (req->bytes == req->overlap_bytes);
}
/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t offset, int64_t bytes,
                            int64_t *cluster_offset,
                            int64_t *cluster_bytes)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_offset = offset;
        *cluster_bytes = bytes;
    } else {
        int64_t c = bdi.cluster_size;
        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
    }
}
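/*
 * Worked example (added for illustration, not in the original file):
 * with bdi.cluster_size = 65536, offset = 70000 and bytes = 1000:
 *
 *   *cluster_offset = QEMU_ALIGN_DOWN(70000, 65536)              = 65536
 *   *cluster_bytes  = QEMU_ALIGN_UP(70000 - 65536 + 1000, 65536) = 65536
 *
 * so the 1000-byte request is widened to the single cluster containing it.
 */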
static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->bl.request_alignment;
    } else {
        return bdi.cluster_size;
    }
}

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, unsigned int bytes)
{
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}
void bdrv_inc_in_flight(BlockDriverState *bs)
{
    atomic_inc(&bs->in_flight);
}

void bdrv_wakeup(BlockDriverState *bs)
{
    aio_wait_kick(bdrv_get_aio_wait(bs));
    aio_wait_kick(&drain_all_aio_wait);
}

void bdrv_dec_in_flight(BlockDriverState *bs)
{
    atomic_dec(&bs->in_flight);
    bdrv_wakeup(bs);
}
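/*
 * Added note (not in the original file): every asynchronous operation
 * brackets itself with this counter so drained sections can wait for it:
 *
 *     bdrv_inc_in_flight(bs);
 *     ... submit and complete the request ...
 *     bdrv_dec_in_flight(bs);   -- wakes anyone blocked in BDRV_POLL_WHILE()
 */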
static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    if (!atomic_read(&bs->serialising_in_flight)) {
        return false;
    }

    do {
        retry = false;
        qemu_co_mutex_lock(&bs->reqs_lock);
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests. This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
        qemu_co_mutex_unlock(&bs->reqs_lock);
    } while (retry);

    return waited;
}
static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
        return -EIO;
    }

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (offset < 0) {
        return -EIO;
    }

    return 0;
}

typedef struct RwCo {
    BdrvChild *child;
    int64_t offset;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;
static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset,
                                   rwco->qiov->size, rwco->qiov,
                                   rwco->flags);
    } else {
        rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
                                    rwco->qiov->size, rwco->qiov,
                                    rwco->flags);
    }
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .child = child,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
        bdrv_coroutine_enter(child->bs, co);
        BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
    }
    return rwco.ret;
}
/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BdrvChild *child, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_prwv_co(child, sector_num << BDRV_SECTOR_BITS,
                        &qiov, is_write, flags);
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BdrvChild *child, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(child, sector_num, buf, nb_sectors, false, 0);
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BdrvChild *child, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(child, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
}
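/*
 * Added note (not in the original file): these legacy helpers address the
 * image in 512-byte sectors; bdrv_rw_co() converts with
 * sector_num << BDRV_SECTOR_BITS, e.g. sector 2048 -> byte offset
 * 2048 * 512 = 1 MiB, before entering the byte-based coroutine path.
 */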
int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
                       int bytes, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = NULL,
        .iov_len = bytes,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_prwv_co(child, offset, &qiov, true,
                        BDRV_REQ_ZERO_WRITE | flags);
}
/*
 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 * BDRV_REQ_FUA).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
{
    int ret;
    int64_t target_size, bytes, offset = 0;
    BlockDriverState *bs = child->bs;

    target_size = bdrv_getlength(bs);
    if (target_size < 0) {
        return target_size;
    }

    for (;;) {
        bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
        if (bytes <= 0) {
            return 0;
        }
        ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
        if (ret < 0) {
            error_report("error getting block status at offset %" PRId64 ": %s",
                         offset, strerror(-ret));
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            offset += bytes;
            continue;
        }
        ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
        if (ret < 0) {
            error_report("error writing zeroes at offset %" PRId64 ": %s",
                         offset, strerror(-ret));
            return ret;
        }
        offset += bytes;
    }
}
int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, false, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_preadv(child, offset, &qiov);
}

int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, true, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *) buf,
        .iov_len = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(child, offset, &qiov);
}
/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
                     const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(child, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    ret = bdrv_flush(child->bs);
    if (ret < 0) {
        return ret;
    }

    return 0;
}
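/*
 * Illustrative usage (added, with hypothetical variables): format drivers
 * use this barrier when one metadata update must reach the disk before a
 * dependent one is issued:
 */
#if 0 /* example only; l1_table_offset etc. are hypothetical */
    ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table, l1_size);
    if (ret < 0) {
        return ret; /* the dependent update below is never issued */
    }
    ret = bdrv_pwrite(bs->file, header_offset, &header, sizeof(header));
#endif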
typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    aio_co_wake(co->coroutine);
}
static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
                                           uint64_t offset, uint64_t bytes,
                                           QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;

    assert(!(flags & ~BDRV_REQ_MASK));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_preadv) {
        return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
    }

    if (drv->bdrv_aio_preadv) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
                                   bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
    assert(drv->bdrv_co_readv);

    return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
}
static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
                                            uint64_t offset, uint64_t bytes,
                                            QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;
    int ret;

    assert(!(flags & ~BDRV_REQ_MASK));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_pwritev) {
        ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
                                   flags & bs->supported_write_flags);
        flags &= ~bs->supported_write_flags;
        goto emulate_flags;
    }

    if (drv->bdrv_aio_pwritev) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
                                    flags & bs->supported_write_flags,
                                    bdrv_co_io_em_complete, &co);
        flags &= ~bs->supported_write_flags;
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
        goto emulate_flags;
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);

    assert(drv->bdrv_co_writev);
    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov,
                              flags & bs->supported_write_flags);
    flags &= ~bs->supported_write_flags;

emulate_flags:
    if (ret == 0 && (flags & BDRV_REQ_FUA)) {
        ret = bdrv_co_flush(bs);
    }

    return ret;
}
static int coroutine_fn
bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
                               uint64_t bytes, QEMUIOVector *qiov)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (!drv->bdrv_co_pwritev_compressed) {
        return -ENOTSUP;
    }

    return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
}
static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
        int64_t offset, unsigned int bytes, QEMUIOVector *qiov)
{
    BlockDriverState *bs = child->bs;

    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file. This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector local_qiov;
    int64_t cluster_offset;
    int64_t cluster_bytes;
    size_t skip_bytes;
    int ret;
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
                                    BDRV_REQUEST_MAX_BYTES);
    unsigned int progress = 0;

    if (!drv) {
        return -ENOMEDIUM;
    }

    /* FIXME We cannot require callers to have write permissions when all they
     * are doing is a read request. If we did things right, write permissions
     * would be obtained anyway, but internally by the copy-on-read code. As
     * long as it is implemented here rather than in a separate filter driver,
     * the copy-on-read code doesn't have its own BdrvChild, however, for which
     * it could request permissions. Therefore we have to bypass the permission
     * system for the moment. */
    // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file. Note that this value may exceed
     * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
     * is one reason we loop rather than doing it all at once.
     */
    bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
    skip_bytes = offset - cluster_offset;

    trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
                                   cluster_offset, cluster_bytes);

    bounce_buffer = qemu_try_blockalign(bs,
                                        MIN(MIN(max_transfer, cluster_bytes),
                                            MAX_BOUNCE_BUFFER));
    if (bounce_buffer == NULL) {
        ret = -ENOMEM;
        goto err;
    }

    while (cluster_bytes) {
        int64_t pnum;

        ret = bdrv_is_allocated(bs, cluster_offset,
                                MIN(cluster_bytes, max_transfer), &pnum);
        if (ret < 0) {
            /* Safe to treat errors in querying allocation as if
             * unallocated; we'll probably fail again soon on the
             * read, but at least that will set a decent errno.
             */
            pnum = MIN(cluster_bytes, max_transfer);
        }

        /* Stop at EOF if the image ends in the middle of the cluster */
        if (ret == 0 && pnum == 0) {
            assert(progress >= bytes);
            break;
        }

        assert(skip_bytes < pnum);

        if (ret <= 0) {
            /* Must copy-on-read; use the bounce buffer */
            iov.iov_base = bounce_buffer;
            iov.iov_len = pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
            qemu_iovec_init_external(&local_qiov, &iov, 1);

            ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
                                     &local_qiov, 0);
            if (ret < 0) {
                goto err;
            }

            bdrv_debug_event(bs, BLKDBG_COR_WRITE);
            if (drv->bdrv_co_pwrite_zeroes &&
                buffer_is_zero(bounce_buffer, pnum)) {
                /* FIXME: Should we (perhaps conditionally) be setting
                 * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
                 * that still correctly reads as zero? */
                ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum,
                                               BDRV_REQ_WRITE_UNCHANGED);
            } else {
                /* This does not change the data on the disk, it is not
                 * necessary to flush even in cache=writethrough mode.
                 */
                ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
                                          &local_qiov,
                                          BDRV_REQ_WRITE_UNCHANGED);
            }

            if (ret < 0) {
                /* It might be okay to ignore write errors for guest
                 * requests. If this is a deliberate copy-on-read
                 * then we don't want to ignore the error. Simply
                 * report it in all cases.
                 */
                goto err;
            }

            qemu_iovec_from_buf(qiov, progress, bounce_buffer + skip_bytes,
                                pnum - skip_bytes);
        } else {
            /* Read directly into the destination */
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, progress, pnum - skip_bytes);
            ret = bdrv_driver_preadv(bs, offset + progress, local_qiov.size,
                                     &local_qiov, 0);
            qemu_iovec_destroy(&local_qiov);
            if (ret < 0) {
                goto err;
            }
        }

        cluster_offset += pnum;
        cluster_bytes -= pnum;
        progress += pnum - skip_bytes;
        skip_bytes = 0;
    }
    ret = 0;

err:
    qemu_vfree(bounce_buffer);
    return ret;
}
/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read, zeroing after EOF, and fragmentation of large
 * reads; any other features must be implemented by the caller.
 */
static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriverState *bs = child->bs;
    int64_t total_bytes, max_bytes;
    int ret = 0;
    uint64_t bytes_remaining = bytes;
    int max_transfer;

    assert(is_power_of_2(align));
    assert((offset & (align - 1)) == 0);
    assert((bytes & (align - 1)) == 0);
    assert(!qiov || bytes == qiov->size);
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
                                   align);

    /* TODO: We would need a per-BDS .supported_read_flags and
     * potential fallback support, if we ever implement any read flags
     * to pass through to drivers. For now, there aren't any
     * passthrough flags. */
    assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ)));

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap. This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster. For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them. */
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

    /* BDRV_REQ_SERIALISING is only for write operation */
    assert(!(flags & BDRV_REQ_SERIALISING));

    if (!(flags & BDRV_REQ_NO_SERIALISING)) {
        wait_serialising_requests(req);
    }

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int64_t pnum;

        ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != bytes) {
            ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov);
            goto out;
        }
    }

    /* Forward the request to the BlockDriver, possibly fragmenting it */
    total_bytes = bdrv_getlength(bs);
    if (total_bytes < 0) {
        ret = total_bytes;
        goto out;
    }

    max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
    if (bytes <= max_bytes && bytes <= max_transfer) {
        ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
        goto out;
    }

    while (bytes_remaining) {
        int num;

        if (max_bytes) {
            QEMUIOVector local_qiov;

            num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
            assert(num);
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);

            ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
                                     num, &local_qiov, 0);
            max_bytes -= num;
            qemu_iovec_destroy(&local_qiov);
        } else {
            num = bytes_remaining;
            ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
                                    bytes_remaining);
        }
        if (ret < 0) {
            goto out;
        }
        bytes_remaining -= num;
    }

out:
    return ret < 0 ? ret : 0;
}
/*
 * Handle a read request in coroutine context
 */
int coroutine_fn bdrv_co_preadv(BdrvChild *child,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriverState *bs = child->bs;
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;

    uint64_t align = bs->bl.request_alignment;
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    trace_bdrv_co_preadv(child->bs, offset, bytes, flags);

    if (!drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    bdrv_inc_in_flight(bs);

    /* Don't do copy-on-read if we read data before write operation */
    if (atomic_read(&bs->copy_on_read) && !(flags & BDRV_REQ_NO_SERIALISING)) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    /* Align read if necessary by padding qiov */
    if (offset & (align - 1)) {
        head_buf = qemu_blockalign(bs, align);
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }
        tail_buf = qemu_blockalign(bs, align);
        qemu_iovec_add(&local_qiov, tail_buf,
                       align - ((offset + bytes) & (align - 1)));

        bytes = ROUND_UP(bytes, align);
    }

    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
    ret = bdrv_aligned_preadv(child, &req, offset, bytes, align,
                              use_local_qiov ? &local_qiov : qiov,
                              flags);
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
        qemu_vfree(head_buf);
        qemu_vfree(tail_buf);
    }

    return ret;
}
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};
    int ret = 0;
    bool need_flush = false;
    int head = 0;
    int tail = 0;

    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
    int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
                        bs->bl.request_alignment);
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);

    if (!drv) {
        return -ENOMEDIUM;
    }

    assert(alignment % bs->bl.request_alignment == 0);
    head = offset % alignment;
    tail = (offset + bytes) % alignment;
    max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
    assert(max_write_zeroes >= bs->bl.request_alignment);

    while (bytes > 0 && !ret) {
        int num = bytes;

        /* Align request. Block drivers can expect the "bulk" of the request
         * to be aligned, and that unaligned requests do not cross cluster
         * boundaries.
         */
        if (head) {
            /* Make a small request up to the first aligned sector. For
             * convenience, limit this request to max_transfer even if
             * we don't need to fall back to writes. */
            num = MIN(MIN(bytes, max_transfer), alignment - head);
            head = (head + num) % alignment;
            assert(num < max_write_zeroes);
        } else if (tail && num > alignment) {
            /* Shorten the request to the last aligned sector. */
            num -= tail;
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_pwrite_zeroes) {
            ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
                                             flags & bs->supported_zero_flags);
            if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
                need_flush = true;
            }
        } else {
            assert(!bs->supported_zero_flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;

            if ((flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* No need for bdrv_driver_pwrite() to do a fallback
                 * flush on each chunk; use just one at the end */
                write_flags &= ~BDRV_REQ_FUA;
                need_flush = true;
            }
            num = MIN(num, max_transfer);
            if (iov.iov_base == NULL) {
                iov.iov_base = qemu_try_blockalign(bs, num);
                if (iov.iov_base == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
                memset(iov.iov_base, 0, num);
            }
            iov.iov_len = num;
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags);

            /* Keep bounce buffer around if it is big enough for all
             * all future requests.
             */
            if (num < max_transfer) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        offset += num;
        bytes -= num;
    }

fail:
    if (ret == 0 && need_flush) {
        ret = bdrv_co_flush(bs);
    }
    qemu_vfree(iov.iov_base);
    return ret;
}
/*
 * Forwards an already correctly aligned write request to the BlockDriver,
 * after possibly fragmenting it.
 */
static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriverState *bs = child->bs;
    BlockDriver *drv = bs->drv;
    bool waited;
    int ret;

    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
    uint64_t bytes_remaining = bytes;
    int max_transfer;

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (bdrv_has_readonly_bitmaps(bs)) {
        return -EPERM;
    }

    assert(is_power_of_2(align));
    assert((offset & (align - 1)) == 0);
    assert((bytes & (align - 1)) == 0);
    assert(!qiov || bytes == qiov->size);
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
    assert(!(flags & ~BDRV_REQ_MASK));
    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
                                   align);

    /* BDRV_REQ_NO_SERIALISING is only for read operation */
    assert(!(flags & BDRV_REQ_NO_SERIALISING));

    if (flags & BDRV_REQ_SERIALISING) {
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

    waited = wait_serialising_requests(req);
    assert(!waited || !req->serialising ||
           is_request_serialising_and_aligned(req));
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
    if (flags & BDRV_REQ_WRITE_UNCHANGED) {
        assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
    } else {
        assert(child->perm & BLK_PERM_WRITE);
    }
    assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
        qemu_iovec_is_zero(qiov)) {
        flags |= BDRV_REQ_ZERO_WRITE;
        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
    }

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
    } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
        ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, qiov);
    } else if (bytes <= max_transfer) {
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
    } else {
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        while (bytes_remaining) {
            int num = MIN(bytes_remaining, max_transfer);
            QEMUIOVector local_qiov;
            int local_flags = flags;

            assert(num);
            if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* If FUA is going to be emulated by flush, we only
                 * need to flush on the last iteration */
                local_flags &= ~BDRV_REQ_FUA;
            }
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);

            ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
                                      num, &local_qiov, local_flags);
            qemu_iovec_destroy(&local_qiov);
            if (ret < 0) {
                break;
            }
            bytes_remaining -= num;
        }
    }
    bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);

    atomic_inc(&bs->write_gen);
    bdrv_set_dirty(bs, offset, bytes);

    stat64_max(&bs->wr_highest_offset, offset + bytes);

    if (ret >= 0) {
        bs->total_sectors = MAX(bs->total_sectors, end_sector);
        ret = 0;
    }

    return ret;
}
static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
                                                int64_t offset,
                                                unsigned int bytes,
                                                BdrvRequestFlags flags,
                                                BdrvTrackedRequest *req)
{
    BlockDriverState *bs = child->bs;
    uint8_t *buf = NULL;
    QEMUIOVector local_qiov;
    struct iovec iov;
    uint64_t align = bs->bl.request_alignment;
    unsigned int head_padding_bytes, tail_padding_bytes;
    int ret = 0;

    head_padding_bytes = offset & (align - 1);
    tail_padding_bytes = (align - (offset + bytes)) & (align - 1);

    assert(flags & BDRV_REQ_ZERO_WRITE);
    if (head_padding_bytes || tail_padding_bytes) {
        buf = qemu_blockalign(bs, align);
        iov = (struct iovec) {
            .iov_base   = buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&local_qiov, &iov, 1);
    }
    if (head_padding_bytes) {
        uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);

        /* RMW the unaligned part before head. */
        mark_request_serialising(req, align);
        wait_serialising_requests(req);
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(child, req, offset & ~(align - 1), align,
                                  align, &local_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        memset(buf + head_padding_bytes, 0, zero_bytes);
        ret = bdrv_aligned_pwritev(child, req, offset & ~(align - 1), align,
                                   align, &local_qiov,
                                   flags & ~BDRV_REQ_ZERO_WRITE);
        if (ret < 0) {
            goto fail;
        }
        offset += zero_bytes;
        bytes -= zero_bytes;
    }

    assert(!bytes || (offset & (align - 1)) == 0);
    if (bytes >= align) {
        /* Write the aligned part in the middle. */
        uint64_t aligned_bytes = bytes & ~(align - 1);
        ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
                                   NULL, flags);
        if (ret < 0) {
            goto fail;
        }
        bytes -= aligned_bytes;
        offset += aligned_bytes;
    }

    assert(!bytes || (offset & (align - 1)) == 0);
    if (bytes) {
        assert(align == tail_padding_bytes + bytes);
        /* RMW the unaligned part after tail. */
        mark_request_serialising(req, align);
        wait_serialising_requests(req);
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(child, req, offset, align,
                                  align, &local_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        memset(buf, 0, bytes);
        ret = bdrv_aligned_pwritev(child, req, offset, align, align,
                                   &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
    }
fail:
    qemu_vfree(buf);
    return ret;
}
/*
 * Handle a write request in coroutine context
 */
int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriverState *bs = child->bs;
    BdrvTrackedRequest req;
    uint64_t align = bs->bl.request_alignment;
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    trace_bdrv_co_pwritev(child->bs, offset, bytes, flags);

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EPERM;
    }
    assert(!(bs->open_flags & BDRV_O_INACTIVE));

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    bdrv_inc_in_flight(bs);
    /*
     * Align write if necessary by performing a read-modify-write cycle.
     * Pad qiov with the read parts and be sure to have a tracked request not
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
     */
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);

    if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
        goto out;
    }

    if (offset & (align - 1)) {
        QEMUIOVector head_qiov;
        struct iovec head_iov;

        mark_request_serialising(&req, align);
        wait_serialising_requests(&req);

        head_buf = qemu_blockalign(bs, align);
        head_iov = (struct iovec) {
            .iov_base   = head_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&head_qiov, &head_iov, 1);

        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(child, &req, offset & ~(align - 1), align,
                                  align, &head_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);

        /* We have read the tail already if the request is smaller
         * than one aligned block.
         */
        if (bytes < align) {
            qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes);
            bytes = align;
        }
    }

    if ((offset + bytes) & (align - 1)) {
        QEMUIOVector tail_qiov;
        struct iovec tail_iov;
        size_t tail_bytes;
        bool waited;

        mark_request_serialising(&req, align);
        waited = wait_serialising_requests(&req);
        assert(!waited || !use_local_qiov);

        tail_buf = qemu_blockalign(bs, align);
        tail_iov = (struct iovec) {
            .iov_base   = tail_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);

        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(child, &req, (offset + bytes) & ~(align - 1),
                                  align, align, &tail_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }

        tail_bytes = (offset + bytes) & (align - 1);
        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);

        bytes = ROUND_UP(bytes, align);
    }

    ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);

fail:

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }
    qemu_vfree(head_buf);
    qemu_vfree(tail_buf);
out:
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);

    return ret;
}
int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
                                       int bytes, BdrvRequestFlags flags)
{
    trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);

    if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
        flags &= ~BDRV_REQ_MAY_UNMAP;
    }

    return bdrv_co_pwritev(child, offset, bytes, NULL,
                           BDRV_REQ_ZERO_WRITE | flags);
}
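/*
 * Added note (not in the original file): BDRV_REQ_MAY_UNMAP is stripped here
 * unless the image was opened with BDRV_O_UNMAP, so a caller such as
 *
 *     bdrv_co_pwrite_zeroes(child, 0, 65536, BDRV_REQ_MAY_UNMAP);
 *
 * only punches holes when discard was enabled at open time; otherwise the
 * range is merely guaranteed to read back as zeroes.
 */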
/*
 * Flush ALL BDSes regardless of if they are reachable via a BlkBackend or not.
 */
int bdrv_flush_all(void)
{
    BdrvNextIterator it;
    BlockDriverState *bs = NULL;
    int result = 0;

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        int ret;

        aio_context_acquire(aio_context);
        ret = bdrv_flush(bs);
        if (ret < 0 && !result) {
            result = ret;
        }
        aio_context_release(aio_context);
    }

    return result;
}
typedef struct BdrvCoBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;
    bool want_zero;
    int64_t offset;
    int64_t bytes;
    int64_t *pnum;
    int64_t *map;
    BlockDriverState **file;
    int ret;
    bool done;
} BdrvCoBlockStatusData;

int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs,
                                                bool want_zero,
                                                int64_t offset,
                                                int64_t bytes,
                                                int64_t *pnum,
                                                int64_t *map,
                                                BlockDriverState **file)
{
    assert(bs->file && bs->file->bs);
    *pnum = bytes;
    *map = offset;
    *file = bs->file->bs;
    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
}

int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs,
                                                   bool want_zero,
                                                   int64_t offset,
                                                   int64_t bytes,
                                                   int64_t *pnum,
                                                   int64_t *map,
                                                   BlockDriverState **file)
{
    assert(bs->backing && bs->backing->bs);
    *pnum = bytes;
    *map = offset;
    *file = bs->backing->bs;
    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
}
/*
 * Returns the allocation status of the specified sectors.
 * Drivers not implementing the functionality are assumed to not support
 * backing files, hence all their sectors are reported as allocated.
 *
 * If 'want_zero' is true, the caller is querying for mapping
 * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and
 * _ZERO where possible; otherwise, the result favors larger 'pnum',
 * with a focus on accurate BDRV_BLOCK_ALLOCATED.
 *
 * If 'offset' is beyond the end of the disk image the return value is
 * BDRV_BLOCK_EOF and 'pnum' is set to 0.
 *
 * 'bytes' is the max value 'pnum' should be set to. If bytes goes
 * beyond the end of the disk image it will be clamped; if 'pnum' is set to
 * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
 *
 * 'pnum' is set to the number of bytes (including and immediately
 * following the specified offset) that are easily known to be in the
 * same allocated/unallocated state. Note that a second call starting
 * at the original offset plus returned pnum may have the same status.
 * The returned value is non-zero on success except at end-of-file.
 *
 * Returns negative errno on failure. Otherwise, if the
 * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are
 * set to the host mapping and BDS corresponding to the guest offset.
 */
static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
                                             bool want_zero,
                                             int64_t offset, int64_t bytes,
                                             int64_t *pnum, int64_t *map,
                                             BlockDriverState **file)
{
    int64_t total_size;
    int64_t n; /* bytes */
    int ret;
    int64_t local_map = 0;
    BlockDriverState *local_file = NULL;
    int64_t aligned_offset, aligned_bytes;
    uint32_t align;

    assert(pnum);
    *pnum = 0;
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        goto early_out;
    }

    if (offset >= total_size) {
        ret = BDRV_BLOCK_EOF;
        goto early_out;
    }
    if (!bytes) {
        ret = 0;
        goto early_out;
    }

    n = total_size - offset;
    if (n < bytes) {
        bytes = n;
    }

    /* Must be non-NULL or bdrv_getlength() would have failed */
    assert(bs->drv);
    if (!bs->drv->bdrv_co_block_status) {
        *pnum = bytes;
        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
        if (offset + bytes == total_size) {
            ret |= BDRV_BLOCK_EOF;
        }
        if (bs->drv->protocol_name) {
            ret |= BDRV_BLOCK_OFFSET_VALID;
            local_map = offset;
            local_file = bs;
        }
        goto early_out;
    }

    bdrv_inc_in_flight(bs);

    /* Round out to request_alignment boundaries */
    align = bs->bl.request_alignment;
    aligned_offset = QEMU_ALIGN_DOWN(offset, align);
    aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;

    ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
                                        aligned_bytes, pnum, &local_map,
                                        &local_file);
    if (ret < 0) {
        *pnum = 0;
        goto out;
    }

    /*
     * The driver's result must be a non-zero multiple of request_alignment.
     * Clamp pnum and adjust map to original request.
     */
    assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) &&
           align > offset - aligned_offset);
    *pnum -= offset - aligned_offset;
    if (*pnum > bytes) {
        *pnum = bytes;
    }
    if (ret & BDRV_BLOCK_OFFSET_VALID) {
        local_map += offset - aligned_offset;
    }

    if (ret & BDRV_BLOCK_RAW) {
        assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file);
        ret = bdrv_co_block_status(local_file, want_zero, local_map,
                                   *pnum, pnum, &local_map, &local_file);
        goto out;
    }

    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
        ret |= BDRV_BLOCK_ALLOCATED;
    } else if (want_zero) {
        if (bdrv_unallocated_blocks_are_zero(bs)) {
            ret |= BDRV_BLOCK_ZERO;
        } else if (bs->backing) {
            BlockDriverState *bs2 = bs->backing->bs;
            int64_t size2 = bdrv_getlength(bs2);

            if (size2 >= 0 && offset >= size2) {
                ret |= BDRV_BLOCK_ZERO;
            }
        }
    }

    if (want_zero && local_file && local_file != bs &&
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
        int64_t file_pnum;
        int ret2;

        ret2 = bdrv_co_block_status(local_file, want_zero, local_map,
                                    *pnum, &file_pnum, NULL, NULL);
        if (ret2 >= 0) {
            /* Ignore errors. This is just providing extra information, it
             * is useful but not necessary.
             */
            if (ret2 & BDRV_BLOCK_EOF &&
                (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) {
                /*
                 * It is valid for the format block driver to read
                 * beyond the end of the underlying file's current
                 * size; such areas read as zero.
                 */
                ret |= BDRV_BLOCK_ZERO;
            } else {
                /* Limit request to the range reported by the protocol driver */
                *pnum = file_pnum;
                ret |= (ret2 & BDRV_BLOCK_ZERO);
            }
        }
    }

out:
    bdrv_dec_in_flight(bs);
    if (ret >= 0 && offset + *pnum == total_size) {
        ret |= BDRV_BLOCK_EOF;
    }
early_out:
    if (file) {
        *file = local_file;
    }
    if (map) {
        *map = local_map;
    }
    return ret;
}
static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
                                                   BlockDriverState *base,
                                                   bool want_zero,
                                                   int64_t offset,
                                                   int64_t bytes,
                                                   int64_t *pnum,
                                                   int64_t *map,
                                                   BlockDriverState **file)
{
    BlockDriverState *p;
    int ret = 0;
    bool first = true;

    assert(bs != base);
    for (p = bs; p != base; p = backing_bs(p)) {
        ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
                                   file);
        if (ret < 0) {
            break;
        }
        if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) {
            /*
             * Reading beyond the end of the file continues to read
             * zeroes, but we can only widen the result to the
             * unallocated length we learned from an earlier
             * iteration.
             */
            *pnum = bytes;
        }
        if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) {
            break;
        }
        /* [offset, pnum] unallocated on this layer, which could be only
         * the first part of [offset, bytes]. */
        bytes = MIN(bytes, *pnum);
        first = false;
    }
    return ret;
}
/* Coroutine wrapper for bdrv_block_status_above() */
static void coroutine_fn bdrv_block_status_above_co_entry(void *opaque)
{
    BdrvCoBlockStatusData *data = opaque;

    data->ret = bdrv_co_block_status_above(data->bs, data->base,
                                           data->want_zero,
                                           data->offset, data->bytes,
                                           data->pnum, data->map, data->file);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_block_status_above().
 *
 * See bdrv_co_block_status_above() for details.
 */
static int bdrv_common_block_status_above(BlockDriverState *bs,
                                          BlockDriverState *base,
                                          bool want_zero, int64_t offset,
                                          int64_t bytes, int64_t *pnum,
                                          int64_t *map,
                                          BlockDriverState **file)
{
    Coroutine *co;
    BdrvCoBlockStatusData data = {
        .bs = bs,
        .base = base,
        .want_zero = want_zero,
        .offset = offset,
        .bytes = bytes,
        .pnum = pnum,
        .map = map,
        .file = file,
        .done = false,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_block_status_above_co_entry(&data);
    } else {
        co = qemu_coroutine_create(bdrv_block_status_above_co_entry, &data);
        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, !data.done);
    }
    return data.ret;
}
int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
                            int64_t offset, int64_t bytes, int64_t *pnum,
                            int64_t *map, BlockDriverState **file)
{
    return bdrv_common_block_status_above(bs, base, true, offset, bytes,
                                          pnum, map, file);
}

int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes,
                      int64_t *pnum, int64_t *map, BlockDriverState **file)
{
    return bdrv_block_status_above(bs, backing_bs(bs),
                                   offset, bytes, pnum, map, file);
}
int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
                                   int64_t bytes, int64_t *pnum)
{
    int ret;
    int64_t dummy;

    ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset,
                                         bytes, pnum ? pnum : &dummy, NULL,
                                         NULL);
    if (ret < 0) {
        return ret;
    }
    return !!(ret & BDRV_BLOCK_ALLOCATED);
}
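/*
 * Illustrative usage (added, with hypothetical helpers): scanning an image
 * for allocated extents with bdrv_is_allocated():
 */
#if 0 /* example only; process_extent() is a hypothetical callback */
    int64_t offset = 0, pnum;
    while (offset < bdrv_getlength(bs)) {
        int ret = bdrv_is_allocated(bs, offset, BDRV_REQUEST_MAX_BYTES, &pnum);
        if (ret < 0) {
            break;                        /* query failed */
        }
        if (ret) {
            process_extent(offset, pnum); /* [offset, offset+pnum) allocated */
        }
        offset += pnum;
    }
#endif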
/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if (a prefix of) the given range is allocated in any image
 * between BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * offset is allocated in any image of the chain.  Return false otherwise,
 * or negative errno on failure.
 *
 * 'pnum' is set to the number of bytes (including and immediately
 * following the specified offset) that are known to be in the same
 * allocated/unallocated state.  Note that a subsequent call starting
 * at 'offset + *pnum' may return the same allocation status (in other
 * words, the result is not necessarily the maximum possible range);
 * but 'pnum' will only be 0 when end of file is reached.
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            int64_t offset, int64_t bytes, int64_t *pnum)
{
    BlockDriverState *intermediate;
    int ret;
    int64_t n = bytes;

    intermediate = top;
    while (intermediate && intermediate != base) {
        int64_t pnum_inter;
        int64_t size_inter;

        ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter);
        if (ret < 0) {
            return ret;
        }
        if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        size_inter = bdrv_getlength(intermediate);
        if (size_inter < 0) {
            return size_inter;
        }
        if (n > pnum_inter &&
            (intermediate == top || offset + pnum_inter < size_inter)) {
            n = pnum_inter;
        }

        intermediate = backing_bs(intermediate);
    }

    *pnum = n;
    return 0;
}
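/*
 * Illustrative sketch (not part of the original file): checking whether a
 * range is backed anywhere between an overlay and its base, e.g. before
 * deciding whether a commit or stream operation has data to copy.  The
 * chunk size is an assumption for the example.
 *
 *     int64_t pnum;
 *     int ret = bdrv_is_allocated_above(top, base, offset, 64 * 1024, &pnum);
 *     if (ret < 0) {
 *         // error
 *     } else if (ret) {
 *         // the first 'pnum' bytes are allocated somewhere in [base..top]
 *     } else {
 *         // the first 'pnum' bytes are unallocated in the whole subchain
 *     }
 */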
typedef struct BdrvVmstateCo {
    BlockDriverState    *bs;
    QEMUIOVector        *qiov;
    int64_t             pos;
    bool                is_read;
    int                 ret;
} BdrvVmstateCo;
static int coroutine_fn
bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
                   bool is_read)
{
    BlockDriver *drv = bs->drv;
    int ret = -ENOTSUP;

    bdrv_inc_in_flight(bs);

    if (!drv) {
        ret = -ENOMEDIUM;
    } else if (drv->bdrv_load_vmstate) {
        if (is_read) {
            ret = drv->bdrv_load_vmstate(bs, qiov, pos);
        } else {
            ret = drv->bdrv_save_vmstate(bs, qiov, pos);
        }
    } else if (bs->file) {
        ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
    }

    bdrv_dec_in_flight(bs);
    return ret;
}
static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
{
    BdrvVmstateCo *co = opaque;
    co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
}
static inline int
bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
                bool is_read)
{
    if (qemu_in_coroutine()) {
        return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
    } else {
        BdrvVmstateCo data = {
            .bs         = bs,
            .qiov       = qiov,
            .pos        = pos,
            .is_read    = is_read,
            .ret        = -EINPROGRESS,
        };
        Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);

        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
        return data.ret;
    }
}
int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = size,
    };
    int ret;

    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = bdrv_writev_vmstate(bs, &qiov, pos);
    if (ret < 0) {
        return ret;
    }

    return size;
}
int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    return bdrv_rw_vmstate(bs, qiov, pos, false);
}
int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = buf,
        .iov_len    = size,
    };
    int ret;

    qemu_iovec_init_external(&qiov, &iov, 1);
    ret = bdrv_readv_vmstate(bs, &qiov, pos);
    if (ret < 0) {
        return ret;
    }

    return size;
}
int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    return bdrv_rw_vmstate(bs, qiov, pos, true);
}
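/*
 * Illustrative sketch (not part of the original file): the buffer-based
 * helpers above are what migration code typically uses to stash and fetch
 * VM state in the image (for example qcow2's vmstate area).  The buffer,
 * its size and the offset are assumptions for the example.
 *
 *     uint8_t buf[512];
 *     int ret = bdrv_save_vmstate(bs, buf, 0, sizeof(buf));
 *     if (ret >= 0) {
 *         ret = bdrv_load_vmstate(bs, buf, 0, sizeof(buf));
 *     }
 *     // both return the byte count on success, negative errno on error
 */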
/**************************************************************/
/* async I/Os */

void bdrv_aio_cancel(BlockAIOCB *acb)
{
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
            /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
             * assert that we're not using an I/O thread.  Thread-safe
             * code should use bdrv_aio_cancel_async exclusively.
             */
            assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            abort();
        }
    }
    qemu_aio_unref(acb);
}
/* Async version of aio cancel. The caller is not blocked if the acb implements
 * cancel_async, otherwise we do nothing and let the request normally complete.
 * In either case the completion callback must be called. */
void bdrv_aio_cancel_async(BlockAIOCB *acb)
{
    if (acb->aiocb_info->cancel_async) {
        acb->aiocb_info->cancel_async(acb);
    }
}
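/*
 * Illustrative sketch (not part of the original file): cancelling an
 * in-flight request from a device model.  'acb' would be the handle returned
 * by one of the asynchronous submission functions; the completion callback
 * still runs in both cases.
 *
 *     bdrv_aio_cancel_async(acb);   // non-blocking, safe with IOThreads
 *     // or, only from the main loop:
 *     bdrv_aio_cancel(acb);         // blocks until the request completes
 */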
/**************************************************************/
/* Coroutine block device emulation */

typedef struct FlushCo {
    BlockDriverState *bs;
    int ret;
} FlushCo;


static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    FlushCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int current_gen;
    int ret = 0;

    bdrv_inc_in_flight(bs);

    if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
        bdrv_is_sg(bs)) {
        goto early_exit;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    current_gen = atomic_read(&bs->write_gen);

    /* Wait until any previous flushes are completed */
    while (bs->active_flush_req) {
        qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock);
    }

    /* Flushes reach this point in nondecreasing current_gen order.  */
    bs->active_flush_req = true;
    qemu_co_mutex_unlock(&bs->reqs_lock);

    /* Write back all layers by calling one driver function */
    if (bs->drv->bdrv_co_flush) {
        ret = bs->drv->bdrv_co_flush(bs);
        goto out;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            goto out;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    /* Check if we really need to flush anything */
    if (bs->flushed_gen == current_gen) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (!bs->drv) {
        /* bs->drv->bdrv_co_flush() might have ejected the BDS
         * (even in case of apparent success) */
        ret = -ENOMEDIUM;
        goto out;
    }
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and don't support bdrv_flush therefore. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }

    if (ret < 0) {
        goto out;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
out:
    /* Notify any pending flushes that we have completed */
    if (ret == 0) {
        bs->flushed_gen = current_gen;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    bs->active_flush_req = false;
    /* Return value is ignored - it's ok if wait queue is empty */
    qemu_co_queue_next(&bs->flush_queue);
    qemu_co_mutex_unlock(&bs->reqs_lock);

early_exit:
    bdrv_dec_in_flight(bs);
    return ret;
}
int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    FlushCo flush_co = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&flush_co);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
    }

    return flush_co.ret;
}
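/*
 * Illustrative sketch (not part of the original file): a typical caller that
 * needs the write-out guarantee, e.g. after completing a batch of writes.
 *
 *     int ret = bdrv_flush(bs);
 *     if (ret < 0) {
 *         // data may not have reached stable storage; surface the error
 *     }
 */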
typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t offset;
    int bytes;
    int ret;
} DiscardCo;

static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->bytes);
}
int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
                                  int bytes)
{
    BdrvTrackedRequest req;
    int max_pdiscard, ret;
    int head, tail, align;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }

    if (bdrv_has_readonly_bitmaps(bs)) {
        return -EPERM;
    }

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    } else if (bs->read_only) {
        return -EPERM;
    }
    assert(!(bs->open_flags & BDRV_O_INACTIVE));

    /* Do nothing if disabled.  */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
        return 0;
    }

    /* Discard is advisory, but some devices track and coalesce
     * unaligned requests, so we must pass everything down rather than
     * round here.  Still, most devices will just silently ignore
     * unaligned requests (by returning -ENOTSUP), so we must fragment
     * the request accordingly.  */
    align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
    assert(align % bs->bl.request_alignment == 0);
    head = offset % align;
    tail = (offset + bytes) % align;

    bdrv_inc_in_flight(bs);
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
    if (ret < 0) {
        goto out;
    }

    max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
                                   align);
    assert(max_pdiscard >= bs->bl.request_alignment);

    while (bytes > 0) {
        int num = bytes;

        if (head) {
            /* Make small requests to get to alignment boundaries. */
            num = MIN(bytes, align - head);
            if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
                num %= bs->bl.request_alignment;
            }
            head = (head + num) % align;
            assert(num < max_pdiscard);
        } else if (tail) {
            if (num > align) {
                /* Shorten the request to the last aligned cluster.  */
                num -= tail;
            } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
                       tail > bs->bl.request_alignment) {
                tail %= bs->bl.request_alignment;
                num -= tail;
            }
        }
        /* limit request size */
        if (num > max_pdiscard) {
            num = max_pdiscard;
        }

        if (!bs->drv) {
            ret = -ENOMEDIUM;
            goto out;
        }
        if (bs->drv->bdrv_co_pdiscard) {
            ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
        } else {
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
                                             bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                ret = -EIO;
                goto out;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            goto out;
        }

        offset += num;
        bytes -= num;
    }
    ret = 0;
out:
    atomic_inc(&bs->write_gen);
    bdrv_set_dirty(bs, req.offset, req.bytes);
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);
    return ret;
}
int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .offset = offset,
        .bytes = bytes,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_pdiscard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, rwco.ret == NOT_DONE);
    }

    return rwco.ret;
}
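/*
 * Worked example (illustrative, not part of the original file) of the
 * head/tail fragmentation performed by bdrv_co_pdiscard() above.  Assume
 * pdiscard_alignment = 64k, request_alignment = 512, and a request with
 * offset = 10k, bytes = 200k:
 *
 *     align = 64k, head = 10k, tail = (10k + 200k) % 64k = 18k
 *     1st iteration: num = MIN(200k, 64k - 10k) = 54k  (reach the boundary)
 *     2nd iteration: num = 146k - 18k = 128k           (whole aligned clusters)
 *     3rd iteration: num = 18k                         (unaligned tail)
 *
 * Drivers that cannot discard the unaligned pieces may return -ENOTSUP for
 * them, which is ignored because discard is advisory.
 */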
int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
{
    BlockDriver *drv = bs->drv;
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    bdrv_inc_in_flight(bs);
    if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
        co.ret = -ENOTSUP;
        goto out;
    }

    if (drv->bdrv_co_ioctl) {
        co.ret = drv->bdrv_co_ioctl(bs, req, buf);
    } else {
        acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
        if (!acb) {
            co.ret = -ENOTSUP;
            goto out;
        }
        qemu_coroutine_yield();
    }
out:
    bdrv_dec_in_flight(bs);
    return co.ret;
}
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

void *qemu_blockalign0(BlockDriverState *bs, size_t size)
{
    return memset(qemu_blockalign(bs, size), 0, size);
}
void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);

    /* Ensure that NULL is never returned on success */
    assert(align > 0);
    if (size == 0) {
        size = align;
    }

    return qemu_try_memalign(align, size);
}

void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
{
    void *mem = qemu_try_blockalign(bs, size);

    if (mem) {
        memset(mem, 0, size);
    }

    return mem;
}
/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_min_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}
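/*
 * Illustrative sketch (not part of the original file): allocating a bounce
 * buffer that satisfies bdrv_qiov_is_aligned(), as callers do before issuing
 * O_DIRECT-friendly requests.  The buffer size is an assumption.
 *
 *     void *buf = qemu_blockalign(bs, 4096);  // aligned to bdrv_opt_mem_align()
 *     // ... build a QEMUIOVector over buf and submit the request ...
 *     qemu_vfree(buf);
 */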
void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}
void bdrv_io_plug(BlockDriverState *bs)
{
    BdrvChild *child;

    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_io_plug(child->bs);
    }

    if (atomic_fetch_inc(&bs->io_plugged) == 0) {
        BlockDriver *drv = bs->drv;
        if (drv && drv->bdrv_io_plug) {
            drv->bdrv_io_plug(bs);
        }
    }
}
void bdrv_io_unplug(BlockDriverState *bs)
{
    BdrvChild *child;

    assert(bs->io_plugged);
    if (atomic_fetch_dec(&bs->io_plugged) == 1) {
        BlockDriver *drv = bs->drv;
        if (drv && drv->bdrv_io_unplug) {
            drv->bdrv_io_unplug(bs);
        }
    }

    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_io_unplug(child->bs);
    }
}
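/*
 * Illustrative sketch (not part of the original file): batching request
 * submission with plug/unplug, as a device model may do when draining a
 * queue of guest requests.
 *
 *     bdrv_io_plug(bs);
 *     // submit several asynchronous requests ...
 *     bdrv_io_unplug(bs);   // lets drivers flush the accumulated batch
 */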
void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size)
{
    BdrvChild *child;

    if (bs->drv && bs->drv->bdrv_register_buf) {
        bs->drv->bdrv_register_buf(bs, host, size);
    }
    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_register_buf(child->bs, host, size);
    }
}
void bdrv_unregister_buf(BlockDriverState *bs, void *host)
{
    BdrvChild *child;

    if (bs->drv && bs->drv->bdrv_unregister_buf) {
        bs->drv->bdrv_unregister_buf(bs, host);
    }
    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_unregister_buf(child->bs, host);
    }
}
static int coroutine_fn bdrv_co_copy_range_internal(
        BdrvChild *src, uint64_t src_offset, BdrvChild *dst,
        uint64_t dst_offset, uint64_t bytes,
        BdrvRequestFlags read_flags, BdrvRequestFlags write_flags,
        bool recurse_src)
{
    BdrvTrackedRequest req;
    int ret;

    if (!dst || !dst->bs) {
        return -ENOMEDIUM;
    }
    ret = bdrv_check_byte_request(dst->bs, dst_offset, bytes);
    if (ret) {
        return ret;
    }
    if (write_flags & BDRV_REQ_ZERO_WRITE) {
        return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags);
    }

    if (!src || !src->bs) {
        return -ENOMEDIUM;
    }
    ret = bdrv_check_byte_request(src->bs, src_offset, bytes);
    if (ret) {
        return ret;
    }

    if (!src->bs->drv->bdrv_co_copy_range_from
        || !dst->bs->drv->bdrv_co_copy_range_to
        || src->bs->encrypted || dst->bs->encrypted) {
        return -ENOTSUP;
    }

    if (recurse_src) {
        bdrv_inc_in_flight(src->bs);
        tracked_request_begin(&req, src->bs, src_offset, bytes,
                              BDRV_TRACKED_READ);

        /* BDRV_REQ_SERIALISING is only for write operation */
        assert(!(read_flags & BDRV_REQ_SERIALISING));
        if (!(read_flags & BDRV_REQ_NO_SERIALISING)) {
            wait_serialising_requests(&req);
        }

        ret = src->bs->drv->bdrv_co_copy_range_from(src->bs,
                                                    src, src_offset,
                                                    dst, dst_offset,
                                                    bytes,
                                                    read_flags, write_flags);

        tracked_request_end(&req);
        bdrv_dec_in_flight(src->bs);
    } else {
        bdrv_inc_in_flight(dst->bs);
        tracked_request_begin(&req, dst->bs, dst_offset, bytes,
                              BDRV_TRACKED_WRITE);

        /* BDRV_REQ_NO_SERIALISING is only for read operation */
        assert(!(write_flags & BDRV_REQ_NO_SERIALISING));
        if (write_flags & BDRV_REQ_SERIALISING) {
            mark_request_serialising(&req, bdrv_get_cluster_size(dst->bs));
        }
        wait_serialising_requests(&req);

        ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs,
                                                  src, src_offset,
                                                  dst, dst_offset,
                                                  bytes,
                                                  read_flags, write_flags);

        tracked_request_end(&req);
        bdrv_dec_in_flight(dst->bs);
    }

    return ret;
}
/* Copy range from @src to @dst.
 *
 * See the comment of bdrv_co_copy_range for the parameter and return value
 * semantics. */
int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset,
                                         BdrvChild *dst, uint64_t dst_offset,
                                         uint64_t bytes,
                                         BdrvRequestFlags read_flags,
                                         BdrvRequestFlags write_flags)
{
    return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
                                       bytes, read_flags, write_flags, true);
}
/* Copy range from @src to @dst.
 *
 * See the comment of bdrv_co_copy_range for the parameter and return value
 * semantics. */
int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset,
                                       BdrvChild *dst, uint64_t dst_offset,
                                       uint64_t bytes,
                                       BdrvRequestFlags read_flags,
                                       BdrvRequestFlags write_flags)
{
    return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
                                       bytes, read_flags, write_flags, false);
}
int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset,
                                    BdrvChild *dst, uint64_t dst_offset,
                                    uint64_t bytes, BdrvRequestFlags read_flags,
                                    BdrvRequestFlags write_flags)
{
    return bdrv_co_copy_range_from(src, src_offset,
                                   dst, dst_offset,
                                   bytes, read_flags, write_flags);
}
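/*
 * Illustrative sketch (not part of the original file): offloaded copy between
 * two children of the same user, e.g. a backup or convert path.  The caller
 * is expected to fall back to an ordinary read/write bounce-buffer path when
 * -ENOTSUP is returned.  Offsets, the length and the flags are assumptions.
 *
 *     ret = bdrv_co_copy_range(src_child, 0, dst_child, 0, 1 * MiB, 0, 0);
 *     if (ret == -ENOTSUP) {
 *         // fall back to bdrv_co_preadv() + bdrv_co_pwritev()
 *     }
 */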
static void bdrv_parent_cb_resize(BlockDriverState *bs)
{
    BdrvChild *c;
    QLIST_FOREACH(c, &bs->parents, next_parent) {
        if (c->role->resize) {
            c->role->resize(c);
        }
    }
}
/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset,
                                  PreallocMode prealloc, Error **errp)
{
    BlockDriverState *bs = child->bs;
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int64_t old_size, new_bytes;
    int ret;

    assert(child->perm & BLK_PERM_RESIZE);

    /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
    if (!drv) {
        error_setg(errp, "No medium inserted");
        return -ENOMEDIUM;
    }
    if (offset < 0) {
        error_setg(errp, "Image size cannot be negative");
        return -EINVAL;
    }

    old_size = bdrv_getlength(bs);
    if (old_size < 0) {
        error_setg_errno(errp, -old_size, "Failed to get old image size");
        return old_size;
    }

    if (offset > old_size) {
        new_bytes = offset - old_size;
    } else {
        new_bytes = 0;
    }

    bdrv_inc_in_flight(bs);
    tracked_request_begin(&req, bs, offset, new_bytes, BDRV_TRACKED_TRUNCATE);

    /* If we are growing the image and potentially using preallocation for the
     * new area, we need to make sure that no write requests are made to it
     * concurrently or they might be overwritten by preallocation. */
    if (new_bytes) {
        mark_request_serialising(&req, 1);
        wait_serialising_requests(&req);
    }

    if (!drv->bdrv_co_truncate) {
        if (bs->file && drv->is_filter) {
            ret = bdrv_co_truncate(bs->file, offset, prealloc, errp);
            goto out;
        }
        error_setg(errp, "Image format driver does not support resize");
        ret = -ENOTSUP;
        goto out;
    }
    if (bs->read_only) {
        error_setg(errp, "Image is read-only");
        ret = -EACCES;
        goto out;
    }

    assert(!(bs->open_flags & BDRV_O_INACTIVE));

    ret = drv->bdrv_co_truncate(bs, offset, prealloc, errp);
    if (ret < 0) {
        goto out;
    }
    ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
    } else {
        offset = bs->total_sectors * BDRV_SECTOR_SIZE;
    }
    bdrv_dirty_bitmap_truncate(bs, offset);
    bdrv_parent_cb_resize(bs);
    atomic_inc(&bs->write_gen);

out:
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);

    return ret;
}
typedef struct TruncateCo {
    BdrvChild *child;
    int64_t offset;
    PreallocMode prealloc;
    Error **errp;
    int ret;
} TruncateCo;

static void coroutine_fn bdrv_truncate_co_entry(void *opaque)
{
    TruncateCo *tco = opaque;
    tco->ret = bdrv_co_truncate(tco->child, tco->offset, tco->prealloc,
                                tco->errp);
}
int bdrv_truncate(BdrvChild *child, int64_t offset, PreallocMode prealloc,
                  Error **errp)
{
    Coroutine *co;
    TruncateCo tco = {
        .child      = child,
        .offset     = offset,
        .prealloc   = prealloc,
        .errp       = errp,
        .ret        = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_truncate_co_entry(&tco);
    } else {
        co = qemu_coroutine_create(bdrv_truncate_co_entry, &tco);
        qemu_coroutine_enter(co);
        BDRV_POLL_WHILE(child->bs, tco.ret == NOT_DONE);
    }

    return tco.ret;
}
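/*
 * Illustrative sketch (not part of the original file): growing an image from
 * a monitor/blockdev context.  The new size and error handling are
 * assumptions made for the example.
 *
 *     Error *local_err = NULL;
 *     int ret = bdrv_truncate(child, new_size, PREALLOC_MODE_OFF, &local_err);
 *     if (ret < 0) {
 *         error_report_err(local_err);
 *     }
 */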