1/*
2 * Block layer I/O functions
3 *
4 * Copyright (c) 2003 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
80c71a24 25#include "qemu/osdep.h"
61007b31 26#include "trace.h"
7f0e9da6 27#include "sysemu/block-backend.h"
28#include "block/blockjob.h"
29#include "block/block_int.h"
f348b6d1 30#include "qemu/cutils.h"
da34e65c 31#include "qapi/error.h"
d49b6836 32#include "qemu/error-report.h"
33
34#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
35
36static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
37 int64_t sector_num,
38 QEMUIOVector *qiov,
39 int nb_sectors,
40 BdrvRequestFlags flags,
41 BlockCompletionFunc *cb,
42 void *opaque,
43 bool is_write);
44static void coroutine_fn bdrv_co_do_rw(void *opaque);
45static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
46 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
47
c2066af0 48static void bdrv_parent_drained_begin(BlockDriverState *bs)
61007b31 49{
c2066af0 50 BdrvChild *c;
27ccdd52 51
52 QLIST_FOREACH(c, &bs->parents, next_parent) {
53 if (c->role->drained_begin) {
54 c->role->drained_begin(c);
55 }
56 }
57}
61007b31 58
c2066af0 59static void bdrv_parent_drained_end(BlockDriverState *bs)
ce0f1412 60{
c2066af0 61 BdrvChild *c;
27ccdd52 62
63 QLIST_FOREACH(c, &bs->parents, next_parent) {
64 if (c->role->drained_end) {
65 c->role->drained_end(c);
66 }
27ccdd52 67 }
68}
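/* Example (illustrative sketch, not part of this file): a parent that needs to
 * pause its own request submission around a drained section would implement
 * the BdrvChildRole hooks invoked above.  MyParent and the helper functions
 * are hypothetical:
 *
 *     static void my_parent_drained_begin(BdrvChild *c)
 *     {
 *         my_parent_stop_submitting(my_parent_from_child(c));
 *     }
 *
 *     static void my_parent_drained_end(BdrvChild *c)
 *     {
 *         my_parent_resume_submitting(my_parent_from_child(c));
 *     }
 *
 * bdrv_parent_drained_begin()/_end() simply fan the notification out to every
 * parent registered in bs->parents.
 */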
69
70void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
71{
72 BlockDriver *drv = bs->drv;
73 Error *local_err = NULL;
74
75 memset(&bs->bl, 0, sizeof(bs->bl));
76
77 if (!drv) {
78 return;
79 }
80
81 /* Take some limits from the children as a default */
82 if (bs->file) {
9a4f4c31 83 bdrv_refresh_limits(bs->file->bs, &local_err);
84 if (local_err) {
85 error_propagate(errp, local_err);
86 return;
87 }
88 bs->bl.opt_transfer_length = bs->file->bs->bl.opt_transfer_length;
89 bs->bl.max_transfer_length = bs->file->bs->bl.max_transfer_length;
90 bs->bl.min_mem_alignment = bs->file->bs->bl.min_mem_alignment;
91 bs->bl.opt_mem_alignment = bs->file->bs->bl.opt_mem_alignment;
bd44feb7 92 bs->bl.max_iov = bs->file->bs->bl.max_iov;
61007b31 93 } else {
4196d2f0 94 bs->bl.min_mem_alignment = 512;
459b4e66 95 bs->bl.opt_mem_alignment = getpagesize();
96
97 /* Safe default since most protocols use readv()/writev()/etc */
98 bs->bl.max_iov = IOV_MAX;
99 }
100
101 if (bs->backing) {
102 bdrv_refresh_limits(bs->backing->bs, &local_err);
103 if (local_err) {
104 error_propagate(errp, local_err);
105 return;
106 }
107 bs->bl.opt_transfer_length =
108 MAX(bs->bl.opt_transfer_length,
760e0063 109 bs->backing->bs->bl.opt_transfer_length);
110 bs->bl.max_transfer_length =
111 MIN_NON_ZERO(bs->bl.max_transfer_length,
760e0063 112 bs->backing->bs->bl.max_transfer_length);
113 bs->bl.opt_mem_alignment =
114 MAX(bs->bl.opt_mem_alignment,
760e0063 115 bs->backing->bs->bl.opt_mem_alignment);
116 bs->bl.min_mem_alignment =
117 MAX(bs->bl.min_mem_alignment,
760e0063 118 bs->backing->bs->bl.min_mem_alignment);
119 bs->bl.max_iov =
120 MIN(bs->bl.max_iov,
121 bs->backing->bs->bl.max_iov);
122 }
123
124 /* Then let the driver override it */
125 if (drv->bdrv_refresh_limits) {
126 drv->bdrv_refresh_limits(bs, errp);
127 }
128}
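/* Example (illustrative): if bs->file reports opt_mem_alignment 4096 while the
 * backing file reports 512, the MAX() rules above leave 4096 in bs->bl; if
 * bs->file reports max_transfer_length 0 (unlimited) and the backing file
 * reports 2048 sectors, MIN_NON_ZERO() keeps 2048.  The driver's own
 * bdrv_refresh_limits callback runs last and may still override these values.
 */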
129
130/**
131 * The copy-on-read flag is actually a reference count so multiple users may
132 * use the feature without worrying about clobbering its previous state.
133 * Copy-on-read stays enabled until all users have called to disable it.
134 */
135void bdrv_enable_copy_on_read(BlockDriverState *bs)
136{
137 bs->copy_on_read++;
138}
139
140void bdrv_disable_copy_on_read(BlockDriverState *bs)
141{
142 assert(bs->copy_on_read > 0);
143 bs->copy_on_read--;
144}
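/* Example (illustrative): because copy_on_read is a counter rather than a
 * flag, independent users compose safely:
 *
 *     bdrv_enable_copy_on_read(bs);     // user A: copy_on_read == 1
 *     bdrv_enable_copy_on_read(bs);     // user B: copy_on_read == 2
 *     bdrv_disable_copy_on_read(bs);    // user A done: still enabled
 *     bdrv_disable_copy_on_read(bs);    // user B done: disabled again
 */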
145
146/* Check if any requests are in-flight (including throttled requests) */
439db28c 147bool bdrv_requests_pending(BlockDriverState *bs)
61007b31 148{
149 BdrvChild *child;
150
151 if (!QLIST_EMPTY(&bs->tracked_requests)) {
152 return true;
153 }
154
155 QLIST_FOREACH(child, &bs->children, next) {
156 if (bdrv_requests_pending(child->bs)) {
157 return true;
158 }
61007b31 159 }
37a639a7 160
161 return false;
162}
163
164static void bdrv_drain_recurse(BlockDriverState *bs)
165{
166 BdrvChild *child;
167
168 if (bs->drv && bs->drv->bdrv_drain) {
169 bs->drv->bdrv_drain(bs);
170 }
171 QLIST_FOREACH(child, &bs->children, next) {
172 bdrv_drain_recurse(child->bs);
173 }
174}
175
176typedef struct {
177 Coroutine *co;
178 BlockDriverState *bs;
179 QEMUBH *bh;
180 bool done;
181} BdrvCoDrainData;
182
183static void bdrv_drain_poll(BlockDriverState *bs)
184{
185 bool busy = true;
186
187 while (busy) {
188 /* Keep iterating */
189 busy = bdrv_requests_pending(bs);
190 busy |= aio_poll(bdrv_get_aio_context(bs), busy);
191 }
192}
193
194static void bdrv_co_drain_bh_cb(void *opaque)
195{
196 BdrvCoDrainData *data = opaque;
197 Coroutine *co = data->co;
198
199 qemu_bh_delete(data->bh);
b6e84c97 200 bdrv_drain_poll(data->bs);
201 data->done = true;
202 qemu_coroutine_enter(co, NULL);
203}
204
b6e84c97 205static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
206{
207 BdrvCoDrainData data;
208
209 /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
210 * other coroutines run if they were queued from
211 * qemu_co_queue_run_restart(). */
212
213 assert(qemu_in_coroutine());
214 data = (BdrvCoDrainData) {
215 .co = qemu_coroutine_self(),
216 .bs = bs,
217 .done = false,
218 .bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_drain_bh_cb, &data),
219 };
220 qemu_bh_schedule(data.bh);
221
222 qemu_coroutine_yield();
223 /* If we are resumed from some other event (such as an aio completion or a
224 * timer callback), it is a bug in the caller that should be fixed. */
225 assert(data.done);
226}
227
61007b31 228/*
229 * Wait for pending requests to complete on a single BlockDriverState subtree,
230 * and suspend block driver's internal I/O until next request arrives.
61007b31 231 *
232 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
233 * AioContext.
234 *
235 * Only this BlockDriverState's AioContext is run, so in-flight requests must
236 * not depend on events in other AioContexts. In that case, use
237 * bdrv_drain_all() instead.
61007b31 238 */
b6e84c97 239void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
61007b31 240{
c2066af0 241 bdrv_parent_drained_begin(bs);
6b98bd64 242 bdrv_io_unplugged_begin(bs);
243 bdrv_drain_recurse(bs);
244 bdrv_co_yield_to_drain(bs);
6b98bd64 245 bdrv_io_unplugged_end(bs);
c2066af0 246 bdrv_parent_drained_end(bs);
b6e84c97 247}
f406c03c 248
249void bdrv_drain(BlockDriverState *bs)
250{
c2066af0 251 bdrv_parent_drained_begin(bs);
6b98bd64 252 bdrv_io_unplugged_begin(bs);
67da1dc5 253 bdrv_drain_recurse(bs);
a77fd4bb 254 if (qemu_in_coroutine()) {
255 bdrv_co_yield_to_drain(bs);
256 } else {
257 bdrv_drain_poll(bs);
61007b31 258 }
6b98bd64 259 bdrv_io_unplugged_end(bs);
c2066af0 260 bdrv_parent_drained_end(bs);
261}
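/* Example (illustrative sketch): a typical caller quiesces a node before
 * reconfiguring it, holding the node's AioContext as the comment above
 * requires:
 *
 *     AioContext *ctx = bdrv_get_aio_context(bs);
 *
 *     aio_context_acquire(ctx);
 *     bdrv_drain(bs);                  // no tracked requests remain
 *     ...modify or detach the node...
 *     aio_context_release(ctx);
 */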
262
263/*
264 * Wait for pending requests to complete across all BlockDriverStates
265 *
266 * This function does not flush data to disk, use bdrv_flush_all() for that
267 * after calling this function.
268 */
269void bdrv_drain_all(void)
270{
271 /* Always run first iteration so any pending completion BHs run */
272 bool busy = true;
7c8eece4 273 BlockDriverState *bs;
88be7b4b 274 BdrvNextIterator it;
f406c03c 275 GSList *aio_ctxs = NULL, *ctx;
61007b31 276
88be7b4b 277 for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
278 AioContext *aio_context = bdrv_get_aio_context(bs);
279
280 aio_context_acquire(aio_context);
281 if (bs->job) {
282 block_job_pause(bs->job);
283 }
c2066af0 284 bdrv_parent_drained_begin(bs);
6b98bd64 285 bdrv_io_unplugged_begin(bs);
9dcf8ecd 286 bdrv_drain_recurse(bs);
61007b31 287 aio_context_release(aio_context);
f406c03c 288
764ba3ae 289 if (!g_slist_find(aio_ctxs, aio_context)) {
290 aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
291 }
292 }
293
294 /* Note that completion of an asynchronous I/O operation can trigger any
295 * number of other I/O operations on other devices---for example a
296 * coroutine can submit an I/O request to another device in response to
297 * request completion. Therefore we must keep looping until there was no
298 * more activity rather than simply draining each device independently.
299 */
300 while (busy) {
301 busy = false;
61007b31 302
303 for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
304 AioContext *aio_context = ctx->data;
305
306 aio_context_acquire(aio_context);
88be7b4b 307 for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
f406c03c 308 if (aio_context == bdrv_get_aio_context(bs)) {
309 if (bdrv_requests_pending(bs)) {
310 busy = true;
311 aio_poll(aio_context, busy);
312 }
313 }
314 }
315 busy |= aio_poll(aio_context, false);
316 aio_context_release(aio_context);
317 }
318 }
319
88be7b4b 320 for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
321 AioContext *aio_context = bdrv_get_aio_context(bs);
322
323 aio_context_acquire(aio_context);
6b98bd64 324 bdrv_io_unplugged_end(bs);
c2066af0 325 bdrv_parent_drained_end(bs);
326 if (bs->job) {
327 block_job_resume(bs->job);
328 }
329 aio_context_release(aio_context);
330 }
f406c03c 331 g_slist_free(aio_ctxs);
332}
333
334/**
335 * Remove an active request from the tracked requests list
336 *
337 * This function should be called when a tracked request is completing.
338 */
339static void tracked_request_end(BdrvTrackedRequest *req)
340{
341 if (req->serialising) {
342 req->bs->serialising_in_flight--;
343 }
344
345 QLIST_REMOVE(req, list);
346 qemu_co_queue_restart_all(&req->wait_queue);
347}
348
349/**
350 * Add an active request to the tracked requests list
351 */
352static void tracked_request_begin(BdrvTrackedRequest *req,
353 BlockDriverState *bs,
354 int64_t offset,
355 unsigned int bytes,
356 enum BdrvTrackedRequestType type)
357{
358 *req = (BdrvTrackedRequest){
359 .bs = bs,
360 .offset = offset,
361 .bytes = bytes,
ebde595c 362 .type = type,
61007b31
SH
363 .co = qemu_coroutine_self(),
364 .serialising = false,
365 .overlap_offset = offset,
366 .overlap_bytes = bytes,
367 };
368
369 qemu_co_queue_init(&req->wait_queue);
370
371 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
372}
373
374static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
375{
376 int64_t overlap_offset = req->offset & ~(align - 1);
377 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
378 - overlap_offset;
379
380 if (!req->serialising) {
381 req->bs->serialising_in_flight++;
382 req->serialising = true;
383 }
384
385 req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
386 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
387}
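/* Example (illustrative): with align = 4096, a request at offset 1536 for
 * 1024 bytes is serialised over the whole containing block:
 * overlap_offset = 1536 & ~4095 = 0 and
 * overlap_bytes  = ROUND_UP(1536 + 1024, 4096) - 0 = 4096.
 */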
388
389/**
390 * Round a region to cluster boundaries
391 */
392void bdrv_round_to_clusters(BlockDriverState *bs,
393 int64_t sector_num, int nb_sectors,
394 int64_t *cluster_sector_num,
395 int *cluster_nb_sectors)
396{
397 BlockDriverInfo bdi;
398
399 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
400 *cluster_sector_num = sector_num;
401 *cluster_nb_sectors = nb_sectors;
402 } else {
403 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
404 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
405 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
406 nb_sectors, c);
407 }
408}
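/* Example (illustrative): with a 64 KiB cluster size, c = 65536 / 512 = 128
 * sectors, so a request for sectors [130, 134) is rounded to
 * *cluster_sector_num = QEMU_ALIGN_DOWN(130, 128) = 128 and
 * *cluster_nb_sectors = QEMU_ALIGN_UP(130 - 128 + 4, 128) = 128,
 * i.e. exactly one cluster.
 */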
409
410static int bdrv_get_cluster_size(BlockDriverState *bs)
411{
412 BlockDriverInfo bdi;
413 int ret;
414
415 ret = bdrv_get_info(bs, &bdi);
416 if (ret < 0 || bdi.cluster_size == 0) {
417 return bs->request_alignment;
418 } else {
419 return bdi.cluster_size;
420 }
421}
422
423static bool tracked_request_overlaps(BdrvTrackedRequest *req,
424 int64_t offset, unsigned int bytes)
425{
426 /* aaaa bbbb */
427 if (offset >= req->overlap_offset + req->overlap_bytes) {
428 return false;
429 }
430 /* bbbb aaaa */
431 if (req->overlap_offset >= offset + bytes) {
432 return false;
433 }
434 return true;
435}
436
437static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
438{
439 BlockDriverState *bs = self->bs;
440 BdrvTrackedRequest *req;
441 bool retry;
442 bool waited = false;
443
444 if (!bs->serialising_in_flight) {
445 return false;
446 }
447
448 do {
449 retry = false;
450 QLIST_FOREACH(req, &bs->tracked_requests, list) {
451 if (req == self || (!req->serialising && !self->serialising)) {
452 continue;
453 }
454 if (tracked_request_overlaps(req, self->overlap_offset,
455 self->overlap_bytes))
456 {
457 /* Hitting this means there was a reentrant request, for
458 * example, a block driver issuing nested requests. This must
459 * never happen since it means deadlock.
460 */
461 assert(qemu_coroutine_self() != req->co);
462
463 /* If the request is already (indirectly) waiting for us, or
464 * will wait for us as soon as it wakes up, then just go on
465 * (instead of producing a deadlock in the former case). */
466 if (!req->waiting_for) {
467 self->waiting_for = req;
468 qemu_co_queue_wait(&req->wait_queue);
469 self->waiting_for = NULL;
470 retry = true;
471 waited = true;
472 break;
473 }
474 }
475 }
476 } while (retry);
477
478 return waited;
479}
480
481static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
482 size_t size)
483{
484 if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
485 return -EIO;
486 }
487
488 if (!bdrv_is_inserted(bs)) {
489 return -ENOMEDIUM;
490 }
491
492 if (offset < 0) {
493 return -EIO;
494 }
495
496 return 0;
497}
498
499static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
500 int nb_sectors)
501{
502 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
503 return -EIO;
504 }
505
506 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
507 nb_sectors * BDRV_SECTOR_SIZE);
508}
509
510typedef struct RwCo {
511 BlockDriverState *bs;
512 int64_t offset;
513 QEMUIOVector *qiov;
514 bool is_write;
515 int ret;
516 BdrvRequestFlags flags;
517} RwCo;
518
519static void coroutine_fn bdrv_rw_co_entry(void *opaque)
520{
521 RwCo *rwco = opaque;
522
523 if (!rwco->is_write) {
524 rwco->ret = bdrv_co_preadv(rwco->bs, rwco->offset,
525 rwco->qiov->size, rwco->qiov,
526 rwco->flags);
61007b31 527 } else {
528 rwco->ret = bdrv_co_pwritev(rwco->bs, rwco->offset,
529 rwco->qiov->size, rwco->qiov,
530 rwco->flags);
531 }
532}
533
534/*
535 * Process a vectored synchronous request using coroutines
536 */
537static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
538 QEMUIOVector *qiov, bool is_write,
539 BdrvRequestFlags flags)
540{
541 Coroutine *co;
542 RwCo rwco = {
543 .bs = bs,
544 .offset = offset,
545 .qiov = qiov,
546 .is_write = is_write,
547 .ret = NOT_DONE,
548 .flags = flags,
549 };
550
551 if (qemu_in_coroutine()) {
552 /* Fast-path if already in coroutine context */
553 bdrv_rw_co_entry(&rwco);
554 } else {
555 AioContext *aio_context = bdrv_get_aio_context(bs);
556
557 co = qemu_coroutine_create(bdrv_rw_co_entry);
558 qemu_coroutine_enter(co, &rwco);
559 while (rwco.ret == NOT_DONE) {
560 aio_poll(aio_context, true);
561 }
562 }
563 return rwco.ret;
564}
565
566/*
567 * Process a synchronous request using coroutines
568 */
569static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
570 int nb_sectors, bool is_write, BdrvRequestFlags flags)
571{
572 QEMUIOVector qiov;
573 struct iovec iov = {
574 .iov_base = (void *)buf,
575 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
576 };
577
578 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
579 return -EINVAL;
580 }
581
582 qemu_iovec_init_external(&qiov, &iov, 1);
583 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
584 &qiov, is_write, flags);
585}
586
587/* return < 0 if error. See bdrv_write() for the return codes */
588int bdrv_read(BlockDriverState *bs, int64_t sector_num,
589 uint8_t *buf, int nb_sectors)
590{
591 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
592}
593
594/* Return < 0 if error. Important errors are:
595 -EIO generic I/O error (may happen for all errors)
596 -ENOMEDIUM No media inserted.
597 -EINVAL Invalid sector number or nb_sectors
598 -EACCES Trying to write a read-only device
599*/
600int bdrv_write(BlockDriverState *bs, int64_t sector_num,
601 const uint8_t *buf, int nb_sectors)
602{
603 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
604}
605
606int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
607 int nb_sectors, BdrvRequestFlags flags)
608{
609 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
610 BDRV_REQ_ZERO_WRITE | flags);
611}
612
613/*
614 * Completely zero out a block device with the help of bdrv_write_zeroes.
615 * The operation is sped up by checking the block status and only writing
616 * zeroes to the device if they currently do not return zeroes. Optional
617 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
618 * BDRV_REQ_FUA).
619 *
620 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
621 */
622int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
623{
624 int64_t target_sectors, ret, nb_sectors, sector_num = 0;
67a0fd2a 625 BlockDriverState *file;
626 int n;
627
628 target_sectors = bdrv_nb_sectors(bs);
629 if (target_sectors < 0) {
630 return target_sectors;
631 }
632
633 for (;;) {
634 nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
635 if (nb_sectors <= 0) {
636 return 0;
637 }
67a0fd2a 638 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n, &file);
639 if (ret < 0) {
640 error_report("error getting block status at sector %" PRId64 ": %s",
641 sector_num, strerror(-ret));
642 return ret;
643 }
644 if (ret & BDRV_BLOCK_ZERO) {
645 sector_num += n;
646 continue;
647 }
648 ret = bdrv_write_zeroes(bs, sector_num, n, flags);
649 if (ret < 0) {
650 error_report("error writing zeroes at sector %" PRId64 ": %s",
651 sector_num, strerror(-ret));
652 return ret;
653 }
654 sector_num += n;
655 }
656}
657
658int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
659{
660 QEMUIOVector qiov;
661 struct iovec iov = {
662 .iov_base = (void *)buf,
663 .iov_len = bytes,
664 };
665 int ret;
666
667 if (bytes < 0) {
668 return -EINVAL;
669 }
670
671 qemu_iovec_init_external(&qiov, &iov, 1);
672 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
673 if (ret < 0) {
674 return ret;
675 }
676
677 return bytes;
678}
679
680int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
681{
682 int ret;
683
684 ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
685 if (ret < 0) {
686 return ret;
687 }
688
689 return qiov->size;
690}
691
692int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
693 const void *buf, int bytes)
694{
695 QEMUIOVector qiov;
696 struct iovec iov = {
697 .iov_base = (void *) buf,
698 .iov_len = bytes,
699 };
700
701 if (bytes < 0) {
702 return -EINVAL;
703 }
704
705 qemu_iovec_init_external(&qiov, &iov, 1);
706 return bdrv_pwritev(bs, offset, &qiov);
707}
708
709/*
710 * Writes to the file and ensures that no writes are reordered across this
711 * request (acts as a barrier)
712 *
713 * Returns 0 on success, -errno in error cases.
714 */
715int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
716 const void *buf, int count)
717{
718 int ret;
719
720 ret = bdrv_pwrite(bs, offset, buf, count);
721 if (ret < 0) {
722 return ret;
723 }
724
725 ret = bdrv_flush(bs);
726 if (ret < 0) {
727 return ret;
728 }
729
730 return 0;
731}
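/* Example (illustrative sketch): format drivers use this barrier for metadata
 * that later writes depend on; header_offset and header are hypothetical:
 *
 *     ret = bdrv_pwrite_sync(bs->file->bs, header_offset,
 *                            &header, sizeof(header));
 *     if (ret < 0) {
 *         return ret;
 *     }
 */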
732
733typedef struct CoroutineIOCompletion {
734 Coroutine *coroutine;
735 int ret;
736} CoroutineIOCompletion;
737
738static void bdrv_co_io_em_complete(void *opaque, int ret)
739{
740 CoroutineIOCompletion *co = opaque;
741
742 co->ret = ret;
743 qemu_coroutine_enter(co->coroutine, NULL);
744}
745
746static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
747 uint64_t offset, uint64_t bytes,
748 QEMUIOVector *qiov, int flags)
749{
750 BlockDriver *drv = bs->drv;
751 int64_t sector_num;
752 unsigned int nb_sectors;
753
754 if (drv->bdrv_co_preadv) {
755 return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
756 }
757
758 sector_num = offset >> BDRV_SECTOR_BITS;
759 nb_sectors = bytes >> BDRV_SECTOR_BITS;
760
761 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
762 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
763 assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
764
765 if (drv->bdrv_co_readv) {
766 return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
767 } else {
768 BlockAIOCB *acb;
769 CoroutineIOCompletion co = {
770 .coroutine = qemu_coroutine_self(),
771 };
772
773 acb = bs->drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
774 bdrv_co_io_em_complete, &co);
775 if (acb == NULL) {
776 return -EIO;
777 } else {
778 qemu_coroutine_yield();
779 return co.ret;
780 }
781 }
782}
783
784static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
785 uint64_t offset, uint64_t bytes,
786 QEMUIOVector *qiov, int flags)
787{
788 BlockDriver *drv = bs->drv;
789 int64_t sector_num;
790 unsigned int nb_sectors;
791 int ret;
792
793 if (drv->bdrv_co_pwritev) {
794 ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, flags);
795 goto emulate_flags;
796 }
797
798 sector_num = offset >> BDRV_SECTOR_BITS;
799 nb_sectors = bytes >> BDRV_SECTOR_BITS;
800
801 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
802 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
803 assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
804
805 if (drv->bdrv_co_writev_flags) {
806 ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
807 flags & bs->supported_write_flags);
808 flags &= ~bs->supported_write_flags;
08844473 809 } else if (drv->bdrv_co_writev) {
4df863f3 810 assert(!bs->supported_write_flags);
78a07294 811 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
812 } else {
813 BlockAIOCB *acb;
814 CoroutineIOCompletion co = {
815 .coroutine = qemu_coroutine_self(),
816 };
817
818 acb = bs->drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
819 bdrv_co_io_em_complete, &co);
820 if (acb == NULL) {
3fb06697 821 ret = -EIO;
822 } else {
823 qemu_coroutine_yield();
3fb06697 824 ret = co.ret;
08844473 825 }
826 }
827
3fb06697 828emulate_flags:
4df863f3 829 if (ret == 0 && (flags & BDRV_REQ_FUA)) {
830 ret = bdrv_co_flush(bs);
831 }
832
833 return ret;
834}
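/* Example (illustrative): a driver that handles FUA natively advertises it,
 * typically when the image is opened, so the flush fallback above is skipped:
 *
 *     bs->supported_write_flags = BDRV_REQ_FUA;
 *
 * Drivers that leave supported_write_flags at 0 get BDRV_REQ_FUA emulated
 * here with a full bdrv_co_flush() after the write completes.
 */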
835
836static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
837 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
838{
839 /* Perform I/O through a temporary buffer so that users who scribble over
840 * their read buffer while the operation is in progress do not end up
841 * modifying the image file. This is critical for zero-copy guest I/O
842 * where anything might happen inside guest memory.
843 */
844 void *bounce_buffer;
845
846 BlockDriver *drv = bs->drv;
847 struct iovec iov;
848 QEMUIOVector bounce_qiov;
849 int64_t cluster_sector_num;
850 int cluster_nb_sectors;
851 size_t skip_bytes;
852 int ret;
853
854 /* Cover entire cluster so no additional backing file I/O is required when
855 * allocating cluster in the image file.
856 */
857 bdrv_round_to_clusters(bs, sector_num, nb_sectors,
858 &cluster_sector_num, &cluster_nb_sectors);
859
860 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
861 cluster_sector_num, cluster_nb_sectors);
862
863 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
864 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
865 if (bounce_buffer == NULL) {
866 ret = -ENOMEM;
867 goto err;
868 }
869
870 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
871
872 ret = bdrv_driver_preadv(bs, cluster_sector_num * BDRV_SECTOR_SIZE,
873 cluster_nb_sectors * BDRV_SECTOR_SIZE,
874 &bounce_qiov, 0);
875 if (ret < 0) {
876 goto err;
877 }
878
879 if (drv->bdrv_co_write_zeroes &&
880 buffer_is_zero(bounce_buffer, iov.iov_len)) {
881 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
882 cluster_nb_sectors, 0);
883 } else {
884 /* This does not change the data on the disk, it is not necessary
885 * to flush even in cache=writethrough mode.
886 */
887 ret = bdrv_driver_pwritev(bs, cluster_sector_num * BDRV_SECTOR_SIZE,
888 cluster_nb_sectors * BDRV_SECTOR_SIZE,
889 &bounce_qiov, 0);
890 }
891
892 if (ret < 0) {
893 /* It might be okay to ignore write errors for guest requests. If this
894 * is a deliberate copy-on-read then we don't want to ignore the error.
895 * Simply report it in all cases.
896 */
897 goto err;
898 }
899
900 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
901 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
902 nb_sectors * BDRV_SECTOR_SIZE);
903
904err:
905 qemu_vfree(bounce_buffer);
906 return ret;
907}
908
909/*
910 * Forwards an already correctly aligned request to the BlockDriver. This
911 * handles copy on read and zeroing after EOF; any other features must be
912 * implemented by the caller.
913 */
914static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
915 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
916 int64_t align, QEMUIOVector *qiov, int flags)
917{
918 int ret;
919
920 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
921 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
922
923 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
924 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
925 assert(!qiov || bytes == qiov->size);
abb06c5a 926 assert((bs->open_flags & BDRV_O_NO_IO) == 0);
927
928 /* Handle Copy on Read and associated serialisation */
929 if (flags & BDRV_REQ_COPY_ON_READ) {
930 /* If we touch the same cluster it counts as an overlap. This
931 * guarantees that allocating writes will be serialized and not race
932 * with each other for the same cluster. For example, in copy-on-read
933 * it ensures that the CoR read and write operations are atomic and
934 * guest writes cannot interleave between them. */
935 mark_request_serialising(req, bdrv_get_cluster_size(bs));
936 }
937
938 if (!(flags & BDRV_REQ_NO_SERIALISING)) {
939 wait_serialising_requests(req);
940 }
941
942 if (flags & BDRV_REQ_COPY_ON_READ) {
943 int pnum;
944
945 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
946 if (ret < 0) {
947 goto out;
948 }
949
950 if (!ret || pnum != nb_sectors) {
951 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
952 goto out;
953 }
954 }
955
956 /* Forward the request to the BlockDriver */
957 if (!bs->zero_beyond_eof) {
166fe960 958 ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
959 } else {
960 /* Read zeros after EOF */
961 int64_t total_sectors, max_nb_sectors;
962
963 total_sectors = bdrv_nb_sectors(bs);
964 if (total_sectors < 0) {
965 ret = total_sectors;
966 goto out;
967 }
968
969 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
970 align >> BDRV_SECTOR_BITS);
971 if (nb_sectors < max_nb_sectors) {
166fe960 972 ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
973 } else if (max_nb_sectors > 0) {
974 QEMUIOVector local_qiov;
975
976 qemu_iovec_init(&local_qiov, qiov->niov);
977 qemu_iovec_concat(&local_qiov, qiov, 0,
978 max_nb_sectors * BDRV_SECTOR_SIZE);
979
980 ret = bdrv_driver_preadv(bs, offset,
981 max_nb_sectors * BDRV_SECTOR_SIZE,
982 &local_qiov, 0);
983
984 qemu_iovec_destroy(&local_qiov);
985 } else {
986 ret = 0;
987 }
988
989 /* Reading beyond end of file is supposed to produce zeroes */
990 if (ret == 0 && total_sectors < sector_num + nb_sectors) {
991 uint64_t offset = MAX(0, total_sectors - sector_num);
992 uint64_t bytes = (sector_num + nb_sectors - offset) *
993 BDRV_SECTOR_SIZE;
994 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
995 }
996 }
997
998out:
999 return ret;
1000}
1001
1002/*
1003 * Handle a read request in coroutine context
1004 */
cab3a356 1005int coroutine_fn bdrv_co_preadv(BlockDriverState *bs,
1006 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1007 BdrvRequestFlags flags)
1008{
1009 BlockDriver *drv = bs->drv;
1010 BdrvTrackedRequest req;
1011
1012 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
1013 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
1014 uint8_t *head_buf = NULL;
1015 uint8_t *tail_buf = NULL;
1016 QEMUIOVector local_qiov;
1017 bool use_local_qiov = false;
1018 int ret;
1019
1020 if (!drv) {
1021 return -ENOMEDIUM;
1022 }
1023
1024 ret = bdrv_check_byte_request(bs, offset, bytes);
1025 if (ret < 0) {
1026 return ret;
1027 }
1028
9568b511 1029 /* Don't do copy-on-read if we read data before write operation */
61408b25 1030 if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) {
1031 flags |= BDRV_REQ_COPY_ON_READ;
1032 }
1033
1034 /* Align read if necessary by padding qiov */
1035 if (offset & (align - 1)) {
1036 head_buf = qemu_blockalign(bs, align);
1037 qemu_iovec_init(&local_qiov, qiov->niov + 2);
1038 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1039 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1040 use_local_qiov = true;
1041
1042 bytes += offset & (align - 1);
1043 offset = offset & ~(align - 1);
1044 }
1045
1046 if ((offset + bytes) & (align - 1)) {
1047 if (!use_local_qiov) {
1048 qemu_iovec_init(&local_qiov, qiov->niov + 1);
1049 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1050 use_local_qiov = true;
1051 }
1052 tail_buf = qemu_blockalign(bs, align);
1053 qemu_iovec_add(&local_qiov, tail_buf,
1054 align - ((offset + bytes) & (align - 1)));
1055
1056 bytes = ROUND_UP(bytes, align);
1057 }
1058
ebde595c 1059 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
1060 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
1061 use_local_qiov ? &local_qiov : qiov,
1062 flags);
1063 tracked_request_end(&req);
1064
1065 if (use_local_qiov) {
1066 qemu_iovec_destroy(&local_qiov);
1067 qemu_vfree(head_buf);
1068 qemu_vfree(tail_buf);
1069 }
1070
1071 return ret;
1072}
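/* Example (illustrative): with align = 512, a 3000-byte read at offset 1000
 * is widened as follows: 488 head padding bytes (1000 & 511) give offset 512
 * and bytes 3488; the request then ends at byte 4000, so 96 tail padding
 * bytes are added and bytes is rounded up to 3584.  The caller's qiov still
 * receives exactly its original 3000 bytes.
 */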
1073
1074static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1075 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1076 BdrvRequestFlags flags)
1077{
1078 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1079 return -EINVAL;
1080 }
1081
1082 return bdrv_co_preadv(bs, sector_num << BDRV_SECTOR_BITS,
1083 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1084}
1085
1086int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
1087 int nb_sectors, QEMUIOVector *qiov)
1088{
1089 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
1090
1091 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
1092}
1093
61408b25 1094int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs,
1095 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1096{
61408b25 1097 trace_bdrv_co_readv_no_serialising(bs, sector_num, nb_sectors);
1098
1099 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
61408b25 1100 BDRV_REQ_NO_SERIALISING);
1101}
1102
1103int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
1104 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1105{
1106 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
1107
1108 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
1109 BDRV_REQ_COPY_ON_READ);
1110}
1111
1112#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
1113
1114static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
1115 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
1116{
1117 BlockDriver *drv = bs->drv;
1118 QEMUIOVector qiov;
1119 struct iovec iov = {0};
1120 int ret = 0;
465fe887 1121 bool need_flush = false;
1122
1123 int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes,
1124 BDRV_REQUEST_MAX_SECTORS);
1125
1126 while (nb_sectors > 0 && !ret) {
1127 int num = nb_sectors;
1128
1129 /* Align request. Block drivers can expect the "bulk" of the request
1130 * to be aligned.
1131 */
1132 if (bs->bl.write_zeroes_alignment
1133 && num > bs->bl.write_zeroes_alignment) {
1134 if (sector_num % bs->bl.write_zeroes_alignment != 0) {
1135 /* Make a small request up to the first aligned sector. */
1136 num = bs->bl.write_zeroes_alignment;
1137 num -= sector_num % bs->bl.write_zeroes_alignment;
1138 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
1139 /* Shorten the request to the last aligned sector. num cannot
1140 * underflow because num > bs->bl.write_zeroes_alignment.
1141 */
1142 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
1143 }
1144 }
1145
1146 /* limit request size */
1147 if (num > max_write_zeroes) {
1148 num = max_write_zeroes;
1149 }
1150
1151 ret = -ENOTSUP;
1152 /* First try the efficient write zeroes operation */
1153 if (drv->bdrv_co_write_zeroes) {
1154 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num,
1155 flags & bs->supported_zero_flags);
1156 if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
1157 !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
1158 need_flush = true;
1159 }
1160 } else {
1161 assert(!bs->supported_zero_flags);
1162 }
1163
1164 if (ret == -ENOTSUP) {
1165 /* Fall back to bounce buffer if write zeroes is unsupported */
1166 int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
1167 MAX_WRITE_ZEROES_BOUNCE_BUFFER);
1168 BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
1169
1170 if ((flags & BDRV_REQ_FUA) &&
1171 !(bs->supported_write_flags & BDRV_REQ_FUA)) {
1172 /* No need for bdrv_driver_pwrite() to do a fallback
1173 * flush on each chunk; use just one at the end */
1174 write_flags &= ~BDRV_REQ_FUA;
1175 need_flush = true;
1176 }
1177 num = MIN(num, max_xfer_len);
1178 iov.iov_len = num * BDRV_SECTOR_SIZE;
1179 if (iov.iov_base == NULL) {
1180 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
1181 if (iov.iov_base == NULL) {
1182 ret = -ENOMEM;
1183 goto fail;
1184 }
1185 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
1186 }
1187 qemu_iovec_init_external(&qiov, &iov, 1);
1188
78a07294 1189 ret = bdrv_driver_pwritev(bs, sector_num * BDRV_SECTOR_SIZE,
1190 num * BDRV_SECTOR_SIZE, &qiov,
1191 write_flags);
1192
 1193            /* Keep bounce buffer around if it is big enough for all
 1194             * future requests.
1195 */
1196 if (num < max_xfer_len) {
1197 qemu_vfree(iov.iov_base);
1198 iov.iov_base = NULL;
1199 }
1200 }
1201
1202 sector_num += num;
1203 nb_sectors -= num;
1204 }
1205
1206fail:
1207 if (ret == 0 && need_flush) {
1208 ret = bdrv_co_flush(bs);
1209 }
1210 qemu_vfree(iov.iov_base);
1211 return ret;
1212}
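/* Example (illustrative): with write_zeroes_alignment = 8 sectors, a request
 * for sectors [5, 30) is issued as three pieces: [5, 8) up to the first
 * aligned sector, the aligned bulk [8, 24), and the unaligned tail [24, 30),
 * each taking the driver or bounce-buffer path above on its own.
 */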
1213
1214/*
1215 * Forwards an already correctly aligned write request to the BlockDriver.
1216 */
1217static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
1218 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1219 QEMUIOVector *qiov, int flags)
1220{
1221 BlockDriver *drv = bs->drv;
1222 bool waited;
1223 int ret;
1224
1225 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
1226 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
1227
1228 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
1229 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
1230 assert(!qiov || bytes == qiov->size);
abb06c5a 1231 assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1232
1233 waited = wait_serialising_requests(req);
1234 assert(!waited || !req->serialising);
1235 assert(req->overlap_offset <= offset);
1236 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
1237
1238 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
1239
1240 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
1241 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
1242 qemu_iovec_is_zero(qiov)) {
1243 flags |= BDRV_REQ_ZERO_WRITE;
1244 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
1245 flags |= BDRV_REQ_MAY_UNMAP;
1246 }
1247 }
1248
1249 if (ret < 0) {
1250 /* Do nothing, write notifier decided to fail this request */
1251 } else if (flags & BDRV_REQ_ZERO_WRITE) {
9a4f4c31 1252 bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
1253 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
1254 } else {
9a4f4c31 1255 bdrv_debug_event(bs, BLKDBG_PWRITEV);
78a07294 1256 ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
61007b31 1257 }
9a4f4c31 1258 bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
61007b31 1259
1260 bdrv_set_dirty(bs, sector_num, nb_sectors);
1261
1262 if (bs->wr_highest_offset < offset + bytes) {
1263 bs->wr_highest_offset = offset + bytes;
1264 }
1265
1266 if (ret >= 0) {
1267 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
1268 }
1269
1270 return ret;
1271}
1272
1273static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
1274 int64_t offset,
1275 unsigned int bytes,
1276 BdrvRequestFlags flags,
1277 BdrvTrackedRequest *req)
1278{
1279 uint8_t *buf = NULL;
1280 QEMUIOVector local_qiov;
1281 struct iovec iov;
1282 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
1283 unsigned int head_padding_bytes, tail_padding_bytes;
1284 int ret = 0;
1285
1286 head_padding_bytes = offset & (align - 1);
1287 tail_padding_bytes = align - ((offset + bytes) & (align - 1));
1288
1289
1290 assert(flags & BDRV_REQ_ZERO_WRITE);
1291 if (head_padding_bytes || tail_padding_bytes) {
1292 buf = qemu_blockalign(bs, align);
1293 iov = (struct iovec) {
1294 .iov_base = buf,
1295 .iov_len = align,
1296 };
1297 qemu_iovec_init_external(&local_qiov, &iov, 1);
1298 }
1299 if (head_padding_bytes) {
1300 uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);
1301
1302 /* RMW the unaligned part before head. */
1303 mark_request_serialising(req, align);
1304 wait_serialising_requests(req);
9a4f4c31 1305 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1306 ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align,
1307 align, &local_qiov, 0);
1308 if (ret < 0) {
1309 goto fail;
1310 }
9a4f4c31 1311 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1312
1313 memset(buf + head_padding_bytes, 0, zero_bytes);
1314 ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align,
1315 &local_qiov,
1316 flags & ~BDRV_REQ_ZERO_WRITE);
1317 if (ret < 0) {
1318 goto fail;
1319 }
1320 offset += zero_bytes;
1321 bytes -= zero_bytes;
1322 }
1323
1324 assert(!bytes || (offset & (align - 1)) == 0);
1325 if (bytes >= align) {
1326 /* Write the aligned part in the middle. */
1327 uint64_t aligned_bytes = bytes & ~(align - 1);
1328 ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes,
1329 NULL, flags);
1330 if (ret < 0) {
1331 goto fail;
1332 }
1333 bytes -= aligned_bytes;
1334 offset += aligned_bytes;
1335 }
1336
1337 assert(!bytes || (offset & (align - 1)) == 0);
1338 if (bytes) {
1339 assert(align == tail_padding_bytes + bytes);
1340 /* RMW the unaligned part after tail. */
1341 mark_request_serialising(req, align);
1342 wait_serialising_requests(req);
9a4f4c31 1343 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1344 ret = bdrv_aligned_preadv(bs, req, offset, align,
1345 align, &local_qiov, 0);
1346 if (ret < 0) {
1347 goto fail;
1348 }
9a4f4c31 1349 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1350
1351 memset(buf, 0, bytes);
1352 ret = bdrv_aligned_pwritev(bs, req, offset, align,
1353 &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
1354 }
1355fail:
1356 qemu_vfree(buf);
1357 return ret;
1358
1359}
1360
1361/*
1362 * Handle a write request in coroutine context
1363 */
cab3a356 1364int coroutine_fn bdrv_co_pwritev(BlockDriverState *bs,
1365 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1366 BdrvRequestFlags flags)
1367{
1368 BdrvTrackedRequest req;
1369 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
1370 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
1371 uint8_t *head_buf = NULL;
1372 uint8_t *tail_buf = NULL;
1373 QEMUIOVector local_qiov;
1374 bool use_local_qiov = false;
1375 int ret;
1376
1377 if (!bs->drv) {
1378 return -ENOMEDIUM;
1379 }
1380 if (bs->read_only) {
eaf5fe2d 1381 return -EPERM;
61007b31 1382 }
04c01a5c 1383 assert(!(bs->open_flags & BDRV_O_INACTIVE));
1384
1385 ret = bdrv_check_byte_request(bs, offset, bytes);
1386 if (ret < 0) {
1387 return ret;
1388 }
1389
1390 /*
1391 * Align write if necessary by performing a read-modify-write cycle.
1392 * Pad qiov with the read parts and be sure to have a tracked request not
1393 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
1394 */
ebde595c 1395 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
61007b31 1396
1397 if (!qiov) {
1398 ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req);
1399 goto out;
1400 }
1401
1402 if (offset & (align - 1)) {
1403 QEMUIOVector head_qiov;
1404 struct iovec head_iov;
1405
1406 mark_request_serialising(&req, align);
1407 wait_serialising_requests(&req);
1408
1409 head_buf = qemu_blockalign(bs, align);
1410 head_iov = (struct iovec) {
1411 .iov_base = head_buf,
1412 .iov_len = align,
1413 };
1414 qemu_iovec_init_external(&head_qiov, &head_iov, 1);
1415
9a4f4c31 1416 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1417 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
1418 align, &head_qiov, 0);
1419 if (ret < 0) {
1420 goto fail;
1421 }
9a4f4c31 1422 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1423
1424 qemu_iovec_init(&local_qiov, qiov->niov + 2);
1425 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1426 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1427 use_local_qiov = true;
1428
1429 bytes += offset & (align - 1);
1430 offset = offset & ~(align - 1);
1431 }
1432
1433 if ((offset + bytes) & (align - 1)) {
1434 QEMUIOVector tail_qiov;
1435 struct iovec tail_iov;
1436 size_t tail_bytes;
1437 bool waited;
1438
1439 mark_request_serialising(&req, align);
1440 waited = wait_serialising_requests(&req);
1441 assert(!waited || !use_local_qiov);
1442
1443 tail_buf = qemu_blockalign(bs, align);
1444 tail_iov = (struct iovec) {
1445 .iov_base = tail_buf,
1446 .iov_len = align,
1447 };
1448 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
1449
9a4f4c31 1450 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1451 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
1452 align, &tail_qiov, 0);
1453 if (ret < 0) {
1454 goto fail;
1455 }
9a4f4c31 1456 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1457
1458 if (!use_local_qiov) {
1459 qemu_iovec_init(&local_qiov, qiov->niov + 1);
1460 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1461 use_local_qiov = true;
1462 }
1463
1464 tail_bytes = (offset + bytes) & (align - 1);
1465 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
1466
1467 bytes = ROUND_UP(bytes, align);
1468 }
1469
1470 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
1471 use_local_qiov ? &local_qiov : qiov,
1472 flags);
1473
1474fail:
1475
1476 if (use_local_qiov) {
1477 qemu_iovec_destroy(&local_qiov);
1478 }
1479 qemu_vfree(head_buf);
1480 qemu_vfree(tail_buf);
1481out:
1482 tracked_request_end(&req);
1483 return ret;
1484}
1485
1486static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1487 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1488 BdrvRequestFlags flags)
1489{
1490 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1491 return -EINVAL;
1492 }
1493
1494 return bdrv_co_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
1495 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1496}
1497
1498int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1499 int nb_sectors, QEMUIOVector *qiov)
1500{
1501 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1502
1503 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
1504}
1505
1506int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
1507 int64_t sector_num, int nb_sectors,
1508 BdrvRequestFlags flags)
1509{
1510 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
1511
1512 if (!(bs->open_flags & BDRV_O_UNMAP)) {
1513 flags &= ~BDRV_REQ_MAY_UNMAP;
1514 }
61007b31 1515
1516 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
1517 BDRV_REQ_ZERO_WRITE | flags);
1518}
1519
1520typedef struct BdrvCoGetBlockStatusData {
1521 BlockDriverState *bs;
1522 BlockDriverState *base;
67a0fd2a 1523 BlockDriverState **file;
61007b31
SH
1524 int64_t sector_num;
1525 int nb_sectors;
1526 int *pnum;
1527 int64_t ret;
1528 bool done;
1529} BdrvCoGetBlockStatusData;
1530
1531/*
1532 * Returns the allocation status of the specified sectors.
1533 * Drivers not implementing the functionality are assumed to not support
1534 * backing files, hence all their sectors are reported as allocated.
1535 *
1536 * If 'sector_num' is beyond the end of the disk image the return value is 0
1537 * and 'pnum' is set to 0.
1538 *
1539 * 'pnum' is set to the number of sectors (including and immediately following
1540 * the specified sector) that are known to be in the same
1541 * allocated/unallocated state.
1542 *
1543 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
1544 * beyond the end of the disk image it will be clamped.
1545 *
1546 * If returned value is positive and BDRV_BLOCK_OFFSET_VALID bit is set, 'file'
1547 * points to the BDS which the sector range is allocated in.
1548 */
1549static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
1550 int64_t sector_num,
1551 int nb_sectors, int *pnum,
1552 BlockDriverState **file)
1553{
1554 int64_t total_sectors;
1555 int64_t n;
1556 int64_t ret, ret2;
1557
1558 total_sectors = bdrv_nb_sectors(bs);
1559 if (total_sectors < 0) {
1560 return total_sectors;
1561 }
1562
1563 if (sector_num >= total_sectors) {
1564 *pnum = 0;
1565 return 0;
1566 }
1567
1568 n = total_sectors - sector_num;
1569 if (n < nb_sectors) {
1570 nb_sectors = n;
1571 }
1572
1573 if (!bs->drv->bdrv_co_get_block_status) {
1574 *pnum = nb_sectors;
1575 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
1576 if (bs->drv->protocol_name) {
1577 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
1578 }
1579 return ret;
1580 }
1581
1582 *file = NULL;
1583 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum,
1584 file);
1585 if (ret < 0) {
1586 *pnum = 0;
1587 return ret;
1588 }
1589
1590 if (ret & BDRV_BLOCK_RAW) {
1591 assert(ret & BDRV_BLOCK_OFFSET_VALID);
9a4f4c31 1592 return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
67a0fd2a 1593 *pnum, pnum, file);
1594 }
1595
1596 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
1597 ret |= BDRV_BLOCK_ALLOCATED;
a53f1a95 1598 } else {
61007b31
SH
1599 if (bdrv_unallocated_blocks_are_zero(bs)) {
1600 ret |= BDRV_BLOCK_ZERO;
1601 } else if (bs->backing) {
1602 BlockDriverState *bs2 = bs->backing->bs;
1603 int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
1604 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
1605 ret |= BDRV_BLOCK_ZERO;
1606 }
1607 }
1608 }
1609
ac987b30 1610 if (*file && *file != bs &&
1611 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
1612 (ret & BDRV_BLOCK_OFFSET_VALID)) {
67a0fd2a 1613 BlockDriverState *file2;
1614 int file_pnum;
1615
ac987b30 1616 ret2 = bdrv_co_get_block_status(*file, ret >> BDRV_SECTOR_BITS,
67a0fd2a 1617 *pnum, &file_pnum, &file2);
1618 if (ret2 >= 0) {
1619 /* Ignore errors. This is just providing extra information, it
1620 * is useful but not necessary.
1621 */
1622 if (!file_pnum) {
1623 /* !file_pnum indicates an offset at or beyond the EOF; it is
1624 * perfectly valid for the format block driver to point to such
1625 * offsets, so catch it and mark everything as zero */
1626 ret |= BDRV_BLOCK_ZERO;
1627 } else {
1628 /* Limit request to the range reported by the protocol driver */
1629 *pnum = file_pnum;
1630 ret |= (ret2 & BDRV_BLOCK_ZERO);
1631 }
1632 }
1633 }
1634
1635 return ret;
1636}
1637
1638static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs,
1639 BlockDriverState *base,
1640 int64_t sector_num,
1641 int nb_sectors,
1642 int *pnum,
1643 BlockDriverState **file)
1644{
1645 BlockDriverState *p;
1646 int64_t ret = 0;
1647
1648 assert(bs != base);
760e0063 1649 for (p = bs; p != base; p = backing_bs(p)) {
67a0fd2a 1650 ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum, file);
1651 if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) {
1652 break;
1653 }
1654 /* [sector_num, pnum] unallocated on this layer, which could be only
1655 * the first part of [sector_num, nb_sectors]. */
1656 nb_sectors = MIN(nb_sectors, *pnum);
1657 }
1658 return ret;
1659}
1660
1661/* Coroutine wrapper for bdrv_get_block_status_above() */
1662static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque)
1663{
1664 BdrvCoGetBlockStatusData *data = opaque;
61007b31 1665
1666 data->ret = bdrv_co_get_block_status_above(data->bs, data->base,
1667 data->sector_num,
1668 data->nb_sectors,
1669 data->pnum,
1670 data->file);
1671 data->done = true;
1672}
1673
1674/*
ba3f0e25 1675 * Synchronous wrapper around bdrv_co_get_block_status_above().
61007b31 1676 *
ba3f0e25 1677 * See bdrv_co_get_block_status_above() for details.
61007b31 1678 */
1679int64_t bdrv_get_block_status_above(BlockDriverState *bs,
1680 BlockDriverState *base,
1681 int64_t sector_num,
1682 int nb_sectors, int *pnum,
1683 BlockDriverState **file)
1684{
1685 Coroutine *co;
1686 BdrvCoGetBlockStatusData data = {
1687 .bs = bs,
ba3f0e25 1688 .base = base,
67a0fd2a 1689 .file = file,
61007b31
SH
1690 .sector_num = sector_num,
1691 .nb_sectors = nb_sectors,
1692 .pnum = pnum,
1693 .done = false,
1694 };
1695
1696 if (qemu_in_coroutine()) {
1697 /* Fast-path if already in coroutine context */
ba3f0e25 1698 bdrv_get_block_status_above_co_entry(&data);
1699 } else {
1700 AioContext *aio_context = bdrv_get_aio_context(bs);
1701
ba3f0e25 1702 co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry);
1703 qemu_coroutine_enter(co, &data);
1704 while (!data.done) {
1705 aio_poll(aio_context, true);
1706 }
1707 }
1708 return data.ret;
1709}
1710
1711int64_t bdrv_get_block_status(BlockDriverState *bs,
1712 int64_t sector_num,
1713 int nb_sectors, int *pnum,
1714 BlockDriverState **file)
ba3f0e25 1715{
760e0063 1716 return bdrv_get_block_status_above(bs, backing_bs(bs),
67a0fd2a 1717 sector_num, nb_sectors, pnum, file);
1718}
1719
1720int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
1721 int nb_sectors, int *pnum)
1722{
1723 BlockDriverState *file;
1724 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum,
1725 &file);
1726 if (ret < 0) {
1727 return ret;
1728 }
1729 return !!(ret & BDRV_BLOCK_ALLOCATED);
1730}
1731
1732/*
1733 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
1734 *
1735 * Return true if the given sector is allocated in any image between
1736 * BASE and TOP (inclusive). BASE can be NULL to check if the given
1737 * sector is allocated in any image of the chain. Return false otherwise.
1738 *
1739 * 'pnum' is set to the number of sectors (including and immediately following
1740 * the specified sector) that are known to be in the same
1741 * allocated/unallocated state.
1742 *
1743 */
1744int bdrv_is_allocated_above(BlockDriverState *top,
1745 BlockDriverState *base,
1746 int64_t sector_num,
1747 int nb_sectors, int *pnum)
1748{
1749 BlockDriverState *intermediate;
1750 int ret, n = nb_sectors;
1751
1752 intermediate = top;
1753 while (intermediate && intermediate != base) {
1754 int pnum_inter;
1755 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
1756 &pnum_inter);
1757 if (ret < 0) {
1758 return ret;
1759 } else if (ret) {
1760 *pnum = pnum_inter;
1761 return 1;
1762 }
1763
1764 /*
1765 * [sector_num, nb_sectors] is unallocated on top but intermediate
1766 * might have
1767 *
1768 * [sector_num+x, nr_sectors] allocated.
1769 */
1770 if (n > pnum_inter &&
1771 (intermediate == top ||
1772 sector_num + pnum_inter < intermediate->total_sectors)) {
1773 n = pnum_inter;
1774 }
1775
760e0063 1776 intermediate = backing_bs(intermediate);
1777 }
1778
1779 *pnum = n;
1780 return 0;
1781}
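/* Example (illustrative sketch): for a chain base <- snap <- top, a
 * commit-style caller can test whether a range still needs copying:
 *
 *     int pnum;
 *     int ret = bdrv_is_allocated_above(top, base, sector_num, nb_sectors,
 *                                       &pnum);
 *     if (ret > 0) {
 *         // the first pnum sectors are allocated somewhere above base
 *     }
 */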
1782
1783int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
1784 const uint8_t *buf, int nb_sectors)
1785{
1786 BlockDriver *drv = bs->drv;
1787 int ret;
1788
1789 if (!drv) {
1790 return -ENOMEDIUM;
1791 }
1792 if (!drv->bdrv_write_compressed) {
1793 return -ENOTSUP;
1794 }
1795 ret = bdrv_check_request(bs, sector_num, nb_sectors);
1796 if (ret < 0) {
1797 return ret;
1798 }
1799
1800 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
1801
1802 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
1803}
1804
1805int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
1806 int64_t pos, int size)
1807{
1808 QEMUIOVector qiov;
1809 struct iovec iov = {
1810 .iov_base = (void *) buf,
1811 .iov_len = size,
1812 };
1813
1814 qemu_iovec_init_external(&qiov, &iov, 1);
1815 return bdrv_writev_vmstate(bs, &qiov, pos);
1816}
1817
1818int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
1819{
1820 BlockDriver *drv = bs->drv;
1821
1822 if (!drv) {
1823 return -ENOMEDIUM;
1824 } else if (drv->bdrv_save_vmstate) {
1825 return drv->bdrv_save_vmstate(bs, qiov, pos);
1826 } else if (bs->file) {
9a4f4c31 1827 return bdrv_writev_vmstate(bs->file->bs, qiov, pos);
1828 }
1829
1830 return -ENOTSUP;
1831}
1832
1833int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
1834 int64_t pos, int size)
1835{
1836 BlockDriver *drv = bs->drv;
1837 if (!drv)
1838 return -ENOMEDIUM;
1839 if (drv->bdrv_load_vmstate)
1840 return drv->bdrv_load_vmstate(bs, buf, pos, size);
1841 if (bs->file)
9a4f4c31 1842 return bdrv_load_vmstate(bs->file->bs, buf, pos, size);
1843 return -ENOTSUP;
1844}
1845
1846/**************************************************************/
1847/* async I/Os */
1848
1849BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
1850 QEMUIOVector *qiov, int nb_sectors,
1851 BlockCompletionFunc *cb, void *opaque)
1852{
1853 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
1854
1855 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
1856 cb, opaque, false);
1857}
1858
1859BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
1860 QEMUIOVector *qiov, int nb_sectors,
1861 BlockCompletionFunc *cb, void *opaque)
1862{
1863 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
1864
1865 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
1866 cb, opaque, true);
1867}
1868
1869BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
1870 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
1871 BlockCompletionFunc *cb, void *opaque)
1872{
1873 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
1874
1875 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
1876 BDRV_REQ_ZERO_WRITE | flags,
1877 cb, opaque, true);
1878}
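/*
 * A minimal sketch of how these async wrappers are used (hypothetical
 * example_* names; assumes the caller keeps the context and the buffer
 * alive until the completion callback has run).
 */
#if 0
typedef struct ExampleReadCtx {
    QEMUIOVector qiov;
    struct iovec iov;
    int status;
} ExampleReadCtx;

static void example_read_done(void *opaque, int ret)
{
    ExampleReadCtx *ctx = opaque;
    /* ret is 0 on success or a negative errno value */
    ctx->status = ret;
}

static void example_submit_read(BlockDriverState *bs, ExampleReadCtx *ctx,
                                void *buf)
{
    ctx->iov.iov_base = buf;
    ctx->iov.iov_len  = BDRV_SECTOR_SIZE;
    qemu_iovec_init_external(&ctx->qiov, &ctx->iov, 1);

    /* Read one sector starting at sector 0; example_read_done() fires later,
     * so ctx (and thus the iovec) must stay valid until then. */
    bdrv_aio_readv(bs, 0, &ctx->qiov, 1, example_read_done, ctx);
}
#endif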
1879
1880void bdrv_aio_cancel(BlockAIOCB *acb)
1881{
1882 qemu_aio_ref(acb);
1883 bdrv_aio_cancel_async(acb);
1884 while (acb->refcnt > 1) {
1885 if (acb->aiocb_info->get_aio_context) {
1886 aio_poll(acb->aiocb_info->get_aio_context(acb), true);
1887 } else if (acb->bs) {
1888 aio_poll(bdrv_get_aio_context(acb->bs), true);
1889 } else {
1890 abort();
1891 }
1892 }
1893 qemu_aio_unref(acb);
1894}
1895
1896/* Async version of aio cancel. The caller is not blocked if the acb implements
1897 * cancel_async; otherwise we do nothing and let the request complete normally.
1898 * In either case the completion callback must still be called. */
1899void bdrv_aio_cancel_async(BlockAIOCB *acb)
1900{
1901 if (acb->aiocb_info->cancel_async) {
1902 acb->aiocb_info->cancel_async(acb);
1903 }
1904}
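/*
 * A small sketch of the two cancellation flavours (hypothetical caller):
 * the synchronous variant blocks in aio_poll() until the request is gone,
 * while the async variant only requests cancellation and returns.
 */
#if 0
static void example_cancel(BlockAIOCB *acb, bool wait)
{
    if (wait) {
        bdrv_aio_cancel(acb);        /* polls until the refcount drops to 1 */
    } else {
        bdrv_aio_cancel_async(acb);  /* completion callback still fires later */
    }
}
#endif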
1905
1906/**************************************************************/
1907/* async block device emulation */
1908
1909typedef struct BlockAIOCBCoroutine {
1910 BlockAIOCB common;
1911 BlockRequest req;
1912 bool is_write;
1913 bool need_bh;
1914 bool *done;
1915 QEMUBH* bh;
1916} BlockAIOCBCoroutine;
1917
1918static const AIOCBInfo bdrv_em_co_aiocb_info = {
1919 .aiocb_size = sizeof(BlockAIOCBCoroutine),
1920};
1921
1922static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
1923{
1924 if (!acb->need_bh) {
1925 acb->common.cb(acb->common.opaque, acb->req.error);
1926 qemu_aio_unref(acb);
1927 }
1928}
1929
1930static void bdrv_co_em_bh(void *opaque)
1931{
1932 BlockAIOCBCoroutine *acb = opaque;
1933
1934 assert(!acb->need_bh);
1935 qemu_bh_delete(acb->bh);
1936 bdrv_co_complete(acb);
1937}
1938
1939static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
1940{
1941 acb->need_bh = false;
1942 if (acb->req.error != -EINPROGRESS) {
1943 BlockDriverState *bs = acb->common.bs;
1944
1945 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
1946 qemu_bh_schedule(acb->bh);
1947 }
1948}
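/*
 * Note on the need_bh protocol above (based on the code as it stands):
 * acb->need_bh starts out true, so if the coroutine finishes before the
 * submitting function has handed the ACB back to its caller,
 * bdrv_co_complete() does nothing yet. bdrv_co_maybe_schedule_bh() then
 * clears need_bh and, if the request has already completed
 * (req.error != -EINPROGRESS), defers the completion callback to a bottom
 * half; otherwise the callback runs directly from bdrv_co_complete() once
 * the coroutine finishes. Either way the callback cannot run before the
 * caller has received the ACB pointer.
 */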
1949
1950/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
1951static void coroutine_fn bdrv_co_do_rw(void *opaque)
1952{
1953 BlockAIOCBCoroutine *acb = opaque;
1954 BlockDriverState *bs = acb->common.bs;
1955
1956 if (!acb->is_write) {
1957 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
1958 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
1959 } else {
1960 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
1961 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
1962 }
1963
1964 bdrv_co_complete(acb);
1965}
1966
1967static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
1968 int64_t sector_num,
1969 QEMUIOVector *qiov,
1970 int nb_sectors,
1971 BdrvRequestFlags flags,
1972 BlockCompletionFunc *cb,
1973 void *opaque,
1974 bool is_write)
1975{
1976 Coroutine *co;
1977 BlockAIOCBCoroutine *acb;
1978
1979 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
1980 acb->need_bh = true;
1981 acb->req.error = -EINPROGRESS;
1982 acb->req.sector = sector_num;
1983 acb->req.nb_sectors = nb_sectors;
1984 acb->req.qiov = qiov;
1985 acb->req.flags = flags;
1986 acb->is_write = is_write;
1987
1988 co = qemu_coroutine_create(bdrv_co_do_rw);
1989 qemu_coroutine_enter(co, acb);
1990
1991 bdrv_co_maybe_schedule_bh(acb);
1992 return &acb->common;
1993}
1994
1995static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
1996{
1997 BlockAIOCBCoroutine *acb = opaque;
1998 BlockDriverState *bs = acb->common.bs;
1999
2000 acb->req.error = bdrv_co_flush(bs);
2001 bdrv_co_complete(acb);
2002}
2003
2004BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
2005 BlockCompletionFunc *cb, void *opaque)
2006{
2007 trace_bdrv_aio_flush(bs, opaque);
2008
2009 Coroutine *co;
2010 BlockAIOCBCoroutine *acb;
2011
2012 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2013 acb->need_bh = true;
2014 acb->req.error = -EINPROGRESS;
2015
2016 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
2017 qemu_coroutine_enter(co, acb);
2018
2019 bdrv_co_maybe_schedule_bh(acb);
2020 return &acb->common;
2021}
2022
2023static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
2024{
2025 BlockAIOCBCoroutine *acb = opaque;
2026 BlockDriverState *bs = acb->common.bs;
2027
2028 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
2029 bdrv_co_complete(acb);
2030}
2031
2032BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
2033 int64_t sector_num, int nb_sectors,
2034 BlockCompletionFunc *cb, void *opaque)
2035{
2036 Coroutine *co;
2037 BlockAIOCBCoroutine *acb;
2038
2039 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
2040
2041 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2042 acb->need_bh = true;
2043 acb->req.error = -EINPROGRESS;
2044 acb->req.sector = sector_num;
2045 acb->req.nb_sectors = nb_sectors;
2046 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
2047 qemu_coroutine_enter(co, acb);
2048
2049 bdrv_co_maybe_schedule_bh(acb);
2050 return &acb->common;
2051}
2052
2053void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
2054 BlockCompletionFunc *cb, void *opaque)
2055{
2056 BlockAIOCB *acb;
2057
2058 acb = g_malloc(aiocb_info->aiocb_size);
2059 acb->aiocb_info = aiocb_info;
2060 acb->bs = bs;
2061 acb->cb = cb;
2062 acb->opaque = opaque;
2063 acb->refcnt = 1;
2064 return acb;
2065}
2066
2067void qemu_aio_ref(void *p)
2068{
2069 BlockAIOCB *acb = p;
2070 acb->refcnt++;
2071}
2072
2073void qemu_aio_unref(void *p)
2074{
2075 BlockAIOCB *acb = p;
2076 assert(acb->refcnt > 0);
2077 if (--acb->refcnt == 0) {
2078 g_free(acb);
2079 }
2080}
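/*
 * Reference-counting note: qemu_aio_get() hands out an ACB with refcnt == 1,
 * owned by the emulation code that eventually calls qemu_aio_unref() after
 * the completion callback. Callers that need to keep the ACB alive across a
 * wait, as bdrv_aio_cancel() does above, take an extra reference with
 * qemu_aio_ref() and drop it again when they are done.
 */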
2081
2082/**************************************************************/
2083/* Coroutine block device emulation */
2084
2085static void coroutine_fn bdrv_flush_co_entry(void *opaque)
2086{
2087 RwCo *rwco = opaque;
2088
2089 rwco->ret = bdrv_co_flush(rwco->bs);
2090}
2091
2092int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
2093{
2094 int ret;
2095 BdrvTrackedRequest req;
2096
2097 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
2098 bdrv_is_sg(bs)) {
2099 return 0;
2100 }
2101
2102 tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH);
2103
2104 /* Write back all layers by calling one driver function */
2105 if (bs->drv->bdrv_co_flush) {
2106 ret = bs->drv->bdrv_co_flush(bs);
2107 goto out;
2108 }
2109
2110 /* Write back cached data to the OS even with cache=unsafe */
2111 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
2112 if (bs->drv->bdrv_co_flush_to_os) {
2113 ret = bs->drv->bdrv_co_flush_to_os(bs);
2114 if (ret < 0) {
2115 goto out;
2116 }
2117 }
2118
2119 /* But don't actually force it to the disk with cache=unsafe */
2120 if (bs->open_flags & BDRV_O_NO_FLUSH) {
2121 goto flush_parent;
2122 }
2123
2124 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
2125 if (bs->drv->bdrv_co_flush_to_disk) {
2126 ret = bs->drv->bdrv_co_flush_to_disk(bs);
2127 } else if (bs->drv->bdrv_aio_flush) {
2128 BlockAIOCB *acb;
2129 CoroutineIOCompletion co = {
2130 .coroutine = qemu_coroutine_self(),
2131 };
2132
2133 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
2134 if (acb == NULL) {
2135 ret = -EIO;
2136 } else {
2137 qemu_coroutine_yield();
2138 ret = co.ret;
2139 }
2140 } else {
2141 /*
2142 * Some block drivers always operate in either writethrough or unsafe
2143 * mode and therefore don't support bdrv_flush. Usually qemu doesn't
2144 * know how the server works (because the behaviour is hardcoded or
2145 * depends on server-side configuration), so we can't ensure that
2146 * everything is safe on disk. Returning an error doesn't work because
2147 * that would break guests even if the server operates in writethrough
2148 * mode.
2149 *
2150 * Let's hope the user knows what they're doing.
2151 */
2152 ret = 0;
2153 }
2154 if (ret < 0) {
2155 goto out;
2156 }
2157
2158 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
2159 * in the case of cache=unsafe, so there are no useless flushes.
2160 */
2161flush_parent:
2162 ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
2163out:
2164 tracked_request_end(&req);
2165 return ret;
2166}
2167
2168int bdrv_flush(BlockDriverState *bs)
2169{
2170 Coroutine *co;
2171 RwCo rwco = {
2172 .bs = bs,
2173 .ret = NOT_DONE,
2174 };
2175
2176 if (qemu_in_coroutine()) {
2177 /* Fast-path if already in coroutine context */
2178 bdrv_flush_co_entry(&rwco);
2179 } else {
2180 AioContext *aio_context = bdrv_get_aio_context(bs);
2181
2182 co = qemu_coroutine_create(bdrv_flush_co_entry);
2183 qemu_coroutine_enter(co, &rwco);
2184 while (rwco.ret == NOT_DONE) {
2185 aio_poll(aio_context, true);
2186 }
2187 }
2188
2189 return rwco.ret;
2190}
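/*
 * Usage sketch (hypothetical): bdrv_flush() may be called from coroutine and
 * non-coroutine context alike; outside a coroutine it spins aio_poll() until
 * the internal flush coroutine has finished, so the call is synchronous.
 */
#if 0
static int example_sync_flush(BlockDriverState *bs)
{
    return bdrv_flush(bs);      /* 0 on success, negative errno on failure */
}
#endif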
2191
2192typedef struct DiscardCo {
2193 BlockDriverState *bs;
2194 int64_t sector_num;
2195 int nb_sectors;
2196 int ret;
2197} DiscardCo;
2198static void coroutine_fn bdrv_discard_co_entry(void *opaque)
2199{
2200 DiscardCo *rwco = opaque;
2201
2202 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
2203}
2204
2205int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
2206 int nb_sectors)
2207{
2208 BdrvTrackedRequest req;
2209 int max_discard, ret;
2210
2211 if (!bs->drv) {
2212 return -ENOMEDIUM;
2213 }
2214
2215 ret = bdrv_check_request(bs, sector_num, nb_sectors);
2216 if (ret < 0) {
2217 return ret;
2218 } else if (bs->read_only) {
2219 return -EPERM;
2220 }
2221 assert(!(bs->open_flags & BDRV_O_INACTIVE));
2222
2223 /* Do nothing if disabled. */
2224 if (!(bs->open_flags & BDRV_O_UNMAP)) {
2225 return 0;
2226 }
2227
2228 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
2229 return 0;
2230 }
2231
2232 tracked_request_begin(&req, bs, sector_num, nb_sectors,
2233 BDRV_TRACKED_DISCARD);
2234 bdrv_set_dirty(bs, sector_num, nb_sectors);
2235
2236 max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
2237 while (nb_sectors > 0) {
2238 int ret;
2239 int num = nb_sectors;
2240
2241 /* align request */
2242 if (bs->bl.discard_alignment &&
2243 num >= bs->bl.discard_alignment &&
2244 sector_num % bs->bl.discard_alignment) {
2245 if (num > bs->bl.discard_alignment) {
2246 num = bs->bl.discard_alignment;
2247 }
2248 num -= sector_num % bs->bl.discard_alignment;
2249 }
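        /*
         * Worked example (illustrative numbers): with discard_alignment == 8,
         * sector_num == 5 and num == 20, num is first capped to 8 and then
         * reduced by sector_num % 8 == 5, so this iteration discards only
         * sectors 5..7 and the next iteration starts at the aligned sector 8.
         */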
2250
2251 /* limit request size */
2252 if (num > max_discard) {
2253 num = max_discard;
2254 }
2255
2256 if (bs->drv->bdrv_co_discard) {
2257 ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
2258 } else {
2259 BlockAIOCB *acb;
2260 CoroutineIOCompletion co = {
2261 .coroutine = qemu_coroutine_self(),
2262 };
2263
2264 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
2265 bdrv_co_io_em_complete, &co);
2266 if (acb == NULL) {
2267 ret = -EIO;
2268 goto out;
2269 } else {
2270 qemu_coroutine_yield();
2271 ret = co.ret;
2272 }
2273 }
2274 if (ret && ret != -ENOTSUP) {
2275 goto out;
2276 }
2277
2278 sector_num += num;
2279 nb_sectors -= num;
2280 }
2281 ret = 0;
2282out:
2283 tracked_request_end(&req);
2284 return ret;
2285}
2286
2287int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
2288{
2289 Coroutine *co;
2290 DiscardCo rwco = {
2291 .bs = bs,
2292 .sector_num = sector_num,
2293 .nb_sectors = nb_sectors,
2294 .ret = NOT_DONE,
2295 };
2296
2297 if (qemu_in_coroutine()) {
2298 /* Fast-path if already in coroutine context */
2299 bdrv_discard_co_entry(&rwco);
2300 } else {
2301 AioContext *aio_context = bdrv_get_aio_context(bs);
2302
2303 co = qemu_coroutine_create(bdrv_discard_co_entry);
2304 qemu_coroutine_enter(co, &rwco);
2305 while (rwco.ret == NOT_DONE) {
2306 aio_poll(aio_context, true);
2307 }
2308 }
2309
2310 return rwco.ret;
2311}
2312
2313typedef struct {
2314 CoroutineIOCompletion *co;
2315 QEMUBH *bh;
2316} BdrvIoctlCompletionData;
2317
2318static void bdrv_ioctl_bh_cb(void *opaque)
2319{
2320 BdrvIoctlCompletionData *data = opaque;
2321
2322 bdrv_co_io_em_complete(data->co, -ENOTSUP);
2323 qemu_bh_delete(data->bh);
2324}
2325
2326static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf)
2327{
2328 BlockDriver *drv = bs->drv;
2329 BdrvTrackedRequest tracked_req;
2330 CoroutineIOCompletion co = {
2331 .coroutine = qemu_coroutine_self(),
2332 };
2333 BlockAIOCB *acb;
2334
2335 tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL);
2336 if (!drv || !drv->bdrv_aio_ioctl) {
2337 co.ret = -ENOTSUP;
2338 goto out;
2339 }
2340
2341 acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
2342 if (!acb) {
2343 BdrvIoctlCompletionData *data = g_new(BdrvIoctlCompletionData, 1);
2344 data->bh = aio_bh_new(bdrv_get_aio_context(bs),
2345 bdrv_ioctl_bh_cb, data);
2346 data->co = &co;
2347 qemu_bh_schedule(data->bh);
2348 }
2349 qemu_coroutine_yield();
2350out:
2351 tracked_request_end(&tracked_req);
2352 return co.ret;
2353}
2354
2355typedef struct {
2356 BlockDriverState *bs;
2357 int req;
2358 void *buf;
2359 int ret;
2360} BdrvIoctlCoData;
2361
2362static void coroutine_fn bdrv_co_ioctl_entry(void *opaque)
2363{
2364 BdrvIoctlCoData *data = opaque;
2365 data->ret = bdrv_co_do_ioctl(data->bs, data->req, data->buf);
2366}
2367
2368/* needed for generic scsi interface */
2369int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
2370{
2371 BdrvIoctlCoData data = {
2372 .bs = bs,
2373 .req = req,
2374 .buf = buf,
2375 .ret = -EINPROGRESS,
2376 };
2377
2378 if (qemu_in_coroutine()) {
2379 /* Fast-path if already in coroutine context */
2380 bdrv_co_ioctl_entry(&data);
2381 } else {
2382 Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry);
2383
2384 qemu_coroutine_enter(co, &data);
2385 while (data.ret == -EINPROGRESS) {
2386 aio_poll(bdrv_get_aio_context(bs), true);
2387 }
2388 }
2389 return data.ret;
2390}
2391
2392static void coroutine_fn bdrv_co_aio_ioctl_entry(void *opaque)
2393{
2394 BlockAIOCBCoroutine *acb = opaque;
2395 acb->req.error = bdrv_co_do_ioctl(acb->common.bs,
2396 acb->req.req, acb->req.buf);
2397 bdrv_co_complete(acb);
2398}
2399
2400BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
2401 unsigned long int req, void *buf,
2402 BlockCompletionFunc *cb, void *opaque)
2403{
2404 BlockAIOCBCoroutine *acb = qemu_aio_get(&bdrv_em_co_aiocb_info,
2405 bs, cb, opaque);
2406 Coroutine *co;
2407
2408 acb->need_bh = true;
2409 acb->req.error = -EINPROGRESS;
2410 acb->req.req = req;
2411 acb->req.buf = buf;
2412 co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry);
2413 qemu_coroutine_enter(co, acb);
2414
2415 bdrv_co_maybe_schedule_bh(acb);
2416 return &acb->common;
2417}
2418
2419void *qemu_blockalign(BlockDriverState *bs, size_t size)
2420{
2421 return qemu_memalign(bdrv_opt_mem_align(bs), size);
2422}
2423
2424void *qemu_blockalign0(BlockDriverState *bs, size_t size)
2425{
2426 return memset(qemu_blockalign(bs, size), 0, size);
2427}
2428
2429void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
2430{
2431 size_t align = bdrv_opt_mem_align(bs);
2432
2433 /* Ensure that NULL is never returned on success */
2434 assert(align > 0);
2435 if (size == 0) {
2436 size = align;
2437 }
2438
2439 return qemu_try_memalign(align, size);
2440}
2441
2442void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
2443{
2444 void *mem = qemu_try_blockalign(bs, size);
2445
2446 if (mem) {
2447 memset(mem, 0, size);
2448 }
2449
2450 return mem;
2451}
2452
2453/*
2454 * Check if all memory in this vector meets the driver's minimum memory alignment.
2455 */
2456bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
2457{
2458 int i;
2459 size_t alignment = bdrv_min_mem_align(bs);
2460
2461 for (i = 0; i < qiov->niov; i++) {
2462 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
2463 return false;
2464 }
2465 if (qiov->iov[i].iov_len % alignment) {
2466 return false;
2467 }
2468 }
2469
2470 return true;
2471}
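/*
 * Allocation sketch (hypothetical): a buffer from qemu_blockalign() is
 * aligned to the driver's optimal memory alignment, which is normally at
 * least the minimum alignment that bdrv_qiov_is_aligned() checks; the length
 * is assumed here to be a multiple of that minimum alignment as well.
 */
#if 0
static bool example_aligned_buffer(BlockDriverState *bs, size_t len)
{
    void *buf = qemu_blockalign(bs, len);
    struct iovec iov = { .iov_base = buf, .iov_len = len };
    QEMUIOVector qiov;
    bool ok;

    qemu_iovec_init_external(&qiov, &iov, 1);
    ok = bdrv_qiov_is_aligned(bs, &qiov);
    qemu_vfree(buf);
    return ok;
}
#endif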
2472
2473void bdrv_add_before_write_notifier(BlockDriverState *bs,
2474 NotifierWithReturn *notifier)
2475{
2476 notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
2477}
2478
2479void bdrv_io_plug(BlockDriverState *bs)
2480{
2481 BdrvChild *child;
2482
2483 QLIST_FOREACH(child, &bs->children, next) {
2484 bdrv_io_plug(child->bs);
2485 }
2486
2487 if (bs->io_plugged++ == 0 && bs->io_plug_disabled == 0) {
2488 BlockDriver *drv = bs->drv;
2489 if (drv && drv->bdrv_io_plug) {
2490 drv->bdrv_io_plug(bs);
2491 }
2492 }
2493}
2494
2495void bdrv_io_unplug(BlockDriverState *bs)
2496{
2497 BdrvChild *child;
2498
2499 assert(bs->io_plugged);
2500 if (--bs->io_plugged == 0 && bs->io_plug_disabled == 0) {
2501 BlockDriver *drv = bs->drv;
2502 if (drv && drv->bdrv_io_unplug) {
2503 drv->bdrv_io_unplug(bs);
2504 }
2505 }
2506
2507 QLIST_FOREACH(child, &bs->children, next) {
2508 bdrv_io_unplug(child->bs);
2509 }
2510}
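/*
 * Batching sketch (hypothetical): a device model submits several requests
 * between plug and unplug so that drivers implementing bdrv_io_plug (for
 * example the Linux AIO backend) can hand them to the kernel in one batch.
 */
#if 0
static void example_submit_batch(BlockDriverState *bs,
                                 void (*submit_one)(BlockDriverState *),
                                 int n)
{
    int i;

    bdrv_io_plug(bs);
    for (i = 0; i < n; i++) {
        submit_one(bs);         /* e.g. a bdrv_aio_readv()/writev() call */
    }
    bdrv_io_unplug(bs);         /* lets the driver flush its queued requests */
}
#endif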
2511
2512void bdrv_io_unplugged_begin(BlockDriverState *bs)
2513{
2514 BdrvChild *child;
2515
2516 if (bs->io_plug_disabled++ == 0 && bs->io_plugged > 0) {
2517 BlockDriver *drv = bs->drv;
2518 if (drv && drv->bdrv_io_unplug) {
2519 drv->bdrv_io_unplug(bs);
2520 }
2521 }
2522
2523 QLIST_FOREACH(child, &bs->children, next) {
2524 bdrv_io_unplugged_begin(child->bs);
2525 }
2526}
2527
2528void bdrv_io_unplugged_end(BlockDriverState *bs)
2529{
2530 BdrvChild *child;
2531
2532 assert(bs->io_plug_disabled);
2533 QLIST_FOREACH(child, &bs->children, next) {
2534 bdrv_io_unplugged_end(child->bs);
2535 }
2536
2537 if (--bs->io_plug_disabled == 0 && bs->io_plugged > 0) {
2538 BlockDriver *drv = bs->drv;
2539 if (drv && drv->bdrv_io_plug) {
2540 drv->bdrv_io_plug(bs);
2541 }
2542 }
2543}
2544
2545void bdrv_drained_begin(BlockDriverState *bs)
2546{
2547 if (!bs->quiesce_counter++) {
2548 aio_disable_external(bdrv_get_aio_context(bs));
2549 }
2550 bdrv_parent_drained_begin(bs);
2551 bdrv_drain(bs);
2552}
2553
2554void bdrv_drained_end(BlockDriverState *bs)
2555{
2556 bdrv_parent_drained_end(bs);
2557
2558 assert(bs->quiesce_counter > 0);
2559 if (--bs->quiesce_counter > 0) {
2560 return;
2561 }
2562 aio_enable_external(bdrv_get_aio_context(bs));
2563}
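/*
 * Drained-section sketch (hypothetical): between bdrv_drained_begin() and
 * bdrv_drained_end() no new external requests are accepted and the requests
 * that were in flight have been drained, so the caller can safely
 * reconfigure the node in between.
 */
#if 0
static void example_with_drained(BlockDriverState *bs,
                                 void (*fn)(BlockDriverState *))
{
    bdrv_drained_begin(bs);
    fn(bs);                     /* e.g. take a snapshot or swap a child */
    bdrv_drained_end(bs);
}
#endif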