git.proxmox.com Git - mirror_qemu.git/blob - block/io.c
block: Byte-based bdrv_co_do_copy_on_readv()
1 /*
2 * Block layer I/O functions
3 *
4 * Copyright (c) 2003 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25 #include "qemu/osdep.h"
26 #include "trace.h"
27 #include "sysemu/block-backend.h"
28 #include "block/blockjob.h"
29 #include "block/block_int.h"
30 #include "qemu/cutils.h"
31 #include "qapi/error.h"
32 #include "qemu/error-report.h"
33
34 #define NOT_DONE 0x7fffffff /* used while an emulated sync operation is in progress */
35
36 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
37 int64_t sector_num,
38 QEMUIOVector *qiov,
39 int nb_sectors,
40 BdrvRequestFlags flags,
41 BlockCompletionFunc *cb,
42 void *opaque,
43 bool is_write);
44 static void coroutine_fn bdrv_co_do_rw(void *opaque);
45 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
46 int64_t offset, int count, BdrvRequestFlags flags);
47
48 static void bdrv_parent_drained_begin(BlockDriverState *bs)
49 {
50 BdrvChild *c;
51
52 QLIST_FOREACH(c, &bs->parents, next_parent) {
53 if (c->role->drained_begin) {
54 c->role->drained_begin(c);
55 }
56 }
57 }
58
59 static void bdrv_parent_drained_end(BlockDriverState *bs)
60 {
61 BdrvChild *c;
62
63 QLIST_FOREACH(c, &bs->parents, next_parent) {
64 if (c->role->drained_end) {
65 c->role->drained_end(c);
66 }
67 }
68 }
69
70 void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
71 {
72 BlockDriver *drv = bs->drv;
73 Error *local_err = NULL;
74
75 memset(&bs->bl, 0, sizeof(bs->bl));
76
77 if (!drv) {
78 return;
79 }
80
81 /* Take some limits from the children as a default */
82 if (bs->file) {
83 bdrv_refresh_limits(bs->file->bs, &local_err);
84 if (local_err) {
85 error_propagate(errp, local_err);
86 return;
87 }
88 bs->bl.opt_transfer_length = bs->file->bs->bl.opt_transfer_length;
89 bs->bl.max_transfer_length = bs->file->bs->bl.max_transfer_length;
90 bs->bl.min_mem_alignment = bs->file->bs->bl.min_mem_alignment;
91 bs->bl.opt_mem_alignment = bs->file->bs->bl.opt_mem_alignment;
92 bs->bl.max_iov = bs->file->bs->bl.max_iov;
93 } else {
94 bs->bl.min_mem_alignment = 512;
95 bs->bl.opt_mem_alignment = getpagesize();
96
97 /* Safe default since most protocols use readv()/writev()/etc */
98 bs->bl.max_iov = IOV_MAX;
99 }
100
101 if (bs->backing) {
102 bdrv_refresh_limits(bs->backing->bs, &local_err);
103 if (local_err) {
104 error_propagate(errp, local_err);
105 return;
106 }
107 bs->bl.opt_transfer_length =
108 MAX(bs->bl.opt_transfer_length,
109 bs->backing->bs->bl.opt_transfer_length);
110 bs->bl.max_transfer_length =
111 MIN_NON_ZERO(bs->bl.max_transfer_length,
112 bs->backing->bs->bl.max_transfer_length);
113 bs->bl.opt_mem_alignment =
114 MAX(bs->bl.opt_mem_alignment,
115 bs->backing->bs->bl.opt_mem_alignment);
116 bs->bl.min_mem_alignment =
117 MAX(bs->bl.min_mem_alignment,
118 bs->backing->bs->bl.min_mem_alignment);
119 bs->bl.max_iov =
120 MIN(bs->bl.max_iov,
121 bs->backing->bs->bl.max_iov);
122 }
123
124 /* Then let the driver override it */
125 if (drv->bdrv_refresh_limits) {
126 drv->bdrv_refresh_limits(bs, errp);
127 }
128 }
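/*
 * Added note (commentary, not upstream code): when the limits from
 * bs->backing are merged in above, alignment and optimal-transfer values are
 * combined with MAX() (the stricter requirement wins), while hard maxima such
 * as max_transfer_length and max_iov use MIN_NON_ZERO()/MIN() so that the
 * smaller real limit applies without an unset (zero) child value clamping
 * the parent to zero.
 */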
129
130 /**
131 * The copy-on-read flag is actually a reference count, so multiple users may
132 * use the feature without worrying about clobbering its previous state.
133 * Copy-on-read stays enabled until all users have disabled it again.
134 */
135 void bdrv_enable_copy_on_read(BlockDriverState *bs)
136 {
137 bs->copy_on_read++;
138 }
139
140 void bdrv_disable_copy_on_read(BlockDriverState *bs)
141 {
142 assert(bs->copy_on_read > 0);
143 bs->copy_on_read--;
144 }
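/*
 * Usage sketch (illustrative only, not part of this file): because
 * copy-on-read is reference counted, nested users simply pair the calls.
 * A hypothetical streaming job could do:
 *
 *     bdrv_enable_copy_on_read(bs);
 *     ... issue reads that should populate the top image ...
 *     bdrv_disable_copy_on_read(bs);
 *
 * Copy-on-read stays active as long as at least one such pair is still open.
 */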
145
146 /* Check if any requests are in-flight (including throttled requests) */
147 bool bdrv_requests_pending(BlockDriverState *bs)
148 {
149 BdrvChild *child;
150
151 if (!QLIST_EMPTY(&bs->tracked_requests)) {
152 return true;
153 }
154
155 QLIST_FOREACH(child, &bs->children, next) {
156 if (bdrv_requests_pending(child->bs)) {
157 return true;
158 }
159 }
160
161 return false;
162 }
163
164 static void bdrv_drain_recurse(BlockDriverState *bs)
165 {
166 BdrvChild *child;
167
168 if (bs->drv && bs->drv->bdrv_drain) {
169 bs->drv->bdrv_drain(bs);
170 }
171 QLIST_FOREACH(child, &bs->children, next) {
172 bdrv_drain_recurse(child->bs);
173 }
174 }
175
176 typedef struct {
177 Coroutine *co;
178 BlockDriverState *bs;
179 QEMUBH *bh;
180 bool done;
181 } BdrvCoDrainData;
182
183 static void bdrv_drain_poll(BlockDriverState *bs)
184 {
185 bool busy = true;
186
187 while (busy) {
188 /* Keep iterating */
189 busy = bdrv_requests_pending(bs);
190 busy |= aio_poll(bdrv_get_aio_context(bs), busy);
191 }
192 }
193
194 static void bdrv_co_drain_bh_cb(void *opaque)
195 {
196 BdrvCoDrainData *data = opaque;
197 Coroutine *co = data->co;
198
199 qemu_bh_delete(data->bh);
200 bdrv_drain_poll(data->bs);
201 data->done = true;
202 qemu_coroutine_enter(co, NULL);
203 }
204
205 static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
206 {
207 BdrvCoDrainData data;
208
209 /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
210 * other coroutines run if they were queued from
211 * qemu_co_queue_run_restart(). */
212
213 assert(qemu_in_coroutine());
214 data = (BdrvCoDrainData) {
215 .co = qemu_coroutine_self(),
216 .bs = bs,
217 .done = false,
218 .bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_drain_bh_cb, &data),
219 };
220 qemu_bh_schedule(data.bh);
221
222 qemu_coroutine_yield();
223 /* If we are resumed from some other event (such as an aio completion or a
224 * timer callback), it is a bug in the caller that should be fixed. */
225 assert(data.done);
226 }
227
228 void bdrv_drained_begin(BlockDriverState *bs)
229 {
230 if (!bs->quiesce_counter++) {
231 aio_disable_external(bdrv_get_aio_context(bs));
232 bdrv_parent_drained_begin(bs);
233 }
234
235 bdrv_io_unplugged_begin(bs);
236 bdrv_drain_recurse(bs);
237 if (qemu_in_coroutine()) {
238 bdrv_co_yield_to_drain(bs);
239 } else {
240 bdrv_drain_poll(bs);
241 }
242 bdrv_io_unplugged_end(bs);
243 }
244
245 void bdrv_drained_end(BlockDriverState *bs)
246 {
247 assert(bs->quiesce_counter > 0);
248 if (--bs->quiesce_counter > 0) {
249 return;
250 }
251
252 bdrv_parent_drained_end(bs);
253 aio_enable_external(bdrv_get_aio_context(bs));
254 }
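/*
 * Usage sketch (illustrative only): bdrv_drained_begin()/bdrv_drained_end()
 * bracket a "drained section" during which no new external requests are
 * processed and all in-flight requests have completed:
 *
 *     bdrv_drained_begin(bs);
 *     ... safely modify the graph or internal state of bs ...
 *     bdrv_drained_end(bs);
 *
 * The quiesce_counter makes nesting safe: only the outermost begin disables
 * external events, and only the matching outermost end re-enables them.
 */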
255
256 /*
257 * Wait for pending requests to complete on a single BlockDriverState subtree,
258 * and suspend the block driver's internal I/O until the next request arrives.
259 *
260 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState's
261 * AioContext.
262 *
263 * Only this BlockDriverState's AioContext is run, so in-flight requests must
264 * not depend on events in other AioContexts. In that case, use
265 * bdrv_drain_all() instead.
266 */
267 void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
268 {
269 assert(qemu_in_coroutine());
270 bdrv_drained_begin(bs);
271 bdrv_drained_end(bs);
272 }
273
274 void bdrv_drain(BlockDriverState *bs)
275 {
276 bdrv_drained_begin(bs);
277 bdrv_drained_end(bs);
278 }
279
280 /*
281 * Wait for pending requests to complete across all BlockDriverStates
282 *
283 * This function does not flush data to disk, use bdrv_flush_all() for that
284 * after calling this function.
285 */
286 void bdrv_drain_all(void)
287 {
288 /* Always run first iteration so any pending completion BHs run */
289 bool busy = true;
290 BlockDriverState *bs;
291 BdrvNextIterator it;
292 GSList *aio_ctxs = NULL, *ctx;
293
294 for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
295 AioContext *aio_context = bdrv_get_aio_context(bs);
296
297 aio_context_acquire(aio_context);
298 if (bs->job) {
299 block_job_pause(bs->job);
300 }
301 bdrv_parent_drained_begin(bs);
302 bdrv_io_unplugged_begin(bs);
303 bdrv_drain_recurse(bs);
304 aio_context_release(aio_context);
305
306 if (!g_slist_find(aio_ctxs, aio_context)) {
307 aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
308 }
309 }
310
311 /* Note that completion of an asynchronous I/O operation can trigger any
312 * number of other I/O operations on other devices---for example a
313 * coroutine can submit an I/O request to another device in response to
314 * request completion. Therefore we must keep looping until there is no
315 * more activity, rather than simply draining each device independently.
316 */
317 while (busy) {
318 busy = false;
319
320 for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
321 AioContext *aio_context = ctx->data;
322
323 aio_context_acquire(aio_context);
324 for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
325 if (aio_context == bdrv_get_aio_context(bs)) {
326 if (bdrv_requests_pending(bs)) {
327 busy = true;
328 aio_poll(aio_context, busy);
329 }
330 }
331 }
332 busy |= aio_poll(aio_context, false);
333 aio_context_release(aio_context);
334 }
335 }
336
337 for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
338 AioContext *aio_context = bdrv_get_aio_context(bs);
339
340 aio_context_acquire(aio_context);
341 bdrv_io_unplugged_end(bs);
342 bdrv_parent_drained_end(bs);
343 if (bs->job) {
344 block_job_resume(bs->job);
345 }
346 aio_context_release(aio_context);
347 }
348 g_slist_free(aio_ctxs);
349 }
350
351 /**
352 * Remove an active request from the tracked requests list
353 *
354 * This function should be called when a tracked request is completing.
355 */
356 static void tracked_request_end(BdrvTrackedRequest *req)
357 {
358 if (req->serialising) {
359 req->bs->serialising_in_flight--;
360 }
361
362 QLIST_REMOVE(req, list);
363 qemu_co_queue_restart_all(&req->wait_queue);
364 }
365
366 /**
367 * Add an active request to the tracked requests list
368 */
369 static void tracked_request_begin(BdrvTrackedRequest *req,
370 BlockDriverState *bs,
371 int64_t offset,
372 unsigned int bytes,
373 enum BdrvTrackedRequestType type)
374 {
375 *req = (BdrvTrackedRequest){
376 .bs = bs,
377 .offset = offset,
378 .bytes = bytes,
379 .type = type,
380 .co = qemu_coroutine_self(),
381 .serialising = false,
382 .overlap_offset = offset,
383 .overlap_bytes = bytes,
384 };
385
386 qemu_co_queue_init(&req->wait_queue);
387
388 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
389 }
390
391 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
392 {
393 int64_t overlap_offset = req->offset & ~(align - 1);
394 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
395 - overlap_offset;
396
397 if (!req->serialising) {
398 req->bs->serialising_in_flight++;
399 req->serialising = true;
400 }
401
402 req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
403 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
404 }
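/*
 * Worked example (hypothetical numbers): for a request with offset=4097,
 * bytes=10 and align=4096, the serialising window above becomes
 * overlap_offset=4096 and overlap_bytes=4096, i.e. the whole aligned block
 * containing the request, so any overlapping request must wait for it.
 */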
405
406 /**
407 * Round a region to cluster boundaries (sector-based)
408 */
409 void bdrv_round_sectors_to_clusters(BlockDriverState *bs,
410 int64_t sector_num, int nb_sectors,
411 int64_t *cluster_sector_num,
412 int *cluster_nb_sectors)
413 {
414 BlockDriverInfo bdi;
415
416 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
417 *cluster_sector_num = sector_num;
418 *cluster_nb_sectors = nb_sectors;
419 } else {
420 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
421 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
422 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
423 nb_sectors, c);
424 }
425 }
426
427 /**
428 * Round a region to cluster boundaries
429 */
430 void bdrv_round_to_clusters(BlockDriverState *bs,
431 int64_t offset, unsigned int bytes,
432 int64_t *cluster_offset,
433 unsigned int *cluster_bytes)
434 {
435 BlockDriverInfo bdi;
436
437 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
438 *cluster_offset = offset;
439 *cluster_bytes = bytes;
440 } else {
441 int64_t c = bdi.cluster_size;
442 *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
443 *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
444 }
445 }
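/*
 * Worked example (hypothetical numbers): with a 64 KiB cluster size, a
 * request at offset=70000, bytes=1000 is widened to cluster_offset=65536 and
 * cluster_bytes=65536, i.e. QEMU_ALIGN_DOWN(70000, 65536) and
 * QEMU_ALIGN_UP(70000 - 65536 + 1000, 65536).
 */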
446
447 static int bdrv_get_cluster_size(BlockDriverState *bs)
448 {
449 BlockDriverInfo bdi;
450 int ret;
451
452 ret = bdrv_get_info(bs, &bdi);
453 if (ret < 0 || bdi.cluster_size == 0) {
454 return bs->request_alignment;
455 } else {
456 return bdi.cluster_size;
457 }
458 }
459
460 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
461 int64_t offset, unsigned int bytes)
462 {
463 /* aaaa bbbb */
464 if (offset >= req->overlap_offset + req->overlap_bytes) {
465 return false;
466 }
467 /* bbbb aaaa */
468 if (req->overlap_offset >= offset + bytes) {
469 return false;
470 }
471 return true;
472 }
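/*
 * Added note: both ranges are treated as half-open intervals
 * [offset, offset + bytes), so requests that merely touch at a boundary do
 * not count as overlapping.
 */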
473
474 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
475 {
476 BlockDriverState *bs = self->bs;
477 BdrvTrackedRequest *req;
478 bool retry;
479 bool waited = false;
480
481 if (!bs->serialising_in_flight) {
482 return false;
483 }
484
485 do {
486 retry = false;
487 QLIST_FOREACH(req, &bs->tracked_requests, list) {
488 if (req == self || (!req->serialising && !self->serialising)) {
489 continue;
490 }
491 if (tracked_request_overlaps(req, self->overlap_offset,
492 self->overlap_bytes))
493 {
494 /* Hitting this means there was a reentrant request, for
495 * example, a block driver issuing nested requests. This must
496 * never happen since it means deadlock.
497 */
498 assert(qemu_coroutine_self() != req->co);
499
500 /* If the request is already (indirectly) waiting for us, or
501 * will wait for us as soon as it wakes up, then just go on
502 * (instead of producing a deadlock in the former case). */
503 if (!req->waiting_for) {
504 self->waiting_for = req;
505 qemu_co_queue_wait(&req->wait_queue);
506 self->waiting_for = NULL;
507 retry = true;
508 waited = true;
509 break;
510 }
511 }
512 }
513 } while (retry);
514
515 return waited;
516 }
517
518 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
519 size_t size)
520 {
521 if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
522 return -EIO;
523 }
524
525 if (!bdrv_is_inserted(bs)) {
526 return -ENOMEDIUM;
527 }
528
529 if (offset < 0) {
530 return -EIO;
531 }
532
533 return 0;
534 }
535
536 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
537 int nb_sectors)
538 {
539 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
540 return -EIO;
541 }
542
543 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
544 nb_sectors * BDRV_SECTOR_SIZE);
545 }
546
547 typedef struct RwCo {
548 BlockDriverState *bs;
549 int64_t offset;
550 QEMUIOVector *qiov;
551 bool is_write;
552 int ret;
553 BdrvRequestFlags flags;
554 } RwCo;
555
556 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
557 {
558 RwCo *rwco = opaque;
559
560 if (!rwco->is_write) {
561 rwco->ret = bdrv_co_preadv(rwco->bs, rwco->offset,
562 rwco->qiov->size, rwco->qiov,
563 rwco->flags);
564 } else {
565 rwco->ret = bdrv_co_pwritev(rwco->bs, rwco->offset,
566 rwco->qiov->size, rwco->qiov,
567 rwco->flags);
568 }
569 }
570
571 /*
572 * Process a vectored synchronous request using coroutines
573 */
574 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
575 QEMUIOVector *qiov, bool is_write,
576 BdrvRequestFlags flags)
577 {
578 Coroutine *co;
579 RwCo rwco = {
580 .bs = bs,
581 .offset = offset,
582 .qiov = qiov,
583 .is_write = is_write,
584 .ret = NOT_DONE,
585 .flags = flags,
586 };
587
588 if (qemu_in_coroutine()) {
589 /* Fast-path if already in coroutine context */
590 bdrv_rw_co_entry(&rwco);
591 } else {
592 AioContext *aio_context = bdrv_get_aio_context(bs);
593
594 co = qemu_coroutine_create(bdrv_rw_co_entry);
595 qemu_coroutine_enter(co, &rwco);
596 while (rwco.ret == NOT_DONE) {
597 aio_poll(aio_context, true);
598 }
599 }
600 return rwco.ret;
601 }
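/*
 * Added note: this is the standard "synchronous wrapper over a coroutine"
 * pattern used below by bdrv_read()/bdrv_write()/bdrv_pread()/bdrv_pwrite():
 * when not already in coroutine context, a coroutine is created for the
 * request and the BlockDriverState's AioContext is polled until rwco.ret
 * leaves the NOT_DONE marker.
 */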
602
603 /*
604 * Process a synchronous request using coroutines
605 */
606 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
607 int nb_sectors, bool is_write, BdrvRequestFlags flags)
608 {
609 QEMUIOVector qiov;
610 struct iovec iov = {
611 .iov_base = (void *)buf,
612 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
613 };
614
615 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
616 return -EINVAL;
617 }
618
619 qemu_iovec_init_external(&qiov, &iov, 1);
620 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
621 &qiov, is_write, flags);
622 }
623
624 /* return < 0 if error. See bdrv_write() for the return codes */
625 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
626 uint8_t *buf, int nb_sectors)
627 {
628 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
629 }
630
631 /* Return < 0 if error. Important errors are:
632 -EIO generic I/O error (may happen for all errors)
633 -ENOMEDIUM No media inserted.
634 -EINVAL Invalid sector number or nb_sectors
635 -EACCES Trying to write a read-only device
636 */
637 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
638 const uint8_t *buf, int nb_sectors)
639 {
640 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
641 }
642
643 int bdrv_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
644 int count, BdrvRequestFlags flags)
645 {
646 QEMUIOVector qiov;
647 struct iovec iov = {
648 .iov_base = NULL,
649 .iov_len = count,
650 };
651
652 qemu_iovec_init_external(&qiov, &iov, 1);
653 return bdrv_prwv_co(bs, offset, &qiov, true,
654 BDRV_REQ_ZERO_WRITE | flags);
655 }
656
657 /*
658 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
659 * The operation is sped up by checking the block status and only writing
660 * zeroes to regions of the device that do not already read back as zeroes. Optional
661 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
662 * BDRV_REQ_FUA).
663 *
664 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
665 */
666 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
667 {
668 int64_t target_sectors, ret, nb_sectors, sector_num = 0;
669 BlockDriverState *file;
670 int n;
671
672 target_sectors = bdrv_nb_sectors(bs);
673 if (target_sectors < 0) {
674 return target_sectors;
675 }
676
677 for (;;) {
678 nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
679 if (nb_sectors <= 0) {
680 return 0;
681 }
682 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n, &file);
683 if (ret < 0) {
684 error_report("error getting block status at sector %" PRId64 ": %s",
685 sector_num, strerror(-ret));
686 return ret;
687 }
688 if (ret & BDRV_BLOCK_ZERO) {
689 sector_num += n;
690 continue;
691 }
692 ret = bdrv_pwrite_zeroes(bs, sector_num << BDRV_SECTOR_BITS,
693 n << BDRV_SECTOR_BITS, flags);
694 if (ret < 0) {
695 error_report("error writing zeroes at sector %" PRId64 ": %s",
696 sector_num, strerror(-ret));
697 return ret;
698 }
699 sector_num += n;
700 }
701 }
702
703 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
704 {
705 QEMUIOVector qiov;
706 struct iovec iov = {
707 .iov_base = (void *)buf,
708 .iov_len = bytes,
709 };
710 int ret;
711
712 if (bytes < 0) {
713 return -EINVAL;
714 }
715
716 qemu_iovec_init_external(&qiov, &iov, 1);
717 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
718 if (ret < 0) {
719 return ret;
720 }
721
722 return bytes;
723 }
724
725 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
726 {
727 int ret;
728
729 ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
730 if (ret < 0) {
731 return ret;
732 }
733
734 return qiov->size;
735 }
736
737 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
738 const void *buf, int bytes)
739 {
740 QEMUIOVector qiov;
741 struct iovec iov = {
742 .iov_base = (void *) buf,
743 .iov_len = bytes,
744 };
745
746 if (bytes < 0) {
747 return -EINVAL;
748 }
749
750 qemu_iovec_init_external(&qiov, &iov, 1);
751 return bdrv_pwritev(bs, offset, &qiov);
752 }
753
754 /*
755 * Writes to the file and ensures that no writes are reordered across this
756 * request (acts as a barrier)
757 *
758 * Returns 0 on success, -errno in error cases.
759 */
760 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
761 const void *buf, int count)
762 {
763 int ret;
764
765 ret = bdrv_pwrite(bs, offset, buf, count);
766 if (ret < 0) {
767 return ret;
768 }
769
770 ret = bdrv_flush(bs);
771 if (ret < 0) {
772 return ret;
773 }
774
775 return 0;
776 }
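/*
 * Usage sketch (hypothetical caller, e.g. a format driver updating its
 * header at byte offset 0):
 *
 *     uint8_t header[512];
 *     ... fill in header ...
 *     ret = bdrv_pwrite_sync(bs, 0, header, sizeof(header));
 *     if (ret < 0) {
 *         return ret;   // write or flush failed
 *     }
 *
 * The explicit flush makes the header update behave as a write barrier.
 */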
777
778 typedef struct CoroutineIOCompletion {
779 Coroutine *coroutine;
780 int ret;
781 } CoroutineIOCompletion;
782
783 static void bdrv_co_io_em_complete(void *opaque, int ret)
784 {
785 CoroutineIOCompletion *co = opaque;
786
787 co->ret = ret;
788 qemu_coroutine_enter(co->coroutine, NULL);
789 }
790
791 static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
792 uint64_t offset, uint64_t bytes,
793 QEMUIOVector *qiov, int flags)
794 {
795 BlockDriver *drv = bs->drv;
796 int64_t sector_num;
797 unsigned int nb_sectors;
798
799 assert(!(flags & ~BDRV_REQ_MASK));
800
801 if (drv->bdrv_co_preadv) {
802 return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
803 }
804
805 sector_num = offset >> BDRV_SECTOR_BITS;
806 nb_sectors = bytes >> BDRV_SECTOR_BITS;
807
808 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
809 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
810 assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
811
812 if (drv->bdrv_co_readv) {
813 return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
814 } else {
815 BlockAIOCB *acb;
816 CoroutineIOCompletion co = {
817 .coroutine = qemu_coroutine_self(),
818 };
819
820 acb = bs->drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
821 bdrv_co_io_em_complete, &co);
822 if (acb == NULL) {
823 return -EIO;
824 } else {
825 qemu_coroutine_yield();
826 return co.ret;
827 }
828 }
829 }
830
831 static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
832 uint64_t offset, uint64_t bytes,
833 QEMUIOVector *qiov, int flags)
834 {
835 BlockDriver *drv = bs->drv;
836 int64_t sector_num;
837 unsigned int nb_sectors;
838 int ret;
839
840 assert(!(flags & ~BDRV_REQ_MASK));
841
842 if (drv->bdrv_co_pwritev) {
843 ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
844 flags & bs->supported_write_flags);
845 flags &= ~bs->supported_write_flags;
846 goto emulate_flags;
847 }
848
849 sector_num = offset >> BDRV_SECTOR_BITS;
850 nb_sectors = bytes >> BDRV_SECTOR_BITS;
851
852 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
853 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
854 assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
855
856 if (drv->bdrv_co_writev_flags) {
857 ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
858 flags & bs->supported_write_flags);
859 flags &= ~bs->supported_write_flags;
860 } else if (drv->bdrv_co_writev) {
861 assert(!bs->supported_write_flags);
862 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
863 } else {
864 BlockAIOCB *acb;
865 CoroutineIOCompletion co = {
866 .coroutine = qemu_coroutine_self(),
867 };
868
869 acb = bs->drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
870 bdrv_co_io_em_complete, &co);
871 if (acb == NULL) {
872 ret = -EIO;
873 } else {
874 qemu_coroutine_yield();
875 ret = co.ret;
876 }
877 }
878
879 emulate_flags:
880 if (ret == 0 && (flags & BDRV_REQ_FUA)) {
881 ret = bdrv_co_flush(bs);
882 }
883
884 return ret;
885 }
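/*
 * Added note: the emulate_flags path above implements FUA emulation. Any
 * BDRV_REQ_FUA bit that the driver did not advertise in
 * bs->supported_write_flags is left set in 'flags' and is satisfied by an
 * explicit bdrv_co_flush() once the write itself has succeeded.
 */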
886
887 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
888 int64_t offset, unsigned int bytes, QEMUIOVector *qiov)
889 {
890 /* Perform I/O through a temporary buffer so that users who scribble over
891 * their read buffer while the operation is in progress do not end up
892 * modifying the image file. This is critical for zero-copy guest I/O
893 * where anything might happen inside guest memory.
894 */
895 void *bounce_buffer;
896
897 BlockDriver *drv = bs->drv;
898 struct iovec iov;
899 QEMUIOVector bounce_qiov;
900 int64_t cluster_offset;
901 unsigned int cluster_bytes;
902 size_t skip_bytes;
903 int ret;
904
905 /* Cover the entire cluster so that no additional backing file I/O is
906 * required when a cluster is allocated in the image file.
907 */
908 bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
909
910 trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
911 cluster_offset, cluster_bytes);
912
913 iov.iov_len = cluster_bytes;
914 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
915 if (bounce_buffer == NULL) {
916 ret = -ENOMEM;
917 goto err;
918 }
919
920 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
921
922 ret = bdrv_driver_preadv(bs, cluster_offset, cluster_bytes,
923 &bounce_qiov, 0);
924 if (ret < 0) {
925 goto err;
926 }
927
928 if (drv->bdrv_co_pwrite_zeroes &&
929 buffer_is_zero(bounce_buffer, iov.iov_len)) {
930 ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, cluster_bytes, 0);
931 } else {
932 /* This does not change the data on the disk, so it is not necessary
933 * to flush even in cache=writethrough mode.
934 */
935 ret = bdrv_driver_pwritev(bs, cluster_offset, cluster_bytes,
936 &bounce_qiov, 0);
937 }
938
939 if (ret < 0) {
940 /* It might be okay to ignore write errors for guest requests. If this
941 * is a deliberate copy-on-read, however, we don't want to ignore the
942 * error, so simply report it in all cases.
943 */
944 goto err;
945 }
946
947 skip_bytes = offset - cluster_offset;
948 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, bytes);
949
950 err:
951 qemu_vfree(bounce_buffer);
952 return ret;
953 }
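/*
 * Added summary: copy-on-read reads the whole surrounding cluster into a
 * bounce buffer, writes it back into the top image (as zeroes if the buffer
 * turned out to be all-zero and the driver supports pwrite_zeroes), and only
 * then copies the requested sub-range into the caller's qiov, so a guest that
 * scribbles over its own read buffer cannot corrupt the image file.
 */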
954
955 /*
956 * Forwards an already correctly aligned request to the BlockDriver. This
957 * handles copy on read and zeroing after EOF; any other features must be
958 * implemented by the caller.
959 */
960 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
961 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
962 int64_t align, QEMUIOVector *qiov, int flags)
963 {
964 int ret;
965
966 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
967 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
968
969 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
970 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
971 assert(!qiov || bytes == qiov->size);
972 assert((bs->open_flags & BDRV_O_NO_IO) == 0);
973 assert(!(flags & ~BDRV_REQ_MASK));
974
975 /* Handle Copy on Read and associated serialisation */
976 if (flags & BDRV_REQ_COPY_ON_READ) {
977 /* If we touch the same cluster it counts as an overlap. This
978 * guarantees that allocating writes will be serialized and not race
979 * with each other for the same cluster. For example, in copy-on-read
980 * it ensures that the CoR read and write operations are atomic and
981 * guest writes cannot interleave between them. */
982 mark_request_serialising(req, bdrv_get_cluster_size(bs));
983 }
984
985 if (!(flags & BDRV_REQ_NO_SERIALISING)) {
986 wait_serialising_requests(req);
987 }
988
989 if (flags & BDRV_REQ_COPY_ON_READ) {
990 int pnum;
991
992 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
993 if (ret < 0) {
994 goto out;
995 }
996
997 if (!ret || pnum != nb_sectors) {
998 ret = bdrv_co_do_copy_on_readv(bs, offset, bytes, qiov);
999 goto out;
1000 }
1001 }
1002
1003 /* Forward the request to the BlockDriver */
1004 if (!bs->zero_beyond_eof) {
1005 ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
1006 } else {
1007 /* Read zeros after EOF */
1008 int64_t total_sectors, max_nb_sectors;
1009
1010 total_sectors = bdrv_nb_sectors(bs);
1011 if (total_sectors < 0) {
1012 ret = total_sectors;
1013 goto out;
1014 }
1015
1016 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
1017 align >> BDRV_SECTOR_BITS);
1018 if (nb_sectors < max_nb_sectors) {
1019 ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
1020 } else if (max_nb_sectors > 0) {
1021 QEMUIOVector local_qiov;
1022
1023 qemu_iovec_init(&local_qiov, qiov->niov);
1024 qemu_iovec_concat(&local_qiov, qiov, 0,
1025 max_nb_sectors * BDRV_SECTOR_SIZE);
1026
1027 ret = bdrv_driver_preadv(bs, offset,
1028 max_nb_sectors * BDRV_SECTOR_SIZE,
1029 &local_qiov, 0);
1030
1031 qemu_iovec_destroy(&local_qiov);
1032 } else {
1033 ret = 0;
1034 }
1035
1036 /* Reading beyond end of file is supposed to produce zeroes */
1037 if (ret == 0 && total_sectors < sector_num + nb_sectors) {
1038 uint64_t offset = MAX(0, total_sectors - sector_num);
1039 uint64_t bytes = (sector_num + nb_sectors - offset) *
1040 BDRV_SECTOR_SIZE;
1041 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
1042 }
1043 }
1044
1045 out:
1046 return ret;
1047 }
1048
1049 /*
1050 * Handle a read request in coroutine context
1051 */
1052 int coroutine_fn bdrv_co_preadv(BlockDriverState *bs,
1053 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1054 BdrvRequestFlags flags)
1055 {
1056 BlockDriver *drv = bs->drv;
1057 BdrvTrackedRequest req;
1058
1059 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
1060 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
1061 uint8_t *head_buf = NULL;
1062 uint8_t *tail_buf = NULL;
1063 QEMUIOVector local_qiov;
1064 bool use_local_qiov = false;
1065 int ret;
1066
1067 if (!drv) {
1068 return -ENOMEDIUM;
1069 }
1070
1071 ret = bdrv_check_byte_request(bs, offset, bytes);
1072 if (ret < 0) {
1073 return ret;
1074 }
1075
1076 /* Don't do copy-on-read if we read data before a write operation */
1077 if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) {
1078 flags |= BDRV_REQ_COPY_ON_READ;
1079 }
1080
1081 /* Align read if necessary by padding qiov */
1082 if (offset & (align - 1)) {
1083 head_buf = qemu_blockalign(bs, align);
1084 qemu_iovec_init(&local_qiov, qiov->niov + 2);
1085 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1086 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1087 use_local_qiov = true;
1088
1089 bytes += offset & (align - 1);
1090 offset = offset & ~(align - 1);
1091 }
1092
1093 if ((offset + bytes) & (align - 1)) {
1094 if (!use_local_qiov) {
1095 qemu_iovec_init(&local_qiov, qiov->niov + 1);
1096 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1097 use_local_qiov = true;
1098 }
1099 tail_buf = qemu_blockalign(bs, align);
1100 qemu_iovec_add(&local_qiov, tail_buf,
1101 align - ((offset + bytes) & (align - 1)));
1102
1103 bytes = ROUND_UP(bytes, align);
1104 }
1105
1106 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
1107 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
1108 use_local_qiov ? &local_qiov : qiov,
1109 flags);
1110 tracked_request_end(&req);
1111
1112 if (use_local_qiov) {
1113 qemu_iovec_destroy(&local_qiov);
1114 qemu_vfree(head_buf);
1115 qemu_vfree(tail_buf);
1116 }
1117
1118 return ret;
1119 }
1120
1121 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1122 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1123 BdrvRequestFlags flags)
1124 {
1125 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1126 return -EINVAL;
1127 }
1128
1129 return bdrv_co_preadv(bs, sector_num << BDRV_SECTOR_BITS,
1130 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1131 }
1132
1133 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
1134 int nb_sectors, QEMUIOVector *qiov)
1135 {
1136 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
1137
1138 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
1139 }
1140
1141 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
1142
1143 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
1144 int64_t offset, int count, BdrvRequestFlags flags)
1145 {
1146 BlockDriver *drv = bs->drv;
1147 QEMUIOVector qiov;
1148 struct iovec iov = {0};
1149 int ret = 0;
1150 bool need_flush = false;
1151 int head = 0;
1152 int tail = 0;
1153
1154 int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
1155 int alignment = MAX(bs->bl.pwrite_zeroes_alignment ?: 1,
1156 bs->request_alignment);
1157
1158 assert(is_power_of_2(alignment));
1159 head = offset & (alignment - 1);
1160 tail = (offset + count) & (alignment - 1);
1161 max_write_zeroes &= ~(alignment - 1);
1162
1163 while (count > 0 && !ret) {
1164 int num = count;
1165
1166 /* Align the request. Block drivers can expect that the "bulk" of the
1167 * request is aligned and that unaligned requests do not cross cluster
1168 * boundaries.
1169 */
1170 if (head) {
1171 /* Make a small request up to the first aligned sector. */
1172 num = MIN(count, alignment - head);
1173 head = 0;
1174 } else if (tail && num > alignment) {
1175 /* Shorten the request to the last aligned sector. */
1176 num -= tail;
1177 }
1178
1179 /* limit request size */
1180 if (num > max_write_zeroes) {
1181 num = max_write_zeroes;
1182 }
1183
1184 ret = -ENOTSUP;
1185 /* First try the efficient write zeroes operation */
1186 if (drv->bdrv_co_pwrite_zeroes) {
1187 ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
1188 flags & bs->supported_zero_flags);
1189 if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
1190 !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
1191 need_flush = true;
1192 }
1193 } else {
1194 assert(!bs->supported_zero_flags);
1195 }
1196
1197 if (ret == -ENOTSUP) {
1198 /* Fall back to bounce buffer if write zeroes is unsupported */
1199 int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
1200 MAX_WRITE_ZEROES_BOUNCE_BUFFER);
1201 BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
1202
1203 if ((flags & BDRV_REQ_FUA) &&
1204 !(bs->supported_write_flags & BDRV_REQ_FUA)) {
1205 /* No need for bdrv_driver_pwrite() to do a fallback
1206 * flush on each chunk; use just one at the end */
1207 write_flags &= ~BDRV_REQ_FUA;
1208 need_flush = true;
1209 }
1210 num = MIN(num, max_xfer_len << BDRV_SECTOR_BITS);
1211 iov.iov_len = num;
1212 if (iov.iov_base == NULL) {
1213 iov.iov_base = qemu_try_blockalign(bs, num);
1214 if (iov.iov_base == NULL) {
1215 ret = -ENOMEM;
1216 goto fail;
1217 }
1218 memset(iov.iov_base, 0, num);
1219 }
1220 qemu_iovec_init_external(&qiov, &iov, 1);
1221
1222 ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags);
1223
1224 /* Keep the bounce buffer around if it is big enough for
1225 * all future requests.
1226 */
1227 if (num < max_xfer_len << BDRV_SECTOR_BITS) {
1228 qemu_vfree(iov.iov_base);
1229 iov.iov_base = NULL;
1230 }
1231 }
1232
1233 offset += num;
1234 count -= num;
1235 }
1236
1237 fail:
1238 if (ret == 0 && need_flush) {
1239 ret = bdrv_co_flush(bs);
1240 }
1241 qemu_vfree(iov.iov_base);
1242 return ret;
1243 }
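/*
 * Worked example (hypothetical numbers): with alignment = 4096, a request
 * with offset=4000, count=10000 is split into an unaligned head of 96 bytes
 * (4000..4095), an aligned middle chunk of 8192 bytes, and an unaligned tail
 * of 1712 bytes ending at byte 14000, each attempted with the driver's
 * pwrite_zeroes callback or the bounce-buffer fallback as appropriate.
 */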
1244
1245 /*
1246 * Forwards an already correctly aligned write request to the BlockDriver.
1247 */
1248 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
1249 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1250 QEMUIOVector *qiov, int flags)
1251 {
1252 BlockDriver *drv = bs->drv;
1253 bool waited;
1254 int ret;
1255
1256 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
1257 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
1258
1259 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
1260 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
1261 assert(!qiov || bytes == qiov->size);
1262 assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1263 assert(!(flags & ~BDRV_REQ_MASK));
1264
1265 waited = wait_serialising_requests(req);
1266 assert(!waited || !req->serialising);
1267 assert(req->overlap_offset <= offset);
1268 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
1269
1270 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
1271
1272 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
1273 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
1274 qemu_iovec_is_zero(qiov)) {
1275 flags |= BDRV_REQ_ZERO_WRITE;
1276 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
1277 flags |= BDRV_REQ_MAY_UNMAP;
1278 }
1279 }
1280
1281 if (ret < 0) {
1282 /* Do nothing, write notifier decided to fail this request */
1283 } else if (flags & BDRV_REQ_ZERO_WRITE) {
1284 bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
1285 ret = bdrv_co_do_pwrite_zeroes(bs, sector_num << BDRV_SECTOR_BITS,
1286 nb_sectors << BDRV_SECTOR_BITS, flags);
1287 } else {
1288 bdrv_debug_event(bs, BLKDBG_PWRITEV);
1289 ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
1290 }
1291 bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
1292
1293 bdrv_set_dirty(bs, sector_num, nb_sectors);
1294
1295 if (bs->wr_highest_offset < offset + bytes) {
1296 bs->wr_highest_offset = offset + bytes;
1297 }
1298
1299 if (ret >= 0) {
1300 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
1301 }
1302
1303 return ret;
1304 }
1305
1306 static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
1307 int64_t offset,
1308 unsigned int bytes,
1309 BdrvRequestFlags flags,
1310 BdrvTrackedRequest *req)
1311 {
1312 uint8_t *buf = NULL;
1313 QEMUIOVector local_qiov;
1314 struct iovec iov;
1315 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
1316 unsigned int head_padding_bytes, tail_padding_bytes;
1317 int ret = 0;
1318
1319 head_padding_bytes = offset & (align - 1);
1320 tail_padding_bytes = align - ((offset + bytes) & (align - 1));
1321
1322
1323 assert(flags & BDRV_REQ_ZERO_WRITE);
1324 if (head_padding_bytes || tail_padding_bytes) {
1325 buf = qemu_blockalign(bs, align);
1326 iov = (struct iovec) {
1327 .iov_base = buf,
1328 .iov_len = align,
1329 };
1330 qemu_iovec_init_external(&local_qiov, &iov, 1);
1331 }
1332 if (head_padding_bytes) {
1333 uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);
1334
1335 /* RMW the unaligned part before head. */
1336 mark_request_serialising(req, align);
1337 wait_serialising_requests(req);
1338 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1339 ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align,
1340 align, &local_qiov, 0);
1341 if (ret < 0) {
1342 goto fail;
1343 }
1344 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1345
1346 memset(buf + head_padding_bytes, 0, zero_bytes);
1347 ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align,
1348 &local_qiov,
1349 flags & ~BDRV_REQ_ZERO_WRITE);
1350 if (ret < 0) {
1351 goto fail;
1352 }
1353 offset += zero_bytes;
1354 bytes -= zero_bytes;
1355 }
1356
1357 assert(!bytes || (offset & (align - 1)) == 0);
1358 if (bytes >= align) {
1359 /* Write the aligned part in the middle. */
1360 uint64_t aligned_bytes = bytes & ~(align - 1);
1361 ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes,
1362 NULL, flags);
1363 if (ret < 0) {
1364 goto fail;
1365 }
1366 bytes -= aligned_bytes;
1367 offset += aligned_bytes;
1368 }
1369
1370 assert(!bytes || (offset & (align - 1)) == 0);
1371 if (bytes) {
1372 assert(align == tail_padding_bytes + bytes);
1373 /* RMW the unaligned part after tail. */
1374 mark_request_serialising(req, align);
1375 wait_serialising_requests(req);
1376 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1377 ret = bdrv_aligned_preadv(bs, req, offset, align,
1378 align, &local_qiov, 0);
1379 if (ret < 0) {
1380 goto fail;
1381 }
1382 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1383
1384 memset(buf, 0, bytes);
1385 ret = bdrv_aligned_pwritev(bs, req, offset, align,
1386 &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
1387 }
1388 fail:
1389 qemu_vfree(buf);
1390 return ret;
1391
1392 }
1393
1394 /*
1395 * Handle a write request in coroutine context
1396 */
1397 int coroutine_fn bdrv_co_pwritev(BlockDriverState *bs,
1398 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1399 BdrvRequestFlags flags)
1400 {
1401 BdrvTrackedRequest req;
1402 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
1403 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
1404 uint8_t *head_buf = NULL;
1405 uint8_t *tail_buf = NULL;
1406 QEMUIOVector local_qiov;
1407 bool use_local_qiov = false;
1408 int ret;
1409
1410 if (!bs->drv) {
1411 return -ENOMEDIUM;
1412 }
1413 if (bs->read_only) {
1414 return -EPERM;
1415 }
1416 assert(!(bs->open_flags & BDRV_O_INACTIVE));
1417
1418 ret = bdrv_check_byte_request(bs, offset, bytes);
1419 if (ret < 0) {
1420 return ret;
1421 }
1422
1423 /*
1424 * Align write if necessary by performing a read-modify-write cycle.
1425 * Pad qiov with the read parts and be sure to have a tracked request not
1426 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
1427 */
1428 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
1429
1430 if (!qiov) {
1431 ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req);
1432 goto out;
1433 }
1434
1435 if (offset & (align - 1)) {
1436 QEMUIOVector head_qiov;
1437 struct iovec head_iov;
1438
1439 mark_request_serialising(&req, align);
1440 wait_serialising_requests(&req);
1441
1442 head_buf = qemu_blockalign(bs, align);
1443 head_iov = (struct iovec) {
1444 .iov_base = head_buf,
1445 .iov_len = align,
1446 };
1447 qemu_iovec_init_external(&head_qiov, &head_iov, 1);
1448
1449 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1450 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
1451 align, &head_qiov, 0);
1452 if (ret < 0) {
1453 goto fail;
1454 }
1455 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1456
1457 qemu_iovec_init(&local_qiov, qiov->niov + 2);
1458 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1459 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1460 use_local_qiov = true;
1461
1462 bytes += offset & (align - 1);
1463 offset = offset & ~(align - 1);
1464
1465 /* We have read the tail already if the request is smaller
1466 * than one aligned block.
1467 */
1468 if (bytes < align) {
1469 qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes);
1470 bytes = align;
1471 }
1472 }
1473
1474 if ((offset + bytes) & (align - 1)) {
1475 QEMUIOVector tail_qiov;
1476 struct iovec tail_iov;
1477 size_t tail_bytes;
1478 bool waited;
1479
1480 mark_request_serialising(&req, align);
1481 waited = wait_serialising_requests(&req);
1482 assert(!waited || !use_local_qiov);
1483
1484 tail_buf = qemu_blockalign(bs, align);
1485 tail_iov = (struct iovec) {
1486 .iov_base = tail_buf,
1487 .iov_len = align,
1488 };
1489 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
1490
1491 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1492 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
1493 align, &tail_qiov, 0);
1494 if (ret < 0) {
1495 goto fail;
1496 }
1497 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1498
1499 if (!use_local_qiov) {
1500 qemu_iovec_init(&local_qiov, qiov->niov + 1);
1501 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1502 use_local_qiov = true;
1503 }
1504
1505 tail_bytes = (offset + bytes) & (align - 1);
1506 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
1507
1508 bytes = ROUND_UP(bytes, align);
1509 }
1510
1511 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
1512 use_local_qiov ? &local_qiov : qiov,
1513 flags);
1514
1515 fail:
1516
1517 if (use_local_qiov) {
1518 qemu_iovec_destroy(&local_qiov);
1519 }
1520 qemu_vfree(head_buf);
1521 qemu_vfree(tail_buf);
1522 out:
1523 tracked_request_end(&req);
1524 return ret;
1525 }
1526
1527 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1528 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1529 BdrvRequestFlags flags)
1530 {
1531 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1532 return -EINVAL;
1533 }
1534
1535 return bdrv_co_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
1536 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1537 }
1538
1539 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1540 int nb_sectors, QEMUIOVector *qiov)
1541 {
1542 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1543
1544 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
1545 }
1546
1547 int coroutine_fn bdrv_co_pwrite_zeroes(BlockDriverState *bs,
1548 int64_t offset, int count,
1549 BdrvRequestFlags flags)
1550 {
1551 trace_bdrv_co_pwrite_zeroes(bs, offset, count, flags);
1552
1553 if (!(bs->open_flags & BDRV_O_UNMAP)) {
1554 flags &= ~BDRV_REQ_MAY_UNMAP;
1555 }
1556
1557 return bdrv_co_pwritev(bs, offset, count, NULL,
1558 BDRV_REQ_ZERO_WRITE | flags);
1559 }
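/*
 * Added note: passing a NULL qiov together with BDRV_REQ_ZERO_WRITE is what
 * routes the request through bdrv_co_do_zero_pwritev() above, which handles
 * any unaligned head or tail with a read-modify-write cycle.
 */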
1560
1561 typedef struct BdrvCoGetBlockStatusData {
1562 BlockDriverState *bs;
1563 BlockDriverState *base;
1564 BlockDriverState **file;
1565 int64_t sector_num;
1566 int nb_sectors;
1567 int *pnum;
1568 int64_t ret;
1569 bool done;
1570 } BdrvCoGetBlockStatusData;
1571
1572 /*
1573 * Returns the allocation status of the specified sectors.
1574 * Drivers not implementing the functionality are assumed to not support
1575 * backing files, hence all their sectors are reported as allocated.
1576 *
1577 * If 'sector_num' is beyond the end of the disk image the return value is 0
1578 * and 'pnum' is set to 0.
1579 *
1580 * 'pnum' is set to the number of sectors (including and immediately following
1581 * the specified sector) that are known to be in the same
1582 * allocated/unallocated state.
1583 *
1584 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
1585 * beyond the end of the disk image it will be clamped.
1586 *
1587 * If the returned value is positive and the BDRV_BLOCK_OFFSET_VALID bit is set, 'file'
1588 * points to the BDS which the sector range is allocated in.
1589 */
1590 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
1591 int64_t sector_num,
1592 int nb_sectors, int *pnum,
1593 BlockDriverState **file)
1594 {
1595 int64_t total_sectors;
1596 int64_t n;
1597 int64_t ret, ret2;
1598
1599 total_sectors = bdrv_nb_sectors(bs);
1600 if (total_sectors < 0) {
1601 return total_sectors;
1602 }
1603
1604 if (sector_num >= total_sectors) {
1605 *pnum = 0;
1606 return 0;
1607 }
1608
1609 n = total_sectors - sector_num;
1610 if (n < nb_sectors) {
1611 nb_sectors = n;
1612 }
1613
1614 if (!bs->drv->bdrv_co_get_block_status) {
1615 *pnum = nb_sectors;
1616 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
1617 if (bs->drv->protocol_name) {
1618 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
1619 }
1620 return ret;
1621 }
1622
1623 *file = NULL;
1624 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum,
1625 file);
1626 if (ret < 0) {
1627 *pnum = 0;
1628 return ret;
1629 }
1630
1631 if (ret & BDRV_BLOCK_RAW) {
1632 assert(ret & BDRV_BLOCK_OFFSET_VALID);
1633 return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
1634 *pnum, pnum, file);
1635 }
1636
1637 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
1638 ret |= BDRV_BLOCK_ALLOCATED;
1639 } else {
1640 if (bdrv_unallocated_blocks_are_zero(bs)) {
1641 ret |= BDRV_BLOCK_ZERO;
1642 } else if (bs->backing) {
1643 BlockDriverState *bs2 = bs->backing->bs;
1644 int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
1645 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
1646 ret |= BDRV_BLOCK_ZERO;
1647 }
1648 }
1649 }
1650
1651 if (*file && *file != bs &&
1652 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
1653 (ret & BDRV_BLOCK_OFFSET_VALID)) {
1654 BlockDriverState *file2;
1655 int file_pnum;
1656
1657 ret2 = bdrv_co_get_block_status(*file, ret >> BDRV_SECTOR_BITS,
1658 *pnum, &file_pnum, &file2);
1659 if (ret2 >= 0) {
1660 /* Ignore errors. This is just providing extra information; it
1661 * is useful but not necessary.
1662 */
1663 if (!file_pnum) {
1664 /* !file_pnum indicates an offset at or beyond the EOF; it is
1665 * perfectly valid for the format block driver to point to such
1666 * offsets, so catch it and mark everything as zero */
1667 ret |= BDRV_BLOCK_ZERO;
1668 } else {
1669 /* Limit request to the range reported by the protocol driver */
1670 *pnum = file_pnum;
1671 ret |= (ret2 & BDRV_BLOCK_ZERO);
1672 }
1673 }
1674 }
1675
1676 return ret;
1677 }
1678
1679 static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs,
1680 BlockDriverState *base,
1681 int64_t sector_num,
1682 int nb_sectors,
1683 int *pnum,
1684 BlockDriverState **file)
1685 {
1686 BlockDriverState *p;
1687 int64_t ret = 0;
1688
1689 assert(bs != base);
1690 for (p = bs; p != base; p = backing_bs(p)) {
1691 ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum, file);
1692 if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) {
1693 break;
1694 }
1695 /* [sector_num, pnum] unallocated on this layer, which could be only
1696 * the first part of [sector_num, nb_sectors]. */
1697 nb_sectors = MIN(nb_sectors, *pnum);
1698 }
1699 return ret;
1700 }
1701
1702 /* Coroutine wrapper for bdrv_get_block_status_above() */
1703 static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque)
1704 {
1705 BdrvCoGetBlockStatusData *data = opaque;
1706
1707 data->ret = bdrv_co_get_block_status_above(data->bs, data->base,
1708 data->sector_num,
1709 data->nb_sectors,
1710 data->pnum,
1711 data->file);
1712 data->done = true;
1713 }
1714
1715 /*
1716 * Synchronous wrapper around bdrv_co_get_block_status_above().
1717 *
1718 * See bdrv_co_get_block_status_above() for details.
1719 */
1720 int64_t bdrv_get_block_status_above(BlockDriverState *bs,
1721 BlockDriverState *base,
1722 int64_t sector_num,
1723 int nb_sectors, int *pnum,
1724 BlockDriverState **file)
1725 {
1726 Coroutine *co;
1727 BdrvCoGetBlockStatusData data = {
1728 .bs = bs,
1729 .base = base,
1730 .file = file,
1731 .sector_num = sector_num,
1732 .nb_sectors = nb_sectors,
1733 .pnum = pnum,
1734 .done = false,
1735 };
1736
1737 if (qemu_in_coroutine()) {
1738 /* Fast-path if already in coroutine context */
1739 bdrv_get_block_status_above_co_entry(&data);
1740 } else {
1741 AioContext *aio_context = bdrv_get_aio_context(bs);
1742
1743 co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry);
1744 qemu_coroutine_enter(co, &data);
1745 while (!data.done) {
1746 aio_poll(aio_context, true);
1747 }
1748 }
1749 return data.ret;
1750 }
1751
1752 int64_t bdrv_get_block_status(BlockDriverState *bs,
1753 int64_t sector_num,
1754 int nb_sectors, int *pnum,
1755 BlockDriverState **file)
1756 {
1757 return bdrv_get_block_status_above(bs, backing_bs(bs),
1758 sector_num, nb_sectors, pnum, file);
1759 }
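/*
 * Usage sketch (illustrative only, not part of this file): a hypothetical
 * caller walking an image could do
 *
 *     int pnum;
 *     BlockDriverState *file;
 *     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors,
 *                                         &pnum, &file);
 *     if (ret < 0) {
 *         ... error ...
 *     } else if (ret & BDRV_BLOCK_ZERO) {
 *         ... the next pnum sectors read as zeroes ...
 *     } else if (ret & BDRV_BLOCK_DATA) {
 *         ... the next pnum sectors contain data in this layer ...
 *     }
 */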
1760
1761 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
1762 int nb_sectors, int *pnum)
1763 {
1764 BlockDriverState *file;
1765 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum,
1766 &file);
1767 if (ret < 0) {
1768 return ret;
1769 }
1770 return !!(ret & BDRV_BLOCK_ALLOCATED);
1771 }
1772
1773 /*
1774 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
1775 *
1776 * Return true if the given sector is allocated in any image between
1777 * BASE and TOP (inclusive). BASE can be NULL to check if the given
1778 * sector is allocated in any image of the chain. Return false otherwise.
1779 *
1780 * 'pnum' is set to the number of sectors (including and immediately following
1781 * the specified sector) that are known to be in the same
1782 * allocated/unallocated state.
1783 *
1784 */
1785 int bdrv_is_allocated_above(BlockDriverState *top,
1786 BlockDriverState *base,
1787 int64_t sector_num,
1788 int nb_sectors, int *pnum)
1789 {
1790 BlockDriverState *intermediate;
1791 int ret, n = nb_sectors;
1792
1793 intermediate = top;
1794 while (intermediate && intermediate != base) {
1795 int pnum_inter;
1796 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
1797 &pnum_inter);
1798 if (ret < 0) {
1799 return ret;
1800 } else if (ret) {
1801 *pnum = pnum_inter;
1802 return 1;
1803 }
1804
1805 /*
1806 * [sector_num, nb_sectors] is unallocated on top but intermediate
1807 * might have
1808 *
1809 * [sector_num+x, nb_sectors] allocated.
1810 */
1811 if (n > pnum_inter &&
1812 (intermediate == top ||
1813 sector_num + pnum_inter < intermediate->total_sectors)) {
1814 n = pnum_inter;
1815 }
1816
1817 intermediate = backing_bs(intermediate);
1818 }
1819
1820 *pnum = n;
1821 return 0;
1822 }
1823
1824 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
1825 const uint8_t *buf, int nb_sectors)
1826 {
1827 BlockDriver *drv = bs->drv;
1828 int ret;
1829
1830 if (!drv) {
1831 return -ENOMEDIUM;
1832 }
1833 if (!drv->bdrv_write_compressed) {
1834 return -ENOTSUP;
1835 }
1836 ret = bdrv_check_request(bs, sector_num, nb_sectors);
1837 if (ret < 0) {
1838 return ret;
1839 }
1840
1841 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
1842
1843 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
1844 }
1845
1846 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
1847 int64_t pos, int size)
1848 {
1849 QEMUIOVector qiov;
1850 struct iovec iov = {
1851 .iov_base = (void *) buf,
1852 .iov_len = size,
1853 };
1854
1855 qemu_iovec_init_external(&qiov, &iov, 1);
1856 return bdrv_writev_vmstate(bs, &qiov, pos);
1857 }
1858
1859 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
1860 {
1861 BlockDriver *drv = bs->drv;
1862
1863 if (!drv) {
1864 return -ENOMEDIUM;
1865 } else if (drv->bdrv_save_vmstate) {
1866 return drv->bdrv_save_vmstate(bs, qiov, pos);
1867 } else if (bs->file) {
1868 return bdrv_writev_vmstate(bs->file->bs, qiov, pos);
1869 }
1870
1871 return -ENOTSUP;
1872 }
1873
1874 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
1875 int64_t pos, int size)
1876 {
1877 BlockDriver *drv = bs->drv;
1878 if (!drv)
1879 return -ENOMEDIUM;
1880 if (drv->bdrv_load_vmstate)
1881 return drv->bdrv_load_vmstate(bs, buf, pos, size);
1882 if (bs->file)
1883 return bdrv_load_vmstate(bs->file->bs, buf, pos, size);
1884 return -ENOTSUP;
1885 }
1886
1887 /**************************************************************/
1888 /* async I/Os */
1889
1890 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
1891 QEMUIOVector *qiov, int nb_sectors,
1892 BlockCompletionFunc *cb, void *opaque)
1893 {
1894 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
1895
1896 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
1897 cb, opaque, false);
1898 }
1899
1900 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
1901 QEMUIOVector *qiov, int nb_sectors,
1902 BlockCompletionFunc *cb, void *opaque)
1903 {
1904 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
1905
1906 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
1907 cb, opaque, true);
1908 }
1909
1910 void bdrv_aio_cancel(BlockAIOCB *acb)
1911 {
1912 qemu_aio_ref(acb);
1913 bdrv_aio_cancel_async(acb);
1914 while (acb->refcnt > 1) {
1915 if (acb->aiocb_info->get_aio_context) {
1916 aio_poll(acb->aiocb_info->get_aio_context(acb), true);
1917 } else if (acb->bs) {
1918 aio_poll(bdrv_get_aio_context(acb->bs), true);
1919 } else {
1920 abort();
1921 }
1922 }
1923 qemu_aio_unref(acb);
1924 }
1925
1926 /* Async version of aio cancel. The caller is not blocked if the acb implements
1927 * cancel_async; otherwise we do nothing and let the request complete normally.
1928 * In either case the completion callback must be called. */
1929 void bdrv_aio_cancel_async(BlockAIOCB *acb)
1930 {
1931 if (acb->aiocb_info->cancel_async) {
1932 acb->aiocb_info->cancel_async(acb);
1933 }
1934 }
1935
1936 /**************************************************************/
1937 /* async block device emulation */
1938
1939 typedef struct BlockRequest {
1940 union {
1941 /* Used during read, write, trim */
1942 struct {
1943 int64_t sector;
1944 int nb_sectors;
1945 int flags;
1946 QEMUIOVector *qiov;
1947 };
1948 /* Used during ioctl */
1949 struct {
1950 int req;
1951 void *buf;
1952 };
1953 };
1954 BlockCompletionFunc *cb;
1955 void *opaque;
1956
1957 int error;
1958 } BlockRequest;
1959
1960 typedef struct BlockAIOCBCoroutine {
1961 BlockAIOCB common;
1962 BlockRequest req;
1963 bool is_write;
1964 bool need_bh;
1965 bool *done;
1966 QEMUBH* bh;
1967 } BlockAIOCBCoroutine;
1968
1969 static const AIOCBInfo bdrv_em_co_aiocb_info = {
1970 .aiocb_size = sizeof(BlockAIOCBCoroutine),
1971 };
1972
1973 static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
1974 {
1975 if (!acb->need_bh) {
1976 acb->common.cb(acb->common.opaque, acb->req.error);
1977 qemu_aio_unref(acb);
1978 }
1979 }
1980
1981 static void bdrv_co_em_bh(void *opaque)
1982 {
1983 BlockAIOCBCoroutine *acb = opaque;
1984
1985 assert(!acb->need_bh);
1986 qemu_bh_delete(acb->bh);
1987 bdrv_co_complete(acb);
1988 }
1989
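/*
 * The completion callback must never run before the submitting bdrv_aio_*()
 * function has returned its BlockAIOCB to the caller.  While acb->need_bh is
 * set, bdrv_co_complete() therefore does nothing; once the request has been
 * submitted, this helper clears need_bh and, if the coroutine already
 * finished synchronously, defers the completion to a bottom half instead of
 * invoking the callback directly.
 */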
1990 static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
1991 {
1992 acb->need_bh = false;
1993 if (acb->req.error != -EINPROGRESS) {
1994 BlockDriverState *bs = acb->common.bs;
1995
1996 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
1997 qemu_bh_schedule(acb->bh);
1998 }
1999 }
2000
2001 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
2002 static void coroutine_fn bdrv_co_do_rw(void *opaque)
2003 {
2004 BlockAIOCBCoroutine *acb = opaque;
2005 BlockDriverState *bs = acb->common.bs;
2006
2007 if (!acb->is_write) {
2008 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
2009 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
2010 } else {
2011 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
2012 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
2013 }
2014
2015 bdrv_co_complete(acb);
2016 }
2017
2018 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
2019 int64_t sector_num,
2020 QEMUIOVector *qiov,
2021 int nb_sectors,
2022 BdrvRequestFlags flags,
2023 BlockCompletionFunc *cb,
2024 void *opaque,
2025 bool is_write)
2026 {
2027 Coroutine *co;
2028 BlockAIOCBCoroutine *acb;
2029
2030 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2031 acb->need_bh = true;
2032 acb->req.error = -EINPROGRESS;
2033 acb->req.sector = sector_num;
2034 acb->req.nb_sectors = nb_sectors;
2035 acb->req.qiov = qiov;
2036 acb->req.flags = flags;
2037 acb->is_write = is_write;
2038
2039 co = qemu_coroutine_create(bdrv_co_do_rw);
2040 qemu_coroutine_enter(co, acb);
2041
2042 bdrv_co_maybe_schedule_bh(acb);
2043 return &acb->common;
2044 }
2045
2046 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
2047 {
2048 BlockAIOCBCoroutine *acb = opaque;
2049 BlockDriverState *bs = acb->common.bs;
2050
2051 acb->req.error = bdrv_co_flush(bs);
2052 bdrv_co_complete(acb);
2053 }
2054
2055 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
2056 BlockCompletionFunc *cb, void *opaque)
2057 {
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    trace_bdrv_aio_flush(bs, opaque);
2062
2063 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2064 acb->need_bh = true;
2065 acb->req.error = -EINPROGRESS;
2066
2067 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
2068 qemu_coroutine_enter(co, acb);
2069
2070 bdrv_co_maybe_schedule_bh(acb);
2071 return &acb->common;
2072 }
2073
2074 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
2075 {
2076 BlockAIOCBCoroutine *acb = opaque;
2077 BlockDriverState *bs = acb->common.bs;
2078
2079 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
2080 bdrv_co_complete(acb);
2081 }
2082
2083 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
2084 int64_t sector_num, int nb_sectors,
2085 BlockCompletionFunc *cb, void *opaque)
2086 {
2087 Coroutine *co;
2088 BlockAIOCBCoroutine *acb;
2089
2090 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
2091
2092 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2093 acb->need_bh = true;
2094 acb->req.error = -EINPROGRESS;
2095 acb->req.sector = sector_num;
2096 acb->req.nb_sectors = nb_sectors;
2097 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
2098 qemu_coroutine_enter(co, acb);
2099
2100 bdrv_co_maybe_schedule_bh(acb);
2101 return &acb->common;
2102 }
2103
2104 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
2105 BlockCompletionFunc *cb, void *opaque)
2106 {
2107 BlockAIOCB *acb;
2108
2109 acb = g_malloc(aiocb_info->aiocb_size);
2110 acb->aiocb_info = aiocb_info;
2111 acb->bs = bs;
2112 acb->cb = cb;
2113 acb->opaque = opaque;
2114 acb->refcnt = 1;
2115 return acb;
2116 }
2117
2118 void qemu_aio_ref(void *p)
2119 {
2120 BlockAIOCB *acb = p;
2121 acb->refcnt++;
2122 }
2123
2124 void qemu_aio_unref(void *p)
2125 {
2126 BlockAIOCB *acb = p;
2127 assert(acb->refcnt > 0);
2128 if (--acb->refcnt == 0) {
2129 g_free(acb);
2130 }
2131 }
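
/*
 * Illustrative sketch (not part of the original file) of the AIOCB contract
 * used by the helpers above, mirroring BlockAIOCBCoroutine: an AIO
 * implementation embeds BlockAIOCB as the first member of its own state
 * structure, describes the allocation size in an AIOCBInfo, obtains
 * instances with qemu_aio_get() and drops its reference with
 * qemu_aio_unref() after reporting completion.  All names prefixed with
 * Example or example_ are hypothetical.
 */
typedef struct ExampleAIOCB {
    BlockAIOCB common;      /* must be the first member, see qemu_aio_get() */
    int example_state;
} ExampleAIOCB;

static const AIOCBInfo example_aiocb_info = {
    .aiocb_size = sizeof(ExampleAIOCB),
};

static void example_complete(ExampleAIOCB *acb, int ret)
{
    acb->common.cb(acb->common.opaque, ret);    /* report the result */
    qemu_aio_unref(acb);                        /* drop the initial reference */
}

static ExampleAIOCB *example_start(BlockDriverState *bs,
                                   BlockCompletionFunc *cb, void *opaque)
{
    return qemu_aio_get(&example_aiocb_info, bs, cb, opaque);
}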
2132
2133 /**************************************************************/
2134 /* Coroutine block device emulation */
2135
2136 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
2137 {
2138 RwCo *rwco = opaque;
2139
2140 rwco->ret = bdrv_co_flush(rwco->bs);
2141 }
2142
2143 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
2144 {
2145 int ret;
2146 BdrvTrackedRequest req;
2147
2148 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
2149 bdrv_is_sg(bs)) {
2150 return 0;
2151 }
2152
2153 tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH);
2154
2155 /* Write back all layers by calling one driver function */
2156 if (bs->drv->bdrv_co_flush) {
2157 ret = bs->drv->bdrv_co_flush(bs);
2158 goto out;
2159 }
2160
2161 /* Write back cached data to the OS even with cache=unsafe */
2162 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
2163 if (bs->drv->bdrv_co_flush_to_os) {
2164 ret = bs->drv->bdrv_co_flush_to_os(bs);
2165 if (ret < 0) {
2166 goto out;
2167 }
2168 }
2169
2170 /* But don't actually force it to the disk with cache=unsafe */
2171 if (bs->open_flags & BDRV_O_NO_FLUSH) {
2172 goto flush_parent;
2173 }
2174
2175 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
2176 if (bs->drv->bdrv_co_flush_to_disk) {
2177 ret = bs->drv->bdrv_co_flush_to_disk(bs);
2178 } else if (bs->drv->bdrv_aio_flush) {
2179 BlockAIOCB *acb;
2180 CoroutineIOCompletion co = {
2181 .coroutine = qemu_coroutine_self(),
2182 };
2183
2184 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
2185 if (acb == NULL) {
2186 ret = -EIO;
2187 } else {
2188 qemu_coroutine_yield();
2189 ret = co.ret;
2190 }
2191 } else {
2192 /*
 * Some block drivers always operate in either writethrough or unsafe
 * mode and therefore don't support bdrv_flush. Usually qemu doesn't
2195 * know how the server works (because the behaviour is hardcoded or
2196 * depends on server-side configuration), so we can't ensure that
2197 * everything is safe on disk. Returning an error doesn't work because
2198 * that would break guests even if the server operates in writethrough
2199 * mode.
2200 *
2201 * Let's hope the user knows what he's doing.
2202 */
2203 ret = 0;
2204 }
2205 if (ret < 0) {
2206 goto out;
2207 }
2208
/* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
 * set in the case of cache=unsafe, so there are no useless flushes.
2211 */
2212 flush_parent:
2213 ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
2214 out:
2215 tracked_request_end(&req);
2216 return ret;
2217 }
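
/*
 * Illustrative sketch (not part of the original file): where a driver hooks
 * into the cascade implemented by bdrv_co_flush() above.  A format driver
 * with cached metadata typically implements only bdrv_co_flush_to_os(); the
 * data itself reaches stable storage when bdrv_co_flush() recurses into
 * bs->file at the flush_parent step and the protocol driver's
 * bdrv_co_flush_to_disk() runs.  All names containing "example" are
 * hypothetical, and a real BlockDriver needs many more callbacks.
 */
static int coroutine_fn example_co_flush_to_os(BlockDriverState *bs)
{
    /* Write back format-level metadata caches to the underlying file here. */
    return 0;
}

static BlockDriver bdrv_example_format = {
    .format_name         = "example",
    .bdrv_co_flush_to_os = example_co_flush_to_os,
};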
2218
2219 int bdrv_flush(BlockDriverState *bs)
2220 {
2221 Coroutine *co;
2222 RwCo rwco = {
2223 .bs = bs,
2224 .ret = NOT_DONE,
2225 };
2226
2227 if (qemu_in_coroutine()) {
2228 /* Fast-path if already in coroutine context */
2229 bdrv_flush_co_entry(&rwco);
2230 } else {
2231 AioContext *aio_context = bdrv_get_aio_context(bs);
2232
2233 co = qemu_coroutine_create(bdrv_flush_co_entry);
2234 qemu_coroutine_enter(co, &rwco);
2235 while (rwco.ret == NOT_DONE) {
2236 aio_poll(aio_context, true);
2237 }
2238 }
2239
2240 return rwco.ret;
2241 }
2242
2243 typedef struct DiscardCo {
2244 BlockDriverState *bs;
2245 int64_t sector_num;
2246 int nb_sectors;
2247 int ret;
2248 } DiscardCo;
2249 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
2250 {
2251 DiscardCo *rwco = opaque;
2252
2253 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
2254 }
2255
2256 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
2257 int nb_sectors)
2258 {
2259 BdrvTrackedRequest req;
2260 int max_discard, ret;
2261
2262 if (!bs->drv) {
2263 return -ENOMEDIUM;
2264 }
2265
2266 ret = bdrv_check_request(bs, sector_num, nb_sectors);
2267 if (ret < 0) {
2268 return ret;
2269 } else if (bs->read_only) {
2270 return -EPERM;
2271 }
2272 assert(!(bs->open_flags & BDRV_O_INACTIVE));
2273
2274 /* Do nothing if disabled. */
2275 if (!(bs->open_flags & BDRV_O_UNMAP)) {
2276 return 0;
2277 }
2278
2279 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
2280 return 0;
2281 }
2282
2283 tracked_request_begin(&req, bs, sector_num, nb_sectors,
2284 BDRV_TRACKED_DISCARD);
2285 bdrv_set_dirty(bs, sector_num, nb_sectors);
2286
2287 max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
2288 while (nb_sectors > 0) {
2289 int ret;
2290 int num = nb_sectors;
2291
2292 /* align request */
2293 if (bs->bl.discard_alignment &&
2294 num >= bs->bl.discard_alignment &&
2295 sector_num % bs->bl.discard_alignment) {
2296 if (num > bs->bl.discard_alignment) {
2297 num = bs->bl.discard_alignment;
2298 }
2299 num -= sector_num % bs->bl.discard_alignment;
2300 }
2301
2302 /* limit request size */
2303 if (num > max_discard) {
2304 num = max_discard;
2305 }
2306
2307 if (bs->drv->bdrv_co_discard) {
2308 ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
2309 } else {
2310 BlockAIOCB *acb;
2311 CoroutineIOCompletion co = {
2312 .coroutine = qemu_coroutine_self(),
2313 };
2314
            /* Pass the clamped chunk size (num), matching the
             * bdrv_co_discard branch above. */
            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
                                            bdrv_co_io_em_complete, &co);
2317 if (acb == NULL) {
2318 ret = -EIO;
2319 goto out;
2320 } else {
2321 qemu_coroutine_yield();
2322 ret = co.ret;
2323 }
2324 }
2325 if (ret && ret != -ENOTSUP) {
2326 goto out;
2327 }
2328
2329 sector_num += num;
2330 nb_sectors -= num;
2331 }
2332 ret = 0;
2333 out:
2334 tracked_request_end(&req);
2335 return ret;
2336 }
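
/*
 * Worked example (not part of the original file) of the splitting loop in
 * bdrv_co_discard() above.  With bs->bl.discard_alignment == 8 and a request
 * for sectors [5, 25): the first iteration clamps num to the alignment (8)
 * and then subtracts 5 % 8, so only 3 sectors are discarded and the request
 * becomes aligned; the second iteration discards the remaining 17 sectors
 * [8, 25) in one aligned chunk, assuming max_discard is large enough.
 */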
2337
2338 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
2339 {
2340 Coroutine *co;
2341 DiscardCo rwco = {
2342 .bs = bs,
2343 .sector_num = sector_num,
2344 .nb_sectors = nb_sectors,
2345 .ret = NOT_DONE,
2346 };
2347
2348 if (qemu_in_coroutine()) {
2349 /* Fast-path if already in coroutine context */
2350 bdrv_discard_co_entry(&rwco);
2351 } else {
2352 AioContext *aio_context = bdrv_get_aio_context(bs);
2353
2354 co = qemu_coroutine_create(bdrv_discard_co_entry);
2355 qemu_coroutine_enter(co, &rwco);
2356 while (rwco.ret == NOT_DONE) {
2357 aio_poll(aio_context, true);
2358 }
2359 }
2360
2361 return rwco.ret;
2362 }
2363
2364 static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf)
2365 {
2366 BlockDriver *drv = bs->drv;
2367 BdrvTrackedRequest tracked_req;
2368 CoroutineIOCompletion co = {
2369 .coroutine = qemu_coroutine_self(),
2370 };
2371 BlockAIOCB *acb;
2372
2373 tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL);
2374 if (!drv || !drv->bdrv_aio_ioctl) {
2375 co.ret = -ENOTSUP;
2376 goto out;
2377 }
2378
2379 acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
2380 if (!acb) {
2381 co.ret = -ENOTSUP;
2382 goto out;
2383 }
2384 qemu_coroutine_yield();
2385 out:
2386 tracked_request_end(&tracked_req);
2387 return co.ret;
2388 }
2389
2390 typedef struct {
2391 BlockDriverState *bs;
2392 int req;
2393 void *buf;
2394 int ret;
2395 } BdrvIoctlCoData;
2396
2397 static void coroutine_fn bdrv_co_ioctl_entry(void *opaque)
2398 {
2399 BdrvIoctlCoData *data = opaque;
2400 data->ret = bdrv_co_do_ioctl(data->bs, data->req, data->buf);
2401 }
2402
2403 /* needed for generic scsi interface */
2404 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
2405 {
2406 BdrvIoctlCoData data = {
2407 .bs = bs,
2408 .req = req,
2409 .buf = buf,
2410 .ret = -EINPROGRESS,
2411 };
2412
2413 if (qemu_in_coroutine()) {
2414 /* Fast-path if already in coroutine context */
2415 bdrv_co_ioctl_entry(&data);
2416 } else {
2417 Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry);
2418
2419 qemu_coroutine_enter(co, &data);
2420 while (data.ret == -EINPROGRESS) {
2421 aio_poll(bdrv_get_aio_context(bs), true);
2422 }
2423 }
2424 return data.ret;
2425 }
2426
2427 static void coroutine_fn bdrv_co_aio_ioctl_entry(void *opaque)
2428 {
2429 BlockAIOCBCoroutine *acb = opaque;
2430 acb->req.error = bdrv_co_do_ioctl(acb->common.bs,
2431 acb->req.req, acb->req.buf);
2432 bdrv_co_complete(acb);
2433 }
2434
2435 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
2436 unsigned long int req, void *buf,
2437 BlockCompletionFunc *cb, void *opaque)
2438 {
2439 BlockAIOCBCoroutine *acb = qemu_aio_get(&bdrv_em_co_aiocb_info,
2440 bs, cb, opaque);
2441 Coroutine *co;
2442
2443 acb->need_bh = true;
2444 acb->req.error = -EINPROGRESS;
2445 acb->req.req = req;
2446 acb->req.buf = buf;
2447 co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry);
2448 qemu_coroutine_enter(co, acb);
2449
2450 bdrv_co_maybe_schedule_bh(acb);
2451 return &acb->common;
2452 }
2453
2454 void *qemu_blockalign(BlockDriverState *bs, size_t size)
2455 {
2456 return qemu_memalign(bdrv_opt_mem_align(bs), size);
2457 }
2458
2459 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
2460 {
2461 return memset(qemu_blockalign(bs, size), 0, size);
2462 }
2463
2464 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
2465 {
2466 size_t align = bdrv_opt_mem_align(bs);
2467
2468 /* Ensure that NULL is never returned on success */
2469 assert(align > 0);
2470 if (size == 0) {
2471 size = align;
2472 }
2473
2474 return qemu_try_memalign(align, size);
2475 }
2476
2477 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
2478 {
2479 void *mem = qemu_try_blockalign(bs, size);
2480
2481 if (mem) {
2482 memset(mem, 0, size);
2483 }
2484
2485 return mem;
2486 }
2487
2488 /*
2489 * Check if all memory in this vector is sector aligned.
2490 */
2491 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
2492 {
2493 int i;
2494 size_t alignment = bdrv_min_mem_align(bs);
2495
2496 for (i = 0; i < qiov->niov; i++) {
2497 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
2498 return false;
2499 }
2500 if (qiov->iov[i].iov_len % alignment) {
2501 return false;
2502 }
2503 }
2504
2505 return true;
2506 }
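
/*
 * Illustrative sketch (not part of the original file): a typical use of the
 * helpers above.  If a caller-supplied vector is not sufficiently aligned
 * (e.g. for O_DIRECT), copy it into a bounce buffer from
 * qemu_try_blockalign() and submit an aligned one-element vector instead.
 * example_write_aligned() is a hypothetical stand-in for the actual
 * submission path.
 */
static int example_write_aligned(BlockDriverState *bs, QEMUIOVector *qiov,
                                 int64_t sector_num, int nb_sectors);

static int example_bounce_write(BlockDriverState *bs, QEMUIOVector *qiov,
                                int64_t sector_num, int nb_sectors)
{
    QEMUIOVector aligned_qiov;
    struct iovec iov;
    void *bounce;
    int ret;

    if (bdrv_qiov_is_aligned(bs, qiov)) {
        return example_write_aligned(bs, qiov, sector_num, nb_sectors);
    }

    bounce = qemu_try_blockalign(bs, qiov->size);
    if (bounce == NULL) {
        return -ENOMEM;
    }
    qemu_iovec_to_buf(qiov, 0, bounce, qiov->size);

    iov = (struct iovec) {
        .iov_base = bounce,
        .iov_len  = qiov->size,
    };
    qemu_iovec_init_external(&aligned_qiov, &iov, 1);

    ret = example_write_aligned(bs, &aligned_qiov, sector_num, nb_sectors);

    qemu_vfree(bounce);
    return ret;
}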
2507
2508 void bdrv_add_before_write_notifier(BlockDriverState *bs,
2509 NotifierWithReturn *notifier)
2510 {
2511 notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
2512 }
2513
2514 void bdrv_io_plug(BlockDriverState *bs)
2515 {
2516 BdrvChild *child;
2517
2518 QLIST_FOREACH(child, &bs->children, next) {
2519 bdrv_io_plug(child->bs);
2520 }
2521
2522 if (bs->io_plugged++ == 0 && bs->io_plug_disabled == 0) {
2523 BlockDriver *drv = bs->drv;
2524 if (drv && drv->bdrv_io_plug) {
2525 drv->bdrv_io_plug(bs);
2526 }
2527 }
2528 }
2529
2530 void bdrv_io_unplug(BlockDriverState *bs)
2531 {
2532 BdrvChild *child;
2533
2534 assert(bs->io_plugged);
2535 if (--bs->io_plugged == 0 && bs->io_plug_disabled == 0) {
2536 BlockDriver *drv = bs->drv;
2537 if (drv && drv->bdrv_io_unplug) {
2538 drv->bdrv_io_unplug(bs);
2539 }
2540 }
2541
2542 QLIST_FOREACH(child, &bs->children, next) {
2543 bdrv_io_unplug(child->bs);
2544 }
2545 }
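
/*
 * Illustrative sketch (not part of the original file): the intended pairing
 * of bdrv_io_plug()/bdrv_io_unplug() above.  Batching several submissions
 * between plug and unplug lets drivers that implement the plug callbacks
 * queue the requests and hand them to the host in one go when the device is
 * unplugged.  example_submit_batch() is hypothetical; BlockRequest is the
 * struct defined earlier in this file.
 */
static void example_submit_batch(BlockDriverState *bs, BlockRequest *reqs,
                                 int num_reqs)
{
    int i;

    bdrv_io_plug(bs);
    for (i = 0; i < num_reqs; i++) {
        bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov, reqs[i].nb_sectors,
                        reqs[i].cb, reqs[i].opaque);
    }
    bdrv_io_unplug(bs);     /* submit anything the driver has queued */
}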
2546
2547 void bdrv_io_unplugged_begin(BlockDriverState *bs)
2548 {
2549 BdrvChild *child;
2550
2551 if (bs->io_plug_disabled++ == 0 && bs->io_plugged > 0) {
2552 BlockDriver *drv = bs->drv;
2553 if (drv && drv->bdrv_io_unplug) {
2554 drv->bdrv_io_unplug(bs);
2555 }
2556 }
2557
2558 QLIST_FOREACH(child, &bs->children, next) {
2559 bdrv_io_unplugged_begin(child->bs);
2560 }
2561 }
2562
2563 void bdrv_io_unplugged_end(BlockDriverState *bs)
2564 {
2565 BdrvChild *child;
2566
2567 assert(bs->io_plug_disabled);
2568 QLIST_FOREACH(child, &bs->children, next) {
2569 bdrv_io_unplugged_end(child->bs);
2570 }
2571
2572 if (--bs->io_plug_disabled == 0 && bs->io_plugged > 0) {
2573 BlockDriver *drv = bs->drv;
2574 if (drv && drv->bdrv_io_plug) {
2575 drv->bdrv_io_plug(bs);
2576 }
2577 }
2578 }