block/io.c

   1 /*
   2  * Block layer I/O functions
   3  *
   4  * Copyright (c) 2003 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "qemu/osdep.h"
  26 #include "trace.h"
  27 #include "sysemu/block-backend.h"
  28 #include "block/blockjob.h"
  29 #include "block/block_int.h"
  30 #include "qemu/cutils.h"
  31 #include "qapi/error.h"
  32 #include "qemu/error-report.h"
  33
  34 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
  35
  36 static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child,
  37                                           int64_t offset,
  38                                           QEMUIOVector *qiov,
  39                                           BdrvRequestFlags flags,
  40                                           BlockCompletionFunc *cb,
  41                                           void *opaque,
  42                                           bool is_write);
  43 static void coroutine_fn bdrv_co_do_rw(void *opaque);
  44 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
  45     int64_t offset, int count, BdrvRequestFlags flags);
  46
  47 void bdrv_parent_drained_begin(BlockDriverState *bs)
  48 {
  49     BdrvChild *c;
  50
  51     QLIST_FOREACH(c, &bs->parents, next_parent) {
  52         if (c->role->drained_begin) {
  53             c->role->drained_begin(c);
  54         }
  55     }
  56 }
  57
  58 void bdrv_parent_drained_end(BlockDriverState *bs)
  59 {
  60     BdrvChild *c;
  61
  62     QLIST_FOREACH(c, &bs->parents, next_parent) {
  63         if (c->role->drained_end) {
  64             c->role->drained_end(c);
  65         }
  66     }
  67 }
  68
  69 static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
  70 {
  71     dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
  72     dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
  73     dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
  74                                  src->opt_mem_alignment);
  75     dst->min_mem_alignment = MAX(dst->min_mem_alignment,
  76                                  src->min_mem_alignment);
  77     dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
  78 }
  79
  80 void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
  81 {
  82     BlockDriver *drv = bs->drv;
  83     Error *local_err = NULL;
  84
  85     memset(&bs->bl, 0, sizeof(bs->bl));
  86
  87     if (!drv) {
  88         return;
  89     }
  90
  91     /* Default alignment based on whether driver has byte interface */
  92     bs->bl.request_alignment = drv->bdrv_co_preadv ? 1 : 512;
  93
  94     /* Take some limits from the children as a default */
  95     if (bs->file) {
  96         bdrv_refresh_limits(bs->file->bs, &local_err);
  97         if (local_err) {
  98             error_propagate(errp, local_err);
  99             return;
 100         }
 101         bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
 102     } else {
 103         bs->bl.min_mem_alignment = 512;
 104         bs->bl.opt_mem_alignment = getpagesize();
 105
 106         /* Safe default since most protocols use readv()/writev()/etc */
 107         bs->bl.max_iov = IOV_MAX;
 108     }
 109
 110     if (bs->backing) {
 111         bdrv_refresh_limits(bs->backing->bs, &local_err);
 112         if (local_err) {
 113             error_propagate(errp, local_err);
 114             return;
 115         }
 116         bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
 117     }
 118
 119     /* Then let the driver override it */
 120     if (drv->bdrv_refresh_limits) {
 121         drv->bdrv_refresh_limits(bs, errp);
 122     }
 123 }
 124
 125 /**
 126  * The copy-on-read flag is actually a reference count so multiple users may
 127  * use the feature without worrying about clobbering its previous state.
 128  * Copy-on-read stays enabled until all users have called to disable it.
 129  */
 130 void bdrv_enable_copy_on_read(BlockDriverState *bs)
 131 {
 132     bs->copy_on_read++;
 133 }
 134
 135 void bdrv_disable_copy_on_read(BlockDriverState *bs)
 136 {
 137     assert(bs->copy_on_read > 0);
 138     bs->copy_on_read--;
 139 }
 140
 141 /* Check if any requests are in-flight (including throttled requests) */
 142 bool bdrv_requests_pending(BlockDriverState *bs)
 143 {
 144     BdrvChild *child;
 145
 146     if (atomic_read(&bs->in_flight)) {
 147         return true;
 148     }
 149
 150     QLIST_FOREACH(child, &bs->children, next) {
 151         if (bdrv_requests_pending(child->bs)) {
 152             return true;
 153         }
 154     }
 155
 156     return false;
 157 }
 158
 159 static bool bdrv_drain_recurse(BlockDriverState *bs)
 160 {
 161     BdrvChild *child;
 162     bool waited;
 163
 164     waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);
 165
 166     if (bs->drv && bs->drv->bdrv_drain) {
 167         bs->drv->bdrv_drain(bs);
 168     }
 169
 170     QLIST_FOREACH(child, &bs->children, next) {
 171         waited |= bdrv_drain_recurse(child->bs);
 172     }
 173
 174     return waited;
 175 }
 176
 177 typedef struct {
 178     Coroutine *co;
 179     BlockDriverState *bs;
 180     bool done;
 181 } BdrvCoDrainData;
 182
 183 static void bdrv_co_drain_bh_cb(void *opaque)
 184 {
 185     BdrvCoDrainData *data = opaque;
 186     Coroutine *co = data->co;
 187     BlockDriverState *bs = data->bs;
 188
 189     bdrv_dec_in_flight(bs);
 190     bdrv_drained_begin(bs);
 191     data->done = true;
 192     aio_co_wake(co);
 193 }
 194
 195 static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
 196 {
 197     BdrvCoDrainData data;
 198
 199     /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
 200      * other coroutines run if they were queued from
 201      * qemu_co_queue_run_restart(). */
 202
 203     assert(qemu_in_coroutine());
 204     data = (BdrvCoDrainData) {
 205         .co = qemu_coroutine_self(),
 206         .bs = bs,
 207         .done = false,
 208     };
 209     bdrv_inc_in_flight(bs);
 210     aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
 211                             bdrv_co_drain_bh_cb, &data);
 212
 213     qemu_coroutine_yield();
 214     /* If we are resumed from some other event (such as an aio completion or a
 215      * timer callback), it is a bug in the caller that should be fixed. */
 216     assert(data.done);
 217 }
 218
 219 void bdrv_drained_begin(BlockDriverState *bs)
 220 {
 221     if (qemu_in_coroutine()) {
 222         bdrv_co_yield_to_drain(bs);
 223         return;
 224     }
 225
 226     if (!bs->quiesce_counter++) {
 227         aio_disable_external(bdrv_get_aio_context(bs));
 228         bdrv_parent_drained_begin(bs);
 229     }
 230
 231     bdrv_drain_recurse(bs);
 232 }
 233
 234 void bdrv_drained_end(BlockDriverState *bs)
 235 {
 236     assert(bs->quiesce_counter > 0);
 237     if (--bs->quiesce_counter > 0) {
 238         return;
 239     }
 240
 241     bdrv_parent_drained_end(bs);
 242     aio_enable_external(bdrv_get_aio_context(bs));
 243 }
 244
 245 /*
 246  * Wait for pending requests to complete on a single BlockDriverState subtree,
 247  * and suspend block driver's internal I/O until next request arrives.
 248  *
 249  * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 250  * AioContext.
 251  *
 252  * Only this BlockDriverState's AioContext is run, so in-flight requests must
 253  * not depend on events in other AioContexts.  In that case, use
 254  * bdrv_drain_all() instead.
 255  */
 256 void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
 257 {
 258     assert(qemu_in_coroutine());
 259     bdrv_drained_begin(bs);
 260     bdrv_drained_end(bs);
 261 }
 262
 263 void bdrv_drain(BlockDriverState *bs)
 264 {
 265     bdrv_drained_begin(bs);
 266     bdrv_drained_end(bs);
 267 }
 268
 269 /*
 270  * Wait for pending requests to complete across all BlockDriverStates
 271  *
 272  * This function does not flush data to disk, use bdrv_flush_all() for that
 273  * after calling this function.
 274  *
 275  * This pauses all block jobs and disables external clients. It must
 276  * be paired with bdrv_drain_all_end().
 277  *
 278  * NOTE: no new block jobs or BlockDriverStates can be created between
 279  * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 280  */
 281 void bdrv_drain_all_begin(void)
 282 {
 283     /* Always run first iteration so any pending completion BHs run */
 284     bool waited = true;
 285     BlockDriverState *bs;
 286     BdrvNextIterator it;
 287     BlockJob *job = NULL;
 288     GSList *aio_ctxs = NULL, *ctx;
 289
 290     while ((job = block_job_next(job))) {
 291         AioContext *aio_context = blk_get_aio_context(job->blk);
 292
 293         aio_context_acquire(aio_context);
 294         block_job_pause(job);
 295         aio_context_release(aio_context);
 296     }
 297
 298     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 299         AioContext *aio_context = bdrv_get_aio_context(bs);
 300
 301         aio_context_acquire(aio_context);
 302         bdrv_parent_drained_begin(bs);
 303         aio_disable_external(aio_context);
 304         aio_context_release(aio_context);
 305
 306         if (!g_slist_find(aio_ctxs, aio_context)) {
 307             aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
 308         }
 309     }
 310
 311     /* Note that completion of an asynchronous I/O operation can trigger any
 312      * number of other I/O operations on other devices---for example a
 313      * coroutine can submit an I/O request to another device in response to
 314      * request completion.  Therefore we must keep looping until there was no
 315      * more activity rather than simply draining each device independently.
 316      */
 317     while (waited) {
 318         waited = false;
 319
 320         for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
 321             AioContext *aio_context = ctx->data;
 322
 323             aio_context_acquire(aio_context);
 324             for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 325                 if (aio_context == bdrv_get_aio_context(bs)) {
 326                     waited |= bdrv_drain_recurse(bs);
 327                 }
 328             }
 329             aio_context_release(aio_context);
 330         }
 331     }
 332
 333     g_slist_free(aio_ctxs);
 334 }
 335
 336 void bdrv_drain_all_end(void)
 337 {
 338     BlockDriverState *bs;
 339     BdrvNextIterator it;
 340     BlockJob *job = NULL;
 341
 342     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 343         AioContext *aio_context = bdrv_get_aio_context(bs);
 344
 345         aio_context_acquire(aio_context);
 346         aio_enable_external(aio_context);
 347         bdrv_parent_drained_end(bs);
 348         aio_context_release(aio_context);
 349     }
 350
 351     while ((job = block_job_next(job))) {
 352         AioContext *aio_context = blk_get_aio_context(job->blk);
 353
 354         aio_context_acquire(aio_context);
 355         block_job_resume(job);
 356         aio_context_release(aio_context);
 357     }
 358 }
 359
 360 void bdrv_drain_all(void)
 361 {
 362     bdrv_drain_all_begin();
 363     bdrv_drain_all_end();
 364 }
 365
 366 /**
 367  * Remove an active request from the tracked requests list
 368  *
 369  * This function should be called when a tracked request is completing.
 370  */
 371 static void tracked_request_end(BdrvTrackedRequest *req)
 372 {
 373     if (req->serialising) {
 374         req->bs->serialising_in_flight--;
 375     }
 376
 377     QLIST_REMOVE(req, list);
 378     qemu_co_queue_restart_all(&req->wait_queue);
 379 }
 380
 381 /**
 382  * Add an active request to the tracked requests list
 383  */
 384 static void tracked_request_begin(BdrvTrackedRequest *req,
 385                                   BlockDriverState *bs,
 386                                   int64_t offset,
 387                                   unsigned int bytes,
 388                                   enum BdrvTrackedRequestType type)
 389 {
 390     *req = (BdrvTrackedRequest){
 391         .bs = bs,
 392         .offset         = offset,
 393         .bytes          = bytes,
 394         .type           = type,
 395         .co             = qemu_coroutine_self(),
 396         .serialising    = false,
 397         .overlap_offset = offset,
 398         .overlap_bytes  = bytes,
 399     };
 400
 401     qemu_co_queue_init(&req->wait_queue);
 402
 403     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
 404 }
 405
 406 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
 407 {
 408     int64_t overlap_offset = req->offset & ~(align - 1);
 409     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
 410                                - overlap_offset;
 411
 412     if (!req->serialising) {
 413         req->bs->serialising_in_flight++;
 414         req->serialising = true;
 415     }
 416
 417     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
 418     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
 419 }
 420
 421 /**
 422  * Round a region to cluster boundaries (sector-based)
 423  */
 424 void bdrv_round_sectors_to_clusters(BlockDriverState *bs,
 425                                     int64_t sector_num, int nb_sectors,
 426                                     int64_t *cluster_sector_num,
 427                                     int *cluster_nb_sectors)
 428 {
 429     BlockDriverInfo bdi;
 430
 431     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
 432         *cluster_sector_num = sector_num;
 433         *cluster_nb_sectors = nb_sectors;
 434     } else {
 435         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
 436         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
 437         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
 438                                             nb_sectors, c);
 439     }
 440 }
 441
 442 /**
 443  * Round a region to cluster boundaries
 444  */
 445 void bdrv_round_to_clusters(BlockDriverState *bs,
 446                             int64_t offset, unsigned int bytes,
 447                             int64_t *cluster_offset,
 448                             unsigned int *cluster_bytes)
 449 {
 450     BlockDriverInfo bdi;
 451
 452     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
 453         *cluster_offset = offset;
 454         *cluster_bytes = bytes;
 455     } else {
 456         int64_t c = bdi.cluster_size;
 457         *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
 458         *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
 459     }
 460 }
 461
 462 static int bdrv_get_cluster_size(BlockDriverState *bs)
 463 {
 464     BlockDriverInfo bdi;
 465     int ret;
 466
 467     ret = bdrv_get_info(bs, &bdi);
 468     if (ret < 0 || bdi.cluster_size == 0) {
 469         return bs->bl.request_alignment;
 470     } else {
 471         return bdi.cluster_size;
 472     }
 473 }
 474
 475 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
 476                                      int64_t offset, unsigned int bytes)
 477 {
 478     /*        aaaa   bbbb */
 479     if (offset >= req->overlap_offset + req->overlap_bytes) {
 480         return false;
 481     }
 482     /* bbbb   aaaa        */
 483     if (req->overlap_offset >= offset + bytes) {
 484         return false;
 485     }
 486     return true;
 487 }
 488
 489 void bdrv_inc_in_flight(BlockDriverState *bs)
 490 {
 491     atomic_inc(&bs->in_flight);
 492 }
 493
 494 static void dummy_bh_cb(void *opaque)
 495 {
 496 }
 497
 498 void bdrv_wakeup(BlockDriverState *bs)
 499 {
 500     if (bs->wakeup) {
 501         aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL);
 502     }
 503 }
 504
 505 void bdrv_dec_in_flight(BlockDriverState *bs)
 506 {
 507     atomic_dec(&bs->in_flight);
 508     bdrv_wakeup(bs);
 509 }
 510
 511 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
 512 {
 513     BlockDriverState *bs = self->bs;
 514     BdrvTrackedRequest *req;
 515     bool retry;
 516     bool waited = false;
 517
 518     if (!bs->serialising_in_flight) {
 519         return false;
 520     }
 521
 522     do {
 523         retry = false;
 524         QLIST_FOREACH(req, &bs->tracked_requests, list) {
 525             if (req == self || (!req->serialising && !self->serialising)) {
 526                 continue;
 527             }
 528             if (tracked_request_overlaps(req, self->overlap_offset,
 529                                          self->overlap_bytes))
 530             {
 531                 /* Hitting this means there was a reentrant request, for
 532                  * example, a block driver issuing nested requests.  This must
 533                  * never happen since it means deadlock.
 534                  */
 535                 assert(qemu_coroutine_self() != req->co);
 536
 537                 /* If the request is already (indirectly) waiting for us, or
 538                  * will wait for us as soon as it wakes up, then just go on
 539                  * (instead of producing a deadlock in the former case). */
 540                 if (!req->waiting_for) {
 541                     self->waiting_for = req;
 542                     qemu_co_queue_wait(&req->wait_queue, NULL);
 543                     self->waiting_for = NULL;
 544                     retry = true;
 545                     waited = true;
 546                     break;
 547                 }
 548             }
 549         }
 550     } while (retry);
 551
 552     return waited;
 553 }
 554
 555 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
 556                                    size_t size)
 557 {
 558     if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
 559         return -EIO;
 560     }
 561
 562     if (!bdrv_is_inserted(bs)) {
 563         return -ENOMEDIUM;
 564     }
 565
 566     if (offset < 0) {
 567         return -EIO;
 568     }
 569
 570     return 0;
 571 }
 572
 573 typedef struct RwCo {
 574     BdrvChild *child;
 575     int64_t offset;
 576     QEMUIOVector *qiov;
 577     bool is_write;
 578     int ret;
 579     BdrvRequestFlags flags;
 580 } RwCo;
 581
 582 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
 583 {
 584     RwCo *rwco = opaque;
 585
 586     if (!rwco->is_write) {
 587         rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset,
 588                                    rwco->qiov->size, rwco->qiov,
 589                                    rwco->flags);
 590     } else {
 591         rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
 592                                     rwco->qiov->size, rwco->qiov,
 593                                     rwco->flags);
 594     }
 595 }
 596
 597 /*
 598  * Process a vectored synchronous request using coroutines
 599  */
 600 static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
 601                         QEMUIOVector *qiov, bool is_write,
 602                         BdrvRequestFlags flags)
 603 {
 604     Coroutine *co;
 605     RwCo rwco = {
 606         .child = child,
 607         .offset = offset,
 608         .qiov = qiov,
 609         .is_write = is_write,
 610         .ret = NOT_DONE,
 611         .flags = flags,
 612     };
 613
 614     if (qemu_in_coroutine()) {
 615         /* Fast-path if already in coroutine context */
 616         bdrv_rw_co_entry(&rwco);
 617     } else {
 618         co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
 619         bdrv_coroutine_enter(child->bs, co);
 620         BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
 621     }
 622     return rwco.ret;
 623 }
 624
 625 /*
 626  * Process a synchronous request using coroutines
 627  */
 628 static int bdrv_rw_co(BdrvChild *child, int64_t sector_num, uint8_t *buf,
 629                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
 630 {
 631     QEMUIOVector qiov;
 632     struct iovec iov = {
 633         .iov_base = (void *)buf,
 634         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
 635     };
 636
 637     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
 638         return -EINVAL;
 639     }
 640
 641     qemu_iovec_init_external(&qiov, &iov, 1);
 642     return bdrv_prwv_co(child, sector_num << BDRV_SECTOR_BITS,
 643                         &qiov, is_write, flags);
 644 }
 645
 646 /* return < 0 if error. See bdrv_write() for the return codes */
 647 int bdrv_read(BdrvChild *child, int64_t sector_num,
 648               uint8_t *buf, int nb_sectors)
 649 {
 650     return bdrv_rw_co(child, sector_num, buf, nb_sectors, false, 0);
 651 }
 652
 653 /* Return < 0 if error. Important errors are:
 654   -EIO         generic I/O error (may happen for all errors)
 655   -ENOMEDIUM   No media inserted.
 656   -EINVAL      Invalid sector number or nb_sectors
 657   -EACCES      Trying to write a read-only device
 658 */
 659 int bdrv_write(BdrvChild *child, int64_t sector_num,
 660                const uint8_t *buf, int nb_sectors)
 661 {
 662     return bdrv_rw_co(child, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
 663 }
 664
 665 int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
 666                        int count, BdrvRequestFlags flags)
 667 {
 668     QEMUIOVector qiov;
 669     struct iovec iov = {
 670         .iov_base = NULL,
 671         .iov_len = count,
 672     };
 673
 674     qemu_iovec_init_external(&qiov, &iov, 1);
 675     return bdrv_prwv_co(child, offset, &qiov, true,
 676                         BDRV_REQ_ZERO_WRITE | flags);
 677 }
 678
 679 /*
 680  * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 681  * The operation is sped up by checking the block status and only writing
 682  * zeroes to the device if they currently do not return zeroes. Optional
 683  * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 684  * BDRV_REQ_FUA).
 685  *
 686  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 687  */
 688 int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
 689 {
 690     int64_t target_sectors, ret, nb_sectors, sector_num = 0;
 691     BlockDriverState *bs = child->bs;
 692     BlockDriverState *file;
 693     int n;
 694
 695     target_sectors = bdrv_nb_sectors(bs);
 696     if (target_sectors < 0) {
 697         return target_sectors;
 698     }
 699
 700     for (;;) {
 701         nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
 702         if (nb_sectors <= 0) {
 703             return 0;
 704         }
 705         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n, &file);
 706         if (ret < 0) {
 707             error_report("error getting block status at sector %" PRId64 ": %s",
 708                          sector_num, strerror(-ret));
 709             return ret;
 710         }
 711         if (ret & BDRV_BLOCK_ZERO) {
 712             sector_num += n;
 713             continue;
 714         }
 715         ret = bdrv_pwrite_zeroes(child, sector_num << BDRV_SECTOR_BITS,
 716                                  n << BDRV_SECTOR_BITS, flags);
 717         if (ret < 0) {
 718             error_report("error writing zeroes at sector %" PRId64 ": %s",
 719                          sector_num, strerror(-ret));
 720             return ret;
 721         }
 722         sector_num += n;
 723     }
 724 }
 725
 726 int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
 727 {
 728     int ret;
 729
 730     ret = bdrv_prwv_co(child, offset, qiov, false, 0);
 731     if (ret < 0) {
 732         return ret;
 733     }
 734
 735     return qiov->size;
 736 }
 737
 738 int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
 739 {
 740     QEMUIOVector qiov;
 741     struct iovec iov = {
 742         .iov_base = (void *)buf,
 743         .iov_len = bytes,
 744     };
 745
 746     if (bytes < 0) {
 747         return -EINVAL;
 748     }
 749
 750     qemu_iovec_init_external(&qiov, &iov, 1);
 751     return bdrv_preadv(child, offset, &qiov);
 752 }
 753
 754 int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
 755 {
 756     int ret;
 757
 758     ret = bdrv_prwv_co(child, offset, qiov, true, 0);
 759     if (ret < 0) {
 760         return ret;
 761     }
 762
 763     return qiov->size;
 764 }
 765
 766 int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
 767 {
 768     QEMUIOVector qiov;
 769     struct iovec iov = {
 770         .iov_base   = (void *) buf,
 771         .iov_len    = bytes,
 772     };
 773
 774     if (bytes < 0) {
 775         return -EINVAL;
 776     }
 777
 778     qemu_iovec_init_external(&qiov, &iov, 1);
 779     return bdrv_pwritev(child, offset, &qiov);
 780 }
 781
 782 /*
 783  * Writes to the file and ensures that no writes are reordered across this
 784  * request (acts as a barrier)
 785  *
 786  * Returns 0 on success, -errno in error cases.
 787  */
 788 int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
 789                      const void *buf, int count)
 790 {
 791     int ret;
 792
 793     ret = bdrv_pwrite(child, offset, buf, count);
 794     if (ret < 0) {
 795         return ret;
 796     }
 797
 798     ret = bdrv_flush(child->bs);
 799     if (ret < 0) {
 800         return ret;
 801     }
 802
 803     return 0;
 804 }
 805
 806 typedef struct CoroutineIOCompletion {
 807     Coroutine *coroutine;
 808     int ret;
 809 } CoroutineIOCompletion;
 810
 811 static void bdrv_co_io_em_complete(void *opaque, int ret)
 812 {
 813     CoroutineIOCompletion *co = opaque;
 814
 815     co->ret = ret;
 816     aio_co_wake(co->coroutine);
 817 }
 818
 819 static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
 820                                            uint64_t offset, uint64_t bytes,
 821                                            QEMUIOVector *qiov, int flags)
 822 {
 823     BlockDriver *drv = bs->drv;
 824     int64_t sector_num;
 825     unsigned int nb_sectors;
 826
 827     assert(!(flags & ~BDRV_REQ_MASK));
 828
 829     if (drv->bdrv_co_preadv) {
 830         return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
 831     }
 832
 833     sector_num = offset >> BDRV_SECTOR_BITS;
 834     nb_sectors = bytes >> BDRV_SECTOR_BITS;
 835
 836     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
 837     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
 838     assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
 839
 840     if (drv->bdrv_co_readv) {
 841         return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
 842     } else {
 843         BlockAIOCB *acb;
 844         CoroutineIOCompletion co = {
 845             .coroutine = qemu_coroutine_self(),
 846         };
 847
 848         acb = bs->drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
 849                                       bdrv_co_io_em_complete, &co);
 850         if (acb == NULL) {
 851             return -EIO;
 852         } else {
 853             qemu_coroutine_yield();
 854             return co.ret;
 855         }
 856     }
 857 }
 858
 859 static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
 860                                             uint64_t offset, uint64_t bytes,
 861                                             QEMUIOVector *qiov, int flags)
 862 {
 863     BlockDriver *drv = bs->drv;
 864     int64_t sector_num;
 865     unsigned int nb_sectors;
 866     int ret;
 867
 868     assert(!(flags & ~BDRV_REQ_MASK));
 869
 870     if (drv->bdrv_co_pwritev) {
 871         ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
 872                                    flags & bs->supported_write_flags);
 873         flags &= ~bs->supported_write_flags;
 874         goto emulate_flags;
 875     }
 876
 877     sector_num = offset >> BDRV_SECTOR_BITS;
 878     nb_sectors = bytes >> BDRV_SECTOR_BITS;
 879
 880     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
 881     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
 882     assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
 883
 884     if (drv->bdrv_co_writev_flags) {
 885         ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
 886                                         flags & bs->supported_write_flags);
 887         flags &= ~bs->supported_write_flags;
 888     } else if (drv->bdrv_co_writev) {
 889         assert(!bs->supported_write_flags);
 890         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
 891     } else {
 892         BlockAIOCB *acb;
 893         CoroutineIOCompletion co = {
 894             .coroutine = qemu_coroutine_self(),
 895         };
 896
 897         acb = bs->drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
 898                                        bdrv_co_io_em_complete, &co);
 899         if (acb == NULL) {
 900             ret = -EIO;
 901         } else {
 902             qemu_coroutine_yield();
 903             ret = co.ret;
 904         }
 905     }
 906
 907 emulate_flags:
 908     if (ret == 0 && (flags & BDRV_REQ_FUA)) {
 909         ret = bdrv_co_flush(bs);
 910     }
 911
 912     return ret;
 913 }
 914
 915 static int coroutine_fn
 916 bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
 917                                uint64_t bytes, QEMUIOVector *qiov)
 918 {
 919     BlockDriver *drv = bs->drv;
 920
 921     if (!drv->bdrv_co_pwritev_compressed) {
 922         return -ENOTSUP;
 923     }
 924
 925     return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
 926 }
 927
 928 static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
 929         int64_t offset, unsigned int bytes, QEMUIOVector *qiov)
 930 {
 931     BlockDriverState *bs = child->bs;
 932
 933     /* Perform I/O through a temporary buffer so that users who scribble over
 934      * their read buffer while the operation is in progress do not end up
 935      * modifying the image file.  This is critical for zero-copy guest I/O
 936      * where anything might happen inside guest memory.
 937      */
 938     void *bounce_buffer;
 939
 940     BlockDriver *drv = bs->drv;
 941     struct iovec iov;
 942     QEMUIOVector bounce_qiov;
 943     int64_t cluster_offset;
 944     unsigned int cluster_bytes;
 945     size_t skip_bytes;
 946     int ret;
 947
 948     /* FIXME We cannot require callers to have write permissions when all they
 949      * are doing is a read request. If we did things right, write permissions
 950      * would be obtained anyway, but internally by the copy-on-read code. As
 951      * long as it is implemented here rather than in a separat filter driver,
 952      * the copy-on-read code doesn't have its own BdrvChild, however, for which
 953      * it could request permissions. Therefore we have to bypass the permission
 954      * system for the moment. */
 955     // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
 956
 957     /* Cover entire cluster so no additional backing file I/O is required when
 958      * allocating cluster in the image file.
 959      */
 960     bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
 961
 962     trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
 963                                    cluster_offset, cluster_bytes);
 964
 965     iov.iov_len = cluster_bytes;
 966     iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
 967     if (bounce_buffer == NULL) {
 968         ret = -ENOMEM;
 969         goto err;
 970     }
 971
 972     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
 973
 974     ret = bdrv_driver_preadv(bs, cluster_offset, cluster_bytes,
 975                              &bounce_qiov, 0);
 976     if (ret < 0) {
 977         goto err;
 978     }
 979
 980     if (drv->bdrv_co_pwrite_zeroes &&
 981         buffer_is_zero(bounce_buffer, iov.iov_len)) {
 982         /* FIXME: Should we (perhaps conditionally) be setting
 983          * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
 984          * that still correctly reads as zero? */
 985         ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, cluster_bytes, 0);
 986     } else {
 987         /* This does not change the data on the disk, it is not necessary
 988          * to flush even in cache=writethrough mode.
 989          */
 990         ret = bdrv_driver_pwritev(bs, cluster_offset, cluster_bytes,
 991                                   &bounce_qiov, 0);
 992     }
 993
 994     if (ret < 0) {
 995         /* It might be okay to ignore write errors for guest requests.  If this
 996          * is a deliberate copy-on-read then we don't want to ignore the error.
 997          * Simply report it in all cases.
 998          */
 999         goto err;
1000     }
1001
1002     skip_bytes = offset - cluster_offset;
1003     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, bytes);
1004
1005 err:
1006     qemu_vfree(bounce_buffer);
1007     return ret;
1008 }
1009
/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read, zeroing after EOF, and fragmentation of large
 * reads; any other features must be implemented by the caller.
 *
 * @offset and @bytes must both be multiples of @align, and @align must be
 * a power of two; a non-NULL @qiov must have a size of exactly @bytes (all
 * of this is asserted).  The only flags accepted are
 * BDRV_REQ_NO_SERIALISING and BDRV_REQ_COPY_ON_READ.
 *
 * Returns 0 on success and a negative value on failure.
 */
static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriverState *bs = child->bs;
    int64_t total_bytes, max_bytes;
    int ret = 0;
    uint64_t bytes_remaining = bytes;
    int max_transfer;

    assert(is_power_of_2(align));
    assert((offset & (align - 1)) == 0);
    assert((bytes & (align - 1)) == 0);
    assert(!qiov || bytes == qiov->size);
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
    /* Largest single driver request, derived from bs->bl.max_transfer */
    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
                                   align);

    /* TODO: We would need a per-BDS .supported_read_flags and
     * potential fallback support, if we ever implement any read flags
     * to pass through to drivers.  For now, there aren't any
     * passthrough flags.  */
    assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ)));

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap.  This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster.  For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them. */
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

    if (!(flags & BDRV_REQ_NO_SERIALISING)) {
        wait_serialising_requests(req);
    }

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int64_t start_sector = offset >> BDRV_SECTOR_BITS;
        int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
        unsigned int nb_sectors = end_sector - start_sector;
        int pnum;

        ret = bdrv_is_allocated(bs, start_sector, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        /* Take the copy-on-read path unless the whole range is reported as
         * allocated; otherwise fall through to a plain read. */
        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov);
            goto out;
        }
    }

    /* Forward the request to the BlockDriver, possibly fragmenting it */
    total_bytes = bdrv_getlength(bs);
    if (total_bytes < 0) {
        ret = total_bytes;
        goto out;
    }

    /* max_bytes: how much of the request, rounded up to @align, lies below
     * the length reported by bdrv_getlength().  The fast path below covers
     * requests that fit both this and max_transfer. */
    max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
    if (bytes <= max_bytes && bytes <= max_transfer) {
        ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
        goto out;
    }

    while (bytes_remaining) {
        int num;

        if (max_bytes) {
            /* Read the next fragment through a qiov that references the
             * matching slice of the caller's qiov. */
            QEMUIOVector local_qiov;

            num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
            assert(num);
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);

            ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
                                     num, &local_qiov, 0);
            max_bytes -= num;
            qemu_iovec_destroy(&local_qiov);
        } else {
            /* Nothing readable is left: zero-fill the rest of the buffer */
            num = bytes_remaining;
            ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
                                    bytes_remaining);
        }
        if (ret < 0) {
            goto out;
        }
        bytes_remaining -= num;
    }

out:
    /* Any non-negative intermediate result is reported as plain success */
    return ret < 0 ? ret : 0;
}
1112
1113 /*
1114  * Handle a read request in coroutine context
1115  */
1116 int coroutine_fn bdrv_co_preadv(BdrvChild *child,
1117     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1118     BdrvRequestFlags flags)
1119 {
1120     BlockDriverState *bs = child->bs;
1121     BlockDriver *drv = bs->drv;
1122     BdrvTrackedRequest req;
1123
1124     uint64_t align = bs->bl.request_alignment;
1125     uint8_t *head_buf = NULL;
1126     uint8_t *tail_buf = NULL;
1127     QEMUIOVector local_qiov;
1128     bool use_local_qiov = false;
1129     int ret;
1130
1131     if (!drv) {
1132         return -ENOMEDIUM;
1133     }
1134
1135     ret = bdrv_check_byte_request(bs, offset, bytes);
1136     if (ret < 0) {
1137         return ret;
1138     }
1139
1140     bdrv_inc_in_flight(bs);
1141
1142     /* Don't do copy-on-read if we read data before write operation */
1143     if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) {
1144         flags |= BDRV_REQ_COPY_ON_READ;
1145     }
1146
1147     /* Align read if necessary by padding qiov */
1148     if (offset & (align - 1)) {
1149         head_buf = qemu_blockalign(bs, align);
1150         qemu_iovec_init(&local_qiov, qiov->niov + 2);
1151         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1152         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1153         use_local_qiov = true;
1154
1155         bytes += offset & (align - 1);
1156         offset = offset & ~(align - 1);
1157     }
1158
1159     if ((offset + bytes) & (align - 1)) {
1160         if (!use_local_qiov) {
1161             qemu_iovec_init(&local_qiov, qiov->niov + 1);
1162             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1163             use_local_qiov = true;
1164         }
1165         tail_buf = qemu_blockalign(bs, align);
1166         qemu_iovec_add(&local_qiov, tail_buf,
1167                        align - ((offset + bytes) & (align - 1)));
1168
1169         bytes = ROUND_UP(bytes, align);
1170     }
1171
1172     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
1173     ret = bdrv_aligned_preadv(child, &req, offset, bytes, align,
1174                               use_local_qiov ? &local_qiov : qiov,
1175                               flags);
1176     tracked_request_end(&req);
1177     bdrv_dec_in_flight(bs);
1178
1179     if (use_local_qiov) {
1180         qemu_iovec_destroy(&local_qiov);
1181         qemu_vfree(head_buf);
1182         qemu_vfree(tail_buf);
1183     }
1184
1185     return ret;
1186 }
1187
1188 static int coroutine_fn bdrv_co_do_readv(BdrvChild *child,
1189     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1190     BdrvRequestFlags flags)
1191 {
1192     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1193         return -EINVAL;
1194     }
1195
1196     return bdrv_co_preadv(child, sector_num << BDRV_SECTOR_BITS,
1197                           nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1198 }
1199
1200 int coroutine_fn bdrv_co_readv(BdrvChild *child, int64_t sector_num,
1201                                int nb_sectors, QEMUIOVector *qiov)
1202 {
1203     trace_bdrv_co_readv(child->bs, sector_num, nb_sectors);
1204
1205     return bdrv_co_do_readv(child, sector_num, nb_sectors, qiov, 0);
1206 }
1207
/* Maximum buffer for write zeroes fallback, in bytes.  It bounds the size of
 * each write that bdrv_co_do_pwrite_zeroes() issues from its zero-filled
 * bounce buffer. */
#define MAX_WRITE_ZEROES_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
1210
/*
 * Write zeroes to [offset, offset + count) of @bs.
 *
 * The range is processed in chunks.  For each chunk the driver's
 * bdrv_co_pwrite_zeroes callback is tried first; if it is missing or
 * returns -ENOTSUP, a zero-filled bounce buffer is written through
 * bdrv_driver_pwritev() instead.  When BDRV_REQ_FUA was requested but is
 * not supported natively by the path taken, a single bdrv_co_flush() is
 * issued after the last chunk.
 *
 * Returns 0 on success or a negative value on failure (-ENOMEM if the
 * bounce buffer cannot be allocated).
 */
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int count, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};
    int ret = 0;
    bool need_flush = false;
    int head = 0;
    int tail = 0;

    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
    int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
                        bs->bl.request_alignment);
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
                                    MAX_WRITE_ZEROES_BOUNCE_BUFFER);

    assert(alignment % bs->bl.request_alignment == 0);
    /* Misalignment of the start and of the end of the range */
    head = offset % alignment;
    tail = (offset + count) % alignment;
    max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
    assert(max_write_zeroes >= bs->bl.request_alignment);

    while (count > 0 && !ret) {
        int num = count;

        /* Align request.  Block drivers can expect the "bulk" of the request
         * to be aligned, and that unaligned requests do not cross cluster
         * boundaries.
         */
        if (head) {
            /* Make a small request up to the first aligned sector. For
             * convenience, limit this request to max_transfer even if
             * we don't need to fall back to writes.  */
            num = MIN(MIN(count, max_transfer), alignment - head);
            head = (head + num) % alignment;
            assert(num < max_write_zeroes);
        } else if (tail && num > alignment) {
            /* Shorten the request to the last aligned sector.  */
            num -= tail;
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_pwrite_zeroes) {
            ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
                                             flags & bs->supported_zero_flags);
            /* FUA was requested but not passed to the driver: remember to
             * flush once at the end */
            if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
                need_flush = true;
            }
        } else {
            assert(!bs->supported_zero_flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;

            if ((flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* No need for bdrv_driver_pwrite() to do a fallback
                 * flush on each chunk; use just one at the end */
                write_flags &= ~BDRV_REQ_FUA;
                need_flush = true;
            }
            num = MIN(num, max_transfer);
            iov.iov_len = num;
            /* Allocate and zero the bounce buffer unless one was kept from
             * an earlier iteration */
            if (iov.iov_base == NULL) {
                iov.iov_base = qemu_try_blockalign(bs, num);
                if (iov.iov_base == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
                memset(iov.iov_base, 0, num);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags);

            /* Keep bounce buffer around if it is big enough for
             * all future requests.
             */
            if (num < max_transfer) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        offset += num;
        count -= num;
    }

fail:
    /* One flush at the end stands in for FUA that was not applied above */
    if (ret == 0 && need_flush) {
        ret = bdrv_co_flush(bs);
    }
    qemu_vfree(iov.iov_base);
    return ret;
}
1316
/*
 * Forwards an already correctly aligned write request to the BlockDriver,
 * after possibly fragmenting it.
 *
 * @offset and @bytes must be multiples of @align (a power of two), the
 * request must lie within the overlap range tracked in @req, and @child
 * must hold BLK_PERM_WRITE; all of this is asserted.
 *
 * Depending on @flags, and on zero detection when it is enabled for @bs,
 * the data is written as zeroes, as compressed data, or as a regular write
 * that is split into pieces of at most max_transfer bytes.
 *
 * Note that bs->write_gen, the dirty state and bs->wr_highest_offset are
 * updated even when the write fails; bs->total_sectors only grows on
 * success.
 *
 * Returns 0 on success or a negative value on failure.
 */
static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriverState *bs = child->bs;
    BlockDriver *drv = bs->drv;
    bool waited;
    int ret;

    int64_t start_sector = offset >> BDRV_SECTOR_BITS;
    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
    uint64_t bytes_remaining = bytes;
    int max_transfer;

    assert(is_power_of_2(align));
    assert((offset & (align - 1)) == 0);
    assert((bytes & (align - 1)) == 0);
    assert(!qiov || bytes == qiov->size);
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
    assert(!(flags & ~BDRV_REQ_MASK));
    /* Largest single driver request, derived from bs->bl.max_transfer */
    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
                                   align);

    waited = wait_serialising_requests(req);
    assert(!waited || !req->serialising);
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
    assert(child->perm & BLK_PERM_WRITE);
    /* Writing past the current end requires the resize permission */
    assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

    /* Zero detection: turn a write of an all-zero buffer into a zero write
     * when the driver has a write zeroes callback */
    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
        qemu_iovec_is_zero(qiov)) {
        flags |= BDRV_REQ_ZERO_WRITE;
        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
    }

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
    } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
        ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, qiov);
    } else if (bytes <= max_transfer) {
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
    } else {
        /* Too large for one driver request: write it piece by piece */
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        while (bytes_remaining) {
            int num = MIN(bytes_remaining, max_transfer);
            QEMUIOVector local_qiov;
            int local_flags = flags;

            assert(num);
            if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* If FUA is going to be emulated by flush, we only
                 * need to flush on the last iteration */
                local_flags &= ~BDRV_REQ_FUA;
            }
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);

            ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
                                      num, &local_qiov, local_flags);
            qemu_iovec_destroy(&local_qiov);
            if (ret < 0) {
                break;
            }
            bytes_remaining -= num;
        }
    }
    bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);

    /* The bookkeeping below runs whether or not the write succeeded */
    ++bs->write_gen;
    bdrv_set_dirty(bs, start_sector, end_sector - start_sector);

    if (bs->wr_highest_offset < offset + bytes) {
        bs->wr_highest_offset = offset + bytes;
    }

    if (ret >= 0) {
        bs->total_sectors = MAX(bs->total_sectors, end_sector);
        ret = 0;
    }

    return ret;
}
1414
1415 static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
1416                                                 int64_t offset,
1417                                                 unsigned int bytes,
1418                                                 BdrvRequestFlags flags,
1419                                                 BdrvTrackedRequest *req)
1420 {
1421     BlockDriverState *bs = child->bs;
1422     uint8_t *buf = NULL;
1423     QEMUIOVector local_qiov;
1424     struct iovec iov;
1425     uint64_t align = bs->bl.request_alignment;
1426     unsigned int head_padding_bytes, tail_padding_bytes;
1427     int ret = 0;
1428
1429     head_padding_bytes = offset & (align - 1);
1430     tail_padding_bytes = align - ((offset + bytes) & (align - 1));
1431
1432
1433     assert(flags & BDRV_REQ_ZERO_WRITE);
1434     if (head_padding_bytes || tail_padding_bytes) {
1435         buf = qemu_blockalign(bs, align);
1436         iov = (struct iovec) {
1437             .iov_base   = buf,
1438             .iov_len    = align,
1439         };
1440         qemu_iovec_init_external(&local_qiov, &iov, 1);
1441     }
1442     if (head_padding_bytes) {
1443         uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);
1444
1445         /* RMW the unaligned part before head. */
1446         mark_request_serialising(req, align);
1447         wait_serialising_requests(req);
1448         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1449         ret = bdrv_aligned_preadv(child, req, offset & ~(align - 1), align,
1450                                   align, &local_qiov, 0);
1451         if (ret < 0) {
1452             goto fail;
1453         }
1454         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1455
1456         memset(buf + head_padding_bytes, 0, zero_bytes);
1457         ret = bdrv_aligned_pwritev(child, req, offset & ~(align - 1), align,
1458                                    align, &local_qiov,
1459                                    flags & ~BDRV_REQ_ZERO_WRITE);
1460         if (ret < 0) {
1461             goto fail;
1462         }
1463         offset += zero_bytes;
1464         bytes -= zero_bytes;
1465     }
1466
1467     assert(!bytes || (offset & (align - 1)) == 0);
1468     if (bytes >= align) {
1469         /* Write the aligned part in the middle. */
1470         uint64_t aligned_bytes = bytes & ~(align - 1);
1471         ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
1472                                    NULL, flags);
1473         if (ret < 0) {
1474             goto fail;
1475         }
1476         bytes -= aligned_bytes;
1477         offset += aligned_bytes;
1478     }
1479
1480     assert(!bytes || (offset & (align - 1)) == 0);
1481     if (bytes) {
1482         assert(align == tail_padding_bytes + bytes);
1483         /* RMW the unaligned part after tail. */
1484         mark_request_serialising(req, align);
1485         wait_serialising_requests(req);
1486         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1487         ret = bdrv_aligned_preadv(child, req, offset, align,
1488                                   align, &local_qiov, 0);
1489         if (ret < 0) {
1490             goto fail;
1491         }
1492         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1493
1494         memset(buf, 0, bytes);
1495         ret = bdrv_aligned_pwritev(child, req, offset, align, align,
1496                                    &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
1497     }
1498 fail:
1499     qemu_vfree(buf);
1500     return ret;
1501
1502 }
1503
/*
 * Handle a write request in coroutine context
 *
 * A NULL @qiov is handed to bdrv_co_do_zero_pwritev().  Otherwise a request
 * that is not aligned to bs->bl.request_alignment is turned into an aligned
 * one by reading the surrounding block(s) and padding @qiov with the bytes
 * read (read-modify-write).
 *
 * Returns -ENOMEDIUM if the node has no driver, -EPERM if it is read-only,
 * the error from bdrv_check_byte_request() if that check fails, and
 * otherwise the result of the read-modify-write reads or of the write.
 */
int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriverState *bs = child->bs;
    BdrvTrackedRequest req;
    uint64_t align = bs->bl.request_alignment;
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EPERM;
    }
    assert(!(bs->open_flags & BDRV_O_INACTIVE));

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    bdrv_inc_in_flight(bs);
    /*
     * Align write if necessary by performing a read-modify-write cycle.
     * Pad qiov with the read parts and be sure to have a tracked request not
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
     */
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);

    if (!qiov) {
        /* No data buffer: this is a zero write */
        ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
        goto out;
    }

    if (offset & (align - 1)) {
        /* Unaligned start: read the block that contains it */
        QEMUIOVector head_qiov;
        struct iovec head_iov;

        mark_request_serialising(&req, align);
        wait_serialising_requests(&req);

        head_buf = qemu_blockalign(bs, align);
        head_iov = (struct iovec) {
            .iov_base   = head_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&head_qiov, &head_iov, 1);

        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(child, &req, offset & ~(align - 1), align,
                                  align, &head_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        /* New vector: the bytes read in front of the request, then the
         * caller's data */
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);

        /* We have read the tail already if the request is smaller
         * than one aligned block.
         */
        if (bytes < align) {
            qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes);
            bytes = align;
        }
    }

    if ((offset + bytes) & (align - 1)) {
        /* Unaligned end: read the block that contains it */
        QEMUIOVector tail_qiov;
        struct iovec tail_iov;
        size_t tail_bytes;
        bool waited;

        mark_request_serialising(&req, align);
        waited = wait_serialising_requests(&req);
        assert(!waited || !use_local_qiov);

        tail_buf = qemu_blockalign(bs, align);
        tail_iov = (struct iovec) {
            .iov_base   = tail_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);

        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(child, &req, (offset + bytes) & ~(align - 1),
                                  align, align, &tail_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }

        /* Append the bytes read behind the end of the request */
        tail_bytes = (offset + bytes) & (align - 1);
        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);

        bytes = ROUND_UP(bytes, align);
    }

    ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);

fail:

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }
    qemu_vfree(head_buf);
    qemu_vfree(tail_buf);
out:
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);
    return ret;
}
1638
1639 static int coroutine_fn bdrv_co_do_writev(BdrvChild *child,
1640     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1641     BdrvRequestFlags flags)
1642 {
1643     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1644         return -EINVAL;
1645     }
1646
1647     return bdrv_co_pwritev(child, sector_num << BDRV_SECTOR_BITS,
1648                            nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1649 }
1650
1651 int coroutine_fn bdrv_co_writev(BdrvChild *child, int64_t sector_num,
1652     int nb_sectors, QEMUIOVector *qiov)
1653 {
1654     trace_bdrv_co_writev(child->bs, sector_num, nb_sectors);
1655
1656     return bdrv_co_do_writev(child, sector_num, nb_sectors, qiov, 0);
1657 }
1658
1659 int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
1660                                        int count, BdrvRequestFlags flags)
1661 {
1662     trace_bdrv_co_pwrite_zeroes(child->bs, offset, count, flags);
1663
1664     if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
1665         flags &= ~BDRV_REQ_MAY_UNMAP;
1666     }
1667
1668     return bdrv_co_pwritev(child, offset, count, NULL,
1669                            BDRV_REQ_ZERO_WRITE | flags);
1670 }
1671
1672 /*
1673  * Flush ALL BDSes regardless of if they are reachable via a BlkBackend or not.
1674  */
1675 int bdrv_flush_all(void)
1676 {
1677     BdrvNextIterator it;
1678     BlockDriverState *bs = NULL;
1679     int result = 0;
1680
1681     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
1682         AioContext *aio_context = bdrv_get_aio_context(bs);
1683         int ret;
1684
1685         aio_context_acquire(aio_context);
1686         ret = bdrv_flush(bs);
1687         if (ret < 0 && !result) {
1688             result = ret;
1689         }
1690         aio_context_release(aio_context);
1691     }
1692
1693     return result;
1694 }
1695
1696
/*
 * Arguments and result of a bdrv_co_get_block_status_above() call, bundled
 * so that they can be passed to a coroutine entry point through a single
 * opaque pointer.
 */
typedef struct BdrvCoGetBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;
    BlockDriverState **file;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;
    int64_t ret;    /* value returned by bdrv_co_get_block_status_above() */
    bool done;      /* presumably set once ret is valid -- TODO confirm */
} BdrvCoGetBlockStatusData;
1707
1708 /*
1709  * Returns the allocation status of the specified sectors.
1710  * Drivers not implementing the functionality are assumed to not support
1711  * backing files, hence all their sectors are reported as allocated.
1712  *
1713  * If 'sector_num' is beyond the end of the disk image the return value is 0
1714  * and 'pnum' is set to 0.
1715  *
1716  * 'pnum' is set to the number of sectors (including and immediately following
1717  * the specified sector) that are known to be in the same
1718  * allocated/unallocated state.
1719  *
1720  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
1721  * beyond the end of the disk image it will be clamped.
1722  *
1723  * If returned value is positive and BDRV_BLOCK_OFFSET_VALID bit is set, 'file'
1724  * points to the BDS which the sector range is allocated in.
1725  */
1726 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
1727                                                      int64_t sector_num,
1728                                                      int nb_sectors, int *pnum,
1729                                                      BlockDriverState **file)
1730 {
1731     int64_t total_sectors;
1732     int64_t n;
1733     int64_t ret, ret2;
1734
1735     total_sectors = bdrv_nb_sectors(bs);
1736     if (total_sectors < 0) {
1737         return total_sectors;
1738     }
1739
1740     if (sector_num >= total_sectors) {
1741         *pnum = 0;
1742         return 0;
1743     }
1744
1745     n = total_sectors - sector_num;
1746     if (n < nb_sectors) {
1747         nb_sectors = n;
1748     }
1749
1750     if (!bs->drv->bdrv_co_get_block_status) {
1751         *pnum = nb_sectors;
1752         ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
1753         if (bs->drv->protocol_name) {
1754             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
1755         }
1756         return ret;
1757     }
1758
1759     *file = NULL;
1760     bdrv_inc_in_flight(bs);
1761     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum,
1762                                             file);
1763     if (ret < 0) {
1764         *pnum = 0;
1765         goto out;
1766     }
1767
1768     if (ret & BDRV_BLOCK_RAW) {
1769         assert(ret & BDRV_BLOCK_OFFSET_VALID);
1770         ret = bdrv_get_block_status(*file, ret >> BDRV_SECTOR_BITS,
1771                                     *pnum, pnum, file);
1772         goto out;
1773     }
1774
1775     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
1776         ret |= BDRV_BLOCK_ALLOCATED;
1777     } else {
1778         if (bdrv_unallocated_blocks_are_zero(bs)) {
1779             ret |= BDRV_BLOCK_ZERO;
1780         } else if (bs->backing) {
1781             BlockDriverState *bs2 = bs->backing->bs;
1782             int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
1783             if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
1784                 ret |= BDRV_BLOCK_ZERO;
1785             }
1786         }
1787     }
1788
1789     if (*file && *file != bs &&
1790         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
1791         (ret & BDRV_BLOCK_OFFSET_VALID)) {
1792         BlockDriverState *file2;
1793         int file_pnum;
1794
1795         ret2 = bdrv_co_get_block_status(*file, ret >> BDRV_SECTOR_BITS,
1796                                         *pnum, &file_pnum, &file2);
1797         if (ret2 >= 0) {
1798             /* Ignore errors.  This is just providing extra information, it
1799              * is useful but not necessary.
1800              */
1801             if (!file_pnum) {
1802                 /* !file_pnum indicates an offset at or beyond the EOF; it is
1803                  * perfectly valid for the format block driver to point to such
1804                  * offsets, so catch it and mark everything as zero */
1805                 ret |= BDRV_BLOCK_ZERO;
1806             } else {
1807                 /* Limit request to the range reported by the protocol driver */
1808                 *pnum = file_pnum;
1809                 ret |= (ret2 & BDRV_BLOCK_ZERO);
1810             }
1811         }
1812     }
1813
1814 out:
1815     bdrv_dec_in_flight(bs);
1816     return ret;
1817 }
1818
1819 static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs,
1820         BlockDriverState *base,
1821         int64_t sector_num,
1822         int nb_sectors,
1823         int *pnum,
1824         BlockDriverState **file)
1825 {
1826     BlockDriverState *p;
1827     int64_t ret = 0;
1828
1829     assert(bs != base);
1830     for (p = bs; p != base; p = backing_bs(p)) {
1831         ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum, file);
1832         if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) {
1833             break;
1834         }
1835         /* [sector_num, pnum] unallocated on this layer, which could be only
1836          * the first part of [sector_num, nb_sectors].  */
1837         nb_sectors = MIN(nb_sectors, *pnum);
1838     }
1839     return ret;
1840 }
1841
1842 /* Coroutine wrapper for bdrv_get_block_status_above() */
1843 static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque)
1844 {
1845     BdrvCoGetBlockStatusData *data = opaque;
1846
1847     data->ret = bdrv_co_get_block_status_above(data->bs, data->base,
1848                                                data->sector_num,
1849                                                data->nb_sectors,
1850                                                data->pnum,
1851                                                data->file);
1852     data->done = true;
1853 }
1854
1855 /*
1856  * Synchronous wrapper around bdrv_co_get_block_status_above().
1857  *
1858  * See bdrv_co_get_block_status_above() for details.
1859  */
1860 int64_t bdrv_get_block_status_above(BlockDriverState *bs,
1861                                     BlockDriverState *base,
1862                                     int64_t sector_num,
1863                                     int nb_sectors, int *pnum,
1864                                     BlockDriverState **file)
1865 {
1866     Coroutine *co;
1867     BdrvCoGetBlockStatusData data = {
1868         .bs = bs,
1869         .base = base,
1870         .file = file,
1871         .sector_num = sector_num,
1872         .nb_sectors = nb_sectors,
1873         .pnum = pnum,
1874         .done = false,
1875     };
1876
1877     if (qemu_in_coroutine()) {
1878         /* Fast-path if already in coroutine context */
1879         bdrv_get_block_status_above_co_entry(&data);
1880     } else {
1881         co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry,
1882                                    &data);
1883         bdrv_coroutine_enter(bs, co);
1884         BDRV_POLL_WHILE(bs, !data.done);
1885     }
1886     return data.ret;
1887 }
1888
1889 int64_t bdrv_get_block_status(BlockDriverState *bs,
1890                               int64_t sector_num,
1891                               int nb_sectors, int *pnum,
1892                               BlockDriverState **file)
1893 {
1894     return bdrv_get_block_status_above(bs, backing_bs(bs),
1895                                        sector_num, nb_sectors, pnum, file);
1896 }
1897
1898 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
1899                                    int nb_sectors, int *pnum)
1900 {
1901     BlockDriverState *file;
1902     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum,
1903                                         &file);
1904     if (ret < 0) {
1905         return ret;
1906     }
1907     return !!(ret & BDRV_BLOCK_ALLOCATED);
1908 }
1909
1910 /*
1911  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
1912  *
1913  * Return true if the given sector is allocated in any image between
1914  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
1915  * sector is allocated in any image of the chain.  Return false otherwise.
1916  *
1917  * 'pnum' is set to the number of sectors (including and immediately following
1918  *  the specified sector) that are known to be in the same
1919  *  allocated/unallocated state.
1920  *
1921  */
1922 int bdrv_is_allocated_above(BlockDriverState *top,
1923                             BlockDriverState *base,
1924                             int64_t sector_num,
1925                             int nb_sectors, int *pnum)
1926 {
1927     BlockDriverState *intermediate;
1928     int ret, n = nb_sectors;
1929
1930     intermediate = top;
1931     while (intermediate && intermediate != base) {
1932         int pnum_inter;
1933         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
1934                                 &pnum_inter);
1935         if (ret < 0) {
1936             return ret;
1937         } else if (ret) {
1938             *pnum = pnum_inter;
1939             return 1;
1940         }
1941
1942         /*
1943          * [sector_num, nb_sectors] is unallocated on top but intermediate
1944          * might have
1945          *
1946          * [sector_num+x, nr_sectors] allocated.
1947          */
1948         if (n > pnum_inter &&
1949             (intermediate == top ||
1950              sector_num + pnum_inter < intermediate->total_sectors)) {
1951             n = pnum_inter;
1952         }
1953
1954         intermediate = backing_bs(intermediate);
1955     }
1956
1957     *pnum = n;
1958     return 0;
1959 }
1960
1961 typedef struct BdrvVmstateCo {
1962     BlockDriverState    *bs;
1963     QEMUIOVector        *qiov;
1964     int64_t             pos;
1965     bool                is_read;
1966     int                 ret;
1967 } BdrvVmstateCo;
1968
1969 static int coroutine_fn
1970 bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
1971                    bool is_read)
1972 {
1973     BlockDriver *drv = bs->drv;
1974
1975     if (!drv) {
1976         return -ENOMEDIUM;
1977     } else if (drv->bdrv_load_vmstate) {
1978         return is_read ? drv->bdrv_load_vmstate(bs, qiov, pos)
1979                        : drv->bdrv_save_vmstate(bs, qiov, pos);
1980     } else if (bs->file) {
1981         return bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
1982     }
1983
1984     return -ENOTSUP;
1985 }
1986
1987 static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
1988 {
1989     BdrvVmstateCo *co = opaque;
1990     co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
1991 }
1992
1993 static inline int
1994 bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
1995                 bool is_read)
1996 {
1997     if (qemu_in_coroutine()) {
1998         return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
1999     } else {
2000         BdrvVmstateCo data = {
2001             .bs         = bs,
2002             .qiov       = qiov,
2003             .pos        = pos,
2004             .is_read    = is_read,
2005             .ret        = -EINPROGRESS,
2006         };
2007         Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);
2008
2009         bdrv_coroutine_enter(bs, co);
2010         while (data.ret == -EINPROGRESS) {
2011             aio_poll(bdrv_get_aio_context(bs), true);
2012         }
2013         return data.ret;
2014     }
2015 }
2016
2017 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2018                       int64_t pos, int size)
2019 {
2020     QEMUIOVector qiov;
2021     struct iovec iov = {
2022         .iov_base   = (void *) buf,
2023         .iov_len    = size,
2024     };
2025     int ret;
2026
2027     qemu_iovec_init_external(&qiov, &iov, 1);
2028
2029     ret = bdrv_writev_vmstate(bs, &qiov, pos);
2030     if (ret < 0) {
2031         return ret;
2032     }
2033
2034     return size;
2035 }
2036
2037 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
2038 {
2039     return bdrv_rw_vmstate(bs, qiov, pos, false);
2040 }
2041
2042 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2043                       int64_t pos, int size)
2044 {
2045     QEMUIOVector qiov;
2046     struct iovec iov = {
2047         .iov_base   = buf,
2048         .iov_len    = size,
2049     };
2050     int ret;
2051
2052     qemu_iovec_init_external(&qiov, &iov, 1);
2053     ret = bdrv_readv_vmstate(bs, &qiov, pos);
2054     if (ret < 0) {
2055         return ret;
2056     }
2057
2058     return size;
2059 }
2060
2061 int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
2062 {
2063     return bdrv_rw_vmstate(bs, qiov, pos, true);
2064 }
2065
2066 /**************************************************************/
2067 /* async I/Os */
2068
2069 BlockAIOCB *bdrv_aio_readv(BdrvChild *child, int64_t sector_num,
2070                            QEMUIOVector *qiov, int nb_sectors,
2071                            BlockCompletionFunc *cb, void *opaque)
2072 {
2073     trace_bdrv_aio_readv(child->bs, sector_num, nb_sectors, opaque);
2074
2075     assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size);
2076     return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov,
2077                                   0, cb, opaque, false);
2078 }
2079
2080 BlockAIOCB *bdrv_aio_writev(BdrvChild *child, int64_t sector_num,
2081                             QEMUIOVector *qiov, int nb_sectors,
2082                             BlockCompletionFunc *cb, void *opaque)
2083 {
2084     trace_bdrv_aio_writev(child->bs, sector_num, nb_sectors, opaque);
2085
2086     assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size);
2087     return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov,
2088                                   0, cb, opaque, true);
2089 }
2090
2091 void bdrv_aio_cancel(BlockAIOCB *acb)
2092 {
2093     qemu_aio_ref(acb);
2094     bdrv_aio_cancel_async(acb);
2095     while (acb->refcnt > 1) {
2096         if (acb->aiocb_info->get_aio_context) {
2097             aio_poll(acb->aiocb_info->get_aio_context(acb), true);
2098         } else if (acb->bs) {
2099             /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
2100              * assert that we're not using an I/O thread.  Thread-safe
2101              * code should use bdrv_aio_cancel_async exclusively.
2102              */
2103             assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
2104             aio_poll(bdrv_get_aio_context(acb->bs), true);
2105         } else {
2106             abort();
2107         }
2108     }
2109     qemu_aio_unref(acb);
2110 }
2111
2112 /* Async version of aio cancel. The caller is not blocked if the acb implements
2113  * cancel_async, otherwise we do nothing and let the request normally complete.
2114  * In either case the completion callback must be called. */
2115 void bdrv_aio_cancel_async(BlockAIOCB *acb)
2116 {
2117     if (acb->aiocb_info->cancel_async) {
2118         acb->aiocb_info->cancel_async(acb);
2119     }
2120 }
2121
2122 /**************************************************************/
2123 /* async block device emulation */
2124
2125 typedef struct BlockRequest {
2126     union {
2127         /* Used during read, write, trim */
2128         struct {
2129             int64_t offset;
2130             int bytes;
2131             int flags;
2132             QEMUIOVector *qiov;
2133         };
2134         /* Used during ioctl */
2135         struct {
2136             int req;
2137             void *buf;
2138         };
2139     };
2140     BlockCompletionFunc *cb;
2141     void *opaque;
2142
2143     int error;
2144 } BlockRequest;
2145
2146 typedef struct BlockAIOCBCoroutine {
2147     BlockAIOCB common;
2148     BdrvChild *child;
2149     BlockRequest req;
2150     bool is_write;
2151     bool need_bh;
2152     bool *done;
2153 } BlockAIOCBCoroutine;
2154
2155 static const AIOCBInfo bdrv_em_co_aiocb_info = {
2156     .aiocb_size         = sizeof(BlockAIOCBCoroutine),
2157 };
2158
2159 static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
2160 {
2161     if (!acb->need_bh) {
2162         bdrv_dec_in_flight(acb->common.bs);
2163         acb->common.cb(acb->common.opaque, acb->req.error);
2164         qemu_aio_unref(acb);
2165     }
2166 }
2167
2168 static void bdrv_co_em_bh(void *opaque)
2169 {
2170     BlockAIOCBCoroutine *acb = opaque;
2171
2172     assert(!acb->need_bh);
2173     bdrv_co_complete(acb);
2174 }
2175
2176 static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
2177 {
2178     acb->need_bh = false;
2179     if (acb->req.error != -EINPROGRESS) {
2180         BlockDriverState *bs = acb->common.bs;
2181
2182         aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
2183     }
2184 }
2185
2186 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
2187 static void coroutine_fn bdrv_co_do_rw(void *opaque)
2188 {
2189     BlockAIOCBCoroutine *acb = opaque;
2190
2191     if (!acb->is_write) {
2192         acb->req.error = bdrv_co_preadv(acb->child, acb->req.offset,
2193             acb->req.qiov->size, acb->req.qiov, acb->req.flags);
2194     } else {
2195         acb->req.error = bdrv_co_pwritev(acb->child, acb->req.offset,
2196             acb->req.qiov->size, acb->req.qiov, acb->req.flags);
2197     }
2198
2199     bdrv_co_complete(acb);
2200 }
2201
2202 static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child,
2203                                           int64_t offset,
2204                                           QEMUIOVector *qiov,
2205                                           BdrvRequestFlags flags,
2206                                           BlockCompletionFunc *cb,
2207                                           void *opaque,
2208                                           bool is_write)
2209 {
2210     Coroutine *co;
2211     BlockAIOCBCoroutine *acb;
2212
2213     /* Matched by bdrv_co_complete's bdrv_dec_in_flight.  */
2214     bdrv_inc_in_flight(child->bs);
2215
2216     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, child->bs, cb, opaque);
2217     acb->child = child;
2218     acb->need_bh = true;
2219     acb->req.error = -EINPROGRESS;
2220     acb->req.offset = offset;
2221     acb->req.qiov = qiov;
2222     acb->req.flags = flags;
2223     acb->is_write = is_write;
2224
2225     co = qemu_coroutine_create(bdrv_co_do_rw, acb);
2226     bdrv_coroutine_enter(child->bs, co);
2227
2228     bdrv_co_maybe_schedule_bh(acb);
2229     return &acb->common;
2230 }
2231
2232 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
2233 {
2234     BlockAIOCBCoroutine *acb = opaque;
2235     BlockDriverState *bs = acb->common.bs;
2236
2237     acb->req.error = bdrv_co_flush(bs);
2238     bdrv_co_complete(acb);
2239 }
2240
2241 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
2242         BlockCompletionFunc *cb, void *opaque)
2243 {
2244     trace_bdrv_aio_flush(bs, opaque);
2245
2246     Coroutine *co;
2247     BlockAIOCBCoroutine *acb;
2248
2249     /* Matched by bdrv_co_complete's bdrv_dec_in_flight.  */
2250     bdrv_inc_in_flight(bs);
2251
2252     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2253     acb->need_bh = true;
2254     acb->req.error = -EINPROGRESS;
2255
2256     co = qemu_coroutine_create(bdrv_aio_flush_co_entry, acb);
2257     bdrv_coroutine_enter(bs, co);
2258
2259     bdrv_co_maybe_schedule_bh(acb);
2260     return &acb->common;
2261 }
2262
2263 /**************************************************************/
2264 /* Coroutine block device emulation */
2265
2266 typedef struct FlushCo {
2267     BlockDriverState *bs;
2268     int ret;
2269 } FlushCo;
2270
2271
2272 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
2273 {
2274     FlushCo *rwco = opaque;
2275
2276     rwco->ret = bdrv_co_flush(rwco->bs);
2277 }
2278
2279 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
2280 {
2281     int ret;
2282
2283     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
2284         bdrv_is_sg(bs)) {
2285         return 0;
2286     }
2287
2288     bdrv_inc_in_flight(bs);
2289
2290     int current_gen = bs->write_gen;
2291
2292     /* Wait until any previous flushes are completed */
2293     while (bs->active_flush_req) {
2294         qemu_co_queue_wait(&bs->flush_queue, NULL);
2295     }
2296
2297     bs->active_flush_req = true;
2298
2299     /* Write back all layers by calling one driver function */
2300     if (bs->drv->bdrv_co_flush) {
2301         ret = bs->drv->bdrv_co_flush(bs);
2302         goto out;
2303     }
2304
2305     /* Write back cached data to the OS even with cache=unsafe */
2306     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
2307     if (bs->drv->bdrv_co_flush_to_os) {
2308         ret = bs->drv->bdrv_co_flush_to_os(bs);
2309         if (ret < 0) {
2310             goto out;
2311         }
2312     }
2313
2314     /* But don't actually force it to the disk with cache=unsafe */
2315     if (bs->open_flags & BDRV_O_NO_FLUSH) {
2316         goto flush_parent;
2317     }
2318
2319     /* Check if we really need to flush anything */
2320     if (bs->flushed_gen == current_gen) {
2321         goto flush_parent;
2322     }
2323
2324     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
2325     if (bs->drv->bdrv_co_flush_to_disk) {
2326         ret = bs->drv->bdrv_co_flush_to_disk(bs);
2327     } else if (bs->drv->bdrv_aio_flush) {
2328         BlockAIOCB *acb;
2329         CoroutineIOCompletion co = {
2330             .coroutine = qemu_coroutine_self(),
2331         };
2332
2333         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
2334         if (acb == NULL) {
2335             ret = -EIO;
2336         } else {
2337             qemu_coroutine_yield();
2338             ret = co.ret;
2339         }
2340     } else {
2341         /*
2342          * Some block drivers always operate in either writethrough or unsafe
2343          * mode and don't support bdrv_flush therefore. Usually qemu doesn't
2344          * know how the server works (because the behaviour is hardcoded or
2345          * depends on server-side configuration), so we can't ensure that
2346          * everything is safe on disk. Returning an error doesn't work because
2347          * that would break guests even if the server operates in writethrough
2348          * mode.
2349          *
2350          * Let's hope the user knows what he's doing.
2351          */
2352         ret = 0;
2353     }
2354
2355     if (ret < 0) {
2356         goto out;
2357     }
2358
2359     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
2360      * in the case of cache=unsafe, so there are no useless flushes.
2361      */
2362 flush_parent:
2363     ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
2364 out:
2365     /* Notify any pending flushes that we have completed */
2366     if (ret == 0) {
2367         bs->flushed_gen = current_gen;
2368     }
2369     bs->active_flush_req = false;
2370     /* Return value is ignored - it's ok if wait queue is empty */
2371     qemu_co_queue_next(&bs->flush_queue);
2372
2373     bdrv_dec_in_flight(bs);
2374     return ret;
2375 }
2376
2377 int bdrv_flush(BlockDriverState *bs)
2378 {
2379     Coroutine *co;
2380     FlushCo flush_co = {
2381         .bs = bs,
2382         .ret = NOT_DONE,
2383     };
2384
2385     if (qemu_in_coroutine()) {
2386         /* Fast-path if already in coroutine context */
2387         bdrv_flush_co_entry(&flush_co);
2388     } else {
2389         co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
2390         bdrv_coroutine_enter(bs, co);
2391         BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
2392     }
2393
2394     return flush_co.ret;
2395 }
2396
2397 typedef struct DiscardCo {
2398     BlockDriverState *bs;
2399     int64_t offset;
2400     int count;
2401     int ret;
2402 } DiscardCo;
2403 static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
2404 {
2405     DiscardCo *rwco = opaque;
2406
2407     rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->count);
2408 }
2409
2410 int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
2411                                   int count)
2412 {
2413     BdrvTrackedRequest req;
2414     int max_pdiscard, ret;
2415     int head, tail, align;
2416
2417     if (!bs->drv) {
2418         return -ENOMEDIUM;
2419     }
2420
2421     ret = bdrv_check_byte_request(bs, offset, count);
2422     if (ret < 0) {
2423         return ret;
2424     } else if (bs->read_only) {
2425         return -EPERM;
2426     }
2427     assert(!(bs->open_flags & BDRV_O_INACTIVE));
2428
2429     /* Do nothing if disabled.  */
2430     if (!(bs->open_flags & BDRV_O_UNMAP)) {
2431         return 0;
2432     }
2433
2434     if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
2435         return 0;
2436     }
2437
2438     /* Discard is advisory, but some devices track and coalesce
2439      * unaligned requests, so we must pass everything down rather than
2440      * round here.  Still, most devices will just silently ignore
2441      * unaligned requests (by returning -ENOTSUP), so we must fragment
2442      * the request accordingly.  */
2443     align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
2444     assert(align % bs->bl.request_alignment == 0);
2445     head = offset % align;
2446     tail = (offset + count) % align;
2447
2448     bdrv_inc_in_flight(bs);
2449     tracked_request_begin(&req, bs, offset, count, BDRV_TRACKED_DISCARD);
2450
2451     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
2452     if (ret < 0) {
2453         goto out;
2454     }
2455
2456     max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
2457                                    align);
2458     assert(max_pdiscard >= bs->bl.request_alignment);
2459
2460     while (count > 0) {
2461         int ret;
2462         int num = count;
2463
2464         if (head) {
2465             /* Make small requests to get to alignment boundaries. */
2466             num = MIN(count, align - head);
2467             if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
2468                 num %= bs->bl.request_alignment;
2469             }
2470             head = (head + num) % align;
2471             assert(num < max_pdiscard);
2472         } else if (tail) {
2473             if (num > align) {
2474                 /* Shorten the request to the last aligned cluster.  */
2475                 num -= tail;
2476             } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
2477                        tail > bs->bl.request_alignment) {
2478                 tail %= bs->bl.request_alignment;
2479                 num -= tail;
2480             }
2481         }
2482         /* limit request size */
2483         if (num > max_pdiscard) {
2484             num = max_pdiscard;
2485         }
2486
2487         if (bs->drv->bdrv_co_pdiscard) {
2488             ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
2489         } else {
2490             BlockAIOCB *acb;
2491             CoroutineIOCompletion co = {
2492                 .coroutine = qemu_coroutine_self(),
2493             };
2494
2495             acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
2496                                              bdrv_co_io_em_complete, &co);
2497             if (acb == NULL) {
2498                 ret = -EIO;
2499                 goto out;
2500             } else {
2501                 qemu_coroutine_yield();
2502                 ret = co.ret;
2503             }
2504         }
2505         if (ret && ret != -ENOTSUP) {
2506             goto out;
2507         }
2508
2509         offset += num;
2510         count -= num;
2511     }
2512     ret = 0;
2513 out:
2514     ++bs->write_gen;
2515     bdrv_set_dirty(bs, req.offset >> BDRV_SECTOR_BITS,
2516                    req.bytes >> BDRV_SECTOR_BITS);
2517     tracked_request_end(&req);
2518     bdrv_dec_in_flight(bs);
2519     return ret;
2520 }
2521
2522 int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count)
2523 {
2524     Coroutine *co;
2525     DiscardCo rwco = {
2526         .bs = bs,
2527         .offset = offset,
2528         .count = count,
2529         .ret = NOT_DONE,
2530     };
2531
2532     if (qemu_in_coroutine()) {
2533         /* Fast-path if already in coroutine context */
2534         bdrv_pdiscard_co_entry(&rwco);
2535     } else {
2536         co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
2537         bdrv_coroutine_enter(bs, co);
2538         BDRV_POLL_WHILE(bs, rwco.ret == NOT_DONE);
2539     }
2540
2541     return rwco.ret;
2542 }
2543
2544 int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
2545 {
2546     BlockDriver *drv = bs->drv;
2547     CoroutineIOCompletion co = {
2548         .coroutine = qemu_coroutine_self(),
2549     };
2550     BlockAIOCB *acb;
2551
2552     bdrv_inc_in_flight(bs);
2553     if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
2554         co.ret = -ENOTSUP;
2555         goto out;
2556     }
2557
2558     if (drv->bdrv_co_ioctl) {
2559         co.ret = drv->bdrv_co_ioctl(bs, req, buf);
2560     } else {
2561         acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
2562         if (!acb) {
2563             co.ret = -ENOTSUP;
2564             goto out;
2565         }
2566         qemu_coroutine_yield();
2567     }
2568 out:
2569     bdrv_dec_in_flight(bs);
2570     return co.ret;
2571 }
2572
2573 void *qemu_blockalign(BlockDriverState *bs, size_t size)
2574 {
2575     return qemu_memalign(bdrv_opt_mem_align(bs), size);
2576 }
2577
2578 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
2579 {
2580     return memset(qemu_blockalign(bs, size), 0, size);
2581 }
2582
2583 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
2584 {
2585     size_t align = bdrv_opt_mem_align(bs);
2586
2587     /* Ensure that NULL is never returned on success */
2588     assert(align > 0);
2589     if (size == 0) {
2590         size = align;
2591     }
2592
2593     return qemu_try_memalign(align, size);
2594 }
2595
2596 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
2597 {
2598     void *mem = qemu_try_blockalign(bs, size);
2599
2600     if (mem) {
2601         memset(mem, 0, size);
2602     }
2603
2604     return mem;
2605 }
2606
2607 /*
2608  * Check if all memory in this vector is sector aligned.
2609  */
2610 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
2611 {
2612     int i;
2613     size_t alignment = bdrv_min_mem_align(bs);
2614
2615     for (i = 0; i < qiov->niov; i++) {
2616         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
2617             return false;
2618         }
2619         if (qiov->iov[i].iov_len % alignment) {
2620             return false;
2621         }
2622     }
2623
2624     return true;
2625 }
2626
2627 void bdrv_add_before_write_notifier(BlockDriverState *bs,
2628                                     NotifierWithReturn *notifier)
2629 {
2630     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
2631 }
2632
2633 void bdrv_io_plug(BlockDriverState *bs)
2634 {
2635     BdrvChild *child;
2636
2637     QLIST_FOREACH(child, &bs->children, next) {
2638         bdrv_io_plug(child->bs);
2639     }
2640
2641     if (bs->io_plugged++ == 0) {
2642         BlockDriver *drv = bs->drv;
2643         if (drv && drv->bdrv_io_plug) {
2644             drv->bdrv_io_plug(bs);
2645         }
2646     }
2647 }
2648
2649 void bdrv_io_unplug(BlockDriverState *bs)
2650 {
2651     BdrvChild *child;
2652
2653     assert(bs->io_plugged);
2654     if (--bs->io_plugged == 0) {
2655         BlockDriver *drv = bs->drv;
2656         if (drv && drv->bdrv_io_unplug) {
2657             drv->bdrv_io_unplug(bs);
2658         }
2659     }
2660
2661     QLIST_FOREACH(child, &bs->children, next) {
2662         bdrv_io_unplug(child->bs);
2663     }
2664 }