commit: Switch commit_populate() to byte-based

[mirror_qemu.git] / block / io.c
diff --git a/block/io.c b/block/io.c

index 8706bfa5782db3162f9f49c44ab5a63d8502b251..5c146b5a109afda94500d3cf81326126f5b91180 100644
--- a/block/io.c
+++ b/block/io.c
@@ -26,6 +26,7 @@
  #include "trace.h"
  #include "sysemu/block-backend.h"
  #include "block/blockjob.h"
+#include "block/blockjob_int.h"
  #include "block/block_int.h"
  #include "qemu/cutils.h"
  #include "qapi/error.h"
@@ -33,16 +34,8 @@
  
  #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
  
-static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child,
-                                          int64_t offset,
-                                          QEMUIOVector *qiov,
-                                          BdrvRequestFlags flags,
-                                          BlockCompletionFunc *cb,
-                                          void *opaque,
-                                          bool is_write);
-static void coroutine_fn bdrv_co_do_rw(void *opaque);
  static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
-    int64_t offset, int count, BdrvRequestFlags flags);
+    int64_t offset, int bytes, BdrvRequestFlags flags);
  
  void bdrv_parent_drained_begin(BlockDriverState *bs)
  {
@@ -129,13 +122,13 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
   */
  void bdrv_enable_copy_on_read(BlockDriverState *bs)
  {
-    bs->copy_on_read++;
+    atomic_inc(&bs->copy_on_read);
  }
  
  void bdrv_disable_copy_on_read(BlockDriverState *bs)
  {
-    assert(bs->copy_on_read > 0);
-    bs->copy_on_read--;
+    int old = atomic_fetch_dec(&bs->copy_on_read);
+    assert(old >= 1);
  }
  
  /* Check if any requests are in-flight (including throttled requests) */
@@ -158,7 +151,7 @@ bool bdrv_requests_pending(BlockDriverState *bs)
  
  static bool bdrv_drain_recurse(BlockDriverState *bs)
  {
-    BdrvChild *child;
+    BdrvChild *child, *tmp;
      bool waited;
  
      waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);
@@ -167,8 +160,25 @@ static bool bdrv_drain_recurse(BlockDriverState *bs)
          bs->drv->bdrv_drain(bs);
      }
  
-    QLIST_FOREACH(child, &bs->children, next) {
-        waited |= bdrv_drain_recurse(child->bs);
+    QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
+        BlockDriverState *bs = child->bs;
+        bool in_main_loop =
+            qemu_get_current_aio_context() == qemu_get_aio_context();
+        assert(bs->refcnt > 0);
+        if (in_main_loop) {
+            /* In case the recursive bdrv_drain_recurse processes a
+             * block_job_defer_to_main_loop BH and modifies the graph,
+             * let's hold a reference to bs until we are done.
+             *
+             * IOThread doesn't have such a BH, and it is not safe to call
+             * bdrv_unref without BQL, so skip doing it there.
+             */
+            bdrv_ref(bs);
+        }
+        waited |= bdrv_drain_recurse(bs);
+        if (in_main_loop) {
+            bdrv_unref(bs);
+        }
      }
  
      return waited;
@@ -223,7 +233,7 @@ void bdrv_drained_begin(BlockDriverState *bs)
          return;
      }
  
-    if (!bs->quiesce_counter++) {
+    if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
          aio_disable_external(bdrv_get_aio_context(bs));
          bdrv_parent_drained_begin(bs);
      }
@@ -234,7 +244,7 @@ void bdrv_drained_begin(BlockDriverState *bs)
  void bdrv_drained_end(BlockDriverState *bs)
  {
      assert(bs->quiesce_counter > 0);
-    if (--bs->quiesce_counter > 0) {
+    if (atomic_fetch_dec(&bs->quiesce_counter) > 1) {
          return;
      }
  
@@ -284,16 +294,9 @@ void bdrv_drain_all_begin(void)
      bool waited = true;
      BlockDriverState *bs;
      BdrvNextIterator it;
-    BlockJob *job = NULL;
      GSList *aio_ctxs = NULL, *ctx;
  
-    while ((job = block_job_next(job))) {
-        AioContext *aio_context = blk_get_aio_context(job->blk);
-
-        aio_context_acquire(aio_context);
-        block_job_pause(job);
-        aio_context_release(aio_context);
-    }
+    block_job_pause_all();
  
      for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
          AioContext *aio_context = bdrv_get_aio_context(bs);
@@ -337,7 +340,6 @@ void bdrv_drain_all_end(void)
  {
      BlockDriverState *bs;
      BdrvNextIterator it;
-    BlockJob *job = NULL;
  
      for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
          AioContext *aio_context = bdrv_get_aio_context(bs);
@@ -348,13 +350,7 @@ void bdrv_drain_all_end(void)
          aio_context_release(aio_context);
      }
  
-    while ((job = block_job_next(job))) {
-        AioContext *aio_context = blk_get_aio_context(job->blk);
-
-        aio_context_acquire(aio_context);
-        block_job_resume(job);
-        aio_context_release(aio_context);
-    }
+    block_job_resume_all();
  }
  
  void bdrv_drain_all(void)
@@ -371,11 +367,13 @@ void bdrv_drain_all(void)
  static void tracked_request_end(BdrvTrackedRequest *req)
  {
      if (req->serialising) {
-        req->bs->serialising_in_flight--;
+        atomic_dec(&req->bs->serialising_in_flight);
      }
  
+    qemu_co_mutex_lock(&req->bs->reqs_lock);
      QLIST_REMOVE(req, list);
      qemu_co_queue_restart_all(&req->wait_queue);
+    qemu_co_mutex_unlock(&req->bs->reqs_lock);
  }
  
  /**
@@ -400,7 +398,9 @@ static void tracked_request_begin(BdrvTrackedRequest *req,
  
      qemu_co_queue_init(&req->wait_queue);
  
+    qemu_co_mutex_lock(&bs->reqs_lock);
      QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
+    qemu_co_mutex_unlock(&bs->reqs_lock);
  }
  
  static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
@@ -410,7 +410,7 @@ static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
                                 - overlap_offset;
  
      if (!req->serialising) {
-        req->bs->serialising_in_flight++;
+        atomic_inc(&req->bs->serialising_in_flight);
          req->serialising = true;
      }
  
@@ -497,7 +497,8 @@ static void dummy_bh_cb(void *opaque)
  
  void bdrv_wakeup(BlockDriverState *bs)
  {
-    if (bs->wakeup) {
+    /* The barrier (or an atomic op) is in the caller.  */
+    if (atomic_read(&bs->wakeup)) {
          aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL);
      }
  }
@@ -515,12 +516,13 @@ static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
      bool retry;
      bool waited = false;
  
-    if (!bs->serialising_in_flight) {
+    if (!atomic_read(&bs->serialising_in_flight)) {
          return false;
      }
  
      do {
          retry = false;
+        qemu_co_mutex_lock(&bs->reqs_lock);
          QLIST_FOREACH(req, &bs->tracked_requests, list) {
              if (req == self || (!req->serialising && !self->serialising)) {
                  continue;
@@ -539,7 +541,7 @@ static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
                   * (instead of producing a deadlock in the former case). */
                  if (!req->waiting_for) {
                      self->waiting_for = req;
-                    qemu_co_queue_wait(&req->wait_queue, NULL);
+                    qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
                      self->waiting_for = NULL;
                      retry = true;
                      waited = true;
@@ -547,6 +549,7 @@ static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
                  }
              }
          }
+        qemu_co_mutex_unlock(&bs->reqs_lock);
      } while (retry);
  
      return waited;
@@ -663,12 +666,12 @@ int bdrv_write(BdrvChild *child, int64_t sector_num,
  }
  
  int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
-                       int count, BdrvRequestFlags flags)
+                       int bytes, BdrvRequestFlags flags)
  {
      QEMUIOVector qiov;
      struct iovec iov = {
          .iov_base = NULL,
-        .iov_len = count,
+        .iov_len = bytes,
      };
  
      qemu_iovec_init_external(&qiov, &iov, 1);
@@ -1140,7 +1143,7 @@ int coroutine_fn bdrv_co_preadv(BdrvChild *child,
      bdrv_inc_in_flight(bs);
  
      /* Don't do copy-on-read if we read data before write operation */
-    if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) {
+    if (atomic_read(&bs->copy_on_read) && !(flags & BDRV_REQ_NO_SERIALISING)) {
          flags |= BDRV_REQ_COPY_ON_READ;
      }
  
@@ -1209,7 +1212,7 @@ int coroutine_fn bdrv_co_readv(BdrvChild *child, int64_t sector_num,
  #define MAX_WRITE_ZEROES_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
  
  static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
-    int64_t offset, int count, BdrvRequestFlags flags)
+    int64_t offset, int bytes, BdrvRequestFlags flags)
  {
      BlockDriver *drv = bs->drv;
      QEMUIOVector qiov;
@@ -1227,12 +1230,12 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
  
      assert(alignment % bs->bl.request_alignment == 0);
      head = offset % alignment;
-    tail = (offset + count) % alignment;
+    tail = (offset + bytes) % alignment;
      max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
      assert(max_write_zeroes >= bs->bl.request_alignment);
  
-    while (count > 0 && !ret) {
-        int num = count;
+    while (bytes > 0 && !ret) {
+        int num = bytes;
  
          /* Align request.  Block drivers can expect the "bulk" of the request
           * to be aligned, and that unaligned requests do not cross cluster
@@ -1242,7 +1245,7 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
              /* Make a small request up to the first aligned sector. For
               * convenience, limit this request to max_transfer even if
               * we don't need to fall back to writes.  */
-            num = MIN(MIN(count, max_transfer), alignment - head);
+            num = MIN(MIN(bytes, max_transfer), alignment - head);
              head = (head + num) % alignment;
              assert(num < max_write_zeroes);
          } else if (tail && num > alignment) {
@@ -1303,7 +1306,7 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
          }
  
          offset += num;
-        count -= num;
+        bytes -= num;
      }
  
  fail:
@@ -1345,16 +1348,8 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
      assert(!waited || !req->serialising);
      assert(req->overlap_offset <= offset);
      assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
-    /* FIXME: Block migration uses the BlockBackend of the guest device at a
-     *        point when it has not yet taken write permissions. This will be
-     *        fixed by a future patch, but for now we have to bypass this
-     *        assertion for block migration to work. */
-    // assert(child->perm & BLK_PERM_WRITE);
-    /* FIXME: Because of the above, we also cannot guarantee that all format
-     *        BDS take the BLK_PERM_RESIZE permission on their file BDS, since
-     *        they are not obligated to do so if they do not have any parent
-     *        that has taken the permission to write to them. */
-    // assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);
+    assert(child->perm & BLK_PERM_WRITE);
+    assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);
  
      ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
  
@@ -1405,12 +1400,10 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
      }
      bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
  
-    ++bs->write_gen;
+    atomic_inc(&bs->write_gen);
      bdrv_set_dirty(bs, start_sector, end_sector - start_sector);
  
-    if (bs->wr_highest_offset < offset + bytes) {
-        bs->wr_highest_offset = offset + bytes;
-    }
+    stat64_max(&bs->wr_highest_offset, offset + bytes);
  
      if (ret >= 0) {
          bs->total_sectors = MAX(bs->total_sectors, end_sector);
@@ -1435,7 +1428,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
      int ret = 0;
  
      head_padding_bytes = offset & (align - 1);
-    tail_padding_bytes = align - ((offset + bytes) & (align - 1));
+    tail_padding_bytes = (align - (offset + bytes)) & (align - 1);
  
  
      assert(flags & BDRV_REQ_ZERO_WRITE);
@@ -1665,15 +1658,15 @@ int coroutine_fn bdrv_co_writev(BdrvChild *child, int64_t sector_num,
  }
  
  int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
-                                       int count, BdrvRequestFlags flags)
+                                       int bytes, BdrvRequestFlags flags)
  {
-    trace_bdrv_co_pwrite_zeroes(child->bs, offset, count, flags);
+    trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);
  
      if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
          flags &= ~BDRV_REQ_MAY_UNMAP;
      }
  
-    return bdrv_co_pwritev(child, offset, count, NULL,
+    return bdrv_co_pwritev(child, offset, bytes, NULL,
                             BDRV_REQ_ZERO_WRITE | flags);
  }
  
@@ -1718,15 +1711,16 @@ typedef struct BdrvCoGetBlockStatusData {
   * Drivers not implementing the functionality are assumed to not support
   * backing files, hence all their sectors are reported as allocated.
   *
- * If 'sector_num' is beyond the end of the disk image the return value is 0
- * and 'pnum' is set to 0.
+ * If 'sector_num' is beyond the end of the disk image the return value is
+ * BDRV_BLOCK_EOF and 'pnum' is set to 0.
   *
   * 'pnum' is set to the number of sectors (including and immediately following
   * the specified sector) that are known to be in the same
   * allocated/unallocated state.
   *
   * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
- * beyond the end of the disk image it will be clamped.
+ * beyond the end of the disk image it will be clamped; if 'pnum' is set to
+ * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
   *
   * If returned value is positive and BDRV_BLOCK_OFFSET_VALID bit is set, 'file'
   * points to the BDS which the sector range is allocated in.
@@ -1740,6 +1734,7 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
      int64_t n;
      int64_t ret, ret2;
  
+    *file = NULL;
      total_sectors = bdrv_nb_sectors(bs);
      if (total_sectors < 0) {
          return total_sectors;
@@ -1747,7 +1742,7 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
  
      if (sector_num >= total_sectors) {
          *pnum = 0;
-        return 0;
+        return BDRV_BLOCK_EOF;
      }
  
      n = total_sectors - sector_num;
@@ -1758,13 +1753,16 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
      if (!bs->drv->bdrv_co_get_block_status) {
          *pnum = nb_sectors;
          ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
+        if (sector_num + nb_sectors == total_sectors) {
+            ret |= BDRV_BLOCK_EOF;
+        }
          if (bs->drv->protocol_name) {
              ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
+            *file = bs;
          }
          return ret;
      }
  
-    *file = NULL;
      bdrv_inc_in_flight(bs);
      ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum,
                                              file);
@@ -1774,9 +1772,9 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
      }
  
      if (ret & BDRV_BLOCK_RAW) {
-        assert(ret & BDRV_BLOCK_OFFSET_VALID);
-        ret = bdrv_get_block_status(*file, ret >> BDRV_SECTOR_BITS,
-                                    *pnum, pnum, file);
+        assert(ret & BDRV_BLOCK_OFFSET_VALID && *file);
+        ret = bdrv_co_get_block_status(*file, ret >> BDRV_SECTOR_BITS,
+                                       *pnum, pnum, file);
          goto out;
      }
  
@@ -1806,10 +1804,13 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
              /* Ignore errors.  This is just providing extra information, it
               * is useful but not necessary.
               */
-            if (!file_pnum) {
-                /* !file_pnum indicates an offset at or beyond the EOF; it is
-                 * perfectly valid for the format block driver to point to such
-                 * offsets, so catch it and mark everything as zero */
+            if (ret2 & BDRV_BLOCK_EOF &&
+                (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) {
+                /*
+                 * It is valid for the format block driver to read
+                 * beyond the end of the underlying file's current
+                 * size; such areas read as zero.
+                 */
                  ret |= BDRV_BLOCK_ZERO;
              } else {
                  /* Limit request to the range reported by the protocol driver */
@@ -1821,6 +1822,9 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
  
  out:
      bdrv_dec_in_flight(bs);
+    if (ret >= 0 && sector_num + *pnum == total_sectors) {
+        ret |= BDRV_BLOCK_EOF;
+    }
      return ret;
  }
  
@@ -1833,16 +1837,30 @@ static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs,
  {
      BlockDriverState *p;
      int64_t ret = 0;
+    bool first = true;
  
      assert(bs != base);
      for (p = bs; p != base; p = backing_bs(p)) {
          ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum, file);
-        if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) {
+        if (ret < 0) {
+            break;
+        }
+        if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) {
+            /*
+             * Reading beyond the end of the file continues to read
+             * zeroes, but we can only widen the result to the
+             * unallocated length we learned from an earlier
+             * iteration.
+             */
+            *pnum = nb_sectors;
+        }
+        if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) {
              break;
          }
          /* [sector_num, pnum] unallocated on this layer, which could be only
           * the first part of [sector_num, nb_sectors].  */
          nb_sectors = MIN(nb_sectors, *pnum);
+        first = false;
      }
      return ret;
  }
@@ -1979,17 +1997,24 @@ bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
                     bool is_read)
  {
      BlockDriver *drv = bs->drv;
+    int ret = -ENOTSUP;
+
+    bdrv_inc_in_flight(bs);
  
      if (!drv) {
-        return -ENOMEDIUM;
+        ret = -ENOMEDIUM;
      } else if (drv->bdrv_load_vmstate) {
-        return is_read ? drv->bdrv_load_vmstate(bs, qiov, pos)
-                       : drv->bdrv_save_vmstate(bs, qiov, pos);
+        if (is_read) {
+            ret = drv->bdrv_load_vmstate(bs, qiov, pos);
+        } else {
+            ret = drv->bdrv_save_vmstate(bs, qiov, pos);
+        }
      } else if (bs->file) {
-        return bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
+        ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
      }
  
-    return -ENOTSUP;
+    bdrv_dec_in_flight(bs);
+    return ret;
  }
  
  static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
@@ -2015,9 +2040,7 @@ bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
          Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);
  
          bdrv_coroutine_enter(bs, co);
-        while (data.ret == -EINPROGRESS) {
-            aio_poll(bdrv_get_aio_context(bs), true);
-        }
+        BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
          return data.ret;
      }
  }
@@ -2074,28 +2097,6 @@ int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
  /**************************************************************/
  /* async I/Os */
  
-BlockAIOCB *bdrv_aio_readv(BdrvChild *child, int64_t sector_num,
-                           QEMUIOVector *qiov, int nb_sectors,
-                           BlockCompletionFunc *cb, void *opaque)
-{
-    trace_bdrv_aio_readv(child->bs, sector_num, nb_sectors, opaque);
-
-    assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size);
-    return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov,
-                                  0, cb, opaque, false);
-}
-
-BlockAIOCB *bdrv_aio_writev(BdrvChild *child, int64_t sector_num,
-                            QEMUIOVector *qiov, int nb_sectors,
-                            BlockCompletionFunc *cb, void *opaque)
-{
-    trace_bdrv_aio_writev(child->bs, sector_num, nb_sectors, opaque);
-
-    assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size);
-    return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov,
-                                  0, cb, opaque, true);
-}
-
  void bdrv_aio_cancel(BlockAIOCB *acb)
  {
      qemu_aio_ref(acb);
@@ -2127,147 +2128,6 @@ void bdrv_aio_cancel_async(BlockAIOCB *acb)
      }
  }
  
-/**************************************************************/
-/* async block device emulation */
-
-typedef struct BlockRequest {
-    union {
-        /* Used during read, write, trim */
-        struct {
-            int64_t offset;
-            int bytes;
-            int flags;
-            QEMUIOVector *qiov;
-        };
-        /* Used during ioctl */
-        struct {
-            int req;
-            void *buf;
-        };
-    };
-    BlockCompletionFunc *cb;
-    void *opaque;
-
-    int error;
-} BlockRequest;
-
-typedef struct BlockAIOCBCoroutine {
-    BlockAIOCB common;
-    BdrvChild *child;
-    BlockRequest req;
-    bool is_write;
-    bool need_bh;
-    bool *done;
-} BlockAIOCBCoroutine;
-
-static const AIOCBInfo bdrv_em_co_aiocb_info = {
-    .aiocb_size         = sizeof(BlockAIOCBCoroutine),
-};
-
-static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
-{
-    if (!acb->need_bh) {
-        bdrv_dec_in_flight(acb->common.bs);
-        acb->common.cb(acb->common.opaque, acb->req.error);
-        qemu_aio_unref(acb);
-    }
-}
-
-static void bdrv_co_em_bh(void *opaque)
-{
-    BlockAIOCBCoroutine *acb = opaque;
-
-    assert(!acb->need_bh);
-    bdrv_co_complete(acb);
-}
-
-static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
-{
-    acb->need_bh = false;
-    if (acb->req.error != -EINPROGRESS) {
-        BlockDriverState *bs = acb->common.bs;
-
-        aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
-    }
-}
-
-/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
-static void coroutine_fn bdrv_co_do_rw(void *opaque)
-{
-    BlockAIOCBCoroutine *acb = opaque;
-
-    if (!acb->is_write) {
-        acb->req.error = bdrv_co_preadv(acb->child, acb->req.offset,
-            acb->req.qiov->size, acb->req.qiov, acb->req.flags);
-    } else {
-        acb->req.error = bdrv_co_pwritev(acb->child, acb->req.offset,
-            acb->req.qiov->size, acb->req.qiov, acb->req.flags);
-    }
-
-    bdrv_co_complete(acb);
-}
-
-static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child,
-                                          int64_t offset,
-                                          QEMUIOVector *qiov,
-                                          BdrvRequestFlags flags,
-                                          BlockCompletionFunc *cb,
-                                          void *opaque,
-                                          bool is_write)
-{
-    Coroutine *co;
-    BlockAIOCBCoroutine *acb;
-
-    /* Matched by bdrv_co_complete's bdrv_dec_in_flight.  */
-    bdrv_inc_in_flight(child->bs);
-
-    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, child->bs, cb, opaque);
-    acb->child = child;
-    acb->need_bh = true;
-    acb->req.error = -EINPROGRESS;
-    acb->req.offset = offset;
-    acb->req.qiov = qiov;
-    acb->req.flags = flags;
-    acb->is_write = is_write;
-
-    co = qemu_coroutine_create(bdrv_co_do_rw, acb);
-    bdrv_coroutine_enter(child->bs, co);
-
-    bdrv_co_maybe_schedule_bh(acb);
-    return &acb->common;
-}
-
-static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
-{
-    BlockAIOCBCoroutine *acb = opaque;
-    BlockDriverState *bs = acb->common.bs;
-
-    acb->req.error = bdrv_co_flush(bs);
-    bdrv_co_complete(acb);
-}
-
-BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
-        BlockCompletionFunc *cb, void *opaque)
-{
-    trace_bdrv_aio_flush(bs, opaque);
-
-    Coroutine *co;
-    BlockAIOCBCoroutine *acb;
-
-    /* Matched by bdrv_co_complete's bdrv_dec_in_flight.  */
-    bdrv_inc_in_flight(bs);
-
-    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
-    acb->need_bh = true;
-    acb->req.error = -EINPROGRESS;
-
-    co = qemu_coroutine_create(bdrv_aio_flush_co_entry, acb);
-    bdrv_coroutine_enter(bs, co);
-
-    bdrv_co_maybe_schedule_bh(acb);
-    return &acb->common;
-}
-
  /**************************************************************/
  /* Coroutine block device emulation */
  
@@ -2291,19 +2151,22 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
  
      bdrv_inc_in_flight(bs);
  
-    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
+    if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
          bdrv_is_sg(bs)) {
          goto early_exit;
      }
  
-    current_gen = bs->write_gen;
+    qemu_co_mutex_lock(&bs->reqs_lock);
+    current_gen = atomic_read(&bs->write_gen);
  
      /* Wait until any previous flushes are completed */
      while (bs->active_flush_req) {
-        qemu_co_queue_wait(&bs->flush_queue, NULL);
+        qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock);
      }
  
+    /* Flushes reach this point in nondecreasing current_gen order.  */
      bs->active_flush_req = true;
+    qemu_co_mutex_unlock(&bs->reqs_lock);
  
      /* Write back all layers by calling one driver function */
      if (bs->drv->bdrv_co_flush) {
@@ -2375,9 +2238,12 @@ out:
      if (ret == 0) {
          bs->flushed_gen = current_gen;
      }
+
+    qemu_co_mutex_lock(&bs->reqs_lock);
      bs->active_flush_req = false;
      /* Return value is ignored - it's ok if wait queue is empty */
      qemu_co_queue_next(&bs->flush_queue);
+    qemu_co_mutex_unlock(&bs->reqs_lock);
  
  early_exit:
      bdrv_dec_in_flight(bs);
@@ -2407,18 +2273,18 @@ int bdrv_flush(BlockDriverState *bs)
  typedef struct DiscardCo {
      BlockDriverState *bs;
      int64_t offset;
-    int count;
+    int bytes;
      int ret;
  } DiscardCo;
  static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
  {
      DiscardCo *rwco = opaque;
  
-    rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->count);
+    rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->bytes);
  }
  
  int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
-                                  int count)
+                                  int bytes)
  {
      BdrvTrackedRequest req;
      int max_pdiscard, ret;
@@ -2428,7 +2294,7 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
          return -ENOMEDIUM;
      }
  
-    ret = bdrv_check_byte_request(bs, offset, count);
+    ret = bdrv_check_byte_request(bs, offset, bytes);
      if (ret < 0) {
          return ret;
      } else if (bs->read_only) {
@@ -2453,10 +2319,10 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
      align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
      assert(align % bs->bl.request_alignment == 0);
      head = offset % align;
-    tail = (offset + count) % align;
+    tail = (offset + bytes) % align;
  
      bdrv_inc_in_flight(bs);
-    tracked_request_begin(&req, bs, offset, count, BDRV_TRACKED_DISCARD);
+    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);
  
      ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
      if (ret < 0) {
@@ -2467,13 +2333,13 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
                                     align);
      assert(max_pdiscard >= bs->bl.request_alignment);
  
-    while (count > 0) {
+    while (bytes > 0) {
          int ret;
-        int num = count;
+        int num = bytes;
  
          if (head) {
              /* Make small requests to get to alignment boundaries. */
-            num = MIN(count, align - head);
+            num = MIN(bytes, align - head);
              if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
                  num %= bs->bl.request_alignment;
              }
@@ -2517,11 +2383,11 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
          }
  
          offset += num;
-        count -= num;
+        bytes -= num;
      }
      ret = 0;
  out:
-    ++bs->write_gen;
+    atomic_inc(&bs->write_gen);
      bdrv_set_dirty(bs, req.offset >> BDRV_SECTOR_BITS,
                     req.bytes >> BDRV_SECTOR_BITS);
      tracked_request_end(&req);
@@ -2529,13 +2395,13 @@ out:
      return ret;
  }
  
-int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count)
+int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
  {
      Coroutine *co;
      DiscardCo rwco = {
          .bs = bs,
          .offset = offset,
-        .count = count,
+        .bytes = bytes,
          .ret = NOT_DONE,
      };
  
@@ -2648,7 +2514,7 @@ void bdrv_io_plug(BlockDriverState *bs)
          bdrv_io_plug(child->bs);
      }
  
-    if (bs->io_plugged++ == 0) {
+    if (atomic_fetch_inc(&bs->io_plugged) == 0) {
          BlockDriver *drv = bs->drv;
          if (drv && drv->bdrv_io_plug) {
              drv->bdrv_io_plug(bs);
@@ -2661,7 +2527,7 @@ void bdrv_io_unplug(BlockDriverState *bs)
      BdrvChild *child;
  
      assert(bs->io_plugged);
-    if (--bs->io_plugged == 0) {
+    if (atomic_fetch_dec(&bs->io_plugged) == 1) {
          BlockDriver *drv = bs->drv;
          if (drv && drv->bdrv_io_unplug) {
              drv->bdrv_io_unplug(bs);