]> git.proxmox.com Git - mirror_qemu.git/blobdiff - migration/block.c
io: fix incoming client socket initialization
[mirror_qemu.git] / migration / block.c
index ed865ed23bea60b0e74f37e8a06aeb809f73fb3c..7734ff728a57f9b919f1286fbba3ab074fedb054 100644 (file)
  * GNU GPL, version 2 or (at your option) any later version.
  */
 
+#include "qemu/osdep.h"
+#include "qapi/error.h"
 #include "qemu-common.h"
 #include "block/block.h"
 #include "qemu/error-report.h"
 #include "qemu/main-loop.h"
 #include "hw/hw.h"
+#include "qemu/cutils.h"
 #include "qemu/queue.h"
 #include "qemu/timer.h"
 #include "migration/block.h"
 #include "migration/migration.h"
 #include "sysemu/blockdev.h"
 #include "sysemu/block-backend.h"
-#include <assert.h>
 
 #define BLOCK_SIZE                       (1 << 20)
 #define BDRV_SECTORS_PER_DIRTY_CHUNK     (BLOCK_SIZE >> BDRV_SECTOR_BITS)
@@ -36,6 +38,8 @@
 
 #define MAX_IS_ALLOCATED_SEARCH 65536
 
+#define MAX_INFLIGHT_IO 512
+
 //#define DEBUG_BLK_MIGRATION
 
 #ifdef DEBUG_BLK_MIGRATION
 
 typedef struct BlkMigDevState {
     /* Written during setup phase.  Can be read without a lock.  */
-    BlockDriverState *bs;
+    BlockBackend *blk;
+    char *blk_name;
     int shared_base;
     int64_t total_sectors;
     QSIMPLEQ_ENTRY(BlkMigDevState) entry;
+    Error *blocker;
 
     /* Only used by migration thread.  Does not need a lock.  */
     int bulk_completed;
     int64_t cur_sector;
     int64_t cur_dirty;
 
-    /* Protected by block migration lock.  */
+    /* Data in the aio_bitmap is protected by block migration lock.
+     * Allocation and free happen during setup and cleanup respectively.
+     */
     unsigned long *aio_bitmap;
+
+    /* Protected by block migration lock.  */
     int64_t completed_sectors;
+
+    /* During migration this is protected by iothread lock / AioContext.
+     * Allocation and free happen during setup and cleanup respectively.
+     */
     BdrvDirtyBitmap *dirty_bitmap;
-    Error *blocker;
 } BlkMigDevState;
 
 typedef struct BlkMigBlock {
@@ -98,7 +111,7 @@ typedef struct BlkMigState {
     int prev_progress;
     int bulk_completed;
 
-    /* Lock must be taken _inside_ the iothread lock.  */
+    /* Lock must be taken _inside_ the iothread lock and any AioContexts.  */
     QemuMutex lock;
 } BlkMigState;
 
@@ -133,9 +146,9 @@ static void blk_send(QEMUFile *f, BlkMigBlock * blk)
                      | flags);
 
     /* device name */
-    len = strlen(bdrv_get_device_name(blk->bmds->bs));
+    len = strlen(blk->bmds->blk_name);
     qemu_put_byte(f, len);
-    qemu_put_buffer(f, (uint8_t *)bdrv_get_device_name(blk->bmds->bs), len);
+    qemu_put_buffer(f, (uint8_t *) blk->bmds->blk_name, len);
 
     /* if a block is zero we need to flush here since the network
      * bandwidth is now a lot higher than the storage device bandwidth.
@@ -189,7 +202,7 @@ static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
 {
     int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
 
-    if (sector < bdrv_nb_sectors(bmds->bs)) {
+    if (sector < blk_nb_sectors(bmds->blk)) {
         return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
             (1UL << (chunk % (sizeof(unsigned long) * 8))));
     } else {
@@ -223,10 +236,10 @@ static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num,
 
 static void alloc_aio_bitmap(BlkMigDevState *bmds)
 {
-    BlockDriverState *bs = bmds->bs;
+    BlockBackend *bb = bmds->blk;
     int64_t bitmap_size;
 
-    bitmap_size = bdrv_nb_sectors(bs) + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
+    bitmap_size = blk_nb_sectors(bb) + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
     bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
 
     bmds->aio_bitmap = g_malloc0(bitmap_size);
@@ -256,17 +269,21 @@ static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
 {
     int64_t total_sectors = bmds->total_sectors;
     int64_t cur_sector = bmds->cur_sector;
-    BlockDriverState *bs = bmds->bs;
+    BlockBackend *bb = bmds->blk;
     BlkMigBlock *blk;
     int nr_sectors;
 
     if (bmds->shared_base) {
         qemu_mutex_lock_iothread();
+        aio_context_acquire(blk_get_aio_context(bb));
+        /* Skip unallocated sectors; intentionally treats failure as
+         * an allocated sector */
         while (cur_sector < total_sectors &&
-               !bdrv_is_allocated(bs, cur_sector, MAX_IS_ALLOCATED_SEARCH,
-                                  &nr_sectors)) {
+               !bdrv_is_allocated(blk_bs(bb), cur_sector,
+                                  MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
             cur_sector += nr_sectors;
         }
+        aio_context_release(blk_get_aio_context(bb));
         qemu_mutex_unlock_iothread();
     }
 
@@ -300,11 +317,21 @@ static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
     block_mig_state.submitted++;
     blk_mig_unlock();
 
+    /* We do not know if bs is under the main thread (and thus does
+     * not acquire the AioContext when doing AIO) or rather under
+     * dataplane.  Thus acquire both the iothread mutex and the
+     * AioContext.
+     *
+     * This is ugly and will disappear when we make bdrv_* thread-safe,
+     * without the need to acquire the AioContext.
+     */
     qemu_mutex_lock_iothread();
-    blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
-                                nr_sectors, blk_mig_read_cb, blk);
+    aio_context_acquire(blk_get_aio_context(bmds->blk));
+    blk->aiocb = blk_aio_preadv(bb, cur_sector * BDRV_SECTOR_SIZE, &blk->qiov,
+                                0, blk_mig_read_cb, blk);
 
     bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, cur_sector, nr_sectors);
+    aio_context_release(blk_get_aio_context(bmds->blk));
     qemu_mutex_unlock_iothread();
 
     bmds->cur_sector = cur_sector + nr_sectors;
@@ -319,8 +346,10 @@ static int set_dirty_tracking(void)
     int ret;
 
     QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
-        bmds->dirty_bitmap = bdrv_create_dirty_bitmap(bmds->bs, BLOCK_SIZE,
-                                                      NULL, NULL);
+        aio_context_acquire(blk_get_aio_context(bmds->blk));
+        bmds->dirty_bitmap = bdrv_create_dirty_bitmap(blk_bs(bmds->blk),
+                                                      BLOCK_SIZE, NULL, NULL);
+        aio_context_release(blk_get_aio_context(bmds->blk));
         if (!bmds->dirty_bitmap) {
             ret = -errno;
             goto fail;
@@ -331,26 +360,40 @@ static int set_dirty_tracking(void)
 fail:
     QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
         if (bmds->dirty_bitmap) {
-            bdrv_release_dirty_bitmap(bmds->bs, bmds->dirty_bitmap);
+            aio_context_acquire(blk_get_aio_context(bmds->blk));
+            bdrv_release_dirty_bitmap(blk_bs(bmds->blk), bmds->dirty_bitmap);
+            aio_context_release(blk_get_aio_context(bmds->blk));
         }
     }
     return ret;
 }
 
+/* Called with iothread lock taken.  */
+
 static void unset_dirty_tracking(void)
 {
     BlkMigDevState *bmds;
 
     QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
-        bdrv_release_dirty_bitmap(bmds->bs, bmds->dirty_bitmap);
+        aio_context_acquire(blk_get_aio_context(bmds->blk));
+        bdrv_release_dirty_bitmap(blk_bs(bmds->blk), bmds->dirty_bitmap);
+        aio_context_release(blk_get_aio_context(bmds->blk));
     }
 }
 
-static void init_blk_migration(QEMUFile *f)
+static int init_blk_migration(QEMUFile *f)
 {
     BlockDriverState *bs;
     BlkMigDevState *bmds;
     int64_t sectors;
+    BdrvNextIterator it;
+    int i, num_bs = 0;
+    struct {
+        BlkMigDevState *bmds;
+        BlockDriverState *bs;
+    } *bmds_bs;
+    Error *local_err = NULL;
+    int ret;
 
     block_mig_state.submitted = 0;
     block_mig_state.read_done = 0;
@@ -360,26 +403,33 @@ static void init_blk_migration(QEMUFile *f)
     block_mig_state.bulk_completed = 0;
     block_mig_state.zero_blocks = migrate_zero_blocks();
 
-    for (bs = bdrv_next(NULL); bs; bs = bdrv_next(bs)) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+        num_bs++;
+    }
+    bmds_bs = g_malloc0(num_bs * sizeof(*bmds_bs));
+
+    for (i = 0, bs = bdrv_first(&it); bs; bs = bdrv_next(&it), i++) {
         if (bdrv_is_read_only(bs)) {
             continue;
         }
 
         sectors = bdrv_nb_sectors(bs);
         if (sectors <= 0) {
-            return;
+            ret = sectors;
+            goto out;
         }
 
         bmds = g_new0(BlkMigDevState, 1);
-        bmds->bs = bs;
+        bmds->blk = blk_new(BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL);
+        bmds->blk_name = g_strdup(bdrv_get_device_name(bs));
         bmds->bulk_completed = 0;
         bmds->total_sectors = sectors;
         bmds->completed_sectors = 0;
         bmds->shared_base = block_mig_state.shared_base;
-        alloc_aio_bitmap(bmds);
-        error_setg(&bmds->blocker, "block device is in use by migration");
-        bdrv_op_block_all(bs, bmds->blocker);
-        bdrv_ref(bs);
+
+        assert(i < num_bs);
+        bmds_bs[i].bmds = bmds;
+        bmds_bs[i].bs = bs;
 
         block_mig_state.total_sector_sum += sectors;
 
@@ -392,6 +442,30 @@ static void init_blk_migration(QEMUFile *f)
 
         QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
     }
+
+    /* Can only insert new BDSes now because doing so while iterating block
+     * devices may end up in a deadlock (iterating the new BDSes, too). */
+    for (i = 0; i < num_bs; i++) {
+        BlkMigDevState *bmds = bmds_bs[i].bmds;
+        BlockDriverState *bs = bmds_bs[i].bs;
+
+        if (bmds) {
+            ret = blk_insert_bs(bmds->blk, bs, &local_err);
+            if (ret < 0) {
+                error_report_err(local_err);
+                goto out;
+            }
+
+            alloc_aio_bitmap(bmds);
+            error_setg(&bmds->blocker, "block device is in use by migration");
+            bdrv_op_block_all(bs, bmds->blocker);
+        }
+    }
+
+    ret = 0;
+out:
+    g_free(bmds_bs);
+    return ret;
 }
 
 /* Called with no lock taken.  */
@@ -442,12 +516,13 @@ static void blk_mig_reset_dirty_cursor(void)
     }
 }
 
-/* Called with iothread lock taken.  */
+/* Called with iothread lock and AioContext taken.  */
 
 static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,
                                  int is_async)
 {
     BlkMigBlock *blk;
+    BlockDriverState *bs = blk_bs(bmds->blk);
     int64_t total_sectors = bmds->total_sectors;
     int64_t sector;
     int nr_sectors;
@@ -457,11 +532,11 @@ static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,
         blk_mig_lock();
         if (bmds_aio_inflight(bmds, sector)) {
             blk_mig_unlock();
-            bdrv_drain(bmds->bs);
+            blk_drain(bmds->blk);
         } else {
             blk_mig_unlock();
         }
-        if (bdrv_get_dirty(bmds->bs, bmds->dirty_bitmap, sector)) {
+        if (bdrv_get_dirty(bs, bmds->dirty_bitmap, sector)) {
 
             if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                 nr_sectors = total_sectors - sector;
@@ -479,15 +554,18 @@ static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,
                 blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
                 qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
 
-                blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov,
-                                            nr_sectors, blk_mig_read_cb, blk);
+                blk->aiocb = blk_aio_preadv(bmds->blk,
+                                            sector * BDRV_SECTOR_SIZE,
+                                            &blk->qiov, 0, blk_mig_read_cb,
+                                            blk);
 
                 blk_mig_lock();
                 block_mig_state.submitted++;
                 bmds_set_aio_inflight(bmds, sector, nr_sectors, 1);
                 blk_mig_unlock();
             } else {
-                ret = bdrv_read(bmds->bs, sector, blk->buf, nr_sectors);
+                ret = blk_pread(bmds->blk, sector * BDRV_SECTOR_SIZE, blk->buf,
+                                nr_sectors * BDRV_SECTOR_SIZE);
                 if (ret < 0) {
                     goto error;
                 }
@@ -498,6 +576,9 @@ static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,
             }
 
             bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, sector, nr_sectors);
+            sector += nr_sectors;
+            bmds->cur_dirty = sector;
+
             break;
         }
         sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
@@ -525,7 +606,9 @@ static int blk_mig_save_dirty_block(QEMUFile *f, int is_async)
     int ret = 1;
 
     QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
+        aio_context_acquire(blk_get_aio_context(bmds->blk));
         ret = mig_save_device_dirty(f, bmds, is_async);
+        aio_context_release(blk_get_aio_context(bmds->blk));
         if (ret <= 0) {
             break;
         }
@@ -583,7 +666,9 @@ static int64_t get_remaining_dirty(void)
     int64_t dirty = 0;
 
     QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
+        aio_context_acquire(blk_get_aio_context(bmds->blk));
         dirty += bdrv_get_dirty_count(bmds->dirty_bitmap);
+        aio_context_release(blk_get_aio_context(bmds->blk));
     }
 
     return dirty << BDRV_SECTOR_BITS;
@@ -591,25 +676,33 @@ static int64_t get_remaining_dirty(void)
 
 /* Called with iothread lock taken.  */
 
-static void blk_mig_cleanup(void)
+static void block_migration_cleanup(void *opaque)
 {
     BlkMigDevState *bmds;
     BlkMigBlock *blk;
+    AioContext *ctx;
 
     bdrv_drain_all();
 
     unset_dirty_tracking();
 
-    blk_mig_lock();
     while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
         QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
-        bdrv_op_unblock_all(bmds->bs, bmds->blocker);
+        bdrv_op_unblock_all(blk_bs(bmds->blk), bmds->blocker);
         error_free(bmds->blocker);
-        bdrv_unref(bmds->bs);
+
+        /* Save ctx, because bmds->blk can disappear during blk_unref.  */
+        ctx = blk_get_aio_context(bmds->blk);
+        aio_context_acquire(ctx);
+        blk_unref(bmds->blk);
+        aio_context_release(ctx);
+
+        g_free(bmds->blk_name);
         g_free(bmds->aio_bitmap);
         g_free(bmds);
     }
 
+    blk_mig_lock();
     while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
         QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
         g_free(blk->buf);
@@ -618,11 +711,6 @@ static void blk_mig_cleanup(void)
     blk_mig_unlock();
 }
 
-static void block_migration_cancel(void *opaque)
-{
-    blk_mig_cleanup();
-}
-
 static int block_save_setup(QEMUFile *f, void *opaque)
 {
     int ret;
@@ -631,18 +719,21 @@ static int block_save_setup(QEMUFile *f, void *opaque)
             block_mig_state.submitted, block_mig_state.transferred);
 
     qemu_mutex_lock_iothread();
-    init_blk_migration(f);
+    ret = init_blk_migration(f);
+    if (ret < 0) {
+        qemu_mutex_unlock_iothread();
+        return ret;
+    }
 
     /* start track dirty blocks */
     ret = set_dirty_tracking();
 
+    qemu_mutex_unlock_iothread();
+
     if (ret) {
-        qemu_mutex_unlock_iothread();
         return ret;
     }
 
-    qemu_mutex_unlock_iothread();
-
     ret = flush_blks(f);
     blk_mig_reset_dirty_cursor();
     qemu_put_be64(f, BLK_MIG_FLAG_EOS);
@@ -670,7 +761,10 @@ static int block_save_iterate(QEMUFile *f, void *opaque)
     blk_mig_lock();
     while ((block_mig_state.submitted +
             block_mig_state.read_done) * BLOCK_SIZE <
-           qemu_file_get_rate_limit(f)) {
+           qemu_file_get_rate_limit(f) &&
+           (block_mig_state.submitted +
+            block_mig_state.read_done) <
+           MAX_INFLIGHT_IO) {
         blk_mig_unlock();
         if (block_mig_state.bulk_completed == 0) {
             /* first finish the bulk phase */
@@ -750,30 +844,33 @@ static int block_save_complete(QEMUFile *f, void *opaque)
 
     qemu_put_be64(f, BLK_MIG_FLAG_EOS);
 
-    blk_mig_cleanup();
     return 0;
 }
 
-static uint64_t block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
+static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
+                               uint64_t *non_postcopiable_pending,
+                               uint64_t *postcopiable_pending)
 {
     /* Estimate pending number of bytes to send */
     uint64_t pending;
 
     qemu_mutex_lock_iothread();
+    pending = get_remaining_dirty();
+    qemu_mutex_unlock_iothread();
+
     blk_mig_lock();
-    pending = get_remaining_dirty() +
-                       block_mig_state.submitted * BLOCK_SIZE +
-                       block_mig_state.read_done * BLOCK_SIZE;
+    pending += block_mig_state.submitted * BLOCK_SIZE +
+               block_mig_state.read_done * BLOCK_SIZE;
+    blk_mig_unlock();
 
     /* Report at least one block pending during bulk phase */
     if (pending <= max_size && !block_mig_state.bulk_completed) {
         pending = max_size + BLOCK_SIZE;
     }
-    blk_mig_unlock();
-    qemu_mutex_unlock_iothread();
 
     DPRINTF("Enter save live pending  %" PRIu64 "\n", pending);
-    return pending;
+    /* We don't do postcopy */
+    *non_postcopiable_pending += pending;
 }
 
 static int block_load(QEMUFile *f, void *opaque, int version_id)
@@ -782,8 +879,8 @@ static int block_load(QEMUFile *f, void *opaque, int version_id)
     int len, flags;
     char device_name[256];
     int64_t addr;
-    BlockDriverState *bs, *bs_prev = NULL;
-    BlockBackend *blk;
+    BlockBackend *blk, *blk_prev = NULL;;
+    Error *local_err = NULL;
     uint8_t *buf;
     int64_t total_sectors = 0;
     int nr_sectors;
@@ -807,16 +904,21 @@ static int block_load(QEMUFile *f, void *opaque, int version_id)
                         device_name);
                 return -EINVAL;
             }
-            bs = blk_bs(blk);
 
-            if (bs != bs_prev) {
-                bs_prev = bs;
-                total_sectors = bdrv_nb_sectors(bs);
+            if (blk != blk_prev) {
+                blk_prev = blk;
+                total_sectors = blk_nb_sectors(blk);
                 if (total_sectors <= 0) {
                     error_report("Error getting length of block device %s",
                                  device_name);
                     return -EINVAL;
                 }
+
+                blk_invalidate_cache(blk, &local_err);
+                if (local_err) {
+                    error_report_err(local_err);
+                    return -EINVAL;
+                }
             }
 
             if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
@@ -826,12 +928,14 @@ static int block_load(QEMUFile *f, void *opaque, int version_id)
             }
 
             if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
-                ret = bdrv_write_zeroes(bs, addr, nr_sectors,
+                ret = blk_pwrite_zeroes(blk, addr * BDRV_SECTOR_SIZE,
+                                        nr_sectors * BDRV_SECTOR_SIZE,
                                         BDRV_REQ_MAY_UNMAP);
             } else {
                 buf = g_malloc(BLOCK_SIZE);
                 qemu_get_buffer(f, buf, BLOCK_SIZE);
-                ret = bdrv_write(bs, addr, buf, nr_sectors);
+                ret = blk_pwrite(blk, addr * BDRV_SECTOR_SIZE, buf,
+                                 nr_sectors * BDRV_SECTOR_SIZE, 0);
                 g_free(buf);
             }
 
@@ -877,10 +981,10 @@ static SaveVMHandlers savevm_block_handlers = {
     .set_params = block_set_params,
     .save_live_setup = block_save_setup,
     .save_live_iterate = block_save_iterate,
-    .save_live_complete = block_save_complete,
+    .save_live_complete_precopy = block_save_complete,
     .save_live_pending = block_save_pending,
     .load_state = block_load,
-    .cancel = block_migration_cancel,
+    .cleanup = block_migration_cleanup,
     .is_active = block_is_active,
 };