migration: Report the error returned when save_live_iterate fails

[mirror_qemu.git] / migration / migration.c
diff --git a/migration/migration.c b/migration/migration.c

index 8ca034136bc11cedd8132e5fb9138f50f5442e94..3849b33108031da8bcb9260c10082f7be471d39d 100644 (file)
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -59,10 +59,8 @@
  #include "multifd.h"
  #include "qemu/yank.h"
  #include "sysemu/cpus.h"
-
-#ifdef CONFIG_VFIO
-#include "hw/vfio/vfio-common.h"
-#endif
+#include "yank_functions.h"
+#include "sysemu/qtest.h"
  
  #define MAX_THROTTLE  (128 << 20)      /* Migration transfer speed throttling */
  
@@ -191,8 +189,6 @@ static gint page_request_addr_cmp(gconstpointer ap, gconstpointer bp)
  
  void migration_object_init(void)
  {
-    Error *err = NULL;
-
      /* This can only be called once. */
      assert(!current_migration);
      current_migration = MIGRATION_OBJ(object_new(TYPE_MIGRATION));
@@ -213,23 +209,34 @@ void migration_object_init(void)
      qemu_mutex_init(&current_incoming->page_request_mutex);
      current_incoming->page_requested = g_tree_new(page_request_addr_cmp);
  
-    if (!migration_object_check(current_migration, &err)) {
-        error_report_err(err);
-        exit(1);
-    }
+    migration_object_check(current_migration, &error_fatal);
  
      blk_mig_init();
      ram_mig_init();
      dirty_bitmap_mig_init();
  }
  
+void migration_cancel(const Error *error)
+{
+    if (error) {
+        migrate_set_error(current_migration, error);
+    }
+    migrate_fd_cancel(current_migration);
+}
+
  void migration_shutdown(void)
  {
+    /*
+     * When the QEMU main thread exits, the COLO thread
+     * may be waiting on a semaphore. So, we should wake up
+     * the COLO thread before migration shutdown.
+     */
+    colo_shutdown();
      /*
       * Cancel the current migration - that will (eventually)
       * stop the migration using this structure
       */
-    migrate_fd_cancel(current_migration);
+    migration_cancel(NULL);
      object_unref(OBJECT(current_migration));
  
      /*
@@ -272,6 +279,7 @@ void migration_incoming_state_destroy(void)
      }
  
      if (mis->from_src_file) {
+        migration_ioc_unregister_yank_from_file(mis->from_src_file);
          qemu_fclose(mis->from_src_file);
          mis->from_src_file = NULL;
      }
@@ -279,6 +287,9 @@ void migration_incoming_state_destroy(void)
          g_array_free(mis->postcopy_remote_fds, TRUE);
          mis->postcopy_remote_fds = NULL;
      }
+    if (mis->transport_cleanup) {
+        mis->transport_cleanup(mis->transport_data);
+    }
  
      qemu_event_reset(&mis->main_thread_load_event);
  
@@ -390,7 +401,7 @@ int migrate_send_rp_message_req_pages(MigrationIncomingState *mis,
  int migrate_send_rp_req_pages(MigrationIncomingState *mis,
                                RAMBlock *rb, ram_addr_t start, uint64_t haddr)
  {
-    void *aligned = (void *)(uintptr_t)(haddr & (-qemu_ram_pagesize(rb)));
+    void *aligned = (void *)(uintptr_t)ROUND_DOWN(haddr, qemu_ram_pagesize(rb));
      bool received = false;
  
      WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
@@ -452,14 +463,12 @@ static void qemu_start_incoming_migration(const char *uri, Error **errp)
  {
      const char *p = NULL;
  
-    if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
-        return;
-    }
-
+    migrate_protocol_allow_multifd(false); /* reset it anyway */
      qapi_event_send_migration(MIGRATION_STATUS_SETUP);
      if (strstart(uri, "tcp:", &p) ||
          strstart(uri, "unix:", NULL) ||
          strstart(uri, "vsock:", NULL)) {
+        migrate_protocol_allow_multifd(true);
          socket_start_incoming_migration(p ? p : uri, errp);
  #ifdef CONFIG_RDMA
      } else if (strstart(uri, "rdma:", &p)) {
@@ -470,7 +479,6 @@ static void qemu_start_incoming_migration(const char *uri, Error **errp)
      } else if (strstart(uri, "fd:", &p)) {
          fd_start_incoming_migration(p, errp);
      } else {
-        yank_unregister_instance(MIGRATION_YANK_INSTANCE);
          error_setg(errp, "unknown migration protocol: %s", uri);
      }
  }
@@ -589,8 +597,10 @@ static void process_incoming_migration_co(void *opaque)
          mis->have_colo_incoming_thread = true;
          qemu_coroutine_yield();
  
+        qemu_mutex_unlock_iothread();
          /* Wait checkpoint incoming thread exit before free resource */
          qemu_thread_join(&mis->colo_incoming_thread);
+        qemu_mutex_lock_iothread();
          /* We hold the global iothread lock, so it is safe here */
          colo_release_ram_cache();
      }
@@ -615,30 +625,25 @@ fail:
  }
  
  /**
- * @migration_incoming_setup: Setup incoming migration
- *
- * Returns 0 for no error or 1 for error
- *
+ * migration_incoming_setup: Setup incoming migration
   * @f: file for main migration channel
   * @errp: where to put errors
+ *
+ * Returns: %true on success, %false on error.
   */
-static int migration_incoming_setup(QEMUFile *f, Error **errp)
+static bool migration_incoming_setup(QEMUFile *f, Error **errp)
  {
      MigrationIncomingState *mis = migration_incoming_get_current();
-    Error *local_err = NULL;
  
-    if (multifd_load_setup(&local_err) != 0) {
-        /* We haven't been able to create multifd threads
-           nothing better to do */
-        error_report_err(local_err);
-        exit(EXIT_FAILURE);
+    if (multifd_load_setup(errp) != 0) {
+        return false;
      }
  
      if (!mis->from_src_file) {
          mis->from_src_file = f;
      }
      qemu_file_set_blocking(f, false);
-    return 0;
+    return true;
  }
  
  void migration_incoming_process(void)
@@ -681,14 +686,11 @@ static bool postcopy_try_recover(QEMUFile *f)
  
  void migration_fd_process_incoming(QEMUFile *f, Error **errp)
  {
-    Error *local_err = NULL;
-
      if (postcopy_try_recover(f)) {
          return;
      }
  
-    if (migration_incoming_setup(f, &local_err)) {
-        error_propagate(errp, local_err);
+    if (!migration_incoming_setup(f, errp)) {
          return;
      }
      migration_incoming_process();
@@ -709,8 +711,7 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
              return;
          }
  
-        if (migration_incoming_setup(f, &local_err)) {
-            error_propagate(errp, local_err);
+        if (!migration_incoming_setup(f, errp)) {
              return;
          }
  
@@ -996,6 +997,8 @@ static void populate_time_info(MigrationInfo *info, MigrationState *s)
  
  static void populate_ram_info(MigrationInfo *info, MigrationState *s)
  {
+    size_t page_size = qemu_target_page_size();
+
      info->has_ram = true;
      info->ram = g_malloc0(sizeof(*info->ram));
      info->ram->transferred = ram_counters.transferred;
@@ -1004,12 +1007,11 @@ static void populate_ram_info(MigrationInfo *info, MigrationState *s)
      /* legacy value.  It is not used anymore */
      info->ram->skipped = 0;
      info->ram->normal = ram_counters.normal;
-    info->ram->normal_bytes = ram_counters.normal *
-        qemu_target_page_size();
+    info->ram->normal_bytes = ram_counters.normal * page_size;
      info->ram->mbps = s->mbps;
      info->ram->dirty_sync_count = ram_counters.dirty_sync_count;
      info->ram->postcopy_requests = ram_counters.postcopy_requests;
-    info->ram->page_size = qemu_target_page_size();
+    info->ram->page_size = page_size;
      info->ram->multifd_bytes = ram_counters.multifd_bytes;
      info->ram->pages_per_second = s->pages_per_second;
  
@@ -1059,41 +1061,27 @@ static void populate_disk_info(MigrationInfo *info)
      }
  }
  
-static void populate_vfio_info(MigrationInfo *info)
-{
-#ifdef CONFIG_VFIO
-    if (vfio_mig_active()) {
-        info->has_vfio = true;
-        info->vfio = g_malloc0(sizeof(*info->vfio));
-        info->vfio->transferred = vfio_mig_bytes_transferred();
-    }
-#endif
-}
-
  static void fill_source_migration_info(MigrationInfo *info)
  {
      MigrationState *s = migrate_get_current();
+    GSList *cur_blocker = migration_blockers;
  
-    info->blocked = migration_is_blocked(NULL);
-    info->has_blocked_reasons = info->blocked;
      info->blocked_reasons = NULL;
-    if (info->blocked) {
-        GSList *cur_blocker = migration_blockers;
  
-        /*
-         * There are two types of reasons a migration might be blocked;
-         * a) devices marked in VMState as non-migratable, and
-         * b) Explicit migration blockers
-         * We need to add both of them here.
-         */
-        qemu_savevm_non_migratable_list(&info->blocked_reasons);
+    /*
+     * There are two types of reasons a migration might be blocked;
+     * a) devices marked in VMState as non-migratable, and
+     * b) Explicit migration blockers
+     * We need to add both of them here.
+     */
+    qemu_savevm_non_migratable_list(&info->blocked_reasons);
  
-        while (cur_blocker) {
-            QAPI_LIST_PREPEND(info->blocked_reasons,
-                              g_strdup(error_get_pretty(cur_blocker->data)));
-            cur_blocker = g_slist_next(cur_blocker);
-        }
+    while (cur_blocker) {
+        QAPI_LIST_PREPEND(info->blocked_reasons,
+                          g_strdup(error_get_pretty(cur_blocker->data)));
+        cur_blocker = g_slist_next(cur_blocker);
      }
+    info->has_blocked_reasons = info->blocked_reasons != NULL;
  
      switch (s->state) {
      case MIGRATION_STATUS_NONE:
@@ -1262,6 +1250,14 @@ static bool migrate_caps_check(bool *cap_list,
          }
      }
  
+    /* incoming side only */
+    if (runstate_check(RUN_STATE_INMIGRATE) &&
+        !migrate_multifd_is_allowed() &&
+        cap_list[MIGRATION_CAPABILITY_MULTIFD]) {
+        error_setg(errp, "multifd is not supported by current protocol");
+        return false;
+    }
+
      return true;
  }
  
@@ -1826,6 +1822,7 @@ static void migrate_fd_cleanup(MigrationState *s)
           * Close the file handle without the lock to make sure the
           * critical section won't block for long.
           */
+        migration_ioc_unregister_yank_from_file(tmp);
          qemu_fclose(tmp);
      }
  
@@ -1870,6 +1867,15 @@ void migrate_set_error(MigrationState *s, const Error *error)
      }
  }
  
+static void migrate_error_free(MigrationState *s)
+{
+    QEMU_LOCK_GUARD(&s->error_mutex);
+    if (s->error) {
+        error_free(s->error);
+        s->error = NULL;
+    }
+}
+
  void migrate_fd_error(MigrationState *s, const Error *error)
  {
      trace_migrate_fd_error(error_get_pretty(error));
@@ -1885,9 +1891,11 @@ static void migrate_fd_cancel(MigrationState *s)
      QEMUFile *f = migrate_get_current()->to_dst_file;
      trace_migrate_fd_cancel();
  
-    if (s->rp_state.from_dst_file) {
-        /* shutdown the rp socket, so causing the rp thread to shutdown */
-        qemu_file_shutdown(s->rp_state.from_dst_file);
+    WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
+        if (s->rp_state.from_dst_file) {
+            /* shutdown the rp socket, so causing the rp thread to shutdown */
+            qemu_file_shutdown(s->rp_state.from_dst_file);
+        }
      }
  
      do {
@@ -2054,6 +2062,20 @@ void migrate_init(MigrationState *s)
      s->threshold_size = 0;
  }
  
+int migrate_add_blocker_internal(Error *reason, Error **errp)
+{
+    /* Snapshots are similar to migrations, so check RUN_STATE_SAVE_VM too. */
+    if (runstate_check(RUN_STATE_SAVE_VM) || !migration_is_idle()) {
+        error_propagate_prepend(errp, error_copy(reason),
+                                "disallowing migration blocker "
+                                "(migration/snapshot in progress) for: ");
+        return -EBUSY;
+    }
+
+    migration_blockers = g_slist_prepend(migration_blockers, reason);
+    return 0;
+}
+
  int migrate_add_blocker(Error *reason, Error **errp)
  {
      if (only_migratable) {
@@ -2063,15 +2085,7 @@ int migrate_add_blocker(Error *reason, Error **errp)
          return -EACCES;
      }
  
-    if (migration_is_idle()) {
-        migration_blockers = g_slist_prepend(migration_blockers, reason);
-        return 0;
-    }
-
-    error_propagate_prepend(errp, error_copy(reason),
-                            "disallowing migration blocker "
-                            "(migration in progress) for: ");
-    return -EBUSY;
+    return migrate_add_blocker_internal(reason, errp);
  }
  
  void migrate_del_blocker(Error *reason)
@@ -2093,9 +2107,14 @@ void qmp_migrate_incoming(const char *uri, Error **errp)
          return;
      }
  
+    if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
+        return;
+    }
+
      qemu_start_incoming_migration(uri, &local_err);
  
      if (local_err) {
+        yank_unregister_instance(MIGRATION_YANK_INSTANCE);
          error_propagate(errp, local_err);
          return;
      }
@@ -2107,6 +2126,13 @@ void qmp_migrate_recover(const char *uri, Error **errp)
  {
      MigrationIncomingState *mis = migration_incoming_get_current();
  
+    /*
+     * Don't even bother to use ERRP_GUARD() as errp _must_ always be set by
+     * callers (no one should ignore a recover failure); if it is not, it's a
+     * programming error.
+     */
+    assert(errp);
+
      if (mis->state != MIGRATION_STATUS_POSTCOPY_PAUSED) {
          error_setg(errp, "Migrate recover can only be run "
                     "when postcopy is paused.");
@@ -2124,8 +2150,13 @@ void qmp_migrate_recover(const char *uri, Error **errp)
       * only re-setup the migration stream and poke existing migration
       * to continue using that newly established channel.
       */
-    yank_unregister_instance(MIGRATION_YANK_INSTANCE);
      qemu_start_incoming_migration(uri, errp);
+
+    /* Safe to dereference with the assert above */
+    if (*errp) {
+        /* Reset the flag so user could still retry */
+        qatomic_set(&mis->postcopy_recover_triggered, false);
+    }
  }
  
  void qmp_migrate_pause(Error **errp)
@@ -2227,6 +2258,10 @@ static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc,
      }
  
      if (blk || blk_inc) {
+        if (migrate_colo_enabled()) {
+            error_setg(errp, "No disk migration is required in COLO mode");
+            return false;
+        }
          if (migrate_use_block() || migrate_use_block_incremental()) {
              error_setg(errp, "Command options are incompatible with "
                         "current migration capabilities");
@@ -2246,10 +2281,11 @@ static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc,
  
      migrate_init(s);
      /*
-     * set ram_counters memory to zero for a
+     * set ram_counters and compression_counters memory to zero for a
       * new migration
       */
      memset(&ram_counters, 0, sizeof(ram_counters));
+    memset(&compression_counters, 0, sizeof(compression_counters));
  
      return true;
  }
@@ -2274,9 +2310,11 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
          }
      }
  
+    migrate_protocol_allow_multifd(false);
      if (strstart(uri, "tcp:", &p) ||
          strstart(uri, "unix:", NULL) ||
          strstart(uri, "vsock:", NULL)) {
+        migrate_protocol_allow_multifd(true);
          socket_start_outgoing_migration(s, p ? p : uri, &local_err);
  #ifdef CONFIG_RDMA
      } else if (strstart(uri, "rdma:", &p)) {
@@ -2310,7 +2348,7 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
  
  void qmp_migrate_cancel(Error **errp)
  {
-    migrate_fd_cancel(migrate_get_current());
+    migration_cancel(NULL);
  }
  
  void qmp_migrate_continue(MigrationStatus state, Error **errp)
@@ -2613,8 +2651,8 @@ static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname,
       * Since we currently insist on matching page sizes, just sanity check
       * we're being asked for whole host pages.
       */
-    if (start & (our_host_ps - 1) ||
-       (len & (our_host_ps - 1))) {
+    if (!QEMU_IS_ALIGNED(start, our_host_ps) ||
+        !QEMU_IS_ALIGNED(len, our_host_ps)) {
          error_report("%s: Misaligned page request, start: " RAM_ADDR_FMT
                       " len: %zd", __func__, start, len);
          mark_source_rp_bad(ms);
@@ -2671,6 +2709,23 @@ static int migrate_handle_rp_resume_ack(MigrationState *s, uint32_t value)
      return 0;
  }
  
+/* Release ms->rp_state.from_dst_file in a safe way */
+static void migration_release_from_dst_file(MigrationState *ms)
+{
+    QEMUFile *file;
+
+    WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
+        /*
+         * Reset the from_dst_file pointer first before releasing it, as we
+         * can't block within lock section
+         */
+        file = ms->rp_state.from_dst_file;
+        ms->rp_state.from_dst_file = NULL;
+    }
+
+    qemu_fclose(file);
+}
+
  /*
   * Handles messages sent on the return path towards the source VM
   *
@@ -2812,12 +2867,14 @@ out:
               * Maybe there is something we can do: it looks like a
               * network down issue, and we pause for a recovery.
               */
+            migration_release_from_dst_file(ms);
+            rp = NULL;
              if (postcopy_pause_return_path_thread(ms)) {
-                /* Reload rp, reset the rest */
-                if (rp != ms->rp_state.from_dst_file) {
-                    qemu_fclose(rp);
-                    rp = ms->rp_state.from_dst_file;
-                }
+                /*
+                 * Reload rp, reset the rest.  Referencing it is safe since
+                 * it's reset only by us above, or when migration completes
+                 */
+                rp = ms->rp_state.from_dst_file;
                  ms->rp_state.error = false;
                  goto retry;
              }
@@ -2828,8 +2885,7 @@ out:
      }
  
      trace_source_return_path_thread_end();
-    ms->rp_state.from_dst_file = NULL;
-    qemu_fclose(rp);
+    migration_release_from_dst_file(ms);
      rcu_unregister_thread();
      return NULL;
  }
@@ -2837,7 +2893,6 @@ out:
  static int open_return_path_on_source(MigrationState *ms,
                                        bool create_thread)
  {
-
      ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->to_dst_file);
      if (!ms->rp_state.from_dst_file) {
          return -1;
@@ -2852,6 +2907,7 @@ static int open_return_path_on_source(MigrationState *ms,
  
      qemu_thread_create(&ms->rp_state.rp_thread, "return path",
                         source_return_path_thread, ms, QEMU_THREAD_JOINABLE);
+    ms->rp_state.rp_thread_created = true;
  
      trace_open_return_path_on_source_continue();
  
@@ -2876,6 +2932,7 @@ static int await_return_path_close_on_source(MigrationState *ms)
      }
      trace_await_return_path_close_on_source_joining();
      qemu_thread_join(&ms->rp_state.rp_thread);
+    ms->rp_state.rp_thread_created = false;
      trace_await_return_path_close_on_source_close();
      return ms->rp_state.error;
  }
@@ -3121,6 +3178,7 @@ static void migration_completion(MigrationState *s)
          if (!ret) {
              bool inactivate = !migrate_colo_enabled();
              ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+            trace_migration_completion_vm_stop(ret);
              if (ret >= 0) {
                  ret = migration_maybe_pause(s, &current_active_state,
                                              MIGRATION_STATUS_DEVICE);
@@ -3142,9 +3200,12 @@ static void migration_completion(MigrationState *s)
      } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
          trace_migration_completion_postcopy_end();
  
+        qemu_mutex_lock_iothread();
          qemu_savevm_state_complete_postcopy(s->to_dst_file);
+        qemu_mutex_unlock_iothread();
+
          trace_migration_completion_postcopy_end_after_complete();
-    } else if (s->state == MIGRATION_STATUS_CANCELLING) {
+    } else {
          goto fail;
      }
  
@@ -3154,7 +3215,7 @@ static void migration_completion(MigrationState *s)
       * it will wait for the destination to send it's status in
       * a SHUT command).
       */
-    if (s->rp_state.from_dst_file) {
+    if (s->rp_state.rp_thread_created) {
          int rp_error;
          trace_migration_return_path_end_before();
          rp_error = await_return_path_close_on_source(s);
@@ -3169,7 +3230,11 @@ static void migration_completion(MigrationState *s)
          goto fail_invalidate;
      }
  
-    if (!migrate_colo_enabled()) {
+    if (migrate_colo_enabled() && s->state == MIGRATION_STATUS_ACTIVE) {
+        /* COLO does not support postcopy */
+        migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
+                          MIGRATION_STATUS_COLO);
+    } else {
          migrate_set_state(&s->state, current_active_state,
                            MIGRATION_STATUS_COMPLETED);
      }
@@ -3314,8 +3379,17 @@ static MigThrError postcopy_pause(MigrationState *s)
      while (true) {
          QEMUFile *file;
  
-        /* Current channel is possibly broken. Release it. */
+        /*
+         * Current channel is possibly broken. Release it.  Note that this is
+         * guaranteed even without lock because to_dst_file should only be
+         * modified by the migration thread.  That also guarantees that the
+         * unregister of yank is safe too without the lock.  It should be safe
+         * even to be within the qemu_file_lock, but we didn't do that to avoid
+         * taking more mutex (yank_lock) within qemu_file_lock.  TL;DR: we make
+         * the qemu_file_lock critical section as small as possible.
+         */
          assert(s->to_dst_file);
+        migration_ioc_unregister_yank_from_file(s->to_dst_file);
          qemu_mutex_lock(&s->qemu_file_lock);
          file = s->to_dst_file;
          s->to_dst_file = NULL;
@@ -3545,28 +3619,21 @@ static void migration_iteration_finish(MigrationState *s)
          migration_calculate_complete(s);
          runstate_set(RUN_STATE_POSTMIGRATE);
          break;
-
-    case MIGRATION_STATUS_ACTIVE:
-        /*
-         * We should really assert here, but since it's during
-         * migration, let's try to reduce the usage of assertions.
-         */
+    case MIGRATION_STATUS_COLO:
          if (!migrate_colo_enabled()) {
              error_report("%s: critical error: calling COLO code without "
                           "COLO enabled", __func__);
          }
          migrate_start_colo_process(s);
-        /*
-         * Fixme: we will run VM in COLO no matter its old running state.
-         * After exited COLO, we will keep running.
-         */
          s->vm_was_running = true;
          /* Fallthrough */
      case MIGRATION_STATUS_FAILED:
      case MIGRATION_STATUS_CANCELLED:
      case MIGRATION_STATUS_CANCELLING:
          if (s->vm_was_running) {
-            vm_start();
+            if (!runstate_check(RUN_STATE_SHUTDOWN)) {
+                vm_start();
+            }
          } else {
              if (runstate_check(RUN_STATE_FINISH_MIGRATE)) {
                  runstate_set(RUN_STATE_POSTMIGRATE);
@@ -3669,6 +3736,44 @@ bool migration_rate_limit(void)
      return urgent;
  }
  
+/*
+ * If failover devices are present, wait until they are completely
+ * unplugged.
+ */
+
+static void qemu_savevm_wait_unplug(MigrationState *s, int old_state,
+                                    int new_state)
+{
+    if (qemu_savevm_state_guest_unplug_pending()) {
+        migrate_set_state(&s->state, old_state, MIGRATION_STATUS_WAIT_UNPLUG);
+
+        while (s->state == MIGRATION_STATUS_WAIT_UNPLUG &&
+               qemu_savevm_state_guest_unplug_pending()) {
+            qemu_sem_timedwait(&s->wait_unplug_sem, 250);
+        }
+        if (s->state != MIGRATION_STATUS_WAIT_UNPLUG) {
+            int timeout = 120; /* 30 seconds */
+            /*
+             * Migration has been canceled, but as we have started
+             * an unplug we must wait for it to finish to be able
+             * to plug the card back in.
+             */
+            while (timeout-- && qemu_savevm_state_guest_unplug_pending()) {
+                qemu_sem_timedwait(&s->wait_unplug_sem, 250);
+            }
+            if (qemu_savevm_state_guest_unplug_pending() &&
+                !qtest_enabled()) {
+                warn_report("migration: partially unplugged device on "
+                            "failure");
+            }
+        }
+
+        migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG, new_state);
+    } else {
+        migrate_set_state(&s->state, old_state, new_state);
+    }
+}
+
  /*
   * Master migration thread on the source VM.
   * It drives the migration and pumps the data down the outgoing channel.
@@ -3691,7 +3796,7 @@ static void *migration_thread(void *opaque)
       * If we opened the return path, we need to make sure dst has it
       * opened as well.
       */
-    if (s->rp_state.from_dst_file) {
+    if (s->rp_state.rp_thread_created) {
          /* Now tell the dest that it should open its end so it can reply */
          qemu_savevm_send_open_return_path(s->to_dst_file);
  
@@ -3715,22 +3820,10 @@ static void *migration_thread(void *opaque)
  
      qemu_savevm_state_setup(s->to_dst_file);
  
-    if (qemu_savevm_state_guest_unplug_pending()) {
-        migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
-                          MIGRATION_STATUS_WAIT_UNPLUG);
-
-        while (s->state == MIGRATION_STATUS_WAIT_UNPLUG &&
-               qemu_savevm_state_guest_unplug_pending()) {
-            qemu_sem_timedwait(&s->wait_unplug_sem, 250);
-        }
-
-        migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG,
-                MIGRATION_STATUS_ACTIVE);
-    }
+    qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
+                               MIGRATION_STATUS_ACTIVE);
  
      s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
-    migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
-                      MIGRATION_STATUS_ACTIVE);
  
      trace_migration_thread_setup_complete();
  
@@ -3838,21 +3931,9 @@ static void *bg_migration_thread(void *opaque)
      qemu_savevm_state_header(s->to_dst_file);
      qemu_savevm_state_setup(s->to_dst_file);
  
-    if (qemu_savevm_state_guest_unplug_pending()) {
-        migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
-                          MIGRATION_STATUS_WAIT_UNPLUG);
-
-        while (s->state == MIGRATION_STATUS_WAIT_UNPLUG &&
-               qemu_savevm_state_guest_unplug_pending()) {
-            qemu_sem_timedwait(&s->wait_unplug_sem, 250);
-        }
+    qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
+                               MIGRATION_STATUS_ACTIVE);
  
-        migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG,
-                          MIGRATION_STATUS_ACTIVE);
-    } else {
-        migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
-                MIGRATION_STATUS_ACTIVE);
-    }
      s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
  
      trace_migration_thread_setup_complete();
@@ -3950,6 +4031,13 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
      int64_t rate_limit;
      bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;
  
+    /*
+     * If there's a previous error, free it and prepare for another one.
+     * Meanwhile if migration completes successfully, there won't be an error
+     * dumped when calling migrate_fd_cleanup().
+     */
+    migrate_error_free(s);
+
      s->expected_downtime = s->parameters.downtime_limit;
      if (resume) {
          assert(s->cleanup_bh);
@@ -3959,7 +4047,18 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
      }
      if (error_in) {
          migrate_fd_error(s, error_in);
-        migrate_fd_cleanup(s);
+        if (resume) {
+            /*
+             * Don't do cleanup for resume if channel is invalid, but only dump
+             * the error.  We wait for another channel connect from the user.
+             * The error_report still gives HMP user a hint on what failed.
+             * It's normally done in migrate_fd_cleanup(), but call it here
+             * explicitly.
+             */
+            error_report_err(error_copy(s->error));
+        } else {
+            migrate_fd_cleanup(s);
+        }
          return;
      }