migration/postcopy: postpone setting PostcopyState to END

[mirror_qemu.git] / migration / postcopy-ram.c
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c

index 7814da5b4bf5e8b224b78f0d68d55911615cbbae..a793bad477c9ee39de84056eb793ad14af01bef7 100644 (file)
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -23,10 +23,14 @@
  #include "savevm.h"
  #include "postcopy-ram.h"
  #include "ram.h"
+#include "qapi/error.h"
+#include "qemu/notify.h"
+#include "qemu/rcu.h"
  #include "sysemu/sysemu.h"
  #include "sysemu/balloon.h"
  #include "qemu/error-report.h"
  #include "trace.h"
+#include "hw/boards.h"
  
  /* Arbitrary limit on size of each discard command,
   * keeps them around ~200 bytes
@@ -45,6 +49,33 @@ struct PostcopyDiscardState {
      unsigned int nsentcmds;
  };
  
+static NotifierWithReturnList postcopy_notifier_list;
+
+void postcopy_infrastructure_init(void)
+{
+    notifier_with_return_list_init(&postcopy_notifier_list);
+}
+
+void postcopy_add_notifier(NotifierWithReturn *nn)
+{
+    notifier_with_return_list_add(&postcopy_notifier_list, nn);
+}
+
+void postcopy_remove_notifier(NotifierWithReturn *n)
+{
+    notifier_with_return_remove(n);
+}
+
+int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp)
+{
+    struct PostcopyNotifyData pnd;
+    pnd.reason = reason;
+    pnd.errp = errp;
+
+    return notifier_with_return_list_notify(&postcopy_notifier_list,
+                                            &pnd);
+}
+
  /* Postcopy needs to detect accesses to pages that haven't yet been copied
   * across, and efficiently map new pages in, the techniques for doing this
   * are target OS specific.
@@ -63,16 +94,17 @@ struct PostcopyDiscardState {
  
  typedef struct PostcopyBlocktimeContext {
      /* time when page fault initiated per vCPU */
-    int64_t *page_fault_vcpu_time;
+    uint32_t *page_fault_vcpu_time;
      /* page address per vCPU */
      uintptr_t *vcpu_addr;
-    int64_t total_blocktime;
+    uint32_t total_blocktime;
      /* blocktime per vCPU */
-    int64_t *vcpu_blocktime;
+    uint32_t *vcpu_blocktime;
      /* point in time when last page fault was initiated */
-    int64_t last_begin;
+    uint32_t last_begin;
      /* number of vCPU are suspended */
      int smp_cpus_down;
+    uint64_t start_time;
  
      /*
       * Handler for exit event, necessary for
@@ -98,23 +130,27 @@ static void migration_exit_cb(Notifier *n, void *data)
  
  static struct PostcopyBlocktimeContext *blocktime_context_new(void)
  {
+    MachineState *ms = MACHINE(qdev_get_machine());
+    unsigned int smp_cpus = ms->smp.cpus;
      PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
-    ctx->page_fault_vcpu_time = g_new0(int64_t, smp_cpus);
+    ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus);
      ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
-    ctx->vcpu_blocktime = g_new0(int64_t, smp_cpus);
+    ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus);
  
      ctx->exit_notifier.notify = migration_exit_cb;
+    ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
      qemu_add_exit_notifier(&ctx->exit_notifier);
      return ctx;
  }
  
-static int64List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
+static uint32List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
  {
-    int64List *list = NULL, *entry = NULL;
+    MachineState *ms = MACHINE(qdev_get_machine());
+    uint32List *list = NULL, *entry = NULL;
      int i;
  
-    for (i = smp_cpus - 1; i >= 0; i--) {
-        entry = g_new0(int64List, 1);
+    for (i = ms->smp.cpus - 1; i >= 0; i--) {
+        entry = g_new0(uint32List, 1);
          entry->value = ctx->vcpu_blocktime[i];
          entry->next = list;
          list = entry;
@@ -145,7 +181,7 @@ void fill_destination_postcopy_migration_info(MigrationInfo *info)
      info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc);
  }
  
-static uint64_t get_postcopy_total_blocktime(void)
+static uint32_t get_postcopy_total_blocktime(void)
  {
      MigrationIncomingState *mis = migration_incoming_get_current();
      PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
@@ -288,18 +324,12 @@ static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis)
  
  /* Callback from postcopy_ram_supported_by_host block iterator.
   */
-static int test_ramblock_postcopiable(const char *block_name, void *host_addr,
-                             ram_addr_t offset, ram_addr_t length, void *opaque)
+static int test_ramblock_postcopiable(RAMBlock *rb, void *opaque)
  {
-    RAMBlock *rb = qemu_ram_block_by_name(block_name);
+    const char *block_name = qemu_ram_get_idstr(rb);
+    ram_addr_t length = qemu_ram_get_used_length(rb);
      size_t pagesize = qemu_ram_pagesize(rb);
  
-    if (qemu_ram_is_shared(rb)) {
-        error_report("Postcopy on shared RAM (%s) is not yet supported",
-                     block_name);
-        return 1;
-    }
-
      if (length % pagesize) {
          error_report("Postcopy requires RAM blocks to be a page size multiple,"
                       " block %s is 0x" RAM_ADDR_FMT " bytes with a "
@@ -323,6 +353,7 @@ bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
      struct uffdio_register reg_struct;
      struct uffdio_range range_struct;
      uint64_t feature_mask;
+    Error *local_err = NULL;
  
      if (qemu_target_page_size() > pagesize) {
          error_report("Target page size bigger than host page size");
@@ -336,13 +367,19 @@ bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
          goto out;
      }
  
+    /* Give devices a chance to object */
+    if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, &local_err)) {
+        error_report_err(local_err);
+        goto out;
+    }
+
      /* Version and features check */
      if (!ufd_check_and_apply(ufd, mis)) {
          goto out;
      }
  
      /* We don't support postcopy with shared RAM yet */
-    if (qemu_ram_foreach_block(test_ramblock_postcopiable, NULL)) {
+    if (foreach_not_ignored_block(test_ramblock_postcopiable, NULL)) {
          goto out;
      }
  
@@ -411,9 +448,12 @@ out:
   * must be done right at the start prior to pre-copy.
   * opaque should be the MIS.
   */
-static int init_range(const char *block_name, void *host_addr,
-                      ram_addr_t offset, ram_addr_t length, void *opaque)
+static int init_range(RAMBlock *rb, void *opaque)
  {
+    const char *block_name = qemu_ram_get_idstr(rb);
+    void *host_addr = qemu_ram_get_host_addr(rb);
+    ram_addr_t offset = qemu_ram_get_offset(rb);
+    ram_addr_t length = qemu_ram_get_used_length(rb);
      trace_postcopy_init_range(block_name, host_addr, offset, length);
  
      /*
@@ -433,9 +473,12 @@ static int init_range(const char *block_name, void *host_addr,
   * At the end of migration, undo the effects of init_range
   * opaque should be the MIS.
   */
-static int cleanup_range(const char *block_name, void *host_addr,
-                        ram_addr_t offset, ram_addr_t length, void *opaque)
+static int cleanup_range(RAMBlock *rb, void *opaque)
  {
+    const char *block_name = qemu_ram_get_idstr(rb);
+    void *host_addr = qemu_ram_get_host_addr(rb);
+    ram_addr_t offset = qemu_ram_get_offset(rb);
+    ram_addr_t length = qemu_ram_get_used_length(rb);
      MigrationIncomingState *mis = opaque;
      struct uffdio_range range_struct;
      trace_postcopy_cleanup_range(block_name, host_addr, offset, length);
@@ -468,15 +511,29 @@ static int cleanup_range(const char *block_name, void *host_addr,
   * postcopy later; must be called prior to any precopy.
   * called from arch_init's similarly named ram_postcopy_incoming_init
   */
-int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
+int postcopy_ram_incoming_init(MigrationIncomingState *mis)
  {
-    if (qemu_ram_foreach_block(init_range, NULL)) {
+    if (foreach_not_ignored_block(init_range, NULL)) {
          return -1;
      }
  
      return 0;
  }
  
+/*
+ * Manage a single vote to the QEMU balloon inhibitor for all postcopy usage,
+ * last caller wins.
+ */
+static void postcopy_balloon_inhibit(bool state)
+{
+    static bool cur_state = false;
+
+    if (state != cur_state) {
+        qemu_balloon_inhibit(state);
+        cur_state = state;
+    }
+}
+
  /*
   * At the end of a migration where postcopy_ram_incoming_init was called.
   */
@@ -485,31 +542,30 @@ int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
      trace_postcopy_ram_incoming_cleanup_entry();
  
      if (mis->have_fault_thread) {
-        uint64_t tmp64;
+        Error *local_err = NULL;
+
+        /* Let the fault thread quit */
+        atomic_set(&mis->fault_thread_quit, 1);
+        postcopy_fault_thread_notify(mis);
+        trace_postcopy_ram_incoming_cleanup_join();
+        qemu_thread_join(&mis->fault_thread);
  
-        if (qemu_ram_foreach_block(cleanup_range, mis)) {
+        if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
+            error_report_err(local_err);
              return -1;
          }
-        /*
-         * Tell the fault_thread to exit, it's an eventfd that should
-         * currently be at 0, we're going to increment it to 1
-         */
-        tmp64 = 1;
-        if (write(mis->userfault_quit_fd, &tmp64, 8) == 8) {
-            trace_postcopy_ram_incoming_cleanup_join();
-            qemu_thread_join(&mis->fault_thread);
-        } else {
-            /* Not much we can do here, but may as well report it */
-            error_report("%s: incrementing userfault_quit_fd: %s", __func__,
-                         strerror(errno));
+
+        if (foreach_not_ignored_block(cleanup_range, mis)) {
+            return -1;
          }
+
          trace_postcopy_ram_incoming_cleanup_closeuf();
          close(mis->userfault_fd);
-        close(mis->userfault_quit_fd);
+        close(mis->userfault_event_fd);
          mis->have_fault_thread = false;
      }
  
-    qemu_balloon_inhibit(false);
+    postcopy_balloon_inhibit(false);
  
      if (enable_mlock) {
          if (os_mlock() < 0) {
@@ -521,8 +577,6 @@ int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
          }
      }
  
-    postcopy_state_set(POSTCOPY_INCOMING_END);
-
      if (mis->postcopy_tmp_page) {
          munmap(mis->postcopy_tmp_page, mis->largest_page_size);
          mis->postcopy_tmp_page = NULL;
@@ -541,9 +595,12 @@ int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
  /*
   * Disable huge pages on an area
   */
-static int nhp_range(const char *block_name, void *host_addr,
-                    ram_addr_t offset, ram_addr_t length, void *opaque)
+static int nhp_range(RAMBlock *rb, void *opaque)
  {
+    const char *block_name = qemu_ram_get_idstr(rb);
+    void *host_addr = qemu_ram_get_host_addr(rb);
+    ram_addr_t offset = qemu_ram_get_offset(rb);
+    ram_addr_t length = qemu_ram_get_used_length(rb);
      trace_postcopy_nhp_range(block_name, host_addr, offset, length);
  
      /*
@@ -563,7 +620,7 @@ static int nhp_range(const char *block_name, void *host_addr,
   */
  int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
  {
-    if (qemu_ram_foreach_block(nhp_range, mis)) {
+    if (foreach_not_ignored_block(nhp_range, mis)) {
          return -1;
      }
  
@@ -574,22 +631,20 @@ int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
  
  /*
   * Mark the given area of RAM as requiring notification to unwritten areas
- * Used as a  callback on qemu_ram_foreach_block.
+ * Used as a  callback on foreach_not_ignored_block.
   *   host_addr: Base of area to mark
   *   offset: Offset in the whole ram arena
   *   length: Length of the section
   *   opaque: MigrationIncomingState pointer
   * Returns 0 on success
   */
-static int ram_block_enable_notify(const char *block_name, void *host_addr,
-                                   ram_addr_t offset, ram_addr_t length,
-                                   void *opaque)
+static int ram_block_enable_notify(RAMBlock *rb, void *opaque)
  {
      MigrationIncomingState *mis = opaque;
      struct uffdio_register reg_struct;
  
-    reg_struct.range.start = (uintptr_t)host_addr;
-    reg_struct.range.len = length;
+    reg_struct.range.start = (uintptr_t)qemu_ram_get_host_addr(rb);
+    reg_struct.range.len = qemu_ram_get_used_length(rb);
      reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
  
      /* Now tell our userfault_fd that it's responsible for this area */
@@ -601,7 +656,59 @@ static int ram_block_enable_notify(const char *block_name, void *host_addr,
          error_report("%s userfault: Region doesn't support COPY", __func__);
          return -1;
      }
+    if (reg_struct.ioctls & ((__u64)1 << _UFFDIO_ZEROPAGE)) {
+        qemu_ram_set_uf_zeroable(rb);
+    }
+
+    return 0;
+}
+
+int postcopy_wake_shared(struct PostCopyFD *pcfd,
+                         uint64_t client_addr,
+                         RAMBlock *rb)
+{
+    size_t pagesize = qemu_ram_pagesize(rb);
+    struct uffdio_range range;
+    int ret;
+    trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
+    range.start = client_addr & ~(pagesize - 1);
+    range.len = pagesize;
+    ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range);
+    if (ret) {
+        error_report("%s: Failed to wake: %zx in %s (%s)",
+                     __func__, (size_t)client_addr, qemu_ram_get_idstr(rb),
+                     strerror(errno));
+    }
+    return ret;
+}
  
+/*
+ * Callback from shared fault handlers to ask for a page,
+ * the page must be specified by a RAMBlock and an offset in that rb
+ * Note: Only for use by shared fault handlers (in fault thread)
+ */
+int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
+                                 uint64_t client_addr, uint64_t rb_offset)
+{
+    size_t pagesize = qemu_ram_pagesize(rb);
+    uint64_t aligned_rbo = rb_offset & ~(pagesize - 1);
+    MigrationIncomingState *mis = migration_incoming_get_current();
+
+    trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
+                                       rb_offset);
+    if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
+        trace_postcopy_request_shared_page_present(pcfd->idstr,
+                                        qemu_ram_get_idstr(rb), rb_offset);
+        return postcopy_wake_shared(pcfd, client_addr, rb);
+    }
+    if (rb != mis->last_rb) {
+        mis->last_rb = rb;
+        migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
+                                  aligned_rbo, pagesize);
+    } else {
+        /* Save some space */
+        migrate_send_rp_req_pages(mis, NULL, aligned_rbo, pagesize);
+    }
      return 0;
  }
  
@@ -619,6 +726,13 @@ static int get_mem_fault_cpu_index(uint32_t pid)
      return -1;
  }
  
+static uint32_t get_low_time_offset(PostcopyBlocktimeContext *dc)
+{
+    int64_t start_time_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
+                                    dc->start_time;
+    return start_time_offset < 1 ? 1 : start_time_offset & UINT32_MAX;
+}
+
  /*
   * This function is being called when pagefault occurs. It
   * tracks down vCPU blocking time.
@@ -633,7 +747,7 @@ static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
      int cpu, already_received;
      MigrationIncomingState *mis = migration_incoming_get_current();
      PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
-    int64_t now_ms;
+    uint32_t low_time_offset;
  
      if (!dc || ptid == 0) {
          return;
@@ -643,22 +757,24 @@ static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
          return;
      }
  
-    now_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+    low_time_offset = get_low_time_offset(dc);
      if (dc->vcpu_addr[cpu] == 0) {
          atomic_inc(&dc->smp_cpus_down);
      }
  
-    atomic_xchg__nocheck(&dc->last_begin, now_ms);
-    atomic_xchg__nocheck(&dc->page_fault_vcpu_time[cpu], now_ms);
-    atomic_xchg__nocheck(&dc->vcpu_addr[cpu], addr);
+    atomic_xchg(&dc->last_begin, low_time_offset);
+    atomic_xchg(&dc->page_fault_vcpu_time[cpu], low_time_offset);
+    atomic_xchg(&dc->vcpu_addr[cpu], addr);
  
-    /* check it here, not at the begining of the function,
-     * due to, check could accur early than bitmap_set in
-     * qemu_ufd_copy_ioctl */
+    /*
+     * check it here, not at the beginning of the function,
+     * due to, check could occur early than bitmap_set in
+     * qemu_ufd_copy_ioctl
+     */
      already_received = ramblock_recv_bitmap_test(rb, (void *)addr);
      if (already_received) {
-        atomic_xchg__nocheck(&dc->vcpu_addr[cpu], 0);
-        atomic_xchg__nocheck(&dc->page_fault_vcpu_time[cpu], 0);
+        atomic_xchg(&dc->vcpu_addr[cpu], 0);
+        atomic_xchg(&dc->page_fault_vcpu_time[cpu], 0);
          atomic_dec(&dc->smp_cpus_down);
      }
      trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],
@@ -696,31 +812,31 @@ static void mark_postcopy_blocktime_end(uintptr_t addr)
  {
      MigrationIncomingState *mis = migration_incoming_get_current();
      PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
+    MachineState *ms = MACHINE(qdev_get_machine());
+    unsigned int smp_cpus = ms->smp.cpus;
      int i, affected_cpu = 0;
-    int64_t now_ms;
      bool vcpu_total_blocktime = false;
-    int64_t read_vcpu_time;
+    uint32_t read_vcpu_time, low_time_offset;
  
      if (!dc) {
          return;
      }
  
-    now_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
-
+    low_time_offset = get_low_time_offset(dc);
      /* lookup cpu, to clear it,
       * that algorithm looks straighforward, but it's not
       * optimal, more optimal algorithm is keeping tree or hash
       * where key is address value is a list of  */
      for (i = 0; i < smp_cpus; i++) {
-        uint64_t vcpu_blocktime = 0;
+        uint32_t vcpu_blocktime = 0;
  
          read_vcpu_time = atomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);
          if (atomic_fetch_add(&dc->vcpu_addr[i], 0) != addr ||
              read_vcpu_time == 0) {
              continue;
          }
-        atomic_xchg__nocheck(&dc->vcpu_addr[i], 0);
-        vcpu_blocktime = now_ms - read_vcpu_time;
+        atomic_xchg(&dc->vcpu_addr[i], 0);
+        vcpu_blocktime = low_time_offset - read_vcpu_time;
          affected_cpu += 1;
          /* we need to know is that mark_postcopy_end was due to
           * faulted page, another possible case it's prefetched
@@ -735,12 +851,24 @@ static void mark_postcopy_blocktime_end(uintptr_t addr)
  
      atomic_sub(&dc->smp_cpus_down, affected_cpu);
      if (vcpu_total_blocktime) {
-        dc->total_blocktime += now_ms - atomic_fetch_add(&dc->last_begin, 0);
+        dc->total_blocktime += low_time_offset - atomic_fetch_add(
+                &dc->last_begin, 0);
      }
      trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime,
                                        affected_cpu);
  }
  
+static bool postcopy_pause_fault_thread(MigrationIncomingState *mis)
+{
+    trace_postcopy_pause_fault_thread();
+
+    qemu_sem_wait(&mis->postcopy_pause_sem_fault);
+
+    trace_postcopy_pause_fault_thread_continued();
+
+    return true;
+}
+
  /*
   * Handle faults detected by the USERFAULT markings
   */
@@ -749,95 +877,220 @@ static void *postcopy_ram_fault_thread(void *opaque)
      MigrationIncomingState *mis = opaque;
      struct uffd_msg msg;
      int ret;
+    size_t index;
      RAMBlock *rb = NULL;
-    RAMBlock *last_rb = NULL; /* last RAMBlock we sent part of */
  
      trace_postcopy_ram_fault_thread_entry();
+    rcu_register_thread();
+    mis->last_rb = NULL; /* last RAMBlock we sent part of */
      qemu_sem_post(&mis->fault_thread_sem);
  
+    struct pollfd *pfd;
+    size_t pfd_len = 2 + mis->postcopy_remote_fds->len;
+
+    pfd = g_new0(struct pollfd, pfd_len);
+
+    pfd[0].fd = mis->userfault_fd;
+    pfd[0].events = POLLIN;
+    pfd[1].fd = mis->userfault_event_fd;
+    pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
+    trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
+    for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
+        struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
+                                                 struct PostCopyFD, index);
+        pfd[2 + index].fd = pcfd->fd;
+        pfd[2 + index].events = POLLIN;
+        trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
+                                                  pcfd->fd);
+    }
+
      while (true) {
          ram_addr_t rb_offset;
-        struct pollfd pfd[2];
+        int poll_result;
  
          /*
           * We're mainly waiting for the kernel to give us a faulting HVA,
           * however we can be told to quit via userfault_quit_fd which is
           * an eventfd
           */
-        pfd[0].fd = mis->userfault_fd;
-        pfd[0].events = POLLIN;
-        pfd[0].revents = 0;
-        pfd[1].fd = mis->userfault_quit_fd;
-        pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
-        pfd[1].revents = 0;
-
-        if (poll(pfd, 2, -1 /* Wait forever */) == -1) {
+
+        poll_result = poll(pfd, pfd_len, -1 /* Wait forever */);
+        if (poll_result == -1) {
              error_report("%s: userfault poll: %s", __func__, strerror(errno));
              break;
          }
  
-        if (pfd[1].revents) {
-            trace_postcopy_ram_fault_thread_quit();
-            break;
+        if (!mis->to_src_file) {
+            /*
+             * Possibly someone tells us that the return path is
+             * broken already using the event. We should hold until
+             * the channel is rebuilt.
+             */
+            if (postcopy_pause_fault_thread(mis)) {
+                mis->last_rb = NULL;
+                /* Continue to read the userfaultfd */
+            } else {
+                error_report("%s: paused but don't allow to continue",
+                             __func__);
+                break;
+            }
          }
  
-        ret = read(mis->userfault_fd, &msg, sizeof(msg));
-        if (ret != sizeof(msg)) {
-            if (errno == EAGAIN) {
-                /*
-                 * if a wake up happens on the other thread just after
-                 * the poll, there is nothing to read.
-                 */
-                continue;
+        if (pfd[1].revents) {
+            uint64_t tmp64 = 0;
+
+            /* Consume the signal */
+            if (read(mis->userfault_event_fd, &tmp64, 8) != 8) {
+                /* Nothing obviously nicer than posting this error. */
+                error_report("%s: read() failed", __func__);
              }
-            if (ret < 0) {
-                error_report("%s: Failed to read full userfault message: %s",
-                             __func__, strerror(errno));
+
+            if (atomic_read(&mis->fault_thread_quit)) {
+                trace_postcopy_ram_fault_thread_quit();
                  break;
-            } else {
-                error_report("%s: Read %d bytes from userfaultfd expected %zd",
-                             __func__, ret, sizeof(msg));
-                break; /* Lost alignment, don't know what we'd read next */
              }
          }
-        if (msg.event != UFFD_EVENT_PAGEFAULT) {
-            error_report("%s: Read unexpected event %ud from userfaultfd",
-                         __func__, msg.event);
-            continue; /* It's not a page fault, shouldn't happen */
-        }
  
-        rb = qemu_ram_block_from_host(
-                 (void *)(uintptr_t)msg.arg.pagefault.address,
-                 true, &rb_offset);
-        if (!rb) {
-            error_report("postcopy_ram_fault_thread: Fault outside guest: %"
-                         PRIx64, (uint64_t)msg.arg.pagefault.address);
-            break;
-        }
+        if (pfd[0].revents) {
+            poll_result--;
+            ret = read(mis->userfault_fd, &msg, sizeof(msg));
+            if (ret != sizeof(msg)) {
+                if (errno == EAGAIN) {
+                    /*
+                     * if a wake up happens on the other thread just after
+                     * the poll, there is nothing to read.
+                     */
+                    continue;
+                }
+                if (ret < 0) {
+                    error_report("%s: Failed to read full userfault "
+                                 "message: %s",
+                                 __func__, strerror(errno));
+                    break;
+                } else {
+                    error_report("%s: Read %d bytes from userfaultfd "
+                                 "expected %zd",
+                                 __func__, ret, sizeof(msg));
+                    break; /* Lost alignment, don't know what we'd read next */
+                }
+            }
+            if (msg.event != UFFD_EVENT_PAGEFAULT) {
+                error_report("%s: Read unexpected event %ud from userfaultfd",
+                             __func__, msg.event);
+                continue; /* It's not a page fault, shouldn't happen */
+            }
  
-        rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
-        trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
+            rb = qemu_ram_block_from_host(
+                     (void *)(uintptr_t)msg.arg.pagefault.address,
+                     true, &rb_offset);
+            if (!rb) {
+                error_report("postcopy_ram_fault_thread: Fault outside guest: %"
+                             PRIx64, (uint64_t)msg.arg.pagefault.address);
+                break;
+            }
+
+            rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
+            trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
                                                  qemu_ram_get_idstr(rb),
                                                  rb_offset,
                                                  msg.arg.pagefault.feat.ptid);
+            mark_postcopy_blocktime_begin(
+                    (uintptr_t)(msg.arg.pagefault.address),
+                                msg.arg.pagefault.feat.ptid, rb);
  
-        mark_postcopy_blocktime_begin((uintptr_t)(msg.arg.pagefault.address),
-                                      msg.arg.pagefault.feat.ptid, rb);
-        /*
-         * Send the request to the source - we want to request one
-         * of our host page sizes (which is >= TPS)
-         */
-        if (rb != last_rb) {
-            last_rb = rb;
-            migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
-                                     rb_offset, qemu_ram_pagesize(rb));
-        } else {
-            /* Save some space */
-            migrate_send_rp_req_pages(mis, NULL,
-                                     rb_offset, qemu_ram_pagesize(rb));
+retry:
+            /*
+             * Send the request to the source - we want to request one
+             * of our host page sizes (which is >= TPS)
+             */
+            if (rb != mis->last_rb) {
+                mis->last_rb = rb;
+                ret = migrate_send_rp_req_pages(mis,
+                                                qemu_ram_get_idstr(rb),
+                                                rb_offset,
+                                                qemu_ram_pagesize(rb));
+            } else {
+                /* Save some space */
+                ret = migrate_send_rp_req_pages(mis,
+                                                NULL,
+                                                rb_offset,
+                                                qemu_ram_pagesize(rb));
+            }
+
+            if (ret) {
+                /* May be network failure, try to wait for recovery */
+                if (ret == -EIO && postcopy_pause_fault_thread(mis)) {
+                    /* We got reconnected somehow, try to continue */
+                    mis->last_rb = NULL;
+                    goto retry;
+                } else {
+                    /* This is a unavoidable fault */
+                    error_report("%s: migrate_send_rp_req_pages() get %d",
+                                 __func__, ret);
+                    break;
+                }
+            }
+        }
+
+        /* Now handle any requests from external processes on shared memory */
+        /* TODO: May need to handle devices deregistering during postcopy */
+        for (index = 2; index < pfd_len && poll_result; index++) {
+            if (pfd[index].revents) {
+                struct PostCopyFD *pcfd =
+                    &g_array_index(mis->postcopy_remote_fds,
+                                   struct PostCopyFD, index - 2);
+
+                poll_result--;
+                if (pfd[index].revents & POLLERR) {
+                    error_report("%s: POLLERR on poll %zd fd=%d",
+                                 __func__, index, pcfd->fd);
+                    pfd[index].events = 0;
+                    continue;
+                }
+
+                ret = read(pcfd->fd, &msg, sizeof(msg));
+                if (ret != sizeof(msg)) {
+                    if (errno == EAGAIN) {
+                        /*
+                         * if a wake up happens on the other thread just after
+                         * the poll, there is nothing to read.
+                         */
+                        continue;
+                    }
+                    if (ret < 0) {
+                        error_report("%s: Failed to read full userfault "
+                                     "message: %s (shared) revents=%d",
+                                     __func__, strerror(errno),
+                                     pfd[index].revents);
+                        /*TODO: Could just disable this sharer */
+                        break;
+                    } else {
+                        error_report("%s: Read %d bytes from userfaultfd "
+                                     "expected %zd (shared)",
+                                     __func__, ret, sizeof(msg));
+                        /*TODO: Could just disable this sharer */
+                        break; /*Lost alignment,don't know what we'd read next*/
+                    }
+                }
+                if (msg.event != UFFD_EVENT_PAGEFAULT) {
+                    error_report("%s: Read unexpected event %ud "
+                                 "from userfaultfd (shared)",
+                                 __func__, msg.event);
+                    continue; /* It's not a page fault, shouldn't happen */
+                }
+                /* Call the device handler registered with us */
+                ret = pcfd->handler(pcfd, &msg);
+                if (ret) {
+                    error_report("%s: Failed to resolve shared fault on %zd/%s",
+                                 __func__, index, pcfd->idstr);
+                    /* TODO: Fail? Disable this sharer? */
+                }
+            }
          }
      }
+    rcu_unregister_thread();
      trace_postcopy_ram_fault_thread_exit();
+    g_free(pfd);
      return NULL;
  }
  
@@ -860,9 +1113,9 @@ int postcopy_ram_enable_notify(MigrationIncomingState *mis)
      }
  
      /* Now an eventfd we use to tell the fault-thread to quit */
-    mis->userfault_quit_fd = eventfd(0, EFD_CLOEXEC);
-    if (mis->userfault_quit_fd == -1) {
-        error_report("%s: Opening userfault_quit_fd: %s", __func__,
+    mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC);
+    if (mis->userfault_event_fd == -1) {
+        error_report("%s: Opening userfault_event_fd: %s", __func__,
                       strerror(errno));
          close(mis->userfault_fd);
          return -1;
@@ -876,15 +1129,42 @@ int postcopy_ram_enable_notify(MigrationIncomingState *mis)
      mis->have_fault_thread = true;
  
      /* Mark so that we get notified of accesses to unwritten areas */
-    if (qemu_ram_foreach_block(ram_block_enable_notify, mis)) {
+    if (foreach_not_ignored_block(ram_block_enable_notify, mis)) {
+        error_report("ram_block_enable_notify failed");
          return -1;
      }
  
+    mis->postcopy_tmp_page = mmap(NULL, mis->largest_page_size,
+                                  PROT_READ | PROT_WRITE, MAP_PRIVATE |
+                                  MAP_ANONYMOUS, -1, 0);
+    if (mis->postcopy_tmp_page == MAP_FAILED) {
+        mis->postcopy_tmp_page = NULL;
+        error_report("%s: Failed to map postcopy_tmp_page %s",
+                     __func__, strerror(errno));
+        return -1;
+    }
+
+    /*
+     * Map large zero page when kernel can't use UFFDIO_ZEROPAGE for hugepages
+     */
+    mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
+                                       PROT_READ | PROT_WRITE,
+                                       MAP_PRIVATE | MAP_ANONYMOUS,
+                                       -1, 0);
+    if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
+        int e = errno;
+        mis->postcopy_tmp_zero_page = NULL;
+        error_report("%s: Failed to map large zero page %s",
+                     __func__, strerror(e));
+        return -e;
+    }
+    memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
+
      /*
       * Ballooning can mark pages as absent while we're postcopying
       * that would cause false userfaults.
       */
-    qemu_balloon_inhibit(true);
+    postcopy_balloon_inhibit(true);
  
      trace_postcopy_ram_enable_notify();
  
@@ -918,6 +1198,22 @@ static int qemu_ufd_copy_ioctl(int userfault_fd, void *host_addr,
      return ret;
  }
  
+int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset)
+{
+    int i;
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    GArray *pcrfds = mis->postcopy_remote_fds;
+
+    for (i = 0; i < pcrfds->len; i++) {
+        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
+        int ret = cur->waker(cur, rb, offset);
+        if (ret) {
+            return ret;
+        }
+    }
+    return 0;
+}
+
  /*
   * Place a host page (from) at (host) atomically
   * returns 0 on success
@@ -941,7 +1237,8 @@ int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
      }
  
      trace_postcopy_place_page(host);
-    return 0;
+    return postcopy_notify_shared_wake(rb,
+                                       qemu_ram_block_host_offset(rb, host));
  }
  
  /*
@@ -951,62 +1248,26 @@ int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
  int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                               RAMBlock *rb)
  {
+    size_t pagesize = qemu_ram_pagesize(rb);
      trace_postcopy_place_page_zero(host);
  
-    if (qemu_ram_pagesize(rb) == getpagesize()) {
-        if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, NULL, getpagesize(),
-                                rb)) {
+    /* Normal RAMBlocks can zero a page using UFFDIO_ZEROPAGE
+     * but it's not available for everything (e.g. hugetlbpages)
+     */
+    if (qemu_ram_is_uf_zeroable(rb)) {
+        if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, NULL, pagesize, rb)) {
              int e = errno;
              error_report("%s: %s zero host: %p",
                           __func__, strerror(e), host);
  
              return -e;
          }
+        return postcopy_notify_shared_wake(rb,
+                                           qemu_ram_block_host_offset(rb,
+                                                                      host));
      } else {
-        /* The kernel can't use UFFDIO_ZEROPAGE for hugepages */
-        if (!mis->postcopy_tmp_zero_page) {
-            mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
-                                               PROT_READ | PROT_WRITE,
-                                               MAP_PRIVATE | MAP_ANONYMOUS,
-                                               -1, 0);
-            if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
-                int e = errno;
-                mis->postcopy_tmp_zero_page = NULL;
-                error_report("%s: %s mapping large zero page",
-                             __func__, strerror(e));
-                return -e;
-            }
-            memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
-        }
-        return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page,
-                                   rb);
+        return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page, rb);
      }
-
-    return 0;
-}
-
-/*
- * Returns a target page of memory that can be mapped at a later point in time
- * using postcopy_place_page
- * The same address is used repeatedly, postcopy_place_page just takes the
- * backing page away.
- * Returns: Pointer to allocated page
- *
- */
-void *postcopy_get_tmp_page(MigrationIncomingState *mis)
-{
-    if (!mis->postcopy_tmp_page) {
-        mis->postcopy_tmp_page = mmap(NULL, mis->largest_page_size,
-                             PROT_READ | PROT_WRITE, MAP_PRIVATE |
-                             MAP_ANONYMOUS, -1, 0);
-        if (mis->postcopy_tmp_page == MAP_FAILED) {
-            mis->postcopy_tmp_page = NULL;
-            error_report("%s: %s", __func__, strerror(errno));
-            return NULL;
-        }
-    }
-
-    return mis->postcopy_tmp_page;
  }
  
  #else
@@ -1021,7 +1282,7 @@ bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
      return false;
  }
  
-int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
+int postcopy_ram_incoming_init(MigrationIncomingState *mis)
  {
      error_report("postcopy_ram_incoming_init: No OS support");
      return -1;
@@ -1039,6 +1300,13 @@ int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
      return -1;
  }
  
+int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
+                                 uint64_t client_addr, uint64_t rb_offset)
+{
+    assert(0);
+    return -1;
+}
+
  int postcopy_ram_enable_notify(MigrationIncomingState *mis)
  {
      assert(0);
@@ -1059,37 +1327,47 @@ int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
      return -1;
  }
  
-void *postcopy_get_tmp_page(MigrationIncomingState *mis)
+int postcopy_wake_shared(struct PostCopyFD *pcfd,
+                         uint64_t client_addr,
+                         RAMBlock *rb)
  {
      assert(0);
-    return NULL;
+    return -1;
  }
-
  #endif
  
  /* ------------------------------------------------------------------------- */
  
+void postcopy_fault_thread_notify(MigrationIncomingState *mis)
+{
+    uint64_t tmp64 = 1;
+
+    /*
+     * Wakeup the fault_thread.  It's an eventfd that should currently
+     * be at 0, we're going to increment it to 1
+     */
+    if (write(mis->userfault_event_fd, &tmp64, 8) != 8) {
+        /* Not much we can do here, but may as well report it */
+        error_report("%s: incrementing failed: %s", __func__,
+                     strerror(errno));
+    }
+}
+
  /**
   * postcopy_discard_send_init: Called at the start of each RAMBlock before
   *   asking to discard individual ranges.
   *
   * @ms: The current migration state.
- * @offset: the bitmap offset of the named RAMBlock in the migration
- *   bitmap.
+ * @offset: the bitmap offset of the named RAMBlock in the migration bitmap.
   * @name: RAMBlock that discards will operate on.
- *
- * returns: a new PDS.
   */
-PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
-                                                 const char *name)
+static PostcopyDiscardState pds = {0};
+void postcopy_discard_send_init(MigrationState *ms, const char *name)
  {
-    PostcopyDiscardState *res = g_malloc0(sizeof(PostcopyDiscardState));
-
-    if (res) {
-        res->ramblock_name = name;
-    }
-
-    return res;
+    pds.ramblock_name = name;
+    pds.cur_entry = 0;
+    pds.nsentwords = 0;
+    pds.nsentcmds = 0;
  }
  
  /**
@@ -1098,30 +1376,29 @@ PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
   *   be sent later.
   *
   * @ms: Current migration state.
- * @pds: Structure initialised by postcopy_discard_send_init().
   * @start,@length: a range of pages in the migration bitmap in the
   *   RAM block passed to postcopy_discard_send_init() (length=1 is one page)
   */
-void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds,
-                                unsigned long start, unsigned long length)
+void postcopy_discard_send_range(MigrationState *ms, unsigned long start,
+                                 unsigned long length)
  {
      size_t tp_size = qemu_target_page_size();
      /* Convert to byte offsets within the RAM block */
-    pds->start_list[pds->cur_entry] = start  * tp_size;
-    pds->length_list[pds->cur_entry] = length * tp_size;
-    trace_postcopy_discard_send_range(pds->ramblock_name, start, length);
-    pds->cur_entry++;
-    pds->nsentwords++;
+    pds.start_list[pds.cur_entry] = start  * tp_size;
+    pds.length_list[pds.cur_entry] = length * tp_size;
+    trace_postcopy_discard_send_range(pds.ramblock_name, start, length);
+    pds.cur_entry++;
+    pds.nsentwords++;
  
-    if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) {
+    if (pds.cur_entry == MAX_DISCARDS_PER_COMMAND) {
          /* Full set, ship it! */
          qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
-                                              pds->ramblock_name,
-                                              pds->cur_entry,
-                                              pds->start_list,
-                                              pds->length_list);
-        pds->nsentcmds++;
-        pds->cur_entry = 0;
+                                              pds.ramblock_name,
+                                              pds.cur_entry,
+                                              pds.start_list,
+                                              pds.length_list);
+        pds.nsentcmds++;
+        pds.cur_entry = 0;
      }
  }
  
@@ -1130,24 +1407,21 @@ void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds,
   * bitmap code. Sends any outstanding discard messages, frees the PDS
   *
   * @ms: Current migration state.
- * @pds: Structure initialised by postcopy_discard_send_init().
   */
-void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds)
+void postcopy_discard_send_finish(MigrationState *ms)
  {
      /* Anything unsent? */
-    if (pds->cur_entry) {
+    if (pds.cur_entry) {
          qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
-                                              pds->ramblock_name,
-                                              pds->cur_entry,
-                                              pds->start_list,
-                                              pds->length_list);
-        pds->nsentcmds++;
+                                              pds.ramblock_name,
+                                              pds.cur_entry,
+                                              pds.start_list,
+                                              pds.length_list);
+        pds.nsentcmds++;
      }
  
-    trace_postcopy_discard_send_finish(pds->ramblock_name, pds->nsentwords,
-                                       pds->nsentcmds);
-
-    g_free(pds);
+    trace_postcopy_discard_send_finish(pds.ramblock_name, pds.nsentwords,
+                                       pds.nsentcmds);
  }
  
  /*
@@ -1167,3 +1441,31 @@ PostcopyState postcopy_state_set(PostcopyState new_state)
  {
      return atomic_xchg(&incoming_postcopy_state, new_state);
  }
+
+/* Register a handler for external shared memory postcopy
+ * called on the destination.
+ */
+void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+
+    mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
+                                                  *pcfd);
+}
+
+/* Unregister a handler for external shared memory postcopy
+ */
+void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
+{
+    guint i;
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    GArray *pcrfds = mis->postcopy_remote_fds;
+
+    for (i = 0; i < pcrfds->len; i++) {
+        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
+        if (cur->fd == pcfd->fd) {
+            mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
+            return;
+        }
+    }
+}