]> git.proxmox.com Git - mirror_qemu.git/blame - migration/postcopy-ram.c
accel/tcg: Rename tcg_cpus_exec() -> tcg_cpu_exec()
[mirror_qemu.git] / migration / postcopy-ram.c
CommitLineData
eb59db53
DDAG
1/*
2 * Postcopy migration for RAM
3 *
4 * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates
5 *
6 * Authors:
7 * Dave Gilbert <dgilbert@redhat.com>
8 *
9 * This work is licensed under the terms of the GNU GPL, version 2 or later.
10 * See the COPYING file in the top-level directory.
11 *
12 */
13
14/*
15 * Postcopy is a migration technique where the execution flips from the
16 * source to the destination before all the data has been copied.
17 */
18
1393a485 19#include "qemu/osdep.h"
b85ea5fa 20#include "qemu/madvise.h"
51180423 21#include "exec/target_page.h"
6666c96a 22#include "migration.h"
08a0aee1 23#include "qemu-file.h"
20a519a0 24#include "savevm.h"
be07b0ac 25#include "postcopy-ram.h"
7b1e1a22 26#include "ram.h"
1693c64c
DDAG
27#include "qapi/error.h"
28#include "qemu/notify.h"
d4842052 29#include "qemu/rcu.h"
eb59db53
DDAG
30#include "sysemu/sysemu.h"
31#include "qemu/error-report.h"
32#include "trace.h"
5cc8767d 33#include "hw/boards.h"
898ba906 34#include "exec/ramblock.h"
36f62f11 35#include "socket.h"
36f62f11 36#include "yank_functions.h"
f0afaf6c 37#include "tls.h"
d5890ea0 38#include "qemu/userfaultfd.h"
ae30b9b2 39#include "qemu/mmap-alloc.h"
1f0776f1 40#include "options.h"
eb59db53 41
e0b266f0
DDAG
/* Arbitrary limit on size of each discard command,
 * keeps them around ~200 bytes
 */
#define MAX_DISCARDS_PER_COMMAND 12

/*
 * Accumulates discard ranges for one RAMBlock so they can be sent in
 * batches of up to MAX_DISCARDS_PER_COMMAND rather than one at a time.
 */
struct PostcopyDiscardState {
    const char *ramblock_name;
    /* Number of range entries currently buffered in the lists below */
    uint16_t cur_entry;
    /*
     * Start and length of a discard range (bytes)
     */
    uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
    uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
    /* Cumulative counters — presumably for tracing/statistics; confirm
     * against the code that flushes this state (not visible in this chunk) */
    unsigned int nsentwords;
    unsigned int nsentcmds;
};
58
1693c64c
DDAG
/* Notifiers invoked at postcopy events (e.g. POSTCOPY_NOTIFY_PROBE) */
static NotifierWithReturnList postcopy_notifier_list;

/* One-time setup of the notifier list; must run before add/notify calls */
void postcopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&postcopy_notifier_list);
}
65
/* Register @nn to be called on postcopy events (see postcopy_notify) */
void postcopy_add_notifier(NotifierWithReturn *nn)
{
    notifier_with_return_list_add(&postcopy_notifier_list, nn);
}
70
/* Unregister a notifier previously added with postcopy_add_notifier() */
void postcopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}
75
76int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp)
77{
78 struct PostcopyNotifyData pnd;
79 pnd.reason = reason;
80 pnd.errp = errp;
81
82 return notifier_with_return_list_notify(&postcopy_notifier_list,
83 &pnd);
84}
85
095c12a4
PX
/*
 * NOTE: this routine is not thread safe, we can't call it concurrently. But it
 * should be good enough for migration's purposes.
 */
void postcopy_thread_create(MigrationIncomingState *mis,
                            QemuThread *thread, const char *name,
                            void *(*fn)(void *), int joinable)
{
    qemu_sem_init(&mis->thread_sync_sem, 0);
    qemu_thread_create(thread, name, fn, mis, joinable);
    /* Block until the new thread posts thread_sync_sem to confirm startup */
    qemu_sem_wait(&mis->thread_sync_sem);
    qemu_sem_destroy(&mis->thread_sync_sem);
}
99
eb59db53
DDAG
100/* Postcopy needs to detect accesses to pages that haven't yet been copied
101 * across, and efficiently map new pages in, the techniques for doing this
102 * are target OS specific.
103 */
104#if defined(__linux__)
105
c4faeed2 106#include <poll.h>
eb59db53
DDAG
107#include <sys/ioctl.h>
108#include <sys/syscall.h>
eb59db53
DDAG
109#include <asm/types.h> /* for __u64 */
110#endif
111
d8b9d771
MF
112#if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
113#include <sys/eventfd.h>
eb59db53
DDAG
114#include <linux/userfaultfd.h>
115
2a4c42f1
AP
/*
 * Per-incoming-migration state for computing how long vCPUs were blocked
 * on postcopy page faults.  All timestamps are 32-bit offsets in ms
 * relative to start_time (see get_low_time_offset()).
 */
typedef struct PostcopyBlocktimeContext {
    /* time when page fault initiated per vCPU */
    uint32_t *page_fault_vcpu_time;
    /* page address per vCPU */
    uintptr_t *vcpu_addr;
    /* total time when *all* vCPUs were blocked simultaneously */
    uint32_t total_blocktime;
    /* blocktime per vCPU */
    uint32_t *vcpu_blocktime;
    /* point in time when last page fault was initiated */
    uint32_t last_begin;
    /* number of vCPU are suspended */
    int smp_cpus_down;
    /* base timestamp (ms, QEMU_CLOCK_REALTIME) for the offsets above */
    uint64_t start_time;

    /*
     * Handler for exit event, necessary for
     * releasing whole blocktime_ctx
     */
    Notifier exit_notifier;
} PostcopyBlocktimeContext;
136
137static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
138{
139 g_free(ctx->page_fault_vcpu_time);
140 g_free(ctx->vcpu_addr);
141 g_free(ctx->vcpu_blocktime);
142 g_free(ctx);
143}
144
/* Exit notifier callback: free the blocktime context at QEMU shutdown */
static void migration_exit_cb(Notifier *n, void *data)
{
    PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext,
                                                 exit_notifier);
    destroy_blocktime_context(ctx);
}
151
/*
 * Allocate a PostcopyBlocktimeContext sized for the machine's vCPU count
 * and register an exit notifier so it is released at shutdown.
 */
static struct PostcopyBlocktimeContext *blocktime_context_new(void)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    unsigned int smp_cpus = ms->smp.cpus;
    PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
    ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus);
    ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
    ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus);

    ctx->exit_notifier.notify = migration_exit_cb;
    /* Base timestamp: later fault times are stored relative to this */
    ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    qemu_add_exit_notifier(&ctx->exit_notifier);
    return ctx;
}
ca6011c2 166
65ace060
AP
167static uint32List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
168{
5cc8767d 169 MachineState *ms = MACHINE(qdev_get_machine());
54aa3de7 170 uint32List *list = NULL;
65ace060
AP
171 int i;
172
5cc8767d 173 for (i = ms->smp.cpus - 1; i >= 0; i--) {
54aa3de7 174 QAPI_LIST_PREPEND(list, ctx->vcpu_blocktime[i]);
65ace060
AP
175 }
176
177 return list;
178}
179
/*
 * This function just populates MigrationInfo from postcopy's
 * blocktime context. It will not populate MigrationInfo,
 * unless postcopy-blocktime capability was set.
 *
 * @info: pointer to MigrationInfo to populate
 */
void fill_destination_postcopy_migration_info(MigrationInfo *info)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *bc = mis->blocktime_ctx;

    /* blocktime_ctx is only allocated when migrate_postcopy_blocktime() */
    if (!bc) {
        return;
    }

    info->has_postcopy_blocktime = true;
    info->postcopy_blocktime = bc->total_blocktime;
    info->has_postcopy_vcpu_blocktime = true;
    info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc);
}
201
202static uint32_t get_postcopy_total_blocktime(void)
203{
204 MigrationIncomingState *mis = migration_incoming_get_current();
205 PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
206
207 if (!bc) {
208 return 0;
209 }
210
211 return bc->total_blocktime;
212}
213
54ae0886
AP
214/**
215 * receive_ufd_features: check userfault fd features, to request only supported
216 * features in the future.
217 *
218 * Returns: true on success
219 *
220 * __NR_userfaultfd - should be checked before
221 * @features: out parameter will contain uffdio_api.features provided by kernel
222 * in case of success
223 */
224static bool receive_ufd_features(uint64_t *features)
eb59db53 225{
54ae0886
AP
226 struct uffdio_api api_struct = {0};
227 int ufd;
228 bool ret = true;
229
d5890ea0 230 ufd = uffd_open(O_CLOEXEC);
54ae0886 231 if (ufd == -1) {
d5890ea0 232 error_report("%s: uffd_open() failed: %s", __func__, strerror(errno));
54ae0886
AP
233 return false;
234 }
eb59db53 235
54ae0886 236 /* ask features */
eb59db53
DDAG
237 api_struct.api = UFFD_API;
238 api_struct.features = 0;
239 if (ioctl(ufd, UFFDIO_API, &api_struct)) {
5553499f 240 error_report("%s: UFFDIO_API failed: %s", __func__,
eb59db53 241 strerror(errno));
54ae0886
AP
242 ret = false;
243 goto release_ufd;
244 }
245
246 *features = api_struct.features;
247
248release_ufd:
249 close(ufd);
250 return ret;
251}
252
/**
 * request_ufd_features: this function should be called only once on a newly
 * opened ufd, subsequent calls will lead to error.
 *
 * Returns: true on success
 *
 * @ufd: fd obtained from userfaultfd syscall
 * @features: bit mask see UFFD_API_FEATURES
 */
static bool request_ufd_features(int ufd, uint64_t features)
{
    struct uffdio_api api_struct = {0};
    uint64_t ioctl_mask;

    api_struct.api = UFFD_API;
    api_struct.features = features;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        error_report("%s failed: UFFDIO_API failed: %s", __func__,
                     strerror(errno));
        return false;
    }

    /* REGISTER/UNREGISTER are the minimum ioctls we need on this fd */
    ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
                 (__u64)1 << _UFFDIO_UNREGISTER;
    if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
        error_report("Missing userfault features: %" PRIx64,
                     (uint64_t)(~api_struct.ioctls & ioctl_mask));
        return false;
    }

    return true;
}
285
74c38cf7
PX
/*
 * Verify the userfaultfd features we rely on and enable the optional ones
 * we want (thread-id tracking for blocktime accounting).
 *
 * Returns true on success; sets @errp on failure.
 *
 * @ufd: freshly opened userfaultfd (UFFD_API not yet issued on it)
 * @mis: incoming state; blocktime_ctx may be allocated here
 */
static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis,
                                Error **errp)
{
    uint64_t asked_features = 0;
    /* Cached across calls: kernel feature set is the same for every fd */
    static uint64_t supported_features;

    ERRP_GUARD();
    /*
     * it's not possible to
     * request UFFD_API twice per one fd
     * userfault fd features is persistent
     */
    if (!supported_features) {
        if (!receive_ufd_features(&supported_features)) {
            error_setg(errp, "Userfault feature detection failed");
            return false;
        }
    }

#ifdef UFFD_FEATURE_THREAD_ID
    if (UFFD_FEATURE_THREAD_ID & supported_features) {
        asked_features |= UFFD_FEATURE_THREAD_ID;
        if (migrate_postcopy_blocktime()) {
            if (!mis->blocktime_ctx) {
                mis->blocktime_ctx = blocktime_context_new();
            }
        }
    }
#endif

    /*
     * request features, even if asked_features is 0, due to
     * kernel expects UFFD_API before UFFDIO_REGISTER, per
     * userfault file descriptor
     */
    if (!request_ufd_features(ufd, asked_features)) {
        error_setg(errp, "Failed features %" PRIu64, asked_features);
        return false;
    }

    /* Any RAMBlock with a non-host page size implies hugepages in use */
    if (qemu_real_host_page_size() != ram_pagesize_summary()) {
        bool have_hp = false;
        /* We've got a huge page */
#ifdef UFFD_FEATURE_MISSING_HUGETLBFS
        have_hp = supported_features & UFFD_FEATURE_MISSING_HUGETLBFS;
#endif
        if (!have_hp) {
            error_setg(errp,
                       "Userfault on this host does not support huge pages");
            return false;
        }
    }
    return true;
}
340
8679638b
DDAG
/* Callback from postcopy_ram_supported_by_host block iterator.
 * Returns 0 when @rb is usable for postcopy, 1 (with @errp set) otherwise.
 */
static int test_ramblock_postcopiable(RAMBlock *rb, Error **errp)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    size_t pagesize = qemu_ram_pagesize(rb);
    QemuFsType fs;

    /* Whole pages are placed at a time; a partial trailing page can't be */
    if (length % pagesize) {
        error_setg(errp,
                   "Postcopy requires RAM blocks to be a page size multiple,"
                   " block %s is 0x" RAM_ADDR_FMT " bytes with a "
                   "page size of 0x%zx", block_name, length, pagesize);
        return 1;
    }

    /* File-backed blocks: only tmpfs/hugetlbfs backends are accepted */
    if (rb->fd >= 0) {
        fs = qemu_fd_getfs(rb->fd);
        if (fs != QEMU_FS_TYPE_TMPFS && fs != QEMU_FS_TYPE_HUGETLBFS) {
            error_setg(errp,
                       "Host backend files need to be TMPFS or HUGETLBFS only");
            return 1;
        }
    }

    return 0;
}
369
58b7c17e
DDAG
/*
 * Probe whether this host can support postcopy: page sizes, userfaultfd
 * availability/features, RAMBlock backends, and the uffd ioctls we need on
 * anonymous memory (tested via a throwaway one-page mapping).
 *
 * Note: This has the side effect of munlock'ing all of RAM, that's
 * normally fine since if the postcopy succeeds it gets turned back on at the
 * end.
 */
bool postcopy_ram_supported_by_host(MigrationIncomingState *mis, Error **errp)
{
    long pagesize = qemu_real_host_page_size();
    int ufd = -1;
    bool ret = false; /* Error unless we change it */
    void *testarea = NULL;
    struct uffdio_register reg_struct;
    struct uffdio_range range_struct;
    uint64_t feature_mask;
    RAMBlock *block;

    ERRP_GUARD();
    if (qemu_target_page_size() > pagesize) {
        error_setg(errp, "Target page size bigger than host page size");
        goto out;
    }

    ufd = uffd_open(O_CLOEXEC);
    if (ufd == -1) {
        error_setg(errp, "Userfaultfd not available: %s", strerror(errno));
        goto out;
    }

    /* Give devices a chance to object */
    if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, errp)) {
        goto out;
    }

    /* Version and features check */
    if (!ufd_check_and_apply(ufd, mis, errp)) {
        goto out;
    }

    /*
     * We don't support postcopy with some type of ramblocks.
     *
     * NOTE: we explicitly ignored migrate_ram_is_ignored() instead we checked
     * all possible ramblocks. This is because this function can be called
     * when creating the migration object, during the phase RAM_MIGRATABLE
     * is not even properly set for all the ramblocks.
     *
     * A side effect of this is we'll also check against RAM_SHARED
     * ramblocks even if migrate_ignore_shared() is set (in which case
     * we'll never migrate RAM_SHARED at all), but normally this shouldn't
     * affect in reality, or we can revisit.
     */
    RAMBLOCK_FOREACH(block) {
        if (test_ramblock_postcopiable(block, errp)) {
            goto out;
        }
    }

    /*
     * userfault and mlock don't go together; we'll put it back later if
     * it was enabled.
     */
    if (munlockall()) {
        error_setg(errp, "munlockall() failed: %s", strerror(errno));
        goto out;
    }

    /*
     * We need to check that the ops we need are supported on anon memory
     * To do that we need to register a chunk and see the flags that
     * are returned.
     */
    testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
                                    MAP_ANONYMOUS, -1, 0);
    if (testarea == MAP_FAILED) {
        error_setg(errp, "Failed to map test area: %s", strerror(errno));
        goto out;
    }
    g_assert(QEMU_PTR_IS_ALIGNED(testarea, pagesize));

    reg_struct.range.start = (uintptr_t)testarea;
    reg_struct.range.len = pagesize;
    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

    if (ioctl(ufd, UFFDIO_REGISTER, &reg_struct)) {
        error_setg(errp, "UFFDIO_REGISTER failed: %s", strerror(errno));
        goto out;
    }

    range_struct.start = (uintptr_t)testarea;
    range_struct.len = pagesize;
    if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
        error_setg(errp, "UFFDIO_UNREGISTER failed: %s", strerror(errno));
        goto out;
    }

    /* WAKE, COPY and ZEROPAGE must all be available on registered ranges */
    feature_mask = (__u64)1 << _UFFDIO_WAKE |
                   (__u64)1 << _UFFDIO_COPY |
                   (__u64)1 << _UFFDIO_ZEROPAGE;
    if ((reg_struct.ioctls & feature_mask) != feature_mask) {
        error_setg(errp, "Missing userfault map features: %" PRIx64,
                   (uint64_t)(~reg_struct.ioctls & feature_mask));
        goto out;
    }

    /* Success! */
    ret = true;
out:
    if (testarea) {
        munmap(testarea, pagesize);
    }
    if (ufd != -1) {
        close(ufd);
    }
    return ret;
}
485
1caddf8a
DDAG
/*
 * Setup an area of RAM so that it *can* be used for postcopy later; this
 * must be done right at the start prior to pre-copy.
 * opaque should be the MIS.
 */
static int init_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    trace_postcopy_init_range(block_name, host_addr, offset, length);

    /*
     * Save the used_length before running the guest. In case we have to
     * resize RAM blocks when syncing RAM block sizes from the source during
     * precopy, we'll update it manually via the ram block notifier.
     */
    rb->postcopy_length = length;

    /*
     * We need the whole of RAM to be truly empty for postcopy, so things
     * like ROMs and any data tables built during init must be zero'd
     * - we're going to get the copy from the source anyway.
     * (Precopy will just overwrite this data, so doesn't need the discard)
     */
    if (ram_discard_range(block_name, 0, length)) {
        return -1;
    }

    return 0;
}
518
/*
 * At the end of migration, undo the effects of init_range
 * opaque should be the MIS.
 */
static int cleanup_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = rb->postcopy_length;
    MigrationIncomingState *mis = opaque;
    struct uffdio_range range_struct;
    trace_postcopy_cleanup_range(block_name, host_addr, offset, length);

    /*
     * We turned off hugepage for the precopy stage with postcopy enabled
     * we can turn it back on now.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);

    /*
     * We can also turn off userfault now since we should have all the
     * pages.  It can be useful to leave it on to debug postcopy
     * if you're not sure it's always getting every page.
     */
    range_struct.start = (uintptr_t)host_addr;
    range_struct.len = length;

    if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
        error_report("%s: userfault unregister %s", __func__, strerror(errno));

        return -1;
    }

    return 0;
}
555
556/*
557 * Initialise postcopy-ram, setting the RAM to a state where we can go into
558 * postcopy later; must be called prior to any precopy.
559 * called from arch_init's similarly named ram_postcopy_incoming_init
560 */
c136180c 561int postcopy_ram_incoming_init(MigrationIncomingState *mis)
1caddf8a 562{
fbd162e6 563 if (foreach_not_ignored_block(init_range, NULL)) {
1caddf8a
DDAG
564 return -1;
565 }
566
567 return 0;
568}
569
476ebf77
PX
/* Unmap and free the per-channel temp huge pages and the shared zero page */
static void postcopy_temp_pages_cleanup(MigrationIncomingState *mis)
{
    int i;

    if (mis->postcopy_tmp_pages) {
        for (i = 0; i < mis->postcopy_channels; i++) {
            if (mis->postcopy_tmp_pages[i].tmp_huge_page) {
                munmap(mis->postcopy_tmp_pages[i].tmp_huge_page,
                       mis->largest_page_size);
                mis->postcopy_tmp_pages[i].tmp_huge_page = NULL;
            }
        }
        g_free(mis->postcopy_tmp_pages);
        mis->postcopy_tmp_pages = NULL;
    }

    if (mis->postcopy_tmp_zero_page) {
        munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
        mis->postcopy_tmp_zero_page = NULL;
    }
}
591
1caddf8a
DDAG
/*
 * At the end of a migration where postcopy_ram_incoming_init was called.
 * Shuts down the preempt and fault threads, unregisters userfault on all
 * blocks, restores mlock if configured, and frees temp pages.
 */
int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    trace_postcopy_ram_incoming_cleanup_entry();

    if (mis->preempt_thread_status == PREEMPT_THREAD_CREATED) {
        /* Notify the fast load thread to quit */
        mis->preempt_thread_status = PREEMPT_THREAD_QUIT;
        /*
         * Update preempt_thread_status before reading count. Note: mutex
         * lock only provide ACQUIRE semantic, and it doesn't stops this
         * write to be reordered after reading the count.
         */
        smp_mb();
        /*
         * It's possible that the preempt thread is still handling the last
         * pages to arrive which were requested by guest page faults.
         * Making sure nothing is left behind by waiting on the condvar if
         * that unlikely case happened.
         */
        WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
            if (qatomic_read(&mis->page_requested_count)) {
                /*
                 * It is guaranteed to receive a signal later, because the
                 * count>0 now, so it's destined to be decreased to zero
                 * very soon by the preempt thread.
                 */
                qemu_cond_wait(&mis->page_request_cond,
                               &mis->page_request_mutex);
            }
        }
        /* Notify the fast load thread to quit */
        if (mis->postcopy_qemufile_dst) {
            qemu_file_shutdown(mis->postcopy_qemufile_dst);
        }
        qemu_thread_join(&mis->postcopy_prio_thread);
        mis->preempt_thread_status = PREEMPT_THREAD_NONE;
    }

    if (mis->have_fault_thread) {
        Error *local_err = NULL;

        /* Let the fault thread quit */
        qatomic_set(&mis->fault_thread_quit, 1);
        postcopy_fault_thread_notify(mis);
        trace_postcopy_ram_incoming_cleanup_join();
        qemu_thread_join(&mis->fault_thread);

        if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
            error_report_err(local_err);
            return -1;
        }

        /* Re-enable THP and unregister every block from userfault */
        if (foreach_not_ignored_block(cleanup_range, mis)) {
            return -1;
        }

        trace_postcopy_ram_incoming_cleanup_closeuf();
        close(mis->userfault_fd);
        close(mis->userfault_event_fd);
        mis->have_fault_thread = false;
    }

    if (enable_mlock) {
        if (os_mlock() < 0) {
            error_report("mlock: %s", strerror(errno));
            /*
             * It doesn't feel right to fail at this point, we have a valid
             * VM state.
             */
        }
    }

    postcopy_temp_pages_cleanup(mis);

    trace_postcopy_ram_incoming_cleanup_blocktime(
            get_postcopy_total_blocktime());

    trace_postcopy_ram_incoming_cleanup_exit();
    return 0;
}
675
f9527107
DDAG
/*
 * Disable huge pages on an area
 */
static int nhp_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = rb->postcopy_length;
    trace_postcopy_nhp_range(block_name, host_addr, offset, length);

    /*
     * Before we do discards we need to ensure those discards really
     * do delete areas of the page, even if THP thinks a hugepage would
     * be a good idea, so force hugepages off.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);

    return 0;
}
696
/*
 * Userfault requires us to mark RAM as NOHUGEPAGE prior to discard
 * however leaving it until after precopy means that most of the precopy
 * data is still THPd
 */
int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    if (foreach_not_ignored_block(nhp_range, mis)) {
        return -1;
    }

    /* Record that incoming postcopy has entered the discard phase */
    postcopy_state_set(POSTCOPY_INCOMING_DISCARD);

    return 0;
}
712
f0a227ad
DDAG
/*
 * Mark the given area of RAM as requiring notification to unwritten areas
 * Used as a callback on foreach_not_ignored_block.
 *   host_addr: Base of area to mark
 *   offset: Offset in the whole ram arena
 *   length: Length of the section
 *   opaque: MigrationIncomingState pointer
 * Returns 0 on success
 */
static int ram_block_enable_notify(RAMBlock *rb, void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffdio_register reg_struct;

    reg_struct.range.start = (uintptr_t)qemu_ram_get_host_addr(rb);
    reg_struct.range.len = rb->postcopy_length;
    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

    /* Now tell our userfault_fd that it's responsible for this area */
    if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, &reg_struct)) {
        error_report("%s userfault register: %s", __func__, strerror(errno));
        return -1;
    }
    /* COPY is mandatory — it's how missing pages get placed */
    if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
        error_report("%s userfault: Region doesn't support COPY", __func__);
        return -1;
    }
    /* ZEROPAGE is optional; remember per-block whether it's usable */
    if (reg_struct.ioctls & ((__u64)1 << _UFFDIO_ZEROPAGE)) {
        qemu_ram_set_uf_zeroable(rb);
    }

    return 0;
}
746
5efc3564
DDAG
/*
 * Wake any thread blocked on a fault in the host page containing
 * @client_addr, via the shared userfault fd @pcfd.
 * Returns the UFFDIO_WAKE ioctl result (0 on success).
 */
int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    struct uffdio_range range;
    int ret;
    trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
    /* Wake the whole host page that the faulting address falls in */
    range.start = ROUND_DOWN(client_addr, pagesize);
    range.len = pagesize;
    ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range);
    if (ret) {
        error_report("%s: Failed to wake: %zx in %s (%s)",
                     __func__, (size_t)client_addr, qemu_ram_get_idstr(rb),
                     strerror(errno));
    }
    return ret;
}
765
9470c5e0
DH
/*
 * Request one page from the source, or place a zero page locally if the
 * page was discarded on the source and therefore will never be sent.
 *
 * @start: page-aligned offset within @rb
 * @haddr: faulting host virtual address
 */
static int postcopy_request_page(MigrationIncomingState *mis, RAMBlock *rb,
                                 ram_addr_t start, uint64_t haddr)
{
    void *aligned = (void *)(uintptr_t)ROUND_DOWN(haddr, qemu_ram_pagesize(rb));

    /*
     * Discarded pages (via RamDiscardManager) are never migrated. On unlikely
     * access, place a zeropage, which will also set the relevant bits in the
     * recv_bitmap accordingly, so we won't try placing a zeropage twice.
     *
     * Checking a single bit is sufficient to handle pagesize > TPS as either
     * all relevant bits are set or not.
     */
    assert(QEMU_IS_ALIGNED(start, qemu_ram_pagesize(rb)));
    if (ramblock_page_is_discarded(rb, start)) {
        bool received = ramblock_recv_bitmap_test_byte_offset(rb, start);

        return received ? 0 : postcopy_place_page_zero(mis, aligned, rb);
    }

    return migrate_send_rp_req_pages(mis, rb, start, haddr);
}
788
096bf4c8
DDAG
/*
 * Callback from shared fault handlers to ask for a page,
 * the page must be specified by a RAMBlock and an offset in that rb
 * Note: Only for use by shared fault handlers (in fault thread)
 */
int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    uint64_t aligned_rbo = ROUND_DOWN(rb_offset, qemu_ram_pagesize(rb));
    MigrationIncomingState *mis = migration_incoming_get_current();

    trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
                                       rb_offset);
    /* Page already arrived: just wake the blocked faulter */
    if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
        trace_postcopy_request_shared_page_present(pcfd->idstr,
                                        qemu_ram_get_idstr(rb), rb_offset);
        return postcopy_wake_shared(pcfd, client_addr, rb);
    }
    /*
     * NOTE(review): the result of postcopy_request_page() is discarded, so
     * a failed request is not reported to the caller — confirm intentional.
     */
    postcopy_request_page(mis, rb, aligned_rbo, client_addr);
    return 0;
}
810
575b0b33
AP
/*
 * Map a faulting thread id to its vCPU index; returns -1 when @pid does
 * not belong to any vCPU thread.
 */
static int get_mem_fault_cpu_index(uint32_t pid)
{
    CPUState *cpu_iter;

    CPU_FOREACH(cpu_iter) {
        if (cpu_iter->thread_id == pid) {
            trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid);
            return cpu_iter->cpu_index;
        }
    }
    trace_get_mem_fault_cpu_index(-1, pid);
    return -1;
}
824
825static uint32_t get_low_time_offset(PostcopyBlocktimeContext *dc)
826{
827 int64_t start_time_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
828 dc->start_time;
829 return start_time_offset < 1 ? 1 : start_time_offset & UINT32_MAX;
830}
831
/*
 * This function is being called when pagefault occurs. It
 * tracks down vCPU blocking time.
 *
 * @addr: faulted host virtual address
 * @ptid: faulted process thread id
 * @rb: ramblock appropriate to addr
 */
static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
                                          RAMBlock *rb)
{
    int cpu, already_received;
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    uint32_t low_time_offset;

    /* Nothing to do without blocktime tracking or a usable thread id */
    if (!dc || ptid == 0) {
        return;
    }
    cpu = get_mem_fault_cpu_index(ptid);
    if (cpu < 0) {
        /* Fault didn't come from a vCPU thread */
        return;
    }

    low_time_offset = get_low_time_offset(dc);
    /* First outstanding fault for this vCPU: one more vCPU is now blocked */
    if (dc->vcpu_addr[cpu] == 0) {
        qatomic_inc(&dc->smp_cpus_down);
    }

    qatomic_xchg(&dc->last_begin, low_time_offset);
    qatomic_xchg(&dc->page_fault_vcpu_time[cpu], low_time_offset);
    qatomic_xchg(&dc->vcpu_addr[cpu], addr);

    /*
     * check it here, not at the beginning of the function,
     * due to, check could occur early than bitmap_set in
     * qemu_ufd_copy_ioctl
     */
    already_received = ramblock_recv_bitmap_test(rb, (void *)addr);
    if (already_received) {
        /* The page raced in already: undo the bookkeeping above */
        qatomic_xchg(&dc->vcpu_addr[cpu], 0);
        qatomic_xchg(&dc->page_fault_vcpu_time[cpu], 0);
        qatomic_dec(&dc->smp_cpus_down);
    }
    trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],
                                        cpu, already_received);
}
879
/*
 * This function just provide calculated blocktime per cpu and trace it.
 * Total blocktime is calculated in mark_postcopy_blocktime_end.
 *
 *
 * Assume we have 3 CPU
 *
 *      S1        E1           S1               E1
 * -----***********------------xxx***************------------------------> CPU1
 *
 *             S2                E2
 * ------------****************xxx---------------------------------------> CPU2
 *
 *                         S3            E3
 * ------------------------****xxx********-------------------------------> CPU3
 *
 * We have sequence S1,S2,E1,S3,S1,E2,E3,E1
 * S2,E1 - doesn't match condition due to sequence S1,S2,E1 doesn't include CPU3
 * S3,S1,E2 - sequence includes all CPUs, in this case overlap will be S1,E2 -
 *            it's a part of total blocktime.
 * S1 - here is last_begin
 * Legend of the picture is following:
 *              * - means blocktime per vCPU
 *              x - means overlapped blocktime (total blocktime)
 *
 * @addr: host virtual address
 */
static void mark_postcopy_blocktime_end(uintptr_t addr)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    MachineState *ms = MACHINE(qdev_get_machine());
    unsigned int smp_cpus = ms->smp.cpus;
    int i, affected_cpu = 0;
    bool vcpu_total_blocktime = false;
    uint32_t read_vcpu_time, low_time_offset;

    if (!dc) {
        return;
    }

    low_time_offset = get_low_time_offset(dc);
    /* lookup cpu, to clear it,
     * that algorithm looks straightforward, but it's not
     * optimal, more optimal algorithm is keeping tree or hash
     * where key is address value is a list of  */
    for (i = 0; i < smp_cpus; i++) {
        uint32_t vcpu_blocktime = 0;

        read_vcpu_time = qatomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);
        if (qatomic_fetch_add(&dc->vcpu_addr[i], 0) != addr ||
            read_vcpu_time == 0) {
            /* This vCPU wasn't blocked on this particular page */
            continue;
        }
        qatomic_xchg(&dc->vcpu_addr[i], 0);
        vcpu_blocktime = low_time_offset - read_vcpu_time;
        affected_cpu += 1;
        /* we need to know is that mark_postcopy_end was due to
         * faulted page, another possible case it's prefetched
         * page and in that case we shouldn't be here */
        if (!vcpu_total_blocktime &&
            qatomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) {
            vcpu_total_blocktime = true;
        }
        /* continue cycle, due to one page could affect several vCPUs */
        dc->vcpu_blocktime[i] += vcpu_blocktime;
    }

    qatomic_sub(&dc->smp_cpus_down, affected_cpu);
    if (vcpu_total_blocktime) {
        /* All vCPUs were down: the overlap since last_begin counts as total */
        dc->total_blocktime += low_time_offset - qatomic_fetch_add(
                &dc->last_begin, 0);
    }
    trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime,
                                      affected_cpu);
}
956
27dd21b4 957static void postcopy_pause_fault_thread(MigrationIncomingState *mis)
3a7804c3
PX
958{
959 trace_postcopy_pause_fault_thread();
3a7804c3 960 qemu_sem_wait(&mis->postcopy_pause_sem_fault);
3a7804c3 961 trace_postcopy_pause_fault_thread_continued();
3a7804c3
PX
962}
963
f0a227ad
DDAG
964/*
965 * Handle faults detected by the USERFAULT markings
966 */
967static void *postcopy_ram_fault_thread(void *opaque)
968{
969 MigrationIncomingState *mis = opaque;
c4faeed2
DDAG
970 struct uffd_msg msg;
971 int ret;
00fa4fc8 972 size_t index;
c4faeed2 973 RAMBlock *rb = NULL;
f0a227ad 974
c4faeed2 975 trace_postcopy_ram_fault_thread_entry();
74637e6f 976 rcu_register_thread();
096bf4c8 977 mis->last_rb = NULL; /* last RAMBlock we sent part of */
095c12a4 978 qemu_sem_post(&mis->thread_sync_sem);
f0a227ad 979
00fa4fc8
DDAG
980 struct pollfd *pfd;
981 size_t pfd_len = 2 + mis->postcopy_remote_fds->len;
982
983 pfd = g_new0(struct pollfd, pfd_len);
984
985 pfd[0].fd = mis->userfault_fd;
986 pfd[0].events = POLLIN;
987 pfd[1].fd = mis->userfault_event_fd;
988 pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
989 trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
990 for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
991 struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
992 struct PostCopyFD, index);
993 pfd[2 + index].fd = pcfd->fd;
994 pfd[2 + index].events = POLLIN;
995 trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
996 pcfd->fd);
997 }
998
c4faeed2
DDAG
999 while (true) {
1000 ram_addr_t rb_offset;
00fa4fc8 1001 int poll_result;
c4faeed2
DDAG
1002
1003 /*
1004 * We're mainly waiting for the kernel to give us a faulting HVA,
1005 * however we can be told to quit via userfault_quit_fd which is
1006 * an eventfd
1007 */
00fa4fc8
DDAG
1008
1009 poll_result = poll(pfd, pfd_len, -1 /* Wait forever */);
1010 if (poll_result == -1) {
c4faeed2
DDAG
1011 error_report("%s: userfault poll: %s", __func__, strerror(errno));
1012 break;
1013 }
1014
3a7804c3
PX
1015 if (!mis->to_src_file) {
1016 /*
1017 * Possibly someone tells us that the return path is
1018 * broken already using the event. We should hold until
1019 * the channel is rebuilt.
1020 */
27dd21b4 1021 postcopy_pause_fault_thread(mis);
3a7804c3
PX
1022 }
1023
c4faeed2 1024 if (pfd[1].revents) {
64f615fe
PX
1025 uint64_t tmp64 = 0;
1026
1027 /* Consume the signal */
1028 if (read(mis->userfault_event_fd, &tmp64, 8) != 8) {
1029 /* Nothing obviously nicer than posting this error. */
1030 error_report("%s: read() failed", __func__);
1031 }
1032
d73415a3 1033 if (qatomic_read(&mis->fault_thread_quit)) {
64f615fe
PX
1034 trace_postcopy_ram_fault_thread_quit();
1035 break;
1036 }
c4faeed2
DDAG
1037 }
1038
00fa4fc8
DDAG
1039 if (pfd[0].revents) {
1040 poll_result--;
1041 ret = read(mis->userfault_fd, &msg, sizeof(msg));
1042 if (ret != sizeof(msg)) {
1043 if (errno == EAGAIN) {
1044 /*
1045 * if a wake up happens on the other thread just after
1046 * the poll, there is nothing to read.
1047 */
1048 continue;
1049 }
1050 if (ret < 0) {
1051 error_report("%s: Failed to read full userfault "
1052 "message: %s",
1053 __func__, strerror(errno));
1054 break;
1055 } else {
1056 error_report("%s: Read %d bytes from userfaultfd "
1057 "expected %zd",
1058 __func__, ret, sizeof(msg));
1059 break; /* Lost alignment, don't know what we'd read next */
1060 }
c4faeed2 1061 }
00fa4fc8
DDAG
1062 if (msg.event != UFFD_EVENT_PAGEFAULT) {
1063 error_report("%s: Read unexpected event %ud from userfaultfd",
1064 __func__, msg.event);
1065 continue; /* It's not a page fault, shouldn't happen */
c4faeed2 1066 }
c4faeed2 1067
00fa4fc8
DDAG
1068 rb = qemu_ram_block_from_host(
1069 (void *)(uintptr_t)msg.arg.pagefault.address,
1070 true, &rb_offset);
1071 if (!rb) {
1072 error_report("postcopy_ram_fault_thread: Fault outside guest: %"
1073 PRIx64, (uint64_t)msg.arg.pagefault.address);
1074 break;
1075 }
c4faeed2 1076
7648297d 1077 rb_offset = ROUND_DOWN(rb_offset, qemu_ram_pagesize(rb));
00fa4fc8 1078 trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
c4faeed2 1079 qemu_ram_get_idstr(rb),
575b0b33
AP
1080 rb_offset,
1081 msg.arg.pagefault.feat.ptid);
1082 mark_postcopy_blocktime_begin(
1083 (uintptr_t)(msg.arg.pagefault.address),
1084 msg.arg.pagefault.feat.ptid, rb);
1085
3a7804c3 1086retry:
00fa4fc8
DDAG
1087 /*
1088 * Send the request to the source - we want to request one
1089 * of our host page sizes (which is >= TPS)
1090 */
9470c5e0
DH
1091 ret = postcopy_request_page(mis, rb, rb_offset,
1092 msg.arg.pagefault.address);
3a7804c3
PX
1093 if (ret) {
1094 /* May be network failure, try to wait for recovery */
27dd21b4
PX
1095 postcopy_pause_fault_thread(mis);
1096 goto retry;
00fa4fc8
DDAG
1097 }
1098 }
c4faeed2 1099
00fa4fc8
DDAG
1100 /* Now handle any requests from external processes on shared memory */
1101 /* TODO: May need to handle devices deregistering during postcopy */
1102 for (index = 2; index < pfd_len && poll_result; index++) {
1103 if (pfd[index].revents) {
1104 struct PostCopyFD *pcfd =
1105 &g_array_index(mis->postcopy_remote_fds,
1106 struct PostCopyFD, index - 2);
1107
1108 poll_result--;
1109 if (pfd[index].revents & POLLERR) {
1110 error_report("%s: POLLERR on poll %zd fd=%d",
1111 __func__, index, pcfd->fd);
1112 pfd[index].events = 0;
1113 continue;
1114 }
1115
1116 ret = read(pcfd->fd, &msg, sizeof(msg));
1117 if (ret != sizeof(msg)) {
1118 if (errno == EAGAIN) {
1119 /*
1120 * if a wake up happens on the other thread just after
1121 * the poll, there is nothing to read.
1122 */
1123 continue;
1124 }
1125 if (ret < 0) {
1126 error_report("%s: Failed to read full userfault "
1127 "message: %s (shared) revents=%d",
1128 __func__, strerror(errno),
1129 pfd[index].revents);
1130 /*TODO: Could just disable this sharer */
1131 break;
1132 } else {
1133 error_report("%s: Read %d bytes from userfaultfd "
1134 "expected %zd (shared)",
1135 __func__, ret, sizeof(msg));
1136 /*TODO: Could just disable this sharer */
1137 break; /*Lost alignment,don't know what we'd read next*/
1138 }
1139 }
1140 if (msg.event != UFFD_EVENT_PAGEFAULT) {
1141 error_report("%s: Read unexpected event %ud "
1142 "from userfaultfd (shared)",
1143 __func__, msg.event);
1144 continue; /* It's not a page fault, shouldn't happen */
1145 }
1146 /* Call the device handler registered with us */
1147 ret = pcfd->handler(pcfd, &msg);
1148 if (ret) {
1149 error_report("%s: Failed to resolve shared fault on %zd/%s",
1150 __func__, index, pcfd->idstr);
1151 /* TODO: Fail? Disable this sharer? */
1152 }
1153 }
c4faeed2
DDAG
1154 }
1155 }
74637e6f 1156 rcu_unregister_thread();
c4faeed2 1157 trace_postcopy_ram_fault_thread_exit();
fc6008f3 1158 g_free(pfd);
f0a227ad
DDAG
1159 return NULL;
1160}
1161
476ebf77
PX
1162static int postcopy_temp_pages_setup(MigrationIncomingState *mis)
1163{
77dadc3f
PX
1164 PostcopyTmpPage *tmp_page;
1165 int err, i, channels;
1166 void *temp_page;
1167
36f62f11
PX
1168 if (migrate_postcopy_preempt()) {
1169 /* If preemption enabled, need extra channel for urgent requests */
1170 mis->postcopy_channels = RAM_CHANNEL_MAX;
1171 } else {
1172 /* Both precopy/postcopy on the same channel */
1173 mis->postcopy_channels = 1;
1174 }
77dadc3f
PX
1175
1176 channels = mis->postcopy_channels;
1177 mis->postcopy_tmp_pages = g_malloc0_n(sizeof(PostcopyTmpPage), channels);
1178
1179 for (i = 0; i < channels; i++) {
1180 tmp_page = &mis->postcopy_tmp_pages[i];
1181 temp_page = mmap(NULL, mis->largest_page_size, PROT_READ | PROT_WRITE,
1182 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
1183 if (temp_page == MAP_FAILED) {
1184 err = errno;
1185 error_report("%s: Failed to map postcopy_tmp_pages[%d]: %s",
1186 __func__, i, strerror(err));
1187 /* Clean up will be done later */
1188 return -err;
1189 }
1190 tmp_page->tmp_huge_page = temp_page;
1191 /* Initialize default states for each tmp page */
1192 postcopy_temp_page_reset(tmp_page);
476ebf77
PX
1193 }
1194
1195 /*
1196 * Map large zero page when kernel can't use UFFDIO_ZEROPAGE for hugepages
1197 */
1198 mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
1199 PROT_READ | PROT_WRITE,
1200 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
1201 if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
1202 err = errno;
1203 mis->postcopy_tmp_zero_page = NULL;
1204 error_report("%s: Failed to map large zero page %s",
1205 __func__, strerror(err));
1206 return -err;
1207 }
1208
1209 memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
1210
1211 return 0;
1212}
1213
2a7eb148 1214int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
f0a227ad 1215{
74c38cf7
PX
1216 Error *local_err = NULL;
1217
c4faeed2 1218 /* Open the fd for the kernel to give us userfaults */
d5890ea0 1219 mis->userfault_fd = uffd_open(O_CLOEXEC | O_NONBLOCK);
c4faeed2
DDAG
1220 if (mis->userfault_fd == -1) {
1221 error_report("%s: Failed to open userfault fd: %s", __func__,
1222 strerror(errno));
1223 return -1;
1224 }
1225
1226 /*
1227 * Although the host check already tested the API, we need to
1228 * do the check again as an ABI handshake on the new fd.
1229 */
74c38cf7
PX
1230 if (!ufd_check_and_apply(mis->userfault_fd, mis, &local_err)) {
1231 error_report_err(local_err);
c4faeed2
DDAG
1232 return -1;
1233 }
1234
1235 /* Now an eventfd we use to tell the fault-thread to quit */
64f615fe
PX
1236 mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC);
1237 if (mis->userfault_event_fd == -1) {
1238 error_report("%s: Opening userfault_event_fd: %s", __func__,
c4faeed2
DDAG
1239 strerror(errno));
1240 close(mis->userfault_fd);
1241 return -1;
1242 }
1243
36f62f11 1244 postcopy_thread_create(mis, &mis->fault_thread, "fault-default",
095c12a4 1245 postcopy_ram_fault_thread, QEMU_THREAD_JOINABLE);
c4faeed2 1246 mis->have_fault_thread = true;
f0a227ad
DDAG
1247
1248 /* Mark so that we get notified of accesses to unwritten areas */
fbd162e6 1249 if (foreach_not_ignored_block(ram_block_enable_notify, mis)) {
91b02dc7 1250 error_report("ram_block_enable_notify failed");
f0a227ad
DDAG
1251 return -1;
1252 }
1253
476ebf77
PX
1254 if (postcopy_temp_pages_setup(mis)) {
1255 /* Error dumped in the sub-function */
3414322a
WY
1256 return -1;
1257 }
1258
36f62f11
PX
1259 if (migrate_postcopy_preempt()) {
1260 /*
1261 * This thread needs to be created after the temp pages because
1262 * it'll fetch RAM_CHANNEL_POSTCOPY PostcopyTmpPage immediately.
1263 */
1264 postcopy_thread_create(mis, &mis->postcopy_prio_thread, "fault-fast",
1265 postcopy_preempt_thread, QEMU_THREAD_JOINABLE);
6621883f 1266 mis->preempt_thread_status = PREEMPT_THREAD_CREATED;
36f62f11
PX
1267 }
1268
c4faeed2
DDAG
1269 trace_postcopy_ram_enable_notify();
1270
f0a227ad
DDAG
1271 return 0;
1272}
1273
/*
 * Atomically place one page via the userfaultfd: UFFDIO_COPY when
 * @from_addr is non-NULL, UFFDIO_ZEROPAGE otherwise.  On success the
 * receive bitmap is updated, any pending request for the page is
 * resolved, and blocktime accounting for the address is finalized.
 *
 * Returns the ioctl() result: 0 on success, -1 on failure with errno
 * set by the kernel.
 */
static int qemu_ufd_copy_ioctl(MigrationIncomingState *mis, void *host_addr,
                               void *from_addr, uint64_t pagesize, RAMBlock *rb)
{
    int userfault_fd = mis->userfault_fd;
    int ret;

    if (from_addr) {
        struct uffdio_copy copy_struct;
        copy_struct.dst = (uint64_t)(uintptr_t)host_addr;
        copy_struct.src = (uint64_t)(uintptr_t)from_addr;
        copy_struct.len = pagesize;
        copy_struct.mode = 0;
        ret = ioctl(userfault_fd, UFFDIO_COPY, &copy_struct);
    } else {
        struct uffdio_zeropage zero_struct;
        zero_struct.range.start = (uint64_t)(uintptr_t)host_addr;
        zero_struct.range.len = pagesize;
        zero_struct.mode = 0;
        ret = ioctl(userfault_fd, UFFDIO_ZEROPAGE, &zero_struct);
    }
    if (!ret) {
        /* page_request_mutex protects both the recv bitmap and the tree */
        qemu_mutex_lock(&mis->page_request_mutex);
        ramblock_recv_bitmap_set_range(rb, host_addr,
                                       pagesize / qemu_target_page_size());
        /*
         * If this page resolves a page fault for a previous recorded faulted
         * address, take a special note to maintain the requested page list.
         */
        if (g_tree_lookup(mis->page_requested, host_addr)) {
            g_tree_remove(mis->page_requested, host_addr);
            int left_pages = qatomic_dec_fetch(&mis->page_requested_count);

            trace_postcopy_page_req_del(host_addr, mis->page_requested_count);
            /* Order the update of count and read of preempt status */
            smp_mb();
            if (mis->preempt_thread_status == PREEMPT_THREAD_QUIT &&
                left_pages == 0) {
                /*
                 * This probably means the main thread is waiting for us.
                 * Notify that we've finished receiving the last requested
                 * page.
                 */
                qemu_cond_signal(&mis->page_request_cond);
            }
        }
        qemu_mutex_unlock(&mis->page_request_mutex);
        /* Account the end of the blocked interval for blocktime stats */
        mark_postcopy_blocktime_end((uintptr_t)host_addr);
    }
    return ret;
}
1324
d488b349
DDAG
1325int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset)
1326{
1327 int i;
1328 MigrationIncomingState *mis = migration_incoming_get_current();
1329 GArray *pcrfds = mis->postcopy_remote_fds;
1330
1331 for (i = 0; i < pcrfds->len; i++) {
1332 struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
1333 int ret = cur->waker(cur, rb, offset);
1334 if (ret) {
1335 return ret;
1336 }
1337 }
1338 return 0;
1339}
1340
696ed9a9
DDAG
1341/*
1342 * Place a host page (from) at (host) atomically
1343 * returns 0 on success
1344 */
df9ff5e1 1345int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
8be4620b 1346 RAMBlock *rb)
696ed9a9 1347{
8be4620b 1348 size_t pagesize = qemu_ram_pagesize(rb);
696ed9a9 1349
696ed9a9
DDAG
1350 /* copy also acks to the kernel waking the stalled thread up
1351 * TODO: We can inhibit that ack and only do it if it was requested
1352 * which would be slightly cheaper, but we'd have to be careful
1353 * of the order of updating our page state.
1354 */
eef621c4 1355 if (qemu_ufd_copy_ioctl(mis, host, from, pagesize, rb)) {
696ed9a9 1356 int e = errno;
df9ff5e1
DDAG
1357 error_report("%s: %s copy host: %p from: %p (size: %zd)",
1358 __func__, strerror(e), host, from, pagesize);
696ed9a9
DDAG
1359
1360 return -e;
1361 }
1362
1363 trace_postcopy_place_page(host);
dedfb4b2
DDAG
1364 return postcopy_notify_shared_wake(rb,
1365 qemu_ram_block_host_offset(rb, host));
696ed9a9
DDAG
1366}
1367
1368/*
1369 * Place a zero page at (host) atomically
1370 * returns 0 on success
1371 */
df9ff5e1 1372int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
8be4620b 1373 RAMBlock *rb)
696ed9a9 1374{
2ce16640 1375 size_t pagesize = qemu_ram_pagesize(rb);
df9ff5e1 1376 trace_postcopy_place_page_zero(host);
696ed9a9 1377
2ce16640
DDAG
1378 /* Normal RAMBlocks can zero a page using UFFDIO_ZEROPAGE
1379 * but it's not available for everything (e.g. hugetlbpages)
1380 */
1381 if (qemu_ram_is_uf_zeroable(rb)) {
eef621c4 1382 if (qemu_ufd_copy_ioctl(mis, host, NULL, pagesize, rb)) {
df9ff5e1
DDAG
1383 int e = errno;
1384 error_report("%s: %s zero host: %p",
1385 __func__, strerror(e), host);
696ed9a9 1386
df9ff5e1
DDAG
1387 return -e;
1388 }
dedfb4b2
DDAG
1389 return postcopy_notify_shared_wake(rb,
1390 qemu_ram_block_host_offset(rb,
1391 host));
df9ff5e1 1392 } else {
6629890d 1393 return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page, rb);
696ed9a9 1394 }
696ed9a9
DDAG
1395}
1396
eb59db53
DDAG
1397#else
1398/* No target OS support, stubs just fail */
65ace060
AP
/* Stub: no OS support for postcopy, so there is no info to fill in */
void fill_destination_postcopy_migration_info(MigrationInfo *info)
{
}
1402
74c38cf7 1403bool postcopy_ram_supported_by_host(MigrationIncomingState *mis, Error **errp)
eb59db53
DDAG
1404{
1405 error_report("%s: No OS support", __func__);
1406 return false;
1407}
1408
/* Stub: reject incoming postcopy at init time on hosts without support */
int postcopy_ram_incoming_init(MigrationIncomingState *mis)
{
    error_report("postcopy_ram_incoming_init: No OS support");
    return -1;
}
1414
/* Stub: unreachable, since postcopy_ram_supported_by_host() returns false */
int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}
1420
/* Stub: unreachable, since postcopy_ram_supported_by_host() returns false */
int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}
1426
/* Stub: unreachable, since postcopy_ram_supported_by_host() returns false */
int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    assert(0);
    return -1;
}
1433
/* Stub: unreachable, since postcopy_ram_supported_by_host() returns false */
int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}
696ed9a9 1439
/* Stub: unreachable, since postcopy_ram_supported_by_host() returns false */
int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    assert(0);
    return -1;
}
1446
/* Stub: unreachable, since postcopy_ram_supported_by_host() returns false */
int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    assert(0);
    return -1;
}
1453
/* Stub: unreachable, since postcopy_ram_supported_by_host() returns false */
int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    assert(0);
    return -1;
}
eb59db53
DDAG
1461#endif
1462
e0b266f0 1463/* ------------------------------------------------------------------------- */
77dadc3f
PX
1464void postcopy_temp_page_reset(PostcopyTmpPage *tmp_page)
1465{
1466 tmp_page->target_pages = 0;
1467 tmp_page->host_addr = NULL;
1468 /*
1469 * This is set to true when reset, and cleared as long as we received any
1470 * of the non-zero small page within this huge page.
1471 */
1472 tmp_page->all_zero = true;
1473}
e0b266f0 1474
9ab7ef9b
PX
1475void postcopy_fault_thread_notify(MigrationIncomingState *mis)
1476{
1477 uint64_t tmp64 = 1;
1478
1479 /*
1480 * Wakeup the fault_thread. It's an eventfd that should currently
1481 * be at 0, we're going to increment it to 1
1482 */
1483 if (write(mis->userfault_event_fd, &tmp64, 8) != 8) {
1484 /* Not much we can do here, but may as well report it */
1485 error_report("%s: incrementing failed: %s", __func__,
1486 strerror(errno));
1487 }
1488}
1489
/**
 * postcopy_discard_send_init: Called at the start of each RAMBlock before
 * asking to discard individual ranges.
 *
 * @ms: The current migration state.
 * @name: RAMBlock that discards will operate on.
 */
/* Single file-scope discard state; re-initialized here per RAMBlock */
static PostcopyDiscardState pds = {0};
void postcopy_discard_send_init(MigrationState *ms, const char *name)
{
    pds.ramblock_name = name;
    pds.cur_entry = 0;
    pds.nsentwords = 0;
    pds.nsentcmds = 0;
}
1506
1507/**
1508 * postcopy_discard_send_range: Called by the bitmap code for each chunk to
1509 * discard. May send a discard message, may just leave it queued to
1510 * be sent later.
1511 *
1512 * @ms: Current migration state.
e0b266f0
DDAG
1513 * @start,@length: a range of pages in the migration bitmap in the
1514 * RAM block passed to postcopy_discard_send_init() (length=1 is one page)
1515 */
810cf2bb
WY
1516void postcopy_discard_send_range(MigrationState *ms, unsigned long start,
1517 unsigned long length)
e0b266f0 1518{
20afaed9 1519 size_t tp_size = qemu_target_page_size();
e0b266f0 1520 /* Convert to byte offsets within the RAM block */
810cf2bb
WY
1521 pds.start_list[pds.cur_entry] = start * tp_size;
1522 pds.length_list[pds.cur_entry] = length * tp_size;
1523 trace_postcopy_discard_send_range(pds.ramblock_name, start, length);
1524 pds.cur_entry++;
1525 pds.nsentwords++;
e0b266f0 1526
810cf2bb 1527 if (pds.cur_entry == MAX_DISCARDS_PER_COMMAND) {
e0b266f0 1528 /* Full set, ship it! */
89a02a9f 1529 qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
810cf2bb
WY
1530 pds.ramblock_name,
1531 pds.cur_entry,
1532 pds.start_list,
1533 pds.length_list);
1534 pds.nsentcmds++;
1535 pds.cur_entry = 0;
e0b266f0
DDAG
1536 }
1537}
1538
/**
 * postcopy_discard_send_finish: Called at the end of each RAMBlock by the
 * bitmap code.  Sends any outstanding discard messages for the block.
 * (Note: pds is file-scope static state, nothing is freed here.)
 *
 * @ms: Current migration state.
 */
void postcopy_discard_send_finish(MigrationState *ms)
{
    /* Anything unsent? */
    if (pds.cur_entry) {
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds.ramblock_name,
                                              pds.cur_entry,
                                              pds.start_list,
                                              pds.length_list);
        pds.nsentcmds++;
    }

    trace_postcopy_discard_send_finish(pds.ramblock_name, pds.nsentwords,
                                       pds.nsentcmds);
}
bac3b212
JQ
1560
/*
 * Current state of incoming postcopy; note this is not part of
 * MigrationIncomingState since its state is used during cleanup
 * at the end as MIS is being freed.
 */
static PostcopyState incoming_postcopy_state;

PostcopyState postcopy_state_get(void)
{
    /* Acquire-load: reads after this see updates made before the set */
    return qatomic_load_acquire(&incoming_postcopy_state);
}
1572
/* Set the state and return the old state (atomic exchange, so safe
 * against concurrent readers/writers) */
PostcopyState postcopy_state_set(PostcopyState new_state)
{
    return qatomic_xchg(&incoming_postcopy_state, new_state);
}
00fa4fc8
DDAG
1578
1579/* Register a handler for external shared memory postcopy
1580 * called on the destination.
1581 */
1582void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
1583{
1584 MigrationIncomingState *mis = migration_incoming_get_current();
1585
1586 mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
1587 *pcfd);
1588}
1589
1590/* Unregister a handler for external shared memory postcopy
1591 */
1592void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
1593{
1594 guint i;
1595 MigrationIncomingState *mis = migration_incoming_get_current();
1596 GArray *pcrfds = mis->postcopy_remote_fds;
1597
56559980
JQ
1598 if (!pcrfds) {
1599 /* migration has already finished and freed the array */
1600 return;
1601 }
00fa4fc8
DDAG
1602 for (i = 0; i < pcrfds->len; i++) {
1603 struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
1604 if (cur->fd == pcfd->fd) {
1605 mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
1606 return;
1607 }
1608 }
1609}
36f62f11 1610
/*
 * Install the newly-accepted preempt channel QEMUFile on the destination
 * and wake up the preempt thread waiting on postcopy_qemufile_dst_done.
 */
void postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file)
{
    /*
     * The new loading channel has its own threads, so it needs to be
     * blocked too. It's by default true, just be explicit.
     */
    qemu_file_set_blocking(file, true);
    mis->postcopy_qemufile_dst = file;
    qemu_sem_post(&mis->postcopy_qemufile_dst_done);
    trace_postcopy_preempt_new_channel();
}
1622
/*
 * Setup the postcopy preempt channel with the IOC. If ERROR is specified,
 * setup the error instead. This helper will free the ERROR if specified.
 */
static void
postcopy_preempt_send_channel_done(MigrationState *s,
                                   QIOChannel *ioc, Error *local_err)
{
    if (local_err) {
        /* Record the failure; ownership of local_err ends here */
        migrate_set_error(s, local_err);
        error_free(local_err);
    } else {
        migration_ioc_register_yank(ioc);
        s->postcopy_qemufile_src = qemu_file_new_output(ioc);
        trace_postcopy_preempt_new_channel();
    }

    /*
     * Kick the waiter in all cases. The waiter should check upon
     * postcopy_qemufile_src to know whether it failed or not.
     */
    qemu_sem_post(&s->postcopy_qemufile_src_sem);
}
1646
1647static void
1648postcopy_preempt_tls_handshake(QIOTask *task, gpointer opaque)
1649{
1650 g_autoptr(QIOChannel) ioc = QIO_CHANNEL(qio_task_get_source(task));
1651 MigrationState *s = opaque;
1652 Error *local_err = NULL;
1653
1654 qio_task_propagate_error(task, &local_err);
1655 postcopy_preempt_send_channel_done(s, ioc, local_err);
1656}
1657
/*
 * Completion callback for the async connect of the preempt channel.
 * On success, either kicks off a TLS handshake (when the channel needs
 * a TLS upgrade) or completes setup directly; any failure falls through
 * to postcopy_preempt_send_channel_done() with the error.
 */
static void
postcopy_preempt_send_channel_new(QIOTask *task, gpointer opaque)
{
    g_autoptr(QIOChannel) ioc = QIO_CHANNEL(qio_task_get_source(task));
    MigrationState *s = opaque;
    QIOChannelTLS *tioc;
    Error *local_err = NULL;

    if (qio_task_propagate_error(task, &local_err)) {
        goto out;
    }

    if (migrate_channel_requires_tls_upgrade(ioc)) {
        tioc = migration_tls_client_create(ioc, s->hostname, &local_err);
        if (!tioc) {
            goto out;
        }
        trace_postcopy_preempt_tls_handshake();
        qio_channel_set_name(QIO_CHANNEL(tioc), "migration-tls-preempt");
        qio_channel_tls_handshake(tioc, postcopy_preempt_tls_handshake,
                                  s, NULL, NULL);
        /* Setup the channel until TLS handshake finished */
        return;
    }

out:
    /* This handles both good and error cases */
    postcopy_preempt_send_channel_done(s, ioc, local_err);
}
36f62f11 1687
/*
 * This function will kick off an async task to establish the preempt
 * channel, and wait until the connection setup completed.  Returns 0 if
 * channel established, -1 for error.
 */
int postcopy_preempt_establish_channel(MigrationState *s)
{
    /* If preempt not enabled, no need to wait */
    if (!migrate_postcopy_preempt()) {
        return 0;
    }

    /*
     * Kick off async task to establish preempt channel.  Only do so with
     * 8.0+ machines, because 7.1/7.2 require the channel to be created in
     * setup phase of migration (even if racy in an unreliable network).
     */
    if (!s->preempt_pre_7_2) {
        postcopy_preempt_setup(s);
    }

    /*
     * We need the postcopy preempt channel to be established before
     * starting doing anything.
     */
    qemu_sem_wait(&s->postcopy_qemufile_src_sem);

    /*
     * The sem is posted by postcopy_preempt_send_channel_done() on both
     * success and failure; a NULL postcopy_qemufile_src means failure.
     */
    return s->postcopy_qemufile_src ? 0 : -1;
}
1717
/* Fire off the asynchronous connect for the preempt channel */
void postcopy_preempt_setup(MigrationState *s)
{
    /* Kick an async task to connect */
    socket_send_channel_create(postcopy_preempt_send_channel_new, s);
}
1723
/*
 * Pause the fast-load (preempt) thread until migration is resumed.
 *
 * postcopy_prio_thread_mutex is dropped while sleeping on the pause
 * semaphore (so other threads can take it during recovery) and is
 * re-taken before returning, restoring the caller's locking state.
 */
static void postcopy_pause_ram_fast_load(MigrationIncomingState *mis)
{
    trace_postcopy_pause_fast_load();
    qemu_mutex_unlock(&mis->postcopy_prio_thread_mutex);
    qemu_sem_wait(&mis->postcopy_pause_sem_fast_load);
    qemu_mutex_lock(&mis->postcopy_prio_thread_mutex);
    trace_postcopy_pause_fast_load_continued();
}
1732
6621883f
PX
1733static bool preempt_thread_should_run(MigrationIncomingState *mis)
1734{
1735 return mis->preempt_thread_status != PREEMPT_THREAD_QUIT;
1736}
1737
/*
 * Thread entry for the destination's dedicated preempt channel: loads
 * urgent postcopy pages from postcopy_qemufile_dst until told to quit,
 * pausing for recovery whenever the load fails while still running.
 */
void *postcopy_preempt_thread(void *opaque)
{
    MigrationIncomingState *mis = opaque;
    int ret;

    trace_postcopy_preempt_thread_entry();

    rcu_register_thread();

    qemu_sem_post(&mis->thread_sync_sem);

    /*
     * The preempt channel is established in asynchronous way.  Wait
     * for its completion.
     */
    qemu_sem_wait(&mis->postcopy_qemufile_dst_done);

    /* Sending RAM_SAVE_FLAG_EOS to terminate this thread */
    qemu_mutex_lock(&mis->postcopy_prio_thread_mutex);
    while (preempt_thread_should_run(mis)) {
        ret = ram_load_postcopy(mis->postcopy_qemufile_dst,
                                RAM_CHANNEL_POSTCOPY);
        /* If error happened, go into recovery routine */
        if (ret && preempt_thread_should_run(mis)) {
            postcopy_pause_ram_fast_load(mis);
        } else {
            /* We're done */
            break;
        }
    }
    qemu_mutex_unlock(&mis->postcopy_prio_thread_mutex);

    rcu_unregister_thread();

    trace_postcopy_preempt_thread_exit();

    return NULL;
}