migration/postcopy-ram.c
1 /*
2 * Postcopy migration for RAM
3 *
4 * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates
5 *
6 * Authors:
7 * Dave Gilbert <dgilbert@redhat.com>
8 *
9 * This work is licensed under the terms of the GNU GPL, version 2 or later.
10 * See the COPYING file in the top-level directory.
11 *
12 */
13
14 /*
15 * Postcopy is a migration technique where the execution flips from the
16 * source to the destination before all the data has been copied.
17 */
18
19 #include "qemu/osdep.h"
20 #include "qemu/madvise.h"
21 #include "exec/target_page.h"
22 #include "migration.h"
23 #include "qemu-file.h"
24 #include "savevm.h"
25 #include "postcopy-ram.h"
26 #include "ram.h"
27 #include "qapi/error.h"
28 #include "qemu/notify.h"
29 #include "qemu/rcu.h"
30 #include "sysemu/sysemu.h"
31 #include "qemu/error-report.h"
32 #include "trace.h"
33 #include "hw/boards.h"
34 #include "exec/ramblock.h"
35 #include "socket.h"
36 #include "yank_functions.h"
37 #include "tls.h"
38 #include "qemu/userfaultfd.h"
39 #include "qemu/mmap-alloc.h"
40 #include "options.h"
41
42 /* Arbitrary limit on size of each discard command,
43 * keeps them around ~200 bytes
44 */
45 #define MAX_DISCARDS_PER_COMMAND 12
46
47 struct PostcopyDiscardState {
48 const char *ramblock_name;
49 uint16_t cur_entry;
50 /*
51 * Start and length of a discard range (bytes)
52 */
53 uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
54 uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
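/* Statistics: number of ranges queued and of discard commands actually sent */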
55 unsigned int nsentwords;
56 unsigned int nsentcmds;
57 };
58
59 static NotifierWithReturnList postcopy_notifier_list;
60
61 void postcopy_infrastructure_init(void)
62 {
63 notifier_with_return_list_init(&postcopy_notifier_list);
64 }
65
66 void postcopy_add_notifier(NotifierWithReturn *nn)
67 {
68 notifier_with_return_list_add(&postcopy_notifier_list, nn);
69 }
70
71 void postcopy_remove_notifier(NotifierWithReturn *n)
72 {
73 notifier_with_return_remove(n);
74 }
75
76 int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp)
77 {
78 struct PostcopyNotifyData pnd;
79 pnd.reason = reason;
80 pnd.errp = errp;
81
82 return notifier_with_return_list_notify(&postcopy_notifier_list,
83 &pnd);
84 }
85
86 /*
87 * NOTE: this routine is not thread safe; we can't call it concurrently. But it
88 * should be good enough for migration's purposes.
89 */
90 void postcopy_thread_create(MigrationIncomingState *mis,
91 QemuThread *thread, const char *name,
92 void *(*fn)(void *), int joinable)
93 {
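/*
 * Create the thread, then wait for it to post thread_sync_sem so the
 * caller knows the new thread has completed its early setup before
 * this function returns.
 */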
94 qemu_sem_init(&mis->thread_sync_sem, 0);
95 qemu_thread_create(thread, name, fn, mis, joinable);
96 qemu_sem_wait(&mis->thread_sync_sem);
97 qemu_sem_destroy(&mis->thread_sync_sem);
98 }
99
100 /* Postcopy needs to detect accesses to pages that haven't yet been copied
101 * across, and to map new pages in efficiently; the techniques for doing
102 * this are target-OS specific.
103 */
104 #if defined(__linux__)
105
106 #include <poll.h>
107 #include <sys/ioctl.h>
108 #include <sys/syscall.h>
109 #include <asm/types.h> /* for __u64 */
110 #endif
111
112 #if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
113 #include <sys/eventfd.h>
114 #include <linux/userfaultfd.h>
115
116 typedef struct PostcopyBlocktimeContext {
117 /* time when page fault initiated per vCPU */
118 uint32_t *page_fault_vcpu_time;
119 /* page address per vCPU */
120 uintptr_t *vcpu_addr;
121 uint32_t total_blocktime;
122 /* blocktime per vCPU */
123 uint32_t *vcpu_blocktime;
124 /* point in time when last page fault was initiated */
125 uint32_t last_begin;
126 /* number of vCPUs currently suspended */
127 int smp_cpus_down;
128 uint64_t start_time;
129
130 /*
131 * Handler for the exit event, needed to release
132 * the whole blocktime_ctx.
133 */
134 Notifier exit_notifier;
135 } PostcopyBlocktimeContext;
136
137 static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
138 {
139 g_free(ctx->page_fault_vcpu_time);
140 g_free(ctx->vcpu_addr);
141 g_free(ctx->vcpu_blocktime);
142 g_free(ctx);
143 }
144
145 static void migration_exit_cb(Notifier *n, void *data)
146 {
147 PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext,
148 exit_notifier);
149 destroy_blocktime_context(ctx);
150 }
151
152 static struct PostcopyBlocktimeContext *blocktime_context_new(void)
153 {
154 MachineState *ms = MACHINE(qdev_get_machine());
155 unsigned int smp_cpus = ms->smp.cpus;
156 PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
157 ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus);
158 ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
159 ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus);
160
161 ctx->exit_notifier.notify = migration_exit_cb;
162 ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
163 qemu_add_exit_notifier(&ctx->exit_notifier);
164 return ctx;
165 }
166
167 static uint32List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
168 {
169 MachineState *ms = MACHINE(qdev_get_machine());
170 uint32List *list = NULL;
171 int i;
172
173 for (i = ms->smp.cpus - 1; i >= 0; i--) {
174 QAPI_LIST_PREPEND(list, ctx->vcpu_blocktime[i]);
175 }
176
177 return list;
178 }
179
180 /*
181 * This function populates MigrationInfo from postcopy's
182 * blocktime context. It does nothing unless the
183 * postcopy-blocktime capability was set.
184 *
185 * @info: pointer to MigrationInfo to populate
186 */
187 void fill_destination_postcopy_migration_info(MigrationInfo *info)
188 {
189 MigrationIncomingState *mis = migration_incoming_get_current();
190 PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
191
192 if (!bc) {
193 return;
194 }
195
196 info->has_postcopy_blocktime = true;
197 info->postcopy_blocktime = bc->total_blocktime;
198 info->has_postcopy_vcpu_blocktime = true;
199 info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc);
200 }
201
202 static uint32_t get_postcopy_total_blocktime(void)
203 {
204 MigrationIncomingState *mis = migration_incoming_get_current();
205 PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
206
207 if (!bc) {
208 return 0;
209 }
210
211 return bc->total_blocktime;
212 }
213
214 /**
215 * receive_ufd_features: check userfault fd features, to request only supported
216 * features in the future.
217 *
218 * Returns: true on success
219 *
220 * __NR_userfaultfd must have been checked for availability beforehand.
221 * @features: out parameter that will contain uffdio_api.features as provided
222 * by the kernel on success
223 */
224 static bool receive_ufd_features(uint64_t *features)
225 {
226 struct uffdio_api api_struct = {0};
227 int ufd;
228 bool ret = true;
229
230 ufd = uffd_open(O_CLOEXEC);
231 if (ufd == -1) {
232 error_report("%s: uffd_open() failed: %s", __func__, strerror(errno));
233 return false;
234 }
235
236 /* ask features */
237 api_struct.api = UFFD_API;
238 api_struct.features = 0;
239 if (ioctl(ufd, UFFDIO_API, &api_struct)) {
240 error_report("%s: UFFDIO_API failed: %s", __func__,
241 strerror(errno));
242 ret = false;
243 goto release_ufd;
244 }
245
246 *features = api_struct.features;
247
248 release_ufd:
249 close(ufd);
250 return ret;
251 }
252
253 /**
254 * request_ufd_features: this function should be called only once on a newly
255 * opened ufd; subsequent calls will lead to an error.
256 *
257 * Returns: true on success
258 *
259 * @ufd: fd obtained from userfaultfd syscall
260 * @features: bit mask see UFFD_API_FEATURES
261 */
262 static bool request_ufd_features(int ufd, uint64_t features)
263 {
264 struct uffdio_api api_struct = {0};
265 uint64_t ioctl_mask;
266
267 api_struct.api = UFFD_API;
268 api_struct.features = features;
269 if (ioctl(ufd, UFFDIO_API, &api_struct)) {
270 error_report("%s failed: UFFDIO_API failed: %s", __func__,
271 strerror(errno));
272 return false;
273 }
274
275 ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
276 (__u64)1 << _UFFDIO_UNREGISTER;
277 if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
278 error_report("Missing userfault features: %" PRIx64,
279 (uint64_t)(~api_struct.ioctls & ioctl_mask));
280 return false;
281 }
282
283 return true;
284 }
285
286 static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis,
287 Error **errp)
288 {
289 uint64_t asked_features = 0;
290 static uint64_t supported_features;
291
292 ERRP_GUARD();
293 /*
294 * It's not possible to request
295 * UFFD_API twice on the same fd;
296 * the userfault fd features are persistent.
297 */
298 if (!supported_features) {
299 if (!receive_ufd_features(&supported_features)) {
300 error_setg(errp, "Userfault feature detection failed");
301 return false;
302 }
303 }
304
305 #ifdef UFFD_FEATURE_THREAD_ID
306 if (UFFD_FEATURE_THREAD_ID & supported_features) {
307 asked_features |= UFFD_FEATURE_THREAD_ID;
308 if (migrate_postcopy_blocktime()) {
309 if (!mis->blocktime_ctx) {
310 mis->blocktime_ctx = blocktime_context_new();
311 }
312 }
313 }
314 #endif
315
316 /*
317 * Request features even if asked_features is 0, because the
318 * kernel expects UFFD_API before UFFDIO_REGISTER on each
319 * userfault file descriptor.
320 */
321 if (!request_ufd_features(ufd, asked_features)) {
322 error_setg(errp, "Failed features %" PRIu64, asked_features);
323 return false;
324 }
325
326 if (qemu_real_host_page_size() != ram_pagesize_summary()) {
327 bool have_hp = false;
328 /* We've got a huge page */
329 #ifdef UFFD_FEATURE_MISSING_HUGETLBFS
330 have_hp = supported_features & UFFD_FEATURE_MISSING_HUGETLBFS;
331 #endif
332 if (!have_hp) {
333 error_setg(errp,
334 "Userfault on this host does not support huge pages");
335 return false;
336 }
337 }
338 return true;
339 }
340
341 /* Callback from postcopy_ram_supported_by_host block iterator.
342 */
343 static int test_ramblock_postcopiable(RAMBlock *rb, Error **errp)
344 {
345 const char *block_name = qemu_ram_get_idstr(rb);
346 ram_addr_t length = qemu_ram_get_used_length(rb);
347 size_t pagesize = qemu_ram_pagesize(rb);
348 QemuFsType fs;
349
350 if (length % pagesize) {
351 error_setg(errp,
352 "Postcopy requires RAM blocks to be a page size multiple,"
353 " block %s is 0x" RAM_ADDR_FMT " bytes with a "
354 "page size of 0x%zx", block_name, length, pagesize);
355 return 1;
356 }
357
358 if (rb->fd >= 0) {
359 fs = qemu_fd_getfs(rb->fd);
360 if (fs != QEMU_FS_TYPE_TMPFS && fs != QEMU_FS_TYPE_HUGETLBFS) {
361 error_setg(errp,
362 "Host backend files need to be TMPFS or HUGETLBFS only");
363 return 1;
364 }
365 }
366
367 return 0;
368 }
369
370 /*
371 * Note: This has the side effect of munlock'ing all of RAM; that's
372 * normally fine since, if the postcopy succeeds, mlock gets turned back
373 * on at the end.
374 */
375 bool postcopy_ram_supported_by_host(MigrationIncomingState *mis, Error **errp)
376 {
377 long pagesize = qemu_real_host_page_size();
378 int ufd = -1;
379 bool ret = false; /* Error unless we change it */
380 void *testarea = NULL;
381 struct uffdio_register reg_struct;
382 struct uffdio_range range_struct;
383 uint64_t feature_mask;
384 RAMBlock *block;
385
386 ERRP_GUARD();
387 if (qemu_target_page_size() > pagesize) {
388 error_setg(errp, "Target page size bigger than host page size");
389 goto out;
390 }
391
392 ufd = uffd_open(O_CLOEXEC);
393 if (ufd == -1) {
394 error_setg(errp, "Userfaultfd not available: %s", strerror(errno));
395 goto out;
396 }
397
398 /* Give devices a chance to object */
399 if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, errp)) {
400 goto out;
401 }
402
403 /* Version and features check */
404 if (!ufd_check_and_apply(ufd, mis, errp)) {
405 goto out;
406 }
407
408 /*
409 * We don't support postcopy with some types of ramblocks.
410 *
411 * NOTE: we explicitly ignore migrate_ram_is_ignored() and instead check
412 * all possible ramblocks. This is because this function can be called
413 * when creating the migration object, a phase during which RAM_MIGRATABLE
414 * is not even properly set for all the ramblocks.
415 *
416 * A side effect of this is that we'll also check RAM_SHARED
417 * ramblocks even if migrate_ignore_shared() is set (in which case
418 * we'll never migrate RAM_SHARED at all), but normally this shouldn't
419 * matter in practice, or we can revisit.
420 */
421 RAMBLOCK_FOREACH(block) {
422 if (test_ramblock_postcopiable(block, errp)) {
423 goto out;
424 }
425 }
426
427 /*
428 * userfault and mlock don't go together; we'll put it back later if
429 * it was enabled.
430 */
431 if (munlockall()) {
432 error_setg(errp, "munlockall() failed: %s", strerror(errno));
433 goto out;
434 }
435
436 /*
437 * We need to check that the ops we need are supported on anon memory.
438 * To do that we register a chunk and see the flags that
439 * are returned.
440 */
441 testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
442 MAP_ANONYMOUS, -1, 0);
443 if (testarea == MAP_FAILED) {
444 error_setg(errp, "Failed to map test area: %s", strerror(errno));
445 goto out;
446 }
447 g_assert(QEMU_PTR_IS_ALIGNED(testarea, pagesize));
448
449 reg_struct.range.start = (uintptr_t)testarea;
450 reg_struct.range.len = pagesize;
451 reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
452
453 if (ioctl(ufd, UFFDIO_REGISTER, &reg_struct)) {
454 error_setg(errp, "UFFDIO_REGISTER failed: %s", strerror(errno));
455 goto out;
456 }
457
458 range_struct.start = (uintptr_t)testarea;
459 range_struct.len = pagesize;
460 if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
461 error_setg(errp, "UFFDIO_UNREGISTER failed: %s", strerror(errno));
462 goto out;
463 }
464
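/*
 * Check that the ioctls we will need for page placement (wake, copy,
 * zeropage) are advertised for the registered test range.
 */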
465 feature_mask = (__u64)1 << _UFFDIO_WAKE |
466 (__u64)1 << _UFFDIO_COPY |
467 (__u64)1 << _UFFDIO_ZEROPAGE;
468 if ((reg_struct.ioctls & feature_mask) != feature_mask) {
469 error_setg(errp, "Missing userfault map features: %" PRIx64,
470 (uint64_t)(~reg_struct.ioctls & feature_mask));
471 goto out;
472 }
473
474 /* Success! */
475 ret = true;
476 out:
477 if (testarea) {
478 munmap(testarea, pagesize);
479 }
480 if (ufd != -1) {
481 close(ufd);
482 }
483 return ret;
484 }
485
486 /*
487 * Set up an area of RAM so that it *can* be used for postcopy later; this
488 * must be done right at the start prior to pre-copy.
489 * opaque is unused.
490 */
491 static int init_range(RAMBlock *rb, void *opaque)
492 {
493 const char *block_name = qemu_ram_get_idstr(rb);
494 void *host_addr = qemu_ram_get_host_addr(rb);
495 ram_addr_t offset = qemu_ram_get_offset(rb);
496 ram_addr_t length = qemu_ram_get_used_length(rb);
497 trace_postcopy_init_range(block_name, host_addr, offset, length);
498
499 /*
500 * Save the used_length before running the guest. In case we have to
501 * resize RAM blocks when syncing RAM block sizes from the source during
502 * precopy, we'll update it manually via the ram block notifier.
503 */
504 rb->postcopy_length = length;
505
506 /*
507 * We need the whole of RAM to be truly empty for postcopy, so things
508 * like ROMs and any data tables built during init must be zero'd
509 * - we're going to get the copy from the source anyway.
510 * (Precopy will just overwrite this data, so doesn't need the discard)
511 */
512 if (ram_discard_range(block_name, 0, length)) {
513 return -1;
514 }
515
516 return 0;
517 }
518
519 /*
520 * At the end of migration, undo the effects of init_range
521 * opaque should be the MIS.
522 */
523 static int cleanup_range(RAMBlock *rb, void *opaque)
524 {
525 const char *block_name = qemu_ram_get_idstr(rb);
526 void *host_addr = qemu_ram_get_host_addr(rb);
527 ram_addr_t offset = qemu_ram_get_offset(rb);
528 ram_addr_t length = rb->postcopy_length;
529 MigrationIncomingState *mis = opaque;
530 struct uffdio_range range_struct;
531 trace_postcopy_cleanup_range(block_name, host_addr, offset, length);
532
533 /*
534 * We turned off hugepages for the precopy stage with postcopy enabled;
535 * we can turn them back on now.
536 */
537 qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);
538
539 /*
540 * We can also turn off userfault now since we should have all the
541 * pages. It can be useful to leave it on to debug postcopy
542 * if you're not sure it's always getting every page.
543 */
544 range_struct.start = (uintptr_t)host_addr;
545 range_struct.len = length;
546
547 if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
548 error_report("%s: userfault unregister %s", __func__, strerror(errno));
549
550 return -1;
551 }
552
553 return 0;
554 }
555
556 /*
557 * Initialise postcopy-ram, setting the RAM to a state where we can go into
558 * postcopy later; must be called prior to any precopy.
559 * called from arch_init's similarly named ram_postcopy_incoming_init
560 */
561 int postcopy_ram_incoming_init(MigrationIncomingState *mis)
562 {
563 if (foreach_not_ignored_block(init_range, NULL)) {
564 return -1;
565 }
566
567 return 0;
568 }
569
570 static void postcopy_temp_pages_cleanup(MigrationIncomingState *mis)
571 {
572 int i;
573
574 if (mis->postcopy_tmp_pages) {
575 for (i = 0; i < mis->postcopy_channels; i++) {
576 if (mis->postcopy_tmp_pages[i].tmp_huge_page) {
577 munmap(mis->postcopy_tmp_pages[i].tmp_huge_page,
578 mis->largest_page_size);
579 mis->postcopy_tmp_pages[i].tmp_huge_page = NULL;
580 }
581 }
582 g_free(mis->postcopy_tmp_pages);
583 mis->postcopy_tmp_pages = NULL;
584 }
585
586 if (mis->postcopy_tmp_zero_page) {
587 munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
588 mis->postcopy_tmp_zero_page = NULL;
589 }
590 }
591
592 /*
593 * At the end of a migration where postcopy_ram_incoming_init was called.
594 */
595 int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
596 {
597 trace_postcopy_ram_incoming_cleanup_entry();
598
599 if (mis->preempt_thread_status == PREEMPT_THREAD_CREATED) {
600 /* Notify the fast load thread to quit */
601 mis->preempt_thread_status = PREEMPT_THREAD_QUIT;
602 /*
603 * Update preempt_thread_status before reading count. Note: the mutex
604 * lock only provides ACQUIRE semantics, and it doesn't stop this
605 * write from being reordered after the read of the count.
606 */
607 smp_mb();
608 /*
609 * It's possible that the preempt thread is still handling the last
610 * pages to arrive which were requested by guest page faults.
611 * Make sure nothing is left behind by waiting on the condvar if
612 * that unlikely case happens.
613 */
614 WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
615 if (qatomic_read(&mis->page_requested_count)) {
616 /*
617 * We are guaranteed to receive a signal later, because
618 * count > 0 now, so it is destined to be decreased to zero
619 * very soon by the preempt thread.
620 */
621 qemu_cond_wait(&mis->page_request_cond,
622 &mis->page_request_mutex);
623 }
624 }
625 /* Kick the fast load thread out of any blocking read so it notices the quit request */
626 if (mis->postcopy_qemufile_dst) {
627 qemu_file_shutdown(mis->postcopy_qemufile_dst);
628 }
629 qemu_thread_join(&mis->postcopy_prio_thread);
630 mis->preempt_thread_status = PREEMPT_THREAD_NONE;
631 }
632
633 if (mis->have_fault_thread) {
634 Error *local_err = NULL;
635
636 /* Let the fault thread quit */
637 qatomic_set(&mis->fault_thread_quit, 1);
638 postcopy_fault_thread_notify(mis);
639 trace_postcopy_ram_incoming_cleanup_join();
640 qemu_thread_join(&mis->fault_thread);
641
642 if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
643 error_report_err(local_err);
644 return -1;
645 }
646
647 if (foreach_not_ignored_block(cleanup_range, mis)) {
648 return -1;
649 }
650
651 trace_postcopy_ram_incoming_cleanup_closeuf();
652 close(mis->userfault_fd);
653 close(mis->userfault_event_fd);
654 mis->have_fault_thread = false;
655 }
656
657 if (enable_mlock) {
658 if (os_mlock() < 0) {
659 error_report("mlock: %s", strerror(errno));
660 /*
661 * It doesn't feel right to fail at this point, we have a valid
662 * VM state.
663 */
664 }
665 }
666
667 postcopy_temp_pages_cleanup(mis);
668
669 trace_postcopy_ram_incoming_cleanup_blocktime(
670 get_postcopy_total_blocktime());
671
672 trace_postcopy_ram_incoming_cleanup_exit();
673 return 0;
674 }
675
676 /*
677 * Disable huge pages on an area
678 */
679 static int nhp_range(RAMBlock *rb, void *opaque)
680 {
681 const char *block_name = qemu_ram_get_idstr(rb);
682 void *host_addr = qemu_ram_get_host_addr(rb);
683 ram_addr_t offset = qemu_ram_get_offset(rb);
684 ram_addr_t length = rb->postcopy_length;
685 trace_postcopy_nhp_range(block_name, host_addr, offset, length);
686
687 /*
688 * Before we do discards we need to ensure those discards really
689 * do delete areas of the page, even if THP thinks a hugepage would
690 * be a good idea, so force hugepages off.
691 */
692 qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);
693
694 return 0;
695 }
696
697 /*
698 * Userfault requires us to mark RAM as NOHUGEPAGE prior to discard;
699 * however, leaving it until after precopy means that most of the precopy
700 * data is still THP'd.
701 */
702 int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
703 {
704 if (foreach_not_ignored_block(nhp_range, mis)) {
705 return -1;
706 }
707
708 postcopy_state_set(POSTCOPY_INCOMING_DISCARD);
709
710 return 0;
711 }
712
713 /*
714 * Mark the given RAMBlock as requiring notification on faults for
715 * not-yet-written areas. Used as a callback on foreach_not_ignored_block.
716 * rb: the RAMBlock whose used range is to be registered
717 * opaque: MigrationIncomingState pointer
720 * Returns 0 on success
721 */
722 static int ram_block_enable_notify(RAMBlock *rb, void *opaque)
723 {
724 MigrationIncomingState *mis = opaque;
725 struct uffdio_register reg_struct;
726
727 reg_struct.range.start = (uintptr_t)qemu_ram_get_host_addr(rb);
728 reg_struct.range.len = rb->postcopy_length;
729 reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
730
731 /* Now tell our userfault_fd that it's responsible for this area */
732 if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, &reg_struct)) {
733 error_report("%s userfault register: %s", __func__, strerror(errno));
734 return -1;
735 }
736 if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
737 error_report("%s userfault: Region doesn't support COPY", __func__);
738 return -1;
739 }
740 if (reg_struct.ioctls & ((__u64)1 << _UFFDIO_ZEROPAGE)) {
741 qemu_ram_set_uf_zeroable(rb);
742 }
743
744 return 0;
745 }
746
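/*
 * Wake any thread blocked on a fault at client_addr within a shared
 * RAMBlock, by issuing UFFDIO_WAKE on the external userfaultfd for the
 * page containing that address.
 */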
747 int postcopy_wake_shared(struct PostCopyFD *pcfd,
748 uint64_t client_addr,
749 RAMBlock *rb)
750 {
751 size_t pagesize = qemu_ram_pagesize(rb);
752 struct uffdio_range range;
753 int ret;
754 trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
755 range.start = ROUND_DOWN(client_addr, pagesize);
756 range.len = pagesize;
757 ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range);
758 if (ret) {
759 error_report("%s: Failed to wake: %zx in %s (%s)",
760 __func__, (size_t)client_addr, qemu_ram_get_idstr(rb),
761 strerror(errno));
762 }
763 return ret;
764 }
765
766 static int postcopy_request_page(MigrationIncomingState *mis, RAMBlock *rb,
767 ram_addr_t start, uint64_t haddr)
768 {
769 void *aligned = (void *)(uintptr_t)ROUND_DOWN(haddr, qemu_ram_pagesize(rb));
770
771 /*
772 * Discarded pages (via RamDiscardManager) are never migrated. On unlikely
773 * access, place a zeropage, which will also set the relevant bits in the
774 * recv_bitmap accordingly, so we won't try placing a zeropage twice.
775 *
776 * Checking a single bit is sufficient to handle pagesize > TPS as either
777 * all relevant bits are set or not.
778 */
779 assert(QEMU_IS_ALIGNED(start, qemu_ram_pagesize(rb)));
780 if (ramblock_page_is_discarded(rb, start)) {
781 bool received = ramblock_recv_bitmap_test_byte_offset(rb, start);
782
783 return received ? 0 : postcopy_place_page_zero(mis, aligned, rb);
784 }
785
786 return migrate_send_rp_req_pages(mis, rb, start, haddr);
787 }
788
789 /*
790 * Callback from shared fault handlers to ask for a page,
791 * the page must be specified by a RAMBlock and an offset in that rb
792 * Note: Only for use by shared fault handlers (in fault thread)
793 */
794 int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
795 uint64_t client_addr, uint64_t rb_offset)
796 {
797 uint64_t aligned_rbo = ROUND_DOWN(rb_offset, qemu_ram_pagesize(rb));
798 MigrationIncomingState *mis = migration_incoming_get_current();
799
800 trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
801 rb_offset);
802 if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
803 trace_postcopy_request_shared_page_present(pcfd->idstr,
804 qemu_ram_get_idstr(rb), rb_offset);
805 return postcopy_wake_shared(pcfd, client_addr, rb);
806 }
807 postcopy_request_page(mis, rb, aligned_rbo, client_addr);
808 return 0;
809 }
810
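/*
 * Map the faulting thread id reported by userfaultfd to a vCPU index;
 * returns -1 if the fault did not come from a vCPU thread.
 */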
811 static int get_mem_fault_cpu_index(uint32_t pid)
812 {
813 CPUState *cpu_iter;
814
815 CPU_FOREACH(cpu_iter) {
816 if (cpu_iter->thread_id == pid) {
817 trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid);
818 return cpu_iter->cpu_index;
819 }
820 }
821 trace_get_mem_fault_cpu_index(-1, pid);
822 return -1;
823 }
824
825 static uint32_t get_low_time_offset(PostcopyBlocktimeContext *dc)
826 {
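/*
 * Milliseconds elapsed since blocktime tracking started, truncated to
 * 32 bits; clamped to a minimum of 1 so that 0 can be used to mean
 * "no fault recorded".
 */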
827 int64_t start_time_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
828 dc->start_time;
829 return start_time_offset < 1 ? 1 : start_time_offset & UINT32_MAX;
830 }
831
832 /*
833 * This function is called when a page fault occurs. It
834 * tracks the vCPU blocking time.
835 *
836 * @addr: faulted host virtual address
837 * @ptid: faulted process thread id
838 * @rb: ramblock appropriate to addr
839 */
840 static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
841 RAMBlock *rb)
842 {
843 int cpu, already_received;
844 MigrationIncomingState *mis = migration_incoming_get_current();
845 PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
846 uint32_t low_time_offset;
847
848 if (!dc || ptid == 0) {
849 return;
850 }
851 cpu = get_mem_fault_cpu_index(ptid);
852 if (cpu < 0) {
853 return;
854 }
855
856 low_time_offset = get_low_time_offset(dc);
857 if (dc->vcpu_addr[cpu] == 0) {
858 qatomic_inc(&dc->smp_cpus_down);
859 }
860
861 qatomic_xchg(&dc->last_begin, low_time_offset);
862 qatomic_xchg(&dc->page_fault_vcpu_time[cpu], low_time_offset);
863 qatomic_xchg(&dc->vcpu_addr[cpu], addr);
864
865 /*
866 * Check it here, not at the beginning of the function,
867 * because the check could otherwise happen earlier than the
868 * bitmap_set in qemu_ufd_copy_ioctl
869 */
870 already_received = ramblock_recv_bitmap_test(rb, (void *)addr);
871 if (already_received) {
872 qatomic_xchg(&dc->vcpu_addr[cpu], 0);
873 qatomic_xchg(&dc->page_fault_vcpu_time[cpu], 0);
874 qatomic_dec(&dc->smp_cpus_down);
875 }
876 trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],
877 cpu, already_received);
878 }
879
880 /*
881 * This function just provides the calculated blocktime per vCPU and traces it.
882 * Total blocktime is calculated in mark_postcopy_blocktime_end.
883 *
884 *
885 * Assume we have 3 CPUs
886 *
887 * S1 E1 S1 E1
888 * -----***********------------xxx***************------------------------> CPU1
889 *
890 * S2 E2
891 * ------------****************xxx---------------------------------------> CPU2
892 *
893 * S3 E3
894 * ------------------------****xxx********-------------------------------> CPU3
895 *
896 * We have the sequence S1,S2,E1,S3,S1,E2,E3,E1
897 * S2,E1 - doesn't match the condition, because the sequence S1,S2,E1 doesn't include CPU3
898 * S3,S1,E2 - the sequence includes all CPUs, so in this case the overlap S1,E2
899 * is a part of the total blocktime.
900 * S1 - here is last_begin
901 * The legend of the picture is as follows:
902 * * - means blocktime per vCPU
903 * x - means overlapped blocktime (total blocktime)
904 *
905 * @addr: host virtual address
906 */
907 static void mark_postcopy_blocktime_end(uintptr_t addr)
908 {
909 MigrationIncomingState *mis = migration_incoming_get_current();
910 PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
911 MachineState *ms = MACHINE(qdev_get_machine());
912 unsigned int smp_cpus = ms->smp.cpus;
913 int i, affected_cpu = 0;
914 bool vcpu_total_blocktime = false;
915 uint32_t read_vcpu_time, low_time_offset;
916
917 if (!dc) {
918 return;
919 }
920
921 low_time_offset = get_low_time_offset(dc);
922 /* Look up the vCPU in order to clear it.
923 * This algorithm looks straightforward, but it's not
924 * optimal; a more optimal algorithm would keep a tree or hash
925 * where the key is the address and the value is a list of vCPUs. */
926 for (i = 0; i < smp_cpus; i++) {
927 uint32_t vcpu_blocktime = 0;
928
929 read_vcpu_time = qatomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);
930 if (qatomic_fetch_add(&dc->vcpu_addr[i], 0) != addr ||
931 read_vcpu_time == 0) {
932 continue;
933 }
934 qatomic_xchg(&dc->vcpu_addr[i], 0);
935 vcpu_blocktime = low_time_offset - read_vcpu_time;
936 affected_cpu += 1;
937 /* We need to know whether this mark_postcopy_blocktime_end was due to a
938 * faulted page; another possible case is a prefetched
939 * page, and in that case we shouldn't be here. */
940 if (!vcpu_total_blocktime &&
941 qatomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) {
942 vcpu_total_blocktime = true;
943 }
944 /* continue the loop, because one page could affect several vCPUs */
945 dc->vcpu_blocktime[i] += vcpu_blocktime;
946 }
947
948 qatomic_sub(&dc->smp_cpus_down, affected_cpu);
949 if (vcpu_total_blocktime) {
950 dc->total_blocktime += low_time_offset - qatomic_fetch_add(
951 &dc->last_begin, 0);
952 }
953 trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime,
954 affected_cpu);
955 }
956
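/*
 * Park the fault thread until migration recovery posts
 * postcopy_pause_sem_fault to let it continue.
 */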
957 static void postcopy_pause_fault_thread(MigrationIncomingState *mis)
958 {
959 trace_postcopy_pause_fault_thread();
960 qemu_sem_wait(&mis->postcopy_pause_sem_fault);
961 trace_postcopy_pause_fault_thread_continued();
962 }
963
964 /*
965 * Handle faults detected by the USERFAULT markings
966 */
967 static void *postcopy_ram_fault_thread(void *opaque)
968 {
969 MigrationIncomingState *mis = opaque;
970 struct uffd_msg msg;
971 int ret;
972 size_t index;
973 RAMBlock *rb = NULL;
974
975 trace_postcopy_ram_fault_thread_entry();
976 rcu_register_thread();
977 mis->last_rb = NULL; /* last RAMBlock we sent part of */
978 qemu_sem_post(&mis->thread_sync_sem);
979
980 struct pollfd *pfd;
981 size_t pfd_len = 2 + mis->postcopy_remote_fds->len;
982
983 pfd = g_new0(struct pollfd, pfd_len);
984
985 pfd[0].fd = mis->userfault_fd;
986 pfd[0].events = POLLIN;
987 pfd[1].fd = mis->userfault_event_fd;
988 pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
989 trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
990 for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
991 struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
992 struct PostCopyFD, index);
993 pfd[2 + index].fd = pcfd->fd;
994 pfd[2 + index].events = POLLIN;
995 trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
996 pcfd->fd);
997 }
998
999 while (true) {
1000 ram_addr_t rb_offset;
1001 int poll_result;
1002
1003 /*
1004 * We're mainly waiting for the kernel to give us a faulting HVA;
1005 * however, we can be told to quit via userfault_event_fd, which is
1006 * an eventfd
1007 */
1008
1009 poll_result = poll(pfd, pfd_len, -1 /* Wait forever */);
1010 if (poll_result == -1) {
1011 error_report("%s: userfault poll: %s", __func__, strerror(errno));
1012 break;
1013 }
1014
1015 if (!mis->to_src_file) {
1016 /*
1017 * Possibly someone has told us via the event that the return
1018 * path is already broken. We should hold until
1019 * the channel is rebuilt.
1020 */
1021 postcopy_pause_fault_thread(mis);
1022 }
1023
1024 if (pfd[1].revents) {
1025 uint64_t tmp64 = 0;
1026
1027 /* Consume the signal */
1028 if (read(mis->userfault_event_fd, &tmp64, 8) != 8) {
1029 /* Nothing obviously nicer than posting this error. */
1030 error_report("%s: read() failed", __func__);
1031 }
1032
1033 if (qatomic_read(&mis->fault_thread_quit)) {
1034 trace_postcopy_ram_fault_thread_quit();
1035 break;
1036 }
1037 }
1038
1039 if (pfd[0].revents) {
1040 poll_result--;
1041 ret = read(mis->userfault_fd, &msg, sizeof(msg));
1042 if (ret != sizeof(msg)) {
1043 if (errno == EAGAIN) {
1044 /*
1045 * if a wake up happens on the other thread just after
1046 * the poll, there is nothing to read.
1047 */
1048 continue;
1049 }
1050 if (ret < 0) {
1051 error_report("%s: Failed to read full userfault "
1052 "message: %s",
1053 __func__, strerror(errno));
1054 break;
1055 } else {
1056 error_report("%s: Read %d bytes from userfaultfd "
1057 "expected %zd",
1058 __func__, ret, sizeof(msg));
1059 break; /* Lost alignment, don't know what we'd read next */
1060 }
1061 }
1062 if (msg.event != UFFD_EVENT_PAGEFAULT) {
1063 error_report("%s: Read unexpected event %ud from userfaultfd",
1064 __func__, msg.event);
1065 continue; /* It's not a page fault, shouldn't happen */
1066 }
1067
1068 rb = qemu_ram_block_from_host(
1069 (void *)(uintptr_t)msg.arg.pagefault.address,
1070 true, &rb_offset);
1071 if (!rb) {
1072 error_report("postcopy_ram_fault_thread: Fault outside guest: %"
1073 PRIx64, (uint64_t)msg.arg.pagefault.address);
1074 break;
1075 }
1076
1077 rb_offset = ROUND_DOWN(rb_offset, qemu_ram_pagesize(rb));
1078 trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
1079 qemu_ram_get_idstr(rb),
1080 rb_offset,
1081 msg.arg.pagefault.feat.ptid);
1082 mark_postcopy_blocktime_begin(
1083 (uintptr_t)(msg.arg.pagefault.address),
1084 msg.arg.pagefault.feat.ptid, rb);
1085
1086 retry:
1087 /*
1088 * Send the request to the source - we want to request one
1089 * of our host page sizes (which is >= TPS)
1090 */
1091 ret = postcopy_request_page(mis, rb, rb_offset,
1092 msg.arg.pagefault.address);
1093 if (ret) {
1094 /* May be network failure, try to wait for recovery */
1095 postcopy_pause_fault_thread(mis);
1096 goto retry;
1097 }
1098 }
1099
1100 /* Now handle any requests from external processes on shared memory */
1101 /* TODO: May need to handle devices deregistering during postcopy */
1102 for (index = 2; index < pfd_len && poll_result; index++) {
1103 if (pfd[index].revents) {
1104 struct PostCopyFD *pcfd =
1105 &g_array_index(mis->postcopy_remote_fds,
1106 struct PostCopyFD, index - 2);
1107
1108 poll_result--;
1109 if (pfd[index].revents & POLLERR) {
1110 error_report("%s: POLLERR on poll %zd fd=%d",
1111 __func__, index, pcfd->fd);
1112 pfd[index].events = 0;
1113 continue;
1114 }
1115
1116 ret = read(pcfd->fd, &msg, sizeof(msg));
1117 if (ret != sizeof(msg)) {
1118 if (errno == EAGAIN) {
1119 /*
1120 * if a wake up happens on the other thread just after
1121 * the poll, there is nothing to read.
1122 */
1123 continue;
1124 }
1125 if (ret < 0) {
1126 error_report("%s: Failed to read full userfault "
1127 "message: %s (shared) revents=%d",
1128 __func__, strerror(errno),
1129 pfd[index].revents);
1130 /*TODO: Could just disable this sharer */
1131 break;
1132 } else {
1133 error_report("%s: Read %d bytes from userfaultfd "
1134 "expected %zd (shared)",
1135 __func__, ret, sizeof(msg));
1136 /*TODO: Could just disable this sharer */
1137 break; /*Lost alignment,don't know what we'd read next*/
1138 }
1139 }
1140 if (msg.event != UFFD_EVENT_PAGEFAULT) {
1141 error_report("%s: Read unexpected event %ud "
1142 "from userfaultfd (shared)",
1143 __func__, msg.event);
1144 continue; /* It's not a page fault, shouldn't happen */
1145 }
1146 /* Call the device handler registered with us */
1147 ret = pcfd->handler(pcfd, &msg);
1148 if (ret) {
1149 error_report("%s: Failed to resolve shared fault on %zd/%s",
1150 __func__, index, pcfd->idstr);
1151 /* TODO: Fail? Disable this sharer? */
1152 }
1153 }
1154 }
1155 }
1156 rcu_unregister_thread();
1157 trace_postcopy_ram_fault_thread_exit();
1158 g_free(pfd);
1159 return NULL;
1160 }
1161
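/*
 * Allocate one temporary huge page per postcopy channel, used to assemble
 * an incoming host page before it is placed atomically, plus a shared
 * zero page for RAMBlocks where UFFDIO_ZEROPAGE is unavailable.
 */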
1162 static int postcopy_temp_pages_setup(MigrationIncomingState *mis)
1163 {
1164 PostcopyTmpPage *tmp_page;
1165 int err, i, channels;
1166 void *temp_page;
1167
1168 if (migrate_postcopy_preempt()) {
1169 /* If preemption enabled, need extra channel for urgent requests */
1170 mis->postcopy_channels = RAM_CHANNEL_MAX;
1171 } else {
1172 /* Both precopy/postcopy on the same channel */
1173 mis->postcopy_channels = 1;
1174 }
1175
1176 channels = mis->postcopy_channels;
1177 mis->postcopy_tmp_pages = g_malloc0_n(sizeof(PostcopyTmpPage), channels);
1178
1179 for (i = 0; i < channels; i++) {
1180 tmp_page = &mis->postcopy_tmp_pages[i];
1181 temp_page = mmap(NULL, mis->largest_page_size, PROT_READ | PROT_WRITE,
1182 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
1183 if (temp_page == MAP_FAILED) {
1184 err = errno;
1185 error_report("%s: Failed to map postcopy_tmp_pages[%d]: %s",
1186 __func__, i, strerror(err));
1187 /* Clean up will be done later */
1188 return -err;
1189 }
1190 tmp_page->tmp_huge_page = temp_page;
1191 /* Initialize default states for each tmp page */
1192 postcopy_temp_page_reset(tmp_page);
1193 }
1194
1195 /*
1196 * Map large zero page when kernel can't use UFFDIO_ZEROPAGE for hugepages
1197 */
1198 mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
1199 PROT_READ | PROT_WRITE,
1200 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
1201 if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
1202 err = errno;
1203 mis->postcopy_tmp_zero_page = NULL;
1204 error_report("%s: Failed to map large zero page %s",
1205 __func__, strerror(err));
1206 return -err;
1207 }
1208
1209 memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
1210
1211 return 0;
1212 }
1213
1214 int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
1215 {
1216 Error *local_err = NULL;
1217
1218 /* Open the fd for the kernel to give us userfaults */
1219 mis->userfault_fd = uffd_open(O_CLOEXEC | O_NONBLOCK);
1220 if (mis->userfault_fd == -1) {
1221 error_report("%s: Failed to open userfault fd: %s", __func__,
1222 strerror(errno));
1223 return -1;
1224 }
1225
1226 /*
1227 * Although the host check already tested the API, we need to
1228 * do the check again as an ABI handshake on the new fd.
1229 */
1230 if (!ufd_check_and_apply(mis->userfault_fd, mis, &local_err)) {
1231 error_report_err(local_err);
1232 return -1;
1233 }
1234
1235 /* Now an eventfd we use to tell the fault-thread to quit */
1236 mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC);
1237 if (mis->userfault_event_fd == -1) {
1238 error_report("%s: Opening userfault_event_fd: %s", __func__,
1239 strerror(errno));
1240 close(mis->userfault_fd);
1241 return -1;
1242 }
1243
1244 postcopy_thread_create(mis, &mis->fault_thread, "fault-default",
1245 postcopy_ram_fault_thread, QEMU_THREAD_JOINABLE);
1246 mis->have_fault_thread = true;
1247
1248 /* Mark so that we get notified of accesses to unwritten areas */
1249 if (foreach_not_ignored_block(ram_block_enable_notify, mis)) {
1250 error_report("ram_block_enable_notify failed");
1251 return -1;
1252 }
1253
1254 if (postcopy_temp_pages_setup(mis)) {
1255 /* Error dumped in the sub-function */
1256 return -1;
1257 }
1258
1259 if (migrate_postcopy_preempt()) {
1260 /*
1261 * This thread needs to be created after the temp pages because
1262 * it'll fetch RAM_CHANNEL_POSTCOPY PostcopyTmpPage immediately.
1263 */
1264 postcopy_thread_create(mis, &mis->postcopy_prio_thread, "fault-fast",
1265 postcopy_preempt_thread, QEMU_THREAD_JOINABLE);
1266 mis->preempt_thread_status = PREEMPT_THREAD_CREATED;
1267 }
1268
1269 trace_postcopy_ram_enable_notify();
1270
1271 return 0;
1272 }
1273
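/*
 * Place a page via UFFDIO_COPY (from_addr != NULL) or UFFDIO_ZEROPAGE
 * (from_addr == NULL). On success, update the receive bitmap, drop the
 * address from the requested-page tree, and record the blocktime end.
 */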
1274 static int qemu_ufd_copy_ioctl(MigrationIncomingState *mis, void *host_addr,
1275 void *from_addr, uint64_t pagesize, RAMBlock *rb)
1276 {
1277 int userfault_fd = mis->userfault_fd;
1278 int ret;
1279
1280 if (from_addr) {
1281 struct uffdio_copy copy_struct;
1282 copy_struct.dst = (uint64_t)(uintptr_t)host_addr;
1283 copy_struct.src = (uint64_t)(uintptr_t)from_addr;
1284 copy_struct.len = pagesize;
1285 copy_struct.mode = 0;
1286 ret = ioctl(userfault_fd, UFFDIO_COPY, &copy_struct);
1287 } else {
1288 struct uffdio_zeropage zero_struct;
1289 zero_struct.range.start = (uint64_t)(uintptr_t)host_addr;
1290 zero_struct.range.len = pagesize;
1291 zero_struct.mode = 0;
1292 ret = ioctl(userfault_fd, UFFDIO_ZEROPAGE, &zero_struct);
1293 }
1294 if (!ret) {
1295 qemu_mutex_lock(&mis->page_request_mutex);
1296 ramblock_recv_bitmap_set_range(rb, host_addr,
1297 pagesize / qemu_target_page_size());
1298 /*
1299 * If this page resolves a page fault for a previously recorded faulted
1300 * address, take a special note to maintain the requested page list.
1301 */
1302 if (g_tree_lookup(mis->page_requested, host_addr)) {
1303 g_tree_remove(mis->page_requested, host_addr);
1304 int left_pages = qatomic_dec_fetch(&mis->page_requested_count);
1305
1306 trace_postcopy_page_req_del(host_addr, mis->page_requested_count);
1307 /* Order the update of count and read of preempt status */
1308 smp_mb();
1309 if (mis->preempt_thread_status == PREEMPT_THREAD_QUIT &&
1310 left_pages == 0) {
1311 /*
1312 * This probably means the main thread is waiting for us.
1313 * Notify that we've finished receiving the last requested
1314 * page.
1315 */
1316 qemu_cond_signal(&mis->page_request_cond);
1317 }
1318 }
1319 qemu_mutex_unlock(&mis->page_request_mutex);
1320 mark_postcopy_blocktime_end((uintptr_t)host_addr);
1321 }
1322 return ret;
1323 }
1324
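/*
 * Tell every registered external userfaultfd handler that the page at
 * @offset within @rb has been placed, so it can wake its own waiters.
 */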
1325 int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset)
1326 {
1327 int i;
1328 MigrationIncomingState *mis = migration_incoming_get_current();
1329 GArray *pcrfds = mis->postcopy_remote_fds;
1330
1331 for (i = 0; i < pcrfds->len; i++) {
1332 struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
1333 int ret = cur->waker(cur, rb, offset);
1334 if (ret) {
1335 return ret;
1336 }
1337 }
1338 return 0;
1339 }
1340
1341 /*
1342 * Place a host page (from) at (host) atomically
1343 * returns 0 on success
1344 */
1345 int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
1346 RAMBlock *rb)
1347 {
1348 size_t pagesize = qemu_ram_pagesize(rb);
1349
1350 /* The copy also acks to the kernel, waking the stalled thread up.
1351 * TODO: We can inhibit that ack and only do it if it was requested
1352 * which would be slightly cheaper, but we'd have to be careful
1353 * of the order of updating our page state.
1354 */
1355 if (qemu_ufd_copy_ioctl(mis, host, from, pagesize, rb)) {
1356 int e = errno;
1357 error_report("%s: %s copy host: %p from: %p (size: %zd)",
1358 __func__, strerror(e), host, from, pagesize);
1359
1360 return -e;
1361 }
1362
1363 trace_postcopy_place_page(host);
1364 return postcopy_notify_shared_wake(rb,
1365 qemu_ram_block_host_offset(rb, host));
1366 }
1367
1368 /*
1369 * Place a zero page at (host) atomically
1370 * returns 0 on success
1371 */
1372 int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
1373 RAMBlock *rb)
1374 {
1375 size_t pagesize = qemu_ram_pagesize(rb);
1376 trace_postcopy_place_page_zero(host);
1377
1378 /* Normal RAMBlocks can zero a page using UFFDIO_ZEROPAGE
1379 * but it's not available for everything (e.g. hugetlbpages)
1380 */
1381 if (qemu_ram_is_uf_zeroable(rb)) {
1382 if (qemu_ufd_copy_ioctl(mis, host, NULL, pagesize, rb)) {
1383 int e = errno;
1384 error_report("%s: %s zero host: %p",
1385 __func__, strerror(e), host);
1386
1387 return -e;
1388 }
1389 return postcopy_notify_shared_wake(rb,
1390 qemu_ram_block_host_offset(rb,
1391 host));
1392 } else {
1393 return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page, rb);
1394 }
1395 }
1396
1397 #else
1398 /* No target OS support, stubs just fail */
1399 void fill_destination_postcopy_migration_info(MigrationInfo *info)
1400 {
1401 }
1402
1403 bool postcopy_ram_supported_by_host(MigrationIncomingState *mis, Error **errp)
1404 {
1405 error_report("%s: No OS support", __func__);
1406 return false;
1407 }
1408
1409 int postcopy_ram_incoming_init(MigrationIncomingState *mis)
1410 {
1411 error_report("postcopy_ram_incoming_init: No OS support");
1412 return -1;
1413 }
1414
1415 int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
1416 {
1417 assert(0);
1418 return -1;
1419 }
1420
1421 int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
1422 {
1423 assert(0);
1424 return -1;
1425 }
1426
1427 int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
1428 uint64_t client_addr, uint64_t rb_offset)
1429 {
1430 assert(0);
1431 return -1;
1432 }
1433
1434 int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
1435 {
1436 assert(0);
1437 return -1;
1438 }
1439
1440 int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
1441 RAMBlock *rb)
1442 {
1443 assert(0);
1444 return -1;
1445 }
1446
1447 int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
1448 RAMBlock *rb)
1449 {
1450 assert(0);
1451 return -1;
1452 }
1453
1454 int postcopy_wake_shared(struct PostCopyFD *pcfd,
1455 uint64_t client_addr,
1456 RAMBlock *rb)
1457 {
1458 assert(0);
1459 return -1;
1460 }
1461 #endif
1462
1463 /* ------------------------------------------------------------------------- */
1464 void postcopy_temp_page_reset(PostcopyTmpPage *tmp_page)
1465 {
1466 tmp_page->target_pages = 0;
1467 tmp_page->host_addr = NULL;
1468 /*
1469 * This is set to true on reset, and cleared as soon as we receive any
1470 * non-zero small page within this huge page.
1471 */
1472 tmp_page->all_zero = true;
1473 }
1474
1475 void postcopy_fault_thread_notify(MigrationIncomingState *mis)
1476 {
1477 uint64_t tmp64 = 1;
1478
1479 /*
1480 * Wake up the fault_thread. It's an eventfd that should currently
1481 * be at 0; we're going to increment it to 1
1482 */
1483 if (write(mis->userfault_event_fd, &tmp64, 8) != 8) {
1484 /* Not much we can do here, but may as well report it */
1485 error_report("%s: incrementing failed: %s", __func__,
1486 strerror(errno));
1487 }
1488 }
1489
1490 /**
1491 * postcopy_discard_send_init: Called at the start of each RAMBlock before
1492 * asking to discard individual ranges.
1493 *
1494 * @ms: The current migration state.
1496 * @name: RAMBlock that discards will operate on.
1497 */
1498 static PostcopyDiscardState pds = {0};
1499 void postcopy_discard_send_init(MigrationState *ms, const char *name)
1500 {
1501 pds.ramblock_name = name;
1502 pds.cur_entry = 0;
1503 pds.nsentwords = 0;
1504 pds.nsentcmds = 0;
1505 }
1506
1507 /**
1508 * postcopy_discard_send_range: Called by the bitmap code for each chunk to
1509 * discard. May send a discard message, may just leave it queued to
1510 * be sent later.
1511 *
1512 * @ms: Current migration state.
1513 * @start,@length: a range of pages in the migration bitmap in the
1514 * RAM block passed to postcopy_discard_send_init() (length=1 is one page)
1515 */
1516 void postcopy_discard_send_range(MigrationState *ms, unsigned long start,
1517 unsigned long length)
1518 {
1519 size_t tp_size = qemu_target_page_size();
1520 /* Convert to byte offsets within the RAM block */
1521 pds.start_list[pds.cur_entry] = start * tp_size;
1522 pds.length_list[pds.cur_entry] = length * tp_size;
1523 trace_postcopy_discard_send_range(pds.ramblock_name, start, length);
1524 pds.cur_entry++;
1525 pds.nsentwords++;
1526
1527 if (pds.cur_entry == MAX_DISCARDS_PER_COMMAND) {
1528 /* Full set, ship it! */
1529 qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
1530 pds.ramblock_name,
1531 pds.cur_entry,
1532 pds.start_list,
1533 pds.length_list);
1534 pds.nsentcmds++;
1535 pds.cur_entry = 0;
1536 }
1537 }
1538
1539 /**
1540 * postcopy_discard_send_finish: Called at the end of each RAMBlock by the
1541 * bitmap code. Sends any outstanding discard messages.
1542 *
1543 * @ms: Current migration state.
1544 */
1545 void postcopy_discard_send_finish(MigrationState *ms)
1546 {
1547 /* Anything unsent? */
1548 if (pds.cur_entry) {
1549 qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
1550 pds.ramblock_name,
1551 pds.cur_entry,
1552 pds.start_list,
1553 pds.length_list);
1554 pds.nsentcmds++;
1555 }
1556
1557 trace_postcopy_discard_send_finish(pds.ramblock_name, pds.nsentwords,
1558 pds.nsentcmds);
1559 }
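
/*
 * Illustrative caller sketch (not part of this file): the dirty-bitmap
 * walker on the source side is expected to drive the three helpers above
 * roughly as follows; find_next_dirty_run() is a hypothetical placeholder
 * for the real bitmap scan in ram.c.
 *
 *     postcopy_discard_send_init(ms, block->idstr);
 *     while (find_next_dirty_run(block, &start, &length)) {
 *         postcopy_discard_send_range(ms, start, length);
 *     }
 *     postcopy_discard_send_finish(ms);
 */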
1560
1561 /*
1562 * Current state of incoming postcopy; note this is not part of
1563 * MigrationIncomingState since its state is used during cleanup
1564 * at the end as MIS is being freed.
1565 */
1566 static PostcopyState incoming_postcopy_state;
1567
1568 PostcopyState postcopy_state_get(void)
1569 {
1570 return qatomic_load_acquire(&incoming_postcopy_state);
1571 }
1572
1573 /* Set the state and return the old state */
1574 PostcopyState postcopy_state_set(PostcopyState new_state)
1575 {
1576 return qatomic_xchg(&incoming_postcopy_state, new_state);
1577 }
1578
1579 /* Register a handler for external shared memory postcopy
1580 * called on the destination.
1581 */
1582 void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
1583 {
1584 MigrationIncomingState *mis = migration_incoming_get_current();
1585
1586 mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
1587 *pcfd);
1588 }
1589
1590 /* Unregister a handler for external shared memory postcopy
1591 */
1592 void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
1593 {
1594 guint i;
1595 MigrationIncomingState *mis = migration_incoming_get_current();
1596 GArray *pcrfds = mis->postcopy_remote_fds;
1597
1598 if (!pcrfds) {
1599 /* migration has already finished and freed the array */
1600 return;
1601 }
1602 for (i = 0; i < pcrfds->len; i++) {
1603 struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
1604 if (cur->fd == pcfd->fd) {
1605 mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
1606 return;
1607 }
1608 }
1609 }
1610
1611 void postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file)
1612 {
1613 /*
1614 * The new loading channel has its own threads, so it needs to be
1615 * set to blocking mode too. It's true by default; just be explicit.
1616 */
1617 qemu_file_set_blocking(file, true);
1618 mis->postcopy_qemufile_dst = file;
1619 qemu_sem_post(&mis->postcopy_qemufile_dst_done);
1620 trace_postcopy_preempt_new_channel();
1621 }
1622
1623 /*
1624 * Setup the postcopy preempt channel with the IOC. If ERROR is specified,
1625 * setup the error instead. This helper will free the ERROR if specified.
1626 */
1627 static void
1628 postcopy_preempt_send_channel_done(MigrationState *s,
1629 QIOChannel *ioc, Error *local_err)
1630 {
1631 if (local_err) {
1632 migrate_set_error(s, local_err);
1633 error_free(local_err);
1634 } else {
1635 migration_ioc_register_yank(ioc);
1636 s->postcopy_qemufile_src = qemu_file_new_output(ioc);
1637 trace_postcopy_preempt_new_channel();
1638 }
1639
1640 /*
1641 * Kick the waiter in all cases. The waiter should check upon
1642 * postcopy_qemufile_src to know whether it failed or not.
1643 */
1644 qemu_sem_post(&s->postcopy_qemufile_src_sem);
1645 }
1646
1647 static void
1648 postcopy_preempt_tls_handshake(QIOTask *task, gpointer opaque)
1649 {
1650 g_autoptr(QIOChannel) ioc = QIO_CHANNEL(qio_task_get_source(task));
1651 MigrationState *s = opaque;
1652 Error *local_err = NULL;
1653
1654 qio_task_propagate_error(task, &local_err);
1655 postcopy_preempt_send_channel_done(s, ioc, local_err);
1656 }
1657
1658 static void
1659 postcopy_preempt_send_channel_new(QIOTask *task, gpointer opaque)
1660 {
1661 g_autoptr(QIOChannel) ioc = QIO_CHANNEL(qio_task_get_source(task));
1662 MigrationState *s = opaque;
1663 QIOChannelTLS *tioc;
1664 Error *local_err = NULL;
1665
1666 if (qio_task_propagate_error(task, &local_err)) {
1667 goto out;
1668 }
1669
1670 if (migrate_channel_requires_tls_upgrade(ioc)) {
1671 tioc = migration_tls_client_create(ioc, s->hostname, &local_err);
1672 if (!tioc) {
1673 goto out;
1674 }
1675 trace_postcopy_preempt_tls_handshake();
1676 qio_channel_set_name(QIO_CHANNEL(tioc), "migration-tls-preempt");
1677 qio_channel_tls_handshake(tioc, postcopy_preempt_tls_handshake,
1678 s, NULL, NULL);
1679 /* The rest of the channel setup is postponed until the TLS handshake finishes */
1680 return;
1681 }
1682
1683 out:
1684 /* This handles both good and error cases */
1685 postcopy_preempt_send_channel_done(s, ioc, local_err);
1686 }
1687
1688 /*
1689 * This function will kick off an async task to establish the preempt
1690 * channel, and wait until the connection setup has completed. Returns 0 if
1691 * the channel is established, -1 on error.
1692 */
1693 int postcopy_preempt_establish_channel(MigrationState *s)
1694 {
1695 /* If preempt not enabled, no need to wait */
1696 if (!migrate_postcopy_preempt()) {
1697 return 0;
1698 }
1699
1700 /*
1701 * Kick off async task to establish preempt channel. Only do so with
1702 * 8.0+ machines, because 7.1/7.2 require the channel to be created in
1703 * setup phase of migration (even if racy in an unreliable network).
1704 */
1705 if (!s->preempt_pre_7_2) {
1706 postcopy_preempt_setup(s);
1707 }
1708
1709 /*
1710 * We need the postcopy preempt channel to be established before
1711 * we start doing anything.
1712 */
1713 qemu_sem_wait(&s->postcopy_qemufile_src_sem);
1714
1715 return s->postcopy_qemufile_src ? 0 : -1;
1716 }
1717
1718 void postcopy_preempt_setup(MigrationState *s)
1719 {
1720 /* Kick an async task to connect */
1721 socket_send_channel_create(postcopy_preempt_send_channel_new, s);
1722 }
1723
1724 static void postcopy_pause_ram_fast_load(MigrationIncomingState *mis)
1725 {
1726 trace_postcopy_pause_fast_load();
1727 qemu_mutex_unlock(&mis->postcopy_prio_thread_mutex);
1728 qemu_sem_wait(&mis->postcopy_pause_sem_fast_load);
1729 qemu_mutex_lock(&mis->postcopy_prio_thread_mutex);
1730 trace_postcopy_pause_fast_load_continued();
1731 }
1732
1733 static bool preempt_thread_should_run(MigrationIncomingState *mis)
1734 {
1735 return mis->preempt_thread_status != PREEMPT_THREAD_QUIT;
1736 }
1737
1738 void *postcopy_preempt_thread(void *opaque)
1739 {
1740 MigrationIncomingState *mis = opaque;
1741 int ret;
1742
1743 trace_postcopy_preempt_thread_entry();
1744
1745 rcu_register_thread();
1746
1747 qemu_sem_post(&mis->thread_sync_sem);
1748
1749 /*
1750 * The preempt channel is established in an asynchronous way. Wait
1751 * for its completion.
1752 */
1753 qemu_sem_wait(&mis->postcopy_qemufile_dst_done);
1754
1755 /* Receiving RAM_SAVE_FLAG_EOS from the source terminates this thread */
1756 qemu_mutex_lock(&mis->postcopy_prio_thread_mutex);
1757 while (preempt_thread_should_run(mis)) {
1758 ret = ram_load_postcopy(mis->postcopy_qemufile_dst,
1759 RAM_CHANNEL_POSTCOPY);
1760 /* If error happened, go into recovery routine */
1761 if (ret && preempt_thread_should_run(mis)) {
1762 postcopy_pause_ram_fast_load(mis);
1763 } else {
1764 /* We're done */
1765 break;
1766 }
1767 }
1768 qemu_mutex_unlock(&mis->postcopy_prio_thread_mutex);
1769
1770 rcu_unregister_thread();
1771
1772 trace_postcopy_preempt_thread_exit();
1773
1774 return NULL;
1775 }