/*
 * Postcopy migration for RAM
 *
 * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates
 *
 * Authors:
 *  Dave Gilbert <dgilbert@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

/*
 * Postcopy is a migration technique where the execution flips from the
 * source to the destination before all the data has been copied.
 */

#include "qemu/osdep.h"
#include "exec/target_page.h"
#include "migration.h"
#include "qemu-file.h"
#include "savevm.h"
#include "postcopy-ram.h"
#include "ram.h"
#include "qapi/error.h"
#include "qemu/notify.h"
#include "sysemu/sysemu.h"
#include "sysemu/balloon.h"
#include "qemu/error-report.h"
#include "trace.h"

/* Arbitrary limit on size of each discard command,
 * keeps them around ~200 bytes
 */
#define MAX_DISCARDS_PER_COMMAND 12

struct PostcopyDiscardState {
    const char *ramblock_name;
    uint16_t cur_entry;
    /*
     * Start and length of a discard range (bytes)
     */
    uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
    uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
    unsigned int nsentwords;
    unsigned int nsentcmds;
};

static NotifierWithReturnList postcopy_notifier_list;

void postcopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&postcopy_notifier_list);
}

void postcopy_add_notifier(NotifierWithReturn *nn)
{
    notifier_with_return_list_add(&postcopy_notifier_list, nn);
}

void postcopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp)
{
    struct PostcopyNotifyData pnd;
    pnd.reason = reason;
    pnd.errp = errp;

    return notifier_with_return_list_notify(&postcopy_notifier_list,
                                            &pnd);
}
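
/*
 * Illustrative sketch (not part of the original file): a device that wants to
 * veto or observe postcopy can hook in via postcopy_add_notifier().  The
 * handler below is hypothetical; it assumes the NotifierWithReturn callback
 * signature used elsewhere in QEMU (int (*)(NotifierWithReturn *, void *))
 * and the PostcopyNotifyData layout seen in postcopy_notify() above.
 *
 *     static int my_device_postcopy_notifier(NotifierWithReturn *n, void *opaque)
 *     {
 *         struct PostcopyNotifyData *pnd = opaque;
 *
 *         if (pnd->reason == POSTCOPY_NOTIFY_PROBE &&
 *             !my_device_supports_postcopy()) {     // hypothetical helper
 *             error_setg(pnd->errp, "my-device: postcopy not supported");
 *             return -ENOTSUP;   // a non-zero return aborts the probe
 *         }
 *         return 0;
 *     }
 *
 *     static NotifierWithReturn my_device_postcopy_notifier_entry = {
 *         .notify = my_device_postcopy_notifier,
 *     };
 *
 *     // typically from the device's init path:
 *     postcopy_add_notifier(&my_device_postcopy_notifier_entry);
 */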

/* Postcopy needs to detect accesses to pages that haven't yet been copied
 * across, and efficiently map new pages in; the techniques for doing this
 * are target-OS specific.
 */
#if defined(__linux__)

#include <poll.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <asm/types.h> /* for __u64 */
#endif

#if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
#include <sys/eventfd.h>
#include <linux/userfaultfd.h>

typedef struct PostcopyBlocktimeContext {
    /* time when page fault initiated per vCPU */
    uint32_t *page_fault_vcpu_time;
    /* page address per vCPU */
    uintptr_t *vcpu_addr;
    uint32_t total_blocktime;
    /* blocktime per vCPU */
    uint32_t *vcpu_blocktime;
    /* point in time when last page fault was initiated */
    uint32_t last_begin;
    /* number of vCPUs currently suspended */
    int smp_cpus_down;
    uint64_t start_time;

    /*
     * Handler for exit event, necessary for
     * releasing whole blocktime_ctx
     */
    Notifier exit_notifier;
} PostcopyBlocktimeContext;

static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
{
    g_free(ctx->page_fault_vcpu_time);
    g_free(ctx->vcpu_addr);
    g_free(ctx->vcpu_blocktime);
    g_free(ctx);
}

static void migration_exit_cb(Notifier *n, void *data)
{
    PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext,
                                                 exit_notifier);
    destroy_blocktime_context(ctx);
}

static struct PostcopyBlocktimeContext *blocktime_context_new(void)
{
    PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
    ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus);
    ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
    ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus);

    ctx->exit_notifier.notify = migration_exit_cb;
    ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    qemu_add_exit_notifier(&ctx->exit_notifier);
    return ctx;
}

/**
 * receive_ufd_features: check userfault fd features, to request only supported
 * features in the future.
 *
 * Returns: true on success
 *
 * The availability of __NR_userfaultfd should have been checked before
 * calling this.
 * @features: out parameter will contain uffdio_api.features provided by kernel
 *            in case of success
 */
static bool receive_ufd_features(uint64_t *features)
{
    struct uffdio_api api_struct = {0};
    int ufd;
    bool ret = true;

    /* if we are here __NR_userfaultfd should exist */
    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
    if (ufd == -1) {
        error_report("%s: syscall __NR_userfaultfd failed: %s", __func__,
                     strerror(errno));
        return false;
    }

    /* ask features */
    api_struct.api = UFFD_API;
    api_struct.features = 0;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        error_report("%s: UFFDIO_API failed: %s", __func__,
                     strerror(errno));
        ret = false;
        goto release_ufd;
    }

    *features = api_struct.features;

release_ufd:
    close(ufd);
    return ret;
}

/**
 * request_ufd_features: this function should be called only once on a newly
 * opened ufd; subsequent calls will lead to error.
 *
 * Returns: true on success
 *
 * @ufd: fd obtained from userfaultfd syscall
 * @features: bit mask see UFFD_API_FEATURES
 */
static bool request_ufd_features(int ufd, uint64_t features)
{
    struct uffdio_api api_struct = {0};
    uint64_t ioctl_mask;

    api_struct.api = UFFD_API;
    api_struct.features = features;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        error_report("%s failed: UFFDIO_API failed: %s", __func__,
                     strerror(errno));
        return false;
    }

    ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
                 (__u64)1 << _UFFDIO_UNREGISTER;
    if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
        error_report("Missing userfault features: %" PRIx64,
                     (uint64_t)(~api_struct.ioctls & ioctl_mask));
        return false;
    }

    return true;
}
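
/*
 * For reference, a minimal sketch of the userfaultfd handshake the two
 * helpers above perform, outside of QEMU (illustrative only, error paths
 * trimmed).  It relies only on the kernel ABI from <linux/userfaultfd.h>:
 *
 *     #include <fcntl.h>
 *     #include <stdint.h>
 *     #include <sys/ioctl.h>
 *     #include <sys/syscall.h>
 *     #include <unistd.h>
 *     #include <linux/userfaultfd.h>
 *
 *     int open_uffd_and_negotiate(uint64_t wanted_features)
 *     {
 *         int ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
 *         struct uffdio_api api = {
 *             .api = UFFD_API,
 *             .features = wanted_features,  // must be a subset of what the kernel offers
 *         };
 *
 *         if (ufd < 0) {
 *             return -1;
 *         }
 *         if (ioctl(ufd, UFFDIO_API, &api) != 0) {
 *             close(ufd);
 *             return -1;
 *         }
 *         // api.features and api.ioctls now describe what the kernel granted
 *         return ufd;
 *     }
 */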

static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis)
{
    uint64_t asked_features = 0;
    static uint64_t supported_features;

    /*
     * It's not possible to request UFFD_API twice per fd, and the
     * userfault fd features are persistent, so only query them once.
     */
    if (!supported_features) {
        if (!receive_ufd_features(&supported_features)) {
            error_report("%s failed", __func__);
            return false;
        }
    }

#ifdef UFFD_FEATURE_THREAD_ID
    if (migrate_postcopy_blocktime() && mis &&
        UFFD_FEATURE_THREAD_ID & supported_features) {
        /* kernel supports that feature */
        /* don't create blocktime_context if it exists */
        if (!mis->blocktime_ctx) {
            mis->blocktime_ctx = blocktime_context_new();
        }

        asked_features |= UFFD_FEATURE_THREAD_ID;
    }
#endif

    /*
     * Request features even if asked_features is 0, because the kernel
     * expects UFFD_API before UFFDIO_REGISTER on each userfault file
     * descriptor.
     */
    if (!request_ufd_features(ufd, asked_features)) {
        error_report("%s failed: features %" PRIu64, __func__,
                     asked_features);
        return false;
    }

    if (getpagesize() != ram_pagesize_summary()) {
        bool have_hp = false;
        /* We've got a huge page */
#ifdef UFFD_FEATURE_MISSING_HUGETLBFS
        have_hp = supported_features & UFFD_FEATURE_MISSING_HUGETLBFS;
#endif
        if (!have_hp) {
            error_report("Userfault on this host does not support huge pages");
            return false;
        }
    }
    return true;
}

/* Callback from postcopy_ram_supported_by_host block iterator.
 */
static int test_ramblock_postcopiable(const char *block_name, void *host_addr,
                                      ram_addr_t offset, ram_addr_t length,
                                      void *opaque)
{
    RAMBlock *rb = qemu_ram_block_by_name(block_name);
    size_t pagesize = qemu_ram_pagesize(rb);

    if (length % pagesize) {
        error_report("Postcopy requires RAM blocks to be a page size multiple,"
                     " block %s is 0x" RAM_ADDR_FMT " bytes with a "
                     "page size of 0x%zx", block_name, length, pagesize);
        return 1;
    }
    return 0;
}

/*
 * Note: This has the side effect of munlock'ing all of RAM; that's
 * normally fine since, if the postcopy succeeds, it gets turned back on at
 * the end.
 */
bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
{
    long pagesize = getpagesize();
    int ufd = -1;
    bool ret = false; /* Error unless we change it */
    void *testarea = NULL;
    struct uffdio_register reg_struct;
    struct uffdio_range range_struct;
    uint64_t feature_mask;
    Error *local_err = NULL;

    if (qemu_target_page_size() > pagesize) {
        error_report("Target page size bigger than host page size");
        goto out;
    }

    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
    if (ufd == -1) {
        error_report("%s: userfaultfd not available: %s", __func__,
                     strerror(errno));
        goto out;
    }

    /* Give devices a chance to object */
    if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, &local_err)) {
        error_report_err(local_err);
        goto out;
    }

    /* Version and features check */
    if (!ufd_check_and_apply(ufd, mis)) {
        goto out;
    }

    /* We don't support postcopy with shared RAM yet */
    if (qemu_ram_foreach_block(test_ramblock_postcopiable, NULL)) {
        goto out;
    }

    /*
     * userfault and mlock don't go together; we'll put it back later if
     * it was enabled.
     */
    if (munlockall()) {
        error_report("%s: munlockall: %s", __func__, strerror(errno));
        goto out;
    }

    /*
     * We need to check that the ops we need are supported on anon memory
     * To do that we need to register a chunk and see the flags that
     * are returned.
     */
    testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
                                    MAP_ANONYMOUS, -1, 0);
    if (testarea == MAP_FAILED) {
        error_report("%s: Failed to map test area: %s", __func__,
                     strerror(errno));
        goto out;
    }
    g_assert(((size_t)testarea & (pagesize - 1)) == 0);

    reg_struct.range.start = (uintptr_t)testarea;
    reg_struct.range.len = pagesize;
    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

    if (ioctl(ufd, UFFDIO_REGISTER, &reg_struct)) {
        error_report("%s userfault register: %s", __func__, strerror(errno));
        goto out;
    }

    range_struct.start = (uintptr_t)testarea;
    range_struct.len = pagesize;
    if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
        error_report("%s userfault unregister: %s", __func__, strerror(errno));
        goto out;
    }

    feature_mask = (__u64)1 << _UFFDIO_WAKE |
                   (__u64)1 << _UFFDIO_COPY |
                   (__u64)1 << _UFFDIO_ZEROPAGE;
    if ((reg_struct.ioctls & feature_mask) != feature_mask) {
        error_report("Missing userfault map features: %" PRIx64,
                     (uint64_t)(~reg_struct.ioctls & feature_mask));
        goto out;
    }

    /* Success! */
    ret = true;
out:
    if (testarea) {
        munmap(testarea, pagesize);
    }
    if (ufd != -1) {
        close(ufd);
    }
    return ret;
}

/*
 * Setup an area of RAM so that it *can* be used for postcopy later; this
 * must be done right at the start prior to pre-copy.
 * opaque should be the MIS.
 */
static int init_range(const char *block_name, void *host_addr,
                      ram_addr_t offset, ram_addr_t length, void *opaque)
{
    trace_postcopy_init_range(block_name, host_addr, offset, length);

    /*
     * We need the whole of RAM to be truly empty for postcopy, so things
     * like ROMs and any data tables built during init must be zero'd
     * - we're going to get the copy from the source anyway.
     * (Precopy will just overwrite this data, so doesn't need the discard)
     */
    if (ram_discard_range(block_name, 0, length)) {
        return -1;
    }

    return 0;
}

/*
 * At the end of migration, undo the effects of init_range
 * opaque should be the MIS.
 */
static int cleanup_range(const char *block_name, void *host_addr,
                         ram_addr_t offset, ram_addr_t length, void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffdio_range range_struct;
    trace_postcopy_cleanup_range(block_name, host_addr, offset, length);

    /*
     * We turned off hugepage for the precopy stage with postcopy enabled;
     * we can turn it back on now.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);

    /*
     * We can also turn off userfault now since we should have all the
     * pages.  It can be useful to leave it on to debug postcopy
     * if you're not sure it's always getting every page.
     */
    range_struct.start = (uintptr_t)host_addr;
    range_struct.len = length;

    if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
        error_report("%s: userfault unregister %s", __func__, strerror(errno));

        return -1;
    }

    return 0;
}

/*
 * Initialise postcopy-ram, setting the RAM to a state where we can go into
 * postcopy later; must be called prior to any precopy.
 * called from arch_init's similarly named ram_postcopy_incoming_init
 */
int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
{
    if (qemu_ram_foreach_block(init_range, NULL)) {
        return -1;
    }

    return 0;
}

/*
 * At the end of a migration where postcopy_ram_incoming_init was called.
 */
int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    trace_postcopy_ram_incoming_cleanup_entry();

    if (mis->have_fault_thread) {
        Error *local_err = NULL;

        if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
            error_report_err(local_err);
            return -1;
        }

        if (qemu_ram_foreach_block(cleanup_range, mis)) {
            return -1;
        }
        /* Let the fault thread quit */
        atomic_set(&mis->fault_thread_quit, 1);
        postcopy_fault_thread_notify(mis);
        trace_postcopy_ram_incoming_cleanup_join();
        qemu_thread_join(&mis->fault_thread);

        trace_postcopy_ram_incoming_cleanup_closeuf();
        close(mis->userfault_fd);
        close(mis->userfault_event_fd);
        mis->have_fault_thread = false;
    }

    qemu_balloon_inhibit(false);

    if (enable_mlock) {
        if (os_mlock() < 0) {
            error_report("mlock: %s", strerror(errno));
            /*
             * It doesn't feel right to fail at this point, we have a valid
             * VM state.
             */
        }
    }

    postcopy_state_set(POSTCOPY_INCOMING_END);

    if (mis->postcopy_tmp_page) {
        munmap(mis->postcopy_tmp_page, mis->largest_page_size);
        mis->postcopy_tmp_page = NULL;
    }
    if (mis->postcopy_tmp_zero_page) {
        munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
        mis->postcopy_tmp_zero_page = NULL;
    }
    trace_postcopy_ram_incoming_cleanup_exit();
    return 0;
}

/*
 * Disable huge pages on an area
 */
static int nhp_range(const char *block_name, void *host_addr,
                     ram_addr_t offset, ram_addr_t length, void *opaque)
{
    trace_postcopy_nhp_range(block_name, host_addr, offset, length);

    /*
     * Before we do discards we need to ensure those discards really
     * do delete areas of the page, even if THP thinks a hugepage would
     * be a good idea, so force hugepages off.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);

    return 0;
}

/*
 * Userfault requires us to mark RAM as NOHUGEPAGE prior to discard;
 * however, leaving it until after precopy means that most of the precopy
 * data is still THP'd
 */
int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    if (qemu_ram_foreach_block(nhp_range, mis)) {
        return -1;
    }

    postcopy_state_set(POSTCOPY_INCOMING_DISCARD);

    return 0;
}

/*
 * Mark the given area of RAM as requiring userfault notification for
 * accesses to as-yet unwritten areas.
 * Used as a callback on qemu_ram_foreach_block.
 *   host_addr: Base of area to mark
 *   offset: Offset in the whole ram arena
 *   length: Length of the section
 *   opaque: MigrationIncomingState pointer
 * Returns 0 on success
 */
static int ram_block_enable_notify(const char *block_name, void *host_addr,
                                   ram_addr_t offset, ram_addr_t length,
                                   void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffdio_register reg_struct;

    reg_struct.range.start = (uintptr_t)host_addr;
    reg_struct.range.len = length;
    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

    /* Now tell our userfault_fd that it's responsible for this area */
    if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, &reg_struct)) {
        error_report("%s userfault register: %s", __func__, strerror(errno));
        return -1;
    }
    if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
        error_report("%s userfault: Region doesn't support COPY", __func__);
        return -1;
    }
    if (reg_struct.ioctls & ((__u64)1 << _UFFDIO_ZEROPAGE)) {
        RAMBlock *rb = qemu_ram_block_by_name(block_name);
        qemu_ram_set_uf_zeroable(rb);
    }

    return 0;
}

int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    struct uffdio_range range;
    int ret;
    trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
    range.start = client_addr & ~(pagesize - 1);
    range.len = pagesize;
    ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range);
    if (ret) {
        error_report("%s: Failed to wake: %zx in %s (%s)",
                     __func__, (size_t)client_addr, qemu_ram_get_idstr(rb),
                     strerror(errno));
    }
    return ret;
}

/*
 * Callback from shared fault handlers to ask for a page;
 * the page must be specified by a RAMBlock and an offset in that rb.
 * Note: Only for use by shared fault handlers (in fault thread)
 */
int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    uint64_t aligned_rbo = rb_offset & ~(pagesize - 1);
    MigrationIncomingState *mis = migration_incoming_get_current();

    trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
                                       rb_offset);
    if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
        trace_postcopy_request_shared_page_present(pcfd->idstr,
                                                   qemu_ram_get_idstr(rb),
                                                   rb_offset);
        return postcopy_wake_shared(pcfd, client_addr, rb);
    }
    if (rb != mis->last_rb) {
        mis->last_rb = rb;
        migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
                                  aligned_rbo, pagesize);
    } else {
        /* Save some space */
        migrate_send_rp_req_pages(mis, NULL, aligned_rbo, pagesize);
    }
    return 0;
}

/*
 * Handle faults detected by the USERFAULT markings
 */
static void *postcopy_ram_fault_thread(void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffd_msg msg;
    int ret;
    size_t index;
    RAMBlock *rb = NULL;

    trace_postcopy_ram_fault_thread_entry();
    mis->last_rb = NULL; /* last RAMBlock we sent part of */
    qemu_sem_post(&mis->fault_thread_sem);

    struct pollfd *pfd;
    size_t pfd_len = 2 + mis->postcopy_remote_fds->len;

    pfd = g_new0(struct pollfd, pfd_len);

    pfd[0].fd = mis->userfault_fd;
    pfd[0].events = POLLIN;
    pfd[1].fd = mis->userfault_event_fd;
    pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
    trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
    for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
        struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
                                                 struct PostCopyFD, index);
        pfd[2 + index].fd = pcfd->fd;
        pfd[2 + index].events = POLLIN;
        trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
                                                  pcfd->fd);
    }

    while (true) {
        ram_addr_t rb_offset;
        int poll_result;

        /*
         * We're mainly waiting for the kernel to give us a faulting HVA;
         * however we can be told to quit via userfault_event_fd, which is
         * an eventfd.
         */

        poll_result = poll(pfd, pfd_len, -1 /* Wait forever */);
        if (poll_result == -1) {
            error_report("%s: userfault poll: %s", __func__, strerror(errno));
            break;
        }

        if (pfd[1].revents) {
            uint64_t tmp64 = 0;

            /* Consume the signal */
            if (read(mis->userfault_event_fd, &tmp64, 8) != 8) {
                /* Nothing obviously nicer than posting this error. */
                error_report("%s: read() failed", __func__);
            }

            if (atomic_read(&mis->fault_thread_quit)) {
                trace_postcopy_ram_fault_thread_quit();
                break;
            }
        }

        if (pfd[0].revents) {
            poll_result--;
            ret = read(mis->userfault_fd, &msg, sizeof(msg));
            if (ret != sizeof(msg)) {
                if (errno == EAGAIN) {
                    /*
                     * if a wake up happens on the other thread just after
                     * the poll, there is nothing to read.
                     */
                    continue;
                }
                if (ret < 0) {
                    error_report("%s: Failed to read full userfault "
                                 "message: %s",
                                 __func__, strerror(errno));
                    break;
                } else {
                    error_report("%s: Read %d bytes from userfaultfd "
                                 "expected %zd",
                                 __func__, ret, sizeof(msg));
                    break; /* Lost alignment, don't know what we'd read next */
                }
            }
            if (msg.event != UFFD_EVENT_PAGEFAULT) {
                error_report("%s: Read unexpected event %u from userfaultfd",
                             __func__, msg.event);
                continue; /* It's not a page fault, shouldn't happen */
            }

            rb = qemu_ram_block_from_host(
                     (void *)(uintptr_t)msg.arg.pagefault.address,
                     true, &rb_offset);
            if (!rb) {
                error_report("postcopy_ram_fault_thread: Fault outside guest: %"
                             PRIx64, (uint64_t)msg.arg.pagefault.address);
                break;
            }

            rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
            trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
                                                    qemu_ram_get_idstr(rb),
                                                    rb_offset);
            /*
             * Send the request to the source - we want to request one
             * of our host page sizes (which is >= TPS)
             */
            if (rb != mis->last_rb) {
                mis->last_rb = rb;
                migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
                                          rb_offset, qemu_ram_pagesize(rb));
            } else {
                /* Save some space */
                migrate_send_rp_req_pages(mis, NULL,
                                          rb_offset, qemu_ram_pagesize(rb));
            }
        }

        /* Now handle any requests from external processes on shared memory */
        /* TODO: May need to handle devices deregistering during postcopy */
        for (index = 2; index < pfd_len && poll_result; index++) {
            if (pfd[index].revents) {
                struct PostCopyFD *pcfd =
                    &g_array_index(mis->postcopy_remote_fds,
                                   struct PostCopyFD, index - 2);

                poll_result--;
                if (pfd[index].revents & POLLERR) {
                    error_report("%s: POLLERR on poll %zd fd=%d",
                                 __func__, index, pcfd->fd);
                    pfd[index].events = 0;
                    continue;
                }

                ret = read(pcfd->fd, &msg, sizeof(msg));
                if (ret != sizeof(msg)) {
                    if (errno == EAGAIN) {
                        /*
                         * if a wake up happens on the other thread just after
                         * the poll, there is nothing to read.
                         */
                        continue;
                    }
                    if (ret < 0) {
                        error_report("%s: Failed to read full userfault "
                                     "message: %s (shared) revents=%d",
                                     __func__, strerror(errno),
                                     pfd[index].revents);
                        /* TODO: Could just disable this sharer */
                        break;
                    } else {
                        error_report("%s: Read %d bytes from userfaultfd "
                                     "expected %zd (shared)",
                                     __func__, ret, sizeof(msg));
                        /* TODO: Could just disable this sharer */
                        break; /* Lost alignment, don't know what we'd read next */
                    }
                }
                if (msg.event != UFFD_EVENT_PAGEFAULT) {
                    error_report("%s: Read unexpected event %u "
                                 "from userfaultfd (shared)",
                                 __func__, msg.event);
                    continue; /* It's not a page fault, shouldn't happen */
                }
                /* Call the device handler registered with us */
                ret = pcfd->handler(pcfd, &msg);
                if (ret) {
                    error_report("%s: Failed to resolve shared fault on %zd/%s",
                                 __func__, index, pcfd->idstr);
                    /* TODO: Fail? Disable this sharer? */
                }
            }
        }
    }
    trace_postcopy_ram_fault_thread_exit();
    g_free(pfd);
    return NULL;
}

int postcopy_ram_enable_notify(MigrationIncomingState *mis)
{
    /* Open the fd for the kernel to give us userfaults */
    mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (mis->userfault_fd == -1) {
        error_report("%s: Failed to open userfault fd: %s", __func__,
                     strerror(errno));
        return -1;
    }

    /*
     * Although the host check already tested the API, we need to
     * do the check again as an ABI handshake on the new fd.
     */
    if (!ufd_check_and_apply(mis->userfault_fd, mis)) {
        return -1;
    }

    /* Now an eventfd we use to tell the fault-thread to quit */
    mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC);
    if (mis->userfault_event_fd == -1) {
        error_report("%s: Opening userfault_event_fd: %s", __func__,
                     strerror(errno));
        close(mis->userfault_fd);
        return -1;
    }

    qemu_sem_init(&mis->fault_thread_sem, 0);
    qemu_thread_create(&mis->fault_thread, "postcopy/fault",
                       postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
    qemu_sem_wait(&mis->fault_thread_sem);
    qemu_sem_destroy(&mis->fault_thread_sem);
    mis->have_fault_thread = true;

    /* Mark so that we get notified of accesses to unwritten areas */
    if (qemu_ram_foreach_block(ram_block_enable_notify, mis)) {
        return -1;
    }

    /*
     * Ballooning can mark pages as absent while we're postcopying;
     * that would cause false userfaults.
     */
    qemu_balloon_inhibit(true);

    trace_postcopy_ram_enable_notify();

    return 0;
}

static int qemu_ufd_copy_ioctl(int userfault_fd, void *host_addr,
                               void *from_addr, uint64_t pagesize, RAMBlock *rb)
{
    int ret;
    if (from_addr) {
        struct uffdio_copy copy_struct;
        copy_struct.dst = (uint64_t)(uintptr_t)host_addr;
        copy_struct.src = (uint64_t)(uintptr_t)from_addr;
        copy_struct.len = pagesize;
        copy_struct.mode = 0;
        ret = ioctl(userfault_fd, UFFDIO_COPY, &copy_struct);
    } else {
        struct uffdio_zeropage zero_struct;
        zero_struct.range.start = (uint64_t)(uintptr_t)host_addr;
        zero_struct.range.len = pagesize;
        zero_struct.mode = 0;
        ret = ioctl(userfault_fd, UFFDIO_ZEROPAGE, &zero_struct);
    }
    if (!ret) {
        ramblock_recv_bitmap_set_range(rb, host_addr,
                                       pagesize / qemu_target_page_size());
    }
    return ret;
}
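
/*
 * For reference (illustrative sketch, not part of the original file): the raw
 * kernel ABI used above.  UFFDIO_COPY atomically copies 'len' bytes into the
 * faulting range and wakes any threads blocked on it, unless
 * UFFDIO_COPY_MODE_DONTWAKE is set (the cheaper variant hinted at by the TODO
 * in postcopy_place_page() below):
 *
 *     #include <string.h>
 *     #include <sys/ioctl.h>
 *     #include <linux/userfaultfd.h>
 *
 *     // hypothetical helper: place one page of 'src' at 'dst' in a
 *     // region already registered with UFFDIO_REGISTER_MODE_MISSING
 *     static int place_one_page(int ufd, void *dst, void *src, size_t pagesize)
 *     {
 *         struct uffdio_copy copy;
 *
 *         memset(&copy, 0, sizeof(copy));
 *         copy.dst = (uintptr_t)dst;
 *         copy.src = (uintptr_t)src;
 *         copy.len = pagesize;
 *         copy.mode = 0;          // 0 => wake the faulting thread(s) too
 *         return ioctl(ufd, UFFDIO_COPY, &copy);
 *     }
 */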

int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset)
{
    int i;
    MigrationIncomingState *mis = migration_incoming_get_current();
    GArray *pcrfds = mis->postcopy_remote_fds;

    for (i = 0; i < pcrfds->len; i++) {
        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
        int ret = cur->waker(cur, rb, offset);
        if (ret) {
            return ret;
        }
    }
    return 0;
}

/*
 * Place a host page (from) at (host) atomically
 * returns 0 on success
 */
int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);

    /* copy also acks to the kernel waking the stalled thread up
     * TODO: We can inhibit that ack and only do it if it was requested
     * which would be slightly cheaper, but we'd have to be careful
     * of the order of updating our page state.
     */
    if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, from, pagesize, rb)) {
        int e = errno;
        error_report("%s: %s copy host: %p from: %p (size: %zd)",
                     __func__, strerror(e), host, from, pagesize);

        return -e;
    }

    trace_postcopy_place_page(host);
    return postcopy_notify_shared_wake(rb,
                                       qemu_ram_block_host_offset(rb, host));
}

/*
 * Place a zero page at (host) atomically
 * returns 0 on success
 */
int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    trace_postcopy_place_page_zero(host);

    /* Normal RAMBlocks can zero a page using UFFDIO_ZEROPAGE
     * but it's not available for everything (e.g. hugetlbpages)
     */
    if (qemu_ram_is_uf_zeroable(rb)) {
        if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, NULL, pagesize, rb)) {
            int e = errno;
            error_report("%s: %s zero host: %p",
                         __func__, strerror(e), host);

            return -e;
        }
        return postcopy_notify_shared_wake(rb,
                                           qemu_ram_block_host_offset(rb,
                                                                      host));
    } else {
        /* The kernel can't use UFFDIO_ZEROPAGE for hugepages */
        if (!mis->postcopy_tmp_zero_page) {
            mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
                                               PROT_READ | PROT_WRITE,
                                               MAP_PRIVATE | MAP_ANONYMOUS,
                                               -1, 0);
            if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
                int e = errno;
                mis->postcopy_tmp_zero_page = NULL;
                error_report("%s: %s mapping large zero page",
                             __func__, strerror(e));
                return -e;
            }
            memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
        }
        return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page,
                                   rb);
    }
}

/*
 * Returns a target page of memory that can be mapped at a later point in time
 * using postcopy_place_page
 * The same address is used repeatedly; postcopy_place_page just takes the
 * backing page away.
 * Returns: Pointer to allocated page
 *
 */
void *postcopy_get_tmp_page(MigrationIncomingState *mis)
{
    if (!mis->postcopy_tmp_page) {
        mis->postcopy_tmp_page = mmap(NULL, mis->largest_page_size,
                                      PROT_READ | PROT_WRITE, MAP_PRIVATE |
                                      MAP_ANONYMOUS, -1, 0);
        if (mis->postcopy_tmp_page == MAP_FAILED) {
            mis->postcopy_tmp_page = NULL;
            error_report("%s: %s", __func__, strerror(errno));
            return NULL;
        }
    }

    return mis->postcopy_tmp_page;
}

#else
/* No target OS support, stubs just fail */
bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
{
    error_report("%s: No OS support", __func__);
    return false;
}

int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
{
    error_report("postcopy_ram_incoming_init: No OS support");
    return -1;
}

int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    assert(0);
    return -1;
}

int postcopy_ram_enable_notify(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    assert(0);
    return -1;
}

int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    assert(0);
    return -1;
}

void *postcopy_get_tmp_page(MigrationIncomingState *mis)
{
    assert(0);
    return NULL;
}

int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    assert(0);
    return -1;
}
#endif

/* ------------------------------------------------------------------------- */

void postcopy_fault_thread_notify(MigrationIncomingState *mis)
{
    uint64_t tmp64 = 1;

    /*
     * Wakeup the fault_thread.  It's an eventfd that should currently
     * be at 0, we're going to increment it to 1
     */
    if (write(mis->userfault_event_fd, &tmp64, 8) != 8) {
        /* Not much we can do here, but may as well report it */
        error_report("%s: incrementing failed: %s", __func__,
                     strerror(errno));
    }
}
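
/*
 * Illustrative sketch (not part of the original file) of the eventfd pairing
 * used above: postcopy_fault_thread_notify() writes an 8-byte counter
 * increment, and postcopy_ram_fault_thread() consumes it with an 8-byte read
 * once poll() reports the fd readable.
 *
 *     #include <stdint.h>
 *     #include <sys/eventfd.h>
 *     #include <unistd.h>
 *
 *     int efd = eventfd(0, EFD_CLOEXEC);   // counter starts at 0
 *
 *     // waker side: add 1 to the counter, making poll() return POLLIN
 *     uint64_t one = 1;
 *     write(efd, &one, sizeof(one));
 *
 *     // waiter side: read returns the accumulated count and resets it to 0
 *     uint64_t val;
 *     read(efd, &val, sizeof(val));
 */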

/**
 * postcopy_discard_send_init: Called at the start of each RAMBlock before
 *   asking to discard individual ranges.
 *
 * @ms: The current migration state.
 * @name: RAMBlock that discards will operate on.
 *
 * returns: a new PDS.
 */
PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
                                                 const char *name)
{
    PostcopyDiscardState *res = g_malloc0(sizeof(PostcopyDiscardState));

    if (res) {
        res->ramblock_name = name;
    }

    return res;
}

/**
 * postcopy_discard_send_range: Called by the bitmap code for each chunk to
 *   discard. May send a discard message, may just leave it queued to
 *   be sent later.
 *
 * @ms: Current migration state.
 * @pds: Structure initialised by postcopy_discard_send_init().
 * @start,@length: a range of pages in the migration bitmap in the
 *   RAM block passed to postcopy_discard_send_init() (length=1 is one page)
 */
void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds,
                                 unsigned long start, unsigned long length)
{
    size_t tp_size = qemu_target_page_size();
    /* Convert to byte offsets within the RAM block */
    pds->start_list[pds->cur_entry] = start * tp_size;
    pds->length_list[pds->cur_entry] = length * tp_size;
    trace_postcopy_discard_send_range(pds->ramblock_name, start, length);
    pds->cur_entry++;
    pds->nsentwords++;

    if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) {
        /* Full set, ship it! */
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds->ramblock_name,
                                              pds->cur_entry,
                                              pds->start_list,
                                              pds->length_list);
        pds->nsentcmds++;
        pds->cur_entry = 0;
    }
}
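
/*
 * Worked example (illustrative, not part of the original file): with a 4KiB
 * target page size, a call such as
 *
 *     postcopy_discard_send_range(ms, pds, 3, 2);
 *
 * queues a discard starting at byte offset 3 * 0x1000 = 0x3000 into the
 * RAMBlock, covering 2 * 0x1000 = 0x2000 bytes; the command itself is only
 * shipped once MAX_DISCARDS_PER_COMMAND entries accumulate or when
 * postcopy_discard_send_finish() flushes the remainder.
 */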

/**
 * postcopy_discard_send_finish: Called at the end of each RAMBlock by the
 *   bitmap code. Sends any outstanding discard messages, frees the PDS
 *
 * @ms: Current migration state.
 * @pds: Structure initialised by postcopy_discard_send_init().
 */
void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds)
{
    /* Anything unsent? */
    if (pds->cur_entry) {
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds->ramblock_name,
                                              pds->cur_entry,
                                              pds->start_list,
                                              pds->length_list);
        pds->nsentcmds++;
    }

    trace_postcopy_discard_send_finish(pds->ramblock_name, pds->nsentwords,
                                       pds->nsentcmds);

    g_free(pds);
}

/*
 * Current state of incoming postcopy; note this is not part of
 * MigrationIncomingState since its state is used during cleanup
 * at the end as MIS is being freed.
 */
static PostcopyState incoming_postcopy_state;

PostcopyState postcopy_state_get(void)
{
    return atomic_mb_read(&incoming_postcopy_state);
}

/* Set the state and return the old state */
PostcopyState postcopy_state_set(PostcopyState new_state)
{
    return atomic_xchg(&incoming_postcopy_state, new_state);
}

/* Register a handler for external shared memory postcopy
 * called on the destination.
 */
void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
                                                  *pcfd);
}

/* Unregister a handler for external shared memory postcopy
 */
void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
{
    guint i;
    MigrationIncomingState *mis = migration_incoming_get_current();
    GArray *pcrfds = mis->postcopy_remote_fds;

    for (i = 0; i < pcrfds->len; i++) {
        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
        if (cur->fd == pcfd->fd) {
            mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
            return;
        }
    }
}