]> git.proxmox.com Git - mirror_qemu.git/blame - migration/migration.c
migration: Move return path cleanup to main migration thread
[mirror_qemu.git] / migration / migration.c
CommitLineData
5bb7910a
AL
1/*
2 * QEMU live migration
3 *
4 * Copyright IBM, Corp. 2008
5 *
6 * Authors:
7 * Anthony Liguori <aliguori@us.ibm.com>
8 *
9 * This work is licensed under the terms of the GNU GPL, version 2. See
10 * the COPYING file in the top-level directory.
11 *
6b620ca3
PB
12 * Contributions after 2012-01-13 are licensed under the terms of the
13 * GNU GPL, version 2 or (at your option) any later version.
5bb7910a
AL
14 */
15
1393a485 16#include "qemu/osdep.h"
f348b6d1 17#include "qemu/cutils.h"
d49b6836 18#include "qemu/error-report.h"
db725815 19#include "qemu/main-loop.h"
795c40b8 20#include "migration/blocker.h"
f4dbe1bf 21#include "exec.h"
7fcac4a2 22#include "fd.h"
61e8b148 23#include "socket.h"
54d31236 24#include "sysemu/runstate.h"
46517dd4 25#include "sysemu/sysemu.h"
b0c3cf94 26#include "sysemu/cpu-throttle.h"
e1a3ecee 27#include "rdma.h"
7b1e1a22 28#include "ram.h"
c323518a 29#include "ram-compress.h"
84a899de 30#include "migration/global_state.h"
c4b63b7c 31#include "migration/misc.h"
6666c96a 32#include "migration.h"
947701cc 33#include "migration-stats.h"
20a519a0 34#include "savevm.h"
08a0aee1 35#include "qemu-file.h"
6720c2b3 36#include "channel.h"
987772d9 37#include "migration/vmstate.h"
737e150e 38#include "block/block.h"
e688df6b 39#include "qapi/error.h"
9aca82ba 40#include "qapi/clone-visitor.h"
31e4c354 41#include "qapi/qapi-visit-migration.h"
9aca82ba 42#include "qapi/qapi-visit-sockets.h"
9af23989
MA
43#include "qapi/qapi-commands-migration.h"
44#include "qapi/qapi-events-migration.h"
cc7a8ea7 45#include "qapi/qmp/qerror.h"
15280c36 46#include "qapi/qmp/qnull.h"
ab28bd23 47#include "qemu/rcu.h"
2c9e6fec 48#include "block.h"
be07b0ac 49#include "postcopy-ram.h"
766bd176 50#include "qemu/thread.h"
c09e5bb1 51#include "trace.h"
51180423 52#include "exec/target_page.h"
61b67d47 53#include "io/channel-buffer.h"
85a8578e 54#include "io/channel-tls.h"
35a6ed4f 55#include "migration/colo.h"
4ffdb337 56#include "hw/boards.h"
9d18af93 57#include "monitor/monitor.h"
50510ea2 58#include "net/announce.h"
c7e0acd5 59#include "qemu/queue.h"
d32ca5ad 60#include "multifd.h"
1b1f4ab6 61#include "threadinfo.h"
b5eea99e 62#include "qemu/yank.h"
6e8c25b4 63#include "sysemu/cpus.h"
39675fff 64#include "yank_functions.h"
1b529d90 65#include "sysemu/qtest.h"
1f0776f1 66#include "options.h"
15699cf5 67#include "sysemu/dirtylimit.h"
065e2813 68
99a0db9b
GH
69static NotifierList migration_state_notifiers =
70 NOTIFIER_LIST_INITIALIZER(migration_state_notifiers);
71
da6f1790
JQ
/*
 * Message types carried on the return path, i.e. sent from the
 * migration destination back to the source.
 */
enum mig_rp_message_type {
    MIG_RP_MSG_INVALID = 0,     /* Must be 0 */
    MIG_RP_MSG_SHUT,            /* sibling will not send any more RP messages */
    MIG_RP_MSG_PONG,            /* Response to a PING; data (seq: be32) */

    MIG_RP_MSG_REQ_PAGES_ID,    /* data (start: be64, len: be32, id: string) */
    MIG_RP_MSG_REQ_PAGES,       /* data (start: be64, len: be32) */
    MIG_RP_MSG_RECV_BITMAP,     /* send recved_bitmap back to source */
    MIG_RP_MSG_RESUME_ACK,      /* tell source that we are ready to resume */
    MIG_RP_MSG_SWITCHOVER_ACK,  /* tell source it's OK to do switchover */

    MIG_RP_MSG_MAX
};
86
17549e84
JQ
87/* When we add fault tolerance, we could have several
88 migrations at once. For now we don't need to add
89 dynamic creation of migration */
90
e5cb7e76 91static MigrationState *current_migration;
e1b1b1bc 92static MigrationIncomingState *current_incoming;
e5cb7e76 93
3af8554b
DDAG
94static GSList *migration_blockers;
95
8b0b29dc 96static bool migration_object_check(MigrationState *ms, Error **errp);
0331c8ca
DDAG
97static int migration_maybe_pause(MigrationState *s,
98 int *current_active_state,
99 int new_state);
892ae715 100static void migrate_fd_cancel(MigrationState *s);
36e9aab3 101static int await_return_path_close_on_source(MigrationState *s);
8b0b29dc 102
d6f74fd1
PX
/* True when multifd or postcopy-preempt is enabled. */
static bool migration_needs_multiple_sockets(void)
{
    if (migrate_multifd()) {
        return true;
    }
    return migrate_postcopy_preempt();
}
f444eeda 107
/* True when @uri starts with one of the "tcp:", "unix:" or "vsock:" prefixes. */
static bool uri_supports_multi_channels(const char *uri)
{
    static const char *const prefixes[] = { "tcp:", "unix:", "vsock:" };
    size_t i;

    for (i = 0; i < sizeof(prefixes) / sizeof(prefixes[0]); i++) {
        if (strstart(uri, prefixes[i], NULL)) {
            return true;
        }
    }
    return false;
}
113
d6f74fd1
PX
114static bool
115migration_channels_and_uri_compatible(const char *uri, Error **errp)
f444eeda 116{
d6f74fd1
PX
117 if (migration_needs_multiple_sockets() &&
118 !uri_supports_multi_channels(uri)) {
119 error_setg(errp, "Migration requires multi-channel URIs (e.g. tcp)");
120 return false;
121 }
122
123 return true;
f444eeda
PX
124}
125
8f8bfffc
PX
126static gint page_request_addr_cmp(gconstpointer ap, gconstpointer bp)
127{
128 uintptr_t a = (uintptr_t) ap, b = (uintptr_t) bp;
129
130 return (a > b) - (a < b);
131}
132
e5cb7e76
PX
133void migration_object_init(void)
134{
135 /* This can only be called once. */
136 assert(!current_migration);
137 current_migration = MIGRATION_OBJ(object_new(TYPE_MIGRATION));
4ffdb337 138
e1b1b1bc
PX
139 /*
140 * Init the migrate incoming object as well no matter whether
141 * we'll use it or not.
142 */
143 assert(!current_incoming);
144 current_incoming = g_new0(MigrationIncomingState, 1);
145 current_incoming->state = MIGRATION_STATUS_NONE;
146 current_incoming->postcopy_remote_fds =
147 g_array_new(FALSE, TRUE, sizeof(struct PostCopyFD));
148 qemu_mutex_init(&current_incoming->rp_mutex);
60bb3c58 149 qemu_mutex_init(&current_incoming->postcopy_prio_thread_mutex);
e1b1b1bc
PX
150 qemu_event_init(&current_incoming->main_thread_load_event, false);
151 qemu_sem_init(&current_incoming->postcopy_pause_sem_dst, 0);
152 qemu_sem_init(&current_incoming->postcopy_pause_sem_fault, 0);
60bb3c58 153 qemu_sem_init(&current_incoming->postcopy_pause_sem_fast_load, 0);
5655aab0
PX
154 qemu_sem_init(&current_incoming->postcopy_qemufile_dst_done, 0);
155
8f8bfffc 156 qemu_mutex_init(&current_incoming->page_request_mutex);
cf02f29e 157 qemu_cond_init(&current_incoming->page_request_cond);
8f8bfffc 158 current_incoming->page_requested = g_tree_new(page_request_addr_cmp);
e1b1b1bc 159
f9734d5d 160 migration_object_check(current_migration, &error_fatal);
e0d17dfd
PB
161
162 blk_mig_init();
163 ram_mig_init();
164 dirty_bitmap_mig_init();
e5cb7e76
PX
165}
166
458fecca 167void migration_cancel(const Error *error)
c7c0e724 168{
458fecca
LV
169 if (error) {
170 migrate_set_error(current_migration, error);
171 }
acac51ba
HH
172 if (migrate_dirty_limit()) {
173 qmp_cancel_vcpu_dirty_limit(false, -1, NULL);
174 }
c7c0e724
DH
175 migrate_fd_cancel(current_migration);
176}
177
892ae715 178void migration_shutdown(void)
1f895604 179{
795969ab
RL
180 /*
181 * When the QEMU main thread exit, the COLO thread
182 * may wait a semaphore. So, we should wakeup the
183 * COLO thread before migration shutdown.
184 */
185 colo_shutdown();
892ae715
DDAG
186 /*
187 * Cancel the current migration - that will (eventually)
188 * stop the migration using this structure
189 */
458fecca 190 migration_cancel(NULL);
1f895604 191 object_unref(OBJECT(current_migration));
1499ab09
VSO
192
193 /*
194 * Cancel outgoing migration of dirty bitmaps. It should
195 * at least unref used block nodes.
196 */
197 dirty_bitmap_mig_cancel_outgoing();
198
199 /*
200 * Cancel incoming migration of dirty bitmaps. Dirty bitmaps
201 * are non-critical data, and their loss never considered as
202 * something serious.
203 */
204 dirty_bitmap_mig_cancel_incoming();
1f895604
VSO
205}
206
bca7856a 207/* For outgoing */
859bc756 208MigrationState *migrate_get_current(void)
17549e84 209{
e5cb7e76
PX
210 /* This can only be called after the object created. */
211 assert(current_migration);
212 return current_migration;
17549e84
JQ
213}
214
bca7856a
DDAG
215MigrationIncomingState *migration_incoming_get_current(void)
216{
e1b1b1bc
PX
217 assert(current_incoming);
218 return current_incoming;
bca7856a
DDAG
219}
220
e031149c
PX
221void migration_incoming_transport_cleanup(MigrationIncomingState *mis)
222{
223 if (mis->socket_address_list) {
224 qapi_free_SocketAddressList(mis->socket_address_list);
225 mis->socket_address_list = NULL;
226 }
227
228 if (mis->transport_cleanup) {
229 mis->transport_cleanup(mis->transport_data);
230 mis->transport_data = mis->transport_cleanup = NULL;
231 }
232}
233
bca7856a
DDAG
234void migration_incoming_state_destroy(void)
235{
b4b076da
JQ
236 struct MigrationIncomingState *mis = migration_incoming_get_current();
237
cfc3bcf3 238 multifd_load_cleanup();
c323518a 239 compress_threads_load_cleanup();
cfc3bcf3 240
3482655b 241 if (mis->to_src_file) {
660819b1
PX
242 /* Tell source that we are done */
243 migrate_send_rp_shut(mis, qemu_file_get_error(mis->from_src_file) != 0);
3482655b
PX
244 qemu_fclose(mis->to_src_file);
245 mis->to_src_file = NULL;
246 }
247
660819b1 248 if (mis->from_src_file) {
39675fff 249 migration_ioc_unregister_yank_from_file(mis->from_src_file);
660819b1
PX
250 qemu_fclose(mis->from_src_file);
251 mis->from_src_file = NULL;
252 }
00fa4fc8
DDAG
253 if (mis->postcopy_remote_fds) {
254 g_array_free(mis->postcopy_remote_fds, TRUE);
255 mis->postcopy_remote_fds = NULL;
256 }
660819b1 257
e031149c 258 migration_incoming_transport_cleanup(mis);
1783c00f
DDAG
259 qemu_event_reset(&mis->main_thread_load_event);
260
8f8bfffc
PX
261 if (mis->page_requested) {
262 g_tree_destroy(mis->page_requested);
263 mis->page_requested = NULL;
264 }
265
36f62f11
PX
266 if (mis->postcopy_qemufile_dst) {
267 migration_ioc_unregister_yank_from_file(mis->postcopy_qemufile_dst);
268 qemu_fclose(mis->postcopy_qemufile_dst);
269 mis->postcopy_qemufile_dst = NULL;
270 }
271
b5eea99e 272 yank_unregister_instance(MIGRATION_YANK_INSTANCE);
bca7856a
DDAG
273}
274
b05dc723
JQ
/* Emit a migration QAPI event for @new_state if events are enabled. */
static void migrate_generate_event(int new_state)
{
    if (!migrate_events()) {
        return;
    }
    qapi_event_send_migration(new_state);
}
281
da6f1790
JQ
282/*
283 * Send a message on the return channel back to the source
284 * of the migration.
285 */
d6208e35
PX
286static int migrate_send_rp_message(MigrationIncomingState *mis,
287 enum mig_rp_message_type message_type,
288 uint16_t len, void *data)
da6f1790 289{
d6208e35
PX
290 int ret = 0;
291
da6f1790 292 trace_migrate_send_rp_message((int)message_type, len);
37396950 293 QEMU_LOCK_GUARD(&mis->rp_mutex);
d6208e35
PX
294
295 /*
296 * It's possible that the file handle got lost due to network
297 * failures.
298 */
299 if (!mis->to_src_file) {
300 ret = -EIO;
37396950 301 return ret;
d6208e35
PX
302 }
303
da6f1790
JQ
304 qemu_put_be16(mis->to_src_file, (unsigned int)message_type);
305 qemu_put_be16(mis->to_src_file, len);
306 qemu_put_buffer(mis->to_src_file, data, len);
307 qemu_fflush(mis->to_src_file);
d6208e35
PX
308
309 /* It's possible that qemu file got error during sending */
310 ret = qemu_file_get_error(mis->to_src_file);
311
d6208e35 312 return ret;
da6f1790
JQ
313}
314
2e2bce16
PX
315/* Request one page from the source VM at the given start address.
316 * rb: the RAMBlock to request the page in
1e2d90eb
DDAG
317 * Start: Address offset within the RB
318 * Len: Length in bytes required - must be a multiple of pagesize
319 */
7a267fc4
PX
320int migrate_send_rp_message_req_pages(MigrationIncomingState *mis,
321 RAMBlock *rb, ram_addr_t start)
1e2d90eb 322{
cb8d4c8f 323 uint8_t bufc[12 + 1 + 255]; /* start (8), len (4), rbname up to 256 */
1e2d90eb 324 size_t msglen = 12; /* start + len */
2e2bce16 325 size_t len = qemu_ram_pagesize(rb);
d6208e35 326 enum mig_rp_message_type msg_type;
2e2bce16
PX
327 const char *rbname;
328 int rbname_len;
1e2d90eb
DDAG
329
330 *(uint64_t *)bufc = cpu_to_be64((uint64_t)start);
331 *(uint32_t *)(bufc + 8) = cpu_to_be32((uint32_t)len);
332
2e2bce16
PX
333 /*
334 * We maintain the last ramblock that we requested for page. Note that we
335 * don't need locking because this function will only be called within the
336 * postcopy ram fault thread.
337 */
338 if (rb != mis->last_rb) {
339 mis->last_rb = rb;
340
341 rbname = qemu_ram_get_idstr(rb);
342 rbname_len = strlen(rbname);
343
1e2d90eb
DDAG
344 assert(rbname_len < 256);
345
346 bufc[msglen++] = rbname_len;
347 memcpy(bufc + msglen, rbname, rbname_len);
348 msglen += rbname_len;
d6208e35 349 msg_type = MIG_RP_MSG_REQ_PAGES_ID;
1e2d90eb 350 } else {
d6208e35 351 msg_type = MIG_RP_MSG_REQ_PAGES;
1e2d90eb 352 }
d6208e35
PX
353
354 return migrate_send_rp_message(mis, msg_type, msglen, bufc);
1e2d90eb
DDAG
355}
356
7a267fc4 357int migrate_send_rp_req_pages(MigrationIncomingState *mis,
8f8bfffc 358 RAMBlock *rb, ram_addr_t start, uint64_t haddr)
7a267fc4 359{
7648297d 360 void *aligned = (void *)(uintptr_t)ROUND_DOWN(haddr, qemu_ram_pagesize(rb));
a2429283 361 bool received = false;
8f8bfffc
PX
362
363 WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
364 received = ramblock_recv_bitmap_test_byte_offset(rb, start);
365 if (!received && !g_tree_lookup(mis->page_requested, aligned)) {
366 /*
367 * The page has not been received, and it's not yet in the page
368 * request list. Queue it. Set the value of element to 1, so that
369 * things like g_tree_lookup() will return TRUE (1) when found.
370 */
371 g_tree_insert(mis->page_requested, aligned, (gpointer)1);
cf02f29e 372 qatomic_inc(&mis->page_requested_count);
8f8bfffc
PX
373 trace_postcopy_page_req_add(aligned, mis->page_requested_count);
374 }
375 }
376
377 /*
378 * If the page is there, skip sending the message. We don't even need the
379 * lock because as long as the page arrived, it'll be there forever.
380 */
381 if (received) {
382 return 0;
383 }
384
7a267fc4
PX
385 return migrate_send_rp_message_req_pages(mis, rb, start);
386}
387
aad555c2
ZC
/* Set while COLO is enabled for the incoming migration. */
static bool migration_colo_enabled;

/* Report whether COLO is currently enabled on the incoming side. */
bool migration_incoming_colo_enabled(void)
{
    return migration_colo_enabled;
}
393
394void migration_incoming_disable_colo(void)
395{
18b1d3c9 396 ram_block_discard_disable(false);
aad555c2
ZC
397 migration_colo_enabled = false;
398}
399
18b1d3c9 400int migration_incoming_enable_colo(void)
aad555c2 401{
51e47cf8
VSO
402#ifndef CONFIG_REPLICATION
403 error_report("ENABLE_COLO command come in migration stream, but COLO "
404 "module is not built in");
405 return -ENOTSUP;
406#endif
407
121ccedc
VSO
408 if (!migrate_colo()) {
409 error_report("ENABLE_COLO command come in migration stream, but c-colo "
410 "capability is not set");
411 return -EINVAL;
412 }
413
18b1d3c9
DH
414 if (ram_block_discard_disable(true)) {
415 error_report("COLO: cannot disable RAM discard");
416 return -EBUSY;
417 }
aad555c2 418 migration_colo_enabled = true;
18b1d3c9 419 return 0;
aad555c2
ZC
420}
421
9aca82ba
JQ
422void migrate_add_address(SocketAddress *address)
423{
424 MigrationIncomingState *mis = migration_incoming_get_current();
9aca82ba 425
54aa3de7
EB
426 QAPI_LIST_PREPEND(mis->socket_address_list,
427 QAPI_CLONE(SocketAddress, address));
9aca82ba
JQ
428}
429
e69d50d6 430static void qemu_start_incoming_migration(const char *uri, Error **errp)
5bb7910a 431{
d658f65c 432 const char *p = NULL;
34c9dd8e 433
d6f74fd1
PX
434 /* URI is not suitable for migration? */
435 if (!migration_channels_and_uri_compatible(uri, errp)) {
436 return;
437 }
438
3ab72385 439 qapi_event_send_migration(MIGRATION_STATUS_SETUP);
e69d50d6
PB
440 if (strstart(uri, "tcp:", &p) ||
441 strstart(uri, "unix:", NULL) ||
442 strstart(uri, "vsock:", NULL)) {
d658f65c 443 socket_start_incoming_migration(p ? p : uri, errp);
2da776db 444#ifdef CONFIG_RDMA
adde220a 445 } else if (strstart(uri, "rdma:", &p)) {
2da776db
MH
446 rdma_start_incoming_migration(p, errp);
447#endif
adde220a 448 } else if (strstart(uri, "exec:", &p)) {
43eaae28 449 exec_start_incoming_migration(p, errp);
adde220a 450 } else if (strstart(uri, "fd:", &p)) {
43eaae28 451 fd_start_incoming_migration(p, errp);
adde220a 452 } else {
312fd5f2 453 error_setg(errp, "unknown migration protocol: %s", uri);
8ca5e801 454 }
5bb7910a
AL
455}
456
0aa6aefc
DL
457static void process_incoming_migration_bh(void *opaque)
458{
459 Error *local_err = NULL;
460 MigrationIncomingState *mis = opaque;
461
0f073f44
DDAG
462 /* If capability late_block_activate is set:
463 * Only fire up the block code now if we're going to restart the
464 * VM, else 'cont' will do it.
465 * This causes file locking to happen; so we don't want it to happen
466 * unless we really are starting the VM.
467 */
468 if (!migrate_late_block_activate() ||
469 (autostart && (!global_state_received() ||
470 global_state_get_runstate() == RUN_STATE_RUNNING))) {
3b717194 471 /* Make sure all file formats throw away their mutable metadata.
0f073f44 472 * If we get an error here, just don't restart the VM yet. */
3b717194 473 bdrv_activate_all(&local_err);
0f073f44
DDAG
474 if (local_err) {
475 error_report_err(local_err);
476 local_err = NULL;
477 autostart = false;
478 }
d35ff5e6
KW
479 }
480
0aa6aefc
DL
481 /*
482 * This must happen after all error conditions are dealt with and
483 * we're sure the VM is going to be running on this host.
484 */
7659505c 485 qemu_announce_self(&mis->announce_timer, migrate_announce_params());
0aa6aefc 486
cfc3bcf3 487 multifd_load_shutdown();
0aa6aefc 488
b35ebdf0
VSO
489 dirty_bitmap_mig_before_vm_start();
490
0aa6aefc
DL
491 if (!global_state_received() ||
492 global_state_get_runstate() == RUN_STATE_RUNNING) {
493 if (autostart) {
494 vm_start();
495 } else {
496 runstate_set(RUN_STATE_PAUSED);
497 }
db009729
ZC
498 } else if (migration_incoming_colo_enabled()) {
499 migration_incoming_disable_colo();
500 vm_start();
0aa6aefc
DL
501 } else {
502 runstate_set(global_state_get_runstate());
503 }
0aa6aefc
DL
504 /*
505 * This must happen after any state changes since as soon as an external
506 * observer sees this event they might start to prod at the VM assuming
507 * it's ready to use.
508 */
509 migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
510 MIGRATION_STATUS_COMPLETED);
511 qemu_bh_delete(mis->bh);
512 migration_incoming_state_destroy();
513}
514
38e8f9af
MAL
515static void coroutine_fn
516process_incoming_migration_co(void *opaque)
511c0231 517{
b4b076da 518 MigrationIncomingState *mis = migration_incoming_get_current();
e9bef235 519 PostcopyState ps;
1c12e1f5
PB
520 int ret;
521
4f0fae7f 522 assert(mis->from_src_file);
c323518a
LS
523
524 if (compress_threads_load_setup(mis->from_src_file)) {
525 error_report("Failed to setup decompress threads");
526 goto fail;
527 }
528
67f11b5c 529 mis->largest_page_size = qemu_ram_pagesize_largest();
093e3c42 530 postcopy_state_set(POSTCOPY_INCOMING_NONE);
93d7af6f
HZ
531 migrate_set_state(&mis->state, MIGRATION_STATUS_NONE,
532 MIGRATION_STATUS_ACTIVE);
dd42ce24
VSO
533
534 mis->loadvm_co = qemu_coroutine_self();
4f0fae7f 535 ret = qemu_loadvm_state(mis->from_src_file);
dd42ce24 536 mis->loadvm_co = NULL;
bca7856a 537
e9bef235
DDAG
538 ps = postcopy_state_get();
539 trace_process_incoming_migration_co_end(ret, ps);
540 if (ps != POSTCOPY_INCOMING_NONE) {
541 if (ps == POSTCOPY_INCOMING_ADVISE) {
542 /*
543 * Where a migration had postcopy enabled (and thus went to advise)
544 * but managed to complete within the precopy period, we can use
545 * the normal exit.
546 */
547 postcopy_ram_incoming_cleanup(mis);
548 } else if (ret >= 0) {
549 /*
550 * Postcopy was started, cleanup should happen at the end of the
551 * postcopy thread.
552 */
553 trace_process_incoming_migration_co_postcopy_end_main();
554 return;
555 }
556 /* Else if something went wrong then just fall out of the normal exit */
557 }
558
ecbfec6d
VSO
559 if (ret < 0) {
560 error_report("load of migration failed: %s", strerror(-ret));
561 goto fail;
562 }
563
d0a14a2b
VSO
564 if (colo_incoming_co() < 0) {
565 goto fail;
25d0c16f
HZ
566 }
567
0aa6aefc
DL
568 mis->bh = qemu_bh_new(process_incoming_migration_bh, mis);
569 qemu_bh_schedule(mis->bh);
6d99c2d4
FL
570 return;
571fail:
6d99c2d4
FL
572 migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
573 MIGRATION_STATUS_FAILED);
574 qemu_fclose(mis->from_src_file);
e5bac1f5
LB
575
576 multifd_load_cleanup();
c323518a 577 compress_threads_load_cleanup();
e5bac1f5 578
6d99c2d4 579 exit(EXIT_FAILURE);
511c0231
JQ
580}
581
b673eab4 582/**
7d6f6933 583 * migration_incoming_setup: Setup incoming migration
b673eab4
JQ
584 * @f: file for main migration channel
585 * @errp: where to put errors
7d6f6933
MA
586 *
587 * Returns: %true on success, %false on error.
b673eab4 588 */
7d6f6933 589static bool migration_incoming_setup(QEMUFile *f, Error **errp)
82a4da79 590{
4f0fae7f 591 MigrationIncomingState *mis = migration_incoming_get_current();
82a4da79 592
4f0fae7f
JQ
593 if (!mis->from_src_file) {
594 mis->from_src_file = f;
595 }
06ad5135 596 qemu_file_set_blocking(f, false);
7d6f6933 597 return true;
e595a01a
JQ
598}
599
36c2f8be 600void migration_incoming_process(void)
e595a01a
JQ
601{
602 Coroutine *co = qemu_coroutine_create(process_incoming_migration_co, NULL);
0b8b8753 603 qemu_coroutine_enter(co);
82a4da79
PB
604}
605
884835fa 606/* Returns true if recovered from a paused migration, otherwise false */
a39e9339 607static bool postcopy_try_recover(void)
e595a01a 608{
d96c9e8d
PX
609 MigrationIncomingState *mis = migration_incoming_get_current();
610
611 if (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
612 /* Resumed from a paused postcopy migration */
613
a39e9339
PX
614 /* This should be set already in migration_incoming_setup() */
615 assert(mis->from_src_file);
d96c9e8d 616 /* Postcopy has standalone thread to do vm load */
a39e9339 617 qemu_file_set_blocking(mis->from_src_file, true);
d96c9e8d
PX
618
619 /* Re-configure the return path */
a39e9339 620 mis->to_src_file = qemu_file_get_return_path(mis->from_src_file);
d96c9e8d
PX
621
622 migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
623 MIGRATION_STATUS_POSTCOPY_RECOVER);
624
625 /*
626 * Here, we only wake up the main loading thread (while the
60bb3c58 627 * rest threads will still be waiting), so that we can receive
d96c9e8d 628 * commands from source now, and answer it if needed. The
60bb3c58 629 * rest threads will be woken up afterwards until we are sure
d96c9e8d
PX
630 * that source is ready to reply to page requests.
631 */
632 qemu_sem_post(&mis->postcopy_pause_sem_dst);
884835fa
PX
633 return true;
634 }
635
636 return false;
637}
638
b673eab4 639void migration_fd_process_incoming(QEMUFile *f, Error **errp)
884835fa 640{
a39e9339 641 if (!migration_incoming_setup(f, errp)) {
884835fa 642 return;
d96c9e8d 643 }
a39e9339 644 if (postcopy_try_recover()) {
b673eab4
JQ
645 return;
646 }
884835fa 647 migration_incoming_process();
e595a01a
JQ
648}
649
5655aab0
PX
/*
 * Decide whether a new incoming migration process should be started now.
 *
 * @main_channel: whether the channel just created is the main channel
 *
 * Returns true when the incoming migration should start, false otherwise.
 */
static bool migration_should_start_incoming(bool main_channel)
{
    /* Multifd doesn't start unless all channels are established */
    if (migrate_multifd()) {
        return migration_has_all_channels();
    }

    if (!migrate_postcopy_preempt()) {
        /*
         * For all other kinds of migration only the main channel should
         * ever get here, and it always proceeds.
         */
        assert(main_channel);
        return true;
    }

    /* Preempt channel only starts when the main channel is created */
    return main_channel;
}
674
49ed0d24 675void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
4f0fae7f
JQ
676{
677 MigrationIncomingState *mis = migration_incoming_get_current();
b673eab4 678 Error *local_err = NULL;
36f62f11 679 QEMUFile *f;
6720c2b3 680 bool default_channel = true;
681 uint32_t channel_magic = 0;
682 int ret = 0;
4f0fae7f 683
51b07548 684 if (migrate_multifd() && !migrate_postcopy_ram() &&
6720c2b3 685 qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
686 /*
687 * With multiple channels, it is possible that we receive channels
688 * out of order on destination side, causing incorrect mapping of
689 * source channels on destination side. Check channel MAGIC to
690 * decide type of channel. Please note this is best effort, postcopy
691 * preempt channel does not send any magic number so avoid it for
692 * postcopy live migration. Also tls live migration already does
693 * tls handshake while initializing main channel so with tls this
694 * issue is not possible.
695 */
696 ret = migration_channel_read_peek(ioc, (void *)&channel_magic,
697 sizeof(channel_magic), &local_err);
698
699 if (ret != 0) {
700 error_propagate(errp, local_err);
701 return;
702 }
703
704 default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC));
705 } else {
706 default_channel = !mis->from_src_file;
707 }
708
709 if (multifd_load_setup(errp) != 0) {
710 error_setg(errp, "Failed to setup multifd channels");
711 return;
712 }
713
714 if (default_channel) {
36f62f11 715 f = qemu_file_new_input(ioc);
a429e7f4 716
7d6f6933 717 if (!migration_incoming_setup(f, errp)) {
b673eab4
JQ
718 return;
719 }
a429e7f4
PX
720 } else {
721 /* Multiple connections */
36f62f11 722 assert(migration_needs_multiple_sockets());
51b07548 723 if (migrate_multifd()) {
6720c2b3 724 multifd_recv_new_channel(ioc, &local_err);
36f62f11
PX
725 } else {
726 assert(migrate_postcopy_preempt());
727 f = qemu_file_new_input(ioc);
6720c2b3 728 postcopy_preempt_new_channel(mis, f);
36f62f11 729 }
49ed0d24
FL
730 if (local_err) {
731 error_propagate(errp, local_err);
732 return;
733 }
4f0fae7f 734 }
81e62053 735
5655aab0 736 if (migration_should_start_incoming(default_channel)) {
a39e9339
PX
737 /* If it's a recovery, we're done */
738 if (postcopy_try_recover()) {
739 return;
740 }
81e62053
PX
741 migration_incoming_process();
742 }
4f0fae7f
JQ
743}
744
428d8908
JQ
745/**
746 * @migration_has_all_channels: We have received all channels that we need
747 *
748 * Returns true when we have got connections to all the channels that
749 * we need for migration.
750 */
751bool migration_has_all_channels(void)
752{
ca273df3 753 MigrationIncomingState *mis = migration_incoming_get_current();
62c1e0ca 754
36f62f11
PX
755 if (!mis->from_src_file) {
756 return false;
757 }
62c1e0ca 758
51b07548 759 if (migrate_multifd()) {
36f62f11
PX
760 return multifd_recv_all_channels_created();
761 }
762
763 if (migrate_postcopy_preempt()) {
764 return mis->postcopy_qemufile_dst != NULL;
765 }
766
767 return true;
428d8908
JQ
768}
769
1b4adb10
AH
770int migrate_send_rp_switchover_ack(MigrationIncomingState *mis)
771{
772 return migrate_send_rp_message(mis, MIG_RP_MSG_SWITCHOVER_ACK, 0, NULL);
773}
774
6decec93
DDAG
775/*
776 * Send a 'SHUT' message on the return channel with the given value
777 * to indicate that we've finished with the RP. Non-0 value indicates
778 * error.
779 */
780void migrate_send_rp_shut(MigrationIncomingState *mis,
781 uint32_t value)
782{
783 uint32_t buf;
784
785 buf = cpu_to_be32(value);
786 migrate_send_rp_message(mis, MIG_RP_MSG_SHUT, sizeof(buf), &buf);
787}
788
789/*
790 * Send a 'PONG' message on the return channel with the given value
791 * (normally in response to a 'PING')
792 */
793void migrate_send_rp_pong(MigrationIncomingState *mis,
794 uint32_t value)
795{
796 uint32_t buf;
797
798 buf = cpu_to_be32(value);
799 migrate_send_rp_message(mis, MIG_RP_MSG_PONG, sizeof(buf), &buf);
800}
801
a335debb
PX
802void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis,
803 char *block_name)
804{
805 char buf[512];
806 int len;
807 int64_t res;
808
809 /*
810 * First, we send the header part. It contains only the len of
811 * idstr, and the idstr itself.
812 */
813 len = strlen(block_name);
814 buf[0] = len;
815 memcpy(buf + 1, block_name, len);
816
817 if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
818 error_report("%s: MSG_RP_RECV_BITMAP only used for recovery",
819 __func__);
820 return;
821 }
822
823 migrate_send_rp_message(mis, MIG_RP_MSG_RECV_BITMAP, len + 1, buf);
824
825 /*
826 * Next, we dump the received bitmap to the stream.
827 *
828 * TODO: currently we are safe since we are the only one that is
829 * using the to_src_file handle (fault thread is still paused),
830 * and it's ok even not taking the mutex. However the best way is
831 * to take the lock before sending the message header, and release
832 * the lock after sending the bitmap.
833 */
834 qemu_mutex_lock(&mis->rp_mutex);
835 res = ramblock_recv_bitmap_send(mis->to_src_file, block_name);
836 qemu_mutex_unlock(&mis->rp_mutex);
837
838 trace_migrate_send_rp_recv_bitmap(block_name, res);
839}
840
13955b89
PX
841void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value)
842{
843 uint32_t buf;
844
845 buf = cpu_to_be32(value);
846 migrate_send_rp_message(mis, MIG_RP_MSG_RESUME_ACK, sizeof(buf), &buf);
847}
848
f6844b99
DDAG
849/*
850 * Return true if we're already in the middle of a migration
851 * (i.e. any of the active or setup states)
852 */
3d63da16 853bool migration_is_setup_or_active(int state)
f6844b99
DDAG
854{
855 switch (state) {
856 case MIGRATION_STATUS_ACTIVE:
9ec055ae 857 case MIGRATION_STATUS_POSTCOPY_ACTIVE:
a688d2c1 858 case MIGRATION_STATUS_POSTCOPY_PAUSED:
135b87b4 859 case MIGRATION_STATUS_POSTCOPY_RECOVER:
f6844b99 860 case MIGRATION_STATUS_SETUP:
31e06077
DDAG
861 case MIGRATION_STATUS_PRE_SWITCHOVER:
862 case MIGRATION_STATUS_DEVICE:
c7e0acd5 863 case MIGRATION_STATUS_WAIT_UNPLUG:
19dd408a 864 case MIGRATION_STATUS_COLO:
f6844b99
DDAG
865 return true;
866
867 default:
868 return false;
869
870 }
871}
872
392d87e2
JQ
873bool migration_is_running(int state)
874{
875 switch (state) {
876 case MIGRATION_STATUS_ACTIVE:
877 case MIGRATION_STATUS_POSTCOPY_ACTIVE:
878 case MIGRATION_STATUS_POSTCOPY_PAUSED:
879 case MIGRATION_STATUS_POSTCOPY_RECOVER:
880 case MIGRATION_STATUS_SETUP:
881 case MIGRATION_STATUS_PRE_SWITCHOVER:
882 case MIGRATION_STATUS_DEVICE:
883 case MIGRATION_STATUS_WAIT_UNPLUG:
884 case MIGRATION_STATUS_CANCELLING:
392d87e2
JQ
885 return true;
886
887 default:
888 return false;
889
890 }
891}
892
db18dee7
PX
893static bool migrate_show_downtime(MigrationState *s)
894{
895 return (s->state == MIGRATION_STATUS_COMPLETED) || migration_in_postcopy();
896}
897
640dfb14
WY
898static void populate_time_info(MigrationInfo *info, MigrationState *s)
899{
900 info->has_status = true;
901 info->has_setup_time = true;
902 info->setup_time = s->setup_time;
db18dee7 903
640dfb14
WY
904 if (s->state == MIGRATION_STATUS_COMPLETED) {
905 info->has_total_time = true;
906 info->total_time = s->total_time;
640dfb14
WY
907 } else {
908 info->has_total_time = true;
909 info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
910 s->start_time;
db18dee7
PX
911 }
912
913 if (migrate_show_downtime(s)) {
914 info->has_downtime = true;
915 info->downtime = s->downtime;
916 } else {
640dfb14
WY
917 info->has_expected_downtime = true;
918 info->expected_downtime = s->expected_downtime;
919 }
920}
921
a22463a5
DDAG
922static void populate_ram_info(MigrationInfo *info, MigrationState *s)
923{
144fa06b
JQ
924 size_t page_size = qemu_target_page_size();
925
a22463a5 926 info->ram = g_malloc0(sizeof(*info->ram));
aff3f660 927 info->ram->transferred = stat64_get(&mig_stats.transferred);
a22463a5 928 info->ram->total = ram_bytes_total();
aff3f660 929 info->ram->duplicate = stat64_get(&mig_stats.zero_pages);
bedf53c1
JQ
930 /* legacy value. It is not used anymore */
931 info->ram->skipped = 0;
aff3f660 932 info->ram->normal = stat64_get(&mig_stats.normal_pages);
23b7576d 933 info->ram->normal_bytes = info->ram->normal * page_size;
a22463a5 934 info->ram->mbps = s->mbps;
536b5a4e 935 info->ram->dirty_sync_count =
aff3f660 936 stat64_get(&mig_stats.dirty_sync_count);
cf20c897 937 info->ram->dirty_sync_missed_zero_copy =
aff3f660 938 stat64_get(&mig_stats.dirty_sync_missed_zero_copy);
3c764f9b 939 info->ram->postcopy_requests =
aff3f660 940 stat64_get(&mig_stats.postcopy_requests);
144fa06b 941 info->ram->page_size = page_size;
aff3f660 942 info->ram->multifd_bytes = stat64_get(&mig_stats.multifd_bytes);
aecbfe9c 943 info->ram->pages_per_second = s->pages_per_second;
aff3f660
JQ
944 info->ram->precopy_bytes = stat64_get(&mig_stats.precopy_bytes);
945 info->ram->downtime_bytes = stat64_get(&mig_stats.downtime_bytes);
946 info->ram->postcopy_bytes = stat64_get(&mig_stats.postcopy_bytes);
a22463a5 947
87dca0c9 948 if (migrate_xbzrle()) {
114f5aee
JQ
949 info->xbzrle_cache = g_malloc0(sizeof(*info->xbzrle_cache));
950 info->xbzrle_cache->cache_size = migrate_xbzrle_cache_size();
9360447d
JQ
951 info->xbzrle_cache->bytes = xbzrle_counters.bytes;
952 info->xbzrle_cache->pages = xbzrle_counters.pages;
953 info->xbzrle_cache->cache_miss = xbzrle_counters.cache_miss;
954 info->xbzrle_cache->cache_miss_rate = xbzrle_counters.cache_miss_rate;
e460a4b1 955 info->xbzrle_cache->encoding_rate = xbzrle_counters.encoding_rate;
9360447d 956 info->xbzrle_cache->overflow = xbzrle_counters.overflow;
114f5aee
JQ
957 }
958
a7a94d14 959 if (migrate_compress()) {
76e03000
XG
960 info->compression = g_malloc0(sizeof(*info->compression));
961 info->compression->pages = compression_counters.pages;
962 info->compression->busy = compression_counters.busy;
963 info->compression->busy_rate = compression_counters.busy_rate;
964 info->compression->compressed_size =
965 compression_counters.compressed_size;
966 info->compression->compression_rate =
967 compression_counters.compression_rate;
968 }
969
338182c8
JQ
970 if (cpu_throttle_active()) {
971 info->has_cpu_throttle_percentage = true;
972 info->cpu_throttle_percentage = cpu_throttle_get_percentage();
973 }
974
a22463a5
DDAG
975 if (s->state != MIGRATION_STATUS_COMPLETED) {
976 info->ram->remaining = ram_bytes_remaining();
72f8e587 977 info->ram->dirty_pages_rate =
aff3f660 978 stat64_get(&mig_stats.dirty_pages_rate);
a22463a5 979 }
15699cf5
HH
980
981 if (migrate_dirty_limit() && dirtylimit_in_service()) {
982 info->has_dirty_limit_throttle_time_per_round = true;
983 info->dirty_limit_throttle_time_per_round =
984 dirtylimit_throttle_time_per_round();
985
986 info->has_dirty_limit_ring_full_time = true;
987 info->dirty_limit_ring_full_time = dirtylimit_ring_full_time();
988 }
a22463a5
DDAG
989}
990
930ac04c
JQ
991static void populate_disk_info(MigrationInfo *info)
992{
993 if (blk_mig_active()) {
930ac04c
JQ
994 info->disk = g_malloc0(sizeof(*info->disk));
995 info->disk->transferred = blk_mig_bytes_transferred();
996 info->disk->remaining = blk_mig_bytes_remaining();
997 info->disk->total = blk_mig_bytes_total();
998 }
999}
1000
65ace060 1001static void fill_source_migration_info(MigrationInfo *info)
5bb7910a 1002{
17549e84 1003 MigrationState *s = migrate_get_current();
552de79b 1004 int state = qatomic_read(&s->state);
372043f3 1005 GSList *cur_blocker = migration_blockers;
17549e84 1006
3af8554b 1007 info->blocked_reasons = NULL;
3af8554b 1008
372043f3
MA
1009 /*
1010 * There are two types of reasons a migration might be blocked;
1011 * a) devices marked in VMState as non-migratable, and
1012 * b) Explicit migration blockers
1013 * We need to add both of them here.
1014 */
1015 qemu_savevm_non_migratable_list(&info->blocked_reasons);
3af8554b 1016
372043f3
MA
1017 while (cur_blocker) {
1018 QAPI_LIST_PREPEND(info->blocked_reasons,
1019 g_strdup(error_get_pretty(cur_blocker->data)));
1020 cur_blocker = g_slist_next(cur_blocker);
3af8554b 1021 }
372043f3 1022 info->has_blocked_reasons = info->blocked_reasons != NULL;
3af8554b 1023
552de79b 1024 switch (state) {
31194731 1025 case MIGRATION_STATUS_NONE:
17549e84 1026 /* no migration has happened ever */
65ace060
AP
1027 /* do not overwrite destination migration status */
1028 return;
31194731 1029 case MIGRATION_STATUS_SETUP:
29ae8a41 1030 info->has_status = true;
ed4fbd10 1031 info->has_total_time = false;
29ae8a41 1032 break;
31194731
HZ
1033 case MIGRATION_STATUS_ACTIVE:
1034 case MIGRATION_STATUS_CANCELLING:
9ec055ae 1035 case MIGRATION_STATUS_POSTCOPY_ACTIVE:
31e06077
DDAG
1036 case MIGRATION_STATUS_PRE_SWITCHOVER:
1037 case MIGRATION_STATUS_DEVICE:
a688d2c1 1038 case MIGRATION_STATUS_POSTCOPY_PAUSED:
135b87b4 1039 case MIGRATION_STATUS_POSTCOPY_RECOVER:
640dfb14
WY
1040 /* TODO add some postcopy stats */
1041 populate_time_info(info, s);
a22463a5 1042 populate_ram_info(info, s);
930ac04c 1043 populate_disk_info(info);
38c482b4 1044 migration_populate_vfio_info(info);
17549e84 1045 break;
0b827d5e
HZ
1046 case MIGRATION_STATUS_COLO:
1047 info->has_status = true;
1048 /* TODO: display COLO specific information (checkpoint info etc.) */
1049 break;
31194731 1050 case MIGRATION_STATUS_COMPLETED:
640dfb14 1051 populate_time_info(info, s);
a22463a5 1052 populate_ram_info(info, s);
38c482b4 1053 migration_populate_vfio_info(info);
17549e84 1054 break;
31194731 1055 case MIGRATION_STATUS_FAILED:
791e7c82 1056 info->has_status = true;
d59ce6f3 1057 if (s->error) {
d59ce6f3
DB
1058 info->error_desc = g_strdup(error_get_pretty(s->error));
1059 }
17549e84 1060 break;
31194731 1061 case MIGRATION_STATUS_CANCELLED:
791e7c82 1062 info->has_status = true;
17549e84 1063 break;
c7e0acd5
JF
1064 case MIGRATION_STATUS_WAIT_UNPLUG:
1065 info->has_status = true;
1066 break;
5bb7910a 1067 }
552de79b 1068 info->status = state;
5bb7910a
AL
1069}
1070
65ace060
AP
1071static void fill_destination_migration_info(MigrationInfo *info)
1072{
1073 MigrationIncomingState *mis = migration_incoming_get_current();
1074
9aca82ba
JQ
1075 if (mis->socket_address_list) {
1076 info->has_socket_address = true;
1077 info->socket_address =
1078 QAPI_CLONE(SocketAddressList, mis->socket_address_list);
1079 }
1080
65ace060
AP
1081 switch (mis->state) {
1082 case MIGRATION_STATUS_NONE:
1083 return;
65ace060
AP
1084 case MIGRATION_STATUS_SETUP:
1085 case MIGRATION_STATUS_CANCELLING:
1086 case MIGRATION_STATUS_CANCELLED:
1087 case MIGRATION_STATUS_ACTIVE:
1088 case MIGRATION_STATUS_POSTCOPY_ACTIVE:
3c9928d9
PX
1089 case MIGRATION_STATUS_POSTCOPY_PAUSED:
1090 case MIGRATION_STATUS_POSTCOPY_RECOVER:
65ace060
AP
1091 case MIGRATION_STATUS_FAILED:
1092 case MIGRATION_STATUS_COLO:
1093 info->has_status = true;
1094 break;
1095 case MIGRATION_STATUS_COMPLETED:
1096 info->has_status = true;
1097 fill_destination_postcopy_migration_info(info);
1098 break;
1099 }
1100 info->status = mis->state;
1101}
1102
1103MigrationInfo *qmp_query_migrate(Error **errp)
1104{
1105 MigrationInfo *info = g_malloc0(sizeof(*info));
1106
1107 fill_destination_migration_info(info);
1108 fill_source_migration_info(info);
1109
1110 return info;
1111}
1112
4886a1bc
DDAG
1113void qmp_migrate_start_postcopy(Error **errp)
1114{
1115 MigrationState *s = migrate_get_current();
1116
16b0fd32 1117 if (!migrate_postcopy()) {
a54d340b 1118 error_setg(errp, "Enable postcopy with migrate_set_capability before"
4886a1bc
DDAG
1119 " the start of migration");
1120 return;
1121 }
1122
1123 if (s->state == MIGRATION_STATUS_NONE) {
1124 error_setg(errp, "Postcopy must be started after migration has been"
1125 " started");
1126 return;
1127 }
1128 /*
1129 * we don't error if migration has finished since that would be racy
1130 * with issuing this command.
1131 */
d73415a3 1132 qatomic_set(&s->start_postcopy, true);
4886a1bc
DDAG
1133}
1134
065e2813
AL
1135/* shared migration helpers */
1136
48781e5b 1137void migrate_set_state(int *state, int old_state, int new_state)
51cf4c1a 1138{
a31fedee 1139 assert(new_state < MIGRATION_STATUS__MAX);
d73415a3 1140 if (qatomic_cmpxchg(state, old_state, new_state) == old_state) {
a31fedee 1141 trace_migrate_set_state(MigrationStatus_str(new_state));
b05dc723 1142 migrate_generate_event(new_state);
51cf4c1a
Z
1143 }
1144}
1145
/*
 * Tear down the outgoing migration described by @s.
 *
 * Deletes the cleanup bottom half, frees the hostname and vmdesc, runs the
 * savevm cleanup, joins the migration thread and closes the outgoing file
 * if there is one, waits for the return path to close, finishes a pending
 * CANCELLING -> CANCELLED transition, reports any stored error and
 * notifies the migration state notifiers.
 */
static void migrate_fd_cleanup(MigrationState *s)
{
    qemu_bh_delete(s->cleanup_bh);
    s->cleanup_bh = NULL;

    g_free(s->hostname);
    s->hostname = NULL;
    json_writer_free(s->vmdesc);
    s->vmdesc = NULL;

    qemu_savevm_state_cleanup();

    if (s->to_dst_file) {
        QEMUFile *tmp;

        trace_migrate_fd_cleanup();
        /* The iothread lock is dropped around the join and retaken after */
        qemu_mutex_unlock_iothread();
        if (s->migration_thread_running) {
            qemu_thread_join(&s->thread);
            s->migration_thread_running = false;
        }
        qemu_mutex_lock_iothread();

        multifd_save_cleanup();
        /* Detach the file from the state under the file lock ... */
        qemu_mutex_lock(&s->qemu_file_lock);
        tmp = s->to_dst_file;
        s->to_dst_file = NULL;
        qemu_mutex_unlock(&s->qemu_file_lock);
        /*
         * Close the file handle without the lock to make sure the
         * critical section won't block for long.
         */
        migration_ioc_unregister_yank_from_file(tmp);
        qemu_fclose(tmp);
    }

    /*
     * We already cleaned up to_dst_file, so errors from the return
     * path might be due to that, ignore them.
     */
    await_return_path_close_on_source(s);

    assert(!migration_is_active(s));

    /* A cancel request that was still in flight is now complete */
    if (s->state == MIGRATION_STATUS_CANCELLING) {
        migrate_set_state(&s->state, MIGRATION_STATUS_CANCELLING,
                          MIGRATION_STATUS_CANCELLED);
    }

    if (s->error) {
        /* It is used on info migrate.  We can't free it */
        error_report_err(error_copy(s->error));
    }
    notifier_list_notify(&migration_state_notifiers, s);
    block_cleanup_parameters();
    yank_unregister_instance(MIGRATION_YANK_INSTANCE);
}
1203
fd392cfa
YK
1204static void migrate_fd_cleanup_schedule(MigrationState *s)
1205{
1206 /*
1207 * Ref the state for bh, because it may be called when
1208 * there're already no other refs
1209 */
1210 object_ref(OBJECT(s));
1211 qemu_bh_schedule(s->cleanup_bh);
1212}
1213
1214static void migrate_fd_cleanup_bh(void *opaque)
1215{
1216 MigrationState *s = opaque;
1217 migrate_fd_cleanup(s);
1218 object_unref(OBJECT(s));
1219}
1220
87db1a7d
JQ
1221void migrate_set_error(MigrationState *s, const Error *error)
1222{
6e8a355d 1223 QEMU_LOCK_GUARD(&s->error_mutex);
87db1a7d
JQ
1224 if (!s->error) {
1225 s->error = error_copy(error);
1226 }
87db1a7d
JQ
1227}
1228
ca7bd082
PX
1229static void migrate_error_free(MigrationState *s)
1230{
1231 QEMU_LOCK_GUARD(&s->error_mutex);
1232 if (s->error) {
1233 error_free(s->error);
1234 s->error = NULL;
1235 }
1236}
1237
aaf26bd3 1238static void migrate_fd_error(MigrationState *s, const Error *error)
065e2813 1239{
25174055 1240 trace_migrate_fd_error(error_get_pretty(error));
89a02a9f 1241 assert(s->to_dst_file == NULL);
48781e5b
HZ
1242 migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
1243 MIGRATION_STATUS_FAILED);
87db1a7d 1244 migrate_set_error(s, error);
458cf28e
JQ
1245}
1246
/*
 * Request cancellation of the outgoing migration @s.
 *
 * Shuts down the return-path file, moves a running migration to the
 * CANCELLING state, shuts down the outgoing file so a blocked writer can
 * exit, and re-activates block devices if they had been inactivated.
 */
static void migrate_fd_cancel(MigrationState *s)
{
    int old_state ;

    trace_migrate_fd_cancel();

    WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
        if (s->rp_state.from_dst_file) {
            /* shutdown the rp socket, so causing the rp thread to shutdown */
            qemu_file_shutdown(s->rp_state.from_dst_file);
        }
    }

    /*
     * The state can change under us between the read and the
     * compare-and-exchange in migrate_set_state(), so retry until the
     * state is CANCELLING or the migration is no longer running.
     */
    do {
        old_state = s->state;
        if (!migration_is_running(old_state)) {
            break;
        }
        /* If the migration is paused, kick it out of the pause */
        if (old_state == MIGRATION_STATUS_PRE_SWITCHOVER) {
            qemu_sem_post(&s->pause_sem);
        }
        migrate_set_state(&s->state, old_state, MIGRATION_STATUS_CANCELLING);
    } while (s->state != MIGRATION_STATUS_CANCELLING);

    /*
     * If we're unlucky the migration code might be stuck somewhere in a
     * send/write while the network has failed and is waiting to timeout;
     * if we've got shutdown(2) available then we can force it to quit.
     */
    if (s->state == MIGRATION_STATUS_CANCELLING) {
        WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
            if (s->to_dst_file) {
                qemu_file_shutdown(s->to_dst_file);
            }
        }
    }
    /* Give the block devices back to this QEMU if they were inactivated */
    if (s->state == MIGRATION_STATUS_CANCELLING && s->block_inactive) {
        Error *local_err = NULL;

        bdrv_activate_all(&local_err);
        if (local_err) {
            error_report_err(local_err);
        } else {
            s->block_inactive = false;
        }
    }
}
1295
99a0db9b
GH
1296void add_migration_state_change_notifier(Notifier *notify)
1297{
1298 notifier_list_add(&migration_state_notifiers, notify);
1299}
1300
1301void remove_migration_state_change_notifier(Notifier *notify)
1302{
31552529 1303 notifier_remove(notify);
99a0db9b
GH
1304}
1305
02edd2e7 1306bool migration_in_setup(MigrationState *s)
afe2df69 1307{
31194731 1308 return s->state == MIGRATION_STATUS_SETUP;
afe2df69
GH
1309}
1310
7073693b 1311bool migration_has_finished(MigrationState *s)
99a0db9b 1312{
31194731 1313 return s->state == MIGRATION_STATUS_COMPLETED;
99a0db9b 1314}
0edda1c4 1315
afe2df69
GH
1316bool migration_has_failed(MigrationState *s)
1317{
31194731
HZ
1318 return (s->state == MIGRATION_STATUS_CANCELLED ||
1319 s->state == MIGRATION_STATUS_FAILED);
afe2df69
GH
1320}
1321
5727309d 1322bool migration_in_postcopy(void)
9ec055ae 1323{
5727309d
JQ
1324 MigrationState *s = migrate_get_current();
1325
3748fef9
DDAG
1326 switch (s->state) {
1327 case MIGRATION_STATUS_POSTCOPY_ACTIVE:
1328 case MIGRATION_STATUS_POSTCOPY_PAUSED:
1329 case MIGRATION_STATUS_POSTCOPY_RECOVER:
1330 return true;
1331 default:
1332 return false;
1333 }
9ec055ae
DDAG
1334}
1335
b82fc321
DDAG
1336bool migration_in_postcopy_after_devices(MigrationState *s)
1337{
5727309d 1338 return migration_in_postcopy() && s->postcopy_after_devices;
b82fc321
DDAG
1339}
1340
06df2e69
DH
1341bool migration_in_incoming_postcopy(void)
1342{
1343 PostcopyState ps = postcopy_state_get();
1344
1345 return ps >= POSTCOPY_INCOMING_DISCARD && ps < POSTCOPY_INCOMING_END;
1346}
1347
80fe315c
DH
1348bool migration_incoming_postcopy_advised(void)
1349{
1350 PostcopyState ps = postcopy_state_get();
1351
1352 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
1353}
1354
1a8e44a8
AG
1355bool migration_in_bg_snapshot(void)
1356{
1357 MigrationState *s = migrate_get_current();
1358
1359 return migrate_background_snapshot() &&
1360 migration_is_setup_or_active(s->state);
1361}
1362
fab35005 1363bool migration_is_idle(void)
fe44dc91 1364{
daff7f0b
MA
1365 MigrationState *s = current_migration;
1366
1367 if (!s) {
1368 return true;
1369 }
fe44dc91
AA
1370
1371 switch (s->state) {
1372 case MIGRATION_STATUS_NONE:
1373 case MIGRATION_STATUS_CANCELLED:
1374 case MIGRATION_STATUS_COMPLETED:
1375 case MIGRATION_STATUS_FAILED:
1376 return true;
1377 case MIGRATION_STATUS_SETUP:
1378 case MIGRATION_STATUS_CANCELLING:
1379 case MIGRATION_STATUS_ACTIVE:
1380 case MIGRATION_STATUS_POSTCOPY_ACTIVE:
1381 case MIGRATION_STATUS_COLO:
31e06077
DDAG
1382 case MIGRATION_STATUS_PRE_SWITCHOVER:
1383 case MIGRATION_STATUS_DEVICE:
c7e0acd5 1384 case MIGRATION_STATUS_WAIT_UNPLUG:
fe44dc91
AA
1385 return false;
1386 case MIGRATION_STATUS__MAX:
1387 g_assert_not_reached();
1388 }
1389
1390 return false;
1391}
1392
8f8d528e
WY
1393bool migration_is_active(MigrationState *s)
1394{
1395 return (s->state == MIGRATION_STATUS_ACTIVE ||
1396 s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
1397}
1398
08fc4cb5 1399int migrate_init(MigrationState *s, Error **errp)
0edda1c4 1400{
08fc4cb5
AH
1401 int ret;
1402
1403 ret = qemu_savevm_state_prepare(errp);
1404 if (ret) {
1405 return ret;
1406 }
1407
389775d1
DDAG
1408 /*
1409 * Reinitialise all migration state, except
1410 * parameters/capabilities that the user set, and
1411 * locks.
1412 */
389775d1 1413 s->cleanup_bh = 0;
8518278a 1414 s->vm_start_bh = 0;
89a02a9f 1415 s->to_dst_file = NULL;
389775d1 1416 s->state = MIGRATION_STATUS_NONE;
389775d1
DDAG
1417 s->rp_state.from_dst_file = NULL;
1418 s->rp_state.error = false;
1419 s->mbps = 0.0;
aecbfe9c 1420 s->pages_per_second = 0.0;
389775d1
DDAG
1421 s->downtime = 0;
1422 s->expected_downtime = 0;
389775d1 1423 s->setup_time = 0;
389775d1 1424 s->start_postcopy = false;
b82fc321 1425 s->postcopy_after_devices = false;
389775d1 1426 s->migration_thread_running = false;
d59ce6f3
DB
1427 error_free(s->error);
1428 s->error = NULL;
d8053e73 1429 s->hostname = NULL;
389775d1 1430
48781e5b 1431 migrate_set_state(&s->state, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP);
0edda1c4 1432
4af246a3
PX
1433 s->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1434 s->total_time = 0;
f4584076 1435 s->vm_old_state = -1;
b15df1ae
PX
1436 s->iteration_initial_bytes = 0;
1437 s->threshold_size = 0;
1b4adb10 1438 s->switchover_acked = false;
f543aa22
AH
1439 /*
1440 * set mig_stats compression_counters memory to zero for a
1441 * new migration
1442 */
1443 memset(&mig_stats, 0, sizeof(mig_stats));
1444 memset(&compression_counters, 0, sizeof(compression_counters));
1445 migration_reset_vfio_bytes_transferred();
08fc4cb5
AH
1446
1447 return 0;
0edda1c4 1448}
cab30143 1449
60fd6801 1450int migrate_add_blocker_internal(Error *reason, Error **errp)
fa2756b7 1451{
4c170330
PX
1452 /* Snapshots are similar to migrations, so check RUN_STATE_SAVE_VM too. */
1453 if (runstate_check(RUN_STATE_SAVE_VM) || !migration_is_idle()) {
1454 error_propagate_prepend(errp, error_copy(reason),
1455 "disallowing migration blocker "
1456 "(migration/snapshot in progress) for: ");
1457 return -EBUSY;
fe44dc91
AA
1458 }
1459
4c170330
PX
1460 migration_blockers = g_slist_prepend(migration_blockers, reason);
1461 return 0;
fa2756b7
AL
1462}
1463
60fd6801
PX
1464int migrate_add_blocker(Error *reason, Error **errp)
1465{
1466 if (only_migratable) {
1467 error_propagate_prepend(errp, error_copy(reason),
1468 "disallowing migration blocker "
1469 "(--only-migratable) for: ");
1470 return -EACCES;
1471 }
1472
1473 return migrate_add_blocker_internal(reason, errp);
1474}
1475
fa2756b7
AL
1476void migrate_del_blocker(Error *reason)
1477{
1478 migration_blockers = g_slist_remove(migration_blockers, reason);
1479}
1480
bf1ae1f4
DDAG
1481void qmp_migrate_incoming(const char *uri, Error **errp)
1482{
1483 Error *local_err = NULL;
4debb5f5 1484 static bool once = true;
bf1ae1f4 1485
4debb5f5
DDAG
1486 if (!once) {
1487 error_setg(errp, "The incoming migration has already been started");
603d5a42 1488 return;
4debb5f5 1489 }
e69d50d6
PB
1490 if (!runstate_check(RUN_STATE_INMIGRATE)) {
1491 error_setg(errp, "'-incoming' was not specified on the command line");
1492 return;
1493 }
bf1ae1f4 1494
cc48c587
PX
1495 if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
1496 return;
1497 }
1498
bf1ae1f4
DDAG
1499 qemu_start_incoming_migration(uri, &local_err);
1500
1501 if (local_err) {
cc48c587 1502 yank_unregister_instance(MIGRATION_YANK_INSTANCE);
bf1ae1f4
DDAG
1503 error_propagate(errp, local_err);
1504 return;
1505 }
1506
4debb5f5 1507 once = false;
bf1ae1f4
DDAG
1508}
1509
02affd41
PX
1510void qmp_migrate_recover(const char *uri, Error **errp)
1511{
1512 MigrationIncomingState *mis = migration_incoming_get_current();
1513
b7f9afd4
PX
1514 /*
1515 * Don't even bother to use ERRP_GUARD() as it _must_ always be set by
1516 * callers (no one should ignore a recover failure); if there is, it's a
1517 * programming error.
1518 */
1519 assert(errp);
1520
02affd41
PX
1521 if (mis->state != MIGRATION_STATUS_POSTCOPY_PAUSED) {
1522 error_setg(errp, "Migrate recover can only be run "
1523 "when postcopy is paused.");
1524 return;
1525 }
1526
08401c04
PX
1527 /* If there's an existing transport, release it */
1528 migration_incoming_transport_cleanup(mis);
02affd41
PX
1529
1530 /*
1531 * Note that this call will never start a real migration; it will
1532 * only re-setup the migration stream and poke existing migration
1533 * to continue using that newly established channel.
1534 */
1535 qemu_start_incoming_migration(uri, errp);
1536}
1537
bfbf89c2
PX
1538void qmp_migrate_pause(Error **errp)
1539{
1540 MigrationState *ms = migrate_get_current();
1541 MigrationIncomingState *mis = migration_incoming_get_current();
7478fb0d 1542 int ret = 0;
bfbf89c2
PX
1543
1544 if (ms->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
1545 /* Source side, during postcopy */
1546 qemu_mutex_lock(&ms->qemu_file_lock);
7478fb0d
FR
1547 if (ms->to_dst_file) {
1548 ret = qemu_file_shutdown(ms->to_dst_file);
1549 }
bfbf89c2
PX
1550 qemu_mutex_unlock(&ms->qemu_file_lock);
1551 if (ret) {
1552 error_setg(errp, "Failed to pause source migration");
1553 }
1554 return;
1555 }
1556
1557 if (mis->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
1558 ret = qemu_file_shutdown(mis->from_src_file);
1559 if (ret) {
1560 error_setg(errp, "Failed to pause destination migration");
1561 }
1562 return;
1563 }
1564
1565 error_setg(errp, "migrate-pause is currently only supported "
1566 "during postcopy-active state");
1567}
1568
24f3902b
GK
1569bool migration_is_blocked(Error **errp)
1570{
1571 if (qemu_savevm_state_blocked(errp)) {
1572 return true;
1573 }
1574
1575 if (migration_blockers) {
250561e1 1576 error_propagate(errp, error_copy(migration_blockers->data));
24f3902b
GK
1577 return true;
1578 }
1579
1580 return false;
1581}
1582
d3e35b8f
PX
1583/* Returns true if continue to migrate, or false if error detected */
1584static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc,
1585 bool resume, Error **errp)
cab30143 1586{
be7059cd 1587 Error *local_err = NULL;
d3e35b8f
PX
1588
1589 if (resume) {
1590 if (s->state != MIGRATION_STATUS_POSTCOPY_PAUSED) {
1591 error_setg(errp, "Cannot resume if there is no "
1592 "paused migration");
1593 return false;
1594 }
97ca211c
PX
1595
1596 /*
1597 * Postcopy recovery won't work well with release-ram
1598 * capability since release-ram will drop the page buffer as
1599 * long as the page is put into the send buffer. So if there
1600 * is a network failure happened, any page buffers that have
1601 * not yet reached the destination VM but have already been
1602 * sent from the source VM will be lost forever. Let's refuse
1603 * the client from resuming such a postcopy migration.
1604 * Luckily release-ram was designed to only be used when src
1605 * and destination VMs are on the same host, so it should be
1606 * fine.
1607 */
1608 if (migrate_release_ram()) {
1609 error_setg(errp, "Postcopy recovery cannot work "
1610 "when release-ram capability is set");
1611 return false;
1612 }
1613
d3e35b8f
PX
1614 /* This is a resume, skip init status */
1615 return true;
1616 }
cab30143 1617
392d87e2 1618 if (migration_is_running(s->state)) {
c6bd8c70 1619 error_setg(errp, QERR_MIGRATION_ACTIVE);
d3e35b8f 1620 return false;
cab30143 1621 }
d3e35b8f 1622
ca99993a
DDAG
1623 if (runstate_check(RUN_STATE_INMIGRATE)) {
1624 error_setg(errp, "Guest is waiting for an incoming migration");
d3e35b8f 1625 return false;
ca99993a
DDAG
1626 }
1627
36d0fe65
T
1628 if (runstate_check(RUN_STATE_POSTMIGRATE)) {
1629 error_setg(errp, "Can't migrate the vm that was paused due to "
1630 "previous migration");
1631 return false;
1632 }
1633
24f3902b 1634 if (migration_is_blocked(errp)) {
d3e35b8f 1635 return false;
fa2756b7
AL
1636 }
1637
d3e35b8f 1638 if (blk || blk_inc) {
5e804644 1639 if (migrate_colo()) {
3ba02445
RL
1640 error_setg(errp, "No disk migration is required in COLO mode");
1641 return false;
1642 }
6f8be708 1643 if (migrate_block() || migrate_block_incremental()) {
2833c59b
JQ
1644 error_setg(errp, "Command options are incompatible with "
1645 "current migration capabilities");
d3e35b8f 1646 return false;
2833c59b 1647 }
9eb1109c 1648 if (!migrate_cap_set(MIGRATION_CAPABILITY_BLOCK, true, &local_err)) {
2833c59b 1649 error_propagate(errp, local_err);
d3e35b8f 1650 return false;
2833c59b
JQ
1651 }
1652 s->must_remove_block_options = true;
1653 }
1654
d3e35b8f 1655 if (blk_inc) {
87c22901 1656 migrate_set_block_incremental(true);
2833c59b
JQ
1657 }
1658
08fc4cb5
AH
1659 if (migrate_init(s, errp)) {
1660 return false;
1661 }
cab30143 1662
d3e35b8f
PX
1663 return true;
1664}
1665
1666void qmp_migrate(const char *uri, bool has_blk, bool blk,
1667 bool has_inc, bool inc, bool has_detach, bool detach,
1668 bool has_resume, bool resume, Error **errp)
1669{
8c69ae9e 1670 bool resume_requested;
d3e35b8f
PX
1671 Error *local_err = NULL;
1672 MigrationState *s = migrate_get_current();
d658f65c 1673 const char *p = NULL;
d3e35b8f 1674
d6f74fd1
PX
1675 /* URI is not suitable for migration? */
1676 if (!migration_channels_and_uri_compatible(uri, errp)) {
1677 return;
1678 }
1679
8c69ae9e 1680 resume_requested = has_resume && resume;
d3e35b8f 1681 if (!migrate_prepare(s, has_blk && blk, has_inc && inc,
8c69ae9e 1682 resume_requested, errp)) {
d3e35b8f
PX
1683 /* Error detected, put into errp */
1684 return;
1685 }
1686
8c69ae9e 1687 if (!resume_requested) {
b5eea99e
LS
1688 if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
1689 return;
1690 }
1691 }
1692
d658f65c 1693 if (strstart(uri, "tcp:", &p) ||
9ba3b2ba
LM
1694 strstart(uri, "unix:", NULL) ||
1695 strstart(uri, "vsock:", NULL)) {
d658f65c 1696 socket_start_outgoing_migration(s, p ? p : uri, &local_err);
2da776db 1697#ifdef CONFIG_RDMA
41310c68 1698 } else if (strstart(uri, "rdma:", &p)) {
2da776db
MH
1699 rdma_start_outgoing_migration(s, p, &local_err);
1700#endif
cab30143 1701 } else if (strstart(uri, "exec:", &p)) {
f37afb5a 1702 exec_start_outgoing_migration(s, p, &local_err);
cab30143 1703 } else if (strstart(uri, "fd:", &p)) {
f37afb5a 1704 fd_start_outgoing_migration(s, p, &local_err);
99a0db9b 1705 } else {
8c69ae9e 1706 if (!resume_requested) {
b5eea99e
LS
1707 yank_unregister_instance(MIGRATION_YANK_INSTANCE);
1708 }
908927db 1709 error_setg(&local_err, QERR_INVALID_PARAMETER_VALUE, "uri",
c6bd8c70 1710 "a valid migration protocol");
48781e5b
HZ
1711 migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
1712 MIGRATION_STATUS_FAILED);
b1a87956 1713 block_cleanup_parameters();
e1c37d0e 1714 return;
cab30143
JQ
1715 }
1716
f37afb5a 1717 if (local_err) {
8c69ae9e 1718 if (!resume_requested) {
b5eea99e
LS
1719 yank_unregister_instance(MIGRATION_YANK_INSTANCE);
1720 }
d59ce6f3 1721 migrate_fd_error(s, local_err);
f37afb5a 1722 error_propagate(errp, local_err);
e1c37d0e 1723 return;
1299c631 1724 }
cab30143
JQ
1725}
1726
6cdedb07 1727void qmp_migrate_cancel(Error **errp)
cab30143 1728{
458fecca 1729 migration_cancel(NULL);
cab30143
JQ
1730}
1731
89cfc02c
DDAG
1732void qmp_migrate_continue(MigrationStatus state, Error **errp)
1733{
1734 MigrationState *s = migrate_get_current();
1735 if (s->state != state) {
1736 error_setg(errp, "Migration not in expected state: %s",
1737 MigrationStatus_str(s->state));
1738 return;
1739 }
1740 qemu_sem_post(&s->pause_sem);
1741}
1742
70b20477
DDAG
1743/* migration thread support */
1744/*
1745 * Something bad happened to the RP stream, mark an error
1746 * The caller shall print or trace something to indicate why
1747 */
1748static void mark_source_rp_bad(MigrationState *s)
1749{
1750 s->rp_state.error = true;
1751}
1752
/*
 * Per-message description of the return-path commands, indexed by the
 * MIG_RP_MSG_* message type: the payload length each message must have
 * (or -1 when the length is variable) and a printable name.
 */
static struct rp_cmd_args {
    ssize_t     len; /* -1 = variable */
    const char *name;
} rp_cmd_args[] = {
    [MIG_RP_MSG_INVALID]        = { .len = -1, .name = "INVALID" },
    [MIG_RP_MSG_SHUT]           = { .len =  4, .name = "SHUT" },
    [MIG_RP_MSG_PONG]           = { .len =  4, .name = "PONG" },
    [MIG_RP_MSG_REQ_PAGES]      = { .len = 12, .name = "REQ_PAGES" },
    [MIG_RP_MSG_REQ_PAGES_ID]   = { .len = -1, .name = "REQ_PAGES_ID" },
    [MIG_RP_MSG_RECV_BITMAP]    = { .len = -1, .name = "RECV_BITMAP" },
    [MIG_RP_MSG_RESUME_ACK]     = { .len =  4, .name = "RESUME_ACK" },
    [MIG_RP_MSG_SWITCHOVER_ACK] = { .len =  0, .name = "SWITCHOVER_ACK" },
    [MIG_RP_MSG_MAX]            = { .len = -1, .name = "MAX" },
};
1767
1e2d90eb
DDAG
1768/*
1769 * Process a request for pages received on the return path,
1770 * We're allowed to send more than requested (e.g. to round to our page size)
1771 * and we don't need to send pages that have already been sent.
1772 */
1773static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname,
1774 ram_addr_t start, size_t len)
1775{
8e3b0cbb 1776 long our_host_ps = qemu_real_host_page_size();
6c595cde 1777
1e2d90eb 1778 trace_migrate_handle_rp_req_pages(rbname, start, len);
6c595cde
DDAG
1779
1780 /*
1781 * Since we currently insist on matching page sizes, just sanity check
1782 * we're being asked for whole host pages.
1783 */
7648297d
DH
1784 if (!QEMU_IS_ALIGNED(start, our_host_ps) ||
1785 !QEMU_IS_ALIGNED(len, our_host_ps)) {
6c595cde
DDAG
1786 error_report("%s: Misaligned page request, start: " RAM_ADDR_FMT
1787 " len: %zd", __func__, start, len);
1788 mark_source_rp_bad(ms);
1789 return;
1790 }
1791
96506894 1792 if (ram_save_queue_pages(rbname, start, len)) {
6c595cde
DDAG
1793 mark_source_rp_bad(ms);
1794 }
1e2d90eb
DDAG
1795}
1796
a335debb
PX
1797static int migrate_handle_rp_recv_bitmap(MigrationState *s, char *block_name)
1798{
1799 RAMBlock *block = qemu_ram_block_by_name(block_name);
1800
1801 if (!block) {
1802 error_report("%s: invalid block name '%s'", __func__, block_name);
1803 return -EINVAL;
1804 }
1805
1806 /* Fetch the received bitmap and refresh the dirty bitmap */
1807 return ram_dirty_bitmap_reload(s, block);
1808}
1809
13955b89
PX
1810static int migrate_handle_rp_resume_ack(MigrationState *s, uint32_t value)
1811{
1812 trace_source_return_path_thread_resume_ack(value);
1813
1814 if (value != MIGRATION_RESUME_ACK_VALUE) {
1815 error_report("%s: illegal resume_ack value %"PRIu32,
1816 __func__, value);
1817 return -1;
1818 }
1819
1820 /* Now both sides are active. */
1821 migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_RECOVER,
1822 MIGRATION_STATUS_POSTCOPY_ACTIVE);
1823
94190696
PX
1824 /* Notify send thread that time to continue send pages */
1825 qemu_sem_post(&s->rp_state.rp_sem);
13955b89
PX
1826
1827 return 0;
1828}
1829
93589827
PX
1830/*
1831 * Release ms->rp_state.from_dst_file (and postcopy_qemufile_src if
1832 * existed) in a safe way.
1833 */
1834static void migration_release_dst_files(MigrationState *ms)
43044ac0
PX
1835{
1836 QEMUFile *file;
1837
1838 WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
1839 /*
1840 * Reset the from_dst_file pointer first before releasing it, as we
1841 * can't block within lock section
1842 */
1843 file = ms->rp_state.from_dst_file;
1844 ms->rp_state.from_dst_file = NULL;
1845 }
1846
93589827
PX
1847 /*
1848 * Do the same to postcopy fast path socket too if there is. No
1849 * locking needed because this qemufile should only be managed by
1850 * return path thread.
1851 */
1852 if (ms->postcopy_qemufile_src) {
1853 migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src);
1854 qemu_file_shutdown(ms->postcopy_qemufile_src);
1855 qemu_fclose(ms->postcopy_qemufile_src);
1856 ms->postcopy_qemufile_src = NULL;
1857 }
1858
43044ac0
PX
1859 qemu_fclose(file);
1860}
1861
70b20477
DDAG
1862/*
1863 * Handles messages sent on the return path towards the source VM
1864 *
1865 */
1866static void *source_return_path_thread(void *opaque)
1867{
1868 MigrationState *ms = opaque;
1869 QEMUFile *rp = ms->rp_state.from_dst_file;
1870 uint16_t header_len, header_type;
568b01ca 1871 uint8_t buf[512];
70b20477 1872 uint32_t tmp32, sibling_error;
1e2d90eb
DDAG
1873 ram_addr_t start = 0; /* =0 to silence warning */
1874 size_t len = 0, expected_len;
70b20477
DDAG
1875 int res;
1876
1877 trace_source_return_path_thread_entry();
74637e6f 1878 rcu_register_thread();
14b1742e 1879
70b20477
DDAG
1880 while (!ms->rp_state.error && !qemu_file_get_error(rp) &&
1881 migration_is_setup_or_active(ms->state)) {
1882 trace_source_return_path_thread_loop_top();
1883 header_type = qemu_get_be16(rp);
1884 header_len = qemu_get_be16(rp);
1885
7a9ddfbf
PX
1886 if (qemu_file_get_error(rp)) {
1887 mark_source_rp_bad(ms);
1888 goto out;
1889 }
1890
70b20477
DDAG
1891 if (header_type >= MIG_RP_MSG_MAX ||
1892 header_type == MIG_RP_MSG_INVALID) {
1893 error_report("RP: Received invalid message 0x%04x length 0x%04x",
ed8b2828 1894 header_type, header_len);
70b20477
DDAG
1895 mark_source_rp_bad(ms);
1896 goto out;
1897 }
1898
1899 if ((rp_cmd_args[header_type].len != -1 &&
1900 header_len != rp_cmd_args[header_type].len) ||
568b01ca 1901 header_len > sizeof(buf)) {
70b20477 1902 error_report("RP: Received '%s' message (0x%04x) with"
ed8b2828
MZ
1903 "incorrect length %d expecting %zu",
1904 rp_cmd_args[header_type].name, header_type, header_len,
1905 (size_t)rp_cmd_args[header_type].len);
70b20477
DDAG
1906 mark_source_rp_bad(ms);
1907 goto out;
1908 }
1909
1910 /* We know we've got a valid header by this point */
1911 res = qemu_get_buffer(rp, buf, header_len);
1912 if (res != header_len) {
1913 error_report("RP: Failed reading data for message 0x%04x"
1914 " read %d expected %d",
1915 header_type, res, header_len);
1916 mark_source_rp_bad(ms);
1917 goto out;
1918 }
1919
1920 /* OK, we have the message and the data */
1921 switch (header_type) {
1922 case MIG_RP_MSG_SHUT:
4d885131 1923 sibling_error = ldl_be_p(buf);
70b20477
DDAG
1924 trace_source_return_path_thread_shut(sibling_error);
1925 if (sibling_error) {
1926 error_report("RP: Sibling indicated error %d", sibling_error);
1927 mark_source_rp_bad(ms);
1928 }
1929 /*
1930 * We'll let the main thread deal with closing the RP
1931 * we could do a shutdown(2) on it, but we're the only user
1932 * anyway, so there's nothing gained.
1933 */
1934 goto out;
1935
1936 case MIG_RP_MSG_PONG:
4d885131 1937 tmp32 = ldl_be_p(buf);
70b20477 1938 trace_source_return_path_thread_pong(tmp32);
b28fb582 1939 qemu_sem_post(&ms->rp_state.rp_pong_acks);
70b20477
DDAG
1940 break;
1941
1e2d90eb 1942 case MIG_RP_MSG_REQ_PAGES:
4d885131
PM
1943 start = ldq_be_p(buf);
1944 len = ldl_be_p(buf + 8);
1e2d90eb
DDAG
1945 migrate_handle_rp_req_pages(ms, NULL, start, len);
1946 break;
1947
1948 case MIG_RP_MSG_REQ_PAGES_ID:
1949 expected_len = 12 + 1; /* header + termination */
1950
1951 if (header_len >= expected_len) {
4d885131
PM
1952 start = ldq_be_p(buf);
1953 len = ldl_be_p(buf + 8);
1e2d90eb
DDAG
1954 /* Now we expect an idstr */
1955 tmp32 = buf[12]; /* Length of the following idstr */
1956 buf[13 + tmp32] = '\0';
1957 expected_len += tmp32;
1958 }
1959 if (header_len != expected_len) {
1960 error_report("RP: Req_Page_id with length %d expecting %zd",
ed8b2828 1961 header_len, expected_len);
1e2d90eb
DDAG
1962 mark_source_rp_bad(ms);
1963 goto out;
1964 }
1965 migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len);
1966 break;
1967
a335debb
PX
1968 case MIG_RP_MSG_RECV_BITMAP:
1969 if (header_len < 1) {
1970 error_report("%s: missing block name", __func__);
1971 mark_source_rp_bad(ms);
1972 goto out;
1973 }
1974 /* Format: len (1B) + idstr (<255B). This ends the idstr. */
1975 buf[buf[0] + 1] = '\0';
1976 if (migrate_handle_rp_recv_bitmap(ms, (char *)(buf + 1))) {
1977 mark_source_rp_bad(ms);
1978 goto out;
1979 }
1980 break;
1981
13955b89
PX
1982 case MIG_RP_MSG_RESUME_ACK:
1983 tmp32 = ldl_be_p(buf);
1984 if (migrate_handle_rp_resume_ack(ms, tmp32)) {
1985 mark_source_rp_bad(ms);
1986 goto out;
1987 }
1988 break;
1989
1b4adb10
AH
1990 case MIG_RP_MSG_SWITCHOVER_ACK:
1991 ms->switchover_acked = true;
1992 trace_source_return_path_thread_switchover_acked();
1993 break;
1994
70b20477
DDAG
1995 default:
1996 break;
1997 }
1998 }
14b1742e
PX
1999
2000out:
ef796ee9 2001 if (qemu_file_get_error(rp)) {
70b20477
DDAG
2002 trace_source_return_path_thread_bad_end();
2003 mark_source_rp_bad(ms);
2004 }
2005
2006 trace_source_return_path_thread_end();
74637e6f 2007 rcu_unregister_thread();
70b20477
DDAG
2008 return NULL;
2009}
2010
ef796ee9 2011static int open_return_path_on_source(MigrationState *ms)
70b20477 2012{
89a02a9f 2013 ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->to_dst_file);
70b20477
DDAG
2014 if (!ms->rp_state.from_dst_file) {
2015 return -1;
2016 }
2017
2018 trace_open_return_path_on_source();
d3e35b8f 2019
70b20477
DDAG
2020 qemu_thread_create(&ms->rp_state.rp_thread, "return path",
2021 source_return_path_thread, ms, QEMU_THREAD_JOINABLE);
53021ea1 2022 ms->rp_state.rp_thread_created = true;
70b20477
DDAG
2023
2024 trace_open_return_path_on_source_continue();
2025
2026 return 0;
2027}
2028
70b20477
DDAG
2029/* Returns 0 if the RP was ok, otherwise there was an error on the RP */
2030static int await_return_path_close_on_source(MigrationState *ms)
2031{
d50f5dc0
FR
2032 int ret;
2033
2034 if (!ms->rp_state.rp_thread_created) {
2035 return 0;
2036 }
2037
2038 trace_migration_return_path_end_before();
2039
70b20477 2040 /*
639decf5
FR
2041 * If this is a normal exit then the destination will send a SHUT
2042 * and the rp_thread will exit, however if there's an error we
2043 * need to cause it to exit. shutdown(2), if we have it, will
2044 * cause it to unblock if it's stuck waiting for the destination.
70b20477 2045 */
639decf5
FR
2046 WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
2047 if (ms->to_dst_file && ms->rp_state.from_dst_file &&
2048 qemu_file_get_error(ms->to_dst_file)) {
2049 qemu_file_shutdown(ms->rp_state.from_dst_file);
2050 }
70b20477 2051 }
639decf5 2052
70b20477
DDAG
2053 trace_await_return_path_close_on_source_joining();
2054 qemu_thread_join(&ms->rp_state.rp_thread);
53021ea1 2055 ms->rp_state.rp_thread_created = false;
70b20477 2056 trace_await_return_path_close_on_source_close();
d50f5dc0
FR
2057
2058 ret = ms->rp_state.error;
ef796ee9 2059 ms->rp_state.error = false;
36e9aab3
FR
2060
2061 migration_release_dst_files(ms);
2062
d50f5dc0
FR
2063 trace_migration_return_path_end_after(ret);
2064 return ret;
70b20477
DDAG
2065}
2066
5655aab0
PX
2067static inline void
2068migration_wait_main_channel(MigrationState *ms)
2069{
2070 /* Wait until one PONG message received */
2071 qemu_sem_wait(&ms->rp_state.rp_pong_acks);
2072}
2073
1d34e4bf
DDAG
2074/*
2075 * Switch from normal iteration to postcopy
2076 * Returns non-0 on error
2077 */
908927db 2078static int postcopy_start(MigrationState *ms, Error **errp)
1d34e4bf
DDAG
2079{
2080 int ret;
61b67d47
DB
2081 QIOChannelBuffer *bioc;
2082 QEMUFile *fb;
1d34e4bf 2083 int64_t time_at_stop = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
52033349 2084 uint64_t bandwidth = migrate_max_postcopy_bandwidth();
ef8d6488 2085 bool restart_block = false;
0331c8ca 2086 int cur_state = MIGRATION_STATUS_ACTIVE;
d0edb8a1 2087
5655aab0
PX
2088 if (migrate_postcopy_preempt()) {
2089 migration_wait_main_channel(ms);
2090 if (postcopy_preempt_establish_channel(ms)) {
2091 migrate_set_state(&ms->state, ms->state, MIGRATION_STATUS_FAILED);
2092 return -1;
2093 }
d0edb8a1
PX
2094 }
2095
0331c8ca
DDAG
2096 if (!migrate_pause_before_switchover()) {
2097 migrate_set_state(&ms->state, MIGRATION_STATUS_ACTIVE,
2098 MIGRATION_STATUS_POSTCOPY_ACTIVE);
2099 }
1d34e4bf
DDAG
2100
2101 trace_postcopy_start();
2102 qemu_mutex_lock_iothread();
2103 trace_postcopy_start_set_run();
2104
fb064112 2105 qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
1d34e4bf
DDAG
2106 global_state_store();
2107 ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
76b1c7fe
KW
2108 if (ret < 0) {
2109 goto fail;
2110 }
1d34e4bf 2111
0331c8ca
DDAG
2112 ret = migration_maybe_pause(ms, &cur_state,
2113 MIGRATION_STATUS_POSTCOPY_ACTIVE);
2114 if (ret < 0) {
2115 goto fail;
2116 }
2117
76b1c7fe 2118 ret = bdrv_inactivate_all();
1d34e4bf
DDAG
2119 if (ret < 0) {
2120 goto fail;
2121 }
ef8d6488 2122 restart_block = true;
1d34e4bf 2123
1c0d249d
DDAG
2124 /*
2125 * Cause any non-postcopiable, but iterative devices to
2126 * send out their final data.
2127 */
a1fbe750 2128 qemu_savevm_state_complete_precopy(ms->to_dst_file, true, false);
1c0d249d 2129
1d34e4bf
DDAG
2130 /*
2131 * in Finish migrate and with the io-lock held everything should
2132 * be quiet, but we've potentially still got dirty pages and we
2133 * need to tell the destination to throw any pages it's already received
2134 * that are dirty
2135 */
58110f0a 2136 if (migrate_postcopy_ram()) {
739fcc1b 2137 ram_postcopy_send_discard_bitmap(ms);
1d34e4bf
DDAG
2138 }
2139
2140 /*
2141 * send rest of state - note things that are doing postcopy
2142 * will notice we're in POSTCOPY_ACTIVE and not actually
2143 * wrap their state up here
2144 */
e1fde0e0 2145 migration_rate_set(bandwidth);
58110f0a
VSO
2146 if (migrate_postcopy_ram()) {
2147 /* Ping just for debugging, helps line traces up */
2148 qemu_savevm_send_ping(ms->to_dst_file, 2);
2149 }
1d34e4bf
DDAG
2150
2151 /*
2152 * While loading the device state we may trigger page transfer
2153 * requests and the fd must be free to process those, and thus
2154 * the destination must read the whole device state off the fd before
2155 * it starts processing it. Unfortunately the ad-hoc migration format
2156 * doesn't allow the destination to know the size to read without fully
2157 * parsing it through each devices load-state code (especially the open
2158 * coded devices that use get/put).
2159 * So we wrap the device state up in a package with a length at the start;
2160 * to do this we use a qemu_buf to hold the whole of the device state.
2161 */
61b67d47 2162 bioc = qio_channel_buffer_new(4096);
6f01f136 2163 qio_channel_set_name(QIO_CHANNEL(bioc), "migration-postcopy-buffer");
77ef2dc1 2164 fb = qemu_file_new_output(QIO_CHANNEL(bioc));
61b67d47 2165 object_unref(OBJECT(bioc));
1d34e4bf 2166
c76201ab
DDAG
2167 /*
2168 * Make sure the receiver can get incoming pages before we send the rest
2169 * of the state
2170 */
2171 qemu_savevm_send_postcopy_listen(fb);
2172
a1fbe750 2173 qemu_savevm_state_complete_precopy(fb, false, false);
58110f0a
VSO
2174 if (migrate_postcopy_ram()) {
2175 qemu_savevm_send_ping(fb, 3);
2176 }
1d34e4bf
DDAG
2177
2178 qemu_savevm_send_postcopy_run(fb);
2179
2180 /* <><> end of stuff going into the package */
1d34e4bf 2181
ef8d6488
DDAG
2182 /* Last point of recovery; as soon as we send the package the destination
2183 * can open devices and potentially start running.
2184 * Lets just check again we've not got any errors.
2185 */
2186 ret = qemu_file_get_error(ms->to_dst_file);
2187 if (ret) {
908927db 2188 error_setg(errp, "postcopy_start: Migration stream errored (pre package)");
ef8d6488
DDAG
2189 goto fail_closefb;
2190 }
2191
2192 restart_block = false;
2193
1d34e4bf 2194 /* Now send that blob */
61b67d47 2195 if (qemu_savevm_send_packaged(ms->to_dst_file, bioc->data, bioc->usage)) {
1d34e4bf
DDAG
2196 goto fail_closefb;
2197 }
2198 qemu_fclose(fb);
b82fc321
DDAG
2199
2200 /* Send a notify to give a chance for anything that needs to happen
2201 * at the transition to postcopy and after the device state; in particular
2202 * spice needs to trigger a transition now
2203 */
2204 ms->postcopy_after_devices = true;
2205 notifier_list_notify(&migration_state_notifiers, ms);
2206
1d34e4bf
DDAG
2207 ms->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop;
2208
2209 qemu_mutex_unlock_iothread();
2210
58110f0a
VSO
2211 if (migrate_postcopy_ram()) {
2212 /*
2213 * Although this ping is just for debug, it could potentially be
2214 * used for getting a better measurement of downtime at the source.
2215 */
2216 qemu_savevm_send_ping(ms->to_dst_file, 4);
2217 }
1d34e4bf 2218
ced1c616
PB
2219 if (migrate_release_ram()) {
2220 ram_postcopy_migrated_memory_release(ms);
2221 }
2222
89a02a9f 2223 ret = qemu_file_get_error(ms->to_dst_file);
1d34e4bf 2224 if (ret) {
908927db 2225 error_setg(errp, "postcopy_start: Migration stream errored");
48781e5b 2226 migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
1d34e4bf
DDAG
2227 MIGRATION_STATUS_FAILED);
2228 }
2229
c01b16ed
PX
2230 trace_postcopy_preempt_enabled(migrate_postcopy_preempt());
2231
1d34e4bf
DDAG
2232 return ret;
2233
2234fail_closefb:
2235 qemu_fclose(fb);
2236fail:
48781e5b 2237 migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
1d34e4bf 2238 MIGRATION_STATUS_FAILED);
ef8d6488
DDAG
2239 if (restart_block) {
2240 /* A failure happened early enough that we know the destination hasn't
2241 * accessed block devices, so we're safe to recover.
2242 */
2243 Error *local_err = NULL;
2244
3b717194 2245 bdrv_activate_all(&local_err);
ef8d6488
DDAG
2246 if (local_err) {
2247 error_report_err(local_err);
2248 }
2249 }
1d34e4bf
DDAG
2250 qemu_mutex_unlock_iothread();
2251 return -1;
2252}
2253
e91d8951
DDAG
2254/**
2255 * migration_maybe_pause: Pause if required to by
2256 * migrate_pause_before_switchover called with the iothread locked
2257 * Returns: 0 on success
2258 */
0331c8ca
DDAG
2259static int migration_maybe_pause(MigrationState *s,
2260 int *current_active_state,
2261 int new_state)
e91d8951
DDAG
2262{
2263 if (!migrate_pause_before_switchover()) {
2264 return 0;
2265 }
2266
2267 /* Since leaving this state is not atomic with posting the semaphore
2268 * it's possible that someone could have issued multiple migrate_continue
2269 * and the semaphore is incorrectly positive at this point;
2270 * the docs say it's undefined to reinit a semaphore that's already
2271 * init'd, so use timedwait to eat up any existing posts.
2272 */
2273 while (qemu_sem_timedwait(&s->pause_sem, 1) == 0) {
2274 /* This block intentionally left blank */
2275 }
2276
8958338b
ZF
2277 /*
2278 * If the migration is cancelled when it is in the completion phase,
2279 * the migration state is set to MIGRATION_STATUS_CANCELLING.
2280 * So we don't need to wait a semaphore, otherwise we would always
2281 * wait for the 'pause_sem' semaphore.
2282 */
2283 if (s->state != MIGRATION_STATUS_CANCELLING) {
2284 qemu_mutex_unlock_iothread();
2285 migrate_set_state(&s->state, *current_active_state,
2286 MIGRATION_STATUS_PRE_SWITCHOVER);
2287 qemu_sem_wait(&s->pause_sem);
2288 migrate_set_state(&s->state, MIGRATION_STATUS_PRE_SWITCHOVER,
2289 new_state);
2290 *current_active_state = new_state;
2291 qemu_mutex_lock_iothread();
2292 }
e91d8951 2293
0331c8ca 2294 return s->state == new_state ? 0 : -EINVAL;
e91d8951
DDAG
2295}
2296
09f6c85e
DDAG
2297/**
2298 * migration_completion: Used by migration_thread when there's not much left.
2299 * The caller 'breaks' the loop when this returns.
2300 *
2301 * @s: Current migration state
09f6c85e 2302 */
2ad87305 2303static void migration_completion(MigrationState *s)
09f6c85e
DDAG
2304{
2305 int ret;
2ad87305 2306 int current_active_state = s->state;
09f6c85e 2307
b10ac0c4
DDAG
2308 if (s->state == MIGRATION_STATUS_ACTIVE) {
2309 qemu_mutex_lock_iothread();
64909f97 2310 s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
fb064112 2311 qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
f4584076
VSO
2312
2313 s->vm_old_state = runstate_get();
c33f1829
VSO
2314 global_state_store();
2315
2316 ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
2317 trace_migration_completion_vm_stop(ret);
2318 if (ret >= 0) {
2319 ret = migration_maybe_pause(s, &current_active_state,
2320 MIGRATION_STATUS_DEVICE);
2321 }
2322 if (ret >= 0) {
2323 /*
2324 * Inactivate disks except in COLO, and track that we
2325 * have done so in order to remember to reactivate
2326 * them if migration fails or is cancelled.
2327 */
2328 s->block_inactive = !migrate_colo();
2329 migration_rate_set(RATE_LIMIT_DISABLED);
2330 ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false,
2331 s->block_inactive);
b10ac0c4 2332 }
c33f1829 2333
b10ac0c4 2334 qemu_mutex_unlock_iothread();
09f6c85e 2335
b10ac0c4
DDAG
2336 if (ret < 0) {
2337 goto fail;
09f6c85e 2338 }
b10ac0c4
DDAG
2339 } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
2340 trace_migration_completion_postcopy_end();
2341
68b88468 2342 qemu_mutex_lock_iothread();
89a02a9f 2343 qemu_savevm_state_complete_postcopy(s->to_dst_file);
68b88468
EGE
2344 qemu_mutex_unlock_iothread();
2345
6621883f
PX
2346 /*
2347 * Shutdown the postcopy fast path thread. This is only needed
2348 * when dest QEMU binary is old (7.1/7.2). QEMU 8.0+ doesn't need
2349 * this.
2350 */
2351 if (migrate_postcopy_preempt() && s->preempt_pre_7_2) {
36f62f11
PX
2352 postcopy_preempt_shutdown_file(s);
2353 }
2354
b10ac0c4 2355 trace_migration_completion_postcopy_end_after_complete();
444252b9 2356 } else {
6ba11211 2357 goto fail;
09f6c85e 2358 }
09f6c85e 2359
d50f5dc0
FR
2360 if (await_return_path_close_on_source(s)) {
2361 goto fail;
09f6c85e
DDAG
2362 }
2363
89a02a9f 2364 if (qemu_file_get_error(s->to_dst_file)) {
09f6c85e 2365 trace_migration_completion_file_err();
6dab4c93 2366 goto fail;
09f6c85e
DDAG
2367 }
2368
5e804644 2369 if (migrate_colo() && s->state == MIGRATION_STATUS_ACTIVE) {
eeeb48ee
ZC
2370 /* COLO does not support postcopy */
2371 migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
2372 MIGRATION_STATUS_COLO);
2373 } else {
0b827d5e
HZ
2374 migrate_set_state(&s->state, current_active_state,
2375 MIGRATION_STATUS_COMPLETED);
2376 }
2377
09f6c85e
DDAG
2378 return;
2379
6dab4c93
EB
2380fail:
2381 if (s->block_inactive && (s->state == MIGRATION_STATUS_ACTIVE ||
2382 s->state == MIGRATION_STATUS_DEVICE)) {
2383 /*
2384 * If not doing postcopy, vm_start() will be called: let's
2385 * regain control on images.
2386 */
fe904ea8
GK
2387 Error *local_err = NULL;
2388
1d2acc31 2389 qemu_mutex_lock_iothread();
3b717194 2390 bdrv_activate_all(&local_err);
fe904ea8
GK
2391 if (local_err) {
2392 error_report_err(local_err);
1d2acc31
HZ
2393 } else {
2394 s->block_inactive = false;
fe904ea8 2395 }
1d2acc31 2396 qemu_mutex_unlock_iothread();
fe904ea8
GK
2397 }
2398
48781e5b
HZ
2399 migrate_set_state(&s->state, current_active_state,
2400 MIGRATION_STATUS_FAILED);
09f6c85e
DDAG
2401}
2402
8518278a
AG
2403/**
2404 * bg_migration_completion: Used by bg_migration_thread when after all the
2405 * RAM has been saved. The caller 'breaks' the loop when this returns.
2406 *
2407 * @s: Current migration state
2408 */
2409static void bg_migration_completion(MigrationState *s)
2410{
2411 int current_active_state = s->state;
2412
8518278a
AG
2413 if (s->state == MIGRATION_STATUS_ACTIVE) {
2414 /*
2415 * By this moment we have RAM content saved into the migration stream.
2416 * The next step is to flush the non-RAM content (device state)
2417 * right after the ram content. The device state has been stored into
2418 * the temporary buffer before RAM saving started.
2419 */
2420 qemu_put_buffer(s->to_dst_file, s->bioc->data, s->bioc->usage);
2421 qemu_fflush(s->to_dst_file);
2422 } else if (s->state == MIGRATION_STATUS_CANCELLING) {
2423 goto fail;
2424 }
2425
2426 if (qemu_file_get_error(s->to_dst_file)) {
2427 trace_migration_completion_file_err();
2428 goto fail;
2429 }
2430
2431 migrate_set_state(&s->state, current_active_state,
2432 MIGRATION_STATUS_COMPLETED);
2433 return;
2434
2435fail:
2436 migrate_set_state(&s->state, current_active_state,
2437 MIGRATION_STATUS_FAILED);
2438}
2439
b23c2ade
PX
/* Outcome of error detection in the migration thread */
typedef enum MigThrError {
    MIG_THR_ERR_NONE = 0,       /* No error detected */
    MIG_THR_ERR_RECOVERED = 1,  /* Detected error, but resumed successfully */
    MIG_THR_ERR_FATAL = 2,      /* Detected fatal error, need to exit */
} MigThrError;
2448
94190696
PX
2449static int postcopy_resume_handshake(MigrationState *s)
2450{
2451 qemu_savevm_send_postcopy_resume(s->to_dst_file);
2452
2453 while (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
2454 qemu_sem_wait(&s->rp_state.rp_sem);
2455 }
2456
2457 if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
2458 return 0;
2459 }
2460
2461 return -1;
2462}
2463
135b87b4
PX
2464/* Return zero if success, or <0 for error */
2465static int postcopy_do_resume(MigrationState *s)
2466{
d1b8eadb
PX
2467 int ret;
2468
2469 /*
2470 * Call all the resume_prepare() hooks, so that modules can be
2471 * ready for the migration resume.
2472 */
2473 ret = qemu_savevm_state_resume_prepare(s);
2474 if (ret) {
2475 error_report("%s: resume_prepare() failure detected: %d",
2476 __func__, ret);
2477 return ret;
2478 }
2479
5655aab0
PX
2480 /*
2481 * If preempt is enabled, re-establish the preempt channel. Note that
2482 * we do it after resume prepare to make sure the main channel will be
2483 * created before the preempt channel. E.g. with weak network, the
2484 * dest QEMU may get messed up with the preempt and main channels on
2485 * the order of connection setup. This guarantees the correct order.
2486 */
2487 ret = postcopy_preempt_establish_channel(s);
2488 if (ret) {
2489 error_report("%s: postcopy_preempt_establish_channel(): %d",
2490 __func__, ret);
2491 return ret;
2492 }
2493
d1b8eadb 2494 /*
94190696
PX
2495 * Last handshake with destination on the resume (destination will
2496 * switch to postcopy-active afterwards)
d1b8eadb 2497 */
94190696
PX
2498 ret = postcopy_resume_handshake(s);
2499 if (ret) {
2500 error_report("%s: handshake failed: %d", __func__, ret);
2501 return ret;
2502 }
d1b8eadb 2503
135b87b4
PX
2504 return 0;
2505}
2506
b23c2ade
PX
2507/*
2508 * We don't return until we are in a safe state to continue current
2509 * postcopy migration. Returns MIG_THR_ERR_RECOVERED if recovered, or
2510 * MIG_THR_ERR_FATAL if unrecovery failure happened.
2511 */
2512static MigThrError postcopy_pause(MigrationState *s)
2513{
2514 assert(s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
b23c2ade 2515
135b87b4 2516 while (true) {
62df066f
PX
2517 QEMUFile *file;
2518
39675fff
PX
2519 /*
2520 * Current channel is possibly broken. Release it. Note that this is
2521 * guaranteed even without lock because to_dst_file should only be
2522 * modified by the migration thread. That also guarantees that the
2523 * unregister of yank is safe too without the lock. It should be safe
2524 * even to be within the qemu_file_lock, but we didn't do that to avoid
2525 * taking more mutex (yank_lock) within qemu_file_lock. TL;DR: we make
2526 * the qemu_file_lock critical section as small as possible.
2527 */
135b87b4 2528 assert(s->to_dst_file);
39675fff 2529 migration_ioc_unregister_yank_from_file(s->to_dst_file);
62df066f
PX
2530 qemu_mutex_lock(&s->qemu_file_lock);
2531 file = s->to_dst_file;
135b87b4 2532 s->to_dst_file = NULL;
62df066f
PX
2533 qemu_mutex_unlock(&s->qemu_file_lock);
2534
2535 qemu_file_shutdown(file);
2536 qemu_fclose(file);
b23c2ade 2537
ef796ee9
FR
2538 /*
2539 * We're already pausing, so ignore any errors on the return
2540 * path and just wait for the thread to finish. It will be
2541 * re-created when we resume.
2542 */
2543 await_return_path_close_on_source(s);
2544
d246ea50
PX
2545 migrate_set_state(&s->state, s->state,
2546 MIGRATION_STATUS_POSTCOPY_PAUSED);
2547
135b87b4
PX
2548 error_report("Detected IO failure for postcopy. "
2549 "Migration paused.");
b23c2ade 2550
135b87b4
PX
2551 /*
2552 * We wait until things fixed up. Then someone will setup the
2553 * status back for us.
2554 */
2555 while (s->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
2556 qemu_sem_wait(&s->postcopy_pause_sem);
2557 }
2558
2559 if (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
2560 /* Woken up by a recover procedure. Give it a shot */
b23c2ade 2561
135b87b4
PX
2562 /* Do the resume logic */
2563 if (postcopy_do_resume(s) == 0) {
2564 /* Let's continue! */
2565 trace_postcopy_pause_continued();
2566 return MIG_THR_ERR_RECOVERED;
2567 } else {
2568 /*
2569 * Something wrong happened during the recovery, let's
2570 * pause again. Pause is always better than throwing
2571 * data away.
2572 */
2573 continue;
2574 }
2575 } else {
2576 /* This is not right... Time to quit. */
2577 return MIG_THR_ERR_FATAL;
2578 }
2579 }
b23c2ade
PX
2580}
2581
2582static MigThrError migration_detect_error(MigrationState *s)
2583{
2584 int ret;
c3c5eae6 2585 int state = s->state;
3d661c8a 2586 Error *local_error = NULL;
c3c5eae6
DDAG
2587
2588 if (state == MIGRATION_STATUS_CANCELLING ||
2589 state == MIGRATION_STATUS_CANCELLED) {
2590 /* End the migration, but don't set the state to failed */
2591 return MIG_THR_ERR_FATAL;
2592 }
b23c2ade 2593
60bb3c58
PX
2594 /*
2595 * Try to detect any file errors. Note that postcopy_qemufile_src will
2596 * be NULL when postcopy preempt is not enabled.
2597 */
2598 ret = qemu_file_get_error_obj_any(s->to_dst_file,
2599 s->postcopy_qemufile_src,
2600 &local_error);
b23c2ade
PX
2601 if (!ret) {
2602 /* Everything is fine */
3d661c8a 2603 assert(!local_error);
b23c2ade
PX
2604 return MIG_THR_ERR_NONE;
2605 }
2606
3d661c8a
YK
2607 if (local_error) {
2608 migrate_set_error(s, local_error);
2609 error_free(local_error);
2610 }
2611
d5c8f2af 2612 if (state == MIGRATION_STATUS_POSTCOPY_ACTIVE && ret) {
b23c2ade
PX
2613 /*
2614 * For postcopy, we allow the network to be down for a
2615 * while. After that, it can be continued by a
2616 * recovery phase.
2617 */
2618 return postcopy_pause(s);
2619 } else {
2620 /*
2621 * For precopy (or postcopy with error outside IO), we fail
2622 * with no time.
2623 */
c3c5eae6 2624 migrate_set_state(&s->state, state, MIGRATION_STATUS_FAILED);
b23c2ade
PX
2625 trace_migration_thread_file_err();
2626
2627 /* Time to stop the migration, now. */
2628 return MIG_THR_ERR_FATAL;
2629 }
2630}
2631
cf011f08
PX
2632static void migration_calculate_complete(MigrationState *s)
2633{
99319e2d 2634 uint64_t bytes = migration_transferred_bytes(s->to_dst_file);
cf011f08 2635 int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
6cde6fbe 2636 int64_t transfer_time;
cf011f08
PX
2637
2638 s->total_time = end_time - s->start_time;
2639 if (!s->downtime) {
2640 /*
2641 * It's still not set, so we are precopy migration. For
2642 * postcopy, downtime is calculated during postcopy_start().
2643 */
2644 s->downtime = end_time - s->downtime_start;
2645 }
2646
6cde6fbe
JQ
2647 transfer_time = s->total_time - s->setup_time;
2648 if (transfer_time) {
2649 s->mbps = ((double) bytes * 8.0) / transfer_time / 1000;
cf011f08
PX
2650 }
2651}
2652
87f3bd87
IR
2653static void update_iteration_initial_status(MigrationState *s)
2654{
2655 /*
2656 * Update these three fields at the same time to avoid mismatch info lead
2657 * wrong speed calculation.
2658 */
2659 s->iteration_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
99319e2d 2660 s->iteration_initial_bytes = migration_transferred_bytes(s->to_dst_file);
87f3bd87
IR
2661 s->iteration_initial_pages = ram_get_total_transferred_pages();
2662}
2663
b15df1ae
PX
2664static void migration_update_counters(MigrationState *s,
2665 int64_t current_time)
2666{
aecbfe9c 2667 uint64_t transferred, transferred_pages, time_spent;
0c8f0efd 2668 uint64_t current_bytes; /* bytes transferred since the beginning */
b15df1ae
PX
2669 double bandwidth;
2670
2671 if (current_time < s->iteration_start_time + BUFFER_DELAY) {
2672 return;
2673 }
2674
99319e2d 2675 current_bytes = migration_transferred_bytes(s->to_dst_file);
0c8f0efd 2676 transferred = current_bytes - s->iteration_initial_bytes;
b15df1ae
PX
2677 time_spent = current_time - s->iteration_start_time;
2678 bandwidth = (double)transferred / time_spent;
f5da8ba4 2679 s->threshold_size = bandwidth * migrate_downtime_limit();
b15df1ae
PX
2680
2681 s->mbps = (((double) transferred * 8.0) /
2682 ((double) time_spent / 1000.0)) / 1000.0 / 1000.0;
2683
aecbfe9c
XG
2684 transferred_pages = ram_get_total_transferred_pages() -
2685 s->iteration_initial_pages;
2686 s->pages_per_second = (double) transferred_pages /
2687 (((double) time_spent / 1000.0));
2688
b15df1ae
PX
2689 /*
2690 * if we haven't sent anything, we don't want to
2691 * recalculate. 10000 is a small enough number for our purposes
2692 */
aff3f660 2693 if (stat64_get(&mig_stats.dirty_pages_rate) &&
72f8e587 2694 transferred > 10000) {
73208a33 2695 s->expected_downtime =
aff3f660 2696 stat64_get(&mig_stats.dirty_bytes_last_sync) / bandwidth;
b15df1ae
PX
2697 }
2698
813cd616 2699 migration_rate_reset(s->to_dst_file);
b15df1ae 2700
87f3bd87 2701 update_iteration_initial_status(s);
b15df1ae
PX
2702
2703 trace_migrate_transferred(transferred, time_spent,
0781c1ed 2704 bandwidth, s->threshold_size);
b15df1ae
PX
2705}
2706
1b4adb10
AH
2707static bool migration_can_switchover(MigrationState *s)
2708{
2709 if (!migrate_switchover_ack()) {
2710 return true;
2711 }
2712
2713 /* No reason to wait for switchover ACK if VM is stopped */
2714 if (!runstate_is_running()) {
2715 return true;
2716 }
2717
2718 return s->switchover_acked;
2719}
2720
2ad87305
PX
/* What the migration thread should do after one iteration */
typedef enum {
    /* Resume current iteration */
    MIG_ITERATE_RESUME,
    /* Skip current iteration */
    MIG_ITERATE_SKIP,
    /* Break the loop */
    MIG_ITERATE_BREAK,
} MigIterateState;
2727
2728/*
2729 * Return true if continue to the next iteration directly, false
2730 * otherwise.
2731 */
2732static MigIterateState migration_iteration_run(MigrationState *s)
2733{
24beea4e 2734 uint64_t must_precopy, can_postcopy;
908927db 2735 Error *local_err = NULL;
2ad87305 2736 bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
1b4adb10 2737 bool can_switchover = migration_can_switchover(s);
2ad87305 2738
24beea4e
JQ
2739 qemu_savevm_state_pending_estimate(&must_precopy, &can_postcopy);
2740 uint64_t pending_size = must_precopy + can_postcopy;
2ad87305 2741
24beea4e 2742 trace_migrate_pending_estimate(pending_size, must_precopy, can_postcopy);
c8df4a7a 2743
24beea4e
JQ
2744 if (must_precopy <= s->threshold_size) {
2745 qemu_savevm_state_pending_exact(&must_precopy, &can_postcopy);
2746 pending_size = must_precopy + can_postcopy;
2747 trace_migrate_pending_exact(pending_size, must_precopy, can_postcopy);
c8df4a7a 2748 }
2ad87305 2749
1b4adb10 2750 if ((!pending_size || pending_size < s->threshold_size) && can_switchover) {
2ad87305
PX
2751 trace_migration_thread_low_pending(pending_size);
2752 migration_completion(s);
2753 return MIG_ITERATE_BREAK;
2754 }
2755
d9df9292 2756 /* Still a significant amount to transfer */
1b4adb10 2757 if (!in_postcopy && must_precopy <= s->threshold_size && can_switchover &&
d9df9292 2758 qatomic_read(&s->start_postcopy)) {
908927db
TG
2759 if (postcopy_start(s, &local_err)) {
2760 migrate_set_error(s, local_err);
2761 error_report_err(local_err);
d9df9292
JQ
2762 }
2763 return MIG_ITERATE_SKIP;
2764 }
2765
2766 /* Just another iteration step */
2767 qemu_savevm_state_iterate(s->to_dst_file, in_postcopy);
2ad87305
PX
2768 return MIG_ITERATE_RESUME;
2769}
2770
199aa6d4
PX
2771static void migration_iteration_finish(MigrationState *s)
2772{
2773 /* If we enabled cpu throttling for auto-converge, turn it off. */
2774 cpu_throttle_stop();
2775
2776 qemu_mutex_lock_iothread();
2777 switch (s->state) {
2778 case MIGRATION_STATUS_COMPLETED:
2779 migration_calculate_complete(s);
2780 runstate_set(RUN_STATE_POSTMIGRATE);
2781 break;
751fe4c6 2782 case MIGRATION_STATUS_COLO:
d70178a8 2783 assert(migrate_colo());
199aa6d4 2784 migrate_start_colo_process(s);
f4584076 2785 s->vm_old_state = RUN_STATE_RUNNING;
199aa6d4
PX
2786 /* Fallthrough */
2787 case MIGRATION_STATUS_FAILED:
2788 case MIGRATION_STATUS_CANCELLED:
57225e5f 2789 case MIGRATION_STATUS_CANCELLING:
f4584076 2790 if (s->vm_old_state == RUN_STATE_RUNNING) {
aa505f8e
RL
2791 if (!runstate_check(RUN_STATE_SHUTDOWN)) {
2792 vm_start();
2793 }
199aa6d4
PX
2794 } else {
2795 if (runstate_check(RUN_STATE_FINISH_MIGRATE)) {
a4c6275a 2796 runstate_set(s->vm_old_state);
199aa6d4
PX
2797 }
2798 }
2799 break;
2800
2801 default:
2802 /* Should not reach here, but if so, forgive the VM. */
2803 error_report("%s: Unknown ending state %d", __func__, s->state);
2804 break;
2805 }
fd392cfa 2806 migrate_fd_cleanup_schedule(s);
199aa6d4
PX
2807 qemu_mutex_unlock_iothread();
2808}
2809
8518278a
AG
2810static void bg_migration_iteration_finish(MigrationState *s)
2811{
3a8b81f2
FE
2812 /*
2813 * Stop tracking RAM writes - un-protect memory, un-register UFFD
2814 * memory ranges, flush kernel wait queues and wake up threads
2815 * waiting for write fault to be resolved.
2816 */
2817 ram_write_tracking_stop();
2818
8518278a
AG
2819 qemu_mutex_lock_iothread();
2820 switch (s->state) {
2821 case MIGRATION_STATUS_COMPLETED:
2822 migration_calculate_complete(s);
2823 break;
2824
2825 case MIGRATION_STATUS_ACTIVE:
2826 case MIGRATION_STATUS_FAILED:
2827 case MIGRATION_STATUS_CANCELLED:
2828 case MIGRATION_STATUS_CANCELLING:
2829 break;
2830
2831 default:
2832 /* Should not reach here, but if so, forgive the VM. */
2833 error_report("%s: Unknown ending state %d", __func__, s->state);
2834 break;
2835 }
2836
2837 migrate_fd_cleanup_schedule(s);
2838 qemu_mutex_unlock_iothread();
2839}
2840
2841/*
2842 * Return true if continue to the next iteration directly, false
2843 * otherwise.
2844 */
2845static MigIterateState bg_migration_iteration_run(MigrationState *s)
2846{
2847 int res;
2848
2849 res = qemu_savevm_state_iterate(s->to_dst_file, false);
2850 if (res > 0) {
2851 bg_migration_completion(s);
2852 return MIG_ITERATE_BREAK;
2853 }
2854
2855 return MIG_ITERATE_RESUME;
2856}
2857
ad767bed
DDAG
2858void migration_make_urgent_request(void)
2859{
2860 qemu_sem_post(&migrate_get_current()->rate_limit_sem);
2861}
2862
2863void migration_consume_urgent_request(void)
2864{
2865 qemu_sem_wait(&migrate_get_current()->rate_limit_sem);
2866}
2867
97e1e067
DDAG
2868/* Returns true if the rate limiting was broken by an urgent request */
2869bool migration_rate_limit(void)
2870{
2871 int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
2872 MigrationState *s = migrate_get_current();
2873
2874 bool urgent = false;
2875 migration_update_counters(s, now);
e1fde0e0 2876 if (migration_rate_exceeded(s->to_dst_file)) {
77386127
LS
2877
2878 if (qemu_file_get_error(s->to_dst_file)) {
2879 return false;
2880 }
97e1e067
DDAG
2881 /*
2882 * Wait for a delay to do rate limiting OR
2883 * something urgent to post the semaphore.
2884 */
2885 int ms = s->iteration_start_time + BUFFER_DELAY - now;
2886 trace_migration_rate_limit_pre(ms);
2887 if (qemu_sem_timedwait(&s->rate_limit_sem, ms) == 0) {
2888 /*
2889 * We were woken by one or more urgent things but
2890 * the timedwait will have consumed one of them.
2891 * The service routine for the urgent wake will dec
2892 * the semaphore itself for each item it consumes,
2893 * so add this one we just eat back.
2894 */
2895 qemu_sem_post(&s->rate_limit_sem);
2896 urgent = true;
2897 }
2898 trace_migration_rate_limit_post(urgent);
2899 }
2900 return urgent;
2901}
2902
fde93d99
LV
2903/*
2904 * if failover devices are present, wait they are completely
2905 * unplugged
2906 */
2907
2908static void qemu_savevm_wait_unplug(MigrationState *s, int old_state,
2909 int new_state)
2910{
2911 if (qemu_savevm_state_guest_unplug_pending()) {
2912 migrate_set_state(&s->state, old_state, MIGRATION_STATUS_WAIT_UNPLUG);
2913
2914 while (s->state == MIGRATION_STATUS_WAIT_UNPLUG &&
2915 qemu_savevm_state_guest_unplug_pending()) {
2916 qemu_sem_timedwait(&s->wait_unplug_sem, 250);
2917 }
944bc528
LV
2918 if (s->state != MIGRATION_STATUS_WAIT_UNPLUG) {
2919 int timeout = 120; /* 30 seconds */
2920 /*
2921 * migration has been canceled
2922 * but as we have started an unplug we must wait the end
2923 * to be able to plug back the card
2924 */
2925 while (timeout-- && qemu_savevm_state_guest_unplug_pending()) {
2926 qemu_sem_timedwait(&s->wait_unplug_sem, 250);
2927 }
1b529d90
LV
2928 if (qemu_savevm_state_guest_unplug_pending() &&
2929 !qtest_enabled()) {
a51dcef0
LV
2930 warn_report("migration: partially unplugged device on "
2931 "failure");
2932 }
944bc528 2933 }
fde93d99
LV
2934
2935 migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG, new_state);
2936 } else {
2937 migrate_set_state(&s->state, old_state, new_state);
2938 }
2939}
2940
70b20477
DDAG
2941/*
2942 * Master migration thread on the source VM.
2943 * It drives the migration and pumps the data down the outgoing channel.
2944 */
5f496a1b 2945static void *migration_thread(void *opaque)
0d82d0e8 2946{
9848a404 2947 MigrationState *s = opaque;
1b1f4ab6 2948 MigrationThread *thread = NULL;
bc72ad67 2949 int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
b23c2ade 2950 MigThrError thr_error;
ad767bed 2951 bool urgent = false;
76f5933a 2952
788fa680 2953 thread = migration_threads_add("live_migration", qemu_get_thread_id());
1b1f4ab6 2954
ab28bd23
PB
2955 rcu_register_thread();
2956
892ae715 2957 object_ref(OBJECT(s));
87f3bd87 2958 update_iteration_initial_status(s);
b15df1ae 2959
89a02a9f 2960 qemu_savevm_state_header(s->to_dst_file);
1d34e4bf 2961
62a02658
PX
2962 /*
2963 * If we opened the return path, we need to make sure dst has it
2964 * opened as well.
2965 */
43044ac0 2966 if (s->rp_state.rp_thread_created) {
1d34e4bf 2967 /* Now tell the dest that it should open its end so it can reply */
89a02a9f 2968 qemu_savevm_send_open_return_path(s->to_dst_file);
1d34e4bf
DDAG
2969
2970 /* And do a ping that will make stuff easier to debug */
89a02a9f 2971 qemu_savevm_send_ping(s->to_dst_file, 1);
0425dc97 2972 }
1d34e4bf 2973
58110f0a 2974 if (migrate_postcopy()) {
1d34e4bf
DDAG
2975 /*
2976 * Tell the destination that we *might* want to do postcopy later;
2977 * if the other end can't do postcopy it should fail now, nice and
2978 * early.
2979 */
89a02a9f 2980 qemu_savevm_send_postcopy_advise(s->to_dst_file);
1d34e4bf
DDAG
2981 }
2982
5e804644 2983 if (migrate_colo()) {
aad555c2
ZC
2984 /* Notify migration destination that we enable COLO */
2985 qemu_savevm_send_colo_enable(s->to_dst_file);
2986 }
2987
9907e842 2988 qemu_savevm_state_setup(s->to_dst_file);
0d82d0e8 2989
fde93d99
LV
2990 qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
2991 MIGRATION_STATUS_ACTIVE);
c7e0acd5 2992
bc72ad67 2993 s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
29ae8a41 2994
9ec055ae
DDAG
2995 trace_migration_thread_setup_complete();
2996
8f8d528e 2997 while (migration_is_active(s)) {
e1fde0e0 2998 if (urgent || !migration_rate_exceeded(s->to_dst_file)) {
2ad87305
PX
2999 MigIterateState iter_state = migration_iteration_run(s);
3000 if (iter_state == MIG_ITERATE_SKIP) {
3001 continue;
3002 } else if (iter_state == MIG_ITERATE_BREAK) {
09f6c85e 3003 break;
c369f40d
JQ
3004 }
3005 }
f4410a5d 3006
b23c2ade
PX
3007 /*
3008 * Try to detect any kind of failures, and see whether we
3009 * should stop the migration now.
3010 */
3011 thr_error = migration_detect_error(s);
3012 if (thr_error == MIG_THR_ERR_FATAL) {
3013 /* Stop migration */
fd45ee2c 3014 break;
b23c2ade
PX
3015 } else if (thr_error == MIG_THR_ERR_RECOVERED) {
3016 /*
3017 * Just recovered from a e.g. network failure, reset all
3018 * the local variables. This is important to avoid
3019 * breaking transferred_bytes and bandwidth calculation
3020 */
87f3bd87 3021 update_iteration_initial_status(s);
fd45ee2c 3022 }
b15df1ae 3023
97e1e067 3024 urgent = migration_rate_limit();
a3fa1d78
PB
3025 }
3026
1d34e4bf 3027 trace_migration_thread_after_loop();
199aa6d4 3028 migration_iteration_finish(s);
892ae715 3029 object_unref(OBJECT(s));
ab28bd23 3030 rcu_unregister_thread();
788fa680 3031 migration_threads_remove(thread);
0d82d0e8
JQ
3032 return NULL;
3033}
3034
8518278a
AG
3035static void bg_migration_vm_start_bh(void *opaque)
3036{
3037 MigrationState *s = opaque;
3038
3039 qemu_bh_delete(s->vm_start_bh);
3040 s->vm_start_bh = NULL;
3041
3042 vm_start();
3043 s->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - s->downtime_start;
3044}
3045
3046/**
3047 * Background snapshot thread, based on live migration code.
3048 * This is an alternative implementation of live migration mechanism
3049 * introduced specifically to support background snapshots.
3050 *
3051 * It takes advantage of userfault_fd write protection mechanism introduced
3052 * in v5.7 kernel. Compared to existing dirty page logging migration much
3053 * lesser stream traffic is produced resulting in smaller snapshot images,
3054 * simply cause of no page duplicates can get into the stream.
3055 *
3056 * Another key point is that generated vmstate stream reflects machine state
3057 * 'frozen' at the beginning of snapshot creation compared to dirty page logging
3058 * mechanism, which effectively results in that saved snapshot is the state of VM
3059 * at the end of the process.
3060 */
3061static void *bg_migration_thread(void *opaque)
3062{
3063 MigrationState *s = opaque;
3064 int64_t setup_start;
3065 MigThrError thr_error;
3066 QEMUFile *fb;
3067 bool early_fail = true;
3068
3069 rcu_register_thread();
3070 object_ref(OBJECT(s));
3071
e1fde0e0 3072 migration_rate_set(RATE_LIMIT_DISABLED);
8518278a
AG
3073
3074 setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
3075 /*
3076 * We want to save vmstate for the moment when migration has been
3077 * initiated but also we want to save RAM content while VM is running.
3078 * The RAM content should appear first in the vmstate. So, we first
3079 * stash the non-RAM part of the vmstate to the temporary buffer,
3080 * then write RAM part of the vmstate to the migration stream
3081 * with vCPUs running and, finally, write stashed non-RAM part of
3082 * the vmstate from the buffer to the migration stream.
3083 */
ecb23efe 3084 s->bioc = qio_channel_buffer_new(512 * 1024);
8518278a 3085 qio_channel_set_name(QIO_CHANNEL(s->bioc), "vmstate-buffer");
77ef2dc1 3086 fb = qemu_file_new_output(QIO_CHANNEL(s->bioc));
8518278a
AG
3087 object_unref(OBJECT(s->bioc));
3088
3089 update_iteration_initial_status(s);
3090
eeccb99c
AG
3091 /*
3092 * Prepare for tracking memory writes with UFFD-WP - populate
3093 * RAM pages before protecting.
3094 */
3095#ifdef __linux__
3096 ram_write_tracking_prepare();
3097#endif
3098
8518278a
AG
3099 qemu_savevm_state_header(s->to_dst_file);
3100 qemu_savevm_state_setup(s->to_dst_file);
3101
fde93d99
LV
3102 qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
3103 MIGRATION_STATUS_ACTIVE);
8518278a 3104
8518278a
AG
3105 s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
3106
3107 trace_migration_thread_setup_complete();
3108 s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3109
3110 qemu_mutex_lock_iothread();
3111
3112 /*
3113 * If VM is currently in suspended state, then, to make a valid runstate
3114 * transition in vm_stop_force_state() we need to wakeup it up.
3115 */
3116 qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
f4584076 3117 s->vm_old_state = runstate_get();
8518278a 3118
c33f1829 3119 global_state_store();
8518278a
AG
3120 /* Forcibly stop VM before saving state of vCPUs and devices */
3121 if (vm_stop_force_state(RUN_STATE_PAUSED)) {
3122 goto fail;
3123 }
3124 /*
3125 * Put vCPUs in sync with shadow context structures, then
3126 * save their state to channel-buffer along with devices.
3127 */
3128 cpu_synchronize_all_states();
3129 if (qemu_savevm_state_complete_precopy_non_iterable(fb, false, false)) {
3130 goto fail;
3131 }
ecb23efe
AG
3132 /*
3133 * Since we are going to get non-iterable state data directly
3134 * from s->bioc->data, explicit flush is needed here.
3135 */
3136 qemu_fflush(fb);
3137
8518278a
AG
3138 /* Now initialize UFFD context and start tracking RAM writes */
3139 if (ram_write_tracking_start()) {
3140 goto fail;
3141 }
3142 early_fail = false;
3143
3144 /*
3145 * Start VM from BH handler to avoid write-fault lock here.
3146 * UFFD-WP protection for the whole RAM is already enabled so
3147 * calling VM state change notifiers from vm_start() would initiate
3148 * writes to virtio VQs memory which is in write-protected region.
3149 */
3150 s->vm_start_bh = qemu_bh_new(bg_migration_vm_start_bh, s);
3151 qemu_bh_schedule(s->vm_start_bh);
3152
3153 qemu_mutex_unlock_iothread();
3154
3155 while (migration_is_active(s)) {
3156 MigIterateState iter_state = bg_migration_iteration_run(s);
3157 if (iter_state == MIG_ITERATE_SKIP) {
3158 continue;
3159 } else if (iter_state == MIG_ITERATE_BREAK) {
3160 break;
3161 }
3162
3163 /*
3164 * Try to detect any kind of failures, and see whether we
3165 * should stop the migration now.
3166 */
3167 thr_error = migration_detect_error(s);
3168 if (thr_error == MIG_THR_ERR_FATAL) {
3169 /* Stop migration */
3170 break;
3171 }
3172
3173 migration_update_counters(s, qemu_clock_get_ms(QEMU_CLOCK_REALTIME));
3174 }
3175
3176 trace_migration_thread_after_loop();
3177
3178fail:
3179 if (early_fail) {
3180 migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
3181 MIGRATION_STATUS_FAILED);
3182 qemu_mutex_unlock_iothread();
3183 }
3184
3185 bg_migration_iteration_finish(s);
3186
3187 qemu_fclose(fb);
3188 object_unref(OBJECT(s));
3189 rcu_unregister_thread();
3190
3191 return NULL;
3192}
3193
cce8040b 3194void migrate_fd_connect(MigrationState *s, Error *error_in)
0d82d0e8 3195{
00f4b572 3196 Error *local_err = NULL;
52033349 3197 uint64_t rate_limit;
d3e35b8f
PX
3198 bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;
3199
ca7bd082
PX
3200 /*
3201 * If there's a previous error, free it and prepare for another one.
3202 * Meanwhile if migration completes successfully, there won't have an error
3203 * dumped when calling migrate_fd_cleanup().
3204 */
3205 migrate_error_free(s);
3206
f5da8ba4 3207 s->expected_downtime = migrate_downtime_limit();
9cbc3649
MAL
3208 if (resume) {
3209 assert(s->cleanup_bh);
3210 } else {
3211 assert(!s->cleanup_bh);
3212 s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup_bh, s);
3213 }
cce8040b
DDAG
3214 if (error_in) {
3215 migrate_fd_error(s, error_in);
ca30f24d
PX
3216 if (resume) {
3217 /*
3218 * Don't do cleanup for resume if channel is invalid, but only dump
3219 * the error. We wait for another channel connect from the user.
3220 * The error_report still gives HMP user a hint on what failed.
3221 * It's normally done in migrate_fd_cleanup(), but call it here
3222 * explicitly.
3223 */
3224 error_report_err(error_copy(s->error));
3225 } else {
3226 migrate_fd_cleanup(s);
3227 }
cce8040b
DDAG
3228 return;
3229 }
0d82d0e8 3230
d3e35b8f
PX
3231 if (resume) {
3232 /* This is a resumed migration */
9d3ebbe2 3233 rate_limit = migrate_max_postcopy_bandwidth();
d3e35b8f
PX
3234 } else {
3235 /* This is a fresh new migration */
9d3ebbe2 3236 rate_limit = migrate_max_bandwidth();
442773ce 3237
d3e35b8f
PX
3238 /* Notify before starting migration thread */
3239 notifier_list_notify(&migration_state_notifiers, s);
3240 }
3241
e1fde0e0 3242 migration_rate_set(rate_limit);
d3e35b8f 3243 qemu_file_set_blocking(s->to_dst_file, true);
9287ac27 3244
1d34e4bf 3245 /*
c788ada8
PX
3246 * Open the return path. For postcopy, it is used exclusively. For
3247 * precopy, only if user specified "return-path" capability would
3248 * QEMU uses the return path.
1d34e4bf 3249 */
38ad1110 3250 if (migrate_postcopy_ram() || migrate_return_path()) {
ef796ee9 3251 if (open_return_path_on_source(s)) {
908927db 3252 error_setg(&local_err, "Unable to open return-path for postcopy");
d3e35b8f 3253 migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
908927db
TG
3254 migrate_set_error(s, local_err);
3255 error_report_err(local_err);
1d34e4bf
DDAG
3256 migrate_fd_cleanup(s);
3257 return;
3258 }
3259 }
3260
06064a67
PX
3261 /*
3262 * This needs to be done before resuming a postcopy. Note: for newer
3263 * QEMUs we will delay the channel creation until postcopy_start(), to
3264 * avoid disorder of channel creations.
3265 */
3266 if (migrate_postcopy_preempt() && s->preempt_pre_7_2) {
3267 postcopy_preempt_setup(s);
3268 }
3269
d3e35b8f 3270 if (resume) {
135b87b4
PX
3271 /* Wakeup the main migration thread to do the recovery */
3272 migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
3273 MIGRATION_STATUS_POSTCOPY_RECOVER);
3274 qemu_sem_post(&s->postcopy_pause_sem);
d3e35b8f
PX
3275 return;
3276 }
3277
00f4b572 3278 if (multifd_save_setup(&local_err) != 0) {
908927db 3279 migrate_set_error(s, local_err);
00f4b572 3280 error_report_err(local_err);
f986c3d2
JQ
3281 migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
3282 MIGRATION_STATUS_FAILED);
3283 migrate_fd_cleanup(s);
3284 return;
3285 }
8518278a
AG
3286
3287 if (migrate_background_snapshot()) {
3288 qemu_thread_create(&s->thread, "bg_snapshot",
3289 bg_migration_thread, s, QEMU_THREAD_JOINABLE);
3290 } else {
3291 qemu_thread_create(&s->thread, "live_migration",
3292 migration_thread, s, QEMU_THREAD_JOINABLE);
3293 }
1d34e4bf 3294 s->migration_thread_running = true;
0d82d0e8 3295}
093e3c42 3296
e5cb7e76
PX
3297static void migration_class_init(ObjectClass *klass, void *data)
3298{
3299 DeviceClass *dc = DEVICE_CLASS(klass);
3300
3301 dc->user_creatable = false;
4f67d30b 3302 device_class_set_props(dc, migration_properties);
e5cb7e76
PX
3303}
3304
b91bf5e4
MAL
3305static void migration_instance_finalize(Object *obj)
3306{
3307 MigrationState *ms = MIGRATION_OBJ(obj);
b91bf5e4 3308
87db1a7d 3309 qemu_mutex_destroy(&ms->error_mutex);
62df066f 3310 qemu_mutex_destroy(&ms->qemu_file_lock);
c7e0acd5 3311 qemu_sem_destroy(&ms->wait_unplug_sem);
ad767bed 3312 qemu_sem_destroy(&ms->rate_limit_sem);
e91d8951 3313 qemu_sem_destroy(&ms->pause_sem);
b23c2ade 3314 qemu_sem_destroy(&ms->postcopy_pause_sem);
edd090c7 3315 qemu_sem_destroy(&ms->rp_state.rp_sem);
b28fb582 3316 qemu_sem_destroy(&ms->rp_state.rp_pong_acks);
d0edb8a1 3317 qemu_sem_destroy(&ms->postcopy_qemufile_src_sem);
ab105cc1 3318 error_free(ms->error);
b91bf5e4
MAL
3319}
3320
e5cb7e76
PX
3321static void migration_instance_init(Object *obj)
3322{
3323 MigrationState *ms = MIGRATION_OBJ(obj);
3324
3325 ms->state = MIGRATION_STATUS_NONE;
e5cb7e76 3326 ms->mbps = -1;
aecbfe9c 3327 ms->pages_per_second = -1;
e91d8951 3328 qemu_sem_init(&ms->pause_sem, 0);
87db1a7d 3329 qemu_mutex_init(&ms->error_mutex);
8b0b29dc 3330
61a174e2 3331 migrate_params_init(&ms->parameters);
b23c2ade
PX
3332
3333 qemu_sem_init(&ms->postcopy_pause_sem, 0);
edd090c7 3334 qemu_sem_init(&ms->rp_state.rp_sem, 0);
b28fb582 3335 qemu_sem_init(&ms->rp_state.rp_pong_acks, 0);
ad767bed 3336 qemu_sem_init(&ms->rate_limit_sem, 0);
c7e0acd5 3337 qemu_sem_init(&ms->wait_unplug_sem, 0);
d0edb8a1 3338 qemu_sem_init(&ms->postcopy_qemufile_src_sem, 0);
62df066f 3339 qemu_mutex_init(&ms->qemu_file_lock);
8b0b29dc
PX
3340}
3341
3342/*
3343 * Return true if check pass, false otherwise. Error will be put
3344 * inside errp if provided.
3345 */
3346static bool migration_object_check(MigrationState *ms, Error **errp)
3347{
6b19a7d9 3348 /* Assuming all off */
b02c7fc9 3349 bool old_caps[MIGRATION_CAPABILITY__MAX] = { 0 };
6b19a7d9 3350
8b0b29dc
PX
3351 if (!migrate_params_check(&ms->parameters, errp)) {
3352 return false;
3353 }
3354
b02c7fc9 3355 return migrate_caps_check(old_caps, ms->capabilities, errp);
e5cb7e76
PX
3356}
3357
3358static const TypeInfo migration_type = {
3359 .name = TYPE_MIGRATION,
01f6e14c 3360 /*
c8d3ff38 3361 * NOTE: TYPE_MIGRATION is not really a device, as the object is
2194abd6 3362 * not created using qdev_new(), it is not attached to the qdev
c8d3ff38
PX
3363 * device tree, and it is never realized.
3364 *
3365 * TODO: Make this TYPE_OBJECT once QOM provides something like
3366 * TYPE_DEVICE's "-global" properties.
01f6e14c 3367 */
e5cb7e76
PX
3368 .parent = TYPE_DEVICE,
3369 .class_init = migration_class_init,
3370 .class_size = sizeof(MigrationClass),
3371 .instance_size = sizeof(MigrationState),
3372 .instance_init = migration_instance_init,
b91bf5e4 3373 .instance_finalize = migration_instance_finalize,
e5cb7e76
PX
3374};
3375
3376static void register_migration_types(void)
3377{
3378 type_register_static(&migration_type);
3379}
3380
3381type_init(register_migration_types);