From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Dietmar Maurer <dietmar@proxmox.com>
Date: Mon, 6 Apr 2020 12:16:46 +0200
Subject: [PATCH] PVE: internal snapshot async

Truncate at 1024 boundary (Fabian Ebner will send a patch for stable)

Put qemu_savevm_state_{header,setup} into the main loop and the rest
of the iteration into a coroutine. The former need to lock the
iothread (and we can't unlock it in the coroutine), and the latter
can't deal with being in a separate thread, so a coroutine it must
be.

Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
Signed-off-by: Dietmar Maurer <dietmar@proxmox.com>
Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
---
 Makefile.objs                |   1 +
 hmp-commands-info.hx         |  13 +
 hmp-commands.hx              |  32 +++
 include/block/aio.h          |  10 +
 include/migration/snapshot.h |   1 +
 include/monitor/hmp.h        |   5 +
 monitor/hmp-cmds.c           |  57 ++++
 qapi/migration.json          |  34 +++
 qapi/misc.json               |  32 +++
 qemu-options.hx              |  12 +
 savevm-async.c               | 542 +++++++++++++++++++++++++++++++++++
 softmmu/vl.c                 |  10 +
 util/async.c                 |  30 ++
 13 files changed, 779 insertions(+)
 create mode 100644 savevm-async.c

diff --git a/Makefile.objs b/Makefile.objs
index d22b3b45d7..a1307c12a8 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -46,6 +46,7 @@ common-obj-y += bootdevice.o iothread.o
 common-obj-y += dump/
 common-obj-y += job-qmp.o
 common-obj-y += monitor/
+common-obj-y += savevm-async.o
 common-obj-y += net/
 common-obj-y += qdev-monitor.o
 common-obj-$(CONFIG_WIN32) += os-win32.o
diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx
index 30209e3903..ae8ff21789 100644
--- a/hmp-commands-info.hx
+++ b/hmp-commands-info.hx
@@ -580,6 +580,19 @@ SRST
     Show current migration xbzrle cache size.
 ERST
 
+    {
+        .name       = "savevm",
+        .args_type  = "",
+        .params     = "",
+        .help       = "show savevm status",
+        .cmd = hmp_info_savevm,
+    },
+
+SRST
+  ``info savevm``
+    Show savevm status.
+ERST
+
     {
         .name       = "balloon",
         .args_type  = "",
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 60f395c276..2b58ac4a1c 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1829,3 +1829,35 @@ ERST
         .flags      = "p",
     },
 
+
+    {
+        .name       = "savevm-start",
+        .args_type  = "statefile:s?",
+        .params     = "[statefile]",
+        .help       = "Prepare for snapshot and halt VM. Save VM state to statefile.",
+        .cmd = hmp_savevm_start,
+    },
+
+    {
+        .name       = "snapshot-drive",
+        .args_type  = "device:s,name:s",
+        .params     = "device name",
+        .help       = "Create internal snapshot.",
+        .cmd = hmp_snapshot_drive,
+    },
+
+    {
+        .name       = "delete-drive-snapshot",
+        .args_type  = "device:s,name:s",
+        .params     = "device name",
+        .help       = "Delete internal snapshot.",
+        .cmd = hmp_delete_drive_snapshot,
+    },
+
+    {
+        .name       = "savevm-end",
+        .args_type  = "",
+        .params     = "",
+        .help       = "Resume VM after snapshot.",
+        .cmd = hmp_savevm_end,
+    },
diff --git a/include/block/aio.h b/include/block/aio.h
index b2f703fa3f..c37617b404 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -17,6 +17,7 @@
 #ifdef CONFIG_LINUX_IO_URING
 #include <liburing.h>
 #endif
+#include "qemu/coroutine.h"
 #include "qemu/queue.h"
 #include "qemu/event_notifier.h"
 #include "qemu/thread.h"
@@ -654,6 +655,15 @@ static inline bool aio_node_check(AioContext *ctx, bool is_external)
  */
 void aio_co_schedule(AioContext *ctx, struct Coroutine *co);
 
+/**
+ * aio_co_reschedule_self:
+ * @new_ctx: the new context
+ *
+ * Move the currently running coroutine to new_ctx. If the coroutine is already
+ * running in new_ctx, do nothing.
+ */
+void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx);
+
 /**
  * aio_co_wake:
  * @co: the coroutine
diff --git a/include/migration/snapshot.h b/include/migration/snapshot.h
index c85b6ec75b..4411b7121d 100644
--- a/include/migration/snapshot.h
+++ b/include/migration/snapshot.h
@@ -17,5 +17,6 @@
 
 int save_snapshot(const char *name, Error **errp);
 int load_snapshot(const char *name, Error **errp);
+int load_snapshot_from_blockdev(const char *filename, Error **errp);
 
 #endif
diff --git a/include/monitor/hmp.h b/include/monitor/hmp.h
index c986cfd28b..243952d32f 100644
--- a/include/monitor/hmp.h
+++ b/include/monitor/hmp.h
@@ -25,6 +25,7 @@ void hmp_info_status(Monitor *mon, const QDict *qdict);
 void hmp_info_uuid(Monitor *mon, const QDict *qdict);
 void hmp_info_chardev(Monitor *mon, const QDict *qdict);
 void hmp_info_mice(Monitor *mon, const QDict *qdict);
+void hmp_info_savevm(Monitor *mon, const QDict *qdict);
 void hmp_info_migrate(Monitor *mon, const QDict *qdict);
 void hmp_info_migrate_capabilities(Monitor *mon, const QDict *qdict);
 void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict);
@@ -83,6 +84,10 @@ void hmp_netdev_add(Monitor *mon, const QDict *qdict);
 void hmp_netdev_del(Monitor *mon, const QDict *qdict);
 void hmp_getfd(Monitor *mon, const QDict *qdict);
 void hmp_closefd(Monitor *mon, const QDict *qdict);
+void hmp_savevm_start(Monitor *mon, const QDict *qdict);
+void hmp_snapshot_drive(Monitor *mon, const QDict *qdict);
+void hmp_delete_drive_snapshot(Monitor *mon, const QDict *qdict);
+void hmp_savevm_end(Monitor *mon, const QDict *qdict);
 void hmp_sendkey(Monitor *mon, const QDict *qdict);
 void hmp_screendump(Monitor *mon, const QDict *qdict);
 void hmp_chardev_add(Monitor *mon, const QDict *qdict);
diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
index 6e26ea2cd0..280bb447a6 100644
--- a/monitor/hmp-cmds.c
+++ b/monitor/hmp-cmds.c
@@ -1904,6 +1904,63 @@ void hmp_info_memory_devices(Monitor *mon, const QDict *qdict)
     hmp_handle_error(mon, err);
 }
 
+void hmp_savevm_start(Monitor *mon, const QDict *qdict)
+{
+    Error *errp = NULL;
+    const char *statefile = qdict_get_try_str(qdict, "statefile");
+
+    qmp_savevm_start(statefile != NULL, statefile, &errp);
+    hmp_handle_error(mon, errp);
+}
+
+void hmp_snapshot_drive(Monitor *mon, const QDict *qdict)
+{
+    Error *errp = NULL;
+    const char *name = qdict_get_str(qdict, "name");
+    const char *device = qdict_get_str(qdict, "device");
+
+    qmp_snapshot_drive(device, name, &errp);
+    hmp_handle_error(mon, errp);
+}
+
+void hmp_delete_drive_snapshot(Monitor *mon, const QDict *qdict)
+{
+    Error *errp = NULL;
+    const char *name = qdict_get_str(qdict, "name");
+    const char *device = qdict_get_str(qdict, "device");
+
+    qmp_delete_drive_snapshot(device, name, &errp);
+    hmp_handle_error(mon, errp);
+}
+
+void hmp_savevm_end(Monitor *mon, const QDict *qdict)
+{
+    Error *errp = NULL;
+
+    qmp_savevm_end(&errp);
+    hmp_handle_error(mon, errp);
+}
+
+void hmp_info_savevm(Monitor *mon, const QDict *qdict)
+{
+    SaveVMInfo *info;
+    info = qmp_query_savevm(NULL);
+
+    if (info->has_status) {
+        monitor_printf(mon, "savevm status: %s\n", info->status);
+        monitor_printf(mon, "total time: %" PRIu64 " milliseconds\n",
+                       info->total_time);
+    } else {
+        monitor_printf(mon, "savevm status: not running\n");
+    }
+    if (info->has_bytes) {
+        monitor_printf(mon, "Bytes saved: %"PRIu64"\n", info->bytes);
+    }
+    if (info->has_error) {
+        monitor_printf(mon, "Error: %s\n", info->error);
+    }
+}
+
 void hmp_info_iothreads(Monitor *mon, const QDict *qdict)
 {
     IOThreadInfoList *info_list = qmp_query_iothreads(NULL);
diff --git a/qapi/migration.json b/qapi/migration.json
index ea53b23dca..c556257544 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -225,6 +225,40 @@
            '*compression': 'CompressionStats',
            '*socket-address': ['SocketAddress'] } }
 
+##
+# @SaveVMInfo:
+#
+# Information about the current savevm process.
+#
+# @status: string describing the current savevm status.
+#          This can be 'active', 'completed', 'failed'.
+#          If this field is not returned, no savevm process
+#          has been initiated
+#
+# @error: string containing error message if status is failed.
+#
+# @total-time: total amount of milliseconds since savevm started.
+#              If savevm has ended, it returns the total save time
+#
+# @bytes: total amount of data transferred
+#
+# Since: 1.3
+##
+{ 'struct': 'SaveVMInfo',
+  'data': {'*status': 'str', '*error': 'str',
+           '*total-time': 'int', '*bytes': 'int'} }
+
+##
+# @query-savevm:
+#
+# Returns information about the current savevm process.
+#
+# Returns: @SaveVMInfo
+#
+# Since: 1.3
+##
+{ 'command': 'query-savevm', 'returns': 'SaveVMInfo' }
+
 ##
 # @query-migrate:
 #
diff --git a/qapi/misc.json b/qapi/misc.json
index 44b1fb6fa7..9895899f8b 100644
--- a/qapi/misc.json
+++ b/qapi/misc.json
@@ -1168,6 +1168,38 @@
 ##
 { 'command': 'query-fdsets', 'returns': ['FdsetInfo'] }
 
+##
+# @savevm-start:
+#
+# Prepare for snapshot and halt VM. Save VM state to statefile.
+#
+##
+{ 'command': 'savevm-start', 'data': { '*statefile': 'str' } }
+
+##
+# @snapshot-drive:
+#
+# Create an internal drive snapshot.
+#
+##
+{ 'command': 'snapshot-drive', 'data': { 'device': 'str', 'name': 'str' } }
+
+##
+# @delete-drive-snapshot:
+#
+# Delete a drive snapshot.
+#
+##
+{ 'command': 'delete-drive-snapshot', 'data': { 'device': 'str', 'name': 'str' } }
+
+##
+# @savevm-end:
+#
+# Resume VM after a snapshot.
+#
+##
+{ 'command': 'savevm-end' }
+
 ##
 # @AcpiTableOptions:
 #
diff --git a/qemu-options.hx b/qemu-options.hx
index 708583b4ce..d32995cc50 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -3866,6 +3866,18 @@ SRST
     Start right away with a saved state (``loadvm`` in monitor)
 ERST
 
+DEF("loadstate", HAS_ARG, QEMU_OPTION_loadstate, \
+    "-loadstate file\n" \
+    "                start right away with a saved state\n",
+    QEMU_ARCH_ALL)
+SRST
+``-loadstate file``
+    Start right away with a saved state. This option does not roll back
+    disk state like @code{loadvm}, so the user must make sure that the
+    disks have the correct state. @var{file} can be any valid device URL.
+    See the section for "Device URL Syntax" for more information.
+ERST
+
 #ifndef _WIN32
 DEF("daemonize", 0, QEMU_OPTION_daemonize, \
     "-daemonize      daemonize QEMU after initializing\n", QEMU_ARCH_ALL)
diff --git a/savevm-async.c b/savevm-async.c
new file mode 100644
index 0000000000..f918e18dce
--- /dev/null
+++ b/savevm-async.c
@@ -0,0 +1,542 @@
+#include "qemu/osdep.h"
+#include "migration/migration.h"
+#include "migration/savevm.h"
+#include "migration/snapshot.h"
+#include "migration/global_state.h"
+#include "migration/ram.h"
+#include "migration/qemu-file.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/runstate.h"
+#include "block/block.h"
+#include "sysemu/block-backend.h"
+#include "qapi/error.h"
+#include "qapi/qmp/qerror.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qapi-commands-migration.h"
+#include "qapi/qapi-commands-misc.h"
+#include "qapi/qapi-commands-block.h"
+#include "qemu/cutils.h"
+#include "qemu/main-loop.h"
+#include "qemu/rcu.h"
+
+/* #define DEBUG_SAVEVM_STATE */
+
+/* used while emulated sync operation in progress */
+#define NOT_DONE -EINPROGRESS
+
+#ifdef DEBUG_SAVEVM_STATE
+#define DPRINTF(fmt, ...) \
+    do { printf("savevm-async: " fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) \
+    do { } while (0)
+#endif
+
+enum {
+    SAVE_STATE_DONE,
+    SAVE_STATE_ERROR,
+    SAVE_STATE_ACTIVE,
+    SAVE_STATE_COMPLETED,
+    SAVE_STATE_CANCELLED
+};
+
+
+static struct SnapshotState {
+    BlockBackend *target;
+    size_t bs_pos;
+    int state;
+    Error *error;
+    Error *blocker;
+    int saved_vm_running;
+    QEMUFile *file;
+    int64_t total_time;
+    QEMUBH *finalize_bh;
+    Coroutine *co;
+} snap_state;
+
+SaveVMInfo *qmp_query_savevm(Error **errp)
+{
+    SaveVMInfo *info = g_malloc0(sizeof(*info));
+    struct SnapshotState *s = &snap_state;
+
+    if (s->state != SAVE_STATE_DONE) {
+        info->has_bytes = true;
+        info->bytes = s->bs_pos;
+        switch (s->state) {
+        case SAVE_STATE_ERROR:
+            info->has_status = true;
+            info->status = g_strdup("failed");
+            info->has_total_time = true;
+            info->total_time = s->total_time;
+            if (s->error) {
+                info->has_error = true;
+                info->error = g_strdup(error_get_pretty(s->error));
+            }
+            break;
+        case SAVE_STATE_ACTIVE:
+            info->has_status = true;
+            info->status = g_strdup("active");
+            info->has_total_time = true;
+            info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME)
+                - s->total_time;
+            break;
+        case SAVE_STATE_COMPLETED:
+            info->has_status = true;
+            info->status = g_strdup("completed");
+            info->has_total_time = true;
+            info->total_time = s->total_time;
+            break;
+        }
+    }
+
+    return info;
+}
+
+static int save_snapshot_cleanup(void)
+{
+    int ret = 0;
+
+    DPRINTF("save_snapshot_cleanup\n");
+
+    snap_state.total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
+        snap_state.total_time;
+
+    if (snap_state.file) {
+        ret = qemu_fclose(snap_state.file);
+    }
+
+    if (snap_state.target) {
+        /* try to truncate, but ignore errors (will fail on block devices).
+         * note1: bdrv_read() needs whole blocks, so we need to round up
+         * note2: PVE requires 1024 (BDRV_SECTOR_SIZE*2) alignment
+         */
+        size_t size = QEMU_ALIGN_UP(snap_state.bs_pos, BDRV_SECTOR_SIZE*2);
+        blk_truncate(snap_state.target, size, false, PREALLOC_MODE_OFF, 0, NULL);
+        blk_op_unblock_all(snap_state.target, snap_state.blocker);
+        error_free(snap_state.blocker);
+        snap_state.blocker = NULL;
+        blk_unref(snap_state.target);
+        snap_state.target = NULL;
+    }
+
+    return ret;
+}
+
+static void save_snapshot_error(const char *fmt, ...)
+{
+    va_list ap;
+    char *msg;
+
+    va_start(ap, fmt);
+    msg = g_strdup_vprintf(fmt, ap);
+    va_end(ap);
+
+    DPRINTF("save_snapshot_error: %s\n", msg);
+
+    if (!snap_state.error) {
+        error_set(&snap_state.error, ERROR_CLASS_GENERIC_ERROR, "%s", msg);
+    }
+
+    g_free (msg);
+
+    snap_state.state = SAVE_STATE_ERROR;
+}
+
+static int block_state_close(void *opaque, Error **errp)
+{
+    snap_state.file = NULL;
+    return blk_flush(snap_state.target);
+}
+
+typedef struct BlkRwCo {
+    int64_t offset;
+    QEMUIOVector *qiov;
+    ssize_t ret;
+} BlkRwCo;
+
+static void coroutine_fn block_state_write_entry(void *opaque) {
+    BlkRwCo *rwco = opaque;
+    rwco->ret = blk_co_pwritev(snap_state.target, rwco->offset, rwco->qiov->size,
+                               rwco->qiov, 0);
+    aio_wait_kick();
+}
+
+static ssize_t block_state_writev_buffer(void *opaque, struct iovec *iov,
+                                         int iovcnt, int64_t pos, Error **errp)
+{
+    QEMUIOVector qiov;
+    BlkRwCo rwco;
+
+    assert(pos == snap_state.bs_pos);
+    rwco = (BlkRwCo) {
+        .offset = pos,
+        .qiov = &qiov,
+        .ret = NOT_DONE,
+    };
+
+    qemu_iovec_init_external(&qiov, iov, iovcnt);
+
+    if (qemu_in_coroutine()) {
+        block_state_write_entry(&rwco);
+    } else {
+        Coroutine *co = qemu_coroutine_create(&block_state_write_entry, &rwco);
+        bdrv_coroutine_enter(blk_bs(snap_state.target), co);
+        BDRV_POLL_WHILE(blk_bs(snap_state.target), rwco.ret == NOT_DONE);
+    }
+    if (rwco.ret < 0) {
+        return rwco.ret;
+    }
+
+    snap_state.bs_pos += qiov.size;
+    return qiov.size;
+}
+
+static const QEMUFileOps block_file_ops = {
+    .writev_buffer = block_state_writev_buffer,
+    .close = block_state_close,
+};
+
+static void process_savevm_finalize(void *opaque)
+{
+    int ret;
+    AioContext *iohandler_ctx = iohandler_get_aio_context();
+    MigrationState *ms = migrate_get_current();
+
+#ifdef DEBUG_SAVEVM_STATE
+    int64_t start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+#endif
+
+    qemu_bh_delete(snap_state.finalize_bh);
+    snap_state.finalize_bh = NULL;
+    snap_state.co = NULL;
+
+    /* We need to own the target bdrv's context for the following functions,
+     * so move it back. It can stay in the main context and live out its life
+     * there, since we're done with it after this method ends anyway.
+     */
+    aio_context_acquire(iohandler_ctx);
+    blk_set_aio_context(snap_state.target, qemu_get_aio_context(), NULL);
+    aio_context_release(iohandler_ctx);
+
+    ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+    if (ret < 0) {
+        save_snapshot_error("vm_stop_force_state error %d", ret);
+    }
+
+    (void)qemu_savevm_state_complete_precopy(snap_state.file, false, false);
+    ret = qemu_file_get_error(snap_state.file);
+    if (ret < 0) {
+        save_snapshot_error("qemu_savevm_state_iterate error %d", ret);
+    }
+
+    DPRINTF("state saving complete\n");
+    DPRINTF("timing: process_savevm_finalize (state saving) took %ld ms\n",
+            qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - start_time);
+
+    /* clear migration state */
+    migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP,
+                      ret ? MIGRATION_STATUS_FAILED : MIGRATION_STATUS_COMPLETED);
+    ms->to_dst_file = NULL;
+
+    qemu_savevm_state_cleanup();
+
+    ret = save_snapshot_cleanup();
+    if (ret < 0) {
+        save_snapshot_error("save_snapshot_cleanup error %d", ret);
+    } else if (snap_state.state == SAVE_STATE_ACTIVE) {
+        snap_state.state = SAVE_STATE_COMPLETED;
+    } else {
+        save_snapshot_error("process_savevm_cleanup: invalid state: %d",
+                            snap_state.state);
+    }
+    if (snap_state.saved_vm_running) {
+        vm_start();
+        snap_state.saved_vm_running = false;
+    }
+
+    DPRINTF("timing: process_savevm_finalize (full) took %ld ms\n",
+            qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - start_time);
+}
+
+static void coroutine_fn process_savevm_co(void *opaque)
+{
+    int ret;
+    int64_t maxlen;
+    BdrvNextIterator it;
+    BlockDriverState *bs = NULL;
+
+#ifdef DEBUG_SAVEVM_STATE
+    int64_t start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+#endif
+
+    ret = qemu_file_get_error(snap_state.file);
+    if (ret < 0) {
+        save_snapshot_error("qemu_savevm_state_setup failed");
+        return;
+    }
+
+    while (snap_state.state == SAVE_STATE_ACTIVE) {
+        uint64_t pending_size, pend_precopy, pend_compatible, pend_postcopy;
+
+        qemu_savevm_state_pending(snap_state.file, 0, &pend_precopy, &pend_compatible, &pend_postcopy);
+        pending_size = pend_precopy + pend_compatible + pend_postcopy;
+
+        maxlen = blk_getlength(snap_state.target) - 30*1024*1024;
+
+        if (pending_size > 400000 && snap_state.bs_pos + pending_size < maxlen) {
+            ret = qemu_savevm_state_iterate(snap_state.file, false);
+            if (ret < 0) {
+                save_snapshot_error("qemu_savevm_state_iterate error %d", ret);
+                break;
+            }
+            DPRINTF("savevm iterate pending size %lu ret %d\n", pending_size, ret);
+        } else {
+            qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
+            ret = global_state_store();
+            if (ret) {
+                save_snapshot_error("global_state_store error %d", ret);
+                break;
+            }
+
+            DPRINTF("savevm iterate complete\n");
+            break;
+        }
+    }
+
+    DPRINTF("timing: process_savevm_co took %ld ms\n",
+            qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - start_time);
+
+#ifdef DEBUG_SAVEVM_STATE
+    int64_t start_time_flush = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+#endif
+    /* If a drive runs in an IOThread we can flush it async, and only
+     * need to sync-flush whatever IO happens between now and
+     * vm_stop_force_state. bdrv_next can only be called from main AioContext,
+     * so move there now and after every flush.
+     */
+    aio_co_reschedule_self(qemu_get_aio_context());
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+        /* target has BDRV_O_NO_FLUSH, no sense calling bdrv_flush on it */
+        if (bs == blk_bs(snap_state.target)) {
+            continue;
+        }
+
+        AioContext *bs_ctx = bdrv_get_aio_context(bs);
+        if (bs_ctx != qemu_get_aio_context()) {
+            DPRINTF("savevm: async flushing drive %s\n", bs->filename);
+            aio_co_reschedule_self(bs_ctx);
+            bdrv_flush(bs);
+            aio_co_reschedule_self(qemu_get_aio_context());
+        }
+    }
+
+    DPRINTF("timing: async flushing took %ld ms\n",
+            qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - start_time_flush);
+
+    qemu_bh_schedule(snap_state.finalize_bh);
+}
+
+void qmp_savevm_start(bool has_statefile, const char *statefile, Error **errp)
+{
+    Error *local_err = NULL;
+    MigrationState *ms = migrate_get_current();
+    AioContext *iohandler_ctx = iohandler_get_aio_context();
+
+    int bdrv_oflags = BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_NO_FLUSH;
+
+    if (snap_state.state != SAVE_STATE_DONE) {
+        error_set(errp, ERROR_CLASS_GENERIC_ERROR,
+                  "VM snapshot already started\n");
+        return;
+    }
+
+    if (migration_is_running(ms->state)) {
+        error_set(errp, ERROR_CLASS_GENERIC_ERROR, QERR_MIGRATION_ACTIVE);
+        return;
+    }
+
+    if (migrate_use_block()) {
+        error_set(errp, ERROR_CLASS_GENERIC_ERROR,
+                  "Block migration and snapshots are incompatible");
+        return;
+    }
+
+    /* initialize snapshot info */
+    snap_state.saved_vm_running = runstate_is_running();
+    snap_state.bs_pos = 0;
+    snap_state.total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+    snap_state.blocker = NULL;
+
+    if (snap_state.error) {
+        error_free(snap_state.error);
+        snap_state.error = NULL;
+    }
+
+    if (!has_statefile) {
+        vm_stop(RUN_STATE_SAVE_VM);
+        snap_state.state = SAVE_STATE_COMPLETED;
+        return;
+    }
+
+    if (qemu_savevm_state_blocked(errp)) {
+        return;
+    }
+
+    /* Open the image */
+    QDict *options = NULL;
+    options = qdict_new();
+    qdict_put_str(options, "driver", "raw");
+    snap_state.target = blk_new_open(statefile, NULL, options, bdrv_oflags, &local_err);
+    if (!snap_state.target) {
+        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "failed to open '%s'", statefile);
+        goto restart;
+    }
+
+    snap_state.file = qemu_fopen_ops(&snap_state, &block_file_ops);
+
+    if (!snap_state.file) {
+        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "failed to open '%s'", statefile);
+        goto restart;
+    }
+
+    /*
+     * qemu_savevm_* paths use migration code and expect a migration state.
+     * State is cleared in process_savevm_co, but has to be initialized
+     * here (blocking main thread, from QMP) to avoid race conditions.
+     */
+    migrate_init(ms);
+    memset(&ram_counters, 0, sizeof(ram_counters));
+    ms->to_dst_file = snap_state.file;
+
+    error_setg(&snap_state.blocker, "block device is in use by savevm");
+    blk_op_block_all(snap_state.target, snap_state.blocker);
+
+    snap_state.state = SAVE_STATE_ACTIVE;
+    snap_state.finalize_bh = qemu_bh_new(process_savevm_finalize, &snap_state);
+    snap_state.co = qemu_coroutine_create(&process_savevm_co, NULL);
+    qemu_mutex_unlock_iothread();
+    qemu_savevm_state_header(snap_state.file);
+    qemu_savevm_state_setup(snap_state.file);
+    qemu_mutex_lock_iothread();
+
+    /* Async processing from here on out happens in iohandler context, so let
+     * the target bdrv have its home there.
+     */
+    blk_set_aio_context(snap_state.target, iohandler_ctx, &local_err);
+
+    aio_co_schedule(iohandler_ctx, snap_state.co);
+
+    return;
+
+restart:
+
+    save_snapshot_error("setup failed");
+
+    if (snap_state.saved_vm_running) {
+        vm_start();
+    }
+}
+
+void qmp_savevm_end(Error **errp)
+{
+    if (snap_state.state == SAVE_STATE_DONE) {
+        error_set(errp, ERROR_CLASS_GENERIC_ERROR,
+                  "VM snapshot not started\n");
+        return;
+    }
+
+    if (snap_state.state == SAVE_STATE_ACTIVE) {
+        snap_state.state = SAVE_STATE_CANCELLED;
+        return;
+    }
+
+    if (snap_state.saved_vm_running) {
+        vm_start();
+    }
+
+    snap_state.state = SAVE_STATE_DONE;
+}
+
+// FIXME: Deprecated
+void qmp_snapshot_drive(const char *device, const char *name, Error **errp)
+{
+    // Compatibility with older qemu-server.
+    qmp_blockdev_snapshot_internal_sync(device, name, errp);
+}
+
+// FIXME: Deprecated
+void qmp_delete_drive_snapshot(const char *device, const char *name,
+                               Error **errp)
+{
+    // Compatibility with older qemu-server.
+    (void)qmp_blockdev_snapshot_delete_internal_sync(device, false, NULL,
+                                                     true, name, errp);
+}
+
+static ssize_t loadstate_get_buffer(void *opaque, uint8_t *buf, int64_t pos,
+                                    size_t size, Error **errp)
+{
+    BlockBackend *be = opaque;
+    int64_t maxlen = blk_getlength(be);
+    if (pos > maxlen) {
+        return -EIO;
+    }
+    if ((pos + size) > maxlen) {
+        size = maxlen - pos - 1;
+    }
+    if (size == 0) {
+        return 0;
+    }
+    return blk_pread(be, pos, buf, size);
+}
+
+static const QEMUFileOps loadstate_file_ops = {
+    .get_buffer = loadstate_get_buffer,
+};
+
+int load_snapshot_from_blockdev(const char *filename, Error **errp)
+{
+    BlockBackend *be;
+    Error *local_err = NULL;
+    Error *blocker = NULL;
+
+    QEMUFile *f;
+    int ret = -EINVAL;
+
+    be = blk_new_open(filename, NULL, NULL, 0, &local_err);
+
+    if (!be) {
+        error_setg(errp, "Could not open VM state file");
+        goto the_end;
+    }
+
+    error_setg(&blocker, "block device is in use by load state");
+    blk_op_block_all(be, blocker);
+
+    /* restore the VM state */
+    f = qemu_fopen_ops(be, &loadstate_file_ops);
+    if (!f) {
+        error_setg(errp, "Could not open VM state file");
+        goto the_end;
+    }
+
+    qemu_system_reset(SHUTDOWN_CAUSE_NONE);
+    ret = qemu_loadvm_state(f);
+
+    qemu_fclose(f);
+    migration_incoming_state_destroy();
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Error while loading VM state");
+        goto the_end;
+    }
+
+    ret = 0;
+
+ the_end:
+    if (be) {
+        blk_op_unblock_all(be, blocker);
+        error_free(blocker);
+        blk_unref(be);
+    }
+    return ret;
+}
diff --git a/softmmu/vl.c b/softmmu/vl.c
index 4eb9d1f7fd..670b7e427c 100644
--- a/softmmu/vl.c
+++ b/softmmu/vl.c
@@ -2844,6 +2844,7 @@ void qemu_init(int argc, char **argv, char **envp)
     int optind;
     const char *optarg;
     const char *loadvm = NULL;
+    const char *loadstate = NULL;
     MachineClass *machine_class;
     const char *cpu_option;
     const char *vga_model = NULL;
@@ -3408,6 +3409,9 @@ void qemu_init(int argc, char **argv, char **envp)
             case QEMU_OPTION_loadvm:
                 loadvm = optarg;
                 break;
+            case QEMU_OPTION_loadstate:
+                loadstate = optarg;
+                break;
             case QEMU_OPTION_full_screen:
                 dpy.has_full_screen = true;
                 dpy.full_screen = true;
@@ -4464,6 +4468,12 @@ void qemu_init(int argc, char **argv, char **envp)
             autostart = 0;
             exit(1);
         }
+    } else if (loadstate) {
+        Error *local_err = NULL;
+        if (load_snapshot_from_blockdev(loadstate, &local_err) < 0) {
+            error_report_err(local_err);
+            autostart = 0;
+        }
     }
     if (replay_mode != REPLAY_MODE_NONE) {
         replay_vmstate_init();
diff --git a/util/async.c b/util/async.c
index 1319eee3bc..b68e73f488 100644
--- a/util/async.c
+++ b/util/async.c
@@ -559,6 +559,36 @@ void aio_co_schedule(AioContext *ctx, Coroutine *co)
     aio_context_unref(ctx);
 }
 
+typedef struct AioCoRescheduleSelf {
+    Coroutine *co;
+    AioContext *new_ctx;
+} AioCoRescheduleSelf;
+
+static void aio_co_reschedule_self_bh(void *opaque)
+{
+    AioCoRescheduleSelf *data = opaque;
+    aio_co_schedule(data->new_ctx, data->co);
+}
+
+void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx)
+{
+    AioContext *old_ctx = qemu_get_current_aio_context();
+
+    if (old_ctx != new_ctx) {
+        AioCoRescheduleSelf data = {
+            .co = qemu_coroutine_self(),
+            .new_ctx = new_ctx,
+        };
+        /*
+         * We can't directly schedule the coroutine in the target context
+         * because this would be racy: The other thread could try to enter the
+         * coroutine before it has yielded in this one.
+         */
+        aio_bh_schedule_oneshot(old_ctx, aio_co_reschedule_self_bh, &data);
+        qemu_coroutine_yield();
+    }
+}
+
 void aio_co_wake(struct Coroutine *co)
 {
     AioContext *ctx;