/*
 * Migration support for VFIO devices
 *
 * Copyright NVIDIA, Inc. 2020
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */
10 #include "qemu/osdep.h"
11 #include "qemu/main-loop.h"
12 #include "qemu/cutils.h"
13 #include "qemu/units.h"
14 #include <linux/vfio.h>
15 #include <sys/ioctl.h>
17 #include "sysemu/runstate.h"
18 #include "hw/vfio/vfio-common.h"
19 #include "migration/migration.h"
20 #include "migration/vmstate.h"
21 #include "migration/qemu-file.h"
22 #include "migration/register.h"
23 #include "migration/blocker.h"
24 #include "migration/misc.h"
25 #include "qapi/error.h"
26 #include "exec/ramlist.h"
27 #include "exec/ram_addr.h"
/*
 * Flags to be used as unique delimiters for VFIO devices in the migration
 * stream. These flags are composed as:
 * 0xffffffff => MSB 32-bit all 1s
 * 0xef10     => Magic ID, represents emulated (virtual) function IO
 * 0x0000     => 16-bits reserved for flags
 *
 * The beginning of state information is marked by _DEV_CONFIG_STATE,
 * _DEV_SETUP_STATE, or _DEV_DATA_STATE, respectively. The end of a
 * certain state information is marked by _END_OF_STATE.
 */
#define VFIO_MIG_FLAG_END_OF_STATE      (0xffffffffef100001ULL)
#define VFIO_MIG_FLAG_DEV_CONFIG_STATE  (0xffffffffef100002ULL)
#define VFIO_MIG_FLAG_DEV_SETUP_STATE   (0xffffffffef100003ULL)
#define VFIO_MIG_FLAG_DEV_DATA_STATE    (0xffffffffef100004ULL)

/*
 * This is an arbitrary size based on migration of mlx5 devices, where typically
 * total device migration size is on the order of 100s of MB. Testing with
 * larger values, e.g. 128MB and 1GB, did not show a performance improvement.
 */
#define VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE (1 * MiB)

/*
 * Running count of device data bytes written to the migration stream by
 * vfio_save_block(); reset to 0 when migration is cancelled or fails and
 * exposed through vfio_mig_bytes_transferred().
 */
static int64_t bytes_transferred;
57 static const char *mig_state_to_str(enum vfio_device_mig_state state
)
60 case VFIO_DEVICE_STATE_ERROR
:
62 case VFIO_DEVICE_STATE_STOP
:
64 case VFIO_DEVICE_STATE_RUNNING
:
66 case VFIO_DEVICE_STATE_STOP_COPY
:
68 case VFIO_DEVICE_STATE_RESUMING
:
71 return "UNKNOWN STATE";
75 static int vfio_migration_set_state(VFIODevice
*vbasedev
,
76 enum vfio_device_mig_state new_state
,
77 enum vfio_device_mig_state recover_state
)
79 VFIOMigration
*migration
= vbasedev
->migration
;
80 uint64_t buf
[DIV_ROUND_UP(sizeof(struct vfio_device_feature
) +
81 sizeof(struct vfio_device_feature_mig_state
),
82 sizeof(uint64_t))] = {};
83 struct vfio_device_feature
*feature
= (struct vfio_device_feature
*)buf
;
84 struct vfio_device_feature_mig_state
*mig_state
=
85 (struct vfio_device_feature_mig_state
*)feature
->data
;
88 feature
->argsz
= sizeof(buf
);
90 VFIO_DEVICE_FEATURE_SET
| VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE
;
91 mig_state
->device_state
= new_state
;
92 if (ioctl(vbasedev
->fd
, VFIO_DEVICE_FEATURE
, feature
)) {
93 /* Try to set the device in some good state */
96 if (recover_state
== VFIO_DEVICE_STATE_ERROR
) {
97 error_report("%s: Failed setting device state to %s, err: %s. "
98 "Recover state is ERROR. Resetting device",
99 vbasedev
->name
, mig_state_to_str(new_state
),
106 "%s: Failed setting device state to %s, err: %s. Setting device in recover state %s",
107 vbasedev
->name
, mig_state_to_str(new_state
),
108 strerror(errno
), mig_state_to_str(recover_state
));
110 mig_state
->device_state
= recover_state
;
111 if (ioctl(vbasedev
->fd
, VFIO_DEVICE_FEATURE
, feature
)) {
114 "%s: Failed setting device in recover state, err: %s. Resetting device",
115 vbasedev
->name
, strerror(errno
));
120 migration
->device_state
= recover_state
;
125 migration
->device_state
= new_state
;
126 if (mig_state
->data_fd
!= -1) {
127 if (migration
->data_fd
!= -1) {
129 * This can happen if the device is asynchronously reset and
130 * terminates a data transfer.
132 error_report("%s: data_fd out of sync", vbasedev
->name
);
133 close(mig_state
->data_fd
);
138 migration
->data_fd
= mig_state
->data_fd
;
141 trace_vfio_migration_set_state(vbasedev
->name
, mig_state_to_str(new_state
));
146 if (ioctl(vbasedev
->fd
, VFIO_DEVICE_RESET
)) {
147 hw_error("%s: Failed resetting device, err: %s", vbasedev
->name
,
151 migration
->device_state
= VFIO_DEVICE_STATE_RUNNING
;
156 static int vfio_load_buffer(QEMUFile
*f
, VFIODevice
*vbasedev
,
159 VFIOMigration
*migration
= vbasedev
->migration
;
162 ret
= qemu_file_get_to_fd(f
, migration
->data_fd
, data_size
);
163 trace_vfio_load_state_device_data(vbasedev
->name
, data_size
, ret
);
168 static int vfio_save_device_config_state(QEMUFile
*f
, void *opaque
)
170 VFIODevice
*vbasedev
= opaque
;
172 qemu_put_be64(f
, VFIO_MIG_FLAG_DEV_CONFIG_STATE
);
174 if (vbasedev
->ops
&& vbasedev
->ops
->vfio_save_config
) {
175 vbasedev
->ops
->vfio_save_config(vbasedev
, f
);
178 qemu_put_be64(f
, VFIO_MIG_FLAG_END_OF_STATE
);
180 trace_vfio_save_device_config_state(vbasedev
->name
);
182 return qemu_file_get_error(f
);
185 static int vfio_load_device_config_state(QEMUFile
*f
, void *opaque
)
187 VFIODevice
*vbasedev
= opaque
;
190 if (vbasedev
->ops
&& vbasedev
->ops
->vfio_load_config
) {
193 ret
= vbasedev
->ops
->vfio_load_config(vbasedev
, f
);
195 error_report("%s: Failed to load device config space",
201 data
= qemu_get_be64(f
);
202 if (data
!= VFIO_MIG_FLAG_END_OF_STATE
) {
203 error_report("%s: Failed loading device config space, "
204 "end flag incorrect 0x%"PRIx64
, vbasedev
->name
, data
);
208 trace_vfio_load_device_config_state(vbasedev
->name
);
209 return qemu_file_get_error(f
);
212 static void vfio_migration_cleanup(VFIODevice
*vbasedev
)
214 VFIOMigration
*migration
= vbasedev
->migration
;
216 close(migration
->data_fd
);
217 migration
->data_fd
= -1;
220 static int vfio_query_stop_copy_size(VFIODevice
*vbasedev
,
221 uint64_t *stop_copy_size
)
223 uint64_t buf
[DIV_ROUND_UP(sizeof(struct vfio_device_feature
) +
224 sizeof(struct vfio_device_feature_mig_data_size
),
225 sizeof(uint64_t))] = {};
226 struct vfio_device_feature
*feature
= (struct vfio_device_feature
*)buf
;
227 struct vfio_device_feature_mig_data_size
*mig_data_size
=
228 (struct vfio_device_feature_mig_data_size
*)feature
->data
;
230 feature
->argsz
= sizeof(buf
);
232 VFIO_DEVICE_FEATURE_GET
| VFIO_DEVICE_FEATURE_MIG_DATA_SIZE
;
234 if (ioctl(vbasedev
->fd
, VFIO_DEVICE_FEATURE
, feature
)) {
238 *stop_copy_size
= mig_data_size
->stop_copy_length
;
243 /* Returns 1 if end-of-stream is reached, 0 if more data and -errno if error */
244 static int vfio_save_block(QEMUFile
*f
, VFIOMigration
*migration
)
248 data_size
= read(migration
->data_fd
, migration
->data_buffer
,
249 migration
->data_buffer_size
);
253 if (data_size
== 0) {
257 qemu_put_be64(f
, VFIO_MIG_FLAG_DEV_DATA_STATE
);
258 qemu_put_be64(f
, data_size
);
259 qemu_put_buffer(f
, migration
->data_buffer
, data_size
);
260 bytes_transferred
+= data_size
;
262 trace_vfio_save_block(migration
->vbasedev
->name
, data_size
);
264 return qemu_file_get_error(f
);
267 /* ---------------------------------------------------------------------- */
269 static int vfio_save_setup(QEMUFile
*f
, void *opaque
)
271 VFIODevice
*vbasedev
= opaque
;
272 VFIOMigration
*migration
= vbasedev
->migration
;
273 uint64_t stop_copy_size
= VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE
;
275 qemu_put_be64(f
, VFIO_MIG_FLAG_DEV_SETUP_STATE
);
277 vfio_query_stop_copy_size(vbasedev
, &stop_copy_size
);
278 migration
->data_buffer_size
= MIN(VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE
,
280 migration
->data_buffer
= g_try_malloc0(migration
->data_buffer_size
);
281 if (!migration
->data_buffer
) {
282 error_report("%s: Failed to allocate migration data buffer",
287 trace_vfio_save_setup(vbasedev
->name
, migration
->data_buffer_size
);
289 qemu_put_be64(f
, VFIO_MIG_FLAG_END_OF_STATE
);
291 return qemu_file_get_error(f
);
294 static void vfio_save_cleanup(void *opaque
)
296 VFIODevice
*vbasedev
= opaque
;
297 VFIOMigration
*migration
= vbasedev
->migration
;
299 g_free(migration
->data_buffer
);
300 migration
->data_buffer
= NULL
;
301 vfio_migration_cleanup(vbasedev
);
302 trace_vfio_save_cleanup(vbasedev
->name
);
306 * Migration size of VFIO devices can be as little as a few KBs or as big as
307 * many GBs. This value should be big enough to cover the worst case.
309 #define VFIO_MIG_STOP_COPY_SIZE (100 * GiB)
312 * Only exact function is implemented and not estimate function. The reason is
313 * that during pre-copy phase of migration the estimate function is called
314 * repeatedly while pending RAM size is over the threshold, thus migration
315 * can't converge and querying the VFIO device pending data size is useless.
317 static void vfio_state_pending_exact(void *opaque
, uint64_t *must_precopy
,
318 uint64_t *can_postcopy
)
320 VFIODevice
*vbasedev
= opaque
;
321 uint64_t stop_copy_size
= VFIO_MIG_STOP_COPY_SIZE
;
324 * If getting pending migration size fails, VFIO_MIG_STOP_COPY_SIZE is
325 * reported so downtime limit won't be violated.
327 vfio_query_stop_copy_size(vbasedev
, &stop_copy_size
);
328 *must_precopy
+= stop_copy_size
;
330 trace_vfio_state_pending_exact(vbasedev
->name
, *must_precopy
, *can_postcopy
,
334 static int vfio_save_complete_precopy(QEMUFile
*f
, void *opaque
)
336 VFIODevice
*vbasedev
= opaque
;
339 /* We reach here with device state STOP only */
340 ret
= vfio_migration_set_state(vbasedev
, VFIO_DEVICE_STATE_STOP_COPY
,
341 VFIO_DEVICE_STATE_STOP
);
347 ret
= vfio_save_block(f
, vbasedev
->migration
);
353 qemu_put_be64(f
, VFIO_MIG_FLAG_END_OF_STATE
);
354 ret
= qemu_file_get_error(f
);
360 * If setting the device in STOP state fails, the device should be reset.
361 * To do so, use ERROR state as a recover state.
363 ret
= vfio_migration_set_state(vbasedev
, VFIO_DEVICE_STATE_STOP
,
364 VFIO_DEVICE_STATE_ERROR
);
365 trace_vfio_save_complete_precopy(vbasedev
->name
, ret
);
370 static void vfio_save_state(QEMUFile
*f
, void *opaque
)
372 VFIODevice
*vbasedev
= opaque
;
375 ret
= vfio_save_device_config_state(f
, opaque
);
377 error_report("%s: Failed to save device config space",
379 qemu_file_set_error(f
, ret
);
383 static int vfio_load_setup(QEMUFile
*f
, void *opaque
)
385 VFIODevice
*vbasedev
= opaque
;
387 return vfio_migration_set_state(vbasedev
, VFIO_DEVICE_STATE_RESUMING
,
388 vbasedev
->migration
->device_state
);
391 static int vfio_load_cleanup(void *opaque
)
393 VFIODevice
*vbasedev
= opaque
;
395 vfio_migration_cleanup(vbasedev
);
396 trace_vfio_load_cleanup(vbasedev
->name
);
401 static int vfio_load_state(QEMUFile
*f
, void *opaque
, int version_id
)
403 VFIODevice
*vbasedev
= opaque
;
407 data
= qemu_get_be64(f
);
408 while (data
!= VFIO_MIG_FLAG_END_OF_STATE
) {
410 trace_vfio_load_state(vbasedev
->name
, data
);
413 case VFIO_MIG_FLAG_DEV_CONFIG_STATE
:
415 return vfio_load_device_config_state(f
, opaque
);
417 case VFIO_MIG_FLAG_DEV_SETUP_STATE
:
419 data
= qemu_get_be64(f
);
420 if (data
== VFIO_MIG_FLAG_END_OF_STATE
) {
423 error_report("%s: SETUP STATE: EOS not found 0x%"PRIx64
,
424 vbasedev
->name
, data
);
429 case VFIO_MIG_FLAG_DEV_DATA_STATE
:
431 uint64_t data_size
= qemu_get_be64(f
);
434 ret
= vfio_load_buffer(f
, vbasedev
, data_size
);
442 error_report("%s: Unknown tag 0x%"PRIx64
, vbasedev
->name
, data
);
446 data
= qemu_get_be64(f
);
447 ret
= qemu_file_get_error(f
);
455 static const SaveVMHandlers savevm_vfio_handlers
= {
456 .save_setup
= vfio_save_setup
,
457 .save_cleanup
= vfio_save_cleanup
,
458 .state_pending_exact
= vfio_state_pending_exact
,
459 .save_live_complete_precopy
= vfio_save_complete_precopy
,
460 .save_state
= vfio_save_state
,
461 .load_setup
= vfio_load_setup
,
462 .load_cleanup
= vfio_load_cleanup
,
463 .load_state
= vfio_load_state
,
466 /* ---------------------------------------------------------------------- */
468 static void vfio_vmstate_change(void *opaque
, bool running
, RunState state
)
470 VFIODevice
*vbasedev
= opaque
;
471 enum vfio_device_mig_state new_state
;
475 new_state
= VFIO_DEVICE_STATE_RUNNING
;
477 new_state
= VFIO_DEVICE_STATE_STOP
;
481 * If setting the device in new_state fails, the device should be reset.
482 * To do so, use ERROR state as a recover state.
484 ret
= vfio_migration_set_state(vbasedev
, new_state
,
485 VFIO_DEVICE_STATE_ERROR
);
488 * Migration should be aborted in this case, but vm_state_notify()
489 * currently does not support reporting failures.
491 if (migrate_get_current()->to_dst_file
) {
492 qemu_file_set_error(migrate_get_current()->to_dst_file
, ret
);
496 trace_vfio_vmstate_change(vbasedev
->name
, running
, RunState_str(state
),
497 mig_state_to_str(new_state
));
500 static void vfio_migration_state_notifier(Notifier
*notifier
, void *data
)
502 MigrationState
*s
= data
;
503 VFIOMigration
*migration
= container_of(notifier
, VFIOMigration
,
505 VFIODevice
*vbasedev
= migration
->vbasedev
;
507 trace_vfio_migration_state_notifier(vbasedev
->name
,
508 MigrationStatus_str(s
->state
));
511 case MIGRATION_STATUS_CANCELLING
:
512 case MIGRATION_STATUS_CANCELLED
:
513 case MIGRATION_STATUS_FAILED
:
514 bytes_transferred
= 0;
516 * If setting the device in RUNNING state fails, the device should
517 * be reset. To do so, use ERROR state as a recover state.
519 vfio_migration_set_state(vbasedev
, VFIO_DEVICE_STATE_RUNNING
,
520 VFIO_DEVICE_STATE_ERROR
);
524 static void vfio_migration_exit(VFIODevice
*vbasedev
)
526 g_free(vbasedev
->migration
);
527 vbasedev
->migration
= NULL
;
530 static int vfio_migration_query_flags(VFIODevice
*vbasedev
, uint64_t *mig_flags
)
532 uint64_t buf
[DIV_ROUND_UP(sizeof(struct vfio_device_feature
) +
533 sizeof(struct vfio_device_feature_migration
),
534 sizeof(uint64_t))] = {};
535 struct vfio_device_feature
*feature
= (struct vfio_device_feature
*)buf
;
536 struct vfio_device_feature_migration
*mig
=
537 (struct vfio_device_feature_migration
*)feature
->data
;
539 feature
->argsz
= sizeof(buf
);
540 feature
->flags
= VFIO_DEVICE_FEATURE_GET
| VFIO_DEVICE_FEATURE_MIGRATION
;
541 if (ioctl(vbasedev
->fd
, VFIO_DEVICE_FEATURE
, feature
)) {
542 if (errno
== ENOTTY
) {
543 error_report("%s: VFIO migration is not supported in kernel",
546 error_report("%s: Failed to query VFIO migration support, err: %s",
547 vbasedev
->name
, strerror(errno
));
553 *mig_flags
= mig
->flags
;
558 static int vfio_migration_init(VFIODevice
*vbasedev
)
562 VFIOMigration
*migration
;
564 g_autofree
char *path
= NULL
, *oid
= NULL
;
565 uint64_t mig_flags
= 0;
567 if (!vbasedev
->ops
->vfio_get_object
) {
571 obj
= vbasedev
->ops
->vfio_get_object(vbasedev
);
576 ret
= vfio_migration_query_flags(vbasedev
, &mig_flags
);
581 /* Basic migration functionality must be supported */
582 if (!(mig_flags
& VFIO_MIGRATION_STOP_COPY
)) {
586 vbasedev
->migration
= g_new0(VFIOMigration
, 1);
587 migration
= vbasedev
->migration
;
588 migration
->vbasedev
= vbasedev
;
589 migration
->device_state
= VFIO_DEVICE_STATE_RUNNING
;
590 migration
->data_fd
= -1;
592 oid
= vmstate_if_get_id(VMSTATE_IF(DEVICE(obj
)));
594 path
= g_strdup_printf("%s/vfio", oid
);
596 path
= g_strdup("vfio");
598 strpadcpy(id
, sizeof(id
), path
, '\0');
600 register_savevm_live(id
, VMSTATE_INSTANCE_ID_ANY
, 1, &savevm_vfio_handlers
,
603 migration
->vm_state
= qdev_add_vm_change_state_handler(vbasedev
->dev
,
606 migration
->migration_state
.notify
= vfio_migration_state_notifier
;
607 add_migration_state_change_notifier(&migration
->migration_state
);
612 /* ---------------------------------------------------------------------- */
614 int64_t vfio_mig_bytes_transferred(void)
616 return bytes_transferred
;
619 int vfio_migration_probe(VFIODevice
*vbasedev
, Error
**errp
)
623 if (!vbasedev
->enable_migration
) {
627 ret
= vfio_migration_init(vbasedev
);
632 ret
= vfio_block_multiple_devices_migration(errp
);
637 trace_vfio_migration_probe(vbasedev
->name
);
641 error_setg(&vbasedev
->migration_blocker
,
642 "VFIO device doesn't support migration");
644 ret
= migrate_add_blocker(vbasedev
->migration_blocker
, errp
);
646 error_free(vbasedev
->migration_blocker
);
647 vbasedev
->migration_blocker
= NULL
;
652 void vfio_migration_finalize(VFIODevice
*vbasedev
)
654 if (vbasedev
->migration
) {
655 VFIOMigration
*migration
= vbasedev
->migration
;
657 remove_migration_state_change_notifier(&migration
->migration_state
);
658 qemu_del_vm_change_state_handler(migration
->vm_state
);
659 unregister_savevm(VMSTATE_IF(vbasedev
->dev
), "vfio", vbasedev
);
660 vfio_migration_exit(vbasedev
);
661 vfio_unblock_multiple_devices_migration();
664 if (vbasedev
->migration_blocker
) {
665 migrate_del_blocker(vbasedev
->migration_blocker
);
666 error_free(vbasedev
->migration_blocker
);
667 vbasedev
->migration_blocker
= NULL
;