/*
 * Virtio MEM device
 *
 * Copyright (C) 2020 Red Hat, Inc.
 *
 * Authors:
 *  David Hildenbrand <david@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 * See the COPYING file in the top-level directory.
 */
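
/*
 * Illustrative usage sketch (not normative; property names follow this file,
 * but the exact CLI may differ between QEMU versions). The device is
 * typically exposed to users via a proxy such as "virtio-mem-pci", backed by
 * a memory backend:
 *
 *   -m 4G,maxmem=20G \
 *   -object memory-backend-ram,id=vmem0,size=16G \
 *   -device virtio-mem-pci,id=vm0,memdev=vmem0,requested-size=0
 *
 * The "requested-size" property can be changed at runtime (e.g., via qom-set)
 * to grow or shrink the amount of memory plugged into the guest.
 */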

#include "qemu/osdep.h"
#include "qemu/iov.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
#include "qemu/units.h"
#include "sysemu/numa.h"
#include "sysemu/sysemu.h"
#include "sysemu/reset.h"
#include "hw/virtio/virtio.h"
#include "hw/virtio/virtio-bus.h"
#include "hw/virtio/virtio-access.h"
#include "hw/virtio/virtio-mem.h"
#include "qapi/error.h"
#include "qapi/visitor.h"
#include "exec/ram_addr.h"
#include "migration/misc.h"
#include "hw/boards.h"
#include "hw/qdev-properties.h"
#include CONFIG_DEVICES
#include "trace.h"

static const VMStateDescription vmstate_virtio_mem_device_early;

/*
 * We only had legacy x86 guests that did not support
 * VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE. Other targets don't have legacy guests.
 */
#if defined(TARGET_X86_64) || defined(TARGET_I386)
#define VIRTIO_MEM_HAS_LEGACY_GUESTS
#endif

/*
 * Let's not allow blocks smaller than 1 MiB, for example, to keep the tracking
 * bitmap small.
 */
#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)(1 * MiB))

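/*
 * Note: the THP (PMD) size follows from the base page size, as a PMD maps a
 * full page table: 4 KiB pages yield 2 MiB THPs, 16 KiB pages yield 32 MiB,
 * and 64 KiB pages yield 512 MiB -- the aarch64 cases probed below.
 */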
static uint32_t virtio_mem_default_thp_size(void)
{
    uint32_t default_thp_size = VIRTIO_MEM_MIN_BLOCK_SIZE;

#if defined(__x86_64__) || defined(__arm__) || defined(__powerpc64__)
    default_thp_size = 2 * MiB;
#elif defined(__aarch64__)
    if (qemu_real_host_page_size() == 4 * KiB) {
        default_thp_size = 2 * MiB;
    } else if (qemu_real_host_page_size() == 16 * KiB) {
        default_thp_size = 32 * MiB;
    } else if (qemu_real_host_page_size() == 64 * KiB) {
        default_thp_size = 512 * MiB;
    }
#endif

    return default_thp_size;
}

/*
 * We want to have a reasonable default block size such that
 * 1. We avoid splitting THPs when unplugging memory, which degrades
 *    performance.
 * 2. We avoid placing THPs for plugged blocks that also cover unplugged
 *    blocks.
 *
 * The actual THP size might differ between Linux kernels, so we try to probe
 * it. In the future (if we ever run into issues regarding 2.), we might want
 * to disable THP in case we fail to properly probe the THP size, or if the
 * block size is configured smaller than the THP size.
 */
static uint32_t thp_size;

#define HPAGE_PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
static uint32_t virtio_mem_thp_size(void)
{
    gchar *content = NULL;
    const char *endptr;
    uint64_t tmp;

    if (thp_size) {
        return thp_size;
    }

    /*
     * Try to probe the actual THP size, fall back to (sane but possibly
     * incorrect) default sizes.
     */
    if (g_file_get_contents(HPAGE_PMD_SIZE_PATH, &content, NULL, NULL) &&
        !qemu_strtou64(content, &endptr, 0, &tmp) &&
        (!endptr || *endptr == '\n')) {
        /* Sanity-check the value and fall back to something reasonable. */
        if (!tmp || !is_power_of_2(tmp)) {
            warn_report("Read unsupported THP size: %" PRIx64, tmp);
        } else {
            thp_size = tmp;
        }
    }

    if (!thp_size) {
        thp_size = virtio_mem_default_thp_size();
        warn_report("Could not detect THP size, falling back to %" PRIu32
                    " MiB.", thp_size / MiB);
    }

    g_free(content);
    return thp_size;
}

static uint64_t virtio_mem_default_block_size(RAMBlock *rb)
{
    const uint64_t page_size = qemu_ram_pagesize(rb);

    /* We can have hugetlbfs with a page size smaller than the THP size. */
    if (page_size == qemu_real_host_page_size()) {
        return MAX(page_size, virtio_mem_thp_size());
    }
    return MAX(page_size, VIRTIO_MEM_MIN_BLOCK_SIZE);
}

#if defined(VIRTIO_MEM_HAS_LEGACY_GUESTS)
static bool virtio_mem_has_shared_zeropage(RAMBlock *rb)
{
    /*
     * We only have a guaranteed shared zeropage on ordinary MAP_PRIVATE
     * anonymous RAM. In any other case, reading unplugged *can* populate a
     * fresh page, consuming actual memory.
     */
    return !qemu_ram_is_shared(rb) && rb->fd < 0 &&
           qemu_ram_pagesize(rb) == qemu_real_host_page_size();
}
#endif /* VIRTIO_MEM_HAS_LEGACY_GUESTS */

/*
 * Size the usable region bigger than the requested size if possible. In
 * particular, Linux guests will only add (aligned) memory blocks in case they
 * fully fit into the usable region, but plug+online only a subset of the
 * pages. The memory block size corresponds mostly to the section size.
 *
 * This allows, e.g., adding 20 MiB with a section size of 128 MiB on x86_64,
 * and a section size of 512 MiB on arm64 (as long as the start address is
 * properly aligned, similar to ordinary DIMMs).
 *
 * We can change this at any time and maybe even make it configurable if
 * necessary (as the section size can change). But it's more likely that the
 * section size will rather get smaller and not bigger over time.
 */
#if defined(TARGET_X86_64) || defined(TARGET_I386)
#define VIRTIO_MEM_USABLE_EXTENT (2 * (128 * MiB))
#elif defined(TARGET_ARM)
#define VIRTIO_MEM_USABLE_EXTENT (2 * (512 * MiB))
#else
#error VIRTIO_MEM_USABLE_EXTENT not defined
#endif

static bool virtio_mem_is_busy(void)
{
    /*
     * Postcopy cannot handle concurrent discards and we don't want to migrate
     * pages on-demand with stale content when plugging new blocks.
     *
     * For precopy, we don't want unplugged blocks in our migration stream, and
     * when plugging new blocks, the page content might differ between source
     * and destination (observable by the guest when not initializing pages
     * after plugging them) until we're running on the destination (as we
     * didn't migrate these blocks when they were unplugged).
     */
    return migration_in_incoming_postcopy() || !migration_is_idle();
}

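/*
 * Invoke the callback for each maximal range of contiguous unplugged (zero)
 * blocks in the bitmap, passing the range's offset and size in bytes.
 */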
typedef int (*virtio_mem_range_cb)(const VirtIOMEM *vmem, void *arg,
                                   uint64_t offset, uint64_t size);

static int virtio_mem_for_each_unplugged_range(const VirtIOMEM *vmem, void *arg,
                                               virtio_mem_range_cb cb)
{
    unsigned long first_zero_bit, last_zero_bit;
    uint64_t offset, size;
    int ret = 0;

    first_zero_bit = find_first_zero_bit(vmem->bitmap, vmem->bitmap_size);
    while (first_zero_bit < vmem->bitmap_size) {
        offset = first_zero_bit * vmem->block_size;
        last_zero_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
                                      first_zero_bit + 1) - 1;
        size = (last_zero_bit - first_zero_bit + 1) * vmem->block_size;

        ret = cb(vmem, arg, offset, size);
        if (ret) {
            break;
        }
        /*
         * Bit last_zero_bit + 1 is known to be set (or out of range);
         * resume the search after it.
         */
        first_zero_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
                                            last_zero_bit + 2);
    }
    return ret;
}

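/* Like virtio_mem_for_each_unplugged_range(), but for plugged (set) blocks. */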
static int virtio_mem_for_each_plugged_range(const VirtIOMEM *vmem, void *arg,
                                             virtio_mem_range_cb cb)
{
    unsigned long first_bit, last_bit;
    uint64_t offset, size;
    int ret = 0;

    first_bit = find_first_bit(vmem->bitmap, vmem->bitmap_size);
    while (first_bit < vmem->bitmap_size) {
        offset = first_bit * vmem->block_size;
        last_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
                                      first_bit + 1) - 1;
        size = (last_bit - first_bit + 1) * vmem->block_size;

        ret = cb(vmem, arg, offset, size);
        if (ret) {
            break;
        }
        first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
                                  last_bit + 2);
    }
    return ret;
}

/*
 * Adjust the memory section to cover the intersection with the given range.
 *
 * Returns false if the intersection is empty, otherwise returns true.
 */
static bool virtio_mem_intersect_memory_section(MemoryRegionSection *s,
                                                uint64_t offset, uint64_t size)
{
    uint64_t start = MAX(s->offset_within_region, offset);
    uint64_t end = MIN(s->offset_within_region + int128_get64(s->size),
                       offset + size);

    if (end <= start) {
        return false;
    }

    s->offset_within_address_space += start - s->offset_within_region;
    s->offset_within_region = start;
    s->size = int128_make64(end - start);
    return true;
}

typedef int (*virtio_mem_section_cb)(MemoryRegionSection *s, void *arg);

static int virtio_mem_for_each_plugged_section(const VirtIOMEM *vmem,
                                               MemoryRegionSection *s,
                                               void *arg,
                                               virtio_mem_section_cb cb)
{
    unsigned long first_bit, last_bit;
    uint64_t offset, size;
    int ret = 0;

    first_bit = s->offset_within_region / vmem->block_size;
    first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size, first_bit);
    while (first_bit < vmem->bitmap_size) {
        MemoryRegionSection tmp = *s;

        offset = first_bit * vmem->block_size;
        last_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
                                      first_bit + 1) - 1;
        size = (last_bit - first_bit + 1) * vmem->block_size;

        if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) {
            break;
        }
        ret = cb(&tmp, arg);
        if (ret) {
            break;
        }
        first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
                                  last_bit + 2);
    }
    return ret;
}

static int virtio_mem_for_each_unplugged_section(const VirtIOMEM *vmem,
                                                 MemoryRegionSection *s,
                                                 void *arg,
                                                 virtio_mem_section_cb cb)
{
    unsigned long first_bit, last_bit;
    uint64_t offset, size;
    int ret = 0;

    first_bit = s->offset_within_region / vmem->block_size;
    first_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size, first_bit);
    while (first_bit < vmem->bitmap_size) {
        MemoryRegionSection tmp = *s;

        offset = first_bit * vmem->block_size;
        last_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
                                 first_bit + 1) - 1;
        size = (last_bit - first_bit + 1) * vmem->block_size;

        if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) {
            break;
        }
        ret = cb(&tmp, arg);
        if (ret) {
            break;
        }
        first_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
                                       last_bit + 2);
    }
    return ret;
}

static int virtio_mem_notify_populate_cb(MemoryRegionSection *s, void *arg)
{
    RamDiscardListener *rdl = arg;

    return rdl->notify_populate(rdl, s);
}

static int virtio_mem_notify_discard_cb(MemoryRegionSection *s, void *arg)
{
    RamDiscardListener *rdl = arg;

    rdl->notify_discard(rdl, s);
    return 0;
}

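/*
 * Notify all registered RamDiscardListeners that intersect the given range
 * that the range was discarded (unplugged).
 */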
static void virtio_mem_notify_unplug(VirtIOMEM *vmem, uint64_t offset,
                                     uint64_t size)
{
    RamDiscardListener *rdl;

    QLIST_FOREACH(rdl, &vmem->rdl_list, next) {
        MemoryRegionSection tmp = *rdl->section;

        if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) {
            continue;
        }
        rdl->notify_discard(rdl, &tmp);
    }
}

static int virtio_mem_notify_plug(VirtIOMEM *vmem, uint64_t offset,
                                  uint64_t size)
{
    RamDiscardListener *rdl, *rdl2;
    int ret = 0;

    QLIST_FOREACH(rdl, &vmem->rdl_list, next) {
        MemoryRegionSection tmp = *rdl->section;

        if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) {
            continue;
        }
        ret = rdl->notify_populate(rdl, &tmp);
        if (ret) {
            break;
        }
    }

    if (ret) {
        /* Roll back by notifying all already-notified listeners. */
        QLIST_FOREACH(rdl2, &vmem->rdl_list, next) {
            MemoryRegionSection tmp = *rdl2->section;

            if (rdl2 == rdl) {
                break;
            }
            if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) {
                continue;
            }
            rdl2->notify_discard(rdl2, &tmp);
        }
    }
    return ret;
}

static void virtio_mem_notify_unplug_all(VirtIOMEM *vmem)
{
    RamDiscardListener *rdl;

    if (!vmem->size) {
        return;
    }

    QLIST_FOREACH(rdl, &vmem->rdl_list, next) {
        if (rdl->double_discard_supported) {
            rdl->notify_discard(rdl, rdl->section);
        } else {
            virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl,
                                                virtio_mem_notify_discard_cb);
        }
    }
}

static bool virtio_mem_test_bitmap(const VirtIOMEM *vmem, uint64_t start_gpa,
                                   uint64_t size, bool plugged)
{
    const unsigned long first_bit = (start_gpa - vmem->addr) / vmem->block_size;
    const unsigned long last_bit = first_bit + (size / vmem->block_size) - 1;
    unsigned long found_bit;

    /* We fake a shorter bitmap to avoid searching too far. */
    if (plugged) {
        found_bit = find_next_zero_bit(vmem->bitmap, last_bit + 1, first_bit);
    } else {
        found_bit = find_next_bit(vmem->bitmap, last_bit + 1, first_bit);
    }
    return found_bit > last_bit;
}

static void virtio_mem_set_bitmap(VirtIOMEM *vmem, uint64_t start_gpa,
                                  uint64_t size, bool plugged)
{
    const unsigned long bit = (start_gpa - vmem->addr) / vmem->block_size;
    const unsigned long nbits = size / vmem->block_size;

    if (plugged) {
        bitmap_set(vmem->bitmap, bit, nbits);
    } else {
        bitmap_clear(vmem->bitmap, bit, nbits);
    }
}

static void virtio_mem_send_response(VirtIOMEM *vmem, VirtQueueElement *elem,
                                     struct virtio_mem_resp *resp)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(vmem);
    VirtQueue *vq = vmem->vq;

    trace_virtio_mem_send_response(le16_to_cpu(resp->type));
    iov_from_buf(elem->in_sg, elem->in_num, 0, resp, sizeof(*resp));

    virtqueue_push(vq, elem, sizeof(*resp));
    virtio_notify(vdev, vq);
}

static void virtio_mem_send_response_simple(VirtIOMEM *vmem,
                                            VirtQueueElement *elem,
                                            uint16_t type)
{
    struct virtio_mem_resp resp = {
        .type = cpu_to_le16(type),
    };

    virtio_mem_send_response(vmem, elem, &resp);
}

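/*
 * A range is valid if its start is block-aligned, its size is non-zero,
 * the range doesn't overflow, and it lies completely within the currently
 * usable region.
 */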
static bool virtio_mem_valid_range(const VirtIOMEM *vmem, uint64_t gpa,
                                   uint64_t size)
{
    if (!QEMU_IS_ALIGNED(gpa, vmem->block_size)) {
        return false;
    }
    if (gpa + size < gpa || !size) {
        return false;
    }
    if (gpa < vmem->addr || gpa >= vmem->addr + vmem->usable_region_size) {
        return false;
    }
    if (gpa + size > vmem->addr + vmem->usable_region_size) {
        return false;
    }
    return true;
}

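/*
 * Discard the range in the host memory backend on unplug; on plug, optionally
 * preallocate and notify listeners. The bitmap is updated on success.
 */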
static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa,
                                      uint64_t size, bool plug)
{
    const uint64_t offset = start_gpa - vmem->addr;
    RAMBlock *rb = vmem->memdev->mr.ram_block;

    if (virtio_mem_is_busy()) {
        return -EBUSY;
    }

    if (!plug) {
        if (ram_block_discard_range(rb, offset, size)) {
            return -EBUSY;
        }
        virtio_mem_notify_unplug(vmem, offset, size);
    } else {
        int ret = 0;

        if (vmem->prealloc) {
            void *area = memory_region_get_ram_ptr(&vmem->memdev->mr) + offset;
            int fd = memory_region_get_fd(&vmem->memdev->mr);
            Error *local_err = NULL;

            qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err);
            if (local_err) {
                static bool warned;

                /*
                 * Warn only once; we don't want to fill the log with these
                 * warnings.
                 */
                if (!warned) {
                    warn_report_err(local_err);
                    warned = true;
                } else {
                    error_free(local_err);
                }
                ret = -EBUSY;
            }
        }
        if (!ret) {
            ret = virtio_mem_notify_plug(vmem, offset, size);
        }

        if (ret) {
            /*
             * Either preallocation or a notifier might have populated
             * memory; discard it again.
             */
            ram_block_discard_range(vmem->memdev->mr.ram_block, offset, size);
            return -EBUSY;
        }
    }
    virtio_mem_set_bitmap(vmem, start_gpa, size, plug);
    return 0;
}

static int virtio_mem_state_change_request(VirtIOMEM *vmem, uint64_t gpa,
                                           uint16_t nb_blocks, bool plug)
{
    const uint64_t size = nb_blocks * vmem->block_size;
    int ret;

    if (!virtio_mem_valid_range(vmem, gpa, size)) {
        return VIRTIO_MEM_RESP_ERROR;
    }

    if (plug && (vmem->size + size > vmem->requested_size)) {
        return VIRTIO_MEM_RESP_NACK;
    }

    /* Test whether all blocks are really in the opposite state. */
    if (!virtio_mem_test_bitmap(vmem, gpa, size, !plug)) {
        return VIRTIO_MEM_RESP_ERROR;
    }

    ret = virtio_mem_set_block_state(vmem, gpa, size, plug);
    if (ret) {
        return VIRTIO_MEM_RESP_BUSY;
    }
    if (plug) {
        vmem->size += size;
    } else {
        vmem->size -= size;
    }
    notifier_list_notify(&vmem->size_change_notifiers, &vmem->size);
    return VIRTIO_MEM_RESP_ACK;
}

static void virtio_mem_plug_request(VirtIOMEM *vmem, VirtQueueElement *elem,
                                    struct virtio_mem_req *req)
{
    const uint64_t gpa = le64_to_cpu(req->u.plug.addr);
    const uint16_t nb_blocks = le16_to_cpu(req->u.plug.nb_blocks);
    uint16_t type;

    trace_virtio_mem_plug_request(gpa, nb_blocks);
    type = virtio_mem_state_change_request(vmem, gpa, nb_blocks, true);
    virtio_mem_send_response_simple(vmem, elem, type);
}

static void virtio_mem_unplug_request(VirtIOMEM *vmem, VirtQueueElement *elem,
                                      struct virtio_mem_req *req)
{
    const uint64_t gpa = le64_to_cpu(req->u.unplug.addr);
    const uint16_t nb_blocks = le16_to_cpu(req->u.unplug.nb_blocks);
    uint16_t type;

    trace_virtio_mem_unplug_request(gpa, nb_blocks);
    type = virtio_mem_state_change_request(vmem, gpa, nb_blocks, false);
    virtio_mem_send_response_simple(vmem, elem, type);
}

static void virtio_mem_resize_usable_region(VirtIOMEM *vmem,
                                            uint64_t requested_size,
                                            bool can_shrink)
{
    uint64_t newsize = MIN(memory_region_size(&vmem->memdev->mr),
                           requested_size + VIRTIO_MEM_USABLE_EXTENT);

    /* The usable region size always has to be a multiple of the block size. */
    newsize = QEMU_ALIGN_UP(newsize, vmem->block_size);

    if (!requested_size) {
        newsize = 0;
    }

    if (newsize < vmem->usable_region_size && !can_shrink) {
        return;
    }

    trace_virtio_mem_resized_usable_region(vmem->usable_region_size, newsize);
    vmem->usable_region_size = newsize;
}

static int virtio_mem_unplug_all(VirtIOMEM *vmem)
{
    RAMBlock *rb = vmem->memdev->mr.ram_block;

    if (virtio_mem_is_busy()) {
        return -EBUSY;
    }

    if (ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb))) {
        return -EBUSY;
    }
    virtio_mem_notify_unplug_all(vmem);

    bitmap_clear(vmem->bitmap, 0, vmem->bitmap_size);
    if (vmem->size) {
        vmem->size = 0;
        notifier_list_notify(&vmem->size_change_notifiers, &vmem->size);
    }
    trace_virtio_mem_unplugged_all();
    virtio_mem_resize_usable_region(vmem, vmem->requested_size, true);
    return 0;
}

static void virtio_mem_unplug_all_request(VirtIOMEM *vmem,
                                          VirtQueueElement *elem)
{
    trace_virtio_mem_unplug_all_request();
    if (virtio_mem_unplug_all(vmem)) {
        virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_BUSY);
    } else {
        virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_ACK);
    }
}

static void virtio_mem_state_request(VirtIOMEM *vmem, VirtQueueElement *elem,
                                     struct virtio_mem_req *req)
{
    const uint16_t nb_blocks = le16_to_cpu(req->u.state.nb_blocks);
    const uint64_t gpa = le64_to_cpu(req->u.state.addr);
    const uint64_t size = nb_blocks * vmem->block_size;
    struct virtio_mem_resp resp = {
        .type = cpu_to_le16(VIRTIO_MEM_RESP_ACK),
    };

    trace_virtio_mem_state_request(gpa, nb_blocks);
    if (!virtio_mem_valid_range(vmem, gpa, size)) {
        virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_ERROR);
        return;
    }

    if (virtio_mem_test_bitmap(vmem, gpa, size, true)) {
        resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_PLUGGED);
    } else if (virtio_mem_test_bitmap(vmem, gpa, size, false)) {
        resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_UNPLUGGED);
    } else {
        resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_MIXED);
    }
    trace_virtio_mem_state_response(le16_to_cpu(resp.u.state.state));
    virtio_mem_send_response(vmem, elem, &resp);
}

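/*
 * Virtqueue handler: pop guest requests, dispatch by request type, and send
 * a response for each well-formed request.
 */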
static void virtio_mem_handle_request(VirtIODevice *vdev, VirtQueue *vq)
{
    const int len = sizeof(struct virtio_mem_req);
    VirtIOMEM *vmem = VIRTIO_MEM(vdev);
    VirtQueueElement *elem;
    struct virtio_mem_req req;
    uint16_t type;

    while (true) {
        elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
        if (!elem) {
            return;
        }

        if (iov_to_buf(elem->out_sg, elem->out_num, 0, &req, len) < len) {
            virtio_error(vdev, "virtio-mem protocol violation: invalid request"
                         " size: %d", len);
            virtqueue_detach_element(vq, elem, 0);
            g_free(elem);
            return;
        }

        if (iov_size(elem->in_sg, elem->in_num) <
            sizeof(struct virtio_mem_resp)) {
            virtio_error(vdev, "virtio-mem protocol violation: not enough space"
                         " for response: %zu",
                         iov_size(elem->in_sg, elem->in_num));
            virtqueue_detach_element(vq, elem, 0);
            g_free(elem);
            return;
        }

        type = le16_to_cpu(req.type);
        switch (type) {
        case VIRTIO_MEM_REQ_PLUG:
            virtio_mem_plug_request(vmem, elem, &req);
            break;
        case VIRTIO_MEM_REQ_UNPLUG:
            virtio_mem_unplug_request(vmem, elem, &req);
            break;
        case VIRTIO_MEM_REQ_UNPLUG_ALL:
            virtio_mem_unplug_all_request(vmem, elem);
            break;
        case VIRTIO_MEM_REQ_STATE:
            virtio_mem_state_request(vmem, elem, &req);
            break;
        default:
            virtio_error(vdev, "virtio-mem protocol violation: unknown request"
                         " type: %d", type);
            virtqueue_detach_element(vq, elem, 0);
            g_free(elem);
            return;
        }

        g_free(elem);
    }
}

static void virtio_mem_get_config(VirtIODevice *vdev, uint8_t *config_data)
{
    VirtIOMEM *vmem = VIRTIO_MEM(vdev);
    struct virtio_mem_config *config = (void *) config_data;

    config->block_size = cpu_to_le64(vmem->block_size);
    config->node_id = cpu_to_le16(vmem->node);
    config->requested_size = cpu_to_le64(vmem->requested_size);
    config->plugged_size = cpu_to_le64(vmem->size);
    config->addr = cpu_to_le64(vmem->addr);
    config->region_size = cpu_to_le64(memory_region_size(&vmem->memdev->mr));
    config->usable_region_size = cpu_to_le64(vmem->usable_region_size);
}

static uint64_t virtio_mem_get_features(VirtIODevice *vdev, uint64_t features,
                                        Error **errp)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    VirtIOMEM *vmem = VIRTIO_MEM(vdev);

    if (ms->numa_state) {
#if defined(CONFIG_ACPI)
        virtio_add_feature(&features, VIRTIO_MEM_F_ACPI_PXM);
#endif
    }
    assert(vmem->unplugged_inaccessible != ON_OFF_AUTO_AUTO);
    if (vmem->unplugged_inaccessible == ON_OFF_AUTO_ON) {
        virtio_add_feature(&features, VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE);
    }
    return features;
}

static int virtio_mem_validate_features(VirtIODevice *vdev)
{
    if (virtio_host_has_feature(vdev, VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE) &&
        !virtio_vdev_has_feature(vdev, VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE)) {
        return -EFAULT;
    }
    return 0;
}

static void virtio_mem_system_reset(void *opaque)
{
    VirtIOMEM *vmem = VIRTIO_MEM(opaque);

    /*
     * During usual resets, we will unplug all memory and shrink the usable
     * region size. This is, however, not possible in all scenarios; the
     * guest then has to deal with it manually (VIRTIO_MEM_REQ_UNPLUG_ALL).
     */
    virtio_mem_unplug_all(vmem);
}

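/*
 * Realize: validate properties, discard all backend memory so the device
 * starts out with everything unplugged, and register as the memory region's
 * RamDiscardManager.
 */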
static void virtio_mem_device_realize(DeviceState *dev, Error **errp)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    int nb_numa_nodes = ms->numa_state ? ms->numa_state->num_nodes : 0;
    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
    VirtIOMEM *vmem = VIRTIO_MEM(dev);
    uint64_t page_size;
    RAMBlock *rb;
    int ret;

    if (!vmem->memdev) {
        error_setg(errp, "'%s' property is not set", VIRTIO_MEM_MEMDEV_PROP);
        return;
    } else if (host_memory_backend_is_mapped(vmem->memdev)) {
        error_setg(errp, "'%s' property specifies a busy memdev: %s",
                   VIRTIO_MEM_MEMDEV_PROP,
                   object_get_canonical_path_component(OBJECT(vmem->memdev)));
        return;
    } else if (!memory_region_is_ram(&vmem->memdev->mr) ||
               memory_region_is_rom(&vmem->memdev->mr) ||
               !vmem->memdev->mr.ram_block) {
        error_setg(errp, "'%s' property specifies an unsupported memdev",
                   VIRTIO_MEM_MEMDEV_PROP);
        return;
    } else if (vmem->memdev->prealloc) {
        error_setg(errp, "'%s' property specifies a memdev with preallocation"
                   " enabled: %s. Instead, specify 'prealloc=on' for the"
                   " virtio-mem device.", VIRTIO_MEM_MEMDEV_PROP,
                   object_get_canonical_path_component(OBJECT(vmem->memdev)));
        return;
    }

    if ((nb_numa_nodes && vmem->node >= nb_numa_nodes) ||
        (!nb_numa_nodes && vmem->node)) {
        error_setg(errp, "'%s' property has value '%" PRIu32 "', which exceeds"
                   " the number of numa nodes: %d", VIRTIO_MEM_NODE_PROP,
                   vmem->node, nb_numa_nodes ? nb_numa_nodes : 1);
        return;
    }

    if (enable_mlock) {
        error_setg(errp, "Incompatible with mlock");
        return;
    }

    rb = vmem->memdev->mr.ram_block;
    page_size = qemu_ram_pagesize(rb);

#if defined(VIRTIO_MEM_HAS_LEGACY_GUESTS)
    switch (vmem->unplugged_inaccessible) {
    case ON_OFF_AUTO_AUTO:
        if (virtio_mem_has_shared_zeropage(rb)) {
            vmem->unplugged_inaccessible = ON_OFF_AUTO_OFF;
        } else {
            vmem->unplugged_inaccessible = ON_OFF_AUTO_ON;
        }
        break;
    case ON_OFF_AUTO_OFF:
        if (!virtio_mem_has_shared_zeropage(rb)) {
            warn_report("'%s' property set to 'off' with a memdev that does"
                        " not support the shared zeropage.",
                        VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP);
        }
        break;
    default:
        break;
    }
#else /* VIRTIO_MEM_HAS_LEGACY_GUESTS */
    vmem->unplugged_inaccessible = ON_OFF_AUTO_ON;
#endif /* VIRTIO_MEM_HAS_LEGACY_GUESTS */

    /*
     * If the block size wasn't configured by the user, use a sane default.
     * This allows using hugetlbfs backends of any page size without manual
     * intervention.
     */
    if (!vmem->block_size) {
        vmem->block_size = virtio_mem_default_block_size(rb);
    }

    if (vmem->block_size < page_size) {
        error_setg(errp, "'%s' property has to be at least the page size (0x%"
                   PRIx64 ")", VIRTIO_MEM_BLOCK_SIZE_PROP, page_size);
        return;
    } else if (vmem->block_size < virtio_mem_default_block_size(rb)) {
        warn_report("'%s' property is smaller than the default block size (%"
                    PRIx64 " MiB)", VIRTIO_MEM_BLOCK_SIZE_PROP,
                    virtio_mem_default_block_size(rb) / MiB);
    }
    if (!QEMU_IS_ALIGNED(vmem->requested_size, vmem->block_size)) {
        error_setg(errp, "'%s' property has to be a multiple of '%s' (0x%"
                   PRIx64 ")", VIRTIO_MEM_REQUESTED_SIZE_PROP,
                   VIRTIO_MEM_BLOCK_SIZE_PROP, vmem->block_size);
        return;
    } else if (!QEMU_IS_ALIGNED(vmem->addr, vmem->block_size)) {
        error_setg(errp, "'%s' property has to be a multiple of '%s' (0x%"
                   PRIx64 ")", VIRTIO_MEM_ADDR_PROP,
                   VIRTIO_MEM_BLOCK_SIZE_PROP, vmem->block_size);
        return;
    } else if (!QEMU_IS_ALIGNED(memory_region_size(&vmem->memdev->mr),
                                vmem->block_size)) {
        error_setg(errp, "'%s' property memdev size has to be a multiple of"
                   " '%s' (0x%" PRIx64 ")", VIRTIO_MEM_MEMDEV_PROP,
                   VIRTIO_MEM_BLOCK_SIZE_PROP, vmem->block_size);
        return;
    }

    if (ram_block_coordinated_discard_require(true)) {
        error_setg(errp, "Discarding RAM is disabled");
        return;
    }

    ret = ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb));
    if (ret) {
        error_setg_errno(errp, -ret, "Unexpected error discarding RAM");
        ram_block_coordinated_discard_require(false);
        return;
    }

    virtio_mem_resize_usable_region(vmem, vmem->requested_size, true);

    vmem->bitmap_size = memory_region_size(&vmem->memdev->mr) /
                        vmem->block_size;
    vmem->bitmap = bitmap_new(vmem->bitmap_size);

    virtio_init(vdev, VIRTIO_ID_MEM, sizeof(struct virtio_mem_config));
    vmem->vq = virtio_add_queue(vdev, 128, virtio_mem_handle_request);

    host_memory_backend_set_mapped(vmem->memdev, true);
    vmstate_register_ram(&vmem->memdev->mr, DEVICE(vmem));
    if (vmem->early_migration) {
        vmstate_register(VMSTATE_IF(vmem), VMSTATE_INSTANCE_ID_ANY,
                         &vmstate_virtio_mem_device_early, vmem);
    }
    qemu_register_reset(virtio_mem_system_reset, vmem);

    /*
     * Set ourselves as RamDiscardManager before the plug handler maps the
     * memory region and exposes it via an address space.
     */
    memory_region_set_ram_discard_manager(&vmem->memdev->mr,
                                          RAM_DISCARD_MANAGER(vmem));
}

static void virtio_mem_device_unrealize(DeviceState *dev)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
    VirtIOMEM *vmem = VIRTIO_MEM(dev);

    /*
     * The unplug handler unmapped the memory region; it can no longer be
     * found via an address space. Unset ourselves.
     */
    memory_region_set_ram_discard_manager(&vmem->memdev->mr, NULL);
    qemu_unregister_reset(virtio_mem_system_reset, vmem);
    if (vmem->early_migration) {
        vmstate_unregister(VMSTATE_IF(vmem), &vmstate_virtio_mem_device_early,
                           vmem);
    }
    vmstate_unregister_ram(&vmem->memdev->mr, DEVICE(vmem));
    host_memory_backend_set_mapped(vmem->memdev, false);
    virtio_del_queue(vdev, 0);
    virtio_cleanup(vdev);
    g_free(vmem->bitmap);
    ram_block_coordinated_discard_require(false);
}

static int virtio_mem_discard_range_cb(const VirtIOMEM *vmem, void *arg,
                                       uint64_t offset, uint64_t size)
{
    RAMBlock *rb = vmem->memdev->mr.ram_block;

    return ram_block_discard_range(rb, offset, size) ? -EINVAL : 0;
}

static int virtio_mem_restore_unplugged(VirtIOMEM *vmem)
{
    /* Make sure all memory is really discarded after migration. */
    return virtio_mem_for_each_unplugged_range(vmem, NULL,
                                               virtio_mem_discard_range_cb);
}

static int virtio_mem_post_load(void *opaque, int version_id)
{
    VirtIOMEM *vmem = VIRTIO_MEM(opaque);
    RamDiscardListener *rdl;
    int ret;

    if (vmem->prealloc && !vmem->early_migration) {
        warn_report("Proper preallocation with migration requires a newer QEMU machine");
    }

    /*
     * We started out with all memory discarded and our memory region is mapped
     * into an address space. Replay, now that we updated the bitmap.
     */
    QLIST_FOREACH(rdl, &vmem->rdl_list, next) {
        ret = virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl,
                                                  virtio_mem_notify_populate_cb);
        if (ret) {
            return ret;
        }
    }

    if (migration_in_incoming_postcopy()) {
        return 0;
    }

    return virtio_mem_restore_unplugged(vmem);
}

static int virtio_mem_prealloc_range_cb(const VirtIOMEM *vmem, void *arg,
                                        uint64_t offset, uint64_t size)
{
    void *area = memory_region_get_ram_ptr(&vmem->memdev->mr) + offset;
    int fd = memory_region_get_fd(&vmem->memdev->mr);
    Error *local_err = NULL;

    qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err);
    if (local_err) {
        error_report_err(local_err);
        return -ENOMEM;
    }
    return 0;
}

static int virtio_mem_post_load_early(void *opaque, int version_id)
{
    VirtIOMEM *vmem = VIRTIO_MEM(opaque);
    RAMBlock *rb = vmem->memdev->mr.ram_block;
    int ret;

    if (!vmem->prealloc) {
        return 0;
    }

    /*
     * We restored the bitmap and verified that the basic properties
     * match on source and destination, so we can go ahead and preallocate
     * memory for all plugged memory blocks, before actual RAM migration starts
     * touching this memory.
     */
    ret = virtio_mem_for_each_plugged_range(vmem, NULL,
                                            virtio_mem_prealloc_range_cb);
    if (ret) {
        return ret;
    }

    /*
     * This is tricky: postcopy wants to start with a clean slate. On
     * POSTCOPY_INCOMING_ADVISE, postcopy code discards all (ordinarily
     * preallocated) RAM such that postcopy will work as expected later.
     *
     * However, we run after POSTCOPY_INCOMING_ADVISE -- but before actual
     * RAM migration. So let's discard all memory again. This looks like an
     * expensive NOP, but actually serves a purpose: we made sure that we
     * were able to allocate all required backend memory once. We cannot
     * guarantee that the backend memory we will free will remain free
     * until we need it during postcopy, but at least we can catch the
     * obvious setup issues this way.
     */
    if (migration_incoming_postcopy_advised()) {
        if (ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb))) {
            return -EBUSY;
        }
    }
    return 0;
}

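/*
 * Migration sanity checks: properties that must not differ between source
 * and destination are transmitted and compared, as the migrated state can
 * only be interpreted consistently when they match.
 */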
typedef struct VirtIOMEMMigSanityChecks {
    VirtIOMEM *parent;
    uint64_t addr;
    uint64_t region_size;
    uint64_t block_size;
    uint32_t node;
} VirtIOMEMMigSanityChecks;

static int virtio_mem_mig_sanity_checks_pre_save(void *opaque)
{
    VirtIOMEMMigSanityChecks *tmp = opaque;
    VirtIOMEM *vmem = tmp->parent;

    tmp->addr = vmem->addr;
    tmp->region_size = memory_region_size(&vmem->memdev->mr);
    tmp->block_size = vmem->block_size;
    tmp->node = vmem->node;
    return 0;
}

static int virtio_mem_mig_sanity_checks_post_load(void *opaque, int version_id)
{
    VirtIOMEMMigSanityChecks *tmp = opaque;
    VirtIOMEM *vmem = tmp->parent;
    const uint64_t new_region_size = memory_region_size(&vmem->memdev->mr);

    if (tmp->addr != vmem->addr) {
        error_report("Property '%s' changed from 0x%" PRIx64 " to 0x%" PRIx64,
                     VIRTIO_MEM_ADDR_PROP, tmp->addr, vmem->addr);
        return -EINVAL;
    }
    /*
     * Note: Preparation for resizeable memory regions. The maximum size
     * of the memory region must not change during migration.
     */
    if (tmp->region_size != new_region_size) {
        error_report("Property '%s' size changed from 0x%" PRIx64 " to 0x%"
                     PRIx64, VIRTIO_MEM_MEMDEV_PROP, tmp->region_size,
                     new_region_size);
        return -EINVAL;
    }
    if (tmp->block_size != vmem->block_size) {
        error_report("Property '%s' changed from 0x%" PRIx64 " to 0x%" PRIx64,
                     VIRTIO_MEM_BLOCK_SIZE_PROP, tmp->block_size,
                     vmem->block_size);
        return -EINVAL;
    }
    if (tmp->node != vmem->node) {
        error_report("Property '%s' changed from %" PRIu32 " to %" PRIu32,
                     VIRTIO_MEM_NODE_PROP, tmp->node, vmem->node);
        return -EINVAL;
    }
    return 0;
}

static const VMStateDescription vmstate_virtio_mem_sanity_checks = {
    .name = "virtio-mem-device/sanity-checks",
    .pre_save = virtio_mem_mig_sanity_checks_pre_save,
    .post_load = virtio_mem_mig_sanity_checks_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_UINT64(addr, VirtIOMEMMigSanityChecks),
        VMSTATE_UINT64(region_size, VirtIOMEMMigSanityChecks),
        VMSTATE_UINT64(block_size, VirtIOMEMMigSanityChecks),
        VMSTATE_UINT32(node, VirtIOMEMMigSanityChecks),
        VMSTATE_END_OF_LIST(),
    },
};

static bool virtio_mem_vmstate_field_exists(void *opaque, int version_id)
{
    const VirtIOMEM *vmem = VIRTIO_MEM(opaque);

    /* With early migration, these fields were already migrated. */
    return !vmem->early_migration;
}

static const VMStateDescription vmstate_virtio_mem_device = {
    .name = "virtio-mem-device",
    .minimum_version_id = 1,
    .version_id = 1,
    .priority = MIG_PRI_VIRTIO_MEM,
    .post_load = virtio_mem_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_WITH_TMP_TEST(VirtIOMEM, virtio_mem_vmstate_field_exists,
                              VirtIOMEMMigSanityChecks,
                              vmstate_virtio_mem_sanity_checks),
        VMSTATE_UINT64(usable_region_size, VirtIOMEM),
        VMSTATE_UINT64_TEST(size, VirtIOMEM, virtio_mem_vmstate_field_exists),
        VMSTATE_UINT64(requested_size, VirtIOMEM),
        VMSTATE_BITMAP_TEST(bitmap, VirtIOMEM, virtio_mem_vmstate_field_exists,
                            0, bitmap_size),
        VMSTATE_END_OF_LIST()
    },
};

/*
 * Transfer properties that are immutable while migration is active early,
 * such that we have this information around before migrating any RAM
 * content.
 *
 * Note that virtio_mem_is_busy() makes sure these properties can no longer
 * change on the migration source until migration completed.
 *
 * With QEMU compat machines, we transmit these properties later, via
 * vmstate_virtio_mem_device instead -- see virtio_mem_vmstate_field_exists().
 */
static const VMStateDescription vmstate_virtio_mem_device_early = {
    .name = "virtio-mem-device-early",
    .minimum_version_id = 1,
    .version_id = 1,
    .early_setup = true,
    .post_load = virtio_mem_post_load_early,
    .fields = (VMStateField[]) {
        VMSTATE_WITH_TMP(VirtIOMEM, VirtIOMEMMigSanityChecks,
                         vmstate_virtio_mem_sanity_checks),
        VMSTATE_UINT64(size, VirtIOMEM),
        VMSTATE_BITMAP(bitmap, VirtIOMEM, 0, bitmap_size),
        VMSTATE_END_OF_LIST()
    },
};

static const VMStateDescription vmstate_virtio_mem = {
    .name = "virtio-mem",
    .minimum_version_id = 1,
    .version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_VIRTIO_DEVICE,
        VMSTATE_END_OF_LIST()
    },
};

static void virtio_mem_fill_device_info(const VirtIOMEM *vmem,
                                        VirtioMEMDeviceInfo *vi)
{
    vi->memaddr = vmem->addr;
    vi->node = vmem->node;
    vi->requested_size = vmem->requested_size;
    vi->size = vmem->size;
    vi->max_size = memory_region_size(&vmem->memdev->mr);
    vi->block_size = vmem->block_size;
    vi->memdev = object_get_canonical_path(OBJECT(vmem->memdev));
}

static MemoryRegion *virtio_mem_get_memory_region(VirtIOMEM *vmem, Error **errp)
{
    if (!vmem->memdev) {
        error_setg(errp, "'%s' property must be set", VIRTIO_MEM_MEMDEV_PROP);
        return NULL;
    }

    return &vmem->memdev->mr;
}

static void virtio_mem_add_size_change_notifier(VirtIOMEM *vmem,
                                                Notifier *notifier)
{
    notifier_list_add(&vmem->size_change_notifiers, notifier);
}

static void virtio_mem_remove_size_change_notifier(VirtIOMEM *vmem,
                                                   Notifier *notifier)
{
    notifier_remove(notifier);
}

static void virtio_mem_get_size(Object *obj, Visitor *v, const char *name,
                                void *opaque, Error **errp)
{
    const VirtIOMEM *vmem = VIRTIO_MEM(obj);
    uint64_t value = vmem->size;

    visit_type_size(v, name, &value, errp);
}

static void virtio_mem_get_requested_size(Object *obj, Visitor *v,
                                          const char *name, void *opaque,
                                          Error **errp)
{
    const VirtIOMEM *vmem = VIRTIO_MEM(obj);
    uint64_t value = vmem->requested_size;

    visit_type_size(v, name, &value, errp);
}

static void virtio_mem_set_requested_size(Object *obj, Visitor *v,
                                          const char *name, void *opaque,
                                          Error **errp)
{
    VirtIOMEM *vmem = VIRTIO_MEM(obj);
    uint64_t value;

    if (!visit_type_size(v, name, &value, errp)) {
        return;
    }

    /*
     * The block size and memory backend are not fixed until the device was
     * realized. realize() will verify these properties then.
     */
    if (DEVICE(obj)->realized) {
        if (!QEMU_IS_ALIGNED(value, vmem->block_size)) {
            error_setg(errp, "'%s' has to be a multiple of '%s' (0x%" PRIx64
                       ")", name, VIRTIO_MEM_BLOCK_SIZE_PROP,
                       vmem->block_size);
            return;
        } else if (value > memory_region_size(&vmem->memdev->mr)) {
            error_setg(errp, "'%s' cannot exceed the memory backend size"
                       " (0x%" PRIx64 ")", name,
                       memory_region_size(&vmem->memdev->mr));
            return;
        }

        if (value != vmem->requested_size) {
            virtio_mem_resize_usable_region(vmem, value, false);
            vmem->requested_size = value;
        }
        /*
         * Trigger a config update so the guest gets notified. We trigger
         * even if the size didn't change (especially helpful for debugging).
         */
        virtio_notify_config(VIRTIO_DEVICE(vmem));
    } else {
        vmem->requested_size = value;
    }
}

static void virtio_mem_get_block_size(Object *obj, Visitor *v, const char *name,
                                      void *opaque, Error **errp)
{
    const VirtIOMEM *vmem = VIRTIO_MEM(obj);
    uint64_t value = vmem->block_size;

    /*
     * If not configured by the user (and we're not realized yet), use the
     * default block size we would use with the current memory backend.
     */
    if (!value) {
        if (vmem->memdev && memory_region_is_ram(&vmem->memdev->mr)) {
            value = virtio_mem_default_block_size(vmem->memdev->mr.ram_block);
        } else {
            value = virtio_mem_thp_size();
        }
    }

    visit_type_size(v, name, &value, errp);
}

static void virtio_mem_set_block_size(Object *obj, Visitor *v, const char *name,
                                      void *opaque, Error **errp)
{
    VirtIOMEM *vmem = VIRTIO_MEM(obj);
    uint64_t value;

    if (DEVICE(obj)->realized) {
        error_setg(errp, "'%s' cannot be changed", name);
        return;
    }

    if (!visit_type_size(v, name, &value, errp)) {
        return;
    }

    if (value < VIRTIO_MEM_MIN_BLOCK_SIZE) {
        error_setg(errp, "'%s' property has to be at least 0x%" PRIx32, name,
                   VIRTIO_MEM_MIN_BLOCK_SIZE);
        return;
    } else if (!is_power_of_2(value)) {
        error_setg(errp, "'%s' property has to be a power of two", name);
        return;
    }
    vmem->block_size = value;
}

static void virtio_mem_instance_init(Object *obj)
{
    VirtIOMEM *vmem = VIRTIO_MEM(obj);

    notifier_list_init(&vmem->size_change_notifiers);
    QLIST_INIT(&vmem->rdl_list);

    object_property_add(obj, VIRTIO_MEM_SIZE_PROP, "size", virtio_mem_get_size,
                        NULL, NULL, NULL);
    object_property_add(obj, VIRTIO_MEM_REQUESTED_SIZE_PROP, "size",
                        virtio_mem_get_requested_size,
                        virtio_mem_set_requested_size, NULL, NULL);
    object_property_add(obj, VIRTIO_MEM_BLOCK_SIZE_PROP, "size",
                        virtio_mem_get_block_size, virtio_mem_set_block_size,
                        NULL, NULL);
}

static Property virtio_mem_properties[] = {
    DEFINE_PROP_UINT64(VIRTIO_MEM_ADDR_PROP, VirtIOMEM, addr, 0),
    DEFINE_PROP_UINT32(VIRTIO_MEM_NODE_PROP, VirtIOMEM, node, 0),
    DEFINE_PROP_BOOL(VIRTIO_MEM_PREALLOC_PROP, VirtIOMEM, prealloc, false),
    DEFINE_PROP_LINK(VIRTIO_MEM_MEMDEV_PROP, VirtIOMEM, memdev,
                     TYPE_MEMORY_BACKEND, HostMemoryBackend *),
#if defined(VIRTIO_MEM_HAS_LEGACY_GUESTS)
    DEFINE_PROP_ON_OFF_AUTO(VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP, VirtIOMEM,
                            unplugged_inaccessible, ON_OFF_AUTO_ON),
#endif
    DEFINE_PROP_BOOL(VIRTIO_MEM_EARLY_MIGRATION_PROP, VirtIOMEM,
                     early_migration, true),
    DEFINE_PROP_END_OF_LIST(),
};

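/*
 * RamDiscardManager implementation: exposes which parts of the memory region
 * are populated (plugged) at a block_size granularity, so that other parties
 * (e.g., VFIO) can restrict their mappings to populated parts.
 */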
static uint64_t virtio_mem_rdm_get_min_granularity(const RamDiscardManager *rdm,
                                                   const MemoryRegion *mr)
{
    const VirtIOMEM *vmem = VIRTIO_MEM(rdm);

    g_assert(mr == &vmem->memdev->mr);
    return vmem->block_size;
}

static bool virtio_mem_rdm_is_populated(const RamDiscardManager *rdm,
                                        const MemoryRegionSection *s)
{
    const VirtIOMEM *vmem = VIRTIO_MEM(rdm);
    uint64_t start_gpa = vmem->addr + s->offset_within_region;
    uint64_t end_gpa = start_gpa + int128_get64(s->size);

    g_assert(s->mr == &vmem->memdev->mr);

    start_gpa = QEMU_ALIGN_DOWN(start_gpa, vmem->block_size);
    end_gpa = QEMU_ALIGN_UP(end_gpa, vmem->block_size);

    if (!virtio_mem_valid_range(vmem, start_gpa, end_gpa - start_gpa)) {
        return false;
    }

    return virtio_mem_test_bitmap(vmem, start_gpa, end_gpa - start_gpa, true);
}

struct VirtIOMEMReplayData {
    void *fn;
    void *opaque;
};

static int virtio_mem_rdm_replay_populated_cb(MemoryRegionSection *s, void *arg)
{
    struct VirtIOMEMReplayData *data = arg;

    return ((ReplayRamPopulate)data->fn)(s, data->opaque);
}

static int virtio_mem_rdm_replay_populated(const RamDiscardManager *rdm,
                                           MemoryRegionSection *s,
                                           ReplayRamPopulate replay_fn,
                                           void *opaque)
{
    const VirtIOMEM *vmem = VIRTIO_MEM(rdm);
    struct VirtIOMEMReplayData data = {
        .fn = replay_fn,
        .opaque = opaque,
    };

    g_assert(s->mr == &vmem->memdev->mr);
    return virtio_mem_for_each_plugged_section(vmem, s, &data,
                                               virtio_mem_rdm_replay_populated_cb);
}

static int virtio_mem_rdm_replay_discarded_cb(MemoryRegionSection *s,
                                              void *arg)
{
    struct VirtIOMEMReplayData *data = arg;

    ((ReplayRamDiscard)data->fn)(s, data->opaque);
    return 0;
}

static void virtio_mem_rdm_replay_discarded(const RamDiscardManager *rdm,
                                            MemoryRegionSection *s,
                                            ReplayRamDiscard replay_fn,
                                            void *opaque)
{
    const VirtIOMEM *vmem = VIRTIO_MEM(rdm);
    struct VirtIOMEMReplayData data = {
        .fn = replay_fn,
        .opaque = opaque,
    };

    g_assert(s->mr == &vmem->memdev->mr);
    virtio_mem_for_each_unplugged_section(vmem, s, &data,
                                          virtio_mem_rdm_replay_discarded_cb);
}

static void virtio_mem_rdm_register_listener(RamDiscardManager *rdm,
                                             RamDiscardListener *rdl,
                                             MemoryRegionSection *s)
{
    VirtIOMEM *vmem = VIRTIO_MEM(rdm);
    int ret;

    g_assert(s->mr == &vmem->memdev->mr);
    rdl->section = memory_region_section_new_copy(s);

    QLIST_INSERT_HEAD(&vmem->rdl_list, rdl, next);
    ret = virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl,
                                              virtio_mem_notify_populate_cb);
    if (ret) {
        error_report("%s: Replaying plugged ranges failed: %s", __func__,
                     strerror(-ret));
    }
}

static void virtio_mem_rdm_unregister_listener(RamDiscardManager *rdm,
                                               RamDiscardListener *rdl)
{
    VirtIOMEM *vmem = VIRTIO_MEM(rdm);

    g_assert(rdl->section->mr == &vmem->memdev->mr);
    if (vmem->size) {
        if (rdl->double_discard_supported) {
            rdl->notify_discard(rdl, rdl->section);
        } else {
            virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl,
                                                virtio_mem_notify_discard_cb);
        }
    }

    memory_region_section_free_copy(rdl->section);
    rdl->section = NULL;
    QLIST_REMOVE(rdl, next);
}

static void virtio_mem_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
    VirtIOMEMClass *vmc = VIRTIO_MEM_CLASS(klass);
    RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_CLASS(klass);

    device_class_set_props(dc, virtio_mem_properties);
    dc->vmsd = &vmstate_virtio_mem;

    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
    vdc->realize = virtio_mem_device_realize;
    vdc->unrealize = virtio_mem_device_unrealize;
    vdc->get_config = virtio_mem_get_config;
    vdc->get_features = virtio_mem_get_features;
    vdc->validate_features = virtio_mem_validate_features;
    vdc->vmsd = &vmstate_virtio_mem_device;

    vmc->fill_device_info = virtio_mem_fill_device_info;
    vmc->get_memory_region = virtio_mem_get_memory_region;
    vmc->add_size_change_notifier = virtio_mem_add_size_change_notifier;
    vmc->remove_size_change_notifier = virtio_mem_remove_size_change_notifier;

    rdmc->get_min_granularity = virtio_mem_rdm_get_min_granularity;
    rdmc->is_populated = virtio_mem_rdm_is_populated;
    rdmc->replay_populated = virtio_mem_rdm_replay_populated;
    rdmc->replay_discarded = virtio_mem_rdm_replay_discarded;
    rdmc->register_listener = virtio_mem_rdm_register_listener;
    rdmc->unregister_listener = virtio_mem_rdm_unregister_listener;
}

static const TypeInfo virtio_mem_info = {
    .name = TYPE_VIRTIO_MEM,
    .parent = TYPE_VIRTIO_DEVICE,
    .instance_size = sizeof(VirtIOMEM),
    .instance_init = virtio_mem_instance_init,
    .class_init = virtio_mem_class_init,
    .class_size = sizeof(VirtIOMEMClass),
    .interfaces = (InterfaceInfo[]) {
        { TYPE_RAM_DISCARD_MANAGER },
        { }
    },
};

static void virtio_register_types(void)
{
    type_register_static(&virtio_mem_info);
}

type_init(virtio_register_types)