config VFIO_USER_SERVER_ALLOWED
bool
imply VFIO_USER_SERVER
+
+config HV_BALLOON_POSSIBLE
+ bool
F: hw/usb/canokey.h
F: docs/system/devices/canokey.rst
+Hyper-V Dynamic Memory Protocol
+M: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
+S: Supported
+F: hw/hyperv/hv-balloon*.c
+F: hw/hyperv/hv-balloon*.h
+F: include/hw/hyperv/dynmem-proto.h
+F: include/hw/hyperv/hv-balloon.h
+
Subsystems
----------
Overall Audio backends
q->cq_phase = !q->cq_phase;
}
cid = le16_to_cpu(c->cid);
- if (cid == 0 || cid > NVME_QUEUE_SIZE) {
- warn_report("NVMe: Unexpected CID in completion queue: %"PRIu32", "
- "queue size: %u", cid, NVME_QUEUE_SIZE);
+ if (cid == 0 || cid > NVME_NUM_REQS) {
+ warn_report("NVMe: Unexpected CID in completion queue: %" PRIu32
+ ", should be within: 1..%u inclusively", cid,
+ NVME_NUM_REQS);
continue;
}
trace_nvme_complete_command(s, q->index, cid);
g_autofree uint64_t *l1_table = NULL;
BdrvDirtyBitmap *bitmap;
QemuUUID uuid;
- char uuidstr[UUID_FMT_LEN + 1];
+ char uuidstr[UUID_STR_LEN];
int i;
if (data_size < sizeof(bf)) {
static void vdi_header_print(VdiHeader *header)
{
- char uuidstr[37];
+ char uuidstr[UUID_STR_LEN];
QemuUUID uuid;
logout("text %s", header->text);
logout("signature 0x%08x\n", header->signature);
loads-stores
memory
modules
+ pci
qom-api
qdev-api
ui
--- /dev/null
+=============
+PCI subsystem
+=============
+
+API Reference
+-------------
+
+.. kernel-doc:: include/hw/pci/pci.h
``vexpress-a15``, and have IRQs from 40 upwards. If a dtb is
provided on the command line then QEMU will edit it to include
suitable entries describing these transports for the guest.
+- QEMU does not currently support either dynamic or static remapping
+ of the area of memory at address 0: it is always mapped to alias
+ the first flash bank.
Booting a Linux kernel
----------------------
return &s->pchip.iommu_as;
}
+static const PCIIOMMUOps typhoon_iommu_ops = {
+ .get_address_space = typhoon_pci_dma_iommu,
+};
+
static void typhoon_set_irq(void *opaque, int irq, int level)
{
TyphoonState *s = opaque;
"iommu-typhoon", UINT64_MAX);
address_space_init(&s->pchip.iommu_as, MEMORY_REGION(&s->pchip.iommu),
"pchip0-pci");
- pci_setup_iommu(b, typhoon_pci_dma_iommu, s);
+ pci_setup_iommu(b, &typhoon_iommu_ops, s);
/* Pchip0 PCI special/interrupt acknowledge, 0x801.F800.0000, 64MB. */
memory_region_init_io(&s->pchip.reg_iack, OBJECT(s), &alpha_pci_iack_ops,
return &sdev->as;
}
+static const PCIIOMMUOps smmu_ops = {
+ .get_address_space = smmu_find_add_as,
+};
+
IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid)
{
uint8_t bus_n, devfn;
s->smmu_pcibus_by_busptr = g_hash_table_new(NULL, NULL);
if (s->primary_bus) {
- pci_setup_iommu(s->primary_bus, smmu_find_add_as, s);
+ pci_setup_iommu(s->primary_bus, &smmu_ops, s);
} else {
error_setg(errp, "SMMU is not attached to any PCI bus!");
}
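Both hunks above apply the same mechanical conversion: the bare get_address_space callback is wrapped in a const PCIIOMMUOps structure, and pci_setup_iommu() now receives a pointer to that structure instead of the function itself. A minimal sketch of the pattern, using hypothetical device names:

    static AddressSpace *mydev_dma_iommu(PCIBus *bus, void *opaque, int devfn)
    {
        MyDevState *s = opaque;   /* hypothetical device state */

        return &s->iommu_as;
    }

    static const PCIIOMMUOps mydev_iommu_ops = {
        .get_address_space = mydev_dma_iommu,
    };

    /* was: pci_setup_iommu(bus, mydev_dma_iommu, s); */
    pci_setup_iommu(bus, &mydev_iommu_ops, s);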
MemoryRegion vram;
MemoryRegion sram;
MemoryRegion flashalias;
- MemoryRegion lowram;
MemoryRegion a15sram;
bool secure;
bool virt;
{
MachineState *machine = MACHINE(vms);
MemoryRegion *sysmem = get_system_memory();
- ram_addr_t low_ram_size;
if (ram_size > 0x40000000) {
/* 1GB is the maximum the address space permits */
exit(1);
}
- low_ram_size = ram_size;
- if (low_ram_size > 0x4000000) {
- low_ram_size = 0x4000000;
- }
- /* RAM is from 0x60000000 upwards. The bottom 64MB of the
+ /*
+ * RAM is from 0x60000000 upwards. The bottom 64MB of the
* address space should in theory be remappable to various
- * things including ROM or RAM; we always map the RAM there.
+ * things including ROM or RAM; we always map the flash there.
*/
- memory_region_init_alias(&vms->lowram, NULL, "vexpress.lowmem",
- machine->ram, 0, low_ram_size);
- memory_region_add_subregion(sysmem, 0x0, &vms->lowram);
memory_region_add_subregion(sysmem, 0x60000000, machine->ram);
/* 0x1e000000 A9MPCore (SCU) private memory region */
build_append_int_noprefix(table_data, 3, 1); /* ARM PL011 UART */
build_append_int_noprefix(table_data, 0, 3); /* Reserved */
/* Base Address */
- build_append_gas(table_data, AML_AS_SYSTEM_MEMORY, 8, 0, 1,
+ build_append_gas(table_data, AML_AS_SYSTEM_MEMORY, 32, 0, 3,
vms->memmap[VIRT_UART].base);
/* Interrupt Type */
build_append_int_noprefix(table_data,
build_append_int_noprefix(table_data, 34, 2);
/* BaseAddressRegister[] */
- build_append_gas(table_data, AML_AS_SYSTEM_MEMORY, 8, 0, 1,
+ build_append_gas(table_data, AML_AS_SYSTEM_MEMORY, 32, 0, 3,
vms->memmap[VIRT_UART].base);
/* AddressSize[] */
qemu_fdt_setprop(ms->fdt, "/pmu", "compatible",
compat, sizeof(compat));
qemu_fdt_setprop_cells(ms->fdt, "/pmu", "interrupts",
- GIC_FDT_IRQ_TYPE_PPI, VIRTUAL_PMU_IRQ, irqflags);
+ GIC_FDT_IRQ_TYPE_PPI,
+ INTID_TO_PPI(VIRTUAL_PMU_IRQ), irqflags);
}
}
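The PMU interrupt fix above hinges on the device tree binding encoding PPIs by index (0..15), while VIRTUAL_PMU_IRQ now holds the raw GIC INTID; a sketch of the assumed mapping:

    /* PPIs occupy INTIDs 16..31, so (assumed definition): */
    #define INTID_TO_PPI(irq) ((irq) - 16)
    /* e.g. a PMU INTID of 23 becomes DT interrupt cell 7 */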
return;
}
- if (xen_device_frontend_scanf(xendev, "protocol", "%ms",
- &str) != 1) {
- protocol = BLKIF_PROTOCOL_NATIVE;
+ if (xen_device_frontend_scanf(xendev, "protocol", "%ms", &str) != 1) {
+ /* x86 defaults to the 32-bit protocol even for 64-bit guests. */
+ if (object_dynamic_cast(OBJECT(qdev_get_machine()), "x86-machine")) {
+ protocol = BLKIF_PROTOCOL_X86_32;
+ } else {
+ protocol = BLKIF_PROTOCOL_NATIVE;
+ }
} else {
if (strcmp(str, XEN_IO_PROTO_ABI_X86_32) == 0) {
protocol = BLKIF_PROTOCOL_X86_32;
ssize_t gunzip(void *dst, size_t dstlen, uint8_t *src, size_t srclen)
{
- z_stream s;
+ z_stream s = {};
ssize_t dstbytes;
int r, i, flags;
MemoryDeviceInfo *value;
PCDIMMDeviceInfo *di;
SgxEPCDeviceInfo *se;
+ HvBalloonDeviceInfo *hi;
for (info = info_list; info; info = info->next) {
value = info->value;
monitor_printf(mon, " node: %" PRId64 "\n", se->node);
monitor_printf(mon, " memdev: %s\n", se->memdev);
break;
+ case MEMORY_DEVICE_INFO_KIND_HV_BALLOON:
+ hi = value->u.hv_balloon.data;
+ monitor_printf(mon, "Memory device [%s]: \"%s\"\n",
+ MemoryDeviceInfoKind_str(value->type),
+ hi->id ? hi->id : "");
+ if (hi->has_memaddr) {
+ monitor_printf(mon, " memaddr: 0x%" PRIx64 "\n",
+ hi->memaddr);
+ }
+ monitor_printf(mon, " max-size: %" PRIu64 "\n", hi->max_size);
+ if (hi->memdev) {
+ monitor_printf(mon, " memdev: %s\n", hi->memdev);
+ }
+ break;
default:
g_assert_not_reached();
}
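For reference, the new MEMORY_DEVICE_INFO_KIND_HV_BALLOON branch produces output shaped like the following (device id and values are illustrative):

    Memory device [hv-balloon]: "hvb0"
      memaddr: 0x100000000
      max-size: 34359738368
      memdev: /objects/mem0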
int rc;
rc = snprintf(buffer, sizeof(buffer), "0x%"PRIx64":0x%"PRIx64":%u",
- rr->low, rr->high, rr->type);
+ range_lob(&rr->range), range_upb(&rr->range), rr->type);
assert(rc < sizeof(buffer));
visit_type_str(v, name, &p, errp);
Property *prop = opaque;
ReservedRegion *rr = object_field_prop_ptr(obj, prop);
const char *endptr;
+ uint64_t lob, upb;
char *str;
int ret;
return;
}
- ret = qemu_strtou64(str, &endptr, 16, &rr->low);
+ ret = qemu_strtou64(str, &endptr, 16, &lob);
if (ret) {
error_setg(errp, "start address of '%s'"
" must be a hexadecimal integer", name);
goto separator_error;
}
- ret = qemu_strtou64(endptr + 1, &endptr, 16, &rr->high);
+ ret = qemu_strtou64(endptr + 1, &endptr, 16, &upb);
if (ret) {
error_setg(errp, "end address of '%s'"
" must be a hexadecimal integer", name);
goto separator_error;
}
+ range_set_bounds(&rr->range, lob, upb);
+
ret = qemu_strtoui(endptr + 1, &endptr, 10, &rr->type);
if (ret) {
error_setg(errp, "type of '%s'"
{
Property *prop = opaque;
QemuUUID *uuid = object_field_prop_ptr(obj, prop);
- char buffer[UUID_FMT_LEN + 1];
+ char buffer[UUID_STR_LEN];
char *p = buffer;
qemu_uuid_unparse(uuid, buffer);
case DAC_CNTL:
val = s->regs.dac_cntl;
break;
- case GPIO_VGA_DDC:
- val = s->regs.gpio_vga_ddc;
+ case GPIO_VGA_DDC ... GPIO_VGA_DDC + 3:
+ val = ati_reg_read_offs(s->regs.gpio_vga_ddc,
+ addr - GPIO_VGA_DDC, size);
break;
- case GPIO_DVI_DDC:
- val = s->regs.gpio_dvi_ddc;
+ case GPIO_DVI_DDC ... GPIO_DVI_DDC + 3:
+ val = ati_reg_read_offs(s->regs.gpio_dvi_ddc,
+ addr - GPIO_DVI_DDC, size);
break;
case GPIO_MONID ... GPIO_MONID + 3:
val = ati_reg_read_offs(s->regs.gpio_monid,
case PALETTE_DATA:
val = vga_ioport_read(&s->vga, VGA_PEL_D);
break;
+ case PALETTE_30_DATA:
+ val = s->regs.palette[vga_ioport_read(&s->vga, VGA_PEL_IR)];
+ break;
case CNFG_CNTL:
val = s->regs.config_cntl;
break;
PCI_BASE_ADDRESS_0, size) & 0xfffffff0;
break;
case CONFIG_APER_SIZE:
- val = s->vga.vram_size;
+ val = s->vga.vram_size / 2;
break;
case CONFIG_REG_1_BASE:
val = pci_default_read_config(&s->dev,
PCI_BASE_ADDRESS_2, size) & 0xfffffff0;
break;
case CONFIG_REG_APER_SIZE:
- val = memory_region_size(&s->mm);
+ val = memory_region_size(&s->mm) / 2;
+ break;
+ case HOST_PATH_CNTL:
+ val = BIT(23); /* Radeon HDP_APER_CNTL */
break;
case MC_STATUS:
val = 5;
s->regs.dac_cntl = data & 0xffffe3ff;
s->vga.dac_8bit = !!(data & DAC_8BIT_EN);
break;
- case GPIO_VGA_DDC:
+ /*
+ * GPIO regs for DDC access. Because some drivers access these via
+ * multiple byte writes, we have to be careful when we send bits to
+ * avoid spurious changes in bitbang_i2c state. Only do it when either
+ * the enable bits are changed or the output bits change while enabled.
+ */
+ case GPIO_VGA_DDC ... GPIO_VGA_DDC + 3:
if (s->dev_id != PCI_DEVICE_ID_ATI_RAGE128_PF) {
/* FIXME: Maybe add a property to select VGA or DVI port? */
}
break;
- case GPIO_DVI_DDC:
+ case GPIO_DVI_DDC ... GPIO_DVI_DDC + 3:
if (s->dev_id != PCI_DEVICE_ID_ATI_RAGE128_PF) {
- s->regs.gpio_dvi_ddc = ati_i2c(&s->bbi2c, data, 0);
+ ati_reg_write_offs(&s->regs.gpio_dvi_ddc,
+ addr - GPIO_DVI_DDC, data, size);
+ if ((addr <= GPIO_DVI_DDC + 2 && addr + size > GPIO_DVI_DDC + 2) ||
+ (addr == GPIO_DVI_DDC && (s->regs.gpio_dvi_ddc & 0x30000))) {
+ s->regs.gpio_dvi_ddc = ati_i2c(&s->bbi2c,
+ s->regs.gpio_dvi_ddc, 0);
+ }
}
break;
case GPIO_MONID ... GPIO_MONID + 3:
/* FIXME What does Radeon have here? */
if (s->dev_id == PCI_DEVICE_ID_ATI_RAGE128_PF) {
+ /* Rage128p accesses DDC via MONID(1-2) with additional mask bit */
ati_reg_write_offs(&s->regs.gpio_monid,
addr - GPIO_MONID, data, size);
- /*
- * Rage128p accesses DDC used to get EDID via these bits.
- * Because some drivers access this via multiple byte writes
- * we have to be careful when we send bits to avoid spurious
- * changes in bitbang_i2c state. So only do it when mask is set
- * and either the enable bits are changed or output bits changed
- * while enabled.
- */
if ((s->regs.gpio_monid & BIT(25)) &&
((addr <= GPIO_MONID + 2 && addr + size > GPIO_MONID + 2) ||
(addr == GPIO_MONID && (s->regs.gpio_monid & 0x60000)))) {
data >>= 8;
vga_ioport_write(&s->vga, VGA_PEL_D, data & 0xff);
break;
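+ /*
+ * 30-bit palette entry: three 10-bit components (presumably R, G, B,
+ * matching the DAC write order) at bits 29..20, 19..10 and 9..0, as
+ * implied by the shifts below. The full word is stored for read-back;
+ * only the top 8 bits of each component reach the 8-bit VGA DAC.
+ */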
+ case PALETTE_30_DATA:
+ s->regs.palette[vga_ioport_read(&s->vga, VGA_PEL_IW)] = data;
+ vga_ioport_write(&s->vga, VGA_PEL_D, (data >> 22) & 0xff);
+ vga_ioport_write(&s->vga, VGA_PEL_D, (data >> 12) & 0xff);
+ vga_ioport_write(&s->vga, VGA_PEL_D, (data >> 2) & 0xff);
+ break;
case CNFG_CNTL:
s->regs.config_cntl = data;
break;
DEFINE_PROP_UINT16("x-device-id", ATIVGAState, dev_id,
PCI_DEVICE_ID_ATI_RAGE128_PF),
DEFINE_PROP_BOOL("guest_hwcursor", ATIVGAState, cursor_guest_mode, false),
+ DEFINE_PROP_UINT8("x-pixman", ATIVGAState, use_pixman, 3),
DEFINE_PROP_END_OF_LIST()
};
k->exit = ati_vga_exit;
}
+static void ati_vga_init(Object *o)
+{
+ object_property_set_description(o, "x-pixman", "Use pixman for: "
+ "1: fill, 2: blit");
+}
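+
+/*
+ * "x-pixman" is a bitmask: bit 0 selects pixman for fills, bit 1 for
+ * blits (the default of 3 enables both). E.g. -device ati-vga,x-pixman=0
+ * forces the new software fallback paths for both operations.
+ */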
+
static const TypeInfo ati_vga_info = {
.name = TYPE_ATI_VGA,
.parent = TYPE_PCI_DEVICE,
.instance_size = sizeof(ATIVGAState),
.class_init = ati_vga_class_init,
+ .instance_init = ati_vga_init,
.interfaces = (InterfaceInfo[]) {
{ INTERFACE_CONVENTIONAL_PCI_DEVICE },
{ },
switch (s->regs.dp_mix & GMC_ROP3_MASK) {
case ROP3_SRCCOPY:
{
+ bool fallback = false;
unsigned src_x = (s->regs.dp_cntl & DST_X_LEFT_TO_RIGHT ?
s->regs.src_x : s->regs.src_x + 1 - s->regs.dst_width);
unsigned src_y = (s->regs.dp_cntl & DST_Y_TOP_TO_BOTTOM ?
src_bits, dst_bits, src_stride, dst_stride, bpp, bpp,
src_x, src_y, dst_x, dst_y,
s->regs.dst_width, s->regs.dst_height);
- if (s->regs.dp_cntl & DST_X_LEFT_TO_RIGHT &&
+ if ((s->use_pixman & BIT(1)) &&
+ s->regs.dp_cntl & DST_X_LEFT_TO_RIGHT &&
s->regs.dp_cntl & DST_Y_TOP_TO_BOTTOM) {
- pixman_blt((uint32_t *)src_bits, (uint32_t *)dst_bits,
- src_stride, dst_stride, bpp, bpp,
- src_x, src_y, dst_x, dst_y,
- s->regs.dst_width, s->regs.dst_height);
- } else {
+ fallback = !pixman_blt((uint32_t *)src_bits, (uint32_t *)dst_bits,
+ src_stride, dst_stride, bpp, bpp,
+ src_x, src_y, dst_x, dst_y,
+ s->regs.dst_width, s->regs.dst_height);
+ } else if (s->use_pixman & BIT(1)) {
/* FIXME: We only really need a temporary if src and dst overlap */
int llb = s->regs.dst_width * (bpp / 8);
int tmp_stride = DIV_ROUND_UP(llb, sizeof(uint32_t));
uint32_t *tmp = g_malloc(tmp_stride * sizeof(uint32_t) *
s->regs.dst_height);
- pixman_blt((uint32_t *)src_bits, tmp,
- src_stride, tmp_stride, bpp, bpp,
- src_x, src_y, 0, 0,
- s->regs.dst_width, s->regs.dst_height);
- pixman_blt(tmp, (uint32_t *)dst_bits,
- tmp_stride, dst_stride, bpp, bpp,
- 0, 0, dst_x, dst_y,
- s->regs.dst_width, s->regs.dst_height);
+ fallback = !pixman_blt((uint32_t *)src_bits, tmp,
+ src_stride, tmp_stride, bpp, bpp,
+ src_x, src_y, 0, 0,
+ s->regs.dst_width, s->regs.dst_height);
+ if (!fallback) {
+ fallback = !pixman_blt(tmp, (uint32_t *)dst_bits,
+ tmp_stride, dst_stride, bpp, bpp,
+ 0, 0, dst_x, dst_y,
+ s->regs.dst_width, s->regs.dst_height);
+ }
g_free(tmp);
+ } else {
+ fallback = true;
+ }
+ if (fallback) {
+ unsigned int y, i, j, bypp = bpp / 8;
+ unsigned int src_pitch = src_stride * sizeof(uint32_t);
+ unsigned int dst_pitch = dst_stride * sizeof(uint32_t);
+
+ for (y = 0; y < s->regs.dst_height; y++) {
+ i = dst_x * bypp;
+ j = src_x * bypp;
+ if (s->regs.dp_cntl & DST_Y_TOP_TO_BOTTOM) {
+ i += (dst_y + y) * dst_pitch;
+ j += (src_y + y) * src_pitch;
+ } else {
+ i += (dst_y + s->regs.dst_height - 1 - y) * dst_pitch;
+ j += (src_y + s->regs.dst_height - 1 - y) * src_pitch;
+ }
+ memmove(&dst_bits[i], &src_bits[j], s->regs.dst_width * bypp);
+ }
}
if (dst_bits >= s->vga.vram_ptr + s->vga.vbe_start_addr &&
dst_bits < s->vga.vram_ptr + s->vga.vbe_start_addr +
dst_stride /= sizeof(uint32_t);
DPRINTF("pixman_fill(%p, %d, %d, %d, %d, %d, %d, %x)\n",
- dst_bits, dst_stride, bpp,
- dst_x, dst_y,
- s->regs.dst_width, s->regs.dst_height,
- filler);
- pixman_fill((uint32_t *)dst_bits, dst_stride, bpp,
- dst_x, dst_y,
- s->regs.dst_width, s->regs.dst_height,
- filler);
+ dst_bits, dst_stride, bpp, dst_x, dst_y,
+ s->regs.dst_width, s->regs.dst_height, filler);
+ if (!(s->use_pixman & BIT(0)) ||
+ !pixman_fill((uint32_t *)dst_bits, dst_stride, bpp, dst_x, dst_y,
+ s->regs.dst_width, s->regs.dst_height, filler)) {
+ /* fallback when pixman failed or we don't want to call it */
+ unsigned int x, y, i, bypp = bpp / 8;
+ unsigned int dst_pitch = dst_stride * sizeof(uint32_t);
+ for (y = 0; y < s->regs.dst_height; y++) {
+ i = dst_x * bypp + (dst_y + y) * dst_pitch;
+ for (x = 0; x < s->regs.dst_width; x++, i += bypp) {
+ stn_he_p(&dst_bits[i], bypp, filler);
+ }
+ }
+ }
if (dst_bits >= s->vga.vram_ptr + s->vga.vbe_start_addr &&
dst_bits < s->vga.vram_ptr + s->vga.vbe_start_addr +
s->vga.vbe_regs[VBE_DISPI_INDEX_YRES] * s->vga.vbe_line_offset) {
{"AMCGPIO_EN_MIR", 0x00a8},
{"PALETTE_INDEX", 0x00b0},
{"PALETTE_DATA", 0x00b4},
+ {"PALETTE_30_DATA", 0x00b8},
{"CNFG_CNTL", 0x00e0},
{"GEN_RESET_CNTL", 0x00f0},
{"CNFG_MEMSIZE", 0x00f8},
{"CONFIG_APER_SIZE", 0x0108},
{"CONFIG_REG_1_BASE", 0x010c},
{"CONFIG_REG_APER_SIZE", 0x0110},
+ {"HOST_PATH_CNTL", 0x0130},
{"MEM_CNTL", 0x0140},
{"MC_FB_LOCATION", 0x0148},
{"MC_AGP_LOCATION", 0x014C},
uint32_t gpio_dvi_ddc;
uint32_t gpio_monid;
uint32_t config_cntl;
+ uint32_t palette[256];
uint32_t crtc_h_total_disp;
uint32_t crtc_h_sync_strt_wid;
uint32_t crtc_v_total_disp;
char *model;
uint16_t dev_id;
uint8_t mode;
+ uint8_t use_pixman;
bool cursor_guest_mode;
uint16_t cursor_size;
uint32_t cursor_offset;
#define AMCGPIO_EN_MIR 0x00a8
#define PALETTE_INDEX 0x00b0
#define PALETTE_DATA 0x00b4
+#define PALETTE_30_DATA 0x00b8
#define CNFG_CNTL 0x00e0
#define GEN_RESET_CNTL 0x00f0
#define CNFG_MEMSIZE 0x00f8
#define CONFIG_APER_SIZE 0x0108
#define CONFIG_REG_1_BASE 0x010c
#define CONFIG_REG_APER_SIZE 0x0110
+#define HOST_PATH_CNTL 0x0130
#define MEM_CNTL 0x0140
#define MC_FB_LOCATION 0x0148
#define MC_AGP_LOCATION 0x014C
#define DAFB_INTR_MASK 0x104
#define DAFB_INTR_STAT 0x108
#define DAFB_INTR_CLEAR 0x10c
-#define DAFB_RESET 0x200
-#define DAFB_LUT 0x213
+#define DAFB_LUT_INDEX 0x200
+#define DAFB_LUT 0x210
#define DAFB_INTR_VBL 0x4
case DAFB_MODE_SENSE:
val = macfb_sense_read(s);
break;
+ case DAFB_LUT ... DAFB_LUT + 3:
+ val = s->color_palette[s->palette_current];
+ s->palette_current = (s->palette_current + 1) %
+ ARRAY_SIZE(s->color_palette);
+ break;
default:
if (addr < MACFB_CTRL_TOPADDR) {
val = s->regs[addr >> 2];
s->regs[DAFB_INTR_STAT >> 2] &= ~DAFB_INTR_VBL;
macfb_update_irq(s);
break;
- case DAFB_RESET:
- s->palette_current = 0;
- s->regs[DAFB_INTR_STAT >> 2] &= ~DAFB_INTR_VBL;
- macfb_update_irq(s);
+ case DAFB_LUT_INDEX:
+ s->palette_current = (val & 0xff) * 3;
break;
- case DAFB_LUT:
- s->color_palette[s->palette_current] = val;
+ case DAFB_LUT ... DAFB_LUT + 3:
+ s->color_palette[s->palette_current] = val & 0xff;
s->palette_current = (s->palette_current + 1) %
ARRAY_SIZE(s->color_palette);
if (s->palette_current % 3) {
.instance_init = virtio_gpu_rutabaga_initfn,
.interfaces = (InterfaceInfo[]) {
{ INTERFACE_CONVENTIONAL_PCI_DEVICE },
+ { },
}
},
};
assert(QTAILQ_EMPTY(&g->cmdq));
QTAILQ_FOREACH(res, &g->reslist, next) {
+ if (res->blob_size) {
+ continue;
+ }
qemu_put_be32(f, res->resource_id);
qemu_put_be32(f, res->width);
qemu_put_be32(f, res->height);
return vmstate_save_state(f, &vmstate_virtio_gpu_scanouts, g, NULL);
}
+static bool virtio_gpu_load_restore_mapping(VirtIOGPU *g,
+ struct virtio_gpu_simple_resource *res)
+{
+ int i;
+
+ for (i = 0; i < res->iov_cnt; i++) {
+ hwaddr len = res->iov[i].iov_len;
+ res->iov[i].iov_base =
+ dma_memory_map(VIRTIO_DEVICE(g)->dma_as, res->addrs[i], &len,
+ DMA_DIRECTION_TO_DEVICE, MEMTXATTRS_UNSPECIFIED);
+
+ if (!res->iov[i].iov_base || len != res->iov[i].iov_len) {
+ /* Clean up the half-a-mapping we just created... */
+ if (res->iov[i].iov_base) {
+ dma_memory_unmap(VIRTIO_DEVICE(g)->dma_as, res->iov[i].iov_base,
+ len, DMA_DIRECTION_TO_DEVICE, 0);
+ }
+ /* ...and the mappings for previous loop iterations */
+ res->iov_cnt = i;
+ virtio_gpu_cleanup_mapping(g, res);
+ return false;
+ }
+ }
+
+ QTAILQ_INSERT_HEAD(&g->reslist, res, next);
+ g->hostmem += res->hostmem;
+ return true;
+}
+
static int virtio_gpu_load(QEMUFile *f, void *opaque, size_t size,
const VMStateField *field)
{
VirtIOGPU *g = opaque;
struct virtio_gpu_simple_resource *res;
- struct virtio_gpu_scanout *scanout;
uint32_t resource_id, pformat;
void *bits = NULL;
int i;
qemu_get_buffer(f, (void *)pixman_image_get_data(res->image),
pixman_image_get_stride(res->image) * res->height);
- /* restore mapping */
- for (i = 0; i < res->iov_cnt; i++) {
- hwaddr len = res->iov[i].iov_len;
- res->iov[i].iov_base =
- dma_memory_map(VIRTIO_DEVICE(g)->dma_as, res->addrs[i], &len,
- DMA_DIRECTION_TO_DEVICE,
- MEMTXATTRS_UNSPECIFIED);
-
- if (!res->iov[i].iov_base || len != res->iov[i].iov_len) {
- /* Clean up the half-a-mapping we just created... */
- if (res->iov[i].iov_base) {
- dma_memory_unmap(VIRTIO_DEVICE(g)->dma_as,
- res->iov[i].iov_base,
- len,
- DMA_DIRECTION_TO_DEVICE,
- 0);
- }
- /* ...and the mappings for previous loop iterations */
- res->iov_cnt = i;
- virtio_gpu_cleanup_mapping(g, res);
- pixman_image_unref(res->image);
- g_free(res);
- return -EINVAL;
- }
+ if (!virtio_gpu_load_restore_mapping(g, res)) {
+ pixman_image_unref(res->image);
+ g_free(res);
+ return -EINVAL;
}
- QTAILQ_INSERT_HEAD(&g->reslist, res, next);
- g->hostmem += res->hostmem;
-
resource_id = qemu_get_be32(f);
}
/* load & apply scanout state */
vmstate_load_state(f, &vmstate_virtio_gpu_scanouts, g, 1);
+
+ return 0;
+}
+
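+/*
+ * Blob resource stream framing, as implemented by virtio_gpu_blob_save()
+ * below and consumed by virtio_gpu_blob_load():
+ *
+ *   be32 resource_id (a 0 terminates the list)
+ *   be32 blob_size
+ *   be32 iov_cnt
+ *   iov_cnt x { be64 addr, be32 iov_len }
+ *
+ * The guest mappings themselves are rebuilt on load via
+ * virtio_gpu_load_restore_mapping().
+ */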
+static int virtio_gpu_blob_save(QEMUFile *f, void *opaque, size_t size,
+ const VMStateField *field, JSONWriter *vmdesc)
+{
+ VirtIOGPU *g = opaque;
+ struct virtio_gpu_simple_resource *res;
+ int i;
+
+ /* in 2d mode we should never find unprocessed commands here */
+ assert(QTAILQ_EMPTY(&g->cmdq));
+
+ QTAILQ_FOREACH(res, &g->reslist, next) {
+ if (!res->blob_size) {
+ continue;
+ }
+ qemu_put_be32(f, res->resource_id);
+ qemu_put_be32(f, res->blob_size);
+ qemu_put_be32(f, res->iov_cnt);
+ for (i = 0; i < res->iov_cnt; i++) {
+ qemu_put_be64(f, res->addrs[i]);
+ qemu_put_be32(f, res->iov[i].iov_len);
+ }
+ }
+ qemu_put_be32(f, 0); /* end of list */
+
+ return 0;
+}
+
+static int virtio_gpu_blob_load(QEMUFile *f, void *opaque, size_t size,
+ const VMStateField *field)
+{
+ VirtIOGPU *g = opaque;
+ struct virtio_gpu_simple_resource *res;
+ uint32_t resource_id;
+ int i;
+
+ resource_id = qemu_get_be32(f);
+ while (resource_id != 0) {
+ res = virtio_gpu_find_resource(g, resource_id);
+ if (res) {
+ return -EINVAL;
+ }
+
+ res = g_new0(struct virtio_gpu_simple_resource, 1);
+ res->resource_id = resource_id;
+ res->blob_size = qemu_get_be32(f);
+ res->iov_cnt = qemu_get_be32(f);
+ res->addrs = g_new(uint64_t, res->iov_cnt);
+ res->iov = g_new(struct iovec, res->iov_cnt);
+
+ /* read data */
+ for (i = 0; i < res->iov_cnt; i++) {
+ res->addrs[i] = qemu_get_be64(f);
+ res->iov[i].iov_len = qemu_get_be32(f);
+ }
+
+ if (!virtio_gpu_load_restore_mapping(g, res)) {
+ g_free(res);
+ return -EINVAL;
+ }
+
+ virtio_gpu_init_udmabuf(res);
+
+ resource_id = qemu_get_be32(f);
+ }
+
+ return 0;
+}
+
+static int virtio_gpu_post_load(void *opaque, int version_id)
+{
+ VirtIOGPU *g = opaque;
+ struct virtio_gpu_scanout *scanout;
+ struct virtio_gpu_simple_resource *res;
+ int i;
+
for (i = 0; i < g->parent_obj.conf.max_outputs; i++) {
/* FIXME: should take scanout.r.{x,y} into account */
scanout = &g->parent_obj.scanout[i];
}
}
+static bool virtio_gpu_blob_state_needed(void *opaque)
+{
+ VirtIOGPU *g = VIRTIO_GPU(opaque);
+
+ return virtio_gpu_blob_enabled(g->parent_obj.conf);
+}
+
+const VMStateDescription vmstate_virtio_gpu_blob_state = {
+ .name = "virtio-gpu/blob",
+ .minimum_version_id = VIRTIO_GPU_VM_VERSION,
+ .version_id = VIRTIO_GPU_VM_VERSION,
+ .needed = virtio_gpu_blob_state_needed,
+ .fields = (const VMStateField[]){
+ {
+ .name = "virtio-gpu/blob",
+ .info = &(const VMStateInfo) {
+ .name = "blob",
+ .get = virtio_gpu_blob_load,
+ .put = virtio_gpu_blob_save,
+ },
+ .flags = VMS_SINGLE,
+ } /* device */,
+ VMSTATE_END_OF_LIST()
+ },
+};
+
/*
* For historical reasons virtio_gpu does not adhere to virtio migration
* scheme as described in doc/virtio-migration.txt, in a sense that no
} /* device */,
VMSTATE_END_OF_LIST()
},
+ .subsections = (const VMStateDescription * []) {
+ &vmstate_virtio_gpu_blob_state,
+ NULL
+ },
+ .post_load = virtio_gpu_post_load,
};
static Property virtio_gpu_properties[] = {
bool
default y
depends on VMBUS
+
+config HV_BALLOON_SUPPORTED
+ bool
+
+config HV_BALLOON
+ bool
+ default y
+ depends on VMBUS
+ depends on HV_BALLOON_POSSIBLE
+ depends on HV_BALLOON_SUPPORTED
--- /dev/null
+/*
+ * QEMU Hyper-V Dynamic Memory Protocol driver
+ *
+ * Copyright (C) 2020-2023 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef HW_HYPERV_HV_BALLOON_INTERNAL_H
+#define HW_HYPERV_HV_BALLOON_INTERNAL_H
+
+#include "qemu/osdep.h"
+
+#define HV_BALLOON_PFN_SHIFT 12
+#define HV_BALLOON_PAGE_SIZE (1 << HV_BALLOON_PFN_SHIFT)
+
+#define SUM_OVERFLOW_U64(in1, in2) ((in1) > UINT64_MAX - (in2))
+#define SUM_SATURATE_U64(in1, in2) \
+ ({ \
+ uint64_t _in1 = (in1), _in2 = (in2); \
+ uint64_t _result; \
+ \
+ if (!SUM_OVERFLOW_U64(_in1, _in2)) { \
+ _result = _in1 + _in2; \
+ } else { \
+ _result = UINT64_MAX; \
+ } \
+ \
+ _result; \
+ })
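+
+/*
+ * For example:
+ *   SUM_SATURATE_U64(10, 20)            == 30
+ *   SUM_SATURATE_U64(UINT64_MAX - 1, 5) == UINT64_MAX (clamped)
+ */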
+
+#endif
--- /dev/null
+/*
+ * QEMU Hyper-V Dynamic Memory Protocol driver
+ *
+ * Copyright (C) 2020-2023 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "hv-balloon-internal.h"
+#include "hv-balloon-our_range_memslots.h"
+#include "trace.h"
+
+/* OurRange */
+static void our_range_init(OurRange *our_range, uint64_t start, uint64_t count)
+{
+ assert(count <= UINT64_MAX - start);
+ our_range->range.start = start;
+ our_range->range.count = count;
+
+ hvb_page_range_tree_init(&our_range->removed_guest);
+ hvb_page_range_tree_init(&our_range->removed_both);
+
+ /* mark the whole range as not yet added but available for potential use */
+ our_range->added = 0;
+ our_range->unusable_tail = 0;
+}
+
+static void our_range_destroy(OurRange *our_range)
+{
+ hvb_page_range_tree_destroy(&our_range->removed_guest);
+ hvb_page_range_tree_destroy(&our_range->removed_both);
+}
+
+void hvb_our_range_clear_removed_trees(OurRange *our_range)
+{
+ hvb_page_range_tree_destroy(&our_range->removed_guest);
+ hvb_page_range_tree_destroy(&our_range->removed_both);
+ hvb_page_range_tree_init(&our_range->removed_guest);
+ hvb_page_range_tree_init(&our_range->removed_both);
+}
+
+void hvb_our_range_mark_added(OurRange *our_range, uint64_t additional_size)
+{
+ assert(additional_size <= UINT64_MAX - our_range->added);
+
+ our_range->added += additional_size;
+
+ assert(our_range->added <= UINT64_MAX - our_range->unusable_tail);
+ assert(our_range->added + our_range->unusable_tail <=
+ our_range->range.count);
+}
+
+/* OurRangeMemslots */
+static void our_range_memslots_init_slots(OurRangeMemslots *our_range,
+ MemoryRegion *backing_mr,
+ Object *memslot_owner)
+{
+ OurRangeMemslotsSlots *memslots = &our_range->slots;
+ unsigned int idx;
+ uint64_t memslot_offset;
+
+ assert(memslots->count > 0);
+ memslots->slots = g_new0(MemoryRegion, memslots->count);
+
+ /* Initialize our memslots, but don't map them yet. */
+ assert(memslots->size_each > 0);
+ for (idx = 0, memslot_offset = 0; idx < memslots->count;
+ idx++, memslot_offset += memslots->size_each) {
+ uint64_t memslot_size;
+ g_autofree char *name = NULL;
+
+ /* The size of the last memslot might be smaller. */
+ if (idx == memslots->count - 1) {
+ uint64_t region_size;
+
+ assert(our_range->mr);
+ region_size = memory_region_size(our_range->mr);
+ memslot_size = region_size - memslot_offset;
+ } else {
+ memslot_size = memslots->size_each;
+ }
+
+ name = g_strdup_printf("memslot-%u", idx);
+ memory_region_init_alias(&memslots->slots[idx], memslot_owner, name,
+ backing_mr, memslot_offset, memslot_size);
+ /*
+ * We want to be able to atomically and efficiently activate/deactivate
+ * individual memslots without affecting adjacent memslots in memory
+ * notifiers.
+ */
+ memory_region_set_unmergeable(&memslots->slots[idx], true);
+ }
+
+ memslots->mapped_count = 0;
+}
+
+OurRangeMemslots *hvb_our_range_memslots_new(uint64_t addr,
+ MemoryRegion *parent_mr,
+ MemoryRegion *backing_mr,
+ Object *memslot_owner,
+ unsigned int memslot_count,
+ uint64_t memslot_size)
+{
+ OurRangeMemslots *our_range;
+
+ our_range = g_malloc(sizeof(*our_range));
+ our_range_init(&our_range->range,
+ addr / HV_BALLOON_PAGE_SIZE,
+ memory_region_size(parent_mr) / HV_BALLOON_PAGE_SIZE);
+ our_range->slots.size_each = memslot_size;
+ our_range->slots.count = memslot_count;
+ our_range->mr = parent_mr;
+ our_range_memslots_init_slots(our_range, backing_mr, memslot_owner);
+
+ return our_range;
+}
+
+static void our_range_memslots_free_memslots(OurRangeMemslots *our_range)
+{
+ OurRangeMemslotsSlots *memslots = &our_range->slots;
+ unsigned int idx;
+ uint64_t offset;
+
+ memory_region_transaction_begin();
+ for (idx = 0, offset = 0; idx < memslots->mapped_count;
+ idx++, offset += memslots->size_each) {
+ trace_hv_balloon_unmap_slot(idx, memslots->count, offset);
+ assert(memory_region_is_mapped(&memslots->slots[idx]));
+ memory_region_del_subregion(our_range->mr, &memslots->slots[idx]);
+ }
+ memory_region_transaction_commit();
+
+ for (idx = 0; idx < memslots->count; idx++) {
+ object_unparent(OBJECT(&memslots->slots[idx]));
+ }
+
+ g_clear_pointer(&our_range->slots.slots, g_free);
+}
+
+void hvb_our_range_memslots_free(OurRangeMemslots *our_range)
+{
+ OurRangeMemslotsSlots *memslots = &our_range->slots;
+ MemoryRegion *hostmem_mr;
+ RAMBlock *rb;
+
+ assert(our_range->slots.count > 0);
+ assert(our_range->slots.slots);
+
+ hostmem_mr = memslots->slots[0].alias;
+ rb = hostmem_mr->ram_block;
+ ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb));
+
+ our_range_memslots_free_memslots(our_range);
+ our_range_destroy(&our_range->range);
+ g_free(our_range);
+}
+
+void hvb_our_range_memslots_ensure_mapped_additional(OurRangeMemslots *our_range,
+ uint64_t additional_map_size)
+{
+ OurRangeMemslotsSlots *memslots = &our_range->slots;
+ uint64_t total_map_size;
+ unsigned int idx;
+ uint64_t offset;
+
+ total_map_size = (our_range->range.added + additional_map_size) *
+ HV_BALLOON_PAGE_SIZE;
+ idx = memslots->mapped_count;
+ assert(memslots->size_each > 0);
+ offset = idx * memslots->size_each;
+
+ /*
+ * Activate all memslots covered by the newly added region in a single
+ * transaction.
+ */
+ memory_region_transaction_begin();
+ for ( ; idx < memslots->count;
+ idx++, offset += memslots->size_each) {
+ /*
+ * If this memslot starts at or beyond the end of the range to map,
+ * so does every following one.
+ */
+ if (offset >= total_map_size) {
+ break;
+ }
+
+ /*
+ * Instead of enabling/disabling memslots, we add/remove them. This
+ * should make address space updates faster, because we don't have to
+ * loop over many disabled subregions.
+ */
+ trace_hv_balloon_map_slot(idx, memslots->count, offset);
+ assert(!memory_region_is_mapped(&memslots->slots[idx]));
+ memory_region_add_subregion(our_range->mr, offset,
+ &memslots->slots[idx]);
+
+ memslots->mapped_count++;
+ }
+ memory_region_transaction_commit();
+}
--- /dev/null
+/*
+ * QEMU Hyper-V Dynamic Memory Protocol driver
+ *
+ * Copyright (C) 2020-2023 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef HW_HYPERV_HV_BALLOON_OUR_RANGE_MEMSLOTS_H
+#define HW_HYPERV_HV_BALLOON_OUR_RANGE_MEMSLOTS_H
+
+#include "qemu/osdep.h"
+
+#include "exec/memory.h"
+#include "qom/object.h"
+#include "hv-balloon-page_range_tree.h"
+
+/* OurRange */
+#define OUR_RANGE(ptr) ((OurRange *)(ptr))
+
+/* "our range" means the memory range owned by this driver (for hot-adding) */
+typedef struct OurRange {
+ PageRange range;
+
+ /* How many pages were hot-added to the guest */
+ uint64_t added;
+
+ /* Pages at the end not currently usable */
+ uint64_t unusable_tail;
+
+ /* Memory removed from the guest */
+ PageRangeTree removed_guest, removed_both;
+} OurRange;
+
+static inline uint64_t our_range_get_remaining_start(OurRange *our_range)
+{
+ return our_range->range.start + our_range->added;
+}
+
+static inline uint64_t our_range_get_remaining_size(OurRange *our_range)
+{
+ return our_range->range.count - our_range->added - our_range->unusable_tail;
+}
+
+void hvb_our_range_mark_added(OurRange *our_range, uint64_t additional_size);
+
+static inline void our_range_mark_remaining_unusable(OurRange *our_range)
+{
+ our_range->unusable_tail = our_range->range.count - our_range->added;
+}
+
+static inline PageRangeTree our_range_get_removed_tree(OurRange *our_range,
+ bool both)
+{
+ if (both) {
+ return our_range->removed_both;
+ } else {
+ return our_range->removed_guest;
+ }
+}
+
+static inline bool our_range_is_removed_tree_empty(OurRange *our_range,
+ bool both)
+{
+ if (both) {
+ return page_range_tree_is_empty(our_range->removed_both);
+ } else {
+ return page_range_tree_is_empty(our_range->removed_guest);
+ }
+}
+
+void hvb_our_range_clear_removed_trees(OurRange *our_range);
+
+/* OurRangeMemslots */
+typedef struct OurRangeMemslotsSlots {
+ /* Nominal size of each memslot (the last one might be smaller) */
+ uint64_t size_each;
+
+ /* Slots array and its element count */
+ MemoryRegion *slots;
+ unsigned int count;
+
+ /* How many slots are currently mapped */
+ unsigned int mapped_count;
+} OurRangeMemslotsSlots;
+
+typedef struct OurRangeMemslots {
+ OurRange range;
+
+ /* Memslots covering our range */
+ OurRangeMemslotsSlots slots;
+
+ MemoryRegion *mr;
+} OurRangeMemslots;
+
+OurRangeMemslots *hvb_our_range_memslots_new(uint64_t addr,
+ MemoryRegion *parent_mr,
+ MemoryRegion *backing_mr,
+ Object *memslot_owner,
+ unsigned int memslot_count,
+ uint64_t memslot_size);
+void hvb_our_range_memslots_free(OurRangeMemslots *our_range);
+
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(OurRangeMemslots, hvb_our_range_memslots_free)
+
+void hvb_our_range_memslots_ensure_mapped_additional(OurRangeMemslots *our_range,
+ uint64_t additional_map_size);
+
+#endif
--- /dev/null
+/*
+ * QEMU Hyper-V Dynamic Memory Protocol driver
+ *
+ * Copyright (C) 2020-2023 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "hv-balloon-internal.h"
+#include "hv-balloon-page_range_tree.h"
+
+/*
+ * Temporarily avoid warnings about the enhanced GTree API requiring a
+ * newer GLib version, until GLIB_VERSION_MAX_ALLOWED finally reaches
+ * the GLib version that provides this API.
+ */
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
+/* PageRangeTree */
+static gint page_range_tree_key_compare(gconstpointer leftp,
+ gconstpointer rightp,
+ gpointer user_data)
+{
+ const uint64_t *left = leftp, *right = rightp;
+
+ if (*left < *right) {
+ return -1;
+ } else if (*left > *right) {
+ return 1;
+ } else { /* *left == *right */
+ return 0;
+ }
+}
+
+static GTreeNode *page_range_tree_insert_new(PageRangeTree tree,
+ uint64_t start, uint64_t count)
+{
+ uint64_t *key = g_malloc(sizeof(*key));
+ PageRange *range = g_malloc(sizeof(*range));
+
+ assert(count > 0);
+
+ *key = range->start = start;
+ range->count = count;
+
+ return g_tree_insert_node(tree.t, key, range);
+}
+
+void hvb_page_range_tree_insert(PageRangeTree tree,
+ uint64_t start, uint64_t count,
+ uint64_t *dupcount)
+{
+ GTreeNode *node;
+ bool joinable;
+ uint64_t intersection;
+ PageRange *range;
+
+ assert(!SUM_OVERFLOW_U64(start, count));
+ if (count == 0) {
+ return;
+ }
+
+ node = g_tree_upper_bound(tree.t, &start);
+ if (node) {
+ node = g_tree_node_previous(node);
+ } else {
+ node = g_tree_node_last(tree.t);
+ }
+
+ if (node) {
+ range = g_tree_node_value(node);
+ assert(range);
+ intersection = page_range_intersection_size(range, start, count);
+ joinable = page_range_joinable_right(range, start, count);
+ }
+
+ if (!node ||
+ (!intersection && !joinable)) {
+ /*
+ * !node case: the tree is empty or the very first node in the tree
+ * already has a higher key (the start of its range).
+ * The other case: there is a gap in the tree between the new range
+ * and the previous one.
+ * Either way, just insert the new range into the tree.
+ */
+ node = page_range_tree_insert_new(tree, start, count);
+ assert(node);
+ range = g_tree_node_value(node);
+ assert(range);
+ } else {
+ /*
+ * the previous range in the tree either partially covers the new
+ * range or ends just at its beginning - extend it
+ */
+ if (dupcount) {
+ *dupcount += intersection;
+ }
+
+ count += start - range->start;
+ range->count = MAX(range->count, count);
+ }
+
+ /* check next nodes for possible merging */
+ for (node = g_tree_node_next(node); node; ) {
+ PageRange *rangecur;
+
+ rangecur = g_tree_node_value(node);
+ assert(rangecur);
+
+ intersection = page_range_intersection_size(rangecur,
+ range->start, range->count);
+ joinable = page_range_joinable_left(rangecur,
+ range->start, range->count);
+ if (!intersection && !joinable) {
+ /* the current node is disjoint */
+ break;
+ }
+
+ if (dupcount) {
+ *dupcount += intersection;
+ }
+
+ count = rangecur->count + (rangecur->start - range->start);
+ range->count = MAX(range->count, count);
+
+ /* the current node was merged in, remove it */
+ start = rangecur->start;
+ node = g_tree_node_next(node);
+ /* no hinted removal in GTree... */
+ g_tree_remove(tree.t, &start);
+ }
+}
+
+bool hvb_page_range_tree_pop(PageRangeTree tree, PageRange *out,
+ uint64_t maxcount)
+{
+ GTreeNode *node;
+ PageRange *range;
+
+ node = g_tree_node_last(tree.t);
+ if (!node) {
+ return false;
+ }
+
+ range = g_tree_node_value(node);
+ assert(range);
+
+ out->start = range->start;
+
+ /* can't modify range->start as it is the node key */
+ if (range->count > maxcount) {
+ out->start += range->count - maxcount;
+ out->count = maxcount;
+ range->count -= maxcount;
+ } else {
+ out->count = range->count;
+ /* no hinted removal in GTree... */
+ g_tree_remove(tree.t, &out->start);
+ }
+
+ return true;
+}
+
+bool hvb_page_range_tree_intree_any(PageRangeTree tree,
+ uint64_t start, uint64_t count)
+{
+ GTreeNode *node;
+
+ if (count == 0) {
+ return false;
+ }
+
+ /* find the first node that can possibly intersect our range */
+ node = g_tree_upper_bound(tree.t, &start);
+ if (node) {
+ /*
+ * a NULL node below means that the very first node in the tree
+ * already has a higher key (the start of its range).
+ */
+ node = g_tree_node_previous(node);
+ } else {
+ /* a NULL node below means that the tree is empty */
+ node = g_tree_node_last(tree.t);
+ }
+ /* node range start <= range start */
+
+ if (!node) {
+ /* every node's range start > our range start */
+ node = g_tree_node_first(tree.t);
+ }
+
+ for ( ; node; node = g_tree_node_next(node)) {
+ PageRange *range = g_tree_node_value(node);
+
+ assert(range);
+ /*
+ * If this node starts at or beyond the end of our range, so does
+ * every following one.
+ */
+ if (range->start >= start + count) {
+ break;
+ }
+
+ if (page_range_intersection_size(range, start, count) > 0) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void hvb_page_range_tree_init(PageRangeTree *tree)
+{
+ tree->t = g_tree_new_full(page_range_tree_key_compare, NULL,
+ g_free, g_free);
+}
+
+void hvb_page_range_tree_destroy(PageRangeTree *tree)
+{
+ /* g_tree_destroy() is not NULL-safe */
+ if (!tree->t) {
+ return;
+ }
+
+ g_tree_destroy(tree->t);
+ tree->t = NULL;
+}
--- /dev/null
+/*
+ * QEMU Hyper-V Dynamic Memory Protocol driver
+ *
+ * Copyright (C) 2020-2023 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef HW_HYPERV_HV_BALLOON_PAGE_RANGE_TREE_H
+#define HW_HYPERV_HV_BALLOON_PAGE_RANGE_TREE_H
+
+#include "qemu/osdep.h"
+
+/* PageRange */
+typedef struct PageRange {
+ uint64_t start;
+ uint64_t count;
+} PageRange;
+
+/* return just the part of range before (start) */
+static inline void page_range_part_before(const PageRange *range,
+ uint64_t start, PageRange *out)
+{
+ uint64_t endr = range->start + range->count;
+ uint64_t end = MIN(endr, start);
+
+ out->start = range->start;
+ if (end > out->start) {
+ out->count = end - out->start;
+ } else {
+ out->count = 0;
+ }
+}
+
+/* return just the part of range after (start, count) */
+static inline void page_range_part_after(const PageRange *range,
+ uint64_t start, uint64_t count,
+ PageRange *out)
+{
+ uint64_t end = range->start + range->count;
+ uint64_t ends = start + count;
+
+ out->start = MAX(range->start, ends);
+ if (end > out->start) {
+ out->count = end - out->start;
+ } else {
+ out->count = 0;
+ }
+}
+
+static inline void page_range_intersect(const PageRange *range,
+ uint64_t start, uint64_t count,
+ PageRange *out)
+{
+ uint64_t end1 = range->start + range->count;
+ uint64_t end2 = start + count;
+ uint64_t end = MIN(end1, end2);
+
+ out->start = MAX(range->start, start);
+ out->count = out->start < end ? end - out->start : 0;
+}
+
+static inline uint64_t page_range_intersection_size(const PageRange *range,
+ uint64_t start, uint64_t count)
+{
+ PageRange trange;
+
+ page_range_intersect(range, start, count, &trange);
+ return trange.count;
+}
+
+static inline bool page_range_joinable_left(const PageRange *range,
+ uint64_t start, uint64_t count)
+{
+ return start + count == range->start;
+}
+
+static inline bool page_range_joinable_right(const PageRange *range,
+ uint64_t start, uint64_t count)
+{
+ return range->start + range->count == start;
+}
+
+static inline bool page_range_joinable(const PageRange *range,
+ uint64_t start, uint64_t count)
+{
+ return page_range_joinable_left(range, start, count) ||
+ page_range_joinable_right(range, start, count);
+}
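+
+/*
+ * Worked example for a range covering pages 10..14,
+ * i.e. (PageRange){ .start = 10, .count = 5 }:
+ *
+ *   page_range_part_before(&r, 12, &out)   -> { 10, 2 } (pages 10..11)
+ *   page_range_part_after(&r, 12, 2, &out) -> { 14, 1 } (page 14)
+ *   page_range_intersect(&r, 12, 10, &out) -> { 12, 3 } (pages 12..14)
+ *   page_range_joinable_right(&r, 15, n)   -> true (r ends where 15 starts)
+ */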
+
+/* PageRangeTree */
+/* type safety */
+typedef struct PageRangeTree {
+ GTree *t;
+} PageRangeTree;
+
+static inline bool page_range_tree_is_empty(PageRangeTree tree)
+{
+ guint nnodes = g_tree_nnodes(tree.t);
+
+ return nnodes == 0;
+}
+
+void hvb_page_range_tree_init(PageRangeTree *tree);
+void hvb_page_range_tree_destroy(PageRangeTree *tree);
+
+bool hvb_page_range_tree_intree_any(PageRangeTree tree,
+ uint64_t start, uint64_t count);
+
+bool hvb_page_range_tree_pop(PageRangeTree tree, PageRange *out,
+ uint64_t maxcount);
+
+void hvb_page_range_tree_insert(PageRangeTree tree,
+ uint64_t start, uint64_t count,
+ uint64_t *dupcount);
+
+#endif
--- /dev/null
+/*
+ * QEMU Hyper-V Dynamic Memory Protocol driver
+ *
+ * Copyright (C) 2023 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qapi/qapi-commands-machine.h"
+#include "qapi/qapi-types-machine.h"
+
+HvBalloonInfo *qmp_query_hv_balloon_status_report(Error **errp)
+{
+ error_setg(errp, "hv-balloon device not enabled in this build");
+ return NULL;
+}
--- /dev/null
+/*
+ * QEMU Hyper-V Dynamic Memory Protocol driver
+ *
+ * Copyright (C) 2020-2023 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "hv-balloon-internal.h"
+
+#include "exec/address-spaces.h"
+#include "exec/cpu-common.h"
+#include "exec/ramblock.h"
+#include "hw/boards.h"
+#include "hw/hyperv/dynmem-proto.h"
+#include "hw/hyperv/hv-balloon.h"
+#include "hw/hyperv/vmbus.h"
+#include "hw/mem/memory-device.h"
+#include "hw/mem/pc-dimm.h"
+#include "hw/qdev-core.h"
+#include "hw/qdev-properties.h"
+#include "monitor/qdev.h"
+#include "qapi/error.h"
+#include "qapi/qapi-commands-machine.h"
+#include "qapi/qapi-events-machine.h"
+#include "qapi/qapi-types-machine.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/visitor.h"
+#include "qemu/error-report.h"
+#include "qemu/module.h"
+#include "qemu/units.h"
+#include "qemu/timer.h"
+#include "sysemu/balloon.h"
+#include "sysemu/hostmem.h"
+#include "sysemu/reset.h"
+#include "hv-balloon-our_range_memslots.h"
+#include "hv-balloon-page_range_tree.h"
+#include "trace.h"
+
+#define HV_BALLOON_ADDR_PROP "addr"
+#define HV_BALLOON_MEMDEV_PROP "memdev"
+#define HV_BALLOON_GUID "525074DC-8985-46e2-8057-A307DC18A502"
+
+/*
+ * Some Windows versions (at least Server 2019) will crash with various
+ * error codes when receiving DM protocol requests (at least
+ * DM_MEM_HOT_ADD_REQUEST) immediately after boot.
+ *
+ * It looks like Hyper-V from Server 2016 uses a 50-second after-boot
+ * delay, probably to work around this issue, so we'll use this value, too.
+ */
+#define HV_BALLOON_POST_INIT_WAIT (50 * 1000)
+
+#define HV_BALLOON_HA_CHUNK_SIZE (2 * GiB)
+#define HV_BALLOON_HA_CHUNK_PAGES (HV_BALLOON_HA_CHUNK_SIZE / HV_BALLOON_PAGE_SIZE)
+
+#define HV_BALLOON_HA_MEMSLOT_SIZE_ALIGN (128 * MiB)
+
+/*
+ * The maximum number of pages that Windows returns in a single
+ * hot-remove response.
+ *
+ * If the number requested is too high Windows will no longer honor
+ * these requests.
+ */
+#define HV_BALLOON_HR_CHUNK_PAGES 585728
+
+struct HvBalloonClass {
+ VMBusDeviceClass parent_class;
+};
+
+typedef enum State {
+ /* not a real state */
+ S_NO_CHANGE = 0,
+
+ S_WAIT_RESET,
+ S_POST_RESET_CLOSED,
+
+ /* init flow */
+ S_VERSION,
+ S_CAPS,
+ S_POST_INIT_WAIT,
+
+ S_IDLE,
+
+ /* balloon op flow */
+ S_BALLOON_POSTING,
+ S_BALLOON_RB_WAIT,
+ S_BALLOON_REPLY_WAIT,
+
+ /* unballoon + hot add ops flow */
+ S_UNBALLOON_POSTING,
+ S_UNBALLOON_RB_WAIT,
+ S_UNBALLOON_REPLY_WAIT,
+ S_HOT_ADD_SETUP,
+ S_HOT_ADD_RB_WAIT,
+ S_HOT_ADD_POSTING,
+ S_HOT_ADD_REPLY_WAIT,
+} State;
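+
+/*
+ * Sketch of the typical transitions, as driven by the handlers below
+ * (the reply-wait states return to S_IDLE from message handlers):
+ *
+ *   init:      S_VERSION -> S_CAPS -> S_POST_INIT_WAIT -> S_IDLE
+ *   balloon:   S_IDLE -> S_BALLOON_RB_WAIT -> S_BALLOON_POSTING
+ *                     -> S_BALLOON_REPLY_WAIT
+ *   unballoon: S_IDLE -> S_UNBALLOON_RB_WAIT -> S_UNBALLOON_POSTING
+ *                     -> (S_UNBALLOON_RB_WAIT | S_UNBALLOON_REPLY_WAIT)
+ *   hot add:   S_HOT_ADD_SETUP -> S_HOT_ADD_RB_WAIT -> S_HOT_ADD_POSTING
+ *                     -> S_HOT_ADD_REPLY_WAIT
+ */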
+
+typedef struct StateDesc {
+ State state;
+ const char *desc;
+} StateDesc;
+
+typedef struct HvBalloon {
+ VMBusDevice parent;
+ State state;
+
+ union dm_version version;
+ union dm_caps caps;
+
+ QEMUTimer post_init_timer;
+
+ unsigned int trans_id;
+
+ struct {
+ bool enabled;
+ bool received;
+ uint64_t committed;
+ uint64_t available;
+ } status_report;
+
+ /* Guest target size */
+ uint64_t target;
+ bool target_changed;
+
+ /* Current (un)balloon / hot-add operation parameters */
+ union {
+ uint64_t balloon_diff;
+
+ struct {
+ uint64_t unballoon_diff;
+ uint64_t hot_add_diff;
+ };
+
+ struct {
+ PageRange hot_add_range;
+ uint64_t ha_current_count;
+ };
+ };
+
+ OurRangeMemslots *our_range;
+
+ /* Count of memslots covering our memory */
+ unsigned int memslot_count;
+
+ /* Nominal size of each memslot (the last one might be smaller) */
+ uint64_t memslot_size;
+
+ /* Non-ours removed memory */
+ PageRangeTree removed_guest, removed_both;
+
+ /* Grand totals of removed memory (both ours and non-ours) */
+ uint64_t removed_guest_ctr, removed_both_ctr;
+
+ /* MEMORY_DEVICE props */
+ uint64_t addr;
+ HostMemoryBackend *hostmem;
+ MemoryRegion *mr;
+} HvBalloon;
+
+OBJECT_DEFINE_TYPE_WITH_INTERFACES(HvBalloon, hv_balloon, HV_BALLOON, VMBUS_DEVICE, \
+ { TYPE_MEMORY_DEVICE }, { })
+
+#define HV_BALLOON_SET_STATE(hvb, news) \
+ do { \
+ assert(news != S_NO_CHANGE); \
+ hv_balloon_state_set(hvb, news, # news); \
+ } while (0)
+
+#define HV_BALLOON_STATE_DESC_SET(stdesc, news) \
+ _hv_balloon_state_desc_set(stdesc, news, # news)
+
+#define HV_BALLOON_STATE_DESC_INIT \
+ { \
+ .state = S_NO_CHANGE, \
+ }
+
+typedef struct HvBalloonReq {
+ VMBusChanReq vmreq;
+} HvBalloonReq;
+
+/* total "our" memory includes parts currently removed from the guest */
+static uint64_t hv_balloon_total_our_ram(HvBalloon *balloon)
+{
+ if (!balloon->our_range) {
+ return 0;
+ }
+
+ return balloon->our_range->range.added;
+}
+
+/* TODO: unify the code below with virtio-balloon and cache the value */
+static int build_dimm_list(Object *obj, void *opaque)
+{
+ GSList **list = opaque;
+
+ if (object_dynamic_cast(obj, TYPE_PC_DIMM)) {
+ DeviceState *dev = DEVICE(obj);
+ if (dev->realized) { /* only realized DIMMs matter */
+ *list = g_slist_prepend(*list, dev);
+ }
+ }
+
+ object_child_foreach(obj, build_dimm_list, opaque);
+ return 0;
+}
+
+static ram_addr_t get_current_ram_size(void)
+{
+ GSList *list = NULL, *item;
+ ram_addr_t size = current_machine->ram_size;
+
+ build_dimm_list(qdev_get_machine(), &list);
+ for (item = list; item; item = g_slist_next(item)) {
+ Object *obj = OBJECT(item->data);
+ if (!strcmp(object_get_typename(obj), TYPE_PC_DIMM)) {
+ size += object_property_get_int(obj, PC_DIMM_SIZE_PROP,
+ &error_abort);
+ }
+ }
+ g_slist_free(list);
+
+ return size;
+}
+
+/* total RAM includes memory currently removed from the guest */
+static uint64_t hv_balloon_total_ram(HvBalloon *balloon)
+{
+ ram_addr_t ram_size = get_current_ram_size();
+ uint64_t ram_size_pages = ram_size >> HV_BALLOON_PFN_SHIFT;
+ uint64_t our_ram_size_pages = hv_balloon_total_our_ram(balloon);
+
+ assert(ram_size_pages > 0);
+
+ return SUM_SATURATE_U64(ram_size_pages, our_ram_size_pages);
+}
+
+/*
+ * Calculating the total RAM size is a slow operation;
+ * avoid it as much as possible.
+ */
+static uint64_t hv_balloon_total_removed_rs(HvBalloon *balloon,
+ uint64_t ram_size_pages)
+{
+ uint64_t total_removed;
+
+ total_removed = SUM_SATURATE_U64(balloon->removed_guest_ctr,
+ balloon->removed_both_ctr);
+
+ /* possible if guest returns pages outside actual RAM */
+ if (total_removed > ram_size_pages) {
+ total_removed = ram_size_pages;
+ }
+
+ return total_removed;
+}
+
+/* Returns whether the state has actually changed */
+static bool hv_balloon_state_set(HvBalloon *balloon,
+ State newst, const char *newststr)
+{
+ if (newst == S_NO_CHANGE || balloon->state == newst) {
+ return false;
+ }
+
+ balloon->state = newst;
+ trace_hv_balloon_state_change(newststr);
+ return true;
+}
+
+static void _hv_balloon_state_desc_set(StateDesc *stdesc,
+ State newst, const char *newststr)
+{
+ /* state setting is only permitted on a freshly initialized desc */
+ assert(stdesc->state == S_NO_CHANGE);
+
+ assert(newst != S_NO_CHANGE);
+
+ stdesc->state = newst;
+ stdesc->desc = newststr;
+}
+
+static VMBusChannel *hv_balloon_get_channel_maybe(HvBalloon *balloon)
+{
+ return vmbus_device_channel(&balloon->parent, 0);
+}
+
+static VMBusChannel *hv_balloon_get_channel(HvBalloon *balloon)
+{
+ VMBusChannel *chan;
+
+ chan = hv_balloon_get_channel_maybe(balloon);
+ assert(chan != NULL);
+ return chan;
+}
+
+static ssize_t hv_balloon_send_packet(VMBusChannel *chan,
+ struct dm_message *msg)
+{
+ int ret;
+
+ ret = vmbus_channel_reserve(chan, 0, msg->hdr.size);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return vmbus_channel_send(chan, VMBUS_PACKET_DATA_INBAND,
+ NULL, 0, msg, msg->hdr.size, false,
+ msg->hdr.trans_id);
+}
+
+static bool hv_balloon_unballoon_get_source(HvBalloon *balloon,
+ PageRangeTree *dtree,
+ uint64_t **dctr,
+ bool *is_our_range)
+{
+ OurRange *our_range = OUR_RANGE(balloon->our_range);
+
+ /* Try the boot memory first */
+ if (g_tree_nnodes(balloon->removed_guest.t) > 0) {
+ *dtree = balloon->removed_guest;
+ *dctr = &balloon->removed_guest_ctr;
+ *is_our_range = false;
+ } else if (g_tree_nnodes(balloon->removed_both.t) > 0) {
+ *dtree = balloon->removed_both;
+ *dctr = &balloon->removed_both_ctr;
+ *is_our_range = false;
+ } else if (!our_range) {
+ return false;
+ } else if (!our_range_is_removed_tree_empty(our_range, false)) {
+ *dtree = our_range_get_removed_tree(our_range, false);
+ *dctr = &balloon->removed_guest_ctr;
+ *is_our_range = true;
+ } else if (!our_range_is_removed_tree_empty(our_range, true)) {
+ *dtree = our_range_get_removed_tree(our_range, true);
+ *dctr = &balloon->removed_both_ctr;
+ *is_our_range = true;
+ } else {
+ return false;
+ }
+
+ return true;
+}
+
+static void hv_balloon_unballoon_rb_wait(HvBalloon *balloon, StateDesc *stdesc)
+{
+ VMBusChannel *chan = hv_balloon_get_channel(balloon);
+ struct dm_unballoon_request *ur;
+ size_t ur_size = sizeof(*ur) + sizeof(ur->range_array[0]);
+
+ assert(balloon->state == S_UNBALLOON_RB_WAIT);
+
+ if (vmbus_channel_reserve(chan, 0, ur_size) < 0) {
+ return;
+ }
+
+ HV_BALLOON_STATE_DESC_SET(stdesc, S_UNBALLOON_POSTING);
+}
+
+static void hv_balloon_unballoon_posting(HvBalloon *balloon, StateDesc *stdesc)
+{
+ VMBusChannel *chan = hv_balloon_get_channel(balloon);
+ PageRangeTree dtree;
+ uint64_t *dctr;
+ bool our_range;
+ struct dm_unballoon_request *ur;
+ size_t ur_size = sizeof(*ur) + sizeof(ur->range_array[0]);
+ PageRange range;
+ bool bret;
+ ssize_t ret;
+
+ assert(balloon->state == S_UNBALLOON_POSTING);
+ assert(balloon->unballoon_diff > 0);
+
+ if (!hv_balloon_unballoon_get_source(balloon, &dtree, &dctr, &our_range)) {
+ error_report("trying to unballoon but nothing seems to be ballooned");
+ /*
+ * there is little we can do as we might have already
+ * sent the guest a partial request we can't cancel
+ */
+ return;
+ }
+
+ assert(balloon->our_range || !our_range);
+ assert(dtree.t);
+ assert(dctr);
+
+ ur = alloca(ur_size);
+ memset(ur, 0, ur_size);
+ ur->hdr.type = DM_UNBALLOON_REQUEST;
+ ur->hdr.size = ur_size;
+ ur->hdr.trans_id = balloon->trans_id;
+
+ bret = hvb_page_range_tree_pop(dtree, &range, MIN(balloon->unballoon_diff,
+ HV_BALLOON_HA_CHUNK_PAGES));
+ assert(bret);
+ /* TODO: madvise? */
+
+ *dctr -= range.count;
+ balloon->unballoon_diff -= range.count;
+
+ ur->range_count = 1;
+ ur->range_array[0].finfo.start_page = range.start;
+ ur->range_array[0].finfo.page_cnt = range.count;
+ ur->more_pages = balloon->unballoon_diff > 0;
+
+ trace_hv_balloon_outgoing_unballoon(ur->hdr.trans_id,
+ range.count, range.start,
+ balloon->unballoon_diff);
+
+ if (ur->more_pages) {
+ HV_BALLOON_STATE_DESC_SET(stdesc, S_UNBALLOON_RB_WAIT);
+ } else {
+ HV_BALLOON_STATE_DESC_SET(stdesc, S_UNBALLOON_REPLY_WAIT);
+ }
+
+ ret = vmbus_channel_send(chan, VMBUS_PACKET_DATA_INBAND,
+ NULL, 0, ur, ur_size, false,
+ ur->hdr.trans_id);
+ if (ret <= 0) {
+ error_report("error %zd when posting unballoon msg, expect problems",
+ ret);
+ }
+}
+
+static bool hv_balloon_our_range_ensure(HvBalloon *balloon)
+{
+ uint64_t align;
+ MemoryRegion *hostmem_mr;
+ g_autoptr(OurRangeMemslots) our_range_memslots = NULL;
+ OurRange *our_range;
+
+ if (balloon->our_range) {
+ return true;
+ }
+
+ if (!balloon->hostmem) {
+ return false;
+ }
+
+ align = (1 << balloon->caps.cap_bits.hot_add_alignment) * MiB;
+ assert(QEMU_IS_ALIGNED(balloon->addr, align));
+
+ hostmem_mr = host_memory_backend_get_memory(balloon->hostmem);
+
+ our_range_memslots = hvb_our_range_memslots_new(balloon->addr,
+ balloon->mr, hostmem_mr,
+ OBJECT(balloon),
+ balloon->memslot_count,
+ balloon->memslot_size);
+ our_range = OUR_RANGE(our_range_memslots);
+
+ if (hvb_page_range_tree_intree_any(balloon->removed_guest,
+ our_range->range.start,
+ our_range->range.count) ||
+ hvb_page_range_tree_intree_any(balloon->removed_both,
+ our_range->range.start,
+ our_range->range.count)) {
+ error_report("some parts of the memory backend were already returned by the guest. this should not happen, please reboot the guest and try again");
+ return false;
+ }
+
+ trace_hv_balloon_our_range_add(our_range->range.count,
+ our_range->range.start);
+
+ balloon->our_range = g_steal_pointer(&our_range_memslots);
+ return true;
+}
+
+static void hv_balloon_hot_add_setup(HvBalloon *balloon, StateDesc *stdesc)
+{
+ /* need to make a copy since it is in a union with hot_add_range */
+ uint64_t hot_add_diff = balloon->hot_add_diff;
+ PageRange *hot_add_range = &balloon->hot_add_range;
+ uint64_t align, our_range_remaining;
+ OurRange *our_range;
+
+ assert(balloon->state == S_HOT_ADD_SETUP);
+ assert(hot_add_diff > 0);
+
+ if (!hv_balloon_our_range_ensure(balloon)) {
+ goto ret_idle;
+ }
+
+ our_range = OUR_RANGE(balloon->our_range);
+
+ align = (1 << balloon->caps.cap_bits.hot_add_alignment) *
+ (MiB / HV_BALLOON_PAGE_SIZE);
+
+ /* Absolute GPA in pages */
+ hot_add_range->start = our_range_get_remaining_start(our_range);
+ assert(QEMU_IS_ALIGNED(hot_add_range->start, align));
+
+ our_range_remaining = our_range_get_remaining_size(our_range);
+ hot_add_range->count = MIN(our_range_remaining, hot_add_diff);
+ hot_add_range->count = QEMU_ALIGN_DOWN(hot_add_range->count, align);
+ if (hot_add_range->count == 0) {
+ goto ret_idle;
+ }
+
+ hvb_our_range_memslots_ensure_mapped_additional(balloon->our_range,
+ hot_add_range->count);
+
+ HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_RB_WAIT);
+ return;
+
+ret_idle:
+ HV_BALLOON_STATE_DESC_SET(stdesc, S_IDLE);
+}
+
+static void hv_balloon_hot_add_rb_wait(HvBalloon *balloon, StateDesc *stdesc)
+{
+ VMBusChannel *chan = hv_balloon_get_channel(balloon);
+ struct dm_hot_add *ha;
+ size_t ha_size = sizeof(*ha) + sizeof(ha->range);
+
+ assert(balloon->state == S_HOT_ADD_RB_WAIT);
+
+ if (vmbus_channel_reserve(chan, 0, ha_size) < 0) {
+ return;
+ }
+
+ HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_POSTING);
+}
+
+static void hv_balloon_hot_add_posting(HvBalloon *balloon, StateDesc *stdesc)
+{
+ PageRange *hot_add_range = &balloon->hot_add_range;
+ uint64_t *current_count = &balloon->ha_current_count;
+ VMBusChannel *chan = hv_balloon_get_channel(balloon);
+ struct dm_hot_add *ha;
+ size_t ha_size = sizeof(*ha) + sizeof(ha->range);
+ union dm_mem_page_range *ha_region;
+ uint64_t align, chunk_max_size;
+ ssize_t ret;
+
+ assert(balloon->state == S_HOT_ADD_POSTING);
+ assert(hot_add_range->count > 0);
+
+ align = (1 << balloon->caps.cap_bits.hot_add_alignment) *
+ (MiB / HV_BALLOON_PAGE_SIZE);
+ if (align >= HV_BALLOON_HA_CHUNK_PAGES) {
+ /*
+ * If the required alignment is higher than the chunk size, we let it
+ * override that size.
+ */
+ chunk_max_size = align;
+ } else {
+ chunk_max_size = QEMU_ALIGN_DOWN(HV_BALLOON_HA_CHUNK_PAGES, align);
+ }
+
+ /*
+ * hot_add_range->count starts aligned in hv_balloon_hot_add_setup(),
+ * then it is either reduced by subtracting aligned current_count or
+ * further hot-adds are prevented by marking the whole remainder of our
+ * range as unusable in hv_balloon_handle_hot_add_response().
+ */
+ *current_count = MIN(hot_add_range->count, chunk_max_size);
+
+ ha = alloca(ha_size);
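+ /* ha_region is the second page-range slot, laid out right after ha->range */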
+ ha_region = &(&ha->range)[1];
+ memset(ha, 0, ha_size);
+ ha->hdr.type = DM_MEM_HOT_ADD_REQUEST;
+ ha->hdr.size = ha_size;
+ ha->hdr.trans_id = balloon->trans_id;
+
+ ha->range.finfo.start_page = hot_add_range->start;
+ ha->range.finfo.page_cnt = *current_count;
+ ha_region->finfo.start_page = hot_add_range->start;
+ ha_region->finfo.page_cnt = ha->range.finfo.page_cnt;
+
+ trace_hv_balloon_outgoing_hot_add(ha->hdr.trans_id,
+ *current_count, hot_add_range->start);
+
+ ret = vmbus_channel_send(chan, VMBUS_PACKET_DATA_INBAND,
+ NULL, 0, ha, ha_size, false,
+ ha->hdr.trans_id);
+ if (ret <= 0) {
+ error_report("error %zd when posting hot add msg, expect problems",
+ ret);
+ }
+
+ HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_REPLY_WAIT);
+}
+
+static void hv_balloon_balloon_rb_wait(HvBalloon *balloon, StateDesc *stdesc)
+{
+ VMBusChannel *chan = hv_balloon_get_channel(balloon);
+ size_t bl_size = sizeof(struct dm_balloon);
+
+ assert(balloon->state == S_BALLOON_RB_WAIT);
+
+ if (vmbus_channel_reserve(chan, 0, bl_size) < 0) {
+ return;
+ }
+
+ HV_BALLOON_STATE_DESC_SET(stdesc, S_BALLOON_POSTING);
+}
+
+static void hv_balloon_balloon_posting(HvBalloon *balloon, StateDesc *stdesc)
+{
+ VMBusChannel *chan = hv_balloon_get_channel(balloon);
+ struct dm_balloon bl;
+ size_t bl_size = sizeof(bl);
+ ssize_t ret;
+
+ assert(balloon->state == S_BALLOON_POSTING);
+ assert(balloon->balloon_diff > 0);
+
+ memset(&bl, 0, sizeof(bl));
+ bl.hdr.type = DM_BALLOON_REQUEST;
+ bl.hdr.size = bl_size;
+ bl.hdr.trans_id = balloon->trans_id;
+ bl.num_pages = MIN(balloon->balloon_diff, HV_BALLOON_HR_CHUNK_PAGES);
+
+ trace_hv_balloon_outgoing_balloon(bl.hdr.trans_id, bl.num_pages,
+ balloon->balloon_diff);
+
+ ret = vmbus_channel_send(chan, VMBUS_PACKET_DATA_INBAND,
+ NULL, 0, &bl, bl_size, false,
+ bl.hdr.trans_id);
+ if (ret <= 0) {
+ error_report("error %zd when posting balloon msg, expect problems",
+ ret);
+ }
+
+ HV_BALLOON_STATE_DESC_SET(stdesc, S_BALLOON_REPLY_WAIT);
+}
+
+static void hv_balloon_idle_state_process_target(HvBalloon *balloon,
+ StateDesc *stdesc)
+{
+ bool can_balloon = balloon->caps.cap_bits.balloon;
+ uint64_t ram_size_pages, total_removed;
+
+ ram_size_pages = hv_balloon_total_ram(balloon);
+ total_removed = hv_balloon_total_removed_rs(balloon, ram_size_pages);
+
+ /*
+ * We need to cache the values computed from the balloon target value when
+ * starting the adjustment procedure, in case someone changes the target
+ * while the procedure is in progress.
+ */
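+ /*
+ * Example (with hot-add available): 1000 pages of RAM, 100 pages removed
+ * so far and a target of 1050 pages gives a target_diff of 150, so 100
+ * pages are unballooned first and the remaining 50 are hot-added.
+ */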
+ if (balloon->target > ram_size_pages - total_removed) {
+ bool can_hot_add = balloon->caps.cap_bits.hot_add;
+ uint64_t target_diff = balloon->target -
+ (ram_size_pages - total_removed);
+
+ balloon->unballoon_diff = MIN(target_diff, total_removed);
+
+ if (can_hot_add) {
+ balloon->hot_add_diff = target_diff - balloon->unballoon_diff;
+ } else {
+ balloon->hot_add_diff = 0;
+ }
+
+ if (balloon->unballoon_diff > 0) {
+ assert(can_balloon);
+ HV_BALLOON_STATE_DESC_SET(stdesc, S_UNBALLOON_RB_WAIT);
+ } else if (balloon->hot_add_diff > 0) {
+ HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_SETUP);
+ }
+ } else if (can_balloon &&
+ balloon->target < ram_size_pages - total_removed) {
+ balloon->balloon_diff = ram_size_pages - total_removed -
+ balloon->target;
+ HV_BALLOON_STATE_DESC_SET(stdesc, S_BALLOON_RB_WAIT);
+ }
+}
+
+static void hv_balloon_idle_state(HvBalloon *balloon,
+ StateDesc *stdesc)
+{
+ assert(balloon->state == S_IDLE);
+
+ if (balloon->target_changed) {
+ balloon->target_changed = false;
+ hv_balloon_idle_state_process_target(balloon, stdesc);
+ return;
+ }
+}
+
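+/*
+ * Dispatch table for the "active" states; states missing here (such as the
+ * reply-wait ones) are advanced by incoming guest messages or timers instead.
+ */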
+static const struct {
+ void (*handler)(HvBalloon *balloon, StateDesc *stdesc);
+} state_handlers[] = {
+ [S_IDLE].handler = hv_balloon_idle_state,
+ [S_BALLOON_POSTING].handler = hv_balloon_balloon_posting,
+ [S_BALLOON_RB_WAIT].handler = hv_balloon_balloon_rb_wait,
+ [S_UNBALLOON_POSTING].handler = hv_balloon_unballoon_posting,
+ [S_UNBALLOON_RB_WAIT].handler = hv_balloon_unballoon_rb_wait,
+ [S_HOT_ADD_SETUP].handler = hv_balloon_hot_add_setup,
+ [S_HOT_ADD_RB_WAIT].handler = hv_balloon_hot_add_rb_wait,
+ [S_HOT_ADD_POSTING].handler = hv_balloon_hot_add_posting,
+};
+
+static void hv_balloon_handle_state(HvBalloon *balloon, StateDesc *stdesc)
+{
+ if (balloon->state >= ARRAY_SIZE(state_handlers) ||
+ !state_handlers[balloon->state].handler) {
+ return;
+ }
+
+ state_handlers[balloon->state].handler(balloon, stdesc);
+}
+
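+/*
+ * Insert the range into the given tree and add the number of pages that
+ * were not already there (the effective count) to each supplied counter;
+ * the third counter is optional.
+ */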
+static void hv_balloon_remove_response_insert_range(PageRangeTree tree,
+ const PageRange *range,
+ uint64_t *ctr1,
+ uint64_t *ctr2,
+ uint64_t *ctr3)
+{
+ uint64_t dupcount, effcount;
+
+ if (range->count == 0) {
+ return;
+ }
+
+ dupcount = 0;
+ hvb_page_range_tree_insert(tree, range->start, range->count, &dupcount);
+
+ assert(dupcount <= range->count);
+ effcount = range->count - dupcount;
+
+ *ctr1 += effcount;
+ *ctr2 += effcount;
+ if (ctr3) {
+ *ctr3 += effcount;
+ }
+}
+
+static void hv_balloon_remove_response_handle_range(HvBalloon *balloon,
+ PageRange *range,
+ bool both,
+ uint64_t *removedctr)
+{
+ OurRange *our_range = OUR_RANGE(balloon->our_range);
+ PageRangeTree globaltree =
+ both ? balloon->removed_both : balloon->removed_guest;
+ uint64_t *globalctr =
+ both ? &balloon->removed_both_ctr : &balloon->removed_guest_ctr;
+ PageRange rangeeff;
+
+ if (range->count == 0) {
+ return;
+ }
+
+ trace_hv_balloon_remove_response(range->count, range->start, both);
+
+ if (our_range) {
+ /* Includes the not-yet-hot-added and unusable parts. */
+ rangeeff = our_range->range;
+ } else {
+ rangeeff.start = rangeeff.count = 0;
+ }
+
+ if (page_range_intersection_size(range, rangeeff.start, rangeeff.count) > 0) {
+ PageRangeTree ourtree = our_range_get_removed_tree(our_range, both);
+ PageRange rangehole, rangecommon;
+ uint64_t ourremoved = 0;
+
+ /* process the hole before our range, if it exists */
+ page_range_part_before(range, rangeeff.start, &rangehole);
+ hv_balloon_remove_response_insert_range(globaltree, &rangehole,
+ globalctr, removedctr, NULL);
+ if (rangehole.count > 0) {
+ trace_hv_balloon_remove_response_hole(rangehole.count,
+ rangehole.start,
+ range->count, range->start,
+ rangeeff.start, both);
+ }
+
+ /* process our part */
+ page_range_intersect(range, rangeeff.start, rangeeff.count,
+ &rangecommon);
+ hv_balloon_remove_response_insert_range(ourtree, &rangecommon,
+ globalctr, removedctr,
+ &ourremoved);
+ if (rangecommon.count > 0) {
+ trace_hv_balloon_remove_response_common(rangecommon.count,
+ rangecommon.start,
+ range->count, range->start,
+ rangeeff.count,
+ rangeeff.start, ourremoved,
+ both);
+ }
+
+ /* calculate what's left after our range */
+ rangecommon = *range;
+ page_range_part_after(&rangecommon, rangeeff.start, rangeeff.count,
+ range);
+ }
+
+ /* process the remainder of the range that lies after our range */
+ if (range->count > 0) {
+ hv_balloon_remove_response_insert_range(globaltree, range,
+ globalctr, removedctr, NULL);
+ trace_hv_balloon_remove_response_remainder(range->count, range->start,
+ both);
+ range->count = 0;
+ }
+}
+
+static void hv_balloon_remove_response_handle_pages(HvBalloon *balloon,
+ PageRange *range,
+ uint64_t start,
+ uint64_t count,
+ bool both,
+ uint64_t *removedctr)
+{
+ assert(count > 0);
+
+ /*
+ * If there is an existing range that the new range can't be joined to,
+ * dump it into the tree(s).
+ */
+ if (range->count > 0 && !page_range_joinable(range, start, count)) {
+ hv_balloon_remove_response_handle_range(balloon, range, both,
+ removedctr);
+ }
+
+ if (range->count == 0) {
+ range->start = start;
+ range->count = count;
+ } else if (page_range_joinable_left(range, start, count)) {
+ range->start = start;
+ range->count += count;
+ } else { /* page_range_joinable_right() */
+ range->count += count;
+ }
+}
+
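+/*
+ * g_tree_foreach() callback: discard the host memory backing a removed page
+ * range; returning false continues the traversal.
+ */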
+static gboolean hv_balloon_handle_remove_host_addr_node(gpointer key,
+ gpointer value,
+ gpointer data)
+{
+ PageRange *range = value;
+ uint64_t pageoff;
+
+ for (pageoff = 0; pageoff < range->count; ) {
+ uint64_t addr_64 = (range->start + pageoff) * HV_BALLOON_PAGE_SIZE;
+ void *addr;
+ RAMBlock *rb;
+ ram_addr_t rb_offset;
+ size_t rb_page_size;
+ size_t discard_size;
+
+ assert(addr_64 <= UINTPTR_MAX);
+ addr = (void *)((uintptr_t)addr_64);
+ rb = qemu_ram_block_from_host(addr, false, &rb_offset);
+ rb_page_size = qemu_ram_pagesize(rb);
+
+ if (rb_page_size != HV_BALLOON_PAGE_SIZE) {
+ /* TODO: these should end in "removed_guest" */
+ warn_report("guest reported removed page backed by unsupported page size %zu",
+ rb_page_size);
+ pageoff++;
+ continue;
+ }
+
+ discard_size = MIN(range->count - pageoff,
+ (rb->max_length - rb_offset) /
+ HV_BALLOON_PAGE_SIZE);
+ discard_size = MAX(discard_size, 1);
+
+ if (ram_block_discard_range(rb, rb_offset, discard_size *
+ HV_BALLOON_PAGE_SIZE) != 0) {
+ warn_report("guest reported removed page failed discard");
+ }
+
+ pageoff += discard_size;
+ }
+
+ return false;
+}
+
+static void hv_balloon_handle_remove_host_addr_tree(PageRangeTree tree)
+{
+ g_tree_foreach(tree.t, hv_balloon_handle_remove_host_addr_node, NULL);
+}
+
+static int hv_balloon_handle_remove_section(PageRangeTree tree,
+ const MemoryRegionSection *section,
+ uint64_t count)
+{
+ void *addr = memory_region_get_ram_ptr(section->mr) +
+ section->offset_within_region;
+ uint64_t addr_page;
+
+ assert(count > 0);
+
+ if ((uintptr_t)addr % HV_BALLOON_PAGE_SIZE) {
+ warn_report("guest reported removed pages at an unaligned host addr %p",
+ addr);
+ return -EINVAL;
+ }
+
+ addr_page = (uintptr_t)addr / HV_BALLOON_PAGE_SIZE;
+ hvb_page_range_tree_insert(tree, addr_page, count, NULL);
+
+ return 0;
+}
+
+static void hv_balloon_handle_remove_ranges(HvBalloon *balloon,
+ union dm_mem_page_range ranges[],
+ uint32_t count)
+{
+ uint64_t removedcnt;
+ PageRangeTree removed_host_addr;
+ PageRange range_guest, range_both;
+
+ hvb_page_range_tree_init(&removed_host_addr);
+ range_guest.count = range_both.count = removedcnt = 0;
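+ /*
+ * Merge contiguous pages into range_guest / range_both runs, flushing a
+ * run into the removed-page trees whenever it can't be extended further.
+ */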
+ for (unsigned int ctr = 0; ctr < count; ctr++) {
+ union dm_mem_page_range *mr = &ranges[ctr];
+ hwaddr pa;
+ MemoryRegionSection section;
+
+ for (unsigned int offset = 0; offset < mr->finfo.page_cnt; ) {
+ int ret;
+ uint64_t pageno = mr->finfo.start_page + offset;
+ uint64_t pagecnt = 1;
+
+ pa = (hwaddr)pageno << HV_BALLOON_PFN_SHIFT;
+ section = memory_region_find(get_system_memory(), pa,
+ (mr->finfo.page_cnt - offset) *
+ HV_BALLOON_PAGE_SIZE);
+ if (!section.mr) {
+ warn_report("guest reported removed page %"PRIu64" not found in RAM",
+ pageno);
+ ret = -EINVAL;
+ goto finish_page;
+ }
+
+ pagecnt = int128_get64(section.size) / HV_BALLOON_PAGE_SIZE;
+ if (pagecnt <= 0) {
+ warn_report("guest reported removed page %"PRIu64" in a section smaller than page size",
+ pageno);
+ pagecnt = 1; /* skip the whole page */
+ ret = -EINVAL;
+ goto finish_page;
+ }
+
+ if (!memory_region_is_ram(section.mr) ||
+ memory_region_is_rom(section.mr) ||
+ memory_region_is_romd(section.mr)) {
+ warn_report("guest reported removed page %"PRIu64" in a section that is not an ordinary RAM",
+ pageno);
+ ret = -EINVAL;
+ goto finish_page;
+ }
+
+ ret = hv_balloon_handle_remove_section(removed_host_addr, &section,
+ pagecnt);
+
+ finish_page:
+ if (ret == 0) {
+ hv_balloon_remove_response_handle_pages(balloon,
+ &range_both,
+ pageno, pagecnt,
+ true, &removedcnt);
+ } else {
+ hv_balloon_remove_response_handle_pages(balloon,
+ &range_guest,
+ pageno, pagecnt,
+ false, &removedcnt);
+ }
+
+ if (section.mr) {
+ memory_region_unref(section.mr);
+ }
+
+ offset += pagecnt;
+ }
+ }
+
+ hv_balloon_remove_response_handle_range(balloon, &range_both, true,
+ &removedcnt);
+ hv_balloon_remove_response_handle_range(balloon, &range_guest, false,
+ &removedcnt);
+
+ hv_balloon_handle_remove_host_addr_tree(removed_host_addr);
+ hvb_page_range_tree_destroy(&removed_host_addr);
+
+ if (removedcnt > balloon->balloon_diff) {
+ warn_report("guest reported more pages removed than currently pending (%"PRIu64" vs %"PRIu64")",
+ removedcnt, balloon->balloon_diff);
+ balloon->balloon_diff = 0;
+ } else {
+ balloon->balloon_diff -= removedcnt;
+ }
+}
+
+static bool hv_balloon_handle_msg_size(HvBalloonReq *req, size_t minsize,
+ const char *msgname)
+{
+ VMBusChanReq *vmreq = &req->vmreq;
+ uint32_t msglen = vmreq->msglen;
+
+ if (msglen >= minsize) {
+ return true;
+ }
+
+ warn_report("%s message too short (%u vs %zu), ignoring", msgname,
+ (unsigned int)msglen, minsize);
+ return false;
+}
+
+static void hv_balloon_handle_version_request(HvBalloon *balloon,
+ HvBalloonReq *req,
+ StateDesc *stdesc)
+{
+ VMBusChanReq *vmreq = &req->vmreq;
+ struct dm_version_request *msgVr = vmreq->msg;
+ struct dm_version_response respVr;
+
+ if (balloon->state != S_VERSION) {
+ warn_report("unexpected DM_VERSION_REQUEST in %d state",
+ balloon->state);
+ return;
+ }
+
+ if (!hv_balloon_handle_msg_size(req, sizeof(*msgVr),
+ "DM_VERSION_REQUEST")) {
+ return;
+ }
+
+ trace_hv_balloon_incoming_version(msgVr->version.major_version,
+ msgVr->version.minor_version);
+
+ memset(&respVr, 0, sizeof(respVr));
+ respVr.hdr.type = DM_VERSION_RESPONSE;
+ respVr.hdr.size = sizeof(respVr);
+ respVr.hdr.trans_id = msgVr->hdr.trans_id;
+ respVr.is_accepted = msgVr->version.version >= DYNMEM_PROTOCOL_VERSION_1 &&
+ msgVr->version.version <= DYNMEM_PROTOCOL_VERSION_3;
+
+ hv_balloon_send_packet(vmreq->chan, (struct dm_message *)&respVr);
+
+ if (respVr.is_accepted) {
+ HV_BALLOON_STATE_DESC_SET(stdesc, S_CAPS);
+ }
+}
+
+static void hv_balloon_handle_caps_report(HvBalloon *balloon,
+ HvBalloonReq *req,
+ StateDesc *stdesc)
+{
+ VMBusChanReq *vmreq = &req->vmreq;
+ struct dm_capabilities *msgCap = vmreq->msg;
+ struct dm_capabilities_resp_msg respCap;
+
+ if (balloon->state != S_CAPS) {
+ warn_report("unexpected DM_CAPABILITIES_REPORT in %d state",
+ balloon->state);
+ return;
+ }
+
+ if (!hv_balloon_handle_msg_size(req, sizeof(*msgCap),
+ "DM_CAPABILITIES_REPORT")) {
+ return;
+ }
+
+ trace_hv_balloon_incoming_caps(msgCap->caps.caps);
+ balloon->caps = msgCap->caps;
+
+ memset(&respCap, 0, sizeof(respCap));
+ respCap.hdr.type = DM_CAPABILITIES_RESPONSE;
+ respCap.hdr.size = sizeof(respCap);
+ respCap.hdr.trans_id = msgCap->hdr.trans_id;
+ respCap.is_accepted = 1;
+ respCap.hot_remove = 1;
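+ /* ask the guest to skip memory pressure reports unless the user enabled them */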
+ respCap.suppress_pressure_reports = !balloon->status_report.enabled;
+ hv_balloon_send_packet(vmreq->chan, (struct dm_message *)&respCap);
+
+ timer_mod(&balloon->post_init_timer,
+ qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
+ HV_BALLOON_POST_INIT_WAIT);
+
+ HV_BALLOON_STATE_DESC_SET(stdesc, S_POST_INIT_WAIT);
+}
+
+static void hv_balloon_handle_status_report(HvBalloon *balloon,
+ HvBalloonReq *req)
+{
+ VMBusChanReq *vmreq = &req->vmreq;
+ struct dm_status *msgStatus = vmreq->msg;
+
+ if (!hv_balloon_handle_msg_size(req, sizeof(*msgStatus),
+ "DM_STATUS_REPORT")) {
+ return;
+ }
+
+ if (!balloon->status_report.enabled) {
+ return;
+ }
+
+ balloon->status_report.committed = msgStatus->num_committed;
+ balloon->status_report.committed *= HV_BALLOON_PAGE_SIZE;
+ balloon->status_report.available = msgStatus->num_avail;
+ balloon->status_report.available *= HV_BALLOON_PAGE_SIZE;
+ balloon->status_report.received = true;
+
+ qapi_event_send_hv_balloon_status_report(balloon->status_report.committed,
+ balloon->status_report.available);
+}
+
+HvBalloonInfo *qmp_query_hv_balloon_status_report(Error **errp)
+{
+ HvBalloon *balloon;
+ HvBalloonInfo *info;
+
+ balloon = HV_BALLOON(object_resolve_path_type("", TYPE_HV_BALLOON, NULL));
+ if (!balloon) {
+ error_setg(errp, "no %s device present", TYPE_HV_BALLOON);
+ return NULL;
+ }
+
+ if (!balloon->status_report.enabled) {
+ error_setg(errp, "guest memory status reporting not enabled");
+ return NULL;
+ }
+
+ if (!balloon->status_report.received) {
+ error_setg(errp, "no guest memory status report received yet");
+ return NULL;
+ }
+
+ info = g_malloc0(sizeof(*info));
+ info->committed = balloon->status_report.committed;
+ info->available = balloon->status_report.available;
+ return info;
+}
+
+static void hv_balloon_handle_unballoon_response(HvBalloon *balloon,
+ HvBalloonReq *req,
+ StateDesc *stdesc)
+{
+ VMBusChanReq *vmreq = &req->vmreq;
+ struct dm_unballoon_response *msgUrR = vmreq->msg;
+
+ if (balloon->state != S_UNBALLOON_REPLY_WAIT) {
+ warn_report("unexpected DM_UNBALLOON_RESPONSE in %d state",
+ balloon->state);
+ return;
+ }
+
+ if (!hv_balloon_handle_msg_size(req, sizeof(*msgUrR),
+ "DM_UNBALLOON_RESPONSE")) {
+ return;
+ }
+
+ trace_hv_balloon_incoming_unballoon(msgUrR->hdr.trans_id);
+
+ balloon->trans_id++;
+
+ if (balloon->hot_add_diff > 0) {
+ bool can_hot_add = balloon->caps.cap_bits.hot_add;
+
+ assert(can_hot_add);
+ HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_SETUP);
+ } else {
+ HV_BALLOON_STATE_DESC_SET(stdesc, S_IDLE);
+ }
+}
+
+static void hv_balloon_handle_hot_add_response(HvBalloon *balloon,
+ HvBalloonReq *req,
+ StateDesc *stdesc)
+{
+ PageRange *hot_add_range = &balloon->hot_add_range;
+ VMBusChanReq *vmreq = &req->vmreq;
+ struct dm_hot_add_response *msgHaR = vmreq->msg;
+ OurRange *our_range;
+
+ if (balloon->state != S_HOT_ADD_REPLY_WAIT) {
+ warn_report("unexpected DM_HOT_ADD_RESPONSE in %d state",
+ balloon->state);
+ return;
+ }
+
+ assert(balloon->our_range);
+ our_range = OUR_RANGE(balloon->our_range);
+
+ if (!hv_balloon_handle_msg_size(req, sizeof(*msgHaR),
+ "DM_HOT_ADD_RESPONSE")) {
+ return;
+ }
+
+ trace_hv_balloon_incoming_hot_add(msgHaR->hdr.trans_id, msgHaR->result,
+ msgHaR->page_count);
+
+ balloon->trans_id++;
+
+ if (msgHaR->result) {
+ if (msgHaR->page_count > balloon->ha_current_count) {
+ warn_report("DM_HOT_ADD_RESPONSE page count higher than requested (%"PRIu32" vs %"PRIu64")",
+ msgHaR->page_count, balloon->ha_current_count);
+ msgHaR->page_count = balloon->ha_current_count;
+ }
+
+ hvb_our_range_mark_added(our_range, msgHaR->page_count);
+ hot_add_range->start += msgHaR->page_count;
+ hot_add_range->count -= msgHaR->page_count;
+ }
+
+ if (!msgHaR->result || msgHaR->page_count < balloon->ha_current_count) {
+ /*
+ * The current planned range was only partially hot-added; take note of
+ * how much of it remains and don't attempt any further hot-adds.
+ */
+ our_range_mark_remaining_unusable(our_range);
+
+ goto ret_idle;
+ }
+
+ /* any pages remaining to hot-add in our range? */
+ if (hot_add_range->count > 0) {
+ HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_RB_WAIT);
+ return;
+ }
+
+ret_idle:
+ HV_BALLOON_STATE_DESC_SET(stdesc, S_IDLE);
+}
+
+static void hv_balloon_handle_balloon_response(HvBalloon *balloon,
+ HvBalloonReq *req,
+ StateDesc *stdesc)
+{
+ VMBusChanReq *vmreq = &req->vmreq;
+ struct dm_balloon_response *msgBR = vmreq->msg;
+
+ if (balloon->state != S_BALLOON_REPLY_WAIT) {
+ warn_report("unexpected DM_BALLOON_RESPONSE in %d state",
+ balloon->state);
+ return;
+ }
+
+ if (!hv_balloon_handle_msg_size(req, sizeof(*msgBR),
+ "DM_BALLOON_RESPONSE")) {
+ return;
+ }
+
+ trace_hv_balloon_incoming_balloon(msgBR->hdr.trans_id, msgBR->range_count,
+ msgBR->more_pages);
+
+ if (vmreq->msglen < sizeof(*msgBR) +
+ (uint64_t)sizeof(msgBR->range_array[0]) * msgBR->range_count) {
+ warn_report("DM_BALLOON_RESPONSE too short for the range count");
+ return;
+ }
+
+ if (msgBR->range_count == 0) {
+ /* The guest is already at its minimum size */
+ balloon->balloon_diff = 0;
+ goto ret_end_trans;
+ } else {
+ hv_balloon_handle_remove_ranges(balloon,
+ msgBR->range_array,
+ msgBR->range_count);
+ }
+
+ /* More responses expected? */
+ if (msgBR->more_pages) {
+ return;
+ }
+
+ret_end_trans:
+ balloon->trans_id++;
+
+ if (balloon->balloon_diff > 0) {
+ HV_BALLOON_STATE_DESC_SET(stdesc, S_BALLOON_RB_WAIT);
+ } else {
+ HV_BALLOON_STATE_DESC_SET(stdesc, S_IDLE);
+ }
+}
+
+static void hv_balloon_handle_packet(HvBalloon *balloon, HvBalloonReq *req,
+ StateDesc *stdesc)
+{
+ VMBusChanReq *vmreq = &req->vmreq;
+ struct dm_message *msg = vmreq->msg;
+
+ if (vmreq->msglen < sizeof(msg->hdr)) {
+ return;
+ }
+
+ switch (msg->hdr.type) {
+ case DM_VERSION_REQUEST:
+ hv_balloon_handle_version_request(balloon, req, stdesc);
+ break;
+
+ case DM_CAPABILITIES_REPORT:
+ hv_balloon_handle_caps_report(balloon, req, stdesc);
+ break;
+
+ case DM_STATUS_REPORT:
+ hv_balloon_handle_status_report(balloon, req);
+ break;
+
+ case DM_MEM_HOT_ADD_RESPONSE:
+ hv_balloon_handle_hot_add_response(balloon, req, stdesc);
+ break;
+
+ case DM_UNBALLOON_RESPONSE:
+ hv_balloon_handle_unballoon_response(balloon, req, stdesc);
+ break;
+
+ case DM_BALLOON_RESPONSE:
+ hv_balloon_handle_balloon_response(balloon, req, stdesc);
+ break;
+
+ default:
+ warn_report("unknown DM message %u", msg->hdr.type);
+ break;
+ }
+}
+
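+/*
+ * Drain pending guest messages from the channel, stopping early when a state
+ * transition gets requested; the return value tells the caller whether to run
+ * the event loop again.
+ */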
+static bool hv_balloon_recv_channel(HvBalloon *balloon, StateDesc *stdesc)
+{
+ VMBusChannel *chan;
+ HvBalloonReq *req;
+
+ if (balloon->state == S_WAIT_RESET ||
+ balloon->state == S_POST_RESET_CLOSED) {
+ return false;
+ }
+
+ chan = hv_balloon_get_channel(balloon);
+ if (vmbus_channel_recv_start(chan)) {
+ return false;
+ }
+
+ while ((req = vmbus_channel_recv_peek(chan, sizeof(*req)))) {
+ hv_balloon_handle_packet(balloon, req, stdesc);
+ vmbus_free_req(req);
+ vmbus_channel_recv_pop(chan);
+
+ if (stdesc->state != S_NO_CHANGE) {
+ break;
+ }
+ }
+
+ return vmbus_channel_recv_done(chan) > 0;
+}
+
+/* old state handler -> new state transition (potential) */
+static bool hv_balloon_event_loop_state(HvBalloon *balloon)
+{
+ StateDesc state_new = HV_BALLOON_STATE_DESC_INIT;
+
+ hv_balloon_handle_state(balloon, &state_new);
+ return hv_balloon_state_set(balloon, state_new.state, state_new.desc);
+}
+
+/* VMBus message -> new state transition (potential) */
+static bool hv_balloon_event_loop_recv(HvBalloon *balloon)
+{
+ StateDesc state_new = HV_BALLOON_STATE_DESC_INIT;
+ bool any_recv, state_changed;
+
+ any_recv = hv_balloon_recv_channel(balloon, &state_new);
+ state_changed = hv_balloon_state_set(balloon,
+ state_new.state, state_new.desc);
+
+ return state_changed || any_recv;
+}
+
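+/* Run the state machine and drain the channel until neither makes progress. */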
+static void hv_balloon_event_loop(HvBalloon *balloon)
+{
+ bool state_repeat, recv_repeat;
+
+ do {
+ state_repeat = hv_balloon_event_loop_state(balloon);
+ recv_repeat = hv_balloon_event_loop_recv(balloon);
+ } while (state_repeat || recv_repeat);
+}
+
+static void hv_balloon_vmdev_chan_notify(VMBusChannel *chan)
+{
+ HvBalloon *balloon = HV_BALLOON(vmbus_channel_device(chan));
+
+ hv_balloon_event_loop(balloon);
+}
+
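+/*
+ * Balloon status callback: the guest size is the total RAM minus the pages
+ * removed from both the guest and the host, converted to bytes.
+ */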
+static void hv_balloon_stat(void *opaque, BalloonInfo *info)
+{
+ HvBalloon *balloon = opaque;
+ info->actual = (hv_balloon_total_ram(balloon) - balloon->removed_both_ctr)
+ << HV_BALLOON_PFN_SHIFT;
+}
+
+static void hv_balloon_to_target(void *opaque, ram_addr_t target)
+{
+ HvBalloon *balloon = opaque;
+ uint64_t target_pages = target >> HV_BALLOON_PFN_SHIFT;
+
+ if (!target_pages) {
+ return;
+ }
+
+ /*
+ * Always set target_changed, even with an unchanged target, as the user
+ * might be asking us to retry reaching it.
+ */
+ balloon->target = target_pages;
+ balloon->target_changed = true;
+
+ hv_balloon_event_loop(balloon);
+}
+
+static int hv_balloon_vmdev_open_channel(VMBusChannel *chan)
+{
+ HvBalloon *balloon = HV_BALLOON(vmbus_channel_device(chan));
+
+ if (balloon->state != S_POST_RESET_CLOSED) {
+ warn_report("guest trying to open a DM channel in invalid %d state",
+ balloon->state);
+ return -EINVAL;
+ }
+
+ HV_BALLOON_SET_STATE(balloon, S_VERSION);
+ hv_balloon_event_loop(balloon);
+
+ return 0;
+}
+
+static void hv_balloon_vmdev_close_channel(VMBusChannel *chan)
+{
+ HvBalloon *balloon = HV_BALLOON(vmbus_channel_device(chan));
+
+ timer_del(&balloon->post_init_timer);
+
+ /* Don't report stale data */
+ balloon->status_report.received = false;
+
+ HV_BALLOON_SET_STATE(balloon, S_WAIT_RESET);
+ hv_balloon_event_loop(balloon);
+}
+
+static void hv_balloon_post_init_timer(void *opaque)
+{
+ HvBalloon *balloon = opaque;
+
+ if (balloon->state != S_POST_INIT_WAIT) {
+ return;
+ }
+
+ HV_BALLOON_SET_STATE(balloon, S_IDLE);
+ hv_balloon_event_loop(balloon);
+}
+
+static void hv_balloon_system_reset_unrealize_common(HvBalloon *balloon)
+{
+ g_clear_pointer(&balloon->our_range, hvb_our_range_memslots_free);
+}
+
+static void hv_balloon_system_reset(void *opaque)
+{
+ HvBalloon *balloon = HV_BALLOON(opaque);
+
+ hv_balloon_system_reset_unrealize_common(balloon);
+}
+
+static void hv_balloon_ensure_mr(HvBalloon *balloon)
+{
+ MemoryRegion *hostmem_mr;
+
+ assert(balloon->hostmem);
+
+ if (balloon->mr) {
+ return;
+ }
+
+ hostmem_mr = host_memory_backend_get_memory(balloon->hostmem);
+
+ balloon->mr = g_new0(MemoryRegion, 1);
+ memory_region_init(balloon->mr, OBJECT(balloon), TYPE_HV_BALLOON,
+ memory_region_size(hostmem_mr));
+
+ /*
+ * The VM can indicate an alignment up to 32 GiB. Memory device core can
+ * usually only handle/guarantee 1 GiB alignment. The user will have to
+ * specify a larger maxmem eventually.
+ *
+ * The memory device core will warn the user in case maxmem might have to be
+ * increased and will fail plugging the device if there is not sufficient
+ * space after alignment.
+ *
+ * TODO: we could do the alignment ourselves in a slightly bigger region.
+ * But this feels better, although the warning might be annoying. Maybe
+ * we can optimize that in the future (e.g., with such a device on the
+ * cmdline, place/size the device memory region differently).
+ */
+ balloon->mr->align = MAX(32 * GiB, memory_region_get_alignment(hostmem_mr));
+}
+
+static void hv_balloon_free_mr(HvBalloon *balloon)
+{
+ if (!balloon->mr) {
+ return;
+ }
+
+ object_unparent(OBJECT(balloon->mr));
+ g_clear_pointer(&balloon->mr, g_free);
+}
+
+static void hv_balloon_vmdev_realize(VMBusDevice *vdev, Error **errp)
+{
+ ERRP_GUARD();
+ HvBalloon *balloon = HV_BALLOON(vdev);
+ int ret;
+
+ balloon->state = S_WAIT_RESET;
+
+ ret = qemu_add_balloon_handler(hv_balloon_to_target, hv_balloon_stat,
+ balloon);
+ if (ret < 0) {
+ /* This also protects against having multiple hv-balloon instances */
+ error_setg(errp, "Only one balloon device is supported");
+ return;
+ }
+
+ if (balloon->hostmem) {
+ if (host_memory_backend_is_mapped(balloon->hostmem)) {
+ Object *obj = OBJECT(balloon->hostmem);
+
+ error_setg(errp, "'%s' property specifies a busy memdev: %s",
+ HV_BALLOON_MEMDEV_PROP,
+ object_get_canonical_path_component(obj));
+ goto out_balloon_handler;
+ }
+
+ hv_balloon_ensure_mr(balloon);
+
+ /* This is rather unlikely to happen, but let's still check for it. */
+ if (!QEMU_IS_ALIGNED(memory_region_size(balloon->mr),
+ HV_BALLOON_PAGE_SIZE)) {
+ error_setg(errp, "'%s' property memdev size has to be a multiple of 0x%" PRIx64,
+ HV_BALLOON_MEMDEV_PROP, (uint64_t)HV_BALLOON_PAGE_SIZE);
+ goto out_balloon_handler;
+ }
+
+ host_memory_backend_set_mapped(balloon->hostmem, true);
+ vmstate_register_ram(host_memory_backend_get_memory(balloon->hostmem),
+ DEVICE(balloon));
+ } else if (balloon->addr) {
+ error_setg(errp, "'%s' property must not be set without a memdev",
+ HV_BALLOON_MEMDEV_PROP);
+ goto out_balloon_handler;
+ }
+
+ timer_init_ms(&balloon->post_init_timer, QEMU_CLOCK_VIRTUAL,
+ hv_balloon_post_init_timer, balloon);
+
+ qemu_register_reset(hv_balloon_system_reset, balloon);
+
+ return;
+
+out_balloon_handler:
+ qemu_remove_balloon_handler(balloon);
+}
+
+/*
+ * VMBus device reset has to be implemented in case the guest decides to
+ * disconnect and reconnect to the VMBus without rebooting the whole system.
+ *
+ * However, the hot-added memory can't be removed here as Windows keeps on using
+ * it until the system is restarted, even after disconnecting from the VMBus.
+ */
+static void hv_balloon_vmdev_reset(VMBusDevice *vdev)
+{
+ HvBalloon *balloon = HV_BALLOON(vdev);
+
+ if (balloon->state == S_POST_RESET_CLOSED) {
+ return;
+ }
+
+ if (balloon->our_range) {
+ hvb_our_range_clear_removed_trees(OUR_RANGE(balloon->our_range));
+ }
+
+ hvb_page_range_tree_destroy(&balloon->removed_guest);
+ hvb_page_range_tree_destroy(&balloon->removed_both);
+ hvb_page_range_tree_init(&balloon->removed_guest);
+ hvb_page_range_tree_init(&balloon->removed_both);
+
+ balloon->trans_id = 0;
+ balloon->removed_guest_ctr = 0;
+ balloon->removed_both_ctr = 0;
+
+ HV_BALLOON_SET_STATE(balloon, S_POST_RESET_CLOSED);
+ hv_balloon_event_loop(balloon);
+}
+
+/*
+ * Clean up things that were (possibly) allocated pre-realization, for example
+ * from memory_device_pre_plug(), so we don't leak them if the device doesn't
+ * actually get realized in the end.
+ */
+static void hv_balloon_unrealize_finalize_common(HvBalloon *balloon)
+{
+ hv_balloon_free_mr(balloon);
+ balloon->addr = 0;
+
+ balloon->memslot_count = 0;
+}
+
+static void hv_balloon_vmdev_unrealize(VMBusDevice *vdev)
+{
+ HvBalloon *balloon = HV_BALLOON(vdev);
+
+ qemu_unregister_reset(hv_balloon_system_reset, balloon);
+
+ hv_balloon_system_reset_unrealize_common(balloon);
+
+ qemu_remove_balloon_handler(balloon);
+
+ if (balloon->hostmem) {
+ vmstate_unregister_ram(host_memory_backend_get_memory(balloon->hostmem),
+ DEVICE(balloon));
+ host_memory_backend_set_mapped(balloon->hostmem, false);
+ }
+
+ hvb_page_range_tree_destroy(&balloon->removed_guest);
+ hvb_page_range_tree_destroy(&balloon->removed_both);
+
+ hv_balloon_unrealize_finalize_common(balloon);
+}
+
+static uint64_t hv_balloon_md_get_addr(const MemoryDeviceState *md)
+{
+ return object_property_get_uint(OBJECT(md), HV_BALLOON_ADDR_PROP,
+ &error_abort);
+}
+
+static void hv_balloon_md_set_addr(MemoryDeviceState *md, uint64_t addr,
+ Error **errp)
+{
+ object_property_set_uint(OBJECT(md), HV_BALLOON_ADDR_PROP, addr, errp);
+}
+
+static MemoryRegion *hv_balloon_md_get_memory_region(MemoryDeviceState *md,
+ Error **errp)
+{
+ HvBalloon *balloon = HV_BALLOON(md);
+
+ if (!balloon->hostmem) {
+ return NULL;
+ }
+
+ hv_balloon_ensure_mr(balloon);
+
+ return balloon->mr;
+}
+
+static void hv_balloon_md_fill_device_info(const MemoryDeviceState *md,
+ MemoryDeviceInfo *info)
+{
+ HvBalloonDeviceInfo *hi = g_new0(HvBalloonDeviceInfo, 1);
+ const HvBalloon *balloon = HV_BALLOON(md);
+ DeviceState *dev = DEVICE(md);
+
+ if (dev->id) {
+ hi->id = g_strdup(dev->id);
+ }
+
+ if (balloon->hostmem) {
+ hi->memdev = object_get_canonical_path(OBJECT(balloon->hostmem));
+ hi->memaddr = balloon->addr;
+ hi->has_memaddr = true;
+ hi->max_size = memory_region_size(balloon->mr);
+ /* TODO: expose current provided size or something else? */
+ } else {
+ hi->max_size = 0;
+ }
+
+ info->u.hv_balloon.data = hi;
+ info->type = MEMORY_DEVICE_INFO_KIND_HV_BALLOON;
+}
+
+static void hv_balloon_decide_memslots(MemoryDeviceState *md,
+ unsigned int limit)
+{
+ HvBalloon *balloon = HV_BALLOON(md);
+ MemoryRegion *hostmem_mr;
+ uint64_t region_size, memslot_size, memslots;
+
+ /* We're called exactly once, before realizing the device. */
+ assert(!balloon->memslot_count);
+
+ /* We should not be called if we don't have a memory backend */
+ assert(balloon->hostmem);
+
+ hostmem_mr = host_memory_backend_get_memory(balloon->hostmem);
+ region_size = memory_region_size(hostmem_mr);
+
+ assert(region_size > 0);
+ memslot_size = QEMU_ALIGN_UP(region_size / limit,
+ HV_BALLOON_HA_MEMSLOT_SIZE_ALIGN);
+ memslots = QEMU_ALIGN_UP(region_size, memslot_size) / memslot_size;
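+ /*
+ * For example, with an 8-memslot limit a 30 GiB backend is covered by
+ * 8 memslots of roughly 3.75 GiB each (rounded up to the memslot size
+ * alignment).
+ */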
+
+ if (memslots > 1) {
+ balloon->memslot_size = memslot_size;
+ } else {
+ balloon->memslot_size = region_size;
+ }
+
+ assert(memslots <= UINT_MAX);
+ balloon->memslot_count = memslots;
+}
+
+static unsigned int hv_balloon_get_memslots(MemoryDeviceState *md)
+{
+ const HvBalloon *balloon = HV_BALLOON(md);
+
+ /* We're called after setting the suggested limit. */
+ assert(balloon->memslot_count > 0);
+
+ return balloon->memslot_count;
+}
+
+static void hv_balloon_init(Object *obj)
+{
+}
+
+static void hv_balloon_finalize(Object *obj)
+{
+ HvBalloon *balloon = HV_BALLOON(obj);
+
+ hv_balloon_unrealize_finalize_common(balloon);
+}
+
+static Property hv_balloon_properties[] = {
+ DEFINE_PROP_BOOL("status-report", HvBalloon,
+ status_report.enabled, false),
+
+ /* MEMORY_DEVICE props */
+ DEFINE_PROP_LINK(HV_BALLOON_MEMDEV_PROP, HvBalloon, hostmem,
+ TYPE_MEMORY_BACKEND, HostMemoryBackend *),
+ DEFINE_PROP_UINT64(HV_BALLOON_ADDR_PROP, HvBalloon, addr, 0),
+
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void hv_balloon_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VMBusDeviceClass *vdc = VMBUS_DEVICE_CLASS(klass);
+ MemoryDeviceClass *mdc = MEMORY_DEVICE_CLASS(klass);
+
+ device_class_set_props(dc, hv_balloon_properties);
+ qemu_uuid_parse(HV_BALLOON_GUID, &vdc->classid);
+ set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+
+ vdc->vmdev_realize = hv_balloon_vmdev_realize;
+ vdc->vmdev_unrealize = hv_balloon_vmdev_unrealize;
+ vdc->vmdev_reset = hv_balloon_vmdev_reset;
+ vdc->open_channel = hv_balloon_vmdev_open_channel;
+ vdc->close_channel = hv_balloon_vmdev_close_channel;
+ vdc->chan_notify_cb = hv_balloon_vmdev_chan_notify;
+
+ mdc->get_addr = hv_balloon_md_get_addr;
+ mdc->set_addr = hv_balloon_md_set_addr;
+ mdc->get_plugged_size = memory_device_get_region_size;
+ mdc->get_memory_region = hv_balloon_md_get_memory_region;
+ mdc->decide_memslots = hv_balloon_decide_memslots;
+ mdc->get_memslots = hv_balloon_get_memslots;
+ mdc->fill_device_info = hv_balloon_md_fill_device_info;
+}
specific_ss.add(when: 'CONFIG_HYPERV_TESTDEV', if_true: files('hyperv_testdev.c'))
specific_ss.add(when: 'CONFIG_VMBUS', if_true: files('vmbus.c'))
specific_ss.add(when: 'CONFIG_SYNDBG', if_true: files('syndbg.c'))
+specific_ss.add(when: 'CONFIG_HV_BALLOON', if_true: files('hv-balloon.c', 'hv-balloon-page_range_tree.c', 'hv-balloon-our_range_memslots.c'), if_false: files('hv-balloon-stub.c'))
vmbus_open_channel(uint32_t chan_id, uint32_t gpadl_id, uint32_t target_vp) "channel #%d gpadl #%d target vp %d"
vmbus_channel_open(uint32_t chan_id, uint32_t status) "channel #%d status %d"
vmbus_close_channel(uint32_t chan_id) "channel #%d"
+
+# hv-balloon
+hv_balloon_state_change(const char *tostr) "-> %s"
+hv_balloon_incoming_version(uint16_t major, uint16_t minor) "incoming proto version %u.%u"
+hv_balloon_incoming_caps(uint32_t caps) "incoming caps 0x%x"
+hv_balloon_outgoing_unballoon(uint32_t trans_id, uint64_t count, uint64_t start, uint64_t rempages) "posting unballoon %"PRIu32" for %"PRIu64" @ 0x%"PRIx64", remaining %"PRIu64
+hv_balloon_incoming_unballoon(uint32_t trans_id) "incoming unballoon response %"PRIu32
+hv_balloon_outgoing_hot_add(uint32_t trans_id, uint64_t count, uint64_t start) "posting hot add %"PRIu32" for %"PRIu64" @ 0x%"PRIx64
+hv_balloon_incoming_hot_add(uint32_t trans_id, uint32_t result, uint32_t count) "incoming hot add response %"PRIu32", result %"PRIu32", count %"PRIu32
+hv_balloon_outgoing_balloon(uint32_t trans_id, uint64_t count, uint64_t rempages) "posting balloon %"PRIu32" for %"PRIu64", remaining %"PRIu64
+hv_balloon_incoming_balloon(uint32_t trans_id, uint32_t range_count, uint32_t more_pages) "incoming balloon response %"PRIu32", ranges %"PRIu32", more %"PRIu32
+hv_balloon_our_range_add(uint64_t count, uint64_t start) "adding our range %"PRIu64" @ 0x%"PRIx64
+hv_balloon_remove_response(uint64_t count, uint64_t start, unsigned int both) "processing remove response range %"PRIu64" @ 0x%"PRIx64", both %u"
+hv_balloon_remove_response_hole(uint64_t counthole, uint64_t starthole, uint64_t countrange, uint64_t startrange, uint64_t starthpr, unsigned int both) "response range hole %"PRIu64" @ 0x%"PRIx64" from range %"PRIu64" @ 0x%"PRIx64", before our start 0x%"PRIx64", both %u"
+hv_balloon_remove_response_common(uint64_t countcommon, uint64_t startcommon, uint64_t countrange, uint64_t startrange, uint64_t counthpr, uint64_t starthpr, uint64_t removed, unsigned int both) "response common range %"PRIu64" @ 0x%"PRIx64" from range %"PRIu64" @ 0x%"PRIx64" with our %"PRIu64" @ 0x%"PRIx64", removed %"PRIu64", both %u"
+hv_balloon_remove_response_remainder(uint64_t count, uint64_t start, unsigned int both) "remove response remaining range %"PRIu64" @ 0x%"PRIx64", both %u"
+hv_balloon_map_slot(unsigned int idx, unsigned int total_slots, uint64_t offset) "mapping memslot %u / %u @ 0x%"PRIx64
+hv_balloon_unmap_slot(unsigned int idx, unsigned int total_slots, uint64_t offset) "unmapping memslot %u / %u @ 0x%"PRIx64
VMBus *vmbus = VMBUS(qdev_get_parent_bus(dev));
BusChild *child;
Error *err = NULL;
- char idstr[UUID_FMT_LEN + 1];
+ char idstr[UUID_STR_LEN];
assert(!qemu_uuid_is_null(&vdev->instanceid));
static char *vmbus_get_fw_dev_path(DeviceState *dev)
{
VMBusDevice *vdev = VMBUS_DEVICE(dev);
- char uuid[UUID_FMT_LEN + 1];
+ char uuid[UUID_STR_LEN];
qemu_uuid_unparse(&vdev->instanceid, uuid);
return g_strdup_printf("%s@%s", qdev_fw_name(dev), uuid);
select ACPI_VMGENID
select VIRTIO_PMEM_SUPPORTED
select VIRTIO_MEM_SUPPORTED
+ select HV_BALLOON_SUPPORTED
config PC_PCI
bool
return &iommu_as[devfn]->as;
}
+static const PCIIOMMUOps amdvi_iommu_ops = {
+ .get_address_space = amdvi_host_dma_iommu,
+};
+
static const MemoryRegionOps mmio_mem_ops = {
.read = amdvi_mmio_read,
.write = amdvi_mmio_write,
AMDVI_MMIO_SIZE);
memory_region_add_subregion(get_system_memory(), AMDVI_BASE_ADDR,
&s->mmio);
- pci_setup_iommu(bus, amdvi_host_dma_iommu, s);
+ pci_setup_iommu(bus, &amdvi_iommu_ops, s);
amdvi_init(s);
}
* Rsvd field masks for spte:
* vtd_spte_rsvd 4k pages
* vtd_spte_rsvd_large large pages
+ *
+ * We support only 3-level and 4-level page tables (see vtd_init() which
+ * sets only VTD_CAP_SAGAW_39bit and maybe VTD_CAP_SAGAW_48bit bits in s->cap).
*/
-static uint64_t vtd_spte_rsvd[5];
-static uint64_t vtd_spte_rsvd_large[5];
+#define VTD_SPTE_RSVD_LEN 5
+static uint64_t vtd_spte_rsvd[VTD_SPTE_RSVD_LEN];
+static uint64_t vtd_spte_rsvd_large[VTD_SPTE_RSVD_LEN];
static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level)
{
- uint64_t rsvd_mask = vtd_spte_rsvd[level];
+ uint64_t rsvd_mask;
+
+ /*
+ * We should have caught a guest-mis-programmed level earlier,
+ * via vtd_is_level_supported.
+ */
+ assert(level < VTD_SPTE_RSVD_LEN);
+ /*
+ * Zero level doesn't exist. The smallest level is VTD_SL_PT_LEVEL=1 and
+ * checked by vtd_is_last_slpte().
+ */
+ assert(level);
if ((level == VTD_SL_PD_LEVEL || level == VTD_SL_PDP_LEVEL) &&
(slpte & VTD_SL_PT_PAGE_SIZE_MASK)) {
/* large page */
rsvd_mask = vtd_spte_rsvd_large[level];
+ } else {
+ rsvd_mask = vtd_spte_rsvd[level];
}
return slpte & rsvd_mask;
return &vtd_as->as;
}
+static const PCIIOMMUOps vtd_iommu_ops = {
+ .get_address_space = vtd_host_dma_iommu,
+};
+
static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
{
X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
s->vtd_address_spaces = g_hash_table_new_full(vtd_as_hash, vtd_as_equal,
g_free, g_free);
vtd_init(s);
- pci_setup_iommu(bus, vtd_host_dma_iommu, dev);
+ pci_setup_iommu(bus, &vtd_iommu_ops, dev);
/* Pseudo address space under root PCI bus. */
x86ms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC);
qemu_add_machine_init_done_notifier(&vtd_machine_done_notify);
break;
}
+ /* If the guest has set a per-vCPU callback vector, prefer that. */
+ if (gsi && kvm_xen_has_vcpu_callback_vector()) {
+ in_kernel = kvm_xen_has_cap(EVTCHN_SEND);
+ gsi = 0;
+ }
+
if (!ret) {
/* If vector delivery was turned *off* then tell the kernel */
if ((s->callback_param >> CALLBACK_VIA_TYPE_SHIFT) ==
return -ESRCH;
}
+ QEMU_IOTHREAD_LOCK_GUARD();
return xen_evtchn_soft_reset();
}
s->entries.v1[GNTTAB_RESERVED_XENSTORE].flags = GTF_permit_access;
s->entries.v1[GNTTAB_RESERVED_XENSTORE].frame = XEN_SPECIAL_PFN(XENSTORE);
- memset(s->map_track, 0, s->max_frames * ENTRIES_PER_FRAME_V1);
-
return 0;
}
} else {
deliver_watch(s, path, token);
/*
- * If the message was queued because there was already ring activity,
- * no need to wake the guest. But if not, we need to send the evtchn.
+ * Attempt to queue the message into the actual ring, and send
+ * the event channel notification if any bytes are copied.
*/
- xen_be_evtchn_notify(s->eh, s->be_port);
+ if (s->rsp_pending && put_rsp(s) > 0) {
+ xen_be_evtchn_notify(s->eh, s->be_port);
+ }
}
}
#include "hw/i386/pc.h"
#include "hw/char/serial.h"
#include "hw/char/parallel.h"
+#include "hw/hyperv/hv-balloon.h"
#include "hw/i386/fw_cfg.h"
#include "hw/i386/vmport.h"
#include "sysemu/cpus.h"
#include "hw/i386/kvm/xen_evtchn.h"
#include "hw/i386/kvm/xen_gnttab.h"
#include "hw/i386/kvm/xen_xenstore.h"
+#include "hw/mem/memory-device.h"
#include "e820_memory_layout.h"
#include "trace.h"
#include CONFIG_DEVICES
error_propagate(errp, local_err);
}
+static void pc_hv_balloon_pre_plug(HotplugHandler *hotplug_dev,
+ DeviceState *dev, Error **errp)
+{
+ /* The vmbus handler has no hotplug handler; we should never end up here. */
+ g_assert(!dev->hotplugged);
+ memory_device_pre_plug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev), NULL,
+ errp);
+}
+
+static void pc_hv_balloon_plug(HotplugHandler *hotplug_dev,
+ DeviceState *dev, Error **errp)
+{
+ memory_device_plug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev));
+}
+
static void pc_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev,
DeviceState *dev, Error **errp)
{
return;
}
pcms->iommu = dev;
+ } else if (object_dynamic_cast(OBJECT(dev), TYPE_HV_BALLOON)) {
+ pc_hv_balloon_pre_plug(hotplug_dev, dev, errp);
}
}
x86_cpu_plug(hotplug_dev, dev, errp);
} else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) {
virtio_md_pci_plug(VIRTIO_MD_PCI(dev), MACHINE(hotplug_dev), errp);
+ } else if (object_dynamic_cast(OBJECT(dev), TYPE_HV_BALLOON)) {
+ pc_hv_balloon_plug(hotplug_dev, dev, errp);
}
}
object_dynamic_cast(OBJECT(dev), TYPE_CPU) ||
object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI) ||
object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI) ||
+ object_dynamic_cast(OBJECT(dev), TYPE_HV_BALLOON) ||
object_dynamic_cast(OBJECT(dev), TYPE_X86_IOMMU_DEVICE)) {
return HOTPLUG_HANDLER(machine);
}
#include "exec/address-spaces.h"
#include "trace.h"
+static bool memory_device_is_empty(const MemoryDeviceState *md)
+{
+ const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
+ Error *local_err = NULL;
+ MemoryRegion *mr;
+
+ /* dropping const here is fine as we don't touch the memory region */
+ mr = mdc->get_memory_region((MemoryDeviceState *)md, &local_err);
+ if (local_err) {
+ /* Not empty; we'll report errors later when obtaining the MR again. */
+ error_free(local_err);
+ return false;
+ }
+ return !mr;
+}
+
static gint memory_device_addr_sort(gconstpointer a, gconstpointer b)
{
const MemoryDeviceState *md_a = MEMORY_DEVICE(a);
return 0;
}
- if (!QEMU_IS_ALIGNED(size, align)) {
- error_setg(errp, "backend memory size must be multiple of 0x%"
- PRIx64, align);
- return 0;
- }
-
if (hint) {
if (range_init(&new, *hint, size) || !range_contains_range(&as, &new)) {
error_setg(errp, "can't add memory device [0x%" PRIx64 ":0x%" PRIx64
uint64_t next_addr;
Range tmp;
+ if (memory_device_is_empty(md)) {
+ continue;
+ }
+
range_init_nofail(&tmp, mdc->get_addr(md),
memory_device_get_region_size(md, &error_abort));
const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(item->data);
MemoryDeviceInfo *info = g_new0(MemoryDeviceInfo, 1);
+ /* Let's query information even for empty memory devices. */
mdc->fill_device_info(md, info);
QAPI_LIST_APPEND(tail, info);
const MemoryDeviceState *md = MEMORY_DEVICE(obj);
const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(obj);
- if (dev->realized) {
+ if (dev->realized && !memory_device_is_empty(md)) {
*size += mdc->get_plugged_size(md, &error_abort);
}
}
uint64_t addr, align = 0;
MemoryRegion *mr;
+ /* We support empty memory devices even without device memory. */
+ if (memory_device_is_empty(md)) {
+ return;
+ }
+
if (!ms->device_memory) {
error_setg(errp, "the configuration is not prepared for memory devices"
" (e.g., for memory hotplug), consider specifying the"
void memory_device_plug(MemoryDeviceState *md, MachineState *ms)
{
const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
- const unsigned int memslots = memory_device_get_memslots(md);
- const uint64_t addr = mdc->get_addr(md);
+ unsigned int memslots;
+ uint64_t addr;
MemoryRegion *mr;
+ if (memory_device_is_empty(md)) {
+ return;
+ }
+
+ memslots = memory_device_get_memslots(md);
+ addr = mdc->get_addr(md);
+
/*
* We expect that a previous call to memory_device_pre_plug() succeeded, so
* it can't fail at this point.
const unsigned int memslots = memory_device_get_memslots(md);
MemoryRegion *mr;
+ if (memory_device_is_empty(md)) {
+ return;
+ }
+
/*
* We expect that a previous call to memory_device_pre_plug() succeeded, so
* it can't fail at this point.
return &s->astro->iommu_as;
}
+static const PCIIOMMUOps elroy_pcihost_iommu_ops = {
+ .get_address_space = elroy_pcihost_set_iommu,
+};
+
/*
* Encoding in IOSAPIC:
* base_addr == 0xfffa0000, we want to get 0xa0ff0000.
&elroy->pci_io);
/* Host memory as seen from the PCI side, via the IOMMU. */
- pci_setup_iommu(PCI_HOST_BRIDGE(elroy)->bus, elroy_pcihost_set_iommu,
+ pci_setup_iommu(PCI_HOST_BRIDGE(elroy)->bus, &elroy_pcihost_iommu_ops,
elroy);
}
}
return &s->pci.address_space;
}
+static const PCIIOMMUOps designware_iommu_ops = {
+ .get_address_space = designware_pcie_host_set_iommu,
+};
+
static void designware_pcie_host_realize(DeviceState *dev, Error **errp)
{
PCIHostState *pci = PCI_HOST_BRIDGE(dev);
address_space_init(&s->pci.address_space,
&s->pci.address_space_root,
"pcie-bus-address-space");
- pci_setup_iommu(pci->bus, designware_pcie_host_set_iommu, s);
+ pci_setup_iommu(pci->bus, &designware_iommu_ops, s);
qdev_realize(DEVICE(&s->root), BUS(pci->bus), &error_fatal);
}
return &s->bm_as;
}
+static const PCIIOMMUOps dino_iommu_ops = {
+ .get_address_space = dino_pcihost_set_iommu,
+};
+
/*
* Dino interrupts are connected as shown on Page 78, Table 23
* (Little-endian bit numbers)
g_free(name);
}
- pci_setup_iommu(phb->bus, dino_pcihost_set_iommu, s);
+ pci_setup_iommu(phb->bus, &dino_iommu_ops, s);
sysbus_init_mmio(sbd, &s->this_mem);
return &ds->dma_as;
}
+static const PCIIOMMUOps pnv_phb3_iommu_ops = {
+ .get_address_space = pnv_phb3_dma_iommu,
+};
+
static void pnv_phb3_instance_init(Object *obj)
{
PnvPHB3 *phb = PNV_PHB3(obj);
object_property_set_int(OBJECT(pci->bus), "chip-id", phb->chip_id,
&error_abort);
- pci_setup_iommu(pci->bus, pnv_phb3_dma_iommu, phb);
+ pci_setup_iommu(pci->bus, &pnv_phb3_iommu_ops, phb);
}
static void pnv_phb3_realize(DeviceState *dev, Error **errp)
&phb->phb_regs_mr);
}
+static const PCIIOMMUOps pnv_phb4_iommu_ops = {
+ .get_address_space = pnv_phb4_dma_iommu,
+};
+
static void pnv_phb4_instance_init(Object *obj)
{
PnvPHB4 *phb = PNV_PHB4(obj);
object_property_set_int(OBJECT(pci->bus), "chip-id", phb->chip_id,
&error_abort);
- pci_setup_iommu(pci->bus, pnv_phb4_dma_iommu, phb);
+ pci_setup_iommu(pci->bus, &pnv_phb4_iommu_ops, phb);
pci->bus->flags |= PCI_BUS_EXTENDED_CONFIG_SPACE;
}
return &s->bm_as;
}
+static const PCIIOMMUOps ppce500_iommu_ops = {
+ .get_address_space = e500_pcihost_set_iommu,
+};
+
static void e500_pcihost_realize(DeviceState *dev, Error **errp)
{
SysBusDevice *sbd = SYS_BUS_DEVICE(dev);
memory_region_init(&s->bm, OBJECT(s), "bm-e500", UINT64_MAX);
memory_region_add_subregion(&s->bm, 0x0, &s->busmem);
address_space_init(&s->bm_as, &s->bm, "pci-bm");
- pci_setup_iommu(b, e500_pcihost_set_iommu, s);
+ pci_setup_iommu(b, &ppce500_iommu_ops, s);
pci_create_simple(b, 0, "e500-host-bridge");
return &s->bm_as;
}
+static const PCIIOMMUOps raven_iommu_ops = {
+ .get_address_space = raven_pcihost_set_iommu,
+};
+
static void raven_change_gpio(void *opaque, int n, int level)
{
PREPPCIState *s = opaque;
memory_region_add_subregion(&s->bm, 0 , &s->bm_pci_memory_alias);
memory_region_add_subregion(&s->bm, 0x80000000, &s->bm_ram_alias);
address_space_init(&s->bm_as, &s->bm, "raven-bm");
- pci_setup_iommu(&s->pci_bus, raven_pcihost_set_iommu, s);
+ pci_setup_iommu(&s->pci_bus, &raven_iommu_ops, s);
h->bus = &s->pci_bus;
return &is->iommu_as;
}
+static const PCIIOMMUOps sabre_iommu_ops = {
+ .get_address_space = sabre_pci_dma_iommu,
+};
+
static void sabre_config_write(void *opaque, hwaddr addr,
uint64_t val, unsigned size)
{
/* IOMMU */
memory_region_add_subregion_overlap(&s->sabre_config, 0x200,
sysbus_mmio_get_region(SYS_BUS_DEVICE(s->iommu), 0), 1);
- pci_setup_iommu(phb->bus, sabre_pci_dma_iommu, s->iommu);
+ pci_setup_iommu(phb->bus, &sabre_iommu_ops, s->iommu);
/* APB secondary busses */
pci_dev = pci_new_multifunction(PCI_DEVFN(1, 0), TYPE_SIMBA_PCI_BRIDGE);
PCIBus *iommu_bus = bus;
uint8_t devfn = dev->devfn;
- while (iommu_bus && !iommu_bus->iommu_fn && iommu_bus->parent_dev) {
+ while (iommu_bus && !iommu_bus->iommu_ops && iommu_bus->parent_dev) {
PCIBus *parent_bus = pci_get_bus(iommu_bus->parent_dev);
/*
iommu_bus = parent_bus;
}
- if (!pci_bus_bypass_iommu(bus) && iommu_bus && iommu_bus->iommu_fn) {
- return iommu_bus->iommu_fn(bus, iommu_bus->iommu_opaque, devfn);
+ if (!pci_bus_bypass_iommu(bus) && iommu_bus->iommu_ops) {
+ return iommu_bus->iommu_ops->get_address_space(bus,
+ iommu_bus->iommu_opaque, devfn);
}
return &address_space_memory;
}
-void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque)
+void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *ops, void *opaque)
{
- bus->iommu_fn = fn;
+ /*
+ * If called, pci_setup_iommu() should provide a minimum set of
+ * useful callbacks for the bus.
+ */
+ assert(ops);
+ assert(ops->get_address_space);
+
+ bus->iommu_ops = ops;
bus->iommu_opaque = opaque;
}
return &s->bm_as;
}
+static const PCIIOMMUOps ppc440_iommu_ops = {
+ .get_address_space = ppc440_pcix_set_iommu,
+};
+
/*
* Some guests on sam460ex write all kinds of garbage here such as
* missing enable bit and low bits set and still expect this to work
memory_region_init(&s->bm, OBJECT(s), "bm-ppc440-pcix", UINT64_MAX);
memory_region_add_subregion(&s->bm, 0x0, &s->busmem);
address_space_init(&s->bm_as, &s->bm, "pci-bm");
- pci_setup_iommu(h->bus, ppc440_pcix_set_iommu, s);
+ pci_setup_iommu(h->bus, &ppc440_iommu_ops, s);
memory_region_init(&s->container, OBJECT(s), "pci-container", PCI_ALL_SIZE);
memory_region_init_io(&h->conf_mem, OBJECT(s), &ppc440_pcix_host_conf_ops,
return &phb->iommu_as;
}
+static const PCIIOMMUOps spapr_iommu_ops = {
+ .get_address_space = spapr_pci_dma_iommu,
+};
+
static char *spapr_phb_vfio_get_loc_code(SpaprPhbState *sphb, PCIDevice *pdev)
{
g_autofree char *path = NULL;
memory_region_add_subregion(&sphb->iommu_root, SPAPR_PCI_MSI_WINDOW,
&sphb->msiwindow);
- pci_setup_iommu(bus, spapr_pci_dma_iommu, sphb);
+ pci_setup_iommu(bus, &spapr_iommu_ops, sphb);
pci_bus_set_route_irq_fn(bus, spapr_route_intx_pin_to_irq);
*/
#include "qemu/osdep.h"
+#include <sys/ioctl.h>
#include <linux/vfio.h>
#include "hw/ppc/spapr.h"
#include "hw/pci-host/spapr.h"
#include "hw/pci/msix.h"
#include "hw/pci/pci_device.h"
-#include "hw/vfio/vfio.h"
+#include "hw/vfio/vfio-common.h"
#include "qemu/error-report.h"
+/*
+ * Interfaces for IBM EEH (Enhanced Error Handling)
+ */
+static bool vfio_eeh_container_ok(VFIOContainer *container)
+{
+ /*
+ * As of 2016-03-04 (linux-4.5) the host kernel EEH/VFIO
+ * implementation is broken if there are multiple groups in a
+ * container. The hardware works in units of Partitionable
+ * Endpoints (== IOMMU groups) and the EEH operations naively
+ * iterate across all groups in the container, without any logic
+ * to make sure the groups have their state synchronized. For
+ * certain operations (ENABLE) that might be ok, until an error
+ * occurs, but for others (GET_STATE) it's clearly broken.
+ */
+
+ /*
+ * XXX Once fixed kernels exist, test for them here
+ */
+
+ if (QLIST_EMPTY(&container->group_list)) {
+ return false;
+ }
+
+ if (QLIST_NEXT(QLIST_FIRST(&container->group_list), container_next)) {
+ return false;
+ }
+
+ return true;
+}
+
+static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op)
+{
+ struct vfio_eeh_pe_op pe_op = {
+ .argsz = sizeof(pe_op),
+ .op = op,
+ };
+ int ret;
+
+ if (!vfio_eeh_container_ok(container)) {
+ error_report("vfio/eeh: EEH_PE_OP 0x%x: "
+ "kernel requires a container with exactly one group", op);
+ return -EPERM;
+ }
+
+ ret = ioctl(container->fd, VFIO_EEH_PE_OP, &pe_op);
+ if (ret < 0) {
+ error_report("vfio/eeh: EEH_PE_OP 0x%x failed: %m", op);
+ return -errno;
+ }
+
+ return ret;
+}
+
+static VFIOContainer *vfio_eeh_as_container(AddressSpace *as)
+{
+ VFIOAddressSpace *space = vfio_get_address_space(as);
+ VFIOContainer *container = NULL;
+
+ if (QLIST_EMPTY(&space->containers)) {
+ /* No containers to act on */
+ goto out;
+ }
+
+ container = QLIST_FIRST(&space->containers);
+
+ if (QLIST_NEXT(container, next)) {
+ /*
+ * We don't yet have logic to synchronize EEH state across
+ * multiple containers
+ */
+ container = NULL;
+ goto out;
+ }
+
+out:
+ vfio_put_address_space(space);
+ return container;
+}
+
+static bool vfio_eeh_as_ok(AddressSpace *as)
+{
+ VFIOContainer *container = vfio_eeh_as_container(as);
+
+ return (container != NULL) && vfio_eeh_container_ok(container);
+}
+
+static int vfio_eeh_as_op(AddressSpace *as, uint32_t op)
+{
+ VFIOContainer *container = vfio_eeh_as_container(as);
+
+ if (!container) {
+ return -ENODEV;
+ }
+ return vfio_eeh_container_op(container, op);
+}
+
bool spapr_phb_eeh_available(SpaprPhbState *sphb)
{
return vfio_eeh_as_ok(&sphb->iommu_as);
iommu->elem_by_devfn = NULL;
}
+static const PCIIOMMUOps remote_iommu_ops = {
+ .get_address_space = remote_iommu_find_add_as,
+};
+
void remote_iommu_setup(PCIBus *pci_bus)
{
RemoteIommu *iommu = NULL;
iommu = REMOTE_IOMMU(object_new(TYPE_REMOTE_IOMMU));
- pci_setup_iommu(pci_bus, remote_iommu_find_add_as, iommu);
+ pci_setup_iommu(pci_bus, &remote_iommu_ops, iommu);
object_property_add_child(OBJECT(pci_bus), "remote-iommu", OBJECT(iommu));
static void rtc_set_time(MC146818RtcState *s)
{
- struct tm tm;
+ struct tm tm = {};
g_autofree const char *qom_path = object_get_canonical_path(OBJECT(s));
rtc_get_time(s, &tm);
return &iommu->as;
}
+static const PCIIOMMUOps s390_iommu_ops = {
+ .get_address_space = s390_pci_dma_iommu,
+};
+
static uint8_t set_ind_atomic(uint64_t ind_loc, uint8_t to_be_set)
{
uint8_t expected, actual;
b = pci_register_root_bus(dev, NULL, s390_pci_set_irq, s390_pci_map_irq,
NULL, get_system_memory(), get_system_io(), 0,
64, TYPE_PCI_BUS);
- pci_setup_iommu(b, s390_pci_dma_iommu, s);
+ pci_setup_iommu(b, &s390_iommu_ops, s);
bus = BUS(b);
qbus_set_hotplug_handler(bus, OBJECT(dev));
pdev = PCI_DEVICE(dev);
pci_bridge_map_irq(pb, dev->id, s390_pci_map_irq);
- pci_setup_iommu(&pb->sec_bus, s390_pci_dma_iommu, s);
+ pci_setup_iommu(&pb->sec_bus, &s390_iommu_ops, s);
qbus_set_hotplug_handler(BUS(&pb->sec_bus), OBJECT(s));
#include <linux/vfio.h>
#include <sys/ioctl.h>
#include "qapi/error.h"
-#include "hw/vfio/vfio.h"
#include "hw/vfio/vfio-common.h"
#include "hw/s390x/ap-device.h"
#include "qemu/error-report.h"
#include <sys/ioctl.h>
#include "qapi/error.h"
-#include "hw/vfio/vfio.h"
#include "hw/vfio/vfio-common.h"
#include "hw/s390x/s390-ccw.h"
#include "hw/s390x/vfio-ccw.h"
#include <linux/vfio.h>
#include "hw/vfio/vfio-common.h"
-#include "hw/vfio/vfio.h"
#include "hw/vfio/pci.h"
#include "exec/address-spaces.h"
#include "exec/memory.h"
return true;
}
-void vfio_host_win_add(VFIOContainer *container, hwaddr min_iova,
- hwaddr max_iova, uint64_t iova_pgsizes)
-{
- VFIOHostDMAWindow *hostwin;
-
- QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
- if (ranges_overlap(hostwin->min_iova,
- hostwin->max_iova - hostwin->min_iova + 1,
- min_iova,
- max_iova - min_iova + 1)) {
- hw_error("%s: Overlapped IOMMU are not enabled", __func__);
- }
- }
-
- hostwin = g_malloc0(sizeof(*hostwin));
-
- hostwin->min_iova = min_iova;
- hostwin->max_iova = max_iova;
- hostwin->iova_pgsizes = iova_pgsizes;
- QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next);
-}
-
-int vfio_host_win_del(VFIOContainer *container,
- hwaddr min_iova, hwaddr max_iova)
-{
- VFIOHostDMAWindow *hostwin;
-
- QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
- if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) {
- QLIST_REMOVE(hostwin, hostwin_next);
- g_free(hostwin);
- return 0;
- }
- }
-
- return -1;
-}
-
static bool vfio_listener_skipped_section(MemoryRegionSection *section)
{
return (!memory_region_is_ram(section->mr) &&
g_free(vrdl);
}
-static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container,
- hwaddr iova, hwaddr end)
-{
- VFIOHostDMAWindow *hostwin;
- bool hostwin_found = false;
-
- QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
- if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
- hostwin_found = true;
- break;
- }
- }
-
- return hostwin_found ? hostwin : NULL;
-}
-
static bool vfio_known_safe_misalignment(MemoryRegionSection *section)
{
MemoryRegion *mr = section->mr;
Int128 llend, llsize;
void *vaddr;
int ret;
- VFIOHostDMAWindow *hostwin;
Error *err = NULL;
if (!vfio_listener_valid_section(section, "region_add")) {
goto fail;
}
- hostwin = vfio_find_hostwin(container, iova, end);
- if (!hostwin) {
- error_setg(&err, "Container %p can't map guest IOVA region"
- " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end);
- goto fail;
- }
-
memory_region_ref(section->mr);
if (memory_region_is_iommu(section->mr)) {
goto fail;
}
+ if (container->iova_ranges) {
+ ret = memory_region_iommu_set_iova_ranges(giommu->iommu_mr,
+ container->iova_ranges, &err);
+ if (ret) {
+ g_free(giommu);
+ goto fail;
+ }
+ }
+
ret = memory_region_register_iommu_notifier(section->mr, &giommu->n,
&err);
if (ret) {
llsize = int128_sub(llend, int128_make64(iova));
if (memory_region_is_ram_device(section->mr)) {
- hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
+ hwaddr pgmask = (1ULL << ctz64(container->pgsizes)) - 1;
if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
trace_vfio_listener_region_add_no_dma_map(
if (memory_region_is_ram_device(section->mr)) {
hwaddr pgmask;
- VFIOHostDMAWindow *hostwin;
-
- hostwin = vfio_find_hostwin(container, iova, end);
- assert(hostwin); /* or region_add() would have failed */
- pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
+ pgmask = (1ULL << ctz64(container->pgsizes)) - 1;
try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
} else if (memory_region_has_ram_discard_manager(section->mr)) {
vfio_unregister_ram_discard_listener(container, section);
#include "qemu/osdep.h"
#include <sys/ioctl.h>
-#ifdef CONFIG_KVM
-#include <linux/kvm.h>
-#endif
#include <linux/vfio.h>
#include "hw/vfio/vfio-common.h"
-#include "hw/vfio/vfio.h"
#include "exec/address-spaces.h"
#include "exec/memory.h"
#include "exec/ram_addr.h"
#include "hw/hw.h"
#include "qemu/error-report.h"
#include "qemu/range.h"
-#include "sysemu/kvm.h"
#include "sysemu/reset.h"
#include "trace.h"
#include "qapi/error.h"
return -errno;
}
-int vfio_container_add_section_window(VFIOContainer *container,
- MemoryRegionSection *section,
- Error **errp)
-{
- VFIOHostDMAWindow *hostwin;
- hwaddr pgsize = 0;
- int ret;
-
- if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
- return 0;
- }
-
- /* For now intersections are not allowed, we may relax this later */
- QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
- if (ranges_overlap(hostwin->min_iova,
- hostwin->max_iova - hostwin->min_iova + 1,
- section->offset_within_address_space,
- int128_get64(section->size))) {
- error_setg(errp,
- "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing"
- "host DMA window [0x%"PRIx64",0x%"PRIx64"]",
- section->offset_within_address_space,
- section->offset_within_address_space +
- int128_get64(section->size) - 1,
- hostwin->min_iova, hostwin->max_iova);
- return -EINVAL;
- }
- }
-
- ret = vfio_spapr_create_window(container, section, &pgsize);
- if (ret) {
- error_setg_errno(errp, -ret, "Failed to create SPAPR window");
- return ret;
- }
-
- vfio_host_win_add(container, section->offset_within_address_space,
- section->offset_within_address_space +
- int128_get64(section->size) - 1, pgsize);
-#ifdef CONFIG_KVM
- if (kvm_enabled()) {
- VFIOGroup *group;
- IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
- struct kvm_vfio_spapr_tce param;
- struct kvm_device_attr attr = {
- .group = KVM_DEV_VFIO_GROUP,
- .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
- .addr = (uint64_t)(unsigned long)¶m,
- };
-
- if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD,
- ¶m.tablefd)) {
- QLIST_FOREACH(group, &container->group_list, container_next) {
- param.groupfd = group->fd;
- if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
- error_setg_errno(errp, errno,
- "vfio: failed GROUP_SET_SPAPR_TCE for "
- "KVM VFIO device %d and group fd %d",
- param.tablefd, param.groupfd);
- return -errno;
- }
- trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
- }
- }
- }
-#endif
- return 0;
-}
-
-void vfio_container_del_section_window(VFIOContainer *container,
- MemoryRegionSection *section)
-{
- if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
- return;
- }
-
- vfio_spapr_remove_window(container,
- section->offset_within_address_space);
- if (vfio_host_win_del(container,
- section->offset_within_address_space,
- section->offset_within_address_space +
- int128_get64(section->size) - 1) < 0) {
- hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
- __func__, section->offset_within_address_space);
- }
-}
-
int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
{
int ret;
return ret;
}
-static void vfio_listener_release(VFIOContainer *container)
-{
- memory_listener_unregister(&container->listener);
- if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
- memory_listener_unregister(&container->prereg_listener);
- }
-}
-
static struct vfio_info_cap_header *
vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
{
/* If the capability cannot be found, assume no DMA limiting */
hdr = vfio_get_iommu_type1_info_cap(info,
VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL);
- if (hdr == NULL) {
+ if (!hdr) {
return false;
}
return true;
}
+static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info,
+ VFIOContainer *container)
+{
+ struct vfio_info_cap_header *hdr;
+ struct vfio_iommu_type1_info_cap_iova_range *cap;
+
+ hdr = vfio_get_iommu_type1_info_cap(info,
+ VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE);
+ if (!hdr) {
+ return false;
+ }
+
+ cap = (void *)hdr;
+
+ for (int i = 0; i < cap->nr_iovas; i++) {
+ Range *range = g_new(Range, 1);
+
+ range_set_bounds(range, cap->iova_ranges[i].start,
+ cap->iova_ranges[i].end);
+ container->iova_ranges =
+ range_list_insert(container->iova_ranges, range);
+ }
+
+ return true;
+}
+
static void vfio_kvm_device_add_group(VFIOGroup *group)
{
Error *err = NULL;
}
}
+static void vfio_free_container(VFIOContainer *container)
+{
+ g_list_free_full(container->iova_ranges, g_free);
+ g_free(container);
+}
+
static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
Error **errp)
{
container->error = NULL;
container->dirty_pages_supported = false;
container->dma_max_mappings = 0;
+ container->iova_ranges = NULL;
QLIST_INIT(&container->giommu_list);
- QLIST_INIT(&container->hostwin_list);
QLIST_INIT(&container->vrdl_list);
ret = vfio_init_container(container, group->fd, errp);
if (!vfio_get_info_dma_avail(info, &container->dma_max_mappings)) {
container->dma_max_mappings = 65535;
}
- vfio_get_iommu_info_migration(container, info);
- g_free(info);
- /*
- * FIXME: We should parse VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE
- * information to get the actual window extent rather than assume
- * a 64-bit IOVA address space.
- */
- vfio_host_win_add(container, 0, (hwaddr)-1, container->pgsizes);
+ vfio_get_info_iova_range(info, container);
+ vfio_get_iommu_info_migration(container, info);
+ g_free(info);
break;
}
case VFIO_SPAPR_TCE_v2_IOMMU:
case VFIO_SPAPR_TCE_IOMMU:
{
- struct vfio_iommu_spapr_tce_info info;
- bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;
-
- /*
- * The host kernel code implementing VFIO_IOMMU_DISABLE is called
- * when container fd is closed so we do not call it explicitly
- * in this file.
- */
- if (!v2) {
- ret = ioctl(fd, VFIO_IOMMU_ENABLE);
- if (ret) {
- error_setg_errno(errp, errno, "failed to enable container");
- ret = -errno;
- goto enable_discards_exit;
- }
- } else {
- container->prereg_listener = vfio_prereg_listener;
-
- memory_listener_register(&container->prereg_listener,
- &address_space_memory);
- if (container->error) {
- memory_listener_unregister(&container->prereg_listener);
- ret = -1;
- error_propagate_prepend(errp, container->error,
- "RAM memory listener initialization failed: ");
- goto enable_discards_exit;
- }
- }
-
- info.argsz = sizeof(info);
- ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
+ ret = vfio_spapr_container_init(container, errp);
if (ret) {
- error_setg_errno(errp, errno,
- "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
- ret = -errno;
- if (v2) {
- memory_listener_unregister(&container->prereg_listener);
- }
goto enable_discards_exit;
}
-
- if (v2) {
- container->pgsizes = info.ddw.pgsizes;
- /*
- * There is a default window in just created container.
- * To make region_add/del simpler, we better remove this
- * window now and let those iommu_listener callbacks
- * create/remove them when needed.
- */
- ret = vfio_spapr_remove_window(container, info.dma32_window_start);
- if (ret) {
- error_setg_errno(errp, -ret,
- "failed to remove existing window");
- goto enable_discards_exit;
- }
- } else {
- /* The default table uses 4K pages */
- container->pgsizes = 0x1000;
- vfio_host_win_add(container, info.dma32_window_start,
- info.dma32_window_start +
- info.dma32_window_size - 1,
- 0x1000);
- }
+ break;
}
}
QLIST_REMOVE(group, container_next);
QLIST_REMOVE(container, next);
vfio_kvm_device_del_group(group);
- vfio_listener_release(container);
+ memory_listener_unregister(&container->listener);
+ if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU ||
+ container->iommu_type == VFIO_SPAPR_TCE_IOMMU) {
+ vfio_spapr_container_deinit(container);
+ }
enable_discards_exit:
vfio_ram_block_discard_disable(container, false);
free_container_exit:
- g_free(container);
+ vfio_free_container(container);
close_fd_exit:
close(fd);
* group.
*/
if (QLIST_EMPTY(&container->group_list)) {
- vfio_listener_release(container);
+ memory_listener_unregister(&container->listener);
+ if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU ||
+ container->iommu_type == VFIO_SPAPR_TCE_IOMMU) {
+ vfio_spapr_container_deinit(container);
+ }
}
if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
if (QLIST_EMPTY(&container->group_list)) {
VFIOAddressSpace *space = container->space;
VFIOGuestIOMMU *giommu, *tmp;
- VFIOHostDMAWindow *hostwin, *next;
QLIST_REMOVE(container, next);
g_free(giommu);
}
- QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next,
- next) {
- QLIST_REMOVE(hostwin, hostwin_next);
- g_free(hostwin);
- }
-
trace_vfio_disconnect_container(container->fd);
close(container->fd);
- g_free(container);
+ vfio_free_container(container);
vfio_put_address_space(space);
}
close(vbasedev->fd);
}
-/*
- * Interfaces for IBM EEH (Enhanced Error Handling)
- */
-static bool vfio_eeh_container_ok(VFIOContainer *container)
-{
- /*
- * As of 2016-03-04 (linux-4.5) the host kernel EEH/VFIO
- * implementation is broken if there are multiple groups in a
- * container. The hardware works in units of Partitionable
- * Endpoints (== IOMMU groups) and the EEH operations naively
- * iterate across all groups in the container, without any logic
- * to make sure the groups have their state synchronized. For
- * certain operations (ENABLE) that might be ok, until an error
- * occurs, but for others (GET_STATE) it's clearly broken.
- */
-
- /*
- * XXX Once fixed kernels exist, test for them here
- */
-
- if (QLIST_EMPTY(&container->group_list)) {
- return false;
- }
-
- if (QLIST_NEXT(QLIST_FIRST(&container->group_list), container_next)) {
- return false;
- }
-
- return true;
-}
-
-static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op)
-{
- struct vfio_eeh_pe_op pe_op = {
- .argsz = sizeof(pe_op),
- .op = op,
- };
- int ret;
-
- if (!vfio_eeh_container_ok(container)) {
- error_report("vfio/eeh: EEH_PE_OP 0x%x: "
- "kernel requires a container with exactly one group", op);
- return -EPERM;
- }
-
- ret = ioctl(container->fd, VFIO_EEH_PE_OP, &pe_op);
- if (ret < 0) {
- error_report("vfio/eeh: EEH_PE_OP 0x%x failed: %m", op);
- return -errno;
- }
-
- return ret;
-}
-
-static VFIOContainer *vfio_eeh_as_container(AddressSpace *as)
-{
- VFIOAddressSpace *space = vfio_get_address_space(as);
- VFIOContainer *container = NULL;
-
- if (QLIST_EMPTY(&space->containers)) {
- /* No containers to act on */
- goto out;
- }
-
- container = QLIST_FIRST(&space->containers);
-
- if (QLIST_NEXT(container, next)) {
- /*
- * We don't yet have logic to synchronize EEH state across
- * multiple containers
- */
- container = NULL;
- goto out;
- }
-
-out:
- vfio_put_address_space(space);
- return container;
-}
-
-bool vfio_eeh_as_ok(AddressSpace *as)
-{
- VFIOContainer *container = vfio_eeh_as_container(as);
-
- return (container != NULL) && vfio_eeh_container_ok(container);
-}
-
-int vfio_eeh_as_op(AddressSpace *as, uint32_t op)
-{
- VFIOContainer *container = vfio_eeh_as_container(as);
-
- if (!container) {
- return -ENODEV;
- }
- return vfio_eeh_container_op(container, op);
-}
-
static int vfio_device_groupid(VFIODevice *vbasedev, Error **errp)
{
char *tmp, group_path[PATH_MAX], *group_name;
#include <sys/ioctl.h>
#include "hw/vfio/vfio-common.h"
-#include "hw/vfio/vfio.h"
#include "hw/hw.h"
#include "trace.h"
#include "qapi/error.h"
struct stat st;
int i, ret;
bool is_mdev;
- char uuid[UUID_FMT_LEN];
+ char uuid[UUID_STR_LEN];
char *name;
if (!vbasedev->sysfsdev) {
#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include <linux/vfio.h>
+#ifdef CONFIG_KVM
+#include <linux/kvm.h>
+#endif
+#include "sysemu/kvm.h"
+#include "exec/address-spaces.h"
#include "hw/vfio/vfio-common.h"
#include "hw/hw.h"
trace_vfio_prereg_unregister(reg.vaddr, reg.size, ret ? -errno : 0);
}
-const MemoryListener vfio_prereg_listener = {
+static const MemoryListener vfio_prereg_listener = {
.name = "vfio-pre-reg",
.region_add = vfio_prereg_listener_region_add,
.region_del = vfio_prereg_listener_region_del,
};
-int vfio_spapr_create_window(VFIOContainer *container,
- MemoryRegionSection *section,
- hwaddr *pgsize)
+static void vfio_host_win_add(VFIOContainer *container, hwaddr min_iova,
+ hwaddr max_iova, uint64_t iova_pgsizes)
+{
+ VFIOHostDMAWindow *hostwin;
+
+ QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
+ if (ranges_overlap(hostwin->min_iova,
+ hostwin->max_iova - hostwin->min_iova + 1,
+ min_iova,
+ max_iova - min_iova + 1)) {
+ hw_error("%s: Overlapped IOMMU are not enabled", __func__);
+ }
+ }
+
+ hostwin = g_malloc0(sizeof(*hostwin));
+
+ hostwin->min_iova = min_iova;
+ hostwin->max_iova = max_iova;
+ hostwin->iova_pgsizes = iova_pgsizes;
+ QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next);
+}
+
+static int vfio_host_win_del(VFIOContainer *container,
+ hwaddr min_iova, hwaddr max_iova)
+{
+ VFIOHostDMAWindow *hostwin;
+
+ QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
+ if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) {
+ QLIST_REMOVE(hostwin, hostwin_next);
+ g_free(hostwin);
+ return 0;
+ }
+ }
+
+ return -1;
+}
+
+static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container,
+ hwaddr iova, hwaddr end)
+{
+ VFIOHostDMAWindow *hostwin;
+ bool hostwin_found = false;
+
+ QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
+ if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
+ hostwin_found = true;
+ break;
+ }
+ }
+
+ return hostwin_found ? hostwin : NULL;
+}
+
+static int vfio_spapr_remove_window(VFIOContainer *container,
+ hwaddr offset_within_address_space)
+{
+ struct vfio_iommu_spapr_tce_remove remove = {
+ .argsz = sizeof(remove),
+ .start_addr = offset_within_address_space,
+ };
+ int ret;
+
+ ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
+ if (ret) {
+ error_report("Failed to remove window at %"PRIx64,
+ (uint64_t)remove.start_addr);
+ return -errno;
+ }
+
+ trace_vfio_spapr_remove_window(offset_within_address_space);
+
+ return 0;
+}
+
+static int vfio_spapr_create_window(VFIOContainer *container,
+ MemoryRegionSection *section,
+ hwaddr *pgsize)
{
int ret = 0;
IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
return 0;
}
-int vfio_spapr_remove_window(VFIOContainer *container,
- hwaddr offset_within_address_space)
+int vfio_container_add_section_window(VFIOContainer *container,
+ MemoryRegionSection *section,
+ Error **errp)
{
- struct vfio_iommu_spapr_tce_remove remove = {
- .argsz = sizeof(remove),
- .start_addr = offset_within_address_space,
- };
+ VFIOHostDMAWindow *hostwin;
+ hwaddr pgsize = 0;
int ret;
- ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
+ /*
+ * VFIO_SPAPR_TCE_IOMMU supports a single host window between
+ * [dma32_window_start, dma32_window_start + dma32_window_size), so
+ * we need to ensure the section falls within this range.
+ */
+ if (container->iommu_type == VFIO_SPAPR_TCE_IOMMU) {
+ hwaddr iova, end;
+
+ iova = section->offset_within_address_space;
+ end = iova + int128_get64(section->size) - 1;
+
+ if (!vfio_find_hostwin(container, iova, end)) {
+ error_setg(errp, "Container %p can't map guest IOVA region"
+ " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container,
+ iova, end);
+ return -EINVAL;
+ }
+ return 0;
+ }
+
+ if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
+ return 0;
+ }
+
+ /* For now, intersections are not allowed; we may relax this later */
+ QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
+ if (ranges_overlap(hostwin->min_iova,
+ hostwin->max_iova - hostwin->min_iova + 1,
+ section->offset_within_address_space,
+ int128_get64(section->size))) {
+ error_setg(errp,
+ "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing"
+ "host DMA window [0x%"PRIx64",0x%"PRIx64"]",
+ section->offset_within_address_space,
+ section->offset_within_address_space +
+ int128_get64(section->size) - 1,
+ hostwin->min_iova, hostwin->max_iova);
+ return -EINVAL;
+ }
+ }
+
+ ret = vfio_spapr_create_window(container, section, &pgsize);
if (ret) {
- error_report("Failed to remove window at %"PRIx64,
- (uint64_t)remove.start_addr);
- return -errno;
+ error_setg_errno(errp, -ret, "Failed to create SPAPR window");
+ return ret;
}
- trace_vfio_spapr_remove_window(offset_within_address_space);
+ vfio_host_win_add(container, section->offset_within_address_space,
+ section->offset_within_address_space +
+ int128_get64(section->size) - 1, pgsize);
+#ifdef CONFIG_KVM
+ if (kvm_enabled()) {
+ VFIOGroup *group;
+ IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
+ struct kvm_vfio_spapr_tce param;
+ struct kvm_device_attr attr = {
+ .group = KVM_DEV_VFIO_GROUP,
+ .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
+ .addr = (uint64_t)(unsigned long)¶m,
+ };
+
+ if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD,
+ ¶m.tablefd)) {
+ QLIST_FOREACH(group, &container->group_list, container_next) {
+ param.groupfd = group->fd;
+ if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
+ error_setg_errno(errp, errno,
+ "vfio: failed GROUP_SET_SPAPR_TCE for "
+ "KVM VFIO device %d and group fd %d",
+ param.tablefd, param.groupfd);
+ return -errno;
+ }
+ trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
+ }
+ }
+ }
+#endif
+ return 0;
+}
+
+void vfio_container_del_section_window(VFIOContainer *container,
+ MemoryRegionSection *section)
+{
+ if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
+ return;
+ }
+
+ vfio_spapr_remove_window(container,
+ section->offset_within_address_space);
+ if (vfio_host_win_del(container,
+ section->offset_within_address_space,
+ section->offset_within_address_space +
+ int128_get64(section->size) - 1) < 0) {
+ hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
+ __func__, section->offset_within_address_space);
+ }
+}
+
+int vfio_spapr_container_init(VFIOContainer *container, Error **errp)
+{
+ struct vfio_iommu_spapr_tce_info info;
+ bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;
+ int ret, fd = container->fd;
+
+ QLIST_INIT(&container->hostwin_list);
+
+ /*
+ * The host kernel code implementing VFIO_IOMMU_DISABLE is called
+ * when container fd is closed so we do not call it explicitly
+ * in this file.
+ */
+ if (!v2) {
+ ret = ioctl(fd, VFIO_IOMMU_ENABLE);
+ if (ret) {
+ error_setg_errno(errp, errno, "failed to enable container");
+ return -errno;
+ }
+ } else {
+ container->prereg_listener = vfio_prereg_listener;
+
+ memory_listener_register(&container->prereg_listener,
+ &address_space_memory);
+ if (container->error) {
+ ret = -1;
+ error_propagate_prepend(errp, container->error,
+ "RAM memory listener initialization failed: ");
+ goto listener_unregister_exit;
+ }
+ }
+
+ info.argsz = sizeof(info);
+ ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
+ if (ret) {
+ error_setg_errno(errp, errno,
+ "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
+ ret = -errno;
+ goto listener_unregister_exit;
+ }
+
+ if (v2) {
+ container->pgsizes = info.ddw.pgsizes;
+ /*
+ * A just-created container has a default window. To make
+ * region_add/del simpler, we remove this window now and let
+ * the iommu_listener callbacks create/remove windows as
+ * needed.
+ */
+ ret = vfio_spapr_remove_window(container, info.dma32_window_start);
+ if (ret) {
+ error_setg_errno(errp, -ret,
+ "failed to remove existing window");
+ goto listener_unregister_exit;
+ }
+ } else {
+ /* The default table uses 4K pages */
+ container->pgsizes = 0x1000;
+ vfio_host_win_add(container, info.dma32_window_start,
+ info.dma32_window_start +
+ info.dma32_window_size - 1,
+ 0x1000);
+ }
return 0;
+
+listener_unregister_exit:
+ if (v2) {
+ memory_listener_unregister(&container->prereg_listener);
+ }
+ return ret;
+}
+
+void vfio_spapr_container_deinit(VFIOContainer *container)
+{
+ VFIOHostDMAWindow *hostwin, *next;
+
+ if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
+ memory_listener_unregister(&container->prereg_listener);
+ }
+ QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next,
+ next) {
+ QLIST_REMOVE(hostwin, hostwin_next);
+ g_free(hostwin);
+ }
}
virtio_iommu_notify_flag_del(const char *name) "del notifier from mr %s"
virtio_iommu_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)"
virtio_iommu_freeze_granule(uint64_t page_size_mask) "granule set to 0x%"PRIx64
+virtio_iommu_host_resv_regions(const char *name, uint32_t index, uint64_t lob, uint64_t upb) "mr=%s host-resv-reg[%d] = [0x%"PRIx64",0x%"PRIx64"]"
# virtio-mem.c
virtio_mem_send_response(uint16_t type) "type=%" PRIu16
static Property virtio_iommu_pci_properties[] = {
DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0),
DEFINE_PROP_ARRAY("reserved-regions", VirtIOIOMMUPCI,
- vdev.nb_reserved_regions, vdev.reserved_regions,
+ vdev.nr_prop_resv_regions, vdev.prop_resv_regions,
qdev_prop_reserved_region, ReservedRegion),
DEFINE_PROP_END_OF_LIST(),
};
"for the virtio-iommu-pci device");
return;
}
- for (int i = 0; i < s->nb_reserved_regions; i++) {
- if (s->reserved_regions[i].type != VIRTIO_IOMMU_RESV_MEM_T_RESERVED &&
- s->reserved_regions[i].type != VIRTIO_IOMMU_RESV_MEM_T_MSI) {
+ for (int i = 0; i < s->nr_prop_resv_regions; i++) {
+ if (s->prop_resv_regions[i].type != VIRTIO_IOMMU_RESV_MEM_T_RESERVED &&
+ s->prop_resv_regions[i].type != VIRTIO_IOMMU_RESV_MEM_T_MSI) {
error_setg(errp, "reserved region %d has an invalid type", i);
error_append_hint(errp, "Valid values are 0 and 1\n");
return;
#include "qemu/osdep.h"
#include "qemu/log.h"
#include "qemu/iov.h"
+#include "qemu/range.h"
+#include "qemu/reserved-region.h"
#include "exec/target_page.h"
#include "hw/qdev-properties.h"
#include "hw/virtio/virtio.h"
#include "sysemu/kvm.h"
#include "sysemu/reset.h"
#include "sysemu/sysemu.h"
+#include "qemu/reserved-region.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "trace.h"
g_free(domain);
}
+static void add_prop_resv_regions(IOMMUDevice *sdev)
+{
+ VirtIOIOMMU *s = sdev->viommu;
+ int i;
+
+ for (i = 0; i < s->nr_prop_resv_regions; i++) {
+ ReservedRegion *reg = g_new0(ReservedRegion, 1);
+
+ *reg = s->prop_resv_regions[i];
+ sdev->resv_regions = resv_region_list_insert(sdev->resv_regions, reg);
+ }
+}
+
static AddressSpace *virtio_iommu_find_add_as(PCIBus *bus, void *opaque,
int devfn)
{
memory_region_init(&sdev->root, OBJECT(s), name, UINT64_MAX);
address_space_init(&sdev->as, &sdev->root, TYPE_VIRTIO_IOMMU);
+ add_prop_resv_regions(sdev);
/*
* Build the IOMMU disabled container with aliases to the
return &sdev->as;
}
+static const PCIIOMMUOps virtio_iommu_ops = {
+ .get_address_space = virtio_iommu_find_add_as,
+};
+
static int virtio_iommu_attach(VirtIOIOMMU *s,
struct virtio_iommu_req_attach *req)
{
return ret;
}
-static ssize_t virtio_iommu_fill_resv_mem_prop(VirtIOIOMMU *s, uint32_t ep,
+static ssize_t virtio_iommu_fill_resv_mem_prop(IOMMUDevice *sdev, uint32_t ep,
uint8_t *buf, size_t free)
{
struct virtio_iommu_probe_resv_mem prop = {};
size_t size = sizeof(prop), length = size - sizeof(prop.head), total;
- int i;
-
- total = size * s->nb_reserved_regions;
+ GList *l;
+ total = size * g_list_length(sdev->resv_regions);
if (total > free) {
return -ENOSPC;
}
- for (i = 0; i < s->nb_reserved_regions; i++) {
- unsigned subtype = s->reserved_regions[i].type;
+ for (l = sdev->resv_regions; l; l = l->next) {
+ ReservedRegion *reg = l->data;
+ unsigned subtype = reg->type;
+ Range *range = ®->range;
assert(subtype == VIRTIO_IOMMU_RESV_MEM_T_RESERVED ||
subtype == VIRTIO_IOMMU_RESV_MEM_T_MSI);
prop.head.type = cpu_to_le16(VIRTIO_IOMMU_PROBE_T_RESV_MEM);
prop.head.length = cpu_to_le16(length);
prop.subtype = subtype;
- prop.start = cpu_to_le64(s->reserved_regions[i].low);
- prop.end = cpu_to_le64(s->reserved_regions[i].high);
+ prop.start = cpu_to_le64(range_lob(range));
+ prop.end = cpu_to_le64(range_upb(range));
memcpy(buf, &prop, size);
uint8_t *buf)
{
uint32_t ep_id = le32_to_cpu(req->endpoint);
+ IOMMUMemoryRegion *iommu_mr = virtio_iommu_mr(s, ep_id);
size_t free = VIOMMU_PROBE_SIZE;
+ IOMMUDevice *sdev;
ssize_t count;
- if (!virtio_iommu_mr(s, ep_id)) {
+ if (!iommu_mr) {
return VIRTIO_IOMMU_S_NOENT;
}
- count = virtio_iommu_fill_resv_mem_prop(s, ep_id, buf, free);
+ sdev = container_of(iommu_mr, IOMMUDevice, iommu_mr);
+ if (!sdev) {
+ return -EINVAL;
+ }
+
+ count = virtio_iommu_fill_resv_mem_prop(sdev, ep_id, buf, free);
if (count < 0) {
return VIRTIO_IOMMU_S_INVAL;
}
buf += count;
free -= count;
+ sdev->probe_done = true;
return VIRTIO_IOMMU_S_OK;
}
bool bypass_allowed;
int granule;
bool found;
- int i;
+ GList *l;
interval.low = addr;
interval.high = addr + 1;
goto unlock;
}
- for (i = 0; i < s->nb_reserved_regions; i++) {
- ReservedRegion *reg = &s->reserved_regions[i];
+ for (l = sdev->resv_regions; l; l = l->next) {
+ ReservedRegion *reg = l->data;
- if (addr >= reg->low && addr <= reg->high) {
+ if (range_contains(®->range, addr)) {
switch (reg->type) {
case VIRTIO_IOMMU_RESV_MEM_T_MSI:
entry.perm = flag;
return 0;
}
+/**
+ * rebuild_resv_regions: rebuild the reserved regions from both the
+ * host reserved ranges and the property-set reserved ranges
+ */
+static int rebuild_resv_regions(IOMMUDevice *sdev)
+{
+ GList *l;
+ int i = 0;
+
+ /* free the existing list and rebuild it from scratch */
+ g_list_free_full(sdev->resv_regions, g_free);
+ sdev->resv_regions = NULL;
+
+ /* First add host reserved regions if any, all tagged as RESERVED */
+ for (l = sdev->host_resv_ranges; l; l = l->next) {
+ ReservedRegion *reg = g_new0(ReservedRegion, 1);
+ Range *r = (Range *)l->data;
+
+ reg->type = VIRTIO_IOMMU_RESV_MEM_T_RESERVED;
+ range_set_bounds(®->range, range_lob(r), range_upb(r));
+ sdev->resv_regions = resv_region_list_insert(sdev->resv_regions, reg);
+ trace_virtio_iommu_host_resv_regions(sdev->iommu_mr.parent_obj.name, i,
+ range_lob(®->range),
+ range_upb(®->range));
+ i++;
+ }
+ /*
+ * then add higher priority reserved regions set by the machine
+ * through properties
+ */
+ add_prop_resv_regions(sdev);
+ return 0;
+}
+
+/**
+ * virtio_iommu_set_iova_ranges: Conveys the usable IOVA ranges
+ *
+ * The function turns those into reserved ranges. Once some
+ * reserved ranges have been set, new reserved regions cannot be
+ * added outside of the original ones.
+ *
+ * @mr: IOMMU MR
+ * @iova_ranges: list of usable IOVA ranges
+ * @errp: pointer to Error*, to store an error if it happens.
+ */
+static int virtio_iommu_set_iova_ranges(IOMMUMemoryRegion *mr,
+ GList *iova_ranges,
+ Error **errp)
+{
+ IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr);
+ GList *current_ranges = sdev->host_resv_ranges;
+ GList *l, *tmp, *new_ranges = NULL;
+ int ret = -EINVAL;
+
+ /* check that each new resv region is included in an existing one */
+ if (sdev->host_resv_ranges) {
+ range_inverse_array(iova_ranges,
+ &new_ranges,
+ 0, UINT64_MAX);
+
+ for (tmp = new_ranges; tmp; tmp = tmp->next) {
+ Range *newr = (Range *)tmp->data;
+ bool included = false;
+
+ for (l = current_ranges; l; l = l->next) {
+ Range * r = (Range *)l->data;
+
+ if (range_contains_range(r, newr)) {
+ included = true;
+ break;
+ }
+ }
+ if (!included) {
+ goto error;
+ }
+ }
+ /* all new reserved ranges are included in existing ones */
+ ret = 0;
+ goto out;
+ }
+
+ if (sdev->probe_done) {
+ warn_report("%s: Notified about new host reserved regions after probe",
+ mr->parent_obj.name);
+ }
+
+ range_inverse_array(iova_ranges,
+ &sdev->host_resv_ranges,
+ 0, UINT64_MAX);
+ rebuild_resv_regions(sdev);
+
+ return 0;
+error:
+ error_setg(errp, "IOMMU mr=%s Conflicting host reserved ranges set!",
+ mr->parent_obj.name);
+out:
+ g_list_free_full(new_ranges, g_free);
+ return ret;
+}
+
static void virtio_iommu_system_reset(void *opaque)
{
VirtIOIOMMU *s = opaque;
s->as_by_busptr = g_hash_table_new_full(NULL, NULL, NULL, g_free);
if (s->primary_bus) {
- pci_setup_iommu(s->primary_bus, virtio_iommu_find_add_as, s);
+ pci_setup_iommu(s->primary_bus, &virtio_iommu_ops, s);
} else {
error_setg(errp, "VIRTIO-IOMMU is not attached to any PCI bus!");
}
imrc->replay = virtio_iommu_replay;
imrc->notify_flag_changed = virtio_iommu_notify_flag_changed;
imrc->iommu_set_page_size_mask = virtio_iommu_set_page_size_mask;
+ imrc->iommu_set_iova_ranges = virtio_iommu_set_iova_ranges;
}
static const TypeInfo virtio_iommu_info = {
static MemoryRegion *virtio_pmem_get_memory_region(VirtIOPMEM *pmem,
Error **errp)
{
- assert(pmem->memdev);
+ if (!pmem->memdev) {
+ error_setg(errp, "'%s' property must be set", VIRTIO_PMEM_MEMDEV_PROP);
+ return NULL;
+ }
return &pmem->memdev->mr;
}
#include "qemu/bswap.h"
#include "qemu/queue.h"
#include "qemu/int128.h"
+#include "qemu/range.h"
#include "qemu/notify.h"
#include "qom/object.h"
#include "qemu/rcu.h"
typedef struct MemoryRegionOps MemoryRegionOps;
struct ReservedRegion {
- hwaddr low;
- hwaddr high;
+ Range range;
unsigned type;
};
int (*iommu_set_page_size_mask)(IOMMUMemoryRegion *iommu,
uint64_t page_size_mask,
Error **errp);
+ /**
+ * @iommu_set_iova_ranges:
+ *
+ * Propagate information about the usable IOVA ranges for a given IOMMU
+ * memory region. Used for example to propagate host physical device
+ * reserved memory region constraints to the virtual IOMMU.
+ *
+ * Optional method: if this method is not provided, then the default IOVA
+ * aperture is used.
+ *
+ * @iommu: the IOMMUMemoryRegion
+ *
+ * @iova_ranges: list of ordered IOVA ranges (at least one range)
+ *
+ * Returns 0 on success, or a negative error. In case of failure, the error
+ * object must be created.
+ */
+ int (*iommu_set_iova_ranges)(IOMMUMemoryRegion *iommu,
+ GList *iova_ranges,
+ Error **errp);
};
typedef struct RamDiscardListener RamDiscardListener;
uint64_t page_size_mask,
Error **errp);
+/**
+ * memory_region_iommu_set_iova_ranges - Set the usable IOVA ranges
+ * for a given IOMMU MR region
+ *
+ * @iommu: IOMMU memory region
+ * @iova_ranges: list of ordered IOVA ranges (at least one range)
+ * @errp: pointer to Error*, to store an error if it happens.
+ */
+int memory_region_iommu_set_iova_ranges(IOMMUMemoryRegion *iommu,
+ GList *iova_ranges,
+ Error **errp);
+
/**
* memory_region_name: get a memory region's name
*
--- /dev/null
+#ifndef HW_HYPERV_DYNMEM_PROTO_H
+#define HW_HYPERV_DYNMEM_PROTO_H
+
+/*
+ * Hyper-V Dynamic Memory Protocol definitions
+ *
+ * Copyright (C) 2020-2023 Oracle and/or its affiliates.
+ *
+ * Based on drivers/hv/hv_balloon.c from Linux kernel:
+ * Copyright (c) 2012, Microsoft Corporation.
+ *
+ * Author: K. Y. Srinivasan <kys@microsoft.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*
+ * Protocol versions. The low word is the minor version, the high word the major
+ * version.
+ *
+ * History:
+ * Initial version 1.0
+ * Changed to 0.1 on 2009/03/25
+ * Changed to 0.2 on 2009/05/14
+ * Changed to 0.3 on 2009/12/03
+ * Changed to 1.0 on 2011/04/05
+ * Changed to 2.0 on 2019/12/10
+ */
+
+#define DYNMEM_MAKE_VERSION(Major, Minor) ((uint32_t)(((Major) << 16) | (Minor)))
+#define DYNMEM_MAJOR_VERSION(Version) ((uint32_t)(Version) >> 16)
+#define DYNMEM_MINOR_VERSION(Version) ((uint32_t)(Version) & 0xff)
+
+enum {
+ DYNMEM_PROTOCOL_VERSION_1 = DYNMEM_MAKE_VERSION(0, 3),
+ DYNMEM_PROTOCOL_VERSION_2 = DYNMEM_MAKE_VERSION(1, 0),
+ DYNMEM_PROTOCOL_VERSION_3 = DYNMEM_MAKE_VERSION(2, 0),
+
+ DYNMEM_PROTOCOL_VERSION_WIN7 = DYNMEM_PROTOCOL_VERSION_1,
+ DYNMEM_PROTOCOL_VERSION_WIN8 = DYNMEM_PROTOCOL_VERSION_2,
+ DYNMEM_PROTOCOL_VERSION_WIN10 = DYNMEM_PROTOCOL_VERSION_3,
+
+ DYNMEM_PROTOCOL_VERSION_CURRENT = DYNMEM_PROTOCOL_VERSION_WIN10
+};
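As a sketch only (restating the macros above so the snippet is self-contained), the encoding can be checked like this; note that DYNMEM_MINOR_VERSION masks only the low byte, as defined above, so minor numbers of 256 and up would not round-trip:

    #include <assert.h>
    #include <stdint.h>

    #define DYNMEM_MAKE_VERSION(Major, Minor) \
        ((uint32_t)(((Major) << 16) | (Minor)))
    #define DYNMEM_MAJOR_VERSION(Version) ((uint32_t)(Version) >> 16)
    #define DYNMEM_MINOR_VERSION(Version) ((uint32_t)(Version) & 0xff)

    int main(void)
    {
        /* DYNMEM_PROTOCOL_VERSION_3, i.e. protocol 2.0 */
        uint32_t v3 = DYNMEM_MAKE_VERSION(2, 0);

        assert(v3 == 0x00020000);               /* major in the high word */
        assert(DYNMEM_MAJOR_VERSION(v3) == 2);
        assert(DYNMEM_MINOR_VERSION(v3) == 0);  /* minor in the low word */
        return 0;
    }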
+
+
+
+/*
+ * Message Types
+ */
+
+enum dm_message_type {
+ /*
+ * Version 0.3
+ */
+ DM_ERROR = 0,
+ DM_VERSION_REQUEST = 1,
+ DM_VERSION_RESPONSE = 2,
+ DM_CAPABILITIES_REPORT = 3,
+ DM_CAPABILITIES_RESPONSE = 4,
+ DM_STATUS_REPORT = 5,
+ DM_BALLOON_REQUEST = 6,
+ DM_BALLOON_RESPONSE = 7,
+ DM_UNBALLOON_REQUEST = 8,
+ DM_UNBALLOON_RESPONSE = 9,
+ DM_MEM_HOT_ADD_REQUEST = 10,
+ DM_MEM_HOT_ADD_RESPONSE = 11,
+ DM_VERSION_03_MAX = 11,
+ /*
+ * Version 1.0.
+ */
+ DM_INFO_MESSAGE = 12,
+ DM_VERSION_1_MAX = 12,
+
+ /*
+ * Version 2.0
+ */
+ DM_MEM_HOT_REMOVE_REQUEST = 13,
+ DM_MEM_HOT_REMOVE_RESPONSE = 14
+};
+
+
+/*
+ * Structures defining the dynamic memory management
+ * protocol.
+ */
+
+union dm_version {
+ struct {
+ uint16_t minor_version;
+ uint16_t major_version;
+ };
+ uint32_t version;
+} QEMU_PACKED;
+
+
+union dm_caps {
+ struct {
+ uint64_t balloon:1;
+ uint64_t hot_add:1;
+ /*
+ * To support guests that may have alignment
+ * limitations on hot-add, the guest can specify
+ * its alignment requirements; a value of n
+ * represents an alignment of 2^n megabytes
+ * (for example, n = 5 requests 32 MiB alignment).
+ */
+ uint64_t hot_add_alignment:4;
+ uint64_t hot_remove:1;
+ uint64_t reservedz:57;
+ } cap_bits;
+ uint64_t caps;
+} QEMU_PACKED;
+
+union dm_mem_page_range {
+ struct {
+ /*
+ * The PFN number of the first page in the range.
+ * 40 bits is the architectural limit of a PFN
+ * number for AMD64.
+ */
+ uint64_t start_page:40;
+ /*
+ * The number of pages in the range.
+ */
+ uint64_t page_cnt:24;
+ } finfo;
+ uint64_t page_range;
+} QEMU_PACKED;
+
+
+
+/*
+ * The header for all dynamic memory messages:
+ *
+ * type: Type of the message.
+ * size: Size of the message in bytes, including the header.
+ * trans_id: The guest is responsible for manufacturing this ID.
+ */
+
+struct dm_header {
+ uint16_t type;
+ uint16_t size;
+ uint32_t trans_id;
+} QEMU_PACKED;
+
+/*
+ * A generic message format for dynamic memory.
+ * Specific message formats are defined later in the file.
+ */
+
+struct dm_message {
+ struct dm_header hdr;
+ uint8_t data[]; /* enclosed message */
+} QEMU_PACKED;
+
+
+/*
+ * Specific message types supporting the dynamic memory protocol.
+ */
+
+/*
+ * Version negotiation message. Sent from the guest to the host.
+ * The guest is free to try different versions until the host
+ * accepts the version.
+ *
+ * dm_version: The protocol version requested.
+ * is_last_attempt: If TRUE, this is the last version the guest will request.
+ * reservedz: Reserved field, set to zero.
+ */
+
+struct dm_version_request {
+ struct dm_header hdr;
+ union dm_version version;
+ uint32_t is_last_attempt:1;
+ uint32_t reservedz:31;
+} QEMU_PACKED;
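A minimal sketch of how these structures compose on the guest side; the helper name is hypothetical and the flat field stores assume guest-native (little-endian) byte order, whereas QEMU's device code converts explicitly when reading such messages:

    #include <stdbool.h>
    #include <stdint.h>
    #include <string.h>

    /* Hypothetical helper: build a DM_VERSION_REQUEST message. */
    static void fill_version_request(struct dm_version_request *req,
                                     uint32_t trans_id, bool last_attempt)
    {
        memset(req, 0, sizeof(*req));
        req->hdr.type = DM_VERSION_REQUEST;
        req->hdr.size = sizeof(*req);         /* size includes the header */
        req->hdr.trans_id = trans_id;         /* manufactured by the guest */
        req->version.version = DYNMEM_PROTOCOL_VERSION_CURRENT;
        req->is_last_attempt = last_attempt;  /* 1: stop after this version */
    }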
+
+/*
+ * Version response message; sent from the host to the guest and
+ * indicates if the host has accepted the version sent by the guest.
+ *
+ * is_accepted: If TRUE, host has accepted the version and the guest
+ * should proceed to the next stage of the protocol. FALSE indicates that
+ * the guest should retry with a different version.
+ *
+ * reservedz: Reserved field, set to zero.
+ */
+
+struct dm_version_response {
+ struct dm_header hdr;
+ uint64_t is_accepted:1;
+ uint64_t reservedz:63;
+} QEMU_PACKED;
+
+/*
+ * Message reporting capabilities. This is sent from the guest to the
+ * host.
+ */
+
+struct dm_capabilities {
+ struct dm_header hdr;
+ union dm_caps caps;
+ uint64_t min_page_cnt;
+ uint64_t max_page_number;
+} QEMU_PACKED;
+
+/*
+ * Response to the capabilities message. This is sent from the host to the
+ * guest. This message notifies if the host has accepted the guest's
+ * capabilities. If the host has not accepted, the guest must shutdown
+ * the service.
+ *
+ * is_accepted: Indicates if the host has accepted the guest's capabilities.
+ * reservedz: Must be 0.
+ */
+
+struct dm_capabilities_resp_msg {
+ struct dm_header hdr;
+ uint64_t is_accepted:1;
+ uint64_t hot_remove:1;
+ uint64_t suppress_pressure_reports:1;
+ uint64_t reservedz:61;
+} QEMU_PACKED;
+
+/*
+ * This message is used to report memory pressure from the guest.
+ * This message is not part of any transaction and there is no
+ * response to this message.
+ *
+ * num_avail: Available memory in pages.
+ * num_committed: Committed memory in pages.
+ * page_file_size: The accumulated size of all page files
+ * in the system in pages.
+ * zero_free: The number of zero and free pages.
+ * page_file_writes: The writes to the page file in pages.
+ * io_diff: An indicator of file cache efficiency or page file activity,
+ * calculated as File Cache Page Fault Count - Page Read Count.
+ * This value is in pages.
+ *
+ * Some of these metrics are Windows specific and fortunately
+ * the algorithm on the host side that computes the guest memory
+ * pressure only uses the num_committed value.
+ */
+
+struct dm_status {
+ struct dm_header hdr;
+ uint64_t num_avail;
+ uint64_t num_committed;
+ uint64_t page_file_size;
+ uint64_t zero_free;
+ uint32_t page_file_writes;
+ uint32_t io_diff;
+} QEMU_PACKED;
+
+
+/*
+ * Message to ask the guest to allocate memory - balloon up message.
+ * This message is sent from the host to the guest. The guest may not be
+ * able to allocate as much memory as requested.
+ *
+ * num_pages: number of pages to allocate.
+ */
+
+struct dm_balloon {
+ struct dm_header hdr;
+ uint32_t num_pages;
+ uint32_t reservedz;
+} QEMU_PACKED;
+
+
+/*
+ * Balloon response message; this message is sent from the guest
+ * to the host in response to the balloon message.
+ *
+ * reservedz: Reserved; must be set to zero.
+ * more_pages: If FALSE, this is the last message of the transaction.
+ * if TRUE, there will be at least one more message from the guest.
+ *
+ * range_count: The number of ranges in the range array.
+ *
+ * range_array: An array of page ranges returned to the host.
+ *
+ */
+
+struct dm_balloon_response {
+ struct dm_header hdr;
+ uint32_t reservedz;
+ uint32_t more_pages:1;
+ uint32_t range_count:31;
+ union dm_mem_page_range range_array[];
+} QEMU_PACKED;
+
+/*
+ * Un-balloon message; this message is sent from the host
+ * to the guest to give the guest more memory.
+ *
+ * more_pages: If FALSE, this is the last message of the transaction.
+ * if TRUE, there will be at least one more message from the guest.
+ *
+ * reservedz: Reserved; must be set to zero.
+ *
+ * range_count: The number of ranges in the range array.
+ *
+ * range_array: An array of page ranges returned to the host.
+ *
+ */
+
+struct dm_unballoon_request {
+ struct dm_header hdr;
+ uint32_t more_pages:1;
+ uint32_t reservedz:31;
+ uint32_t range_count;
+ union dm_mem_page_range range_array[];
+} QEMU_PACKED;
+
+/*
+ * Un-balloon response message; this message is sent from the guest
+ * to the host in response to an unballoon request.
+ *
+ */
+
+struct dm_unballoon_response {
+ struct dm_header hdr;
+} QEMU_PACKED;
+
+
+/*
+ * Hot add request message. Message sent from the host to the guest.
+ *
+ * mem_range: Memory range to hot add.
+ *
+ */
+
+struct dm_hot_add {
+ struct dm_header hdr;
+ union dm_mem_page_range range;
+} QEMU_PACKED;
+
+/*
+ * Hot add response message.
+ * This message is sent by the guest to report the status of a hot add request.
+ * If page_count is less than the requested page count, then the host should
+ * assume all further hot add requests will fail, since this indicates that
+ * the guest has hit an upper physical memory barrier.
+ *
+ * Hot adds may also fail due to low resources; in this case, the guest must
+ * not complete this message until the hot add can succeed, and the host must
+ * not send a new hot add request until the response is sent.
+ * If the VSC fails to hot add memory
+ * DYNMEM_NUMBER_OF_UNSUCCESSFUL_HOTADD_ATTEMPTS times, it fails the request.
+ *
+ *
+ * page_count: number of pages that were successfully hot added.
+ *
+ * result: result of the operation; 1: success, 0: failure.
+ *
+ */
+
+struct dm_hot_add_response {
+ struct dm_header hdr;
+ uint32_t page_count;
+ uint32_t result;
+} QEMU_PACKED;
+
+struct dm_hot_remove {
+ struct dm_header hdr;
+ uint32_t virtual_node;
+ uint32_t page_count;
+ uint32_t qos_flags;
+ uint32_t reservedZ;
+} QEMU_PACKED;
+
+struct dm_hot_remove_response {
+ struct dm_header hdr;
+ uint32_t result;
+ uint32_t range_count;
+ uint64_t more_pages:1;
+ uint64_t reservedz:63;
+ union dm_mem_page_range range_array[];
+} QEMU_PACKED;
+
+#define DM_REMOVE_QOS_LARGE (1 << 0)
+#define DM_REMOVE_QOS_LOCAL (1 << 1)
+#define DM_REMOVE_QOS_MASK (0x3)
+
+/*
+ * Types of information sent from host to the guest.
+ */
+
+enum dm_info_type {
+ INFO_TYPE_MAX_PAGE_CNT = 0,
+ MAX_INFO_TYPE
+};
+
+
+/*
+ * Header for the information message.
+ */
+
+struct dm_info_header {
+ enum dm_info_type type;
+ uint32_t data_size;
+ uint8_t data[];
+} QEMU_PACKED;
+
+/*
+ * This message is sent from the host to the guest to pass
+ * some relevant information (win8 addition).
+ *
+ * reserved: not used.
+ * info_size: size of the information blob.
+ * info: information blob.
+ */
+
+struct dm_info_msg {
+ struct dm_header hdr;
+ uint32_t reserved;
+ uint32_t info_size;
+ uint8_t info[];
+};
+
+#endif
--- /dev/null
+/*
+ * QEMU Hyper-V Dynamic Memory Protocol driver
+ *
+ * Copyright (C) 2020-2023 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef HW_HV_BALLOON_H
+#define HW_HV_BALLOON_H
+
+#include "qom/object.h"
+
+#define TYPE_HV_BALLOON "hv-balloon"
+OBJECT_DECLARE_SIMPLE_TYPE(HvBalloon, HV_BALLOON)
+
+#endif
* address in guest physical memory can either be specified explicitly
* or get assigned automatically.
*
+ * Some memory devices might not own a memory region in certain device
+ * configurations. Such devices can logically get (un)plugged; however,
+ * empty memory devices are mostly ignored by the memory device code.
+ *
* Conceptually, memory devices only span one memory region. If multiple
* successive memory regions are used, a covering memory region has to
* be provided. Scattered memory regions are not supported for single
uint64_t (*get_plugged_size)(const MemoryDeviceState *md, Error **errp);
/*
- * Return the memory region of the memory device.
+ * Return the memory region of the memory device. If the device is
+ * completely empty, returns NULL without an error.
*
* Called when (un)plugging the memory device, to (un)map the
* memory region in guest physical memory, but also to detect the
void pci_device_deassert_intx(PCIDevice *dev);
-typedef AddressSpace *(*PCIIOMMUFunc)(PCIBus *, void *, int);
+
+/**
+ * struct PCIIOMMUOps: callbacks structure for specific IOMMU handlers
+ * of a PCIBus
+ *
+ * Allows modifying the behavior of some IOMMU operations of the PCI
+ * framework for a set of devices on a PCI bus.
+ */
+typedef struct PCIIOMMUOps {
+ /**
+ * @get_address_space: get the address space for a set of devices
+ * on a PCI bus.
+ *
+ * Mandatory callback which returns a pointer to an #AddressSpace
+ *
+ * @bus: the #PCIBus being accessed.
+ *
+ * @opaque: the data passed to pci_setup_iommu().
+ *
+ * @devfn: device and function number
+ */
+ AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn);
+} PCIIOMMUOps;
AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);
-void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque);
+
+/**
+ * pci_setup_iommu: Initialize specific IOMMU handlers for a PCIBus
+ *
+ * Let PCI host bridges define specific operations.
+ *
+ * @bus: the #PCIBus being updated.
+ * @ops: the #PCIIOMMUOps
+ * @opaque: passed to callbacks of the @ops structure.
+ */
+void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *ops, void *opaque);
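A minimal sketch of the new registration pattern; the FooIOMMUState type, its dma_as field and the function names are hypothetical, not taken from any in-tree device:

    static AddressSpace *foo_dma_iommu(PCIBus *bus, void *opaque, int devfn)
    {
        FooIOMMUState *s = opaque;  /* the pointer passed to pci_setup_iommu() */

        return &s->dma_as;          /* per-devfn lookup elided for brevity */
    }

    static const PCIIOMMUOps foo_iommu_ops = {
        .get_address_space = foo_dma_iommu,
    };

    /* At realize time: */
    pci_setup_iommu(bus, &foo_iommu_ops, s);

This mirrors the typhoon, SMMU, s390 and virtio-iommu conversions in this series, which now pass a const ops structure instead of a bare function pointer.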
pcibus_t pci_bar_address(PCIDevice *d,
int reg, uint8_t type, pcibus_t size);
struct PCIBus {
BusState qbus;
enum PCIBusFlags flags;
- PCIIOMMUFunc iommu_fn;
+ const PCIIOMMUOps *iommu_ops;
void *iommu_opaque;
uint8_t devfn_min;
uint32_t slot_reserved_mask;
QLIST_HEAD(, VFIORamDiscardListener) vrdl_list;
QLIST_ENTRY(VFIOContainer) next;
QLIST_HEAD(, VFIODevice) device_list;
+ GList *iova_ranges;
} VFIOContainer;
typedef struct VFIOGuestIOMMU {
hwaddr pages;
} VFIOBitmap;
-void vfio_host_win_add(VFIOContainer *container,
- hwaddr min_iova, hwaddr max_iova,
- uint64_t iova_pgsizes);
-int vfio_host_win_del(VFIOContainer *container, hwaddr min_iova,
- hwaddr max_iova);
VFIOAddressSpace *vfio_get_address_space(AddressSpace *as);
void vfio_put_address_space(VFIOAddressSpace *space);
bool vfio_devices_all_running_and_saving(VFIOContainer *container);
int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap,
hwaddr iova, hwaddr size);
+/* SPAPR specific */
int vfio_container_add_section_window(VFIOContainer *container,
MemoryRegionSection *section,
Error **errp);
void vfio_container_del_section_window(VFIOContainer *container,
MemoryRegionSection *section);
+int vfio_spapr_container_init(VFIOContainer *container, Error **errp);
+void vfio_spapr_container_deinit(VFIOContainer *container);
void vfio_disable_irqindex(VFIODevice *vbasedev, int index);
void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index);
struct vfio_info_cap_header *
vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id);
#endif
-extern const MemoryListener vfio_prereg_listener;
-
-int vfio_spapr_create_window(VFIOContainer *container,
- MemoryRegionSection *section,
- hwaddr *pgsize);
-int vfio_spapr_remove_window(VFIOContainer *container,
- hwaddr offset_within_address_space);
bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp);
void vfio_migration_exit(VFIODevice *vbasedev);
+++ /dev/null
-#ifndef HW_VFIO_H
-#define HW_VFIO_H
-
-bool vfio_eeh_as_ok(AddressSpace *as);
-int vfio_eeh_as_op(AddressSpace *as, uint32_t op);
-
-#endif
AddressSpace as;
MemoryRegion root; /* The root container of the device */
MemoryRegion bypass_mr; /* The alias of shared memory MR */
+ GList *resv_regions;
+ GList *host_resv_ranges;
+ bool probe_done;
} IOMMUDevice;
typedef struct IOMMUPciBus {
GHashTable *as_by_busptr;
IOMMUPciBus *iommu_pcibus_by_bus_num[PCI_BUS_MAX];
PCIBus *primary_bus;
- ReservedRegion *reserved_regions;
- uint32_t nb_reserved_regions;
+ ReservedRegion *prop_resv_regions;
+ uint32_t nr_prop_resv_regions;
GTree *domains;
QemuRecMutex mutex;
GTree *endpoints;
return !(last2 < first1 || last1 < first2);
}
+/*
+ * Return -1 if @a < @b, 1 @a > @b, and 0 if they touch or overlap.
+ * Both @a and @b must not be empty.
+ */
+int range_compare(Range *a, Range *b);
+
GList *range_list_insert(GList *list, Range *data);
+/*
+ * Invert an array of sorted ranges over the [low, high] span, i.e.
+ * the original ranges become holes in the newly allocated out_ranges.
+ */
+void range_inverse_array(GList *in_ranges,
+ GList **out_ranges,
+ uint64_t low, uint64_t high);
+
#endif
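A short usage sketch, assuming "qemu/range.h" and glib are available (it mirrors the first unit test added at the end of this series): one usable range over the full 64-bit span inverts into the hole below it.

    static void example_invert(void)
    {
        GList *usable = NULL, *holes = NULL;
        Range *r = g_new0(Range, 1);

        range_set_bounds(r, 0x10000, UINT64_MAX);
        usable = range_list_insert(usable, r);

        range_inverse_array(usable, &holes, 0, UINT64_MAX);
        /* holes now holds the single range [0x0, 0xFFFF] */

        g_list_free_full(usable, g_free);
        g_list_free_full(holes, g_free);
    }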
--- /dev/null
+/*
+ * QEMU ReservedRegion helpers
+ *
+ * Copyright (c) 2023 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef QEMU_RESERVED_REGION_H
+#define QEMU_RESERVED_REGION_H
+
+#include "exec/memory.h"
+
+/*
+ * Insert a new region into a sorted list of reserved regions. In case
+ * there is overlap with existing regions, the newly added region has
+ * higher priority and replaces the overlapped segment.
+ */
+GList *resv_region_list_insert(GList *list, ReservedRegion *reg);
+
+#endif
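A sketch of the replacement semantics, assuming "exec/memory.h" and glib; the VIRTIO_IOMMU_RESV_MEM_T_* subtypes are used only for illustration. Inserting a region that overlaps an earlier one conceptually trims the earlier region around the overlap:

    static void example_resv_insert(void)
    {
        GList *list = NULL;
        ReservedRegion *a = g_new0(ReservedRegion, 1);
        ReservedRegion *b = g_new0(ReservedRegion, 1);

        a->type = VIRTIO_IOMMU_RESV_MEM_T_RESERVED;
        range_set_bounds(&a->range, 0x0, 0xFFFF);
        list = resv_region_list_insert(list, a);

        b->type = VIRTIO_IOMMU_RESV_MEM_T_MSI;
        range_set_bounds(&b->range, 0x8000, 0xBFFF);
        list = resv_region_list_insert(list, b);
        /* conceptually: [0x0,0x7FFF] RESERVED, [0x8000,0xBFFF] MSI,
         * [0xC000,0xFFFF] RESERVED */
    }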
"%02hhx%02hhx-" \
"%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx"
-#define UUID_FMT_LEN 36
-
#define UUID_NONE "00000000-0000-0000-0000-000000000000"
+QEMU_BUILD_BUG_ON(sizeof(UUID_NONE) - 1 != 36);
+
+#define UUID_STR_LEN sizeof(UUID_NONE)
void qemu_uuid_generate(QemuUUID *out);
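With UUID_STR_LEN counting the trailing NUL (it is the sizeof of the UUID_NONE string literal, hence 37), callers size their buffers without the former "+ 1", as the vfio and migration hunks in this series now do. A small sketch:

    static void example_uuid_str(void)
    {
        QemuUUID uuid;
        char buf[UUID_STR_LEN];

        qemu_uuid_generate(&uuid);
        qemu_uuid_unparse(&uuid, buf);  /* 36 characters plus the NUL */
    }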
int kvm_xen_soft_reset(void);
uint32_t kvm_xen_get_caps(void);
void *kvm_xen_get_vcpu_info_hva(uint32_t vcpu_id);
+bool kvm_xen_has_vcpu_callback_vector(void);
void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, int type);
void kvm_xen_set_callback_asserted(void);
int kvm_xen_set_vcpu_virq(uint32_t vcpu_id, uint16_t virq, uint16_t port);
"Error not from zero copy");
return -1;
}
+ if (serr->ee_data < serr->ee_info) {
+ error_setg_errno(errp, serr->ee_origin,
+ "Wrong notification bounds");
+ return -1;
+ }
/* No errors, count successfully finished sendmsg()*/
sioc->zero_copy_sent += serr->ee_data - serr->ee_info + 1;
endif
endif
+hv_balloon = false
+if get_option('hv_balloon').allowed() and have_system
+ if cc.links('''
+ #include <string.h>
+ #include <gmodule.h>
+ int main(void) {
+ GTree *tree;
+
+ tree = g_tree_new((GCompareFunc)strcmp);
+ (void)g_tree_node_first(tree);
+ g_tree_destroy(tree);
+ return 0;
+ }
+ ''', dependencies: glib)
+ hv_balloon = true
+ else
+ if get_option('hv_balloon').enabled()
+ error('could not enable hv-balloon, update your glib')
+ else
+ warning('could not find glib support for hv-balloon, disabling')
+ endif
+ endif
+endif
+
libssh = not_found
if not get_option('libssh').auto() or have_block
libssh = dependency('libssh', version: '>=0.8.7',
(targetos == 'linux' ? ['CONFIG_LINUX=y'] : []) + \
(have_pvrdma ? ['CONFIG_PVRDMA=y'] : []) + \
(multiprocess_allowed ? ['CONFIG_MULTIPROCESS_ALLOWED=y'] : []) + \
- (vfio_user_server_allowed ? ['CONFIG_VFIO_USER_SERVER_ALLOWED=y'] : [])
+ (vfio_user_server_allowed ? ['CONFIG_VFIO_USER_SERVER_ALLOWED=y'] : []) + \
+ (hv_balloon ? ['CONFIG_HV_BALLOON_POSSIBLE=y'] : [])
ignored = [ 'TARGET_XML_FILES', 'TARGET_ABI_DIR', 'TARGET_ARCH' ]
endif
summary_info += {'seccomp support': seccomp}
summary_info += {'GlusterFS support': glusterfs}
+summary_info += {'hv-balloon support': hv_balloon}
summary_info += {'TPM support': have_tpm}
summary_info += {'libssh support': libssh}
summary_info += {'lzo support': lzo}
description: 'use libgio for D-Bus support')
option('glusterfs', type : 'feature', value : 'auto',
description: 'Glusterfs block device driver')
+option('hv_balloon', type : 'feature', value : 'auto',
+ description: 'hv-balloon driver (requires Glib 2.68+ GTree API)')
option('libdw', type : 'feature', value : 'auto',
description: 'debuginfo support')
option('libiscsi', type : 'feature', value : 'auto',
static int vmstate_uuid_post_load(void *opaque, int version_id)
{
SaveState *state = opaque;
- char uuid_src[UUID_FMT_LEN + 1];
- char uuid_dst[UUID_FMT_LEN + 1];
+ char uuid_src[UUID_STR_LEN];
+ char uuid_dst[UUID_STR_LEN];
if (!qemu_uuid_set) {
/*
[QAPI_EVENT_QUORUM_FAILURE] = { 1000 * SCALE_MS },
[QAPI_EVENT_VSERPORT_CHANGE] = { 1000 * SCALE_MS },
[QAPI_EVENT_MEMORY_DEVICE_SIZE_CHANGE] = { 1000 * SCALE_MS },
+ [QAPI_EVENT_HV_BALLOON_STATUS_REPORT] = { 1000 * SCALE_MS },
};
/*
{ 'event': 'BALLOON_CHANGE',
'data': { 'actual': 'int' } }
+##
+# @HvBalloonInfo:
+#
+# hv-balloon guest-provided memory status information.
+#
+# @committed: the amount of memory in use inside the guest plus the
+# amount of the memory unusable inside the guest (ballooned out,
+# offline, etc.)
+#
+# @available: the amount of the memory inside the guest available for
+# new allocations ("free")
+#
+# Since: 8.2
+##
+{ 'struct': 'HvBalloonInfo',
+ 'data': { 'committed': 'size', 'available': 'size' } }
+
+##
+# @query-hv-balloon-status-report:
+#
+# Returns the hv-balloon driver data contained in the last received "STATUS"
+# message from the guest.
+#
+# Returns:
+# - @HvBalloonInfo on success
+# - If no hv-balloon device is present, guest memory status reporting
+#   is not enabled, or no guest memory status report has been received
+#   yet, GenericError
+#
+# Since: 8.2
+#
+# Example:
+#
+# -> { "execute": "query-hv-balloon-status-report" }
+# <- { "return": {
+# "committed": 816640000,
+# "available": 3333054464
+# }
+# }
+##
+{ 'command': 'query-hv-balloon-status-report', 'returns': 'HvBalloonInfo' }
+
+##
+# @HV_BALLOON_STATUS_REPORT:
+#
+# Emitted when the hv-balloon driver receives a "STATUS" message from
+# the guest.
+#
+# Note: this event is rate-limited.
+#
+# Since: 8.2
+#
+# Example:
+#
+# <- { "event": "HV_BALLOON_STATUS_REPORT",
+# "data": { "committed": 816640000, "available": 3333054464 },
+# "timestamp": { "seconds": 1600295492, "microseconds": 661044 } }
+#
+##
+{ 'event': 'HV_BALLOON_STATUS_REPORT',
+ 'data': 'HvBalloonInfo' }
+
##
# @MemoryInfo:
#
}
}
+##
+# @HvBalloonDeviceInfo:
+#
+# hv-balloon provided memory state information
+#
+# @id: device's ID
+#
+# @memaddr: physical address in memory, where device is mapped
+#
+# @max-size: the maximum size of memory that the device can provide
+#
+# @memdev: memory backend linked with device
+#
+# Since: 8.2
+##
+{ 'struct': 'HvBalloonDeviceInfo',
+ 'data': { '*id': 'str',
+ '*memaddr': 'size',
+ 'max-size': 'size',
+ '*memdev': 'str'
+ }
+}
+
##
# @MemoryDeviceInfoKind:
#
#
# @sgx-epc: since 6.2.
#
+# @hv-balloon: since 8.2.
+#
# Since: 2.1
##
{ 'enum': 'MemoryDeviceInfoKind',
- 'data': [ 'dimm', 'nvdimm', 'virtio-pmem', 'virtio-mem', 'sgx-epc' ] }
+ 'data': [ 'dimm', 'nvdimm', 'virtio-pmem', 'virtio-mem', 'sgx-epc',
+ 'hv-balloon' ] }
##
# @PCDIMMDeviceInfoWrapper:
{ 'struct': 'SgxEPCDeviceInfoWrapper',
'data': { 'data': 'SgxEPCDeviceInfo' } }
+##
+# @HvBalloonDeviceInfoWrapper:
+#
+# Since: 8.2
+##
+{ 'struct': 'HvBalloonDeviceInfoWrapper',
+ 'data': { 'data': 'HvBalloonDeviceInfo' } }
+
##
# @MemoryDeviceInfo:
#
'nvdimm': 'PCDIMMDeviceInfoWrapper',
'virtio-pmem': 'VirtioPMEMDeviceInfoWrapper',
'virtio-mem': 'VirtioMEMDeviceInfoWrapper',
- 'sgx-epc': 'SgxEPCDeviceInfoWrapper'
+ 'sgx-epc': 'SgxEPCDeviceInfoWrapper',
+ 'hv-balloon': 'HvBalloonDeviceInfoWrapper'
}
}
printf "%s\n" ' gtk-clipboard clipboard support for the gtk UI (EXPERIMENTAL, MAY HANG)'
printf "%s\n" ' guest-agent Build QEMU Guest Agent'
printf "%s\n" ' guest-agent-msi Build MSI package for the QEMU Guest Agent'
+ printf "%s\n" ' hv-balloon hv-balloon driver (requires Glib 2.68+ GTree API)'
printf "%s\n" ' hvf HVF acceleration support'
printf "%s\n" ' iconv Font glyph conversion support'
printf "%s\n" ' jack JACK sound support'
--disable-guest-agent-msi) printf "%s" -Dguest_agent_msi=disabled ;;
--enable-hexagon-idef-parser) printf "%s" -Dhexagon_idef_parser=true ;;
--disable-hexagon-idef-parser) printf "%s" -Dhexagon_idef_parser=false ;;
+ --enable-hv-balloon) printf "%s" -Dhv_balloon=enabled ;;
+ --disable-hv-balloon) printf "%s" -Dhv_balloon=disabled ;;
--enable-hvf) printf "%s" -Dhvf=enabled ;;
--disable-hvf) printf "%s" -Dhvf=disabled ;;
--iasl=*) quote_sh "-Diasl=$2" ;;
return ret;
}
+int memory_region_iommu_set_iova_ranges(IOMMUMemoryRegion *iommu_mr,
+ GList *iova_ranges,
+ Error **errp)
+{
+ IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr);
+ int ret = 0;
+
+ if (imrc->iommu_set_iova_ranges) {
+ ret = imrc->iommu_set_iova_ranges(iommu_mr, iova_ranges, errp);
+ }
+ return ret;
+}
+
int memory_region_register_iommu_notifier(MemoryRegion *mr,
IOMMUNotifier *n, Error **errp)
{
# Load/store register (pointer authentication)
# LDRA immediate is 10 bits signed and scaled, but the bits aren't all contiguous
-%ldra_imm 22:s1 12:9 !function=times_2
+%ldra_imm 22:s1 12:9 !function=times_8
LDRA 11 111 0 00 m:1 . 1 ......... w:1 1 rn:5 rt:5 imm=%ldra_imm
return x * 4;
}
+static inline int times_8(DisasContext *s, int x)
+{
+ return x * 8;
+}
+
static inline int times_2_plus_1(DisasContext *s, int x)
{
return x * 2 + 1;
fi.submap |= 1 << XENFEAT_writable_page_tables |
1 << XENFEAT_writable_descriptor_tables |
1 << XENFEAT_auto_translated_physmap |
- 1 << XENFEAT_supervisor_mode_kernel |
1 << XENFEAT_hvm_callback_vector |
1 << XENFEAT_hvm_safe_pvclock |
1 << XENFEAT_hvm_pirqs;
trace_kvm_xen_set_vcpu_callback(cs->cpu_index, vector);
- return kvm_vcpu_ioctl(cs, KVM_XEN_HVM_SET_ATTR, &xva);
+ return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xva);
}
static void do_set_vcpu_callback_vector(CPUState *cs, run_on_cpu_data data)
}
}
+bool kvm_xen_has_vcpu_callback_vector(void)
+{
+ CPUState *cs = qemu_get_cpu(0);
+
+ return cs && !!X86_CPU(cs)->env.xen_vcpu_callback_vector;
+}
+
void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, int type)
{
CPUState *cs = qemu_get_cpu(vcpu_id);
* deliver it as an MSI.
*/
MSIMessage msg = {
- .address = APIC_DEFAULT_ADDRESS | X86_CPU(cs)->apic_id,
+ .address = APIC_DEFAULT_ADDRESS |
+ (X86_CPU(cs)->apic_id << MSI_ADDR_DEST_ID_SHIFT),
.data = vector | (1UL << MSI_DATA_LEVEL_SHIFT),
};
kvm_irqchip_send_msi(kvm_state, msg);
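The shift matters because the MSI address format keeps the destination APIC ID in address bits 19:12 (MSI_ADDR_DEST_ID_SHIFT is 12, and APIC_DEFAULT_ADDRESS is 0xFEE00000). A worked example with an illustrative helper name:

static uint64_t xen_callback_msi_addr(uint8_t apic_id)
{
    return 0xFEE00000u | ((uint64_t)apic_id << 12);  /* dest ID in bits 19:12 */
}

For apic_id 5 this yields 0xFEE05000; the unshifted form produced 0xFEE00005, which leaves the destination ID field at 0 and so misroutes the callback vector.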
int ret = -ENOSYS;
switch (cmd) {
case HVMOP_set_evtchn_upcall_vector:
- ret = kvm_xen_hcall_evtchn_upcall_vector(exit, cpu,
- exit->u.hcall.params[0]);
+ ret = kvm_xen_hcall_evtchn_upcall_vector(exit, cpu, arg);
break;
case HVMOP_pagetable_dying:
{ "query-acpi-ospm-status", ERROR_CLASS_GENERIC_ERROR },
{ "query-balloon", ERROR_CLASS_DEVICE_NOT_ACTIVE },
{ "query-hotpluggable-cpus", ERROR_CLASS_GENERIC_ERROR },
+ { "query-hv-balloon-status-report", ERROR_CLASS_GENERIC_ERROR },
{ "query-vm-generation-id", ERROR_CLASS_GENERIC_ERROR },
/* Only valid with a USB bus added */
{ "x-query-usb", ERROR_CLASS_GENERIC_ERROR },
'test-opts-visitor': [testqapi],
'test-visitor-serialization': [testqapi],
'test-bitmap': [],
+ 'test-resv-mem': [],
# all code tested by test-x86-cpuid is inside topology.h
'test-x86-cpuid': [],
'test-cutils': [],
--- /dev/null
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * reserved-region/range.c unit-tests.
+ *
+ * Copyright (C) 2023, Red Hat, Inc.
+ *
+ * Author: Eric Auger <eric.auger@redhat.com>
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/range.h"
+#include "exec/memory.h"
+#include "qemu/reserved-region.h"
+
+#define DEBUG 0
+
+#if DEBUG
+static void print_ranges(const char *prefix, GList *ranges)
+{
+ GList *l;
+ int i = 0;
+
+ if (!g_list_length(ranges)) {
+ printf("%s is void\n", prefix);
+ return;
+ }
+ for (l = ranges; l; l = l->next) {
+ Range *r = (Range *)l->data;
+
+ printf("%s rev[%i] = [0x%"PRIx64",0x%"PRIx64"]\n",
+ prefix, i, range_lob(r), range_upb(r));
+ i++;
+ }
+}
+#endif
+
+static void compare_ranges(const char *prefix, GList *ranges,
+ GList *expected)
+{
+ GList *l, *e;
+
+#if DEBUG
+ print_ranges("out", ranges);
+ print_ranges("expected", expected);
+#endif
+ g_assert_cmpint(g_list_length(ranges), ==, g_list_length(expected));
+ for (l = ranges, e = expected; l ; l = l->next, e = e->next) {
+ Range *r = (Range *)l->data;
+ Range *er = (Range *)e->data;
+
+ g_assert_true(range_lob(r) == range_lob(er) &&
+ range_upb(r) == range_upb(er));
+ }
+}
+
+static GList *insert_sorted_range(GList *list, uint64_t lob, uint64_t upb)
+{
+ Range *new = g_new0(Range, 1);
+
+ range_set_bounds(new, lob, upb);
+ return range_list_insert(list, new);
+}
+
+static void reset(GList **in, GList **out, GList **expected)
+{
+ g_list_free_full(*in, g_free);
+ g_list_free_full(*out, g_free);
+ g_list_free_full(*expected, g_free);
+ *in = NULL;
+ *out = NULL;
+ *expected = NULL;
+}
+
+static void
+run_range_inverse_array(const char *prefix, GList **in, GList **expected,
+ uint64_t low, uint64_t high)
+{
+ GList *out = NULL;
+ range_inverse_array(*in, &out, low, high);
+ compare_ranges(prefix, out, *expected);
+ reset(in, &out, expected);
+}
+
+static void check_range_reverse_array(void)
+{
+ GList *in = NULL, *expected = NULL;
+
+ /* test 1 */
+
+ in = insert_sorted_range(in, 0x10000, UINT64_MAX);
+ expected = insert_sorted_range(expected, 0x0, 0xFFFF);
+ run_range_inverse_array("test1", &in, &expected, 0x0, UINT64_MAX);
+
+ /* test 2 */
+
+ in = insert_sorted_range(in, 0x10000, 0xFFFFFFFFFFFF);
+ expected = insert_sorted_range(expected, 0x0, 0xFFFF);
+ expected = insert_sorted_range(expected, 0x1000000000000, UINT64_MAX);
+ run_range_inverse_array("test2", &in, &expected, 0x0, UINT64_MAX);
+
+ /* test 3 */
+
+ in = insert_sorted_range(in, 0x0, 0xFFFF);
+ in = insert_sorted_range(in, 0x10000, 0x2FFFF);
+ expected = insert_sorted_range(expected, 0x30000, UINT64_MAX);
+ run_range_inverse_array("test3", &in, &expected, 0x0, UINT64_MAX);
+
+ /* test 4 */
+
+ in = insert_sorted_range(in, 0x50000, 0x5FFFF);
+ in = insert_sorted_range(in, 0x60000, 0xFFFFFFFFFFFF);
+ expected = insert_sorted_range(expected, 0x0, 0x4FFFF);
+ expected = insert_sorted_range(expected, 0x1000000000000, UINT64_MAX);
+ run_range_inverse_array("test4", &in, &expected, 0x0, UINT64_MAX);
+
+ /* test 5 */
+
+ in = insert_sorted_range(in, 0x0, UINT64_MAX);
+ run_range_inverse_array("test5", &in, &expected, 0x0, UINT64_MAX);
+
+ /* test 6 */
+ in = insert_sorted_range(in, 0x10000, 0x1FFFF);
+ in = insert_sorted_range(in, 0x30000, 0x6FFFF);
+ in = insert_sorted_range(in, 0x90000, UINT64_MAX);
+ expected = insert_sorted_range(expected, 0x0, 0xFFFF);
+ expected = insert_sorted_range(expected, 0x20000, 0x2FFFF);
+ expected = insert_sorted_range(expected, 0x70000, 0x8FFFF);
+ run_range_inverse_array("test6", &in, &expected, 0x0, UINT64_MAX);
+}
+
+static void check_range_reverse_array_low_end(void)
+{
+ GList *in = NULL, *expected = NULL;
+
+ /* test 1 */
+ in = insert_sorted_range(in, 0x0, UINT64_MAX);
+ run_range_inverse_array("test1", &in, &expected, 0x10000, 0xFFFFFF);
+
+ /* test 2 */
+
+ in = insert_sorted_range(in, 0x0, 0xFFFF);
+ in = insert_sorted_range(in, 0x20000, 0x2FFFF);
+ expected = insert_sorted_range(expected, 0x40000, 0xFFFFFFFFFFFF);
+ run_range_inverse_array("test2", &in, &expected, 0x40000, 0xFFFFFFFFFFFF);
+
+ /* test 3 */
+ in = insert_sorted_range(in, 0x0, 0xFFFF);
+ in = insert_sorted_range(in, 0x20000, 0x2FFFF);
+ in = insert_sorted_range(in, 0x1000000000000, UINT64_MAX);
+ expected = insert_sorted_range(expected, 0x40000, 0xFFFFFFFFFFFF);
+ run_range_inverse_array("test3", &in, &expected, 0x40000, 0xFFFFFFFFFFFF);
+
+ /* test 4 */
+
+ in = insert_sorted_range(in, 0x0, 0xFFFF);
+ in = insert_sorted_range(in, 0x20000, 0x2FFFF);
+ in = insert_sorted_range(in, 0x1000000000000, UINT64_MAX);
+ expected = insert_sorted_range(expected, 0x30000, 0xFFFFFFFFFFFF);
+ run_range_inverse_array("test4", &in, &expected, 0x20000, 0xFFFFFFFFFFFF);
+
+ /* test 5 */
+
+ in = insert_sorted_range(in, 0x2000, 0xFFFF);
+ in = insert_sorted_range(in, 0x20000, 0x2FFFF);
+ in = insert_sorted_range(in, 0x100000000, 0x1FFFFFFFF);
+ expected = insert_sorted_range(expected, 0x1000, 0x1FFF);
+ expected = insert_sorted_range(expected, 0x10000, 0x1FFFF);
+ expected = insert_sorted_range(expected, 0x30000, 0xFFFFFFFF);
+ expected = insert_sorted_range(expected, 0x200000000, 0xFFFFFFFFFFFF);
+ run_range_inverse_array("test5", &in, &expected, 0x1000, 0xFFFFFFFFFFFF);
+
+ /* test 6 */
+
+ in = insert_sorted_range(in, 0x10000000, 0x1FFFFFFF);
+ in = insert_sorted_range(in, 0x100000000, 0x1FFFFFFFF);
+ expected = insert_sorted_range(expected, 0x0, 0xFFFF);
+ run_range_inverse_array("test6", &in, &expected, 0x0, 0xFFFF);
+}
+
+static ReservedRegion *alloc_resv_mem(unsigned type, uint64_t lob, uint64_t upb)
+{
+ ReservedRegion *r;
+
+ r = g_new0(ReservedRegion, 1);
+ r->type = type;
+ range_set_bounds(&r->range, lob, upb);
+ return r;
+}
+
+static void print_resv_region_list(const char *prefix, GList *list,
+ uint32_t expected_length)
+{
+ int i = g_list_length(list);
+
+ g_assert_cmpint(i, ==, expected_length);
+#if DEBUG
+ i = 0;
+ for (GList *l = list; l; l = l->next) {
+ ReservedRegion *r = (ReservedRegion *)l->data;
+ Range *range = &r->range;
+
+ printf("%s item[%d]=[0x%x, 0x%"PRIx64", 0x%"PRIx64"]\n",
+ prefix, i++, r->type, range_lob(range), range_upb(range));
+ }
+#endif
+}
+
+static void free_resv_region(gpointer data)
+{
+ ReservedRegion *reg = (ReservedRegion *)data;
+
+ g_free(reg);
+}
+
+static void check_resv_region_list_insert(void)
+{
+ ReservedRegion *r[10];
+ GList *l = NULL;
+
+ r[0] = alloc_resv_mem(0xA, 0, 0xFFFF);
+ r[1] = alloc_resv_mem(0xA, 0x20000, 0x2FFFF);
+ l = resv_region_list_insert(l, r[0]);
+ l = resv_region_list_insert(l, r[1]);
+ print_resv_region_list("test1", l, 2);
+
+ /* adjacent on left */
+ r[2] = alloc_resv_mem(0xB, 0x0, 0xFFF);
+ l = resv_region_list_insert(l, r[2]);
+ /* adjacent on right */
+ r[3] = alloc_resv_mem(0xC, 0x21000, 0x2FFFF);
+ l = resv_region_list_insert(l, r[3]);
+ print_resv_region_list("test2", l, 4);
+
+ /* exact overlap of D into C */
+ r[4] = alloc_resv_mem(0xD, 0x21000, 0x2FFFF);
+ l = resv_region_list_insert(l, r[4]);
+ print_resv_region_list("test3", l, 4);
+
+ /* in the middle */
+ r[5] = alloc_resv_mem(0xE, 0x22000, 0x23FFF);
+ l = resv_region_list_insert(l, r[5]);
+ print_resv_region_list("test4", l, 6);
+
+ /* overwrites several existing ones */
+ r[6] = alloc_resv_mem(0xF, 0x10000, 0x2FFFF);
+ l = resv_region_list_insert(l, r[6]);
+ print_resv_region_list("test5", l, 3);
+
+ /* contiguous at the end */
+ r[7] = alloc_resv_mem(0x0, 0x30000, 0x40000);
+ l = resv_region_list_insert(l, r[7]);
+ print_resv_region_list("test6", l, 4);
+
+ g_list_free_full(l, free_resv_region);
+ l = NULL;
+
+ r[0] = alloc_resv_mem(0x0, 0x10000, 0x1FFFF);
+ l = resv_region_list_insert(l, r[0]);
+ /* insertion before the 1st item */
+ r[1] = alloc_resv_mem(0x1, 0x0, 0xFF);
+ l = resv_region_list_insert(l, r[1]);
+ print_resv_region_list("test8", l, 2);
+
+ /* collision on the left side */
+ r[2] = alloc_resv_mem(0xA, 0x1200, 0x11FFF);
+ l = resv_region_list_insert(l, r[2]);
+ print_resv_region_list("test9", l, 3);
+
+ /* collision on the right side */
+ r[3] = alloc_resv_mem(0xA, 0x1F000, 0x2FFFF);
+ l = resv_region_list_insert(l, r[3]);
+ print_resv_region_list("test10", l, 4);
+
+ /* override everything */
+ r[4] = alloc_resv_mem(0xF, 0x0, UINT64_MAX);
+ l = resv_region_list_insert(l, r[4]);
+ print_resv_region_list("test11", l, 1);
+
+ g_list_free_full(l, free_resv_region);
+ l = NULL;
+
+ r[0] = alloc_resv_mem(0xF, 0x1000000000000, UINT64_MAX);
+ l = resv_region_list_insert(l, r[0]);
+ print_resv_region_list("test12", l, 1);
+
+ r[1] = alloc_resv_mem(0xA, 0x0, 0xFFFFFFF);
+ l = resv_region_list_insert(l, r[1]);
+ print_resv_region_list("test12", l, 2);
+
+ r[2] = alloc_resv_mem(0xB, 0x100000000, 0x1FFFFFFFF);
+ l = resv_region_list_insert(l, r[2]);
+ print_resv_region_list("test12", l, 3);
+
+ r[3] = alloc_resv_mem(0x0, 0x010000000, 0x2FFFFFFFF);
+ l = resv_region_list_insert(l, r[3]);
+ print_resv_region_list("test12", l, 3);
+
+ g_list_free_full(l, free_resv_region);
+}
+
+int main(int argc, char **argv)
+{
+ g_test_init(&argc, &argv, NULL);
+
+ g_test_add_func("/resv-mem/range_reverse_array",
+ check_range_reverse_array);
+ g_test_add_func("/resv-mem/range_reverse_array_low_end",
+ check_range_reverse_array_low_end);
+ g_test_add_func("/resv-mem/resv_region_list_insert",
+ check_resv_region_list_insert);
+
+ g_test_run();
+
+ return 0;
+}
int i;
for (i = 0; i < ARRAY_SIZE(uuid_test_data); i++) {
- char out[37];
+ char out[UUID_STR_LEN];
if (!uuid_test_data[i].check_unparse) {
continue;
/* Loop over all events in the buffer */
while (used < len) {
- struct inotify_event *ev =
- (struct inotify_event *)(buf + used);
- const char *name = ev->len ? ev->name : "";
- QFileMonitorDir *dir = g_hash_table_lookup(mon->idmap,
- GINT_TO_POINTER(ev->wd));
- uint32_t iev = ev->mask &
- (IN_CREATE | IN_MODIFY | IN_DELETE | IN_IGNORED |
- IN_MOVED_TO | IN_MOVED_FROM | IN_ATTRIB);
+ const char *name;
+ QFileMonitorDir *dir;
+ uint32_t iev;
int qev;
gsize i;
+ struct inotify_event *ev = (struct inotify_event *)(buf + used);
+
+ /*
+ * We trust the kernel to provide a valid buffer with complete event
+ * records.
+ */
+ assert(len - used >= sizeof(struct inotify_event));
+ assert(len - used - sizeof(struct inotify_event) >= ev->len);
+
+ name = ev->len ? ev->name : "";
+ dir = g_hash_table_lookup(mon->idmap, GINT_TO_POINTER(ev->wd));
+ iev = ev->mask &
+ (IN_CREATE | IN_MODIFY | IN_DELETE | IN_IGNORED |
+ IN_MOVED_TO | IN_MOVED_FROM | IN_ATTRIB);
used += sizeof(struct inotify_event) + ev->len;
util_ss.add(files('qht.c'))
util_ss.add(files('qsp.c'))
util_ss.add(files('range.c'))
+util_ss.add(files('reserved-region.c'))
util_ss.add(files('stats64.c'))
util_ss.add(files('systemd.c'))
util_ss.add(files('transactions.c'))
#include "qemu/osdep.h"
#include "qemu/range.h"
-/*
- * Return -1 if @a < @b, 1 @a > @b, and 0 if they touch or overlap.
- * Both @a and @b must not be empty.
- */
-static inline int range_compare(Range *a, Range *b)
+int range_compare(Range *a, Range *b)
{
assert(!range_is_empty(a) && !range_is_empty(b));
return list;
}
+
+static inline
+GList *append_new_range(GList *list, uint64_t lob, uint64_t upb)
+{
+ Range *new = g_new0(Range, 1);
+
+ range_set_bounds(new, lob, upb);
+ return g_list_append(list, new);
+}
+
+void range_inverse_array(GList *in, GList **rev,
+ uint64_t low, uint64_t high)
+{
+ Range *r, *rn;
+ GList *l = in, *out = *rev;
+
+ for (l = in; l && range_upb(l->data) < low; l = l->next) {
+ continue;
+ }
+
+ if (!l) {
+ out = append_new_range(out, low, high);
+ goto exit;
+ }
+ r = (Range *)l->data;
+
+ /* first range lob is greater than low, insert a leading range */
+ if (range_lob(r) > low) {
+ out = append_new_range(out, low, MIN(range_lob(r) - 1, high));
+ }
+
+ /* insert a range in between each pair of original ranges until we reach high */
+ for (; l->next; l = l->next) {
+ r = (Range *)l->data;
+ rn = (Range *)l->next->data;
+ if (range_lob(r) >= high) {
+ goto exit;
+ }
+ if (range_compare(r, rn)) {
+ out = append_new_range(out, range_upb(r) + 1,
+ MIN(range_lob(rn) - 1, high));
+ }
+ }
+
+ /* last range */
+ r = (Range *)l->data;
+
+ /* last range upb is less than high, insert a trailing range */
+ if (range_upb(r) < high) {
+ out = append_new_range(out, range_upb(r) + 1, high);
+ }
+exit:
+ *rev = out;
+}
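A minimal usage sketch of the new helper, mirroring what the unit test above exercises (bounds are inclusive):

GList *in = NULL, *out = NULL;
Range *r = g_new0(Range, 1);

range_set_bounds(r, 0x10000, 0x1FFFF);
in = range_list_insert(in, r);

range_inverse_array(in, &out, 0x0, UINT64_MAX);
/* out now holds [0x0, 0xFFFF] and [0x20000, UINT64_MAX] */

g_list_free_full(in, g_free);
g_list_free_full(out, g_free);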
--- /dev/null
+/*
+ * QEMU ReservedRegion helpers
+ *
+ * Copyright (c) 2023 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/range.h"
+#include "qemu/reserved-region.h"
+
+GList *resv_region_list_insert(GList *list, ReservedRegion *reg)
+{
+ ReservedRegion *resv_iter, *new_reg;
+ Range *r = &reg->range;
+ Range *range_iter;
+ GList *l;
+
+ for (l = list; l ; ) {
+ resv_iter = (ReservedRegion *)l->data;
+ range_iter = &resv_iter->range;
+
+ /* Skip all list elements strictly less than range to add */
+ if (range_compare(range_iter, r) < 0) {
+ l = l->next;
+ } else if (range_compare(range_iter, r) > 0) {
+ return g_list_insert_before(list, l, reg);
+ } else { /* there is an overlap */
+ if (range_contains_range(r, range_iter)) {
+ /* new range contains the current item, simply remove the latter */
+ GList *prev = l->prev;
+ g_free(l->data);
+ list = g_list_delete_link(list, l);
+ if (prev) {
+ l = prev->next;
+ } else {
+ l = list;
+ }
+ } else if (range_contains_range(range_iter, r)) {
+ /* new region is included in the current region */
+ if (range_lob(range_iter) == range_lob(r)) {
+ /* adjacent on the left side, derives into 2 regions */
+ range_set_bounds(range_iter, range_upb(r) + 1,
+ range_upb(range_iter));
+ return g_list_insert_before(list, l, reg);
+ } else if (range_upb(range_iter) == range_upb(r)) {
+ /* adjacent on the right side, derives into 2 regions */
+ range_set_bounds(range_iter, range_lob(range_iter),
+ range_lob(r) - 1);
+ l = l->next;
+ } else {
+ uint64_t lob = range_lob(range_iter);
+ /*
+ * the new range is in the middle of an existing one,
+ * split the latter into 3 regions instead
+ */
+ range_set_bounds(range_iter, range_upb(r) + 1,
+ range_upb(range_iter));
+ new_reg = g_new0(ReservedRegion, 1);
+ new_reg->type = resv_iter->type;
+ range_set_bounds(&new_reg->range,
+ lob, range_lob(r) - 1);
+ list = g_list_insert_before(list, l, new_reg);
+ return g_list_insert_before(list, l, reg);
+ }
+ } else if (range_lob(r) < range_lob(range_iter)) {
+ range_set_bounds(range_iter, range_upb(r) + 1,
+ range_upb(range_iter));
+ return g_list_insert_before(list, l, reg);
+ } else { /* intersection on the upper range */
+ range_set_bounds(range_iter, range_lob(range_iter),
+ range_lob(r) - 1);
+ l = l->next;
+ }
+ } /* overlap */
+ }
+ return g_list_append(list, reg);
+}
+
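A short usage sketch under the same conventions as the unit test above (the type tags are arbitrary caller-chosen values): a later insertion trims the overlapping part of an earlier region rather than duplicating it.

GList *list = NULL;
ReservedRegion *a = g_new0(ReservedRegion, 1);
ReservedRegion *b = g_new0(ReservedRegion, 1);

a->type = 0xA;
range_set_bounds(&a->range, 0x0, 0xFFFF);
list = resv_region_list_insert(list, a);

b->type = 0xB;
range_set_bounds(&b->range, 0x8000, 0xFFFF);  /* overlaps a's upper half */
list = resv_region_list_insert(list, b);
/* list now: [0x0, 0x7FFF] type 0xA, then [0x8000, 0xFFFF] type 0xB */

g_list_free_full(list, g_free);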
void qemu_uuid_unparse(const QemuUUID *uuid, char *out)
{
const unsigned char *uu = &uuid->data[0];
- snprintf(out, UUID_FMT_LEN + 1, UUID_FMT,
+ snprintf(out, UUID_STR_LEN, UUID_FMT,
uu[0], uu[1], uu[2], uu[3], uu[4], uu[5], uu[6], uu[7],
uu[8], uu[9], uu[10], uu[11], uu[12], uu[13], uu[14], uu[15]);
}
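With UUID_STR_LEN covering the trailing NUL, callers can size the buffer directly; e.g.:

QemuUUID uuid;
char str[UUID_STR_LEN];

qemu_uuid_generate(&uuid);
qemu_uuid_unparse(&uuid, str);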