2 * Copyright (c) 2007, Neocleus Corporation.
4 * This work is licensed under the terms of the GNU GPL, version 2. See
5 * the COPYING file in the top-level directory.
8 * Assign a PCI device from the host to a guest VM.
10 * This implementation uses the classic device assignment interface of KVM
11 * and is only available on x86 hosts. It is expected to be obsoleted by VFIO
12 * based device assignment.
14 * Adapted for KVM (qemu-kvm) by Qumranet. QEMU version was based on qemu-kvm
15 * revision 4144fe9d48. See its repository for the history.
17 * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
18 * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
19 * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
20 * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
21 * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
26 #include <sys/types.h>
29 #include "hw/i386/pc.h"
30 #include "qemu/error-report.h"
31 #include "ui/console.h"
32 #include "hw/loader.h"
33 #include "monitor/monitor.h"
34 #include "qemu/range.h"
35 #include "sysemu/sysemu.h"
36 #include "hw/pci/pci.h"
37 #include "hw/pci/msi.h"
39 #include "hw/pci/pci-assign.h"
41 #define MSIX_PAGE_SIZE 0x1000
43 /* From linux/ioport.h */
44 #define IORESOURCE_IO 0x00000100 /* Resource type */
45 #define IORESOURCE_MEM 0x00000200
46 #define IORESOURCE_IRQ 0x00000400
47 #define IORESOURCE_DMA 0x00000800
48 #define IORESOURCE_PREFETCH 0x00002000 /* No side effects */
49 #define IORESOURCE_MEM_64 0x00100000
51 typedef struct PCIRegion
{
52 int type
; /* Memory or port I/O */
55 uint64_t size
; /* size of the region */
59 typedef struct PCIDevRegions
{
60 uint8_t bus
, dev
, func
; /* Bus inside domain, device and function */
61 int irq
; /* IRQ number */
62 uint16_t region_number
; /* number of active regions */
64 /* Port I/O or MMIO Regions */
65 PCIRegion regions
[PCI_NUM_REGIONS
- 1];
69 typedef struct AssignedDevRegion
{
70 MemoryRegion container
;
71 MemoryRegion real_iomem
;
73 uint8_t *r_virtbase
; /* mmapped access address for memory regions */
74 uint32_t r_baseport
; /* the base guest port for I/O regions */
76 pcibus_t e_size
; /* emulated size of region in bytes */
77 pcibus_t r_size
; /* real size of region in bytes */
81 #define ASSIGNED_DEVICE_PREFER_MSI_BIT 0
82 #define ASSIGNED_DEVICE_SHARE_INTX_BIT 1
84 #define ASSIGNED_DEVICE_PREFER_MSI_MASK (1 << ASSIGNED_DEVICE_PREFER_MSI_BIT)
85 #define ASSIGNED_DEVICE_SHARE_INTX_MASK (1 << ASSIGNED_DEVICE_SHARE_INTX_BIT)
87 typedef struct MSIXTableEntry
{
94 typedef enum AssignedIRQType
{
95 ASSIGNED_IRQ_NONE
= 0,
96 ASSIGNED_IRQ_INTX_HOST_INTX
,
97 ASSIGNED_IRQ_INTX_HOST_MSI
,
102 typedef struct AssignedDevice
{
104 PCIHostDeviceAddress host
;
108 AssignedDevRegion v_addrs
[PCI_NUM_REGIONS
- 1];
109 PCIDevRegions real_device
;
110 PCIINTxRoute intx_route
;
111 AssignedIRQType assigned_irq_type
;
113 #define ASSIGNED_DEVICE_CAP_MSI (1 << 0)
114 #define ASSIGNED_DEVICE_CAP_MSIX (1 << 1)
116 #define ASSIGNED_DEVICE_MSI_ENABLED (1 << 0)
117 #define ASSIGNED_DEVICE_MSIX_ENABLED (1 << 1)
118 #define ASSIGNED_DEVICE_MSIX_MASKED (1 << 2)
121 uint8_t emulate_config_read
[PCI_CONFIG_SPACE_SIZE
];
122 uint8_t emulate_config_write
[PCI_CONFIG_SPACE_SIZE
];
125 MSIXTableEntry
*msix_table
;
126 hwaddr msix_table_addr
;
133 #define TYPE_PCI_ASSIGN "kvm-pci-assign"
134 #define PCI_ASSIGN(obj) OBJECT_CHECK(AssignedDevice, (obj), TYPE_PCI_ASSIGN)
136 static void assigned_dev_update_irq_routing(PCIDevice
*dev
);
138 static void assigned_dev_load_option_rom(AssignedDevice
*dev
);
140 static void assigned_dev_unregister_msix_mmio(AssignedDevice
*dev
);
142 static uint64_t assigned_dev_ioport_rw(AssignedDevRegion
*dev_region
,
143 hwaddr addr
, int size
,
147 int fd
= dev_region
->region
->resource_fd
;
150 DEBUG("pwrite data=%" PRIx64
", size=%d, e_phys=" TARGET_FMT_plx
151 ", addr="TARGET_FMT_plx
"\n", *data
, size
, addr
, addr
);
152 if (pwrite(fd
, data
, size
, addr
) != size
) {
153 error_report("%s - pwrite failed %s", __func__
, strerror(errno
));
156 if (pread(fd
, &val
, size
, addr
) != size
) {
157 error_report("%s - pread failed %s", __func__
, strerror(errno
));
158 val
= (1UL << (size
* 8)) - 1;
160 DEBUG("pread val=%" PRIx64
", size=%d, e_phys=" TARGET_FMT_plx
161 ", addr=" TARGET_FMT_plx
"\n", val
, size
, addr
, addr
);
166 static void assigned_dev_ioport_write(void *opaque
, hwaddr addr
,
167 uint64_t data
, unsigned size
)
169 assigned_dev_ioport_rw(opaque
, addr
, size
, &data
);
172 static uint64_t assigned_dev_ioport_read(void *opaque
,
173 hwaddr addr
, unsigned size
)
175 return assigned_dev_ioport_rw(opaque
, addr
, size
, NULL
);
178 static uint32_t slow_bar_readb(void *opaque
, hwaddr addr
)
180 AssignedDevRegion
*d
= opaque
;
181 uint8_t *in
= d
->u
.r_virtbase
+ addr
;
185 DEBUG("addr=0x" TARGET_FMT_plx
" val=0x%08x\n", addr
, r
);
190 static uint32_t slow_bar_readw(void *opaque
, hwaddr addr
)
192 AssignedDevRegion
*d
= opaque
;
193 uint16_t *in
= (uint16_t *)(d
->u
.r_virtbase
+ addr
);
197 DEBUG("addr=0x" TARGET_FMT_plx
" val=0x%08x\n", addr
, r
);
202 static uint32_t slow_bar_readl(void *opaque
, hwaddr addr
)
204 AssignedDevRegion
*d
= opaque
;
205 uint32_t *in
= (uint32_t *)(d
->u
.r_virtbase
+ addr
);
209 DEBUG("addr=0x" TARGET_FMT_plx
" val=0x%08x\n", addr
, r
);
214 static void slow_bar_writeb(void *opaque
, hwaddr addr
, uint32_t val
)
216 AssignedDevRegion
*d
= opaque
;
217 uint8_t *out
= d
->u
.r_virtbase
+ addr
;
219 DEBUG("addr=0x" TARGET_FMT_plx
" val=0x%02x\n", addr
, val
);
223 static void slow_bar_writew(void *opaque
, hwaddr addr
, uint32_t val
)
225 AssignedDevRegion
*d
= opaque
;
226 uint16_t *out
= (uint16_t *)(d
->u
.r_virtbase
+ addr
);
228 DEBUG("addr=0x" TARGET_FMT_plx
" val=0x%04x\n", addr
, val
);
232 static void slow_bar_writel(void *opaque
, hwaddr addr
, uint32_t val
)
234 AssignedDevRegion
*d
= opaque
;
235 uint32_t *out
= (uint32_t *)(d
->u
.r_virtbase
+ addr
);
237 DEBUG("addr=0x" TARGET_FMT_plx
" val=0x%08x\n", addr
, val
);
241 static const MemoryRegionOps slow_bar_ops
= {
243 .read
= { slow_bar_readb
, slow_bar_readw
, slow_bar_readl
, },
244 .write
= { slow_bar_writeb
, slow_bar_writew
, slow_bar_writel
, },
246 .endianness
= DEVICE_NATIVE_ENDIAN
,
249 static void assigned_dev_iomem_setup(PCIDevice
*pci_dev
, int region_num
,
252 AssignedDevice
*r_dev
= PCI_ASSIGN(pci_dev
);
253 AssignedDevRegion
*region
= &r_dev
->v_addrs
[region_num
];
254 PCIRegion
*real_region
= &r_dev
->real_device
.regions
[region_num
];
257 memory_region_init(®ion
->container
, OBJECT(pci_dev
),
258 "assigned-dev-container", e_size
);
259 memory_region_add_subregion(®ion
->container
, 0, ®ion
->real_iomem
);
261 /* deal with MSI-X MMIO page */
262 if (real_region
->base_addr
<= r_dev
->msix_table_addr
&&
263 real_region
->base_addr
+ real_region
->size
>
264 r_dev
->msix_table_addr
) {
265 uint64_t offset
= r_dev
->msix_table_addr
- real_region
->base_addr
;
267 memory_region_add_subregion_overlap(®ion
->container
,
275 static const MemoryRegionOps assigned_dev_ioport_ops
= {
276 .read
= assigned_dev_ioport_read
,
277 .write
= assigned_dev_ioport_write
,
278 .endianness
= DEVICE_NATIVE_ENDIAN
,
281 static void assigned_dev_ioport_setup(PCIDevice
*pci_dev
, int region_num
,
284 AssignedDevice
*r_dev
= PCI_ASSIGN(pci_dev
);
285 AssignedDevRegion
*region
= &r_dev
->v_addrs
[region_num
];
287 region
->e_size
= size
;
288 memory_region_init(®ion
->container
, OBJECT(pci_dev
),
289 "assigned-dev-container", size
);
290 memory_region_init_io(®ion
->real_iomem
, OBJECT(pci_dev
),
291 &assigned_dev_ioport_ops
, r_dev
->v_addrs
+ region_num
,
292 "assigned-dev-iomem", size
);
293 memory_region_add_subregion(®ion
->container
, 0, ®ion
->real_iomem
);
296 static uint32_t assigned_dev_pci_read(PCIDevice
*d
, int pos
, int len
)
298 AssignedDevice
*pci_dev
= PCI_ASSIGN(d
);
301 int fd
= pci_dev
->real_device
.config_fd
;
304 ret
= pread(fd
, &val
, len
, pos
);
306 if ((ret
< 0) && (errno
== EINTR
|| errno
== EAGAIN
)) {
310 hw_error("pci read failed, ret = %zd errno = %d\n", ret
, errno
);
316 static uint8_t assigned_dev_pci_read_byte(PCIDevice
*d
, int pos
)
318 return (uint8_t)assigned_dev_pci_read(d
, pos
, 1);
321 static void assigned_dev_pci_write(PCIDevice
*d
, int pos
, uint32_t val
, int len
)
323 AssignedDevice
*pci_dev
= PCI_ASSIGN(d
);
325 int fd
= pci_dev
->real_device
.config_fd
;
328 ret
= pwrite(fd
, &val
, len
, pos
);
330 if ((ret
< 0) && (errno
== EINTR
|| errno
== EAGAIN
)) {
334 hw_error("pci write failed, ret = %zd errno = %d\n", ret
, errno
);
338 static void assigned_dev_emulate_config_read(AssignedDevice
*dev
,
339 uint32_t offset
, uint32_t len
)
341 memset(dev
->emulate_config_read
+ offset
, 0xff, len
);
344 static void assigned_dev_direct_config_read(AssignedDevice
*dev
,
345 uint32_t offset
, uint32_t len
)
347 memset(dev
->emulate_config_read
+ offset
, 0, len
);
350 static void assigned_dev_direct_config_write(AssignedDevice
*dev
,
351 uint32_t offset
, uint32_t len
)
353 memset(dev
->emulate_config_write
+ offset
, 0, len
);
356 static uint8_t pci_find_cap_offset(PCIDevice
*d
, uint8_t cap
, uint8_t start
)
360 int pos
= start
? start
: PCI_CAPABILITY_LIST
;
363 status
= assigned_dev_pci_read_byte(d
, PCI_STATUS
);
364 if ((status
& PCI_STATUS_CAP_LIST
) == 0) {
369 pos
= assigned_dev_pci_read_byte(d
, pos
);
375 id
= assigned_dev_pci_read_byte(d
, pos
+ PCI_CAP_LIST_ID
);
384 pos
+= PCI_CAP_LIST_NEXT
;
389 static void assigned_dev_register_regions(PCIRegion
*io_regions
,
390 unsigned long regions_num
,
391 AssignedDevice
*pci_dev
,
395 PCIRegion
*cur_region
= io_regions
;
397 for (i
= 0; i
< regions_num
; i
++, cur_region
++) {
398 if (!cur_region
->valid
) {
402 /* handle memory io regions */
403 if (cur_region
->type
& IORESOURCE_MEM
) {
404 int t
= PCI_BASE_ADDRESS_SPACE_MEMORY
;
405 if (cur_region
->type
& IORESOURCE_PREFETCH
) {
406 t
|= PCI_BASE_ADDRESS_MEM_PREFETCH
;
408 if (cur_region
->type
& IORESOURCE_MEM_64
) {
409 t
|= PCI_BASE_ADDRESS_MEM_TYPE_64
;
412 /* map physical memory */
413 pci_dev
->v_addrs
[i
].u
.r_virtbase
= mmap(NULL
, cur_region
->size
,
414 PROT_WRITE
| PROT_READ
,
416 cur_region
->resource_fd
,
419 if (pci_dev
->v_addrs
[i
].u
.r_virtbase
== MAP_FAILED
) {
420 pci_dev
->v_addrs
[i
].u
.r_virtbase
= NULL
;
421 error_setg_errno(errp
, errno
, "Couldn't mmap 0x%" PRIx64
"!",
422 cur_region
->base_addr
);
426 pci_dev
->v_addrs
[i
].r_size
= cur_region
->size
;
427 pci_dev
->v_addrs
[i
].e_size
= 0;
430 pci_dev
->v_addrs
[i
].u
.r_virtbase
+=
431 (cur_region
->base_addr
& 0xFFF);
433 if (cur_region
->size
& 0xFFF) {
434 error_report("PCI region %d at address 0x%" PRIx64
" has "
435 "size 0x%" PRIx64
", which is not a multiple of "
436 "4K. You might experience some performance hit "
438 i
, cur_region
->base_addr
, cur_region
->size
);
439 memory_region_init_io(&pci_dev
->v_addrs
[i
].real_iomem
,
440 OBJECT(pci_dev
), &slow_bar_ops
,
441 &pci_dev
->v_addrs
[i
],
442 "assigned-dev-slow-bar",
445 void *virtbase
= pci_dev
->v_addrs
[i
].u
.r_virtbase
;
447 snprintf(name
, sizeof(name
), "%s.bar%d",
448 object_get_typename(OBJECT(pci_dev
)), i
);
449 memory_region_init_ram_ptr(&pci_dev
->v_addrs
[i
].real_iomem
,
450 OBJECT(pci_dev
), name
,
451 cur_region
->size
, virtbase
);
452 vmstate_register_ram(&pci_dev
->v_addrs
[i
].real_iomem
,
456 assigned_dev_iomem_setup(&pci_dev
->dev
, i
, cur_region
->size
);
457 pci_register_bar((PCIDevice
*) pci_dev
, i
, t
,
458 &pci_dev
->v_addrs
[i
].container
);
461 /* handle port io regions */
465 /* Test kernel support for ioport resource read/write. Old
466 * kernels return EIO. New kernels only allow 1/2/4 byte reads
467 * so should return EINVAL for a 3 byte read */
468 ret
= pread(pci_dev
->v_addrs
[i
].region
->resource_fd
, &val
, 3, 0);
470 error_report("Unexpected return from I/O port read: %d", ret
);
472 } else if (errno
!= EINVAL
) {
473 error_report("Kernel doesn't support ioport resource "
474 "access, hiding this region.");
475 close(pci_dev
->v_addrs
[i
].region
->resource_fd
);
476 cur_region
->valid
= 0;
480 pci_dev
->v_addrs
[i
].u
.r_baseport
= cur_region
->base_addr
;
481 pci_dev
->v_addrs
[i
].r_size
= cur_region
->size
;
482 pci_dev
->v_addrs
[i
].e_size
= 0;
484 assigned_dev_ioport_setup(&pci_dev
->dev
, i
, cur_region
->size
);
485 pci_register_bar((PCIDevice
*) pci_dev
, i
,
486 PCI_BASE_ADDRESS_SPACE_IO
,
487 &pci_dev
->v_addrs
[i
].container
);
494 static void get_real_id(const char *devpath
, const char *idname
, uint16_t *val
,
501 snprintf(name
, sizeof(name
), "%s%s", devpath
, idname
);
502 f
= fopen(name
, "r");
504 error_setg_file_open(errp
, errno
, name
);
507 if (fscanf(f
, "%li\n", &id
) == 1) {
510 error_setg(errp
, "Failed to parse contents of '%s'", name
);
515 static void get_real_vendor_id(const char *devpath
, uint16_t *val
,
518 get_real_id(devpath
, "vendor", val
, errp
);
521 static void get_real_device_id(const char *devpath
, uint16_t *val
,
524 get_real_id(devpath
, "device", val
, errp
);
527 static void get_real_device(AssignedDevice
*pci_dev
, Error
**errp
)
529 char dir
[128], name
[128];
532 uint64_t start
, end
, size
, flags
;
535 PCIDevRegions
*dev
= &pci_dev
->real_device
;
536 Error
*local_err
= NULL
;
538 dev
->region_number
= 0;
540 snprintf(dir
, sizeof(dir
), "/sys/bus/pci/devices/%04x:%02x:%02x.%x/",
541 pci_dev
->host
.domain
, pci_dev
->host
.bus
,
542 pci_dev
->host
.slot
, pci_dev
->host
.function
);
544 snprintf(name
, sizeof(name
), "%sconfig", dir
);
546 if (pci_dev
->configfd_name
&& *pci_dev
->configfd_name
) {
547 dev
->config_fd
= monitor_fd_param(cur_mon
, pci_dev
->configfd_name
,
550 error_propagate(errp
, local_err
);
554 dev
->config_fd
= open(name
, O_RDWR
);
556 if (dev
->config_fd
== -1) {
557 error_setg_file_open(errp
, errno
, name
);
562 r
= read(dev
->config_fd
, pci_dev
->dev
.config
,
563 pci_config_size(&pci_dev
->dev
));
565 if (errno
== EINTR
|| errno
== EAGAIN
) {
568 error_setg_errno(errp
, errno
, "read(\"%s\")",
569 (pci_dev
->configfd_name
&& *pci_dev
->configfd_name
) ?
570 pci_dev
->configfd_name
: name
);
574 /* Restore or clear multifunction, this is always controlled by qemu */
575 if (pci_dev
->dev
.cap_present
& QEMU_PCI_CAP_MULTIFUNCTION
) {
576 pci_dev
->dev
.config
[PCI_HEADER_TYPE
] |= PCI_HEADER_TYPE_MULTI_FUNCTION
;
578 pci_dev
->dev
.config
[PCI_HEADER_TYPE
] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION
;
581 /* Clear host resource mapping info. If we choose not to register a
582 * BAR, such as might be the case with the option ROM, we can get
583 * confusing, unwritable, residual addresses from the host here. */
584 memset(&pci_dev
->dev
.config
[PCI_BASE_ADDRESS_0
], 0, 24);
585 memset(&pci_dev
->dev
.config
[PCI_ROM_ADDRESS
], 0, 4);
587 snprintf(name
, sizeof(name
), "%sresource", dir
);
589 f
= fopen(name
, "r");
591 error_setg_file_open(errp
, errno
, name
);
595 for (r
= 0; r
< PCI_ROM_SLOT
; r
++) {
596 if (fscanf(f
, "%" SCNi64
" %" SCNi64
" %" SCNi64
"\n",
597 &start
, &end
, &flags
) != 3) {
601 rp
= dev
->regions
+ r
;
603 rp
->resource_fd
= -1;
604 size
= end
- start
+ 1;
605 flags
&= IORESOURCE_IO
| IORESOURCE_MEM
| IORESOURCE_PREFETCH
607 if (size
== 0 || (flags
& ~IORESOURCE_PREFETCH
) == 0) {
610 if (flags
& IORESOURCE_MEM
) {
611 flags
&= ~IORESOURCE_IO
;
613 flags
&= ~IORESOURCE_PREFETCH
;
615 snprintf(name
, sizeof(name
), "%sresource%d", dir
, r
);
616 fd
= open(name
, O_RDWR
);
620 rp
->resource_fd
= fd
;
624 rp
->base_addr
= start
;
626 pci_dev
->v_addrs
[r
].region
= rp
;
627 DEBUG("region %d size %" PRIu64
" start 0x%" PRIx64
628 " type %d resource_fd %d\n",
629 r
, rp
->size
, start
, rp
->type
, rp
->resource_fd
);
634 /* read and fill vendor ID */
635 get_real_vendor_id(dir
, &id
, &local_err
);
637 error_propagate(errp
, local_err
);
640 pci_dev
->dev
.config
[0] = id
& 0xff;
641 pci_dev
->dev
.config
[1] = (id
& 0xff00) >> 8;
643 /* read and fill device ID */
644 get_real_device_id(dir
, &id
, &local_err
);
646 error_propagate(errp
, local_err
);
649 pci_dev
->dev
.config
[2] = id
& 0xff;
650 pci_dev
->dev
.config
[3] = (id
& 0xff00) >> 8;
652 pci_word_test_and_clear_mask(pci_dev
->emulate_config_write
+ PCI_COMMAND
,
653 PCI_COMMAND_MASTER
| PCI_COMMAND_INTX_DISABLE
);
655 dev
->region_number
= r
;
658 static void free_msi_virqs(AssignedDevice
*dev
)
662 for (i
= 0; i
< dev
->msi_virq_nr
; i
++) {
663 if (dev
->msi_virq
[i
] >= 0) {
664 kvm_irqchip_release_virq(kvm_state
, dev
->msi_virq
[i
]);
665 dev
->msi_virq
[i
] = -1;
668 g_free(dev
->msi_virq
);
669 dev
->msi_virq
= NULL
;
670 dev
->msi_virq_nr
= 0;
673 static void free_assigned_device(AssignedDevice
*dev
)
677 if (dev
->cap
.available
& ASSIGNED_DEVICE_CAP_MSIX
) {
678 assigned_dev_unregister_msix_mmio(dev
);
680 for (i
= 0; i
< dev
->real_device
.region_number
; i
++) {
681 PCIRegion
*pci_region
= &dev
->real_device
.regions
[i
];
682 AssignedDevRegion
*region
= &dev
->v_addrs
[i
];
684 if (!pci_region
->valid
) {
687 if (pci_region
->type
& IORESOURCE_IO
) {
688 if (region
->u
.r_baseport
) {
689 memory_region_del_subregion(®ion
->container
,
690 ®ion
->real_iomem
);
692 } else if (pci_region
->type
& IORESOURCE_MEM
) {
693 if (region
->u
.r_virtbase
) {
694 memory_region_del_subregion(®ion
->container
,
695 ®ion
->real_iomem
);
697 /* Remove MSI-X table subregion */
698 if (pci_region
->base_addr
<= dev
->msix_table_addr
&&
699 pci_region
->base_addr
+ pci_region
->size
>
700 dev
->msix_table_addr
) {
701 memory_region_del_subregion(®ion
->container
,
704 if (munmap(region
->u
.r_virtbase
,
705 (pci_region
->size
+ 0xFFF) & 0xFFFFF000)) {
706 error_report("Failed to unmap assigned device region: %s",
711 if (pci_region
->resource_fd
>= 0) {
712 close(pci_region
->resource_fd
);
716 if (dev
->real_device
.config_fd
>= 0) {
717 close(dev
->real_device
.config_fd
);
723 /* This function tries to determine the cause of the PCI assignment failure. It
724 * always returns the cause as a dynamically allocated, human readable string.
725 * If the function fails to determine the cause for any internal reason, then
726 * the returned string will state that fact.
728 static char *assign_failed_examine(const AssignedDevice
*dev
)
730 char name
[PATH_MAX
], dir
[PATH_MAX
], driver
[PATH_MAX
] = {}, *ns
;
731 uint16_t vendor_id
, device_id
;
733 Error
*local_err
= NULL
;
735 snprintf(dir
, sizeof(dir
), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
736 dev
->host
.domain
, dev
->host
.bus
, dev
->host
.slot
,
739 snprintf(name
, sizeof(name
), "%sdriver", dir
);
741 r
= readlink(name
, driver
, sizeof(driver
));
742 if ((r
<= 0) || r
>= sizeof(driver
)) {
747 ns
= strrchr(driver
, '/');
754 if ((get_real_vendor_id(dir
, &vendor_id
, &local_err
), local_err
) ||
755 (get_real_device_id(dir
, &device_id
, &local_err
), local_err
)) {
756 /* We're already analyzing an assignment error, so we suppress this
757 * one just like the others above.
759 error_free(local_err
);
763 return g_strdup_printf(
764 "*** The driver '%s' is occupying your device %04x:%02x:%02x.%x.\n"
766 "*** You can try the following commands to free it:\n"
768 "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub/new_id\n"
769 "*** $ echo \"%04x:%02x:%02x.%x\" > /sys/bus/pci/drivers/%s/unbind\n"
770 "*** $ echo \"%04x:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
772 "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub/remove_id\n"
774 ns
, dev
->host
.domain
, dev
->host
.bus
, dev
->host
.slot
,
775 dev
->host
.function
, vendor_id
, device_id
,
776 dev
->host
.domain
, dev
->host
.bus
, dev
->host
.slot
, dev
->host
.function
,
777 ns
, dev
->host
.domain
, dev
->host
.bus
, dev
->host
.slot
,
778 dev
->host
.function
, vendor_id
, device_id
);
781 return g_strdup("Couldn't find out why.");
784 static void assign_device(AssignedDevice
*dev
, Error
**errp
)
786 uint32_t flags
= KVM_DEV_ASSIGN_ENABLE_IOMMU
;
789 /* Only pass non-zero PCI segment to capable module */
790 if (!kvm_check_extension(kvm_state
, KVM_CAP_PCI_SEGMENT
) &&
792 error_setg(errp
, "Can't assign device inside non-zero PCI segment "
793 "as this KVM module doesn't support it.");
797 if (!kvm_check_extension(kvm_state
, KVM_CAP_IOMMU
)) {
798 error_setg(errp
, "No IOMMU found. Unable to assign device \"%s\"",
803 if (dev
->features
& ASSIGNED_DEVICE_SHARE_INTX_MASK
&&
804 kvm_has_intx_set_mask()) {
805 flags
|= KVM_DEV_ASSIGN_PCI_2_3
;
808 r
= kvm_device_pci_assign(kvm_state
, &dev
->host
, flags
, &dev
->dev_id
);
814 cause
= assign_failed_examine(dev
);
815 error_setg_errno(errp
, -r
, "Failed to assign device \"%s\"\n%s",
816 dev
->dev
.qdev
.id
, cause
);
821 error_setg_errno(errp
, -r
, "Failed to assign device \"%s\"",
828 static void verify_irqchip_in_kernel(Error
**errp
)
830 if (kvm_irqchip_in_kernel()) {
833 error_setg(errp
, "pci-assign requires KVM with in-kernel irqchip enabled");
836 static int assign_intx(AssignedDevice
*dev
, Error
**errp
)
838 AssignedIRQType new_type
;
839 PCIINTxRoute intx_route
;
842 Error
*local_err
= NULL
;
844 /* Interrupt PIN 0 means don't use INTx */
845 if (assigned_dev_pci_read_byte(&dev
->dev
, PCI_INTERRUPT_PIN
) == 0) {
846 pci_device_set_intx_routing_notifier(&dev
->dev
, NULL
);
850 verify_irqchip_in_kernel(&local_err
);
852 error_propagate(errp
, local_err
);
856 pci_device_set_intx_routing_notifier(&dev
->dev
,
857 assigned_dev_update_irq_routing
);
859 intx_route
= pci_device_route_intx_to_irq(&dev
->dev
, dev
->intpin
);
860 assert(intx_route
.mode
!= PCI_INTX_INVERTED
);
862 if (!pci_intx_route_changed(&dev
->intx_route
, &intx_route
)) {
866 switch (dev
->assigned_irq_type
) {
867 case ASSIGNED_IRQ_INTX_HOST_INTX
:
868 case ASSIGNED_IRQ_INTX_HOST_MSI
:
869 intx_host_msi
= dev
->assigned_irq_type
== ASSIGNED_IRQ_INTX_HOST_MSI
;
870 r
= kvm_device_intx_deassign(kvm_state
, dev
->dev_id
, intx_host_msi
);
872 case ASSIGNED_IRQ_MSI
:
873 r
= kvm_device_msi_deassign(kvm_state
, dev
->dev_id
);
875 case ASSIGNED_IRQ_MSIX
:
876 r
= kvm_device_msix_deassign(kvm_state
, dev
->dev_id
);
883 perror("assign_intx: deassignment of previous interrupt failed");
885 dev
->assigned_irq_type
= ASSIGNED_IRQ_NONE
;
887 if (intx_route
.mode
== PCI_INTX_DISABLED
) {
888 dev
->intx_route
= intx_route
;
893 if (dev
->features
& ASSIGNED_DEVICE_PREFER_MSI_MASK
&&
894 dev
->cap
.available
& ASSIGNED_DEVICE_CAP_MSI
) {
895 intx_host_msi
= true;
896 new_type
= ASSIGNED_IRQ_INTX_HOST_MSI
;
898 intx_host_msi
= false;
899 new_type
= ASSIGNED_IRQ_INTX_HOST_INTX
;
902 r
= kvm_device_intx_assign(kvm_state
, dev
->dev_id
, intx_host_msi
,
905 if (r
== -EIO
&& !(dev
->features
& ASSIGNED_DEVICE_PREFER_MSI_MASK
) &&
906 dev
->cap
.available
& ASSIGNED_DEVICE_CAP_MSI
) {
907 /* Retry with host-side MSI. There might be an IRQ conflict and
908 * either the kernel or the device doesn't support sharing. */
909 error_report("Host-side INTx sharing not supported, "
910 "using MSI instead");
911 error_printf("Some devices do not work properly in this mode.\n");
912 dev
->features
|= ASSIGNED_DEVICE_PREFER_MSI_MASK
;
915 error_setg_errno(errp
, -r
,
916 "Failed to assign irq for \"%s\"\n"
917 "Perhaps you are assigning a device "
918 "that shares an IRQ with another device?",
923 dev
->intx_route
= intx_route
;
924 dev
->assigned_irq_type
= new_type
;
928 static void deassign_device(AssignedDevice
*dev
)
932 r
= kvm_device_pci_deassign(kvm_state
, dev
->dev_id
);
936 /* The pci config space got updated. Check if irq numbers have changed
939 static void assigned_dev_update_irq_routing(PCIDevice
*dev
)
941 AssignedDevice
*assigned_dev
= PCI_ASSIGN(dev
);
945 r
= assign_intx(assigned_dev
, &err
);
947 error_report_err(err
);
949 qdev_unplug(&dev
->qdev
, &err
);
954 static void assigned_dev_update_msi(PCIDevice
*pci_dev
)
956 AssignedDevice
*assigned_dev
= PCI_ASSIGN(pci_dev
);
957 uint8_t ctrl_byte
= pci_get_byte(pci_dev
->config
+ pci_dev
->msi_cap
+
961 /* Some guests gratuitously disable MSI even if they're not using it,
962 * try to catch this by only deassigning irqs if the guest is using
963 * MSI or intends to start. */
964 if (assigned_dev
->assigned_irq_type
== ASSIGNED_IRQ_MSI
||
965 (ctrl_byte
& PCI_MSI_FLAGS_ENABLE
)) {
966 r
= kvm_device_msi_deassign(kvm_state
, assigned_dev
->dev_id
);
967 /* -ENXIO means no assigned irq */
968 if (r
&& r
!= -ENXIO
) {
969 perror("assigned_dev_update_msi: deassign irq");
972 free_msi_virqs(assigned_dev
);
974 assigned_dev
->assigned_irq_type
= ASSIGNED_IRQ_NONE
;
975 pci_device_set_intx_routing_notifier(pci_dev
, NULL
);
978 if (ctrl_byte
& PCI_MSI_FLAGS_ENABLE
) {
979 MSIMessage msg
= msi_get_message(pci_dev
, 0);
982 virq
= kvm_irqchip_add_msi_route(kvm_state
, msg
);
984 perror("assigned_dev_update_msi: kvm_irqchip_add_msi_route");
988 assigned_dev
->msi_virq
= g_malloc(sizeof(*assigned_dev
->msi_virq
));
989 assigned_dev
->msi_virq_nr
= 1;
990 assigned_dev
->msi_virq
[0] = virq
;
991 if (kvm_device_msi_assign(kvm_state
, assigned_dev
->dev_id
, virq
) < 0) {
992 perror("assigned_dev_update_msi: kvm_device_msi_assign");
995 assigned_dev
->intx_route
.mode
= PCI_INTX_DISABLED
;
996 assigned_dev
->intx_route
.irq
= -1;
997 assigned_dev
->assigned_irq_type
= ASSIGNED_IRQ_MSI
;
999 Error
*local_err
= NULL
;
1001 assign_intx(assigned_dev
, &local_err
);
1003 error_report_err(local_err
);
1008 static void assigned_dev_update_msi_msg(PCIDevice
*pci_dev
)
1010 AssignedDevice
*assigned_dev
= PCI_ASSIGN(pci_dev
);
1011 uint8_t ctrl_byte
= pci_get_byte(pci_dev
->config
+ pci_dev
->msi_cap
+
1014 if (assigned_dev
->assigned_irq_type
!= ASSIGNED_IRQ_MSI
||
1015 !(ctrl_byte
& PCI_MSI_FLAGS_ENABLE
)) {
1019 kvm_irqchip_update_msi_route(kvm_state
, assigned_dev
->msi_virq
[0],
1020 msi_get_message(pci_dev
, 0));
1023 static bool assigned_dev_msix_masked(MSIXTableEntry
*entry
)
1025 return (entry
->ctrl
& cpu_to_le32(0x1)) != 0;
1029 * When MSI-X is first enabled the vector table typically has all the
1030 * vectors masked, so we can't use that as the obvious test to figure out
1031 * how many vectors to initially enable. Instead we look at the data field
1032 * because this is what worked for pci-assign for a long time. This makes
1033 * sure the physical MSI-X state tracks the guest's view, which is important
1034 * for some VF/PF and PF/fw communication channels.
1036 static bool assigned_dev_msix_skipped(MSIXTableEntry
*entry
)
1038 return !entry
->data
;
1041 static int assigned_dev_update_msix_mmio(PCIDevice
*pci_dev
)
1043 AssignedDevice
*adev
= PCI_ASSIGN(pci_dev
);
1044 uint16_t entries_nr
= 0;
1046 MSIXTableEntry
*entry
= adev
->msix_table
;
1049 /* Get the usable entry number for allocating */
1050 for (i
= 0; i
< adev
->msix_max
; i
++, entry
++) {
1051 if (assigned_dev_msix_skipped(entry
)) {
1057 DEBUG("MSI-X entries: %d\n", entries_nr
);
1059 /* It's valid to enable MSI-X with all entries masked */
1064 r
= kvm_device_msix_init_vectors(kvm_state
, adev
->dev_id
, entries_nr
);
1066 error_report("fail to set MSI-X entry number for MSIX! %s",
1071 free_msi_virqs(adev
);
1073 adev
->msi_virq_nr
= adev
->msix_max
;
1074 adev
->msi_virq
= g_malloc(adev
->msix_max
* sizeof(*adev
->msi_virq
));
1076 entry
= adev
->msix_table
;
1077 for (i
= 0; i
< adev
->msix_max
; i
++, entry
++) {
1078 adev
->msi_virq
[i
] = -1;
1080 if (assigned_dev_msix_skipped(entry
)) {
1084 msg
.address
= entry
->addr_lo
| ((uint64_t)entry
->addr_hi
<< 32);
1085 msg
.data
= entry
->data
;
1086 r
= kvm_irqchip_add_msi_route(kvm_state
, msg
);
1090 adev
->msi_virq
[i
] = r
;
1092 DEBUG("MSI-X vector %d, gsi %d, addr %08x_%08x, data %08x\n", i
,
1093 r
, entry
->addr_hi
, entry
->addr_lo
, entry
->data
);
1095 r
= kvm_device_msix_set_vector(kvm_state
, adev
->dev_id
, i
,
1098 error_report("fail to set MSI-X entry! %s", strerror(-r
));
1106 static void assigned_dev_update_msix(PCIDevice
*pci_dev
)
1108 AssignedDevice
*assigned_dev
= PCI_ASSIGN(pci_dev
);
1109 uint16_t ctrl_word
= pci_get_word(pci_dev
->config
+ pci_dev
->msix_cap
+
1113 /* Some guests gratuitously disable MSIX even if they're not using it,
1114 * try to catch this by only deassigning irqs if the guest is using
1115 * MSIX or intends to start. */
1116 if ((assigned_dev
->assigned_irq_type
== ASSIGNED_IRQ_MSIX
) ||
1117 (ctrl_word
& PCI_MSIX_FLAGS_ENABLE
)) {
1118 r
= kvm_device_msix_deassign(kvm_state
, assigned_dev
->dev_id
);
1119 /* -ENXIO means no assigned irq */
1120 if (r
&& r
!= -ENXIO
) {
1121 perror("assigned_dev_update_msix: deassign irq");
1124 free_msi_virqs(assigned_dev
);
1126 assigned_dev
->assigned_irq_type
= ASSIGNED_IRQ_NONE
;
1127 pci_device_set_intx_routing_notifier(pci_dev
, NULL
);
1130 if (ctrl_word
& PCI_MSIX_FLAGS_ENABLE
) {
1131 if (assigned_dev_update_msix_mmio(pci_dev
) < 0) {
1132 perror("assigned_dev_update_msix_mmio");
1136 if (assigned_dev
->msi_virq_nr
> 0) {
1137 if (kvm_device_msix_assign(kvm_state
, assigned_dev
->dev_id
) < 0) {
1138 perror("assigned_dev_enable_msix: assign irq");
1142 assigned_dev
->intx_route
.mode
= PCI_INTX_DISABLED
;
1143 assigned_dev
->intx_route
.irq
= -1;
1144 assigned_dev
->assigned_irq_type
= ASSIGNED_IRQ_MSIX
;
1146 Error
*local_err
= NULL
;
1148 assign_intx(assigned_dev
, &local_err
);
1150 error_report_err(local_err
);
1155 static uint32_t assigned_dev_pci_read_config(PCIDevice
*pci_dev
,
1156 uint32_t address
, int len
)
1158 AssignedDevice
*assigned_dev
= PCI_ASSIGN(pci_dev
);
1159 uint32_t virt_val
= pci_default_read_config(pci_dev
, address
, len
);
1160 uint32_t real_val
, emulate_mask
, full_emulation_mask
;
1163 memcpy(&emulate_mask
, assigned_dev
->emulate_config_read
+ address
, len
);
1164 emulate_mask
= le32_to_cpu(emulate_mask
);
1166 full_emulation_mask
= 0xffffffff >> (32 - len
* 8);
1168 if (emulate_mask
!= full_emulation_mask
) {
1169 real_val
= assigned_dev_pci_read(pci_dev
, address
, len
);
1170 return (virt_val
& emulate_mask
) | (real_val
& ~emulate_mask
);
1176 static void assigned_dev_pci_write_config(PCIDevice
*pci_dev
, uint32_t address
,
1177 uint32_t val
, int len
)
1179 AssignedDevice
*assigned_dev
= PCI_ASSIGN(pci_dev
);
1180 uint16_t old_cmd
= pci_get_word(pci_dev
->config
+ PCI_COMMAND
);
1181 uint32_t emulate_mask
, full_emulation_mask
;
1184 pci_default_write_config(pci_dev
, address
, val
, len
);
1186 if (kvm_has_intx_set_mask() &&
1187 range_covers_byte(address
, len
, PCI_COMMAND
+ 1)) {
1188 bool intx_masked
= (pci_get_word(pci_dev
->config
+ PCI_COMMAND
) &
1189 PCI_COMMAND_INTX_DISABLE
);
1191 if (intx_masked
!= !!(old_cmd
& PCI_COMMAND_INTX_DISABLE
)) {
1192 ret
= kvm_device_intx_set_mask(kvm_state
, assigned_dev
->dev_id
,
1195 perror("assigned_dev_pci_write_config: set intx mask");
1199 if (assigned_dev
->cap
.available
& ASSIGNED_DEVICE_CAP_MSI
) {
1200 if (range_covers_byte(address
, len
,
1201 pci_dev
->msi_cap
+ PCI_MSI_FLAGS
)) {
1202 assigned_dev_update_msi(pci_dev
);
1203 } else if (ranges_overlap(address
, len
, /* 32bit MSI only */
1204 pci_dev
->msi_cap
+ PCI_MSI_ADDRESS_LO
, 6)) {
1205 assigned_dev_update_msi_msg(pci_dev
);
1208 if (assigned_dev
->cap
.available
& ASSIGNED_DEVICE_CAP_MSIX
) {
1209 if (range_covers_byte(address
, len
,
1210 pci_dev
->msix_cap
+ PCI_MSIX_FLAGS
+ 1)) {
1211 assigned_dev_update_msix(pci_dev
);
1216 memcpy(&emulate_mask
, assigned_dev
->emulate_config_write
+ address
, len
);
1217 emulate_mask
= le32_to_cpu(emulate_mask
);
1219 full_emulation_mask
= 0xffffffff >> (32 - len
* 8);
1221 if (emulate_mask
!= full_emulation_mask
) {
1223 val
&= ~emulate_mask
;
1224 val
|= assigned_dev_pci_read(pci_dev
, address
, len
) & emulate_mask
;
1226 assigned_dev_pci_write(pci_dev
, address
, val
, len
);
1230 static void assigned_dev_setup_cap_read(AssignedDevice
*dev
, uint32_t offset
,
1233 assigned_dev_direct_config_read(dev
, offset
, len
);
1234 assigned_dev_emulate_config_read(dev
, offset
+ PCI_CAP_LIST_NEXT
, 1);
1237 static int assigned_device_pci_cap_init(PCIDevice
*pci_dev
, Error
**errp
)
1239 AssignedDevice
*dev
= PCI_ASSIGN(pci_dev
);
1240 PCIRegion
*pci_region
= dev
->real_device
.regions
;
1242 Error
*local_err
= NULL
;
1244 /* Clear initial capabilities pointer and status copied from hw */
1245 pci_set_byte(pci_dev
->config
+ PCI_CAPABILITY_LIST
, 0);
1246 pci_set_word(pci_dev
->config
+ PCI_STATUS
,
1247 pci_get_word(pci_dev
->config
+ PCI_STATUS
) &
1248 ~PCI_STATUS_CAP_LIST
);
1250 /* Expose MSI capability
1251 * MSI capability is the 1st capability in capability config */
1252 pos
= pci_find_cap_offset(pci_dev
, PCI_CAP_ID_MSI
, 0);
1253 if (pos
!= 0 && kvm_check_extension(kvm_state
, KVM_CAP_ASSIGN_DEV_IRQ
)) {
1254 verify_irqchip_in_kernel(&local_err
);
1256 error_propagate(errp
, local_err
);
1259 dev
->cap
.available
|= ASSIGNED_DEVICE_CAP_MSI
;
1260 /* Only 32-bit/no-mask currently supported */
1261 ret
= pci_add_capability2(pci_dev
, PCI_CAP_ID_MSI
, pos
, 10,
1264 error_propagate(errp
, local_err
);
1267 pci_dev
->msi_cap
= pos
;
1269 pci_set_word(pci_dev
->config
+ pos
+ PCI_MSI_FLAGS
,
1270 pci_get_word(pci_dev
->config
+ pos
+ PCI_MSI_FLAGS
) &
1271 PCI_MSI_FLAGS_QMASK
);
1272 pci_set_long(pci_dev
->config
+ pos
+ PCI_MSI_ADDRESS_LO
, 0);
1273 pci_set_word(pci_dev
->config
+ pos
+ PCI_MSI_DATA_32
, 0);
1275 /* Set writable fields */
1276 pci_set_word(pci_dev
->wmask
+ pos
+ PCI_MSI_FLAGS
,
1277 PCI_MSI_FLAGS_QSIZE
| PCI_MSI_FLAGS_ENABLE
);
1278 pci_set_long(pci_dev
->wmask
+ pos
+ PCI_MSI_ADDRESS_LO
, 0xfffffffc);
1279 pci_set_word(pci_dev
->wmask
+ pos
+ PCI_MSI_DATA_32
, 0xffff);
1281 /* Expose MSI-X capability */
1282 pos
= pci_find_cap_offset(pci_dev
, PCI_CAP_ID_MSIX
, 0);
1283 if (pos
!= 0 && kvm_device_msix_supported(kvm_state
)) {
1285 uint32_t msix_table_entry
;
1288 verify_irqchip_in_kernel(&local_err
);
1290 error_propagate(errp
, local_err
);
1293 dev
->cap
.available
|= ASSIGNED_DEVICE_CAP_MSIX
;
1294 ret
= pci_add_capability2(pci_dev
, PCI_CAP_ID_MSIX
, pos
, 12,
1297 error_propagate(errp
, local_err
);
1300 pci_dev
->msix_cap
= pos
;
1302 msix_max
= (pci_get_word(pci_dev
->config
+ pos
+ PCI_MSIX_FLAGS
) &
1303 PCI_MSIX_FLAGS_QSIZE
) + 1;
1304 msix_max
= MIN(msix_max
, KVM_MAX_MSIX_PER_DEV
);
1305 pci_set_word(pci_dev
->config
+ pos
+ PCI_MSIX_FLAGS
, msix_max
- 1);
1307 /* Only enable and function mask bits are writable */
1308 pci_set_word(pci_dev
->wmask
+ pos
+ PCI_MSIX_FLAGS
,
1309 PCI_MSIX_FLAGS_ENABLE
| PCI_MSIX_FLAGS_MASKALL
);
1311 msix_table_entry
= pci_get_long(pci_dev
->config
+ pos
+ PCI_MSIX_TABLE
);
1312 bar_nr
= msix_table_entry
& PCI_MSIX_FLAGS_BIRMASK
;
1313 msix_table_entry
&= ~PCI_MSIX_FLAGS_BIRMASK
;
1314 dev
->msix_table_addr
= pci_region
[bar_nr
].base_addr
+ msix_table_entry
;
1315 dev
->msix_max
= msix_max
;
1318 /* Minimal PM support, nothing writable, device appears to NAK changes */
1319 pos
= pci_find_cap_offset(pci_dev
, PCI_CAP_ID_PM
, 0);
1323 ret
= pci_add_capability2(pci_dev
, PCI_CAP_ID_PM
, pos
, PCI_PM_SIZEOF
,
1326 error_propagate(errp
, local_err
);
1330 assigned_dev_setup_cap_read(dev
, pos
, PCI_PM_SIZEOF
);
1332 pmc
= pci_get_word(pci_dev
->config
+ pos
+ PCI_CAP_FLAGS
);
1333 pmc
&= (PCI_PM_CAP_VER_MASK
| PCI_PM_CAP_DSI
);
1334 pci_set_word(pci_dev
->config
+ pos
+ PCI_CAP_FLAGS
, pmc
);
1336 /* assign_device will bring the device up to D0, so we don't need
1337 * to worry about doing that ourselves here. */
1338 pci_set_word(pci_dev
->config
+ pos
+ PCI_PM_CTRL
,
1339 PCI_PM_CTRL_NO_SOFT_RESET
);
1341 pci_set_byte(pci_dev
->config
+ pos
+ PCI_PM_PPB_EXTENSIONS
, 0);
1342 pci_set_byte(pci_dev
->config
+ pos
+ PCI_PM_DATA_REGISTER
, 0);
1345 pos
= pci_find_cap_offset(pci_dev
, PCI_CAP_ID_EXP
, 0);
1347 uint8_t version
, size
= 0;
1348 uint16_t type
, devctl
, lnksta
;
1349 uint32_t devcap
, lnkcap
;
1351 version
= pci_get_byte(pci_dev
->config
+ pos
+ PCI_EXP_FLAGS
);
1352 version
&= PCI_EXP_FLAGS_VERS
;
1355 } else if (version
== 2) {
1357 * Check for non-std size, accept reduced size to 0x34,
1358 * which is what bcm5761 implemented, violating the
1359 * PCIe v3.0 spec that regs should exist and be read as 0,
1360 * not optionally provided and shorten the struct size.
1362 size
= MIN(0x3c, PCI_CONFIG_SPACE_SIZE
- pos
);
1364 error_setg(errp
, "Invalid size PCIe cap-id 0x%x",
1367 } else if (size
!= 0x3c) {
1368 error_report("WARNING, %s: PCIe cap-id 0x%x has "
1369 "non-standard size 0x%x; std size should be 0x3c",
1370 __func__
, PCI_CAP_ID_EXP
, size
);
1372 } else if (version
== 0) {
1374 vid
= pci_get_word(pci_dev
->config
+ PCI_VENDOR_ID
);
1375 did
= pci_get_word(pci_dev
->config
+ PCI_DEVICE_ID
);
1376 if (vid
== PCI_VENDOR_ID_INTEL
&& did
== 0x10ed) {
1378 * quirk for Intel 82599 VF with invalid PCIe capability
1379 * version, should really be version 2 (same as PF)
1386 error_setg(errp
, "Unsupported PCI express capability version %d",
1391 ret
= pci_add_capability2(pci_dev
, PCI_CAP_ID_EXP
, pos
, size
,
1394 error_propagate(errp
, local_err
);
1398 assigned_dev_setup_cap_read(dev
, pos
, size
);
1400 type
= pci_get_word(pci_dev
->config
+ pos
+ PCI_EXP_FLAGS
);
1401 type
= (type
& PCI_EXP_FLAGS_TYPE
) >> 4;
1402 if (type
!= PCI_EXP_TYPE_ENDPOINT
&&
1403 type
!= PCI_EXP_TYPE_LEG_END
&& type
!= PCI_EXP_TYPE_RC_END
) {
1404 error_setg(errp
, "Device assignment only supports endpoint "
1405 "assignment, device type %d", type
);
1409 /* capabilities, pass existing read-only copy
1410 * PCI_EXP_FLAGS_IRQ: updated by hardware, should be direct read */
1412 /* device capabilities: hide FLR */
1413 devcap
= pci_get_long(pci_dev
->config
+ pos
+ PCI_EXP_DEVCAP
);
1414 devcap
&= ~PCI_EXP_DEVCAP_FLR
;
1415 pci_set_long(pci_dev
->config
+ pos
+ PCI_EXP_DEVCAP
, devcap
);
1417 /* device control: clear all error reporting enable bits, leaving
1418 * only a few host values. Note, these are
1419 * all writable, but not passed to hw.
1421 devctl
= pci_get_word(pci_dev
->config
+ pos
+ PCI_EXP_DEVCTL
);
1422 devctl
= (devctl
& (PCI_EXP_DEVCTL_READRQ
| PCI_EXP_DEVCTL_PAYLOAD
)) |
1423 PCI_EXP_DEVCTL_RELAX_EN
| PCI_EXP_DEVCTL_NOSNOOP_EN
;
1424 pci_set_word(pci_dev
->config
+ pos
+ PCI_EXP_DEVCTL
, devctl
);
1425 devctl
= PCI_EXP_DEVCTL_BCR_FLR
| PCI_EXP_DEVCTL_AUX_PME
;
1426 pci_set_word(pci_dev
->wmask
+ pos
+ PCI_EXP_DEVCTL
, ~devctl
);
1428 /* Clear device status */
1429 pci_set_word(pci_dev
->config
+ pos
+ PCI_EXP_DEVSTA
, 0);
1431 /* Link capabilities, expose links and latencues, clear reporting */
1432 lnkcap
= pci_get_long(pci_dev
->config
+ pos
+ PCI_EXP_LNKCAP
);
1433 lnkcap
&= (PCI_EXP_LNKCAP_SLS
| PCI_EXP_LNKCAP_MLW
|
1434 PCI_EXP_LNKCAP_ASPMS
| PCI_EXP_LNKCAP_L0SEL
|
1435 PCI_EXP_LNKCAP_L1EL
);
1436 pci_set_long(pci_dev
->config
+ pos
+ PCI_EXP_LNKCAP
, lnkcap
);
1438 /* Link control, pass existing read-only copy. Should be writable? */
1440 /* Link status, only expose current speed and width */
1441 lnksta
= pci_get_word(pci_dev
->config
+ pos
+ PCI_EXP_LNKSTA
);
1442 lnksta
&= (PCI_EXP_LNKSTA_CLS
| PCI_EXP_LNKSTA_NLW
);
1443 pci_set_word(pci_dev
->config
+ pos
+ PCI_EXP_LNKSTA
, lnksta
);
1446 /* Slot capabilities, control, status - not needed for endpoints */
1447 pci_set_long(pci_dev
->config
+ pos
+ PCI_EXP_SLTCAP
, 0);
1448 pci_set_word(pci_dev
->config
+ pos
+ PCI_EXP_SLTCTL
, 0);
1449 pci_set_word(pci_dev
->config
+ pos
+ PCI_EXP_SLTSTA
, 0);
1451 /* Root control, capabilities, status - not needed for endpoints */
1452 pci_set_word(pci_dev
->config
+ pos
+ PCI_EXP_RTCTL
, 0);
1453 pci_set_word(pci_dev
->config
+ pos
+ PCI_EXP_RTCAP
, 0);
1454 pci_set_long(pci_dev
->config
+ pos
+ PCI_EXP_RTSTA
, 0);
1456 /* Device capabilities/control 2, pass existing read-only copy */
1457 /* Link control 2, pass existing read-only copy */
1461 pos
= pci_find_cap_offset(pci_dev
, PCI_CAP_ID_PCIX
, 0);
1466 /* Only expose the minimum, 8 byte capability */
1467 ret
= pci_add_capability2(pci_dev
, PCI_CAP_ID_PCIX
, pos
, 8,
1470 error_propagate(errp
, local_err
);
1474 assigned_dev_setup_cap_read(dev
, pos
, 8);
1476 /* Command register, clear upper bits, including extended modes */
1477 cmd
= pci_get_word(pci_dev
->config
+ pos
+ PCI_X_CMD
);
1478 cmd
&= (PCI_X_CMD_DPERR_E
| PCI_X_CMD_ERO
| PCI_X_CMD_MAX_READ
|
1479 PCI_X_CMD_MAX_SPLIT
);
1480 pci_set_word(pci_dev
->config
+ pos
+ PCI_X_CMD
, cmd
);
1482 /* Status register, update with emulated PCI bus location, clear
1483 * error bits, leave the rest. */
1484 status
= pci_get_long(pci_dev
->config
+ pos
+ PCI_X_STATUS
);
1485 status
&= ~(PCI_X_STATUS_BUS
| PCI_X_STATUS_DEVFN
);
1486 status
|= pci_requester_id(pci_dev
);
1487 status
&= ~(PCI_X_STATUS_SPL_DISC
| PCI_X_STATUS_UNX_SPL
|
1488 PCI_X_STATUS_SPL_ERR
);
1489 pci_set_long(pci_dev
->config
+ pos
+ PCI_X_STATUS
, status
);
1492 pos
= pci_find_cap_offset(pci_dev
, PCI_CAP_ID_VPD
, 0);
1494 /* Direct R/W passthrough */
1495 ret
= pci_add_capability2(pci_dev
, PCI_CAP_ID_VPD
, pos
, 8,
1498 error_propagate(errp
, local_err
);
1502 assigned_dev_setup_cap_read(dev
, pos
, 8);
1504 /* direct write for cap content */
1505 assigned_dev_direct_config_write(dev
, pos
+ 2, 6);
1508 /* Devices can have multiple vendor capabilities, get them all */
1509 for (pos
= 0; (pos
= pci_find_cap_offset(pci_dev
, PCI_CAP_ID_VNDR
, pos
));
1510 pos
+= PCI_CAP_LIST_NEXT
) {
1511 uint8_t len
= pci_get_byte(pci_dev
->config
+ pos
+ PCI_CAP_FLAGS
);
1512 /* Direct R/W passthrough */
1513 ret
= pci_add_capability2(pci_dev
, PCI_CAP_ID_VNDR
, pos
, len
,
1516 error_propagate(errp
, local_err
);
1520 assigned_dev_setup_cap_read(dev
, pos
, len
);
1522 /* direct write for cap content */
1523 assigned_dev_direct_config_write(dev
, pos
+ 2, len
- 2);
1526 /* If real and virtual capability list status bits differ, virtualize the
1528 if ((pci_get_word(pci_dev
->config
+ PCI_STATUS
) & PCI_STATUS_CAP_LIST
) !=
1529 (assigned_dev_pci_read_byte(pci_dev
, PCI_STATUS
) &
1530 PCI_STATUS_CAP_LIST
)) {
1531 dev
->emulate_config_read
[PCI_STATUS
] |= PCI_STATUS_CAP_LIST
;
1538 assigned_dev_msix_mmio_read(void *opaque
, hwaddr addr
,
1541 AssignedDevice
*adev
= opaque
;
1544 memcpy(&val
, (void *)((uint8_t *)adev
->msix_table
+ addr
), size
);
1549 static void assigned_dev_msix_mmio_write(void *opaque
, hwaddr addr
,
1550 uint64_t val
, unsigned size
)
1552 AssignedDevice
*adev
= opaque
;
1553 PCIDevice
*pdev
= &adev
->dev
;
1555 MSIXTableEntry orig
;
1558 if (i
>= adev
->msix_max
) {
1559 return; /* Drop write */
1562 ctrl
= pci_get_word(pdev
->config
+ pdev
->msix_cap
+ PCI_MSIX_FLAGS
);
1564 DEBUG("write to MSI-X table offset 0x%lx, val 0x%lx\n", addr
, val
);
1566 if (ctrl
& PCI_MSIX_FLAGS_ENABLE
) {
1567 orig
= adev
->msix_table
[i
];
1570 memcpy((uint8_t *)adev
->msix_table
+ addr
, &val
, size
);
1572 if (ctrl
& PCI_MSIX_FLAGS_ENABLE
) {
1573 MSIXTableEntry
*entry
= &adev
->msix_table
[i
];
1575 if (!assigned_dev_msix_masked(&orig
) &&
1576 assigned_dev_msix_masked(entry
)) {
1578 * Vector masked, disable it
1580 * XXX It's not clear if we can or should actually attempt
1581 * to mask or disable the interrupt. KVM doesn't have
1582 * support for pending bits and kvm_assign_set_msix_entry
1583 * doesn't modify the device hardware mask. Interrupts
1584 * while masked are simply not injected to the guest, so
1585 * are lost. Can we get away with always injecting an
1586 * interrupt on unmask?
1588 } else if (assigned_dev_msix_masked(&orig
) &&
1589 !assigned_dev_msix_masked(entry
)) {
1590 /* Vector unmasked */
1591 if (i
>= adev
->msi_virq_nr
|| adev
->msi_virq
[i
] < 0) {
1592 /* Previously unassigned vector, start from scratch */
1593 assigned_dev_update_msix(pdev
);
1596 /* Update an existing, previously masked vector */
1600 msg
.address
= entry
->addr_lo
|
1601 ((uint64_t)entry
->addr_hi
<< 32);
1602 msg
.data
= entry
->data
;
1604 ret
= kvm_irqchip_update_msi_route(kvm_state
,
1605 adev
->msi_virq
[i
], msg
);
1607 error_report("Error updating irq routing entry (%d)", ret
);
1614 static const MemoryRegionOps assigned_dev_msix_mmio_ops
= {
1615 .read
= assigned_dev_msix_mmio_read
,
1616 .write
= assigned_dev_msix_mmio_write
,
1617 .endianness
= DEVICE_NATIVE_ENDIAN
,
1619 .min_access_size
= 4,
1620 .max_access_size
= 8,
1623 .min_access_size
= 4,
1624 .max_access_size
= 8,
1628 static void assigned_dev_msix_reset(AssignedDevice
*dev
)
1630 MSIXTableEntry
*entry
;
1633 if (!dev
->msix_table
) {
1637 memset(dev
->msix_table
, 0, MSIX_PAGE_SIZE
);
1639 for (i
= 0, entry
= dev
->msix_table
; i
< dev
->msix_max
; i
++, entry
++) {
1640 entry
->ctrl
= cpu_to_le32(0x1); /* Masked */
1644 static void assigned_dev_register_msix_mmio(AssignedDevice
*dev
, Error
**errp
)
1646 dev
->msix_table
= mmap(NULL
, MSIX_PAGE_SIZE
, PROT_READ
|PROT_WRITE
,
1647 MAP_ANONYMOUS
|MAP_PRIVATE
, 0, 0);
1648 if (dev
->msix_table
== MAP_FAILED
) {
1649 error_setg_errno(errp
, errno
, "failed to allocate msix_table");
1650 dev
->msix_table
= NULL
;
1654 assigned_dev_msix_reset(dev
);
1656 memory_region_init_io(&dev
->mmio
, OBJECT(dev
), &assigned_dev_msix_mmio_ops
,
1657 dev
, "assigned-dev-msix", MSIX_PAGE_SIZE
);
1660 static void assigned_dev_unregister_msix_mmio(AssignedDevice
*dev
)
1662 if (!dev
->msix_table
) {
1666 if (munmap(dev
->msix_table
, MSIX_PAGE_SIZE
) == -1) {
1667 error_report("error unmapping msix_table! %s", strerror(errno
));
1669 dev
->msix_table
= NULL
;
1672 static const VMStateDescription vmstate_assigned_device
= {
1673 .name
= "pci-assign",
1677 static void reset_assigned_device(DeviceState
*dev
)
1679 PCIDevice
*pci_dev
= PCI_DEVICE(dev
);
1680 AssignedDevice
*adev
= PCI_ASSIGN(pci_dev
);
1681 char reset_file
[64];
1682 const char reset
[] = "1";
1686 * If a guest is reset without being shutdown, MSI/MSI-X can still
1687 * be running. We want to return the device to a known state on
1688 * reset, so disable those here. We especially do not want MSI-X
1689 * enabled since it lives in MMIO space, which is about to get
1692 if (adev
->assigned_irq_type
== ASSIGNED_IRQ_MSIX
) {
1693 uint16_t ctrl
= pci_get_word(pci_dev
->config
+
1694 pci_dev
->msix_cap
+ PCI_MSIX_FLAGS
);
1696 pci_set_word(pci_dev
->config
+ pci_dev
->msix_cap
+ PCI_MSIX_FLAGS
,
1697 ctrl
& ~PCI_MSIX_FLAGS_ENABLE
);
1698 assigned_dev_update_msix(pci_dev
);
1699 } else if (adev
->assigned_irq_type
== ASSIGNED_IRQ_MSI
) {
1700 uint8_t ctrl
= pci_get_byte(pci_dev
->config
+
1701 pci_dev
->msi_cap
+ PCI_MSI_FLAGS
);
1703 pci_set_byte(pci_dev
->config
+ pci_dev
->msi_cap
+ PCI_MSI_FLAGS
,
1704 ctrl
& ~PCI_MSI_FLAGS_ENABLE
);
1705 assigned_dev_update_msi(pci_dev
);
1708 snprintf(reset_file
, sizeof(reset_file
),
1709 "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/reset",
1710 adev
->host
.domain
, adev
->host
.bus
, adev
->host
.slot
,
1711 adev
->host
.function
);
1714 * Issue a device reset via pci-sysfs. Note that we use write(2) here
1715 * and ignore the return value because some kernels have a bug that
1716 * returns 0 rather than bytes written on success, sending us into an
1717 * infinite retry loop using other write mechanisms.
1719 fd
= open(reset_file
, O_WRONLY
);
1721 ret
= write(fd
, reset
, strlen(reset
));
1727 * When a 0 is written to the bus master register, the device is logically
1728 * disconnected from the PCI bus. This avoids further DMA transfers.
1730 assigned_dev_pci_write_config(pci_dev
, PCI_COMMAND
, 0, 1);
1733 static void assigned_realize(struct PCIDevice
*pci_dev
, Error
**errp
)
1735 AssignedDevice
*dev
= PCI_ASSIGN(pci_dev
);
1738 Error
*local_err
= NULL
;
1740 if (!kvm_enabled()) {
1741 error_setg(&local_err
, "pci-assign requires KVM support");
1742 goto exit_with_error
;
1745 if (!dev
->host
.domain
&& !dev
->host
.bus
&& !dev
->host
.slot
&&
1746 !dev
->host
.function
) {
1747 error_setg(&local_err
, "no host device specified");
1748 goto exit_with_error
;
1752 * Set up basic config space access control. Will be further refined during
1753 * device initialization.
1755 assigned_dev_emulate_config_read(dev
, 0, PCI_CONFIG_SPACE_SIZE
);
1756 assigned_dev_direct_config_read(dev
, PCI_STATUS
, 2);
1757 assigned_dev_direct_config_read(dev
, PCI_REVISION_ID
, 1);
1758 assigned_dev_direct_config_read(dev
, PCI_CLASS_PROG
, 3);
1759 assigned_dev_direct_config_read(dev
, PCI_CACHE_LINE_SIZE
, 1);
1760 assigned_dev_direct_config_read(dev
, PCI_LATENCY_TIMER
, 1);
1761 assigned_dev_direct_config_read(dev
, PCI_BIST
, 1);
1762 assigned_dev_direct_config_read(dev
, PCI_CARDBUS_CIS
, 4);
1763 assigned_dev_direct_config_read(dev
, PCI_SUBSYSTEM_VENDOR_ID
, 2);
1764 assigned_dev_direct_config_read(dev
, PCI_SUBSYSTEM_ID
, 2);
1765 assigned_dev_direct_config_read(dev
, PCI_CAPABILITY_LIST
+ 1, 7);
1766 assigned_dev_direct_config_read(dev
, PCI_MIN_GNT
, 1);
1767 assigned_dev_direct_config_read(dev
, PCI_MAX_LAT
, 1);
1768 memcpy(dev
->emulate_config_write
, dev
->emulate_config_read
,
1769 sizeof(dev
->emulate_config_read
));
1771 get_real_device(dev
, &local_err
);
1776 if (assigned_device_pci_cap_init(pci_dev
, &local_err
) < 0) {
1780 /* intercept MSI-X entry page in the MMIO */
1781 if (dev
->cap
.available
& ASSIGNED_DEVICE_CAP_MSIX
) {
1782 assigned_dev_register_msix_mmio(dev
, &local_err
);
1788 /* handle real device's MMIO/PIO BARs */
1789 assigned_dev_register_regions(dev
->real_device
.regions
,
1790 dev
->real_device
.region_number
, dev
,
1796 /* handle interrupt routing */
1797 e_intx
= dev
->dev
.config
[PCI_INTERRUPT_PIN
] - 1;
1798 dev
->intpin
= e_intx
;
1799 dev
->intx_route
.mode
= PCI_INTX_DISABLED
;
1800 dev
->intx_route
.irq
= -1;
1802 /* assign device to guest */
1803 assign_device(dev
, &local_err
);
1808 /* assign legacy INTx to the device */
1809 r
= assign_intx(dev
, &local_err
);
1814 assigned_dev_load_option_rom(dev
);
1819 deassign_device(dev
);
1822 free_assigned_device(dev
);
1826 error_propagate(errp
, local_err
);
1829 static void assigned_exitfn(struct PCIDevice
*pci_dev
)
1831 AssignedDevice
*dev
= PCI_ASSIGN(pci_dev
);
1833 deassign_device(dev
);
1834 free_assigned_device(dev
);
1837 static void assigned_dev_instance_init(Object
*obj
)
1839 PCIDevice
*pci_dev
= PCI_DEVICE(obj
);
1840 AssignedDevice
*d
= PCI_ASSIGN(pci_dev
);
1842 device_add_bootindex_property(obj
, &d
->bootindex
,
1844 &pci_dev
->qdev
, NULL
);
1847 static Property assigned_dev_properties
[] = {
1848 DEFINE_PROP_PCI_HOST_DEVADDR("host", AssignedDevice
, host
),
1849 DEFINE_PROP_BIT("prefer_msi", AssignedDevice
, features
,
1850 ASSIGNED_DEVICE_PREFER_MSI_BIT
, false),
1851 DEFINE_PROP_BIT("share_intx", AssignedDevice
, features
,
1852 ASSIGNED_DEVICE_SHARE_INTX_BIT
, true),
1853 DEFINE_PROP_STRING("configfd", AssignedDevice
, configfd_name
),
1854 DEFINE_PROP_END_OF_LIST(),
1857 static void assign_class_init(ObjectClass
*klass
, void *data
)
1859 PCIDeviceClass
*k
= PCI_DEVICE_CLASS(klass
);
1860 DeviceClass
*dc
= DEVICE_CLASS(klass
);
1862 k
->realize
= assigned_realize
;
1863 k
->exit
= assigned_exitfn
;
1864 k
->config_read
= assigned_dev_pci_read_config
;
1865 k
->config_write
= assigned_dev_pci_write_config
;
1866 dc
->props
= assigned_dev_properties
;
1867 dc
->vmsd
= &vmstate_assigned_device
;
1868 dc
->reset
= reset_assigned_device
;
1869 set_bit(DEVICE_CATEGORY_MISC
, dc
->categories
);
1870 dc
->desc
= "KVM-based PCI passthrough";
1873 static const TypeInfo assign_info
= {
1874 .name
= TYPE_PCI_ASSIGN
,
1875 .parent
= TYPE_PCI_DEVICE
,
1876 .instance_size
= sizeof(AssignedDevice
),
1877 .class_init
= assign_class_init
,
1878 .instance_init
= assigned_dev_instance_init
,
1881 static void assign_register_types(void)
1883 type_register_static(&assign_info
);
1886 type_init(assign_register_types
)
1888 static void assigned_dev_load_option_rom(AssignedDevice
*dev
)
1892 pci_assign_dev_load_option_rom(&dev
->dev
, OBJECT(dev
), &size
,
1893 dev
->host
.domain
, dev
->host
.bus
,
1894 dev
->host
.slot
, dev
->host
.function
);
1897 error_report("pci-assign: Invalid ROM.");