2 * Copyright (c) 2007, Neocleus Corporation.
4 * This work is licensed under the terms of the GNU GPL, version 2. See
5 * the COPYING file in the top-level directory.
8 * Assign a PCI device from the host to a guest VM.
10 * This implementation uses the classic device assignment interface of KVM
11 * and is only available on x86 hosts. It is expected to be obsoleted by VFIO
12 * based device assignment.
14 * Adapted for KVM (qemu-kvm) by Qumranet. QEMU version was based on qemu-kvm
15 * revision 4144fe9d48. See its repository for the history.
17 * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
18 * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
19 * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
20 * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
21 * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
27 #include <sys/types.h>
31 #include "qemu-error.h"
33 #include "hw/loader.h"
41 #define MSIX_PAGE_SIZE 0x1000
43 /* From linux/ioport.h */
44 #define IORESOURCE_IO 0x00000100 /* Resource type */
45 #define IORESOURCE_MEM 0x00000200
46 #define IORESOURCE_IRQ 0x00000400
47 #define IORESOURCE_DMA 0x00000800
48 #define IORESOURCE_PREFETCH 0x00002000 /* No side effects */
50 //#define DEVICE_ASSIGNMENT_DEBUG
52 #ifdef DEVICE_ASSIGNMENT_DEBUG
53 #define DEBUG(fmt, ...) \
55 fprintf(stderr, "%s: " fmt, __func__ , __VA_ARGS__); \
58 #define DEBUG(fmt, ...)
61 typedef struct PCIRegion
{
62 int type
; /* Memory or port I/O */
65 uint64_t size
; /* size of the region */
69 typedef struct PCIDevRegions
{
70 uint8_t bus
, dev
, func
; /* Bus inside domain, device and function */
71 int irq
; /* IRQ number */
72 uint16_t region_number
; /* number of active regions */
74 /* Port I/O or MMIO Regions */
75 PCIRegion regions
[PCI_NUM_REGIONS
- 1];
79 typedef struct AssignedDevRegion
{
80 MemoryRegion container
;
81 MemoryRegion real_iomem
;
83 uint8_t *r_virtbase
; /* mmapped access address for memory regions */
84 uint32_t r_baseport
; /* the base guest port for I/O regions */
86 pcibus_t e_size
; /* emulated size of region in bytes */
87 pcibus_t r_size
; /* real size of region in bytes */
91 #define ASSIGNED_DEVICE_PREFER_MSI_BIT 0
92 #define ASSIGNED_DEVICE_SHARE_INTX_BIT 1
94 #define ASSIGNED_DEVICE_PREFER_MSI_MASK (1 << ASSIGNED_DEVICE_PREFER_MSI_BIT)
95 #define ASSIGNED_DEVICE_SHARE_INTX_MASK (1 << ASSIGNED_DEVICE_SHARE_INTX_BIT)
97 typedef struct MSIXTableEntry
{
104 typedef enum AssignedIRQType
{
105 ASSIGNED_IRQ_NONE
= 0,
106 ASSIGNED_IRQ_INTX_HOST_INTX
,
107 ASSIGNED_IRQ_INTX_HOST_MSI
,
112 typedef struct AssignedDevice
{
114 PCIHostDeviceAddress host
;
118 AssignedDevRegion v_addrs
[PCI_NUM_REGIONS
- 1];
119 PCIDevRegions real_device
;
120 PCIINTxRoute intx_route
;
121 AssignedIRQType assigned_irq_type
;
123 #define ASSIGNED_DEVICE_CAP_MSI (1 << 0)
124 #define ASSIGNED_DEVICE_CAP_MSIX (1 << 1)
126 #define ASSIGNED_DEVICE_MSI_ENABLED (1 << 0)
127 #define ASSIGNED_DEVICE_MSIX_ENABLED (1 << 1)
128 #define ASSIGNED_DEVICE_MSIX_MASKED (1 << 2)
131 uint8_t emulate_config_read
[PCI_CONFIG_SPACE_SIZE
];
132 uint8_t emulate_config_write
[PCI_CONFIG_SPACE_SIZE
];
135 MSIXTableEntry
*msix_table
;
136 target_phys_addr_t msix_table_addr
;
143 static void assigned_dev_update_irq_routing(PCIDevice
*dev
);
145 static void assigned_dev_load_option_rom(AssignedDevice
*dev
);
147 static void assigned_dev_unregister_msix_mmio(AssignedDevice
*dev
);
149 static uint64_t assigned_dev_ioport_rw(AssignedDevRegion
*dev_region
,
150 target_phys_addr_t addr
, int size
,
154 int fd
= dev_region
->region
->resource_fd
;
158 DEBUG("pwrite data=%" PRIx64
", size=%d, e_phys=" TARGET_FMT_plx
159 ", addr="TARGET_FMT_plx
"\n", *data
, size
, addr
, addr
);
160 if (pwrite(fd
, data
, size
, addr
) != size
) {
161 error_report("%s - pwrite failed %s",
162 __func__
, strerror(errno
));
165 if (pread(fd
, &val
, size
, addr
) != size
) {
166 error_report("%s - pread failed %s",
167 __func__
, strerror(errno
));
168 val
= (1UL << (size
* 8)) - 1;
170 DEBUG("pread val=%" PRIx64
", size=%d, e_phys=" TARGET_FMT_plx
171 ", addr=" TARGET_FMT_plx
"\n", val
, size
, addr
, addr
);
174 uint32_t port
= addr
+ dev_region
->u
.r_baseport
;
177 DEBUG("out data=%" PRIx64
", size=%d, e_phys=" TARGET_FMT_plx
178 ", host=%x\n", *data
, size
, addr
, port
);
202 DEBUG("in data=%" PRIx64
", size=%d, e_phys=" TARGET_FMT_plx
203 ", host=%x\n", val
, size
, addr
, port
);
209 static void assigned_dev_ioport_write(void *opaque
, target_phys_addr_t addr
,
210 uint64_t data
, unsigned size
)
212 assigned_dev_ioport_rw(opaque
, addr
, size
, &data
);
215 static uint64_t assigned_dev_ioport_read(void *opaque
,
216 target_phys_addr_t addr
, unsigned size
)
218 return assigned_dev_ioport_rw(opaque
, addr
, size
, NULL
);
221 static uint32_t slow_bar_readb(void *opaque
, target_phys_addr_t addr
)
223 AssignedDevRegion
*d
= opaque
;
224 uint8_t *in
= d
->u
.r_virtbase
+ addr
;
228 DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx
" val=0x%08x\n", addr
, r
);
233 static uint32_t slow_bar_readw(void *opaque
, target_phys_addr_t addr
)
235 AssignedDevRegion
*d
= opaque
;
236 uint16_t *in
= (uint16_t *)(d
->u
.r_virtbase
+ addr
);
240 DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx
" val=0x%08x\n", addr
, r
);
245 static uint32_t slow_bar_readl(void *opaque
, target_phys_addr_t addr
)
247 AssignedDevRegion
*d
= opaque
;
248 uint32_t *in
= (uint32_t *)(d
->u
.r_virtbase
+ addr
);
252 DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx
" val=0x%08x\n", addr
, r
);
257 static void slow_bar_writeb(void *opaque
, target_phys_addr_t addr
, uint32_t val
)
259 AssignedDevRegion
*d
= opaque
;
260 uint8_t *out
= d
->u
.r_virtbase
+ addr
;
262 DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx
" val=0x%02x\n", addr
, val
);
266 static void slow_bar_writew(void *opaque
, target_phys_addr_t addr
, uint32_t val
)
268 AssignedDevRegion
*d
= opaque
;
269 uint16_t *out
= (uint16_t *)(d
->u
.r_virtbase
+ addr
);
271 DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx
" val=0x%04x\n", addr
, val
);
275 static void slow_bar_writel(void *opaque
, target_phys_addr_t addr
, uint32_t val
)
277 AssignedDevRegion
*d
= opaque
;
278 uint32_t *out
= (uint32_t *)(d
->u
.r_virtbase
+ addr
);
280 DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx
" val=0x%08x\n", addr
, val
);
284 static const MemoryRegionOps slow_bar_ops
= {
286 .read
= { slow_bar_readb
, slow_bar_readw
, slow_bar_readl
, },
287 .write
= { slow_bar_writeb
, slow_bar_writew
, slow_bar_writel
, },
289 .endianness
= DEVICE_NATIVE_ENDIAN
,
292 static void assigned_dev_iomem_setup(PCIDevice
*pci_dev
, int region_num
,
295 AssignedDevice
*r_dev
= DO_UPCAST(AssignedDevice
, dev
, pci_dev
);
296 AssignedDevRegion
*region
= &r_dev
->v_addrs
[region_num
];
297 PCIRegion
*real_region
= &r_dev
->real_device
.regions
[region_num
];
300 memory_region_init(®ion
->container
, "assigned-dev-container",
302 memory_region_add_subregion(®ion
->container
, 0, ®ion
->real_iomem
);
304 /* deal with MSI-X MMIO page */
305 if (real_region
->base_addr
<= r_dev
->msix_table_addr
&&
306 real_region
->base_addr
+ real_region
->size
>
307 r_dev
->msix_table_addr
) {
308 uint64_t offset
= r_dev
->msix_table_addr
- real_region
->base_addr
;
310 memory_region_add_subregion_overlap(®ion
->container
,
318 static const MemoryRegionOps assigned_dev_ioport_ops
= {
319 .read
= assigned_dev_ioport_read
,
320 .write
= assigned_dev_ioport_write
,
321 .endianness
= DEVICE_NATIVE_ENDIAN
,
324 static void assigned_dev_ioport_setup(PCIDevice
*pci_dev
, int region_num
,
327 AssignedDevice
*r_dev
= DO_UPCAST(AssignedDevice
, dev
, pci_dev
);
328 AssignedDevRegion
*region
= &r_dev
->v_addrs
[region_num
];
330 region
->e_size
= size
;
331 memory_region_init(®ion
->container
, "assigned-dev-container", size
);
332 memory_region_init_io(®ion
->real_iomem
, &assigned_dev_ioport_ops
,
333 r_dev
->v_addrs
+ region_num
,
334 "assigned-dev-iomem", size
);
335 memory_region_add_subregion(®ion
->container
, 0, ®ion
->real_iomem
);
338 static uint32_t assigned_dev_pci_read(PCIDevice
*d
, int pos
, int len
)
340 AssignedDevice
*pci_dev
= DO_UPCAST(AssignedDevice
, dev
, d
);
343 int fd
= pci_dev
->real_device
.config_fd
;
346 ret
= pread(fd
, &val
, len
, pos
);
348 if ((ret
< 0) && (errno
== EINTR
|| errno
== EAGAIN
)) {
352 hw_error("pci read failed, ret = %zd errno = %d\n", ret
, errno
);
358 static uint8_t assigned_dev_pci_read_byte(PCIDevice
*d
, int pos
)
360 return (uint8_t)assigned_dev_pci_read(d
, pos
, 1);
363 static void assigned_dev_pci_write(PCIDevice
*d
, int pos
, uint32_t val
, int len
)
365 AssignedDevice
*pci_dev
= DO_UPCAST(AssignedDevice
, dev
, d
);
367 int fd
= pci_dev
->real_device
.config_fd
;
370 ret
= pwrite(fd
, &val
, len
, pos
);
372 if ((ret
< 0) && (errno
== EINTR
|| errno
== EAGAIN
)) {
376 hw_error("pci write failed, ret = %zd errno = %d\n", ret
, errno
);
380 static void assigned_dev_emulate_config_read(AssignedDevice
*dev
,
381 uint32_t offset
, uint32_t len
)
383 memset(dev
->emulate_config_read
+ offset
, 0xff, len
);
386 static void assigned_dev_direct_config_read(AssignedDevice
*dev
,
387 uint32_t offset
, uint32_t len
)
389 memset(dev
->emulate_config_read
+ offset
, 0, len
);
392 static void assigned_dev_direct_config_write(AssignedDevice
*dev
,
393 uint32_t offset
, uint32_t len
)
395 memset(dev
->emulate_config_write
+ offset
, 0, len
);
398 static uint8_t pci_find_cap_offset(PCIDevice
*d
, uint8_t cap
, uint8_t start
)
402 int pos
= start
? start
: PCI_CAPABILITY_LIST
;
405 status
= assigned_dev_pci_read_byte(d
, PCI_STATUS
);
406 if ((status
& PCI_STATUS_CAP_LIST
) == 0) {
411 pos
= assigned_dev_pci_read_byte(d
, pos
);
417 id
= assigned_dev_pci_read_byte(d
, pos
+ PCI_CAP_LIST_ID
);
426 pos
+= PCI_CAP_LIST_NEXT
;
431 static int assigned_dev_register_regions(PCIRegion
*io_regions
,
432 unsigned long regions_num
,
433 AssignedDevice
*pci_dev
)
436 PCIRegion
*cur_region
= io_regions
;
438 for (i
= 0; i
< regions_num
; i
++, cur_region
++) {
439 if (!cur_region
->valid
) {
443 /* handle memory io regions */
444 if (cur_region
->type
& IORESOURCE_MEM
) {
445 int t
= cur_region
->type
& IORESOURCE_PREFETCH
446 ? PCI_BASE_ADDRESS_MEM_PREFETCH
447 : PCI_BASE_ADDRESS_SPACE_MEMORY
;
449 /* map physical memory */
450 pci_dev
->v_addrs
[i
].u
.r_virtbase
= mmap(NULL
, cur_region
->size
,
451 PROT_WRITE
| PROT_READ
,
453 cur_region
->resource_fd
,
456 if (pci_dev
->v_addrs
[i
].u
.r_virtbase
== MAP_FAILED
) {
457 pci_dev
->v_addrs
[i
].u
.r_virtbase
= NULL
;
458 error_report("%s: Error: Couldn't mmap 0x%" PRIx64
"!",
459 __func__
, cur_region
->base_addr
);
463 pci_dev
->v_addrs
[i
].r_size
= cur_region
->size
;
464 pci_dev
->v_addrs
[i
].e_size
= 0;
467 pci_dev
->v_addrs
[i
].u
.r_virtbase
+=
468 (cur_region
->base_addr
& 0xFFF);
470 if (cur_region
->size
& 0xFFF) {
471 error_report("PCI region %d at address 0x%" PRIx64
" has "
472 "size 0x%" PRIx64
", which is not a multiple of "
473 "4K. You might experience some performance hit "
475 i
, cur_region
->base_addr
, cur_region
->size
);
476 memory_region_init_io(&pci_dev
->v_addrs
[i
].real_iomem
,
477 &slow_bar_ops
, &pci_dev
->v_addrs
[i
],
478 "assigned-dev-slow-bar",
481 void *virtbase
= pci_dev
->v_addrs
[i
].u
.r_virtbase
;
483 snprintf(name
, sizeof(name
), "%s.bar%d",
484 object_get_typename(OBJECT(pci_dev
)), i
);
485 memory_region_init_ram_ptr(&pci_dev
->v_addrs
[i
].real_iomem
,
486 name
, cur_region
->size
,
488 vmstate_register_ram(&pci_dev
->v_addrs
[i
].real_iomem
,
492 assigned_dev_iomem_setup(&pci_dev
->dev
, i
, cur_region
->size
);
493 pci_register_bar((PCIDevice
*) pci_dev
, i
, t
,
494 &pci_dev
->v_addrs
[i
].container
);
497 /* handle port io regions */
501 /* Test kernel support for ioport resource read/write. Old
502 * kernels return EIO. New kernels only allow 1/2/4 byte reads
503 * so should return EINVAL for a 3 byte read */
504 ret
= pread(pci_dev
->v_addrs
[i
].region
->resource_fd
, &val
, 3, 0);
506 error_report("Unexpected return from I/O port read: %d", ret
);
508 } else if (errno
!= EINVAL
) {
509 error_report("Kernel doesn't support ioport resource "
510 "access, hiding this region.");
511 close(pci_dev
->v_addrs
[i
].region
->resource_fd
);
512 cur_region
->valid
= 0;
516 pci_dev
->v_addrs
[i
].u
.r_baseport
= cur_region
->base_addr
;
517 pci_dev
->v_addrs
[i
].r_size
= cur_region
->size
;
518 pci_dev
->v_addrs
[i
].e_size
= 0;
520 assigned_dev_ioport_setup(&pci_dev
->dev
, i
, cur_region
->size
);
521 pci_register_bar((PCIDevice
*) pci_dev
, i
,
522 PCI_BASE_ADDRESS_SPACE_IO
,
523 &pci_dev
->v_addrs
[i
].container
);
531 static int get_real_id(const char *devpath
, const char *idname
, uint16_t *val
)
537 snprintf(name
, sizeof(name
), "%s%s", devpath
, idname
);
538 f
= fopen(name
, "r");
540 error_report("%s: %s: %m", __func__
, name
);
543 if (fscanf(f
, "%li\n", &id
) == 1) {
553 static int get_real_vendor_id(const char *devpath
, uint16_t *val
)
555 return get_real_id(devpath
, "vendor", val
);
558 static int get_real_device_id(const char *devpath
, uint16_t *val
)
560 return get_real_id(devpath
, "device", val
);
563 static int get_real_device(AssignedDevice
*pci_dev
, uint16_t r_seg
,
564 uint8_t r_bus
, uint8_t r_dev
, uint8_t r_func
)
566 char dir
[128], name
[128];
569 uint64_t start
, end
, size
, flags
;
572 PCIDevRegions
*dev
= &pci_dev
->real_device
;
574 dev
->region_number
= 0;
576 snprintf(dir
, sizeof(dir
), "/sys/bus/pci/devices/%04x:%02x:%02x.%x/",
577 r_seg
, r_bus
, r_dev
, r_func
);
579 snprintf(name
, sizeof(name
), "%sconfig", dir
);
581 if (pci_dev
->configfd_name
&& *pci_dev
->configfd_name
) {
582 if (qemu_isdigit(pci_dev
->configfd_name
[0])) {
583 dev
->config_fd
= strtol(pci_dev
->configfd_name
, NULL
, 0);
585 dev
->config_fd
= monitor_get_fd(cur_mon
, pci_dev
->configfd_name
);
586 if (dev
->config_fd
< 0) {
587 error_report("%s: (%s) unkown", __func__
,
588 pci_dev
->configfd_name
);
593 dev
->config_fd
= open(name
, O_RDWR
);
595 if (dev
->config_fd
== -1) {
596 error_report("%s: %s: %m", __func__
, name
);
601 r
= read(dev
->config_fd
, pci_dev
->dev
.config
,
602 pci_config_size(&pci_dev
->dev
));
604 if (errno
== EINTR
|| errno
== EAGAIN
) {
607 error_report("%s: read failed, errno = %d", __func__
, errno
);
610 /* Restore or clear multifunction, this is always controlled by qemu */
611 if (pci_dev
->dev
.cap_present
& QEMU_PCI_CAP_MULTIFUNCTION
) {
612 pci_dev
->dev
.config
[PCI_HEADER_TYPE
] |= PCI_HEADER_TYPE_MULTI_FUNCTION
;
614 pci_dev
->dev
.config
[PCI_HEADER_TYPE
] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION
;
617 /* Clear host resource mapping info. If we choose not to register a
618 * BAR, such as might be the case with the option ROM, we can get
619 * confusing, unwritable, residual addresses from the host here. */
620 memset(&pci_dev
->dev
.config
[PCI_BASE_ADDRESS_0
], 0, 24);
621 memset(&pci_dev
->dev
.config
[PCI_ROM_ADDRESS
], 0, 4);
623 snprintf(name
, sizeof(name
), "%sresource", dir
);
625 f
= fopen(name
, "r");
627 error_report("%s: %s: %m", __func__
, name
);
631 for (r
= 0; r
< PCI_ROM_SLOT
; r
++) {
632 if (fscanf(f
, "%" SCNi64
" %" SCNi64
" %" SCNi64
"\n",
633 &start
, &end
, &flags
) != 3) {
637 rp
= dev
->regions
+ r
;
639 rp
->resource_fd
= -1;
640 size
= end
- start
+ 1;
641 flags
&= IORESOURCE_IO
| IORESOURCE_MEM
| IORESOURCE_PREFETCH
;
642 if (size
== 0 || (flags
& ~IORESOURCE_PREFETCH
) == 0) {
645 if (flags
& IORESOURCE_MEM
) {
646 flags
&= ~IORESOURCE_IO
;
648 flags
&= ~IORESOURCE_PREFETCH
;
650 snprintf(name
, sizeof(name
), "%sresource%d", dir
, r
);
651 fd
= open(name
, O_RDWR
);
655 rp
->resource_fd
= fd
;
659 rp
->base_addr
= start
;
661 pci_dev
->v_addrs
[r
].region
= rp
;
662 DEBUG("region %d size %" PRIu64
" start 0x%" PRIx64
663 " type %d resource_fd %d\n",
664 r
, rp
->size
, start
, rp
->type
, rp
->resource_fd
);
669 /* read and fill vendor ID */
670 v
= get_real_vendor_id(dir
, &id
);
674 pci_dev
->dev
.config
[0] = id
& 0xff;
675 pci_dev
->dev
.config
[1] = (id
& 0xff00) >> 8;
677 /* read and fill device ID */
678 v
= get_real_device_id(dir
, &id
);
682 pci_dev
->dev
.config
[2] = id
& 0xff;
683 pci_dev
->dev
.config
[3] = (id
& 0xff00) >> 8;
685 pci_word_test_and_clear_mask(pci_dev
->emulate_config_write
+ PCI_COMMAND
,
686 PCI_COMMAND_MASTER
| PCI_COMMAND_INTX_DISABLE
);
688 dev
->region_number
= r
;
692 static void free_msi_virqs(AssignedDevice
*dev
)
696 for (i
= 0; i
< dev
->msi_virq_nr
; i
++) {
697 if (dev
->msi_virq
[i
] >= 0) {
698 kvm_irqchip_release_virq(kvm_state
, dev
->msi_virq
[i
]);
699 dev
->msi_virq
[i
] = -1;
702 g_free(dev
->msi_virq
);
703 dev
->msi_virq
= NULL
;
704 dev
->msi_virq_nr
= 0;
707 static void free_assigned_device(AssignedDevice
*dev
)
711 if (dev
->cap
.available
& ASSIGNED_DEVICE_CAP_MSIX
) {
712 assigned_dev_unregister_msix_mmio(dev
);
714 for (i
= 0; i
< dev
->real_device
.region_number
; i
++) {
715 PCIRegion
*pci_region
= &dev
->real_device
.regions
[i
];
716 AssignedDevRegion
*region
= &dev
->v_addrs
[i
];
718 if (!pci_region
->valid
) {
721 if (pci_region
->type
& IORESOURCE_IO
) {
722 if (region
->u
.r_baseport
) {
723 memory_region_del_subregion(®ion
->container
,
724 ®ion
->real_iomem
);
725 memory_region_destroy(®ion
->real_iomem
);
726 memory_region_destroy(®ion
->container
);
728 } else if (pci_region
->type
& IORESOURCE_MEM
) {
729 if (region
->u
.r_virtbase
) {
730 memory_region_del_subregion(®ion
->container
,
731 ®ion
->real_iomem
);
733 /* Remove MSI-X table subregion */
734 if (pci_region
->base_addr
<= dev
->msix_table_addr
&&
735 pci_region
->base_addr
+ pci_region
->size
>
736 dev
->msix_table_addr
) {
737 memory_region_del_subregion(®ion
->container
,
741 memory_region_destroy(®ion
->real_iomem
);
742 memory_region_destroy(®ion
->container
);
743 if (munmap(region
->u
.r_virtbase
,
744 (pci_region
->size
+ 0xFFF) & 0xFFFFF000)) {
745 error_report("Failed to unmap assigned device region: %s",
750 if (pci_region
->resource_fd
>= 0) {
751 close(pci_region
->resource_fd
);
755 if (dev
->real_device
.config_fd
>= 0) {
756 close(dev
->real_device
.config_fd
);
762 static void assign_failed_examine(AssignedDevice
*dev
)
764 char name
[PATH_MAX
], dir
[PATH_MAX
], driver
[PATH_MAX
] = {}, *ns
;
765 uint16_t vendor_id
, device_id
;
768 snprintf(dir
, sizeof(dir
), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
769 dev
->host
.domain
, dev
->host
.bus
, dev
->host
.slot
,
772 snprintf(name
, sizeof(name
), "%sdriver", dir
);
774 r
= readlink(name
, driver
, sizeof(driver
));
775 if ((r
<= 0) || r
>= sizeof(driver
)) {
779 ns
= strrchr(driver
, '/');
786 if (get_real_vendor_id(dir
, &vendor_id
) ||
787 get_real_device_id(dir
, &device_id
)) {
791 error_report("*** The driver '%s' is occupying your device "
792 "%04x:%02x:%02x.%x.",
793 ns
, dev
->host
.domain
, dev
->host
.bus
, dev
->host
.slot
,
796 error_report("*** You can try the following commands to free it:");
798 error_report("*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub/"
799 "new_id", vendor_id
, device_id
);
800 error_report("*** $ echo \"%04x:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
802 dev
->host
.domain
, dev
->host
.bus
, dev
->host
.slot
,
803 dev
->host
.function
, ns
);
804 error_report("*** $ echo \"%04x:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
806 dev
->host
.domain
, dev
->host
.bus
, dev
->host
.slot
,
808 error_report("*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub"
809 "/remove_id", vendor_id
, device_id
);
815 error_report("Couldn't find out why.");
818 static int assign_device(AssignedDevice
*dev
)
820 uint32_t flags
= KVM_DEV_ASSIGN_ENABLE_IOMMU
;
823 /* Only pass non-zero PCI segment to capable module */
824 if (!kvm_check_extension(kvm_state
, KVM_CAP_PCI_SEGMENT
) &&
826 error_report("Can't assign device inside non-zero PCI segment "
827 "as this KVM module doesn't support it.");
831 if (!kvm_check_extension(kvm_state
, KVM_CAP_IOMMU
)) {
832 error_report("No IOMMU found. Unable to assign device \"%s\"",
837 if (dev
->features
& ASSIGNED_DEVICE_SHARE_INTX_MASK
&&
838 kvm_has_intx_set_mask()) {
839 flags
|= KVM_DEV_ASSIGN_PCI_2_3
;
842 r
= kvm_device_pci_assign(kvm_state
, &dev
->host
, flags
, &dev
->dev_id
);
844 error_report("Failed to assign device \"%s\" : %s",
845 dev
->dev
.qdev
.id
, strerror(-r
));
849 assign_failed_examine(dev
);
858 static bool check_irqchip_in_kernel(void)
860 if (kvm_irqchip_in_kernel()) {
863 error_report("pci-assign: error: requires KVM with in-kernel irqchip "
868 static int assign_intx(AssignedDevice
*dev
)
870 AssignedIRQType new_type
;
871 PCIINTxRoute intx_route
;
875 /* Interrupt PIN 0 means don't use INTx */
876 if (assigned_dev_pci_read_byte(&dev
->dev
, PCI_INTERRUPT_PIN
) == 0) {
877 pci_device_set_intx_routing_notifier(&dev
->dev
, NULL
);
881 if (!check_irqchip_in_kernel()) {
885 pci_device_set_intx_routing_notifier(&dev
->dev
,
886 assigned_dev_update_irq_routing
);
888 intx_route
= pci_device_route_intx_to_irq(&dev
->dev
, dev
->intpin
);
889 assert(intx_route
.mode
!= PCI_INTX_INVERTED
);
891 if (dev
->intx_route
.mode
== intx_route
.mode
&&
892 dev
->intx_route
.irq
== intx_route
.irq
) {
896 switch (dev
->assigned_irq_type
) {
897 case ASSIGNED_IRQ_INTX_HOST_INTX
:
898 case ASSIGNED_IRQ_INTX_HOST_MSI
:
899 intx_host_msi
= dev
->assigned_irq_type
== ASSIGNED_IRQ_INTX_HOST_MSI
;
900 r
= kvm_device_intx_deassign(kvm_state
, dev
->dev_id
, intx_host_msi
);
902 case ASSIGNED_IRQ_MSI
:
903 r
= kvm_device_msi_deassign(kvm_state
, dev
->dev_id
);
905 case ASSIGNED_IRQ_MSIX
:
906 r
= kvm_device_msix_deassign(kvm_state
, dev
->dev_id
);
913 perror("assign_intx: deassignment of previous interrupt failed");
915 dev
->assigned_irq_type
= ASSIGNED_IRQ_NONE
;
917 if (intx_route
.mode
== PCI_INTX_DISABLED
) {
918 dev
->intx_route
= intx_route
;
923 if (dev
->features
& ASSIGNED_DEVICE_PREFER_MSI_MASK
&&
924 dev
->cap
.available
& ASSIGNED_DEVICE_CAP_MSI
) {
925 intx_host_msi
= true;
926 new_type
= ASSIGNED_IRQ_INTX_HOST_MSI
;
928 intx_host_msi
= false;
929 new_type
= ASSIGNED_IRQ_INTX_HOST_INTX
;
932 r
= kvm_device_intx_assign(kvm_state
, dev
->dev_id
, intx_host_msi
,
935 if (r
== -EIO
&& !(dev
->features
& ASSIGNED_DEVICE_PREFER_MSI_MASK
) &&
936 dev
->cap
.available
& ASSIGNED_DEVICE_CAP_MSI
) {
937 /* Retry with host-side MSI. There might be an IRQ conflict and
938 * either the kernel or the device doesn't support sharing. */
939 error_report("Host-side INTx sharing not supported, "
940 "using MSI instead.\n"
941 "Some devices do not to work properly in this mode.");
942 dev
->features
|= ASSIGNED_DEVICE_PREFER_MSI_MASK
;
945 error_report("Failed to assign irq for \"%s\": %s",
946 dev
->dev
.qdev
.id
, strerror(-r
));
947 error_report("Perhaps you are assigning a device "
948 "that shares an IRQ with another device?");
952 dev
->intx_route
= intx_route
;
953 dev
->assigned_irq_type
= new_type
;
957 static void deassign_device(AssignedDevice
*dev
)
961 r
= kvm_device_pci_deassign(kvm_state
, dev
->dev_id
);
965 /* The pci config space got updated. Check if irq numbers have changed
968 static void assigned_dev_update_irq_routing(PCIDevice
*dev
)
970 AssignedDevice
*assigned_dev
= DO_UPCAST(AssignedDevice
, dev
, dev
);
974 r
= assign_intx(assigned_dev
);
976 qdev_unplug(&dev
->qdev
, &err
);
981 static void assigned_dev_update_msi(PCIDevice
*pci_dev
)
983 AssignedDevice
*assigned_dev
= DO_UPCAST(AssignedDevice
, dev
, pci_dev
);
984 uint8_t ctrl_byte
= pci_get_byte(pci_dev
->config
+ pci_dev
->msi_cap
+
988 /* Some guests gratuitously disable MSI even if they're not using it,
989 * try to catch this by only deassigning irqs if the guest is using
990 * MSI or intends to start. */
991 if (assigned_dev
->assigned_irq_type
== ASSIGNED_IRQ_MSI
||
992 (ctrl_byte
& PCI_MSI_FLAGS_ENABLE
)) {
993 r
= kvm_device_msi_deassign(kvm_state
, assigned_dev
->dev_id
);
994 /* -ENXIO means no assigned irq */
995 if (r
&& r
!= -ENXIO
) {
996 perror("assigned_dev_update_msi: deassign irq");
999 free_msi_virqs(assigned_dev
);
1001 assigned_dev
->assigned_irq_type
= ASSIGNED_IRQ_NONE
;
1002 pci_device_set_intx_routing_notifier(pci_dev
, NULL
);
1005 if (ctrl_byte
& PCI_MSI_FLAGS_ENABLE
) {
1006 uint8_t *pos
= pci_dev
->config
+ pci_dev
->msi_cap
;
1010 msg
.address
= pci_get_long(pos
+ PCI_MSI_ADDRESS_LO
);
1011 msg
.data
= pci_get_word(pos
+ PCI_MSI_DATA_32
);
1012 virq
= kvm_irqchip_add_msi_route(kvm_state
, msg
);
1014 perror("assigned_dev_update_msi: kvm_irqchip_add_msi_route");
1018 assigned_dev
->msi_virq
= g_malloc(sizeof(*assigned_dev
->msi_virq
));
1019 assigned_dev
->msi_virq_nr
= 1;
1020 assigned_dev
->msi_virq
[0] = virq
;
1021 if (kvm_device_msi_assign(kvm_state
, assigned_dev
->dev_id
, virq
) < 0) {
1022 perror("assigned_dev_update_msi: kvm_device_msi_assign");
1025 assigned_dev
->intx_route
.mode
= PCI_INTX_DISABLED
;
1026 assigned_dev
->intx_route
.irq
= -1;
1027 assigned_dev
->assigned_irq_type
= ASSIGNED_IRQ_MSI
;
1029 assign_intx(assigned_dev
);
1033 static bool assigned_dev_msix_masked(MSIXTableEntry
*entry
)
1035 return (entry
->ctrl
& cpu_to_le32(0x1)) != 0;
1038 static int assigned_dev_update_msix_mmio(PCIDevice
*pci_dev
)
1040 AssignedDevice
*adev
= DO_UPCAST(AssignedDevice
, dev
, pci_dev
);
1041 uint16_t entries_nr
= 0;
1043 MSIXTableEntry
*entry
= adev
->msix_table
;
1046 /* Get the usable entry number for allocating */
1047 for (i
= 0; i
< adev
->msix_max
; i
++, entry
++) {
1048 if (assigned_dev_msix_masked(entry
)) {
1054 DEBUG("MSI-X entries: %d\n", entries_nr
);
1056 /* It's valid to enable MSI-X with all entries masked */
1061 r
= kvm_device_msix_init_vectors(kvm_state
, adev
->dev_id
, entries_nr
);
1063 error_report("fail to set MSI-X entry number for MSIX! %s",
1068 free_msi_virqs(adev
);
1070 adev
->msi_virq_nr
= adev
->msix_max
;
1071 adev
->msi_virq
= g_malloc(adev
->msix_max
* sizeof(*adev
->msi_virq
));
1073 entry
= adev
->msix_table
;
1074 for (i
= 0; i
< adev
->msix_max
; i
++, entry
++) {
1075 adev
->msi_virq
[i
] = -1;
1077 if (assigned_dev_msix_masked(entry
)) {
1081 msg
.address
= entry
->addr_lo
| ((uint64_t)entry
->addr_hi
<< 32);
1082 msg
.data
= entry
->data
;
1083 r
= kvm_irqchip_add_msi_route(kvm_state
, msg
);
1087 adev
->msi_virq
[i
] = r
;
1089 DEBUG("MSI-X vector %d, gsi %d, addr %08x_%08x, data %08x\n", i
,
1090 r
, entry
->addr_hi
, entry
->addr_lo
, entry
->data
);
1092 r
= kvm_device_msix_set_vector(kvm_state
, adev
->dev_id
, i
,
1095 error_report("fail to set MSI-X entry! %s", strerror(-r
));
1103 static void assigned_dev_update_msix(PCIDevice
*pci_dev
)
1105 AssignedDevice
*assigned_dev
= DO_UPCAST(AssignedDevice
, dev
, pci_dev
);
1106 uint16_t ctrl_word
= pci_get_word(pci_dev
->config
+ pci_dev
->msix_cap
+
1110 /* Some guests gratuitously disable MSIX even if they're not using it,
1111 * try to catch this by only deassigning irqs if the guest is using
1112 * MSIX or intends to start. */
1113 if ((assigned_dev
->assigned_irq_type
== ASSIGNED_IRQ_MSIX
) ||
1114 (ctrl_word
& PCI_MSIX_FLAGS_ENABLE
)) {
1115 r
= kvm_device_msix_deassign(kvm_state
, assigned_dev
->dev_id
);
1116 /* -ENXIO means no assigned irq */
1117 if (r
&& r
!= -ENXIO
) {
1118 perror("assigned_dev_update_msix: deassign irq");
1121 free_msi_virqs(assigned_dev
);
1123 assigned_dev
->assigned_irq_type
= ASSIGNED_IRQ_NONE
;
1124 pci_device_set_intx_routing_notifier(pci_dev
, NULL
);
1127 if (ctrl_word
& PCI_MSIX_FLAGS_ENABLE
) {
1128 if (assigned_dev_update_msix_mmio(pci_dev
) < 0) {
1129 perror("assigned_dev_update_msix_mmio");
1133 if (assigned_dev
->msi_virq_nr
> 0) {
1134 if (kvm_device_msix_assign(kvm_state
, assigned_dev
->dev_id
) < 0) {
1135 perror("assigned_dev_enable_msix: assign irq");
1139 assigned_dev
->intx_route
.mode
= PCI_INTX_DISABLED
;
1140 assigned_dev
->intx_route
.irq
= -1;
1141 assigned_dev
->assigned_irq_type
= ASSIGNED_IRQ_MSIX
;
1143 assign_intx(assigned_dev
);
1147 static uint32_t assigned_dev_pci_read_config(PCIDevice
*pci_dev
,
1148 uint32_t address
, int len
)
1150 AssignedDevice
*assigned_dev
= DO_UPCAST(AssignedDevice
, dev
, pci_dev
);
1151 uint32_t virt_val
= pci_default_read_config(pci_dev
, address
, len
);
1152 uint32_t real_val
, emulate_mask
, full_emulation_mask
;
1155 memcpy(&emulate_mask
, assigned_dev
->emulate_config_read
+ address
, len
);
1156 emulate_mask
= le32_to_cpu(emulate_mask
);
1158 full_emulation_mask
= 0xffffffff >> (32 - len
* 8);
1160 if (emulate_mask
!= full_emulation_mask
) {
1161 real_val
= assigned_dev_pci_read(pci_dev
, address
, len
);
1162 return (virt_val
& emulate_mask
) | (real_val
& ~emulate_mask
);
1168 static void assigned_dev_pci_write_config(PCIDevice
*pci_dev
, uint32_t address
,
1169 uint32_t val
, int len
)
1171 AssignedDevice
*assigned_dev
= DO_UPCAST(AssignedDevice
, dev
, pci_dev
);
1172 uint16_t old_cmd
= pci_get_word(pci_dev
->config
+ PCI_COMMAND
);
1173 uint32_t emulate_mask
, full_emulation_mask
;
1176 pci_default_write_config(pci_dev
, address
, val
, len
);
1178 if (kvm_has_intx_set_mask() &&
1179 range_covers_byte(address
, len
, PCI_COMMAND
+ 1)) {
1180 bool intx_masked
= (pci_get_word(pci_dev
->config
+ PCI_COMMAND
) &
1181 PCI_COMMAND_INTX_DISABLE
);
1183 if (intx_masked
!= !!(old_cmd
& PCI_COMMAND_INTX_DISABLE
)) {
1184 ret
= kvm_device_intx_set_mask(kvm_state
, assigned_dev
->dev_id
,
1187 perror("assigned_dev_pci_write_config: set intx mask");
1191 if (assigned_dev
->cap
.available
& ASSIGNED_DEVICE_CAP_MSI
) {
1192 if (range_covers_byte(address
, len
,
1193 pci_dev
->msi_cap
+ PCI_MSI_FLAGS
)) {
1194 assigned_dev_update_msi(pci_dev
);
1197 if (assigned_dev
->cap
.available
& ASSIGNED_DEVICE_CAP_MSIX
) {
1198 if (range_covers_byte(address
, len
,
1199 pci_dev
->msix_cap
+ PCI_MSIX_FLAGS
+ 1)) {
1200 assigned_dev_update_msix(pci_dev
);
1205 memcpy(&emulate_mask
, assigned_dev
->emulate_config_write
+ address
, len
);
1206 emulate_mask
= le32_to_cpu(emulate_mask
);
1208 full_emulation_mask
= 0xffffffff >> (32 - len
* 8);
1210 if (emulate_mask
!= full_emulation_mask
) {
1212 val
&= ~emulate_mask
;
1213 val
|= assigned_dev_pci_read(pci_dev
, address
, len
) & emulate_mask
;
1215 assigned_dev_pci_write(pci_dev
, address
, val
, len
);
1219 static void assigned_dev_setup_cap_read(AssignedDevice
*dev
, uint32_t offset
,
1222 assigned_dev_direct_config_read(dev
, offset
, len
);
1223 assigned_dev_emulate_config_read(dev
, offset
+ PCI_CAP_LIST_NEXT
, 1);
/*
 * Build the guest-visible PCI capability list of an assigned device.
 *
 * The capability pointer and the PCI_STATUS_CAP_LIST bit copied from the
 * hardware are cleared first. The function then looks up, in this order,
 * the MSI, MSI-X, PM, PCI Express, PCI-X, VPD and vendor-specific
 * capabilities with pci_find_cap_offset() and re-adds each one with
 * pci_add_capability(), rewriting selected registers in the emulated
 * config space (pci_dev->config) and write masks (pci_dev->wmask).
 * For MSI-X it also records msix_table_addr and msix_max in the device.
 *
 * The caller checks for a negative return value to detect failure.
 * NOTE(review): several declarations, closing braces and the error-return
 * paths of this function are not visible in this excerpt.
 */
1226 static int assigned_device_pci_cap_init(PCIDevice
*pci_dev
)
1228 AssignedDevice
*dev
= DO_UPCAST(AssignedDevice
, dev
, pci_dev
);
1229 PCIRegion
*pci_region
= dev
->real_device
.regions
;
1232 /* Clear initial capabilities pointer and status copied from hw */
1233 pci_set_byte(pci_dev
->config
+ PCI_CAPABILITY_LIST
, 0);
1234 pci_set_word(pci_dev
->config
+ PCI_STATUS
,
1235 pci_get_word(pci_dev
->config
+ PCI_STATUS
) &
1236 ~PCI_STATUS_CAP_LIST
);
1238 /* Expose MSI capability
1239 * MSI capability is the 1st capability in capability config */
1240 pos
= pci_find_cap_offset(pci_dev
, PCI_CAP_ID_MSI
, 0);
1241 if (pos
!= 0 && kvm_check_extension(kvm_state
, KVM_CAP_ASSIGN_DEV_IRQ
)) {
1242 if (!check_irqchip_in_kernel()) {
1245 dev
->cap
.available
|= ASSIGNED_DEVICE_CAP_MSI
;
1246 /* Only 32-bit/no-mask currently supported */
1247 ret
= pci_add_capability(pci_dev
, PCI_CAP_ID_MSI
, pos
, 10);
1251 pci_dev
->msi_cap
= pos
;
1253 pci_set_word(pci_dev
->config
+ pos
+ PCI_MSI_FLAGS
,
1254 pci_get_word(pci_dev
->config
+ pos
+ PCI_MSI_FLAGS
) &
1255 PCI_MSI_FLAGS_QMASK
);
1256 pci_set_long(pci_dev
->config
+ pos
+ PCI_MSI_ADDRESS_LO
, 0);
1257 pci_set_word(pci_dev
->config
+ pos
+ PCI_MSI_DATA_32
, 0);
1259 /* Set writable fields */
1260 pci_set_word(pci_dev
->wmask
+ pos
+ PCI_MSI_FLAGS
,
1261 PCI_MSI_FLAGS_QSIZE
| PCI_MSI_FLAGS_ENABLE
);
1262 pci_set_long(pci_dev
->wmask
+ pos
+ PCI_MSI_ADDRESS_LO
, 0xfffffffc);
1263 pci_set_word(pci_dev
->wmask
+ pos
+ PCI_MSI_DATA_32
, 0xffff);
1265 /* Expose MSI-X capability */
1266 pos
= pci_find_cap_offset(pci_dev
, PCI_CAP_ID_MSIX
, 0);
1267 if (pos
!= 0 && kvm_device_msix_supported(kvm_state
)) {
1269 uint32_t msix_table_entry
;
1271 if (!check_irqchip_in_kernel()) {
1274 dev
->cap
.available
|= ASSIGNED_DEVICE_CAP_MSIX
;
1275 ret
= pci_add_capability(pci_dev
, PCI_CAP_ID_MSIX
, pos
, 12);
1279 pci_dev
->msix_cap
= pos
;
1281 pci_set_word(pci_dev
->config
+ pos
+ PCI_MSIX_FLAGS
,
1282 pci_get_word(pci_dev
->config
+ pos
+ PCI_MSIX_FLAGS
) &
1283 PCI_MSIX_FLAGS_QSIZE
);
1285 /* Only enable and function mask bits are writable */
1286 pci_set_word(pci_dev
->wmask
+ pos
+ PCI_MSIX_FLAGS
,
1287 PCI_MSIX_FLAGS_ENABLE
| PCI_MSIX_FLAGS_MASKALL
);
1289 msix_table_entry
= pci_get_long(pci_dev
->config
+ pos
+ PCI_MSIX_TABLE
);
1290 bar_nr
= msix_table_entry
& PCI_MSIX_FLAGS_BIRMASK
;
1291 msix_table_entry
&= ~PCI_MSIX_FLAGS_BIRMASK
;
1292 dev
->msix_table_addr
= pci_region
[bar_nr
].base_addr
+ msix_table_entry
;
1293 dev
->msix_max
= pci_get_word(pci_dev
->config
+ pos
+ PCI_MSIX_FLAGS
);
1294 dev
->msix_max
&= PCI_MSIX_FLAGS_QSIZE
;
1298 /* Minimal PM support, nothing writable, device appears to NAK changes */
1299 pos
= pci_find_cap_offset(pci_dev
, PCI_CAP_ID_PM
, 0);
1303 ret
= pci_add_capability(pci_dev
, PCI_CAP_ID_PM
, pos
, PCI_PM_SIZEOF
);
1308 assigned_dev_setup_cap_read(dev
, pos
, PCI_PM_SIZEOF
);
1310 pmc
= pci_get_word(pci_dev
->config
+ pos
+ PCI_CAP_FLAGS
);
1311 pmc
&= (PCI_PM_CAP_VER_MASK
| PCI_PM_CAP_DSI
);
1312 pci_set_word(pci_dev
->config
+ pos
+ PCI_CAP_FLAGS
, pmc
);
1314 /* assign_device will bring the device up to D0, so we don't need
1315 * to worry about doing that ourselves here. */
1316 pci_set_word(pci_dev
->config
+ pos
+ PCI_PM_CTRL
,
1317 PCI_PM_CTRL_NO_SOFT_RESET
);
1319 pci_set_byte(pci_dev
->config
+ pos
+ PCI_PM_PPB_EXTENSIONS
, 0);
1320 pci_set_byte(pci_dev
->config
+ pos
+ PCI_PM_DATA_REGISTER
, 0);
1323 pos
= pci_find_cap_offset(pci_dev
, PCI_CAP_ID_EXP
, 0);
1325 uint8_t version
, size
= 0;
1326 uint16_t type
, devctl
, lnksta
;
1327 uint32_t devcap
, lnkcap
;
1329 version
= pci_get_byte(pci_dev
->config
+ pos
+ PCI_EXP_FLAGS
);
1330 version
&= PCI_EXP_FLAGS_VERS
;
1333 } else if (version
== 2) {
1335 * Check for non-std size, accept reduced size to 0x34,
1336 * which is what bcm5761 implemented, violating the
1337 * PCIe v3.0 spec that regs should exist and be read as 0,
1338 * not optionally provided and shorten the struct size.
1340 size
= MIN(0x3c, PCI_CONFIG_SPACE_SIZE
- pos
);
1342 error_report("%s: Invalid size PCIe cap-id 0x%x",
1343 __func__
, PCI_CAP_ID_EXP
);
1345 } else if (size
!= 0x3c) {
1346 error_report("WARNING, %s: PCIe cap-id 0x%x has "
1347 "non-standard size 0x%x; std size should be 0x3c",
1348 __func__
, PCI_CAP_ID_EXP
, size
);
1350 } else if (version
== 0) {
1352 vid
= pci_get_word(pci_dev
->config
+ PCI_VENDOR_ID
);
1353 did
= pci_get_word(pci_dev
->config
+ PCI_DEVICE_ID
);
1354 if (vid
== PCI_VENDOR_ID_INTEL
&& did
== 0x10ed) {
1356 * quirk for Intel 82599 VF with invalid PCIe capability
1357 * version, should really be version 2 (same as PF)
1364 error_report("%s: Unsupported PCI express capability version %d",
1369 ret
= pci_add_capability(pci_dev
, PCI_CAP_ID_EXP
, pos
, size
);
1374 assigned_dev_setup_cap_read(dev
, pos
, size
);
1376 type
= pci_get_word(pci_dev
->config
+ pos
+ PCI_EXP_FLAGS
);
1377 type
= (type
& PCI_EXP_FLAGS_TYPE
) >> 4;
1378 if (type
!= PCI_EXP_TYPE_ENDPOINT
&&
1379 type
!= PCI_EXP_TYPE_LEG_END
&& type
!= PCI_EXP_TYPE_RC_END
) {
1380 error_report("Device assignment only supports endpoint assignment,"
1381 " device type %d", type
);
1385 /* capabilities, pass existing read-only copy
1386 * PCI_EXP_FLAGS_IRQ: updated by hardware, should be direct read */
1388 /* device capabilities: hide FLR */
1389 devcap
= pci_get_long(pci_dev
->config
+ pos
+ PCI_EXP_DEVCAP
);
1390 devcap
&= ~PCI_EXP_DEVCAP_FLR
;
1391 pci_set_long(pci_dev
->config
+ pos
+ PCI_EXP_DEVCAP
, devcap
);
1393 /* device control: clear all error reporting enable bits, leaving
1394 * only a few host values. Note, these are
1395 * all writable, but not passed to hw.
1397 devctl
= pci_get_word(pci_dev
->config
+ pos
+ PCI_EXP_DEVCTL
);
1398 devctl
= (devctl
& (PCI_EXP_DEVCTL_READRQ
| PCI_EXP_DEVCTL_PAYLOAD
)) |
1399 PCI_EXP_DEVCTL_RELAX_EN
| PCI_EXP_DEVCTL_NOSNOOP_EN
;
1400 pci_set_word(pci_dev
->config
+ pos
+ PCI_EXP_DEVCTL
, devctl
);
1401 devctl
= PCI_EXP_DEVCTL_BCR_FLR
| PCI_EXP_DEVCTL_AUX_PME
;
1402 pci_set_word(pci_dev
->wmask
+ pos
+ PCI_EXP_DEVCTL
, ~devctl
);
1404 /* Clear device status */
1405 pci_set_word(pci_dev
->config
+ pos
+ PCI_EXP_DEVSTA
, 0);
1407 /* Link capabilities, expose links and latencies, clear reporting */
1408 lnkcap
= pci_get_long(pci_dev
->config
+ pos
+ PCI_EXP_LNKCAP
);
1409 lnkcap
&= (PCI_EXP_LNKCAP_SLS
| PCI_EXP_LNKCAP_MLW
|
1410 PCI_EXP_LNKCAP_ASPMS
| PCI_EXP_LNKCAP_L0SEL
|
1411 PCI_EXP_LNKCAP_L1EL
);
1412 pci_set_long(pci_dev
->config
+ pos
+ PCI_EXP_LNKCAP
, lnkcap
);
1414 /* Link control, pass existing read-only copy. Should be writable? */
1416 /* Link status, only expose current speed and width */
1417 lnksta
= pci_get_word(pci_dev
->config
+ pos
+ PCI_EXP_LNKSTA
);
1418 lnksta
&= (PCI_EXP_LNKSTA_CLS
| PCI_EXP_LNKSTA_NLW
);
1419 pci_set_word(pci_dev
->config
+ pos
+ PCI_EXP_LNKSTA
, lnksta
);
1422 /* Slot capabilities, control, status - not needed for endpoints */
1423 pci_set_long(pci_dev
->config
+ pos
+ PCI_EXP_SLTCAP
, 0);
1424 pci_set_word(pci_dev
->config
+ pos
+ PCI_EXP_SLTCTL
, 0);
1425 pci_set_word(pci_dev
->config
+ pos
+ PCI_EXP_SLTSTA
, 0);
1427 /* Root control, capabilities, status - not needed for endpoints */
1428 pci_set_word(pci_dev
->config
+ pos
+ PCI_EXP_RTCTL
, 0);
1429 pci_set_word(pci_dev
->config
+ pos
+ PCI_EXP_RTCAP
, 0);
1430 pci_set_long(pci_dev
->config
+ pos
+ PCI_EXP_RTSTA
, 0);
1432 /* Device capabilities/control 2, pass existing read-only copy */
1433 /* Link control 2, pass existing read-only copy */
1437 pos
= pci_find_cap_offset(pci_dev
, PCI_CAP_ID_PCIX
, 0);
1442 /* Only expose the minimum, 8 byte capability */
1443 ret
= pci_add_capability(pci_dev
, PCI_CAP_ID_PCIX
, pos
, 8);
1448 assigned_dev_setup_cap_read(dev
, pos
, 8);
1450 /* Command register, clear upper bits, including extended modes */
1451 cmd
= pci_get_word(pci_dev
->config
+ pos
+ PCI_X_CMD
);
1452 cmd
&= (PCI_X_CMD_DPERR_E
| PCI_X_CMD_ERO
| PCI_X_CMD_MAX_READ
|
1453 PCI_X_CMD_MAX_SPLIT
);
1454 pci_set_word(pci_dev
->config
+ pos
+ PCI_X_CMD
, cmd
);
1456 /* Status register, update with emulated PCI bus location, clear
1457 * error bits, leave the rest. */
1458 status
= pci_get_long(pci_dev
->config
+ pos
+ PCI_X_STATUS
);
1459 status
&= ~(PCI_X_STATUS_BUS
| PCI_X_STATUS_DEVFN
);
1460 status
|= (pci_bus_num(pci_dev
->bus
) << 8) | pci_dev
->devfn
;
1461 status
&= ~(PCI_X_STATUS_SPL_DISC
| PCI_X_STATUS_UNX_SPL
|
1462 PCI_X_STATUS_SPL_ERR
);
1463 pci_set_long(pci_dev
->config
+ pos
+ PCI_X_STATUS
, status
);
1466 pos
= pci_find_cap_offset(pci_dev
, PCI_CAP_ID_VPD
, 0);
1468 /* Direct R/W passthrough */
1469 ret
= pci_add_capability(pci_dev
, PCI_CAP_ID_VPD
, pos
, 8);
1474 assigned_dev_setup_cap_read(dev
, pos
, 8);
1476 /* direct write for cap content */
1477 assigned_dev_direct_config_write(dev
, pos
+ 2, 6);
1480 /* Devices can have multiple vendor capabilities, get them all */
1481 for (pos
= 0; (pos
= pci_find_cap_offset(pci_dev
, PCI_CAP_ID_VNDR
, pos
));
1482 pos
+= PCI_CAP_LIST_NEXT
) {
1483 uint8_t len
= pci_get_byte(pci_dev
->config
+ pos
+ PCI_CAP_FLAGS
);
1484 /* Direct R/W passthrough */
1485 ret
= pci_add_capability(pci_dev
, PCI_CAP_ID_VNDR
, pos
, len
);
1490 assigned_dev_setup_cap_read(dev
, pos
, len
);
1492 /* direct write for cap content */
1493 assigned_dev_direct_config_write(dev
, pos
+ 2, len
- 2);
1496 /* If real and virtual capability list status bits differ, virtualize the
1498 if ((pci_get_word(pci_dev
->config
+ PCI_STATUS
) & PCI_STATUS_CAP_LIST
) !=
1499 (assigned_dev_pci_read_byte(pci_dev
, PCI_STATUS
) &
1500 PCI_STATUS_CAP_LIST
)) {
1501 dev
->emulate_config_read
[PCI_STATUS
] |= PCI_STATUS_CAP_LIST
;
/*
 * MMIO read callback for the emulated MSI-X table (installed as .read in
 * assigned_dev_msix_mmio_ops).
 *
 * Copies `size` bytes located `addr` bytes into adev->msix_table into the
 * local `val`.
 *
 * NOTE(review): the return type, the `size` parameter, the declaration of
 * `val` and the return statement are not visible in this excerpt. If `val`
 * is declared without an initialiser, a 4-byte access leaves its upper
 * bytes indeterminate -- worth confirming.
 */
1508 assigned_dev_msix_mmio_read(void *opaque
, target_phys_addr_t addr
,
1511 AssignedDevice
*adev
= opaque
;
1514 memcpy(&val
, (void *)((uint8_t *)adev
->msix_table
+ addr
), size
);
/*
 * MMIO write callback for the emulated MSI-X table (installed as .write in
 * assigned_dev_msix_mmio_ops).
 *
 * Writes whose entry index `i` is at or beyond adev->msix_max are dropped.
 * Otherwise `size` bytes of `val` are copied into the table at byte offset
 * `addr`. When the MSI-X enable bit is set in the emulated capability, the
 * entry is compared before and after the write:
 *   - unmasked -> masked: nothing is done (see the XXX note in the body);
 *   - masked -> unmasked: if the vector has no virq yet
 *     (i >= msi_virq_nr or msi_virq[i] < 0) the whole MSI-X setup is
 *     redone via assigned_dev_update_msix(); otherwise the existing route
 *     is updated with kvm_irqchip_update_msi_route() and a failure is
 *     reported with error_report().
 *
 * NOTE(review): the computation of `i` from `addr` and the declarations of
 * `ctrl`, `msg` and `ret` are not visible in this excerpt. The DEBUG format
 * uses %lx for `addr` and the uint64_t `val`; that only matches on hosts
 * where long is 64 bits wide.
 */
1519 static void assigned_dev_msix_mmio_write(void *opaque
, target_phys_addr_t addr
,
1520 uint64_t val
, unsigned size
)
1522 AssignedDevice
*adev
= opaque
;
1523 PCIDevice
*pdev
= &adev
->dev
;
1525 MSIXTableEntry orig
;
1528 if (i
>= adev
->msix_max
) {
1529 return; /* Drop write */
1532 ctrl
= pci_get_word(pdev
->config
+ pdev
->msix_cap
+ PCI_MSIX_FLAGS
);
1534 DEBUG("write to MSI-X table offset 0x%lx, val 0x%lx\n", addr
, val
);
1536 if (ctrl
& PCI_MSIX_FLAGS_ENABLE
) {
1537 orig
= adev
->msix_table
[i
];
1540 memcpy((uint8_t *)adev
->msix_table
+ addr
, &val
, size
);
1542 if (ctrl
& PCI_MSIX_FLAGS_ENABLE
) {
1543 MSIXTableEntry
*entry
= &adev
->msix_table
[i
];
1545 if (!assigned_dev_msix_masked(&orig
) &&
1546 assigned_dev_msix_masked(entry
)) {
1548 * Vector masked, disable it
1550 * XXX It's not clear if we can or should actually attempt
1551 * to mask or disable the interrupt. KVM doesn't have
1552 * support for pending bits and kvm_assign_set_msix_entry
1553 * doesn't modify the device hardware mask. Interrupts
1554 * while masked are simply not injected to the guest, so
1555 * are lost. Can we get away with always injecting an
1556 * interrupt on unmask?
1558 } else if (assigned_dev_msix_masked(&orig
) &&
1559 !assigned_dev_msix_masked(entry
)) {
1560 /* Vector unmasked */
1561 if (i
>= adev
->msi_virq_nr
|| adev
->msi_virq
[i
] < 0) {
1562 /* Previously unassigned vector, start from scratch */
1563 assigned_dev_update_msix(pdev
);
1566 /* Update an existing, previously masked vector */
1570 msg
.address
= entry
->addr_lo
|
1571 ((uint64_t)entry
->addr_hi
<< 32);
1572 msg
.data
= entry
->data
;
1574 ret
= kvm_irqchip_update_msi_route(kvm_state
,
1575 adev
->msi_virq
[i
], msg
);
1577 error_report("Error updating irq routing entry (%d)", ret
);
/*
 * Memory region callbacks for the emulated MSI-X table page: accesses are
 * routed to assigned_dev_msix_mmio_read() / assigned_dev_msix_mmio_write()
 * in device-native endianness, with access sizes of 4 to 8 bytes.
 *
 * NOTE(review): the two min/max_access_size pairs presumably belong to two
 * different nested members (e.g. .valid and .impl); the lines opening those
 * members are not visible in this excerpt.
 */
1584 static const MemoryRegionOps assigned_dev_msix_mmio_ops
= {
1585 .read
= assigned_dev_msix_mmio_read
,
1586 .write
= assigned_dev_msix_mmio_write
,
1587 .endianness
= DEVICE_NATIVE_ENDIAN
,
1589 .min_access_size
= 4,
1590 .max_access_size
= 8,
1593 .min_access_size
= 4,
1594 .max_access_size
= 8,
1598 static void assigned_dev_msix_reset(AssignedDevice
*dev
)
1600 MSIXTableEntry
*entry
;
1603 if (!dev
->msix_table
) {
1607 memset(dev
->msix_table
, 0, MSIX_PAGE_SIZE
);
1609 for (i
= 0, entry
= dev
->msix_table
; i
< dev
->msix_max
; i
++, entry
++) {
1610 entry
->ctrl
= cpu_to_le32(0x1); /* Masked */
/*
 * Allocate and register the emulated MSI-X table page.
 *
 * Maps one anonymous, private, read/write page of MSIX_PAGE_SIZE bytes into
 * dev->msix_table, reports an error if mmap() returns MAP_FAILED, resets
 * the table with assigned_dev_msix_reset() and initialises dev->mmio as an
 * I/O memory region of MSIX_PAGE_SIZE bytes backed by
 * assigned_dev_msix_mmio_ops, with `dev` as the opaque pointer.
 *
 * The caller treats a non-zero return as failure.
 * NOTE(review): the return statements are not visible in this excerpt.
 * mmap() is called with fd 0 together with MAP_ANONYMOUS; Linux ignores the
 * fd in that case, but portable code passes -1.
 */
1614 static int assigned_dev_register_msix_mmio(AssignedDevice
*dev
)
1616 dev
->msix_table
= mmap(NULL
, MSIX_PAGE_SIZE
, PROT_READ
|PROT_WRITE
,
1617 MAP_ANONYMOUS
|MAP_PRIVATE
, 0, 0);
1618 if (dev
->msix_table
== MAP_FAILED
) {
1619 error_report("fail allocate msix_table! %s", strerror(errno
));
1623 assigned_dev_msix_reset(dev
);
1625 memory_region_init_io(&dev
->mmio
, &assigned_dev_msix_mmio_ops
, dev
,
1626 "assigned-dev-msix", MSIX_PAGE_SIZE
);
/*
 * Tear down the emulated MSI-X table page.
 *
 * Checks whether dev->msix_table is set, destroys the dev->mmio memory
 * region, unmaps the MSIX_PAGE_SIZE table page (reporting, but otherwise
 * tolerating, a munmap() failure) and clears dev->msix_table.
 *
 * NOTE(review): the body of the initial !dev->msix_table check is not
 * visible in this excerpt; presumably it returns early.
 */
1630 static void assigned_dev_unregister_msix_mmio(AssignedDevice
*dev
)
1632 if (!dev
->msix_table
) {
1636 memory_region_destroy(&dev
->mmio
);
1638 if (munmap(dev
->msix_table
, MSIX_PAGE_SIZE
) == -1) {
1639 error_report("error unmapping msix_table! %s", strerror(errno
));
1641 dev
->msix_table
= NULL
;
/*
 * VMState description for the device, named "pci-assign"; it is installed
 * as dc->vmsd in assign_class_init().
 * NOTE(review): any further fields of this initialiser are not visible in
 * this excerpt.
 */
1644 static const VMStateDescription vmstate_assigned_device
= {
1645 .name
= "pci-assign",
/*
 * Device reset handler (installed as dc->reset in assign_class_init()).
 *
 * Steps visible in the body:
 *   1. If the device currently uses MSI-X or MSI, clear the enable bit in
 *      the emulated capability and call assigned_dev_update_msix() /
 *      assigned_dev_update_msi() so the change takes effect.
 *   2. Build the path of the host device's pci-sysfs "reset" file from the
 *      host domain/bus/slot/function, open it write-only and write "1".
 *   3. Write 0 to the low byte of PCI_COMMAND through
 *      assigned_dev_pci_write_config().
 *
 * NOTE(review): the declarations of `fd` and `ret`, the check of the
 * open() result and the close of the descriptor are not visible in this
 * excerpt.
 */
1649 static void reset_assigned_device(DeviceState
*dev
)
1651 PCIDevice
*pci_dev
= DO_UPCAST(PCIDevice
, qdev
, dev
);
1652 AssignedDevice
*adev
= DO_UPCAST(AssignedDevice
, dev
, pci_dev
);
1653 char reset_file
[64];
1654 const char reset
[] = "1";
1658 * If a guest is reset without being shutdown, MSI/MSI-X can still
1659 * be running. We want to return the device to a known state on
1660 * reset, so disable those here. We especially do not want MSI-X
1661 * enabled since it lives in MMIO space, which is about to get
1664 if (adev
->assigned_irq_type
== ASSIGNED_IRQ_MSIX
) {
1665 uint16_t ctrl
= pci_get_word(pci_dev
->config
+
1666 pci_dev
->msix_cap
+ PCI_MSIX_FLAGS
);
1668 pci_set_word(pci_dev
->config
+ pci_dev
->msix_cap
+ PCI_MSIX_FLAGS
,
1669 ctrl
& ~PCI_MSIX_FLAGS_ENABLE
);
1670 assigned_dev_update_msix(pci_dev
);
1671 } else if (adev
->assigned_irq_type
== ASSIGNED_IRQ_MSI
) {
1672 uint8_t ctrl
= pci_get_byte(pci_dev
->config
+
1673 pci_dev
->msi_cap
+ PCI_MSI_FLAGS
);
1675 pci_set_byte(pci_dev
->config
+ pci_dev
->msi_cap
+ PCI_MSI_FLAGS
,
1676 ctrl
& ~PCI_MSI_FLAGS_ENABLE
);
1677 assigned_dev_update_msi(pci_dev
);
1680 snprintf(reset_file
, sizeof(reset_file
),
1681 "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/reset",
1682 adev
->host
.domain
, adev
->host
.bus
, adev
->host
.slot
,
1683 adev
->host
.function
);
1686 * Issue a device reset via pci-sysfs. Note that we use write(2) here
1687 * and ignore the return value because some kernels have a bug that
1688 * returns 0 rather than bytes written on success, sending us into an
1689 * infinite retry loop using other write mechanisms.
1691 fd
= open(reset_file
, O_WRONLY
);
1693 ret
= write(fd
, reset
, strlen(reset
));
1699 * When a 0 is written to the bus master register, the device is logically
1700 * disconnected from the PCI bus. This avoids further DMA transfers.
1702 assigned_dev_pci_write_config(pci_dev
, PCI_COMMAND
, 0, 1);
/*
 * PCI device init hook (installed as k->init in assign_class_init()).
 *
 * Steps visible in the body, in order:
 *   1. Require KVM and a host device address; an address of all zeroes
 *      (domain, bus, slot and function) is reported as "no host device
 *      specified".
 *   2. Set up the basic config-space access map: emulate reads of the whole
 *      config space, then switch selected header fields to direct reads,
 *      and copy the resulting read map to the write map.
 *   3. Open the host device with get_real_device().
 *   4. Build the capability list with assigned_device_pci_cap_init().
 *   5. If MSI-X is available, register the MSI-X table MMIO page.
 *   6. Register the device's MMIO/PIO BARs.
 *   7. Initialise INTx routing state (pin taken from PCI_INTERRUPT_PIN,
 *      route disabled), then call assign_device() and assign_intx().
 *   8. Load the option ROM and add the boot device path.
 * A failure path calling deassign_device() and free_assigned_device() is
 * also visible at the end.
 *
 * NOTE(review): local declarations, the error checks after several calls,
 * goto labels and return statements are not visible in this excerpt.
 */
1705 static int assigned_initfn(struct PCIDevice
*pci_dev
)
1707 AssignedDevice
*dev
= DO_UPCAST(AssignedDevice
, dev
, pci_dev
);
1711 if (!kvm_enabled()) {
1712 error_report("pci-assign: error: requires KVM support");
1716 if (!dev
->host
.domain
&& !dev
->host
.bus
&& !dev
->host
.slot
&&
1717 !dev
->host
.function
) {
1718 error_report("pci-assign: error: no host device specified");
1723 * Set up basic config space access control. Will be further refined during
1724 * device initialization.
1726 assigned_dev_emulate_config_read(dev
, 0, PCI_CONFIG_SPACE_SIZE
);
1727 assigned_dev_direct_config_read(dev
, PCI_STATUS
, 2);
1728 assigned_dev_direct_config_read(dev
, PCI_REVISION_ID
, 1);
1729 assigned_dev_direct_config_read(dev
, PCI_CLASS_PROG
, 3);
1730 assigned_dev_direct_config_read(dev
, PCI_CACHE_LINE_SIZE
, 1);
1731 assigned_dev_direct_config_read(dev
, PCI_LATENCY_TIMER
, 1);
1732 assigned_dev_direct_config_read(dev
, PCI_BIST
, 1);
1733 assigned_dev_direct_config_read(dev
, PCI_CARDBUS_CIS
, 4);
1734 assigned_dev_direct_config_read(dev
, PCI_SUBSYSTEM_VENDOR_ID
, 2);
1735 assigned_dev_direct_config_read(dev
, PCI_SUBSYSTEM_ID
, 2);
1736 assigned_dev_direct_config_read(dev
, PCI_CAPABILITY_LIST
+ 1, 7);
1737 assigned_dev_direct_config_read(dev
, PCI_MIN_GNT
, 1);
1738 assigned_dev_direct_config_read(dev
, PCI_MAX_LAT
, 1);
1739 memcpy(dev
->emulate_config_write
, dev
->emulate_config_read
,
1740 sizeof(dev
->emulate_config_read
));
1742 if (get_real_device(dev
, dev
->host
.domain
, dev
->host
.bus
,
1743 dev
->host
.slot
, dev
->host
.function
)) {
1744 error_report("pci-assign: Error: Couldn't get real device (%s)!",
1749 if (assigned_device_pci_cap_init(pci_dev
) < 0) {
1753 /* intercept MSI-X entry page in the MMIO */
1754 if (dev
->cap
.available
& ASSIGNED_DEVICE_CAP_MSIX
) {
1755 if (assigned_dev_register_msix_mmio(dev
)) {
1760 /* handle real device's MMIO/PIO BARs */
1761 if (assigned_dev_register_regions(dev
->real_device
.regions
,
1762 dev
->real_device
.region_number
,
1767 /* handle interrupt routing */
1768 e_intx
= dev
->dev
.config
[PCI_INTERRUPT_PIN
] - 1;
1769 dev
->intpin
= e_intx
;
1770 dev
->intx_route
.mode
= PCI_INTX_DISABLED
;
1771 dev
->intx_route
.irq
= -1;
1773 /* assign device to guest */
1774 r
= assign_device(dev
);
1779 /* assign legacy INTx to the device */
1780 r
= assign_intx(dev
);
1785 assigned_dev_load_option_rom(dev
);
1787 add_boot_device_path(dev
->bootindex
, &pci_dev
->qdev
, NULL
);
1792 deassign_device(dev
);
1794 free_assigned_device(dev
);
1798 static void assigned_exitfn(struct PCIDevice
*pci_dev
)
1800 AssignedDevice
*dev
= DO_UPCAST(AssignedDevice
, dev
, pci_dev
);
1802 deassign_device(dev
);
1803 free_assigned_device(dev
);
/*
 * qdev properties of the assigned device (installed as dc->props):
 *   host       - PCI host device address, stored in AssignedDevice.host
 *   prefer_msi - bit ASSIGNED_DEVICE_PREFER_MSI_BIT of `features`,
 *                default false
 *   share_intx - bit ASSIGNED_DEVICE_SHARE_INTX_BIT of `features`,
 *                default true
 *   bootindex  - 32-bit integer, default -1
 *   configfd   - string, stored in configfd_name
 */
1806 static Property assigned_dev_properties
[] = {
1807 DEFINE_PROP_PCI_HOST_DEVADDR("host", AssignedDevice
, host
),
1808 DEFINE_PROP_BIT("prefer_msi", AssignedDevice
, features
,
1809 ASSIGNED_DEVICE_PREFER_MSI_BIT
, false),
1810 DEFINE_PROP_BIT("share_intx", AssignedDevice
, features
,
1811 ASSIGNED_DEVICE_SHARE_INTX_BIT
, true),
1812 DEFINE_PROP_INT32("bootindex", AssignedDevice
, bootindex
, -1),
1813 DEFINE_PROP_STRING("configfd", AssignedDevice
, configfd_name
),
1814 DEFINE_PROP_END_OF_LIST(),
1817 static void assign_class_init(ObjectClass
*klass
, void *data
)
1819 PCIDeviceClass
*k
= PCI_DEVICE_CLASS(klass
);
1820 DeviceClass
*dc
= DEVICE_CLASS(klass
);
1822 k
->init
= assigned_initfn
;
1823 k
->exit
= assigned_exitfn
;
1824 k
->config_read
= assigned_dev_pci_read_config
;
1825 k
->config_write
= assigned_dev_pci_write_config
;
1826 dc
->props
= assigned_dev_properties
;
1827 dc
->vmsd
= &vmstate_assigned_device
;
1828 dc
->reset
= reset_assigned_device
;
1829 dc
->desc
= "KVM-based PCI passthrough";
/*
 * Type registration record for the "kvm-pci-assign" device: a child of
 * TYPE_PCI_DEVICE whose instances are sizeof(AssignedDevice) bytes and
 * whose class is set up by assign_class_init().
 */
1832 static const TypeInfo assign_info
= {
1833 .name
= "kvm-pci-assign",
1834 .parent
= TYPE_PCI_DEVICE
,
1835 .instance_size
= sizeof(AssignedDevice
),
1836 .class_init
= assign_class_init
,
1839 static void assign_register_types(void)
1841 type_register_static(&assign_info
);
1844 type_init(assign_register_types
)
1847 * Scan the assigned devices for the devices that have an option ROM, and then
1848 * load the corresponding ROM data to RAM. If an error occurs while loading an
1849 * option ROM, we just ignore that option ROM and continue with the next one.
1851 static void assigned_dev_load_option_rom(AssignedDevice
*dev
)
1853 char name
[32], rom_file
[64];
1859 /* If loading ROM from file, pci handles it */
1860 if (dev
->dev
.romfile
|| !dev
->dev
.rom_bar
) {
1864 snprintf(rom_file
, sizeof(rom_file
),
1865 "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/rom",
1866 dev
->host
.domain
, dev
->host
.bus
, dev
->host
.slot
,
1867 dev
->host
.function
);
1869 if (stat(rom_file
, &st
)) {
1873 if (access(rom_file
, F_OK
)) {
1874 error_report("pci-assign: Insufficient privileges for %s", rom_file
);
1878 /* Write "1" to the ROM file to enable it */
1879 fp
= fopen(rom_file
, "r+");
1884 if (fwrite(&val
, 1, 1, fp
) != 1) {
1887 fseek(fp
, 0, SEEK_SET
);
1889 snprintf(name
, sizeof(name
), "%s.rom",
1890 object_get_typename(OBJECT(dev
)));
1891 memory_region_init_ram(&dev
->dev
.rom
, name
, st
.st_size
);
1892 vmstate_register_ram(&dev
->dev
.rom
, &dev
->dev
.qdev
);
1893 ptr
= memory_region_get_ram_ptr(&dev
->dev
.rom
);
1894 memset(ptr
, 0xff, st
.st_size
);
1896 if (!fread(ptr
, 1, st
.st_size
, fp
)) {
1897 error_report("pci-assign: Cannot read from host %s\n"
1898 "\tDevice option ROM contents are probably invalid "
1899 "(check dmesg).\n\tSkip option ROM probe with rombar=0, "
1900 "or load from file with romfile=", rom_file
);
1901 memory_region_destroy(&dev
->dev
.rom
);
1905 pci_register_bar(&dev
->dev
, PCI_ROM_SLOT
, 0, &dev
->dev
.rom
);
1906 dev
->dev
.has_rom
= true;
1908 /* Write "0" to disable ROM */
1909 fseek(fp
, 0, SEEK_SET
);
1911 if (!fwrite(&val
, 1, 1, fp
)) {
1912 DEBUG("%s\n", "Failed to disable pci-sysfs rom file");