]> git.proxmox.com Git - qemu.git/blame - hw/vfio_pci.c
vfio-pci: Add extra debugging
[qemu.git] / hw / vfio_pci.c
CommitLineData
65501a74
AW
1/*
2 * vfio based device assignment support
3 *
4 * Copyright Red Hat, Inc. 2012
5 *
6 * Authors:
7 * Alex Williamson <alex.williamson@redhat.com>
8 *
9 * This work is licensed under the terms of the GNU GPL, version 2. See
10 * the COPYING file in the top-level directory.
11 *
12 * Based on qemu-kvm device-assignment:
13 * Adapted for KVM by Qumranet.
14 * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
15 * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
16 * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
17 * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
18 * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
19 */
20
21#include <dirent.h>
22#include <unistd.h>
23#include <sys/ioctl.h>
24#include <sys/mman.h>
25#include <sys/stat.h>
26#include <sys/types.h>
27#include <linux/vfio.h>
28
29#include "config.h"
1de7afc9 30#include "qemu/event_notifier.h"
022c62cb 31#include "exec/address-spaces.h"
9c17d615 32#include "sysemu/kvm.h"
022c62cb 33#include "exec/memory.h"
83c9f4ca
PB
34#include "hw/pci/msi.h"
35#include "hw/pci/msix.h"
36#include "hw/pci/pci.h"
5c97e5eb 37#include "qemu-common.h"
1de7afc9
PB
38#include "qemu/error-report.h"
39#include "qemu/queue.h"
40#include "qemu/range.h"
65501a74
AW
41
42/* #define DEBUG_VFIO */
43#ifdef DEBUG_VFIO
44#define DPRINTF(fmt, ...) \
45 do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0)
46#else
47#define DPRINTF(fmt, ...) \
48 do { } while (0)
49#endif
50
82ca8912
AW
51/* Extra debugging, trap acceleration paths for more logging */
52#define VFIO_ALLOW_MMAP 1
53#define VFIO_ALLOW_KVM_INTX 1
54
7076eabc
AW
55struct VFIODevice;
56
57typedef struct VFIOQuirk {
58 MemoryRegion mem;
59 struct VFIODevice *vdev;
60 QLIST_ENTRY(VFIOQuirk) next;
61 uint32_t data;
62 uint32_t data2;
63} VFIOQuirk;
64
5c97e5eb
AW
65typedef struct VFIOBAR {
66 off_t fd_offset; /* offset of BAR within device fd */
67 int fd; /* device fd, allows us to pass VFIOBAR as opaque data */
68 MemoryRegion mem; /* slow, read/write access */
69 MemoryRegion mmap_mem; /* direct mapped access */
70 void *mmap;
71 size_t size;
72 uint32_t flags; /* VFIO region flags (rd/wr/mmap) */
73 uint8_t nr; /* cache the BAR number for debug */
7076eabc 74 QLIST_HEAD(, VFIOQuirk) quirks;
5c97e5eb
AW
75} VFIOBAR;
76
f15689c7
AW
77typedef struct VFIOVGARegion {
78 MemoryRegion mem;
79 off_t offset;
80 int nr;
7076eabc 81 QLIST_HEAD(, VFIOQuirk) quirks;
f15689c7
AW
82} VFIOVGARegion;
83
84typedef struct VFIOVGA {
85 off_t fd_offset;
86 int fd;
87 VFIOVGARegion region[QEMU_PCI_VGA_NUM_REGIONS];
88} VFIOVGA;
89
5c97e5eb
AW
90typedef struct VFIOINTx {
91 bool pending; /* interrupt pending */
92 bool kvm_accel; /* set when QEMU bypass through KVM enabled */
93 uint8_t pin; /* which pin to pull for qemu_set_irq */
94 EventNotifier interrupt; /* eventfd triggered on interrupt */
95 EventNotifier unmask; /* eventfd for unmask on QEMU bypass */
96 PCIINTxRoute route; /* routing info for QEMU bypass */
97 uint32_t mmap_timeout; /* delay to re-enable mmaps after interrupt */
98 QEMUTimer *mmap_timer; /* enable mmaps after periods w/o interrupts */
99} VFIOINTx;
100
5c97e5eb
AW
101typedef struct VFIOMSIVector {
102 EventNotifier interrupt; /* eventfd triggered on interrupt */
103 struct VFIODevice *vdev; /* back pointer to device */
104 int virq; /* KVM irqchip route for QEMU bypass */
105 bool use;
106} VFIOMSIVector;
107
108enum {
109 VFIO_INT_NONE = 0,
110 VFIO_INT_INTx = 1,
111 VFIO_INT_MSI = 2,
112 VFIO_INT_MSIX = 3,
113};
114
115struct VFIOGroup;
116
117typedef struct VFIOContainer {
118 int fd; /* /dev/vfio/vfio, empowered by the attached groups */
119 struct {
120 /* enable abstraction to support various iommu backends */
121 union {
122 MemoryListener listener; /* Used by type1 iommu */
123 };
124 void (*release)(struct VFIOContainer *);
125 } iommu_data;
126 QLIST_HEAD(, VFIOGroup) group_list;
127 QLIST_ENTRY(VFIOContainer) next;
128} VFIOContainer;
129
130/* Cache of MSI-X setup plus extra mmap and memory region for split BAR map */
131typedef struct VFIOMSIXInfo {
132 uint8_t table_bar;
133 uint8_t pba_bar;
134 uint16_t entries;
135 uint32_t table_offset;
136 uint32_t pba_offset;
137 MemoryRegion mmap_mem;
138 void *mmap;
139} VFIOMSIXInfo;
140
141typedef struct VFIODevice {
142 PCIDevice pdev;
143 int fd;
144 VFIOINTx intx;
145 unsigned int config_size;
4b5d5e87 146 uint8_t *emulated_config_bits; /* QEMU emulated bits, little-endian */
5c97e5eb
AW
147 off_t config_offset; /* Offset of config space region within device fd */
148 unsigned int rom_size;
149 off_t rom_offset; /* Offset of ROM region within device fd */
150 int msi_cap_size;
151 VFIOMSIVector *msi_vectors;
152 VFIOMSIXInfo *msix;
153 int nr_vectors; /* Number of MSI/MSIX vectors currently in use */
154 int interrupt; /* Current interrupt type */
155 VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */
f15689c7 156 VFIOVGA vga; /* 0xa0000, 0x3b0, 0x3c0 */
5c97e5eb
AW
157 PCIHostDeviceAddress host;
158 QLIST_ENTRY(VFIODevice) next;
159 struct VFIOGroup *group;
f15689c7
AW
160 uint32_t features;
161#define VFIO_FEATURE_ENABLE_VGA_BIT 0
162#define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT)
5c97e5eb 163 bool reset_works;
f15689c7 164 bool has_vga;
5c97e5eb
AW
165} VFIODevice;
166
167typedef struct VFIOGroup {
168 int fd;
169 int groupid;
170 VFIOContainer *container;
171 QLIST_HEAD(, VFIODevice) device_list;
172 QLIST_ENTRY(VFIOGroup) next;
173 QLIST_ENTRY(VFIOGroup) container_next;
174} VFIOGroup;
175
65501a74
AW
176#define MSIX_CAP_LENGTH 12
177
178static QLIST_HEAD(, VFIOContainer)
179 container_list = QLIST_HEAD_INITIALIZER(container_list);
180
181static QLIST_HEAD(, VFIOGroup)
182 group_list = QLIST_HEAD_INITIALIZER(group_list);
183
184static void vfio_disable_interrupts(VFIODevice *vdev);
185static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
7076eabc
AW
186static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
187 uint32_t val, int len);
65501a74
AW
188static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled);
189
190/*
191 * Common VFIO interrupt disable
192 */
193static void vfio_disable_irqindex(VFIODevice *vdev, int index)
194{
195 struct vfio_irq_set irq_set = {
196 .argsz = sizeof(irq_set),
197 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
198 .index = index,
199 .start = 0,
200 .count = 0,
201 };
202
203 ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
65501a74
AW
204}
205
206/*
207 * INTx
208 */
209static void vfio_unmask_intx(VFIODevice *vdev)
210{
211 struct vfio_irq_set irq_set = {
212 .argsz = sizeof(irq_set),
213 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
214 .index = VFIO_PCI_INTX_IRQ_INDEX,
215 .start = 0,
216 .count = 1,
217 };
218
219 ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
220}
221
e1d1e586
AW
#ifdef CONFIG_KVM /* Unused outside of CONFIG_KVM code */
/* Mask INTx at the host; paired with vfio_unmask_intx(). */
static void vfio_mask_intx(VFIODevice *vdev)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
        .index = VFIO_PCI_INTX_IRQ_INDEX,
        .start = 0,
        .count = 1,
    };

    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}
#endif
236
ea486926
AW
237/*
238 * Disabling BAR mmaping can be slow, but toggling it around INTx can
239 * also be a huge overhead. We try to get the best of both worlds by
240 * waiting until an interrupt to disable mmaps (subsequent transitions
241 * to the same state are effectively no overhead). If the interrupt has
242 * been serviced and the time gap is long enough, we re-enable mmaps for
243 * performance. This works well for things like graphics cards, which
244 * may not use their interrupt at all and are penalized to an unusable
245 * level by read/write BAR traps. Other devices, like NICs, have more
246 * regular interrupts and see much better latency by staying in non-mmap
247 * mode. We therefore set the default mmap_timeout such that a ping
248 * is just enough to keep the mmap disabled. Users can experiment with
249 * other options with the x-intx-mmap-timeout-ms parameter (a value of
250 * zero disables the timer).
251 */
252static void vfio_intx_mmap_enable(void *opaque)
253{
254 VFIODevice *vdev = opaque;
255
256 if (vdev->intx.pending) {
257 qemu_mod_timer(vdev->intx.mmap_timer,
258 qemu_get_clock_ms(vm_clock) + vdev->intx.mmap_timeout);
259 return;
260 }
261
262 vfio_mmap_set_enabled(vdev, true);
263}
264
65501a74
AW
265static void vfio_intx_interrupt(void *opaque)
266{
267 VFIODevice *vdev = opaque;
268
269 if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
270 return;
271 }
272
273 DPRINTF("%s(%04x:%02x:%02x.%x) Pin %c\n", __func__, vdev->host.domain,
274 vdev->host.bus, vdev->host.slot, vdev->host.function,
275 'A' + vdev->intx.pin);
276
277 vdev->intx.pending = true;
278 qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 1);
ea486926
AW
279 vfio_mmap_set_enabled(vdev, false);
280 if (vdev->intx.mmap_timeout) {
281 qemu_mod_timer(vdev->intx.mmap_timer,
282 qemu_get_clock_ms(vm_clock) + vdev->intx.mmap_timeout);
283 }
65501a74
AW
284}
285
286static void vfio_eoi(VFIODevice *vdev)
287{
288 if (!vdev->intx.pending) {
289 return;
290 }
291
292 DPRINTF("%s(%04x:%02x:%02x.%x) EOI\n", __func__, vdev->host.domain,
293 vdev->host.bus, vdev->host.slot, vdev->host.function);
294
295 vdev->intx.pending = false;
296 qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
297 vfio_unmask_intx(vdev);
298}
299
e1d1e586
AW
300static void vfio_enable_intx_kvm(VFIODevice *vdev)
301{
302#ifdef CONFIG_KVM
303 struct kvm_irqfd irqfd = {
304 .fd = event_notifier_get_fd(&vdev->intx.interrupt),
305 .gsi = vdev->intx.route.irq,
306 .flags = KVM_IRQFD_FLAG_RESAMPLE,
307 };
308 struct vfio_irq_set *irq_set;
309 int ret, argsz;
310 int32_t *pfd;
311
82ca8912 312 if (!VFIO_ALLOW_KVM_INTX || !kvm_irqfds_enabled() ||
e1d1e586
AW
313 vdev->intx.route.mode != PCI_INTX_ENABLED ||
314 !kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
315 return;
316 }
317
318 /* Get to a known interrupt state */
319 qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev);
320 vfio_mask_intx(vdev);
321 vdev->intx.pending = false;
322 qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
323
324 /* Get an eventfd for resample/unmask */
325 if (event_notifier_init(&vdev->intx.unmask, 0)) {
312fd5f2 326 error_report("vfio: Error: event_notifier_init failed eoi");
e1d1e586
AW
327 goto fail;
328 }
329
330 /* KVM triggers it, VFIO listens for it */
331 irqfd.resamplefd = event_notifier_get_fd(&vdev->intx.unmask);
332
333 if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
312fd5f2 334 error_report("vfio: Error: Failed to setup resample irqfd: %m");
e1d1e586
AW
335 goto fail_irqfd;
336 }
337
338 argsz = sizeof(*irq_set) + sizeof(*pfd);
339
340 irq_set = g_malloc0(argsz);
341 irq_set->argsz = argsz;
342 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
343 irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
344 irq_set->start = 0;
345 irq_set->count = 1;
346 pfd = (int32_t *)&irq_set->data;
347
348 *pfd = irqfd.resamplefd;
349
350 ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
351 g_free(irq_set);
352 if (ret) {
312fd5f2 353 error_report("vfio: Error: Failed to setup INTx unmask fd: %m");
e1d1e586
AW
354 goto fail_vfio;
355 }
356
357 /* Let'em rip */
358 vfio_unmask_intx(vdev);
359
360 vdev->intx.kvm_accel = true;
361
362 DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel enabled\n",
363 __func__, vdev->host.domain, vdev->host.bus,
364 vdev->host.slot, vdev->host.function);
365
366 return;
367
368fail_vfio:
369 irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN;
370 kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd);
371fail_irqfd:
372 event_notifier_cleanup(&vdev->intx.unmask);
373fail:
374 qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
375 vfio_unmask_intx(vdev);
376#endif
377}
378
379static void vfio_disable_intx_kvm(VFIODevice *vdev)
380{
381#ifdef CONFIG_KVM
382 struct kvm_irqfd irqfd = {
383 .fd = event_notifier_get_fd(&vdev->intx.interrupt),
384 .gsi = vdev->intx.route.irq,
385 .flags = KVM_IRQFD_FLAG_DEASSIGN,
386 };
387
388 if (!vdev->intx.kvm_accel) {
389 return;
390 }
391
392 /*
393 * Get to a known state, hardware masked, QEMU ready to accept new
394 * interrupts, QEMU IRQ de-asserted.
395 */
396 vfio_mask_intx(vdev);
397 vdev->intx.pending = false;
398 qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
399
400 /* Tell KVM to stop listening for an INTx irqfd */
401 if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
312fd5f2 402 error_report("vfio: Error: Failed to disable INTx irqfd: %m");
e1d1e586
AW
403 }
404
405 /* We only need to close the eventfd for VFIO to cleanup the kernel side */
406 event_notifier_cleanup(&vdev->intx.unmask);
407
408 /* QEMU starts listening for interrupt events. */
409 qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
410
411 vdev->intx.kvm_accel = false;
412
413 /* If we've missed an event, let it re-fire through QEMU */
414 vfio_unmask_intx(vdev);
415
416 DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel disabled\n",
417 __func__, vdev->host.domain, vdev->host.bus,
418 vdev->host.slot, vdev->host.function);
419#endif
420}
421
422static void vfio_update_irq(PCIDevice *pdev)
423{
424 VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
425 PCIINTxRoute route;
426
427 if (vdev->interrupt != VFIO_INT_INTx) {
428 return;
429 }
430
431 route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);
432
433 if (!pci_intx_route_changed(&vdev->intx.route, &route)) {
434 return; /* Nothing changed */
435 }
436
437 DPRINTF("%s(%04x:%02x:%02x.%x) IRQ moved %d -> %d\n", __func__,
438 vdev->host.domain, vdev->host.bus, vdev->host.slot,
439 vdev->host.function, vdev->intx.route.irq, route.irq);
440
441 vfio_disable_intx_kvm(vdev);
442
443 vdev->intx.route = route;
444
445 if (route.mode != PCI_INTX_ENABLED) {
446 return;
447 }
448
449 vfio_enable_intx_kvm(vdev);
450
451 /* Re-enable the interrupt in cased we missed an EOI */
452 vfio_eoi(vdev);
453}
454
65501a74
AW
455static int vfio_enable_intx(VFIODevice *vdev)
456{
65501a74 457 uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
1a403133
AW
458 int ret, argsz;
459 struct vfio_irq_set *irq_set;
460 int32_t *pfd;
65501a74 461
ea486926 462 if (!pin) {
65501a74
AW
463 return 0;
464 }
465
466 vfio_disable_interrupts(vdev);
467
468 vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
e1d1e586
AW
469
470#ifdef CONFIG_KVM
471 /*
472 * Only conditional to avoid generating error messages on platforms
473 * where we won't actually use the result anyway.
474 */
d281084d
AW
475 if (kvm_irqfds_enabled() &&
476 kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
e1d1e586
AW
477 vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
478 vdev->intx.pin);
479 }
480#endif
481
65501a74
AW
482 ret = event_notifier_init(&vdev->intx.interrupt, 0);
483 if (ret) {
312fd5f2 484 error_report("vfio: Error: event_notifier_init failed");
65501a74
AW
485 return ret;
486 }
487
1a403133
AW
488 argsz = sizeof(*irq_set) + sizeof(*pfd);
489
490 irq_set = g_malloc0(argsz);
491 irq_set->argsz = argsz;
492 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
493 irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
494 irq_set->start = 0;
495 irq_set->count = 1;
496 pfd = (int32_t *)&irq_set->data;
497
498 *pfd = event_notifier_get_fd(&vdev->intx.interrupt);
499 qemu_set_fd_handler(*pfd, vfio_intx_interrupt, NULL, vdev);
65501a74 500
1a403133
AW
501 ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
502 g_free(irq_set);
503 if (ret) {
312fd5f2 504 error_report("vfio: Error: Failed to setup INTx fd: %m");
1a403133 505 qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
ce59af2d 506 event_notifier_cleanup(&vdev->intx.interrupt);
65501a74
AW
507 return -errno;
508 }
509
e1d1e586
AW
510 vfio_enable_intx_kvm(vdev);
511
65501a74
AW
512 vdev->interrupt = VFIO_INT_INTx;
513
514 DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
515 vdev->host.bus, vdev->host.slot, vdev->host.function);
516
517 return 0;
518}
519
520static void vfio_disable_intx(VFIODevice *vdev)
521{
522 int fd;
523
ea486926 524 qemu_del_timer(vdev->intx.mmap_timer);
e1d1e586 525 vfio_disable_intx_kvm(vdev);
65501a74
AW
526 vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
527 vdev->intx.pending = false;
528 qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
529 vfio_mmap_set_enabled(vdev, true);
530
531 fd = event_notifier_get_fd(&vdev->intx.interrupt);
532 qemu_set_fd_handler(fd, NULL, NULL, vdev);
533 event_notifier_cleanup(&vdev->intx.interrupt);
534
535 vdev->interrupt = VFIO_INT_NONE;
536
537 DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
538 vdev->host.bus, vdev->host.slot, vdev->host.function);
539}
540
541/*
542 * MSI/X
543 */
544static void vfio_msi_interrupt(void *opaque)
545{
546 VFIOMSIVector *vector = opaque;
547 VFIODevice *vdev = vector->vdev;
548 int nr = vector - vdev->msi_vectors;
549
550 if (!event_notifier_test_and_clear(&vector->interrupt)) {
551 return;
552 }
553
554 DPRINTF("%s(%04x:%02x:%02x.%x) vector %d\n", __func__,
555 vdev->host.domain, vdev->host.bus, vdev->host.slot,
556 vdev->host.function, nr);
557
558 if (vdev->interrupt == VFIO_INT_MSIX) {
559 msix_notify(&vdev->pdev, nr);
560 } else if (vdev->interrupt == VFIO_INT_MSI) {
561 msi_notify(&vdev->pdev, nr);
562 } else {
312fd5f2 563 error_report("vfio: MSI interrupt receieved, but not enabled?");
65501a74
AW
564 }
565}
566
567static int vfio_enable_vectors(VFIODevice *vdev, bool msix)
568{
569 struct vfio_irq_set *irq_set;
570 int ret = 0, i, argsz;
571 int32_t *fds;
572
573 argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));
574
575 irq_set = g_malloc0(argsz);
576 irq_set->argsz = argsz;
577 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
578 irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
579 irq_set->start = 0;
580 irq_set->count = vdev->nr_vectors;
581 fds = (int32_t *)&irq_set->data;
582
583 for (i = 0; i < vdev->nr_vectors; i++) {
584 if (!vdev->msi_vectors[i].use) {
585 fds[i] = -1;
586 continue;
587 }
588
589 fds[i] = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
590 }
591
592 ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
593
594 g_free(irq_set);
595
65501a74
AW
596 return ret;
597}
598
b0223e29
AW
599static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
600 MSIMessage *msg, IOHandler *handler)
65501a74
AW
601{
602 VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
603 VFIOMSIVector *vector;
604 int ret;
605
606 DPRINTF("%s(%04x:%02x:%02x.%x) vector %d used\n", __func__,
607 vdev->host.domain, vdev->host.bus, vdev->host.slot,
608 vdev->host.function, nr);
609
65501a74
AW
610 vector = &vdev->msi_vectors[nr];
611 vector->vdev = vdev;
612 vector->use = true;
613
614 msix_vector_use(pdev, nr);
615
616 if (event_notifier_init(&vector->interrupt, 0)) {
312fd5f2 617 error_report("vfio: Error: event_notifier_init failed");
65501a74
AW
618 }
619
620 /*
621 * Attempt to enable route through KVM irqchip,
622 * default to userspace handling if unavailable.
623 */
b0223e29 624 vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1;
65501a74
AW
625 if (vector->virq < 0 ||
626 kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
627 vector->virq) < 0) {
628 if (vector->virq >= 0) {
629 kvm_irqchip_release_virq(kvm_state, vector->virq);
630 vector->virq = -1;
631 }
632 qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
b0223e29 633 handler, NULL, vector);
65501a74
AW
634 }
635
636 /*
637 * We don't want to have the host allocate all possible MSI vectors
638 * for a device if they're not in use, so we shutdown and incrementally
639 * increase them as needed.
640 */
641 if (vdev->nr_vectors < nr + 1) {
65501a74
AW
642 vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX);
643 vdev->nr_vectors = nr + 1;
644 ret = vfio_enable_vectors(vdev, true);
645 if (ret) {
312fd5f2 646 error_report("vfio: failed to enable vectors, %d", ret);
65501a74 647 }
65501a74 648 } else {
1a403133
AW
649 int argsz;
650 struct vfio_irq_set *irq_set;
651 int32_t *pfd;
652
653 argsz = sizeof(*irq_set) + sizeof(*pfd);
654
655 irq_set = g_malloc0(argsz);
656 irq_set->argsz = argsz;
657 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
658 VFIO_IRQ_SET_ACTION_TRIGGER;
659 irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
660 irq_set->start = nr;
661 irq_set->count = 1;
662 pfd = (int32_t *)&irq_set->data;
663
664 *pfd = event_notifier_get_fd(&vector->interrupt);
665
666 ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
667 g_free(irq_set);
65501a74 668 if (ret) {
312fd5f2 669 error_report("vfio: failed to modify vector, %d", ret);
65501a74 670 }
65501a74
AW
671 }
672
673 return 0;
674}
675
b0223e29
AW
676static int vfio_msix_vector_use(PCIDevice *pdev,
677 unsigned int nr, MSIMessage msg)
678{
679 return vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt);
680}
681
65501a74
AW
682static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
683{
684 VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
685 VFIOMSIVector *vector = &vdev->msi_vectors[nr];
1a403133
AW
686 int argsz;
687 struct vfio_irq_set *irq_set;
688 int32_t *pfd;
65501a74
AW
689
690 DPRINTF("%s(%04x:%02x:%02x.%x) vector %d released\n", __func__,
691 vdev->host.domain, vdev->host.bus, vdev->host.slot,
692 vdev->host.function, nr);
693
694 /*
695 * XXX What's the right thing to do here? This turns off the interrupt
696 * completely, but do we really just want to switch the interrupt to
697 * bouncing through userspace and let msix.c drop it? Not sure.
698 */
699 msix_vector_unuse(pdev, nr);
1a403133
AW
700
701 argsz = sizeof(*irq_set) + sizeof(*pfd);
702
703 irq_set = g_malloc0(argsz);
704 irq_set->argsz = argsz;
705 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
706 VFIO_IRQ_SET_ACTION_TRIGGER;
707 irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
708 irq_set->start = nr;
709 irq_set->count = 1;
710 pfd = (int32_t *)&irq_set->data;
711
712 *pfd = -1;
713
714 ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
715
716 g_free(irq_set);
65501a74
AW
717
718 if (vector->virq < 0) {
719 qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
720 NULL, NULL, NULL);
721 } else {
722 kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->interrupt,
723 vector->virq);
724 kvm_irqchip_release_virq(kvm_state, vector->virq);
725 vector->virq = -1;
726 }
727
728 event_notifier_cleanup(&vector->interrupt);
729 vector->use = false;
730}
731
fd704adc
AW
732static void vfio_enable_msix(VFIODevice *vdev)
733{
734 vfio_disable_interrupts(vdev);
735
736 vdev->msi_vectors = g_malloc0(vdev->msix->entries * sizeof(VFIOMSIVector));
737
738 vdev->interrupt = VFIO_INT_MSIX;
739
b0223e29
AW
740 /*
741 * Some communication channels between VF & PF or PF & fw rely on the
742 * physical state of the device and expect that enabling MSI-X from the
743 * guest enables the same on the host. When our guest is Linux, the
744 * guest driver call to pci_enable_msix() sets the enabling bit in the
745 * MSI-X capability, but leaves the vector table masked. We therefore
746 * can't rely on a vector_use callback (from request_irq() in the guest)
747 * to switch the physical device into MSI-X mode because that may come a
748 * long time after pci_enable_msix(). This code enables vector 0 with
749 * triggering to userspace, then immediately release the vector, leaving
750 * the physical device with no vectors enabled, but MSI-X enabled, just
751 * like the guest view.
752 */
753 vfio_msix_vector_do_use(&vdev->pdev, 0, NULL, NULL);
754 vfio_msix_vector_release(&vdev->pdev, 0);
755
fd704adc 756 if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
bbef882c 757 vfio_msix_vector_release, NULL)) {
312fd5f2 758 error_report("vfio: msix_set_vector_notifiers failed");
fd704adc
AW
759 }
760
761 DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
762 vdev->host.bus, vdev->host.slot, vdev->host.function);
763}
764
65501a74
AW
765static void vfio_enable_msi(VFIODevice *vdev)
766{
767 int ret, i;
768
769 vfio_disable_interrupts(vdev);
770
771 vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
772retry:
773 vdev->msi_vectors = g_malloc0(vdev->nr_vectors * sizeof(VFIOMSIVector));
774
775 for (i = 0; i < vdev->nr_vectors; i++) {
776 MSIMessage msg;
777 VFIOMSIVector *vector = &vdev->msi_vectors[i];
778
779 vector->vdev = vdev;
780 vector->use = true;
781
782 if (event_notifier_init(&vector->interrupt, 0)) {
312fd5f2 783 error_report("vfio: Error: event_notifier_init failed");
65501a74
AW
784 }
785
a771c517 786 msg = msi_get_message(&vdev->pdev, i);
65501a74
AW
787
788 /*
789 * Attempt to enable route through KVM irqchip,
790 * default to userspace handling if unavailable.
791 */
792 vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg);
793 if (vector->virq < 0 ||
794 kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
795 vector->virq) < 0) {
796 qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
797 vfio_msi_interrupt, NULL, vector);
798 }
799 }
800
801 ret = vfio_enable_vectors(vdev, false);
802 if (ret) {
803 if (ret < 0) {
312fd5f2 804 error_report("vfio: Error: Failed to setup MSI fds: %m");
65501a74
AW
805 } else if (ret != vdev->nr_vectors) {
806 error_report("vfio: Error: Failed to enable %d "
312fd5f2 807 "MSI vectors, retry with %d", vdev->nr_vectors, ret);
65501a74
AW
808 }
809
810 for (i = 0; i < vdev->nr_vectors; i++) {
811 VFIOMSIVector *vector = &vdev->msi_vectors[i];
812 if (vector->virq >= 0) {
813 kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->interrupt,
814 vector->virq);
815 kvm_irqchip_release_virq(kvm_state, vector->virq);
816 vector->virq = -1;
817 } else {
818 qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
819 NULL, NULL, NULL);
820 }
821 event_notifier_cleanup(&vector->interrupt);
822 }
823
824 g_free(vdev->msi_vectors);
825
826 if (ret > 0 && ret != vdev->nr_vectors) {
827 vdev->nr_vectors = ret;
828 goto retry;
829 }
830 vdev->nr_vectors = 0;
831
832 return;
833 }
834
fd704adc
AW
835 vdev->interrupt = VFIO_INT_MSI;
836
65501a74
AW
837 DPRINTF("%s(%04x:%02x:%02x.%x) Enabled %d MSI vectors\n", __func__,
838 vdev->host.domain, vdev->host.bus, vdev->host.slot,
839 vdev->host.function, vdev->nr_vectors);
840}
841
fd704adc
AW
842static void vfio_disable_msi_common(VFIODevice *vdev)
843{
844 g_free(vdev->msi_vectors);
845 vdev->msi_vectors = NULL;
846 vdev->nr_vectors = 0;
847 vdev->interrupt = VFIO_INT_NONE;
848
849 vfio_enable_intx(vdev);
850}
851
852static void vfio_disable_msix(VFIODevice *vdev)
853{
854 msix_unset_vector_notifiers(&vdev->pdev);
855
856 if (vdev->nr_vectors) {
857 vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX);
858 }
859
860 vfio_disable_msi_common(vdev);
861
a011b10e
AW
862 DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
863 vdev->host.bus, vdev->host.slot, vdev->host.function);
fd704adc
AW
864}
865
866static void vfio_disable_msi(VFIODevice *vdev)
65501a74
AW
867{
868 int i;
869
fd704adc 870 vfio_disable_irqindex(vdev, VFIO_PCI_MSI_IRQ_INDEX);
65501a74
AW
871
872 for (i = 0; i < vdev->nr_vectors; i++) {
873 VFIOMSIVector *vector = &vdev->msi_vectors[i];
874
875 if (!vector->use) {
876 continue;
877 }
878
879 if (vector->virq >= 0) {
880 kvm_irqchip_remove_irqfd_notifier(kvm_state,
881 &vector->interrupt, vector->virq);
882 kvm_irqchip_release_virq(kvm_state, vector->virq);
883 vector->virq = -1;
884 } else {
885 qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
886 NULL, NULL, NULL);
887 }
888
65501a74
AW
889 event_notifier_cleanup(&vector->interrupt);
890 }
891
fd704adc 892 vfio_disable_msi_common(vdev);
65501a74 893
fd704adc
AW
894 DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
895 vdev->host.bus, vdev->host.slot, vdev->host.function);
65501a74
AW
896}
897
898/*
899 * IO Port/MMIO - Beware of the endians, VFIO is always little endian
900 */
a8170e5e 901static void vfio_bar_write(void *opaque, hwaddr addr,
65501a74
AW
902 uint64_t data, unsigned size)
903{
904 VFIOBAR *bar = opaque;
905 union {
906 uint8_t byte;
907 uint16_t word;
908 uint32_t dword;
909 uint64_t qword;
910 } buf;
911
912 switch (size) {
913 case 1:
914 buf.byte = data;
915 break;
916 case 2:
917 buf.word = cpu_to_le16(data);
918 break;
919 case 4:
920 buf.dword = cpu_to_le32(data);
921 break;
922 default:
923 hw_error("vfio: unsupported write size, %d bytes\n", size);
924 break;
925 }
926
927 if (pwrite(bar->fd, &buf, size, bar->fd_offset + addr) != size) {
312fd5f2 928 error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
65501a74
AW
929 __func__, addr, data, size);
930 }
931
82ca8912
AW
932#ifdef DEBUG_VFIO
933 {
934 VFIODevice *vdev = container_of(bar, VFIODevice, bars[bar->nr]);
935
936 DPRINTF("%s(%04x:%02x:%02x.%x:BAR%d+0x%"HWADDR_PRIx", 0x%"PRIx64
937 ", %d)\n", __func__, vdev->host.domain, vdev->host.bus,
938 vdev->host.slot, vdev->host.function, bar->nr, addr,
939 data, size);
940 }
941#endif
65501a74
AW
942
943 /*
944 * A read or write to a BAR always signals an INTx EOI. This will
945 * do nothing if not pending (including not in INTx mode). We assume
946 * that a BAR access is in response to an interrupt and that BAR
947 * accesses will service the interrupt. Unfortunately, we don't know
948 * which access will service the interrupt, so we're potentially
949 * getting quite a few host interrupts per guest interrupt.
950 */
3a4f2816 951 vfio_eoi(container_of(bar, VFIODevice, bars[bar->nr]));
65501a74
AW
952}
953
954static uint64_t vfio_bar_read(void *opaque,
a8170e5e 955 hwaddr addr, unsigned size)
65501a74
AW
956{
957 VFIOBAR *bar = opaque;
958 union {
959 uint8_t byte;
960 uint16_t word;
961 uint32_t dword;
962 uint64_t qword;
963 } buf;
964 uint64_t data = 0;
965
966 if (pread(bar->fd, &buf, size, bar->fd_offset + addr) != size) {
312fd5f2 967 error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
65501a74
AW
968 __func__, addr, size);
969 return (uint64_t)-1;
970 }
971
972 switch (size) {
973 case 1:
974 data = buf.byte;
975 break;
976 case 2:
977 data = le16_to_cpu(buf.word);
978 break;
979 case 4:
980 data = le32_to_cpu(buf.dword);
981 break;
982 default:
983 hw_error("vfio: unsupported read size, %d bytes\n", size);
984 break;
985 }
986
82ca8912
AW
987#ifdef DEBUG_VFIO
988 {
989 VFIODevice *vdev = container_of(bar, VFIODevice, bars[bar->nr]);
990
991 DPRINTF("%s(%04x:%02x:%02x.%x:BAR%d+0x%"HWADDR_PRIx
992 ", %d) = 0x%"PRIx64"\n", __func__, vdev->host.domain,
993 vdev->host.bus, vdev->host.slot, vdev->host.function,
994 bar->nr, addr, size, data);
995 }
996#endif
65501a74
AW
997
998 /* Same as write above */
3a4f2816 999 vfio_eoi(container_of(bar, VFIODevice, bars[bar->nr]));
65501a74
AW
1000
1001 return data;
1002}
1003
1004static const MemoryRegionOps vfio_bar_ops = {
1005 .read = vfio_bar_read,
1006 .write = vfio_bar_write,
1007 .endianness = DEVICE_LITTLE_ENDIAN,
1008};
1009
f15689c7
AW
1010static void vfio_vga_write(void *opaque, hwaddr addr,
1011 uint64_t data, unsigned size)
1012{
1013 VFIOVGARegion *region = opaque;
1014 VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
1015 union {
1016 uint8_t byte;
1017 uint16_t word;
1018 uint32_t dword;
1019 uint64_t qword;
1020 } buf;
1021 off_t offset = vga->fd_offset + region->offset + addr;
1022
1023 switch (size) {
1024 case 1:
1025 buf.byte = data;
1026 break;
1027 case 2:
1028 buf.word = cpu_to_le16(data);
1029 break;
1030 case 4:
1031 buf.dword = cpu_to_le32(data);
1032 break;
1033 default:
1034 hw_error("vfio: unsupported write size, %d bytes\n", size);
1035 break;
1036 }
1037
1038 if (pwrite(vga->fd, &buf, size, offset) != size) {
1039 error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
1040 __func__, region->offset + addr, data, size);
1041 }
1042
1043 DPRINTF("%s(0x%"HWADDR_PRIx", 0x%"PRIx64", %d)\n",
1044 __func__, region->offset + addr, data, size);
1045}
1046
1047static uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size)
1048{
1049 VFIOVGARegion *region = opaque;
1050 VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
1051 union {
1052 uint8_t byte;
1053 uint16_t word;
1054 uint32_t dword;
1055 uint64_t qword;
1056 } buf;
1057 uint64_t data = 0;
1058 off_t offset = vga->fd_offset + region->offset + addr;
1059
1060 if (pread(vga->fd, &buf, size, offset) != size) {
1061 error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
1062 __func__, region->offset + addr, size);
1063 return (uint64_t)-1;
1064 }
1065
1066 switch (size) {
1067 case 1:
1068 data = buf.byte;
1069 break;
1070 case 2:
1071 data = le16_to_cpu(buf.word);
1072 break;
1073 case 4:
1074 data = le32_to_cpu(buf.dword);
1075 break;
1076 default:
1077 hw_error("vfio: unsupported read size, %d bytes\n", size);
1078 break;
1079 }
1080
1081 DPRINTF("%s(0x%"HWADDR_PRIx", %d) = 0x%"PRIx64"\n",
1082 __func__, region->offset + addr, size, data);
1083
1084 return data;
1085}
1086
/* Accessors for the trapped VGA ranges (little-endian device order). */
static const MemoryRegionOps vfio_vga_ops = {
    .read = vfio_vga_read,
    .write = vfio_vga_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1092
7076eabc
AW
1093/*
1094 * Device specific quirks
1095 */
1096
1097#define PCI_VENDOR_ID_ATI 0x1002
1098
1099/*
1100 * Device 1002:68f9 (Advanced Micro Devices [AMD] nee ATI Cedar PRO [Radeon
1101 * HD 5450/6350]) reports the upper byte of the physical address of the
1102 * I/O port BAR4 through VGA register 0x3c3. The BAR is 256 bytes, so the
1103 * lower byte is known to be zero. Probing for this quirk reads 0xff from
1104 * port 0x3c3 on some devices so we store the physical address and replace
1105 * reads with the virtual address any time it matches. XXX Research when
1106 * to enable quirk.
1107 */
1108static uint64_t vfio_ati_3c3_quirk_read(void *opaque,
1109 hwaddr addr, unsigned size)
1110{
1111 VFIOQuirk *quirk = opaque;
1112 VFIODevice *vdev = quirk->vdev;
1113 PCIDevice *pdev = &vdev->pdev;
1114 uint64_t data = vfio_vga_read(&vdev->vga.region[QEMU_PCI_VGA_IO_HI],
1115 addr + 0x3, size);
1116
1117 if (data == quirk->data) {
1118 data = pci_get_byte(pdev->config + PCI_BASE_ADDRESS_4 + 1);
1119 DPRINTF("%s(0x3c3, 1) = 0x%"PRIx64"\n", __func__, data);
1120 }
1121
1122 return data;
1123}
1124
/* Read-only overlay on VGA port 0x3c3 (no write handler needed). */
static const MemoryRegionOps vfio_ati_3c3_quirk = {
    .read = vfio_ati_3c3_quirk_read,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1129
1130static void vfio_vga_probe_ati_3c3_quirk(VFIODevice *vdev)
1131{
1132 PCIDevice *pdev = &vdev->pdev;
1133 off_t physoffset = vdev->config_offset + PCI_BASE_ADDRESS_4;
1134 uint32_t physbar;
1135 VFIOQuirk *quirk;
1136
1137 if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI ||
1138 vdev->bars[4].size < 256) {
1139 return;
1140 }
1141
1142 /* Get I/O port BAR physical address */
1143 if (pread(vdev->fd, &physbar, 4, physoffset) != 4) {
1144 error_report("vfio: probe failed for ATI/AMD 0x3c3 quirk on device "
1145 "%04x:%02x:%02x.%x", vdev->host.domain,
1146 vdev->host.bus, vdev->host.slot, vdev->host.function);
1147 return;
1148 }
1149
1150 quirk = g_malloc0(sizeof(*quirk));
1151 quirk->vdev = vdev;
1152 quirk->data = (physbar >> 8) & 0xff;
1153
1154 memory_region_init_io(&quirk->mem, &vfio_ati_3c3_quirk, quirk,
1155 "vfio-ati-3c3-quirk", 1);
1156 memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem, 3,
1157 &quirk->mem);
1158
1159 QLIST_INSERT_HEAD(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks,
1160 quirk, next);
1161
1162 DPRINTF("Enabled ATI/AMD quirk 0x3c3 for device %04x:%02x:%02x.%x\n",
1163 vdev->host.domain, vdev->host.bus, vdev->host.slot,
1164 vdev->host.function);
1165}
1166
1167/*
1168 * Device 1002:68f9 (Advanced Micro Devices [AMD] nee ATI Cedar PRO [Radeon
1169 * HD 5450/6350]) reports the physical address of MMIO BAR0 through a
1170 * write/read operation on I/O port BAR4. When uint32_t 0x4010 is written
1171 * to offset 0x0, the subsequent read from offset 0x4 returns the contents
1172 * of BAR0. Test for this quirk on all ATI/AMD devices. XXX - Note that
 * 0x10 is the offset of BAR0 in config space, is this a window to all of
1174 * config space?
1175 */
1176static uint64_t vfio_ati_4010_quirk_read(void *opaque,
1177 hwaddr addr, unsigned size)
1178{
1179 VFIOQuirk *quirk = opaque;
1180 VFIODevice *vdev = quirk->vdev;
1181 PCIDevice *pdev = &vdev->pdev;
1182 uint64_t data = vfio_bar_read(&vdev->bars[4], addr, size);
1183
1184 if (addr == 4 && size == 4 && quirk->data) {
1185 data = pci_get_long(pdev->config + PCI_BASE_ADDRESS_0);
1186 DPRINTF("%s(BAR4+0x4) = 0x%"PRIx64"\n", __func__, data);
1187 }
1188
1189 quirk->data = 0;
1190
1191 return data;
1192}
1193
1194static void vfio_ati_4010_quirk_write(void *opaque, hwaddr addr,
1195 uint64_t data, unsigned size)
1196{
1197 VFIOQuirk *quirk = opaque;
1198 VFIODevice *vdev = quirk->vdev;
1199
1200 vfio_bar_write(&vdev->bars[4], addr, data, size);
1201
1202 quirk->data = (addr == 0 && size == 4 && data == 0x4010) ? 1 : 0;
1203}
1204
/* Overlay on the first 8 bytes of I/O BAR4 for the 0x4010 backdoor. */
static const MemoryRegionOps vfio_ati_4010_quirk = {
    .read = vfio_ati_4010_quirk_read,
    .write = vfio_ati_4010_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1210
1211static void vfio_probe_ati_4010_quirk(VFIODevice *vdev, int nr)
1212{
1213 PCIDevice *pdev = &vdev->pdev;
1214 off_t physoffset = vdev->config_offset + PCI_BASE_ADDRESS_0;
1215 uint32_t physbar0;
1216 uint64_t data;
1217 VFIOQuirk *quirk;
1218
1219 if (!vdev->has_vga || nr != 4 || !vdev->bars[0].size ||
1220 pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
1221 return;
1222 }
1223
1224 /* Get I/O port BAR physical address */
1225 if (pread(vdev->fd, &physbar0, 4, physoffset) != 4) {
1226 error_report("vfio: probe failed for ATI/AMD 0x4010 quirk on device "
1227 "%04x:%02x:%02x.%x", vdev->host.domain,
1228 vdev->host.bus, vdev->host.slot, vdev->host.function);
1229 return;
1230 }
1231
1232 /* Write 0x4010 to I/O port BAR offset 0 */
1233 vfio_bar_write(&vdev->bars[4], 0, 0x4010, 4);
1234 /* Read back result */
1235 data = vfio_bar_read(&vdev->bars[4], 4, 4);
1236
1237 /* If the register matches the physical address of BAR0, we need a quirk */
1238 if (data != physbar0) {
1239 return;
1240 }
1241
1242 quirk = g_malloc0(sizeof(*quirk));
1243 quirk->vdev = vdev;
1244
1245 memory_region_init_io(&quirk->mem, &vfio_ati_4010_quirk, quirk,
1246 "vfio-ati-4010-quirk", 8);
1247 memory_region_add_subregion_overlap(&vdev->bars[nr].mem, 0, &quirk->mem, 1);
1248
1249 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1250
1251 DPRINTF("Enabled ATI/AMD quirk 0x4010 for device %04x:%02x:%02x.%x\n",
1252 vdev->host.domain, vdev->host.bus, vdev->host.slot,
1253 vdev->host.function);
1254}
1255
1256/*
1257 * Device 1002:5b63 (Advanced Micro Devices [AMD] nee ATI RV370 [Radeon X550])
1258 * retrieves the upper half of the MMIO BAR0 physical address by writing
1259 * 0xf10 to I/O port BAR1 offset 0 and reading the result from offset 6.
1260 * XXX - 0x10 is the offset of BAR0 in PCI config space, this could provide
1261 * full access to config space. Config space is little endian, so the data
1262 * register probably starts at 0x4.
1263 */
1264static uint64_t vfio_ati_f10_quirk_read(void *opaque,
1265 hwaddr addr, unsigned size)
1266{
1267 VFIOQuirk *quirk = opaque;
1268 VFIODevice *vdev = quirk->vdev;
1269 PCIDevice *pdev = &vdev->pdev;
1270 uint64_t data = vfio_bar_read(&vdev->bars[1], addr, size);
1271
1272 if (addr == 6 && size == 2 && quirk->data) {
1273 data = pci_get_word(pdev->config + PCI_BASE_ADDRESS_0 + 2);
1274 DPRINTF("%s(BAR1+0x6) = 0x%"PRIx64"\n", __func__, data);
1275 }
1276
1277 quirk->data = 0;
1278
1279 return data;
1280}
1281
1282static void vfio_ati_f10_quirk_write(void *opaque, hwaddr addr,
1283 uint64_t data, unsigned size)
1284{
1285 VFIOQuirk *quirk = opaque;
1286 VFIODevice *vdev = quirk->vdev;
1287
1288 vfio_bar_write(&vdev->bars[1], addr, data, size);
1289
1290 quirk->data = (addr == 0 && size == 4 && data == 0xf10) ? 1 : 0;
1291}
1292
/* Overlay on the first 8 bytes of I/O BAR1 for the 0xf10 backdoor. */
static const MemoryRegionOps vfio_ati_f10_quirk = {
    .read = vfio_ati_f10_quirk_read,
    .write = vfio_ati_f10_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1298
1299static void vfio_probe_ati_f10_quirk(VFIODevice *vdev, int nr)
1300{
1301 PCIDevice *pdev = &vdev->pdev;
1302 off_t physoffset = vdev->config_offset + PCI_BASE_ADDRESS_0;
1303 uint32_t physbar0;
1304 uint64_t data;
1305 VFIOQuirk *quirk;
1306
1307 if (!vdev->has_vga || nr != 1 || !vdev->bars[0].size ||
1308 pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
1309 return;
1310 }
1311
1312 /* Get I/O port BAR physical address */
1313 if (pread(vdev->fd, &physbar0, 4, physoffset) != 4) {
1314 error_report("vfio: probe failed for ATI/AMD 0xf10 quirk on device "
1315 "%04x:%02x:%02x.%x", vdev->host.domain,
1316 vdev->host.bus, vdev->host.slot, vdev->host.function);
1317 return;
1318 }
1319
1320 vfio_bar_write(&vdev->bars[1], 0, 0xf10, 4);
1321 data = vfio_bar_read(&vdev->bars[1], 0x6, 2);
1322
1323 /* If the register matches the physical address of BAR0, we need a quirk */
1324 if (data != (le32_to_cpu(physbar0) >> 16)) {
1325 return;
1326 }
1327
1328 quirk = g_malloc0(sizeof(*quirk));
1329 quirk->vdev = vdev;
1330
1331 memory_region_init_io(&quirk->mem, &vfio_ati_f10_quirk, quirk,
1332 "vfio-ati-f10-quirk", 8);
1333 memory_region_add_subregion_overlap(&vdev->bars[nr].mem, 0, &quirk->mem, 1);
1334
1335 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1336
1337 DPRINTF("Enabled ATI/AMD quirk 0xf10 for device %04x:%02x:%02x.%x\n",
1338 vdev->host.domain, vdev->host.bus, vdev->host.slot,
1339 vdev->host.function);
1340}
1341
1342#define PCI_VENDOR_ID_NVIDIA 0x10de
1343
1344/*
1345 * Nvidia has several different methods to get to config space, the
 * nouveau project has several of these documented here:
1347 * https://github.com/pathscale/envytools/tree/master/hwdocs
1348 *
1349 * The first quirk is actually not documented in envytools and is found
1350 * on 10de:01d1 (NVIDIA Corporation G72 [GeForce 7300 LE]). This is an
1351 * NV46 chipset. The backdoor uses the legacy VGA I/O ports to access
1352 * the mirror of PCI config space found at BAR0 offset 0x1800. The access
1353 * sequence first writes 0x338 to I/O port 0x3d4. The target offset is
1354 * then written to 0x3d0. Finally 0x538 is written for a read and 0x738
1355 * is written for a write to 0x3d4. The BAR0 offset is then accessible
1356 * through 0x3d0. This quirk doesn't seem to be necessary on newer cards
1357 * that use the I/O port BAR5 window but it doesn't hurt to leave it.
1358 */
/*
 * State machine for the NVIDIA 0x3d0 config-space backdoor, tracked in
 * quirk->data (quirk->data2 latches the config offset).  States advance
 * on the magic 0x3d4/0x3d0 write sequence described above.
 */
enum {
    NV_3D0_NONE,    /* idle */
    NV_3D0_SELECT,  /* 0x338 written to 0x3d4 */
    NV_3D0_WINDOW,  /* 0x1800-window offset latched via 0x3d0 */
    NV_3D0_READ,    /* 0x538 seen: next 0x3d0 read returns config data */
    NV_3D0_WRITE,   /* 0x738 seen: next 0x3d0 write stores config data */
};
1366
1367static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque,
1368 hwaddr addr, unsigned size)
1369{
1370 VFIOQuirk *quirk = opaque;
1371 VFIODevice *vdev = quirk->vdev;
1372 PCIDevice *pdev = &vdev->pdev;
1373 uint64_t data = vfio_vga_read(&vdev->vga.region[QEMU_PCI_VGA_IO_HI],
1374 addr + 0x10, size);
1375
1376 if (quirk->data == NV_3D0_READ && addr == 0) {
1377 data = vfio_pci_read_config(pdev, quirk->data2, size);
1378 DPRINTF("%s(0x3d0, %d) = 0x%"PRIx64"\n", __func__, size, data);
1379 }
1380
1381 quirk->data = NV_3D0_NONE;
1382
1383 return data;
1384}
1385
/*
 * Write side of the 0x3d0 backdoor state machine.  addr is the offset
 * within the 6-byte overlay at VGA region offset 0x10: addr 0 is port
 * 0x3d0 (data), addr 4 is port 0x3d4 (select).  A write that completes
 * the config-write sequence is redirected to emulated config space; all
 * other writes fall through to the device.
 */
static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr,
                                        uint64_t data, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIODevice *vdev = quirk->vdev;
    PCIDevice *pdev = &vdev->pdev;

    switch (quirk->data) {
    case NV_3D0_NONE:
        /* Sequence starts with the magic 0x338 select */
        if (addr == 4 && data == 0x338) {
            quirk->data = NV_3D0_SELECT;
        }
        break;
    case NV_3D0_SELECT:
        quirk->data = NV_3D0_NONE;
        /* Latch a config offset within the 0x1800 window */
        if (addr == 0 && (data & ~0xff) == 0x1800) {
            quirk->data = NV_3D0_WINDOW;
            quirk->data2 = data & 0xff;
        }
        break;
    case NV_3D0_WINDOW:
        quirk->data = NV_3D0_NONE;
        /* 0x538 arms a read, 0x738 arms a write */
        if (addr == 4) {
            if (data == 0x538) {
                quirk->data = NV_3D0_READ;
            } else if (data == 0x738) {
                quirk->data = NV_3D0_WRITE;
            }
        }
        break;
    case NV_3D0_WRITE:
        quirk->data = NV_3D0_NONE;
        if (addr == 0) {
            /* Redirect to emulated config space; do not touch the port */
            vfio_pci_write_config(pdev, quirk->data2, data, size);
            DPRINTF("%s(0x3d0, 0x%"PRIx64", %d)\n", __func__, data, size);
            return;
        }
        break;
    default:
        quirk->data = NV_3D0_NONE;
    }

    /* Pass the write through to the physical VGA port */
    vfio_vga_write(&vdev->vga.region[QEMU_PCI_VGA_IO_HI],
                   addr + 0x10, data, size);
}
1431
/* Overlay covering VGA ports 0x3d0-0x3d5 for the NVIDIA backdoor. */
static const MemoryRegionOps vfio_nvidia_3d0_quirk = {
    .read = vfio_nvidia_3d0_quirk_read,
    .write = vfio_nvidia_3d0_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1437
1438static void vfio_vga_probe_nvidia_3d0_quirk(VFIODevice *vdev)
1439{
1440 PCIDevice *pdev = &vdev->pdev;
1441 VFIOQuirk *quirk;
1442
1443 if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA ||
1444 !vdev->bars[1].size) {
1445 return;
1446 }
1447
1448 quirk = g_malloc0(sizeof(*quirk));
1449 quirk->vdev = vdev;
1450
1451 memory_region_init_io(&quirk->mem, &vfio_nvidia_3d0_quirk, quirk,
1452 "vfio-nvidia-3d0-quirk", 6);
1453 memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
1454 0x10, &quirk->mem);
1455
1456 QLIST_INSERT_HEAD(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks,
1457 quirk, next);
1458
1459 DPRINTF("Enabled NVIDIA VGA 0x3d0 quirk for device %04x:%02x:%02x.%x\n",
1460 vdev->host.domain, vdev->host.bus, vdev->host.slot,
1461 vdev->host.function);
1462}
1463
1464/*
1465 * The second quirk is documented in envytools. The I/O port BAR5 is just
1466 * a set of address/data ports to the MMIO BARs. The BAR we care about is
1467 * again BAR0. This backdoor is apparently a bit newer than the one above
1468 * so we need to not only trap 256 bytes @0x1800, but all of PCI config
1469 * space, including extended space is available at the 4k @0x88000.
1470 */
/* Enable-bit flags accumulated in quirk->data for the BAR5 window. */
enum {
    NV_BAR5_ADDRESS = 0x1,  /* a config-window offset has been latched */
    NV_BAR5_ENABLE = 0x2,   /* enable bit written at BAR5 offset 0x4 */
    NV_BAR5_MASTER = 0x4,   /* master bit written at BAR5 offset 0x0 */
    NV_BAR5_VALID = 0x7,    /* all of the above: window is usable */
};
1477
1478static uint64_t vfio_nvidia_bar5_window_quirk_read(void *opaque,
1479 hwaddr addr, unsigned size)
1480{
1481 VFIOQuirk *quirk = opaque;
1482 VFIODevice *vdev = quirk->vdev;
1483 uint64_t data = vfio_bar_read(&vdev->bars[5], addr, size);
1484
1485 if (addr == 0xc && quirk->data == NV_BAR5_VALID) {
1486 data = vfio_pci_read_config(&vdev->pdev, quirk->data2, size);
1487 DPRINTF("%s(%04x:%02x:%02x.%x:BAR5+0x%"HWADDR_PRIx", %d) = 0x%"
1488 PRIx64"\n", __func__, vdev->host.domain, vdev->host.bus,
1489 vdev->host.slot, vdev->host.function, addr, size, data);
1490 }
1491
1492 return data;
1493}
1494
/*
 * Write side of the BAR5 window.  Register layout within BAR5:
 *   0x0 master enable, 0x4 window enable, 0x8 address, 0xc data.
 * quirk->data accumulates the NV_BAR5_* enable flags and quirk->data2
 * latches the config-space offset.  A data-register write while the
 * window is fully valid is redirected to emulated config space; all
 * other writes fall through to the device.
 */
static void vfio_nvidia_bar5_window_quirk_write(void *opaque, hwaddr addr,
                                                uint64_t data, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIODevice *vdev = quirk->vdev;

    /*
     * Use quirk->data to track enables and quirk->data2 for the offset
     */
    switch (addr) {
    case 0x0:
        if (data & 0x1) {
            quirk->data |= NV_BAR5_MASTER;
        } else {
            quirk->data &= ~NV_BAR5_MASTER;
        }
        break;
    case 0x4:
        if (data & 0x1) {
            quirk->data |= NV_BAR5_ENABLE;
        } else {
            quirk->data &= ~NV_BAR5_ENABLE;
        }
        break;
    case 0x8:
        if (quirk->data & NV_BAR5_MASTER) {
            /* 4k mirror @0x88000 (full config) or 256 bytes @0x1800 */
            if ((data & ~0xfff) == 0x88000) {
                quirk->data |= NV_BAR5_ADDRESS;
                quirk->data2 = data & 0xfff;
            } else if ((data & ~0xff) == 0x1800) {
                quirk->data |= NV_BAR5_ADDRESS;
                quirk->data2 = data & 0xff;
            } else {
                quirk->data &= ~NV_BAR5_ADDRESS;
            }
        }
        break;
    case 0xc:
        if (quirk->data == NV_BAR5_VALID) {
            /* Redirect to emulated config space; do not touch the device */
            vfio_pci_write_config(&vdev->pdev, quirk->data2, data, size);
            DPRINTF("%s(%04x:%02x:%02x.%x:BAR5+0x%"HWADDR_PRIx", 0x%"
                    PRIx64", %d)\n", __func__, vdev->host.domain,
                    vdev->host.bus, vdev->host.slot, vdev->host.function,
                    addr, data, size);
            return;
        }
    }

    vfio_bar_write(&vdev->bars[5], addr, data, size);
}
1545
/* Overlay on the first 16 bytes of I/O BAR5; dword accesses only. */
static const MemoryRegionOps vfio_nvidia_bar5_window_quirk = {
    .read = vfio_nvidia_bar5_window_quirk_read,
    .write = vfio_nvidia_bar5_window_quirk_write,
    .valid.min_access_size = 4,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1552
1553static void vfio_probe_nvidia_bar5_window_quirk(VFIODevice *vdev, int nr)
1554{
1555 PCIDevice *pdev = &vdev->pdev;
1556 VFIOQuirk *quirk;
1557
1558 if (!vdev->has_vga || nr != 5 ||
1559 pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) {
1560 return;
1561 }
1562
1563 quirk = g_malloc0(sizeof(*quirk));
1564 quirk->vdev = vdev;
1565
1566 memory_region_init_io(&quirk->mem, &vfio_nvidia_bar5_window_quirk, quirk,
1567 "vfio-nvidia-bar5-window-quirk", 16);
1568 memory_region_add_subregion_overlap(&vdev->bars[nr].mem, 0, &quirk->mem, 1);
1569
1570 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1571
1572 DPRINTF("Enabled NVIDIA BAR5 window quirk for device %04x:%02x:%02x.%x\n",
1573 vdev->host.domain, vdev->host.bus, vdev->host.slot,
1574 vdev->host.function);
1575}
1576
1577/*
1578 * Finally, BAR0 itself. We want to redirect any accesses to either
1579 * 0x1800 or 0x88000 through the PCI config space access functions.
1580 *
1581 * NB - quirk at a page granularity or else they don't seem to work when
1582 * BARs are mmap'd
1583 *
1584 * Here's offset 0x88000...
1585 */
1586static uint64_t vfio_nvidia_bar0_88000_quirk_read(void *opaque,
1587 hwaddr addr, unsigned size)
1588{
1589 VFIOQuirk *quirk = opaque;
1590 VFIODevice *vdev = quirk->vdev;
1591 hwaddr base = 0x88000 & TARGET_PAGE_MASK;
1592 hwaddr offset = 0x88000 & ~TARGET_PAGE_MASK;
1593 uint64_t data = vfio_bar_read(&vdev->bars[0], addr + base, size);
1594
1595 if (ranges_overlap(addr, size, offset, PCI_CONFIG_SPACE_SIZE)) {
1596 data = vfio_pci_read_config(&vdev->pdev, addr - offset, size);
1597
1598 DPRINTF("%s(%04x:%02x:%02x.%x:BAR0+0x%"HWADDR_PRIx", %d) = 0x%"
1599 PRIx64"\n", __func__, vdev->host.domain, vdev->host.bus,
1600 vdev->host.slot, vdev->host.function, addr + base, size, data);
1601 }
1602
1603 return data;
1604}
1605
1606static void vfio_nvidia_bar0_88000_quirk_write(void *opaque, hwaddr addr,
1607 uint64_t data, unsigned size)
1608{
1609 VFIOQuirk *quirk = opaque;
1610 VFIODevice *vdev = quirk->vdev;
1611 hwaddr base = 0x88000 & TARGET_PAGE_MASK;
1612 hwaddr offset = 0x88000 & ~TARGET_PAGE_MASK;
1613
1614 if (ranges_overlap(addr, size, offset, PCI_CONFIG_SPACE_SIZE)) {
1615 vfio_pci_write_config(&vdev->pdev, addr - offset, data, size);
1616
1617 DPRINTF("%s(%04x:%02x:%02x.%x:BAR0+0x%"HWADDR_PRIx", 0x%"
1618 PRIx64", %d)\n", __func__, vdev->host.domain, vdev->host.bus,
1619 vdev->host.slot, vdev->host.function, addr + base, data, size);
1620 } else {
1621 vfio_bar_write(&vdev->bars[0], addr + base, data, size);
1622 }
1623}
1624
/* Page-sized overlay around the BAR0 config mirror at 0x88000. */
static const MemoryRegionOps vfio_nvidia_bar0_88000_quirk = {
    .read = vfio_nvidia_bar0_88000_quirk_read,
    .write = vfio_nvidia_bar0_88000_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1630
1631static void vfio_probe_nvidia_bar0_88000_quirk(VFIODevice *vdev, int nr)
1632{
1633 PCIDevice *pdev = &vdev->pdev;
1634 VFIOQuirk *quirk;
1635
1636 if (!vdev->has_vga || nr != 0 ||
1637 pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) {
1638 return;
1639 }
1640
1641 quirk = g_malloc0(sizeof(*quirk));
1642 quirk->vdev = vdev;
1643
1644 memory_region_init_io(&quirk->mem, &vfio_nvidia_bar0_88000_quirk, quirk,
1645 "vfio-nvidia-bar0-88000-quirk",
1646 TARGET_PAGE_ALIGN(PCIE_CONFIG_SPACE_SIZE));
1647 memory_region_add_subregion_overlap(&vdev->bars[nr].mem,
1648 0x88000 & TARGET_PAGE_MASK,
1649 &quirk->mem, 1);
1650
1651 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1652
1653 DPRINTF("Enabled NVIDIA BAR0 0x88000 quirk for device %04x:%02x:%02x.%x\n",
1654 vdev->host.domain, vdev->host.bus, vdev->host.slot,
1655 vdev->host.function);
1656}
1657
1658/*
1659 * And here's the same for BAR0 offset 0x1800...
1660 */
1661static uint64_t vfio_nvidia_bar0_1800_quirk_read(void *opaque,
1662 hwaddr addr, unsigned size)
1663{
1664 VFIOQuirk *quirk = opaque;
1665 VFIODevice *vdev = quirk->vdev;
1666 hwaddr base = 0x1800 & TARGET_PAGE_MASK;
1667 hwaddr offset = 0x1800 & ~TARGET_PAGE_MASK;
1668 uint64_t data = vfio_bar_read(&vdev->bars[0], addr + base, size);
1669
1670 if (ranges_overlap(addr, size, offset, PCI_CONFIG_SPACE_SIZE)) {
1671 data = vfio_pci_read_config(&vdev->pdev, addr - offset, size);
1672
1673 DPRINTF("%s(%04x:%02x:%02x.%x:BAR0+0x%"HWADDR_PRIx", %d) = 0x%"
1674 PRIx64"\n", __func__, vdev->host.domain, vdev->host.bus,
1675 vdev->host.slot, vdev->host.function, addr + base, size, data);
1676 }
1677
1678 return data;
1679}
1680
1681static void vfio_nvidia_bar0_1800_quirk_write(void *opaque, hwaddr addr,
1682 uint64_t data, unsigned size)
1683{
1684 VFIOQuirk *quirk = opaque;
1685 VFIODevice *vdev = quirk->vdev;
1686 hwaddr base = 0x1800 & TARGET_PAGE_MASK;
1687 hwaddr offset = 0x1800 & ~TARGET_PAGE_MASK;
1688
1689 if (ranges_overlap(addr, size, offset, PCI_CONFIG_SPACE_SIZE)) {
1690 vfio_pci_write_config(&vdev->pdev, addr - offset, data, size);
1691
1692 DPRINTF("%s(%04x:%02x:%02x.%x:BAR0+0x%"HWADDR_PRIx", 0x%"
1693 PRIx64", %d)\n", __func__, vdev->host.domain, vdev->host.bus,
1694 vdev->host.slot, vdev->host.function, addr + base, data, size);
1695 } else {
1696 vfio_bar_write(&vdev->bars[0], addr + base, data, size);
1697 }
1698}
1699
/* Page-sized overlay around the BAR0 config mirror at 0x1800. */
static const MemoryRegionOps vfio_nvidia_bar0_1800_quirk = {
    .read = vfio_nvidia_bar0_1800_quirk_read,
    .write = vfio_nvidia_bar0_1800_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1705
1706static void vfio_probe_nvidia_bar0_1800_quirk(VFIODevice *vdev, int nr)
1707{
1708 PCIDevice *pdev = &vdev->pdev;
1709 VFIOQuirk *quirk;
1710
1711 if (!vdev->has_vga || nr != 0 ||
1712 pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) {
1713 return;
1714 }
1715
1716 /* Log the chipset ID */
1717 DPRINTF("Nvidia NV%02x\n",
1718 (unsigned int)(vfio_bar_read(&vdev->bars[0], 0, 4) >> 20) & 0xff);
1719
1720 quirk = g_malloc0(sizeof(*quirk));
1721 quirk->vdev = vdev;
1722
1723 memory_region_init_io(&quirk->mem, &vfio_nvidia_bar0_1800_quirk, quirk,
1724 "vfio-nvidia-bar0-1800-quirk",
1725 TARGET_PAGE_ALIGN(PCI_CONFIG_SPACE_SIZE));
1726 memory_region_add_subregion_overlap(&vdev->bars[nr].mem,
1727 0x1800 & TARGET_PAGE_MASK,
1728 &quirk->mem, 1);
1729
1730 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1731
1732 DPRINTF("Enabled NVIDIA BAR0 0x1800 quirk for device %04x:%02x:%02x.%x\n",
1733 vdev->host.domain, vdev->host.bus, vdev->host.slot,
1734 vdev->host.function);
1735}
1736
1737/*
1738 * TODO - Some Nvidia devices provide config access to their companion HDA
1739 * device and even to their parent bridge via these config space mirrors.
1740 * Add quirks for those regions.
1741 */
1742
1743/*
1744 * Common quirk probe entry points.
1745 */
/* Probe and install all VGA-range quirks for this device. */
static void vfio_vga_quirk_setup(VFIODevice *vdev)
{
    vfio_vga_probe_ati_3c3_quirk(vdev);
    vfio_vga_probe_nvidia_3d0_quirk(vdev);
}
1751
1752static void vfio_vga_quirk_teardown(VFIODevice *vdev)
1753{
1754 int i;
1755
1756 for (i = 0; i < ARRAY_SIZE(vdev->vga.region); i++) {
1757 while (!QLIST_EMPTY(&vdev->vga.region[i].quirks)) {
1758 VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga.region[i].quirks);
1759 memory_region_del_subregion(&vdev->vga.region[i].mem, &quirk->mem);
1760 QLIST_REMOVE(quirk, next);
1761 g_free(quirk);
1762 }
1763 }
1764}
1765
/* Probe and install all BAR quirks applicable to BAR nr of this device. */
static void vfio_bar_quirk_setup(VFIODevice *vdev, int nr)
{
    vfio_probe_ati_4010_quirk(vdev, nr);
    vfio_probe_ati_f10_quirk(vdev, nr);
    vfio_probe_nvidia_bar5_window_quirk(vdev, nr);
    vfio_probe_nvidia_bar0_88000_quirk(vdev, nr);
    vfio_probe_nvidia_bar0_1800_quirk(vdev, nr);
}
1774
1775static void vfio_bar_quirk_teardown(VFIODevice *vdev, int nr)
1776{
1777 VFIOBAR *bar = &vdev->bars[nr];
1778
1779 while (!QLIST_EMPTY(&bar->quirks)) {
1780 VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks);
1781 memory_region_del_subregion(&bar->mem, &quirk->mem);
1782 QLIST_REMOVE(quirk, next);
1783 g_free(quirk);
1784 }
1785}
1786
65501a74
AW
1787/*
1788 * PCI config space
1789 */
1790static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
1791{
1792 VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
4b5d5e87 1793 uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
65501a74 1794
4b5d5e87
AW
1795 memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
1796 emu_bits = le32_to_cpu(emu_bits);
65501a74 1797
4b5d5e87
AW
1798 if (emu_bits) {
1799 emu_val = pci_default_read_config(pdev, addr, len);
1800 }
1801
1802 if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
1803 ssize_t ret;
1804
1805 ret = pread(vdev->fd, &phys_val, len, vdev->config_offset + addr);
1806 if (ret != len) {
312fd5f2 1807 error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %m",
65501a74
AW
1808 __func__, vdev->host.domain, vdev->host.bus,
1809 vdev->host.slot, vdev->host.function, addr, len);
1810 return -errno;
1811 }
4b5d5e87 1812 phys_val = le32_to_cpu(phys_val);
65501a74
AW
1813 }
1814
4b5d5e87 1815 val = (emu_val & emu_bits) | (phys_val & ~emu_bits);
65501a74
AW
1816
1817 DPRINTF("%s(%04x:%02x:%02x.%x, @0x%x, len=0x%x) %x\n", __func__,
1818 vdev->host.domain, vdev->host.bus, vdev->host.slot,
1819 vdev->host.function, addr, len, val);
1820
1821 return val;
1822}
1823
/*
 * Guest write to PCI config space.  The write is always forwarded to the
 * host through VFIO (the kernel filters non-writable bits); the emulated
 * copy is kept coherent via pci_default_write_config().  Writes that
 * toggle the MSI/MSI-X enable bits additionally switch the device's
 * interrupt mode.
 */
static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
                                  uint32_t val, int len)
{
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    uint32_t val_le = cpu_to_le32(val);

    DPRINTF("%s(%04x:%02x:%02x.%x, @0x%x, 0x%x, len=0x%x)\n", __func__,
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, addr, val, len);

    /* Write everything to VFIO, let it filter out what we can't write */
    if (pwrite(vdev->fd, &val_le, len, vdev->config_offset + addr) != len) {
        error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %m",
                     __func__, vdev->host.domain, vdev->host.bus,
                     vdev->host.slot, vdev->host.function, addr, val, len);
    }

    /* MSI/MSI-X Enabling/Disabling */
    if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
        ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
        int is_enabled, was_enabled = msi_enabled(pdev);

        /* Update the emulated copy before sampling the new enable state */
        pci_default_write_config(pdev, addr, val, len);

        is_enabled = msi_enabled(pdev);

        if (!was_enabled && is_enabled) {
            vfio_enable_msi(vdev);
        } else if (was_enabled && !is_enabled) {
            vfio_disable_msi(vdev);
        }
    } else if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
               ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
        int is_enabled, was_enabled = msix_enabled(pdev);

        pci_default_write_config(pdev, addr, val, len);

        is_enabled = msix_enabled(pdev);

        if (!was_enabled && is_enabled) {
            vfio_enable_msix(vdev);
        } else if (was_enabled && !is_enabled) {
            vfio_disable_msix(vdev);
        }
    } else {
        /* Write everything to QEMU to keep emulated bits correct */
        pci_default_write_config(pdev, addr, val, len);
    }
}
1873
1874/*
1875 * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
1876 */
af6bc27e 1877static int vfio_dma_unmap(VFIOContainer *container,
a8170e5e 1878 hwaddr iova, ram_addr_t size)
af6bc27e
AW
1879{
1880 struct vfio_iommu_type1_dma_unmap unmap = {
1881 .argsz = sizeof(unmap),
1882 .flags = 0,
1883 .iova = iova,
1884 .size = size,
1885 };
1886
1887 if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
1888 DPRINTF("VFIO_UNMAP_DMA: %d\n", -errno);
1889 return -errno;
1890 }
1891
1892 return 0;
1893}
1894
a8170e5e 1895static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
65501a74
AW
1896 ram_addr_t size, void *vaddr, bool readonly)
1897{
1898 struct vfio_iommu_type1_dma_map map = {
1899 .argsz = sizeof(map),
1900 .flags = VFIO_DMA_MAP_FLAG_READ,
5976cdd5 1901 .vaddr = (__u64)(uintptr_t)vaddr,
65501a74
AW
1902 .iova = iova,
1903 .size = size,
1904 };
1905
1906 if (!readonly) {
1907 map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
1908 }
1909
12af1344
AW
1910 /*
1911 * Try the mapping, if it fails with EBUSY, unmap the region and try
1912 * again. This shouldn't be necessary, but we sometimes see it in
1913 * the the VGA ROM space.
1914 */
1915 if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
1916 (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 &&
1917 ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
1918 return 0;
65501a74
AW
1919 }
1920
12af1344
AW
1921 DPRINTF("VFIO_MAP_DMA: %d\n", -errno);
1922 return -errno;
65501a74
AW
1923}
1924
65501a74
AW
/* Only RAM-backed regions can be DMA-mapped; skip everything else. */
static bool vfio_listener_skipped_section(MemoryRegionSection *section)
{
    return !memory_region_is_ram(section->mr);
}
1929
/*
 * MemoryListener hook: a RAM section appeared in the address space, so
 * establish the corresponding IOMMU mapping for device DMA.  The range
 * is clipped to page boundaries before mapping.
 */
static void vfio_listener_region_add(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer,
                                            iommu_data.listener);
    hwaddr iova, end;
    void *vaddr;
    int ret;

    if (vfio_listener_skipped_section(section)) {
        DPRINTF("SKIPPING region_add %"HWADDR_PRIx" - %"PRIx64"\n",
                section->offset_within_address_space,
                section->offset_within_address_space + section->size - 1);
        return;
    }

    /* The address-space and region offsets must share page alignment */
    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    /* Clip to page boundaries: round the start up, the end down */
    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    end = (section->offset_within_address_space + section->size) &
          TARGET_PAGE_MASK;

    if (iova >= end) {
        return;
    }

    /* Host virtual address backing the first page we will map */
    vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);

    DPRINTF("region_add %"HWADDR_PRIx" - %"HWADDR_PRIx" [%p]\n",
            iova, end - 1, vaddr);

    ret = vfio_dma_map(container, iova, end - iova, vaddr, section->readonly);
    if (ret) {
        error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
                     "0x%"HWADDR_PRIx", %p) = %d (%m)",
                     container, iova, end - iova, vaddr, ret);
    }
}
1974
/*
 * MemoryListener hook: a RAM section disappeared from the address
 * space, so tear down its IOMMU mapping.  Mirrors the clipping logic of
 * vfio_listener_region_add() so both operate on identical ranges.
 */
static void vfio_listener_region_del(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer,
                                            iommu_data.listener);
    hwaddr iova, end;
    int ret;

    if (vfio_listener_skipped_section(section)) {
        DPRINTF("SKIPPING region_del %"HWADDR_PRIx" - %"PRIx64"\n",
                section->offset_within_address_space,
                section->offset_within_address_space + section->size - 1);
        return;
    }

    /* The address-space and region offsets must share page alignment */
    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    /* Clip to page boundaries: round the start up, the end down */
    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    end = (section->offset_within_address_space + section->size) &
          TARGET_PAGE_MASK;

    if (iova >= end) {
        return;
    }

    DPRINTF("region_del %"HWADDR_PRIx" - %"HWADDR_PRIx"\n",
            iova, end - 1);

    ret = vfio_dma_unmap(container, iova, end - iova);
    if (ret) {
        error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                     "0x%"HWADDR_PRIx") = %d (%m)",
                     container, iova, end - iova, ret);
    }
}
2014
2015static MemoryListener vfio_memory_listener = {
65501a74
AW
2016 .region_add = vfio_listener_region_add,
2017 .region_del = vfio_listener_region_del,
65501a74
AW
2018};
2019
2020static void vfio_listener_release(VFIOContainer *container)
2021{
2022 memory_listener_unregister(&container->iommu_data.listener);
2023}
2024
2025/*
2026 * Interrupt setup
2027 */
2028static void vfio_disable_interrupts(VFIODevice *vdev)
2029{
2030 switch (vdev->interrupt) {
2031 case VFIO_INT_INTx:
2032 vfio_disable_intx(vdev);
2033 break;
2034 case VFIO_INT_MSI:
fd704adc 2035 vfio_disable_msi(vdev);
65501a74
AW
2036 break;
2037 case VFIO_INT_MSIX:
fd704adc 2038 vfio_disable_msix(vdev);
65501a74
AW
2039 break;
2040 }
2041}
2042
2043static int vfio_setup_msi(VFIODevice *vdev, int pos)
2044{
2045 uint16_t ctrl;
2046 bool msi_64bit, msi_maskbit;
2047 int ret, entries;
2048
65501a74
AW
2049 if (pread(vdev->fd, &ctrl, sizeof(ctrl),
2050 vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
2051 return -errno;
2052 }
2053 ctrl = le16_to_cpu(ctrl);
2054
2055 msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
2056 msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
2057 entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
2058
2059 DPRINTF("%04x:%02x:%02x.%x PCI MSI CAP @0x%x\n", vdev->host.domain,
2060 vdev->host.bus, vdev->host.slot, vdev->host.function, pos);
2061
2062 ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit);
2063 if (ret < 0) {
e43b9a5a
AW
2064 if (ret == -ENOTSUP) {
2065 return 0;
2066 }
312fd5f2 2067 error_report("vfio: msi_init failed");
65501a74
AW
2068 return ret;
2069 }
2070 vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
2071
2072 return 0;
2073}
2074
2075/*
2076 * We don't have any control over how pci_add_capability() inserts
2077 * capabilities into the chain. In order to setup MSI-X we need a
2078 * MemoryRegion for the BAR. In order to setup the BAR and not
2079 * attempt to mmap the MSI-X table area, which VFIO won't allow, we
2080 * need to first look for where the MSI-X table lives. So we
2081 * unfortunately split MSI-X setup across two functions.
2082 */
2083static int vfio_early_setup_msix(VFIODevice *vdev)
2084{
2085 uint8_t pos;
2086 uint16_t ctrl;
2087 uint32_t table, pba;
2088
2089 pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
2090 if (!pos) {
2091 return 0;
2092 }
2093
2094 if (pread(vdev->fd, &ctrl, sizeof(ctrl),
2095 vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
2096 return -errno;
2097 }
2098
2099 if (pread(vdev->fd, &table, sizeof(table),
2100 vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
2101 return -errno;
2102 }
2103
2104 if (pread(vdev->fd, &pba, sizeof(pba),
2105 vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
2106 return -errno;
2107 }
2108
2109 ctrl = le16_to_cpu(ctrl);
2110 table = le32_to_cpu(table);
2111 pba = le32_to_cpu(pba);
2112
2113 vdev->msix = g_malloc0(sizeof(*(vdev->msix)));
2114 vdev->msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
2115 vdev->msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
2116 vdev->msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
2117 vdev->msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
2118 vdev->msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
2119
2120 DPRINTF("%04x:%02x:%02x.%x "
2121 "PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d\n",
2122 vdev->host.domain, vdev->host.bus, vdev->host.slot,
2123 vdev->host.function, pos, vdev->msix->table_bar,
2124 vdev->msix->table_offset, vdev->msix->entries);
2125
2126 return 0;
2127}
2128
2129static int vfio_setup_msix(VFIODevice *vdev, int pos)
2130{
2131 int ret;
2132
65501a74
AW
2133 ret = msix_init(&vdev->pdev, vdev->msix->entries,
2134 &vdev->bars[vdev->msix->table_bar].mem,
2135 vdev->msix->table_bar, vdev->msix->table_offset,
2136 &vdev->bars[vdev->msix->pba_bar].mem,
2137 vdev->msix->pba_bar, vdev->msix->pba_offset, pos);
2138 if (ret < 0) {
e43b9a5a
AW
2139 if (ret == -ENOTSUP) {
2140 return 0;
2141 }
312fd5f2 2142 error_report("vfio: msix_init failed");
65501a74
AW
2143 return ret;
2144 }
2145
65501a74
AW
2146 return 0;
2147}
2148
2149static void vfio_teardown_msi(VFIODevice *vdev)
2150{
2151 msi_uninit(&vdev->pdev);
2152
2153 if (vdev->msix) {
65501a74
AW
2154 msix_uninit(&vdev->pdev, &vdev->bars[vdev->msix->table_bar].mem,
2155 &vdev->bars[vdev->msix->pba_bar].mem);
2156 }
2157}
2158
2159/*
2160 * Resource setup
2161 */
2162static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled)
2163{
2164 int i;
2165
2166 for (i = 0; i < PCI_ROM_SLOT; i++) {
2167 VFIOBAR *bar = &vdev->bars[i];
2168
2169 if (!bar->size) {
2170 continue;
2171 }
2172
2173 memory_region_set_enabled(&bar->mmap_mem, enabled);
2174 if (vdev->msix && vdev->msix->table_bar == i) {
2175 memory_region_set_enabled(&vdev->msix->mmap_mem, enabled);
2176 }
2177 }
2178}
2179
2180static void vfio_unmap_bar(VFIODevice *vdev, int nr)
2181{
2182 VFIOBAR *bar = &vdev->bars[nr];
2183
2184 if (!bar->size) {
2185 return;
2186 }
2187
7076eabc
AW
2188 vfio_bar_quirk_teardown(vdev, nr);
2189
65501a74
AW
2190 memory_region_del_subregion(&bar->mem, &bar->mmap_mem);
2191 munmap(bar->mmap, memory_region_size(&bar->mmap_mem));
2192
2193 if (vdev->msix && vdev->msix->table_bar == nr) {
2194 memory_region_del_subregion(&bar->mem, &vdev->msix->mmap_mem);
2195 munmap(vdev->msix->mmap, memory_region_size(&vdev->msix->mmap_mem));
2196 }
2197
2198 memory_region_destroy(&bar->mem);
2199}
2200
2201static int vfio_mmap_bar(VFIOBAR *bar, MemoryRegion *mem, MemoryRegion *submem,
2202 void **map, size_t size, off_t offset,
2203 const char *name)
2204{
2205 int ret = 0;
2206
82ca8912 2207 if (VFIO_ALLOW_MMAP && size && bar->flags & VFIO_REGION_INFO_FLAG_MMAP) {
65501a74
AW
2208 int prot = 0;
2209
2210 if (bar->flags & VFIO_REGION_INFO_FLAG_READ) {
2211 prot |= PROT_READ;
2212 }
2213
2214 if (bar->flags & VFIO_REGION_INFO_FLAG_WRITE) {
2215 prot |= PROT_WRITE;
2216 }
2217
2218 *map = mmap(NULL, size, prot, MAP_SHARED,
2219 bar->fd, bar->fd_offset + offset);
2220 if (*map == MAP_FAILED) {
2221 *map = NULL;
2222 ret = -errno;
2223 goto empty_region;
2224 }
2225
2226 memory_region_init_ram_ptr(submem, name, size, *map);
2227 } else {
2228empty_region:
2229 /* Create a zero sized sub-region to make cleanup easy. */
2230 memory_region_init(submem, name, 0);
2231 }
2232
2233 memory_region_add_subregion(mem, offset, submem);
2234
2235 return ret;
2236}
2237
2238static void vfio_map_bar(VFIODevice *vdev, int nr)
2239{
2240 VFIOBAR *bar = &vdev->bars[nr];
2241 unsigned size = bar->size;
2242 char name[64];
2243 uint32_t pci_bar;
2244 uint8_t type;
2245 int ret;
2246
2247 /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
2248 if (!size) {
2249 return;
2250 }
2251
2252 snprintf(name, sizeof(name), "VFIO %04x:%02x:%02x.%x BAR %d",
2253 vdev->host.domain, vdev->host.bus, vdev->host.slot,
2254 vdev->host.function, nr);
2255
2256 /* Determine what type of BAR this is for registration */
2257 ret = pread(vdev->fd, &pci_bar, sizeof(pci_bar),
2258 vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
2259 if (ret != sizeof(pci_bar)) {
312fd5f2 2260 error_report("vfio: Failed to read BAR %d (%m)", nr);
65501a74
AW
2261 return;
2262 }
2263
2264 pci_bar = le32_to_cpu(pci_bar);
2265 type = pci_bar & (pci_bar & PCI_BASE_ADDRESS_SPACE_IO ?
2266 ~PCI_BASE_ADDRESS_IO_MASK : ~PCI_BASE_ADDRESS_MEM_MASK);
2267
2268 /* A "slow" read/write mapping underlies all BARs */
2269 memory_region_init_io(&bar->mem, &vfio_bar_ops, bar, name, size);
2270 pci_register_bar(&vdev->pdev, nr, type, &bar->mem);
2271
2272 /*
2273 * We can't mmap areas overlapping the MSIX vector table, so we
2274 * potentially insert a direct-mapped subregion before and after it.
2275 */
2276 if (vdev->msix && vdev->msix->table_bar == nr) {
2277 size = vdev->msix->table_offset & TARGET_PAGE_MASK;
2278 }
2279
2280 strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
2281 if (vfio_mmap_bar(bar, &bar->mem,
2282 &bar->mmap_mem, &bar->mmap, size, 0, name)) {
312fd5f2 2283 error_report("%s unsupported. Performance may be slow", name);
65501a74
AW
2284 }
2285
2286 if (vdev->msix && vdev->msix->table_bar == nr) {
2287 unsigned start;
2288
2289 start = TARGET_PAGE_ALIGN(vdev->msix->table_offset +
2290 (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));
2291
2292 size = start < bar->size ? bar->size - start : 0;
2293 strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1);
2294 /* VFIOMSIXInfo contains another MemoryRegion for this mapping */
2295 if (vfio_mmap_bar(bar, &bar->mem, &vdev->msix->mmap_mem,
2296 &vdev->msix->mmap, size, start, name)) {
312fd5f2 2297 error_report("%s unsupported. Performance may be slow", name);
65501a74
AW
2298 }
2299 }
7076eabc
AW
2300
2301 vfio_bar_quirk_setup(vdev, nr);
65501a74
AW
2302}
2303
2304static void vfio_map_bars(VFIODevice *vdev)
2305{
2306 int i;
2307
2308 for (i = 0; i < PCI_ROM_SLOT; i++) {
2309 vfio_map_bar(vdev, i);
2310 }
f15689c7
AW
2311
2312 if (vdev->has_vga) {
2313 memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_MEM].mem,
2314 &vfio_vga_ops,
2315 &vdev->vga.region[QEMU_PCI_VGA_MEM],
2316 "vfio-vga-mmio@0xa0000",
2317 QEMU_PCI_VGA_MEM_SIZE);
2318 memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem,
2319 &vfio_vga_ops,
2320 &vdev->vga.region[QEMU_PCI_VGA_IO_LO],
2321 "vfio-vga-io@0x3b0",
2322 QEMU_PCI_VGA_IO_LO_SIZE);
2323 memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
2324 &vfio_vga_ops,
2325 &vdev->vga.region[QEMU_PCI_VGA_IO_HI],
2326 "vfio-vga-io@0x3c0",
2327 QEMU_PCI_VGA_IO_HI_SIZE);
2328
2329 pci_register_vga(&vdev->pdev, &vdev->vga.region[QEMU_PCI_VGA_MEM].mem,
2330 &vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem,
2331 &vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem);
7076eabc 2332 vfio_vga_quirk_setup(vdev);
f15689c7 2333 }
65501a74
AW
2334}
2335
2336static void vfio_unmap_bars(VFIODevice *vdev)
2337{
2338 int i;
2339
2340 for (i = 0; i < PCI_ROM_SLOT; i++) {
2341 vfio_unmap_bar(vdev, i);
2342 }
f15689c7
AW
2343
2344 if (vdev->has_vga) {
7076eabc 2345 vfio_vga_quirk_teardown(vdev);
f15689c7
AW
2346 pci_unregister_vga(&vdev->pdev);
2347 memory_region_destroy(&vdev->vga.region[QEMU_PCI_VGA_MEM].mem);
2348 memory_region_destroy(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem);
2349 memory_region_destroy(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem);
2350 }
65501a74
AW
2351}
2352
2353/*
2354 * General setup
2355 */
2356static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
2357{
2358 uint8_t tmp, next = 0xff;
2359
2360 for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp;
2361 tmp = pdev->config[tmp + 1]) {
2362 if (tmp > pos && tmp < next) {
2363 next = tmp;
2364 }
2365 }
2366
2367 return next - pos;
2368}
2369
96adc5c7
AW
2370static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
2371{
2372 pci_set_word(buf, (pci_get_word(buf) & ~mask) | val);
2373}
2374
2375static void vfio_add_emulated_word(VFIODevice *vdev, int pos,
2376 uint16_t val, uint16_t mask)
2377{
2378 vfio_set_word_bits(vdev->pdev.config + pos, val, mask);
2379 vfio_set_word_bits(vdev->pdev.wmask + pos, ~mask, mask);
2380 vfio_set_word_bits(vdev->emulated_config_bits + pos, mask, mask);
2381}
2382
2383static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask)
2384{
2385 pci_set_long(buf, (pci_get_long(buf) & ~mask) | val);
2386}
2387
2388static void vfio_add_emulated_long(VFIODevice *vdev, int pos,
2389 uint32_t val, uint32_t mask)
2390{
2391 vfio_set_long_bits(vdev->pdev.config + pos, val, mask);
2392 vfio_set_long_bits(vdev->pdev.wmask + pos, ~mask, mask);
2393 vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask);
2394}
2395
2396static int vfio_setup_pcie_cap(VFIODevice *vdev, int pos, uint8_t size)
2397{
2398 uint16_t flags;
2399 uint8_t type;
2400
2401 flags = pci_get_word(vdev->pdev.config + pos + PCI_CAP_FLAGS);
2402 type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
2403
2404 if (type != PCI_EXP_TYPE_ENDPOINT &&
2405 type != PCI_EXP_TYPE_LEG_END &&
2406 type != PCI_EXP_TYPE_RC_END) {
2407
2408 error_report("vfio: Assignment of PCIe type 0x%x "
2409 "devices is not currently supported", type);
2410 return -EINVAL;
2411 }
2412
2413 if (!pci_bus_is_express(vdev->pdev.bus)) {
2414 /*
2415 * Use express capability as-is on PCI bus. It doesn't make much
2416 * sense to even expose, but some drivers (ex. tg3) depend on it
2417 * and guests don't seem to be particular about it. We'll need
2418 * to revist this or force express devices to express buses if we
2419 * ever expose an IOMMU to the guest.
2420 */
2421 } else if (pci_bus_is_root(vdev->pdev.bus)) {
2422 /*
2423 * On a Root Complex bus Endpoints become Root Complex Integrated
2424 * Endpoints, which changes the type and clears the LNK & LNK2 fields.
2425 */
2426 if (type == PCI_EXP_TYPE_ENDPOINT) {
2427 vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
2428 PCI_EXP_TYPE_RC_END << 4,
2429 PCI_EXP_FLAGS_TYPE);
2430
2431 /* Link Capabilities, Status, and Control goes away */
2432 if (size > PCI_EXP_LNKCTL) {
2433 vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP, 0, ~0);
2434 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
2435 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA, 0, ~0);
2436
2437#ifndef PCI_EXP_LNKCAP2
2438#define PCI_EXP_LNKCAP2 44
2439#endif
2440#ifndef PCI_EXP_LNKSTA2
2441#define PCI_EXP_LNKSTA2 50
2442#endif
2443 /* Link 2 Capabilities, Status, and Control goes away */
2444 if (size > PCI_EXP_LNKCAP2) {
2445 vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP2, 0, ~0);
2446 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL2, 0, ~0);
2447 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA2, 0, ~0);
2448 }
2449 }
2450
2451 } else if (type == PCI_EXP_TYPE_LEG_END) {
2452 /*
2453 * Legacy endpoints don't belong on the root complex. Windows
2454 * seems to be happier with devices if we skip the capability.
2455 */
2456 return 0;
2457 }
2458
2459 } else {
2460 /*
2461 * Convert Root Complex Integrated Endpoints to regular endpoints.
2462 * These devices don't support LNK/LNK2 capabilities, so make them up.
2463 */
2464 if (type == PCI_EXP_TYPE_RC_END) {
2465 vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
2466 PCI_EXP_TYPE_ENDPOINT << 4,
2467 PCI_EXP_FLAGS_TYPE);
2468 vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP,
2469 PCI_EXP_LNK_MLW_1 | PCI_EXP_LNK_LS_25, ~0);
2470 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
2471 }
2472
2473 /* Mark the Link Status bits as emulated to allow virtual negotiation */
2474 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA,
2475 pci_get_word(vdev->pdev.config + pos +
2476 PCI_EXP_LNKSTA),
2477 PCI_EXP_LNKCAP_MLW | PCI_EXP_LNKCAP_SLS);
2478 }
2479
2480 pos = pci_add_capability(&vdev->pdev, PCI_CAP_ID_EXP, pos, size);
2481 if (pos >= 0) {
2482 vdev->pdev.exp.exp_cap = pos;
2483 }
2484
2485 return pos;
2486}
2487
65501a74
AW
2488static int vfio_add_std_cap(VFIODevice *vdev, uint8_t pos)
2489{
2490 PCIDevice *pdev = &vdev->pdev;
2491 uint8_t cap_id, next, size;
2492 int ret;
2493
2494 cap_id = pdev->config[pos];
2495 next = pdev->config[pos + 1];
2496
2497 /*
2498 * If it becomes important to configure capabilities to their actual
2499 * size, use this as the default when it's something we don't recognize.
2500 * Since QEMU doesn't actually handle many of the config accesses,
2501 * exact size doesn't seem worthwhile.
2502 */
2503 size = vfio_std_cap_max_size(pdev, pos);
2504
2505 /*
2506 * pci_add_capability always inserts the new capability at the head
2507 * of the chain. Therefore to end up with a chain that matches the
2508 * physical device, we insert from the end by making this recursive.
2509 * This is also why we pre-caclulate size above as cached config space
2510 * will be changed as we unwind the stack.
2511 */
2512 if (next) {
2513 ret = vfio_add_std_cap(vdev, next);
2514 if (ret) {
2515 return ret;
2516 }
2517 } else {
96adc5c7
AW
2518 /* Begin the rebuild, use QEMU emulated list bits */
2519 pdev->config[PCI_CAPABILITY_LIST] = 0;
2520 vdev->emulated_config_bits[PCI_CAPABILITY_LIST] = 0xff;
2521 vdev->emulated_config_bits[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
65501a74
AW
2522 }
2523
96adc5c7
AW
2524 /* Use emulated next pointer to allow dropping caps */
2525 pci_set_byte(vdev->emulated_config_bits + pos + 1, 0xff);
2526
65501a74
AW
2527 switch (cap_id) {
2528 case PCI_CAP_ID_MSI:
2529 ret = vfio_setup_msi(vdev, pos);
2530 break;
96adc5c7
AW
2531 case PCI_CAP_ID_EXP:
2532 ret = vfio_setup_pcie_cap(vdev, pos, size);
2533 break;
65501a74
AW
2534 case PCI_CAP_ID_MSIX:
2535 ret = vfio_setup_msix(vdev, pos);
2536 break;
2537 default:
2538 ret = pci_add_capability(pdev, cap_id, pos, size);
2539 break;
2540 }
2541
2542 if (ret < 0) {
2543 error_report("vfio: %04x:%02x:%02x.%x Error adding PCI capability "
312fd5f2 2544 "0x%x[0x%x]@0x%x: %d", vdev->host.domain,
65501a74
AW
2545 vdev->host.bus, vdev->host.slot, vdev->host.function,
2546 cap_id, size, pos, ret);
2547 return ret;
2548 }
2549
2550 return 0;
2551}
2552
2553static int vfio_add_capabilities(VFIODevice *vdev)
2554{
2555 PCIDevice *pdev = &vdev->pdev;
2556
2557 if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
2558 !pdev->config[PCI_CAPABILITY_LIST]) {
2559 return 0; /* Nothing to add */
2560 }
2561
2562 return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
2563}
2564
2565static int vfio_load_rom(VFIODevice *vdev)
2566{
2567 uint64_t size = vdev->rom_size;
2568 char name[32];
2569 off_t off = 0, voff = vdev->rom_offset;
2570 ssize_t bytes;
2571 void *ptr;
2572
2573 /* If loading ROM from file, pci handles it */
2574 if (vdev->pdev.romfile || !vdev->pdev.rom_bar || !size) {
2575 return 0;
2576 }
2577
2578 DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
2579 vdev->host.bus, vdev->host.slot, vdev->host.function);
2580
2581 snprintf(name, sizeof(name), "vfio[%04x:%02x:%02x.%x].rom",
2582 vdev->host.domain, vdev->host.bus, vdev->host.slot,
2583 vdev->host.function);
2584 memory_region_init_ram(&vdev->pdev.rom, name, size);
2585 ptr = memory_region_get_ram_ptr(&vdev->pdev.rom);
2586 memset(ptr, 0xff, size);
2587
2588 while (size) {
2589 bytes = pread(vdev->fd, ptr + off, size, voff + off);
2590 if (bytes == 0) {
2591 break; /* expect that we could get back less than the ROM BAR */
2592 } else if (bytes > 0) {
2593 off += bytes;
2594 size -= bytes;
2595 } else {
2596 if (errno == EINTR || errno == EAGAIN) {
2597 continue;
2598 }
312fd5f2 2599 error_report("vfio: Error reading device ROM: %m");
65501a74
AW
2600 memory_region_destroy(&vdev->pdev.rom);
2601 return -errno;
2602 }
2603 }
2604
2605 pci_register_bar(&vdev->pdev, PCI_ROM_SLOT, 0, &vdev->pdev.rom);
2606 vdev->pdev.has_rom = true;
2607 return 0;
2608}
2609
2610static int vfio_connect_container(VFIOGroup *group)
2611{
2612 VFIOContainer *container;
2613 int ret, fd;
2614
2615 if (group->container) {
2616 return 0;
2617 }
2618
2619 QLIST_FOREACH(container, &container_list, next) {
2620 if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
2621 group->container = container;
2622 QLIST_INSERT_HEAD(&container->group_list, group, container_next);
2623 return 0;
2624 }
2625 }
2626
2627 fd = qemu_open("/dev/vfio/vfio", O_RDWR);
2628 if (fd < 0) {
312fd5f2 2629 error_report("vfio: failed to open /dev/vfio/vfio: %m");
65501a74
AW
2630 return -errno;
2631 }
2632
2633 ret = ioctl(fd, VFIO_GET_API_VERSION);
2634 if (ret != VFIO_API_VERSION) {
2635 error_report("vfio: supported vfio version: %d, "
312fd5f2 2636 "reported version: %d", VFIO_API_VERSION, ret);
65501a74
AW
2637 close(fd);
2638 return -EINVAL;
2639 }
2640
2641 container = g_malloc0(sizeof(*container));
2642 container->fd = fd;
2643
2644 if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
2645 ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
2646 if (ret) {
312fd5f2 2647 error_report("vfio: failed to set group container: %m");
65501a74
AW
2648 g_free(container);
2649 close(fd);
2650 return -errno;
2651 }
2652
2653 ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
2654 if (ret) {
312fd5f2 2655 error_report("vfio: failed to set iommu for container: %m");
65501a74
AW
2656 g_free(container);
2657 close(fd);
2658 return -errno;
2659 }
2660
2661 container->iommu_data.listener = vfio_memory_listener;
2662 container->iommu_data.release = vfio_listener_release;
2663
f6790af6 2664 memory_listener_register(&container->iommu_data.listener, &address_space_memory);
65501a74 2665 } else {
312fd5f2 2666 error_report("vfio: No available IOMMU models");
65501a74
AW
2667 g_free(container);
2668 close(fd);
2669 return -EINVAL;
2670 }
2671
2672 QLIST_INIT(&container->group_list);
2673 QLIST_INSERT_HEAD(&container_list, container, next);
2674
2675 group->container = container;
2676 QLIST_INSERT_HEAD(&container->group_list, group, container_next);
2677
2678 return 0;
2679}
2680
2681static void vfio_disconnect_container(VFIOGroup *group)
2682{
2683 VFIOContainer *container = group->container;
2684
2685 if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
312fd5f2 2686 error_report("vfio: error disconnecting group %d from container",
65501a74
AW
2687 group->groupid);
2688 }
2689
2690 QLIST_REMOVE(group, container_next);
2691 group->container = NULL;
2692
2693 if (QLIST_EMPTY(&container->group_list)) {
2694 if (container->iommu_data.release) {
2695 container->iommu_data.release(container);
2696 }
2697 QLIST_REMOVE(container, next);
2698 DPRINTF("vfio_disconnect_container: close container->fd\n");
2699 close(container->fd);
2700 g_free(container);
2701 }
2702}
2703
2704static VFIOGroup *vfio_get_group(int groupid)
2705{
2706 VFIOGroup *group;
2707 char path[32];
2708 struct vfio_group_status status = { .argsz = sizeof(status) };
2709
2710 QLIST_FOREACH(group, &group_list, next) {
2711 if (group->groupid == groupid) {
2712 return group;
2713 }
2714 }
2715
2716 group = g_malloc0(sizeof(*group));
2717
2718 snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
2719 group->fd = qemu_open(path, O_RDWR);
2720 if (group->fd < 0) {
312fd5f2 2721 error_report("vfio: error opening %s: %m", path);
65501a74
AW
2722 g_free(group);
2723 return NULL;
2724 }
2725
2726 if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
312fd5f2 2727 error_report("vfio: error getting group status: %m");
65501a74
AW
2728 close(group->fd);
2729 g_free(group);
2730 return NULL;
2731 }
2732
2733 if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
2734 error_report("vfio: error, group %d is not viable, please ensure "
2735 "all devices within the iommu_group are bound to their "
312fd5f2 2736 "vfio bus driver.", groupid);
65501a74
AW
2737 close(group->fd);
2738 g_free(group);
2739 return NULL;
2740 }
2741
2742 group->groupid = groupid;
2743 QLIST_INIT(&group->device_list);
2744
2745 if (vfio_connect_container(group)) {
312fd5f2 2746 error_report("vfio: failed to setup container for group %d", groupid);
65501a74
AW
2747 close(group->fd);
2748 g_free(group);
2749 return NULL;
2750 }
2751
2752 QLIST_INSERT_HEAD(&group_list, group, next);
2753
2754 return group;
2755}
2756
2757static void vfio_put_group(VFIOGroup *group)
2758{
2759 if (!QLIST_EMPTY(&group->device_list)) {
2760 return;
2761 }
2762
2763 vfio_disconnect_container(group);
2764 QLIST_REMOVE(group, next);
2765 DPRINTF("vfio_put_group: close group->fd\n");
2766 close(group->fd);
2767 g_free(group);
2768}
2769
2770static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev)
2771{
2772 struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
2773 struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
2774 int ret, i;
2775
2776 ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
2777 if (ret < 0) {
1a9522cc 2778 error_report("vfio: error getting device %s from group %d: %m",
65501a74 2779 name, group->groupid);
1a9522cc 2780 error_printf("Verify all devices in group %d are bound to vfio-pci "
65501a74
AW
2781 "or pci-stub and not already in use\n", group->groupid);
2782 return ret;
2783 }
2784
2785 vdev->fd = ret;
2786 vdev->group = group;
2787 QLIST_INSERT_HEAD(&group->device_list, vdev, next);
2788
2789 /* Sanity check device */
2790 ret = ioctl(vdev->fd, VFIO_DEVICE_GET_INFO, &dev_info);
2791 if (ret) {
312fd5f2 2792 error_report("vfio: error getting device info: %m");
65501a74
AW
2793 goto error;
2794 }
2795
2796 DPRINTF("Device %s flags: %u, regions: %u, irgs: %u\n", name,
2797 dev_info.flags, dev_info.num_regions, dev_info.num_irqs);
2798
2799 if (!(dev_info.flags & VFIO_DEVICE_FLAGS_PCI)) {
312fd5f2 2800 error_report("vfio: Um, this isn't a PCI device");
65501a74
AW
2801 goto error;
2802 }
2803
2804 vdev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
2805 if (!vdev->reset_works) {
312fd5f2 2806 error_report("Warning, device %s does not support reset", name);
65501a74
AW
2807 }
2808
8fc94e5a 2809 if (dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
312fd5f2 2810 error_report("vfio: unexpected number of io regions %u",
65501a74
AW
2811 dev_info.num_regions);
2812 goto error;
2813 }
2814
8fc94e5a 2815 if (dev_info.num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) {
312fd5f2 2816 error_report("vfio: unexpected number of irqs %u", dev_info.num_irqs);
65501a74
AW
2817 goto error;
2818 }
2819
2820 for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
2821 reg_info.index = i;
2822
2823 ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
2824 if (ret) {
312fd5f2 2825 error_report("vfio: Error getting region %d info: %m", i);
65501a74
AW
2826 goto error;
2827 }
2828
2829 DPRINTF("Device %s region %d:\n", name, i);
2830 DPRINTF(" size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
2831 (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
2832 (unsigned long)reg_info.flags);
2833
2834 vdev->bars[i].flags = reg_info.flags;
2835 vdev->bars[i].size = reg_info.size;
2836 vdev->bars[i].fd_offset = reg_info.offset;
2837 vdev->bars[i].fd = vdev->fd;
2838 vdev->bars[i].nr = i;
7076eabc 2839 QLIST_INIT(&vdev->bars[i].quirks);
65501a74
AW
2840 }
2841
2842 reg_info.index = VFIO_PCI_ROM_REGION_INDEX;
2843
2844 ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
2845 if (ret) {
312fd5f2 2846 error_report("vfio: Error getting ROM info: %m");
65501a74
AW
2847 goto error;
2848 }
2849
2850 DPRINTF("Device %s ROM:\n", name);
2851 DPRINTF(" size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
2852 (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
2853 (unsigned long)reg_info.flags);
2854
2855 vdev->rom_size = reg_info.size;
2856 vdev->rom_offset = reg_info.offset;
2857
2858 reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX;
2859
2860 ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
2861 if (ret) {
312fd5f2 2862 error_report("vfio: Error getting config info: %m");
65501a74
AW
2863 goto error;
2864 }
2865
2866 DPRINTF("Device %s config:\n", name);
2867 DPRINTF(" size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
2868 (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
2869 (unsigned long)reg_info.flags);
2870
2871 vdev->config_size = reg_info.size;
6a659bbf
AW
2872 if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) {
2873 vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS;
2874 }
65501a74
AW
2875 vdev->config_offset = reg_info.offset;
2876
f15689c7
AW
2877 if ((vdev->features & VFIO_FEATURE_ENABLE_VGA) &&
2878 dev_info.num_regions > VFIO_PCI_VGA_REGION_INDEX) {
2879 struct vfio_region_info vga_info = {
2880 .argsz = sizeof(vga_info),
2881 .index = VFIO_PCI_VGA_REGION_INDEX,
2882 };
2883
2884 ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &vga_info);
2885 if (ret) {
2886 error_report(
2887 "vfio: Device does not support requested feature x-vga");
2888 goto error;
2889 }
2890
2891 if (!(vga_info.flags & VFIO_REGION_INFO_FLAG_READ) ||
2892 !(vga_info.flags & VFIO_REGION_INFO_FLAG_WRITE) ||
2893 vga_info.size < 0xbffff + 1) {
2894 error_report("vfio: Unexpected VGA info, flags 0x%lx, size 0x%lx",
2895 (unsigned long)vga_info.flags,
2896 (unsigned long)vga_info.size);
2897 goto error;
2898 }
2899
2900 vdev->vga.fd_offset = vga_info.offset;
2901 vdev->vga.fd = vdev->fd;
2902
2903 vdev->vga.region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE;
2904 vdev->vga.region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM;
7076eabc 2905 QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_MEM].quirks);
f15689c7
AW
2906
2907 vdev->vga.region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE;
2908 vdev->vga.region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO;
7076eabc 2909 QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].quirks);
f15689c7
AW
2910
2911 vdev->vga.region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE;
2912 vdev->vga.region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI;
7076eabc 2913 QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks);
f15689c7
AW
2914
2915 vdev->has_vga = true;
2916 }
2917
65501a74
AW
2918error:
2919 if (ret) {
2920 QLIST_REMOVE(vdev, next);
2921 vdev->group = NULL;
2922 close(vdev->fd);
2923 }
2924 return ret;
2925}
2926
2927static void vfio_put_device(VFIODevice *vdev)
2928{
2929 QLIST_REMOVE(vdev, next);
2930 vdev->group = NULL;
2931 DPRINTF("vfio_put_device: close vdev->fd\n");
2932 close(vdev->fd);
2933 if (vdev->msix) {
2934 g_free(vdev->msix);
2935 vdev->msix = NULL;
2936 }
2937}
2938
2939static int vfio_initfn(PCIDevice *pdev)
2940{
2941 VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
2942 VFIOGroup *group;
2943 char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
2944 ssize_t len;
2945 struct stat st;
2946 int groupid;
2947 int ret;
2948
2949 /* Check that the host device exists */
2950 snprintf(path, sizeof(path),
2951 "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
2952 vdev->host.domain, vdev->host.bus, vdev->host.slot,
2953 vdev->host.function);
2954 if (stat(path, &st) < 0) {
312fd5f2 2955 error_report("vfio: error: no such host device: %s", path);
65501a74
AW
2956 return -errno;
2957 }
2958
2959 strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
2960
2961 len = readlink(path, iommu_group_path, PATH_MAX);
2962 if (len <= 0) {
312fd5f2 2963 error_report("vfio: error no iommu_group for device");
65501a74
AW
2964 return -errno;
2965 }
2966
2967 iommu_group_path[len] = 0;
2968 group_name = basename(iommu_group_path);
2969
2970 if (sscanf(group_name, "%d", &groupid) != 1) {
312fd5f2 2971 error_report("vfio: error reading %s: %m", path);
65501a74
AW
2972 return -errno;
2973 }
2974
2975 DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__, vdev->host.domain,
2976 vdev->host.bus, vdev->host.slot, vdev->host.function, groupid);
2977
2978 group = vfio_get_group(groupid);
2979 if (!group) {
312fd5f2 2980 error_report("vfio: failed to get group %d", groupid);
65501a74
AW
2981 return -ENOENT;
2982 }
2983
2984 snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x",
2985 vdev->host.domain, vdev->host.bus, vdev->host.slot,
2986 vdev->host.function);
2987
2988 QLIST_FOREACH(pvdev, &group->device_list, next) {
2989 if (pvdev->host.domain == vdev->host.domain &&
2990 pvdev->host.bus == vdev->host.bus &&
2991 pvdev->host.slot == vdev->host.slot &&
2992 pvdev->host.function == vdev->host.function) {
2993
312fd5f2 2994 error_report("vfio: error: device %s is already attached", path);
65501a74
AW
2995 vfio_put_group(group);
2996 return -EBUSY;
2997 }
2998 }
2999
3000 ret = vfio_get_device(group, path, vdev);
3001 if (ret) {
312fd5f2 3002 error_report("vfio: failed to get device %s", path);
65501a74
AW
3003 vfio_put_group(group);
3004 return ret;
3005 }
3006
3007 /* Get a copy of config space */
3008 ret = pread(vdev->fd, vdev->pdev.config,
3009 MIN(pci_config_size(&vdev->pdev), vdev->config_size),
3010 vdev->config_offset);
3011 if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) {
3012 ret = ret < 0 ? -errno : -EFAULT;
312fd5f2 3013 error_report("vfio: Failed to read device config space");
65501a74
AW
3014 goto out_put;
3015 }
3016
4b5d5e87
AW
3017 /* vfio emulates a lot for us, but some bits need extra love */
3018 vdev->emulated_config_bits = g_malloc0(vdev->config_size);
3019
3020 /* QEMU can choose to expose the ROM or not */
3021 memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
3022
3023 /* QEMU can change multi-function devices to single function, or reverse */
3024 vdev->emulated_config_bits[PCI_HEADER_TYPE] =
3025 PCI_HEADER_TYPE_MULTI_FUNCTION;
3026
65501a74
AW
3027 /*
3028 * Clear host resource mapping info. If we choose not to register a
3029 * BAR, such as might be the case with the option ROM, we can get
3030 * confusing, unwritable, residual addresses from the host here.
3031 */
3032 memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
3033 memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
3034
3035 vfio_load_rom(vdev);
3036
3037 ret = vfio_early_setup_msix(vdev);
3038 if (ret) {
3039 goto out_put;
3040 }
3041
3042 vfio_map_bars(vdev);
3043
3044 ret = vfio_add_capabilities(vdev);
3045 if (ret) {
3046 goto out_teardown;
3047 }
3048
4b5d5e87
AW
3049 /* QEMU emulates all of MSI & MSIX */
3050 if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
3051 memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
3052 MSIX_CAP_LENGTH);
3053 }
3054
3055 if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
3056 memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
3057 vdev->msi_cap_size);
3058 }
3059
65501a74 3060 if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
ea486926
AW
3061 vdev->intx.mmap_timer = qemu_new_timer_ms(vm_clock,
3062 vfio_intx_mmap_enable, vdev);
e1d1e586 3063 pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_update_irq);
65501a74
AW
3064 ret = vfio_enable_intx(vdev);
3065 if (ret) {
3066 goto out_teardown;
3067 }
3068 }
3069
3070 return 0;
3071
3072out_teardown:
3073 pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
3074 vfio_teardown_msi(vdev);
3075 vfio_unmap_bars(vdev);
3076out_put:
4b5d5e87 3077 g_free(vdev->emulated_config_bits);
65501a74
AW
3078 vfio_put_device(vdev);
3079 vfio_put_group(group);
3080 return ret;
3081}
3082
3083static void vfio_exitfn(PCIDevice *pdev)
3084{
3085 VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
3086 VFIOGroup *group = vdev->group;
3087
3088 pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
3089 vfio_disable_interrupts(vdev);
ea486926
AW
3090 if (vdev->intx.mmap_timer) {
3091 qemu_free_timer(vdev->intx.mmap_timer);
3092 }
65501a74
AW
3093 vfio_teardown_msi(vdev);
3094 vfio_unmap_bars(vdev);
4b5d5e87 3095 g_free(vdev->emulated_config_bits);
65501a74
AW
3096 vfio_put_device(vdev);
3097 vfio_put_group(group);
3098}
3099
3100static void vfio_pci_reset(DeviceState *dev)
3101{
3102 PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
3103 VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
5834a83f 3104 uint16_t cmd;
65501a74 3105
5834a83f
AW
3106 DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
3107 vdev->host.bus, vdev->host.slot, vdev->host.function);
3108
3109 vfio_disable_interrupts(vdev);
65501a74 3110
5834a83f
AW
3111 /*
3112 * Stop any ongoing DMA by disconecting I/O, MMIO, and bus master.
3113 * Also put INTx Disable in known state.
3114 */
3115 cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
3116 cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
3117 PCI_COMMAND_INTX_DISABLE);
3118 vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
3119
3120 if (vdev->reset_works) {
3121 if (ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
3122 error_report("vfio: Error unable to reset physical device "
312fd5f2 3123 "(%04x:%02x:%02x.%x): %m", vdev->host.domain,
5834a83f
AW
3124 vdev->host.bus, vdev->host.slot, vdev->host.function);
3125 }
65501a74 3126 }
5834a83f
AW
3127
3128 vfio_enable_intx(vdev);
65501a74
AW
3129}
3130
/* qdev properties for the vfio-pci device (set via -device vfio-pci,...) */
static Property vfio_pci_dev_properties[] = {
    /* Host PCI address of the device to assign, e.g. host=0000:01:00.0 */
    DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIODevice, host),
    /*
     * Debug/tuning knob: how long (ms) to defer re-enabling mmap'd BAR
     * access after an INTx interrupt; 0 disables the deferral timer.
     */
    DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIODevice,
                       intx.mmap_timeout, 1100),
    /* Experimental: expose legacy VGA ranges of the assigned device */
    DEFINE_PROP_BIT("x-vga", VFIODevice, features,
                    VFIO_FEATURE_ENABLE_VGA_BIT, false),
    /*
     * TODO - support passed fds... is this necessary?
     * DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name),
     * DEFINE_PROP_STRING("vfiogroupfd", VFIODevice, vfiogroupfd_name),
     */
    DEFINE_PROP_END_OF_LIST(),
};
3144
/*
 * Assigned devices carry host-side hardware state that QEMU cannot
 * serialize, so migration is blocked while a vfio-pci device is present.
 */
static const VMStateDescription vfio_pci_vmstate = {
    .name = "vfio-pci",
    .unmigratable = 1,
};
65501a74
AW
3149
3150static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
3151{
3152 DeviceClass *dc = DEVICE_CLASS(klass);
3153 PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
3154
3155 dc->reset = vfio_pci_reset;
3156 dc->props = vfio_pci_dev_properties;
d9f0e638
AW
3157 dc->vmsd = &vfio_pci_vmstate;
3158 dc->desc = "VFIO-based PCI device assignment";
65501a74
AW
3159 pdc->init = vfio_initfn;
3160 pdc->exit = vfio_exitfn;
3161 pdc->config_read = vfio_pci_read_config;
3162 pdc->config_write = vfio_pci_write_config;
6a659bbf 3163 pdc->is_express = 1; /* We might be */
65501a74
AW
3164}
3165
/* QOM type registration record for the "vfio-pci" device model */
static const TypeInfo vfio_pci_dev_info = {
    .name = "vfio-pci",
    .parent = TYPE_PCI_DEVICE,
    .instance_size = sizeof(VFIODevice),
    .class_init = vfio_pci_dev_class_init,
};
3172
/* Register the vfio-pci type with QOM at module-init time */
static void register_vfio_pci_dev_type(void)
{
    type_register_static(&vfio_pci_dev_info);
}

type_init(register_vfio_pci_dev_type)