/*
 * vfio based device assignment support
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */

#include <dirent.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <linux/vfio.h>

#include "config.h"
#include "qemu/event_notifier.h"
#include "exec/address-spaces.h"
#include "sysemu/kvm.h"
#include "exec/memory.h"
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "hw/pci/pci.h"
#include "qemu-common.h"
#include "qemu/error-report.h"
#include "qemu/queue.h"
#include "qemu/range.h"

/* #define DEBUG_VFIO */
#ifdef DEBUG_VFIO
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

/* Extra debugging, trap acceleration paths for more logging */
#define VFIO_ALLOW_MMAP 1
#define VFIO_ALLOW_KVM_INTX 1

struct VFIODevice;

typedef struct VFIOQuirk {
    MemoryRegion mem;
    struct VFIODevice *vdev;
    QLIST_ENTRY(VFIOQuirk) next;
    uint32_t data;
    uint32_t data2;
} VFIOQuirk;

typedef struct VFIOBAR {
    off_t fd_offset; /* offset of BAR within device fd */
    int fd; /* device fd, allows us to pass VFIOBAR as opaque data */
    MemoryRegion mem; /* slow, read/write access */
    MemoryRegion mmap_mem; /* direct mapped access */
    void *mmap;
    size_t size;
    uint32_t flags; /* VFIO region flags (rd/wr/mmap) */
    uint8_t nr; /* cache the BAR number for debug */
    QLIST_HEAD(, VFIOQuirk) quirks;
} VFIOBAR;

typedef struct VFIOVGARegion {
    MemoryRegion mem;
    off_t offset;
    int nr;
    QLIST_HEAD(, VFIOQuirk) quirks;
} VFIOVGARegion;

typedef struct VFIOVGA {
    off_t fd_offset;
    int fd;
    VFIOVGARegion region[QEMU_PCI_VGA_NUM_REGIONS];
} VFIOVGA;

typedef struct VFIOINTx {
    bool pending; /* interrupt pending */
    bool kvm_accel; /* set when QEMU bypass through KVM enabled */
    uint8_t pin; /* which pin to pull for qemu_set_irq */
    EventNotifier interrupt; /* eventfd triggered on interrupt */
    EventNotifier unmask; /* eventfd for unmask on QEMU bypass */
    PCIINTxRoute route; /* routing info for QEMU bypass */
    uint32_t mmap_timeout; /* delay to re-enable mmaps after interrupt */
    QEMUTimer *mmap_timer; /* enable mmaps after periods w/o interrupts */
} VFIOINTx;

typedef struct VFIOMSIVector {
    EventNotifier interrupt; /* eventfd triggered on interrupt */
    struct VFIODevice *vdev; /* back pointer to device */
    int virq; /* KVM irqchip route for QEMU bypass */
    bool use;
} VFIOMSIVector;

enum {
    VFIO_INT_NONE = 0,
    VFIO_INT_INTx = 1,
    VFIO_INT_MSI  = 2,
    VFIO_INT_MSIX = 3,
};

struct VFIOGroup;

typedef struct VFIOContainer {
    int fd; /* /dev/vfio/vfio, empowered by the attached groups */
    struct {
        /* enable abstraction to support various iommu backends */
        union {
            MemoryListener listener; /* Used by type1 iommu */
        };
        void (*release)(struct VFIOContainer *);
    } iommu_data;
    QLIST_HEAD(, VFIOGroup) group_list;
    QLIST_ENTRY(VFIOContainer) next;
} VFIOContainer;

/* Cache of MSI-X setup plus extra mmap and memory region for split BAR map */
typedef struct VFIOMSIXInfo {
    uint8_t table_bar;
    uint8_t pba_bar;
    uint16_t entries;
    uint32_t table_offset;
    uint32_t pba_offset;
    MemoryRegion mmap_mem;
    void *mmap;
} VFIOMSIXInfo;

typedef struct VFIODevice {
    PCIDevice pdev;
    int fd;
    VFIOINTx intx;
    unsigned int config_size;
    uint8_t *emulated_config_bits; /* QEMU emulated bits, little-endian */
    off_t config_offset; /* Offset of config space region within device fd */
    unsigned int rom_size;
    off_t rom_offset; /* Offset of ROM region within device fd */
    int msi_cap_size;
    VFIOMSIVector *msi_vectors;
    VFIOMSIXInfo *msix;
    int nr_vectors; /* Number of MSI/MSIX vectors currently in use */
    int interrupt; /* Current interrupt type */
    VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */
    VFIOVGA vga; /* 0xa0000, 0x3b0, 0x3c0 */
    PCIHostDeviceAddress host;
    QLIST_ENTRY(VFIODevice) next;
    struct VFIOGroup *group;
    uint32_t features;
#define VFIO_FEATURE_ENABLE_VGA_BIT 0
#define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT)
    uint8_t pm_cap;
    bool reset_works;
    bool has_vga;
} VFIODevice;

typedef struct VFIOGroup {
    int fd;
    int groupid;
    VFIOContainer *container;
    QLIST_HEAD(, VFIODevice) device_list;
    QLIST_ENTRY(VFIOGroup) next;
    QLIST_ENTRY(VFIOGroup) container_next;
} VFIOGroup;

#define MSIX_CAP_LENGTH 12

static QLIST_HEAD(, VFIOContainer)
    container_list = QLIST_HEAD_INITIALIZER(container_list);

static QLIST_HEAD(, VFIOGroup)
    group_list = QLIST_HEAD_INITIALIZER(group_list);

static void vfio_disable_interrupts(VFIODevice *vdev);
static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
                                  uint32_t val, int len);
static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled);

/*
 * Common VFIO interrupt disable
 */
static void vfio_disable_irqindex(VFIODevice *vdev, int index)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
        .index = index,
        .start = 0,
        .count = 0,
    };

    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}
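
/*
 * A note on the VFIO_DEVICE_SET_IRQS pattern used throughout this file:
 * struct vfio_irq_set is variable-length, with any eventfds appended after
 * the header as an int32_t array and argsz covering the whole buffer.  A
 * minimal sketch of the eventfd form (illustrative only, not called
 * anywhere):
 *
 *   int argsz = sizeof(struct vfio_irq_set) + sizeof(int32_t);
 *   struct vfio_irq_set *set = g_malloc0(argsz);
 *   set->argsz = argsz;
 *   set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 *   set->index = VFIO_PCI_INTX_IRQ_INDEX;
 *   set->start = 0;
 *   set->count = 1;
 *   *(int32_t *)&set->data = fd;    // an fd of -1 de-assigns the trigger
 *   ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, set);
 *   g_free(set);
 *
 * The fixed-size struct above works because DATA_NONE carries no payload.
 */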

/*
 * INTx
 */
static void vfio_unmask_intx(VFIODevice *vdev)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
        .index = VFIO_PCI_INTX_IRQ_INDEX,
        .start = 0,
        .count = 1,
    };

    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}

#ifdef CONFIG_KVM /* Unused outside of CONFIG_KVM code */
static void vfio_mask_intx(VFIODevice *vdev)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
        .index = VFIO_PCI_INTX_IRQ_INDEX,
        .start = 0,
        .count = 1,
    };

    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}
#endif

/*
 * Disabling BAR mmapping can be slow, but toggling it around INTx can
 * also be a huge overhead.  We try to get the best of both worlds by
 * waiting until an interrupt to disable mmaps (subsequent transitions
 * to the same state are effectively no overhead).  If the interrupt has
 * been serviced and the time gap is long enough, we re-enable mmaps for
 * performance.  This works well for things like graphics cards, which
 * may not use their interrupt at all and are penalized to an unusable
 * level by read/write BAR traps.  Other devices, like NICs, have more
 * regular interrupts and see much better latency by staying in non-mmap
 * mode.  We therefore set the default mmap_timeout such that a ping
 * is just enough to keep the mmap disabled.  Users can experiment with
 * other options with the x-intx-mmap-timeout-ms parameter (a value of
 * zero disables the timer).
 */
static void vfio_intx_mmap_enable(void *opaque)
{
    VFIODevice *vdev = opaque;

    if (vdev->intx.pending) {
        qemu_mod_timer(vdev->intx.mmap_timer,
                       qemu_get_clock_ms(vm_clock) + vdev->intx.mmap_timeout);
        return;
    }

    vfio_mmap_set_enabled(vdev, true);
}
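
/*
 * For example, to keep BAR mmaps enabled at all times on a device that
 * tolerates the interrupt latency (the host address below is hypothetical),
 * the timer can be disabled from the command line:
 *
 *   -device vfio-pci,host=01:00.0,x-intx-mmap-timeout-ms=0
 */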

static void vfio_intx_interrupt(void *opaque)
{
    VFIODevice *vdev = opaque;

    if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
        return;
    }

    DPRINTF("%s(%04x:%02x:%02x.%x) Pin %c\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function,
            'A' + vdev->intx.pin);

    vdev->intx.pending = true;
    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 1);
    vfio_mmap_set_enabled(vdev, false);
    if (vdev->intx.mmap_timeout) {
        qemu_mod_timer(vdev->intx.mmap_timer,
                       qemu_get_clock_ms(vm_clock) + vdev->intx.mmap_timeout);
    }
}

static void vfio_eoi(VFIODevice *vdev)
{
    if (!vdev->intx.pending) {
        return;
    }

    DPRINTF("%s(%04x:%02x:%02x.%x) EOI\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);

    vdev->intx.pending = false;
    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
    vfio_unmask_intx(vdev);
}

static void vfio_enable_intx_kvm(VFIODevice *vdev)
{
#ifdef CONFIG_KVM
    struct kvm_irqfd irqfd = {
        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
        .gsi = vdev->intx.route.irq,
        .flags = KVM_IRQFD_FLAG_RESAMPLE,
    };
    struct vfio_irq_set *irq_set;
    int ret, argsz;
    int32_t *pfd;

    if (!VFIO_ALLOW_KVM_INTX || !kvm_irqfds_enabled() ||
        vdev->intx.route.mode != PCI_INTX_ENABLED ||
        !kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
        return;
    }

    /* Get to a known interrupt state */
    qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev);
    vfio_mask_intx(vdev);
    vdev->intx.pending = false;
    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);

    /* Get an eventfd for resample/unmask */
    if (event_notifier_init(&vdev->intx.unmask, 0)) {
        error_report("vfio: Error: event_notifier_init failed eoi");
        goto fail;
    }

    /* KVM triggers it, VFIO listens for it */
    irqfd.resamplefd = event_notifier_get_fd(&vdev->intx.unmask);

    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
        error_report("vfio: Error: Failed to setup resample irqfd: %m");
        goto fail_irqfd;
    }

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
    irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;

    *pfd = irqfd.resamplefd;

    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);
    if (ret) {
        error_report("vfio: Error: Failed to setup INTx unmask fd: %m");
        goto fail_vfio;
    }

    /* Let'em rip */
    vfio_unmask_intx(vdev);

    vdev->intx.kvm_accel = true;

    DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel enabled\n",
            __func__, vdev->host.domain, vdev->host.bus,
            vdev->host.slot, vdev->host.function);

    return;

fail_vfio:
    irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN;
    kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd);
fail_irqfd:
    event_notifier_cleanup(&vdev->intx.unmask);
fail:
    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
    vfio_unmask_intx(vdev);
#endif
}
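
/*
 * The KVM INTx acceleration above wires three pieces together.  A rough
 * sketch of the resulting event flow (descriptive only):
 *
 *   device INTx -> vfio host driver -> intx.interrupt eventfd
 *                    -> KVM irqfd injects the guest IRQ (QEMU bypassed)
 *   guest EOI   -> KVM resample -> intx.unmask eventfd
 *                    -> vfio host driver unmasks the device
 *
 * Only when this setup fails do we fall back to bouncing every interrupt
 * and EOI through QEMU via vfio_intx_interrupt()/vfio_eoi().
 */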

static void vfio_disable_intx_kvm(VFIODevice *vdev)
{
#ifdef CONFIG_KVM
    struct kvm_irqfd irqfd = {
        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
        .gsi = vdev->intx.route.irq,
        .flags = KVM_IRQFD_FLAG_DEASSIGN,
    };

    if (!vdev->intx.kvm_accel) {
        return;
    }

    /*
     * Get to a known state, hardware masked, QEMU ready to accept new
     * interrupts, QEMU IRQ de-asserted.
     */
    vfio_mask_intx(vdev);
    vdev->intx.pending = false;
    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);

    /* Tell KVM to stop listening for an INTx irqfd */
    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
        error_report("vfio: Error: Failed to disable INTx irqfd: %m");
    }

    /* We only need to close the eventfd for VFIO to cleanup the kernel side */
    event_notifier_cleanup(&vdev->intx.unmask);

    /* QEMU starts listening for interrupt events. */
    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);

    vdev->intx.kvm_accel = false;

    /* If we've missed an event, let it re-fire through QEMU */
    vfio_unmask_intx(vdev);

    DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel disabled\n",
            __func__, vdev->host.domain, vdev->host.bus,
            vdev->host.slot, vdev->host.function);
#endif
}

static void vfio_update_irq(PCIDevice *pdev)
{
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    PCIINTxRoute route;

    if (vdev->interrupt != VFIO_INT_INTx) {
        return;
    }

    route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);

    if (!pci_intx_route_changed(&vdev->intx.route, &route)) {
        return; /* Nothing changed */
    }

    DPRINTF("%s(%04x:%02x:%02x.%x) IRQ moved %d -> %d\n", __func__,
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, vdev->intx.route.irq, route.irq);

    vfio_disable_intx_kvm(vdev);

    vdev->intx.route = route;

    if (route.mode != PCI_INTX_ENABLED) {
        return;
    }

    vfio_enable_intx_kvm(vdev);

    /* Re-enable the interrupt in case we missed an EOI */
    vfio_eoi(vdev);
}

static int vfio_enable_intx(VFIODevice *vdev)
{
    uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
    int ret, argsz;
    struct vfio_irq_set *irq_set;
    int32_t *pfd;

    if (!pin) {
        return 0;
    }

    vfio_disable_interrupts(vdev);

    vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */

#ifdef CONFIG_KVM
    /*
     * Only conditional to avoid generating error messages on platforms
     * where we won't actually use the result anyway.
     */
    if (kvm_irqfds_enabled() &&
        kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
        vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
                                                        vdev->intx.pin);
    }
#endif

    ret = event_notifier_init(&vdev->intx.interrupt, 0);
    if (ret) {
        error_report("vfio: Error: event_notifier_init failed");
        return ret;
    }

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;

    *pfd = event_notifier_get_fd(&vdev->intx.interrupt);
    qemu_set_fd_handler(*pfd, vfio_intx_interrupt, NULL, vdev);

    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);
    if (ret) {
        error_report("vfio: Error: Failed to setup INTx fd: %m");
        /* irq_set (and pfd with it) was freed above; re-fetch the fd */
        qemu_set_fd_handler(event_notifier_get_fd(&vdev->intx.interrupt),
                            NULL, NULL, vdev);
        event_notifier_cleanup(&vdev->intx.interrupt);
        return -errno;
    }

    vfio_enable_intx_kvm(vdev);

    vdev->interrupt = VFIO_INT_INTx;

    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);

    return 0;
}

static void vfio_disable_intx(VFIODevice *vdev)
{
    int fd;

    qemu_del_timer(vdev->intx.mmap_timer);
    vfio_disable_intx_kvm(vdev);
    vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
    vdev->intx.pending = false;
    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
    vfio_mmap_set_enabled(vdev, true);

    fd = event_notifier_get_fd(&vdev->intx.interrupt);
    qemu_set_fd_handler(fd, NULL, NULL, vdev);
    event_notifier_cleanup(&vdev->intx.interrupt);

    vdev->interrupt = VFIO_INT_NONE;

    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);
}

/*
 * MSI/X
 */
static void vfio_msi_interrupt(void *opaque)
{
    VFIOMSIVector *vector = opaque;
    VFIODevice *vdev = vector->vdev;
    int nr = vector - vdev->msi_vectors;

    if (!event_notifier_test_and_clear(&vector->interrupt)) {
        return;
    }

    DPRINTF("%s(%04x:%02x:%02x.%x) vector %d\n", __func__,
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, nr);

    if (vdev->interrupt == VFIO_INT_MSIX) {
        msix_notify(&vdev->pdev, nr);
    } else if (vdev->interrupt == VFIO_INT_MSI) {
        msi_notify(&vdev->pdev, nr);
    } else {
        error_report("vfio: MSI interrupt received, but not enabled?");
    }
}

static int vfio_enable_vectors(VFIODevice *vdev, bool msix)
{
    struct vfio_irq_set *irq_set;
    int ret = 0, i, argsz;
    int32_t *fds;

    argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = vdev->nr_vectors;
    fds = (int32_t *)&irq_set->data;

    for (i = 0; i < vdev->nr_vectors; i++) {
        if (!vdev->msi_vectors[i].use) {
            fds[i] = -1;
            continue;
        }

        fds[i] = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
    }

    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);

    g_free(irq_set);

    return ret;
}

static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
                                   MSIMessage *msg, IOHandler *handler)
{
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    VFIOMSIVector *vector;
    int ret;

    DPRINTF("%s(%04x:%02x:%02x.%x) vector %d used\n", __func__,
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, nr);

    vector = &vdev->msi_vectors[nr];
    vector->vdev = vdev;
    vector->use = true;

    msix_vector_use(pdev, nr);

    if (event_notifier_init(&vector->interrupt, 0)) {
        error_report("vfio: Error: event_notifier_init failed");
    }

    /*
     * Attempt to enable route through KVM irqchip,
     * default to userspace handling if unavailable.
     */
    vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1;
    if (vector->virq < 0 ||
        kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
                                       vector->virq) < 0) {
        if (vector->virq >= 0) {
            kvm_irqchip_release_virq(kvm_state, vector->virq);
            vector->virq = -1;
        }
        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                            handler, NULL, vector);
    }

    /*
     * We don't want to have the host allocate all possible MSI vectors
     * for a device if they're not in use, so we shut down and incrementally
     * increase them as needed.
     */
    if (vdev->nr_vectors < nr + 1) {
        vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX);
        vdev->nr_vectors = nr + 1;
        ret = vfio_enable_vectors(vdev, true);
        if (ret) {
            error_report("vfio: failed to enable vectors, %d", ret);
        }
    } else {
        int argsz;
        struct vfio_irq_set *irq_set;
        int32_t *pfd;

        argsz = sizeof(*irq_set) + sizeof(*pfd);

        irq_set = g_malloc0(argsz);
        irq_set->argsz = argsz;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                         VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = nr;
        irq_set->count = 1;
        pfd = (int32_t *)&irq_set->data;

        *pfd = event_notifier_get_fd(&vector->interrupt);

        ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
        g_free(irq_set);
        if (ret) {
            error_report("vfio: failed to modify vector, %d", ret);
        }
    }

    return 0;
}
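
/*
 * Worked example of the incremental strategy above (descriptive only):
 * if vectors 0 and 1 are already enabled and the guest unmasks vector 3,
 * nr_vectors grows from 2 to 4, the whole MSI-X index is torn down, and
 * vfio_enable_vectors() re-enables it with the fd array { fd0, fd1, -1, fd3 },
 * where the -1 leaves the unused vector 2 untriggered.
 */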

static int vfio_msix_vector_use(PCIDevice *pdev,
                                unsigned int nr, MSIMessage msg)
{
    return vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt);
}

static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
{
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    VFIOMSIVector *vector = &vdev->msi_vectors[nr];
    int argsz;
    struct vfio_irq_set *irq_set;
    int32_t *pfd;

    DPRINTF("%s(%04x:%02x:%02x.%x) vector %d released\n", __func__,
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, nr);

    /*
     * XXX What's the right thing to do here?  This turns off the interrupt
     * completely, but do we really just want to switch the interrupt to
     * bouncing through userspace and let msix.c drop it?  Not sure.
     */
    msix_vector_unuse(pdev, nr);

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                     VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
    irq_set->start = nr;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;

    *pfd = -1;

    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);

    g_free(irq_set);

    if (vector->virq < 0) {
        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                            NULL, NULL, NULL);
    } else {
        kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->interrupt,
                                          vector->virq);
        kvm_irqchip_release_virq(kvm_state, vector->virq);
        vector->virq = -1;
    }

    event_notifier_cleanup(&vector->interrupt);
    vector->use = false;
}

static void vfio_enable_msix(VFIODevice *vdev)
{
    vfio_disable_interrupts(vdev);

    vdev->msi_vectors = g_malloc0(vdev->msix->entries * sizeof(VFIOMSIVector));

    vdev->interrupt = VFIO_INT_MSIX;

    /*
     * Some communication channels between VF & PF or PF & fw rely on the
     * physical state of the device and expect that enabling MSI-X from the
     * guest enables the same on the host.  When our guest is Linux, the
     * guest driver call to pci_enable_msix() sets the enabling bit in the
     * MSI-X capability, but leaves the vector table masked.  We therefore
     * can't rely on a vector_use callback (from request_irq() in the guest)
     * to switch the physical device into MSI-X mode because that may come a
     * long time after pci_enable_msix().  This code enables vector 0 with
     * triggering to userspace, then immediately releases the vector, leaving
     * the physical device with no vectors enabled, but MSI-X enabled, just
     * like the guest view.
     */
    vfio_msix_vector_do_use(&vdev->pdev, 0, NULL, NULL);
    vfio_msix_vector_release(&vdev->pdev, 0);

    if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
                                  vfio_msix_vector_release, NULL)) {
        error_report("vfio: msix_set_vector_notifiers failed");
    }

    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);
}
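
/*
 * The Linux guest sequence that motivates the dummy use/release cycle above,
 * sketched for reference (guest-side code, not part of this file):
 *
 *   pci_enable_msix(pdev, entries, nvec);  // sets MSI-X enable, table masked
 *   ...                                    // possibly much later
 *   request_irq(entries[i].vector, ...);   // first vector actually unmasked
 *
 * Without the dummy cycle, the physical device would stay out of MSI-X mode
 * for the whole window between those two calls.
 */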

static void vfio_enable_msi(VFIODevice *vdev)
{
    int ret, i;

    vfio_disable_interrupts(vdev);

    vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
retry:
    vdev->msi_vectors = g_malloc0(vdev->nr_vectors * sizeof(VFIOMSIVector));

    for (i = 0; i < vdev->nr_vectors; i++) {
        MSIMessage msg;
        VFIOMSIVector *vector = &vdev->msi_vectors[i];

        vector->vdev = vdev;
        vector->use = true;

        if (event_notifier_init(&vector->interrupt, 0)) {
            error_report("vfio: Error: event_notifier_init failed");
        }

        msg = msi_get_message(&vdev->pdev, i);

        /*
         * Attempt to enable route through KVM irqchip,
         * default to userspace handling if unavailable.
         */
        vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg);
        if (vector->virq < 0 ||
            kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
                                           vector->virq) < 0) {
            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                                vfio_msi_interrupt, NULL, vector);
        }
    }

    ret = vfio_enable_vectors(vdev, false);
    if (ret) {
        if (ret < 0) {
            error_report("vfio: Error: Failed to setup MSI fds: %m");
        } else if (ret != vdev->nr_vectors) {
            error_report("vfio: Error: Failed to enable %d "
                         "MSI vectors, retry with %d", vdev->nr_vectors, ret);
        }

        for (i = 0; i < vdev->nr_vectors; i++) {
            VFIOMSIVector *vector = &vdev->msi_vectors[i];
            if (vector->virq >= 0) {
                kvm_irqchip_remove_irqfd_notifier(kvm_state,
                                                  &vector->interrupt,
                                                  vector->virq);
                kvm_irqchip_release_virq(kvm_state, vector->virq);
                vector->virq = -1;
            } else {
                qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                                    NULL, NULL, NULL);
            }
            event_notifier_cleanup(&vector->interrupt);
        }

        g_free(vdev->msi_vectors);

        if (ret > 0 && ret != vdev->nr_vectors) {
            vdev->nr_vectors = ret;
            goto retry;
        }
        vdev->nr_vectors = 0;

        return;
    }

    vdev->interrupt = VFIO_INT_MSI;

    DPRINTF("%s(%04x:%02x:%02x.%x) Enabled %d MSI vectors\n", __func__,
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, vdev->nr_vectors);
}

static void vfio_disable_msi_common(VFIODevice *vdev)
{
    g_free(vdev->msi_vectors);
    vdev->msi_vectors = NULL;
    vdev->nr_vectors = 0;
    vdev->interrupt = VFIO_INT_NONE;

    vfio_enable_intx(vdev);
}

static void vfio_disable_msix(VFIODevice *vdev)
{
    msix_unset_vector_notifiers(&vdev->pdev);

    if (vdev->nr_vectors) {
        vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX);
    }

    vfio_disable_msi_common(vdev);

    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);
}

static void vfio_disable_msi(VFIODevice *vdev)
{
    int i;

    vfio_disable_irqindex(vdev, VFIO_PCI_MSI_IRQ_INDEX);

    for (i = 0; i < vdev->nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];

        if (!vector->use) {
            continue;
        }

        if (vector->virq >= 0) {
            kvm_irqchip_remove_irqfd_notifier(kvm_state,
                                              &vector->interrupt,
                                              vector->virq);
            kvm_irqchip_release_virq(kvm_state, vector->virq);
            vector->virq = -1;
        } else {
            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                                NULL, NULL, NULL);
        }

        event_notifier_cleanup(&vector->interrupt);
    }

    vfio_disable_msi_common(vdev);

    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);
}

/*
 * IO Port/MMIO - Beware of the endians, VFIO is always little endian
 */
static void vfio_bar_write(void *opaque, hwaddr addr,
                           uint64_t data, unsigned size)
{
    VFIOBAR *bar = opaque;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;

    switch (size) {
    case 1:
        buf.byte = data;
        break;
    case 2:
        buf.word = cpu_to_le16(data);
        break;
    case 4:
        buf.dword = cpu_to_le32(data);
        break;
    default:
        hw_error("vfio: unsupported write size, %d bytes\n", size);
        break;
    }

    if (pwrite(bar->fd, &buf, size, bar->fd_offset + addr) != size) {
        error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
                     __func__, addr, data, size);
    }

#ifdef DEBUG_VFIO
    {
        VFIODevice *vdev = container_of(bar, VFIODevice, bars[bar->nr]);

        DPRINTF("%s(%04x:%02x:%02x.%x:BAR%d+0x%"HWADDR_PRIx", 0x%"PRIx64
                ", %d)\n", __func__, vdev->host.domain, vdev->host.bus,
                vdev->host.slot, vdev->host.function, bar->nr, addr,
                data, size);
    }
#endif

    /*
     * A read or write to a BAR always signals an INTx EOI.  This will
     * do nothing if not pending (including not in INTx mode).  We assume
     * that a BAR access is in response to an interrupt and that BAR
     * accesses will service the interrupt.  Unfortunately, we don't know
     * which access will service the interrupt, so we're potentially
     * getting quite a few host interrupts per guest interrupt.
     */
    vfio_eoi(container_of(bar, VFIODevice, bars[bar->nr]));
}

static uint64_t vfio_bar_read(void *opaque,
                              hwaddr addr, unsigned size)
{
    VFIOBAR *bar = opaque;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;
    uint64_t data = 0;

    if (pread(bar->fd, &buf, size, bar->fd_offset + addr) != size) {
        error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
                     __func__, addr, size);
        return (uint64_t)-1;
    }

    switch (size) {
    case 1:
        data = buf.byte;
        break;
    case 2:
        data = le16_to_cpu(buf.word);
        break;
    case 4:
        data = le32_to_cpu(buf.dword);
        break;
    default:
        hw_error("vfio: unsupported read size, %d bytes\n", size);
        break;
    }

#ifdef DEBUG_VFIO
    {
        VFIODevice *vdev = container_of(bar, VFIODevice, bars[bar->nr]);

        DPRINTF("%s(%04x:%02x:%02x.%x:BAR%d+0x%"HWADDR_PRIx
                ", %d) = 0x%"PRIx64"\n", __func__, vdev->host.domain,
                vdev->host.bus, vdev->host.slot, vdev->host.function,
                bar->nr, addr, size, data);
    }
#endif

    /* Same as write above */
    vfio_eoi(container_of(bar, VFIODevice, bars[bar->nr]));

    return data;
}

static const MemoryRegionOps vfio_bar_ops = {
    .read = vfio_bar_read,
    .write = vfio_bar_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
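
/*
 * Endianness worked example (descriptive only): a 2-byte guest write of
 * 0x1234 arrives here already in host byte order.  cpu_to_le16() stores it
 * as bytes { 0x34, 0x12 } regardless of host endianness, which is what the
 * little-endian VFIO region expects; le16_to_cpu() reverses the conversion
 * on the read side.  On a little-endian host both are no-ops.
 */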

static void vfio_vga_write(void *opaque, hwaddr addr,
                           uint64_t data, unsigned size)
{
    VFIOVGARegion *region = opaque;
    VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;
    off_t offset = vga->fd_offset + region->offset + addr;

    switch (size) {
    case 1:
        buf.byte = data;
        break;
    case 2:
        buf.word = cpu_to_le16(data);
        break;
    case 4:
        buf.dword = cpu_to_le32(data);
        break;
    default:
        hw_error("vfio: unsupported write size, %d bytes\n", size);
        break;
    }

    if (pwrite(vga->fd, &buf, size, offset) != size) {
        error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
                     __func__, region->offset + addr, data, size);
    }

    DPRINTF("%s(0x%"HWADDR_PRIx", 0x%"PRIx64", %d)\n",
            __func__, region->offset + addr, data, size);
}

static uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size)
{
    VFIOVGARegion *region = opaque;
    VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;
    uint64_t data = 0;
    off_t offset = vga->fd_offset + region->offset + addr;

    if (pread(vga->fd, &buf, size, offset) != size) {
        error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
                     __func__, region->offset + addr, size);
        return (uint64_t)-1;
    }

    switch (size) {
    case 1:
        data = buf.byte;
        break;
    case 2:
        data = le16_to_cpu(buf.word);
        break;
    case 4:
        data = le32_to_cpu(buf.dword);
        break;
    default:
        hw_error("vfio: unsupported read size, %d bytes\n", size);
        break;
    }

    DPRINTF("%s(0x%"HWADDR_PRIx", %d) = 0x%"PRIx64"\n",
            __func__, region->offset + addr, size, data);

    return data;
}

static const MemoryRegionOps vfio_vga_ops = {
    .read = vfio_vga_read,
    .write = vfio_vga_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

/*
 * Device specific quirks
 */

#define PCI_VENDOR_ID_ATI 0x1002

/*
 * Device 1002:68f9 (Advanced Micro Devices [AMD] nee ATI Cedar PRO [Radeon
 * HD 5450/6350]) reports the upper byte of the physical address of the
 * I/O port BAR4 through VGA register 0x3c3.  The BAR is 256 bytes, so the
 * lower byte is known to be zero.  Probing for this quirk reads 0xff from
 * port 0x3c3 on some devices so we store the physical address and replace
 * reads with the virtual address any time it matches.  XXX Research when
 * to enable quirk.
 */
static uint64_t vfio_ati_3c3_quirk_read(void *opaque,
                                        hwaddr addr, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIODevice *vdev = quirk->vdev;
    PCIDevice *pdev = &vdev->pdev;
    uint64_t data = vfio_vga_read(&vdev->vga.region[QEMU_PCI_VGA_IO_HI],
                                  addr + 0x3, size);

    if (data == quirk->data) {
        data = pci_get_byte(pdev->config + PCI_BASE_ADDRESS_4 + 1);
        DPRINTF("%s(0x3c3, 1) = 0x%"PRIx64"\n", __func__, data);
    }

    return data;
}

static const MemoryRegionOps vfio_ati_3c3_quirk = {
    .read = vfio_ati_3c3_quirk_read,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

static void vfio_vga_probe_ati_3c3_quirk(VFIODevice *vdev)
{
    PCIDevice *pdev = &vdev->pdev;
    off_t physoffset = vdev->config_offset + PCI_BASE_ADDRESS_4;
    uint32_t physbar;
    VFIOQuirk *quirk;

    if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI ||
        vdev->bars[4].size < 256) {
        return;
    }

    /* Get I/O port BAR physical address */
    if (pread(vdev->fd, &physbar, 4, physoffset) != 4) {
        error_report("vfio: probe failed for ATI/AMD 0x3c3 quirk on device "
                     "%04x:%02x:%02x.%x", vdev->host.domain,
                     vdev->host.bus, vdev->host.slot, vdev->host.function);
        return;
    }

    quirk = g_malloc0(sizeof(*quirk));
    quirk->vdev = vdev;
    quirk->data = (physbar >> 8) & 0xff;

    memory_region_init_io(&quirk->mem, &vfio_ati_3c3_quirk, quirk,
                          "vfio-ati-3c3-quirk", 1);
    memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem, 3,
                                &quirk->mem);

    QLIST_INSERT_HEAD(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks,
                      quirk, next);

    DPRINTF("Enabled ATI/AMD quirk 0x3c3 for device %04x:%02x:%02x.%x\n",
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function);
}

/*
 * Device 1002:68f9 (Advanced Micro Devices [AMD] nee ATI Cedar PRO [Radeon
 * HD 5450/6350]) reports the physical address of MMIO BAR0 through a
 * write/read operation on I/O port BAR4.  When uint32_t 0x4010 is written
 * to offset 0x0, the subsequent read from offset 0x4 returns the contents
 * of BAR0.  Test for this quirk on all ATI/AMD devices.  XXX - Note that
 * 0x10 is the offset of BAR0 in config space; is this a window to all of
 * config space?
 */
static uint64_t vfio_ati_4010_quirk_read(void *opaque,
                                         hwaddr addr, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIODevice *vdev = quirk->vdev;
    PCIDevice *pdev = &vdev->pdev;
    uint64_t data = vfio_bar_read(&vdev->bars[4], addr, size);

    if (addr == 4 && size == 4 && quirk->data) {
        data = pci_get_long(pdev->config + PCI_BASE_ADDRESS_0);
        DPRINTF("%s(BAR4+0x4) = 0x%"PRIx64"\n", __func__, data);
    }

    quirk->data = 0;

    return data;
}

static void vfio_ati_4010_quirk_write(void *opaque, hwaddr addr,
                                      uint64_t data, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIODevice *vdev = quirk->vdev;

    vfio_bar_write(&vdev->bars[4], addr, data, size);

    quirk->data = (addr == 0 && size == 4 && data == 0x4010) ? 1 : 0;
}

static const MemoryRegionOps vfio_ati_4010_quirk = {
    .read = vfio_ati_4010_quirk_read,
    .write = vfio_ati_4010_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

static void vfio_probe_ati_4010_quirk(VFIODevice *vdev, int nr)
{
    PCIDevice *pdev = &vdev->pdev;
    off_t physoffset = vdev->config_offset + PCI_BASE_ADDRESS_0;
    uint32_t physbar0;
    uint64_t data;
    VFIOQuirk *quirk;

    if (!vdev->has_vga || nr != 4 || !vdev->bars[0].size ||
        pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
        return;
    }

    /* Get BAR0 physical address */
    if (pread(vdev->fd, &physbar0, 4, physoffset) != 4) {
        error_report("vfio: probe failed for ATI/AMD 0x4010 quirk on device "
                     "%04x:%02x:%02x.%x", vdev->host.domain,
                     vdev->host.bus, vdev->host.slot, vdev->host.function);
        return;
    }

    /* Write 0x4010 to I/O port BAR offset 0 */
    vfio_bar_write(&vdev->bars[4], 0, 0x4010, 4);
    /* Read back result */
    data = vfio_bar_read(&vdev->bars[4], 4, 4);

    /* If the register matches the physical address of BAR0, we need a quirk */
    if (data != physbar0) {
        return;
    }

    quirk = g_malloc0(sizeof(*quirk));
    quirk->vdev = vdev;

    memory_region_init_io(&quirk->mem, &vfio_ati_4010_quirk, quirk,
                          "vfio-ati-4010-quirk", 8);
    memory_region_add_subregion_overlap(&vdev->bars[nr].mem, 0, &quirk->mem, 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    DPRINTF("Enabled ATI/AMD quirk 0x4010 for device %04x:%02x:%02x.%x\n",
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function);
}

/*
 * Device 1002:5b63 (Advanced Micro Devices [AMD] nee ATI RV370 [Radeon X550])
 * retrieves the upper half of the MMIO BAR0 physical address by writing
 * 0xf10 to I/O port BAR1 offset 0 and reading the result from offset 6.
 * XXX - 0x10 is the offset of BAR0 in PCI config space; this could provide
 * full access to config space.  Config space is little endian, so the data
 * register probably starts at 0x4.
 */
static uint64_t vfio_ati_f10_quirk_read(void *opaque,
                                        hwaddr addr, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIODevice *vdev = quirk->vdev;
    PCIDevice *pdev = &vdev->pdev;
    uint64_t data = vfio_bar_read(&vdev->bars[1], addr, size);

    if (addr == 6 && size == 2 && quirk->data) {
        data = pci_get_word(pdev->config + PCI_BASE_ADDRESS_0 + 2);
        DPRINTF("%s(BAR1+0x6) = 0x%"PRIx64"\n", __func__, data);
    }

    quirk->data = 0;

    return data;
}

static void vfio_ati_f10_quirk_write(void *opaque, hwaddr addr,
                                     uint64_t data, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIODevice *vdev = quirk->vdev;

    vfio_bar_write(&vdev->bars[1], addr, data, size);

    quirk->data = (addr == 0 && size == 4 && data == 0xf10) ? 1 : 0;
}

static const MemoryRegionOps vfio_ati_f10_quirk = {
    .read = vfio_ati_f10_quirk_read,
    .write = vfio_ati_f10_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

static void vfio_probe_ati_f10_quirk(VFIODevice *vdev, int nr)
{
    PCIDevice *pdev = &vdev->pdev;
    off_t physoffset = vdev->config_offset + PCI_BASE_ADDRESS_0;
    uint32_t physbar0;
    uint64_t data;
    VFIOQuirk *quirk;

    if (!vdev->has_vga || nr != 1 || !vdev->bars[0].size ||
        pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
        return;
    }

    /* Get BAR0 physical address */
    if (pread(vdev->fd, &physbar0, 4, physoffset) != 4) {
        error_report("vfio: probe failed for ATI/AMD 0xf10 quirk on device "
                     "%04x:%02x:%02x.%x", vdev->host.domain,
                     vdev->host.bus, vdev->host.slot, vdev->host.function);
        return;
    }

    vfio_bar_write(&vdev->bars[1], 0, 0xf10, 4);
    data = vfio_bar_read(&vdev->bars[1], 0x6, 2);

    /* If the register matches the physical address of BAR0, we need a quirk */
    if (data != (le32_to_cpu(physbar0) >> 16)) {
        return;
    }

    quirk = g_malloc0(sizeof(*quirk));
    quirk->vdev = vdev;

    memory_region_init_io(&quirk->mem, &vfio_ati_f10_quirk, quirk,
                          "vfio-ati-f10-quirk", 8);
    memory_region_add_subregion_overlap(&vdev->bars[nr].mem, 0, &quirk->mem, 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    DPRINTF("Enabled ATI/AMD quirk 0xf10 for device %04x:%02x:%02x.%x\n",
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function);
}

#define PCI_VENDOR_ID_NVIDIA 0x10de

/*
 * Nvidia has several different methods to get to config space, the
 * nouveau project has several of these documented here:
 * https://github.com/pathscale/envytools/tree/master/hwdocs
 *
 * The first quirk is actually not documented in envytools and is found
 * on 10de:01d1 (NVIDIA Corporation G72 [GeForce 7300 LE]).  This is an
 * NV46 chipset.  The backdoor uses the legacy VGA I/O ports to access
 * the mirror of PCI config space found at BAR0 offset 0x1800.  The access
 * sequence first writes 0x338 to I/O port 0x3d4.  The target offset is
 * then written to 0x3d0.  Finally 0x538 is written for a read and 0x738
 * is written for a write to 0x3d4.  The BAR0 offset is then accessible
 * through 0x3d0.  This quirk doesn't seem to be necessary on newer cards
 * that use the I/O port BAR5 window but it doesn't hurt to leave it.
 */
enum {
    NV_3D0_NONE,
    NV_3D0_SELECT,
    NV_3D0_WINDOW,
    NV_3D0_READ,
    NV_3D0_WRITE,
};

static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque,
                                           hwaddr addr, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIODevice *vdev = quirk->vdev;
    PCIDevice *pdev = &vdev->pdev;
    uint64_t data = vfio_vga_read(&vdev->vga.region[QEMU_PCI_VGA_IO_HI],
                                  addr + 0x10, size);

    if (quirk->data == NV_3D0_READ && addr == 0) {
        data = vfio_pci_read_config(pdev, quirk->data2, size);
        DPRINTF("%s(0x3d0, %d) = 0x%"PRIx64"\n", __func__, size, data);
    }

    quirk->data = NV_3D0_NONE;

    return data;
}

static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr,
                                        uint64_t data, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIODevice *vdev = quirk->vdev;
    PCIDevice *pdev = &vdev->pdev;

    switch (quirk->data) {
    case NV_3D0_NONE:
        if (addr == 4 && data == 0x338) {
            quirk->data = NV_3D0_SELECT;
        }
        break;
    case NV_3D0_SELECT:
        quirk->data = NV_3D0_NONE;
        if (addr == 0 && (data & ~0xff) == 0x1800) {
            quirk->data = NV_3D0_WINDOW;
            quirk->data2 = data & 0xff;
        }
        break;
    case NV_3D0_WINDOW:
        quirk->data = NV_3D0_NONE;
        if (addr == 4) {
            if (data == 0x538) {
                quirk->data = NV_3D0_READ;
            } else if (data == 0x738) {
                quirk->data = NV_3D0_WRITE;
            }
        }
        break;
    case NV_3D0_WRITE:
        quirk->data = NV_3D0_NONE;
        if (addr == 0) {
            vfio_pci_write_config(pdev, quirk->data2, data, size);
            DPRINTF("%s(0x3d0, 0x%"PRIx64", %d)\n", __func__, data, size);
            return;
        }
        break;
    default:
        quirk->data = NV_3D0_NONE;
    }

    vfio_vga_write(&vdev->vga.region[QEMU_PCI_VGA_IO_HI],
                   addr + 0x10, data, size);
}

static const MemoryRegionOps vfio_nvidia_3d0_quirk = {
    .read = vfio_nvidia_3d0_quirk_read,
    .write = vfio_nvidia_3d0_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

static void vfio_vga_probe_nvidia_3d0_quirk(VFIODevice *vdev)
{
    PCIDevice *pdev = &vdev->pdev;
    VFIOQuirk *quirk;

    if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA ||
        !vdev->bars[1].size) {
        return;
    }

    quirk = g_malloc0(sizeof(*quirk));
    quirk->vdev = vdev;

    memory_region_init_io(&quirk->mem, &vfio_nvidia_3d0_quirk, quirk,
                          "vfio-nvidia-3d0-quirk", 6);
    memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
                                0x10, &quirk->mem);

    QLIST_INSERT_HEAD(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks,
                      quirk, next);

    DPRINTF("Enabled NVIDIA VGA 0x3d0 quirk for device %04x:%02x:%02x.%x\n",
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function);
}

/*
 * The second quirk is documented in envytools.  The I/O port BAR5 is just
 * a set of address/data ports to the MMIO BARs.  The BAR we care about is
 * again BAR0.  This backdoor is apparently a bit newer than the one above,
 * so we need to trap not only the 256 bytes @0x1800, but all of PCI config
 * space, including extended space, which is available in the 4k window
 * @0x88000.
 */
enum {
    NV_BAR5_ADDRESS = 0x1,
    NV_BAR5_ENABLE = 0x2,
    NV_BAR5_MASTER = 0x4,
    NV_BAR5_VALID = 0x7,
};

static uint64_t vfio_nvidia_bar5_window_quirk_read(void *opaque,
                                                   hwaddr addr, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIODevice *vdev = quirk->vdev;
    uint64_t data = vfio_bar_read(&vdev->bars[5], addr, size);

    if (addr == 0xc && quirk->data == NV_BAR5_VALID) {
        data = vfio_pci_read_config(&vdev->pdev, quirk->data2, size);
        DPRINTF("%s(%04x:%02x:%02x.%x:BAR5+0x%"HWADDR_PRIx", %d) = 0x%"
                PRIx64"\n", __func__, vdev->host.domain, vdev->host.bus,
                vdev->host.slot, vdev->host.function, addr, size, data);
    }

    return data;
}

static void vfio_nvidia_bar5_window_quirk_write(void *opaque, hwaddr addr,
                                                uint64_t data, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIODevice *vdev = quirk->vdev;

    /*
     * Use quirk->data to track enables and quirk->data2 for the offset
     */
    switch (addr) {
    case 0x0:
        if (data & 0x1) {
            quirk->data |= NV_BAR5_MASTER;
        } else {
            quirk->data &= ~NV_BAR5_MASTER;
        }
        break;
    case 0x4:
        if (data & 0x1) {
            quirk->data |= NV_BAR5_ENABLE;
        } else {
            quirk->data &= ~NV_BAR5_ENABLE;
        }
        break;
    case 0x8:
        if (quirk->data & NV_BAR5_MASTER) {
            if ((data & ~0xfff) == 0x88000) {
                quirk->data |= NV_BAR5_ADDRESS;
                quirk->data2 = data & 0xfff;
            } else if ((data & ~0xff) == 0x1800) {
                quirk->data |= NV_BAR5_ADDRESS;
                quirk->data2 = data & 0xff;
            } else {
                quirk->data &= ~NV_BAR5_ADDRESS;
            }
        }
        break;
    case 0xc:
        if (quirk->data == NV_BAR5_VALID) {
            vfio_pci_write_config(&vdev->pdev, quirk->data2, data, size);
            DPRINTF("%s(%04x:%02x:%02x.%x:BAR5+0x%"HWADDR_PRIx", 0x%"
                    PRIx64", %d)\n", __func__, vdev->host.domain,
                    vdev->host.bus, vdev->host.slot, vdev->host.function,
                    addr, data, size);
            return;
        }
    }

    vfio_bar_write(&vdev->bars[5], addr, data, size);
}

static const MemoryRegionOps vfio_nvidia_bar5_window_quirk = {
    .read = vfio_nvidia_bar5_window_quirk_read,
    .write = vfio_nvidia_bar5_window_quirk_write,
    .valid.min_access_size = 4,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

static void vfio_probe_nvidia_bar5_window_quirk(VFIODevice *vdev, int nr)
{
    PCIDevice *pdev = &vdev->pdev;
    VFIOQuirk *quirk;

    if (!vdev->has_vga || nr != 5 ||
        pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) {
        return;
    }

    quirk = g_malloc0(sizeof(*quirk));
    quirk->vdev = vdev;

    memory_region_init_io(&quirk->mem, &vfio_nvidia_bar5_window_quirk, quirk,
                          "vfio-nvidia-bar5-window-quirk", 16);
    memory_region_add_subregion_overlap(&vdev->bars[nr].mem, 0, &quirk->mem, 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    DPRINTF("Enabled NVIDIA BAR5 window quirk for device %04x:%02x:%02x.%x\n",
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function);
}

/*
 * Finally, BAR0 itself.  We want to redirect any accesses to either
 * 0x1800 or 0x88000 through the PCI config space access functions.
 *
 * NB - quirk at page granularity or else it doesn't seem to work when
 * BARs are mmap'd
 *
 * Here's offset 0x88000...
 */
static uint64_t vfio_nvidia_bar0_88000_quirk_read(void *opaque,
                                                  hwaddr addr, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIODevice *vdev = quirk->vdev;
    hwaddr base = 0x88000 & TARGET_PAGE_MASK;
    hwaddr offset = 0x88000 & ~TARGET_PAGE_MASK;
    uint64_t data = vfio_bar_read(&vdev->bars[0], addr + base, size);

    if (ranges_overlap(addr, size, offset, PCI_CONFIG_SPACE_SIZE)) {
        data = vfio_pci_read_config(&vdev->pdev, addr - offset, size);

        DPRINTF("%s(%04x:%02x:%02x.%x:BAR0+0x%"HWADDR_PRIx", %d) = 0x%"
                PRIx64"\n", __func__, vdev->host.domain, vdev->host.bus,
                vdev->host.slot, vdev->host.function, addr + base, size, data);
    }

    return data;
}

static void vfio_nvidia_bar0_88000_quirk_write(void *opaque, hwaddr addr,
                                               uint64_t data, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIODevice *vdev = quirk->vdev;
    hwaddr base = 0x88000 & TARGET_PAGE_MASK;
    hwaddr offset = 0x88000 & ~TARGET_PAGE_MASK;

    if (ranges_overlap(addr, size, offset, PCI_CONFIG_SPACE_SIZE)) {
        vfio_pci_write_config(&vdev->pdev, addr - offset, data, size);

        DPRINTF("%s(%04x:%02x:%02x.%x:BAR0+0x%"HWADDR_PRIx", 0x%"
                PRIx64", %d)\n", __func__, vdev->host.domain, vdev->host.bus,
                vdev->host.slot, vdev->host.function, addr + base, data, size);
    } else {
        vfio_bar_write(&vdev->bars[0], addr + base, data, size);
    }
}

static const MemoryRegionOps vfio_nvidia_bar0_88000_quirk = {
    .read = vfio_nvidia_bar0_88000_quirk_read,
    .write = vfio_nvidia_bar0_88000_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

static void vfio_probe_nvidia_bar0_88000_quirk(VFIODevice *vdev, int nr)
{
    PCIDevice *pdev = &vdev->pdev;
    VFIOQuirk *quirk;

    if (!vdev->has_vga || nr != 0 ||
        pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) {
        return;
    }

    quirk = g_malloc0(sizeof(*quirk));
    quirk->vdev = vdev;

    memory_region_init_io(&quirk->mem, &vfio_nvidia_bar0_88000_quirk, quirk,
                          "vfio-nvidia-bar0-88000-quirk",
                          TARGET_PAGE_ALIGN(PCIE_CONFIG_SPACE_SIZE));
    memory_region_add_subregion_overlap(&vdev->bars[nr].mem,
                                        0x88000 & TARGET_PAGE_MASK,
                                        &quirk->mem, 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    DPRINTF("Enabled NVIDIA BAR0 0x88000 quirk for device %04x:%02x:%02x.%x\n",
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function);
}

/*
 * And here's the same for BAR0 offset 0x1800...
 */
static uint64_t vfio_nvidia_bar0_1800_quirk_read(void *opaque,
                                                 hwaddr addr, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIODevice *vdev = quirk->vdev;
    hwaddr base = 0x1800 & TARGET_PAGE_MASK;
    hwaddr offset = 0x1800 & ~TARGET_PAGE_MASK;
    uint64_t data = vfio_bar_read(&vdev->bars[0], addr + base, size);

    if (ranges_overlap(addr, size, offset, PCI_CONFIG_SPACE_SIZE)) {
        data = vfio_pci_read_config(&vdev->pdev, addr - offset, size);

        DPRINTF("%s(%04x:%02x:%02x.%x:BAR0+0x%"HWADDR_PRIx", %d) = 0x%"
                PRIx64"\n", __func__, vdev->host.domain, vdev->host.bus,
                vdev->host.slot, vdev->host.function, addr + base, size, data);
    }

    return data;
}

static void vfio_nvidia_bar0_1800_quirk_write(void *opaque, hwaddr addr,
                                              uint64_t data, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIODevice *vdev = quirk->vdev;
    hwaddr base = 0x1800 & TARGET_PAGE_MASK;
    hwaddr offset = 0x1800 & ~TARGET_PAGE_MASK;

    if (ranges_overlap(addr, size, offset, PCI_CONFIG_SPACE_SIZE)) {
        vfio_pci_write_config(&vdev->pdev, addr - offset, data, size);

        DPRINTF("%s(%04x:%02x:%02x.%x:BAR0+0x%"HWADDR_PRIx", 0x%"
                PRIx64", %d)\n", __func__, vdev->host.domain, vdev->host.bus,
                vdev->host.slot, vdev->host.function, addr + base, data, size);
    } else {
        vfio_bar_write(&vdev->bars[0], addr + base, data, size);
    }
}

static const MemoryRegionOps vfio_nvidia_bar0_1800_quirk = {
    .read = vfio_nvidia_bar0_1800_quirk_read,
    .write = vfio_nvidia_bar0_1800_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

static void vfio_probe_nvidia_bar0_1800_quirk(VFIODevice *vdev, int nr)
{
    PCIDevice *pdev = &vdev->pdev;
    VFIOQuirk *quirk;

    if (!vdev->has_vga || nr != 0 ||
        pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) {
        return;
    }

    /* Log the chipset ID */
    DPRINTF("Nvidia NV%02x\n",
            (unsigned int)(vfio_bar_read(&vdev->bars[0], 0, 4) >> 20) & 0xff);

    quirk = g_malloc0(sizeof(*quirk));
    quirk->vdev = vdev;

    memory_region_init_io(&quirk->mem, &vfio_nvidia_bar0_1800_quirk, quirk,
                          "vfio-nvidia-bar0-1800-quirk",
                          TARGET_PAGE_ALIGN(PCI_CONFIG_SPACE_SIZE));
    memory_region_add_subregion_overlap(&vdev->bars[nr].mem,
                                        0x1800 & TARGET_PAGE_MASK,
                                        &quirk->mem, 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    DPRINTF("Enabled NVIDIA BAR0 0x1800 quirk for device %04x:%02x:%02x.%x\n",
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function);
}

/*
 * TODO - Some Nvidia devices provide config access to their companion HDA
 * device and even to their parent bridge via these config space mirrors.
 * Add quirks for those regions.
 */

/*
 * Common quirk probe entry points.
 */
static void vfio_vga_quirk_setup(VFIODevice *vdev)
{
    vfio_vga_probe_ati_3c3_quirk(vdev);
    vfio_vga_probe_nvidia_3d0_quirk(vdev);
}

static void vfio_vga_quirk_teardown(VFIODevice *vdev)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(vdev->vga.region); i++) {
        while (!QLIST_EMPTY(&vdev->vga.region[i].quirks)) {
            VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga.region[i].quirks);
            memory_region_del_subregion(&vdev->vga.region[i].mem, &quirk->mem);
            QLIST_REMOVE(quirk, next);
            g_free(quirk);
        }
    }
}

static void vfio_bar_quirk_setup(VFIODevice *vdev, int nr)
{
    vfio_probe_ati_4010_quirk(vdev, nr);
    vfio_probe_ati_f10_quirk(vdev, nr);
    vfio_probe_nvidia_bar5_window_quirk(vdev, nr);
    vfio_probe_nvidia_bar0_88000_quirk(vdev, nr);
    vfio_probe_nvidia_bar0_1800_quirk(vdev, nr);
}

static void vfio_bar_quirk_teardown(VFIODevice *vdev, int nr)
{
    VFIOBAR *bar = &vdev->bars[nr];

    while (!QLIST_EMPTY(&bar->quirks)) {
        VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks);
        memory_region_del_subregion(&bar->mem, &quirk->mem);
        QLIST_REMOVE(quirk, next);
        g_free(quirk);
    }
}
1787
65501a74
AW
1788/*
1789 * PCI config space
1790 */
1791static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
1792{
1793 VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
4b5d5e87 1794 uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
65501a74 1795
4b5d5e87
AW
1796 memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
1797 emu_bits = le32_to_cpu(emu_bits);
65501a74 1798
4b5d5e87
AW
1799 if (emu_bits) {
1800 emu_val = pci_default_read_config(pdev, addr, len);
1801 }
1802
1803 if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
1804 ssize_t ret;
1805
1806 ret = pread(vdev->fd, &phys_val, len, vdev->config_offset + addr);
1807 if (ret != len) {
312fd5f2 1808 error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %m",
65501a74
AW
1809 __func__, vdev->host.domain, vdev->host.bus,
1810 vdev->host.slot, vdev->host.function, addr, len);
1811 return -errno;
1812 }
4b5d5e87 1813 phys_val = le32_to_cpu(phys_val);
65501a74
AW
1814 }
1815
4b5d5e87 1816 val = (emu_val & emu_bits) | (phys_val & ~emu_bits);
65501a74
AW
1817
1818 DPRINTF("%s(%04x:%02x:%02x.%x, @0x%x, len=0x%x) %x\n", __func__,
1819 vdev->host.domain, vdev->host.bus, vdev->host.slot,
1820 vdev->host.function, addr, len, val);
1821
1822 return val;
1823}
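
/*
 * A worked example of the merge above, using hypothetical values: for a
 * 2-byte read with emu_bits = 0x00ff, emu_val = 0x1234 and
 * phys_val = 0xabcd, the result is
 * (0x1234 & 0x00ff) | (0xabcd & 0xff00) = 0xab34 -- the low byte comes
 * from QEMU's emulated config space, the high byte from the physical
 * device.
 */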

static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
                                  uint32_t val, int len)
{
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    uint32_t val_le = cpu_to_le32(val);

    DPRINTF("%s(%04x:%02x:%02x.%x, @0x%x, 0x%x, len=0x%x)\n", __func__,
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, addr, val, len);

    /* Write everything to VFIO, let it filter out what we can't write */
    if (pwrite(vdev->fd, &val_le, len, vdev->config_offset + addr) != len) {
        error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %m",
                     __func__, vdev->host.domain, vdev->host.bus,
                     vdev->host.slot, vdev->host.function, addr, val, len);
    }

    /* MSI/MSI-X Enabling/Disabling */
    if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
        ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
        int is_enabled, was_enabled = msi_enabled(pdev);

        pci_default_write_config(pdev, addr, val, len);

        is_enabled = msi_enabled(pdev);

        if (!was_enabled && is_enabled) {
            vfio_enable_msi(vdev);
        } else if (was_enabled && !is_enabled) {
            vfio_disable_msi(vdev);
        }
    } else if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
               ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
        int is_enabled, was_enabled = msix_enabled(pdev);

        pci_default_write_config(pdev, addr, val, len);

        is_enabled = msix_enabled(pdev);

        if (!was_enabled && is_enabled) {
            vfio_enable_msix(vdev);
        } else if (was_enabled && !is_enabled) {
            vfio_disable_msix(vdev);
        }
    } else {
        /* Write everything to QEMU to keep emulated bits correct */
        pci_default_write_config(pdev, addr, val, len);
    }
}
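
/*
 * A minimal sketch of the enable-transition pattern used above, with
 * hypothetical helpers (cap_enabled, backend_enable, backend_disable):
 * sample the enable state before and after letting QEMU's emulation see
 * the write, and only touch the backend on an actual transition.
 */
#if 0
{
    bool was = cap_enabled(pdev), is;

    pci_default_write_config(pdev, addr, val, len);
    is = cap_enabled(pdev);

    if (!was && is) {
        backend_enable(vdev);       /* e.g. vfio_enable_msi() */
    } else if (was && !is) {
        backend_disable(vdev);      /* e.g. vfio_disable_msi() */
    }
}
#endif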

/*
 * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
 */
static int vfio_dma_unmap(VFIOContainer *container,
                          hwaddr iova, ram_addr_t size)
{
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova = iova,
        .size = size,
    };

    if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        DPRINTF("VFIO_UNMAP_DMA: %d\n", -errno);
        return -errno;
    }

    return 0;
}

static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
                        ram_addr_t size, void *vaddr, bool readonly)
{
    struct vfio_iommu_type1_dma_map map = {
        .argsz = sizeof(map),
        .flags = VFIO_DMA_MAP_FLAG_READ,
        .vaddr = (__u64)(uintptr_t)vaddr,
        .iova = iova,
        .size = size,
    };

    if (!readonly) {
        map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
    }

    /*
     * Try the mapping; if it fails with EBUSY, unmap the region and try
     * again.  This shouldn't be necessary, but we sometimes see it in
     * the VGA ROM space.
     */
    if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
        (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 &&
         ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
        return 0;
    }

    DPRINTF("VFIO_MAP_DMA: %d\n", -errno);
    return -errno;
}
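
/*
 * A usage sketch, assuming an already-initialized container and a
 * page-aligned buffer: map one page of memory writable at IOVA 0x100000,
 * then tear the mapping down again.
 */
#if 0
{
    void *buf = qemu_memalign(TARGET_PAGE_SIZE, TARGET_PAGE_SIZE);

    if (!vfio_dma_map(container, 0x100000, TARGET_PAGE_SIZE, buf, false)) {
        /* ... the device can now DMA to/from buf via IOVA 0x100000 ... */
        vfio_dma_unmap(container, 0x100000, TARGET_PAGE_SIZE);
    }
    qemu_vfree(buf);
}
#endif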

static bool vfio_listener_skipped_section(MemoryRegionSection *section)
{
    return !memory_region_is_ram(section->mr);
}

static void vfio_listener_region_add(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer,
                                            iommu_data.listener);
    hwaddr iova, end;
    void *vaddr;
    int ret;

    if (vfio_listener_skipped_section(section)) {
        DPRINTF("SKIPPING region_add %"HWADDR_PRIx" - %"PRIx64"\n",
                section->offset_within_address_space,
                section->offset_within_address_space + section->size - 1);
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    end = (section->offset_within_address_space + section->size) &
          TARGET_PAGE_MASK;

    if (iova >= end) {
        return;
    }

    vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);

    DPRINTF("region_add %"HWADDR_PRIx" - %"HWADDR_PRIx" [%p]\n",
            iova, end - 1, vaddr);

    ret = vfio_dma_map(container, iova, end - iova, vaddr, section->readonly);
    if (ret) {
        error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
                     "0x%"HWADDR_PRIx", %p) = %d (%m)",
                     container, iova, end - iova, vaddr, ret);
    }
}
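
/*
 * A worked example of the clamping above, assuming 4 KiB pages and a
 * hypothetical section whose sub-page offsets match: with
 * offset_within_address_space = 0xe0000800 and size = 0x3000,
 * iova = TARGET_PAGE_ALIGN(0xe0000800) = 0xe0001000 and
 * end = 0xe0003800 & TARGET_PAGE_MASK = 0xe0003000, so only the two
 * fully-contained pages get mapped.
 */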

static void vfio_listener_region_del(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer,
                                            iommu_data.listener);
    hwaddr iova, end;
    int ret;

    if (vfio_listener_skipped_section(section)) {
        DPRINTF("SKIPPING region_del %"HWADDR_PRIx" - %"PRIx64"\n",
                section->offset_within_address_space,
                section->offset_within_address_space + section->size - 1);
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    end = (section->offset_within_address_space + section->size) &
          TARGET_PAGE_MASK;

    if (iova >= end) {
        return;
    }

    DPRINTF("region_del %"HWADDR_PRIx" - %"HWADDR_PRIx"\n",
            iova, end - 1);

    ret = vfio_dma_unmap(container, iova, end - iova);
    if (ret) {
        error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                     "0x%"HWADDR_PRIx") = %d (%m)",
                     container, iova, end - iova, ret);
    }
}

static MemoryListener vfio_memory_listener = {
    .region_add = vfio_listener_region_add,
    .region_del = vfio_listener_region_del,
};

static void vfio_listener_release(VFIOContainer *container)
{
    memory_listener_unregister(&container->iommu_data.listener);
}

/*
 * Interrupt setup
 */
static void vfio_disable_interrupts(VFIODevice *vdev)
{
    switch (vdev->interrupt) {
    case VFIO_INT_INTx:
        vfio_disable_intx(vdev);
        break;
    case VFIO_INT_MSI:
        vfio_disable_msi(vdev);
        break;
    case VFIO_INT_MSIX:
        vfio_disable_msix(vdev);
        break;
    }
}

static int vfio_setup_msi(VFIODevice *vdev, int pos)
{
    uint16_t ctrl;
    bool msi_64bit, msi_maskbit;
    int ret, entries;

    if (pread(vdev->fd, &ctrl, sizeof(ctrl),
              vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
        return -errno;
    }
    ctrl = le16_to_cpu(ctrl);

    msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
    msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
    entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);

    DPRINTF("%04x:%02x:%02x.%x PCI MSI CAP @0x%x\n", vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function, pos);

    ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit);
    if (ret < 0) {
        if (ret == -ENOTSUP) {
            return 0;
        }
        error_report("vfio: msi_init failed");
        return ret;
    }
    vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);

    return 0;
}
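
/*
 * An example decode of the Message Control register above, for the
 * hypothetical value ctrl = 0x0086: PCI_MSI_FLAGS_64BIT (0x0080) is set,
 * PCI_MSI_FLAGS_MASKBIT (0x0100) is clear, and the Multiple Message
 * Capable field is (0x0086 & PCI_MSI_FLAGS_QMASK) >> 1 = 3, so
 * entries = 1 << 3 = 8 vectors.  The capability size then follows the
 * rule used for msi_cap_size: 0xa bytes base + 0x4 for 64-bit
 * addressing = 0xe bytes.
 */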

/*
 * We don't have any control over how pci_add_capability() inserts
 * capabilities into the chain.  In order to set up MSI-X we need a
 * MemoryRegion for the BAR.  In order to set up the BAR and not
 * attempt to mmap the MSI-X table area, which VFIO won't allow, we
 * need to first look for where the MSI-X table lives.  So we
 * unfortunately split MSI-X setup across two functions.
 */
static int vfio_early_setup_msix(VFIODevice *vdev)
{
    uint8_t pos;
    uint16_t ctrl;
    uint32_t table, pba;

    pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
    if (!pos) {
        return 0;
    }

    if (pread(vdev->fd, &ctrl, sizeof(ctrl),
              vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
        return -errno;
    }

    if (pread(vdev->fd, &table, sizeof(table),
              vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
        return -errno;
    }

    if (pread(vdev->fd, &pba, sizeof(pba),
              vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
        return -errno;
    }

    ctrl = le16_to_cpu(ctrl);
    table = le32_to_cpu(table);
    pba = le32_to_cpu(pba);

    vdev->msix = g_malloc0(sizeof(*(vdev->msix)));
    vdev->msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
    vdev->msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
    vdev->msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
    vdev->msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
    vdev->msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;

    DPRINTF("%04x:%02x:%02x.%x "
            "PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d\n",
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, pos, vdev->msix->table_bar,
            vdev->msix->table_offset, vdev->msix->entries);

    return 0;
}
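
/*
 * An example decode, with hypothetical register values: table = 0x2003
 * and pba = 0x3003 place the vector table at offset 0x2000 and the PBA
 * at offset 0x3000, both in BAR 3, since the low bits
 * (PCI_MSIX_FLAGS_BIRMASK) hold the BAR index and the remainder the
 * offset.  With ctrl & PCI_MSIX_FLAGS_QSIZE = 7, the table has
 * 7 + 1 = 8 entries.
 */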

static int vfio_setup_msix(VFIODevice *vdev, int pos)
{
    int ret;

    ret = msix_init(&vdev->pdev, vdev->msix->entries,
                    &vdev->bars[vdev->msix->table_bar].mem,
                    vdev->msix->table_bar, vdev->msix->table_offset,
                    &vdev->bars[vdev->msix->pba_bar].mem,
                    vdev->msix->pba_bar, vdev->msix->pba_offset, pos);
    if (ret < 0) {
        if (ret == -ENOTSUP) {
            return 0;
        }
        error_report("vfio: msix_init failed");
        return ret;
    }

    return 0;
}

static void vfio_teardown_msi(VFIODevice *vdev)
{
    msi_uninit(&vdev->pdev);

    if (vdev->msix) {
        msix_uninit(&vdev->pdev, &vdev->bars[vdev->msix->table_bar].mem,
                    &vdev->bars[vdev->msix->pba_bar].mem);
    }
}

/*
 * Resource setup
 */
static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled)
{
    int i;

    for (i = 0; i < PCI_ROM_SLOT; i++) {
        VFIOBAR *bar = &vdev->bars[i];

        if (!bar->size) {
            continue;
        }

        memory_region_set_enabled(&bar->mmap_mem, enabled);
        if (vdev->msix && vdev->msix->table_bar == i) {
            memory_region_set_enabled(&vdev->msix->mmap_mem, enabled);
        }
    }
}

static void vfio_unmap_bar(VFIODevice *vdev, int nr)
{
    VFIOBAR *bar = &vdev->bars[nr];

    if (!bar->size) {
        return;
    }

    vfio_bar_quirk_teardown(vdev, nr);

    memory_region_del_subregion(&bar->mem, &bar->mmap_mem);
    munmap(bar->mmap, memory_region_size(&bar->mmap_mem));

    if (vdev->msix && vdev->msix->table_bar == nr) {
        memory_region_del_subregion(&bar->mem, &vdev->msix->mmap_mem);
        munmap(vdev->msix->mmap, memory_region_size(&vdev->msix->mmap_mem));
    }

    memory_region_destroy(&bar->mem);
}

static int vfio_mmap_bar(VFIOBAR *bar, MemoryRegion *mem, MemoryRegion *submem,
                         void **map, size_t size, off_t offset,
                         const char *name)
{
    int ret = 0;

    if (VFIO_ALLOW_MMAP && size && bar->flags & VFIO_REGION_INFO_FLAG_MMAP) {
        int prot = 0;

        if (bar->flags & VFIO_REGION_INFO_FLAG_READ) {
            prot |= PROT_READ;
        }

        if (bar->flags & VFIO_REGION_INFO_FLAG_WRITE) {
            prot |= PROT_WRITE;
        }

        *map = mmap(NULL, size, prot, MAP_SHARED,
                    bar->fd, bar->fd_offset + offset);
        if (*map == MAP_FAILED) {
            *map = NULL;
            ret = -errno;
            goto empty_region;
        }

        memory_region_init_ram_ptr(submem, name, size, *map);
    } else {
empty_region:
        /* Create a zero-sized sub-region to make cleanup easy. */
        memory_region_init(submem, name, 0);
    }

    memory_region_add_subregion(mem, offset, submem);

    return ret;
}

static void vfio_map_bar(VFIODevice *vdev, int nr)
{
    VFIOBAR *bar = &vdev->bars[nr];
    unsigned size = bar->size;
    char name[64];
    uint32_t pci_bar;
    uint8_t type;
    int ret;

    /* Skip both unimplemented BARs and the upper half of 64-bit BARs. */
    if (!size) {
        return;
    }

    snprintf(name, sizeof(name), "VFIO %04x:%02x:%02x.%x BAR %d",
             vdev->host.domain, vdev->host.bus, vdev->host.slot,
             vdev->host.function, nr);

    /* Determine what type of BAR this is for registration */
    ret = pread(vdev->fd, &pci_bar, sizeof(pci_bar),
                vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
    if (ret != sizeof(pci_bar)) {
        error_report("vfio: Failed to read BAR %d (%m)", nr);
        return;
    }

    pci_bar = le32_to_cpu(pci_bar);
    type = pci_bar & (pci_bar & PCI_BASE_ADDRESS_SPACE_IO ?
           ~PCI_BASE_ADDRESS_IO_MASK : ~PCI_BASE_ADDRESS_MEM_MASK);

    /* A "slow" read/write mapping underlies all BARs */
    memory_region_init_io(&bar->mem, &vfio_bar_ops, bar, name, size);
    pci_register_bar(&vdev->pdev, nr, type, &bar->mem);

    /*
     * We can't mmap areas overlapping the MSIX vector table, so we
     * potentially insert a direct-mapped subregion before and after it.
     */
    if (vdev->msix && vdev->msix->table_bar == nr) {
        size = vdev->msix->table_offset & TARGET_PAGE_MASK;
    }

    strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
    if (vfio_mmap_bar(bar, &bar->mem,
                      &bar->mmap_mem, &bar->mmap, size, 0, name)) {
        error_report("%s unsupported. Performance may be slow", name);
    }

    if (vdev->msix && vdev->msix->table_bar == nr) {
        unsigned start;

        start = TARGET_PAGE_ALIGN(vdev->msix->table_offset +
                                  (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));

        size = start < bar->size ? bar->size - start : 0;
        strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1);
        /* VFIOMSIXInfo contains another MemoryRegion for this mapping */
        if (vfio_mmap_bar(bar, &bar->mem, &vdev->msix->mmap_mem,
                          &vdev->msix->mmap, size, start, name)) {
            error_report("%s unsupported. Performance may be slow", name);
        }
    }

    vfio_bar_quirk_setup(vdev, nr);
}
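
/*
 * A worked example of the split mmap above, with 4 KiB pages and a
 * hypothetical geometry: for a 64 KiB BAR with the MSI-X table at offset
 * 0x3000 and 16 entries, the low mmap covers [0, 0x3000) and the high
 * mmap starts at TARGET_PAGE_ALIGN(0x3000 + 16 * PCI_MSIX_ENTRY_SIZE) =
 * TARGET_PAGE_ALIGN(0x3100) = 0x4000, covering [0x4000, 0x10000).  The
 * page holding the table itself stays on the slow read/write path.
 */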

static void vfio_map_bars(VFIODevice *vdev)
{
    int i;

    for (i = 0; i < PCI_ROM_SLOT; i++) {
        vfio_map_bar(vdev, i);
    }

    if (vdev->has_vga) {
        memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_MEM].mem,
                              &vfio_vga_ops,
                              &vdev->vga.region[QEMU_PCI_VGA_MEM],
                              "vfio-vga-mmio@0xa0000",
                              QEMU_PCI_VGA_MEM_SIZE);
        memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem,
                              &vfio_vga_ops,
                              &vdev->vga.region[QEMU_PCI_VGA_IO_LO],
                              "vfio-vga-io@0x3b0",
                              QEMU_PCI_VGA_IO_LO_SIZE);
        memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
                              &vfio_vga_ops,
                              &vdev->vga.region[QEMU_PCI_VGA_IO_HI],
                              "vfio-vga-io@0x3c0",
                              QEMU_PCI_VGA_IO_HI_SIZE);

        pci_register_vga(&vdev->pdev, &vdev->vga.region[QEMU_PCI_VGA_MEM].mem,
                         &vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem,
                         &vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem);
        vfio_vga_quirk_setup(vdev);
    }
}

static void vfio_unmap_bars(VFIODevice *vdev)
{
    int i;

    for (i = 0; i < PCI_ROM_SLOT; i++) {
        vfio_unmap_bar(vdev, i);
    }

    if (vdev->has_vga) {
        vfio_vga_quirk_teardown(vdev);
        pci_unregister_vga(&vdev->pdev);
        memory_region_destroy(&vdev->vga.region[QEMU_PCI_VGA_MEM].mem);
        memory_region_destroy(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem);
        memory_region_destroy(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem);
    }
}

/*
 * General setup
 */
static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
{
    uint8_t tmp, next = 0xff;

    for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp;
         tmp = pdev->config[tmp + 1]) {
        if (tmp > pos && tmp < next) {
            next = tmp;
        }
    }

    return next - pos;
}
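
/*
 * An example of the size calculation above, for a hypothetical chain
 * with capabilities at 0x40, 0x50 and 0x60: querying pos = 0x50 scans
 * the list, finds 0x60 as the smallest position greater than 0x50, and
 * returns 0x60 - 0x50 = 0x10 bytes.  For the last capability no larger
 * position exists, so next stays 0xff and the size runs to the end of
 * standard config space.
 */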

static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
{
    pci_set_word(buf, (pci_get_word(buf) & ~mask) | val);
}

static void vfio_add_emulated_word(VFIODevice *vdev, int pos,
                                   uint16_t val, uint16_t mask)
{
    vfio_set_word_bits(vdev->pdev.config + pos, val, mask);
    vfio_set_word_bits(vdev->pdev.wmask + pos, ~mask, mask);
    vfio_set_word_bits(vdev->emulated_config_bits + pos, mask, mask);
}

static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask)
{
    pci_set_long(buf, (pci_get_long(buf) & ~mask) | val);
}

static void vfio_add_emulated_long(VFIODevice *vdev, int pos,
                                   uint32_t val, uint32_t mask)
{
    vfio_set_long_bits(vdev->pdev.config + pos, val, mask);
    vfio_set_long_bits(vdev->pdev.wmask + pos, ~mask, mask);
    vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask);
}

static int vfio_setup_pcie_cap(VFIODevice *vdev, int pos, uint8_t size)
{
    uint16_t flags;
    uint8_t type;

    flags = pci_get_word(vdev->pdev.config + pos + PCI_CAP_FLAGS);
    type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;

    if (type != PCI_EXP_TYPE_ENDPOINT &&
        type != PCI_EXP_TYPE_LEG_END &&
        type != PCI_EXP_TYPE_RC_END) {

        error_report("vfio: Assignment of PCIe type 0x%x "
                     "devices is not currently supported", type);
        return -EINVAL;
    }

    if (!pci_bus_is_express(vdev->pdev.bus)) {
        /*
         * Use the express capability as-is on a conventional PCI bus.  It
         * doesn't make much sense to even expose it, but some drivers
         * (ex. tg3) depend on it and guests don't seem to be particular
         * about it.  We'll need to revisit this or force express devices
         * to express buses if we ever expose an IOMMU to the guest.
         */
    } else if (pci_bus_is_root(vdev->pdev.bus)) {
        /*
         * On a Root Complex bus Endpoints become Root Complex Integrated
         * Endpoints, which changes the type and clears the LNK & LNK2 fields.
         */
        if (type == PCI_EXP_TYPE_ENDPOINT) {
            vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
                                   PCI_EXP_TYPE_RC_END << 4,
                                   PCI_EXP_FLAGS_TYPE);

            /* Link Capabilities, Status, and Control go away */
            if (size > PCI_EXP_LNKCTL) {
                vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP, 0, ~0);
                vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
                vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA, 0, ~0);

#ifndef PCI_EXP_LNKCAP2
#define PCI_EXP_LNKCAP2 44
#endif
#ifndef PCI_EXP_LNKSTA2
#define PCI_EXP_LNKSTA2 50
#endif
                /* Link 2 Capabilities, Status, and Control go away */
                if (size > PCI_EXP_LNKCAP2) {
                    vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP2, 0, ~0);
                    vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL2, 0, ~0);
                    vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA2, 0, ~0);
                }
            }

        } else if (type == PCI_EXP_TYPE_LEG_END) {
            /*
             * Legacy endpoints don't belong on the root complex.  Windows
             * seems to be happier with devices if we skip the capability.
             */
            return 0;
        }

    } else {
        /*
         * Convert Root Complex Integrated Endpoints to regular endpoints.
         * These devices don't support LNK/LNK2 capabilities, so make them up.
         */
        if (type == PCI_EXP_TYPE_RC_END) {
            vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
                                   PCI_EXP_TYPE_ENDPOINT << 4,
                                   PCI_EXP_FLAGS_TYPE);
            vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP,
                                   PCI_EXP_LNK_MLW_1 | PCI_EXP_LNK_LS_25, ~0);
            vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
        }

        /* Mark the Link Status bits as emulated to allow virtual negotiation */
        vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA,
                               pci_get_word(vdev->pdev.config + pos +
                                            PCI_EXP_LNKSTA),
                               PCI_EXP_LNKCAP_MLW | PCI_EXP_LNKCAP_SLS);
    }

    pos = pci_add_capability(&vdev->pdev, PCI_CAP_ID_EXP, pos, size);
    if (pos >= 0) {
        vdev->pdev.exp.exp_cap = pos;
    }

    return pos;
}

static int vfio_add_std_cap(VFIODevice *vdev, uint8_t pos)
{
    PCIDevice *pdev = &vdev->pdev;
    uint8_t cap_id, next, size;
    int ret;

    cap_id = pdev->config[pos];
    next = pdev->config[pos + 1];

    /*
     * If it becomes important to configure capabilities to their actual
     * size, use this as the default when it's something we don't recognize.
     * Since QEMU doesn't actually handle many of the config accesses,
     * exact size doesn't seem worthwhile.
     */
    size = vfio_std_cap_max_size(pdev, pos);

    /*
     * pci_add_capability always inserts the new capability at the head
     * of the chain.  Therefore to end up with a chain that matches the
     * physical device, we insert from the end by making this recursive.
     * This is also why we pre-calculate size above, as cached config space
     * will be changed as we unwind the stack.
     */
    if (next) {
        ret = vfio_add_std_cap(vdev, next);
        if (ret) {
            return ret;
        }
    } else {
        /* Begin the rebuild, use QEMU emulated list bits */
        pdev->config[PCI_CAPABILITY_LIST] = 0;
        vdev->emulated_config_bits[PCI_CAPABILITY_LIST] = 0xff;
        vdev->emulated_config_bits[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
    }

    /* Use emulated next pointer to allow dropping caps */
    pci_set_byte(vdev->emulated_config_bits + pos + 1, 0xff);

    switch (cap_id) {
    case PCI_CAP_ID_MSI:
        ret = vfio_setup_msi(vdev, pos);
        break;
    case PCI_CAP_ID_EXP:
        ret = vfio_setup_pcie_cap(vdev, pos, size);
        break;
    case PCI_CAP_ID_MSIX:
        ret = vfio_setup_msix(vdev, pos);
        break;
    case PCI_CAP_ID_PM:
        vdev->pm_cap = pos;
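        /* fall through: the PM capability is still added to the chain below */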
    default:
        ret = pci_add_capability(pdev, cap_id, pos, size);
        break;
    }

    if (ret < 0) {
        error_report("vfio: %04x:%02x:%02x.%x Error adding PCI capability "
                     "0x%x[0x%x]@0x%x: %d", vdev->host.domain,
                     vdev->host.bus, vdev->host.slot, vdev->host.function,
                     cap_id, size, pos, ret);
        return ret;
    }

    return 0;
}
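
/*
 * An example of the recursive rebuild, for a hypothetical physical chain
 * 0x40 -> 0x50 -> 0x60: the recursion bottoms out at 0x60 and
 * capabilities are re-added in the order 0x60, 0x50, 0x40.  Because
 * pci_add_capability() always inserts at the head, the emulated chain
 * ends up 0x40 -> 0x50 -> 0x60, matching the hardware.
 */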

static int vfio_add_capabilities(VFIODevice *vdev)
{
    PCIDevice *pdev = &vdev->pdev;

    if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
        !pdev->config[PCI_CAPABILITY_LIST]) {
        return 0; /* Nothing to add */
    }

    return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
}

static int vfio_load_rom(VFIODevice *vdev)
{
    uint64_t size = vdev->rom_size;
    char name[32];
    off_t off = 0, voff = vdev->rom_offset;
    ssize_t bytes;
    void *ptr;

    /* If loading ROM from file, pci handles it */
    if (vdev->pdev.romfile || !vdev->pdev.rom_bar || !size) {
        return 0;
    }

    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);

    snprintf(name, sizeof(name), "vfio[%04x:%02x:%02x.%x].rom",
             vdev->host.domain, vdev->host.bus, vdev->host.slot,
             vdev->host.function);
    memory_region_init_ram(&vdev->pdev.rom, name, size);
    ptr = memory_region_get_ram_ptr(&vdev->pdev.rom);
    memset(ptr, 0xff, size);

    while (size) {
        bytes = pread(vdev->fd, ptr + off, size, voff + off);
        if (bytes == 0) {
            break; /* expect that we could get back less than the ROM BAR */
        } else if (bytes > 0) {
            off += bytes;
            size -= bytes;
        } else {
            if (errno == EINTR || errno == EAGAIN) {
                continue;
            }
            error_report("vfio: Error reading device ROM: %m");
            memory_region_destroy(&vdev->pdev.rom);
            return -errno;
        }
    }

    pci_register_bar(&vdev->pdev, PCI_ROM_SLOT, 0, &vdev->pdev.rom);
    vdev->pdev.has_rom = true;
    return 0;
}

static int vfio_connect_container(VFIOGroup *group)
{
    VFIOContainer *container;
    int ret, fd;

    if (group->container) {
        return 0;
    }

    QLIST_FOREACH(container, &container_list, next) {
        if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
            group->container = container;
            QLIST_INSERT_HEAD(&container->group_list, group, container_next);
            return 0;
        }
    }

    fd = qemu_open("/dev/vfio/vfio", O_RDWR);
    if (fd < 0) {
        error_report("vfio: failed to open /dev/vfio/vfio: %m");
        return -errno;
    }

    ret = ioctl(fd, VFIO_GET_API_VERSION);
    if (ret != VFIO_API_VERSION) {
        error_report("vfio: supported vfio version: %d, "
                     "reported version: %d", VFIO_API_VERSION, ret);
        close(fd);
        return -EINVAL;
    }

    container = g_malloc0(sizeof(*container));
    container->fd = fd;

    if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
        if (ret) {
            error_report("vfio: failed to set group container: %m");
            g_free(container);
            close(fd);
            return -errno;
        }

        ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
        if (ret) {
            error_report("vfio: failed to set iommu for container: %m");
            g_free(container);
            close(fd);
            return -errno;
        }

        container->iommu_data.listener = vfio_memory_listener;
        container->iommu_data.release = vfio_listener_release;

        memory_listener_register(&container->iommu_data.listener,
                                 &address_space_memory);
    } else {
        error_report("vfio: No available IOMMU models");
        g_free(container);
        close(fd);
        return -EINVAL;
    }

    QLIST_INIT(&container->group_list);
    QLIST_INSERT_HEAD(&container_list, container, next);

    group->container = container;
    QLIST_INSERT_HEAD(&container->group_list, group, container_next);

    return 0;
}
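
/*
 * The same bring-up, reduced to its raw ioctl skeleton (a sketch, minus
 * the error handling and container reuse above; group_fd would come from
 * opening /dev/vfio/<groupid>, as vfio_get_group() does below):
 */
#if 0
{
    int container_fd = qemu_open("/dev/vfio/vfio", O_RDWR);

    if (ioctl(container_fd, VFIO_GET_API_VERSION) == VFIO_API_VERSION &&
        ioctl(container_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU) &&
        !ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container_fd) &&
        !ioctl(container_fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) {
        /* VFIO_IOMMU_MAP_DMA / VFIO_IOMMU_UNMAP_DMA are now usable */
    }
}
#endif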

static void vfio_disconnect_container(VFIOGroup *group)
{
    VFIOContainer *container = group->container;

    if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
        error_report("vfio: error disconnecting group %d from container",
                     group->groupid);
    }

    QLIST_REMOVE(group, container_next);
    group->container = NULL;

    if (QLIST_EMPTY(&container->group_list)) {
        if (container->iommu_data.release) {
            container->iommu_data.release(container);
        }
        QLIST_REMOVE(container, next);
        DPRINTF("vfio_disconnect_container: close container->fd\n");
        close(container->fd);
        g_free(container);
    }
}

static VFIOGroup *vfio_get_group(int groupid)
{
    VFIOGroup *group;
    char path[32];
    struct vfio_group_status status = { .argsz = sizeof(status) };

    QLIST_FOREACH(group, &group_list, next) {
        if (group->groupid == groupid) {
            return group;
        }
    }

    group = g_malloc0(sizeof(*group));

    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
    group->fd = qemu_open(path, O_RDWR);
    if (group->fd < 0) {
        error_report("vfio: error opening %s: %m", path);
        g_free(group);
        return NULL;
    }

    if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
        error_report("vfio: error getting group status: %m");
        close(group->fd);
        g_free(group);
        return NULL;
    }

    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
        error_report("vfio: error, group %d is not viable, please ensure "
                     "all devices within the iommu_group are bound to their "
                     "vfio bus driver.", groupid);
        close(group->fd);
        g_free(group);
        return NULL;
    }

    group->groupid = groupid;
    QLIST_INIT(&group->device_list);

    if (vfio_connect_container(group)) {
        error_report("vfio: failed to setup container for group %d", groupid);
        close(group->fd);
        g_free(group);
        return NULL;
    }

    QLIST_INSERT_HEAD(&group_list, group, next);

    return group;
}

static void vfio_put_group(VFIOGroup *group)
{
    if (!QLIST_EMPTY(&group->device_list)) {
        return;
    }

    vfio_disconnect_container(group);
    QLIST_REMOVE(group, next);
    DPRINTF("vfio_put_group: close group->fd\n");
    close(group->fd);
    g_free(group);
}

static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev)
{
    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
    struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
    int ret, i;

    ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
    if (ret < 0) {
        error_report("vfio: error getting device %s from group %d: %m",
                     name, group->groupid);
        error_printf("Verify all devices in group %d are bound to vfio-pci "
                     "or pci-stub and not already in use\n", group->groupid);
        return ret;
    }

    vdev->fd = ret;
    vdev->group = group;
    QLIST_INSERT_HEAD(&group->device_list, vdev, next);

    /* Sanity check device */
    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_INFO, &dev_info);
    if (ret) {
        error_report("vfio: error getting device info: %m");
        goto error;
    }

    DPRINTF("Device %s flags: %u, regions: %u, irqs: %u\n", name,
            dev_info.flags, dev_info.num_regions, dev_info.num_irqs);

    if (!(dev_info.flags & VFIO_DEVICE_FLAGS_PCI)) {
        error_report("vfio: Um, this isn't a PCI device");
        goto error;
    }

    vdev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
    if (!vdev->reset_works) {
        error_report("Warning, device %s does not support reset", name);
    }

    if (dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
        error_report("vfio: unexpected number of io regions %u",
                     dev_info.num_regions);
        goto error;
    }

    if (dev_info.num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) {
        error_report("vfio: unexpected number of irqs %u", dev_info.num_irqs);
        goto error;
    }

    for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
        reg_info.index = i;

        ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
        if (ret) {
            error_report("vfio: Error getting region %d info: %m", i);
            goto error;
        }

        DPRINTF("Device %s region %d:\n", name, i);
        DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
                (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
                (unsigned long)reg_info.flags);

        vdev->bars[i].flags = reg_info.flags;
        vdev->bars[i].size = reg_info.size;
        vdev->bars[i].fd_offset = reg_info.offset;
        vdev->bars[i].fd = vdev->fd;
        vdev->bars[i].nr = i;
        QLIST_INIT(&vdev->bars[i].quirks);
    }

    reg_info.index = VFIO_PCI_ROM_REGION_INDEX;

    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
    if (ret) {
        error_report("vfio: Error getting ROM info: %m");
        goto error;
    }

    DPRINTF("Device %s ROM:\n", name);
    DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
            (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
            (unsigned long)reg_info.flags);

    vdev->rom_size = reg_info.size;
    vdev->rom_offset = reg_info.offset;

    reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX;

    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
    if (ret) {
        error_report("vfio: Error getting config info: %m");
        goto error;
    }

    DPRINTF("Device %s config:\n", name);
    DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
            (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
            (unsigned long)reg_info.flags);

    vdev->config_size = reg_info.size;
    if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) {
        vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS;
    }
    vdev->config_offset = reg_info.offset;

    if ((vdev->features & VFIO_FEATURE_ENABLE_VGA) &&
        dev_info.num_regions > VFIO_PCI_VGA_REGION_INDEX) {
        struct vfio_region_info vga_info = {
            .argsz = sizeof(vga_info),
            .index = VFIO_PCI_VGA_REGION_INDEX,
        };

        ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &vga_info);
        if (ret) {
            error_report(
                "vfio: Device does not support requested feature x-vga");
            goto error;
        }

        if (!(vga_info.flags & VFIO_REGION_INFO_FLAG_READ) ||
            !(vga_info.flags & VFIO_REGION_INFO_FLAG_WRITE) ||
            vga_info.size < 0xbffff + 1) {
            error_report("vfio: Unexpected VGA info, flags 0x%lx, size 0x%lx",
                         (unsigned long)vga_info.flags,
                         (unsigned long)vga_info.size);
            goto error;
        }

        vdev->vga.fd_offset = vga_info.offset;
        vdev->vga.fd = vdev->fd;

        vdev->vga.region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE;
        vdev->vga.region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM;
        QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_MEM].quirks);

        vdev->vga.region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE;
        vdev->vga.region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO;
        QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].quirks);

        vdev->vga.region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE;
        vdev->vga.region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI;
        QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks);

        vdev->has_vga = true;
    }

error:
    if (ret) {
        QLIST_REMOVE(vdev, next);
        vdev->group = NULL;
        close(vdev->fd);
    }
    return ret;
}

static void vfio_put_device(VFIODevice *vdev)
{
    QLIST_REMOVE(vdev, next);
    vdev->group = NULL;
    DPRINTF("vfio_put_device: close vdev->fd\n");
    close(vdev->fd);
    if (vdev->msix) {
        g_free(vdev->msix);
        vdev->msix = NULL;
    }
}

static int vfio_initfn(PCIDevice *pdev)
{
    VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    VFIOGroup *group;
    char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
    ssize_t len;
    struct stat st;
    int groupid;
    int ret;

    /* Check that the host device exists */
    snprintf(path, sizeof(path),
             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
             vdev->host.domain, vdev->host.bus, vdev->host.slot,
             vdev->host.function);
    if (stat(path, &st) < 0) {
        error_report("vfio: error: no such host device: %s", path);
        return -errno;
    }

    strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);

    len = readlink(path, iommu_group_path, PATH_MAX);
    if (len <= 0) {
        error_report("vfio: error: no iommu_group for device");
        return -errno;
    }

    iommu_group_path[len] = 0;
    group_name = basename(iommu_group_path);

    if (sscanf(group_name, "%d", &groupid) != 1) {
        error_report("vfio: error reading %s: %m", path);
        return -errno;
    }

    DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function, groupid);

    group = vfio_get_group(groupid);
    if (!group) {
        error_report("vfio: failed to get group %d", groupid);
        return -ENOENT;
    }

    snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x",
             vdev->host.domain, vdev->host.bus, vdev->host.slot,
             vdev->host.function);

    QLIST_FOREACH(pvdev, &group->device_list, next) {
        if (pvdev->host.domain == vdev->host.domain &&
            pvdev->host.bus == vdev->host.bus &&
            pvdev->host.slot == vdev->host.slot &&
            pvdev->host.function == vdev->host.function) {

            error_report("vfio: error: device %s is already attached", path);
            vfio_put_group(group);
            return -EBUSY;
        }
    }

    ret = vfio_get_device(group, path, vdev);
    if (ret) {
        error_report("vfio: failed to get device %s", path);
        vfio_put_group(group);
        return ret;
    }

    /* Get a copy of config space */
    ret = pread(vdev->fd, vdev->pdev.config,
                MIN(pci_config_size(&vdev->pdev), vdev->config_size),
                vdev->config_offset);
    if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) {
        ret = ret < 0 ? -errno : -EFAULT;
        error_report("vfio: Failed to read device config space");
        goto out_put;
    }

    /* vfio emulates a lot for us, but some bits need extra love */
    vdev->emulated_config_bits = g_malloc0(vdev->config_size);

    /* QEMU can choose to expose the ROM or not */
    memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);

    /* QEMU can change multi-function devices to single function, or the
     * reverse */
    vdev->emulated_config_bits[PCI_HEADER_TYPE] =
                                              PCI_HEADER_TYPE_MULTI_FUNCTION;

    /*
     * Clear host resource mapping info.  If we choose not to register a
     * BAR, such as might be the case with the option ROM, we can get
     * confusing, unwritable, residual addresses from the host here.
     */
    memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
    memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);

    vfio_load_rom(vdev);

    ret = vfio_early_setup_msix(vdev);
    if (ret) {
        goto out_put;
    }

    vfio_map_bars(vdev);

    ret = vfio_add_capabilities(vdev);
    if (ret) {
        goto out_teardown;
    }

    /* QEMU emulates all of MSI & MSIX */
    if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
        memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
               MSIX_CAP_LENGTH);
    }

    if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
        memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
               vdev->msi_cap_size);
    }

    if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
        vdev->intx.mmap_timer = qemu_new_timer_ms(vm_clock,
                                                  vfio_intx_mmap_enable, vdev);
        pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_update_irq);
        ret = vfio_enable_intx(vdev);
        if (ret) {
            goto out_teardown;
        }
    }

    return 0;

out_teardown:
    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
    vfio_teardown_msi(vdev);
    vfio_unmap_bars(vdev);
out_put:
    g_free(vdev->emulated_config_bits);
    vfio_put_device(vdev);
    vfio_put_group(group);
    return ret;
}

static void vfio_exitfn(PCIDevice *pdev)
{
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    VFIOGroup *group = vdev->group;

    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
    vfio_disable_interrupts(vdev);
    if (vdev->intx.mmap_timer) {
        qemu_free_timer(vdev->intx.mmap_timer);
    }
    vfio_teardown_msi(vdev);
    vfio_unmap_bars(vdev);
    g_free(vdev->emulated_config_bits);
    vfio_put_device(vdev);
    vfio_put_group(group);
}

static void vfio_pci_reset(DeviceState *dev)
{
    PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    uint16_t cmd;

    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);

    vfio_disable_interrupts(vdev);

    /* Make sure the device is in D0 */
    if (vdev->pm_cap) {
        uint16_t pmcsr;
        uint8_t state;

        pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
        state = pmcsr & PCI_PM_CTRL_STATE_MASK;
        if (state) {
            pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
            vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2);
            /* vfio handles the necessary delay here */
            pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
            state = pmcsr & PCI_PM_CTRL_STATE_MASK;
            if (state) {
                error_report("vfio: Unable to power on device, stuck in D%d",
                             state);
            }
        }
    }

    /*
     * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master.
     * Also put INTx Disable in a known state.
     */
    cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
    cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
             PCI_COMMAND_INTX_DISABLE);
    vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);

    if (vdev->reset_works) {
        if (ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
            error_report("vfio: Error unable to reset physical device "
                         "(%04x:%02x:%02x.%x): %m", vdev->host.domain,
                         vdev->host.bus, vdev->host.slot, vdev->host.function);
        }
    }

    vfio_enable_intx(vdev);
}
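
/*
 * PMCSR example for the D0 transition above, with a hypothetical value:
 * a device left in D3hot reads pmcsr = 0x0003 (power state in the low
 * two bits, PCI_PM_CTRL_STATE_MASK).  Clearing those bits and writing
 * the register back requests D0, and the follow-up read verifies
 * state == 0 before the device is reset.
 */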

static Property vfio_pci_dev_properties[] = {
    DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIODevice, host),
    DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIODevice,
                       intx.mmap_timeout, 1100),
    DEFINE_PROP_BIT("x-vga", VFIODevice, features,
                    VFIO_FEATURE_ENABLE_VGA_BIT, false),
    /*
     * TODO - support passed fds... is this necessary?
     * DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name),
     * DEFINE_PROP_STRING("vfiogroupfd", VFIODevice, vfiogroupfd_name),
     */
    DEFINE_PROP_END_OF_LIST(),
};

static const VMStateDescription vfio_pci_vmstate = {
    .name = "vfio-pci",
    .unmigratable = 1,
};

static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);

    dc->reset = vfio_pci_reset;
    dc->props = vfio_pci_dev_properties;
    dc->vmsd = &vfio_pci_vmstate;
    dc->desc = "VFIO-based PCI device assignment";
    pdc->init = vfio_initfn;
    pdc->exit = vfio_exitfn;
    pdc->config_read = vfio_pci_read_config;
    pdc->config_write = vfio_pci_write_config;
    pdc->is_express = 1; /* We might be */
}

static const TypeInfo vfio_pci_dev_info = {
    .name = "vfio-pci",
    .parent = TYPE_PCI_DEVICE,
    .instance_size = sizeof(VFIODevice),
    .class_init = vfio_pci_dev_class_init,
};

static void register_vfio_pci_dev_type(void)
{
    type_register_static(&vfio_pci_dev_info);
}

type_init(register_vfio_pci_dev_type)