/*
 * vfio based device assignment support
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */

#include <dirent.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <linux/vfio.h>

#include "config.h"
#include "event_notifier.h"
#include "exec-memory.h"
#include "kvm.h"
#include "memory.h"
#include "msi.h"
#include "msix.h"
#include "pci.h"
#include "qemu-common.h"
#include "qemu-error.h"
#include "qemu-queue.h"
#include "range.h"

/* #define DEBUG_VFIO */
#ifdef DEBUG_VFIO
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

typedef struct VFIOBAR {
    off_t fd_offset; /* offset of BAR within device fd */
    int fd; /* device fd, allows us to pass VFIOBAR as opaque data */
    MemoryRegion mem; /* slow, read/write access */
    MemoryRegion mmap_mem; /* direct mapped access */
    void *mmap;
    size_t size;
    uint32_t flags; /* VFIO region flags (rd/wr/mmap) */
    uint8_t nr; /* cache the BAR number for debug */
} VFIOBAR;

typedef struct VFIOINTx {
    bool pending; /* interrupt pending */
    bool kvm_accel; /* set when QEMU bypass through KVM enabled */
    uint8_t pin; /* which pin to pull for qemu_set_irq */
    EventNotifier interrupt; /* eventfd triggered on interrupt */
    EventNotifier unmask; /* eventfd for unmask on QEMU bypass */
    PCIINTxRoute route; /* routing info for QEMU bypass */
    uint32_t mmap_timeout; /* delay to re-enable mmaps after interrupt */
    QEMUTimer *mmap_timer; /* enable mmaps after periods w/o interrupts */
} VFIOINTx;

struct VFIODevice;

typedef struct VFIOMSIVector {
    EventNotifier interrupt; /* eventfd triggered on interrupt */
    struct VFIODevice *vdev; /* back pointer to device */
    int virq; /* KVM irqchip route for QEMU bypass */
    bool use;
} VFIOMSIVector;

enum {
    VFIO_INT_NONE = 0,
    VFIO_INT_INTx = 1,
    VFIO_INT_MSI = 2,
    VFIO_INT_MSIX = 3,
};

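/*
 * A device is in exactly one of the interrupt states above at any time.
 * The vfio_enable_intx/msi/msix paths below each start by calling
 * vfio_disable_interrupts(), so every transition passes through
 * VFIO_INT_NONE.
 */
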
struct VFIOGroup;

typedef struct VFIOContainer {
    int fd; /* /dev/vfio/vfio, empowered by the attached groups */
    struct {
        /* enable abstraction to support various iommu backends */
        union {
            MemoryListener listener; /* Used by type1 iommu */
        };
        void (*release)(struct VFIOContainer *);
    } iommu_data;
    QLIST_HEAD(, VFIOGroup) group_list;
    QLIST_ENTRY(VFIOContainer) next;
} VFIOContainer;

/* Cache of MSI-X setup plus extra mmap and memory region for split BAR map */
typedef struct VFIOMSIXInfo {
    uint8_t table_bar;
    uint8_t pba_bar;
    uint16_t entries;
    uint32_t table_offset;
    uint32_t pba_offset;
    MemoryRegion mmap_mem;
    void *mmap;
} VFIOMSIXInfo;

typedef struct VFIODevice {
    PCIDevice pdev;
    int fd;
    VFIOINTx intx;
    unsigned int config_size;
    off_t config_offset; /* Offset of config space region within device fd */
    unsigned int rom_size;
    off_t rom_offset; /* Offset of ROM region within device fd */
    int msi_cap_size;
    VFIOMSIVector *msi_vectors;
    VFIOMSIXInfo *msix;
    int nr_vectors; /* Number of MSI/MSIX vectors currently in use */
    int interrupt; /* Current interrupt type */
    VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */
    PCIHostDeviceAddress host;
    QLIST_ENTRY(VFIODevice) next;
    struct VFIOGroup *group;
    bool reset_works;
} VFIODevice;

typedef struct VFIOGroup {
    int fd;
    int groupid;
    VFIOContainer *container;
    QLIST_HEAD(, VFIODevice) device_list;
    QLIST_ENTRY(VFIOGroup) next;
    QLIST_ENTRY(VFIOGroup) container_next;
} VFIOGroup;

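/*
 * Object model: a VFIOGroup wraps one IOMMU group (/dev/vfio/$GROUP) and
 * holds the devices that belong to it; one or more groups attach to a
 * VFIOContainer (/dev/vfio/vfio), which supplies the DMA mapping context
 * they share.  The lists above effectively reference-count both objects:
 * a group is released when its device list empties, a container when its
 * group list empties.
 */
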
#define MSIX_CAP_LENGTH 12

static QLIST_HEAD(, VFIOContainer)
    container_list = QLIST_HEAD_INITIALIZER(container_list);

static QLIST_HEAD(, VFIOGroup)
    group_list = QLIST_HEAD_INITIALIZER(group_list);

static void vfio_disable_interrupts(VFIODevice *vdev);
static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled);

/*
 * Common VFIO interrupt disable
 */
static void vfio_disable_irqindex(VFIODevice *vdev, int index)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
        .index = index,
        .start = 0,
        .count = 0,
    };

    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}

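/*
 * Note the count of 0 above: with VFIO_IRQ_SET_DATA_NONE, a zero count
 * asks the kernel to tear down every trigger for the index rather than
 * signal one (see the VFIO_DEVICE_SET_IRQS description in linux/vfio.h).
 * The enable paths below instead append an eventfd to the header, along
 * the lines of:
 *
 *     irq_set = g_malloc0(sizeof(*irq_set) + sizeof(int32_t));
 *     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
 *                      VFIO_IRQ_SET_ACTION_TRIGGER;
 *     *(int32_t *)&irq_set->data = event_notifier_get_fd(...);
 */
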
/*
 * INTx
 */
static void vfio_unmask_intx(VFIODevice *vdev)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
        .index = VFIO_PCI_INTX_IRQ_INDEX,
        .start = 0,
        .count = 1,
    };

    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}

#ifdef CONFIG_KVM /* Unused outside of CONFIG_KVM code */
static void vfio_mask_intx(VFIODevice *vdev)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
        .index = VFIO_PCI_INTX_IRQ_INDEX,
        .start = 0,
        .count = 1,
    };

    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}
#endif

/*
 * Disabling BAR mmapping can be slow, but toggling it around INTx can
 * also be a huge overhead.  We try to get the best of both worlds by
 * waiting until an interrupt to disable mmaps (subsequent transitions
 * to the same state are effectively no overhead).  If the interrupt has
 * been serviced and the time gap is long enough, we re-enable mmaps for
 * performance.  This works well for things like graphics cards, which
 * may not use their interrupt at all and are penalized to an unusable
 * level by read/write BAR traps.  Other devices, like NICs, have more
 * regular interrupts and see much better latency by staying in non-mmap
 * mode.  We therefore set the default mmap_timeout such that a ping
 * is just enough to keep the mmap disabled.  Users can experiment with
 * other options with the x-intx-mmap-timeout-ms parameter (a value of
 * zero disables the timer).
 */
static void vfio_intx_mmap_enable(void *opaque)
{
    VFIODevice *vdev = opaque;

    if (vdev->intx.pending) {
        qemu_mod_timer(vdev->intx.mmap_timer,
                       qemu_get_clock_ms(vm_clock) + vdev->intx.mmap_timeout);
        return;
    }

    vfio_mmap_set_enabled(vdev, true);
}

static void vfio_intx_interrupt(void *opaque)
{
    VFIODevice *vdev = opaque;

    if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
        return;
    }

    DPRINTF("%s(%04x:%02x:%02x.%x) Pin %c\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function,
            'A' + vdev->intx.pin);

    vdev->intx.pending = true;
    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 1);
    vfio_mmap_set_enabled(vdev, false);
    if (vdev->intx.mmap_timeout) {
        qemu_mod_timer(vdev->intx.mmap_timer,
                       qemu_get_clock_ms(vm_clock) + vdev->intx.mmap_timeout);
    }
}

static void vfio_eoi(VFIODevice *vdev)
{
    if (!vdev->intx.pending) {
        return;
    }

    DPRINTF("%s(%04x:%02x:%02x.%x) EOI\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);

    vdev->intx.pending = false;
    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
    vfio_unmask_intx(vdev);
}

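/*
 * Putting the functions above together, the unaccelerated INTx path is:
 * the host interrupt fires, VFIO masks the line and signals
 * intx.interrupt, and vfio_intx_interrupt() asserts the guest IRQ.  The
 * guest then services the device through trapped (non-mmap) BAR
 * accesses, each of which calls vfio_eoi() to de-assert the guest IRQ
 * and unmask the host line, re-arming it for the next interrupt.
 */
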
static void vfio_enable_intx_kvm(VFIODevice *vdev)
{
#ifdef CONFIG_KVM
    struct kvm_irqfd irqfd = {
        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
        .gsi = vdev->intx.route.irq,
        .flags = KVM_IRQFD_FLAG_RESAMPLE,
    };
    struct vfio_irq_set *irq_set;
    int ret, argsz;
    int32_t *pfd;

    if (!kvm_irqchip_in_kernel() ||
        vdev->intx.route.mode != PCI_INTX_ENABLED ||
        !kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
        return;
    }

    /* Get to a known interrupt state */
    qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev);
    vfio_mask_intx(vdev);
    vdev->intx.pending = false;
    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);

    /* Get an eventfd for resample/unmask */
    if (event_notifier_init(&vdev->intx.unmask, 0)) {
        error_report("vfio: Error: event_notifier_init failed eoi\n");
        goto fail;
    }

    /* KVM triggers it, VFIO listens for it */
    irqfd.resamplefd = event_notifier_get_fd(&vdev->intx.unmask);

    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
        error_report("vfio: Error: Failed to setup resample irqfd: %m\n");
        goto fail_irqfd;
    }

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
    irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;

    *pfd = irqfd.resamplefd;

    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);
    if (ret) {
        error_report("vfio: Error: Failed to setup INTx unmask fd: %m\n");
        goto fail_vfio;
    }

    /* Let'em rip */
    vfio_unmask_intx(vdev);

    vdev->intx.kvm_accel = true;

    DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel enabled\n",
            __func__, vdev->host.domain, vdev->host.bus,
            vdev->host.slot, vdev->host.function);

    return;

fail_vfio:
    irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN;
    kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd);
fail_irqfd:
    event_notifier_cleanup(&vdev->intx.unmask);
fail:
    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
    vfio_unmask_intx(vdev);
#endif
}

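/*
 * With the irqfd pair above in place, the whole cycle stays in the
 * kernel: VFIO signals intx.interrupt, which KVM consumes via the irqfd
 * to inject the guest IRQ, and on guest EOI KVM signals intx.unmask,
 * which VFIO consumes to unmask the device.  QEMU is only involved again
 * when the route changes (vfio_update_irq) or INTx is torn down.
 */
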
static void vfio_disable_intx_kvm(VFIODevice *vdev)
{
#ifdef CONFIG_KVM
    struct kvm_irqfd irqfd = {
        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
        .gsi = vdev->intx.route.irq,
        .flags = KVM_IRQFD_FLAG_DEASSIGN,
    };

    if (!vdev->intx.kvm_accel) {
        return;
    }

    /*
     * Get to a known state, hardware masked, QEMU ready to accept new
     * interrupts, QEMU IRQ de-asserted.
     */
    vfio_mask_intx(vdev);
    vdev->intx.pending = false;
    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);

    /* Tell KVM to stop listening for an INTx irqfd */
    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
        error_report("vfio: Error: Failed to disable INTx irqfd: %m\n");
    }

    /* We only need to close the eventfd for VFIO to cleanup the kernel side */
    event_notifier_cleanup(&vdev->intx.unmask);

    /* QEMU starts listening for interrupt events. */
    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);

    vdev->intx.kvm_accel = false;

    /* If we've missed an event, let it re-fire through QEMU */
    vfio_unmask_intx(vdev);

    DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel disabled\n",
            __func__, vdev->host.domain, vdev->host.bus,
            vdev->host.slot, vdev->host.function);
#endif
}

static void vfio_update_irq(PCIDevice *pdev)
{
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    PCIINTxRoute route;

    if (vdev->interrupt != VFIO_INT_INTx) {
        return;
    }

    route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);

    if (!pci_intx_route_changed(&vdev->intx.route, &route)) {
        return; /* Nothing changed */
    }

    DPRINTF("%s(%04x:%02x:%02x.%x) IRQ moved %d -> %d\n", __func__,
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, vdev->intx.route.irq, route.irq);

    vfio_disable_intx_kvm(vdev);

    vdev->intx.route = route;

    if (route.mode != PCI_INTX_ENABLED) {
        return;
    }

    vfio_enable_intx_kvm(vdev);

    /* Re-enable the interrupt in case we missed an EOI */
    vfio_eoi(vdev);
}

static int vfio_enable_intx(VFIODevice *vdev)
{
    uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
    int ret, argsz;
    struct vfio_irq_set *irq_set;
    int32_t *pfd;

    if (!pin) {
        return 0;
    }

    vfio_disable_interrupts(vdev);

    vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */

#ifdef CONFIG_KVM
    /*
     * Only conditional to avoid generating error messages on platforms
     * where we won't actually use the result anyway.
     */
    if (kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
        vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
                                                        vdev->intx.pin);
    }
#endif

    ret = event_notifier_init(&vdev->intx.interrupt, 0);
    if (ret) {
        error_report("vfio: Error: event_notifier_init failed\n");
        return ret;
    }

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;

    *pfd = event_notifier_get_fd(&vdev->intx.interrupt);
    qemu_set_fd_handler(*pfd, vfio_intx_interrupt, NULL, vdev);

    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
    if (ret) {
        error_report("vfio: Error: Failed to setup INTx fd: %m\n");
        qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
        event_notifier_cleanup(&vdev->intx.interrupt);
        g_free(irq_set); /* freed only after the error path reads *pfd */
        return -errno;
    }
    g_free(irq_set);

    vfio_enable_intx_kvm(vdev);

    vdev->interrupt = VFIO_INT_INTx;

    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);

    return 0;
}

static void vfio_disable_intx(VFIODevice *vdev)
{
    int fd;

    qemu_del_timer(vdev->intx.mmap_timer);
    vfio_disable_intx_kvm(vdev);
    vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
    vdev->intx.pending = false;
    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
    vfio_mmap_set_enabled(vdev, true);

    fd = event_notifier_get_fd(&vdev->intx.interrupt);
    qemu_set_fd_handler(fd, NULL, NULL, vdev);
    event_notifier_cleanup(&vdev->intx.interrupt);

    vdev->interrupt = VFIO_INT_NONE;

    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);
}

/*
 * MSI/X
 */
static void vfio_msi_interrupt(void *opaque)
{
    VFIOMSIVector *vector = opaque;
    VFIODevice *vdev = vector->vdev;
    int nr = vector - vdev->msi_vectors;

    if (!event_notifier_test_and_clear(&vector->interrupt)) {
        return;
    }

    DPRINTF("%s(%04x:%02x:%02x.%x) vector %d\n", __func__,
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, nr);

    if (vdev->interrupt == VFIO_INT_MSIX) {
        msix_notify(&vdev->pdev, nr);
    } else if (vdev->interrupt == VFIO_INT_MSI) {
        msi_notify(&vdev->pdev, nr);
    } else {
        error_report("vfio: MSI interrupt received, but not enabled?\n");
    }
}

static int vfio_enable_vectors(VFIODevice *vdev, bool msix)
{
    struct vfio_irq_set *irq_set;
    int ret = 0, i, argsz;
    int32_t *fds;

    argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = vdev->nr_vectors;
    fds = (int32_t *)&irq_set->data;

    for (i = 0; i < vdev->nr_vectors; i++) {
        if (!vdev->msi_vectors[i].use) {
            fds[i] = -1;
            continue;
        }

        fds[i] = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
    }

    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);

    g_free(irq_set);

    return ret;
}

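/*
 * Layout of the variable-length buffer built above (argsz exists so the
 * kernel can validate it), for nr_vectors vectors:
 *
 *     +----------------------------+
 *     | struct vfio_irq_set header |  argsz, flags, index, start, count
 *     +----------------------------+
 *     | int32_t fds[nr_vectors]    |  one eventfd per vector, -1 = unused
 *     +----------------------------+
 */
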
static int vfio_msix_vector_use(PCIDevice *pdev,
                                unsigned int nr, MSIMessage msg)
{
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    VFIOMSIVector *vector;
    int ret;

    DPRINTF("%s(%04x:%02x:%02x.%x) vector %d used\n", __func__,
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, nr);

    vector = &vdev->msi_vectors[nr];
    vector->vdev = vdev;
    vector->use = true;

    msix_vector_use(pdev, nr);

    if (event_notifier_init(&vector->interrupt, 0)) {
        error_report("vfio: Error: event_notifier_init failed\n");
    }

    /*
     * Attempt to enable route through KVM irqchip,
     * default to userspace handling if unavailable.
     */
    vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg);
    if (vector->virq < 0 ||
        kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
                                       vector->virq) < 0) {
        if (vector->virq >= 0) {
            kvm_irqchip_release_virq(kvm_state, vector->virq);
            vector->virq = -1;
        }
        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                            vfio_msi_interrupt, NULL, vector);
    }

    /*
     * We don't want to have the host allocate all possible MSI vectors
     * for a device if they're not in use, so we shutdown and incrementally
     * increase them as needed.
     */
    if (vdev->nr_vectors < nr + 1) {
        vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX);
        vdev->nr_vectors = nr + 1;
        ret = vfio_enable_vectors(vdev, true);
        if (ret) {
            error_report("vfio: failed to enable vectors, %d\n", ret);
        }
    } else {
        int argsz;
        struct vfio_irq_set *irq_set;
        int32_t *pfd;

        argsz = sizeof(*irq_set) + sizeof(*pfd);

        irq_set = g_malloc0(argsz);
        irq_set->argsz = argsz;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                         VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = nr;
        irq_set->count = 1;
        pfd = (int32_t *)&irq_set->data;

        *pfd = event_notifier_get_fd(&vector->interrupt);

        ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
        g_free(irq_set);
        if (ret) {
            error_report("vfio: failed to modify vector, %d\n", ret);
        }
    }

    return 0;
}

static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
{
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    VFIOMSIVector *vector = &vdev->msi_vectors[nr];
    int argsz;
    struct vfio_irq_set *irq_set;
    int32_t *pfd;

    DPRINTF("%s(%04x:%02x:%02x.%x) vector %d released\n", __func__,
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, nr);

    /*
     * XXX What's the right thing to do here?  This turns off the interrupt
     * completely, but do we really just want to switch the interrupt to
     * bouncing through userspace and let msix.c drop it?  Not sure.
     */
    msix_vector_unuse(pdev, nr);

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                     VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
    irq_set->start = nr;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;

    *pfd = -1;

    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);

    g_free(irq_set);

    if (vector->virq < 0) {
        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                            NULL, NULL, NULL);
    } else {
        kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->interrupt,
                                          vector->virq);
        kvm_irqchip_release_virq(kvm_state, vector->virq);
        vector->virq = -1;
    }

    event_notifier_cleanup(&vector->interrupt);
    vector->use = false;
}

/* TODO This should move to msi.c */
static MSIMessage msi_get_msg(PCIDevice *pdev, unsigned int vector)
{
    uint16_t flags = pci_get_word(pdev->config + pdev->msi_cap + PCI_MSI_FLAGS);
    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
    MSIMessage msg;

    if (msi64bit) {
        msg.address = pci_get_quad(pdev->config +
                                   pdev->msi_cap + PCI_MSI_ADDRESS_LO);
    } else {
        msg.address = pci_get_long(pdev->config +
                                   pdev->msi_cap + PCI_MSI_ADDRESS_LO);
    }

    msg.data = pci_get_word(pdev->config + pdev->msi_cap +
                            (msi64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32));
    msg.data += vector;

    return msg;
}

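/*
 * The data += vector above relies on multi-message MSI semantics: the
 * device ORs the vector number into the low bits of the data value, and
 * alignment rules keep those bits zero in the base value, so base plus
 * vector is the message the device would actually send.
 */
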
static void vfio_enable_msix(VFIODevice *vdev)
{
    vfio_disable_interrupts(vdev);

    vdev->msi_vectors = g_malloc0(vdev->msix->entries * sizeof(VFIOMSIVector));

    vdev->interrupt = VFIO_INT_MSIX;

    if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
                                  vfio_msix_vector_release)) {
        error_report("vfio: msix_set_vector_notifiers failed\n");
    }

    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);
}

static void vfio_enable_msi(VFIODevice *vdev)
{
    int ret, i;

    vfio_disable_interrupts(vdev);

    vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
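    /*
     * If the kernel grants fewer vectors than requested,
     * vfio_enable_vectors() returns the supported count and we loop back
     * here to retry with that number after releasing everything.
     */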
retry:
    vdev->msi_vectors = g_malloc0(vdev->nr_vectors * sizeof(VFIOMSIVector));

    for (i = 0; i < vdev->nr_vectors; i++) {
        MSIMessage msg;
        VFIOMSIVector *vector = &vdev->msi_vectors[i];

        vector->vdev = vdev;
        vector->use = true;

        if (event_notifier_init(&vector->interrupt, 0)) {
            error_report("vfio: Error: event_notifier_init failed\n");
        }

        msg = msi_get_msg(&vdev->pdev, i);

        /*
         * Attempt to enable route through KVM irqchip,
         * default to userspace handling if unavailable.
         */
        vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg);
        if (vector->virq < 0 ||
            kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
                                           vector->virq) < 0) {
            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                                vfio_msi_interrupt, NULL, vector);
        }
    }

    ret = vfio_enable_vectors(vdev, false);
    if (ret) {
        if (ret < 0) {
            error_report("vfio: Error: Failed to setup MSI fds: %m\n");
        } else if (ret != vdev->nr_vectors) {
            error_report("vfio: Error: Failed to enable %d "
                         "MSI vectors, retry with %d\n", vdev->nr_vectors, ret);
        }

        for (i = 0; i < vdev->nr_vectors; i++) {
            VFIOMSIVector *vector = &vdev->msi_vectors[i];
            if (vector->virq >= 0) {
                kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->interrupt,
                                                  vector->virq);
                kvm_irqchip_release_virq(kvm_state, vector->virq);
                vector->virq = -1;
            } else {
                qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                                    NULL, NULL, NULL);
            }
            event_notifier_cleanup(&vector->interrupt);
        }

        g_free(vdev->msi_vectors);

        if (ret > 0 && ret != vdev->nr_vectors) {
            vdev->nr_vectors = ret;
            goto retry;
        }
        vdev->nr_vectors = 0;

        return;
    }

    vdev->interrupt = VFIO_INT_MSI;

    DPRINTF("%s(%04x:%02x:%02x.%x) Enabled %d MSI vectors\n", __func__,
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, vdev->nr_vectors);
}

static void vfio_disable_msi_common(VFIODevice *vdev)
{
    g_free(vdev->msi_vectors);
    vdev->msi_vectors = NULL;
    vdev->nr_vectors = 0;
    vdev->interrupt = VFIO_INT_NONE;

    vfio_enable_intx(vdev);
}

static void vfio_disable_msix(VFIODevice *vdev)
{
    msix_unset_vector_notifiers(&vdev->pdev);

    if (vdev->nr_vectors) {
        vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX);
    }

    vfio_disable_msi_common(vdev);

    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);
}

static void vfio_disable_msi(VFIODevice *vdev)
{
    int i;

    vfio_disable_irqindex(vdev, VFIO_PCI_MSI_IRQ_INDEX);

    for (i = 0; i < vdev->nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];

        if (!vector->use) {
            continue;
        }

        if (vector->virq >= 0) {
            kvm_irqchip_remove_irqfd_notifier(kvm_state,
                                              &vector->interrupt, vector->virq);
            kvm_irqchip_release_virq(kvm_state, vector->virq);
            vector->virq = -1;
        } else {
            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                                NULL, NULL, NULL);
        }

        event_notifier_cleanup(&vector->interrupt);
    }

    vfio_disable_msi_common(vdev);

    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);
}

/*
 * IO Port/MMIO - Beware of the endians, VFIO is always little endian
 */
static void vfio_bar_write(void *opaque, hwaddr addr,
                           uint64_t data, unsigned size)
{
    VFIOBAR *bar = opaque;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;

    switch (size) {
    case 1:
        buf.byte = data;
        break;
    case 2:
        buf.word = cpu_to_le16(data);
        break;
    case 4:
        buf.dword = cpu_to_le32(data);
        break;
    default:
        hw_error("vfio: unsupported write size, %d bytes\n", size);
        break;
    }

    if (pwrite(bar->fd, &buf, size, bar->fd_offset + addr) != size) {
        error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m\n",
                     __func__, addr, data, size);
    }

    DPRINTF("%s(BAR%d+0x%"HWADDR_PRIx", 0x%"PRIx64", %d)\n",
            __func__, bar->nr, addr, data, size);

    /*
     * A read or write to a BAR always signals an INTx EOI.  This will
     * do nothing if not pending (including not in INTx mode).  We assume
     * that a BAR access is in response to an interrupt and that BAR
     * accesses will service the interrupt.  Unfortunately, we don't know
     * which access will service the interrupt, so we're potentially
     * getting quite a few host interrupts per guest interrupt.
     */
    vfio_eoi(container_of(bar, VFIODevice, bars[bar->nr]));
}

static uint64_t vfio_bar_read(void *opaque,
                              hwaddr addr, unsigned size)
{
    VFIOBAR *bar = opaque;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;
    uint64_t data = 0;

    if (pread(bar->fd, &buf, size, bar->fd_offset + addr) != size) {
        error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m\n",
                     __func__, addr, size);
        return (uint64_t)-1;
    }

    switch (size) {
    case 1:
        data = buf.byte;
        break;
    case 2:
        data = le16_to_cpu(buf.word);
        break;
    case 4:
        data = le32_to_cpu(buf.dword);
        break;
    default:
        hw_error("vfio: unsupported read size, %d bytes\n", size);
        break;
    }

    DPRINTF("%s(BAR%d+0x%"HWADDR_PRIx", %d) = 0x%"PRIx64"\n",
            __func__, bar->nr, addr, size, data);

    /* Same as write above */
    vfio_eoi(container_of(bar, VFIODevice, bars[bar->nr]));

    return data;
}

static const MemoryRegionOps vfio_bar_ops = {
    .read = vfio_bar_read,
    .write = vfio_bar_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

/*
 * PCI config space
 */
static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
{
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    uint32_t val = 0;

    /*
     * We only need QEMU PCI config support for the ROM BAR, the MSI and MSIX
     * capabilities, and the multifunction bit below.  We let VFIO handle
     * virtualizing everything else.  Performance is not a concern here.
     */
    if (ranges_overlap(addr, len, PCI_ROM_ADDRESS, 4) ||
        (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
         ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) ||
        (pdev->cap_present & QEMU_PCI_CAP_MSI &&
         ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size))) {

        val = pci_default_read_config(pdev, addr, len);
    } else {
        if (pread(vdev->fd, &val, len, vdev->config_offset + addr) != len) {
            error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %m\n",
                         __func__, vdev->host.domain, vdev->host.bus,
                         vdev->host.slot, vdev->host.function, addr, len);
            return -errno;
        }
        val = le32_to_cpu(val);
    }

    /* Multifunction bit is virtualized in QEMU */
    if (unlikely(ranges_overlap(addr, len, PCI_HEADER_TYPE, 1))) {
        uint32_t mask = PCI_HEADER_TYPE_MULTI_FUNCTION;

        if (len == 4) {
            mask <<= 16;
        }

        if (pdev->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
            val |= mask;
        } else {
            val &= ~mask;
        }
    }

    DPRINTF("%s(%04x:%02x:%02x.%x, @0x%x, len=0x%x) %x\n", __func__,
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, addr, len, val);

    return val;
}

static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
                                  uint32_t val, int len)
{
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    uint32_t val_le = cpu_to_le32(val);

    DPRINTF("%s(%04x:%02x:%02x.%x, @0x%x, 0x%x, len=0x%x)\n", __func__,
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, addr, val, len);

    /* Write everything to VFIO, let it filter out what we can't write */
    if (pwrite(vdev->fd, &val_le, len, vdev->config_offset + addr) != len) {
        error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %m\n",
                     __func__, vdev->host.domain, vdev->host.bus,
                     vdev->host.slot, vdev->host.function, addr, val, len);
    }

    /* Write standard header bits to emulation */
    if (addr < PCI_CONFIG_HEADER_SIZE) {
        pci_default_write_config(pdev, addr, val, len);
        return;
    }

    /* MSI/MSI-X Enabling/Disabling */
    if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
        ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
        int is_enabled, was_enabled = msi_enabled(pdev);

        pci_default_write_config(pdev, addr, val, len);

        is_enabled = msi_enabled(pdev);

        if (!was_enabled && is_enabled) {
            vfio_enable_msi(vdev);
        } else if (was_enabled && !is_enabled) {
            vfio_disable_msi(vdev);
        }
    }

    if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
        ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
        int is_enabled, was_enabled = msix_enabled(pdev);

        pci_default_write_config(pdev, addr, val, len);

        is_enabled = msix_enabled(pdev);

        if (!was_enabled && is_enabled) {
            vfio_enable_msix(vdev);
        } else if (was_enabled && !is_enabled) {
            vfio_disable_msix(vdev);
        }
    }
}

/*
 * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
 */
static int vfio_dma_unmap(VFIOContainer *container,
                          hwaddr iova, ram_addr_t size)
{
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova = iova,
        .size = size,
    };

    if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        DPRINTF("VFIO_UNMAP_DMA: %d\n", -errno);
        return -errno;
    }

    return 0;
}

static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
                        ram_addr_t size, void *vaddr, bool readonly)
{
    struct vfio_iommu_type1_dma_map map = {
        .argsz = sizeof(map),
        .flags = VFIO_DMA_MAP_FLAG_READ,
        .vaddr = (__u64)(uintptr_t)vaddr,
        .iova = iova,
        .size = size,
    };

    if (!readonly) {
        map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
    }

    /*
     * Try the mapping, if it fails with EBUSY, unmap the region and try
     * again.  This shouldn't be necessary, but we sometimes see it in
     * the VGA ROM space.
     */
    if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
        (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 &&
         ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
        return 0;
    }

    DPRINTF("VFIO_MAP_DMA: %d\n", -errno);
    return -errno;
}

static bool vfio_listener_skipped_section(MemoryRegionSection *section)
{
    return !memory_region_is_ram(section->mr);
}

static void vfio_listener_region_add(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer,
                                            iommu_data.listener);
    hwaddr iova, end;
    void *vaddr;
    int ret;

    if (vfio_listener_skipped_section(section)) {
        DPRINTF("vfio: SKIPPING region_add %"HWADDR_PRIx" - %"PRIx64"\n",
                section->offset_within_address_space,
                section->offset_within_address_space + section->size - 1);
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region\n", __func__);
        return;
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    end = (section->offset_within_address_space + section->size) &
          TARGET_PAGE_MASK;

    if (iova >= end) {
        return;
    }

    vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);

    DPRINTF("vfio: region_add %"HWADDR_PRIx" - %"HWADDR_PRIx" [%p]\n",
            iova, end - 1, vaddr);

    ret = vfio_dma_map(container, iova, end - iova, vaddr, section->readonly);
    if (ret) {
        error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
                     "0x%"HWADDR_PRIx", %p) = %d (%m)\n",
                     container, iova, end - iova, vaddr, ret);
    }
}

static void vfio_listener_region_del(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer,
                                            iommu_data.listener);
    hwaddr iova, end;
    int ret;

    if (vfio_listener_skipped_section(section)) {
        DPRINTF("vfio: SKIPPING region_del %"HWADDR_PRIx" - %"PRIx64"\n",
                section->offset_within_address_space,
                section->offset_within_address_space + section->size - 1);
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region\n", __func__);
        return;
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    end = (section->offset_within_address_space + section->size) &
          TARGET_PAGE_MASK;

    if (iova >= end) {
        return;
    }

    DPRINTF("vfio: region_del %"HWADDR_PRIx" - %"HWADDR_PRIx"\n",
            iova, end - 1);

    ret = vfio_dma_unmap(container, iova, end - iova);
    if (ret) {
        error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                     "0x%"HWADDR_PRIx") = %d (%m)\n",
                     container, iova, end - iova, ret);
    }
}

static MemoryListener vfio_memory_listener = {
    .region_add = vfio_listener_region_add,
    .region_del = vfio_listener_region_del,
};

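/*
 * Together these callbacks build an identity map from guest physical
 * addresses to IOVAs: every RAM section that appears in the guest's
 * address space is mapped through the IOMMU at its guest physical
 * address, so the device can DMA using guest addresses directly.
 */
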
static void vfio_listener_release(VFIOContainer *container)
{
    memory_listener_unregister(&container->iommu_data.listener);
}

/*
 * Interrupt setup
 */
static void vfio_disable_interrupts(VFIODevice *vdev)
{
    switch (vdev->interrupt) {
    case VFIO_INT_INTx:
        vfio_disable_intx(vdev);
        break;
    case VFIO_INT_MSI:
        vfio_disable_msi(vdev);
        break;
    case VFIO_INT_MSIX:
        vfio_disable_msix(vdev);
        break;
    }
}

static int vfio_setup_msi(VFIODevice *vdev, int pos)
{
    uint16_t ctrl;
    bool msi_64bit, msi_maskbit;
    int ret, entries;

    if (pread(vdev->fd, &ctrl, sizeof(ctrl),
              vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
        return -errno;
    }
    ctrl = le16_to_cpu(ctrl);

    msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
    msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
    entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);

    DPRINTF("%04x:%02x:%02x.%x PCI MSI CAP @0x%x\n", vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function, pos);

    ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit);
    if (ret < 0) {
        if (ret == -ENOTSUP) {
            return 0;
        }
        error_report("vfio: msi_init failed\n");
        return ret;
    }
    vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);

    return 0;
}

/*
 * We don't have any control over how pci_add_capability() inserts
 * capabilities into the chain.  In order to setup MSI-X we need a
 * MemoryRegion for the BAR.  In order to setup the BAR and not
 * attempt to mmap the MSI-X table area, which VFIO won't allow, we
 * need to first look for where the MSI-X table lives.  So we
 * unfortunately split MSI-X setup across two functions.
 */
static int vfio_early_setup_msix(VFIODevice *vdev)
{
    uint8_t pos;
    uint16_t ctrl;
    uint32_t table, pba;

    pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
    if (!pos) {
        return 0;
    }

    if (pread(vdev->fd, &ctrl, sizeof(ctrl),
              vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
        return -errno;
    }

    if (pread(vdev->fd, &table, sizeof(table),
              vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
        return -errno;
    }

    if (pread(vdev->fd, &pba, sizeof(pba),
              vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
        return -errno;
    }

    ctrl = le16_to_cpu(ctrl);
    table = le32_to_cpu(table);
    pba = le32_to_cpu(pba);

    vdev->msix = g_malloc0(sizeof(*(vdev->msix)));
    vdev->msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
    vdev->msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
    vdev->msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
    vdev->msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
    vdev->msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;

    DPRINTF("%04x:%02x:%02x.%x "
            "PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d\n",
            vdev->host.domain, vdev->host.bus, vdev->host.slot,
            vdev->host.function, pos, vdev->msix->table_bar,
            vdev->msix->table_offset, vdev->msix->entries);

    return 0;
}

static int vfio_setup_msix(VFIODevice *vdev, int pos)
{
    int ret;

    ret = msix_init(&vdev->pdev, vdev->msix->entries,
                    &vdev->bars[vdev->msix->table_bar].mem,
                    vdev->msix->table_bar, vdev->msix->table_offset,
                    &vdev->bars[vdev->msix->pba_bar].mem,
                    vdev->msix->pba_bar, vdev->msix->pba_offset, pos);
    if (ret < 0) {
        if (ret == -ENOTSUP) {
            return 0;
        }
        error_report("vfio: msix_init failed\n");
        return ret;
    }

    return 0;
}

static void vfio_teardown_msi(VFIODevice *vdev)
{
    msi_uninit(&vdev->pdev);

    if (vdev->msix) {
        msix_uninit(&vdev->pdev, &vdev->bars[vdev->msix->table_bar].mem,
                    &vdev->bars[vdev->msix->pba_bar].mem);
    }
}

/*
 * Resource setup
 */
static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled)
{
    int i;

    for (i = 0; i < PCI_ROM_SLOT; i++) {
        VFIOBAR *bar = &vdev->bars[i];

        if (!bar->size) {
            continue;
        }

        memory_region_set_enabled(&bar->mmap_mem, enabled);
        if (vdev->msix && vdev->msix->table_bar == i) {
            memory_region_set_enabled(&vdev->msix->mmap_mem, enabled);
        }
    }
}

static void vfio_unmap_bar(VFIODevice *vdev, int nr)
{
    VFIOBAR *bar = &vdev->bars[nr];

    if (!bar->size) {
        return;
    }

    memory_region_del_subregion(&bar->mem, &bar->mmap_mem);
    munmap(bar->mmap, memory_region_size(&bar->mmap_mem));

    if (vdev->msix && vdev->msix->table_bar == nr) {
        memory_region_del_subregion(&bar->mem, &vdev->msix->mmap_mem);
        munmap(vdev->msix->mmap, memory_region_size(&vdev->msix->mmap_mem));
    }

    memory_region_destroy(&bar->mem);
}

static int vfio_mmap_bar(VFIOBAR *bar, MemoryRegion *mem, MemoryRegion *submem,
                         void **map, size_t size, off_t offset,
                         const char *name)
{
    int ret = 0;

    if (size && bar->flags & VFIO_REGION_INFO_FLAG_MMAP) {
        int prot = 0;

        if (bar->flags & VFIO_REGION_INFO_FLAG_READ) {
            prot |= PROT_READ;
        }

        if (bar->flags & VFIO_REGION_INFO_FLAG_WRITE) {
            prot |= PROT_WRITE;
        }

        *map = mmap(NULL, size, prot, MAP_SHARED,
                    bar->fd, bar->fd_offset + offset);
        if (*map == MAP_FAILED) {
            *map = NULL;
            ret = -errno;
            goto empty_region;
        }

        memory_region_init_ram_ptr(submem, name, size, *map);
    } else {
empty_region:
        /* Create a zero sized sub-region to make cleanup easy. */
        memory_region_init(submem, name, 0);
    }

    memory_region_add_subregion(mem, offset, submem);

    return ret;
}

static void vfio_map_bar(VFIODevice *vdev, int nr)
{
    VFIOBAR *bar = &vdev->bars[nr];
    unsigned size = bar->size;
    char name[64];
    uint32_t pci_bar;
    uint8_t type;
    int ret;

    /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
    if (!size) {
        return;
    }

    snprintf(name, sizeof(name), "VFIO %04x:%02x:%02x.%x BAR %d",
             vdev->host.domain, vdev->host.bus, vdev->host.slot,
             vdev->host.function, nr);

    /* Determine what type of BAR this is for registration */
    ret = pread(vdev->fd, &pci_bar, sizeof(pci_bar),
                vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
    if (ret != sizeof(pci_bar)) {
        error_report("vfio: Failed to read BAR %d (%m)\n", nr);
        return;
    }

    pci_bar = le32_to_cpu(pci_bar);
    type = pci_bar & (pci_bar & PCI_BASE_ADDRESS_SPACE_IO ?
           ~PCI_BASE_ADDRESS_IO_MASK : ~PCI_BASE_ADDRESS_MEM_MASK);

    /* A "slow" read/write mapping underlies all BARs */
    memory_region_init_io(&bar->mem, &vfio_bar_ops, bar, name, size);
    pci_register_bar(&vdev->pdev, nr, type, &bar->mem);

    /*
     * We can't mmap areas overlapping the MSIX vector table, so we
     * potentially insert a direct-mapped subregion before and after it.
     */
    if (vdev->msix && vdev->msix->table_bar == nr) {
        size = vdev->msix->table_offset & TARGET_PAGE_MASK;
    }

    strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
    if (vfio_mmap_bar(bar, &bar->mem,
                      &bar->mmap_mem, &bar->mmap, size, 0, name)) {
        error_report("%s unsupported. Performance may be slow\n", name);
    }

    if (vdev->msix && vdev->msix->table_bar == nr) {
        unsigned start;

        start = TARGET_PAGE_ALIGN(vdev->msix->table_offset +
                                  (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));

        size = start < bar->size ? bar->size - start : 0;
        strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1);
        /* VFIOMSIXInfo contains another MemoryRegion for this mapping */
        if (vfio_mmap_bar(bar, &bar->mem, &vdev->msix->mmap_mem,
                          &vdev->msix->mmap, size, start, name)) {
            error_report("%s unsupported. Performance may be slow\n", name);
        }
    }
}

static void vfio_map_bars(VFIODevice *vdev)
{
    int i;

    for (i = 0; i < PCI_ROM_SLOT; i++) {
        vfio_map_bar(vdev, i);
    }
}

static void vfio_unmap_bars(VFIODevice *vdev)
{
    int i;

    for (i = 0; i < PCI_ROM_SLOT; i++) {
        vfio_unmap_bar(vdev, i);
    }
}

/*
 * General setup
 */
static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
{
    uint8_t tmp, next = 0xff;

    for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp;
         tmp = pdev->config[tmp + 1]) {
        if (tmp > pos && tmp < next) {
            next = tmp;
        }
    }

    return next - pos;
}

static int vfio_add_std_cap(VFIODevice *vdev, uint8_t pos)
{
    PCIDevice *pdev = &vdev->pdev;
    uint8_t cap_id, next, size;
    int ret;

    cap_id = pdev->config[pos];
    next = pdev->config[pos + 1];

    /*
     * If it becomes important to configure capabilities to their actual
     * size, use this as the default when it's something we don't recognize.
     * Since QEMU doesn't actually handle many of the config accesses,
     * exact size doesn't seem worthwhile.
     */
    size = vfio_std_cap_max_size(pdev, pos);

    /*
     * pci_add_capability always inserts the new capability at the head
     * of the chain.  Therefore to end up with a chain that matches the
     * physical device, we insert from the end by making this recursive.
     * This is also why we pre-calculate size above as cached config space
     * will be changed as we unwind the stack.
     */
    if (next) {
        ret = vfio_add_std_cap(vdev, next);
        if (ret) {
            return ret;
        }
    } else {
        pdev->config[PCI_CAPABILITY_LIST] = 0; /* Begin the rebuild */
    }

    switch (cap_id) {
    case PCI_CAP_ID_MSI:
        ret = vfio_setup_msi(vdev, pos);
        break;
    case PCI_CAP_ID_MSIX:
        ret = vfio_setup_msix(vdev, pos);
        break;
    default:
        ret = pci_add_capability(pdev, cap_id, pos, size);
        break;
    }

    if (ret < 0) {
        error_report("vfio: %04x:%02x:%02x.%x Error adding PCI capability "
                     "0x%x[0x%x]@0x%x: %d\n", vdev->host.domain,
                     vdev->host.bus, vdev->host.slot, vdev->host.function,
                     cap_id, size, pos, ret);
        return ret;
    }

    return 0;
}

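/*
 * Example: for a physical chain A -> B -> C, the recursion above bottoms
 * out at C, then adds C, B, A as the stack unwinds.  Since
 * pci_add_capability() always inserts at the head of the list, the
 * emulated chain ends up A -> B -> C, matching the device.
 */
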
static int vfio_add_capabilities(VFIODevice *vdev)
{
    PCIDevice *pdev = &vdev->pdev;

    if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
        !pdev->config[PCI_CAPABILITY_LIST]) {
        return 0; /* Nothing to add */
    }

    return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
}

static int vfio_load_rom(VFIODevice *vdev)
{
    uint64_t size = vdev->rom_size;
    char name[32];
    off_t off = 0, voff = vdev->rom_offset;
    ssize_t bytes;
    void *ptr;

    /* If loading ROM from file, pci handles it */
    if (vdev->pdev.romfile || !vdev->pdev.rom_bar || !size) {
        return 0;
    }

    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);

    snprintf(name, sizeof(name), "vfio[%04x:%02x:%02x.%x].rom",
             vdev->host.domain, vdev->host.bus, vdev->host.slot,
             vdev->host.function);
    memory_region_init_ram(&vdev->pdev.rom, name, size);
    ptr = memory_region_get_ram_ptr(&vdev->pdev.rom);
    memset(ptr, 0xff, size);

    while (size) {
        bytes = pread(vdev->fd, ptr + off, size, voff + off);
        if (bytes == 0) {
            break; /* expect that we could get back less than the ROM BAR */
        } else if (bytes > 0) {
            off += bytes;
            size -= bytes;
        } else {
            if (errno == EINTR || errno == EAGAIN) {
                continue;
            }
            error_report("vfio: Error reading device ROM: %m\n");
            memory_region_destroy(&vdev->pdev.rom);
            return -errno;
        }
    }

    pci_register_bar(&vdev->pdev, PCI_ROM_SLOT, 0, &vdev->pdev.rom);
    vdev->pdev.has_rom = true;
    return 0;
}

static int vfio_connect_container(VFIOGroup *group)
{
    VFIOContainer *container;
    int ret, fd;

    if (group->container) {
        return 0;
    }

    QLIST_FOREACH(container, &container_list, next) {
        if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
            group->container = container;
            QLIST_INSERT_HEAD(&container->group_list, group, container_next);
            return 0;
        }
    }

    fd = qemu_open("/dev/vfio/vfio", O_RDWR);
    if (fd < 0) {
        error_report("vfio: failed to open /dev/vfio/vfio: %m\n");
        return -errno;
    }

    ret = ioctl(fd, VFIO_GET_API_VERSION);
    if (ret != VFIO_API_VERSION) {
        error_report("vfio: supported vfio version: %d, "
                     "reported version: %d\n", VFIO_API_VERSION, ret);
        close(fd);
        return -EINVAL;
    }

    container = g_malloc0(sizeof(*container));
    container->fd = fd;

    if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
        if (ret) {
            error_report("vfio: failed to set group container: %m\n");
            g_free(container);
            close(fd);
            return -errno;
        }

        ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
        if (ret) {
            error_report("vfio: failed to set iommu for container: %m\n");
            g_free(container);
            close(fd);
            return -errno;
        }

        container->iommu_data.listener = vfio_memory_listener;
        container->iommu_data.release = vfio_listener_release;

        memory_listener_register(&container->iommu_data.listener,
                                 &address_space_memory);
    } else {
        error_report("vfio: No available IOMMU models\n");
        g_free(container);
        close(fd);
        return -EINVAL;
    }

    QLIST_INIT(&container->group_list);
    QLIST_INSERT_HEAD(&container_list, container, next);

    group->container = container;
    QLIST_INSERT_HEAD(&container->group_list, group, container_next);

    return 0;
}

static void vfio_disconnect_container(VFIOGroup *group)
{
    VFIOContainer *container = group->container;

    if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
        error_report("vfio: error disconnecting group %d from container\n",
                     group->groupid);
    }

    QLIST_REMOVE(group, container_next);
    group->container = NULL;

    if (QLIST_EMPTY(&container->group_list)) {
        if (container->iommu_data.release) {
            container->iommu_data.release(container);
        }
        QLIST_REMOVE(container, next);
        DPRINTF("vfio_disconnect_container: close container->fd\n");
        close(container->fd);
        g_free(container);
    }
}

static VFIOGroup *vfio_get_group(int groupid)
{
    VFIOGroup *group;
    char path[32];
    struct vfio_group_status status = { .argsz = sizeof(status) };

    QLIST_FOREACH(group, &group_list, next) {
        if (group->groupid == groupid) {
            return group;
        }
    }

    group = g_malloc0(sizeof(*group));

    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
    group->fd = qemu_open(path, O_RDWR);
    if (group->fd < 0) {
        error_report("vfio: error opening %s: %m\n", path);
        g_free(group);
        return NULL;
    }

    if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
        error_report("vfio: error getting group status: %m\n");
        close(group->fd);
        g_free(group);
        return NULL;
    }

    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
        error_report("vfio: error, group %d is not viable, please ensure "
                     "all devices within the iommu_group are bound to their "
                     "vfio bus driver.\n", groupid);
        close(group->fd);
        g_free(group);
        return NULL;
    }

    group->groupid = groupid;
    QLIST_INIT(&group->device_list);

    if (vfio_connect_container(group)) {
        error_report("vfio: failed to setup container for group %d\n", groupid);
        close(group->fd);
        g_free(group);
        return NULL;
    }

    QLIST_INSERT_HEAD(&group_list, group, next);

    return group;
}

static void vfio_put_group(VFIOGroup *group)
{
    if (!QLIST_EMPTY(&group->device_list)) {
        return;
    }

    vfio_disconnect_container(group);
    QLIST_REMOVE(group, next);
    DPRINTF("vfio_put_group: close group->fd\n");
    close(group->fd);
    g_free(group);
}

1800static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev)
1801{
1802 struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
1803 struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
1804 int ret, i;
1805
1806 ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
1807 if (ret < 0) {
1808 error_report("vfio: error getting device %s from group %d: %m\n",
1809 name, group->groupid);
1810 error_report("Verify all devices in group %d are bound to vfio-pci "
1811 "or pci-stub and not already in use\n", group->groupid);
1812 return ret;
1813 }
1814
1815 vdev->fd = ret;
1816 vdev->group = group;
1817 QLIST_INSERT_HEAD(&group->device_list, vdev, next);
1818
1819 /* Sanity check device */
1820 ret = ioctl(vdev->fd, VFIO_DEVICE_GET_INFO, &dev_info);
1821 if (ret) {
1822 error_report("vfio: error getting device info: %m\n");
1823 goto error;
1824 }
1825
1826 DPRINTF("Device %s flags: %u, regions: %u, irgs: %u\n", name,
1827 dev_info.flags, dev_info.num_regions, dev_info.num_irqs);
1828
1829 if (!(dev_info.flags & VFIO_DEVICE_FLAGS_PCI)) {
1830 error_report("vfio: Um, this isn't a PCI device\n");
1831 goto error;
1832 }
1833
1834 vdev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
1835 if (!vdev->reset_works) {
1836 error_report("Warning, device %s does not support reset\n", name);
1837 }
1838
1839 if (dev_info.num_regions != VFIO_PCI_NUM_REGIONS) {
1840 error_report("vfio: unexpected number of io regions %u\n",
1841 dev_info.num_regions);
1842 goto error;
1843 }
1844
1845 if (dev_info.num_irqs != VFIO_PCI_NUM_IRQS) {
1846 error_report("vfio: unexpected number of irqs %u\n", dev_info.num_irqs);
1847 goto error;
1848 }
1849
1850 for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
1851 reg_info.index = i;
1852
1853 ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
1854 if (ret) {
1855 error_report("vfio: Error getting region %d info: %m\n", i);
1856 goto error;
1857 }
1858
1859 DPRINTF("Device %s region %d:\n", name, i);
1860 DPRINTF(" size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
1861 (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
1862 (unsigned long)reg_info.flags);
1863
1864 vdev->bars[i].flags = reg_info.flags;
1865 vdev->bars[i].size = reg_info.size;
1866 vdev->bars[i].fd_offset = reg_info.offset;
1867 vdev->bars[i].fd = vdev->fd;
1868 vdev->bars[i].nr = i;
1869 }
1870
1871 reg_info.index = VFIO_PCI_ROM_REGION_INDEX;
1872
1873 ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
1874 if (ret) {
1875 error_report("vfio: Error getting ROM info: %m\n");
1876 goto error;
1877 }
1878
1879 DPRINTF("Device %s ROM:\n", name);
1880 DPRINTF(" size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
1881 (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
1882 (unsigned long)reg_info.flags);
1883
1884 vdev->rom_size = reg_info.size;
1885 vdev->rom_offset = reg_info.offset;
1886
    reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX;

    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
    if (ret) {
        error_report("vfio: Error getting config info: %m\n");
        goto error;
    }

    DPRINTF("Device %s config:\n", name);
    DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
            (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
            (unsigned long)reg_info.flags);

    vdev->config_size = reg_info.size;
    vdev->config_offset = reg_info.offset;

error:
    if (ret) {
        QLIST_REMOVE(vdev, next);
        vdev->group = NULL;
        close(vdev->fd);
    }
    return ret;
}

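/* Undo vfio_get_device(): unlink the device from its group and close its fd */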
static void vfio_put_device(VFIODevice *vdev)
{
    QLIST_REMOVE(vdev, next);
    vdev->group = NULL;
    DPRINTF("vfio_put_device: close vdev->fd\n");
    close(vdev->fd);
    if (vdev->msix) {
        g_free(vdev->msix);
        vdev->msix = NULL;
    }
}

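/*
 * Device realization: resolve the host device's IOMMU group via sysfs, get
 * (or reuse) the group, fetch the device fd, seed QEMU's config space copy
 * from the physical device, then set up MSI-X, BARs, and capabilities, and
 * finally enable INTx if the device reports a legacy interrupt pin.
 */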
static int vfio_initfn(PCIDevice *pdev)
{
    VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    VFIOGroup *group;
    char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
    ssize_t len;
    struct stat st;
    int groupid;
    int ret;

    /* Check that the host device exists */
    snprintf(path, sizeof(path),
             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
             vdev->host.domain, vdev->host.bus, vdev->host.slot,
             vdev->host.function);
    if (stat(path, &st) < 0) {
        error_report("vfio: error: no such host device: %s\n", path);
        return -errno;
    }

    strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);

    /* Leave room for the terminator; readlink() does not add one */
    len = readlink(path, iommu_group_path, sizeof(iommu_group_path) - 1);
    if (len <= 0) {
        error_report("vfio: error: no iommu_group for device\n");
        return -errno;
    }

    iommu_group_path[len] = 0;
    group_name = basename(iommu_group_path);

    if (sscanf(group_name, "%d", &groupid) != 1) {
        error_report("vfio: error reading %s: %m\n", path);
        return -errno;
    }

    DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function, groupid);

    group = vfio_get_group(groupid);
    if (!group) {
        error_report("vfio: failed to get group %d\n", groupid);
        return -ENOENT;
    }

    snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x",
             vdev->host.domain, vdev->host.bus, vdev->host.slot,
             vdev->host.function);

    QLIST_FOREACH(pvdev, &group->device_list, next) {
        if (pvdev->host.domain == vdev->host.domain &&
            pvdev->host.bus == vdev->host.bus &&
            pvdev->host.slot == vdev->host.slot &&
            pvdev->host.function == vdev->host.function) {

            error_report("vfio: error: device %s is already attached\n", path);
            vfio_put_group(group);
            return -EBUSY;
        }
    }

    ret = vfio_get_device(group, path, vdev);
    if (ret) {
        error_report("vfio: failed to get device %s\n", path);
        vfio_put_group(group);
        return ret;
    }

    /* Get a copy of config space */
    ret = pread(vdev->fd, vdev->pdev.config,
                MIN(pci_config_size(&vdev->pdev), vdev->config_size),
                vdev->config_offset);
    if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) {
        ret = ret < 0 ? -errno : -EFAULT;
        error_report("vfio: Failed to read device config space\n");
        goto out_put;
    }

    /*
     * Clear host resource mapping info.  If we choose not to register a
     * BAR, such as might be the case with the option ROM, we can get
     * confusing, unwritable, residual addresses from the host here.
     */
    memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24); /* 6 x 32-bit BARs */
    memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4); /* 32-bit expansion ROM */

    vfio_load_rom(vdev);

    ret = vfio_early_setup_msix(vdev);
    if (ret) {
        goto out_put;
    }

    vfio_map_bars(vdev);

    ret = vfio_add_capabilities(vdev);
    if (ret) {
        goto out_teardown;
    }

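    /*
     * A nonzero interrupt pin means the device can assert INTx.  Arm the
     * timer that restores direct BAR mmaps once interrupts quiesce (see
     * vfio_intx_mmap_enable), watch for INTx routing changes, and start
     * out in INTx mode.
     */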
    if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
        vdev->intx.mmap_timer = qemu_new_timer_ms(vm_clock,
                                                  vfio_intx_mmap_enable, vdev);
        pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_update_irq);
        ret = vfio_enable_intx(vdev);
        if (ret) {
            goto out_teardown;
        }
    }

    return 0;

out_teardown:
    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
    vfio_teardown_msi(vdev);
    vfio_unmap_bars(vdev);
out_put:
    vfio_put_device(vdev);
    vfio_put_group(group);
    return ret;
}
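
/*
 * Teardown mirrors vfio_initfn() in reverse: quiesce device interrupts
 * before freeing the INTx mmap timer, then release MSI/MSI-X state, BAR
 * mappings, the device fd, and finally the group.
 */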
static void vfio_exitfn(PCIDevice *pdev)
{
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    VFIOGroup *group = vdev->group;

    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
    vfio_disable_interrupts(vdev);
    if (vdev->intx.mmap_timer) {
        qemu_free_timer(vdev->intx.mmap_timer);
    }
    vfio_teardown_msi(vdev);
    vfio_unmap_bars(vdev);
    vfio_put_device(vdev);
    vfio_put_group(group);
}

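/*
 * System reset: no guest driver is running, so force the device quiet.
 * Interrupts are disabled, I/O, MMIO, and bus master decoding are cleared
 * to stop DMA, and the kernel-level VFIO_DEVICE_RESET is attempted when
 * the device supports it.
 */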
static void vfio_pci_reset(DeviceState *dev)
{
    PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
    uint16_t cmd;

    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function);

    vfio_disable_interrupts(vdev);

    /*
     * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master.
     * Also put INTx Disable in a known state.
     */
    cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
    cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
             PCI_COMMAND_INTX_DISABLE);
    vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);

    if (vdev->reset_works) {
        if (ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
            error_report("vfio: Error: unable to reset physical device "
                         "(%04x:%02x:%02x.%x): %m\n", vdev->host.domain,
                         vdev->host.bus, vdev->host.slot, vdev->host.function);
        }
    }

    vfio_enable_intx(vdev);
}

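/*
 * "host" selects the physical device by PCI address (e.g. 0000:06:0d.0).
 * "x-intx-mmap-timeout-ms" is the delay before direct BAR mmaps are
 * re-enabled after INTx activity; it defaults to 1100 ms.
 */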
static Property vfio_pci_dev_properties[] = {
    DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIODevice, host),
    DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIODevice,
                       intx.mmap_timeout, 1100),
    /*
     * TODO - support passed fds... is this necessary?
     * DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name),
     * DEFINE_PROP_STRING("vfiogroupfd", VFIODevice, vfiogroupfd_name),
     */
    DEFINE_PROP_END_OF_LIST(),
};

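/*
 * Mark the device unmigratable: an assigned device's state lives in
 * physical hardware that QEMU cannot serialize and restore.
 */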
static const VMStateDescription vfio_pci_vmstate = {
    .name = "vfio-pci",
    .unmigratable = 1,
};

static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);

    dc->reset = vfio_pci_reset;
    dc->props = vfio_pci_dev_properties;
    dc->vmsd = &vfio_pci_vmstate;
    dc->desc = "VFIO-based PCI device assignment";
    pdc->init = vfio_initfn;
    pdc->exit = vfio_exitfn;
    pdc->config_read = vfio_pci_read_config;
    pdc->config_write = vfio_pci_write_config;
}

static const TypeInfo vfio_pci_dev_info = {
    .name = "vfio-pci",
    .parent = TYPE_PCI_DEVICE,
    .instance_size = sizeof(VFIODevice),
    .class_init = vfio_pci_dev_class_init,
};

static void register_vfio_pci_dev_type(void)
{
    type_register_static(&vfio_pci_dev_info);
}

type_init(register_vfio_pci_dev_type)
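
/*
 * Typical usage, assuming the host device has already been unbound from its
 * native driver and bound to vfio-pci so its group appears under /dev/vfio/:
 *
 *   qemu-system-x86_64 ... -device vfio-pci,host=0000:06:0d.0
 */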