/*
 * vfio based device assignment support
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */

#include <linux/vfio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include "config.h"
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "qemu/error-report.h"
#include "qemu/range.h"
#include "sysemu/kvm.h"
#include "sysemu/sysemu.h"
#include "pci.h"
#include "trace.h"

/*
 * List of device ids/vendor ids for which to disable
 * option ROM loading. This avoids guest hangs during ROM
 * execution, as noticed with the BCM 57810 card, for lack of
 * a better way to handle such issues.
 * The user can still override by specifying a romfile or
 * rombar=1.
 * Please see https://bugs.launchpad.net/qemu/+bug/1284874
 * for an analysis of the 57810 card hang. When adding
 * a new vendor id/device id combination below, please also add
 * your card/environment details and information that could
 * help in debugging to the bug tracking this issue
 */
static const VFIORomBlacklistEntry romblacklist[] = {
    /* Broadcom BCM 57810 */
    { 0x14e4, 0x168e }
};

#define MSIX_CAP_LENGTH 12

static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
                                  uint32_t val, int len);
static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);

/*
 * Disabling BAR mmapping can be slow, but toggling it around INTx can
 * also be a huge overhead. We try to get the best of both worlds by
 * waiting until an interrupt to disable mmaps (subsequent transitions
 * to the same state are effectively no overhead). If the interrupt has
 * been serviced and the time gap is long enough, we re-enable mmaps for
 * performance. This works well for things like graphics cards, which
 * may not use their interrupt at all and are penalized to an unusable
 * level by read/write BAR traps. Other devices, like NICs, have more
 * regular interrupts and see much better latency by staying in non-mmap
 * mode. We therefore set the default mmap_timeout such that a ping
 * is just enough to keep the mmap disabled. Users can experiment with
 * other options with the x-intx-mmap-timeout-ms parameter (a value of
 * zero disables the timer).
 */
static void vfio_intx_mmap_enable(void *opaque)
{
    VFIOPCIDevice *vdev = opaque;

    if (vdev->intx.pending) {
        timer_mod(vdev->intx.mmap_timer,
                  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
        return;
    }

    vfio_mmap_set_enabled(vdev, true);
}
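
/*
 * Illustrative usage (our example, not from this file): the timeout above
 * is a per-device property, e.g.
 *
 *   -device vfio-pci,host=0000:01:00.0,x-intx-mmap-timeout-ms=0
 *
 * where, per the comment above, a value of zero disables the re-enable
 * timer altogether.
 */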

static void vfio_intx_interrupt(void *opaque)
{
    VFIOPCIDevice *vdev = opaque;

    if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
        return;
    }

    trace_vfio_intx_interrupt(vdev->vbasedev.name, 'A' + vdev->intx.pin);

    vdev->intx.pending = true;
    pci_irq_assert(&vdev->pdev);
    vfio_mmap_set_enabled(vdev, false);
    if (vdev->intx.mmap_timeout) {
        timer_mod(vdev->intx.mmap_timer,
                  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
    }
}

static void vfio_intx_eoi(VFIODevice *vbasedev)
{
    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);

    if (!vdev->intx.pending) {
        return;
    }

    trace_vfio_intx_eoi(vbasedev->name);

    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);
    vfio_unmask_single_irqindex(vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
}
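
/*
 * Our summary of the INTx round trip implemented by the two handlers
 * above: the host interrupt arrives as an eventfd signal, QEMU asserts
 * the guest IRQ and leaves the host line masked (INTx is level
 * triggered, so it would otherwise re-fire immediately); once the guest
 * EOIs, vfio_intx_eoi() deasserts the guest IRQ and unmasks the host
 * line so the device can interrupt again.
 */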

static void vfio_intx_enable_kvm(VFIOPCIDevice *vdev)
{
#ifdef CONFIG_KVM
    struct kvm_irqfd irqfd = {
        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
        .gsi = vdev->intx.route.irq,
        .flags = KVM_IRQFD_FLAG_RESAMPLE,
    };
    struct vfio_irq_set *irq_set;
    int ret, argsz;
    int32_t *pfd;

    if (vdev->no_kvm_intx || !kvm_irqfds_enabled() ||
        vdev->intx.route.mode != PCI_INTX_ENABLED ||
        !kvm_resamplefds_enabled()) {
        return;
    }

    /* Get to a known interrupt state */
    qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev);
    vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);

    /* Get an eventfd for resample/unmask */
    if (event_notifier_init(&vdev->intx.unmask, 0)) {
        error_report("vfio: Error: event_notifier_init failed for EOI");
        goto fail;
    }

    /* KVM triggers it, VFIO listens for it */
    irqfd.resamplefd = event_notifier_get_fd(&vdev->intx.unmask);

    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
        error_report("vfio: Error: Failed to setup resample irqfd: %m");
        goto fail_irqfd;
    }

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
    irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;

    *pfd = irqfd.resamplefd;

    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);
    if (ret) {
        error_report("vfio: Error: Failed to setup INTx unmask fd: %m");
        goto fail_vfio;
    }

    /* Let'em rip */
    vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);

    vdev->intx.kvm_accel = true;

    trace_vfio_intx_enable_kvm(vdev->vbasedev.name);

    return;

fail_vfio:
    irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN;
    kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd);
fail_irqfd:
    event_notifier_cleanup(&vdev->intx.unmask);
fail:
    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
    vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
#endif
}
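
/*
 * The argsz/flexible-array pattern above recurs throughout this file
 * whenever a single eventfd is handed to VFIO_DEVICE_SET_IRQS. A minimal
 * sketch of that pattern factored into a helper (ours, for illustration
 * only; the file deliberately open-codes it at each call site):
 */
static int __attribute__((unused))
vfio_set_irq_eventfd_sketch(VFIODevice *vbasedev, int index, int subindex,
                            uint32_t action, int fd)
{
    struct vfio_irq_set *irq_set;
    int argsz = sizeof(*irq_set) + sizeof(int32_t);
    int ret;

    /* Fixed header plus one eventfd in the variable-length data[] tail */
    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | action;
    irq_set->index = index;
    irq_set->start = subindex;
    irq_set->count = 1;
    *(int32_t *)&irq_set->data = fd;

    ret = ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);
    return ret ? -errno : 0;
}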

static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev)
{
#ifdef CONFIG_KVM
    struct kvm_irqfd irqfd = {
        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
        .gsi = vdev->intx.route.irq,
        .flags = KVM_IRQFD_FLAG_DEASSIGN,
    };

    if (!vdev->intx.kvm_accel) {
        return;
    }

    /*
     * Get to a known state, hardware masked, QEMU ready to accept new
     * interrupts, QEMU IRQ de-asserted.
     */
    vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);

    /* Tell KVM to stop listening for an INTx irqfd */
    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
        error_report("vfio: Error: Failed to disable INTx irqfd: %m");
    }

    /* We only need to close the eventfd for VFIO to clean up the kernel side */
    event_notifier_cleanup(&vdev->intx.unmask);

    /* QEMU starts listening for interrupt events. */
    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);

    vdev->intx.kvm_accel = false;

    /* If we've missed an event, let it re-fire through QEMU */
    vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);

    trace_vfio_intx_disable_kvm(vdev->vbasedev.name);
#endif
}

static void vfio_intx_update(PCIDevice *pdev)
{
    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
    PCIINTxRoute route;

    if (vdev->interrupt != VFIO_INT_INTx) {
        return;
    }

    route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);

    if (!pci_intx_route_changed(&vdev->intx.route, &route)) {
        return; /* Nothing changed */
    }

    trace_vfio_intx_update(vdev->vbasedev.name,
                           vdev->intx.route.irq, route.irq);

    vfio_intx_disable_kvm(vdev);

    vdev->intx.route = route;

    if (route.mode != PCI_INTX_ENABLED) {
        return;
    }

    vfio_intx_enable_kvm(vdev);

    /* Re-enable the interrupt in case we missed an EOI */
    vfio_intx_eoi(&vdev->vbasedev);
}

static int vfio_intx_enable(VFIOPCIDevice *vdev)
{
    uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
    int ret, argsz;
    struct vfio_irq_set *irq_set;
    int32_t *pfd;

    if (!pin) {
        return 0;
    }

    vfio_disable_interrupts(vdev);

    vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
    pci_config_set_interrupt_pin(vdev->pdev.config, pin);

#ifdef CONFIG_KVM
    /*
     * This is conditional only to avoid generating error messages on
     * platforms where we won't actually use the result anyway.
     */
    if (kvm_irqfds_enabled() && kvm_resamplefds_enabled()) {
        vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
                                                        vdev->intx.pin);
    }
#endif

    ret = event_notifier_init(&vdev->intx.interrupt, 0);
    if (ret) {
        error_report("vfio: Error: event_notifier_init failed");
        return ret;
    }

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;

    *pfd = event_notifier_get_fd(&vdev->intx.interrupt);
    qemu_set_fd_handler(*pfd, vfio_intx_interrupt, NULL, vdev);

    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);
    if (ret) {
        error_report("vfio: Error: Failed to setup INTx fd: %m");
        qemu_set_fd_handler(event_notifier_get_fd(&vdev->intx.interrupt),
                            NULL, NULL, vdev);
        event_notifier_cleanup(&vdev->intx.interrupt);
        return -errno;
    }

    vfio_intx_enable_kvm(vdev);

    vdev->interrupt = VFIO_INT_INTx;

    trace_vfio_intx_enable(vdev->vbasedev.name);

    return 0;
}

static void vfio_intx_disable(VFIOPCIDevice *vdev)
{
    int fd;

    timer_del(vdev->intx.mmap_timer);
    vfio_intx_disable_kvm(vdev);
    vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);
    vfio_mmap_set_enabled(vdev, true);

    fd = event_notifier_get_fd(&vdev->intx.interrupt);
    qemu_set_fd_handler(fd, NULL, NULL, vdev);
    event_notifier_cleanup(&vdev->intx.interrupt);

    vdev->interrupt = VFIO_INT_NONE;

    trace_vfio_intx_disable(vdev->vbasedev.name);
}

/*
 * MSI/X
 */
static void vfio_msi_interrupt(void *opaque)
{
    VFIOMSIVector *vector = opaque;
    VFIOPCIDevice *vdev = vector->vdev;
    MSIMessage (*get_msg)(PCIDevice *dev, unsigned vector);
    void (*notify)(PCIDevice *dev, unsigned vector);
    MSIMessage msg;
    int nr = vector - vdev->msi_vectors;

    if (!event_notifier_test_and_clear(&vector->interrupt)) {
        return;
    }

    if (vdev->interrupt == VFIO_INT_MSIX) {
        get_msg = msix_get_message;
        notify = msix_notify;
    } else if (vdev->interrupt == VFIO_INT_MSI) {
        get_msg = msi_get_message;
        notify = msi_notify;
    } else {
        abort();
    }

    msg = get_msg(&vdev->pdev, nr);
    trace_vfio_msi_interrupt(vdev->vbasedev.name, nr, msg.address, msg.data);
    notify(&vdev->pdev, nr);
}

static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
{
    struct vfio_irq_set *irq_set;
    int ret = 0, i, argsz;
    int32_t *fds;

    argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = vdev->nr_vectors;
    fds = (int32_t *)&irq_set->data;

    for (i = 0; i < vdev->nr_vectors; i++) {
        int fd = -1;

        /*
         * MSI vs MSI-X - The guest has direct access to MSI mask and pending
         * bits, therefore we always use the KVM signaling path when set up.
         * MSI-X mask and pending bits are emulated, so we want to use the
         * KVM signaling path only when configured and unmasked.
         */
        if (vdev->msi_vectors[i].use) {
            if (vdev->msi_vectors[i].virq < 0 ||
                (msix && msix_is_masked(&vdev->pdev, i))) {
                fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
            } else {
                fd = event_notifier_get_fd(&vdev->msi_vectors[i].kvm_interrupt);
            }
        }

        fds[i] = fd;
    }

    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);

    g_free(irq_set);

    return ret;
}

static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
                                  MSIMessage *msg, bool msix)
{
    int virq;

    if ((msix && vdev->no_kvm_msix) || (!msix && vdev->no_kvm_msi) || !msg) {
        return;
    }

    if (event_notifier_init(&vector->kvm_interrupt, 0)) {
        return;
    }

    virq = kvm_irqchip_add_msi_route(kvm_state, *msg);
    if (virq < 0) {
        event_notifier_cleanup(&vector->kvm_interrupt);
        return;
    }

    if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
                                           NULL, virq) < 0) {
        kvm_irqchip_release_virq(kvm_state, virq);
        event_notifier_cleanup(&vector->kvm_interrupt);
        return;
    }

    vector->virq = virq;
}

static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector)
{
    kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
                                          vector->virq);
    kvm_irqchip_release_virq(kvm_state, vector->virq);
    vector->virq = -1;
    event_notifier_cleanup(&vector->kvm_interrupt);
}
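
/*
 * Our note on the virq plumbing above: once a route and irqfd are in
 * place, the vector's kvm_interrupt eventfd is consumed directly by KVM,
 * which injects the MSI into the guest without bouncing through the
 * userspace vfio_msi_interrupt() handler; the plain interrupt eventfd
 * remains the fallback whenever no virq could be allocated.
 */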

static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg)
{
    kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg);
}

static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
                                   MSIMessage *msg, IOHandler *handler)
{
    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
    VFIOMSIVector *vector;
    int ret;

    trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr);

    vector = &vdev->msi_vectors[nr];

    if (!vector->use) {
        vector->vdev = vdev;
        vector->virq = -1;
        if (event_notifier_init(&vector->interrupt, 0)) {
            error_report("vfio: Error: event_notifier_init failed");
        }
        vector->use = true;
        msix_vector_use(pdev, nr);
    }

    qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                        handler, NULL, vector);

    /*
     * Attempt to enable route through KVM irqchip,
     * default to userspace handling if unavailable.
     */
    if (vector->virq >= 0) {
        if (!msg) {
            vfio_remove_kvm_msi_virq(vector);
        } else {
            vfio_update_kvm_msi_virq(vector, *msg);
        }
    } else {
        vfio_add_kvm_msi_virq(vdev, vector, msg, true);
    }

    /*
     * We don't want to have the host allocate all possible MSI vectors
     * for a device if they're not in use, so we shut down and incrementally
     * increase them as needed.
     */
    if (vdev->nr_vectors < nr + 1) {
        vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
        vdev->nr_vectors = nr + 1;
        ret = vfio_enable_vectors(vdev, true);
        if (ret) {
            error_report("vfio: failed to enable vectors, %d", ret);
        }
    } else {
        int argsz;
        struct vfio_irq_set *irq_set;
        int32_t *pfd;

        argsz = sizeof(*irq_set) + sizeof(*pfd);

        irq_set = g_malloc0(argsz);
        irq_set->argsz = argsz;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                         VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = nr;
        irq_set->count = 1;
        pfd = (int32_t *)&irq_set->data;

        if (vector->virq >= 0) {
            *pfd = event_notifier_get_fd(&vector->kvm_interrupt);
        } else {
            *pfd = event_notifier_get_fd(&vector->interrupt);
        }

        ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
        g_free(irq_set);
        if (ret) {
            error_report("vfio: failed to modify vector, %d", ret);
        }
    }

    return 0;
}

static int vfio_msix_vector_use(PCIDevice *pdev,
                                unsigned int nr, MSIMessage msg)
{
    return vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt);
}

static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
{
    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
    VFIOMSIVector *vector = &vdev->msi_vectors[nr];

    trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);

    /*
     * There are still old guests that mask and unmask vectors on every
     * interrupt. If we're using QEMU bypass with a KVM irqfd, leave all of
     * the KVM setup in place, simply switch VFIO to use the non-bypass
     * eventfd. We'll then fire the interrupt through QEMU and the MSI-X
     * core will mask the interrupt and set pending bits, allowing it to
     * be re-asserted on unmask. Nothing to do if already using QEMU mode.
     */
    if (vector->virq >= 0) {
        int argsz;
        struct vfio_irq_set *irq_set;
        int32_t *pfd;

        argsz = sizeof(*irq_set) + sizeof(*pfd);

        irq_set = g_malloc0(argsz);
        irq_set->argsz = argsz;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                         VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = nr;
        irq_set->count = 1;
        pfd = (int32_t *)&irq_set->data;

        *pfd = event_notifier_get_fd(&vector->interrupt);

        ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);

        g_free(irq_set);
    }
}

static void vfio_msix_enable(VFIOPCIDevice *vdev)
{
    vfio_disable_interrupts(vdev);

    vdev->msi_vectors = g_malloc0(vdev->msix->entries * sizeof(VFIOMSIVector));

    vdev->interrupt = VFIO_INT_MSIX;

    /*
     * Some communication channels between VF & PF or PF & fw rely on the
     * physical state of the device and expect that enabling MSI-X from the
     * guest enables the same on the host. When our guest is Linux, the
     * guest driver call to pci_enable_msix() sets the enabling bit in the
     * MSI-X capability, but leaves the vector table masked. We therefore
     * can't rely on a vector_use callback (from request_irq() in the guest)
     * to switch the physical device into MSI-X mode because that may come a
     * long time after pci_enable_msix(). This code enables vector 0 with
     * triggering to userspace, then immediately releases the vector, leaving
     * the physical device with no vectors enabled, but MSI-X enabled, just
     * like the guest view.
     */
    vfio_msix_vector_do_use(&vdev->pdev, 0, NULL, NULL);
    vfio_msix_vector_release(&vdev->pdev, 0);

    if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
                                  vfio_msix_vector_release, NULL)) {
        error_report("vfio: msix_set_vector_notifiers failed");
    }

    trace_vfio_msix_enable(vdev->vbasedev.name);
}

static void vfio_msi_enable(VFIOPCIDevice *vdev)
{
    int ret, i;

    vfio_disable_interrupts(vdev);

    vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
retry:
    vdev->msi_vectors = g_malloc0(vdev->nr_vectors * sizeof(VFIOMSIVector));

    for (i = 0; i < vdev->nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];
        MSIMessage msg = msi_get_message(&vdev->pdev, i);

        vector->vdev = vdev;
        vector->virq = -1;
        vector->use = true;

        if (event_notifier_init(&vector->interrupt, 0)) {
            error_report("vfio: Error: event_notifier_init failed");
        }

        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                            vfio_msi_interrupt, NULL, vector);

        /*
         * Attempt to enable route through KVM irqchip,
         * default to userspace handling if unavailable.
         */
        vfio_add_kvm_msi_virq(vdev, vector, &msg, false);
    }

    /* Set interrupt type prior to possible interrupts */
    vdev->interrupt = VFIO_INT_MSI;

    ret = vfio_enable_vectors(vdev, false);
    if (ret) {
        if (ret < 0) {
            error_report("vfio: Error: Failed to setup MSI fds: %m");
        } else if (ret != vdev->nr_vectors) {
            error_report("vfio: Error: Failed to enable %d "
                         "MSI vectors, retry with %d", vdev->nr_vectors, ret);
        }

        for (i = 0; i < vdev->nr_vectors; i++) {
            VFIOMSIVector *vector = &vdev->msi_vectors[i];
            if (vector->virq >= 0) {
                vfio_remove_kvm_msi_virq(vector);
            }
            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                                NULL, NULL, NULL);
            event_notifier_cleanup(&vector->interrupt);
        }

        g_free(vdev->msi_vectors);

        if (ret > 0 && ret != vdev->nr_vectors) {
            vdev->nr_vectors = ret;
            goto retry;
        }
        vdev->nr_vectors = 0;

        /*
         * Failing to set up MSI doesn't really fall within any specification.
         * Let's try leaving interrupts disabled and hope the guest falls
         * back to INTx for this device.
         */
        error_report("vfio: Error: Failed to enable MSI");
        vdev->interrupt = VFIO_INT_NONE;

        return;
    }

    trace_vfio_msi_enable(vdev->vbasedev.name, vdev->nr_vectors);
}

static void vfio_msi_disable_common(VFIOPCIDevice *vdev)
{
    int i;

    for (i = 0; i < vdev->nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];
        if (vdev->msi_vectors[i].use) {
            if (vector->virq >= 0) {
                vfio_remove_kvm_msi_virq(vector);
            }
            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                                NULL, NULL, NULL);
            event_notifier_cleanup(&vector->interrupt);
        }
    }

    g_free(vdev->msi_vectors);
    vdev->msi_vectors = NULL;
    vdev->nr_vectors = 0;
    vdev->interrupt = VFIO_INT_NONE;

    vfio_intx_enable(vdev);
}

static void vfio_msix_disable(VFIOPCIDevice *vdev)
{
    int i;

    msix_unset_vector_notifiers(&vdev->pdev);

    /*
     * MSI-X will only release vectors if MSI-X is still enabled on the
     * device, so check through the rest and release them ourselves if
     * necessary.
     */
    for (i = 0; i < vdev->nr_vectors; i++) {
        if (vdev->msi_vectors[i].use) {
            vfio_msix_vector_release(&vdev->pdev, i);
            msix_vector_unuse(&vdev->pdev, i);
        }
    }

    if (vdev->nr_vectors) {
        vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
    }

    vfio_msi_disable_common(vdev);

    trace_vfio_msix_disable(vdev->vbasedev.name);
}

static void vfio_msi_disable(VFIOPCIDevice *vdev)
{
    vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX);
    vfio_msi_disable_common(vdev);

    trace_vfio_msi_disable(vdev->vbasedev.name);
}

static void vfio_update_msi(VFIOPCIDevice *vdev)
{
    int i;

    for (i = 0; i < vdev->nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];
        MSIMessage msg;

        if (!vector->use || vector->virq < 0) {
            continue;
        }

        msg = msi_get_message(&vdev->pdev, i);
        vfio_update_kvm_msi_virq(vector, msg);
    }
}

static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
{
    struct vfio_region_info reg_info = {
        .argsz = sizeof(reg_info),
        .index = VFIO_PCI_ROM_REGION_INDEX
    };
    uint64_t size;
    off_t off = 0;
    ssize_t bytes;

    if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info)) {
        error_report("vfio: Error getting ROM info: %m");
        return;
    }

    trace_vfio_pci_load_rom(vdev->vbasedev.name, (unsigned long)reg_info.size,
                            (unsigned long)reg_info.offset,
                            (unsigned long)reg_info.flags);

    vdev->rom_size = size = reg_info.size;
    vdev->rom_offset = reg_info.offset;

    if (!vdev->rom_size) {
        vdev->rom_read_failed = true;
        error_report("vfio-pci: Cannot read device rom at "
                     "%s", vdev->vbasedev.name);
        error_printf("Device option ROM contents are probably invalid "
                     "(check dmesg).\nSkip option ROM probe with rombar=0, "
                     "or load from file with romfile=\n");
        return;
    }

    vdev->rom = g_malloc(size);
    memset(vdev->rom, 0xff, size);

    while (size) {
        bytes = pread(vdev->vbasedev.fd, vdev->rom + off,
                      size, vdev->rom_offset + off);
        if (bytes == 0) {
            break;
        } else if (bytes > 0) {
            off += bytes;
            size -= bytes;
        } else {
            if (errno == EINTR || errno == EAGAIN) {
                continue;
            }
            error_report("vfio: Error reading device ROM: %m");
            break;
        }
    }
}

static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
{
    VFIOPCIDevice *vdev = opaque;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } val;
    uint64_t data = 0;

    /* Load the ROM lazily when the guest tries to read it */
    if (unlikely(!vdev->rom && !vdev->rom_read_failed)) {
        vfio_pci_load_rom(vdev);
    }

    memcpy(&val, vdev->rom + addr,
           (addr < vdev->rom_size) ? MIN(size, vdev->rom_size - addr) : 0);

    switch (size) {
    case 1:
        data = val.byte;
        break;
    case 2:
        data = le16_to_cpu(val.word);
        break;
    case 4:
        data = le32_to_cpu(val.dword);
        break;
    default:
        hw_error("vfio: unsupported read size, %d bytes\n", size);
        break;
    }

    trace_vfio_rom_read(vdev->vbasedev.name, addr, size, data);

    return data;
}

static void vfio_rom_write(void *opaque, hwaddr addr,
                           uint64_t data, unsigned size)
{
}

static const MemoryRegionOps vfio_rom_ops = {
    .read = vfio_rom_read,
    .write = vfio_rom_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
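
/*
 * Hypothetical command lines matching the advice printed by
 * vfio_pci_load_rom() above (the device address is our example):
 *
 *   -device vfio-pci,host=0000:05:00.0,rombar=0          (skip ROM probe)
 *   -device vfio-pci,host=0000:05:00.0,romfile=vbios.rom (load from file)
 */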

static bool vfio_blacklist_opt_rom(VFIOPCIDevice *vdev)
{
    PCIDevice *pdev = &vdev->pdev;
    uint16_t vendor_id, device_id;
    int count = 0;

    vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID);
    device_id = pci_get_word(pdev->config + PCI_DEVICE_ID);

    while (count < ARRAY_SIZE(romblacklist)) {
        if (romblacklist[count].vendor_id == vendor_id &&
            romblacklist[count].device_id == device_id) {
            return true;
        }
        count++;
    }

    return false;
}

static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
{
    uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
    off_t offset = vdev->config_offset + PCI_ROM_ADDRESS;
    DeviceState *dev = DEVICE(vdev);
    char name[32];
    int fd = vdev->vbasedev.fd;

    if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
        /* Since pci handles romfile, just print a message and return */
        if (vfio_blacklist_opt_rom(vdev) && vdev->pdev.romfile) {
            error_printf("Warning: Device at %04x:%02x:%02x.%x "
                         "is known to cause system instability issues during "
                         "option rom execution. "
                         "Proceeding anyway since user specified romfile\n",
                         vdev->host.domain, vdev->host.bus, vdev->host.slot,
                         vdev->host.function);
        }
        return;
    }

    /*
     * Use the same size ROM BAR as the physical device. The contents
     * will get filled in later when the guest tries to read it.
     */
    if (pread(fd, &orig, 4, offset) != 4 ||
        pwrite(fd, &size, 4, offset) != 4 ||
        pread(fd, &size, 4, offset) != 4 ||
        pwrite(fd, &orig, 4, offset) != 4) {
        error_report("%s(%04x:%02x:%02x.%x) failed: %m",
                     __func__, vdev->host.domain, vdev->host.bus,
                     vdev->host.slot, vdev->host.function);
        return;
    }

    size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1;
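    /*
     * Worked example (ours): a 64KB ROM reads back 0xffff0000 after the
     * all-ones write, so ~(0xffff0000 & PCI_ROM_ADDRESS_MASK) + 1 =
     * 0x0000ffff + 1 = 0x10000 bytes.
     */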

    if (!size) {
        return;
    }

    if (vfio_blacklist_opt_rom(vdev)) {
        if (dev->opts && qemu_opt_get(dev->opts, "rombar")) {
            error_printf("Warning: Device at %04x:%02x:%02x.%x "
                         "is known to cause system instability issues during "
                         "option rom execution. "
                         "Proceeding anyway since user specified a non-zero "
                         "value for rombar\n",
                         vdev->host.domain, vdev->host.bus, vdev->host.slot,
                         vdev->host.function);
        } else {
            error_printf("Warning: ROM loading for device at "
                         "%04x:%02x:%02x.%x has been disabled due to "
                         "system instability issues. "
                         "Specify rombar=1 or romfile to force\n",
                         vdev->host.domain, vdev->host.bus, vdev->host.slot,
                         vdev->host.function);
            return;
        }
    }

    trace_vfio_pci_size_rom(vdev->vbasedev.name, size);

    snprintf(name, sizeof(name), "vfio[%04x:%02x:%02x.%x].rom",
             vdev->host.domain, vdev->host.bus, vdev->host.slot,
             vdev->host.function);

    memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev),
                          &vfio_rom_ops, vdev, name, size);

    pci_register_bar(&vdev->pdev, PCI_ROM_SLOT,
                     PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom);

    vdev->pdev.has_rom = true;
    vdev->rom_read_failed = false;
}

static void vfio_vga_write(void *opaque, hwaddr addr,
                           uint64_t data, unsigned size)
{
    VFIOVGARegion *region = opaque;
    VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;
    off_t offset = vga->fd_offset + region->offset + addr;

    switch (size) {
    case 1:
        buf.byte = data;
        break;
    case 2:
        buf.word = cpu_to_le16(data);
        break;
    case 4:
        buf.dword = cpu_to_le32(data);
        break;
    default:
        hw_error("vfio: unsupported write size, %d bytes", size);
        break;
    }

    if (pwrite(vga->fd, &buf, size, offset) != size) {
        error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
                     __func__, region->offset + addr, data, size);
    }

    trace_vfio_vga_write(region->offset + addr, data, size);
}

static uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size)
{
    VFIOVGARegion *region = opaque;
    VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;
    uint64_t data = 0;
    off_t offset = vga->fd_offset + region->offset + addr;

    if (pread(vga->fd, &buf, size, offset) != size) {
        error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
                     __func__, region->offset + addr, size);
        return (uint64_t)-1;
    }

    switch (size) {
    case 1:
        data = buf.byte;
        break;
    case 2:
        data = le16_to_cpu(buf.word);
        break;
    case 4:
        data = le32_to_cpu(buf.dword);
        break;
    default:
        hw_error("vfio: unsupported read size, %d bytes", size);
        break;
    }

    trace_vfio_vga_read(region->offset + addr, size, data);

    return data;
}

static const MemoryRegionOps vfio_vga_ops = {
    .read = vfio_vga_read,
    .write = vfio_vga_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

/*
 * Device specific quirks
 */

/* Is range1 fully contained within range2? */
static bool vfio_range_contained(uint64_t first1, uint64_t len1,
                                 uint64_t first2, uint64_t len2) {
    return (first1 >= first2 && first1 + len1 <= first2 + len2);
}

static bool vfio_flags_enabled(uint8_t flags, uint8_t mask)
{
    return (mask && (flags & mask) == mask);
}

static uint64_t vfio_generic_window_quirk_read(void *opaque,
                                               hwaddr addr, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;
    uint64_t data;

    if (vfio_flags_enabled(quirk->data.flags, quirk->data.read_flags) &&
        ranges_overlap(addr, size,
                       quirk->data.data_offset, quirk->data.data_size)) {
        hwaddr offset = addr - quirk->data.data_offset;

        if (!vfio_range_contained(addr, size, quirk->data.data_offset,
                                  quirk->data.data_size)) {
            hw_error("%s: window data read not fully contained: %s",
                     __func__, memory_region_name(&quirk->mem));
        }

        data = vfio_pci_read_config(&vdev->pdev,
                                    quirk->data.address_val + offset, size);

        trace_vfio_generic_window_quirk_read(memory_region_name(&quirk->mem),
                                             vdev->vbasedev.name,
                                             quirk->data.bar,
                                             addr, size, data);
    } else {
        data = vfio_region_read(&vdev->bars[quirk->data.bar].region,
                                addr + quirk->data.base_offset, size);
    }

    return data;
}

static void vfio_generic_window_quirk_write(void *opaque, hwaddr addr,
                                            uint64_t data, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;

    if (ranges_overlap(addr, size,
                       quirk->data.address_offset, quirk->data.address_size)) {

        if (addr != quirk->data.address_offset) {
            hw_error("%s: offset write into address window: %s",
                     __func__, memory_region_name(&quirk->mem));
        }

        if ((data & ~quirk->data.address_mask) == quirk->data.address_match) {
            quirk->data.flags |= quirk->data.write_flags |
                                 quirk->data.read_flags;
            quirk->data.address_val = data & quirk->data.address_mask;
        } else {
            quirk->data.flags &= ~(quirk->data.write_flags |
                                   quirk->data.read_flags);
        }
    }

    if (vfio_flags_enabled(quirk->data.flags, quirk->data.write_flags) &&
        ranges_overlap(addr, size,
                       quirk->data.data_offset, quirk->data.data_size)) {
        hwaddr offset = addr - quirk->data.data_offset;

        if (!vfio_range_contained(addr, size, quirk->data.data_offset,
                                  quirk->data.data_size)) {
            hw_error("%s: window data write not fully contained: %s",
                     __func__, memory_region_name(&quirk->mem));
        }

        vfio_pci_write_config(&vdev->pdev,
                              quirk->data.address_val + offset, data, size);
        trace_vfio_generic_window_quirk_write(memory_region_name(&quirk->mem),
                                              vdev->vbasedev.name,
                                              quirk->data.bar,
                                              addr, data, size);
        return;
    }

    vfio_region_write(&vdev->bars[quirk->data.bar].region,
                      addr + quirk->data.base_offset, data, size);
}

static const MemoryRegionOps vfio_generic_window_quirk = {
    .read = vfio_generic_window_quirk_read,
    .write = vfio_generic_window_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
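
/*
 * Worked example of the window protocol above (values are ours): with
 * address_offset = 0, data_offset = 4 and address_match/address_mask of
 * 0x4000/0xfff, a guest write of 0x4004 to window offset 0 arms the quirk
 * with address_val = 4; subsequent dword accesses at window offset 4 are
 * then redirected to PCI config space offset 4 instead of the BAR.
 */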

static uint64_t vfio_generic_quirk_read(void *opaque,
                                        hwaddr addr, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;
    hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK;
    hwaddr offset = quirk->data.address_match & ~TARGET_PAGE_MASK;
    uint64_t data;

    if (vfio_flags_enabled(quirk->data.flags, quirk->data.read_flags) &&
        ranges_overlap(addr, size, offset, quirk->data.address_mask + 1)) {
        if (!vfio_range_contained(addr, size, offset,
                                  quirk->data.address_mask + 1)) {
            hw_error("%s: read not fully contained: %s",
                     __func__, memory_region_name(&quirk->mem));
        }

        data = vfio_pci_read_config(&vdev->pdev, addr - offset, size);

        trace_vfio_generic_quirk_read(memory_region_name(&quirk->mem),
                                      vdev->vbasedev.name, quirk->data.bar,
                                      addr + base, size, data);
    } else {
        data = vfio_region_read(&vdev->bars[quirk->data.bar].region,
                                addr + base, size);
    }

    return data;
}

static void vfio_generic_quirk_write(void *opaque, hwaddr addr,
                                     uint64_t data, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;
    hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK;
    hwaddr offset = quirk->data.address_match & ~TARGET_PAGE_MASK;

    if (vfio_flags_enabled(quirk->data.flags, quirk->data.write_flags) &&
        ranges_overlap(addr, size, offset, quirk->data.address_mask + 1)) {
        if (!vfio_range_contained(addr, size, offset,
                                  quirk->data.address_mask + 1)) {
            hw_error("%s: write not fully contained: %s",
                     __func__, memory_region_name(&quirk->mem));
        }

        vfio_pci_write_config(&vdev->pdev, addr - offset, data, size);

        trace_vfio_generic_quirk_write(memory_region_name(&quirk->mem),
                                       vdev->vbasedev.name, quirk->data.bar,
                                       addr + base, data, size);
    } else {
        vfio_region_write(&vdev->bars[quirk->data.bar].region,
                          addr + base, data, size);
    }
}

static const MemoryRegionOps vfio_generic_quirk = {
    .read = vfio_generic_quirk_read,
    .write = vfio_generic_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

#define PCI_VENDOR_ID_ATI 0x1002

/*
 * Radeon HD cards (HD5450 & HD7850) report the upper byte of the I/O port BAR
 * through VGA register 0x3c3. On newer cards, the I/O port BAR is always
 * BAR4 (older cards like the X550 used BAR1, but we don't care to support
 * those). Note that on bare metal, a read of 0x3c3 doesn't always return the
 * I/O port BAR address. Originally this was coded to return the virtual BAR
 * address only if the physical register read returns the actual BAR address,
 * but users have reported greater success if we return the virtual address
 * unconditionally.
 */
static uint64_t vfio_ati_3c3_quirk_read(void *opaque,
                                        hwaddr addr, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;
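    /*
     * BAR4 sits at config offset PCI_BASE_ADDRESS_0 + 4 * 4; the +1 below
     * selects its second byte (bits 8:15), the "upper byte" that the
     * comment above refers to.
     */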
    uint64_t data = vfio_pci_read_config(&vdev->pdev,
                                         PCI_BASE_ADDRESS_0 + (4 * 4) + 1,
                                         size);
    trace_vfio_ati_3c3_quirk_read(data);

    return data;
}

static const MemoryRegionOps vfio_ati_3c3_quirk = {
    .read = vfio_ati_3c3_quirk_read,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev)
{
    PCIDevice *pdev = &vdev->pdev;
    VFIOQuirk *quirk;

    if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
        return;
    }

    /*
     * As long as the BAR is >= 256 bytes it will be aligned such that the
     * lower byte is always zero. Filter out anything else, if it exists.
     */
    if (!vdev->bars[4].ioport || vdev->bars[4].region.size < 256) {
        return;
    }

    quirk = g_malloc0(sizeof(*quirk));
    quirk->vdev = vdev;

    memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_ati_3c3_quirk, quirk,
                          "vfio-ati-3c3-quirk", 1);
    memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
                                3 /* offset 3 bytes from 0x3c0 */, &quirk->mem);

    QLIST_INSERT_HEAD(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks,
                      quirk, next);

    trace_vfio_vga_probe_ati_3c3_quirk(vdev->vbasedev.name);
}

/*
 * Newer ATI/AMD devices, including HD5450 and HD7850, have a window to PCI
 * config space through MMIO BAR2 at offset 0x4000. Nothing seems to access
 * the MMIO space directly, but a window to this space is provided through
 * I/O port BAR4. Offset 0x0 is the address register and offset 0x4 is the
 * data register. When the address is programmed to a range of 0x4000-0x4fff,
 * PCI configuration space is available. Experimentation seems to indicate
 * that only read-only access is provided, but we drop writes when the window
 * is enabled to config space nonetheless.
 */
static void vfio_probe_ati_bar4_window_quirk(VFIOPCIDevice *vdev, int nr)
{
    PCIDevice *pdev = &vdev->pdev;
    VFIOQuirk *quirk;

    if (!vdev->has_vga || nr != 4 ||
        pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
        return;
    }

    quirk = g_malloc0(sizeof(*quirk));
    quirk->vdev = vdev;
    quirk->data.address_size = 4;
    quirk->data.data_offset = 4;
    quirk->data.data_size = 4;
    quirk->data.address_match = 0x4000;
    quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1;
    quirk->data.bar = nr;
    quirk->data.read_flags = quirk->data.write_flags = 1;

    memory_region_init_io(&quirk->mem, OBJECT(vdev),
                          &vfio_generic_window_quirk, quirk,
                          "vfio-ati-bar4-window-quirk", 8);
    memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem,
                                        quirk->data.base_offset, &quirk->mem, 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    trace_vfio_probe_ati_bar4_window_quirk(vdev->vbasedev.name);
}

#define PCI_VENDOR_ID_REALTEK 0x10ec

/*
 * RTL8168 devices have a backdoor that can access the MSI-X table. At BAR2
 * offset 0x70 there is a dword data register, offset 0x74 is a dword address
 * register. According to the Linux r8169 driver, the MSI-X table is addressed
 * when the "type" portion of the address register is set to 0x1. This appears
 * to be bits 16:30. Bit 31 is both a write indicator and some sort of
 * "address latched" indicator. Bits 12:15 are a mask field, which we can
 * ignore because the MSI-X table should always be accessed as a dword (full
 * mask). Bits 0:11 are the offset within the type.
 *
 * Example trace:
 *
 * Read from MSI-X table offset 0
 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x1f000, 4) // store read addr
 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x8001f000 // latch
 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x70, 4) = 0xfee00398 // read data
 *
 * Write 0xfee00000 to MSI-X table offset 0
 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x70, 0xfee00000, 4) // write data
 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x8001f000, 4) // do write
 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x1f000 // complete
 */
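
/*
 * A minimal sketch (ours, not called anywhere) of decoding the address
 * register layout described above; the function name is our own, not
 * Realtek's:
 */
static inline bool __attribute__((unused))
vfio_rtl8168_addr_is_msix_sketch(uint32_t addr_reg)
{
    /* Bit 31 (write/latch) and bits 12:15 (byte mask) are ignored here;
     * only the type field in bits 16:30 is tested. */
    return ((addr_reg >> 16) & 0x7fff) == 0x1;
}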

static uint64_t vfio_rtl8168_window_quirk_read(void *opaque,
                                               hwaddr addr, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;
    uint64_t val = 0;

    if (!quirk->data.flags) { /* Non-MSI-X table access */
        return vfio_region_read(&vdev->bars[quirk->data.bar].region,
                                addr + 0x70, size);
    }

    switch (addr) {
    case 4: /* address */
        val = quirk->data.address_match ^ 0x80000000U; /* latch/complete */
        break;
    case 0: /* data */
        if ((vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX)) {
            memory_region_dispatch_read(&vdev->pdev.msix_table_mmio,
                                        (hwaddr)(quirk->data.address_match & 0xfff),
                                        &val, size, MEMTXATTRS_UNSPECIFIED);
        }
        break;
    }

    trace_vfio_rtl8168_quirk_read(vdev->vbasedev.name,
                                  addr ? "address" : "data", val);
    return val;
}

static void vfio_rtl8168_window_quirk_write(void *opaque, hwaddr addr,
                                            uint64_t data, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;

    switch (addr) {
    case 4: /* address */
        if ((data & 0x7fff0000) == 0x10000) { /* MSI-X table */
            quirk->data.flags = 1; /* Activate reads */
            quirk->data.address_match = data;

            trace_vfio_rtl8168_quirk_write(vdev->vbasedev.name, data);

            if (data & 0x80000000U) { /* Do write */
                if (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX) {
                    hwaddr offset = data & 0xfff;
                    uint64_t val = quirk->data.address_mask;

                    trace_vfio_rtl8168_quirk_msix(vdev->vbasedev.name,
                                                  (uint16_t)offset, val);

                    /* Write to the proper guest MSI-X table instead */
                    memory_region_dispatch_write(&vdev->pdev.msix_table_mmio,
                                                 offset, val, size,
                                                 MEMTXATTRS_UNSPECIFIED);
                }
                return; /* Do not write guest MSI-X data to hardware */
            }
        } else {
            quirk->data.flags = 0; /* De-activate reads, non-MSI-X */
        }
        break;
    case 0: /* data */
        quirk->data.address_mask = data;
        break;
    }

    vfio_region_write(&vdev->bars[quirk->data.bar].region,
                      addr + 0x70, data, size);
}

static const MemoryRegionOps vfio_rtl8168_window_quirk = {
    .read = vfio_rtl8168_window_quirk_read,
    .write = vfio_rtl8168_window_quirk_write,
    .valid = {
        .min_access_size = 4,
        .max_access_size = 4,
        .unaligned = false,
    },
    .endianness = DEVICE_LITTLE_ENDIAN,
};

static void vfio_probe_rtl8168_bar2_window_quirk(VFIOPCIDevice *vdev, int nr)
{
    PCIDevice *pdev = &vdev->pdev;
    VFIOQuirk *quirk;

    if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_REALTEK ||
        pci_get_word(pdev->config + PCI_DEVICE_ID) != 0x8168 || nr != 2) {
        return;
    }

    quirk = g_malloc0(sizeof(*quirk));
    quirk->vdev = vdev;
    quirk->data.bar = nr;

    memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_rtl8168_window_quirk,
                          quirk, "vfio-rtl8168-window-quirk", 8);
    memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem,
                                        0x70, &quirk->mem, 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    trace_vfio_rtl8168_quirk_enable(vdev->vbasedev.name);
}

/*
 * Trap the BAR2 MMIO window to config space as well.
 */
static void vfio_probe_ati_bar2_4000_quirk(VFIOPCIDevice *vdev, int nr)
{
    PCIDevice *pdev = &vdev->pdev;
    VFIOQuirk *quirk;

    /* Only enable on newer devices where BAR2 is 64bit */
    if (!vdev->has_vga || nr != 2 || !vdev->bars[2].mem64 ||
        pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
        return;
    }

    quirk = g_malloc0(sizeof(*quirk));
    quirk->vdev = vdev;
    quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1;
    quirk->data.address_match = 0x4000;
    quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1;
    quirk->data.bar = nr;

    memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_generic_quirk, quirk,
                          "vfio-ati-bar2-4000-quirk",
                          TARGET_PAGE_ALIGN(quirk->data.address_mask + 1));
    memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem,
                                        quirk->data.address_match & TARGET_PAGE_MASK,
                                        &quirk->mem, 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    trace_vfio_probe_ati_bar2_4000_quirk(vdev->vbasedev.name);
}

/*
 * Older ATI/AMD cards like the X550 have a similar window to that above.
 * I/O port BAR1 provides a window to a mirror of PCI config space located
 * in BAR2 at offset 0xf00. We don't care to support such older cards, but
 * note it for future reference.
 */

#define PCI_VENDOR_ID_NVIDIA 0x10de

/*
 * Nvidia has several different methods to get to config space; the
 * nouveau project has several of these documented here:
 * https://github.com/pathscale/envytools/tree/master/hwdocs
 *
 * The first quirk is actually not documented in envytools and is found
 * on 10de:01d1 (NVIDIA Corporation G72 [GeForce 7300 LE]). This is an
 * NV46 chipset. The backdoor uses the legacy VGA I/O ports to access
 * the mirror of PCI config space found at BAR0 offset 0x1800. The access
 * sequence first writes 0x338 to I/O port 0x3d4. The target offset is
 * then written to 0x3d0. Finally 0x538 is written for a read and 0x738
 * is written for a write to 0x3d4. The BAR0 offset is then accessible
 * through 0x3d0. This quirk doesn't seem to be necessary on newer cards
 * that use the I/O port BAR5 window but it doesn't hurt to leave it.
 */
enum {
    NV_3D0_NONE = 0,
    NV_3D0_SELECT,
    NV_3D0_WINDOW,
    NV_3D0_READ,
    NV_3D0_WRITE,
};
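
/*
 * Hypothetical guest sequence walking the state machine above (the port
 * protocol is from the comment; the config offset 0x44 is our example):
 *
 *   outw(0x3d4, 0x338);     NONE   -> SELECT
 *   outw(0x3d0, 0x1844);    SELECT -> WINDOW, address_val = 0x44
 *   outw(0x3d4, 0x538);     WINDOW -> READ
 *   inl(0x3d0);             returns PCI config space offset 0x44
 */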

static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque,
                                           hwaddr addr, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;
    PCIDevice *pdev = &vdev->pdev;
    uint64_t data = vfio_vga_read(&vdev->vga.region[QEMU_PCI_VGA_IO_HI],
                                  addr + quirk->data.base_offset, size);

    if (quirk->data.flags == NV_3D0_READ && addr == quirk->data.data_offset) {
        data = vfio_pci_read_config(pdev, quirk->data.address_val, size);
        trace_vfio_nvidia_3d0_quirk_read(size, data);
    }

    quirk->data.flags = NV_3D0_NONE;

    return data;
}

static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr,
                                        uint64_t data, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;
    PCIDevice *pdev = &vdev->pdev;

    switch (quirk->data.flags) {
    case NV_3D0_NONE:
        if (addr == quirk->data.address_offset && data == 0x338) {
            quirk->data.flags = NV_3D0_SELECT;
        }
        break;
    case NV_3D0_SELECT:
        quirk->data.flags = NV_3D0_NONE;
        if (addr == quirk->data.data_offset &&
            (data & ~quirk->data.address_mask) == quirk->data.address_match) {
            quirk->data.flags = NV_3D0_WINDOW;
            quirk->data.address_val = data & quirk->data.address_mask;
        }
        break;
    case NV_3D0_WINDOW:
        quirk->data.flags = NV_3D0_NONE;
        if (addr == quirk->data.address_offset) {
            if (data == 0x538) {
                quirk->data.flags = NV_3D0_READ;
            } else if (data == 0x738) {
                quirk->data.flags = NV_3D0_WRITE;
            }
        }
        break;
    case NV_3D0_WRITE:
        quirk->data.flags = NV_3D0_NONE;
        if (addr == quirk->data.data_offset) {
            vfio_pci_write_config(pdev, quirk->data.address_val, data, size);
            trace_vfio_nvidia_3d0_quirk_write(data, size);
            return;
        }
        break;
    }

    vfio_vga_write(&vdev->vga.region[QEMU_PCI_VGA_IO_HI],
                   addr + quirk->data.base_offset, data, size);
}

static const MemoryRegionOps vfio_nvidia_3d0_quirk = {
    .read = vfio_nvidia_3d0_quirk_read,
    .write = vfio_nvidia_3d0_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

static void vfio_vga_probe_nvidia_3d0_quirk(VFIOPCIDevice *vdev)
{
    PCIDevice *pdev = &vdev->pdev;
    VFIOQuirk *quirk;

    if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA ||
        !vdev->bars[1].region.size) {
        return;
    }

    quirk = g_malloc0(sizeof(*quirk));
    quirk->vdev = vdev;
    quirk->data.base_offset = 0x10;
    quirk->data.address_offset = 4;
    quirk->data.address_size = 2;
    quirk->data.address_match = 0x1800;
    quirk->data.address_mask = PCI_CONFIG_SPACE_SIZE - 1;
    quirk->data.data_offset = 0;
    quirk->data.data_size = 4;

    memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_nvidia_3d0_quirk,
                          quirk, "vfio-nvidia-3d0-quirk", 6);
    memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
                                quirk->data.base_offset, &quirk->mem);

    QLIST_INSERT_HEAD(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks,
                      quirk, next);

    trace_vfio_vga_probe_nvidia_3d0_quirk(vdev->vbasedev.name);
}

/*
 * The second quirk is documented in envytools. The I/O port BAR5 is just
 * a set of address/data ports to the MMIO BARs. The BAR we care about is
 * again BAR0. This backdoor is apparently a bit newer than the one above,
 * so we need to trap not only the 256 bytes @0x1800, but all of PCI config
 * space, including extended space, which is available through the 4k
 * window @0x88000.
 */
enum {
    NV_BAR5_ADDRESS = 0x1,
    NV_BAR5_ENABLE = 0x2,
    NV_BAR5_MASTER = 0x4,
    NV_BAR5_VALID = 0x7,
};

static void vfio_nvidia_bar5_window_quirk_write(void *opaque, hwaddr addr,
                                                uint64_t data, unsigned size)
{
    VFIOQuirk *quirk = opaque;

    switch (addr) {
    case 0x0:
        if (data & 0x1) {
            quirk->data.flags |= NV_BAR5_MASTER;
        } else {
            quirk->data.flags &= ~NV_BAR5_MASTER;
        }
        break;
    case 0x4:
        if (data & 0x1) {
            quirk->data.flags |= NV_BAR5_ENABLE;
        } else {
            quirk->data.flags &= ~NV_BAR5_ENABLE;
        }
        break;
    case 0x8:
        if (quirk->data.flags & NV_BAR5_MASTER) {
            if ((data & ~0xfff) == 0x88000) {
                quirk->data.flags |= NV_BAR5_ADDRESS;
                quirk->data.address_val = data & 0xfff;
            } else if ((data & ~0xff) == 0x1800) {
                quirk->data.flags |= NV_BAR5_ADDRESS;
                quirk->data.address_val = data & 0xff;
            } else {
                quirk->data.flags &= ~NV_BAR5_ADDRESS;
            }
        }
        break;
    }

    vfio_generic_window_quirk_write(opaque, addr, data, size);
}

static const MemoryRegionOps vfio_nvidia_bar5_window_quirk = {
    .read = vfio_generic_window_quirk_read,
    .write = vfio_nvidia_bar5_window_quirk_write,
    .valid.min_access_size = 4,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

static void vfio_probe_nvidia_bar5_window_quirk(VFIOPCIDevice *vdev, int nr)
{
    PCIDevice *pdev = &vdev->pdev;
    VFIOQuirk *quirk;

    if (!vdev->has_vga || nr != 5 ||
        pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) {
        return;
    }

    quirk = g_malloc0(sizeof(*quirk));
    quirk->vdev = vdev;
    quirk->data.read_flags = quirk->data.write_flags = NV_BAR5_VALID;
    quirk->data.address_offset = 0x8;
    quirk->data.address_size = 0; /* actually 4, but avoids generic code */
    quirk->data.data_offset = 0xc;
    quirk->data.data_size = 4;
    quirk->data.bar = nr;

    memory_region_init_io(&quirk->mem, OBJECT(vdev),
                          &vfio_nvidia_bar5_window_quirk, quirk,
                          "vfio-nvidia-bar5-window-quirk", 16);
    memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem,
                                        0, &quirk->mem, 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    trace_vfio_probe_nvidia_bar5_window_quirk(vdev->vbasedev.name);
}

static void vfio_nvidia_88000_quirk_write(void *opaque, hwaddr addr,
                                          uint64_t data, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;
    PCIDevice *pdev = &vdev->pdev;
    hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK;

    vfio_generic_quirk_write(opaque, addr, data, size);

    /*
     * Nvidia seems to acknowledge MSI interrupts by writing 0xff to the
     * MSI capability ID register. Both the ID and next register are
     * read-only, so we allow writes covering either of those to real hw.
     * NB - only fixed for the 0x88000 MMIO window.
     */
    if ((pdev->cap_present & QEMU_PCI_CAP_MSI) &&
        vfio_range_contained(addr, size, pdev->msi_cap, PCI_MSI_FLAGS)) {
        vfio_region_write(&vdev->bars[quirk->data.bar].region,
                          addr + base, data, size);
    }
}

static const MemoryRegionOps vfio_nvidia_88000_quirk = {
    .read = vfio_generic_quirk_read,
    .write = vfio_nvidia_88000_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

/*
 * Finally, BAR0 itself. We want to redirect any accesses to either
 * 0x1800 or 0x88000 through the PCI config space access functions.
 *
 * NB - quirk at page granularity, or else the quirks don't seem to work
 * when BARs are mmap'd.
 *
 * Here's offset 0x88000...
 */
static void vfio_probe_nvidia_bar0_88000_quirk(VFIOPCIDevice *vdev, int nr)
{
    PCIDevice *pdev = &vdev->pdev;
    VFIOQuirk *quirk;
    uint16_t vendor, class;

    vendor = pci_get_word(pdev->config + PCI_VENDOR_ID);
    class = pci_get_word(pdev->config + PCI_CLASS_DEVICE);

    if (nr != 0 || vendor != PCI_VENDOR_ID_NVIDIA ||
        class != PCI_CLASS_DISPLAY_VGA) {
        return;
    }

    quirk = g_malloc0(sizeof(*quirk));
    quirk->vdev = vdev;
    quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1;
    quirk->data.address_match = 0x88000;
    quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1;
    quirk->data.bar = nr;

    memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_nvidia_88000_quirk,
                          quirk, "vfio-nvidia-bar0-88000-quirk",
                          TARGET_PAGE_ALIGN(quirk->data.address_mask + 1));
    memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem,
                                        quirk->data.address_match & TARGET_PAGE_MASK,
                                        &quirk->mem, 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    trace_vfio_probe_nvidia_bar0_88000_quirk(vdev->vbasedev.name);
}

/*
 * And here's the same for BAR0 offset 0x1800...
 */
static void vfio_probe_nvidia_bar0_1800_quirk(VFIOPCIDevice *vdev, int nr)
{
    PCIDevice *pdev = &vdev->pdev;
    VFIOQuirk *quirk;

    if (!vdev->has_vga || nr != 0 ||
        pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) {
        return;
    }

    /* Log the chipset ID */
    trace_vfio_probe_nvidia_bar0_1800_quirk_id(
        (unsigned int)(vfio_region_read(&vdev->bars[0].region, 0, 4) >> 20)
        & 0xff);

    quirk = g_malloc0(sizeof(*quirk));
    quirk->vdev = vdev;
    quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1;
    quirk->data.address_match = 0x1800;
    quirk->data.address_mask = PCI_CONFIG_SPACE_SIZE - 1;
    quirk->data.bar = nr;

    memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_generic_quirk, quirk,
                          "vfio-nvidia-bar0-1800-quirk",
                          TARGET_PAGE_ALIGN(quirk->data.address_mask + 1));
    memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem,
                                        quirk->data.address_match & TARGET_PAGE_MASK,
                                        &quirk->mem, 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    trace_vfio_probe_nvidia_bar0_1800_quirk(vdev->vbasedev.name);
}
1830
1831 /*
1832 * TODO - Some Nvidia devices provide config access to their companion HDA
1833 * device and even to their parent bridge via these config space mirrors.
1834 * Add quirks for those regions.
1835 */
1836
1837 /*
1838 * Common quirk probe entry points.
1839 */
1840 static void vfio_vga_quirk_setup(VFIOPCIDevice *vdev)
1841 {
1842 vfio_vga_probe_ati_3c3_quirk(vdev);
1843 vfio_vga_probe_nvidia_3d0_quirk(vdev);
1844 }
1845
1846 static void vfio_vga_quirk_teardown(VFIOPCIDevice *vdev)
1847 {
1848 VFIOQuirk *quirk;
1849 int i;
1850
1851 for (i = 0; i < ARRAY_SIZE(vdev->vga.region); i++) {
1852 QLIST_FOREACH(quirk, &vdev->vga.region[i].quirks, next) {
1853 memory_region_del_subregion(&vdev->vga.region[i].mem, &quirk->mem);
1854 }
1855 }
1856 }
1857
1858 static void vfio_vga_quirk_free(VFIOPCIDevice *vdev)
1859 {
1860 int i;
1861
1862 for (i = 0; i < ARRAY_SIZE(vdev->vga.region); i++) {
1863 while (!QLIST_EMPTY(&vdev->vga.region[i].quirks)) {
1864 VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga.region[i].quirks);
1865 object_unparent(OBJECT(&quirk->mem));
1866 QLIST_REMOVE(quirk, next);
1867 g_free(quirk);
1868 }
1869 }
1870 }
1871
1872 static void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr)
1873 {
1874 vfio_probe_ati_bar4_window_quirk(vdev, nr);
1875 vfio_probe_ati_bar2_4000_quirk(vdev, nr);
1876 vfio_probe_nvidia_bar5_window_quirk(vdev, nr);
1877 vfio_probe_nvidia_bar0_88000_quirk(vdev, nr);
1878 vfio_probe_nvidia_bar0_1800_quirk(vdev, nr);
1879 vfio_probe_rtl8168_bar2_window_quirk(vdev, nr);
1880 }
1881
1882 static void vfio_bar_quirk_teardown(VFIOPCIDevice *vdev, int nr)
1883 {
1884 VFIOBAR *bar = &vdev->bars[nr];
1885 VFIOQuirk *quirk;
1886
1887 QLIST_FOREACH(quirk, &bar->quirks, next) {
1888 memory_region_del_subregion(&bar->region.mem, &quirk->mem);
1889 }
1890 }
1891
1892 static void vfio_bar_quirk_free(VFIOPCIDevice *vdev, int nr)
1893 {
1894 VFIOBAR *bar = &vdev->bars[nr];
1895
1896 while (!QLIST_EMPTY(&bar->quirks)) {
1897 VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks);
1898 object_unparent(OBJECT(&quirk->mem));
1899 QLIST_REMOVE(quirk, next);
1900 g_free(quirk);
1901 }
1902 }
1903
1904 /*
1905 * PCI config space
1906 */
1907 static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
1908 {
1909 VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
1910 uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
1911
1912 memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
1913 emu_bits = le32_to_cpu(emu_bits);
1914
1915 if (emu_bits) {
1916 emu_val = pci_default_read_config(pdev, addr, len);
1917 }
1918
1919 if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
1920 ssize_t ret;
1921
1922 ret = pread(vdev->vbasedev.fd, &phys_val, len,
1923 vdev->config_offset + addr);
1924 if (ret != len) {
1925 error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %m",
1926 __func__, vdev->host.domain, vdev->host.bus,
1927 vdev->host.slot, vdev->host.function, addr, len);
1928 return -errno;
1929 }
1930 phys_val = le32_to_cpu(phys_val);
1931 }
1932
1933 val = (emu_val & emu_bits) | (phys_val & ~emu_bits);
1934
1935 trace_vfio_pci_read_config(vdev->vbasedev.name, addr, len, val);
1936
1937 return val;
1938 }
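/*
 * Worked example of the merge above, with hypothetical values: for a
 * 2-byte read where only the low byte is emulated, emu_bits is 0x00ff
 * after the little-endian load. If the emulated config returns 0xaa55
 * and the physical read returns 0x1234, the result is
 * (0xaa55 & 0x00ff) | (0x1234 & ~0x00ff) = 0x1255 - emulated low byte,
 * physical high byte.
 */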
1939
1940 static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
1941 uint32_t val, int len)
1942 {
1943 VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
1944 uint32_t val_le = cpu_to_le32(val);
1945
1946 trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);
1947
1948 /* Write everything to VFIO, let it filter out what we can't write */
1949 if (pwrite(vdev->vbasedev.fd, &val_le, len, vdev->config_offset + addr)
1950 != len) {
1951 error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %m",
1952 __func__, vdev->host.domain, vdev->host.bus,
1953 vdev->host.slot, vdev->host.function, addr, val, len);
1954 }
1955
1956 /* MSI/MSI-X Enabling/Disabling */
1957 if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
1958 ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
1959 int is_enabled, was_enabled = msi_enabled(pdev);
1960
1961 pci_default_write_config(pdev, addr, val, len);
1962
1963 is_enabled = msi_enabled(pdev);
1964
1965 if (!was_enabled) {
1966 if (is_enabled) {
1967 vfio_msi_enable(vdev);
1968 }
1969 } else {
1970 if (!is_enabled) {
1971 vfio_msi_disable(vdev);
1972 } else {
1973 vfio_update_msi(vdev);
1974 }
1975 }
1976 } else if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
1977 ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
1978 int is_enabled, was_enabled = msix_enabled(pdev);
1979
1980 pci_default_write_config(pdev, addr, val, len);
1981
1982 is_enabled = msix_enabled(pdev);
1983
1984 if (!was_enabled && is_enabled) {
1985 vfio_msix_enable(vdev);
1986 } else if (was_enabled && !is_enabled) {
1987 vfio_msix_disable(vdev);
1988 }
1989 } else {
1990 /* Write everything to QEMU to keep emulated bits correct */
1991 pci_default_write_config(pdev, addr, val, len);
1992 }
1993 }
1994
1995 /*
1996 * Interrupt setup
1997 */
1998 static void vfio_disable_interrupts(VFIOPCIDevice *vdev)
1999 {
2000 /*
2001 * More complicated than it looks. Disabling MSI/X transitions the
2002 * device to INTx mode (if supported). Therefore we need to first
2003 * disable MSI/X and then clean up by disabling INTx.
2004 */
2005 if (vdev->interrupt == VFIO_INT_MSIX) {
2006 vfio_msix_disable(vdev);
2007 } else if (vdev->interrupt == VFIO_INT_MSI) {
2008 vfio_msi_disable(vdev);
2009 }
2010
2011 if (vdev->interrupt == VFIO_INT_INTx) {
2012 vfio_intx_disable(vdev);
2013 }
2014 }
2015
2016 static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos)
2017 {
2018 uint16_t ctrl;
2019 bool msi_64bit, msi_maskbit;
2020 int ret, entries;
2021
2022 if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl),
2023 vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
2024 return -errno;
2025 }
2026 ctrl = le16_to_cpu(ctrl);
2027
2028 msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
2029 msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
2030 entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
2031
2032 trace_vfio_msi_setup(vdev->vbasedev.name, pos);
2033
2034 ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit);
2035 if (ret < 0) {
2036 if (ret == -ENOTSUP) {
2037 return 0;
2038 }
2039 error_report("vfio: msi_init failed");
2040 return ret;
2041 }
2042 vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
2043
2044 return 0;
2045 }
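/*
 * The size computed above follows the PCI MSI layout: 0xa bytes for the
 * ID, next pointer, control, address, and data fields; 0x4 more for the
 * upper dword of a 64-bit address; and 0xa more when per-vector masking
 * adds the mask and pending registers (plus alignment padding). A 64-bit,
 * maskable capability thus spans 0xa + 0xa + 0x4 = 0x18 bytes.
 */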
2046
2047 /*
2048 * We don't have any control over how pci_add_capability() inserts
2049 * capabilities into the chain. In order to set up MSI-X we need a
2050 * MemoryRegion for the BAR. In order to set up the BAR and not
2051 * attempt to mmap the MSI-X table area, which VFIO won't allow, we
2052 * need to first look for where the MSI-X table lives. So we
2053 * unfortunately split MSI-X setup across two functions.
2054 */
2055 static int vfio_msix_early_setup(VFIOPCIDevice *vdev)
2056 {
2057 uint8_t pos;
2058 uint16_t ctrl;
2059 uint32_t table, pba;
2060 int fd = vdev->vbasedev.fd;
2061 VFIOMSIXInfo *msix;
2062
2063 pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
2064 if (!pos) {
2065 return 0;
2066 }
2067
2068 if (pread(fd, &ctrl, sizeof(ctrl),
2069 vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
2070 return -errno;
2071 }
2072
2073 if (pread(fd, &table, sizeof(table),
2074 vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
2075 return -errno;
2076 }
2077
2078 if (pread(fd, &pba, sizeof(pba),
2079 vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
2080 return -errno;
2081 }
2082
2083 ctrl = le16_to_cpu(ctrl);
2084 table = le32_to_cpu(table);
2085 pba = le32_to_cpu(pba);
2086
2087 msix = g_malloc0(sizeof(*msix));
2088 msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
2089 msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
2090 msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
2091 msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
2092 msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
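/*
 * Worked example of the decoding above, with hypothetical register
 * values: a table dword of 0x00002003 yields table_bar = 3 and
 * table_offset = 0x2000 (PCI_MSIX_FLAGS_BIRMASK is 0x7), and a control
 * word with 0x0007 in PCI_MSIX_FLAGS_QSIZE encodes 8 table entries.
 */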
2093
2094 /*
2095 * Check whether the PBA offset extends outside of the specified BAR. If
2096 * it does, we apply a hardware-specific quirk when the device is known,
2097 * or otherwise fail on the broken configuration.
2098 */
2099 if (msix->pba_offset >= vdev->bars[msix->pba_bar].region.size) {
2100 PCIDevice *pdev = &vdev->pdev;
2101 uint16_t vendor = pci_get_word(pdev->config + PCI_VENDOR_ID);
2102 uint16_t device = pci_get_word(pdev->config + PCI_DEVICE_ID);
2103
2104 /*
2105 * Chelsio T5 Virtual Function devices are encoded as 0x58xx for T5
2106 * adapters. The T5 hardware returns an incorrect value of 0x8000 for
2107 * the VF PBA offset while the BAR itself is only 8k. The correct value
2108 * is 0x1000, so we hard code that here.
2109 */
2110 if (vendor == PCI_VENDOR_ID_CHELSIO && (device & 0xff00) == 0x5800) {
2111 msix->pba_offset = 0x1000;
2112 } else {
2113 error_report("vfio: Hardware reports invalid configuration, "
2114 "MSIX PBA outside of specified BAR");
2115 g_free(msix);
2116 return -EINVAL;
2117 }
2118 }
2119
2120 trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar,
2121 msix->table_offset, msix->entries);
2122 vdev->msix = msix;
2123
2124 return 0;
2125 }
2126
2127 static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos)
2128 {
2129 int ret;
2130
2131 ret = msix_init(&vdev->pdev, vdev->msix->entries,
2132 &vdev->bars[vdev->msix->table_bar].region.mem,
2133 vdev->msix->table_bar, vdev->msix->table_offset,
2134 &vdev->bars[vdev->msix->pba_bar].region.mem,
2135 vdev->msix->pba_bar, vdev->msix->pba_offset, pos);
2136 if (ret < 0) {
2137 if (ret == -ENOTSUP) {
2138 return 0;
2139 }
2140 error_report("vfio: msix_init failed");
2141 return ret;
2142 }
2143
2144 return 0;
2145 }
2146
2147 static void vfio_teardown_msi(VFIOPCIDevice *vdev)
2148 {
2149 msi_uninit(&vdev->pdev);
2150
2151 if (vdev->msix) {
2152 msix_uninit(&vdev->pdev,
2153 &vdev->bars[vdev->msix->table_bar].region.mem,
2154 &vdev->bars[vdev->msix->pba_bar].region.mem);
2155 }
2156 }
2157
2158 /*
2159 * Resource setup
2160 */
2161 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled)
2162 {
2163 int i;
2164
2165 for (i = 0; i < PCI_ROM_SLOT; i++) {
2166 VFIOBAR *bar = &vdev->bars[i];
2167
2168 if (!bar->region.size) {
2169 continue;
2170 }
2171
2172 memory_region_set_enabled(&bar->region.mmap_mem, enabled);
2173 if (vdev->msix && vdev->msix->table_bar == i) {
2174 memory_region_set_enabled(&vdev->msix->mmap_mem, enabled);
2175 }
2176 }
2177 }
2178
2179 static void vfio_unregister_bar(VFIOPCIDevice *vdev, int nr)
2180 {
2181 VFIOBAR *bar = &vdev->bars[nr];
2182
2183 if (!bar->region.size) {
2184 return;
2185 }
2186
2187 vfio_bar_quirk_teardown(vdev, nr);
2188
2189 memory_region_del_subregion(&bar->region.mem, &bar->region.mmap_mem);
2190
2191 if (vdev->msix && vdev->msix->table_bar == nr) {
2192 memory_region_del_subregion(&bar->region.mem, &vdev->msix->mmap_mem);
2193 }
2194 }
2195
2196 static void vfio_unmap_bar(VFIOPCIDevice *vdev, int nr)
2197 {
2198 VFIOBAR *bar = &vdev->bars[nr];
2199
2200 if (!bar->region.size) {
2201 return;
2202 }
2203
2204 vfio_bar_quirk_free(vdev, nr);
2205
2206 munmap(bar->region.mmap, memory_region_size(&bar->region.mmap_mem));
2207
2208 if (vdev->msix && vdev->msix->table_bar == nr) {
2209 munmap(vdev->msix->mmap, memory_region_size(&vdev->msix->mmap_mem));
2210 }
2211 }
2212
2213 static void vfio_map_bar(VFIOPCIDevice *vdev, int nr)
2214 {
2215 VFIOBAR *bar = &vdev->bars[nr];
2216 uint64_t size = bar->region.size;
2217 char name[64];
2218 uint32_t pci_bar;
2219 uint8_t type;
2220 int ret;
2221
2222 /* Skip both unimplemented BARs and the upper half of 64-bit BARs. */
2223 if (!size) {
2224 return;
2225 }
2226
2227 snprintf(name, sizeof(name), "VFIO %04x:%02x:%02x.%x BAR %d",
2228 vdev->host.domain, vdev->host.bus, vdev->host.slot,
2229 vdev->host.function, nr);
2230
2231 /* Determine what type of BAR this is for registration */
2232 ret = pread(vdev->vbasedev.fd, &pci_bar, sizeof(pci_bar),
2233 vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
2234 if (ret != sizeof(pci_bar)) {
2235 error_report("vfio: Failed to read BAR %d (%m)", nr);
2236 return;
2237 }
2238
2239 pci_bar = le32_to_cpu(pci_bar);
2240 bar->ioport = (pci_bar & PCI_BASE_ADDRESS_SPACE_IO);
2241 bar->mem64 = bar->ioport ? 0 : (pci_bar & PCI_BASE_ADDRESS_MEM_TYPE_64);
2242 type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK :
2243 ~PCI_BASE_ADDRESS_MEM_MASK);
2244
2245 /* A "slow" read/write mapping underlies all BARs */
2246 memory_region_init_io(&bar->region.mem, OBJECT(vdev), &vfio_region_ops,
2247 bar, name, size);
2248 pci_register_bar(&vdev->pdev, nr, type, &bar->region.mem);
2249
2250 /*
2251 * We can't mmap areas overlapping the MSIX vector table, so we
2252 * potentially insert a direct-mapped subregion before and after it.
2253 */
2254 if (vdev->msix && vdev->msix->table_bar == nr) {
2255 size = vdev->msix->table_offset & qemu_real_host_page_mask;
2256 }
2257
2258 strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
2259 if (vfio_mmap_region(OBJECT(vdev), &bar->region, &bar->region.mem,
2260 &bar->region.mmap_mem, &bar->region.mmap,
2261 size, 0, name)) {
2262 error_report("%s unsupported. Performance may be slow", name);
2263 }
2264
2265 if (vdev->msix && vdev->msix->table_bar == nr) {
2266 uint64_t start;
2267
2268 start = REAL_HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset +
2269 (vdev->msix->entries *
2270 PCI_MSIX_ENTRY_SIZE));
2271
2272 size = start < bar->region.size ? bar->region.size - start : 0;
2273 strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1);
2274 /* VFIOMSIXInfo contains another MemoryRegion for this mapping */
2275 if (vfio_mmap_region(OBJECT(vdev), &bar->region, &bar->region.mem,
2276 &vdev->msix->mmap_mem,
2277 &vdev->msix->mmap, size, start, name)) {
2278 error_report("%s unsupported. Performance may be slow", name);
2279 }
2280 }
2281
2282 vfio_bar_quirk_setup(vdev, nr);
2283 }
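/*
 * Worked example of the mmap split above, assuming 4k host pages and
 * hypothetical values: with table_offset = 0x3000 and 8 entries of
 * PCI_MSIX_ENTRY_SIZE (16) bytes, the table spans 0x3000-0x307f. The
 * first mmap then covers [0, 0x3000) and the "msix-hi" mmap starts at
 * REAL_HOST_PAGE_ALIGN(0x3080) = 0x4000, leaving the page holding the
 * table to the slow read/write path.
 */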
2284
2285 static void vfio_map_bars(VFIOPCIDevice *vdev)
2286 {
2287 int i;
2288
2289 for (i = 0; i < PCI_ROM_SLOT; i++) {
2290 vfio_map_bar(vdev, i);
2291 }
2292
2293 if (vdev->has_vga) {
2294 memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_MEM].mem,
2295 OBJECT(vdev), &vfio_vga_ops,
2296 &vdev->vga.region[QEMU_PCI_VGA_MEM],
2297 "vfio-vga-mmio@0xa0000",
2298 QEMU_PCI_VGA_MEM_SIZE);
2299 memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem,
2300 OBJECT(vdev), &vfio_vga_ops,
2301 &vdev->vga.region[QEMU_PCI_VGA_IO_LO],
2302 "vfio-vga-io@0x3b0",
2303 QEMU_PCI_VGA_IO_LO_SIZE);
2304 memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
2305 OBJECT(vdev), &vfio_vga_ops,
2306 &vdev->vga.region[QEMU_PCI_VGA_IO_HI],
2307 "vfio-vga-io@0x3c0",
2308 QEMU_PCI_VGA_IO_HI_SIZE);
2309
2310 pci_register_vga(&vdev->pdev, &vdev->vga.region[QEMU_PCI_VGA_MEM].mem,
2311 &vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem,
2312 &vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem);
2313 vfio_vga_quirk_setup(vdev);
2314 }
2315 }
2316
2317 static void vfio_unregister_bars(VFIOPCIDevice *vdev)
2318 {
2319 int i;
2320
2321 for (i = 0; i < PCI_ROM_SLOT; i++) {
2322 vfio_unregister_bar(vdev, i);
2323 }
2324
2325 if (vdev->has_vga) {
2326 vfio_vga_quirk_teardown(vdev);
2327 pci_unregister_vga(&vdev->pdev);
2328 }
2329 }
2330
2331 static void vfio_unmap_bars(VFIOPCIDevice *vdev)
2332 {
2333 int i;
2334
2335 for (i = 0; i < PCI_ROM_SLOT; i++) {
2336 vfio_unmap_bar(vdev, i);
2337 }
2338
2339 if (vdev->has_vga) {
2340 vfio_vga_quirk_free(vdev);
2341 }
2342 }
2343
2344 /*
2345 * General setup
2346 */
2347 static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
2348 {
2349 uint8_t tmp, next = 0xff;
2350
2351 for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp;
2352 tmp = pdev->config[tmp + 1]) {
2353 if (tmp > pos && tmp < next) {
2354 next = tmp;
2355 }
2356 }
2357
2358 return next - pos;
2359 }
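/*
 * Worked example, with a hypothetical chain: for capabilities at 0x40,
 * 0x50, and 0x60, a call with pos = 0x50 finds next = 0x60 and returns
 * 0x10. For the last capability at 0x60 nothing follows, next stays
 * 0xff, and the returned size of 0x9f covers the rest of standard
 * config space.
 */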
2360
2361 static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
2362 {
2363 pci_set_word(buf, (pci_get_word(buf) & ~mask) | val);
2364 }
2365
2366 static void vfio_add_emulated_word(VFIOPCIDevice *vdev, int pos,
2367 uint16_t val, uint16_t mask)
2368 {
2369 vfio_set_word_bits(vdev->pdev.config + pos, val, mask);
2370 vfio_set_word_bits(vdev->pdev.wmask + pos, ~mask, mask);
2371 vfio_set_word_bits(vdev->emulated_config_bits + pos, mask, mask);
2372 }
2373
2374 static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask)
2375 {
2376 pci_set_long(buf, (pci_get_long(buf) & ~mask) | val);
2377 }
2378
2379 static void vfio_add_emulated_long(VFIOPCIDevice *vdev, int pos,
2380 uint32_t val, uint32_t mask)
2381 {
2382 vfio_set_long_bits(vdev->pdev.config + pos, val, mask);
2383 vfio_set_long_bits(vdev->pdev.wmask + pos, ~mask, mask);
2384 vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask);
2385 }
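/*
 * Each vfio_add_emulated_*() call updates three parallel views of config
 * space: pdev.config receives the emulated value, pdev.wmask is cleared
 * under the mask so guest writes to those bits are dropped, and
 * emulated_config_bits flags them as served by QEMU rather than read
 * from the device. For example, the
 * vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0) call in
 * vfio_setup_pcie_cap() below turns Link Control into a register that
 * reads as zero and ignores guest writes.
 */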
2386
2387 static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size)
2388 {
2389 uint16_t flags;
2390 uint8_t type;
2391
2392 flags = pci_get_word(vdev->pdev.config + pos + PCI_CAP_FLAGS);
2393 type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
2394
2395 if (type != PCI_EXP_TYPE_ENDPOINT &&
2396 type != PCI_EXP_TYPE_LEG_END &&
2397 type != PCI_EXP_TYPE_RC_END) {
2398
2399 error_report("vfio: Assignment of PCIe type 0x%x "
2400 "devices is not currently supported", type);
2401 return -EINVAL;
2402 }
2403
2404 if (!pci_bus_is_express(vdev->pdev.bus)) {
2405 /*
2406 * Use the express capability as-is on a PCI bus. It doesn't make much
2407 * sense to even expose it, but some drivers (e.g. tg3) depend on it
2408 * and guests don't seem to be particular about it. We'll need
2409 * to revisit this or force express devices onto express buses if we
2410 * ever expose an IOMMU to the guest.
2411 */
2412 } else if (pci_bus_is_root(vdev->pdev.bus)) {
2413 /*
2414 * On a Root Complex bus Endpoints become Root Complex Integrated
2415 * Endpoints, which changes the type and clears the LNK & LNK2 fields.
2416 */
2417 if (type == PCI_EXP_TYPE_ENDPOINT) {
2418 vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
2419 PCI_EXP_TYPE_RC_END << 4,
2420 PCI_EXP_FLAGS_TYPE);
2421
2422 /* Link Capabilities, Status, and Control go away */
2423 if (size > PCI_EXP_LNKCTL) {
2424 vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP, 0, ~0);
2425 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
2426 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA, 0, ~0);
2427
2428 #ifndef PCI_EXP_LNKCAP2
2429 #define PCI_EXP_LNKCAP2 44
2430 #endif
2431 #ifndef PCI_EXP_LNKSTA2
2432 #define PCI_EXP_LNKSTA2 50
2433 #endif
2434 /* Link 2 Capabilities, Status, and Control go away */
2435 if (size > PCI_EXP_LNKCAP2) {
2436 vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP2, 0, ~0);
2437 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL2, 0, ~0);
2438 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA2, 0, ~0);
2439 }
2440 }
2441
2442 } else if (type == PCI_EXP_TYPE_LEG_END) {
2443 /*
2444 * Legacy endpoints don't belong on the root complex. Windows
2445 * seems to be happier with devices if we skip the capability.
2446 */
2447 return 0;
2448 }
2449
2450 } else {
2451 /*
2452 * Convert Root Complex Integrated Endpoints to regular endpoints.
2453 * These devices don't support LNK/LNK2 capabilities, so make them up.
2454 */
2455 if (type == PCI_EXP_TYPE_RC_END) {
2456 vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
2457 PCI_EXP_TYPE_ENDPOINT << 4,
2458 PCI_EXP_FLAGS_TYPE);
2459 vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP,
2460 PCI_EXP_LNK_MLW_1 | PCI_EXP_LNK_LS_25, ~0);
2461 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
2462 }
2463
2464 /* Mark the Link Status bits as emulated to allow virtual negotiation */
2465 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA,
2466 pci_get_word(vdev->pdev.config + pos +
2467 PCI_EXP_LNKSTA),
2468 PCI_EXP_LNKCAP_MLW | PCI_EXP_LNKCAP_SLS);
2469 }
2470
2471 pos = pci_add_capability(&vdev->pdev, PCI_CAP_ID_EXP, pos, size);
2472 if (pos >= 0) {
2473 vdev->pdev.exp.exp_cap = pos;
2474 }
2475
2476 return pos;
2477 }
2478
2479 static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos)
2480 {
2481 uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP);
2482
2483 if (cap & PCI_EXP_DEVCAP_FLR) {
2484 trace_vfio_check_pcie_flr(vdev->vbasedev.name);
2485 vdev->has_flr = true;
2486 }
2487 }
2488
2489 static void vfio_check_pm_reset(VFIOPCIDevice *vdev, uint8_t pos)
2490 {
2491 uint16_t csr = pci_get_word(vdev->pdev.config + pos + PCI_PM_CTRL);
2492
2493 if (!(csr & PCI_PM_CTRL_NO_SOFT_RESET)) {
2494 trace_vfio_check_pm_reset(vdev->vbasedev.name);
2495 vdev->has_pm_reset = true;
2496 }
2497 }
2498
2499 static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos)
2500 {
2501 uint8_t cap = pci_get_byte(vdev->pdev.config + pos + PCI_AF_CAP);
2502
2503 if ((cap & PCI_AF_CAP_TP) && (cap & PCI_AF_CAP_FLR)) {
2504 trace_vfio_check_af_flr(vdev->vbasedev.name);
2505 vdev->has_flr = true;
2506 }
2507 }
2508
2509 static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
2510 {
2511 PCIDevice *pdev = &vdev->pdev;
2512 uint8_t cap_id, next, size;
2513 int ret;
2514
2515 cap_id = pdev->config[pos];
2516 next = pdev->config[pos + 1];
2517
2518 /*
2519 * If it ever becomes important to configure capabilities to their actual
2520 * size, use this as the default when we don't recognize the capability ID.
2521 * Since QEMU doesn't actually handle many of the config accesses, getting
2522 * the exact size doesn't seem worthwhile.
2523 */
2524 size = vfio_std_cap_max_size(pdev, pos);
2525
2526 /*
2527 * pci_add_capability always inserts the new capability at the head
2528 * of the chain. Therefore to end up with a chain that matches the
2529 * physical device, we insert from the end by making this recursive.
2530 * This is also why we pre-calculate size above, as cached config space
2531 * will be changed as we unwind the stack.
2532 */
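/*
 * Worked example, with a hypothetical chain: for a physical layout
 * 0x40 -> 0x50 -> 0x60, the recursion descends to 0x60 first, so
 * pci_add_capability() sees 0x60, then 0x50, then 0x40. Since each call
 * inserts at the head, the rebuilt virtual chain ends up in the original
 * 0x40 -> 0x50 -> 0x60 order.
 */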
2533 if (next) {
2534 ret = vfio_add_std_cap(vdev, next);
2535 if (ret) {
2536 return ret;
2537 }
2538 } else {
2539 /* Begin the rebuild, use QEMU emulated list bits */
2540 pdev->config[PCI_CAPABILITY_LIST] = 0;
2541 vdev->emulated_config_bits[PCI_CAPABILITY_LIST] = 0xff;
2542 vdev->emulated_config_bits[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
2543 }
2544
2545 /* Use emulated next pointer to allow dropping caps */
2546 pci_set_byte(vdev->emulated_config_bits + pos + 1, 0xff);
2547
2548 switch (cap_id) {
2549 case PCI_CAP_ID_MSI:
2550 ret = vfio_msi_setup(vdev, pos);
2551 break;
2552 case PCI_CAP_ID_EXP:
2553 vfio_check_pcie_flr(vdev, pos);
2554 ret = vfio_setup_pcie_cap(vdev, pos, size);
2555 break;
2556 case PCI_CAP_ID_MSIX:
2557 ret = vfio_msix_setup(vdev, pos);
2558 break;
2559 case PCI_CAP_ID_PM:
2560 vfio_check_pm_reset(vdev, pos);
2561 vdev->pm_cap = pos;
2562 ret = pci_add_capability(pdev, cap_id, pos, size);
2563 break;
2564 case PCI_CAP_ID_AF:
2565 vfio_check_af_flr(vdev, pos);
2566 ret = pci_add_capability(pdev, cap_id, pos, size);
2567 break;
2568 default:
2569 ret = pci_add_capability(pdev, cap_id, pos, size);
2570 break;
2571 }
2572
2573 if (ret < 0) {
2574 error_report("vfio: %04x:%02x:%02x.%x Error adding PCI capability "
2575 "0x%x[0x%x]@0x%x: %d", vdev->host.domain,
2576 vdev->host.bus, vdev->host.slot, vdev->host.function,
2577 cap_id, size, pos, ret);
2578 return ret;
2579 }
2580
2581 return 0;
2582 }
2583
2584 static int vfio_add_capabilities(VFIOPCIDevice *vdev)
2585 {
2586 PCIDevice *pdev = &vdev->pdev;
2587
2588 if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
2589 !pdev->config[PCI_CAPABILITY_LIST]) {
2590 return 0; /* Nothing to add */
2591 }
2592
2593 return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
2594 }
2595
2596 static void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
2597 {
2598 PCIDevice *pdev = &vdev->pdev;
2599 uint16_t cmd;
2600
2601 vfio_disable_interrupts(vdev);
2602
2603 /* Make sure the device is in D0 */
2604 if (vdev->pm_cap) {
2605 uint16_t pmcsr;
2606 uint8_t state;
2607
2608 pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
2609 state = pmcsr & PCI_PM_CTRL_STATE_MASK;
2610 if (state) {
2611 pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
2612 vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2);
2613 /* vfio handles the necessary delay here */
2614 pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
2615 state = pmcsr & PCI_PM_CTRL_STATE_MASK;
2616 if (state) {
2617 error_report("vfio: Unable to power on device, stuck in D%d",
2618 state);
2619 }
2620 }
2621 }
2622
2623 /*
2624 * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus mastering.
2625 * Also put INTx Disable into a known state.
2626 */
2627 cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
2628 cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
2629 PCI_COMMAND_INTX_DISABLE);
2630 vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
2631 }
2632
2633 static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
2634 {
2635 vfio_intx_enable(vdev);
2636 }
2637
2638 static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
2639 PCIHostDeviceAddress *host2)
2640 {
2641 return (host1->domain == host2->domain && host1->bus == host2->bus &&
2642 host1->slot == host2->slot && host1->function == host2->function);
2643 }
2644
2645 static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
2646 {
2647 VFIOGroup *group;
2648 struct vfio_pci_hot_reset_info *info;
2649 struct vfio_pci_dependent_device *devices;
2650 struct vfio_pci_hot_reset *reset;
2651 int32_t *fds;
2652 int ret, i, count;
2653 bool multi = false;
2654
2655 trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");
2656
2657 vfio_pci_pre_reset(vdev);
2658 vdev->vbasedev.needs_reset = false;
2659
2660 info = g_malloc0(sizeof(*info));
2661 info->argsz = sizeof(*info);
2662
2663 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
2664 if (ret && errno != ENOSPC) {
2665 ret = -errno;
2666 if (!vdev->has_pm_reset) {
2667 error_report("vfio: Cannot reset device %04x:%02x:%02x.%x, "
2668 "no available reset mechanism.", vdev->host.domain,
2669 vdev->host.bus, vdev->host.slot, vdev->host.function);
2670 }
2671 goto out_single;
2672 }
2673
2674 count = info->count;
2675 info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices)));
2676 info->argsz = sizeof(*info) + (count * sizeof(*devices));
2677 devices = &info->devices[0];
2678
2679 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
2680 if (ret) {
2681 ret = -errno;
2682 error_report("vfio: hot reset info failed: %m");
2683 goto out_single;
2684 }
2685
2686 trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name);
2687
2688 /* Verify that we have all the groups required */
2689 for (i = 0; i < info->count; i++) {
2690 PCIHostDeviceAddress host;
2691 VFIOPCIDevice *tmp;
2692 VFIODevice *vbasedev_iter;
2693
2694 host.domain = devices[i].segment;
2695 host.bus = devices[i].bus;
2696 host.slot = PCI_SLOT(devices[i].devfn);
2697 host.function = PCI_FUNC(devices[i].devfn);
2698
2699 trace_vfio_pci_hot_reset_dep_devices(host.domain,
2700 host.bus, host.slot, host.function, devices[i].group_id);
2701
2702 if (vfio_pci_host_match(&host, &vdev->host)) {
2703 continue;
2704 }
2705
2706 QLIST_FOREACH(group, &vfio_group_list, next) {
2707 if (group->groupid == devices[i].group_id) {
2708 break;
2709 }
2710 }
2711
2712 if (!group) {
2713 if (!vdev->has_pm_reset) {
2714 error_report("vfio: Cannot reset device %s, "
2715 "depends on group %d which is not owned.",
2716 vdev->vbasedev.name, devices[i].group_id);
2717 }
2718 ret = -EPERM;
2719 goto out;
2720 }
2721
2722 /* Prep dependent devices for reset and clear our marker. */
2723 QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
2724 if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
2725 continue;
2726 }
2727 tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
2728 if (vfio_pci_host_match(&host, &tmp->host)) {
2729 if (single) {
2730 ret = -EINVAL;
2731 goto out_single;
2732 }
2733 vfio_pci_pre_reset(tmp);
2734 tmp->vbasedev.needs_reset = false;
2735 multi = true;
2736 break;
2737 }
2738 }
2739 }
2740
2741 if (!single && !multi) {
2742 ret = -EINVAL;
2743 goto out_single;
2744 }
2745
2746 /* Determine how many group fds need to be passed */
2747 count = 0;
2748 QLIST_FOREACH(group, &vfio_group_list, next) {
2749 for (i = 0; i < info->count; i++) {
2750 if (group->groupid == devices[i].group_id) {
2751 count++;
2752 break;
2753 }
2754 }
2755 }
2756
2757 reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
2758 reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
2759 fds = &reset->group_fds[0];
2760
2761 /* Fill in group fds */
2762 QLIST_FOREACH(group, &vfio_group_list, next) {
2763 for (i = 0; i < info->count; i++) {
2764 if (group->groupid == devices[i].group_id) {
2765 fds[reset->count++] = group->fd;
2766 break;
2767 }
2768 }
2769 }
2770
2771 /* Bus reset! */
2772 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
2773 g_free(reset);
2774
2775 trace_vfio_pci_hot_reset_result(vdev->vbasedev.name,
2776 ret ? "%m" : "Success");
2777
2778 out:
2779 /* Re-enable INTx on affected devices */
2780 for (i = 0; i < info->count; i++) {
2781 PCIHostDeviceAddress host;
2782 VFIOPCIDevice *tmp;
2783 VFIODevice *vbasedev_iter;
2784
2785 host.domain = devices[i].segment;
2786 host.bus = devices[i].bus;
2787 host.slot = PCI_SLOT(devices[i].devfn);
2788 host.function = PCI_FUNC(devices[i].devfn);
2789
2790 if (vfio_pci_host_match(&host, &vdev->host)) {
2791 continue;
2792 }
2793
2794 QLIST_FOREACH(group, &vfio_group_list, next) {
2795 if (group->groupid == devices[i].group_id) {
2796 break;
2797 }
2798 }
2799
2800 if (!group) {
2801 break;
2802 }
2803
2804 QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
2805 if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
2806 continue;
2807 }
2808 tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
2809 if (vfio_pci_host_match(&host, &tmp->host)) {
2810 vfio_pci_post_reset(tmp);
2811 break;
2812 }
2813 }
2814 }
2815 out_single:
2816 vfio_pci_post_reset(vdev);
2817 g_free(info);
2818
2819 return ret;
2820 }
2821
2822 /*
2823 * We want to differentiate hot reset of multiple in-use devices vs hot reset
2824 * of a single in-use device. VFIO_DEVICE_RESET will already handle the case
2825 * of doing hot resets when there is only a single device per bus. The in-use
2826 * here refers to how many VFIODevices are affected. A hot reset that affects
2827 * multiple devices, but only a single in-use device, means that we can call
2828 * it from our bus ->reset() callback since the extent is effectively a single
2829 * device. This allows us to make use of it in the hotplug path. When there
2830 * are multiple in-use devices, we can only trigger the hot reset during a
2831 * system reset and thus from our reset handler. We separate _one vs _multi
2832 * here so that we don't overlap and do a double reset on the system reset
2833 * path where both our reset handler and ->reset() callback are used. Calling
2834 * _one() will only do a hot reset for the single in-use device case; calling
2835 * _multi() will do nothing if a _one() would have been sufficient.
2836 */
2837 static int vfio_pci_hot_reset_one(VFIOPCIDevice *vdev)
2838 {
2839 return vfio_pci_hot_reset(vdev, true);
2840 }
2841
2842 static int vfio_pci_hot_reset_multi(VFIODevice *vbasedev)
2843 {
2844 VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2845 return vfio_pci_hot_reset(vdev, false);
2846 }
2847
2848 static void vfio_pci_compute_needs_reset(VFIODevice *vbasedev)
2849 {
2850 VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2851 if (!vbasedev->reset_works || (!vdev->has_flr && vdev->has_pm_reset)) {
2852 vbasedev->needs_reset = true;
2853 }
2854 }
2855
2856 static VFIODeviceOps vfio_pci_ops = {
2857 .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
2858 .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
2859 .vfio_eoi = vfio_intx_eoi,
2860 };
2861
2862 static int vfio_populate_device(VFIOPCIDevice *vdev)
2863 {
2864 VFIODevice *vbasedev = &vdev->vbasedev;
2865 struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
2866 struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
2867 int i, ret = -1;
2868
2869 /* Sanity check device */
2870 if (!(vbasedev->flags & VFIO_DEVICE_FLAGS_PCI)) {
2871 error_report("vfio: Um, this isn't a PCI device");
2872 goto error;
2873 }
2874
2875 if (vbasedev->num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
2876 error_report("vfio: unexpected number of io regions %u",
2877 vbasedev->num_regions);
2878 goto error;
2879 }
2880
2881 if (vbasedev->num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) {
2882 error_report("vfio: unexpected number of irqs %u", vbasedev->num_irqs);
2883 goto error;
2884 }
2885
2886 for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
2887 reg_info.index = i;
2888
2889 ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
2890 if (ret) {
2891 error_report("vfio: Error getting region %d info: %m", i);
2892 goto error;
2893 }
2894
2895 trace_vfio_populate_device_region(vbasedev->name, i,
2896 (unsigned long)reg_info.size,
2897 (unsigned long)reg_info.offset,
2898 (unsigned long)reg_info.flags);
2899
2900 vdev->bars[i].region.vbasedev = vbasedev;
2901 vdev->bars[i].region.flags = reg_info.flags;
2902 vdev->bars[i].region.size = reg_info.size;
2903 vdev->bars[i].region.fd_offset = reg_info.offset;
2904 vdev->bars[i].region.nr = i;
2905 QLIST_INIT(&vdev->bars[i].quirks);
2906 }
2907
2908 reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX;
2909
2910 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
2911 if (ret) {
2912 error_report("vfio: Error getting config info: %m");
2913 goto error;
2914 }
2915
2916 trace_vfio_populate_device_config(vdev->vbasedev.name,
2917 (unsigned long)reg_info.size,
2918 (unsigned long)reg_info.offset,
2919 (unsigned long)reg_info.flags);
2920
2921 vdev->config_size = reg_info.size;
2922 if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) {
2923 vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS;
2924 }
2925 vdev->config_offset = reg_info.offset;
2926
2927 if ((vdev->features & VFIO_FEATURE_ENABLE_VGA) &&
2928 vbasedev->num_regions > VFIO_PCI_VGA_REGION_INDEX) {
2929 struct vfio_region_info vga_info = {
2930 .argsz = sizeof(vga_info),
2931 .index = VFIO_PCI_VGA_REGION_INDEX,
2932 };
2933
2934 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_REGION_INFO, &vga_info);
2935 if (ret) {
2936 error_report(
2937 "vfio: Device does not support requested feature x-vga");
2938 goto error;
2939 }
2940
2941 if (!(vga_info.flags & VFIO_REGION_INFO_FLAG_READ) ||
2942 !(vga_info.flags & VFIO_REGION_INFO_FLAG_WRITE) ||
2943 vga_info.size < 0xbffff + 1) {
2944 error_report("vfio: Unexpected VGA info, flags 0x%lx, size 0x%lx",
2945 (unsigned long)vga_info.flags,
2946 (unsigned long)vga_info.size);
2947 goto error;
2948 }
2949
2950 vdev->vga.fd_offset = vga_info.offset;
2951 vdev->vga.fd = vdev->vbasedev.fd;
2952
2953 vdev->vga.region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE;
2954 vdev->vga.region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM;
2955 QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_MEM].quirks);
2956
2957 vdev->vga.region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE;
2958 vdev->vga.region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO;
2959 QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].quirks);
2960
2961 vdev->vga.region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE;
2962 vdev->vga.region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI;
2963 QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks);
2964
2965 vdev->has_vga = true;
2966 }
2967
2968 irq_info.index = VFIO_PCI_ERR_IRQ_INDEX;
2969
2970 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
2971 if (ret) {
2972 /* This can fail for an old kernel or legacy PCI dev */
2973 trace_vfio_populate_device_get_irq_info_failure();
2974 ret = 0;
2975 } else if (irq_info.count == 1) {
2976 vdev->pci_aer = true;
2977 } else {
2978 error_report("vfio: %s "
2979 "Could not enable error recovery for the device",
2980 vbasedev->name);
2981 }
2982
2983 error:
2984 return ret;
2985 }
2986
2987 static void vfio_put_device(VFIOPCIDevice *vdev)
2988 {
2989 g_free(vdev->vbasedev.name);
2990 if (vdev->msix) {
2991 object_unparent(OBJECT(&vdev->msix->mmap_mem));
2992 g_free(vdev->msix);
2993 vdev->msix = NULL;
2994 }
2995 vfio_put_base_device(&vdev->vbasedev);
2996 }
2997
2998 static void vfio_err_notifier_handler(void *opaque)
2999 {
3000 VFIOPCIDevice *vdev = opaque;
3001
3002 if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
3003 return;
3004 }
3005
3006 /*
3007 * TBD. Retrieve the error details and decide what action
3008 * needs to be taken. One of the actions could be to pass
3009 * the error to the guest and have the guest driver recover
3010 * from the error. This requires that PCIe capabilities be
3011 * exposed to the guest. For now, we just terminate the
3012 * guest to contain the error.
3013 */
3014
3015 error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected. "
3016 "Please collect any data possible and then kill the guest",
3017 __func__, vdev->host.domain, vdev->host.bus,
3018 vdev->host.slot, vdev->host.function);
3019
3020 vm_stop(RUN_STATE_INTERNAL_ERROR);
3021 }
3022
3023 /*
3024 * Registers error notifier for devices supporting error recovery.
3025 * If we encounter a failure in this function, we report an error
3026 * and continue after disabling error recovery support for the
3027 * device.
3028 */
3029 static void vfio_register_err_notifier(VFIOPCIDevice *vdev)
3030 {
3031 int ret;
3032 int argsz;
3033 struct vfio_irq_set *irq_set;
3034 int32_t *pfd;
3035
3036 if (!vdev->pci_aer) {
3037 return;
3038 }
3039
3040 if (event_notifier_init(&vdev->err_notifier, 0)) {
3041 error_report("vfio: Unable to init event notifier for error detection");
3042 vdev->pci_aer = false;
3043 return;
3044 }
3045
3046 argsz = sizeof(*irq_set) + sizeof(*pfd);
3047
3048 irq_set = g_malloc0(argsz);
3049 irq_set->argsz = argsz;
3050 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
3051 VFIO_IRQ_SET_ACTION_TRIGGER;
3052 irq_set->index = VFIO_PCI_ERR_IRQ_INDEX;
3053 irq_set->start = 0;
3054 irq_set->count = 1;
3055 pfd = (int32_t *)&irq_set->data;
3056
3057 *pfd = event_notifier_get_fd(&vdev->err_notifier);
3058 qemu_set_fd_handler(*pfd, vfio_err_notifier_handler, NULL, vdev);
3059
3060 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
3061 if (ret) {
3062 error_report("vfio: Failed to set up error notification");
3063 qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
3064 event_notifier_cleanup(&vdev->err_notifier);
3065 vdev->pci_aer = false;
3066 }
3067 g_free(irq_set);
3068 }
3069
3070 static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev)
3071 {
3072 int argsz;
3073 struct vfio_irq_set *irq_set;
3074 int32_t *pfd;
3075 int ret;
3076
3077 if (!vdev->pci_aer) {
3078 return;
3079 }
3080
3081 argsz = sizeof(*irq_set) + sizeof(*pfd);
3082
3083 irq_set = g_malloc0(argsz);
3084 irq_set->argsz = argsz;
3085 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
3086 VFIO_IRQ_SET_ACTION_TRIGGER;
3087 irq_set->index = VFIO_PCI_ERR_IRQ_INDEX;
3088 irq_set->start = 0;
3089 irq_set->count = 1;
3090 pfd = (int32_t *)&irq_set->data;
3091 *pfd = -1;
3092
3093 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
3094 if (ret) {
3095 error_report("vfio: Failed to de-assign error fd: %m");
3096 }
3097 g_free(irq_set);
3098 qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier),
3099 NULL, NULL, vdev);
3100 event_notifier_cleanup(&vdev->err_notifier);
3101 }
3102
3103 static void vfio_req_notifier_handler(void *opaque)
3104 {
3105 VFIOPCIDevice *vdev = opaque;
3106
3107 if (!event_notifier_test_and_clear(&vdev->req_notifier)) {
3108 return;
3109 }
3110
3111 qdev_unplug(&vdev->pdev.qdev, NULL);
3112 }
3113
3114 static void vfio_register_req_notifier(VFIOPCIDevice *vdev)
3115 {
3116 struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info),
3117 .index = VFIO_PCI_REQ_IRQ_INDEX };
3118 int argsz;
3119 struct vfio_irq_set *irq_set;
3120 int32_t *pfd;
3121
3122 if (!(vdev->features & VFIO_FEATURE_ENABLE_REQ)) {
3123 return;
3124 }
3125
3126 if (ioctl(vdev->vbasedev.fd,
3127 VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0 || irq_info.count < 1) {
3128 return;
3129 }
3130
3131 if (event_notifier_init(&vdev->req_notifier, 0)) {
3132 error_report("vfio: Unable to init event notifier for device request");
3133 return;
3134 }
3135
3136 argsz = sizeof(*irq_set) + sizeof(*pfd);
3137
3138 irq_set = g_malloc0(argsz);
3139 irq_set->argsz = argsz;
3140 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
3141 VFIO_IRQ_SET_ACTION_TRIGGER;
3142 irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
3143 irq_set->start = 0;
3144 irq_set->count = 1;
3145 pfd = (int32_t *)&irq_set->data;
3146
3147 *pfd = event_notifier_get_fd(&vdev->req_notifier);
3148 qemu_set_fd_handler(*pfd, vfio_req_notifier_handler, NULL, vdev);
3149
3150 if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
3151 error_report("vfio: Failed to set up device request notification");
3152 qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
3153 event_notifier_cleanup(&vdev->req_notifier);
3154 } else {
3155 vdev->req_enabled = true;
3156 }
3157
3158 g_free(irq_set);
3159 }
3160
3161 static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
3162 {
3163 int argsz;
3164 struct vfio_irq_set *irq_set;
3165 int32_t *pfd;
3166
3167 if (!vdev->req_enabled) {
3168 return;
3169 }
3170
3171 argsz = sizeof(*irq_set) + sizeof(*pfd);
3172
3173 irq_set = g_malloc0(argsz);
3174 irq_set->argsz = argsz;
3175 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
3176 VFIO_IRQ_SET_ACTION_TRIGGER;
3177 irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
3178 irq_set->start = 0;
3179 irq_set->count = 1;
3180 pfd = (int32_t *)&irq_set->data;
3181 *pfd = -1;
3182
3183 if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
3184 error_report("vfio: Failed to de-assign device request fd: %m");
3185 }
3186 g_free(irq_set);
3187 qemu_set_fd_handler(event_notifier_get_fd(&vdev->req_notifier),
3188 NULL, NULL, vdev);
3189 event_notifier_cleanup(&vdev->req_notifier);
3190
3191 vdev->req_enabled = false;
3192 }
3193
3194 /*
3195 * AMD Radeon PCI config reset, based on Linux:
3196 * drivers/gpu/drm/radeon/ci_smc.c:ci_is_smc_running()
3197 * drivers/gpu/drm/radeon/radeon_device.c:radeon_pci_config_reset
3198 * drivers/gpu/drm/radeon/ci_smc.c:ci_reset_smc()
3199 * drivers/gpu/drm/radeon/ci_smc.c:ci_stop_smc_clock()
3200 * IDs: include/drm/drm_pciids.h
3201 * Registers: http://cgit.freedesktop.org/~agd5f/linux/commit/?id=4e2aa447f6f0
3202 *
3203 * Bonaire and Hawaii GPUs do not respond to a bus reset. This is a bug in the
3204 * hardware that should be fixed on future ASICs. The symptom of this is that
3205 * once the accelerated driver loads, Windows guests will BSOD on subsequent
3206 * attempts to load the driver, such as after VM reset or shutdown/restart. To
3207 * work around this, we do an AMD specific PCI config reset, followed by an SMC
3208 * reset. The PCI config reset only works if SMC firmware is running, so we
3209 * have a dependency on the state of the device as to whether this reset will
3210 * be effective. There are still cases where we won't be able to kick the
3211 * device into working, but this greatly improves the usability overall. The
3212 * config reset magic is relatively common on AMD GPUs, but the setup and SMC
3213 * poking is largely ASIC specific.
3214 */
3215 static bool vfio_radeon_smc_is_running(VFIOPCIDevice *vdev)
3216 {
3217 uint32_t clk, pc_c;
3218
3219 /*
3220 * Registers 200h and 204h are index and data registers for accessing
3221 * indirect configuration registers within the device.
3222 */
3223 vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4);
3224 clk = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
3225 vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000370, 4);
3226 pc_c = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
3227
3228 return (!(clk & 1) && (0x20100 <= pc_c));
3229 }
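/*
 * The 0x200/0x204 accesses above follow the index/data idiom; a minimal
 * sketch of the helper they could be factored into (illustrative only,
 * not code that exists in this file):
 *
 *     static uint32_t vfio_radeon_smc_read(VFIOPCIDevice *vdev,
 *                                          uint32_t reg)
 *     {
 *         vfio_region_write(&vdev->bars[5].region, 0x200, reg, 4);
 *         return vfio_region_read(&vdev->bars[5].region, 0x204, 4);
 *     }
 */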
3230
3231 /*
3232 * The scope of a config reset is controlled by a mode bit in the misc register
3233 * and a fuse, exposed as a bit in another register. The fuse is the default
3234 * (0 = GFX, 1 = whole GPU), the misc bit is a toggle, with the formula
3235 * scope = !(misc ^ fuse), where the resulting scope is defined the same as
3236 * the fuse. A truth table therefore tells us that if misc == fuse, we need
3237 * to flip the value of the bit in the misc register.
3238 */
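/*
 * That truth table, spelled out:
 *
 *     misc  fuse  scope = !(misc ^ fuse)
 *       0     0   1 (whole GPU)
 *       0     1   0 (GFX only)
 *       1     0   0 (GFX only)
 *       1     1   1 (whole GPU)
 *
 * which is why the function below toggles the misc bit only when it
 * matches the fuse.
 */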
3239 static void vfio_radeon_set_gfx_only_reset(VFIOPCIDevice *vdev)
3240 {
3241 uint32_t misc, fuse;
3242 bool a, b;
3243
3244 vfio_region_write(&vdev->bars[5].region, 0x200, 0xc00c0000, 4);
3245 fuse = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
3246 b = fuse & 64;
3247
3248 vfio_region_write(&vdev->bars[5].region, 0x200, 0xc0000010, 4);
3249 misc = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
3250 a = misc & 2;
3251
3252 if (a == b) {
3253 vfio_region_write(&vdev->bars[5].region, 0x204, misc ^ 2, 4);
3254 vfio_region_read(&vdev->bars[5].region, 0x204, 4); /* flush */
3255 }
3256 }
3257
3258 static int vfio_radeon_reset(VFIOPCIDevice *vdev)
3259 {
3260 PCIDevice *pdev = &vdev->pdev;
3261 int i, ret = 0;
3262 uint32_t data;
3263
3264 /* Defer to a kernel implemented reset */
3265 if (vdev->vbasedev.reset_works) {
3266 return -ENODEV;
3267 }
3268
3269 /* Enable only memory BAR access */
3270 vfio_pci_write_config(pdev, PCI_COMMAND, PCI_COMMAND_MEMORY, 2);
3271
3272 /* Reset only works if SMC firmware is loaded and running */
3273 if (!vfio_radeon_smc_is_running(vdev)) {
3274 ret = -EINVAL;
3275 goto out;
3276 }
3277
3278 /* Make sure only the GFX function is reset */
3279 vfio_radeon_set_gfx_only_reset(vdev);
3280
3281 /* AMD PCI config reset */
3282 vfio_pci_write_config(pdev, 0x7c, 0x39d5e86b, 4);
3283 usleep(100);
3284
3285 /* Read back the memory size to make sure we're out of reset */
3286 for (i = 0; i < 100000; i++) {
3287 if (vfio_region_read(&vdev->bars[5].region, 0x5428, 4) != 0xffffffff) {
3288 break;
3289 }
3290 usleep(1);
3291 }
3292
3293 /* Reset SMC */
3294 vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000000, 4);
3295 data = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
3296 data |= 1;
3297 vfio_region_write(&vdev->bars[5].region, 0x204, data, 4);
3298
3299 /* Disable SMC clock */
3300 vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4);
3301 data = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
3302 data |= 1;
3303 vfio_region_write(&vdev->bars[5].region, 0x204, data, 4);
3304
3305 out:
3306 /* Restore PCI command register */
3307 vfio_pci_write_config(pdev, PCI_COMMAND, 0, 2);
3308
3309 return ret;
3310 }
3311
3312 static void vfio_setup_resetfn(VFIOPCIDevice *vdev)
3313 {
3314 PCIDevice *pdev = &vdev->pdev;
3315 uint16_t vendor, device;
3316
3317 vendor = pci_get_word(pdev->config + PCI_VENDOR_ID);
3318 device = pci_get_word(pdev->config + PCI_DEVICE_ID);
3319
3320 switch (vendor) {
3321 case 0x1002:
3322 switch (device) {
3323 /* Bonaire */
3324 case 0x6649: /* Bonaire [FirePro W5100] */
3325 case 0x6650:
3326 case 0x6651:
3327 case 0x6658: /* Bonaire XTX [Radeon R7 260X] */
3328 case 0x665c: /* Bonaire XT [Radeon HD 7790/8770 / R9 260 OEM] */
3329 case 0x665d: /* Bonaire [Radeon R7 200 Series] */
3330 /* Hawaii */
3331 case 0x67A0: /* Hawaii XT GL [FirePro W9100] */
3332 case 0x67A1: /* Hawaii PRO GL [FirePro W8100] */
3333 case 0x67A2:
3334 case 0x67A8:
3335 case 0x67A9:
3336 case 0x67AA:
3337 case 0x67B0: /* Hawaii XT [Radeon R9 290X] */
3338 case 0x67B1: /* Hawaii PRO [Radeon R9 290] */
3339 case 0x67B8:
3340 case 0x67B9:
3341 case 0x67BA:
3342 case 0x67BE:
3343 vdev->resetfn = vfio_radeon_reset;
3344 break;
3345 }
3346 break;
3347 }
3348 }
3349
3350 static int vfio_initfn(PCIDevice *pdev)
3351 {
3352 VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
3353 VFIODevice *vbasedev_iter;
3354 VFIOGroup *group;
3355 char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
3356 ssize_t len;
3357 struct stat st;
3358 int groupid;
3359 int ret;
3360
3361 /* Check that the host device exists */
3362 snprintf(path, sizeof(path),
3363 "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
3364 vdev->host.domain, vdev->host.bus, vdev->host.slot,
3365 vdev->host.function);
3366 if (stat(path, &st) < 0) {
3367 error_report("vfio: error: no such host device: %s", path);
3368 return -errno;
3369 }
3370
3371 vdev->vbasedev.ops = &vfio_pci_ops;
3372
3373 vdev->vbasedev.type = VFIO_DEVICE_TYPE_PCI;
3374 vdev->vbasedev.name = g_strdup_printf("%04x:%02x:%02x.%01x",
3375 vdev->host.domain, vdev->host.bus,
3376 vdev->host.slot, vdev->host.function);
3377
3378 strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
3379
3380 len = readlink(path, iommu_group_path, sizeof(iommu_group_path));
3381 if (len <= 0 || len >= sizeof(iommu_group_path)) {
3382 error_report("vfio: error no iommu_group for device");
3383 return len < 0 ? -errno : -ENAMETOOLONG;
3384 }
3385
3386 iommu_group_path[len] = 0;
3387 group_name = basename(iommu_group_path);
3388
3389 if (sscanf(group_name, "%d", &groupid) != 1) {
3390 error_report("vfio: error reading %s: %m", path);
3391 return -errno;
3392 }
3393
3394 trace_vfio_initfn(vdev->vbasedev.name, groupid);
3395
3396 group = vfio_get_group(groupid, pci_device_iommu_address_space(pdev));
3397 if (!group) {
3398 error_report("vfio: failed to get group %d", groupid);
3399 return -ENOENT;
3400 }
3401
3402 snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x",
3403 vdev->host.domain, vdev->host.bus, vdev->host.slot,
3404 vdev->host.function);
3405
3406 QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
3407 if (strcmp(vbasedev_iter->name, vdev->vbasedev.name) == 0) {
3408 error_report("vfio: error: device %s is already attached", path);
3409 vfio_put_group(group);
3410 return -EBUSY;
3411 }
3412 }
3413
3414 ret = vfio_get_device(group, path, &vdev->vbasedev);
3415 if (ret) {
3416 error_report("vfio: failed to get device %s", path);
3417 vfio_put_group(group);
3418 return ret;
3419 }
3420
3421 ret = vfio_populate_device(vdev);
3422 if (ret) {
3423 return ret;
3424 }
3425
3426 /* Get a copy of config space */
3427 ret = pread(vdev->vbasedev.fd, vdev->pdev.config,
3428 MIN(pci_config_size(&vdev->pdev), vdev->config_size),
3429 vdev->config_offset);
3430 if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) {
3431 ret = ret < 0 ? -errno : -EFAULT;
3432 error_report("vfio: Failed to read device config space");
3433 return ret;
3434 }
3435
3436 /* vfio emulates a lot for us, but some bits need extra love */
3437 vdev->emulated_config_bits = g_malloc0(vdev->config_size);
3438
3439 /* QEMU can choose to expose the ROM or not */
3440 memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
3441
3442 /* QEMU can change multi-function devices to single function, or reverse */
3443 vdev->emulated_config_bits[PCI_HEADER_TYPE] =
3444 PCI_HEADER_TYPE_MULTI_FUNCTION;
3445
3446 /* Restore or clear multifunction, this is always controlled by QEMU */
3447 if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
3448 vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
3449 } else {
3450 vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
3451 }
3452
3453 /*
3454 * Clear host resource mapping info. If we choose not to register a
3455 * BAR, such as might be the case with the option ROM, we can get
3456 * confusing, unwritable, residual addresses from the host here.
3457 */
3458 memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
3459 memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
3460
3461 vfio_pci_size_rom(vdev);
3462
3463 ret = vfio_msix_early_setup(vdev);
3464 if (ret) {
3465 return ret;
3466 }
3467
3468 vfio_map_bars(vdev);
3469
3470 ret = vfio_add_capabilities(vdev);
3471 if (ret) {
3472 goto out_teardown;
3473 }
3474
3475 /* QEMU emulates all of MSI & MSIX */
3476 if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
3477 memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
3478 MSIX_CAP_LENGTH);
3479 }
3480
3481 if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
3482 memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
3483 vdev->msi_cap_size);
3484 }
3485
3486 if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
3487 vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
3488 vfio_intx_mmap_enable, vdev);
3489 pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_intx_update);
3490 ret = vfio_intx_enable(vdev);
3491 if (ret) {
3492 goto out_teardown;
3493 }
3494 }
3495
3496 vfio_register_err_notifier(vdev);
3497 vfio_register_req_notifier(vdev);
3498 vfio_setup_resetfn(vdev);
3499
3500 return 0;
3501
3502 out_teardown:
3503 pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
3504 vfio_teardown_msi(vdev);
3505 vfio_unregister_bars(vdev);
3506 return ret;
3507 }
3508
3509 static void vfio_instance_finalize(Object *obj)
3510 {
3511 PCIDevice *pci_dev = PCI_DEVICE(obj);
3512 VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pci_dev);
3513 VFIOGroup *group = vdev->vbasedev.group;
3514
3515 vfio_unmap_bars(vdev);
3516 g_free(vdev->emulated_config_bits);
3517 g_free(vdev->rom);
3518 vfio_put_device(vdev);
3519 vfio_put_group(group);
3520 }
3521
3522 static void vfio_exitfn(PCIDevice *pdev)
3523 {
3524 VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
3525
3526 vfio_unregister_req_notifier(vdev);
3527 vfio_unregister_err_notifier(vdev);
3528 pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
3529 vfio_disable_interrupts(vdev);
3530 if (vdev->intx.mmap_timer) {
3531 timer_free(vdev->intx.mmap_timer);
3532 }
3533 vfio_teardown_msi(vdev);
3534 vfio_unregister_bars(vdev);
3535 }
3536
3537 static void vfio_pci_reset(DeviceState *dev)
3538 {
3539 PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
3540 VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
3541
3542 trace_vfio_pci_reset(vdev->vbasedev.name);
3543
3544 vfio_pci_pre_reset(vdev);
3545
3546 if (vdev->resetfn && !vdev->resetfn(vdev)) {
3547 goto post_reset;
3548 }
3549
3550 if (vdev->vbasedev.reset_works &&
3551 (vdev->has_flr || !vdev->has_pm_reset) &&
3552 !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
3553 trace_vfio_pci_reset_flr(vdev->vbasedev.name);
3554 goto post_reset;
3555 }
3556
3557 /* See if we can do our own bus reset */
3558 if (!vfio_pci_hot_reset_one(vdev)) {
3559 goto post_reset;
3560 }
3561
3562 /* If nothing else works and the device supports PM reset, use it */
3563 if (vdev->vbasedev.reset_works && vdev->has_pm_reset &&
3564 !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
3565 trace_vfio_pci_reset_pm(vdev->vbasedev.name);
3566 goto post_reset;
3567 }
3568
3569 post_reset:
3570 vfio_pci_post_reset(vdev);
3571 }
3572
3573 static void vfio_instance_init(Object *obj)
3574 {
3575 PCIDevice *pci_dev = PCI_DEVICE(obj);
3576 VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, PCI_DEVICE(obj));
3577
3578 device_add_bootindex_property(obj, &vdev->bootindex,
3579 "bootindex", NULL,
3580 &pci_dev->qdev, NULL);
3581 }
3582
3583 static Property vfio_pci_dev_properties[] = {
3584 DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host),
3585 DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIOPCIDevice,
3586 intx.mmap_timeout, 1100),
3587 DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features,
3588 VFIO_FEATURE_ENABLE_VGA_BIT, false),
3589 DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features,
3590 VFIO_FEATURE_ENABLE_REQ_BIT, true),
3591 DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false),
3592 DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false),
3593 DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false),
3594 DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),
3595 /*
3596 * TODO - support passed fds... is this necessary?
3597 * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name),
3598 * DEFINE_PROP_STRING("vfiogroupfd, VFIOPCIDevice, vfiogroupfd_name),
3599 */
3600 DEFINE_PROP_END_OF_LIST(),
3601 };
3602
3603 static const VMStateDescription vfio_pci_vmstate = {
3604 .name = "vfio-pci",
3605 .unmigratable = 1,
3606 };
3607
3608 static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
3609 {
3610 DeviceClass *dc = DEVICE_CLASS(klass);
3611 PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
3612
3613 dc->reset = vfio_pci_reset;
3614 dc->props = vfio_pci_dev_properties;
3615 dc->vmsd = &vfio_pci_vmstate;
3616 dc->desc = "VFIO-based PCI device assignment";
3617 set_bit(DEVICE_CATEGORY_MISC, dc->categories);
3618 pdc->init = vfio_initfn;
3619 pdc->exit = vfio_exitfn;
3620 pdc->config_read = vfio_pci_read_config;
3621 pdc->config_write = vfio_pci_write_config;
3622 pdc->is_express = 1; /* We might be */
3623 }
3624
3625 static const TypeInfo vfio_pci_dev_info = {
3626 .name = "vfio-pci",
3627 .parent = TYPE_PCI_DEVICE,
3628 .instance_size = sizeof(VFIOPCIDevice),
3629 .class_init = vfio_pci_dev_class_init,
3630 .instance_init = vfio_instance_init,
3631 .instance_finalize = vfio_instance_finalize,
3632 };
3633
3634 static void register_vfio_pci_dev_type(void)
3635 {
3636 type_register_static(&vfio_pci_dev_info);
3637 }
3638
3639 type_init(register_vfio_pci_dev_type)