// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/vgaarb.h>
#include <linux/nospec.h>
#include <linux/sched/mm.h>

#include "vfio_pci_private.h"
#define DRIVER_VERSION  "0.2"
#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC     "VFIO PCI - User Level meta-driver"
static char ids[1024] __initdata;
module_param_string(ids, ids, sizeof(ids), 0);
MODULE_PARM_DESC(ids, "Initial PCI IDs to add to the vfio driver, format is \"vendor:device[:subvendor[:subdevice[:class[:class_mask]]]]\" and multiple comma separated entries can be specified");

static bool nointxmask;
module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(nointxmask,
		 "Disable support for PCI 2.3 style INTx masking.  If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag.");

#ifdef CONFIG_VFIO_PCI_VGA
static bool disable_vga;
module_param(disable_vga, bool, S_IRUGO);
MODULE_PARM_DESC(disable_vga, "Disable VGA resource access through vfio-pci");
#endif

static bool disable_idle_d3;
module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(disable_idle_d3,
		 "Disable using the PCI D3 low power state for idle, unused devices");

static bool enable_sriov;
#ifdef CONFIG_PCI_IOV
module_param(enable_sriov, bool, 0644);
MODULE_PARM_DESC(enable_sriov, "Enable support for SR-IOV configuration.  Enabling SR-IOV on a PF typically requires support of the userspace PF driver, enabling VFs without such support may result in non-functional VFs or PF.");
#endif

static bool disable_denylist;
module_param(disable_denylist, bool, 0444);
MODULE_PARM_DESC(disable_denylist, "Disable use of device denylist. Disabling the denylist allows binding to devices with known errata that may lead to exploitable stability or security issues when accessed by untrusted users.");
static inline bool vfio_vga_disabled(void)
{
#ifdef CONFIG_VFIO_PCI_VGA
	return disable_vga;
#else
	return true;
#endif
}
static bool vfio_pci_dev_in_denylist(struct pci_dev *pdev)
{
	switch (pdev->vendor) {
	case PCI_VENDOR_ID_INTEL:
		switch (pdev->device) {
		case PCI_DEVICE_ID_INTEL_QAT_C3XXX:
		case PCI_DEVICE_ID_INTEL_QAT_C3XXX_VF:
		case PCI_DEVICE_ID_INTEL_QAT_C62X:
		case PCI_DEVICE_ID_INTEL_QAT_C62X_VF:
		case PCI_DEVICE_ID_INTEL_QAT_DH895XCC:
		case PCI_DEVICE_ID_INTEL_QAT_DH895XCC_VF:
			return true;
		default:
			return false;
		}
	}

	return false;
}
static bool vfio_pci_is_denylisted(struct pci_dev *pdev)
{
	if (!vfio_pci_dev_in_denylist(pdev))
		return false;

	if (disable_denylist) {
		pci_warn(pdev,
			 "device denylist disabled - allowing device %04x:%04x.\n",
			 pdev->vendor, pdev->device);
		return false;
	}

	pci_warn(pdev, "%04x:%04x exists in vfio-pci device denylist, driver probing disallowed.\n",
		 pdev->vendor, pdev->device);
	return true;
}
/*
 * Our VGA arbiter participation is limited since we don't know anything
 * about the device itself.  However, if the device is the only VGA device
 * downstream of a bridge and VFIO VGA support is disabled, then we can
 * safely return legacy VGA IO and memory as not decoded since the user
 * has no way to get to it and routing can be disabled externally at the
 * bridge.
 */
static unsigned int vfio_pci_set_decode(struct pci_dev *pdev, bool single_vga)
{
	struct pci_dev *tmp = NULL;
	unsigned char max_busnr;
	unsigned int decodes;

	if (single_vga || !vfio_vga_disabled() || pci_is_root_bus(pdev->bus))
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
		       VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;

	max_busnr = pci_bus_max_busnr(pdev->bus);
	decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;

	while ((tmp = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, tmp)) != NULL) {
		if (tmp == pdev ||
		    pci_domain_nr(tmp->bus) != pci_domain_nr(pdev->bus) ||
		    pci_is_root_bus(tmp->bus))
			continue;

		if (tmp->bus->number >= pdev->bus->number &&
		    tmp->bus->number <= max_busnr) {
			pci_dev_put(tmp);
			decodes |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
			break;
		}
	}

	return decodes;
}
static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
{
	return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
}
static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
{
	struct resource *res;
	int i;
	struct vfio_pci_dummy_resource *dummy_res;

	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
		int bar = i + PCI_STD_RESOURCES;

		res = &vdev->pdev->resource[bar];

		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
			goto no_mmap;

		if (!(res->flags & IORESOURCE_MEM))
			goto no_mmap;

		/*
		 * The PCI core shouldn't set up a resource with a
		 * type but zero size. But there may be bugs that
		 * cause us to do that.
		 */
		if (!resource_size(res))
			goto no_mmap;

		if (resource_size(res) >= PAGE_SIZE) {
			vdev->bar_mmap_supported[bar] = true;
			continue;
		}

		if (!(res->start & ~PAGE_MASK)) {
			/*
			 * Add a dummy resource to reserve the remainder
			 * of the exclusive page in case that hot-add
			 * device's bar is assigned into it.
			 */
			dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
			if (dummy_res == NULL)
				goto no_mmap;

			dummy_res->resource.name = "vfio sub-page reserved";
			dummy_res->resource.start = res->end + 1;
			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
			dummy_res->resource.flags = res->flags;
			if (request_resource(res->parent,
						&dummy_res->resource)) {
				kfree(dummy_res);
				goto no_mmap;
			}
			dummy_res->index = bar;
			list_add(&dummy_res->res_next,
					&vdev->dummy_resources_list);
			vdev->bar_mmap_supported[bar] = true;
			continue;
		}
		/*
		 * Here we don't handle the case when the BAR is not page
		 * aligned because we can't expect the BAR will be
		 * assigned into the same location in a page in guest
		 * when we passthrough the BAR. And it's hard to access
		 * this BAR in userspace because we have no way to get
		 * the BAR's location in a page.
		 */
no_mmap:
		vdev->bar_mmap_supported[bar] = false;
	}
}
static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev);
static void vfio_pci_disable(struct vfio_pci_device *vdev);
static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data);
/*
 * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
 * _and_ the ability to detect when the device is asserting INTx via PCI_STATUS.
 * If a device implements the former but not the latter we would typically
 * expect broken_intx_masking to be set and require an exclusive interrupt.
 * However, since we do have control of the device's ability to assert INTx,
 * we can instead pretend that the device does not implement INTx, virtualizing
 * the pin register to report zero and maintaining DisINTx set on the host.
 */
static bool vfio_pci_nointx(struct pci_dev *pdev)
{
	switch (pdev->vendor) {
	case PCI_VENDOR_ID_INTEL:
		switch (pdev->device) {
		/* All i40e (XL710/X710/XXV710) 10/20/25/40GbE NICs */
		case 0x1572:
		case 0x1574:
		case 0x1580 ... 0x1581:
		case 0x1583 ... 0x158b:
		case 0x37d0 ... 0x37d2:
			return true;
		default:
			return false;
		}
	}

	return false;
}
static void vfio_pci_probe_power_state(struct vfio_pci_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	u16 pmcsr;

	if (!pdev->pm_cap)
		return;

	pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmcsr);

	vdev->needs_pm_restore = !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET);
}
/*
 * pci_set_power_state() wrapper handling devices which perform a soft reset on
 * D3->D0 transition.  Save state prior to D0/1/2->D3, stash it on the vdev,
 * restore when returned to D0.  Saved separately from pci_saved_state for use
 * by PM capability emulation and separately from pci_dev internal saved state
 * to avoid it being overwritten and consumed around other resets.
 */
int vfio_pci_set_power_state(struct vfio_pci_device *vdev, pci_power_t state)
{
	struct pci_dev *pdev = vdev->pdev;
	bool needs_restore = false, needs_save = false;
	int ret;

	if (vdev->needs_pm_restore) {
		if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) {
			pci_save_state(pdev);
			needs_save = true;
		}

		if (pdev->current_state >= PCI_D3hot && state <= PCI_D0)
			needs_restore = true;
	}

	ret = pci_set_power_state(pdev, state);

	if (!ret) {
		/* D3 might be unsupported via quirk, skip unless in D3 */
		if (needs_save && pdev->current_state >= PCI_D3hot) {
			vdev->pm_save = pci_store_saved_state(pdev);
		} else if (needs_restore) {
			pci_load_and_free_saved_state(pdev, &vdev->pm_save);
			pci_restore_state(pdev);
		}
	}

	return ret;
}
static int vfio_pci_enable(struct vfio_pci_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	int ret;
	u16 cmd;
	u8 msix_pos;

	vfio_pci_set_power_state(vdev, PCI_D0);

	/* Don't allow our initial saved state to include busmaster */
	pci_clear_master(pdev);

	ret = pci_enable_device(pdev);
	if (ret)
		return ret;

	/* If reset fails because of the device lock, fail this path entirely */
	ret = pci_try_reset_function(pdev);
	if (ret == -EAGAIN) {
		pci_disable_device(pdev);
		return ret;
	}

	vdev->reset_works = !ret;
	pci_save_state(pdev);
	vdev->pci_saved_state = pci_store_saved_state(pdev);
	if (!vdev->pci_saved_state)
		pci_dbg(pdev, "%s: Couldn't store saved state\n", __func__);

	if (likely(!nointxmask)) {
		if (vfio_pci_nointx(pdev)) {
			pci_info(pdev, "Masking broken INTx support\n");
			vdev->nointx = true;
			pci_intx(pdev, 0);
		} else
			vdev->pci_2_3 = pci_intx_mask_supported(pdev);
	}

	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
	if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
		cmd &= ~PCI_COMMAND_INTX_DISABLE;
		pci_write_config_word(pdev, PCI_COMMAND, cmd);
	}

	ret = vfio_config_init(vdev);
	if (ret) {
		kfree(vdev->pci_saved_state);
		vdev->pci_saved_state = NULL;
		pci_disable_device(pdev);
		return ret;
	}

	msix_pos = pdev->msix_cap;
	if (msix_pos) {
		u16 flags;
		u32 table;

		pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
		pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);

		vdev->msix_bar = table & PCI_MSIX_TABLE_BIR;
		vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET;
		vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
	} else
		vdev->msix_bar = 0xFF;

	if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev))
		vdev->has_vga = true;

	if (vfio_pci_is_vga(pdev) &&
	    pdev->vendor == PCI_VENDOR_ID_INTEL &&
	    IS_ENABLED(CONFIG_VFIO_PCI_IGD)) {
		ret = vfio_pci_igd_init(vdev);
		if (ret && ret != -ENODEV) {
			pci_warn(pdev, "Failed to setup Intel IGD regions\n");
			goto disable_exit;
		}
	}

	vfio_pci_probe_mmaps(vdev);

	return 0;

disable_exit:
	vfio_pci_disable(vdev);
	return ret;
}
static void vfio_pci_disable(struct vfio_pci_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	struct vfio_pci_dummy_resource *dummy_res, *tmp;
	struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp;
	int i, bar;

	/* Stop the device from further DMA */
	pci_clear_master(pdev);

	vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
				VFIO_IRQ_SET_ACTION_TRIGGER,
				vdev->irq_type, 0, 0, NULL);

	/* Device closed, don't need mutex here */
	list_for_each_entry_safe(ioeventfd, ioeventfd_tmp,
				 &vdev->ioeventfds_list, next) {
		vfio_virqfd_disable(&ioeventfd->virqfd);
		list_del(&ioeventfd->next);
		kfree(ioeventfd);
	}
	vdev->ioeventfds_nr = 0;

	vdev->virq_disabled = false;

	for (i = 0; i < vdev->num_regions; i++)
		vdev->region[i].ops->release(vdev, &vdev->region[i]);

	vdev->num_regions = 0;
	kfree(vdev->region);
	vdev->region = NULL; /* don't krealloc a freed pointer */

	vfio_config_free(vdev);

	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
		bar = i + PCI_STD_RESOURCES;
		if (!vdev->barmap[bar])
			continue;
		pci_iounmap(pdev, vdev->barmap[bar]);
		pci_release_selected_regions(pdev, 1 << bar);
		vdev->barmap[bar] = NULL;
	}

	list_for_each_entry_safe(dummy_res, tmp,
				 &vdev->dummy_resources_list, res_next) {
		list_del(&dummy_res->res_next);
		release_resource(&dummy_res->resource);
		kfree(dummy_res);
	}

	vdev->needs_reset = true;

	/*
	 * If we have saved state, restore it.  If we can reset the device,
	 * even better.  Resetting with current state seems better than
	 * nothing, but saving and restoring current state without reset
	 * is just busy work.
	 */
	if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) {
		pci_info(pdev, "%s: Couldn't reload saved state\n", __func__);

		if (!vdev->reset_works)
			goto out;

		pci_save_state(pdev);
	}

	/*
	 * Disable INTx and MSI, presumably to avoid spurious interrupts
	 * during reset.  Stolen from pci_reset_function()
	 */
	pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);

	/*
	 * Try to get the locks ourselves to prevent a deadlock. The
	 * success of this is dependent on being able to lock the device,
	 * which is not always possible.
	 * We can not use the "try" reset interface here, which will
	 * overwrite the previously restored configuration information.
	 */
	if (vdev->reset_works && pci_dev_trylock(pdev)) {
		if (!__pci_reset_function_locked(pdev))
			vdev->needs_reset = false;
		pci_dev_unlock(pdev);
	}

out:
	pci_restore_state(pdev);

	pci_disable_device(pdev);

	vfio_pci_try_bus_reset(vdev);

	if (!disable_idle_d3)
		vfio_pci_set_power_state(vdev, PCI_D3hot);
}
static struct pci_driver vfio_pci_driver;
static struct vfio_pci_device *get_pf_vdev(struct vfio_pci_device *vdev)
{
	struct pci_dev *physfn = pci_physfn(vdev->pdev);
	struct vfio_device *pf_dev;

	if (!vdev->pdev->is_virtfn)
		return NULL;

	pf_dev = vfio_device_get_from_dev(&physfn->dev);
	if (!pf_dev)
		return NULL;

	if (pci_dev_driver(physfn) != &vfio_pci_driver) {
		vfio_device_put(pf_dev);
		return NULL;
	}

	return container_of(pf_dev, struct vfio_pci_device, vdev);
}
static void vfio_pci_vf_token_user_add(struct vfio_pci_device *vdev, int val)
{
	struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev);

	if (!pf_vdev)
		return;

	mutex_lock(&pf_vdev->vf_token->lock);
	pf_vdev->vf_token->users += val;
	WARN_ON(pf_vdev->vf_token->users < 0);
	mutex_unlock(&pf_vdev->vf_token->lock);

	vfio_device_put(&pf_vdev->vdev);
}
static void vfio_pci_release(struct vfio_device *core_vdev)
{
	struct vfio_pci_device *vdev =
		container_of(core_vdev, struct vfio_pci_device, vdev);

	mutex_lock(&vdev->reflck->lock);

	if (!(--vdev->refcnt)) {
		vfio_pci_vf_token_user_add(vdev, -1);
		vfio_spapr_pci_eeh_release(vdev->pdev);
		vfio_pci_disable(vdev);

		mutex_lock(&vdev->igate);
		if (vdev->err_trigger) {
			eventfd_ctx_put(vdev->err_trigger);
			vdev->err_trigger = NULL;
		}
		if (vdev->req_trigger) {
			eventfd_ctx_put(vdev->req_trigger);
			vdev->req_trigger = NULL;
		}
		mutex_unlock(&vdev->igate);
	}

	mutex_unlock(&vdev->reflck->lock);
}
static int vfio_pci_open(struct vfio_device *core_vdev)
{
	struct vfio_pci_device *vdev =
		container_of(core_vdev, struct vfio_pci_device, vdev);
	int ret = 0;

	mutex_lock(&vdev->reflck->lock);

	if (!vdev->refcnt) {
		ret = vfio_pci_enable(vdev);
		if (ret)
			goto error;

		vfio_spapr_pci_eeh_open(vdev->pdev);
		vfio_pci_vf_token_user_add(vdev, 1);
	}
	vdev->refcnt++;
error:
	mutex_unlock(&vdev->reflck->lock);
	return ret;
}
static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
{
	if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
		u8 pin;

		if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) ||
		    vdev->nointx || vdev->pdev->is_virtfn)
			return 0;

		pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);

		return pin ? 1 : 0;
	} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
		u8 pos;
		u16 flags;

		pos = vdev->pdev->msi_cap;
		if (pos) {
			pci_read_config_word(vdev->pdev,
					     pos + PCI_MSI_FLAGS, &flags);
			return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1);
		}
	} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
		u8 pos;
		u16 flags;

		pos = vdev->pdev->msix_cap;
		if (pos) {
			pci_read_config_word(vdev->pdev,
					     pos + PCI_MSIX_FLAGS, &flags);

			return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
		}
	} else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) {
		if (pci_is_pcie(vdev->pdev))
			return 1;
	} else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) {
		return 1;
	}

	return 0;
}
static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
{
	(*(int *)data)++;
	return 0;
}
struct vfio_pci_fill_info {
	int max;
	int cur;
	struct vfio_pci_dependent_device *devices;
};
static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_fill_info *fill = data;
	struct iommu_group *iommu_group;

	if (fill->cur == fill->max)
		return -EAGAIN; /* Something changed, try again */

	iommu_group = iommu_group_get(&pdev->dev);
	if (!iommu_group)
		return -EPERM; /* Cannot reset non-isolated devices */

	fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
	fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
	fill->devices[fill->cur].bus = pdev->bus->number;
	fill->devices[fill->cur].devfn = pdev->devfn;
	fill->cur++;
	iommu_group_put(iommu_group);
	return 0;
}
struct vfio_pci_group_entry {
	struct vfio_group *group;
	int id;
};

struct vfio_pci_group_info {
	int count;
	struct vfio_pci_group_entry *groups;
};
static int vfio_pci_validate_devs(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_group_info *info = data;
	struct iommu_group *group;
	int id, i;

	group = iommu_group_get(&pdev->dev);
	if (!group)
		return -EPERM;

	id = iommu_group_id(group);

	for (i = 0; i < info->count; i++)
		if (info->groups[i].id == id)
			break;

	iommu_group_put(group);

	return (i == info->count) ? -EINVAL : 0;
}
static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot)
{
	for (; pdev; pdev = pdev->bus->self)
		if (pdev->bus == slot->bus)
			return (pdev->slot == slot);
	return false;
}
struct vfio_pci_walk_info {
	int (*fn)(struct pci_dev *, void *data);
	void *data;
	struct pci_dev *pdev;
	bool slot;
	int ret;
};
static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_walk_info *walk = data;

	if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot))
		walk->ret = walk->fn(pdev, walk->data);

	return walk->ret;
}
static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
					 int (*fn)(struct pci_dev *,
						   void *data), void *data,
					 bool slot)
{
	struct vfio_pci_walk_info walk = {
		.fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0,
	};

	pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk);

	return walk.ret;
}
static int msix_mmappable_cap(struct vfio_pci_device *vdev,
			      struct vfio_info_cap *caps)
{
	struct vfio_info_cap_header header = {
		.id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE,
		.version = 1
	};

	return vfio_info_add_capability(caps, &header, sizeof(header));
}
int vfio_pci_register_dev_region(struct vfio_pci_device *vdev,
				 unsigned int type, unsigned int subtype,
				 const struct vfio_pci_regops *ops,
				 size_t size, u32 flags, void *data)
{
	struct vfio_pci_region *region;

	region = krealloc(vdev->region,
			  (vdev->num_regions + 1) * sizeof(*region),
			  GFP_KERNEL);
	if (!region)
		return -ENOMEM;

	vdev->region = region;
	vdev->region[vdev->num_regions].type = type;
	vdev->region[vdev->num_regions].subtype = subtype;
	vdev->region[vdev->num_regions].ops = ops;
	vdev->region[vdev->num_regions].size = size;
	vdev->region[vdev->num_regions].flags = flags;
	vdev->region[vdev->num_regions].data = data;

	vdev->num_regions++;

	return 0;
}
struct vfio_devices {
	struct vfio_pci_device **devices;
	int cur_index;
	int max_index;
};
static long vfio_pci_ioctl(struct vfio_device *core_vdev,
			   unsigned int cmd, unsigned long arg)
{
	struct vfio_pci_device *vdev =
		container_of(core_vdev, struct vfio_pci_device, vdev);
	unsigned long minsz;

	if (cmd == VFIO_DEVICE_GET_INFO) {
		struct vfio_device_info info;
		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
		unsigned long capsz;
		int ret;

		minsz = offsetofend(struct vfio_device_info, num_irqs);

		/* For backward compatibility, cannot require this */
		capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		if (info.argsz >= capsz) {
			minsz = capsz;
			info.cap_offset = 0;
		}

		info.flags = VFIO_DEVICE_FLAGS_PCI;

		if (vdev->reset_works)
			info.flags |= VFIO_DEVICE_FLAGS_RESET;

		info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
		info.num_irqs = VFIO_PCI_NUM_IRQS;

		ret = vfio_pci_info_zdev_add_caps(vdev, &caps);
		if (ret && ret != -ENODEV) {
			pci_warn(vdev->pdev, "Failed to setup zPCI info capabilities\n");
			return ret;
		}

		if (caps.size) {
			info.flags |= VFIO_DEVICE_FLAGS_CAPS;
			if (info.argsz < sizeof(info) + caps.size) {
				info.argsz = sizeof(info) + caps.size;
			} else {
				vfio_info_cap_shift(&caps, sizeof(info));
				if (copy_to_user((void __user *)arg +
						 sizeof(info), caps.buf,
						 caps.size)) {
					kfree(caps.buf);
					return -EFAULT;
				}
				info.cap_offset = sizeof(info);
			}

			kfree(caps.buf);
		}

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
		struct pci_dev *pdev = vdev->pdev;
		struct vfio_region_info info;
		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
		int i, ret;

		minsz = offsetofend(struct vfio_region_info, offset);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		switch (info.index) {
		case VFIO_PCI_CONFIG_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = pdev->cfg_size;
			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			break;
		case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = pci_resource_len(pdev, info.index);
			if (!info.size) {
				info.flags = 0;
				break;
			}

			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			if (vdev->bar_mmap_supported[info.index]) {
				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
				if (info.index == vdev->msix_bar) {
					ret = msix_mmappable_cap(vdev, &caps);
					if (ret)
						return ret;
				}
			}

			break;
		case VFIO_PCI_ROM_REGION_INDEX:
		{
			void __iomem *io;
			size_t size;
			u16 cmd;

			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.flags = 0;

			/* Report the BAR size, not the ROM size */
			info.size = pci_resource_len(pdev, info.index);
			if (!info.size) {
				/* Shadow ROMs appear as PCI option ROMs */
				if (pdev->resource[PCI_ROM_RESOURCE].flags &
							IORESOURCE_ROM_SHADOW)
					info.size = 0x20000;
				else
					break;
			}

			/*
			 * Is it really there? Enable memory decode for
			 * implicit access in pci_map_rom().
			 */
			cmd = vfio_pci_memory_lock_and_enable(vdev);
			io = pci_map_rom(pdev, &size);
			if (io) {
				info.flags = VFIO_REGION_INFO_FLAG_READ;
				pci_unmap_rom(pdev, io);
			} else {
				info.size = 0;
			}
			vfio_pci_memory_unlock_and_restore(vdev, cmd);

			break;
		}
		case VFIO_PCI_VGA_REGION_INDEX:
			if (!vdev->has_vga)
				return -EINVAL;

			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = 0xc0000;
			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;

			break;
		default:
		{
			struct vfio_region_info_cap_type cap_type = {
					.header.id = VFIO_REGION_INFO_CAP_TYPE,
					.header.version = 1 };

			if (info.index >=
			    VFIO_PCI_NUM_REGIONS + vdev->num_regions)
				return -EINVAL;
			info.index = array_index_nospec(info.index,
							VFIO_PCI_NUM_REGIONS +
							vdev->num_regions);

			i = info.index - VFIO_PCI_NUM_REGIONS;

			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = vdev->region[i].size;
			info.flags = vdev->region[i].flags;

			cap_type.type = vdev->region[i].type;
			cap_type.subtype = vdev->region[i].subtype;

			ret = vfio_info_add_capability(&caps, &cap_type.header,
						       sizeof(cap_type));
			if (ret)
				return ret;

			if (vdev->region[i].ops->add_capability) {
				ret = vdev->region[i].ops->add_capability(vdev,
						&vdev->region[i], &caps);
				if (ret)
					return ret;
			}
		}
		}

		if (caps.size) {
			info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
			if (info.argsz < sizeof(info) + caps.size) {
				info.argsz = sizeof(info) + caps.size;
				info.cap_offset = 0;
			} else {
				vfio_info_cap_shift(&caps, sizeof(info));
				if (copy_to_user((void __user *)arg +
						 sizeof(info), caps.buf,
						 caps.size)) {
					kfree(caps.buf);
					return -EFAULT;
				}
				info.cap_offset = sizeof(info);
			}

			kfree(caps.buf);
		}

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
		struct vfio_irq_info info;

		minsz = offsetofend(struct vfio_irq_info, count);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
			return -EINVAL;

		switch (info.index) {
		case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
		case VFIO_PCI_REQ_IRQ_INDEX:
			break;
		case VFIO_PCI_ERR_IRQ_INDEX:
			if (pci_is_pcie(vdev->pdev))
				break;
			fallthrough;
		default:
			return -EINVAL;
		}

		info.flags = VFIO_IRQ_INFO_EVENTFD;

		info.count = vfio_pci_get_irq_count(vdev, info.index);

		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
				       VFIO_IRQ_INFO_AUTOMASKED);
		else
			info.flags |= VFIO_IRQ_INFO_NORESIZE;

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

== VFIO_DEVICE_SET_IRQS
) {
1017 struct vfio_irq_set hdr
;
1020 size_t data_size
= 0;
1022 minsz
= offsetofend(struct vfio_irq_set
, count
);
1024 if (copy_from_user(&hdr
, (void __user
*)arg
, minsz
))
1027 max
= vfio_pci_get_irq_count(vdev
, hdr
.index
);
1029 ret
= vfio_set_irqs_validate_and_prepare(&hdr
, max
,
1030 VFIO_PCI_NUM_IRQS
, &data_size
);
1035 data
= memdup_user((void __user
*)(arg
+ minsz
),
1038 return PTR_ERR(data
);
1041 mutex_lock(&vdev
->igate
);
1043 ret
= vfio_pci_set_irqs_ioctl(vdev
, hdr
.flags
, hdr
.index
,
1044 hdr
.start
, hdr
.count
, data
);
1046 mutex_unlock(&vdev
->igate
);
1051 } else if (cmd
== VFIO_DEVICE_RESET
) {
1054 if (!vdev
->reset_works
)
1057 vfio_pci_zap_and_down_write_memory_lock(vdev
);
1058 ret
= pci_try_reset_function(vdev
->pdev
);
1059 up_write(&vdev
->memory_lock
);
1063 } else if (cmd
== VFIO_DEVICE_GET_PCI_HOT_RESET_INFO
) {
1064 struct vfio_pci_hot_reset_info hdr
;
1065 struct vfio_pci_fill_info fill
= { 0 };
1066 struct vfio_pci_dependent_device
*devices
= NULL
;
1070 minsz
= offsetofend(struct vfio_pci_hot_reset_info
, count
);
1072 if (copy_from_user(&hdr
, (void __user
*)arg
, minsz
))
1075 if (hdr
.argsz
< minsz
)
1080 /* Can we do a slot or bus reset or neither? */
1081 if (!pci_probe_reset_slot(vdev
->pdev
->slot
))
1083 else if (pci_probe_reset_bus(vdev
->pdev
->bus
))
1086 /* How many devices are affected? */
1087 ret
= vfio_pci_for_each_slot_or_bus(vdev
->pdev
,
1088 vfio_pci_count_devs
,
1093 WARN_ON(!fill
.max
); /* Should always be at least one */
1096 * If there's enough space, fill it now, otherwise return
1097 * -ENOSPC and the number of devices affected.
1099 if (hdr
.argsz
< sizeof(hdr
) + (fill
.max
* sizeof(*devices
))) {
1101 hdr
.count
= fill
.max
;
1102 goto reset_info_exit
;
1105 devices
= kcalloc(fill
.max
, sizeof(*devices
), GFP_KERNEL
);
1109 fill
.devices
= devices
;
1111 ret
= vfio_pci_for_each_slot_or_bus(vdev
->pdev
,
1116 * If a device was removed between counting and filling,
1117 * we may come up short of fill.max. If a device was
1118 * added, we'll have a return of -EAGAIN above.
1121 hdr
.count
= fill
.cur
;
1124 if (copy_to_user((void __user
*)arg
, &hdr
, minsz
))
1128 if (copy_to_user((void __user
*)(arg
+ minsz
), devices
,
1129 hdr
.count
* sizeof(*devices
)))
1136 } else if (cmd
== VFIO_DEVICE_PCI_HOT_RESET
) {
1137 struct vfio_pci_hot_reset hdr
;
1139 struct vfio_pci_group_entry
*groups
;
1140 struct vfio_pci_group_info info
;
1141 struct vfio_devices devs
= { .cur_index
= 0 };
1143 int i
, group_idx
, mem_idx
= 0, count
= 0, ret
= 0;
1145 minsz
= offsetofend(struct vfio_pci_hot_reset
, count
);
1147 if (copy_from_user(&hdr
, (void __user
*)arg
, minsz
))
1150 if (hdr
.argsz
< minsz
|| hdr
.flags
)
1153 /* Can we do a slot or bus reset or neither? */
1154 if (!pci_probe_reset_slot(vdev
->pdev
->slot
))
1156 else if (pci_probe_reset_bus(vdev
->pdev
->bus
))
1160 * We can't let userspace give us an arbitrarily large
1161 * buffer to copy, so verify how many we think there
1162 * could be. Note groups can have multiple devices so
1163 * one group per device is the max.
1165 ret
= vfio_pci_for_each_slot_or_bus(vdev
->pdev
,
1166 vfio_pci_count_devs
,
1171 /* Somewhere between 1 and count is OK */
1172 if (!hdr
.count
|| hdr
.count
> count
)
1175 group_fds
= kcalloc(hdr
.count
, sizeof(*group_fds
), GFP_KERNEL
);
1176 groups
= kcalloc(hdr
.count
, sizeof(*groups
), GFP_KERNEL
);
1177 if (!group_fds
|| !groups
) {
1183 if (copy_from_user(group_fds
, (void __user
*)(arg
+ minsz
),
1184 hdr
.count
* sizeof(*group_fds
))) {
1191 * For each group_fd, get the group through the vfio external
1192 * user interface and store the group and iommu ID. This
1193 * ensures the group is held across the reset.
1195 for (group_idx
= 0; group_idx
< hdr
.count
; group_idx
++) {
1196 struct vfio_group
*group
;
1197 struct fd f
= fdget(group_fds
[group_idx
]);
1203 group
= vfio_group_get_external_user(f
.file
);
1205 if (IS_ERR(group
)) {
1206 ret
= PTR_ERR(group
);
1210 groups
[group_idx
].group
= group
;
1211 groups
[group_idx
].id
=
1212 vfio_external_user_iommu_id(group
);
1217 /* release reference to groups on error */
1219 goto hot_reset_release
;
1221 info
.count
= hdr
.count
;
1222 info
.groups
= groups
;
1225 * Test whether all the affected devices are contained
1226 * by the set of groups provided by the user.
1228 ret
= vfio_pci_for_each_slot_or_bus(vdev
->pdev
,
1229 vfio_pci_validate_devs
,
1232 goto hot_reset_release
;
1234 devs
.max_index
= count
;
1235 devs
.devices
= kcalloc(count
, sizeof(struct vfio_device
*),
1237 if (!devs
.devices
) {
1239 goto hot_reset_release
;
1243 * We need to get memory_lock for each device, but devices
1244 * can share mmap_lock, therefore we need to zap and hold
1245 * the vma_lock for each device, and only then get each
1248 ret
= vfio_pci_for_each_slot_or_bus(vdev
->pdev
,
1249 vfio_pci_try_zap_and_vma_lock_cb
,
1252 goto hot_reset_release
;
1254 for (; mem_idx
< devs
.cur_index
; mem_idx
++) {
1255 struct vfio_pci_device
*tmp
= devs
.devices
[mem_idx
];
1257 ret
= down_write_trylock(&tmp
->memory_lock
);
1260 goto hot_reset_release
;
1262 mutex_unlock(&tmp
->vma_lock
);
1265 /* User has access, do the reset */
1266 ret
= pci_reset_bus(vdev
->pdev
);
1269 for (i
= 0; i
< devs
.cur_index
; i
++) {
1270 struct vfio_pci_device
*tmp
= devs
.devices
[i
];
1273 up_write(&tmp
->memory_lock
);
1275 mutex_unlock(&tmp
->vma_lock
);
1276 vfio_device_put(&tmp
->vdev
);
1278 kfree(devs
.devices
);
1280 for (group_idx
--; group_idx
>= 0; group_idx
--)
1281 vfio_group_put_external_user(groups
[group_idx
].group
);
	} else if (cmd == VFIO_DEVICE_IOEVENTFD) {
		struct vfio_device_ioeventfd ioeventfd;
		int count;

		minsz = offsetofend(struct vfio_device_ioeventfd, fd);

		if (copy_from_user(&ioeventfd, (void __user *)arg, minsz))
			return -EFAULT;

		if (ioeventfd.argsz < minsz)
			return -EINVAL;

		if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK)
			return -EINVAL;

		count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK;

		if (hweight8(count) != 1 || ioeventfd.fd < -1)
			return -EINVAL;

		return vfio_pci_ioeventfd(vdev, ioeventfd.offset,
					  ioeventfd.data, count, ioeventfd.fd);
	} else if (cmd == VFIO_DEVICE_FEATURE) {
		struct vfio_device_feature feature;
		uuid_t uuid;

		minsz = offsetofend(struct vfio_device_feature, flags);

		if (copy_from_user(&feature, (void __user *)arg, minsz))
			return -EFAULT;

		if (feature.argsz < minsz)
			return -EINVAL;

		/* Check unknown flags */
		if (feature.flags & ~(VFIO_DEVICE_FEATURE_MASK |
				      VFIO_DEVICE_FEATURE_SET |
				      VFIO_DEVICE_FEATURE_GET |
				      VFIO_DEVICE_FEATURE_PROBE))
			return -EINVAL;

		/* GET & SET are mutually exclusive except with PROBE */
		if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
		    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
		    (feature.flags & VFIO_DEVICE_FEATURE_GET))
			return -EINVAL;

		switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
		case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
			if (!vdev->vf_token)
				return -ENOTTY;

			/*
			 * We do not support GET of the VF Token UUID as this
			 * could expose the token of the previous device user.
			 */
			if (feature.flags & VFIO_DEVICE_FEATURE_GET)
				return -EINVAL;

			if (feature.flags & VFIO_DEVICE_FEATURE_PROBE)
				return 0;

			/* Don't SET unless told to do so */
			if (!(feature.flags & VFIO_DEVICE_FEATURE_SET))
				return -EINVAL;

			if (feature.argsz < minsz + sizeof(uuid))
				return -EINVAL;

			if (copy_from_user(&uuid, (void __user *)(arg + minsz),
					   sizeof(uuid)))
				return -EFAULT;

			mutex_lock(&vdev->vf_token->lock);
			uuid_copy(&vdev->vf_token->uuid, &uuid);
			mutex_unlock(&vdev->vf_token->lock);

			return 0;
		default:
			return -ENOTTY;
		}
	}

	return -ENOTTY;
}
static ssize_t vfio_pci_rw(struct vfio_pci_device *vdev, char __user *buf,
			   size_t count, loff_t *ppos, bool iswrite)
{
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);

	if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
		return -EINVAL;

	switch (index) {
	case VFIO_PCI_CONFIG_REGION_INDEX:
		return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);

	case VFIO_PCI_ROM_REGION_INDEX:
		if (iswrite)
			return -EINVAL;
		return vfio_pci_bar_rw(vdev, buf, count, ppos, false);

	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
		return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite);

	case VFIO_PCI_VGA_REGION_INDEX:
		return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite);
	default:
		index -= VFIO_PCI_NUM_REGIONS;
		return vdev->region[index].ops->rw(vdev, buf,
						   count, ppos, iswrite);
	}

	return -EINVAL;
}
static ssize_t vfio_pci_read(struct vfio_device *core_vdev, char __user *buf,
			     size_t count, loff_t *ppos)
{
	struct vfio_pci_device *vdev =
		container_of(core_vdev, struct vfio_pci_device, vdev);

	if (!count)
		return 0;

	return vfio_pci_rw(vdev, buf, count, ppos, false);
}
static ssize_t vfio_pci_write(struct vfio_device *core_vdev, const char __user *buf,
			      size_t count, loff_t *ppos)
{
	struct vfio_pci_device *vdev =
		container_of(core_vdev, struct vfio_pci_device, vdev);

	if (!count)
		return 0;

	return vfio_pci_rw(vdev, (char __user *)buf, count, ppos, true);
}
/* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */
static int vfio_pci_zap_and_vma_lock(struct vfio_pci_device *vdev, bool try)
{
	struct vfio_pci_mmap_vma *mmap_vma, *tmp;

	/*
	 * Lock ordering:
	 * vma_lock is nested under mmap_lock for vm_ops callback paths.
	 * The memory_lock semaphore is used by both code paths calling
	 * into this function to zap vmas and the vm_ops.fault callback
	 * to protect the memory enable state of the device.
	 *
	 * When zapping vmas we need to maintain the mmap_lock => vma_lock
	 * ordering, which requires using vma_lock to walk vma_list to
	 * acquire an mm, then dropping vma_lock to get the mmap_lock and
	 * reacquiring vma_lock.  This logic is derived from similar
	 * requirements in uverbs_user_mmap_disassociate().
	 *
	 * mmap_lock must always be the top-level lock when it is taken.
	 * Therefore we can only hold the memory_lock write lock when
	 * vma_list is empty, as we'd need to take mmap_lock to clear
	 * entries.  vma_list can only be guaranteed empty when holding
	 * vma_lock, thus memory_lock is nested under vma_lock.
	 *
	 * This enables the vm_ops.fault callback to acquire vma_lock,
	 * followed by memory_lock read lock, while already holding
	 * mmap_lock without risk of deadlock.
	 */
	while (1) {
		struct mm_struct *mm = NULL;

		if (try) {
			if (!mutex_trylock(&vdev->vma_lock))
				return 0;
		} else {
			mutex_lock(&vdev->vma_lock);
		}
		while (!list_empty(&vdev->vma_list)) {
			mmap_vma = list_first_entry(&vdev->vma_list,
						    struct vfio_pci_mmap_vma,
						    vma_next);
			mm = mmap_vma->vma->vm_mm;
			if (mmget_not_zero(mm))
				break;

			list_del(&mmap_vma->vma_next);
			kfree(mmap_vma);
			mm = NULL;
		}
		if (!mm)
			return 1;
		mutex_unlock(&vdev->vma_lock);

		if (try) {
			if (!mmap_read_trylock(mm)) {
				mmput(mm);
				return 0;
			}
		} else {
			mmap_read_lock(mm);
		}
		if (try) {
			if (!mutex_trylock(&vdev->vma_lock)) {
				mmap_read_unlock(mm);
				mmput(mm);
				return 0;
			}
		} else {
			mutex_lock(&vdev->vma_lock);
		}
		list_for_each_entry_safe(mmap_vma, tmp,
					 &vdev->vma_list, vma_next) {
			struct vm_area_struct *vma = mmap_vma->vma;

			if (vma->vm_mm != mm)
				continue;

			list_del(&mmap_vma->vma_next);
			kfree(mmap_vma);

			zap_vma_ptes(vma, vma->vm_start,
				     vma->vm_end - vma->vm_start);
		}
		mutex_unlock(&vdev->vma_lock);
		mmap_read_unlock(mm);
		mmput(mm);
	}
}
void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_device *vdev)
{
	vfio_pci_zap_and_vma_lock(vdev, false);
	down_write(&vdev->memory_lock);
	mutex_unlock(&vdev->vma_lock);
}
u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_device *vdev)
{
	u16 cmd;

	down_write(&vdev->memory_lock);
	pci_read_config_word(vdev->pdev, PCI_COMMAND, &cmd);
	if (!(cmd & PCI_COMMAND_MEMORY))
		pci_write_config_word(vdev->pdev, PCI_COMMAND,
				      cmd | PCI_COMMAND_MEMORY);

	return cmd;
}
void vfio_pci_memory_unlock_and_restore(struct vfio_pci_device *vdev, u16 cmd)
{
	pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd);
	up_write(&vdev->memory_lock);
}
/* Caller holds vma_lock */
static int __vfio_pci_add_vma(struct vfio_pci_device *vdev,
			      struct vm_area_struct *vma)
{
	struct vfio_pci_mmap_vma *mmap_vma;

	mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL);
	if (!mmap_vma)
		return -ENOMEM;

	mmap_vma->vma = vma;
	list_add(&mmap_vma->vma_next, &vdev->vma_list);

	return 0;
}
/*
 * Zap mmaps on open so that we can fault them in on access and therefore
 * our vma_list only tracks mappings accessed since last zap.
 */
static void vfio_pci_mmap_open(struct vm_area_struct *vma)
{
	zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
}
static void vfio_pci_mmap_close(struct vm_area_struct *vma)
{
	struct vfio_pci_device *vdev = vma->vm_private_data;
	struct vfio_pci_mmap_vma *mmap_vma;

	mutex_lock(&vdev->vma_lock);
	list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
		if (mmap_vma->vma == vma) {
			list_del(&mmap_vma->vma_next);
			kfree(mmap_vma);
			break;
		}
	}
	mutex_unlock(&vdev->vma_lock);
}
static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct vfio_pci_device *vdev = vma->vm_private_data;
	struct vfio_pci_mmap_vma *mmap_vma;
	vm_fault_t ret = VM_FAULT_NOPAGE;

	mutex_lock(&vdev->vma_lock);
	down_read(&vdev->memory_lock);

	if (!__vfio_pci_memory_enabled(vdev)) {
		ret = VM_FAULT_SIGBUS;
		goto up_out;
	}

	/*
	 * We populate the whole vma on fault, so we need to test whether
	 * the vma has already been mapped, such as for concurrent faults
	 * to the same vma.  io_remap_pfn_range() will trigger a BUG_ON if
	 * we ask it to fill the same range again.
	 */
	list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
		if (mmap_vma->vma == vma)
			goto up_out;
	}

	if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
			       vma->vm_end - vma->vm_start,
			       vma->vm_page_prot)) {
		ret = VM_FAULT_SIGBUS;
		zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
		goto up_out;
	}

	if (__vfio_pci_add_vma(vdev, vma)) {
		ret = VM_FAULT_OOM;
		zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
	}

up_out:
	up_read(&vdev->memory_lock);
	mutex_unlock(&vdev->vma_lock);
	return ret;
}
= {
1628 .open
= vfio_pci_mmap_open
,
1629 .close
= vfio_pci_mmap_close
,
1630 .fault
= vfio_pci_mmap_fault
,
static int vfio_pci_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma)
{
	struct vfio_pci_device *vdev =
		container_of(core_vdev, struct vfio_pci_device, vdev);
	struct pci_dev *pdev = vdev->pdev;
	unsigned int index;
	u64 phys_len, req_len, pgoff, req_start;
	int ret;

	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);

	if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
		return -EINVAL;
	if (vma->vm_end < vma->vm_start)
		return -EINVAL;
	if ((vma->vm_flags & VM_SHARED) == 0)
		return -EINVAL;
	if (index >= VFIO_PCI_NUM_REGIONS) {
		int regnum = index - VFIO_PCI_NUM_REGIONS;
		struct vfio_pci_region *region = vdev->region + regnum;

		if (region->ops && region->ops->mmap &&
		    (region->flags & VFIO_REGION_INFO_FLAG_MMAP))
			return region->ops->mmap(vdev, region, vma);
		return -EINVAL;
	}
	if (index >= VFIO_PCI_ROM_REGION_INDEX)
		return -EINVAL;
	if (!vdev->bar_mmap_supported[index])
		return -EINVAL;

	phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
	req_len = vma->vm_end - vma->vm_start;
	pgoff = vma->vm_pgoff &
		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
	req_start = pgoff << PAGE_SHIFT;

	if (req_start + req_len > phys_len)
		return -EINVAL;

	/*
	 * Even though we don't make use of the barmap for the mmap,
	 * we need to request the region and the barmap tracks that.
	 */
	if (!vdev->barmap[index]) {
		ret = pci_request_selected_regions(pdev,
						   1 << index, "vfio-pci");
		if (ret)
			return ret;

		vdev->barmap[index] = pci_iomap(pdev, index, 0);
		if (!vdev->barmap[index]) {
			pci_release_selected_regions(pdev, 1 << index);
			return -ENOMEM;
		}
	}

	vma->vm_private_data = vdev;
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;

	/*
	 * See remap_pfn_range(), called from vfio_pci_fault() but we can't
	 * change vm_flags within the fault handler.  Set them now.
	 */
	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
	vma->vm_ops = &vfio_pci_mmap_ops;

	return 0;
}
static void vfio_pci_request(struct vfio_device *core_vdev, unsigned int count)
{
	struct vfio_pci_device *vdev =
		container_of(core_vdev, struct vfio_pci_device, vdev);
	struct pci_dev *pdev = vdev->pdev;

	mutex_lock(&vdev->igate);

	if (vdev->req_trigger) {
		if (!(count % 10))
			pci_notice_ratelimited(pdev,
				"Relaying device request to user (#%u)\n",
				count);
		eventfd_signal(vdev->req_trigger, 1);
	} else if (count == 0) {
		pci_warn(pdev,
			"No device request channel registered, blocked until released by user\n");
	}

	mutex_unlock(&vdev->igate);
}
static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev,
				      bool vf_token, uuid_t *uuid)
{
	/*
	 * There's always some degree of trust or collaboration between SR-IOV
	 * PF and VFs, even if just that the PF hosts the SR-IOV capability and
	 * can disrupt VFs with a reset, but often the PF has more explicit
	 * access to deny service to the VF or access data passed through the
	 * VF.  We therefore require an opt-in via a shared VF token (UUID) to
	 * represent this trust.  This both prevents that a VF driver might
	 * assume the PF driver is a trusted, in-kernel driver, and also that
	 * a PF driver might be replaced with a rogue driver, unknown to in-use
	 * VF drivers.
	 *
	 * Therefore when presented with a VF, if the PF is a vfio device and
	 * it is bound to the vfio-pci driver, the user needs to provide a VF
	 * token to access the device, in the form of appending a vf_token to
	 * the device name, for example:
	 *
	 * "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3"
	 *
	 * When presented with a PF which has VFs in use, the user must also
	 * provide the current VF token to prove collaboration with existing
	 * VF users.  If VFs are not in use, the VF token provided for the PF
	 * device will act to set the VF token.
	 *
	 * If the VF token is provided but unused, an error is generated.
	 */
	if (!vdev->pdev->is_virtfn && !vdev->vf_token && !vf_token)
		return 0; /* No VF token provided or required */

	if (vdev->pdev->is_virtfn) {
		struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev);
		bool match;

		if (!pf_vdev) {
			if (!vf_token)
				return 0; /* PF is not vfio-pci, no VF token */

			pci_info_ratelimited(vdev->pdev,
				"VF token incorrectly provided, PF not bound to vfio-pci\n");
			return -EINVAL;
		}

		if (!vf_token) {
			vfio_device_put(&pf_vdev->vdev);
			pci_info_ratelimited(vdev->pdev,
				"VF token required to access device\n");
			return -EACCES;
		}

		mutex_lock(&pf_vdev->vf_token->lock);
		match = uuid_equal(uuid, &pf_vdev->vf_token->uuid);
		mutex_unlock(&pf_vdev->vf_token->lock);

		vfio_device_put(&pf_vdev->vdev);

		if (!match) {
			pci_info_ratelimited(vdev->pdev,
				"Incorrect VF token provided for device\n");
			return -EACCES;
		}
	} else if (vdev->vf_token) {
		mutex_lock(&vdev->vf_token->lock);
		if (vdev->vf_token->users) {
			if (!vf_token) {
				mutex_unlock(&vdev->vf_token->lock);
				pci_info_ratelimited(vdev->pdev,
					"VF token required to access device\n");
				return -EACCES;
			}

			if (!uuid_equal(uuid, &vdev->vf_token->uuid)) {
				mutex_unlock(&vdev->vf_token->lock);
				pci_info_ratelimited(vdev->pdev,
					"Incorrect VF token provided for device\n");
				return -EACCES;
			}
		} else if (vf_token) {
			uuid_copy(&vdev->vf_token->uuid, uuid);
		}

		mutex_unlock(&vdev->vf_token->lock);
	} else if (vf_token) {
		pci_info_ratelimited(vdev->pdev,
			"VF token incorrectly provided, not a PF or VF\n");
		return -EINVAL;
	}

	return 0;
}
#define VF_TOKEN_ARG "vf_token="

static int vfio_pci_match(struct vfio_device *core_vdev, char *buf)
{
	struct vfio_pci_device *vdev =
		container_of(core_vdev, struct vfio_pci_device, vdev);
	bool vf_token = false;
	uuid_t uuid;
	int ret;

	if (strncmp(pci_name(vdev->pdev), buf, strlen(pci_name(vdev->pdev))))
		return 0; /* No match */

	if (strlen(buf) > strlen(pci_name(vdev->pdev))) {
		buf += strlen(pci_name(vdev->pdev));

		if (*buf != ' ')
			return 0; /* No match: non-whitespace after name */

		while (*buf) {
			if (*buf == ' ') {
				buf++;
				continue;
			}

			if (!vf_token && !strncmp(buf, VF_TOKEN_ARG,
						  strlen(VF_TOKEN_ARG))) {
				buf += strlen(VF_TOKEN_ARG);

				if (strlen(buf) < UUID_STRING_LEN)
					return -EINVAL;

				ret = uuid_parse(buf, &uuid);
				if (ret)
					return ret;

				vf_token = true;
				buf += UUID_STRING_LEN;
			} else {
				/* Unknown/duplicate option */
				return -EINVAL;
			}
		}
	}

	ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid);
	if (ret)
		return ret;

	return 1; /* Match */
}
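
/*
 * A sketch of the strings vfio_pci_match() accepts, assuming a device at
 * address 0000:04:10.0 (the address and UUID are examples only, reusing the
 * UUID from the vfio_pci_validate_vf_token() comment above):
 *
 *   "0000:04:10.0"                                  - plain name match
 *   "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3"
 *                                                   - name match plus VF token,
 *                                                     validated against the PF
 */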
static const struct vfio_device_ops vfio_pci_ops = {
	.name		= "vfio-pci",
	.open		= vfio_pci_open,
	.release	= vfio_pci_release,
	.ioctl		= vfio_pci_ioctl,
	.read		= vfio_pci_read,
	.write		= vfio_pci_write,
	.mmap		= vfio_pci_mmap,
	.request	= vfio_pci_request,
	.match		= vfio_pci_match,
};
static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev);
static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck);
static int vfio_pci_bus_notifier(struct notifier_block *nb,
				 unsigned long action, void *data)
{
	struct vfio_pci_device *vdev = container_of(nb,
						    struct vfio_pci_device, nb);
	struct device *dev = data;
	struct pci_dev *pdev = to_pci_dev(dev);
	struct pci_dev *physfn = pci_physfn(pdev);

	if (action == BUS_NOTIFY_ADD_DEVICE &&
	    pdev->is_virtfn && physfn == vdev->pdev) {
		pci_info(vdev->pdev, "Captured SR-IOV VF %s driver_override\n",
			 pci_name(pdev));
		pdev->driver_override = kasprintf(GFP_KERNEL, "%s",
						  vfio_pci_ops.name);
	} else if (action == BUS_NOTIFY_BOUND_DRIVER &&
		   pdev->is_virtfn && physfn == vdev->pdev) {
		struct pci_driver *drv = pci_dev_driver(pdev);

		if (drv && drv != &vfio_pci_driver)
			pci_warn(vdev->pdev,
				 "VF %s bound to driver %s while PF bound to vfio-pci\n",
				 pci_name(pdev), drv->name);
	}

	return 0;
}
*vdev
)
1915 struct pci_dev
*pdev
= vdev
->pdev
;
1918 if (!pdev
->is_physfn
)
1921 vdev
->vf_token
= kzalloc(sizeof(*vdev
->vf_token
), GFP_KERNEL
);
1922 if (!vdev
->vf_token
)
1925 mutex_init(&vdev
->vf_token
->lock
);
1926 uuid_gen(&vdev
->vf_token
->uuid
);
1928 vdev
->nb
.notifier_call
= vfio_pci_bus_notifier
;
1929 ret
= bus_register_notifier(&pci_bus_type
, &vdev
->nb
);
1931 kfree(vdev
->vf_token
);
1937 static void vfio_pci_vf_uninit(struct vfio_pci_device
*vdev
)
1939 if (!vdev
->vf_token
)
1942 bus_unregister_notifier(&pci_bus_type
, &vdev
->nb
);
1943 WARN_ON(vdev
->vf_token
->users
);
1944 mutex_destroy(&vdev
->vf_token
->lock
);
1945 kfree(vdev
->vf_token
);
static int vfio_pci_vga_init(struct vfio_pci_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	int ret;

	if (!vfio_pci_is_vga(pdev))
		return 0;

	ret = vga_client_register(pdev, vfio_pci_set_decode);
	if (ret)
		return ret;
	vga_set_legacy_decoding(pdev, vfio_pci_set_decode(pdev, false));
	return 0;
}
static void vfio_pci_vga_uninit(struct vfio_pci_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;

	if (!vfio_pci_is_vga(pdev))
		return;
	vga_client_unregister(pdev);
	vga_set_legacy_decoding(pdev, VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
					      VGA_RSRC_LEGACY_IO |
					      VGA_RSRC_LEGACY_MEM);
}
static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct vfio_pci_device *vdev;
	struct iommu_group *group;
	int ret;

	if (vfio_pci_is_denylisted(pdev))
		return -EINVAL;

	if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
		return -EINVAL;

	/*
	 * Prevent binding to PFs with VFs enabled, the VFs might be in use
	 * by the host or other users.  We cannot capture the VFs if they
	 * already exist, nor can we track VF users.  Disabling SR-IOV here
	 * would initiate removing the VFs, which would unbind the driver,
	 * which is prone to blocking if that VF is also in use by vfio-pci.
	 * Just reject these PFs and let the user sort it out.
	 */
	if (pci_num_vf(pdev)) {
		pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n");
		return -EBUSY;
	}

	group = vfio_iommu_group_get(&pdev->dev);
	if (!group)
		return -EINVAL;

	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
	if (!vdev) {
		ret = -ENOMEM;
		goto out_group_put;
	}

	vfio_init_group_dev(&vdev->vdev, &pdev->dev, &vfio_pci_ops);
	vdev->pdev = pdev;
	vdev->irq_type = VFIO_PCI_NUM_IRQS;
	mutex_init(&vdev->igate);
	spin_lock_init(&vdev->irqlock);
	mutex_init(&vdev->ioeventfds_lock);
	INIT_LIST_HEAD(&vdev->dummy_resources_list);
	INIT_LIST_HEAD(&vdev->ioeventfds_list);
	mutex_init(&vdev->vma_lock);
	INIT_LIST_HEAD(&vdev->vma_list);
	init_rwsem(&vdev->memory_lock);

	ret = vfio_pci_reflck_attach(vdev);
	if (ret)
		goto out_free;
	ret = vfio_pci_vf_init(vdev);
	if (ret)
		goto out_reflck;
	ret = vfio_pci_vga_init(vdev);
	if (ret)
		goto out_vf;

	vfio_pci_probe_power_state(vdev);

	if (!disable_idle_d3) {
		/*
		 * pci-core sets the device power state to an unknown value at
		 * bootup and after being removed from a driver.  The only
		 * transition it allows from this unknown state is to D0, which
		 * typically happens when a driver calls pci_enable_device().
		 * We're not ready to enable the device yet, but we do want to
		 * be able to get to D3.  Therefore first do a D0 transition
		 * before going to D3.
		 */
		vfio_pci_set_power_state(vdev, PCI_D0);
		vfio_pci_set_power_state(vdev, PCI_D3hot);
	}

	ret = vfio_register_group_dev(&vdev->vdev);
	if (ret)
		goto out_power;
	dev_set_drvdata(&pdev->dev, vdev);
	return 0;

out_power:
	if (!disable_idle_d3)
		vfio_pci_set_power_state(vdev, PCI_D0);
out_vf:
	vfio_pci_vf_uninit(vdev);
out_reflck:
	vfio_pci_reflck_put(vdev->reflck);
out_free:
	kfree(vdev->pm_save);
	kfree(vdev);
out_group_put:
	vfio_iommu_group_put(group, &pdev->dev);
	return ret;
}
static void vfio_pci_remove(struct pci_dev *pdev)
{
	struct vfio_pci_device *vdev = dev_get_drvdata(&pdev->dev);

	pci_disable_sriov(pdev);

	vfio_unregister_group_dev(&vdev->vdev);

	vfio_pci_vf_uninit(vdev);
	vfio_pci_reflck_put(vdev->reflck);
	vfio_pci_vga_uninit(vdev);

	vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);

	if (!disable_idle_d3)
		vfio_pci_set_power_state(vdev, PCI_D0);

	mutex_destroy(&vdev->ioeventfds_lock);
	kfree(vdev->region);
	kfree(vdev->pm_save);
	kfree(vdev);
}
static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
						  pci_channel_state_t state)
{
	struct vfio_pci_device *vdev;
	struct vfio_device *device;

	device = vfio_device_get_from_dev(&pdev->dev);
	if (device == NULL)
		return PCI_ERS_RESULT_DISCONNECT;

	vdev = container_of(device, struct vfio_pci_device, vdev);

	mutex_lock(&vdev->igate);

	if (vdev->err_trigger)
		eventfd_signal(vdev->err_trigger, 1);

	mutex_unlock(&vdev->igate);

	vfio_device_put(device);

	return PCI_ERS_RESULT_CAN_RECOVER;
}
static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn)
{
	struct vfio_device *device;
	int ret = 0;

	might_sleep();

	if (!enable_sriov)
		return -ENOENT;

	device = vfio_device_get_from_dev(&pdev->dev);
	if (!device)
		return -ENODEV;

	if (nr_virtfn == 0)
		pci_disable_sriov(pdev);
	else
		ret = pci_enable_sriov(pdev, nr_virtfn);

	vfio_device_put(device);

	return ret < 0 ? ret : nr_virtfn;
}
static const struct pci_error_handlers vfio_err_handlers = {
	.error_detected = vfio_pci_aer_err_detected,
};

static struct pci_driver vfio_pci_driver = {
	.name			= "vfio-pci",
	.id_table		= NULL, /* only dynamic ids */
	.probe			= vfio_pci_probe,
	.remove			= vfio_pci_remove,
	.sriov_configure	= vfio_pci_sriov_configure,
	.err_handler		= &vfio_err_handlers,
};
static DEFINE_MUTEX(reflck_lock);

static struct vfio_pci_reflck *vfio_pci_reflck_alloc(void)
{
	struct vfio_pci_reflck *reflck;

	reflck = kzalloc(sizeof(*reflck), GFP_KERNEL);
	if (!reflck)
		return ERR_PTR(-ENOMEM);

	kref_init(&reflck->kref);
	mutex_init(&reflck->lock);

	return reflck;
}
static void vfio_pci_reflck_get(struct vfio_pci_reflck *reflck)
{
	kref_get(&reflck->kref);
}
static int vfio_pci_reflck_find(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_reflck **preflck = data;
	struct vfio_device *device;
	struct vfio_pci_device *vdev;

	device = vfio_device_get_from_dev(&pdev->dev);
	if (!device)
		return 0;

	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
		vfio_device_put(device);
		return 0;
	}

	vdev = container_of(device, struct vfio_pci_device, vdev);

	if (vdev->reflck) {
		vfio_pci_reflck_get(vdev->reflck);
		*preflck = vdev->reflck;
		vfio_device_put(device);
		return 1;
	}

	vfio_device_put(device);
	return 0;
}
static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev)
{
	bool slot = !pci_probe_reset_slot(vdev->pdev->slot);

	mutex_lock(&reflck_lock);

	if (pci_is_root_bus(vdev->pdev->bus) ||
	    vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_reflck_find,
					  &vdev->reflck, slot) <= 0)
		vdev->reflck = vfio_pci_reflck_alloc();

	mutex_unlock(&reflck_lock);

	return PTR_ERR_OR_ZERO(vdev->reflck);
}
static void vfio_pci_reflck_release(struct kref *kref)
{
	struct vfio_pci_reflck *reflck = container_of(kref,
						      struct vfio_pci_reflck,
						      kref);

	kfree(reflck);
	mutex_unlock(&reflck_lock);
}

static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck)
{
	kref_put_mutex(&reflck->kref, vfio_pci_reflck_release, &reflck_lock);
}
static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
{
	struct vfio_devices *devs = data;
	struct vfio_device *device;
	struct vfio_pci_device *vdev;

	if (devs->cur_index == devs->max_index)
		return -ENOSPC;

	device = vfio_device_get_from_dev(&pdev->dev);
	if (!device)
		return -EINVAL;

	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
		vfio_device_put(device);
		return -EBUSY;
	}

	vdev = container_of(device, struct vfio_pci_device, vdev);

	/* Fault if the device is not unused */
	if (vdev->refcnt) {
		vfio_device_put(device);
		return -EBUSY;
	}

	devs->devices[devs->cur_index++] = vdev;
	return 0;
}
static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data)
{
	struct vfio_devices *devs = data;
	struct vfio_device *device;
	struct vfio_pci_device *vdev;

	if (devs->cur_index == devs->max_index)
		return -ENOSPC;

	device = vfio_device_get_from_dev(&pdev->dev);
	if (!device)
		return -EINVAL;

	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
		vfio_device_put(device);
		return -EBUSY;
	}

	vdev = container_of(device, struct vfio_pci_device, vdev);

	/*
	 * Locking multiple devices is prone to deadlock, runaway and
	 * unwind if we hit contention.
	 */
	if (!vfio_pci_zap_and_vma_lock(vdev, true)) {
		vfio_device_put(device);
		return -EBUSY;
	}

	devs->devices[devs->cur_index++] = vdev;
	return 0;
}
/*
 * If a bus or slot reset is available for the provided device and:
 *  - All of the devices affected by that bus or slot reset are unused
 *    (!refcnt)
 *  - At least one of the affected devices is marked dirty via
 *    needs_reset (such as by lack of FLR support)
 * Then attempt to perform that bus or slot reset.  Callers are required
 * to hold vdev->reflck->lock, protecting the bus/slot reset group from
 * concurrent opens.  A vfio_device reference is acquired for each device
 * to prevent unbinds during the reset operation.
 *
 * NB: vfio-core considers a group to be viable even if some devices are
 * bound to drivers like pci-stub or pcieport.  Here we require all devices
 * to be bound to vfio_pci since that's the only way we can be sure they
 * stay put.
 */
static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
{
	struct vfio_devices devs = { .cur_index = 0 };
	int i = 0, ret = -EINVAL;
	bool slot = false;
	struct vfio_pci_device *tmp;

	if (!pci_probe_reset_slot(vdev->pdev->slot))
		slot = true;
	else if (pci_probe_reset_bus(vdev->pdev->bus))
		return;

	if (vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
					  &i, slot) || !i)
		return;

	devs.max_index = i;
	devs.devices = kcalloc(i, sizeof(struct vfio_device *), GFP_KERNEL);
	if (!devs.devices)
		return;

	if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
					  vfio_pci_get_unused_devs,
					  &devs, slot))
		goto put_devs;

	/* Does at least one need a reset? */
	for (i = 0; i < devs.cur_index; i++) {
		tmp = devs.devices[i];
		if (tmp->needs_reset) {
			ret = pci_reset_bus(vdev->pdev);
			break;
		}
	}

put_devs:
	for (i = 0; i < devs.cur_index; i++) {
		tmp = devs.devices[i];

		/*
		 * If reset was successful, affected devices no longer need
		 * a reset and we should return all the collateral devices
		 * to low power.  If not successful, we either didn't reset
		 * the bus or timed out waiting for it, so let's not touch
		 * the power state.
		 */
		if (!ret) {
			tmp->needs_reset = false;

			if (tmp != vdev && !disable_idle_d3)
				vfio_pci_set_power_state(tmp, PCI_D3hot);
		}

		vfio_device_put(&tmp->vdev);
	}

	kfree(devs.devices);
}
static void __exit vfio_pci_cleanup(void)
{
	pci_unregister_driver(&vfio_pci_driver);
	vfio_pci_uninit_perm_bits();
}
static void __init vfio_pci_fill_ids(void)
{
	char *p, *id;
	int rc;

	/* no ids passed actually */
	if (ids[0] == '\0')
		return;

	/* add ids specified in the module parameter */
	p = ids;
	while ((id = strsep(&p, ","))) {
		unsigned int vendor, device, subvendor = PCI_ANY_ID,
			subdevice = PCI_ANY_ID, class = 0, class_mask = 0;
		int fields;

		if (!strlen(id))
			continue;

		fields = sscanf(id, "%x:%x:%x:%x:%x:%x",
				&vendor, &device, &subvendor, &subdevice,
				&class, &class_mask);

		if (fields < 2) {
			pr_warn("invalid id string \"%s\"\n", id);
			continue;
		}

		rc = pci_add_dynid(&vfio_pci_driver, vendor, device,
				   subvendor, subdevice, class, class_mask, 0);
		if (rc)
			pr_warn("failed to add dynamic id [%04x:%04x[%04x:%04x]] class %#08x/%08x (%d)\n",
				vendor, device, subvendor, subdevice,
				class, class_mask, rc);
		else
			pr_info("add [%04x:%04x[%04x:%04x]] class %#08x/%08x\n",
				vendor, device, subvendor, subdevice,
				class, class_mask);
	}
}
static int __init vfio_pci_init(void)
{
	int ret;

	/* Allocate shared config space permission data used by all devices */
	ret = vfio_pci_init_perm_bits();
	if (ret)
		return ret;

	/* Register and scan for devices */
	ret = pci_register_driver(&vfio_pci_driver);
	if (ret)
		goto out_driver;

	vfio_pci_fill_ids();

	if (disable_denylist)
		pr_warn("device denylist disabled.\n");

	return 0;

out_driver:
	vfio_pci_uninit_perm_bits();
	return ret;
}
);
2445 module_exit(vfio_pci_cleanup
);
2447 MODULE_VERSION(DRIVER_VERSION
);
2448 MODULE_LICENSE("GPL v2");
2449 MODULE_AUTHOR(DRIVER_AUTHOR
);
2450 MODULE_DESCRIPTION(DRIVER_DESC
);