// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vgaarb.h>
#include <linux/nospec.h>
#include <linux/sched/mm.h>

#include <linux/vfio_pci_core.h>

#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"core driver for VFIO based PCI devices"

static bool nointxmask;
static bool disable_vga;
static bool disable_idle_d3;

static inline bool vfio_vga_disabled(void)
{
#ifdef CONFIG_VFIO_PCI_VGA
	return disable_vga;
#else
	return true;
#endif
}

/*
 * Our VGA arbiter participation is limited since we don't know anything
 * about the device itself.  However, if the device is the only VGA device
 * downstream of a bridge and VFIO VGA support is disabled, then we can
 * safely return legacy VGA IO and memory as not decoded since the user
 * has no way to get to it and routing can be disabled externally at the
 * bridge.
 */
static unsigned int vfio_pci_set_decode(struct pci_dev *pdev, bool single_vga)
{
	struct pci_dev *tmp = NULL;
	unsigned char max_busnr;
	unsigned int decodes;

	if (single_vga || !vfio_vga_disabled() || pci_is_root_bus(pdev->bus))
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
		       VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;

	max_busnr = pci_bus_max_busnr(pdev->bus);
	decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;

	while ((tmp = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, tmp)) != NULL) {
		if (tmp == pdev ||
		    pci_domain_nr(tmp->bus) != pci_domain_nr(pdev->bus) ||
		    pci_is_root_bus(tmp->bus))
			continue;

		if (tmp->bus->number >= pdev->bus->number &&
		    tmp->bus->number <= max_busnr) {
			pci_dev_put(tmp);
			decodes |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
			break;
		}
	}

	return decodes;
}
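
/*
 * Note: the decode callback above only takes effect once it is registered
 * with the VGA arbiter; vfio_pci_vga_init() below does that via
 * vga_client_register(pdev, vfio_pci_set_decode) and seeds the initial
 * decoding with vfio_pci_set_decode(pdev, false).
 */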

static void vfio_pci_probe_mmaps(struct vfio_pci_core_device *vdev)
{
	struct resource *res;
	int i;
	struct vfio_pci_dummy_resource *dummy_res;

	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
		int bar = i + PCI_STD_RESOURCES;

		res = &vdev->pdev->resource[bar];

		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
			goto no_mmap;

		if (!(res->flags & IORESOURCE_MEM))
			goto no_mmap;

		/*
		 * The PCI core shouldn't set up a resource with a
		 * type but zero size. But there may be bugs that
		 * cause us to do that.
		 */
		if (!resource_size(res))
			goto no_mmap;

		if (resource_size(res) >= PAGE_SIZE) {
			vdev->bar_mmap_supported[bar] = true;
			continue;
		}

		if (!(res->start & ~PAGE_MASK)) {
			/*
			 * Add a dummy resource to reserve the remainder
			 * of the exclusive page in case that hot-add
			 * device's bar is assigned into it.
			 */
			dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
			if (dummy_res == NULL)
				goto no_mmap;

			dummy_res->resource.name = "vfio sub-page reserved";
			dummy_res->resource.start = res->end + 1;
			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
			dummy_res->resource.flags = res->flags;
			if (request_resource(res->parent,
					     &dummy_res->resource)) {
				kfree(dummy_res);
				goto no_mmap;
			}
			dummy_res->index = bar;
			list_add(&dummy_res->res_next,
				 &vdev->dummy_resources_list);
			vdev->bar_mmap_supported[bar] = true;
			continue;
		}
		/*
		 * Here we don't handle the case when the BAR is not page
		 * aligned because we can't expect the BAR will be
		 * assigned into the same location in a page in guest
		 * when we passthrough the BAR. And it's hard to access
		 * this BAR in userspace because we have no way to get
		 * the BAR's location in a page.
		 */
no_mmap:
		vdev->bar_mmap_supported[bar] = false;
	}
}
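
/*
 * Worked example for the sub-page case above (illustrative numbers only):
 * with 4K pages, a 256-byte BAR starting at 0xfe000000 is page aligned, so a
 * dummy resource named "vfio sub-page reserved" is requested for
 * 0xfe000100..0xfe000fff, reserving the rest of that page against hot-added
 * devices and allowing the BAR to be marked mmap-capable.
 */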

struct vfio_pci_group_info;
static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set);
static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
				      struct vfio_pci_group_info *groups);

/*
 * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
 * _and_ the ability to detect when the device is asserting INTx via PCI_STATUS.
 * If a device implements the former but not the latter we would typically
 * expect broken_intx_masking be set and require an exclusive interrupt.
 * However since we do have control of the device's ability to assert INTx,
 * we can instead pretend that the device does not implement INTx, virtualizing
 * the pin register to report zero and maintaining DisINTx set on the host.
 */
static bool vfio_pci_nointx(struct pci_dev *pdev)
{
	switch (pdev->vendor) {
	case PCI_VENDOR_ID_INTEL:
		switch (pdev->device) {
		/* All i40e (XL710/X710/XXV710) 10/20/25/40GbE NICs */
		case 0x1580 ... 0x1581:
		case 0x1583 ... 0x158b:
		case 0x37d0 ... 0x37d2:
			return true;
		default:
			return false;
		}
	}

	return false;
}
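
/*
 * For the devices listed above, vfio_pci_core_enable() below logs
 * "Masking broken INTx support" and keeps DisINTx set on the host, and
 * vfio_pci_get_irq_count() reports zero INTx interrupts, so userspace never
 * sees an INTx index it could not reliably mask.
 */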

static void vfio_pci_probe_power_state(struct vfio_pci_core_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	u16 pmcsr;

	if (!pdev->pm_cap)
		return;

	pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmcsr);

	vdev->needs_pm_restore = !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET);
}

/*
 * pci_set_power_state() wrapper handling devices which perform a soft reset on
 * D3->D0 transition.  Save state prior to D0/1/2->D3, stash it on the vdev,
 * restore when returned to D0.  Saved separately from pci_saved_state for use
 * by PM capability emulation and separately from pci_dev internal saved state
 * to avoid it being overwritten and consumed around other resets.
 */
int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state)
{
	struct pci_dev *pdev = vdev->pdev;
	bool needs_restore = false, needs_save = false;
	int ret;

	if (vdev->needs_pm_restore) {
		if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) {
			pci_save_state(pdev);
			needs_save = true;
		}

		if (pdev->current_state >= PCI_D3hot && state <= PCI_D0)
			needs_restore = true;
	}

	ret = pci_set_power_state(pdev, state);

	if (!ret) {
		/* D3 might be unsupported via quirk, skip unless in D3 */
		if (needs_save && pdev->current_state >= PCI_D3hot) {
			vdev->pm_save = pci_store_saved_state(pdev);
		} else if (needs_restore) {
			pci_load_and_free_saved_state(pdev, &vdev->pm_save);
			pci_restore_state(pdev);
		}
	}

	return ret;
}
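
/*
 * In short, the wrapper above makes two decisions around the underlying
 * pci_set_power_state() call: save the config state when leaving D0/D1/D2
 * for D3hot (stashing it in vdev->pm_save once the device really reaches D3),
 * and reload/restore that stash when coming back up to D0, so a
 * soft-resetting device returns with the state the user last saw.
 */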

int vfio_pci_core_enable(struct vfio_pci_core_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	int ret;
	u16 cmd;
	u8 msix_pos;

	vfio_pci_set_power_state(vdev, PCI_D0);

	/* Don't allow our initial saved state to include busmaster */
	pci_clear_master(pdev);

	ret = pci_enable_device(pdev);
	if (ret)
		return ret;

	/* If reset fails because of the device lock, fail this path entirely */
	ret = pci_try_reset_function(pdev);
	if (ret == -EAGAIN) {
		pci_disable_device(pdev);
		return ret;
	}

	vdev->reset_works = !ret;
	pci_save_state(pdev);
	vdev->pci_saved_state = pci_store_saved_state(pdev);
	if (!vdev->pci_saved_state)
		pci_dbg(pdev, "%s: Couldn't store saved state\n", __func__);

	if (likely(!nointxmask)) {
		if (vfio_pci_nointx(pdev)) {
			pci_info(pdev, "Masking broken INTx support\n");
			vdev->nointx = true;
			pci_intx(pdev, 0);
		} else
			vdev->pci_2_3 = pci_intx_mask_supported(pdev);
	}

	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
	if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
		cmd &= ~PCI_COMMAND_INTX_DISABLE;
		pci_write_config_word(pdev, PCI_COMMAND, cmd);
	}

	ret = vfio_config_init(vdev);
	if (ret) {
		kfree(vdev->pci_saved_state);
		vdev->pci_saved_state = NULL;
		pci_disable_device(pdev);
		return ret;
	}

	msix_pos = pdev->msix_cap;
	if (msix_pos) {
		u16 flags;
		u32 table;

		pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
		pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);

		vdev->msix_bar = table & PCI_MSIX_TABLE_BIR;
		vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET;
		vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
	} else
		vdev->msix_bar = 0xFF;

	if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev))
		vdev->has_vga = true;

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_pci_core_enable);
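
/*
 * A minimal sketch of how a thin vfio-pci style driver is expected to use the
 * enable path (the function name my_open_device() is hypothetical, not part
 * of this file):
 *
 *	static int my_open_device(struct vfio_device *core_vdev)
 *	{
 *		struct vfio_pci_core_device *vdev =
 *			container_of(core_vdev, struct vfio_pci_core_device, vdev);
 *		int ret;
 *
 *		ret = vfio_pci_core_enable(vdev);
 *		if (ret)
 *			return ret;
 *		vfio_pci_core_finish_enable(vdev);
 *		return 0;
 *	}
 *
 * The matching close_device callback calls vfio_pci_core_close_device(),
 * which in turn calls vfio_pci_core_disable() below.
 */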

void vfio_pci_core_disable(struct vfio_pci_core_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	struct vfio_pci_dummy_resource *dummy_res, *tmp;
	struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp;
	int i, bar;

	/* For needs_reset */
	lockdep_assert_held(&vdev->vdev.dev_set->lock);

	/* Stop the device from further DMA */
	pci_clear_master(pdev);

	vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
				VFIO_IRQ_SET_ACTION_TRIGGER,
				vdev->irq_type, 0, 0, NULL);

	/* Device closed, don't need mutex here */
	list_for_each_entry_safe(ioeventfd, ioeventfd_tmp,
				 &vdev->ioeventfds_list, next) {
		vfio_virqfd_disable(&ioeventfd->virqfd);
		list_del(&ioeventfd->next);
		kfree(ioeventfd);
	}
	vdev->ioeventfds_nr = 0;

	vdev->virq_disabled = false;

	for (i = 0; i < vdev->num_regions; i++)
		vdev->region[i].ops->release(vdev, &vdev->region[i]);

	vdev->num_regions = 0;
	kfree(vdev->region);
	vdev->region = NULL; /* don't krealloc a freed pointer */

	vfio_config_free(vdev);

	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
		bar = i + PCI_STD_RESOURCES;
		if (!vdev->barmap[bar])
			continue;
		pci_iounmap(pdev, vdev->barmap[bar]);
		pci_release_selected_regions(pdev, 1 << bar);
		vdev->barmap[bar] = NULL;
	}

	list_for_each_entry_safe(dummy_res, tmp,
				 &vdev->dummy_resources_list, res_next) {
		list_del(&dummy_res->res_next);
		release_resource(&dummy_res->resource);
		kfree(dummy_res);
	}

	vdev->needs_reset = true;

	/*
	 * If we have saved state, restore it.  If we can reset the device,
	 * even better.  Resetting with current state seems better than
	 * nothing, but saving and restoring current state without reset
	 * is just busy work.
	 */
	if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) {
		pci_info(pdev, "%s: Couldn't reload saved state\n", __func__);

		if (!vdev->reset_works)
			goto out;

		pci_save_state(pdev);
	}

	/*
	 * Disable INTx and MSI, presumably to avoid spurious interrupts
	 * during reset.  Stolen from pci_reset_function()
	 */
	pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);

	/*
	 * Try to get the locks ourselves to prevent a deadlock. The
	 * success of this is dependent on being able to lock the device,
	 * which is not always possible.
	 * We can not use the "try" reset interface here, which will
	 * overwrite the previously restored configuration information.
	 */
	if (vdev->reset_works && pci_dev_trylock(pdev)) {
		if (!__pci_reset_function_locked(pdev))
			vdev->needs_reset = false;
		pci_dev_unlock(pdev);
	}

out:
	pci_restore_state(pdev);

	pci_disable_device(pdev);

	if (!vfio_pci_dev_set_try_reset(vdev->vdev.dev_set) && !disable_idle_d3)
		vfio_pci_set_power_state(vdev, PCI_D3hot);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_disable);

static struct vfio_pci_core_device *get_pf_vdev(struct vfio_pci_core_device *vdev)
{
	struct pci_dev *physfn = pci_physfn(vdev->pdev);
	struct vfio_device *pf_dev;

	if (!vdev->pdev->is_virtfn)
		return NULL;

	pf_dev = vfio_device_get_from_dev(&physfn->dev);
	if (!pf_dev)
		return NULL;

	if (pci_dev_driver(physfn) != pci_dev_driver(vdev->pdev)) {
		vfio_device_put(pf_dev);
		return NULL;
	}

	return container_of(pf_dev, struct vfio_pci_core_device, vdev);
}

static void vfio_pci_vf_token_user_add(struct vfio_pci_core_device *vdev, int val)
{
	struct vfio_pci_core_device *pf_vdev = get_pf_vdev(vdev);

	if (!pf_vdev)
		return;

	mutex_lock(&pf_vdev->vf_token->lock);
	pf_vdev->vf_token->users += val;
	WARN_ON(pf_vdev->vf_token->users < 0);
	mutex_unlock(&pf_vdev->vf_token->lock);

	vfio_device_put(&pf_vdev->vdev);
}

void vfio_pci_core_close_device(struct vfio_device *core_vdev)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);

	vfio_pci_vf_token_user_add(vdev, -1);
	vfio_spapr_pci_eeh_release(vdev->pdev);
	vfio_pci_core_disable(vdev);

	mutex_lock(&vdev->igate);
	if (vdev->err_trigger) {
		eventfd_ctx_put(vdev->err_trigger);
		vdev->err_trigger = NULL;
	}
	if (vdev->req_trigger) {
		eventfd_ctx_put(vdev->req_trigger);
		vdev->req_trigger = NULL;
	}
	mutex_unlock(&vdev->igate);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_close_device);

void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev)
{
	vfio_pci_probe_mmaps(vdev);
	vfio_spapr_pci_eeh_open(vdev->pdev);
	vfio_pci_vf_token_user_add(vdev, 1);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_finish_enable);

static int vfio_pci_get_irq_count(struct vfio_pci_core_device *vdev, int irq_type)
{
	if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
		u8 pin;

		if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) ||
		    vdev->nointx || vdev->pdev->is_virtfn)
			return 0;

		pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);

		return pin ? 1 : 0;
	} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
		u8 pos;
		u16 flags;

		pos = vdev->pdev->msi_cap;
		if (pos) {
			pci_read_config_word(vdev->pdev,
					     pos + PCI_MSI_FLAGS, &flags);
			return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1);
		}
	} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
		u8 pos;
		u16 flags;

		pos = vdev->pdev->msix_cap;
		if (pos) {
			pci_read_config_word(vdev->pdev,
					     pos + PCI_MSIX_FLAGS, &flags);

			return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
		}
	} else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) {
		if (pci_is_pcie(vdev->pdev))
			return 1;
	} else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) {
		return 1;
	}

	return 0;
}
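
/*
 * Example of the MSI/MSI-X count decoding above: the MSI Message Control
 * "multiple message capable" field (PCI_MSI_FLAGS_QMASK, bits 3:1) encodes
 * log2 of the vector count, so a field value of 3 yields 1 << 3 = 8 vectors;
 * the MSI-X table size field (PCI_MSIX_FLAGS_QSIZE) is N-1 encoded, so a
 * value of 7 yields 8 table entries.
 */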

static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
{
	(*(int *)data)++;
	return 0;
}

struct vfio_pci_fill_info {
	int max;
	int cur;
	struct vfio_pci_dependent_device *devices;
};

static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_fill_info *fill = data;
	struct iommu_group *iommu_group;

	if (fill->cur == fill->max)
		return -EAGAIN; /* Something changed, try again */

	iommu_group = iommu_group_get(&pdev->dev);
	if (!iommu_group)
		return -EPERM; /* Cannot reset non-isolated devices */

	fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
	fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
	fill->devices[fill->cur].bus = pdev->bus->number;
	fill->devices[fill->cur].devfn = pdev->devfn;
	fill->cur++;
	iommu_group_put(iommu_group);
	return 0;
}

struct vfio_pci_group_info {
	int count;
	struct vfio_group **groups;
};

static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot)
{
	for (; pdev; pdev = pdev->bus->self)
		if (pdev->bus == slot->bus)
			return (pdev->slot == slot);

	return false;
}

struct vfio_pci_walk_info {
	int (*fn)(struct pci_dev *, void *data);
	void *data;
	struct pci_dev *pdev;
	bool slot;
	int ret;
};

static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_walk_info *walk = data;

	if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot))
		walk->ret = walk->fn(pdev, walk->data);

	return walk->ret;
}

static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
					 int (*fn)(struct pci_dev *,
						   void *data), void *data,
					 bool slot)
{
	struct vfio_pci_walk_info walk = {
		.fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0,
	};

	pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk);

	return walk.ret;
}

static int msix_mmappable_cap(struct vfio_pci_core_device *vdev,
			      struct vfio_info_cap *caps)
{
	struct vfio_info_cap_header header = {
		.id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE,
		.version = 1
	};

	return vfio_info_add_capability(caps, &header, sizeof(header));
}

int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev,
				 unsigned int type, unsigned int subtype,
				 const struct vfio_pci_regops *ops,
				 size_t size, u32 flags, void *data)
{
	struct vfio_pci_region *region;

	region = krealloc(vdev->region,
			  (vdev->num_regions + 1) * sizeof(*region),
			  GFP_KERNEL);
	if (!region)
		return -ENOMEM;

	vdev->region = region;
	vdev->region[vdev->num_regions].type = type;
	vdev->region[vdev->num_regions].subtype = subtype;
	vdev->region[vdev->num_regions].ops = ops;
	vdev->region[vdev->num_regions].size = size;
	vdev->region[vdev->num_regions].flags = flags;
	vdev->region[vdev->num_regions].data = data;

	vdev->num_regions++;

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_pci_register_dev_region);
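
/*
 * A hedged sketch of how a driver built on this core might register an extra
 * device-specific region (the ops structure and the my_* names below are
 * hypothetical; only the vfio_pci_register_dev_region() call reflects this
 * file):
 *
 *	static const struct vfio_pci_regops my_regops = {
 *		.rw		= my_region_rw,
 *		.release	= my_region_release,
 *	};
 *
 *	ret = vfio_pci_register_dev_region(vdev, my_type, my_subtype,
 *					   &my_regops, my_size,
 *					   VFIO_REGION_INFO_FLAG_READ |
 *					   VFIO_REGION_INFO_FLAG_WRITE,
 *					   my_data);
 *
 * The region then appears to userspace as index VFIO_PCI_NUM_REGIONS + n and
 * is described through the VFIO_REGION_INFO_CAP_TYPE capability in
 * VFIO_DEVICE_GET_REGION_INFO below.
 */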

long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
			 unsigned long arg)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);
	unsigned long minsz;

	if (cmd == VFIO_DEVICE_GET_INFO) {
		struct vfio_device_info info;
		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
		unsigned long capsz;
		int ret;

		minsz = offsetofend(struct vfio_device_info, num_irqs);

		/* For backward compatibility, cannot require this */
		capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		if (info.argsz >= capsz) {
			minsz = capsz;
			info.cap_offset = 0;
		}

		info.flags = VFIO_DEVICE_FLAGS_PCI;

		if (vdev->reset_works)
			info.flags |= VFIO_DEVICE_FLAGS_RESET;

		info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
		info.num_irqs = VFIO_PCI_NUM_IRQS;

		ret = vfio_pci_info_zdev_add_caps(vdev, &caps);
		if (ret && ret != -ENODEV) {
			pci_warn(vdev->pdev, "Failed to setup zPCI info capabilities\n");
			return ret;
		}

		if (caps.size) {
			info.flags |= VFIO_DEVICE_FLAGS_CAPS;
			if (info.argsz < sizeof(info) + caps.size) {
				info.argsz = sizeof(info) + caps.size;
			} else {
				vfio_info_cap_shift(&caps, sizeof(info));
				if (copy_to_user((void __user *)arg +
						 sizeof(info), caps.buf,
						 caps.size)) {
					kfree(caps.buf);
					return -EFAULT;
				}
				info.cap_offset = sizeof(info);
			}

			kfree(caps.buf);
		}

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
		struct pci_dev *pdev = vdev->pdev;
		struct vfio_region_info info;
		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
		int i, ret;

		minsz = offsetofend(struct vfio_region_info, offset);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		switch (info.index) {
		case VFIO_PCI_CONFIG_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = pdev->cfg_size;
			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			break;
		case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = pci_resource_len(pdev, info.index);
			if (!info.size) {
				info.flags = 0;
				break;
			}

			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			if (vdev->bar_mmap_supported[info.index]) {
				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
				if (info.index == vdev->msix_bar) {
					ret = msix_mmappable_cap(vdev, &caps);
					if (ret)
						return ret;
				}
			}

			break;
		case VFIO_PCI_ROM_REGION_INDEX:
		{
			void __iomem *io;
			size_t size;
			u16 cmd;

			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.flags = 0;

			/* Report the BAR size, not the ROM size */
			info.size = pci_resource_len(pdev, info.index);
			if (!info.size) {
				/* Shadow ROMs appear as PCI option ROMs */
				if (pdev->resource[PCI_ROM_RESOURCE].flags &
							IORESOURCE_ROM_SHADOW)
					info.size = 0x20000;
				else
					break;
			}

			/*
			 * Is it really there?  Enable memory decode for
			 * implicit access in pci_map_rom().
			 */
			cmd = vfio_pci_memory_lock_and_enable(vdev);
			io = pci_map_rom(pdev, &size);
			if (io) {
				info.flags = VFIO_REGION_INFO_FLAG_READ;
				pci_unmap_rom(pdev, io);
			} else {
				info.size = 0;
			}
			vfio_pci_memory_unlock_and_restore(vdev, cmd);

			break;
		}
		case VFIO_PCI_VGA_REGION_INDEX:
			if (!vdev->has_vga)
				return -EINVAL;

			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = 0xc0000;
			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;

			break;
		default:
		{
			struct vfio_region_info_cap_type cap_type = {
					.header.id = VFIO_REGION_INFO_CAP_TYPE,
					.header.version = 1 };

			if (info.index >=
			    VFIO_PCI_NUM_REGIONS + vdev->num_regions)
				return -EINVAL;
			info.index = array_index_nospec(info.index,
							VFIO_PCI_NUM_REGIONS +
							vdev->num_regions);

			i = info.index - VFIO_PCI_NUM_REGIONS;

			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = vdev->region[i].size;
			info.flags = vdev->region[i].flags;

			cap_type.type = vdev->region[i].type;
			cap_type.subtype = vdev->region[i].subtype;

			ret = vfio_info_add_capability(&caps, &cap_type.header,
						       sizeof(cap_type));
			if (ret)
				return ret;

			if (vdev->region[i].ops->add_capability) {
				ret = vdev->region[i].ops->add_capability(vdev,
						&vdev->region[i], &caps);
				if (ret)
					return ret;
			}
		}
		}

		if (caps.size) {
			info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
			if (info.argsz < sizeof(info) + caps.size) {
				info.argsz = sizeof(info) + caps.size;
				info.cap_offset = 0;
			} else {
				vfio_info_cap_shift(&caps, sizeof(info));
				if (copy_to_user((void __user *)arg +
						 sizeof(info), caps.buf,
						 caps.size)) {
					kfree(caps.buf);
					return -EFAULT;
				}
				info.cap_offset = sizeof(info);
			}

			kfree(caps.buf);
		}

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
		struct vfio_irq_info info;

		minsz = offsetofend(struct vfio_irq_info, count);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
			return -EINVAL;

		switch (info.index) {
		case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
		case VFIO_PCI_REQ_IRQ_INDEX:
			break;
		case VFIO_PCI_ERR_IRQ_INDEX:
			if (pci_is_pcie(vdev->pdev))
				break;
			fallthrough;
		default:
			return -EINVAL;
		}

		info.flags = VFIO_IRQ_INFO_EVENTFD;

		info.count = vfio_pci_get_irq_count(vdev, info.index);

		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
				       VFIO_IRQ_INFO_AUTOMASKED);
		else
			info.flags |= VFIO_IRQ_INFO_NORESIZE;

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_DEVICE_SET_IRQS) {
		struct vfio_irq_set hdr;
		u8 *data = NULL;
		int max, ret = 0;
		size_t data_size = 0;

		minsz = offsetofend(struct vfio_irq_set, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))
			return -EFAULT;

		max = vfio_pci_get_irq_count(vdev, hdr.index);

		ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
						VFIO_PCI_NUM_IRQS, &data_size);
		if (ret)
			return ret;

		if (data_size) {
			data = memdup_user((void __user *)(arg + minsz),
					   data_size);
			if (IS_ERR(data))
				return PTR_ERR(data);
		}

		mutex_lock(&vdev->igate);

		ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
					      hdr.start, hdr.count, data);

		mutex_unlock(&vdev->igate);
		kfree(data);

		return ret;

	} else if (cmd == VFIO_DEVICE_RESET) {
		int ret;

		if (!vdev->reset_works)
			return -EINVAL;

		vfio_pci_zap_and_down_write_memory_lock(vdev);
		ret = pci_try_reset_function(vdev->pdev);
		up_write(&vdev->memory_lock);

		return ret;

	} else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) {
		struct vfio_pci_hot_reset_info hdr;
		struct vfio_pci_fill_info fill = { 0 };
		struct vfio_pci_dependent_device *devices = NULL;
		bool slot = false;
		int ret = 0;

		minsz = offsetofend(struct vfio_pci_hot_reset_info, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))
			return -EFAULT;

		if (hdr.argsz < minsz)
			return -EINVAL;

		/* Can we do a slot or bus reset or neither? */
		if (!pci_probe_reset_slot(vdev->pdev->slot))
			slot = true;
		else if (pci_probe_reset_bus(vdev->pdev->bus))
			return -ENODEV;

		/* How many devices are affected? */
		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_count_devs,
						    &fill.max, slot);
		if (ret)
			return ret;

		WARN_ON(!fill.max); /* Should always be at least one */

		/*
		 * If there's enough space, fill it now, otherwise return
		 * -ENOSPC and the number of devices affected.
		 */
		if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) {
			ret = -ENOSPC;
			hdr.count = fill.max;
			goto reset_info_exit;
		}

		devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL);
		if (!devices)
			return -ENOMEM;

		fill.devices = devices;

		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_fill_devs,
						    &fill, slot);

		/*
		 * If a device was removed between counting and filling,
		 * we may come up short of fill.max.  If a device was
		 * added, we'll have a return of -EAGAIN above.
		 */
		if (!ret)
			hdr.count = fill.cur;

reset_info_exit:
		if (copy_to_user((void __user *)arg, &hdr, minsz))
			ret = -EFAULT;

		if (!ret) {
			if (copy_to_user((void __user *)(arg + minsz), devices,
					 hdr.count * sizeof(*devices)))
				ret = -EFAULT;
		}

		kfree(devices);
		return ret;

	} else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) {
		struct vfio_pci_hot_reset hdr;
		int32_t *group_fds;
		struct vfio_group **groups;
		struct vfio_pci_group_info info;
		bool slot = false;
		int group_idx, count = 0, ret = 0;

		minsz = offsetofend(struct vfio_pci_hot_reset, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))
			return -EFAULT;

		if (hdr.argsz < minsz || hdr.flags)
			return -EINVAL;

		/* Can we do a slot or bus reset or neither? */
		if (!pci_probe_reset_slot(vdev->pdev->slot))
			slot = true;
		else if (pci_probe_reset_bus(vdev->pdev->bus))
			return -ENODEV;

		/*
		 * We can't let userspace give us an arbitrarily large
		 * buffer to copy, so verify how many we think there
		 * could be.  Note groups can have multiple devices so
		 * one group per device is the max.
		 */
		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_count_devs,
						    &count, slot);
		if (ret)
			return ret;

		/* Somewhere between 1 and count is OK */
		if (!hdr.count || hdr.count > count)
			return -EINVAL;

		group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL);
		groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL);
		if (!group_fds || !groups) {
			kfree(group_fds);
			kfree(groups);
			return -ENOMEM;
		}

		if (copy_from_user(group_fds, (void __user *)(arg + minsz),
				   hdr.count * sizeof(*group_fds))) {
			kfree(group_fds);
			kfree(groups);
			return -EFAULT;
		}

		/*
		 * For each group_fd, get the group through the vfio external
		 * user interface and store the group and iommu ID.  This
		 * ensures the group is held across the reset.
		 */
		for (group_idx = 0; group_idx < hdr.count; group_idx++) {
			struct vfio_group *group;
			struct fd f = fdget(group_fds[group_idx]);
			if (!f.file) {
				ret = -EBADF;
				break;
			}

			group = vfio_group_get_external_user(f.file);
			fdput(f);
			if (IS_ERR(group)) {
				ret = PTR_ERR(group);
				break;
			}

			groups[group_idx] = group;
		}

		kfree(group_fds);

		/* release reference to groups on error */
		if (ret)
			goto hot_reset_release;

		info.count = hdr.count;
		info.groups = groups;

		ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info);

hot_reset_release:
		for (group_idx--; group_idx >= 0; group_idx--)
			vfio_group_put_external_user(groups[group_idx]);

		kfree(groups);
		return ret;
	} else if (cmd == VFIO_DEVICE_IOEVENTFD) {
		struct vfio_device_ioeventfd ioeventfd;
		int count;

		minsz = offsetofend(struct vfio_device_ioeventfd, fd);

		if (copy_from_user(&ioeventfd, (void __user *)arg, minsz))
			return -EFAULT;

		if (ioeventfd.argsz < minsz)
			return -EINVAL;

		if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK)
			return -EINVAL;

		count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK;

		if (hweight8(count) != 1 || ioeventfd.fd < -1)
			return -EINVAL;

		return vfio_pci_ioeventfd(vdev, ioeventfd.offset,
					  ioeventfd.data, count, ioeventfd.fd);
	} else if (cmd == VFIO_DEVICE_FEATURE) {
		struct vfio_device_feature feature;
		uuid_t uuid;

		minsz = offsetofend(struct vfio_device_feature, flags);

		if (copy_from_user(&feature, (void __user *)arg, minsz))
			return -EFAULT;

		if (feature.argsz < minsz)
			return -EINVAL;

		/* Check unknown flags */
		if (feature.flags & ~(VFIO_DEVICE_FEATURE_MASK |
				      VFIO_DEVICE_FEATURE_SET |
				      VFIO_DEVICE_FEATURE_GET |
				      VFIO_DEVICE_FEATURE_PROBE))
			return -EINVAL;

		/* GET & SET are mutually exclusive except with PROBE */
		if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
		    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
		    (feature.flags & VFIO_DEVICE_FEATURE_GET))
			return -EINVAL;

		switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
		case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
			if (!vdev->vf_token)
				return -ENOTTY;

			/*
			 * We do not support GET of the VF Token UUID as this
			 * could expose the token of the previous device user.
			 */
			if (feature.flags & VFIO_DEVICE_FEATURE_GET)
				return -EINVAL;

			if (feature.flags & VFIO_DEVICE_FEATURE_PROBE)
				return 0;

			/* Don't SET unless told to do so */
			if (!(feature.flags & VFIO_DEVICE_FEATURE_SET))
				return -EINVAL;

			if (feature.argsz < minsz + sizeof(uuid))
				return -EINVAL;

			if (copy_from_user(&uuid, (void __user *)(arg + minsz),
					   sizeof(uuid)))
				return -EFAULT;

			mutex_lock(&vdev->vf_token->lock);
			uuid_copy(&vdev->vf_token->uuid, &uuid);
			mutex_unlock(&vdev->vf_token->lock);

			return 0;
		default:
			return -ENOTTY;
		}
	}

	return -ENOTTY;
}
EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl);
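
/*
 * Userspace view of the GET_REGION_INFO path above, as a minimal sketch
 * (assumes an already-open VFIO device fd; error handling omitted):
 *
 *	struct vfio_region_info info = {
 *		.argsz = sizeof(info),
 *		.index = VFIO_PCI_BAR0_REGION_INDEX,
 *	};
 *
 *	ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info);
 *
 * info.size, info.offset and info.flags then describe BAR0; if argsz comes
 * back larger than the buffer provided, the caller re-issues the ioctl with
 * a bigger buffer to fetch the capability chain at info.cap_offset.
 */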

static ssize_t vfio_pci_rw(struct vfio_pci_core_device *vdev, char __user *buf,
			   size_t count, loff_t *ppos, bool iswrite)
{
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);

	if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
		return -EINVAL;

	switch (index) {
	case VFIO_PCI_CONFIG_REGION_INDEX:
		return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);

	case VFIO_PCI_ROM_REGION_INDEX:
		if (iswrite)
			return -EINVAL;
		return vfio_pci_bar_rw(vdev, buf, count, ppos, false);

	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
		return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite);

	case VFIO_PCI_VGA_REGION_INDEX:
		return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite);

	default:
		index -= VFIO_PCI_NUM_REGIONS;
		return vdev->region[index].ops->rw(vdev, buf,
						   count, ppos, iswrite);
	}

	return -EINVAL;
}

ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
			   size_t count, loff_t *ppos)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);

	if (!count)
		return 0;

	return vfio_pci_rw(vdev, buf, count, ppos, false);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_read);

ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf,
			    size_t count, loff_t *ppos)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);

	if (!count)
		return 0;

	return vfio_pci_rw(vdev, (char __user *)buf, count, ppos, true);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_write);

/* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */
static int vfio_pci_zap_and_vma_lock(struct vfio_pci_core_device *vdev, bool try)
{
	struct vfio_pci_mmap_vma *mmap_vma, *tmp;

	/*
	 * Lock ordering:
	 * vma_lock is nested under mmap_lock for vm_ops callback paths.
	 * The memory_lock semaphore is used by both code paths calling
	 * into this function to zap vmas and the vm_ops.fault callback
	 * to protect the memory enable state of the device.
	 *
	 * When zapping vmas we need to maintain the mmap_lock => vma_lock
	 * ordering, which requires using vma_lock to walk vma_list to
	 * acquire an mm, then dropping vma_lock to get the mmap_lock and
	 * reacquiring vma_lock.  This logic is derived from similar
	 * requirements in uverbs_user_mmap_disassociate().
	 *
	 * mmap_lock must always be the top-level lock when it is taken.
	 * Therefore we can only hold the memory_lock write lock when
	 * vma_list is empty, as we'd need to take mmap_lock to clear
	 * entries.  vma_list can only be guaranteed empty when holding
	 * vma_lock, thus memory_lock is nested under vma_lock.
	 *
	 * This enables the vm_ops.fault callback to acquire vma_lock,
	 * followed by memory_lock read lock, while already holding
	 * mmap_lock without risk of deadlock.
	 */
	while (1) {
		struct mm_struct *mm = NULL;

		if (try) {
			if (!mutex_trylock(&vdev->vma_lock))
				return 0;
		} else {
			mutex_lock(&vdev->vma_lock);
		}
		while (!list_empty(&vdev->vma_list)) {
			mmap_vma = list_first_entry(&vdev->vma_list,
						    struct vfio_pci_mmap_vma,
						    vma_next);
			mm = mmap_vma->vma->vm_mm;
			if (mmget_not_zero(mm))
				break;

			list_del(&mmap_vma->vma_next);
			kfree(mmap_vma);
			mm = NULL;
		}
		if (!mm)
			return 1;
		mutex_unlock(&vdev->vma_lock);

		if (try) {
			if (!mmap_read_trylock(mm)) {
				mmput(mm);
				return 0;
			}
		} else {
			mmap_read_lock(mm);
		}
		if (try) {
			if (!mutex_trylock(&vdev->vma_lock)) {
				mmap_read_unlock(mm);
				mmput(mm);
				return 0;
			}
		} else {
			mutex_lock(&vdev->vma_lock);
		}
		list_for_each_entry_safe(mmap_vma, tmp,
					 &vdev->vma_list, vma_next) {
			struct vm_area_struct *vma = mmap_vma->vma;

			if (vma->vm_mm != mm)
				continue;

			list_del(&mmap_vma->vma_next);
			kfree(mmap_vma);

			zap_vma_ptes(vma, vma->vm_start,
				     vma->vm_end - vma->vm_start);
		}
		mutex_unlock(&vdev->vma_lock);
		mmap_read_unlock(mm);
		mmput(mm);
	}
}

void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev)
{
	vfio_pci_zap_and_vma_lock(vdev, false);
	down_write(&vdev->memory_lock);
	mutex_unlock(&vdev->vma_lock);
}

u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev)
{
	u16 cmd;

	down_write(&vdev->memory_lock);
	pci_read_config_word(vdev->pdev, PCI_COMMAND, &cmd);
	if (!(cmd & PCI_COMMAND_MEMORY))
		pci_write_config_word(vdev->pdev, PCI_COMMAND,
				      cmd | PCI_COMMAND_MEMORY);

	return cmd;
}

void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 cmd)
{
	pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd);
	up_write(&vdev->memory_lock);
}
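
/*
 * These two helpers are meant to be used as a pair: the u16 returned by
 * vfio_pci_memory_lock_and_enable() is the original PCI_COMMAND value and is
 * handed back to vfio_pci_memory_unlock_and_restore() once the access is
 * done, exactly as the ROM probing code in vfio_pci_core_ioctl() does around
 * pci_map_rom()/pci_unmap_rom().  Similarly,
 * vfio_pci_zap_and_down_write_memory_lock() above leaves memory_lock held for
 * write and is paired with up_write(&vdev->memory_lock), as in the
 * VFIO_DEVICE_RESET handling.
 */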

/* Caller holds vma_lock */
static int __vfio_pci_add_vma(struct vfio_pci_core_device *vdev,
			      struct vm_area_struct *vma)
{
	struct vfio_pci_mmap_vma *mmap_vma;

	mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL);
	if (!mmap_vma)
		return -ENOMEM;

	mmap_vma->vma = vma;
	list_add(&mmap_vma->vma_next, &vdev->vma_list);

	return 0;
}

/*
 * Zap mmaps on open so that we can fault them in on access and therefore
 * our vma_list only tracks mappings accessed since last zap.
 */
static void vfio_pci_mmap_open(struct vm_area_struct *vma)
{
	zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
}

static void vfio_pci_mmap_close(struct vm_area_struct *vma)
{
	struct vfio_pci_core_device *vdev = vma->vm_private_data;
	struct vfio_pci_mmap_vma *mmap_vma;

	mutex_lock(&vdev->vma_lock);
	list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
		if (mmap_vma->vma == vma) {
			list_del(&mmap_vma->vma_next);
			kfree(mmap_vma);
			break;
		}
	}
	mutex_unlock(&vdev->vma_lock);
}

static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct vfio_pci_core_device *vdev = vma->vm_private_data;
	struct vfio_pci_mmap_vma *mmap_vma;
	vm_fault_t ret = VM_FAULT_NOPAGE;

	mutex_lock(&vdev->vma_lock);
	down_read(&vdev->memory_lock);

	if (!__vfio_pci_memory_enabled(vdev)) {
		ret = VM_FAULT_SIGBUS;
		goto up_out;
	}

	/*
	 * We populate the whole vma on fault, so we need to test whether
	 * the vma has already been mapped, such as for concurrent faults
	 * to the same vma.  io_remap_pfn_range() will trigger a BUG_ON if
	 * we ask it to fill the same range again.
	 */
	list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
		if (mmap_vma->vma == vma)
			goto up_out;
	}

	if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
			       vma->vm_end - vma->vm_start,
			       vma->vm_page_prot)) {
		ret = VM_FAULT_SIGBUS;
		zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
		goto up_out;
	}

	if (__vfio_pci_add_vma(vdev, vma)) {
		ret = VM_FAULT_OOM;
		zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
	}

up_out:
	up_read(&vdev->memory_lock);
	mutex_unlock(&vdev->vma_lock);
	return ret;
}

static const struct vm_operations_struct vfio_pci_mmap_ops = {
	.open = vfio_pci_mmap_open,
	.close = vfio_pci_mmap_close,
	.fault = vfio_pci_mmap_fault,
};

int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);
	struct pci_dev *pdev = vdev->pdev;
	unsigned int index;
	u64 phys_len, req_len, pgoff, req_start;
	int ret;

	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);

	if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
		return -EINVAL;
	if (vma->vm_end < vma->vm_start)
		return -EINVAL;
	if ((vma->vm_flags & VM_SHARED) == 0)
		return -EINVAL;
	if (index >= VFIO_PCI_NUM_REGIONS) {
		int regnum = index - VFIO_PCI_NUM_REGIONS;
		struct vfio_pci_region *region = vdev->region + regnum;

		if (region->ops && region->ops->mmap &&
		    (region->flags & VFIO_REGION_INFO_FLAG_MMAP))
			return region->ops->mmap(vdev, region, vma);
		return -EINVAL;
	}
	if (index >= VFIO_PCI_ROM_REGION_INDEX)
		return -EINVAL;
	if (!vdev->bar_mmap_supported[index])
		return -EINVAL;

	phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
	req_len = vma->vm_end - vma->vm_start;
	pgoff = vma->vm_pgoff &
		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
	req_start = pgoff << PAGE_SHIFT;

	if (req_start + req_len > phys_len)
		return -EINVAL;

	/*
	 * Even though we don't make use of the barmap for the mmap,
	 * we need to request the region and the barmap tracks that.
	 */
	if (!vdev->barmap[index]) {
		ret = pci_request_selected_regions(pdev,
						   1 << index, "vfio-pci");
		if (ret)
			return ret;

		vdev->barmap[index] = pci_iomap(pdev, index, 0);
		if (!vdev->barmap[index]) {
			pci_release_selected_regions(pdev, 1 << index);
			return -ENOMEM;
		}
	}

	vma->vm_private_data = vdev;
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;

	/*
	 * See remap_pfn_range(), called from vfio_pci_fault() but we can't
	 * change vm_flags within the fault handler.  Set them now.
	 */
	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
	vma->vm_ops = &vfio_pci_mmap_ops;

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_pci_core_mmap);
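
/*
 * Offset convention used above: userspace mmaps region i at file offset
 * VFIO_PCI_INDEX_TO_OFFSET(i), i.e. the region index is carried in the upper
 * bits of the offset (VFIO_PCI_OFFSET_SHIFT).  vfio_pci_core_mmap() recovers
 * the index from vma->vm_pgoff while the remaining low bits select the page
 * offset within the BAR.
 */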

void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);
	struct pci_dev *pdev = vdev->pdev;

	mutex_lock(&vdev->igate);

	if (vdev->req_trigger) {
		pci_notice_ratelimited(pdev,
				       "Relaying device request to user (#%u)\n",
				       count);
		eventfd_signal(vdev->req_trigger, 1);
	} else if (count == 0) {
		pci_warn(pdev,
			"No device request channel registered, blocked until released by user\n");
	}

	mutex_unlock(&vdev->igate);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_request);

static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev,
				      bool vf_token, uuid_t *uuid)
{
	/*
	 * There's always some degree of trust or collaboration between SR-IOV
	 * PF and VFs, even if just that the PF hosts the SR-IOV capability and
	 * can disrupt VFs with a reset, but often the PF has more explicit
	 * access to deny service to the VF or access data passed through the
	 * VF.  We therefore require an opt-in via a shared VF token (UUID) to
	 * represent this trust.  This both prevents that a VF driver might
	 * assume the PF driver is a trusted, in-kernel driver, and also that
	 * a PF driver might be replaced with a rogue driver, unknown to in-use
	 * VF drivers.
	 *
	 * Therefore when presented with a VF, if the PF is a vfio device and
	 * it is bound to the vfio-pci driver, the user needs to provide a VF
	 * token to access the device, in the form of appending a vf_token to
	 * the device name, for example:
	 *
	 * "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3"
	 *
	 * When presented with a PF which has VFs in use, the user must also
	 * provide the current VF token to prove collaboration with existing
	 * VF users.  If VFs are not in use, the VF token provided for the PF
	 * device will act to set the VF token.
	 *
	 * If the VF token is provided but unused, an error is generated.
	 */
	if (!vdev->pdev->is_virtfn && !vdev->vf_token && !vf_token)
		return 0; /* No VF token provided or required */

	if (vdev->pdev->is_virtfn) {
		struct vfio_pci_core_device *pf_vdev = get_pf_vdev(vdev);
		bool match;

		if (!pf_vdev) {
			if (!vf_token)
				return 0; /* PF is not vfio-pci, no VF token */

			pci_info_ratelimited(vdev->pdev,
				"VF token incorrectly provided, PF not bound to vfio-pci\n");
			return -EINVAL;
		}

		if (!vf_token) {
			vfio_device_put(&pf_vdev->vdev);
			pci_info_ratelimited(vdev->pdev,
				"VF token required to access device\n");
			return -EACCES;
		}

		mutex_lock(&pf_vdev->vf_token->lock);
		match = uuid_equal(uuid, &pf_vdev->vf_token->uuid);
		mutex_unlock(&pf_vdev->vf_token->lock);

		vfio_device_put(&pf_vdev->vdev);

		if (!match) {
			pci_info_ratelimited(vdev->pdev,
				"Incorrect VF token provided for device\n");
			return -EACCES;
		}
	} else if (vdev->vf_token) {
		mutex_lock(&vdev->vf_token->lock);
		if (vdev->vf_token->users) {
			if (!vf_token) {
				mutex_unlock(&vdev->vf_token->lock);
				pci_info_ratelimited(vdev->pdev,
					"VF token required to access device\n");
				return -EACCES;
			}

			if (!uuid_equal(uuid, &vdev->vf_token->uuid)) {
				mutex_unlock(&vdev->vf_token->lock);
				pci_info_ratelimited(vdev->pdev,
					"Incorrect VF token provided for device\n");
				return -EACCES;
			}
		} else if (vf_token) {
			uuid_copy(&vdev->vf_token->uuid, uuid);
		}

		mutex_unlock(&vdev->vf_token->lock);
	} else if (vf_token) {
		pci_info_ratelimited(vdev->pdev,
			"VF token incorrectly provided, not a PF or VF\n");
		return -EINVAL;
	}

	return 0;
}

#define VF_TOKEN_ARG "vf_token="

int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);
	bool vf_token = false;
	uuid_t uuid;
	int ret;

	if (strncmp(pci_name(vdev->pdev), buf, strlen(pci_name(vdev->pdev))))
		return 0; /* No match */

	if (strlen(buf) > strlen(pci_name(vdev->pdev))) {
		buf += strlen(pci_name(vdev->pdev));

		if (*buf != ' ')
			return 0; /* No match: non-whitespace after name */

		while (*buf) {
			if (*buf == ' ') {
				buf++;
				continue;
			}

			if (!vf_token && !strncmp(buf, VF_TOKEN_ARG,
						  strlen(VF_TOKEN_ARG))) {
				buf += strlen(VF_TOKEN_ARG);

				if (strlen(buf) < UUID_STRING_LEN)
					return -EINVAL;

				ret = uuid_parse(buf, &uuid);
				if (ret)
					return ret;

				vf_token = true;
				buf += UUID_STRING_LEN;
			} else {
				/* Unknown/duplicate option */
				return -EINVAL;
			}
		}
	}

	ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid);
	if (ret)
		return ret;

	return 1; /* Match */
}
EXPORT_SYMBOL_GPL(vfio_pci_core_match);
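
/*
 * Example strings accepted by vfio_pci_core_match(): a bare device name such
 * as "0000:04:10.0" matches with no VF token, while
 * "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3" additionally
 * passes the UUID through vfio_pci_validate_vf_token() as described in the
 * comment above that function.
 */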

static int vfio_pci_bus_notifier(struct notifier_block *nb,
				 unsigned long action, void *data)
{
	struct vfio_pci_core_device *vdev = container_of(nb,
						    struct vfio_pci_core_device, nb);
	struct device *dev = data;
	struct pci_dev *pdev = to_pci_dev(dev);
	struct pci_dev *physfn = pci_physfn(pdev);

	if (action == BUS_NOTIFY_ADD_DEVICE &&
	    pdev->is_virtfn && physfn == vdev->pdev) {
		pci_info(vdev->pdev, "Captured SR-IOV VF %s driver_override\n",
			 pci_name(pdev));
		pdev->driver_override = kasprintf(GFP_KERNEL, "%s",
						  vdev->vdev.ops->name);
	} else if (action == BUS_NOTIFY_BOUND_DRIVER &&
		   pdev->is_virtfn && physfn == vdev->pdev) {
		struct pci_driver *drv = pci_dev_driver(pdev);

		if (drv && drv != pci_dev_driver(vdev->pdev))
			pci_warn(vdev->pdev,
				 "VF %s bound to driver %s while PF bound to driver %s\n",
				 pci_name(pdev), drv->name,
				 pci_dev_driver(vdev->pdev)->name);
	}

	return 0;
}

static int vfio_pci_vf_init(struct vfio_pci_core_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	int ret;

	if (!pdev->is_physfn)
		return 0;

	vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL);
	if (!vdev->vf_token)
		return -ENOMEM;

	mutex_init(&vdev->vf_token->lock);
	uuid_gen(&vdev->vf_token->uuid);

	vdev->nb.notifier_call = vfio_pci_bus_notifier;
	ret = bus_register_notifier(&pci_bus_type, &vdev->nb);
	if (ret) {
		kfree(vdev->vf_token);
		return ret;
	}
	return 0;
}

static void vfio_pci_vf_uninit(struct vfio_pci_core_device *vdev)
{
	if (!vdev->vf_token)
		return;

	bus_unregister_notifier(&pci_bus_type, &vdev->nb);
	WARN_ON(vdev->vf_token->users);
	mutex_destroy(&vdev->vf_token->lock);
	kfree(vdev->vf_token);
}

static int vfio_pci_vga_init(struct vfio_pci_core_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	int ret;

	if (!vfio_pci_is_vga(pdev))
		return 0;

	ret = vga_client_register(pdev, vfio_pci_set_decode);
	if (ret)
		return ret;
	vga_set_legacy_decoding(pdev, vfio_pci_set_decode(pdev, false));
	return 0;
}

static void vfio_pci_vga_uninit(struct vfio_pci_core_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;

	if (!vfio_pci_is_vga(pdev))
		return;
	vga_client_unregister(pdev);
	vga_set_legacy_decoding(pdev, VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
					      VGA_RSRC_LEGACY_IO |
					      VGA_RSRC_LEGACY_MEM);
}

void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev,
			       struct pci_dev *pdev,
			       const struct vfio_device_ops *vfio_pci_ops)
{
	vfio_init_group_dev(&vdev->vdev, &pdev->dev, vfio_pci_ops);
	vdev->pdev = pdev;
	vdev->irq_type = VFIO_PCI_NUM_IRQS;
	mutex_init(&vdev->igate);
	spin_lock_init(&vdev->irqlock);
	mutex_init(&vdev->ioeventfds_lock);
	INIT_LIST_HEAD(&vdev->dummy_resources_list);
	INIT_LIST_HEAD(&vdev->ioeventfds_list);
	mutex_init(&vdev->vma_lock);
	INIT_LIST_HEAD(&vdev->vma_list);
	init_rwsem(&vdev->memory_lock);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_init_device);

void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev)
{
	mutex_destroy(&vdev->igate);
	mutex_destroy(&vdev->ioeventfds_lock);
	mutex_destroy(&vdev->vma_lock);
	vfio_uninit_group_dev(&vdev->vdev);
	kfree(vdev->region);
	kfree(vdev->pm_save);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_uninit_device);

int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	struct iommu_group *group;
	int ret;

	if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
		return -EINVAL;

	/*
	 * Prevent binding to PFs with VFs enabled, the VFs might be in use
	 * by the host or other users.  We cannot capture the VFs if they
	 * already exist, nor can we track VF users.  Disabling SR-IOV here
	 * would initiate removing the VFs, which would unbind the driver,
	 * which is prone to blocking if that VF is also in use by vfio-pci.
	 * Just reject these PFs and let the user sort it out.
	 */
	if (pci_num_vf(pdev)) {
		pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n");
		return -EBUSY;
	}

	group = vfio_iommu_group_get(&pdev->dev);
	if (!group)
		return -EINVAL;

	if (pci_is_root_bus(pdev->bus)) {
		ret = vfio_assign_device_set(&vdev->vdev, vdev);
	} else if (!pci_probe_reset_slot(pdev->slot)) {
		ret = vfio_assign_device_set(&vdev->vdev, pdev->slot);
	} else {
		/*
		 * If there is no slot reset support for this device, the whole
		 * bus needs to be grouped together to support bus-wide resets.
		 */
		ret = vfio_assign_device_set(&vdev->vdev, pdev->bus);
	}

	if (ret)
		goto out_group_put;
	ret = vfio_pci_vf_init(vdev);
	if (ret)
		goto out_group_put;
	ret = vfio_pci_vga_init(vdev);
	if (ret)
		goto out_vf;

	vfio_pci_probe_power_state(vdev);

	if (!disable_idle_d3) {
		/*
		 * pci-core sets the device power state to an unknown value at
		 * bootup and after being removed from a driver.  The only
		 * transition it allows from this unknown state is to D0, which
		 * typically happens when a driver calls pci_enable_device().
		 * We're not ready to enable the device yet, but we do want to
		 * be able to get to D3.  Therefore first do a D0 transition
		 * before going to D3.
		 */
		vfio_pci_set_power_state(vdev, PCI_D0);
		vfio_pci_set_power_state(vdev, PCI_D3hot);
	}

	ret = vfio_register_group_dev(&vdev->vdev);
	if (ret)
		goto out_power;
	return 0;

out_power:
	if (!disable_idle_d3)
		vfio_pci_set_power_state(vdev, PCI_D0);
out_vf:
	vfio_pci_vf_uninit(vdev);
out_group_put:
	vfio_iommu_group_put(group, &pdev->dev);
	return ret;
}
EXPORT_SYMBOL_GPL(vfio_pci_core_register_device);
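
/*
 * A hedged sketch of the expected probe() flow in a driver layered on this
 * core (the names my_probe and my_driver_ops are hypothetical; the
 * vfio_pci_core_* calls are the ones defined in this file):
 *
 *	static int my_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 *	{
 *		struct vfio_pci_core_device *vdev;
 *		int ret;
 *
 *		vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
 *		if (!vdev)
 *			return -ENOMEM;
 *
 *		vfio_pci_core_init_device(vdev, pdev, &my_driver_ops);
 *		ret = vfio_pci_core_register_device(vdev);
 *		if (ret) {
 *			vfio_pci_core_uninit_device(vdev);
 *			kfree(vdev);
 *			return ret;
 *		}
 *		dev_set_drvdata(&pdev->dev, vdev);
 *		return 0;
 *	}
 *
 * The remove path mirrors this with vfio_pci_core_unregister_device() and
 * vfio_pci_core_uninit_device() below.
 */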

void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;

	pci_disable_sriov(pdev);

	vfio_unregister_group_dev(&vdev->vdev);

	vfio_pci_vf_uninit(vdev);
	vfio_pci_vga_uninit(vdev);

	vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);

	if (!disable_idle_d3)
		vfio_pci_set_power_state(vdev, PCI_D0);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_unregister_device);

static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
						  pci_channel_state_t state)
{
	struct vfio_pci_core_device *vdev;
	struct vfio_device *device;

	device = vfio_device_get_from_dev(&pdev->dev);
	if (!device)
		return PCI_ERS_RESULT_DISCONNECT;

	vdev = container_of(device, struct vfio_pci_core_device, vdev);

	mutex_lock(&vdev->igate);

	if (vdev->err_trigger)
		eventfd_signal(vdev->err_trigger, 1);

	mutex_unlock(&vdev->igate);

	vfio_device_put(device);

	return PCI_ERS_RESULT_CAN_RECOVER;
}

int vfio_pci_core_sriov_configure(struct pci_dev *pdev, int nr_virtfn)
{
	struct vfio_device *device;
	int ret = 0;

	device = vfio_device_get_from_dev(&pdev->dev);
	if (!device)
		return -ENODEV;

	if (nr_virtfn == 0)
		pci_disable_sriov(pdev);
	else
		ret = pci_enable_sriov(pdev, nr_virtfn);

	vfio_device_put(device);

	return ret < 0 ? ret : nr_virtfn;
}
EXPORT_SYMBOL_GPL(vfio_pci_core_sriov_configure);

const struct pci_error_handlers vfio_pci_core_err_handlers = {
	.error_detected = vfio_pci_aer_err_detected,
};
EXPORT_SYMBOL_GPL(vfio_pci_core_err_handlers);

static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev,
			       struct vfio_pci_group_info *groups)
{
	unsigned int i;

	for (i = 0; i < groups->count; i++)
		if (groups->groups[i] == vdev->vdev.group)
			return true;
	return false;
}
*pdev
, void *data
)
1965 struct vfio_device_set
*dev_set
= data
;
1966 struct vfio_device
*cur
;
1968 list_for_each_entry(cur
, &dev_set
->device_list
, dev_set_list
)
1969 if (cur
->dev
== &pdev
->dev
)

/*
 * vfio-core considers a group to be viable and will create a vfio_device even
 * if some devices are bound to drivers like pci-stub or pcieport.  Here we
 * require all PCI devices to be inside our dev_set since that ensures they
 * stay put and that every driver controlling the device can co-ordinate with
 * the reset.
 *
 * Returns the pci_dev to pass to pci_reset_bus() if every PCI device to be
 * reset is inside the dev_set, and pci_reset_bus() can succeed.  NULL otherwise.
 */
static struct pci_dev *
vfio_pci_dev_set_resettable(struct vfio_device_set *dev_set)
{
	struct pci_dev *pdev;

	lockdep_assert_held(&dev_set->lock);

	/*
	 * By definition all PCI devices in the dev_set share the same PCI
	 * reset, so any pci_dev will have the same outcomes for
	 * pci_probe_reset_*() and pci_reset_bus().
	 */
	pdev = list_first_entry(&dev_set->device_list,
				struct vfio_pci_core_device,
				vdev.dev_set_list)->pdev;

	/* pci_reset_bus() is supported */
	if (pci_probe_reset_slot(pdev->slot) && pci_probe_reset_bus(pdev->bus))
		return NULL;

	if (vfio_pci_for_each_slot_or_bus(pdev, vfio_pci_is_device_in_set,
					  dev_set,
					  !pci_probe_reset_slot(pdev->slot)))
		return NULL;
	return pdev;
}

/*
 * We need to get memory_lock for each device, but devices can share mmap_lock,
 * therefore we need to zap and hold the vma_lock for each device, and only then
 * get each memory_lock.
 */
static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
				      struct vfio_pci_group_info *groups)
{
	struct vfio_pci_core_device *cur_mem;
	struct vfio_pci_core_device *cur_vma;
	struct vfio_pci_core_device *cur;
	struct pci_dev *pdev;
	bool is_mem = true;
	int ret;

	mutex_lock(&dev_set->lock);
	cur_mem = list_first_entry(&dev_set->device_list,
				   struct vfio_pci_core_device,
				   vdev.dev_set_list);

	pdev = vfio_pci_dev_set_resettable(dev_set);
	if (!pdev) {
		ret = -EINVAL;
		goto err_unlock;
	}

	list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) {
		/*
		 * Test whether all the affected devices are contained by the
		 * set of groups provided by the user.
		 */
		if (!vfio_dev_in_groups(cur_vma, groups)) {
			ret = -EINVAL;
			goto err_undo;
		}

		/*
		 * Locking multiple devices is prone to deadlock, runaway and
		 * unwind if we hit contention.
		 */
		if (!vfio_pci_zap_and_vma_lock(cur_vma, true)) {
			ret = -EBUSY;
			goto err_undo;
		}
	}

	list_for_each_entry(cur_mem, &dev_set->device_list, vdev.dev_set_list) {
		if (!down_write_trylock(&cur_mem->memory_lock)) {
			ret = -EBUSY;
			goto err_undo;
		}
		mutex_unlock(&cur_mem->vma_lock);
	}

	ret = pci_reset_bus(pdev);

err_undo:
	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
		if (cur == cur_mem)
			is_mem = false;
		if (cur == cur_vma)
			break;
		if (is_mem)
			up_write(&cur->memory_lock);
		else
			mutex_unlock(&cur->vma_lock);
	}
err_unlock:
	mutex_unlock(&dev_set->lock);
	return ret;
}

static bool vfio_pci_dev_set_needs_reset(struct vfio_device_set *dev_set)
{
	struct vfio_pci_core_device *cur;
	bool needs_reset = false;

	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
		/* No VFIO device in the set can have an open device FD */
		if (cur->vdev.open_count)
			return false;
		needs_reset |= cur->needs_reset;
	}
	return needs_reset;
}

/*
 * If a bus or slot reset is available for the provided dev_set and:
 *  - All of the devices affected by that bus or slot reset are unused
 *  - At least one of the affected devices is marked dirty via
 *    needs_reset (such as by lack of FLR support)
 * Then attempt to perform that bus or slot reset.
 * Returns true if the dev_set was reset.
 */
static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set)
{
	struct vfio_pci_core_device *cur;
	struct pci_dev *pdev;
	int ret;

	if (!vfio_pci_dev_set_needs_reset(dev_set))
		return false;

	pdev = vfio_pci_dev_set_resettable(dev_set);
	if (!pdev)
		return false;

	ret = pci_reset_bus(pdev);
	if (ret)
		return false;

	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
		cur->needs_reset = false;
		if (!disable_idle_d3)
			vfio_pci_set_power_state(cur, PCI_D3hot);
	}
	return true;
}

void vfio_pci_core_set_params(bool is_nointxmask, bool is_disable_vga,
			      bool is_disable_idle_d3)
{
	nointxmask = is_nointxmask;
	disable_vga = is_disable_vga;
	disable_idle_d3 = is_disable_idle_d3;
}
EXPORT_SYMBOL_GPL(vfio_pci_core_set_params);
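
/*
 * The module parameters themselves live in the thin driver, not here; a
 * driver such as vfio-pci is expected to expose its own nointxmask /
 * disable_vga / disable_idle_d3 options and forward them once at init time,
 * roughly (caller-side parameter names are illustrative):
 *
 *	vfio_pci_core_set_params(my_nointxmask, my_disable_vga,
 *				 my_disable_idle_d3);
 */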

static void vfio_pci_core_cleanup(void)
{
	vfio_pci_uninit_perm_bits();
}

static int __init vfio_pci_core_init(void)
{
	/* Allocate shared config space permission data used by all devices */
	return vfio_pci_init_perm_bits();
}

module_init(vfio_pci_core_init);
module_exit(vfio_pci_core_cleanup);

MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);