/*
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */
#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>

#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"
static struct vfio {
	struct class			*class;
	struct list_head		iommu_drivers_list;
	struct mutex			iommu_drivers_lock;
	struct list_head		group_list;
	struct idr			group_idr;
	struct mutex			group_lock;
	struct cdev			group_cdev;
	dev_t				group_devt;
	wait_queue_head_t		release_q;
} vfio;

struct vfio_iommu_driver {
	const struct vfio_iommu_driver_ops	*ops;
	struct list_head			vfio_next;
};

struct vfio_container {
	struct kref			kref;
	struct list_head		group_list;
	struct rw_semaphore		group_lock;
	struct vfio_iommu_driver	*iommu_driver;
	void				*iommu_data;
	bool				noiommu;
};

struct vfio_unbound_dev {
	struct device			*dev;
	struct list_head		unbound_next;
};

struct vfio_group {
	struct kref			kref;
	int				minor;
	atomic_t			container_users;
	struct iommu_group		*iommu_group;
	struct vfio_container		*container;
	struct list_head		device_list;
	struct mutex			device_lock;
	struct notifier_block		nb;
	struct list_head		vfio_next;
	struct list_head		container_next;
	struct list_head		unbound_list;
	struct mutex			unbound_lock;
	atomic_t			opened;
	wait_queue_head_t		container_q;
	bool				noiommu;
	struct kvm			*kvm;
	struct blocking_notifier_head	notifier;
};

struct vfio_device {
	struct kref			kref;
	struct device			*dev;
	const struct vfio_device_ops	*ops;
	struct vfio_group		*group;
	struct list_head		group_next;
	void				*device_data;
};
#ifdef CONFIG_VFIO_NOIOMMU
static bool noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif
/*
 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
 * and remove functions, any use cases other than acquiring the first
 * reference for the purpose of calling vfio_add_group_dev() or removing
 * that symmetric reference after vfio_del_group_dev() should use the raw
 * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
 * removes the device from the dummy group and cannot be nested.
 */
struct iommu_group *vfio_iommu_group_get(struct device *dev)
{
	struct iommu_group *group;
	int __maybe_unused ret;

	group = iommu_group_get(dev);

#ifdef CONFIG_VFIO_NOIOMMU
	/*
	 * With noiommu enabled, an IOMMU group will be created for a device
	 * that doesn't already have one and doesn't have an iommu_ops on its
	 * bus.  We set iommudata simply to be able to identify these groups
	 * as special use and for reclamation later.
	 */
	if (group || !noiommu || iommu_present(dev->bus))
		return group;

	group = iommu_group_alloc();
	if (IS_ERR(group))
		return NULL;

	iommu_group_set_name(group, "vfio-noiommu");
	iommu_group_set_iommudata(group, &noiommu, NULL);
	ret = iommu_group_add_device(group, dev);
	if (ret) {
		iommu_group_put(group);
		return NULL;
	}

	/*
	 * Where to taint?  At this point we've added an IOMMU group for a
	 * device that is not backed by iommu_ops, therefore any iommu_
	 * callback using iommu_ops can legitimately Oops.  So, while we may
	 * be about to give a DMA capable device to a user without IOMMU
	 * protection, which is clearly taint-worthy, let's go ahead and do
	 * it here.
	 */
	add_taint(TAINT_USER, LOCKDEP_STILL_OK);
	dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
#endif

	return group;
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_get);

void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
{
#ifdef CONFIG_VFIO_NOIOMMU
	if (iommu_group_get_iommudata(group) == &noiommu)
		iommu_group_remove_device(dev);
#endif

	iommu_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
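/*
 * Example (illustrative sketch, not compiled as part of this file): a VFIO
 * bus driver is expected to pair these helpers with vfio_add_group_dev() and
 * vfio_del_group_dev() in its probe/remove paths.  The names my_probe(),
 * my_remove(), my_ops and my_data below are assumptions for illustration.
 *
 *	static int my_probe(struct device *dev)
 *	{
 *		struct iommu_group *group;
 *		int ret;
 *
 *		group = vfio_iommu_group_get(dev);
 *		if (!group)
 *			return -EINVAL;
 *
 *		ret = vfio_add_group_dev(dev, &my_ops, my_data);
 *		if (ret)
 *			vfio_iommu_group_put(group, dev);
 *
 *		return ret;
 *	}
 *
 *	static int my_remove(struct device *dev)
 *	{
 *		void *data = vfio_del_group_dev(dev);
 *
 *		vfio_iommu_group_put(dev->iommu_group, dev);
 *		kfree(data);
 *		return 0;
 *	}
 */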
#ifdef CONFIG_VFIO_NOIOMMU
static void *vfio_noiommu_open(unsigned long arg)
{
	if (arg != VFIO_NOIOMMU_IOMMU)
		return ERR_PTR(-EINVAL);
	if (!capable(CAP_SYS_RAWIO))
		return ERR_PTR(-EPERM);

	return NULL;
}

static void vfio_noiommu_release(void *iommu_data)
{
}

static long vfio_noiommu_ioctl(void *iommu_data,
			       unsigned int cmd, unsigned long arg)
{
	if (cmd == VFIO_CHECK_EXTENSION)
		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;

	return -ENOTTY;
}

static int vfio_noiommu_attach_group(void *iommu_data,
				     struct iommu_group *iommu_group)
{
	return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
}

static void vfio_noiommu_detach_group(void *iommu_data,
				      struct iommu_group *iommu_group)
{
}

static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
	.name = "vfio-noiommu",
	.owner = THIS_MODULE,
	.open = vfio_noiommu_open,
	.release = vfio_noiommu_release,
	.ioctl = vfio_noiommu_ioctl,
	.attach_group = vfio_noiommu_attach_group,
	.detach_group = vfio_noiommu_detach_group,
};
#endif

/**
 * IOMMU driver registration
 */
int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver, *tmp;

	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
	if (!driver)
		return -ENOMEM;

	driver->ops = ops;

	mutex_lock(&vfio.iommu_drivers_lock);

	/* Check for duplicates */
	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
		if (tmp->ops == ops) {
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return -EINVAL;
		}
	}

	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);

	mutex_unlock(&vfio.iommu_drivers_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);

void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver;

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		if (driver->ops == ops) {
			list_del(&driver->vfio_next);
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return;
		}
	}
	mutex_unlock(&vfio.iommu_drivers_lock);
}
EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
/**
 * Group minor allocation/free - both called with vfio.group_lock held
 */
static int vfio_alloc_group_minor(struct vfio_group *group)
{
	return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
}

static void vfio_free_group_minor(int minor)
{
	idr_remove(&vfio.group_idr, minor);
}

static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data);
static void vfio_group_get(struct vfio_group *group);

/**
 * Container objects - containers are created when /dev/vfio/vfio is
 * opened, but their lifecycle extends until the last user is done, so
 * it's freed via kref.  Must support container/group/device being
 * closed in any order.
 */
static void vfio_container_get(struct vfio_container *container)
{
	kref_get(&container->kref);
}

static void vfio_container_release(struct kref *kref)
{
	struct vfio_container *container;
	container = container_of(kref, struct vfio_container, kref);

	kfree(container);
}

static void vfio_container_put(struct vfio_container *container)
{
	kref_put(&container->kref, vfio_container_release);
}

static void vfio_group_unlock_and_free(struct vfio_group *group)
{
	mutex_unlock(&vfio.group_lock);
	/*
	 * Unregister outside of lock.  A spurious callback is harmless now
	 * that the group is no longer in vfio.group_list.
	 */
	iommu_group_unregister_notifier(group->iommu_group, &group->nb);
	kfree(group);
}
/**
 * Group objects - create, release, get, put, search
 */
static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
{
	struct vfio_group *group, *tmp;
	struct device *dev;
	int ret, minor;

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		return ERR_PTR(-ENOMEM);

	kref_init(&group->kref);
	INIT_LIST_HEAD(&group->device_list);
	mutex_init(&group->device_lock);
	INIT_LIST_HEAD(&group->unbound_list);
	mutex_init(&group->unbound_lock);
	atomic_set(&group->container_users, 0);
	atomic_set(&group->opened, 0);
	init_waitqueue_head(&group->container_q);
	group->iommu_group = iommu_group;
#ifdef CONFIG_VFIO_NOIOMMU
	group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
#endif
	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);

	group->nb.notifier_call = vfio_iommu_group_notifier;

	/*
	 * blocking notifiers acquire a rwsem around registering and hold
	 * it around callback.  Therefore, need to register outside of
	 * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
	 * do anything unless it can find the group in vfio.group_list, so
	 * no harm in registering early.
	 */
	ret = iommu_group_register_notifier(iommu_group, &group->nb);
	if (ret) {
		kfree(group);
		return ERR_PTR(ret);
	}

	mutex_lock(&vfio.group_lock);

	/* Did we race creating this group? */
	list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
		if (tmp->iommu_group == iommu_group) {
			vfio_group_get(tmp);
			vfio_group_unlock_and_free(group);
			return tmp;
		}
	}

	minor = vfio_alloc_group_minor(group);
	if (minor < 0) {
		vfio_group_unlock_and_free(group);
		return ERR_PTR(minor);
	}

	dev = device_create(vfio.class, NULL,
			    MKDEV(MAJOR(vfio.group_devt), minor),
			    group, "%s%d", group->noiommu ? "noiommu-" : "",
			    iommu_group_id(iommu_group));
	if (IS_ERR(dev)) {
		vfio_free_group_minor(minor);
		vfio_group_unlock_and_free(group);
		return ERR_CAST(dev);
	}

	group->minor = minor;

	list_add(&group->vfio_next, &vfio.group_list);

	mutex_unlock(&vfio.group_lock);

	return group;
}
/* called with vfio.group_lock held */
static void vfio_group_release(struct kref *kref)
{
	struct vfio_group *group = container_of(kref, struct vfio_group, kref);
	struct vfio_unbound_dev *unbound, *tmp;
	struct iommu_group *iommu_group = group->iommu_group;

	WARN_ON(!list_empty(&group->device_list));
	WARN_ON(group->notifier.head);

	list_for_each_entry_safe(unbound, tmp,
				 &group->unbound_list, unbound_next) {
		list_del(&unbound->unbound_next);
		kfree(unbound);
	}

	device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
	list_del(&group->vfio_next);
	vfio_free_group_minor(group->minor);
	vfio_group_unlock_and_free(group);
	iommu_group_put(iommu_group);
}

static void vfio_group_put(struct vfio_group *group)
{
	kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
}

struct vfio_group_put_work {
	struct work_struct work;
	struct vfio_group *group;
};

static void vfio_group_put_bg(struct work_struct *work)
{
	struct vfio_group_put_work *do_work;

	do_work = container_of(work, struct vfio_group_put_work, work);

	vfio_group_put(do_work->group);
	kfree(do_work);
}

static void vfio_group_schedule_put(struct vfio_group *group)
{
	struct vfio_group_put_work *do_work;

	do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
	if (WARN_ON(!do_work))
		return;

	INIT_WORK(&do_work->work, vfio_group_put_bg);
	do_work->group = group;
	schedule_work(&do_work->work);
}
/* Assume group_lock or group reference is held */
static void vfio_group_get(struct vfio_group *group)
{
	kref_get(&group->kref);
}

/*
 * Not really a try as we will sleep for mutex, but we need to make
 * sure the group pointer is valid under lock and get a reference.
 */
static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
{
	struct vfio_group *target = group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group == target) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}

static
struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group->iommu_group == iommu_group) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}

static struct vfio_group *vfio_group_get_from_minor(int minor)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	group = idr_find(&vfio.group_idr, minor);
	if (!group) {
		mutex_unlock(&vfio.group_lock);
		return NULL;
	}
	vfio_group_get(group);
	mutex_unlock(&vfio.group_lock);

	return group;
}

static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;

	iommu_group = iommu_group_get(dev);
	if (!iommu_group)
		return NULL;

	group = vfio_group_get_from_iommu(iommu_group);
	iommu_group_put(iommu_group);

	return group;
}
/**
 * Device objects - create, release, get, put, search
 */
static
struct vfio_device *vfio_group_create_device(struct vfio_group *group,
					     struct device *dev,
					     const struct vfio_device_ops *ops,
					     void *device_data)
{
	struct vfio_device *device;

	device = kzalloc(sizeof(*device), GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	kref_init(&device->kref);
	device->dev = dev;
	device->group = group;
	device->ops = ops;
	device->device_data = device_data;
	dev_set_drvdata(dev, device);

	/* No need to get group_lock, caller has group reference */
	vfio_group_get(group);

	mutex_lock(&group->device_lock);
	list_add(&device->group_next, &group->device_list);
	mutex_unlock(&group->device_lock);

	return device;
}

static void vfio_device_release(struct kref *kref)
{
	struct vfio_device *device = container_of(kref,
						  struct vfio_device, kref);
	struct vfio_group *group = device->group;

	list_del(&device->group_next);
	mutex_unlock(&group->device_lock);

	dev_set_drvdata(device->dev, NULL);

	kfree(device);

	/* vfio_del_group_dev may be waiting for this device */
	wake_up(&vfio.release_q);
}

/* Device reference always implies a group reference */
void vfio_device_put(struct vfio_device *device)
{
	struct vfio_group *group = device->group;
	kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_device_put);

static void vfio_device_get(struct vfio_device *device)
{
	vfio_group_get(device->group);
	kref_get(&device->kref);
}

static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
						 struct device *dev)
{
	struct vfio_device *device;

	mutex_lock(&group->device_lock);
	list_for_each_entry(device, &group->device_list, group_next) {
		if (device->dev == dev) {
			vfio_device_get(device);
			mutex_unlock(&group->device_lock);
			return device;
		}
	}
	mutex_unlock(&group->device_lock);
	return NULL;
}
/*
 * Some drivers, like pci-stub, are only used to prevent other drivers from
 * claiming a device and are therefore perfectly legitimate for a user owned
 * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
 * of the device, but it does prevent the user from having direct access to
 * the device, which is useful in some circumstances.
 *
 * We also assume that we can include PCI interconnect devices, ie. bridges.
 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
 * then all of the downstream devices will be part of the same IOMMU group as
 * the bridge.  Thus, if placing the bridge into the user owned IOVA space
 * breaks anything, it only does so for user owned devices downstream.  Note
 * that error notification via MSI can be affected for platforms that handle
 * MSI within the same IOVA space as DMA.
 */
static const char * const vfio_driver_whitelist[] = { "pci-stub" };

static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
{
	int i;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
			return true;
	}

	for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) {
		if (!strcmp(drv->name, vfio_driver_whitelist[i]))
			return true;
	}

	return false;
}

/*
 * A vfio group is viable for use by userspace if all devices are in
 * one of the following states:
 *  - driver-less
 *  - bound to a vfio driver
 *  - bound to a whitelisted driver
 *  - a PCI interconnect device
 *
 * We use two methods to determine whether a device is bound to a vfio
 * driver.  The first is to test whether the device exists in the vfio
 * group.  The second is to test if the device exists on the group
 * unbound_list, indicating it's in the middle of transitioning from
 * a vfio driver to driver-less.
 */
static int vfio_dev_viable(struct device *dev, void *data)
{
	struct vfio_group *group = data;
	struct vfio_device *device;
	struct device_driver *drv = READ_ONCE(dev->driver);
	struct vfio_unbound_dev *unbound;
	int ret = -EINVAL;

	mutex_lock(&group->unbound_lock);
	list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
		if (dev == unbound->dev) {
			ret = 0;
			break;
		}
	}
	mutex_unlock(&group->unbound_lock);

	if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
		return 0;

	device = vfio_group_get_device(group, dev);
	if (device) {
		vfio_device_put(device);
		return 0;
	}

	return ret;
}
/**
 * Async device support
 */
static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
{
	struct vfio_device *device;

	/* Do we already know about it?  We shouldn't */
	device = vfio_group_get_device(group, dev);
	if (WARN_ON_ONCE(device)) {
		vfio_device_put(device);
		return 0;
	}

	/* Nothing to do for idle groups */
	if (!atomic_read(&group->container_users))
		return 0;

	/* TODO Prevent device auto probing */
	WARN(1, "Device %s added to live group %d!\n", dev_name(dev),
	     iommu_group_id(group->iommu_group));

	return 0;
}

static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
{
	/* We don't care what happens when the group isn't in use */
	if (!atomic_read(&group->container_users))
		return 0;

	return vfio_dev_viable(dev, group);
}

static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data)
{
	struct vfio_group *group = container_of(nb, struct vfio_group, nb);
	struct device *dev = data;
	struct vfio_unbound_dev *unbound;

	/*
	 * Need to go through a group_lock lookup to get a reference or we
	 * risk racing a group being removed.  Ignore spurious notifies.
	 */
	group = vfio_group_try_get(group);
	if (!group)
		return NOTIFY_OK;

	switch (action) {
	case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
		vfio_group_nb_add_dev(group, dev);
		break;
	case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
		/*
		 * Nothing to do here.  If the device is in use, then the
		 * vfio sub-driver should block the remove callback until
		 * it is unused.  If the device is unused or attached to a
		 * stub driver, then it should be released and we don't
		 * care that it will be going away.
		 */
		break;
	case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
		pr_debug("%s: Device %s, group %d binding to driver\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group));
		break;
	case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
		pr_debug("%s: Device %s, group %d bound to driver %s\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group), dev->driver->name);
		BUG_ON(vfio_group_nb_verify(group, dev));
		break;
	case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
		pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group), dev->driver->name);
		break;
	case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
		pr_debug("%s: Device %s, group %d unbound from driver\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group));
		/*
		 * XXX An unbound device in a live group is ok, but we'd
		 * really like to avoid the above BUG_ON by preventing other
		 * drivers from binding to it.  Once that occurs, we have to
		 * stop the system to maintain isolation.  At a minimum, we'd
		 * want a toggle to disable driver auto probe for this device.
		 */

		mutex_lock(&group->unbound_lock);
		list_for_each_entry(unbound,
				    &group->unbound_list, unbound_next) {
			if (dev == unbound->dev) {
				list_del(&unbound->unbound_next);
				kfree(unbound);
				break;
			}
		}
		mutex_unlock(&group->unbound_lock);
		break;
	}

	/*
	 * If we're the last reference to the group, the group will be
	 * released, which includes unregistering the iommu group notifier.
	 * We hold a read-lock on that notifier list, unregistering needs
	 * a write-lock... deadlock.  Release our reference asynchronously
	 * to avoid that situation.
	 */
	vfio_group_schedule_put(group);
	return NOTIFY_OK;
}
/**
 * VFIO driver API
 */
int vfio_add_group_dev(struct device *dev,
		       const struct vfio_device_ops *ops, void *device_data)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;
	struct vfio_device *device;

	iommu_group = iommu_group_get(dev);
	if (!iommu_group)
		return -EINVAL;

	group = vfio_group_get_from_iommu(iommu_group);
	if (!group) {
		group = vfio_create_group(iommu_group);
		if (IS_ERR(group)) {
			iommu_group_put(iommu_group);
			return PTR_ERR(group);
		}
	} else {
		/*
		 * A found vfio_group already holds a reference to the
		 * iommu_group.  A created vfio_group keeps the reference.
		 */
		iommu_group_put(iommu_group);
	}

	device = vfio_group_get_device(group, dev);
	if (device) {
		WARN(1, "Device %s already exists on group %d\n",
		     dev_name(dev), iommu_group_id(iommu_group));
		vfio_device_put(device);
		vfio_group_put(group);
		return -EBUSY;
	}

	device = vfio_group_create_device(group, dev, ops, device_data);
	if (IS_ERR(device)) {
		vfio_group_put(group);
		return PTR_ERR(device);
	}

	/*
	 * Drop all but the vfio_device reference.  The vfio_device holds
	 * a reference to the vfio_group, which holds a reference to the
	 * iommu_group.
	 */
	vfio_group_put(group);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_add_group_dev);
/**
 * Get a reference to the vfio_device for a device.  Even if the
 * caller thinks they own the device, they could be racing with a
 * release call path, so we can't trust drvdata for the shortcut.
 * Go the long way around, from the iommu_group to the vfio_group
 * to the vfio_device.
 */
struct vfio_device *vfio_device_get_from_dev(struct device *dev)
{
	struct vfio_group *group;
	struct vfio_device *device;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return NULL;

	device = vfio_group_get_device(group, dev);
	vfio_group_put(group);

	return device;
}
EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);

static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
						     char *buf)
{
	struct vfio_device *it, *device = NULL;

	mutex_lock(&group->device_lock);
	list_for_each_entry(it, &group->device_list, group_next) {
		if (!strcmp(dev_name(it->dev), buf)) {
			device = it;
			vfio_device_get(device);
			break;
		}
	}
	mutex_unlock(&group->device_lock);

	return device;
}

/*
 * Caller must hold a reference to the vfio_device
 */
void *vfio_device_data(struct vfio_device *device)
{
	return device->device_data;
}
EXPORT_SYMBOL_GPL(vfio_device_data);

/* Given a referenced group, check if it contains the device */
static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
{
	struct vfio_device *device;

	device = vfio_group_get_device(group, dev);
	if (!device)
		return false;

	vfio_device_put(device);
	return true;
}
/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Open file descriptors for the device... *sigh*
 */
void *vfio_del_group_dev(struct device *dev)
{
	struct vfio_device *device = dev_get_drvdata(dev);
	struct vfio_group *group = device->group;
	void *device_data = device->device_data;
	struct vfio_unbound_dev *unbound;
	unsigned int i = 0;
	long ret;
	bool interrupted = false;
	struct device_driver *drv;

	drv = dev->driver;

	/*
	 * The group exists so long as we have a device reference.  Get
	 * a group reference and use it to scan for the device going away.
	 */
	vfio_group_get(group);

	/*
	 * When the device is removed from the group, the group suddenly
	 * becomes non-viable; the device has a driver (until the unbind
	 * completes), but it's not present in the group.  This is bad news
	 * for any external users that need to re-acquire a group reference
	 * in order to match and release their existing reference.  To
	 * solve this, we track such devices on the unbound_list to bridge
	 * the gap until they're fully unbound.
	 */
	unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
	if (unbound) {
		unbound->dev = dev;
		mutex_lock(&group->unbound_lock);
		list_add(&unbound->unbound_next, &group->unbound_list);
		mutex_unlock(&group->unbound_lock);
	}
	WARN_ON(!unbound);

	vfio_device_put(device);

	/*
	 * If the device is still present in the group after the above
	 * 'put', then it is in use and we need to request it from the
	 * bus driver.  The driver may in turn need to request the
	 * device from the user.  We send the request on an arbitrary
	 * interval with counter to allow the driver to take escalating
	 * measures to release the device if it has the ability to do so.
	 */
	do {
		device = vfio_group_get_device(group, dev);
		if (!device)
			break;

		if (device->ops->request)
			device->ops->request(device_data, i++);

		vfio_device_put(device);

		if (interrupted) {
			ret = wait_event_timeout(vfio.release_q,
					!vfio_dev_present(group, dev), HZ * 10);
		} else {
			ret = wait_event_interruptible_timeout(vfio.release_q,
					!vfio_dev_present(group, dev), HZ * 10);
			if (ret == -ERESTARTSYS) {
				interrupted = true;
				dev_warn(dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}

		/*
		 * A concurrent operation may have released the driver
		 * successfully while we had dropped the lock, in which
		 * case there is nothing left to wait for.
		 */
		if (dev->driver != drv) {
			vfio_group_put(group);
			return NULL;
		}
	} while (ret <= 0);

	/*
	 * In order to support multiple devices per group, devices can be
	 * plucked from the group while other devices in the group are still
	 * in use.  The container persists with this group and those remaining
	 * devices still attached.  If the user creates an isolation violation
	 * by binding this device to another driver while the group is still in
	 * use, that's their fault.  However, in the case of removing the last,
	 * or potentially the only, device in the group there can be no other
	 * in-use devices in the group.  The user has done their due diligence
	 * and we should lay no claims to those devices.  In order to do that,
	 * we need to make sure the group is detached from the container.
	 * Without this stall, we're potentially racing with a user process
	 * that may attempt to immediately bind this device to another driver.
	 */
	if (list_empty(&group->device_list))
		wait_event(group->container_q, !group->container);

	vfio_group_put(group);

	return device_data;
}
EXPORT_SYMBOL_GPL(vfio_del_group_dev);
/**
 * VFIO base fd, /dev/vfio/vfio
 */
static long vfio_ioctl_check_extension(struct vfio_container *container,
				       unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = 0;

	down_read(&container->group_lock);

	driver = container->iommu_driver;

	switch (arg) {
		/* No base extensions yet */
	default:
		/*
		 * If no driver is set, poll all registered drivers for
		 * extensions and return the first positive result.  If
		 * a driver is already set, further queries will be passed
		 * only to that driver.
		 */
		if (!driver) {
			mutex_lock(&vfio.iommu_drivers_lock);
			list_for_each_entry(driver, &vfio.iommu_drivers_list,
					    vfio_next) {

#ifdef CONFIG_VFIO_NOIOMMU
				if (!list_empty(&container->group_list) &&
				    (container->noiommu !=
				     (driver->ops == &vfio_noiommu_ops)))
					continue;
#endif

				if (!try_module_get(driver->ops->owner))
					continue;

				ret = driver->ops->ioctl(NULL,
							 VFIO_CHECK_EXTENSION,
							 arg);
				module_put(driver->ops->owner);
				if (ret > 0)
					break;
			}
			mutex_unlock(&vfio.iommu_drivers_lock);
		} else
			ret = driver->ops->ioctl(container->iommu_data,
						 VFIO_CHECK_EXTENSION, arg);
	}

	up_read(&container->group_lock);

	return ret;
}

/* hold write lock on container->group_lock */
static int __vfio_container_attach_groups(struct vfio_container *container,
					  struct vfio_iommu_driver *driver,
					  void *data)
{
	struct vfio_group *group;
	int ret = -ENODEV;

	list_for_each_entry(group, &container->group_list, container_next) {
		ret = driver->ops->attach_group(data, group->iommu_group);
		if (ret)
			goto unwind;
	}

	return ret;

unwind:
	list_for_each_entry_continue_reverse(group, &container->group_list,
					     container_next) {
		driver->ops->detach_group(data, group->iommu_group);
	}

	return ret;
}
static long vfio_ioctl_set_iommu(struct vfio_container *container,
				 unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = -ENODEV;

	down_write(&container->group_lock);

	/*
	 * The container is designed to be an unprivileged interface while
	 * the group can be assigned to specific users.  Therefore, only by
	 * adding a group to a container does the user get the privilege of
	 * enabling the iommu, which may allocate finite resources.  There
	 * is no unset_iommu, but by removing all the groups from a container,
	 * the container is deprivileged and returns to an unset state.
	 */
	if (list_empty(&container->group_list) || container->iommu_driver) {
		up_write(&container->group_lock);
		return -EINVAL;
	}

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		void *data;

#ifdef CONFIG_VFIO_NOIOMMU
		/*
		 * Only noiommu containers can use vfio-noiommu and noiommu
		 * containers can only use vfio-noiommu.
		 */
		if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
			continue;
#endif

		if (!try_module_get(driver->ops->owner))
			continue;

		/*
		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
		 * so test which iommu driver reported support for this
		 * extension and call open on them.  We also pass them the
		 * magic, allowing a single driver to support multiple
		 * interfaces if they'd like.
		 */
		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
			module_put(driver->ops->owner);
			continue;
		}

		data = driver->ops->open(arg);
		if (IS_ERR(data)) {
			ret = PTR_ERR(data);
			module_put(driver->ops->owner);
			continue;
		}

		ret = __vfio_container_attach_groups(container, driver, data);
		if (ret) {
			driver->ops->release(data);
			module_put(driver->ops->owner);
			continue;
		}

		container->iommu_driver = driver;
		container->iommu_data = data;
		break;
	}

	mutex_unlock(&vfio.iommu_drivers_lock);
	up_write(&container->group_lock);

	return ret;
}
static long vfio_fops_unl_ioctl(struct file *filep,
				unsigned int cmd, unsigned long arg)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	void *data;
	long ret = -EINVAL;

	if (!container)
		return ret;

	switch (cmd) {
	case VFIO_GET_API_VERSION:
		ret = VFIO_API_VERSION;
		break;
	case VFIO_CHECK_EXTENSION:
		ret = vfio_ioctl_check_extension(container, arg);
		break;
	case VFIO_SET_IOMMU:
		ret = vfio_ioctl_set_iommu(container, arg);
		break;
	default:
		driver = container->iommu_driver;
		data = container->iommu_data;

		if (driver) /* passthrough all unrecognized ioctls */
			ret = driver->ops->ioctl(data, cmd, arg);
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static long vfio_fops_compat_ioctl(struct file *filep,
				   unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */

static int vfio_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_container *container;

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return -ENOMEM;

	INIT_LIST_HEAD(&container->group_list);
	init_rwsem(&container->group_lock);
	kref_init(&container->kref);

	filep->private_data = container;

	return 0;
}

static int vfio_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_container *container = filep->private_data;

	filep->private_data = NULL;

	vfio_container_put(container);

	return 0;
}
/*
 * Once an iommu driver is set, we optionally pass read/write/mmap
 * on to the driver, allowing management interfaces beyond ioctl.
 */
static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
			      size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->read))
		ret = driver->ops->read(container->iommu_data,
					buf, count, ppos);

	return ret;
}

static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
			       size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->write))
		ret = driver->ops->write(container->iommu_data,
					 buf, count, ppos);

	return ret;
}

static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	int ret = -EINVAL;

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->mmap))
		ret = driver->ops->mmap(container->iommu_data, vma);

	return ret;
}

static const struct file_operations vfio_fops = {
	.owner		= THIS_MODULE,
	.open		= vfio_fops_open,
	.release	= vfio_fops_release,
	.read		= vfio_fops_read,
	.write		= vfio_fops_write,
	.unlocked_ioctl	= vfio_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_fops_compat_ioctl,
#endif
	.mmap		= vfio_fops_mmap,
};
/**
 * VFIO Group fd, /dev/vfio/$GROUP
 */
static void __vfio_group_unset_container(struct vfio_group *group)
{
	struct vfio_container *container = group->container;
	struct vfio_iommu_driver *driver;

	down_write(&container->group_lock);

	driver = container->iommu_driver;
	if (driver)
		driver->ops->detach_group(container->iommu_data,
					  group->iommu_group);

	group->container = NULL;
	wake_up(&group->container_q);
	list_del(&group->container_next);

	/* Detaching the last group deprivileges a container, remove iommu */
	if (driver && list_empty(&container->group_list)) {
		driver->ops->release(container->iommu_data);
		module_put(driver->ops->owner);
		container->iommu_driver = NULL;
		container->iommu_data = NULL;
	}

	up_write(&container->group_lock);

	vfio_container_put(container);
}

/*
 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
 * if there was no container to unset.  Since the ioctl is called on
 * the group, we know that still exists, therefore the only valid
 * transition here is 1->0.
 */
static int vfio_group_unset_container(struct vfio_group *group)
{
	int users = atomic_cmpxchg(&group->container_users, 1, 0);

	if (!users)
		return -EINVAL;
	if (users != 1)
		return -EBUSY;

	__vfio_group_unset_container(group);

	return 0;
}

/*
 * When removing container users, anything that removes the last user
 * implicitly removes the group from the container.  That is, if the
 * group file descriptor is closed, as well as any device file descriptors,
 * the group is free.
 */
static void vfio_group_try_dissolve_container(struct vfio_group *group)
{
	if (0 == atomic_dec_if_positive(&group->container_users))
		__vfio_group_unset_container(group);
}
static int vfio_group_set_container(struct vfio_group *group, int container_fd)
{
	struct fd f;
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

	if (atomic_read(&group->container_users))
		return -EINVAL;

	if (group->noiommu && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	f = fdget(container_fd);
	if (!f.file)
		return -EBADF;

	/* Sanity check, is this really our fd? */
	if (f.file->f_op != &vfio_fops) {
		fdput(f);
		return -EINVAL;
	}

	container = f.file->private_data;
	WARN_ON(!container); /* fget ensures we don't race vfio_release */

	down_write(&container->group_lock);

	/* Real groups and fake groups cannot mix */
	if (!list_empty(&container->group_list) &&
	    container->noiommu != group->noiommu) {
		ret = -EPERM;
		goto unlock_out;
	}

	driver = container->iommu_driver;
	if (driver) {
		ret = driver->ops->attach_group(container->iommu_data,
						group->iommu_group);
		if (ret)
			goto unlock_out;
	}

	group->container = container;
	container->noiommu = group->noiommu;
	list_add(&group->container_next, &container->group_list);

	/* Get a reference on the container and mark a user within the group */
	vfio_container_get(container);
	atomic_inc(&group->container_users);

unlock_out:
	up_write(&container->group_lock);
	fdput(f);
	return ret;
}

static bool vfio_group_viable(struct vfio_group *group)
{
	return (iommu_group_for_each_dev(group->iommu_group,
					 group, vfio_dev_viable) == 0);
}

static int vfio_group_add_container_user(struct vfio_group *group)
{
	if (!atomic_inc_not_zero(&group->container_users))
		return -EINVAL;

	if (group->noiommu) {
		atomic_dec(&group->container_users);
		return -EPERM;
	}
	if (!group->container->iommu_driver || !vfio_group_viable(group)) {
		atomic_dec(&group->container_users);
		return -EINVAL;
	}

	return 0;
}
static const struct file_operations vfio_device_fops;

static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
{
	struct vfio_device *device;
	struct file *filep;
	int ret;

	if (0 == atomic_read(&group->container_users) ||
	    !group->container->iommu_driver || !vfio_group_viable(group))
		return -EINVAL;

	if (group->noiommu && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	device = vfio_device_get_from_name(group, buf);
	if (!device)
		return -ENODEV;

	ret = device->ops->open(device->device_data);
	if (ret) {
		vfio_device_put(device);
		return ret;
	}

	/*
	 * We can't use anon_inode_getfd() because we need to modify
	 * the f_mode flags directly to allow more than just ioctls
	 */
	ret = get_unused_fd_flags(O_CLOEXEC);
	if (ret < 0) {
		device->ops->release(device->device_data);
		vfio_device_put(device);
		return ret;
	}

	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
				   device, O_RDWR);
	if (IS_ERR(filep)) {
		put_unused_fd(ret);
		ret = PTR_ERR(filep);
		device->ops->release(device->device_data);
		vfio_device_put(device);
		return ret;
	}

	/*
	 * TODO: add an anon_inode interface to do this.
	 * Appears to be missing by lack of need rather than
	 * explicitly prevented.  Now there's need.
	 */
	filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);

	atomic_inc(&group->container_users);

	fd_install(ret, filep);

	if (group->noiommu)
		dev_warn(device->dev, "vfio-noiommu device opened by user "
			 "(%s:%d)\n", current->comm, task_pid_nr(current));

	return ret;
}
static long vfio_group_fops_unl_ioctl(struct file *filep,
				      unsigned int cmd, unsigned long arg)
{
	struct vfio_group *group = filep->private_data;
	long ret = -ENOTTY;

	switch (cmd) {
	case VFIO_GROUP_GET_STATUS:
	{
		struct vfio_group_status status;
		unsigned long minsz;

		minsz = offsetofend(struct vfio_group_status, flags);

		if (copy_from_user(&status, (void __user *)arg, minsz))
			return -EFAULT;

		if (status.argsz < minsz)
			return -EINVAL;

		status.flags = 0;

		if (vfio_group_viable(group))
			status.flags |= VFIO_GROUP_FLAGS_VIABLE;

		if (group->container)
			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;

		if (copy_to_user((void __user *)arg, &status, minsz))
			return -EFAULT;

		ret = 0;
		break;
	}
	case VFIO_GROUP_SET_CONTAINER:
	{
		int fd;

		if (get_user(fd, (int __user *)arg))
			return -EFAULT;

		if (fd < 0)
			return -EINVAL;

		ret = vfio_group_set_container(group, fd);
		break;
	}
	case VFIO_GROUP_UNSET_CONTAINER:
		ret = vfio_group_unset_container(group);
		break;
	case VFIO_GROUP_GET_DEVICE_FD:
	{
		char *buf;

		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
		if (IS_ERR(buf))
			return PTR_ERR(buf);

		ret = vfio_group_get_device_fd(group, buf);
		kfree(buf);
		break;
	}
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static long vfio_group_fops_compat_ioctl(struct file *filep,
					 unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_group_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */
static int vfio_group_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_group *group;
	int opened;

	group = vfio_group_get_from_minor(iminor(inode));
	if (!group)
		return -ENODEV;

	if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
		vfio_group_put(group);
		return -EPERM;
	}

	/* Do we need multiple instances of the group open?  Seems not. */
	opened = atomic_cmpxchg(&group->opened, 0, 1);
	if (opened) {
		vfio_group_put(group);
		return -EBUSY;
	}

	/* Is something still in use from a previous open? */
	if (group->container) {
		atomic_dec(&group->opened);
		vfio_group_put(group);
		return -EBUSY;
	}

	/* Warn if previous user didn't cleanup and re-init to drop them */
	if (WARN_ON(group->notifier.head))
		BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);

	filep->private_data = group;

	return 0;
}

static int vfio_group_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	filep->private_data = NULL;

	vfio_group_try_dissolve_container(group);

	atomic_dec(&group->opened);

	vfio_group_put(group);

	return 0;
}

static const struct file_operations vfio_group_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_group_fops_compat_ioctl,
#endif
	.open		= vfio_group_fops_open,
	.release	= vfio_group_fops_release,
};
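/*
 * Example (illustrative sketch of the userspace side, mirroring the flow in
 * Documentation/vfio.txt): the intended ioctl sequence across the container
 * fd (/dev/vfio/vfio) and a group fd (/dev/vfio/$GROUP).  Group number 26
 * and device name "0000:06:0d.0" are made up for the example.
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *	int group = open("/dev/vfio/26", O_RDWR);
 *	struct vfio_group_status status = { .argsz = sizeof(status) };
 *
 *	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
 *	if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
 *		return -1;	// a device in the group is not bound properly
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *	int device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */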
/**
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device *device = filep->private_data;

	device->ops->release(device->device_data);

	vfio_group_try_dissolve_container(device->group);

	vfio_device_put(device);

	return 0;
}

static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->ioctl))
		return -EINVAL;

	return device->ops->ioctl(device->device_data, cmd, arg);
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device->device_data, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device->device_data, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device->device_data, vma);
}

#ifdef CONFIG_COMPAT
static long vfio_device_fops_compat_ioctl(struct file *filep,
					  unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_device_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */

static const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_device_fops_compat_ioctl,
#endif
	.mmap		= vfio_device_fops_mmap,
};
/**
 * External user API, exported by symbols to be linked dynamically.
 *
 * The protocol includes:
 *  1. do normal VFIO init operation:
 *	- opening a new container;
 *	- attaching group(s) to it;
 *	- setting an IOMMU driver for a container.
 * When IOMMU is set for a container, all groups in it are
 * considered ready to use by an external user.
 *
 * 2. User space passes a group fd to an external user.
 * The external user calls vfio_group_get_external_user()
 * to verify that:
 *	- the group is initialized;
 *	- IOMMU is set for it.
 * If both checks passed, vfio_group_get_external_user()
 * increments the container user counter to prevent
 * the VFIO group from disposal before KVM exits.
 *
 * 3. The external user calls vfio_external_user_iommu_id()
 * to know an IOMMU ID.
 *
 * 4. When the external KVM finishes, it calls
 * vfio_group_put_external_user() to release the VFIO group.
 * This call decrements the container user counter.
 */
struct vfio_group *vfio_group_get_external_user(struct file *filep)
{
	struct vfio_group *group = filep->private_data;
	int ret;

	if (filep->f_op != &vfio_group_fops)
		return ERR_PTR(-EINVAL);

	ret = vfio_group_add_container_user(group);
	if (ret)
		return ERR_PTR(ret);

	vfio_group_get(group);

	return group;
}
EXPORT_SYMBOL_GPL(vfio_group_get_external_user);

void vfio_group_put_external_user(struct vfio_group *group)
{
	vfio_group_try_dissolve_container(group);
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_group_put_external_user);

bool vfio_external_group_match_file(struct vfio_group *test_group,
				    struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	return (filep->f_op == &vfio_group_fops) && (group == test_group);
}
EXPORT_SYMBOL_GPL(vfio_external_group_match_file);

int vfio_external_user_iommu_id(struct vfio_group *group)
{
	return iommu_group_id(group->iommu_group);
}
EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);

long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
{
	return vfio_ioctl_check_extension(group->container, arg);
}
EXPORT_SYMBOL_GPL(vfio_external_check_extension);
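/*
 * Example (illustrative sketch): how an external user such as KVM consumes
 * this API.  The fdget() based lookup and the variable names are assumptions
 * for illustration, not the actual KVM code.
 *
 *	struct fd f = fdget(group_fd);
 *	struct vfio_group *grp;
 *	int id;
 *
 *	if (!f.file)
 *		return -EBADF;
 *
 *	grp = vfio_group_get_external_user(f.file);
 *	fdput(f);
 *	if (IS_ERR(grp))
 *		return PTR_ERR(grp);
 *
 *	id = vfio_external_user_iommu_id(grp);	// step 3 of the protocol
 *	...
 *	vfio_group_put_external_user(grp);	// step 4, when the VM exits
 */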
/**
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities, allocate or
 * reallocate a buffer with additional @size, filling in @id and @version
 * of the capability.  A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail, vfio_info_cap_shift() should be called to fixup the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);

void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);
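/*
 * Example (illustrative sketch): building a capability chain for a region
 * info reply.  "struct my_cap" and MY_CAP_ID are hypothetical; the final
 * shift by the size of the fixed vfio_region_info struct follows the NB
 * above, since the chain offsets are buffer-relative until then.
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *	struct vfio_info_cap_header *hdr;
 *
 *	hdr = vfio_info_cap_add(&caps, sizeof(struct my_cap), MY_CAP_ID, 1);
 *	if (IS_ERR(hdr))
 *		return PTR_ERR(hdr);
 *	// fill the body via container_of(hdr, struct my_cap, header)
 *
 *	vfio_info_cap_shift(&caps, sizeof(struct vfio_region_info));
 *	// copy the fixed info then caps.buf to userspace, kfree(caps.buf)
 */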
static int sparse_mmap_cap(struct vfio_info_cap *caps, void *cap_type)
{
	struct vfio_info_cap_header *header;
	struct vfio_region_info_cap_sparse_mmap *sparse_cap, *sparse = cap_type;
	size_t size;

	size = sizeof(*sparse) + sparse->nr_areas * sizeof(*sparse->areas);
	header = vfio_info_cap_add(caps, size,
				   VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1);
	if (IS_ERR(header))
		return PTR_ERR(header);

	sparse_cap = container_of(header,
			struct vfio_region_info_cap_sparse_mmap, header);
	sparse_cap->nr_areas = sparse->nr_areas;
	memcpy(sparse_cap->areas, sparse->areas,
	       sparse->nr_areas * sizeof(*sparse->areas));
	return 0;
}

static int region_type_cap(struct vfio_info_cap *caps, void *cap_type)
{
	struct vfio_info_cap_header *header;
	struct vfio_region_info_cap_type *type_cap, *cap = cap_type;

	header = vfio_info_cap_add(caps, sizeof(*cap),
				   VFIO_REGION_INFO_CAP_TYPE, 1);
	if (IS_ERR(header))
		return PTR_ERR(header);

	type_cap = container_of(header, struct vfio_region_info_cap_type,
				header);
	type_cap->type = cap->type;
	type_cap->subtype = cap->subtype;
	return 0;
}

int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id,
			     void *cap_type)
{
	int ret = -EINVAL;

	if (!cap_type)
		return 0;

	switch (cap_type_id) {
	case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
		ret = sparse_mmap_cap(caps, cap_type);
		break;

	case VFIO_REGION_INFO_CAP_TYPE:
		ret = region_type_cap(caps, cap_type);
		break;
	}

	return ret;
}
EXPORT_SYMBOL(vfio_info_add_capability);
int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
			    VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
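/*
 * Example (illustrative sketch): a vendor driver's VFIO_DEVICE_SET_IRQS
 * handler validating the header and sizing the trailing payload before
 * copying it in.  "my_num_irqs" and the surrounding handler are assumptions
 * for illustration.
 *
 *	struct vfio_irq_set hdr;
 *	size_t data_size = 0;
 *	u8 *data = NULL;
 *	int ret;
 *
 *	if (copy_from_user(&hdr, (void __user *)arg, sizeof(hdr)))
 *		return -EFAULT;
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, my_num_irqs,
 *						 VFIO_PCI_NUM_IRQS, &data_size);
 *	if (ret)
 *		return ret;
 *
 *	if (data_size) {
 *		data = memdup_user((void __user *)(arg + sizeof(hdr)),
 *				   data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 */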
/*
 * Pin a set of guest PFNs and return their associated host PFNs for local
 * domain only.
 * @dev [in]     : device
 * @user_pfn [in]: array of user/guest PFNs to be pinned.
 * @npage [in]   : count of elements in user_pfn array.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @phys_pfn[out]: array of host PFNs
 * Return error or number of pages pinned.
 */
int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
		   int prot, unsigned long *phys_pfn)
{
	struct vfio_container *container;
	struct vfio_group *group;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!dev || !user_pfn || !phys_pfn || !npage)
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	ret = vfio_group_add_container_user(group);
	if (ret)
		goto err_pin_pages;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->pin_pages))
		ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
					     npage, prot, phys_pfn);
	else
		ret = -ENOTTY;

	vfio_group_try_dissolve_container(group);

err_pin_pages:
	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_pin_pages);

/*
 * Unpin set of host PFNs for local domain only.
 * @dev [in]     : device
 * @user_pfn [in]: array of user/guest PFNs to be unpinned.  Number of
 *		   user/guest PFNs should not be greater than
 *		   VFIO_PIN_PAGES_MAX_ENTRIES.
 * @npage [in]   : count of elements in user_pfn array.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * Return error or number of pages unpinned.
 */
int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
{
	struct vfio_container *container;
	struct vfio_group *group;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!dev || !user_pfn || !npage)
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	ret = vfio_group_add_container_user(group);
	if (ret)
		goto err_unpin_pages;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unpin_pages))
		ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
					       npage);
	else
		ret = -ENOTTY;

	vfio_group_try_dissolve_container(group);

err_unpin_pages:
	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_unpin_pages);
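/*
 * Example (illustrative sketch): an mdev vendor driver translating a single
 * guest PFN before programming DMA, in the style of the kvmgt usage.
 * "mdev" and "gpa" are assumptions for illustration; error handling trimmed.
 *
 *	unsigned long gfn = gpa >> PAGE_SHIFT, hpfn;
 *	int ret;
 *
 *	ret = vfio_pin_pages(mdev_dev(mdev), &gfn, 1,
 *			     IOMMU_READ | IOMMU_WRITE, &hpfn);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *
 *	// ... hand (hpfn << PAGE_SHIFT) to the hardware ...
 *
 *	vfio_unpin_pages(mdev_dev(mdev), &gfn, 1);
 */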
static int vfio_register_iommu_notifier(struct vfio_group *group,
					unsigned long *events,
					struct notifier_block *nb)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->register_notifier))
		ret = driver->ops->register_notifier(container->iommu_data,
						     events, nb);
	else
		ret = -ENOTTY;

	vfio_group_try_dissolve_container(group);

	return ret;
}

static int vfio_unregister_iommu_notifier(struct vfio_group *group,
					  struct notifier_block *nb)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unregister_notifier))
		ret = driver->ops->unregister_notifier(container->iommu_data,
						       nb);
	else
		ret = -ENOTTY;

	vfio_group_try_dissolve_container(group);

	return ret;
}

void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
{
	group->kvm = kvm;
	blocking_notifier_call_chain(&group->notifier,
				VFIO_GROUP_NOTIFY_SET_KVM, kvm);
}
EXPORT_SYMBOL_GPL(vfio_group_set_kvm);

static int vfio_register_group_notifier(struct vfio_group *group,
					unsigned long *events,
					struct notifier_block *nb)
{
	int ret;
	bool set_kvm = false;

	if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
		set_kvm = true;

	/* clear known events */
	*events &= ~VFIO_GROUP_NOTIFY_SET_KVM;

	/* refuse to continue if still events remaining */
	if (*events)
		return -EINVAL;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	ret = blocking_notifier_chain_register(&group->notifier, nb);

	/*
	 * The attaching of kvm and vfio_group might already happen, so
	 * here we replay once upon registration.
	 */
	if (!ret && set_kvm && group->kvm)
		blocking_notifier_call_chain(&group->notifier,
					VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);

	vfio_group_try_dissolve_container(group);

	return ret;
}

static int vfio_unregister_group_notifier(struct vfio_group *group,
					  struct notifier_block *nb)
{
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	ret = blocking_notifier_chain_unregister(&group->notifier, nb);

	vfio_group_try_dissolve_container(group);

	return ret;
}
int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
			   unsigned long *events, struct notifier_block *nb)
{
	struct vfio_group *group;
	int ret;

	if (!dev || !nb || !events || (*events == 0))
		return -EINVAL;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	switch (type) {
	case VFIO_IOMMU_NOTIFY:
		ret = vfio_register_iommu_notifier(group, events, nb);
		break;
	case VFIO_GROUP_NOTIFY:
		ret = vfio_register_group_notifier(group, events, nb);
		break;
	default:
		ret = -EINVAL;
	}

	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_register_notifier);

int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
			     struct notifier_block *nb)
{
	struct vfio_group *group;
	int ret;

	if (!dev || !nb)
		return -EINVAL;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	switch (type) {
	case VFIO_IOMMU_NOTIFY:
		ret = vfio_unregister_iommu_notifier(group, nb);
		break;
	case VFIO_GROUP_NOTIFY:
		ret = vfio_unregister_group_notifier(group, nb);
		break;
	default:
		ret = -EINVAL;
	}

	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_unregister_notifier);
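/*
 * Example (illustrative sketch): a vendor driver subscribing to DMA unmap
 * events so it can unpin pages it holds.  "my_iommu_notifier" and the
 * surrounding setup are assumptions for illustration.
 *
 *	static int my_iommu_notifier(struct notifier_block *nb,
 *				     unsigned long action, void *data)
 *	{
 *		if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
 *			struct vfio_iommu_type1_dma_unmap *unmap = data;
 *			// unpin anything in [unmap->iova, +unmap->size)
 *		}
 *		return NOTIFY_OK;
 *	}
 *
 *	unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
 *	struct notifier_block nb = { .notifier_call = my_iommu_notifier };
 *	int ret;
 *
 *	ret = vfio_register_notifier(dev, VFIO_IOMMU_NOTIFY, &events, &nb);
 *	...
 *	vfio_unregister_notifier(dev, VFIO_IOMMU_NOTIFY, &nb);
 */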
/**
 * Module/class support
 */
static char *vfio_devnode(struct device *dev, umode_t *mode)
{
	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
}

static struct miscdevice vfio_dev = {
	.minor = VFIO_MINOR,
	.name = "vfio",
	.fops = &vfio_fops,
	.nodename = "vfio/vfio",
	.mode = S_IRUGO | S_IWUGO,
};

static int __init vfio_init(void)
{
	int ret;

	idr_init(&vfio.group_idr);
	mutex_init(&vfio.group_lock);
	mutex_init(&vfio.iommu_drivers_lock);
	INIT_LIST_HEAD(&vfio.group_list);
	INIT_LIST_HEAD(&vfio.iommu_drivers_list);
	init_waitqueue_head(&vfio.release_q);

	ret = misc_register(&vfio_dev);
	if (ret) {
		pr_err("vfio: misc device register failed\n");
		return ret;
	}

	/* /dev/vfio/$GROUP */
	vfio.class = class_create(THIS_MODULE, "vfio");
	if (IS_ERR(vfio.class)) {
		ret = PTR_ERR(vfio.class);
		goto err_class;
	}

	vfio.class->devnode = vfio_devnode;

	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK, "vfio");
	if (ret)
		goto err_alloc_chrdev;

	cdev_init(&vfio.group_cdev, &vfio_group_fops);
	ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK);
	if (ret)
		goto err_cdev_add;

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_register_iommu_driver(&vfio_noiommu_ops);
#endif
	return 0;

err_cdev_add:
	unregister_chrdev_region(vfio.group_devt, MINORMASK);
err_alloc_chrdev:
	class_destroy(vfio.class);
	vfio.class = NULL;
err_class:
	misc_deregister(&vfio_dev);
	return ret;
}
static void __exit vfio_cleanup(void)
{
	WARN_ON(!list_empty(&vfio.group_list));

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
#endif
	idr_destroy(&vfio.group_idr);
	cdev_del(&vfio.group_cdev);
	unregister_chrdev_region(vfio.group_devt, MINORMASK);
	class_destroy(vfio.class);
	vfio.class = NULL;
	misc_deregister(&vfio_dev);
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");