drivers/vfio/vfio.c (mirror_ubuntu-bionic-kernel.git)
1/*
2 * VFIO core
3 *
4 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
5 * Author: Alex Williamson <alex.williamson@redhat.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * Derived from original vfio:
12 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
13 * Author: Tom Lyon, pugs@cisco.com
14 */
15
16#include <linux/cdev.h>
17#include <linux/compat.h>
18#include <linux/device.h>
19#include <linux/file.h>
20#include <linux/anon_inodes.h>
21#include <linux/fs.h>
22#include <linux/idr.h>
23#include <linux/iommu.h>
24#include <linux/list.h>
25#include <linux/miscdevice.h>
26#include <linux/module.h>
27#include <linux/mutex.h>
28#include <linux/pci.h>
29#include <linux/rwsem.h>
30#include <linux/sched.h>
31#include <linux/slab.h>
32#include <linux/stat.h>
33#include <linux/string.h>
34#include <linux/uaccess.h>
35#include <linux/vfio.h>
36#include <linux/wait.h>
37
38#define DRIVER_VERSION "0.3"
39#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
40#define DRIVER_DESC "VFIO - User Level meta-driver"
41
42static struct vfio {
43 struct class *class;
44 struct list_head iommu_drivers_list;
45 struct mutex iommu_drivers_lock;
46 struct list_head group_list;
47 struct idr group_idr;
48 struct mutex group_lock;
49 struct cdev group_cdev;
50	dev_t			group_devt;
51 wait_queue_head_t release_q;
52} vfio;
53
54struct vfio_iommu_driver {
55 const struct vfio_iommu_driver_ops *ops;
56 struct list_head vfio_next;
57};
58
59struct vfio_container {
60 struct kref kref;
61 struct list_head group_list;
62	struct rw_semaphore		group_lock;
63 struct vfio_iommu_driver *iommu_driver;
64 void *iommu_data;
65	bool				noiommu;
66};
67
68struct vfio_unbound_dev {
69 struct device *dev;
70 struct list_head unbound_next;
71};
72
73struct vfio_group {
74 struct kref kref;
75 int minor;
76 atomic_t container_users;
77 struct iommu_group *iommu_group;
78 struct vfio_container *container;
79 struct list_head device_list;
80 struct mutex device_lock;
81 struct device *dev;
82 struct notifier_block nb;
83 struct list_head vfio_next;
84 struct list_head container_next;
85 struct list_head unbound_list;
86 struct mutex unbound_lock;
87	atomic_t			opened;
88	bool				noiommu;
89 struct kvm *kvm;
90 struct blocking_notifier_head notifier;
91};
92
93struct vfio_device {
94 struct kref kref;
95 struct device *dev;
96 const struct vfio_device_ops *ops;
97 struct vfio_group *group;
98 struct list_head group_next;
99 void *device_data;
100};
101
102#ifdef CONFIG_VFIO_NOIOMMU
103static bool noiommu __read_mostly;
104module_param_named(enable_unsafe_noiommu_mode,
105 noiommu, bool, S_IRUGO | S_IWUSR);
106MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)");
107#endif
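/*
 * Illustrative usage note (not part of the original source): with a kernel
 * built with CONFIG_VFIO_NOIOMMU, the unsafe mode above can be enabled at
 * module load time or toggled later through sysfs, for example:
 *
 *   modprobe vfio enable_unsafe_noiommu_mode=1
 *   echo Y > /sys/module/vfio/parameters/enable_unsafe_noiommu_mode
 *
 * Groups created this way show up as /dev/vfio/noiommu-$GROUP (see
 * vfio_create_group() below) and still require CAP_SYS_RAWIO to open.
 */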
108
109/*
110 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
111 * and remove functions, any use cases other than acquiring the first
112 * reference for the purpose of calling vfio_add_group_dev() or removing
113 * that symmetric reference after vfio_del_group_dev() should use the raw
114 * iommu_group_{get,put} functions. In particular, vfio_iommu_group_put()
115 * removes the device from the dummy group and cannot be nested.
116 */
117struct iommu_group *vfio_iommu_group_get(struct device *dev)
118{
119 struct iommu_group *group;
120 int __maybe_unused ret;
121
122 group = iommu_group_get(dev);
123
124#ifdef CONFIG_VFIO_NOIOMMU
125 /*
126 * With noiommu enabled, an IOMMU group will be created for a device
127 * that doesn't already have one and doesn't have iommu_ops on its
128 * bus.  We set iommudata simply to be able to identify these groups
129 * as special use and for reclamation later.
130 */
131 if (group || !noiommu || iommu_present(dev->bus))
132 return group;
133
134 group = iommu_group_alloc();
135 if (IS_ERR(group))
136 return NULL;
137
138 iommu_group_set_name(group, "vfio-noiommu");
139	iommu_group_set_iommudata(group, &noiommu, NULL);
140 ret = iommu_group_add_device(group, dev);
141 iommu_group_put(group);
142 if (ret)
143 return NULL;
144
145 /*
146 * Where to taint? At this point we've added an IOMMU group for a
147 * device that is not backed by iommu_ops, therefore any iommu_
148 * callback using iommu_ops can legitimately Oops. So, while we may
149 * be about to give a DMA capable device to a user without IOMMU
150 * protection, which is clearly taint-worthy, let's go ahead and do
151 * it here.
152 */
153 add_taint(TAINT_USER, LOCKDEP_STILL_OK);
154 dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
155#endif
156
157 return group;
158}
159EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
160
161void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
162{
163#ifdef CONFIG_VFIO_NOIOMMU
164	if (iommu_group_get_iommudata(group) == &noiommu)
165 iommu_group_remove_device(dev);
166#endif
167
168 iommu_group_put(group);
169}
170EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
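/*
 * Minimal sketch (illustrative only, not part of this file) of how a VFIO
 * bus driver is expected to pair the two helpers above with
 * vfio_add_group_dev()/vfio_del_group_dev() in its probe/remove path.
 * my_vfio_probe(), my_vfio_remove(), my_vfio_dev_ops and the my_*_state()
 * helpers are hypothetical.
 */
#if 0
static int my_vfio_probe(struct device *dev)
{
	struct iommu_group *group;
	void *my_device_data;
	int ret;

	/* First reference; may create a noiommu group when enabled */
	group = vfio_iommu_group_get(dev);
	if (!group)
		return -EINVAL;

	my_device_data = my_alloc_state(dev);
	ret = vfio_add_group_dev(dev, &my_vfio_dev_ops, my_device_data);
	if (ret)
		vfio_iommu_group_put(group, dev);

	return ret;
}

static void my_vfio_remove(struct device *dev)
{
	void *my_device_data = vfio_del_group_dev(dev);

	/* Drop the symmetric reference taken in probe */
	vfio_iommu_group_put(dev->iommu_group, dev);
	my_free_state(my_device_data);
}
#endif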
171
172#ifdef CONFIG_VFIO_NOIOMMU
173static void *vfio_noiommu_open(unsigned long arg)
174{
175 if (arg != VFIO_NOIOMMU_IOMMU)
176 return ERR_PTR(-EINVAL);
177 if (!capable(CAP_SYS_RAWIO))
178 return ERR_PTR(-EPERM);
179
180 return NULL;
181}
182
183static void vfio_noiommu_release(void *iommu_data)
184{
185}
186
187static long vfio_noiommu_ioctl(void *iommu_data,
188 unsigned int cmd, unsigned long arg)
189{
190 if (cmd == VFIO_CHECK_EXTENSION)
191 return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
192
193 return -ENOTTY;
194}
195
196static int vfio_noiommu_attach_group(void *iommu_data,
197 struct iommu_group *iommu_group)
198{
199	return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
200}
201
202static void vfio_noiommu_detach_group(void *iommu_data,
203 struct iommu_group *iommu_group)
204{
205}
206
207static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
208 .name = "vfio-noiommu",
209 .owner = THIS_MODULE,
210 .open = vfio_noiommu_open,
211 .release = vfio_noiommu_release,
212 .ioctl = vfio_noiommu_ioctl,
213 .attach_group = vfio_noiommu_attach_group,
214 .detach_group = vfio_noiommu_detach_group,
215};
216#endif
217
218
219/**
220 * IOMMU driver registration
221 */
222int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
223{
224 struct vfio_iommu_driver *driver, *tmp;
225
226 driver = kzalloc(sizeof(*driver), GFP_KERNEL);
227 if (!driver)
228 return -ENOMEM;
229
230 driver->ops = ops;
231
232 mutex_lock(&vfio.iommu_drivers_lock);
233
234 /* Check for duplicates */
235 list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
236 if (tmp->ops == ops) {
237 mutex_unlock(&vfio.iommu_drivers_lock);
238 kfree(driver);
239 return -EINVAL;
240 }
241 }
242
243 list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
244
245 mutex_unlock(&vfio.iommu_drivers_lock);
246
247 return 0;
248}
249EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
250
251void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
252{
253 struct vfio_iommu_driver *driver;
254
255 mutex_lock(&vfio.iommu_drivers_lock);
256 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
257 if (driver->ops == ops) {
258 list_del(&driver->vfio_next);
259 mutex_unlock(&vfio.iommu_drivers_lock);
260 kfree(driver);
261 return;
262 }
263 }
264 mutex_unlock(&vfio.iommu_drivers_lock);
265}
266EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
267
268/**
269 * Group minor allocation/free - both called with vfio.group_lock held
270 */
271static int vfio_alloc_group_minor(struct vfio_group *group)
272{
273	return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
274}
275
276static void vfio_free_group_minor(int minor)
277{
278 idr_remove(&vfio.group_idr, minor);
279}
280
281static int vfio_iommu_group_notifier(struct notifier_block *nb,
282 unsigned long action, void *data);
283static void vfio_group_get(struct vfio_group *group);
284
285/**
286 * Container objects - containers are created when /dev/vfio/vfio is
287 * opened, but their lifecycle extends until the last user is done, so
288 * it's freed via kref. Must support container/group/device being
289 * closed in any order.
290 */
291static void vfio_container_get(struct vfio_container *container)
292{
293 kref_get(&container->kref);
294}
295
296static void vfio_container_release(struct kref *kref)
297{
298 struct vfio_container *container;
299 container = container_of(kref, struct vfio_container, kref);
300
301 kfree(container);
302}
303
304static void vfio_container_put(struct vfio_container *container)
305{
306 kref_put(&container->kref, vfio_container_release);
307}
308
309static void vfio_group_unlock_and_free(struct vfio_group *group)
310{
311 mutex_unlock(&vfio.group_lock);
312 /*
313 * Unregister outside of lock. A spurious callback is harmless now
314 * that the group is no longer in vfio.group_list.
315 */
316 iommu_group_unregister_notifier(group->iommu_group, &group->nb);
317 kfree(group);
318}
319
320/**
321 * Group objects - create, release, get, put, search
322 */
323static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
324{
325 struct vfio_group *group, *tmp;
326 struct device *dev;
327 int ret, minor;
328
329 group = kzalloc(sizeof(*group), GFP_KERNEL);
330 if (!group)
331 return ERR_PTR(-ENOMEM);
332
333 kref_init(&group->kref);
334 INIT_LIST_HEAD(&group->device_list);
335 mutex_init(&group->device_lock);
336 INIT_LIST_HEAD(&group->unbound_list);
337 mutex_init(&group->unbound_lock);
338	atomic_set(&group->container_users, 0);
339	atomic_set(&group->opened, 0);
340	group->iommu_group = iommu_group;
341#ifdef CONFIG_VFIO_NOIOMMU
342 group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
343#endif
344	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
345
346 group->nb.notifier_call = vfio_iommu_group_notifier;
347
348 /*
349 * blocking notifiers acquire a rwsem around registering and hold
350 * it around callback. Therefore, need to register outside of
351 * vfio.group_lock to avoid A-B/B-A contention. Our callback won't
352 * do anything unless it can find the group in vfio.group_list, so
353 * no harm in registering early.
354 */
355 ret = iommu_group_register_notifier(iommu_group, &group->nb);
356 if (ret) {
357 kfree(group);
358 return ERR_PTR(ret);
359 }
360
361 mutex_lock(&vfio.group_lock);
362
363 /* Did we race creating this group? */
364 list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
365 if (tmp->iommu_group == iommu_group) {
366 vfio_group_get(tmp);
367			vfio_group_unlock_and_free(group);
368 return tmp;
369 }
370 }
371
372 minor = vfio_alloc_group_minor(group);
373 if (minor < 0) {
374 vfio_group_unlock_and_free(group);
375 return ERR_PTR(minor);
376 }
377
378 dev = device_create(vfio.class, NULL,
379 MKDEV(MAJOR(vfio.group_devt), minor),
380 group, "%s%d", group->noiommu ? "noiommu-" : "",
381 iommu_group_id(iommu_group));
382 if (IS_ERR(dev)) {
383 vfio_free_group_minor(minor);
384		vfio_group_unlock_and_free(group);
385 return (struct vfio_group *)dev; /* ERR_PTR */
386 }
387
388 group->minor = minor;
389 group->dev = dev;
390
391 list_add(&group->vfio_next, &vfio.group_list);
392
393 mutex_unlock(&vfio.group_lock);
394
395 return group;
396}
397
398/* called with vfio.group_lock held */
399static void vfio_group_release(struct kref *kref)
400{
401 struct vfio_group *group = container_of(kref, struct vfio_group, kref);
402	struct vfio_unbound_dev *unbound, *tmp;
403	struct iommu_group *iommu_group = group->iommu_group;
404
405 WARN_ON(!list_empty(&group->device_list));
406	WARN_ON(group->notifier.head);
407
408 list_for_each_entry_safe(unbound, tmp,
409 &group->unbound_list, unbound_next) {
410 list_del(&unbound->unbound_next);
411 kfree(unbound);
412 }
413
414	device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
415 list_del(&group->vfio_next);
416 vfio_free_group_minor(group->minor);
417	vfio_group_unlock_and_free(group);
418	iommu_group_put(iommu_group);
419}
420
421static void vfio_group_put(struct vfio_group *group)
422{
423	kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
424}
425
426/* Assume group_lock or group reference is held */
427static void vfio_group_get(struct vfio_group *group)
428{
429 kref_get(&group->kref);
430}
431
432/*
433 * Not really a try as we will sleep for mutex, but we need to make
434 * sure the group pointer is valid under lock and get a reference.
435 */
436static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
437{
438 struct vfio_group *target = group;
439
440 mutex_lock(&vfio.group_lock);
441 list_for_each_entry(group, &vfio.group_list, vfio_next) {
442 if (group == target) {
443 vfio_group_get(group);
444 mutex_unlock(&vfio.group_lock);
445 return group;
446 }
447 }
448 mutex_unlock(&vfio.group_lock);
449
450 return NULL;
451}
452
453static
454struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
455{
456 struct vfio_group *group;
457
458 mutex_lock(&vfio.group_lock);
459 list_for_each_entry(group, &vfio.group_list, vfio_next) {
460 if (group->iommu_group == iommu_group) {
461 vfio_group_get(group);
462 mutex_unlock(&vfio.group_lock);
463 return group;
464 }
465 }
466 mutex_unlock(&vfio.group_lock);
467
468 return NULL;
469}
470
471static struct vfio_group *vfio_group_get_from_minor(int minor)
472{
473 struct vfio_group *group;
474
475 mutex_lock(&vfio.group_lock);
476 group = idr_find(&vfio.group_idr, minor);
477 if (!group) {
478 mutex_unlock(&vfio.group_lock);
479 return NULL;
480 }
481 vfio_group_get(group);
482 mutex_unlock(&vfio.group_lock);
483
484 return group;
485}
486
487static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
488{
489 struct iommu_group *iommu_group;
490 struct vfio_group *group;
491
492 iommu_group = iommu_group_get(dev);
493 if (!iommu_group)
494 return NULL;
495
496 group = vfio_group_get_from_iommu(iommu_group);
497 iommu_group_put(iommu_group);
498
499 return group;
500}
501
502/**
503 * Device objects - create, release, get, put, search
504 */
505static
506struct vfio_device *vfio_group_create_device(struct vfio_group *group,
507 struct device *dev,
508 const struct vfio_device_ops *ops,
509 void *device_data)
510{
511 struct vfio_device *device;
512
513 device = kzalloc(sizeof(*device), GFP_KERNEL);
514 if (!device)
515 return ERR_PTR(-ENOMEM);
516
517 kref_init(&device->kref);
518 device->dev = dev;
519 device->group = group;
520 device->ops = ops;
521 device->device_data = device_data;
522	dev_set_drvdata(dev, device);
523
524 /* No need to get group_lock, caller has group reference */
525 vfio_group_get(group);
526
527 mutex_lock(&group->device_lock);
528 list_add(&device->group_next, &group->device_list);
529 mutex_unlock(&group->device_lock);
530
531 return device;
532}
533
534static void vfio_device_release(struct kref *kref)
535{
536 struct vfio_device *device = container_of(kref,
537 struct vfio_device, kref);
538 struct vfio_group *group = device->group;
539
540 list_del(&device->group_next);
541 mutex_unlock(&group->device_lock);
542
543 dev_set_drvdata(device->dev, NULL);
544
545 kfree(device);
546
547 /* vfio_del_group_dev may be waiting for this device */
548 wake_up(&vfio.release_q);
549}
550
551/* Device reference always implies a group reference */
552void vfio_device_put(struct vfio_device *device)
553{
554	struct vfio_group *group = device->group;
555	kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
556	vfio_group_put(group);
557}
558EXPORT_SYMBOL_GPL(vfio_device_put);
559
560static void vfio_device_get(struct vfio_device *device)
561{
562 vfio_group_get(device->group);
563 kref_get(&device->kref);
564}
565
566static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
567 struct device *dev)
568{
569 struct vfio_device *device;
570
571 mutex_lock(&group->device_lock);
572 list_for_each_entry(device, &group->device_list, group_next) {
573 if (device->dev == dev) {
574 vfio_device_get(device);
575 mutex_unlock(&group->device_lock);
576 return device;
577 }
578 }
579 mutex_unlock(&group->device_lock);
580 return NULL;
581}
582
583/*
584 * Some drivers, like pci-stub, are only used to prevent other drivers from
585 * claiming a device and are therefore perfectly legitimate for a user owned
586 * group. The pci-stub driver has no dependencies on DMA or the IOVA mapping
587 * of the device, but it does prevent the user from having direct access to
588 * the device, which is useful in some circumstances.
589 *
590 * We also assume that we can include PCI interconnect devices, ie. bridges.
591 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
592 * then all of the downstream devices will be part of the same IOMMU group as
593 * the bridge. Thus, if placing the bridge into the user owned IOVA space
594 * breaks anything, it only does so for user owned devices downstream. Note
595 * that error notification via MSI can be affected for platforms that handle
596 * MSI within the same IOVA space as DMA.
597 */
598static const char * const vfio_driver_whitelist[] = { "pci-stub" };
599
600static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
601{
602 int i;
603
604 if (dev_is_pci(dev)) {
605 struct pci_dev *pdev = to_pci_dev(dev);
606
607 if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
608 return true;
609 }
610
611 for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) {
612 if (!strcmp(drv->name, vfio_driver_whitelist[i]))
613 return true;
614 }
615
616 return false;
617}
618
619/*
620 * A vfio group is viable for use by userspace if all devices are in
621 * one of the following states:
622 * - driver-less
623 * - bound to a vfio driver
624 * - bound to a whitelisted driver
625 *  - a PCI interconnect device
626 *
627 * We use two methods to determine whether a device is bound to a vfio
628 * driver. The first is to test whether the device exists in the vfio
629 * group. The second is to test if the device exists on the group
630 * unbound_list, indicating it's in the middle of transitioning from
631 * a vfio driver to driver-less.
632 */
633static int vfio_dev_viable(struct device *dev, void *data)
634{
635 struct vfio_group *group = data;
636 struct vfio_device *device;
637	struct device_driver *drv = ACCESS_ONCE(dev->driver);
638 struct vfio_unbound_dev *unbound;
639 int ret = -EINVAL;
640
641 mutex_lock(&group->unbound_lock);
642 list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
643 if (dev == unbound->dev) {
644 ret = 0;
645 break;
646 }
647 }
648 mutex_unlock(&group->unbound_lock);
649
650	if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
651 return 0;
652
653 device = vfio_group_get_device(group, dev);
654 if (device) {
655 vfio_device_put(device);
656 return 0;
657 }
658
659	return ret;
660}
661
662/**
663 * Async device support
664 */
665static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
666{
667 struct vfio_device *device;
668
669 /* Do we already know about it? We shouldn't */
670 device = vfio_group_get_device(group, dev);
671 if (WARN_ON_ONCE(device)) {
672 vfio_device_put(device);
673 return 0;
674 }
675
676 /* Nothing to do for idle groups */
677 if (!atomic_read(&group->container_users))
678 return 0;
679
680 /* TODO Prevent device auto probing */
681	WARN(1, "Device %s added to live group %d!\n", dev_name(dev),
682 iommu_group_id(group->iommu_group));
683
684 return 0;
685}
686
687static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
688{
689 /* We don't care what happens when the group isn't in use */
690 if (!atomic_read(&group->container_users))
691 return 0;
692
693 return vfio_dev_viable(dev, group);
694}
695
696static int vfio_iommu_group_notifier(struct notifier_block *nb,
697 unsigned long action, void *data)
698{
699 struct vfio_group *group = container_of(nb, struct vfio_group, nb);
700 struct device *dev = data;
701	struct vfio_unbound_dev *unbound;
702
703 /*
704 * Need to go through a group_lock lookup to get a reference or we
705 * risk racing a group being removed. Ignore spurious notifies.
706 */
707 group = vfio_group_try_get(group);
708	if (!group)
709 return NOTIFY_OK;
710
711 switch (action) {
712 case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
713 vfio_group_nb_add_dev(group, dev);
714 break;
715 case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
716 /*
717 * Nothing to do here. If the device is in use, then the
718 * vfio sub-driver should block the remove callback until
719 * it is unused. If the device is unused or attached to a
720 * stub driver, then it should be released and we don't
721 * care that it will be going away.
722 */
723 break;
724 case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
725 pr_debug("%s: Device %s, group %d binding to driver\n",
726 __func__, dev_name(dev),
727 iommu_group_id(group->iommu_group));
728 break;
729 case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
730 pr_debug("%s: Device %s, group %d bound to driver %s\n",
731 __func__, dev_name(dev),
732 iommu_group_id(group->iommu_group), dev->driver->name);
733 BUG_ON(vfio_group_nb_verify(group, dev));
734 break;
735 case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
736 pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
737 __func__, dev_name(dev),
738 iommu_group_id(group->iommu_group), dev->driver->name);
739 break;
740 case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
741 pr_debug("%s: Device %s, group %d unbound from driver\n",
742 __func__, dev_name(dev),
743 iommu_group_id(group->iommu_group));
744 /*
745 * XXX An unbound device in a live group is ok, but we'd
746 * really like to avoid the above BUG_ON by preventing other
747 * drivers from binding to it. Once that occurs, we have to
748 * stop the system to maintain isolation. At a minimum, we'd
749 * want a toggle to disable driver auto probe for this device.
750 */
751
752 mutex_lock(&group->unbound_lock);
753 list_for_each_entry(unbound,
754 &group->unbound_list, unbound_next) {
755 if (dev == unbound->dev) {
756 list_del(&unbound->unbound_next);
757 kfree(unbound);
758 break;
759 }
760 }
761 mutex_unlock(&group->unbound_lock);
762 break;
763 }
764
765 vfio_group_put(group);
766 return NOTIFY_OK;
767}
768
769/**
770 * VFIO driver API
771 */
772int vfio_add_group_dev(struct device *dev,
773 const struct vfio_device_ops *ops, void *device_data)
774{
775 struct iommu_group *iommu_group;
776 struct vfio_group *group;
777 struct vfio_device *device;
778
779 iommu_group = iommu_group_get(dev);
780 if (!iommu_group)
781 return -EINVAL;
782
783 group = vfio_group_get_from_iommu(iommu_group);
784 if (!group) {
785		group = vfio_create_group(iommu_group);
786 if (IS_ERR(group)) {
787 iommu_group_put(iommu_group);
788 return PTR_ERR(group);
789 }
790 } else {
791 /*
792 * A found vfio_group already holds a reference to the
793 * iommu_group. A created vfio_group keeps the reference.
794 */
795 iommu_group_put(iommu_group);
796 }
797
798 device = vfio_group_get_device(group, dev);
799 if (device) {
800 WARN(1, "Device %s already exists on group %d\n",
801 dev_name(dev), iommu_group_id(iommu_group));
802 vfio_device_put(device);
803 vfio_group_put(group);
804 return -EBUSY;
805 }
806
807 device = vfio_group_create_device(group, dev, ops, device_data);
808 if (IS_ERR(device)) {
809 vfio_group_put(group);
810 return PTR_ERR(device);
811 }
812
813 /*
814 * Drop all but the vfio_device reference. The vfio_device holds
815 * a reference to the vfio_group, which holds a reference to the
816 * iommu_group.
817 */
818 vfio_group_put(group);
819
820 return 0;
821}
822EXPORT_SYMBOL_GPL(vfio_add_group_dev);
823
824/**
825 * Get a reference to the vfio_device for a device. Even if the
826 * caller thinks they own the device, they could be racing with a
827 * release call path, so we can't trust drvdata for the shortcut.
828 * Go the long way around, from the iommu_group to the vfio_group
829 * to the vfio_device.
830 */
831struct vfio_device *vfio_device_get_from_dev(struct device *dev)
832{
833 struct vfio_group *group;
834 struct vfio_device *device;
835
836	group = vfio_group_get_from_dev(dev);
837 if (!group)
838 return NULL;
839
840 device = vfio_group_get_device(group, dev);
841 vfio_group_put(group);
842
843 return device;
844}
845EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
846
847static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
848 char *buf)
849{
850	struct vfio_device *it, *device = NULL;
851
852 mutex_lock(&group->device_lock);
853 list_for_each_entry(it, &group->device_list, group_next) {
854 if (!strcmp(dev_name(it->dev), buf)) {
855 device = it;
856 vfio_device_get(device);
857 break;
858 }
859 }
860 mutex_unlock(&group->device_lock);
861
862 return device;
863}
864
865/*
866 * Caller must hold a reference to the vfio_device
867 */
868void *vfio_device_data(struct vfio_device *device)
869{
870 return device->device_data;
871}
872EXPORT_SYMBOL_GPL(vfio_device_data);
873
874/* Given a referenced group, check if it contains the device */
875static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
876{
877 struct vfio_device *device;
878
879	device = vfio_group_get_device(group, dev);
880	if (!device)
881		return false;
882
883 vfio_device_put(device);
884 return true;
885}
886
887/*
888 * Decrement the device reference count and wait for the device to be
889 * removed. Open file descriptors for the device... */
890void *vfio_del_group_dev(struct device *dev)
891{
892 struct vfio_device *device = dev_get_drvdata(dev);
893 struct vfio_group *group = device->group;
894	void *device_data = device->device_data;
895	struct vfio_unbound_dev *unbound;
896	unsigned int i = 0;
897 long ret;
898 bool interrupted = false;
899
900 /*
901 * The group exists so long as we have a device reference. Get
902 * a group reference and use it to scan for the device going away.
903 */
904 vfio_group_get(group);
905
906 /*
907 * When the device is removed from the group, the group suddenly
908 * becomes non-viable; the device has a driver (until the unbind
909 * completes), but it's not present in the group. This is bad news
910 * for any external users that need to re-acquire a group reference
911 * in order to match and release their existing reference. To
912 * solve this, we track such devices on the unbound_list to bridge
913 * the gap until they're fully unbound.
914 */
915 unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
916 if (unbound) {
917 unbound->dev = dev;
918 mutex_lock(&group->unbound_lock);
919 list_add(&unbound->unbound_next, &group->unbound_list);
920 mutex_unlock(&group->unbound_lock);
921 }
922 WARN_ON(!unbound);
923
924 vfio_device_put(device);
925
926 /*
927 * If the device is still present in the group after the above
928 * 'put', then it is in use and we need to request it from the
929 * bus driver. The driver may in turn need to request the
930 * device from the user. We send the request on an arbitrary
931 * interval with counter to allow the driver to take escalating
932 * measures to release the device if it has the ability to do so.
933 */
934 do {
935 device = vfio_group_get_device(group, dev);
936 if (!device)
937 break;
938
939 if (device->ops->request)
940 device->ops->request(device_data, i++);
941
942 vfio_device_put(device);
943
944 if (interrupted) {
945 ret = wait_event_timeout(vfio.release_q,
946 !vfio_dev_present(group, dev), HZ * 10);
947 } else {
948 ret = wait_event_interruptible_timeout(vfio.release_q,
949 !vfio_dev_present(group, dev), HZ * 10);
950 if (ret == -ERESTARTSYS) {
951 interrupted = true;
952 dev_warn(dev,
953 "Device is currently in use, task"
954 " \"%s\" (%d) "
955 "blocked until device is released",
956 current->comm, task_pid_nr(current));
957 }
958 }
959 } while (ret <= 0);
960
961 vfio_group_put(group);
962
963 return device_data;
964}
965EXPORT_SYMBOL_GPL(vfio_del_group_dev);
966
967/**
968 * VFIO base fd, /dev/vfio/vfio
969 */
970static long vfio_ioctl_check_extension(struct vfio_container *container,
971 unsigned long arg)
972{
973	struct vfio_iommu_driver *driver;
974 long ret = 0;
975
976 down_read(&container->group_lock);
977
978 driver = container->iommu_driver;
979
980 switch (arg) {
981 /* No base extensions yet */
982 default:
983 /*
984 * If no driver is set, poll all registered drivers for
985 * extensions and return the first positive result. If
986 * a driver is already set, further queries will be passed
987 * only to that driver.
988 */
989 if (!driver) {
990 mutex_lock(&vfio.iommu_drivers_lock);
991 list_for_each_entry(driver, &vfio.iommu_drivers_list,
992 vfio_next) {
993
994#ifdef CONFIG_VFIO_NOIOMMU
995 if (!list_empty(&container->group_list) &&
996 (container->noiommu !=
997 (driver->ops == &vfio_noiommu_ops)))
998 continue;
999#endif
1000
1001 if (!try_module_get(driver->ops->owner))
1002 continue;
1003
1004 ret = driver->ops->ioctl(NULL,
1005 VFIO_CHECK_EXTENSION,
1006 arg);
1007 module_put(driver->ops->owner);
1008 if (ret > 0)
1009 break;
1010 }
1011 mutex_unlock(&vfio.iommu_drivers_lock);
1012 } else
1013 ret = driver->ops->ioctl(container->iommu_data,
1014 VFIO_CHECK_EXTENSION, arg);
1015 }
1016
1017 up_read(&container->group_lock);
1018
1019 return ret;
1020}
1021
1022/* hold write lock on container->group_lock */
1023static int __vfio_container_attach_groups(struct vfio_container *container,
1024 struct vfio_iommu_driver *driver,
1025 void *data)
1026{
1027 struct vfio_group *group;
1028 int ret = -ENODEV;
1029
1030 list_for_each_entry(group, &container->group_list, container_next) {
1031 ret = driver->ops->attach_group(data, group->iommu_group);
1032 if (ret)
1033 goto unwind;
1034 }
1035
1036 return ret;
1037
1038unwind:
1039 list_for_each_entry_continue_reverse(group, &container->group_list,
1040 container_next) {
1041 driver->ops->detach_group(data, group->iommu_group);
1042 }
1043
1044 return ret;
1045}
1046
1047static long vfio_ioctl_set_iommu(struct vfio_container *container,
1048 unsigned long arg)
1049{
1050 struct vfio_iommu_driver *driver;
1051 long ret = -ENODEV;
1052
1053	down_write(&container->group_lock);
1054
1055 /*
1056 * The container is designed to be an unprivileged interface while
1057 * the group can be assigned to specific users. Therefore, only by
1058 * adding a group to a container does the user get the privilege of
1059 * enabling the iommu, which may allocate finite resources. There
1060 * is no unset_iommu, but by removing all the groups from a container,
1061 * the container is deprivileged and returns to an unset state.
1062 */
1063 if (list_empty(&container->group_list) || container->iommu_driver) {
1064		up_write(&container->group_lock);
1065 return -EINVAL;
1066 }
1067
1068 mutex_lock(&vfio.iommu_drivers_lock);
1069	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
1070 void *data;
1071
1072#ifdef CONFIG_VFIO_NOIOMMU
1073 /*
1074 * Only noiommu containers can use vfio-noiommu and noiommu
1075 * containers can only use vfio-noiommu.
1076 */
1077 if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
1078 continue;
1079#endif
1080
1081 if (!try_module_get(driver->ops->owner))
1082 continue;
1083
1084 /*
1085 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
1086 * so test which iommu driver reported support for this
1087 * extension and call open on them. We also pass them the
1088 * magic, allowing a single driver to support multiple
1089 * interfaces if they'd like.
1090 */
1091 if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
1092 module_put(driver->ops->owner);
1093 continue;
1094 }
1095
1096 data = driver->ops->open(arg);
1097 if (IS_ERR(data)) {
1098 ret = PTR_ERR(data);
1099 module_put(driver->ops->owner);
1100			continue;
1101 }
1102
1103 ret = __vfio_container_attach_groups(container, driver, data);
1104		if (ret) {
1105 driver->ops->release(data);
1106 module_put(driver->ops->owner);
1107			continue;
1108 }
1109
1110 container->iommu_driver = driver;
1111 container->iommu_data = data;
1112 break;
1113 }
1114
1115 mutex_unlock(&vfio.iommu_drivers_lock);
1116	up_write(&container->group_lock);
1117
1118 return ret;
1119}
1120
1121static long vfio_fops_unl_ioctl(struct file *filep,
1122 unsigned int cmd, unsigned long arg)
1123{
1124 struct vfio_container *container = filep->private_data;
1125 struct vfio_iommu_driver *driver;
1126 void *data;
1127 long ret = -EINVAL;
1128
1129 if (!container)
1130 return ret;
1131
1132 switch (cmd) {
1133 case VFIO_GET_API_VERSION:
1134 ret = VFIO_API_VERSION;
1135 break;
1136 case VFIO_CHECK_EXTENSION:
1137 ret = vfio_ioctl_check_extension(container, arg);
1138 break;
1139 case VFIO_SET_IOMMU:
1140 ret = vfio_ioctl_set_iommu(container, arg);
1141 break;
1142 default:
1143 down_read(&container->group_lock);
1144
1145 driver = container->iommu_driver;
1146 data = container->iommu_data;
1147
1148 if (driver) /* passthrough all unrecognized ioctls */
1149 ret = driver->ops->ioctl(data, cmd, arg);
1150
1151 up_read(&container->group_lock);
1152 }
1153
1154 return ret;
1155}
1156
1157#ifdef CONFIG_COMPAT
1158static long vfio_fops_compat_ioctl(struct file *filep,
1159 unsigned int cmd, unsigned long arg)
1160{
1161 arg = (unsigned long)compat_ptr(arg);
1162 return vfio_fops_unl_ioctl(filep, cmd, arg);
1163}
1164#endif /* CONFIG_COMPAT */
1165
1166static int vfio_fops_open(struct inode *inode, struct file *filep)
1167{
1168 struct vfio_container *container;
1169
1170 container = kzalloc(sizeof(*container), GFP_KERNEL);
1171 if (!container)
1172 return -ENOMEM;
1173
1174 INIT_LIST_HEAD(&container->group_list);
1175	init_rwsem(&container->group_lock);
1176 kref_init(&container->kref);
1177
1178 filep->private_data = container;
1179
1180 return 0;
1181}
1182
1183static int vfio_fops_release(struct inode *inode, struct file *filep)
1184{
1185 struct vfio_container *container = filep->private_data;
1186
1187 filep->private_data = NULL;
1188
1189 vfio_container_put(container);
1190
1191 return 0;
1192}
1193
1194/*
1195 * Once an iommu driver is set, we optionally pass read/write/mmap
1196 * on to the driver, allowing management interfaces beyond ioctl.
1197 */
1198static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
1199 size_t count, loff_t *ppos)
1200{
1201 struct vfio_container *container = filep->private_data;
1202 struct vfio_iommu_driver *driver;
1203 ssize_t ret = -EINVAL;
1204
1205 down_read(&container->group_lock);
1206
1207 driver = container->iommu_driver;
1208 if (likely(driver && driver->ops->read))
1209 ret = driver->ops->read(container->iommu_data,
1210 buf, count, ppos);
1211
1212 up_read(&container->group_lock);
1213
1214 return ret;
1215}
1216
1217static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
1218 size_t count, loff_t *ppos)
1219{
1220 struct vfio_container *container = filep->private_data;
1221 struct vfio_iommu_driver *driver;
1222 ssize_t ret = -EINVAL;
1223
1224 down_read(&container->group_lock);
1225
1226 driver = container->iommu_driver;
1227 if (likely(driver && driver->ops->write))
1228 ret = driver->ops->write(container->iommu_data,
1229 buf, count, ppos);
1230
1231 up_read(&container->group_lock);
1232
1233	return ret;
1234}
1235
1236static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1237{
1238 struct vfio_container *container = filep->private_data;
1239 struct vfio_iommu_driver *driver;
1240 int ret = -EINVAL;
1241
1242	down_read(&container->group_lock);
1243
1244 driver = container->iommu_driver;
1245 if (likely(driver && driver->ops->mmap))
1246 ret = driver->ops->mmap(container->iommu_data, vma);
1247
1248 up_read(&container->group_lock);
1249
1250 return ret;
1251}
1252
1253static const struct file_operations vfio_fops = {
1254 .owner = THIS_MODULE,
1255 .open = vfio_fops_open,
1256 .release = vfio_fops_release,
1257 .read = vfio_fops_read,
1258 .write = vfio_fops_write,
1259 .unlocked_ioctl = vfio_fops_unl_ioctl,
1260#ifdef CONFIG_COMPAT
1261 .compat_ioctl = vfio_fops_compat_ioctl,
1262#endif
1263 .mmap = vfio_fops_mmap,
1264};
1265
1266/**
1267 * VFIO Group fd, /dev/vfio/$GROUP
1268 */
1269static void __vfio_group_unset_container(struct vfio_group *group)
1270{
1271 struct vfio_container *container = group->container;
1272 struct vfio_iommu_driver *driver;
1273
1274	down_write(&container->group_lock);
1275
1276 driver = container->iommu_driver;
1277 if (driver)
1278 driver->ops->detach_group(container->iommu_data,
1279 group->iommu_group);
1280
1281 group->container = NULL;
1282 list_del(&group->container_next);
1283
1284 /* Detaching the last group deprivileges a container, remove iommu */
1285 if (driver && list_empty(&container->group_list)) {
1286 driver->ops->release(container->iommu_data);
1287 module_put(driver->ops->owner);
1288 container->iommu_driver = NULL;
1289 container->iommu_data = NULL;
1290 }
1291
1292	up_write(&container->group_lock);
1293
1294 vfio_container_put(container);
1295}
1296
1297/*
1298 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1299 * if there was no container to unset. Since the ioctl is called on
1300 * the group, we know that still exists, therefore the only valid
1301 * transition here is 1->0.
1302 */
1303static int vfio_group_unset_container(struct vfio_group *group)
1304{
1305 int users = atomic_cmpxchg(&group->container_users, 1, 0);
1306
1307 if (!users)
1308 return -EINVAL;
1309 if (users != 1)
1310 return -EBUSY;
1311
1312 __vfio_group_unset_container(group);
1313
1314 return 0;
1315}
1316
1317/*
1318 * When removing container users, anything that removes the last user
1319 * implicitly removes the group from the container. That is, if the
1320 * group file descriptor is closed, as well as any device file descriptors,
1321 * the group is free.
1322 */
1323static void vfio_group_try_dissolve_container(struct vfio_group *group)
1324{
1325 if (0 == atomic_dec_if_positive(&group->container_users))
1326 __vfio_group_unset_container(group);
1327}
1328
1329static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1330{
1331	struct fd f;
1332 struct vfio_container *container;
1333 struct vfio_iommu_driver *driver;
1334	int ret = 0;
1335
1336 if (atomic_read(&group->container_users))
1337 return -EINVAL;
1338
1339 if (group->noiommu && !capable(CAP_SYS_RAWIO))
1340 return -EPERM;
1341
1342 f = fdget(container_fd);
1343 if (!f.file)
1344 return -EBADF;
1345
1346 /* Sanity check, is this really our fd? */
1347 if (f.file->f_op != &vfio_fops) {
1348 fdput(f);
1349 return -EINVAL;
1350 }
1351
1352	container = f.file->private_data;
1353 WARN_ON(!container); /* fget ensures we don't race vfio_release */
1354
1355	down_write(&container->group_lock);
1356
1357 /* Real groups and fake groups cannot mix */
1358 if (!list_empty(&container->group_list) &&
1359 container->noiommu != group->noiommu) {
1360 ret = -EPERM;
1361 goto unlock_out;
1362 }
1363
1364 driver = container->iommu_driver;
1365 if (driver) {
1366 ret = driver->ops->attach_group(container->iommu_data,
1367 group->iommu_group);
1368 if (ret)
1369 goto unlock_out;
1370 }
1371
1372 group->container = container;
1373	container->noiommu = group->noiommu;
1374 list_add(&group->container_next, &container->group_list);
1375
1376 /* Get a reference on the container and mark a user within the group */
1377 vfio_container_get(container);
1378 atomic_inc(&group->container_users);
1379
1380unlock_out:
1381	up_write(&container->group_lock);
1382	fdput(f);
1383 return ret;
1384}
1385
1386static bool vfio_group_viable(struct vfio_group *group)
1387{
1388 return (iommu_group_for_each_dev(group->iommu_group,
1389 group, vfio_dev_viable) == 0);
1390}
1391
1392static int vfio_group_add_container_user(struct vfio_group *group)
1393{
1394 if (!atomic_inc_not_zero(&group->container_users))
1395 return -EINVAL;
1396
1397 if (group->noiommu) {
1398 atomic_dec(&group->container_users);
1399 return -EPERM;
1400 }
1401 if (!group->container->iommu_driver || !vfio_group_viable(group)) {
1402 atomic_dec(&group->container_users);
1403 return -EINVAL;
1404 }
1405
1406 return 0;
1407}
1408
1409static const struct file_operations vfio_device_fops;
1410
1411static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1412{
1413 struct vfio_device *device;
1414 struct file *filep;
1415	int ret;
1416
1417 if (0 == atomic_read(&group->container_users) ||
1418 !group->container->iommu_driver || !vfio_group_viable(group))
1419 return -EINVAL;
1420
1421 if (group->noiommu && !capable(CAP_SYS_RAWIO))
1422 return -EPERM;
1423
1424 device = vfio_device_get_from_name(group, buf);
1425 if (!device)
1426 return -ENODEV;
1427
1428 ret = device->ops->open(device->device_data);
1429 if (ret) {
1430 vfio_device_put(device);
1431 return ret;
1432 }
1433
1434 /*
1435 * We can't use anon_inode_getfd() because we need to modify
1436 * the f_mode flags directly to allow more than just ioctls
1437 */
1438 ret = get_unused_fd_flags(O_CLOEXEC);
1439 if (ret < 0) {
1440 device->ops->release(device->device_data);
1441 vfio_device_put(device);
1442 return ret;
1443 }
1444
1445 filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1446 device, O_RDWR);
1447 if (IS_ERR(filep)) {
1448 put_unused_fd(ret);
1449 ret = PTR_ERR(filep);
1450 device->ops->release(device->device_data);
1451 vfio_device_put(device);
1452 return ret;
1453 }
1454
1455 /*
1456 * TODO: add an anon_inode interface to do this.
1457 * Appears to be missing by lack of need rather than
1458 * explicitly prevented. Now there's need.
1459 */
1460 filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1461
1462	atomic_inc(&group->container_users);
1463
1464	fd_install(ret, filep);
1465
1466 if (group->noiommu)
1467 dev_warn(device->dev, "vfio-noiommu device opened by user "
1468 "(%s:%d)\n", current->comm, task_pid_nr(current));
1469
1470 return ret;
1471}
1472
1473static long vfio_group_fops_unl_ioctl(struct file *filep,
1474 unsigned int cmd, unsigned long arg)
1475{
1476 struct vfio_group *group = filep->private_data;
1477 long ret = -ENOTTY;
1478
1479 switch (cmd) {
1480 case VFIO_GROUP_GET_STATUS:
1481 {
1482 struct vfio_group_status status;
1483 unsigned long minsz;
1484
1485 minsz = offsetofend(struct vfio_group_status, flags);
1486
1487 if (copy_from_user(&status, (void __user *)arg, minsz))
1488 return -EFAULT;
1489
1490 if (status.argsz < minsz)
1491 return -EINVAL;
1492
1493 status.flags = 0;
1494
1495 if (vfio_group_viable(group))
1496 status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1497
1498 if (group->container)
1499 status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1500
1501 if (copy_to_user((void __user *)arg, &status, minsz))
1502 return -EFAULT;
1503
1504 ret = 0;
1505 break;
1506 }
1507 case VFIO_GROUP_SET_CONTAINER:
1508 {
1509 int fd;
1510
1511 if (get_user(fd, (int __user *)arg))
1512 return -EFAULT;
1513
1514 if (fd < 0)
1515 return -EINVAL;
1516
1517 ret = vfio_group_set_container(group, fd);
1518 break;
1519 }
1520 case VFIO_GROUP_UNSET_CONTAINER:
1521 ret = vfio_group_unset_container(group);
1522 break;
1523 case VFIO_GROUP_GET_DEVICE_FD:
1524 {
1525 char *buf;
1526
1527 buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1528 if (IS_ERR(buf))
1529 return PTR_ERR(buf);
1530
1531 ret = vfio_group_get_device_fd(group, buf);
1532 kfree(buf);
1533 break;
1534 }
1535 }
1536
1537 return ret;
1538}
1539
1540#ifdef CONFIG_COMPAT
1541static long vfio_group_fops_compat_ioctl(struct file *filep,
1542 unsigned int cmd, unsigned long arg)
1543{
1544 arg = (unsigned long)compat_ptr(arg);
1545 return vfio_group_fops_unl_ioctl(filep, cmd, arg);
1546}
1547#endif /* CONFIG_COMPAT */
1548
1549static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1550{
1551 struct vfio_group *group;
1552	int opened;
1553
1554 group = vfio_group_get_from_minor(iminor(inode));
1555 if (!group)
1556 return -ENODEV;
1557
1558 if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
1559 vfio_group_put(group);
1560 return -EPERM;
1561 }
1562
1563 /* Do we need multiple instances of the group open? Seems not. */
1564 opened = atomic_cmpxchg(&group->opened, 0, 1);
1565 if (opened) {
1566 vfio_group_put(group);
1567 return -EBUSY;
1568 }
1569
1570 /* Is something still in use from a previous open? */
1571	if (group->container) {
1572		atomic_dec(&group->opened);
1573 vfio_group_put(group);
1574 return -EBUSY;
1575 }
1576
1577 /* Warn if previous user didn't cleanup and re-init to drop them */
1578 if (WARN_ON(group->notifier.head))
1579 BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
1580
1581 filep->private_data = group;
1582
1583 return 0;
1584}
1585
1586static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1587{
1588 struct vfio_group *group = filep->private_data;
1589
1590 filep->private_data = NULL;
1591
1592 vfio_group_try_dissolve_container(group);
1593
1594 atomic_dec(&group->opened);
1595
1596 vfio_group_put(group);
1597
1598 return 0;
1599}
1600
1601static const struct file_operations vfio_group_fops = {
1602 .owner = THIS_MODULE,
1603 .unlocked_ioctl = vfio_group_fops_unl_ioctl,
1604#ifdef CONFIG_COMPAT
1605 .compat_ioctl = vfio_group_fops_compat_ioctl,
1606#endif
1607 .open = vfio_group_fops_open,
1608 .release = vfio_group_fops_release,
1609};
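/*
 * Illustrative userspace sketch (not part of this file) of the ioctl flow
 * that the container and group file_operations above implement; the group
 * number and device name are examples only:
 */
#if 0
	int container, group, device;
	struct vfio_group_status status = { .argsz = sizeof(status) };

	container = open("/dev/vfio/vfio", O_RDWR);
	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
		/* unknown API version */;
	if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
		/* this IOMMU model is not supported */;

	group = open("/dev/vfio/26", O_RDWR);
	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
	if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
		/* a device in the group is bound to a non-vfio driver */;

	/* Attach the group to the container, then enable an IOMMU model */
	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);

	/* Finally get a device fd by name, e.g. a PCI device */
	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
#endif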
1610
1611/**
1612 * VFIO Device fd
1613 */
1614static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1615{
1616 struct vfio_device *device = filep->private_data;
1617
1618 device->ops->release(device->device_data);
1619
1620 vfio_group_try_dissolve_container(device->group);
1621
1622 vfio_device_put(device);
1623
1624 return 0;
1625}
1626
1627static long vfio_device_fops_unl_ioctl(struct file *filep,
1628 unsigned int cmd, unsigned long arg)
1629{
1630 struct vfio_device *device = filep->private_data;
1631
1632 if (unlikely(!device->ops->ioctl))
1633 return -EINVAL;
1634
1635 return device->ops->ioctl(device->device_data, cmd, arg);
1636}
1637
1638static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1639 size_t count, loff_t *ppos)
1640{
1641 struct vfio_device *device = filep->private_data;
1642
1643 if (unlikely(!device->ops->read))
1644 return -EINVAL;
1645
1646 return device->ops->read(device->device_data, buf, count, ppos);
1647}
1648
1649static ssize_t vfio_device_fops_write(struct file *filep,
1650 const char __user *buf,
1651 size_t count, loff_t *ppos)
1652{
1653 struct vfio_device *device = filep->private_data;
1654
1655 if (unlikely(!device->ops->write))
1656 return -EINVAL;
1657
1658 return device->ops->write(device->device_data, buf, count, ppos);
1659}
1660
1661static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1662{
1663 struct vfio_device *device = filep->private_data;
1664
1665 if (unlikely(!device->ops->mmap))
1666 return -EINVAL;
1667
1668 return device->ops->mmap(device->device_data, vma);
1669}
1670
1671#ifdef CONFIG_COMPAT
1672static long vfio_device_fops_compat_ioctl(struct file *filep,
1673 unsigned int cmd, unsigned long arg)
1674{
1675 arg = (unsigned long)compat_ptr(arg);
1676 return vfio_device_fops_unl_ioctl(filep, cmd, arg);
1677}
1678#endif /* CONFIG_COMPAT */
1679
1680static const struct file_operations vfio_device_fops = {
1681 .owner = THIS_MODULE,
1682 .release = vfio_device_fops_release,
1683 .read = vfio_device_fops_read,
1684 .write = vfio_device_fops_write,
1685 .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1686#ifdef CONFIG_COMPAT
1687 .compat_ioctl = vfio_device_fops_compat_ioctl,
1688#endif
1689 .mmap = vfio_device_fops_mmap,
1690};
1691
1692/**
1693 * External user API, exported by symbols to be linked dynamically.
1694 *
1695 * The protocol includes:
1696 * 1. do normal VFIO init operation:
1697 * - opening a new container;
1698 * - attaching group(s) to it;
1699 * - setting an IOMMU driver for a container.
1700 * When IOMMU is set for a container, all groups in it are
1701 * considered ready to use by an external user.
1702 *
1703 * 2. User space passes a group fd to an external user.
1704 * The external user calls vfio_group_get_external_user()
1705 * to verify that:
1706 * - the group is initialized;
1707 * - IOMMU is set for it.
1708 * If both checks passed, vfio_group_get_external_user()
1709 * increments the container user counter to prevent
1710 * the VFIO group from disposal before KVM exits.
1711 *
1712 * 3. The external user calls vfio_external_user_iommu_id()
1713 * to know an IOMMU ID.
1714 *
1715 * 4. When the external KVM finishes, it calls
1716 * vfio_group_put_external_user() to release the VFIO group.
1717 * This call decrements the container user counter.
1718 */
1719struct vfio_group *vfio_group_get_external_user(struct file *filep)
1720{
1721 struct vfio_group *group = filep->private_data;
1722	int ret;
1723
1724 if (filep->f_op != &vfio_group_fops)
1725 return ERR_PTR(-EINVAL);
1726
1727 ret = vfio_group_add_container_user(group);
1728 if (ret)
1729 return ERR_PTR(ret);
1730
1731 vfio_group_get(group);
1732
1733 return group;
1734}
1735EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
1736
1737void vfio_group_put_external_user(struct vfio_group *group)
1738{
1739	vfio_group_try_dissolve_container(group);
1740	vfio_group_put(group);
1741}
1742EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1743
1744int vfio_external_user_iommu_id(struct vfio_group *group)
1745{
1746 return iommu_group_id(group->iommu_group);
1747}
1748EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1749
1750long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1751{
1752 return vfio_ioctl_check_extension(group->container, arg);
1753}
1754EXPORT_SYMBOL_GPL(vfio_external_check_extension);
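/*
 * Minimal sketch (illustrative, not part of this file) of the external
 * user protocol described above, from a kernel consumer such as KVM that
 * was handed a group file descriptor by userspace:
 */
#if 0
	struct fd f = fdget(group_fd);
	struct vfio_group *group;
	int iommu_id;

	group = vfio_group_get_external_user(f.file);
	if (IS_ERR(group)) {
		fdput(f);
		return PTR_ERR(group);	/* not a group fd, not viable, or no IOMMU set */
	}

	iommu_id = vfio_external_user_iommu_id(group);
	/* ... hold the group reference for the lifetime of the association ... */

	vfio_group_put_external_user(group);
	fdput(f);
#endif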
1755
1756/**
1757 * Sub-module support
1758 */
1759/*
1760 * Helper for managing a buffer of info chain capabilities, allocate or
1761 * reallocate a buffer with additional @size, filling in @id and @version
1762 * of the capability. A pointer to the new capability is returned.
1763 *
1764 * NB. The chain is based at the head of the buffer, so new entries are
1765 * added to the tail, vfio_info_cap_shift() should be called to fixup the
1766 * next offsets prior to copying to the user buffer.
1767 */
1768struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1769 size_t size, u16 id, u16 version)
1770{
1771 void *buf;
1772 struct vfio_info_cap_header *header, *tmp;
1773
1774 buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1775 if (!buf) {
1776 kfree(caps->buf);
1777 caps->size = 0;
1778 return ERR_PTR(-ENOMEM);
1779 }
1780
1781 caps->buf = buf;
1782 header = buf + caps->size;
1783
1784 /* Eventually copied to user buffer, zero */
1785 memset(header, 0, size);
1786
1787 header->id = id;
1788 header->version = version;
1789
1790 /* Add to the end of the capability chain */
1791	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1792 ; /* nothing */
1793
1794 tmp->next = caps->size;
1795 caps->size += size;
1796
1797 return header;
1798}
1799EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1800
1801void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1802{
1803 struct vfio_info_cap_header *tmp;
1804	void *buf = (void *)caps->buf;
1805
1806	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1807 tmp->next += offset;
1808}
1809EXPORT_SYMBOL(vfio_info_cap_shift);
1810
1811static int sparse_mmap_cap(struct vfio_info_cap *caps, void *cap_type)
1812{
1813 struct vfio_info_cap_header *header;
1814 struct vfio_region_info_cap_sparse_mmap *sparse_cap, *sparse = cap_type;
1815 size_t size;
1816
1817 size = sizeof(*sparse) + sparse->nr_areas * sizeof(*sparse->areas);
1818 header = vfio_info_cap_add(caps, size,
1819 VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1);
1820 if (IS_ERR(header))
1821 return PTR_ERR(header);
1822
1823 sparse_cap = container_of(header,
1824 struct vfio_region_info_cap_sparse_mmap, header);
1825 sparse_cap->nr_areas = sparse->nr_areas;
1826 memcpy(sparse_cap->areas, sparse->areas,
1827 sparse->nr_areas * sizeof(*sparse->areas));
1828 return 0;
1829}
1830
1831static int region_type_cap(struct vfio_info_cap *caps, void *cap_type)
1832{
1833 struct vfio_info_cap_header *header;
1834 struct vfio_region_info_cap_type *type_cap, *cap = cap_type;
1835
1836 header = vfio_info_cap_add(caps, sizeof(*cap),
1837 VFIO_REGION_INFO_CAP_TYPE, 1);
1838 if (IS_ERR(header))
1839 return PTR_ERR(header);
1840
1841 type_cap = container_of(header, struct vfio_region_info_cap_type,
1842 header);
1843 type_cap->type = cap->type;
1844 type_cap->subtype = cap->subtype;
1845 return 0;
1846}
1847
1848int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id,
1849 void *cap_type)
1850{
1851 int ret = -EINVAL;
1852
1853 if (!cap_type)
1854 return 0;
1855
1856 switch (cap_type_id) {
1857 case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
1858 ret = sparse_mmap_cap(caps, cap_type);
1859 break;
1860
1861 case VFIO_REGION_INFO_CAP_TYPE:
1862 ret = region_type_cap(caps, cap_type);
1863 break;
1864 }
1865
1866 return ret;
1867}
1868EXPORT_SYMBOL(vfio_info_add_capability);
1869
1870int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1871 int max_irq_type, size_t *data_size)
1872{
1873 unsigned long minsz;
1874 size_t size;
1875
1876 minsz = offsetofend(struct vfio_irq_set, count);
1877
1878 if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1879 (hdr->count >= (U32_MAX - hdr->start)) ||
1880 (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1881 VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1882 return -EINVAL;
1883
1884 if (data_size)
1885 *data_size = 0;
1886
1887 if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1888 return -EINVAL;
1889
1890 switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1891 case VFIO_IRQ_SET_DATA_NONE:
1892 size = 0;
1893 break;
1894 case VFIO_IRQ_SET_DATA_BOOL:
1895 size = sizeof(uint8_t);
1896 break;
1897 case VFIO_IRQ_SET_DATA_EVENTFD:
1898 size = sizeof(int32_t);
1899 break;
1900 default:
1901 return -EINVAL;
1902 }
1903
1904 if (size) {
1905 if (hdr->argsz - minsz < hdr->count * size)
1906 return -EINVAL;
1907
1908 if (!data_size)
1909 return -EINVAL;
1910
1911 *data_size = hdr->count * size;
1912 }
1913
1914 return 0;
1915}
1916EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
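/*
 * Usage sketch (illustrative; "vdev", foo_get_irq_count() and FOO_NUM_IRQS
 * are hypothetical): a driver's VFIO_DEVICE_SET_IRQS handler validates the
 * user-supplied header first and only then copies in the variable-size
 * payload, if any.
 *
 *	struct vfio_irq_set hdr;
 *	size_t data_size = 0;
 *	u8 *data = NULL;
 *	unsigned long minsz;
 *	int max, ret;
 *
 *	minsz = offsetofend(struct vfio_irq_set, count);
 *	if (copy_from_user(&hdr, (void __user *)arg, minsz))
 *		return -EFAULT;
 *
 *	max = foo_get_irq_count(vdev, hdr.index);
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, max, FOO_NUM_IRQS,
 *						 &data_size);
 *	if (ret)
 *		return ret;
 *
 *	if (data_size) {
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 */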
1917
2169037d
KW
1918/*
1919 * Pin a set of guest PFNs and return their associated host PFNs for the
1920 * local domain only.
1921 * @dev [in] : device
d9d84780 1922 * @user_pfn [in]: array of user/guest PFNs to be pinned.
2169037d
KW
1923 * @npage [in] : count of elements in user_pfn array. This count should not
1924 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1925 * @prot [in] : protection flags
1926 * @phys_pfn[out]: array of host PFNs
1927 * Return error or number of pages pinned.
1928 */
1929int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
1930 int prot, unsigned long *phys_pfn)
1931{
1932 struct vfio_container *container;
1933 struct vfio_group *group;
1934 struct vfio_iommu_driver *driver;
1935 int ret;
1936
1937 if (!dev || !user_pfn || !phys_pfn || !npage)
1938 return -EINVAL;
1939
1940 if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1941 return -E2BIG;
1942
1943 group = vfio_group_get_from_dev(dev);
d256459f
CJ
1944 if (!group)
1945 return -ENODEV;
2169037d
KW
1946
1947 ret = vfio_group_add_container_user(group);
1948 if (ret)
1949 goto err_pin_pages;
1950
1951 container = group->container;
1952 down_read(&container->group_lock);
1953
1954 driver = container->iommu_driver;
1955 if (likely(driver && driver->ops->pin_pages))
1956 ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
1957 npage, prot, phys_pfn);
1958 else
1959 ret = -ENOTTY;
1960
1961 up_read(&container->group_lock);
1962 vfio_group_try_dissolve_container(group);
1963
1964err_pin_pages:
1965 vfio_group_put(group);
1966 return ret;
1967}
1968EXPORT_SYMBOL(vfio_pin_pages);
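/*
 * Usage sketch (illustrative; the mediated device context, "mdev" and "gfn"
 * are assumptions): a vendor driver translating a single guest page before
 * starting DMA might do:
 *
 *	unsigned long user_pfn = gfn, phys_pfn;
 *	int ret;
 *
 *	ret = vfio_pin_pages(mdev_dev(mdev), &user_pfn, 1,
 *			     IOMMU_READ | IOMMU_WRITE, &phys_pfn);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EINVAL;
 *	... program the device with phys_pfn << PAGE_SHIFT ...
 */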
1969
1970/*
1971 * Unpin a set of host PFNs for the local domain only.
1972 * @dev [in] : device
1973 * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
1974 * PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1975 * @npage [in] : count of elements in user_pfn array. This count should not
1976 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1977 * Return error or number of pages unpinned.
1978 */
1979int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
1980{
1981 struct vfio_container *container;
1982 struct vfio_group *group;
1983 struct vfio_iommu_driver *driver;
1984 int ret;
1985
1986 if (!dev || !user_pfn || !npage)
1987 return -EINVAL;
1988
1989 if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1990 return -E2BIG;
1991
1992 group = vfio_group_get_from_dev(dev);
d256459f
CJ
1993 if (!group)
1994 return -ENODEV;
2169037d
KW
1995
1996 ret = vfio_group_add_container_user(group);
1997 if (ret)
1998 goto err_unpin_pages;
1999
2000 container = group->container;
2001 down_read(&container->group_lock);
2002
2003 driver = container->iommu_driver;
2004 if (likely(driver && driver->ops->unpin_pages))
2005 ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
2006 npage);
2007 else
2008 ret = -ENOTTY;
2009
2010 up_read(&container->group_lock);
2011 vfio_group_try_dissolve_container(group);
2012
2013err_unpin_pages:
2014 vfio_group_put(group);
2015 return ret;
2016}
2017EXPORT_SYMBOL(vfio_unpin_pages);
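/*
 * Usage sketch (continuing the hypothetical pinning example above): once
 * the device is done with the page, the same guest PFN is released again.
 *
 *	ret = vfio_unpin_pages(mdev_dev(mdev), &user_pfn, 1);
 *	if (ret != 1)
 *		dev_warn(mdev_dev(mdev), "failed to unpin page\n");
 */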
2018
22195cbd
JS
2019static int vfio_register_iommu_notifier(struct vfio_group *group,
2020 unsigned long *events,
2021 struct notifier_block *nb)
c086de81
KW
2022{
2023 struct vfio_container *container;
c086de81
KW
2024 struct vfio_iommu_driver *driver;
2025 int ret;
2026
c086de81
KW
2027 ret = vfio_group_add_container_user(group);
2028 if (ret)
22195cbd 2029 return -EINVAL;
c086de81
KW
2030
2031 container = group->container;
2032 down_read(&container->group_lock);
2033
2034 driver = container->iommu_driver;
2035 if (likely(driver && driver->ops->register_notifier))
22195cbd
JS
2036 ret = driver->ops->register_notifier(container->iommu_data,
2037 events, nb);
c086de81
KW
2038 else
2039 ret = -ENOTTY;
2040
2041 up_read(&container->group_lock);
2042 vfio_group_try_dissolve_container(group);
2043
c086de81
KW
2044 return ret;
2045}
c086de81 2046
22195cbd
JS
2047static int vfio_unregister_iommu_notifier(struct vfio_group *group,
2048 struct notifier_block *nb)
c086de81
KW
2049{
2050 struct vfio_container *container;
c086de81
KW
2051 struct vfio_iommu_driver *driver;
2052 int ret;
2053
c086de81
KW
2054 ret = vfio_group_add_container_user(group);
2055 if (ret)
22195cbd 2056 return -EINVAL;
c086de81
KW
2057
2058 container = group->container;
2059 down_read(&container->group_lock);
2060
2061 driver = container->iommu_driver;
2062 if (likely(driver && driver->ops->unregister_notifier))
2063 ret = driver->ops->unregister_notifier(container->iommu_data,
2064 nb);
2065 else
2066 ret = -ENOTTY;
2067
2068 up_read(&container->group_lock);
2069 vfio_group_try_dissolve_container(group);
2070
22195cbd
JS
2071 return ret;
2072}
2073
ccd46dba
JS
2074void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
2075{
2076 group->kvm = kvm;
2077 blocking_notifier_call_chain(&group->notifier,
2078 VFIO_GROUP_NOTIFY_SET_KVM, kvm);
2079}
2080EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
2081
2082static int vfio_register_group_notifier(struct vfio_group *group,
2083 unsigned long *events,
2084 struct notifier_block *nb)
2085{
2086 struct vfio_container *container;
2087 int ret;
2088 bool set_kvm = false;
2089
2090 if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
2091 set_kvm = true;
2092
2093 /* clear known events */
2094 *events &= ~VFIO_GROUP_NOTIFY_SET_KVM;
2095
2096 /* refuse to continue if any unknown events remain */
2097 if (*events)
2098 return -EINVAL;
2099
2100 ret = vfio_group_add_container_user(group);
2101 if (ret)
2102 return -EINVAL;
2103
2104 container = group->container;
2105 down_read(&container->group_lock);
2106
2107 ret = blocking_notifier_chain_register(&group->notifier, nb);
2108
2109 /*
2110 * The attachment of kvm to the vfio_group may already have happened,
2111 * so replay the event here once upon registration.
2112 */
2113 if (!ret && set_kvm && group->kvm)
2114 blocking_notifier_call_chain(&group->notifier,
2115 VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);
2116
2117 up_read(&container->group_lock);
2118 vfio_group_try_dissolve_container(group);
2119
2120 return ret;
2121}
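/*
 * Consumer-side sketch (illustrative; struct foo_dev and foo_group_notifier()
 * are hypothetical): a vendor driver that needs the KVM handle registers for
 * VFIO_GROUP_NOTIFY_SET_KVM and receives it either via the replay above or
 * when userspace attaches KVM later.
 *
 *	static int foo_group_notifier(struct notifier_block *nb,
 *				      unsigned long action, void *data)
 *	{
 *		struct foo_dev *foo = container_of(nb, struct foo_dev, group_nb);
 *
 *		if (action == VFIO_GROUP_NOTIFY_SET_KVM)
 *			foo->kvm = data;
 *		return NOTIFY_OK;
 *	}
 *
 *	unsigned long events = VFIO_GROUP_NOTIFY_SET_KVM;
 *
 *	foo->group_nb.notifier_call = foo_group_notifier;
 *	ret = vfio_register_notifier(dev, VFIO_GROUP_NOTIFY, &events,
 *				     &foo->group_nb);
 */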
2122
2123static int vfio_unregister_group_notifier(struct vfio_group *group,
2124 struct notifier_block *nb)
2125{
2126 struct vfio_container *container;
2127 int ret;
2128
2129 ret = vfio_group_add_container_user(group);
2130 if (ret)
2131 return -EINVAL;
2132
2133 container = group->container;
2134 down_read(&container->group_lock);
2135
2136 ret = blocking_notifier_chain_unregister(&group->notifier, nb);
2137
2138 up_read(&container->group_lock);
2139 vfio_group_try_dissolve_container(group);
2140
2141 return ret;
2142}
2143
22195cbd
JS
2144int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
2145 unsigned long *events, struct notifier_block *nb)
2146{
2147 struct vfio_group *group;
2148 int ret;
2149
2150 if (!dev || !nb || !events || (*events == 0))
2151 return -EINVAL;
2152
2153 group = vfio_group_get_from_dev(dev);
2154 if (!group)
2155 return -ENODEV;
2156
2157 switch (type) {
2158 case VFIO_IOMMU_NOTIFY:
2159 ret = vfio_register_iommu_notifier(group, events, nb);
2160 break;
ccd46dba
JS
2161 case VFIO_GROUP_NOTIFY:
2162 ret = vfio_register_group_notifier(group, events, nb);
2163 break;
22195cbd
JS
2164 default:
2165 ret = -EINVAL;
2166 }
2167
2168 vfio_group_put(group);
2169 return ret;
2170}
2171EXPORT_SYMBOL(vfio_register_notifier);
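/*
 * Usage sketch (illustrative; "vdev" and foo_iommu_notifier() are
 * hypothetical): a driver that pins pages normally also registers for DMA
 * unmap events so it can drop its pins when the user unmaps the IOVA range,
 * and later tears the notifier down again with
 * vfio_unregister_notifier(dev, VFIO_IOMMU_NOTIFY, &vdev->iommu_nb).
 *
 *	unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
 *
 *	vdev->iommu_nb.notifier_call = foo_iommu_notifier;
 *	ret = vfio_register_notifier(dev, VFIO_IOMMU_NOTIFY, &events,
 *				     &vdev->iommu_nb);
 */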
2172
2173int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
2174 struct notifier_block *nb)
2175{
2176 struct vfio_group *group;
2177 int ret;
2178
2179 if (!dev || !nb)
2180 return -EINVAL;
2181
2182 group = vfio_group_get_from_dev(dev);
2183 if (!group)
2184 return -ENODEV;
2185
2186 switch (type) {
2187 case VFIO_IOMMU_NOTIFY:
2188 ret = vfio_unregister_iommu_notifier(group, nb);
2189 break;
ccd46dba
JS
2190 case VFIO_GROUP_NOTIFY:
2191 ret = vfio_unregister_group_notifier(group, nb);
2192 break;
22195cbd
JS
2193 default:
2194 ret = -EINVAL;
2195 }
2196
c086de81
KW
2197 vfio_group_put(group);
2198 return ret;
2199}
2200EXPORT_SYMBOL(vfio_unregister_notifier);
2201
cba3345c
AW
2202/**
2203 * Module/class support
2204 */
2205static char *vfio_devnode(struct device *dev, umode_t *mode)
2206{
2207 return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2208}
2209
d1099901
AW
2210static struct miscdevice vfio_dev = {
2211 .minor = VFIO_MINOR,
2212 .name = "vfio",
2213 .fops = &vfio_fops,
2214 .nodename = "vfio/vfio",
2215 .mode = S_IRUGO | S_IWUGO,
2216};
2217
cba3345c
AW
2218static int __init vfio_init(void)
2219{
2220 int ret;
2221
2222 idr_init(&vfio.group_idr);
2223 mutex_init(&vfio.group_lock);
2224 mutex_init(&vfio.iommu_drivers_lock);
2225 INIT_LIST_HEAD(&vfio.group_list);
2226 INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2227 init_waitqueue_head(&vfio.release_q);
2228
d1099901
AW
2229 ret = misc_register(&vfio_dev);
2230 if (ret) {
2231 pr_err("vfio: misc device register failed\n");
2232 return ret;
2233 }
2234
2235 /* /dev/vfio/$GROUP */
cba3345c
AW
2236 vfio.class = class_create(THIS_MODULE, "vfio");
2237 if (IS_ERR(vfio.class)) {
2238 ret = PTR_ERR(vfio.class);
2239 goto err_class;
2240 }
2241
2242 vfio.class->devnode = vfio_devnode;
2243
d1099901 2244 ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK, "vfio");
cba3345c 2245 if (ret)
d1099901 2246 goto err_alloc_chrdev;
cba3345c 2247
cba3345c 2248 cdev_init(&vfio.group_cdev, &vfio_group_fops);
d1099901 2249 ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK);
cba3345c 2250 if (ret)
d1099901 2251 goto err_cdev_add;
cba3345c
AW
2252
2253 pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2254
03a76b60
AW
2255#ifdef CONFIG_VFIO_NOIOMMU
2256 vfio_register_iommu_driver(&vfio_noiommu_ops);
2257#endif
cba3345c
AW
2258 return 0;
2259
d1099901
AW
2260err_cdev_add:
2261 unregister_chrdev_region(vfio.group_devt, MINORMASK);
2262err_alloc_chrdev:
cba3345c
AW
2263 class_destroy(vfio.class);
2264 vfio.class = NULL;
2265err_class:
d1099901 2266 misc_deregister(&vfio_dev);
cba3345c
AW
2267 return ret;
2268}
2269
2270static void __exit vfio_cleanup(void)
2271{
2272 WARN_ON(!list_empty(&vfio.group_list));
2273
03a76b60
AW
2274#ifdef CONFIG_VFIO_NOIOMMU
2275 vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2276#endif
cba3345c
AW
2277 idr_destroy(&vfio.group_idr);
2278 cdev_del(&vfio.group_cdev);
d1099901 2279 unregister_chrdev_region(vfio.group_devt, MINORMASK);
cba3345c
AW
2280 class_destroy(vfio.class);
2281 vfio.class = NULL;
d1099901 2282 misc_deregister(&vfio_dev);
cba3345c
AW
2283}
2284
2285module_init(vfio_init);
2286module_exit(vfio_cleanup);
2287
2288MODULE_VERSION(DRIVER_VERSION);
2289MODULE_LICENSE("GPL v2");
2290MODULE_AUTHOR(DRIVER_AUTHOR);
2291MODULE_DESCRIPTION(DRIVER_DESC);
d1099901
AW
2292MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2293MODULE_ALIAS("devname:vfio/vfio");
0ca582fd 2294MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");