mirror_ubuntu-bionic-kernel.git: drivers/vfio/vfio.c
1 /*
2 * VFIO core
3 *
4 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
5 * Author: Alex Williamson <alex.williamson@redhat.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * Derived from original vfio:
12 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
13 * Author: Tom Lyon, pugs@cisco.com
14 */
15
16 #include <linux/cdev.h>
17 #include <linux/compat.h>
18 #include <linux/device.h>
19 #include <linux/file.h>
20 #include <linux/anon_inodes.h>
21 #include <linux/fs.h>
22 #include <linux/idr.h>
23 #include <linux/iommu.h>
24 #include <linux/list.h>
25 #include <linux/miscdevice.h>
26 #include <linux/module.h>
27 #include <linux/mutex.h>
28 #include <linux/pci.h>
29 #include <linux/rwsem.h>
30 #include <linux/sched.h>
31 #include <linux/slab.h>
32 #include <linux/stat.h>
33 #include <linux/string.h>
34 #include <linux/uaccess.h>
35 #include <linux/vfio.h>
36 #include <linux/wait.h>
37
38 #define DRIVER_VERSION "0.3"
39 #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
40 #define DRIVER_DESC "VFIO - User Level meta-driver"
41
42 static struct vfio {
43 struct class *class;
44 struct list_head iommu_drivers_list;
45 struct mutex iommu_drivers_lock;
46 struct list_head group_list;
47 struct idr group_idr;
48 struct mutex group_lock;
49 struct cdev group_cdev;
50 dev_t group_devt;
51 wait_queue_head_t release_q;
52 } vfio;
53
54 struct vfio_iommu_driver {
55 const struct vfio_iommu_driver_ops *ops;
56 struct list_head vfio_next;
57 };
58
59 struct vfio_container {
60 struct kref kref;
61 struct list_head group_list;
62 struct rw_semaphore group_lock;
63 struct vfio_iommu_driver *iommu_driver;
64 void *iommu_data;
65 bool noiommu;
66 };
67
68 struct vfio_unbound_dev {
69 struct device *dev;
70 struct list_head unbound_next;
71 };
72
73 struct vfio_group {
74 struct kref kref;
75 int minor;
76 atomic_t container_users;
77 struct iommu_group *iommu_group;
78 struct vfio_container *container;
79 struct list_head device_list;
80 struct mutex device_lock;
81 struct device *dev;
82 struct notifier_block nb;
83 struct list_head vfio_next;
84 struct list_head container_next;
85 struct list_head unbound_list;
86 struct mutex unbound_lock;
87 atomic_t opened;
88 wait_queue_head_t container_q;
89 bool noiommu;
90 struct kvm *kvm;
91 struct blocking_notifier_head notifier;
92 };
93
94 struct vfio_device {
95 struct kref kref;
96 struct device *dev;
97 const struct vfio_device_ops *ops;
98 struct vfio_group *group;
99 struct list_head group_next;
100 void *device_data;
101 };
102
103 #ifdef CONFIG_VFIO_NOIOMMU
104 static bool noiommu __read_mostly;
105 module_param_named(enable_unsafe_noiommu_mode,
106 noiommu, bool, S_IRUGO | S_IWUSR);
107 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)");
108 #endif
109
110 /*
111 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
112 * and remove functions, any use cases other than acquiring the first
113 * reference for the purpose of calling vfio_add_group_dev() or removing
114 * that symmetric reference after vfio_del_group_dev() should use the raw
115 * iommu_group_{get,put} functions. In particular, vfio_iommu_group_put()
116 * removes the device from the dummy group and cannot be nested.
117 */
118 struct iommu_group *vfio_iommu_group_get(struct device *dev)
119 {
120 struct iommu_group *group;
121 int __maybe_unused ret;
122
123 group = iommu_group_get(dev);
124
125 #ifdef CONFIG_VFIO_NOIOMMU
126 /*
127 * With noiommu enabled, an IOMMU group will be created for a device
128 * that doesn't already have one and doesn't have an iommu_ops on its
129 * bus. We set iommudata simply to be able to identify these groups
130 * as special use and for reclamation later.
131 */
132 if (group || !noiommu || iommu_present(dev->bus))
133 return group;
134
135 group = iommu_group_alloc();
136 if (IS_ERR(group))
137 return NULL;
138
139 iommu_group_set_name(group, "vfio-noiommu");
140 iommu_group_set_iommudata(group, &noiommu, NULL);
141 ret = iommu_group_add_device(group, dev);
142 if (ret) {
143 iommu_group_put(group);
144 return NULL;
145 }
146
147 /*
148 * Where to taint? At this point we've added an IOMMU group for a
149 * device that is not backed by iommu_ops, therefore any iommu_
150 * callback using iommu_ops can legitimately Oops. So, while we may
151 * be about to give a DMA capable device to a user without IOMMU
152 * protection, which is clearly taint-worthy, let's go ahead and do
153 * it here.
154 */
155 add_taint(TAINT_USER, LOCKDEP_STILL_OK);
156 dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
157 #endif
158
159 return group;
160 }
161 EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
162
163 void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
164 {
165 #ifdef CONFIG_VFIO_NOIOMMU
166 if (iommu_group_get_iommudata(group) == &noiommu)
167 iommu_group_remove_device(dev);
168 #endif
169
170 iommu_group_put(group);
171 }
172 EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
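/*
 * Illustrative sketch (not part of upstream vfio.c; my_probe, my_remove,
 * my_vfio_ops and struct my_device are hypothetical): how a VFIO bus
 * driver's probe/remove callbacks pair vfio_iommu_group_{get,put}() with
 * vfio_add_group_dev()/vfio_del_group_dev(), as the comment above describes.
 *
 *	static int my_probe(struct device *dev)
 *	{
 *		struct iommu_group *group;
 *		struct my_device *mydev;
 *		int ret;
 *
 *		group = vfio_iommu_group_get(dev);
 *		if (!group)
 *			return -EINVAL;
 *
 *		mydev = kzalloc(sizeof(*mydev), GFP_KERNEL);
 *		if (!mydev) {
 *			vfio_iommu_group_put(group, dev);
 *			return -ENOMEM;
 *		}
 *
 *		ret = vfio_add_group_dev(dev, &my_vfio_ops, mydev);
 *		if (ret) {
 *			kfree(mydev);
 *			vfio_iommu_group_put(group, dev);
 *		}
 *		return ret;
 *	}
 *
 *	static void my_remove(struct device *dev)
 *	{
 *		struct my_device *mydev = vfio_del_group_dev(dev);
 *
 *		kfree(mydev);
 *		vfio_iommu_group_put(dev->iommu_group, dev);
 *	}
 */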
173
174 #ifdef CONFIG_VFIO_NOIOMMU
175 static void *vfio_noiommu_open(unsigned long arg)
176 {
177 if (arg != VFIO_NOIOMMU_IOMMU)
178 return ERR_PTR(-EINVAL);
179 if (!capable(CAP_SYS_RAWIO))
180 return ERR_PTR(-EPERM);
181
182 return NULL;
183 }
184
185 static void vfio_noiommu_release(void *iommu_data)
186 {
187 }
188
189 static long vfio_noiommu_ioctl(void *iommu_data,
190 unsigned int cmd, unsigned long arg)
191 {
192 if (cmd == VFIO_CHECK_EXTENSION)
193 return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
194
195 return -ENOTTY;
196 }
197
198 static int vfio_noiommu_attach_group(void *iommu_data,
199 struct iommu_group *iommu_group)
200 {
201 return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
202 }
203
204 static void vfio_noiommu_detach_group(void *iommu_data,
205 struct iommu_group *iommu_group)
206 {
207 }
208
209 static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
210 .name = "vfio-noiommu",
211 .owner = THIS_MODULE,
212 .open = vfio_noiommu_open,
213 .release = vfio_noiommu_release,
214 .ioctl = vfio_noiommu_ioctl,
215 .attach_group = vfio_noiommu_attach_group,
216 .detach_group = vfio_noiommu_detach_group,
217 };
218 #endif
219
220
221 /**
222 * IOMMU driver registration
223 */
224 int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
225 {
226 struct vfio_iommu_driver *driver, *tmp;
227
228 driver = kzalloc(sizeof(*driver), GFP_KERNEL);
229 if (!driver)
230 return -ENOMEM;
231
232 driver->ops = ops;
233
234 mutex_lock(&vfio.iommu_drivers_lock);
235
236 /* Check for duplicates */
237 list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
238 if (tmp->ops == ops) {
239 mutex_unlock(&vfio.iommu_drivers_lock);
240 kfree(driver);
241 return -EINVAL;
242 }
243 }
244
245 list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
246
247 mutex_unlock(&vfio.iommu_drivers_lock);
248
249 return 0;
250 }
251 EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
252
253 void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
254 {
255 struct vfio_iommu_driver *driver;
256
257 mutex_lock(&vfio.iommu_drivers_lock);
258 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
259 if (driver->ops == ops) {
260 list_del(&driver->vfio_next);
261 mutex_unlock(&vfio.iommu_drivers_lock);
262 kfree(driver);
263 return;
264 }
265 }
266 mutex_unlock(&vfio.iommu_drivers_lock);
267 }
268 EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
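/*
 * Illustrative sketch (hypothetical names; the in-tree type1 and spapr
 * backends follow the same pattern): an IOMMU backend registers a constant
 * vfio_iommu_driver_ops table at module init and removes it on exit.
 *
 *	static const struct vfio_iommu_driver_ops my_iommu_driver_ops = {
 *		.name		= "vfio-my-iommu",
 *		.owner		= THIS_MODULE,
 *		.open		= my_iommu_open,
 *		.release	= my_iommu_release,
 *		.ioctl		= my_iommu_ioctl,
 *		.attach_group	= my_iommu_attach_group,
 *		.detach_group	= my_iommu_detach_group,
 *	};
 *
 *	static int __init my_iommu_init(void)
 *	{
 *		return vfio_register_iommu_driver(&my_iommu_driver_ops);
 *	}
 *
 *	static void __exit my_iommu_exit(void)
 *	{
 *		vfio_unregister_iommu_driver(&my_iommu_driver_ops);
 *	}
 */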
269
270 /**
271 * Group minor allocation/free - both called with vfio.group_lock held
272 */
273 static int vfio_alloc_group_minor(struct vfio_group *group)
274 {
275 return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
276 }
277
278 static void vfio_free_group_minor(int minor)
279 {
280 idr_remove(&vfio.group_idr, minor);
281 }
282
283 static int vfio_iommu_group_notifier(struct notifier_block *nb,
284 unsigned long action, void *data);
285 static void vfio_group_get(struct vfio_group *group);
286
287 /**
288 * Container objects - containers are created when /dev/vfio/vfio is
289 * opened, but their lifecycle extends until the last user is done, so
290 * it's freed via kref. Must support container/group/device being
291 * closed in any order.
292 */
293 static void vfio_container_get(struct vfio_container *container)
294 {
295 kref_get(&container->kref);
296 }
297
298 static void vfio_container_release(struct kref *kref)
299 {
300 struct vfio_container *container;
301 container = container_of(kref, struct vfio_container, kref);
302
303 kfree(container);
304 }
305
306 static void vfio_container_put(struct vfio_container *container)
307 {
308 kref_put(&container->kref, vfio_container_release);
309 }
310
311 static void vfio_group_unlock_and_free(struct vfio_group *group)
312 {
313 mutex_unlock(&vfio.group_lock);
314 /*
315 * Unregister outside of lock. A spurious callback is harmless now
316 * that the group is no longer in vfio.group_list.
317 */
318 iommu_group_unregister_notifier(group->iommu_group, &group->nb);
319 kfree(group);
320 }
321
322 /**
323 * Group objects - create, release, get, put, search
324 */
325 static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
326 {
327 struct vfio_group *group, *tmp;
328 struct device *dev;
329 int ret, minor;
330
331 group = kzalloc(sizeof(*group), GFP_KERNEL);
332 if (!group)
333 return ERR_PTR(-ENOMEM);
334
335 kref_init(&group->kref);
336 INIT_LIST_HEAD(&group->device_list);
337 mutex_init(&group->device_lock);
338 INIT_LIST_HEAD(&group->unbound_list);
339 mutex_init(&group->unbound_lock);
340 atomic_set(&group->container_users, 0);
341 atomic_set(&group->opened, 0);
342 init_waitqueue_head(&group->container_q);
343 group->iommu_group = iommu_group;
344 #ifdef CONFIG_VFIO_NOIOMMU
345 group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
346 #endif
347 BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
348
349 group->nb.notifier_call = vfio_iommu_group_notifier;
350
351 /*
352 * Blocking notifiers acquire an rwsem around registering and hold
353 * it around the callback. Therefore, we need to register outside of
354 * vfio.group_lock to avoid A-B/B-A contention. Our callback won't
355 * do anything unless it can find the group in vfio.group_list, so
356 * no harm in registering early.
357 */
358 ret = iommu_group_register_notifier(iommu_group, &group->nb);
359 if (ret) {
360 kfree(group);
361 return ERR_PTR(ret);
362 }
363
364 mutex_lock(&vfio.group_lock);
365
366 /* Did we race creating this group? */
367 list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
368 if (tmp->iommu_group == iommu_group) {
369 vfio_group_get(tmp);
370 vfio_group_unlock_and_free(group);
371 return tmp;
372 }
373 }
374
375 minor = vfio_alloc_group_minor(group);
376 if (minor < 0) {
377 vfio_group_unlock_and_free(group);
378 return ERR_PTR(minor);
379 }
380
381 dev = device_create(vfio.class, NULL,
382 MKDEV(MAJOR(vfio.group_devt), minor),
383 group, "%s%d", group->noiommu ? "noiommu-" : "",
384 iommu_group_id(iommu_group));
385 if (IS_ERR(dev)) {
386 vfio_free_group_minor(minor);
387 vfio_group_unlock_and_free(group);
388 return ERR_CAST(dev);
389 }
390
391 group->minor = minor;
392 group->dev = dev;
393
394 list_add(&group->vfio_next, &vfio.group_list);
395
396 mutex_unlock(&vfio.group_lock);
397
398 return group;
399 }
400
401 /* called with vfio.group_lock held */
402 static void vfio_group_release(struct kref *kref)
403 {
404 struct vfio_group *group = container_of(kref, struct vfio_group, kref);
405 struct vfio_unbound_dev *unbound, *tmp;
406 struct iommu_group *iommu_group = group->iommu_group;
407
408 WARN_ON(!list_empty(&group->device_list));
409 WARN_ON(group->notifier.head);
410
411 list_for_each_entry_safe(unbound, tmp,
412 &group->unbound_list, unbound_next) {
413 list_del(&unbound->unbound_next);
414 kfree(unbound);
415 }
416
417 device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
418 list_del(&group->vfio_next);
419 vfio_free_group_minor(group->minor);
420 vfio_group_unlock_and_free(group);
421 iommu_group_put(iommu_group);
422 }
423
424 static void vfio_group_put(struct vfio_group *group)
425 {
426 kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
427 }
428
429 struct vfio_group_put_work {
430 struct work_struct work;
431 struct vfio_group *group;
432 };
433
434 static void vfio_group_put_bg(struct work_struct *work)
435 {
436 struct vfio_group_put_work *do_work;
437
438 do_work = container_of(work, struct vfio_group_put_work, work);
439
440 vfio_group_put(do_work->group);
441 kfree(do_work);
442 }
443
444 static void vfio_group_schedule_put(struct vfio_group *group)
445 {
446 struct vfio_group_put_work *do_work;
447
448 do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
449 if (WARN_ON(!do_work))
450 return;
451
452 INIT_WORK(&do_work->work, vfio_group_put_bg);
453 do_work->group = group;
454 schedule_work(&do_work->work);
455 }
456
457 /* Assume group_lock or group reference is held */
458 static void vfio_group_get(struct vfio_group *group)
459 {
460 kref_get(&group->kref);
461 }
462
463 /*
464 * Not really a try as we will sleep on the mutex, but we need to make
465 * sure the group pointer is valid under lock and get a reference.
466 */
467 static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
468 {
469 struct vfio_group *target = group;
470
471 mutex_lock(&vfio.group_lock);
472 list_for_each_entry(group, &vfio.group_list, vfio_next) {
473 if (group == target) {
474 vfio_group_get(group);
475 mutex_unlock(&vfio.group_lock);
476 return group;
477 }
478 }
479 mutex_unlock(&vfio.group_lock);
480
481 return NULL;
482 }
483
484 static
485 struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
486 {
487 struct vfio_group *group;
488
489 mutex_lock(&vfio.group_lock);
490 list_for_each_entry(group, &vfio.group_list, vfio_next) {
491 if (group->iommu_group == iommu_group) {
492 vfio_group_get(group);
493 mutex_unlock(&vfio.group_lock);
494 return group;
495 }
496 }
497 mutex_unlock(&vfio.group_lock);
498
499 return NULL;
500 }
501
502 static struct vfio_group *vfio_group_get_from_minor(int minor)
503 {
504 struct vfio_group *group;
505
506 mutex_lock(&vfio.group_lock);
507 group = idr_find(&vfio.group_idr, minor);
508 if (!group) {
509 mutex_unlock(&vfio.group_lock);
510 return NULL;
511 }
512 vfio_group_get(group);
513 mutex_unlock(&vfio.group_lock);
514
515 return group;
516 }
517
518 static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
519 {
520 struct iommu_group *iommu_group;
521 struct vfio_group *group;
522
523 iommu_group = iommu_group_get(dev);
524 if (!iommu_group)
525 return NULL;
526
527 group = vfio_group_get_from_iommu(iommu_group);
528 iommu_group_put(iommu_group);
529
530 return group;
531 }
532
533 /**
534 * Device objects - create, release, get, put, search
535 */
536 static
537 struct vfio_device *vfio_group_create_device(struct vfio_group *group,
538 struct device *dev,
539 const struct vfio_device_ops *ops,
540 void *device_data)
541 {
542 struct vfio_device *device;
543
544 device = kzalloc(sizeof(*device), GFP_KERNEL);
545 if (!device)
546 return ERR_PTR(-ENOMEM);
547
548 kref_init(&device->kref);
549 device->dev = dev;
550 device->group = group;
551 device->ops = ops;
552 device->device_data = device_data;
553 dev_set_drvdata(dev, device);
554
555 /* No need to get group_lock, caller has group reference */
556 vfio_group_get(group);
557
558 mutex_lock(&group->device_lock);
559 list_add(&device->group_next, &group->device_list);
560 mutex_unlock(&group->device_lock);
561
562 return device;
563 }
564
565 static void vfio_device_release(struct kref *kref)
566 {
567 struct vfio_device *device = container_of(kref,
568 struct vfio_device, kref);
569 struct vfio_group *group = device->group;
570
571 list_del(&device->group_next);
572 mutex_unlock(&group->device_lock);
573
574 dev_set_drvdata(device->dev, NULL);
575
576 kfree(device);
577
578 /* vfio_del_group_dev may be waiting for this device */
579 wake_up(&vfio.release_q);
580 }
581
582 /* Device reference always implies a group reference */
583 void vfio_device_put(struct vfio_device *device)
584 {
585 struct vfio_group *group = device->group;
586 kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
587 vfio_group_put(group);
588 }
589 EXPORT_SYMBOL_GPL(vfio_device_put);
590
591 static void vfio_device_get(struct vfio_device *device)
592 {
593 vfio_group_get(device->group);
594 kref_get(&device->kref);
595 }
596
597 static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
598 struct device *dev)
599 {
600 struct vfio_device *device;
601
602 mutex_lock(&group->device_lock);
603 list_for_each_entry(device, &group->device_list, group_next) {
604 if (device->dev == dev) {
605 vfio_device_get(device);
606 mutex_unlock(&group->device_lock);
607 return device;
608 }
609 }
610 mutex_unlock(&group->device_lock);
611 return NULL;
612 }
613
614 /*
615 * Some drivers, like pci-stub, are only used to prevent other drivers from
616 * claiming a device and are therefore perfectly legitimate for a user owned
617 * group. The pci-stub driver has no dependencies on DMA or the IOVA mapping
618 * of the device, but it does prevent the user from having direct access to
619 * the device, which is useful in some circumstances.
620 *
621 * We also assume that we can include PCI interconnect devices, ie. bridges.
622 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
623 * then all of the downstream devices will be part of the same IOMMU group as
624 * the bridge. Thus, if placing the bridge into the user owned IOVA space
625 * breaks anything, it only does so for user owned devices downstream. Note
626 * that error notification via MSI can be affected for platforms that handle
627 * MSI within the same IOVA space as DMA.
628 */
629 static const char * const vfio_driver_whitelist[] = { "pci-stub" };
630
631 static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
632 {
633 int i;
634
635 if (dev_is_pci(dev)) {
636 struct pci_dev *pdev = to_pci_dev(dev);
637
638 if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
639 return true;
640 }
641
642 for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) {
643 if (!strcmp(drv->name, vfio_driver_whitelist[i]))
644 return true;
645 }
646
647 return false;
648 }
649
650 /*
651 * A vfio group is viable for use by userspace if all devices are in
652 * one of the following states:
653 * - driver-less
654 * - bound to a vfio driver
655 * - bound to a whitelisted driver
656 * - a PCI interconnect device
657 *
658 * We use two methods to determine whether a device is bound to a vfio
659 * driver. The first is to test whether the device exists in the vfio
660 * group. The second is to test if the device exists on the group
661 * unbound_list, indicating it's in the middle of transitioning from
662 * a vfio driver to driver-less.
663 */
664 static int vfio_dev_viable(struct device *dev, void *data)
665 {
666 struct vfio_group *group = data;
667 struct vfio_device *device;
668 struct device_driver *drv = READ_ONCE(dev->driver);
669 struct vfio_unbound_dev *unbound;
670 int ret = -EINVAL;
671
672 mutex_lock(&group->unbound_lock);
673 list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
674 if (dev == unbound->dev) {
675 ret = 0;
676 break;
677 }
678 }
679 mutex_unlock(&group->unbound_lock);
680
681 if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
682 return 0;
683
684 device = vfio_group_get_device(group, dev);
685 if (device) {
686 vfio_device_put(device);
687 return 0;
688 }
689
690 return ret;
691 }
692
693 /**
694 * Async device support
695 */
696 static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
697 {
698 struct vfio_device *device;
699
700 /* Do we already know about it? We shouldn't */
701 device = vfio_group_get_device(group, dev);
702 if (WARN_ON_ONCE(device)) {
703 vfio_device_put(device);
704 return 0;
705 }
706
707 /* Nothing to do for idle groups */
708 if (!atomic_read(&group->container_users))
709 return 0;
710
711 /* TODO Prevent device auto probing */
712 WARN(1, "Device %s added to live group %d!\n", dev_name(dev),
713 iommu_group_id(group->iommu_group));
714
715 return 0;
716 }
717
718 static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
719 {
720 /* We don't care what happens when the group isn't in use */
721 if (!atomic_read(&group->container_users))
722 return 0;
723
724 return vfio_dev_viable(dev, group);
725 }
726
727 static int vfio_iommu_group_notifier(struct notifier_block *nb,
728 unsigned long action, void *data)
729 {
730 struct vfio_group *group = container_of(nb, struct vfio_group, nb);
731 struct device *dev = data;
732 struct vfio_unbound_dev *unbound;
733
734 /*
735 * Need to go through a group_lock lookup to get a reference or we
736 * risk racing a group being removed. Ignore spurious notifies.
737 */
738 group = vfio_group_try_get(group);
739 if (!group)
740 return NOTIFY_OK;
741
742 switch (action) {
743 case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
744 vfio_group_nb_add_dev(group, dev);
745 break;
746 case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
747 /*
748 * Nothing to do here. If the device is in use, then the
749 * vfio sub-driver should block the remove callback until
750 * it is unused. If the device is unused or attached to a
751 * stub driver, then it should be released and we don't
752 * care that it will be going away.
753 */
754 break;
755 case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
756 pr_debug("%s: Device %s, group %d binding to driver\n",
757 __func__, dev_name(dev),
758 iommu_group_id(group->iommu_group));
759 break;
760 case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
761 pr_debug("%s: Device %s, group %d bound to driver %s\n",
762 __func__, dev_name(dev),
763 iommu_group_id(group->iommu_group), dev->driver->name);
764 BUG_ON(vfio_group_nb_verify(group, dev));
765 break;
766 case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
767 pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
768 __func__, dev_name(dev),
769 iommu_group_id(group->iommu_group), dev->driver->name);
770 break;
771 case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
772 pr_debug("%s: Device %s, group %d unbound from driver\n",
773 __func__, dev_name(dev),
774 iommu_group_id(group->iommu_group));
775 /*
776 * XXX An unbound device in a live group is ok, but we'd
777 * really like to avoid the above BUG_ON by preventing other
778 * drivers from binding to it. Once that occurs, we have to
779 * stop the system to maintain isolation. At a minimum, we'd
780 * want a toggle to disable driver auto probe for this device.
781 */
782
783 mutex_lock(&group->unbound_lock);
784 list_for_each_entry(unbound,
785 &group->unbound_list, unbound_next) {
786 if (dev == unbound->dev) {
787 list_del(&unbound->unbound_next);
788 kfree(unbound);
789 break;
790 }
791 }
792 mutex_unlock(&group->unbound_lock);
793 break;
794 }
795
796 /*
797 * If we're the last reference to the group, the group will be
798 * released, which includes unregistering the iommu group notifier.
799 * We hold a read-lock on that notifier list, unregistering needs
800 * a write-lock... deadlock. Release our reference asynchronously
801 * to avoid that situation.
802 */
803 vfio_group_schedule_put(group);
804 return NOTIFY_OK;
805 }
806
807 /**
808 * VFIO driver API
809 */
810 int vfio_add_group_dev(struct device *dev,
811 const struct vfio_device_ops *ops, void *device_data)
812 {
813 struct iommu_group *iommu_group;
814 struct vfio_group *group;
815 struct vfio_device *device;
816
817 iommu_group = iommu_group_get(dev);
818 if (!iommu_group)
819 return -EINVAL;
820
821 group = vfio_group_get_from_iommu(iommu_group);
822 if (!group) {
823 group = vfio_create_group(iommu_group);
824 if (IS_ERR(group)) {
825 iommu_group_put(iommu_group);
826 return PTR_ERR(group);
827 }
828 } else {
829 /*
830 * A found vfio_group already holds a reference to the
831 * iommu_group. A created vfio_group keeps the reference.
832 */
833 iommu_group_put(iommu_group);
834 }
835
836 device = vfio_group_get_device(group, dev);
837 if (device) {
838 WARN(1, "Device %s already exists on group %d\n",
839 dev_name(dev), iommu_group_id(iommu_group));
840 vfio_device_put(device);
841 vfio_group_put(group);
842 return -EBUSY;
843 }
844
845 device = vfio_group_create_device(group, dev, ops, device_data);
846 if (IS_ERR(device)) {
847 vfio_group_put(group);
848 return PTR_ERR(device);
849 }
850
851 /*
852 * Drop all but the vfio_device reference. The vfio_device holds
853 * a reference to the vfio_group, which holds a reference to the
854 * iommu_group.
855 */
856 vfio_group_put(group);
857
858 return 0;
859 }
860 EXPORT_SYMBOL_GPL(vfio_add_group_dev);
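/*
 * Illustrative sketch (hypothetical names): the vfio_device_ops table a bus
 * driver hands to vfio_add_group_dev() above.  The core forwards the device
 * file descriptor's operations to these callbacks, passing back the
 * device_data pointer registered here.
 *
 *	static const struct vfio_device_ops my_vfio_ops = {
 *		.name		= "vfio-my-bus",
 *		.open		= my_open,	// a device fd is being created
 *		.release	= my_release,	// that fd's final release
 *		.ioctl		= my_ioctl,	// VFIO_DEVICE_* ioctls
 *		.read		= my_read,
 *		.write		= my_write,
 *		.mmap		= my_mmap,
 *		.request	= my_request,	// core asks for the device back
 *	};
 */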
861
862 /**
863 * Get a reference to the vfio_device for a device. Even if the
864 * caller thinks they own the device, they could be racing with a
865 * release call path, so we can't trust drvdata for the shortcut.
866 * Go the long way around, from the iommu_group to the vfio_group
867 * to the vfio_device.
868 */
869 struct vfio_device *vfio_device_get_from_dev(struct device *dev)
870 {
871 struct vfio_group *group;
872 struct vfio_device *device;
873
874 group = vfio_group_get_from_dev(dev);
875 if (!group)
876 return NULL;
877
878 device = vfio_group_get_device(group, dev);
879 vfio_group_put(group);
880
881 return device;
882 }
883 EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
884
885 static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
886 char *buf)
887 {
888 struct vfio_device *it, *device = NULL;
889
890 mutex_lock(&group->device_lock);
891 list_for_each_entry(it, &group->device_list, group_next) {
892 if (!strcmp(dev_name(it->dev), buf)) {
893 device = it;
894 vfio_device_get(device);
895 break;
896 }
897 }
898 mutex_unlock(&group->device_lock);
899
900 return device;
901 }
902
903 /*
904 * Caller must hold a reference to the vfio_device
905 */
906 void *vfio_device_data(struct vfio_device *device)
907 {
908 return device->device_data;
909 }
910 EXPORT_SYMBOL_GPL(vfio_device_data);
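/*
 * Illustrative sketch (hypothetical caller): a kernel user that only holds a
 * struct device can look up the vfio_device, read back the device_data it
 * registered, and drop the reference when finished.  The data is only
 * guaranteed to stay valid while the vfio_device reference is held.
 *
 *	static int my_with_device_data(struct device *dev)
 *	{
 *		struct vfio_device *device = vfio_device_get_from_dev(dev);
 *		struct my_device *mydev;
 *
 *		if (!device)
 *			return -ENODEV;
 *
 *		mydev = vfio_device_data(device);
 *		// ... use mydev while the reference is held ...
 *
 *		vfio_device_put(device);
 *		return 0;
 *	}
 */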
911
912 /* Given a referenced group, check if it contains the device */
913 static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
914 {
915 struct vfio_device *device;
916
917 device = vfio_group_get_device(group, dev);
918 if (!device)
919 return false;
920
921 vfio_device_put(device);
922 return true;
923 }
924
925 /*
926 * Decrement the device reference count and wait for the device to be
927 * removed. Open file descriptors for the device... */
928 void *vfio_del_group_dev(struct device *dev)
929 {
930 struct vfio_device *device = dev_get_drvdata(dev);
931 struct vfio_group *group = device->group;
932 void *device_data = device->device_data;
933 struct vfio_unbound_dev *unbound;
934 unsigned int i = 0;
935 long ret;
936 bool interrupted = false;
937 bool locked = true;
938 struct device_driver *drv;
939
940 drv = dev->driver;
941
942 /*
943 * The group exists so long as we have a device reference. Get
944 * a group reference and use it to scan for the device going away.
945 */
946 vfio_group_get(group);
947
948 /*
949 * When the device is removed from the group, the group suddenly
950 * becomes non-viable; the device has a driver (until the unbind
951 * completes), but it's not present in the group. This is bad news
952 * for any external users that need to re-acquire a group reference
953 * in order to match and release their existing reference. To
954 * solve this, we track such devices on the unbound_list to bridge
955 * the gap until they're fully unbound.
956 */
957 unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
958 if (unbound) {
959 unbound->dev = dev;
960 mutex_lock(&group->unbound_lock);
961 list_add(&unbound->unbound_next, &group->unbound_list);
962 mutex_unlock(&group->unbound_lock);
963 }
964 WARN_ON(!unbound);
965
966 vfio_device_put(device);
967
968 /*
969 * If the device is still present in the group after the above
970 * 'put', then it is in use and we need to request it from the
971 * bus driver. The driver may in turn need to request the
972 * device from the user. We send the request on an arbitrary
973 * interval with counter to allow the driver to take escalating
974 * measures to release the device if it has the ability to do so.
975 */
976 do {
977 device = vfio_group_get_device(group, dev);
978 if (!device)
979 break;
980
981 if (device->ops->request) {
982 device_unlock(dev);
983 locked = false;
984 device->ops->request(device_data, i++);
985 }
986
987 vfio_device_put(device);
988
989 if (interrupted) {
990 ret = wait_event_timeout(vfio.release_q,
991 !vfio_dev_present(group, dev), HZ * 10);
992 } else {
993 ret = wait_event_interruptible_timeout(vfio.release_q,
994 !vfio_dev_present(group, dev), HZ * 10);
995 if (ret == -ERESTARTSYS) {
996 interrupted = true;
997 dev_warn(dev,
998 "Device is currently in use, task"
999 " \"%s\" (%d) "
1000 "blocked until device is released",
1001 current->comm, task_pid_nr(current));
1002 }
1003 }
1004
1005 if (!locked) {
1006 device_lock(dev);
1007 locked = true;
1008 /*
1009 * A concurrent operation may have released the driver
1010 * successfully while we had dropped the lock,
1011 * check for that.
1012 */
1013 if (dev->driver != drv) {
1014 vfio_group_put(group);
1015 return NULL;
1016 }
1017 }
1018 } while (ret <= 0);
1019
1020 /*
1021 * In order to support multiple devices per group, devices can be
1022 * plucked from the group while other devices in the group are still
1023 * in use. The container persists with this group and those remaining
1024 * devices still attached. If the user creates an isolation violation
1025 * by binding this device to another driver while the group is still in
1026 * use, that's their fault. However, in the case of removing the last,
1027 * or potentially the only, device in the group there can be no other
1028 * in-use devices in the group. The user has done their due diligence
1029 * and we should lay no claims to those devices. In order to do that,
1030 * we need to make sure the group is detached from the container.
1031 * Without this stall, we're potentially racing with a user process
1032 * that may attempt to immediately bind this device to another driver.
1033 */
1034 if (list_empty(&group->device_list))
1035 wait_event(group->container_q, !group->container);
1036
1037 vfio_group_put(group);
1038
1039 return device_data;
1040 }
1041 EXPORT_SYMBOL_GPL(vfio_del_group_dev);
1042
1043 /**
1044 * VFIO base fd, /dev/vfio/vfio
1045 */
1046 static long vfio_ioctl_check_extension(struct vfio_container *container,
1047 unsigned long arg)
1048 {
1049 struct vfio_iommu_driver *driver;
1050 long ret = 0;
1051
1052 down_read(&container->group_lock);
1053
1054 driver = container->iommu_driver;
1055
1056 switch (arg) {
1057 /* No base extensions yet */
1058 default:
1059 /*
1060 * If no driver is set, poll all registered drivers for
1061 * extensions and return the first positive result. If
1062 * a driver is already set, further queries will be passed
1063 * only to that driver.
1064 */
1065 if (!driver) {
1066 mutex_lock(&vfio.iommu_drivers_lock);
1067 list_for_each_entry(driver, &vfio.iommu_drivers_list,
1068 vfio_next) {
1069
1070 #ifdef CONFIG_VFIO_NOIOMMU
1071 if (!list_empty(&container->group_list) &&
1072 (container->noiommu !=
1073 (driver->ops == &vfio_noiommu_ops)))
1074 continue;
1075 #endif
1076
1077 if (!try_module_get(driver->ops->owner))
1078 continue;
1079
1080 ret = driver->ops->ioctl(NULL,
1081 VFIO_CHECK_EXTENSION,
1082 arg);
1083 module_put(driver->ops->owner);
1084 if (ret > 0)
1085 break;
1086 }
1087 mutex_unlock(&vfio.iommu_drivers_lock);
1088 } else
1089 ret = driver->ops->ioctl(container->iommu_data,
1090 VFIO_CHECK_EXTENSION, arg);
1091 }
1092
1093 up_read(&container->group_lock);
1094
1095 return ret;
1096 }
1097
1098 /* hold write lock on container->group_lock */
1099 static int __vfio_container_attach_groups(struct vfio_container *container,
1100 struct vfio_iommu_driver *driver,
1101 void *data)
1102 {
1103 struct vfio_group *group;
1104 int ret = -ENODEV;
1105
1106 list_for_each_entry(group, &container->group_list, container_next) {
1107 ret = driver->ops->attach_group(data, group->iommu_group);
1108 if (ret)
1109 goto unwind;
1110 }
1111
1112 return ret;
1113
1114 unwind:
1115 list_for_each_entry_continue_reverse(group, &container->group_list,
1116 container_next) {
1117 driver->ops->detach_group(data, group->iommu_group);
1118 }
1119
1120 return ret;
1121 }
1122
1123 static long vfio_ioctl_set_iommu(struct vfio_container *container,
1124 unsigned long arg)
1125 {
1126 struct vfio_iommu_driver *driver;
1127 long ret = -ENODEV;
1128
1129 down_write(&container->group_lock);
1130
1131 /*
1132 * The container is designed to be an unprivileged interface while
1133 * the group can be assigned to specific users. Therefore, only by
1134 * adding a group to a container does the user get the privilege of
1135 * enabling the iommu, which may allocate finite resources. There
1136 * is no unset_iommu, but by removing all the groups from a container,
1137 * the container is deprivileged and returns to an unset state.
1138 */
1139 if (list_empty(&container->group_list) || container->iommu_driver) {
1140 up_write(&container->group_lock);
1141 return -EINVAL;
1142 }
1143
1144 mutex_lock(&vfio.iommu_drivers_lock);
1145 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
1146 void *data;
1147
1148 #ifdef CONFIG_VFIO_NOIOMMU
1149 /*
1150 * Only noiommu containers can use vfio-noiommu and noiommu
1151 * containers can only use vfio-noiommu.
1152 */
1153 if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
1154 continue;
1155 #endif
1156
1157 if (!try_module_get(driver->ops->owner))
1158 continue;
1159
1160 /*
1161 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
1162 * so test which iommu driver reported support for this
1163 * extension and call open on them. We also pass them the
1164 * magic, allowing a single driver to support multiple
1165 * interfaces if they'd like.
1166 */
1167 if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
1168 module_put(driver->ops->owner);
1169 continue;
1170 }
1171
1172 data = driver->ops->open(arg);
1173 if (IS_ERR(data)) {
1174 ret = PTR_ERR(data);
1175 module_put(driver->ops->owner);
1176 continue;
1177 }
1178
1179 ret = __vfio_container_attach_groups(container, driver, data);
1180 if (ret) {
1181 driver->ops->release(data);
1182 module_put(driver->ops->owner);
1183 continue;
1184 }
1185
1186 container->iommu_driver = driver;
1187 container->iommu_data = data;
1188 break;
1189 }
1190
1191 mutex_unlock(&vfio.iommu_drivers_lock);
1192 up_write(&container->group_lock);
1193
1194 return ret;
1195 }
1196
1197 static long vfio_fops_unl_ioctl(struct file *filep,
1198 unsigned int cmd, unsigned long arg)
1199 {
1200 struct vfio_container *container = filep->private_data;
1201 struct vfio_iommu_driver *driver;
1202 void *data;
1203 long ret = -EINVAL;
1204
1205 if (!container)
1206 return ret;
1207
1208 switch (cmd) {
1209 case VFIO_GET_API_VERSION:
1210 ret = VFIO_API_VERSION;
1211 break;
1212 case VFIO_CHECK_EXTENSION:
1213 ret = vfio_ioctl_check_extension(container, arg);
1214 break;
1215 case VFIO_SET_IOMMU:
1216 ret = vfio_ioctl_set_iommu(container, arg);
1217 break;
1218 default:
1219 driver = container->iommu_driver;
1220 data = container->iommu_data;
1221
1222 if (driver) /* passthrough all unrecognized ioctls */
1223 ret = driver->ops->ioctl(data, cmd, arg);
1224 }
1225
1226 return ret;
1227 }
1228
1229 #ifdef CONFIG_COMPAT
1230 static long vfio_fops_compat_ioctl(struct file *filep,
1231 unsigned int cmd, unsigned long arg)
1232 {
1233 arg = (unsigned long)compat_ptr(arg);
1234 return vfio_fops_unl_ioctl(filep, cmd, arg);
1235 }
1236 #endif /* CONFIG_COMPAT */
1237
1238 static int vfio_fops_open(struct inode *inode, struct file *filep)
1239 {
1240 struct vfio_container *container;
1241
1242 container = kzalloc(sizeof(*container), GFP_KERNEL);
1243 if (!container)
1244 return -ENOMEM;
1245
1246 INIT_LIST_HEAD(&container->group_list);
1247 init_rwsem(&container->group_lock);
1248 kref_init(&container->kref);
1249
1250 filep->private_data = container;
1251
1252 return 0;
1253 }
1254
1255 static int vfio_fops_release(struct inode *inode, struct file *filep)
1256 {
1257 struct vfio_container *container = filep->private_data;
1258
1259 filep->private_data = NULL;
1260
1261 vfio_container_put(container);
1262
1263 return 0;
1264 }
1265
1266 /*
1267 * Once an iommu driver is set, we optionally pass read/write/mmap
1268 * on to the driver, allowing management interfaces beyond ioctl.
1269 */
1270 static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
1271 size_t count, loff_t *ppos)
1272 {
1273 struct vfio_container *container = filep->private_data;
1274 struct vfio_iommu_driver *driver;
1275 ssize_t ret = -EINVAL;
1276
1277 driver = container->iommu_driver;
1278 if (likely(driver && driver->ops->read))
1279 ret = driver->ops->read(container->iommu_data,
1280 buf, count, ppos);
1281
1282 return ret;
1283 }
1284
1285 static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
1286 size_t count, loff_t *ppos)
1287 {
1288 struct vfio_container *container = filep->private_data;
1289 struct vfio_iommu_driver *driver;
1290 ssize_t ret = -EINVAL;
1291
1292 driver = container->iommu_driver;
1293 if (likely(driver && driver->ops->write))
1294 ret = driver->ops->write(container->iommu_data,
1295 buf, count, ppos);
1296
1297 return ret;
1298 }
1299
1300 static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1301 {
1302 struct vfio_container *container = filep->private_data;
1303 struct vfio_iommu_driver *driver;
1304 int ret = -EINVAL;
1305
1306 driver = container->iommu_driver;
1307 if (likely(driver && driver->ops->mmap))
1308 ret = driver->ops->mmap(container->iommu_data, vma);
1309
1310 return ret;
1311 }
1312
1313 static const struct file_operations vfio_fops = {
1314 .owner = THIS_MODULE,
1315 .open = vfio_fops_open,
1316 .release = vfio_fops_release,
1317 .read = vfio_fops_read,
1318 .write = vfio_fops_write,
1319 .unlocked_ioctl = vfio_fops_unl_ioctl,
1320 #ifdef CONFIG_COMPAT
1321 .compat_ioctl = vfio_fops_compat_ioctl,
1322 #endif
1323 .mmap = vfio_fops_mmap,
1324 };
1325
1326 /**
1327 * VFIO Group fd, /dev/vfio/$GROUP
1328 */
1329 static void __vfio_group_unset_container(struct vfio_group *group)
1330 {
1331 struct vfio_container *container = group->container;
1332 struct vfio_iommu_driver *driver;
1333
1334 down_write(&container->group_lock);
1335
1336 driver = container->iommu_driver;
1337 if (driver)
1338 driver->ops->detach_group(container->iommu_data,
1339 group->iommu_group);
1340
1341 group->container = NULL;
1342 wake_up(&group->container_q);
1343 list_del(&group->container_next);
1344
1345 /* Detaching the last group deprivileges a container, remove iommu */
1346 if (driver && list_empty(&container->group_list)) {
1347 driver->ops->release(container->iommu_data);
1348 module_put(driver->ops->owner);
1349 container->iommu_driver = NULL;
1350 container->iommu_data = NULL;
1351 }
1352
1353 up_write(&container->group_lock);
1354
1355 vfio_container_put(container);
1356 }
1357
1358 /*
1359 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1360 * if there was no container to unset. Since the ioctl is called on
1361 * the group, we know that still exists, therefore the only valid
1362 * transition here is 1->0.
1363 */
1364 static int vfio_group_unset_container(struct vfio_group *group)
1365 {
1366 int users = atomic_cmpxchg(&group->container_users, 1, 0);
1367
1368 if (!users)
1369 return -EINVAL;
1370 if (users != 1)
1371 return -EBUSY;
1372
1373 __vfio_group_unset_container(group);
1374
1375 return 0;
1376 }
1377
1378 /*
1379 * When removing container users, anything that removes the last user
1380 * implicitly removes the group from the container. That is, if the
1381 * group file descriptor is closed, as well as any device file descriptors,
1382 * the group is free.
1383 */
1384 static void vfio_group_try_dissolve_container(struct vfio_group *group)
1385 {
1386 if (0 == atomic_dec_if_positive(&group->container_users))
1387 __vfio_group_unset_container(group);
1388 }
1389
1390 static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1391 {
1392 struct fd f;
1393 struct vfio_container *container;
1394 struct vfio_iommu_driver *driver;
1395 int ret = 0;
1396
1397 if (atomic_read(&group->container_users))
1398 return -EINVAL;
1399
1400 if (group->noiommu && !capable(CAP_SYS_RAWIO))
1401 return -EPERM;
1402
1403 f = fdget(container_fd);
1404 if (!f.file)
1405 return -EBADF;
1406
1407 /* Sanity check, is this really our fd? */
1408 if (f.file->f_op != &vfio_fops) {
1409 fdput(f);
1410 return -EINVAL;
1411 }
1412
1413 container = f.file->private_data;
1414 WARN_ON(!container); /* fget ensures we don't race vfio_release */
1415
1416 down_write(&container->group_lock);
1417
1418 /* Real groups and fake groups cannot mix */
1419 if (!list_empty(&container->group_list) &&
1420 container->noiommu != group->noiommu) {
1421 ret = -EPERM;
1422 goto unlock_out;
1423 }
1424
1425 driver = container->iommu_driver;
1426 if (driver) {
1427 ret = driver->ops->attach_group(container->iommu_data,
1428 group->iommu_group);
1429 if (ret)
1430 goto unlock_out;
1431 }
1432
1433 group->container = container;
1434 container->noiommu = group->noiommu;
1435 list_add(&group->container_next, &container->group_list);
1436
1437 /* Get a reference on the container and mark a user within the group */
1438 vfio_container_get(container);
1439 atomic_inc(&group->container_users);
1440
1441 unlock_out:
1442 up_write(&container->group_lock);
1443 fdput(f);
1444 return ret;
1445 }
1446
1447 static bool vfio_group_viable(struct vfio_group *group)
1448 {
1449 return (iommu_group_for_each_dev(group->iommu_group,
1450 group, vfio_dev_viable) == 0);
1451 }
1452
1453 static int vfio_group_add_container_user(struct vfio_group *group)
1454 {
1455 if (!atomic_inc_not_zero(&group->container_users))
1456 return -EINVAL;
1457
1458 if (group->noiommu) {
1459 atomic_dec(&group->container_users);
1460 return -EPERM;
1461 }
1462 if (!group->container->iommu_driver || !vfio_group_viable(group)) {
1463 atomic_dec(&group->container_users);
1464 return -EINVAL;
1465 }
1466
1467 return 0;
1468 }
1469
1470 static const struct file_operations vfio_device_fops;
1471
1472 static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1473 {
1474 struct vfio_device *device;
1475 struct file *filep;
1476 int ret;
1477
1478 if (0 == atomic_read(&group->container_users) ||
1479 !group->container->iommu_driver || !vfio_group_viable(group))
1480 return -EINVAL;
1481
1482 if (group->noiommu && !capable(CAP_SYS_RAWIO))
1483 return -EPERM;
1484
1485 device = vfio_device_get_from_name(group, buf);
1486 if (!device)
1487 return -ENODEV;
1488
1489 ret = device->ops->open(device->device_data);
1490 if (ret) {
1491 vfio_device_put(device);
1492 return ret;
1493 }
1494
1495 /*
1496 * We can't use anon_inode_getfd() because we need to modify
1497 * the f_mode flags directly to allow more than just ioctls
1498 */
1499 ret = get_unused_fd_flags(O_CLOEXEC);
1500 if (ret < 0) {
1501 device->ops->release(device->device_data);
1502 vfio_device_put(device);
1503 return ret;
1504 }
1505
1506 filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1507 device, O_RDWR);
1508 if (IS_ERR(filep)) {
1509 put_unused_fd(ret);
1510 ret = PTR_ERR(filep);
1511 device->ops->release(device->device_data);
1512 vfio_device_put(device);
1513 return ret;
1514 }
1515
1516 /*
1517 * TODO: add an anon_inode interface to do this.
1518 * Appears to be missing by lack of need rather than
1519 * explicitly prevented. Now there's need.
1520 */
1521 filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1522
1523 atomic_inc(&group->container_users);
1524
1525 fd_install(ret, filep);
1526
1527 if (group->noiommu)
1528 dev_warn(device->dev, "vfio-noiommu device opened by user "
1529 "(%s:%d)\n", current->comm, task_pid_nr(current));
1530
1531 return ret;
1532 }
1533
1534 static long vfio_group_fops_unl_ioctl(struct file *filep,
1535 unsigned int cmd, unsigned long arg)
1536 {
1537 struct vfio_group *group = filep->private_data;
1538 long ret = -ENOTTY;
1539
1540 switch (cmd) {
1541 case VFIO_GROUP_GET_STATUS:
1542 {
1543 struct vfio_group_status status;
1544 unsigned long minsz;
1545
1546 minsz = offsetofend(struct vfio_group_status, flags);
1547
1548 if (copy_from_user(&status, (void __user *)arg, minsz))
1549 return -EFAULT;
1550
1551 if (status.argsz < minsz)
1552 return -EINVAL;
1553
1554 status.flags = 0;
1555
1556 if (vfio_group_viable(group))
1557 status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1558
1559 if (group->container)
1560 status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1561
1562 if (copy_to_user((void __user *)arg, &status, minsz))
1563 return -EFAULT;
1564
1565 ret = 0;
1566 break;
1567 }
1568 case VFIO_GROUP_SET_CONTAINER:
1569 {
1570 int fd;
1571
1572 if (get_user(fd, (int __user *)arg))
1573 return -EFAULT;
1574
1575 if (fd < 0)
1576 return -EINVAL;
1577
1578 ret = vfio_group_set_container(group, fd);
1579 break;
1580 }
1581 case VFIO_GROUP_UNSET_CONTAINER:
1582 ret = vfio_group_unset_container(group);
1583 break;
1584 case VFIO_GROUP_GET_DEVICE_FD:
1585 {
1586 char *buf;
1587
1588 buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1589 if (IS_ERR(buf))
1590 return PTR_ERR(buf);
1591
1592 ret = vfio_group_get_device_fd(group, buf);
1593 kfree(buf);
1594 break;
1595 }
1596 }
1597
1598 return ret;
1599 }
1600
1601 #ifdef CONFIG_COMPAT
1602 static long vfio_group_fops_compat_ioctl(struct file *filep,
1603 unsigned int cmd, unsigned long arg)
1604 {
1605 arg = (unsigned long)compat_ptr(arg);
1606 return vfio_group_fops_unl_ioctl(filep, cmd, arg);
1607 }
1608 #endif /* CONFIG_COMPAT */
1609
1610 static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1611 {
1612 struct vfio_group *group;
1613 int opened;
1614
1615 group = vfio_group_get_from_minor(iminor(inode));
1616 if (!group)
1617 return -ENODEV;
1618
1619 if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
1620 vfio_group_put(group);
1621 return -EPERM;
1622 }
1623
1624 /* Do we need multiple instances of the group open? Seems not. */
1625 opened = atomic_cmpxchg(&group->opened, 0, 1);
1626 if (opened) {
1627 vfio_group_put(group);
1628 return -EBUSY;
1629 }
1630
1631 /* Is something still in use from a previous open? */
1632 if (group->container) {
1633 atomic_dec(&group->opened);
1634 vfio_group_put(group);
1635 return -EBUSY;
1636 }
1637
1638 /* Warn if previous user didn't cleanup and re-init to drop them */
1639 if (WARN_ON(group->notifier.head))
1640 BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
1641
1642 filep->private_data = group;
1643
1644 return 0;
1645 }
1646
1647 static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1648 {
1649 struct vfio_group *group = filep->private_data;
1650
1651 filep->private_data = NULL;
1652
1653 vfio_group_try_dissolve_container(group);
1654
1655 atomic_dec(&group->opened);
1656
1657 vfio_group_put(group);
1658
1659 return 0;
1660 }
1661
1662 static const struct file_operations vfio_group_fops = {
1663 .owner = THIS_MODULE,
1664 .unlocked_ioctl = vfio_group_fops_unl_ioctl,
1665 #ifdef CONFIG_COMPAT
1666 .compat_ioctl = vfio_group_fops_compat_ioctl,
1667 #endif
1668 .open = vfio_group_fops_open,
1669 .release = vfio_group_fops_release,
1670 };
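/*
 * Minimal user-space sketch of the container/group flow implemented by the
 * two sets of file operations above (hedged; the group number and device
 * name are examples, error handling trimmed):
 *
 *	int container, group, device;
 *	struct vfio_group_status status = { .argsz = sizeof(status) };
 *
 *	container = open("/dev/vfio/vfio", O_RDWR);
 *	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
 *		return -1;			// unknown API version
 *
 *	group = open("/dev/vfio/26", O_RDWR);
 *	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
 *	if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
 *		return -1;			// group not viable, see vfio_dev_viable()
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */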
1671
1672 /**
1673 * VFIO Device fd
1674 */
1675 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1676 {
1677 struct vfio_device *device = filep->private_data;
1678
1679 device->ops->release(device->device_data);
1680
1681 vfio_group_try_dissolve_container(device->group);
1682
1683 vfio_device_put(device);
1684
1685 return 0;
1686 }
1687
1688 static long vfio_device_fops_unl_ioctl(struct file *filep,
1689 unsigned int cmd, unsigned long arg)
1690 {
1691 struct vfio_device *device = filep->private_data;
1692
1693 if (unlikely(!device->ops->ioctl))
1694 return -EINVAL;
1695
1696 return device->ops->ioctl(device->device_data, cmd, arg);
1697 }
1698
1699 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1700 size_t count, loff_t *ppos)
1701 {
1702 struct vfio_device *device = filep->private_data;
1703
1704 if (unlikely(!device->ops->read))
1705 return -EINVAL;
1706
1707 return device->ops->read(device->device_data, buf, count, ppos);
1708 }
1709
1710 static ssize_t vfio_device_fops_write(struct file *filep,
1711 const char __user *buf,
1712 size_t count, loff_t *ppos)
1713 {
1714 struct vfio_device *device = filep->private_data;
1715
1716 if (unlikely(!device->ops->write))
1717 return -EINVAL;
1718
1719 return device->ops->write(device->device_data, buf, count, ppos);
1720 }
1721
1722 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1723 {
1724 struct vfio_device *device = filep->private_data;
1725
1726 if (unlikely(!device->ops->mmap))
1727 return -EINVAL;
1728
1729 return device->ops->mmap(device->device_data, vma);
1730 }
1731
1732 #ifdef CONFIG_COMPAT
1733 static long vfio_device_fops_compat_ioctl(struct file *filep,
1734 unsigned int cmd, unsigned long arg)
1735 {
1736 arg = (unsigned long)compat_ptr(arg);
1737 return vfio_device_fops_unl_ioctl(filep, cmd, arg);
1738 }
1739 #endif /* CONFIG_COMPAT */
1740
1741 static const struct file_operations vfio_device_fops = {
1742 .owner = THIS_MODULE,
1743 .release = vfio_device_fops_release,
1744 .read = vfio_device_fops_read,
1745 .write = vfio_device_fops_write,
1746 .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1747 #ifdef CONFIG_COMPAT
1748 .compat_ioctl = vfio_device_fops_compat_ioctl,
1749 #endif
1750 .mmap = vfio_device_fops_mmap,
1751 };
1752
1753 /**
1754 * External user API, exported by symbols to be linked dynamically.
1755 *
1756 * The protocol includes:
1757 * 1. do normal VFIO init operation:
1758 * - opening a new container;
1759 * - attaching group(s) to it;
1760 * - setting an IOMMU driver for a container.
1761 * When IOMMU is set for a container, all groups in it are
1762 * considered ready to use by an external user.
1763 *
1764 * 2. User space passes a group fd to an external user.
1765 * The external user calls vfio_group_get_external_user()
1766 * to verify that:
1767 * - the group is initialized;
1768 * - IOMMU is set for it.
1769 * If both checks passed, vfio_group_get_external_user()
1770 * increments the container user counter to prevent
1771 * the VFIO group from disposal before KVM exits.
1772 *
1773 * 3. The external user calls vfio_external_user_iommu_id()
1774 * to know an IOMMU ID.
1775 *
1776 * 4. When the external KVM finishes, it calls
1777 * vfio_group_put_external_user() to release the VFIO group.
1778 * This call decrements the container user counter.
1779 */
1780 struct vfio_group *vfio_group_get_external_user(struct file *filep)
1781 {
1782 struct vfio_group *group = filep->private_data;
1783 int ret;
1784
1785 if (filep->f_op != &vfio_group_fops)
1786 return ERR_PTR(-EINVAL);
1787
1788 ret = vfio_group_add_container_user(group);
1789 if (ret)
1790 return ERR_PTR(ret);
1791
1792 vfio_group_get(group);
1793
1794 return group;
1795 }
1796 EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
1797
1798 void vfio_group_put_external_user(struct vfio_group *group)
1799 {
1800 vfio_group_try_dissolve_container(group);
1801 vfio_group_put(group);
1802 }
1803 EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1804
1805 bool vfio_external_group_match_file(struct vfio_group *test_group,
1806 struct file *filep)
1807 {
1808 struct vfio_group *group = filep->private_data;
1809
1810 return (filep->f_op == &vfio_group_fops) && (group == test_group);
1811 }
1812 EXPORT_SYMBOL_GPL(vfio_external_group_match_file);
1813
1814 int vfio_external_user_iommu_id(struct vfio_group *group)
1815 {
1816 return iommu_group_id(group->iommu_group);
1817 }
1818 EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1819
1820 long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1821 {
1822 return vfio_ioctl_check_extension(group->container, arg);
1823 }
1824 EXPORT_SYMBOL_GPL(vfio_external_check_extension);
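/*
 * Illustrative sketch (hypothetical external user, e.g. a hypervisor module)
 * of the protocol documented above: validate the group fd passed in from
 * user space, note its IOMMU ID, and hold the reference until detach.
 *
 *	static struct vfio_group *my_attach_group(struct file *filep,
 *						  int *iommu_id)
 *	{
 *		struct vfio_group *group = vfio_group_get_external_user(filep);
 *
 *		if (IS_ERR(group))
 *			return group;
 *
 *		*iommu_id = vfio_external_user_iommu_id(group);
 *		return group;	// held until my_detach_group()
 *	}
 *
 *	static void my_detach_group(struct vfio_group *group)
 *	{
 *		vfio_group_put_external_user(group);
 *	}
 */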
1825
1826 /**
1827 * Sub-module support
1828 */
1829 /*
1830 * Helper for managing a buffer of info chain capabilities, allocate or
1831 * reallocate a buffer with additional @size, filling in @id and @version
1832 * of the capability. A pointer to the new capability is returned.
1833 *
1834 * NB. The chain is based at the head of the buffer, so new entries are
1835 * added to the tail, vfio_info_cap_shift() should be called to fixup the
1836 * next offsets prior to copying to the user buffer.
1837 */
1838 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1839 size_t size, u16 id, u16 version)
1840 {
1841 void *buf;
1842 struct vfio_info_cap_header *header, *tmp;
1843
1844 buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1845 if (!buf) {
1846 kfree(caps->buf);
1847 caps->size = 0;
1848 return ERR_PTR(-ENOMEM);
1849 }
1850
1851 caps->buf = buf;
1852 header = buf + caps->size;
1853
1854 /* Eventually copied to user buffer, zero */
1855 memset(header, 0, size);
1856
1857 header->id = id;
1858 header->version = version;
1859
1860 /* Add to the end of the capability chain */
1861 for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1862 ; /* nothing */
1863
1864 tmp->next = caps->size;
1865 caps->size += size;
1866
1867 return header;
1868 }
1869 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1870
1871 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1872 {
1873 struct vfio_info_cap_header *tmp;
1874 void *buf = (void *)caps->buf;
1875
1876 for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1877 tmp->next += offset;
1878 }
1879 EXPORT_SYMBOL(vfio_info_cap_shift);
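/*
 * Illustrative sketch (hedged; struct my_cap and MY_CAP_ID are hypothetical,
 * and "info"/"arg" are assumed to be the vfio_region_info and ioctl argument
 * of the surrounding handler): building a capability chain and shifting its
 * offsets before appending it to the user buffer.
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *	struct vfio_info_cap_header *header;
 *	struct my_cap *cap;
 *
 *	header = vfio_info_cap_add(&caps, sizeof(*cap), MY_CAP_ID, 1);
 *	if (IS_ERR(header))
 *		return PTR_ERR(header);
 *	cap = container_of(header, struct my_cap, header);
 *	// ... fill in the my_cap specific fields ...
 *
 *	if (caps.size) {
 *		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
 *		if (info.argsz < sizeof(info) + caps.size) {
 *			info.argsz = sizeof(info) + caps.size;	// ask the user to retry
 *			info.cap_offset = 0;
 *		} else {
 *			vfio_info_cap_shift(&caps, sizeof(info));
 *			if (copy_to_user((void __user *)arg + sizeof(info),
 *					 caps.buf, caps.size)) {
 *				kfree(caps.buf);
 *				return -EFAULT;
 *			}
 *			info.cap_offset = sizeof(info);
 *		}
 *		kfree(caps.buf);
 *	}
 */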
1880
1881 static int sparse_mmap_cap(struct vfio_info_cap *caps, void *cap_type)
1882 {
1883 struct vfio_info_cap_header *header;
1884 struct vfio_region_info_cap_sparse_mmap *sparse_cap, *sparse = cap_type;
1885 size_t size;
1886
1887 size = sizeof(*sparse) + sparse->nr_areas * sizeof(*sparse->areas);
1888 header = vfio_info_cap_add(caps, size,
1889 VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1);
1890 if (IS_ERR(header))
1891 return PTR_ERR(header);
1892
1893 sparse_cap = container_of(header,
1894 struct vfio_region_info_cap_sparse_mmap, header);
1895 sparse_cap->nr_areas = sparse->nr_areas;
1896 memcpy(sparse_cap->areas, sparse->areas,
1897 sparse->nr_areas * sizeof(*sparse->areas));
1898 return 0;
1899 }
1900
1901 static int region_type_cap(struct vfio_info_cap *caps, void *cap_type)
1902 {
1903 struct vfio_info_cap_header *header;
1904 struct vfio_region_info_cap_type *type_cap, *cap = cap_type;
1905
1906 header = vfio_info_cap_add(caps, sizeof(*cap),
1907 VFIO_REGION_INFO_CAP_TYPE, 1);
1908 if (IS_ERR(header))
1909 return PTR_ERR(header);
1910
1911 type_cap = container_of(header, struct vfio_region_info_cap_type,
1912 header);
1913 type_cap->type = cap->type;
1914 type_cap->subtype = cap->subtype;
1915 return 0;
1916 }
1917
1918 int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id,
1919 void *cap_type)
1920 {
1921 int ret = -EINVAL;
1922
1923 if (!cap_type)
1924 return 0;
1925
1926 switch (cap_type_id) {
1927 case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
1928 ret = sparse_mmap_cap(caps, cap_type);
1929 break;
1930
1931 case VFIO_REGION_INFO_CAP_TYPE:
1932 ret = region_type_cap(caps, cap_type);
1933 break;
1934 }
1935
1936 return ret;
1937 }
1938 EXPORT_SYMBOL(vfio_info_add_capability);
1939
1940 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1941 int max_irq_type, size_t *data_size)
1942 {
1943 unsigned long minsz;
1944 size_t size;
1945
1946 minsz = offsetofend(struct vfio_irq_set, count);
1947
1948 if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1949 (hdr->count >= (U32_MAX - hdr->start)) ||
1950 (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1951 VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1952 return -EINVAL;
1953
1954 if (data_size)
1955 *data_size = 0;
1956
1957 if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1958 return -EINVAL;
1959
1960 switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1961 case VFIO_IRQ_SET_DATA_NONE:
1962 size = 0;
1963 break;
1964 case VFIO_IRQ_SET_DATA_BOOL:
1965 size = sizeof(uint8_t);
1966 break;
1967 case VFIO_IRQ_SET_DATA_EVENTFD:
1968 size = sizeof(int32_t);
1969 break;
1970 default:
1971 return -EINVAL;
1972 }
1973
1974 if (size) {
1975 if (hdr->argsz - minsz < hdr->count * size)
1976 return -EINVAL;
1977
1978 if (!data_size)
1979 return -EINVAL;
1980
1981 *data_size = hdr->count * size;
1982 }
1983
1984 return 0;
1985 }
1986 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
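/*
 * Editor's illustrative sketch (not part of the original source): the
 * usual shape of a driver's VFIO_DEVICE_SET_IRQS ioctl path around
 * vfio_set_irqs_validate_and_prepare().  num_irqs (IRQs available for
 * hdr->index) and max_irq_type (number of valid indexes) are assumed
 * to be supplied by the caller.
 */
static int __maybe_unused vfio_example_set_irqs(void __user *arg,
						int num_irqs, int max_irq_type)
{
	unsigned long minsz = offsetofend(struct vfio_irq_set, count);
	struct vfio_irq_set hdr;
	size_t data_size = 0;
	u8 *data = NULL;
	int ret;

	if (copy_from_user(&hdr, arg, minsz))
		return -EFAULT;

	ret = vfio_set_irqs_validate_and_prepare(&hdr, num_irqs,
						 max_irq_type, &data_size);
	if (ret)
		return ret;

	if (data_size) {
		/* Variable-length payload follows the fixed header */
		data = memdup_user(arg + minsz, data_size);
		if (IS_ERR(data))
			return PTR_ERR(data);
	}

	/* ... hand hdr.index/start/count/flags and data to the IRQ backend ... */

	kfree(data);
	return 0;
}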
1987
1988 /*
1989 * Pin a set of guest PFNs and return their associated host PFNs for local
1990 * domain only.
1991 * @dev [in] : device
1992 * @user_pfn [in]: array of user/guest PFNs to be pinned.
1993 * @npage [in] : count of elements in user_pfn array. This count should not
1994 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1995 * @prot [in] : protection flags
1996 * @phys_pfn[out]: array of host PFNs
1997 * Return error or number of pages pinned.
1998 */
1999 int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
2000 int prot, unsigned long *phys_pfn)
2001 {
2002 struct vfio_container *container;
2003 struct vfio_group *group;
2004 struct vfio_iommu_driver *driver;
2005 int ret;
2006
2007 if (!dev || !user_pfn || !phys_pfn || !npage)
2008 return -EINVAL;
2009
2010 if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2011 return -E2BIG;
2012
2013 group = vfio_group_get_from_dev(dev);
2014 if (!group)
2015 return -ENODEV;
2016
2017 ret = vfio_group_add_container_user(group);
2018 if (ret)
2019 goto err_pin_pages;
2020
2021 container = group->container;
2022 driver = container->iommu_driver;
2023 if (likely(driver && driver->ops->pin_pages))
2024 ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
2025 npage, prot, phys_pfn);
2026 else
2027 ret = -ENOTTY;
2028
2029 vfio_group_try_dissolve_container(group);
2030
2031 err_pin_pages:
2032 vfio_group_put(group);
2033 return ret;
2034 }
2035 EXPORT_SYMBOL(vfio_pin_pages);
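/*
 * Editor's illustrative sketch (not part of the original source): a
 * vendor/mdev driver pinning a single guest page for DMA.  "gfn" is a
 * hypothetical guest page frame number; a successful pin must later be
 * undone with vfio_unpin_pages() on the same user PFN.
 */
static int __maybe_unused vfio_example_pin_one(struct device *dev,
					       unsigned long gfn,
					       unsigned long *host_pfn)
{
	unsigned long user_pfn = gfn;
	int ret;

	/* Returns the number of pages pinned (1 here) or a negative errno */
	ret = vfio_pin_pages(dev, &user_pfn, 1, IOMMU_READ | IOMMU_WRITE,
			     host_pfn);
	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	return 0;
}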
2036
2037 /*
2038 * Unpin a set of user/guest PFNs for local domain only.
2039 * @dev [in] : device
2040 * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
2041 * PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
2042 * @npage [in] : count of elements in user_pfn array. This count should not
2043 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
2044 * Return error or number of pages unpinned.
2045 */
2046 int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
2047 {
2048 struct vfio_container *container;
2049 struct vfio_group *group;
2050 struct vfio_iommu_driver *driver;
2051 int ret;
2052
2053 if (!dev || !user_pfn || !npage)
2054 return -EINVAL;
2055
2056 if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2057 return -E2BIG;
2058
2059 group = vfio_group_get_from_dev(dev);
2060 if (!group)
2061 return -ENODEV;
2062
2063 ret = vfio_group_add_container_user(group);
2064 if (ret)
2065 goto err_unpin_pages;
2066
2067 container = group->container;
2068 driver = container->iommu_driver;
2069 if (likely(driver && driver->ops->unpin_pages))
2070 ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
2071 npage);
2072 else
2073 ret = -ENOTTY;
2074
2075 vfio_group_try_dissolve_container(group);
2076
2077 err_unpin_pages:
2078 vfio_group_put(group);
2079 return ret;
2080 }
2081 EXPORT_SYMBOL(vfio_unpin_pages);
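/*
 * Editor's illustrative sketch (not part of the original source):
 * releasing pages previously pinned with vfio_pin_pages(), e.g. from a
 * DMA-unmap handler or device teardown.  user_pfns/npage are assumed
 * to mirror an earlier pin call.
 */
static void __maybe_unused vfio_example_unpin(struct device *dev,
					      unsigned long *user_pfns,
					      int npage)
{
	int ret = vfio_unpin_pages(dev, user_pfns, npage);

	/* Returns the number of pages unpinned or a negative errno */
	WARN_ON(ret != npage);
}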
2082
2083 static int vfio_register_iommu_notifier(struct vfio_group *group,
2084 unsigned long *events,
2085 struct notifier_block *nb)
2086 {
2087 struct vfio_container *container;
2088 struct vfio_iommu_driver *driver;
2089 int ret;
2090
2091 ret = vfio_group_add_container_user(group);
2092 if (ret)
2093 return -EINVAL;
2094
2095 container = group->container;
2096 driver = container->iommu_driver;
2097 if (likely(driver && driver->ops->register_notifier))
2098 ret = driver->ops->register_notifier(container->iommu_data,
2099 events, nb);
2100 else
2101 ret = -ENOTTY;
2102
2103 vfio_group_try_dissolve_container(group);
2104
2105 return ret;
2106 }
2107
2108 static int vfio_unregister_iommu_notifier(struct vfio_group *group,
2109 struct notifier_block *nb)
2110 {
2111 struct vfio_container *container;
2112 struct vfio_iommu_driver *driver;
2113 int ret;
2114
2115 ret = vfio_group_add_container_user(group);
2116 if (ret)
2117 return -EINVAL;
2118
2119 container = group->container;
2120 driver = container->iommu_driver;
2121 if (likely(driver && driver->ops->unregister_notifier))
2122 ret = driver->ops->unregister_notifier(container->iommu_data,
2123 nb);
2124 else
2125 ret = -ENOTTY;
2126
2127 vfio_group_try_dissolve_container(group);
2128
2129 return ret;
2130 }
2131
2132 void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
2133 {
2134 group->kvm = kvm;
2135 blocking_notifier_call_chain(&group->notifier,
2136 VFIO_GROUP_NOTIFY_SET_KVM, kvm);
2137 }
2138 EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
2139
2140 static int vfio_register_group_notifier(struct vfio_group *group,
2141 unsigned long *events,
2142 struct notifier_block *nb)
2143 {
2144 int ret;
2145 bool set_kvm = false;
2146
2147 if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
2148 set_kvm = true;
2149
2150 /* clear known events */
2151 *events &= ~VFIO_GROUP_NOTIFY_SET_KVM;
2152
2153 /* refuse to continue if unknown events remain */
2154 if (*events)
2155 return -EINVAL;
2156
2157 ret = vfio_group_add_container_user(group);
2158 if (ret)
2159 return -EINVAL;
2160
2161 ret = blocking_notifier_chain_register(&group->notifier, nb);
2162
2163 /*
2164 * KVM may already have been attached to this vfio_group, so
2165 * replay the event once upon registration.
2166 */
2167 if (!ret && set_kvm && group->kvm)
2168 blocking_notifier_call_chain(&group->notifier,
2169 VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);
2170
2171 vfio_group_try_dissolve_container(group);
2172
2173 return ret;
2174 }
2175
2176 static int vfio_unregister_group_notifier(struct vfio_group *group,
2177 struct notifier_block *nb)
2178 {
2179 int ret;
2180
2181 ret = vfio_group_add_container_user(group);
2182 if (ret)
2183 return -EINVAL;
2184
2185 ret = blocking_notifier_chain_unregister(&group->notifier, nb);
2186
2187 vfio_group_try_dissolve_container(group);
2188
2189 return ret;
2190 }
2191
2192 int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
2193 unsigned long *events, struct notifier_block *nb)
2194 {
2195 struct vfio_group *group;
2196 int ret;
2197
2198 if (!dev || !nb || !events || (*events == 0))
2199 return -EINVAL;
2200
2201 group = vfio_group_get_from_dev(dev);
2202 if (!group)
2203 return -ENODEV;
2204
2205 switch (type) {
2206 case VFIO_IOMMU_NOTIFY:
2207 ret = vfio_register_iommu_notifier(group, events, nb);
2208 break;
2209 case VFIO_GROUP_NOTIFY:
2210 ret = vfio_register_group_notifier(group, events, nb);
2211 break;
2212 default:
2213 ret = -EINVAL;
2214 }
2215
2216 vfio_group_put(group);
2217 return ret;
2218 }
2219 EXPORT_SYMBOL(vfio_register_notifier);
2220
2221 int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
2222 struct notifier_block *nb)
2223 {
2224 struct vfio_group *group;
2225 int ret;
2226
2227 if (!dev || !nb)
2228 return -EINVAL;
2229
2230 group = vfio_group_get_from_dev(dev);
2231 if (!group)
2232 return -ENODEV;
2233
2234 switch (type) {
2235 case VFIO_IOMMU_NOTIFY:
2236 ret = vfio_unregister_iommu_notifier(group, nb);
2237 break;
2238 case VFIO_GROUP_NOTIFY:
2239 ret = vfio_unregister_group_notifier(group, nb);
2240 break;
2241 default:
2242 ret = -EINVAL;
2243 }
2244
2245 vfio_group_put(group);
2246 return ret;
2247 }
2248 EXPORT_SYMBOL(vfio_unregister_notifier);
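/*
 * Editor's illustrative sketch (not part of the original source): an
 * mdev vendor driver watching for DMA unmaps so it can unpin pages it
 * pinned earlier.  For VFIO_IOMMU_NOTIFY_DMA_UNMAP the type1 backend
 * passes a struct vfio_iommu_type1_dma_unmap describing the range; the
 * function names here are assumptions.
 */
static int __maybe_unused vfio_example_dma_unmap_cb(struct notifier_block *nb,
						    unsigned long action,
						    void *data)
{
	if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
		struct vfio_iommu_type1_dma_unmap *unmap = data;

		/* ... vfio_unpin_pages() for PFNs inside [iova, iova + size) ... */
		pr_debug("example: unmap iova 0x%llx size 0x%llx\n",
			 unmap->iova, unmap->size);
	}

	return NOTIFY_OK;
}

static int __maybe_unused vfio_example_watch_unmaps(struct device *dev,
						    struct notifier_block *nb)
{
	unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;

	nb->notifier_call = vfio_example_dma_unmap_cb;

	/* Pair with vfio_unregister_notifier(dev, VFIO_IOMMU_NOTIFY, nb) */
	return vfio_register_notifier(dev, VFIO_IOMMU_NOTIFY, &events, nb);
}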
2249
2250 /**
2251 * Module/class support
2252 */
2253 static char *vfio_devnode(struct device *dev, umode_t *mode)
2254 {
2255 return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2256 }
2257
2258 static struct miscdevice vfio_dev = {
2259 .minor = VFIO_MINOR,
2260 .name = "vfio",
2261 .fops = &vfio_fops,
2262 .nodename = "vfio/vfio",
2263 .mode = S_IRUGO | S_IWUGO,
2264 };
2265
2266 static int __init vfio_init(void)
2267 {
2268 int ret;
2269
2270 idr_init(&vfio.group_idr);
2271 mutex_init(&vfio.group_lock);
2272 mutex_init(&vfio.iommu_drivers_lock);
2273 INIT_LIST_HEAD(&vfio.group_list);
2274 INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2275 init_waitqueue_head(&vfio.release_q);
2276
2277 ret = misc_register(&vfio_dev);
2278 if (ret) {
2279 pr_err("vfio: misc device register failed\n");
2280 return ret;
2281 }
2282
2283 /* /dev/vfio/$GROUP */
2284 vfio.class = class_create(THIS_MODULE, "vfio");
2285 if (IS_ERR(vfio.class)) {
2286 ret = PTR_ERR(vfio.class);
2287 goto err_class;
2288 }
2289
2290 vfio.class->devnode = vfio_devnode;
2291
2292 ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK, "vfio");
2293 if (ret)
2294 goto err_alloc_chrdev;
2295
2296 cdev_init(&vfio.group_cdev, &vfio_group_fops);
2297 ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK);
2298 if (ret)
2299 goto err_cdev_add;
2300
2301 pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2302
2303 #ifdef CONFIG_VFIO_NOIOMMU
2304 vfio_register_iommu_driver(&vfio_noiommu_ops);
2305 #endif
2306 return 0;
2307
2308 err_cdev_add:
2309 unregister_chrdev_region(vfio.group_devt, MINORMASK);
2310 err_alloc_chrdev:
2311 class_destroy(vfio.class);
2312 vfio.class = NULL;
2313 err_class:
2314 misc_deregister(&vfio_dev);
2315 return ret;
2316 }
2317
2318 static void __exit vfio_cleanup(void)
2319 {
2320 WARN_ON(!list_empty(&vfio.group_list));
2321
2322 #ifdef CONFIG_VFIO_NOIOMMU
2323 vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2324 #endif
2325 idr_destroy(&vfio.group_idr);
2326 cdev_del(&vfio.group_cdev);
2327 unregister_chrdev_region(vfio.group_devt, MINORMASK);
2328 class_destroy(vfio.class);
2329 vfio.class = NULL;
2330 misc_deregister(&vfio_dev);
2331 }
2332
2333 module_init(vfio_init);
2334 module_exit(vfio_cleanup);
2335
2336 MODULE_VERSION(DRIVER_VERSION);
2337 MODULE_LICENSE("GPL v2");
2338 MODULE_AUTHOR(DRIVER_AUTHOR);
2339 MODULE_DESCRIPTION(DRIVER_DESC);
2340 MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2341 MODULE_ALIAS("devname:vfio/vfio");
2342 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");