drivers/vfio/vfio.c
1 /*
2 * VFIO core
3 *
4 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
5 * Author: Alex Williamson <alex.williamson@redhat.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * Derived from original vfio:
12 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
13 * Author: Tom Lyon, pugs@cisco.com
14 */
15
16 #include <linux/cdev.h>
17 #include <linux/compat.h>
18 #include <linux/device.h>
19 #include <linux/file.h>
20 #include <linux/anon_inodes.h>
21 #include <linux/fs.h>
22 #include <linux/idr.h>
23 #include <linux/iommu.h>
24 #include <linux/list.h>
25 #include <linux/miscdevice.h>
26 #include <linux/module.h>
27 #include <linux/mutex.h>
28 #include <linux/pci.h>
29 #include <linux/rwsem.h>
30 #include <linux/sched.h>
31 #include <linux/slab.h>
32 #include <linux/stat.h>
33 #include <linux/string.h>
34 #include <linux/uaccess.h>
35 #include <linux/vfio.h>
36 #include <linux/wait.h>
37
38 #define DRIVER_VERSION "0.3"
39 #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
40 #define DRIVER_DESC "VFIO - User Level meta-driver"
41
42 static struct vfio {
43 struct class *class;
44 struct list_head iommu_drivers_list;
45 struct mutex iommu_drivers_lock;
46 struct list_head group_list;
47 struct idr group_idr;
48 struct mutex group_lock;
49 struct cdev group_cdev;
50 dev_t group_devt;
51 wait_queue_head_t release_q;
52 } vfio;
53
54 struct vfio_iommu_driver {
55 const struct vfio_iommu_driver_ops *ops;
56 struct list_head vfio_next;
57 };
58
59 struct vfio_container {
60 struct kref kref;
61 struct list_head group_list;
62 struct rw_semaphore group_lock;
63 struct vfio_iommu_driver *iommu_driver;
64 void *iommu_data;
65 bool noiommu;
66 };
67
68 struct vfio_unbound_dev {
69 struct device *dev;
70 struct list_head unbound_next;
71 };
72
73 struct vfio_group {
74 struct kref kref;
75 int minor;
76 atomic_t container_users;
77 struct iommu_group *iommu_group;
78 struct vfio_container *container;
79 struct list_head device_list;
80 struct mutex device_lock;
81 struct device *dev;
82 struct notifier_block nb;
83 struct list_head vfio_next;
84 struct list_head container_next;
85 struct list_head unbound_list;
86 struct mutex unbound_lock;
87 atomic_t opened;
88 bool noiommu;
89 struct kvm *kvm;
90 struct blocking_notifier_head notifier;
91 };
92
93 struct vfio_device {
94 struct kref kref;
95 struct device *dev;
96 const struct vfio_device_ops *ops;
97 struct vfio_group *group;
98 struct list_head group_next;
99 void *device_data;
100 };
101
102 #ifdef CONFIG_VFIO_NOIOMMU
103 static bool noiommu __read_mostly;
104 module_param_named(enable_unsafe_noiommu_mode,
105 noiommu, bool, S_IRUGO | S_IWUSR);
106 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)");
107 #endif
108
109 /*
110 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
111 * and remove functions; any use cases other than acquiring the first
112 * reference for the purpose of calling vfio_add_group_dev() or removing
113 * that symmetric reference after vfio_del_group_dev() should use the raw
114 * iommu_group_{get,put} functions. In particular, vfio_iommu_group_put()
115 * removes the device from the dummy group and cannot be nested.
116 */
117 struct iommu_group *vfio_iommu_group_get(struct device *dev)
118 {
119 struct iommu_group *group;
120 int __maybe_unused ret;
121
122 group = iommu_group_get(dev);
123
124 #ifdef CONFIG_VFIO_NOIOMMU
125 /*
126 * With noiommu enabled, an IOMMU group will be created for a device
127 * that doesn't already have one and doesn't have iommu_ops on its
128 * bus. We set iommudata simply to be able to identify these groups
129 * as special use and for reclamation later.
130 */
131 if (group || !noiommu || iommu_present(dev->bus))
132 return group;
133
134 group = iommu_group_alloc();
135 if (IS_ERR(group))
136 return NULL;
137
138 iommu_group_set_name(group, "vfio-noiommu");
139 iommu_group_set_iommudata(group, &noiommu, NULL);
140 ret = iommu_group_add_device(group, dev);
141 iommu_group_put(group);
142 if (ret)
143 return NULL;
144
145 /*
146 * Where to taint? At this point we've added an IOMMU group for a
147 * device that is not backed by iommu_ops, therefore any iommu_
148 * callback using iommu_ops can legitimately Oops. So, while we may
149 * be about to give a DMA capable device to a user without IOMMU
150 * protection, which is clearly taint-worthy, let's go ahead and do
151 * it here.
152 */
153 add_taint(TAINT_USER, LOCKDEP_STILL_OK);
154 dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
155 #endif
156
157 return group;
158 }
159 EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
160
161 void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
162 {
163 #ifdef CONFIG_VFIO_NOIOMMU
164 if (iommu_group_get_iommudata(group) == &noiommu)
165 iommu_group_remove_device(dev);
166 #endif
167
168 iommu_group_put(group);
169 }
170 EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
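
/*
 * Editor's illustration (not part of the original source): a minimal sketch
 * of the probe-side pairing described in the comment above, as a hypothetical
 * VFIO bus driver might use it.  example_probe, example_vfio_ops and
 * example_data are placeholder names, not real symbols.
 */
#if 0
static int example_probe(struct device *dev, void *example_data)
{
	struct iommu_group *group;
	int ret;

	/* First reference; may create a "vfio-noiommu" group if enabled */
	group = vfio_iommu_group_get(dev);
	if (!group)
		return -EINVAL;

	ret = vfio_add_group_dev(dev, &example_vfio_ops, example_data);
	if (ret)
		vfio_iommu_group_put(group, dev);	/* undo on failure */

	return ret;
}
#endif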
171
172 #ifdef CONFIG_VFIO_NOIOMMU
173 static void *vfio_noiommu_open(unsigned long arg)
174 {
175 if (arg != VFIO_NOIOMMU_IOMMU)
176 return ERR_PTR(-EINVAL);
177 if (!capable(CAP_SYS_RAWIO))
178 return ERR_PTR(-EPERM);
179
180 return NULL;
181 }
182
183 static void vfio_noiommu_release(void *iommu_data)
184 {
185 }
186
187 static long vfio_noiommu_ioctl(void *iommu_data,
188 unsigned int cmd, unsigned long arg)
189 {
190 if (cmd == VFIO_CHECK_EXTENSION)
191 return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
192
193 return -ENOTTY;
194 }
195
196 static int vfio_noiommu_attach_group(void *iommu_data,
197 struct iommu_group *iommu_group)
198 {
199 return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
200 }
201
202 static void vfio_noiommu_detach_group(void *iommu_data,
203 struct iommu_group *iommu_group)
204 {
205 }
206
207 static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
208 .name = "vfio-noiommu",
209 .owner = THIS_MODULE,
210 .open = vfio_noiommu_open,
211 .release = vfio_noiommu_release,
212 .ioctl = vfio_noiommu_ioctl,
213 .attach_group = vfio_noiommu_attach_group,
214 .detach_group = vfio_noiommu_detach_group,
215 };
216 #endif
217
218
219 /**
220 * IOMMU driver registration
221 */
222 int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
223 {
224 struct vfio_iommu_driver *driver, *tmp;
225
226 driver = kzalloc(sizeof(*driver), GFP_KERNEL);
227 if (!driver)
228 return -ENOMEM;
229
230 driver->ops = ops;
231
232 mutex_lock(&vfio.iommu_drivers_lock);
233
234 /* Check for duplicates */
235 list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
236 if (tmp->ops == ops) {
237 mutex_unlock(&vfio.iommu_drivers_lock);
238 kfree(driver);
239 return -EINVAL;
240 }
241 }
242
243 list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
244
245 mutex_unlock(&vfio.iommu_drivers_lock);
246
247 return 0;
248 }
249 EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
250
251 void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
252 {
253 struct vfio_iommu_driver *driver;
254
255 mutex_lock(&vfio.iommu_drivers_lock);
256 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
257 if (driver->ops == ops) {
258 list_del(&driver->vfio_next);
259 mutex_unlock(&vfio.iommu_drivers_lock);
260 kfree(driver);
261 return;
262 }
263 }
264 mutex_unlock(&vfio.iommu_drivers_lock);
265 }
266 EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
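
/*
 * Editor's illustration (not part of the original source): how an IOMMU
 * backend module would typically plug into the registration calls above.
 * The example_* symbols are placeholders; vfio_iommu_type1 and
 * vfio_iommu_spapr_tce are the in-tree users of this interface.
 */
#if 0
static const struct vfio_iommu_driver_ops example_iommu_ops = {
	.name		= "vfio-example",
	.owner		= THIS_MODULE,
	.open		= example_iommu_open,
	.release	= example_iommu_release,
	.ioctl		= example_iommu_ioctl,
	.attach_group	= example_attach_group,
	.detach_group	= example_detach_group,
};

static int __init example_iommu_init(void)
{
	return vfio_register_iommu_driver(&example_iommu_ops);
}

static void __exit example_iommu_exit(void)
{
	vfio_unregister_iommu_driver(&example_iommu_ops);
}
#endif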
267
268 /**
269 * Group minor allocation/free - both called with vfio.group_lock held
270 */
271 static int vfio_alloc_group_minor(struct vfio_group *group)
272 {
273 return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
274 }
275
276 static void vfio_free_group_minor(int minor)
277 {
278 idr_remove(&vfio.group_idr, minor);
279 }
280
281 static int vfio_iommu_group_notifier(struct notifier_block *nb,
282 unsigned long action, void *data);
283 static void vfio_group_get(struct vfio_group *group);
284
285 /**
286 * Container objects - containers are created when /dev/vfio/vfio is
287 * opened, but their lifecycle extends until the last user is done, so
288 * it's freed via kref. Must support container/group/device being
289 * closed in any order.
290 */
291 static void vfio_container_get(struct vfio_container *container)
292 {
293 kref_get(&container->kref);
294 }
295
296 static void vfio_container_release(struct kref *kref)
297 {
298 struct vfio_container *container;
299 container = container_of(kref, struct vfio_container, kref);
300
301 kfree(container);
302 }
303
304 static void vfio_container_put(struct vfio_container *container)
305 {
306 kref_put(&container->kref, vfio_container_release);
307 }
308
309 static void vfio_group_unlock_and_free(struct vfio_group *group)
310 {
311 mutex_unlock(&vfio.group_lock);
312 /*
313 * Unregister outside of lock. A spurious callback is harmless now
314 * that the group is no longer in vfio.group_list.
315 */
316 iommu_group_unregister_notifier(group->iommu_group, &group->nb);
317 kfree(group);
318 }
319
320 /**
321 * Group objects - create, release, get, put, search
322 */
323 static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
324 {
325 struct vfio_group *group, *tmp;
326 struct device *dev;
327 int ret, minor;
328
329 group = kzalloc(sizeof(*group), GFP_KERNEL);
330 if (!group)
331 return ERR_PTR(-ENOMEM);
332
333 kref_init(&group->kref);
334 INIT_LIST_HEAD(&group->device_list);
335 mutex_init(&group->device_lock);
336 INIT_LIST_HEAD(&group->unbound_list);
337 mutex_init(&group->unbound_lock);
338 atomic_set(&group->container_users, 0);
339 atomic_set(&group->opened, 0);
340 group->iommu_group = iommu_group;
341 #ifdef CONFIG_VFIO_NOIOMMU
342 group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
343 #endif
344 BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
345
346 group->nb.notifier_call = vfio_iommu_group_notifier;
347
348 /*
349 * blocking notifiers acquire a rwsem around registering and hold
350 * it around callback. Therefore, need to register outside of
351 * vfio.group_lock to avoid A-B/B-A contention. Our callback won't
352 * do anything unless it can find the group in vfio.group_list, so
353 * no harm in registering early.
354 */
355 ret = iommu_group_register_notifier(iommu_group, &group->nb);
356 if (ret) {
357 kfree(group);
358 return ERR_PTR(ret);
359 }
360
361 mutex_lock(&vfio.group_lock);
362
363 /* Did we race creating this group? */
364 list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
365 if (tmp->iommu_group == iommu_group) {
366 vfio_group_get(tmp);
367 vfio_group_unlock_and_free(group);
368 return tmp;
369 }
370 }
371
372 minor = vfio_alloc_group_minor(group);
373 if (minor < 0) {
374 vfio_group_unlock_and_free(group);
375 return ERR_PTR(minor);
376 }
377
378 dev = device_create(vfio.class, NULL,
379 MKDEV(MAJOR(vfio.group_devt), minor),
380 group, "%s%d", group->noiommu ? "noiommu-" : "",
381 iommu_group_id(iommu_group));
382 if (IS_ERR(dev)) {
383 vfio_free_group_minor(minor);
384 vfio_group_unlock_and_free(group);
385 return (struct vfio_group *)dev; /* ERR_PTR */
386 }
387
388 group->minor = minor;
389 group->dev = dev;
390
391 list_add(&group->vfio_next, &vfio.group_list);
392
393 mutex_unlock(&vfio.group_lock);
394
395 return group;
396 }
397
398 /* called with vfio.group_lock held */
399 static void vfio_group_release(struct kref *kref)
400 {
401 struct vfio_group *group = container_of(kref, struct vfio_group, kref);
402 struct vfio_unbound_dev *unbound, *tmp;
403 struct iommu_group *iommu_group = group->iommu_group;
404
405 WARN_ON(!list_empty(&group->device_list));
406 WARN_ON(group->notifier.head);
407
408 list_for_each_entry_safe(unbound, tmp,
409 &group->unbound_list, unbound_next) {
410 list_del(&unbound->unbound_next);
411 kfree(unbound);
412 }
413
414 device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
415 list_del(&group->vfio_next);
416 vfio_free_group_minor(group->minor);
417 vfio_group_unlock_and_free(group);
418 iommu_group_put(iommu_group);
419 }
420
421 static void vfio_group_put(struct vfio_group *group)
422 {
423 kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
424 }
425
426 /* Assume group_lock or group reference is held */
427 static void vfio_group_get(struct vfio_group *group)
428 {
429 kref_get(&group->kref);
430 }
431
432 /*
433 * Not really a try as we will sleep for mutex, but we need to make
434 * sure the group pointer is valid under lock and get a reference.
435 */
436 static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
437 {
438 struct vfio_group *target = group;
439
440 mutex_lock(&vfio.group_lock);
441 list_for_each_entry(group, &vfio.group_list, vfio_next) {
442 if (group == target) {
443 vfio_group_get(group);
444 mutex_unlock(&vfio.group_lock);
445 return group;
446 }
447 }
448 mutex_unlock(&vfio.group_lock);
449
450 return NULL;
451 }
452
453 static
454 struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
455 {
456 struct vfio_group *group;
457
458 mutex_lock(&vfio.group_lock);
459 list_for_each_entry(group, &vfio.group_list, vfio_next) {
460 if (group->iommu_group == iommu_group) {
461 vfio_group_get(group);
462 mutex_unlock(&vfio.group_lock);
463 return group;
464 }
465 }
466 mutex_unlock(&vfio.group_lock);
467
468 return NULL;
469 }
470
471 static struct vfio_group *vfio_group_get_from_minor(int minor)
472 {
473 struct vfio_group *group;
474
475 mutex_lock(&vfio.group_lock);
476 group = idr_find(&vfio.group_idr, minor);
477 if (!group) {
478 mutex_unlock(&vfio.group_lock);
479 return NULL;
480 }
481 vfio_group_get(group);
482 mutex_unlock(&vfio.group_lock);
483
484 return group;
485 }
486
487 static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
488 {
489 struct iommu_group *iommu_group;
490 struct vfio_group *group;
491
492 iommu_group = iommu_group_get(dev);
493 if (!iommu_group)
494 return NULL;
495
496 group = vfio_group_get_from_iommu(iommu_group);
497 iommu_group_put(iommu_group);
498
499 return group;
500 }
501
502 /**
503 * Device objects - create, release, get, put, search
504 */
505 static
506 struct vfio_device *vfio_group_create_device(struct vfio_group *group,
507 struct device *dev,
508 const struct vfio_device_ops *ops,
509 void *device_data)
510 {
511 struct vfio_device *device;
512
513 device = kzalloc(sizeof(*device), GFP_KERNEL);
514 if (!device)
515 return ERR_PTR(-ENOMEM);
516
517 kref_init(&device->kref);
518 device->dev = dev;
519 device->group = group;
520 device->ops = ops;
521 device->device_data = device_data;
522 dev_set_drvdata(dev, device);
523
524 /* No need to get group_lock, caller has group reference */
525 vfio_group_get(group);
526
527 mutex_lock(&group->device_lock);
528 list_add(&device->group_next, &group->device_list);
529 mutex_unlock(&group->device_lock);
530
531 return device;
532 }
533
534 static void vfio_device_release(struct kref *kref)
535 {
536 struct vfio_device *device = container_of(kref,
537 struct vfio_device, kref);
538 struct vfio_group *group = device->group;
539
540 list_del(&device->group_next);
541 mutex_unlock(&group->device_lock);
542
543 dev_set_drvdata(device->dev, NULL);
544
545 kfree(device);
546
547 /* vfio_del_group_dev may be waiting for this device */
548 wake_up(&vfio.release_q);
549 }
550
551 /* Device reference always implies a group reference */
552 void vfio_device_put(struct vfio_device *device)
553 {
554 struct vfio_group *group = device->group;
555 kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
556 vfio_group_put(group);
557 }
558 EXPORT_SYMBOL_GPL(vfio_device_put);
559
560 static void vfio_device_get(struct vfio_device *device)
561 {
562 vfio_group_get(device->group);
563 kref_get(&device->kref);
564 }
565
566 static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
567 struct device *dev)
568 {
569 struct vfio_device *device;
570
571 mutex_lock(&group->device_lock);
572 list_for_each_entry(device, &group->device_list, group_next) {
573 if (device->dev == dev) {
574 vfio_device_get(device);
575 mutex_unlock(&group->device_lock);
576 return device;
577 }
578 }
579 mutex_unlock(&group->device_lock);
580 return NULL;
581 }
582
583 /*
584 * Some drivers, like pci-stub, are only used to prevent other drivers from
585 * claiming a device and are therefore perfectly legitimate for a user owned
586 * group. The pci-stub driver has no dependencies on DMA or the IOVA mapping
587 * of the device, but it does prevent the user from having direct access to
588 * the device, which is useful in some circumstances.
589 *
590 * We also assume that we can include PCI interconnect devices, ie. bridges.
591 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
592 * then all of the downstream devices will be part of the same IOMMU group as
593 * the bridge. Thus, if placing the bridge into the user owned IOVA space
594 * breaks anything, it only does so for user owned devices downstream. Note
595 * that error notification via MSI can be affected for platforms that handle
596 * MSI within the same IOVA space as DMA.
597 */
598 static const char * const vfio_driver_whitelist[] = { "pci-stub" };
599
600 static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
601 {
602 int i;
603
604 if (dev_is_pci(dev)) {
605 struct pci_dev *pdev = to_pci_dev(dev);
606
607 if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
608 return true;
609 }
610
611 for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) {
612 if (!strcmp(drv->name, vfio_driver_whitelist[i]))
613 return true;
614 }
615
616 return false;
617 }
618
619 /*
620 * A vfio group is viable for use by userspace if all devices are in
621 * one of the following states:
622 * - driver-less
623 * - bound to a vfio driver
624 * - bound to a whitelisted driver
625 * - a PCI interconnect device
626 *
627 * We use two methods to determine whether a device is bound to a vfio
628 * driver. The first is to test whether the device exists in the vfio
629 * group. The second is to test if the device exists on the group
630 * unbound_list, indicating it's in the middle of transitioning from
631 * a vfio driver to driver-less.
632 */
633 static int vfio_dev_viable(struct device *dev, void *data)
634 {
635 struct vfio_group *group = data;
636 struct vfio_device *device;
637 struct device_driver *drv = ACCESS_ONCE(dev->driver);
638 struct vfio_unbound_dev *unbound;
639 int ret = -EINVAL;
640
641 mutex_lock(&group->unbound_lock);
642 list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
643 if (dev == unbound->dev) {
644 ret = 0;
645 break;
646 }
647 }
648 mutex_unlock(&group->unbound_lock);
649
650 if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
651 return 0;
652
653 device = vfio_group_get_device(group, dev);
654 if (device) {
655 vfio_device_put(device);
656 return 0;
657 }
658
659 return ret;
660 }
661
662 /**
663 * Async device support
664 */
665 static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
666 {
667 struct vfio_device *device;
668
669 /* Do we already know about it? We shouldn't */
670 device = vfio_group_get_device(group, dev);
671 if (WARN_ON_ONCE(device)) {
672 vfio_device_put(device);
673 return 0;
674 }
675
676 /* Nothing to do for idle groups */
677 if (!atomic_read(&group->container_users))
678 return 0;
679
680 /* TODO Prevent device auto probing */
681 WARN(1, "Device %s added to live group %d!\n", dev_name(dev),
682 iommu_group_id(group->iommu_group));
683
684 return 0;
685 }
686
687 static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
688 {
689 /* We don't care what happens when the group isn't in use */
690 if (!atomic_read(&group->container_users))
691 return 0;
692
693 return vfio_dev_viable(dev, group);
694 }
695
696 static int vfio_iommu_group_notifier(struct notifier_block *nb,
697 unsigned long action, void *data)
698 {
699 struct vfio_group *group = container_of(nb, struct vfio_group, nb);
700 struct device *dev = data;
701 struct vfio_unbound_dev *unbound;
702
703 /*
704 * Need to go through a group_lock lookup to get a reference or we
705 * risk racing a group being removed. Ignore spurious notifies.
706 */
707 group = vfio_group_try_get(group);
708 if (!group)
709 return NOTIFY_OK;
710
711 switch (action) {
712 case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
713 vfio_group_nb_add_dev(group, dev);
714 break;
715 case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
716 /*
717 * Nothing to do here. If the device is in use, then the
718 * vfio sub-driver should block the remove callback until
719 * it is unused. If the device is unused or attached to a
720 * stub driver, then it should be released and we don't
721 * care that it will be going away.
722 */
723 break;
724 case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
725 pr_debug("%s: Device %s, group %d binding to driver\n",
726 __func__, dev_name(dev),
727 iommu_group_id(group->iommu_group));
728 break;
729 case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
730 pr_debug("%s: Device %s, group %d bound to driver %s\n",
731 __func__, dev_name(dev),
732 iommu_group_id(group->iommu_group), dev->driver->name);
733 BUG_ON(vfio_group_nb_verify(group, dev));
734 break;
735 case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
736 pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
737 __func__, dev_name(dev),
738 iommu_group_id(group->iommu_group), dev->driver->name);
739 break;
740 case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
741 pr_debug("%s: Device %s, group %d unbound from driver\n",
742 __func__, dev_name(dev),
743 iommu_group_id(group->iommu_group));
744 /*
745 * XXX An unbound device in a live group is ok, but we'd
746 * really like to avoid the above BUG_ON by preventing other
747 * drivers from binding to it. Once that occurs, we have to
748 * stop the system to maintain isolation. At a minimum, we'd
749 * want a toggle to disable driver auto probe for this device.
750 */
751
752 mutex_lock(&group->unbound_lock);
753 list_for_each_entry(unbound,
754 &group->unbound_list, unbound_next) {
755 if (dev == unbound->dev) {
756 list_del(&unbound->unbound_next);
757 kfree(unbound);
758 break;
759 }
760 }
761 mutex_unlock(&group->unbound_lock);
762 break;
763 }
764
765 vfio_group_put(group);
766 return NOTIFY_OK;
767 }
768
769 /**
770 * VFIO driver API
771 */
772 int vfio_add_group_dev(struct device *dev,
773 const struct vfio_device_ops *ops, void *device_data)
774 {
775 struct iommu_group *iommu_group;
776 struct vfio_group *group;
777 struct vfio_device *device;
778
779 iommu_group = iommu_group_get(dev);
780 if (!iommu_group)
781 return -EINVAL;
782
783 group = vfio_group_get_from_iommu(iommu_group);
784 if (!group) {
785 group = vfio_create_group(iommu_group);
786 if (IS_ERR(group)) {
787 iommu_group_put(iommu_group);
788 return PTR_ERR(group);
789 }
790 } else {
791 /*
792 * A found vfio_group already holds a reference to the
793 * iommu_group. A created vfio_group keeps the reference.
794 */
795 iommu_group_put(iommu_group);
796 }
797
798 device = vfio_group_get_device(group, dev);
799 if (device) {
800 WARN(1, "Device %s already exists on group %d\n",
801 dev_name(dev), iommu_group_id(iommu_group));
802 vfio_device_put(device);
803 vfio_group_put(group);
804 return -EBUSY;
805 }
806
807 device = vfio_group_create_device(group, dev, ops, device_data);
808 if (IS_ERR(device)) {
809 vfio_group_put(group);
810 return PTR_ERR(device);
811 }
812
813 /*
814 * Drop all but the vfio_device reference. The vfio_device holds
815 * a reference to the vfio_group, which holds a reference to the
816 * iommu_group.
817 */
818 vfio_group_put(group);
819
820 return 0;
821 }
822 EXPORT_SYMBOL_GPL(vfio_add_group_dev);
823
824 /**
825 * Get a reference to the vfio_device for a device. Even if the
826 * caller thinks they own the device, they could be racing with a
827 * release call path, so we can't trust drvdata for the shortcut.
828 * Go the long way around, from the iommu_group to the vfio_group
829 * to the vfio_device.
830 */
831 struct vfio_device *vfio_device_get_from_dev(struct device *dev)
832 {
833 struct vfio_group *group;
834 struct vfio_device *device;
835
836 group = vfio_group_get_from_dev(dev);
837 if (!group)
838 return NULL;
839
840 device = vfio_group_get_device(group, dev);
841 vfio_group_put(group);
842
843 return device;
844 }
845 EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
846
847 static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
848 char *buf)
849 {
850 struct vfio_device *it, *device = NULL;
851
852 mutex_lock(&group->device_lock);
853 list_for_each_entry(it, &group->device_list, group_next) {
854 if (!strcmp(dev_name(it->dev), buf)) {
855 device = it;
856 vfio_device_get(device);
857 break;
858 }
859 }
860 mutex_unlock(&group->device_lock);
861
862 return device;
863 }
864
865 /*
866 * Caller must hold a reference to the vfio_device
867 */
868 void *vfio_device_data(struct vfio_device *device)
869 {
870 return device->device_data;
871 }
872 EXPORT_SYMBOL_GPL(vfio_device_data);
873
874 /* Given a referenced group, check if it contains the device */
875 static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
876 {
877 struct vfio_device *device;
878
879 device = vfio_group_get_device(group, dev);
880 if (!device)
881 return false;
882
883 vfio_device_put(device);
884 return true;
885 }
886
887 /*
888 * Decrement the device reference count and wait for the device to be
889 * removed. Open file descriptors for the device... */
890 void *vfio_del_group_dev(struct device *dev)
891 {
892 struct vfio_device *device = dev_get_drvdata(dev);
893 struct vfio_group *group = device->group;
894 void *device_data = device->device_data;
895 struct vfio_unbound_dev *unbound;
896 unsigned int i = 0;
897 long ret;
898 bool interrupted = false;
899
900 /*
901 * The group exists so long as we have a device reference. Get
902 * a group reference and use it to scan for the device going away.
903 */
904 vfio_group_get(group);
905
906 /*
907 * When the device is removed from the group, the group suddenly
908 * becomes non-viable; the device has a driver (until the unbind
909 * completes), but it's not present in the group. This is bad news
910 * for any external users that need to re-acquire a group reference
911 * in order to match and release their existing reference. To
912 * solve this, we track such devices on the unbound_list to bridge
913 * the gap until they're fully unbound.
914 */
915 unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
916 if (unbound) {
917 unbound->dev = dev;
918 mutex_lock(&group->unbound_lock);
919 list_add(&unbound->unbound_next, &group->unbound_list);
920 mutex_unlock(&group->unbound_lock);
921 }
922 WARN_ON(!unbound);
923
924 vfio_device_put(device);
925
926 /*
927 * If the device is still present in the group after the above
928 * 'put', then it is in use and we need to request it from the
929 * bus driver. The driver may in turn need to request the
930 * device from the user. We send the request on an arbitrary
931 * interval with counter to allow the driver to take escalating
932 * measures to release the device if it has the ability to do so.
933 */
934 do {
935 device = vfio_group_get_device(group, dev);
936 if (!device)
937 break;
938
939 if (device->ops->request)
940 device->ops->request(device_data, i++);
941
942 vfio_device_put(device);
943
944 if (interrupted) {
945 ret = wait_event_timeout(vfio.release_q,
946 !vfio_dev_present(group, dev), HZ * 10);
947 } else {
948 ret = wait_event_interruptible_timeout(vfio.release_q,
949 !vfio_dev_present(group, dev), HZ * 10);
950 if (ret == -ERESTARTSYS) {
951 interrupted = true;
952 dev_warn(dev,
953 "Device is currently in use, task"
954 " \"%s\" (%d) "
955 "blocked until device is released",
956 current->comm, task_pid_nr(current));
957 }
958 }
959 } while (ret <= 0);
960
961 vfio_group_put(group);
962
963 return device_data;
964 }
965 EXPORT_SYMBOL_GPL(vfio_del_group_dev);
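
/*
 * Editor's illustration (not part of the original source): the remove-side
 * counterpart to the earlier probe sketch, showing the symmetric teardown a
 * hypothetical bus driver would perform.  vfio-pci follows this same shape;
 * example_remove and example_data are placeholder names.
 */
#if 0
static void example_remove(struct device *dev)
{
	void *example_data;

	/* Blocks (with periodic ops->request callbacks) until users let go */
	example_data = vfio_del_group_dev(dev);

	/* Drop the probe-time reference taken via vfio_iommu_group_get() */
	vfio_iommu_group_put(dev->iommu_group, dev);

	kfree(example_data);
}
#endif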
966
967 /**
968 * VFIO base fd, /dev/vfio/vfio
969 */
970 static long vfio_ioctl_check_extension(struct vfio_container *container,
971 unsigned long arg)
972 {
973 struct vfio_iommu_driver *driver;
974 long ret = 0;
975
976 down_read(&container->group_lock);
977
978 driver = container->iommu_driver;
979
980 switch (arg) {
981 /* No base extensions yet */
982 default:
983 /*
984 * If no driver is set, poll all registered drivers for
985 * extensions and return the first positive result. If
986 * a driver is already set, further queries will be passed
987 * only to that driver.
988 */
989 if (!driver) {
990 mutex_lock(&vfio.iommu_drivers_lock);
991 list_for_each_entry(driver, &vfio.iommu_drivers_list,
992 vfio_next) {
993
994 #ifdef CONFIG_VFIO_NOIOMMU
995 if (!list_empty(&container->group_list) &&
996 (container->noiommu !=
997 (driver->ops == &vfio_noiommu_ops)))
998 continue;
999 #endif
1000
1001 if (!try_module_get(driver->ops->owner))
1002 continue;
1003
1004 ret = driver->ops->ioctl(NULL,
1005 VFIO_CHECK_EXTENSION,
1006 arg);
1007 module_put(driver->ops->owner);
1008 if (ret > 0)
1009 break;
1010 }
1011 mutex_unlock(&vfio.iommu_drivers_lock);
1012 } else
1013 ret = driver->ops->ioctl(container->iommu_data,
1014 VFIO_CHECK_EXTENSION, arg);
1015 }
1016
1017 up_read(&container->group_lock);
1018
1019 return ret;
1020 }
1021
1022 /* hold write lock on container->group_lock */
1023 static int __vfio_container_attach_groups(struct vfio_container *container,
1024 struct vfio_iommu_driver *driver,
1025 void *data)
1026 {
1027 struct vfio_group *group;
1028 int ret = -ENODEV;
1029
1030 list_for_each_entry(group, &container->group_list, container_next) {
1031 ret = driver->ops->attach_group(data, group->iommu_group);
1032 if (ret)
1033 goto unwind;
1034 }
1035
1036 return ret;
1037
1038 unwind:
1039 list_for_each_entry_continue_reverse(group, &container->group_list,
1040 container_next) {
1041 driver->ops->detach_group(data, group->iommu_group);
1042 }
1043
1044 return ret;
1045 }
1046
1047 static long vfio_ioctl_set_iommu(struct vfio_container *container,
1048 unsigned long arg)
1049 {
1050 struct vfio_iommu_driver *driver;
1051 long ret = -ENODEV;
1052
1053 down_write(&container->group_lock);
1054
1055 /*
1056 * The container is designed to be an unprivileged interface while
1057 * the group can be assigned to specific users. Therefore, only by
1058 * adding a group to a container does the user get the privilege of
1059 * enabling the iommu, which may allocate finite resources. There
1060 * is no unset_iommu, but by removing all the groups from a container,
1061 * the container is deprivileged and returns to an unset state.
1062 */
1063 if (list_empty(&container->group_list) || container->iommu_driver) {
1064 up_write(&container->group_lock);
1065 return -EINVAL;
1066 }
1067
1068 mutex_lock(&vfio.iommu_drivers_lock);
1069 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
1070 void *data;
1071
1072 #ifdef CONFIG_VFIO_NOIOMMU
1073 /*
1074 * Only noiommu containers can use vfio-noiommu and noiommu
1075 * containers can only use vfio-noiommu.
1076 */
1077 if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
1078 continue;
1079 #endif
1080
1081 if (!try_module_get(driver->ops->owner))
1082 continue;
1083
1084 /*
1085 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
1086 * so test which iommu driver reported support for this
1087 * extension and call open on them. We also pass them the
1088 * magic, allowing a single driver to support multiple
1089 * interfaces if they'd like.
1090 */
1091 if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
1092 module_put(driver->ops->owner);
1093 continue;
1094 }
1095
1096 data = driver->ops->open(arg);
1097 if (IS_ERR(data)) {
1098 ret = PTR_ERR(data);
1099 module_put(driver->ops->owner);
1100 continue;
1101 }
1102
1103 ret = __vfio_container_attach_groups(container, driver, data);
1104 if (ret) {
1105 driver->ops->release(data);
1106 module_put(driver->ops->owner);
1107 continue;
1108 }
1109
1110 container->iommu_driver = driver;
1111 container->iommu_data = data;
1112 break;
1113 }
1114
1115 mutex_unlock(&vfio.iommu_drivers_lock);
1116 up_write(&container->group_lock);
1117
1118 return ret;
1119 }
1120
1121 static long vfio_fops_unl_ioctl(struct file *filep,
1122 unsigned int cmd, unsigned long arg)
1123 {
1124 struct vfio_container *container = filep->private_data;
1125 struct vfio_iommu_driver *driver;
1126 void *data;
1127 long ret = -EINVAL;
1128
1129 if (!container)
1130 return ret;
1131
1132 switch (cmd) {
1133 case VFIO_GET_API_VERSION:
1134 ret = VFIO_API_VERSION;
1135 break;
1136 case VFIO_CHECK_EXTENSION:
1137 ret = vfio_ioctl_check_extension(container, arg);
1138 break;
1139 case VFIO_SET_IOMMU:
1140 ret = vfio_ioctl_set_iommu(container, arg);
1141 break;
1142 default:
1143 down_read(&container->group_lock);
1144
1145 driver = container->iommu_driver;
1146 data = container->iommu_data;
1147
1148 if (driver) /* passthrough all unrecognized ioctls */
1149 ret = driver->ops->ioctl(data, cmd, arg);
1150
1151 up_read(&container->group_lock);
1152 }
1153
1154 return ret;
1155 }
1156
1157 #ifdef CONFIG_COMPAT
1158 static long vfio_fops_compat_ioctl(struct file *filep,
1159 unsigned int cmd, unsigned long arg)
1160 {
1161 arg = (unsigned long)compat_ptr(arg);
1162 return vfio_fops_unl_ioctl(filep, cmd, arg);
1163 }
1164 #endif /* CONFIG_COMPAT */
1165
1166 static int vfio_fops_open(struct inode *inode, struct file *filep)
1167 {
1168 struct vfio_container *container;
1169
1170 container = kzalloc(sizeof(*container), GFP_KERNEL);
1171 if (!container)
1172 return -ENOMEM;
1173
1174 INIT_LIST_HEAD(&container->group_list);
1175 init_rwsem(&container->group_lock);
1176 kref_init(&container->kref);
1177
1178 filep->private_data = container;
1179
1180 return 0;
1181 }
1182
1183 static int vfio_fops_release(struct inode *inode, struct file *filep)
1184 {
1185 struct vfio_container *container = filep->private_data;
1186
1187 filep->private_data = NULL;
1188
1189 vfio_container_put(container);
1190
1191 return 0;
1192 }
1193
1194 /*
1195 * Once an iommu driver is set, we optionally pass read/write/mmap
1196 * on to the driver, allowing management interfaces beyond ioctl.
1197 */
1198 static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
1199 size_t count, loff_t *ppos)
1200 {
1201 struct vfio_container *container = filep->private_data;
1202 struct vfio_iommu_driver *driver;
1203 ssize_t ret = -EINVAL;
1204
1205 down_read(&container->group_lock);
1206
1207 driver = container->iommu_driver;
1208 if (likely(driver && driver->ops->read))
1209 ret = driver->ops->read(container->iommu_data,
1210 buf, count, ppos);
1211
1212 up_read(&container->group_lock);
1213
1214 return ret;
1215 }
1216
1217 static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
1218 size_t count, loff_t *ppos)
1219 {
1220 struct vfio_container *container = filep->private_data;
1221 struct vfio_iommu_driver *driver;
1222 ssize_t ret = -EINVAL;
1223
1224 down_read(&container->group_lock);
1225
1226 driver = container->iommu_driver;
1227 if (likely(driver && driver->ops->write))
1228 ret = driver->ops->write(container->iommu_data,
1229 buf, count, ppos);
1230
1231 up_read(&container->group_lock);
1232
1233 return ret;
1234 }
1235
1236 static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1237 {
1238 struct vfio_container *container = filep->private_data;
1239 struct vfio_iommu_driver *driver;
1240 int ret = -EINVAL;
1241
1242 down_read(&container->group_lock);
1243
1244 driver = container->iommu_driver;
1245 if (likely(driver && driver->ops->mmap))
1246 ret = driver->ops->mmap(container->iommu_data, vma);
1247
1248 up_read(&container->group_lock);
1249
1250 return ret;
1251 }
1252
1253 static const struct file_operations vfio_fops = {
1254 .owner = THIS_MODULE,
1255 .open = vfio_fops_open,
1256 .release = vfio_fops_release,
1257 .read = vfio_fops_read,
1258 .write = vfio_fops_write,
1259 .unlocked_ioctl = vfio_fops_unl_ioctl,
1260 #ifdef CONFIG_COMPAT
1261 .compat_ioctl = vfio_fops_compat_ioctl,
1262 #endif
1263 .mmap = vfio_fops_mmap,
1264 };
1265
1266 /**
1267 * VFIO Group fd, /dev/vfio/$GROUP
1268 */
1269 static void __vfio_group_unset_container(struct vfio_group *group)
1270 {
1271 struct vfio_container *container = group->container;
1272 struct vfio_iommu_driver *driver;
1273
1274 down_write(&container->group_lock);
1275
1276 driver = container->iommu_driver;
1277 if (driver)
1278 driver->ops->detach_group(container->iommu_data,
1279 group->iommu_group);
1280
1281 group->container = NULL;
1282 list_del(&group->container_next);
1283
1284 /* Detaching the last group deprivileges a container, remove iommu */
1285 if (driver && list_empty(&container->group_list)) {
1286 driver->ops->release(container->iommu_data);
1287 module_put(driver->ops->owner);
1288 container->iommu_driver = NULL;
1289 container->iommu_data = NULL;
1290 }
1291
1292 up_write(&container->group_lock);
1293
1294 vfio_container_put(container);
1295 }
1296
1297 /*
1298 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1299 * if there was no container to unset. Since the ioctl is called on
1300 * the group, we know that still exists, therefore the only valid
1301 * transition here is 1->0.
1302 */
1303 static int vfio_group_unset_container(struct vfio_group *group)
1304 {
1305 int users = atomic_cmpxchg(&group->container_users, 1, 0);
1306
1307 if (!users)
1308 return -EINVAL;
1309 if (users != 1)
1310 return -EBUSY;
1311
1312 __vfio_group_unset_container(group);
1313
1314 return 0;
1315 }
1316
1317 /*
1318 * When removing container users, anything that removes the last user
1319 * implicitly removes the group from the container. That is, if the
1320 * group file descriptor is closed, as well as any device file descriptors,
1321 * the group is free.
1322 */
1323 static void vfio_group_try_dissolve_container(struct vfio_group *group)
1324 {
1325 if (0 == atomic_dec_if_positive(&group->container_users))
1326 __vfio_group_unset_container(group);
1327 }
1328
1329 static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1330 {
1331 struct fd f;
1332 struct vfio_container *container;
1333 struct vfio_iommu_driver *driver;
1334 int ret = 0;
1335
1336 if (atomic_read(&group->container_users))
1337 return -EINVAL;
1338
1339 if (group->noiommu && !capable(CAP_SYS_RAWIO))
1340 return -EPERM;
1341
1342 f = fdget(container_fd);
1343 if (!f.file)
1344 return -EBADF;
1345
1346 /* Sanity check, is this really our fd? */
1347 if (f.file->f_op != &vfio_fops) {
1348 fdput(f);
1349 return -EINVAL;
1350 }
1351
1352 container = f.file->private_data;
1353 WARN_ON(!container); /* fget ensures we don't race vfio_release */
1354
1355 down_write(&container->group_lock);
1356
1357 /* Real groups and fake groups cannot mix */
1358 if (!list_empty(&container->group_list) &&
1359 container->noiommu != group->noiommu) {
1360 ret = -EPERM;
1361 goto unlock_out;
1362 }
1363
1364 driver = container->iommu_driver;
1365 if (driver) {
1366 ret = driver->ops->attach_group(container->iommu_data,
1367 group->iommu_group);
1368 if (ret)
1369 goto unlock_out;
1370 }
1371
1372 group->container = container;
1373 container->noiommu = group->noiommu;
1374 list_add(&group->container_next, &container->group_list);
1375
1376 /* Get a reference on the container and mark a user within the group */
1377 vfio_container_get(container);
1378 atomic_inc(&group->container_users);
1379
1380 unlock_out:
1381 up_write(&container->group_lock);
1382 fdput(f);
1383 return ret;
1384 }
1385
1386 static bool vfio_group_viable(struct vfio_group *group)
1387 {
1388 return (iommu_group_for_each_dev(group->iommu_group,
1389 group, vfio_dev_viable) == 0);
1390 }
1391
1392 static int vfio_group_add_container_user(struct vfio_group *group)
1393 {
1394 if (!atomic_inc_not_zero(&group->container_users))
1395 return -EINVAL;
1396
1397 if (group->noiommu) {
1398 atomic_dec(&group->container_users);
1399 return -EPERM;
1400 }
1401 if (!group->container->iommu_driver || !vfio_group_viable(group)) {
1402 atomic_dec(&group->container_users);
1403 return -EINVAL;
1404 }
1405
1406 return 0;
1407 }
1408
1409 static const struct file_operations vfio_device_fops;
1410
1411 static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1412 {
1413 struct vfio_device *device;
1414 struct file *filep;
1415 int ret;
1416
1417 if (0 == atomic_read(&group->container_users) ||
1418 !group->container->iommu_driver || !vfio_group_viable(group))
1419 return -EINVAL;
1420
1421 if (group->noiommu && !capable(CAP_SYS_RAWIO))
1422 return -EPERM;
1423
1424 device = vfio_device_get_from_name(group, buf);
1425 if (!device)
1426 return -ENODEV;
1427
1428 ret = device->ops->open(device->device_data);
1429 if (ret) {
1430 vfio_device_put(device);
1431 return ret;
1432 }
1433
1434 /*
1435 * We can't use anon_inode_getfd() because we need to modify
1436 * the f_mode flags directly to allow more than just ioctls
1437 */
1438 ret = get_unused_fd_flags(O_CLOEXEC);
1439 if (ret < 0) {
1440 device->ops->release(device->device_data);
1441 vfio_device_put(device);
1442 return ret;
1443 }
1444
1445 filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1446 device, O_RDWR);
1447 if (IS_ERR(filep)) {
1448 put_unused_fd(ret);
1449 ret = PTR_ERR(filep);
1450 device->ops->release(device->device_data);
1451 vfio_device_put(device);
1452 return ret;
1453 }
1454
1455 /*
1456 * TODO: add an anon_inode interface to do this.
1457 * Appears to be missing by lack of need rather than
1458 * explicitly prevented. Now there's need.
1459 */
1460 filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1461
1462 atomic_inc(&group->container_users);
1463
1464 fd_install(ret, filep);
1465
1466 if (group->noiommu)
1467 dev_warn(device->dev, "vfio-noiommu device opened by user "
1468 "(%s:%d)\n", current->comm, task_pid_nr(current));
1469
1470 return ret;
1471 }
1472
1473 static long vfio_group_fops_unl_ioctl(struct file *filep,
1474 unsigned int cmd, unsigned long arg)
1475 {
1476 struct vfio_group *group = filep->private_data;
1477 long ret = -ENOTTY;
1478
1479 switch (cmd) {
1480 case VFIO_GROUP_GET_STATUS:
1481 {
1482 struct vfio_group_status status;
1483 unsigned long minsz;
1484
1485 minsz = offsetofend(struct vfio_group_status, flags);
1486
1487 if (copy_from_user(&status, (void __user *)arg, minsz))
1488 return -EFAULT;
1489
1490 if (status.argsz < minsz)
1491 return -EINVAL;
1492
1493 status.flags = 0;
1494
1495 if (vfio_group_viable(group))
1496 status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1497
1498 if (group->container)
1499 status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1500
1501 if (copy_to_user((void __user *)arg, &status, minsz))
1502 return -EFAULT;
1503
1504 ret = 0;
1505 break;
1506 }
1507 case VFIO_GROUP_SET_CONTAINER:
1508 {
1509 int fd;
1510
1511 if (get_user(fd, (int __user *)arg))
1512 return -EFAULT;
1513
1514 if (fd < 0)
1515 return -EINVAL;
1516
1517 ret = vfio_group_set_container(group, fd);
1518 break;
1519 }
1520 case VFIO_GROUP_UNSET_CONTAINER:
1521 ret = vfio_group_unset_container(group);
1522 break;
1523 case VFIO_GROUP_GET_DEVICE_FD:
1524 {
1525 char *buf;
1526
1527 buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1528 if (IS_ERR(buf))
1529 return PTR_ERR(buf);
1530
1531 ret = vfio_group_get_device_fd(group, buf);
1532 kfree(buf);
1533 break;
1534 }
1535 }
1536
1537 return ret;
1538 }
1539
1540 #ifdef CONFIG_COMPAT
1541 static long vfio_group_fops_compat_ioctl(struct file *filep,
1542 unsigned int cmd, unsigned long arg)
1543 {
1544 arg = (unsigned long)compat_ptr(arg);
1545 return vfio_group_fops_unl_ioctl(filep, cmd, arg);
1546 }
1547 #endif /* CONFIG_COMPAT */
1548
1549 static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1550 {
1551 struct vfio_group *group;
1552 int opened;
1553
1554 group = vfio_group_get_from_minor(iminor(inode));
1555 if (!group)
1556 return -ENODEV;
1557
1558 if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
1559 vfio_group_put(group);
1560 return -EPERM;
1561 }
1562
1563 /* Do we need multiple instances of the group open? Seems not. */
1564 opened = atomic_cmpxchg(&group->opened, 0, 1);
1565 if (opened) {
1566 vfio_group_put(group);
1567 return -EBUSY;
1568 }
1569
1570 /* Is something still in use from a previous open? */
1571 if (group->container) {
1572 atomic_dec(&group->opened);
1573 vfio_group_put(group);
1574 return -EBUSY;
1575 }
1576
1577 /* Warn if previous user didn't cleanup and re-init to drop them */
1578 if (WARN_ON(group->notifier.head))
1579 BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
1580
1581 filep->private_data = group;
1582
1583 return 0;
1584 }
1585
1586 static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1587 {
1588 struct vfio_group *group = filep->private_data;
1589
1590 filep->private_data = NULL;
1591
1592 vfio_group_try_dissolve_container(group);
1593
1594 atomic_dec(&group->opened);
1595
1596 vfio_group_put(group);
1597
1598 return 0;
1599 }
1600
1601 static const struct file_operations vfio_group_fops = {
1602 .owner = THIS_MODULE,
1603 .unlocked_ioctl = vfio_group_fops_unl_ioctl,
1604 #ifdef CONFIG_COMPAT
1605 .compat_ioctl = vfio_group_fops_compat_ioctl,
1606 #endif
1607 .open = vfio_group_fops_open,
1608 .release = vfio_group_fops_release,
1609 };
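
/*
 * Editor's illustration (not part of the original source): the character
 * device flow implemented by vfio_fops and vfio_group_fops above, as seen
 * from a hypothetical userspace client.  The sequence mirrors
 * Documentation/vfio.txt; group "26" and device "0000:06:0d.0" are example
 * names only, and error checking is omitted for brevity.
 */
#if 0
	int container, group, device;
	struct vfio_group_status status = { .argsz = sizeof(status) };

	container = open("/dev/vfio/vfio", O_RDWR);
	ioctl(container, VFIO_GET_API_VERSION);		/* == VFIO_API_VERSION */

	group = open("/dev/vfio/26", O_RDWR);
	ioctl(group, VFIO_GROUP_GET_STATUS, &status);	/* check VIABLE flag */
	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);

	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
#endif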
1610
1611 /**
1612 * VFIO Device fd
1613 */
1614 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1615 {
1616 struct vfio_device *device = filep->private_data;
1617
1618 device->ops->release(device->device_data);
1619
1620 vfio_group_try_dissolve_container(device->group);
1621
1622 vfio_device_put(device);
1623
1624 return 0;
1625 }
1626
1627 static long vfio_device_fops_unl_ioctl(struct file *filep,
1628 unsigned int cmd, unsigned long arg)
1629 {
1630 struct vfio_device *device = filep->private_data;
1631
1632 if (unlikely(!device->ops->ioctl))
1633 return -EINVAL;
1634
1635 return device->ops->ioctl(device->device_data, cmd, arg);
1636 }
1637
1638 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1639 size_t count, loff_t *ppos)
1640 {
1641 struct vfio_device *device = filep->private_data;
1642
1643 if (unlikely(!device->ops->read))
1644 return -EINVAL;
1645
1646 return device->ops->read(device->device_data, buf, count, ppos);
1647 }
1648
1649 static ssize_t vfio_device_fops_write(struct file *filep,
1650 const char __user *buf,
1651 size_t count, loff_t *ppos)
1652 {
1653 struct vfio_device *device = filep->private_data;
1654
1655 if (unlikely(!device->ops->write))
1656 return -EINVAL;
1657
1658 return device->ops->write(device->device_data, buf, count, ppos);
1659 }
1660
1661 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1662 {
1663 struct vfio_device *device = filep->private_data;
1664
1665 if (unlikely(!device->ops->mmap))
1666 return -EINVAL;
1667
1668 return device->ops->mmap(device->device_data, vma);
1669 }
1670
1671 #ifdef CONFIG_COMPAT
1672 static long vfio_device_fops_compat_ioctl(struct file *filep,
1673 unsigned int cmd, unsigned long arg)
1674 {
1675 arg = (unsigned long)compat_ptr(arg);
1676 return vfio_device_fops_unl_ioctl(filep, cmd, arg);
1677 }
1678 #endif /* CONFIG_COMPAT */
1679
1680 static const struct file_operations vfio_device_fops = {
1681 .owner = THIS_MODULE,
1682 .release = vfio_device_fops_release,
1683 .read = vfio_device_fops_read,
1684 .write = vfio_device_fops_write,
1685 .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1686 #ifdef CONFIG_COMPAT
1687 .compat_ioctl = vfio_device_fops_compat_ioctl,
1688 #endif
1689 .mmap = vfio_device_fops_mmap,
1690 };
1691
1692 /**
1693 * External user API, exported by symbols to be linked dynamically.
1694 *
1695 * The protocol includes:
1696 * 1. do normal VFIO init operation:
1697 * - opening a new container;
1698 * - attaching group(s) to it;
1699 * - setting an IOMMU driver for a container.
1700 * When IOMMU is set for a container, all groups in it are
1701 * considered ready to use by an external user.
1702 *
1703 * 2. User space passes a group fd to an external user.
1704 * The external user calls vfio_group_get_external_user()
1705 * to verify that:
1706 * - the group is initialized;
1707 * - IOMMU is set for it.
1708 * If both checks passed, vfio_group_get_external_user()
1709 * increments the container user counter to prevent
1710 * the VFIO group from disposal before KVM exits.
1711 *
1712 * 3. The external user calls vfio_external_user_iommu_id()
1713 * to know an IOMMU ID.
1714 *
1715 * 4. When the external KVM finishes, it calls
1716 * vfio_group_put_external_user() to release the VFIO group.
1717 * This call decrements the container user counter.
1718 */
1719 struct vfio_group *vfio_group_get_external_user(struct file *filep)
1720 {
1721 struct vfio_group *group = filep->private_data;
1722 int ret;
1723
1724 if (filep->f_op != &vfio_group_fops)
1725 return ERR_PTR(-EINVAL);
1726
1727 ret = vfio_group_add_container_user(group);
1728 if (ret)
1729 return ERR_PTR(ret);
1730
1731 vfio_group_get(group);
1732
1733 return group;
1734 }
1735 EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
1736
1737 void vfio_group_put_external_user(struct vfio_group *group)
1738 {
1739 vfio_group_try_dissolve_container(group);
1740 vfio_group_put(group);
1741 }
1742 EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1743
1744 int vfio_external_user_iommu_id(struct vfio_group *group)
1745 {
1746 return iommu_group_id(group->iommu_group);
1747 }
1748 EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1749
1750 long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1751 {
1752 return vfio_ioctl_check_extension(group->container, arg);
1753 }
1754 EXPORT_SYMBOL_GPL(vfio_external_check_extension);
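
/*
 * Editor's illustration (not part of the original source): the external-user
 * protocol above as a hypothetical in-kernel consumer would follow it, given
 * a VFIO group fd from userspace (KVM's kvm-vfio device is the in-tree
 * example, which resolves these exports via symbol_get()).
 */
#if 0
	struct fd f = fdget(group_fd);
	struct vfio_group *group;

	if (!f.file)
		return -EBADF;

	group = vfio_group_get_external_user(f.file);
	fdput(f);
	if (IS_ERR(group))
		return PTR_ERR(group);

	/* e.g. record the IOMMU group ID for isolation bookkeeping */
	pr_info("external user holds iommu group %d\n",
		vfio_external_user_iommu_id(group));

	/* ... later, when the external user is done with the group ... */
	vfio_group_put_external_user(group);
#endif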
1755
1756 /**
1757 * Sub-module support
1758 */
1759 /*
1760 * Helper for managing a buffer of info chain capabilities, allocate or
1761 * reallocate a buffer with additional @size, filling in @id and @version
1762 * of the capability. A pointer to the new capability is returned.
1763 *
1764 * NB. The chain is based at the head of the buffer, so new entries are
1765 * added to the tail, vfio_info_cap_shift() should be called to fixup the
1766 * next offsets prior to copying to the user buffer.
1767 */
1768 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1769 size_t size, u16 id, u16 version)
1770 {
1771 void *buf;
1772 struct vfio_info_cap_header *header, *tmp;
1773
1774 buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1775 if (!buf) {
1776 kfree(caps->buf);
1777 caps->size = 0;
1778 return ERR_PTR(-ENOMEM);
1779 }
1780
1781 caps->buf = buf;
1782 header = buf + caps->size;
1783
1784 /* Eventually copied to user buffer, zero */
1785 memset(header, 0, size);
1786
1787 header->id = id;
1788 header->version = version;
1789
1790 /* Add to the end of the capability chain */
1791 for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1792 ; /* nothing */
1793
1794 tmp->next = caps->size;
1795 caps->size += size;
1796
1797 return header;
1798 }
1799 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1800
1801 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1802 {
1803 struct vfio_info_cap_header *tmp;
1804 void *buf = (void *)caps->buf;
1805
1806 for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1807 tmp->next += offset;
1808 }
1809 EXPORT_SYMBOL(vfio_info_cap_shift);
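
/*
 * Editor's illustration (not part of the original source): the intended use
 * of the capability-chain helpers above and the vfio_info_add_capability()
 * wrapper below, roughly as a sub-driver's *_GET_REGION_INFO ioctl handler
 * might apply them (vfio-pci follows this shape).  "info", "sparse" and "arg"
 * stand in for the handler's vfio_region_info, a filled-in sparse-mmap
 * capability and the ioctl argument.
 */
#if 0
	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
	int ret;

	ret = vfio_info_add_capability(&caps, VFIO_REGION_INFO_CAP_SPARSE_MMAP,
				       sparse);
	if (ret)
		return ret;

	if (caps.size) {
		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
		if (info.argsz < sizeof(info) + caps.size) {
			/* Tell userspace how much room to provide next time */
			info.argsz = sizeof(info) + caps.size;
			info.cap_offset = 0;
		} else {
			/* Chain offsets are buffer-relative; rebase past info */
			vfio_info_cap_shift(&caps, sizeof(info));
			if (copy_to_user((void __user *)arg + sizeof(info),
					 caps.buf, caps.size)) {
				kfree(caps.buf);
				return -EFAULT;
			}
			info.cap_offset = sizeof(info);
		}
		kfree(caps.buf);
	}
#endif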
1810
1811 static int sparse_mmap_cap(struct vfio_info_cap *caps, void *cap_type)
1812 {
1813 struct vfio_info_cap_header *header;
1814 struct vfio_region_info_cap_sparse_mmap *sparse_cap, *sparse = cap_type;
1815 size_t size;
1816
1817 size = sizeof(*sparse) + sparse->nr_areas * sizeof(*sparse->areas);
1818 header = vfio_info_cap_add(caps, size,
1819 VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1);
1820 if (IS_ERR(header))
1821 return PTR_ERR(header);
1822
1823 sparse_cap = container_of(header,
1824 struct vfio_region_info_cap_sparse_mmap, header);
1825 sparse_cap->nr_areas = sparse->nr_areas;
1826 memcpy(sparse_cap->areas, sparse->areas,
1827 sparse->nr_areas * sizeof(*sparse->areas));
1828 return 0;
1829 }
1830
1831 static int region_type_cap(struct vfio_info_cap *caps, void *cap_type)
1832 {
1833 struct vfio_info_cap_header *header;
1834 struct vfio_region_info_cap_type *type_cap, *cap = cap_type;
1835
1836 header = vfio_info_cap_add(caps, sizeof(*cap),
1837 VFIO_REGION_INFO_CAP_TYPE, 1);
1838 if (IS_ERR(header))
1839 return PTR_ERR(header);
1840
1841 type_cap = container_of(header, struct vfio_region_info_cap_type,
1842 header);
1843 type_cap->type = cap->type;
1844 type_cap->subtype = cap->subtype;
1845 return 0;
1846 }
1847
1848 int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id,
1849 void *cap_type)
1850 {
1851 int ret = -EINVAL;
1852
1853 if (!cap_type)
1854 return 0;
1855
1856 switch (cap_type_id) {
1857 case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
1858 ret = sparse_mmap_cap(caps, cap_type);
1859 break;
1860
1861 case VFIO_REGION_INFO_CAP_TYPE:
1862 ret = region_type_cap(caps, cap_type);
1863 break;
1864 }
1865
1866 return ret;
1867 }
1868 EXPORT_SYMBOL(vfio_info_add_capability);
1869
1870 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1871 int max_irq_type, size_t *data_size)
1872 {
1873 unsigned long minsz;
1874 size_t size;
1875
1876 minsz = offsetofend(struct vfio_irq_set, count);
1877
1878 if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1879 (hdr->count >= (U32_MAX - hdr->start)) ||
1880 (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1881 VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1882 return -EINVAL;
1883
1884 if (data_size)
1885 *data_size = 0;
1886
1887 if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1888 return -EINVAL;
1889
1890 switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1891 case VFIO_IRQ_SET_DATA_NONE:
1892 size = 0;
1893 break;
1894 case VFIO_IRQ_SET_DATA_BOOL:
1895 size = sizeof(uint8_t);
1896 break;
1897 case VFIO_IRQ_SET_DATA_EVENTFD:
1898 size = sizeof(int32_t);
1899 break;
1900 default:
1901 return -EINVAL;
1902 }
1903
1904 if (size) {
1905 if (hdr->argsz - minsz < hdr->count * size)
1906 return -EINVAL;
1907
1908 if (!data_size)
1909 return -EINVAL;
1910
1911 *data_size = hdr->count * size;
1912 }
1913
1914 return 0;
1915 }
1916 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1917
1918 /*
1919 * Pin a set of guest PFNs and return their associated host PFNs for local
1920 * domain only.
1921 * @dev [in] : device
1922 * @user_pfn [in]: array of user/guest PFNs to be pinned.
1923 * @npage [in] : count of elements in user_pfn array. This count should not
1924 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1925 * @prot [in] : protection flags
1926 * @phys_pfn[out]: array of host PFNs
1927 * Return error or number of pages pinned.
1928 */
1929 int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
1930 int prot, unsigned long *phys_pfn)
1931 {
1932 struct vfio_container *container;
1933 struct vfio_group *group;
1934 struct vfio_iommu_driver *driver;
1935 int ret;
1936
1937 if (!dev || !user_pfn || !phys_pfn || !npage)
1938 return -EINVAL;
1939
1940 if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1941 return -E2BIG;
1942
1943 group = vfio_group_get_from_dev(dev);
1944 if (!group)
1945 return -ENODEV;
1946
1947 ret = vfio_group_add_container_user(group);
1948 if (ret)
1949 goto err_pin_pages;
1950
1951 container = group->container;
1952 down_read(&container->group_lock);
1953
1954 driver = container->iommu_driver;
1955 if (likely(driver && driver->ops->pin_pages))
1956 ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
1957 npage, prot, phys_pfn);
1958 else
1959 ret = -ENOTTY;
1960
1961 up_read(&container->group_lock);
1962 vfio_group_try_dissolve_container(group);
1963
1964 err_pin_pages:
1965 vfio_group_put(group);
1966 return ret;
1967 }
1968 EXPORT_SYMBOL(vfio_pin_pages);
1969
1970 /*
1971 * Unpin set of host PFNs for local domain only.
1972 * @dev [in] : device
1973 * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
1974 * PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1975 * @npage [in] : count of elements in user_pfn array. This count should not
1976 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1977 * Return error or number of pages unpinned.
1978 */
1979 int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
1980 {
1981 struct vfio_container *container;
1982 struct vfio_group *group;
1983 struct vfio_iommu_driver *driver;
1984 int ret;
1985
1986 if (!dev || !user_pfn || !npage)
1987 return -EINVAL;
1988
1989 if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1990 return -E2BIG;
1991
1992 group = vfio_group_get_from_dev(dev);
1993 if (!group)
1994 return -ENODEV;
1995
1996 ret = vfio_group_add_container_user(group);
1997 if (ret)
1998 goto err_unpin_pages;
1999
2000 container = group->container;
2001 down_read(&container->group_lock);
2002
2003 driver = container->iommu_driver;
2004 if (likely(driver && driver->ops->unpin_pages))
2005 ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
2006 npage);
2007 else
2008 ret = -ENOTTY;
2009
2010 up_read(&container->group_lock);
2011 vfio_group_try_dissolve_container(group);
2012
2013 err_unpin_pages:
2014 vfio_group_put(group);
2015 return ret;
2016 }
2017 EXPORT_SYMBOL(vfio_unpin_pages);
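/*
 * Example (illustrative sketch, continuing the vfio_pin_pages() sketch
 * above): the vendor driver releases the pins by passing back the same
 * user/guest PFN array once its DMA has completed.
 *
 *	ret = vfio_unpin_pages(mdev_dev(mdev), user_pfn, NR);
 *	WARN_ON(ret != NR);
 *
 * The return value is the number of pages unpinned or a negative errno.
 */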
2018
2019 static int vfio_register_iommu_notifier(struct vfio_group *group,
2020 unsigned long *events,
2021 struct notifier_block *nb)
2022 {
2023 struct vfio_container *container;
2024 struct vfio_iommu_driver *driver;
2025 int ret;
2026
2027 ret = vfio_group_add_container_user(group);
2028 if (ret)
2029 return -EINVAL;
2030
2031 container = group->container;
2032 down_read(&container->group_lock);
2033
2034 driver = container->iommu_driver;
2035 if (likely(driver && driver->ops->register_notifier))
2036 ret = driver->ops->register_notifier(container->iommu_data,
2037 events, nb);
2038 else
2039 ret = -ENOTTY;
2040
2041 up_read(&container->group_lock);
2042 vfio_group_try_dissolve_container(group);
2043
2044 return ret;
2045 }
2046
2047 static int vfio_unregister_iommu_notifier(struct vfio_group *group,
2048 struct notifier_block *nb)
2049 {
2050 struct vfio_container *container;
2051 struct vfio_iommu_driver *driver;
2052 int ret;
2053
2054 ret = vfio_group_add_container_user(group);
2055 if (ret)
2056 return -EINVAL;
2057
2058 container = group->container;
2059 down_read(&container->group_lock);
2060
2061 driver = container->iommu_driver;
2062 if (likely(driver && driver->ops->unregister_notifier))
2063 ret = driver->ops->unregister_notifier(container->iommu_data,
2064 nb);
2065 else
2066 ret = -ENOTTY;
2067
2068 up_read(&container->group_lock);
2069 vfio_group_try_dissolve_container(group);
2070
2071 return ret;
2072 }
2073
2074 void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
2075 {
2076 group->kvm = kvm;
2077 blocking_notifier_call_chain(&group->notifier,
2078 VFIO_GROUP_NOTIFY_SET_KVM, kvm);
2079 }
2080 EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
2081
2082 static int vfio_register_group_notifier(struct vfio_group *group,
2083 unsigned long *events,
2084 struct notifier_block *nb)
2085 {
2086 struct vfio_container *container;
2087 int ret;
2088 bool set_kvm = false;
2089
2090 if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
2091 set_kvm = true;
2092
2093 /* clear known events */
2094 *events &= ~VFIO_GROUP_NOTIFY_SET_KVM;
2095
2096 /* refuse to continue if any unknown events remain */
2097 if (*events)
2098 return -EINVAL;
2099
2100 ret = vfio_group_add_container_user(group);
2101 if (ret)
2102 return -EINVAL;
2103
2104 container = group->container;
2105 down_read(&container->group_lock);
2106
2107 ret = blocking_notifier_chain_register(&group->notifier, nb);
2108
2109 /*
2110 * The attaching of kvm and vfio_group might have already happened, so
2111 * replay the KVM-set event here once upon registration.
2112 */
2113 if (!ret && set_kvm && group->kvm)
2114 blocking_notifier_call_chain(&group->notifier,
2115 VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);
2116
2117 up_read(&container->group_lock);
2118 vfio_group_try_dissolve_container(group);
2119
2120 return ret;
2121 }
2122
2123 static int vfio_unregister_group_notifier(struct vfio_group *group,
2124 struct notifier_block *nb)
2125 {
2126 struct vfio_container *container;
2127 int ret;
2128
2129 ret = vfio_group_add_container_user(group);
2130 if (ret)
2131 return -EINVAL;
2132
2133 container = group->container;
2134 down_read(&container->group_lock);
2135
2136 ret = blocking_notifier_chain_unregister(&group->notifier, nb);
2137
2138 up_read(&container->group_lock);
2139 vfio_group_try_dissolve_container(group);
2140
2141 return ret;
2142 }
2143
2144 int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
2145 unsigned long *events, struct notifier_block *nb)
2146 {
2147 struct vfio_group *group;
2148 int ret;
2149
2150 if (!dev || !nb || !events || (*events == 0))
2151 return -EINVAL;
2152
2153 group = vfio_group_get_from_dev(dev);
2154 if (!group)
2155 return -ENODEV;
2156
2157 switch (type) {
2158 case VFIO_IOMMU_NOTIFY:
2159 ret = vfio_register_iommu_notifier(group, events, nb);
2160 break;
2161 case VFIO_GROUP_NOTIFY:
2162 ret = vfio_register_group_notifier(group, events, nb);
2163 break;
2164 default:
2165 ret = -EINVAL;
2166 }
2167
2168 vfio_group_put(group);
2169 return ret;
2170 }
2171 EXPORT_SYMBOL(vfio_register_notifier);
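/*
 * Example (illustrative sketch, not part of this file): a vendor driver
 * that pins pages will usually also register for DMA unmap notifications
 * so it can drop its pins when userspace unmaps the IOVA range.  The
 * "vdev" embedding and the callback name below are hypothetical.
 *
 *	unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
 *	int ret;
 *
 *	vdev->iommu_nb.notifier_call = my_dma_unmap_notifier;
 *	ret = vfio_register_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY,
 *				     &events, &vdev->iommu_nb);
 *
 * A matching vfio_unregister_notifier() call (below) must be made before
 * the device is released.
 */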
2172
2173 int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
2174 struct notifier_block *nb)
2175 {
2176 struct vfio_group *group;
2177 int ret;
2178
2179 if (!dev || !nb)
2180 return -EINVAL;
2181
2182 group = vfio_group_get_from_dev(dev);
2183 if (!group)
2184 return -ENODEV;
2185
2186 switch (type) {
2187 case VFIO_IOMMU_NOTIFY:
2188 ret = vfio_unregister_iommu_notifier(group, nb);
2189 break;
2190 case VFIO_GROUP_NOTIFY:
2191 ret = vfio_unregister_group_notifier(group, nb);
2192 break;
2193 default:
2194 ret = -EINVAL;
2195 }
2196
2197 vfio_group_put(group);
2198 return ret;
2199 }
2200 EXPORT_SYMBOL(vfio_unregister_notifier);
2201
2202 /**
2203 * Module/class support
2204 */
2205 static char *vfio_devnode(struct device *dev, umode_t *mode)
2206 {
2207 return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2208 }
2209
2210 static struct miscdevice vfio_dev = {
2211 .minor = VFIO_MINOR,
2212 .name = "vfio",
2213 .fops = &vfio_fops,
2214 .nodename = "vfio/vfio",
2215 .mode = S_IRUGO | S_IWUGO,
2216 };
2217
2218 static int __init vfio_init(void)
2219 {
2220 int ret;
2221
2222 idr_init(&vfio.group_idr);
2223 mutex_init(&vfio.group_lock);
2224 mutex_init(&vfio.iommu_drivers_lock);
2225 INIT_LIST_HEAD(&vfio.group_list);
2226 INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2227 init_waitqueue_head(&vfio.release_q);
2228
2229 ret = misc_register(&vfio_dev);
2230 if (ret) {
2231 pr_err("vfio: misc device register failed\n");
2232 return ret;
2233 }
2234
2235 /* /dev/vfio/$GROUP */
2236 vfio.class = class_create(THIS_MODULE, "vfio");
2237 if (IS_ERR(vfio.class)) {
2238 ret = PTR_ERR(vfio.class);
2239 goto err_class;
2240 }
2241
2242 vfio.class->devnode = vfio_devnode;
2243
2244 ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK, "vfio");
2245 if (ret)
2246 goto err_alloc_chrdev;
2247
2248 cdev_init(&vfio.group_cdev, &vfio_group_fops);
2249 ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK);
2250 if (ret)
2251 goto err_cdev_add;
2252
2253 pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2254
2255 /*
2256 * Attempt to load known iommu-drivers. This gives us a working
2257 * environment without the user needing to explicitly load iommu
2258 * drivers.
2259 */
2260 request_module_nowait("vfio_iommu_type1");
2261 request_module_nowait("vfio_iommu_spapr_tce");
2262
2263 #ifdef CONFIG_VFIO_NOIOMMU
2264 vfio_register_iommu_driver(&vfio_noiommu_ops);
2265 #endif
2266 return 0;
2267
2268 err_cdev_add:
2269 unregister_chrdev_region(vfio.group_devt, MINORMASK);
2270 err_alloc_chrdev:
2271 class_destroy(vfio.class);
2272 vfio.class = NULL;
2273 err_class:
2274 misc_deregister(&vfio_dev);
2275 return ret;
2276 }
2277
2278 static void __exit vfio_cleanup(void)
2279 {
2280 WARN_ON(!list_empty(&vfio.group_list));
2281
2282 #ifdef CONFIG_VFIO_NOIOMMU
2283 vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2284 #endif
2285 idr_destroy(&vfio.group_idr);
2286 cdev_del(&vfio.group_cdev);
2287 unregister_chrdev_region(vfio.group_devt, MINORMASK);
2288 class_destroy(vfio.class);
2289 vfio.class = NULL;
2290 misc_deregister(&vfio_dev);
2291 }
2292
2293 module_init(vfio_init);
2294 module_exit(vfio_cleanup);
2295
2296 MODULE_VERSION(DRIVER_VERSION);
2297 MODULE_LICENSE("GPL v2");
2298 MODULE_AUTHOR(DRIVER_AUTHOR);
2299 MODULE_DESCRIPTION(DRIVER_DESC);
2300 MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2301 MODULE_ALIAS("devname:vfio/vfio");