/* drivers/vfio/vfio.c */
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>

#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"

static struct vfio {
	struct class			*class;
	struct list_head		iommu_drivers_list;
	struct mutex			iommu_drivers_lock;
	struct list_head		group_list;
	struct idr			group_idr;
	struct mutex			group_lock;
	struct cdev			group_cdev;
	dev_t				group_devt;
	wait_queue_head_t		release_q;
} vfio;

struct vfio_iommu_driver {
	const struct vfio_iommu_driver_ops	*ops;
	struct list_head			vfio_next;
};

struct vfio_container {
	struct kref			kref;
	struct list_head		group_list;
	struct rw_semaphore		group_lock;
	struct vfio_iommu_driver	*iommu_driver;
	void				*iommu_data;
	bool				noiommu;
};

struct vfio_unbound_dev {
	struct device			*dev;
	struct list_head		unbound_next;
};

struct vfio_group {
	struct kref			kref;
	int				minor;
	atomic_t			container_users;
	struct iommu_group		*iommu_group;
	struct vfio_container		*container;
	struct list_head		device_list;
	struct mutex			device_lock;
	struct device			*dev;
	struct notifier_block		nb;
	struct list_head		vfio_next;
	struct list_head		container_next;
	struct list_head		unbound_list;
	struct mutex			unbound_lock;
	atomic_t			opened;
	bool				noiommu;
	struct kvm			*kvm;
	struct blocking_notifier_head	notifier;
};

struct vfio_device {
	struct kref			kref;
	struct device			*dev;
	const struct vfio_device_ops	*ops;
	struct vfio_group		*group;
	struct list_head		group_next;
	void				*device_data;
};

#ifdef CONFIG_VFIO_NOIOMMU
static bool noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif
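
/*
 * Illustrative note (added commentary, not in the original source): the
 * parameter above is exposed read/write for root (S_IRUGO | S_IWUSR), so
 * it can be set at module load time or, assuming the standard module
 * sysfs layout, toggled at runtime before the target group is created:
 *
 *	modprobe vfio enable_unsafe_noiommu_mode=1
 *	echo Y > /sys/module/vfio/parameters/enable_unsafe_noiommu_mode
 *
 * Groups created while the flag is set show up as /dev/vfio/noiommu-$GROUP
 * (see the "%s%d" format in vfio_create_group() below).
 */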

/*
 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
 * and remove functions, any use cases other than acquiring the first
 * reference for the purpose of calling vfio_add_group_dev() or removing
 * that symmetric reference after vfio_del_group_dev() should use the raw
 * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
 * removes the device from the dummy group and cannot be nested.
 */
struct iommu_group *vfio_iommu_group_get(struct device *dev)
{
	struct iommu_group *group;
	int __maybe_unused ret;

	group = iommu_group_get(dev);

#ifdef CONFIG_VFIO_NOIOMMU
	/*
	 * With noiommu enabled, an IOMMU group will be created for a device
	 * that doesn't already have one and doesn't have an iommu_ops on their
	 * bus.  We set iommudata simply to be able to identify these groups
	 * as special use and for reclamation later.
	 */
	if (group || !noiommu || iommu_present(dev->bus))
		return group;

	group = iommu_group_alloc();
	if (IS_ERR(group))
		return NULL;

	iommu_group_set_name(group, "vfio-noiommu");
	iommu_group_set_iommudata(group, &noiommu, NULL);
	ret = iommu_group_add_device(group, dev);
	iommu_group_put(group);
	if (ret)
		return NULL;

	/*
	 * Where to taint?  At this point we've added an IOMMU group for a
	 * device that is not backed by iommu_ops, therefore any iommu_
	 * callback using iommu_ops can legitimately Oops.  So, while we may
	 * be about to give a DMA capable device to a user without IOMMU
	 * protection, which is clearly taint-worthy, let's go ahead and do
	 * it here.
	 */
	add_taint(TAINT_USER, LOCKDEP_STILL_OK);
	dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
#endif

	return group;
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_get);

void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
{
#ifdef CONFIG_VFIO_NOIOMMU
	if (iommu_group_get_iommudata(group) == &noiommu)
		iommu_group_remove_device(dev);
#endif

	iommu_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_put);

#ifdef CONFIG_VFIO_NOIOMMU
static void *vfio_noiommu_open(unsigned long arg)
{
	if (arg != VFIO_NOIOMMU_IOMMU)
		return ERR_PTR(-EINVAL);
	if (!capable(CAP_SYS_RAWIO))
		return ERR_PTR(-EPERM);

	return NULL;
}

static void vfio_noiommu_release(void *iommu_data)
{
}

static long vfio_noiommu_ioctl(void *iommu_data,
			       unsigned int cmd, unsigned long arg)
{
	if (cmd == VFIO_CHECK_EXTENSION)
		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;

	return -ENOTTY;
}

static int vfio_noiommu_attach_group(void *iommu_data,
				     struct iommu_group *iommu_group)
{
	return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
}

static void vfio_noiommu_detach_group(void *iommu_data,
				      struct iommu_group *iommu_group)
{
}

static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
	.name = "vfio-noiommu",
	.owner = THIS_MODULE,
	.open = vfio_noiommu_open,
	.release = vfio_noiommu_release,
	.ioctl = vfio_noiommu_ioctl,
	.attach_group = vfio_noiommu_attach_group,
	.detach_group = vfio_noiommu_detach_group,
};
#endif


/**
 * IOMMU driver registration
 */
int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver, *tmp;

	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
	if (!driver)
		return -ENOMEM;

	driver->ops = ops;

	mutex_lock(&vfio.iommu_drivers_lock);

	/* Check for duplicates */
	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
		if (tmp->ops == ops) {
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return -EINVAL;
		}
	}

	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);

	mutex_unlock(&vfio.iommu_drivers_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);

void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver;

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		if (driver->ops == ops) {
			list_del(&driver->vfio_next);
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return;
		}
	}
	mutex_unlock(&vfio.iommu_drivers_lock);
}
EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
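
/*
 * Illustrative sketch (added commentary, not in the original source): an
 * IOMMU backend such as vfio_iommu_type1 registers an ops table like
 * vfio_noiommu_ops above from its module init and unregisters it on exit.
 * The names my_iommu_* below are hypothetical.
 *
 *	static const struct vfio_iommu_driver_ops my_iommu_ops = {
 *		.name		= "my-iommu",
 *		.owner		= THIS_MODULE,
 *		.open		= my_iommu_open,
 *		.release	= my_iommu_release,
 *		.ioctl		= my_iommu_ioctl,
 *		.attach_group	= my_iommu_attach_group,
 *		.detach_group	= my_iommu_detach_group,
 *	};
 *
 *	static int __init my_iommu_init(void)
 *	{
 *		return vfio_register_iommu_driver(&my_iommu_ops);
 *	}
 *
 *	static void __exit my_iommu_exit(void)
 *	{
 *		vfio_unregister_iommu_driver(&my_iommu_ops);
 *	}
 */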

/**
 * Group minor allocation/free - both called with vfio.group_lock held
 */
static int vfio_alloc_group_minor(struct vfio_group *group)
{
	return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
}

static void vfio_free_group_minor(int minor)
{
	idr_remove(&vfio.group_idr, minor);
}

static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data);
static void vfio_group_get(struct vfio_group *group);

/**
 * Container objects - containers are created when /dev/vfio/vfio is
 * opened, but their lifecycle extends until the last user is done, so
 * it's freed via kref.  Must support container/group/device being
 * closed in any order.
 */
static void vfio_container_get(struct vfio_container *container)
{
	kref_get(&container->kref);
}

static void vfio_container_release(struct kref *kref)
{
	struct vfio_container *container;
	container = container_of(kref, struct vfio_container, kref);

	kfree(container);
}

static void vfio_container_put(struct vfio_container *container)
{
	kref_put(&container->kref, vfio_container_release);
}

static void vfio_group_unlock_and_free(struct vfio_group *group)
{
	mutex_unlock(&vfio.group_lock);
	/*
	 * Unregister outside of lock.  A spurious callback is harmless now
	 * that the group is no longer in vfio.group_list.
	 */
	iommu_group_unregister_notifier(group->iommu_group, &group->nb);
	kfree(group);
}

/**
 * Group objects - create, release, get, put, search
 */
static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
{
	struct vfio_group *group, *tmp;
	struct device *dev;
	int ret, minor;

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		return ERR_PTR(-ENOMEM);

	kref_init(&group->kref);
	INIT_LIST_HEAD(&group->device_list);
	mutex_init(&group->device_lock);
	INIT_LIST_HEAD(&group->unbound_list);
	mutex_init(&group->unbound_lock);
	atomic_set(&group->container_users, 0);
	atomic_set(&group->opened, 0);
	group->iommu_group = iommu_group;
#ifdef CONFIG_VFIO_NOIOMMU
	group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
#endif
	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);

	group->nb.notifier_call = vfio_iommu_group_notifier;

	/*
	 * blocking notifiers acquire a rwsem around registering and hold
	 * it around callback.  Therefore, need to register outside of
	 * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
	 * do anything unless it can find the group in vfio.group_list, so
	 * no harm in registering early.
	 */
	ret = iommu_group_register_notifier(iommu_group, &group->nb);
	if (ret) {
		kfree(group);
		return ERR_PTR(ret);
	}

	mutex_lock(&vfio.group_lock);

	/* Did we race creating this group? */
	list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
		if (tmp->iommu_group == iommu_group) {
			vfio_group_get(tmp);
			vfio_group_unlock_and_free(group);
			return tmp;
		}
	}

	minor = vfio_alloc_group_minor(group);
	if (minor < 0) {
		vfio_group_unlock_and_free(group);
		return ERR_PTR(minor);
	}

	dev = device_create(vfio.class, NULL,
			    MKDEV(MAJOR(vfio.group_devt), minor),
			    group, "%s%d", group->noiommu ? "noiommu-" : "",
			    iommu_group_id(iommu_group));
	if (IS_ERR(dev)) {
		vfio_free_group_minor(minor);
		vfio_group_unlock_and_free(group);
		return ERR_CAST(dev);
	}

	group->minor = minor;
	group->dev = dev;

	list_add(&group->vfio_next, &vfio.group_list);

	mutex_unlock(&vfio.group_lock);

	return group;
}

/* called with vfio.group_lock held */
static void vfio_group_release(struct kref *kref)
{
	struct vfio_group *group = container_of(kref, struct vfio_group, kref);
	struct vfio_unbound_dev *unbound, *tmp;
	struct iommu_group *iommu_group = group->iommu_group;

	WARN_ON(!list_empty(&group->device_list));
	WARN_ON(group->notifier.head);

	list_for_each_entry_safe(unbound, tmp,
				 &group->unbound_list, unbound_next) {
		list_del(&unbound->unbound_next);
		kfree(unbound);
	}

	device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
	list_del(&group->vfio_next);
	vfio_free_group_minor(group->minor);
	vfio_group_unlock_and_free(group);
	iommu_group_put(iommu_group);
}

static void vfio_group_put(struct vfio_group *group)
{
	kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
}

struct vfio_group_put_work {
	struct work_struct work;
	struct vfio_group *group;
};

static void vfio_group_put_bg(struct work_struct *work)
{
	struct vfio_group_put_work *do_work;

	do_work = container_of(work, struct vfio_group_put_work, work);

	vfio_group_put(do_work->group);
	kfree(do_work);
}

static void vfio_group_schedule_put(struct vfio_group *group)
{
	struct vfio_group_put_work *do_work;

	do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
	if (WARN_ON(!do_work))
		return;

	INIT_WORK(&do_work->work, vfio_group_put_bg);
	do_work->group = group;
	schedule_work(&do_work->work);
}

/* Assume group_lock or group reference is held */
static void vfio_group_get(struct vfio_group *group)
{
	kref_get(&group->kref);
}

/*
 * Not really a try as we will sleep for mutex, but we need to make
 * sure the group pointer is valid under lock and get a reference.
 */
static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
{
	struct vfio_group *target = group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group == target) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}

static
struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group->iommu_group == iommu_group) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}

static struct vfio_group *vfio_group_get_from_minor(int minor)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	group = idr_find(&vfio.group_idr, minor);
	if (!group) {
		mutex_unlock(&vfio.group_lock);
		return NULL;
	}
	vfio_group_get(group);
	mutex_unlock(&vfio.group_lock);

	return group;
}

static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;

	iommu_group = iommu_group_get(dev);
	if (!iommu_group)
		return NULL;

	group = vfio_group_get_from_iommu(iommu_group);
	iommu_group_put(iommu_group);

	return group;
}

/**
 * Device objects - create, release, get, put, search
 */
static
struct vfio_device *vfio_group_create_device(struct vfio_group *group,
					     struct device *dev,
					     const struct vfio_device_ops *ops,
					     void *device_data)
{
	struct vfio_device *device;

	device = kzalloc(sizeof(*device), GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	kref_init(&device->kref);
	device->dev = dev;
	device->group = group;
	device->ops = ops;
	device->device_data = device_data;
	dev_set_drvdata(dev, device);

	/* No need to get group_lock, caller has group reference */
	vfio_group_get(group);

	mutex_lock(&group->device_lock);
	list_add(&device->group_next, &group->device_list);
	mutex_unlock(&group->device_lock);

	return device;
}

static void vfio_device_release(struct kref *kref)
{
	struct vfio_device *device = container_of(kref,
						  struct vfio_device, kref);
	struct vfio_group *group = device->group;

	list_del(&device->group_next);
	mutex_unlock(&group->device_lock);

	dev_set_drvdata(device->dev, NULL);

	kfree(device);

	/* vfio_del_group_dev may be waiting for this device */
	wake_up(&vfio.release_q);
}

/* Device reference always implies a group reference */
void vfio_device_put(struct vfio_device *device)
{
	struct vfio_group *group = device->group;
	kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_device_put);

static void vfio_device_get(struct vfio_device *device)
{
	vfio_group_get(device->group);
	kref_get(&device->kref);
}

static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
						 struct device *dev)
{
	struct vfio_device *device;

	mutex_lock(&group->device_lock);
	list_for_each_entry(device, &group->device_list, group_next) {
		if (device->dev == dev) {
			vfio_device_get(device);
			mutex_unlock(&group->device_lock);
			return device;
		}
	}
	mutex_unlock(&group->device_lock);
	return NULL;
}

/*
 * Some drivers, like pci-stub, are only used to prevent other drivers from
 * claiming a device and are therefore perfectly legitimate for a user owned
 * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
 * of the device, but it does prevent the user from having direct access to
 * the device, which is useful in some circumstances.
 *
 * We also assume that we can include PCI interconnect devices, ie. bridges.
 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
 * then all of the downstream devices will be part of the same IOMMU group as
 * the bridge.  Thus, if placing the bridge into the user owned IOVA space
 * breaks anything, it only does so for user owned devices downstream.  Note
 * that error notification via MSI can be affected for platforms that handle
 * MSI within the same IOVA space as DMA.
 */
static const char * const vfio_driver_whitelist[] = { "pci-stub" };

static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
{
	int i;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
			return true;
	}

	for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) {
		if (!strcmp(drv->name, vfio_driver_whitelist[i]))
			return true;
	}

	return false;
}

/*
 * A vfio group is viable for use by userspace if all devices are in
 * one of the following states:
 *  - driver-less
 *  - bound to a vfio driver
 *  - bound to a whitelisted driver
 *  - a PCI interconnect device
 *
 * We use two methods to determine whether a device is bound to a vfio
 * driver.  The first is to test whether the device exists in the vfio
 * group.  The second is to test if the device exists on the group
 * unbound_list, indicating it's in the middle of transitioning from
 * a vfio driver to driver-less.
 */
static int vfio_dev_viable(struct device *dev, void *data)
{
	struct vfio_group *group = data;
	struct vfio_device *device;
	struct device_driver *drv = ACCESS_ONCE(dev->driver);
	struct vfio_unbound_dev *unbound;
	int ret = -EINVAL;

	mutex_lock(&group->unbound_lock);
	list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
		if (dev == unbound->dev) {
			ret = 0;
			break;
		}
	}
	mutex_unlock(&group->unbound_lock);

	if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
		return 0;

	device = vfio_group_get_device(group, dev);
	if (device) {
		vfio_device_put(device);
		return 0;
	}

	return ret;
}

/**
 * Async device support
 */
static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
{
	struct vfio_device *device;

	/* Do we already know about it?  We shouldn't */
	device = vfio_group_get_device(group, dev);
	if (WARN_ON_ONCE(device)) {
		vfio_device_put(device);
		return 0;
	}

	/* Nothing to do for idle groups */
	if (!atomic_read(&group->container_users))
		return 0;

	/* TODO Prevent device auto probing */
	WARN(1, "Device %s added to live group %d!\n", dev_name(dev),
	     iommu_group_id(group->iommu_group));

	return 0;
}

static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
{
	/* We don't care what happens when the group isn't in use */
	if (!atomic_read(&group->container_users))
		return 0;

	return vfio_dev_viable(dev, group);
}

static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data)
{
	struct vfio_group *group = container_of(nb, struct vfio_group, nb);
	struct device *dev = data;
	struct vfio_unbound_dev *unbound;

	/*
	 * Need to go through a group_lock lookup to get a reference or we
	 * risk racing a group being removed.  Ignore spurious notifies.
	 */
	group = vfio_group_try_get(group);
	if (!group)
		return NOTIFY_OK;

	switch (action) {
	case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
		vfio_group_nb_add_dev(group, dev);
		break;
	case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
		/*
		 * Nothing to do here.  If the device is in use, then the
		 * vfio sub-driver should block the remove callback until
		 * it is unused.  If the device is unused or attached to a
		 * stub driver, then it should be released and we don't
		 * care that it will be going away.
		 */
		break;
	case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
		pr_debug("%s: Device %s, group %d binding to driver\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group));
		break;
	case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
		pr_debug("%s: Device %s, group %d bound to driver %s\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group), dev->driver->name);
		BUG_ON(vfio_group_nb_verify(group, dev));
		break;
	case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
		pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group), dev->driver->name);
		break;
	case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
		pr_debug("%s: Device %s, group %d unbound from driver\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group));
		/*
		 * XXX An unbound device in a live group is ok, but we'd
		 * really like to avoid the above BUG_ON by preventing other
		 * drivers from binding to it.  Once that occurs, we have to
		 * stop the system to maintain isolation.  At a minimum, we'd
		 * want a toggle to disable driver auto probe for this device.
		 */

		mutex_lock(&group->unbound_lock);
		list_for_each_entry(unbound,
				    &group->unbound_list, unbound_next) {
			if (dev == unbound->dev) {
				list_del(&unbound->unbound_next);
				kfree(unbound);
				break;
			}
		}
		mutex_unlock(&group->unbound_lock);
		break;
	}

	/*
	 * If we're the last reference to the group, the group will be
	 * released, which includes unregistering the iommu group notifier.
	 * We hold a read-lock on that notifier list, unregistering needs
	 * a write-lock... deadlock.  Release our reference asynchronously
	 * to avoid that situation.
	 */
	vfio_group_schedule_put(group);
	return NOTIFY_OK;
}

/**
 * VFIO driver API
 */
int vfio_add_group_dev(struct device *dev,
		       const struct vfio_device_ops *ops, void *device_data)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;
	struct vfio_device *device;

	iommu_group = iommu_group_get(dev);
	if (!iommu_group)
		return -EINVAL;

	group = vfio_group_get_from_iommu(iommu_group);
	if (!group) {
		group = vfio_create_group(iommu_group);
		if (IS_ERR(group)) {
			iommu_group_put(iommu_group);
			return PTR_ERR(group);
		}
	} else {
		/*
		 * A found vfio_group already holds a reference to the
		 * iommu_group.  A created vfio_group keeps the reference.
		 */
		iommu_group_put(iommu_group);
	}

	device = vfio_group_get_device(group, dev);
	if (device) {
		WARN(1, "Device %s already exists on group %d\n",
		     dev_name(dev), iommu_group_id(iommu_group));
		vfio_device_put(device);
		vfio_group_put(group);
		return -EBUSY;
	}

	device = vfio_group_create_device(group, dev, ops, device_data);
	if (IS_ERR(device)) {
		vfio_group_put(group);
		return PTR_ERR(device);
	}

	/*
	 * Drop all but the vfio_device reference.  The vfio_device holds
	 * a reference to the vfio_group, which holds a reference to the
	 * iommu_group.
	 */
	vfio_group_put(group);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_add_group_dev);

/**
 * Get a reference to the vfio_device for a device.  Even if the
 * caller thinks they own the device, they could be racing with a
 * release call path, so we can't trust drvdata for the shortcut.
 * Go the long way around, from the iommu_group to the vfio_group
 * to the vfio_device.
 */
struct vfio_device *vfio_device_get_from_dev(struct device *dev)
{
	struct vfio_group *group;
	struct vfio_device *device;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return NULL;

	device = vfio_group_get_device(group, dev);
	vfio_group_put(group);

	return device;
}
EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);

static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
						     char *buf)
{
	struct vfio_device *it, *device = NULL;

	mutex_lock(&group->device_lock);
	list_for_each_entry(it, &group->device_list, group_next) {
		if (!strcmp(dev_name(it->dev), buf)) {
			device = it;
			vfio_device_get(device);
			break;
		}
	}
	mutex_unlock(&group->device_lock);

	return device;
}

/*
 * Caller must hold a reference to the vfio_device
 */
void *vfio_device_data(struct vfio_device *device)
{
	return device->device_data;
}
EXPORT_SYMBOL_GPL(vfio_device_data);

/* Given a referenced group, check if it contains the device */
static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
{
	struct vfio_device *device;

	device = vfio_group_get_device(group, dev);
	if (!device)
		return false;

	vfio_device_put(device);
	return true;
}

/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Open file descriptors for the device... */
void *vfio_del_group_dev(struct device *dev)
{
	struct vfio_device *device = dev_get_drvdata(dev);
	struct vfio_group *group = device->group;
	void *device_data = device->device_data;
	struct vfio_unbound_dev *unbound;
	unsigned int i = 0;
	long ret;
	bool interrupted = false;

	/*
	 * The group exists so long as we have a device reference.  Get
	 * a group reference and use it to scan for the device going away.
	 */
	vfio_group_get(group);

	/*
	 * When the device is removed from the group, the group suddenly
	 * becomes non-viable; the device has a driver (until the unbind
	 * completes), but it's not present in the group.  This is bad news
	 * for any external users that need to re-acquire a group reference
	 * in order to match and release their existing reference.  To
	 * solve this, we track such devices on the unbound_list to bridge
	 * the gap until they're fully unbound.
	 */
	unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
	if (unbound) {
		unbound->dev = dev;
		mutex_lock(&group->unbound_lock);
		list_add(&unbound->unbound_next, &group->unbound_list);
		mutex_unlock(&group->unbound_lock);
	}
	WARN_ON(!unbound);

	vfio_device_put(device);

	/*
	 * If the device is still present in the group after the above
	 * 'put', then it is in use and we need to request it from the
	 * bus driver.  The driver may in turn need to request the
	 * device from the user.  We send the request on an arbitrary
	 * interval with counter to allow the driver to take escalating
	 * measures to release the device if it has the ability to do so.
	 */
	do {
		device = vfio_group_get_device(group, dev);
		if (!device)
			break;

		if (device->ops->request)
			device->ops->request(device_data, i++);

		vfio_device_put(device);

		if (interrupted) {
			ret = wait_event_timeout(vfio.release_q,
					!vfio_dev_present(group, dev), HZ * 10);
		} else {
			ret = wait_event_interruptible_timeout(vfio.release_q,
					!vfio_dev_present(group, dev), HZ * 10);
			if (ret == -ERESTARTSYS) {
				interrupted = true;
				dev_warn(dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	} while (ret <= 0);

	vfio_group_put(group);

	return device_data;
}
EXPORT_SYMBOL_GPL(vfio_del_group_dev);
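
/*
 * Illustrative sketch (added commentary, not in the original source): how a
 * vfio bus driver pairs the driver API above in its probe and remove paths,
 * modeled on vfio-pci.  my_probe/my_remove, my_ops and my_device_data are
 * hypothetical names, and a real PCI driver would take a pci_dev.
 *
 *	static int my_probe(struct device *dev)
 *	{
 *		struct iommu_group *group;
 *		int ret;
 *
 *		group = vfio_iommu_group_get(dev);
 *		if (!group)
 *			return -EINVAL;
 *
 *		ret = vfio_add_group_dev(dev, &my_ops, my_device_data);
 *		if (ret)
 *			vfio_iommu_group_put(group, dev);
 *
 *		return ret;
 *	}
 *
 *	static void my_remove(struct device *dev)
 *	{
 *		void *device_data = vfio_del_group_dev(dev);
 *
 *		vfio_iommu_group_put(dev->iommu_group, dev);
 *		kfree(device_data);
 *	}
 */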

/**
 * VFIO base fd, /dev/vfio/vfio
 */
static long vfio_ioctl_check_extension(struct vfio_container *container,
				       unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = 0;

	down_read(&container->group_lock);

	driver = container->iommu_driver;

	switch (arg) {
		/* No base extensions yet */
	default:
		/*
		 * If no driver is set, poll all registered drivers for
		 * extensions and return the first positive result.  If
		 * a driver is already set, further queries will be passed
		 * only to that driver.
		 */
		if (!driver) {
			mutex_lock(&vfio.iommu_drivers_lock);
			list_for_each_entry(driver, &vfio.iommu_drivers_list,
					    vfio_next) {

#ifdef CONFIG_VFIO_NOIOMMU
				if (!list_empty(&container->group_list) &&
				    (container->noiommu !=
				     (driver->ops == &vfio_noiommu_ops)))
					continue;
#endif

				if (!try_module_get(driver->ops->owner))
					continue;

				ret = driver->ops->ioctl(NULL,
							 VFIO_CHECK_EXTENSION,
							 arg);
				module_put(driver->ops->owner);
				if (ret > 0)
					break;
			}
			mutex_unlock(&vfio.iommu_drivers_lock);
		} else
			ret = driver->ops->ioctl(container->iommu_data,
						 VFIO_CHECK_EXTENSION, arg);
	}

	up_read(&container->group_lock);

	return ret;
}

/* hold write lock on container->group_lock */
static int __vfio_container_attach_groups(struct vfio_container *container,
					  struct vfio_iommu_driver *driver,
					  void *data)
{
	struct vfio_group *group;
	int ret = -ENODEV;

	list_for_each_entry(group, &container->group_list, container_next) {
		ret = driver->ops->attach_group(data, group->iommu_group);
		if (ret)
			goto unwind;
	}

	return ret;

unwind:
	list_for_each_entry_continue_reverse(group, &container->group_list,
					     container_next) {
		driver->ops->detach_group(data, group->iommu_group);
	}

	return ret;
}

static long vfio_ioctl_set_iommu(struct vfio_container *container,
				 unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = -ENODEV;

	down_write(&container->group_lock);

	/*
	 * The container is designed to be an unprivileged interface while
	 * the group can be assigned to specific users.  Therefore, only by
	 * adding a group to a container does the user get the privilege of
	 * enabling the iommu, which may allocate finite resources.  There
	 * is no unset_iommu, but by removing all the groups from a container,
	 * the container is deprivileged and returns to an unset state.
	 */
	if (list_empty(&container->group_list) || container->iommu_driver) {
		up_write(&container->group_lock);
		return -EINVAL;
	}

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		void *data;

#ifdef CONFIG_VFIO_NOIOMMU
		/*
		 * Only noiommu containers can use vfio-noiommu and noiommu
		 * containers can only use vfio-noiommu.
		 */
		if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
			continue;
#endif

		if (!try_module_get(driver->ops->owner))
			continue;

		/*
		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
		 * so test which iommu driver reported support for this
		 * extension and call open on them.  We also pass them the
		 * magic, allowing a single driver to support multiple
		 * interfaces if they'd like.
		 */
		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
			module_put(driver->ops->owner);
			continue;
		}

		data = driver->ops->open(arg);
		if (IS_ERR(data)) {
			ret = PTR_ERR(data);
			module_put(driver->ops->owner);
			continue;
		}

		ret = __vfio_container_attach_groups(container, driver, data);
		if (ret) {
			driver->ops->release(data);
			module_put(driver->ops->owner);
			continue;
		}

		container->iommu_driver = driver;
		container->iommu_data = data;
		break;
	}

	mutex_unlock(&vfio.iommu_drivers_lock);
	up_write(&container->group_lock);

	return ret;
}

static long vfio_fops_unl_ioctl(struct file *filep,
				unsigned int cmd, unsigned long arg)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	void *data;
	long ret = -EINVAL;

	if (!container)
		return ret;

	switch (cmd) {
	case VFIO_GET_API_VERSION:
		ret = VFIO_API_VERSION;
		break;
	case VFIO_CHECK_EXTENSION:
		ret = vfio_ioctl_check_extension(container, arg);
		break;
	case VFIO_SET_IOMMU:
		ret = vfio_ioctl_set_iommu(container, arg);
		break;
	default:
		driver = container->iommu_driver;
		data = container->iommu_data;

		if (driver) /* passthrough all unrecognized ioctls */
			ret = driver->ops->ioctl(data, cmd, arg);
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static long vfio_fops_compat_ioctl(struct file *filep,
				   unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */

static int vfio_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_container *container;

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return -ENOMEM;

	INIT_LIST_HEAD(&container->group_list);
	init_rwsem(&container->group_lock);
	kref_init(&container->kref);

	filep->private_data = container;

	return 0;
}

static int vfio_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_container *container = filep->private_data;

	filep->private_data = NULL;

	vfio_container_put(container);

	return 0;
}

/*
 * Once an iommu driver is set, we optionally pass read/write/mmap
 * on to the driver, allowing management interfaces beyond ioctl.
 */
static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
			      size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->read))
		ret = driver->ops->read(container->iommu_data,
					buf, count, ppos);

	return ret;
}

static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
			       size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->write))
		ret = driver->ops->write(container->iommu_data,
					 buf, count, ppos);

	return ret;
}

static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	int ret = -EINVAL;

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->mmap))
		ret = driver->ops->mmap(container->iommu_data, vma);

	return ret;
}

static const struct file_operations vfio_fops = {
	.owner		= THIS_MODULE,
	.open		= vfio_fops_open,
	.release	= vfio_fops_release,
	.read		= vfio_fops_read,
	.write		= vfio_fops_write,
	.unlocked_ioctl	= vfio_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_fops_compat_ioctl,
#endif
	.mmap		= vfio_fops_mmap,
};
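
/*
 * Illustrative sketch (added commentary, not in the original source):
 * minimal userspace use of the container fd implemented above, assuming a
 * type1 IOMMU backend is registered.  Error handling is elided.
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *
 *	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
 *		// unknown API version, bail
 *	if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
 *		// extension not supported
 *
 *	// VFIO_SET_IOMMU only succeeds once at least one group has been
 *	// added to the container, see VFIO_GROUP_SET_CONTAINER below.
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 */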

/**
 * VFIO Group fd, /dev/vfio/$GROUP
 */
static void __vfio_group_unset_container(struct vfio_group *group)
{
	struct vfio_container *container = group->container;
	struct vfio_iommu_driver *driver;

	down_write(&container->group_lock);

	driver = container->iommu_driver;
	if (driver)
		driver->ops->detach_group(container->iommu_data,
					  group->iommu_group);

	group->container = NULL;
	list_del(&group->container_next);

	/* Detaching the last group deprivileges a container, remove iommu */
	if (driver && list_empty(&container->group_list)) {
		driver->ops->release(container->iommu_data);
		module_put(driver->ops->owner);
		container->iommu_driver = NULL;
		container->iommu_data = NULL;
	}

	up_write(&container->group_lock);

	vfio_container_put(container);
}

/*
 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
 * if there was no container to unset.  Since the ioctl is called on
 * the group, we know that still exists, therefore the only valid
 * transition here is 1->0.
 */
static int vfio_group_unset_container(struct vfio_group *group)
{
	int users = atomic_cmpxchg(&group->container_users, 1, 0);

	if (!users)
		return -EINVAL;
	if (users != 1)
		return -EBUSY;

	__vfio_group_unset_container(group);

	return 0;
}

/*
 * When removing container users, anything that removes the last user
 * implicitly removes the group from the container.  That is, if the
 * group file descriptor is closed, as well as any device file descriptors,
 * the group is free.
 */
static void vfio_group_try_dissolve_container(struct vfio_group *group)
{
	if (0 == atomic_dec_if_positive(&group->container_users))
		__vfio_group_unset_container(group);
}

static int vfio_group_set_container(struct vfio_group *group, int container_fd)
{
	struct fd f;
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

	if (atomic_read(&group->container_users))
		return -EINVAL;

	if (group->noiommu && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	f = fdget(container_fd);
	if (!f.file)
		return -EBADF;

	/* Sanity check, is this really our fd? */
	if (f.file->f_op != &vfio_fops) {
		fdput(f);
		return -EINVAL;
	}

	container = f.file->private_data;
	WARN_ON(!container); /* fget ensures we don't race vfio_release */

	down_write(&container->group_lock);

	/* Real groups and fake groups cannot mix */
	if (!list_empty(&container->group_list) &&
	    container->noiommu != group->noiommu) {
		ret = -EPERM;
		goto unlock_out;
	}

	driver = container->iommu_driver;
	if (driver) {
		ret = driver->ops->attach_group(container->iommu_data,
						group->iommu_group);
		if (ret)
			goto unlock_out;
	}

	group->container = container;
	container->noiommu = group->noiommu;
	list_add(&group->container_next, &container->group_list);

	/* Get a reference on the container and mark a user within the group */
	vfio_container_get(container);
	atomic_inc(&group->container_users);

unlock_out:
	up_write(&container->group_lock);
	fdput(f);
	return ret;
}

static bool vfio_group_viable(struct vfio_group *group)
{
	return (iommu_group_for_each_dev(group->iommu_group,
					 group, vfio_dev_viable) == 0);
}

static int vfio_group_add_container_user(struct vfio_group *group)
{
	if (!atomic_inc_not_zero(&group->container_users))
		return -EINVAL;

	if (group->noiommu) {
		atomic_dec(&group->container_users);
		return -EPERM;
	}
	if (!group->container->iommu_driver || !vfio_group_viable(group)) {
		atomic_dec(&group->container_users);
		return -EINVAL;
	}

	return 0;
}

static const struct file_operations vfio_device_fops;

static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
{
	struct vfio_device *device;
	struct file *filep;
	int ret;

	if (0 == atomic_read(&group->container_users) ||
	    !group->container->iommu_driver || !vfio_group_viable(group))
		return -EINVAL;

	if (group->noiommu && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	device = vfio_device_get_from_name(group, buf);
	if (!device)
		return -ENODEV;

	ret = device->ops->open(device->device_data);
	if (ret) {
		vfio_device_put(device);
		return ret;
	}

	/*
	 * We can't use anon_inode_getfd() because we need to modify
	 * the f_mode flags directly to allow more than just ioctls
	 */
	ret = get_unused_fd_flags(O_CLOEXEC);
	if (ret < 0) {
		device->ops->release(device->device_data);
		vfio_device_put(device);
		return ret;
	}

	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
				   device, O_RDWR);
	if (IS_ERR(filep)) {
		put_unused_fd(ret);
		ret = PTR_ERR(filep);
		device->ops->release(device->device_data);
		vfio_device_put(device);
		return ret;
	}

	/*
	 * TODO: add an anon_inode interface to do this.
	 * Appears to be missing by lack of need rather than
	 * explicitly prevented.  Now there's need.
	 */
	filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);

	atomic_inc(&group->container_users);

	fd_install(ret, filep);

	if (group->noiommu)
		dev_warn(device->dev, "vfio-noiommu device opened by user "
			 "(%s:%d)\n", current->comm, task_pid_nr(current));

	return ret;
}

static long vfio_group_fops_unl_ioctl(struct file *filep,
				      unsigned int cmd, unsigned long arg)
{
	struct vfio_group *group = filep->private_data;
	long ret = -ENOTTY;

	switch (cmd) {
	case VFIO_GROUP_GET_STATUS:
	{
		struct vfio_group_status status;
		unsigned long minsz;

		minsz = offsetofend(struct vfio_group_status, flags);

		if (copy_from_user(&status, (void __user *)arg, minsz))
			return -EFAULT;

		if (status.argsz < minsz)
			return -EINVAL;

		status.flags = 0;

		if (vfio_group_viable(group))
			status.flags |= VFIO_GROUP_FLAGS_VIABLE;

		if (group->container)
			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;

		if (copy_to_user((void __user *)arg, &status, minsz))
			return -EFAULT;

		ret = 0;
		break;
	}
	case VFIO_GROUP_SET_CONTAINER:
	{
		int fd;

		if (get_user(fd, (int __user *)arg))
			return -EFAULT;

		if (fd < 0)
			return -EINVAL;

		ret = vfio_group_set_container(group, fd);
		break;
	}
	case VFIO_GROUP_UNSET_CONTAINER:
		ret = vfio_group_unset_container(group);
		break;
	case VFIO_GROUP_GET_DEVICE_FD:
	{
		char *buf;

		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
		if (IS_ERR(buf))
			return PTR_ERR(buf);

		ret = vfio_group_get_device_fd(group, buf);
		kfree(buf);
		break;
	}
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static long vfio_group_fops_compat_ioctl(struct file *filep,
					 unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_group_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */

static int vfio_group_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_group *group;
	int opened;

	group = vfio_group_get_from_minor(iminor(inode));
	if (!group)
		return -ENODEV;

	if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
		vfio_group_put(group);
		return -EPERM;
	}

	/* Do we need multiple instances of the group open?  Seems not. */
	opened = atomic_cmpxchg(&group->opened, 0, 1);
	if (opened) {
		vfio_group_put(group);
		return -EBUSY;
	}

	/* Is something still in use from a previous open? */
	if (group->container) {
		atomic_dec(&group->opened);
		vfio_group_put(group);
		return -EBUSY;
	}

	/* Warn if previous user didn't cleanup and re-init to drop them */
	if (WARN_ON(group->notifier.head))
		BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);

	filep->private_data = group;

	return 0;
}

static int vfio_group_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	filep->private_data = NULL;

	vfio_group_try_dissolve_container(group);

	atomic_dec(&group->opened);

	vfio_group_put(group);

	return 0;
}

static const struct file_operations vfio_group_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_group_fops_compat_ioctl,
#endif
	.open		= vfio_group_fops_open,
	.release	= vfio_group_fops_release,
};
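
/*
 * Illustrative sketch (added commentary, not in the original source):
 * typical userspace flow against the group fd, continuing the container
 * example above and along the lines of the example in Documentation/vfio.txt.
 * The group number and device name are examples.
 *
 *	int group = open("/dev/vfio/26", O_RDWR);
 *	struct vfio_group_status status = { .argsz = sizeof(status) };
 *
 *	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
 *	if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
 *		// group is not viable, not all devices bound for vfio
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *	int device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */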

/**
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device *device = filep->private_data;

	device->ops->release(device->device_data);

	vfio_group_try_dissolve_container(device->group);

	vfio_device_put(device);

	return 0;
}

static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->ioctl))
		return -EINVAL;

	return device->ops->ioctl(device->device_data, cmd, arg);
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device->device_data, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device->device_data, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device->device_data, vma);
}

#ifdef CONFIG_COMPAT
static long vfio_device_fops_compat_ioctl(struct file *filep,
					  unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_device_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */

static const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_device_fops_compat_ioctl,
#endif
	.mmap		= vfio_device_fops_mmap,
};

/**
 * External user API, exported by symbols to be linked dynamically.
 *
 * The protocol includes:
 *  1. do normal VFIO init operation:
 *	- opening a new container;
 *	- attaching group(s) to it;
 *	- setting an IOMMU driver for a container.
 * When IOMMU is set for a container, all groups in it are
 * considered ready to use by an external user.
 *
 * 2. User space passes a group fd to an external user.
 * The external user calls vfio_group_get_external_user()
 * to verify that:
 *	- the group is initialized;
 *	- IOMMU is set for it.
 * If both checks passed, vfio_group_get_external_user()
 * increments the container user counter to prevent
 * the VFIO group from disposal before KVM exits.
 *
 * 3. The external user calls vfio_external_user_iommu_id()
 * to know an IOMMU ID.
 *
 * 4. When the external KVM finishes, it calls
 * vfio_group_put_external_user() to release the VFIO group.
 * This call decrements the container user counter.
 */
struct vfio_group *vfio_group_get_external_user(struct file *filep)
{
	struct vfio_group *group = filep->private_data;
	int ret;

	if (filep->f_op != &vfio_group_fops)
		return ERR_PTR(-EINVAL);

	ret = vfio_group_add_container_user(group);
	if (ret)
		return ERR_PTR(ret);

	vfio_group_get(group);

	return group;
}
EXPORT_SYMBOL_GPL(vfio_group_get_external_user);

void vfio_group_put_external_user(struct vfio_group *group)
{
	vfio_group_try_dissolve_container(group);
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
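
/*
 * Illustrative sketch (added commentary, not in the original source): how
 * an external user such as the KVM-VFIO device might consume the API above,
 * given a group fd passed in from userspace.  Error handling is elided.
 *
 *	struct fd f = fdget(group_fd);
 *	struct vfio_group *group = vfio_group_get_external_user(f.file);
 *
 *	if (!IS_ERR(group)) {
 *		int id = vfio_external_user_iommu_id(group);
 *		// ... use the group, keyed by the iommu group id ...
 *		vfio_group_put_external_user(group);
 *	}
 *	fdput(f);
 */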

bool vfio_external_group_match_file(struct vfio_group *test_group,
				    struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	return (filep->f_op == &vfio_group_fops) && (group == test_group);
}
EXPORT_SYMBOL_GPL(vfio_external_group_match_file);

int vfio_external_user_iommu_id(struct vfio_group *group)
{
	return iommu_group_id(group->iommu_group);
}
EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);

long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
{
	return vfio_ioctl_check_extension(group->container, arg);
}
EXPORT_SYMBOL_GPL(vfio_external_check_extension);

/**
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities, allocate or
 * reallocate a buffer with additional @size, filling in @id and @version
 * of the capability.  A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail, vfio_info_cap_shift() should be called to fixup the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);

void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);
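
/*
 * Illustrative sketch (added commentary, not in the original source):
 * building a capability chain for a region info ioctl, roughly as a vfio
 * bus driver would.  Because the chain is appended after the fixed-size
 * info structure in the user buffer, the next offsets are shifted by that
 * size before copying out.  The surrounding handler is hypothetical.
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *
 *	// ... vfio_info_cap_add()/vfio_info_add_capability() calls ...
 *
 *	if (caps.size && info.argsz >= sizeof(info) + caps.size) {
 *		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
 *		info.cap_offset = sizeof(info);
 *		vfio_info_cap_shift(&caps, sizeof(info));
 *		if (copy_to_user((void __user *)arg + sizeof(info),
 *				 caps.buf, caps.size))
 *			// -EFAULT
 *		kfree(caps.buf);
 *	}
 */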

static int sparse_mmap_cap(struct vfio_info_cap *caps, void *cap_type)
{
	struct vfio_info_cap_header *header;
	struct vfio_region_info_cap_sparse_mmap *sparse_cap, *sparse = cap_type;
	size_t size;

	size = sizeof(*sparse) + sparse->nr_areas * sizeof(*sparse->areas);
	header = vfio_info_cap_add(caps, size,
				   VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1);
	if (IS_ERR(header))
		return PTR_ERR(header);

	sparse_cap = container_of(header,
			struct vfio_region_info_cap_sparse_mmap, header);
	sparse_cap->nr_areas = sparse->nr_areas;
	memcpy(sparse_cap->areas, sparse->areas,
	       sparse->nr_areas * sizeof(*sparse->areas));
	return 0;
}

static int region_type_cap(struct vfio_info_cap *caps, void *cap_type)
{
	struct vfio_info_cap_header *header;
	struct vfio_region_info_cap_type *type_cap, *cap = cap_type;

	header = vfio_info_cap_add(caps, sizeof(*cap),
				   VFIO_REGION_INFO_CAP_TYPE, 1);
	if (IS_ERR(header))
		return PTR_ERR(header);

	type_cap = container_of(header, struct vfio_region_info_cap_type,
				header);
	type_cap->type = cap->type;
	type_cap->subtype = cap->subtype;
	return 0;
}

int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id,
			     void *cap_type)
{
	int ret = -EINVAL;

	if (!cap_type)
		return 0;

	switch (cap_type_id) {
	case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
		ret = sparse_mmap_cap(caps, cap_type);
		break;

	case VFIO_REGION_INFO_CAP_TYPE:
		ret = region_type_cap(caps, cap_type);
		break;
	}

	return ret;
}
EXPORT_SYMBOL(vfio_info_add_capability);

int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
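
/*
 * Illustrative sketch (added commentary, not in the original source): a
 * vfio bus driver using the helper above from its VFIO_DEVICE_SET_IRQS
 * handler, roughly as vfio-pci does.  max_irq and the surrounding handler
 * are hypothetical here.
 *
 *	struct vfio_irq_set hdr;
 *	unsigned long minsz = offsetofend(struct vfio_irq_set, count);
 *	size_t data_size = 0;
 *	u8 *data = NULL;
 *	int ret;
 *
 *	if (copy_from_user(&hdr, (void __user *)arg, minsz))
 *		return -EFAULT;
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, max_irq,
 *						 VFIO_PCI_NUM_IRQS, &data_size);
 *	if (ret)
 *		return ret;
 *
 *	if (data_size) {
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 */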
1945
1946 /*
1947 * Pin a set of guest PFNs and return their associated host PFNs for local
1948 * domain only.
1949 * @dev [in] : device
1950 * @user_pfn [in]: array of user/guest PFNs to be pinned.
1951 * @npage [in] : count of elements in user_pfn array. This count should not
1952 * be greater VFIO_PIN_PAGES_MAX_ENTRIES.
1953 * @prot [in] : protection flags
1954 * @phys_pfn[out]: array of host PFNs
1955 * Return error or number of pages pinned.
1956 */
int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
                   int prot, unsigned long *phys_pfn)
{
        struct vfio_container *container;
        struct vfio_group *group;
        struct vfio_iommu_driver *driver;
        int ret;

        if (!dev || !user_pfn || !phys_pfn || !npage)
                return -EINVAL;

        if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
                return -E2BIG;

        group = vfio_group_get_from_dev(dev);
        if (!group)
                return -ENODEV;

        ret = vfio_group_add_container_user(group);
        if (ret)
                goto err_pin_pages;

        container = group->container;
        driver = container->iommu_driver;
        if (likely(driver && driver->ops->pin_pages))
                ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
                                             npage, prot, phys_pfn);
        else
                ret = -ENOTTY;

        vfio_group_try_dissolve_container(group);

err_pin_pages:
        vfio_group_put(group);
        return ret;
}
EXPORT_SYMBOL(vfio_pin_pages);
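
/*
 * Example (illustrative sketch, not part of this file): a hypothetical
 * mediated-device vendor driver pinning one guest page before starting
 * DMA to it.  "gfn" and the surrounding driver context are assumptions;
 * "dev" is the device registered with vfio.
 *
 *      unsigned long user_pfn = gfn, phys_pfn;
 *      int ret;
 *
 *      ret = vfio_pin_pages(dev, &user_pfn, 1,
 *                           IOMMU_READ | IOMMU_WRITE, &phys_pfn);
 *      if (ret != 1)
 *              return ret < 0 ? ret : -EFAULT;
 *      // DMA may now target pfn_to_page(phys_pfn)
 */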

/*
 * Unpin a set of host PFNs for the local domain only.
 * @dev [in]     : device
 * @user_pfn [in]: array of user/guest PFNs to be unpinned.
 * @npage [in]   : count of elements in user_pfn array.  This count should
 *                 not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * Return error or number of pages unpinned.
 */
int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
{
        struct vfio_container *container;
        struct vfio_group *group;
        struct vfio_iommu_driver *driver;
        int ret;

        if (!dev || !user_pfn || !npage)
                return -EINVAL;

        if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
                return -E2BIG;

        group = vfio_group_get_from_dev(dev);
        if (!group)
                return -ENODEV;

        ret = vfio_group_add_container_user(group);
        if (ret)
                goto err_unpin_pages;

        container = group->container;
        driver = container->iommu_driver;
        if (likely(driver && driver->ops->unpin_pages))
                ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
                                               npage);
        else
                ret = -ENOTTY;

        vfio_group_try_dissolve_container(group);

err_unpin_pages:
        vfio_group_put(group);
        return ret;
}
EXPORT_SYMBOL(vfio_unpin_pages);

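/*
 * Example (illustrative sketch, not part of this file): releasing the
 * page pinned in the vfio_pin_pages() sketch above once DMA completes;
 * "gfn" is again an assumption of the sketch.
 *
 *      unsigned long user_pfn = gfn;
 *
 *      if (vfio_unpin_pages(dev, &user_pfn, 1) != 1)
 *              dev_warn(dev, "failed to unpin guest page\n");
 */

/*
 * The notifier wrappers below take a temporary container user reference
 * so that the container and its IOMMU driver cannot disappear while the
 * driver callback runs.
 */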
static int vfio_register_iommu_notifier(struct vfio_group *group,
                                        unsigned long *events,
                                        struct notifier_block *nb)
{
        struct vfio_container *container;
        struct vfio_iommu_driver *driver;
        int ret;

        ret = vfio_group_add_container_user(group);
        if (ret)
                return -EINVAL;

        container = group->container;
        driver = container->iommu_driver;
        if (likely(driver && driver->ops->register_notifier))
                ret = driver->ops->register_notifier(container->iommu_data,
                                                     events, nb);
        else
                ret = -ENOTTY;

        vfio_group_try_dissolve_container(group);

        return ret;
}

static int vfio_unregister_iommu_notifier(struct vfio_group *group,
                                          struct notifier_block *nb)
{
        struct vfio_container *container;
        struct vfio_iommu_driver *driver;
        int ret;

        ret = vfio_group_add_container_user(group);
        if (ret)
                return -EINVAL;

        container = group->container;
        driver = container->iommu_driver;
        if (likely(driver && driver->ops->unregister_notifier))
                ret = driver->ops->unregister_notifier(container->iommu_data,
                                                       nb);
        else
                ret = -ENOTTY;

        vfio_group_try_dissolve_container(group);

        return ret;
}

void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
{
        group->kvm = kvm;
        blocking_notifier_call_chain(&group->notifier,
                                VFIO_GROUP_NOTIFY_SET_KVM, kvm);
}
EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
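
/*
 * Example (illustrative sketch, not part of this file): patterned after
 * the kvm-vfio pseudo device, which resolves this symbol at runtime
 * because vfio may be built as a module.
 *
 *      void (*fn)(struct vfio_group *, struct kvm *);
 *
 *      fn = symbol_get(vfio_group_set_kvm);
 *      if (fn) {
 *              fn(group, kvm);
 *              symbol_put(vfio_group_set_kvm);
 *      }
 */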

static int vfio_register_group_notifier(struct vfio_group *group,
                                        unsigned long *events,
                                        struct notifier_block *nb)
{
        int ret;
        bool set_kvm = false;

        if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
                set_kvm = true;

        /* clear known events */
        *events &= ~VFIO_GROUP_NOTIFY_SET_KVM;

        /* refuse to continue if any unknown events remain */
        if (*events)
                return -EINVAL;

        ret = vfio_group_add_container_user(group);
        if (ret)
                return -EINVAL;

        ret = blocking_notifier_chain_register(&group->notifier, nb);

        /*
         * KVM may already have been attached to this group before the
         * notifier was registered, so replay the SET_KVM event once here.
         */
        if (!ret && set_kvm && group->kvm)
                blocking_notifier_call_chain(&group->notifier,
                                VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);

        vfio_group_try_dissolve_container(group);

        return ret;
}

static int vfio_unregister_group_notifier(struct vfio_group *group,
                                          struct notifier_block *nb)
{
        int ret;

        ret = vfio_group_add_container_user(group);
        if (ret)
                return -EINVAL;

        ret = blocking_notifier_chain_unregister(&group->notifier, nb);

        vfio_group_try_dissolve_container(group);

        return ret;
}

int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
                           unsigned long *events, struct notifier_block *nb)
{
        struct vfio_group *group;
        int ret;

        if (!dev || !nb || !events || (*events == 0))
                return -EINVAL;

        group = vfio_group_get_from_dev(dev);
        if (!group)
                return -ENODEV;

        switch (type) {
        case VFIO_IOMMU_NOTIFY:
                ret = vfio_register_iommu_notifier(group, events, nb);
                break;
        case VFIO_GROUP_NOTIFY:
                ret = vfio_register_group_notifier(group, events, nb);
                break;
        default:
                ret = -EINVAL;
        }

        vfio_group_put(group);
        return ret;
}
EXPORT_SYMBOL(vfio_register_notifier);
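
/*
 * Example (illustrative sketch, not part of this file): a vendor driver
 * subscribing to DMA unmap events so it can invalidate its own pinnings.
 * "my_dma_unmap_cb" and "my_nb" are hypothetical names.
 *
 *      static int my_dma_unmap_cb(struct notifier_block *nb,
 *                                 unsigned long action, void *data)
 *      {
 *              if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
 *                      struct vfio_iommu_type1_dma_unmap *unmap = data;
 *                      // invalidate pinnings in [unmap->iova, +unmap->size)
 *              }
 *              return NOTIFY_OK;
 *      }
 *
 *      static struct notifier_block my_nb = {
 *              .notifier_call = my_dma_unmap_cb,
 *      };
 *
 *      unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
 *      int ret = vfio_register_notifier(dev, VFIO_IOMMU_NOTIFY,
 *                                       &events, &my_nb);
 */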

int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
                             struct notifier_block *nb)
{
        struct vfio_group *group;
        int ret;

        if (!dev || !nb)
                return -EINVAL;

        group = vfio_group_get_from_dev(dev);
        if (!group)
                return -ENODEV;

        switch (type) {
        case VFIO_IOMMU_NOTIFY:
                ret = vfio_unregister_iommu_notifier(group, nb);
                break;
        case VFIO_GROUP_NOTIFY:
                ret = vfio_unregister_group_notifier(group, nb);
                break;
        default:
                ret = -EINVAL;
        }

        vfio_group_put(group);
        return ret;
}
EXPORT_SYMBOL(vfio_unregister_notifier);

/**
 * Module/class support
 */
static char *vfio_devnode(struct device *dev, umode_t *mode)
{
        return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
}

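/* /dev/vfio/vfio: the container entry point, world readable/writable. */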
static struct miscdevice vfio_dev = {
        .minor = VFIO_MINOR,
        .name = "vfio",
        .fops = &vfio_fops,
        .nodename = "vfio/vfio",
        .mode = S_IRUGO | S_IWUGO,
};

static int __init vfio_init(void)
{
        int ret;

        idr_init(&vfio.group_idr);
        mutex_init(&vfio.group_lock);
        mutex_init(&vfio.iommu_drivers_lock);
        INIT_LIST_HEAD(&vfio.group_list);
        INIT_LIST_HEAD(&vfio.iommu_drivers_list);
        init_waitqueue_head(&vfio.release_q);

        ret = misc_register(&vfio_dev);
        if (ret) {
                pr_err("vfio: misc device register failed\n");
                return ret;
        }

        /* /dev/vfio/$GROUP */
        vfio.class = class_create(THIS_MODULE, "vfio");
        if (IS_ERR(vfio.class)) {
                ret = PTR_ERR(vfio.class);
                goto err_class;
        }

        vfio.class->devnode = vfio_devnode;

        ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK, "vfio");
        if (ret)
                goto err_alloc_chrdev;

        cdev_init(&vfio.group_cdev, &vfio_group_fops);
        ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK);
        if (ret)
                goto err_cdev_add;

        pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");

#ifdef CONFIG_VFIO_NOIOMMU
        vfio_register_iommu_driver(&vfio_noiommu_ops);
#endif
        return 0;

err_cdev_add:
        unregister_chrdev_region(vfio.group_devt, MINORMASK);
err_alloc_chrdev:
        class_destroy(vfio.class);
        vfio.class = NULL;
err_class:
        misc_deregister(&vfio_dev);
        return ret;
}

static void __exit vfio_cleanup(void)
{
        WARN_ON(!list_empty(&vfio.group_list));

#ifdef CONFIG_VFIO_NOIOMMU
        vfio_unregister_iommu_driver(&vfio_noiommu_ops);
#endif
        idr_destroy(&vfio.group_idr);
        cdev_del(&vfio.group_cdev);
        unregister_chrdev_region(vfio.group_devt, MINORMASK);
        class_destroy(vfio.class);
        vfio.class = NULL;
        misc_deregister(&vfio_dev);
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");