virt/kvm/eventfd.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * kvm eventfd support - use eventfd objects to signal various KVM events
   4  *
   5  * Copyright 2009 Novell.  All Rights Reserved.
   6  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
   7  *
   8  * Author:
   9  *      Gregory Haskins <ghaskins@novell.com>
  10  */
  11
  12 #include <linux/kvm_host.h>
  13 #include <linux/kvm.h>
  14 #include <linux/kvm_irqfd.h>
  15 #include <linux/workqueue.h>
  16 #include <linux/syscalls.h>
  17 #include <linux/wait.h>
  18 #include <linux/poll.h>
  19 #include <linux/file.h>
  20 #include <linux/list.h>
  21 #include <linux/eventfd.h>
  22 #include <linux/kernel.h>
  23 #include <linux/srcu.h>
  24 #include <linux/slab.h>
  25 #include <linux/seqlock.h>
  26 #include <linux/irqbypass.h>
  27 #include <trace/events/kvm.h>
  28
  29 #include <kvm/iodev.h>
  30
  31 #ifdef CONFIG_HAVE_KVM_IRQFD
  32
  33 static struct workqueue_struct *irqfd_cleanup_wq;
  34
  35 bool __attribute__((weak))
  36 kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
  37 {
  38         return true;
  39 }
  40
  41 static void
  42 irqfd_inject(struct work_struct *work)
  43 {
  44         struct kvm_kernel_irqfd *irqfd =
  45                 container_of(work, struct kvm_kernel_irqfd, inject);
  46         struct kvm *kvm = irqfd->kvm;
  47
  48         if (!irqfd->resampler) {
  49                 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1,
  50                                 false);
  51                 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0,
  52                                 false);
  53         } else
  54                 kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
  55                             irqfd->gsi, 1, false);
  56 }
  57
  58 /*
  59  * Since resampler irqfds share an IRQ source ID, we de-assert once
  60  * then notify all of the resampler irqfds using this GSI.  We can't
  61  * do multiple de-asserts or we risk racing with incoming re-asserts.
  62  */
  63 static void
  64 irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
  65 {
  66         struct kvm_kernel_irqfd_resampler *resampler;
  67         struct kvm *kvm;
  68         struct kvm_kernel_irqfd *irqfd;
  69         int idx;
  70
  71         resampler = container_of(kian,
  72                         struct kvm_kernel_irqfd_resampler, notifier);
  73         kvm = resampler->kvm;
  74
  75         kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
  76                     resampler->notifier.gsi, 0, false);
  77
  78         idx = srcu_read_lock(&kvm->irq_srcu);
  79
  80         list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link)
  81                 eventfd_signal(irqfd->resamplefd, 1);
  82
  83         srcu_read_unlock(&kvm->irq_srcu, idx);
  84 }
  85
  86 static void
  87 irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd)
  88 {
  89         struct kvm_kernel_irqfd_resampler *resampler = irqfd->resampler;
  90         struct kvm *kvm = resampler->kvm;
  91
  92         mutex_lock(&kvm->irqfds.resampler_lock);
  93
  94         list_del_rcu(&irqfd->resampler_link);
  95         synchronize_srcu(&kvm->irq_srcu);
  96
  97         if (list_empty(&resampler->list)) {
  98                 list_del(&resampler->link);
  99                 kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
 100                 kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
 101                             resampler->notifier.gsi, 0, false);
 102                 kfree(resampler);
 103         }
 104
 105         mutex_unlock(&kvm->irqfds.resampler_lock);
 106 }
 107
/*
 * Race-free decouple logic (ordering is critical)
 *
 * Work handler for irqfd->shutdown, queued on irqfd_cleanup_wq by
 * irqfd_deactivate().  In order, it: waits for an SRCU grace period,
 * removes the irqfd from the eventfd's wait queue, flushes pending
 * injection work, releases the resampler and its eventfd (if any),
 * unregisters the bypass consumer (when configured), and finally drops
 * the eventfd reference and frees the irqfd.
 */
static void
irqfd_shutdown(struct work_struct *work)
{
        struct kvm_kernel_irqfd *irqfd =
                container_of(work, struct kvm_kernel_irqfd, shutdown);
        struct kvm *kvm = irqfd->kvm;
        /* Output of eventfd_ctx_remove_wait_queue(); not used afterwards. */
        u64 cnt;

        /* Make sure irqfd has been initialized in assign path. */
        synchronize_srcu(&kvm->irq_srcu);

        /*
         * Synchronize with the wait-queue and unhook ourselves to prevent
         * further events.
         */
        eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);

        /*
         * We know no new events will be scheduled at this point, so block
         * until all previously outstanding events have completed
         */
        flush_work(&irqfd->inject);

        /* Resampling irqfds also hold a resampler link and a second eventfd. */
        if (irqfd->resampler) {
                irqfd_resampler_shutdown(irqfd);
                eventfd_ctx_put(irqfd->resamplefd);
        }

        /*
         * It is now safe to release the object's resources
         */
#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
        irq_bypass_unregister_consumer(&irqfd->consumer);
#endif
        eventfd_ctx_put(irqfd->eventfd);
        kfree(irqfd);
}
 148
 149
 150 /* assumes kvm->irqfds.lock is held */
 151 static bool
 152 irqfd_is_active(struct kvm_kernel_irqfd *irqfd)
 153 {
 154         return list_empty(&irqfd->list) ? false : true;
 155 }
 156
 157 /*
 158  * Mark the irqfd as inactive and schedule it for removal
 159  *
 160  * assumes kvm->irqfds.lock is held
 161  */
 162 static void
 163 irqfd_deactivate(struct kvm_kernel_irqfd *irqfd)
 164 {
 165         BUG_ON(!irqfd_is_active(irqfd));
 166
 167         list_del_init(&irqfd->list);
 168
 169         queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
 170 }
 171
 172 int __attribute__((weak)) kvm_arch_set_irq_inatomic(
 173                                 struct kvm_kernel_irq_routing_entry *irq,
 174                                 struct kvm *kvm, int irq_source_id,
 175                                 int level,
 176                                 bool line_status)
 177 {
 178         return -EWOULDBLOCK;
 179 }
 180
/*
 * Called with wqh->lock held and interrupts disabled
 *
 * Wait-queue callback installed for the irqfd by kvm_irqfd_assign().
 *
 * EPOLLIN:  read the eventfd counter, take a consistent snapshot of the
 *           cached routing entry and try to inject the interrupt through
 *           kvm_arch_set_irq_inatomic(); if that returns -EWOULDBLOCK,
 *           defer to the inject work item instead.
 * EPOLLHUP: deactivate the irqfd unless it has been deactivated already.
 *
 * Returns 1 when an EPOLLIN event was handled, 0 otherwise.
 */
static int
irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
        struct kvm_kernel_irqfd *irqfd =
                container_of(wait, struct kvm_kernel_irqfd, wait);
        __poll_t flags = key_to_poll(key);
        struct kvm_kernel_irq_routing_entry irq;
        struct kvm *kvm = irqfd->kvm;
        unsigned seq;
        int idx;
        int ret = 0;

        if (flags & EPOLLIN) {
                /* Counter value read from the eventfd; not used afterwards. */
                u64 cnt;
                eventfd_ctx_do_read(irqfd->eventfd, &cnt);

                idx = srcu_read_lock(&kvm->irq_srcu);
                /*
                 * Copy irq_entry under the seqcount; retry if a writer
                 * (irqfd_update() or the deassign path) changed it meanwhile.
                 */
                do {
                        seq = read_seqcount_begin(&irqfd->irq_entry_sc);
                        irq = irqfd->irq_entry;
                } while (read_seqcount_retry(&irqfd->irq_entry_sc, seq));
                /* An event has been signaled, inject an interrupt */
                if (kvm_arch_set_irq_inatomic(&irq, kvm,
                                              KVM_USERSPACE_IRQ_SOURCE_ID, 1,
                                              false) == -EWOULDBLOCK)
                        schedule_work(&irqfd->inject);
                srcu_read_unlock(&kvm->irq_srcu, idx);
                ret = 1;
        }

        if (flags & EPOLLHUP) {
                /* The eventfd is closing, detach from KVM */
                unsigned long iflags;

                spin_lock_irqsave(&kvm->irqfds.lock, iflags);

                /*
                 * We must check if someone deactivated the irqfd before
                 * we could acquire the irqfds.lock since the item is
                 * deactivated from the KVM side before it is unhooked from
                 * the wait-queue.  If it is already deactivated, we can
                 * simply return knowing the other side will cleanup for us.
                 * We cannot race against the irqfd going away since the
                 * other side is required to acquire wqh->lock, which we hold
                 */
                if (irqfd_is_active(irqfd))
                        irqfd_deactivate(irqfd);

                spin_unlock_irqrestore(&kvm->irqfds.lock, iflags);
        }

        return ret;
}
 237
 238 static void
 239 irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
 240                         poll_table *pt)
 241 {
 242         struct kvm_kernel_irqfd *irqfd =
 243                 container_of(pt, struct kvm_kernel_irqfd, pt);
 244         add_wait_queue_priority(wqh, &irqfd->wait);
 245 }
 246
 247 /* Must be called under irqfds.lock */
 248 static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd)
 249 {
 250         struct kvm_kernel_irq_routing_entry *e;
 251         struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
 252         int n_entries;
 253
 254         n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi);
 255
 256         write_seqcount_begin(&irqfd->irq_entry_sc);
 257
 258         e = entries;
 259         if (n_entries == 1)
 260                 irqfd->irq_entry = *e;
 261         else
 262                 irqfd->irq_entry.type = 0;
 263
 264         write_seqcount_end(&irqfd->irq_entry_sc);
 265 }
 266
 267 #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
/*
 * Weak default for the bypass consumer's ->stop callback (installed by
 * kvm_irqfd_assign()): does nothing.
 */
void __attribute__((weak)) kvm_arch_irq_bypass_stop(
                                struct irq_bypass_consumer *cons)
{
}
 272
/*
 * Weak default for the bypass consumer's ->start callback (installed by
 * kvm_irqfd_assign()): does nothing.
 */
void __attribute__((weak)) kvm_arch_irq_bypass_start(
                                struct irq_bypass_consumer *cons)
{
}
 277
 278 int  __attribute__((weak)) kvm_arch_update_irqfd_routing(
 279                                 struct kvm *kvm, unsigned int host_irq,
 280                                 uint32_t guest_irq, bool set)
 281 {
 282         return 0;
 283 }
 284 #endif
 285
/*
 * Create an irqfd that binds the eventfd args->fd to the GSI args->gsi.
 *
 * Steps: allocate and initialise the irqfd, take a reference on the
 * eventfd, optionally attach to (or create) the resampler for this GSI,
 * install the wake-up callback, reject an eventfd that is already in
 * kvm->irqfds.items, cache the routing entry, add the irqfd to the list,
 * poll the eventfd once to pick up an already-pending event and, when
 * configured, register an irq bypass consumer.
 *
 * Returns 0 on success, or:
 *   -EAGAIN  kvm_arch_intc_initialized() returned false
 *   -EINVAL  kvm_arch_irqfd_allowed() returned false
 *   -ENOMEM  allocation of the irqfd or of a new resampler failed
 *   -EBADF   args->fd does not resolve to a file
 *   -EBUSY   the eventfd is already used by another irqfd of this VM
 *   or the error encoded by eventfd_ctx_fileget()/eventfd_ctx_fdget().
 *
 * A failure of irq_bypass_register_consumer() is only logged; the function
 * still returns 0 in that case.
 */
static int
kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
{
        struct kvm_kernel_irqfd *irqfd, *tmp;
        struct fd f;
        struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
        int ret;
        __poll_t events;
        int idx;

        if (!kvm_arch_intc_initialized(kvm))
                return -EAGAIN;

        if (!kvm_arch_irqfd_allowed(kvm, args))
                return -EINVAL;

        irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL_ACCOUNT);
        if (!irqfd)
                return -ENOMEM;

        irqfd->kvm = kvm;
        irqfd->gsi = args->gsi;
        INIT_LIST_HEAD(&irqfd->list);
        INIT_WORK(&irqfd->inject, irqfd_inject);
        INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
        seqcount_spinlock_init(&irqfd->irq_entry_sc, &kvm->irqfds.lock);

        f = fdget(args->fd);
        if (!f.file) {
                ret = -EBADF;
                /* No file reference was taken: skip the fdput() under fail. */
                goto out;
        }

        eventfd = eventfd_ctx_fileget(f.file);
        if (IS_ERR(eventfd)) {
                ret = PTR_ERR(eventfd);
                goto fail;
        }

        irqfd->eventfd = eventfd;

        if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
                struct kvm_kernel_irqfd_resampler *resampler;

                resamplefd = eventfd_ctx_fdget(args->resamplefd);
                if (IS_ERR(resamplefd)) {
                        ret = PTR_ERR(resamplefd);
                        goto fail;
                }

                irqfd->resamplefd = resamplefd;
                INIT_LIST_HEAD(&irqfd->resampler_link);

                mutex_lock(&kvm->irqfds.resampler_lock);

                /* Reuse the resampler already registered for this GSI. */
                list_for_each_entry(resampler,
                                    &kvm->irqfds.resampler_list, link) {
                        if (resampler->notifier.gsi == irqfd->gsi) {
                                irqfd->resampler = resampler;
                                break;
                        }
                }

                /* None found: create one and register its ack notifier. */
                if (!irqfd->resampler) {
                        resampler = kzalloc(sizeof(*resampler),
                                            GFP_KERNEL_ACCOUNT);
                        if (!resampler) {
                                ret = -ENOMEM;
                                mutex_unlock(&kvm->irqfds.resampler_lock);
                                goto fail;
                        }

                        resampler->kvm = kvm;
                        INIT_LIST_HEAD(&resampler->list);
                        resampler->notifier.gsi = irqfd->gsi;
                        resampler->notifier.irq_acked = irqfd_resampler_ack;
                        INIT_LIST_HEAD(&resampler->link);

                        list_add(&resampler->link, &kvm->irqfds.resampler_list);
                        kvm_register_irq_ack_notifier(kvm,
                                                      &resampler->notifier);
                        irqfd->resampler = resampler;
                }

                list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
                synchronize_srcu(&kvm->irq_srcu);

                mutex_unlock(&kvm->irqfds.resampler_lock);
        }

        /*
         * Install our own custom wake-up handling so we are notified via
         * a callback whenever someone signals the underlying eventfd
         */
        init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
        init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);

        spin_lock_irq(&kvm->irqfds.lock);

        ret = 0;
        list_for_each_entry(tmp, &kvm->irqfds.items, list) {
                if (irqfd->eventfd != tmp->eventfd)
                        continue;
                /* This fd is used for another irq already. */
                ret = -EBUSY;
                spin_unlock_irq(&kvm->irqfds.lock);
                goto fail;
        }

        /*
         * The SRCU read section opened here stays open until the irqfd is
         * fully set up; irqfd_shutdown() starts with synchronize_srcu() to
         * wait for it.
         */
        idx = srcu_read_lock(&kvm->irq_srcu);
        irqfd_update(kvm, irqfd);

        list_add_tail(&irqfd->list, &kvm->irqfds.items);

        spin_unlock_irq(&kvm->irqfds.lock);

        /*
         * Check if there was an event already pending on the eventfd
         * before we registered, and trigger it as if we didn't miss it.
         */
        events = vfs_poll(f.file, &irqfd->pt);

        if (events & EPOLLIN)
                schedule_work(&irqfd->inject);

#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
        if (kvm_arch_has_irq_bypass()) {
                irqfd->consumer.token = (void *)irqfd->eventfd;
                irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer;
                irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer;
                irqfd->consumer.stop = kvm_arch_irq_bypass_stop;
                irqfd->consumer.start = kvm_arch_irq_bypass_start;
                ret = irq_bypass_register_consumer(&irqfd->consumer);
                /* Registration failure is reported but not treated as fatal. */
                if (ret)
                        pr_info("irq bypass consumer (token %p) registration fails: %d\n",
                                irqfd->consumer.token, ret);
        }
#endif

        srcu_read_unlock(&kvm->irq_srcu, idx);

        /*
         * do not drop the file until the irqfd is fully initialized, otherwise
         * we might race against the EPOLLHUP
         */
        fdput(f);
        return 0;

fail:
        /* Undo whatever was set up, in reverse order of acquisition. */
        if (irqfd->resampler)
                irqfd_resampler_shutdown(irqfd);

        if (resamplefd && !IS_ERR(resamplefd))
                eventfd_ctx_put(resamplefd);

        if (eventfd && !IS_ERR(eventfd))
                eventfd_ctx_put(eventfd);

        fdput(f);

out:
        kfree(irqfd);
        return ret;
}
 450
 451 bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
 452 {
 453         struct kvm_irq_ack_notifier *kian;
 454         int gsi, idx;
 455
 456         idx = srcu_read_lock(&kvm->irq_srcu);
 457         gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
 458         if (gsi != -1)
 459                 hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
 460                                          link)
 461                         if (kian->gsi == gsi) {
 462                                 srcu_read_unlock(&kvm->irq_srcu, idx);
 463                                 return true;
 464                         }
 465
 466         srcu_read_unlock(&kvm->irq_srcu, idx);
 467
 468         return false;
 469 }
 470 EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);
 471
 472 void kvm_notify_acked_gsi(struct kvm *kvm, int gsi)
 473 {
 474         struct kvm_irq_ack_notifier *kian;
 475
 476         hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
 477                                  link)
 478                 if (kian->gsi == gsi)
 479                         kian->irq_acked(kian);
 480 }
 481
 482 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
 483 {
 484         int gsi, idx;
 485
 486         trace_kvm_ack_irq(irqchip, pin);
 487
 488         idx = srcu_read_lock(&kvm->irq_srcu);
 489         gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
 490         if (gsi != -1)
 491                 kvm_notify_acked_gsi(kvm, gsi);
 492         srcu_read_unlock(&kvm->irq_srcu, idx);
 493 }
 494
/*
 * Add @kian to kvm->irq_ack_notifier_list under kvm->irq_lock, then let the
 * architecture react to the changed list.
 */
void kvm_register_irq_ack_notifier(struct kvm *kvm,
                                   struct kvm_irq_ack_notifier *kian)
{
        mutex_lock(&kvm->irq_lock);
        hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list);
        mutex_unlock(&kvm->irq_lock);
        /* Called after irq_lock has been dropped. */
        kvm_arch_post_irq_ack_notifier_list_update(kvm);
}
 503
/*
 * Remove @kian from kvm->irq_ack_notifier_list under kvm->irq_lock, wait for
 * an irq_srcu grace period, then let the architecture react to the changed
 * list.
 */
void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
                                    struct kvm_irq_ack_notifier *kian)
{
        mutex_lock(&kvm->irq_lock);
        hlist_del_init_rcu(&kian->link);
        mutex_unlock(&kvm->irq_lock);
        /* Wait for SRCU readers that may still be walking the list. */
        synchronize_srcu(&kvm->irq_srcu);
        kvm_arch_post_irq_ack_notifier_list_update(kvm);
}
 513 #endif
 514
 515 void
 516 kvm_eventfd_init(struct kvm *kvm)
 517 {
 518 #ifdef CONFIG_HAVE_KVM_IRQFD
 519         spin_lock_init(&kvm->irqfds.lock);
 520         INIT_LIST_HEAD(&kvm->irqfds.items);
 521         INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
 522         mutex_init(&kvm->irqfds.resampler_lock);
 523 #endif
 524         INIT_LIST_HEAD(&kvm->ioeventfds);
 525 }
 526
 527 #ifdef CONFIG_HAVE_KVM_IRQFD
 528 /*
 529  * shutdown any irqfd's that match fd+gsi
 530  */
 531 static int
 532 kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
 533 {
 534         struct kvm_kernel_irqfd *irqfd, *tmp;
 535         struct eventfd_ctx *eventfd;
 536
 537         eventfd = eventfd_ctx_fdget(args->fd);
 538         if (IS_ERR(eventfd))
 539                 return PTR_ERR(eventfd);
 540
 541         spin_lock_irq(&kvm->irqfds.lock);
 542
 543         list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
 544                 if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) {
 545                         /*
 546                          * This clearing of irq_entry.type is needed for when
 547                          * another thread calls kvm_irq_routing_update before
 548                          * we flush workqueue below (we synchronize with
 549                          * kvm_irq_routing_update using irqfds.lock).
 550                          */
 551                         write_seqcount_begin(&irqfd->irq_entry_sc);
 552                         irqfd->irq_entry.type = 0;
 553                         write_seqcount_end(&irqfd->irq_entry_sc);
 554                         irqfd_deactivate(irqfd);
 555                 }
 556         }
 557
 558         spin_unlock_irq(&kvm->irqfds.lock);
 559         eventfd_ctx_put(eventfd);
 560
 561         /*
 562          * Block until we know all outstanding shutdown jobs have completed
 563          * so that we guarantee there will not be any more interrupts on this
 564          * gsi once this deassign function returns.
 565          */
 566         flush_workqueue(irqfd_cleanup_wq);
 567
 568         return 0;
 569 }
 570
 571 int
 572 kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
 573 {
 574         if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE))
 575                 return -EINVAL;
 576
 577         if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
 578                 return kvm_irqfd_deassign(kvm, args);
 579
 580         return kvm_irqfd_assign(kvm, args);
 581 }
 582
 583 /*
 584  * This function is called as the kvm VM fd is being released. Shutdown all
 585  * irqfds that still remain open
 586  */
 587 void
 588 kvm_irqfd_release(struct kvm *kvm)
 589 {
 590         struct kvm_kernel_irqfd *irqfd, *tmp;
 591
 592         spin_lock_irq(&kvm->irqfds.lock);
 593
 594         list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list)
 595                 irqfd_deactivate(irqfd);
 596
 597         spin_unlock_irq(&kvm->irqfds.lock);
 598
 599         /*
 600          * Block until we know all outstanding shutdown jobs have completed
 601          * since we do not take a kvm* reference.
 602          */
 603         flush_workqueue(irqfd_cleanup_wq);
 604
 605 }
 606
 607 /*
 608  * Take note of a change in irq routing.
 609  * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards.
 610  */
 611 void kvm_irq_routing_update(struct kvm *kvm)
 612 {
 613         struct kvm_kernel_irqfd *irqfd;
 614
 615         spin_lock_irq(&kvm->irqfds.lock);
 616
 617         list_for_each_entry(irqfd, &kvm->irqfds.items, list) {
 618                 irqfd_update(kvm, irqfd);
 619
 620 #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
 621                 if (irqfd->producer) {
 622                         int ret = kvm_arch_update_irqfd_routing(
 623                                         irqfd->kvm, irqfd->producer->irq,
 624                                         irqfd->gsi, 1);
 625                         WARN_ON(ret);
 626                 }
 627 #endif
 628         }
 629
 630         spin_unlock_irq(&kvm->irqfds.lock);
 631 }
 632
 633 /*
 634  * create a host-wide workqueue for issuing deferred shutdown requests
 635  * aggregated from all vm* instances. We need our own isolated
 636  * queue to ease flushing work items when a VM exits.
 637  */
 638 int kvm_irqfd_init(void)
 639 {
 640         irqfd_cleanup_wq = alloc_workqueue("kvm-irqfd-cleanup", 0, 0);
 641         if (!irqfd_cleanup_wq)
 642                 return -ENOMEM;
 643
 644         return 0;
 645 }
 646
/* Destroy the cleanup workqueue created by kvm_irqfd_init(). */
void kvm_irqfd_exit(void)
{
        destroy_workqueue(irqfd_cleanup_wq);
}
 651 #endif
 652
 653 /*
 654  * --------------------------------------------------------------------
 655  * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
 656  *
 657  * userspace can register a PIO/MMIO address with an eventfd for receiving
 658  * notification when the memory has been touched.
 659  * --------------------------------------------------------------------
 660  */
 661
/* One registered ioeventfd: an address (and optional value) to eventfd binding. */
struct _ioeventfd {
        struct list_head     list;      /* link in kvm->ioeventfds */
        u64                  addr;      /* address a write must hit exactly */
        int                  length;    /* access length; 0 = match on address only */
        struct eventfd_ctx  *eventfd;   /* signalled on a matching write */
        u64                  datamatch; /* value to compare when !wildcard */
        struct kvm_io_device dev;       /* device registered on the I/O bus */
        u8                   bus_idx;   /* bus the device was registered on */
        bool                 wildcard;  /* true: any written value matches */
};
 672
/* Map an embedded kvm_io_device back to the _ioeventfd that contains it. */
static inline struct _ioeventfd *
to_ioeventfd(struct kvm_io_device *dev)
{
        return container_of(dev, struct _ioeventfd, dev);
}
 678
 679 static void
 680 ioeventfd_release(struct _ioeventfd *p)
 681 {
 682         eventfd_ctx_put(p->eventfd);
 683         list_del(&p->list);
 684         kfree(p);
 685 }
 686
 687 static bool
 688 ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
 689 {
 690         u64 _val;
 691
 692         if (addr != p->addr)
 693                 /* address must be precise for a hit */
 694                 return false;
 695
 696         if (!p->length)
 697                 /* length = 0 means only look at the address, so always a hit */
 698                 return true;
 699
 700         if (len != p->length)
 701                 /* address-range must be precise for a hit */
 702                 return false;
 703
 704         if (p->wildcard)
 705                 /* all else equal, wildcard is always a hit */
 706                 return true;
 707
 708         /* otherwise, we have to actually compare the data */
 709
 710         BUG_ON(!IS_ALIGNED((unsigned long)val, len));
 711
 712         switch (len) {
 713         case 1:
 714                 _val = *(u8 *)val;
 715                 break;
 716         case 2:
 717                 _val = *(u16 *)val;
 718                 break;
 719         case 4:
 720                 _val = *(u32 *)val;
 721                 break;
 722         case 8:
 723                 _val = *(u64 *)val;
 724                 break;
 725         default:
 726                 return false;
 727         }
 728
 729         return _val == p->datamatch;
 730 }
 731
 732 /* MMIO/PIO writes trigger an event if the addr/val match */
 733 static int
 734 ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
 735                 int len, const void *val)
 736 {
 737         struct _ioeventfd *p = to_ioeventfd(this);
 738
 739         if (!ioeventfd_in_range(p, addr, len, val))
 740                 return -EOPNOTSUPP;
 741
 742         eventfd_signal(p->eventfd, 1);
 743         return 0;
 744 }
 745
 746 /*
 747  * This function is called as KVM is completely shutting down.  We do not
 748  * need to worry about locking just nuke anything we have as quickly as possible
 749  */
 750 static void
 751 ioeventfd_destructor(struct kvm_io_device *this)
 752 {
 753         struct _ioeventfd *p = to_ioeventfd(this);
 754
 755         ioeventfd_release(p);
 756 }
 757
/*
 * Device callbacks installed on every ioeventfd by
 * kvm_assign_ioeventfd_idx(); only write and destructor are set here.
 */
static const struct kvm_io_device_ops ioeventfd_ops = {
        .write      = ioeventfd_write,
        .destructor = ioeventfd_destructor,
};
 762
 763 /* assumes kvm->slots_lock held */
 764 static bool
 765 ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
 766 {
 767         struct _ioeventfd *_p;
 768
 769         list_for_each_entry(_p, &kvm->ioeventfds, list)
 770                 if (_p->bus_idx == p->bus_idx &&
 771                     _p->addr == p->addr &&
 772                     (!_p->length || !p->length ||
 773                      (_p->length == p->length &&
 774                       (_p->wildcard || p->wildcard ||
 775                        _p->datamatch == p->datamatch))))
 776                         return true;
 777
 778         return false;
 779 }
 780
 781 static enum kvm_bus ioeventfd_bus_from_flags(__u32 flags)
 782 {
 783         if (flags & KVM_IOEVENTFD_FLAG_PIO)
 784                 return KVM_PIO_BUS;
 785         if (flags & KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY)
 786                 return KVM_VIRTIO_CCW_NOTIFY_BUS;
 787         return KVM_MMIO_BUS;
 788 }
 789
 790 static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
 791                                 enum kvm_bus bus_idx,
 792                                 struct kvm_ioeventfd *args)
 793 {
 794
 795         struct eventfd_ctx *eventfd;
 796         struct _ioeventfd *p;
 797         int ret;
 798
 799         eventfd = eventfd_ctx_fdget(args->fd);
 800         if (IS_ERR(eventfd))
 801                 return PTR_ERR(eventfd);
 802
 803         p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
 804         if (!p) {
 805                 ret = -ENOMEM;
 806                 goto fail;
 807         }
 808
 809         INIT_LIST_HEAD(&p->list);
 810         p->addr    = args->addr;
 811         p->bus_idx = bus_idx;
 812         p->length  = args->len;
 813         p->eventfd = eventfd;
 814
 815         /* The datamatch feature is optional, otherwise this is a wildcard */
 816         if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
 817                 p->datamatch = args->datamatch;
 818         else
 819                 p->wildcard = true;
 820
 821         mutex_lock(&kvm->slots_lock);
 822
 823         /* Verify that there isn't a match already */
 824         if (ioeventfd_check_collision(kvm, p)) {
 825                 ret = -EEXIST;
 826                 goto unlock_fail;
 827         }
 828
 829         kvm_iodevice_init(&p->dev, &ioeventfd_ops);
 830
 831         ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length,
 832                                       &p->dev);
 833         if (ret < 0)
 834                 goto unlock_fail;
 835
 836         kvm_get_bus(kvm, bus_idx)->ioeventfd_count++;
 837         list_add_tail(&p->list, &kvm->ioeventfds);
 838
 839         mutex_unlock(&kvm->slots_lock);
 840
 841         return 0;
 842
 843 unlock_fail:
 844         mutex_unlock(&kvm->slots_lock);
 845
 846 fail:
 847         kfree(p);
 848         eventfd_ctx_put(eventfd);
 849
 850         return ret;
 851 }
 852
 853 static int
 854 kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
 855                            struct kvm_ioeventfd *args)
 856 {
 857         struct _ioeventfd        *p, *tmp;
 858         struct eventfd_ctx       *eventfd;
 859         struct kvm_io_bus        *bus;
 860         int                       ret = -ENOENT;
 861         bool                      wildcard;
 862
 863         eventfd = eventfd_ctx_fdget(args->fd);
 864         if (IS_ERR(eventfd))
 865                 return PTR_ERR(eventfd);
 866
 867         wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);
 868
 869         mutex_lock(&kvm->slots_lock);
 870
 871         list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) {
 872
 873                 if (p->bus_idx != bus_idx ||
 874                     p->eventfd != eventfd  ||
 875                     p->addr != args->addr  ||
 876                     p->length != args->len ||
 877                     p->wildcard != wildcard)
 878                         continue;
 879
 880                 if (!p->wildcard && p->datamatch != args->datamatch)
 881                         continue;
 882
 883                 kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
 884                 bus = kvm_get_bus(kvm, bus_idx);
 885                 if (bus)
 886                         bus->ioeventfd_count--;
 887                 ioeventfd_release(p);
 888                 ret = 0;
 889                 break;
 890         }
 891
 892         mutex_unlock(&kvm->slots_lock);
 893
 894         eventfd_ctx_put(eventfd);
 895
 896         return ret;
 897 }
 898
 899 static int kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 900 {
 901         enum kvm_bus bus_idx = ioeventfd_bus_from_flags(args->flags);
 902         int ret = kvm_deassign_ioeventfd_idx(kvm, bus_idx, args);
 903
 904         if (!args->len && bus_idx == KVM_MMIO_BUS)
 905                 kvm_deassign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args);
 906
 907         return ret;
 908 }
 909
 910 static int
 911 kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 912 {
 913         enum kvm_bus              bus_idx;
 914         int ret;
 915
 916         bus_idx = ioeventfd_bus_from_flags(args->flags);
 917         /* must be natural-word sized, or 0 to ignore length */
 918         switch (args->len) {
 919         case 0:
 920         case 1:
 921         case 2:
 922         case 4:
 923         case 8:
 924                 break;
 925         default:
 926                 return -EINVAL;
 927         }
 928
 929         /* check for range overflow */
 930         if (args->addr + args->len < args->addr)
 931                 return -EINVAL;
 932
 933         /* check for extra flags that we don't understand */
 934         if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
 935                 return -EINVAL;
 936
 937         /* ioeventfd with no length can't be combined with DATAMATCH */
 938         if (!args->len && (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH))
 939                 return -EINVAL;
 940
 941         ret = kvm_assign_ioeventfd_idx(kvm, bus_idx, args);
 942         if (ret)
 943                 goto fail;
 944
 945         /* When length is ignored, MMIO is also put on a separate bus, for
 946          * faster lookups.
 947          */
 948         if (!args->len && bus_idx == KVM_MMIO_BUS) {
 949                 ret = kvm_assign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args);
 950                 if (ret < 0)
 951                         goto fast_fail;
 952         }
 953
 954         return 0;
 955
 956 fast_fail:
 957         kvm_deassign_ioeventfd_idx(kvm, bus_idx, args);
 958 fail:
 959         return ret;
 960 }
 961
 962 int
 963 kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 964 {
 965         if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN)
 966                 return kvm_deassign_ioeventfd(kvm, args);
 967
 968         return kvm_assign_ioeventfd(kvm, args);
 969 }