Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 95519bc959edcaaefe27d1fd9e1d7a51aa52b6e2..384eaa7b02fa993f77981d6c91e6dd7f039c1f34 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
 
 #include <asm/processor.h>
 #include <asm/io.h>
+#include <asm/ioctl.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
 #include "coalesced_mmio.h"
 #include "async_pf.h"
+#include "vfio.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/kvm.h>
@@ -95,8 +97,6 @@ static int hardware_enable_all(void);
 static void hardware_disable_all(void);
 
 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
-static void update_memslots(struct kvm_memslots *slots,
-                           struct kvm_memory_slot *new, u64 last_generation);
 
 static void kvm_release_pfn_dirty(pfn_t pfn);
 static void mark_page_dirty_in_slot(struct kvm *kvm,
@@ -129,7 +129,8 @@ int vcpu_load(struct kvm_vcpu *vcpu)
                struct pid *oldpid = vcpu->pid;
                struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
                rcu_assign_pointer(vcpu->pid, newpid);
-               synchronize_rcu();
+               if (oldpid)
+                       synchronize_rcu();
                put_pid(oldpid);
        }
        cpu = get_cpu();
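
The hunk above skips the RCU grace period when there is no previous pid to
retire, i.e. on the first vcpu_load() of a vcpu. The grace period exists for
lockless readers of vcpu->pid; a hedged sketch of such a reader, modeled on
kvm_vcpu_yield_to() elsewhere in this file:

        /* Sketch of the reader side that synchronize_rcu() protects:
         * only once no reader can still hold the old pid is put_pid() safe.
         */
        struct pid *pid;
        struct task_struct *task = NULL;

        rcu_read_lock();
        pid = rcu_dereference(vcpu->pid);
        if (pid)
                task = get_pid_task(pid, PIDTYPE_PID);
        rcu_read_unlock();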
@@ -152,7 +153,7 @@ static void ack_flush(void *_completed)
 {
 }
 
-static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
+bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
 {
        int i, cpu, me;
        cpumask_var_t cpus;
@@ -189,7 +190,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
        long dirty_count = kvm->tlbs_dirty;
 
        smp_mb();
-       if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
+       if (kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
                ++kvm->stat.remote_tlb_flush;
        cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
 }
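
The rename aside, the flush protocol here is unchanged: tlbs_dirty is
snapshotted before the flush, and cmpxchg() clears it only if no vcpu dirtied
a TLB in the meantime, so a concurrent writer's mark is never lost. A hedged
sketch of the guard (flush_all_tlbs() is a hypothetical stand-in for the
request broadcast):

        long dirty_count = kvm->tlbs_dirty;    /* snapshot before flushing */

        smp_mb();                              /* order against setters    */
        flush_all_tlbs(kvm);                   /* hypothetical stand-in    */

        /* Clear only if the snapshot is still current; otherwise keep the
         * newer count so the next flush observes it.
         */
        cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);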
@@ -197,17 +198,17 @@ EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
 
 void kvm_reload_remote_mmus(struct kvm *kvm)
 {
-       make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
+       kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
 }
 
 void kvm_make_mclock_inprogress_request(struct kvm *kvm)
 {
-       make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
+       kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
 }
 
 void kvm_make_scan_ioapic_request(struct kvm *kvm)
 {
-       make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
+       kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
 }
 
 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
@@ -295,6 +296,9 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
                kvm_flush_remote_tlbs(kvm);
 
        spin_unlock(&kvm->mmu_lock);
+
+       kvm_arch_mmu_notifier_invalidate_page(kvm, address);
+
        srcu_read_unlock(&kvm->srcu, idx);
 }
 
@@ -368,7 +372,8 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 
 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
                                              struct mm_struct *mm,
-                                             unsigned long address)
+                                             unsigned long start,
+                                             unsigned long end)
 {
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
        int young, idx;
@@ -376,7 +381,7 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
        idx = srcu_read_lock(&kvm->srcu);
        spin_lock(&kvm->mmu_lock);
 
-       young = kvm_age_hva(kvm, address);
+       young = kvm_age_hva(kvm, start, end);
        if (young)
                kvm_flush_remote_tlbs(kvm);
 
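
This widens page ageing from a single address to a range, matching what the
mmu_notifier now hands down; every arch implementation of kvm_age_hva()
changes signature the same way. The shape of the change, as a sketch (not any
particular arch's code):

        /* before: test and clear the young/accessed state for one hva */
        int kvm_age_hva(struct kvm *kvm, unsigned long hva);

        /* after: do the same for every gfn backed by [start, end) */
        int kvm_age_hva(struct kvm *kvm, unsigned long start,
                        unsigned long end);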
@@ -476,6 +481,13 @@ static struct kvm *kvm_create_vm(unsigned long type)
        kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
        if (!kvm->memslots)
                goto out_err_no_srcu;
+
+       /*
+        * Init kvm generation close to the maximum to easily test the
+        * code that handles generation number wrap-around.
+        */
+       kvm->memslots->generation = -150;
+
        kvm_init_memslots_id(kvm);
        if (init_srcu_struct(&kvm->srcu))
                goto out_err_no_srcu;
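
Starting the generation at -150, near the top of the counter's unsigned
range, makes it wrap to zero after only a few dozen memslot updates, so the
wrap-around handling is exercised early in every VM's life instead of almost
never. A small user-space illustration of the arithmetic (assuming a 64-bit
generation, as in this file):

        #include <stdio.h>
        #include <stdint.h>

        int main(void)
        {
                uint64_t gen = (uint64_t)-150;  /* 2^64 - 150 */

                printf("initial generation: %llu\n", (unsigned long long)gen);

                /* install_new_memslots() below bumps the generation twice
                 * per update, so ~75 updates suffice to wrap.
                 */
                gen += 150;
                printf("after ~75 updates:  %llu\n", (unsigned long long)gen);
                return 0;
        }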
@@ -687,8 +699,7 @@ static void sort_memslots(struct kvm_memslots *slots)
 }
 
 static void update_memslots(struct kvm_memslots *slots,
-                           struct kvm_memory_slot *new,
-                           u64 last_generation)
+                           struct kvm_memory_slot *new)
 {
        if (new) {
                int id = new->id;
@@ -699,15 +710,13 @@ static void update_memslots(struct kvm_memslots *slots,
                if (new->npages != npages)
                        sort_memslots(slots);
        }
-
-       slots->generation = last_generation + 1;
 }
 
 static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
 {
        u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
 
-#ifdef KVM_CAP_READONLY_MEM
+#ifdef __KVM_HAVE_READONLY_MEM
        valid_flags |= KVM_MEM_READONLY;
 #endif
 
@@ -722,10 +731,24 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
 {
        struct kvm_memslots *old_memslots = kvm->memslots;
 
-       update_memslots(slots, new, kvm->memslots->generation);
+       /*
+        * Set the low bit in the generation, which disables SPTE caching
+        * until the end of synchronize_srcu_expedited.
+        */
+       WARN_ON(old_memslots->generation & 1);
+       slots->generation = old_memslots->generation + 1;
+
+       update_memslots(slots, new);
        rcu_assign_pointer(kvm->memslots, slots);
        synchronize_srcu_expedited(&kvm->srcu);
 
+       /*
+        * Increment the new memslot generation a second time. This prevents
+        * vm exits that race with memslot updates from caching a memslot
+        * generation that will (potentially) be valid forever.
+        */
+       slots->generation++;
+
        kvm_arch_memslots_updated(kvm);
 
        return old_memslots;
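
The odd/even protocol gives readers a cheap validity test: a generation
snapshotted while the low bit is set can never equal a post-update (even)
generation, and the second increment after synchronize_srcu_expedited()
closes the race described in the comment above. A hedged sketch of a consumer
check, modeled on KVM's MMIO spte generation caching (function name
hypothetical):

        static bool memslots_generation_stable(struct kvm_memslots *slots,
                                               u64 cached)
        {
                u64 gen = ACCESS_ONCE(slots->generation);

                /* odd: a memslot update is in flight, do not cache */
                if (gen & 1)
                        return false;

                /* even but different: the cached snapshot is stale */
                return gen == cached;
        }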
@@ -776,7 +799,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
        base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
        npages = mem->memory_size >> PAGE_SHIFT;
 
-       r = -EINVAL;
        if (npages > KVM_MEM_MAX_NR_PAGES)
                goto out;
 
@@ -790,7 +812,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
        new.npages = npages;
        new.flags = mem->flags;
 
-       r = -EINVAL;
        if (npages) {
                if (!old.npages)
                        change = KVM_MR_CREATE;
@@ -846,7 +867,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
        }
 
        if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) {
-               r = -ENOMEM;
                slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
                                GFP_KERNEL);
                if (!slots)
@@ -1075,9 +1095,9 @@ EXPORT_SYMBOL_GPL(gfn_to_hva);
  * If writable is set to false, the hva returned by this function is only
  * allowed to be read.
  */
-unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
+unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
+                                     gfn_t gfn, bool *writable)
 {
-       struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
        unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
 
        if (!kvm_is_error_hva(hva) && writable)
@@ -1086,6 +1106,13 @@ unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
        return hva;
 }
 
+unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
+{
+       struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+
+       return gfn_to_hva_memslot_prot(slot, gfn, writable);
+}
+
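
Factoring the slot lookup out lets callers that already hold a memslot, e.g.
while iterating over slots, avoid a second gfn_to_memslot() search;
gfn_to_hva_prot() becomes a thin wrapper. A hedged usage sketch:

        /* Hypothetical caller that has already resolved the slot: */
        bool writable;
        unsigned long hva;

        hva = gfn_to_hva_memslot_prot(slot, gfn, &writable);
        if (kvm_is_error_hva(hva))
                return -EFAULT;
        /* the hva may only be read unless writable came back true */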
 static int kvm_read_hva(void *data, void __user *hva, int len)
 {
        return __copy_from_user(data, hva, len);
@@ -1107,6 +1134,43 @@ static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
        return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
 }
 
+int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm,
+                        unsigned long addr, bool write_fault,
+                        struct page **pagep)
+{
+       int npages;
+       int locked = 1;
+       int flags = FOLL_TOUCH | FOLL_HWPOISON |
+                   (pagep ? FOLL_GET : 0) |
+                   (write_fault ? FOLL_WRITE : 0);
+
+       /*
+        * If retrying the fault, we get here *not* having allowed the filemap
+        * to wait on the page lock. We should now allow waiting on the IO with
+        * the mmap semaphore released.
+        */
+       down_read(&mm->mmap_sem);
+       npages = __get_user_pages(tsk, mm, addr, 1, flags, pagep, NULL,
+                                 &locked);
+       if (!locked) {
+               VM_BUG_ON(npages);
+
+               if (!pagep)
+                       return 0;
+
+               /*
+                * The previous call has now waited on the IO. Now we can
+                * retry and complete. Pass TRIED to ensure we do not
+                * reschedule async IO (see e.g. filemap_fault).
+                */
+               down_read(&mm->mmap_sem);
+               npages = __get_user_pages(tsk, mm, addr, 1, flags | FOLL_TRIED,
+                                         pagep, NULL, NULL);
+       }
+       up_read(&mm->mmap_sem);
+       return npages;
+}
+
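
The two-pass structure follows the VM_FAULT_RETRY contract: the first
__get_user_pages() may drop mmap_sem (locked becomes 0) to sleep on page IO;
the retry then adds FOLL_TRIED so the fault path (e.g. filemap_fault)
completes synchronously rather than kicking off async IO again. A hedged
caller sketch:

        /* Hypothetical caller: resolve one user page for write, letting
         * the fault path release mmap_sem while it waits for IO.
         */
        struct page *page;
        int npages;

        npages = kvm_get_user_page_io(current, current->mm, addr,
                                      true /* write_fault */, &page);
        if (npages == 1) {
                /* success: FOLL_GET means we hold a reference */
                put_page(page);
        }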
 static inline int check_user_page_hwpoison(unsigned long addr)
 {
        int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
@@ -1169,9 +1233,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
                npages = get_user_page_nowait(current, current->mm,
                                              addr, write_fault, page);
                up_read(&current->mm->mmap_sem);
-       } else
-               npages = get_user_pages_fast(addr, 1, write_fault,
-                                            page);
+       } else {
+               /*
+                * By now we have tried gup_fast, and possibly async_pf, and we
+                * are certainly not atomic. Time to retry the gup, allowing
+                * mmap semaphore to be relinquished in the case of IO.
+                */
+               npages = kvm_get_user_page_io(current, current->mm, addr,
+                                             write_fault, page);
+       }
        if (npages != 1)
                return npages;
 
@@ -1768,8 +1838,7 @@ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
        bool eligible;
 
        eligible = !vcpu->spin_loop.in_spin_loop ||
-                       (vcpu->spin_loop.in_spin_loop &&
-                        vcpu->spin_loop.dy_eligible);
+                   vcpu->spin_loop.dy_eligible;
 
        if (vcpu->spin_loop.in_spin_loop)
                kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
@@ -1975,6 +2044,9 @@ static long kvm_vcpu_ioctl(struct file *filp,
        if (vcpu->kvm->mm != current->mm)
                return -EIO;
 
+       if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
+               return -EINVAL;
+
 #if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
        /*
         * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
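
Rejecting ioctls whose type byte is not KVMIO screens out stray ioctls aimed
at other subsystems before any vcpu state is touched; every KVM ioctl number
is built with _IO*(KVMIO, ...), so the check is exact. A small user-space
illustration of the encoding:

        #include <stdio.h>
        #include <linux/kvm.h>  /* KVMIO and the KVM_* ioctl numbers */

        int main(void)
        {
                /* KVM_RUN is _IO(KVMIO, 0x80); its type byte decodes
                 * back to KVMIO (0xAE).
                 */
                printf("_IOC_TYPE(KVM_RUN) = %#x, KVMIO = %#x\n",
                       _IOC_TYPE(KVM_RUN), KVMIO);
                return 0;
        }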
@@ -2259,6 +2331,29 @@ struct kvm_device *kvm_device_from_filp(struct file *filp)
        return filp->private_data;
 }
 
+static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
+#ifdef CONFIG_KVM_MPIC
+       [KVM_DEV_TYPE_FSL_MPIC_20]      = &kvm_mpic_ops,
+       [KVM_DEV_TYPE_FSL_MPIC_42]      = &kvm_mpic_ops,
+#endif
+
+#ifdef CONFIG_KVM_XICS
+       [KVM_DEV_TYPE_XICS]             = &kvm_xics_ops,
+#endif
+};
+
+int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type)
+{
+       if (type >= ARRAY_SIZE(kvm_device_ops_table))
+               return -ENOSPC;
+
+       if (kvm_device_ops_table[type] != NULL)
+               return -EEXIST;
+
+       kvm_device_ops_table[type] = ops;
+       return 0;
+}
+
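
With the table in place, device backends register themselves rather than
being enumerated in a switch; the kvm_vfio_ops_init() call added to
kvm_init() at the end of this patch does exactly that for VFIO. Its likely
shape, as a sketch (the real definition lives in vfio.c, declared via the new
vfio.h include above):

        int kvm_vfio_ops_init(void)
        {
                return kvm_register_device_ops(&kvm_vfio_ops,
                                               KVM_DEV_TYPE_VFIO);
        }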
 static int kvm_ioctl_create_device(struct kvm *kvm,
                                   struct kvm_create_device *cd)
 {
@@ -2267,36 +2362,12 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
        bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
        int ret;
 
-       switch (cd->type) {
-#ifdef CONFIG_KVM_MPIC
-       case KVM_DEV_TYPE_FSL_MPIC_20:
-       case KVM_DEV_TYPE_FSL_MPIC_42:
-               ops = &kvm_mpic_ops;
-               break;
-#endif
-#ifdef CONFIG_KVM_XICS
-       case KVM_DEV_TYPE_XICS:
-               ops = &kvm_xics_ops;
-               break;
-#endif
-#ifdef CONFIG_KVM_VFIO
-       case KVM_DEV_TYPE_VFIO:
-               ops = &kvm_vfio_ops;
-               break;
-#endif
-#ifdef CONFIG_KVM_ARM_VGIC
-       case KVM_DEV_TYPE_ARM_VGIC_V2:
-               ops = &kvm_arm_vgic_v2_ops;
-               break;
-#endif
-#ifdef CONFIG_S390
-       case KVM_DEV_TYPE_FLIC:
-               ops = &kvm_flic_ops;
-               break;
-#endif
-       default:
+       if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
+               return -ENODEV;
+
+       ops = kvm_device_ops_table[cd->type];
+       if (ops == NULL)
                return -ENODEV;
-       }
 
        if (test)
                return 0;
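
The switch collapses to a bounds check plus table lookup, and unknown or
unconfigured types still return -ENODEV. From user space, the
KVM_CREATE_DEVICE_TEST flag probes for support without creating anything; a
hedged sketch (vm_fd is assumed to be an open VM file descriptor):

        #include <sys/ioctl.h>
        #include <linux/kvm.h>

        /* Returns nonzero if the VM supports this in-kernel device type. */
        static int kvm_device_supported(int vm_fd, __u32 type)
        {
                struct kvm_create_device cd = {
                        .type  = type,
                        .flags = KVM_CREATE_DEVICE_TEST, /* validate only */
                };

                return ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) == 0;
        }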
@@ -2611,7 +2682,6 @@ static long kvm_dev_ioctl(struct file *filp,
 
        switch (ioctl) {
        case KVM_GET_API_VERSION:
-               r = -EINVAL;
                if (arg)
                        goto out;
                r = KVM_API_VERSION;
@@ -2623,7 +2693,6 @@ static long kvm_dev_ioctl(struct file *filp,
                r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
                break;
        case KVM_GET_VCPU_MMAP_SIZE:
-               r = -EINVAL;
                if (arg)
                        goto out;
                r = PAGE_SIZE;     /* struct kvm_run */
@@ -2668,7 +2737,7 @@ static void hardware_enable_nolock(void *junk)
 
        cpumask_set_cpu(cpu, cpus_hardware_enabled);
 
-       r = kvm_arch_hardware_enable(NULL);
+       r = kvm_arch_hardware_enable();
 
        if (r) {
                cpumask_clear_cpu(cpu, cpus_hardware_enabled);
@@ -2693,7 +2762,7 @@ static void hardware_disable_nolock(void *junk)
        if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
                return;
        cpumask_clear_cpu(cpu, cpus_hardware_enabled);
-       kvm_arch_hardware_disable(NULL);
+       kvm_arch_hardware_disable();
 }
 
 static void hardware_disable(void)
@@ -3123,6 +3192,8 @@ static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
        if (vcpu->preempted)
                vcpu->preempted = false;
 
+       kvm_arch_sched_in(vcpu, cpu);
+
        kvm_arch_vcpu_load(vcpu, cpu);
 }
 
@@ -3214,6 +3285,9 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
                goto out_undebugfs;
        }
 
+       r = kvm_vfio_ops_init();
+       WARN_ON(r);
+
        return 0;
 
 out_undebugfs: