Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 95519bc959edcaaefe27d1fd9e1d7a51aa52b6e2..384eaa7b02fa993f77981d6c91e6dd7f039c1f34 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
 
 #include <asm/processor.h>
 #include <asm/io.h>
+#include <asm/ioctl.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
 #include "coalesced_mmio.h"
 #include "async_pf.h"
+#include "vfio.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/kvm.h>
@@ -95,8 +97,6 @@ static int hardware_enable_all(void);
 static void hardware_disable_all(void);
 
 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
-static void update_memslots(struct kvm_memslots *slots,
-                           struct kvm_memory_slot *new, u64 last_generation);
 
 static void kvm_release_pfn_dirty(pfn_t pfn);
 static void mark_page_dirty_in_slot(struct kvm *kvm,
@@ -129,7 +129,8 @@ int vcpu_load(struct kvm_vcpu *vcpu)
                struct pid *oldpid = vcpu->pid;
                struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
                rcu_assign_pointer(vcpu->pid, newpid);
-               synchronize_rcu();
+               if (oldpid)
+                       synchronize_rcu();
                put_pid(oldpid);
        }
        cpu = get_cpu();
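
The hunk above skips the RCU grace period when there is no previous pid to
retire, i.e. on the first vcpu_load() of a vcpu. The grace period exists for
lockless readers of vcpu->pid; a hedged sketch of such a reader, modeled on
kvm_vcpu_yield_to() elsewhere in this file:

        /* Sketch of the reader side that synchronize_rcu() protects:
         * only once no reader can still hold the old pid is put_pid() safe.
         */
        struct pid *pid;
        struct task_struct *task = NULL;

        rcu_read_lock();
        pid = rcu_dereference(vcpu->pid);
        if (pid)
                task = get_pid_task(pid, PIDTYPE_PID);
        rcu_read_unlock();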
@@ -152,7 +153,7 @@ static void ack_flush(void *_completed)
 {
 }
 
-static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
+bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
 {
        int i, cpu, me;
        cpumask_var_t cpus;
@@ -189,7 +190,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
        long dirty_count = kvm->tlbs_dirty;
 
        smp_mb();
-       if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
+       if (kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
                ++kvm->stat.remote_tlb_flush;
        cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
 }
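
The rename aside, the flush protocol here is unchanged: tlbs_dirty is
snapshotted before the flush, and cmpxchg() clears it only if no vcpu dirtied
a TLB in the meantime, so a concurrent writer's mark is never lost. A hedged
sketch of the guard (flush_all_tlbs() is a hypothetical stand-in for the
request broadcast):

        long dirty_count = kvm->tlbs_dirty;    /* snapshot before flushing */

        smp_mb();                              /* order against setters    */
        flush_all_tlbs(kvm);                   /* hypothetical stand-in    */

        /* Clear only if the snapshot is still current; otherwise keep the
         * newer count so the next flush observes it.
         */
        cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);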
@@ -197,17 +198,17 @@ EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
 
 void kvm_reload_remote_mmus(struct kvm *kvm)
 {
-       make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
+       kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
 }
 
 void kvm_make_mclock_inprogress_request(struct kvm *kvm)
 {
-       make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
+       kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
 }
 
 void kvm_make_scan_ioapic_request(struct kvm *kvm)
 {
-       make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
+       kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
 }
 
 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
@@ -295,6 +296,9 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
                kvm_flush_remote_tlbs(kvm);
 
        spin_unlock(&kvm->mmu_lock);
+
+       kvm_arch_mmu_notifier_invalidate_page(kvm, address);
+
        srcu_read_unlock(&kvm->srcu, idx);
 }
 
@@ -368,7 +372,8 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 
 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
                                              struct mm_struct *mm,
-                                             unsigned long address)
+                                             unsigned long start,
+                                             unsigned long end)
 {
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
        int young, idx;
@@ -376,7 +381,7 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
        idx = srcu_read_lock(&kvm->srcu);
        spin_lock(&kvm->mmu_lock);
 
-       young = kvm_age_hva(kvm, address);
+       young = kvm_age_hva(kvm, start, end);
        if (young)
                kvm_flush_remote_tlbs(kvm);
 
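
This widens page ageing from a single address to a range, matching what the
mmu_notifier now hands down; every arch implementation of kvm_age_hva()
changes signature the same way. The shape of the change, as a sketch (not any
particular arch's code):

        /* before: test and clear the young/accessed state for one hva */
        int kvm_age_hva(struct kvm *kvm, unsigned long hva);

        /* after: do the same for every gfn backed by [start, end) */
        int kvm_age_hva(struct kvm *kvm, unsigned long start,
                        unsigned long end);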
@@ -476,6 +481,13 @@ static struct kvm *kvm_create_vm(unsigned long type)
        kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
        if (!kvm->memslots)
                goto out_err_no_srcu;
+
+       /*
+        * Init kvm generation close to the maximum to easily test the
+        * code that handles generation number wrap-around.
+        */
+       kvm->memslots->generation = -150;
+
        kvm_init_memslots_id(kvm);
        if (init_srcu_struct(&kvm->srcu))
                goto out_err_no_srcu;
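
Starting the generation at -150, near the top of the counter's unsigned
range, makes it wrap to zero after only a few dozen memslot updates, so the
wrap-around handling is exercised early in every VM's life instead of almost
never. A small user-space illustration of the arithmetic (assuming a 64-bit
generation, as in this file):

        #include <stdio.h>
        #include <stdint.h>

        int main(void)
        {
                uint64_t gen = (uint64_t)-150;  /* 2^64 - 150 */

                printf("initial generation: %llu\n", (unsigned long long)gen);

                /* install_new_memslots() below bumps the generation twice
                 * per update, so ~75 updates suffice to wrap.
                 */
                gen += 150;
                printf("after ~75 updates:  %llu\n", (unsigned long long)gen);
                return 0;
        }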
@@ -687,8 +699,7 @@ static void sort_memslots(struct kvm_memslots *slots)
 }
 
 static void update_memslots(struct kvm_memslots *slots,
-                           struct kvm_memory_slot *new,
-                           u64 last_generation)
+                           struct kvm_memory_slot *new)
 {
        if (new) {
                int id = new->id;
@@ -699,15 +710,13 @@ static void update_memslots(struct kvm_memslots *slots,
                if (new->npages != npages)
                        sort_memslots(slots);
        }
-
-       slots->generation = last_generation + 1;
 }
 
 static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
 {
        u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
 
-#ifdef KVM_CAP_READONLY_MEM
+#ifdef __KVM_HAVE_READONLY_MEM
        valid_flags |= KVM_MEM_READONLY;
 #endif
 
@@ -722,10 +731,24 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
 {
        struct kvm_memslots *old_memslots = kvm->memslots;
 
-       update_memslots(slots, new, kvm->memslots->generation);
+       /*
+        * Set the low bit in the generation, which disables SPTE caching
+        * until the end of synchronize_srcu_expedited.
+        */
+       WARN_ON(old_memslots->generation & 1);
+       slots->generation = old_memslots->generation + 1;
+
+       update_memslots(slots, new);
        rcu_assign_pointer(kvm->memslots, slots);
        synchronize_srcu_expedited(&kvm->srcu);
 
+       /*
+        * Increment the new memslot generation a second time. This prevents
+        * vm exits that race with memslot updates from caching a memslot
+        * generation that will (potentially) be valid forever.
+        */
+       slots->generation++;
+
        kvm_arch_memslots_updated(kvm);
 
        return old_memslots;
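
The odd/even protocol gives readers a cheap validity test: a generation
snapshotted while the low bit is set can never equal a post-update (even)
generation, and the second increment after synchronize_srcu_expedited()
closes the race described in the comment above. A hedged sketch of a consumer
check, modeled on KVM's MMIO spte generation caching (function name
hypothetical):

        static bool memslots_generation_stable(struct kvm_memslots *slots,
                                               u64 cached)
        {
                u64 gen = ACCESS_ONCE(slots->generation);

                /* odd: a memslot update is in flight, do not cache */
                if (gen & 1)
                        return false;

                /* even but different: the cached snapshot is stale */
                return gen == cached;
        }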
@@ -776,7 +799,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
        base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
        npages = mem->memory_size >> PAGE_SHIFT;
 
-       r = -EINVAL;
        if (npages > KVM_MEM_MAX_NR_PAGES)
                goto out;
 
@@ -790,7 +812,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
        new.npages = npages;
        new.flags = mem->flags;
 
-       r = -EINVAL;
        if (npages) {
                if (!old.npages)
                        change = KVM_MR_CREATE;
@@ -846,7 +867,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
        }
 
        if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) {
-               r = -ENOMEM;
                slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
                                GFP_KERNEL);
                if (!slots)
@@ -1075,9 +1095,9 @@ EXPORT_SYMBOL_GPL(gfn_to_hva);
  * If writable is set to false, the hva returned by this function is only
  * allowed to be read.
  */
-unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
+unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
+                                     gfn_t gfn, bool *writable)
 {
-       struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
        unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
 
        if (!kvm_is_error_hva(hva) && writable)
@@ -1086,6 +1106,13 @@ unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
        return hva;
 }
 
+unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
+{
+       struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+
+       return gfn_to_hva_memslot_prot(slot, gfn, writable);
+}
+
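
Factoring the slot lookup out lets callers that already hold a memslot, e.g.
while iterating over slots, avoid a second gfn_to_memslot() search;
gfn_to_hva_prot() becomes a thin wrapper. A hedged usage sketch:

        /* Hypothetical caller that has already resolved the slot: */
        bool writable;
        unsigned long hva;

        hva = gfn_to_hva_memslot_prot(slot, gfn, &writable);
        if (kvm_is_error_hva(hva))
                return -EFAULT;
        /* the hva may only be read unless writable came back true */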
 static int kvm_read_hva(void *data, void __user *hva, int len)
 {
        return __copy_from_user(data, hva, len);
@@ -1107,6 +1134,43 @@ static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
        return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
 }
 
+int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm,
+                        unsigned long addr, bool write_fault,
+                        struct page **pagep)
+{
+       int npages;
+       int locked = 1;
+       int flags = FOLL_TOUCH | FOLL_HWPOISON |
+                   (pagep ? FOLL_GET : 0) |
+                   (write_fault ? FOLL_WRITE : 0);
+
+       /*
+        * If retrying the fault, we get here *not* having allowed the filemap
+        * to wait on the page lock. We should now allow waiting on the IO with
+        * the mmap semaphore released.
+        */
+       down_read(&mm->mmap_sem);
+       npages = __get_user_pages(tsk, mm, addr, 1, flags, pagep, NULL,
+                                 &locked);
+       if (!locked) {
+               VM_BUG_ON(npages);
+
+               if (!pagep)
+                       return 0;
+
+               /*
+                * The previous call has now waited on the IO. Now we can
+                * retry and complete. Pass TRIED to ensure we do not
+                * reschedule async IO (see e.g. filemap_fault).
+                */
+               down_read(&mm->mmap_sem);
+               npages = __get_user_pages(tsk, mm, addr, 1, flags | FOLL_TRIED,
+                                         pagep, NULL, NULL);
+       }
+       up_read(&mm->mmap_sem);
+       return npages;
+}
+
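
The two-pass structure follows the VM_FAULT_RETRY contract: the first
__get_user_pages() may drop mmap_sem (locked becomes 0) to sleep on page IO;
the retry then adds FOLL_TRIED so the fault path (e.g. filemap_fault)
completes synchronously rather than kicking off async IO again. A hedged
caller sketch:

        /* Hypothetical caller: resolve one user page for write, letting
         * the fault path release mmap_sem while it waits for IO.
         */
        struct page *page;
        int npages;

        npages = kvm_get_user_page_io(current, current->mm, addr,
                                      true /* write_fault */, &page);
        if (npages == 1) {
                /* success: FOLL_GET means we hold a reference */
                put_page(page);
        }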
 static inline int check_user_page_hwpoison(unsigned long addr)
 {
        int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
@@ -1169,9 +1233,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
                npages = get_user_page_nowait(current, current->mm,
                                              addr, write_fault, page);
                up_read(&current->mm->mmap_sem);
-       } else
-               npages = get_user_pages_fast(addr, 1, write_fault,
-                                            page);
+       } else {
+               /*
+                * By now we have tried gup_fast, and possibly async_pf, and we
+                * are certainly not atomic. Time to retry the gup, allowing
+                * mmap semaphore to be relinquished in the case of IO.
+                */
+               npages = kvm_get_user_page_io(current, current->mm, addr,
+                                             write_fault, page);
+       }
        if (npages != 1)
                return npages;
 
@@ -1768,8 +1838,7 @@ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
        bool eligible;
 
        eligible = !vcpu->spin_loop.in_spin_loop ||
-                       (vcpu->spin_loop.in_spin_loop &&
-                        vcpu->spin_loop.dy_eligible);
+                   vcpu->spin_loop.dy_eligible;
 
        if (vcpu->spin_loop.in_spin_loop)
                kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
@@ -1975,6 +2044,9 @@ static long kvm_vcpu_ioctl(struct file *filp,
        if (vcpu->kvm->mm != current->mm)
                return -EIO;
 
+       if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
+               return -EINVAL;
+
 #if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
        /*
         * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
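
Rejecting ioctls whose type byte is not KVMIO screens out stray ioctls aimed
at other subsystems before any vcpu state is touched; every KVM ioctl number
is built with _IO*(KVMIO, ...), so the check is exact. A small user-space
illustration of the encoding:

        #include <stdio.h>
        #include <linux/kvm.h>  /* KVMIO and the KVM_* ioctl numbers */

        int main(void)
        {
                /* KVM_RUN is _IO(KVMIO, 0x80); its type byte decodes
                 * back to KVMIO (0xAE).
                 */
                printf("_IOC_TYPE(KVM_RUN) = %#x, KVMIO = %#x\n",
                       _IOC_TYPE(KVM_RUN), KVMIO);
                return 0;
        }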
@@ -2259,6 +2331,29 @@ struct kvm_device *kvm_device_from_filp(struct file *filp)
        return filp->private_data;
 }
 
+static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
+#ifdef CONFIG_KVM_MPIC
+       [KVM_DEV_TYPE_FSL_MPIC_20]      = &kvm_mpic_ops,
+       [KVM_DEV_TYPE_FSL_MPIC_42]      = &kvm_mpic_ops,
+#endif
+
+#ifdef CONFIG_KVM_XICS
+       [KVM_DEV_TYPE_XICS]             = &kvm_xics_ops,
+#endif
+};
+
+int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type)
+{
+       if (type >= ARRAY_SIZE(kvm_device_ops_table))
+               return -ENOSPC;
+
+       if (kvm_device_ops_table[type] != NULL)
+               return -EEXIST;
+
+       kvm_device_ops_table[type] = ops;
+       return 0;
+}
+
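
With the table in place, device backends register themselves rather than
being enumerated in a switch; the kvm_vfio_ops_init() call added to
kvm_init() at the end of this patch does exactly that for VFIO. Its likely
shape, as a sketch (the real definition lives in vfio.c, declared via the new
vfio.h include above):

        int kvm_vfio_ops_init(void)
        {
                return kvm_register_device_ops(&kvm_vfio_ops,
                                               KVM_DEV_TYPE_VFIO);
        }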
 static int kvm_ioctl_create_device(struct kvm *kvm,
                                   struct kvm_create_device *cd)
 {
@@ -2267,36 +2362,12 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
        bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
        int ret;
 
-       switch (cd->type) {
-#ifdef CONFIG_KVM_MPIC
-       case KVM_DEV_TYPE_FSL_MPIC_20:
-       case KVM_DEV_TYPE_FSL_MPIC_42:
-               ops = &kvm_mpic_ops;
-               break;
-#endif
-#ifdef CONFIG_KVM_XICS
-       case KVM_DEV_TYPE_XICS:
-               ops = &kvm_xics_ops;
-               break;
-#endif
-#ifdef CONFIG_KVM_VFIO
-       case KVM_DEV_TYPE_VFIO:
-               ops = &kvm_vfio_ops;
-               break;
-#endif
-#ifdef CONFIG_KVM_ARM_VGIC
-       case KVM_DEV_TYPE_ARM_VGIC_V2:
-               ops = &kvm_arm_vgic_v2_ops;
-               break;
-#endif
-#ifdef CONFIG_S390
-       case KVM_DEV_TYPE_FLIC:
-               ops = &kvm_flic_ops;
-               break;
-#endif
-       default:
+       if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
+               return -ENODEV;
+
+       ops = kvm_device_ops_table[cd->type];
+       if (ops == NULL)
                return -ENODEV;
-       }
 
        if (test)
                return 0;
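
The switch collapses to a bounds check plus table lookup, and unknown or
unconfigured types still return -ENODEV. From user space, the
KVM_CREATE_DEVICE_TEST flag probes for support without creating anything; a
hedged sketch (vm_fd is assumed to be an open VM file descriptor):

        #include <sys/ioctl.h>
        #include <linux/kvm.h>

        /* Returns nonzero if the VM supports this in-kernel device type. */
        static int kvm_device_supported(int vm_fd, __u32 type)
        {
                struct kvm_create_device cd = {
                        .type  = type,
                        .flags = KVM_CREATE_DEVICE_TEST, /* validate only */
                };

                return ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) == 0;
        }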
@@ -2611,7 +2682,6 @@ static long kvm_dev_ioctl(struct file *filp,
 
        switch (ioctl) {
        case KVM_GET_API_VERSION:
-               r = -EINVAL;
                if (arg)
                        goto out;
                r = KVM_API_VERSION;
@@ -2623,7 +2693,6 @@ static long kvm_dev_ioctl(struct file *filp,
                r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
                break;
        case KVM_GET_VCPU_MMAP_SIZE:
-               r = -EINVAL;
                if (arg)
                        goto out;
                r = PAGE_SIZE;     /* struct kvm_run */
@@ -2668,7 +2737,7 @@ static void hardware_enable_nolock(void *junk)
 
        cpumask_set_cpu(cpu, cpus_hardware_enabled);
 
-       r = kvm_arch_hardware_enable(NULL);
+       r = kvm_arch_hardware_enable();
 
        if (r) {
                cpumask_clear_cpu(cpu, cpus_hardware_enabled);
@@ -2693,7 +2762,7 @@ static void hardware_disable_nolock(void *junk)
        if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
                return;
        cpumask_clear_cpu(cpu, cpus_hardware_enabled);
-       kvm_arch_hardware_disable(NULL);
+       kvm_arch_hardware_disable();
 }
 
 static void hardware_disable(void)
@@ -3123,6 +3192,8 @@ static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
        if (vcpu->preempted)
                vcpu->preempted = false;
 
+       kvm_arch_sched_in(vcpu, cpu);
+
        kvm_arch_vcpu_load(vcpu, cpu);
 }
 
@@ -3214,6 +3285,9 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
                goto out_undebugfs;
        }
 
+       r = kvm_vfio_ops_init();
+       WARN_ON(r);
+
        return 0;
 
 out_undebugfs: