UBUNTU: Ubuntu-4.15.0-96.97

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 38de930fe93cd349881dbfe46ec857ad53ccf7d5..3c54b2e1b59c9baf9cb59473bd1ebf9944527de5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -51,6 +51,7 @@
 #include <linux/slab.h>
 #include <linux/sort.h>
 #include <linux/bsearch.h>
+#include <linux/kthread.h>
 
 #include <asm/processor.h>
 #include <asm/io.h>
@@ -92,7 +93,7 @@ EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
  *     kvm->lock --> kvm->slots_lock --> kvm->irq_lock
  */
 
-DEFINE_SPINLOCK(kvm_lock);
+DEFINE_MUTEX(kvm_lock);
 static DEFINE_RAW_SPINLOCK(kvm_count_lock);
 LIST_HEAD(vm_list);
 
@@ -140,10 +141,30 @@ __weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
 {
 }
 
+bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
+{
+       /*
+        * The metadata used by is_zone_device_page() to determine whether or
+        * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
+        * the device has been pinned, e.g. by get_user_pages().  WARN if the
+        * page_count() is zero to help detect bad usage of this helper.
+        */
+       if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
+               return false;
+
+       return is_zone_device_page(pfn_to_page(pfn));
+}
+
 bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
 {
+       /*
+        * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
+        * perspective they are "normal" pages, albeit with slightly different
+        * usage rules.
+        */
        if (pfn_valid(pfn))
-               return PageReserved(pfn_to_page(pfn));
+               return PageReserved(pfn_to_page(pfn)) &&
+                      !kvm_is_zone_device_pfn(pfn);
 
        return true;
 }
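
A note on the two helpers above: ZONE_DEVICE pages (for example device DAX or
persistent memory mapped into the guest) have PG_reserved set, yet they are
refcounted like ordinary pages and carry a reference taken by get_user_pages()
when they were faulted in. By excluding them from kvm_is_reserved_pfn(), the
generic reference helpers elsewhere in this file start balancing that
reference again. A minimal sketch of the effect, modeled on
kvm_release_pfn_clean(); the function name below is illustrative only:

```c
#include <linux/kvm_host.h>
#include <linux/mm.h>

/* Illustrative sketch: with the change above, a ZONE_DEVICE pfn is no
 * longer "reserved", so the reference taken when the page was mapped
 * into the guest is dropped here just like for any other page. */
static void example_release_pfn(kvm_pfn_t pfn)
{
	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
		put_page(pfn_to_page(pfn));
}
```
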
@@ -595,8 +616,9 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
 
                stat_data->kvm = kvm;
                stat_data->offset = p->offset;
+               stat_data->mode = p->mode ? p->mode : 0644;
                kvm->debugfs_stat_data[p - debugfs_entries] = stat_data;
-               if (!debugfs_create_file(p->name, 0644,
+               if (!debugfs_create_file(p->name, stat_data->mode,
                                         kvm->debugfs_dentry,
                                         stat_data,
                                         stat_fops_per_vm[p->kind]))
@@ -605,6 +627,23 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
        return 0;
 }
 
+/*
+ * Called after the VM is otherwise initialized, but just before adding it to
+ * the vm_list.
+ */
+int __weak kvm_arch_post_init_vm(struct kvm *kvm)
+{
+       return 0;
+}
+
+/*
+ * Called just after removing the VM from the vm_list, but before doing any
+ * other destruction.
+ */
+void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
+{
+}
+
 static struct kvm *kvm_create_vm(unsigned long type)
 {
        int r, i;
@@ -659,22 +698,31 @@ static struct kvm *kvm_create_vm(unsigned long type)
                rcu_assign_pointer(kvm->buses[i],
                        kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL));
                if (!kvm->buses[i])
-                       goto out_err;
+                       goto out_err_no_mmu_notifier;
        }
 
        r = kvm_init_mmu_notifier(kvm);
+       if (r)
+               goto out_err_no_mmu_notifier;
+
+       r = kvm_arch_post_init_vm(kvm);
        if (r)
                goto out_err;
 
-       spin_lock(&kvm_lock);
+       mutex_lock(&kvm_lock);
        list_add(&kvm->vm_list, &vm_list);
-       spin_unlock(&kvm_lock);
+       mutex_unlock(&kvm_lock);
 
        preempt_notifier_inc();
 
        return kvm;
 
 out_err:
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+       if (kvm->mmu_notifier.ops)
+               mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
+#endif
+out_err_no_mmu_notifier:
        cleanup_srcu_struct(&kvm->irq_srcu);
 out_err_no_irq_srcu:
        cleanup_srcu_struct(&kvm->srcu);
@@ -714,9 +762,11 @@ static void kvm_destroy_vm(struct kvm *kvm)
        kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
        kvm_destroy_vm_debugfs(kvm);
        kvm_arch_sync_events(kvm);
-       spin_lock(&kvm_lock);
+       mutex_lock(&kvm_lock);
        list_del(&kvm->vm_list);
-       spin_unlock(&kvm_lock);
+       mutex_unlock(&kvm_lock);
+       kvm_arch_pre_destroy_vm(kvm);
+
        kvm_free_irq_routing(kvm);
        for (i = 0; i < KVM_NR_BUSES; i++) {
                struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
@@ -855,6 +905,7 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
                int as_id, struct kvm_memslots *slots)
 {
        struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
+       u64 gen;
 
        /*
         * Set the low bit in the generation, which disables SPTE caching
@@ -877,9 +928,11 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
         * space 0 will use generations 0, 4, 8, ... while address space 1 will
         * use generations 2, 6, 10, 14, ...
         */
-       slots->generation += KVM_ADDRESS_SPACE_NUM * 2 - 1;
+       gen = slots->generation + KVM_ADDRESS_SPACE_NUM * 2 - 1;
+
+       kvm_arch_memslots_updated(kvm, gen);
 
-       kvm_arch_memslots_updated(kvm, slots);
+       slots->generation = gen;
 
        return old_memslots;
 }
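
The reordering above is the point of this hunk: kvm_arch_memslots_updated()
now sees the final generation number before it is published in
slots->generation, so a vCPU racing with the update can no longer cache the
new generation together with stale per-arch state (on x86, the MMIO SPTE
generation). A worked example of the two generation bumps, assuming
KVM_ADDRESS_SPACE_NUM == 2; the function is a sketch, not part of the patch:

```c
#include <linux/kvm_host.h>

/* One memslot update on a given address space: 4 -> 5 -> 8. */
static u64 example_generation_walk(void)
{
	u64 gen = 4;				/* stable: low bit clear */

	gen += 1;				/* first bump: SPTE caching disabled */
	/* ... rcu_assign_pointer() + synchronize_srcu_expedited() ... */
	gen += KVM_ADDRESS_SPACE_NUM * 2 - 1;	/* second bump: 8, low bit clear again */

	return gen;
}
```
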
@@ -1223,14 +1276,14 @@ bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
 
-unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
+unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
        struct vm_area_struct *vma;
        unsigned long addr, size;
 
        size = PAGE_SIZE;
 
-       addr = gfn_to_hva(kvm, gfn);
+       addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
        if (kvm_is_error_hva(addr))
                return PAGE_SIZE;
 
@@ -1697,7 +1750,7 @@ EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
 
 void kvm_set_pfn_dirty(kvm_pfn_t pfn)
 {
-       if (!kvm_is_reserved_pfn(pfn)) {
+       if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) {
                struct page *page = pfn_to_page(pfn);
 
                if (!PageReserved(page))
@@ -1708,7 +1761,7 @@ EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
 
 void kvm_set_pfn_accessed(kvm_pfn_t pfn)
 {
-       if (!kvm_is_reserved_pfn(pfn))
+       if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
                mark_page_accessed(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
@@ -2311,6 +2364,29 @@ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
 #endif
 }
 
+/*
+ * Unlike kvm_arch_vcpu_runnable, this function is called outside
+ * a vcpu_load/vcpu_put pair.  However, for most architectures
+ * kvm_arch_vcpu_runnable does not require vcpu_load.
+ */
+bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
+{
+       return kvm_arch_vcpu_runnable(vcpu);
+}
+
+static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
+{
+       if (kvm_arch_dy_runnable(vcpu))
+               return true;
+
+#ifdef CONFIG_KVM_ASYNC_PF
+       if (!list_empty_careful(&vcpu->async_pf.done))
+               return true;
+#endif
+
+       return false;
+}
+
 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
 {
        struct kvm *kvm = me->kvm;
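
Because kvm_vcpu_on_spin() calls this without vcpu_load()/vcpu_put() on the
candidate vCPU, an architecture that overrides kvm_arch_dy_runnable() should
restrict itself to lock-free checks of state that is safe to read while the
vCPU may be running on another physical CPU. A hypothetical override (the
field name below is made up for illustration):

```c
#include <linux/kvm_host.h>

/* Hypothetical override: only a lock-free READ_ONCE() of per-vCPU state. */
bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
{
	return READ_ONCE(vcpu->arch.example_event_pending);
}
```
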
@@ -2340,7 +2416,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
                                continue;
                        if (vcpu == me)
                                continue;
-                       if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
+                       if (swait_active(&vcpu->wq) && !vcpu_dy_runnable(vcpu))
                                continue;
                        if (yield_to_kernel_mode && !kvm_arch_vcpu_in_kernel(vcpu))
                                continue;
@@ -2806,6 +2882,9 @@ static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
 {
        struct kvm_device *dev = filp->private_data;
 
+       if (dev->kvm->mm != current->mm)
+               return -EIO;
+
        switch (ioctl) {
        case KVM_SET_DEVICE_ATTR:
                return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
@@ -2877,12 +2956,14 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
        struct kvm_device_ops *ops = NULL;
        struct kvm_device *dev;
        bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
+       int type;
        int ret;
 
        if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
                return -ENODEV;
 
-       ops = kvm_device_ops_table[cd->type];
+       type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
+       ops = kvm_device_ops_table[type];
        if (ops == NULL)
                return -ENODEV;
 
@@ -2897,7 +2978,7 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
        dev->kvm = kvm;
 
        mutex_lock(&kvm->lock);
-       ret = ops->create(dev, cd->type);
+       ret = ops->create(dev, type);
        if (ret < 0) {
                mutex_unlock(&kvm->lock);
                kfree(dev);
@@ -2953,8 +3034,6 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
        case KVM_CAP_MULTI_ADDRESS_SPACE:
                return KVM_ADDRESS_SPACE_NUM;
 #endif
-       case KVM_CAP_MAX_VCPU_ID:
-               return KVM_MAX_VCPU_ID;
        default:
                break;
        }
@@ -3681,7 +3760,9 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file,
        if (!refcount_inc_not_zero(&stat_data->kvm->users_count))
                return -ENOENT;
 
-       if (simple_attr_open(inode, file, get, set, fmt)) {
+       if (simple_attr_open(inode, file, get,
+                            stat_data->mode & S_IWUGO ? set : NULL,
+                            fmt)) {
                kvm_put_kvm(stat_data->kvm);
                return -ENOMEM;
        }
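
Together with the new mode field set in kvm_create_vm_debugfs() above, this
makes debugfs stat files honour a per-entry mode: entries without any write
bit get a NULL "set" callback, so they can no longer be cleared from user
space. A sketch of how an architecture's stats table might declare such a
read-only entry, assuming the companion header change adds a mode field to
struct kvm_stats_debugfs_item as these hunks imply; the entry name and stat
field are assumptions, not taken from this patch:

```c
#include <linux/kvm_host.h>

static struct kvm_stats_debugfs_item example_debugfs_entries[] = {
	{ .name   = "example_stat",
	  .offset = offsetof(struct kvm, stat.remote_tlb_flush),
	  .kind   = KVM_STAT_VM,
	  .mode   = 0444 },	/* read-only: no "set" callback is wired up */
	{ NULL }
};
```
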
@@ -3795,13 +3876,13 @@ static int vm_stat_get(void *_offset, u64 *val)
        u64 tmp_val;
 
        *val = 0;
-       spin_lock(&kvm_lock);
+       mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
                stat_tmp.kvm = kvm;
                vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
                *val += tmp_val;
        }
-       spin_unlock(&kvm_lock);
+       mutex_unlock(&kvm_lock);
        return 0;
 }
 
@@ -3814,12 +3895,12 @@ static int vm_stat_clear(void *_offset, u64 val)
        if (val)
                return -EINVAL;
 
-       spin_lock(&kvm_lock);
+       mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
                stat_tmp.kvm = kvm;
                vm_stat_clear_per_vm((void *)&stat_tmp, 0);
        }
-       spin_unlock(&kvm_lock);
+       mutex_unlock(&kvm_lock);
 
        return 0;
 }
@@ -3834,13 +3915,13 @@ static int vcpu_stat_get(void *_offset, u64 *val)
        u64 tmp_val;
 
        *val = 0;
-       spin_lock(&kvm_lock);
+       mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
                stat_tmp.kvm = kvm;
                vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
                *val += tmp_val;
        }
-       spin_unlock(&kvm_lock);
+       mutex_unlock(&kvm_lock);
        return 0;
 }
 
@@ -3853,12 +3934,12 @@ static int vcpu_stat_clear(void *_offset, u64 val)
        if (val)
                return -EINVAL;
 
-       spin_lock(&kvm_lock);
+       mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
                stat_tmp.kvm = kvm;
                vcpu_stat_clear_per_vm((void *)&stat_tmp, 0);
        }
-       spin_unlock(&kvm_lock);
+       mutex_unlock(&kvm_lock);
 
        return 0;
 }
@@ -3879,7 +3960,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
        if (!kvm_dev.this_device || !kvm)
                return;
 
-       spin_lock(&kvm_lock);
+       mutex_lock(&kvm_lock);
        if (type == KVM_EVENT_CREATE_VM) {
                kvm_createvm_count++;
                kvm_active_vms++;
@@ -3888,7 +3969,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
        }
        created = kvm_createvm_count;
        active = kvm_active_vms;
-       spin_unlock(&kvm_lock);
+       mutex_unlock(&kvm_lock);
 
        env = kzalloc(sizeof(*env), GFP_KERNEL);
        if (!env)
@@ -3905,7 +3986,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
        }
        add_uevent_var(env, "PID=%d", kvm->userspace_pid);
 
-       if (kvm->debugfs_dentry) {
+       if (!IS_ERR_OR_NULL(kvm->debugfs_dentry)) {
                char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL);
 
                if (p) {
@@ -3932,7 +4013,8 @@ static int kvm_init_debug(void)
 
        kvm_debugfs_num_entries = 0;
        for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
-               if (!debugfs_create_file(p->name, 0644, kvm_debugfs_dir,
+               int mode = p->mode ? p->mode : 0644;
+               if (!debugfs_create_file(p->name, mode, kvm_debugfs_dir,
                                         (void *)(long)p->offset,
                                         stat_fops[p->kind]))
                        goto out_dir;
@@ -4119,3 +4201,86 @@ void kvm_exit(void)
        kvm_vfio_ops_exit();
 }
 EXPORT_SYMBOL_GPL(kvm_exit);
+
+struct kvm_vm_worker_thread_context {
+       struct kvm *kvm;
+       struct task_struct *parent;
+       struct completion init_done;
+       kvm_vm_thread_fn_t thread_fn;
+       uintptr_t data;
+       int err;
+};
+
+static int kvm_vm_worker_thread(void *context)
+{
+       /*
+        * The init_context is allocated on the stack of the parent thread, so
+        * we have to locally copy anything that is needed beyond initialization
+        */
+       struct kvm_vm_worker_thread_context *init_context = context;
+       struct kvm *kvm = init_context->kvm;
+       kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
+       uintptr_t data = init_context->data;
+       int err;
+
+       err = kthread_park(current);
+       /* kthread_park(current) is never supposed to return an error */
+       WARN_ON(err != 0);
+       if (err)
+               goto init_complete;
+
+       err = cgroup_attach_task_all(init_context->parent, current);
+       if (err) {
+               kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
+                       __func__, err);
+               goto init_complete;
+       }
+
+       set_user_nice(current, task_nice(init_context->parent));
+
+init_complete:
+       init_context->err = err;
+       complete(&init_context->init_done);
+       init_context = NULL;
+
+       if (err)
+               return err;
+
+       /* Wait to be woken up by the spawner before proceeding. */
+       kthread_parkme();
+
+       if (!kthread_should_stop())
+               err = thread_fn(kvm, data);
+
+       return err;
+}
+
+int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
+                               uintptr_t data, const char *name,
+                               struct task_struct **thread_ptr)
+{
+       struct kvm_vm_worker_thread_context init_context = {};
+       struct task_struct *thread;
+
+       *thread_ptr = NULL;
+       init_context.kvm = kvm;
+       init_context.parent = current;
+       init_context.thread_fn = thread_fn;
+       init_context.data = data;
+       init_completion(&init_context.init_done);
+
+       thread = kthread_run(kvm_vm_worker_thread, &init_context,
+                            "%s-%d", name, task_pid_nr(current));
+       if (IS_ERR(thread))
+               return PTR_ERR(thread);
+
+       /* kthread_run is never supposed to return NULL */
+       WARN_ON(thread == NULL);
+
+       wait_for_completion(&init_context.init_done);
+
+       if (!init_context.err)
+               *thread_ptr = thread;
+
+       return init_context.err;
+}
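
The new helper gives each VM a kthread that is attached to the creating
task's cgroups, inherits its nice level, and then parks itself until the
caller wakes it, so the spawner can finish its own setup (for example storing
the task pointer) before the worker runs. A hedged sketch of a caller wiring
this into the kvm_arch_post_init_vm()/kvm_arch_pre_destroy_vm() hooks
introduced earlier in this diff; "example_thread" is a hypothetical field in
struct kvm_arch and the names are illustrative (upstream x86 uses this
machinery for NX huge-page recovery):

```c
#include <linux/kthread.h>
#include <linux/kvm_host.h>
#include <linux/sched.h>

/* Loop until kthread_stop(); a real user would do periodic work here. */
static int example_recovery_worker(struct kvm *kvm, uintptr_t data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);

	return 0;
}

int kvm_arch_post_init_vm(struct kvm *kvm)
{
	int err;

	err = kvm_vm_create_worker_thread(kvm, example_recovery_worker, 0,
					  "example-worker",
					  &kvm->arch.example_thread);
	if (!err)
		/* The worker parked itself after init; let it run now. */
		kthread_unpark(kvm->arch.example_thread);

	return err;
}

void kvm_arch_pre_destroy_vm(struct kvm *kvm)
{
	if (kvm->arch.example_thread)
		kthread_stop(kvm->arch.example_thread);
}
```
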