UBUNTU: Ubuntu-4.15.0-96.97

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 38de930fe93cd349881dbfe46ec857ad53ccf7d5..3c54b2e1b59c9baf9cb59473bd1ebf9944527de5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -51,6 +51,7 @@
 #include <linux/slab.h>
 #include <linux/sort.h>
 #include <linux/bsearch.h>
+#include <linux/kthread.h>
 
 #include <asm/processor.h>
 #include <asm/io.h>
@@ -92,7 +93,7 @@ EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
  *     kvm->lock --> kvm->slots_lock --> kvm->irq_lock
  */
 
-DEFINE_SPINLOCK(kvm_lock);
+DEFINE_MUTEX(kvm_lock);
 static DEFINE_RAW_SPINLOCK(kvm_count_lock);
 LIST_HEAD(vm_list);
 
@@ -140,10 +141,30 @@ __weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
 {
 }
 
+bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
+{
+       /*
+        * The metadata used by is_zone_device_page() to determine whether or
+        * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
+        * the device has been pinned, e.g. by get_user_pages().  WARN if the
+        * page_count() is zero to help detect bad usage of this helper.
+        */
+       if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
+               return false;
+
+       return is_zone_device_page(pfn_to_page(pfn));
+}
+
 bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
 {
+       /*
+        * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
+        * perspective they are "normal" pages, albeit with slightly different
+        * usage rules.
+        */
        if (pfn_valid(pfn))
-               return PageReserved(pfn_to_page(pfn));
+               return PageReserved(pfn_to_page(pfn)) &&
+                      !kvm_is_zone_device_pfn(pfn);
 
        return true;
 }
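
A note on the two helpers above: ZONE_DEVICE pages (for example device DAX or
persistent memory mapped into the guest) have PG_reserved set, yet they are
refcounted like ordinary pages and carry a reference taken by get_user_pages()
when they were faulted in. By excluding them from kvm_is_reserved_pfn(), the
generic reference helpers elsewhere in this file start balancing that
reference again. A minimal sketch of the effect, modeled on
kvm_release_pfn_clean(); the function name below is illustrative only:

```c
#include <linux/kvm_host.h>
#include <linux/mm.h>

/* Illustrative sketch: with the change above, a ZONE_DEVICE pfn is no
 * longer "reserved", so the reference taken when the page was mapped
 * into the guest is dropped here just like for any other page. */
static void example_release_pfn(kvm_pfn_t pfn)
{
	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
		put_page(pfn_to_page(pfn));
}
```
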
@@ -595,8 +616,9 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
 
                stat_data->kvm = kvm;
                stat_data->offset = p->offset;
+               stat_data->mode = p->mode ? p->mode : 0644;
                kvm->debugfs_stat_data[p - debugfs_entries] = stat_data;
-               if (!debugfs_create_file(p->name, 0644,
+               if (!debugfs_create_file(p->name, stat_data->mode,
                                         kvm->debugfs_dentry,
                                         stat_data,
                                         stat_fops_per_vm[p->kind]))
@@ -605,6 +627,23 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
        return 0;
 }
 
+/*
+ * Called after the VM is otherwise initialized, but just before adding it to
+ * the vm_list.
+ */
+int __weak kvm_arch_post_init_vm(struct kvm *kvm)
+{
+       return 0;
+}
+
+/*
+ * Called just after removing the VM from the vm_list, but before doing any
+ * other destruction.
+ */
+void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
+{
+}
+
 static struct kvm *kvm_create_vm(unsigned long type)
 {
        int r, i;
@@ -659,22 +698,31 @@ static struct kvm *kvm_create_vm(unsigned long type)
                rcu_assign_pointer(kvm->buses[i],
                        kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL));
                if (!kvm->buses[i])
-                       goto out_err;
+                       goto out_err_no_mmu_notifier;
        }
 
        r = kvm_init_mmu_notifier(kvm);
+       if (r)
+               goto out_err_no_mmu_notifier;
+
+       r = kvm_arch_post_init_vm(kvm);
        if (r)
                goto out_err;
 
-       spin_lock(&kvm_lock);
+       mutex_lock(&kvm_lock);
        list_add(&kvm->vm_list, &vm_list);
-       spin_unlock(&kvm_lock);
+       mutex_unlock(&kvm_lock);
 
        preempt_notifier_inc();
 
        return kvm;
 
 out_err:
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+       if (kvm->mmu_notifier.ops)
+               mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
+#endif
+out_err_no_mmu_notifier:
        cleanup_srcu_struct(&kvm->irq_srcu);
 out_err_no_irq_srcu:
        cleanup_srcu_struct(&kvm->srcu);
@@ -714,9 +762,11 @@ static void kvm_destroy_vm(struct kvm *kvm)
        kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
        kvm_destroy_vm_debugfs(kvm);
        kvm_arch_sync_events(kvm);
-       spin_lock(&kvm_lock);
+       mutex_lock(&kvm_lock);
        list_del(&kvm->vm_list);
-       spin_unlock(&kvm_lock);
+       mutex_unlock(&kvm_lock);
+       kvm_arch_pre_destroy_vm(kvm);
+
        kvm_free_irq_routing(kvm);
        for (i = 0; i < KVM_NR_BUSES; i++) {
                struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
@@ -855,6 +905,7 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
                int as_id, struct kvm_memslots *slots)
 {
        struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
+       u64 gen;
 
        /*
         * Set the low bit in the generation, which disables SPTE caching
@@ -877,9 +928,11 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
         * space 0 will use generations 0, 4, 8, ... while address space 1 will
         * use generations 2, 6, 10, 14, ...
         */
-       slots->generation += KVM_ADDRESS_SPACE_NUM * 2 - 1;
+       gen = slots->generation + KVM_ADDRESS_SPACE_NUM * 2 - 1;
+
+       kvm_arch_memslots_updated(kvm, gen);
 
-       kvm_arch_memslots_updated(kvm, slots);
+       slots->generation = gen;
 
        return old_memslots;
 }
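
The reordering above is the point of this hunk: kvm_arch_memslots_updated()
now sees the final generation number before it is published in
slots->generation, so a vCPU racing with the update can no longer cache the
new generation together with stale per-arch state (on x86, the MMIO SPTE
generation). A worked example of the two generation bumps, assuming
KVM_ADDRESS_SPACE_NUM == 2; the function is a sketch, not part of the patch:

```c
#include <linux/kvm_host.h>

/* One memslot update on a given address space: 4 -> 5 -> 8. */
static u64 example_generation_walk(void)
{
	u64 gen = 4;				/* stable: low bit clear */

	gen += 1;				/* first bump: SPTE caching disabled */
	/* ... rcu_assign_pointer() + synchronize_srcu_expedited() ... */
	gen += KVM_ADDRESS_SPACE_NUM * 2 - 1;	/* second bump: 8, low bit clear again */

	return gen;
}
```
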
@@ -1223,14 +1276,14 @@ bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
 
-unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
+unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
        struct vm_area_struct *vma;
        unsigned long addr, size;
 
        size = PAGE_SIZE;
 
-       addr = gfn_to_hva(kvm, gfn);
+       addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
        if (kvm_is_error_hva(addr))
                return PAGE_SIZE;
 
@@ -1697,7 +1750,7 @@ EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
 
 void kvm_set_pfn_dirty(kvm_pfn_t pfn)
 {
-       if (!kvm_is_reserved_pfn(pfn)) {
+       if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) {
                struct page *page = pfn_to_page(pfn);
 
                if (!PageReserved(page))
@@ -1708,7 +1761,7 @@ EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
 
 void kvm_set_pfn_accessed(kvm_pfn_t pfn)
 {
-       if (!kvm_is_reserved_pfn(pfn))
+       if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
                mark_page_accessed(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
@@ -2311,6 +2364,29 @@ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
 #endif
 }
 
+/*
+ * Unlike kvm_arch_vcpu_runnable, this function is called outside
+ * a vcpu_load/vcpu_put pair.  However, for most architectures
+ * kvm_arch_vcpu_runnable does not require vcpu_load.
+ */
+bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
+{
+       return kvm_arch_vcpu_runnable(vcpu);
+}
+
+static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
+{
+       if (kvm_arch_dy_runnable(vcpu))
+               return true;
+
+#ifdef CONFIG_KVM_ASYNC_PF
+       if (!list_empty_careful(&vcpu->async_pf.done))
+               return true;
+#endif
+
+       return false;
+}
+
 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
 {
        struct kvm *kvm = me->kvm;
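
Because kvm_vcpu_on_spin() calls this without vcpu_load()/vcpu_put() on the
candidate vCPU, an architecture that overrides kvm_arch_dy_runnable() should
restrict itself to lock-free checks of state that is safe to read while the
vCPU may be running on another physical CPU. A hypothetical override (the
field name below is made up for illustration):

```c
#include <linux/kvm_host.h>

/* Hypothetical override: only a lock-free READ_ONCE() of per-vCPU state. */
bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
{
	return READ_ONCE(vcpu->arch.example_event_pending);
}
```
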
@@ -2340,7 +2416,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
                                continue;
                        if (vcpu == me)
                                continue;
-                       if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
+                       if (swait_active(&vcpu->wq) && !vcpu_dy_runnable(vcpu))
                                continue;
                        if (yield_to_kernel_mode && !kvm_arch_vcpu_in_kernel(vcpu))
                                continue;
@@ -2806,6 +2882,9 @@ static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
 {
        struct kvm_device *dev = filp->private_data;
 
+       if (dev->kvm->mm != current->mm)
+               return -EIO;
+
        switch (ioctl) {
        case KVM_SET_DEVICE_ATTR:
                return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
@@ -2877,12 +2956,14 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
        struct kvm_device_ops *ops = NULL;
        struct kvm_device *dev;
        bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
+       int type;
        int ret;
 
        if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
                return -ENODEV;
 
-       ops = kvm_device_ops_table[cd->type];
+       type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
+       ops = kvm_device_ops_table[type];
        if (ops == NULL)
                return -ENODEV;
 
@@ -2897,7 +2978,7 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
        dev->kvm = kvm;
 
        mutex_lock(&kvm->lock);
-       ret = ops->create(dev, cd->type);
+       ret = ops->create(dev, type);
        if (ret < 0) {
                mutex_unlock(&kvm->lock);
                kfree(dev);
@@ -2953,8 +3034,6 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
        case KVM_CAP_MULTI_ADDRESS_SPACE:
                return KVM_ADDRESS_SPACE_NUM;
 #endif
-       case KVM_CAP_MAX_VCPU_ID:
-               return KVM_MAX_VCPU_ID;
        default:
                break;
        }
@@ -3681,7 +3760,9 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file,
        if (!refcount_inc_not_zero(&stat_data->kvm->users_count))
                return -ENOENT;
 
-       if (simple_attr_open(inode, file, get, set, fmt)) {
+       if (simple_attr_open(inode, file, get,
+                            stat_data->mode & S_IWUGO ? set : NULL,
+                            fmt)) {
                kvm_put_kvm(stat_data->kvm);
                return -ENOMEM;
        }
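
Together with the new mode field set in kvm_create_vm_debugfs() above, this
makes debugfs stat files honour a per-entry mode: entries without any write
bit get a NULL "set" callback, so they can no longer be cleared from user
space. A sketch of how an architecture's stats table might declare such a
read-only entry, assuming the companion header change adds a mode field to
struct kvm_stats_debugfs_item as these hunks imply; the entry name and stat
field are assumptions, not taken from this patch:

```c
#include <linux/kvm_host.h>

static struct kvm_stats_debugfs_item example_debugfs_entries[] = {
	{ .name   = "example_stat",
	  .offset = offsetof(struct kvm, stat.remote_tlb_flush),
	  .kind   = KVM_STAT_VM,
	  .mode   = 0444 },	/* read-only: no "set" callback is wired up */
	{ NULL }
};
```
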
@@ -3795,13 +3876,13 @@ static int vm_stat_get(void *_offset, u64 *val)
        u64 tmp_val;
 
        *val = 0;
-       spin_lock(&kvm_lock);
+       mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
                stat_tmp.kvm = kvm;
                vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
                *val += tmp_val;
        }
-       spin_unlock(&kvm_lock);
+       mutex_unlock(&kvm_lock);
        return 0;
 }
 
@@ -3814,12 +3895,12 @@ static int vm_stat_clear(void *_offset, u64 val)
        if (val)
                return -EINVAL;
 
-       spin_lock(&kvm_lock);
+       mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
                stat_tmp.kvm = kvm;
                vm_stat_clear_per_vm((void *)&stat_tmp, 0);
        }
-       spin_unlock(&kvm_lock);
+       mutex_unlock(&kvm_lock);
 
        return 0;
 }
@@ -3834,13 +3915,13 @@ static int vcpu_stat_get(void *_offset, u64 *val)
        u64 tmp_val;
 
        *val = 0;
-       spin_lock(&kvm_lock);
+       mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
                stat_tmp.kvm = kvm;
                vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
                *val += tmp_val;
        }
-       spin_unlock(&kvm_lock);
+       mutex_unlock(&kvm_lock);
        return 0;
 }
 
@@ -3853,12 +3934,12 @@ static int vcpu_stat_clear(void *_offset, u64 val)
        if (val)
                return -EINVAL;
 
-       spin_lock(&kvm_lock);
+       mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
                stat_tmp.kvm = kvm;
                vcpu_stat_clear_per_vm((void *)&stat_tmp, 0);
        }
-       spin_unlock(&kvm_lock);
+       mutex_unlock(&kvm_lock);
 
        return 0;
 }
@@ -3879,7 +3960,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
        if (!kvm_dev.this_device || !kvm)
                return;
 
-       spin_lock(&kvm_lock);
+       mutex_lock(&kvm_lock);
        if (type == KVM_EVENT_CREATE_VM) {
                kvm_createvm_count++;
                kvm_active_vms++;
@@ -3888,7 +3969,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
        }
        created = kvm_createvm_count;
        active = kvm_active_vms;
-       spin_unlock(&kvm_lock);
+       mutex_unlock(&kvm_lock);
 
        env = kzalloc(sizeof(*env), GFP_KERNEL);
        if (!env)
@@ -3905,7 +3986,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
        }
        add_uevent_var(env, "PID=%d", kvm->userspace_pid);
 
-       if (kvm->debugfs_dentry) {
+       if (!IS_ERR_OR_NULL(kvm->debugfs_dentry)) {
                char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL);
 
                if (p) {
@@ -3932,7 +4013,8 @@ static int kvm_init_debug(void)
 
        kvm_debugfs_num_entries = 0;
        for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
-               if (!debugfs_create_file(p->name, 0644, kvm_debugfs_dir,
+               int mode = p->mode ? p->mode : 0644;
+               if (!debugfs_create_file(p->name, mode, kvm_debugfs_dir,
                                         (void *)(long)p->offset,
                                         stat_fops[p->kind]))
                        goto out_dir;
@@ -4119,3 +4201,86 @@ void kvm_exit(void)
        kvm_vfio_ops_exit();
 }
 EXPORT_SYMBOL_GPL(kvm_exit);
+
+struct kvm_vm_worker_thread_context {
+       struct kvm *kvm;
+       struct task_struct *parent;
+       struct completion init_done;
+       kvm_vm_thread_fn_t thread_fn;
+       uintptr_t data;
+       int err;
+};
+
+static int kvm_vm_worker_thread(void *context)
+{
+       /*
+        * The init_context is allocated on the stack of the parent thread, so
+        * we have to locally copy anything that is needed beyond initialization
+        */
+       struct kvm_vm_worker_thread_context *init_context = context;
+       struct kvm *kvm = init_context->kvm;
+       kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
+       uintptr_t data = init_context->data;
+       int err;
+
+       err = kthread_park(current);
+       /* kthread_park(current) is never supposed to return an error */
+       WARN_ON(err != 0);
+       if (err)
+               goto init_complete;
+
+       err = cgroup_attach_task_all(init_context->parent, current);
+       if (err) {
+               kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
+                       __func__, err);
+               goto init_complete;
+       }
+
+       set_user_nice(current, task_nice(init_context->parent));
+
+init_complete:
+       init_context->err = err;
+       complete(&init_context->init_done);
+       init_context = NULL;
+
+       if (err)
+               return err;
+
+       /* Wait to be woken up by the spawner before proceeding. */
+       kthread_parkme();
+
+       if (!kthread_should_stop())
+               err = thread_fn(kvm, data);
+
+       return err;
+}
+
+int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
+                               uintptr_t data, const char *name,
+                               struct task_struct **thread_ptr)
+{
+       struct kvm_vm_worker_thread_context init_context = {};
+       struct task_struct *thread;
+
+       *thread_ptr = NULL;
+       init_context.kvm = kvm;
+       init_context.parent = current;
+       init_context.thread_fn = thread_fn;
+       init_context.data = data;
+       init_completion(&init_context.init_done);
+
+       thread = kthread_run(kvm_vm_worker_thread, &init_context,
+                            "%s-%d", name, task_pid_nr(current));
+       if (IS_ERR(thread))
+               return PTR_ERR(thread);
+
+       /* kthread_run is never supposed to return NULL */
+       WARN_ON(thread == NULL);
+
+       wait_for_completion(&init_context.init_done);
+
+       if (!init_context.err)
+               *thread_ptr = thread;
+
+       return init_context.err;
+}
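
The new helper gives each VM a kthread that is attached to the creating
task's cgroups, inherits its nice level, and then parks itself until the
caller wakes it, so the spawner can finish its own setup (for example storing
the task pointer) before the worker runs. A hedged sketch of a caller wiring
this into the kvm_arch_post_init_vm()/kvm_arch_pre_destroy_vm() hooks
introduced earlier in this diff; "example_thread" is a hypothetical field in
struct kvm_arch and the names are illustrative (upstream x86 uses this
machinery for NX huge-page recovery):

```c
#include <linux/kthread.h>
#include <linux/kvm_host.h>
#include <linux/sched.h>

/* Loop until kthread_stop(); a real user would do periodic work here. */
static int example_recovery_worker(struct kvm *kvm, uintptr_t data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);

	return 0;
}

int kvm_arch_post_init_vm(struct kvm *kvm)
{
	int err;

	err = kvm_vm_create_worker_thread(kvm, example_recovery_worker, 0,
					  "example-worker",
					  &kvm->arch.example_thread);
	if (!err)
		/* The worker parked itself after init; let it run now. */
		kthread_unpark(kvm->arch.example_thread);

	return err;
}

void kvm_arch_pre_destroy_vm(struct kvm *kvm)
{
	if (kvm->arch.example_thread)
		kthread_stop(kvm->arch.example_thread);
}
```
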