KVM: MMU: try to fix up page faults before giving up
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index dd4ac9d9e8f5c71abd930e1ed079b4c52834cd51..154b9ab459b0a3ac51bd30262afa372ae26d67aa 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -63,6 +63,9 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/kvm.h>
 
+/* Worst case buffer size needed for holding an integer. */
+#define ITOA_MAX_LEN 12
+
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
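
Note (editorial, not part of the patch): ITOA_MAX_LEN is 12 because the
longest decimal rendering of a 32-bit int is INT_MIN, "-2147483648" --
11 characters plus the terminating NUL. The "pid-fd" directory name
built later in kvm_create_vm_debugfs() therefore needs at most
11 + 1 + 11 + 1 = 24 bytes, exactly ITOA_MAX_LEN * 2. A standalone
userspace sketch of the worst case:

        #include <limits.h>
        #include <stdio.h>

        int main(void)
        {
                char buf[12];   /* ITOA_MAX_LEN */

                /* snprintf returns the char count, excluding the NUL. */
                printf("%d chars for INT_MIN\n",
                       snprintf(buf, sizeof(buf), "%d", INT_MIN));
                return 0;
        }
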
@@ -100,6 +103,9 @@ static __read_mostly struct preempt_ops kvm_preempt_ops;
 struct dentry *kvm_debugfs_dir;
 EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
 
+static int kvm_debugfs_num_entries;
+static const struct file_operations *stat_fops_per_vm[];
+
 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
                           unsigned long arg);
 #ifdef CONFIG_KVM_COMPAT
@@ -542,6 +548,58 @@ static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
        kvfree(slots);
 }
 
+static void kvm_destroy_vm_debugfs(struct kvm *kvm)
+{
+       int i;
+
+       if (!kvm->debugfs_dentry)
+               return;
+
+       debugfs_remove_recursive(kvm->debugfs_dentry);
+
+       for (i = 0; i < kvm_debugfs_num_entries; i++)
+               kfree(kvm->debugfs_stat_data[i]);
+       kfree(kvm->debugfs_stat_data);
+}
+
+static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
+{
+       char dir_name[ITOA_MAX_LEN * 2];
+       struct kvm_stat_data *stat_data;
+       struct kvm_stats_debugfs_item *p;
+
+       if (!debugfs_initialized())
+               return 0;
+
+       snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
+       kvm->debugfs_dentry = debugfs_create_dir(dir_name,
+                                                kvm_debugfs_dir);
+       if (!kvm->debugfs_dentry)
+               return -ENOMEM;
+
+       kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
+                                        sizeof(*kvm->debugfs_stat_data),
+                                        GFP_KERNEL);
+       if (!kvm->debugfs_stat_data)
+               return -ENOMEM;
+
+       for (p = debugfs_entries; p->name; p++) {
+               stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL);
+               if (!stat_data)
+                       return -ENOMEM;
+
+               stat_data->kvm = kvm;
+               stat_data->offset = p->offset;
+               kvm->debugfs_stat_data[p - debugfs_entries] = stat_data;
+               if (!debugfs_create_file(p->name, 0444,
+                                        kvm->debugfs_dentry,
+                                        stat_data,
+                                        stat_fops_per_vm[p->kind]))
+                       return -ENOMEM;
+       }
+       return 0;
+}
+
 static struct kvm *kvm_create_vm(unsigned long type)
 {
        int r, i;
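
Note (editorial, not part of the patch): kvm_create_vm_debugfs() makes
one "pid-fd" directory per VM under the kvm debugfs root, with one file
per entry of the architecture's debugfs_entries table. A mid-loop
-ENOMEM needs no local unwind: the caller drops its VM reference, and
kvm_destroy_vm_debugfs() frees whatever was allocated (kfree() of the
still-NULL slots is a no-op). The structures this relies on live in
include/linux/kvm_host.h and look roughly like this (a sketch from
memory, not verbatim):

        enum kvm_stat_kind {
                KVM_STAT_VM,
                KVM_STAT_VCPU,
        };

        /* One per stat file: which VM, plus the field offset within it. */
        struct kvm_stat_data {
                int offset;
                struct kvm *kvm;
        };

        /* One per stat: debugfs name, offset into kvm/vcpu, and kind. */
        struct kvm_stats_debugfs_item {
                const char *name;
                int offset;
                enum kvm_stat_kind kind;
        };
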
@@ -647,6 +705,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
        int i;
        struct mm_struct *mm = kvm->mm;
 
+       kvm_destroy_vm_debugfs(kvm);
        kvm_arch_sync_events(kvm);
        spin_lock(&kvm_lock);
        list_del(&kvm->vm_list);
@@ -1383,6 +1442,50 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
        return true;
 }
 
+static int hva_to_pfn_remapped(struct vm_area_struct *vma,
+                              unsigned long addr, bool *async,
+                              bool write_fault, kvm_pfn_t *p_pfn)
+{
+       unsigned long pfn;
+       int r;
+
+       r = follow_pfn(vma, addr, &pfn);
+       if (r) {
+               /*
+                * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
+                * not call the fault handler, so do it here.
+                */
+               bool unlocked = false;
+               r = fixup_user_fault(current, current->mm, addr,
+                                    (write_fault ? FAULT_FLAG_WRITE : 0),
+                                    &unlocked);
+               if (unlocked)
+                       return -EAGAIN;
+               if (r)
+                       return r;
+
+               r = follow_pfn(vma, addr, &pfn);
+               if (r)
+                       return r;
+       }
+
+       /*
+        * Get a reference here because callers of *hva_to_pfn* and
+        * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
+        * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
+        * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will
+        * simply do nothing for reserved pfns.
+        *
+        * Whoever called remap_pfn_range is also going to call e.g.
+        * unmap_mapping_range before the underlying pages are freed,
+        * causing a call to our MMU notifier.
+        */
+       kvm_get_pfn(pfn);
+
+       *p_pfn = pfn;
+       return 0;
+}
+
 /*
  * Pin guest page in memory and return its pfn.
  * @addr: host virtual address which maps memory to the guest
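
Note (editorial, not part of the patch): the -EAGAIN case exists
because fixup_user_fault() may drop and re-take mmap_sem while faulting
the page in; when it does, it sets *unlocked, and any vma looked up
before the call may be stale. The caller therefore restarts at the
retry label added in the next hunk. For reference, the helper's
signature in this era (assumed from the 4.x sources):

        int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
                             unsigned long address, unsigned int fault_flags,
                             bool *unlocked);
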
@@ -1402,7 +1507,7 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
 {
        struct vm_area_struct *vma;
        kvm_pfn_t pfn = 0;
-       int npages;
+       int npages, r;
 
        /* we can do it either atomically or asynchronously, not both */
        BUG_ON(atomic && async);
@@ -1424,14 +1529,17 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
                goto exit;
        }
 
+retry:
        vma = find_vma_intersection(current->mm, addr, addr + 1);
 
        if (vma == NULL)
                pfn = KVM_PFN_ERR_FAULT;
-       else if ((vma->vm_flags & VM_PFNMAP)) {
-               pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
-                       vma->vm_pgoff;
-               BUG_ON(!kvm_is_reserved_pfn(pfn));
+       else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
+               r = hva_to_pfn_remapped(vma, addr, async, write_fault, &pfn);
+               if (r == -EAGAIN)
+                       goto retry;
+               if (r < 0)
+                       pfn = KVM_PFN_ERR_FAULT;
        } else {
                if (async && vma_is_valid(vma, write_fault))
                        *async = true;
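
Note (editorial, not part of the patch): the removed arithmetic
(vma->vm_pgoff plus the page offset into the vma) is only correct for
linear remap_pfn_range() mappings whose PTEs are already populated.
Reading the PTE via follow_pfn(), and faulting it in first when needed,
also covers VM_IO vmas and VM_PFNMAP mappings populated lazily by a
fault handler, which is what the commit subject refers to.
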
@@ -2287,9 +2395,20 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
        if (id >= KVM_MAX_VCPU_ID)
                return -EINVAL;
 
+       mutex_lock(&kvm->lock);
+       if (kvm->created_vcpus == KVM_MAX_VCPUS) {
+               mutex_unlock(&kvm->lock);
+               return -EINVAL;
+       }
+
+       kvm->created_vcpus++;
+       mutex_unlock(&kvm->lock);
+
        vcpu = kvm_arch_vcpu_create(kvm, id);
-       if (IS_ERR(vcpu))
-               return PTR_ERR(vcpu);
+       if (IS_ERR(vcpu)) {
+               r = PTR_ERR(vcpu);
+               goto vcpu_decrement;
+       }
 
        preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
 
@@ -2298,14 +2417,6 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
                goto vcpu_destroy;
 
        mutex_lock(&kvm->lock);
-       if (!kvm_vcpu_compatible(vcpu)) {
-               r = -EINVAL;
-               goto unlock_vcpu_destroy;
-       }
-       if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
-               r = -EINVAL;
-               goto unlock_vcpu_destroy;
-       }
        if (kvm_get_vcpu_by_id(kvm, id)) {
                r = -EEXIST;
                goto unlock_vcpu_destroy;
@@ -2338,6 +2449,10 @@ unlock_vcpu_destroy:
        mutex_unlock(&kvm->lock);
 vcpu_destroy:
        kvm_arch_vcpu_destroy(vcpu);
+vcpu_decrement:
+       mutex_lock(&kvm->lock);
+       kvm->created_vcpus--;
+       mutex_unlock(&kvm->lock);
        return r;
 }
 
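
Note (editorial, not part of the patch): the old KVM_MAX_VCPUS check
read online_vcpus only after kvm_arch_vcpu_create(), so two concurrent
KVM_CREATE_VCPU callers could both pass it. The patch instead reserves
a slot in created_vcpus under kvm->lock before the slow, fallible arch
creation and rolls it back on failure. The same pattern as a
self-contained userspace sketch (names are illustrative):

        #include <pthread.h>

        #define MAX_VCPUS 255

        static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
        static int created_vcpus;

        /* Returns 0 on success, -1 if all slots are taken (-EINVAL above). */
        static int reserve_vcpu_slot(void)
        {
                pthread_mutex_lock(&lock);
                if (created_vcpus == MAX_VCPUS) {
                        pthread_mutex_unlock(&lock);
                        return -1;
                }
                created_vcpus++;        /* reserve before the slow work */
                pthread_mutex_unlock(&lock);
                return 0;
        }

        /* Roll the reservation back if creation fails. */
        static void unreserve_vcpu_slot(void)
        {
                pthread_mutex_lock(&lock);
                created_vcpus--;
                pthread_mutex_unlock(&lock);
        }
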
@@ -2876,7 +2991,7 @@ static long kvm_vm_ioctl(struct file *filp,
        case KVM_SET_GSI_ROUTING: {
                struct kvm_irq_routing routing;
                struct kvm_irq_routing __user *urouting;
-               struct kvm_irq_routing_entry *entries;
+               struct kvm_irq_routing_entry *entries = NULL;
 
                r = -EFAULT;
                if (copy_from_user(&routing, argp, sizeof(routing)))
@@ -2886,15 +3001,17 @@ static long kvm_vm_ioctl(struct file *filp,
                        goto out;
                if (routing.flags)
                        goto out;
-               r = -ENOMEM;
-               entries = vmalloc(routing.nr * sizeof(*entries));
-               if (!entries)
-                       goto out;
-               r = -EFAULT;
-               urouting = argp;
-               if (copy_from_user(entries, urouting->entries,
-                                  routing.nr * sizeof(*entries)))
-                       goto out_free_irq_routing;
+               if (routing.nr) {
+                       r = -ENOMEM;
+                       entries = vmalloc(routing.nr * sizeof(*entries));
+                       if (!entries)
+                               goto out;
+                       r = -EFAULT;
+                       urouting = argp;
+                       if (copy_from_user(entries, urouting->entries,
+                                          routing.nr * sizeof(*entries)))
+                               goto out_free_irq_routing;
+               }
                r = kvm_set_irq_routing(kvm, entries, routing.nr,
                                        routing.flags);
 out_free_irq_routing:
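
Note (editorial, not part of the patch): with routing.nr == 0 handled
explicitly, userspace can clear the whole GSI routing table without the
kernel attempting vmalloc(0) or a zero-byte copy_from_user();
kvm_set_irq_routing() is simply called with entries == NULL and
nr == 0. A minimal (hypothetical) caller:

        #include <linux/kvm.h>
        #include <string.h>
        #include <sys/ioctl.h>

        /* vm_fd: a VM fd from KVM_CREATE_VM; returns 0, or -1 with errno. */
        static int clear_gsi_routes(int vm_fd)
        {
                struct kvm_irq_routing routing;

                memset(&routing, 0, sizeof(routing));   /* nr = 0, flags = 0 */
                return ioctl(vm_fd, KVM_SET_GSI_ROUTING, &routing);
        }
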
@@ -2999,8 +3116,15 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
        }
 #endif
        r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR | O_CLOEXEC);
-       if (r < 0)
+       if (r < 0) {
+               kvm_put_kvm(kvm);
+               return r;
+       }
+
+       if (kvm_create_vm_debugfs(kvm, r) < 0) {
                kvm_put_kvm(kvm);
+               return -ENOMEM;
+       }
 
        return r;
 }
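
Note (editorial, not part of the patch): anon_inode_getfd() has already
installed the new fd in the caller's file table here, so if
kvm_create_vm_debugfs() fails, another thread can race a close() on
that fd against the kvm_put_kvm() above. A later upstream fix ("KVM:
don't use anon_inode_getfd() before possible failures") defers the fd
installation; roughly (a sketch from memory of that fix, not this
patch; the tail of kvm_dev_ioctl_create_vm with a struct file *file):

        r = get_unused_fd_flags(O_CLOEXEC);
        if (r < 0) {
                kvm_put_kvm(kvm);
                return r;
        }
        file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
        if (IS_ERR(file)) {
                put_unused_fd(r);
                kvm_put_kvm(kvm);
                return PTR_ERR(file);
        }
        if (kvm_create_vm_debugfs(kvm, r) < 0) {
                put_unused_fd(r);
                fput(file);
                return -ENOMEM;
        }
        fd_install(r, file);
        return r;
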
@@ -3425,15 +3549,114 @@ static struct notifier_block kvm_cpu_notifier = {
        .notifier_call = kvm_cpu_hotplug,
 };
 
+static int kvm_debugfs_open(struct inode *inode, struct file *file,
+                          int (*get)(void *, u64 *), int (*set)(void *, u64),
+                          const char *fmt)
+{
+       struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
+                                         inode->i_private;
+
+       /* The debugfs files are a reference to the kvm struct which
+        * is still valid when kvm_destroy_vm is called.
+        * To avoid the race between open and the removal of the debugfs
+        * directory we test against the users count.
+        */
+       if (!atomic_add_unless(&stat_data->kvm->users_count, 1, 0))
+               return -ENOENT;
+
+       if (simple_attr_open(inode, file, get, set, fmt)) {
+               kvm_put_kvm(stat_data->kvm);
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+static int kvm_debugfs_release(struct inode *inode, struct file *file)
+{
+       struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
+                                         inode->i_private;
+
+       simple_attr_release(inode, file);
+       kvm_put_kvm(stat_data->kvm);
+
+       return 0;
+}
+
+static int vm_stat_get_per_vm(void *data, u64 *val)
+{
+       struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
+
+       *val = *(u32 *)((void *)stat_data->kvm + stat_data->offset);
+
+       return 0;
+}
+
+static int vm_stat_get_per_vm_open(struct inode *inode, struct file *file)
+{
+       __simple_attr_check_format("%llu\n", 0ull);
+       return kvm_debugfs_open(inode, file, vm_stat_get_per_vm,
+                               NULL, "%llu\n");
+}
+
+static const struct file_operations vm_stat_get_per_vm_fops = {
+       .owner   = THIS_MODULE,
+       .open    = vm_stat_get_per_vm_open,
+       .release = kvm_debugfs_release,
+       .read    = simple_attr_read,
+       .write   = simple_attr_write,
+       .llseek  = generic_file_llseek,
+};
+
+static int vcpu_stat_get_per_vm(void *data, u64 *val)
+{
+       int i;
+       struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
+       struct kvm_vcpu *vcpu;
+
+       *val = 0;
+
+       kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
+               *val += *(u32 *)((void *)vcpu + stat_data->offset);
+
+       return 0;
+}
+
+static int vcpu_stat_get_per_vm_open(struct inode *inode, struct file *file)
+{
+       __simple_attr_check_format("%llu\n", 0ull);
+       return kvm_debugfs_open(inode, file, vcpu_stat_get_per_vm,
+                               NULL, "%llu\n");
+}
+
+static const struct file_operations vcpu_stat_get_per_vm_fops = {
+       .owner   = THIS_MODULE,
+       .open    = vcpu_stat_get_per_vm_open,
+       .release = kvm_debugfs_release,
+       .read    = simple_attr_read,
+       .write   = simple_attr_write,
+       .llseek  = generic_file_llseek,
+};
+
+static const struct file_operations *stat_fops_per_vm[] = {
+       [KVM_STAT_VCPU] = &vcpu_stat_get_per_vm_fops,
+       [KVM_STAT_VM]   = &vm_stat_get_per_vm_fops,
+};
+
 static int vm_stat_get(void *_offset, u64 *val)
 {
        unsigned offset = (long)_offset;
        struct kvm *kvm;
+       struct kvm_stat_data stat_tmp = {.offset = offset};
+       u64 tmp_val;
 
        *val = 0;
        spin_lock(&kvm_lock);
-       list_for_each_entry(kvm, &vm_list, vm_list)
-               *val += *(u32 *)((void *)kvm + offset);
+       list_for_each_entry(kvm, &vm_list, vm_list) {
+               stat_tmp.kvm = kvm;
+               vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
+               *val += tmp_val;
+       }
        spin_unlock(&kvm_lock);
        return 0;
 }
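
Note (editorial, not part of the patch): the global files under
/sys/kernel/debug/kvm now aggregate by delegating to the per-VM
getters, so both views read the same u32 counters. A per-VM stat can be
read like any debugfs attribute; a hypothetical reader (the "1234-11"
directory and the stat name are examples only):

        #include <stdio.h>

        int main(void)
        {
                unsigned long long val;
                FILE *f = fopen("/sys/kernel/debug/kvm/1234-11/remote_tlb_flush", "r");

                if (!f)
                        return 1;
                if (fscanf(f, "%llu", &val) == 1)
                        printf("remote_tlb_flush = %llu\n", val);
                fclose(f);
                return 0;
        }
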
@@ -3444,15 +3667,16 @@ static int vcpu_stat_get(void *_offset, u64 *val)
 {
        unsigned offset = (long)_offset;
        struct kvm *kvm;
-       struct kvm_vcpu *vcpu;
-       int i;
+       struct kvm_stat_data stat_tmp = {.offset = offset};
+       u64 tmp_val;
 
        *val = 0;
        spin_lock(&kvm_lock);
-       list_for_each_entry(kvm, &vm_list, vm_list)
-               kvm_for_each_vcpu(i, vcpu, kvm)
-                       *val += *(u32 *)((void *)vcpu + offset);
-
+       list_for_each_entry(kvm, &vm_list, vm_list) {
+               stat_tmp.kvm = kvm;
+               vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
+               *val += tmp_val;
+       }
        spin_unlock(&kvm_lock);
        return 0;
 }
@@ -3473,7 +3697,8 @@ static int kvm_init_debug(void)
        if (kvm_debugfs_dir == NULL)
                goto out;
 
-       for (p = debugfs_entries; p->name; ++p) {
+       kvm_debugfs_num_entries = 0;
+       for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
                if (!debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
                                         (void *)(long)p->offset,
                                         stat_fops[p->kind]))