/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 */

#include "iodev.h"

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/syscore_ops.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/srcu.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>

#include <asm/processor.h>
#include <asm/io.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>

#include "coalesced_mmio.h"
#include "async_pf.h"

#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

/*
 * Ordering of locks:
 *
 *	kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 */

DEFINE_RAW_SPINLOCK(kvm_lock);
LIST_HEAD(vm_list);

static cpumask_var_t cpus_hardware_enabled;
static int kvm_usage_count = 0;
static atomic_t hardware_enable_failed;

struct kmem_cache *kvm_vcpu_cache;
EXPORT_SYMBOL_GPL(kvm_vcpu_cache);

static __read_mostly struct preempt_ops kvm_preempt_ops;

struct dentry *kvm_debugfs_dir;

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);
#ifdef CONFIG_COMPAT
static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
				  unsigned long arg);
#endif
static int hardware_enable_all(void);
static void hardware_disable_all(void);

static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

bool kvm_rebooting;
EXPORT_SYMBOL_GPL(kvm_rebooting);

static bool largepages_enabled = true;

static struct page *hwpoison_page;
static pfn_t hwpoison_pfn;

static struct page *fault_page;
static pfn_t fault_pfn;

inline int kvm_is_mmio_pfn(pfn_t pfn)
{
	if (pfn_valid(pfn)) {
		int reserved;
		struct page *tail = pfn_to_page(pfn);
		struct page *head = compound_trans_head(tail);
		reserved = PageReserved(head);
		if (head != tail) {
			/*
			 * "head" is not a dangling pointer
			 * (compound_trans_head takes care of that)
			 * but the hugepage may have been splitted
			 * from under us (and we may not hold a
			 * reference count on the head page so it can
			 * be reused before we run PageReferenced), so
			 * we've to check PageTail before returning
			 * what we just read.
			 */
			smp_rmb();
			if (PageTail(tail))
				return reserved;
		}
		return PageReserved(tail);
	}

	return true;
}

/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
void vcpu_load(struct kvm_vcpu *vcpu)
{
	int cpu;

	mutex_lock(&vcpu->mutex);
	if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
		/* The thread running this VCPU changed. */
		struct pid *oldpid = vcpu->pid;
		struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
		rcu_assign_pointer(vcpu->pid, newpid);
		synchronize_rcu();
		put_pid(oldpid);
	}
	cpu = get_cpu();
	preempt_notifier_register(&vcpu->preempt_notifier);
	kvm_arch_vcpu_load(vcpu, cpu);
	put_cpu();
}

void vcpu_put(struct kvm_vcpu *vcpu)
{
	preempt_disable();
	kvm_arch_vcpu_put(vcpu);
	preempt_notifier_unregister(&vcpu->preempt_notifier);
	preempt_enable();
	mutex_unlock(&vcpu->mutex);
}

static void ack_flush(void *_completed)
{
}

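/*
 * Raise @req on every vcpu and send an empty IPI (ack_flush) to the CPUs
 * still running guest code so they notice it; returns true if any IPIs
 * were actually sent.
 */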
static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
{
	int i, cpu, me;
	cpumask_var_t cpus;
	bool called = true;
	struct kvm_vcpu *vcpu;

	zalloc_cpumask_var(&cpus, GFP_ATOMIC);

	me = get_cpu();
	kvm_for_each_vcpu(i, vcpu, kvm) {
		kvm_make_request(req, vcpu);
		cpu = vcpu->cpu;

		/* Set ->requests bit before we read ->mode */
		smp_mb();

		if (cpus != NULL && cpu != -1 && cpu != me &&
		    kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
			cpumask_set_cpu(cpu, cpus);
	}
	if (unlikely(cpus == NULL))
		smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1);
	else if (!cpumask_empty(cpus))
		smp_call_function_many(cpus, ack_flush, NULL, 1);
	else
		called = false;
	put_cpu();
	free_cpumask_var(cpus);
	return called;
}

void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	int dirty_count = kvm->tlbs_dirty;

	smp_mb();
	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
		++kvm->stat.remote_tlb_flush;
	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
}

void kvm_reload_remote_mmus(struct kvm *kvm)
{
	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
}

int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
	struct page *page;
	int r;

	mutex_init(&vcpu->mutex);
	vcpu->cpu = -1;
	vcpu->kvm = kvm;
	vcpu->vcpu_id = id;
	vcpu->pid = NULL;
	init_waitqueue_head(&vcpu->wq);
	kvm_async_pf_vcpu_init(vcpu);

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page) {
		r = -ENOMEM;
		goto fail;
	}
	vcpu->run = page_address(page);

	r = kvm_arch_vcpu_init(vcpu);
	if (r < 0)
		goto fail_free_run;
	return 0;

fail_free_run:
	free_page((unsigned long)vcpu->run);
fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_init);

void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
{
	put_pid(vcpu->pid);
	kvm_arch_vcpu_uninit(vcpu);
	free_page((unsigned long)vcpu->run);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);

#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
	return container_of(mn, struct kvm, mmu_notifier);
}

static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
					     struct mm_struct *mm,
					     unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int need_tlb_flush, idx;

	/*
	 * When ->invalidate_page runs, the linux pte has been zapped
	 * already but the page is still allocated until
	 * ->invalidate_page returns. So if we increase the sequence
	 * here the kvm page fault will notice if the spte can't be
	 * established because the page is going to be freed. If
	 * instead the kvm page fault establishes the spte before
	 * ->invalidate_page runs, kvm_unmap_hva will release it
	 * before returning.
	 *
	 * The sequence increase only need to be seen at spin_unlock
	 * time, and not at spin_lock time.
	 *
	 * Increasing the sequence after the spin_unlock would be
	 * unsafe because the kvm page fault could then establish the
	 * pte after kvm_unmap_hva returned, without noticing the page
	 * is going to be freed.
	 */
	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	kvm->mmu_notifier_seq++;
	need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	/* we've to flush the tlb before the pages can be freed */
	if (need_tlb_flush)
		kvm_flush_remote_tlbs(kvm);

}

static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long address,
					pte_t pte)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	kvm->mmu_notifier_seq++;
	kvm_set_spte_hva(kvm, address, pte);
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
						    struct mm_struct *mm,
						    unsigned long start,
						    unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int need_tlb_flush = 0, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	/*
	 * The count increase must become visible at unlock time as no
	 * spte can be established without taking the mmu_lock and
	 * count is also read inside the mmu_lock critical section.
	 */
	kvm->mmu_notifier_count++;
	for (; start < end; start += PAGE_SIZE)
		need_tlb_flush |= kvm_unmap_hva(kvm, start);
	need_tlb_flush |= kvm->tlbs_dirty;
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	/* we've to flush the tlb before the pages can be freed */
	if (need_tlb_flush)
		kvm_flush_remote_tlbs(kvm);
}

static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
						  struct mm_struct *mm,
						  unsigned long start,
						  unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);

	spin_lock(&kvm->mmu_lock);
	/*
	 * This sequence increase will notify the kvm page fault that
	 * the page that is going to be mapped in the spte could have
	 * been freed.
	 */
	kvm->mmu_notifier_seq++;
	/*
	 * The above sequence increase must be visible before the
	 * below count decrease but both values are read by the kvm
	 * page fault under mmu_lock spinlock so we don't need to add
	 * a smb_wmb() here in between the two.
	 */
	kvm->mmu_notifier_count--;
	spin_unlock(&kvm->mmu_lock);

	BUG_ON(kvm->mmu_notifier_count < 0);
}

static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	young = kvm_age_hva(kvm, address);
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	if (young)
		kvm_flush_remote_tlbs(kvm);

	return young;
}

static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	young = kvm_test_age_hva(kvm, address);
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	return young;
}

static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	kvm_arch_flush_shadow(kvm);
	srcu_read_unlock(&kvm->srcu, idx);
}

static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
	.invalidate_page	= kvm_mmu_notifier_invalidate_page,
	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
	.test_young		= kvm_mmu_notifier_test_young,
	.change_pte		= kvm_mmu_notifier_change_pte,
	.release		= kvm_mmu_notifier_release,
};

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
}

#else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	return 0;
}

#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */

static struct kvm *kvm_create_vm(void)
{
	int r, i;
	struct kvm *kvm = kvm_arch_alloc_vm();

	if (!kvm)
		return ERR_PTR(-ENOMEM);

	r = kvm_arch_init_vm(kvm);
	if (r)
		goto out_err_nodisable;

	r = hardware_enable_all();
	if (r)
		goto out_err_nodisable;

#ifdef CONFIG_HAVE_KVM_IRQCHIP
	INIT_HLIST_HEAD(&kvm->mask_notifier_list);
	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
#endif

	r = -ENOMEM;
	kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
	if (!kvm->memslots)
		goto out_err_nosrcu;
	if (init_srcu_struct(&kvm->srcu))
		goto out_err_nosrcu;
	for (i = 0; i < KVM_NR_BUSES; i++) {
		kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
					GFP_KERNEL);
		if (!kvm->buses[i])
			goto out_err;
	}

	spin_lock_init(&kvm->mmu_lock);
	kvm->mm = current->mm;
	atomic_inc(&kvm->mm->mm_count);
	kvm_eventfd_init(kvm);
	mutex_init(&kvm->lock);
	mutex_init(&kvm->irq_lock);
	mutex_init(&kvm->slots_lock);
	atomic_set(&kvm->users_count, 1);

	r = kvm_init_mmu_notifier(kvm);
	if (r)
		goto out_err;

	raw_spin_lock(&kvm_lock);
	list_add(&kvm->vm_list, &vm_list);
	raw_spin_unlock(&kvm_lock);

	return kvm;

out_err:
	cleanup_srcu_struct(&kvm->srcu);
out_err_nosrcu:
	hardware_disable_all();
out_err_nodisable:
	for (i = 0; i < KVM_NR_BUSES; i++)
		kfree(kvm->buses[i]);
	kfree(kvm->memslots);
	kvm_arch_free_vm(kvm);
	return ERR_PTR(r);
}

static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
{
	if (!memslot->dirty_bitmap)
		return;

	if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE)
		vfree(memslot->dirty_bitmap_head);
	else
		kfree(memslot->dirty_bitmap_head);

	memslot->dirty_bitmap = NULL;
	memslot->dirty_bitmap_head = NULL;
}

/*
 * Free any memory in @free but not in @dont.
 */
static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
				  struct kvm_memory_slot *dont)
{
	int i;

	if (!dont || free->rmap != dont->rmap)
		vfree(free->rmap);

	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
		kvm_destroy_dirty_bitmap(free);


	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
		if (!dont || free->lpage_info[i] != dont->lpage_info[i]) {
			vfree(free->lpage_info[i]);
			free->lpage_info[i] = NULL;
		}
	}

	free->npages = 0;
	free->rmap = NULL;
}

void kvm_free_physmem(struct kvm *kvm)
{
	int i;
	struct kvm_memslots *slots = kvm->memslots;

	for (i = 0; i < slots->nmemslots; ++i)
		kvm_free_physmem_slot(&slots->memslots[i], NULL);

	kfree(kvm->memslots);
}

static void kvm_destroy_vm(struct kvm *kvm)
{
	int i;
	struct mm_struct *mm = kvm->mm;

	kvm_arch_sync_events(kvm);
	raw_spin_lock(&kvm_lock);
	list_del(&kvm->vm_list);
	raw_spin_unlock(&kvm_lock);
	kvm_free_irq_routing(kvm);
	for (i = 0; i < KVM_NR_BUSES; i++)
		kvm_io_bus_destroy(kvm->buses[i]);
	kvm_coalesced_mmio_free(kvm);
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
#else
	kvm_arch_flush_shadow(kvm);
#endif
	kvm_arch_destroy_vm(kvm);
	kvm_free_physmem(kvm);
	cleanup_srcu_struct(&kvm->srcu);
	kvm_arch_free_vm(kvm);
	hardware_disable_all();
	mmdrop(mm);
}

void kvm_get_kvm(struct kvm *kvm)
{
	atomic_inc(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm);

void kvm_put_kvm(struct kvm *kvm)
{
	if (atomic_dec_and_test(&kvm->users_count))
		kvm_destroy_vm(kvm);
}
EXPORT_SYMBOL_GPL(kvm_put_kvm);


static int kvm_vm_release(struct inode *inode, struct file *filp)
{
	struct kvm *kvm = filp->private_data;

	kvm_irqfd_release(kvm);

	kvm_put_kvm(kvm);
	return 0;
}

#ifndef CONFIG_S390
/*
 * Allocation size is twice as large as the actual dirty bitmap size.
 * This makes it possible to do double buffering: see x86's
 * kvm_vm_ioctl_get_dirty_log().
 */
static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
{
	unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);

	if (dirty_bytes > PAGE_SIZE)
		memslot->dirty_bitmap = vzalloc(dirty_bytes);
	else
		memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL);

	if (!memslot->dirty_bitmap)
		return -ENOMEM;

	memslot->dirty_bitmap_head = memslot->dirty_bitmap;
	return 0;
}
#endif /* !CONFIG_S390 */

/*
 * Allocate some memory and give it an address in the guest physical address
 * space.
 *
 * Discontiguous memory is allowed, mostly for framebuffers.
 *
 * Must be called holding mmap_sem for write.
 */
int __kvm_set_memory_region(struct kvm *kvm,
			    struct kvm_userspace_memory_region *mem,
			    int user_alloc)
{
	int r;
	gfn_t base_gfn;
	unsigned long npages;
	unsigned long i;
	struct kvm_memory_slot *memslot;
	struct kvm_memory_slot old, new;
	struct kvm_memslots *slots, *old_memslots;

	r = -EINVAL;
	/* General sanity checks */
	if (mem->memory_size & (PAGE_SIZE - 1))
		goto out;
	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
		goto out;
	/* We can read the guest memory with __xxx_user() later on. */
	if (user_alloc &&
	    ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
	     !access_ok(VERIFY_WRITE,
			(void __user *)(unsigned long)mem->userspace_addr,
			mem->memory_size)))
		goto out;
	if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
		goto out;
	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
		goto out;

	memslot = &kvm->memslots->memslots[mem->slot];
	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
	npages = mem->memory_size >> PAGE_SHIFT;

	r = -EINVAL;
	if (npages > KVM_MEM_MAX_NR_PAGES)
		goto out;

	if (!npages)
		mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;

	new = old = *memslot;

	new.id = mem->slot;
	new.base_gfn = base_gfn;
	new.npages = npages;
	new.flags = mem->flags;

	/* Disallow changing a memory slot's size. */
	r = -EINVAL;
	if (npages && old.npages && npages != old.npages)
		goto out_free;

	/* Check for overlaps */
	r = -EEXIST;
	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *s = &kvm->memslots->memslots[i];

		if (s == memslot || !s->npages)
			continue;
		if (!((base_gfn + npages <= s->base_gfn) ||
		      (base_gfn >= s->base_gfn + s->npages)))
			goto out_free;
	}

	/* Free page dirty bitmap if unneeded */
	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
		new.dirty_bitmap = NULL;

	r = -ENOMEM;

	/* Allocate if a slot is being created */
#ifndef CONFIG_S390
	if (npages && !new.rmap) {
		new.rmap = vzalloc(npages * sizeof(*new.rmap));

		if (!new.rmap)
			goto out_free;

		new.user_alloc = user_alloc;
		new.userspace_addr = mem->userspace_addr;
	}
	if (!npages)
		goto skip_lpage;

	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
		unsigned long ugfn;
		unsigned long j;
		int lpages;
		int level = i + 2;

		/* Avoid unused variable warning if no large pages */
		(void)level;

		if (new.lpage_info[i])
			continue;

		lpages = 1 + ((base_gfn + npages - 1)
			     >> KVM_HPAGE_GFN_SHIFT(level));
		lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level);

		new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i]));

		if (!new.lpage_info[i])
			goto out_free;

		if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
			new.lpage_info[i][0].write_count = 1;
		if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
			new.lpage_info[i][lpages - 1].write_count = 1;
		ugfn = new.userspace_addr >> PAGE_SHIFT;
		/*
		 * If the gfn and userspace address are not aligned wrt each
		 * other, or if explicitly asked to, disable large page
		 * support for this slot
		 */
		if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
		    !largepages_enabled)
			for (j = 0; j < lpages; ++j)
				new.lpage_info[i][j].write_count = 1;
	}

skip_lpage:

	/* Allocate page dirty bitmap if needed */
	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
		if (kvm_create_dirty_bitmap(&new) < 0)
			goto out_free;
		/* destroy any largepage mappings for dirty tracking */
	}
#else  /* not defined CONFIG_S390 */
	new.user_alloc = user_alloc;
	if (user_alloc)
		new.userspace_addr = mem->userspace_addr;
#endif /* not defined CONFIG_S390 */

	if (!npages) {
		r = -ENOMEM;
		slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
		if (!slots)
			goto out_free;
		memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
		if (mem->slot >= slots->nmemslots)
			slots->nmemslots = mem->slot + 1;
		slots->generation++;
		slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID;

		old_memslots = kvm->memslots;
		rcu_assign_pointer(kvm->memslots, slots);
		synchronize_srcu_expedited(&kvm->srcu);
		/* From this point no new shadow pages pointing to a deleted
		 * memslot will be created.
		 *
		 * validation of sp->gfn happens in:
		 * 	- gfn_to_hva (kvm_read_guest, gfn_to_pfn)
		 * 	- kvm_is_visible_gfn (mmu_check_roots)
		 */
		kvm_arch_flush_shadow(kvm);
		kfree(old_memslots);
	}

	r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc);
	if (r)
		goto out_free;

	/* map the pages in iommu page table */
	if (npages) {
		r = kvm_iommu_map_pages(kvm, &new);
		if (r)
			goto out_free;
	}

	r = -ENOMEM;
	slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
	if (!slots)
		goto out_free;
	memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
	if (mem->slot >= slots->nmemslots)
		slots->nmemslots = mem->slot + 1;
	slots->generation++;

	/* actual memory is freed via old in kvm_free_physmem_slot below */
	if (!npages) {
		new.rmap = NULL;
		new.dirty_bitmap = NULL;
		for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i)
			new.lpage_info[i] = NULL;
	}

	slots->memslots[mem->slot] = new;
	old_memslots = kvm->memslots;
	rcu_assign_pointer(kvm->memslots, slots);
	synchronize_srcu_expedited(&kvm->srcu);

	kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);

	kvm_free_physmem_slot(&old, &new);
	kfree(old_memslots);

	return 0;

out_free:
	kvm_free_physmem_slot(&new, &old);
out:
	return r;

}
EXPORT_SYMBOL_GPL(__kvm_set_memory_region);

int kvm_set_memory_region(struct kvm *kvm,
			  struct kvm_userspace_memory_region *mem,
			  int user_alloc)
{
	int r;

	mutex_lock(&kvm->slots_lock);
	r = __kvm_set_memory_region(kvm, mem, user_alloc);
	mutex_unlock(&kvm->slots_lock);
	return r;
}
EXPORT_SYMBOL_GPL(kvm_set_memory_region);

int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
				   struct
				   kvm_userspace_memory_region *mem,
				   int user_alloc)
{
	if (mem->slot >= KVM_MEMORY_SLOTS)
		return -EINVAL;
	return kvm_set_memory_region(kvm, mem, user_alloc);
}

int kvm_get_dirty_log(struct kvm *kvm,
			struct kvm_dirty_log *log, int *is_dirty)
{
	struct kvm_memory_slot *memslot;
	int r, i;
	unsigned long n;
	unsigned long any = 0;

	r = -EINVAL;
	if (log->slot >= KVM_MEMORY_SLOTS)
		goto out;

	memslot = &kvm->memslots->memslots[log->slot];
	r = -ENOENT;
	if (!memslot->dirty_bitmap)
		goto out;

	n = kvm_dirty_bitmap_bytes(memslot);

	for (i = 0; !any && i < n/sizeof(long); ++i)
		any = memslot->dirty_bitmap[i];

	r = -EFAULT;
	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
		goto out;

	if (any)
		*is_dirty = 1;

	r = 0;
out:
	return r;
}

void kvm_disable_largepages(void)
{
	largepages_enabled = false;
}
EXPORT_SYMBOL_GPL(kvm_disable_largepages);

int is_error_page(struct page *page)
{
	return page == bad_page || page == hwpoison_page || page == fault_page;
}
EXPORT_SYMBOL_GPL(is_error_page);

int is_error_pfn(pfn_t pfn)
{
	return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
}
EXPORT_SYMBOL_GPL(is_error_pfn);

int is_hwpoison_pfn(pfn_t pfn)
{
	return pfn == hwpoison_pfn;
}
EXPORT_SYMBOL_GPL(is_hwpoison_pfn);

int is_fault_pfn(pfn_t pfn)
{
	return pfn == fault_pfn;
}
EXPORT_SYMBOL_GPL(is_fault_pfn);

static inline unsigned long bad_hva(void)
{
	return PAGE_OFFSET;
}

int kvm_is_error_hva(unsigned long addr)
{
	return addr == bad_hva();
}
EXPORT_SYMBOL_GPL(kvm_is_error_hva);

static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots,
						gfn_t gfn)
{
	int i;

	for (i = 0; i < slots->nmemslots; ++i) {
		struct kvm_memory_slot *memslot = &slots->memslots[i];

		if (gfn >= memslot->base_gfn
		    && gfn < memslot->base_gfn + memslot->npages)
			return memslot;
	}
	return NULL;
}

struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	return __gfn_to_memslot(kvm_memslots(kvm), gfn);
}
EXPORT_SYMBOL_GPL(gfn_to_memslot);

int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
	int i;
	struct kvm_memslots *slots = kvm_memslots(kvm);

	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *memslot = &slots->memslots[i];

		if (memslot->flags & KVM_MEMSLOT_INVALID)
			continue;

		if (gfn >= memslot->base_gfn
		    && gfn < memslot->base_gfn + memslot->npages)
			return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);

unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
{
	struct vm_area_struct *vma;
	unsigned long addr, size;

	size = PAGE_SIZE;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return PAGE_SIZE;

	down_read(&current->mm->mmap_sem);
	vma = find_vma(current->mm, addr);
	if (!vma)
		goto out;

	size = vma_kernel_pagesize(vma);

out:
	up_read(&current->mm->mmap_sem);

	return size;
}

static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
				     gfn_t *nr_pages)
{
	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
		return bad_hva();

	if (nr_pages)
		*nr_pages = slot->npages - (gfn - slot->base_gfn);

	return gfn_to_hva_memslot(slot, gfn);
}

unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
	return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_hva);

static pfn_t get_fault_pfn(void)
{
	get_page(fault_page);
	return fault_pfn;
}

int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
			 unsigned long start, int write, struct page **page)
{
	int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET;

	if (write)
		flags |= FOLL_WRITE;

	return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
}

static inline int check_user_page_hwpoison(unsigned long addr)
{
	int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;

	rc = __get_user_pages(current, current->mm, addr, 1,
			      flags, NULL, NULL, NULL);
	return rc == -EHWPOISON;
}

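/*
 * Translate a host virtual address into a pfn.  @atomic forbids sleeping,
 * @async lets the fault be completed later instead of blocking here, and
 * *writable (when non-NULL) reports whether the mapping ended up writable.
 */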
static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
			bool *async, bool write_fault, bool *writable)
{
	struct page *page[1];
	int npages = 0;
	pfn_t pfn;

	/* we can do it either atomically or asynchronously, not both */
	BUG_ON(atomic && async);

	BUG_ON(!write_fault && !writable);

	if (writable)
		*writable = true;

	if (atomic || async)
		npages = __get_user_pages_fast(addr, 1, 1, page);

	if (unlikely(npages != 1) && !atomic) {
		might_sleep();

		if (writable)
			*writable = write_fault;

		if (async) {
			down_read(&current->mm->mmap_sem);
			npages = get_user_page_nowait(current, current->mm,
						      addr, write_fault, page);
			up_read(&current->mm->mmap_sem);
		} else
			npages = get_user_pages_fast(addr, 1, write_fault,
						     page);

		/* map read fault as writable if possible */
		if (unlikely(!write_fault) && npages == 1) {
			struct page *wpage[1];

			npages = __get_user_pages_fast(addr, 1, 1, wpage);
			if (npages == 1) {
				*writable = true;
				put_page(page[0]);
				page[0] = wpage[0];
			}
			npages = 1;
		}
	}

	if (unlikely(npages != 1)) {
		struct vm_area_struct *vma;

		if (atomic)
			return get_fault_pfn();

		down_read(&current->mm->mmap_sem);
		if (npages == -EHWPOISON ||
			(!async && check_user_page_hwpoison(addr))) {
			up_read(&current->mm->mmap_sem);
			get_page(hwpoison_page);
			return page_to_pfn(hwpoison_page);
		}

		vma = find_vma_intersection(current->mm, addr, addr+1);

		if (vma == NULL)
			pfn = get_fault_pfn();
		else if ((vma->vm_flags & VM_PFNMAP)) {
			pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
				vma->vm_pgoff;
			BUG_ON(!kvm_is_mmio_pfn(pfn));
		} else {
			if (async && (vma->vm_flags & VM_WRITE))
				*async = true;
			pfn = get_fault_pfn();
		}
		up_read(&current->mm->mmap_sem);
	} else
		pfn = page_to_pfn(page[0]);

	return pfn;
}

pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
{
	return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
}
EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);

static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
			  bool write_fault, bool *writable)
{
	unsigned long addr;

	if (async)
		*async = false;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr)) {
		get_page(bad_page);
		return page_to_pfn(bad_page);
	}

	return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
}

pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
{
	return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);

pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
		       bool write_fault, bool *writable)
{
	return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_async);

pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
{
	return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn);

pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
		      bool *writable)
{
	return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);

pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
			 struct kvm_memory_slot *slot, gfn_t gfn)
{
	unsigned long addr = gfn_to_hva_memslot(slot, gfn);
	return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
}

int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
			    int nr_pages)
{
	unsigned long addr;
	gfn_t entry;

	addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry);
	if (kvm_is_error_hva(addr))
		return -1;

	if (entry < nr_pages)
		return 0;

	return __get_user_pages_fast(addr, nr_pages, 1, pages);
}
EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);

struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
	pfn_t pfn;

	pfn = gfn_to_pfn(kvm, gfn);
	if (!kvm_is_mmio_pfn(pfn))
		return pfn_to_page(pfn);

	WARN_ON(kvm_is_mmio_pfn(pfn));

	get_page(bad_page);
	return bad_page;
}

EXPORT_SYMBOL_GPL(gfn_to_page);

void kvm_release_page_clean(struct page *page)
{
	kvm_release_pfn_clean(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_clean);

void kvm_release_pfn_clean(pfn_t pfn)
{
	if (!kvm_is_mmio_pfn(pfn))
		put_page(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);

void kvm_release_page_dirty(struct page *page)
{
	kvm_release_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);

void kvm_release_pfn_dirty(pfn_t pfn)
{
	kvm_set_pfn_dirty(pfn);
	kvm_release_pfn_clean(pfn);
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);

void kvm_set_page_dirty(struct page *page)
{
	kvm_set_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_set_page_dirty);

void kvm_set_pfn_dirty(pfn_t pfn)
{
	if (!kvm_is_mmio_pfn(pfn)) {
		struct page *page = pfn_to_page(pfn);
		if (!PageReserved(page))
			SetPageDirty(page);
	}
}
EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);

void kvm_set_pfn_accessed(pfn_t pfn)
{
	if (!kvm_is_mmio_pfn(pfn))
		mark_page_accessed(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);

void kvm_get_pfn(pfn_t pfn)
{
	if (!kvm_is_mmio_pfn(pfn))
		get_page(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_get_pfn);

static int next_segment(unsigned long len, int offset)
{
	if (len > PAGE_SIZE - offset)
		return PAGE_SIZE - offset;
	else
		return len;
}

int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
			int len)
{
	int r;
	unsigned long addr;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	r = __copy_from_user(data, (void __user *)addr + offset, len);
	if (r)
		return -EFAULT;
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page);

int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		data += seg;
		++gfn;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest);

int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
			  unsigned long len)
{
	int r;
	unsigned long addr;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int offset = offset_in_page(gpa);

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	pagefault_disable();
	r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
	pagefault_enable();
	if (r)
		return -EFAULT;
	return 0;
}
EXPORT_SYMBOL(kvm_read_guest_atomic);

int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
			 int offset, int len)
{
	int r;
	unsigned long addr;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	r = __copy_to_user((void __user *)addr + offset, data, len);
	if (r)
		return -EFAULT;
	mark_page_dirty(kvm, gfn);
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_write_guest_page);

int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
		    unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		data += seg;
		++gfn;
	}
	return 0;
}

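/*
 * Cache a gpa->hva translation (struct gfn_to_hva_cache) so that repeated
 * writes to the same guest page can skip the memslot lookup; the cache is
 * revalidated against the memslot generation on every cached write.
 */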
int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			      gpa_t gpa)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);
	int offset = offset_in_page(gpa);
	gfn_t gfn = gpa >> PAGE_SHIFT;

	ghc->gpa = gpa;
	ghc->generation = slots->generation;
	ghc->memslot = __gfn_to_memslot(slots, gfn);
	ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL);
	if (!kvm_is_error_hva(ghc->hva))
		ghc->hva += offset;
	else
		return -EFAULT;

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);

int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			   void *data, unsigned long len)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);
	int r;

	if (slots->generation != ghc->generation)
		kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);

	if (kvm_is_error_hva(ghc->hva))
		return -EFAULT;

	r = __copy_to_user((void __user *)ghc->hva, data, len);
	if (r)
		return -EFAULT;
	mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_write_guest_cached);

int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
{
	return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page,
				    offset, len);
}
EXPORT_SYMBOL_GPL(kvm_clear_guest_page);

int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		++gfn;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_clear_guest);

void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
			     gfn_t gfn)
{
	if (memslot && memslot->dirty_bitmap) {
		unsigned long rel_gfn = gfn - memslot->base_gfn;

		__set_bit_le(rel_gfn, memslot->dirty_bitmap);
	}
}

void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *memslot;

	memslot = gfn_to_memslot(kvm, gfn);
	mark_page_dirty_in_slot(kvm, memslot, gfn);
}

/*
 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
 */
void kvm_vcpu_block(struct kvm_vcpu *vcpu)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);

		if (kvm_arch_vcpu_runnable(vcpu)) {
			kvm_make_request(KVM_REQ_UNHALT, vcpu);
			break;
		}
		if (kvm_cpu_has_pending_timer(vcpu))
			break;
		if (signal_pending(current))
			break;

		schedule();
	}

	finish_wait(&vcpu->wq, &wait);
}

void kvm_resched(struct kvm_vcpu *vcpu)
{
	if (!need_resched())
		return;
	cond_resched();
}
EXPORT_SYMBOL_GPL(kvm_resched);

void kvm_vcpu_on_spin(struct kvm_vcpu *me)
{
	struct kvm *kvm = me->kvm;
	struct kvm_vcpu *vcpu;
	int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
	int yielded = 0;
	int pass;
	int i;

	/*
	 * We boost the priority of a VCPU that is runnable but not
	 * currently running, because it got preempted by something
	 * else and called schedule in __vcpu_run.  Hopefully that
	 * VCPU is holding the lock that we need and will release it.
	 * We approximate round-robin by starting at the last boosted VCPU.
	 */
	for (pass = 0; pass < 2 && !yielded; pass++) {
		kvm_for_each_vcpu(i, vcpu, kvm) {
			struct task_struct *task = NULL;
			struct pid *pid;
			if (!pass && i < last_boosted_vcpu) {
				i = last_boosted_vcpu;
				continue;
			} else if (pass && i > last_boosted_vcpu)
				break;
			if (vcpu == me)
				continue;
			if (waitqueue_active(&vcpu->wq))
				continue;
			rcu_read_lock();
			pid = rcu_dereference(vcpu->pid);
			if (pid)
				task = get_pid_task(vcpu->pid, PIDTYPE_PID);
			rcu_read_unlock();
			if (!task)
				continue;
			if (task->flags & PF_VCPU) {
				put_task_struct(task);
				continue;
			}
			if (yield_to(task, 1)) {
				put_task_struct(task);
				kvm->last_boosted_vcpu = i;
				yielded = 1;
				break;
			}
			put_task_struct(task);
		}
	}
}
EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);

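/*
 * Back the vcpu mmap() area: page 0 is the shared kvm_run structure,
 * followed (on x86) by the PIO data page and, when configured, the
 * coalesced MMIO ring.
 */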
static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct kvm_vcpu *vcpu = vma->vm_file->private_data;
	struct page *page;

	if (vmf->pgoff == 0)
		page = virt_to_page(vcpu->run);
#ifdef CONFIG_X86
	else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
		page = virt_to_page(vcpu->arch.pio_data);
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
		page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
#endif
	else
		return VM_FAULT_SIGBUS;
	get_page(page);
	vmf->page = page;
	return 0;
}

static const struct vm_operations_struct kvm_vcpu_vm_ops = {
	.fault = kvm_vcpu_fault,
};

static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &kvm_vcpu_vm_ops;
	return 0;
}

static int kvm_vcpu_release(struct inode *inode, struct file *filp)
{
	struct kvm_vcpu *vcpu = filp->private_data;

	kvm_put_kvm(vcpu->kvm);
	return 0;
}

static struct file_operations kvm_vcpu_fops = {
	.release        = kvm_vcpu_release,
	.unlocked_ioctl = kvm_vcpu_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl   = kvm_vcpu_compat_ioctl,
#endif
	.mmap           = kvm_vcpu_mmap,
	.llseek		= noop_llseek,
};

/*
 * Allocates an inode for the vcpu.
 */
static int create_vcpu_fd(struct kvm_vcpu *vcpu)
{
	return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR);
}

/*
 * Creates some virtual cpus.  Good luck creating more than one.
 */
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
{
	int r;
	struct kvm_vcpu *vcpu, *v;

	vcpu = kvm_arch_vcpu_create(kvm, id);
	if (IS_ERR(vcpu))
		return PTR_ERR(vcpu);

	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);

	r = kvm_arch_vcpu_setup(vcpu);
	if (r)
		goto vcpu_destroy;

	mutex_lock(&kvm->lock);
	if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
		r = -EINVAL;
		goto unlock_vcpu_destroy;
	}

	kvm_for_each_vcpu(r, v, kvm)
		if (v->vcpu_id == id) {
			r = -EEXIST;
			goto unlock_vcpu_destroy;
		}

	BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);

	/* Now it's all set up, let userspace reach it */
	kvm_get_kvm(kvm);
	r = create_vcpu_fd(vcpu);
	if (r < 0) {
		kvm_put_kvm(kvm);
		goto unlock_vcpu_destroy;
	}

	kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
	smp_wmb();
	atomic_inc(&kvm->online_vcpus);

#ifdef CONFIG_KVM_APIC_ARCHITECTURE
	if (kvm->bsp_vcpu_id == id)
		kvm->bsp_vcpu = vcpu;
#endif
	mutex_unlock(&kvm->lock);
	return r;

unlock_vcpu_destroy:
	mutex_unlock(&kvm->lock);
vcpu_destroy:
	kvm_arch_vcpu_destroy(vcpu);
	return r;
}

static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
{
	if (sigset) {
		sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
		vcpu->sigset_active = 1;
		vcpu->sigset = *sigset;
	} else
		vcpu->sigset_active = 0;
	return 0;
}

static long kvm_vcpu_ioctl(struct file *filp,
			   unsigned int ioctl, unsigned long arg)
{
	struct kvm_vcpu *vcpu = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r;
	struct kvm_fpu *fpu = NULL;
	struct kvm_sregs *kvm_sregs = NULL;

	if (vcpu->kvm->mm != current->mm)
		return -EIO;

#if defined(CONFIG_S390) || defined(CONFIG_PPC)
	/*
	 * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
	 * so vcpu_load() would break it.
	 */
	if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT)
		return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
#endif


	vcpu_load(vcpu);
	switch (ioctl) {
	case KVM_RUN:
		r = -EINVAL;
		if (arg)
			goto out;
		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
		trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
		break;
	case KVM_GET_REGS: {
		struct kvm_regs *kvm_regs;

		r = -ENOMEM;
		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
		if (!kvm_regs)
			goto out;
		r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
		if (r)
			goto out_free1;
		r = -EFAULT;
		if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
			goto out_free1;
		r = 0;
out_free1:
		kfree(kvm_regs);
		break;
	}
	case KVM_SET_REGS: {
		struct kvm_regs *kvm_regs;

		r = -ENOMEM;
		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
		if (!kvm_regs)
			goto out;
		r = -EFAULT;
		if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs)))
			goto out_free2;
		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
		if (r)
			goto out_free2;
		r = 0;
out_free2:
		kfree(kvm_regs);
		break;
	}
	case KVM_GET_SREGS: {
		kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
		r = -ENOMEM;
		if (!kvm_sregs)
			goto out;
		r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_SREGS: {
		kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
		r = -ENOMEM;
		if (!kvm_sregs)
			goto out;
		r = -EFAULT;
		if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs)))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_GET_MP_STATE: {
		struct kvm_mp_state mp_state;

		r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &mp_state, sizeof mp_state))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_MP_STATE: {
		struct kvm_mp_state mp_state;

		r = -EFAULT;
		if (copy_from_user(&mp_state, argp, sizeof mp_state))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_TRANSLATE: {
		struct kvm_translation tr;

		r = -EFAULT;
		if (copy_from_user(&tr, argp, sizeof tr))
			goto out;
		r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &tr, sizeof tr))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_GUEST_DEBUG: {
		struct kvm_guest_debug dbg;

		r = -EFAULT;
		if (copy_from_user(&dbg, argp, sizeof dbg))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_SIGNAL_MASK: {
		struct kvm_signal_mask __user *sigmask_arg = argp;
		struct kvm_signal_mask kvm_sigmask;
		sigset_t sigset, *p;

		p = NULL;
		if (argp) {
			r = -EFAULT;
			if (copy_from_user(&kvm_sigmask, argp,
					   sizeof kvm_sigmask))
				goto out;
			r = -EINVAL;
			if (kvm_sigmask.len != sizeof sigset)
				goto out;
			r = -EFAULT;
			if (copy_from_user(&sigset, sigmask_arg->sigset,
					   sizeof sigset))
				goto out;
			p = &sigset;
		}
		r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
		break;
	}
	case KVM_GET_FPU: {
		fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
		r = -ENOMEM;
		if (!fpu)
			goto out;
		r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_FPU: {
		fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
		r = -ENOMEM;
		if (!fpu)
			goto out;
		r = -EFAULT;
		if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu)))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
		if (r)
			goto out;
		r = 0;
		break;
	}
	default:
		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
	}
out:
	vcpu_put(vcpu);
	kfree(fpu);
	kfree(kvm_sregs);
	return r;
}

1dda606c
AG
1884#ifdef CONFIG_COMPAT
1885static long kvm_vcpu_compat_ioctl(struct file *filp,
1886 unsigned int ioctl, unsigned long arg)
1887{
1888 struct kvm_vcpu *vcpu = filp->private_data;
1889 void __user *argp = compat_ptr(arg);
1890 int r;
1891
1892 if (vcpu->kvm->mm != current->mm)
1893 return -EIO;
1894
1895 switch (ioctl) {
1896 case KVM_SET_SIGNAL_MASK: {
1897 struct kvm_signal_mask __user *sigmask_arg = argp;
1898 struct kvm_signal_mask kvm_sigmask;
1899 compat_sigset_t csigset;
1900 sigset_t sigset;
1901
1902 if (argp) {
1903 r = -EFAULT;
1904 if (copy_from_user(&kvm_sigmask, argp,
1905 sizeof kvm_sigmask))
1906 goto out;
1907 r = -EINVAL;
1908 if (kvm_sigmask.len != sizeof csigset)
1909 goto out;
1910 r = -EFAULT;
1911 if (copy_from_user(&csigset, sigmask_arg->sigset,
1912 sizeof csigset))
1913 goto out;
 1914 sigset_from_compat(&sigset, &csigset);
 1915 }
 1916 r = kvm_vcpu_ioctl_set_sigmask(vcpu, argp ? &sigset : NULL);
1917 break;
1918 }
1919 default:
1920 r = kvm_vcpu_ioctl(filp, ioctl, arg);
1921 }
1922
1923out:
1924 return r;
1925}
1926#endif
1927
bccf2150
AK
1928static long kvm_vm_ioctl(struct file *filp,
1929 unsigned int ioctl, unsigned long arg)
1930{
1931 struct kvm *kvm = filp->private_data;
1932 void __user *argp = (void __user *)arg;
1fe779f8 1933 int r;
bccf2150 1934
6d4e4c4f
AK
1935 if (kvm->mm != current->mm)
1936 return -EIO;
bccf2150
AK
1937 switch (ioctl) {
1938 case KVM_CREATE_VCPU:
1939 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
1940 if (r < 0)
1941 goto out;
1942 break;
6fc138d2
IE
1943 case KVM_SET_USER_MEMORY_REGION: {
1944 struct kvm_userspace_memory_region kvm_userspace_mem;
1945
1946 r = -EFAULT;
1947 if (copy_from_user(&kvm_userspace_mem, argp,
1948 sizeof kvm_userspace_mem))
1949 goto out;
1950
1951 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
6aa8b732
AK
1952 if (r)
1953 goto out;
1954 break;
1955 }
1956 case KVM_GET_DIRTY_LOG: {
1957 struct kvm_dirty_log log;
1958
1959 r = -EFAULT;
2f366987 1960 if (copy_from_user(&log, argp, sizeof log))
6aa8b732 1961 goto out;
2c6f5df9 1962 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
6aa8b732
AK
1963 if (r)
1964 goto out;
1965 break;
1966 }
5f94c174
LV
1967#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
1968 case KVM_REGISTER_COALESCED_MMIO: {
1969 struct kvm_coalesced_mmio_zone zone;
1970 r = -EFAULT;
1971 if (copy_from_user(&zone, argp, sizeof zone))
1972 goto out;
5f94c174
LV
1973 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
1974 if (r)
1975 goto out;
1976 r = 0;
1977 break;
1978 }
1979 case KVM_UNREGISTER_COALESCED_MMIO: {
1980 struct kvm_coalesced_mmio_zone zone;
1981 r = -EFAULT;
1982 if (copy_from_user(&zone, argp, sizeof zone))
1983 goto out;
5f94c174
LV
1984 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
1985 if (r)
1986 goto out;
1987 r = 0;
1988 break;
1989 }
1990#endif
721eecbf
GH
1991 case KVM_IRQFD: {
1992 struct kvm_irqfd data;
1993
1994 r = -EFAULT;
1995 if (copy_from_user(&data, argp, sizeof data))
1996 goto out;
1997 r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags);
1998 break;
1999 }
d34e6b17
GH
2000 case KVM_IOEVENTFD: {
2001 struct kvm_ioeventfd data;
2002
2003 r = -EFAULT;
2004 if (copy_from_user(&data, argp, sizeof data))
2005 goto out;
2006 r = kvm_ioeventfd(kvm, &data);
2007 break;
2008 }
73880c80
GN
2009#ifdef CONFIG_KVM_APIC_ARCHITECTURE
2010 case KVM_SET_BOOT_CPU_ID:
2011 r = 0;
894a9c55 2012 mutex_lock(&kvm->lock);
73880c80
GN
2013 if (atomic_read(&kvm->online_vcpus) != 0)
2014 r = -EBUSY;
2015 else
2016 kvm->bsp_vcpu_id = arg;
894a9c55 2017 mutex_unlock(&kvm->lock);
73880c80
GN
2018 break;
2019#endif
f17abe9a 2020 default:
1fe779f8 2021 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
bfd99ff5
AK
2022 if (r == -ENOTTY)
2023 r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
f17abe9a
AK
2024 }
2025out:
2026 return r;
2027}
2028
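/*
 * Editorial sketch, not part of kvm_main.c: how a VMM typically feeds the
 * KVM_SET_USER_MEMORY_REGION path above.  vm_fd is assumed to come from
 * KVM_CREATE_VM; the mmap() flags are one reasonable choice, not the only one.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

static void *add_guest_ram(int vm_fd, __u64 gpa, __u64 size, __u32 slot)
{
	struct kvm_userspace_memory_region region;
	void *host;

	host = mmap(NULL, size, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (host == MAP_FAILED)
		return NULL;

	region.slot = slot;
	region.flags = 0;
	region.guest_phys_addr = gpa;
	region.memory_size = size;
	region.userspace_addr = (unsigned long)host;

	/* Lands in kvm_vm_ioctl_set_memory_region() via the case above. */
	if (ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region) < 0) {
		munmap(host, size);
		return NULL;
	}
	return host;
}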
6ff5894c
AB
2029#ifdef CONFIG_COMPAT
2030struct compat_kvm_dirty_log {
2031 __u32 slot;
2032 __u32 padding1;
2033 union {
2034 compat_uptr_t dirty_bitmap; /* one bit per page */
2035 __u64 padding2;
2036 };
2037};
2038
2039static long kvm_vm_compat_ioctl(struct file *filp,
2040 unsigned int ioctl, unsigned long arg)
2041{
2042 struct kvm *kvm = filp->private_data;
2043 int r;
2044
2045 if (kvm->mm != current->mm)
2046 return -EIO;
2047 switch (ioctl) {
2048 case KVM_GET_DIRTY_LOG: {
2049 struct compat_kvm_dirty_log compat_log;
2050 struct kvm_dirty_log log;
2051
2052 r = -EFAULT;
2053 if (copy_from_user(&compat_log, (void __user *)arg,
2054 sizeof(compat_log)))
2055 goto out;
2056 log.slot = compat_log.slot;
2057 log.padding1 = compat_log.padding1;
2058 log.padding2 = compat_log.padding2;
2059 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
2060
2061 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
2062 if (r)
2063 goto out;
2064 break;
2065 }
2066 default:
2067 r = kvm_vm_ioctl(filp, ioctl, arg);
2068 }
2069
2070out:
2071 return r;
2072}
2073#endif
2074
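/*
 * Editorial sketch, not part of kvm_main.c: fetching a slot's dirty bitmap
 * through the KVM_GET_DIRTY_LOG path above (the compat handler simply
 * repacks the same arguments for 32-bit callers).  npages is assumed to be
 * the slot size in pages.
 */
#include <linux/kvm.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static unsigned long *fetch_dirty_log(int vm_fd, __u32 slot, size_t npages)
{
	struct kvm_dirty_log log = { .slot = slot };
	/* One bit per page, rounded up to 64-bit words, covering what the
	 * kernel copies out. */
	size_t bytes = ((npages + 63) / 64) * 8;
	unsigned long *bitmap = calloc(1, bytes);

	if (!bitmap)
		return NULL;
	log.dirty_bitmap = bitmap;
	if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) < 0) {
		free(bitmap);
		return NULL;
	}
	return bitmap;
}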
e4a533a4 2075static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
f17abe9a 2076{
777b3f49
MT
2077 struct page *page[1];
2078 unsigned long addr;
2079 int npages;
2080 gfn_t gfn = vmf->pgoff;
f17abe9a 2081 struct kvm *kvm = vma->vm_file->private_data;
f17abe9a 2082
777b3f49
MT
2083 addr = gfn_to_hva(kvm, gfn);
2084 if (kvm_is_error_hva(addr))
e4a533a4 2085 return VM_FAULT_SIGBUS;
777b3f49
MT
2086
2087 npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
2088 NULL);
2089 if (unlikely(npages != 1))
e4a533a4 2090 return VM_FAULT_SIGBUS;
777b3f49
MT
2091
2092 vmf->page = page[0];
e4a533a4 2093 return 0;
f17abe9a
AK
2094}
2095
f0f37e2f 2096static const struct vm_operations_struct kvm_vm_vm_ops = {
e4a533a4 2097 .fault = kvm_vm_fault,
f17abe9a
AK
2098};
2099
2100static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
2101{
2102 vma->vm_ops = &kvm_vm_vm_ops;
2103 return 0;
2104}
2105
3d3aab1b 2106static struct file_operations kvm_vm_fops = {
f17abe9a
AK
2107 .release = kvm_vm_release,
2108 .unlocked_ioctl = kvm_vm_ioctl,
6ff5894c
AB
2109#ifdef CONFIG_COMPAT
2110 .compat_ioctl = kvm_vm_compat_ioctl,
2111#endif
f17abe9a 2112 .mmap = kvm_vm_mmap,
6038f373 2113 .llseek = noop_llseek,
f17abe9a
AK
2114};
2115
2116static int kvm_dev_ioctl_create_vm(void)
2117{
aac87636 2118 int r;
f17abe9a
AK
2119 struct kvm *kvm;
2120
f17abe9a 2121 kvm = kvm_create_vm();
d6d28168
AK
2122 if (IS_ERR(kvm))
2123 return PTR_ERR(kvm);
6ce5a090
TY
2124#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
2125 r = kvm_coalesced_mmio_init(kvm);
2126 if (r < 0) {
2127 kvm_put_kvm(kvm);
2128 return r;
2129 }
2130#endif
aac87636
HC
2131 r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
2132 if (r < 0)
66c0b394 2133 kvm_put_kvm(kvm);
f17abe9a 2134
aac87636 2135 return r;
f17abe9a
AK
2136}
2137
1a811b61
AK
2138static long kvm_dev_ioctl_check_extension_generic(long arg)
2139{
2140 switch (arg) {
ca9edaee 2141 case KVM_CAP_USER_MEMORY:
1a811b61 2142 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
4cd481f6 2143 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
73880c80
GN
2144#ifdef CONFIG_KVM_APIC_ARCHITECTURE
2145 case KVM_CAP_SET_BOOT_CPU_ID:
2146#endif
a9c7399d 2147 case KVM_CAP_INTERNAL_ERROR_DATA:
1a811b61 2148 return 1;
399ec807
AK
2149#ifdef CONFIG_HAVE_KVM_IRQCHIP
2150 case KVM_CAP_IRQ_ROUTING:
36463146 2151 return KVM_MAX_IRQ_ROUTES;
399ec807 2152#endif
1a811b61
AK
2153 default:
2154 break;
2155 }
2156 return kvm_dev_ioctl_check_extension(arg);
2157}
2158
f17abe9a
AK
2159static long kvm_dev_ioctl(struct file *filp,
2160 unsigned int ioctl, unsigned long arg)
2161{
07c45a36 2162 long r = -EINVAL;
f17abe9a
AK
2163
2164 switch (ioctl) {
2165 case KVM_GET_API_VERSION:
f0fe5108
AK
2166 r = -EINVAL;
2167 if (arg)
2168 goto out;
f17abe9a
AK
2169 r = KVM_API_VERSION;
2170 break;
2171 case KVM_CREATE_VM:
f0fe5108
AK
2172 r = -EINVAL;
2173 if (arg)
2174 goto out;
f17abe9a
AK
2175 r = kvm_dev_ioctl_create_vm();
2176 break;
018d00d2 2177 case KVM_CHECK_EXTENSION:
1a811b61 2178 r = kvm_dev_ioctl_check_extension_generic(arg);
5d308f45 2179 break;
07c45a36
AK
2180 case KVM_GET_VCPU_MMAP_SIZE:
2181 r = -EINVAL;
2182 if (arg)
2183 goto out;
adb1ff46
AK
2184 r = PAGE_SIZE; /* struct kvm_run */
2185#ifdef CONFIG_X86
2186 r += PAGE_SIZE; /* pio data page */
5f94c174
LV
2187#endif
2188#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
2189 r += PAGE_SIZE; /* coalesced mmio ring page */
adb1ff46 2190#endif
07c45a36 2191 break;
d4c9ff2d
FEL
2192 case KVM_TRACE_ENABLE:
2193 case KVM_TRACE_PAUSE:
2194 case KVM_TRACE_DISABLE:
2023a29c 2195 r = -EOPNOTSUPP;
d4c9ff2d 2196 break;
6aa8b732 2197 default:
043405e1 2198 return kvm_arch_dev_ioctl(filp, ioctl, arg);
6aa8b732
AK
2199 }
2200out:
2201 return r;
2202}
2203
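/*
 * Editorial sketch, not part of kvm_main.c: the usual /dev/kvm handshake
 * served by kvm_dev_ioctl() above.  Error handling is abbreviated and the
 * caller is assumed to mmap() the vcpu file later using mmap_size.
 */
#include <fcntl.h>
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int open_kvm_and_create_vm(int *mmap_size)
{
	int kvm_fd;

	kvm_fd = open("/dev/kvm", O_RDWR);
	if (kvm_fd < 0)
		return -1;

	/* Must match the KVM_API_VERSION returned by the case above. */
	if (ioctl(kvm_fd, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
		return -1;

	/* Generic capabilities are answered before reaching the arch code. */
	if (!ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_USER_MEMORY))
		return -1;

	/* Size of the per-vcpu kvm_run mapping: one page plus arch extras. */
	*mmap_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
	if (*mmap_size < 0)
		return -1;

	/* Returns a new VM file descriptor backed by kvm_vm_fops. */
	return ioctl(kvm_fd, KVM_CREATE_VM, 0);
}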
6aa8b732 2204static struct file_operations kvm_chardev_ops = {
6aa8b732
AK
2205 .unlocked_ioctl = kvm_dev_ioctl,
2206 .compat_ioctl = kvm_dev_ioctl,
6038f373 2207 .llseek = noop_llseek,
6aa8b732
AK
2208};
2209
2210static struct miscdevice kvm_dev = {
bbe4432e 2211 KVM_MINOR,
6aa8b732
AK
2212 "kvm",
2213 &kvm_chardev_ops,
2214};
2215
75b7127c 2216static void hardware_enable_nolock(void *junk)
1b6c0168
AK
2217{
2218 int cpu = raw_smp_processor_id();
10474ae8 2219 int r;
1b6c0168 2220
7f59f492 2221 if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
1b6c0168 2222 return;
10474ae8 2223
7f59f492 2224 cpumask_set_cpu(cpu, cpus_hardware_enabled);
10474ae8
AG
2225
2226 r = kvm_arch_hardware_enable(NULL);
2227
2228 if (r) {
2229 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
2230 atomic_inc(&hardware_enable_failed);
2231 printk(KERN_INFO "kvm: enabling virtualization on "
2232 "CPU%d failed\n", cpu);
2233 }
1b6c0168
AK
2234}
2235
75b7127c
TY
2236static void hardware_enable(void *junk)
2237{
e935b837 2238 raw_spin_lock(&kvm_lock);
75b7127c 2239 hardware_enable_nolock(junk);
e935b837 2240 raw_spin_unlock(&kvm_lock);
75b7127c
TY
2241}
2242
2243static void hardware_disable_nolock(void *junk)
1b6c0168
AK
2244{
2245 int cpu = raw_smp_processor_id();
2246
7f59f492 2247 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
1b6c0168 2248 return;
7f59f492 2249 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
e9b11c17 2250 kvm_arch_hardware_disable(NULL);
1b6c0168
AK
2251}
2252
75b7127c
TY
2253static void hardware_disable(void *junk)
2254{
e935b837 2255 raw_spin_lock(&kvm_lock);
75b7127c 2256 hardware_disable_nolock(junk);
e935b837 2257 raw_spin_unlock(&kvm_lock);
75b7127c
TY
2258}
2259
10474ae8
AG
2260static void hardware_disable_all_nolock(void)
2261{
2262 BUG_ON(!kvm_usage_count);
2263
2264 kvm_usage_count--;
2265 if (!kvm_usage_count)
75b7127c 2266 on_each_cpu(hardware_disable_nolock, NULL, 1);
10474ae8
AG
2267}
2268
2269static void hardware_disable_all(void)
2270{
e935b837 2271 raw_spin_lock(&kvm_lock);
10474ae8 2272 hardware_disable_all_nolock();
e935b837 2273 raw_spin_unlock(&kvm_lock);
10474ae8
AG
2274}
2275
2276static int hardware_enable_all(void)
2277{
2278 int r = 0;
2279
e935b837 2280 raw_spin_lock(&kvm_lock);
10474ae8
AG
2281
2282 kvm_usage_count++;
2283 if (kvm_usage_count == 1) {
2284 atomic_set(&hardware_enable_failed, 0);
75b7127c 2285 on_each_cpu(hardware_enable_nolock, NULL, 1);
10474ae8
AG
2286
2287 if (atomic_read(&hardware_enable_failed)) {
2288 hardware_disable_all_nolock();
2289 r = -EBUSY;
2290 }
2291 }
2292
e935b837 2293 raw_spin_unlock(&kvm_lock);
10474ae8
AG
2294
2295 return r;
2296}
2297
774c47f1
AK
2298static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
2299 void *v)
2300{
2301 int cpu = (long)v;
2302
10474ae8
AG
2303 if (!kvm_usage_count)
2304 return NOTIFY_OK;
2305
1a6f4d7f 2306 val &= ~CPU_TASKS_FROZEN;
774c47f1 2307 switch (val) {
cec9ad27 2308 case CPU_DYING:
6ec8a856
AK
2309 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
2310 cpu);
2311 hardware_disable(NULL);
2312 break;
da908f2f 2313 case CPU_STARTING:
43934a38
JK
2314 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
2315 cpu);
da908f2f 2316 hardware_enable(NULL);
774c47f1
AK
2317 break;
2318 }
2319 return NOTIFY_OK;
2320}
2321
4ecac3fd 2322
b7c4145b 2323asmlinkage void kvm_spurious_fault(void)
4ecac3fd 2324{
4ecac3fd
AK
2325 /* Fault while not rebooting. We want the trace. */
2326 BUG();
2327}
b7c4145b 2328EXPORT_SYMBOL_GPL(kvm_spurious_fault);
4ecac3fd 2329
9a2b85c6 2330static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
d77c26fc 2331 void *v)
9a2b85c6 2332{
8e1c1815
SY
2333 /*
2334 * Some (well, at least mine) BIOSes hang on reboot if
2335 * in vmx root mode.
2336 *
 2337 * And Intel TXT requires VMX to be off on all CPUs when the system shuts down.
2338 */
2339 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
2340 kvm_rebooting = true;
75b7127c 2341 on_each_cpu(hardware_disable_nolock, NULL, 1);
9a2b85c6
RR
2342 return NOTIFY_OK;
2343}
2344
2345static struct notifier_block kvm_reboot_notifier = {
2346 .notifier_call = kvm_reboot,
2347 .priority = 0,
2348};
2349
e93f8a0f 2350static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
2eeb2e94
GH
2351{
2352 int i;
2353
2354 for (i = 0; i < bus->dev_count; i++) {
2355 struct kvm_io_device *pos = bus->devs[i];
2356
2357 kvm_iodevice_destructor(pos);
2358 }
e93f8a0f 2359 kfree(bus);
2eeb2e94
GH
2360}
2361
bda9020e 2362/* kvm_io_bus_write - called under kvm->slots_lock */
e93f8a0f 2363int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
bda9020e 2364 int len, const void *val)
2eeb2e94
GH
2365{
2366 int i;
90d83dc3
LJ
2367 struct kvm_io_bus *bus;
2368
2369 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
bda9020e
MT
2370 for (i = 0; i < bus->dev_count; i++)
2371 if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
2372 return 0;
2373 return -EOPNOTSUPP;
2374}
2eeb2e94 2375
bda9020e 2376/* kvm_io_bus_read - called under kvm->slots_lock */
e93f8a0f
MT
2377int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2378 int len, void *val)
bda9020e
MT
2379{
2380 int i;
90d83dc3 2381 struct kvm_io_bus *bus;
e93f8a0f 2382
90d83dc3 2383 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
bda9020e
MT
2384 for (i = 0; i < bus->dev_count; i++)
2385 if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
2386 return 0;
2387 return -EOPNOTSUPP;
2eeb2e94
GH
2388}
2389
79fac95e 2390/* Caller must hold slots_lock. */
e93f8a0f
MT
2391int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
2392 struct kvm_io_device *dev)
6c474694 2393{
e93f8a0f 2394 struct kvm_io_bus *new_bus, *bus;
090b7aff 2395
e93f8a0f 2396 bus = kvm->buses[bus_idx];
090b7aff
GH
2397 if (bus->dev_count > NR_IOBUS_DEVS-1)
2398 return -ENOSPC;
2eeb2e94 2399
e93f8a0f
MT
2400 new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
2401 if (!new_bus)
2402 return -ENOMEM;
2403 memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
2404 new_bus->devs[new_bus->dev_count++] = dev;
2405 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
2406 synchronize_srcu_expedited(&kvm->srcu);
2407 kfree(bus);
090b7aff
GH
2408
2409 return 0;
2410}
2411
79fac95e 2412/* Caller must hold slots_lock. */
e93f8a0f
MT
2413int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
2414 struct kvm_io_device *dev)
090b7aff 2415{
e93f8a0f
MT
2416 int i, r;
2417 struct kvm_io_bus *new_bus, *bus;
090b7aff 2418
e93f8a0f
MT
2419 new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
2420 if (!new_bus)
2421 return -ENOMEM;
090b7aff 2422
e93f8a0f
MT
2423 bus = kvm->buses[bus_idx];
2424 memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
2425
2426 r = -ENOENT;
2427 for (i = 0; i < new_bus->dev_count; i++)
2428 if (new_bus->devs[i] == dev) {
2429 r = 0;
2430 new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];
090b7aff
GH
2431 break;
2432 }
e93f8a0f
MT
2433
2434 if (r) {
2435 kfree(new_bus);
2436 return r;
2437 }
2438
2439 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
2440 synchronize_srcu_expedited(&kvm->srcu);
2441 kfree(bus);
2442 return r;
2eeb2e94
GH
2443}
2444
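/*
 * Editorial sketch, not part of kvm_main.c: a rough in-kernel user of the
 * bus API above, assuming <linux/kvm_host.h> and "iodev.h" are in scope.
 * The dummy_mmio_* names are invented for illustration; the mutex_lock()
 * pair reflects the "Caller must hold slots_lock" rule.
 */
struct dummy_mmio_dev {
	struct kvm_io_device dev;
};

static int dummy_mmio_write(struct kvm_io_device *this, gpa_t addr,
			    int len, const void *val)
{
	/*
	 * Returning 0 claims the access, so kvm_io_bus_write() above stops
	 * iterating; a real device would first check that addr falls in its
	 * own range and return -EOPNOTSUPP otherwise.
	 */
	return 0;
}

static const struct kvm_io_device_ops dummy_mmio_ops = {
	.write = dummy_mmio_write,
};

static int dummy_mmio_register(struct kvm *kvm, struct dummy_mmio_dev *d)
{
	int ret;

	kvm_iodevice_init(&d->dev, &dummy_mmio_ops);
	mutex_lock(&kvm->slots_lock);	/* required by register_dev above */
	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &d->dev);
	mutex_unlock(&kvm->slots_lock);
	return ret;
}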
774c47f1
AK
2445static struct notifier_block kvm_cpu_notifier = {
2446 .notifier_call = kvm_cpu_hotplug,
774c47f1
AK
2447};
2448
8b88b099 2449static int vm_stat_get(void *_offset, u64 *val)
ba1389b7
AK
2450{
2451 unsigned offset = (long)_offset;
ba1389b7
AK
2452 struct kvm *kvm;
2453
8b88b099 2454 *val = 0;
e935b837 2455 raw_spin_lock(&kvm_lock);
ba1389b7 2456 list_for_each_entry(kvm, &vm_list, vm_list)
8b88b099 2457 *val += *(u32 *)((void *)kvm + offset);
e935b837 2458 raw_spin_unlock(&kvm_lock);
8b88b099 2459 return 0;
ba1389b7
AK
2460}
2461
2462DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");
2463
8b88b099 2464static int vcpu_stat_get(void *_offset, u64 *val)
1165f5fe
AK
2465{
2466 unsigned offset = (long)_offset;
1165f5fe
AK
2467 struct kvm *kvm;
2468 struct kvm_vcpu *vcpu;
2469 int i;
2470
8b88b099 2471 *val = 0;
e935b837 2472 raw_spin_lock(&kvm_lock);
1165f5fe 2473 list_for_each_entry(kvm, &vm_list, vm_list)
988a2cae
GN
2474 kvm_for_each_vcpu(i, vcpu, kvm)
2475 *val += *(u32 *)((void *)vcpu + offset);
2476
e935b837 2477 raw_spin_unlock(&kvm_lock);
8b88b099 2478 return 0;
1165f5fe
AK
2479}
2480
ba1389b7
AK
2481DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");
2482
828c0950 2483static const struct file_operations *stat_fops[] = {
ba1389b7
AK
2484 [KVM_STAT_VCPU] = &vcpu_stat_fops,
2485 [KVM_STAT_VM] = &vm_stat_fops,
2486};
1165f5fe 2487
a16b043c 2488static void kvm_init_debug(void)
6aa8b732
AK
2489{
2490 struct kvm_stats_debugfs_item *p;
2491
76f7c879 2492 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
6aa8b732 2493 for (p = debugfs_entries; p->name; ++p)
76f7c879 2494 p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
1165f5fe 2495 (void *)(long)p->offset,
ba1389b7 2496 stat_fops[p->kind]);
6aa8b732
AK
2497}
2498
2499static void kvm_exit_debug(void)
2500{
2501 struct kvm_stats_debugfs_item *p;
2502
2503 for (p = debugfs_entries; p->name; ++p)
2504 debugfs_remove(p->dentry);
76f7c879 2505 debugfs_remove(kvm_debugfs_dir);
6aa8b732
AK
2506}
2507
fb3600cc 2508static int kvm_suspend(void)
59ae6c6b 2509{
10474ae8 2510 if (kvm_usage_count)
75b7127c 2511 hardware_disable_nolock(NULL);
59ae6c6b
AK
2512 return 0;
2513}
2514
fb3600cc 2515static void kvm_resume(void)
59ae6c6b 2516{
ca84d1a2 2517 if (kvm_usage_count) {
e935b837 2518 WARN_ON(raw_spin_is_locked(&kvm_lock));
75b7127c 2519 hardware_enable_nolock(NULL);
ca84d1a2 2520 }
59ae6c6b
AK
2521}
2522
fb3600cc 2523static struct syscore_ops kvm_syscore_ops = {
59ae6c6b
AK
2524 .suspend = kvm_suspend,
2525 .resume = kvm_resume,
2526};
2527
cea7bb21 2528struct page *bad_page;
35149e21 2529pfn_t bad_pfn;
6aa8b732 2530
15ad7146
AK
2531static inline
2532struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
2533{
2534 return container_of(pn, struct kvm_vcpu, preempt_notifier);
2535}
2536
2537static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
2538{
2539 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
2540
e9b11c17 2541 kvm_arch_vcpu_load(vcpu, cpu);
15ad7146
AK
2542}
2543
2544static void kvm_sched_out(struct preempt_notifier *pn,
2545 struct task_struct *next)
2546{
2547 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
2548
e9b11c17 2549 kvm_arch_vcpu_put(vcpu);
15ad7146
AK
2550}
2551
0ee75bea 2552int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
c16f862d 2553 struct module *module)
6aa8b732
AK
2554{
2555 int r;
002c7f7c 2556 int cpu;
6aa8b732 2557
f8c16bba
ZX
2558 r = kvm_arch_init(opaque);
2559 if (r)
d2308784 2560 goto out_fail;
cb498ea2
ZX
2561
2562 bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2563
2564 if (bad_page == NULL) {
2565 r = -ENOMEM;
2566 goto out;
2567 }
2568
35149e21
AL
2569 bad_pfn = page_to_pfn(bad_page);
2570
bf998156
HY
2571 hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2572
2573 if (hwpoison_page == NULL) {
2574 r = -ENOMEM;
2575 goto out_free_0;
2576 }
2577
2578 hwpoison_pfn = page_to_pfn(hwpoison_page);
2579
edba23e5
GN
2580 fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2581
2582 if (fault_page == NULL) {
2583 r = -ENOMEM;
2584 goto out_free_0;
2585 }
2586
2587 fault_pfn = page_to_pfn(fault_page);
2588
8437a617 2589 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
7f59f492
RR
2590 r = -ENOMEM;
2591 goto out_free_0;
2592 }
2593
e9b11c17 2594 r = kvm_arch_hardware_setup();
6aa8b732 2595 if (r < 0)
7f59f492 2596 goto out_free_0a;
6aa8b732 2597
002c7f7c
YS
2598 for_each_online_cpu(cpu) {
2599 smp_call_function_single(cpu,
e9b11c17 2600 kvm_arch_check_processor_compat,
8691e5a8 2601 &r, 1);
002c7f7c 2602 if (r < 0)
d2308784 2603 goto out_free_1;
002c7f7c
YS
2604 }
2605
774c47f1
AK
2606 r = register_cpu_notifier(&kvm_cpu_notifier);
2607 if (r)
d2308784 2608 goto out_free_2;
6aa8b732
AK
2609 register_reboot_notifier(&kvm_reboot_notifier);
2610
c16f862d 2611 /* A kmem cache lets us meet the alignment requirements of fx_save. */
0ee75bea
AK
2612 if (!vcpu_align)
2613 vcpu_align = __alignof__(struct kvm_vcpu);
2614 kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
56919c5c 2615 0, NULL);
c16f862d
RR
2616 if (!kvm_vcpu_cache) {
2617 r = -ENOMEM;
fb3600cc 2618 goto out_free_3;
c16f862d
RR
2619 }
2620
af585b92
GN
2621 r = kvm_async_pf_init();
2622 if (r)
2623 goto out_free;
2624
6aa8b732 2625 kvm_chardev_ops.owner = module;
3d3aab1b
CB
2626 kvm_vm_fops.owner = module;
2627 kvm_vcpu_fops.owner = module;
6aa8b732
AK
2628
2629 r = misc_register(&kvm_dev);
2630 if (r) {
d77c26fc 2631 printk(KERN_ERR "kvm: misc device register failed\n");
af585b92 2632 goto out_unreg;
6aa8b732
AK
2633 }
2634
fb3600cc
RW
2635 register_syscore_ops(&kvm_syscore_ops);
2636
15ad7146
AK
2637 kvm_preempt_ops.sched_in = kvm_sched_in;
2638 kvm_preempt_ops.sched_out = kvm_sched_out;
2639
0ea4ed8e
DW
2640 kvm_init_debug();
2641
c7addb90 2642 return 0;
6aa8b732 2643
af585b92
GN
2644out_unreg:
2645 kvm_async_pf_deinit();
6aa8b732 2646out_free:
c16f862d 2647 kmem_cache_destroy(kvm_vcpu_cache);
d2308784 2648out_free_3:
6aa8b732 2649 unregister_reboot_notifier(&kvm_reboot_notifier);
774c47f1 2650 unregister_cpu_notifier(&kvm_cpu_notifier);
d2308784 2651out_free_2:
d2308784 2652out_free_1:
e9b11c17 2653 kvm_arch_hardware_unsetup();
7f59f492
RR
2654out_free_0a:
2655 free_cpumask_var(cpus_hardware_enabled);
d2308784 2656out_free_0:
edba23e5
GN
2657 if (fault_page)
2658 __free_page(fault_page);
bf998156
HY
2659 if (hwpoison_page)
2660 __free_page(hwpoison_page);
d2308784 2661 __free_page(bad_page);
ca45aaae 2662out:
f8c16bba 2663 kvm_arch_exit();
d2308784 2664out_fail:
6aa8b732
AK
2665 return r;
2666}
cb498ea2 2667EXPORT_SYMBOL_GPL(kvm_init);
6aa8b732 2668
cb498ea2 2669void kvm_exit(void)
6aa8b732 2670{
0ea4ed8e 2671 kvm_exit_debug();
6aa8b732 2672 misc_deregister(&kvm_dev);
c16f862d 2673 kmem_cache_destroy(kvm_vcpu_cache);
af585b92 2674 kvm_async_pf_deinit();
fb3600cc 2675 unregister_syscore_ops(&kvm_syscore_ops);
6aa8b732 2676 unregister_reboot_notifier(&kvm_reboot_notifier);
59ae6c6b 2677 unregister_cpu_notifier(&kvm_cpu_notifier);
75b7127c 2678 on_each_cpu(hardware_disable_nolock, NULL, 1);
e9b11c17 2679 kvm_arch_hardware_unsetup();
f8c16bba 2680 kvm_arch_exit();
7f59f492 2681 free_cpumask_var(cpus_hardware_enabled);
bf998156 2682 __free_page(hwpoison_page);
cea7bb21 2683 __free_page(bad_page);
6aa8b732 2684}
cb498ea2 2685EXPORT_SYMBOL_GPL(kvm_exit);