043405e1
CO
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * derived from drivers/kvm/kvm_main.c
5 *
6 * Copyright (C) 2006 Qumranet, Inc.
4d5c5d0f
BAY
7 * Copyright (C) 2008 Qumranet, Inc.
8 * Copyright IBM Corporation, 2008
043405e1
CO
9 *
10 * Authors:
11 * Avi Kivity <avi@qumranet.com>
12 * Yaniv Kamay <yaniv@qumranet.com>
4d5c5d0f
BAY
13 * Amit Shah <amit.shah@qumranet.com>
14 * Ben-Ami Yassour <benami@il.ibm.com>
043405e1
CO
15 *
16 * This work is licensed under the terms of the GNU GPL, version 2. See
17 * the COPYING file in the top-level directory.
18 *
19 */
20
edf88417 21#include <linux/kvm_host.h>
313a3dc7 22#include "irq.h"
1d737c8a 23#include "mmu.h"
7837699f 24#include "i8254.h"
37817f29 25#include "tss.h"
5fdbf976 26#include "kvm_cache_regs.h"
26eef70c 27#include "x86.h"
313a3dc7 28
18068523 29#include <linux/clocksource.h>
4d5c5d0f 30#include <linux/interrupt.h>
313a3dc7
CO
31#include <linux/kvm.h>
32#include <linux/fs.h>
33#include <linux/vmalloc.h>
5fb76f9b 34#include <linux/module.h>
0de10343 35#include <linux/mman.h>
2bacc55c 36#include <linux/highmem.h>
62c476c7 37#include <linux/intel-iommu.h>
043405e1
CO
38
39#include <asm/uaccess.h>
d825ed0a 40#include <asm/msr.h>
a5f61300 41#include <asm/desc.h>
043405e1 42
313a3dc7 43#define MAX_IO_MSRS 256
a03490ed
CO
44#define CR0_RESERVED_BITS \
45 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
46 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
47 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
48#define CR4_RESERVED_BITS \
49 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
50 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
51 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
52 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
53
54#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
50a37eb4
JR
55/* EFER defaults:
 56 * - enable syscall by default because it is emulated by KVM
 57 * - enable LME and LMA by default on 64-bit KVM
58 */
59#ifdef CONFIG_X86_64
60static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
61#else
62static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
63#endif
313a3dc7 64
ba1389b7
AK
65#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
66#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
417bc304 67
674eea0f
AK
68static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
69 struct kvm_cpuid_entry2 __user *entries);
70
97896d04 71struct kvm_x86_ops *kvm_x86_ops;
5fdbf976 72EXPORT_SYMBOL_GPL(kvm_x86_ops);
97896d04 73
417bc304 74struct kvm_stats_debugfs_item debugfs_entries[] = {
ba1389b7
AK
75 { "pf_fixed", VCPU_STAT(pf_fixed) },
76 { "pf_guest", VCPU_STAT(pf_guest) },
77 { "tlb_flush", VCPU_STAT(tlb_flush) },
78 { "invlpg", VCPU_STAT(invlpg) },
79 { "exits", VCPU_STAT(exits) },
80 { "io_exits", VCPU_STAT(io_exits) },
81 { "mmio_exits", VCPU_STAT(mmio_exits) },
82 { "signal_exits", VCPU_STAT(signal_exits) },
83 { "irq_window", VCPU_STAT(irq_window_exits) },
f08864b4 84 { "nmi_window", VCPU_STAT(nmi_window_exits) },
ba1389b7
AK
85 { "halt_exits", VCPU_STAT(halt_exits) },
86 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
f11c3a8d 87 { "hypercalls", VCPU_STAT(hypercalls) },
ba1389b7 88 { "request_irq", VCPU_STAT(request_irq_exits) },
c4abb7c9 89 { "request_nmi", VCPU_STAT(request_nmi_exits) },
ba1389b7
AK
90 { "irq_exits", VCPU_STAT(irq_exits) },
91 { "host_state_reload", VCPU_STAT(host_state_reload) },
92 { "efer_reload", VCPU_STAT(efer_reload) },
93 { "fpu_reload", VCPU_STAT(fpu_reload) },
94 { "insn_emulation", VCPU_STAT(insn_emulation) },
95 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
fa89a817 96 { "irq_injections", VCPU_STAT(irq_injections) },
c4abb7c9 97 { "nmi_injections", VCPU_STAT(nmi_injections) },
4cee5764
AK
98 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
99 { "mmu_pte_write", VM_STAT(mmu_pte_write) },
100 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
101 { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
102 { "mmu_flooded", VM_STAT(mmu_flooded) },
103 { "mmu_recycled", VM_STAT(mmu_recycled) },
dfc5aa00 104 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
4731d4c7 105 { "mmu_unsync", VM_STAT(mmu_unsync) },
0f74a24c 106 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
05da4558 107 { "largepages", VM_STAT(lpages) },
417bc304
HB
108 { NULL }
109};
110
5fb76f9b
CO
111unsigned long segment_base(u16 selector)
112{
113 struct descriptor_table gdt;
a5f61300 114 struct desc_struct *d;
5fb76f9b
CO
115 unsigned long table_base;
116 unsigned long v;
117
118 if (selector == 0)
119 return 0;
120
121 asm("sgdt %0" : "=m"(gdt));
122 table_base = gdt.base;
123
124 if (selector & 4) { /* from ldt */
125 u16 ldt_selector;
126
127 asm("sldt %0" : "=g"(ldt_selector));
128 table_base = segment_base(ldt_selector);
129 }
a5f61300
AK
130 d = (struct desc_struct *)(table_base + (selector & ~7));
131 v = d->base0 | ((unsigned long)d->base1 << 16) |
132 ((unsigned long)d->base2 << 24);
5fb76f9b 133#ifdef CONFIG_X86_64
a5f61300
AK
134 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
135 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
5fb76f9b
CO
136#endif
137 return v;
138}
139EXPORT_SYMBOL_GPL(segment_base);
140
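segment_base() assembles a linear base address from the three (on x86-64, for system descriptors, four) scattered base fields of a GDT/LDT descriptor. A minimal userspace sketch of the same byte layout, decoding a fabricated 8-byte descriptor rather than reading the live GDT:

#include <stdint.h>
#include <stdio.h>

/* Decode base[31:0] from a legacy 8-byte segment descriptor:
 * bytes 2-4 hold base[23:0], byte 7 holds base[31:24]. */
static uint32_t desc_base(const uint8_t d[8])
{
        return (uint32_t)d[2] | ((uint32_t)d[3] << 8) |
               ((uint32_t)d[4] << 16) | ((uint32_t)d[7] << 24);
}

int main(void)
{
        /* fabricated descriptor with base 0x12345678, not a real GDT entry */
        uint8_t d[8] = { 0xff, 0xff, 0x78, 0x56, 0x34, 0x9b, 0xcf, 0x12 };

        printf("base = 0x%08x\n", desc_base(d));        /* prints 0x12345678 */
        return 0;
}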
6866b83e
CO
141u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
142{
143 if (irqchip_in_kernel(vcpu->kvm))
ad312c7c 144 return vcpu->arch.apic_base;
6866b83e 145 else
ad312c7c 146 return vcpu->arch.apic_base;
6866b83e
CO
147}
148EXPORT_SYMBOL_GPL(kvm_get_apic_base);
149
150void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
151{
152 /* TODO: reserve bits check */
153 if (irqchip_in_kernel(vcpu->kvm))
154 kvm_lapic_set_base(vcpu, data);
155 else
ad312c7c 156 vcpu->arch.apic_base = data;
6866b83e
CO
157}
158EXPORT_SYMBOL_GPL(kvm_set_apic_base);
159
298101da
AK
160void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
161{
ad312c7c
ZX
162 WARN_ON(vcpu->arch.exception.pending);
163 vcpu->arch.exception.pending = true;
164 vcpu->arch.exception.has_error_code = false;
165 vcpu->arch.exception.nr = nr;
298101da
AK
166}
167EXPORT_SYMBOL_GPL(kvm_queue_exception);
168
c3c91fee
AK
169void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
170 u32 error_code)
171{
172 ++vcpu->stat.pf_guest;
71c4dfaf
JR
173 if (vcpu->arch.exception.pending) {
174 if (vcpu->arch.exception.nr == PF_VECTOR) {
175 printk(KERN_DEBUG "kvm: inject_page_fault:"
176 " double fault 0x%lx\n", addr);
177 vcpu->arch.exception.nr = DF_VECTOR;
178 vcpu->arch.exception.error_code = 0;
179 } else if (vcpu->arch.exception.nr == DF_VECTOR) {
180 /* triple fault -> shutdown */
181 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
182 }
c3c91fee
AK
183 return;
184 }
ad312c7c 185 vcpu->arch.cr2 = addr;
c3c91fee
AK
186 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
187}
188
3419ffc8
SY
189void kvm_inject_nmi(struct kvm_vcpu *vcpu)
190{
191 vcpu->arch.nmi_pending = 1;
192}
193EXPORT_SYMBOL_GPL(kvm_inject_nmi);
194
298101da
AK
195void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
196{
ad312c7c
ZX
197 WARN_ON(vcpu->arch.exception.pending);
198 vcpu->arch.exception.pending = true;
199 vcpu->arch.exception.has_error_code = true;
200 vcpu->arch.exception.nr = nr;
201 vcpu->arch.exception.error_code = error_code;
298101da
AK
202}
203EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
204
205static void __queue_exception(struct kvm_vcpu *vcpu)
206{
ad312c7c
ZX
207 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
208 vcpu->arch.exception.has_error_code,
209 vcpu->arch.exception.error_code);
298101da
AK
210}
211
a03490ed
CO
212/*
 213 * Load the PAE pdptrs. Return true if they are all valid.
214 */
215int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
216{
217 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
218 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
219 int i;
220 int ret;
ad312c7c 221 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
a03490ed 222
a03490ed
CO
223 ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
224 offset * sizeof(u64), sizeof(pdpte));
225 if (ret < 0) {
226 ret = 0;
227 goto out;
228 }
229 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
230 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
231 ret = 0;
232 goto out;
233 }
234 }
235 ret = 1;
236
ad312c7c 237 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
a03490ed 238out:
a03490ed
CO
239
240 return ret;
241}
cc4b6871 242EXPORT_SYMBOL_GPL(load_pdptrs);
a03490ed 243
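load_pdptrs() accepts a PDPTE only if it is either not present or has none of the reserved bits set; the mask 0xfffffff0000001e6 covers bits 1-2, 5-8 and the address bits above the assumed 36-bit physical-address width. A standalone restatement of that check (a sketch using the same mask as the function above):

#include <stdint.h>
#include <stdio.h>

#define PDPTE_PRESENT   0x1ULL
#define PDPTE_RSVD_MASK 0xfffffff0000001e6ULL   /* as used by load_pdptrs() */

/* A PDPTE is acceptable if it is not present, or present with no reserved bits. */
static int pdpte_valid(uint64_t e)
{
        return !(e & PDPTE_PRESENT) || !(e & PDPTE_RSVD_MASK);
}

int main(void)
{
        uint64_t pdpte[4] = { 0x1000 | 1, 0, 0x2000 | 1 | (1ULL << 5), 0 };
        int i, all_valid = 1;

        for (i = 0; i < 4; i++)
                if (!pdpte_valid(pdpte[i]))
                        all_valid = 0;

        printf("all valid: %d\n", all_valid);   /* 0: entry 2 sets reserved bit 5 */
        return 0;
}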
d835dfec
AK
244static bool pdptrs_changed(struct kvm_vcpu *vcpu)
245{
ad312c7c 246 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
d835dfec
AK
247 bool changed = true;
248 int r;
249
250 if (is_long_mode(vcpu) || !is_pae(vcpu))
251 return false;
252
ad312c7c 253 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
d835dfec
AK
254 if (r < 0)
255 goto out;
ad312c7c 256 changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
d835dfec 257out:
d835dfec
AK
258
259 return changed;
260}
261
2d3ad1f4 262void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
a03490ed
CO
263{
264 if (cr0 & CR0_RESERVED_BITS) {
265 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
ad312c7c 266 cr0, vcpu->arch.cr0);
c1a5d4f9 267 kvm_inject_gp(vcpu, 0);
a03490ed
CO
268 return;
269 }
270
271 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
272 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
c1a5d4f9 273 kvm_inject_gp(vcpu, 0);
a03490ed
CO
274 return;
275 }
276
277 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
278 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
279 "and a clear PE flag\n");
c1a5d4f9 280 kvm_inject_gp(vcpu, 0);
a03490ed
CO
281 return;
282 }
283
284 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
285#ifdef CONFIG_X86_64
ad312c7c 286 if ((vcpu->arch.shadow_efer & EFER_LME)) {
a03490ed
CO
287 int cs_db, cs_l;
288
289 if (!is_pae(vcpu)) {
290 printk(KERN_DEBUG "set_cr0: #GP, start paging "
291 "in long mode while PAE is disabled\n");
c1a5d4f9 292 kvm_inject_gp(vcpu, 0);
a03490ed
CO
293 return;
294 }
295 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
296 if (cs_l) {
297 printk(KERN_DEBUG "set_cr0: #GP, start paging "
298 "in long mode while CS.L == 1\n");
c1a5d4f9 299 kvm_inject_gp(vcpu, 0);
a03490ed
CO
300 return;
301
302 }
303 } else
304#endif
ad312c7c 305 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
a03490ed
CO
306 printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
307 "reserved bits\n");
c1a5d4f9 308 kvm_inject_gp(vcpu, 0);
a03490ed
CO
309 return;
310 }
311
312 }
313
314 kvm_x86_ops->set_cr0(vcpu, cr0);
ad312c7c 315 vcpu->arch.cr0 = cr0;
a03490ed 316
a03490ed 317 kvm_mmu_reset_context(vcpu);
a03490ed
CO
318 return;
319}
2d3ad1f4 320EXPORT_SYMBOL_GPL(kvm_set_cr0);
a03490ed 321
2d3ad1f4 322void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
a03490ed 323{
2d3ad1f4 324 kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
2714d1d3
FEL
325 KVMTRACE_1D(LMSW, vcpu,
326 (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)),
327 handler);
a03490ed 328}
2d3ad1f4 329EXPORT_SYMBOL_GPL(kvm_lmsw);
a03490ed 330
2d3ad1f4 331void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
a03490ed
CO
332{
333 if (cr4 & CR4_RESERVED_BITS) {
334 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
c1a5d4f9 335 kvm_inject_gp(vcpu, 0);
a03490ed
CO
336 return;
337 }
338
339 if (is_long_mode(vcpu)) {
340 if (!(cr4 & X86_CR4_PAE)) {
341 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
342 "in long mode\n");
c1a5d4f9 343 kvm_inject_gp(vcpu, 0);
a03490ed
CO
344 return;
345 }
346 } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
ad312c7c 347 && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
a03490ed 348 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
c1a5d4f9 349 kvm_inject_gp(vcpu, 0);
a03490ed
CO
350 return;
351 }
352
353 if (cr4 & X86_CR4_VMXE) {
354 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
c1a5d4f9 355 kvm_inject_gp(vcpu, 0);
a03490ed
CO
356 return;
357 }
358 kvm_x86_ops->set_cr4(vcpu, cr4);
ad312c7c 359 vcpu->arch.cr4 = cr4;
a03490ed 360 kvm_mmu_reset_context(vcpu);
a03490ed 361}
2d3ad1f4 362EXPORT_SYMBOL_GPL(kvm_set_cr4);
a03490ed 363
2d3ad1f4 364void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
a03490ed 365{
ad312c7c 366 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
0ba73cda 367 kvm_mmu_sync_roots(vcpu);
d835dfec
AK
368 kvm_mmu_flush_tlb(vcpu);
369 return;
370 }
371
a03490ed
CO
372 if (is_long_mode(vcpu)) {
373 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
374 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
c1a5d4f9 375 kvm_inject_gp(vcpu, 0);
a03490ed
CO
376 return;
377 }
378 } else {
379 if (is_pae(vcpu)) {
380 if (cr3 & CR3_PAE_RESERVED_BITS) {
381 printk(KERN_DEBUG
382 "set_cr3: #GP, reserved bits\n");
c1a5d4f9 383 kvm_inject_gp(vcpu, 0);
a03490ed
CO
384 return;
385 }
386 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
387 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
388 "reserved bits\n");
c1a5d4f9 389 kvm_inject_gp(vcpu, 0);
a03490ed
CO
390 return;
391 }
392 }
393 /*
394 * We don't check reserved bits in nonpae mode, because
395 * this isn't enforced, and VMware depends on this.
396 */
397 }
398
a03490ed
CO
399 /*
400 * Does the new cr3 value map to physical memory? (Note, we
401 * catch an invalid cr3 even in real-mode, because it would
402 * cause trouble later on when we turn on paging anyway.)
403 *
404 * A real CPU would silently accept an invalid cr3 and would
405 * attempt to use it - with largely undefined (and often hard
406 * to debug) behavior on the guest side.
407 */
408 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
c1a5d4f9 409 kvm_inject_gp(vcpu, 0);
a03490ed 410 else {
ad312c7c
ZX
411 vcpu->arch.cr3 = cr3;
412 vcpu->arch.mmu.new_cr3(vcpu);
a03490ed 413 }
a03490ed 414}
2d3ad1f4 415EXPORT_SYMBOL_GPL(kvm_set_cr3);
a03490ed 416
2d3ad1f4 417void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
a03490ed
CO
418{
419 if (cr8 & CR8_RESERVED_BITS) {
420 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
c1a5d4f9 421 kvm_inject_gp(vcpu, 0);
a03490ed
CO
422 return;
423 }
424 if (irqchip_in_kernel(vcpu->kvm))
425 kvm_lapic_set_tpr(vcpu, cr8);
426 else
ad312c7c 427 vcpu->arch.cr8 = cr8;
a03490ed 428}
2d3ad1f4 429EXPORT_SYMBOL_GPL(kvm_set_cr8);
a03490ed 430
2d3ad1f4 431unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
a03490ed
CO
432{
433 if (irqchip_in_kernel(vcpu->kvm))
434 return kvm_lapic_get_cr8(vcpu);
435 else
ad312c7c 436 return vcpu->arch.cr8;
a03490ed 437}
2d3ad1f4 438EXPORT_SYMBOL_GPL(kvm_get_cr8);
a03490ed 439
043405e1
CO
440/*
441 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
442 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
443 *
444 * This list is modified at module load time to reflect the
445 * capabilities of the host cpu.
446 */
447static u32 msrs_to_save[] = {
448 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
449 MSR_K6_STAR,
450#ifdef CONFIG_X86_64
451 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
452#endif
18068523 453 MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
847f0ad8 454 MSR_IA32_PERF_STATUS,
043405e1
CO
455};
456
457static unsigned num_msrs_to_save;
458
459static u32 emulated_msrs[] = {
460 MSR_IA32_MISC_ENABLE,
461};
462
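From userspace, the combined contents of msrs_to_save and emulated_msrs are retrieved with the KVM_GET_MSR_INDEX_LIST ioctl on the /dev/kvm fd. As kvm_arch_dev_ioctl() below shows, the kernel writes back the required count and fails with E2BIG when the supplied buffer is too small, so a minimal sketch (error handling trimmed) can probe with an empty buffer first:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
        int kvm = open("/dev/kvm", O_RDWR);
        struct kvm_msr_list *list;
        unsigned int i;

        if (kvm < 0)
                return 1;

        list = malloc(sizeof(*list));
        list->nmsrs = 0;
        if (ioctl(kvm, KVM_GET_MSR_INDEX_LIST, list) < 0 && errno == E2BIG) {
                unsigned int n = list->nmsrs;   /* kernel wrote back the real count */

                list = realloc(list, sizeof(*list) + n * sizeof(__u32));
                list->nmsrs = n;
                if (ioctl(kvm, KVM_GET_MSR_INDEX_LIST, list) < 0)
                        return 1;
        }
        for (i = 0; i < list->nmsrs; i++)
                printf("msr 0x%x\n", list->indices[i]);
        return 0;
}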
15c4a640
CO
463static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
464{
f2b4b7dd 465 if (efer & efer_reserved_bits) {
15c4a640
CO
466 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
467 efer);
c1a5d4f9 468 kvm_inject_gp(vcpu, 0);
15c4a640
CO
469 return;
470 }
471
472 if (is_paging(vcpu)
ad312c7c 473 && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
15c4a640 474 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
c1a5d4f9 475 kvm_inject_gp(vcpu, 0);
15c4a640
CO
476 return;
477 }
478
479 kvm_x86_ops->set_efer(vcpu, efer);
480
481 efer &= ~EFER_LMA;
ad312c7c 482 efer |= vcpu->arch.shadow_efer & EFER_LMA;
15c4a640 483
ad312c7c 484 vcpu->arch.shadow_efer = efer;
15c4a640
CO
485}
486
f2b4b7dd
JR
487void kvm_enable_efer_bits(u64 mask)
488{
489 efer_reserved_bits &= ~mask;
490}
491EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
492
493
15c4a640
CO
494/*
 495 * Writes msr value into the appropriate "register".
496 * Returns 0 on success, non-0 otherwise.
497 * Assumes vcpu_load() was already called.
498 */
499int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
500{
501 return kvm_x86_ops->set_msr(vcpu, msr_index, data);
502}
503
313a3dc7
CO
504/*
505 * Adapt set_msr() to msr_io()'s calling convention
506 */
507static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
508{
509 return kvm_set_msr(vcpu, index, *data);
510}
511
18068523
GOC
512static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
513{
514 static int version;
50d0a0f9
GH
515 struct pvclock_wall_clock wc;
516 struct timespec now, sys, boot;
18068523
GOC
517
518 if (!wall_clock)
519 return;
520
521 version++;
522
18068523
GOC
523 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
524
50d0a0f9
GH
525 /*
526 * The guest calculates current wall clock time by adding
527 * system time (updated by kvm_write_guest_time below) to the
 528 * wall clock specified here. Guest system time equals host
529 * system time for us, thus we must fill in host boot time here.
530 */
531 now = current_kernel_time();
532 ktime_get_ts(&sys);
533 boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
534
535 wc.sec = boot.tv_sec;
536 wc.nsec = boot.tv_nsec;
537 wc.version = version;
18068523
GOC
538
539 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
540
541 version++;
542 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
18068523
GOC
543}
544
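The boot-time calculation above is simply wall-clock time minus monotonic time. The same arithmetic in a userspace sketch, with clock_gettime() standing in for current_kernel_time()/ktime_get_ts():

#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000LL

static int64_t timespec_to_ns(const struct timespec *ts)
{
        return (int64_t)ts->tv_sec * NSEC_PER_SEC + ts->tv_nsec;
}

int main(void)
{
        struct timespec now, sys;
        int64_t boot_ns;

        clock_gettime(CLOCK_REALTIME, &now);    /* ~ current_kernel_time() */
        clock_gettime(CLOCK_MONOTONIC, &sys);   /* ~ ktime_get_ts() */

        boot_ns = timespec_to_ns(&now) - timespec_to_ns(&sys);
        printf("boot time: %lld.%09lld\n",
               (long long)(boot_ns / NSEC_PER_SEC),
               (long long)(boot_ns % NSEC_PER_SEC));
        return 0;
}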
50d0a0f9
GH
545static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
546{
547 uint32_t quotient, remainder;
548
 549 /* Don't try to replace with do_div(); this one calculates
550 * "(dividend << 32) / divisor" */
551 __asm__ ( "divl %4"
552 : "=a" (quotient), "=d" (remainder)
553 : "0" (0), "1" (dividend), "r" (divisor) );
554 return quotient;
555}
556
557static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
558{
559 uint64_t nsecs = 1000000000LL;
560 int32_t shift = 0;
561 uint64_t tps64;
562 uint32_t tps32;
563
564 tps64 = tsc_khz * 1000LL;
565 while (tps64 > nsecs*2) {
566 tps64 >>= 1;
567 shift--;
568 }
569
570 tps32 = (uint32_t)tps64;
571 while (tps32 <= (uint32_t)nsecs) {
572 tps32 <<= 1;
573 shift++;
574 }
575
576 hv_clock->tsc_shift = shift;
577 hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
578
579 pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
80a914dc 580 __func__, tsc_khz, hv_clock->tsc_shift,
50d0a0f9
GH
581 hv_clock->tsc_to_system_mul);
582}
583
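The shift/multiplier pair computed above lets the guest turn a TSC delta into nanoseconds with one shift and one multiply: ns = ((delta << shift) * mul) >> 32, shifting right when shift is negative. A userspace sketch of the same computation, with div_frac() re-expressed as a plain 64-bit division:

#include <stdint.h>
#include <stdio.h>

/* (dividend << 32) / divisor, i.e. what div_frac() computes with divl */
static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
{
        return (uint32_t)(((uint64_t)dividend << 32) / divisor);
}

static void set_time_scale(uint32_t tsc_khz, int8_t *shift, uint32_t *mul)
{
        uint64_t nsecs = 1000000000ULL;
        uint64_t tps64 = (uint64_t)tsc_khz * 1000;
        uint32_t tps32;
        int8_t s = 0;

        while (tps64 > nsecs * 2) {
                tps64 >>= 1;
                s--;
        }
        tps32 = (uint32_t)tps64;
        while (tps32 <= (uint32_t)nsecs) {
                tps32 <<= 1;
                s++;
        }
        *shift = s;
        *mul = div_frac(nsecs, tps32);
}

int main(void)
{
        int8_t shift;
        uint32_t mul;
        uint64_t delta = 2400000000ULL;         /* one second of a 2.4 GHz TSC */
        uint64_t d, ns;

        set_time_scale(2400000, &shift, &mul);
        d = shift >= 0 ? delta << shift : delta >> -shift;
        ns = (uint64_t)(((unsigned __int128)d * mul) >> 32);

        printf("shift=%d mul=%u -> %llu ns\n",
               shift, mul, (unsigned long long)ns);     /* ~1000000000 ns */
        return 0;
}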
18068523
GOC
584static void kvm_write_guest_time(struct kvm_vcpu *v)
585{
586 struct timespec ts;
587 unsigned long flags;
588 struct kvm_vcpu_arch *vcpu = &v->arch;
589 void *shared_kaddr;
590
591 if ((!vcpu->time_page))
592 return;
593
50d0a0f9
GH
594 if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) {
595 kvm_set_time_scale(tsc_khz, &vcpu->hv_clock);
596 vcpu->hv_clock_tsc_khz = tsc_khz;
597 }
598
18068523
GOC
599 /* Keep irq disabled to prevent changes to the clock */
600 local_irq_save(flags);
601 kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
602 &vcpu->hv_clock.tsc_timestamp);
603 ktime_get_ts(&ts);
604 local_irq_restore(flags);
605
606 /* With all the info we got, fill in the values */
607
608 vcpu->hv_clock.system_time = ts.tv_nsec +
609 (NSEC_PER_SEC * (u64)ts.tv_sec);
610 /*
611 * The interface expects us to write an even number signaling that the
612 * update is finished. Since the guest won't see the intermediate
50d0a0f9 613 * state, we just increase by 2 at the end.
18068523 614 */
50d0a0f9 615 vcpu->hv_clock.version += 2;
18068523
GOC
616
617 shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
618
619 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
50d0a0f9 620 sizeof(vcpu->hv_clock));
18068523
GOC
621
622 kunmap_atomic(shared_kaddr, KM_USER0);
623
624 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
625}
626
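The even/odd version field gives the guest a seqlock-style way to read the time page: retry if the version is odd (an update is in progress) or changed while reading. A sketch of the matching reader loop, assuming a pvclock-like structure shared between writer and reader:

#include <stdint.h>

struct pv_time {                                /* shape of the fields used here */
        volatile uint32_t version;
        volatile uint64_t tsc_timestamp;
        volatile uint64_t system_time;
};

/* Returns a consistent snapshot of system_time/tsc_timestamp. */
static void read_time(const struct pv_time *t, uint64_t *sys, uint64_t *tsc)
{
        uint32_t v;

        do {
                v = t->version;
                __sync_synchronize();           /* read version before the payload */
                *sys = t->system_time;
                *tsc = t->tsc_timestamp;
                __sync_synchronize();           /* read payload before rechecking */
        } while ((v & 1) || v != t->version);
}

int main(void)
{
        struct pv_time t = { .version = 2, .tsc_timestamp = 123, .system_time = 456 };
        uint64_t sys, tsc;

        read_time(&t, &sys, &tsc);
        return (sys == 456 && tsc == 123) ? 0 : 1;
}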
9ba075a6
AK
627static bool msr_mtrr_valid(unsigned msr)
628{
629 switch (msr) {
630 case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
631 case MSR_MTRRfix64K_00000:
632 case MSR_MTRRfix16K_80000:
633 case MSR_MTRRfix16K_A0000:
634 case MSR_MTRRfix4K_C0000:
635 case MSR_MTRRfix4K_C8000:
636 case MSR_MTRRfix4K_D0000:
637 case MSR_MTRRfix4K_D8000:
638 case MSR_MTRRfix4K_E0000:
639 case MSR_MTRRfix4K_E8000:
640 case MSR_MTRRfix4K_F0000:
641 case MSR_MTRRfix4K_F8000:
642 case MSR_MTRRdefType:
643 case MSR_IA32_CR_PAT:
644 return true;
645 case 0x2f8:
646 return true;
647 }
648 return false;
649}
650
651static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
652{
653 if (!msr_mtrr_valid(msr))
654 return 1;
655
656 vcpu->arch.mtrr[msr - 0x200] = data;
657 return 0;
658}
15c4a640
CO
659
660int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
661{
662 switch (msr) {
15c4a640
CO
663 case MSR_EFER:
664 set_efer(vcpu, data);
665 break;
15c4a640
CO
666 case MSR_IA32_MC0_STATUS:
667 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
b8688d51 668 __func__, data);
15c4a640
CO
669 break;
670 case MSR_IA32_MCG_STATUS:
671 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
b8688d51 672 __func__, data);
15c4a640 673 break;
c7ac679c
JR
674 case MSR_IA32_MCG_CTL:
675 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
b8688d51 676 __func__, data);
c7ac679c 677 break;
b5e2fec0
AG
678 case MSR_IA32_DEBUGCTLMSR:
679 if (!data) {
680 /* We support the non-activated case already */
681 break;
682 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
683 /* Values other than LBR and BTF are vendor-specific,
684 thus reserved and should throw a #GP */
685 return 1;
686 }
687 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
688 __func__, data);
689 break;
15c4a640
CO
690 case MSR_IA32_UCODE_REV:
691 case MSR_IA32_UCODE_WRITE:
15c4a640 692 break;
9ba075a6
AK
693 case 0x200 ... 0x2ff:
694 return set_msr_mtrr(vcpu, msr, data);
15c4a640
CO
695 case MSR_IA32_APICBASE:
696 kvm_set_apic_base(vcpu, data);
697 break;
698 case MSR_IA32_MISC_ENABLE:
ad312c7c 699 vcpu->arch.ia32_misc_enable_msr = data;
15c4a640 700 break;
18068523
GOC
701 case MSR_KVM_WALL_CLOCK:
702 vcpu->kvm->arch.wall_clock = data;
703 kvm_write_wall_clock(vcpu->kvm, data);
704 break;
705 case MSR_KVM_SYSTEM_TIME: {
706 if (vcpu->arch.time_page) {
707 kvm_release_page_dirty(vcpu->arch.time_page);
708 vcpu->arch.time_page = NULL;
709 }
710
711 vcpu->arch.time = data;
712
713 /* we verify if the enable bit is set... */
714 if (!(data & 1))
715 break;
716
717 /* ...but clean it before doing the actual write */
718 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
719
18068523
GOC
720 vcpu->arch.time_page =
721 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
18068523
GOC
722
723 if (is_error_page(vcpu->arch.time_page)) {
724 kvm_release_page_clean(vcpu->arch.time_page);
725 vcpu->arch.time_page = NULL;
726 }
727
728 kvm_write_guest_time(vcpu);
729 break;
730 }
15c4a640 731 default:
565f1fbd 732 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
15c4a640
CO
733 return 1;
734 }
735 return 0;
736}
737EXPORT_SYMBOL_GPL(kvm_set_msr_common);
738
739
740/*
741 * Reads an msr value (of 'msr_index') into 'pdata'.
742 * Returns 0 on success, non-0 otherwise.
743 * Assumes vcpu_load() was already called.
744 */
745int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
746{
747 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
748}
749
9ba075a6
AK
750static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
751{
752 if (!msr_mtrr_valid(msr))
753 return 1;
754
755 *pdata = vcpu->arch.mtrr[msr - 0x200];
756 return 0;
757}
758
15c4a640
CO
759int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
760{
761 u64 data;
762
763 switch (msr) {
764 case 0xc0010010: /* SYSCFG */
765 case 0xc0010015: /* HWCR */
766 case MSR_IA32_PLATFORM_ID:
767 case MSR_IA32_P5_MC_ADDR:
768 case MSR_IA32_P5_MC_TYPE:
769 case MSR_IA32_MC0_CTL:
770 case MSR_IA32_MCG_STATUS:
771 case MSR_IA32_MCG_CAP:
c7ac679c 772 case MSR_IA32_MCG_CTL:
15c4a640
CO
773 case MSR_IA32_MC0_MISC:
774 case MSR_IA32_MC0_MISC+4:
775 case MSR_IA32_MC0_MISC+8:
776 case MSR_IA32_MC0_MISC+12:
777 case MSR_IA32_MC0_MISC+16:
a89c1ad2 778 case MSR_IA32_MC0_MISC+20:
15c4a640 779 case MSR_IA32_UCODE_REV:
15c4a640 780 case MSR_IA32_EBL_CR_POWERON:
b5e2fec0
AG
781 case MSR_IA32_DEBUGCTLMSR:
782 case MSR_IA32_LASTBRANCHFROMIP:
783 case MSR_IA32_LASTBRANCHTOIP:
784 case MSR_IA32_LASTINTFROMIP:
785 case MSR_IA32_LASTINTTOIP:
15c4a640
CO
786 data = 0;
787 break;
9ba075a6
AK
788 case MSR_MTRRcap:
789 data = 0x500 | KVM_NR_VAR_MTRR;
790 break;
791 case 0x200 ... 0x2ff:
792 return get_msr_mtrr(vcpu, msr, pdata);
15c4a640
CO
793 case 0xcd: /* fsb frequency */
794 data = 3;
795 break;
796 case MSR_IA32_APICBASE:
797 data = kvm_get_apic_base(vcpu);
798 break;
799 case MSR_IA32_MISC_ENABLE:
ad312c7c 800 data = vcpu->arch.ia32_misc_enable_msr;
15c4a640 801 break;
847f0ad8
AG
802 case MSR_IA32_PERF_STATUS:
803 /* TSC increment by tick */
804 data = 1000ULL;
805 /* CPU multiplier */
806 data |= (((uint64_t)4ULL) << 40);
807 break;
15c4a640 808 case MSR_EFER:
ad312c7c 809 data = vcpu->arch.shadow_efer;
15c4a640 810 break;
18068523
GOC
811 case MSR_KVM_WALL_CLOCK:
812 data = vcpu->kvm->arch.wall_clock;
813 break;
814 case MSR_KVM_SYSTEM_TIME:
815 data = vcpu->arch.time;
816 break;
15c4a640
CO
817 default:
818 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
819 return 1;
820 }
821 *pdata = data;
822 return 0;
823}
824EXPORT_SYMBOL_GPL(kvm_get_msr_common);
825
313a3dc7
CO
826/*
827 * Read or write a bunch of msrs. All parameters are kernel addresses.
828 *
829 * @return number of msrs set successfully.
830 */
831static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
832 struct kvm_msr_entry *entries,
833 int (*do_msr)(struct kvm_vcpu *vcpu,
834 unsigned index, u64 *data))
835{
836 int i;
837
838 vcpu_load(vcpu);
839
3200f405 840 down_read(&vcpu->kvm->slots_lock);
313a3dc7
CO
841 for (i = 0; i < msrs->nmsrs; ++i)
842 if (do_msr(vcpu, entries[i].index, &entries[i].data))
843 break;
3200f405 844 up_read(&vcpu->kvm->slots_lock);
313a3dc7
CO
845
846 vcpu_put(vcpu);
847
848 return i;
849}
850
851/*
852 * Read or write a bunch of msrs. Parameters are user addresses.
853 *
854 * @return number of msrs set successfully.
855 */
856static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
857 int (*do_msr)(struct kvm_vcpu *vcpu,
858 unsigned index, u64 *data),
859 int writeback)
860{
861 struct kvm_msrs msrs;
862 struct kvm_msr_entry *entries;
863 int r, n;
864 unsigned size;
865
866 r = -EFAULT;
867 if (copy_from_user(&msrs, user_msrs, sizeof msrs))
868 goto out;
869
870 r = -E2BIG;
871 if (msrs.nmsrs >= MAX_IO_MSRS)
872 goto out;
873
874 r = -ENOMEM;
875 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
876 entries = vmalloc(size);
877 if (!entries)
878 goto out;
879
880 r = -EFAULT;
881 if (copy_from_user(entries, user_msrs->entries, size))
882 goto out_free;
883
884 r = n = __msr_io(vcpu, &msrs, entries, do_msr);
885 if (r < 0)
886 goto out_free;
887
888 r = -EFAULT;
889 if (writeback && copy_to_user(user_msrs->entries, entries, size))
890 goto out_free;
891
892 r = n;
893
894out_free:
895 vfree(entries);
896out:
897 return r;
898}
899
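msr_io() backs the KVM_GET_MSRS/KVM_SET_MSRS vcpu ioctls and returns the number of entries processed rather than 0 or -errno. A userspace sketch reading one MSR, assuming vcpu_fd was obtained through the usual KVM_CREATE_VM/KVM_CREATE_VCPU sequence:

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Read EFER (0xc0000080) from a vcpu; returns 0 on success. */
static int read_efer(int vcpu_fd, __u64 *efer)
{
        struct kvm_msrs *msrs;
        int r;

        msrs = calloc(1, sizeof(*msrs) + sizeof(struct kvm_msr_entry));
        msrs->nmsrs = 1;
        msrs->entries[0].index = 0xc0000080;    /* MSR_EFER */

        r = ioctl(vcpu_fd, KVM_GET_MSRS, msrs); /* returns #msrs read, here 1 */
        if (r == 1)
                *efer = msrs->entries[0].data;
        free(msrs);
        return r == 1 ? 0 : -1;
}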
018d00d2
ZX
900int kvm_dev_ioctl_check_extension(long ext)
901{
902 int r;
903
904 switch (ext) {
905 case KVM_CAP_IRQCHIP:
906 case KVM_CAP_HLT:
907 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
908 case KVM_CAP_USER_MEMORY:
909 case KVM_CAP_SET_TSS_ADDR:
07716717 910 case KVM_CAP_EXT_CPUID:
18068523 911 case KVM_CAP_CLOCKSOURCE:
7837699f 912 case KVM_CAP_PIT:
a28e4f5a 913 case KVM_CAP_NOP_IO_DELAY:
62d9f0db 914 case KVM_CAP_MP_STATE:
ed848624 915 case KVM_CAP_SYNC_MMU:
018d00d2
ZX
916 r = 1;
917 break;
542472b5
LV
918 case KVM_CAP_COALESCED_MMIO:
919 r = KVM_COALESCED_MMIO_PAGE_OFFSET;
920 break;
774ead3a
AK
921 case KVM_CAP_VAPIC:
922 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
923 break;
f725230a
AK
924 case KVM_CAP_NR_VCPUS:
925 r = KVM_MAX_VCPUS;
926 break;
a988b910
AK
927 case KVM_CAP_NR_MEMSLOTS:
928 r = KVM_MEMORY_SLOTS;
929 break;
2f333bcb
MT
930 case KVM_CAP_PV_MMU:
931 r = !tdp_enabled;
932 break;
62c476c7
BAY
933 case KVM_CAP_IOMMU:
934 r = intel_iommu_found();
935 break;
018d00d2
ZX
936 default:
937 r = 0;
938 break;
939 }
940 return r;
941
942}
943
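Each KVM_CAP_* value handled above is what userspace probes with the KVM_CHECK_EXTENSION ioctl on /dev/kvm; a return value greater than zero means the capability is available, and for some caps (KVM_CAP_COALESCED_MMIO, KVM_CAP_NR_VCPUS) the value itself carries information. A minimal probe:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
        int kvm = open("/dev/kvm", O_RDWR);

        if (kvm < 0)
                return 1;
        printf("irqchip:   %d\n", ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_IRQCHIP));
        printf("pit:       %d\n", ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_PIT));
        printf("mmio page: %d\n",
               ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_COALESCED_MMIO));
        return 0;
}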
043405e1
CO
944long kvm_arch_dev_ioctl(struct file *filp,
945 unsigned int ioctl, unsigned long arg)
946{
947 void __user *argp = (void __user *)arg;
948 long r;
949
950 switch (ioctl) {
951 case KVM_GET_MSR_INDEX_LIST: {
952 struct kvm_msr_list __user *user_msr_list = argp;
953 struct kvm_msr_list msr_list;
954 unsigned n;
955
956 r = -EFAULT;
957 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
958 goto out;
959 n = msr_list.nmsrs;
960 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
961 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
962 goto out;
963 r = -E2BIG;
964 if (n < num_msrs_to_save)
965 goto out;
966 r = -EFAULT;
967 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
968 num_msrs_to_save * sizeof(u32)))
969 goto out;
970 if (copy_to_user(user_msr_list->indices
971 + num_msrs_to_save * sizeof(u32),
972 &emulated_msrs,
973 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
974 goto out;
975 r = 0;
976 break;
977 }
674eea0f
AK
978 case KVM_GET_SUPPORTED_CPUID: {
979 struct kvm_cpuid2 __user *cpuid_arg = argp;
980 struct kvm_cpuid2 cpuid;
981
982 r = -EFAULT;
983 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
984 goto out;
985 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
986 cpuid_arg->entries);
987 if (r)
988 goto out;
989
990 r = -EFAULT;
991 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
992 goto out;
993 r = 0;
994 break;
995 }
043405e1
CO
996 default:
997 r = -EINVAL;
998 }
999out:
1000 return r;
1001}
1002
313a3dc7
CO
1003void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1004{
1005 kvm_x86_ops->vcpu_load(vcpu, cpu);
18068523 1006 kvm_write_guest_time(vcpu);
313a3dc7
CO
1007}
1008
1009void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
1010{
1011 kvm_x86_ops->vcpu_put(vcpu);
9327fd11 1012 kvm_put_guest_fpu(vcpu);
313a3dc7
CO
1013}
1014
07716717 1015static int is_efer_nx(void)
313a3dc7
CO
1016{
1017 u64 efer;
313a3dc7
CO
1018
1019 rdmsrl(MSR_EFER, efer);
07716717
DK
1020 return efer & EFER_NX;
1021}
1022
1023static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
1024{
1025 int i;
1026 struct kvm_cpuid_entry2 *e, *entry;
1027
313a3dc7 1028 entry = NULL;
ad312c7c
ZX
1029 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
1030 e = &vcpu->arch.cpuid_entries[i];
313a3dc7
CO
1031 if (e->function == 0x80000001) {
1032 entry = e;
1033 break;
1034 }
1035 }
07716717 1036 if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
313a3dc7
CO
1037 entry->edx &= ~(1 << 20);
1038 printk(KERN_INFO "kvm: guest NX capability removed\n");
1039 }
1040}
1041
07716717 1042/* when an old userspace process fills a new kernel module */
313a3dc7
CO
1043static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1044 struct kvm_cpuid *cpuid,
1045 struct kvm_cpuid_entry __user *entries)
07716717
DK
1046{
1047 int r, i;
1048 struct kvm_cpuid_entry *cpuid_entries;
1049
1050 r = -E2BIG;
1051 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1052 goto out;
1053 r = -ENOMEM;
1054 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
1055 if (!cpuid_entries)
1056 goto out;
1057 r = -EFAULT;
1058 if (copy_from_user(cpuid_entries, entries,
1059 cpuid->nent * sizeof(struct kvm_cpuid_entry)))
1060 goto out_free;
1061 for (i = 0; i < cpuid->nent; i++) {
ad312c7c
ZX
1062 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
1063 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
1064 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
1065 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
1066 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
1067 vcpu->arch.cpuid_entries[i].index = 0;
1068 vcpu->arch.cpuid_entries[i].flags = 0;
1069 vcpu->arch.cpuid_entries[i].padding[0] = 0;
1070 vcpu->arch.cpuid_entries[i].padding[1] = 0;
1071 vcpu->arch.cpuid_entries[i].padding[2] = 0;
1072 }
1073 vcpu->arch.cpuid_nent = cpuid->nent;
07716717
DK
1074 cpuid_fix_nx_cap(vcpu);
1075 r = 0;
1076
1077out_free:
1078 vfree(cpuid_entries);
1079out:
1080 return r;
1081}
1082
1083static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
1084 struct kvm_cpuid2 *cpuid,
1085 struct kvm_cpuid_entry2 __user *entries)
313a3dc7
CO
1086{
1087 int r;
1088
1089 r = -E2BIG;
1090 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1091 goto out;
1092 r = -EFAULT;
ad312c7c 1093 if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
07716717 1094 cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
313a3dc7 1095 goto out;
ad312c7c 1096 vcpu->arch.cpuid_nent = cpuid->nent;
313a3dc7
CO
1097 return 0;
1098
1099out:
1100 return r;
1101}
1102
07716717
DK
1103static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1104 struct kvm_cpuid2 *cpuid,
1105 struct kvm_cpuid_entry2 __user *entries)
1106{
1107 int r;
1108
1109 r = -E2BIG;
ad312c7c 1110 if (cpuid->nent < vcpu->arch.cpuid_nent)
07716717
DK
1111 goto out;
1112 r = -EFAULT;
ad312c7c
ZX
1113 if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
1114 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
07716717
DK
1115 goto out;
1116 return 0;
1117
1118out:
ad312c7c 1119 cpuid->nent = vcpu->arch.cpuid_nent;
07716717
DK
1120 return r;
1121}
1122
1123static inline u32 bit(int bitno)
1124{
1125 return 1 << (bitno & 31);
1126}
1127
1128static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1129 u32 index)
1130{
1131 entry->function = function;
1132 entry->index = index;
1133 cpuid_count(entry->function, entry->index,
1134 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
1135 entry->flags = 0;
1136}
1137
1138static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1139 u32 index, int *nent, int maxnent)
1140{
1141 const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
1142 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
1143 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
1144 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
1145 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
1146 bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
1147 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
1148 bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
1149 bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
1150 bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
1151 const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
1152 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
1153 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
1154 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
1155 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
1156 bit(X86_FEATURE_PGE) |
1157 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
1158 bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
1159 bit(X86_FEATURE_SYSCALL) |
1160 (bit(X86_FEATURE_NX) && is_efer_nx()) |
1161#ifdef CONFIG_X86_64
1162 bit(X86_FEATURE_LM) |
1163#endif
1164 bit(X86_FEATURE_MMXEXT) |
1165 bit(X86_FEATURE_3DNOWEXT) |
1166 bit(X86_FEATURE_3DNOW);
1167 const u32 kvm_supported_word3_x86_features =
1168 bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
1169 const u32 kvm_supported_word6_x86_features =
1170 bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY);
1171
1172 /* all func 2 cpuid_count() should be called on the same cpu */
1173 get_cpu();
1174 do_cpuid_1_ent(entry, function, index);
1175 ++*nent;
1176
1177 switch (function) {
1178 case 0:
1179 entry->eax = min(entry->eax, (u32)0xb);
1180 break;
1181 case 1:
1182 entry->edx &= kvm_supported_word0_x86_features;
1183 entry->ecx &= kvm_supported_word3_x86_features;
1184 break;
1185 /* function 2 entries are STATEFUL. That is, repeated cpuid commands
1186 * may return different values. This forces us to get_cpu() before
1187 * issuing the first command, and also to emulate this annoying behavior
1188 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
1189 case 2: {
1190 int t, times = entry->eax & 0xff;
1191
1192 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1193 for (t = 1; t < times && *nent < maxnent; ++t) {
1194 do_cpuid_1_ent(&entry[t], function, 0);
1195 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1196 ++*nent;
1197 }
1198 break;
1199 }
 1200 /* functions 4 and 0xb have an additional index. */
1201 case 4: {
14af3f3c 1202 int i, cache_type;
07716717
DK
1203
1204 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1205 /* read more entries until cache_type is zero */
14af3f3c
HH
1206 for (i = 1; *nent < maxnent; ++i) {
1207 cache_type = entry[i - 1].eax & 0x1f;
07716717
DK
1208 if (!cache_type)
1209 break;
14af3f3c
HH
1210 do_cpuid_1_ent(&entry[i], function, i);
1211 entry[i].flags |=
07716717
DK
1212 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1213 ++*nent;
1214 }
1215 break;
1216 }
1217 case 0xb: {
14af3f3c 1218 int i, level_type;
07716717
DK
1219
1220 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1221 /* read more entries until level_type is zero */
14af3f3c
HH
1222 for (i = 1; *nent < maxnent; ++i) {
1223 level_type = entry[i - 1].ecx & 0xff;
07716717
DK
1224 if (!level_type)
1225 break;
14af3f3c
HH
1226 do_cpuid_1_ent(&entry[i], function, i);
1227 entry[i].flags |=
07716717
DK
1228 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1229 ++*nent;
1230 }
1231 break;
1232 }
1233 case 0x80000000:
1234 entry->eax = min(entry->eax, 0x8000001a);
1235 break;
1236 case 0x80000001:
1237 entry->edx &= kvm_supported_word1_x86_features;
1238 entry->ecx &= kvm_supported_word6_x86_features;
1239 break;
1240 }
1241 put_cpu();
1242}
1243
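The subleaf loops above (stop when leaf 4's cache_type or leaf 0xb's level_type reads back as zero) mirror how the bare CPUID instruction is enumerated. A host-side sketch of the leaf-4 walk using the __cpuid_count() helper from <cpuid.h> (Intel semantics; AMD reports cache topology through extended leaves instead):

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx, i;

        for (i = 0; ; i++) {
                __cpuid_count(4, i, eax, ebx, ecx, edx);
                if (!(eax & 0x1f))              /* cache_type == 0: no more caches */
                        break;
                printf("index %u: type %u, level %u\n",
                       i, eax & 0x1f, (eax >> 5) & 0x7);
        }
        return 0;
}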
674eea0f 1244static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
07716717
DK
1245 struct kvm_cpuid_entry2 __user *entries)
1246{
1247 struct kvm_cpuid_entry2 *cpuid_entries;
1248 int limit, nent = 0, r = -E2BIG;
1249 u32 func;
1250
1251 if (cpuid->nent < 1)
1252 goto out;
1253 r = -ENOMEM;
1254 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
1255 if (!cpuid_entries)
1256 goto out;
1257
1258 do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
1259 limit = cpuid_entries[0].eax;
1260 for (func = 1; func <= limit && nent < cpuid->nent; ++func)
1261 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1262 &nent, cpuid->nent);
1263 r = -E2BIG;
1264 if (nent >= cpuid->nent)
1265 goto out_free;
1266
1267 do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
1268 limit = cpuid_entries[nent - 1].eax;
1269 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1270 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1271 &nent, cpuid->nent);
1272 r = -EFAULT;
1273 if (copy_to_user(entries, cpuid_entries,
1274 nent * sizeof(struct kvm_cpuid_entry2)))
1275 goto out_free;
1276 cpuid->nent = nent;
1277 r = 0;
1278
1279out_free:
1280 vfree(cpuid_entries);
1281out:
1282 return r;
1283}
1284
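Userspace reaches this through the KVM_GET_SUPPORTED_CPUID ioctl on the /dev/kvm fd, growing the buffer until the kernel stops returning E2BIG; on success nent is rewritten to the number of entries actually filled in. A minimal sketch:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
        int kvm = open("/dev/kvm", O_RDWR);
        struct kvm_cpuid2 *c = NULL;
        unsigned int i, n = 8;

        if (kvm < 0)
                return 1;
        for (;;) {
                c = realloc(c, sizeof(*c) + n * sizeof(struct kvm_cpuid_entry2));
                c->nent = n;
                if (ioctl(kvm, KVM_GET_SUPPORTED_CPUID, c) == 0)
                        break;
                if (errno != E2BIG)
                        return 1;
                n *= 2;                         /* buffer too small, try again */
        }
        for (i = 0; i < c->nent; i++)
                printf("func 0x%08x index %u\n",
                       c->entries[i].function, c->entries[i].index);
        return 0;
}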
313a3dc7
CO
1285static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
1286 struct kvm_lapic_state *s)
1287{
1288 vcpu_load(vcpu);
ad312c7c 1289 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
313a3dc7
CO
1290 vcpu_put(vcpu);
1291
1292 return 0;
1293}
1294
1295static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
1296 struct kvm_lapic_state *s)
1297{
1298 vcpu_load(vcpu);
ad312c7c 1299 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
313a3dc7
CO
1300 kvm_apic_post_state_restore(vcpu);
1301 vcpu_put(vcpu);
1302
1303 return 0;
1304}
1305
f77bc6a4
ZX
1306static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1307 struct kvm_interrupt *irq)
1308{
1309 if (irq->irq < 0 || irq->irq >= 256)
1310 return -EINVAL;
1311 if (irqchip_in_kernel(vcpu->kvm))
1312 return -ENXIO;
1313 vcpu_load(vcpu);
1314
ad312c7c
ZX
1315 set_bit(irq->irq, vcpu->arch.irq_pending);
1316 set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
f77bc6a4
ZX
1317
1318 vcpu_put(vcpu);
1319
1320 return 0;
1321}
1322
c4abb7c9
JK
1323static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
1324{
1325 vcpu_load(vcpu);
1326 kvm_inject_nmi(vcpu);
1327 vcpu_put(vcpu);
1328
1329 return 0;
1330}
1331
b209749f
AK
1332static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
1333 struct kvm_tpr_access_ctl *tac)
1334{
1335 if (tac->flags)
1336 return -EINVAL;
1337 vcpu->arch.tpr_access_reporting = !!tac->enabled;
1338 return 0;
1339}
1340
313a3dc7
CO
1341long kvm_arch_vcpu_ioctl(struct file *filp,
1342 unsigned int ioctl, unsigned long arg)
1343{
1344 struct kvm_vcpu *vcpu = filp->private_data;
1345 void __user *argp = (void __user *)arg;
1346 int r;
b772ff36 1347 struct kvm_lapic_state *lapic = NULL;
313a3dc7
CO
1348
1349 switch (ioctl) {
1350 case KVM_GET_LAPIC: {
b772ff36 1351 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
313a3dc7 1352
b772ff36
DH
1353 r = -ENOMEM;
1354 if (!lapic)
1355 goto out;
1356 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
313a3dc7
CO
1357 if (r)
1358 goto out;
1359 r = -EFAULT;
b772ff36 1360 if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
313a3dc7
CO
1361 goto out;
1362 r = 0;
1363 break;
1364 }
1365 case KVM_SET_LAPIC: {
b772ff36
DH
1366 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1367 r = -ENOMEM;
1368 if (!lapic)
1369 goto out;
313a3dc7 1370 r = -EFAULT;
b772ff36 1371 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
313a3dc7 1372 goto out;
b772ff36 1373 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
313a3dc7
CO
1374 if (r)
1375 goto out;
1376 r = 0;
1377 break;
1378 }
f77bc6a4
ZX
1379 case KVM_INTERRUPT: {
1380 struct kvm_interrupt irq;
1381
1382 r = -EFAULT;
1383 if (copy_from_user(&irq, argp, sizeof irq))
1384 goto out;
1385 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
1386 if (r)
1387 goto out;
1388 r = 0;
1389 break;
1390 }
c4abb7c9
JK
1391 case KVM_NMI: {
1392 r = kvm_vcpu_ioctl_nmi(vcpu);
1393 if (r)
1394 goto out;
1395 r = 0;
1396 break;
1397 }
313a3dc7
CO
1398 case KVM_SET_CPUID: {
1399 struct kvm_cpuid __user *cpuid_arg = argp;
1400 struct kvm_cpuid cpuid;
1401
1402 r = -EFAULT;
1403 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1404 goto out;
1405 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
1406 if (r)
1407 goto out;
1408 break;
1409 }
07716717
DK
1410 case KVM_SET_CPUID2: {
1411 struct kvm_cpuid2 __user *cpuid_arg = argp;
1412 struct kvm_cpuid2 cpuid;
1413
1414 r = -EFAULT;
1415 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1416 goto out;
1417 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
1418 cpuid_arg->entries);
1419 if (r)
1420 goto out;
1421 break;
1422 }
1423 case KVM_GET_CPUID2: {
1424 struct kvm_cpuid2 __user *cpuid_arg = argp;
1425 struct kvm_cpuid2 cpuid;
1426
1427 r = -EFAULT;
1428 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1429 goto out;
1430 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
1431 cpuid_arg->entries);
1432 if (r)
1433 goto out;
1434 r = -EFAULT;
1435 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1436 goto out;
1437 r = 0;
1438 break;
1439 }
313a3dc7
CO
1440 case KVM_GET_MSRS:
1441 r = msr_io(vcpu, argp, kvm_get_msr, 1);
1442 break;
1443 case KVM_SET_MSRS:
1444 r = msr_io(vcpu, argp, do_set_msr, 0);
1445 break;
b209749f
AK
1446 case KVM_TPR_ACCESS_REPORTING: {
1447 struct kvm_tpr_access_ctl tac;
1448
1449 r = -EFAULT;
1450 if (copy_from_user(&tac, argp, sizeof tac))
1451 goto out;
1452 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
1453 if (r)
1454 goto out;
1455 r = -EFAULT;
1456 if (copy_to_user(argp, &tac, sizeof tac))
1457 goto out;
1458 r = 0;
1459 break;
1460 };
b93463aa
AK
1461 case KVM_SET_VAPIC_ADDR: {
1462 struct kvm_vapic_addr va;
1463
1464 r = -EINVAL;
1465 if (!irqchip_in_kernel(vcpu->kvm))
1466 goto out;
1467 r = -EFAULT;
1468 if (copy_from_user(&va, argp, sizeof va))
1469 goto out;
1470 r = 0;
1471 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
1472 break;
1473 }
313a3dc7
CO
1474 default:
1475 r = -EINVAL;
1476 }
1477out:
b772ff36
DH
1478 if (lapic)
1479 kfree(lapic);
313a3dc7
CO
1480 return r;
1481}
1482
1fe779f8
CO
1483static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
1484{
1485 int ret;
1486
1487 if (addr > (unsigned int)(-3 * PAGE_SIZE))
1488 return -1;
1489 ret = kvm_x86_ops->set_tss_addr(kvm, addr);
1490 return ret;
1491}
1492
1493static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
1494 u32 kvm_nr_mmu_pages)
1495{
1496 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
1497 return -EINVAL;
1498
72dc67a6 1499 down_write(&kvm->slots_lock);
1fe779f8
CO
1500
1501 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
f05e70ac 1502 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
1fe779f8 1503
72dc67a6 1504 up_write(&kvm->slots_lock);
1fe779f8
CO
1505 return 0;
1506}
1507
1508static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
1509{
f05e70ac 1510 return kvm->arch.n_alloc_mmu_pages;
1fe779f8
CO
1511}
1512
e9f85cde
ZX
1513gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
1514{
1515 int i;
1516 struct kvm_mem_alias *alias;
1517
d69fb81f
ZX
1518 for (i = 0; i < kvm->arch.naliases; ++i) {
1519 alias = &kvm->arch.aliases[i];
e9f85cde
ZX
1520 if (gfn >= alias->base_gfn
1521 && gfn < alias->base_gfn + alias->npages)
1522 return alias->target_gfn + gfn - alias->base_gfn;
1523 }
1524 return gfn;
1525}
1526
1fe779f8
CO
1527/*
1528 * Set a new alias region. Aliases map a portion of physical memory into
1529 * another portion. This is useful for memory windows, for example the PC
1530 * VGA region.
1531 */
1532static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1533 struct kvm_memory_alias *alias)
1534{
1535 int r, n;
1536 struct kvm_mem_alias *p;
1537
1538 r = -EINVAL;
1539 /* General sanity checks */
1540 if (alias->memory_size & (PAGE_SIZE - 1))
1541 goto out;
1542 if (alias->guest_phys_addr & (PAGE_SIZE - 1))
1543 goto out;
1544 if (alias->slot >= KVM_ALIAS_SLOTS)
1545 goto out;
1546 if (alias->guest_phys_addr + alias->memory_size
1547 < alias->guest_phys_addr)
1548 goto out;
1549 if (alias->target_phys_addr + alias->memory_size
1550 < alias->target_phys_addr)
1551 goto out;
1552
72dc67a6 1553 down_write(&kvm->slots_lock);
a1708ce8 1554 spin_lock(&kvm->mmu_lock);
1fe779f8 1555
d69fb81f 1556 p = &kvm->arch.aliases[alias->slot];
1fe779f8
CO
1557 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
1558 p->npages = alias->memory_size >> PAGE_SHIFT;
1559 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
1560
1561 for (n = KVM_ALIAS_SLOTS; n > 0; --n)
d69fb81f 1562 if (kvm->arch.aliases[n - 1].npages)
1fe779f8 1563 break;
d69fb81f 1564 kvm->arch.naliases = n;
1fe779f8 1565
a1708ce8 1566 spin_unlock(&kvm->mmu_lock);
1fe779f8
CO
1567 kvm_mmu_zap_all(kvm);
1568
72dc67a6 1569 up_write(&kvm->slots_lock);
1fe779f8
CO
1570
1571 return 0;
1572
1573out:
1574 return r;
1575}
1576
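Each alias slot simply remaps a contiguous gfn range onto a target range, and unalias_gfn() above resolves a gfn by scanning those slots; anything outside every alias passes through unchanged. A standalone restatement of that lookup:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t gfn_t;

struct mem_alias {
        gfn_t base_gfn;
        gfn_t npages;
        gfn_t target_gfn;
};

static gfn_t unalias(const struct mem_alias *a, int naliases, gfn_t gfn)
{
        int i;

        for (i = 0; i < naliases; i++)
                if (gfn >= a[i].base_gfn && gfn < a[i].base_gfn + a[i].npages)
                        return a[i].target_gfn + gfn - a[i].base_gfn;
        return gfn;                             /* not aliased: identity */
}

int main(void)
{
        /* e.g. a VGA-style window: gfn 0xa0, 32 pages, aliased onto 0x100 */
        struct mem_alias alias = { .base_gfn = 0xa0, .npages = 32, .target_gfn = 0x100 };

        printf("0xa5 -> 0x%llx\n", (unsigned long long)unalias(&alias, 1, 0xa5)); /* 0x105 */
        printf("0x50 -> 0x%llx\n", (unsigned long long)unalias(&alias, 1, 0x50)); /* 0x50 */
        return 0;
}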
1577static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1578{
1579 int r;
1580
1581 r = 0;
1582 switch (chip->chip_id) {
1583 case KVM_IRQCHIP_PIC_MASTER:
1584 memcpy(&chip->chip.pic,
1585 &pic_irqchip(kvm)->pics[0],
1586 sizeof(struct kvm_pic_state));
1587 break;
1588 case KVM_IRQCHIP_PIC_SLAVE:
1589 memcpy(&chip->chip.pic,
1590 &pic_irqchip(kvm)->pics[1],
1591 sizeof(struct kvm_pic_state));
1592 break;
1593 case KVM_IRQCHIP_IOAPIC:
1594 memcpy(&chip->chip.ioapic,
1595 ioapic_irqchip(kvm),
1596 sizeof(struct kvm_ioapic_state));
1597 break;
1598 default:
1599 r = -EINVAL;
1600 break;
1601 }
1602 return r;
1603}
1604
1605static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1606{
1607 int r;
1608
1609 r = 0;
1610 switch (chip->chip_id) {
1611 case KVM_IRQCHIP_PIC_MASTER:
1612 memcpy(&pic_irqchip(kvm)->pics[0],
1613 &chip->chip.pic,
1614 sizeof(struct kvm_pic_state));
1615 break;
1616 case KVM_IRQCHIP_PIC_SLAVE:
1617 memcpy(&pic_irqchip(kvm)->pics[1],
1618 &chip->chip.pic,
1619 sizeof(struct kvm_pic_state));
1620 break;
1621 case KVM_IRQCHIP_IOAPIC:
1622 memcpy(ioapic_irqchip(kvm),
1623 &chip->chip.ioapic,
1624 sizeof(struct kvm_ioapic_state));
1625 break;
1626 default:
1627 r = -EINVAL;
1628 break;
1629 }
1630 kvm_pic_update_irq(pic_irqchip(kvm));
1631 return r;
1632}
1633
e0f63cb9
SY
1634static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
1635{
1636 int r = 0;
1637
1638 memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
1639 return r;
1640}
1641
1642static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
1643{
1644 int r = 0;
1645
1646 memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
1647 kvm_pit_load_count(kvm, 0, ps->channels[0].count);
1648 return r;
1649}
1650
5bb064dc
ZX
1651/*
1652 * Get (and clear) the dirty memory log for a memory slot.
1653 */
1654int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1655 struct kvm_dirty_log *log)
1656{
1657 int r;
1658 int n;
1659 struct kvm_memory_slot *memslot;
1660 int is_dirty = 0;
1661
72dc67a6 1662 down_write(&kvm->slots_lock);
5bb064dc
ZX
1663
1664 r = kvm_get_dirty_log(kvm, log, &is_dirty);
1665 if (r)
1666 goto out;
1667
1668 /* If nothing is dirty, don't bother messing with page tables. */
1669 if (is_dirty) {
1670 kvm_mmu_slot_remove_write_access(kvm, log->slot);
1671 kvm_flush_remote_tlbs(kvm);
1672 memslot = &kvm->memslots[log->slot];
1673 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
1674 memset(memslot->dirty_bitmap, 0, n);
1675 }
1676 r = 0;
1677out:
72dc67a6 1678 up_write(&kvm->slots_lock);
5bb064dc
ZX
1679 return r;
1680}
1681
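From userspace this is the KVM_GET_DIRTY_LOG vm ioctl: the caller supplies a bitmap with one bit per page of the slot, and the kernel fills it and clears its internal copy. A sketch, assuming vm_fd and a memory slot of npages pages were set up with KVM_SET_USER_MEMORY_REGION elsewhere (bit order shown for a little-endian host):

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Fetch and scan the dirty bitmap for one slot; returns #dirty pages or -1. */
static long count_dirty_pages(int vm_fd, unsigned int slot, unsigned long npages)
{
        size_t bytes = ((npages + 63) / 64) * 8;        /* 64-bit aligned, 1 bit/page */
        unsigned char *bitmap = calloc(1, bytes);
        struct kvm_dirty_log log = { .slot = slot };
        long dirty = 0;
        size_t i;

        log.dirty_bitmap = bitmap;
        if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) < 0) {
                free(bitmap);
                return -1;
        }
        for (i = 0; i < npages; i++)
                if (bitmap[i / 8] & (1u << (i % 8)))
                        dirty++;
        free(bitmap);
        return dirty;
}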
1fe779f8
CO
1682long kvm_arch_vm_ioctl(struct file *filp,
1683 unsigned int ioctl, unsigned long arg)
1684{
1685 struct kvm *kvm = filp->private_data;
1686 void __user *argp = (void __user *)arg;
1687 int r = -EINVAL;
f0d66275
DH
1688 /*
1689 * This union makes it completely explicit to gcc-3.x
1690 * that these two variables' stack usage should be
1691 * combined, not added together.
1692 */
1693 union {
1694 struct kvm_pit_state ps;
1695 struct kvm_memory_alias alias;
1696 } u;
1fe779f8
CO
1697
1698 switch (ioctl) {
1699 case KVM_SET_TSS_ADDR:
1700 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
1701 if (r < 0)
1702 goto out;
1703 break;
1704 case KVM_SET_MEMORY_REGION: {
1705 struct kvm_memory_region kvm_mem;
1706 struct kvm_userspace_memory_region kvm_userspace_mem;
1707
1708 r = -EFAULT;
1709 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
1710 goto out;
1711 kvm_userspace_mem.slot = kvm_mem.slot;
1712 kvm_userspace_mem.flags = kvm_mem.flags;
1713 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
1714 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
1715 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
1716 if (r)
1717 goto out;
1718 break;
1719 }
1720 case KVM_SET_NR_MMU_PAGES:
1721 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
1722 if (r)
1723 goto out;
1724 break;
1725 case KVM_GET_NR_MMU_PAGES:
1726 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
1727 break;
f0d66275 1728 case KVM_SET_MEMORY_ALIAS:
1fe779f8 1729 r = -EFAULT;
f0d66275 1730 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
1fe779f8 1731 goto out;
f0d66275 1732 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
1fe779f8
CO
1733 if (r)
1734 goto out;
1735 break;
1fe779f8
CO
1736 case KVM_CREATE_IRQCHIP:
1737 r = -ENOMEM;
d7deeeb0
ZX
1738 kvm->arch.vpic = kvm_create_pic(kvm);
1739 if (kvm->arch.vpic) {
1fe779f8
CO
1740 r = kvm_ioapic_init(kvm);
1741 if (r) {
d7deeeb0
ZX
1742 kfree(kvm->arch.vpic);
1743 kvm->arch.vpic = NULL;
1fe779f8
CO
1744 goto out;
1745 }
1746 } else
1747 goto out;
1748 break;
7837699f
SY
1749 case KVM_CREATE_PIT:
1750 r = -ENOMEM;
1751 kvm->arch.vpit = kvm_create_pit(kvm);
1752 if (kvm->arch.vpit)
1753 r = 0;
1754 break;
1fe779f8
CO
1755 case KVM_IRQ_LINE: {
1756 struct kvm_irq_level irq_event;
1757
1758 r = -EFAULT;
1759 if (copy_from_user(&irq_event, argp, sizeof irq_event))
1760 goto out;
1761 if (irqchip_in_kernel(kvm)) {
1762 mutex_lock(&kvm->lock);
5550af4d
SY
1763 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
1764 irq_event.irq, irq_event.level);
1fe779f8
CO
1765 mutex_unlock(&kvm->lock);
1766 r = 0;
1767 }
1768 break;
1769 }
1770 case KVM_GET_IRQCHIP: {
1771 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
f0d66275 1772 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
1fe779f8 1773
f0d66275
DH
1774 r = -ENOMEM;
1775 if (!chip)
1fe779f8 1776 goto out;
f0d66275
DH
1777 r = -EFAULT;
1778 if (copy_from_user(chip, argp, sizeof *chip))
1779 goto get_irqchip_out;
1fe779f8
CO
1780 r = -ENXIO;
1781 if (!irqchip_in_kernel(kvm))
f0d66275
DH
1782 goto get_irqchip_out;
1783 r = kvm_vm_ioctl_get_irqchip(kvm, chip);
1fe779f8 1784 if (r)
f0d66275 1785 goto get_irqchip_out;
1fe779f8 1786 r = -EFAULT;
f0d66275
DH
1787 if (copy_to_user(argp, chip, sizeof *chip))
1788 goto get_irqchip_out;
1fe779f8 1789 r = 0;
f0d66275
DH
1790 get_irqchip_out:
1791 kfree(chip);
1792 if (r)
1793 goto out;
1fe779f8
CO
1794 break;
1795 }
1796 case KVM_SET_IRQCHIP: {
1797 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
f0d66275 1798 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
1fe779f8 1799
f0d66275
DH
1800 r = -ENOMEM;
1801 if (!chip)
1fe779f8 1802 goto out;
f0d66275
DH
1803 r = -EFAULT;
1804 if (copy_from_user(chip, argp, sizeof *chip))
1805 goto set_irqchip_out;
1fe779f8
CO
1806 r = -ENXIO;
1807 if (!irqchip_in_kernel(kvm))
f0d66275
DH
1808 goto set_irqchip_out;
1809 r = kvm_vm_ioctl_set_irqchip(kvm, chip);
1fe779f8 1810 if (r)
f0d66275 1811 goto set_irqchip_out;
1fe779f8 1812 r = 0;
f0d66275
DH
1813 set_irqchip_out:
1814 kfree(chip);
1815 if (r)
1816 goto out;
1fe779f8
CO
1817 break;
1818 }
e0f63cb9 1819 case KVM_GET_PIT: {
e0f63cb9 1820 r = -EFAULT;
f0d66275 1821 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
e0f63cb9
SY
1822 goto out;
1823 r = -ENXIO;
1824 if (!kvm->arch.vpit)
1825 goto out;
f0d66275 1826 r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
e0f63cb9
SY
1827 if (r)
1828 goto out;
1829 r = -EFAULT;
f0d66275 1830 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
e0f63cb9
SY
1831 goto out;
1832 r = 0;
1833 break;
1834 }
1835 case KVM_SET_PIT: {
e0f63cb9 1836 r = -EFAULT;
f0d66275 1837 if (copy_from_user(&u.ps, argp, sizeof u.ps))
e0f63cb9
SY
1838 goto out;
1839 r = -ENXIO;
1840 if (!kvm->arch.vpit)
1841 goto out;
f0d66275 1842 r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
e0f63cb9
SY
1843 if (r)
1844 goto out;
1845 r = 0;
1846 break;
1847 }
1fe779f8
CO
1848 default:
1849 ;
1850 }
1851out:
1852 return r;
1853}
1854
a16b043c 1855static void kvm_init_msr_list(void)
043405e1
CO
1856{
1857 u32 dummy[2];
1858 unsigned i, j;
1859
1860 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
1861 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
1862 continue;
1863 if (j < i)
1864 msrs_to_save[j] = msrs_to_save[i];
1865 j++;
1866 }
1867 num_msrs_to_save = j;
1868}
1869
bbd9b64e
CO
1870/*
 1871 * Only the apic needs an MMIO device hook, so shortcut now..
1872 */
1873static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
92760499
LV
1874 gpa_t addr, int len,
1875 int is_write)
bbd9b64e
CO
1876{
1877 struct kvm_io_device *dev;
1878
ad312c7c
ZX
1879 if (vcpu->arch.apic) {
1880 dev = &vcpu->arch.apic->dev;
92760499 1881 if (dev->in_range(dev, addr, len, is_write))
bbd9b64e
CO
1882 return dev;
1883 }
1884 return NULL;
1885}
1886
1887
1888static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
92760499
LV
1889 gpa_t addr, int len,
1890 int is_write)
bbd9b64e
CO
1891{
1892 struct kvm_io_device *dev;
1893
92760499 1894 dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write);
bbd9b64e 1895 if (dev == NULL)
92760499
LV
1896 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len,
1897 is_write);
bbd9b64e
CO
1898 return dev;
1899}
1900
1901int emulator_read_std(unsigned long addr,
1902 void *val,
1903 unsigned int bytes,
1904 struct kvm_vcpu *vcpu)
1905{
1906 void *data = val;
10589a46 1907 int r = X86EMUL_CONTINUE;
bbd9b64e
CO
1908
1909 while (bytes) {
ad312c7c 1910 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
bbd9b64e
CO
1911 unsigned offset = addr & (PAGE_SIZE-1);
1912 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
1913 int ret;
1914
10589a46
MT
1915 if (gpa == UNMAPPED_GVA) {
1916 r = X86EMUL_PROPAGATE_FAULT;
1917 goto out;
1918 }
bbd9b64e 1919 ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
10589a46
MT
1920 if (ret < 0) {
1921 r = X86EMUL_UNHANDLEABLE;
1922 goto out;
1923 }
bbd9b64e
CO
1924
1925 bytes -= tocopy;
1926 data += tocopy;
1927 addr += tocopy;
1928 }
10589a46 1929out:
10589a46 1930 return r;
bbd9b64e
CO
1931}
1932EXPORT_SYMBOL_GPL(emulator_read_std);
1933
bbd9b64e
CO
1934static int emulator_read_emulated(unsigned long addr,
1935 void *val,
1936 unsigned int bytes,
1937 struct kvm_vcpu *vcpu)
1938{
1939 struct kvm_io_device *mmio_dev;
1940 gpa_t gpa;
1941
1942 if (vcpu->mmio_read_completed) {
1943 memcpy(val, vcpu->mmio_data, bytes);
1944 vcpu->mmio_read_completed = 0;
1945 return X86EMUL_CONTINUE;
1946 }
1947
ad312c7c 1948 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
bbd9b64e
CO
1949
1950 /* For APIC access vmexit */
1951 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1952 goto mmio;
1953
1954 if (emulator_read_std(addr, val, bytes, vcpu)
1955 == X86EMUL_CONTINUE)
1956 return X86EMUL_CONTINUE;
1957 if (gpa == UNMAPPED_GVA)
1958 return X86EMUL_PROPAGATE_FAULT;
1959
1960mmio:
1961 /*
1962 * Is this MMIO handled locally?
1963 */
10589a46 1964 mutex_lock(&vcpu->kvm->lock);
92760499 1965 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0);
bbd9b64e
CO
1966 if (mmio_dev) {
1967 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
10589a46 1968 mutex_unlock(&vcpu->kvm->lock);
bbd9b64e
CO
1969 return X86EMUL_CONTINUE;
1970 }
10589a46 1971 mutex_unlock(&vcpu->kvm->lock);
bbd9b64e
CO
1972
1973 vcpu->mmio_needed = 1;
1974 vcpu->mmio_phys_addr = gpa;
1975 vcpu->mmio_size = bytes;
1976 vcpu->mmio_is_write = 0;
1977
1978 return X86EMUL_UNHANDLEABLE;
1979}
1980
3200f405 1981int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
9f811285 1982 const void *val, int bytes)
bbd9b64e
CO
1983{
1984 int ret;
1985
1986 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
9f811285 1987 if (ret < 0)
bbd9b64e
CO
1988 return 0;
1989 kvm_mmu_pte_write(vcpu, gpa, val, bytes);
1990 return 1;
1991}
1992
1993static int emulator_write_emulated_onepage(unsigned long addr,
1994 const void *val,
1995 unsigned int bytes,
1996 struct kvm_vcpu *vcpu)
1997{
1998 struct kvm_io_device *mmio_dev;
10589a46
MT
1999 gpa_t gpa;
2000
10589a46 2001 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
bbd9b64e
CO
2002
2003 if (gpa == UNMAPPED_GVA) {
c3c91fee 2004 kvm_inject_page_fault(vcpu, addr, 2);
bbd9b64e
CO
2005 return X86EMUL_PROPAGATE_FAULT;
2006 }
2007
2008 /* For APIC access vmexit */
2009 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2010 goto mmio;
2011
2012 if (emulator_write_phys(vcpu, gpa, val, bytes))
2013 return X86EMUL_CONTINUE;
2014
2015mmio:
2016 /*
2017 * Is this MMIO handled locally?
2018 */
10589a46 2019 mutex_lock(&vcpu->kvm->lock);
92760499 2020 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);
bbd9b64e
CO
2021 if (mmio_dev) {
2022 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
10589a46 2023 mutex_unlock(&vcpu->kvm->lock);
bbd9b64e
CO
2024 return X86EMUL_CONTINUE;
2025 }
10589a46 2026 mutex_unlock(&vcpu->kvm->lock);
bbd9b64e
CO
2027
2028 vcpu->mmio_needed = 1;
2029 vcpu->mmio_phys_addr = gpa;
2030 vcpu->mmio_size = bytes;
2031 vcpu->mmio_is_write = 1;
2032 memcpy(vcpu->mmio_data, val, bytes);
2033
2034 return X86EMUL_CONTINUE;
2035}
2036
2037int emulator_write_emulated(unsigned long addr,
2038 const void *val,
2039 unsigned int bytes,
2040 struct kvm_vcpu *vcpu)
2041{
2042 /* Crossing a page boundary? */
2043 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
2044 int rc, now;
2045
2046 now = -addr & ~PAGE_MASK;
2047 rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
2048 if (rc != X86EMUL_CONTINUE)
2049 return rc;
2050 addr += now;
2051 val += now;
2052 bytes -= now;
2053 }
2054 return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
2055}
2056EXPORT_SYMBOL_GPL(emulator_write_emulated);
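/*
 * Worked example of the page-boundary split above (assuming 4K pages):
 * for a 4-byte write to address 0x1fff, (addr + 3) ^ addr = 0x3ffd
 * intersects PAGE_MASK, so the write crosses a page.  now = -addr &
 * ~PAGE_MASK = 1, i.e. one byte goes to 0x1fff and the remaining three
 * bytes are handled by the second onepage call starting at 0x2000.
 */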
2057
2058static int emulator_cmpxchg_emulated(unsigned long addr,
2059 const void *old,
2060 const void *new,
2061 unsigned int bytes,
2062 struct kvm_vcpu *vcpu)
2063{
2064 static int reported;
2065
2066 if (!reported) {
2067 reported = 1;
2068 printk(KERN_WARNING "kvm: emulating exchange as write\n");
2069 }
2bacc55c
MT
2070#ifndef CONFIG_X86_64
 2071 /* a guest's cmpxchg8b has to be emulated atomically */
2072 if (bytes == 8) {
10589a46 2073 gpa_t gpa;
2bacc55c 2074 struct page *page;
c0b49b0d 2075 char *kaddr;
2bacc55c
MT
2076 u64 val;
2077
10589a46
MT
2078 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2079
2bacc55c
MT
2080 if (gpa == UNMAPPED_GVA ||
2081 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2082 goto emul_write;
2083
2084 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
2085 goto emul_write;
2086
2087 val = *(u64 *)new;
72dc67a6 2088
2bacc55c 2089 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
72dc67a6 2090
c0b49b0d
AM
2091 kaddr = kmap_atomic(page, KM_USER0);
2092 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
2093 kunmap_atomic(kaddr, KM_USER0);
2bacc55c
MT
2094 kvm_release_page_dirty(page);
2095 }
3200f405 2096emul_write:
2bacc55c
MT
2097#endif
2098
bbd9b64e
CO
2099 return emulator_write_emulated(addr, new, bytes, vcpu);
2100}
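/*
 * Note on the fallback above: the "exchange" is emulated as a plain
 * write and the compare is not re-checked, hence the one-time warning.
 * On 32-bit hosts a cmpxchg8b that stays within one page is at least
 * stored atomically via set_64bit() on a kmap of the guest page before
 * falling through to the ordinary emulated write, so the shadow-PTE
 * bookkeeping in kvm_mmu_pte_write() still runs for it.
 */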
2101
2102static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
2103{
2104 return kvm_x86_ops->get_segment_base(vcpu, seg);
2105}
2106
2107int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
2108{
a7052897 2109 kvm_mmu_invlpg(vcpu, address);
bbd9b64e
CO
2110 return X86EMUL_CONTINUE;
2111}
2112
2113int emulate_clts(struct kvm_vcpu *vcpu)
2114{
54e445ca 2115 KVMTRACE_0D(CLTS, vcpu, handler);
ad312c7c 2116 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
bbd9b64e
CO
2117 return X86EMUL_CONTINUE;
2118}
2119
2120int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
2121{
2122 struct kvm_vcpu *vcpu = ctxt->vcpu;
2123
2124 switch (dr) {
2125 case 0 ... 3:
2126 *dest = kvm_x86_ops->get_dr(vcpu, dr);
2127 return X86EMUL_CONTINUE;
2128 default:
b8688d51 2129 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
bbd9b64e
CO
2130 return X86EMUL_UNHANDLEABLE;
2131 }
2132}
2133
2134int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
2135{
2136 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
2137 int exception;
2138
2139 kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
2140 if (exception) {
2141 /* FIXME: better handling */
2142 return X86EMUL_UNHANDLEABLE;
2143 }
2144 return X86EMUL_CONTINUE;
2145}
2146
2147void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2148{
bbd9b64e 2149 u8 opcodes[4];
5fdbf976 2150 unsigned long rip = kvm_rip_read(vcpu);
bbd9b64e
CO
2151 unsigned long rip_linear;
2152
f76c710d 2153 if (!printk_ratelimit())
bbd9b64e
CO
2154 return;
2155
25be4608
GC
2156 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
2157
bbd9b64e
CO
2158 emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
2159
2160 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
2161 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
bbd9b64e
CO
2162}
2163EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
2164
14af3f3c 2165static struct x86_emulate_ops emulate_ops = {
bbd9b64e 2166 .read_std = emulator_read_std,
bbd9b64e
CO
2167 .read_emulated = emulator_read_emulated,
2168 .write_emulated = emulator_write_emulated,
2169 .cmpxchg_emulated = emulator_cmpxchg_emulated,
2170};
2171
5fdbf976
MT
2172static void cache_all_regs(struct kvm_vcpu *vcpu)
2173{
2174 kvm_register_read(vcpu, VCPU_REGS_RAX);
2175 kvm_register_read(vcpu, VCPU_REGS_RSP);
2176 kvm_register_read(vcpu, VCPU_REGS_RIP);
2177 vcpu->arch.regs_dirty = ~0;
2178}
2179
bbd9b64e
CO
2180int emulate_instruction(struct kvm_vcpu *vcpu,
2181 struct kvm_run *run,
2182 unsigned long cr2,
2183 u16 error_code,
571008da 2184 int emulation_type)
bbd9b64e
CO
2185{
2186 int r;
571008da 2187 struct decode_cache *c;
bbd9b64e 2188
26eef70c 2189 kvm_clear_exception_queue(vcpu);
ad312c7c 2190 vcpu->arch.mmio_fault_cr2 = cr2;
5fdbf976
MT
2191 /*
2192 * TODO: fix x86_emulate.c to use guest_read/write_register
 2193 * instead of direct ->regs accesses; this can save hundreds of cycles
 2194 * on Intel for instructions that don't read or change RSP,
 2195 * for example.
2196 */
2197 cache_all_regs(vcpu);
bbd9b64e
CO
2198
2199 vcpu->mmio_is_write = 0;
ad312c7c 2200 vcpu->arch.pio.string = 0;
bbd9b64e 2201
571008da 2202 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
bbd9b64e
CO
2203 int cs_db, cs_l;
2204 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
2205
ad312c7c
ZX
2206 vcpu->arch.emulate_ctxt.vcpu = vcpu;
2207 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
2208 vcpu->arch.emulate_ctxt.mode =
2209 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
bbd9b64e
CO
2210 ? X86EMUL_MODE_REAL : cs_l
2211 ? X86EMUL_MODE_PROT64 : cs_db
2212 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2213
ad312c7c 2214 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
571008da
SY
2215
 2216 /* Reject instructions other than VMCALL/VMMCALL when
 2217 * trying to emulate an invalid opcode */
2218 c = &vcpu->arch.emulate_ctxt.decode;
2219 if ((emulation_type & EMULTYPE_TRAP_UD) &&
2220 (!(c->twobyte && c->b == 0x01 &&
2221 (c->modrm_reg == 0 || c->modrm_reg == 3) &&
2222 c->modrm_mod == 3 && c->modrm_rm == 1)))
2223 return EMULATE_FAIL;
2224
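		/*
		 * Decoding note: VMCALL is 0f 01 c1 and VMMCALL is 0f 01 d9,
		 * i.e. two-byte opcode 0x01 with modrm mod=3, rm=1 and reg=0
		 * or 3, which is exactly what the check above accepts.
		 */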
f2b5756b 2225 ++vcpu->stat.insn_emulation;
bbd9b64e 2226 if (r) {
f2b5756b 2227 ++vcpu->stat.insn_emulation_fail;
bbd9b64e
CO
2228 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
2229 return EMULATE_DONE;
2230 return EMULATE_FAIL;
2231 }
2232 }
2233
ad312c7c 2234 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
bbd9b64e 2235
ad312c7c 2236 if (vcpu->arch.pio.string)
bbd9b64e
CO
2237 return EMULATE_DO_MMIO;
2238
2239 if ((r || vcpu->mmio_is_write) && run) {
2240 run->exit_reason = KVM_EXIT_MMIO;
2241 run->mmio.phys_addr = vcpu->mmio_phys_addr;
2242 memcpy(run->mmio.data, vcpu->mmio_data, 8);
2243 run->mmio.len = vcpu->mmio_size;
2244 run->mmio.is_write = vcpu->mmio_is_write;
2245 }
2246
2247 if (r) {
2248 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
2249 return EMULATE_DONE;
2250 if (!vcpu->mmio_needed) {
2251 kvm_report_emulation_failure(vcpu, "mmio");
2252 return EMULATE_FAIL;
2253 }
2254 return EMULATE_DO_MMIO;
2255 }
2256
ad312c7c 2257 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
bbd9b64e
CO
2258
2259 if (vcpu->mmio_is_write) {
2260 vcpu->mmio_needed = 0;
2261 return EMULATE_DO_MMIO;
2262 }
2263
2264 return EMULATE_DONE;
2265}
2266EXPORT_SYMBOL_GPL(emulate_instruction);
2267
de7d789a
CO
2268static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
2269{
2270 int i;
2271
ad312c7c
ZX
2272 for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i)
2273 if (vcpu->arch.pio.guest_pages[i]) {
2274 kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]);
2275 vcpu->arch.pio.guest_pages[i] = NULL;
de7d789a
CO
2276 }
2277}
2278
2279static int pio_copy_data(struct kvm_vcpu *vcpu)
2280{
ad312c7c 2281 void *p = vcpu->arch.pio_data;
de7d789a
CO
2282 void *q;
2283 unsigned bytes;
ad312c7c 2284 int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1;
de7d789a 2285
ad312c7c 2286 q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
de7d789a
CO
2287 PAGE_KERNEL);
2288 if (!q) {
2289 free_pio_guest_pages(vcpu);
2290 return -ENOMEM;
2291 }
ad312c7c
ZX
2292 q += vcpu->arch.pio.guest_page_offset;
2293 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
2294 if (vcpu->arch.pio.in)
de7d789a
CO
2295 memcpy(q, p, bytes);
2296 else
2297 memcpy(p, q, bytes);
ad312c7c 2298 q -= vcpu->arch.pio.guest_page_offset;
de7d789a
CO
2299 vunmap(q);
2300 free_pio_guest_pages(vcpu);
2301 return 0;
2302}
2303
2304int complete_pio(struct kvm_vcpu *vcpu)
2305{
ad312c7c 2306 struct kvm_pio_request *io = &vcpu->arch.pio;
de7d789a
CO
2307 long delta;
2308 int r;
5fdbf976 2309 unsigned long val;
de7d789a
CO
2310
2311 if (!io->string) {
5fdbf976
MT
2312 if (io->in) {
2313 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2314 memcpy(&val, vcpu->arch.pio_data, io->size);
2315 kvm_register_write(vcpu, VCPU_REGS_RAX, val);
2316 }
de7d789a
CO
2317 } else {
2318 if (io->in) {
2319 r = pio_copy_data(vcpu);
5fdbf976 2320 if (r)
de7d789a 2321 return r;
de7d789a
CO
2322 }
2323
2324 delta = 1;
2325 if (io->rep) {
2326 delta *= io->cur_count;
2327 /*
2328 * The size of the register should really depend on
2329 * current address size.
2330 */
5fdbf976
MT
2331 val = kvm_register_read(vcpu, VCPU_REGS_RCX);
2332 val -= delta;
2333 kvm_register_write(vcpu, VCPU_REGS_RCX, val);
de7d789a
CO
2334 }
2335 if (io->down)
2336 delta = -delta;
2337 delta *= io->size;
5fdbf976
MT
2338 if (io->in) {
2339 val = kvm_register_read(vcpu, VCPU_REGS_RDI);
2340 val += delta;
2341 kvm_register_write(vcpu, VCPU_REGS_RDI, val);
2342 } else {
2343 val = kvm_register_read(vcpu, VCPU_REGS_RSI);
2344 val += delta;
2345 kvm_register_write(vcpu, VCPU_REGS_RSI, val);
2346 }
de7d789a
CO
2347 }
2348
de7d789a
CO
2349 io->count -= io->cur_count;
2350 io->cur_count = 0;
2351
2352 return 0;
2353}
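/*
 * Example of the register fixup above: a "rep outsw" that completed
 * cur_count == 4 iterations of size 2 decrements RCX by 4 and advances
 * RSI by 4 * 2 = 8 bytes (or moves it backwards when io->down is set,
 * i.e. when the guest ran the string op with the direction flag set).
 */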
2354
2355static void kernel_pio(struct kvm_io_device *pio_dev,
2356 struct kvm_vcpu *vcpu,
2357 void *pd)
2358{
 2359 /* TODO: string I/O for in-kernel devices */
2360
2361 mutex_lock(&vcpu->kvm->lock);
ad312c7c
ZX
2362 if (vcpu->arch.pio.in)
2363 kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
2364 vcpu->arch.pio.size,
de7d789a
CO
2365 pd);
2366 else
ad312c7c
ZX
2367 kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
2368 vcpu->arch.pio.size,
de7d789a
CO
2369 pd);
2370 mutex_unlock(&vcpu->kvm->lock);
2371}
2372
2373static void pio_string_write(struct kvm_io_device *pio_dev,
2374 struct kvm_vcpu *vcpu)
2375{
ad312c7c
ZX
2376 struct kvm_pio_request *io = &vcpu->arch.pio;
2377 void *pd = vcpu->arch.pio_data;
de7d789a
CO
2378 int i;
2379
2380 mutex_lock(&vcpu->kvm->lock);
2381 for (i = 0; i < io->cur_count; i++) {
2382 kvm_iodevice_write(pio_dev, io->port,
2383 io->size,
2384 pd);
2385 pd += io->size;
2386 }
2387 mutex_unlock(&vcpu->kvm->lock);
2388}
2389
2390static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
92760499
LV
2391 gpa_t addr, int len,
2392 int is_write)
de7d789a 2393{
92760499 2394 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write);
de7d789a
CO
2395}
2396
2397int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2398 int size, unsigned port)
2399{
2400 struct kvm_io_device *pio_dev;
5fdbf976 2401 unsigned long val;
de7d789a
CO
2402
2403 vcpu->run->exit_reason = KVM_EXIT_IO;
2404 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
ad312c7c 2405 vcpu->run->io.size = vcpu->arch.pio.size = size;
de7d789a 2406 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
ad312c7c
ZX
2407 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
2408 vcpu->run->io.port = vcpu->arch.pio.port = port;
2409 vcpu->arch.pio.in = in;
2410 vcpu->arch.pio.string = 0;
2411 vcpu->arch.pio.down = 0;
2412 vcpu->arch.pio.guest_page_offset = 0;
2413 vcpu->arch.pio.rep = 0;
de7d789a 2414
2714d1d3
FEL
2415 if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
2416 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
2417 handler);
2418 else
2419 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
2420 handler);
2421
5fdbf976
MT
2422 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2423 memcpy(vcpu->arch.pio_data, &val, 4);
de7d789a
CO
2424
2425 kvm_x86_ops->skip_emulated_instruction(vcpu);
2426
92760499 2427 pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
de7d789a 2428 if (pio_dev) {
ad312c7c 2429 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
de7d789a
CO
2430 complete_pio(vcpu);
2431 return 1;
2432 }
2433 return 0;
2434}
2435EXPORT_SYMBOL_GPL(kvm_emulate_pio);
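/*
 * For the non-string case above, the data travels through the per-vcpu
 * pio_data page: RAX is copied into it before the exit (enough for an
 * OUT), and complete_pio() copies it back into RAX once an IN has been
 * satisfied by userspace or by an in-kernel device.  Userspace is told
 * where to look via run->io.data_offset, which (as far as this code is
 * concerned) is simply KVM_PIO_PAGE_OFFSET * PAGE_SIZE.
 */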
2436
2437int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2438 int size, unsigned long count, int down,
2439 gva_t address, int rep, unsigned port)
2440{
2441 unsigned now, in_page;
2442 int i, ret = 0;
2443 int nr_pages = 1;
2444 struct page *page;
2445 struct kvm_io_device *pio_dev;
2446
2447 vcpu->run->exit_reason = KVM_EXIT_IO;
2448 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
ad312c7c 2449 vcpu->run->io.size = vcpu->arch.pio.size = size;
de7d789a 2450 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
ad312c7c
ZX
2451 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
2452 vcpu->run->io.port = vcpu->arch.pio.port = port;
2453 vcpu->arch.pio.in = in;
2454 vcpu->arch.pio.string = 1;
2455 vcpu->arch.pio.down = down;
2456 vcpu->arch.pio.guest_page_offset = offset_in_page(address);
2457 vcpu->arch.pio.rep = rep;
de7d789a 2458
2714d1d3
FEL
2459 if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
2460 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
2461 handler);
2462 else
2463 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
2464 handler);
2465
de7d789a
CO
2466 if (!count) {
2467 kvm_x86_ops->skip_emulated_instruction(vcpu);
2468 return 1;
2469 }
2470
2471 if (!down)
2472 in_page = PAGE_SIZE - offset_in_page(address);
2473 else
2474 in_page = offset_in_page(address) + size;
2475 now = min(count, (unsigned long)in_page / size);
2476 if (!now) {
2477 /*
 2478 * String I/O straddles a page boundary. Pin two guest pages
2479 * so that we satisfy atomicity constraints. Do just one
2480 * transaction to avoid complexity.
2481 */
2482 nr_pages = 2;
2483 now = 1;
2484 }
2485 if (down) {
2486 /*
2487 * String I/O in reverse. Yuck. Kill the guest, fix later.
2488 */
2489 pr_unimpl(vcpu, "guest string pio down\n");
c1a5d4f9 2490 kvm_inject_gp(vcpu, 0);
de7d789a
CO
2491 return 1;
2492 }
2493 vcpu->run->io.count = now;
ad312c7c 2494 vcpu->arch.pio.cur_count = now;
de7d789a 2495
ad312c7c 2496 if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
de7d789a
CO
2497 kvm_x86_ops->skip_emulated_instruction(vcpu);
2498
2499 for (i = 0; i < nr_pages; ++i) {
de7d789a 2500 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
ad312c7c 2501 vcpu->arch.pio.guest_pages[i] = page;
de7d789a 2502 if (!page) {
c1a5d4f9 2503 kvm_inject_gp(vcpu, 0);
de7d789a
CO
2504 free_pio_guest_pages(vcpu);
2505 return 1;
2506 }
2507 }
2508
92760499
LV
2509 pio_dev = vcpu_find_pio_dev(vcpu, port,
2510 vcpu->arch.pio.cur_count,
2511 !vcpu->arch.pio.in);
ad312c7c 2512 if (!vcpu->arch.pio.in) {
de7d789a
CO
2513 /* string PIO write */
2514 ret = pio_copy_data(vcpu);
2515 if (ret >= 0 && pio_dev) {
2516 pio_string_write(pio_dev, vcpu);
2517 complete_pio(vcpu);
ad312c7c 2518 if (vcpu->arch.pio.count == 0)
de7d789a
CO
2519 ret = 1;
2520 }
2521 } else if (pio_dev)
2522 pr_unimpl(vcpu, "no string pio read support yet, "
2523 "port %x size %d count %ld\n",
2524 port, size, count);
2525
2526 return ret;
2527}
2528EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
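/*
 * Example of the "now" computation above (4K pages): a "rep insw"
 * (size 2, count 100) whose target sits at page offset 0xff0 has
 * in_page = 0x10, so now = min(100, 8) = 8 elements are transferred in
 * this pass; since cur_count != count, RIP is not advanced and the
 * instruction re-executes for the remainder.  If not even one element
 * fits before the boundary, two guest pages are pinned and a single
 * element is moved.
 */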
2529
f8c16bba 2530int kvm_arch_init(void *opaque)
043405e1 2531{
56c6d28a 2532 int r;
f8c16bba
ZX
2533 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
2534
f8c16bba
ZX
2535 if (kvm_x86_ops) {
2536 printk(KERN_ERR "kvm: already loaded the other module\n");
56c6d28a
ZX
2537 r = -EEXIST;
2538 goto out;
f8c16bba
ZX
2539 }
2540
2541 if (!ops->cpu_has_kvm_support()) {
2542 printk(KERN_ERR "kvm: no hardware support\n");
56c6d28a
ZX
2543 r = -EOPNOTSUPP;
2544 goto out;
f8c16bba
ZX
2545 }
2546 if (ops->disabled_by_bios()) {
2547 printk(KERN_ERR "kvm: disabled by bios\n");
56c6d28a
ZX
2548 r = -EOPNOTSUPP;
2549 goto out;
f8c16bba
ZX
2550 }
2551
97db56ce
AK
2552 r = kvm_mmu_module_init();
2553 if (r)
2554 goto out;
2555
2556 kvm_init_msr_list();
2557
f8c16bba 2558 kvm_x86_ops = ops;
56c6d28a 2559 kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
7b52345e
SY
2560 kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
2561 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
2562 PT_DIRTY_MASK, PT64_NX_MASK, 0);
f8c16bba 2563 return 0;
56c6d28a
ZX
2564
2565out:
56c6d28a 2566 return r;
043405e1 2567}
8776e519 2568
f8c16bba
ZX
2569void kvm_arch_exit(void)
2570{
2571 kvm_x86_ops = NULL;
56c6d28a
ZX
2572 kvm_mmu_module_exit();
2573}
f8c16bba 2574
8776e519
HB
2575int kvm_emulate_halt(struct kvm_vcpu *vcpu)
2576{
2577 ++vcpu->stat.halt_exits;
2714d1d3 2578 KVMTRACE_0D(HLT, vcpu, handler);
8776e519 2579 if (irqchip_in_kernel(vcpu->kvm)) {
a4535290 2580 vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
8776e519
HB
2581 return 1;
2582 } else {
2583 vcpu->run->exit_reason = KVM_EXIT_HLT;
2584 return 0;
2585 }
2586}
2587EXPORT_SYMBOL_GPL(kvm_emulate_halt);
2588
2f333bcb
MT
2589static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
2590 unsigned long a1)
2591{
2592 if (is_long_mode(vcpu))
2593 return a0;
2594 else
2595 return a0 | ((gpa_t)a1 << 32);
2596}
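/*
 * Example: a 32-bit guest passes the low half of a hypercall gpa in a0
 * and the high half in a1, so a0 = 0x00001000, a1 = 0x2 combine to
 * 0x200001000; a long-mode guest passes the whole address in a0.
 */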
2597
8776e519
HB
2598int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2599{
2600 unsigned long nr, a0, a1, a2, a3, ret;
2f333bcb 2601 int r = 1;
8776e519 2602
5fdbf976
MT
2603 nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
2604 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
2605 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
2606 a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
2607 a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
8776e519 2608
2714d1d3
FEL
2609 KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
2610
8776e519
HB
2611 if (!is_long_mode(vcpu)) {
2612 nr &= 0xFFFFFFFF;
2613 a0 &= 0xFFFFFFFF;
2614 a1 &= 0xFFFFFFFF;
2615 a2 &= 0xFFFFFFFF;
2616 a3 &= 0xFFFFFFFF;
2617 }
2618
2619 switch (nr) {
b93463aa
AK
2620 case KVM_HC_VAPIC_POLL_IRQ:
2621 ret = 0;
2622 break;
2f333bcb
MT
2623 case KVM_HC_MMU_OP:
2624 r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
2625 break;
8776e519
HB
2626 default:
2627 ret = -KVM_ENOSYS;
2628 break;
2629 }
5fdbf976 2630 kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
f11c3a8d 2631 ++vcpu->stat.hypercalls;
2f333bcb 2632 return r;
8776e519
HB
2633}
2634EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
2635
2636int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2637{
2638 char instruction[3];
2639 int ret = 0;
5fdbf976 2640 unsigned long rip = kvm_rip_read(vcpu);
8776e519 2641
8776e519
HB
2642
2643 /*
 2644 * Blow out the MMU so that no other VCPU has an active mapping,
 2645 * ensuring that the updated hypercall appears atomically across all
2646 * VCPUs.
2647 */
2648 kvm_mmu_zap_all(vcpu->kvm);
2649
8776e519 2650 kvm_x86_ops->patch_hypercall(vcpu, instruction);
5fdbf976 2651 if (emulator_write_emulated(rip, instruction, 3, vcpu)
8776e519
HB
2652 != X86EMUL_CONTINUE)
2653 ret = -EFAULT;
2654
8776e519
HB
2655 return ret;
2656}
2657
2658static u64 mk_cr_64(u64 curr_cr, u32 new_val)
2659{
2660 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
2661}
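/*
 * mk_cr_64() keeps the upper 32 bits of the current control register
 * and substitutes the low 32, so the 32-bit CR0/CR4 writes coming from
 * realmode_set_cr() below cannot clobber the upper half.  Example with
 * made-up values: curr_cr 0x0000000100000011 and new_val 0x80000011
 * give 0x0000000180000011.
 */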
2662
2663void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2664{
2665 struct descriptor_table dt = { limit, base };
2666
2667 kvm_x86_ops->set_gdt(vcpu, &dt);
2668}
2669
2670void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2671{
2672 struct descriptor_table dt = { limit, base };
2673
2674 kvm_x86_ops->set_idt(vcpu, &dt);
2675}
2676
2677void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
2678 unsigned long *rflags)
2679{
2d3ad1f4 2680 kvm_lmsw(vcpu, msw);
8776e519
HB
2681 *rflags = kvm_x86_ops->get_rflags(vcpu);
2682}
2683
2684unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2685{
54e445ca
JR
2686 unsigned long value;
2687
8776e519
HB
2688 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2689 switch (cr) {
2690 case 0:
54e445ca
JR
2691 value = vcpu->arch.cr0;
2692 break;
8776e519 2693 case 2:
54e445ca
JR
2694 value = vcpu->arch.cr2;
2695 break;
8776e519 2696 case 3:
54e445ca
JR
2697 value = vcpu->arch.cr3;
2698 break;
8776e519 2699 case 4:
54e445ca
JR
2700 value = vcpu->arch.cr4;
2701 break;
152ff9be 2702 case 8:
54e445ca
JR
2703 value = kvm_get_cr8(vcpu);
2704 break;
8776e519 2705 default:
b8688d51 2706 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
8776e519
HB
2707 return 0;
2708 }
54e445ca
JR
2709 KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value,
2710 (u32)((u64)value >> 32), handler);
2711
2712 return value;
8776e519
HB
2713}
2714
2715void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
2716 unsigned long *rflags)
2717{
54e445ca
JR
2718 KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val,
2719 (u32)((u64)val >> 32), handler);
2720
8776e519
HB
2721 switch (cr) {
2722 case 0:
2d3ad1f4 2723 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
8776e519
HB
2724 *rflags = kvm_x86_ops->get_rflags(vcpu);
2725 break;
2726 case 2:
ad312c7c 2727 vcpu->arch.cr2 = val;
8776e519
HB
2728 break;
2729 case 3:
2d3ad1f4 2730 kvm_set_cr3(vcpu, val);
8776e519
HB
2731 break;
2732 case 4:
2d3ad1f4 2733 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
8776e519 2734 break;
152ff9be 2735 case 8:
2d3ad1f4 2736 kvm_set_cr8(vcpu, val & 0xfUL);
152ff9be 2737 break;
8776e519 2738 default:
b8688d51 2739 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
8776e519
HB
2740 }
2741}
2742
07716717
DK
2743static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
2744{
ad312c7c
ZX
2745 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
2746 int j, nent = vcpu->arch.cpuid_nent;
07716717
DK
2747
2748 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
2749 /* when no next entry is found, the current entry[i] is reselected */
2750 for (j = i + 1; j == i; j = (j + 1) % nent) {
ad312c7c 2751 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
07716717
DK
2752 if (ej->function == e->function) {
2753 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
2754 return j;
2755 }
2756 }
2757 return 0; /* silence gcc, even though control never reaches here */
2758}
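/*
 * This implements "stateful" CPUID leaves (such as the cache-descriptor
 * leaf 2, where some CPUs expect successive executions to return
 * successive values): the READ_NEXT flag is handed to the next entry
 * with the same function number, wrapping around, so repeated
 * kvm_emulate_cpuid() calls walk the group of entries in order.
 */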
2759
2760/* find an entry with matching function, matching index (if needed), and that
2761 * should be read next (if it's stateful) */
2762static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
2763 u32 function, u32 index)
2764{
2765 if (e->function != function)
2766 return 0;
2767 if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
2768 return 0;
2769 if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
2770 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
2771 return 0;
2772 return 1;
2773}
2774
8776e519
HB
2775void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
2776{
2777 int i;
07716717
DK
2778 u32 function, index;
2779 struct kvm_cpuid_entry2 *e, *best;
8776e519 2780
5fdbf976
MT
2781 function = kvm_register_read(vcpu, VCPU_REGS_RAX);
2782 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
2783 kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
2784 kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
2785 kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
2786 kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
8776e519 2787 best = NULL;
ad312c7c
ZX
2788 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
2789 e = &vcpu->arch.cpuid_entries[i];
07716717
DK
2790 if (is_matching_cpuid_entry(e, function, index)) {
2791 if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
2792 move_to_next_stateful_cpuid_entry(vcpu, i);
8776e519
HB
2793 best = e;
2794 break;
2795 }
2796 /*
2797 * Both basic or both extended?
2798 */
2799 if (((e->function ^ function) & 0x80000000) == 0)
2800 if (!best || e->function > best->function)
2801 best = e;
2802 }
2803 if (best) {
5fdbf976
MT
2804 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
2805 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
2806 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
2807 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
8776e519 2808 }
8776e519 2809 kvm_x86_ops->skip_emulated_instruction(vcpu);
2714d1d3 2810 KVMTRACE_5D(CPUID, vcpu, function,
5fdbf976
MT
2811 (u32)kvm_register_read(vcpu, VCPU_REGS_RAX),
2812 (u32)kvm_register_read(vcpu, VCPU_REGS_RBX),
2813 (u32)kvm_register_read(vcpu, VCPU_REGS_RCX),
2814 (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler);
8776e519
HB
2815}
2816EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
d0752060 2817
b6c7a5dc
HB
2818/*
2819 * Check if userspace requested an interrupt window, and that the
2820 * interrupt window is open.
2821 *
2822 * No need to exit to userspace if we already have an interrupt queued.
2823 */
2824static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
2825 struct kvm_run *kvm_run)
2826{
ad312c7c 2827 return (!vcpu->arch.irq_summary &&
b6c7a5dc 2828 kvm_run->request_interrupt_window &&
ad312c7c 2829 vcpu->arch.interrupt_window_open &&
b6c7a5dc
HB
2830 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
2831}
2832
c4abb7c9
JK
2833/*
 2834 * Check if userspace requested an NMI window, and that the NMI window
2835 * is open.
2836 *
 2837 * No need to exit to userspace if we already have an NMI queued.
2838 */
2839static int dm_request_for_nmi_injection(struct kvm_vcpu *vcpu,
2840 struct kvm_run *kvm_run)
2841{
2842 return (!vcpu->arch.nmi_pending &&
2843 kvm_run->request_nmi_window &&
2844 vcpu->arch.nmi_window_open);
2845}
2846
b6c7a5dc
HB
2847static void post_kvm_run_save(struct kvm_vcpu *vcpu,
2848 struct kvm_run *kvm_run)
2849{
2850 kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
2d3ad1f4 2851 kvm_run->cr8 = kvm_get_cr8(vcpu);
b6c7a5dc 2852 kvm_run->apic_base = kvm_get_apic_base(vcpu);
c4abb7c9 2853 if (irqchip_in_kernel(vcpu->kvm)) {
b6c7a5dc 2854 kvm_run->ready_for_interrupt_injection = 1;
c4abb7c9
JK
2855 kvm_run->ready_for_nmi_injection = 1;
2856 } else {
b6c7a5dc 2857 kvm_run->ready_for_interrupt_injection =
ad312c7c
ZX
2858 (vcpu->arch.interrupt_window_open &&
2859 vcpu->arch.irq_summary == 0);
c4abb7c9
JK
2860 kvm_run->ready_for_nmi_injection =
2861 (vcpu->arch.nmi_window_open &&
2862 vcpu->arch.nmi_pending == 0);
2863 }
b6c7a5dc
HB
2864}
2865
b93463aa
AK
2866static void vapic_enter(struct kvm_vcpu *vcpu)
2867{
2868 struct kvm_lapic *apic = vcpu->arch.apic;
2869 struct page *page;
2870
2871 if (!apic || !apic->vapic_addr)
2872 return;
2873
2874 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
72dc67a6
IE
2875
2876 vcpu->arch.apic->vapic_page = page;
b93463aa
AK
2877}
2878
2879static void vapic_exit(struct kvm_vcpu *vcpu)
2880{
2881 struct kvm_lapic *apic = vcpu->arch.apic;
2882
2883 if (!apic || !apic->vapic_addr)
2884 return;
2885
f8b78fa3 2886 down_read(&vcpu->kvm->slots_lock);
b93463aa
AK
2887 kvm_release_page_dirty(apic->vapic_page);
2888 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
f8b78fa3 2889 up_read(&vcpu->kvm->slots_lock);
b93463aa
AK
2890}
2891
d7690175 2892static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
b6c7a5dc
HB
2893{
2894 int r;
2895
2e53d63a
MT
2896 if (vcpu->requests)
2897 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
2898 kvm_mmu_unload(vcpu);
2899
b6c7a5dc
HB
2900 r = kvm_mmu_reload(vcpu);
2901 if (unlikely(r))
2902 goto out;
2903
2f52d58c
AK
2904 if (vcpu->requests) {
2905 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
2f599714 2906 __kvm_migrate_timers(vcpu);
4731d4c7
MT
2907 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
2908 kvm_mmu_sync_roots(vcpu);
d4acf7e7
MT
2909 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2910 kvm_x86_ops->tlb_flush(vcpu);
b93463aa
AK
2911 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
2912 &vcpu->requests)) {
2913 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
2914 r = 0;
2915 goto out;
2916 }
71c4dfaf
JR
2917 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
2918 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
2919 r = 0;
2920 goto out;
2921 }
2f52d58c 2922 }
b93463aa 2923
06e05645 2924 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
b6c7a5dc
HB
2925 kvm_inject_pending_timer_irqs(vcpu);
2926
2927 preempt_disable();
2928
2929 kvm_x86_ops->prepare_guest_switch(vcpu);
2930 kvm_load_guest_fpu(vcpu);
2931
2932 local_irq_disable();
2933
d7690175 2934 if (vcpu->requests || need_resched() || signal_pending(current)) {
6c142801
AK
2935 local_irq_enable();
2936 preempt_enable();
2937 r = 1;
2938 goto out;
2939 }
2940
29415c37
MT
2941 if (vcpu->guest_debug.enabled)
2942 kvm_x86_ops->guest_debug_pre(vcpu);
b6c7a5dc 2943
e9571ed5
MT
2944 vcpu->guest_mode = 1;
2945 /*
2946 * Make sure that guest_mode assignment won't happen after
2947 * testing the pending IRQ vector bitmap.
2948 */
2949 smp_wmb();
2950
ad312c7c 2951 if (vcpu->arch.exception.pending)
298101da
AK
2952 __queue_exception(vcpu);
2953 else if (irqchip_in_kernel(vcpu->kvm))
b6c7a5dc 2954 kvm_x86_ops->inject_pending_irq(vcpu);
eb9774f0 2955 else
b6c7a5dc
HB
2956 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
2957
b93463aa
AK
2958 kvm_lapic_sync_to_vapic(vcpu);
2959
3200f405
MT
2960 up_read(&vcpu->kvm->slots_lock);
2961
b6c7a5dc
HB
2962 kvm_guest_enter();
2963
b6c7a5dc 2964
2714d1d3 2965 KVMTRACE_0D(VMENTRY, vcpu, entryexit);
b6c7a5dc
HB
2966 kvm_x86_ops->run(vcpu, kvm_run);
2967
2968 vcpu->guest_mode = 0;
2969 local_irq_enable();
2970
2971 ++vcpu->stat.exits;
2972
2973 /*
2974 * We must have an instruction between local_irq_enable() and
2975 * kvm_guest_exit(), so the timer interrupt isn't delayed by
2976 * the interrupt shadow. The stat.exits increment will do nicely.
2977 * But we need to prevent reordering, hence this barrier():
2978 */
2979 barrier();
2980
2981 kvm_guest_exit();
2982
2983 preempt_enable();
2984
3200f405
MT
2985 down_read(&vcpu->kvm->slots_lock);
2986
b6c7a5dc
HB
2987 /*
2988 * Profile KVM exit RIPs:
2989 */
2990 if (unlikely(prof_on == KVM_PROFILING)) {
5fdbf976
MT
2991 unsigned long rip = kvm_rip_read(vcpu);
2992 profile_hit(KVM_PROFILING, (void *)rip);
b6c7a5dc
HB
2993 }
2994
ad312c7c
ZX
2995 if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
2996 vcpu->arch.exception.pending = false;
298101da 2997
b93463aa
AK
2998 kvm_lapic_sync_from_vapic(vcpu);
2999
b6c7a5dc 3000 r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
d7690175
MT
3001out:
3002 return r;
3003}
b6c7a5dc 3004
d7690175
MT
3005static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3006{
3007 int r;
3008
3009 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
1b10bf31
JK
3010 pr_debug("vcpu %d received sipi with vector # %x\n",
3011 vcpu->vcpu_id, vcpu->arch.sipi_vector);
d7690175 3012 kvm_lapic_reset(vcpu);
5f179287 3013 r = kvm_arch_vcpu_reset(vcpu);
d7690175
MT
3014 if (r)
3015 return r;
3016 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
b6c7a5dc
HB
3017 }
3018
d7690175
MT
3019 down_read(&vcpu->kvm->slots_lock);
3020 vapic_enter(vcpu);
3021
3022 r = 1;
3023 while (r > 0) {
af2152f5 3024 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
d7690175
MT
3025 r = vcpu_enter_guest(vcpu, kvm_run);
3026 else {
3027 up_read(&vcpu->kvm->slots_lock);
3028 kvm_vcpu_block(vcpu);
3029 down_read(&vcpu->kvm->slots_lock);
3030 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
3031 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
3032 vcpu->arch.mp_state =
3033 KVM_MP_STATE_RUNNABLE;
3034 if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
3035 r = -EINTR;
3036 }
3037
3038 if (r > 0) {
c4abb7c9
JK
3039 if (dm_request_for_nmi_injection(vcpu, kvm_run)) {
3040 r = -EINTR;
3041 kvm_run->exit_reason = KVM_EXIT_NMI;
3042 ++vcpu->stat.request_nmi_exits;
3043 }
d7690175
MT
3044 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
3045 r = -EINTR;
3046 kvm_run->exit_reason = KVM_EXIT_INTR;
3047 ++vcpu->stat.request_irq_exits;
3048 }
3049 if (signal_pending(current)) {
3050 r = -EINTR;
3051 kvm_run->exit_reason = KVM_EXIT_INTR;
3052 ++vcpu->stat.signal_exits;
3053 }
3054 if (need_resched()) {
3055 up_read(&vcpu->kvm->slots_lock);
3056 kvm_resched(vcpu);
3057 down_read(&vcpu->kvm->slots_lock);
3058 }
3059 }
b6c7a5dc
HB
3060 }
3061
d7690175 3062 up_read(&vcpu->kvm->slots_lock);
b6c7a5dc
HB
3063 post_kvm_run_save(vcpu, kvm_run);
3064
b93463aa
AK
3065 vapic_exit(vcpu);
3066
b6c7a5dc
HB
3067 return r;
3068}
3069
3070int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3071{
3072 int r;
3073 sigset_t sigsaved;
3074
3075 vcpu_load(vcpu);
3076
ac9f6dc0
AK
3077 if (vcpu->sigset_active)
3078 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
3079
a4535290 3080 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
b6c7a5dc 3081 kvm_vcpu_block(vcpu);
d7690175 3082 clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
ac9f6dc0
AK
3083 r = -EAGAIN;
3084 goto out;
b6c7a5dc
HB
3085 }
3086
b6c7a5dc
HB
3087 /* re-sync apic's tpr */
3088 if (!irqchip_in_kernel(vcpu->kvm))
2d3ad1f4 3089 kvm_set_cr8(vcpu, kvm_run->cr8);
b6c7a5dc 3090
ad312c7c 3091 if (vcpu->arch.pio.cur_count) {
b6c7a5dc
HB
3092 r = complete_pio(vcpu);
3093 if (r)
3094 goto out;
3095 }
3096#if CONFIG_HAS_IOMEM
3097 if (vcpu->mmio_needed) {
3098 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
3099 vcpu->mmio_read_completed = 1;
3100 vcpu->mmio_needed = 0;
3200f405
MT
3101
3102 down_read(&vcpu->kvm->slots_lock);
b6c7a5dc 3103 r = emulate_instruction(vcpu, kvm_run,
571008da
SY
3104 vcpu->arch.mmio_fault_cr2, 0,
3105 EMULTYPE_NO_DECODE);
3200f405 3106 up_read(&vcpu->kvm->slots_lock);
b6c7a5dc
HB
3107 if (r == EMULATE_DO_MMIO) {
3108 /*
3109 * Read-modify-write. Back to userspace.
3110 */
3111 r = 0;
3112 goto out;
3113 }
3114 }
3115#endif
5fdbf976
MT
3116 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
3117 kvm_register_write(vcpu, VCPU_REGS_RAX,
3118 kvm_run->hypercall.ret);
b6c7a5dc
HB
3119
3120 r = __vcpu_run(vcpu, kvm_run);
3121
3122out:
3123 if (vcpu->sigset_active)
3124 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
3125
3126 vcpu_put(vcpu);
3127 return r;
3128}
3129
3130int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3131{
3132 vcpu_load(vcpu);
3133
5fdbf976
MT
3134 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3135 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3136 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3137 regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3138 regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
3139 regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
3140 regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3141 regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
b6c7a5dc 3142#ifdef CONFIG_X86_64
5fdbf976
MT
3143 regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
3144 regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
3145 regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
3146 regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
3147 regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
3148 regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
3149 regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
3150 regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
b6c7a5dc
HB
3151#endif
3152
5fdbf976 3153 regs->rip = kvm_rip_read(vcpu);
b6c7a5dc
HB
3154 regs->rflags = kvm_x86_ops->get_rflags(vcpu);
3155
3156 /*
3157 * Don't leak debug flags in case they were set for guest debugging
3158 */
3159 if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
3160 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3161
3162 vcpu_put(vcpu);
3163
3164 return 0;
3165}
3166
3167int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3168{
3169 vcpu_load(vcpu);
3170
5fdbf976
MT
3171 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
3172 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
3173 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
3174 kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
3175 kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
3176 kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
3177 kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
3178 kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
b6c7a5dc 3179#ifdef CONFIG_X86_64
5fdbf976
MT
3180 kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
3181 kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
3182 kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
3183 kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
3184 kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
3185 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
3186 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
3187 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
3188
b6c7a5dc
HB
3189#endif
3190
5fdbf976 3191 kvm_rip_write(vcpu, regs->rip);
b6c7a5dc
HB
3192 kvm_x86_ops->set_rflags(vcpu, regs->rflags);
3193
b6c7a5dc 3194
b4f14abd
JK
3195 vcpu->arch.exception.pending = false;
3196
b6c7a5dc
HB
3197 vcpu_put(vcpu);
3198
3199 return 0;
3200}
3201
3e6e0aab
GT
3202void kvm_get_segment(struct kvm_vcpu *vcpu,
3203 struct kvm_segment *var, int seg)
b6c7a5dc 3204{
14af3f3c 3205 kvm_x86_ops->get_segment(vcpu, var, seg);
b6c7a5dc
HB
3206}
3207
3208void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3209{
3210 struct kvm_segment cs;
3211
3e6e0aab 3212 kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
b6c7a5dc
HB
3213 *db = cs.db;
3214 *l = cs.l;
3215}
3216EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
3217
3218int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
3219 struct kvm_sregs *sregs)
3220{
3221 struct descriptor_table dt;
3222 int pending_vec;
3223
3224 vcpu_load(vcpu);
3225
3e6e0aab
GT
3226 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3227 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3228 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3229 kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3230 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3231 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
b6c7a5dc 3232
3e6e0aab
GT
3233 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3234 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
b6c7a5dc
HB
3235
3236 kvm_x86_ops->get_idt(vcpu, &dt);
3237 sregs->idt.limit = dt.limit;
3238 sregs->idt.base = dt.base;
3239 kvm_x86_ops->get_gdt(vcpu, &dt);
3240 sregs->gdt.limit = dt.limit;
3241 sregs->gdt.base = dt.base;
3242
3243 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
ad312c7c
ZX
3244 sregs->cr0 = vcpu->arch.cr0;
3245 sregs->cr2 = vcpu->arch.cr2;
3246 sregs->cr3 = vcpu->arch.cr3;
3247 sregs->cr4 = vcpu->arch.cr4;
2d3ad1f4 3248 sregs->cr8 = kvm_get_cr8(vcpu);
ad312c7c 3249 sregs->efer = vcpu->arch.shadow_efer;
b6c7a5dc
HB
3250 sregs->apic_base = kvm_get_apic_base(vcpu);
3251
3252 if (irqchip_in_kernel(vcpu->kvm)) {
3253 memset(sregs->interrupt_bitmap, 0,
3254 sizeof sregs->interrupt_bitmap);
3255 pending_vec = kvm_x86_ops->get_irq(vcpu);
3256 if (pending_vec >= 0)
3257 set_bit(pending_vec,
3258 (unsigned long *)sregs->interrupt_bitmap);
3259 } else
ad312c7c 3260 memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
b6c7a5dc
HB
3261 sizeof sregs->interrupt_bitmap);
3262
3263 vcpu_put(vcpu);
3264
3265 return 0;
3266}
3267
62d9f0db
MT
3268int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
3269 struct kvm_mp_state *mp_state)
3270{
3271 vcpu_load(vcpu);
3272 mp_state->mp_state = vcpu->arch.mp_state;
3273 vcpu_put(vcpu);
3274 return 0;
3275}
3276
3277int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
3278 struct kvm_mp_state *mp_state)
3279{
3280 vcpu_load(vcpu);
3281 vcpu->arch.mp_state = mp_state->mp_state;
3282 vcpu_put(vcpu);
3283 return 0;
3284}
3285
3e6e0aab 3286static void kvm_set_segment(struct kvm_vcpu *vcpu,
b6c7a5dc
HB
3287 struct kvm_segment *var, int seg)
3288{
14af3f3c 3289 kvm_x86_ops->set_segment(vcpu, var, seg);
b6c7a5dc
HB
3290}
3291
37817f29
IE
3292static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
3293 struct kvm_segment *kvm_desct)
3294{
3295 kvm_desct->base = seg_desc->base0;
3296 kvm_desct->base |= seg_desc->base1 << 16;
3297 kvm_desct->base |= seg_desc->base2 << 24;
3298 kvm_desct->limit = seg_desc->limit0;
3299 kvm_desct->limit |= seg_desc->limit << 16;
c93cd3a5
MT
3300 if (seg_desc->g) {
3301 kvm_desct->limit <<= 12;
3302 kvm_desct->limit |= 0xfff;
3303 }
37817f29
IE
3304 kvm_desct->selector = selector;
3305 kvm_desct->type = seg_desc->type;
3306 kvm_desct->present = seg_desc->p;
3307 kvm_desct->dpl = seg_desc->dpl;
3308 kvm_desct->db = seg_desc->d;
3309 kvm_desct->s = seg_desc->s;
3310 kvm_desct->l = seg_desc->l;
3311 kvm_desct->g = seg_desc->g;
3312 kvm_desct->avl = seg_desc->avl;
3313 if (!selector)
3314 kvm_desct->unusable = 1;
3315 else
3316 kvm_desct->unusable = 0;
3317 kvm_desct->padding = 0;
3318}
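/*
 * Descriptor unpacking sketch: the base is assembled from base0/base1/
 * base2 (bits 0-15, 16-23, 24-31) and the 20-bit limit from limit0 plus
 * the high nibble.  With the granularity bit set the limit counts 4K
 * units, hence the "<<= 12; |= 0xfff": a raw limit of 0xfffff with g=1
 * becomes 0xffffffff, i.e. a 4 GiB flat segment.
 */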
3319
3320static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu,
3321 u16 selector,
3322 struct descriptor_table *dtable)
3323{
3324 if (selector & 1 << 2) {
3325 struct kvm_segment kvm_seg;
3326
3e6e0aab 3327 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
37817f29
IE
3328
3329 if (kvm_seg.unusable)
3330 dtable->limit = 0;
3331 else
3332 dtable->limit = kvm_seg.limit;
3333 dtable->base = kvm_seg.base;
3334 }
3335 else
3336 kvm_x86_ops->get_gdt(vcpu, dtable);
3337}
3338
3339/* allowed just for 8-byte segments */
3340static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3341 struct desc_struct *seg_desc)
3342{
98899aa0 3343 gpa_t gpa;
37817f29
IE
3344 struct descriptor_table dtable;
3345 u16 index = selector >> 3;
3346
3347 get_segment_descritptor_dtable(vcpu, selector, &dtable);
3348
3349 if (dtable.limit < index * 8 + 7) {
3350 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
3351 return 1;
3352 }
98899aa0
MT
3353 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
3354 gpa += index * 8;
3355 return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8);
37817f29
IE
3356}
3357
3358/* allowed just for 8-byte segments */
3359static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3360 struct desc_struct *seg_desc)
3361{
98899aa0 3362 gpa_t gpa;
37817f29
IE
3363 struct descriptor_table dtable;
3364 u16 index = selector >> 3;
3365
3366 get_segment_descritptor_dtable(vcpu, selector, &dtable);
3367
3368 if (dtable.limit < index * 8 + 7)
3369 return 1;
98899aa0
MT
3370 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
3371 gpa += index * 8;
3372 return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8);
37817f29
IE
3373}
3374
3375static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
3376 struct desc_struct *seg_desc)
3377{
3378 u32 base_addr;
3379
3380 base_addr = seg_desc->base0;
3381 base_addr |= (seg_desc->base1 << 16);
3382 base_addr |= (seg_desc->base2 << 24);
3383
98899aa0 3384 return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
37817f29
IE
3385}
3386
37817f29
IE
3387static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
3388{
3389 struct kvm_segment kvm_seg;
3390
3e6e0aab 3391 kvm_get_segment(vcpu, &kvm_seg, seg);
37817f29
IE
3392 return kvm_seg.selector;
3393}
3394
3395static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
3396 u16 selector,
3397 struct kvm_segment *kvm_seg)
3398{
3399 struct desc_struct seg_desc;
3400
3401 if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))
3402 return 1;
3403 seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg);
3404 return 0;
3405}
3406
2259e3a7 3407static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
f4bbd9aa
AK
3408{
3409 struct kvm_segment segvar = {
3410 .base = selector << 4,
3411 .limit = 0xffff,
3412 .selector = selector,
3413 .type = 3,
3414 .present = 1,
3415 .dpl = 3,
3416 .db = 0,
3417 .s = 1,
3418 .l = 0,
3419 .g = 0,
3420 .avl = 0,
3421 .unusable = 0,
3422 };
3423 kvm_x86_ops->set_segment(vcpu, &segvar, seg);
3424 return 0;
3425}
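/*
 * Real-mode style segment load: the base is simply selector * 16 (e.g.
 * selector 0xf000 gives base 0xf0000), with a 64K limit and read/write
 * data type, mirroring how the CPU itself interprets selectors outside
 * protected mode.
 */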
3426
3e6e0aab
GT
3427int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3428 int type_bits, int seg)
37817f29
IE
3429{
3430 struct kvm_segment kvm_seg;
3431
f4bbd9aa
AK
3432 if (!(vcpu->arch.cr0 & X86_CR0_PE))
3433 return kvm_load_realmode_segment(vcpu, selector, seg);
37817f29
IE
3434 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
3435 return 1;
3436 kvm_seg.type |= type_bits;
3437
3438 if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS &&
3439 seg != VCPU_SREG_LDTR)
3440 if (!kvm_seg.s)
3441 kvm_seg.unusable = 1;
3442
3e6e0aab 3443 kvm_set_segment(vcpu, &kvm_seg, seg);
37817f29
IE
3444 return 0;
3445}
3446
3447static void save_state_to_tss32(struct kvm_vcpu *vcpu,
3448 struct tss_segment_32 *tss)
3449{
3450 tss->cr3 = vcpu->arch.cr3;
5fdbf976 3451 tss->eip = kvm_rip_read(vcpu);
37817f29 3452 tss->eflags = kvm_x86_ops->get_rflags(vcpu);
5fdbf976
MT
3453 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3454 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3455 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3456 tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3457 tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3458 tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3459 tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
3460 tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
37817f29
IE
3461 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3462 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
3463 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
3464 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
3465 tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
3466 tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
3467 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
3468 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
3469}
3470
3471static int load_state_from_tss32(struct kvm_vcpu *vcpu,
3472 struct tss_segment_32 *tss)
3473{
3474 kvm_set_cr3(vcpu, tss->cr3);
3475
5fdbf976 3476 kvm_rip_write(vcpu, tss->eip);
37817f29
IE
3477 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2);
3478
5fdbf976
MT
3479 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
3480 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
3481 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
3482 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
3483 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
3484 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
3485 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
3486 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
37817f29 3487
3e6e0aab 3488 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
37817f29
IE
3489 return 1;
3490
3e6e0aab 3491 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
37817f29
IE
3492 return 1;
3493
3e6e0aab 3494 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
37817f29
IE
3495 return 1;
3496
3e6e0aab 3497 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
37817f29
IE
3498 return 1;
3499
3e6e0aab 3500 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
37817f29
IE
3501 return 1;
3502
3e6e0aab 3503 if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
37817f29
IE
3504 return 1;
3505
3e6e0aab 3506 if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
37817f29
IE
3507 return 1;
3508 return 0;
3509}
3510
3511static void save_state_to_tss16(struct kvm_vcpu *vcpu,
3512 struct tss_segment_16 *tss)
3513{
5fdbf976 3514 tss->ip = kvm_rip_read(vcpu);
37817f29 3515 tss->flag = kvm_x86_ops->get_rflags(vcpu);
5fdbf976
MT
3516 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3517 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3518 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3519 tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3520 tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3521 tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3522 tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
3523 tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
37817f29
IE
3524
3525 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3526 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
3527 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
3528 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
3529 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
3530 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
3531}
3532
3533static int load_state_from_tss16(struct kvm_vcpu *vcpu,
3534 struct tss_segment_16 *tss)
3535{
5fdbf976 3536 kvm_rip_write(vcpu, tss->ip);
37817f29 3537 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2);
5fdbf976
MT
3538 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
3539 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
3540 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
3541 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
3542 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
3543 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
3544 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
3545 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
37817f29 3546
3e6e0aab 3547 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
37817f29
IE
3548 return 1;
3549
3e6e0aab 3550 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
37817f29
IE
3551 return 1;
3552
3e6e0aab 3553 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
37817f29
IE
3554 return 1;
3555
3e6e0aab 3556 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
37817f29
IE
3557 return 1;
3558
3e6e0aab 3559 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
37817f29
IE
3560 return 1;
3561 return 0;
3562}
3563
8b2cf73c 3564static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
34198bf8 3565 u32 old_tss_base,
37817f29
IE
3566 struct desc_struct *nseg_desc)
3567{
3568 struct tss_segment_16 tss_segment_16;
3569 int ret = 0;
3570
34198bf8
MT
3571 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
3572 sizeof tss_segment_16))
37817f29
IE
3573 goto out;
3574
3575 save_state_to_tss16(vcpu, &tss_segment_16);
37817f29 3576
34198bf8
MT
3577 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
3578 sizeof tss_segment_16))
37817f29 3579 goto out;
34198bf8
MT
3580
3581 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
3582 &tss_segment_16, sizeof tss_segment_16))
3583 goto out;
3584
37817f29
IE
3585 if (load_state_from_tss16(vcpu, &tss_segment_16))
3586 goto out;
3587
3588 ret = 1;
3589out:
3590 return ret;
3591}
3592
8b2cf73c 3593static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
34198bf8 3594 u32 old_tss_base,
37817f29
IE
3595 struct desc_struct *nseg_desc)
3596{
3597 struct tss_segment_32 tss_segment_32;
3598 int ret = 0;
3599
34198bf8
MT
3600 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
3601 sizeof tss_segment_32))
37817f29
IE
3602 goto out;
3603
3604 save_state_to_tss32(vcpu, &tss_segment_32);
37817f29 3605
34198bf8
MT
3606 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
3607 sizeof tss_segment_32))
3608 goto out;
3609
3610 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
3611 &tss_segment_32, sizeof tss_segment_32))
37817f29 3612 goto out;
34198bf8 3613
37817f29
IE
3614 if (load_state_from_tss32(vcpu, &tss_segment_32))
3615 goto out;
3616
3617 ret = 1;
3618out:
3619 return ret;
3620}
3621
3622int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3623{
3624 struct kvm_segment tr_seg;
3625 struct desc_struct cseg_desc;
3626 struct desc_struct nseg_desc;
3627 int ret = 0;
34198bf8
MT
3628 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
3629 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
37817f29 3630
34198bf8 3631 old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base);
37817f29 3632
34198bf8
MT
 3633 /* FIXME: Handle errors. Failure to read either TSS or its
 3634 * descriptor should generate a page fault.
3635 */
37817f29
IE
3636 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
3637 goto out;
3638
34198bf8 3639 if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
37817f29
IE
3640 goto out;
3641
37817f29
IE
3642 if (reason != TASK_SWITCH_IRET) {
3643 int cpl;
3644
3645 cpl = kvm_x86_ops->get_cpl(vcpu);
3646 if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
3647 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
3648 return 1;
3649 }
3650 }
3651
3652 if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) {
3653 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
3654 return 1;
3655 }
3656
3657 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
3fe913e7 3658 cseg_desc.type &= ~(1 << 1); //clear the B flag
34198bf8 3659 save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
37817f29
IE
3660 }
3661
3662 if (reason == TASK_SWITCH_IRET) {
3663 u32 eflags = kvm_x86_ops->get_rflags(vcpu);
3664 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
3665 }
3666
3667 kvm_x86_ops->skip_emulated_instruction(vcpu);
37817f29
IE
3668
3669 if (nseg_desc.type & 8)
34198bf8 3670 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,
37817f29
IE
3671 &nseg_desc);
3672 else
34198bf8 3673 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base,
37817f29
IE
3674 &nseg_desc);
3675
3676 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
3677 u32 eflags = kvm_x86_ops->get_rflags(vcpu);
3678 kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT);
3679 }
3680
3681 if (reason != TASK_SWITCH_IRET) {
3fe913e7 3682 nseg_desc.type |= (1 << 1);
37817f29
IE
3683 save_guest_segment_descriptor(vcpu, tss_selector,
3684 &nseg_desc);
3685 }
3686
3687 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
3688 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
3689 tr_seg.type = 11;
3e6e0aab 3690 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
37817f29 3691out:
37817f29
IE
3692 return ret;
3693}
3694EXPORT_SYMBOL_GPL(kvm_task_switch);
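/*
 * Summary of the flag/busy-bit handling in kvm_task_switch() above:
 * IRET and JMP clear the busy (B) bit of the outgoing task's descriptor,
 * IRET additionally clears EFLAGS.NT; CALL and task-gate entry set
 * EFLAGS.NT in the incoming context, and everything except IRET marks
 * the new descriptor busy.  CR0.TS is then set, as a hardware task
 * switch would also do.
 */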
3695
b6c7a5dc
HB
3696int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3697 struct kvm_sregs *sregs)
3698{
3699 int mmu_reset_needed = 0;
3700 int i, pending_vec, max_bits;
3701 struct descriptor_table dt;
3702
3703 vcpu_load(vcpu);
3704
3705 dt.limit = sregs->idt.limit;
3706 dt.base = sregs->idt.base;
3707 kvm_x86_ops->set_idt(vcpu, &dt);
3708 dt.limit = sregs->gdt.limit;
3709 dt.base = sregs->gdt.base;
3710 kvm_x86_ops->set_gdt(vcpu, &dt);
3711
ad312c7c
ZX
3712 vcpu->arch.cr2 = sregs->cr2;
3713 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
3714 vcpu->arch.cr3 = sregs->cr3;
b6c7a5dc 3715
2d3ad1f4 3716 kvm_set_cr8(vcpu, sregs->cr8);
b6c7a5dc 3717
ad312c7c 3718 mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
b6c7a5dc 3719 kvm_x86_ops->set_efer(vcpu, sregs->efer);
b6c7a5dc
HB
3720 kvm_set_apic_base(vcpu, sregs->apic_base);
3721
3722 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
3723
ad312c7c 3724 mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
b6c7a5dc 3725 kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
d7306163 3726 vcpu->arch.cr0 = sregs->cr0;
b6c7a5dc 3727
ad312c7c 3728 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
b6c7a5dc
HB
3729 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
3730 if (!is_long_mode(vcpu) && is_pae(vcpu))
ad312c7c 3731 load_pdptrs(vcpu, vcpu->arch.cr3);
b6c7a5dc
HB
3732
3733 if (mmu_reset_needed)
3734 kvm_mmu_reset_context(vcpu);
3735
3736 if (!irqchip_in_kernel(vcpu->kvm)) {
ad312c7c
ZX
3737 memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,
3738 sizeof vcpu->arch.irq_pending);
3739 vcpu->arch.irq_summary = 0;
3740 for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)
3741 if (vcpu->arch.irq_pending[i])
3742 __set_bit(i, &vcpu->arch.irq_summary);
b6c7a5dc
HB
3743 } else {
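		/* sizeof yields bytes; << 3 converts that to a bit count. */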
3744 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
3745 pending_vec = find_first_bit(
3746 (const unsigned long *)sregs->interrupt_bitmap,
3747 max_bits);
 3748		/* Only a pending external irq is handled here */
3749 if (pending_vec < max_bits) {
3750 kvm_x86_ops->set_irq(vcpu, pending_vec);
3751 pr_debug("Set back pending irq %d\n",
3752 pending_vec);
3753 }
e4825800 3754 kvm_pic_clear_isr_ack(vcpu->kvm);
b6c7a5dc
HB
3755 }
3756
3e6e0aab
GT
3757 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3758 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3759 kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3760 kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3761 kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3762 kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
b6c7a5dc 3763
3e6e0aab
GT
3764 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3765 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
b6c7a5dc 3766
9c3e4aab
MT
3767 /* Older userspace won't unhalt the vcpu on reset. */
3768 if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 &&
3769 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
3770 !(vcpu->arch.cr0 & X86_CR0_PE))
3771 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
3772
b6c7a5dc
HB
3773 vcpu_put(vcpu);
3774
3775 return 0;
3776}
3777
3778int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
3779 struct kvm_debug_guest *dbg)
3780{
3781 int r;
3782
3783 vcpu_load(vcpu);
3784
3785 r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
3786
3787 vcpu_put(vcpu);
3788
3789 return r;
3790}
3791
d0752060
HB
3792/*
3793 * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
3794 * we have asm/x86/processor.h
3795 */
3796struct fxsave {
3797 u16 cwd;
3798 u16 swd;
3799 u16 twd;
3800 u16 fop;
3801 u64 rip;
3802 u64 rdp;
3803 u32 mxcsr;
3804 u32 mxcsr_mask;
3805 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
3806#ifdef CONFIG_X86_64
3807 u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
3808#else
3809 u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
3810#endif
3811};
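/*
 * Note: this mirrors the leading part of the 512-byte FXSAVE memory image;
 * st_space and xmm_space hold the x87 and XMM registers in 16-byte slots.
 */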
3812
8b006791
ZX
3813/*
3814 * Translate a guest virtual address to a guest physical address.
3815 */
3816int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
3817 struct kvm_translation *tr)
3818{
3819 unsigned long vaddr = tr->linear_address;
3820 gpa_t gpa;
3821
3822 vcpu_load(vcpu);
72dc67a6 3823 down_read(&vcpu->kvm->slots_lock);
ad312c7c 3824 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
72dc67a6 3825 up_read(&vcpu->kvm->slots_lock);
8b006791
ZX
3826 tr->physical_address = gpa;
3827 tr->valid = gpa != UNMAPPED_GVA;
3828 tr->writeable = 1;
3829 tr->usermode = 0;
8b006791
ZX
3830 vcpu_put(vcpu);
3831
3832 return 0;
3833}
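/*
 * Illustrative sketch: userspace reaches the handler above through the
 * KVM_TRANSLATE vcpu ioctl, roughly:
 *
 *	struct kvm_translation tr = { .linear_address = gva };
 *	if (ioctl(vcpu_fd, KVM_TRANSLATE, &tr) == 0 && tr.valid)
 *		printf("gpa = 0x%llx\n", (unsigned long long)tr.physical_address);
 */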
3834
d0752060
HB
3835int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
3836{
ad312c7c 3837 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
d0752060
HB
3838
3839 vcpu_load(vcpu);
3840
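	/* struct kvm_fpu packs the eight x87 registers as 8 * 16 = 128 bytes. */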
3841 memcpy(fpu->fpr, fxsave->st_space, 128);
3842 fpu->fcw = fxsave->cwd;
3843 fpu->fsw = fxsave->swd;
3844 fpu->ftwx = fxsave->twd;
3845 fpu->last_opcode = fxsave->fop;
3846 fpu->last_ip = fxsave->rip;
3847 fpu->last_dp = fxsave->rdp;
3848 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
3849
3850 vcpu_put(vcpu);
3851
3852 return 0;
3853}
3854
3855int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
3856{
ad312c7c 3857 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
d0752060
HB
3858
3859 vcpu_load(vcpu);
3860
3861 memcpy(fxsave->st_space, fpu->fpr, 128);
3862 fxsave->cwd = fpu->fcw;
3863 fxsave->swd = fpu->fsw;
3864 fxsave->twd = fpu->ftwx;
3865 fxsave->fop = fpu->last_opcode;
3866 fxsave->rip = fpu->last_ip;
3867 fxsave->rdp = fpu->last_dp;
3868 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
3869
3870 vcpu_put(vcpu);
3871
3872 return 0;
3873}
3874
3875void fx_init(struct kvm_vcpu *vcpu)
3876{
3877 unsigned after_mxcsr_mask;
3878
bc1a34f1
AA
 3879	/*
 3880	 * Touch the FPU for the first time in a non-atomic context: if
 3881	 * this is the first FPU instruction, the exception handler will
 3882	 * fire before the instruction returns and will have to allocate
 3883	 * RAM with GFP_KERNEL.
 3884	 */
3885 if (!used_math())
d6e88aec 3886 kvm_fx_save(&vcpu->arch.host_fx_image);
bc1a34f1 3887
d0752060
HB
3888 /* Initialize guest FPU by resetting ours and saving into guest's */
3889 preempt_disable();
d6e88aec
AK
3890 kvm_fx_save(&vcpu->arch.host_fx_image);
3891 kvm_fx_finit();
3892 kvm_fx_save(&vcpu->arch.guest_fx_image);
3893 kvm_fx_restore(&vcpu->arch.host_fx_image);
d0752060
HB
3894 preempt_enable();
3895
ad312c7c 3896 vcpu->arch.cr0 |= X86_CR0_ET;
d0752060 3897 after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
ad312c7c
ZX
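	/* 0x1f80 is the MXCSR reset value: all SIMD exceptions masked. */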
3898 vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
3899 memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
d0752060
HB
3900 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
3901}
3902EXPORT_SYMBOL_GPL(fx_init);
3903
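/*
 * Lazy FPU switching: kvm_load_guest_fpu() swaps the guest FPU image in
 * only when needed, and kvm_put_guest_fpu() writes it back, restores the
 * host image and counts a reload.
 */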
3904void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
3905{
3906 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
3907 return;
3908
3909 vcpu->guest_fpu_loaded = 1;
d6e88aec
AK
3910 kvm_fx_save(&vcpu->arch.host_fx_image);
3911 kvm_fx_restore(&vcpu->arch.guest_fx_image);
d0752060
HB
3912}
3913EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
3914
3915void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
3916{
3917 if (!vcpu->guest_fpu_loaded)
3918 return;
3919
3920 vcpu->guest_fpu_loaded = 0;
d6e88aec
AK
3921 kvm_fx_save(&vcpu->arch.guest_fx_image);
3922 kvm_fx_restore(&vcpu->arch.host_fx_image);
f096ed85 3923 ++vcpu->stat.fpu_reload;
d0752060
HB
3924}
3925EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
e9b11c17
ZX
3926
3927void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
3928{
3929 kvm_x86_ops->vcpu_free(vcpu);
3930}
3931
3932struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
3933 unsigned int id)
3934{
26e5215f
AK
3935 return kvm_x86_ops->vcpu_create(kvm, id);
3936}
e9b11c17 3937
26e5215f
AK
3938int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
3939{
3940 int r;
e9b11c17
ZX
3941
3942 /* We do fxsave: this must be aligned. */
ad312c7c 3943 BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
e9b11c17
ZX
3944
3945 vcpu_load(vcpu);
3946 r = kvm_arch_vcpu_reset(vcpu);
3947 if (r == 0)
3948 r = kvm_mmu_setup(vcpu);
3949 vcpu_put(vcpu);
3950 if (r < 0)
3951 goto free_vcpu;
3952
26e5215f 3953 return 0;
e9b11c17
ZX
3954free_vcpu:
3955 kvm_x86_ops->vcpu_free(vcpu);
26e5215f 3956 return r;
e9b11c17
ZX
3957}
3958
d40ccc62 3959void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
e9b11c17
ZX
3960{
3961 vcpu_load(vcpu);
3962 kvm_mmu_unload(vcpu);
3963 vcpu_put(vcpu);
3964
3965 kvm_x86_ops->vcpu_free(vcpu);
3966}
3967
3968int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
3969{
448fa4a9
JK
3970 vcpu->arch.nmi_pending = false;
3971 vcpu->arch.nmi_injected = false;
3972
e9b11c17
ZX
3973 return kvm_x86_ops->vcpu_reset(vcpu);
3974}
3975
3976void kvm_arch_hardware_enable(void *garbage)
3977{
3978 kvm_x86_ops->hardware_enable(garbage);
3979}
3980
3981void kvm_arch_hardware_disable(void *garbage)
3982{
3983 kvm_x86_ops->hardware_disable(garbage);
3984}
3985
3986int kvm_arch_hardware_setup(void)
3987{
3988 return kvm_x86_ops->hardware_setup();
3989}
3990
3991void kvm_arch_hardware_unsetup(void)
3992{
3993 kvm_x86_ops->hardware_unsetup();
3994}
3995
3996void kvm_arch_check_processor_compat(void *rtn)
3997{
3998 kvm_x86_ops->check_processor_compatibility(rtn);
3999}
4000
4001int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
4002{
4003 struct page *page;
4004 struct kvm *kvm;
4005 int r;
4006
4007 BUG_ON(vcpu->kvm == NULL);
4008 kvm = vcpu->kvm;
4009
ad312c7c 4010 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
e9b11c17 4011 if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
a4535290 4012 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
e9b11c17 4013 else
a4535290 4014 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
e9b11c17
ZX
4015
4016 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
4017 if (!page) {
4018 r = -ENOMEM;
4019 goto fail;
4020 }
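	/* This zeroed page is the PIO data buffer exchanged with userspace. */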
ad312c7c 4021 vcpu->arch.pio_data = page_address(page);
e9b11c17
ZX
4022
4023 r = kvm_mmu_create(vcpu);
4024 if (r < 0)
4025 goto fail_free_pio_data;
4026
4027 if (irqchip_in_kernel(kvm)) {
4028 r = kvm_create_lapic(vcpu);
4029 if (r < 0)
4030 goto fail_mmu_destroy;
4031 }
4032
4033 return 0;
4034
4035fail_mmu_destroy:
4036 kvm_mmu_destroy(vcpu);
4037fail_free_pio_data:
ad312c7c 4038 free_page((unsigned long)vcpu->arch.pio_data);
e9b11c17
ZX
4039fail:
4040 return r;
4041}
4042
4043void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
4044{
4045 kvm_free_lapic(vcpu);
3200f405 4046 down_read(&vcpu->kvm->slots_lock);
e9b11c17 4047 kvm_mmu_destroy(vcpu);
3200f405 4048 up_read(&vcpu->kvm->slots_lock);
ad312c7c 4049 free_page((unsigned long)vcpu->arch.pio_data);
e9b11c17 4050}
d19a9cd2
ZX
4051
4052struct kvm *kvm_arch_create_vm(void)
4053{
4054 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
4055
4056 if (!kvm)
4057 return ERR_PTR(-ENOMEM);
4058
f05e70ac 4059 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
4d5c5d0f 4060 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
d19a9cd2 4061
5550af4d
SY
4062 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
4063 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
4064
d19a9cd2
ZX
4065 return kvm;
4066}
4067
4068static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
4069{
4070 vcpu_load(vcpu);
4071 kvm_mmu_unload(vcpu);
4072 vcpu_put(vcpu);
4073}
4074
4075static void kvm_free_vcpus(struct kvm *kvm)
4076{
4077 unsigned int i;
4078
4079 /*
4080 * Unpin any mmu pages first.
4081 */
4082 for (i = 0; i < KVM_MAX_VCPUS; ++i)
4083 if (kvm->vcpus[i])
4084 kvm_unload_vcpu_mmu(kvm->vcpus[i]);
4085 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
4086 if (kvm->vcpus[i]) {
4087 kvm_arch_vcpu_free(kvm->vcpus[i]);
4088 kvm->vcpus[i] = NULL;
4089 }
4090 }
4091
4092}
4093
4094void kvm_arch_destroy_vm(struct kvm *kvm)
4095{
62c476c7 4096 kvm_iommu_unmap_guest(kvm);
bfadaded 4097 kvm_free_all_assigned_devices(kvm);
7837699f 4098 kvm_free_pit(kvm);
d7deeeb0
ZX
4099 kfree(kvm->arch.vpic);
4100 kfree(kvm->arch.vioapic);
d19a9cd2
ZX
4101 kvm_free_vcpus(kvm);
4102 kvm_free_physmem(kvm);
3d45830c
AK
4103 if (kvm->arch.apic_access_page)
4104 put_page(kvm->arch.apic_access_page);
b7ebfb05
SY
4105 if (kvm->arch.ept_identity_pagetable)
4106 put_page(kvm->arch.ept_identity_pagetable);
d19a9cd2
ZX
4107 kfree(kvm);
4108}
0de10343
ZX
4109
4110int kvm_arch_set_memory_region(struct kvm *kvm,
4111 struct kvm_userspace_memory_region *mem,
4112 struct kvm_memory_slot old,
4113 int user_alloc)
4114{
4115 int npages = mem->memory_size >> PAGE_SHIFT;
4116 struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
4117
 4118	/* To keep backward compatibility with older userspace,
 4119	 * x86 needs to handle the !user_alloc case.
 4120	 */
4121 if (!user_alloc) {
4122 if (npages && !old.rmap) {
604b38ac
AA
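			/*
			 * Older userspace did not provide backing memory, so
			 * create an anonymous mapping in the current process
			 * to back the new slot.
			 */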
4123 unsigned long userspace_addr;
4124
72dc67a6 4125 down_write(&current->mm->mmap_sem);
604b38ac
AA
4126 userspace_addr = do_mmap(NULL, 0,
4127 npages * PAGE_SIZE,
4128 PROT_READ | PROT_WRITE,
acee3c04 4129 MAP_PRIVATE | MAP_ANONYMOUS,
604b38ac 4130 0);
72dc67a6 4131 up_write(&current->mm->mmap_sem);
0de10343 4132
604b38ac
AA
4133 if (IS_ERR((void *)userspace_addr))
4134 return PTR_ERR((void *)userspace_addr);
4135
4136 /* set userspace_addr atomically for kvm_hva_to_rmapp */
4137 spin_lock(&kvm->mmu_lock);
4138 memslot->userspace_addr = userspace_addr;
4139 spin_unlock(&kvm->mmu_lock);
0de10343
ZX
4140 } else {
4141 if (!old.user_alloc && old.rmap) {
4142 int ret;
4143
72dc67a6 4144 down_write(&current->mm->mmap_sem);
0de10343
ZX
4145 ret = do_munmap(current->mm, old.userspace_addr,
4146 old.npages * PAGE_SIZE);
72dc67a6 4147 up_write(&current->mm->mmap_sem);
0de10343
ZX
4148 if (ret < 0)
4149 printk(KERN_WARNING
4150 "kvm_vm_ioctl_set_memory_region: "
4151 "failed to munmap memory\n");
4152 }
4153 }
4154 }
4155
f05e70ac 4156 if (!kvm->arch.n_requested_mmu_pages) {
0de10343
ZX
4157 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
4158 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
4159 }
4160
4161 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
4162 kvm_flush_remote_tlbs(kvm);
4163
4164 return 0;
4165}
1d737c8a 4166
34d4cb8f
MT
4167void kvm_arch_flush_shadow(struct kvm *kvm)
4168{
4169 kvm_mmu_zap_all(kvm);
4170}
4171
1d737c8a
ZX
4172int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
4173{
a4535290 4174 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
0496fbb9
JK
4175 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
4176 || vcpu->arch.nmi_pending;
1d737c8a 4177}
5736199a
ZX
4178
4179static void vcpu_kick_intr(void *info)
4180{
4181#ifdef DEBUG
4182 struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
 4183	printk(KERN_DEBUG "vcpu_kick_intr %p\n", vcpu);
4184#endif
4185}
4186
4187void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
4188{
4189 int ipi_pcpu = vcpu->cpu;
e9571ed5 4190 int cpu = get_cpu();
5736199a
ZX
4191
4192 if (waitqueue_active(&vcpu->wq)) {
4193 wake_up_interruptible(&vcpu->wq);
4194 ++vcpu->stat.halt_wakeup;
4195 }
e9571ed5
MT
 4196	/*
 4197	 * We may be called synchronously with irqs disabled in guest mode,
 4198	 * so there is no need to call smp_call_function_single() in that case.
 4199	 */
4200 if (vcpu->guest_mode && vcpu->cpu != cpu)
8691e5a8 4201 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0);
e9571ed5 4202 put_cpu();
5736199a 4203}