/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "irq.h"
#include "mmu.h"

#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/moduleparam.h>
#include <linux/ftrace_event.h>
#include <linux/slab.h>
#include "kvm_cache_regs.h"
#include "x86.h"

#include <asm/io.h>
#include <asm/desc.h>
#include <asm/vmx.h>
#include <asm/virtext.h>
#include <asm/mce.h>

#include "trace.h"

#define __ex(x) __kvm_handle_fault_on_reboot(x)

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

static int __read_mostly bypass_guest_pf = 1;
module_param(bypass_guest_pf, bool, S_IRUGO);

static int __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);

static int __read_mostly flexpriority_enabled = 1;
module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);

static int __read_mostly enable_ept = 1;
module_param_named(ept, enable_ept, bool, S_IRUGO);

static int __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
			enable_unrestricted_guest, bool, S_IRUGO);

static int __read_mostly emulate_invalid_guest_state = 0;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);

#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST				\
	(X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
#define KVM_GUEST_CR0_MASK						\
	(KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST				\
	(X86_CR0_WP | X86_CR0_NE)
#define KVM_VM_CR0_ALWAYS_ON						\
	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
#define KVM_CR4_GUEST_OWNED_BITS					\
	(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR	\
	 | X86_CR4_OSXMMEXCPT)

#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)

#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))

/*
 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
 * ple_gap:    upper bound on the amount of time between two successive
 *             executions of PAUSE in a loop. Also indicates whether PLE is
 *             enabled. According to test, this time is usually smaller than
 *             41 cycles.
 * ple_window: upper bound on the amount of time a guest is allowed to execute
 *             in a PAUSE loop. Tests indicate that most spinlocks are held for
 *             less than 2^12 cycles.
 * Time is measured based on a counter that runs at the same rate as the TSC,
 * refer SDM volume 3b section 21.6.13 & 22.1.3.
 */
#define KVM_VMX_DEFAULT_PLE_GAP    41
#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
module_param(ple_gap, int, S_IRUGO);

static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, int, S_IRUGO);

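/*
 * Number of MSR slots available for the atomic MSR switch facility below
 * (add_atomic_switch_msr()/clear_atomic_switch_msr()), which programs the
 * VMCS MSR-load areas so the CPU itself swaps the values on every VM entry
 * and exit.
 */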
#define NR_AUTOLOAD_MSRS 1

struct vmcs {
	u32 revision_id;
	u32 abort;
	char data[0];
};

struct shared_msr_entry {
	unsigned index;
	u64 data;
	u64 mask;
};

struct vcpu_vmx {
	struct kvm_vcpu vcpu;
	struct list_head local_vcpus_link;
	unsigned long host_rsp;
	int launched;
	u8 fail;
	u32 idt_vectoring_info;
	struct shared_msr_entry *guest_msrs;
	int nmsrs;
	int save_nmsrs;
#ifdef CONFIG_X86_64
	u64 msr_host_kernel_gs_base;
	u64 msr_guest_kernel_gs_base;
#endif
	struct vmcs *vmcs;
	struct msr_autoload {
		unsigned nr;
		struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
		struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
	} msr_autoload;
	struct {
		int loaded;
		u16 fs_sel, gs_sel, ldt_sel;
		int gs_ldt_reload_needed;
		int fs_reload_needed;
	} host_state;
	struct {
		int vm86_active;
		ulong save_rflags;
		struct kvm_save_segment {
			u16 selector;
			unsigned long base;
			u32 limit;
			u32 ar;
		} tr, es, ds, fs, gs;
		struct {
			bool pending;
			u8 vector;
			unsigned rip;
		} irq;
	} rmode;
	int vpid;
	bool emulation_required;

	/* Support for vnmi-less CPUs */
	int soft_vnmi_blocked;
	ktime_t entry_time;
	s64 vnmi_blocked_time;
	u32 exit_reason;

	bool rdtscp_enabled;
};

static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
{
	return container_of(vcpu, struct vcpu_vmx, vcpu);
}

static int init_rmode(struct kvm *kvm);
static u64 construct_eptp(unsigned long root_hpa);

static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);

static unsigned long *vmx_io_bitmap_a;
static unsigned long *vmx_io_bitmap_b;
static unsigned long *vmx_msr_bitmap_legacy;
static unsigned long *vmx_msr_bitmap_longmode;

static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);

static struct vmcs_config {
	int size;
	int order;
	u32 revision_id;
	u32 pin_based_exec_ctrl;
	u32 cpu_based_exec_ctrl;
	u32 cpu_based_2nd_exec_ctrl;
	u32 vmexit_ctrl;
	u32 vmentry_ctrl;
} vmcs_config;

static struct vmx_capability {
	u32 ept;
	u32 vpid;
} vmx_capability;

#define VMX_SEGMENT_FIELD(seg)					\
	[VCPU_SREG_##seg] = {					\
		.selector = GUEST_##seg##_SELECTOR,		\
		.base = GUEST_##seg##_BASE,			\
		.limit = GUEST_##seg##_LIMIT,			\
		.ar_bytes = GUEST_##seg##_AR_BYTES,		\
	}

static struct kvm_vmx_segment_field {
	unsigned selector;
	unsigned base;
	unsigned limit;
	unsigned ar_bytes;
} kvm_vmx_segment_fields[] = {
	VMX_SEGMENT_FIELD(CS),
	VMX_SEGMENT_FIELD(DS),
	VMX_SEGMENT_FIELD(ES),
	VMX_SEGMENT_FIELD(FS),
	VMX_SEGMENT_FIELD(GS),
	VMX_SEGMENT_FIELD(SS),
	VMX_SEGMENT_FIELD(TR),
	VMX_SEGMENT_FIELD(LDTR),
};

static u64 host_efer;

static void ept_save_pdptrs(struct kvm_vcpu *vcpu);

/*
 * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
 * away by decrementing the array size.
 */
static const u32 vmx_msr_index[] = {
#ifdef CONFIG_X86_64
	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
	MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR,
};
#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)

static inline bool is_page_fault(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
			     INTR_INFO_VALID_MASK)) ==
		(INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
}

static inline bool is_no_device(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
			     INTR_INFO_VALID_MASK)) ==
		(INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
}

static inline bool is_invalid_opcode(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
			     INTR_INFO_VALID_MASK)) ==
		(INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
}

static inline bool is_external_interrupt(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
		== (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
}

static inline bool is_machine_check(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
			     INTR_INFO_VALID_MASK)) ==
		(INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
}

static inline bool cpu_has_vmx_msr_bitmap(void)
{
	return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
}

static inline bool cpu_has_vmx_tpr_shadow(void)
{
	return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
}

static inline bool vm_need_tpr_shadow(struct kvm *kvm)
{
	return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
}

static inline bool cpu_has_secondary_exec_ctrls(void)
{
	return vmcs_config.cpu_based_exec_ctrl &
		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
}

static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
}

static inline bool cpu_has_vmx_flexpriority(void)
{
	return cpu_has_vmx_tpr_shadow() &&
		cpu_has_vmx_virtualize_apic_accesses();
}

static inline bool cpu_has_vmx_ept_execute_only(void)
{
	return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
}

static inline bool cpu_has_vmx_eptp_uncacheable(void)
{
	return vmx_capability.ept & VMX_EPTP_UC_BIT;
}

static inline bool cpu_has_vmx_eptp_writeback(void)
{
	return vmx_capability.ept & VMX_EPTP_WB_BIT;
}

static inline bool cpu_has_vmx_ept_2m_page(void)
{
	return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
}

static inline bool cpu_has_vmx_ept_1g_page(void)
{
	return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
}

static inline bool cpu_has_vmx_invept_individual_addr(void)
{
	return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
}

static inline bool cpu_has_vmx_invept_context(void)
{
	return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
}

static inline bool cpu_has_vmx_invept_global(void)
{
	return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
}

static inline bool cpu_has_vmx_ept(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_ENABLE_EPT;
}

static inline bool cpu_has_vmx_unrestricted_guest(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_UNRESTRICTED_GUEST;
}

static inline bool cpu_has_vmx_ple(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_PAUSE_LOOP_EXITING;
}

static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
{
	return flexpriority_enabled && irqchip_in_kernel(kvm);
}

static inline bool cpu_has_vmx_vpid(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_ENABLE_VPID;
}

static inline bool cpu_has_vmx_rdtscp(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_RDTSCP;
}

static inline bool cpu_has_virtual_nmis(void)
{
	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
}

static inline bool report_flexpriority(void)
{
	return flexpriority_enabled;
}

static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
{
	int i;

	for (i = 0; i < vmx->nmsrs; ++i)
		if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
			return i;
	return -1;
}

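/*
 * Issue the INVVPID instruction.  The 128-bit operand carries the VPID in
 * its low 16 bits and, for the address-specific invalidation type, the
 * guest-linear address to flush.
 */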
static inline void __invvpid(int ext, u16 vpid, gva_t gva)
{
	struct {
		u64 vpid : 16;
		u64 rsvd : 48;
		u64 gva;
	} operand = { vpid, 0, gva };

	asm volatile (__ex(ASM_VMX_INVVPID)
		      /* CF==1 or ZF==1 --> rc = -1 */
		      "; ja 1f ; ud2 ; 1:"
		      : : "a"(&operand), "c"(ext) : "cc", "memory");
}

static inline void __invept(int ext, u64 eptp, gpa_t gpa)
{
	struct {
		u64 eptp, gpa;
	} operand = {eptp, gpa};

	asm volatile (__ex(ASM_VMX_INVEPT)
		      /* CF==1 or ZF==1 --> rc = -1 */
		      "; ja 1f ; ud2 ; 1:\n"
		      : : "a" (&operand), "c" (ext) : "cc", "memory");
}

static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
{
	int i;

	i = __find_msr_index(vmx, msr);
	if (i >= 0)
		return &vmx->guest_msrs[i];
	return NULL;
}

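/*
 * VMCLEAR the VMCS at the given address: flush any cached VMCS state back
 * to memory and clear its launch state.  A failure is only reported via
 * printk.
 */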
static void vmcs_clear(struct vmcs *vmcs)
{
	u64 phys_addr = __pa(vmcs);
	u8 error;

	asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
		      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
		      : "cc", "memory");
	if (error)
		printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
		       vmcs, phys_addr);
}

static void __vcpu_clear(void *arg)
{
	struct vcpu_vmx *vmx = arg;
	int cpu = raw_smp_processor_id();

	if (vmx->vcpu.cpu == cpu)
		vmcs_clear(vmx->vmcs);
	if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
		per_cpu(current_vmcs, cpu) = NULL;
	rdtscll(vmx->vcpu.arch.host_tsc);
	list_del(&vmx->local_vcpus_link);
	vmx->vcpu.cpu = -1;
	vmx->launched = 0;
}

static void vcpu_clear(struct vcpu_vmx *vmx)
{
	if (vmx->vcpu.cpu == -1)
		return;
	smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
}

static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
{
	if (vmx->vpid == 0)
		return;

	__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
}

static inline void ept_sync_global(void)
{
	if (cpu_has_vmx_invept_global())
		__invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
}

static inline void ept_sync_context(u64 eptp)
{
	if (enable_ept) {
		if (cpu_has_vmx_invept_context())
			__invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
		else
			ept_sync_global();
	}
}

static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
{
	if (enable_ept) {
		if (cpu_has_vmx_invept_individual_addr())
			__invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
				 eptp, gpa);
		else
			ept_sync_context(eptp);
	}
}

static unsigned long vmcs_readl(unsigned long field)
{
	unsigned long value;

	asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
		      : "=a"(value) : "d"(field) : "cc");
	return value;
}

static u16 vmcs_read16(unsigned long field)
{
	return vmcs_readl(field);
}

static u32 vmcs_read32(unsigned long field)
{
	return vmcs_readl(field);
}

static u64 vmcs_read64(unsigned long field)
{
#ifdef CONFIG_X86_64
	return vmcs_readl(field);
#else
	return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
#endif
}

static noinline void vmwrite_error(unsigned long field, unsigned long value)
{
	printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
	       field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
	dump_stack();
}

static void vmcs_writel(unsigned long field, unsigned long value)
{
	u8 error;

	asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
		      : "=q"(error) : "a"(value), "d"(field) : "cc");
	if (unlikely(error))
		vmwrite_error(field, value);
}

static void vmcs_write16(unsigned long field, u16 value)
{
	vmcs_writel(field, value);
}

static void vmcs_write32(unsigned long field, u32 value)
{
	vmcs_writel(field, value);
}

static void vmcs_write64(unsigned long field, u64 value)
{
	vmcs_writel(field, value);
#ifndef CONFIG_X86_64
	asm volatile ("");
	vmcs_writel(field+1, value >> 32);
#endif
}

static void vmcs_clear_bits(unsigned long field, u32 mask)
{
	vmcs_writel(field, vmcs_readl(field) & ~mask);
}

static void vmcs_set_bits(unsigned long field, u32 mask)
{
	vmcs_writel(field, vmcs_readl(field) | mask);
}

static void update_exception_bitmap(struct kvm_vcpu *vcpu)
{
	u32 eb;

	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
	     (1u << NM_VECTOR) | (1u << DB_VECTOR);
	if ((vcpu->guest_debug &
	     (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
	    (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
		eb |= 1u << BP_VECTOR;
	if (to_vmx(vcpu)->rmode.vm86_active)
		eb = ~0;
	if (enable_ept)
		eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
	if (vcpu->fpu_active)
		eb &= ~(1u << NM_VECTOR);
	vmcs_write32(EXCEPTION_BITMAP, eb);
}

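/*
 * Remove an MSR from the lists that are loaded automatically on VM entry
 * (guest values) and VM exit (host values), compacting the arrays and
 * updating the VMCS MSR-load counts.
 */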
static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
{
	unsigned i;
	struct msr_autoload *m = &vmx->msr_autoload;

	for (i = 0; i < m->nr; ++i)
		if (m->guest[i].index == msr)
			break;

	if (i == m->nr)
		return;
	--m->nr;
	m->guest[i] = m->guest[m->nr];
	m->host[i] = m->host[m->nr];
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
}

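/*
 * Program the CPU to load 'guest_val' into the MSR on every VM entry and
 * 'host_val' on every VM exit.  If the MSR is already in the autoload
 * lists, only its values are updated.
 */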
static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
				  u64 guest_val, u64 host_val)
{
	unsigned i;
	struct msr_autoload *m = &vmx->msr_autoload;

	for (i = 0; i < m->nr; ++i)
		if (m->guest[i].index == msr)
			break;

	if (i == m->nr) {
		++m->nr;
		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
	}

	m->guest[i].index = msr;
	m->guest[i].value = guest_val;
	m->host[i].index = msr;
	m->host[i].value = host_val;
}

static void reload_tss(void)
{
	/*
	 * VT restores TR but not its size.  Useless.
	 */
	struct desc_ptr gdt;
	struct desc_struct *descs;

	native_store_gdt(&gdt);
	descs = (void *)gdt.address;
	descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
	load_TR_desc();
}

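/*
 * Prepare the guest EFER image used for the MSR switch: bits that are
 * emulated (NX) or handled by hardware (LMA/LME, and SCE outside long
 * mode) are taken from the host value and excluded from the switch mask.
 */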
static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
{
	u64 guest_efer;
	u64 ignore_bits;

	guest_efer = vmx->vcpu.arch.efer;

	/*
	 * NX is emulated; LMA and LME handled by hardware; SCE meaningless
	 * outside long mode
	 */
	ignore_bits = EFER_NX | EFER_SCE;
#ifdef CONFIG_X86_64
	ignore_bits |= EFER_LMA | EFER_LME;
	/* SCE is meaningful only in long mode on Intel */
	if (guest_efer & EFER_LMA)
		ignore_bits &= ~(u64)EFER_SCE;
#endif
	guest_efer &= ~ignore_bits;
	guest_efer |= host_efer & ignore_bits;
	vmx->guest_msrs[efer_offset].data = guest_efer;
	vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
	return true;
}

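/*
 * Return the base address of the segment described by 'selector', walking
 * the GDT or, for LDT-relative selectors, the LDT.
 */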
static unsigned long segment_base(u16 selector)
{
	struct desc_ptr gdt;
	struct desc_struct *d;
	unsigned long table_base;
	unsigned long v;

	if (!(selector & ~3))
		return 0;

	native_store_gdt(&gdt);
	table_base = gdt.address;

	if (selector & 4) {		/* from ldt */
		u16 ldt_selector = kvm_read_ldt();

		if (!(ldt_selector & ~3))
			return 0;

		table_base = segment_base(ldt_selector);
	}
	d = (struct desc_struct *)(table_base + (selector & ~7));
	v = get_desc_base(d);
#ifdef CONFIG_X86_64
	if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
		v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
#endif
	return v;
}

static inline unsigned long kvm_read_tr_base(void)
{
	u16 tr;
	asm("str %0" : "=g"(tr));
	return segment_base(tr);
}

static void vmx_save_host_state(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int i;

	if (vmx->host_state.loaded)
		return;

	vmx->host_state.loaded = 1;
	/*
	 * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
	 * allow segment selectors with cpl > 0 or ti == 1.
	 */
	vmx->host_state.ldt_sel = kvm_read_ldt();
	vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
	vmx->host_state.fs_sel = kvm_read_fs();
	if (!(vmx->host_state.fs_sel & 7)) {
		vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
		vmx->host_state.fs_reload_needed = 0;
	} else {
		vmcs_write16(HOST_FS_SELECTOR, 0);
		vmx->host_state.fs_reload_needed = 1;
	}
	vmx->host_state.gs_sel = kvm_read_gs();
	if (!(vmx->host_state.gs_sel & 7))
		vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
	else {
		vmcs_write16(HOST_GS_SELECTOR, 0);
		vmx->host_state.gs_ldt_reload_needed = 1;
	}

#ifdef CONFIG_X86_64
	vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
	vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
#else
	vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
	vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
#endif

#ifdef CONFIG_X86_64
	if (is_long_mode(&vmx->vcpu)) {
		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
		wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
	}
#endif
	for (i = 0; i < vmx->save_nmsrs; ++i)
		kvm_set_shared_msr(vmx->guest_msrs[i].index,
				   vmx->guest_msrs[i].data,
				   vmx->guest_msrs[i].mask);
}

static void __vmx_load_host_state(struct vcpu_vmx *vmx)
{
	unsigned long flags;

	if (!vmx->host_state.loaded)
		return;

	++vmx->vcpu.stat.host_state_reload;
	vmx->host_state.loaded = 0;
	if (vmx->host_state.fs_reload_needed)
		kvm_load_fs(vmx->host_state.fs_sel);
	if (vmx->host_state.gs_ldt_reload_needed) {
		kvm_load_ldt(vmx->host_state.ldt_sel);
		/*
		 * If we have to reload gs, we must take care to
		 * preserve our gs base.
		 */
		local_irq_save(flags);
		kvm_load_gs(vmx->host_state.gs_sel);
#ifdef CONFIG_X86_64
		wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
#endif
		local_irq_restore(flags);
	}
	reload_tss();
#ifdef CONFIG_X86_64
	if (is_long_mode(&vmx->vcpu)) {
		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
		wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
	}
#endif
}

static void vmx_load_host_state(struct vcpu_vmx *vmx)
{
	preempt_disable();
	__vmx_load_host_state(vmx);
	preempt_enable();
}

/*
 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
 * vcpu mutex is already taken.
 */
static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 phys_addr = __pa(vmx->vmcs);
	u64 tsc_this, delta, new_offset;

	if (vcpu->cpu != cpu) {
		vcpu_clear(vmx);
		kvm_migrate_timers(vcpu);
		set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
		local_irq_disable();
		list_add(&vmx->local_vcpus_link,
			 &per_cpu(vcpus_on_cpu, cpu));
		local_irq_enable();
	}

	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
		u8 error;

		per_cpu(current_vmcs, cpu) = vmx->vmcs;
		asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
			      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
			      : "cc");
		if (error)
			printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
			       vmx->vmcs, phys_addr);
	}

	if (vcpu->cpu != cpu) {
		struct desc_ptr dt;
		unsigned long sysenter_esp;

		vcpu->cpu = cpu;
		/*
		 * Linux uses per-cpu TSS and GDT, so set these when switching
		 * processors.
		 */
		vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
		native_store_gdt(&dt);
		vmcs_writel(HOST_GDTR_BASE, dt.address);   /* 22.2.4 */

		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */

		/*
		 * Make sure the time stamp counter is monotonic.
		 */
		rdtscll(tsc_this);
		if (tsc_this < vcpu->arch.host_tsc) {
			delta = vcpu->arch.host_tsc - tsc_this;
			new_offset = vmcs_read64(TSC_OFFSET) + delta;
			vmcs_write64(TSC_OFFSET, new_offset);
		}
	}
}

static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
	__vmx_load_host_state(to_vmx(vcpu));
}

static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
{
	ulong cr0;

	if (vcpu->fpu_active)
		return;
	vcpu->fpu_active = 1;
	cr0 = vmcs_readl(GUEST_CR0);
	cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
	cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
	vmcs_writel(GUEST_CR0, cr0);
	update_exception_bitmap(vcpu);
	vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
}

static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);

static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
{
	vmx_decache_cr0_guest_bits(vcpu);
	vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
	update_exception_bitmap(vcpu);
	vcpu->arch.cr0_guest_owned_bits = 0;
	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
	vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
}

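/*
 * While the guest runs under vm86 real-mode emulation, IOPL and VM in the
 * hardware RFLAGS belong to the monitor; the guest-visible copies of those
 * bits live in rmode.save_rflags and are merged in here.
 */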
static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
	unsigned long rflags, save_rflags;

	rflags = vmcs_readl(GUEST_RFLAGS);
	if (to_vmx(vcpu)->rmode.vm86_active) {
		rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
		save_rflags = to_vmx(vcpu)->rmode.save_rflags;
		rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
	}
	return rflags;
}

static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
	if (to_vmx(vcpu)->rmode.vm86_active) {
		to_vmx(vcpu)->rmode.save_rflags = rflags;
		rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
	}
	vmcs_writel(GUEST_RFLAGS, rflags);
}

static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
	u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
	int ret = 0;

	if (interruptibility & GUEST_INTR_STATE_STI)
		ret |= KVM_X86_SHADOW_INT_STI;
	if (interruptibility & GUEST_INTR_STATE_MOV_SS)
		ret |= KVM_X86_SHADOW_INT_MOV_SS;

	return ret & mask;
}

static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
	u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
	u32 interruptibility = interruptibility_old;

	interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);

	if (mask & KVM_X86_SHADOW_INT_MOV_SS)
		interruptibility |= GUEST_INTR_STATE_MOV_SS;
	else if (mask & KVM_X86_SHADOW_INT_STI)
		interruptibility |= GUEST_INTR_STATE_STI;

	if ((interruptibility != interruptibility_old))
		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
}

static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
	unsigned long rip;

	rip = kvm_rip_read(vcpu);
	rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
	kvm_rip_write(vcpu, rip);

	/* skipping an emulated instruction also counts */
	vmx_set_interrupt_shadow(vcpu, 0);
}

static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
				bool has_error_code, u32 error_code,
				bool reinject)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 intr_info = nr | INTR_INFO_VALID_MASK;

	if (has_error_code) {
		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
	}

	if (vmx->rmode.vm86_active) {
		vmx->rmode.irq.pending = true;
		vmx->rmode.irq.vector = nr;
		vmx->rmode.irq.rip = kvm_rip_read(vcpu);
		if (kvm_exception_is_soft(nr))
			vmx->rmode.irq.rip +=
				vmx->vcpu.arch.event_exit_inst_len;
		intr_info |= INTR_TYPE_SOFT_INTR;
		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
		kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
		return;
	}

	if (kvm_exception_is_soft(nr)) {
		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
			     vmx->vcpu.arch.event_exit_inst_len);
		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
	} else
		intr_info |= INTR_TYPE_HARD_EXCEPTION;

	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
}

static bool vmx_rdtscp_supported(void)
{
	return cpu_has_vmx_rdtscp();
}

/*
 * Swap MSR entry in host/guest MSR entry array.
 */
static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
{
	struct shared_msr_entry tmp;

	tmp = vmx->guest_msrs[to];
	vmx->guest_msrs[to] = vmx->guest_msrs[from];
	vmx->guest_msrs[from] = tmp;
}

/*
 * Set up the vmcs to automatically save and restore system
 * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
 * mode, as fiddling with msrs is very expensive.
 */
static void setup_msrs(struct vcpu_vmx *vmx)
{
	int save_nmsrs, index;
	unsigned long *msr_bitmap;

	vmx_load_host_state(vmx);
	save_nmsrs = 0;
#ifdef CONFIG_X86_64
	if (is_long_mode(&vmx->vcpu)) {
		index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
		if (index >= 0)
			move_msr_up(vmx, index, save_nmsrs++);
		index = __find_msr_index(vmx, MSR_LSTAR);
		if (index >= 0)
			move_msr_up(vmx, index, save_nmsrs++);
		index = __find_msr_index(vmx, MSR_CSTAR);
		if (index >= 0)
			move_msr_up(vmx, index, save_nmsrs++);
		index = __find_msr_index(vmx, MSR_TSC_AUX);
		if (index >= 0 && vmx->rdtscp_enabled)
			move_msr_up(vmx, index, save_nmsrs++);
		/*
		 * MSR_K6_STAR is only needed on long mode guests, and only
		 * if efer.sce is enabled.
		 */
		index = __find_msr_index(vmx, MSR_K6_STAR);
		if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
			move_msr_up(vmx, index, save_nmsrs++);
	}
#endif
	index = __find_msr_index(vmx, MSR_EFER);
	if (index >= 0 && update_transition_efer(vmx, index))
		move_msr_up(vmx, index, save_nmsrs++);

	vmx->save_nmsrs = save_nmsrs;

	if (cpu_has_vmx_msr_bitmap()) {
		if (is_long_mode(&vmx->vcpu))
			msr_bitmap = vmx_msr_bitmap_longmode;
		else
			msr_bitmap = vmx_msr_bitmap_legacy;

		vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
	}
}

/*
 * reads and returns guest's timestamp counter "register"
 * guest_tsc = host_tsc + tsc_offset    -- 21.3
 */
static u64 guest_read_tsc(void)
{
	u64 host_tsc, tsc_offset;

	rdtscll(host_tsc);
	tsc_offset = vmcs_read64(TSC_OFFSET);
	return host_tsc + tsc_offset;
}

/*
 * writes 'guest_tsc' into guest's timestamp counter "register"
 * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
 */
static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
{
	vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
}

/*
 * Reads an msr value (of 'msr_index') into 'pdata'.
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
{
	u64 data;
	struct shared_msr_entry *msr;

	if (!pdata) {
		printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
		return -EINVAL;
	}

	switch (msr_index) {
#ifdef CONFIG_X86_64
	case MSR_FS_BASE:
		data = vmcs_readl(GUEST_FS_BASE);
		break;
	case MSR_GS_BASE:
		data = vmcs_readl(GUEST_GS_BASE);
		break;
	case MSR_KERNEL_GS_BASE:
		vmx_load_host_state(to_vmx(vcpu));
		data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
		break;
#endif
	case MSR_EFER:
		return kvm_get_msr_common(vcpu, msr_index, pdata);
	case MSR_IA32_TSC:
		data = guest_read_tsc();
		break;
	case MSR_IA32_SYSENTER_CS:
		data = vmcs_read32(GUEST_SYSENTER_CS);
		break;
	case MSR_IA32_SYSENTER_EIP:
		data = vmcs_readl(GUEST_SYSENTER_EIP);
		break;
	case MSR_IA32_SYSENTER_ESP:
		data = vmcs_readl(GUEST_SYSENTER_ESP);
		break;
	case MSR_TSC_AUX:
		if (!to_vmx(vcpu)->rdtscp_enabled)
			return 1;
		/* Otherwise falls through */
	default:
		vmx_load_host_state(to_vmx(vcpu));
		msr = find_msr_entry(to_vmx(vcpu), msr_index);
		if (msr) {
			vmx_load_host_state(to_vmx(vcpu));
			data = msr->data;
			break;
		}
		return kvm_get_msr_common(vcpu, msr_index, pdata);
	}

	*pdata = data;
	return 0;
}

/*
 * Writes msr value into the appropriate "register".
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct shared_msr_entry *msr;
	u64 host_tsc;
	int ret = 0;

	switch (msr_index) {
	case MSR_EFER:
		vmx_load_host_state(vmx);
		ret = kvm_set_msr_common(vcpu, msr_index, data);
		break;
#ifdef CONFIG_X86_64
	case MSR_FS_BASE:
		vmcs_writel(GUEST_FS_BASE, data);
		break;
	case MSR_GS_BASE:
		vmcs_writel(GUEST_GS_BASE, data);
		break;
	case MSR_KERNEL_GS_BASE:
		vmx_load_host_state(vmx);
		vmx->msr_guest_kernel_gs_base = data;
		break;
#endif
	case MSR_IA32_SYSENTER_CS:
		vmcs_write32(GUEST_SYSENTER_CS, data);
		break;
	case MSR_IA32_SYSENTER_EIP:
		vmcs_writel(GUEST_SYSENTER_EIP, data);
		break;
	case MSR_IA32_SYSENTER_ESP:
		vmcs_writel(GUEST_SYSENTER_ESP, data);
		break;
	case MSR_IA32_TSC:
		rdtscll(host_tsc);
		guest_write_tsc(data, host_tsc);
		break;
	case MSR_IA32_CR_PAT:
		if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
			vmcs_write64(GUEST_IA32_PAT, data);
			vcpu->arch.pat = data;
			break;
		}
		ret = kvm_set_msr_common(vcpu, msr_index, data);
		break;
	case MSR_TSC_AUX:
		if (!vmx->rdtscp_enabled)
			return 1;
		/* Check reserved bit, higher 32 bits should be zero */
		if ((data >> 32) != 0)
			return 1;
		/* Otherwise falls through */
	default:
		msr = find_msr_entry(vmx, msr_index);
		if (msr) {
			vmx_load_host_state(vmx);
			msr->data = data;
			break;
		}
		ret = kvm_set_msr_common(vcpu, msr_index, data);
	}

	return ret;
}

static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
	switch (reg) {
	case VCPU_REGS_RSP:
		vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
		break;
	case VCPU_REGS_RIP:
		vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
		break;
	case VCPU_EXREG_PDPTR:
		if (enable_ept)
			ept_save_pdptrs(vcpu);
		break;
	default:
		break;
	}
}

static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
{
	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
		vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
	else
		vmcs_writel(GUEST_DR7, vcpu->arch.dr7);

	update_exception_bitmap(vcpu);
}

static __init int cpu_has_kvm_support(void)
{
	return cpu_has_vmx();
}

static __init int vmx_disabled_by_bios(void)
{
	u64 msr;

	rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
	return (msr & (FEATURE_CONTROL_LOCKED |
		       FEATURE_CONTROL_VMXON_ENABLED))
	    == FEATURE_CONTROL_LOCKED;
	/* locked but not enabled */
}

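/*
 * Per-CPU VMX enable: set the lock/enable bits in IA32_FEATURE_CONTROL if
 * the BIOS left them clear, set CR4.VMXE and execute VMXON on this CPU's
 * vmxarea.
 */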
static int hardware_enable(void *garbage)
{
	int cpu = raw_smp_processor_id();
	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
	u64 old;

	if (read_cr4() & X86_CR4_VMXE)
		return -EBUSY;

	INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
	if ((old & (FEATURE_CONTROL_LOCKED |
		    FEATURE_CONTROL_VMXON_ENABLED))
	    != (FEATURE_CONTROL_LOCKED |
		FEATURE_CONTROL_VMXON_ENABLED))
		/* enable and lock */
		wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
		       FEATURE_CONTROL_LOCKED |
		       FEATURE_CONTROL_VMXON_ENABLED);
	write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
	asm volatile (ASM_VMX_VMXON_RAX
		      : : "a"(&phys_addr), "m"(phys_addr)
		      : "memory", "cc");

	ept_sync_global();

	return 0;
}

static void vmclear_local_vcpus(void)
{
	int cpu = raw_smp_processor_id();
	struct vcpu_vmx *vmx, *n;

	list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu),
				 local_vcpus_link)
		__vcpu_clear(vmx);
}


/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
 * tricks.
 */
static void kvm_cpu_vmxoff(void)
{
	asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
	write_cr4(read_cr4() & ~X86_CR4_VMXE);
}

static void hardware_disable(void *garbage)
{
	vmclear_local_vcpus();
	kvm_cpu_vmxoff();
}

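/*
 * Compute a control-field value from the required (ctl_min) and optional
 * (ctl_opt) bits, constrained by the allowed-0/allowed-1 settings reported
 * in the given VMX capability MSR.  Fails if a required bit cannot be set.
 */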
static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
				      u32 msr, u32 *result)
{
	u32 vmx_msr_low, vmx_msr_high;
	u32 ctl = ctl_min | ctl_opt;

	rdmsr(msr, vmx_msr_low, vmx_msr_high);

	ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
	ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */

	/* Ensure minimum (required) set of control bits are supported. */
	if (ctl_min & ~ctl)
		return -EIO;

	*result = ctl;
	return 0;
}

static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
{
	u32 vmx_msr_low, vmx_msr_high;
	u32 min, opt, min2, opt2;
	u32 _pin_based_exec_control = 0;
	u32 _cpu_based_exec_control = 0;
	u32 _cpu_based_2nd_exec_control = 0;
	u32 _vmexit_control = 0;
	u32 _vmentry_control = 0;

	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
	opt = PIN_BASED_VIRTUAL_NMIS;
	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
				&_pin_based_exec_control) < 0)
		return -EIO;

	min = CPU_BASED_HLT_EXITING |
#ifdef CONFIG_X86_64
	      CPU_BASED_CR8_LOAD_EXITING |
	      CPU_BASED_CR8_STORE_EXITING |
#endif
	      CPU_BASED_CR3_LOAD_EXITING |
	      CPU_BASED_CR3_STORE_EXITING |
	      CPU_BASED_USE_IO_BITMAPS |
	      CPU_BASED_MOV_DR_EXITING |
	      CPU_BASED_USE_TSC_OFFSETING |
	      CPU_BASED_MWAIT_EXITING |
	      CPU_BASED_MONITOR_EXITING |
	      CPU_BASED_INVLPG_EXITING;
	opt = CPU_BASED_TPR_SHADOW |
	      CPU_BASED_USE_MSR_BITMAPS |
	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
				&_cpu_based_exec_control) < 0)
		return -EIO;
#ifdef CONFIG_X86_64
	if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
		_cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
					   ~CPU_BASED_CR8_STORE_EXITING;
#endif
	if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
		min2 = 0;
		opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
			SECONDARY_EXEC_WBINVD_EXITING |
			SECONDARY_EXEC_ENABLE_VPID |
			SECONDARY_EXEC_ENABLE_EPT |
			SECONDARY_EXEC_UNRESTRICTED_GUEST |
			SECONDARY_EXEC_PAUSE_LOOP_EXITING |
			SECONDARY_EXEC_RDTSCP;
		if (adjust_vmx_controls(min2, opt2,
					MSR_IA32_VMX_PROCBASED_CTLS2,
					&_cpu_based_2nd_exec_control) < 0)
			return -EIO;
	}
#ifndef CONFIG_X86_64
	if (!(_cpu_based_2nd_exec_control &
				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
		_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
#endif
	if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
		/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
		   enabled */
		_cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
					     CPU_BASED_CR3_STORE_EXITING |
					     CPU_BASED_INVLPG_EXITING);
		rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
		      vmx_capability.ept, vmx_capability.vpid);
	}

	min = 0;
#ifdef CONFIG_X86_64
	min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
#endif
	opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
				&_vmexit_control) < 0)
		return -EIO;

	min = 0;
	opt = VM_ENTRY_LOAD_IA32_PAT;
	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
				&_vmentry_control) < 0)
		return -EIO;

	rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);

	/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
	if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
		return -EIO;

#ifdef CONFIG_X86_64
	/* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
	if (vmx_msr_high & (1u<<16))
		return -EIO;
#endif

	/* Require Write-Back (WB) memory type for VMCS accesses. */
	if (((vmx_msr_high >> 18) & 15) != 6)
		return -EIO;

	vmcs_conf->size = vmx_msr_high & 0x1fff;
	vmcs_conf->order = get_order(vmcs_config.size);
	vmcs_conf->revision_id = vmx_msr_low;

	vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
	vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
	vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
	vmcs_conf->vmexit_ctrl = _vmexit_control;
	vmcs_conf->vmentry_ctrl = _vmentry_control;

	return 0;
}

static struct vmcs *alloc_vmcs_cpu(int cpu)
{
	int node = cpu_to_node(cpu);
	struct page *pages;
	struct vmcs *vmcs;

	pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
	if (!pages)
		return NULL;
	vmcs = page_address(pages);
	memset(vmcs, 0, vmcs_config.size);
	vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
	return vmcs;
}

static struct vmcs *alloc_vmcs(void)
{
	return alloc_vmcs_cpu(raw_smp_processor_id());
}

static void free_vmcs(struct vmcs *vmcs)
{
	free_pages((unsigned long)vmcs, vmcs_config.order);
}

static void free_kvm_area(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		free_vmcs(per_cpu(vmxarea, cpu));
		per_cpu(vmxarea, cpu) = NULL;
	}
}

static __init int alloc_kvm_area(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct vmcs *vmcs;

		vmcs = alloc_vmcs_cpu(cpu);
		if (!vmcs) {
			free_kvm_area();
			return -ENOMEM;
		}

		per_cpu(vmxarea, cpu) = vmcs;
	}
	return 0;
}

static __init int hardware_setup(void)
{
	if (setup_vmcs_config(&vmcs_config) < 0)
		return -EIO;

	if (boot_cpu_has(X86_FEATURE_NX))
		kvm_enable_efer_bits(EFER_NX);

	if (!cpu_has_vmx_vpid())
		enable_vpid = 0;

	if (!cpu_has_vmx_ept()) {
		enable_ept = 0;
		enable_unrestricted_guest = 0;
	}

	if (!cpu_has_vmx_unrestricted_guest())
		enable_unrestricted_guest = 0;

	if (!cpu_has_vmx_flexpriority())
		flexpriority_enabled = 0;

	if (!cpu_has_vmx_tpr_shadow())
		kvm_x86_ops->update_cr8_intercept = NULL;

	if (enable_ept && !cpu_has_vmx_ept_2m_page())
		kvm_disable_largepages();

	if (!cpu_has_vmx_ple())
		ple_gap = 0;

	return alloc_kvm_area();
}

static __exit void hardware_unsetup(void)
{
	free_kvm_area();
}

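/*
 * On leaving vm86 real-mode emulation, restore a data segment's saved
 * protected-mode state if it still looks usable; otherwise synthesize a
 * plain read/write data segment with the selector's RPL as DPL.
 */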
static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
{
	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];

	if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) {
		vmcs_write16(sf->selector, save->selector);
		vmcs_writel(sf->base, save->base);
		vmcs_write32(sf->limit, save->limit);
		vmcs_write32(sf->ar_bytes, save->ar);
	} else {
		u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
			<< AR_DPL_SHIFT;
		vmcs_write32(sf->ar_bytes, 0x93 | dpl);
	}
}

static void enter_pmode(struct kvm_vcpu *vcpu)
{
	unsigned long flags;
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	vmx->emulation_required = 1;
	vmx->rmode.vm86_active = 0;

	vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
	vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
	vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);

	flags = vmcs_readl(GUEST_RFLAGS);
	flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
	flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
	vmcs_writel(GUEST_RFLAGS, flags);

	vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
			(vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));

	update_exception_bitmap(vcpu);

	if (emulate_invalid_guest_state)
		return;

	fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es);
	fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds);
	fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs);
	fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs);

	vmcs_write16(GUEST_SS_SELECTOR, 0);
	vmcs_write32(GUEST_SS_AR_BYTES, 0x93);

	vmcs_write16(GUEST_CS_SELECTOR,
		     vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
	vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
}

static gva_t rmode_tss_base(struct kvm *kvm)
{
	if (!kvm->arch.tss_addr) {
		struct kvm_memslots *slots;
		gfn_t base_gfn;

		slots = kvm_memslots(kvm);
		base_gfn = kvm->memslots->memslots[0].base_gfn +
			 kvm->memslots->memslots[0].npages - 3;
		return base_gfn << PAGE_SHIFT;
	}
	return kvm->arch.tss_addr;
}

static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
{
	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];

	save->selector = vmcs_read16(sf->selector);
	save->base = vmcs_readl(sf->base);
	save->limit = vmcs_read32(sf->limit);
	save->ar = vmcs_read32(sf->ar_bytes);
	vmcs_write16(sf->selector, save->base >> 4);
	vmcs_write32(sf->base, save->base & 0xfffff);
	vmcs_write32(sf->limit, 0xffff);
	vmcs_write32(sf->ar_bytes, 0xf3);
}

static void enter_rmode(struct kvm_vcpu *vcpu)
{
	unsigned long flags;
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (enable_unrestricted_guest)
		return;

	vmx->emulation_required = 1;
	vmx->rmode.vm86_active = 1;

	vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
	vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));

	vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
	vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);

	vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);

	flags = vmcs_readl(GUEST_RFLAGS);
	vmx->rmode.save_rflags = flags;

	flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;

	vmcs_writel(GUEST_RFLAGS, flags);
	vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
	update_exception_bitmap(vcpu);

	if (emulate_invalid_guest_state)
		goto continue_rmode;

	vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
	vmcs_write32(GUEST_SS_LIMIT, 0xffff);
	vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);

	vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
	vmcs_write32(GUEST_CS_LIMIT, 0xffff);
	if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
		vmcs_writel(GUEST_CS_BASE, 0xf0000);
	vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);

	fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
	fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
	fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
	fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);

continue_rmode:
	kvm_mmu_reset_context(vcpu);
	init_rmode(vcpu->kvm);
}

static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);

	if (!msr)
		return;

	/*
	 * Force kernel_gs_base reloading before EFER changes, as control
	 * of this msr depends on is_long_mode().
	 */
	vmx_load_host_state(to_vmx(vcpu));
	vcpu->arch.efer = efer;
	if (efer & EFER_LMA) {
		vmcs_write32(VM_ENTRY_CONTROLS,
			     vmcs_read32(VM_ENTRY_CONTROLS) |
			     VM_ENTRY_IA32E_MODE);
		msr->data = efer;
	} else {
		vmcs_write32(VM_ENTRY_CONTROLS,
			     vmcs_read32(VM_ENTRY_CONTROLS) &
			     ~VM_ENTRY_IA32E_MODE);

		msr->data = efer & ~EFER_LME;
	}
	setup_msrs(vmx);
}

#ifdef CONFIG_X86_64

static void enter_lmode(struct kvm_vcpu *vcpu)
{
	u32 guest_tr_ar;

	guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
	if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
		printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
		       __func__);
		vmcs_write32(GUEST_TR_AR_BYTES,
			     (guest_tr_ar & ~AR_TYPE_MASK)
			     | AR_TYPE_BUSY_64_TSS);
	}
	vcpu->arch.efer |= EFER_LMA;
	vmx_set_efer(vcpu, vcpu->arch.efer);
}

static void exit_lmode(struct kvm_vcpu *vcpu)
{
	vcpu->arch.efer &= ~EFER_LMA;

	vmcs_write32(VM_ENTRY_CONTROLS,
		     vmcs_read32(VM_ENTRY_CONTROLS)
		     & ~VM_ENTRY_IA32E_MODE);
}

#endif

static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
{
	vpid_sync_vcpu_all(to_vmx(vcpu));
	if (enable_ept)
		ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
}

static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
{
	ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;

	vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
	vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
}

static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
	ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;

	vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
	vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
}

static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
{
	if (!test_bit(VCPU_EXREG_PDPTR,
		      (unsigned long *)&vcpu->arch.regs_dirty))
		return;

	if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
		vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]);
		vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]);
		vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]);
		vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]);
	}
}

static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
{
	if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
		vcpu->arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
		vcpu->arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
		vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
		vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
	}

	__set_bit(VCPU_EXREG_PDPTR,
		  (unsigned long *)&vcpu->arch.regs_avail);
	__set_bit(VCPU_EXREG_PDPTR,
		  (unsigned long *)&vcpu->arch.regs_dirty);
}

static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);

1795static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1796 unsigned long cr0,
1797 struct kvm_vcpu *vcpu)
1798{
1799 if (!(cr0 & X86_CR0_PG)) {
1800 /* From paging/starting to nonpaging */
1801 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
65267ea1 1802 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
1439442c
SY
1803 (CPU_BASED_CR3_LOAD_EXITING |
1804 CPU_BASED_CR3_STORE_EXITING));
1805 vcpu->arch.cr0 = cr0;
fc78f519 1806 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
1439442c
SY
1807 } else if (!is_paging(vcpu)) {
1808 /* From nonpaging to paging */
1809 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
65267ea1 1810 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
1439442c
SY
1811 ~(CPU_BASED_CR3_LOAD_EXITING |
1812 CPU_BASED_CR3_STORE_EXITING));
1813 vcpu->arch.cr0 = cr0;
fc78f519 1814 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
1439442c 1815 }
95eb84a7
SY
1816
1817 if (!(cr0 & X86_CR0_WP))
1818 *hw_cr0 &= ~X86_CR0_WP;
1439442c
SY
1819}
1820
6aa8b732
AK
1821static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1822{
7ffd92c5 1823 struct vcpu_vmx *vmx = to_vmx(vcpu);
3a624e29
NK
1824 unsigned long hw_cr0;
1825
1826 if (enable_unrestricted_guest)
1827 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST)
1828 | KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
1829 else
1830 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON;
1439442c 1831
7ffd92c5 1832 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
6aa8b732
AK
1833 enter_pmode(vcpu);
1834
7ffd92c5 1835 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
6aa8b732
AK
1836 enter_rmode(vcpu);
1837
05b3e0c2 1838#ifdef CONFIG_X86_64
f6801dff 1839 if (vcpu->arch.efer & EFER_LME) {
707d92fa 1840 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
6aa8b732 1841 enter_lmode(vcpu);
707d92fa 1842 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
6aa8b732
AK
1843 exit_lmode(vcpu);
1844 }
1845#endif
1846
089d034e 1847 if (enable_ept)
1439442c
SY
1848 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
1849
02daab21 1850 if (!vcpu->fpu_active)
81231c69 1851 hw_cr0 |= X86_CR0_TS | X86_CR0_MP;
02daab21 1852
6aa8b732 1853 vmcs_writel(CR0_READ_SHADOW, cr0);
1439442c 1854 vmcs_writel(GUEST_CR0, hw_cr0);
ad312c7c 1855 vcpu->arch.cr0 = cr0;
6aa8b732
AK
1856}
1857
1439442c
SY
1858static u64 construct_eptp(unsigned long root_hpa)
1859{
1860 u64 eptp;
1861
1862 /* TODO: write the value read from the MSR */
1863 eptp = VMX_EPT_DEFAULT_MT |
1864 VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
1865 eptp |= (root_hpa & PAGE_MASK);
1866
1867 return eptp;
1868}
1869
6aa8b732
AK
1870static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1871{
1439442c
SY
1872 unsigned long guest_cr3;
1873 u64 eptp;
1874
1875 guest_cr3 = cr3;
089d034e 1876 if (enable_ept) {
1439442c
SY
1877 eptp = construct_eptp(cr3);
1878 vmcs_write64(EPT_POINTER, eptp);
1439442c 1879 guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
b927a3ce 1880 vcpu->kvm->arch.ept_identity_map_addr;
7c93be44 1881 ept_load_pdptrs(vcpu);
1439442c
SY
1882 }
1883
2384d2b3 1884 vmx_flush_tlb(vcpu);
1439442c 1885 vmcs_writel(GUEST_CR3, guest_cr3);
6aa8b732
AK
1886}
1887
1888static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1889{
7ffd92c5 1890 unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
1439442c
SY
1891 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
1892
ad312c7c 1893 vcpu->arch.cr4 = cr4;
bc23008b
AK
1894 if (enable_ept) {
1895 if (!is_paging(vcpu)) {
1896 hw_cr4 &= ~X86_CR4_PAE;
1897 hw_cr4 |= X86_CR4_PSE;
1898 } else if (!(cr4 & X86_CR4_PAE)) {
1899 hw_cr4 &= ~X86_CR4_PAE;
1900 }
1901 }
1439442c
SY
1902
1903 vmcs_writel(CR4_READ_SHADOW, cr4);
1904 vmcs_writel(GUEST_CR4, hw_cr4);
6aa8b732
AK
1905}
1906
6aa8b732
AK
1907static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1908{
1909 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1910
1911 return vmcs_readl(sf->base);
1912}
1913
1914static void vmx_get_segment(struct kvm_vcpu *vcpu,
1915 struct kvm_segment *var, int seg)
1916{
1917 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1918 u32 ar;
1919
1920 var->base = vmcs_readl(sf->base);
1921 var->limit = vmcs_read32(sf->limit);
1922 var->selector = vmcs_read16(sf->selector);
1923 ar = vmcs_read32(sf->ar_bytes);
9fd4a3b7 1924 if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
6aa8b732
AK
1925 ar = 0;
1926 var->type = ar & 15;
1927 var->s = (ar >> 4) & 1;
1928 var->dpl = (ar >> 5) & 3;
1929 var->present = (ar >> 7) & 1;
1930 var->avl = (ar >> 12) & 1;
1931 var->l = (ar >> 13) & 1;
1932 var->db = (ar >> 14) & 1;
1933 var->g = (ar >> 15) & 1;
1934 var->unusable = (ar >> 16) & 1;
1935}
1936
2e4d2653
IE
1937static int vmx_get_cpl(struct kvm_vcpu *vcpu)
1938{
3eeb3288 1939 if (!is_protmode(vcpu))
2e4d2653
IE
1940 return 0;
1941
1942 if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */
1943 return 3;
1944
eab4b8aa 1945 return vmcs_read16(GUEST_CS_SELECTOR) & 3;
2e4d2653
IE
1946}
1947
653e3108 1948static u32 vmx_segment_access_rights(struct kvm_segment *var)
6aa8b732 1949{
6aa8b732
AK
1950 u32 ar;
1951
653e3108 1952 if (var->unusable)
6aa8b732
AK
1953 ar = 1 << 16;
1954 else {
1955 ar = var->type & 15;
1956 ar |= (var->s & 1) << 4;
1957 ar |= (var->dpl & 3) << 5;
1958 ar |= (var->present & 1) << 7;
1959 ar |= (var->avl & 1) << 12;
1960 ar |= (var->l & 1) << 13;
1961 ar |= (var->db & 1) << 14;
1962 ar |= (var->g & 1) << 15;
1963 }
f7fbf1fd
UL
1964 if (ar == 0) /* a 0 value means unusable */
1965 ar = AR_UNUSABLE_MASK;
653e3108
AK
1966
1967 return ar;
1968}
1969
1970static void vmx_set_segment(struct kvm_vcpu *vcpu,
1971 struct kvm_segment *var, int seg)
1972{
7ffd92c5 1973 struct vcpu_vmx *vmx = to_vmx(vcpu);
653e3108
AK
1974 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1975 u32 ar;
1976
7ffd92c5
AK
1977 if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
1978 vmx->rmode.tr.selector = var->selector;
1979 vmx->rmode.tr.base = var->base;
1980 vmx->rmode.tr.limit = var->limit;
1981 vmx->rmode.tr.ar = vmx_segment_access_rights(var);
653e3108
AK
1982 return;
1983 }
1984 vmcs_writel(sf->base, var->base);
1985 vmcs_write32(sf->limit, var->limit);
1986 vmcs_write16(sf->selector, var->selector);
7ffd92c5 1987 if (vmx->rmode.vm86_active && var->s) {
653e3108
AK
1988 /*
1989 * Hack real-mode segments into vm86 compatibility.
1990 */
1991 if (var->base == 0xffff0000 && var->selector == 0xf000)
1992 vmcs_writel(sf->base, 0xf0000);
1993 ar = 0xf3;
1994 } else
1995 ar = vmx_segment_access_rights(var);
3a624e29
NK
1996
1997 /*
1998 * Fix the "Accessed" bit in AR field of segment registers for older
1999 * qemu binaries.
2000 * IA32 arch specifies that at the time of processor reset the
2001 * "Accessed" bit in the AR field of segment registers is 1. And qemu
2002 * is setting it to 0 in the userland code. This causes invalid guest
2003 * state vmexit when "unrestricted guest" mode is turned on.
2004 * Fix for this setup issue in cpu_reset is being pushed in the qemu
2005 * tree. Newer qemu binaries with that qemu fix would not need this
2006 * kvm hack.
2007 */
2008 if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
2009 ar |= 0x1; /* Accessed */
2010
6aa8b732
AK
2011 vmcs_write32(sf->ar_bytes, ar);
2012}
2013
6aa8b732
AK
2014static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
2015{
2016 u32 ar = vmcs_read32(GUEST_CS_AR_BYTES);
2017
2018 *db = (ar >> 14) & 1;
2019 *l = (ar >> 13) & 1;
2020}
2021
89a27f4d 2022static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 2023{
89a27f4d
GN
2024 dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
2025 dt->address = vmcs_readl(GUEST_IDTR_BASE);
6aa8b732
AK
2026}
2027
89a27f4d 2028static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 2029{
89a27f4d
GN
2030 vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
2031 vmcs_writel(GUEST_IDTR_BASE, dt->address);
6aa8b732
AK
2032}
2033
89a27f4d 2034static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 2035{
89a27f4d
GN
2036 dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
2037 dt->address = vmcs_readl(GUEST_GDTR_BASE);
6aa8b732
AK
2038}
2039
89a27f4d 2040static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 2041{
89a27f4d
GN
2042 vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
2043 vmcs_writel(GUEST_GDTR_BASE, dt->address);
6aa8b732
AK
2044}
2045
648dfaa7
MG
2046static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
2047{
2048 struct kvm_segment var;
2049 u32 ar;
2050
2051 vmx_get_segment(vcpu, &var, seg);
2052 ar = vmx_segment_access_rights(&var);
2053
2054 if (var.base != (var.selector << 4))
2055 return false;
2056 if (var.limit != 0xffff)
2057 return false;
2058 if (ar != 0xf3)
2059 return false;
2060
2061 return true;
2062}
2063
2064static bool code_segment_valid(struct kvm_vcpu *vcpu)
2065{
2066 struct kvm_segment cs;
2067 unsigned int cs_rpl;
2068
2069 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
2070 cs_rpl = cs.selector & SELECTOR_RPL_MASK;
2071
1872a3f4
AK
2072 if (cs.unusable)
2073 return false;
648dfaa7
MG
2074 if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
2075 return false;
2076 if (!cs.s)
2077 return false;
1872a3f4 2078 if (cs.type & AR_TYPE_WRITEABLE_MASK) {
648dfaa7
MG
2079 if (cs.dpl > cs_rpl)
2080 return false;
1872a3f4 2081 } else {
648dfaa7
MG
2082 if (cs.dpl != cs_rpl)
2083 return false;
2084 }
2085 if (!cs.present)
2086 return false;
2087
2088 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
2089 return true;
2090}
2091
2092static bool stack_segment_valid(struct kvm_vcpu *vcpu)
2093{
2094 struct kvm_segment ss;
2095 unsigned int ss_rpl;
2096
2097 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
2098 ss_rpl = ss.selector & SELECTOR_RPL_MASK;
2099
1872a3f4
AK
2100 if (ss.unusable)
2101 return true;
2102 if (ss.type != 3 && ss.type != 7)
648dfaa7
MG
2103 return false;
2104 if (!ss.s)
2105 return false;
2106 if (ss.dpl != ss_rpl) /* DPL != RPL */
2107 return false;
2108 if (!ss.present)
2109 return false;
2110
2111 return true;
2112}
2113
2114static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
2115{
2116 struct kvm_segment var;
2117 unsigned int rpl;
2118
2119 vmx_get_segment(vcpu, &var, seg);
2120 rpl = var.selector & SELECTOR_RPL_MASK;
2121
1872a3f4
AK
2122 if (var.unusable)
2123 return true;
648dfaa7
MG
2124 if (!var.s)
2125 return false;
2126 if (!var.present)
2127 return false;
2128 if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) {
2129 if (var.dpl < rpl) /* DPL < RPL */
2130 return false;
2131 }
2132
2133 /* TODO: Add other members to kvm_segment_field to allow checking for other access
2134 * rights flags
2135 */
2136 return true;
2137}
2138
2139static bool tr_valid(struct kvm_vcpu *vcpu)
2140{
2141 struct kvm_segment tr;
2142
2143 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
2144
1872a3f4
AK
2145 if (tr.unusable)
2146 return false;
648dfaa7
MG
2147 if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */
2148 return false;
1872a3f4 2149 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
648dfaa7
MG
2150 return false;
2151 if (!tr.present)
2152 return false;
2153
2154 return true;
2155}
2156
2157static bool ldtr_valid(struct kvm_vcpu *vcpu)
2158{
2159 struct kvm_segment ldtr;
2160
2161 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
2162
1872a3f4
AK
2163 if (ldtr.unusable)
2164 return true;
648dfaa7
MG
2165 if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */
2166 return false;
2167 if (ldtr.type != 2)
2168 return false;
2169 if (!ldtr.present)
2170 return false;
2171
2172 return true;
2173}
2174
2175static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
2176{
2177 struct kvm_segment cs, ss;
2178
2179 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
2180 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
2181
2182 return ((cs.selector & SELECTOR_RPL_MASK) ==
2183 (ss.selector & SELECTOR_RPL_MASK));
2184}
2185
2186/*
2187 * Check if guest state is valid. Returns true if valid, false if
2188 * not.
2189 * We assume that registers are always usable
2190 */
2191static bool guest_state_valid(struct kvm_vcpu *vcpu)
2192{
2193 /* real mode guest state checks */
3eeb3288 2194 if (!is_protmode(vcpu)) {
648dfaa7
MG
2195 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
2196 return false;
2197 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
2198 return false;
2199 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
2200 return false;
2201 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
2202 return false;
2203 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
2204 return false;
2205 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
2206 return false;
2207 } else {
2208 /* protected mode guest state checks */
2209 if (!cs_ss_rpl_check(vcpu))
2210 return false;
2211 if (!code_segment_valid(vcpu))
2212 return false;
2213 if (!stack_segment_valid(vcpu))
2214 return false;
2215 if (!data_segment_valid(vcpu, VCPU_SREG_DS))
2216 return false;
2217 if (!data_segment_valid(vcpu, VCPU_SREG_ES))
2218 return false;
2219 if (!data_segment_valid(vcpu, VCPU_SREG_FS))
2220 return false;
2221 if (!data_segment_valid(vcpu, VCPU_SREG_GS))
2222 return false;
2223 if (!tr_valid(vcpu))
2224 return false;
2225 if (!ldtr_valid(vcpu))
2226 return false;
2227 }
2228 /* TODO:
2229 * - Add checks on RIP
2230 * - Add checks on RFLAGS
2231 */
2232
2233 return true;
2234}
2235
d77c26fc 2236static int init_rmode_tss(struct kvm *kvm)
6aa8b732 2237{
6aa8b732 2238 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
195aefde 2239 u16 data = 0;
10589a46 2240 int ret = 0;
195aefde 2241 int r;
6aa8b732 2242
195aefde
IE
2243 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
2244 if (r < 0)
10589a46 2245 goto out;
195aefde 2246 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
464d17c8
SY
2247 r = kvm_write_guest_page(kvm, fn++, &data,
2248 TSS_IOPB_BASE_OFFSET, sizeof(u16));
195aefde 2249 if (r < 0)
10589a46 2250 goto out;
195aefde
IE
2251 r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
2252 if (r < 0)
10589a46 2253 goto out;
195aefde
IE
2254 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
2255 if (r < 0)
10589a46 2256 goto out;
195aefde 2257 data = ~0;
10589a46
MT
2258 r = kvm_write_guest_page(kvm, fn, &data,
2259 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
2260 sizeof(u8));
195aefde 2261 if (r < 0)
10589a46
MT
2262 goto out;
2263
2264 ret = 1;
2265out:
10589a46 2266 return ret;
6aa8b732
AK
2267}
2268
b7ebfb05
SY
2269static int init_rmode_identity_map(struct kvm *kvm)
2270{
2271 int i, r, ret;
2272 pfn_t identity_map_pfn;
2273 u32 tmp;
2274
089d034e 2275 if (!enable_ept)
b7ebfb05
SY
2276 return 1;
2277 if (unlikely(!kvm->arch.ept_identity_pagetable)) {
2278 printk(KERN_ERR "EPT: identity-mapping pagetable "
2279 "hasn't been allocated!\n");
2280 return 0;
2281 }
2282 if (likely(kvm->arch.ept_identity_pagetable_done))
2283 return 1;
2284 ret = 0;
b927a3ce 2285 identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
b7ebfb05
SY
2286 r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
2287 if (r < 0)
2288 goto out;
2289 /* Set up identity-mapping pagetable for EPT in real mode */
2290 for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
2291 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
2292 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
2293 r = kvm_write_guest_page(kvm, identity_map_pfn,
2294 &tmp, i * sizeof(tmp), sizeof(tmp));
2295 if (r < 0)
2296 goto out;
2297 }
2298 kvm->arch.ept_identity_pagetable_done = true;
2299 ret = 1;
2300out:
2301 return ret;
2302}
2303
6aa8b732
AK
2304static void seg_setup(int seg)
2305{
2306 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3a624e29 2307 unsigned int ar;
6aa8b732
AK
2308
2309 vmcs_write16(sf->selector, 0);
2310 vmcs_writel(sf->base, 0);
2311 vmcs_write32(sf->limit, 0xffff);
3a624e29
NK
2312 if (enable_unrestricted_guest) {
2313 ar = 0x93;
2314 if (seg == VCPU_SREG_CS)
2315 ar |= 0x08; /* code segment */
2316 } else
2317 ar = 0xf3;
2318
2319 vmcs_write32(sf->ar_bytes, ar);
6aa8b732
AK
2320}
2321
f78e0e2e
SY
2322static int alloc_apic_access_page(struct kvm *kvm)
2323{
2324 struct kvm_userspace_memory_region kvm_userspace_mem;
2325 int r = 0;
2326
79fac95e 2327 mutex_lock(&kvm->slots_lock);
bfc6d222 2328 if (kvm->arch.apic_access_page)
f78e0e2e
SY
2329 goto out;
2330 kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
2331 kvm_userspace_mem.flags = 0;
2332 kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
2333 kvm_userspace_mem.memory_size = PAGE_SIZE;
2334 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
2335 if (r)
2336 goto out;
72dc67a6 2337
bfc6d222 2338 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
f78e0e2e 2339out:
79fac95e 2340 mutex_unlock(&kvm->slots_lock);
f78e0e2e
SY
2341 return r;
2342}
2343
b7ebfb05
SY
2344static int alloc_identity_pagetable(struct kvm *kvm)
2345{
2346 struct kvm_userspace_memory_region kvm_userspace_mem;
2347 int r = 0;
2348
79fac95e 2349 mutex_lock(&kvm->slots_lock);
b7ebfb05
SY
2350 if (kvm->arch.ept_identity_pagetable)
2351 goto out;
2352 kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
2353 kvm_userspace_mem.flags = 0;
b927a3ce
SY
2354 kvm_userspace_mem.guest_phys_addr =
2355 kvm->arch.ept_identity_map_addr;
b7ebfb05
SY
2356 kvm_userspace_mem.memory_size = PAGE_SIZE;
2357 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
2358 if (r)
2359 goto out;
2360
b7ebfb05 2361 kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
b927a3ce 2362 kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
b7ebfb05 2363out:
79fac95e 2364 mutex_unlock(&kvm->slots_lock);
b7ebfb05
SY
2365 return r;
2366}
2367
2384d2b3
SY
2368static void allocate_vpid(struct vcpu_vmx *vmx)
2369{
2370 int vpid;
2371
2372 vmx->vpid = 0;
919818ab 2373 if (!enable_vpid)
2384d2b3
SY
2374 return;
2375 spin_lock(&vmx_vpid_lock);
2376 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
2377 if (vpid < VMX_NR_VPIDS) {
2378 vmx->vpid = vpid;
2379 __set_bit(vpid, vmx_vpid_bitmap);
2380 }
2381 spin_unlock(&vmx_vpid_lock);
2382}
2383
cdbecfc3
LJ
2384static void free_vpid(struct vcpu_vmx *vmx)
2385{
2386 if (!enable_vpid)
2387 return;
2388 spin_lock(&vmx_vpid_lock);
2389 if (vmx->vpid != 0)
2390 __clear_bit(vmx->vpid, vmx_vpid_bitmap);
2391 spin_unlock(&vmx_vpid_lock);
2392}
2393
5897297b 2394static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
25c5f225 2395{
3e7c73e9 2396 int f = sizeof(unsigned long);
25c5f225
SY
2397
2398 if (!cpu_has_vmx_msr_bitmap())
2399 return;
2400
2401 /*
2402 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
2403 * have the write-low and read-high bitmap offsets the wrong way round.
2404 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
2405 */
25c5f225 2406 if (msr <= 0x1fff) {
3e7c73e9
AK
2407 __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
2408 __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
25c5f225
SY
2409 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2410 msr &= 0x1fff;
3e7c73e9
AK
2411 __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
2412 __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
25c5f225 2413 }
25c5f225
SY
2414}
2415
5897297b
AK
2416static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
2417{
2418 if (!longmode_only)
2419 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
2420 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
2421}
2422
6aa8b732
AK
2423/*
2424 * Sets up the vmcs for emulated real mode.
2425 */
8b9cf98c 2426static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
6aa8b732 2427{
468d472f 2428 u32 host_sysenter_cs, msr_low, msr_high;
6aa8b732 2429 u32 junk;
53f658b3 2430 u64 host_pat, tsc_this, tsc_base;
6aa8b732 2431 unsigned long a;
89a27f4d 2432 struct desc_ptr dt;
6aa8b732 2433 int i;
cd2276a7 2434 unsigned long kvm_vmx_return;
6e5d865c 2435 u32 exec_control;
6aa8b732 2436
6aa8b732 2437 /* I/O */
3e7c73e9
AK
2438 vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
2439 vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
6aa8b732 2440
25c5f225 2441 if (cpu_has_vmx_msr_bitmap())
5897297b 2442 vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
25c5f225 2443
6aa8b732
AK
2444 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
2445
6aa8b732 2446 /* Control */
1c3d14fe
YS
2447 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
2448 vmcs_config.pin_based_exec_ctrl);
6e5d865c
YS
2449
2450 exec_control = vmcs_config.cpu_based_exec_ctrl;
2451 if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
2452 exec_control &= ~CPU_BASED_TPR_SHADOW;
2453#ifdef CONFIG_X86_64
2454 exec_control |= CPU_BASED_CR8_STORE_EXITING |
2455 CPU_BASED_CR8_LOAD_EXITING;
2456#endif
2457 }
089d034e 2458 if (!enable_ept)
d56f546d 2459 exec_control |= CPU_BASED_CR3_STORE_EXITING |
83dbc83a
MT
2460 CPU_BASED_CR3_LOAD_EXITING |
2461 CPU_BASED_INVLPG_EXITING;
6e5d865c 2462 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
6aa8b732 2463
83ff3b9d
SY
2464 if (cpu_has_secondary_exec_ctrls()) {
2465 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
2466 if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
2467 exec_control &=
2468 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2384d2b3
SY
2469 if (vmx->vpid == 0)
2470 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
046d8710 2471 if (!enable_ept) {
d56f546d 2472 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
046d8710
SY
2473 enable_unrestricted_guest = 0;
2474 }
3a624e29
NK
2475 if (!enable_unrestricted_guest)
2476 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
4b8d54f9
ZE
2477 if (!ple_gap)
2478 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
83ff3b9d
SY
2479 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
2480 }
f78e0e2e 2481
4b8d54f9
ZE
2482 if (ple_gap) {
2483 vmcs_write32(PLE_GAP, ple_gap);
2484 vmcs_write32(PLE_WINDOW, ple_window);
2485 }
2486
c7addb90
AK
2487 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
2488 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
6aa8b732
AK
2489 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
2490
2491 vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */
2492 vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
2493 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
2494
2495 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
2496 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
2497 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
d6e88aec
AK
2498 vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */
2499 vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */
6aa8b732 2500 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
05b3e0c2 2501#ifdef CONFIG_X86_64
6aa8b732
AK
2502 rdmsrl(MSR_FS_BASE, a);
2503 vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
2504 rdmsrl(MSR_GS_BASE, a);
2505 vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
2506#else
2507 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
2508 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
2509#endif
2510
2511 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
2512
ec68798c 2513 native_store_idt(&dt);
89a27f4d 2514 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
6aa8b732 2515
d77c26fc 2516 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
cd2276a7 2517 vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
2cc51560
ED
2518 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
2519 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
61d2ef2c 2520 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
2cc51560 2521 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
61d2ef2c 2522 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
6aa8b732
AK
2523
2524 rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
2525 vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
2526 rdmsrl(MSR_IA32_SYSENTER_ESP, a);
2527 vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */
2528 rdmsrl(MSR_IA32_SYSENTER_EIP, a);
2529 vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
2530
468d472f
SY
2531 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
2532 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
2533 host_pat = msr_low | ((u64) msr_high << 32);
2534 vmcs_write64(HOST_IA32_PAT, host_pat);
2535 }
2536 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2537 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
2538 host_pat = msr_low | ((u64) msr_high << 32);
2539 /* Write the default value, following the host PAT */
2540 vmcs_write64(GUEST_IA32_PAT, host_pat);
2541 /* Keep arch.pat in sync with GUEST_IA32_PAT */
2542 vmx->vcpu.arch.pat = host_pat;
2543 }
2544
6aa8b732
AK
2545 for (i = 0; i < NR_VMX_MSR; ++i) {
2546 u32 index = vmx_msr_index[i];
2547 u32 data_low, data_high;
a2fa3e9f 2548 int j = vmx->nmsrs;
6aa8b732
AK
2549
2550 if (rdmsr_safe(index, &data_low, &data_high) < 0)
2551 continue;
432bd6cb
AK
2552 if (wrmsr_safe(index, data_low, data_high) < 0)
2553 continue;
26bb0981
AK
2554 vmx->guest_msrs[j].index = i;
2555 vmx->guest_msrs[j].data = 0;
d5696725 2556 vmx->guest_msrs[j].mask = -1ull;
a2fa3e9f 2557 ++vmx->nmsrs;
6aa8b732 2558 }
6aa8b732 2559
1c3d14fe 2560 vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
6aa8b732
AK
2561
2562 /* 22.2.1, 20.8.1 */
1c3d14fe
YS
2563 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
2564
e00c8cf2 2565 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
4c38609a 2566 vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
ce03e4f2
AK
2567 if (enable_ept)
2568 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
4c38609a 2569 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
e00c8cf2 2570
53f658b3
MT
2571 tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc;
2572 rdtscll(tsc_this);
2573 if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc)
2574 tsc_base = tsc_this;
2575
2576 guest_write_tsc(0, tsc_base);
f78e0e2e 2577
e00c8cf2
AK
2578 return 0;
2579}
2580
b7ebfb05
SY
2581static int init_rmode(struct kvm *kvm)
2582{
2583 if (!init_rmode_tss(kvm))
2584 return 0;
2585 if (!init_rmode_identity_map(kvm))
2586 return 0;
2587 return 1;
2588}
2589
e00c8cf2
AK
2590static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2591{
2592 struct vcpu_vmx *vmx = to_vmx(vcpu);
2593 u64 msr;
f656ce01 2594 int ret, idx;
e00c8cf2 2595
5fdbf976 2596 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
f656ce01 2597 idx = srcu_read_lock(&vcpu->kvm->srcu);
b7ebfb05 2598 if (!init_rmode(vmx->vcpu.kvm)) {
e00c8cf2
AK
2599 ret = -ENOMEM;
2600 goto out;
2601 }
2602
7ffd92c5 2603 vmx->rmode.vm86_active = 0;
e00c8cf2 2604
3b86cd99
JK
2605 vmx->soft_vnmi_blocked = 0;
2606
ad312c7c 2607 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
2d3ad1f4 2608 kvm_set_cr8(&vmx->vcpu, 0);
e00c8cf2 2609 msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
c5af89b6 2610 if (kvm_vcpu_is_bsp(&vmx->vcpu))
e00c8cf2
AK
2611 msr |= MSR_IA32_APICBASE_BSP;
2612 kvm_set_apic_base(&vmx->vcpu, msr);
2613
2614 fx_init(&vmx->vcpu);
2615
5706be0d 2616 seg_setup(VCPU_SREG_CS);
e00c8cf2
AK
2617 /*
2618 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
2619 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
2620 */
c5af89b6 2621 if (kvm_vcpu_is_bsp(&vmx->vcpu)) {
e00c8cf2
AK
2622 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
2623 vmcs_writel(GUEST_CS_BASE, 0x000f0000);
2624 } else {
ad312c7c
ZX
2625 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
2626 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
e00c8cf2 2627 }
e00c8cf2
AK
2628
2629 seg_setup(VCPU_SREG_DS);
2630 seg_setup(VCPU_SREG_ES);
2631 seg_setup(VCPU_SREG_FS);
2632 seg_setup(VCPU_SREG_GS);
2633 seg_setup(VCPU_SREG_SS);
2634
2635 vmcs_write16(GUEST_TR_SELECTOR, 0);
2636 vmcs_writel(GUEST_TR_BASE, 0);
2637 vmcs_write32(GUEST_TR_LIMIT, 0xffff);
2638 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
2639
2640 vmcs_write16(GUEST_LDTR_SELECTOR, 0);
2641 vmcs_writel(GUEST_LDTR_BASE, 0);
2642 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
2643 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
2644
2645 vmcs_write32(GUEST_SYSENTER_CS, 0);
2646 vmcs_writel(GUEST_SYSENTER_ESP, 0);
2647 vmcs_writel(GUEST_SYSENTER_EIP, 0);
2648
2649 vmcs_writel(GUEST_RFLAGS, 0x02);
c5af89b6 2650 if (kvm_vcpu_is_bsp(&vmx->vcpu))
5fdbf976 2651 kvm_rip_write(vcpu, 0xfff0);
e00c8cf2 2652 else
5fdbf976
MT
2653 kvm_rip_write(vcpu, 0);
2654 kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
e00c8cf2 2655
e00c8cf2
AK
2656 vmcs_writel(GUEST_DR7, 0x400);
2657
2658 vmcs_writel(GUEST_GDTR_BASE, 0);
2659 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
2660
2661 vmcs_writel(GUEST_IDTR_BASE, 0);
2662 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
2663
2664 vmcs_write32(GUEST_ACTIVITY_STATE, 0);
2665 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
2666 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
2667
e00c8cf2
AK
2668 /* Special registers */
2669 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
2670
2671 setup_msrs(vmx);
2672
6aa8b732
AK
2673 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
2674
f78e0e2e
SY
2675 if (cpu_has_vmx_tpr_shadow()) {
2676 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
2677 if (vm_need_tpr_shadow(vmx->vcpu.kvm))
2678 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
ad312c7c 2679 page_to_phys(vmx->vcpu.arch.apic->regs_page));
f78e0e2e
SY
2680 vmcs_write32(TPR_THRESHOLD, 0);
2681 }
2682
2683 if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
2684 vmcs_write64(APIC_ACCESS_ADDR,
bfc6d222 2685 page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
6aa8b732 2686
2384d2b3
SY
2687 if (vmx->vpid != 0)
2688 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2689
fa40052c 2690 vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
4d4ec087 2691 vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
8b9cf98c 2692 vmx_set_cr4(&vmx->vcpu, 0);
8b9cf98c 2693 vmx_set_efer(&vmx->vcpu, 0);
8b9cf98c
RR
2694 vmx_fpu_activate(&vmx->vcpu);
2695 update_exception_bitmap(&vmx->vcpu);
6aa8b732 2696
2384d2b3
SY
2697 vpid_sync_vcpu_all(vmx);
2698
3200f405 2699 ret = 0;
6aa8b732 2700
a89a8fb9
MG
2701 /* HACK: Don't enable emulation on guest boot/reset */
2702 vmx->emulation_required = 0;
2703
6aa8b732 2704out:
f656ce01 2705 srcu_read_unlock(&vcpu->kvm->srcu, idx);
6aa8b732
AK
2706 return ret;
2707}
2708
3b86cd99
JK
2709static void enable_irq_window(struct kvm_vcpu *vcpu)
2710{
2711 u32 cpu_based_vm_exec_control;
2712
2713 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2714 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
2715 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2716}
2717
2718static void enable_nmi_window(struct kvm_vcpu *vcpu)
2719{
2720 u32 cpu_based_vm_exec_control;
2721
2722 if (!cpu_has_virtual_nmis()) {
2723 enable_irq_window(vcpu);
2724 return;
2725 }
2726
2727 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2728 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
2729 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2730}
2731
66fd3f7f 2732static void vmx_inject_irq(struct kvm_vcpu *vcpu)
85f455f7 2733{
9c8cba37 2734 struct vcpu_vmx *vmx = to_vmx(vcpu);
66fd3f7f
GN
2735 uint32_t intr;
2736 int irq = vcpu->arch.interrupt.nr;
9c8cba37 2737
229456fc 2738 trace_kvm_inj_virq(irq);
2714d1d3 2739
fa89a817 2740 ++vcpu->stat.irq_injections;
7ffd92c5 2741 if (vmx->rmode.vm86_active) {
9c8cba37
AK
2742 vmx->rmode.irq.pending = true;
2743 vmx->rmode.irq.vector = irq;
5fdbf976 2744 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
ae0bb3e0
GN
2745 if (vcpu->arch.interrupt.soft)
2746 vmx->rmode.irq.rip +=
2747 vmx->vcpu.arch.event_exit_inst_len;
9c5623e3
AK
2748 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2749 irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
2750 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
5fdbf976 2751 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
85f455f7
ED
2752 return;
2753 }
66fd3f7f
GN
2754 intr = irq | INTR_INFO_VALID_MASK;
2755 if (vcpu->arch.interrupt.soft) {
2756 intr |= INTR_TYPE_SOFT_INTR;
2757 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2758 vmx->vcpu.arch.event_exit_inst_len);
2759 } else
2760 intr |= INTR_TYPE_EXT_INTR;
2761 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
85f455f7
ED
2762}
2763
f08864b4
SY
2764static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2765{
66a5a347
JK
2766 struct vcpu_vmx *vmx = to_vmx(vcpu);
2767
3b86cd99
JK
2768 if (!cpu_has_virtual_nmis()) {
2769 /*
2770 * Tracking the NMI-blocked state in software is built upon
2771 * finding the next open IRQ window. This, in turn, depends on
2772 * well-behaving guests: They have to keep IRQs disabled at
2773 * least as long as the NMI handler runs. Otherwise we may
2774 * cause NMI nesting, maybe breaking the guest. But as this is
2775 * highly unlikely, we can live with the residual risk.
2776 */
2777 vmx->soft_vnmi_blocked = 1;
2778 vmx->vnmi_blocked_time = 0;
2779 }
2780
487b391d 2781 ++vcpu->stat.nmi_injections;
7ffd92c5 2782 if (vmx->rmode.vm86_active) {
66a5a347
JK
2783 vmx->rmode.irq.pending = true;
2784 vmx->rmode.irq.vector = NMI_VECTOR;
2785 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
2786 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2787 NMI_VECTOR | INTR_TYPE_SOFT_INTR |
2788 INTR_INFO_VALID_MASK);
2789 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
2790 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
2791 return;
2792 }
f08864b4
SY
2793 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2794 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
f08864b4
SY
2795}
2796
c4282df9 2797static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
33f089ca 2798{
3b86cd99 2799 if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
c4282df9 2800 return 0;
33f089ca 2801
c4282df9
GN
2802 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2803 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS |
2804 GUEST_INTR_STATE_NMI));
33f089ca
JK
2805}
2806
3cfc3092
JK
2807static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
2808{
2809 if (!cpu_has_virtual_nmis())
2810 return to_vmx(vcpu)->soft_vnmi_blocked;
2811 else
2812 return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2813 GUEST_INTR_STATE_NMI);
2814}
2815
2816static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
2817{
2818 struct vcpu_vmx *vmx = to_vmx(vcpu);
2819
2820 if (!cpu_has_virtual_nmis()) {
2821 if (vmx->soft_vnmi_blocked != masked) {
2822 vmx->soft_vnmi_blocked = masked;
2823 vmx->vnmi_blocked_time = 0;
2824 }
2825 } else {
2826 if (masked)
2827 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
2828 GUEST_INTR_STATE_NMI);
2829 else
2830 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
2831 GUEST_INTR_STATE_NMI);
2832 }
2833}
2834
78646121
GN
2835static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
2836{
c4282df9
GN
2837 return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2838 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2839 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
78646121
GN
2840}
2841
cbc94022
IE
2842static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
2843{
2844 int ret;
2845 struct kvm_userspace_memory_region tss_mem = {
6fe63979 2846 .slot = TSS_PRIVATE_MEMSLOT,
cbc94022
IE
2847 .guest_phys_addr = addr,
2848 .memory_size = PAGE_SIZE * 3,
2849 .flags = 0,
2850 };
2851
2852 ret = kvm_set_memory_region(kvm, &tss_mem, 0);
2853 if (ret)
2854 return ret;
bfc6d222 2855 kvm->arch.tss_addr = addr;
cbc94022
IE
2856 return 0;
2857}
2858
6aa8b732
AK
2859static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2860 int vec, u32 err_code)
2861{
b3f37707
NK
2862 /*
2863 * An instruction with the address size override prefix (opcode 0x67)
2864 * causes the #SS fault with error code 0 in VM86 mode.
2865 */
2866 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
851ba692 2867 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE)
6aa8b732 2868 return 1;
77ab6db0
JK
2869 /*
2870 * Forward all other exceptions that are valid in real mode.
2871 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
2872 * the required debugging infrastructure rework.
2873 */
2874 switch (vec) {
77ab6db0 2875 case DB_VECTOR:
d0bfb940
JK
2876 if (vcpu->guest_debug &
2877 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
2878 return 0;
2879 kvm_queue_exception(vcpu, vec);
2880 return 1;
77ab6db0 2881 case BP_VECTOR:
c573cd22
JK
2882 /*
2883 * Update instruction length as we may reinject the exception
2884 * from user space while in guest debugging mode.
2885 */
2886 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
2887 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
d0bfb940
JK
2888 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
2889 return 0;
2890 /* fall through */
2891 case DE_VECTOR:
77ab6db0
JK
2892 case OF_VECTOR:
2893 case BR_VECTOR:
2894 case UD_VECTOR:
2895 case DF_VECTOR:
2896 case SS_VECTOR:
2897 case GP_VECTOR:
2898 case MF_VECTOR:
2899 kvm_queue_exception(vcpu, vec);
2900 return 1;
2901 }
6aa8b732
AK
2902 return 0;
2903}
2904
a0861c02
AK
2905/*
2906 * Trigger machine check on the host. We assume all the MSRs are already set up
2907 * by the CPU and that we still run on the same CPU as the MCE occurred on.
2908 * We pass a fake environment to the machine check handler because we want
2909 * the guest to be always treated like user space, no matter what context
2910 * it used internally.
2911 */
2912static void kvm_machine_check(void)
2913{
2914#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
2915 struct pt_regs regs = {
2916 .cs = 3, /* Fake ring 3 no matter what the guest ran on */
2917 .flags = X86_EFLAGS_IF,
2918 };
2919
2920 do_machine_check(&regs, 0);
2921#endif
2922}
2923
851ba692 2924static int handle_machine_check(struct kvm_vcpu *vcpu)
a0861c02
AK
2925{
2926 /* already handled by vcpu_run */
2927 return 1;
2928}
2929
851ba692 2930static int handle_exception(struct kvm_vcpu *vcpu)
6aa8b732 2931{
1155f76a 2932 struct vcpu_vmx *vmx = to_vmx(vcpu);
851ba692 2933 struct kvm_run *kvm_run = vcpu->run;
d0bfb940 2934 u32 intr_info, ex_no, error_code;
42dbaa5a 2935 unsigned long cr2, rip, dr6;
6aa8b732
AK
2936 u32 vect_info;
2937 enum emulation_result er;
2938
1155f76a 2939 vect_info = vmx->idt_vectoring_info;
6aa8b732
AK
2940 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2941
a0861c02 2942 if (is_machine_check(intr_info))
851ba692 2943 return handle_machine_check(vcpu);
a0861c02 2944
6aa8b732 2945 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
65ac7264
AK
2946 !is_page_fault(intr_info)) {
2947 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2948 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
2949 vcpu->run->internal.ndata = 2;
2950 vcpu->run->internal.data[0] = vect_info;
2951 vcpu->run->internal.data[1] = intr_info;
2952 return 0;
2953 }
6aa8b732 2954
e4a41889 2955 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
1b6269db 2956 return 1; /* already handled by vmx_vcpu_run() */
2ab455cc
AL
2957
2958 if (is_no_device(intr_info)) {
5fd86fcf 2959 vmx_fpu_activate(vcpu);
2ab455cc
AL
2960 return 1;
2961 }
2962
7aa81cc0 2963 if (is_invalid_opcode(intr_info)) {
851ba692 2964 er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD);
7aa81cc0 2965 if (er != EMULATE_DONE)
7ee5d940 2966 kvm_queue_exception(vcpu, UD_VECTOR);
7aa81cc0
AL
2967 return 1;
2968 }
2969
6aa8b732 2970 error_code = 0;
5fdbf976 2971 rip = kvm_rip_read(vcpu);
2e11384c 2972 if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
6aa8b732
AK
2973 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
2974 if (is_page_fault(intr_info)) {
1439442c 2975 /* EPT won't cause page fault directly */
089d034e 2976 if (enable_ept)
1439442c 2977 BUG();
6aa8b732 2978 cr2 = vmcs_readl(EXIT_QUALIFICATION);
229456fc
MT
2979 trace_kvm_page_fault(cr2, error_code);
2980
3298b75c 2981 if (kvm_event_needs_reinjection(vcpu))
577bdc49 2982 kvm_mmu_unprotect_page_virt(vcpu, cr2);
3067714c 2983 return kvm_mmu_page_fault(vcpu, cr2, error_code);
6aa8b732
AK
2984 }
2985
7ffd92c5 2986 if (vmx->rmode.vm86_active &&
6aa8b732 2987 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
72d6e5a0 2988 error_code)) {
ad312c7c
ZX
2989 if (vcpu->arch.halt_request) {
2990 vcpu->arch.halt_request = 0;
72d6e5a0
AK
2991 return kvm_emulate_halt(vcpu);
2992 }
6aa8b732 2993 return 1;
72d6e5a0 2994 }
6aa8b732 2995
d0bfb940 2996 ex_no = intr_info & INTR_INFO_VECTOR_MASK;
42dbaa5a
JK
2997 switch (ex_no) {
2998 case DB_VECTOR:
2999 dr6 = vmcs_readl(EXIT_QUALIFICATION);
3000 if (!(vcpu->guest_debug &
3001 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
3002 vcpu->arch.dr6 = dr6 | DR6_FIXED_1;
3003 kvm_queue_exception(vcpu, DB_VECTOR);
3004 return 1;
3005 }
3006 kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
3007 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
3008 /* fall through */
3009 case BP_VECTOR:
c573cd22
JK
3010 /*
3011 * Update instruction length as we may reinject #BP from
3012 * user space while in guest debugging mode. Reading it for
3013 * #DB as well causes no harm, it is not used in that case.
3014 */
3015 vmx->vcpu.arch.event_exit_inst_len =
3016 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
6aa8b732 3017 kvm_run->exit_reason = KVM_EXIT_DEBUG;
d0bfb940
JK
3018 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
3019 kvm_run->debug.arch.exception = ex_no;
42dbaa5a
JK
3020 break;
3021 default:
d0bfb940
JK
3022 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
3023 kvm_run->ex.exception = ex_no;
3024 kvm_run->ex.error_code = error_code;
42dbaa5a 3025 break;
6aa8b732 3026 }
6aa8b732
AK
3027 return 0;
3028}
3029
851ba692 3030static int handle_external_interrupt(struct kvm_vcpu *vcpu)
6aa8b732 3031{
1165f5fe 3032 ++vcpu->stat.irq_exits;
6aa8b732
AK
3033 return 1;
3034}
3035
851ba692 3036static int handle_triple_fault(struct kvm_vcpu *vcpu)
988ad74f 3037{
851ba692 3038 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
988ad74f
AK
3039 return 0;
3040}
6aa8b732 3041
851ba692 3042static int handle_io(struct kvm_vcpu *vcpu)
6aa8b732 3043{
bfdaab09 3044 unsigned long exit_qualification;
34c33d16 3045 int size, in, string;
039576c0 3046 unsigned port;
6aa8b732 3047
bfdaab09 3048 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
039576c0 3049 string = (exit_qualification & 16) != 0;
cf8f70bf 3050 in = (exit_qualification & 8) != 0;
e70669ab 3051
cf8f70bf 3052 ++vcpu->stat.io_exits;
e70669ab 3053
cf8f70bf
GN
3054 if (string || in)
3055 return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
e70669ab 3056
cf8f70bf
GN
3057 port = exit_qualification >> 16;
3058 size = (exit_qualification & 7) + 1;
e93f36bc 3059 skip_emulated_instruction(vcpu);
cf8f70bf
GN
3060
3061 return kvm_fast_pio_out(vcpu, size, port);
6aa8b732
AK
3062}
3063
102d8325
IM
3064static void
3065vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
3066{
3067 /*
3068 * Patch in the VMCALL instruction:
3069 */
3070 hypercall[0] = 0x0f;
3071 hypercall[1] = 0x01;
3072 hypercall[2] = 0xc1;
102d8325
IM
3073}
3074
851ba692 3075static int handle_cr(struct kvm_vcpu *vcpu)
6aa8b732 3076{
229456fc 3077 unsigned long exit_qualification, val;
6aa8b732
AK
3078 int cr;
3079 int reg;
3080
bfdaab09 3081 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6aa8b732
AK
3082 cr = exit_qualification & 15;
3083 reg = (exit_qualification >> 8) & 15;
3084 switch ((exit_qualification >> 4) & 3) {
3085 case 0: /* mov to cr */
229456fc
MT
3086 val = kvm_register_read(vcpu, reg);
3087 trace_kvm_cr_write(cr, val);
6aa8b732
AK
3088 switch (cr) {
3089 case 0:
229456fc 3090 kvm_set_cr0(vcpu, val);
6aa8b732
AK
3091 skip_emulated_instruction(vcpu);
3092 return 1;
3093 case 3:
229456fc 3094 kvm_set_cr3(vcpu, val);
6aa8b732
AK
3095 skip_emulated_instruction(vcpu);
3096 return 1;
3097 case 4:
229456fc 3098 kvm_set_cr4(vcpu, val);
6aa8b732
AK
3099 skip_emulated_instruction(vcpu);
3100 return 1;
0a5fff19
GN
3101 case 8: {
3102 u8 cr8_prev = kvm_get_cr8(vcpu);
3103 u8 cr8 = kvm_register_read(vcpu, reg);
3104 kvm_set_cr8(vcpu, cr8);
3105 skip_emulated_instruction(vcpu);
3106 if (irqchip_in_kernel(vcpu->kvm))
3107 return 1;
3108 if (cr8_prev <= cr8)
3109 return 1;
851ba692 3110 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
0a5fff19
GN
3111 return 0;
3112 }
6aa8b732
AK
3113 };
3114 break;
25c4c276 3115 case 2: /* clts */
edcafe3c 3116 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
4d4ec087 3117 trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
25c4c276 3118 skip_emulated_instruction(vcpu);
6b52d186 3119 vmx_fpu_activate(vcpu);
25c4c276 3120 return 1;
6aa8b732
AK
3121 case 1: /*mov from cr*/
3122 switch (cr) {
3123 case 3:
5fdbf976 3124 kvm_register_write(vcpu, reg, vcpu->arch.cr3);
229456fc 3125 trace_kvm_cr_read(cr, vcpu->arch.cr3);
6aa8b732
AK
3126 skip_emulated_instruction(vcpu);
3127 return 1;
3128 case 8:
229456fc
MT
3129 val = kvm_get_cr8(vcpu);
3130 kvm_register_write(vcpu, reg, val);
3131 trace_kvm_cr_read(cr, val);
6aa8b732
AK
3132 skip_emulated_instruction(vcpu);
3133 return 1;
3134 }
3135 break;
3136 case 3: /* lmsw */
a1f83a74 3137 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
4d4ec087 3138 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
a1f83a74 3139 kvm_lmsw(vcpu, val);
6aa8b732
AK
3140
3141 skip_emulated_instruction(vcpu);
3142 return 1;
3143 default:
3144 break;
3145 }
851ba692 3146 vcpu->run->exit_reason = 0;
f0242478 3147 pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
6aa8b732
AK
3148 (int)(exit_qualification >> 4) & 3, cr);
3149 return 0;
3150}
3151
851ba692 3152static int handle_dr(struct kvm_vcpu *vcpu)
6aa8b732 3153{
bfdaab09 3154 unsigned long exit_qualification;
6aa8b732
AK
3155 int dr, reg;
3156
f2483415 3157 /* Do not handle if the CPL > 0, will trigger GP on re-entry */
0a79b009
AK
3158 if (!kvm_require_cpl(vcpu, 0))
3159 return 1;
42dbaa5a
JK
3160 dr = vmcs_readl(GUEST_DR7);
3161 if (dr & DR7_GD) {
3162 /*
3163 * As the vm-exit takes precedence over the debug trap, we
3164 * need to emulate the latter, either for the host or the
3165 * guest debugging itself.
3166 */
3167 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
851ba692
AK
3168 vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
3169 vcpu->run->debug.arch.dr7 = dr;
3170 vcpu->run->debug.arch.pc =
42dbaa5a
JK
3171 vmcs_readl(GUEST_CS_BASE) +
3172 vmcs_readl(GUEST_RIP);
851ba692
AK
3173 vcpu->run->debug.arch.exception = DB_VECTOR;
3174 vcpu->run->exit_reason = KVM_EXIT_DEBUG;
42dbaa5a
JK
3175 return 0;
3176 } else {
3177 vcpu->arch.dr7 &= ~DR7_GD;
3178 vcpu->arch.dr6 |= DR6_BD;
3179 vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
3180 kvm_queue_exception(vcpu, DB_VECTOR);
3181 return 1;
3182 }
3183 }
3184
bfdaab09 3185 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
42dbaa5a
JK
3186 dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
3187 reg = DEBUG_REG_ACCESS_REG(exit_qualification);
3188 if (exit_qualification & TYPE_MOV_FROM_DR) {
020df079
GN
3189 unsigned long val;
3190 if (!kvm_get_dr(vcpu, dr, &val))
3191 kvm_register_write(vcpu, reg, val);
3192 } else
3193 kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]);
6aa8b732
AK
3194 skip_emulated_instruction(vcpu);
3195 return 1;
3196}
3197
020df079
GN
3198static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
3199{
3200 vmcs_writel(GUEST_DR7, val);
3201}
3202
851ba692 3203static int handle_cpuid(struct kvm_vcpu *vcpu)
6aa8b732 3204{
06465c5a
AK
3205 kvm_emulate_cpuid(vcpu);
3206 return 1;
6aa8b732
AK
3207}
3208
851ba692 3209static int handle_rdmsr(struct kvm_vcpu *vcpu)
6aa8b732 3210{
ad312c7c 3211 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
6aa8b732
AK
3212 u64 data;
3213
3214 if (vmx_get_msr(vcpu, ecx, &data)) {
59200273 3215 trace_kvm_msr_read_ex(ecx);
c1a5d4f9 3216 kvm_inject_gp(vcpu, 0);
6aa8b732
AK
3217 return 1;
3218 }
3219
229456fc 3220 trace_kvm_msr_read(ecx, data);
2714d1d3 3221
6aa8b732 3222 /* FIXME: handling of bits 32:63 of rax, rdx */
ad312c7c
ZX
3223 vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
3224 vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
6aa8b732
AK
3225 skip_emulated_instruction(vcpu);
3226 return 1;
3227}
3228
851ba692 3229static int handle_wrmsr(struct kvm_vcpu *vcpu)
6aa8b732 3230{
ad312c7c
ZX
3231 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
3232 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
3233 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
6aa8b732
AK
3234
3235 if (vmx_set_msr(vcpu, ecx, data) != 0) {
59200273 3236 trace_kvm_msr_write_ex(ecx, data);
c1a5d4f9 3237 kvm_inject_gp(vcpu, 0);
6aa8b732
AK
3238 return 1;
3239 }
3240
59200273 3241 trace_kvm_msr_write(ecx, data);
6aa8b732
AK
3242 skip_emulated_instruction(vcpu);
3243 return 1;
3244}
3245
851ba692 3246static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
6e5d865c
YS
3247{
3248 return 1;
3249}
3250
851ba692 3251static int handle_interrupt_window(struct kvm_vcpu *vcpu)
6aa8b732 3252{
85f455f7
ED
3253 u32 cpu_based_vm_exec_control;
3254
3255 /* clear pending irq */
3256 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
3257 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
3258 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2714d1d3 3259
a26bf12a 3260 ++vcpu->stat.irq_window_exits;
2714d1d3 3261
c1150d8c
DL
3262 /*
3263 * If userspace is waiting to inject interrupts, exit as soon as
3264 * possible.
3265 */
8061823a 3266 if (!irqchip_in_kernel(vcpu->kvm) &&
851ba692 3267 vcpu->run->request_interrupt_window &&
8061823a 3268 !kvm_cpu_has_interrupt(vcpu)) {
851ba692 3269 vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
c1150d8c
DL
3270 return 0;
3271 }
6aa8b732
AK
3272 return 1;
3273}
3274
851ba692 3275static int handle_halt(struct kvm_vcpu *vcpu)
6aa8b732
AK
3276{
3277 skip_emulated_instruction(vcpu);
d3bef15f 3278 return kvm_emulate_halt(vcpu);
6aa8b732
AK
3279}
3280
851ba692 3281static int handle_vmcall(struct kvm_vcpu *vcpu)
c21415e8 3282{
510043da 3283 skip_emulated_instruction(vcpu);
7aa81cc0
AL
3284 kvm_emulate_hypercall(vcpu);
3285 return 1;
c21415e8
IM
3286}
3287
851ba692 3288static int handle_vmx_insn(struct kvm_vcpu *vcpu)
e3c7cb6a
AK
3289{
3290 kvm_queue_exception(vcpu, UD_VECTOR);
3291 return 1;
3292}
3293
851ba692 3294static int handle_invlpg(struct kvm_vcpu *vcpu)
a7052897 3295{
f9c617f6 3296 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
a7052897
MT
3297
3298 kvm_mmu_invlpg(vcpu, exit_qualification);
3299 skip_emulated_instruction(vcpu);
3300 return 1;
3301}
3302
851ba692 3303static int handle_wbinvd(struct kvm_vcpu *vcpu)
e5edaa01
ED
3304{
3305 skip_emulated_instruction(vcpu);
3306 /* TODO: Add support for VT-d/pass-through device */
3307 return 1;
3308}
3309
851ba692 3310static int handle_apic_access(struct kvm_vcpu *vcpu)
f78e0e2e 3311{
f9c617f6 3312 unsigned long exit_qualification;
f78e0e2e
SY
3313 enum emulation_result er;
3314 unsigned long offset;
3315
f9c617f6 3316 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
f78e0e2e
SY
3317 offset = exit_qualification & 0xffful;
3318
851ba692 3319 er = emulate_instruction(vcpu, 0, 0, 0);
f78e0e2e
SY
3320
3321 if (er != EMULATE_DONE) {
3322 printk(KERN_ERR
3323 "Failed to handle apic access vmexit! Offset is 0x%lx\n",
3324 offset);
7f582ab6 3325 return -ENOEXEC;
f78e0e2e
SY
3326 }
3327 return 1;
3328}
3329
851ba692 3330static int handle_task_switch(struct kvm_vcpu *vcpu)
37817f29 3331{
60637aac 3332 struct vcpu_vmx *vmx = to_vmx(vcpu);
37817f29 3333 unsigned long exit_qualification;
e269fb21
JK
3334 bool has_error_code = false;
3335 u32 error_code = 0;
37817f29 3336 u16 tss_selector;
64a7ec06
GN
3337 int reason, type, idt_v;
3338
3339 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
3340 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
37817f29
IE
3341
3342 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3343
3344 reason = (u32)exit_qualification >> 30;
64a7ec06
GN
3345 if (reason == TASK_SWITCH_GATE && idt_v) {
3346 switch (type) {
3347 case INTR_TYPE_NMI_INTR:
3348 vcpu->arch.nmi_injected = false;
3349 if (cpu_has_virtual_nmis())
3350 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3351 GUEST_INTR_STATE_NMI);
3352 break;
3353 case INTR_TYPE_EXT_INTR:
66fd3f7f 3354 case INTR_TYPE_SOFT_INTR:
64a7ec06
GN
3355 kvm_clear_interrupt_queue(vcpu);
3356 break;
3357 case INTR_TYPE_HARD_EXCEPTION:
e269fb21
JK
3358 if (vmx->idt_vectoring_info &
3359 VECTORING_INFO_DELIVER_CODE_MASK) {
3360 has_error_code = true;
3361 error_code =
3362 vmcs_read32(IDT_VECTORING_ERROR_CODE);
3363 }
3364 /* fall through */
64a7ec06
GN
3365 case INTR_TYPE_SOFT_EXCEPTION:
3366 kvm_clear_exception_queue(vcpu);
3367 break;
3368 default:
3369 break;
3370 }
60637aac 3371 }
37817f29
IE
3372 tss_selector = exit_qualification;
3373
64a7ec06
GN
3374 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
3375 type != INTR_TYPE_EXT_INTR &&
3376 type != INTR_TYPE_NMI_INTR))
3377 skip_emulated_instruction(vcpu);
3378
acb54517
GN
3379 if (kvm_task_switch(vcpu, tss_selector, reason,
3380 has_error_code, error_code) == EMULATE_FAIL) {
3381 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3382 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3383 vcpu->run->internal.ndata = 0;
42dbaa5a 3384 return 0;
acb54517 3385 }
42dbaa5a
JK
3386
3387 /* clear all local breakpoint enable flags */
3388 vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55);
3389
3390 /*
3391 * TODO: What about debug traps on tss switch?
3392 * Are we supposed to inject them and update dr6?
3393 */
3394
3395 return 1;
37817f29
IE
3396}
3397
851ba692 3398static int handle_ept_violation(struct kvm_vcpu *vcpu)
1439442c 3399{
f9c617f6 3400 unsigned long exit_qualification;
1439442c 3401 gpa_t gpa;
1439442c 3402 int gla_validity;
1439442c 3403
f9c617f6 3404 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
1439442c
SY
3405
3406 if (exit_qualification & (1 << 6)) {
3407 printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
7f582ab6 3408 return -EINVAL;
1439442c
SY
3409 }
3410
3411 gla_validity = (exit_qualification >> 7) & 0x3;
3412 if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
3413 printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
3414 printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
3415 (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
f9c617f6 3416 vmcs_readl(GUEST_LINEAR_ADDRESS));
1439442c
SY
3417 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
3418 (long unsigned int)exit_qualification);
851ba692
AK
3419 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
3420 vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
596ae895 3421 return 0;
1439442c
SY
3422 }
3423
3424 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
229456fc 3425 trace_kvm_page_fault(gpa, exit_qualification);
49cd7d22 3426 return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
1439442c
SY
3427}
3428
68f89400
MT
3429static u64 ept_rsvd_mask(u64 spte, int level)
3430{
3431 int i;
3432 u64 mask = 0;
3433
3434 for (i = 51; i > boot_cpu_data.x86_phys_bits; i--)
3435 mask |= (1ULL << i);
3436
3437 if (level > 2)
3438 /* bits 7:3 reserved */
3439 mask |= 0xf8;
3440 else if (level == 2) {
3441 if (spte & (1ULL << 7))
3442 /* 2MB ref, bits 20:12 reserved */
3443 mask |= 0x1ff000;
3444 else
3445 /* bits 6:3 reserved */
3446 mask |= 0x78;
3447 }
3448
3449 return mask;
3450}
3451
3452static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
3453 int level)
3454{
3455 printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level);
3456
3457 /* 010b (write-only) */
3458 WARN_ON((spte & 0x7) == 0x2);
3459
3460 /* 110b (write/execute) */
3461 WARN_ON((spte & 0x7) == 0x6);
3462
3463 /* 100b (execute-only) and value not supported by logical processor */
3464 if (!cpu_has_vmx_ept_execute_only())
3465 WARN_ON((spte & 0x7) == 0x4);
3466
3467 /* not 000b */
3468 if ((spte & 0x7)) {
3469 u64 rsvd_bits = spte & ept_rsvd_mask(spte, level);
3470
3471 if (rsvd_bits != 0) {
3472 printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n",
3473 __func__, rsvd_bits);
3474 WARN_ON(1);
3475 }
3476
3477 if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) {
3478 u64 ept_mem_type = (spte & 0x38) >> 3;
3479
3480 if (ept_mem_type == 2 || ept_mem_type == 3 ||
3481 ept_mem_type == 7) {
3482 printk(KERN_ERR "%s: ept_mem_type=0x%llx\n",
3483 __func__, ept_mem_type);
3484 WARN_ON(1);
3485 }
3486 }
3487 }
3488}
3489
851ba692 3490static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
68f89400
MT
3491{
3492 u64 sptes[4];
3493 int nr_sptes, i;
3494 gpa_t gpa;
3495
3496 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
3497
3498 printk(KERN_ERR "EPT: Misconfiguration.\n");
3499 printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
3500
3501 nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes);
3502
3503 for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
3504 ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
3505
851ba692
AK
3506 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
3507 vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
68f89400
MT
3508
3509 return 0;
3510}
3511
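/*
 * NMI-window exiting is requested when an NMI is pending but cannot yet be
 * injected; once the window opens we land here, drop the request and let
 * the normal injection path deliver the NMI on the next entry.
 */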
851ba692 3512static int handle_nmi_window(struct kvm_vcpu *vcpu)
f08864b4
SY
3513{
3514 u32 cpu_based_vm_exec_control;
3515
3516 /* clear pending NMI */
3517 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
3518 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
3519 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
3520 ++vcpu->stat.nmi_window_exits;
3521
3522 return 1;
3523}
3524
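/*
 * With emulate_invalid_guest_state, emulate the guest one instruction at a
 * time until its register state is acceptable to VMX again; MMIO or an
 * emulation failure drops us back to userspace.
 */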
80ced186 3525static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
ea953ef0 3526{
8b3079a5
AK
3527 struct vcpu_vmx *vmx = to_vmx(vcpu);
3528 enum emulation_result err = EMULATE_DONE;
80ced186 3529 int ret = 1;
ea953ef0
MG
3530
3531 while (!guest_state_valid(vcpu)) {
851ba692 3532 err = emulate_instruction(vcpu, 0, 0, 0);
ea953ef0 3533
80ced186
MG
3534 if (err == EMULATE_DO_MMIO) {
3535 ret = 0;
3536 goto out;
3537 }
1d5a4d9b
GT
3538
3539 if (err != EMULATE_DONE) {
80ced186
MG
3540 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3541 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
a9c7399d 3542 vcpu->run->internal.ndata = 0;
80ced186
MG
3543 ret = 0;
3544 goto out;
ea953ef0
MG
3545 }
3546
3547 if (signal_pending(current))
80ced186 3548 goto out;
ea953ef0
MG
3549 if (need_resched())
3550 schedule();
3551 }
3552
80ced186
MG
3553 vmx->emulation_required = 0;
3554out:
3555 return ret;
ea953ef0
MG
3556}
3557
4b8d54f9
ZE
3558/*
3559 * A PAUSE-loop exit indicates a vcpu busy-waiting on a spinlock. We do not
3560 * enable plain PAUSE exiting, so we only get here on CPUs with Pause-Loop Exiting.
3561 */
9fb41ba8 3562static int handle_pause(struct kvm_vcpu *vcpu)
4b8d54f9
ZE
3563{
3564 skip_emulated_instruction(vcpu);
3565 kvm_vcpu_on_spin(vcpu);
3566
3567 return 1;
3568}
3569
59708670
SY
3570static int handle_invalid_op(struct kvm_vcpu *vcpu)
3571{
3572 kvm_queue_exception(vcpu, UD_VECTOR);
3573 return 1;
3574}
3575
6aa8b732
AK
3576/*
3577 * The exit handlers return 1 if the exit was handled fully and guest execution
3578 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
3579 * to be done to userspace and return 0.
3580 */
851ba692 3581static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
6aa8b732
AK
3582 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
3583 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
988ad74f 3584 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
f08864b4 3585 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
6aa8b732 3586 [EXIT_REASON_IO_INSTRUCTION] = handle_io,
6aa8b732
AK
3587 [EXIT_REASON_CR_ACCESS] = handle_cr,
3588 [EXIT_REASON_DR_ACCESS] = handle_dr,
3589 [EXIT_REASON_CPUID] = handle_cpuid,
3590 [EXIT_REASON_MSR_READ] = handle_rdmsr,
3591 [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
3592 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
3593 [EXIT_REASON_HLT] = handle_halt,
a7052897 3594 [EXIT_REASON_INVLPG] = handle_invlpg,
c21415e8 3595 [EXIT_REASON_VMCALL] = handle_vmcall,
e3c7cb6a
AK
3596 [EXIT_REASON_VMCLEAR] = handle_vmx_insn,
3597 [EXIT_REASON_VMLAUNCH] = handle_vmx_insn,
3598 [EXIT_REASON_VMPTRLD] = handle_vmx_insn,
3599 [EXIT_REASON_VMPTRST] = handle_vmx_insn,
3600 [EXIT_REASON_VMREAD] = handle_vmx_insn,
3601 [EXIT_REASON_VMRESUME] = handle_vmx_insn,
3602 [EXIT_REASON_VMWRITE] = handle_vmx_insn,
3603 [EXIT_REASON_VMOFF] = handle_vmx_insn,
3604 [EXIT_REASON_VMON] = handle_vmx_insn,
f78e0e2e
SY
3605 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
3606 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
e5edaa01 3607 [EXIT_REASON_WBINVD] = handle_wbinvd,
37817f29 3608 [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
a0861c02 3609 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
68f89400
MT
3610 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
3611 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
4b8d54f9 3612 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
59708670
SY
3613 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op,
3614 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op,
6aa8b732
AK
3615};
3616
3617static const int kvm_vmx_max_exit_handlers =
50a3485c 3618 ARRAY_SIZE(kvm_vmx_exit_handlers);
6aa8b732
AK
3619
3620/*
3621 * The guest has exited. See if we can fix it or if we need userspace
3622 * assistance.
3623 */
851ba692 3624static int vmx_handle_exit(struct kvm_vcpu *vcpu)
6aa8b732 3625{
29bd8a78 3626 struct vcpu_vmx *vmx = to_vmx(vcpu);
a0861c02 3627 u32 exit_reason = vmx->exit_reason;
1155f76a 3628 u32 vectoring_info = vmx->idt_vectoring_info;
29bd8a78 3629
5bfd8b54 3630 trace_kvm_exit(exit_reason, vcpu);
2714d1d3 3631
80ced186
MG
3632 /* If guest state is invalid, start emulating */
3633 if (vmx->emulation_required && emulate_invalid_guest_state)
3634 return handle_invalid_guest_state(vcpu);
1d5a4d9b 3635
1439442c
SY
3636	/* Accesses to CR3 don't cause a VM exit in paging mode, so we need
3637	 * to sync with the guest's real CR3. */
6de4f3ad 3638 if (enable_ept && is_paging(vcpu))
1439442c 3639 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
1439442c 3640
29bd8a78 3641 if (unlikely(vmx->fail)) {
851ba692
AK
3642 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3643 vcpu->run->fail_entry.hardware_entry_failure_reason
29bd8a78
AK
3644 = vmcs_read32(VM_INSTRUCTION_ERROR);
3645 return 0;
3646 }
6aa8b732 3647
d77c26fc 3648 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
1439442c 3649 (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
60637aac
JK
3650 exit_reason != EXIT_REASON_EPT_VIOLATION &&
3651 exit_reason != EXIT_REASON_TASK_SWITCH))
3652 printk(KERN_WARNING "%s: unexpected, valid vectoring info "
3653 "(0x%x) and exit reason is 0x%x\n",
3654 __func__, vectoring_info, exit_reason);
3b86cd99
JK
3655
3656 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
c4282df9 3657 if (vmx_interrupt_allowed(vcpu)) {
3b86cd99 3658 vmx->soft_vnmi_blocked = 0;
3b86cd99 3659 } else if (vmx->vnmi_blocked_time > 1000000000LL &&
4531220b 3660 vcpu->arch.nmi_pending) {
3b86cd99
JK
3661 /*
3662			 * This CPU doesn't let us detect the end of an
3663 * NMI-blocked window if the guest runs with IRQs
3664 * disabled. So we pull the trigger after 1 s of
3665 * futile waiting, but inform the user about this.
3666 */
3667 printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
3668 "state on VCPU %d after 1 s timeout\n",
3669 __func__, vcpu->vcpu_id);
3670 vmx->soft_vnmi_blocked = 0;
3b86cd99 3671 }
3b86cd99
JK
3672 }
3673
6aa8b732
AK
3674 if (exit_reason < kvm_vmx_max_exit_handlers
3675 && kvm_vmx_exit_handlers[exit_reason])
851ba692 3676 return kvm_vmx_exit_handlers[exit_reason](vcpu);
6aa8b732 3677 else {
851ba692
AK
3678 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
3679 vcpu->run->hw.hardware_exit_reason = exit_reason;
6aa8b732
AK
3680 }
3681 return 0;
3682}
3683
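/*
 * Program the TPR threshold: with no pending interrupt (irr == -1), or when
 * the guest's TPR already allows delivery (tpr < irr), clear the threshold;
 * otherwise a TPR-below-threshold exit fires once the guest lowers its TPR
 * enough for the pending interrupt to be injected.
 */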
95ba8273 3684static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
6e5d865c 3685{
95ba8273 3686 if (irr == -1 || tpr < irr) {
6e5d865c
YS
3687 vmcs_write32(TPR_THRESHOLD, 0);
3688 return;
3689 }
3690
95ba8273 3691 vmcs_write32(TPR_THRESHOLD, irr);
6e5d865c
YS
3692}
3693
cf393f75
AK
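/*
 * Runs on every VM exit, before interrupts are re-enabled: handle machine
 * checks and NMIs that hit in guest mode, keep the NMI-blocking state
 * consistent, and requeue any event whose delivery was interrupted by the
 * exit (per the IDT-vectoring info) so it is reinjected on the next entry.
 */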
3694static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3695{
3696 u32 exit_intr_info;
7b4a25cb 3697 u32 idt_vectoring_info = vmx->idt_vectoring_info;
cf393f75
AK
3698 bool unblock_nmi;
3699 u8 vector;
668f612f
AK
3700 int type;
3701 bool idtv_info_valid;
cf393f75
AK
3702
3703 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
20f65983 3704
a0861c02
AK
3705 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
3706
3707 /* Handle machine checks before interrupts are enabled */
3708 if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
3709 || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
3710 && is_machine_check(exit_intr_info)))
3711 kvm_machine_check();
3712
20f65983
GN
3713 /* We need to handle NMIs before interrupts are enabled */
3714 if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
ff9d07a0
ZY
3715 (exit_intr_info & INTR_INFO_VALID_MASK)) {
3716 kvm_before_handle_nmi(&vmx->vcpu);
20f65983 3717 asm("int $2");
ff9d07a0
ZY
3718 kvm_after_handle_nmi(&vmx->vcpu);
3719 }
20f65983
GN
3720
3721 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3722
cf393f75
AK
3723 if (cpu_has_virtual_nmis()) {
3724 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
3725 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
3726 /*
7b4a25cb 3727 * SDM 3: 27.7.1.2 (September 2008)
cf393f75
AK
3728 * Re-set bit "block by NMI" before VM entry if vmexit caused by
3729 * a guest IRET fault.
7b4a25cb
GN
3730 * SDM 3: 23.2.2 (September 2008)
3731 * Bit 12 is undefined in any of the following cases:
3732 * If the VM exit sets the valid bit in the IDT-vectoring
3733 * information field.
3734 * If the VM exit is due to a double fault.
cf393f75 3735 */
7b4a25cb
GN
3736 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
3737 vector != DF_VECTOR && !idtv_info_valid)
cf393f75
AK
3738 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3739 GUEST_INTR_STATE_NMI);
3b86cd99
JK
3740 } else if (unlikely(vmx->soft_vnmi_blocked))
3741 vmx->vnmi_blocked_time +=
3742 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
668f612f 3743
37b96e98
GN
3744 vmx->vcpu.arch.nmi_injected = false;
3745 kvm_clear_exception_queue(&vmx->vcpu);
3746 kvm_clear_interrupt_queue(&vmx->vcpu);
3747
3748 if (!idtv_info_valid)
3749 return;
3750
668f612f
AK
3751 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
3752 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
37b96e98 3753
64a7ec06 3754 switch (type) {
37b96e98
GN
3755 case INTR_TYPE_NMI_INTR:
3756 vmx->vcpu.arch.nmi_injected = true;
668f612f 3757 /*
7b4a25cb 3758 * SDM 3: 27.7.1.2 (September 2008)
37b96e98
GN
3759		 * Clear bit "block by NMI" before VM entry if an NMI
3760 * delivery faulted.
668f612f 3761 */
37b96e98
GN
3762 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
3763 GUEST_INTR_STATE_NMI);
3764 break;
37b96e98 3765 case INTR_TYPE_SOFT_EXCEPTION:
66fd3f7f
GN
3766 vmx->vcpu.arch.event_exit_inst_len =
3767 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3768 /* fall through */
3769 case INTR_TYPE_HARD_EXCEPTION:
35920a35 3770 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
37b96e98
GN
3771 u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE);
3772 kvm_queue_exception_e(&vmx->vcpu, vector, err);
35920a35
AK
3773 } else
3774 kvm_queue_exception(&vmx->vcpu, vector);
37b96e98 3775 break;
66fd3f7f
GN
3776 case INTR_TYPE_SOFT_INTR:
3777 vmx->vcpu.arch.event_exit_inst_len =
3778 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3779 /* fall through */
37b96e98 3780 case INTR_TYPE_EXT_INTR:
66fd3f7f
GN
3781 kvm_queue_interrupt(&vmx->vcpu, vector,
3782 type == INTR_TYPE_SOFT_INTR);
37b96e98
GN
3783 break;
3784 default:
3785 break;
f7d9238f 3786 }
cf393f75
AK
3787}
3788
9c8cba37
AK
3789/*
3790 * Failure to inject an interrupt should give us the information
3791 * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs
3792 * when fetching the interrupt redirection bitmap in the real-mode
3793 * tss, this doesn't happen. So we do it ourselves.
3794 */
3795static void fixup_rmode_irq(struct vcpu_vmx *vmx)
3796{
3797 vmx->rmode.irq.pending = 0;
5fdbf976 3798 if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip)
9c8cba37 3799 return;
5fdbf976 3800 kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip);
9c8cba37
AK
3801 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
3802 vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
3803 vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
3804 return;
3805 }
3806 vmx->idt_vectoring_info =
3807 VECTORING_INFO_VALID_MASK
3808 | INTR_TYPE_EXT_INTR
3809 | vmx->rmode.irq.vector;
3810}
3811
c801949d
AK
3812#ifdef CONFIG_X86_64
3813#define R "r"
3814#define Q "q"
3815#else
3816#define R "e"
3817#define Q "l"
3818#endif
3819
851ba692 3820static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
6aa8b732 3821{
a2fa3e9f 3822 struct vcpu_vmx *vmx = to_vmx(vcpu);
e6adf283 3823
3b86cd99
JK
3824 /* Record the guest's net vcpu time for enforced NMI injections. */
3825 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
3826 vmx->entry_time = ktime_get();
3827
80ced186
MG
3828	/* Don't enter VMX if guest state is invalid; let the exit handler
3829	   start emulation until we arrive back at a valid state */
3830 if (vmx->emulation_required && emulate_invalid_guest_state)
a89a8fb9 3831 return;
a89a8fb9 3832
5fdbf976
MT
3833 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
3834 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
3835 if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
3836 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
3837
787ff736
GN
3838 /* When single-stepping over STI and MOV SS, we must clear the
3839 * corresponding interruptibility bits in the guest state. Otherwise
3840	 * vmentry fails as it then expects bit 14 (BS) of the pending debug
3841	 * exceptions field to be set, but that's not correct for the guest
3842	 * debugging case. */
3843 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3844 vmx_set_interrupt_shadow(vcpu, 0);
3845
e6adf283
AK
3846 /*
3847 * Loading guest fpu may have cleared host cr0.ts
3848 */
3849 vmcs_writel(HOST_CR0, read_cr0());
3850
d77c26fc 3851 asm(
6aa8b732 3852 /* Store host registers */
c801949d
AK
3853 "push %%"R"dx; push %%"R"bp;"
3854 "push %%"R"cx \n\t"
313dbd49
AK
3855 "cmp %%"R"sp, %c[host_rsp](%0) \n\t"
3856 "je 1f \n\t"
3857 "mov %%"R"sp, %c[host_rsp](%0) \n\t"
4ecac3fd 3858 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
313dbd49 3859 "1: \n\t"
d3edefc0
AK
3860 /* Reload cr2 if changed */
3861 "mov %c[cr2](%0), %%"R"ax \n\t"
3862 "mov %%cr2, %%"R"dx \n\t"
3863 "cmp %%"R"ax, %%"R"dx \n\t"
3864 "je 2f \n\t"
3865 "mov %%"R"ax, %%cr2 \n\t"
3866 "2: \n\t"
6aa8b732 3867		/* Check if vmlaunch or vmresume is needed */
e08aa78a 3868 "cmpl $0, %c[launched](%0) \n\t"
6aa8b732 3869 /* Load guest registers. Don't clobber flags. */
c801949d
AK
3870 "mov %c[rax](%0), %%"R"ax \n\t"
3871 "mov %c[rbx](%0), %%"R"bx \n\t"
3872 "mov %c[rdx](%0), %%"R"dx \n\t"
3873 "mov %c[rsi](%0), %%"R"si \n\t"
3874 "mov %c[rdi](%0), %%"R"di \n\t"
3875 "mov %c[rbp](%0), %%"R"bp \n\t"
05b3e0c2 3876#ifdef CONFIG_X86_64
e08aa78a
AK
3877 "mov %c[r8](%0), %%r8 \n\t"
3878 "mov %c[r9](%0), %%r9 \n\t"
3879 "mov %c[r10](%0), %%r10 \n\t"
3880 "mov %c[r11](%0), %%r11 \n\t"
3881 "mov %c[r12](%0), %%r12 \n\t"
3882 "mov %c[r13](%0), %%r13 \n\t"
3883 "mov %c[r14](%0), %%r14 \n\t"
3884 "mov %c[r15](%0), %%r15 \n\t"
6aa8b732 3885#endif
c801949d
AK
3886 "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */
3887
6aa8b732 3888 /* Enter guest mode */
cd2276a7 3889 "jne .Llaunched \n\t"
4ecac3fd 3890 __ex(ASM_VMX_VMLAUNCH) "\n\t"
cd2276a7 3891 "jmp .Lkvm_vmx_return \n\t"
4ecac3fd 3892 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
cd2276a7 3893 ".Lkvm_vmx_return: "
6aa8b732 3894 /* Save guest registers, load host registers, keep flags */
c801949d
AK
3895 "xchg %0, (%%"R"sp) \n\t"
3896 "mov %%"R"ax, %c[rax](%0) \n\t"
3897 "mov %%"R"bx, %c[rbx](%0) \n\t"
3898 "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t"
3899 "mov %%"R"dx, %c[rdx](%0) \n\t"
3900 "mov %%"R"si, %c[rsi](%0) \n\t"
3901 "mov %%"R"di, %c[rdi](%0) \n\t"
3902 "mov %%"R"bp, %c[rbp](%0) \n\t"
05b3e0c2 3903#ifdef CONFIG_X86_64
e08aa78a
AK
3904 "mov %%r8, %c[r8](%0) \n\t"
3905 "mov %%r9, %c[r9](%0) \n\t"
3906 "mov %%r10, %c[r10](%0) \n\t"
3907 "mov %%r11, %c[r11](%0) \n\t"
3908 "mov %%r12, %c[r12](%0) \n\t"
3909 "mov %%r13, %c[r13](%0) \n\t"
3910 "mov %%r14, %c[r14](%0) \n\t"
3911 "mov %%r15, %c[r15](%0) \n\t"
6aa8b732 3912#endif
c801949d
AK
3913 "mov %%cr2, %%"R"ax \n\t"
3914 "mov %%"R"ax, %c[cr2](%0) \n\t"
3915
3916 "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t"
e08aa78a
AK
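		/* A failed VMLAUNCH/VMRESUME sets CF or ZF; setbe records that in vmx->fail */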
3917 "setbe %c[fail](%0) \n\t"
3918 : : "c"(vmx), "d"((unsigned long)HOST_RSP),
3919 [launched]"i"(offsetof(struct vcpu_vmx, launched)),
3920 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
313dbd49 3921 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
ad312c7c
ZX
3922 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
3923 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
3924 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
3925 [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
3926 [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
3927 [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
3928 [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
05b3e0c2 3929#ifdef CONFIG_X86_64
ad312c7c
ZX
3930 [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
3931 [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
3932 [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
3933 [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
3934 [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
3935 [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
3936 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
3937 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
6aa8b732 3938#endif
ad312c7c 3939 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
c2036300 3940 : "cc", "memory"
c801949d 3941 , R"bx", R"di", R"si"
c2036300 3942#ifdef CONFIG_X86_64
c2036300
LV
3943 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
3944#endif
3945 );
6aa8b732 3946
6de4f3ad
AK
3947 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
3948 | (1 << VCPU_EXREG_PDPTR));
5fdbf976
MT
3949 vcpu->arch.regs_dirty = 0;
3950
1155f76a 3951 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
9c8cba37
AK
3952 if (vmx->rmode.irq.pending)
3953 fixup_rmode_irq(vmx);
1155f76a 3954
d77c26fc 3955 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
15ad7146 3956 vmx->launched = 1;
1b6269db 3957
cf393f75 3958 vmx_complete_interrupts(vmx);
6aa8b732
AK
3959}
3960
c801949d
AK
3961#undef R
3962#undef Q
3963
6aa8b732
AK
3964static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
3965{
a2fa3e9f
GH
3966 struct vcpu_vmx *vmx = to_vmx(vcpu);
3967
3968 if (vmx->vmcs) {
543e4243 3969 vcpu_clear(vmx);
a2fa3e9f
GH
3970 free_vmcs(vmx->vmcs);
3971 vmx->vmcs = NULL;
6aa8b732
AK
3972 }
3973}
3974
3975static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
3976{
fb3f0f51
RR
3977 struct vcpu_vmx *vmx = to_vmx(vcpu);
3978
cdbecfc3 3979 free_vpid(vmx);
6aa8b732 3980 vmx_free_vmcs(vcpu);
fb3f0f51
RR
3981 kfree(vmx->guest_msrs);
3982 kvm_vcpu_uninit(vcpu);
a4770347 3983 kmem_cache_free(kvm_vcpu_cache, vmx);
6aa8b732
AK
3984}
3985
fb3f0f51 3986static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
6aa8b732 3987{
fb3f0f51 3988 int err;
c16f862d 3989 struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
15ad7146 3990 int cpu;
6aa8b732 3991
a2fa3e9f 3992 if (!vmx)
fb3f0f51
RR
3993 return ERR_PTR(-ENOMEM);
3994
2384d2b3
SY
3995 allocate_vpid(vmx);
3996
fb3f0f51
RR
3997 err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
3998 if (err)
3999 goto free_vcpu;
965b58a5 4000
a2fa3e9f 4001 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
fb3f0f51
RR
4002 if (!vmx->guest_msrs) {
4003 err = -ENOMEM;
4004 goto uninit_vcpu;
4005 }
965b58a5 4006
a2fa3e9f
GH
4007 vmx->vmcs = alloc_vmcs();
4008 if (!vmx->vmcs)
fb3f0f51 4009 goto free_msrs;
a2fa3e9f
GH
4010
4011 vmcs_clear(vmx->vmcs);
4012
15ad7146
AK
4013 cpu = get_cpu();
4014 vmx_vcpu_load(&vmx->vcpu, cpu);
8b9cf98c 4015 err = vmx_vcpu_setup(vmx);
fb3f0f51 4016 vmx_vcpu_put(&vmx->vcpu);
15ad7146 4017 put_cpu();
fb3f0f51
RR
4018 if (err)
4019 goto free_vmcs;
5e4a0b3c
MT
4020 if (vm_need_virtualize_apic_accesses(kvm))
4021 if (alloc_apic_access_page(kvm) != 0)
4022 goto free_vmcs;
fb3f0f51 4023
b927a3ce
SY
4024 if (enable_ept) {
4025 if (!kvm->arch.ept_identity_map_addr)
4026 kvm->arch.ept_identity_map_addr =
4027 VMX_EPT_IDENTITY_PAGETABLE_ADDR;
b7ebfb05
SY
4028 if (alloc_identity_pagetable(kvm) != 0)
4029 goto free_vmcs;
b927a3ce 4030 }
b7ebfb05 4031
fb3f0f51
RR
4032 return &vmx->vcpu;
4033
4034free_vmcs:
4035 free_vmcs(vmx->vmcs);
4036free_msrs:
fb3f0f51
RR
4037 kfree(vmx->guest_msrs);
4038uninit_vcpu:
4039 kvm_vcpu_uninit(&vmx->vcpu);
4040free_vcpu:
cdbecfc3 4041 free_vpid(vmx);
a4770347 4042 kmem_cache_free(kvm_vcpu_cache, vmx);
fb3f0f51 4043 return ERR_PTR(err);
6aa8b732
AK
4044}
4045
002c7f7c
YS
4046static void __init vmx_check_processor_compat(void *rtn)
4047{
4048 struct vmcs_config vmcs_conf;
4049
4050 *(int *)rtn = 0;
4051 if (setup_vmcs_config(&vmcs_conf) < 0)
4052 *(int *)rtn = -EIO;
4053 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
4054 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
4055 smp_processor_id());
4056 *(int *)rtn = -EIO;
4057 }
4058}
4059
67253af5
SY
4060static int get_ept_level(void)
4061{
4062 return VMX_EPT_DEFAULT_GAW + 1;
4063}
4064
4b12f0de 4065static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
64d4d521 4066{
4b12f0de
SY
4067 u64 ret;
4068
522c68c4
SY
4069 /* For VT-d and EPT combination
4070 * 1. MMIO: always map as UC
4071 * 2. EPT with VT-d:
4072	 * a. VT-d without snooping control: cache coherency can't be
4073	 *    guaranteed, so try to trust the guest's memory type.
4074	 * b. VT-d with snooping control: the snooping control feature of the
4075	 *    VT-d engine guarantees cache correctness, so just set it
4076	 *    to WB to stay consistent with the host. Same as item 3.
a19a6d11 4077 * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
522c68c4
SY
4078 * consistent with host MTRR
4079 */
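	/*
	 * For instance, case 3 below evaluates to
	 * (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT,
	 * i.e. (6 << 3) | (1 << 6) == 0x70 in the EPT entry.
	 */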
4b12f0de
SY
4080 if (is_mmio)
4081 ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
522c68c4
SY
4082 else if (vcpu->kvm->arch.iommu_domain &&
4083 !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY))
4084 ret = kvm_get_guest_memory_type(vcpu, gfn) <<
4085 VMX_EPT_MT_EPTE_SHIFT;
4b12f0de 4086 else
522c68c4 4087 ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
a19a6d11 4088 | VMX_EPT_IPAT_BIT;
4b12f0de
SY
4089
4090 return ret;
64d4d521
SY
4091}
4092
f4c9e87c
AK
4093#define _ER(x) { EXIT_REASON_##x, #x }
4094
229456fc 4095static const struct trace_print_flags vmx_exit_reasons_str[] = {
f4c9e87c
AK
4096 _ER(EXCEPTION_NMI),
4097 _ER(EXTERNAL_INTERRUPT),
4098 _ER(TRIPLE_FAULT),
4099 _ER(PENDING_INTERRUPT),
4100 _ER(NMI_WINDOW),
4101 _ER(TASK_SWITCH),
4102 _ER(CPUID),
4103 _ER(HLT),
4104 _ER(INVLPG),
4105 _ER(RDPMC),
4106 _ER(RDTSC),
4107 _ER(VMCALL),
4108 _ER(VMCLEAR),
4109 _ER(VMLAUNCH),
4110 _ER(VMPTRLD),
4111 _ER(VMPTRST),
4112 _ER(VMREAD),
4113 _ER(VMRESUME),
4114 _ER(VMWRITE),
4115 _ER(VMOFF),
4116 _ER(VMON),
4117 _ER(CR_ACCESS),
4118 _ER(DR_ACCESS),
4119 _ER(IO_INSTRUCTION),
4120 _ER(MSR_READ),
4121 _ER(MSR_WRITE),
4122 _ER(MWAIT_INSTRUCTION),
4123 _ER(MONITOR_INSTRUCTION),
4124 _ER(PAUSE_INSTRUCTION),
4125 _ER(MCE_DURING_VMENTRY),
4126 _ER(TPR_BELOW_THRESHOLD),
4127 _ER(APIC_ACCESS),
4128 _ER(EPT_VIOLATION),
4129 _ER(EPT_MISCONFIG),
4130 _ER(WBINVD),
229456fc
MT
4131 { -1, NULL }
4132};
4133
f4c9e87c
AK
4134#undef _ER
4135
17cc3935 4136static int vmx_get_lpage_level(void)
344f414f 4137{
878403b7
SY
4138 if (enable_ept && !cpu_has_vmx_ept_1g_page())
4139 return PT_DIRECTORY_LEVEL;
4140 else
4141 /* For shadow and EPT supported 1GB page */
4142 return PT_PDPE_LEVEL;
344f414f
JR
4143}
4144
4e47c7a6
SY
4145static inline u32 bit(int bitno)
4146{
4147 return 1 << (bitno & 31);
4148}
4149
0e851880
SY
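/*
 * Called after userspace updates the guest CPUID: keep the RDTSCP secondary
 * exec control enabled only if CPUID leaf 0x80000001 actually advertises
 * RDTSCP; otherwise clear it so that guest RDTSCP raises #UD.
 */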
4150static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
4151{
4e47c7a6
SY
4152 struct kvm_cpuid_entry2 *best;
4153 struct vcpu_vmx *vmx = to_vmx(vcpu);
4154 u32 exec_control;
4155
4156 vmx->rdtscp_enabled = false;
4157 if (vmx_rdtscp_supported()) {
4158 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
4159 if (exec_control & SECONDARY_EXEC_RDTSCP) {
4160 best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
4161 if (best && (best->edx & bit(X86_FEATURE_RDTSCP)))
4162 vmx->rdtscp_enabled = true;
4163 else {
4164 exec_control &= ~SECONDARY_EXEC_RDTSCP;
4165 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
4166 exec_control);
4167 }
4168 }
4169 }
0e851880
SY
4170}
4171
d4330ef2
JR
4172static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
4173{
4174}
4175
cbdd1bea 4176static struct kvm_x86_ops vmx_x86_ops = {
6aa8b732
AK
4177 .cpu_has_kvm_support = cpu_has_kvm_support,
4178 .disabled_by_bios = vmx_disabled_by_bios,
4179 .hardware_setup = hardware_setup,
4180 .hardware_unsetup = hardware_unsetup,
002c7f7c 4181 .check_processor_compatibility = vmx_check_processor_compat,
6aa8b732
AK
4182 .hardware_enable = hardware_enable,
4183 .hardware_disable = hardware_disable,
04547156 4184 .cpu_has_accelerated_tpr = report_flexpriority,
6aa8b732
AK
4185
4186 .vcpu_create = vmx_create_vcpu,
4187 .vcpu_free = vmx_free_vcpu,
04d2cc77 4188 .vcpu_reset = vmx_vcpu_reset,
6aa8b732 4189
04d2cc77 4190 .prepare_guest_switch = vmx_save_host_state,
6aa8b732
AK
4191 .vcpu_load = vmx_vcpu_load,
4192 .vcpu_put = vmx_vcpu_put,
4193
4194 .set_guest_debug = set_guest_debug,
4195 .get_msr = vmx_get_msr,
4196 .set_msr = vmx_set_msr,
4197 .get_segment_base = vmx_get_segment_base,
4198 .get_segment = vmx_get_segment,
4199 .set_segment = vmx_set_segment,
2e4d2653 4200 .get_cpl = vmx_get_cpl,
6aa8b732 4201 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
e8467fda 4202 .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
25c4c276 4203 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
6aa8b732 4204 .set_cr0 = vmx_set_cr0,
6aa8b732
AK
4205 .set_cr3 = vmx_set_cr3,
4206 .set_cr4 = vmx_set_cr4,
6aa8b732 4207 .set_efer = vmx_set_efer,
6aa8b732
AK
4208 .get_idt = vmx_get_idt,
4209 .set_idt = vmx_set_idt,
4210 .get_gdt = vmx_get_gdt,
4211 .set_gdt = vmx_set_gdt,
020df079 4212 .set_dr7 = vmx_set_dr7,
5fdbf976 4213 .cache_reg = vmx_cache_reg,
6aa8b732
AK
4214 .get_rflags = vmx_get_rflags,
4215 .set_rflags = vmx_set_rflags,
ebcbab4c 4216 .fpu_activate = vmx_fpu_activate,
02daab21 4217 .fpu_deactivate = vmx_fpu_deactivate,
6aa8b732
AK
4218
4219 .tlb_flush = vmx_flush_tlb,
6aa8b732 4220
6aa8b732 4221 .run = vmx_vcpu_run,
6062d012 4222 .handle_exit = vmx_handle_exit,
6aa8b732 4223 .skip_emulated_instruction = skip_emulated_instruction,
2809f5d2
GC
4224 .set_interrupt_shadow = vmx_set_interrupt_shadow,
4225 .get_interrupt_shadow = vmx_get_interrupt_shadow,
102d8325 4226 .patch_hypercall = vmx_patch_hypercall,
2a8067f1 4227 .set_irq = vmx_inject_irq,
95ba8273 4228 .set_nmi = vmx_inject_nmi,
298101da 4229 .queue_exception = vmx_queue_exception,
78646121 4230 .interrupt_allowed = vmx_interrupt_allowed,
95ba8273 4231 .nmi_allowed = vmx_nmi_allowed,
3cfc3092
JK
4232 .get_nmi_mask = vmx_get_nmi_mask,
4233 .set_nmi_mask = vmx_set_nmi_mask,
95ba8273
GN
4234 .enable_nmi_window = enable_nmi_window,
4235 .enable_irq_window = enable_irq_window,
4236 .update_cr8_intercept = update_cr8_intercept,
95ba8273 4237
cbc94022 4238 .set_tss_addr = vmx_set_tss_addr,
67253af5 4239 .get_tdp_level = get_ept_level,
4b12f0de 4240 .get_mt_mask = vmx_get_mt_mask,
229456fc
MT
4241
4242 .exit_reasons_str = vmx_exit_reasons_str,
17cc3935 4243 .get_lpage_level = vmx_get_lpage_level,
0e851880
SY
4244
4245 .cpuid_update = vmx_cpuid_update,
4e47c7a6
SY
4246
4247 .rdtscp_supported = vmx_rdtscp_supported,
d4330ef2
JR
4248
4249 .set_supported_cpuid = vmx_set_supported_cpuid,
6aa8b732
AK
4250};
4251
4252static int __init vmx_init(void)
4253{
26bb0981
AK
4254 int r, i;
4255
4256 rdmsrl_safe(MSR_EFER, &host_efer);
4257
4258 for (i = 0; i < NR_VMX_MSR; ++i)
4259 kvm_define_shared_msr(i, vmx_msr_index[i]);
fdef3ad1 4260
3e7c73e9 4261 vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
fdef3ad1
HQ
4262 if (!vmx_io_bitmap_a)
4263 return -ENOMEM;
4264
3e7c73e9 4265 vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
fdef3ad1
HQ
4266 if (!vmx_io_bitmap_b) {
4267 r = -ENOMEM;
4268 goto out;
4269 }
4270
5897297b
AK
4271 vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
4272 if (!vmx_msr_bitmap_legacy) {
25c5f225
SY
4273 r = -ENOMEM;
4274 goto out1;
4275 }
4276
5897297b
AK
4277 vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
4278 if (!vmx_msr_bitmap_longmode) {
4279 r = -ENOMEM;
4280 goto out2;
4281 }
4282
fdef3ad1
HQ
4283 /*
4284 * Allow direct access to the PC debug port (it is often used for I/O
4285 * delays, but the vmexits simply slow things down).
4286 */
3e7c73e9
AK
4287 memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
4288 clear_bit(0x80, vmx_io_bitmap_a);
fdef3ad1 4289
3e7c73e9 4290 memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
fdef3ad1 4291
5897297b
AK
4292 memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
4293 memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
25c5f225 4294
2384d2b3
SY
4295 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
4296
0ee75bea
AK
4297 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
4298 __alignof__(struct vcpu_vmx), THIS_MODULE);
fdef3ad1 4299 if (r)
5897297b 4300 goto out3;
25c5f225 4301
5897297b
AK
4302 vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
4303 vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
4304 vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
4305 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
4306 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
4307 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
fdef3ad1 4308
089d034e 4309 if (enable_ept) {
1439442c 4310 bypass_guest_pf = 0;
5fdbcb9d 4311 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
2aaf69dc 4312 VMX_EPT_WRITABLE_MASK);
534e38b4 4313 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
4b12f0de 4314 VMX_EPT_EXECUTABLE_MASK);
5fdbcb9d
SY
4315 kvm_enable_tdp();
4316 } else
4317 kvm_disable_tdp();
1439442c 4318
c7addb90
AK
4319 if (bypass_guest_pf)
4320 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
4321
fdef3ad1
HQ
4322 return 0;
4323
5897297b
AK
4324out3:
4325 free_page((unsigned long)vmx_msr_bitmap_longmode);
25c5f225 4326out2:
5897297b 4327 free_page((unsigned long)vmx_msr_bitmap_legacy);
fdef3ad1 4328out1:
3e7c73e9 4329 free_page((unsigned long)vmx_io_bitmap_b);
fdef3ad1 4330out:
3e7c73e9 4331 free_page((unsigned long)vmx_io_bitmap_a);
fdef3ad1 4332 return r;
6aa8b732
AK
4333}
4334
4335static void __exit vmx_exit(void)
4336{
5897297b
AK
4337 free_page((unsigned long)vmx_msr_bitmap_legacy);
4338 free_page((unsigned long)vmx_msr_bitmap_longmode);
3e7c73e9
AK
4339 free_page((unsigned long)vmx_io_bitmap_b);
4340 free_page((unsigned long)vmx_io_bitmap_a);
fdef3ad1 4341
cb498ea2 4342 kvm_exit();
6aa8b732
AK
4343}
4344
4345module_init(vmx_init)
4346module_exit(vmx_exit)