/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 */

#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/moduleparam.h>
#include "kvm_cache_regs.h"

#define __ex(x) __kvm_handle_fault_on_reboot(x)

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
static int bypass_guest_pf = 1;
module_param(bypass_guest_pf, bool, 0);

static int enable_vpid = 1;
module_param(enable_vpid, bool, 0);

static int flexpriority_enabled = 1;
module_param(flexpriority_enabled, bool, 0);

static int enable_ept = 1;
module_param(enable_ept, bool, 0);
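/*
 * The four knobs above are plain module parameters; they can be overridden
 * at load time (this file is normally built as the kvm-intel module), for
 * example "modprobe kvm-intel enable_ept=0" forces shadow paging even on
 * EPT-capable hardware.  Each one only gates a feature that the capability
 * checks further down must also report as supported.
 */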
struct vcpu_vmx {
	struct kvm_vcpu       vcpu;
	struct list_head      local_vcpus_link;
	unsigned long         host_rsp;
	u32                   idt_vectoring_info;
	struct kvm_msr_entry *guest_msrs;
	struct kvm_msr_entry *host_msrs;
	int                   nmsrs;
	int                   save_nmsrs;
	int                   msr_offset_efer;
	int                   msr_offset_kernel_gs_base;
	struct vmcs          *vmcs;
	int                   vpid;
	struct {
		int           loaded;
		u16           fs_sel, gs_sel, ldt_sel;
		int           gs_ldt_reload_needed;
		int           fs_reload_needed;
		int           guest_efer_loaded;
	} host_state;
	struct {
		struct {
			bool     pending;
			u8       vector;
			unsigned rip;
		} irq;
	} rmode;
};
static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
{
	return container_of(vcpu, struct vcpu_vmx, vcpu);
}

static int init_rmode(struct kvm *kvm);
static u64 construct_eptp(unsigned long root_hpa);

static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);

static struct page *vmx_io_bitmap_a;
static struct page *vmx_io_bitmap_b;
static struct page *vmx_msr_bitmap;

static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);
static struct vmcs_config {
	int size;
	int order;
	u32 revision_id;
	u32 pin_based_exec_ctrl;
	u32 cpu_based_exec_ctrl;
	u32 cpu_based_2nd_exec_ctrl;
	u32 vmexit_ctrl;
	u32 vmentry_ctrl;
} vmcs_config;

struct vmx_capability {
	u32 ept;
	u32 vpid;
} vmx_capability;
#define VMX_SEGMENT_FIELD(seg)					\
	[VCPU_SREG_##seg] = {					\
		.selector = GUEST_##seg##_SELECTOR,		\
		.base = GUEST_##seg##_BASE,			\
		.limit = GUEST_##seg##_LIMIT,			\
		.ar_bytes = GUEST_##seg##_AR_BYTES,		\
	}

static struct kvm_vmx_segment_field {
	unsigned selector;
	unsigned base;
	unsigned limit;
	unsigned ar_bytes;
} kvm_vmx_segment_fields[] = {
	VMX_SEGMENT_FIELD(CS),
	VMX_SEGMENT_FIELD(DS),
	VMX_SEGMENT_FIELD(ES),
	VMX_SEGMENT_FIELD(FS),
	VMX_SEGMENT_FIELD(GS),
	VMX_SEGMENT_FIELD(SS),
	VMX_SEGMENT_FIELD(TR),
	VMX_SEGMENT_FIELD(LDTR),
};
/*
 * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
 * away by decrementing the array size.
 */
static const u32 vmx_msr_index[] = {
	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
	MSR_EFER, MSR_K6_STAR,
};
#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
static void load_msrs(struct kvm_msr_entry *e, int n)
{
	int i;

	for (i = 0; i < n; ++i)
		wrmsrl(e[i].index, e[i].data);
}

static void save_msrs(struct kvm_msr_entry *e, int n)
{
	int i;

	for (i = 0; i < n; ++i)
		rdmsrl(e[i].index, e[i].data);
}
static inline int is_page_fault(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
			     INTR_INFO_VALID_MASK)) ==
		(INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
}

static inline int is_no_device(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
			     INTR_INFO_VALID_MASK)) ==
		(INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
}

static inline int is_invalid_opcode(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
			     INTR_INFO_VALID_MASK)) ==
		(INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
}

static inline int is_external_interrupt(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
		== (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
}
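/*
 * The helpers above all rely on the same layout of the VM-exit
 * interruption-information field: the vector sits in bits 7:0, the event
 * type in bits 10:8 and the valid flag in bit 31, so a single
 * mask-and-compare answers "is this a valid hardware exception with vector
 * X" (e.g. a guest page fault is a valid exception with vector PF_VECTOR).
 */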
static inline int cpu_has_vmx_msr_bitmap(void)
{
	return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS);
}

static inline int cpu_has_vmx_tpr_shadow(void)
{
	return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW);
}

static inline int vm_need_tpr_shadow(struct kvm *kvm)
{
	return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)));
}

static inline int cpu_has_secondary_exec_ctrls(void)
{
	return (vmcs_config.cpu_based_exec_ctrl &
		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
}

static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
{
	return flexpriority_enabled
		&& (vmcs_config.cpu_based_2nd_exec_ctrl &
		    SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
}

static inline int cpu_has_vmx_invept_individual_addr(void)
{
	return (!!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT));
}

static inline int cpu_has_vmx_invept_context(void)
{
	return (!!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT));
}

static inline int cpu_has_vmx_invept_global(void)
{
	return (!!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT));
}

static inline int cpu_has_vmx_ept(void)
{
	return (vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_ENABLE_EPT);
}

static inline int vm_need_ept(void)
{
	return (cpu_has_vmx_ept() && enable_ept);
}

static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
{
	return ((cpu_has_vmx_virtualize_apic_accesses()) &&
		(irqchip_in_kernel(kvm)));
}

static inline int cpu_has_vmx_vpid(void)
{
	return (vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_ENABLE_VPID);
}

static inline int cpu_has_virtual_nmis(void)
{
	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
}
static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
{
	int i;

	for (i = 0; i < vmx->nmsrs; ++i)
		if (vmx->guest_msrs[i].index == msr)
			return i;
	return -1;
}
static inline void __invvpid(int ext, u16 vpid, gva_t gva)
{
	struct {
		u64 vpid : 16;
		u64 rsvd : 48;
		u64 gva;
	} operand = { vpid, 0, gva };

	asm volatile (__ex(ASM_VMX_INVVPID)
		  /* CF==1 or ZF==1 --> rc = -1 */
		  "; ja 1f ; ud2 ; 1:"
		  : : "a"(&operand), "c"(ext) : "cc", "memory");
}

static inline void __invept(int ext, u64 eptp, gpa_t gpa)
{
	struct {
		u64 eptp, gpa;
	} operand = {eptp, gpa};

	asm volatile (__ex(ASM_VMX_INVEPT)
			/* CF==1 or ZF==1 --> rc = -1 */
			"; ja 1f ; ud2 ; 1:\n"
			: : "a" (&operand), "c" (ext) : "cc", "memory");
}
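/*
 * Both wrappers use the same pattern: a descriptor built on the stack is
 * passed by address in RAX while the invalidation type ("ext") goes in RCX,
 * which is the register/memory-operand form the INVVPID and INVEPT
 * instructions take; a failure (CF or ZF set) is turned into a ud2 trap by
 * the "ja 1f; ud2" sequence above.
 */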
static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
{
	int i;

	i = __find_msr_index(vmx, msr);
	if (i >= 0)
		return &vmx->guest_msrs[i];
	return NULL;
}
static void vmcs_clear(struct vmcs *vmcs)
{
	u64 phys_addr = __pa(vmcs);
	u8 error;

	asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
		      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
		      : "cc", "memory");
	if (error)
		printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
		       vmcs, phys_addr);
}
static void __vcpu_clear(void *arg)
{
	struct vcpu_vmx *vmx = arg;
	int cpu = raw_smp_processor_id();

	if (vmx->vcpu.cpu == cpu)
		vmcs_clear(vmx->vmcs);
	if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
		per_cpu(current_vmcs, cpu) = NULL;
	rdtscll(vmx->vcpu.arch.host_tsc);
	list_del(&vmx->local_vcpus_link);
	vmx->vcpu.cpu = -1;
}

static void vcpu_clear(struct vcpu_vmx *vmx)
{
	if (vmx->vcpu.cpu == -1)
		return;
	smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
}
static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
{
	if (vmx->vpid == 0)
		return;

	if (cpu_has_vmx_vpid())
		__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
}

static inline void ept_sync_global(void)
{
	if (cpu_has_vmx_invept_global())
		__invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
}

static inline void ept_sync_context(u64 eptp)
{
	if (cpu_has_vmx_invept_context())
		__invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
	else
		ept_sync_global();
}

static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
{
	if (cpu_has_vmx_invept_individual_addr())
		__invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR, eptp, gpa);
	else
		ept_sync_context(eptp);
}
static unsigned long vmcs_readl(unsigned long field)
{
	unsigned long value;

	asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
		      : "=a"(value) : "d"(field) : "cc");
	return value;
}

static u16 vmcs_read16(unsigned long field)
{
	return vmcs_readl(field);
}

static u32 vmcs_read32(unsigned long field)
{
	return vmcs_readl(field);
}

static u64 vmcs_read64(unsigned long field)
{
#ifdef CONFIG_X86_64
	return vmcs_readl(field);
#else
	return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
#endif
}
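/*
 * On 32-bit hosts a 64-bit VMCS field cannot be moved in one VMREAD, so the
 * high half lives at the adjacent field encoding (field+1) and the two
 * 32-bit reads are stitched together; vmcs_write64() below mirrors the same
 * split for writes.
 */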
static noinline void vmwrite_error(unsigned long field, unsigned long value)
{
	printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
	       field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
}

static void vmcs_writel(unsigned long field, unsigned long value)
{
	u8 error;

	asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
		       : "=q"(error) : "a"(value), "d"(field) : "cc");
	if (unlikely(error))
		vmwrite_error(field, value);
}

static void vmcs_write16(unsigned long field, u16 value)
{
	vmcs_writel(field, value);
}

static void vmcs_write32(unsigned long field, u32 value)
{
	vmcs_writel(field, value);
}

static void vmcs_write64(unsigned long field, u64 value)
{
	vmcs_writel(field, value);
#ifndef CONFIG_X86_64
	vmcs_writel(field+1, value >> 32);
#endif
}

static void vmcs_clear_bits(unsigned long field, u32 mask)
{
	vmcs_writel(field, vmcs_readl(field) & ~mask);
}

static void vmcs_set_bits(unsigned long field, u32 mask)
{
	vmcs_writel(field, vmcs_readl(field) | mask);
}
static void update_exception_bitmap(struct kvm_vcpu *vcpu)
{
	u32 eb;

	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR);
	if (!vcpu->fpu_active)
		eb |= 1u << NM_VECTOR;
	if (vcpu->guest_debug.enabled)
		eb |= 1u << DB_VECTOR;
	if (vcpu->arch.rmode.active)
		eb = ~0;
	if (vm_need_ept())
		eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
	vmcs_write32(EXCEPTION_BITMAP, eb);
}
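/*
 * Each bit set in EXCEPTION_BITMAP forces the matching guest exception to
 * VM-exit: #PF and #UD are always intercepted here, #NM only while the
 * guest FPU is lazily unloaded, #DB while guest debugging is enabled, and
 * in real mode every vector is trapped so the emulation hacks below can
 * run; with EPT the #PF intercept is dropped again.
 */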
static void reload_tss(void)
{
	/*
	 * VT restores TR but not its size.  Useless.
	 */
	struct descriptor_table gdt;
	struct desc_struct *descs;

	kvm_get_gdt(&gdt);
	descs = (void *)gdt.base;
	descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
	load_TR_desc();
}
static void load_transition_efer(struct vcpu_vmx *vmx)
{
	int efer_offset = vmx->msr_offset_efer;
	u64 host_efer = vmx->host_msrs[efer_offset].data;
	u64 guest_efer = vmx->guest_msrs[efer_offset].data;
	u64 ignore_bits;

	/*
	 * NX is emulated; LMA and LME handled by hardware; SCE meaningless
	 * outside long mode
	 */
	ignore_bits = EFER_NX | EFER_SCE;
#ifdef CONFIG_X86_64
	ignore_bits |= EFER_LMA | EFER_LME;
	/* SCE is meaningful only in long mode on Intel */
	if (guest_efer & EFER_LMA)
		ignore_bits &= ~(u64)EFER_SCE;
#endif
	if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
		return;

	vmx->host_state.guest_efer_loaded = 1;
	guest_efer &= ~ignore_bits;
	guest_efer |= host_efer & ignore_bits;
	wrmsrl(MSR_EFER, guest_efer);
	vmx->vcpu.stat.efer_reload++;
}
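/*
 * Example of the ignore_bits logic above: if guest and host EFER differ
 * only in EFER.NX, nothing is written because NX is emulated anyway; only
 * when a bit that hardware actually honours (such as SCE for a long-mode
 * guest) differs is the guest value, patched with the host's ignored bits,
 * loaded into the MSR.
 */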
static void reload_host_efer(struct vcpu_vmx *vmx)
{
	if (vmx->host_state.guest_efer_loaded) {
		vmx->host_state.guest_efer_loaded = 0;
		load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
	}
}
static void vmx_save_host_state(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (vmx->host_state.loaded)
		return;

	vmx->host_state.loaded = 1;
	/*
	 * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
	 * allow segment selectors with cpl > 0 or ti == 1.
	 */
	vmx->host_state.ldt_sel = kvm_read_ldt();
	vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
	vmx->host_state.fs_sel = kvm_read_fs();
	if (!(vmx->host_state.fs_sel & 7)) {
		vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
		vmx->host_state.fs_reload_needed = 0;
	} else {
		vmcs_write16(HOST_FS_SELECTOR, 0);
		vmx->host_state.fs_reload_needed = 1;
	}
	vmx->host_state.gs_sel = kvm_read_gs();
	if (!(vmx->host_state.gs_sel & 7))
		vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
	else {
		vmcs_write16(HOST_GS_SELECTOR, 0);
		vmx->host_state.gs_ldt_reload_needed = 1;
	}

#ifdef CONFIG_X86_64
	vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
	vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
#else
	vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
	vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
#endif

#ifdef CONFIG_X86_64
	if (is_long_mode(&vmx->vcpu))
		save_msrs(vmx->host_msrs +
			  vmx->msr_offset_kernel_gs_base, 1);
#endif
	load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
	load_transition_efer(vmx);
}
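/*
 * The "(sel & 7)" tests above check the selector's RPL (bits 1:0) and TI
 * (bit 2): the host-state area only accepts selectors with RPL 0 and TI 0
 * (the 22.2.3 restriction quoted above), so anything else is written as 0
 * and the real selector is reloaded by hand in __vmx_load_host_state().
 */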
static void __vmx_load_host_state(struct vcpu_vmx *vmx)
{
	unsigned long flags;

	if (!vmx->host_state.loaded)
		return;

	++vmx->vcpu.stat.host_state_reload;
	vmx->host_state.loaded = 0;
	if (vmx->host_state.fs_reload_needed)
		kvm_load_fs(vmx->host_state.fs_sel);
	if (vmx->host_state.gs_ldt_reload_needed) {
		kvm_load_ldt(vmx->host_state.ldt_sel);
		/*
		 * If we have to reload gs, we must take care to
		 * preserve our gs base.
		 */
		local_irq_save(flags);
		kvm_load_gs(vmx->host_state.gs_sel);
		wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
		local_irq_restore(flags);
	}
	save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
	load_msrs(vmx->host_msrs, vmx->save_nmsrs);
	reload_host_efer(vmx);
}

static void vmx_load_host_state(struct vcpu_vmx *vmx)
{
	__vmx_load_host_state(vmx);
}
/*
 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
 * vcpu mutex is already taken.
 */
static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 phys_addr = __pa(vmx->vmcs);
	u64 tsc_this, delta, new_offset;

	if (vcpu->cpu != cpu) {
		vcpu_clear(vmx);
		kvm_migrate_timers(vcpu);
		vpid_sync_vcpu_all(vmx);
		list_add(&vmx->local_vcpus_link,
			 &per_cpu(vcpus_on_cpu, cpu));
	}

	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
		u8 error;

		per_cpu(current_vmcs, cpu) = vmx->vmcs;
		asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
			      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
			      : "cc");
		if (error)
			printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
			       vmx->vmcs, phys_addr);
	}

	if (vcpu->cpu != cpu) {
		struct descriptor_table dt;
		unsigned long sysenter_esp;

		vcpu->cpu = cpu;
		/*
		 * Linux uses per-cpu TSS and GDT, so set these when switching
		 * processors.
		 */
		vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
		kvm_get_gdt(&dt);
		vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */

		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */

		/*
		 * Make sure the time stamp counter is monotonic.
		 */
		rdtscll(tsc_this);
		if (tsc_this < vcpu->arch.host_tsc) {
			delta = vcpu->arch.host_tsc - tsc_this;
			new_offset = vmcs_read64(TSC_OFFSET) + delta;
			vmcs_write64(TSC_OFFSET, new_offset);
		}
	}
}
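/*
 * The guest reads its TSC as host_tsc + TSC_OFFSET (see guest_read_tsc()
 * below), so when this vcpu lands on a CPU whose TSC is behind the one it
 * left, the difference is folded into TSC_OFFSET to keep the guest-visible
 * counter from ever jumping backwards.
 */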
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
	__vmx_load_host_state(to_vmx(vcpu));
}
static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
{
	if (vcpu->fpu_active)
		return;
	vcpu->fpu_active = 1;
	vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
	if (vcpu->arch.cr0 & X86_CR0_TS)
		vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
	update_exception_bitmap(vcpu);
}

static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
{
	if (!vcpu->fpu_active)
		return;
	vcpu->fpu_active = 0;
	vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
	update_exception_bitmap(vcpu);
}

static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
	return vmcs_readl(GUEST_RFLAGS);
}
static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
	if (vcpu->arch.rmode.active)
		rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
	vmcs_writel(GUEST_RFLAGS, rflags);
}

static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
	unsigned long rip;
	u32 interruptibility;

	rip = kvm_rip_read(vcpu);
	rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
	kvm_rip_write(vcpu, rip);

	/*
	 * We emulated an instruction, so temporary interrupt blocking
	 * should be removed, if set.
	 */
	interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
	if (interruptibility & 3)
		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
			     interruptibility & ~3);
	vcpu->arch.interrupt_window_open = 1;
}
static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
				bool has_error_code, u32 error_code)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (has_error_code)
		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);

	if (vcpu->arch.rmode.active) {
		vmx->rmode.irq.pending = true;
		vmx->rmode.irq.vector = nr;
		vmx->rmode.irq.rip = kvm_rip_read(vcpu);
		if (nr == BP_VECTOR)
			vmx->rmode.irq.rip++;
		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
			     nr | INTR_TYPE_SOFT_INTR
			     | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
			     | INTR_INFO_VALID_MASK);
		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
		kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
		return;
	}

	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
		     nr | INTR_TYPE_EXCEPTION
		     | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
		     | INTR_INFO_VALID_MASK);
}

static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
{
	return false;
}
/*
 * Swap MSR entry in host/guest MSR entry array.
 */
static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
{
	struct kvm_msr_entry tmp;

	tmp = vmx->guest_msrs[to];
	vmx->guest_msrs[to] = vmx->guest_msrs[from];
	vmx->guest_msrs[from] = tmp;
	tmp = vmx->host_msrs[to];
	vmx->host_msrs[to] = vmx->host_msrs[from];
	vmx->host_msrs[from] = tmp;
}
/*
 * Set up the vmcs to automatically save and restore system
 * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
 * mode, as fiddling with msrs is very expensive.
 */
static void setup_msrs(struct vcpu_vmx *vmx)
{
	int save_nmsrs;

	vmx_load_host_state(vmx);
	save_nmsrs = 0;
	if (is_long_mode(&vmx->vcpu)) {
		int index;

		index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
		if (index >= 0)
			move_msr_up(vmx, index, save_nmsrs++);
		index = __find_msr_index(vmx, MSR_LSTAR);
		if (index >= 0)
			move_msr_up(vmx, index, save_nmsrs++);
		index = __find_msr_index(vmx, MSR_CSTAR);
		if (index >= 0)
			move_msr_up(vmx, index, save_nmsrs++);
		index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
		if (index >= 0)
			move_msr_up(vmx, index, save_nmsrs++);
		/*
		 * MSR_K6_STAR is only needed on long mode guests, and only
		 * if efer.sce is enabled.
		 */
		index = __find_msr_index(vmx, MSR_K6_STAR);
		if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE))
			move_msr_up(vmx, index, save_nmsrs++);
	}
	vmx->save_nmsrs = save_nmsrs;

	vmx->msr_offset_kernel_gs_base =
		__find_msr_index(vmx, MSR_KERNEL_GS_BASE);
	vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
}
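/*
 * After this runs, guest_msrs[0..save_nmsrs-1] holds exactly the MSRs that
 * must be swapped on every world switch; MSR_K6_STAR is only pulled into
 * that prefix for a long-mode guest with EFER.SCE set, which is why the
 * vmx_msr_index[] comment insists that it stays last in the array.
 */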
/*
 * reads and returns guest's timestamp counter "register"
 * guest_tsc = host_tsc + tsc_offset    -- 21.3
 */
static u64 guest_read_tsc(void)
{
	u64 host_tsc, tsc_offset;

	rdtscll(host_tsc);
	tsc_offset = vmcs_read64(TSC_OFFSET);
	return host_tsc + tsc_offset;
}

/*
 * writes 'guest_tsc' into guest's timestamp counter "register"
 * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
 */
static void guest_write_tsc(u64 guest_tsc)
{
	u64 host_tsc;

	rdtscll(host_tsc);
	vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
}
/*
 * Reads an msr value (of 'msr_index') into 'pdata'.
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
{
	u64 data;
	struct kvm_msr_entry *msr;

	if (!pdata) {
		printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
		return -EINVAL;
	}

	switch (msr_index) {
	case MSR_FS_BASE:
		data = vmcs_readl(GUEST_FS_BASE);
		break;
	case MSR_GS_BASE:
		data = vmcs_readl(GUEST_GS_BASE);
		break;
	case MSR_EFER:
		return kvm_get_msr_common(vcpu, msr_index, pdata);
	case MSR_IA32_TIME_STAMP_COUNTER:
		data = guest_read_tsc();
		break;
	case MSR_IA32_SYSENTER_CS:
		data = vmcs_read32(GUEST_SYSENTER_CS);
		break;
	case MSR_IA32_SYSENTER_EIP:
		data = vmcs_readl(GUEST_SYSENTER_EIP);
		break;
	case MSR_IA32_SYSENTER_ESP:
		data = vmcs_readl(GUEST_SYSENTER_ESP);
		break;
	default:
		msr = find_msr_entry(to_vmx(vcpu), msr_index);
		if (msr) {
			data = msr->data;
			break;
		}
		return kvm_get_msr_common(vcpu, msr_index, pdata);
	}

	*pdata = data;
	return 0;
}
/*
 * Writes msr value into the appropriate "register".
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_msr_entry *msr;
	int ret = 0;

	switch (msr_index) {
	case MSR_EFER:
		vmx_load_host_state(vmx);
		ret = kvm_set_msr_common(vcpu, msr_index, data);
		break;
	case MSR_FS_BASE:
		vmcs_writel(GUEST_FS_BASE, data);
		break;
	case MSR_GS_BASE:
		vmcs_writel(GUEST_GS_BASE, data);
		break;
	case MSR_IA32_SYSENTER_CS:
		vmcs_write32(GUEST_SYSENTER_CS, data);
		break;
	case MSR_IA32_SYSENTER_EIP:
		vmcs_writel(GUEST_SYSENTER_EIP, data);
		break;
	case MSR_IA32_SYSENTER_ESP:
		vmcs_writel(GUEST_SYSENTER_ESP, data);
		break;
	case MSR_IA32_TIME_STAMP_COUNTER:
		guest_write_tsc(data);
		break;
	case MSR_P6_PERFCTR0:
	case MSR_P6_PERFCTR1:
	case MSR_P6_EVNTSEL0:
	case MSR_P6_EVNTSEL1:
		/*
		 * Just discard all writes to the performance counters; this
		 * should keep both older linux and windows 64-bit guests
		 * happy.
		 */
		pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data);
		break;
	default:
		vmx_load_host_state(vmx);
		msr = find_msr_entry(vmx, msr_index);
		if (msr) {
			msr->data = data;
			break;
		}
		ret = kvm_set_msr_common(vcpu, msr_index, data);
	}

	return ret;
}
*vcpu
, enum kvm_reg reg
)
971 __set_bit(reg
, (unsigned long *)&vcpu
->arch
.regs_avail
);
974 vcpu
->arch
.regs
[VCPU_REGS_RSP
] = vmcs_readl(GUEST_RSP
);
977 vcpu
->arch
.regs
[VCPU_REGS_RIP
] = vmcs_readl(GUEST_RIP
);
984 static int set_guest_debug(struct kvm_vcpu
*vcpu
, struct kvm_debug_guest
*dbg
)
986 unsigned long dr7
= 0x400;
989 old_singlestep
= vcpu
->guest_debug
.singlestep
;
991 vcpu
->guest_debug
.enabled
= dbg
->enabled
;
992 if (vcpu
->guest_debug
.enabled
) {
995 dr7
|= 0x200; /* exact */
996 for (i
= 0; i
< 4; ++i
) {
997 if (!dbg
->breakpoints
[i
].enabled
)
999 vcpu
->guest_debug
.bp
[i
] = dbg
->breakpoints
[i
].address
;
1000 dr7
|= 2 << (i
*2); /* global enable */
1001 dr7
|= 0 << (i
*4+16); /* execution breakpoint */
1004 vcpu
->guest_debug
.singlestep
= dbg
->singlestep
;
1006 vcpu
->guest_debug
.singlestep
= 0;
1008 if (old_singlestep
&& !vcpu
->guest_debug
.singlestep
) {
1009 unsigned long flags
;
1011 flags
= vmcs_readl(GUEST_RFLAGS
);
1012 flags
&= ~(X86_EFLAGS_TF
| X86_EFLAGS_RF
);
1013 vmcs_writel(GUEST_RFLAGS
, flags
);
1016 update_exception_bitmap(vcpu
);
1017 vmcs_writel(GUEST_DR7
, dr7
);
1022 static int vmx_get_irq(struct kvm_vcpu
*vcpu
)
1024 if (!vcpu
->arch
.interrupt
.pending
)
1026 return vcpu
->arch
.interrupt
.nr
;
1029 static __init
int cpu_has_kvm_support(void)
1031 unsigned long ecx
= cpuid_ecx(1);
1032 return test_bit(5, &ecx
); /* CPUID.1:ECX.VMX[bit 5] -> VT */
static __init int vmx_disabled_by_bios(void)
{
	u64 msr;

	rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
	return (msr & (IA32_FEATURE_CONTROL_LOCKED_BIT |
		       IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT))
	    == IA32_FEATURE_CONTROL_LOCKED_BIT;
	/* locked but not enabled */
}
static void hardware_enable(void *garbage)
{
	int cpu = raw_smp_processor_id();
	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
	u64 old;

	INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
	if ((old & (IA32_FEATURE_CONTROL_LOCKED_BIT |
		    IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT))
	    != (IA32_FEATURE_CONTROL_LOCKED_BIT |
		IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT))
		/* enable and lock */
		wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
		       IA32_FEATURE_CONTROL_LOCKED_BIT |
		       IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT);
	write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
	asm volatile (ASM_VMX_VMXON_RAX
		      : : "a"(&phys_addr), "m"(phys_addr)
		      : "memory", "cc");
}

static void vmclear_local_vcpus(void)
{
	int cpu = raw_smp_processor_id();
	struct vcpu_vmx *vmx, *n;

	list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu),
				 local_vcpus_link)
		__vcpu_clear(vmx);
}

static void hardware_disable(void *garbage)
{
	vmclear_local_vcpus();
	asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
	write_cr4(read_cr4() & ~X86_CR4_VMXE);
}
static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
				      u32 msr, u32 *result)
{
	u32 vmx_msr_low, vmx_msr_high;
	u32 ctl = ctl_min | ctl_opt;

	rdmsr(msr, vmx_msr_low, vmx_msr_high);

	ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
	ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */

	/* Ensure minimum (required) set of control bits are supported. */
	if (ctl_min & ~ctl)
		return -EIO;

	*result = ctl;
	return 0;
}
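/*
 * Each VMX capability MSR describes one control field: the low 32 bits are
 * the "allowed-0" settings (a 1 there means the bit must be set) and the
 * high 32 bits the "allowed-1" settings (a 0 there means the bit can never
 * be set).  A required bit from ctl_min that the CPU cannot actually set
 * therefore survives the two masks as 0 and makes this helper fail.
 */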
static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
{
	u32 vmx_msr_low, vmx_msr_high;
	u32 min, opt, min2, opt2;
	u32 _pin_based_exec_control = 0;
	u32 _cpu_based_exec_control = 0;
	u32 _cpu_based_2nd_exec_control = 0;
	u32 _vmexit_control = 0;
	u32 _vmentry_control = 0;

	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
	opt = PIN_BASED_VIRTUAL_NMIS;
	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
				&_pin_based_exec_control) < 0)
		return -EIO;

	min = CPU_BASED_HLT_EXITING |
#ifdef CONFIG_X86_64
	      CPU_BASED_CR8_LOAD_EXITING |
	      CPU_BASED_CR8_STORE_EXITING |
#endif
	      CPU_BASED_CR3_LOAD_EXITING |
	      CPU_BASED_CR3_STORE_EXITING |
	      CPU_BASED_USE_IO_BITMAPS |
	      CPU_BASED_MOV_DR_EXITING |
	      CPU_BASED_USE_TSC_OFFSETING;
	opt = CPU_BASED_TPR_SHADOW |
	      CPU_BASED_USE_MSR_BITMAPS |
	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
				&_cpu_based_exec_control) < 0)
		return -EIO;
#ifdef CONFIG_X86_64
	if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
		_cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
					   ~CPU_BASED_CR8_STORE_EXITING;
#endif
	if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
		min2 = 0;
		opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
			SECONDARY_EXEC_WBINVD_EXITING |
			SECONDARY_EXEC_ENABLE_VPID |
			SECONDARY_EXEC_ENABLE_EPT;
		if (adjust_vmx_controls(min2, opt2,
					MSR_IA32_VMX_PROCBASED_CTLS2,
					&_cpu_based_2nd_exec_control) < 0)
			return -EIO;
	}
#ifndef CONFIG_X86_64
	if (!(_cpu_based_2nd_exec_control &
				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
		_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
#endif
	if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
		/* CR3 accesses don't need to cause VM Exits when EPT enabled */
		min &= ~(CPU_BASED_CR3_LOAD_EXITING |
			 CPU_BASED_CR3_STORE_EXITING);
		if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
					&_cpu_based_exec_control) < 0)
			return -EIO;
		rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
		      vmx_capability.ept, vmx_capability.vpid);
	}

	min = 0;
#ifdef CONFIG_X86_64
	min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
#endif
	opt = 0;
	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
				&_vmexit_control) < 0)
		return -EIO;

	min = opt = 0;
	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
				&_vmentry_control) < 0)
		return -EIO;

	rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);

	/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
	if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
		return -EIO;

#ifdef CONFIG_X86_64
	/* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
	if (vmx_msr_high & (1u<<16))
		return -EIO;
#endif

	/* Require Write-Back (WB) memory type for VMCS accesses. */
	if (((vmx_msr_high >> 18) & 15) != 6)
		return -EIO;

	vmcs_conf->size = vmx_msr_high & 0x1fff;
	vmcs_conf->order = get_order(vmcs_config.size);
	vmcs_conf->revision_id = vmx_msr_low;

	vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
	vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
	vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
	vmcs_conf->vmexit_ctrl         = _vmexit_control;
	vmcs_conf->vmentry_ctrl        = _vmentry_control;

	return 0;
}
static struct vmcs *alloc_vmcs_cpu(int cpu)
{
	int node = cpu_to_node(cpu);
	struct page *pages;
	struct vmcs *vmcs;

	pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
	if (!pages)
		return NULL;
	vmcs = page_address(pages);
	memset(vmcs, 0, vmcs_config.size);
	vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
	return vmcs;
}

static struct vmcs *alloc_vmcs(void)
{
	return alloc_vmcs_cpu(raw_smp_processor_id());
}

static void free_vmcs(struct vmcs *vmcs)
{
	free_pages((unsigned long)vmcs, vmcs_config.order);
}

static void free_kvm_area(void)
{
	int cpu;

	for_each_online_cpu(cpu)
		free_vmcs(per_cpu(vmxarea, cpu));
}
static __init int alloc_kvm_area(void)
{
	int cpu;

	for_each_online_cpu(cpu) {
		struct vmcs *vmcs;

		vmcs = alloc_vmcs_cpu(cpu);
		if (!vmcs) {
			free_kvm_area();
			return -ENOMEM;
		}

		per_cpu(vmxarea, cpu) = vmcs;
	}
	return 0;
}

static __init int hardware_setup(void)
{
	if (setup_vmcs_config(&vmcs_config) < 0)
		return -EIO;

	if (boot_cpu_has(X86_FEATURE_NX))
		kvm_enable_efer_bits(EFER_NX);

	return alloc_kvm_area();
}

static __exit void hardware_unsetup(void)
{
	free_kvm_area();
}
static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
{
	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];

	if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) {
		vmcs_write16(sf->selector, save->selector);
		vmcs_writel(sf->base, save->base);
		vmcs_write32(sf->limit, save->limit);
		vmcs_write32(sf->ar_bytes, save->ar);
	} else {
		u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
			<< AR_DPL_SHIFT;
		vmcs_write32(sf->ar_bytes, 0x93 | dpl);
	}
}
static void enter_pmode(struct kvm_vcpu *vcpu)
{
	unsigned long flags;

	vcpu->arch.rmode.active = 0;

	vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
	vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
	vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar);

	flags = vmcs_readl(GUEST_RFLAGS);
	flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
	flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT);
	vmcs_writel(GUEST_RFLAGS, flags);

	vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
			(vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));

	update_exception_bitmap(vcpu);

	fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
	fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
	fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
	fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);

	vmcs_write16(GUEST_SS_SELECTOR, 0);
	vmcs_write32(GUEST_SS_AR_BYTES, 0x93);

	vmcs_write16(GUEST_CS_SELECTOR,
		     vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
	vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
}
static gva_t rmode_tss_base(struct kvm *kvm)
{
	if (!kvm->arch.tss_addr) {
		gfn_t base_gfn = kvm->memslots[0].base_gfn +
				 kvm->memslots[0].npages - 3;
		return base_gfn << PAGE_SHIFT;
	}
	return kvm->arch.tss_addr;
}
static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
{
	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];

	save->selector = vmcs_read16(sf->selector);
	save->base = vmcs_readl(sf->base);
	save->limit = vmcs_read32(sf->limit);
	save->ar = vmcs_read32(sf->ar_bytes);
	vmcs_write16(sf->selector, save->base >> 4);
	vmcs_write32(sf->base, save->base & 0xfffff);
	vmcs_write32(sf->limit, 0xffff);
	vmcs_write32(sf->ar_bytes, 0xf3);
}
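/*
 * These values come straight from real-mode segmentation: the selector is
 * base >> 4 (so that base = selector * 16), the limit is the 64KiB maximum,
 * and access rights 0xf3 describe a present, DPL-3, writable data segment,
 * which is what the vm86 container used for real-mode guests will accept.
 */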
static void enter_rmode(struct kvm_vcpu *vcpu)
{
	unsigned long flags;

	vcpu->arch.rmode.active = 1;

	vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
	vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));

	vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
	vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);

	vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);

	flags = vmcs_readl(GUEST_RFLAGS);
	vcpu->arch.rmode.save_iopl
		= (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;

	flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;

	vmcs_writel(GUEST_RFLAGS, flags);
	vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
	update_exception_bitmap(vcpu);

	vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
	vmcs_write32(GUEST_SS_LIMIT, 0xffff);
	vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);

	vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
	vmcs_write32(GUEST_CS_LIMIT, 0xffff);
	if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
		vmcs_writel(GUEST_CS_BASE, 0xf0000);
	vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);

	fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
	fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
	fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
	fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);

	kvm_mmu_reset_context(vcpu);
	init_rmode(vcpu->kvm);
}
#ifdef CONFIG_X86_64

static void enter_lmode(struct kvm_vcpu *vcpu)
{
	u32 guest_tr_ar;

	guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
	if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
		printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
		       __func__);
		vmcs_write32(GUEST_TR_AR_BYTES,
			     (guest_tr_ar & ~AR_TYPE_MASK)
			     | AR_TYPE_BUSY_64_TSS);
	}

	vcpu->arch.shadow_efer |= EFER_LMA;

	find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME;
	vmcs_write32(VM_ENTRY_CONTROLS,
		     vmcs_read32(VM_ENTRY_CONTROLS)
		     | VM_ENTRY_IA32E_MODE);
}

static void exit_lmode(struct kvm_vcpu *vcpu)
{
	vcpu->arch.shadow_efer &= ~EFER_LMA;

	vmcs_write32(VM_ENTRY_CONTROLS,
		     vmcs_read32(VM_ENTRY_CONTROLS)
		     & ~VM_ENTRY_IA32E_MODE);
}

#endif
static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
{
	vpid_sync_vcpu_all(to_vmx(vcpu));
	if (vm_need_ept())
		ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
}

static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
	vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK;
	vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
}
static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
{
	if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
		if (!load_pdptrs(vcpu, vcpu->arch.cr3)) {
			printk(KERN_ERR "EPT: Fail to load pdptrs!\n");
			return;
		}
		vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]);
		vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]);
		vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]);
		vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]);
	}
}

static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
					unsigned long cr0,
					struct kvm_vcpu *vcpu)
{
	if (!(cr0 & X86_CR0_PG)) {
		/* From paging/starting to nonpaging */
		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
			     vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
			     (CPU_BASED_CR3_LOAD_EXITING |
			      CPU_BASED_CR3_STORE_EXITING));
		vcpu->arch.cr0 = cr0;
		vmx_set_cr4(vcpu, vcpu->arch.cr4);
		*hw_cr0 |= X86_CR0_PE | X86_CR0_PG;
		*hw_cr0 &= ~X86_CR0_WP;
	} else if (!is_paging(vcpu)) {
		/* From nonpaging to paging */
		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
			     vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
			     ~(CPU_BASED_CR3_LOAD_EXITING |
			       CPU_BASED_CR3_STORE_EXITING));
		vcpu->arch.cr0 = cr0;
		vmx_set_cr4(vcpu, vcpu->arch.cr4);
		if (!(vcpu->arch.cr0 & X86_CR0_WP))
			*hw_cr0 &= ~X86_CR0_WP;
	}
}

static void ept_update_paging_mode_cr4(unsigned long *hw_cr4,
					struct kvm_vcpu *vcpu)
{
	if (!is_paging(vcpu)) {
		*hw_cr4 &= ~X86_CR4_PAE;
		*hw_cr4 |= X86_CR4_PSE;
	} else if (!(vcpu->arch.cr4 & X86_CR4_PAE))
		*hw_cr4 &= ~X86_CR4_PAE;
}
static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	unsigned long hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) |
				KVM_VM_CR0_ALWAYS_ON;

	vmx_fpu_deactivate(vcpu);

	if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE))
		enter_pmode(vcpu);

	if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE))
		enter_rmode(vcpu);

#ifdef CONFIG_X86_64
	if (vcpu->arch.shadow_efer & EFER_LME) {
		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
			enter_lmode(vcpu);
		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
			exit_lmode(vcpu);
	}
#endif

	if (vm_need_ept())
		ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);

	vmcs_writel(CR0_READ_SHADOW, cr0);
	vmcs_writel(GUEST_CR0, hw_cr0);
	vcpu->arch.cr0 = cr0;

	if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
		vmx_fpu_activate(vcpu);
}
static u64 construct_eptp(unsigned long root_hpa)
{
	u64 eptp;

	/* TODO write the value reading from MSR */
	eptp = VMX_EPT_DEFAULT_MT |
		VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
	eptp |= (root_hpa & PAGE_MASK);

	return eptp;
}
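/*
 * EPTP layout, as built above: the low bits carry the memory type used for
 * the EPT page walk (VMX_EPT_DEFAULT_MT) and the guest address width, i.e.
 * the page-walk length minus one (VMX_EPT_DEFAULT_GAW shifted into place),
 * while the page-aligned upper bits point at the root of the EPT paging
 * structures.
 */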
static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	unsigned long guest_cr3;
	u64 eptp;

	guest_cr3 = cr3;
	if (vm_need_ept()) {
		eptp = construct_eptp(cr3);
		vmcs_write64(EPT_POINTER, eptp);
		ept_sync_context(eptp);
		ept_load_pdptrs(vcpu);
		guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
			VMX_EPT_IDENTITY_PAGETABLE_ADDR;
	}

	vmx_flush_tlb(vcpu);
	vmcs_writel(GUEST_CR3, guest_cr3);
	if (vcpu->arch.cr0 & X86_CR0_PE)
		vmx_fpu_deactivate(vcpu);
}
static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.active ?
		    KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);

	vcpu->arch.cr4 = cr4;
	if (vm_need_ept())
		ept_update_paging_mode_cr4(&hw_cr4, vcpu);

	vmcs_writel(CR4_READ_SHADOW, cr4);
	vmcs_writel(GUEST_CR4, hw_cr4);
}
static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);

	vcpu->arch.shadow_efer = efer;
	if (efer & EFER_LMA) {
		vmcs_write32(VM_ENTRY_CONTROLS,
			     vmcs_read32(VM_ENTRY_CONTROLS) |
			     VM_ENTRY_IA32E_MODE);
		msr->data = efer;
	} else {
		vmcs_write32(VM_ENTRY_CONTROLS,
			     vmcs_read32(VM_ENTRY_CONTROLS) &
			     ~VM_ENTRY_IA32E_MODE);

		msr->data = efer & ~EFER_LME;
	}
	setup_msrs(vmx);
}
vmx_get_segment_base(struct kvm_vcpu
*vcpu
, int seg
)
1600 struct kvm_vmx_segment_field
*sf
= &kvm_vmx_segment_fields
[seg
];
1602 return vmcs_readl(sf
->base
);
1605 static void vmx_get_segment(struct kvm_vcpu
*vcpu
,
1606 struct kvm_segment
*var
, int seg
)
1608 struct kvm_vmx_segment_field
*sf
= &kvm_vmx_segment_fields
[seg
];
1611 var
->base
= vmcs_readl(sf
->base
);
1612 var
->limit
= vmcs_read32(sf
->limit
);
1613 var
->selector
= vmcs_read16(sf
->selector
);
1614 ar
= vmcs_read32(sf
->ar_bytes
);
1615 if (ar
& AR_UNUSABLE_MASK
)
1617 var
->type
= ar
& 15;
1618 var
->s
= (ar
>> 4) & 1;
1619 var
->dpl
= (ar
>> 5) & 3;
1620 var
->present
= (ar
>> 7) & 1;
1621 var
->avl
= (ar
>> 12) & 1;
1622 var
->l
= (ar
>> 13) & 1;
1623 var
->db
= (ar
>> 14) & 1;
1624 var
->g
= (ar
>> 15) & 1;
1625 var
->unusable
= (ar
>> 16) & 1;
1628 static int vmx_get_cpl(struct kvm_vcpu
*vcpu
)
1630 struct kvm_segment kvm_seg
;
1632 if (!(vcpu
->arch
.cr0
& X86_CR0_PE
)) /* if real mode */
1635 if (vmx_get_rflags(vcpu
) & X86_EFLAGS_VM
) /* if virtual 8086 */
1638 vmx_get_segment(vcpu
, &kvm_seg
, VCPU_SREG_CS
);
1639 return kvm_seg
.selector
& 3;
1642 static u32
vmx_segment_access_rights(struct kvm_segment
*var
)
1649 ar
= var
->type
& 15;
1650 ar
|= (var
->s
& 1) << 4;
1651 ar
|= (var
->dpl
& 3) << 5;
1652 ar
|= (var
->present
& 1) << 7;
1653 ar
|= (var
->avl
& 1) << 12;
1654 ar
|= (var
->l
& 1) << 13;
1655 ar
|= (var
->db
& 1) << 14;
1656 ar
|= (var
->g
& 1) << 15;
1658 if (ar
== 0) /* a 0 value means unusable */
1659 ar
= AR_UNUSABLE_MASK
;
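/*
 * The shifts above are the VMX segment access-rights encoding, the same one
 * vmx_get_segment() decodes: type in bits 3:0, S in bit 4, DPL in bits 6:5,
 * present in bit 7, AVL in 12, L in 13, D/B in 14, G in 15 and the
 * "unusable" flag in bit 16.
 */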
static void vmx_set_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg)
{
	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
	u32 ar;

	if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) {
		vcpu->arch.rmode.tr.selector = var->selector;
		vcpu->arch.rmode.tr.base = var->base;
		vcpu->arch.rmode.tr.limit = var->limit;
		vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var);
		return;
	}
	vmcs_writel(sf->base, var->base);
	vmcs_write32(sf->limit, var->limit);
	vmcs_write16(sf->selector, var->selector);
	if (vcpu->arch.rmode.active && var->s) {
		/*
		 * Hack real-mode segments into vm86 compatibility.
		 */
		if (var->base == 0xffff0000 && var->selector == 0xf000)
			vmcs_writel(sf->base, 0xf0000);
		ar = 0xf3;
	} else
		ar = vmx_segment_access_rights(var);
	vmcs_write32(sf->ar_bytes, ar);
}
static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
{
	u32 ar = vmcs_read32(GUEST_CS_AR_BYTES);

	*db = (ar >> 14) & 1;
	*l = (ar >> 13) & 1;
}

static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
{
	dt->limit = vmcs_read32(GUEST_IDTR_LIMIT);
	dt->base = vmcs_readl(GUEST_IDTR_BASE);
}

static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
{
	vmcs_write32(GUEST_IDTR_LIMIT, dt->limit);
	vmcs_writel(GUEST_IDTR_BASE, dt->base);
}

static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
{
	dt->limit = vmcs_read32(GUEST_GDTR_LIMIT);
	dt->base = vmcs_readl(GUEST_GDTR_BASE);
}

static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
{
	vmcs_write32(GUEST_GDTR_LIMIT, dt->limit);
	vmcs_writel(GUEST_GDTR_BASE, dt->base);
}
static int init_rmode_tss(struct kvm *kvm)
{
	gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
	u16 data = 0;
	int r;

	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
	if (r < 0)
		return 0;
	data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
	r = kvm_write_guest_page(kvm, fn++, &data,
			TSS_IOPB_BASE_OFFSET, sizeof(u16));
	if (r < 0)
		return 0;
	r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
	if (r < 0)
		return 0;
	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
	if (r < 0)
		return 0;
	data = ~0;
	r = kvm_write_guest_page(kvm, fn, &data,
				 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
				 sizeof(u8));
	return r < 0 ? 0 : 1;
}
static int init_rmode_identity_map(struct kvm *kvm)
{
	int i, r;
	pfn_t identity_map_pfn;
	u32 tmp;

	if (!vm_need_ept())
		return 1;
	if (unlikely(!kvm->arch.ept_identity_pagetable)) {
		printk(KERN_ERR "EPT: identity-mapping pagetable "
			"hasn't been allocated!\n");
		return 0;
	}
	if (likely(kvm->arch.ept_identity_pagetable_done))
		return 1;
	identity_map_pfn = VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT;
	r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
	if (r < 0)
		return 0;
	/* Set up identity-mapping pagetable for EPT in real mode */
	for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
		tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
			_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
		r = kvm_write_guest_page(kvm, identity_map_pfn,
				&tmp, i * sizeof(tmp), sizeof(tmp));
		if (r < 0)
			return 0;
	}
	kvm->arch.ept_identity_pagetable_done = true;
	return 1;
}
)
1794 struct kvm_vmx_segment_field
*sf
= &kvm_vmx_segment_fields
[seg
];
1796 vmcs_write16(sf
->selector
, 0);
1797 vmcs_writel(sf
->base
, 0);
1798 vmcs_write32(sf
->limit
, 0xffff);
1799 vmcs_write32(sf
->ar_bytes
, 0x93);
1802 static int alloc_apic_access_page(struct kvm
*kvm
)
1804 struct kvm_userspace_memory_region kvm_userspace_mem
;
1807 down_write(&kvm
->slots_lock
);
1808 if (kvm
->arch
.apic_access_page
)
1810 kvm_userspace_mem
.slot
= APIC_ACCESS_PAGE_PRIVATE_MEMSLOT
;
1811 kvm_userspace_mem
.flags
= 0;
1812 kvm_userspace_mem
.guest_phys_addr
= 0xfee00000ULL
;
1813 kvm_userspace_mem
.memory_size
= PAGE_SIZE
;
1814 r
= __kvm_set_memory_region(kvm
, &kvm_userspace_mem
, 0);
1818 down_read(¤t
->mm
->mmap_sem
);
1819 kvm
->arch
.apic_access_page
= gfn_to_page(kvm
, 0xfee00);
1820 up_read(¤t
->mm
->mmap_sem
);
1822 up_write(&kvm
->slots_lock
);
1826 static int alloc_identity_pagetable(struct kvm
*kvm
)
1828 struct kvm_userspace_memory_region kvm_userspace_mem
;
1831 down_write(&kvm
->slots_lock
);
1832 if (kvm
->arch
.ept_identity_pagetable
)
1834 kvm_userspace_mem
.slot
= IDENTITY_PAGETABLE_PRIVATE_MEMSLOT
;
1835 kvm_userspace_mem
.flags
= 0;
1836 kvm_userspace_mem
.guest_phys_addr
= VMX_EPT_IDENTITY_PAGETABLE_ADDR
;
1837 kvm_userspace_mem
.memory_size
= PAGE_SIZE
;
1838 r
= __kvm_set_memory_region(kvm
, &kvm_userspace_mem
, 0);
1842 down_read(¤t
->mm
->mmap_sem
);
1843 kvm
->arch
.ept_identity_pagetable
= gfn_to_page(kvm
,
1844 VMX_EPT_IDENTITY_PAGETABLE_ADDR
>> PAGE_SHIFT
);
1845 up_read(¤t
->mm
->mmap_sem
);
1847 up_write(&kvm
->slots_lock
);
static void allocate_vpid(struct vcpu_vmx *vmx)
{
	int vpid;

	vmx->vpid = 0;
	if (!enable_vpid || !cpu_has_vmx_vpid())
		return;
	spin_lock(&vmx_vpid_lock);
	vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
	if (vpid < VMX_NR_VPIDS) {
		vmx->vpid = vpid;
		__set_bit(vpid, vmx_vpid_bitmap);
	}
	spin_unlock(&vmx_vpid_lock);
}
static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
{
	void *va;

	if (!cpu_has_vmx_msr_bitmap())
		return;

	/*
	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
	 * have the write-low and read-high bitmap offsets the wrong way round.
	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
	 */
	va = kmap(msr_bitmap);
	if (msr <= 0x1fff) {
		__clear_bit(msr, va + 0x000); /* read-low */
		__clear_bit(msr, va + 0x800); /* write-low */
	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		__clear_bit(msr, va + 0x400); /* read-high */
		__clear_bit(msr, va + 0xc00); /* write-high */
	}
	kunmap(msr_bitmap);
}
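/*
 * The MSR bitmap is one 4KiB page split into four 1KiB bitmaps (read-low,
 * read-high, write-low, write-high), one bit per MSR in each range; an MSR
 * whose read and write bits are both cleared no longer causes a VM-exit
 * once CPU_BASED_USE_MSR_BITMAPS is enabled in the execution controls.
 */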
/*
 * Sets up the vmcs for emulated real mode.
 */
static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
{
	u32 host_sysenter_cs;
	u32 junk;
	unsigned long a;
	struct descriptor_table dt;
	int i;
	unsigned long kvm_vmx_return;
	u32 exec_control;

	/* I/O */
	vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
	vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));

	if (cpu_has_vmx_msr_bitmap())
		vmcs_write64(MSR_BITMAP, page_to_phys(vmx_msr_bitmap));

	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */

	/* Control */
	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
		vmcs_config.pin_based_exec_ctrl);

	exec_control = vmcs_config.cpu_based_exec_ctrl;
	if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
		exec_control &= ~CPU_BASED_TPR_SHADOW;
#ifdef CONFIG_X86_64
		exec_control |= CPU_BASED_CR8_STORE_EXITING |
				CPU_BASED_CR8_LOAD_EXITING;
#endif
	}
	if (!vm_need_ept())
		exec_control |= CPU_BASED_CR3_STORE_EXITING |
				CPU_BASED_CR3_LOAD_EXITING;
	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);

	if (cpu_has_secondary_exec_ctrls()) {
		exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
		if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
			exec_control &=
				~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
		if (vmx->vpid == 0)
			exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
		if (!vm_need_ept())
			exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
	}

	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */

	vmcs_writel(HOST_CR0, read_cr0());  /* 22.2.3 */
	vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
	vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */

	vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
	vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
	vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
	vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs());    /* 22.2.4 */
	vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs());    /* 22.2.4 */
	vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
#ifdef CONFIG_X86_64
	rdmsrl(MSR_FS_BASE, a);
	vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
	rdmsrl(MSR_GS_BASE, a);
	vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
#else
	vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
	vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
#endif

	vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */

	kvm_get_idt(&dt);
	vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */

	asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
	vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);

	rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
	vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
	rdmsrl(MSR_IA32_SYSENTER_ESP, a);
	vmcs_writel(HOST_IA32_SYSENTER_ESP, a);   /* 22.2.3 */
	rdmsrl(MSR_IA32_SYSENTER_EIP, a);
	vmcs_writel(HOST_IA32_SYSENTER_EIP, a);   /* 22.2.3 */

	for (i = 0; i < NR_VMX_MSR; ++i) {
		u32 index = vmx_msr_index[i];
		u32 data_low, data_high;
		u64 data;
		int j = vmx->nmsrs;

		if (rdmsr_safe(index, &data_low, &data_high) < 0)
			continue;
		if (wrmsr_safe(index, data_low, data_high) < 0)
			continue;
		data = data_low | ((u64)data_high << 32);
		vmx->host_msrs[j].index = index;
		vmx->host_msrs[j].reserved = 0;
		vmx->host_msrs[j].data = data;
		vmx->guest_msrs[j] = vmx->host_msrs[j];
		++vmx->nmsrs;
	}

	vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);

	/* 22.2.1, 20.8.1 */
	vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);

	vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
	vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);

	return 0;
}
*kvm
)
2016 if (!init_rmode_tss(kvm
))
2018 if (!init_rmode_identity_map(kvm
))
static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 msr;
	int ret;

	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
	down_read(&vcpu->kvm->slots_lock);
	if (!init_rmode(vmx->vcpu.kvm)) {
		ret = -ENOMEM;
		goto out;
	}

	vmx->vcpu.arch.rmode.active = 0;

	vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
	kvm_set_cr8(&vmx->vcpu, 0);
	msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
	if (vmx->vcpu.vcpu_id == 0)
		msr |= MSR_IA32_APICBASE_BSP;
	kvm_set_apic_base(&vmx->vcpu, msr);

	fx_init(&vmx->vcpu);

	/*
	 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
	 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
	 */
	if (vmx->vcpu.vcpu_id == 0) {
		vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
		vmcs_writel(GUEST_CS_BASE, 0x000f0000);
	} else {
		vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
		vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
	}
	vmcs_write32(GUEST_CS_LIMIT, 0xffff);
	vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);

	seg_setup(VCPU_SREG_DS);
	seg_setup(VCPU_SREG_ES);
	seg_setup(VCPU_SREG_FS);
	seg_setup(VCPU_SREG_GS);
	seg_setup(VCPU_SREG_SS);

	vmcs_write16(GUEST_TR_SELECTOR, 0);
	vmcs_writel(GUEST_TR_BASE, 0);
	vmcs_write32(GUEST_TR_LIMIT, 0xffff);
	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);

	vmcs_write16(GUEST_LDTR_SELECTOR, 0);
	vmcs_writel(GUEST_LDTR_BASE, 0);
	vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
	vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);

	vmcs_write32(GUEST_SYSENTER_CS, 0);
	vmcs_writel(GUEST_SYSENTER_ESP, 0);
	vmcs_writel(GUEST_SYSENTER_EIP, 0);

	vmcs_writel(GUEST_RFLAGS, 0x02);
	if (vmx->vcpu.vcpu_id == 0)
		kvm_rip_write(vcpu, 0xfff0);
	else
		kvm_rip_write(vcpu, 0);
	kvm_register_write(vcpu, VCPU_REGS_RSP, 0);

	/* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
	vmcs_writel(GUEST_DR7, 0x400);

	vmcs_writel(GUEST_GDTR_BASE, 0);
	vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);

	vmcs_writel(GUEST_IDTR_BASE, 0);
	vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);

	vmcs_write32(GUEST_ACTIVITY_STATE, 0);
	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
	vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);

	/* Special registers */
	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);

	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */

	if (cpu_has_vmx_tpr_shadow()) {
		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
		if (vm_need_tpr_shadow(vmx->vcpu.kvm))
			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
				page_to_phys(vmx->vcpu.arch.apic->regs_page));
		vmcs_write32(TPR_THRESHOLD, 0);
	}

	if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
		vmcs_write64(APIC_ACCESS_ADDR,
			     page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));

	vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);

	vmx->vcpu.arch.cr0 = 0x60000010;
	vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
	vmx_set_cr4(&vmx->vcpu, 0);
	vmx_set_efer(&vmx->vcpu, 0);
	vmx_fpu_activate(&vmx->vcpu);
	update_exception_bitmap(&vmx->vcpu);

	vpid_sync_vcpu_all(vmx);

	ret = 0;
out:
	up_read(&vcpu->kvm->slots_lock);
	return ret;
}
*vcpu
, int irq
)
2143 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
2145 KVMTRACE_1D(INJ_VIRQ
, vcpu
, (u32
)irq
, handler
);
2147 if (vcpu
->arch
.rmode
.active
) {
2148 vmx
->rmode
.irq
.pending
= true;
2149 vmx
->rmode
.irq
.vector
= irq
;
2150 vmx
->rmode
.irq
.rip
= kvm_rip_read(vcpu
);
2151 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD
,
2152 irq
| INTR_TYPE_SOFT_INTR
| INTR_INFO_VALID_MASK
);
2153 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN
, 1);
2154 kvm_rip_write(vcpu
, vmx
->rmode
.irq
.rip
- 1);
2157 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD
,
2158 irq
| INTR_TYPE_EXT_INTR
| INTR_INFO_VALID_MASK
);
2161 static void vmx_inject_nmi(struct kvm_vcpu
*vcpu
)
2163 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD
,
2164 INTR_TYPE_NMI_INTR
| INTR_INFO_VALID_MASK
| NMI_VECTOR
);
static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
{
	int word_index = __ffs(vcpu->arch.irq_summary);
	int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
	int irq = word_index * BITS_PER_LONG + bit_index;

	clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
	if (!vcpu->arch.irq_pending[word_index])
		clear_bit(word_index, &vcpu->arch.irq_summary);
	vmx_inject_irq(vcpu, irq);
}
static void do_interrupt_requests(struct kvm_vcpu *vcpu,
				       struct kvm_run *kvm_run)
{
	u32 cpu_based_vm_exec_control;

	vcpu->arch.interrupt_window_open =
		((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
		 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);

	if (vcpu->arch.interrupt_window_open &&
	    vcpu->arch.irq_summary &&
	    !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
		/*
		 * If interrupts enabled, and not blocked by sti or mov ss. Good.
		 */
		kvm_do_inject_irq(vcpu);

	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
	if (!vcpu->arch.interrupt_window_open &&
	    (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
		/*
		 * Interrupts blocked.  Wait for unblock.
		 */
		cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
	else
		cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
}
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
	int ret;
	struct kvm_userspace_memory_region tss_mem = {
		.slot = 8,
		.guest_phys_addr = addr,
		.memory_size = PAGE_SIZE * 3,
		.flags = 0,
	};

	ret = kvm_set_memory_region(kvm, &tss_mem, 0);
	if (ret)
		return ret;
	kvm->arch.tss_addr = addr;
	return 0;
}
*vcpu
)
2228 struct kvm_guest_debug
*dbg
= &vcpu
->guest_debug
;
2230 set_debugreg(dbg
->bp
[0], 0);
2231 set_debugreg(dbg
->bp
[1], 1);
2232 set_debugreg(dbg
->bp
[2], 2);
2233 set_debugreg(dbg
->bp
[3], 3);
2235 if (dbg
->singlestep
) {
2236 unsigned long flags
;
2238 flags
= vmcs_readl(GUEST_RFLAGS
);
2239 flags
|= X86_EFLAGS_TF
| X86_EFLAGS_RF
;
2240 vmcs_writel(GUEST_RFLAGS
, flags
);
static int handle_rmode_exception(struct kvm_vcpu *vcpu,
				  int vec, u32 err_code)
{
	/*
	 * Instruction with address size override prefix opcode 0x67
	 * Cause the #SS fault with 0 error code in VM86 mode.
	 */
	if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
		if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
			return 1;
	/*
	 * Forward all other exceptions that are valid in real mode.
	 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
	 *        the required debugging infrastructure rework.
	 */
	kvm_queue_exception(vcpu, vec);
	return 1;
}
static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 intr_info, error_code;
	unsigned long cr2, rip;
	u32 vect_info;
	enum emulation_result er;

	vect_info = vmx->idt_vectoring_info;
	intr_info = vmcs_read32(VM_EXIT_INTR_INFO);

	if ((vect_info & VECTORING_INFO_VALID_MASK) &&
						!is_page_fault(intr_info))
		printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
		       "intr info 0x%x\n", __func__, vect_info, intr_info);

	if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
		int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
		set_bit(irq, vcpu->arch.irq_pending);
		set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
	}

	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
		return 1;  /* already handled by vmx_vcpu_run() */

	if (is_no_device(intr_info)) {
		vmx_fpu_activate(vcpu);
		return 1;
	}

	if (is_invalid_opcode(intr_info)) {
		er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
		if (er != EMULATE_DONE)
			kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	error_code = 0;
	rip = kvm_rip_read(vcpu);
	if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
	if (is_page_fault(intr_info)) {
		/* EPT won't cause page fault directly */
		if (vm_need_ept())
			BUG();
		cr2 = vmcs_readl(EXIT_QUALIFICATION);
		KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
			    (u32)((u64)cr2 >> 32), handler);
		if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending)
			kvm_mmu_unprotect_page_virt(vcpu, cr2);
		return kvm_mmu_page_fault(vcpu, cr2, error_code);
	}

	if (vcpu->arch.rmode.active &&
	    handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
								error_code)) {
		if (vcpu->arch.halt_request) {
			vcpu->arch.halt_request = 0;
			return kvm_emulate_halt(vcpu);
		}
		return 1;
	}

	if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) ==
	    (INTR_TYPE_EXCEPTION | 1)) {
		kvm_run->exit_reason = KVM_EXIT_DEBUG;
		return 0;
	}
	kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
	kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
	kvm_run->ex.error_code = error_code;
	return 0;
}
static int handle_external_interrupt(struct kvm_vcpu *vcpu,
				     struct kvm_run *kvm_run)
{
	++vcpu->stat.irq_exits;
	KVMTRACE_1D(INTR, vcpu, vmcs_read32(VM_EXIT_INTR_INFO), handler);
	return 1;
}

static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
	return 0;
}
static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	unsigned long exit_qualification;
	int size, down, in, string, rep;
	unsigned port;

	++vcpu->stat.io_exits;
	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
	string = (exit_qualification & 16) != 0;

	if (string) {
		if (emulate_instruction(vcpu,
					kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
			return 0;
		return 1;
	}

	size = (exit_qualification & 7) + 1;
	in = (exit_qualification & 8) != 0;
	down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
	rep = (exit_qualification & 32) != 0;
	port = exit_qualification >> 16;

	return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
}
static void
vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
	/*
	 * Patch in the VMCALL instruction:
	 */
	hypercall[0] = 0x0f;
	hypercall[1] = 0x01;
	hypercall[2] = 0xc1;
}
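
/*
 * The three bytes written above encode the VMCALL instruction (0f 01 c1).
 * Patching the hypercall site at runtime lets the same guest binary issue
 * hypercalls on both VMX and SVM hosts; the SVM module patches in VMMCALL
 * instead.
 */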
static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	unsigned long exit_qualification;
	int cr;
	int reg;

	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
	cr = exit_qualification & 15;
	reg = (exit_qualification >> 8) & 15;
	switch ((exit_qualification >> 4) & 3) {
	case 0: /* mov to cr */
		KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr,
			    (u32)kvm_register_read(vcpu, reg),
			    (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
			    handler);
		switch (cr) {
		case 0:
			kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg));
			skip_emulated_instruction(vcpu);
			return 1;
		case 3:
			kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg));
			skip_emulated_instruction(vcpu);
			return 1;
		case 4:
			kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg));
			skip_emulated_instruction(vcpu);
			return 1;
		case 8:
			kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg));
			skip_emulated_instruction(vcpu);
			if (irqchip_in_kernel(vcpu->kvm))
				return 1;
			kvm_run->exit_reason = KVM_EXIT_SET_TPR;
			return 0;
		}
		break;
	case 2: /* clts */
		vmx_fpu_deactivate(vcpu);
		vcpu->arch.cr0 &= ~X86_CR0_TS;
		vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
		vmx_fpu_activate(vcpu);
		KVMTRACE_0D(CLTS, vcpu, handler);
		skip_emulated_instruction(vcpu);
		return 1;
	case 1: /* mov from cr */
		switch (cr) {
		case 3:
			kvm_register_write(vcpu, reg, vcpu->arch.cr3);
			KVMTRACE_3D(CR_READ, vcpu, (u32)cr,
				    (u32)kvm_register_read(vcpu, reg),
				    (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
				    handler);
			skip_emulated_instruction(vcpu);
			return 1;
		case 8:
			kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu));
			KVMTRACE_2D(CR_READ, vcpu, (u32)cr,
				    (u32)kvm_register_read(vcpu, reg), handler);
			skip_emulated_instruction(vcpu);
			return 1;
		}
		break;
	case 3: /* lmsw */
		kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);

		skip_emulated_instruction(vcpu);
		return 1;
	default:
		break;
	}
	kvm_run->exit_reason = 0;
	pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
		  (int)(exit_qualification >> 4) & 3, cr);
	return 0;
}
static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	unsigned long exit_qualification;
	unsigned long val;
	int dr, reg;

	/*
	 * FIXME: this code assumes the host is debugging the guest.
	 *        need to deal with guest debugging itself too.
	 */
	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
	dr = exit_qualification & 7;
	reg = (exit_qualification >> 8) & 15;
	if (exit_qualification & 16) {
		/* mov from dr */
		switch (dr) {
		case 6:
			val = 0xffff0ff0;
			break;
		case 7:
			val = 0x400;
			break;
		default:
			val = 0;
		}
		kvm_register_write(vcpu, reg, val);
		KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
	} else {
		/* mov to dr */
	}
	skip_emulated_instruction(vcpu);
	return 1;
}
static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	kvm_emulate_cpuid(vcpu);
	return 1;
}

static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
	u64 data;

	if (vmx_get_msr(vcpu, ecx, &data)) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	KVMTRACE_3D(MSR_READ, vcpu, ecx, (u32)data, (u32)(data >> 32),
		    handler);

	/* FIXME: handling of bits 32:63 of rax, rdx */
	vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
	vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
	skip_emulated_instruction(vcpu);
	return 1;
}
static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
	u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
		| ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);

	KVMTRACE_3D(MSR_WRITE, vcpu, ecx, (u32)data, (u32)(data >> 32),
		    handler);

	if (vmx_set_msr(vcpu, ecx, data) != 0) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	skip_emulated_instruction(vcpu);
	return 1;
}
static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu,
				      struct kvm_run *kvm_run)
{
	return 1;
}

static int handle_interrupt_window(struct kvm_vcpu *vcpu,
				   struct kvm_run *kvm_run)
{
	u32 cpu_based_vm_exec_control;

	/* clear pending irq */
	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
	cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);

	KVMTRACE_0D(PEND_INTR, vcpu, handler);

	/*
	 * If the user space waits to inject interrupts, exit as soon as
	 * possible.
	 */
	if (kvm_run->request_interrupt_window &&
	    !vcpu->arch.irq_summary) {
		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
		++vcpu->stat.irq_window_exits;
		return 0;
	}
	return 1;
}
static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	skip_emulated_instruction(vcpu);
	return kvm_emulate_halt(vcpu);
}

static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	skip_emulated_instruction(vcpu);
	kvm_emulate_hypercall(vcpu);
	return 1;
}

static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	skip_emulated_instruction(vcpu);
	/* TODO: Add support for VT-d/pass-through device */
	return 1;
}
static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	u64 exit_qualification;
	enum emulation_result er;
	unsigned long offset;

	exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
	offset = exit_qualification & 0xffful;

	er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);

	if (er != EMULATE_DONE) {
		printk(KERN_ERR
		       "Fail to handle apic access vmexit! Offset is 0x%lx\n",
		       offset);
		return -ENOTSUPP;
	}
	return 1;
}
static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	unsigned long exit_qualification;
	u16 tss_selector;
	int reason;

	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);

	reason = (u32)exit_qualification >> 30;
	tss_selector = exit_qualification;

	return kvm_task_switch(vcpu, tss_selector, reason);
}
static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	u64 exit_qualification;
	enum emulation_result er;
	gpa_t gpa;
	unsigned long hva;
	int gla_validity;
	int r;

	exit_qualification = vmcs_read64(EXIT_QUALIFICATION);

	if (exit_qualification & (1 << 6)) {
		printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
		return -ENOTSUPP;
	}

	gla_validity = (exit_qualification >> 7) & 0x3;
	if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
		printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
		printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
			(long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
			(long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS));
		printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
			(long unsigned int)exit_qualification);
		kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
		kvm_run->hw.hardware_exit_reason = 0;
		return -ENOTSUPP;
	}

	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
	hva = gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT);
	if (!kvm_is_error_hva(hva)) {
		r = kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
		if (r < 0) {
			printk(KERN_ERR "EPT: Not enough memory!\n");
			return -ENOMEM;
		}
		return 1;
	} else {
		/* must be MMIO */
		er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);

		if (er == EMULATE_FAIL) {
			printk(KERN_ERR
			       "EPT: Fail to handle EPT violation vmexit!er is %d\n",
			       er);
			printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
				(long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
				(long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS));
			printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
				(long unsigned int)exit_qualification);
			return -ENOTSUPP;
		} else if (er == EMULATE_DO_MMIO)
			return 0;
	}
	return 1;
}
static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	u32 cpu_based_vm_exec_control;

	/* clear pending NMI */
	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
	cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
	++vcpu->stat.nmi_window_exits;

	return 1;
}
/*
 * The exit handlers return 1 if the exit was handled fully and guest execution
 * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
 * to be done to userspace and return 0.
 */
static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
				      struct kvm_run *kvm_run) = {
	[EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
	[EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
	[EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
	[EXIT_REASON_NMI_WINDOW]              = handle_nmi_window,
	[EXIT_REASON_IO_INSTRUCTION]          = handle_io,
	[EXIT_REASON_CR_ACCESS]               = handle_cr,
	[EXIT_REASON_DR_ACCESS]               = handle_dr,
	[EXIT_REASON_CPUID]                   = handle_cpuid,
	[EXIT_REASON_MSR_READ]                = handle_rdmsr,
	[EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
	[EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
	[EXIT_REASON_HLT]                     = handle_halt,
	[EXIT_REASON_VMCALL]                  = handle_vmcall,
	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
	[EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
	[EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
};

static const int kvm_vmx_max_exit_handlers =
	ARRAY_SIZE(kvm_vmx_exit_handlers);
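
/*
 * Dispatch is a plain table lookup in kvm_handle_exit() below, so a handler
 * for a new exit reason only needs one extra entry here.  A minimal handler,
 * as a sketch (handle_example and EXIT_REASON_EXAMPLE are hypothetical names):
 *
 *	static int handle_example(struct kvm_vcpu *vcpu,
 *				  struct kvm_run *kvm_run)
 *	{
 *		skip_emulated_instruction(vcpu);
 *		return 1;	fully handled, resume the guest
 *	}
 */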
/*
 * The guest has exited.  See if we can fix it or if we need userspace
 * assistance.
 */
static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
{
	u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 vectoring_info = vmx->idt_vectoring_info;

	KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
		    (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);

	/* Accessing CR3 doesn't cause a VM exit in paging mode, so we need
	 * to sync with the guest's real CR3. */
	if (vm_need_ept() && is_paging(vcpu)) {
		vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
		ept_load_pdptrs(vcpu);
	}

	if (unlikely(vmx->fail)) {
		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		kvm_run->fail_entry.hardware_entry_failure_reason
			= vmcs_read32(VM_INSTRUCTION_ERROR);
		return 0;
	}

	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
	    (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
	     exit_reason != EXIT_REASON_EPT_VIOLATION))
		printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
		       "exit reason is 0x%x\n", __func__, exit_reason);
	if (exit_reason < kvm_vmx_max_exit_handlers
	    && kvm_vmx_exit_handlers[exit_reason])
		return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
	else {
		kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
		kvm_run->hw.hardware_exit_reason = exit_reason;
	}
	return 0;
}
static void update_tpr_threshold(struct kvm_vcpu *vcpu)
{
	int max_irr, tpr;

	if (!vm_need_tpr_shadow(vcpu->kvm))
		return;

	if (!kvm_lapic_enabled(vcpu) ||
	    ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) {
		vmcs_write32(TPR_THRESHOLD, 0);
		return;
	}

	tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4;
	vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
}
static void enable_irq_window(struct kvm_vcpu *vcpu)
{
	u32 cpu_based_vm_exec_control;

	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
}

static void enable_nmi_window(struct kvm_vcpu *vcpu)
{
	u32 cpu_based_vm_exec_control;

	if (!cpu_has_virtual_nmis())
		return;

	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
}
static int vmx_nmi_enabled(struct kvm_vcpu *vcpu)
{
	u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
	return !(guest_intr & (GUEST_INTR_STATE_NMI |
			       GUEST_INTR_STATE_MOV_SS |
			       GUEST_INTR_STATE_STI));
}

static int vmx_irq_enabled(struct kvm_vcpu *vcpu)
{
	u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
	return (!(guest_intr & (GUEST_INTR_STATE_MOV_SS |
				GUEST_INTR_STATE_STI)) &&
		(vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF));
}
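
/*
 * The guest interruptibility-state field tracks the transient blocking
 * states defined by the architecture: bit 0 is blocking by STI, bit 1 is
 * blocking by MOV SS, bit 3 is blocking by NMI.  NMIs are gated only by the
 * STI/MOV SS shadow and by an NMI already in service, while external
 * interrupts additionally require RFLAGS.IF, which is what the two helpers
 * above check.
 */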
static void enable_intr_window(struct kvm_vcpu *vcpu)
{
	if (vcpu->arch.nmi_pending)
		enable_nmi_window(vcpu);
	else if (kvm_cpu_has_interrupt(vcpu))
		enable_irq_window(vcpu);
}
static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
{
	u32 exit_intr_info;
	u32 idt_vectoring_info;
	bool unblock_nmi;
	u8 vector;
	int type;
	bool idtv_info_valid;
	u32 error;

	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
	if (cpu_has_virtual_nmis()) {
		unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
		vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
		/*
		 * Re-set bit "block by NMI" before VM entry if vmexit caused by
		 * a guest IRET fault.
		 */
		if (unblock_nmi && vector != DF_VECTOR)
			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
				      GUEST_INTR_STATE_NMI);
	}

	idt_vectoring_info = vmx->idt_vectoring_info;
	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
	vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
	type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
	if (vmx->vcpu.arch.nmi_injected) {
		/*
		 * Clear bit "block by NMI" before VM entry if a NMI delivery
		 * faulted.
		 */
		if (idtv_info_valid && type == INTR_TYPE_NMI_INTR)
			vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
					GUEST_INTR_STATE_NMI);
		else
			vmx->vcpu.arch.nmi_injected = false;
	}
	kvm_clear_exception_queue(&vmx->vcpu);
	if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) {
		if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
			error = vmcs_read32(IDT_VECTORING_ERROR_CODE);
			kvm_queue_exception_e(&vmx->vcpu, vector, error);
		} else
			kvm_queue_exception(&vmx->vcpu, vector);
		vmx->idt_vectoring_info = 0;
	}
	kvm_clear_interrupt_queue(&vmx->vcpu);
	if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) {
		kvm_queue_interrupt(&vmx->vcpu, vector);
		vmx->idt_vectoring_info = 0;
	}
}
static void vmx_intr_assist(struct kvm_vcpu *vcpu)
{
	u32 intr_info_field;

	update_tpr_threshold(vcpu);

	intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
	if (cpu_has_virtual_nmis()) {
		if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
			if (vmx_nmi_enabled(vcpu)) {
				vcpu->arch.nmi_pending = false;
				vcpu->arch.nmi_injected = true;
			} else {
				enable_intr_window(vcpu);
				return;
			}
		}
		if (vcpu->arch.nmi_injected) {
			vmx_inject_nmi(vcpu);
			enable_intr_window(vcpu);
			return;
		}
	}
	if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
		if (vmx_irq_enabled(vcpu))
			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
		else
			enable_irq_window(vcpu);
	}
	if (vcpu->arch.interrupt.pending) {
		vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
		kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr);
	}
}
/*
 * Failure to inject an interrupt should give us the information
 * in IDT_VECTORING_INFO_FIELD.  However, if the failure occurs
 * when fetching the interrupt redirection bitmap in the real-mode
 * tss, this doesn't happen.  So we do it ourselves.
 */
static void fixup_rmode_irq(struct vcpu_vmx *vmx)
{
	vmx->rmode.irq.pending = 0;
	if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip)
		return;
	kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip);
	if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
		vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
		vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
		return;
	}
	vmx->idt_vectoring_info =
		VECTORING_INFO_VALID_MASK
		| INTR_TYPE_EXT_INTR
		| vmx->rmode.irq.vector;
}
#ifdef CONFIG_X86_64
#define R "r"
#define Q "q"
#else
#define R "e"
#define Q "l"
#endif
static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 intr_info;

	if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
	if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);

	/*
	 * Loading guest fpu may have cleared host cr0.ts
	 */
	vmcs_writel(HOST_CR0, read_cr0());

	asm(
		/* Store host registers */
		"push %%"R"dx; push %%"R"bp;"
		"push %%"R"cx \n\t"
		"cmp %%"R"sp, %c[host_rsp](%0) \n\t"
		"je 1f \n\t"
		"mov %%"R"sp, %c[host_rsp](%0) \n\t"
		__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
		"1: \n\t"
		/* Check if vmlaunch or vmresume is needed */
		"cmpl $0, %c[launched](%0) \n\t"
		/* Load guest registers.  Don't clobber flags. */
		"mov %c[cr2](%0), %%"R"ax \n\t"
		"mov %%"R"ax, %%cr2 \n\t"
		"mov %c[rax](%0), %%"R"ax \n\t"
		"mov %c[rbx](%0), %%"R"bx \n\t"
		"mov %c[rdx](%0), %%"R"dx \n\t"
		"mov %c[rsi](%0), %%"R"si \n\t"
		"mov %c[rdi](%0), %%"R"di \n\t"
		"mov %c[rbp](%0), %%"R"bp \n\t"
#ifdef CONFIG_X86_64
		"mov %c[r8](%0),  %%r8  \n\t"
		"mov %c[r9](%0),  %%r9  \n\t"
		"mov %c[r10](%0), %%r10 \n\t"
		"mov %c[r11](%0), %%r11 \n\t"
		"mov %c[r12](%0), %%r12 \n\t"
		"mov %c[r13](%0), %%r13 \n\t"
		"mov %c[r14](%0), %%r14 \n\t"
		"mov %c[r15](%0), %%r15 \n\t"
#endif
		"mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */

		/* Enter guest mode */
		"jne .Llaunched \n\t"
		__ex(ASM_VMX_VMLAUNCH) "\n\t"
		"jmp .Lkvm_vmx_return \n\t"
		".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
		".Lkvm_vmx_return: "
		/* Save guest registers, load host registers, keep flags */
		"xchg %0,     (%%"R"sp) \n\t"
		"mov %%"R"ax, %c[rax](%0) \n\t"
		"mov %%"R"bx, %c[rbx](%0) \n\t"
		"push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t"
		"mov %%"R"dx, %c[rdx](%0) \n\t"
		"mov %%"R"si, %c[rsi](%0) \n\t"
		"mov %%"R"di, %c[rdi](%0) \n\t"
		"mov %%"R"bp, %c[rbp](%0) \n\t"
#ifdef CONFIG_X86_64
		"mov %%r8,  %c[r8](%0) \n\t"
		"mov %%r9,  %c[r9](%0) \n\t"
		"mov %%r10, %c[r10](%0) \n\t"
		"mov %%r11, %c[r11](%0) \n\t"
		"mov %%r12, %c[r12](%0) \n\t"
		"mov %%r13, %c[r13](%0) \n\t"
		"mov %%r14, %c[r14](%0) \n\t"
		"mov %%r15, %c[r15](%0) \n\t"
#endif
		"mov %%cr2, %%"R"ax   \n\t"
		"mov %%"R"ax, %c[cr2](%0) \n\t"

		"pop  %%"R"bp; pop  %%"R"bp; pop  %%"R"dx \n\t"
		"setbe %c[fail](%0) \n\t"
	      : : "c"(vmx), "d"((unsigned long)HOST_RSP),
		[launched]"i"(offsetof(struct vcpu_vmx, launched)),
		[fail]"i"(offsetof(struct vcpu_vmx, fail)),
		[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
		[rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
		[rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
		[rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
		[rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
		[rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
		[rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
		[rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
#ifdef CONFIG_X86_64
		[r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
		[r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
		[r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
		[r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
		[r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
		[r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
		[r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
		[r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
#endif
		[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
	      : "cc", "memory"
		, R"bx", R"di", R"si"
#ifdef CONFIG_X86_64
		, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
#endif
	      );

	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
	vcpu->arch.regs_dirty = 0;

	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
	if (vmx->rmode.irq.pending)
		fixup_rmode_irq(vmx);

	vcpu->arch.interrupt_window_open =
		(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
		 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)) == 0;

	asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
	vmx->launched = 1;

	intr_info = vmcs_read32(VM_EXIT_INTR_INFO);

	/* We need to handle NMIs before interrupts are enabled */
	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200 &&
	    (intr_info & INTR_INFO_VALID_MASK)) {
		KVMTRACE_0D(NMI, vcpu, handler);
		asm("int $2");
	}

	vmx_complete_interrupts(vmx);
}

#undef R
#undef Q
static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (vmx->vmcs) {
		free_vmcs(vmx->vmcs);
		vmx->vmcs = NULL;
	}
}

static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	spin_lock(&vmx_vpid_lock);
	if (vmx->vpid != 0)
		__clear_bit(vmx->vpid, vmx_vpid_bitmap);
	spin_unlock(&vmx_vpid_lock);
	vmx_free_vmcs(vcpu);
	kfree(vmx->host_msrs);
	kfree(vmx->guest_msrs);
	kvm_vcpu_uninit(vcpu);
	kmem_cache_free(kvm_vcpu_cache, vmx);
}
static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
{
	int err;
	struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
	int cpu;

	if (!vmx)
		return ERR_PTR(-ENOMEM);

	err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
	if (err)
		goto free_vcpu;

	vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!vmx->guest_msrs) {
		err = -ENOMEM;
		goto uninit_vcpu;
	}

	vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!vmx->host_msrs)
		goto free_guest_msrs;

	vmx->vmcs = alloc_vmcs();
	if (!vmx->vmcs)
		goto free_msrs;

	vmcs_clear(vmx->vmcs);

	cpu = get_cpu();
	vmx_vcpu_load(&vmx->vcpu, cpu);
	err = vmx_vcpu_setup(vmx);
	vmx_vcpu_put(&vmx->vcpu);
	put_cpu();
	if (err)
		goto free_vmcs;
	if (vm_need_virtualize_apic_accesses(kvm))
		if (alloc_apic_access_page(kvm) != 0)
			goto free_vmcs;

	if (vm_need_ept())
		if (alloc_identity_pagetable(kvm) != 0)
			goto free_vmcs;

	return &vmx->vcpu;

free_vmcs:
	free_vmcs(vmx->vmcs);
free_msrs:
	kfree(vmx->host_msrs);
free_guest_msrs:
	kfree(vmx->guest_msrs);
uninit_vcpu:
	kvm_vcpu_uninit(&vmx->vcpu);
free_vcpu:
	kmem_cache_free(kvm_vcpu_cache, vmx);
	return ERR_PTR(err);
}
static void __init vmx_check_processor_compat(void *rtn)
{
	struct vmcs_config vmcs_conf;

	*(int *)rtn = 0;
	if (setup_vmcs_config(&vmcs_conf) < 0)
		*(int *)rtn = -EIO;
	if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
		printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
				smp_processor_id());
		*(int *)rtn = -EIO;
	}
}

static int get_ept_level(void)
{
	return VMX_EPT_DEFAULT_GAW + 1;
}
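
/*
 * Assuming the usual definition of VMX_EPT_DEFAULT_GAW as 3 (the EPTP GAW
 * field encodes the page-walk length minus one), the value returned here
 * selects a four-level EPT page-table walk; the generic MMU uses it as the
 * TDP root level via .get_tdp_level.
 */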
static struct kvm_x86_ops vmx_x86_ops = {
	.cpu_has_kvm_support = cpu_has_kvm_support,
	.disabled_by_bios = vmx_disabled_by_bios,
	.hardware_setup = hardware_setup,
	.hardware_unsetup = hardware_unsetup,
	.check_processor_compatibility = vmx_check_processor_compat,
	.hardware_enable = hardware_enable,
	.hardware_disable = hardware_disable,
	.cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses,

	.vcpu_create = vmx_create_vcpu,
	.vcpu_free = vmx_free_vcpu,
	.vcpu_reset = vmx_vcpu_reset,

	.prepare_guest_switch = vmx_save_host_state,
	.vcpu_load = vmx_vcpu_load,
	.vcpu_put = vmx_vcpu_put,

	.set_guest_debug = set_guest_debug,
	.guest_debug_pre = kvm_guest_debug_pre,
	.get_msr = vmx_get_msr,
	.set_msr = vmx_set_msr,
	.get_segment_base = vmx_get_segment_base,
	.get_segment = vmx_get_segment,
	.set_segment = vmx_set_segment,
	.get_cpl = vmx_get_cpl,
	.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
	.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
	.set_cr0 = vmx_set_cr0,
	.set_cr3 = vmx_set_cr3,
	.set_cr4 = vmx_set_cr4,
	.set_efer = vmx_set_efer,
	.get_idt = vmx_get_idt,
	.set_idt = vmx_set_idt,
	.get_gdt = vmx_get_gdt,
	.set_gdt = vmx_set_gdt,
	.cache_reg = vmx_cache_reg,
	.get_rflags = vmx_get_rflags,
	.set_rflags = vmx_set_rflags,

	.tlb_flush = vmx_flush_tlb,

	.run = vmx_vcpu_run,
	.handle_exit = kvm_handle_exit,
	.skip_emulated_instruction = skip_emulated_instruction,
	.patch_hypercall = vmx_patch_hypercall,
	.get_irq = vmx_get_irq,
	.set_irq = vmx_inject_irq,
	.queue_exception = vmx_queue_exception,
	.exception_injected = vmx_exception_injected,
	.inject_pending_irq = vmx_intr_assist,
	.inject_pending_vectors = do_interrupt_requests,

	.set_tss_addr = vmx_set_tss_addr,
	.get_tdp_level = get_ept_level,
};
static int __init vmx_init(void)
{
	void *va;
	int r;

	vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
	if (!vmx_io_bitmap_a)
		return -ENOMEM;

	vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
	if (!vmx_io_bitmap_b) {
		r = -ENOMEM;
		goto out;
	}

	vmx_msr_bitmap = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
	if (!vmx_msr_bitmap) {
		r = -ENOMEM;
		goto out1;
	}

	/*
	 * Allow direct access to the PC debug port (it is often used for I/O
	 * delays, but the vmexits simply slow things down).
	 */
	va = kmap(vmx_io_bitmap_a);
	memset(va, 0xff, PAGE_SIZE);
	clear_bit(0x80, va);
	kunmap(vmx_io_bitmap_a);

	va = kmap(vmx_io_bitmap_b);
	memset(va, 0xff, PAGE_SIZE);
	kunmap(vmx_io_bitmap_b);

	va = kmap(vmx_msr_bitmap);
	memset(va, 0xff, PAGE_SIZE);
	kunmap(vmx_msr_bitmap);

	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */

	r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
	if (r)
		goto out2;

	vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_FS_BASE);
	vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_GS_BASE);
	vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_CS);
	vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP);
	vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP);

	if (vm_need_ept()) {
		bypass_guest_pf = 0;
		kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
			VMX_EPT_WRITABLE_MASK |
			VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
		kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
				VMX_EPT_EXECUTABLE_MASK);
	}

	if (bypass_guest_pf)
		kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);

	return 0;

out2:
	__free_page(vmx_msr_bitmap);
out1:
	__free_page(vmx_io_bitmap_b);
out:
	__free_page(vmx_io_bitmap_a);
	return r;
}
static void __exit vmx_exit(void)
{
	__free_page(vmx_msr_bitmap);
	__free_page(vmx_io_bitmap_b);
	__free_page(vmx_io_bitmap_a);

	kvm_exit();
}

module_init(vmx_init)
module_exit(vmx_exit)