x86/cpufeature: Replace cpu_has_xsaves with boot_cpu_has() usage
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9bd8f44baded2318e8b5bc2a4773068a45a8723b..d5908bde93429d5d5759478d4c61380935e4b706 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -598,6 +598,10 @@ struct vcpu_vmx {
        struct page *pml_pg;
 
        u64 current_tsc_ratio;
+
+       bool guest_pkru_valid;
+       u32 guest_pkru;
+       u32 host_pkru;
 };
 
 enum segment_cache_field {
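
The three new fields implement a lazy PKRU switch between host and guest. Pieced together from the hunks further down, the intended lifecycle is roughly:

        /*
         * vcpu_load:  vmx->host_pkru = read_pkru();
         * vmexit:     vmx->guest_pkru = __read_pkru();
         *             if (vmx->guest_pkru != vmx->host_pkru) {
         *                     vmx->guest_pkru_valid = true;
         *                     __write_pkru(vmx->host_pkru);
         *             } else
         *                     vmx->guest_pkru_valid = false;
         * vmentry:    if (vmx->guest_pkru_valid)
         *                     __write_pkru(vmx->guest_pkru);
         */
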
@@ -863,7 +867,6 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
-static bool vmx_mpx_supported(void);
 static bool vmx_xsaves_supported(void);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
@@ -963,25 +966,36 @@ static const u32 vmx_msr_index[] = {
        MSR_EFER, MSR_TSC_AUX, MSR_STAR,
 };
 
-static inline bool is_page_fault(u32 intr_info)
+static inline bool is_exception_n(u32 intr_info, u8 vector)
 {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
                             INTR_INFO_VALID_MASK)) ==
-               (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
+               (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
+}
+
+static inline bool is_debug(u32 intr_info)
+{
+       return is_exception_n(intr_info, DB_VECTOR);
+}
+
+static inline bool is_breakpoint(u32 intr_info)
+{
+       return is_exception_n(intr_info, BP_VECTOR);
+}
+
+static inline bool is_page_fault(u32 intr_info)
+{
+       return is_exception_n(intr_info, PF_VECTOR);
 }
 
 static inline bool is_no_device(u32 intr_info)
 {
-       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
-                            INTR_INFO_VALID_MASK)) ==
-               (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
+       return is_exception_n(intr_info, NM_VECTOR);
 }
 
 static inline bool is_invalid_opcode(u32 intr_info)
 {
-       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
-                            INTR_INFO_VALID_MASK)) ==
-               (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
+       return is_exception_n(intr_info, UD_VECTOR);
 }
 
 static inline bool is_external_interrupt(u32 intr_info)
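
is_exception_n() folds the three copy-pasted mask comparisons above into a single predicate. As a standalone sketch (the constants mirror arch/x86/include/asm/vmx.h: vector in bits 7:0, interruption type in bits 10:8, valid in bit 31, type 3 = hardware exception):

        #include <stdbool.h>
        #include <stdint.h>

        #define INTR_INFO_VECTOR_MASK     0x000000ffu    /* bits 7:0 */
        #define INTR_INFO_INTR_TYPE_MASK  0x00000700u    /* bits 10:8 */
        #define INTR_INFO_VALID_MASK      0x80000000u    /* bit 31 */
        #define INTR_TYPE_HARD_EXCEPTION  (3u << 8)      /* hardware exception */

        static inline bool is_exception_n(uint32_t intr_info, uint8_t vector)
        {
                return (intr_info & (INTR_INFO_INTR_TYPE_MASK |
                                     INTR_INFO_VECTOR_MASK |
                                     INTR_INFO_VALID_MASK)) ==
                       (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
        }

Any exception vector (DB_VECTOR = 1, BP_VECTOR = 3, PF_VECTOR = 14, ...) can now be tested in one line, which is all that is_debug(), is_breakpoint() and is_page_fault() do.
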
@@ -2097,6 +2111,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
        } while (cmpxchg(&pi_desc->control, old.control,
                        new.control) != old.control);
 }
+
 /*
  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
  * vcpu mutex is already taken.
@@ -2157,6 +2172,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
        }
 
        vmx_vcpu_pi_load(vcpu, cpu);
+       vmx->host_pkru = read_pkru();
 }
 
 static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
@@ -2276,6 +2292,11 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
        vmcs_writel(GUEST_RFLAGS, rflags);
 }
 
+static u32 vmx_get_pkru(struct kvm_vcpu *vcpu)
+{
+       return to_vmx(vcpu)->guest_pkru;
+}
+
 static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
 {
        u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
@@ -2605,7 +2626,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
                VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
                VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
 
-       if (vmx_mpx_supported())
+       if (kvm_mpx_supported())
                vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
 
        /* We support free control of debug control saving. */
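
kvm_mpx_supported() replaces the VMX-local vmx_mpx_supported() throughout this patch; at the time of this series it lived in arch/x86/kvm/x86.h as, roughly:

        static inline bool kvm_mpx_supported(void)
        {
                return (kvm_supported_xcr0() &
                        (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) ==
                       (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
        }

i.e. MPX is only reported when both BNDREGS and BNDCSR XSAVE states are usable, so the check honors the host's XSAVE configuration, not just the VMCS controls.
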
@@ -2626,7 +2647,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
                VM_ENTRY_LOAD_IA32_PAT;
        vmx->nested.nested_vmx_entry_ctls_high |=
                (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
-       if (vmx_mpx_supported())
+       if (kvm_mpx_supported())
                vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
 
        /* We support free control of debug control loading. */
@@ -2702,8 +2723,15 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
        } else
                vmx->nested.nested_vmx_ept_caps = 0;
 
+       /*
+        * Old versions of KVM use the single-context version without
+        * checking for support, so declare that it is supported even
+        * though it is treated as global context.  The alternative is
+        * not failing the single-context invvpid, and it is worse.
+        */
        if (enable_vpid)
                vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
+                               VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |
                                VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
        else
                vmx->nested.nested_vmx_vpid_caps = 0;
@@ -2870,7 +2898,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
                break;
        case MSR_IA32_BNDCFGS:
-               if (!vmx_mpx_supported())
+               if (!kvm_mpx_supported())
                        return 1;
                msr_info->data = vmcs_read64(GUEST_BNDCFGS);
                break;
@@ -2947,7 +2975,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                vmcs_writel(GUEST_SYSENTER_ESP, data);
                break;
        case MSR_IA32_BNDCFGS:
-               if (!vmx_mpx_supported())
+               if (!kvm_mpx_supported())
                        return 1;
                vmcs_write64(GUEST_BNDCFGS, data);
                break;
@@ -3358,7 +3386,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                }
        }
 
-       if (cpu_has_xsaves)
+       if (boot_cpu_has(X86_FEATURE_XSAVES))
                rdmsrl(MSR_IA32_XSS, host_xss);
 
        return 0;
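
This hunk is the one the commit subject refers to: the cpu_has_* wrapper macros were removed tree-wide, and cpu_has_xsaves was a plain alias, roughly:

        /* old wrapper, since removed from <asm/cpufeature.h> */
        #define cpu_has_xsaves  boot_cpu_has(X86_FEATURE_XSAVES)

so spelling out boot_cpu_has(X86_FEATURE_XSAVES) is a behavior-neutral cleanup.
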
@@ -3420,7 +3448,7 @@ static void init_vmcs_shadow_fields(void)
        for (i = j = 0; i < max_shadow_read_write_fields; i++) {
                switch (shadow_read_write_fields[i]) {
                case GUEST_BNDCFGS:
-                       if (!vmx_mpx_supported())
+                       if (!kvm_mpx_supported())
                                continue;
                        break;
                default:
@@ -3876,13 +3904,17 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 
        if (!enable_unrestricted_guest && !is_paging(vcpu))
                /*
-                * SMEP/SMAP is disabled if CPU is in non-paging mode in
-                * hardware.  However KVM always uses paging mode without
-                * unrestricted guest.
-                * To emulate this behavior, SMEP/SMAP needs to be manually
-                * disabled when guest switches to non-paging mode.
+                * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
+                * hardware.  To emulate this behavior, SMEP/SMAP/PKU needs
+                * to be manually disabled when guest switches to non-paging
+                * mode.
+                *
+                * If !enable_unrestricted_guest, the CPU is always running
+                * with CR0.PG=1 and CR4 needs to be modified.
+                * If enable_unrestricted_guest, the CPU automatically
+                * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
                 */
-               hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
+               hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
 
        vmcs_writel(CR4_READ_SHADOW, cr4);
        vmcs_writel(GUEST_CR4, hw_cr4);
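
A concrete case for the reworded comment (assuming !enable_unrestricted_guest): when the guest clears CR0.PG while CR4.PKE is set, the CPU really keeps running with paging enabled, so PKE has to be masked out of the value loaded into hardware while the guest still reads back its own value:

        hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
        vmcs_writel(CR4_READ_SHADOW, cr4);      /* what the guest observes */
        vmcs_writel(GUEST_CR4, hw_cr4);         /* what the CPU actually uses */
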
@@ -5496,7 +5528,7 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
                return kvm_set_cr4(vcpu, val);
 }
 
-/* called to set cr0 as approriate for clts instruction exit. */
+/* called to set cr0 as appropriate for clts instruction exit. */
 static void handle_clts(struct kvm_vcpu *vcpu)
 {
        if (is_guest_mode(vcpu)) {
@@ -5629,11 +5661,8 @@ static int handle_dr(struct kvm_vcpu *vcpu)
        }
 
        if (vcpu->guest_debug == 0) {
-               u32 cpu_based_vm_exec_control;
-
-               cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-               cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING;
-               vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+               vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+                               CPU_BASED_MOV_DR_EXITING);
 
                /*
                 * No more DR vmexits; force a reload of the debug registers
@@ -5670,8 +5699,6 @@ static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
 
 static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
 {
-       u32 cpu_based_vm_exec_control;
-
        get_debugreg(vcpu->arch.db[0], 0);
        get_debugreg(vcpu->arch.db[1], 1);
        get_debugreg(vcpu->arch.db[2], 2);
@@ -5680,10 +5707,7 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
        vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
 
        vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
-
-       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-       cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING;
-       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+       vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING);
 }
 
 static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
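
Both debug-register hunks replace an open-coded read/modify/write of CPU_BASED_VM_EXEC_CONTROL with existing helpers; sketched from the vmx.c of that era, they are simply:

        static void vmcs_clear_bits(unsigned long field, u32 mask)
        {
                vmcs_writel(field, vmcs_readl(field) & ~mask);
        }

        static void vmcs_set_bits(unsigned long field, u32 mask)
        {
                vmcs_writel(field, vmcs_readl(field) | mask);
        }

Same VMREAD/VMWRITE pair, three lines shorter at each call site.
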
@@ -5768,8 +5792,7 @@ static int handle_halt(struct kvm_vcpu *vcpu)
 
 static int handle_vmcall(struct kvm_vcpu *vcpu)
 {
-       kvm_emulate_hypercall(vcpu);
-       return 1;
+       return kvm_emulate_hypercall(vcpu);
 }
 
 static int handle_invd(struct kvm_vcpu *vcpu)
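
Propagating the return value is a bug fix, not style: kvm_emulate_hypercall() can return 0 to request an exit to userspace (the Hyper-V hypercall path does this), and the hard-coded return 1 silently resumed the guest instead. Schematically:

        /* before: a userspace-exit request was swallowed */
        kvm_emulate_hypercall(vcpu);            /* may return 0 */
        return 1;                               /* always resumed the guest */

        /* after: 0 propagates and KVM exits to userspace */
        return kvm_emulate_hypercall(vcpu);
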
@@ -6456,8 +6479,8 @@ static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
 
        if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
                /* Recycle the least recently used VMCS. */
-               item = list_entry(vmx->nested.vmcs02_pool.prev,
-                       struct vmcs02_list, list);
+               item = list_last_entry(&vmx->nested.vmcs02_pool,
+                                      struct vmcs02_list, list);
                item->vmptr = vmx->nested.current_vmptr;
                list_move(&item->list, &vmx->nested.vmcs02_pool);
                return &item->vmcs02;
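
list_last_entry() is the self-documenting spelling of the same operation; from include/linux/list.h:

        #define list_last_entry(ptr, type, member) \
                list_entry((ptr)->prev, type, member)

The generated code is identical; only the intent (recycle the least recently used element from the tail) becomes explicit.
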
@@ -7244,7 +7267,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
        /* The value to write might be 32 or 64 bits, depending on L1's long
         * mode, and eventually we need to write that into a field of several
         * possible lengths. The code below first zero-extends the value to 64
-        * bit (field_value), and then copies only the approriate number of
+        * bit (field_value), and then copies only the appropriate number of
         * bits into the vmcs12 field.
         */
        u64 field_value = 0;
@@ -7398,6 +7421,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
        if (!(types & (1UL << type))) {
                nested_vmx_failValid(vcpu,
                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+               skip_emulated_instruction(vcpu);
                return 1;
        }
 
@@ -7456,6 +7480,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
        if (!(types & (1UL << type))) {
                nested_vmx_failValid(vcpu,
                        VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+               skip_emulated_instruction(vcpu);
                return 1;
        }
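
The two skip_emulated_instruction() additions (here and in handle_invept above) share one rationale: a VMX-instruction handler must advance the guest RIP even on the failure path, otherwise the guest re-executes the same INVEPT/INVVPID and livelocks on an identical vmexit. The emulation pattern is:

        nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
        skip_emulated_instruction(vcpu);        /* move RIP past the instruction */
        return 1;                               /* re-enter the guest */
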
 
@@ -7472,12 +7497,17 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
        }
 
        switch (type) {
+       case VMX_VPID_EXTENT_SINGLE_CONTEXT:
+               /*
+                * Old versions of KVM use the single-context version so we
+                * have to support it; just treat it the same as all-context.
+                */
        case VMX_VPID_EXTENT_ALL_CONTEXT:
                __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
                nested_vmx_succeed(vcpu);
                break;
        default:
-               /* Trap single context invalidation invvpid calls */
+               /* Trap individual address invalidation invvpid calls */
                BUG_ON(1);
                break;
        }
@@ -7773,6 +7803,13 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
                else if (is_no_device(intr_info) &&
                         !(vmcs12->guest_cr0 & X86_CR0_TS))
                        return false;
+               else if (is_debug(intr_info) &&
+                        vcpu->guest_debug &
+                        (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+                       return false;
+               else if (is_breakpoint(intr_info) &&
+                        vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+                       return false;
                return vmcs12->exception_bitmap &
                                (1u << (intr_info & INTR_INFO_VECTOR_MASK));
        case EXIT_REASON_EXTERNAL_INTERRUPT:
@@ -8377,6 +8414,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
 static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 {
        u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+       register void *__sp asm(_ASM_SP);
 
        /*
         * If external interrupt exists, IF bit is set in rflags/eflags on the
@@ -8409,8 +8447,9 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
                        "call *%[entry]\n\t"
                        :
 #ifdef CONFIG_X86_64
-                       [sp]"=&r"(tmp)
+                       [sp]"=&r"(tmp),
 #endif
+                       "+r"(__sp)
                        :
                        [entry]"r"(entry),
                        [ss]"i"(__KERNEL_DS),
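
The register-variable operand works around GCC's refusal to accept the stack pointer in an asm clobber list: binding a local variable to _ASM_SP and passing it as a "+r" operand tells the compiler that the asm pushes to and pops from the stack (the fake interrupt frame and call above), so it can set the frame up correctly. A minimal freestanding illustration (x86-64 assumed; the function name is hypothetical):

        void stack_touch(void)
        {
                register void *sp asm("rsp");
                unsigned long tmp = 0;

                /* "+r"(sp) marks the stack pointer as read and written */
                asm volatile("push %[t]\n\t"
                             "pop %[t]"
                             : [t] "+r"(tmp), "+r"(sp));
        }
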
@@ -8611,6 +8650,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
                vmx_set_interrupt_shadow(vcpu, 0);
 
+       if (vmx->guest_pkru_valid)
+               __write_pkru(vmx->guest_pkru);
+
        atomic_switch_perf_msrs(vmx);
        debugctlmsr = get_debugctlmsr();
 
@@ -8750,6 +8792,20 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
        vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
 
+       /*
+        * eager fpu is enabled if PKEY is supported and CR4 is switched
+        * back on host, so it is safe to read guest PKRU from current
+        * XSAVE.
+        */
+       if (boot_cpu_has(X86_FEATURE_OSPKE)) {
+               vmx->guest_pkru = __read_pkru();
+               if (vmx->guest_pkru != vmx->host_pkru) {
+                       vmx->guest_pkru_valid = true;
+                       __write_pkru(vmx->host_pkru);
+               } else
+                       vmx->guest_pkru_valid = false;
+       }
+
        /*
         * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
         * we did not inject a still-pending event to L1 now because of
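
The __read_pkru()/__write_pkru() calls above wrap the RDPKRU/WRPKRU instructions; sketched from the arch/x86/include/asm/special_insns.h of that era (raw opcode bytes, since older assemblers lack the mnemonics):

        static inline u32 __read_pkru(void)
        {
                u32 ecx = 0, edx, pkru;

                /* rdpkru: returns PKRU in eax, requires ecx = 0 */
                asm volatile(".byte 0x0f,0x01,0xee\n\t"
                             : "=a" (pkru), "=d" (edx)
                             : "c" (ecx));
                return pkru;
        }

        static inline void __write_pkru(u32 pkru)
        {
                u32 ecx = 0, edx = 0;

                /* wrpkru: writes eax into PKRU, requires ecx = edx = 0 */
                asm volatile(".byte 0x0f,0x01,0xef\n\t"
                             : : "a" (pkru), "c" (ecx), "d" (edx));
        }

The X86_FEATURE_OSPKE guard matters: both instructions #UD unless CR4.PKE is set.
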
@@ -10277,7 +10333,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
        vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
        vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
-       if (vmx_mpx_supported())
+       if (kvm_mpx_supported())
                vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
        if (nested_cpu_has_xsaves(vmcs12))
                vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP);
@@ -10785,13 +10841,26 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
                 */
 
                kvm_set_msi_irq(e, &irq);
-               if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu))
+               if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+                       /*
+                        * Make sure the IRTE is in remapped mode if
+                        * we don't handle it in posted mode.
+                        */
+                       ret = irq_set_vcpu_affinity(host_irq, NULL);
+                       if (ret < 0) {
+                               printk(KERN_INFO
+                          "failed to fall back to remapped mode, irq: %u\n",
+                                  host_irq);
+                               goto out;
+                       }
+
                        continue;
+               }
 
                vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
                vcpu_info.vector = irq.vector;
 
-               trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi,
+               trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi,
                                vcpu_info.vector, vcpu_info.pi_desc_addr, set);
 
                if (set)
@@ -10861,6 +10930,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .cache_reg = vmx_cache_reg,
        .get_rflags = vmx_get_rflags,
        .set_rflags = vmx_set_rflags,
+
+       .get_pkru = vmx_get_pkru,
+
        .fpu_activate = vmx_fpu_activate,
        .fpu_deactivate = vmx_fpu_deactivate,