KVM: x86: Use gpa_t for cr2/gpa to fix TDP support on 32-bit KVM

[mirror_ubuntu-bionic-kernel.git] / arch / x86 / kvm / paging_tmpl.h
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h

index 5abae72266b77a7295853b6a7ce0cf9109d5c1f6..152c2978a2563be65395e16015b0c0edfd970e26 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -181,7 +181,7 @@ no_present:
   * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK
   * to signify readability since it isn't used in the EPT case
   */
-static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
+static inline unsigned FNAME(gpte_access)(u64 gpte)
  {
         unsigned access;
  #if PTTYPE == PTTYPE_EPT
@@ -273,11 +273,11 @@ static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
  }
  
  /*
- * Fetch a guest pte for a guest virtual address
+ * Fetch a guest pte for a guest virtual address, or for an L2's GPA.
   */
  static int FNAME(walk_addr_generic)(struct guest_walker *walker,
                                     struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
-                                   gva_t addr, u32 access)
+                                   gpa_t addr, u32 access)
  {
         int ret;
         pt_element_t pte;
@@ -394,8 +394,8 @@ retry_walk:
         accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;
  
         /* Convert to ACC_*_MASK flags for struct guest_walker.  */
-       walker->pt_access = FNAME(gpte_access)(vcpu, pt_access ^ walk_nx_mask);
-       walker->pte_access = FNAME(gpte_access)(vcpu, pte_access ^ walk_nx_mask);
+       walker->pt_access = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
+       walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask);
         errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
         if (unlikely(errcode))
                 goto error;
@@ -452,14 +452,21 @@ error:
          * done by is_rsvd_bits_set() above.
          *
          * We set up the value of exit_qualification to inject:
-        * [2:0] - Derive from [2:0] of real exit_qualification at EPT violation
+        * [2:0] - Derive from the access bits. The exit_qualification might be
+        *         out of date if it is serving an EPT misconfiguration.
          * [5:3] - Calculated by the page walk of the guest EPT page tables
          * [7:8] - Derived from [7:8] of real exit_qualification
          *
          * The other bits are set to 0.
          */
         if (!(errcode & PFERR_RSVD_MASK)) {
-               vcpu->arch.exit_qualification &= 0x187;
+               vcpu->arch.exit_qualification &= 0x180;
+               if (write_fault)
+                       vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
+               if (user_fault)
+                       vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ;
+               if (fetch_fault)
+                       vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR;
                 vcpu->arch.exit_qualification |= (pte_access & 0x7) << 3;
         }
  #endif
@@ -471,7 +478,7 @@ error:
  }
  
  static int FNAME(walk_addr)(struct guest_walker *walker,
-                           struct kvm_vcpu *vcpu, gva_t addr, u32 access)
+                           struct kvm_vcpu *vcpu, gpa_t addr, u32 access)
  {
         return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr,
                                         access);
@@ -501,7 +508,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
         pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
  
         gfn = gpte_to_gfn(gpte);
-       pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+       pte_access = sp->role.access & FNAME(gpte_access)(gpte);
         FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte);
         pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
                         no_dirty_log && (pte_access & ACC_WRITE_MASK));
@@ -515,6 +522,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
         mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn,
                      true, true);
  
+       kvm_release_pfn_clean(pfn);
         return true;
  }
  
@@ -585,15 +593,17 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
   * If the guest tries to write a write-protected page, we need to
   * emulate this operation, return 1 to indicate this case.
   */
-static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
+static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
                          struct guest_walker *gw,
                          int write_fault, int hlevel,
-                        kvm_pfn_t pfn, bool map_writable, bool prefault)
+                        kvm_pfn_t pfn, bool map_writable, bool prefault,
+                        bool lpage_disallowed)
  {
         struct kvm_mmu_page *sp = NULL;
         struct kvm_shadow_walk_iterator it;
         unsigned direct_access, access = gw->pt_access;
         int top_level, ret;
+       gfn_t gfn, base_gfn;
  
         direct_access = gw->pte_access;
  
@@ -638,35 +648,48 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
                         link_shadow_page(vcpu, it.sptep, sp);
         }
  
-       for (;
-            shadow_walk_okay(&it) && it.level > hlevel;
-            shadow_walk_next(&it)) {
-               gfn_t direct_gfn;
+       /*
+        * FNAME(page_fault) might have clobbered the bottom bits of
+        * gw->gfn, restore them from the virtual address.
+        */
+       gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT);
+       base_gfn = gfn;
  
+       trace_kvm_mmu_spte_requested(addr, gw->level, pfn);
+
+       for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
                 clear_sp_write_flooding_count(it.sptep);
-               validate_direct_spte(vcpu, it.sptep, direct_access);
  
-               drop_large_spte(vcpu, it.sptep);
+               /*
+                * We cannot overwrite existing page tables with an NX
+                * large page, as the leaf could be executable.
+                */
+               disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel);
  
-               if (is_shadow_present_pte(*it.sptep))
-                       continue;
+               base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+               if (it.level == hlevel)
+                       break;
  
-               direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+               validate_direct_spte(vcpu, it.sptep, direct_access);
  
-               sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
-                                     true, direct_access);
-               link_shadow_page(vcpu, it.sptep, sp);
+               drop_large_spte(vcpu, it.sptep);
+
+               if (!is_shadow_present_pte(*it.sptep)) {
+                       sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
+                                             it.level - 1, true, direct_access);
+                       link_shadow_page(vcpu, it.sptep, sp);
+                       if (lpage_disallowed)
+                               account_huge_nx_page(vcpu->kvm, sp);
+               }
         }
  
-       clear_sp_write_flooding_count(it.sptep);
         ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
-                          it.level, gw->gfn, pfn, prefault, map_writable);
+                          it.level, base_gfn, pfn, prefault, map_writable);
         FNAME(pte_prefetch)(vcpu, gw, it.sptep);
-
+       ++vcpu->stat.pf_fixed;
         return ret;
  
  out_gpte_changed:
-       kvm_release_pfn_clean(pfn);
         return RET_PF_RETRY;
  }
  
@@ -724,7 +747,7 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
   *  Returns: 1 if we need to emulate the instruction, 0 otherwise, or
   *           a negative value on error.
   */
-static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
+static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
                              bool prefault)
  {
         int write_fault = error_code & PFERR_WRITE_MASK;
@@ -733,9 +756,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
         int r;
         kvm_pfn_t pfn;
         int level = PT_PAGE_TABLE_LEVEL;
-       bool force_pt_level = false;
         unsigned long mmu_seq;
         bool map_writable, is_self_change_mapping;
+       bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+                               is_nx_huge_page_enabled();
+       bool force_pt_level = lpage_disallowed;
  
         pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
  
@@ -814,6 +839,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
                         walker.pte_access &= ~ACC_EXEC_MASK;
         }
  
+       r = RET_PF_RETRY;
         spin_lock(&vcpu->kvm->mmu_lock);
         if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
                 goto out_unlock;
@@ -822,19 +848,15 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
         if (make_mmu_pages_available(vcpu) < 0)
                 goto out_unlock;
         if (!force_pt_level)
-               transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
+               transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level);
         r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
-                        level, pfn, map_writable, prefault);
-       ++vcpu->stat.pf_fixed;
+                        level, pfn, map_writable, prefault, lpage_disallowed);
         kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
-       spin_unlock(&vcpu->kvm->mmu_lock);
-
-       return r;
  
  out_unlock:
         spin_unlock(&vcpu->kvm->mmu_lock);
         kvm_release_pfn_clean(pfn);
-       return RET_PF_RETRY;
+       return r;
  }
  
  static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
@@ -904,18 +926,19 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
         spin_unlock(&vcpu->kvm->mmu_lock);
  }
  
-static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
+/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
+static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t addr, u32 access,
                                struct x86_exception *exception)
  {
         struct guest_walker walker;
         gpa_t gpa = UNMAPPED_GVA;
         int r;
  
-       r = FNAME(walk_addr)(&walker, vcpu, vaddr, access);
+       r = FNAME(walk_addr)(&walker, vcpu, addr, access);
  
         if (r) {
                 gpa = gfn_to_gpa(walker.gfn);
-               gpa |= vaddr & ~PAGE_MASK;
+               gpa |= addr & ~PAGE_MASK;
         } else if (exception)
                 *exception = walker.fault;
  
@@ -923,7 +946,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
  }
  
  #if PTTYPE != PTTYPE_EPT
-static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
+/* Note, gva_to_gpa_nested() is only used to translate L2 GVAs. */
+static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr,
                                       u32 access,
                                       struct x86_exception *exception)
  {
@@ -931,6 +955,11 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
         gpa_t gpa = UNMAPPED_GVA;
         int r;
  
+#ifndef CONFIG_X86_64
+       /* A 64-bit GVA should be impossible on 32-bit KVM. */
+       WARN_ON_ONCE(vaddr >> 32);
+#endif
+
         r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);
  
         if (r) {
@@ -995,7 +1024,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
  
                 gfn = gpte_to_gfn(gpte);
                 pte_access = sp->role.access;
-               pte_access &= FNAME(gpte_access)(vcpu, gpte);
+               pte_access &= FNAME(gpte_access)(gpte);
                 FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte);
  
                 if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access,