KVM: MMU: drop superfluous is_present_gpte() check.
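A note on the first hunk below, the spte_is_locklessly_modifiable() rewrite: the old
and new tests are logically equivalent; the new form simply spells out the "both
software-writable bits are set" condition directly. A minimal standalone check of that
equivalence (not kernel code; the bit positions here are placeholders, not the real
SPTE_HOST_WRITEABLE/SPTE_MMU_WRITEABLE definitions):

    #include <assert.h>
    #include <stdint.h>

    /* Placeholder bit positions, chosen only for this sketch. */
    #define SPTE_HOST_WRITEABLE (1ULL << 57)
    #define SPTE_MMU_WRITEABLE  (1ULL << 58)
    #define WRITEABLE_MASK      (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)

    int main(void)
    {
        uint64_t bits;

        /* Walk all four combinations of the two writable bits. */
        for (bits = 0; bits < 4; bits++) {
            uint64_t spte = bits << 57;
            int old_form = !(~spte & WRITEABLE_MASK);
            int new_form = (spte & WRITEABLE_MASK) == WRITEABLE_MASK;

            assert(old_form == new_form);
        }
        return 0;
    }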
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6f85fe0bf958987f0275f1fa25e2b9e5240aad02..42ba85c62fcb74884dec4866b5ce24830e1321ae 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -448,7 +448,8 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
 
 static bool spte_is_locklessly_modifiable(u64 spte)
 {
-       return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
+       return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
+               (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
 }
 
 static bool spte_has_volatile_bits(u64 spte)
@@ -1142,7 +1143,7 @@ spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
 }
 
 static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
-                                int level, bool pt_protect)
+                                bool pt_protect)
 {
        u64 *sptep;
        struct rmap_iterator iter;
@@ -1180,7 +1181,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
        while (mask) {
                rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
                                      PT_PAGE_TABLE_LEVEL, slot);
-               __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);
+               __rmap_write_protect(kvm, rmapp, false);
 
                /* clear the first set bit */
                mask &= mask - 1;
@@ -1199,7 +1200,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
        for (i = PT_PAGE_TABLE_LEVEL;
             i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
                rmapp = __gfn_to_rmap(gfn, i, slot);
-               write_protected |= __rmap_write_protect(kvm, rmapp, i, true);
+               write_protected |= __rmap_write_protect(kvm, rmapp, true);
        }
 
        return write_protected;
@@ -1522,7 +1523,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
                sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
        set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
        list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
-       bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM);
        sp->parent_ptes = 0;
        mmu_page_add_parent_pte(vcpu, sp, parent_pte);
        kvm_mod_used_mmu_pages(vcpu->kvm, +1);
@@ -2144,6 +2144,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
         * change the value
         */
 
+       spin_lock(&kvm->mmu_lock);
+
        if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
                while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
                        !list_empty(&kvm->arch.active_mmu_pages)) {
@@ -2158,6 +2160,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
        }
 
        kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
+
+       spin_unlock(&kvm->mmu_lock);
 }
 
 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
@@ -2183,14 +2187,6 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
 
-static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
-{
-       int slot = memslot_id(kvm, gfn);
-       struct kvm_mmu_page *sp = page_header(__pa(pte));
-
-       __set_bit(slot, sp->slot_bitmap);
-}
-
 /*
  * The function is based on mtrr_type_lookup() in
  * arch/x86/kernel/cpu/mtrr/generic.c
@@ -2332,9 +2328,8 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
                if (s->role.level != PT_PAGE_TABLE_LEVEL)
                        return 1;
 
-               if (!need_unsync && !s->unsync) {
+               if (!s->unsync)
                        need_unsync = true;
-               }
        }
        if (need_unsync)
                kvm_unsync_pages(vcpu, gfn);
@@ -2342,8 +2337,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 }
 
 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
-                   unsigned pte_access, int user_fault,
-                   int write_fault, int level,
+                   unsigned pte_access, int level,
                    gfn_t gfn, pfn_t pfn, bool speculative,
                    bool can_unsync, bool host_writable)
 {
@@ -2378,32 +2372,20 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
        spte |= (u64)pfn << PAGE_SHIFT;
 
-       if ((pte_access & ACC_WRITE_MASK)
-           || (!vcpu->arch.mmu.direct_map && write_fault
-               && !is_write_protection(vcpu) && !user_fault)) {
+       if (pte_access & ACC_WRITE_MASK) {
 
+               /*
+                * Other vcpu creates new sp in the window between
+                * mapping_level() and acquiring mmu-lock. We can
+                * allow guest to retry the access, the mapping can
+                * be fixed if guest refault.
+                */
                if (level > PT_PAGE_TABLE_LEVEL &&
-                   has_wrprotected_page(vcpu->kvm, gfn, level)) {
-                       ret = 1;
-                       drop_spte(vcpu->kvm, sptep);
+                   has_wrprotected_page(vcpu->kvm, gfn, level))
                        goto done;
-               }
 
                spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
 
-               if (!vcpu->arch.mmu.direct_map
-                   && !(pte_access & ACC_WRITE_MASK)) {
-                       spte &= ~PT_USER_MASK;
-                       /*
-                        * If we converted a user page to a kernel page,
-                        * so that the kernel can write to it when cr0.wp=0,
-                        * then we should prevent the kernel from executing it
-                        * if SMEP is enabled.
-                        */
-                       if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
-                               spte |= PT64_NX_MASK;
-               }
-
                /*
                 * Optimization: for pte sync, if spte was writable the hash
                 * lookup is unnecessary (and expensive). Write protection
@@ -2434,18 +2416,15 @@ done:
 
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                         unsigned pt_access, unsigned pte_access,
-                        int user_fault, int write_fault,
-                        int *emulate, int level, gfn_t gfn,
-                        pfn_t pfn, bool speculative,
-                        bool host_writable)
+                        int write_fault, int *emulate, int level, gfn_t gfn,
+                        pfn_t pfn, bool speculative, bool host_writable)
 {
        int was_rmapped = 0;
        int rmap_count;
 
-       pgprintk("%s: spte %llx access %x write_fault %d"
-                " user_fault %d gfn %llx\n",
+       pgprintk("%s: spte %llx access %x write_fault %d gfn %llx\n",
                 __func__, *sptep, pt_access,
-                write_fault, user_fault, gfn);
+                write_fault, gfn);
 
        if (is_rmap_spte(*sptep)) {
                /*
@@ -2469,9 +2448,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                        was_rmapped = 1;
        }
 
-       if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
-                     level, gfn, pfn, speculative, true,
-                     host_writable)) {
+       if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
+             true, host_writable)) {
                if (write_fault)
                        *emulate = 1;
                kvm_mmu_flush_tlb(vcpu);
@@ -2489,7 +2467,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                ++vcpu->kvm->stat.lpages;
 
        if (is_shadow_present_pte(*sptep)) {
-               page_header_update_slot(vcpu->kvm, sptep, gfn);
                if (!was_rmapped) {
                        rmap_count = rmap_add(vcpu, sptep, gfn);
                        if (rmap_count > RMAP_RECYCLE_THRESHOLD)
@@ -2505,6 +2482,14 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
        mmu_free_roots(vcpu);
 }
 
+static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
+{
+       int bit7;
+
+       bit7 = (gpte >> 7) & 1;
+       return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
+}
+
 static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
                                     bool no_dirty_log)
 {
@@ -2517,6 +2502,26 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
        return gfn_to_pfn_memslot_atomic(slot, gfn);
 }
 
+static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu,
+                                 struct kvm_mmu_page *sp, u64 *spte,
+                                 u64 gpte)
+{
+       if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+               goto no_present;
+
+       if (!is_present_gpte(gpte))
+               goto no_present;
+
+       if (!(gpte & PT_ACCESSED_MASK))
+               goto no_present;
+
+       return false;
+
+no_present:
+       drop_spte(vcpu->kvm, spte);
+       return true;
+}
+
 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
                                    struct kvm_mmu_page *sp,
                                    u64 *start, u64 *end)
@@ -2535,10 +2540,9 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
                return -1;
 
        for (i = 0; i < ret; i++, gfn++, start++)
-               mmu_set_spte(vcpu, start, ACC_ALL,
-                            access, 0, 0, NULL,
-                            sp->role.level, gfn,
-                            page_to_pfn(pages[i]), true, true);
+               mmu_set_spte(vcpu, start, ACC_ALL, access, 0, NULL,
+                            sp->role.level, gfn, page_to_pfn(pages[i]),
+                            true, true);
 
        return 0;
 }
@@ -2600,8 +2604,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
                        unsigned pte_access = ACC_ALL;
 
                        mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
-                                    0, write, &emulate,
-                                    level, gfn, pfn, prefault, map_writable);
+                                    write, &emulate, level, gfn, pfn,
+                                    prefault, map_writable);
                        direct_pte_prefetch(vcpu, iterator.sptep);
                        ++vcpu->stat.pf_fixed;
                        break;
@@ -2671,7 +2675,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
         * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
         * here.
         */
-       if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
+       if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
            level == PT_PAGE_TABLE_LEVEL &&
            PageTransCompound(pfn_to_page(pfn)) &&
            !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
@@ -2699,18 +2703,13 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
        }
 }
 
-static bool mmu_invalid_pfn(pfn_t pfn)
-{
-       return unlikely(is_invalid_pfn(pfn));
-}
-
 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
                                pfn_t pfn, unsigned access, int *ret_val)
 {
        bool ret = true;
 
        /* The pfn is invalid, report the error! */
-       if (unlikely(is_invalid_pfn(pfn))) {
+       if (unlikely(is_error_pfn(pfn))) {
                *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
                goto exit;
        }
@@ -2862,7 +2861,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
                return r;
 
        spin_lock(&vcpu->kvm->mmu_lock);
-       if (mmu_notifier_retry(vcpu, mmu_seq))
+       if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
                goto out_unlock;
        kvm_mmu_free_some_pages(vcpu);
        if (likely(!force_pt_level))
@@ -3331,7 +3330,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
                return r;
 
        spin_lock(&vcpu->kvm->mmu_lock);
-       if (mmu_notifier_retry(vcpu, mmu_seq))
+       if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
                goto out_unlock;
        kvm_mmu_free_some_pages(vcpu);
        if (likely(!force_pt_level))
@@ -3399,14 +3398,6 @@ static void paging_free(struct kvm_vcpu *vcpu)
        nonpaging_free(vcpu);
 }
 
-static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
-{
-       int bit7;
-
-       bit7 = (gpte >> 7) & 1;
-       return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
-}
-
 static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
 {
        unsigned mask;
@@ -3696,6 +3687,7 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
        else
                r = paging32_init_context(vcpu, context);
 
+       vcpu->arch.mmu.base_role.nxe = is_nx(vcpu);
        vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
        vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
        vcpu->arch.mmu.base_role.smep_andnot_wp
@@ -3862,7 +3854,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
                /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
                *gpa &= ~(gpa_t)7;
                *bytes = 8;
-               r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8));
+               r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, 8);
                if (r)
                        gentry = 0;
                new = (const u8 *)&gentry;
@@ -4016,7 +4008,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                              !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
                              & mask.word) && rmap_can_add(vcpu))
                                mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
-                       if (!remote_flush && need_remote_flush(entry, *spte))
+                       if (need_remote_flush(entry, *spte))
                                remote_flush = true;
                        ++spte;
                }
@@ -4175,26 +4167,36 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
 
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 {
-       struct kvm_mmu_page *sp;
-       bool flush = false;
+       struct kvm_memory_slot *memslot;
+       gfn_t last_gfn;
+       int i;
 
-       list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
-               int i;
-               u64 *pt;
+       memslot = id_to_memslot(kvm->memslots, slot);
+       last_gfn = memslot->base_gfn + memslot->npages - 1;
 
-               if (!test_bit(slot, sp->slot_bitmap))
-                       continue;
+       spin_lock(&kvm->mmu_lock);
 
-               pt = sp->spt;
-               for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-                       if (!is_shadow_present_pte(pt[i]) ||
-                             !is_last_spte(pt[i], sp->role.level))
-                               continue;
+       for (i = PT_PAGE_TABLE_LEVEL;
+            i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+               unsigned long *rmapp;
+               unsigned long last_index, index;
+
+               rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
+               last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
+
+               for (index = 0; index <= last_index; ++index, ++rmapp) {
+                       if (*rmapp)
+                               __rmap_write_protect(kvm, rmapp, false);
 
-                       spte_write_protect(kvm, &pt[i], &flush, false);
+                       if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+                               kvm_flush_remote_tlbs(kvm);
+                               cond_resched_lock(&kvm->mmu_lock);
+                       }
                }
        }
+
        kvm_flush_remote_tlbs(kvm);
+       spin_unlock(&kvm->mmu_lock);
 }
 
 void kvm_mmu_zap_all(struct kvm *kvm)
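
A note on the kvm_mmu_slot_remove_write_access() rewrite above: the per-level loop
bound comes from gfn_to_index(), which on x86 drops 9 gfn bits per page-table level
(4K -> 2M -> 1G), so each level's rmap array is walked from index 0 through
gfn_to_index(last_gfn, memslot->base_gfn, level). A standalone sketch of that
arithmetic, using a hypothetical slot layout (the base_gfn/npages values are
illustrative only):

    #include <assert.h>
    #include <stdint.h>

    /* Mirrors KVM_HPAGE_GFN_SHIFT()/gfn_to_index() for the sketch. */
    static uint64_t hpage_gfn_shift(int level)
    {
        return (uint64_t)(level - 1) * 9;
    }

    static uint64_t gfn_to_index(uint64_t gfn, uint64_t base_gfn, int level)
    {
        return (gfn >> hpage_gfn_shift(level)) -
               (base_gfn >> hpage_gfn_shift(level));
    }

    int main(void)
    {
        uint64_t base_gfn = 0x1000, npages = 0x800;   /* hypothetical slot */
        uint64_t last_gfn = base_gfn + npages - 1;

        /* One rmap entry per 4K page at level 1 ... */
        assert(gfn_to_index(last_gfn, base_gfn, 1) + 1 == npages);
        /* ... and one per 2M-aligned region at level 2. */
        assert(gfn_to_index(last_gfn, base_gfn, 2) + 1 == npages / 512);
        return 0;
    }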