KVM: x86/mmu: Batch zap MMU pages when shrinking the slab
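
The diff below replaces the one-page-at-a-time prepare_zap_oldest_mmu_page()
with kvm_mmu_zap_oldest_mmu_pages(): the new helper walks active_mmu_pages,
skips in-use roots, collects everything it zaps on a local invalid_list, and
commits the whole batch with a single kvm_mmu_commit_zap_page() call once
nr_to_zap pages have been reached.  make_mmu_pages_available(),
kvm_mmu_change_mmu_pages() and the mmu_shrink_scan() shrinker callback are
all converted to the batched helper.  Being a blobdiff between two tree
states, the range also carries neighbouring changes such as the removal of
kvm_arch_write_log_dirty(), the for_each_valid_sp() rework and the async
page fault flag handling.

For orientation only, here is a minimal, self-contained userspace sketch of
the batching pattern: hypothetical names and a toy singly linked list, not
the kernel implementation.

/*
 * Minimal userspace sketch of the batched-zap pattern (hypothetical
 * names, toy singly linked list; not the kernel code).
 */
#include <stdio.h>
#include <stdlib.h>

struct page {
        int id;
        int root_count;         /* in-use roots are skipped, as in the diff */
        struct page *next;
};

/* Detach up to nr_to_zap unpinned pages from *active onto *invalid. */
static unsigned long zap_oldest_pages(struct page **active,
                                      struct page **invalid,
                                      unsigned long nr_to_zap)
{
        struct page **pp = active;
        unsigned long total_zapped = 0;

        while (*pp && total_zapped < nr_to_zap) {
                struct page *sp = *pp;

                if (sp->root_count) {           /* don't zap in-use roots */
                        pp = &sp->next;
                        continue;
                }
                *pp = sp->next;                 /* unlink from the active list */
                sp->next = *invalid;            /* collect it on the batch */
                *invalid = sp;
                total_zapped++;
        }
        return total_zapped;
}

/* "Commit" the whole batch once, instead of once per zapped page. */
static void commit_zap(struct page **invalid)
{
        while (*invalid) {
                struct page *sp = *invalid;

                *invalid = sp->next;
                free(sp);
        }
}

int main(void)
{
        struct page *active = NULL, *invalid = NULL;
        unsigned long zapped;
        int i;

        for (i = 0; i < 8; i++) {
                struct page *sp = calloc(1, sizeof(*sp));

                sp->id = i;
                sp->root_count = (i % 3 == 0);  /* pretend some pages are roots */
                sp->next = active;
                active = sp;
        }

        zapped = zap_oldest_pages(&active, &invalid, 4);
        commit_zap(&invalid);
        printf("zapped %lu pages in one batch\n", zapped);
        return 0;
}
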
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index fdd05c233308a7236e7f1b19b1116ef6095fef84..8083ec32a0dd5fbaaa0026dee1e2c875cfc9be3e 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1738,21 +1738,6 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
                kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
 }
 
-/**
- * kvm_arch_write_log_dirty - emulate dirty page logging
- * @vcpu: Guest mode vcpu
- *
- * Emulate arch specific page modification logging for the
- * nested hypervisor
- */
-int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu)
-{
-       if (kvm_x86_ops.write_log_dirty)
-               return kvm_x86_ops.write_log_dirty(vcpu);
-
-       return 0;
-}
-
 bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
                                    struct kvm_memory_slot *slot, u64 gfn)
 {
@@ -2258,15 +2243,14 @@ static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
                                    struct list_head *invalid_list);
 
-
-#define for_each_valid_sp(_kvm, _sp, _gfn)                             \
-       hlist_for_each_entry(_sp,                                       \
-         &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
+#define for_each_valid_sp(_kvm, _sp, _list)                            \
+       hlist_for_each_entry(_sp, _list, hash_link)                     \
                if (is_obsolete_sp((_kvm), (_sp))) {                    \
                } else
 
 #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)                        \
-       for_each_valid_sp(_kvm, _sp, _gfn)                              \
+       for_each_valid_sp(_kvm, _sp,                                    \
+         &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)])     \
                if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
 
 static inline bool is_ept_sp(struct kvm_mmu_page *sp)
@@ -2476,7 +2460,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                                             int direct,
                                             unsigned int access)
 {
+       bool direct_mmu = vcpu->arch.mmu->direct_map;
        union kvm_mmu_page_role role;
+       struct hlist_head *sp_list;
        unsigned quadrant;
        struct kvm_mmu_page *sp;
        bool need_sync = false;
@@ -2490,13 +2476,14 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
        if (role.direct)
                role.gpte_is_8_bytes = true;
        role.access = access;
-       if (!vcpu->arch.mmu->direct_map
-           && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
+       if (!direct_mmu && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
                quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
                quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
                role.quadrant = quadrant;
        }
-       for_each_valid_sp(vcpu->kvm, sp, gfn) {
+
+       sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
+       for_each_valid_sp(vcpu->kvm, sp, sp_list) {
                if (sp->gfn != gfn) {
                        collisions++;
                        continue;
@@ -2508,6 +2495,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                if (sp->role.word != role.word)
                        continue;
 
+               if (direct_mmu)
+                       goto trace_get_page;
+
                if (sp->unsync) {
                        /* The page is good, but __kvm_sync_page might still end
                         * up zapping it.  If so, break in order to rebuild it.
@@ -2523,6 +2513,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                        kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 
                __clear_sp_write_flooding_count(sp);
+
+trace_get_page:
                trace_kvm_mmu_get_page(sp, false);
                goto out;
        }
@@ -2533,8 +2525,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 
        sp->gfn = gfn;
        sp->role = role;
-       hlist_add_head(&sp->hash_link,
-               &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
+       hlist_add_head(&sp->hash_link, sp_list);
        if (!direct) {
                /*
                 * we should do write protection before syncing pages
@@ -2757,10 +2748,23 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
        if (!sp->root_count) {
                /* Count self */
                (*nr_zapped)++;
-               list_move(&sp->link, invalid_list);
+
+               /*
+                * Already invalid pages (previously active roots) are not on
+                * the active page list.  See list_del() in the "else" case of
+                * !sp->root_count.
+                */
+               if (sp->role.invalid)
+                       list_add(&sp->link, invalid_list);
+               else
+                       list_move(&sp->link, invalid_list);
                kvm_mod_used_mmu_pages(kvm, -1);
        } else {
-               list_move(&sp->link, &kvm->arch.active_mmu_pages);
+               /*
+                * Remove the active root from the active page list, the root
+                * will be explicitly freed when the root_count hits zero.
+                */
+               list_del(&sp->link);
 
                /*
                 * Obsolete pages cannot be used on any vCPUs, see the comment
@@ -2812,33 +2816,51 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
        }
 }
 
-static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
-                                       struct list_head *invalid_list)
+static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm,
+                                                 unsigned long nr_to_zap)
 {
-       struct kvm_mmu_page *sp;
+       unsigned long total_zapped = 0;
+       struct kvm_mmu_page *sp, *tmp;
+       LIST_HEAD(invalid_list);
+       bool unstable;
+       int nr_zapped;
 
        if (list_empty(&kvm->arch.active_mmu_pages))
-               return false;
+               return 0;
 
-       sp = list_last_entry(&kvm->arch.active_mmu_pages,
-                            struct kvm_mmu_page, link);
-       return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+restart:
+       list_for_each_entry_safe(sp, tmp, &kvm->arch.active_mmu_pages, link) {
+               /*
+                * Don't zap active root pages, the page itself can't be freed
+                * and zapping it will just force vCPUs to realloc and reload.
+                */
+               if (sp->root_count)
+                       continue;
+
+               unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list,
+                                                     &nr_zapped);
+               total_zapped += nr_zapped;
+               if (total_zapped >= nr_to_zap)
+                       break;
+
+               if (unstable)
+                       goto restart;
+       }
+
+       kvm_mmu_commit_zap_page(kvm, &invalid_list);
+
+       kvm->stat.mmu_recycled += total_zapped;
+       return total_zapped;
 }
 
 static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
 {
-       LIST_HEAD(invalid_list);
+       unsigned long avail = kvm_mmu_available_pages(vcpu->kvm);
 
-       if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
+       if (likely(avail >= KVM_MIN_FREE_MMU_PAGES))
                return 0;
 
-       while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
-               if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
-                       break;
-
-               ++vcpu->kvm->stat.mmu_recycled;
-       }
-       kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+       kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
 
        if (!kvm_mmu_available_pages(vcpu->kvm))
                return -ENOSPC;
@@ -2851,17 +2873,12 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
  */
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
 {
-       LIST_HEAD(invalid_list);
-
        spin_lock(&kvm->mmu_lock);
 
        if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
-               /* Need to free some mmu pages to achieve the goal. */
-               while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages)
-                       if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list))
-                               break;
+               kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
+                                                 goal_nr_mmu_pages);
 
-               kvm_mmu_commit_zap_page(kvm, &invalid_list);
                goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
        }
 
@@ -4045,8 +4062,8 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
        walk_shadow_page_lockless_end(vcpu);
 }
 
-static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
-                                  gfn_t gfn)
+static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+                                   gfn_t gfn)
 {
        struct kvm_arch_async_pf arch;
 
@@ -4156,6 +4173,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
                                u64 fault_address, char *insn, int insn_len)
 {
        int r = 1;
+       u32 flags = vcpu->arch.apf.host_apf_flags;
 
 #ifndef CONFIG_X86_64
        /* A 64-bit CR2 should be impossible on 32-bit KVM. */
@@ -4164,28 +4182,22 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
 #endif
 
        vcpu->arch.l1tf_flush_l1d = true;
-       switch (vcpu->arch.apf.host_apf_flags) {
-       default:
+       if (!flags) {
                trace_kvm_page_fault(fault_address, error_code);
 
                if (kvm_event_needs_reinjection(vcpu))
                        kvm_mmu_unprotect_page_virt(vcpu, fault_address);
                r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
                                insn_len);
-               break;
-       case KVM_PV_REASON_PAGE_NOT_PRESENT:
+       } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
                vcpu->arch.apf.host_apf_flags = 0;
                local_irq_disable();
                kvm_async_pf_task_wait_schedule(fault_address);
                local_irq_enable();
-               break;
-       case KVM_PV_REASON_PAGE_READY:
-               vcpu->arch.apf.host_apf_flags = 0;
-               local_irq_disable();
-               kvm_async_pf_task_wake(fault_address);
-               local_irq_enable();
-               break;
+       } else {
+               WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags);
        }
+
        return r;
 }
 EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
@@ -4449,7 +4461,7 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
                        nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
                        rsvd_bits(maxphyaddr, 51);
                rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
-                       nonleaf_bit8_rsvd | gbpages_bit_rsvd |
+                       gbpages_bit_rsvd |
                        rsvd_bits(maxphyaddr, 51);
                rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
                        rsvd_bits(maxphyaddr, 51);
@@ -5732,12 +5744,11 @@ restart:
                        break;
 
                /*
-                * Skip invalid pages with a non-zero root count, zapping pages
-                * with a non-zero root count will never succeed, i.e. the page
-                * will get thrown back on active_mmu_pages and we'll get stuck
-                * in an infinite loop.
+                * Invalid pages should never land back on the list of active
+                * pages.  Skip the bogus page, otherwise we'll get stuck in an
+                * infinite loop if the page gets put back on the list (again).
                 */
-               if (sp->role.invalid && sp->root_count)
+               if (WARN_ON(sp->role.invalid))
                        continue;
 
                /*
@@ -6015,7 +6026,7 @@ void kvm_mmu_zap_all(struct kvm *kvm)
        spin_lock(&kvm->mmu_lock);
 restart:
        list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
-               if (sp->role.invalid && sp->root_count)
+               if (WARN_ON(sp->role.invalid))
                        continue;
                if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
                        goto restart;
@@ -6092,9 +6103,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
                        goto unlock;
                }
 
-               if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
-                       freed++;
-               kvm_mmu_commit_zap_page(kvm, &invalid_list);
+               freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
 
 unlock:
                spin_unlock(&kvm->mmu_lock);
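
A note on the for_each_valid_sp()/kvm_mmu_get_page() hunks above: the macro
now takes the hash bucket itself, so the caller runs
kvm_page_table_hashfn(gfn) once and reuses the same bucket pointer for both
the lookup and the hlist_add_head() on a miss.  Below is a minimal userspace
sketch of that bucket-reuse idea, with a toy hash table and hypothetical
names, not the kernel code.

#include <stdio.h>
#include <stdlib.h>

#define NBUCKETS 16

struct node {
        unsigned long gfn;
        struct node *next;
};

static struct node *table[NBUCKETS];

/* Stand-in for kvm_page_table_hashfn(): pick the bucket for a gfn. */
static struct node **bucket_for(unsigned long gfn)
{
        return &table[gfn % NBUCKETS];
}

static struct node *get_node(unsigned long gfn)
{
        struct node **bucket = bucket_for(gfn); /* computed exactly once */
        struct node *n;

        for (n = *bucket; n; n = n->next)       /* search the bucket */
                if (n->gfn == gfn)
                        return n;

        n = calloc(1, sizeof(*n));              /* miss: allocate a new node */
        n->gfn = gfn;
        /* Insert into the same bucket; the hash is not recomputed. */
        n->next = *bucket;
        *bucket = n;
        return n;
}

int main(void)
{
        struct node *a = get_node(42);
        struct node *b = get_node(42);

        printf("same node on second lookup: %s\n", a == b ? "yes" : "no");
        return 0;
}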