KVM: x86: Use gpa_t for cr2/gpa to fix TDP support on 32-bit KVM
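
With TDP enabled, the address that reaches the MMU on a fault is a guest-physical address rather than a guest-virtual one, so routing it through gva_t (an unsigned long) truncates GPAs above 4GiB on 32-bit builds; moving the cr2/gpa plumbing to gpa_t (u64) avoids that. A minimal userspace sketch of the truncation, illustrative only and assuming the stock KVM typedefs (gva_t as unsigned long, gpa_t as u64):

	/* Illustrative only, not part of the patch. */
	#include <stdio.h>
	#include <stdint.h>

	typedef unsigned long gva_t;	/* 32 bits on a 32-bit kernel build */
	typedef uint64_t      gpa_t;	/* always 64 bits */

	int main(void)
	{
		gpa_t gpa = 0x100001000ULL;	/* guest-physical address above 4GiB */
		gva_t as_gva = gpa;		/* old prototypes: upper bits lost on 32-bit */

		printf("gpa=%#llx, passed as gva_t -> %#lx\n",
		       (unsigned long long)gpa, as_gva);
		return 0;
	}
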
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 295c678b37f559be6ea03ef814d004c8585cfc14..e0aebdf3901e40d7f09ff64daf157b328a3f1f82 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -40,6 +40,7 @@
 #include <linux/uaccess.h>
 #include <linux/hash.h>
 #include <linux/kern_levels.h>
+#include <linux/kthread.h>
 
 #include <asm/page.h>
 #include <asm/pat.h>
 #include <asm/kvm_page_track.h>
 #include "trace.h"
 
+extern bool itlb_multihit_kvm_mitigation;
+
+static int __read_mostly nx_huge_pages = -1;
+static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
+static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);
+
+static struct kernel_param_ops nx_huge_pages_ops = {
+       .set = set_nx_huge_pages,
+       .get = param_get_bool,
+};
+
+static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
+       .set = set_nx_huge_pages_recovery_ratio,
+       .get = param_get_uint,
+};
+
+module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages, "bool");
+module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
+               &nx_huge_pages_recovery_ratio, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
+
 /*
  * When setting this variable to true it enables Two-Dimensional-Paging
  * where the hardware walks 2 page tables:
@@ -140,9 +165,6 @@ module_param(dbg, bool, 0644);
 
 #include <trace/events/kvm.h>
 
-#define CREATE_TRACE_POINTS
-#include "mmutrace.h"
-
 #define SPTE_HOST_WRITEABLE    (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
 #define SPTE_MMU_WRITEABLE     (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
 
@@ -245,6 +267,11 @@ static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
 
 static void mmu_spte_set(u64 *sptep, u64 spte);
 static void mmu_free_roots(struct kvm_vcpu *vcpu);
+static bool is_executable_pte(u64 spte);
+
+#define CREATE_TRACE_POINTS
+#include "mmutrace.h"
+
 
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
 {
@@ -265,6 +292,11 @@ static inline bool spte_ad_enabled(u64 spte)
        return !(spte & shadow_acc_track_value);
 }
 
+static bool is_nx_huge_page_enabled(void)
+{
+       return READ_ONCE(nx_huge_pages);
+}
+
 static inline u64 spte_shadow_accessed_mask(u64 spte)
 {
        MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
@@ -1009,10 +1041,16 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
 
 static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
 {
-       if (sp->role.direct)
-               BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
-       else
+       if (!sp->role.direct) {
                sp->gfns[index] = gfn;
+               return;
+       }
+
+       if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
+               pr_err_ratelimited("gfn mismatch under direct page %llx "
+                                  "(expected %llx, got %llx)\n",
+                                  sp->gfn,
+                                  kvm_mmu_page_get_gfn(sp, index), gfn);
 }
 
 /*
@@ -1071,6 +1109,17 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
        kvm_mmu_gfn_disallow_lpage(slot, gfn);
 }
 
+static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+       if (sp->lpage_disallowed)
+               return;
+
+       ++kvm->stat.nx_lpage_splits;
+       list_add_tail(&sp->lpage_disallowed_link,
+                     &kvm->arch.lpage_disallowed_mmu_pages);
+       sp->lpage_disallowed = true;
+}
+
 static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
        struct kvm_memslots *slots;
@@ -1088,6 +1137,13 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
        kvm_mmu_gfn_allow_lpage(slot, gfn);
 }
 
+static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+       --kvm->stat.nx_lpage_splits;
+       sp->lpage_disallowed = false;
+       list_del(&sp->lpage_disallowed_link);
+}
+
 static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
                                          struct kvm_memory_slot *slot)
 {
@@ -1110,12 +1166,12 @@ static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
        return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
 }
 
-static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
+static int host_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
        unsigned long page_size;
        int i, ret = 0;
 
-       page_size = kvm_host_page_size(kvm, gfn);
+       page_size = kvm_host_page_size(vcpu, gfn);
 
        for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
                if (page_size >= KVM_HPAGE_SIZE(i))
@@ -1165,7 +1221,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
        if (unlikely(*force_pt_level))
                return PT_PAGE_TABLE_LEVEL;
 
-       host_level = host_mapping_level(vcpu->kvm, large_gfn);
+       host_level = host_mapping_level(vcpu, large_gfn);
 
        if (host_level == PT_PAGE_TABLE_LEVEL)
                return host_level;
@@ -2635,6 +2691,9 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
                        kvm_reload_remote_mmus(kvm);
        }
 
+       if (sp->lpage_disallowed)
+               unaccount_huge_nx_page(kvm, sp);
+
        sp->role.invalid = 1;
        return ret;
 }
@@ -2800,6 +2859,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
        if (!speculative)
                spte |= spte_shadow_accessed_mask(spte);
 
+       if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
+           is_nx_huge_page_enabled()) {
+               pte_access &= ~ACC_EXEC_MASK;
+       }
+
        if (pte_access & ACC_EXEC_MASK)
                spte |= shadow_x_mask;
        else
@@ -2915,10 +2979,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
                ret = RET_PF_EMULATE;
 
        pgprintk("%s: setting spte %llx\n", __func__, *sptep);
-       pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
-                is_large_pte(*sptep)? "2MB" : "4kB",
-                *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn,
-                *sptep, sptep);
+       trace_kvm_mmu_set_spte(level, gfn, sptep);
        if (!was_rmapped && is_large_pte(*sptep))
                ++vcpu->kvm->stat.lpages;
 
@@ -2930,8 +2991,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
                }
        }
 
-       kvm_release_pfn_clean(pfn);
-
        return ret;
 }
 
@@ -2966,9 +3025,11 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
        if (ret <= 0)
                return -1;
 
-       for (i = 0; i < ret; i++, gfn++, start++)
+       for (i = 0; i < ret; i++, gfn++, start++) {
                mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
                             page_to_pfn(pages[i]), true, true);
+               put_page(pages[i]);
+       }
 
        return 0;
 }
@@ -3016,40 +3077,71 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
        __direct_pte_prefetch(vcpu, sp, sptep);
 }
 
-static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable,
-                       int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault)
+static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
+                                      gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)
 {
-       struct kvm_shadow_walk_iterator iterator;
+       int level = *levelp;
+       u64 spte = *it.sptep;
+
+       if (it.level == level && level > PT_PAGE_TABLE_LEVEL &&
+           is_nx_huge_page_enabled() &&
+           is_shadow_present_pte(spte) &&
+           !is_large_pte(spte)) {
+               /*
+                * A small SPTE exists for this pfn, but FNAME(fetch)
+                * and __direct_map would like to create a large PTE
+                * instead: just force them to go down another level,
+                * patching back for them into pfn the next 9 bits of
+                * the address.
+                */
+               u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1);
+               *pfnp |= gfn & page_mask;
+               (*levelp)--;
+       }
+}
+
+static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
+                       int map_writable, int level, kvm_pfn_t pfn,
+                       bool prefault, bool lpage_disallowed)
+{
+       struct kvm_shadow_walk_iterator it;
        struct kvm_mmu_page *sp;
-       int emulate = 0;
-       gfn_t pseudo_gfn;
+       int ret;
+       gfn_t gfn = gpa >> PAGE_SHIFT;
+       gfn_t base_gfn = gfn;
 
        if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
-               return 0;
+               return RET_PF_RETRY;
 
-       for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
-               if (iterator.level == level) {
-                       emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
-                                              write, level, gfn, pfn, prefault,
-                                              map_writable);
-                       direct_pte_prefetch(vcpu, iterator.sptep);
-                       ++vcpu->stat.pf_fixed;
-                       break;
-               }
+       trace_kvm_mmu_spte_requested(gpa, level, pfn);
+       for_each_shadow_entry(vcpu, gpa, it) {
+               /*
+                * We cannot overwrite existing page tables with an NX
+                * large page, as the leaf could be executable.
+                */
+               disallowed_hugepage_adjust(it, gfn, &pfn, &level);
 
-               drop_large_spte(vcpu, iterator.sptep);
-               if (!is_shadow_present_pte(*iterator.sptep)) {
-                       u64 base_addr = iterator.addr;
+               base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+               if (it.level == level)
+                       break;
 
-                       base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
-                       pseudo_gfn = base_addr >> PAGE_SHIFT;
-                       sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
-                                             iterator.level - 1, 1, ACC_ALL);
+               drop_large_spte(vcpu, it.sptep);
+               if (!is_shadow_present_pte(*it.sptep)) {
+                       sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
+                                             it.level - 1, true, ACC_ALL);
 
-                       link_shadow_page(vcpu, iterator.sptep, sp);
+                       link_shadow_page(vcpu, it.sptep, sp);
+                       if (lpage_disallowed)
+                               account_huge_nx_page(vcpu->kvm, sp);
                }
        }
-       return emulate;
+
+       ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
+                          write, level, base_gfn, pfn, prefault,
+                          map_writable);
+       direct_pte_prefetch(vcpu, it.sptep);
+       ++vcpu->stat.pf_fixed;
+       return ret;
 }
 
 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
@@ -3084,11 +3176,10 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
 }
 
 static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
-                                       gfn_t *gfnp, kvm_pfn_t *pfnp,
+                                       gfn_t gfn, kvm_pfn_t *pfnp,
                                        int *levelp)
 {
        kvm_pfn_t pfn = *pfnp;
-       gfn_t gfn = *gfnp;
        int level = *levelp;
 
        /*
@@ -3098,7 +3189,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
         * here.
         */
        if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
-           level == PT_PAGE_TABLE_LEVEL &&
+           !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL &&
            PageTransCompoundMap(pfn_to_page(pfn)) &&
            !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
                unsigned long mask;
@@ -3115,8 +3206,6 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
                mask = KVM_PAGES_PER_HPAGE(level) - 1;
                VM_BUG_ON((gfn & mask) != (pfn & mask));
                if (pfn & mask) {
-                       gfn &= ~mask;
-                       *gfnp = gfn;
                        kvm_release_pfn_clean(pfn);
                        pfn &= ~mask;
                        kvm_get_pfn(pfn);
@@ -3229,7 +3318,7 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte)
  * - true: let the vcpu to access on the same address again.
  * - false: let the real page fault path to fix it.
  */
-static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
+static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int level,
                            u32 error_code)
 {
        struct kvm_shadow_walk_iterator iterator;
@@ -3249,7 +3338,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
        do {
                u64 new_spte;
 
-               for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
+               for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte)
                        if (!is_shadow_present_pte(spte) ||
                            iterator.level < level)
                                break;
@@ -3327,7 +3416,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
 
        } while (true);
 
-       trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
+       trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep,
                              spte, fault_handled);
        walk_shadow_page_lockless_end(vcpu);
 
@@ -3335,19 +3424,23 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
 }
 
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
-                        gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable);
+                        gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
+                        bool *writable);
 static int make_mmu_pages_available(struct kvm_vcpu *vcpu);
 
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
+static int nonpaging_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
                         gfn_t gfn, bool prefault)
 {
        int r;
        int level;
-       bool force_pt_level = false;
+       bool force_pt_level;
        kvm_pfn_t pfn;
        unsigned long mmu_seq;
        bool map_writable, write = error_code & PFERR_WRITE_MASK;
+       bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+                               is_nx_huge_page_enabled();
 
+       force_pt_level = lpage_disallowed;
        level = mapping_level(vcpu, gfn, &force_pt_level);
        if (likely(!force_pt_level)) {
                /*
@@ -3361,34 +3454,32 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
                gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
        }
 
-       if (fast_page_fault(vcpu, v, level, error_code))
+       if (fast_page_fault(vcpu, gpa, level, error_code))
                return RET_PF_RETRY;
 
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();
 
-       if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
+       if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
                return RET_PF_RETRY;
 
-       if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
+       if (handle_abnormal_pfn(vcpu, gpa, gfn, pfn, ACC_ALL, &r))
                return r;
 
+       r = RET_PF_RETRY;
        spin_lock(&vcpu->kvm->mmu_lock);
        if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
                goto out_unlock;
        if (make_mmu_pages_available(vcpu) < 0)
                goto out_unlock;
        if (likely(!force_pt_level))
-               transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
-       r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
-       spin_unlock(&vcpu->kvm->mmu_lock);
-
-       return r;
-
+               transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
+       r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
+                        prefault, false);
 out_unlock:
        spin_unlock(&vcpu->kvm->mmu_lock);
        kvm_release_pfn_clean(pfn);
-       return RET_PF_RETRY;
+       return r;
 }
 
 
@@ -3638,7 +3729,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
 
-static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
+static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr,
                                  u32 access, struct x86_exception *exception)
 {
        if (exception)
@@ -3646,7 +3737,7 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
        return vaddr;
 }
 
-static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
+static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr,
                                         u32 access,
                                         struct x86_exception *exception)
 {
@@ -3807,13 +3898,14 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
        walk_shadow_page_lockless_end(vcpu);
 }
 
-static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
+static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa,
                                u32 error_code, bool prefault)
 {
-       gfn_t gfn = gva >> PAGE_SHIFT;
+       gfn_t gfn = gpa >> PAGE_SHIFT;
        int r;
 
-       pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
+       /* Note, paging is disabled, ergo gva == gpa. */
+       pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code);
 
        if (page_fault_handle_page_track(vcpu, error_code, gfn))
                return RET_PF_EMULATE;
@@ -3825,11 +3917,12 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
        MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
 
-       return nonpaging_map(vcpu, gva & PAGE_MASK,
+       return nonpaging_map(vcpu, gpa & PAGE_MASK,
                             error_code, gfn, prefault);
 }
 
-static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
+static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+                                  gfn_t gfn)
 {
        struct kvm_arch_async_pf arch;
 
@@ -3838,7 +3931,8 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
        arch.direct_map = vcpu->arch.mmu.direct_map;
        arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
 
-       return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
+       return kvm_setup_async_pf(vcpu, cr2_or_gpa,
+                                 kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
 }
 
 bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
@@ -3855,7 +3949,8 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
 }
 
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
-                        gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
+                        gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
+                        bool *writable)
 {
        struct kvm_memory_slot *slot;
        bool async;
@@ -3867,12 +3962,12 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
                return false; /* *pfn has correct page already */
 
        if (!prefault && kvm_can_do_async_pf(vcpu)) {
-               trace_kvm_try_async_get_page(gva, gfn);
+               trace_kvm_try_async_get_page(cr2_or_gpa, gfn);
                if (kvm_find_async_pf_gfn(vcpu, gfn)) {
-                       trace_kvm_async_pf_doublefault(gva, gfn);
+                       trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
                        kvm_make_request(KVM_REQ_APF_HALT, vcpu);
                        return true;
-               } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
+               } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn))
                        return true;
        }
 
@@ -3885,6 +3980,12 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
 {
        int r = 1;
 
+#ifndef CONFIG_X86_64
+       /* A 64-bit CR2 should be impossible on 32-bit KVM. */
+       if (WARN_ON_ONCE(fault_address >> 32))
+               return -EFAULT;
+#endif
+
        vcpu->arch.l1tf_flush_l1d = true;
        switch (vcpu->arch.apf.host_apf_reason) {
        default:
@@ -3922,7 +4023,7 @@ check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
        return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
 }
 
-static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
+static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
                          bool prefault)
 {
        kvm_pfn_t pfn;
@@ -3933,6 +4034,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
        unsigned long mmu_seq;
        int write = error_code & PFERR_WRITE_MASK;
        bool map_writable;
+       bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+                               is_nx_huge_page_enabled();
 
        MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
@@ -3943,8 +4046,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
        if (r)
                return r;
 
-       force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
-                                                          PT_DIRECTORY_LEVEL);
+       force_pt_level =
+               lpage_disallowed ||
+               !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL);
        level = mapping_level(vcpu, gfn, &force_pt_level);
        if (likely(!force_pt_level)) {
                if (level > PT_DIRECTORY_LEVEL &&
@@ -3965,22 +4069,20 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
        if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
                return r;
 
+       r = RET_PF_RETRY;
        spin_lock(&vcpu->kvm->mmu_lock);
        if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
                goto out_unlock;
        if (make_mmu_pages_available(vcpu) < 0)
                goto out_unlock;
        if (likely(!force_pt_level))
-               transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
-       r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
-       spin_unlock(&vcpu->kvm->mmu_lock);
-
-       return r;
-
+               transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
+       r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
+                        prefault, lpage_disallowed);
 out_unlock:
        spin_unlock(&vcpu->kvm->mmu_lock);
        kvm_release_pfn_clean(pfn);
-       return RET_PF_RETRY;
+       return r;
 }
 
 static void nonpaging_init_context(struct kvm_vcpu *vcpu,
@@ -4957,7 +5059,7 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
        return 0;
 }
 
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
                       void *insn, int insn_len)
 {
        int r, emulation_type = EMULTYPE_RETRY;
@@ -4967,12 +5069,12 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
        /* With shadow page tables, fault_address contains a GVA or nGPA.  */
        if (vcpu->arch.mmu.direct_map) {
                vcpu->arch.gpa_available = true;
-               vcpu->arch.gpa_val = cr2;
+               vcpu->arch.gpa_val = cr2_or_gpa;
        }
 
        r = RET_PF_INVALID;
        if (unlikely(error_code & PFERR_RSVD_MASK)) {
-               r = handle_mmio_page_fault(vcpu, cr2, direct);
+               r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
                if (r == RET_PF_EMULATE) {
                        emulation_type = 0;
                        goto emulate;
@@ -4980,8 +5082,9 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
        }
 
        if (r == RET_PF_INVALID) {
-               r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
-                                             false);
+               r = vcpu->arch.mmu.page_fault(vcpu, cr2_or_gpa,
+                                              lower_32_bits(error_code),
+                                              false);
                WARN_ON(r == RET_PF_INVALID);
        }
 
@@ -4999,11 +5102,11 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
         */
        if (vcpu->arch.mmu.direct_map &&
            (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
-               kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
+               kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
                return 1;
        }
 
-       if (mmio_info_in_cache(vcpu, cr2, direct))
+       if (mmio_info_in_cache(vcpu, cr2_or_gpa, direct))
                emulation_type = 0;
 emulate:
        /*
@@ -5016,7 +5119,7 @@ emulate:
        if (unlikely(insn && !insn_len))
                return 1;
 
-       er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
+       er = x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, insn_len);
 
        switch (er) {
        case EMULATE_DONE:
@@ -5273,9 +5376,9 @@ restart:
                 * the guest, and the guest page table is using 4K page size
                 * mapping if the indirect sp has level = 1.
                 */
-               if (sp->role.direct &&
-                       !kvm_is_reserved_pfn(pfn) &&
-                       PageTransCompoundMap(pfn_to_page(pfn))) {
+               if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
+                   !kvm_is_zone_device_pfn(pfn) &&
+                   PageTransCompoundMap(pfn_to_page(pfn))) {
                        drop_spte(kvm, sptep);
                        need_tlb_flush = 1;
                        goto restart;
@@ -5474,7 +5577,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
        int nr_to_scan = sc->nr_to_scan;
        unsigned long freed = 0;
 
-       spin_lock(&kvm_lock);
+       mutex_lock(&kvm_lock);
 
        list_for_each_entry(kvm, &vm_list, vm_list) {
                int idx;
@@ -5524,7 +5627,7 @@ unlock:
                break;
        }
 
-       spin_unlock(&kvm_lock);
+       mutex_unlock(&kvm_lock);
        return freed;
 }
 
@@ -5546,10 +5649,60 @@ static void mmu_destroy_caches(void)
        kmem_cache_destroy(mmu_page_header_cache);
 }
 
+static bool get_nx_auto_mode(void)
+{
+       /* Return true when CPU has the bug, and mitigations are ON */
+       return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
+}
+
+static void __set_nx_huge_pages(bool val)
+{
+       nx_huge_pages = itlb_multihit_kvm_mitigation = val;
+}
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
+{
+       bool old_val = nx_huge_pages;
+       bool new_val;
+
+       /* In "auto" mode deploy workaround only if CPU has the bug. */
+       if (sysfs_streq(val, "off"))
+               new_val = 0;
+       else if (sysfs_streq(val, "force"))
+               new_val = 1;
+       else if (sysfs_streq(val, "auto"))
+               new_val = get_nx_auto_mode();
+       else if (strtobool(val, &new_val) < 0)
+               return -EINVAL;
+
+       __set_nx_huge_pages(new_val);
+
+       if (new_val != old_val) {
+               struct kvm *kvm;
+               int idx;
+
+               mutex_lock(&kvm_lock);
+
+               list_for_each_entry(kvm, &vm_list, vm_list) {
+                       idx = srcu_read_lock(&kvm->srcu);
+                       kvm_mmu_invalidate_zap_all_pages(kvm);
+                       srcu_read_unlock(&kvm->srcu, idx);
+
+                       wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+               }
+               mutex_unlock(&kvm_lock);
+       }
+
+       return 0;
+}
+
 int kvm_mmu_module_init(void)
 {
        int ret = -ENOMEM;
 
+       if (nx_huge_pages == -1)
+               __set_nx_huge_pages(get_nx_auto_mode());
+
        kvm_mmu_reset_all_pte_masks();
 
        pte_list_desc_cache = kmem_cache_create("pte_list_desc",
@@ -5616,3 +5769,116 @@ void kvm_mmu_module_exit(void)
        unregister_shrinker(&mmu_shrinker);
        mmu_audit_disable();
 }
+
+static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp)
+{
+       unsigned int old_val;
+       int err;
+
+       old_val = nx_huge_pages_recovery_ratio;
+       err = param_set_uint(val, kp);
+       if (err)
+               return err;
+
+       if (READ_ONCE(nx_huge_pages) &&
+           !old_val && nx_huge_pages_recovery_ratio) {
+               struct kvm *kvm;
+
+               mutex_lock(&kvm_lock);
+
+               list_for_each_entry(kvm, &vm_list, vm_list)
+                       wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+
+               mutex_unlock(&kvm_lock);
+       }
+
+       return err;
+}
+
+static void kvm_recover_nx_lpages(struct kvm *kvm)
+{
+       int rcu_idx;
+       struct kvm_mmu_page *sp;
+       unsigned int ratio;
+       LIST_HEAD(invalid_list);
+       ulong to_zap;
+
+       rcu_idx = srcu_read_lock(&kvm->srcu);
+       spin_lock(&kvm->mmu_lock);
+
+       ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+       to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
+       while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) {
+               /*
+                * We use a separate list instead of just using active_mmu_pages
+                * because the number of lpage_disallowed pages is expected to
+                * be relatively small compared to the total.
+                */
+               sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
+                                     struct kvm_mmu_page,
+                                     lpage_disallowed_link);
+               WARN_ON_ONCE(!sp->lpage_disallowed);
+               kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+               WARN_ON_ONCE(sp->lpage_disallowed);
+
+               if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+                       kvm_mmu_commit_zap_page(kvm, &invalid_list);
+                       if (to_zap)
+                               cond_resched_lock(&kvm->mmu_lock);
+               }
+       }
+
+       spin_unlock(&kvm->mmu_lock);
+       srcu_read_unlock(&kvm->srcu, rcu_idx);
+}
+
+static long get_nx_lpage_recovery_timeout(u64 start_time)
+{
+       return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio)
+               ? start_time + 60 * HZ - get_jiffies_64()
+               : MAX_SCHEDULE_TIMEOUT;
+}
+
+static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
+{
+       u64 start_time;
+       long remaining_time;
+
+       while (true) {
+               start_time = get_jiffies_64();
+               remaining_time = get_nx_lpage_recovery_timeout(start_time);
+
+               set_current_state(TASK_INTERRUPTIBLE);
+               while (!kthread_should_stop() && remaining_time > 0) {
+                       schedule_timeout(remaining_time);
+                       remaining_time = get_nx_lpage_recovery_timeout(start_time);
+                       set_current_state(TASK_INTERRUPTIBLE);
+               }
+
+               set_current_state(TASK_RUNNING);
+
+               if (kthread_should_stop())
+                       return 0;
+
+               kvm_recover_nx_lpages(kvm);
+       }
+}
+
+int kvm_mmu_post_init_vm(struct kvm *kvm)
+{
+       int err;
+
+       err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
+                                         "kvm-nx-lpage-recovery",
+                                         &kvm->arch.nx_lpage_recovery_thread);
+       if (!err)
+               kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
+
+       return err;
+}
+
+void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
+{
+       if (kvm->arch.nx_lpage_recovery_thread)
+               kthread_stop(kvm->arch.nx_lpage_recovery_thread);
+}