KVM: MMU: filter out the mmio pfn from the fault pfn
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index aee38623b768edae62394fc09c64742f54b5b955..96a7ed4e6837d3da36e79364e947b0ca4bc3427a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -148,7 +148,7 @@ module_param(oos_shadow, bool, 0644);
 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
                        | PT64_NX_MASK)
 
-#define RMAP_EXT 4
+#define PTE_LIST_EXT 4
 
 #define ACC_EXEC_MASK    1
 #define ACC_WRITE_MASK   PT_WRITABLE_MASK
@@ -164,9 +164,9 @@ module_param(oos_shadow, bool, 0644);
 
 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 
-struct kvm_rmap_desc {
-       u64 *sptes[RMAP_EXT];
-       struct kvm_rmap_desc *more;
+struct pte_list_desc {
+       u64 *sptes[PTE_LIST_EXT];
+       struct pte_list_desc *more;
 };
 
 struct kvm_shadow_walk_iterator {
@@ -182,15 +182,10 @@ struct kvm_shadow_walk_iterator {
             shadow_walk_okay(&(_walker));                      \
             shadow_walk_next(&(_walker)))
 
-typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
-
-static struct kmem_cache *pte_chain_cache;
-static struct kmem_cache *rmap_desc_cache;
+static struct kmem_cache *pte_list_desc_cache;
 static struct kmem_cache *mmu_page_header_cache;
 static struct percpu_counter kvm_total_used_mmu_pages;
 
-static u64 __read_mostly shadow_trap_nonpresent_pte;
-static u64 __read_mostly shadow_notrap_nonpresent_pte;
 static u64 __read_mostly shadow_nx_mask;
 static u64 __read_mostly shadow_x_mask;        /* mutual exclusive with nx_mask */
 static u64 __read_mostly shadow_user_mask;
@@ -202,13 +197,6 @@ static inline u64 rsvd_bits(int s, int e)
        return ((1ULL << (e - s + 1)) - 1) << s;
 }
 
-void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
-{
-       shadow_trap_nonpresent_pte = trap_pte;
-       shadow_notrap_nonpresent_pte = notrap_pte;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
-
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
                u64 dirty_mask, u64 nx_mask, u64 x_mask)
 {
@@ -220,11 +208,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
-static bool is_write_protection(struct kvm_vcpu *vcpu)
-{
-       return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
-}
-
 static int is_cpuid_PSE36(void)
 {
        return 1;
@@ -237,8 +220,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
 
 static int is_shadow_present_pte(u64 pte)
 {
-       return pte != shadow_trap_nonpresent_pte
-               && pte != shadow_notrap_nonpresent_pte;
+       return pte & PT_PRESENT_MASK;
 }
 
 static int is_large_pte(u64 pte)
@@ -246,11 +228,6 @@ static int is_large_pte(u64 pte)
        return pte & PT_PAGE_SIZE_MASK;
 }
 
-static int is_writable_pte(unsigned long pte)
-{
-       return pte & PT_WRITABLE_MASK;
-}
-
 static int is_dirty_gpte(unsigned long pte)
 {
        return pte & PT_DIRTY_MASK;
@@ -397,12 +374,8 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
 {
        int r;
 
-       r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
-                                  pte_chain_cache, 4);
-       if (r)
-               goto out;
-       r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
-                                  rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
+       r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
+                                  pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
        if (r)
                goto out;
        r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
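
The top-up size above follows from the merge: the two old caches reserved 4
pte_chain entries and 4 + PTE_PREFETCH_NUM rmap_desc entries respectively, so
the unified pte_list_desc cache reserves their sum, 4 + (4 + PTE_PREFETCH_NUM)
= 8 + PTE_PREFETCH_NUM, leaving the worst-case reservation unchanged.
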
@@ -416,8 +389,8 @@ out:
 
 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 {
-       mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
-       mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
+       mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
+                               pte_list_desc_cache);
        mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
        mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
                                mmu_page_header_cache);
@@ -433,26 +406,15 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
        return p;
 }
 
-static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
-{
-       return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
-                                     sizeof(struct kvm_pte_chain));
-}
-
-static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
+static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
 {
-       kmem_cache_free(pte_chain_cache, pc);
+       return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache,
+                                     sizeof(struct pte_list_desc));
 }
 
-static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
+static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
 {
-       return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
-                                     sizeof(struct kvm_rmap_desc));
-}
-
-static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
-{
-       kmem_cache_free(rmap_desc_cache, rd);
+       kmem_cache_free(pte_list_desc_cache, pte_list_desc);
 }
 
 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
@@ -498,6 +460,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
                linfo = lpage_info_slot(gfn, slot, i);
                linfo->write_count += 1;
        }
+       kvm->arch.indirect_shadow_pages++;
 }
 
 static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
@@ -513,6 +476,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
                linfo->write_count -= 1;
                WARN_ON(linfo->write_count < 0);
        }
+       kvm->arch.indirect_shadow_pages--;
 }
 
 static int has_wrprotected_page(struct kvm *kvm,
@@ -588,67 +552,42 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
 }
 
 /*
- * Take gfn and return the reverse mapping to it.
- */
-
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
-{
-       struct kvm_memory_slot *slot;
-       struct kvm_lpage_info *linfo;
-
-       slot = gfn_to_memslot(kvm, gfn);
-       if (likely(level == PT_PAGE_TABLE_LEVEL))
-               return &slot->rmap[gfn - slot->base_gfn];
-
-       linfo = lpage_info_slot(gfn, slot, level);
-
-       return &linfo->rmap_pde;
-}
-
-/*
- * Reverse mapping data structures:
+ * Pte mapping structures:
  *
- * If rmapp bit zero is zero, then rmapp point to the shadw page table entry
- * that points to page_address(page).
+ * If pte_list bit zero is zero, then pte_list points to the spte.
  *
- * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
- * containing more mappings.
+ * If pte_list bit zero is one, then (pte_list & ~1) points to a struct
+ * pte_list_desc containing more mappings.
  *
- * Returns the number of rmap entries before the spte was added or zero if
+ * Returns the number of pte entries before the spte was added or zero if
  * the spte was not added.
  *
  */
-static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
+                       unsigned long *pte_list)
 {
-       struct kvm_mmu_page *sp;
-       struct kvm_rmap_desc *desc;
-       unsigned long *rmapp;
+       struct pte_list_desc *desc;
        int i, count = 0;
 
-       if (!is_rmap_spte(*spte))
-               return count;
-       sp = page_header(__pa(spte));
-       kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
-       rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
-       if (!*rmapp) {
-               rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
-               *rmapp = (unsigned long)spte;
-       } else if (!(*rmapp & 1)) {
-               rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
-               desc = mmu_alloc_rmap_desc(vcpu);
-               desc->sptes[0] = (u64 *)*rmapp;
+       if (!*pte_list) {
+               rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
+               *pte_list = (unsigned long)spte;
+       } else if (!(*pte_list & 1)) {
+               rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
+               desc = mmu_alloc_pte_list_desc(vcpu);
+               desc->sptes[0] = (u64 *)*pte_list;
                desc->sptes[1] = spte;
-               *rmapp = (unsigned long)desc | 1;
+               *pte_list = (unsigned long)desc | 1;
                ++count;
        } else {
-               rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
-               desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
-               while (desc->sptes[RMAP_EXT-1] && desc->more) {
+               rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
+               desc = (struct pte_list_desc *)(*pte_list & ~1ul);
+               while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
                        desc = desc->more;
-                       count += RMAP_EXT;
+                       count += PTE_LIST_EXT;
                }
-               if (desc->sptes[RMAP_EXT-1]) {
-                       desc->more = mmu_alloc_rmap_desc(vcpu);
+               if (desc->sptes[PTE_LIST_EXT-1]) {
+                       desc->more = mmu_alloc_pte_list_desc(vcpu);
                        desc = desc->more;
                }
                for (i = 0; desc->sptes[i]; ++i)
@@ -658,59 +597,78 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
        return count;
 }
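
The bit-zero tagging described in the comment above is easy to model outside
the kernel. The sketch below is illustrative userspace C, not part of the
patch; it assumes only that the allocator returns pointers aligned to at least
two bytes, so bit zero is free to use as a tag:

	/* userspace model of the pte_list head encoding (illustrative) */
	#include <assert.h>
	#include <stdint.h>
	#include <stdlib.h>

	#define PTE_LIST_EXT 4

	struct pte_list_desc {
		uint64_t *sptes[PTE_LIST_EXT];
		struct pte_list_desc *more;
	};

	int main(void)
	{
		uint64_t spte0, spte1;
		struct pte_list_desc *desc;
		/* 0->1: the head is the spte pointer itself, bit zero clear */
		unsigned long head = (unsigned long)&spte0;

		assert(!(head & 1));

		/* 1->many: move the entry into a desc and tag the head */
		desc = calloc(1, sizeof(*desc));
		desc->sptes[0] = (uint64_t *)head;
		desc->sptes[1] = &spte1;
		head = (unsigned long)desc | 1;

		/* readers untag with ~1ul, as the pte_list helpers do */
		assert((struct pte_list_desc *)(head & ~1ul) == desc);
		free(desc);
		return 0;
	}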
 
-static void rmap_desc_remove_entry(unsigned long *rmapp,
-                                  struct kvm_rmap_desc *desc,
-                                  int i,
-                                  struct kvm_rmap_desc *prev_desc)
+static u64 *pte_list_next(unsigned long *pte_list, u64 *spte)
+{
+       struct pte_list_desc *desc;
+       u64 *prev_spte;
+       int i;
+
+       if (!*pte_list)
+               return NULL;
+       else if (!(*pte_list & 1)) {
+               if (!spte)
+                       return (u64 *)*pte_list;
+               return NULL;
+       }
+       desc = (struct pte_list_desc *)(*pte_list & ~1ul);
+       prev_spte = NULL;
+       while (desc) {
+               for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
+                       if (prev_spte == spte)
+                               return desc->sptes[i];
+                       prev_spte = desc->sptes[i];
+               }
+               desc = desc->more;
+       }
+       return NULL;
+}
+
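
pte_list_next() is a cursor-style iterator: passing NULL yields the first
spte on the list, passing the previous return value yields the next one, and
a NULL return marks the end. Callers that only read advance the cursor; a
caller that removes entries, such as kvm_mmu_unlink_parents() in a later
hunk, re-fetches the first entry each time instead. The read-only loop shape,
as a sketch rather than compilable kernel code:

	u64 *spte = pte_list_next(pte_list, NULL);	/* first entry */

	while (spte) {
		/* ... examine *spte ... */
		spte = pte_list_next(pte_list, spte);	/* next entry */
	}
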
+static void
+pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc,
+                          int i, struct pte_list_desc *prev_desc)
 {
        int j;
 
-       for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j)
+       for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
                ;
        desc->sptes[i] = desc->sptes[j];
        desc->sptes[j] = NULL;
        if (j != 0)
                return;
        if (!prev_desc && !desc->more)
-               *rmapp = (unsigned long)desc->sptes[0];
+               *pte_list = (unsigned long)desc->sptes[0];
        else
                if (prev_desc)
                        prev_desc->more = desc->more;
                else
-                       *rmapp = (unsigned long)desc->more | 1;
-       mmu_free_rmap_desc(desc);
+                       *pte_list = (unsigned long)desc->more | 1;
+       mmu_free_pte_list_desc(desc);
 }
 
-static void rmap_remove(struct kvm *kvm, u64 *spte)
+static void pte_list_remove(u64 *spte, unsigned long *pte_list)
 {
-       struct kvm_rmap_desc *desc;
-       struct kvm_rmap_desc *prev_desc;
-       struct kvm_mmu_page *sp;
-       gfn_t gfn;
-       unsigned long *rmapp;
+       struct pte_list_desc *desc;
+       struct pte_list_desc *prev_desc;
        int i;
 
-       sp = page_header(__pa(spte));
-       gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
-       rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
-       if (!*rmapp) {
-               printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte);
+       if (!*pte_list) {
+               printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
                BUG();
-       } else if (!(*rmapp & 1)) {
-               rmap_printk("rmap_remove:  %p 1->0\n", spte);
-               if ((u64 *)*rmapp != spte) {
-                       printk(KERN_ERR "rmap_remove:  %p 1->BUG\n", spte);
+       } else if (!(*pte_list & 1)) {
+               rmap_printk("pte_list_remove:  %p 1->0\n", spte);
+               if ((u64 *)*pte_list != spte) {
+                       printk(KERN_ERR "pte_list_remove:  %p 1->BUG\n", spte);
                        BUG();
                }
-               *rmapp = 0;
+               *pte_list = 0;
        } else {
-               rmap_printk("rmap_remove:  %p many->many\n", spte);
-               desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+               rmap_printk("pte_list_remove:  %p many->many\n", spte);
+               desc = (struct pte_list_desc *)(*pte_list & ~1ul);
                prev_desc = NULL;
                while (desc) {
-                       for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i)
+                       for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
                                if (desc->sptes[i] == spte) {
-                                       rmap_desc_remove_entry(rmapp,
+                                       pte_list_desc_remove_entry(pte_list,
                                                               desc, i,
                                                               prev_desc);
                                        return;
@@ -718,11 +676,76 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
                        prev_desc = desc;
                        desc = desc->more;
                }
-               pr_err("rmap_remove: %p many->many\n", spte);
+               pr_err("pte_list_remove: %p many->many\n", spte);
                BUG();
        }
 }
 
+typedef void (*pte_list_walk_fn) (u64 *spte);
+static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
+{
+       struct pte_list_desc *desc;
+       int i;
+
+       if (!*pte_list)
+               return;
+
+       if (!(*pte_list & 1))
+               return fn((u64 *)*pte_list);
+
+       desc = (struct pte_list_desc *)(*pte_list & ~1ul);
+       while (desc) {
+               for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
+                       fn(desc->sptes[i]);
+               desc = desc->more;
+       }
+}
+
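
pte_list_walk() covers the same traversal without exposing a cursor: it
applies the callback to every spte on the list. Its one user in this diff is
kvm_mmu_mark_parents_unsync(), further down, which passes mark_unsync. A
hypothetical callback, named here only for illustration, would plug in the
same way:

	/* hypothetical example callback: count the entries on a list */
	static int nr_entries;

	static void count_entry(u64 *spte)
	{
		nr_entries++;
	}

	/* ... pte_list_walk(&sp->parent_ptes, count_entry); */
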
+/*
+ * Take gfn and return the reverse mapping to it.
+ */
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
+{
+       struct kvm_memory_slot *slot;
+       struct kvm_lpage_info *linfo;
+
+       slot = gfn_to_memslot(kvm, gfn);
+       if (likely(level == PT_PAGE_TABLE_LEVEL))
+               return &slot->rmap[gfn - slot->base_gfn];
+
+       linfo = lpage_info_slot(gfn, slot, level);
+
+       return &linfo->rmap_pde;
+}
+
+static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+{
+       struct kvm_mmu_page *sp;
+       unsigned long *rmapp;
+
+       sp = page_header(__pa(spte));
+       kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
+       rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
+       return pte_list_add(vcpu, spte, rmapp);
+}
+
+static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
+{
+       return pte_list_next(rmapp, spte);
+}
+
+static void rmap_remove(struct kvm *kvm, u64 *spte)
+{
+       struct kvm_mmu_page *sp;
+       gfn_t gfn;
+       unsigned long *rmapp;
+
+       sp = page_header(__pa(spte));
+       gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
+       rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
+       pte_list_remove(spte, rmapp);
+}
+
 static int set_spte_track_bits(u64 *sptep, u64 new_spte)
 {
        pfn_t pfn;
@@ -744,38 +767,12 @@ static int set_spte_track_bits(u64 *sptep, u64 new_spte)
        return 1;
 }
 
-static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
+static void drop_spte(struct kvm *kvm, u64 *sptep)
 {
-       if (set_spte_track_bits(sptep, new_spte))
+       if (set_spte_track_bits(sptep, 0ull))
                rmap_remove(kvm, sptep);
 }
 
-static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
-{
-       struct kvm_rmap_desc *desc;
-       u64 *prev_spte;
-       int i;
-
-       if (!*rmapp)
-               return NULL;
-       else if (!(*rmapp & 1)) {
-               if (!spte)
-                       return (u64 *)*rmapp;
-               return NULL;
-       }
-       desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
-       prev_spte = NULL;
-       while (desc) {
-               for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
-                       if (prev_spte == spte)
-                               return desc->sptes[i];
-                       prev_spte = desc->sptes[i];
-               }
-               desc = desc->more;
-       }
-       return NULL;
-}
-
 static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 {
        unsigned long *rmapp;
@@ -807,8 +804,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
                        BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
                        pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
                        if (is_writable_pte(*spte)) {
-                               drop_spte(kvm, spte,
-                                         shadow_trap_nonpresent_pte);
+                               drop_spte(kvm, spte);
                                --kvm->stat.lpages;
                                spte = NULL;
                                write_protected = 1;
@@ -829,7 +825,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
        while ((spte = rmap_next(kvm, rmapp, NULL))) {
                BUG_ON(!(*spte & PT_PRESENT_MASK));
                rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
-               drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
+               drop_spte(kvm, spte);
                need_tlb_flush = 1;
        }
        return need_tlb_flush;
@@ -851,7 +847,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
                rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
                need_flush = 1;
                if (pte_write(*ptep)) {
-                       drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
+                       drop_spte(kvm, spte);
                        spte = rmap_next(kvm, rmapp, NULL);
                } else {
                        new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
@@ -1032,151 +1028,89 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
        percpu_counter_add(&kvm_total_used_mmu_pages, nr);
 }
 
-static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+/*
+ * Remove the sp from the shadow page cache. After this is called,
+ * the sp can no longer be found in the cache, but the shadow
+ * page table is still valid.
+ * It must be called under the protection of the mmu lock.
+ */
+static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp)
 {
        ASSERT(is_empty_shadow_page(sp->spt));
        hlist_del(&sp->hash_link);
-       list_del(&sp->link);
-       free_page((unsigned long)sp->spt);
        if (!sp->role.direct)
                free_page((unsigned long)sp->gfns);
-       kmem_cache_free(mmu_page_header_cache, sp);
-       kvm_mod_used_mmu_pages(kvm, -1);
 }
 
-static unsigned kvm_page_table_hashfn(gfn_t gfn)
+/*
+ * Free the shadow page table and the sp; this can be done
+ * outside the protection of the mmu lock.
+ */
+static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
 {
-       return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
+       list_del(&sp->link);
+       free_page((unsigned long)sp->spt);
+       kmem_cache_free(mmu_page_header_cache, sp);
 }
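
The split formalizes a two-phase teardown: kvm_mmu_isolate_page() makes the
sp unreachable through the hash table while mmu_lock is held, and
kvm_mmu_free_page() releases the memory, which no longer needs the lock.
kvm_mmu_commit_zap_page(), later in this diff, pairs them in exactly this
order; in outline (a sketch of the contract the two comments describe, not
new kernel code):

	/* with mmu_lock held: unhash, so no new lookup can find the sp */
	kvm_mmu_isolate_page(sp);
	/* safe even after mmu_lock is dropped: free the spt page and sp */
	kvm_mmu_free_page(sp);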
 
-static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
-                                              u64 *parent_pte, int direct)
+static unsigned kvm_page_table_hashfn(gfn_t gfn)
 {
-       struct kvm_mmu_page *sp;
-
-       sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
-       sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
-       if (!direct)
-               sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
-                                                 PAGE_SIZE);
-       set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
-       list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
-       bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
-       sp->multimapped = 0;
-       sp->parent_pte = parent_pte;
-       kvm_mod_used_mmu_pages(vcpu->kvm, +1);
-       return sp;
+       return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
 }
 
 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
                                    struct kvm_mmu_page *sp, u64 *parent_pte)
 {
-       struct kvm_pte_chain *pte_chain;
-       struct hlist_node *node;
-       int i;
-
        if (!parent_pte)
                return;
-       if (!sp->multimapped) {
-               u64 *old = sp->parent_pte;
 
-               if (!old) {
-                       sp->parent_pte = parent_pte;
-                       return;
-               }
-               sp->multimapped = 1;
-               pte_chain = mmu_alloc_pte_chain(vcpu);
-               INIT_HLIST_HEAD(&sp->parent_ptes);
-               hlist_add_head(&pte_chain->link, &sp->parent_ptes);
-               pte_chain->parent_ptes[0] = old;
-       }
-       hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
-               if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
-                       continue;
-               for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
-                       if (!pte_chain->parent_ptes[i]) {
-                               pte_chain->parent_ptes[i] = parent_pte;
-                               return;
-                       }
-       }
-       pte_chain = mmu_alloc_pte_chain(vcpu);
-       BUG_ON(!pte_chain);
-       hlist_add_head(&pte_chain->link, &sp->parent_ptes);
-       pte_chain->parent_ptes[0] = parent_pte;
+       pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
 }
 
 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
                                       u64 *parent_pte)
 {
-       struct kvm_pte_chain *pte_chain;
-       struct hlist_node *node;
-       int i;
-
-       if (!sp->multimapped) {
-               BUG_ON(sp->parent_pte != parent_pte);
-               sp->parent_pte = NULL;
-               return;
-       }
-       hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
-               for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
-                       if (!pte_chain->parent_ptes[i])
-                               break;
-                       if (pte_chain->parent_ptes[i] != parent_pte)
-                               continue;
-                       while (i + 1 < NR_PTE_CHAIN_ENTRIES
-                               && pte_chain->parent_ptes[i + 1]) {
-                               pte_chain->parent_ptes[i]
-                                       = pte_chain->parent_ptes[i + 1];
-                               ++i;
-                       }
-                       pte_chain->parent_ptes[i] = NULL;
-                       if (i == 0) {
-                               hlist_del(&pte_chain->link);
-                               mmu_free_pte_chain(pte_chain);
-                               if (hlist_empty(&sp->parent_ptes)) {
-                                       sp->multimapped = 0;
-                                       sp->parent_pte = NULL;
-                               }
-                       }
-                       return;
-               }
-       BUG();
+       pte_list_remove(parent_pte, &sp->parent_ptes);
 }
 
-static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
+static void drop_parent_pte(struct kvm_mmu_page *sp,
+                           u64 *parent_pte)
 {
-       struct kvm_pte_chain *pte_chain;
-       struct hlist_node *node;
-       struct kvm_mmu_page *parent_sp;
-       int i;
-
-       if (!sp->multimapped && sp->parent_pte) {
-               parent_sp = page_header(__pa(sp->parent_pte));
-               fn(parent_sp, sp->parent_pte);
-               return;
-       }
-
-       hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
-               for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
-                       u64 *spte = pte_chain->parent_ptes[i];
+       mmu_page_remove_parent_pte(sp, parent_pte);
+       __set_spte(parent_pte, 0ull);
+}
 
-                       if (!spte)
-                               break;
-                       parent_sp = page_header(__pa(spte));
-                       fn(parent_sp, spte);
-               }
+static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
+                                              u64 *parent_pte, int direct)
+{
+       struct kvm_mmu_page *sp;
+       sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache,
+                                       sizeof *sp);
+       sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+       if (!direct)
+               sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
+                                                 PAGE_SIZE);
+       set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+       list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
+       bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
+       sp->parent_ptes = 0;
+       mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+       kvm_mod_used_mmu_pages(vcpu->kvm, +1);
+       return sp;
 }
 
-static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
+static void mark_unsync(u64 *spte);
 static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
 {
-       mmu_parent_walk(sp, mark_unsync);
+       pte_list_walk(&sp->parent_ptes, mark_unsync);
 }
 
-static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
+static void mark_unsync(u64 *spte)
 {
+       struct kvm_mmu_page *sp;
        unsigned int index;
 
+       sp = page_header(__pa(spte));
        index = spte - sp->spt;
        if (__test_and_set_bit(index, sp->unsync_child_bitmap))
                return;
@@ -1185,15 +1119,6 @@ static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
        kvm_mmu_mark_parents_unsync(sp);
 }
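
mark_unsync() can lose its sp parameter because the owning shadow page is
recoverable from the spte pointer alone: kvm_mmu_alloc_page() stashes the sp
in the struct page behind sp->spt via set_page_private(), and page_header()
reads it back. For reference, that helper (defined earlier in this file)
amounts to:

	static struct kvm_mmu_page *page_header(hpa_t shadow_page)
	{
		struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);

		return (struct kvm_mmu_page *)page_private(page);
	}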
 
-static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
-                                   struct kvm_mmu_page *sp)
-{
-       int i;
-
-       for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
-               sp->spt[i] = shadow_trap_nonpresent_pte;
-}
-
 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
                               struct kvm_mmu_page *sp)
 {
@@ -1475,6 +1400,14 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
        }
 }
 
+static void init_shadow_page_table(struct kvm_mmu_page *sp)
+{
+       int i;
+
+       for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+               sp->spt[i] = 0ull;
+}
+
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                                             gfn_t gfn,
                                             gva_t gaddr,
@@ -1537,10 +1470,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 
                account_shadowed(vcpu->kvm, gfn);
        }
-       if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
-               vcpu->arch.mmu.prefetch_page(vcpu, sp);
-       else
-               nonpaging_prefetch_page(vcpu, sp);
+       init_shadow_page_table(sp);
        trace_kvm_mmu_get_page(sp, true);
        return sp;
 }
@@ -1572,10 +1502,6 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
        if (iterator->level < PT_PAGE_TABLE_LEVEL)
                return false;
 
-       if (iterator->level == PT_PAGE_TABLE_LEVEL)
-               if (is_large_pte(*iterator->sptep))
-                       return false;
-
        iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
        iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
        return true;
@@ -1583,6 +1509,11 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
 
 static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
 {
+       if (is_last_spte(*iterator->sptep, iterator->level)) {
+               iterator->level = 0;
+               return;
+       }
+
        iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
        --iterator->level;
 }
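
With this early exit, a walk over the shadow page tables stops one step after
visiting a last spte, which is what allows the level-1 large-pte check to be
dropped from shadow_walk_okay() in the previous hunk. Using the
for_each_shadow_entry() macro defined near the top of the file, the loop
shape is:

	for_each_shadow_entry(vcpu, addr, iterator) {
		/* one spte per level; after a last (leaf or large) spte,
		 * shadow_walk_next() zeroes iterator.level and
		 * shadow_walk_okay() terminates the walk */
	}
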
@@ -1600,7 +1531,7 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
 static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
 {
        if (is_large_pte(*sptep)) {
-               drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+               drop_spte(vcpu->kvm, sptep);
                kvm_flush_remote_tlbs(vcpu->kvm);
        }
 }
@@ -1622,38 +1553,38 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                if (child->role.access == direct_access)
                        return;
 
-               mmu_page_remove_parent_pte(child, sptep);
-               __set_spte(sptep, shadow_trap_nonpresent_pte);
+               drop_parent_pte(child, sptep);
                kvm_flush_remote_tlbs(vcpu->kvm);
        }
 }
 
+static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
+                            u64 *spte)
+{
+       u64 pte;
+       struct kvm_mmu_page *child;
+
+       pte = *spte;
+       if (is_shadow_present_pte(pte)) {
+               if (is_last_spte(pte, sp->role.level))
+                       drop_spte(kvm, spte);
+               else {
+                       child = page_header(pte & PT64_BASE_ADDR_MASK);
+                       drop_parent_pte(child, spte);
+               }
+       }
+
+       if (is_large_pte(pte))
+               --kvm->stat.lpages;
+}
+
 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
                                         struct kvm_mmu_page *sp)
 {
        unsigned i;
-       u64 *pt;
-       u64 ent;
-
-       pt = sp->spt;
-
-       for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-               ent = pt[i];
-
-               if (is_shadow_present_pte(ent)) {
-                       if (!is_last_spte(ent, sp->role.level)) {
-                               ent &= PT64_BASE_ADDR_MASK;
-                               mmu_page_remove_parent_pte(page_header(ent),
-                                                          &pt[i]);
-                       } else {
-                               if (is_large_pte(ent))
-                                       --kvm->stat.lpages;
-                               drop_spte(kvm, &pt[i],
-                                         shadow_trap_nonpresent_pte);
-                       }
-               }
-               pt[i] = shadow_trap_nonpresent_pte;
-       }
+
+       for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+               mmu_page_zap_pte(kvm, sp, sp->spt + i);
 }
 
 static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
@@ -1674,20 +1605,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
        u64 *parent_pte;
 
-       while (sp->multimapped || sp->parent_pte) {
-               if (!sp->multimapped)
-                       parent_pte = sp->parent_pte;
-               else {
-                       struct kvm_pte_chain *chain;
-
-                       chain = container_of(sp->parent_ptes.first,
-                                            struct kvm_pte_chain, link);
-                       parent_pte = chain->parent_ptes[0];
-               }
-               BUG_ON(!parent_pte);
-               kvm_mmu_put_page(sp, parent_pte);
-               __set_spte(parent_pte, shadow_trap_nonpresent_pte);
-       }
+       while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL)))
+               drop_parent_pte(sp, parent_pte);
 }
 
 static int mmu_zap_unsync_children(struct kvm *kvm,
@@ -1734,6 +1653,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
                /* Count self */
                ret++;
                list_move(&sp->link, invalid_list);
+               kvm_mod_used_mmu_pages(kvm, -1);
        } else {
                list_move(&sp->link, &kvm->arch.active_mmu_pages);
                kvm_reload_remote_mmus(kvm);
@@ -1757,7 +1677,8 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
        do {
                sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
                WARN_ON(!sp->role.invalid || sp->root_count);
-               kvm_mmu_free_page(kvm, sp);
+               kvm_mmu_isolate_page(sp);
+               kvm_mmu_free_page(sp);
        } while (!list_empty(invalid_list));
 
 }
@@ -1783,8 +1704,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
                        page = container_of(kvm->arch.active_mmu_pages.prev,
                                            struct kvm_mmu_page, link);
                        kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
-                       kvm_mmu_commit_zap_page(kvm, &invalid_list);
                }
+               kvm_mmu_commit_zap_page(kvm, &invalid_list);
                goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
        }
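
Moving kvm_mmu_commit_zap_page() out of the loop (here, and likewise in
__kvm_mmu_free_some_pages further down) batches the teardown: every page
prepared inside the loop lands on invalid_list, and the commit, including its
remote TLB flush, runs once per batch rather than once per zapped page. In
outline:

	while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
		/* pick a victim sp from active_mmu_pages ... */
		kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
	}
	kvm_mmu_commit_zap_page(kvm, &invalid_list);	/* one flush */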
 
@@ -1833,20 +1754,6 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
        __set_bit(slot, sp->slot_bitmap);
 }
 
-static void mmu_convert_notrap(struct kvm_mmu_page *sp)
-{
-       int i;
-       u64 *pt = sp->spt;
-
-       if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
-               return;
-
-       for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-               if (pt[i] == shadow_notrap_nonpresent_pte)
-                       __set_spte(&pt[i], shadow_trap_nonpresent_pte);
-       }
-}
-
 /*
  * The function is based on mtrr_type_lookup() in
  * arch/x86/kernel/cpu/mtrr/generic.c
@@ -1959,7 +1866,6 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
        sp->unsync = 1;
 
        kvm_mmu_mark_parents_unsync(sp);
-       mmu_convert_notrap(sp);
 }
 
 static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
@@ -2002,7 +1908,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 
 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                    unsigned pte_access, int user_fault,
-                   int write_fault, int dirty, int level,
+                   int write_fault, int level,
                    gfn_t gfn, pfn_t pfn, bool speculative,
                    bool can_unsync, bool host_writable)
 {
@@ -2017,8 +1923,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
        spte = PT_PRESENT_MASK;
        if (!speculative)
                spte |= shadow_accessed_mask;
-       if (!dirty)
-               pte_access &= ~ACC_WRITE_MASK;
+
        if (pte_access & ACC_EXEC_MASK)
                spte |= shadow_x_mask;
        else
@@ -2045,15 +1950,24 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                if (level > PT_PAGE_TABLE_LEVEL &&
                    has_wrprotected_page(vcpu->kvm, gfn, level)) {
                        ret = 1;
-                       drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+                       drop_spte(vcpu->kvm, sptep);
                        goto done;
                }
 
                spte |= PT_WRITABLE_MASK;
 
                if (!vcpu->arch.mmu.direct_map
-                   && !(pte_access & ACC_WRITE_MASK))
+                   && !(pte_access & ACC_WRITE_MASK)) {
                        spte &= ~PT_USER_MASK;
+                       /*
+                        * If we converted a user page to a kernel page so
+                        * that the kernel can write to it when cr0.wp=0, we
+                        * should prevent the kernel from executing it if
+                        * SMEP is enabled.
+                        */
+                       if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
+                               spte |= PT64_NX_MASK;
+               }
 
                /*
                 * Optimization: for pte sync, if spte was writable the hash
@@ -2093,8 +2007,8 @@ done:
 
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                         unsigned pt_access, unsigned pte_access,
-                        int user_fault, int write_fault, int dirty,
-                        int *ptwrite, int level, gfn_t gfn,
+                        int user_fault, int write_fault,
+                        int *emulate, int level, gfn_t gfn,
                         pfn_t pfn, bool speculative,
                         bool host_writable)
 {
@@ -2117,23 +2031,22 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                        u64 pte = *sptep;
 
                        child = page_header(pte & PT64_BASE_ADDR_MASK);
-                       mmu_page_remove_parent_pte(child, sptep);
-                       __set_spte(sptep, shadow_trap_nonpresent_pte);
+                       drop_parent_pte(child, sptep);
                        kvm_flush_remote_tlbs(vcpu->kvm);
                } else if (pfn != spte_to_pfn(*sptep)) {
                        pgprintk("hfn old %llx new %llx\n",
                                 spte_to_pfn(*sptep), pfn);
-                       drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+                       drop_spte(vcpu->kvm, sptep);
                        kvm_flush_remote_tlbs(vcpu->kvm);
                } else
                        was_rmapped = 1;
        }
 
        if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
-                     dirty, level, gfn, pfn, speculative, true,
+                     level, gfn, pfn, speculative, true,
                      host_writable)) {
                if (write_fault)
-                       *ptwrite = 1;
+                       *emulate = 1;
                kvm_mmu_flush_tlb(vcpu);
        }
 
@@ -2145,11 +2058,13 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
        if (!was_rmapped && is_large_pte(*sptep))
                ++vcpu->kvm->stat.lpages;
 
-       page_header_update_slot(vcpu->kvm, sptep, gfn);
-       if (!was_rmapped) {
-               rmap_count = rmap_add(vcpu, sptep, gfn);
-               if (rmap_count > RMAP_RECYCLE_THRESHOLD)
-                       rmap_recycle(vcpu, sptep, gfn);
+       if (is_shadow_present_pte(*sptep)) {
+               page_header_update_slot(vcpu->kvm, sptep, gfn);
+               if (!was_rmapped) {
+                       rmap_count = rmap_add(vcpu, sptep, gfn);
+                       if (rmap_count > RMAP_RECYCLE_THRESHOLD)
+                               rmap_recycle(vcpu, sptep, gfn);
+               }
        }
        kvm_release_pfn_clean(pfn);
        if (speculative) {
@@ -2170,8 +2085,8 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
 
        slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
        if (!slot) {
-               get_page(bad_page);
-               return page_to_pfn(bad_page);
+               get_page(fault_page);
+               return page_to_pfn(fault_page);
        }
 
        hva = gfn_to_hva_memslot(slot, gfn);
@@ -2198,7 +2113,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
 
        for (i = 0; i < ret; i++, gfn++, start++)
                mmu_set_spte(vcpu, start, ACC_ALL,
-                            access, 0, 0, 1, NULL,
+                            access, 0, 0, NULL,
                             sp->role.level, gfn,
                             page_to_pfn(pages[i]), true, true);
 
@@ -2217,7 +2132,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
        spte = sp->spt + i;
 
        for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
-               if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
+               if (is_shadow_present_pte(*spte) || spte == sptep) {
                        if (!start)
                                continue;
                        if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
@@ -2254,7 +2169,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 {
        struct kvm_shadow_walk_iterator iterator;
        struct kvm_mmu_page *sp;
-       int pt_write = 0;
+       int emulate = 0;
        gfn_t pseudo_gfn;
 
        for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
@@ -2262,14 +2177,14 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
                        unsigned pte_access = ACC_ALL;
 
                        mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
-                                    0, write, 1, &pt_write,
+                                    0, write, &emulate,
                                     level, gfn, pfn, prefault, map_writable);
                        direct_pte_prefetch(vcpu, iterator.sptep);
                        ++vcpu->stat.pf_fixed;
                        break;
                }
 
-               if (*iterator.sptep == shadow_trap_nonpresent_pte) {
+               if (!is_shadow_present_pte(*iterator.sptep)) {
                        u64 base_addr = iterator.addr;
 
                        base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
@@ -2290,7 +2205,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
                                   | shadow_accessed_mask);
                }
        }
-       return pt_write;
+       return emulate;
 }
 
 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
@@ -2306,15 +2221,17 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
        send_sig_info(SIGBUS, &info, tsk);
 }
 
-static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
+static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gva_t gva,
+                              unsigned access, gfn_t gfn, pfn_t pfn)
 {
        kvm_release_pfn_clean(pfn);
        if (is_hwpoison_pfn(pfn)) {
-               kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current);
+               kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
                return 0;
        } else if (is_fault_pfn(pfn))
                return -EFAULT;
 
+       vcpu_cache_mmio_info(vcpu, gva, gfn, access);
        return 1;
 }
 
@@ -2396,7 +2313,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
 
        /* mmio */
        if (is_error_pfn(pfn))
-               return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
+               return kvm_handle_bad_page(vcpu, v, ACC_ALL, gfn, pfn);
 
        spin_lock(&vcpu->kvm->mmu_lock);
        if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2623,6 +2540,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
        if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
                return;
 
+       vcpu_clear_mmio_info(vcpu, ~0ul);
        trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
        if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
                hpa_t root = vcpu->arch.mmu.root_hpa;
@@ -2769,7 +2687,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 
        /* mmio */
        if (is_error_pfn(pfn))
-               return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
+               return kvm_handle_bad_page(vcpu, 0, 0, gfn, pfn);
        spin_lock(&vcpu->kvm->mmu_lock);
        if (mmu_notifier_retry(vcpu, mmu_seq))
                goto out_unlock;
@@ -2800,7 +2718,6 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu,
        context->page_fault = nonpaging_page_fault;
        context->gva_to_gpa = nonpaging_gva_to_gpa;
        context->free = nonpaging_free;
-       context->prefetch_page = nonpaging_prefetch_page;
        context->sync_page = nonpaging_sync_page;
        context->invlpg = nonpaging_invlpg;
        context->update_pte = nonpaging_update_pte;
@@ -2930,7 +2847,6 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
        context->new_cr3 = paging_new_cr3;
        context->page_fault = paging64_page_fault;
        context->gva_to_gpa = paging64_gva_to_gpa;
-       context->prefetch_page = paging64_prefetch_page;
        context->sync_page = paging64_sync_page;
        context->invlpg = paging64_invlpg;
        context->update_pte = paging64_update_pte;
@@ -2959,7 +2875,6 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
        context->page_fault = paging32_page_fault;
        context->gva_to_gpa = paging32_gva_to_gpa;
        context->free = paging_free;
-       context->prefetch_page = paging32_prefetch_page;
        context->sync_page = paging32_sync_page;
        context->invlpg = paging32_invlpg;
        context->update_pte = paging32_update_pte;
@@ -2984,7 +2899,6 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
        context->new_cr3 = nonpaging_new_cr3;
        context->page_fault = tdp_page_fault;
        context->free = nonpaging_free;
-       context->prefetch_page = nonpaging_prefetch_page;
        context->sync_page = nonpaging_sync_page;
        context->invlpg = nonpaging_invlpg;
        context->update_pte = nonpaging_update_pte;
@@ -3023,6 +2937,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
 {
        int r;
+       bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
        ASSERT(vcpu);
        ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
@@ -3037,6 +2952,8 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
 
        vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
        vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
+       vcpu->arch.mmu.base_role.smep_andnot_wp
+               = smep && !is_write_protection(vcpu);
 
        return r;
 }
@@ -3141,27 +3058,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unload);
 
-static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
-                                 struct kvm_mmu_page *sp,
-                                 u64 *spte)
-{
-       u64 pte;
-       struct kvm_mmu_page *child;
-
-       pte = *spte;
-       if (is_shadow_present_pte(pte)) {
-               if (is_last_spte(pte, sp->role.level))
-                       drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
-               else {
-                       child = page_header(pte & PT64_BASE_ADDR_MASK);
-                       mmu_page_remove_parent_pte(child, spte);
-               }
-       }
-       __set_spte(spte, shadow_trap_nonpresent_pte);
-       if (is_large_pte(pte))
-               --vcpu->kvm->stat.lpages;
-}
-
 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
                                  struct kvm_mmu_page *sp, u64 *spte,
                                  const void *new)
@@ -3233,6 +3129,13 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
        int level, npte, invlpg_counter, r, flooded = 0;
        bool remote_flush, local_flush, zap_page;
 
+       /*
+        * If we don't have indirect shadow pages, it means no page is
+        * write-protected, so we can simply exit.
+        */
+       if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
+               return;
+
        zap_page = remote_flush = local_flush = false;
        offset = offset_in_page(gpa);
 
@@ -3336,7 +3239,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                spte = &sp->spt[page_offset / sizeof(*spte)];
                while (npte--) {
                        entry = *spte;
-                       mmu_pte_write_zap_pte(vcpu, sp, spte);
+                       mmu_page_zap_pte(vcpu->kvm, sp, spte);
                        if (gentry &&
                              !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
                              & mask.word))
@@ -3380,9 +3283,9 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
                sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
                                  struct kvm_mmu_page, link);
                kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
-               kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
                ++vcpu->kvm->stat.mmu_recycled;
        }
+       kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 }
 
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
@@ -3506,8 +3409,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
                                continue;
 
                        if (is_large_pte(pt[i])) {
-                               drop_spte(kvm, &pt[i],
-                                         shadow_trap_nonpresent_pte);
+                               drop_spte(kvm, &pt[i]);
                                --kvm->stat.lpages;
                                continue;
                        }
@@ -3590,25 +3492,18 @@ static struct shrinker mmu_shrinker = {
 
 static void mmu_destroy_caches(void)
 {
-       if (pte_chain_cache)
-               kmem_cache_destroy(pte_chain_cache);
-       if (rmap_desc_cache)
-               kmem_cache_destroy(rmap_desc_cache);
+       if (pte_list_desc_cache)
+               kmem_cache_destroy(pte_list_desc_cache);
        if (mmu_page_header_cache)
                kmem_cache_destroy(mmu_page_header_cache);
 }
 
 int kvm_mmu_module_init(void)
 {
-       pte_chain_cache = kmem_cache_create("kvm_pte_chain",
-                                           sizeof(struct kvm_pte_chain),
-                                           0, 0, NULL);
-       if (!pte_chain_cache)
-               goto nomem;
-       rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
-                                           sizeof(struct kvm_rmap_desc),
+       pte_list_desc_cache = kmem_cache_create("pte_list_desc",
+                                           sizeof(struct pte_list_desc),
                                            0, 0, NULL);
-       if (!rmap_desc_cache)
+       if (!pte_list_desc_cache)
                goto nomem;
 
        mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",