arch/x86/kvm/mmu.c

   1 /*
   2  * Kernel-based Virtual Machine driver for Linux
   3  *
   4  * This module enables machines with Intel VT-x extensions to run virtual
   5  * machines without emulation or binary translation.
   6  *
   7  * MMU support
   8  *
   9  * Copyright (C) 2006 Qumranet, Inc.
  10  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  11  *
  12  * Authors:
  13  *   Yaniv Kamay  <yaniv@qumranet.com>
  14  *   Avi Kivity   <avi@qumranet.com>
  15  *
  16  * This work is licensed under the terms of the GNU GPL, version 2.  See
  17  * the COPYING file in the top-level directory.
  18  *
  19  */
  20
  21 #include "irq.h"
  22 #include "mmu.h"
  23 #include "x86.h"
  24 #include "kvm_cache_regs.h"
  25 #include "x86.h"
  26
  27 #include <linux/kvm_host.h>
  28 #include <linux/types.h>
  29 #include <linux/string.h>
  30 #include <linux/mm.h>
  31 #include <linux/highmem.h>
  32 #include <linux/module.h>
  33 #include <linux/swap.h>
  34 #include <linux/hugetlb.h>
  35 #include <linux/compiler.h>
  36 #include <linux/srcu.h>
  37 #include <linux/slab.h>
  38 #include <linux/uaccess.h>
  39
  40 #include <asm/page.h>
  41 #include <asm/cmpxchg.h>
  42 #include <asm/io.h>
  43 #include <asm/vmx.h>
  44
  45 /*
  46  * When setting this variable to true it enables Two-Dimensional-Paging
  47  * where the hardware walks 2 page tables:
  48  * 1. the guest-virtual to guest-physical
  49  * 2. while doing 1. it walks guest-physical to host-physical
  50  * If the hardware supports that we don't need to do shadow paging.
  51  */
  52 bool tdp_enabled = false;
  53
  54 enum {
  55         AUDIT_PRE_PAGE_FAULT,
  56         AUDIT_POST_PAGE_FAULT,
  57         AUDIT_PRE_PTE_WRITE,
  58         AUDIT_POST_PTE_WRITE,
  59         AUDIT_PRE_SYNC,
  60         AUDIT_POST_SYNC
  61 };
  62
  63 char *audit_point_name[] = {
  64         "pre page fault",
  65         "post page fault",
  66         "pre pte write",
  67         "post pte write",
  68         "pre sync",
  69         "post sync"
  70 };
  71
  72 #undef MMU_DEBUG
  73
  74 #ifdef MMU_DEBUG
  75
  76 #define pgprintk(x...) do { if (dbg) printk(x); } while (0)
  77 #define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
  78
  79 #else
  80
  81 #define pgprintk(x...) do { } while (0)
  82 #define rmap_printk(x...) do { } while (0)
  83
  84 #endif
  85
  86 #ifdef MMU_DEBUG
  87 static int dbg = 0;
  88 module_param(dbg, bool, 0644);
  89 #endif
  90
  91 static int oos_shadow = 1;
  92 module_param(oos_shadow, bool, 0644);
  93
  94 #ifndef MMU_DEBUG
  95 #define ASSERT(x) do { } while (0)
  96 #else
  97 #define ASSERT(x)                                                       \
  98         if (!(x)) {                                                     \
  99                 printk(KERN_WARNING "assertion failed %s:%d: %s\n",     \
 100                        __FILE__, __LINE__, #x);                         \
 101         }
 102 #endif
 103
 104 #define PTE_PREFETCH_NUM                8
 105
 106 #define PT_FIRST_AVAIL_BITS_SHIFT 9
 107 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
 108
 109 #define PT64_LEVEL_BITS 9
 110
 111 #define PT64_LEVEL_SHIFT(level) \
 112                 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
 113
 114 #define PT64_LEVEL_MASK(level) \
 115                 (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
 116
 117 #define PT64_INDEX(address, level)\
 118         (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
 119
 120
 121 #define PT32_LEVEL_BITS 10
 122
 123 #define PT32_LEVEL_SHIFT(level) \
 124                 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
 125
 126 #define PT32_LEVEL_MASK(level) \
 127                 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
 128 #define PT32_LVL_OFFSET_MASK(level) \
 129         (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
 130                                                 * PT32_LEVEL_BITS))) - 1))
 131
 132 #define PT32_INDEX(address, level)\
 133         (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
 134
 135
 136 #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
 137 #define PT64_DIR_BASE_ADDR_MASK \
 138         (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
 139 #define PT64_LVL_ADDR_MASK(level) \
 140         (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
 141                                                 * PT64_LEVEL_BITS))) - 1))
 142 #define PT64_LVL_OFFSET_MASK(level) \
 143         (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
 144                                                 * PT64_LEVEL_BITS))) - 1))
 145
 146 #define PT32_BASE_ADDR_MASK PAGE_MASK
 147 #define PT32_DIR_BASE_ADDR_MASK \
 148         (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
 149 #define PT32_LVL_ADDR_MASK(level) \
 150         (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
 151                                             * PT32_LEVEL_BITS))) - 1))
 152
 153 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
 154                         | PT64_NX_MASK)
 155
 156 #define RMAP_EXT 4
 157
 158 #define ACC_EXEC_MASK    1
 159 #define ACC_WRITE_MASK   PT_WRITABLE_MASK
 160 #define ACC_USER_MASK    PT_USER_MASK
 161 #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
 162
 163 #include <trace/events/kvm.h>
 164
 165 #define CREATE_TRACE_POINTS
 166 #include "mmutrace.h"
 167
 168 #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
 169
 170 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 171
 172 struct kvm_rmap_desc {
 173         u64 *sptes[RMAP_EXT];
 174         struct kvm_rmap_desc *more;
 175 };
 176
 177 struct kvm_shadow_walk_iterator {
 178         u64 addr;
 179         hpa_t shadow_addr;
 180         int level;
 181         u64 *sptep;
 182         unsigned index;
 183 };
 184
 185 #define for_each_shadow_entry(_vcpu, _addr, _walker)    \
 186         for (shadow_walk_init(&(_walker), _vcpu, _addr);        \
 187              shadow_walk_okay(&(_walker));                      \
 188              shadow_walk_next(&(_walker)))
 189
 190 typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
 191
 192 static struct kmem_cache *pte_chain_cache;
 193 static struct kmem_cache *rmap_desc_cache;
 194 static struct kmem_cache *mmu_page_header_cache;
 195 static struct percpu_counter kvm_total_used_mmu_pages;
 196
 197 static u64 __read_mostly shadow_trap_nonpresent_pte;
 198 static u64 __read_mostly shadow_notrap_nonpresent_pte;
 199 static u64 __read_mostly shadow_base_present_pte;
 200 static u64 __read_mostly shadow_nx_mask;
 201 static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
 202 static u64 __read_mostly shadow_user_mask;
 203 static u64 __read_mostly shadow_accessed_mask;
 204 static u64 __read_mostly shadow_dirty_mask;
 205
 206 static inline u64 rsvd_bits(int s, int e)
 207 {
 208         return ((1ULL << (e - s + 1)) - 1) << s;
 209 }
 210
 211 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
 212 {
 213         shadow_trap_nonpresent_pte = trap_pte;
 214         shadow_notrap_nonpresent_pte = notrap_pte;
 215 }
 216 EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
 217
 218 void kvm_mmu_set_base_ptes(u64 base_pte)
 219 {
 220         shadow_base_present_pte = base_pte;
 221 }
 222 EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
 223
 224 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 225                 u64 dirty_mask, u64 nx_mask, u64 x_mask)
 226 {
 227         shadow_user_mask = user_mask;
 228         shadow_accessed_mask = accessed_mask;
 229         shadow_dirty_mask = dirty_mask;
 230         shadow_nx_mask = nx_mask;
 231         shadow_x_mask = x_mask;
 232 }
 233 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 234
 235 static bool is_write_protection(struct kvm_vcpu *vcpu)
 236 {
 237         return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
 238 }
 239
 240 static int is_cpuid_PSE36(void)
 241 {
 242         return 1;
 243 }
 244
 245 static int is_nx(struct kvm_vcpu *vcpu)
 246 {
 247         return vcpu->arch.efer & EFER_NX;
 248 }
 249
 250 static int is_shadow_present_pte(u64 pte)
 251 {
 252         return pte != shadow_trap_nonpresent_pte
 253                 && pte != shadow_notrap_nonpresent_pte;
 254 }
 255
 256 static int is_large_pte(u64 pte)
 257 {
 258         return pte & PT_PAGE_SIZE_MASK;
 259 }
 260
 261 static int is_writable_pte(unsigned long pte)
 262 {
 263         return pte & PT_WRITABLE_MASK;
 264 }
 265
 266 static int is_dirty_gpte(unsigned long pte)
 267 {
 268         return pte & PT_DIRTY_MASK;
 269 }
 270
 271 static int is_rmap_spte(u64 pte)
 272 {
 273         return is_shadow_present_pte(pte);
 274 }
 275
 276 static int is_last_spte(u64 pte, int level)
 277 {
 278         if (level == PT_PAGE_TABLE_LEVEL)
 279                 return 1;
 280         if (is_large_pte(pte))
 281                 return 1;
 282         return 0;
 283 }
 284
 285 static pfn_t spte_to_pfn(u64 pte)
 286 {
 287         return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
 288 }
 289
 290 static gfn_t pse36_gfn_delta(u32 gpte)
 291 {
 292         int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
 293
 294         return (gpte & PT32_DIR_PSE36_MASK) << shift;
 295 }
 296
 297 static void __set_spte(u64 *sptep, u64 spte)
 298 {
 299         set_64bit(sptep, spte);
 300 }
 301
 302 static u64 __xchg_spte(u64 *sptep, u64 new_spte)
 303 {
 304 #ifdef CONFIG_X86_64
 305         return xchg(sptep, new_spte);
 306 #else
 307         u64 old_spte;
 308
 309         do {
 310                 old_spte = *sptep;
 311         } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
 312
 313         return old_spte;
 314 #endif
 315 }
 316
 317 static bool spte_has_volatile_bits(u64 spte)
 318 {
 319         if (!shadow_accessed_mask)
 320                 return false;
 321
 322         if (!is_shadow_present_pte(spte))
 323                 return false;
 324
 325         if ((spte & shadow_accessed_mask) &&
 326               (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
 327                 return false;
 328
 329         return true;
 330 }
 331
 332 static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
 333 {
 334         return (old_spte & bit_mask) && !(new_spte & bit_mask);
 335 }
 336
 337 static void update_spte(u64 *sptep, u64 new_spte)
 338 {
 339         u64 mask, old_spte = *sptep;
 340
 341         WARN_ON(!is_rmap_spte(new_spte));
 342
 343         new_spte |= old_spte & shadow_dirty_mask;
 344
 345         mask = shadow_accessed_mask;
 346         if (is_writable_pte(old_spte))
 347                 mask |= shadow_dirty_mask;
 348
 349         if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
 350                 __set_spte(sptep, new_spte);
 351         else
 352                 old_spte = __xchg_spte(sptep, new_spte);
 353
 354         if (!shadow_accessed_mask)
 355                 return;
 356
 357         if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
 358                 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
 359         if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
 360                 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
 361 }
 362
 363 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 364                                   struct kmem_cache *base_cache, int min)
 365 {
 366         void *obj;
 367
 368         if (cache->nobjs >= min)
 369                 return 0;
 370         while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
 371                 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
 372                 if (!obj)
 373                         return -ENOMEM;
 374                 cache->objects[cache->nobjs++] = obj;
 375         }
 376         return 0;
 377 }
 378
 379 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
 380                                   struct kmem_cache *cache)
 381 {
 382         while (mc->nobjs)
 383                 kmem_cache_free(cache, mc->objects[--mc->nobjs]);
 384 }
 385
 386 static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
 387                                        int min)
 388 {
 389         struct page *page;
 390
 391         if (cache->nobjs >= min)
 392                 return 0;
 393         while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
 394                 page = alloc_page(GFP_KERNEL);
 395                 if (!page)
 396                         return -ENOMEM;
 397                 cache->objects[cache->nobjs++] = page_address(page);
 398         }
 399         return 0;
 400 }
 401
 402 static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
 403 {
 404         while (mc->nobjs)
 405                 free_page((unsigned long)mc->objects[--mc->nobjs]);
 406 }
 407
 408 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
 409 {
 410         int r;
 411
 412         r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
 413                                    pte_chain_cache, 4);
 414         if (r)
 415                 goto out;
 416         r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
 417                                    rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
 418         if (r)
 419                 goto out;
 420         r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
 421         if (r)
 422                 goto out;
 423         r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
 424                                    mmu_page_header_cache, 4);
 425 out:
 426         return r;
 427 }
 428
 429 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 430 {
 431         mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
 432         mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
 433         mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
 434         mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
 435                                 mmu_page_header_cache);
 436 }
 437
 438 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
 439                                     size_t size)
 440 {
 441         void *p;
 442
 443         BUG_ON(!mc->nobjs);
 444         p = mc->objects[--mc->nobjs];
 445         return p;
 446 }
 447
 448 static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
 449 {
 450         return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
 451                                       sizeof(struct kvm_pte_chain));
 452 }
 453
 454 static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
 455 {
 456         kmem_cache_free(pte_chain_cache, pc);
 457 }
 458
 459 static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
 460 {
 461         return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
 462                                       sizeof(struct kvm_rmap_desc));
 463 }
 464
 465 static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
 466 {
 467         kmem_cache_free(rmap_desc_cache, rd);
 468 }
 469
 470 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
 471 {
 472         if (!sp->role.direct)
 473                 return sp->gfns[index];
 474
 475         return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
 476 }
 477
 478 static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
 479 {
 480         if (sp->role.direct)
 481                 BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
 482         else
 483                 sp->gfns[index] = gfn;
 484 }
 485
 486 /*
 487  * Return the pointer to the largepage write count for a given
 488  * gfn, handling slots that are not large page aligned.
 489  */
 490 static int *slot_largepage_idx(gfn_t gfn,
 491                                struct kvm_memory_slot *slot,
 492                                int level)
 493 {
 494         unsigned long idx;
 495
 496         idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
 497               (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
 498         return &slot->lpage_info[level - 2][idx].write_count;
 499 }
 500
 501 static void account_shadowed(struct kvm *kvm, gfn_t gfn)
 502 {
 503         struct kvm_memory_slot *slot;
 504         int *write_count;
 505         int i;
 506
 507         slot = gfn_to_memslot(kvm, gfn);
 508         for (i = PT_DIRECTORY_LEVEL;
 509              i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
 510                 write_count   = slot_largepage_idx(gfn, slot, i);
 511                 *write_count += 1;
 512         }
 513 }
 514
 515 static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
 516 {
 517         struct kvm_memory_slot *slot;
 518         int *write_count;
 519         int i;
 520
 521         slot = gfn_to_memslot(kvm, gfn);
 522         for (i = PT_DIRECTORY_LEVEL;
 523              i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
 524                 write_count   = slot_largepage_idx(gfn, slot, i);
 525                 *write_count -= 1;
 526                 WARN_ON(*write_count < 0);
 527         }
 528 }
 529
 530 static int has_wrprotected_page(struct kvm *kvm,
 531                                 gfn_t gfn,
 532                                 int level)
 533 {
 534         struct kvm_memory_slot *slot;
 535         int *largepage_idx;
 536
 537         slot = gfn_to_memslot(kvm, gfn);
 538         if (slot) {
 539                 largepage_idx = slot_largepage_idx(gfn, slot, level);
 540                 return *largepage_idx;
 541         }
 542
 543         return 1;
 544 }
 545
 546 static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
 547 {
 548         unsigned long page_size;
 549         int i, ret = 0;
 550
 551         page_size = kvm_host_page_size(kvm, gfn);
 552
 553         for (i = PT_PAGE_TABLE_LEVEL;
 554              i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
 555                 if (page_size >= KVM_HPAGE_SIZE(i))
 556                         ret = i;
 557                 else
 558                         break;
 559         }
 560
 561         return ret;
 562 }
 563
 564 static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
 565 {
 566         struct kvm_memory_slot *slot;
 567         int host_level, level, max_level;
 568
 569         slot = gfn_to_memslot(vcpu->kvm, large_gfn);
 570         if (slot && slot->dirty_bitmap)
 571                 return PT_PAGE_TABLE_LEVEL;
 572
 573         host_level = host_mapping_level(vcpu->kvm, large_gfn);
 574
 575         if (host_level == PT_PAGE_TABLE_LEVEL)
 576                 return host_level;
 577
 578         max_level = kvm_x86_ops->get_lpage_level() < host_level ?
 579                 kvm_x86_ops->get_lpage_level() : host_level;
 580
 581         for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
 582                 if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
 583                         break;
 584
 585         return level - 1;
 586 }
 587
 588 /*
 589  * Take gfn and return the reverse mapping to it.
 590  */
 591
 592 static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
 593 {
 594         struct kvm_memory_slot *slot;
 595         unsigned long idx;
 596
 597         slot = gfn_to_memslot(kvm, gfn);
 598         if (likely(level == PT_PAGE_TABLE_LEVEL))
 599                 return &slot->rmap[gfn - slot->base_gfn];
 600
 601         idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
 602                 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
 603
 604         return &slot->lpage_info[level - 2][idx].rmap_pde;
 605 }
 606
 607 /*
 608  * Reverse mapping data structures:
 609  *
 610  * If rmapp bit zero is zero, then rmapp point to the shadw page table entry
 611  * that points to page_address(page).
 612  *
 613  * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
 614  * containing more mappings.
 615  *
 616  * Returns the number of rmap entries before the spte was added or zero if
 617  * the spte was not added.
 618  *
 619  */
 620 static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 621 {
 622         struct kvm_mmu_page *sp;
 623         struct kvm_rmap_desc *desc;
 624         unsigned long *rmapp;
 625         int i, count = 0;
 626
 627         if (!is_rmap_spte(*spte))
 628                 return count;
 629         sp = page_header(__pa(spte));
 630         kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
 631         rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
 632         if (!*rmapp) {
 633                 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
 634                 *rmapp = (unsigned long)spte;
 635         } else if (!(*rmapp & 1)) {
 636                 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
 637                 desc = mmu_alloc_rmap_desc(vcpu);
 638                 desc->sptes[0] = (u64 *)*rmapp;
 639                 desc->sptes[1] = spte;
 640                 *rmapp = (unsigned long)desc | 1;
 641                 ++count;
 642         } else {
 643                 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
 644                 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
 645                 while (desc->sptes[RMAP_EXT-1] && desc->more) {
 646                         desc = desc->more;
 647                         count += RMAP_EXT;
 648                 }
 649                 if (desc->sptes[RMAP_EXT-1]) {
 650                         desc->more = mmu_alloc_rmap_desc(vcpu);
 651                         desc = desc->more;
 652                 }
 653                 for (i = 0; desc->sptes[i]; ++i)
 654                         ++count;
 655                 desc->sptes[i] = spte;
 656         }
 657         return count;
 658 }
 659
 660 static void rmap_desc_remove_entry(unsigned long *rmapp,
 661                                    struct kvm_rmap_desc *desc,
 662                                    int i,
 663                                    struct kvm_rmap_desc *prev_desc)
 664 {
 665         int j;
 666
 667         for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j)
 668                 ;
 669         desc->sptes[i] = desc->sptes[j];
 670         desc->sptes[j] = NULL;
 671         if (j != 0)
 672                 return;
 673         if (!prev_desc && !desc->more)
 674                 *rmapp = (unsigned long)desc->sptes[0];
 675         else
 676                 if (prev_desc)
 677                         prev_desc->more = desc->more;
 678                 else
 679                         *rmapp = (unsigned long)desc->more | 1;
 680         mmu_free_rmap_desc(desc);
 681 }
 682
 683 static void rmap_remove(struct kvm *kvm, u64 *spte)
 684 {
 685         struct kvm_rmap_desc *desc;
 686         struct kvm_rmap_desc *prev_desc;
 687         struct kvm_mmu_page *sp;
 688         gfn_t gfn;
 689         unsigned long *rmapp;
 690         int i;
 691
 692         sp = page_header(__pa(spte));
 693         gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
 694         rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
 695         if (!*rmapp) {
 696                 printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte);
 697                 BUG();
 698         } else if (!(*rmapp & 1)) {
 699                 rmap_printk("rmap_remove:  %p 1->0\n", spte);
 700                 if ((u64 *)*rmapp != spte) {
 701                         printk(KERN_ERR "rmap_remove:  %p 1->BUG\n", spte);
 702                         BUG();
 703                 }
 704                 *rmapp = 0;
 705         } else {
 706                 rmap_printk("rmap_remove:  %p many->many\n", spte);
 707                 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
 708                 prev_desc = NULL;
 709                 while (desc) {
 710                         for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i)
 711                                 if (desc->sptes[i] == spte) {
 712                                         rmap_desc_remove_entry(rmapp,
 713                                                                desc, i,
 714                                                                prev_desc);
 715                                         return;
 716                                 }
 717                         prev_desc = desc;
 718                         desc = desc->more;
 719                 }
 720                 pr_err("rmap_remove: %p many->many\n", spte);
 721                 BUG();
 722         }
 723 }
 724
 725 static int set_spte_track_bits(u64 *sptep, u64 new_spte)
 726 {
 727         pfn_t pfn;
 728         u64 old_spte = *sptep;
 729
 730         if (!spte_has_volatile_bits(old_spte))
 731                 __set_spte(sptep, new_spte);
 732         else
 733                 old_spte = __xchg_spte(sptep, new_spte);
 734
 735         if (!is_rmap_spte(old_spte))
 736                 return 0;
 737
 738         pfn = spte_to_pfn(old_spte);
 739         if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
 740                 kvm_set_pfn_accessed(pfn);
 741         if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
 742                 kvm_set_pfn_dirty(pfn);
 743         return 1;
 744 }
 745
 746 static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
 747 {
 748         if (set_spte_track_bits(sptep, new_spte))
 749                 rmap_remove(kvm, sptep);
 750 }
 751
 752 static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
 753 {
 754         struct kvm_rmap_desc *desc;
 755         u64 *prev_spte;
 756         int i;
 757
 758         if (!*rmapp)
 759                 return NULL;
 760         else if (!(*rmapp & 1)) {
 761                 if (!spte)
 762                         return (u64 *)*rmapp;
 763                 return NULL;
 764         }
 765         desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
 766         prev_spte = NULL;
 767         while (desc) {
 768                 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
 769                         if (prev_spte == spte)
 770                                 return desc->sptes[i];
 771                         prev_spte = desc->sptes[i];
 772                 }
 773                 desc = desc->more;
 774         }
 775         return NULL;
 776 }
 777
 778 static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 779 {
 780         unsigned long *rmapp;
 781         u64 *spte;
 782         int i, write_protected = 0;
 783
 784         rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
 785
 786         spte = rmap_next(kvm, rmapp, NULL);
 787         while (spte) {
 788                 BUG_ON(!spte);
 789                 BUG_ON(!(*spte & PT_PRESENT_MASK));
 790                 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
 791                 if (is_writable_pte(*spte)) {
 792                         update_spte(spte, *spte & ~PT_WRITABLE_MASK);
 793                         write_protected = 1;
 794                 }
 795                 spte = rmap_next(kvm, rmapp, spte);
 796         }
 797
 798         /* check for huge page mappings */
 799         for (i = PT_DIRECTORY_LEVEL;
 800              i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
 801                 rmapp = gfn_to_rmap(kvm, gfn, i);
 802                 spte = rmap_next(kvm, rmapp, NULL);
 803                 while (spte) {
 804                         BUG_ON(!spte);
 805                         BUG_ON(!(*spte & PT_PRESENT_MASK));
 806                         BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
 807                         pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
 808                         if (is_writable_pte(*spte)) {
 809                                 drop_spte(kvm, spte,
 810                                           shadow_trap_nonpresent_pte);
 811                                 --kvm->stat.lpages;
 812                                 spte = NULL;
 813                                 write_protected = 1;
 814                         }
 815                         spte = rmap_next(kvm, rmapp, spte);
 816                 }
 817         }
 818
 819         return write_protected;
 820 }
 821
 822 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 823                            unsigned long data)
 824 {
 825         u64 *spte;
 826         int need_tlb_flush = 0;
 827
 828         while ((spte = rmap_next(kvm, rmapp, NULL))) {
 829                 BUG_ON(!(*spte & PT_PRESENT_MASK));
 830                 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
 831                 drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
 832                 need_tlb_flush = 1;
 833         }
 834         return need_tlb_flush;
 835 }
 836
 837 static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 838                              unsigned long data)
 839 {
 840         int need_flush = 0;
 841         u64 *spte, new_spte;
 842         pte_t *ptep = (pte_t *)data;
 843         pfn_t new_pfn;
 844
 845         WARN_ON(pte_huge(*ptep));
 846         new_pfn = pte_pfn(*ptep);
 847         spte = rmap_next(kvm, rmapp, NULL);
 848         while (spte) {
 849                 BUG_ON(!is_shadow_present_pte(*spte));
 850                 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
 851                 need_flush = 1;
 852                 if (pte_write(*ptep)) {
 853                         drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
 854                         spte = rmap_next(kvm, rmapp, NULL);
 855                 } else {
 856                         new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
 857                         new_spte |= (u64)new_pfn << PAGE_SHIFT;
 858
 859                         new_spte &= ~PT_WRITABLE_MASK;
 860                         new_spte &= ~SPTE_HOST_WRITEABLE;
 861                         new_spte &= ~shadow_accessed_mask;
 862                         set_spte_track_bits(spte, new_spte);
 863                         spte = rmap_next(kvm, rmapp, spte);
 864                 }
 865         }
 866         if (need_flush)
 867                 kvm_flush_remote_tlbs(kvm);
 868
 869         return 0;
 870 }
 871
 872 static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
 873                           unsigned long data,
 874                           int (*handler)(struct kvm *kvm, unsigned long *rmapp,
 875                                          unsigned long data))
 876 {
 877         int i, j;
 878         int ret;
 879         int retval = 0;
 880         struct kvm_memslots *slots;
 881
 882         slots = kvm_memslots(kvm);
 883
 884         for (i = 0; i < slots->nmemslots; i++) {
 885                 struct kvm_memory_slot *memslot = &slots->memslots[i];
 886                 unsigned long start = memslot->userspace_addr;
 887                 unsigned long end;
 888
 889                 end = start + (memslot->npages << PAGE_SHIFT);
 890                 if (hva >= start && hva < end) {
 891                         gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
 892
 893                         ret = handler(kvm, &memslot->rmap[gfn_offset], data);
 894
 895                         for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
 896                                 unsigned long idx;
 897                                 int sh;
 898
 899                                 sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j);
 900                                 idx = ((memslot->base_gfn+gfn_offset) >> sh) -
 901                                         (memslot->base_gfn >> sh);
 902                                 ret |= handler(kvm,
 903                                         &memslot->lpage_info[j][idx].rmap_pde,
 904                                         data);
 905                         }
 906                         trace_kvm_age_page(hva, memslot, ret);
 907                         retval |= ret;
 908                 }
 909         }
 910
 911         return retval;
 912 }
 913
 914 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
 915 {
 916         return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
 917 }
 918
 919 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
 920 {
 921         kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
 922 }
 923
 924 static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 925                          unsigned long data)
 926 {
 927         u64 *spte;
 928         int young = 0;
 929
 930         /*
 931          * Emulate the accessed bit for EPT, by checking if this page has
 932          * an EPT mapping, and clearing it if it does. On the next access,
 933          * a new EPT mapping will be established.
 934          * This has some overhead, but not as much as the cost of swapping
 935          * out actively used pages or breaking up actively used hugepages.
 936          */
 937         if (!shadow_accessed_mask)
 938                 return kvm_unmap_rmapp(kvm, rmapp, data);
 939
 940         spte = rmap_next(kvm, rmapp, NULL);
 941         while (spte) {
 942                 int _young;
 943                 u64 _spte = *spte;
 944                 BUG_ON(!(_spte & PT_PRESENT_MASK));
 945                 _young = _spte & PT_ACCESSED_MASK;
 946                 if (_young) {
 947                         young = 1;
 948                         clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
 949                 }
 950                 spte = rmap_next(kvm, rmapp, spte);
 951         }
 952         return young;
 953 }
 954
 955 #define RMAP_RECYCLE_THRESHOLD 1000
 956
 957 static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 958 {
 959         unsigned long *rmapp;
 960         struct kvm_mmu_page *sp;
 961
 962         sp = page_header(__pa(spte));
 963
 964         rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
 965
 966         kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
 967         kvm_flush_remote_tlbs(vcpu->kvm);
 968 }
 969
 970 int kvm_age_hva(struct kvm *kvm, unsigned long hva)
 971 {
 972         return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
 973 }
 974
 975 #ifdef MMU_DEBUG
 976 static int is_empty_shadow_page(u64 *spt)
 977 {
 978         u64 *pos;
 979         u64 *end;
 980
 981         for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
 982                 if (is_shadow_present_pte(*pos)) {
 983                         printk(KERN_ERR "%s: %p %llx\n", __func__,
 984                                pos, *pos);
 985                         return 0;
 986                 }
 987         return 1;
 988 }
 989 #endif
 990
 991 /*
 992  * This value is the sum of all of the kvm instances's
 993  * kvm->arch.n_used_mmu_pages values.  We need a global,
 994  * aggregate version in order to make the slab shrinker
 995  * faster
 996  */
 997 static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
 998 {
 999         kvm->arch.n_used_mmu_pages += nr;
1000         percpu_counter_add(&kvm_total_used_mmu_pages, nr);
1001 }
1002
1003 static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1004 {
1005         ASSERT(is_empty_shadow_page(sp->spt));
1006         hlist_del(&sp->hash_link);
1007         list_del(&sp->link);
1008         __free_page(virt_to_page(sp->spt));
1009         if (!sp->role.direct)
1010                 __free_page(virt_to_page(sp->gfns));
1011         kmem_cache_free(mmu_page_header_cache, sp);
1012         kvm_mod_used_mmu_pages(kvm, -1);
1013 }
1014
1015 static unsigned kvm_page_table_hashfn(gfn_t gfn)
1016 {
1017         return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
1018 }
1019
1020 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
1021                                                u64 *parent_pte, int direct)
1022 {
1023         struct kvm_mmu_page *sp;
1024
1025         sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
1026         sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
1027         if (!direct)
1028                 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
1029                                                   PAGE_SIZE);
1030         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
1031         list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
1032         bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
1033         sp->multimapped = 0;
1034         sp->parent_pte = parent_pte;
1035         kvm_mod_used_mmu_pages(vcpu->kvm, +1);
1036         return sp;
1037 }
1038
1039 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
1040                                     struct kvm_mmu_page *sp, u64 *parent_pte)
1041 {
1042         struct kvm_pte_chain *pte_chain;
1043         struct hlist_node *node;
1044         int i;
1045
1046         if (!parent_pte)
1047                 return;
1048         if (!sp->multimapped) {
1049                 u64 *old = sp->parent_pte;
1050
1051                 if (!old) {
1052                         sp->parent_pte = parent_pte;
1053                         return;
1054                 }
1055                 sp->multimapped = 1;
1056                 pte_chain = mmu_alloc_pte_chain(vcpu);
1057                 INIT_HLIST_HEAD(&sp->parent_ptes);
1058                 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
1059                 pte_chain->parent_ptes[0] = old;
1060         }
1061         hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
1062                 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
1063                         continue;
1064                 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
1065                         if (!pte_chain->parent_ptes[i]) {
1066                                 pte_chain->parent_ptes[i] = parent_pte;
1067                                 return;
1068                         }
1069         }
1070         pte_chain = mmu_alloc_pte_chain(vcpu);
1071         BUG_ON(!pte_chain);
1072         hlist_add_head(&pte_chain->link, &sp->parent_ptes);
1073         pte_chain->parent_ptes[0] = parent_pte;
1074 }
1075
1076 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
1077                                        u64 *parent_pte)
1078 {
1079         struct kvm_pte_chain *pte_chain;
1080         struct hlist_node *node;
1081         int i;
1082
1083         if (!sp->multimapped) {
1084                 BUG_ON(sp->parent_pte != parent_pte);
1085                 sp->parent_pte = NULL;
1086                 return;
1087         }
1088         hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1089                 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1090                         if (!pte_chain->parent_ptes[i])
1091                                 break;
1092                         if (pte_chain->parent_ptes[i] != parent_pte)
1093                                 continue;
1094                         while (i + 1 < NR_PTE_CHAIN_ENTRIES
1095                                 && pte_chain->parent_ptes[i + 1]) {
1096                                 pte_chain->parent_ptes[i]
1097                                         = pte_chain->parent_ptes[i + 1];
1098                                 ++i;
1099                         }
1100                         pte_chain->parent_ptes[i] = NULL;
1101                         if (i == 0) {
1102                                 hlist_del(&pte_chain->link);
1103                                 mmu_free_pte_chain(pte_chain);
1104                                 if (hlist_empty(&sp->parent_ptes)) {
1105                                         sp->multimapped = 0;
1106                                         sp->parent_pte = NULL;
1107                                 }
1108                         }
1109                         return;
1110                 }
1111         BUG();
1112 }
1113
1114 static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
1115 {
1116         struct kvm_pte_chain *pte_chain;
1117         struct hlist_node *node;
1118         struct kvm_mmu_page *parent_sp;
1119         int i;
1120
1121         if (!sp->multimapped && sp->parent_pte) {
1122                 parent_sp = page_header(__pa(sp->parent_pte));
1123                 fn(parent_sp, sp->parent_pte);
1124                 return;
1125         }
1126
1127         hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1128                 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1129                         u64 *spte = pte_chain->parent_ptes[i];
1130
1131                         if (!spte)
1132                                 break;
1133                         parent_sp = page_header(__pa(spte));
1134                         fn(parent_sp, spte);
1135                 }
1136 }
1137
1138 static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
1139 static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1140 {
1141         mmu_parent_walk(sp, mark_unsync);
1142 }
1143
1144 static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
1145 {
1146         unsigned int index;
1147
1148         index = spte - sp->spt;
1149         if (__test_and_set_bit(index, sp->unsync_child_bitmap))
1150                 return;
1151         if (sp->unsync_children++)
1152                 return;
1153         kvm_mmu_mark_parents_unsync(sp);
1154 }
1155
1156 static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1157                                     struct kvm_mmu_page *sp)
1158 {
1159         int i;
1160
1161         for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1162                 sp->spt[i] = shadow_trap_nonpresent_pte;
1163 }
1164
1165 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1166                                struct kvm_mmu_page *sp, bool clear_unsync)
1167 {
1168         return 1;
1169 }
1170
1171 static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
1172 {
1173 }
1174
1175 #define KVM_PAGE_ARRAY_NR 16
1176
1177 struct kvm_mmu_pages {
1178         struct mmu_page_and_offset {
1179                 struct kvm_mmu_page *sp;
1180                 unsigned int idx;
1181         } page[KVM_PAGE_ARRAY_NR];
1182         unsigned int nr;
1183 };
1184
1185 #define for_each_unsync_children(bitmap, idx)           \
1186         for (idx = find_first_bit(bitmap, 512);         \
1187              idx < 512;                                 \
1188              idx = find_next_bit(bitmap, 512, idx+1))
1189
1190 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
1191                          int idx)
1192 {
1193         int i;
1194
1195         if (sp->unsync)
1196                 for (i=0; i < pvec->nr; i++)
1197                         if (pvec->page[i].sp == sp)
1198                                 return 0;
1199
1200         pvec->page[pvec->nr].sp = sp;
1201         pvec->page[pvec->nr].idx = idx;
1202         pvec->nr++;
1203         return (pvec->nr == KVM_PAGE_ARRAY_NR);
1204 }
1205
1206 static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
1207                            struct kvm_mmu_pages *pvec)
1208 {
1209         int i, ret, nr_unsync_leaf = 0;
1210
1211         for_each_unsync_children(sp->unsync_child_bitmap, i) {
1212                 struct kvm_mmu_page *child;
1213                 u64 ent = sp->spt[i];
1214
1215                 if (!is_shadow_present_pte(ent) || is_large_pte(ent))
1216                         goto clear_child_bitmap;
1217
1218                 child = page_header(ent & PT64_BASE_ADDR_MASK);
1219
1220                 if (child->unsync_children) {
1221                         if (mmu_pages_add(pvec, child, i))
1222                                 return -ENOSPC;
1223
1224                         ret = __mmu_unsync_walk(child, pvec);
1225                         if (!ret)
1226                                 goto clear_child_bitmap;
1227                         else if (ret > 0)
1228                                 nr_unsync_leaf += ret;
1229                         else
1230                                 return ret;
1231                 } else if (child->unsync) {
1232                         nr_unsync_leaf++;
1233                         if (mmu_pages_add(pvec, child, i))
1234                                 return -ENOSPC;
1235                 } else
1236                          goto clear_child_bitmap;
1237
1238                 continue;
1239
1240 clear_child_bitmap:
1241                 __clear_bit(i, sp->unsync_child_bitmap);
1242                 sp->unsync_children--;
1243                 WARN_ON((int)sp->unsync_children < 0);
1244         }
1245
1246
1247         return nr_unsync_leaf;
1248 }
1249
1250 static int mmu_unsync_walk(struct kvm_mmu_page *sp,
1251                            struct kvm_mmu_pages *pvec)
1252 {
1253         if (!sp->unsync_children)
1254                 return 0;
1255
1256         mmu_pages_add(pvec, sp, 0);
1257         return __mmu_unsync_walk(sp, pvec);
1258 }
1259
1260 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1261 {
1262         WARN_ON(!sp->unsync);
1263         trace_kvm_mmu_sync_page(sp);
1264         sp->unsync = 0;
1265         --kvm->stat.mmu_unsync;
1266 }
1267
1268 static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1269                                     struct list_head *invalid_list);
1270 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1271                                     struct list_head *invalid_list);
1272
1273 #define for_each_gfn_sp(kvm, sp, gfn, pos)                              \
1274   hlist_for_each_entry(sp, pos,                                         \
1275    &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)   \
1276         if ((sp)->gfn != (gfn)) {} else
1277
1278 #define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos)               \
1279   hlist_for_each_entry(sp, pos,                                         \
1280    &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)   \
1281                 if ((sp)->gfn != (gfn) || (sp)->role.direct ||          \
1282                         (sp)->role.invalid) {} else
1283
1284 /* @sp->gfn should be write-protected at the call site */
1285 static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1286                            struct list_head *invalid_list, bool clear_unsync)
1287 {
1288         if (sp->role.cr4_pae != !!is_pae(vcpu)) {
1289                 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1290                 return 1;
1291         }
1292
1293         if (clear_unsync)
1294                 kvm_unlink_unsync_page(vcpu->kvm, sp);
1295
1296         if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) {
1297                 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1298                 return 1;
1299         }
1300
1301         kvm_mmu_flush_tlb(vcpu);
1302         return 0;
1303 }
1304
1305 static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
1306                                    struct kvm_mmu_page *sp)
1307 {
1308         LIST_HEAD(invalid_list);
1309         int ret;
1310
1311         ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
1312         if (ret)
1313                 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1314
1315         return ret;
1316 }
1317
1318 static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1319                          struct list_head *invalid_list)
1320 {
1321         return __kvm_sync_page(vcpu, sp, invalid_list, true);
1322 }
1323
1324 /* @gfn should be write-protected at the call site */
1325 static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
1326 {
1327         struct kvm_mmu_page *s;
1328         struct hlist_node *node;
1329         LIST_HEAD(invalid_list);
1330         bool flush = false;
1331
1332         for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1333                 if (!s->unsync)
1334                         continue;
1335
1336                 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1337                 if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
1338                         (vcpu->arch.mmu.sync_page(vcpu, s, true))) {
1339                         kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
1340                         continue;
1341                 }
1342                 kvm_unlink_unsync_page(vcpu->kvm, s);
1343                 flush = true;
1344         }
1345
1346         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1347         if (flush)
1348                 kvm_mmu_flush_tlb(vcpu);
1349 }
1350
1351 struct mmu_page_path {
1352         struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
1353         unsigned int idx[PT64_ROOT_LEVEL-1];
1354 };
1355
1356 #define for_each_sp(pvec, sp, parents, i)                       \
1357                 for (i = mmu_pages_next(&pvec, &parents, -1),   \
1358                         sp = pvec.page[i].sp;                   \
1359                         i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});   \
1360                         i = mmu_pages_next(&pvec, &parents, i))
1361
1362 static int mmu_pages_next(struct kvm_mmu_pages *pvec,
1363                           struct mmu_page_path *parents,
1364                           int i)
1365 {
1366         int n;
1367
1368         for (n = i+1; n < pvec->nr; n++) {
1369                 struct kvm_mmu_page *sp = pvec->page[n].sp;
1370
1371                 if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
1372                         parents->idx[0] = pvec->page[n].idx;
1373                         return n;
1374                 }
1375
1376                 parents->parent[sp->role.level-2] = sp;
1377                 parents->idx[sp->role.level-1] = pvec->page[n].idx;
1378         }
1379
1380         return n;
1381 }
1382
1383 static void mmu_pages_clear_parents(struct mmu_page_path *parents)
1384 {
1385         struct kvm_mmu_page *sp;
1386         unsigned int level = 0;
1387
1388         do {
1389                 unsigned int idx = parents->idx[level];
1390
1391                 sp = parents->parent[level];
1392                 if (!sp)
1393                         return;
1394
1395                 --sp->unsync_children;
1396                 WARN_ON((int)sp->unsync_children < 0);
1397                 __clear_bit(idx, sp->unsync_child_bitmap);
1398                 level++;
1399         } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
1400 }
1401
1402 static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
1403                                struct mmu_page_path *parents,
1404                                struct kvm_mmu_pages *pvec)
1405 {
1406         parents->parent[parent->role.level-1] = NULL;
1407         pvec->nr = 0;
1408 }
1409
1410 static void mmu_sync_children(struct kvm_vcpu *vcpu,
1411                               struct kvm_mmu_page *parent)
1412 {
1413         int i;
1414         struct kvm_mmu_page *sp;
1415         struct mmu_page_path parents;
1416         struct kvm_mmu_pages pages;
1417         LIST_HEAD(invalid_list);
1418
1419         kvm_mmu_pages_init(parent, &parents, &pages);
1420         while (mmu_unsync_walk(parent, &pages)) {
1421                 int protected = 0;
1422
1423                 for_each_sp(pages, sp, parents, i)
1424                         protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
1425
1426                 if (protected)
1427                         kvm_flush_remote_tlbs(vcpu->kvm);
1428
1429                 for_each_sp(pages, sp, parents, i) {
1430                         kvm_sync_page(vcpu, sp, &invalid_list);
1431                         mmu_pages_clear_parents(&parents);
1432                 }
1433                 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1434                 cond_resched_lock(&vcpu->kvm->mmu_lock);
1435                 kvm_mmu_pages_init(parent, &parents, &pages);
1436         }
1437 }
1438
1439 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1440                                              gfn_t gfn,
1441                                              gva_t gaddr,
1442                                              unsigned level,
1443                                              int direct,
1444                                              unsigned access,
1445                                              u64 *parent_pte)
1446 {
1447         union kvm_mmu_page_role role;
1448         unsigned quadrant;
1449         struct kvm_mmu_page *sp;
1450         struct hlist_node *node;
1451         bool need_sync = false;
1452
1453         role = vcpu->arch.mmu.base_role;
1454         role.level = level;
1455         role.direct = direct;
1456         if (role.direct)
1457                 role.cr4_pae = 0;
1458         role.access = access;
1459         if (!vcpu->arch.mmu.direct_map
1460             && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1461                 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
1462                 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
1463                 role.quadrant = quadrant;
1464         }
1465         for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
1466                 if (!need_sync && sp->unsync)
1467                         need_sync = true;
1468
1469                 if (sp->role.word != role.word)
1470                         continue;
1471
1472                 if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
1473                         break;
1474
1475                 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1476                 if (sp->unsync_children) {
1477                         kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
1478                         kvm_mmu_mark_parents_unsync(sp);
1479                 } else if (sp->unsync)
1480                         kvm_mmu_mark_parents_unsync(sp);
1481
1482                 trace_kvm_mmu_get_page(sp, false);
1483                 return sp;
1484         }
1485         ++vcpu->kvm->stat.mmu_cache_miss;
1486         sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
1487         if (!sp)
1488                 return sp;
1489         sp->gfn = gfn;
1490         sp->role = role;
1491         hlist_add_head(&sp->hash_link,
1492                 &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
1493         if (!direct) {
1494                 if (rmap_write_protect(vcpu->kvm, gfn))
1495                         kvm_flush_remote_tlbs(vcpu->kvm);
1496                 if (level > PT_PAGE_TABLE_LEVEL && need_sync)
1497                         kvm_sync_pages(vcpu, gfn);
1498
1499                 account_shadowed(vcpu->kvm, gfn);
1500         }
1501         if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
1502                 vcpu->arch.mmu.prefetch_page(vcpu, sp);
1503         else
1504                 nonpaging_prefetch_page(vcpu, sp);
1505         trace_kvm_mmu_get_page(sp, true);
1506         return sp;
1507 }
1508
1509 static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
1510                              struct kvm_vcpu *vcpu, u64 addr)
1511 {
1512         iterator->addr = addr;
1513         iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
1514         iterator->level = vcpu->arch.mmu.shadow_root_level;
1515
1516         if (iterator->level == PT64_ROOT_LEVEL &&
1517             vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL &&
1518             !vcpu->arch.mmu.direct_map)
1519                 --iterator->level;
1520
1521         if (iterator->level == PT32E_ROOT_LEVEL) {
1522                 iterator->shadow_addr
1523                         = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
1524                 iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
1525                 --iterator->level;
1526                 if (!iterator->shadow_addr)
1527                         iterator->level = 0;
1528         }
1529 }
1530
1531 static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
1532 {
1533         if (iterator->level < PT_PAGE_TABLE_LEVEL)
1534                 return false;
1535
1536         if (iterator->level == PT_PAGE_TABLE_LEVEL)
1537                 if (is_large_pte(*iterator->sptep))
1538                         return false;
1539
1540         iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
1541         iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
1542         return true;
1543 }
1544
1545 static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
1546 {
1547         iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
1548         --iterator->level;
1549 }
1550
1551 static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
1552 {
1553         u64 spte;
1554
1555         spte = __pa(sp->spt)
1556                 | PT_PRESENT_MASK | PT_ACCESSED_MASK
1557                 | PT_WRITABLE_MASK | PT_USER_MASK;
1558         __set_spte(sptep, spte);
1559 }
1560
1561 static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1562 {
1563         if (is_large_pte(*sptep)) {
1564                 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
1565                 kvm_flush_remote_tlbs(vcpu->kvm);
1566         }
1567 }
1568
1569 static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1570                                    unsigned direct_access)
1571 {
1572         if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
1573                 struct kvm_mmu_page *child;
1574
1575                 /*
1576                  * For the direct sp, if the guest pte's dirty bit
1577                  * changed form clean to dirty, it will corrupt the
1578                  * sp's access: allow writable in the read-only sp,
1579                  * so we should update the spte at this point to get
1580                  * a new sp with the correct access.
1581                  */
1582                 child = page_header(*sptep & PT64_BASE_ADDR_MASK);
1583                 if (child->role.access == direct_access)
1584                         return;
1585
1586                 mmu_page_remove_parent_pte(child, sptep);
1587                 __set_spte(sptep, shadow_trap_nonpresent_pte);
1588                 kvm_flush_remote_tlbs(vcpu->kvm);
1589         }
1590 }
1591
1592 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1593                                          struct kvm_mmu_page *sp)
1594 {
1595         unsigned i;
1596         u64 *pt;
1597         u64 ent;
1598
1599         pt = sp->spt;
1600
1601         for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1602                 ent = pt[i];
1603
1604                 if (is_shadow_present_pte(ent)) {
1605                         if (!is_last_spte(ent, sp->role.level)) {
1606                                 ent &= PT64_BASE_ADDR_MASK;
1607                                 mmu_page_remove_parent_pte(page_header(ent),
1608                                                            &pt[i]);
1609                         } else {
1610                                 if (is_large_pte(ent))
1611                                         --kvm->stat.lpages;
1612                                 drop_spte(kvm, &pt[i],
1613                                           shadow_trap_nonpresent_pte);
1614                         }
1615                 }
1616                 pt[i] = shadow_trap_nonpresent_pte;
1617         }
1618 }
1619
1620 static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
1621 {
1622         mmu_page_remove_parent_pte(sp, parent_pte);
1623 }
1624
1625 static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
1626 {
1627         int i;
1628         struct kvm_vcpu *vcpu;
1629
1630         kvm_for_each_vcpu(i, vcpu, kvm)
1631                 vcpu->arch.last_pte_updated = NULL;
1632 }
1633
1634 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1635 {
1636         u64 *parent_pte;
1637
1638         while (sp->multimapped || sp->parent_pte) {
1639                 if (!sp->multimapped)
1640                         parent_pte = sp->parent_pte;
1641                 else {
1642                         struct kvm_pte_chain *chain;
1643
1644                         chain = container_of(sp->parent_ptes.first,
1645                                              struct kvm_pte_chain, link);
1646                         parent_pte = chain->parent_ptes[0];
1647                 }
1648                 BUG_ON(!parent_pte);
1649                 kvm_mmu_put_page(sp, parent_pte);
1650                 __set_spte(parent_pte, shadow_trap_nonpresent_pte);
1651         }
1652 }
1653
1654 static int mmu_zap_unsync_children(struct kvm *kvm,
1655                                    struct kvm_mmu_page *parent,
1656                                    struct list_head *invalid_list)
1657 {
1658         int i, zapped = 0;
1659         struct mmu_page_path parents;
1660         struct kvm_mmu_pages pages;
1661
1662         if (parent->role.level == PT_PAGE_TABLE_LEVEL)
1663                 return 0;
1664
1665         kvm_mmu_pages_init(parent, &parents, &pages);
1666         while (mmu_unsync_walk(parent, &pages)) {
1667                 struct kvm_mmu_page *sp;
1668
1669                 for_each_sp(pages, sp, parents, i) {
1670                         kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
1671                         mmu_pages_clear_parents(&parents);
1672                         zapped++;
1673                 }
1674                 kvm_mmu_pages_init(parent, &parents, &pages);
1675         }
1676
1677         return zapped;
1678 }
1679
1680 static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1681                                     struct list_head *invalid_list)
1682 {
1683         int ret;
1684
1685         trace_kvm_mmu_prepare_zap_page(sp);
1686         ++kvm->stat.mmu_shadow_zapped;
1687         ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
1688         kvm_mmu_page_unlink_children(kvm, sp);
1689         kvm_mmu_unlink_parents(kvm, sp);
1690         if (!sp->role.invalid && !sp->role.direct)
1691                 unaccount_shadowed(kvm, sp->gfn);
1692         if (sp->unsync)
1693                 kvm_unlink_unsync_page(kvm, sp);
1694         if (!sp->root_count) {
1695                 /* Count self */
1696                 ret++;
1697                 list_move(&sp->link, invalid_list);
1698         } else {
1699                 list_move(&sp->link, &kvm->arch.active_mmu_pages);
1700                 kvm_reload_remote_mmus(kvm);
1701         }
1702
1703         sp->role.invalid = 1;
1704         kvm_mmu_reset_last_pte_updated(kvm);
1705         return ret;
1706 }
1707
1708 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1709                                     struct list_head *invalid_list)
1710 {
1711         struct kvm_mmu_page *sp;
1712
1713         if (list_empty(invalid_list))
1714                 return;
1715
1716         kvm_flush_remote_tlbs(kvm);
1717
1718         do {
1719                 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
1720                 WARN_ON(!sp->role.invalid || sp->root_count);
1721                 kvm_mmu_free_page(kvm, sp);
1722         } while (!list_empty(invalid_list));
1723
1724 }
1725
1726 /*
1727  * Changing the number of mmu pages allocated to the vm
1728  * Note: if goal_nr_mmu_pages is too small, you will get dead lock
1729  */
1730 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
1731 {
1732         LIST_HEAD(invalid_list);
1733         /*
1734          * If we set the number of mmu pages to be smaller be than the
1735          * number of actived pages , we must to free some mmu pages before we
1736          * change the value
1737          */
1738
1739         if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
1740                 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
1741                         !list_empty(&kvm->arch.active_mmu_pages)) {
1742                         struct kvm_mmu_page *page;
1743
1744                         page = container_of(kvm->arch.active_mmu_pages.prev,
1745                                             struct kvm_mmu_page, link);
1746                         kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
1747                         kvm_mmu_commit_zap_page(kvm, &invalid_list);
1748                 }
1749                 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
1750         }
1751
1752         kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
1753 }
1754
1755 static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1756 {
1757         struct kvm_mmu_page *sp;
1758         struct hlist_node *node;
1759         LIST_HEAD(invalid_list);
1760         int r;
1761
1762         pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
1763         r = 0;
1764
1765         for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1766                 pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
1767                          sp->role.word);
1768                 r = 1;
1769                 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1770         }
1771         kvm_mmu_commit_zap_page(kvm, &invalid_list);
1772         return r;
1773 }
1774
1775 static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1776 {
1777         struct kvm_mmu_page *sp;
1778         struct hlist_node *node;
1779         LIST_HEAD(invalid_list);
1780
1781         for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1782                 pgprintk("%s: zap %llx %x\n",
1783                          __func__, gfn, sp->role.word);
1784                 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1785         }
1786         kvm_mmu_commit_zap_page(kvm, &invalid_list);
1787 }
1788
1789 static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1790 {
1791         int slot = memslot_id(kvm, gfn);
1792         struct kvm_mmu_page *sp = page_header(__pa(pte));
1793
1794         __set_bit(slot, sp->slot_bitmap);
1795 }
1796
1797 static void mmu_convert_notrap(struct kvm_mmu_page *sp)
1798 {
1799         int i;
1800         u64 *pt = sp->spt;
1801
1802         if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
1803                 return;
1804
1805         for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1806                 if (pt[i] == shadow_notrap_nonpresent_pte)
1807                         __set_spte(&pt[i], shadow_trap_nonpresent_pte);
1808         }
1809 }
1810
1811 /*
1812  * The function is based on mtrr_type_lookup() in
1813  * arch/x86/kernel/cpu/mtrr/generic.c
1814  */
1815 static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
1816                          u64 start, u64 end)
1817 {
1818         int i;
1819         u64 base, mask;
1820         u8 prev_match, curr_match;
1821         int num_var_ranges = KVM_NR_VAR_MTRR;
1822
1823         if (!mtrr_state->enabled)
1824                 return 0xFF;
1825
1826         /* Make end inclusive end, instead of exclusive */
1827         end--;
1828
1829         /* Look in fixed ranges. Just return the type as per start */
1830         if (mtrr_state->have_fixed && (start < 0x100000)) {
1831                 int idx;
1832
1833                 if (start < 0x80000) {
1834                         idx = 0;
1835                         idx += (start >> 16);
1836                         return mtrr_state->fixed_ranges[idx];
1837                 } else if (start < 0xC0000) {
1838                         idx = 1 * 8;
1839                         idx += ((start - 0x80000) >> 14);
1840                         return mtrr_state->fixed_ranges[idx];
1841                 } else if (start < 0x1000000) {
1842                         idx = 3 * 8;
1843                         idx += ((start - 0xC0000) >> 12);
1844                         return mtrr_state->fixed_ranges[idx];
1845                 }
1846         }
1847
1848         /*
1849          * Look in variable ranges
1850          * Look of multiple ranges matching this address and pick type
1851          * as per MTRR precedence
1852          */
1853         if (!(mtrr_state->enabled & 2))
1854                 return mtrr_state->def_type;
1855
1856         prev_match = 0xFF;
1857         for (i = 0; i < num_var_ranges; ++i) {
1858                 unsigned short start_state, end_state;
1859
1860                 if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
1861                         continue;
1862
1863                 base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
1864                        (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
1865                 mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
1866                        (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);
1867
1868                 start_state = ((start & mask) == (base & mask));
1869                 end_state = ((end & mask) == (base & mask));
1870                 if (start_state != end_state)
1871                         return 0xFE;
1872
1873                 if ((start & mask) != (base & mask))
1874                         continue;
1875
1876                 curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
1877                 if (prev_match == 0xFF) {
1878                         prev_match = curr_match;
1879                         continue;
1880                 }
1881
1882                 if (prev_match == MTRR_TYPE_UNCACHABLE ||
1883                     curr_match == MTRR_TYPE_UNCACHABLE)
1884                         return MTRR_TYPE_UNCACHABLE;
1885
1886                 if ((prev_match == MTRR_TYPE_WRBACK &&
1887                      curr_match == MTRR_TYPE_WRTHROUGH) ||
1888                     (prev_match == MTRR_TYPE_WRTHROUGH &&
1889                      curr_match == MTRR_TYPE_WRBACK)) {
1890                         prev_match = MTRR_TYPE_WRTHROUGH;
1891                         curr_match = MTRR_TYPE_WRTHROUGH;
1892                 }
1893
1894                 if (prev_match != curr_match)
1895                         return MTRR_TYPE_UNCACHABLE;
1896         }
1897
1898         if (prev_match != 0xFF)
1899                 return prev_match;
1900
1901         return mtrr_state->def_type;
1902 }
1903
1904 u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
1905 {
1906         u8 mtrr;
1907
1908         mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT,
1909                              (gfn << PAGE_SHIFT) + PAGE_SIZE);
1910         if (mtrr == 0xfe || mtrr == 0xff)
1911                 mtrr = MTRR_TYPE_WRBACK;
1912         return mtrr;
1913 }
1914 EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
1915
1916 static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1917 {
1918         trace_kvm_mmu_unsync_page(sp);
1919         ++vcpu->kvm->stat.mmu_unsync;
1920         sp->unsync = 1;
1921
1922         kvm_mmu_mark_parents_unsync(sp);
1923         mmu_convert_notrap(sp);
1924 }
1925
1926 static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
1927 {
1928         struct kvm_mmu_page *s;
1929         struct hlist_node *node;
1930
1931         for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1932                 if (s->unsync)
1933                         continue;
1934                 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1935                 __kvm_unsync_page(vcpu, s);
1936         }
1937 }
1938
1939 static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
1940                                   bool can_unsync)
1941 {
1942         struct kvm_mmu_page *s;
1943         struct hlist_node *node;
1944         bool need_unsync = false;
1945
1946         for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1947                 if (!can_unsync)
1948                         return 1;
1949
1950                 if (s->role.level != PT_PAGE_TABLE_LEVEL)
1951                         return 1;
1952
1953                 if (!need_unsync && !s->unsync) {
1954                         if (!oos_shadow)
1955                                 return 1;
1956                         need_unsync = true;
1957                 }
1958         }
1959         if (need_unsync)
1960                 kvm_unsync_pages(vcpu, gfn);
1961         return 0;
1962 }
1963
1964 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1965                     unsigned pte_access, int user_fault,
1966                     int write_fault, int dirty, int level,
1967                     gfn_t gfn, pfn_t pfn, bool speculative,
1968                     bool can_unsync, bool reset_host_protection)
1969 {
1970         u64 spte;
1971         int ret = 0;
1972
1973         /*
1974          * We don't set the accessed bit, since we sometimes want to see
1975          * whether the guest actually used the pte (in order to detect
1976          * demand paging).
1977          */
1978         spte = shadow_base_present_pte;
1979         if (!speculative)
1980                 spte |= shadow_accessed_mask;
1981         if (!dirty)
1982                 pte_access &= ~ACC_WRITE_MASK;
1983         if (pte_access & ACC_EXEC_MASK)
1984                 spte |= shadow_x_mask;
1985         else
1986                 spte |= shadow_nx_mask;
1987         if (pte_access & ACC_USER_MASK)
1988                 spte |= shadow_user_mask;
1989         if (level > PT_PAGE_TABLE_LEVEL)
1990                 spte |= PT_PAGE_SIZE_MASK;
1991         if (tdp_enabled)
1992                 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
1993                         kvm_is_mmio_pfn(pfn));
1994
1995         if (reset_host_protection)
1996                 spte |= SPTE_HOST_WRITEABLE;
1997
1998         spte |= (u64)pfn << PAGE_SHIFT;
1999
2000         if ((pte_access & ACC_WRITE_MASK)
2001             || (!vcpu->arch.mmu.direct_map && write_fault
2002                 && !is_write_protection(vcpu) && !user_fault)) {
2003
2004                 if (level > PT_PAGE_TABLE_LEVEL &&
2005                     has_wrprotected_page(vcpu->kvm, gfn, level)) {
2006                         ret = 1;
2007                         drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
2008                         goto done;
2009                 }
2010
2011                 spte |= PT_WRITABLE_MASK;
2012
2013                 if (!vcpu->arch.mmu.direct_map
2014                     && !(pte_access & ACC_WRITE_MASK))
2015                         spte &= ~PT_USER_MASK;
2016
2017                 /*
2018                  * Optimization: for pte sync, if spte was writable the hash
2019                  * lookup is unnecessary (and expensive). Write protection
2020                  * is responsibility of mmu_get_page / kvm_sync_page.
2021                  * Same reasoning can be applied to dirty page accounting.
2022                  */
2023                 if (!can_unsync && is_writable_pte(*sptep))
2024                         goto set_pte;
2025
2026                 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
2027                         pgprintk("%s: found shadow page for %llx, marking ro\n",
2028                                  __func__, gfn);
2029                         ret = 1;
2030                         pte_access &= ~ACC_WRITE_MASK;
2031                         if (is_writable_pte(spte))
2032                                 spte &= ~PT_WRITABLE_MASK;
2033                 }
2034         }
2035
2036         if (pte_access & ACC_WRITE_MASK)
2037                 mark_page_dirty(vcpu->kvm, gfn);
2038
2039 set_pte:
2040         update_spte(sptep, spte);
2041 done:
2042         return ret;
2043 }
2044
2045 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2046                          unsigned pt_access, unsigned pte_access,
2047                          int user_fault, int write_fault, int dirty,
2048                          int *ptwrite, int level, gfn_t gfn,
2049                          pfn_t pfn, bool speculative,
2050                          bool reset_host_protection)
2051 {
2052         int was_rmapped = 0;
2053         int rmap_count;
2054
2055         pgprintk("%s: spte %llx access %x write_fault %d"
2056                  " user_fault %d gfn %llx\n",
2057                  __func__, *sptep, pt_access,
2058                  write_fault, user_fault, gfn);
2059
2060         if (is_rmap_spte(*sptep)) {
2061                 /*
2062                  * If we overwrite a PTE page pointer with a 2MB PMD, unlink
2063                  * the parent of the now unreachable PTE.
2064                  */
2065                 if (level > PT_PAGE_TABLE_LEVEL &&
2066                     !is_large_pte(*sptep)) {
2067                         struct kvm_mmu_page *child;
2068                         u64 pte = *sptep;
2069
2070                         child = page_header(pte & PT64_BASE_ADDR_MASK);
2071                         mmu_page_remove_parent_pte(child, sptep);
2072                         __set_spte(sptep, shadow_trap_nonpresent_pte);
2073                         kvm_flush_remote_tlbs(vcpu->kvm);
2074                 } else if (pfn != spte_to_pfn(*sptep)) {
2075                         pgprintk("hfn old %llx new %llx\n",
2076                                  spte_to_pfn(*sptep), pfn);
2077                         drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
2078                         kvm_flush_remote_tlbs(vcpu->kvm);
2079                 } else
2080                         was_rmapped = 1;
2081         }
2082
2083         if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
2084                       dirty, level, gfn, pfn, speculative, true,
2085                       reset_host_protection)) {
2086                 if (write_fault)
2087                         *ptwrite = 1;
2088                 kvm_mmu_flush_tlb(vcpu);
2089         }
2090
2091         pgprintk("%s: setting spte %llx\n", __func__, *sptep);
2092         pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
2093                  is_large_pte(*sptep)? "2MB" : "4kB",
2094                  *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
2095                  *sptep, sptep);
2096         if (!was_rmapped && is_large_pte(*sptep))
2097                 ++vcpu->kvm->stat.lpages;
2098
2099         page_header_update_slot(vcpu->kvm, sptep, gfn);
2100         if (!was_rmapped) {
2101                 rmap_count = rmap_add(vcpu, sptep, gfn);
2102                 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
2103                         rmap_recycle(vcpu, sptep, gfn);
2104         }
2105         kvm_release_pfn_clean(pfn);
2106         if (speculative) {
2107                 vcpu->arch.last_pte_updated = sptep;
2108                 vcpu->arch.last_pte_gfn = gfn;
2109         }
2110 }
2111
2112 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
2113 {
2114 }
2115
2116 static struct kvm_memory_slot *
2117 pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log)
2118 {
2119         struct kvm_memory_slot *slot;
2120
2121         slot = gfn_to_memslot(vcpu->kvm, gfn);
2122         if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
2123               (no_dirty_log && slot->dirty_bitmap))
2124                 slot = NULL;
2125
2126         return slot;
2127 }
2128
2129 static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2130                                      bool no_dirty_log)
2131 {
2132         struct kvm_memory_slot *slot;
2133         unsigned long hva;
2134
2135         slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log);
2136         if (!slot) {
2137                 get_page(bad_page);
2138                 return page_to_pfn(bad_page);
2139         }
2140
2141         hva = gfn_to_hva_memslot(slot, gfn);
2142
2143         return hva_to_pfn_atomic(vcpu->kvm, hva);
2144 }
2145
2146 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2147                                     struct kvm_mmu_page *sp,
2148                                     u64 *start, u64 *end)
2149 {
2150         struct page *pages[PTE_PREFETCH_NUM];
2151         unsigned access = sp->role.access;
2152         int i, ret;
2153         gfn_t gfn;
2154
2155         gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
2156         if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK))
2157                 return -1;
2158
2159         ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
2160         if (ret <= 0)
2161                 return -1;
2162
2163         for (i = 0; i < ret; i++, gfn++, start++)
2164                 mmu_set_spte(vcpu, start, ACC_ALL,
2165                              access, 0, 0, 1, NULL,
2166                              sp->role.level, gfn,
2167                              page_to_pfn(pages[i]), true, true);
2168
2169         return 0;
2170 }
2171
2172 static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
2173                                   struct kvm_mmu_page *sp, u64 *sptep)
2174 {
2175         u64 *spte, *start = NULL;
2176         int i;
2177
2178         WARN_ON(!sp->role.direct);
2179
2180         i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
2181         spte = sp->spt + i;
2182
2183         for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
2184                 if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
2185                         if (!start)
2186                                 continue;
2187                         if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
2188                                 break;
2189                         start = NULL;
2190                 } else if (!start)
2191                         start = spte;
2192         }
2193 }
2194
2195 static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
2196 {
2197         struct kvm_mmu_page *sp;
2198
2199         /*
2200          * Since it's no accessed bit on EPT, it's no way to
2201          * distinguish between actually accessed translations
2202          * and prefetched, so disable pte prefetch if EPT is
2203          * enabled.
2204          */
2205         if (!shadow_accessed_mask)
2206                 return;
2207
2208         sp = page_header(__pa(sptep));
2209         if (sp->role.level > PT_PAGE_TABLE_LEVEL)
2210                 return;
2211
2212         __direct_pte_prefetch(vcpu, sp, sptep);
2213 }
2214
2215 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2216                         int level, gfn_t gfn, pfn_t pfn)
2217 {
2218         struct kvm_shadow_walk_iterator iterator;
2219         struct kvm_mmu_page *sp;
2220         int pt_write = 0;
2221         gfn_t pseudo_gfn;
2222
2223         for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
2224                 if (iterator.level == level) {
2225                         mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
2226                                      0, write, 1, &pt_write,
2227                                      level, gfn, pfn, false, true);
2228                         direct_pte_prefetch(vcpu, iterator.sptep);
2229                         ++vcpu->stat.pf_fixed;
2230                         break;
2231                 }
2232
2233                 if (*iterator.sptep == shadow_trap_nonpresent_pte) {
2234                         u64 base_addr = iterator.addr;
2235
2236                         base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
2237                         pseudo_gfn = base_addr >> PAGE_SHIFT;
2238                         sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
2239                                               iterator.level - 1,
2240                                               1, ACC_ALL, iterator.sptep);
2241                         if (!sp) {
2242                                 pgprintk("nonpaging_map: ENOMEM\n");
2243                                 kvm_release_pfn_clean(pfn);
2244                                 return -ENOMEM;
2245                         }
2246
2247                         __set_spte(iterator.sptep,
2248                                    __pa(sp->spt)
2249                                    | PT_PRESENT_MASK | PT_WRITABLE_MASK
2250                                    | shadow_user_mask | shadow_x_mask
2251                                    | shadow_accessed_mask);
2252                 }
2253         }
2254         return pt_write;
2255 }
2256
2257 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
2258 {
2259         siginfo_t info;
2260
2261         info.si_signo   = SIGBUS;
2262         info.si_errno   = 0;
2263         info.si_code    = BUS_MCEERR_AR;
2264         info.si_addr    = (void __user *)address;
2265         info.si_addr_lsb = PAGE_SHIFT;
2266
2267         send_sig_info(SIGBUS, &info, tsk);
2268 }
2269
2270 static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
2271 {
2272         kvm_release_pfn_clean(pfn);
2273         if (is_hwpoison_pfn(pfn)) {
2274                 kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current);
2275                 return 0;
2276         } else if (is_fault_pfn(pfn))
2277                 return -EFAULT;
2278
2279         return 1;
2280 }
2281
2282 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
2283 {
2284         int r;
2285         int level;
2286         pfn_t pfn;
2287         unsigned long mmu_seq;
2288
2289         level = mapping_level(vcpu, gfn);
2290
2291         /*
2292          * This path builds a PAE pagetable - so we can map 2mb pages at
2293          * maximum. Therefore check if the level is larger than that.
2294          */
2295         if (level > PT_DIRECTORY_LEVEL)
2296                 level = PT_DIRECTORY_LEVEL;
2297
2298         gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2299
2300         mmu_seq = vcpu->kvm->mmu_notifier_seq;
2301         smp_rmb();
2302         pfn = gfn_to_pfn(vcpu->kvm, gfn);
2303
2304         /* mmio */
2305         if (is_error_pfn(pfn))
2306                 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2307
2308         spin_lock(&vcpu->kvm->mmu_lock);
2309         if (mmu_notifier_retry(vcpu, mmu_seq))
2310                 goto out_unlock;
2311         kvm_mmu_free_some_pages(vcpu);
2312         r = __direct_map(vcpu, v, write, level, gfn, pfn);
2313         spin_unlock(&vcpu->kvm->mmu_lock);
2314
2315
2316         return r;
2317
2318 out_unlock:
2319         spin_unlock(&vcpu->kvm->mmu_lock);
2320         kvm_release_pfn_clean(pfn);
2321         return 0;
2322 }
2323
2324
2325 static void mmu_free_roots(struct kvm_vcpu *vcpu)
2326 {
2327         int i;
2328         struct kvm_mmu_page *sp;
2329         LIST_HEAD(invalid_list);
2330
2331         if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2332                 return;
2333         spin_lock(&vcpu->kvm->mmu_lock);
2334         if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
2335             (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
2336              vcpu->arch.mmu.direct_map)) {
2337                 hpa_t root = vcpu->arch.mmu.root_hpa;
2338
2339                 sp = page_header(root);
2340                 --sp->root_count;
2341                 if (!sp->root_count && sp->role.invalid) {
2342                         kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
2343                         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2344                 }
2345                 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2346                 spin_unlock(&vcpu->kvm->mmu_lock);
2347                 return;
2348         }
2349         for (i = 0; i < 4; ++i) {
2350                 hpa_t root = vcpu->arch.mmu.pae_root[i];
2351
2352                 if (root) {
2353                         root &= PT64_BASE_ADDR_MASK;
2354                         sp = page_header(root);
2355                         --sp->root_count;
2356                         if (!sp->root_count && sp->role.invalid)
2357                                 kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2358                                                          &invalid_list);
2359                 }
2360                 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
2361         }
2362         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2363         spin_unlock(&vcpu->kvm->mmu_lock);
2364         vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2365 }
2366
2367 static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
2368 {
2369         int ret = 0;
2370
2371         if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
2372                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2373                 ret = 1;
2374         }
2375
2376         return ret;
2377 }
2378
2379 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
2380 {
2381         struct kvm_mmu_page *sp;
2382         unsigned i;
2383
2384         if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2385                 spin_lock(&vcpu->kvm->mmu_lock);
2386                 kvm_mmu_free_some_pages(vcpu);
2387                 sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
2388                                       1, ACC_ALL, NULL);
2389                 ++sp->root_count;
2390                 spin_unlock(&vcpu->kvm->mmu_lock);
2391                 vcpu->arch.mmu.root_hpa = __pa(sp->spt);
2392         } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
2393                 for (i = 0; i < 4; ++i) {
2394                         hpa_t root = vcpu->arch.mmu.pae_root[i];
2395
2396                         ASSERT(!VALID_PAGE(root));
2397                         spin_lock(&vcpu->kvm->mmu_lock);
2398                         kvm_mmu_free_some_pages(vcpu);
2399                         sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
2400                                               i << 30,
2401                                               PT32_ROOT_LEVEL, 1, ACC_ALL,
2402                                               NULL);
2403                         root = __pa(sp->spt);
2404                         ++sp->root_count;
2405                         spin_unlock(&vcpu->kvm->mmu_lock);
2406                         vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
2407                 }
2408                 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
2409         } else
2410                 BUG();
2411
2412         return 0;
2413 }
2414
2415 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
2416 {
2417         struct kvm_mmu_page *sp;
2418         u64 pdptr, pm_mask;
2419         gfn_t root_gfn;
2420         int i;
2421
2422         root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
2423
2424         if (mmu_check_root(vcpu, root_gfn))
2425                 return 1;
2426
2427         /*
2428          * Do we shadow a long mode page table? If so we need to
2429          * write-protect the guests page table root.
2430          */
2431         if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
2432                 hpa_t root = vcpu->arch.mmu.root_hpa;
2433
2434                 ASSERT(!VALID_PAGE(root));
2435
2436                 spin_lock(&vcpu->kvm->mmu_lock);
2437                 kvm_mmu_free_some_pages(vcpu);
2438                 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
2439                                       0, ACC_ALL, NULL);
2440                 root = __pa(sp->spt);
2441                 ++sp->root_count;
2442                 spin_unlock(&vcpu->kvm->mmu_lock);
2443                 vcpu->arch.mmu.root_hpa = root;
2444                 return 0;
2445         }
2446
2447         /*
2448          * We shadow a 32 bit page table. This may be a legacy 2-level
2449          * or a PAE 3-level page table. In either case we need to be aware that
2450          * the shadow page table may be a PAE or a long mode page table.
2451          */
2452         pm_mask = PT_PRESENT_MASK;
2453         if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL)
2454                 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
2455
2456         for (i = 0; i < 4; ++i) {
2457                 hpa_t root = vcpu->arch.mmu.pae_root[i];
2458
2459                 ASSERT(!VALID_PAGE(root));
2460                 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
2461                         pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i);
2462                         if (!is_present_gpte(pdptr)) {
2463                                 vcpu->arch.mmu.pae_root[i] = 0;
2464                                 continue;
2465                         }
2466                         root_gfn = pdptr >> PAGE_SHIFT;
2467                         if (mmu_check_root(vcpu, root_gfn))
2468                                 return 1;
2469                 }
2470                 spin_lock(&vcpu->kvm->mmu_lock);
2471                 kvm_mmu_free_some_pages(vcpu);
2472                 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
2473                                       PT32_ROOT_LEVEL, 0,
2474                                       ACC_ALL, NULL);
2475                 root = __pa(sp->spt);
2476                 ++sp->root_count;
2477                 spin_unlock(&vcpu->kvm->mmu_lock);
2478
2479                 vcpu->arch.mmu.pae_root[i] = root | pm_mask;
2480         }
2481         vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
2482
2483         /*
2484          * If we shadow a 32 bit page table with a long mode page
2485          * table we enter this path.
2486          */
2487         if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2488                 if (vcpu->arch.mmu.lm_root == NULL) {
2489                         /*
2490                          * The additional page necessary for this is only
2491                          * allocated on demand.
2492                          */
2493
2494                         u64 *lm_root;
2495
2496                         lm_root = (void*)get_zeroed_page(GFP_KERNEL);
2497                         if (lm_root == NULL)
2498                                 return 1;
2499
2500                         lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;
2501
2502                         vcpu->arch.mmu.lm_root = lm_root;
2503                 }
2504
2505                 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
2506         }
2507
2508         return 0;
2509 }
2510
2511 static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2512 {
2513         if (vcpu->arch.mmu.direct_map)
2514                 return mmu_alloc_direct_roots(vcpu);
2515         else
2516                 return mmu_alloc_shadow_roots(vcpu);
2517 }
2518
2519 static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2520 {
2521         int i;
2522         struct kvm_mmu_page *sp;
2523
2524         if (vcpu->arch.mmu.direct_map)
2525                 return;
2526
2527         if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2528                 return;
2529
2530         trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
2531         if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
2532                 hpa_t root = vcpu->arch.mmu.root_hpa;
2533                 sp = page_header(root);
2534                 mmu_sync_children(vcpu, sp);
2535                 return;
2536         }
2537         for (i = 0; i < 4; ++i) {
2538                 hpa_t root = vcpu->arch.mmu.pae_root[i];
2539
2540                 if (root && VALID_PAGE(root)) {
2541                         root &= PT64_BASE_ADDR_MASK;
2542                         sp = page_header(root);
2543                         mmu_sync_children(vcpu, sp);
2544                 }
2545         }
2546         trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
2547 }
2548
2549 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2550 {
2551         spin_lock(&vcpu->kvm->mmu_lock);
2552         mmu_sync_roots(vcpu);
2553         spin_unlock(&vcpu->kvm->mmu_lock);
2554 }
2555
2556 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
2557                                   u32 access, u32 *error)
2558 {
2559         if (error)
2560                 *error = 0;
2561         return vaddr;
2562 }
2563
2564 static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
2565                                          u32 access, u32 *error)
2566 {
2567         if (error)
2568                 *error = 0;
2569         return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
2570 }
2571
2572 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2573                                 u32 error_code, bool no_apf)
2574 {
2575         gfn_t gfn;
2576         int r;
2577
2578         pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
2579         r = mmu_topup_memory_caches(vcpu);
2580         if (r)
2581                 return r;
2582
2583         ASSERT(vcpu);
2584         ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
2585
2586         gfn = gva >> PAGE_SHIFT;
2587
2588         return nonpaging_map(vcpu, gva & PAGE_MASK,
2589                              error_code & PFERR_WRITE_MASK, gfn);
2590 }
2591
2592 int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
2593 {
2594         struct kvm_arch_async_pf arch;
2595         arch.gfn = gfn;
2596
2597         return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
2598 }
2599
2600 static bool can_do_async_pf(struct kvm_vcpu *vcpu)
2601 {
2602         if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
2603                      kvm_event_needs_reinjection(vcpu)))
2604                 return false;
2605
2606         return kvm_x86_ops->interrupt_allowed(vcpu);
2607 }
2608
2609 static bool try_async_pf(struct kvm_vcpu *vcpu, bool no_apf, gfn_t gfn,
2610                          gva_t gva, pfn_t *pfn)
2611 {
2612         bool async;
2613
2614         *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async);
2615
2616         if (!async)
2617                 return false; /* *pfn has correct page already */
2618
2619         put_page(pfn_to_page(*pfn));
2620
2621         if (!no_apf && can_do_async_pf(vcpu)) {
2622                 trace_kvm_try_async_get_page(async, *pfn);
2623                 if (kvm_find_async_pf_gfn(vcpu, gfn)) {
2624                         trace_kvm_async_pf_doublefault(gva, gfn);
2625                         kvm_make_request(KVM_REQ_APF_HALT, vcpu);
2626                         return true;
2627                 } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
2628                         return true;
2629         }
2630
2631         *pfn = gfn_to_pfn(vcpu->kvm, gfn);
2632
2633         return false;
2634 }
2635
2636 static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2637                           bool no_apf)
2638 {
2639         pfn_t pfn;
2640         int r;
2641         int level;
2642         gfn_t gfn = gpa >> PAGE_SHIFT;
2643         unsigned long mmu_seq;
2644
2645         ASSERT(vcpu);
2646         ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
2647
2648         r = mmu_topup_memory_caches(vcpu);
2649         if (r)
2650                 return r;
2651
2652         level = mapping_level(vcpu, gfn);
2653
2654         gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2655
2656         mmu_seq = vcpu->kvm->mmu_notifier_seq;
2657         smp_rmb();
2658
2659         if (try_async_pf(vcpu, no_apf, gfn, gpa, &pfn))
2660                 return 0;
2661
2662         /* mmio */
2663         if (is_error_pfn(pfn))
2664                 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2665         spin_lock(&vcpu->kvm->mmu_lock);
2666         if (mmu_notifier_retry(vcpu, mmu_seq))
2667                 goto out_unlock;
2668         kvm_mmu_free_some_pages(vcpu);
2669         r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
2670                          level, gfn, pfn);
2671         spin_unlock(&vcpu->kvm->mmu_lock);
2672
2673         return r;
2674
2675 out_unlock:
2676         spin_unlock(&vcpu->kvm->mmu_lock);
2677         kvm_release_pfn_clean(pfn);
2678         return 0;
2679 }
2680
2681 static void nonpaging_free(struct kvm_vcpu *vcpu)
2682 {
2683         mmu_free_roots(vcpu);
2684 }
2685
2686 static int nonpaging_init_context(struct kvm_vcpu *vcpu,
2687                                   struct kvm_mmu *context)
2688 {
2689         context->new_cr3 = nonpaging_new_cr3;
2690         context->page_fault = nonpaging_page_fault;
2691         context->gva_to_gpa = nonpaging_gva_to_gpa;
2692         context->free = nonpaging_free;
2693         context->prefetch_page = nonpaging_prefetch_page;
2694         context->sync_page = nonpaging_sync_page;
2695         context->invlpg = nonpaging_invlpg;
2696         context->root_level = 0;
2697         context->shadow_root_level = PT32E_ROOT_LEVEL;
2698         context->root_hpa = INVALID_PAGE;
2699         context->direct_map = true;
2700         context->nx = false;
2701         return 0;
2702 }
2703
2704 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
2705 {
2706         ++vcpu->stat.tlb_flush;
2707         kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2708 }
2709
2710 static void paging_new_cr3(struct kvm_vcpu *vcpu)
2711 {
2712         pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3);
2713         mmu_free_roots(vcpu);
2714 }
2715
2716 static unsigned long get_cr3(struct kvm_vcpu *vcpu)
2717 {
2718         return vcpu->arch.cr3;
2719 }
2720
2721 static void inject_page_fault(struct kvm_vcpu *vcpu)
2722 {
2723         vcpu->arch.mmu.inject_page_fault(vcpu);
2724 }
2725
2726 static void paging_free(struct kvm_vcpu *vcpu)
2727 {
2728         nonpaging_free(vcpu);
2729 }
2730
2731 static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
2732 {
2733         int bit7;
2734
2735         bit7 = (gpte >> 7) & 1;
2736         return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
2737 }
2738
2739 #define PTTYPE 64
2740 #include "paging_tmpl.h"
2741 #undef PTTYPE
2742
2743 #define PTTYPE 32
2744 #include "paging_tmpl.h"
2745 #undef PTTYPE
2746
2747 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
2748                                   struct kvm_mmu *context,
2749                                   int level)
2750 {
2751         int maxphyaddr = cpuid_maxphyaddr(vcpu);
2752         u64 exb_bit_rsvd = 0;
2753
2754         if (!context->nx)
2755                 exb_bit_rsvd = rsvd_bits(63, 63);
2756         switch (level) {
2757         case PT32_ROOT_LEVEL:
2758                 /* no rsvd bits for 2 level 4K page table entries */
2759                 context->rsvd_bits_mask[0][1] = 0;
2760                 context->rsvd_bits_mask[0][0] = 0;
2761                 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2762
2763                 if (!is_pse(vcpu)) {
2764                         context->rsvd_bits_mask[1][1] = 0;
2765                         break;
2766                 }
2767
2768                 if (is_cpuid_PSE36())
2769                         /* 36bits PSE 4MB page */
2770                         context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
2771                 else
2772                         /* 32 bits PSE 4MB page */
2773                         context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
2774                 break;
2775         case PT32E_ROOT_LEVEL:
2776                 context->rsvd_bits_mask[0][2] =
2777                         rsvd_bits(maxphyaddr, 63) |
2778                         rsvd_bits(7, 8) | rsvd_bits(1, 2);      /* PDPTE */
2779                 context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
2780                         rsvd_bits(maxphyaddr, 62);      /* PDE */
2781                 context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
2782                         rsvd_bits(maxphyaddr, 62);      /* PTE */
2783                 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2784                         rsvd_bits(maxphyaddr, 62) |
2785                         rsvd_bits(13, 20);              /* large page */
2786                 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2787                 break;
2788         case PT64_ROOT_LEVEL:
2789                 context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
2790                         rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
2791                 context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
2792                         rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
2793                 context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
2794                         rsvd_bits(maxphyaddr, 51);
2795                 context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
2796                         rsvd_bits(maxphyaddr, 51);
2797                 context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
2798                 context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
2799                         rsvd_bits(maxphyaddr, 51) |
2800                         rsvd_bits(13, 29);
2801                 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2802                         rsvd_bits(maxphyaddr, 51) |
2803                         rsvd_bits(13, 20);              /* large page */
2804                 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2805                 break;
2806         }
2807 }
2808
2809 static int paging64_init_context_common(struct kvm_vcpu *vcpu,
2810                                         struct kvm_mmu *context,
2811                                         int level)
2812 {
2813         context->nx = is_nx(vcpu);
2814
2815         reset_rsvds_bits_mask(vcpu, context, level);
2816
2817         ASSERT(is_pae(vcpu));
2818         context->new_cr3 = paging_new_cr3;
2819         context->page_fault = paging64_page_fault;
2820         context->gva_to_gpa = paging64_gva_to_gpa;
2821         context->prefetch_page = paging64_prefetch_page;
2822         context->sync_page = paging64_sync_page;
2823         context->invlpg = paging64_invlpg;
2824         context->free = paging_free;
2825         context->root_level = level;
2826         context->shadow_root_level = level;
2827         context->root_hpa = INVALID_PAGE;
2828         context->direct_map = false;
2829         return 0;
2830 }
2831
2832 static int paging64_init_context(struct kvm_vcpu *vcpu,
2833                                  struct kvm_mmu *context)
2834 {
2835         return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
2836 }
2837
2838 static int paging32_init_context(struct kvm_vcpu *vcpu,
2839                                  struct kvm_mmu *context)
2840 {
2841         context->nx = false;
2842
2843         reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
2844
2845         context->new_cr3 = paging_new_cr3;
2846         context->page_fault = paging32_page_fault;
2847         context->gva_to_gpa = paging32_gva_to_gpa;
2848         context->free = paging_free;
2849         context->prefetch_page = paging32_prefetch_page;
2850         context->sync_page = paging32_sync_page;
2851         context->invlpg = paging32_invlpg;
2852         context->root_level = PT32_ROOT_LEVEL;
2853         context->shadow_root_level = PT32E_ROOT_LEVEL;
2854         context->root_hpa = INVALID_PAGE;
2855         context->direct_map = false;
2856         return 0;
2857 }
2858
2859 static int paging32E_init_context(struct kvm_vcpu *vcpu,
2860                                   struct kvm_mmu *context)
2861 {
2862         return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
2863 }
2864
2865 static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2866 {
2867         struct kvm_mmu *context = vcpu->arch.walk_mmu;
2868
2869         context->new_cr3 = nonpaging_new_cr3;
2870         context->page_fault = tdp_page_fault;
2871         context->free = nonpaging_free;
2872         context->prefetch_page = nonpaging_prefetch_page;
2873         context->sync_page = nonpaging_sync_page;
2874         context->invlpg = nonpaging_invlpg;
2875         context->shadow_root_level = kvm_x86_ops->get_tdp_level();
2876         context->root_hpa = INVALID_PAGE;
2877         context->direct_map = true;
2878         context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
2879         context->get_cr3 = get_cr3;
2880         context->inject_page_fault = kvm_inject_page_fault;
2881         context->nx = is_nx(vcpu);
2882
2883         if (!is_paging(vcpu)) {
2884                 context->nx = false;
2885                 context->gva_to_gpa = nonpaging_gva_to_gpa;
2886                 context->root_level = 0;
2887         } else if (is_long_mode(vcpu)) {
2888                 context->nx = is_nx(vcpu);
2889                 reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
2890                 context->gva_to_gpa = paging64_gva_to_gpa;
2891                 context->root_level = PT64_ROOT_LEVEL;
2892         } else if (is_pae(vcpu)) {
2893                 context->nx = is_nx(vcpu);
2894                 reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
2895                 context->gva_to_gpa = paging64_gva_to_gpa;
2896                 context->root_level = PT32E_ROOT_LEVEL;
2897         } else {
2898                 context->nx = false;
2899                 reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
2900                 context->gva_to_gpa = paging32_gva_to_gpa;
2901                 context->root_level = PT32_ROOT_LEVEL;
2902         }
2903
2904         return 0;
2905 }
2906
2907 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
2908 {
2909         int r;
2910         ASSERT(vcpu);
2911         ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
2912
2913         if (!is_paging(vcpu))
2914                 r = nonpaging_init_context(vcpu, context);
2915         else if (is_long_mode(vcpu))
2916                 r = paging64_init_context(vcpu, context);
2917         else if (is_pae(vcpu))
2918                 r = paging32E_init_context(vcpu, context);
2919         else
2920                 r = paging32_init_context(vcpu, context);
2921
2922         vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
2923         vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
2924
2925         return r;
2926 }
2927 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
2928
2929 static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
2930 {
2931         int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
2932
2933         vcpu->arch.walk_mmu->set_cr3           = kvm_x86_ops->set_cr3;
2934         vcpu->arch.walk_mmu->get_cr3           = get_cr3;
2935         vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
2936
2937         return r;
2938 }
2939
2940 static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
2941 {
2942         struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
2943
2944         g_context->get_cr3           = get_cr3;
2945         g_context->inject_page_fault = kvm_inject_page_fault;
2946
2947         /*
2948          * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The
2949          * translation of l2_gpa to l1_gpa addresses is done using the
2950          * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa
2951          * functions between mmu and nested_mmu are swapped.
2952          */
2953         if (!is_paging(vcpu)) {
2954                 g_context->nx = false;
2955                 g_context->root_level = 0;
2956                 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
2957         } else if (is_long_mode(vcpu)) {
2958                 g_context->nx = is_nx(vcpu);
2959                 reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
2960                 g_context->root_level = PT64_ROOT_LEVEL;
2961                 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
2962         } else if (is_pae(vcpu)) {
2963                 g_context->nx = is_nx(vcpu);
2964                 reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
2965                 g_context->root_level = PT32E_ROOT_LEVEL;
2966                 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
2967         } else {
2968                 g_context->nx = false;
2969                 reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
2970                 g_context->root_level = PT32_ROOT_LEVEL;
2971                 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
2972         }
2973
2974         return 0;
2975 }
2976
2977 static int init_kvm_mmu(struct kvm_vcpu *vcpu)
2978 {
2979         vcpu->arch.update_pte.pfn = bad_pfn;
2980
2981         if (mmu_is_nested(vcpu))
2982                 return init_kvm_nested_mmu(vcpu);
2983         else if (tdp_enabled)
2984                 return init_kvm_tdp_mmu(vcpu);
2985         else
2986                 return init_kvm_softmmu(vcpu);
2987 }
2988
2989 static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
2990 {
2991         ASSERT(vcpu);
2992         if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
2993                 /* mmu.free() should set root_hpa = INVALID_PAGE */
2994                 vcpu->arch.mmu.free(vcpu);
2995 }
2996
2997 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
2998 {
2999         destroy_kvm_mmu(vcpu);
3000         return init_kvm_mmu(vcpu);
3001 }
3002 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
3003
3004 int kvm_mmu_load(struct kvm_vcpu *vcpu)
3005 {
3006         int r;
3007
3008         r = mmu_topup_memory_caches(vcpu);
3009         if (r)
3010                 goto out;
3011         r = mmu_alloc_roots(vcpu);
3012         spin_lock(&vcpu->kvm->mmu_lock);
3013         mmu_sync_roots(vcpu);
3014         spin_unlock(&vcpu->kvm->mmu_lock);
3015         if (r)
3016                 goto out;
3017         /* set_cr3() should ensure TLB has been flushed */
3018         vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
3019 out:
3020         return r;
3021 }
3022 EXPORT_SYMBOL_GPL(kvm_mmu_load);
3023
3024 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
3025 {
3026         mmu_free_roots(vcpu);
3027 }
3028 EXPORT_SYMBOL_GPL(kvm_mmu_unload);
3029
3030 static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
3031                                   struct kvm_mmu_page *sp,
3032                                   u64 *spte)
3033 {
3034         u64 pte;
3035         struct kvm_mmu_page *child;
3036
3037         pte = *spte;
3038         if (is_shadow_present_pte(pte)) {
3039                 if (is_last_spte(pte, sp->role.level))
3040                         drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
3041                 else {
3042                         child = page_header(pte & PT64_BASE_ADDR_MASK);
3043                         mmu_page_remove_parent_pte(child, spte);
3044                 }
3045         }
3046         __set_spte(spte, shadow_trap_nonpresent_pte);
3047         if (is_large_pte(pte))
3048                 --vcpu->kvm->stat.lpages;
3049 }
3050
3051 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
3052                                   struct kvm_mmu_page *sp,
3053                                   u64 *spte,
3054                                   const void *new)
3055 {
3056         if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
3057                 ++vcpu->kvm->stat.mmu_pde_zapped;
3058                 return;
3059         }
3060
3061         if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
3062                 return;
3063
3064         ++vcpu->kvm->stat.mmu_pte_updated;
3065         if (!sp->role.cr4_pae)
3066                 paging32_update_pte(vcpu, sp, spte, new);
3067         else
3068                 paging64_update_pte(vcpu, sp, spte, new);
3069 }
3070
3071 static bool need_remote_flush(u64 old, u64 new)
3072 {
3073         if (!is_shadow_present_pte(old))
3074                 return false;
3075         if (!is_shadow_present_pte(new))
3076                 return true;
3077         if ((old ^ new) & PT64_BASE_ADDR_MASK)
3078                 return true;
3079         old ^= PT64_NX_MASK;
3080         new ^= PT64_NX_MASK;
3081         return (old & ~new & PT64_PERM_MASK) != 0;
3082 }
3083
3084 static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
3085                                     bool remote_flush, bool local_flush)
3086 {
3087         if (zap_page)
3088                 return;
3089
3090         if (remote_flush)
3091                 kvm_flush_remote_tlbs(vcpu->kvm);
3092         else if (local_flush)
3093                 kvm_mmu_flush_tlb(vcpu);
3094 }
3095
3096 static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
3097 {
3098         u64 *spte = vcpu->arch.last_pte_updated;
3099
3100         return !!(spte && (*spte & shadow_accessed_mask));
3101 }
3102
3103 static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3104                                           u64 gpte)
3105 {
3106         gfn_t gfn;
3107         pfn_t pfn;
3108
3109         if (!is_present_gpte(gpte))
3110                 return;
3111         gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
3112
3113         vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
3114         smp_rmb();
3115         pfn = gfn_to_pfn(vcpu->kvm, gfn);
3116
3117         if (is_error_pfn(pfn)) {
3118                 kvm_release_pfn_clean(pfn);
3119                 return;
3120         }
3121         vcpu->arch.update_pte.gfn = gfn;
3122         vcpu->arch.update_pte.pfn = pfn;
3123 }
3124
3125 static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
3126 {
3127         u64 *spte = vcpu->arch.last_pte_updated;
3128
3129         if (spte
3130             && vcpu->arch.last_pte_gfn == gfn
3131             && shadow_accessed_mask
3132             && !(*spte & shadow_accessed_mask)
3133             && is_shadow_present_pte(*spte))
3134                 set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
3135 }
3136
3137 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3138                        const u8 *new, int bytes,
3139                        bool guest_initiated)
3140 {
3141         gfn_t gfn = gpa >> PAGE_SHIFT;
3142         union kvm_mmu_page_role mask = { .word = 0 };
3143         struct kvm_mmu_page *sp;
3144         struct hlist_node *node;
3145         LIST_HEAD(invalid_list);
3146         u64 entry, gentry;
3147         u64 *spte;
3148         unsigned offset = offset_in_page(gpa);
3149         unsigned pte_size;
3150         unsigned page_offset;
3151         unsigned misaligned;
3152         unsigned quadrant;
3153         int level;
3154         int flooded = 0;
3155         int npte;
3156         int r;
3157         int invlpg_counter;
3158         bool remote_flush, local_flush, zap_page;
3159
3160         zap_page = remote_flush = local_flush = false;
3161
3162         pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
3163
3164         invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);
3165
3166         /*
3167          * Assume that the pte write on a page table of the same type
3168          * as the current vcpu paging mode.  This is nearly always true
3169          * (might be false while changing modes).  Note it is verified later
3170          * by update_pte().
3171          */
3172         if ((is_pae(vcpu) && bytes == 4) || !new) {
3173                 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
3174                 if (is_pae(vcpu)) {
3175                         gpa &= ~(gpa_t)7;
3176                         bytes = 8;
3177                 }
3178                 r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
3179                 if (r)
3180                         gentry = 0;
3181                 new = (const u8 *)&gentry;
3182         }
3183
3184         switch (bytes) {
3185         case 4:
3186                 gentry = *(const u32 *)new;
3187                 break;
3188         case 8:
3189                 gentry = *(const u64 *)new;
3190                 break;
3191         default:
3192                 gentry = 0;
3193                 break;
3194         }
3195
3196         mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
3197         spin_lock(&vcpu->kvm->mmu_lock);
3198         if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
3199                 gentry = 0;
3200         kvm_mmu_access_page(vcpu, gfn);
3201         kvm_mmu_free_some_pages(vcpu);
3202         ++vcpu->kvm->stat.mmu_pte_write;
3203         trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
3204         if (guest_initiated) {
3205                 if (gfn == vcpu->arch.last_pt_write_gfn
3206                     && !last_updated_pte_accessed(vcpu)) {
3207                         ++vcpu->arch.last_pt_write_count;
3208                         if (vcpu->arch.last_pt_write_count >= 3)
3209                                 flooded = 1;
3210                 } else {
3211                         vcpu->arch.last_pt_write_gfn = gfn;
3212                         vcpu->arch.last_pt_write_count = 1;
3213                         vcpu->arch.last_pte_updated = NULL;
3214                 }
3215         }
3216
3217         mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
3218         for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
3219                 pte_size = sp->role.cr4_pae ? 8 : 4;
3220                 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
3221                 misaligned |= bytes < 4;
3222                 if (misaligned || flooded) {
3223                         /*
3224                          * Misaligned accesses are too much trouble to fix
3225                          * up; also, they usually indicate a page is not used
3226                          * as a page table.
3227                          *
3228                          * If we're seeing too many writes to a page,
3229                          * it may no longer be a page table, or we may be
3230                          * forking, in which case it is better to unmap the
3231                          * page.
3232                          */
3233                         pgprintk("misaligned: gpa %llx bytes %d role %x\n",
3234                                  gpa, bytes, sp->role.word);
3235                         zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
3236                                                      &invalid_list);
3237                         ++vcpu->kvm->stat.mmu_flooded;
3238                         continue;
3239                 }
3240                 page_offset = offset;
3241                 level = sp->role.level;
3242                 npte = 1;
3243                 if (!sp->role.cr4_pae) {
3244                         page_offset <<= 1;      /* 32->64 */
3245                         /*
3246                          * A 32-bit pde maps 4MB while the shadow pdes map
3247                          * only 2MB.  So we need to double the offset again
3248                          * and zap two pdes instead of one.
3249                          */
3250                         if (level == PT32_ROOT_LEVEL) {
3251                                 page_offset &= ~7; /* kill rounding error */
3252                                 page_offset <<= 1;
3253                                 npte = 2;
3254                         }
3255                         quadrant = page_offset >> PAGE_SHIFT;
3256                         page_offset &= ~PAGE_MASK;
3257                         if (quadrant != sp->role.quadrant)
3258                                 continue;
3259                 }
3260                 local_flush = true;
3261                 spte = &sp->spt[page_offset / sizeof(*spte)];
3262                 while (npte--) {
3263                         entry = *spte;
3264                         mmu_pte_write_zap_pte(vcpu, sp, spte);
3265                         if (gentry &&
3266                               !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
3267                               & mask.word))
3268                                 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
3269                         if (!remote_flush && need_remote_flush(entry, *spte))
3270                                 remote_flush = true;
3271                         ++spte;
3272                 }
3273         }
3274         mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
3275         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3276         trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
3277         spin_unlock(&vcpu->kvm->mmu_lock);
3278         if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
3279                 kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
3280                 vcpu->arch.update_pte.pfn = bad_pfn;
3281         }
3282 }
3283
3284 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
3285 {
3286         gpa_t gpa;
3287         int r;
3288
3289         if (vcpu->arch.mmu.direct_map)
3290                 return 0;
3291
3292         gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
3293
3294         spin_lock(&vcpu->kvm->mmu_lock);
3295         r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
3296         spin_unlock(&vcpu->kvm->mmu_lock);
3297         return r;
3298 }
3299 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
3300
3301 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
3302 {
3303         LIST_HEAD(invalid_list);
3304
3305         while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES &&
3306                !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
3307                 struct kvm_mmu_page *sp;
3308
3309                 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
3310                                   struct kvm_mmu_page, link);
3311                 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
3312                 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3313                 ++vcpu->kvm->stat.mmu_recycled;
3314         }
3315 }
3316
3317 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
3318 {
3319         int r;
3320         enum emulation_result er;
3321
3322         r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
3323         if (r < 0)
3324                 goto out;
3325
3326         if (!r) {
3327                 r = 1;
3328                 goto out;
3329         }
3330
3331         r = mmu_topup_memory_caches(vcpu);
3332         if (r)
3333                 goto out;
3334
3335         er = emulate_instruction(vcpu, cr2, error_code, 0);
3336
3337         switch (er) {
3338         case EMULATE_DONE:
3339                 return 1;
3340         case EMULATE_DO_MMIO:
3341                 ++vcpu->stat.mmio_exits;
3342                 /* fall through */
3343         case EMULATE_FAIL:
3344                 return 0;
3345         default:
3346                 BUG();
3347         }
3348 out:
3349         return r;
3350 }
3351 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
3352
3353 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
3354 {
3355         vcpu->arch.mmu.invlpg(vcpu, gva);
3356         kvm_mmu_flush_tlb(vcpu);
3357         ++vcpu->stat.invlpg;
3358 }
3359 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
3360
3361 void kvm_enable_tdp(void)
3362 {
3363         tdp_enabled = true;
3364 }
3365 EXPORT_SYMBOL_GPL(kvm_enable_tdp);
3366
3367 void kvm_disable_tdp(void)
3368 {
3369         tdp_enabled = false;
3370 }
3371 EXPORT_SYMBOL_GPL(kvm_disable_tdp);
3372
3373 static void free_mmu_pages(struct kvm_vcpu *vcpu)
3374 {
3375         free_page((unsigned long)vcpu->arch.mmu.pae_root);
3376         if (vcpu->arch.mmu.lm_root != NULL)
3377                 free_page((unsigned long)vcpu->arch.mmu.lm_root);
3378 }
3379
3380 static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
3381 {
3382         struct page *page;
3383         int i;
3384
3385         ASSERT(vcpu);
3386
3387         /*
3388          * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
3389          * Therefore we need to allocate shadow page tables in the first
3390          * 4GB of memory, which happens to fit the DMA32 zone.
3391          */
3392         page = alloc_page(GFP_KERNEL | __GFP_DMA32);
3393         if (!page)
3394                 return -ENOMEM;
3395
3396         vcpu->arch.mmu.pae_root = page_address(page);
3397         for (i = 0; i < 4; ++i)
3398                 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
3399
3400         return 0;
3401 }
3402
3403 int kvm_mmu_create(struct kvm_vcpu *vcpu)
3404 {
3405         ASSERT(vcpu);
3406         ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3407
3408         return alloc_mmu_pages(vcpu);
3409 }
3410
3411 int kvm_mmu_setup(struct kvm_vcpu *vcpu)
3412 {
3413         ASSERT(vcpu);
3414         ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3415
3416         return init_kvm_mmu(vcpu);
3417 }
3418
3419 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3420 {
3421         struct kvm_mmu_page *sp;
3422
3423         list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
3424                 int i;
3425                 u64 *pt;
3426
3427                 if (!test_bit(slot, sp->slot_bitmap))
3428                         continue;
3429
3430                 pt = sp->spt;
3431                 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
3432                         /* avoid RMW */
3433                         if (is_writable_pte(pt[i]))
3434                                 pt[i] &= ~PT_WRITABLE_MASK;
3435         }
3436         kvm_flush_remote_tlbs(kvm);
3437 }
3438
3439 void kvm_mmu_zap_all(struct kvm *kvm)
3440 {
3441         struct kvm_mmu_page *sp, *node;
3442         LIST_HEAD(invalid_list);
3443
3444         spin_lock(&kvm->mmu_lock);
3445 restart:
3446         list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
3447                 if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
3448                         goto restart;
3449
3450         kvm_mmu_commit_zap_page(kvm, &invalid_list);
3451         spin_unlock(&kvm->mmu_lock);
3452 }
3453
3454 static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
3455                                                struct list_head *invalid_list)
3456 {
3457         struct kvm_mmu_page *page;
3458
3459         page = container_of(kvm->arch.active_mmu_pages.prev,
3460                             struct kvm_mmu_page, link);
3461         return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
3462 }
3463
3464 static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
3465 {
3466         struct kvm *kvm;
3467         struct kvm *kvm_freed = NULL;
3468
3469         if (nr_to_scan == 0)
3470                 goto out;
3471
3472         spin_lock(&kvm_lock);
3473
3474         list_for_each_entry(kvm, &vm_list, vm_list) {
3475                 int idx, freed_pages;
3476                 LIST_HEAD(invalid_list);
3477
3478                 idx = srcu_read_lock(&kvm->srcu);
3479                 spin_lock(&kvm->mmu_lock);
3480                 if (!kvm_freed && nr_to_scan > 0 &&
3481                     kvm->arch.n_used_mmu_pages > 0) {
3482                         freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
3483                                                           &invalid_list);
3484                         kvm_freed = kvm;
3485                 }
3486                 nr_to_scan--;
3487
3488                 kvm_mmu_commit_zap_page(kvm, &invalid_list);
3489                 spin_unlock(&kvm->mmu_lock);
3490                 srcu_read_unlock(&kvm->srcu, idx);
3491         }
3492         if (kvm_freed)
3493                 list_move_tail(&kvm_freed->vm_list, &vm_list);
3494
3495         spin_unlock(&kvm_lock);
3496
3497 out:
3498         return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
3499 }
3500
3501 static struct shrinker mmu_shrinker = {
3502         .shrink = mmu_shrink,
3503         .seeks = DEFAULT_SEEKS * 10,
3504 };
3505
3506 static void mmu_destroy_caches(void)
3507 {
3508         if (pte_chain_cache)
3509                 kmem_cache_destroy(pte_chain_cache);
3510         if (rmap_desc_cache)
3511                 kmem_cache_destroy(rmap_desc_cache);
3512         if (mmu_page_header_cache)
3513                 kmem_cache_destroy(mmu_page_header_cache);
3514 }
3515
3516 void kvm_mmu_module_exit(void)
3517 {
3518         mmu_destroy_caches();
3519         percpu_counter_destroy(&kvm_total_used_mmu_pages);
3520         unregister_shrinker(&mmu_shrinker);
3521 }
3522
3523 int kvm_mmu_module_init(void)
3524 {
3525         pte_chain_cache = kmem_cache_create("kvm_pte_chain",
3526                                             sizeof(struct kvm_pte_chain),
3527                                             0, 0, NULL);
3528         if (!pte_chain_cache)
3529                 goto nomem;
3530         rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
3531                                             sizeof(struct kvm_rmap_desc),
3532                                             0, 0, NULL);
3533         if (!rmap_desc_cache)
3534                 goto nomem;
3535
3536         mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
3537                                                   sizeof(struct kvm_mmu_page),
3538                                                   0, 0, NULL);
3539         if (!mmu_page_header_cache)
3540                 goto nomem;
3541
3542         if (percpu_counter_init(&kvm_total_used_mmu_pages, 0))
3543                 goto nomem;
3544
3545         register_shrinker(&mmu_shrinker);
3546
3547         return 0;
3548
3549 nomem:
3550         mmu_destroy_caches();
3551         return -ENOMEM;
3552 }
3553
3554 /*
3555  * Caculate mmu pages needed for kvm.
3556  */
3557 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
3558 {
3559         int i;
3560         unsigned int nr_mmu_pages;
3561         unsigned int  nr_pages = 0;
3562         struct kvm_memslots *slots;
3563
3564         slots = kvm_memslots(kvm);
3565
3566         for (i = 0; i < slots->nmemslots; i++)
3567                 nr_pages += slots->memslots[i].npages;
3568
3569         nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
3570         nr_mmu_pages = max(nr_mmu_pages,
3571                         (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
3572
3573         return nr_mmu_pages;
3574 }
3575
3576 static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
3577                                 unsigned len)
3578 {
3579         if (len > buffer->len)
3580                 return NULL;
3581         return buffer->ptr;
3582 }
3583
3584 static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
3585                                 unsigned len)
3586 {
3587         void *ret;
3588
3589         ret = pv_mmu_peek_buffer(buffer, len);
3590         if (!ret)
3591                 return ret;
3592         buffer->ptr += len;
3593         buffer->len -= len;
3594         buffer->processed += len;
3595         return ret;
3596 }
3597
3598 static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
3599                              gpa_t addr, gpa_t value)
3600 {
3601         int bytes = 8;
3602         int r;
3603
3604         if (!is_long_mode(vcpu) && !is_pae(vcpu))
3605                 bytes = 4;
3606
3607         r = mmu_topup_memory_caches(vcpu);
3608         if (r)
3609                 return r;
3610
3611         if (!emulator_write_phys(vcpu, addr, &value, bytes))
3612                 return -EFAULT;
3613
3614         return 1;
3615 }
3616
3617 static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
3618 {
3619         (void)kvm_set_cr3(vcpu, vcpu->arch.cr3);
3620         return 1;
3621 }
3622
3623 static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
3624 {
3625         spin_lock(&vcpu->kvm->mmu_lock);
3626         mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
3627         spin_unlock(&vcpu->kvm->mmu_lock);
3628         return 1;
3629 }
3630
3631 static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
3632                              struct kvm_pv_mmu_op_buffer *buffer)
3633 {
3634         struct kvm_mmu_op_header *header;
3635
3636         header = pv_mmu_peek_buffer(buffer, sizeof *header);
3637         if (!header)
3638                 return 0;
3639         switch (header->op) {
3640         case KVM_MMU_OP_WRITE_PTE: {
3641                 struct kvm_mmu_op_write_pte *wpte;
3642
3643                 wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
3644                 if (!wpte)
3645                         return 0;
3646                 return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
3647                                         wpte->pte_val);
3648         }
3649         case KVM_MMU_OP_FLUSH_TLB: {
3650                 struct kvm_mmu_op_flush_tlb *ftlb;
3651
3652                 ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
3653                 if (!ftlb)
3654                         return 0;
3655                 return kvm_pv_mmu_flush_tlb(vcpu);
3656         }
3657         case KVM_MMU_OP_RELEASE_PT: {
3658                 struct kvm_mmu_op_release_pt *rpt;
3659
3660                 rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
3661                 if (!rpt)
3662                         return 0;
3663                 return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
3664         }
3665         default: return 0;
3666         }
3667 }
3668
3669 int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
3670                   gpa_t addr, unsigned long *ret)
3671 {
3672         int r;
3673         struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
3674
3675         buffer->ptr = buffer->buf;
3676         buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
3677         buffer->processed = 0;
3678
3679         r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
3680         if (r)
3681                 goto out;
3682
3683         while (buffer->len) {
3684                 r = kvm_pv_mmu_op_one(vcpu, buffer);
3685                 if (r < 0)
3686                         goto out;
3687                 if (r == 0)
3688                         break;
3689         }
3690
3691         r = 1;
3692 out:
3693         *ret = buffer->processed;
3694         return r;
3695 }
3696
3697 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
3698 {
3699         struct kvm_shadow_walk_iterator iterator;
3700         int nr_sptes = 0;
3701
3702         spin_lock(&vcpu->kvm->mmu_lock);
3703         for_each_shadow_entry(vcpu, addr, iterator) {
3704                 sptes[iterator.level-1] = *iterator.sptep;
3705                 nr_sptes++;
3706                 if (!is_shadow_present_pte(*iterator.sptep))
3707                         break;
3708         }
3709         spin_unlock(&vcpu->kvm->mmu_lock);
3710
3711         return nr_sptes;
3712 }
3713 EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
3714
3715 #ifdef CONFIG_KVM_MMU_AUDIT
3716 #include "mmu_audit.c"
3717 #else
3718 static void mmu_audit_disable(void) { }
3719 #endif
3720
3721 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
3722 {
3723         ASSERT(vcpu);
3724
3725         destroy_kvm_mmu(vcpu);
3726         free_mmu_pages(vcpu);
3727         mmu_free_memory_caches(vcpu);
3728         mmu_audit_disable();
3729 }