arch/x86/kvm/mmu.c
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *	Yaniv Kamay  <yaniv@qumranet.com>
 *	Avi Kivity   <avi@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "irq.h"
#include "mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"
#include "cpuid.h"

#include <linux/kvm_host.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/moduleparam.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/hugetlb.h>
#include <linux/compiler.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <linux/kern_levels.h>

#include <asm/page.h>
#include <asm/pat.h>
#include <asm/cmpxchg.h>
#include <asm/io.h>
#include <asm/vmx.h>
#include <asm/kvm_page_track.h>
#include "trace.h"

/*
 * When set to true, this variable enables Two-Dimensional Paging (TDP),
 * where the hardware walks two page tables:
 * 1. the guest-virtual to guest-physical translation
 * 2. while doing 1., the guest-physical to host-physical translation
 * If the hardware supports this, shadow paging is not needed.
 */
bool tdp_enabled = false;

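/*
 * Illustrative note (added, not in the original source): with 4-level
 * paging on both dimensions, a worst-case TDP walk resolves each of the
 * guest's table references through the host's 4 levels, i.e. up to
 * 4*4 + 4 + 4 = 24 memory accesses per guest TLB miss.  TDP trades this
 * higher per-miss cost for not having to maintain shadow page tables.
 */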
enum {
	AUDIT_PRE_PAGE_FAULT,
	AUDIT_POST_PAGE_FAULT,
	AUDIT_PRE_PTE_WRITE,
	AUDIT_POST_PTE_WRITE,
	AUDIT_PRE_SYNC,
	AUDIT_POST_SYNC
};

#undef MMU_DEBUG

#ifdef MMU_DEBUG
static bool dbg = 0;
module_param(dbg, bool, 0644);

#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
#define MMU_WARN_ON(x) WARN_ON(x)
#else
#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)
#define MMU_WARN_ON(x) do { } while (0)
#endif

#define PTE_PREFETCH_NUM		8

#define PT_FIRST_AVAIL_BITS_SHIFT 10
#define PT64_SECOND_AVAIL_BITS_SHIFT 52

#define PT64_LEVEL_BITS 9

#define PT64_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)

#define PT64_INDEX(address, level)\
	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))


#define PT32_LEVEL_BITS 10

#define PT32_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)

#define PT32_LVL_OFFSET_MASK(level) \
	(PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT32_LEVEL_BITS))) - 1))

#define PT32_INDEX(address, level)\
	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))


#define PT64_BASE_ADDR_MASK __sme_clr((((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)))
#define PT64_DIR_BASE_ADDR_MASK \
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
#define PT64_LVL_ADDR_MASK(level) \
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT64_LEVEL_BITS))) - 1))
#define PT64_LVL_OFFSET_MASK(level) \
	(PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT64_LEVEL_BITS))) - 1))

#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
#define PT32_LVL_ADDR_MASK(level) \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
					    * PT32_LEVEL_BITS))) - 1))

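/*
 * Worked example (added for illustration): with PAGE_SHIFT == 12 and
 * PT64_LEVEL_BITS == 9, PT64_LEVEL_SHIFT(1) == 12, PT64_LEVEL_SHIFT(2) == 21,
 * PT64_LEVEL_SHIFT(3) == 30 and PT64_LEVEL_SHIFT(4) == 39; so for instance
 * PT64_INDEX(addr, 2) == (addr >> 21) & 511, i.e. the PDE index of @addr.
 */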
#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
			| shadow_x_mask | shadow_nx_mask | shadow_me_mask)

#define ACC_EXEC_MASK    1
#define ACC_WRITE_MASK   PT_WRITABLE_MASK
#define ACC_USER_MASK    PT_USER_MASK
#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)

/* The mask for the R/X bits in EPT PTEs */
#define PT64_EPT_READABLE_MASK			0x1ull
#define PT64_EPT_EXECUTABLE_MASK		0x4ull

#include <trace/events/kvm.h>

#define CREATE_TRACE_POINTS
#include "mmutrace.h"

#define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
#define SPTE_MMU_WRITEABLE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))

#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)

/* make pte_list_desc fit well in cache line */
#define PTE_LIST_EXT 3

/*
 * Return values of handle_mmio_page_fault and mmu.page_fault:
 * RET_PF_RETRY: let CPU fault again on the address.
 * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
 *
 * For handle_mmio_page_fault only:
 * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
 */
enum {
	RET_PF_RETRY = 0,
	RET_PF_EMULATE = 1,
	RET_PF_INVALID = 2,
};

struct pte_list_desc {
	u64 *sptes[PTE_LIST_EXT];
	struct pte_list_desc *more;
};

struct kvm_shadow_walk_iterator {
	u64 addr;
	hpa_t shadow_addr;
	u64 *sptep;
	int level;
	unsigned index;
};

static const union kvm_mmu_page_role mmu_base_role_mask = {
	.cr0_wp = 1,
	.cr4_pae = 1,
	.nxe = 1,
	.smep_andnot_wp = 1,
	.smap_andnot_wp = 1,
	.smm = 1,
	.guest_mode = 1,
	.ad_disabled = 1,
};

#define for_each_shadow_entry(_vcpu, _addr, _walker)    \
	for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
	     shadow_walk_okay(&(_walker));			\
	     shadow_walk_next(&(_walker)))

#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)	\
	for (shadow_walk_init(&(_walker), _vcpu, _addr);		\
	     shadow_walk_okay(&(_walker)) &&				\
		({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });	\
	     __shadow_walk_next(&(_walker), spte))

static struct kmem_cache *pte_list_desc_cache;
static struct kmem_cache *mmu_page_header_cache;
static struct percpu_counter kvm_total_used_mmu_pages;

static u64 __read_mostly shadow_nx_mask;
static u64 __read_mostly shadow_x_mask;	/* mutual exclusive with nx_mask */
static u64 __read_mostly shadow_user_mask;
static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;
static u64 __read_mostly shadow_mmio_mask;
static u64 __read_mostly shadow_mmio_value;
static u64 __read_mostly shadow_present_mask;
static u64 __read_mostly shadow_me_mask;

/*
 * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value.
 * Non-present SPTEs with shadow_acc_track_value set are in place for access
 * tracking.
 */
static u64 __read_mostly shadow_acc_track_mask;
static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK;

/*
 * The mask/shift to use for saving the original R/X bits when marking the PTE
 * as not-present for access tracking purposes. We do not save the W bit as the
 * PTEs being access tracked also need to be dirty tracked, so the W bit will be
 * restored only when a write is attempted to the page.
 */
static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
						    PT64_EPT_EXECUTABLE_MASK;
static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;

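/*
 * Illustrative sketch (added): assuming shadow_acc_track_mask covers the RWX
 * permission bits, as an EPT setup would configure it, marking a PTE for
 * access tracking copies its R/X bits (bits 0 and 2) up into bits 52-54:
 *	spte |= (spte & shadow_acc_track_saved_bits_mask)
 *			<< shadow_acc_track_saved_bits_shift;
 * and then clears the low permission bits so the PTE becomes not-present;
 * restore_acc_track_spte() below shifts them back down.  The W bit is
 * deliberately not saved, as the comment above explains.
 */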
static void mmu_spte_set(u64 *sptep, u64 spte);
static union kvm_mmu_page_role
kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);

void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
{
	BUG_ON((mmio_mask & mmio_value) != mmio_value);
	shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK;
	shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);

static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
{
	return sp->role.ad_disabled;
}

static inline bool spte_ad_enabled(u64 spte)
{
	MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
	return !(spte & shadow_acc_track_value);
}

static inline u64 spte_shadow_accessed_mask(u64 spte)
{
	MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
	return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
}

static inline u64 spte_shadow_dirty_mask(u64 spte)
{
	MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
	return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
}

static inline bool is_access_track_spte(u64 spte)
{
	return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
}

/*
 * The low bit of the generation number is always presumed to be zero.
 * This disables mmio caching during memslot updates.  The concept is
 * similar to a seqcount but instead of retrying the access we just punt
 * and ignore the cache.
 *
 * spte bits 3-11 are used as bits 1-9 of the generation number,
 * the bits 52-61 are used as bits 10-19 of the generation number.
 */
#define MMIO_SPTE_GEN_LOW_SHIFT		2
#define MMIO_SPTE_GEN_HIGH_SHIFT	52

#define MMIO_GEN_SHIFT			20
#define MMIO_GEN_LOW_SHIFT		10
#define MMIO_GEN_LOW_MASK		((1 << MMIO_GEN_LOW_SHIFT) - 2)
#define MMIO_GEN_MASK			((1 << MMIO_GEN_SHIFT) - 1)

static u64 generation_mmio_spte_mask(unsigned int gen)
{
	u64 mask;

	WARN_ON(gen & ~MMIO_GEN_MASK);

	mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
	mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
	return mask;
}

static unsigned int get_mmio_spte_generation(u64 spte)
{
	unsigned int gen;

	spte &= ~shadow_mmio_mask;

	gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK;
	gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT;
	return gen;
}

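/*
 * Worked example (added): for gen == 0x3ff, generation_mmio_spte_mask()
 * places gen bits 1-9 at spte bits 3-11 and gen bits 10-19 at spte bits
 * 52-61:
 *	mask  = (0x3ff & MMIO_GEN_LOW_MASK) << 2;	// 0x3fe << 2
 *	mask |= ((u64)0x3ff >> 10) << 52;		// 0
 * get_mmio_spte_generation() simply reverses this packing; bit 0 of the
 * generation is always treated as zero, as described above.
 */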
static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu)
{
	return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK;
}

static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
			   unsigned access)
{
	unsigned int gen = kvm_current_mmio_generation(vcpu);
	u64 mask = generation_mmio_spte_mask(gen);

	access &= ACC_WRITE_MASK | ACC_USER_MASK;
	mask |= shadow_mmio_value | access | gfn << PAGE_SHIFT;

	trace_mark_mmio_spte(sptep, gfn, access, gen);
	mmu_spte_set(sptep, mask);
}

static bool is_mmio_spte(u64 spte)
{
	return (spte & shadow_mmio_mask) == shadow_mmio_value;
}

static gfn_t get_mmio_spte_gfn(u64 spte)
{
	u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask;
	return (spte & ~mask) >> PAGE_SHIFT;
}

static unsigned get_mmio_spte_access(u64 spte)
{
	u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask;
	return (spte & ~mask) & ~PAGE_MASK;
}

static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
			  kvm_pfn_t pfn, unsigned access)
{
	if (unlikely(is_noslot_pfn(pfn))) {
		mark_mmio_spte(vcpu, sptep, gfn, access);
		return true;
	}

	return false;
}

static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
{
	unsigned int kvm_gen, spte_gen;

	kvm_gen = kvm_current_mmio_generation(vcpu);
	spte_gen = get_mmio_spte_generation(spte);

	trace_check_mmio_spte(spte, kvm_gen, spte_gen);
	return likely(kvm_gen == spte_gen);
}

/*
 * Sets the shadow PTE masks used by the MMU.
 *
 * Assumptions:
 *  - Setting either @accessed_mask or @dirty_mask requires setting both
 *  - At least one of @accessed_mask or @acc_track_mask must be set
 */
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
		u64 acc_track_mask, u64 me_mask)
{
	BUG_ON(!dirty_mask != !accessed_mask);
	BUG_ON(!accessed_mask && !acc_track_mask);
	BUG_ON(acc_track_mask & shadow_acc_track_value);

	shadow_user_mask = user_mask;
	shadow_accessed_mask = accessed_mask;
	shadow_dirty_mask = dirty_mask;
	shadow_nx_mask = nx_mask;
	shadow_x_mask = x_mask;
	shadow_present_mask = p_mask;
	shadow_acc_track_mask = acc_track_mask;
	shadow_me_mask = me_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);

static void kvm_mmu_clear_all_pte_masks(void)
{
	shadow_user_mask = 0;
	shadow_accessed_mask = 0;
	shadow_dirty_mask = 0;
	shadow_nx_mask = 0;
	shadow_x_mask = 0;
	shadow_mmio_mask = 0;
	shadow_present_mask = 0;
	shadow_acc_track_mask = 0;
}

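/*
 * Illustrative sketch (hypothetical values, not the real vmx/svm call sites):
 * a configuration with hardware A/D bits might pass e.g.
 *	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK,
 *			      PT64_NX_MASK, 0, PT_PRESENT_MASK, 0, 0);
 * whereas a setup without A/D bits must pass accessed_mask == dirty_mask == 0
 * together with a non-zero acc_track_mask, or the BUG_ON()s above fire.
 */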
6aa8b732
AK
409static int is_cpuid_PSE36(void)
410{
411 return 1;
412}
413
73b1087e
AK
414static int is_nx(struct kvm_vcpu *vcpu)
415{
f6801dff 416 return vcpu->arch.efer & EFER_NX;
73b1087e
AK
417}
418
c7addb90
AK
419static int is_shadow_present_pte(u64 pte)
420{
f160c7b7 421 return (pte != 0) && !is_mmio_spte(pte);
c7addb90
AK
422}
423
05da4558
MT
424static int is_large_pte(u64 pte)
425{
426 return pte & PT_PAGE_SIZE_MASK;
427}
428
776e6633
MT
429static int is_last_spte(u64 pte, int level)
430{
431 if (level == PT_PAGE_TABLE_LEVEL)
432 return 1;
852e3c19 433 if (is_large_pte(pte))
776e6633
MT
434 return 1;
435 return 0;
436}
437
d3e328f2
JS
438static bool is_executable_pte(u64 spte)
439{
440 return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
441}
442
ba049e93 443static kvm_pfn_t spte_to_pfn(u64 pte)
0b49ea86 444{
35149e21 445 return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
0b49ea86
AK
446}
447
da928521
AK
448static gfn_t pse36_gfn_delta(u32 gpte)
449{
450 int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
451
452 return (gpte & PT32_DIR_PSE36_MASK) << shift;
453}
454
603e0651 455#ifdef CONFIG_X86_64
d555c333 456static void __set_spte(u64 *sptep, u64 spte)
e663ee64 457{
b19ee2ff 458 WRITE_ONCE(*sptep, spte);
e663ee64
AK
459}
460
603e0651 461static void __update_clear_spte_fast(u64 *sptep, u64 spte)
a9221dd5 462{
b19ee2ff 463 WRITE_ONCE(*sptep, spte);
603e0651
XG
464}
465
466static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
467{
468 return xchg(sptep, spte);
469}
c2a2ac2b
XG
470
471static u64 __get_spte_lockless(u64 *sptep)
472{
6aa7de05 473 return READ_ONCE(*sptep);
c2a2ac2b 474}
a9221dd5 475#else
603e0651
XG
476union split_spte {
477 struct {
478 u32 spte_low;
479 u32 spte_high;
480 };
481 u64 spte;
482};
a9221dd5 483
c2a2ac2b
XG
484static void count_spte_clear(u64 *sptep, u64 spte)
485{
486 struct kvm_mmu_page *sp = page_header(__pa(sptep));
487
488 if (is_shadow_present_pte(spte))
489 return;
490
491 /* Ensure the spte is completely set before we increase the count */
492 smp_wmb();
493 sp->clear_spte_count++;
494}
495
603e0651
XG
496static void __set_spte(u64 *sptep, u64 spte)
497{
498 union split_spte *ssptep, sspte;
a9221dd5 499
603e0651
XG
500 ssptep = (union split_spte *)sptep;
501 sspte = (union split_spte)spte;
502
503 ssptep->spte_high = sspte.spte_high;
504
	/*
	 * If we map the spte from nonpresent to present, we should store
	 * the high bits first and only then set the present bit, so the
	 * CPU cannot fetch this spte while we are setting it.
	 */
510 smp_wmb();
511
b19ee2ff 512 WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
a9221dd5
AK
513}
514
603e0651
XG
515static void __update_clear_spte_fast(u64 *sptep, u64 spte)
516{
517 union split_spte *ssptep, sspte;
518
519 ssptep = (union split_spte *)sptep;
520 sspte = (union split_spte)spte;
521
b19ee2ff 522 WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
603e0651
XG
523
	/*
	 * If we map the spte from present to nonpresent, we should clear
	 * the present bit first, to avoid the vCPU fetching stale high bits.
	 */
528 smp_wmb();
529
530 ssptep->spte_high = sspte.spte_high;
c2a2ac2b 531 count_spte_clear(sptep, spte);
603e0651
XG
532}
533
534static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
535{
536 union split_spte *ssptep, sspte, orig;
537
538 ssptep = (union split_spte *)sptep;
539 sspte = (union split_spte)spte;
540
541 /* xchg acts as a barrier before the setting of the high bits */
542 orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
41bc3186
ZJ
543 orig.spte_high = ssptep->spte_high;
544 ssptep->spte_high = sspte.spte_high;
c2a2ac2b 545 count_spte_clear(sptep, spte);
603e0651
XG
546
547 return orig.spte;
548}
c2a2ac2b
XG
549
550/*
551 * The idea using the light way get the spte on x86_32 guest is from
552 * gup_get_pte(arch/x86/mm/gup.c).
accaefe0
XG
553 *
554 * An spte tlb flush may be pending, because kvm_set_pte_rmapp
555 * coalesces them and we are running out of the MMU lock. Therefore
556 * we need to protect against in-progress updates of the spte.
557 *
558 * Reading the spte while an update is in progress may get the old value
559 * for the high part of the spte. The race is fine for a present->non-present
560 * change (because the high part of the spte is ignored for non-present spte),
561 * but for a present->present change we must reread the spte.
562 *
563 * All such changes are done in two steps (present->non-present and
564 * non-present->present), hence it is enough to count the number of
565 * present->non-present updates: if it changed while reading the spte,
566 * we might have hit the race. This is done using clear_spte_count.
c2a2ac2b
XG
567 */
568static u64 __get_spte_lockless(u64 *sptep)
569{
570 struct kvm_mmu_page *sp = page_header(__pa(sptep));
571 union split_spte spte, *orig = (union split_spte *)sptep;
572 int count;
573
574retry:
575 count = sp->clear_spte_count;
576 smp_rmb();
577
578 spte.spte_low = orig->spte_low;
579 smp_rmb();
580
581 spte.spte_high = orig->spte_high;
582 smp_rmb();
583
584 if (unlikely(spte.spte_low != orig->spte_low ||
585 count != sp->clear_spte_count))
586 goto retry;
587
588 return spte.spte;
589}
603e0651
XG
590#endif
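/*
 * Illustrative note (added): a present->present update is performed as
 * present->non-present (which bumps clear_spte_count) followed by
 * non-present->present, so a lockless reader that samples clear_spte_count,
 * reads spte_low and spte_high, and then sees either spte_low or the count
 * changed simply retries -- the same seqcount-like pattern used by
 * gup_get_pte().
 */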
591
ea4114bc 592static bool spte_can_locklessly_be_made_writable(u64 spte)
c7ba5b48 593{
feb3eb70
GN
594 return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
595 (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
c7ba5b48
XG
596}
597
8672b721
XG
598static bool spte_has_volatile_bits(u64 spte)
599{
f160c7b7
JS
600 if (!is_shadow_present_pte(spte))
601 return false;
602
	/*
	 * Always atomically update the spte if it can be updated
	 * outside of the mmu-lock: this ensures the dirty bit is not
	 * lost and gives us a stable is_writable_pte(), so that a
	 * required TLB flush is not missed.
	 */
f160c7b7
JS
609 if (spte_can_locklessly_be_made_writable(spte) ||
610 is_access_track_spte(spte))
c7ba5b48
XG
611 return true;
612
ac8d57e5 613 if (spte_ad_enabled(spte)) {
f160c7b7
JS
614 if ((spte & shadow_accessed_mask) == 0 ||
615 (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
616 return true;
617 }
8672b721 618
f160c7b7 619 return false;
8672b721
XG
620}
621
83ef6c81 622static bool is_accessed_spte(u64 spte)
4132779b 623{
ac8d57e5
PF
624 u64 accessed_mask = spte_shadow_accessed_mask(spte);
625
626 return accessed_mask ? spte & accessed_mask
627 : !is_access_track_spte(spte);
4132779b
XG
628}
629
83ef6c81 630static bool is_dirty_spte(u64 spte)
7e71a59b 631{
ac8d57e5
PF
632 u64 dirty_mask = spte_shadow_dirty_mask(spte);
633
634 return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
7e71a59b
KH
635}
636
1df9f2dc
XG
637/* Rules for using mmu_spte_set:
638 * Set the sptep from nonpresent to present.
639 * Note: the sptep being assigned *must* be either not present
640 * or in a state where the hardware will not attempt to update
641 * the spte.
642 */
643static void mmu_spte_set(u64 *sptep, u64 new_spte)
644{
645 WARN_ON(is_shadow_present_pte(*sptep));
646 __set_spte(sptep, new_spte);
647}
648
f39a058d
JS
649/*
650 * Update the SPTE (excluding the PFN), but do not track changes in its
651 * accessed/dirty status.
1df9f2dc 652 */
f39a058d 653static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
b79b93f9 654{
c7ba5b48 655 u64 old_spte = *sptep;
4132779b 656
afd28fe1 657 WARN_ON(!is_shadow_present_pte(new_spte));
b79b93f9 658
6e7d0354
XG
659 if (!is_shadow_present_pte(old_spte)) {
660 mmu_spte_set(sptep, new_spte);
f39a058d 661 return old_spte;
6e7d0354 662 }
4132779b 663
c7ba5b48 664 if (!spte_has_volatile_bits(old_spte))
603e0651 665 __update_clear_spte_fast(sptep, new_spte);
4132779b 666 else
603e0651 667 old_spte = __update_clear_spte_slow(sptep, new_spte);
4132779b 668
83ef6c81
JS
669 WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
670
f39a058d
JS
671 return old_spte;
672}
673
/* Rules for using mmu_spte_update:
 * Update the state bits; the mapped pfn is never changed here.
 *
 * Whenever we overwrite a writable spte with a read-only one we
 * should flush remote TLBs. Otherwise rmap_write_protect
 * will find a read-only spte, even though the writable spte
 * might still be cached in a CPU's TLB; the return value indicates
 * this case.
 *
 * Returns true if the TLB needs to be flushed
 */
685static bool mmu_spte_update(u64 *sptep, u64 new_spte)
686{
687 bool flush = false;
688 u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
689
690 if (!is_shadow_present_pte(old_spte))
691 return false;
692
	/*
	 * Updating the spte outside of the mmu-lock is safe, since
	 * we always update it atomically; see the comments in
	 * spte_has_volatile_bits().
	 */
ea4114bc 698 if (spte_can_locklessly_be_made_writable(old_spte) &&
7f31c959 699 !is_writable_pte(new_spte))
83ef6c81 700 flush = true;
4132779b 701
7e71a59b 702 /*
83ef6c81 703 * Flush TLB when accessed/dirty states are changed in the page tables,
7e71a59b
KH
704 * to guarantee consistency between TLB and page tables.
705 */
7e71a59b 706
83ef6c81
JS
707 if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
708 flush = true;
4132779b 709 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
83ef6c81
JS
710 }
711
712 if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
713 flush = true;
4132779b 714 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
83ef6c81 715 }
6e7d0354 716
83ef6c81 717 return flush;
b79b93f9
AK
718}
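/*
 * Summary of the cases above (added for illustration): mmu_spte_update()
 * requests a TLB flush when a lockless-writable spte loses the W bit, when
 * an Accessed spte becomes non-Accessed (also calling kvm_set_pfn_accessed),
 * and when a Dirty spte becomes non-Dirty (also calling kvm_set_pfn_dirty).
 */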
719
/*
 * Rules for using mmu_spte_clear_track_bits:
 * It sets the sptep from present to nonpresent and tracks the
 * state bits; it is used to clear a last-level sptep.
 * Returns non-zero if the PTE was previously valid.
 */
726static int mmu_spte_clear_track_bits(u64 *sptep)
727{
ba049e93 728 kvm_pfn_t pfn;
1df9f2dc
XG
729 u64 old_spte = *sptep;
730
731 if (!spte_has_volatile_bits(old_spte))
603e0651 732 __update_clear_spte_fast(sptep, 0ull);
1df9f2dc 733 else
603e0651 734 old_spte = __update_clear_spte_slow(sptep, 0ull);
1df9f2dc 735
afd28fe1 736 if (!is_shadow_present_pte(old_spte))
1df9f2dc
XG
737 return 0;
738
739 pfn = spte_to_pfn(old_spte);

	/*
	 * KVM does not hold a refcount on the pages used by the
	 * kvm mmu; before such a page can be reclaimed, it must be
	 * unmapped from the mmu first.
	 */
bf4bea8e 746 WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
86fde74c 747
83ef6c81 748 if (is_accessed_spte(old_spte))
1df9f2dc 749 kvm_set_pfn_accessed(pfn);
83ef6c81
JS
750
751 if (is_dirty_spte(old_spte))
1df9f2dc 752 kvm_set_pfn_dirty(pfn);
83ef6c81 753
1df9f2dc
XG
754 return 1;
755}
756
/*
 * Rules for using mmu_spte_clear_no_track:
 * Directly clear the spte without caring about its state bits;
 * it is used for non-last-level (upper level) sptes.
 */
762static void mmu_spte_clear_no_track(u64 *sptep)
763{
603e0651 764 __update_clear_spte_fast(sptep, 0ull);
1df9f2dc
XG
765}
766
c2a2ac2b
XG
767static u64 mmu_spte_get_lockless(u64 *sptep)
768{
769 return __get_spte_lockless(sptep);
770}
771
f160c7b7
JS
772static u64 mark_spte_for_access_track(u64 spte)
773{
ac8d57e5 774 if (spte_ad_enabled(spte))
f160c7b7
JS
775 return spte & ~shadow_accessed_mask;
776
ac8d57e5 777 if (is_access_track_spte(spte))
f160c7b7
JS
778 return spte;
779
780 /*
20d65236
JS
781 * Making an Access Tracking PTE will result in removal of write access
782 * from the PTE. So, verify that we will be able to restore the write
783 * access in the fast page fault path later on.
f160c7b7
JS
784 */
785 WARN_ONCE((spte & PT_WRITABLE_MASK) &&
786 !spte_can_locklessly_be_made_writable(spte),
787 "kvm: Writable SPTE is not locklessly dirty-trackable\n");
788
789 WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
790 shadow_acc_track_saved_bits_shift),
791 "kvm: Access Tracking saved bit locations are not zero\n");
792
793 spte |= (spte & shadow_acc_track_saved_bits_mask) <<
794 shadow_acc_track_saved_bits_shift;
795 spte &= ~shadow_acc_track_mask;
f160c7b7
JS
796
797 return spte;
798}
799
d3e328f2
JS
800/* Restore an acc-track PTE back to a regular PTE */
801static u64 restore_acc_track_spte(u64 spte)
802{
803 u64 new_spte = spte;
804 u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
805 & shadow_acc_track_saved_bits_mask;
806
ac8d57e5 807 WARN_ON_ONCE(spte_ad_enabled(spte));
d3e328f2
JS
808 WARN_ON_ONCE(!is_access_track_spte(spte));
809
810 new_spte &= ~shadow_acc_track_mask;
811 new_spte &= ~(shadow_acc_track_saved_bits_mask <<
812 shadow_acc_track_saved_bits_shift);
813 new_spte |= saved_bits;
814
815 return new_spte;
816}
817
f160c7b7
JS
818/* Returns the Accessed status of the PTE and resets it at the same time. */
819static bool mmu_spte_age(u64 *sptep)
820{
821 u64 spte = mmu_spte_get_lockless(sptep);
822
823 if (!is_accessed_spte(spte))
824 return false;
825
ac8d57e5 826 if (spte_ad_enabled(spte)) {
f160c7b7
JS
827 clear_bit((ffs(shadow_accessed_mask) - 1),
828 (unsigned long *)sptep);
829 } else {
830 /*
831 * Capture the dirty status of the page, so that it doesn't get
832 * lost when the SPTE is marked for access tracking.
833 */
834 if (is_writable_pte(spte))
835 kvm_set_pfn_dirty(spte_to_pfn(spte));
836
837 spte = mark_spte_for_access_track(spte);
838 mmu_spte_update_no_track(sptep, spte);
839 }
840
841 return true;
842}
843
c2a2ac2b
XG
844static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
845{
c142786c
AK
846 /*
847 * Prevent page table teardown by making any free-er wait during
848 * kvm_flush_remote_tlbs() IPI to all active vcpus.
849 */
850 local_irq_disable();
36ca7e0a 851
c142786c
AK
852 /*
853 * Make sure a following spte read is not reordered ahead of the write
854 * to vcpu->mode.
855 */
36ca7e0a 856 smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
c2a2ac2b
XG
857}
858
859static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
860{
c142786c
AK
861 /*
862 * Make sure the write to vcpu->mode is not reordered in front of
863 * reads to sptes. If it does, kvm_commit_zap_page() can see us
864 * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
865 */
36ca7e0a 866 smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
c142786c 867 local_irq_enable();
c2a2ac2b
XG
868}
869
e2dec939 870static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
2e3e5882 871 struct kmem_cache *base_cache, int min)
714b93da
AK
872{
873 void *obj;
874
875 if (cache->nobjs >= min)
e2dec939 876 return 0;
714b93da 877 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
2e3e5882 878 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
714b93da 879 if (!obj)
e2dec939 880 return -ENOMEM;
714b93da
AK
881 cache->objects[cache->nobjs++] = obj;
882 }
e2dec939 883 return 0;
714b93da
AK
884}
885
f759e2b4
XG
886static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
887{
888 return cache->nobjs;
889}
890
e8ad9a70
XG
891static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
892 struct kmem_cache *cache)
714b93da
AK
893{
894 while (mc->nobjs)
e8ad9a70 895 kmem_cache_free(cache, mc->objects[--mc->nobjs]);
714b93da
AK
896}
897
c1158e63 898static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
2e3e5882 899 int min)
c1158e63 900{
842f22ed 901 void *page;
c1158e63
AK
902
903 if (cache->nobjs >= min)
904 return 0;
905 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
842f22ed 906 page = (void *)__get_free_page(GFP_KERNEL);
c1158e63
AK
907 if (!page)
908 return -ENOMEM;
842f22ed 909 cache->objects[cache->nobjs++] = page;
c1158e63
AK
910 }
911 return 0;
912}
913
914static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
915{
916 while (mc->nobjs)
c4d198d5 917 free_page((unsigned long)mc->objects[--mc->nobjs]);
c1158e63
AK
918}
919
2e3e5882 920static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
714b93da 921{
e2dec939
AK
922 int r;
923
53c07b18 924 r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
67052b35 925 pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
d3d25b04
AK
926 if (r)
927 goto out;
ad312c7c 928 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
d3d25b04
AK
929 if (r)
930 goto out;
ad312c7c 931 r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
2e3e5882 932 mmu_page_header_cache, 4);
e2dec939
AK
933out:
934 return r;
714b93da
AK
935}
936
937static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
938{
53c07b18
XG
939 mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
940 pte_list_desc_cache);
ad312c7c 941 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
e8ad9a70
XG
942 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
943 mmu_page_header_cache);
714b93da
AK
944}
945
80feb89a 946static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
714b93da
AK
947{
948 void *p;
949
950 BUG_ON(!mc->nobjs);
951 p = mc->objects[--mc->nobjs];
714b93da
AK
952 return p;
953}
954
53c07b18 955static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
714b93da 956{
80feb89a 957 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
714b93da
AK
958}
959
53c07b18 960static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
714b93da 961{
53c07b18 962 kmem_cache_free(pte_list_desc_cache, pte_list_desc);
714b93da
AK
963}
964
2032a93d
LJ
965static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
966{
967 if (!sp->role.direct)
968 return sp->gfns[index];
969
970 return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
971}
972
973static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
974{
975 if (sp->role.direct)
976 BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
977 else
978 sp->gfns[index] = gfn;
979}
980
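/*
 * Worked example (added): for a direct-mapped shadow page at level 2,
 * kvm_mmu_page_get_gfn(sp, i) == sp->gfn + (i << 9), i.e. each of the 512
 * entries covers a 512-gfn slice of the region starting at sp->gfn.
 */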
05da4558 981/*
d4dbf470
TY
982 * Return the pointer to the large page information for a given gfn,
983 * handling slots that are not large page aligned.
05da4558 984 */
d4dbf470
TY
985static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
986 struct kvm_memory_slot *slot,
987 int level)
05da4558
MT
988{
989 unsigned long idx;
990
fb03cb6f 991 idx = gfn_to_index(gfn, slot->base_gfn, level);
db3fe4eb 992 return &slot->arch.lpage_info[level - 2][idx];
05da4558
MT
993}
994
547ffaed
XG
995static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
996 gfn_t gfn, int count)
997{
998 struct kvm_lpage_info *linfo;
999 int i;
1000
1001 for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
1002 linfo = lpage_info_slot(gfn, slot, i);
1003 linfo->disallow_lpage += count;
1004 WARN_ON(linfo->disallow_lpage < 0);
1005 }
1006}
1007
1008void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
1009{
1010 update_gfn_disallow_lpage_count(slot, gfn, 1);
1011}
1012
1013void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
1014{
1015 update_gfn_disallow_lpage_count(slot, gfn, -1);
1016}
1017
3ed1a478 1018static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
05da4558 1019{
699023e2 1020 struct kvm_memslots *slots;
d25797b2 1021 struct kvm_memory_slot *slot;
3ed1a478 1022 gfn_t gfn;
05da4558 1023
56ca57f9 1024 kvm->arch.indirect_shadow_pages++;
3ed1a478 1025 gfn = sp->gfn;
699023e2
PB
1026 slots = kvm_memslots_for_spte_role(kvm, sp->role);
1027 slot = __gfn_to_memslot(slots, gfn);
56ca57f9
XG
1028
	/* the non-leaf shadow pages are kept read-only (write-protected). */
1030 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
1031 return kvm_slot_page_track_add_page(kvm, slot, gfn,
1032 KVM_PAGE_TRACK_WRITE);
1033
547ffaed 1034 kvm_mmu_gfn_disallow_lpage(slot, gfn);
05da4558
MT
1035}
1036
3ed1a478 1037static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
05da4558 1038{
699023e2 1039 struct kvm_memslots *slots;
d25797b2 1040 struct kvm_memory_slot *slot;
3ed1a478 1041 gfn_t gfn;
05da4558 1042
56ca57f9 1043 kvm->arch.indirect_shadow_pages--;
3ed1a478 1044 gfn = sp->gfn;
699023e2
PB
1045 slots = kvm_memslots_for_spte_role(kvm, sp->role);
1046 slot = __gfn_to_memslot(slots, gfn);
56ca57f9
XG
1047 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
1048 return kvm_slot_page_track_remove_page(kvm, slot, gfn,
1049 KVM_PAGE_TRACK_WRITE);
1050
547ffaed 1051 kvm_mmu_gfn_allow_lpage(slot, gfn);
05da4558
MT
1052}
1053
92f94f1e
XG
1054static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
1055 struct kvm_memory_slot *slot)
05da4558 1056{
d4dbf470 1057 struct kvm_lpage_info *linfo;
05da4558
MT
1058
1059 if (slot) {
d4dbf470 1060 linfo = lpage_info_slot(gfn, slot, level);
92f94f1e 1061 return !!linfo->disallow_lpage;
05da4558
MT
1062 }
1063
92f94f1e 1064 return true;
05da4558
MT
1065}
1066
92f94f1e
XG
1067static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
1068 int level)
5225fdf8
TY
1069{
1070 struct kvm_memory_slot *slot;
1071
1072 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
92f94f1e 1073 return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
5225fdf8
TY
1074}
1075
d25797b2 1076static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
05da4558 1077{
8f0b1ab6 1078 unsigned long page_size;
d25797b2 1079 int i, ret = 0;
05da4558 1080
8f0b1ab6 1081 page_size = kvm_host_page_size(kvm, gfn);
05da4558 1082
8a3d08f1 1083 for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
d25797b2
JR
1084 if (page_size >= KVM_HPAGE_SIZE(i))
1085 ret = i;
1086 else
1087 break;
1088 }
1089
4c2155ce 1090 return ret;
05da4558
MT
1091}
1092
d8aacf5d
TY
1093static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot,
1094 bool no_dirty_log)
1095{
1096 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
1097 return false;
1098 if (no_dirty_log && slot->dirty_bitmap)
1099 return false;
1100
1101 return true;
1102}
1103
5d163b1c
XG
1104static struct kvm_memory_slot *
1105gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
1106 bool no_dirty_log)
05da4558
MT
1107{
1108 struct kvm_memory_slot *slot;
5d163b1c 1109
54bf36aa 1110 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
d8aacf5d 1111 if (!memslot_valid_for_gpte(slot, no_dirty_log))
5d163b1c
XG
1112 slot = NULL;
1113
1114 return slot;
1115}
1116
fd136902
TY
1117static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
1118 bool *force_pt_level)
936a5fe6
AA
1119{
1120 int host_level, level, max_level;
d8aacf5d
TY
1121 struct kvm_memory_slot *slot;
1122
8c85ac1c
TY
1123 if (unlikely(*force_pt_level))
1124 return PT_PAGE_TABLE_LEVEL;
05da4558 1125
8c85ac1c
TY
1126 slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn);
1127 *force_pt_level = !memslot_valid_for_gpte(slot, true);
fd136902
TY
1128 if (unlikely(*force_pt_level))
1129 return PT_PAGE_TABLE_LEVEL;
1130
d25797b2
JR
1131 host_level = host_mapping_level(vcpu->kvm, large_gfn);
1132
1133 if (host_level == PT_PAGE_TABLE_LEVEL)
1134 return host_level;
1135
55dd98c3 1136 max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
878403b7
SY
1137
1138 for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
92f94f1e 1139 if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
d25797b2 1140 break;
d25797b2
JR
1141
1142 return level - 1;
05da4558
MT
1143}
1144
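/*
 * Illustrative note (added): levels here follow the usual KVM numbering,
 * where PT_PAGE_TABLE_LEVEL == 1 covers 4KiB pages, PT_DIRECTORY_LEVEL == 2
 * covers 2MiB huge pages and level 3 covers 1GiB; so mapping_level()
 * returning 2 means the gfn may be mapped with a 2MiB spte, provided no
 * disallow_lpage count blocks it.
 */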
/*
 * About rmap_head encoding:
 *
 * If the bit zero of rmap_head->val is clear, then it points to the only spte
 * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
 * pte_list_desc containing more mappings.
 */

/*
 * Returns the number of pointers in the rmap chain, not counting the new one.
 */
53c07b18 1156static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
018aabb5 1157 struct kvm_rmap_head *rmap_head)
cd4a4e53 1158{
53c07b18 1159 struct pte_list_desc *desc;
53a27b39 1160 int i, count = 0;
cd4a4e53 1161
018aabb5 1162 if (!rmap_head->val) {
53c07b18 1163 rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
018aabb5
TY
1164 rmap_head->val = (unsigned long)spte;
1165 } else if (!(rmap_head->val & 1)) {
53c07b18
XG
1166 rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
1167 desc = mmu_alloc_pte_list_desc(vcpu);
018aabb5 1168 desc->sptes[0] = (u64 *)rmap_head->val;
d555c333 1169 desc->sptes[1] = spte;
018aabb5 1170 rmap_head->val = (unsigned long)desc | 1;
cb16a7b3 1171 ++count;
cd4a4e53 1172 } else {
53c07b18 1173 rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
018aabb5 1174 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
53c07b18 1175 while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
cd4a4e53 1176 desc = desc->more;
53c07b18 1177 count += PTE_LIST_EXT;
53a27b39 1178 }
53c07b18
XG
1179 if (desc->sptes[PTE_LIST_EXT-1]) {
1180 desc->more = mmu_alloc_pte_list_desc(vcpu);
cd4a4e53
AK
1181 desc = desc->more;
1182 }
d555c333 1183 for (i = 0; desc->sptes[i]; ++i)
cb16a7b3 1184 ++count;
d555c333 1185 desc->sptes[i] = spte;
cd4a4e53 1186 }
53a27b39 1187 return count;
cd4a4e53
AK
1188}
1189
53c07b18 1190static void
018aabb5
TY
1191pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
1192 struct pte_list_desc *desc, int i,
1193 struct pte_list_desc *prev_desc)
cd4a4e53
AK
1194{
1195 int j;
1196
53c07b18 1197 for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
cd4a4e53 1198 ;
d555c333
AK
1199 desc->sptes[i] = desc->sptes[j];
1200 desc->sptes[j] = NULL;
cd4a4e53
AK
1201 if (j != 0)
1202 return;
1203 if (!prev_desc && !desc->more)
018aabb5 1204 rmap_head->val = (unsigned long)desc->sptes[0];
cd4a4e53
AK
1205 else
1206 if (prev_desc)
1207 prev_desc->more = desc->more;
1208 else
018aabb5 1209 rmap_head->val = (unsigned long)desc->more | 1;
53c07b18 1210 mmu_free_pte_list_desc(desc);
cd4a4e53
AK
1211}
1212
018aabb5 1213static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
cd4a4e53 1214{
53c07b18
XG
1215 struct pte_list_desc *desc;
1216 struct pte_list_desc *prev_desc;
cd4a4e53
AK
1217 int i;
1218
018aabb5 1219 if (!rmap_head->val) {
53c07b18 1220 printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
cd4a4e53 1221 BUG();
018aabb5 1222 } else if (!(rmap_head->val & 1)) {
53c07b18 1223 rmap_printk("pte_list_remove: %p 1->0\n", spte);
018aabb5 1224 if ((u64 *)rmap_head->val != spte) {
53c07b18 1225 printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte);
cd4a4e53
AK
1226 BUG();
1227 }
018aabb5 1228 rmap_head->val = 0;
cd4a4e53 1229 } else {
53c07b18 1230 rmap_printk("pte_list_remove: %p many->many\n", spte);
018aabb5 1231 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
cd4a4e53
AK
1232 prev_desc = NULL;
1233 while (desc) {
018aabb5 1234 for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
d555c333 1235 if (desc->sptes[i] == spte) {
018aabb5
TY
1236 pte_list_desc_remove_entry(rmap_head,
1237 desc, i, prev_desc);
cd4a4e53
AK
1238 return;
1239 }
018aabb5 1240 }
cd4a4e53
AK
1241 prev_desc = desc;
1242 desc = desc->more;
1243 }
53c07b18 1244 pr_err("pte_list_remove: %p many->many\n", spte);
cd4a4e53
AK
1245 BUG();
1246 }
1247}
1248
018aabb5
TY
1249static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
1250 struct kvm_memory_slot *slot)
53c07b18 1251{
77d11309 1252 unsigned long idx;
53c07b18 1253
77d11309 1254 idx = gfn_to_index(gfn, slot->base_gfn, level);
d89cc617 1255 return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
53c07b18
XG
1256}
1257
018aabb5
TY
1258static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
1259 struct kvm_mmu_page *sp)
9b9b1492 1260{
699023e2 1261 struct kvm_memslots *slots;
9b9b1492
TY
1262 struct kvm_memory_slot *slot;
1263
699023e2
PB
1264 slots = kvm_memslots_for_spte_role(kvm, sp->role);
1265 slot = __gfn_to_memslot(slots, gfn);
e4cd1da9 1266 return __gfn_to_rmap(gfn, sp->role.level, slot);
9b9b1492
TY
1267}
1268
f759e2b4
XG
1269static bool rmap_can_add(struct kvm_vcpu *vcpu)
1270{
1271 struct kvm_mmu_memory_cache *cache;
1272
1273 cache = &vcpu->arch.mmu_pte_list_desc_cache;
1274 return mmu_memory_cache_free_objects(cache);
1275}
1276
53c07b18
XG
1277static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
1278{
1279 struct kvm_mmu_page *sp;
018aabb5 1280 struct kvm_rmap_head *rmap_head;
53c07b18 1281
53c07b18
XG
1282 sp = page_header(__pa(spte));
1283 kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
018aabb5
TY
1284 rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
1285 return pte_list_add(vcpu, spte, rmap_head);
53c07b18
XG
1286}
1287
53c07b18
XG
1288static void rmap_remove(struct kvm *kvm, u64 *spte)
1289{
1290 struct kvm_mmu_page *sp;
1291 gfn_t gfn;
018aabb5 1292 struct kvm_rmap_head *rmap_head;
53c07b18
XG
1293
1294 sp = page_header(__pa(spte));
1295 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
018aabb5
TY
1296 rmap_head = gfn_to_rmap(kvm, gfn, sp);
1297 pte_list_remove(spte, rmap_head);
53c07b18
XG
1298}
1299
1e3f42f0
TY
1300/*
1301 * Used by the following functions to iterate through the sptes linked by a
1302 * rmap. All fields are private and not assumed to be used outside.
1303 */
1304struct rmap_iterator {
1305 /* private fields */
1306 struct pte_list_desc *desc; /* holds the sptep if not NULL */
1307 int pos; /* index of the sptep */
1308};
1309
/*
 * Iteration must be started by this function.  This should also be used after
 * removing/dropping sptes from the rmap link because in such cases the
 * information in the iterator may not be valid.
 *
 * Returns sptep if found, NULL otherwise.
 */
018aabb5
TY
1317static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
1318 struct rmap_iterator *iter)
1e3f42f0 1319{
77fbbbd2
TY
1320 u64 *sptep;
1321
018aabb5 1322 if (!rmap_head->val)
1e3f42f0
TY
1323 return NULL;
1324
018aabb5 1325 if (!(rmap_head->val & 1)) {
1e3f42f0 1326 iter->desc = NULL;
77fbbbd2
TY
1327 sptep = (u64 *)rmap_head->val;
1328 goto out;
1e3f42f0
TY
1329 }
1330
018aabb5 1331 iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1e3f42f0 1332 iter->pos = 0;
77fbbbd2
TY
1333 sptep = iter->desc->sptes[iter->pos];
1334out:
1335 BUG_ON(!is_shadow_present_pte(*sptep));
1336 return sptep;
1e3f42f0
TY
1337}
1338
1339/*
1340 * Must be used with a valid iterator: e.g. after rmap_get_first().
1341 *
1342 * Returns sptep if found, NULL otherwise.
1343 */
1344static u64 *rmap_get_next(struct rmap_iterator *iter)
1345{
77fbbbd2
TY
1346 u64 *sptep;
1347
1e3f42f0
TY
1348 if (iter->desc) {
1349 if (iter->pos < PTE_LIST_EXT - 1) {
1e3f42f0
TY
1350 ++iter->pos;
1351 sptep = iter->desc->sptes[iter->pos];
1352 if (sptep)
77fbbbd2 1353 goto out;
1e3f42f0
TY
1354 }
1355
1356 iter->desc = iter->desc->more;
1357
1358 if (iter->desc) {
1359 iter->pos = 0;
1360 /* desc->sptes[0] cannot be NULL */
77fbbbd2
TY
1361 sptep = iter->desc->sptes[iter->pos];
1362 goto out;
1e3f42f0
TY
1363 }
1364 }
1365
1366 return NULL;
77fbbbd2
TY
1367out:
1368 BUG_ON(!is_shadow_present_pte(*sptep));
1369 return sptep;
1e3f42f0
TY
1370}
1371
018aabb5
TY
1372#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \
1373 for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \
77fbbbd2 1374 _spte_; _spte_ = rmap_get_next(_iter_))
0d536790 1375
c3707958 1376static void drop_spte(struct kvm *kvm, u64 *sptep)
e4b502ea 1377{
1df9f2dc 1378 if (mmu_spte_clear_track_bits(sptep))
eb45fda4 1379 rmap_remove(kvm, sptep);
be38d276
AK
1380}
1381
8e22f955
XG
1382
1383static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
1384{
1385 if (is_large_pte(*sptep)) {
1386 WARN_ON(page_header(__pa(sptep))->role.level ==
1387 PT_PAGE_TABLE_LEVEL);
1388 drop_spte(kvm, sptep);
1389 --kvm->stat.lpages;
1390 return true;
1391 }
1392
1393 return false;
1394}
1395
1396static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1397{
1398 if (__drop_large_spte(vcpu->kvm, sptep))
1399 kvm_flush_remote_tlbs(vcpu->kvm);
1400}
1401
/*
 * Write-protect the specified @sptep; @pt_protect indicates whether the
 * spte write-protection is caused by protecting the shadow page table.
 *
 * Note: write protection differs between dirty logging and spte
 * protection:
 * - for dirty logging, the spte can be set to writable at any time if
 *   its dirty bitmap is properly set.
 * - for spte protection, the spte can be made writable only after
 *   unsync-ing the shadow page.
 *
 * Return true if the tlb needs to be flushed.
 */
c4f138b4 1415static bool spte_write_protect(u64 *sptep, bool pt_protect)
d13bc5b5
XG
1416{
1417 u64 spte = *sptep;
1418
49fde340 1419 if (!is_writable_pte(spte) &&
ea4114bc 1420 !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
d13bc5b5
XG
1421 return false;
1422
1423 rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
1424
49fde340
XG
1425 if (pt_protect)
1426 spte &= ~SPTE_MMU_WRITEABLE;
d13bc5b5 1427 spte = spte & ~PT_WRITABLE_MASK;
49fde340 1428
c126d94f 1429 return mmu_spte_update(sptep, spte);
d13bc5b5
XG
1430}
1431
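/*
 * Illustrative note (added): when @pt_protect is set, SPTE_MMU_WRITEABLE is
 * cleared as well, so the fast page fault path can no longer restore the W
 * bit locklessly; dirty-logging-only protection leaves SPTE_MMU_WRITEABLE
 * set, and the spte can be made writable again at any time.
 */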
018aabb5
TY
1432static bool __rmap_write_protect(struct kvm *kvm,
1433 struct kvm_rmap_head *rmap_head,
245c3912 1434 bool pt_protect)
98348e95 1435{
1e3f42f0
TY
1436 u64 *sptep;
1437 struct rmap_iterator iter;
d13bc5b5 1438 bool flush = false;
374cbac0 1439
018aabb5 1440 for_each_rmap_spte(rmap_head, &iter, sptep)
c4f138b4 1441 flush |= spte_write_protect(sptep, pt_protect);
855149aa 1442
d13bc5b5 1443 return flush;
a0ed4607
TY
1444}
1445
c4f138b4 1446static bool spte_clear_dirty(u64 *sptep)
f4b4b180
KH
1447{
1448 u64 spte = *sptep;
1449
1450 rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
1451
1452 spte &= ~shadow_dirty_mask;
1453
1454 return mmu_spte_update(sptep, spte);
1455}
1456
ac8d57e5
PF
1457static bool wrprot_ad_disabled_spte(u64 *sptep)
1458{
1459 bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
1460 (unsigned long *)sptep);
1461 if (was_writable)
1462 kvm_set_pfn_dirty(spte_to_pfn(*sptep));
1463
1464 return was_writable;
1465}
1466
1467/*
1468 * Gets the GFN ready for another round of dirty logging by clearing the
1469 * - D bit on ad-enabled SPTEs, and
1470 * - W bit on ad-disabled SPTEs.
1471 * Returns true iff any D or W bits were cleared.
1472 */
018aabb5 1473static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
f4b4b180
KH
1474{
1475 u64 *sptep;
1476 struct rmap_iterator iter;
1477 bool flush = false;
1478
018aabb5 1479 for_each_rmap_spte(rmap_head, &iter, sptep)
ac8d57e5
PF
1480 if (spte_ad_enabled(*sptep))
1481 flush |= spte_clear_dirty(sptep);
1482 else
1483 flush |= wrprot_ad_disabled_spte(sptep);
f4b4b180
KH
1484
1485 return flush;
1486}
1487
c4f138b4 1488static bool spte_set_dirty(u64 *sptep)
f4b4b180
KH
1489{
1490 u64 spte = *sptep;
1491
1492 rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
1493
1494 spte |= shadow_dirty_mask;
1495
1496 return mmu_spte_update(sptep, spte);
1497}
1498
018aabb5 1499static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
f4b4b180
KH
1500{
1501 u64 *sptep;
1502 struct rmap_iterator iter;
1503 bool flush = false;
1504
018aabb5 1505 for_each_rmap_spte(rmap_head, &iter, sptep)
ac8d57e5
PF
1506 if (spte_ad_enabled(*sptep))
1507 flush |= spte_set_dirty(sptep);
f4b4b180
KH
1508
1509 return flush;
1510}
1511
5dc99b23 1512/**
3b0f1d01 1513 * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
5dc99b23
TY
1514 * @kvm: kvm instance
1515 * @slot: slot to protect
1516 * @gfn_offset: start of the BITS_PER_LONG pages we care about
1517 * @mask: indicates which pages we should protect
1518 *
1519 * Used when we do not need to care about huge page mappings: e.g. during dirty
1520 * logging we do not have any such mappings.
1521 */
3b0f1d01 1522static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
5dc99b23
TY
1523 struct kvm_memory_slot *slot,
1524 gfn_t gfn_offset, unsigned long mask)
a0ed4607 1525{
018aabb5 1526 struct kvm_rmap_head *rmap_head;
a0ed4607 1527
5dc99b23 1528 while (mask) {
018aabb5
TY
1529 rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1530 PT_PAGE_TABLE_LEVEL, slot);
1531 __rmap_write_protect(kvm, rmap_head, false);
05da4558 1532
5dc99b23
TY
1533 /* clear the first set bit */
1534 mask &= mask - 1;
1535 }
374cbac0
AK
1536}
1537
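/*
 * Worked example (added): with gfn_offset == 64 and mask == 0b101, the loop
 * above write-protects exactly gfns base_gfn+64 and base_gfn+66: __ffs()
 * picks the lowest set bit and "mask &= mask - 1" clears it each iteration.
 */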
f4b4b180 1538/**
ac8d57e5
PF
1539 * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
1540 * protect the page if the D-bit isn't supported.
f4b4b180
KH
1541 * @kvm: kvm instance
1542 * @slot: slot to clear D-bit
1543 * @gfn_offset: start of the BITS_PER_LONG pages we care about
1544 * @mask: indicates which pages we should clear D-bit
1545 *
1546 * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
1547 */
1548void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1549 struct kvm_memory_slot *slot,
1550 gfn_t gfn_offset, unsigned long mask)
1551{
018aabb5 1552 struct kvm_rmap_head *rmap_head;
f4b4b180
KH
1553
1554 while (mask) {
018aabb5
TY
1555 rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1556 PT_PAGE_TABLE_LEVEL, slot);
1557 __rmap_clear_dirty(kvm, rmap_head);
f4b4b180
KH
1558
1559 /* clear the first set bit */
1560 mask &= mask - 1;
1561 }
1562}
1563EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);
1564
3b0f1d01
KH
1565/**
1566 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1567 * PT level pages.
1568 *
1569 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1570 * enable dirty logging for them.
1571 *
1572 * Used when we do not need to care about huge page mappings: e.g. during dirty
1573 * logging we do not have any such mappings.
1574 */
1575void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1576 struct kvm_memory_slot *slot,
1577 gfn_t gfn_offset, unsigned long mask)
1578{
88178fd4
KH
1579 if (kvm_x86_ops->enable_log_dirty_pt_masked)
1580 kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
1581 mask);
1582 else
1583 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
3b0f1d01
KH
1584}
1585
bab4165e
BD
1586/**
1587 * kvm_arch_write_log_dirty - emulate dirty page logging
1588 * @vcpu: Guest mode vcpu
1589 *
1590 * Emulate arch specific page modification logging for the
1591 * nested hypervisor
1592 */
1593int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu)
1594{
1595 if (kvm_x86_ops->write_log_dirty)
1596 return kvm_x86_ops->write_log_dirty(vcpu);
1597
1598 return 0;
1599}
1600
aeecee2e
XG
1601bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
1602 struct kvm_memory_slot *slot, u64 gfn)
95d4c16c 1603{
018aabb5 1604 struct kvm_rmap_head *rmap_head;
5dc99b23 1605 int i;
2f84569f 1606 bool write_protected = false;
95d4c16c 1607
8a3d08f1 1608 for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
018aabb5 1609 rmap_head = __gfn_to_rmap(gfn, i, slot);
aeecee2e 1610 write_protected |= __rmap_write_protect(kvm, rmap_head, true);
5dc99b23
TY
1611 }
1612
1613 return write_protected;
95d4c16c
TY
1614}
1615
aeecee2e
XG
1616static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
1617{
1618 struct kvm_memory_slot *slot;
1619
1620 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1621 return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
1622}
1623
018aabb5 1624static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
e930bffe 1625{
1e3f42f0
TY
1626 u64 *sptep;
1627 struct rmap_iterator iter;
6a49f85c 1628 bool flush = false;
e930bffe 1629
018aabb5 1630 while ((sptep = rmap_get_first(rmap_head, &iter))) {
6a49f85c 1631 rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
1e3f42f0
TY
1632
1633 drop_spte(kvm, sptep);
6a49f85c 1634 flush = true;
e930bffe 1635 }
1e3f42f0 1636
6a49f85c
XG
1637 return flush;
1638}
1639
018aabb5 1640static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
6a49f85c
XG
1641 struct kvm_memory_slot *slot, gfn_t gfn, int level,
1642 unsigned long data)
1643{
018aabb5 1644 return kvm_zap_rmapp(kvm, rmap_head);
e930bffe
AA
1645}
1646
018aabb5 1647static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
8a9522d2
ALC
1648 struct kvm_memory_slot *slot, gfn_t gfn, int level,
1649 unsigned long data)
3da0dd43 1650{
1e3f42f0
TY
1651 u64 *sptep;
1652 struct rmap_iterator iter;
3da0dd43 1653 int need_flush = 0;
1e3f42f0 1654 u64 new_spte;
3da0dd43 1655 pte_t *ptep = (pte_t *)data;
ba049e93 1656 kvm_pfn_t new_pfn;
3da0dd43
IE
1657
1658 WARN_ON(pte_huge(*ptep));
1659 new_pfn = pte_pfn(*ptep);
1e3f42f0 1660
0d536790 1661restart:
018aabb5 1662 for_each_rmap_spte(rmap_head, &iter, sptep) {
8a9522d2 1663 rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
f160c7b7 1664 sptep, *sptep, gfn, level);
1e3f42f0 1665
3da0dd43 1666 need_flush = 1;
1e3f42f0 1667
3da0dd43 1668 if (pte_write(*ptep)) {
1e3f42f0 1669 drop_spte(kvm, sptep);
0d536790 1670 goto restart;
3da0dd43 1671 } else {
1e3f42f0 1672 new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
3da0dd43
IE
1673 new_spte |= (u64)new_pfn << PAGE_SHIFT;
1674
1675 new_spte &= ~PT_WRITABLE_MASK;
1676 new_spte &= ~SPTE_HOST_WRITEABLE;
f160c7b7
JS
1677
1678 new_spte = mark_spte_for_access_track(new_spte);
1e3f42f0
TY
1679
1680 mmu_spte_clear_track_bits(sptep);
1681 mmu_spte_set(sptep, new_spte);
3da0dd43
IE
1682 }
1683 }
1e3f42f0 1684
3da0dd43
IE
1685 if (need_flush)
1686 kvm_flush_remote_tlbs(kvm);
1687
1688 return 0;
1689}
1690
6ce1f4e2
XG
1691struct slot_rmap_walk_iterator {
1692 /* input fields. */
1693 struct kvm_memory_slot *slot;
1694 gfn_t start_gfn;
1695 gfn_t end_gfn;
1696 int start_level;
1697 int end_level;
1698
1699 /* output fields. */
1700 gfn_t gfn;
018aabb5 1701 struct kvm_rmap_head *rmap;
6ce1f4e2
XG
1702 int level;
1703
1704 /* private field. */
018aabb5 1705 struct kvm_rmap_head *end_rmap;
6ce1f4e2
XG
1706};
1707
1708static void
1709rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
1710{
1711 iterator->level = level;
1712 iterator->gfn = iterator->start_gfn;
1713 iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
1714 iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
1715 iterator->slot);
1716}
1717
1718static void
1719slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
1720 struct kvm_memory_slot *slot, int start_level,
1721 int end_level, gfn_t start_gfn, gfn_t end_gfn)
1722{
1723 iterator->slot = slot;
1724 iterator->start_level = start_level;
1725 iterator->end_level = end_level;
1726 iterator->start_gfn = start_gfn;
1727 iterator->end_gfn = end_gfn;
1728
1729 rmap_walk_init_level(iterator, iterator->start_level);
1730}
1731
1732static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
1733{
1734 return !!iterator->rmap;
1735}
1736
1737static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
1738{
1739 if (++iterator->rmap <= iterator->end_rmap) {
1740 iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
1741 return;
1742 }
1743
1744 if (++iterator->level > iterator->end_level) {
1745 iterator->rmap = NULL;
1746 return;
1747 }
1748
1749 rmap_walk_init_level(iterator, iterator->level);
1750}
1751
1752#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_, \
1753 _start_gfn, _end_gfn, _iter_) \
1754 for (slot_rmap_walk_init(_iter_, _slot_, _start_level_, \
1755 _end_level_, _start_gfn, _end_gfn); \
1756 slot_rmap_walk_okay(_iter_); \
1757 slot_rmap_walk_next(_iter_))
1758
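/*
 * Illustrative sketch, not part of the original file: one way a caller could
 * drive the slot_rmap_walk iterator defined above. The helper name
 * example_zap_slot_gfn_range() is made up, and using kvm_zap_rmapp() as the
 * per-bucket action is an assumption for illustration only; the caller is
 * assumed to hold kvm->mmu_lock.
 */
static __maybe_unused bool example_zap_slot_gfn_range(struct kvm *kvm,
						      struct kvm_memory_slot *slot,
						      gfn_t start, gfn_t last)
{
	struct slot_rmap_walk_iterator iterator;
	bool flush = false;

	/* Visit every rmap bucket for [start, last] at every mapping level. */
	for_each_slot_rmap_range(slot, PT_PAGE_TABLE_LEVEL,
				 PT_MAX_HUGEPAGE_LEVEL, start, last, &iterator)
		flush |= kvm_zap_rmapp(kvm, iterator.rmap);

	return flush;
}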
84504ef3
TY
1759static int kvm_handle_hva_range(struct kvm *kvm,
1760 unsigned long start,
1761 unsigned long end,
1762 unsigned long data,
1763 int (*handler)(struct kvm *kvm,
018aabb5 1764 struct kvm_rmap_head *rmap_head,
048212d0 1765 struct kvm_memory_slot *slot,
8a9522d2
ALC
1766 gfn_t gfn,
1767 int level,
84504ef3 1768 unsigned long data))
e930bffe 1769{
bc6678a3 1770 struct kvm_memslots *slots;
be6ba0f0 1771 struct kvm_memory_slot *memslot;
6ce1f4e2
XG
1772 struct slot_rmap_walk_iterator iterator;
1773 int ret = 0;
9da0e4d5 1774 int i;
bc6678a3 1775
9da0e4d5
PB
1776 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1777 slots = __kvm_memslots(kvm, i);
1778 kvm_for_each_memslot(memslot, slots) {
1779 unsigned long hva_start, hva_end;
1780 gfn_t gfn_start, gfn_end;
e930bffe 1781
9da0e4d5
PB
1782 hva_start = max(start, memslot->userspace_addr);
1783 hva_end = min(end, memslot->userspace_addr +
1784 (memslot->npages << PAGE_SHIFT));
1785 if (hva_start >= hva_end)
1786 continue;
1787 /*
1788 * {gfn(page) | page intersects with [hva_start, hva_end)} =
1789 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
1790 */
1791 gfn_start = hva_to_gfn_memslot(hva_start, memslot);
1792 gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
1793
1794 for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
1795 PT_MAX_HUGEPAGE_LEVEL,
1796 gfn_start, gfn_end - 1,
1797 &iterator)
1798 ret |= handler(kvm, iterator.rmap, memslot,
1799 iterator.gfn, iterator.level, data);
1800 }
e930bffe
AA
1801 }
1802
f395302e 1803 return ret;
e930bffe
AA
1804}
1805
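/*
 * Worked example of the clamping above (illustration, not original text):
 * with 4K pages, a slot at userspace_addr 0x7f0000000000 with npages = 512
 * covers hvas [0x7f0000000000, 0x7f0000200000). For a notifier range of
 * [0x7f00001ff800, 0x7f0000300000), hva_start stays 0x7f00001ff800 and
 * hva_end is clamped to 0x7f0000200000; gfn_start is the gfn of the page at
 * 0x7f00001ff000 and, because gfn_end is derived from hva_end + PAGE_SIZE - 1,
 * the partially covered last page is still visited exactly once.
 */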
84504ef3
TY
1806static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
1807 unsigned long data,
018aabb5
TY
1808 int (*handler)(struct kvm *kvm,
1809 struct kvm_rmap_head *rmap_head,
048212d0 1810 struct kvm_memory_slot *slot,
8a9522d2 1811 gfn_t gfn, int level,
84504ef3
TY
1812 unsigned long data))
1813{
1814 return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
e930bffe
AA
1815}
1816
1817int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
1818{
3da0dd43
IE
1819 return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
1820}
1821
b3ae2096
TY
1822int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
1823{
1824 return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
1825}
1826
3da0dd43
IE
1827void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1828{
8a8365c5 1829 kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
e930bffe
AA
1830}
1831
018aabb5 1832static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
8a9522d2
ALC
1833 struct kvm_memory_slot *slot, gfn_t gfn, int level,
1834 unsigned long data)
e930bffe 1835{
1e3f42f0 1836 u64 *sptep;
79f702a6 1837 struct rmap_iterator uninitialized_var(iter);
e930bffe
AA
1838 int young = 0;
1839
f160c7b7
JS
1840 for_each_rmap_spte(rmap_head, &iter, sptep)
1841 young |= mmu_spte_age(sptep);
0d536790 1842
8a9522d2 1843 trace_kvm_age_page(gfn, level, slot, young);
e930bffe
AA
1844 return young;
1845}
1846
018aabb5 1847static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
8a9522d2
ALC
1848 struct kvm_memory_slot *slot, gfn_t gfn,
1849 int level, unsigned long data)
8ee53820 1850{
1e3f42f0
TY
1851 u64 *sptep;
1852 struct rmap_iterator iter;
8ee53820 1853
83ef6c81
JS
1854 for_each_rmap_spte(rmap_head, &iter, sptep)
1855 if (is_accessed_spte(*sptep))
1856 return 1;
83ef6c81 1857 return 0;
8ee53820
AA
1858}
1859
53a27b39
MT
1860#define RMAP_RECYCLE_THRESHOLD 1000
1861
852e3c19 1862static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
53a27b39 1863{
018aabb5 1864 struct kvm_rmap_head *rmap_head;
852e3c19
JR
1865 struct kvm_mmu_page *sp;
1866
1867 sp = page_header(__pa(spte));
53a27b39 1868
018aabb5 1869 rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
53a27b39 1870
018aabb5 1871 kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
53a27b39
MT
1872 kvm_flush_remote_tlbs(vcpu->kvm);
1873}
1874
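/*
 * Usage note (illustration, not original text): rmap_recycle() is meant to be
 * driven by rmap_add()'s return value, roughly as mmu_set_spte() does further
 * below:
 *
 *	rmap_count = rmap_add(vcpu, sptep, gfn);
 *	if (rmap_count > RMAP_RECYCLE_THRESHOLD)
 *		rmap_recycle(vcpu, sptep, gfn);
 *
 * so once a single gfn accumulates more than 1000 sptes, all of them are
 * zapped and the remote TLBs are flushed instead of letting the rmap chain
 * grow without bound.
 */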
57128468 1875int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
e930bffe 1876{
57128468 1877 return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
e930bffe
AA
1878}
1879
8ee53820
AA
1880int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
1881{
1882 return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
1883}
1884
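/*
 * Usage note (illustration, not original text): kvm_age_hva() and
 * kvm_test_age_hva() are the arch-side halves of the MMU-notifier aging
 * callbacks; the generic notifier code ends up doing roughly
 *
 *	young = kvm_age_hva(kvm, start, end);
 *	if (young)
 *		kvm_flush_remote_tlbs(kvm);
 *
 * so that cleared accessed information in the sptes is also flushed out of
 * the TLBs.
 */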
d6c69ee9 1885#ifdef MMU_DEBUG
47ad8e68 1886static int is_empty_shadow_page(u64 *spt)
6aa8b732 1887{
139bdb2d
AK
1888 u64 *pos;
1889 u64 *end;
1890
47ad8e68 1891 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
3c915510 1892 if (is_shadow_present_pte(*pos)) {
b8688d51 1893 printk(KERN_ERR "%s: %p %llx\n", __func__,
139bdb2d 1894 pos, *pos);
6aa8b732 1895 return 0;
139bdb2d 1896 }
6aa8b732
AK
1897 return 1;
1898}
d6c69ee9 1899#endif
6aa8b732 1900
45221ab6
DH
1901/*
 1902 * This value is the sum of all of the kvm instances'
 1903 * kvm->arch.n_used_mmu_pages values. We need a global,
 1904 * aggregate version in order to make the slab shrinker
 1905 * faster.
1906 */
1907static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
1908{
1909 kvm->arch.n_used_mmu_pages += nr;
1910 percpu_counter_add(&kvm_total_used_mmu_pages, nr);
1911}
1912
834be0d8 1913static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
260746c0 1914{
fa4a2c08 1915 MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
7775834a 1916 hlist_del(&sp->hash_link);
bd4c86ea
XG
1917 list_del(&sp->link);
1918 free_page((unsigned long)sp->spt);
834be0d8
GN
1919 if (!sp->role.direct)
1920 free_page((unsigned long)sp->gfns);
e8ad9a70 1921 kmem_cache_free(mmu_page_header_cache, sp);
260746c0
AK
1922}
1923
cea0f0e7
AK
1924static unsigned kvm_page_table_hashfn(gfn_t gfn)
1925{
114df303 1926 return hash_64(gfn, KVM_MMU_HASH_SHIFT);
cea0f0e7
AK
1927}
1928
714b93da 1929static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
4db35314 1930 struct kvm_mmu_page *sp, u64 *parent_pte)
cea0f0e7 1931{
cea0f0e7
AK
1932 if (!parent_pte)
1933 return;
cea0f0e7 1934
67052b35 1935 pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
cea0f0e7
AK
1936}
1937
4db35314 1938static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
cea0f0e7
AK
1939 u64 *parent_pte)
1940{
67052b35 1941 pte_list_remove(parent_pte, &sp->parent_ptes);
cea0f0e7
AK
1942}
1943
bcdd9a93
XG
1944static void drop_parent_pte(struct kvm_mmu_page *sp,
1945 u64 *parent_pte)
1946{
1947 mmu_page_remove_parent_pte(sp, parent_pte);
1df9f2dc 1948 mmu_spte_clear_no_track(parent_pte);
bcdd9a93
XG
1949}
1950
47005792 1951static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
ad8cfbe3 1952{
67052b35 1953 struct kvm_mmu_page *sp;
7ddca7e4 1954
80feb89a
TY
1955 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
1956 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
67052b35 1957 if (!direct)
80feb89a 1958 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
67052b35 1959 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
5304b8d3
XG
1960
1961 /*
 1962 * active_mmu_pages is a FIFO list; do not move the
 1963 * page until it is zapped. kvm_zap_obsolete_pages depends on
 1964 * this ordering. See the comments in kvm_zap_obsolete_pages().
1965 */
67052b35 1966 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
67052b35
XG
1967 kvm_mod_used_mmu_pages(vcpu->kvm, +1);
1968 return sp;
ad8cfbe3
MT
1969}
1970
67052b35 1971static void mark_unsync(u64 *spte);
1047df1f 1972static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
0074ff63 1973{
74c4e63a
TY
1974 u64 *sptep;
1975 struct rmap_iterator iter;
1976
1977 for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
1978 mark_unsync(sptep);
1979 }
0074ff63
MT
1980}
1981
67052b35 1982static void mark_unsync(u64 *spte)
0074ff63 1983{
67052b35 1984 struct kvm_mmu_page *sp;
1047df1f 1985 unsigned int index;
0074ff63 1986
67052b35 1987 sp = page_header(__pa(spte));
1047df1f
XG
1988 index = spte - sp->spt;
1989 if (__test_and_set_bit(index, sp->unsync_child_bitmap))
0074ff63 1990 return;
1047df1f 1991 if (sp->unsync_children++)
0074ff63 1992 return;
1047df1f 1993 kvm_mmu_mark_parents_unsync(sp);
0074ff63
MT
1994}
1995
e8bc217a 1996static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
a4a8e6f7 1997 struct kvm_mmu_page *sp)
e8bc217a 1998{
1f50f1b3 1999 return 0;
e8bc217a
MT
2000}
2001
a7052897
MT
2002static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
2003{
2004}
2005
0f53b5b1
XG
2006static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
2007 struct kvm_mmu_page *sp, u64 *spte,
7c562522 2008 const void *pte)
0f53b5b1
XG
2009{
2010 WARN_ON(1);
2011}
2012
60c8aec6
MT
2013#define KVM_PAGE_ARRAY_NR 16
2014
2015struct kvm_mmu_pages {
2016 struct mmu_page_and_offset {
2017 struct kvm_mmu_page *sp;
2018 unsigned int idx;
2019 } page[KVM_PAGE_ARRAY_NR];
2020 unsigned int nr;
2021};
2022
cded19f3
HE
2023static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
2024 int idx)
4731d4c7 2025{
60c8aec6 2026 int i;
4731d4c7 2027
60c8aec6
MT
2028 if (sp->unsync)
2029 for (i=0; i < pvec->nr; i++)
2030 if (pvec->page[i].sp == sp)
2031 return 0;
2032
2033 pvec->page[pvec->nr].sp = sp;
2034 pvec->page[pvec->nr].idx = idx;
2035 pvec->nr++;
2036 return (pvec->nr == KVM_PAGE_ARRAY_NR);
2037}
2038
fd951457
TY
2039static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
2040{
2041 --sp->unsync_children;
2042 WARN_ON((int)sp->unsync_children < 0);
2043 __clear_bit(idx, sp->unsync_child_bitmap);
2044}
2045
60c8aec6
MT
2046static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
2047 struct kvm_mmu_pages *pvec)
2048{
2049 int i, ret, nr_unsync_leaf = 0;
4731d4c7 2050
37178b8b 2051 for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
7a8f1a74 2052 struct kvm_mmu_page *child;
4731d4c7
MT
2053 u64 ent = sp->spt[i];
2054
fd951457
TY
2055 if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
2056 clear_unsync_child_bit(sp, i);
2057 continue;
2058 }
7a8f1a74
XG
2059
2060 child = page_header(ent & PT64_BASE_ADDR_MASK);
2061
2062 if (child->unsync_children) {
2063 if (mmu_pages_add(pvec, child, i))
2064 return -ENOSPC;
2065
2066 ret = __mmu_unsync_walk(child, pvec);
fd951457
TY
2067 if (!ret) {
2068 clear_unsync_child_bit(sp, i);
2069 continue;
2070 } else if (ret > 0) {
7a8f1a74 2071 nr_unsync_leaf += ret;
fd951457 2072 } else
7a8f1a74
XG
2073 return ret;
2074 } else if (child->unsync) {
2075 nr_unsync_leaf++;
2076 if (mmu_pages_add(pvec, child, i))
2077 return -ENOSPC;
2078 } else
fd951457 2079 clear_unsync_child_bit(sp, i);
4731d4c7
MT
2080 }
2081
60c8aec6
MT
2082 return nr_unsync_leaf;
2083}
2084
e23d3fef
XG
2085#define INVALID_INDEX (-1)
2086
60c8aec6
MT
2087static int mmu_unsync_walk(struct kvm_mmu_page *sp,
2088 struct kvm_mmu_pages *pvec)
2089{
0a47cd85 2090 pvec->nr = 0;
60c8aec6
MT
2091 if (!sp->unsync_children)
2092 return 0;
2093
e23d3fef 2094 mmu_pages_add(pvec, sp, INVALID_INDEX);
60c8aec6 2095 return __mmu_unsync_walk(sp, pvec);
4731d4c7
MT
2096}
2097
4731d4c7
MT
2098static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
2099{
2100 WARN_ON(!sp->unsync);
5e1b3ddb 2101 trace_kvm_mmu_sync_page(sp);
4731d4c7
MT
2102 sp->unsync = 0;
2103 --kvm->stat.mmu_unsync;
2104}
2105
7775834a
XG
2106static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2107 struct list_head *invalid_list);
2108static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2109 struct list_head *invalid_list);
4731d4c7 2110
f34d251d
XG
2111/*
 2112 * NOTE: pay extra attention to zapped-obsolete pages
 2113 * (is_obsolete_sp(sp) && sp->role.invalid) when doing a hash list walk,
 2114 * since such a page has been deleted from active_mmu_pages but can still
 2115 * be found in the hash list.
 2116 *
f3414bc7 2117 * for_each_valid_sp() skips that kind of page.
f34d251d 2118 */
f3414bc7 2119#define for_each_valid_sp(_kvm, _sp, _gfn) \
1044b030
TY
2120 hlist_for_each_entry(_sp, \
2121 &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
f3414bc7
DM
2122 if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \
2123 } else
1044b030
TY
2124
2125#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
f3414bc7
DM
2126 for_each_valid_sp(_kvm, _sp, _gfn) \
2127 if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
7ae680eb 2128
f918b443 2129/* @sp->gfn should be write-protected at the call site */
1f50f1b3
PB
2130static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2131 struct list_head *invalid_list)
4731d4c7 2132{
5b7e0102 2133 if (sp->role.cr4_pae != !!is_pae(vcpu)) {
d98ba053 2134 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1f50f1b3 2135 return false;
4731d4c7
MT
2136 }
2137
1f50f1b3 2138 if (vcpu->arch.mmu.sync_page(vcpu, sp) == 0) {
d98ba053 2139 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1f50f1b3 2140 return false;
4731d4c7
MT
2141 }
2142
1f50f1b3 2143 return true;
4731d4c7
MT
2144}
2145
35a70510
PB
2146static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
2147 struct list_head *invalid_list,
2148 bool remote_flush, bool local_flush)
1d9dc7e0 2149{
35a70510
PB
2150 if (!list_empty(invalid_list)) {
2151 kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list);
2152 return;
2153 }
d98ba053 2154
35a70510
PB
2155 if (remote_flush)
2156 kvm_flush_remote_tlbs(vcpu->kvm);
2157 else if (local_flush)
2158 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1d9dc7e0
XG
2159}
2160
e37fa785
XG
2161#ifdef CONFIG_KVM_MMU_AUDIT
2162#include "mmu_audit.c"
2163#else
2164static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
2165static void mmu_audit_disable(void) { }
2166#endif
2167
46971a2f
XG
2168static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
2169{
2170 return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
2171}
2172
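/*
 * Illustrative sketch, not part of the original file: how the lookup macros
 * above are typically combined with is_obsolete_sp(). The helper name
 * example_count_unsync_sps() is made up; the caller is assumed to hold
 * kvm->mmu_lock.
 */
static __maybe_unused int example_count_unsync_sps(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_mmu_page *sp;
	int nr = 0;

	/* Obsolete and invalid pages are already filtered out by the macro. */
	for_each_gfn_indirect_valid_sp(kvm, sp, gfn)
		if (sp->unsync)
			nr++;

	return nr;
}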
1f50f1b3 2173static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
d98ba053 2174 struct list_head *invalid_list)
1d9dc7e0 2175{
9a43c5d9
PB
2176 kvm_unlink_unsync_page(vcpu->kvm, sp);
2177 return __kvm_sync_page(vcpu, sp, invalid_list);
1d9dc7e0
XG
2178}
2179
9f1a122f 2180/* @gfn should be write-protected at the call site */
2a74003a
PB
2181static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
2182 struct list_head *invalid_list)
9f1a122f 2183{
9f1a122f 2184 struct kvm_mmu_page *s;
2a74003a 2185 bool ret = false;
9f1a122f 2186
b67bfe0d 2187 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
7ae680eb 2188 if (!s->unsync)
9f1a122f
XG
2189 continue;
2190
2191 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
2a74003a 2192 ret |= kvm_sync_page(vcpu, s, invalid_list);
9f1a122f
XG
2193 }
2194
2a74003a 2195 return ret;
9f1a122f
XG
2196}
2197
60c8aec6 2198struct mmu_page_path {
2a7266a8
YZ
2199 struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
2200 unsigned int idx[PT64_ROOT_MAX_LEVEL];
4731d4c7
MT
2201};
2202
60c8aec6 2203#define for_each_sp(pvec, sp, parents, i) \
0a47cd85 2204 for (i = mmu_pages_first(&pvec, &parents); \
60c8aec6
MT
2205 i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
2206 i = mmu_pages_next(&pvec, &parents, i))
2207
cded19f3
HE
2208static int mmu_pages_next(struct kvm_mmu_pages *pvec,
2209 struct mmu_page_path *parents,
2210 int i)
60c8aec6
MT
2211{
2212 int n;
2213
2214 for (n = i+1; n < pvec->nr; n++) {
2215 struct kvm_mmu_page *sp = pvec->page[n].sp;
0a47cd85
PB
2216 unsigned idx = pvec->page[n].idx;
2217 int level = sp->role.level;
60c8aec6 2218
0a47cd85
PB
2219 parents->idx[level-1] = idx;
2220 if (level == PT_PAGE_TABLE_LEVEL)
2221 break;
60c8aec6 2222
0a47cd85 2223 parents->parent[level-2] = sp;
60c8aec6
MT
2224 }
2225
2226 return n;
2227}
2228
0a47cd85
PB
2229static int mmu_pages_first(struct kvm_mmu_pages *pvec,
2230 struct mmu_page_path *parents)
2231{
2232 struct kvm_mmu_page *sp;
2233 int level;
2234
2235 if (pvec->nr == 0)
2236 return 0;
2237
e23d3fef
XG
2238 WARN_ON(pvec->page[0].idx != INVALID_INDEX);
2239
0a47cd85
PB
2240 sp = pvec->page[0].sp;
2241 level = sp->role.level;
2242 WARN_ON(level == PT_PAGE_TABLE_LEVEL);
2243
2244 parents->parent[level-2] = sp;
2245
2246 /* Also set up a sentinel. Further entries in pvec are all
2247 * children of sp, so this element is never overwritten.
2248 */
2249 parents->parent[level-1] = NULL;
2250 return mmu_pages_next(pvec, parents, 0);
2251}
2252
cded19f3 2253static void mmu_pages_clear_parents(struct mmu_page_path *parents)
4731d4c7 2254{
60c8aec6
MT
2255 struct kvm_mmu_page *sp;
2256 unsigned int level = 0;
2257
2258 do {
2259 unsigned int idx = parents->idx[level];
60c8aec6
MT
2260 sp = parents->parent[level];
2261 if (!sp)
2262 return;
2263
e23d3fef 2264 WARN_ON(idx == INVALID_INDEX);
fd951457 2265 clear_unsync_child_bit(sp, idx);
60c8aec6 2266 level++;
0a47cd85 2267 } while (!sp->unsync_children);
60c8aec6 2268}
4731d4c7 2269
60c8aec6
MT
2270static void mmu_sync_children(struct kvm_vcpu *vcpu,
2271 struct kvm_mmu_page *parent)
2272{
2273 int i;
2274 struct kvm_mmu_page *sp;
2275 struct mmu_page_path parents;
2276 struct kvm_mmu_pages pages;
d98ba053 2277 LIST_HEAD(invalid_list);
50c9e6f3 2278 bool flush = false;
60c8aec6 2279
60c8aec6 2280 while (mmu_unsync_walk(parent, &pages)) {
2f84569f 2281 bool protected = false;
b1a36821
MT
2282
2283 for_each_sp(pages, sp, parents, i)
54bf36aa 2284 protected |= rmap_write_protect(vcpu, sp->gfn);
b1a36821 2285
50c9e6f3 2286 if (protected) {
b1a36821 2287 kvm_flush_remote_tlbs(vcpu->kvm);
50c9e6f3
PB
2288 flush = false;
2289 }
b1a36821 2290
60c8aec6 2291 for_each_sp(pages, sp, parents, i) {
1f50f1b3 2292 flush |= kvm_sync_page(vcpu, sp, &invalid_list);
60c8aec6
MT
2293 mmu_pages_clear_parents(&parents);
2294 }
50c9e6f3
PB
2295 if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
2296 kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2297 cond_resched_lock(&vcpu->kvm->mmu_lock);
2298 flush = false;
2299 }
60c8aec6 2300 }
50c9e6f3
PB
2301
2302 kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
4731d4c7
MT
2303}
2304
a30f47cb
XG
2305static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
2306{
e5691a81 2307 atomic_set(&sp->write_flooding_count, 0);
a30f47cb
XG
2308}
2309
2310static void clear_sp_write_flooding_count(u64 *spte)
2311{
2312 struct kvm_mmu_page *sp = page_header(__pa(spte));
2313
2314 __clear_sp_write_flooding_count(sp);
2315}
2316
cea0f0e7
AK
2317static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
2318 gfn_t gfn,
2319 gva_t gaddr,
2320 unsigned level,
f6e2c02b 2321 int direct,
bb11c6c9 2322 unsigned access)
cea0f0e7
AK
2323{
2324 union kvm_mmu_page_role role;
cea0f0e7 2325 unsigned quadrant;
9f1a122f 2326 struct kvm_mmu_page *sp;
9f1a122f 2327 bool need_sync = false;
2a74003a 2328 bool flush = false;
f3414bc7 2329 int collisions = 0;
2a74003a 2330 LIST_HEAD(invalid_list);
cea0f0e7 2331
a770f6f2 2332 role = vcpu->arch.mmu.base_role;
cea0f0e7 2333 role.level = level;
f6e2c02b 2334 role.direct = direct;
84b0c8c6 2335 if (role.direct)
5b7e0102 2336 role.cr4_pae = 0;
41074d07 2337 role.access = access;
c5a78f2b
JR
2338 if (!vcpu->arch.mmu.direct_map
2339 && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
cea0f0e7
AK
2340 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
2341 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
2342 role.quadrant = quadrant;
2343 }
f3414bc7
DM
2344 for_each_valid_sp(vcpu->kvm, sp, gfn) {
2345 if (sp->gfn != gfn) {
2346 collisions++;
2347 continue;
2348 }
2349
7ae680eb
XG
2350 if (!need_sync && sp->unsync)
2351 need_sync = true;
4731d4c7 2352
7ae680eb
XG
2353 if (sp->role.word != role.word)
2354 continue;
4731d4c7 2355
2a74003a
PB
2356 if (sp->unsync) {
2357 /* The page is good, but __kvm_sync_page might still end
2358 * up zapping it. If so, break in order to rebuild it.
2359 */
2360 if (!__kvm_sync_page(vcpu, sp, &invalid_list))
2361 break;
2362
2363 WARN_ON(!list_empty(&invalid_list));
2364 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2365 }
e02aa901 2366
98bba238 2367 if (sp->unsync_children)
a8eeb04a 2368 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
e02aa901 2369
a30f47cb 2370 __clear_sp_write_flooding_count(sp);
7ae680eb 2371 trace_kvm_mmu_get_page(sp, false);
f3414bc7 2372 goto out;
7ae680eb 2373 }
47005792 2374
dfc5aa00 2375 ++vcpu->kvm->stat.mmu_cache_miss;
47005792
TY
2376
2377 sp = kvm_mmu_alloc_page(vcpu, direct);
2378
4db35314
AK
2379 sp->gfn = gfn;
2380 sp->role = role;
7ae680eb
XG
2381 hlist_add_head(&sp->hash_link,
2382 &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
f6e2c02b 2383 if (!direct) {
56ca57f9
XG
2384 /*
 2385 * We should do write protection before syncing pages;
 2386 * otherwise the content of the synced shadow page may
 2387 * be inconsistent with the guest page table.
2388 */
2389 account_shadowed(vcpu->kvm, sp);
2390 if (level == PT_PAGE_TABLE_LEVEL &&
2391 rmap_write_protect(vcpu, gfn))
b1a36821 2392 kvm_flush_remote_tlbs(vcpu->kvm);
9f1a122f 2393
9f1a122f 2394 if (level > PT_PAGE_TABLE_LEVEL && need_sync)
2a74003a 2395 flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
4731d4c7 2396 }
5304b8d3 2397 sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
77492664 2398 clear_page(sp->spt);
f691fe1d 2399 trace_kvm_mmu_get_page(sp, true);
2a74003a
PB
2400
2401 kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
f3414bc7
DM
2402out:
2403 if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
2404 vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
4db35314 2405 return sp;
cea0f0e7
AK
2406}
2407
2d11123a
AK
2408static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
2409 struct kvm_vcpu *vcpu, u64 addr)
2410{
2411 iterator->addr = addr;
2412 iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
2413 iterator->level = vcpu->arch.mmu.shadow_root_level;
81407ca5 2414
2a7266a8
YZ
2415 if (iterator->level == PT64_ROOT_4LEVEL &&
2416 vcpu->arch.mmu.root_level < PT64_ROOT_4LEVEL &&
81407ca5
JR
2417 !vcpu->arch.mmu.direct_map)
2418 --iterator->level;
2419
2d11123a
AK
2420 if (iterator->level == PT32E_ROOT_LEVEL) {
2421 iterator->shadow_addr
2422 = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
2423 iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
2424 --iterator->level;
2425 if (!iterator->shadow_addr)
2426 iterator->level = 0;
2427 }
2428}
2429
2430static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
2431{
2432 if (iterator->level < PT_PAGE_TABLE_LEVEL)
2433 return false;
4d88954d 2434
2d11123a
AK
2435 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
2436 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
2437 return true;
2438}
2439
c2a2ac2b
XG
2440static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
2441 u64 spte)
2d11123a 2442{
c2a2ac2b 2443 if (is_last_spte(spte, iterator->level)) {
052331be
XG
2444 iterator->level = 0;
2445 return;
2446 }
2447
c2a2ac2b 2448 iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
2d11123a
AK
2449 --iterator->level;
2450}
2451
c2a2ac2b
XG
2452static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2453{
bb606a9b 2454 __shadow_walk_next(iterator, *iterator->sptep);
c2a2ac2b
XG
2455}
2456
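/*
 * Illustrative sketch, not part of the original file: walking the shadow page
 * table with the iterator above down to the lowest present entry for @addr.
 * The helper name example_walk_to_leaf_spte() is made up; the caller is
 * assumed to hold kvm->mmu_lock so a plain (non-lockless) walk is safe.
 */
static __maybe_unused u64 example_walk_to_leaf_spte(struct kvm_vcpu *vcpu,
						    u64 addr)
{
	struct kvm_shadow_walk_iterator iterator;
	u64 spte = 0ull;

	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
		return 0ull;

	for (shadow_walk_init(&iterator, vcpu, addr);
	     shadow_walk_okay(&iterator); shadow_walk_next(&iterator)) {
		spte = *iterator.sptep;
		/* Stop at a non-present entry or at the last-level mapping. */
		if (!is_shadow_present_pte(spte) ||
		    is_last_spte(spte, iterator.level))
			break;
	}

	return spte;
}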
98bba238
TY
2457static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
2458 struct kvm_mmu_page *sp)
32ef26a3
AK
2459{
2460 u64 spte;
2461
ffb128c8 2462 BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
7a1638ce 2463
ffb128c8 2464 spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
d0ec49d4 2465 shadow_user_mask | shadow_x_mask | shadow_me_mask;
ac8d57e5
PF
2466
2467 if (sp_ad_disabled(sp))
2468 spte |= shadow_acc_track_value;
2469 else
2470 spte |= shadow_accessed_mask;
24db2734 2471
1df9f2dc 2472 mmu_spte_set(sptep, spte);
98bba238
TY
2473
2474 mmu_page_add_parent_pte(vcpu, sp, sptep);
2475
2476 if (sp->unsync_children || sp->unsync)
2477 mark_unsync(sptep);
32ef26a3
AK
2478}
2479
a357bd22
AK
2480static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2481 unsigned direct_access)
2482{
2483 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
2484 struct kvm_mmu_page *child;
2485
2486 /*
 2487 * For the direct sp, if the guest pte's dirty bit
 2488 * changed from clean to dirty, it would corrupt the
 2489 * sp's access: writes would be allowed through a read-only sp,
 2490 * so we should update the spte at this point to get
 2491 * a new sp with the correct access.
2492 */
2493 child = page_header(*sptep & PT64_BASE_ADDR_MASK);
2494 if (child->role.access == direct_access)
2495 return;
2496
bcdd9a93 2497 drop_parent_pte(child, sptep);
a357bd22
AK
2498 kvm_flush_remote_tlbs(vcpu->kvm);
2499 }
2500}
2501
505aef8f 2502static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
38e3b2b2
XG
2503 u64 *spte)
2504{
2505 u64 pte;
2506 struct kvm_mmu_page *child;
2507
2508 pte = *spte;
2509 if (is_shadow_present_pte(pte)) {
505aef8f 2510 if (is_last_spte(pte, sp->role.level)) {
c3707958 2511 drop_spte(kvm, spte);
505aef8f
XG
2512 if (is_large_pte(pte))
2513 --kvm->stat.lpages;
2514 } else {
38e3b2b2 2515 child = page_header(pte & PT64_BASE_ADDR_MASK);
bcdd9a93 2516 drop_parent_pte(child, spte);
38e3b2b2 2517 }
505aef8f
XG
2518 return true;
2519 }
2520
2521 if (is_mmio_spte(pte))
ce88decf 2522 mmu_spte_clear_no_track(spte);
c3707958 2523
505aef8f 2524 return false;
38e3b2b2
XG
2525}
2526
90cb0529 2527static void kvm_mmu_page_unlink_children(struct kvm *kvm,
4db35314 2528 struct kvm_mmu_page *sp)
a436036b 2529{
697fe2e2 2530 unsigned i;
697fe2e2 2531
38e3b2b2
XG
2532 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
2533 mmu_page_zap_pte(kvm, sp, sp->spt + i);
a436036b
AK
2534}
2535
31aa2b44 2536static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
a436036b 2537{
1e3f42f0
TY
2538 u64 *sptep;
2539 struct rmap_iterator iter;
a436036b 2540
018aabb5 2541 while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
1e3f42f0 2542 drop_parent_pte(sp, sptep);
31aa2b44
AK
2543}
2544
60c8aec6 2545static int mmu_zap_unsync_children(struct kvm *kvm,
7775834a
XG
2546 struct kvm_mmu_page *parent,
2547 struct list_head *invalid_list)
4731d4c7 2548{
60c8aec6
MT
2549 int i, zapped = 0;
2550 struct mmu_page_path parents;
2551 struct kvm_mmu_pages pages;
4731d4c7 2552
60c8aec6 2553 if (parent->role.level == PT_PAGE_TABLE_LEVEL)
4731d4c7 2554 return 0;
60c8aec6 2555
60c8aec6
MT
2556 while (mmu_unsync_walk(parent, &pages)) {
2557 struct kvm_mmu_page *sp;
2558
2559 for_each_sp(pages, sp, parents, i) {
7775834a 2560 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
60c8aec6 2561 mmu_pages_clear_parents(&parents);
77662e00 2562 zapped++;
60c8aec6 2563 }
60c8aec6
MT
2564 }
2565
2566 return zapped;
4731d4c7
MT
2567}
2568
7775834a
XG
2569static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2570 struct list_head *invalid_list)
31aa2b44 2571{
4731d4c7 2572 int ret;
f691fe1d 2573
7775834a 2574 trace_kvm_mmu_prepare_zap_page(sp);
31aa2b44 2575 ++kvm->stat.mmu_shadow_zapped;
7775834a 2576 ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
4db35314 2577 kvm_mmu_page_unlink_children(kvm, sp);
31aa2b44 2578 kvm_mmu_unlink_parents(kvm, sp);
5304b8d3 2579
f6e2c02b 2580 if (!sp->role.invalid && !sp->role.direct)
3ed1a478 2581 unaccount_shadowed(kvm, sp);
5304b8d3 2582
4731d4c7
MT
2583 if (sp->unsync)
2584 kvm_unlink_unsync_page(kvm, sp);
4db35314 2585 if (!sp->root_count) {
54a4f023
GJ
2586 /* Count self */
2587 ret++;
7775834a 2588 list_move(&sp->link, invalid_list);
aa6bd187 2589 kvm_mod_used_mmu_pages(kvm, -1);
2e53d63a 2590 } else {
5b5c6a5a 2591 list_move(&sp->link, &kvm->arch.active_mmu_pages);
05988d72
GN
2592
2593 /*
2594 * The obsolete pages can not be used on any vcpus.
2595 * See the comments in kvm_mmu_invalidate_zap_all_pages().
2596 */
2597 if (!sp->role.invalid && !is_obsolete_sp(kvm, sp))
2598 kvm_reload_remote_mmus(kvm);
2e53d63a 2599 }
7775834a
XG
2600
2601 sp->role.invalid = 1;
4731d4c7 2602 return ret;
a436036b
AK
2603}
2604
7775834a
XG
2605static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2606 struct list_head *invalid_list)
2607{
945315b9 2608 struct kvm_mmu_page *sp, *nsp;
7775834a
XG
2609
2610 if (list_empty(invalid_list))
2611 return;
2612
c142786c 2613 /*
9753f529
LT
 2614 * We need to make sure everyone sees our modifications to
 2615 * the page tables and sees changes to vcpu->mode here. The barrier
 2616 * in kvm_flush_remote_tlbs() achieves this. This pairs
2617 * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
2618 *
2619 * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
2620 * guest mode and/or lockless shadow page table walks.
c142786c
AK
2621 */
2622 kvm_flush_remote_tlbs(kvm);
c2a2ac2b 2623
945315b9 2624 list_for_each_entry_safe(sp, nsp, invalid_list, link) {
7775834a 2625 WARN_ON(!sp->role.invalid || sp->root_count);
aa6bd187 2626 kvm_mmu_free_page(sp);
945315b9 2627 }
7775834a
XG
2628}
2629
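/*
 * Illustrative sketch, not part of the original file: the usual
 * prepare/commit pattern for zapping shadow pages. Pages are queued on a
 * local invalid_list under mmu_lock and committed once, so a single remote
 * TLB flush covers the whole batch. The helper name example_zap_sp() is
 * made up.
 */
static __maybe_unused void example_zap_sp(struct kvm *kvm,
					  struct kvm_mmu_page *sp)
{
	LIST_HEAD(invalid_list);

	spin_lock(&kvm->mmu_lock);
	kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
	kvm_mmu_commit_zap_page(kvm, &invalid_list);
	spin_unlock(&kvm->mmu_lock);
}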
5da59607
TY
2630static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
2631 struct list_head *invalid_list)
2632{
2633 struct kvm_mmu_page *sp;
2634
2635 if (list_empty(&kvm->arch.active_mmu_pages))
2636 return false;
2637
d74c0e6b
GT
2638 sp = list_last_entry(&kvm->arch.active_mmu_pages,
2639 struct kvm_mmu_page, link);
42bcbebf 2640 return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
5da59607
TY
2641}
2642
82ce2c96
IE
2643/*
 2644 * Change the number of mmu pages allocated to the vm.
49d5ca26 2645 * Note: if goal_nr_mmu_pages is too small, you will get a deadlock.
82ce2c96 2646 */
49d5ca26 2647void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
82ce2c96 2648{
d98ba053 2649 LIST_HEAD(invalid_list);
82ce2c96 2650
b34cb590
TY
2651 spin_lock(&kvm->mmu_lock);
2652
49d5ca26 2653 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
5da59607
TY
2654 /* Need to free some mmu pages to achieve the goal. */
2655 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages)
2656 if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list))
2657 break;
82ce2c96 2658
aa6bd187 2659 kvm_mmu_commit_zap_page(kvm, &invalid_list);
49d5ca26 2660 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
82ce2c96 2661 }
82ce2c96 2662
49d5ca26 2663 kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
b34cb590
TY
2664
2665 spin_unlock(&kvm->mmu_lock);
82ce2c96
IE
2666}
2667
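/*
 * Usage note (illustration; the exact call site is an assumption): when the
 * page limit is lowered, e.g.
 *
 *	kvm_mmu_change_mmu_pages(kvm, 64);
 *
 * the oldest entries of active_mmu_pages are zapped until at most 64 shadow
 * pages remain, and n_max_mmu_pages is then pinned at the new goal.
 */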
1cb3f3ae 2668int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
a436036b 2669{
4db35314 2670 struct kvm_mmu_page *sp;
d98ba053 2671 LIST_HEAD(invalid_list);
a436036b
AK
2672 int r;
2673
9ad17b10 2674 pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
a436036b 2675 r = 0;
1cb3f3ae 2676 spin_lock(&kvm->mmu_lock);
b67bfe0d 2677 for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
9ad17b10 2678 pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
7ae680eb
XG
2679 sp->role.word);
2680 r = 1;
f41d335a 2681 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
7ae680eb 2682 }
d98ba053 2683 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1cb3f3ae
XG
2684 spin_unlock(&kvm->mmu_lock);
2685
a436036b 2686 return r;
cea0f0e7 2687}
1cb3f3ae 2688EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
cea0f0e7 2689
5c520e90 2690static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
9cf5cf5a
XG
2691{
2692 trace_kvm_mmu_unsync_page(sp);
2693 ++vcpu->kvm->stat.mmu_unsync;
2694 sp->unsync = 1;
2695
2696 kvm_mmu_mark_parents_unsync(sp);
9cf5cf5a
XG
2697}
2698
3d0c27ad
XG
2699static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
2700 bool can_unsync)
4731d4c7 2701{
5c520e90 2702 struct kvm_mmu_page *sp;
4731d4c7 2703
3d0c27ad
XG
2704 if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
2705 return true;
9cf5cf5a 2706
5c520e90 2707 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
36a2e677 2708 if (!can_unsync)
3d0c27ad 2709 return true;
36a2e677 2710
5c520e90
XG
2711 if (sp->unsync)
2712 continue;
9cf5cf5a 2713
5c520e90
XG
2714 WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
2715 kvm_unsync_page(vcpu, sp);
4731d4c7 2716 }
3d0c27ad 2717
578e1c4d
JS
2718 /*
2719 * We need to ensure that the marking of unsync pages is visible
2720 * before the SPTE is updated to allow writes because
2721 * kvm_mmu_sync_roots() checks the unsync flags without holding
2722 * the MMU lock and so can race with this. If the SPTE was updated
2723 * before the page had been marked as unsync-ed, something like the
2724 * following could happen:
2725 *
2726 * CPU 1 CPU 2
2727 * ---------------------------------------------------------------------
2728 * 1.2 Host updates SPTE
2729 * to be writable
2730 * 2.1 Guest writes a GPTE for GVA X.
2731 * (GPTE being in the guest page table shadowed
2732 * by the SP from CPU 1.)
2733 * This reads SPTE during the page table walk.
2734 * Since SPTE.W is read as 1, there is no
2735 * fault.
2736 *
2737 * 2.2 Guest issues TLB flush.
2738 * That causes a VM Exit.
2739 *
2740 * 2.3 kvm_mmu_sync_pages() reads sp->unsync.
 2741 * Since it is false, it just returns.
 2742 *
 2743 * 2.4 Guest accesses GVA X.
 2744 * Since the mapping in the SP was not updated,
 2745 * the old mapping for GVA X is incorrectly
 2746 * used.
2747 * 1.1 Host marks SP
2748 * as unsync
2749 * (sp->unsync = true)
2750 *
2751 * The write barrier below ensures that 1.1 happens before 1.2 and thus
2752 * the situation in 2.4 does not arise. The implicit barrier in 2.2
2753 * pairs with this write barrier.
2754 */
2755 smp_wmb();
2756
3d0c27ad 2757 return false;
4731d4c7
MT
2758}
2759
ba049e93 2760static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
d1fe9219
PB
2761{
2762 if (pfn_valid(pfn))
aa2e063a
HZ
2763 return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
2764 /*
2765 * Some reserved pages, such as those from NVDIMM
2766 * DAX devices, are not for MMIO, and can be mapped
2767 * with cached memory type for better performance.
 2768 * However, the above check wrongly identifies those pages
 2769 * as MMIO, and results in KVM mapping them with UC
 2770 * memory type, which would hurt performance.
2771 * Therefore, we check the host memory type in addition
2772 * and only treat UC/UC-/WC pages as MMIO.
2773 */
2774 (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));
d1fe9219
PB
2775
2776 return true;
2777}
2778
5ce4786f
JS
2779/* Bits which may be returned by set_spte() */
2780#define SET_SPTE_WRITE_PROTECTED_PT BIT(0)
2781#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
2782
d555c333 2783static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
c2288505 2784 unsigned pte_access, int level,
ba049e93 2785 gfn_t gfn, kvm_pfn_t pfn, bool speculative,
9bdbba13 2786 bool can_unsync, bool host_writable)
1c4f1fd6 2787{
ffb128c8 2788 u64 spte = 0;
1e73f9dd 2789 int ret = 0;
ac8d57e5 2790 struct kvm_mmu_page *sp;
64d4d521 2791
54bf36aa 2792 if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
ce88decf
XG
2793 return 0;
2794
ac8d57e5
PF
2795 sp = page_header(__pa(sptep));
2796 if (sp_ad_disabled(sp))
2797 spte |= shadow_acc_track_value;
2798
d95c5568
BD
2799 /*
2800 * For the EPT case, shadow_present_mask is 0 if hardware
2801 * supports exec-only page table entries. In that case,
2802 * ACC_USER_MASK and shadow_user_mask are used to represent
2803 * read access. See FNAME(gpte_access) in paging_tmpl.h.
2804 */
ffb128c8 2805 spte |= shadow_present_mask;
947da538 2806 if (!speculative)
ac8d57e5 2807 spte |= spte_shadow_accessed_mask(spte);
640d9b0d 2808
7b52345e
SY
2809 if (pte_access & ACC_EXEC_MASK)
2810 spte |= shadow_x_mask;
2811 else
2812 spte |= shadow_nx_mask;
49fde340 2813
1c4f1fd6 2814 if (pte_access & ACC_USER_MASK)
7b52345e 2815 spte |= shadow_user_mask;
49fde340 2816
852e3c19 2817 if (level > PT_PAGE_TABLE_LEVEL)
05da4558 2818 spte |= PT_PAGE_SIZE_MASK;
b0bc3ee2 2819 if (tdp_enabled)
4b12f0de 2820 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
d1fe9219 2821 kvm_is_mmio_pfn(pfn));
1c4f1fd6 2822
9bdbba13 2823 if (host_writable)
1403283a 2824 spte |= SPTE_HOST_WRITEABLE;
f8e453b0
XG
2825 else
2826 pte_access &= ~ACC_WRITE_MASK;
1403283a 2827
daaf216c
TL
2828 if (!kvm_is_mmio_pfn(pfn))
2829 spte |= shadow_me_mask;
2830
35149e21 2831 spte |= (u64)pfn << PAGE_SHIFT;
1c4f1fd6 2832
c2288505 2833 if (pte_access & ACC_WRITE_MASK) {
1c4f1fd6 2834
c2193463 2835 /*
7751babd
XG
 2836 * Another vcpu may create a new sp in the window between
 2837 * mapping_level() and acquiring the mmu-lock. We can
 2838 * allow the guest to retry the access; the mapping can
 2839 * be fixed if the guest refaults.
c2193463 2840 */
852e3c19 2841 if (level > PT_PAGE_TABLE_LEVEL &&
92f94f1e 2842 mmu_gfn_lpage_is_disallowed(vcpu, gfn, level))
be38d276 2843 goto done;
38187c83 2844
49fde340 2845 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
1c4f1fd6 2846
ecc5589f
MT
2847 /*
2848 * Optimization: for pte sync, if spte was writable the hash
2849 * lookup is unnecessary (and expensive). Write protection
 2850 * is the responsibility of mmu_get_page / kvm_sync_page.
2851 * Same reasoning can be applied to dirty page accounting.
2852 */
8dae4445 2853 if (!can_unsync && is_writable_pte(*sptep))
ecc5589f
MT
2854 goto set_pte;
2855
4731d4c7 2856 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
9ad17b10 2857 pgprintk("%s: found shadow page for %llx, marking ro\n",
b8688d51 2858 __func__, gfn);
5ce4786f 2859 ret |= SET_SPTE_WRITE_PROTECTED_PT;
1c4f1fd6 2860 pte_access &= ~ACC_WRITE_MASK;
49fde340 2861 spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
1c4f1fd6
AK
2862 }
2863 }
2864
9b51a630 2865 if (pte_access & ACC_WRITE_MASK) {
54bf36aa 2866 kvm_vcpu_mark_page_dirty(vcpu, gfn);
ac8d57e5 2867 spte |= spte_shadow_dirty_mask(spte);
9b51a630 2868 }
1c4f1fd6 2869
f160c7b7
JS
2870 if (speculative)
2871 spte = mark_spte_for_access_track(spte);
2872
38187c83 2873set_pte:
6e7d0354 2874 if (mmu_spte_update(sptep, spte))
5ce4786f 2875 ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
be38d276 2876done:
1e73f9dd
MT
2877 return ret;
2878}
2879
9b8ebbdb
PB
2880static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
2881 int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
2882 bool speculative, bool host_writable)
1e73f9dd
MT
2883{
2884 int was_rmapped = 0;
53a27b39 2885 int rmap_count;
5ce4786f 2886 int set_spte_ret;
9b8ebbdb 2887 int ret = RET_PF_RETRY;
1e73f9dd 2888
f7616203
XG
2889 pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
2890 *sptep, write_fault, gfn);
1e73f9dd 2891
afd28fe1 2892 if (is_shadow_present_pte(*sptep)) {
1e73f9dd
MT
2893 /*
2894 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
2895 * the parent of the now unreachable PTE.
2896 */
852e3c19
JR
2897 if (level > PT_PAGE_TABLE_LEVEL &&
2898 !is_large_pte(*sptep)) {
1e73f9dd 2899 struct kvm_mmu_page *child;
d555c333 2900 u64 pte = *sptep;
1e73f9dd
MT
2901
2902 child = page_header(pte & PT64_BASE_ADDR_MASK);
bcdd9a93 2903 drop_parent_pte(child, sptep);
3be2264b 2904 kvm_flush_remote_tlbs(vcpu->kvm);
d555c333 2905 } else if (pfn != spte_to_pfn(*sptep)) {
9ad17b10 2906 pgprintk("hfn old %llx new %llx\n",
d555c333 2907 spte_to_pfn(*sptep), pfn);
c3707958 2908 drop_spte(vcpu->kvm, sptep);
91546356 2909 kvm_flush_remote_tlbs(vcpu->kvm);
6bed6b9e
JR
2910 } else
2911 was_rmapped = 1;
1e73f9dd 2912 }
852e3c19 2913
5ce4786f
JS
2914 set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
2915 speculative, true, host_writable);
2916 if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
1e73f9dd 2917 if (write_fault)
9b8ebbdb 2918 ret = RET_PF_EMULATE;
77c3913b 2919 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
a378b4e6 2920 }
5ce4786f
JS
2921 if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH)
2922 kvm_flush_remote_tlbs(vcpu->kvm);
1e73f9dd 2923
029499b4 2924 if (unlikely(is_mmio_spte(*sptep)))
9b8ebbdb 2925 ret = RET_PF_EMULATE;
ce88decf 2926
d555c333 2927 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
9ad17b10 2928 pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
d555c333 2929 is_large_pte(*sptep)? "2MB" : "4kB",
f160c7b7 2930 *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn,
a205bc19 2931 *sptep, sptep);
d555c333 2932 if (!was_rmapped && is_large_pte(*sptep))
05da4558
MT
2933 ++vcpu->kvm->stat.lpages;
2934
ffb61bb3 2935 if (is_shadow_present_pte(*sptep)) {
ffb61bb3
XG
2936 if (!was_rmapped) {
2937 rmap_count = rmap_add(vcpu, sptep, gfn);
2938 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
2939 rmap_recycle(vcpu, sptep, gfn);
2940 }
1c4f1fd6 2941 }
cb9aaa30 2942
f3ac1a4b 2943 kvm_release_pfn_clean(pfn);
029499b4 2944
9b8ebbdb 2945 return ret;
1c4f1fd6
AK
2946}
2947
ba049e93 2948static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
957ed9ef
XG
2949 bool no_dirty_log)
2950{
2951 struct kvm_memory_slot *slot;
957ed9ef 2952
5d163b1c 2953 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
903816fa 2954 if (!slot)
6c8ee57b 2955 return KVM_PFN_ERR_FAULT;
957ed9ef 2956
037d92dc 2957 return gfn_to_pfn_memslot_atomic(slot, gfn);
957ed9ef
XG
2958}
2959
2960static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2961 struct kvm_mmu_page *sp,
2962 u64 *start, u64 *end)
2963{
2964 struct page *pages[PTE_PREFETCH_NUM];
d9ef13c2 2965 struct kvm_memory_slot *slot;
957ed9ef
XG
2966 unsigned access = sp->role.access;
2967 int i, ret;
2968 gfn_t gfn;
2969
2970 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
d9ef13c2
PB
2971 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
2972 if (!slot)
957ed9ef
XG
2973 return -1;
2974
d9ef13c2 2975 ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
957ed9ef
XG
2976 if (ret <= 0)
2977 return -1;
2978
2979 for (i = 0; i < ret; i++, gfn++, start++)
029499b4
TY
2980 mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
2981 page_to_pfn(pages[i]), true, true);
957ed9ef
XG
2982
2983 return 0;
2984}
2985
2986static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
2987 struct kvm_mmu_page *sp, u64 *sptep)
2988{
2989 u64 *spte, *start = NULL;
2990 int i;
2991
2992 WARN_ON(!sp->role.direct);
2993
2994 i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
2995 spte = sp->spt + i;
2996
2997 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
c3707958 2998 if (is_shadow_present_pte(*spte) || spte == sptep) {
957ed9ef
XG
2999 if (!start)
3000 continue;
3001 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
3002 break;
3003 start = NULL;
3004 } else if (!start)
3005 start = spte;
3006 }
3007}
3008
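/*
 * Worked example of the window computation above (illustration only): with
 * PTE_PREFETCH_NUM == 8, a faulting sptep at index 13 of sp->spt gives
 * i = 13 & ~7 = 8, so the candidate window is indices 8..15. Runs of
 * not-yet-present sptes inside that window (the faulting sptep itself acts
 * as a boundary) are then filled in batches by direct_pte_prefetch_many().
 */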
3009static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
3010{
3011 struct kvm_mmu_page *sp;
3012
ac8d57e5
PF
3013 sp = page_header(__pa(sptep));
3014
957ed9ef 3015 /*
ac8d57e5
PF
3016 * Without accessed bits, there's no way to distinguish between
 3017 * actually accessed translations and prefetched ones, so disable pte
3018 * prefetch if accessed bits aren't available.
957ed9ef 3019 */
ac8d57e5 3020 if (sp_ad_disabled(sp))
957ed9ef
XG
3021 return;
3022
957ed9ef
XG
3023 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
3024 return;
3025
3026 __direct_pte_prefetch(vcpu, sp, sptep);
3027}
3028
7ee0e5b2 3029static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable,
ba049e93 3030 int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault)
140754bc 3031{
9f652d21 3032 struct kvm_shadow_walk_iterator iterator;
140754bc 3033 struct kvm_mmu_page *sp;
b90a0e6c 3034 int emulate = 0;
140754bc 3035 gfn_t pseudo_gfn;
6aa8b732 3036
989c6b34
MT
3037 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
3038 return 0;
3039
9f652d21 3040 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
852e3c19 3041 if (iterator.level == level) {
029499b4
TY
3042 emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
3043 write, level, gfn, pfn, prefault,
3044 map_writable);
957ed9ef 3045 direct_pte_prefetch(vcpu, iterator.sptep);
9f652d21
AK
3046 ++vcpu->stat.pf_fixed;
3047 break;
6aa8b732
AK
3048 }
3049
404381c5 3050 drop_large_spte(vcpu, iterator.sptep);
c3707958 3051 if (!is_shadow_present_pte(*iterator.sptep)) {
c9fa0b3b
LJ
3052 u64 base_addr = iterator.addr;
3053
3054 base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
3055 pseudo_gfn = base_addr >> PAGE_SHIFT;
9f652d21 3056 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
bb11c6c9 3057 iterator.level - 1, 1, ACC_ALL);
140754bc 3058
98bba238 3059 link_shadow_page(vcpu, iterator.sptep, sp);
9f652d21
AK
3060 }
3061 }
b90a0e6c 3062 return emulate;
6aa8b732
AK
3063}
3064
77db5cbd 3065static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
bf998156 3066{
77db5cbd
HY
3067 siginfo_t info;
3068
3eb0f519 3069 clear_siginfo(&info);
77db5cbd
HY
3070 info.si_signo = SIGBUS;
3071 info.si_errno = 0;
3072 info.si_code = BUS_MCEERR_AR;
3073 info.si_addr = (void __user *)address;
3074 info.si_addr_lsb = PAGE_SHIFT;
bf998156 3075
77db5cbd 3076 send_sig_info(SIGBUS, &info, tsk);
bf998156
HY
3077}
3078
ba049e93 3079static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
bf998156 3080{
4d8b81ab
XG
3081 /*
 3082 * Do not cache the mmio info caused by writing the readonly gfn
 3083 * into the spte; otherwise a read access on the readonly gfn would
 3084 * also cause an mmio page fault and be treated as an mmio access.
4d8b81ab
XG
3085 */
3086 if (pfn == KVM_PFN_ERR_RO_FAULT)
9b8ebbdb 3087 return RET_PF_EMULATE;
4d8b81ab 3088
e6c1502b 3089 if (pfn == KVM_PFN_ERR_HWPOISON) {
54bf36aa 3090 kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
9b8ebbdb 3091 return RET_PF_RETRY;
d7c55201 3092 }
edba23e5 3093
2c151b25 3094 return -EFAULT;
bf998156
HY
3095}
3096
936a5fe6 3097static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
ba049e93
DW
3098 gfn_t *gfnp, kvm_pfn_t *pfnp,
3099 int *levelp)
936a5fe6 3100{
ba049e93 3101 kvm_pfn_t pfn = *pfnp;
936a5fe6
AA
3102 gfn_t gfn = *gfnp;
3103 int level = *levelp;
3104
3105 /*
 3106 * Check if it's a transparent hugepage. If this were a
 3107 * hugetlbfs page, level wouldn't be set to
3108 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
3109 * here.
3110 */
bf4bea8e 3111 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
936a5fe6 3112 level == PT_PAGE_TABLE_LEVEL &&
127393fb 3113 PageTransCompoundMap(pfn_to_page(pfn)) &&
92f94f1e 3114 !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
936a5fe6
AA
3115 unsigned long mask;
3116 /*
3117 * mmu_notifier_retry was successful and we hold the
3118 * mmu_lock here, so the pmd can't become splitting
3119 * from under us, and in turn
3120 * __split_huge_page_refcount() can't run from under
3121 * us and we can safely transfer the refcount from
3122 * PG_tail to PG_head as we switch the pfn to tail to
3123 * head.
3124 */
3125 *levelp = level = PT_DIRECTORY_LEVEL;
3126 mask = KVM_PAGES_PER_HPAGE(level) - 1;
3127 VM_BUG_ON((gfn & mask) != (pfn & mask));
3128 if (pfn & mask) {
3129 gfn &= ~mask;
3130 *gfnp = gfn;
3131 kvm_release_pfn_clean(pfn);
3132 pfn &= ~mask;
c3586667 3133 kvm_get_pfn(pfn);
936a5fe6
AA
3134 *pfnp = pfn;
3135 }
3136 }
3137}
3138
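/*
 * Worked example of the alignment above (illustration only): at
 * PT_DIRECTORY_LEVEL, mask = KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1
 * = 511. If the 4K fault hit gfn 0x10203 backed by pfn 0x55603, both share
 * the low bits 0x003 (which is what the VM_BUG_ON checks), so gfn and pfn
 * are rounded down to 0x10200 and 0x55600 and a single 2MB mapping covers
 * the originally faulting page.
 */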
d7c55201 3139static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
ba049e93 3140 kvm_pfn_t pfn, unsigned access, int *ret_val)
d7c55201 3141{
d7c55201 3142 /* The pfn is invalid, report the error! */
81c52c56 3143 if (unlikely(is_error_pfn(pfn))) {
d7c55201 3144 *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
798e88b3 3145 return true;
d7c55201
XG
3146 }
3147
ce88decf 3148 if (unlikely(is_noslot_pfn(pfn)))
d7c55201 3149 vcpu_cache_mmio_info(vcpu, gva, gfn, access);
d7c55201 3150
798e88b3 3151 return false;
d7c55201
XG
3152}
3153
e5552fd2 3154static bool page_fault_can_be_fast(u32 error_code)
c7ba5b48 3155{
1c118b82
XG
3156 /*
3157 * Do not fix the mmio spte with invalid generation number which
3158 * need to be updated by slow page fault path.
3159 */
3160 if (unlikely(error_code & PFERR_RSVD_MASK))
3161 return false;
3162
f160c7b7
JS
3163 /* See if the page fault is due to an NX violation */
3164 if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
3165 == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
3166 return false;
3167
c7ba5b48 3168 /*
f160c7b7
JS
3169 * #PF can be fast if:
3170 * 1. The shadow page table entry is not present, which could mean that
3171 * the fault is potentially caused by access tracking (if enabled).
3172 * 2. The shadow page table entry is present and the fault
 3173 * is caused by write-protect; that means we just need to change the W
 3174 * bit of the spte, which can be done outside of the mmu-lock.
3175 *
3176 * However, if access tracking is disabled we know that a non-present
3177 * page must be a genuine page fault where we have to create a new SPTE.
3178 * So, if access tracking is disabled, we return true only for write
3179 * accesses to a present page.
c7ba5b48 3180 */
c7ba5b48 3181
f160c7b7
JS
3182 return shadow_acc_track_mask != 0 ||
3183 ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
3184 == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
c7ba5b48
XG
3185}
3186
97dceba2
JS
3187/*
3188 * Returns true if the SPTE was fixed successfully. Otherwise,
3189 * someone else modified the SPTE from its original value.
3190 */
c7ba5b48 3191static bool
92a476cb 3192fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
d3e328f2 3193 u64 *sptep, u64 old_spte, u64 new_spte)
c7ba5b48 3194{
c7ba5b48
XG
3195 gfn_t gfn;
3196
3197 WARN_ON(!sp->role.direct);
3198
9b51a630
KH
3199 /*
 3200 * Theoretically we could also set the dirty bit (and flush the TLB)
 3201 * here in order to eliminate unnecessary PML logging. See comments in
 3202 * set_spte. But fast_page_fault is very unlikely to happen with PML
 3203 * enabled, so we do not do this. This might result in the same GPA
 3204 * being logged in the PML buffer again when the write really happens,
 3205 * and in mark_page_dirty eventually being called twice, but that does
 3206 * no harm. Skipping it also avoids the TLB flush needed after setting
 3207 * the dirty bit, so non-PML cases won't be impacted.
3208 *
3209 * Compare with set_spte where instead shadow_dirty_mask is set.
3210 */
f160c7b7 3211 if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
97dceba2
JS
3212 return false;
3213
d3e328f2 3214 if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) {
f160c7b7
JS
3215 /*
3216 * The gfn of direct spte is stable since it is
3217 * calculated by sp->gfn.
3218 */
3219 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
3220 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3221 }
c7ba5b48
XG
3222
3223 return true;
3224}
3225
d3e328f2
JS
3226static bool is_access_allowed(u32 fault_err_code, u64 spte)
3227{
3228 if (fault_err_code & PFERR_FETCH_MASK)
3229 return is_executable_pte(spte);
3230
3231 if (fault_err_code & PFERR_WRITE_MASK)
3232 return is_writable_pte(spte);
3233
3234 /* Fault was on Read access */
3235 return spte & PT_PRESENT_MASK;
3236}
3237
c7ba5b48
XG
3238/*
3239 * Return value:
 3240 * - true: let the vcpu access the same address again.
 3241 * - false: let the real page fault path fix it.
3242 */
3243static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
3244 u32 error_code)
3245{
3246 struct kvm_shadow_walk_iterator iterator;
92a476cb 3247 struct kvm_mmu_page *sp;
97dceba2 3248 bool fault_handled = false;
c7ba5b48 3249 u64 spte = 0ull;
97dceba2 3250 uint retry_count = 0;
c7ba5b48 3251
37f6a4e2
MT
3252 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
3253 return false;
3254
e5552fd2 3255 if (!page_fault_can_be_fast(error_code))
c7ba5b48
XG
3256 return false;
3257
3258 walk_shadow_page_lockless_begin(vcpu);
c7ba5b48 3259
97dceba2 3260 do {
d3e328f2 3261 u64 new_spte;
c7ba5b48 3262
d162f30a
JS
3263 for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
3264 if (!is_shadow_present_pte(spte) ||
3265 iterator.level < level)
3266 break;
3267
97dceba2
JS
3268 sp = page_header(__pa(iterator.sptep));
3269 if (!is_last_spte(spte, sp->role.level))
3270 break;
c7ba5b48 3271
97dceba2 3272 /*
f160c7b7
JS
3273 * Check whether the memory access that caused the fault would
3274 * still cause it if it were to be performed right now. If not,
 3275 * then this is a spurious fault caused by a lazily flushed TLB,
3276 * or some other CPU has already fixed the PTE after the
3277 * current CPU took the fault.
97dceba2
JS
3278 *
3279 * Need not check the access of upper level table entries since
3280 * they are always ACC_ALL.
3281 */
d3e328f2
JS
3282 if (is_access_allowed(error_code, spte)) {
3283 fault_handled = true;
3284 break;
3285 }
f160c7b7 3286
d3e328f2
JS
3287 new_spte = spte;
3288
3289 if (is_access_track_spte(spte))
3290 new_spte = restore_acc_track_spte(new_spte);
3291
3292 /*
3293 * Currently, to simplify the code, write-protection can
3294 * be removed in the fast path only if the SPTE was
3295 * write-protected for dirty-logging or access tracking.
3296 */
3297 if ((error_code & PFERR_WRITE_MASK) &&
3298 spte_can_locklessly_be_made_writable(spte))
3299 {
3300 new_spte |= PT_WRITABLE_MASK;
f160c7b7
JS
3301
3302 /*
d3e328f2
JS
3303 * Do not fix write-permission on the large spte. Since
 3304 * we only mark the first page dirty in the dirty-bitmap in
 3305 * fast_pf_fix_direct_spte(), the other pages would be missed
 3306 * if its slot has dirty logging enabled.
3307 *
3308 * Instead, we let the slow page fault path create a
3309 * normal spte to fix the access.
3310 *
3311 * See the comments in kvm_arch_commit_memory_region().
f160c7b7 3312 */
d3e328f2 3313 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
f160c7b7 3314 break;
97dceba2 3315 }
c7ba5b48 3316
f160c7b7 3317 /* Verify that the fault can be handled in the fast path */
d3e328f2
JS
3318 if (new_spte == spte ||
3319 !is_access_allowed(error_code, new_spte))
97dceba2
JS
3320 break;
3321
3322 /*
3323 * Currently, fast page fault only works for direct mapping
 3324 * since the gfn is not stable for indirect shadow pages. See
 3325 * Documentation/virtual/kvm/locking.txt for more details.
3326 */
3327 fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
f160c7b7 3328 iterator.sptep, spte,
d3e328f2 3329 new_spte);
97dceba2
JS
3330 if (fault_handled)
3331 break;
3332
3333 if (++retry_count > 4) {
3334 printk_once(KERN_WARNING
3335 "kvm: Fast #PF retrying more than 4 times.\n");
3336 break;
3337 }
3338
97dceba2 3339 } while (true);
c126d94f 3340
a72faf25 3341 trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
97dceba2 3342 spte, fault_handled);
c7ba5b48
XG
3343 walk_shadow_page_lockless_end(vcpu);
3344
97dceba2 3345 return fault_handled;
c7ba5b48
XG
3346}
3347
78b2c54a 3348static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
ba049e93 3349 gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable);
26eeb53c 3350static int make_mmu_pages_available(struct kvm_vcpu *vcpu);
060c2abe 3351
c7ba5b48
XG
3352static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
3353 gfn_t gfn, bool prefault)
10589a46
MT
3354{
3355 int r;
852e3c19 3356 int level;
fd136902 3357 bool force_pt_level = false;
ba049e93 3358 kvm_pfn_t pfn;
e930bffe 3359 unsigned long mmu_seq;
c7ba5b48 3360 bool map_writable, write = error_code & PFERR_WRITE_MASK;
aaee2c94 3361
fd136902 3362 level = mapping_level(vcpu, gfn, &force_pt_level);
936a5fe6 3363 if (likely(!force_pt_level)) {
936a5fe6
AA
3364 /*
3365 * This path builds a PAE pagetable - so we can map
3366 * 2mb pages at maximum. Therefore check if the level
3367 * is larger than that.
3368 */
3369 if (level > PT_DIRECTORY_LEVEL)
3370 level = PT_DIRECTORY_LEVEL;
852e3c19 3371
936a5fe6 3372 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
fd136902 3373 }
05da4558 3374
c7ba5b48 3375 if (fast_page_fault(vcpu, v, level, error_code))
9b8ebbdb 3376 return RET_PF_RETRY;
c7ba5b48 3377
e930bffe 3378 mmu_seq = vcpu->kvm->mmu_notifier_seq;
4c2155ce 3379 smp_rmb();
060c2abe 3380
78b2c54a 3381 if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
9b8ebbdb 3382 return RET_PF_RETRY;
aaee2c94 3383
d7c55201
XG
3384 if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
3385 return r;
d196e343 3386
aaee2c94 3387 spin_lock(&vcpu->kvm->mmu_lock);
8ca40a70 3388 if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
e930bffe 3389 goto out_unlock;
26eeb53c
WL
3390 if (make_mmu_pages_available(vcpu) < 0)
3391 goto out_unlock;
936a5fe6
AA
3392 if (likely(!force_pt_level))
3393 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
7ee0e5b2 3394 r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
aaee2c94
MT
3395 spin_unlock(&vcpu->kvm->mmu_lock);
3396
10589a46 3397 return r;
e930bffe
AA
3398
3399out_unlock:
3400 spin_unlock(&vcpu->kvm->mmu_lock);
3401 kvm_release_pfn_clean(pfn);
9b8ebbdb 3402 return RET_PF_RETRY;
10589a46
MT
3403}
3404
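/*
 * Note on the mmu_seq handling above (illustration, not original text): the
 * notifier sequence is sampled before the pfn is resolved outside mmu_lock;
 * if an invalidation runs in between, mmu_notifier_retry() observes a newer
 * sequence once the lock is held and the fault is retried rather than
 * installing a stale pfn:
 *
 *	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 *	smp_rmb();
 *	... resolve pfn without holding mmu_lock ...
 *	spin_lock(&vcpu->kvm->mmu_lock);
 *	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
 *		goto out_unlock;
 */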
74b566e6
JS
3405static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
3406 struct list_head *invalid_list)
17ac10ad 3407{
4db35314 3408 struct kvm_mmu_page *sp;
17ac10ad 3409
74b566e6 3410 if (!VALID_PAGE(*root_hpa))
7b53aa56 3411 return;
35af577a 3412
74b566e6
JS
3413 sp = page_header(*root_hpa & PT64_BASE_ADDR_MASK);
3414 --sp->root_count;
3415 if (!sp->root_count && sp->role.invalid)
3416 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
17ac10ad 3417
74b566e6
JS
3418 *root_hpa = INVALID_PAGE;
3419}
3420
7c390d35 3421void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, bool free_prev_root)
74b566e6
JS
3422{
3423 int i;
3424 LIST_HEAD(invalid_list);
3425 struct kvm_mmu *mmu = &vcpu->arch.mmu;
3426
7c390d35
JS
3427 if (!VALID_PAGE(mmu->root_hpa) &&
3428 (!VALID_PAGE(mmu->prev_root.hpa) || !free_prev_root))
17ac10ad 3429 return;
35af577a
GN
3430
3431 spin_lock(&vcpu->kvm->mmu_lock);
17ac10ad 3432
7c390d35
JS
3433 if (free_prev_root)
3434 mmu_free_root_page(vcpu->kvm, &mmu->prev_root.hpa,
3435 &invalid_list);
3436
74b566e6
JS
3437 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
3438 (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
3439 mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, &invalid_list);
3440 } else {
3441 for (i = 0; i < 4; ++i)
3442 if (mmu->pae_root[i] != 0)
3443 mmu_free_root_page(vcpu->kvm, &mmu->pae_root[i],
3444 &invalid_list);
3445 mmu->root_hpa = INVALID_PAGE;
17ac10ad 3446 }
74b566e6 3447
d98ba053 3448 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
aaee2c94 3449 spin_unlock(&vcpu->kvm->mmu_lock);
17ac10ad 3450}
74b566e6 3451EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
17ac10ad 3452
8986ecc0
MT
3453static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
3454{
3455 int ret = 0;
3456
3457 if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
a8eeb04a 3458 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
8986ecc0
MT
3459 ret = 1;
3460 }
3461
3462 return ret;
3463}
3464
651dd37a
JR
3465static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3466{
3467 struct kvm_mmu_page *sp;
7ebaf15e 3468 unsigned i;
651dd37a 3469
855feb67 3470 if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL) {
651dd37a 3471 spin_lock(&vcpu->kvm->mmu_lock);
26eeb53c
WL
 3472 if (make_mmu_pages_available(vcpu) < 0) {
3473 spin_unlock(&vcpu->kvm->mmu_lock);
ed52870f 3474 return -ENOSPC;
26eeb53c 3475 }
855feb67
YZ
3476 sp = kvm_mmu_get_page(vcpu, 0, 0,
3477 vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL);
651dd37a
JR
3478 ++sp->root_count;
3479 spin_unlock(&vcpu->kvm->mmu_lock);
3480 vcpu->arch.mmu.root_hpa = __pa(sp->spt);
3481 } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
3482 for (i = 0; i < 4; ++i) {
3483 hpa_t root = vcpu->arch.mmu.pae_root[i];
3484
fa4a2c08 3485 MMU_WARN_ON(VALID_PAGE(root));
651dd37a 3486 spin_lock(&vcpu->kvm->mmu_lock);
26eeb53c
WL
3487 if (make_mmu_pages_available(vcpu) < 0) {
3488 spin_unlock(&vcpu->kvm->mmu_lock);
ed52870f 3489 return -ENOSPC;
26eeb53c 3490 }
649497d1 3491 sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
bb11c6c9 3492 i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL);
651dd37a
JR
3493 root = __pa(sp->spt);
3494 ++sp->root_count;
3495 spin_unlock(&vcpu->kvm->mmu_lock);
3496 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
651dd37a 3497 }
6292757f 3498 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
651dd37a
JR
3499 } else
3500 BUG();
3501
3502 return 0;
3503}
3504
3505static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
17ac10ad 3506{
4db35314 3507 struct kvm_mmu_page *sp;
81407ca5
JR
3508 u64 pdptr, pm_mask;
3509 gfn_t root_gfn;
3510 int i;
3bb65a22 3511
5777ed34 3512 root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
17ac10ad 3513
651dd37a
JR
3514 if (mmu_check_root(vcpu, root_gfn))
3515 return 1;
3516
3517 /*
3518 * Do we shadow a long mode page table? If so we need to
3519 * write-protect the guests page table root.
3520 */
855feb67 3521 if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
ad312c7c 3522 hpa_t root = vcpu->arch.mmu.root_hpa;
17ac10ad 3523
fa4a2c08 3524 MMU_WARN_ON(VALID_PAGE(root));
651dd37a 3525
8facbbff 3526 spin_lock(&vcpu->kvm->mmu_lock);
26eeb53c
WL
3527 if (make_mmu_pages_available(vcpu) < 0) {
3528 spin_unlock(&vcpu->kvm->mmu_lock);
ed52870f 3529 return -ENOSPC;
26eeb53c 3530 }
855feb67
YZ
3531 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
3532 vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL);
4db35314
AK
3533 root = __pa(sp->spt);
3534 ++sp->root_count;
8facbbff 3535 spin_unlock(&vcpu->kvm->mmu_lock);
ad312c7c 3536 vcpu->arch.mmu.root_hpa = root;
8986ecc0 3537 return 0;
17ac10ad 3538 }
f87f9288 3539
651dd37a
JR
3540 /*
3541 * We shadow a 32 bit page table. This may be a legacy 2-level
81407ca5
JR
3542 * or a PAE 3-level page table. In either case we need to be aware that
3543 * the shadow page table may be a PAE or a long mode page table.
651dd37a 3544 */
81407ca5 3545 pm_mask = PT_PRESENT_MASK;
2a7266a8 3546 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL)
81407ca5
JR
3547 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
3548
17ac10ad 3549 for (i = 0; i < 4; ++i) {
ad312c7c 3550 hpa_t root = vcpu->arch.mmu.pae_root[i];
17ac10ad 3551
fa4a2c08 3552 MMU_WARN_ON(VALID_PAGE(root));
ad312c7c 3553 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
e4e517b4 3554 pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
812f30b2 3555 if (!(pdptr & PT_PRESENT_MASK)) {
ad312c7c 3556 vcpu->arch.mmu.pae_root[i] = 0;
417726a3
AK
3557 continue;
3558 }
6de4f3ad 3559 root_gfn = pdptr >> PAGE_SHIFT;
f87f9288
JR
3560 if (mmu_check_root(vcpu, root_gfn))
3561 return 1;
5a7388c2 3562 }
8facbbff 3563 spin_lock(&vcpu->kvm->mmu_lock);
26eeb53c
WL
3564 if (make_mmu_pages_available(vcpu) < 0) {
3565 spin_unlock(&vcpu->kvm->mmu_lock);
ed52870f 3566 return -ENOSPC;
26eeb53c 3567 }
bb11c6c9
TY
3568 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,
3569 0, ACC_ALL);
4db35314
AK
3570 root = __pa(sp->spt);
3571 ++sp->root_count;
8facbbff
AK
3572 spin_unlock(&vcpu->kvm->mmu_lock);
3573
81407ca5 3574 vcpu->arch.mmu.pae_root[i] = root | pm_mask;
17ac10ad 3575 }
6292757f 3576 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
81407ca5
JR
3577
3578 /*
3579 * If we shadow a 32 bit page table with a long mode page
3580 * table we enter this path.
3581 */
2a7266a8 3582 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
81407ca5
JR
3583 if (vcpu->arch.mmu.lm_root == NULL) {
3584 /*
3585 * The additional page necessary for this is only
3586 * allocated on demand.
3587 */
3588
3589 u64 *lm_root;
3590
3591 lm_root = (void*)get_zeroed_page(GFP_KERNEL);
3592 if (lm_root == NULL)
3593 return 1;
3594
3595 lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;
3596
3597 vcpu->arch.mmu.lm_root = lm_root;
3598 }
3599
3600 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
3601 }
3602
8986ecc0 3603 return 0;
17ac10ad
AK
3604}
3605
651dd37a
JR
3606static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
3607{
3608 if (vcpu->arch.mmu.direct_map)
3609 return mmu_alloc_direct_roots(vcpu);
3610 else
3611 return mmu_alloc_shadow_roots(vcpu);
3612}
3613
578e1c4d 3614void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
0ba73cda
MT
3615{
3616 int i;
3617 struct kvm_mmu_page *sp;
3618
81407ca5
JR
3619 if (vcpu->arch.mmu.direct_map)
3620 return;
3621
0ba73cda
MT
3622 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
3623 return;
6903074c 3624
56f17dd3 3625 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
578e1c4d 3626
855feb67 3627 if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
0ba73cda 3628 hpa_t root = vcpu->arch.mmu.root_hpa;
578e1c4d 3629
0ba73cda 3630 sp = page_header(root);
578e1c4d
JS
3631
3632 /*
3633 * Even if another CPU was marking the SP as unsync-ed
3634 * simultaneously, any guest page table changes are not
3635 * guaranteed to be visible anyway until this VCPU issues a TLB
3636 * flush strictly after those changes are made. We only need to
3637 * ensure that the other CPU sets these flags before any actual
3638 * changes to the page tables are made. The comments in
3639 * mmu_need_write_protect() describe what could go wrong if this
3640 * requirement isn't satisfied.
3641 */
3642 if (!smp_load_acquire(&sp->unsync) &&
3643 !smp_load_acquire(&sp->unsync_children))
3644 return;
3645
3646 spin_lock(&vcpu->kvm->mmu_lock);
3647 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
3648
0ba73cda 3649 mmu_sync_children(vcpu, sp);
578e1c4d 3650
0375f7fa 3651 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
578e1c4d 3652 spin_unlock(&vcpu->kvm->mmu_lock);
0ba73cda
MT
3653 return;
3654 }
578e1c4d
JS
3655
3656 spin_lock(&vcpu->kvm->mmu_lock);
3657 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
3658
0ba73cda
MT
3659 for (i = 0; i < 4; ++i) {
3660 hpa_t root = vcpu->arch.mmu.pae_root[i];
3661
8986ecc0 3662 if (root && VALID_PAGE(root)) {
0ba73cda
MT
3663 root &= PT64_BASE_ADDR_MASK;
3664 sp = page_header(root);
3665 mmu_sync_children(vcpu, sp);
3666 }
3667 }
0ba73cda 3668
578e1c4d 3669 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
6cffe8ca 3670 spin_unlock(&vcpu->kvm->mmu_lock);
0ba73cda 3671}
bfd0a56b 3672EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
0ba73cda 3673
1871c602 3674static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
ab9ae313 3675 u32 access, struct x86_exception *exception)
6aa8b732 3676{
ab9ae313
AK
3677 if (exception)
3678 exception->error_code = 0;
6aa8b732
AK
3679 return vaddr;
3680}
3681
6539e738 3682static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
ab9ae313
AK
3683 u32 access,
3684 struct x86_exception *exception)
6539e738 3685{
ab9ae313
AK
3686 if (exception)
3687 exception->error_code = 0;
54987b7a 3688 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
6539e738
JR
3689}
3690
d625b155
XG
3691static bool
3692__is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level)
3693{
3694 int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f;
3695
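/*
 * rsvd_bits_mask[] is indexed first by bit 7 of the PTE (the large-page
 * bit where it exists) and then by level - 1; bad_mt_xwr flags illegal
 * combinations of the low 6 bits (EPT memory type and XWR).
 */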
3696 return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) |
3697 ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0);
3698}
3699
3700static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
3701{
3702 return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level);
3703}
3704
3705static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level)
3706{
3707 return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
3708}
3709
ded58749 3710static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
ce88decf 3711{
9034e6e8
PB
3712 /*
3713 * A nested guest cannot use the MMIO cache if it is using nested
3714 * page tables, because cr2 is a nGPA while the cache stores GPAs.
3715 */
3716 if (mmu_is_nested(vcpu))
3717 return false;
3718
ce88decf
XG
3719 if (direct)
3720 return vcpu_match_mmio_gpa(vcpu, addr);
3721
3722 return vcpu_match_mmio_gva(vcpu, addr);
3723}
3724
47ab8751
XG
3725/* return true if reserved bit is detected on spte. */
3726static bool
3727walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
ce88decf
XG
3728{
3729 struct kvm_shadow_walk_iterator iterator;
2a7266a8 3730 u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull;
47ab8751
XG
3731 int root, leaf;
3732 bool reserved = false;
ce88decf 3733
37f6a4e2 3734 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
47ab8751 3735 goto exit;
37f6a4e2 3736
ce88decf 3737 walk_shadow_page_lockless_begin(vcpu);
47ab8751 3738
29ecd660
PB
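/*
 * Walk from the root towards the leaf, recording the spte seen at each
 * level so the whole hierarchy can be dumped if a reserved bit is found.
 */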
3739 for (shadow_walk_init(&iterator, vcpu, addr),
3740 leaf = root = iterator.level;
47ab8751
XG
3741 shadow_walk_okay(&iterator);
3742 __shadow_walk_next(&iterator, spte)) {
47ab8751
XG
3743 spte = mmu_spte_get_lockless(iterator.sptep);
3744
3745 sptes[leaf - 1] = spte;
29ecd660 3746 leaf--;
47ab8751 3747
ce88decf
XG
3748 if (!is_shadow_present_pte(spte))
3749 break;
47ab8751
XG
3750
3751 reserved |= is_shadow_zero_bits_set(&vcpu->arch.mmu, spte,
58c95070 3752 iterator.level);
47ab8751
XG
3753 }
3754
ce88decf
XG
3755 walk_shadow_page_lockless_end(vcpu);
3756
47ab8751
XG
3757 if (reserved) {
3758 pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
3759 __func__, addr);
29ecd660 3760 while (root > leaf) {
47ab8751
XG
3761 pr_err("------ spte 0x%llx level %d.\n",
3762 sptes[root - 1], root);
3763 root--;
3764 }
3765 }
3766exit:
3767 *sptep = spte;
3768 return reserved;
ce88decf
XG
3769}
3770
e08d26f0 3771static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
ce88decf
XG
3772{
3773 u64 spte;
47ab8751 3774 bool reserved;
ce88decf 3775
ded58749 3776 if (mmio_info_in_cache(vcpu, addr, direct))
9b8ebbdb 3777 return RET_PF_EMULATE;
ce88decf 3778
47ab8751 3779 reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
450869d6 3780 if (WARN_ON(reserved))
9b8ebbdb 3781 return -EINVAL;
ce88decf
XG
3782
3783 if (is_mmio_spte(spte)) {
3784 gfn_t gfn = get_mmio_spte_gfn(spte);
3785 unsigned access = get_mmio_spte_access(spte);
3786
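/*
 * MMIO sptes carry the memslot generation at which they were created;
 * a stale generation means the memslots have changed, so go back to
 * the slow path and let the spte be recreated.
 */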
54bf36aa 3787 if (!check_mmio_spte(vcpu, spte))
9b8ebbdb 3788 return RET_PF_INVALID;
f8f55942 3789
ce88decf
XG
3790 if (direct)
3791 addr = 0;
4f022648
XG
3792
3793 trace_handle_mmio_page_fault(addr, gfn, access);
ce88decf 3794 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
9b8ebbdb 3795 return RET_PF_EMULATE;
ce88decf
XG
3796 }
3797
ce88decf
XG
3798 /*
 3799 * If the page table was zapped by another CPU, let the CPU fault again on
3800 * the address.
3801 */
9b8ebbdb 3802 return RET_PF_RETRY;
ce88decf 3803}
ce88decf 3804
3d0c27ad
XG
3805static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
3806 u32 error_code, gfn_t gfn)
3807{
3808 if (unlikely(error_code & PFERR_RSVD_MASK))
3809 return false;
3810
3811 if (!(error_code & PFERR_PRESENT_MASK) ||
3812 !(error_code & PFERR_WRITE_MASK))
3813 return false;
3814
3815 /*
 3816 * The guest is writing a page that is write-tracked; this cannot
 3817 * be fixed up by the page fault handler.
3818 */
3819 if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
3820 return true;
3821
3822 return false;
3823}
3824
e5691a81
XG
3825static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
3826{
3827 struct kvm_shadow_walk_iterator iterator;
3828 u64 spte;
3829
3830 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
3831 return;
3832
3833 walk_shadow_page_lockless_begin(vcpu);
3834 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
3835 clear_sp_write_flooding_count(iterator.sptep);
3836 if (!is_shadow_present_pte(spte))
3837 break;
3838 }
3839 walk_shadow_page_lockless_end(vcpu);
3840}
3841
6aa8b732 3842static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
78b2c54a 3843 u32 error_code, bool prefault)
6aa8b732 3844{
3d0c27ad 3845 gfn_t gfn = gva >> PAGE_SHIFT;
e2dec939 3846 int r;
6aa8b732 3847
b8688d51 3848 pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
ce88decf 3849
3d0c27ad 3850 if (page_fault_handle_page_track(vcpu, error_code, gfn))
9b8ebbdb 3851 return RET_PF_EMULATE;
ce88decf 3852
e2dec939
AK
3853 r = mmu_topup_memory_caches(vcpu);
3854 if (r)
3855 return r;
714b93da 3856
fa4a2c08 3857 MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
6aa8b732 3858
6aa8b732 3859
e833240f 3860 return nonpaging_map(vcpu, gva & PAGE_MASK,
c7ba5b48 3861 error_code, gfn, prefault);
6aa8b732
AK
3862}
3863
7e1fbeac 3864static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
af585b92
GN
3865{
3866 struct kvm_arch_async_pf arch;
fb67e14f 3867
7c90705b 3868 arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
af585b92 3869 arch.gfn = gfn;
c4806acd 3870 arch.direct_map = vcpu->arch.mmu.direct_map;
fb67e14f 3871 arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
af585b92 3872
54bf36aa 3873 return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
af585b92
GN
3874}
3875
9bc1f09f 3876bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
af585b92 3877{
35754c98 3878 if (unlikely(!lapic_in_kernel(vcpu) ||
2a266f23
HZ
3879 kvm_event_needs_reinjection(vcpu) ||
3880 vcpu->arch.exception.pending))
af585b92
GN
3881 return false;
3882
52a5c155 3883 if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
9bc1f09f
WL
3884 return false;
3885
af585b92
GN
3886 return kvm_x86_ops->interrupt_allowed(vcpu);
3887}
3888
78b2c54a 3889static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
ba049e93 3890 gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
af585b92 3891{
3520469d 3892 struct kvm_memory_slot *slot;
af585b92
GN
3893 bool async;
3894
3a2936de
JM
3895 /*
3896 * Don't expose private memslots to L2.
3897 */
3898 if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) {
3899 *pfn = KVM_PFN_NOSLOT;
3900 return false;
3901 }
3902
54bf36aa 3903 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3520469d
PB
3904 async = false;
3905 *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
af585b92
GN
3906 if (!async)
3907 return false; /* *pfn has correct page already */
3908
9bc1f09f 3909 if (!prefault && kvm_can_do_async_pf(vcpu)) {
c9b263d2 3910 trace_kvm_try_async_get_page(gva, gfn);
af585b92
GN
3911 if (kvm_find_async_pf_gfn(vcpu, gfn)) {
3912 trace_kvm_async_pf_doublefault(gva, gfn);
3913 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
3914 return true;
3915 } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
3916 return true;
3917 }
3918
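/*
 * Async page fault cannot be used (or its setup failed); fall back to
 * a synchronous fault-in, which may block the vCPU.
 */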
3520469d 3919 *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
af585b92
GN
3920 return false;
3921}
3922
1261bfa3 3923int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
d0006530 3924 u64 fault_address, char *insn, int insn_len)
1261bfa3
WL
3925{
3926 int r = 1;
3927
3928 switch (vcpu->arch.apf.host_apf_reason) {
3929 default:
3930 trace_kvm_page_fault(fault_address, error_code);
3931
d0006530 3932 if (kvm_event_needs_reinjection(vcpu))
1261bfa3
WL
3933 kvm_mmu_unprotect_page_virt(vcpu, fault_address);
3934 r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
3935 insn_len);
3936 break;
3937 case KVM_PV_REASON_PAGE_NOT_PRESENT:
3938 vcpu->arch.apf.host_apf_reason = 0;
3939 local_irq_disable();
a2b7861b 3940 kvm_async_pf_task_wait(fault_address, 0);
1261bfa3
WL
3941 local_irq_enable();
3942 break;
3943 case KVM_PV_REASON_PAGE_READY:
3944 vcpu->arch.apf.host_apf_reason = 0;
3945 local_irq_disable();
3946 kvm_async_pf_task_wake(fault_address);
3947 local_irq_enable();
3948 break;
3949 }
3950 return r;
3951}
3952EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
3953
6a39bbc5
XG
3954static bool
3955check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
3956{
3957 int page_num = KVM_PAGES_PER_HPAGE(level);
3958
3959 gfn &= ~(page_num - 1);
3960
3961 return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
3962}
3963
56028d08 3964static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
78b2c54a 3965 bool prefault)
fb72d167 3966{
ba049e93 3967 kvm_pfn_t pfn;
fb72d167 3968 int r;
852e3c19 3969 int level;
cd1872f0 3970 bool force_pt_level;
05da4558 3971 gfn_t gfn = gpa >> PAGE_SHIFT;
e930bffe 3972 unsigned long mmu_seq;
612819c3
MT
3973 int write = error_code & PFERR_WRITE_MASK;
3974 bool map_writable;
fb72d167 3975
fa4a2c08 3976 MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
fb72d167 3977
3d0c27ad 3978 if (page_fault_handle_page_track(vcpu, error_code, gfn))
9b8ebbdb 3979 return RET_PF_EMULATE;
ce88decf 3980
fb72d167
JR
3981 r = mmu_topup_memory_caches(vcpu);
3982 if (r)
3983 return r;
3984
fd136902
TY
3985 force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
3986 PT_DIRECTORY_LEVEL);
3987 level = mapping_level(vcpu, gfn, &force_pt_level);
936a5fe6 3988 if (likely(!force_pt_level)) {
6a39bbc5
XG
3989 if (level > PT_DIRECTORY_LEVEL &&
3990 !check_hugepage_cache_consistency(vcpu, gfn, level))
3991 level = PT_DIRECTORY_LEVEL;
936a5fe6 3992 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
fd136902 3993 }
852e3c19 3994
c7ba5b48 3995 if (fast_page_fault(vcpu, gpa, level, error_code))
9b8ebbdb 3996 return RET_PF_RETRY;
c7ba5b48 3997
e930bffe 3998 mmu_seq = vcpu->kvm->mmu_notifier_seq;
4c2155ce 3999 smp_rmb();
af585b92 4000
78b2c54a 4001 if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
9b8ebbdb 4002 return RET_PF_RETRY;
af585b92 4003
d7c55201
XG
4004 if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
4005 return r;
4006
fb72d167 4007 spin_lock(&vcpu->kvm->mmu_lock);
8ca40a70 4008 if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
e930bffe 4009 goto out_unlock;
26eeb53c
WL
4010 if (make_mmu_pages_available(vcpu) < 0)
4011 goto out_unlock;
936a5fe6
AA
4012 if (likely(!force_pt_level))
4013 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
7ee0e5b2 4014 r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
fb72d167 4015 spin_unlock(&vcpu->kvm->mmu_lock);
fb72d167
JR
4016
4017 return r;
e930bffe
AA
4018
4019out_unlock:
4020 spin_unlock(&vcpu->kvm->mmu_lock);
4021 kvm_release_pfn_clean(pfn);
9b8ebbdb 4022 return RET_PF_RETRY;
fb72d167
JR
4023}
4024
8a3c1a33
PB
4025static void nonpaging_init_context(struct kvm_vcpu *vcpu,
4026 struct kvm_mmu *context)
6aa8b732 4027{
6aa8b732 4028 context->page_fault = nonpaging_page_fault;
6aa8b732 4029 context->gva_to_gpa = nonpaging_gva_to_gpa;
e8bc217a 4030 context->sync_page = nonpaging_sync_page;
a7052897 4031 context->invlpg = nonpaging_invlpg;
0f53b5b1 4032 context->update_pte = nonpaging_update_pte;
cea0f0e7 4033 context->root_level = 0;
6aa8b732 4034 context->shadow_root_level = PT32E_ROOT_LEVEL;
17c3ba9d 4035 context->root_hpa = INVALID_PAGE;
7c390d35 4036 context->prev_root = KVM_MMU_ROOT_INFO_INVALID;
c5a78f2b 4037 context->direct_map = true;
2d48a985 4038 context->nx = false;
6aa8b732
AK
4039}
4040
7c390d35
JS
4041static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3)
4042{
4043 struct kvm_mmu *mmu = &vcpu->arch.mmu;
4044
4045 /*
4046 * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid
4047 * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
4048 * later if necessary.
4049 */
4050 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
4051 mmu->root_level >= PT64_ROOT_4LEVEL) {
4052 gpa_t prev_cr3 = mmu->prev_root.cr3;
4053
4054 if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT))
4055 return false;
4056
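/*
 * Stash the current root as the "previous" root and pull in whatever
 * root was cached there; if that cached root was built for new_cr3 it
 * can be resumed below without a full reload.
 */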
4057 swap(mmu->root_hpa, mmu->prev_root.hpa);
4058 mmu->prev_root.cr3 = kvm_read_cr3(vcpu);
4059
4060 if (new_cr3 == prev_cr3 && VALID_PAGE(mmu->root_hpa)) {
4061 /*
4062 * It is possible that the cached previous root page is
4063 * obsolete because of a change in the MMU
4064 * generation number. However, that is accompanied by
4065 * KVM_REQ_MMU_RELOAD, which will free the root that we
4066 * have set here and allocate a new one.
4067 */
4068
4069 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
4070 __clear_sp_write_flooding_count(
4071 page_header(mmu->root_hpa));
4072
4073 mmu->set_cr3(vcpu, mmu->root_hpa);
4074
4075 return true;
4076 }
4077 }
4078
4079 return false;
4080}
4081
4082void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3)
6aa8b732 4083{
7c390d35
JS
4084 if (!fast_cr3_switch(vcpu, new_cr3))
4085 kvm_mmu_free_roots(vcpu, false);
6aa8b732
AK
4086}
4087
5777ed34
JR
4088static unsigned long get_cr3(struct kvm_vcpu *vcpu)
4089{
9f8fe504 4090 return kvm_read_cr3(vcpu);
5777ed34
JR
4091}
4092
6389ee94
AK
4093static void inject_page_fault(struct kvm_vcpu *vcpu,
4094 struct x86_exception *fault)
6aa8b732 4095{
6389ee94 4096 vcpu->arch.mmu.inject_page_fault(vcpu, fault);
6aa8b732
AK
4097}
4098
54bf36aa 4099static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
f2fd125d 4100 unsigned access, int *nr_present)
ce88decf
XG
4101{
4102 if (unlikely(is_mmio_spte(*sptep))) {
4103 if (gfn != get_mmio_spte_gfn(*sptep)) {
4104 mmu_spte_clear_no_track(sptep);
4105 return true;
4106 }
4107
4108 (*nr_present)++;
54bf36aa 4109 mark_mmio_spte(vcpu, sptep, gfn, access);
ce88decf
XG
4110 return true;
4111 }
4112
4113 return false;
4114}
4115
6bb69c9b
PB
4116static inline bool is_last_gpte(struct kvm_mmu *mmu,
4117 unsigned level, unsigned gpte)
6fd01b71 4118{
6bb69c9b
PB
4119 /*
4120 * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
4121 * If it is clear, there are no large pages at this level, so clear
4122 * PT_PAGE_SIZE_MASK in gpte if that is the case.
4123 */
4124 gpte &= level - mmu->last_nonleaf_level;
4125
829ee279
LP
4126 /*
4127 * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set
4128 * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
4129 * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
4130 */
4131 gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
4132
6bb69c9b 4133 return gpte & PT_PAGE_SIZE_MASK;
6fd01b71
AK
4134}
4135
37406aaa
NHE
4136#define PTTYPE_EPT 18 /* arbitrary */
4137#define PTTYPE PTTYPE_EPT
4138#include "paging_tmpl.h"
4139#undef PTTYPE
4140
6aa8b732
AK
4141#define PTTYPE 64
4142#include "paging_tmpl.h"
4143#undef PTTYPE
4144
4145#define PTTYPE 32
4146#include "paging_tmpl.h"
4147#undef PTTYPE
4148
6dc98b86
XG
4149static void
4150__reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4151 struct rsvd_bits_validate *rsvd_check,
4152 int maxphyaddr, int level, bool nx, bool gbpages,
6fec2144 4153 bool pse, bool amd)
82725b20 4154{
82725b20 4155 u64 exb_bit_rsvd = 0;
5f7dde7b 4156 u64 gbpages_bit_rsvd = 0;
a0c0feb5 4157 u64 nonleaf_bit8_rsvd = 0;
82725b20 4158
a0a64f50 4159 rsvd_check->bad_mt_xwr = 0;
25d92081 4160
6dc98b86 4161 if (!nx)
82725b20 4162 exb_bit_rsvd = rsvd_bits(63, 63);
6dc98b86 4163 if (!gbpages)
5f7dde7b 4164 gbpages_bit_rsvd = rsvd_bits(7, 7);
a0c0feb5
PB
4165
4166 /*
4167 * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
4168 * leaf entries) on AMD CPUs only.
4169 */
6fec2144 4170 if (amd)
a0c0feb5
PB
4171 nonleaf_bit8_rsvd = rsvd_bits(8, 8);
4172
6dc98b86 4173 switch (level) {
82725b20
DE
4174 case PT32_ROOT_LEVEL:
4175 /* no rsvd bits for 2 level 4K page table entries */
a0a64f50
XG
4176 rsvd_check->rsvd_bits_mask[0][1] = 0;
4177 rsvd_check->rsvd_bits_mask[0][0] = 0;
4178 rsvd_check->rsvd_bits_mask[1][0] =
4179 rsvd_check->rsvd_bits_mask[0][0];
f815bce8 4180
6dc98b86 4181 if (!pse) {
a0a64f50 4182 rsvd_check->rsvd_bits_mask[1][1] = 0;
f815bce8
XG
4183 break;
4184 }
4185
82725b20
DE
4186 if (is_cpuid_PSE36())
4187 /* 36bits PSE 4MB page */
a0a64f50 4188 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
82725b20
DE
4189 else
4190 /* 32 bits PSE 4MB page */
a0a64f50 4191 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
82725b20
DE
4192 break;
4193 case PT32E_ROOT_LEVEL:
a0a64f50 4194 rsvd_check->rsvd_bits_mask[0][2] =
20c466b5 4195 rsvd_bits(maxphyaddr, 63) |
cd9ae5fe 4196 rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */
a0a64f50 4197 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
4c26b4cd 4198 rsvd_bits(maxphyaddr, 62); /* PDE */
a0a64f50 4199 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
82725b20 4200 rsvd_bits(maxphyaddr, 62); /* PTE */
a0a64f50 4201 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
82725b20
DE
4202 rsvd_bits(maxphyaddr, 62) |
4203 rsvd_bits(13, 20); /* large page */
a0a64f50
XG
4204 rsvd_check->rsvd_bits_mask[1][0] =
4205 rsvd_check->rsvd_bits_mask[0][0];
82725b20 4206 break;
855feb67
YZ
4207 case PT64_ROOT_5LEVEL:
4208 rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd |
4209 nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
4210 rsvd_bits(maxphyaddr, 51);
4211 rsvd_check->rsvd_bits_mask[1][4] =
4212 rsvd_check->rsvd_bits_mask[0][4];
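		/* fall through */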
2a7266a8 4213 case PT64_ROOT_4LEVEL:
a0a64f50
XG
4214 rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
4215 nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
4c26b4cd 4216 rsvd_bits(maxphyaddr, 51);
a0a64f50
XG
4217 rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
4218 nonleaf_bit8_rsvd | gbpages_bit_rsvd |
82725b20 4219 rsvd_bits(maxphyaddr, 51);
a0a64f50
XG
4220 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
4221 rsvd_bits(maxphyaddr, 51);
4222 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
4223 rsvd_bits(maxphyaddr, 51);
4224 rsvd_check->rsvd_bits_mask[1][3] =
4225 rsvd_check->rsvd_bits_mask[0][3];
4226 rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd |
5f7dde7b 4227 gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
e04da980 4228 rsvd_bits(13, 29);
a0a64f50 4229 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
4c26b4cd
SY
4230 rsvd_bits(maxphyaddr, 51) |
4231 rsvd_bits(13, 20); /* large page */
a0a64f50
XG
4232 rsvd_check->rsvd_bits_mask[1][0] =
4233 rsvd_check->rsvd_bits_mask[0][0];
82725b20
DE
4234 break;
4235 }
4236}
4237
6dc98b86
XG
4238static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4239 struct kvm_mmu *context)
4240{
4241 __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
4242 cpuid_maxphyaddr(vcpu), context->root_level,
d6321d49
RK
4243 context->nx,
4244 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
6fec2144 4245 is_pse(vcpu), guest_cpuid_is_amd(vcpu));
6dc98b86
XG
4246}
4247
81b8eebb
XG
4248static void
4249__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
4250 int maxphyaddr, bool execonly)
25d92081 4251{
951f9fd7 4252 u64 bad_mt_xwr;
25d92081 4253
855feb67
YZ
4254 rsvd_check->rsvd_bits_mask[0][4] =
4255 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
a0a64f50 4256 rsvd_check->rsvd_bits_mask[0][3] =
25d92081 4257 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
a0a64f50 4258 rsvd_check->rsvd_bits_mask[0][2] =
25d92081 4259 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
a0a64f50 4260 rsvd_check->rsvd_bits_mask[0][1] =
25d92081 4261 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
a0a64f50 4262 rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
25d92081
YZ
4263
4264 /* large page */
855feb67 4265 rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
a0a64f50
XG
4266 rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
4267 rsvd_check->rsvd_bits_mask[1][2] =
25d92081 4268 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
a0a64f50 4269 rsvd_check->rsvd_bits_mask[1][1] =
25d92081 4270 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
a0a64f50 4271 rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
25d92081 4272
951f9fd7
PB
4273 bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */
4274 bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */
4275 bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */
4276 bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */
4277 bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */
4278 if (!execonly) {
4279 /* bits 0..2 must not be 100 unless VMX capabilities allow it */
4280 bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
25d92081 4281 }
951f9fd7 4282 rsvd_check->bad_mt_xwr = bad_mt_xwr;
25d92081
YZ
4283}
4284
81b8eebb
XG
4285static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
4286 struct kvm_mmu *context, bool execonly)
4287{
4288 __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
4289 cpuid_maxphyaddr(vcpu), execonly);
4290}
4291
c258b62b
XG
4292/*
 4293 * The page table on the host is the shadow page table for a page
 4294 * table in the guest or in an AMD nested guest; its MMU features
 4295 * completely follow the features of the guest.
4296 */
4297void
4298reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
4299{
5f0b8199 4300 bool uses_nx = context->nx || context->base_role.smep_andnot_wp;
ea2800dd
BS
4301 struct rsvd_bits_validate *shadow_zero_check;
4302 int i;
5f0b8199 4303
6fec2144
PB
4304 /*
4305 * Passing "true" to the last argument is okay; it adds a check
4306 * on bit 8 of the SPTEs which KVM doesn't use anyway.
4307 */
ea2800dd
BS
4308 shadow_zero_check = &context->shadow_zero_check;
4309 __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
c258b62b 4310 boot_cpu_data.x86_phys_bits,
5f0b8199 4311 context->shadow_root_level, uses_nx,
d6321d49
RK
4312 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
4313 is_pse(vcpu), true);
ea2800dd
BS
4314
4315 if (!shadow_me_mask)
4316 return;
4317
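/*
 * KVM sets the memory-encryption bit (shadow_me_mask) in shadow PTEs
 * when it is supported, so it must not be treated as reserved.
 */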
4318 for (i = context->shadow_root_level; --i >= 0;) {
4319 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4320 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4321 }
4322
c258b62b
XG
4323}
4324EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
4325
6fec2144
PB
4326static inline bool boot_cpu_is_amd(void)
4327{
4328 WARN_ON_ONCE(!tdp_enabled);
4329 return shadow_x_mask == 0;
4330}
4331
c258b62b
XG
4332/*
 4333 * The direct page table on the host uses as many MMU features as
 4334 * possible; however, KVM currently does not do execution protection.
4335 */
4336static void
4337reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4338 struct kvm_mmu *context)
4339{
ea2800dd
BS
4340 struct rsvd_bits_validate *shadow_zero_check;
4341 int i;
4342
4343 shadow_zero_check = &context->shadow_zero_check;
4344
6fec2144 4345 if (boot_cpu_is_amd())
ea2800dd 4346 __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
c258b62b
XG
4347 boot_cpu_data.x86_phys_bits,
4348 context->shadow_root_level, false,
b8291adc
BP
4349 boot_cpu_has(X86_FEATURE_GBPAGES),
4350 true, true);
c258b62b 4351 else
ea2800dd 4352 __reset_rsvds_bits_mask_ept(shadow_zero_check,
c258b62b
XG
4353 boot_cpu_data.x86_phys_bits,
4354 false);
4355
ea2800dd
BS
4356 if (!shadow_me_mask)
4357 return;
4358
4359 for (i = context->shadow_root_level; --i >= 0;) {
4360 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4361 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4362 }
c258b62b
XG
4363}
4364
4365/*
 4366 * Same as the comments in reset_shadow_zero_bits_mask(), except this
 4367 * is the shadow page table for an Intel nested guest.
4368 */
4369static void
4370reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4371 struct kvm_mmu *context, bool execonly)
4372{
4373 __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
4374 boot_cpu_data.x86_phys_bits, execonly);
4375}
4376
09f037aa
PB
4377#define BYTE_MASK(access) \
4378 ((1 & (access) ? 2 : 0) | \
4379 (2 & (access) ? 4 : 0) | \
4380 (3 & (access) ? 8 : 0) | \
4381 (4 & (access) ? 16 : 0) | \
4382 (5 & (access) ? 32 : 0) | \
4383 (6 & (access) ? 64 : 0) | \
4384 (7 & (access) ? 128 : 0))
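/*
 * BYTE_MASK(access) has bit 'i' (1 <= i <= 7) set iff the 3-bit UWX
 * permission combination 'i' contains any bit of @access, so
 * ~BYTE_MASK(access) selects the combinations that lack that permission.
 */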
4385
4386
edc90b7d
XG
4387static void update_permission_bitmask(struct kvm_vcpu *vcpu,
4388 struct kvm_mmu *mmu, bool ept)
97d64b78 4389{
09f037aa
PB
4390 unsigned byte;
4391
4392 const u8 x = BYTE_MASK(ACC_EXEC_MASK);
4393 const u8 w = BYTE_MASK(ACC_WRITE_MASK);
4394 const u8 u = BYTE_MASK(ACC_USER_MASK);
4395
4396 bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0;
4397 bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0;
4398 bool cr0_wp = is_write_protection(vcpu);
97d64b78 4399
97d64b78 4400 for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
09f037aa
PB
4401 unsigned pfec = byte << 1;
4402
97ec8c06 4403 /*
09f037aa
PB
4404 * Each "*f" variable has a 1 bit for each UWX value
4405 * that causes a fault with the given PFEC.
97ec8c06 4406 */
97d64b78 4407
09f037aa
PB
4408 /* Faults from writes to non-writable pages */
4409 u8 wf = (pfec & PFERR_WRITE_MASK) ? ~w : 0;
4410 /* Faults from user mode accesses to supervisor pages */
4411 u8 uf = (pfec & PFERR_USER_MASK) ? ~u : 0;
4412 /* Faults from fetches of non-executable pages*/
4413 u8 ff = (pfec & PFERR_FETCH_MASK) ? ~x : 0;
4414 /* Faults from kernel mode fetches of user pages */
4415 u8 smepf = 0;
4416 /* Faults from kernel mode accesses of user pages */
4417 u8 smapf = 0;
4418
4419 if (!ept) {
4420 /* Faults from kernel mode accesses to user pages */
4421 u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
4422
4423 /* Not really needed: !nx will cause pte.nx to fault */
4424 if (!mmu->nx)
4425 ff = 0;
4426
4427 /* Allow supervisor writes if !cr0.wp */
4428 if (!cr0_wp)
4429 wf = (pfec & PFERR_USER_MASK) ? wf : 0;
4430
4431 /* Disallow supervisor fetches of user code if cr4.smep */
4432 if (cr4_smep)
4433 smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
4434
4435 /*
4436 * SMAP:kernel-mode data accesses from user-mode
4437 * mappings should fault. A fault is considered
4438 * as a SMAP violation if all of the following
 4439 * conditions are true:
4440 * - X86_CR4_SMAP is set in CR4
4441 * - A user page is accessed
4442 * - The access is not a fetch
4443 * - Page fault in kernel mode
4444 * - if CPL = 3 or X86_EFLAGS_AC is clear
4445 *
4446 * Here, we cover the first three conditions.
4447 * The fourth is computed dynamically in permission_fault();
4448 * PFERR_RSVD_MASK bit will be set in PFEC if the access is
4449 * *not* subject to SMAP restrictions.
4450 */
4451 if (cr4_smap)
4452 smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
97d64b78 4453 }
09f037aa
PB
4454
4455 mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
97d64b78
AK
4456 }
4457}
4458
2d344105
HH
4459/*
4460* PKU is an additional mechanism by which the paging controls access to
4461* user-mode addresses based on the value in the PKRU register. Protection
4462* key violations are reported through a bit in the page fault error code.
4463* Unlike other bits of the error code, the PK bit is not known at the
4464* call site of e.g. gva_to_gpa; it must be computed directly in
4465* permission_fault based on two bits of PKRU, on some machine state (CR4,
4466* CR0, EFER, CPL), and on other bits of the error code and the page tables.
4467*
4468* In particular the following conditions come from the error code, the
4469* page tables and the machine state:
4470* - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
4471* - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
4472* - PK is always zero if U=0 in the page tables
4473* - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
4474*
4475* The PKRU bitmask caches the result of these four conditions. The error
4476* code (minus the P bit) and the page table's U bit form an index into the
4477* PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed
4478* with the two bits of the PKRU register corresponding to the protection key.
4479* For the first three conditions above the bits will be 00, thus masking
4480* away both AD and WD. For all reads or if the last condition holds, WD
4481* only will be masked away.
4482*/
4483static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
4484 bool ept)
4485{
4486 unsigned bit;
4487 bool wp;
4488
4489 if (ept) {
4490 mmu->pkru_mask = 0;
4491 return;
4492 }
4493
4494 /* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */
4495 if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) {
4496 mmu->pkru_mask = 0;
4497 return;
4498 }
4499
4500 wp = is_write_protection(vcpu);
4501
4502 for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
4503 unsigned pfec, pkey_bits;
4504 bool check_pkey, check_write, ff, uf, wf, pte_user;
4505
4506 pfec = bit << 1;
4507 ff = pfec & PFERR_FETCH_MASK;
4508 uf = pfec & PFERR_USER_MASK;
4509 wf = pfec & PFERR_WRITE_MASK;
4510
4511 /* PFEC.RSVD is replaced by ACC_USER_MASK. */
4512 pte_user = pfec & PFERR_RSVD_MASK;
4513
4514 /*
 4515 * We only need to check accesses that are not instruction
 4516 * fetches and that target a user page.
4517 */
4518 check_pkey = (!ff && pte_user);
4519 /*
4520 * write access is controlled by PKRU if it is a
4521 * user access or CR0.WP = 1.
4522 */
4523 check_write = check_pkey && wf && (uf || wp);
4524
4525 /* PKRU.AD stops both read and write access. */
4526 pkey_bits = !!check_pkey;
4527 /* PKRU.WD stops write access. */
4528 pkey_bits |= (!!check_write) << 1;
4529
4530 mmu->pkru_mask |= (pkey_bits & 3) << pfec;
4531 }
4532}
4533
6bb69c9b 4534static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
6fd01b71 4535{
6bb69c9b
PB
4536 unsigned root_level = mmu->root_level;
4537
4538 mmu->last_nonleaf_level = root_level;
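/*
 * With legacy 32-bit paging, PDEs can map 4MB pages only when CR4.PSE
 * is set; bump last_nonleaf_level so that is_last_gpte() preserves
 * PT_PAGE_SIZE_MASK for level-2 entries in that case.
 */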
4539 if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
4540 mmu->last_nonleaf_level++;
6fd01b71
AK
4541}
4542
8a3c1a33
PB
4543static void paging64_init_context_common(struct kvm_vcpu *vcpu,
4544 struct kvm_mmu *context,
4545 int level)
6aa8b732 4546{
2d48a985 4547 context->nx = is_nx(vcpu);
4d6931c3 4548 context->root_level = level;
2d48a985 4549
4d6931c3 4550 reset_rsvds_bits_mask(vcpu, context);
25d92081 4551 update_permission_bitmask(vcpu, context, false);
2d344105 4552 update_pkru_bitmask(vcpu, context, false);
6bb69c9b 4553 update_last_nonleaf_level(vcpu, context);
6aa8b732 4554
fa4a2c08 4555 MMU_WARN_ON(!is_pae(vcpu));
6aa8b732 4556 context->page_fault = paging64_page_fault;
6aa8b732 4557 context->gva_to_gpa = paging64_gva_to_gpa;
e8bc217a 4558 context->sync_page = paging64_sync_page;
a7052897 4559 context->invlpg = paging64_invlpg;
0f53b5b1 4560 context->update_pte = paging64_update_pte;
17ac10ad 4561 context->shadow_root_level = level;
17c3ba9d 4562 context->root_hpa = INVALID_PAGE;
7c390d35 4563 context->prev_root = KVM_MMU_ROOT_INFO_INVALID;
c5a78f2b 4564 context->direct_map = false;
6aa8b732
AK
4565}
4566
8a3c1a33
PB
4567static void paging64_init_context(struct kvm_vcpu *vcpu,
4568 struct kvm_mmu *context)
17ac10ad 4569{
855feb67
YZ
4570 int root_level = is_la57_mode(vcpu) ?
4571 PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
4572
4573 paging64_init_context_common(vcpu, context, root_level);
17ac10ad
AK
4574}
4575
8a3c1a33
PB
4576static void paging32_init_context(struct kvm_vcpu *vcpu,
4577 struct kvm_mmu *context)
6aa8b732 4578{
2d48a985 4579 context->nx = false;
4d6931c3 4580 context->root_level = PT32_ROOT_LEVEL;
2d48a985 4581
4d6931c3 4582 reset_rsvds_bits_mask(vcpu, context);
25d92081 4583 update_permission_bitmask(vcpu, context, false);
2d344105 4584 update_pkru_bitmask(vcpu, context, false);
6bb69c9b 4585 update_last_nonleaf_level(vcpu, context);
6aa8b732 4586
6aa8b732 4587 context->page_fault = paging32_page_fault;
6aa8b732 4588 context->gva_to_gpa = paging32_gva_to_gpa;
e8bc217a 4589 context->sync_page = paging32_sync_page;
a7052897 4590 context->invlpg = paging32_invlpg;
0f53b5b1 4591 context->update_pte = paging32_update_pte;
6aa8b732 4592 context->shadow_root_level = PT32E_ROOT_LEVEL;
17c3ba9d 4593 context->root_hpa = INVALID_PAGE;
7c390d35 4594 context->prev_root = KVM_MMU_ROOT_INFO_INVALID;
c5a78f2b 4595 context->direct_map = false;
6aa8b732
AK
4596}
4597
8a3c1a33
PB
4598static void paging32E_init_context(struct kvm_vcpu *vcpu,
4599 struct kvm_mmu *context)
6aa8b732 4600{
8a3c1a33 4601 paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
6aa8b732
AK
4602}
4603
9fa72119
JS
4604static union kvm_mmu_page_role
4605kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu)
4606{
4607 union kvm_mmu_page_role role = {0};
4608
4609 role.guest_mode = is_guest_mode(vcpu);
4610 role.smm = is_smm(vcpu);
4611 role.ad_disabled = (shadow_accessed_mask == 0);
4612 role.level = kvm_x86_ops->get_tdp_level(vcpu);
4613 role.direct = true;
4614 role.access = ACC_ALL;
4615
4616 return role;
4617}
4618
8a3c1a33 4619static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
fb72d167 4620{
ad896af0 4621 struct kvm_mmu *context = &vcpu->arch.mmu;
fb72d167 4622
9fa72119
JS
4623 context->base_role.word = mmu_base_role_mask.word &
4624 kvm_calc_tdp_mmu_root_page_role(vcpu).word;
fb72d167 4625 context->page_fault = tdp_page_fault;
e8bc217a 4626 context->sync_page = nonpaging_sync_page;
a7052897 4627 context->invlpg = nonpaging_invlpg;
0f53b5b1 4628 context->update_pte = nonpaging_update_pte;
855feb67 4629 context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
fb72d167 4630 context->root_hpa = INVALID_PAGE;
7c390d35 4631 context->prev_root = KVM_MMU_ROOT_INFO_INVALID;
c5a78f2b 4632 context->direct_map = true;
1c97f0a0 4633 context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
5777ed34 4634 context->get_cr3 = get_cr3;
e4e517b4 4635 context->get_pdptr = kvm_pdptr_read;
cb659db8 4636 context->inject_page_fault = kvm_inject_page_fault;
fb72d167
JR
4637
4638 if (!is_paging(vcpu)) {
2d48a985 4639 context->nx = false;
fb72d167
JR
4640 context->gva_to_gpa = nonpaging_gva_to_gpa;
4641 context->root_level = 0;
4642 } else if (is_long_mode(vcpu)) {
2d48a985 4643 context->nx = is_nx(vcpu);
855feb67
YZ
4644 context->root_level = is_la57_mode(vcpu) ?
4645 PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
4d6931c3
DB
4646 reset_rsvds_bits_mask(vcpu, context);
4647 context->gva_to_gpa = paging64_gva_to_gpa;
fb72d167 4648 } else if (is_pae(vcpu)) {
2d48a985 4649 context->nx = is_nx(vcpu);
fb72d167 4650 context->root_level = PT32E_ROOT_LEVEL;
4d6931c3
DB
4651 reset_rsvds_bits_mask(vcpu, context);
4652 context->gva_to_gpa = paging64_gva_to_gpa;
fb72d167 4653 } else {
2d48a985 4654 context->nx = false;
fb72d167 4655 context->root_level = PT32_ROOT_LEVEL;
4d6931c3
DB
4656 reset_rsvds_bits_mask(vcpu, context);
4657 context->gva_to_gpa = paging32_gva_to_gpa;
fb72d167
JR
4658 }
4659
25d92081 4660 update_permission_bitmask(vcpu, context, false);
2d344105 4661 update_pkru_bitmask(vcpu, context, false);
6bb69c9b 4662 update_last_nonleaf_level(vcpu, context);
c258b62b 4663 reset_tdp_shadow_zero_bits_mask(vcpu, context);
fb72d167
JR
4664}
4665
9fa72119
JS
4666static union kvm_mmu_page_role
4667kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu)
6aa8b732 4668{
9fa72119 4669 union kvm_mmu_page_role role = {0};
411c588d 4670 bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
edc90b7d 4671 bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
9fa72119
JS
4672
4673 role.nxe = is_nx(vcpu);
4674 role.cr4_pae = !!is_pae(vcpu);
4675 role.cr0_wp = is_write_protection(vcpu);
4676 role.smep_andnot_wp = smep && !is_write_protection(vcpu);
4677 role.smap_andnot_wp = smap && !is_write_protection(vcpu);
4678 role.guest_mode = is_guest_mode(vcpu);
4679 role.smm = is_smm(vcpu);
4680 role.direct = !is_paging(vcpu);
4681 role.access = ACC_ALL;
4682
4683 if (!is_long_mode(vcpu))
4684 role.level = PT32E_ROOT_LEVEL;
4685 else if (is_la57_mode(vcpu))
4686 role.level = PT64_ROOT_5LEVEL;
4687 else
4688 role.level = PT64_ROOT_4LEVEL;
4689
4690 return role;
4691}
4692
4693void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
4694{
ad896af0
PB
4695 struct kvm_mmu *context = &vcpu->arch.mmu;
4696
fa4a2c08 4697 MMU_WARN_ON(VALID_PAGE(context->root_hpa));
6aa8b732
AK
4698
4699 if (!is_paging(vcpu))
8a3c1a33 4700 nonpaging_init_context(vcpu, context);
a9058ecd 4701 else if (is_long_mode(vcpu))
8a3c1a33 4702 paging64_init_context(vcpu, context);
6aa8b732 4703 else if (is_pae(vcpu))
8a3c1a33 4704 paging32E_init_context(vcpu, context);
6aa8b732 4705 else
8a3c1a33 4706 paging32_init_context(vcpu, context);
a770f6f2 4707
9fa72119
JS
4708 context->base_role.word = mmu_base_role_mask.word &
4709 kvm_calc_shadow_mmu_root_page_role(vcpu).word;
c258b62b 4710 reset_shadow_zero_bits_mask(vcpu, context);
52fde8df
JR
4711}
4712EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
4713
9fa72119
JS
4714static union kvm_mmu_page_role
4715kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty)
4716{
4717 union kvm_mmu_page_role role = vcpu->arch.mmu.base_role;
4718
4719 role.level = PT64_ROOT_4LEVEL;
4720 role.direct = false;
4721 role.ad_disabled = !accessed_dirty;
4722 role.guest_mode = true;
4723 role.access = ACC_ALL;
4724
4725 return role;
4726}
4727
ae1e2d10
PB
4728void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
4729 bool accessed_dirty)
155a97a3 4730{
ad896af0 4731 struct kvm_mmu *context = &vcpu->arch.mmu;
9fa72119
JS
4732 union kvm_mmu_page_role root_page_role =
4733 kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty);
ad896af0 4734
fa4a2c08 4735 MMU_WARN_ON(VALID_PAGE(context->root_hpa));
155a97a3 4736
855feb67 4737 context->shadow_root_level = PT64_ROOT_4LEVEL;
155a97a3
NHE
4738
4739 context->nx = true;
ae1e2d10 4740 context->ept_ad = accessed_dirty;
155a97a3
NHE
4741 context->page_fault = ept_page_fault;
4742 context->gva_to_gpa = ept_gva_to_gpa;
4743 context->sync_page = ept_sync_page;
4744 context->invlpg = ept_invlpg;
4745 context->update_pte = ept_update_pte;
855feb67 4746 context->root_level = PT64_ROOT_4LEVEL;
155a97a3 4747 context->root_hpa = INVALID_PAGE;
7c390d35 4748 context->prev_root = KVM_MMU_ROOT_INFO_INVALID;
155a97a3 4749 context->direct_map = false;
9fa72119 4750 context->base_role.word = root_page_role.word & mmu_base_role_mask.word;
155a97a3 4751 update_permission_bitmask(vcpu, context, true);
2d344105 4752 update_pkru_bitmask(vcpu, context, true);
fd19d3b4 4753 update_last_nonleaf_level(vcpu, context);
155a97a3 4754 reset_rsvds_bits_mask_ept(vcpu, context, execonly);
c258b62b 4755 reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
155a97a3
NHE
4756}
4757EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
4758
8a3c1a33 4759static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
52fde8df 4760{
ad896af0
PB
4761 struct kvm_mmu *context = &vcpu->arch.mmu;
4762
4763 kvm_init_shadow_mmu(vcpu);
4764 context->set_cr3 = kvm_x86_ops->set_cr3;
4765 context->get_cr3 = get_cr3;
4766 context->get_pdptr = kvm_pdptr_read;
4767 context->inject_page_fault = kvm_inject_page_fault;
6aa8b732
AK
4768}
4769
8a3c1a33 4770static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
02f59dc9
JR
4771{
4772 struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
4773
4774 g_context->get_cr3 = get_cr3;
e4e517b4 4775 g_context->get_pdptr = kvm_pdptr_read;
02f59dc9
JR
4776 g_context->inject_page_fault = kvm_inject_page_fault;
4777
4778 /*
0af2593b
DM
4779 * Note that arch.mmu.gva_to_gpa translates l2_gpa to l1_gpa using
4780 * L1's nested page tables (e.g. EPT12). The nested translation
4781 * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
4782 * L2's page tables as the first level of translation and L1's
4783 * nested page tables as the second level of translation. Basically
4784 * the gva_to_gpa functions between mmu and nested_mmu are swapped.
02f59dc9
JR
4785 */
4786 if (!is_paging(vcpu)) {
2d48a985 4787 g_context->nx = false;
02f59dc9
JR
4788 g_context->root_level = 0;
4789 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
4790 } else if (is_long_mode(vcpu)) {
2d48a985 4791 g_context->nx = is_nx(vcpu);
855feb67
YZ
4792 g_context->root_level = is_la57_mode(vcpu) ?
4793 PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
4d6931c3 4794 reset_rsvds_bits_mask(vcpu, g_context);
02f59dc9
JR
4795 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
4796 } else if (is_pae(vcpu)) {
2d48a985 4797 g_context->nx = is_nx(vcpu);
02f59dc9 4798 g_context->root_level = PT32E_ROOT_LEVEL;
4d6931c3 4799 reset_rsvds_bits_mask(vcpu, g_context);
02f59dc9
JR
4800 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
4801 } else {
2d48a985 4802 g_context->nx = false;
02f59dc9 4803 g_context->root_level = PT32_ROOT_LEVEL;
4d6931c3 4804 reset_rsvds_bits_mask(vcpu, g_context);
02f59dc9
JR
4805 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
4806 }
4807
25d92081 4808 update_permission_bitmask(vcpu, g_context, false);
2d344105 4809 update_pkru_bitmask(vcpu, g_context, false);
6bb69c9b 4810 update_last_nonleaf_level(vcpu, g_context);
02f59dc9
JR
4811}
4812
8a3c1a33 4813static void init_kvm_mmu(struct kvm_vcpu *vcpu)
fb72d167 4814{
02f59dc9 4815 if (mmu_is_nested(vcpu))
e0c6db3e 4816 init_kvm_nested_mmu(vcpu);
02f59dc9 4817 else if (tdp_enabled)
e0c6db3e 4818 init_kvm_tdp_mmu(vcpu);
fb72d167 4819 else
e0c6db3e 4820 init_kvm_softmmu(vcpu);
fb72d167
JR
4821}
4822
9fa72119
JS
4823static union kvm_mmu_page_role
4824kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
4825{
4826 if (tdp_enabled)
4827 return kvm_calc_tdp_mmu_root_page_role(vcpu);
4828 else
4829 return kvm_calc_shadow_mmu_root_page_role(vcpu);
4830}
4831
8a3c1a33 4832void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
6aa8b732 4833{
95f93af4 4834 kvm_mmu_unload(vcpu);
8a3c1a33 4835 init_kvm_mmu(vcpu);
17c3ba9d 4836}
8668a3c4 4837EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
17c3ba9d
AK
4838
4839int kvm_mmu_load(struct kvm_vcpu *vcpu)
6aa8b732 4840{
714b93da
AK
4841 int r;
4842
e2dec939 4843 r = mmu_topup_memory_caches(vcpu);
17c3ba9d
AK
4844 if (r)
4845 goto out;
8986ecc0 4846 r = mmu_alloc_roots(vcpu);
e2858b4a 4847 kvm_mmu_sync_roots(vcpu);
8986ecc0
MT
4848 if (r)
4849 goto out;
6e42782f 4850 kvm_mmu_load_cr3(vcpu);
714b93da
AK
4851out:
4852 return r;
6aa8b732 4853}
17c3ba9d
AK
4854EXPORT_SYMBOL_GPL(kvm_mmu_load);
4855
4856void kvm_mmu_unload(struct kvm_vcpu *vcpu)
4857{
7c390d35 4858 kvm_mmu_free_roots(vcpu, true);
95f93af4 4859 WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
17c3ba9d 4860}
4b16184c 4861EXPORT_SYMBOL_GPL(kvm_mmu_unload);
6aa8b732 4862
0028425f 4863static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
7c562522
XG
4864 struct kvm_mmu_page *sp, u64 *spte,
4865 const void *new)
0028425f 4866{
30945387 4867 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
7e4e4056
JR
4868 ++vcpu->kvm->stat.mmu_pde_zapped;
4869 return;
30945387 4870 }
0028425f 4871
4cee5764 4872 ++vcpu->kvm->stat.mmu_pte_updated;
7c562522 4873 vcpu->arch.mmu.update_pte(vcpu, sp, spte, new);
0028425f
AK
4874}
4875
79539cec
AK
4876static bool need_remote_flush(u64 old, u64 new)
4877{
4878 if (!is_shadow_present_pte(old))
4879 return false;
4880 if (!is_shadow_present_pte(new))
4881 return true;
4882 if ((old ^ new) & PT64_BASE_ADDR_MASK)
4883 return true;
53166229
GN
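/*
 * NX is inverted so that, like the other bits in PT64_PERM_MASK,
 * "set in old but clear in new" means a permission is being revoked
 * and remote TLBs must be flushed.
 */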
4884 old ^= shadow_nx_mask;
4885 new ^= shadow_nx_mask;
79539cec
AK
4886 return (old & ~new & PT64_PERM_MASK) != 0;
4887}
4888
889e5cbc
XG
4889static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
4890 const u8 *new, int *bytes)
da4a00f0 4891{
889e5cbc
XG
4892 u64 gentry;
4893 int r;
72016f3a 4894
72016f3a
AK
4895 /*
 4896 * Assume that the pte write is on a page table of the same type
49b26e26
XG
4897 * as the current vcpu paging mode since we update the sptes only
4898 * when they have the same mode.
72016f3a 4899 */
889e5cbc 4900 if (is_pae(vcpu) && *bytes == 4) {
72016f3a 4901 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
889e5cbc
XG
4902 *gpa &= ~(gpa_t)7;
4903 *bytes = 8;
54bf36aa 4904 r = kvm_vcpu_read_guest(vcpu, *gpa, &gentry, 8);
72016f3a
AK
4905 if (r)
4906 gentry = 0;
08e850c6
AK
4907 new = (const u8 *)&gentry;
4908 }
4909
889e5cbc 4910 switch (*bytes) {
08e850c6
AK
4911 case 4:
4912 gentry = *(const u32 *)new;
4913 break;
4914 case 8:
4915 gentry = *(const u64 *)new;
4916 break;
4917 default:
4918 gentry = 0;
4919 break;
72016f3a
AK
4920 }
4921
889e5cbc
XG
4922 return gentry;
4923}
4924
4925/*
4926 * If we're seeing too many writes to a page, it may no longer be a page table,
4927 * or we may be forking, in which case it is better to unmap the page.
4928 */
a138fe75 4929static bool detect_write_flooding(struct kvm_mmu_page *sp)
889e5cbc 4930{
a30f47cb
XG
4931 /*
4932 * Skip write-flooding detected for the sp whose level is 1, because
4933 * it can become unsync, then the guest page is not write-protected.
4934 */
f71fa31f 4935 if (sp->role.level == PT_PAGE_TABLE_LEVEL)
a30f47cb 4936 return false;
3246af0e 4937
e5691a81
XG
4938 atomic_inc(&sp->write_flooding_count);
4939 return atomic_read(&sp->write_flooding_count) >= 3;
889e5cbc
XG
4940}
4941
4942/*
4943 * Misaligned accesses are too much trouble to fix up; also, they usually
4944 * indicate a page is not used as a page table.
4945 */
4946static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
4947 int bytes)
4948{
4949 unsigned offset, pte_size, misaligned;
4950
4951 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
4952 gpa, bytes, sp->role.word);
4953
4954 offset = offset_in_page(gpa);
4955 pte_size = sp->role.cr4_pae ? 8 : 4;
5d9ca30e
XG
4956
4957 /*
4958 * Sometimes, the OS only writes the last one bytes to update status
4959 * bits, for example, in linux, andb instruction is used in clear_bit().
4960 */
4961 if (!(offset & (pte_size - 1)) && bytes == 1)
4962 return false;
4963
889e5cbc
XG
4964 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
4965 misaligned |= bytes < 4;
4966
4967 return misaligned;
4968}
4969
4970static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
4971{
4972 unsigned page_offset, quadrant;
4973 u64 *spte;
4974 int level;
4975
4976 page_offset = offset_in_page(gpa);
4977 level = sp->role.level;
4978 *nspte = 1;
4979 if (!sp->role.cr4_pae) {
4980 page_offset <<= 1; /* 32->64 */
4981 /*
4982 * A 32-bit pde maps 4MB while the shadow pdes map
4983 * only 2MB. So we need to double the offset again
4984 * and zap two pdes instead of one.
4985 */
4986 if (level == PT32_ROOT_LEVEL) {
4987 page_offset &= ~7; /* kill rounding error */
4988 page_offset <<= 1;
4989 *nspte = 2;
4990 }
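/*
 * Each shadow page maps only part of the 32-bit guest table,
 * identified by role.quadrant; a write through a different quadrant
 * has nothing to update in this shadow page.
 */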
4991 quadrant = page_offset >> PAGE_SHIFT;
4992 page_offset &= ~PAGE_MASK;
4993 if (quadrant != sp->role.quadrant)
4994 return NULL;
4995 }
4996
4997 spte = &sp->spt[page_offset / sizeof(*spte)];
4998 return spte;
4999}
5000
13d268ca 5001static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
d126363d
JS
5002 const u8 *new, int bytes,
5003 struct kvm_page_track_notifier_node *node)
889e5cbc
XG
5004{
5005 gfn_t gfn = gpa >> PAGE_SHIFT;
889e5cbc 5006 struct kvm_mmu_page *sp;
889e5cbc
XG
5007 LIST_HEAD(invalid_list);
5008 u64 entry, gentry, *spte;
5009 int npte;
b8c67b7a 5010 bool remote_flush, local_flush;
889e5cbc
XG
5011
5012 /*
5013 * If we don't have indirect shadow pages, it means no page is
5014 * write-protected, so we can exit simply.
5015 */
6aa7de05 5016 if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
889e5cbc
XG
5017 return;
5018
b8c67b7a 5019 remote_flush = local_flush = false;
889e5cbc
XG
5020
5021 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
5022
5023 gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, new, &bytes);
5024
5025 /*
5026 * No need to care whether the memory allocation succeeded
5027 * or not, since pte prefetch is skipped if the cache does not
5028 * have enough objects.
5029 */
5030 mmu_topup_memory_caches(vcpu);
5031
5032 spin_lock(&vcpu->kvm->mmu_lock);
5033 ++vcpu->kvm->stat.mmu_pte_write;
0375f7fa 5034 kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
889e5cbc 5035
b67bfe0d 5036 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
a30f47cb 5037 if (detect_write_misaligned(sp, gpa, bytes) ||
a138fe75 5038 detect_write_flooding(sp)) {
b8c67b7a 5039 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
4cee5764 5040 ++vcpu->kvm->stat.mmu_flooded;
0e7bc4b9
AK
5041 continue;
5042 }
889e5cbc
XG
5043
5044 spte = get_written_sptes(sp, gpa, &npte);
5045 if (!spte)
5046 continue;
5047
0671a8e7 5048 local_flush = true;
ac1b714e 5049 while (npte--) {
79539cec 5050 entry = *spte;
38e3b2b2 5051 mmu_page_zap_pte(vcpu->kvm, sp, spte);
fa1de2bf
XG
5052 if (gentry &&
5053 !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
9fa72119 5054 & mmu_base_role_mask.word) && rmap_can_add(vcpu))
7c562522 5055 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
9bb4f6b1 5056 if (need_remote_flush(entry, *spte))
0671a8e7 5057 remote_flush = true;
ac1b714e 5058 ++spte;
9b7a0325 5059 }
9b7a0325 5060 }
b8c67b7a 5061 kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
0375f7fa 5062 kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
aaee2c94 5063 spin_unlock(&vcpu->kvm->mmu_lock);
da4a00f0
AK
5064}
5065
a436036b
AK
5066int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
5067{
10589a46
MT
5068 gpa_t gpa;
5069 int r;
a436036b 5070
c5a78f2b 5071 if (vcpu->arch.mmu.direct_map)
60f24784
AK
5072 return 0;
5073
1871c602 5074 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
10589a46 5075
10589a46 5076 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1cb3f3ae 5077
10589a46 5078 return r;
a436036b 5079}
577bdc49 5080EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
a436036b 5081
26eeb53c 5082static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
ebeace86 5083{
d98ba053 5084 LIST_HEAD(invalid_list);
103ad25a 5085
81f4f76b 5086 if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
26eeb53c 5087 return 0;
81f4f76b 5088
5da59607
TY
5089 while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
5090 if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
5091 break;
ebeace86 5092
4cee5764 5093 ++vcpu->kvm->stat.mmu_recycled;
ebeace86 5094 }
aa6bd187 5095 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
26eeb53c
WL
5096
5097 if (!kvm_mmu_available_pages(vcpu->kvm))
5098 return -ENOSPC;
5099 return 0;
ebeace86 5100}
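/*
 * An illustration of the refill policy above, assuming the usual values
 * KVM_MIN_FREE_MMU_PAGES == 5 and KVM_REFILL_PAGES == 25: once fewer
 * than 5 pages are available, the oldest shadow pages are zapped until
 * 25 are available again, so reclaim happens in bursts rather than one
 * page per fault.
 */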
ebeace86 5101
14727754 5102int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
dc25e89e 5103 void *insn, int insn_len)
3067714c 5104{
1cb3f3ae 5105 int r, emulation_type = EMULTYPE_RETRY;
3067714c 5106 enum emulation_result er;
9034e6e8 5107 bool direct = vcpu->arch.mmu.direct_map;
3067714c 5108
618232e2
BS
5109 /* With shadow page tables, fault_address contains a GVA or nGPA. */
5110 if (vcpu->arch.mmu.direct_map) {
5111 vcpu->arch.gpa_available = true;
5112 vcpu->arch.gpa_val = cr2;
5113 }
3067714c 5114
9b8ebbdb 5115 r = RET_PF_INVALID;
e9ee956e
TY
5116 if (unlikely(error_code & PFERR_RSVD_MASK)) {
5117 r = handle_mmio_page_fault(vcpu, cr2, direct);
9b8ebbdb 5118 if (r == RET_PF_EMULATE) {
e9ee956e
TY
5119 emulation_type = 0;
5120 goto emulate;
5121 }
e9ee956e 5122 }
3067714c 5123
9b8ebbdb
PB
5124 if (r == RET_PF_INVALID) {
5125 r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
5126 false);
5127 WARN_ON(r == RET_PF_INVALID);
5128 }
5129
5130 if (r == RET_PF_RETRY)
5131 return 1;
3067714c 5132 if (r < 0)
e9ee956e 5133 return r;
3067714c 5134
14727754
TL
5135 /*
5136 * Before emulating the instruction, check if the error code
5137 * was due to a RO violation while translating the guest page.
5138 * This can occur when using nested virtualization with nested
5139 * paging in both guests. If true, we simply unprotect the page
5140 * and resume the guest.
14727754 5141 */
64531a3b 5142 if (vcpu->arch.mmu.direct_map &&
eebed243 5143 (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
14727754
TL
5144 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
5145 return 1;
5146 }
5147
ded58749 5148 if (mmio_info_in_cache(vcpu, cr2, direct))
1cb3f3ae 5149 emulation_type = 0;
e9ee956e 5150emulate:
00b10fe1
BS
5151 /*
5152 * On AMD platforms, under certain conditions insn_len may be zero on #NPF.
5153 * This can happen if a guest gets a page fault on a data access but the HW
5154 * table walker is not able to read the instruction page (e.g. the instruction
5155 * page is not present in memory). In those cases we simply restart the
5156 * guest.
5157 */
5158 if (unlikely(insn && !insn_len))
5159 return 1;
5160
1cb3f3ae 5161 er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
3067714c
AK
5162
5163 switch (er) {
5164 case EMULATE_DONE:
5165 return 1;
ac0a48c3 5166 case EMULATE_USER_EXIT:
3067714c 5167 ++vcpu->stat.mmio_exits;
6d77dbfc 5168 /* fall through */
3067714c 5169 case EMULATE_FAIL:
3f5d18a9 5170 return 0;
3067714c
AK
5171 default:
5172 BUG();
5173 }
3067714c
AK
5174}
5175EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
5176
a7052897
MT
5177void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
5178{
a7052897 5179 vcpu->arch.mmu.invlpg(vcpu, gva);
77c3913b 5180 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
a7052897
MT
5181 ++vcpu->stat.invlpg;
5182}
5183EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
5184
18552672
JR
5185void kvm_enable_tdp(void)
5186{
5187 tdp_enabled = true;
5188}
5189EXPORT_SYMBOL_GPL(kvm_enable_tdp);
5190
5f4cb662
JR
5191void kvm_disable_tdp(void)
5192{
5193 tdp_enabled = false;
5194}
5195EXPORT_SYMBOL_GPL(kvm_disable_tdp);
5196
6aa8b732
AK
5197static void free_mmu_pages(struct kvm_vcpu *vcpu)
5198{
ad312c7c 5199 free_page((unsigned long)vcpu->arch.mmu.pae_root);
87ca74ad 5200 free_page((unsigned long)vcpu->arch.mmu.lm_root);
6aa8b732
AK
5201}
5202
5203static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
5204{
17ac10ad 5205 struct page *page;
6aa8b732
AK
5206 int i;
5207
17ac10ad
AK
5208 /*
5209 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
5210 * Therefore we need to allocate shadow page tables in the first
5211 * 4GB of memory, which happens to fit the DMA32 zone.
5212 */
5213 page = alloc_page(GFP_KERNEL | __GFP_DMA32);
5214 if (!page)
d7fa6ab2
WY
5215 return -ENOMEM;
5216
ad312c7c 5217 vcpu->arch.mmu.pae_root = page_address(page);
17ac10ad 5218 for (i = 0; i < 4; ++i)
ad312c7c 5219 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
17ac10ad 5220
6aa8b732 5221 return 0;
6aa8b732
AK
5222}
5223
8018c27b 5224int kvm_mmu_create(struct kvm_vcpu *vcpu)
6aa8b732 5225{
e459e322
XG
5226 vcpu->arch.walk_mmu = &vcpu->arch.mmu;
5227 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
7c390d35 5228 vcpu->arch.mmu.prev_root = KVM_MMU_ROOT_INFO_INVALID;
e459e322
XG
5229 vcpu->arch.mmu.translate_gpa = translate_gpa;
5230 vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
6aa8b732 5231
8018c27b
IM
5232 return alloc_mmu_pages(vcpu);
5233}
6aa8b732 5234
8a3c1a33 5235void kvm_mmu_setup(struct kvm_vcpu *vcpu)
8018c27b 5236{
fa4a2c08 5237 MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
2c264957 5238
8a3c1a33 5239 init_kvm_mmu(vcpu);
6aa8b732
AK
5240}
5241
b5f5fdca 5242static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
d126363d
JS
5243 struct kvm_memory_slot *slot,
5244 struct kvm_page_track_notifier_node *node)
b5f5fdca
XC
5245{
5246 kvm_mmu_invalidate_zap_all_pages(kvm);
5247}
5248
13d268ca
XG
5249void kvm_mmu_init_vm(struct kvm *kvm)
5250{
5251 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5252
5253 node->track_write = kvm_mmu_pte_write;
b5f5fdca 5254 node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
13d268ca
XG
5255 kvm_page_track_register_notifier(kvm, node);
5256}
5257
5258void kvm_mmu_uninit_vm(struct kvm *kvm)
5259{
5260 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5261
5262 kvm_page_track_unregister_notifier(kvm, node);
5263}
5264
1bad2b2a 5265/* The return value indicates if tlb flush on all vcpus is needed. */
018aabb5 5266typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
1bad2b2a
XG
5267
5268/* The caller should hold mmu-lock before calling this function. */
928a4c39 5269static __always_inline bool
1bad2b2a
XG
5270slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
5271 slot_level_handler fn, int start_level, int end_level,
5272 gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
5273{
5274 struct slot_rmap_walk_iterator iterator;
5275 bool flush = false;
5276
5277 for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
5278 end_gfn, &iterator) {
5279 if (iterator.rmap)
5280 flush |= fn(kvm, iterator.rmap);
5281
5282 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
5283 if (flush && lock_flush_tlb) {
5284 kvm_flush_remote_tlbs(kvm);
5285 flush = false;
5286 }
5287 cond_resched_lock(&kvm->mmu_lock);
5288 }
5289 }
5290
5291 if (flush && lock_flush_tlb) {
5292 kvm_flush_remote_tlbs(kvm);
5293 flush = false;
5294 }
5295
5296 return flush;
5297}
5298
928a4c39 5299static __always_inline bool
1bad2b2a
XG
5300slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5301 slot_level_handler fn, int start_level, int end_level,
5302 bool lock_flush_tlb)
5303{
5304 return slot_handle_level_range(kvm, memslot, fn, start_level,
5305 end_level, memslot->base_gfn,
5306 memslot->base_gfn + memslot->npages - 1,
5307 lock_flush_tlb);
5308}
5309
928a4c39 5310static __always_inline bool
1bad2b2a
XG
5311slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5312 slot_level_handler fn, bool lock_flush_tlb)
5313{
5314 return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
5315 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
5316}
5317
928a4c39 5318static __always_inline bool
1bad2b2a
XG
5319slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5320 slot_level_handler fn, bool lock_flush_tlb)
5321{
5322 return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
5323 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
5324}
5325
928a4c39 5326static __always_inline bool
1bad2b2a
XG
5327slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
5328 slot_level_handler fn, bool lock_flush_tlb)
5329{
5330 return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
5331 PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
5332}
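/*
 * A minimal usage sketch for the slot_handle_*() helpers above; the
 * handler and wrapper names below are illustrative and not functions in
 * this file.  A handler is given one rmap head at a time and returns
 * whether a TLB flush is needed; the iterator takes care of breaking
 * the lock when needed.
 */
#if 0	/* illustrative sketch, not part of this file */
static bool touch_rmap_sketch(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
	/* Inspect or modify the sptes chained off this rmap head here. */
	return false;	/* no TLB flush required */
}

static void walk_slot_sketch(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
	spin_lock(&kvm->mmu_lock);
	slot_handle_all_level(kvm, memslot, touch_rmap_sketch, false);
	spin_unlock(&kvm->mmu_lock);
}
#endif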
5333
efdfe536
XG
5334void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
5335{
5336 struct kvm_memslots *slots;
5337 struct kvm_memory_slot *memslot;
9da0e4d5 5338 int i;
efdfe536
XG
5339
5340 spin_lock(&kvm->mmu_lock);
9da0e4d5
PB
5341 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
5342 slots = __kvm_memslots(kvm, i);
5343 kvm_for_each_memslot(memslot, slots) {
5344 gfn_t start, end;
5345
5346 start = max(gfn_start, memslot->base_gfn);
5347 end = min(gfn_end, memslot->base_gfn + memslot->npages);
5348 if (start >= end)
5349 continue;
efdfe536 5350
9da0e4d5
PB
5351 slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
5352 PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
5353 start, end - 1, true);
5354 }
efdfe536
XG
5355 }
5356
5357 spin_unlock(&kvm->mmu_lock);
5358}
5359
018aabb5
TY
5360static bool slot_rmap_write_protect(struct kvm *kvm,
5361 struct kvm_rmap_head *rmap_head)
d77aa73c 5362{
018aabb5 5363 return __rmap_write_protect(kvm, rmap_head, false);
d77aa73c
XG
5364}
5365
1c91cad4
KH
5366void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
5367 struct kvm_memory_slot *memslot)
6aa8b732 5368{
d77aa73c 5369 bool flush;
6aa8b732 5370
9d1beefb 5371 spin_lock(&kvm->mmu_lock);
d77aa73c
XG
5372 flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
5373 false);
9d1beefb 5374 spin_unlock(&kvm->mmu_lock);
198c74f4
XG
5375
5376 /*
5377 * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log(),
5378 * which both flush the tlb outside of mmu-lock, must be serialized by
5379 * kvm->slots_lock; otherwise a tlb flush could be missed.
5380 */
5381 lockdep_assert_held(&kvm->slots_lock);
5382
5383 /*
5384 * We can flush all the TLBs out of the mmu lock without TLB
5385 * corruption, since we only change sptes from writable to
5386 * read-only, so we only need to care about the case of changing
5387 * a spte from present to present (changing a spte from present
5388 * to non-present flushes all the TLBs immediately). In other
5389 * words, the only case we care about is mmu_spte_update(), where
5390 * we have checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE
5391 * instead of PT_WRITABLE_MASK, which means it no longer depends
5392 * on PT_WRITABLE_MASK.
5393 */
d91ffee9
KH
5394 if (flush)
5395 kvm_flush_remote_tlbs(kvm);
6aa8b732 5396}
37a7d8b0 5397
3ea3b7fa 5398static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
018aabb5 5399 struct kvm_rmap_head *rmap_head)
3ea3b7fa
WL
5400{
5401 u64 *sptep;
5402 struct rmap_iterator iter;
5403 int need_tlb_flush = 0;
ba049e93 5404 kvm_pfn_t pfn;
3ea3b7fa
WL
5405 struct kvm_mmu_page *sp;
5406
0d536790 5407restart:
018aabb5 5408 for_each_rmap_spte(rmap_head, &iter, sptep) {
3ea3b7fa
WL
5409 sp = page_header(__pa(sptep));
5410 pfn = spte_to_pfn(*sptep);
5411
5412 /*
decf6333
XG
5413 * We cannot do huge page mapping for indirect shadow pages,
5414 * which are found on the last rmap (level = 1) when not using
5415 * tdp; such shadow pages are synced with the page table in
5416 * the guest, and the guest page table uses 4K page size
5417 * mappings when the indirect sp has level = 1.
3ea3b7fa
WL
5418 */
5419 if (sp->role.direct &&
5420 !kvm_is_reserved_pfn(pfn) &&
127393fb 5421 PageTransCompoundMap(pfn_to_page(pfn))) {
3ea3b7fa 5422 drop_spte(kvm, sptep);
3ea3b7fa 5423 need_tlb_flush = 1;
0d536790
XG
5424 goto restart;
5425 }
3ea3b7fa
WL
5426 }
5427
5428 return need_tlb_flush;
5429}
5430
5431void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
f36f3f28 5432 const struct kvm_memory_slot *memslot)
3ea3b7fa 5433{
f36f3f28 5434 /* FIXME: const-ify all uses of struct kvm_memory_slot. */
3ea3b7fa 5435 spin_lock(&kvm->mmu_lock);
f36f3f28
PB
5436 slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
5437 kvm_mmu_zap_collapsible_spte, true);
3ea3b7fa
WL
5438 spin_unlock(&kvm->mmu_lock);
5439}
5440
f4b4b180
KH
5441void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
5442 struct kvm_memory_slot *memslot)
5443{
d77aa73c 5444 bool flush;
f4b4b180
KH
5445
5446 spin_lock(&kvm->mmu_lock);
d77aa73c 5447 flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
f4b4b180
KH
5448 spin_unlock(&kvm->mmu_lock);
5449
5450 lockdep_assert_held(&kvm->slots_lock);
5451
5452 /*
5453 * It's also safe to flush TLBs out of mmu lock here as currently this
5454 * function is only used for dirty logging, in which case flushing TLB
5455 * out of mmu lock also guarantees no dirty pages will be lost in
5456 * dirty_bitmap.
5457 */
5458 if (flush)
5459 kvm_flush_remote_tlbs(kvm);
5460}
5461EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
5462
5463void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
5464 struct kvm_memory_slot *memslot)
5465{
d77aa73c 5466 bool flush;
f4b4b180
KH
5467
5468 spin_lock(&kvm->mmu_lock);
d77aa73c
XG
5469 flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
5470 false);
f4b4b180
KH
5471 spin_unlock(&kvm->mmu_lock);
5472
5473 /* see kvm_mmu_slot_remove_write_access */
5474 lockdep_assert_held(&kvm->slots_lock);
5475
5476 if (flush)
5477 kvm_flush_remote_tlbs(kvm);
5478}
5479EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
5480
5481void kvm_mmu_slot_set_dirty(struct kvm *kvm,
5482 struct kvm_memory_slot *memslot)
5483{
d77aa73c 5484 bool flush;
f4b4b180
KH
5485
5486 spin_lock(&kvm->mmu_lock);
d77aa73c 5487 flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
f4b4b180
KH
5488 spin_unlock(&kvm->mmu_lock);
5489
5490 lockdep_assert_held(&kvm->slots_lock);
5491
5492 /* see kvm_mmu_slot_leaf_clear_dirty */
5493 if (flush)
5494 kvm_flush_remote_tlbs(kvm);
5495}
5496EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
5497
e7d11c7a 5498#define BATCH_ZAP_PAGES 10
5304b8d3
XG
5499static void kvm_zap_obsolete_pages(struct kvm *kvm)
5500{
5501 struct kvm_mmu_page *sp, *node;
e7d11c7a 5502 int batch = 0;
5304b8d3
XG
5503
5504restart:
5505 list_for_each_entry_safe_reverse(sp, node,
5506 &kvm->arch.active_mmu_pages, link) {
e7d11c7a
XG
5507 int ret;
5508
5304b8d3
XG
5509 /*
5510 * No obsolete page exists before a newly created page, since
5511 * active_mmu_pages is a FIFO list.
5512 */
5513 if (!is_obsolete_sp(kvm, sp))
5514 break;
5515
5516 /*
5304b8d3
XG
5517 * Since we are walking the list in reverse and invalid
5518 * pages are moved to the head, skipping the invalid pages
5519 * helps us avoid walking the list forever.
5520 */
5521 if (sp->role.invalid)
5522 continue;
5523
f34d251d
XG
5524 /*
5525 * No need to flush the tlb, since we only zap sps with an
5526 * invalid generation number.
5527 */
e7d11c7a 5528 if (batch >= BATCH_ZAP_PAGES &&
f34d251d 5529 cond_resched_lock(&kvm->mmu_lock)) {
e7d11c7a 5530 batch = 0;
5304b8d3
XG
5531 goto restart;
5532 }
5533
365c8868
XG
5534 ret = kvm_mmu_prepare_zap_page(kvm, sp,
5535 &kvm->arch.zapped_obsolete_pages);
e7d11c7a
XG
5536 batch += ret;
5537
5538 if (ret)
5304b8d3
XG
5539 goto restart;
5540 }
5541
f34d251d
XG
5542 /*
5543 * Should flush the tlb before freeing the page tables, since
5544 * lockless walkers may still be using the pages.
5545 */
365c8868 5546 kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
5304b8d3
XG
5547}
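/*
 * For reference, the is_obsolete_sp() check used above compares the
 * page's generation with the VM-wide one bumped in
 * kvm_mmu_invalidate_zap_all_pages(); a rough sketch of the idea
 * (the exact definition lives earlier in this file):
 */
#if 0	/* illustrative sketch, not part of this file */
static bool is_obsolete_sp_sketch(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
}
#endif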
5548
5549/*
5550 * Fast invalidate all shadow pages and use lock-break technique
5551 * to zap obsolete pages.
5552 *
5553 * It's required when memslot is being deleted or VM is being
5554 * destroyed, in these cases, we should ensure that KVM MMU does
5555 * not use any resource of the being-deleted slot or all slots
5556 * after calling the function.
5557 */
5558void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)
5559{
5560 spin_lock(&kvm->mmu_lock);
35006126 5561 trace_kvm_mmu_invalidate_zap_all_pages(kvm);
5304b8d3
XG
5562 kvm->arch.mmu_valid_gen++;
5563
f34d251d
XG
5564 /*
5565 * Notify all vcpus to reload their shadow page tables
5566 * and flush the TLB. All vcpus will then switch to a new
5567 * shadow page table with the new mmu_valid_gen.
5568 *
5569 * Note: we should do this under the protection of
5570 * mmu-lock; otherwise, a vcpu could purge a shadow page
5571 * but miss the tlb flush.
5572 */
5573 kvm_reload_remote_mmus(kvm);
5574
5304b8d3
XG
5575 kvm_zap_obsolete_pages(kvm);
5576 spin_unlock(&kvm->mmu_lock);
5577}
5578
365c8868
XG
5579static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
5580{
5581 return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
5582}
5583
54bf36aa 5584void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots)
f8f55942
XG
5585{
5586 /*
5587 * The very rare case: if the masked generation number wraps around
5588 * to zero, zap all shadow pages so no stale MMIO sptes survive.
f8f55942 5589 */
54bf36aa 5590 if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) {
ae0f5499 5591 kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
a8eca9dc 5592 kvm_mmu_invalidate_zap_all_pages(kvm);
7a2e8aaf 5593 }
f8f55942
XG
5594}
5595
70534a73
DC
5596static unsigned long
5597mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
3ee16c81
IE
5598{
5599 struct kvm *kvm;
1495f230 5600 int nr_to_scan = sc->nr_to_scan;
70534a73 5601 unsigned long freed = 0;
3ee16c81 5602
2f303b74 5603 spin_lock(&kvm_lock);
3ee16c81
IE
5604
5605 list_for_each_entry(kvm, &vm_list, vm_list) {
3d56cbdf 5606 int idx;
d98ba053 5607 LIST_HEAD(invalid_list);
3ee16c81 5608
35f2d16b
TY
5609 /*
5610 * Never scan more than sc->nr_to_scan VM instances.
5611 * In practice this condition is never hit, since we do not try
5612 * to shrink more than one VM and it is very unlikely to see
5613 * !n_used_mmu_pages so many times.
5614 */
5615 if (!nr_to_scan--)
5616 break;
19526396
GN
5617 /*
5618 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
5619 * here. We may skip a VM instance erroneously, but we do not
5620 * want to shrink a VM that has only started to populate its MMU
5621 * anyway.
5622 */
365c8868
XG
5623 if (!kvm->arch.n_used_mmu_pages &&
5624 !kvm_has_zapped_obsolete_pages(kvm))
19526396 5625 continue;
19526396 5626
f656ce01 5627 idx = srcu_read_lock(&kvm->srcu);
3ee16c81 5628 spin_lock(&kvm->mmu_lock);
3ee16c81 5629
365c8868
XG
5630 if (kvm_has_zapped_obsolete_pages(kvm)) {
5631 kvm_mmu_commit_zap_page(kvm,
5632 &kvm->arch.zapped_obsolete_pages);
5633 goto unlock;
5634 }
5635
70534a73
DC
5636 if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
5637 freed++;
d98ba053 5638 kvm_mmu_commit_zap_page(kvm, &invalid_list);
19526396 5639
365c8868 5640unlock:
3ee16c81 5641 spin_unlock(&kvm->mmu_lock);
f656ce01 5642 srcu_read_unlock(&kvm->srcu, idx);
19526396 5643
70534a73
DC
5644 /*
5645 * unfair on small ones
5646 * per-vm shrinkers cry out
5647 * sadness comes quickly
5648 */
19526396
GN
5649 list_move_tail(&kvm->vm_list, &vm_list);
5650 break;
3ee16c81 5651 }
3ee16c81 5652
2f303b74 5653 spin_unlock(&kvm_lock);
70534a73 5654 return freed;
70534a73
DC
5655}
5656
5657static unsigned long
5658mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
5659{
45221ab6 5660 return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
3ee16c81
IE
5661}
5662
5663static struct shrinker mmu_shrinker = {
70534a73
DC
5664 .count_objects = mmu_shrink_count,
5665 .scan_objects = mmu_shrink_scan,
3ee16c81
IE
5666 .seeks = DEFAULT_SEEKS * 10,
5667};
5668
2ddfd20e 5669static void mmu_destroy_caches(void)
b5a33a75 5670{
c1bd743e
TH
5671 kmem_cache_destroy(pte_list_desc_cache);
5672 kmem_cache_destroy(mmu_page_header_cache);
b5a33a75
AK
5673}
5674
5675int kvm_mmu_module_init(void)
5676{
ab271bd4
AB
5677 int ret = -ENOMEM;
5678
f160c7b7
JS
5679 kvm_mmu_clear_all_pte_masks();
5680
53c07b18
XG
5681 pte_list_desc_cache = kmem_cache_create("pte_list_desc",
5682 sizeof(struct pte_list_desc),
46bea48a 5683 0, SLAB_ACCOUNT, NULL);
53c07b18 5684 if (!pte_list_desc_cache)
ab271bd4 5685 goto out;
b5a33a75 5686
d3d25b04
AK
5687 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
5688 sizeof(struct kvm_mmu_page),
46bea48a 5689 0, SLAB_ACCOUNT, NULL);
d3d25b04 5690 if (!mmu_page_header_cache)
ab271bd4 5691 goto out;
d3d25b04 5692
908c7f19 5693 if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
ab271bd4 5694 goto out;
45bf21a8 5695
ab271bd4
AB
5696 ret = register_shrinker(&mmu_shrinker);
5697 if (ret)
5698 goto out;
3ee16c81 5699
b5a33a75
AK
5700 return 0;
5701
ab271bd4 5702out:
3ee16c81 5703 mmu_destroy_caches();
ab271bd4 5704 return ret;
b5a33a75
AK
5705}
5706
3ad82a7e
ZX
5707/*
5708 * Calculate the number of mmu pages needed for kvm.
5709 */
5710unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
5711{
3ad82a7e
ZX
5712 unsigned int nr_mmu_pages;
5713 unsigned int nr_pages = 0;
bc6678a3 5714 struct kvm_memslots *slots;
be6ba0f0 5715 struct kvm_memory_slot *memslot;
9da0e4d5 5716 int i;
3ad82a7e 5717
9da0e4d5
PB
5718 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
5719 slots = __kvm_memslots(kvm, i);
90d83dc3 5720
9da0e4d5
PB
5721 kvm_for_each_memslot(memslot, slots)
5722 nr_pages += memslot->npages;
5723 }
3ad82a7e
ZX
5724
5725 nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
5726 nr_mmu_pages = max(nr_mmu_pages,
9da0e4d5 5727 (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
3ad82a7e
ZX
5728
5729 return nr_mmu_pages;
5730}
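/*
 * Worked example with illustrative numbers, assuming the usual
 * KVM_PERMILLE_MMU_PAGES == 20 and KVM_MIN_ALLOC_MMU_PAGES == 64: a
 * guest with 1 GiB of memory has 262144 4KiB pages in its memslots, so
 * nr_mmu_pages = 262144 * 20 / 1000 = 5242, well above the 64-page
 * floor.  A tiny guest with 1024 pages would get 1024 * 20 / 1000 = 20,
 * which is then raised to the 64-page minimum.
 */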
5731
c42fffe3
XG
5732void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
5733{
95f93af4 5734 kvm_mmu_unload(vcpu);
c42fffe3
XG
5735 free_mmu_pages(vcpu);
5736 mmu_free_memory_caches(vcpu);
b034cf01
XG
5737}
5738
b034cf01
XG
5739void kvm_mmu_module_exit(void)
5740{
5741 mmu_destroy_caches();
5742 percpu_counter_destroy(&kvm_total_used_mmu_pages);
5743 unregister_shrinker(&mmu_shrinker);
c42fffe3
XG
5744 mmu_audit_disable();
5745}