/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */
20 | ||
21 | #include "irq.h" | |
22 | #include "mmu.h" | |
23 | #include "x86.h" | |
24 | #include "kvm_cache_regs.h" | |
25 | ||
26 | #include <linux/kvm_host.h> | |
27 | #include <linux/types.h> | |
28 | #include <linux/string.h> | |
29 | #include <linux/mm.h> | |
30 | #include <linux/highmem.h> | |
31 | #include <linux/module.h> | |
32 | #include <linux/swap.h> | |
33 | #include <linux/hugetlb.h> | |
34 | #include <linux/compiler.h> | |
35 | #include <linux/srcu.h> | |
36 | #include <linux/slab.h> | |
37 | #include <linux/uaccess.h> | |
38 | ||
39 | #include <asm/page.h> | |
40 | #include <asm/cmpxchg.h> | |
41 | #include <asm/io.h> | |
42 | #include <asm/vmx.h> | |
43 | ||
/*
 * When set to true, this variable enables Two-Dimensional Paging, where
 * the hardware walks two page tables:
 * 1. the guest-virtual to guest-physical mapping
 * 2. while doing 1., the guest-physical to host-physical mapping
 * If the hardware supports that, we don't need to do shadow paging.
 */
bool tdp_enabled = false;

enum {
        AUDIT_PRE_PAGE_FAULT,
        AUDIT_POST_PAGE_FAULT,
        AUDIT_PRE_PTE_WRITE,
        AUDIT_POST_PTE_WRITE,
        AUDIT_PRE_SYNC,
        AUDIT_POST_SYNC
};

char *audit_point_name[] = {
        "pre page fault",
        "post page fault",
        "pre pte write",
        "post pte write",
        "pre sync",
        "post sync"
};

#ifdef CONFIG_KVM_MMU_AUDIT
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point);
#else
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
#endif

#undef MMU_DEBUG

#ifdef MMU_DEBUG

#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)

#else

#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)

#endif

#ifdef MMU_DEBUG
static bool dbg = 0;
module_param(dbg, bool, 0644);
#endif
95 | ||
96 | #ifndef MMU_DEBUG | |
97 | #define ASSERT(x) do { } while (0) | |
98 | #else | |
99 | #define ASSERT(x) \ | |
100 | if (!(x)) { \ | |
101 | printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ | |
102 | __FILE__, __LINE__, #x); \ | |
103 | } | |
104 | #endif | |
105 | ||
#define PTE_PREFETCH_NUM 8

#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52

#define PT64_LEVEL_BITS 9

#define PT64_LEVEL_SHIFT(level) \
                (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)

#define PT64_INDEX(address, level)\
        (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))


#define PT32_LEVEL_BITS 10

#define PT32_LEVEL_SHIFT(level) \
                (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)

#define PT32_LVL_OFFSET_MASK(level) \
        (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
                                                * PT32_LEVEL_BITS))) - 1))

#define PT32_INDEX(address, level)\
        (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))


#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
#define PT64_DIR_BASE_ADDR_MASK \
        (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
#define PT64_LVL_ADDR_MASK(level) \
        (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
                                                * PT64_LEVEL_BITS))) - 1))
#define PT64_LVL_OFFSET_MASK(level) \
        (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
                                                * PT64_LEVEL_BITS))) - 1))

#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
        (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
#define PT32_LVL_ADDR_MASK(level) \
        (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
                                            * PT32_LEVEL_BITS))) - 1))

#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
                        | PT64_NX_MASK)

#define PTE_LIST_EXT 4

#define ACC_EXEC_MASK    1
#define ACC_WRITE_MASK   PT_WRITABLE_MASK
#define ACC_USER_MASK    PT_USER_MASK
#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)

#include <trace/events/kvm.h>

#define CREATE_TRACE_POINTS
#include "mmutrace.h"

#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)

#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)

struct pte_list_desc {
        u64 *sptes[PTE_LIST_EXT];
        struct pte_list_desc *more;
};

struct kvm_shadow_walk_iterator {
        u64 addr;
        hpa_t shadow_addr;
        u64 *sptep;
        int level;
        unsigned index;
};

#define for_each_shadow_entry(_vcpu, _addr, _walker)            \
        for (shadow_walk_init(&(_walker), _vcpu, _addr);        \
             shadow_walk_okay(&(_walker));                      \
             shadow_walk_next(&(_walker)))

#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)     \
        for (shadow_walk_init(&(_walker), _vcpu, _addr);                \
             shadow_walk_okay(&(_walker)) &&                            \
                ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });  \
             __shadow_walk_next(&(_walker), spte))

static struct kmem_cache *pte_list_desc_cache;
static struct kmem_cache *mmu_page_header_cache;
static struct percpu_counter kvm_total_used_mmu_pages;

static u64 __read_mostly shadow_nx_mask;
static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
static u64 __read_mostly shadow_user_mask;
static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;
static u64 __read_mostly shadow_mmio_mask;

static void mmu_spte_set(u64 *sptep, u64 spte);

void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
{
        shadow_mmio_mask = mmio_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);

static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
{
        access &= ACC_WRITE_MASK | ACC_USER_MASK;

        trace_mark_mmio_spte(sptep, gfn, access);
        mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
}

static bool is_mmio_spte(u64 spte)
{
        return (spte & shadow_mmio_mask) == shadow_mmio_mask;
}

static gfn_t get_mmio_spte_gfn(u64 spte)
{
        return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
}

static unsigned get_mmio_spte_access(u64 spte)
{
        return (spte & ~shadow_mmio_mask) & ~PAGE_MASK;
}

static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
{
        if (unlikely(is_noslot_pfn(pfn))) {
                mark_mmio_spte(sptep, gfn, access);
                return true;
        }

        return false;
}

static inline u64 rsvd_bits(int s, int e)
{
        return ((1ULL << (e - s + 1)) - 1) << s;
}

void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
                u64 dirty_mask, u64 nx_mask, u64 x_mask)
{
        shadow_user_mask = user_mask;
        shadow_accessed_mask = accessed_mask;
        shadow_dirty_mask = dirty_mask;
        shadow_nx_mask = nx_mask;
        shadow_x_mask = x_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);

static int is_cpuid_PSE36(void)
{
        return 1;
}

static int is_nx(struct kvm_vcpu *vcpu)
{
        return vcpu->arch.efer & EFER_NX;
}

static int is_shadow_present_pte(u64 pte)
{
        return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
}

static int is_large_pte(u64 pte)
{
        return pte & PT_PAGE_SIZE_MASK;
}

static int is_dirty_gpte(unsigned long pte)
{
        return pte & PT_DIRTY_MASK;
}

static int is_rmap_spte(u64 pte)
{
        return is_shadow_present_pte(pte);
}

static int is_last_spte(u64 pte, int level)
{
        if (level == PT_PAGE_TABLE_LEVEL)
                return 1;
        if (is_large_pte(pte))
                return 1;
        return 0;
}

static pfn_t spte_to_pfn(u64 pte)
{
        return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
}

static gfn_t pse36_gfn_delta(u32 gpte)
{
        int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;

        return (gpte & PT32_DIR_PSE36_MASK) << shift;
}

#ifdef CONFIG_X86_64
static void __set_spte(u64 *sptep, u64 spte)
{
        *sptep = spte;
}

static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
        *sptep = spte;
}

static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
        return xchg(sptep, spte);
}

static u64 __get_spte_lockless(u64 *sptep)
{
        return ACCESS_ONCE(*sptep);
}

static bool __check_direct_spte_mmio_pf(u64 spte)
{
        /* It is valid if the spte is zapped. */
        return spte == 0ull;
}
#else
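/*
 * On 32-bit hosts a 64-bit spte cannot be read or written atomically, so
 * it is split into two 32-bit halves that are updated in a careful order
 * with memory barriers in between; see the comments in __set_spte() and
 * __update_clear_spte_fast() below.
 */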
union split_spte {
        struct {
                u32 spte_low;
                u32 spte_high;
        };
        u64 spte;
};

static void count_spte_clear(u64 *sptep, u64 spte)
{
        struct kvm_mmu_page *sp = page_header(__pa(sptep));

        if (is_shadow_present_pte(spte))
                return;

        /* Ensure the spte is completely set before we increase the count */
        smp_wmb();
        sp->clear_spte_count++;
}

static void __set_spte(u64 *sptep, u64 spte)
{
        union split_spte *ssptep, sspte;

        ssptep = (union split_spte *)sptep;
        sspte = (union split_spte)spte;

        ssptep->spte_high = sspte.spte_high;

        /*
         * If we map the spte from nonpresent to present, we should store
         * the high bits first and only then set the present bit, so the
         * CPU cannot fetch this spte while we are setting it.
         */
        smp_wmb();

        ssptep->spte_low = sspte.spte_low;
}

static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
        union split_spte *ssptep, sspte;

        ssptep = (union split_spte *)sptep;
        sspte = (union split_spte)spte;

        ssptep->spte_low = sspte.spte_low;

        /*
         * If we map the spte from present to nonpresent, we should clear
         * the present bit first to keep the vcpu from fetching the old
         * high bits.
         */
        smp_wmb();

        ssptep->spte_high = sspte.spte_high;
        count_spte_clear(sptep, spte);
}

static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
        union split_spte *ssptep, sspte, orig;

        ssptep = (union split_spte *)sptep;
        sspte = (union split_spte)spte;

        /* xchg acts as a barrier before the setting of the high bits */
        orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
        orig.spte_high = ssptep->spte_high;
        ssptep->spte_high = sspte.spte_high;
        count_spte_clear(sptep, spte);

        return orig.spte;
}

/*
 * The idea of fetching the spte the lightweight way on x86_32 guests comes
 * from gup_get_pte() (arch/x86/mm/gup.c).
 * The difference is that we cannot catch the spte tlb flush when we leave
 * guest mode, so we emulate it by increasing clear_spte_count when the
 * spte is cleared.
 */
static u64 __get_spte_lockless(u64 *sptep)
{
        struct kvm_mmu_page *sp = page_header(__pa(sptep));
        union split_spte spte, *orig = (union split_spte *)sptep;
        int count;

retry:
        count = sp->clear_spte_count;
        smp_rmb();

        spte.spte_low = orig->spte_low;
        smp_rmb();

        spte.spte_high = orig->spte_high;
        smp_rmb();

        if (unlikely(spte.spte_low != orig->spte_low ||
              count != sp->clear_spte_count))
                goto retry;

        return spte.spte;
}

static bool __check_direct_spte_mmio_pf(u64 spte)
{
        union split_spte sspte = (union split_spte)spte;
        u32 high_mmio_mask = shadow_mmio_mask >> 32;

        /* It is valid if the spte is zapped. */
        if (spte == 0ull)
                return true;

        /* It is valid if the spte is being zapped. */
        if (sspte.spte_low == 0ull &&
            (sspte.spte_high & high_mmio_mask) == high_mmio_mask)
                return true;

        return false;
}
#endif

static bool spte_has_volatile_bits(u64 spte)
{
        if (!shadow_accessed_mask)
                return false;

        if (!is_shadow_present_pte(spte))
                return false;

        if ((spte & shadow_accessed_mask) &&
              (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
                return false;

        return true;
}

static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
{
        return (old_spte & bit_mask) && !(new_spte & bit_mask);
}

/* Rules for using mmu_spte_set:
 * Set the sptep from nonpresent to present.
 * Note: the sptep being assigned *must* be either not present
 * or in a state where the hardware will not attempt to update
 * the spte.
 */
static void mmu_spte_set(u64 *sptep, u64 new_spte)
{
        WARN_ON(is_shadow_present_pte(*sptep));
        __set_spte(sptep, new_spte);
}

/* Rules for using mmu_spte_update:
 * Update the state bits; the mapped pfn is not changed.
 */
static void mmu_spte_update(u64 *sptep, u64 new_spte)
{
        u64 mask, old_spte = *sptep;

        WARN_ON(!is_rmap_spte(new_spte));

        if (!is_shadow_present_pte(old_spte))
                return mmu_spte_set(sptep, new_spte);

        new_spte |= old_spte & shadow_dirty_mask;

        mask = shadow_accessed_mask;
        if (is_writable_pte(old_spte))
                mask |= shadow_dirty_mask;

        if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
                __update_clear_spte_fast(sptep, new_spte);
        else
                old_spte = __update_clear_spte_slow(sptep, new_spte);

        if (!shadow_accessed_mask)
                return;

        if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
                kvm_set_pfn_accessed(spte_to_pfn(old_spte));
        if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
                kvm_set_pfn_dirty(spte_to_pfn(old_spte));
}

/*
 * Rules for using mmu_spte_clear_track_bits:
 * It sets the sptep from present to nonpresent and tracks the
 * state bits; it is used to clear the last-level sptep.
 */
static int mmu_spte_clear_track_bits(u64 *sptep)
{
        pfn_t pfn;
        u64 old_spte = *sptep;

        if (!spte_has_volatile_bits(old_spte))
                __update_clear_spte_fast(sptep, 0ull);
        else
                old_spte = __update_clear_spte_slow(sptep, 0ull);

        if (!is_rmap_spte(old_spte))
                return 0;

        pfn = spte_to_pfn(old_spte);
        if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
                kvm_set_pfn_accessed(pfn);
        if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
                kvm_set_pfn_dirty(pfn);
        return 1;
}

/*
 * Rules for using mmu_spte_clear_no_track:
 * Directly clear the spte without caring about the state bits of
 * the sptep; it is used to clear upper-level sptes.
 */
static void mmu_spte_clear_no_track(u64 *sptep)
{
        __update_clear_spte_fast(sptep, 0ull);
}

static u64 mmu_spte_get_lockless(u64 *sptep)
{
        return __get_spte_lockless(sptep);
}

static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
{
        rcu_read_lock();
        atomic_inc(&vcpu->kvm->arch.reader_counter);

        /* Increase the counter before walking shadow page table */
        smp_mb__after_atomic_inc();
}

static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
{
        /* Decrease the counter after walking shadow page table finished */
        smp_mb__before_atomic_dec();
        atomic_dec(&vcpu->kvm->arch.reader_counter);
        rcu_read_unlock();
}

static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
                                  struct kmem_cache *base_cache, int min)
{
        void *obj;

        if (cache->nobjs >= min)
                return 0;
        while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
                obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
                if (!obj)
                        return -ENOMEM;
                cache->objects[cache->nobjs++] = obj;
        }
        return 0;
}

static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
{
        return cache->nobjs;
}

static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
                                  struct kmem_cache *cache)
{
        while (mc->nobjs)
                kmem_cache_free(cache, mc->objects[--mc->nobjs]);
}

static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
                                       int min)
{
        void *page;

        if (cache->nobjs >= min)
                return 0;
        while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
                page = (void *)__get_free_page(GFP_KERNEL);
                if (!page)
                        return -ENOMEM;
                cache->objects[cache->nobjs++] = page;
        }
        return 0;
}

static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
{
        while (mc->nobjs)
                free_page((unsigned long)mc->objects[--mc->nobjs]);
}

static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
{
        int r;

        r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
                                   pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
        if (r)
                goto out;
        r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
        if (r)
                goto out;
        r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
                                   mmu_page_header_cache, 4);
out:
        return r;
}

static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
        mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
                              pte_list_desc_cache);
        mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
        mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
                              mmu_page_header_cache);
}

static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
                                    size_t size)
{
        void *p;

        BUG_ON(!mc->nobjs);
        p = mc->objects[--mc->nobjs];
        return p;
}

static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
{
        return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache,
                                      sizeof(struct pte_list_desc));
}

static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
{
        kmem_cache_free(pte_list_desc_cache, pte_list_desc);
}

static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
{
        if (!sp->role.direct)
                return sp->gfns[index];

        return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
}

static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
{
        if (sp->role.direct)
                BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
        else
                sp->gfns[index] = gfn;
}

/*
 * Return the pointer to the large page information for a given gfn,
 * handling slots that are not large page aligned.
 */
static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
                                              struct kvm_memory_slot *slot,
                                              int level)
{
        unsigned long idx;

        idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
              (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
        return &slot->lpage_info[level - 2][idx];
}

static void account_shadowed(struct kvm *kvm, gfn_t gfn)
{
        struct kvm_memory_slot *slot;
        struct kvm_lpage_info *linfo;
        int i;

        slot = gfn_to_memslot(kvm, gfn);
        for (i = PT_DIRECTORY_LEVEL;
             i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
                linfo = lpage_info_slot(gfn, slot, i);
                linfo->write_count += 1;
        }
        kvm->arch.indirect_shadow_pages++;
}

static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
{
        struct kvm_memory_slot *slot;
        struct kvm_lpage_info *linfo;
        int i;

        slot = gfn_to_memslot(kvm, gfn);
        for (i = PT_DIRECTORY_LEVEL;
             i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
                linfo = lpage_info_slot(gfn, slot, i);
                linfo->write_count -= 1;
                WARN_ON(linfo->write_count < 0);
        }
        kvm->arch.indirect_shadow_pages--;
}

static int has_wrprotected_page(struct kvm *kvm,
                                gfn_t gfn,
                                int level)
{
        struct kvm_memory_slot *slot;
        struct kvm_lpage_info *linfo;

        slot = gfn_to_memslot(kvm, gfn);
        if (slot) {
                linfo = lpage_info_slot(gfn, slot, level);
                return linfo->write_count;
        }

        return 1;
}

static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
{
        unsigned long page_size;
        int i, ret = 0;

        page_size = kvm_host_page_size(kvm, gfn);

        for (i = PT_PAGE_TABLE_LEVEL;
             i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
                if (page_size >= KVM_HPAGE_SIZE(i))
                        ret = i;
                else
                        break;
        }

        return ret;
}

static struct kvm_memory_slot *
gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
                            bool no_dirty_log)
{
        struct kvm_memory_slot *slot;

        slot = gfn_to_memslot(vcpu->kvm, gfn);
        if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
              (no_dirty_log && slot->dirty_bitmap))
                slot = NULL;

        return slot;
}

static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
{
        return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true);
}

static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
{
        int host_level, level, max_level;

        host_level = host_mapping_level(vcpu->kvm, large_gfn);

        if (host_level == PT_PAGE_TABLE_LEVEL)
                return host_level;

        max_level = kvm_x86_ops->get_lpage_level() < host_level ?
                kvm_x86_ops->get_lpage_level() : host_level;

        for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
                if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
                        break;

        return level - 1;
}

/*
 * Pte mapping structures:
 *
 * If pte_list bit zero is zero, then pte_list points to the spte.
 *
 * If pte_list bit zero is one, then (pte_list & ~1) points to a struct
 * pte_list_desc containing more mappings.
 *
 * Returns the number of pte entries before the spte was added, or zero if
 * the spte was not added.
 *
 */
static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
                        unsigned long *pte_list)
{
        struct pte_list_desc *desc;
        int i, count = 0;

        if (!*pte_list) {
                rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
                *pte_list = (unsigned long)spte;
        } else if (!(*pte_list & 1)) {
                rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
                desc = mmu_alloc_pte_list_desc(vcpu);
                desc->sptes[0] = (u64 *)*pte_list;
                desc->sptes[1] = spte;
                *pte_list = (unsigned long)desc | 1;
                ++count;
        } else {
                rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
                desc = (struct pte_list_desc *)(*pte_list & ~1ul);
                while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
                        desc = desc->more;
                        count += PTE_LIST_EXT;
                }
                if (desc->sptes[PTE_LIST_EXT-1]) {
                        desc->more = mmu_alloc_pte_list_desc(vcpu);
                        desc = desc->more;
                }
                for (i = 0; desc->sptes[i]; ++i)
                        ++count;
                desc->sptes[i] = spte;
        }
        return count;
}

static u64 *pte_list_next(unsigned long *pte_list, u64 *spte)
{
        struct pte_list_desc *desc;
        u64 *prev_spte;
        int i;

        if (!*pte_list)
                return NULL;
        else if (!(*pte_list & 1)) {
                if (!spte)
                        return (u64 *)*pte_list;
                return NULL;
        }
        desc = (struct pte_list_desc *)(*pte_list & ~1ul);
        prev_spte = NULL;
        while (desc) {
                for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
                        if (prev_spte == spte)
                                return desc->sptes[i];
                        prev_spte = desc->sptes[i];
                }
                desc = desc->more;
        }
        return NULL;
}

static void
pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc,
                           int i, struct pte_list_desc *prev_desc)
{
        int j;

        for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
                ;
        desc->sptes[i] = desc->sptes[j];
        desc->sptes[j] = NULL;
        if (j != 0)
                return;
        if (!prev_desc && !desc->more)
                *pte_list = (unsigned long)desc->sptes[0];
        else
                if (prev_desc)
                        prev_desc->more = desc->more;
                else
                        *pte_list = (unsigned long)desc->more | 1;
        mmu_free_pte_list_desc(desc);
}

static void pte_list_remove(u64 *spte, unsigned long *pte_list)
{
        struct pte_list_desc *desc;
        struct pte_list_desc *prev_desc;
        int i;

        if (!*pte_list) {
                printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
                BUG();
        } else if (!(*pte_list & 1)) {
                rmap_printk("pte_list_remove: %p 1->0\n", spte);
                if ((u64 *)*pte_list != spte) {
                        printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte);
                        BUG();
                }
                *pte_list = 0;
        } else {
                rmap_printk("pte_list_remove: %p many->many\n", spte);
                desc = (struct pte_list_desc *)(*pte_list & ~1ul);
                prev_desc = NULL;
                while (desc) {
                        for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
                                if (desc->sptes[i] == spte) {
                                        pte_list_desc_remove_entry(pte_list,
                                                                   desc, i,
                                                                   prev_desc);
                                        return;
                                }
                        prev_desc = desc;
                        desc = desc->more;
                }
                pr_err("pte_list_remove: %p many->many\n", spte);
                BUG();
        }
}

typedef void (*pte_list_walk_fn) (u64 *spte);
static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
{
        struct pte_list_desc *desc;
        int i;

        if (!*pte_list)
                return;

        if (!(*pte_list & 1))
                return fn((u64 *)*pte_list);

        desc = (struct pte_list_desc *)(*pte_list & ~1ul);
        while (desc) {
                for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
                        fn(desc->sptes[i]);
                desc = desc->more;
        }
}

static unsigned long *__gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level,
                                    struct kvm_memory_slot *slot)
{
        struct kvm_lpage_info *linfo;

        if (likely(level == PT_PAGE_TABLE_LEVEL))
                return &slot->rmap[gfn - slot->base_gfn];

        linfo = lpage_info_slot(gfn, slot, level);
        return &linfo->rmap_pde;
}

/*
 * Take gfn and return the reverse mapping to it.
 */
static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
{
        struct kvm_memory_slot *slot;

        slot = gfn_to_memslot(kvm, gfn);
        return __gfn_to_rmap(kvm, gfn, level, slot);
}

static bool rmap_can_add(struct kvm_vcpu *vcpu)
{
        struct kvm_mmu_memory_cache *cache;

        cache = &vcpu->arch.mmu_pte_list_desc_cache;
        return mmu_memory_cache_free_objects(cache);
}

static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
        struct kvm_mmu_page *sp;
        unsigned long *rmapp;

        sp = page_header(__pa(spte));
        kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
        rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
        return pte_list_add(vcpu, spte, rmapp);
}

static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
{
        return pte_list_next(rmapp, spte);
}

static void rmap_remove(struct kvm *kvm, u64 *spte)
{
        struct kvm_mmu_page *sp;
        gfn_t gfn;
        unsigned long *rmapp;

        sp = page_header(__pa(spte));
        gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
        rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
        pte_list_remove(spte, rmapp);
}

static void drop_spte(struct kvm *kvm, u64 *sptep)
{
        if (mmu_spte_clear_track_bits(sptep))
                rmap_remove(kvm, sptep);
}

int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
                               struct kvm_memory_slot *slot)
{
        unsigned long *rmapp;
        u64 *spte;
        int i, write_protected = 0;

        rmapp = __gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL, slot);
        spte = rmap_next(kvm, rmapp, NULL);
        while (spte) {
                BUG_ON(!(*spte & PT_PRESENT_MASK));
                rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
                if (is_writable_pte(*spte)) {
                        mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
                        write_protected = 1;
                }
                spte = rmap_next(kvm, rmapp, spte);
        }

        /* check for huge page mappings */
        for (i = PT_DIRECTORY_LEVEL;
             i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
                rmapp = __gfn_to_rmap(kvm, gfn, i, slot);
                spte = rmap_next(kvm, rmapp, NULL);
                while (spte) {
                        BUG_ON(!(*spte & PT_PRESENT_MASK));
                        BUG_ON(!is_large_pte(*spte));
                        pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
                        if (is_writable_pte(*spte)) {
                                drop_spte(kvm, spte);
                                --kvm->stat.lpages;
                                spte = NULL;
                                write_protected = 1;
                        }
                        spte = rmap_next(kvm, rmapp, spte);
                }
        }

        return write_protected;
}

static int rmap_write_protect(struct kvm *kvm, u64 gfn)
{
        struct kvm_memory_slot *slot;

        slot = gfn_to_memslot(kvm, gfn);
        return kvm_mmu_rmap_write_protect(kvm, gfn, slot);
}

static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
                           unsigned long data)
{
        u64 *spte;
        int need_tlb_flush = 0;

        while ((spte = rmap_next(kvm, rmapp, NULL))) {
                BUG_ON(!(*spte & PT_PRESENT_MASK));
                rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
                drop_spte(kvm, spte);
                need_tlb_flush = 1;
        }
        return need_tlb_flush;
}

static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
                             unsigned long data)
{
        int need_flush = 0;
        u64 *spte, new_spte;
        pte_t *ptep = (pte_t *)data;
        pfn_t new_pfn;

        WARN_ON(pte_huge(*ptep));
        new_pfn = pte_pfn(*ptep);
        spte = rmap_next(kvm, rmapp, NULL);
        while (spte) {
                BUG_ON(!is_shadow_present_pte(*spte));
                rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
                need_flush = 1;
                if (pte_write(*ptep)) {
                        drop_spte(kvm, spte);
                        spte = rmap_next(kvm, rmapp, NULL);
                } else {
                        new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
                        new_spte |= (u64)new_pfn << PAGE_SHIFT;

                        new_spte &= ~PT_WRITABLE_MASK;
                        new_spte &= ~SPTE_HOST_WRITEABLE;
                        new_spte &= ~shadow_accessed_mask;
                        mmu_spte_clear_track_bits(spte);
                        mmu_spte_set(spte, new_spte);
                        spte = rmap_next(kvm, rmapp, spte);
                }
        }
        if (need_flush)
                kvm_flush_remote_tlbs(kvm);

        return 0;
}

static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
                          unsigned long data,
                          int (*handler)(struct kvm *kvm, unsigned long *rmapp,
                                         unsigned long data))
{
        int j;
        int ret;
        int retval = 0;
        struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;

        slots = kvm_memslots(kvm);

        kvm_for_each_memslot(memslot, slots) {
                unsigned long start = memslot->userspace_addr;
                unsigned long end;

                end = start + (memslot->npages << PAGE_SHIFT);
                if (hva >= start && hva < end) {
                        gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
                        gfn_t gfn = memslot->base_gfn + gfn_offset;

                        ret = handler(kvm, &memslot->rmap[gfn_offset], data);

                        for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
                                struct kvm_lpage_info *linfo;

                                linfo = lpage_info_slot(gfn, memslot,
                                                        PT_DIRECTORY_LEVEL + j);
                                ret |= handler(kvm, &linfo->rmap_pde, data);
                        }
                        trace_kvm_age_page(hva, memslot, ret);
                        retval |= ret;
                }
        }

        return retval;
}

int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
{
        return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
}

void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
        kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
}

static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
                         unsigned long data)
{
        u64 *spte;
        int young = 0;

        /*
         * Emulate the accessed bit for EPT, by checking if this page has
         * an EPT mapping, and clearing it if it does. On the next access,
         * a new EPT mapping will be established.
         * This has some overhead, but not as much as the cost of swapping
         * out actively used pages or breaking up actively used hugepages.
         */
        if (!shadow_accessed_mask)
                return kvm_unmap_rmapp(kvm, rmapp, data);

        spte = rmap_next(kvm, rmapp, NULL);
        while (spte) {
                int _young;
                u64 _spte = *spte;
                BUG_ON(!(_spte & PT_PRESENT_MASK));
                _young = _spte & PT_ACCESSED_MASK;
                if (_young) {
                        young = 1;
                        clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
                }
                spte = rmap_next(kvm, rmapp, spte);
        }
        return young;
}

static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
                              unsigned long data)
{
        u64 *spte;
        int young = 0;

        /*
         * If there's no access bit in the secondary pte set by the
         * hardware it's up to gup-fast/gup to set the access bit in
         * the primary pte or in the page structure.
         */
        if (!shadow_accessed_mask)
                goto out;

        spte = rmap_next(kvm, rmapp, NULL);
        while (spte) {
                u64 _spte = *spte;
                BUG_ON(!(_spte & PT_PRESENT_MASK));
                young = _spte & PT_ACCESSED_MASK;
                if (young) {
                        young = 1;
                        break;
                }
                spte = rmap_next(kvm, rmapp, spte);
        }
out:
        return young;
}

#define RMAP_RECYCLE_THRESHOLD 1000

static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
        unsigned long *rmapp;
        struct kvm_mmu_page *sp;

        sp = page_header(__pa(spte));

        rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);

        kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
        kvm_flush_remote_tlbs(vcpu->kvm);
}

int kvm_age_hva(struct kvm *kvm, unsigned long hva)
{
        return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
}

int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
{
        return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
}

#ifdef MMU_DEBUG
static int is_empty_shadow_page(u64 *spt)
{
        u64 *pos;
        u64 *end;

        for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
                if (is_shadow_present_pte(*pos)) {
                        printk(KERN_ERR "%s: %p %llx\n", __func__,
                               pos, *pos);
                        return 0;
                }
        return 1;
}
#endif

/*
 * This value is the sum of all of the kvm instances'
 * kvm->arch.n_used_mmu_pages values.  We need a global,
 * aggregate version in order to make the slab shrinker
 * faster.
 */
static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
{
        kvm->arch.n_used_mmu_pages += nr;
        percpu_counter_add(&kvm_total_used_mmu_pages, nr);
}

/*
 * Remove the sp from the shadow page cache; after this call the sp can
 * no longer be found in the cache, but the shadow page table it holds
 * is still valid.
 * Must be called under the protection of the mmu lock.
 */
static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp)
{
        ASSERT(is_empty_shadow_page(sp->spt));
        hlist_del(&sp->hash_link);
        if (!sp->role.direct)
                free_page((unsigned long)sp->gfns);
}

/*
 * Free the shadow page table and the sp; this can be done outside the
 * protection of the mmu lock.
 */
static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
{
        list_del(&sp->link);
        free_page((unsigned long)sp->spt);
        kmem_cache_free(mmu_page_header_cache, sp);
}

static unsigned kvm_page_table_hashfn(gfn_t gfn)
{
        return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
}

static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
                                    struct kvm_mmu_page *sp, u64 *parent_pte)
{
        if (!parent_pte)
                return;

        pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
}

static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
                                       u64 *parent_pte)
{
        pte_list_remove(parent_pte, &sp->parent_ptes);
}

static void drop_parent_pte(struct kvm_mmu_page *sp,
                            u64 *parent_pte)
{
        mmu_page_remove_parent_pte(sp, parent_pte);
        mmu_spte_clear_no_track(parent_pte);
}

static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
                                               u64 *parent_pte, int direct)
{
        struct kvm_mmu_page *sp;
        sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache,
                                    sizeof *sp);
        sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
        if (!direct)
                sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
                                                  PAGE_SIZE);
        set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
        list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
        bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM);
        sp->parent_ptes = 0;
        mmu_page_add_parent_pte(vcpu, sp, parent_pte);
        kvm_mod_used_mmu_pages(vcpu->kvm, +1);
        return sp;
}

static void mark_unsync(u64 *spte);
static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
{
        pte_list_walk(&sp->parent_ptes, mark_unsync);
}

static void mark_unsync(u64 *spte)
{
        struct kvm_mmu_page *sp;
        unsigned int index;

        sp = page_header(__pa(spte));
        index = spte - sp->spt;
        if (__test_and_set_bit(index, sp->unsync_child_bitmap))
                return;
        if (sp->unsync_children++)
                return;
        kvm_mmu_mark_parents_unsync(sp);
}

static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
                               struct kvm_mmu_page *sp)
{
        return 1;
}

static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
}

static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
                                 struct kvm_mmu_page *sp, u64 *spte,
                                 const void *pte)
{
        WARN_ON(1);
}

#define KVM_PAGE_ARRAY_NR 16

struct kvm_mmu_pages {
        struct mmu_page_and_offset {
                struct kvm_mmu_page *sp;
                unsigned int idx;
        } page[KVM_PAGE_ARRAY_NR];
        unsigned int nr;
};

#define for_each_unsync_children(bitmap, idx)           \
        for (idx = find_first_bit(bitmap, 512);         \
             idx < 512;                                 \
             idx = find_next_bit(bitmap, 512, idx+1))

static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
                         int idx)
{
        int i;

        if (sp->unsync)
                for (i=0; i < pvec->nr; i++)
                        if (pvec->page[i].sp == sp)
                                return 0;

        pvec->page[pvec->nr].sp = sp;
        pvec->page[pvec->nr].idx = idx;
        pvec->nr++;
        return (pvec->nr == KVM_PAGE_ARRAY_NR);
}

static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
                             struct kvm_mmu_pages *pvec)
{
        int i, ret, nr_unsync_leaf = 0;

        for_each_unsync_children(sp->unsync_child_bitmap, i) {
                struct kvm_mmu_page *child;
                u64 ent = sp->spt[i];

                if (!is_shadow_present_pte(ent) || is_large_pte(ent))
                        goto clear_child_bitmap;

                child = page_header(ent & PT64_BASE_ADDR_MASK);

                if (child->unsync_children) {
                        if (mmu_pages_add(pvec, child, i))
                                return -ENOSPC;

                        ret = __mmu_unsync_walk(child, pvec);
                        if (!ret)
                                goto clear_child_bitmap;
                        else if (ret > 0)
                                nr_unsync_leaf += ret;
                        else
                                return ret;
                } else if (child->unsync) {
                        nr_unsync_leaf++;
                        if (mmu_pages_add(pvec, child, i))
                                return -ENOSPC;
                } else
                        goto clear_child_bitmap;

                continue;

clear_child_bitmap:
                __clear_bit(i, sp->unsync_child_bitmap);
                sp->unsync_children--;
                WARN_ON((int)sp->unsync_children < 0);
        }


        return nr_unsync_leaf;
}

static int mmu_unsync_walk(struct kvm_mmu_page *sp,
                           struct kvm_mmu_pages *pvec)
{
        if (!sp->unsync_children)
                return 0;

        mmu_pages_add(pvec, sp, 0);
        return __mmu_unsync_walk(sp, pvec);
}

static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
        WARN_ON(!sp->unsync);
        trace_kvm_mmu_sync_page(sp);
        sp->unsync = 0;
        --kvm->stat.mmu_unsync;
}

static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
                                    struct list_head *invalid_list);
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
                                    struct list_head *invalid_list);

#define for_each_gfn_sp(kvm, sp, gfn, pos)                              \
  hlist_for_each_entry(sp, pos,                                         \
   &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)   \
        if ((sp)->gfn != (gfn)) {} else

#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos)               \
  hlist_for_each_entry(sp, pos,                                         \
   &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)   \
                if ((sp)->gfn != (gfn) || (sp)->role.direct ||          \
                        (sp)->role.invalid) {} else

/* @sp->gfn should be write-protected at the call site */
static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                           struct list_head *invalid_list, bool clear_unsync)
{
        if (sp->role.cr4_pae != !!is_pae(vcpu)) {
                kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
                return 1;
        }

        if (clear_unsync)
                kvm_unlink_unsync_page(vcpu->kvm, sp);

        if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
                kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
                return 1;
        }

        kvm_mmu_flush_tlb(vcpu);
        return 0;
}

static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
                                   struct kvm_mmu_page *sp)
{
        LIST_HEAD(invalid_list);
        int ret;

        ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
        if (ret)
                kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);

        return ret;
}

static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                         struct list_head *invalid_list)
{
        return __kvm_sync_page(vcpu, sp, invalid_list, true);
}

/* @gfn should be write-protected at the call site */
static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
{
        struct kvm_mmu_page *s;
        struct hlist_node *node;
        LIST_HEAD(invalid_list);
        bool flush = false;

        for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
                if (!s->unsync)
                        continue;

                WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
                kvm_unlink_unsync_page(vcpu->kvm, s);
                if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
                        (vcpu->arch.mmu.sync_page(vcpu, s))) {
                        kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
                        continue;
                }
                flush = true;
        }

        kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
        if (flush)
                kvm_mmu_flush_tlb(vcpu);
}

struct mmu_page_path {
        struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
        unsigned int idx[PT64_ROOT_LEVEL-1];
};

#define for_each_sp(pvec, sp, parents, i)                               \
                for (i = mmu_pages_next(&pvec, &parents, -1),           \
                        sp = pvec.page[i].sp;                           \
                        i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});   \
                        i = mmu_pages_next(&pvec, &parents, i))

static int mmu_pages_next(struct kvm_mmu_pages *pvec,
                          struct mmu_page_path *parents,
                          int i)
{
        int n;

        for (n = i+1; n < pvec->nr; n++) {
                struct kvm_mmu_page *sp = pvec->page[n].sp;

                if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
                        parents->idx[0] = pvec->page[n].idx;
                        return n;
                }

                parents->parent[sp->role.level-2] = sp;
                parents->idx[sp->role.level-1] = pvec->page[n].idx;
        }

        return n;
}

static void mmu_pages_clear_parents(struct mmu_page_path *parents)
{
        struct kvm_mmu_page *sp;
        unsigned int level = 0;

        do {
                unsigned int idx = parents->idx[level];

                sp = parents->parent[level];
                if (!sp)
                        return;

                --sp->unsync_children;
                WARN_ON((int)sp->unsync_children < 0);
                __clear_bit(idx, sp->unsync_child_bitmap);
                level++;
        } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
}

static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
                               struct mmu_page_path *parents,
                               struct kvm_mmu_pages *pvec)
{
        parents->parent[parent->role.level-1] = NULL;
        pvec->nr = 0;
}

static void mmu_sync_children(struct kvm_vcpu *vcpu,
                              struct kvm_mmu_page *parent)
{
        int i;
        struct kvm_mmu_page *sp;
        struct mmu_page_path parents;
        struct kvm_mmu_pages pages;
        LIST_HEAD(invalid_list);

        kvm_mmu_pages_init(parent, &parents, &pages);
        while (mmu_unsync_walk(parent, &pages)) {
                int protected = 0;

                for_each_sp(pages, sp, parents, i)
                        protected |= rmap_write_protect(vcpu->kvm, sp->gfn);

                if (protected)
                        kvm_flush_remote_tlbs(vcpu->kvm);

                for_each_sp(pages, sp, parents, i) {
                        kvm_sync_page(vcpu, sp, &invalid_list);
                        mmu_pages_clear_parents(&parents);
                }
                kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
                cond_resched_lock(&vcpu->kvm->mmu_lock);
                kvm_mmu_pages_init(parent, &parents, &pages);
        }
}

static void init_shadow_page_table(struct kvm_mmu_page *sp)
{
        int i;

        for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
                sp->spt[i] = 0ull;
}

static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
{
        sp->write_flooding_count = 0;
}

static void clear_sp_write_flooding_count(u64 *spte)
{
        struct kvm_mmu_page *sp = page_header(__pa(spte));

        __clear_sp_write_flooding_count(sp);
}

1683 | static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |
1684 | gfn_t gfn, | |
1685 | gva_t gaddr, | |
1686 | unsigned level, | |
1687 | int direct, | |
1688 | unsigned access, | |
1689 | u64 *parent_pte) | |
1690 | { | |
1691 | union kvm_mmu_page_role role; | |
1692 | unsigned quadrant; | |
1693 | struct kvm_mmu_page *sp; | |
1694 | struct hlist_node *node; | |
1695 | bool need_sync = false; | |
1696 | ||
1697 | role = vcpu->arch.mmu.base_role; | |
1698 | role.level = level; | |
1699 | role.direct = direct; | |
1700 | if (role.direct) | |
1701 | role.cr4_pae = 0; | |
1702 | role.access = access; | |
1703 | if (!vcpu->arch.mmu.direct_map | |
1704 | && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { | |
1705 | quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); | |
1706 | quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; | |
1707 | role.quadrant = quadrant; | |
1708 | } | |
1709 | for_each_gfn_sp(vcpu->kvm, sp, gfn, node) { | |
1710 | if (!need_sync && sp->unsync) | |
1711 | need_sync = true; | |
1712 | ||
1713 | if (sp->role.word != role.word) | |
1714 | continue; | |
1715 | ||
1716 | if (sp->unsync && kvm_sync_page_transient(vcpu, sp)) | |
1717 | break; | |
1718 | ||
1719 | mmu_page_add_parent_pte(vcpu, sp, parent_pte); | |
1720 | if (sp->unsync_children) { | |
1721 | kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); | |
1722 | kvm_mmu_mark_parents_unsync(sp); | |
1723 | } else if (sp->unsync) | |
1724 | kvm_mmu_mark_parents_unsync(sp); | |
1725 | ||
1726 | __clear_sp_write_flooding_count(sp); | |
1727 | trace_kvm_mmu_get_page(sp, false); | |
1728 | return sp; | |
1729 | } | |
1730 | ++vcpu->kvm->stat.mmu_cache_miss; | |
1731 | sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct); | |
1732 | if (!sp) | |
1733 | return sp; | |
1734 | sp->gfn = gfn; | |
1735 | sp->role = role; | |
1736 | hlist_add_head(&sp->hash_link, | |
1737 | &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); | |
1738 | if (!direct) { | |
1739 | if (rmap_write_protect(vcpu->kvm, gfn)) | |
1740 | kvm_flush_remote_tlbs(vcpu->kvm); | |
1741 | if (level > PT_PAGE_TABLE_LEVEL && need_sync) | |
1742 | kvm_sync_pages(vcpu, gfn); | |
1743 | ||
1744 | account_shadowed(vcpu->kvm, gfn); | |
1745 | } | |
1746 | init_shadow_page_table(sp); | |
1747 | trace_kvm_mmu_get_page(sp, true); | |
1748 | return sp; | |
1749 | } | |
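/*
 * Editorial sketch (not part of the original source): the quadrant math in
 * kvm_mmu_get_page() above, restated standalone so it can be checked in
 * isolation.  A 32-bit guest page table holds 1024 entries, but a shadow
 * page holds only 512, so a level-1 guest table is split across 2 shadow
 * pages and the level-2 root across 4; role.quadrant records which piece
 * this shadow page covers.  Constants mirror the kernel's PAGE_SHIFT (12),
 * PT64_PT_BITS (9) and PT32_PT_BITS (10).
 */
static unsigned example_quadrant(unsigned long long gaddr, unsigned level)
{
	unsigned quadrant;

	quadrant = gaddr >> (12 + 9 * level);
	quadrant &= (1u << ((10 - 9) * level)) - 1;
	return quadrant;	/* gaddr = 0x00200000, level = 1  ->  1 */
}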
1750 | ||
1751 | static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, | |
1752 | struct kvm_vcpu *vcpu, u64 addr) | |
1753 | { | |
1754 | iterator->addr = addr; | |
1755 | iterator->shadow_addr = vcpu->arch.mmu.root_hpa; | |
1756 | iterator->level = vcpu->arch.mmu.shadow_root_level; | |
1757 | ||
1758 | if (iterator->level == PT64_ROOT_LEVEL && | |
1759 | vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL && | |
1760 | !vcpu->arch.mmu.direct_map) | |
1761 | --iterator->level; | |
1762 | ||
1763 | if (iterator->level == PT32E_ROOT_LEVEL) { | |
1764 | iterator->shadow_addr | |
1765 | = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; | |
1766 | iterator->shadow_addr &= PT64_BASE_ADDR_MASK; | |
1767 | --iterator->level; | |
1768 | if (!iterator->shadow_addr) | |
1769 | iterator->level = 0; | |
1770 | } | |
1771 | } | |
1772 | ||
1773 | static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) | |
1774 | { | |
1775 | if (iterator->level < PT_PAGE_TABLE_LEVEL) | |
1776 | return false; | |
1777 | ||
1778 | iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); | |
1779 | iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; | |
1780 | return true; | |
1781 | } | |
1782 | ||
1783 | static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator, | |
1784 | u64 spte) | |
1785 | { | |
1786 | if (is_last_spte(spte, iterator->level)) { | |
1787 | iterator->level = 0; | |
1788 | return; | |
1789 | } | |
1790 | ||
1791 | iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK; | |
1792 | --iterator->level; | |
1793 | } | |
1794 | ||
1795 | static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) | |
1796 | { | |
1797 | return __shadow_walk_next(iterator, *iterator->sptep); | |
1798 | } | |
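/*
 * Editorial note: these three helpers back the for_each_shadow_entry()
 * loops used later in this file (see __direct_map() and
 * walk_shadow_page_get_mmio_spte()).  A sketch of the expansion, assuming
 * the usual macro shape:
 *
 *	struct kvm_shadow_walk_iterator it;
 *
 *	for (shadow_walk_init(&it, vcpu, addr);
 *	     shadow_walk_okay(&it);
 *	     shadow_walk_next(&it))
 *		...one spte per level, root towards leaf...
 */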
1799 | ||
1800 | static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) | |
1801 | { | |
1802 | u64 spte; | |
1803 | ||
1804 | spte = __pa(sp->spt) | |
1805 | | PT_PRESENT_MASK | PT_ACCESSED_MASK | |
1806 | | PT_WRITABLE_MASK | PT_USER_MASK; | |
1807 | mmu_spte_set(sptep, spte); | |
1808 | } | |
1809 | ||
1810 | static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) | |
1811 | { | |
1812 | if (is_large_pte(*sptep)) { | |
1813 | drop_spte(vcpu->kvm, sptep); | |
1814 | kvm_flush_remote_tlbs(vcpu->kvm); | |
1815 | } | |
1816 | } | |
1817 | ||
1818 | static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |
1819 | unsigned direct_access) | |
1820 | { | |
1821 | if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { | |
1822 | struct kvm_mmu_page *child; | |
1823 | ||
1824 | /* | |
1825 | * For a direct sp, if the guest pte's dirty bit | |
1826 | * changed from clean to dirty, it would corrupt the | |
1827 | * sp's access: writes would be allowed through a | |
1828 | * read-only sp, so we update the spte at this point | |
1829 | * to get a new sp with the correct access. | |
1830 | */ | |
1831 | child = page_header(*sptep & PT64_BASE_ADDR_MASK); | |
1832 | if (child->role.access == direct_access) | |
1833 | return; | |
1834 | ||
1835 | drop_parent_pte(child, sptep); | |
1836 | kvm_flush_remote_tlbs(vcpu->kvm); | |
1837 | } | |
1838 | } | |
1839 | ||
1840 | static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, | |
1841 | u64 *spte) | |
1842 | { | |
1843 | u64 pte; | |
1844 | struct kvm_mmu_page *child; | |
1845 | ||
1846 | pte = *spte; | |
1847 | if (is_shadow_present_pte(pte)) { | |
1848 | if (is_last_spte(pte, sp->role.level)) { | |
1849 | drop_spte(kvm, spte); | |
1850 | if (is_large_pte(pte)) | |
1851 | --kvm->stat.lpages; | |
1852 | } else { | |
1853 | child = page_header(pte & PT64_BASE_ADDR_MASK); | |
1854 | drop_parent_pte(child, spte); | |
1855 | } | |
1856 | return true; | |
1857 | } | |
1858 | ||
1859 | if (is_mmio_spte(pte)) | |
1860 | mmu_spte_clear_no_track(spte); | |
1861 | ||
1862 | return false; | |
1863 | } | |
1864 | ||
1865 | static void kvm_mmu_page_unlink_children(struct kvm *kvm, | |
1866 | struct kvm_mmu_page *sp) | |
1867 | { | |
1868 | unsigned i; | |
1869 | ||
1870 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) | |
1871 | mmu_page_zap_pte(kvm, sp, sp->spt + i); | |
1872 | } | |
1873 | ||
1874 | static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) | |
1875 | { | |
1876 | mmu_page_remove_parent_pte(sp, parent_pte); | |
1877 | } | |
1878 | ||
1879 | static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) | |
1880 | { | |
1881 | u64 *parent_pte; | |
1882 | ||
1883 | while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL))) | |
1884 | drop_parent_pte(sp, parent_pte); | |
1885 | } | |
1886 | ||
1887 | static int mmu_zap_unsync_children(struct kvm *kvm, | |
1888 | struct kvm_mmu_page *parent, | |
1889 | struct list_head *invalid_list) | |
1890 | { | |
1891 | int i, zapped = 0; | |
1892 | struct mmu_page_path parents; | |
1893 | struct kvm_mmu_pages pages; | |
1894 | ||
1895 | if (parent->role.level == PT_PAGE_TABLE_LEVEL) | |
1896 | return 0; | |
1897 | ||
1898 | kvm_mmu_pages_init(parent, &parents, &pages); | |
1899 | while (mmu_unsync_walk(parent, &pages)) { | |
1900 | struct kvm_mmu_page *sp; | |
1901 | ||
1902 | for_each_sp(pages, sp, parents, i) { | |
1903 | kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); | |
1904 | mmu_pages_clear_parents(&parents); | |
1905 | zapped++; | |
1906 | } | |
1907 | kvm_mmu_pages_init(parent, &parents, &pages); | |
1908 | } | |
1909 | ||
1910 | return zapped; | |
1911 | } | |
1912 | ||
1913 | static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, | |
1914 | struct list_head *invalid_list) | |
1915 | { | |
1916 | int ret; | |
1917 | ||
1918 | trace_kvm_mmu_prepare_zap_page(sp); | |
1919 | ++kvm->stat.mmu_shadow_zapped; | |
1920 | ret = mmu_zap_unsync_children(kvm, sp, invalid_list); | |
1921 | kvm_mmu_page_unlink_children(kvm, sp); | |
1922 | kvm_mmu_unlink_parents(kvm, sp); | |
1923 | if (!sp->role.invalid && !sp->role.direct) | |
1924 | unaccount_shadowed(kvm, sp->gfn); | |
1925 | if (sp->unsync) | |
1926 | kvm_unlink_unsync_page(kvm, sp); | |
1927 | if (!sp->root_count) { | |
1928 | /* Count self */ | |
1929 | ret++; | |
1930 | list_move(&sp->link, invalid_list); | |
1931 | kvm_mod_used_mmu_pages(kvm, -1); | |
1932 | } else { | |
1933 | list_move(&sp->link, &kvm->arch.active_mmu_pages); | |
1934 | kvm_reload_remote_mmus(kvm); | |
1935 | } | |
1936 | ||
1937 | sp->role.invalid = 1; | |
1938 | return ret; | |
1939 | } | |
1940 | ||
1941 | static void kvm_mmu_isolate_pages(struct list_head *invalid_list) | |
1942 | { | |
1943 | struct kvm_mmu_page *sp; | |
1944 | ||
1945 | list_for_each_entry(sp, invalid_list, link) | |
1946 | kvm_mmu_isolate_page(sp); | |
1947 | } | |
1948 | ||
1949 | static void free_pages_rcu(struct rcu_head *head) | |
1950 | { | |
1951 | struct kvm_mmu_page *next, *sp; | |
1952 | ||
1953 | sp = container_of(head, struct kvm_mmu_page, rcu); | |
1954 | while (sp) { | |
1955 | if (!list_empty(&sp->link)) | |
1956 | next = list_first_entry(&sp->link, | |
1957 | struct kvm_mmu_page, link); | |
1958 | else | |
1959 | next = NULL; | |
1960 | kvm_mmu_free_page(sp); | |
1961 | sp = next; | |
1962 | } | |
1963 | } | |
1964 | ||
1965 | static void kvm_mmu_commit_zap_page(struct kvm *kvm, | |
1966 | struct list_head *invalid_list) | |
1967 | { | |
1968 | struct kvm_mmu_page *sp; | |
1969 | ||
1970 | if (list_empty(invalid_list)) | |
1971 | return; | |
1972 | ||
1973 | kvm_flush_remote_tlbs(kvm); | |
1974 | ||
1975 | if (atomic_read(&kvm->arch.reader_counter)) { | |
1976 | kvm_mmu_isolate_pages(invalid_list); | |
1977 | sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); | |
1978 | list_del_init(invalid_list); | |
1979 | ||
1980 | trace_kvm_mmu_delay_free_pages(sp); | |
1981 | call_rcu(&sp->rcu, free_pages_rcu); | |
1982 | return; | |
1983 | } | |
1984 | ||
1985 | do { | |
1986 | sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); | |
1987 | WARN_ON(!sp->role.invalid || sp->root_count); | |
1988 | kvm_mmu_isolate_page(sp); | |
1989 | kvm_mmu_free_page(sp); | |
1990 | } while (!list_empty(invalid_list)); | |
1991 | ||
1992 | } | |
1993 | ||
1994 | /* | |
1995 | * Change the number of mmu pages allocated to the vm. | |
1996 | * Note: if goal_nr_mmu_pages is too small, you will get a deadlock. | |
1997 | */ | |
1998 | void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) | |
1999 | { | |
2000 | LIST_HEAD(invalid_list); | |
2001 | /* | |
2002 | * If we set the number of mmu pages to be smaller than the | |
2003 | * number of active pages, we must free some mmu pages before we | |
2004 | * change the value. | |
2005 | */ | |
2006 | ||
2007 | if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { | |
2008 | while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages && | |
2009 | !list_empty(&kvm->arch.active_mmu_pages)) { | |
2010 | struct kvm_mmu_page *page; | |
2011 | ||
2012 | page = container_of(kvm->arch.active_mmu_pages.prev, | |
2013 | struct kvm_mmu_page, link); | |
2014 | kvm_mmu_prepare_zap_page(kvm, page, &invalid_list); | |
2015 | } | |
2016 | kvm_mmu_commit_zap_page(kvm, &invalid_list); | |
2017 | goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; | |
2018 | } | |
2019 | ||
2020 | kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; | |
2021 | } | |
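/*
 * Editorial sketch: a hypothetical caller capping the shadow-page budget.
 * The zap helpers invoked above must run under mmu_lock, so a caller is
 * expected to look roughly like this (in-tree this is driven by the
 * KVM_SET_NR_MMU_PAGES ioctl):
 */
static void example_cap_mmu_pages(struct kvm *kvm, unsigned int limit)
{
	spin_lock(&kvm->mmu_lock);
	kvm_mmu_change_mmu_pages(kvm, limit);
	spin_unlock(&kvm->mmu_lock);
}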
2022 | ||
2023 | int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) | |
2024 | { | |
2025 | struct kvm_mmu_page *sp; | |
2026 | struct hlist_node *node; | |
2027 | LIST_HEAD(invalid_list); | |
2028 | int r; | |
2029 | ||
2030 | pgprintk("%s: looking for gfn %llx\n", __func__, gfn); | |
2031 | r = 0; | |
2032 | spin_lock(&kvm->mmu_lock); | |
2033 | for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { | |
2034 | pgprintk("%s: gfn %llx role %x\n", __func__, gfn, | |
2035 | sp->role.word); | |
2036 | r = 1; | |
2037 | kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); | |
2038 | } | |
2039 | kvm_mmu_commit_zap_page(kvm, &invalid_list); | |
2040 | spin_unlock(&kvm->mmu_lock); | |
2041 | ||
2042 | return r; | |
2043 | } | |
2044 | EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); | |
2045 | ||
2046 | static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) | |
2047 | { | |
2048 | int slot = memslot_id(kvm, gfn); | |
2049 | struct kvm_mmu_page *sp = page_header(__pa(pte)); | |
2050 | ||
2051 | __set_bit(slot, sp->slot_bitmap); | |
2052 | } | |
2053 | ||
2054 | /* | |
2055 | * The function is based on mtrr_type_lookup() in | |
2056 | * arch/x86/kernel/cpu/mtrr/generic.c | |
2057 | */ | |
2058 | static int get_mtrr_type(struct mtrr_state_type *mtrr_state, | |
2059 | u64 start, u64 end) | |
2060 | { | |
2061 | int i; | |
2062 | u64 base, mask; | |
2063 | u8 prev_match, curr_match; | |
2064 | int num_var_ranges = KVM_NR_VAR_MTRR; | |
2065 | ||
2066 | if (!mtrr_state->enabled) | |
2067 | return 0xFF; | |
2068 | ||
2069 | /* Make end inclusive, instead of exclusive */ | |
2070 | end--; | |
2071 | ||
2072 | /* Look in fixed ranges. Just return the type as per start */ | |
2073 | if (mtrr_state->have_fixed && (start < 0x100000)) { | |
2074 | int idx; | |
2075 | ||
2076 | if (start < 0x80000) { | |
2077 | idx = 0; | |
2078 | idx += (start >> 16); | |
2079 | return mtrr_state->fixed_ranges[idx]; | |
2080 | } else if (start < 0xC0000) { | |
2081 | idx = 1 * 8; | |
2082 | idx += ((start - 0x80000) >> 14); | |
2083 | return mtrr_state->fixed_ranges[idx]; | |
2084 | } else if (start < 0x1000000) { | |
2085 | idx = 3 * 8; | |
2086 | idx += ((start - 0xC0000) >> 12); | |
2087 | return mtrr_state->fixed_ranges[idx]; | |
2088 | } | |
2089 | } | |
2090 | ||
2091 | /* | |
2092 | * Look in variable ranges. | |
2093 | * Look for multiple ranges matching this address and pick the type | |
2094 | * as per MTRR precedence. | |
2095 | */ | |
2096 | if (!(mtrr_state->enabled & 2)) | |
2097 | return mtrr_state->def_type; | |
2098 | ||
2099 | prev_match = 0xFF; | |
2100 | for (i = 0; i < num_var_ranges; ++i) { | |
2101 | unsigned short start_state, end_state; | |
2102 | ||
2103 | if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11))) | |
2104 | continue; | |
2105 | ||
2106 | base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) + | |
2107 | (mtrr_state->var_ranges[i].base_lo & PAGE_MASK); | |
2108 | mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) + | |
2109 | (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK); | |
2110 | ||
2111 | start_state = ((start & mask) == (base & mask)); | |
2112 | end_state = ((end & mask) == (base & mask)); | |
2113 | if (start_state != end_state) | |
2114 | return 0xFE; | |
2115 | ||
2116 | if ((start & mask) != (base & mask)) | |
2117 | continue; | |
2118 | ||
2119 | curr_match = mtrr_state->var_ranges[i].base_lo & 0xff; | |
2120 | if (prev_match == 0xFF) { | |
2121 | prev_match = curr_match; | |
2122 | continue; | |
2123 | } | |
2124 | ||
2125 | if (prev_match == MTRR_TYPE_UNCACHABLE || | |
2126 | curr_match == MTRR_TYPE_UNCACHABLE) | |
2127 | return MTRR_TYPE_UNCACHABLE; | |
2128 | ||
2129 | if ((prev_match == MTRR_TYPE_WRBACK && | |
2130 | curr_match == MTRR_TYPE_WRTHROUGH) || | |
2131 | (prev_match == MTRR_TYPE_WRTHROUGH && | |
2132 | curr_match == MTRR_TYPE_WRBACK)) { | |
2133 | prev_match = MTRR_TYPE_WRTHROUGH; | |
2134 | curr_match = MTRR_TYPE_WRTHROUGH; | |
2135 | } | |
2136 | ||
2137 | if (prev_match != curr_match) | |
2138 | return MTRR_TYPE_UNCACHABLE; | |
2139 | } | |
2140 | ||
2141 | if (prev_match != 0xFF) | |
2142 | return prev_match; | |
2143 | ||
2144 | return mtrr_state->def_type; | |
2145 | } | |
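/*
 * Worked example (editorial): the fixed-range index arithmetic above.
 * The 11 fixed MTRRs cover the first 1MB as 8 x 64K, 16 x 16K and
 * 64 x 4K ranges stored back to back in fixed_ranges[], hence the
 * 0, 1*8 and 3*8 base offsets:
 *
 *	start = 0x9C000: idx = 1*8 + ((0x9C000 - 0x80000) >> 14) = 8 + 7 = 15
 *	start = 0xC8000: idx = 3*8 + ((0xC8000 - 0xC0000) >> 12) = 24 + 8 = 32
 */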
2146 | ||
2147 | u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) | |
2148 | { | |
2149 | u8 mtrr; | |
2150 | ||
2151 | mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT, | |
2152 | (gfn << PAGE_SHIFT) + PAGE_SIZE); | |
2153 | if (mtrr == 0xfe || mtrr == 0xff) | |
2154 | mtrr = MTRR_TYPE_WRBACK; | |
2155 | return mtrr; | |
2156 | } | |
2157 | EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type); | |
2158 | ||
2159 | static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |
2160 | { | |
2161 | trace_kvm_mmu_unsync_page(sp); | |
2162 | ++vcpu->kvm->stat.mmu_unsync; | |
2163 | sp->unsync = 1; | |
2164 | ||
2165 | kvm_mmu_mark_parents_unsync(sp); | |
2166 | } | |
2167 | ||
2168 | static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) | |
2169 | { | |
2170 | struct kvm_mmu_page *s; | |
2171 | struct hlist_node *node; | |
2172 | ||
2173 | for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { | |
2174 | if (s->unsync) | |
2175 | continue; | |
2176 | WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); | |
2177 | __kvm_unsync_page(vcpu, s); | |
2178 | } | |
2179 | } | |
2180 | ||
2181 | static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, | |
2182 | bool can_unsync) | |
2183 | { | |
2184 | struct kvm_mmu_page *s; | |
2185 | struct hlist_node *node; | |
2186 | bool need_unsync = false; | |
2187 | ||
2188 | for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { | |
2189 | if (!can_unsync) | |
2190 | return 1; | |
2191 | ||
2192 | if (s->role.level != PT_PAGE_TABLE_LEVEL) | |
2193 | return 1; | |
2194 | ||
2195 | if (!need_unsync && !s->unsync) { | |
2196 | need_unsync = true; | |
2197 | } | |
2198 | } | |
2199 | if (need_unsync) | |
2200 | kvm_unsync_pages(vcpu, gfn); | |
2201 | return 0; | |
2202 | } | |
2203 | ||
2204 | static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |
2205 | unsigned pte_access, int user_fault, | |
2206 | int write_fault, int level, | |
2207 | gfn_t gfn, pfn_t pfn, bool speculative, | |
2208 | bool can_unsync, bool host_writable) | |
2209 | { | |
2210 | u64 spte, entry = *sptep; | |
2211 | int ret = 0; | |
2212 | ||
2213 | if (set_mmio_spte(sptep, gfn, pfn, pte_access)) | |
2214 | return 0; | |
2215 | ||
2216 | spte = PT_PRESENT_MASK; | |
2217 | if (!speculative) | |
2218 | spte |= shadow_accessed_mask; | |
2219 | ||
2220 | if (pte_access & ACC_EXEC_MASK) | |
2221 | spte |= shadow_x_mask; | |
2222 | else | |
2223 | spte |= shadow_nx_mask; | |
2224 | if (pte_access & ACC_USER_MASK) | |
2225 | spte |= shadow_user_mask; | |
2226 | if (level > PT_PAGE_TABLE_LEVEL) | |
2227 | spte |= PT_PAGE_SIZE_MASK; | |
2228 | if (tdp_enabled) | |
2229 | spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, | |
2230 | kvm_is_mmio_pfn(pfn)); | |
2231 | ||
2232 | if (host_writable) | |
2233 | spte |= SPTE_HOST_WRITEABLE; | |
2234 | else | |
2235 | pte_access &= ~ACC_WRITE_MASK; | |
2236 | ||
2237 | spte |= (u64)pfn << PAGE_SHIFT; | |
2238 | ||
2239 | if ((pte_access & ACC_WRITE_MASK) | |
2240 | || (!vcpu->arch.mmu.direct_map && write_fault | |
2241 | && !is_write_protection(vcpu) && !user_fault)) { | |
2242 | ||
2243 | if (level > PT_PAGE_TABLE_LEVEL && | |
2244 | has_wrprotected_page(vcpu->kvm, gfn, level)) { | |
2245 | ret = 1; | |
2246 | drop_spte(vcpu->kvm, sptep); | |
2247 | goto done; | |
2248 | } | |
2249 | ||
2250 | spte |= PT_WRITABLE_MASK; | |
2251 | ||
2252 | if (!vcpu->arch.mmu.direct_map | |
2253 | && !(pte_access & ACC_WRITE_MASK)) { | |
2254 | spte &= ~PT_USER_MASK; | |
2255 | /* | |
2256 | * If we converted a user page to a kernel page | |
2257 | * so that the kernel can write to it when cr0.wp=0, | |
2258 | * then we must prevent the kernel from executing it | |
2259 | * when SMEP is enabled. | |
2260 | */ | |
2261 | if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) | |
2262 | spte |= PT64_NX_MASK; | |
2263 | } | |
2264 | ||
2265 | /* | |
2266 | * Optimization: for pte sync, if spte was writable the hash | |
2267 | * lookup is unnecessary (and expensive). Write protection | |
2268 | * is the responsibility of mmu_get_page / kvm_sync_page. | |
2269 | * Same reasoning can be applied to dirty page accounting. | |
2270 | */ | |
2271 | if (!can_unsync && is_writable_pte(*sptep)) | |
2272 | goto set_pte; | |
2273 | ||
2274 | if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { | |
2275 | pgprintk("%s: found shadow page for %llx, marking ro\n", | |
2276 | __func__, gfn); | |
2277 | ret = 1; | |
2278 | pte_access &= ~ACC_WRITE_MASK; | |
2279 | if (is_writable_pte(spte)) | |
2280 | spte &= ~PT_WRITABLE_MASK; | |
2281 | } | |
2282 | } | |
2283 | ||
2284 | if (pte_access & ACC_WRITE_MASK) | |
2285 | mark_page_dirty(vcpu->kvm, gfn); | |
2286 | ||
2287 | set_pte: | |
2288 | mmu_spte_update(sptep, spte); | |
2289 | /* | |
2290 | * If we overwrite a writable spte with a read-only one we | |
2291 | * should flush remote TLBs. Otherwise rmap_write_protect | |
2292 | * will find a read-only spte, even though the writable spte | |
2293 | * might be cached on a CPU's TLB. | |
2294 | */ | |
2295 | if (is_writable_pte(entry) && !is_writable_pte(*sptep)) | |
2296 | kvm_flush_remote_tlbs(vcpu->kvm); | |
2297 | done: | |
2298 | return ret; | |
2299 | } | |
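/*
 * Editorial sketch: the common-case leaf spte assembled above, restated
 * with the architectural x86 bit positions.  For classic shadow paging
 * the shadow_*_mask variables equal these bits; EPT encodes them
 * differently (and has no accessed bit here), which is why set_spte()
 * always goes through the variables.
 */
static unsigned long long example_leaf_spte(unsigned long long pfn)
{
	unsigned long long spte = 0;

	spte |= 1ULL << 0;	/* present  (PT_PRESENT_MASK)  */
	spte |= 1ULL << 1;	/* writable (PT_WRITABLE_MASK) */
	spte |= 1ULL << 2;	/* user     (PT_USER_MASK)     */
	spte |= 1ULL << 5;	/* accessed (PT_ACCESSED_MASK) */
	spte |= pfn << 12;	/* host page frame             */
	return spte;
}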
2300 | ||
2301 | static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |
2302 | unsigned pt_access, unsigned pte_access, | |
2303 | int user_fault, int write_fault, | |
2304 | int *emulate, int level, gfn_t gfn, | |
2305 | pfn_t pfn, bool speculative, | |
2306 | bool host_writable) | |
2307 | { | |
2308 | int was_rmapped = 0; | |
2309 | int rmap_count; | |
2310 | ||
2311 | pgprintk("%s: spte %llx access %x write_fault %d" | |
2312 | " user_fault %d gfn %llx\n", | |
2313 | __func__, *sptep, pt_access, | |
2314 | write_fault, user_fault, gfn); | |
2315 | ||
2316 | if (is_rmap_spte(*sptep)) { | |
2317 | /* | |
2318 | * If we overwrite a PTE page pointer with a 2MB PMD, unlink | |
2319 | * the parent of the now unreachable PTE. | |
2320 | */ | |
2321 | if (level > PT_PAGE_TABLE_LEVEL && | |
2322 | !is_large_pte(*sptep)) { | |
2323 | struct kvm_mmu_page *child; | |
2324 | u64 pte = *sptep; | |
2325 | ||
2326 | child = page_header(pte & PT64_BASE_ADDR_MASK); | |
2327 | drop_parent_pte(child, sptep); | |
2328 | kvm_flush_remote_tlbs(vcpu->kvm); | |
2329 | } else if (pfn != spte_to_pfn(*sptep)) { | |
2330 | pgprintk("hfn old %llx new %llx\n", | |
2331 | spte_to_pfn(*sptep), pfn); | |
2332 | drop_spte(vcpu->kvm, sptep); | |
2333 | kvm_flush_remote_tlbs(vcpu->kvm); | |
2334 | } else | |
2335 | was_rmapped = 1; | |
2336 | } | |
2337 | ||
2338 | if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, | |
2339 | level, gfn, pfn, speculative, true, | |
2340 | host_writable)) { | |
2341 | if (write_fault) | |
2342 | *emulate = 1; | |
2343 | kvm_mmu_flush_tlb(vcpu); | |
2344 | } | |
2345 | ||
2346 | if (unlikely(is_mmio_spte(*sptep) && emulate)) | |
2347 | *emulate = 1; | |
2348 | ||
2349 | pgprintk("%s: setting spte %llx\n", __func__, *sptep); | |
2350 | pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", | |
2351 | is_large_pte(*sptep)? "2MB" : "4kB", | |
2352 | *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, | |
2353 | *sptep, sptep); | |
2354 | if (!was_rmapped && is_large_pte(*sptep)) | |
2355 | ++vcpu->kvm->stat.lpages; | |
2356 | ||
2357 | if (is_shadow_present_pte(*sptep)) { | |
2358 | page_header_update_slot(vcpu->kvm, sptep, gfn); | |
2359 | if (!was_rmapped) { | |
2360 | rmap_count = rmap_add(vcpu, sptep, gfn); | |
2361 | if (rmap_count > RMAP_RECYCLE_THRESHOLD) | |
2362 | rmap_recycle(vcpu, sptep, gfn); | |
2363 | } | |
2364 | } | |
2365 | kvm_release_pfn_clean(pfn); | |
2366 | } | |
2367 | ||
2368 | static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) | |
2369 | { | |
2370 | } | |
2371 | ||
2372 | static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, | |
2373 | bool no_dirty_log) | |
2374 | { | |
2375 | struct kvm_memory_slot *slot; | |
2376 | unsigned long hva; | |
2377 | ||
2378 | slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); | |
2379 | if (!slot) { | |
2380 | get_page(fault_page); | |
2381 | return page_to_pfn(fault_page); | |
2382 | } | |
2383 | ||
2384 | hva = gfn_to_hva_memslot(slot, gfn); | |
2385 | ||
2386 | return hva_to_pfn_atomic(vcpu->kvm, hva); | |
2387 | } | |
2388 | ||
2389 | static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, | |
2390 | struct kvm_mmu_page *sp, | |
2391 | u64 *start, u64 *end) | |
2392 | { | |
2393 | struct page *pages[PTE_PREFETCH_NUM]; | |
2394 | unsigned access = sp->role.access; | |
2395 | int i, ret; | |
2396 | gfn_t gfn; | |
2397 | ||
2398 | gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); | |
2399 | if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK)) | |
2400 | return -1; | |
2401 | ||
2402 | ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start); | |
2403 | if (ret <= 0) | |
2404 | return -1; | |
2405 | ||
2406 | for (i = 0; i < ret; i++, gfn++, start++) | |
2407 | mmu_set_spte(vcpu, start, ACC_ALL, | |
2408 | access, 0, 0, NULL, | |
2409 | sp->role.level, gfn, | |
2410 | page_to_pfn(pages[i]), true, true); | |
2411 | ||
2412 | return 0; | |
2413 | } | |
2414 | ||
2415 | static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, | |
2416 | struct kvm_mmu_page *sp, u64 *sptep) | |
2417 | { | |
2418 | u64 *spte, *start = NULL; | |
2419 | int i; | |
2420 | ||
2421 | WARN_ON(!sp->role.direct); | |
2422 | ||
2423 | i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); | |
2424 | spte = sp->spt + i; | |
2425 | ||
2426 | for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { | |
2427 | if (is_shadow_present_pte(*spte) || spte == sptep) { | |
2428 | if (!start) | |
2429 | continue; | |
2430 | if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) | |
2431 | break; | |
2432 | start = NULL; | |
2433 | } else if (!start) | |
2434 | start = spte; | |
2435 | } | |
2436 | } | |
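/*
 * Worked example (editorial): the index masking above rounds the faulting
 * spte down to its PTE_PREFETCH_NUM (8) entry window, so a fault on entry
 * 13 of sp->spt scans entries 8..15:
 *
 *	i = 13 & ~(8 - 1) = 13 & ~7 = 8
 *
 * Already-present entries and the faulting entry itself act as
 * separators; each maximal run of empty sptes in the window is handed to
 * direct_pte_prefetch_many() as a [start, spte) range.
 */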
2437 | ||
2438 | static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) | |
2439 | { | |
2440 | struct kvm_mmu_page *sp; | |
2441 | ||
2442 | /* | |
2443 | * Since there is no accessed bit on EPT, there is no way to | |
2444 | * distinguish between actually accessed translations | |
2445 | * and prefetched ones, so disable pte prefetch if EPT is | |
2446 | * enabled. | |
2447 | */ | |
2448 | if (!shadow_accessed_mask) | |
2449 | return; | |
2450 | ||
2451 | sp = page_header(__pa(sptep)); | |
2452 | if (sp->role.level > PT_PAGE_TABLE_LEVEL) | |
2453 | return; | |
2454 | ||
2455 | __direct_pte_prefetch(vcpu, sp, sptep); | |
2456 | } | |
2457 | ||
2458 | static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | |
2459 | int map_writable, int level, gfn_t gfn, pfn_t pfn, | |
2460 | bool prefault) | |
2461 | { | |
2462 | struct kvm_shadow_walk_iterator iterator; | |
2463 | struct kvm_mmu_page *sp; | |
2464 | int emulate = 0; | |
2465 | gfn_t pseudo_gfn; | |
2466 | ||
2467 | for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { | |
2468 | if (iterator.level == level) { | |
2469 | unsigned pte_access = ACC_ALL; | |
2470 | ||
2471 | mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access, | |
2472 | 0, write, &emulate, | |
2473 | level, gfn, pfn, prefault, map_writable); | |
2474 | direct_pte_prefetch(vcpu, iterator.sptep); | |
2475 | ++vcpu->stat.pf_fixed; | |
2476 | break; | |
2477 | } | |
2478 | ||
2479 | if (!is_shadow_present_pte(*iterator.sptep)) { | |
2480 | u64 base_addr = iterator.addr; | |
2481 | ||
2482 | base_addr &= PT64_LVL_ADDR_MASK(iterator.level); | |
2483 | pseudo_gfn = base_addr >> PAGE_SHIFT; | |
2484 | sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, | |
2485 | iterator.level - 1, | |
2486 | 1, ACC_ALL, iterator.sptep); | |
2487 | if (!sp) { | |
2488 | pgprintk("nonpaging_map: ENOMEM\n"); | |
2489 | kvm_release_pfn_clean(pfn); | |
2490 | return -ENOMEM; | |
2491 | } | |
2492 | ||
2493 | mmu_spte_set(iterator.sptep, | |
2494 | __pa(sp->spt) | |
2495 | | PT_PRESENT_MASK | PT_WRITABLE_MASK | |
2496 | | shadow_user_mask | shadow_x_mask | |
2497 | | shadow_accessed_mask); | |
2498 | } | |
2499 | } | |
2500 | return emulate; | |
2501 | } | |
2502 | ||
2503 | static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) | |
2504 | { | |
2505 | siginfo_t info; | |
2506 | ||
2507 | info.si_signo = SIGBUS; | |
2508 | info.si_errno = 0; | |
2509 | info.si_code = BUS_MCEERR_AR; | |
2510 | info.si_addr = (void __user *)address; | |
2511 | info.si_addr_lsb = PAGE_SHIFT; | |
2512 | ||
2513 | send_sig_info(SIGBUS, &info, tsk); | |
2514 | } | |
2515 | ||
2516 | static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) | |
2517 | { | |
2518 | kvm_release_pfn_clean(pfn); | |
2519 | if (is_hwpoison_pfn(pfn)) { | |
2520 | kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current); | |
2521 | return 0; | |
2522 | } | |
2523 | ||
2524 | return -EFAULT; | |
2525 | } | |
2526 | ||
2527 | static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, | |
2528 | gfn_t *gfnp, pfn_t *pfnp, int *levelp) | |
2529 | { | |
2530 | pfn_t pfn = *pfnp; | |
2531 | gfn_t gfn = *gfnp; | |
2532 | int level = *levelp; | |
2533 | ||
2534 | /* | |
2535 | * Check if it's a transparent hugepage. If this were a | |
2536 | * hugetlbfs page, level wouldn't be set to | |
2537 | * PT_PAGE_TABLE_LEVEL and there would be no adjustment done | |
2538 | * here. | |
2539 | */ | |
2540 | if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) && | |
2541 | level == PT_PAGE_TABLE_LEVEL && | |
2542 | PageTransCompound(pfn_to_page(pfn)) && | |
2543 | !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) { | |
2544 | unsigned long mask; | |
2545 | /* | |
2546 | * mmu_notifier_retry was successful and we hold the | |
2547 | * mmu_lock here, so the pmd can't start splitting | |
2548 | * under us, and in turn | |
2549 | * __split_huge_page_refcount() can't run under | |
2550 | * us either, so we can safely transfer the refcount from | |
2551 | * PG_tail to PG_head as we switch the pfn from tail to | |
2552 | * head. | |
2553 | */ | |
2554 | *levelp = level = PT_DIRECTORY_LEVEL; | |
2555 | mask = KVM_PAGES_PER_HPAGE(level) - 1; | |
2556 | VM_BUG_ON((gfn & mask) != (pfn & mask)); | |
2557 | if (pfn & mask) { | |
2558 | gfn &= ~mask; | |
2559 | *gfnp = gfn; | |
2560 | kvm_release_pfn_clean(pfn); | |
2561 | pfn &= ~mask; | |
2562 | if (!get_page_unless_zero(pfn_to_page(pfn))) | |
2563 | BUG(); | |
2564 | *pfnp = pfn; | |
2565 | } | |
2566 | } | |
2567 | } | |
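/*
 * Worked example (editorial): at level = PT_DIRECTORY_LEVEL a huge frame
 * spans KVM_PAGES_PER_HPAGE(2) = 512 small pages, so mask = 0x1ff.  If
 * the fault hit gfn 0x1234 backed by pfn 0x5634, both share the in-frame
 * offset 0x34, the VM_BUG_ON holds, and the mapping is rounded down to
 * gfn 0x1200 / pfn 0x5600 before the caller installs one 2MB spte.  The
 * release/get pair transfers the reference taken at fault time from the
 * THP tail page to the head-aligned pfn that actually gets mapped.
 */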
2568 | ||
2569 | static bool mmu_invalid_pfn(pfn_t pfn) | |
2570 | { | |
2571 | return unlikely(is_invalid_pfn(pfn)); | |
2572 | } | |
2573 | ||
2574 | static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, | |
2575 | pfn_t pfn, unsigned access, int *ret_val) | |
2576 | { | |
2577 | bool ret = true; | |
2578 | ||
2579 | /* The pfn is invalid, report the error! */ | |
2580 | if (unlikely(is_invalid_pfn(pfn))) { | |
2581 | *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn); | |
2582 | goto exit; | |
2583 | } | |
2584 | ||
2585 | if (unlikely(is_noslot_pfn(pfn))) | |
2586 | vcpu_cache_mmio_info(vcpu, gva, gfn, access); | |
2587 | ||
2588 | ret = false; | |
2589 | exit: | |
2590 | return ret; | |
2591 | } | |
2592 | ||
2593 | static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, | |
2594 | gva_t gva, pfn_t *pfn, bool write, bool *writable); | |
2595 | ||
2596 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, | |
2597 | bool prefault) | |
2598 | { | |
2599 | int r; | |
2600 | int level; | |
2601 | int force_pt_level; | |
2602 | pfn_t pfn; | |
2603 | unsigned long mmu_seq; | |
2604 | bool map_writable; | |
2605 | ||
2606 | force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); | |
2607 | if (likely(!force_pt_level)) { | |
2608 | level = mapping_level(vcpu, gfn); | |
2609 | /* | |
2610 | * This path builds a PAE pagetable - so we can map | |
2611 | * 2mb pages at maximum. Therefore check if the level | |
2612 | * is larger than that. | |
2613 | */ | |
2614 | if (level > PT_DIRECTORY_LEVEL) | |
2615 | level = PT_DIRECTORY_LEVEL; | |
2616 | ||
2617 | gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); | |
2618 | } else | |
2619 | level = PT_PAGE_TABLE_LEVEL; | |
2620 | ||
2621 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | |
2622 | smp_rmb(); | |
2623 | ||
2624 | if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) | |
2625 | return 0; | |
2626 | ||
2627 | if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) | |
2628 | return r; | |
2629 | ||
2630 | spin_lock(&vcpu->kvm->mmu_lock); | |
2631 | if (mmu_notifier_retry(vcpu, mmu_seq)) | |
2632 | goto out_unlock; | |
2633 | kvm_mmu_free_some_pages(vcpu); | |
2634 | if (likely(!force_pt_level)) | |
2635 | transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); | |
2636 | r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, | |
2637 | prefault); | |
2638 | spin_unlock(&vcpu->kvm->mmu_lock); | |
2639 | ||
2640 | ||
2641 | return r; | |
2642 | ||
2643 | out_unlock: | |
2644 | spin_unlock(&vcpu->kvm->mmu_lock); | |
2645 | kvm_release_pfn_clean(pfn); | |
2646 | return 0; | |
2647 | } | |
2648 | ||
2649 | ||
2650 | static void mmu_free_roots(struct kvm_vcpu *vcpu) | |
2651 | { | |
2652 | int i; | |
2653 | struct kvm_mmu_page *sp; | |
2654 | LIST_HEAD(invalid_list); | |
2655 | ||
2656 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) | |
2657 | return; | |
2658 | spin_lock(&vcpu->kvm->mmu_lock); | |
2659 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL && | |
2660 | (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL || | |
2661 | vcpu->arch.mmu.direct_map)) { | |
2662 | hpa_t root = vcpu->arch.mmu.root_hpa; | |
2663 | ||
2664 | sp = page_header(root); | |
2665 | --sp->root_count; | |
2666 | if (!sp->root_count && sp->role.invalid) { | |
2667 | kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); | |
2668 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); | |
2669 | } | |
2670 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | |
2671 | spin_unlock(&vcpu->kvm->mmu_lock); | |
2672 | return; | |
2673 | } | |
2674 | for (i = 0; i < 4; ++i) { | |
2675 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | |
2676 | ||
2677 | if (root) { | |
2678 | root &= PT64_BASE_ADDR_MASK; | |
2679 | sp = page_header(root); | |
2680 | --sp->root_count; | |
2681 | if (!sp->root_count && sp->role.invalid) | |
2682 | kvm_mmu_prepare_zap_page(vcpu->kvm, sp, | |
2683 | &invalid_list); | |
2684 | } | |
2685 | vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; | |
2686 | } | |
2687 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); | |
2688 | spin_unlock(&vcpu->kvm->mmu_lock); | |
2689 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | |
2690 | } | |
2691 | ||
2692 | static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) | |
2693 | { | |
2694 | int ret = 0; | |
2695 | ||
2696 | if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { | |
2697 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); | |
2698 | ret = 1; | |
2699 | } | |
2700 | ||
2701 | return ret; | |
2702 | } | |
2703 | ||
2704 | static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) | |
2705 | { | |
2706 | struct kvm_mmu_page *sp; | |
2707 | unsigned i; | |
2708 | ||
2709 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { | |
2710 | spin_lock(&vcpu->kvm->mmu_lock); | |
2711 | kvm_mmu_free_some_pages(vcpu); | |
2712 | sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, | |
2713 | 1, ACC_ALL, NULL); | |
2714 | ++sp->root_count; | |
2715 | spin_unlock(&vcpu->kvm->mmu_lock); | |
2716 | vcpu->arch.mmu.root_hpa = __pa(sp->spt); | |
2717 | } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) { | |
2718 | for (i = 0; i < 4; ++i) { | |
2719 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | |
2720 | ||
2721 | ASSERT(!VALID_PAGE(root)); | |
2722 | spin_lock(&vcpu->kvm->mmu_lock); | |
2723 | kvm_mmu_free_some_pages(vcpu); | |
2724 | sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), | |
2725 | i << 30, | |
2726 | PT32_ROOT_LEVEL, 1, ACC_ALL, | |
2727 | NULL); | |
2728 | root = __pa(sp->spt); | |
2729 | ++sp->root_count; | |
2730 | spin_unlock(&vcpu->kvm->mmu_lock); | |
2731 | vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; | |
2732 | } | |
2733 | vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); | |
2734 | } else | |
2735 | BUG(); | |
2736 | ||
2737 | return 0; | |
2738 | } | |
2739 | ||
2740 | static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) | |
2741 | { | |
2742 | struct kvm_mmu_page *sp; | |
2743 | u64 pdptr, pm_mask; | |
2744 | gfn_t root_gfn; | |
2745 | int i; | |
2746 | ||
2747 | root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT; | |
2748 | ||
2749 | if (mmu_check_root(vcpu, root_gfn)) | |
2750 | return 1; | |
2751 | ||
2752 | /* | |
2753 | * Do we shadow a long mode page table? If so we need to | |
2754 | * write-protect the guest's page table root. | |
2755 | */ | |
2756 | if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { | |
2757 | hpa_t root = vcpu->arch.mmu.root_hpa; | |
2758 | ||
2759 | ASSERT(!VALID_PAGE(root)); | |
2760 | ||
2761 | spin_lock(&vcpu->kvm->mmu_lock); | |
2762 | kvm_mmu_free_some_pages(vcpu); | |
2763 | sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, | |
2764 | 0, ACC_ALL, NULL); | |
2765 | root = __pa(sp->spt); | |
2766 | ++sp->root_count; | |
2767 | spin_unlock(&vcpu->kvm->mmu_lock); | |
2768 | vcpu->arch.mmu.root_hpa = root; | |
2769 | return 0; | |
2770 | } | |
2771 | ||
2772 | /* | |
2773 | * We shadow a 32 bit page table. This may be a legacy 2-level | |
2774 | * or a PAE 3-level page table. In either case we need to be aware that | |
2775 | * the shadow page table may be a PAE or a long mode page table. | |
2776 | */ | |
2777 | pm_mask = PT_PRESENT_MASK; | |
2778 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) | |
2779 | pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; | |
2780 | ||
2781 | for (i = 0; i < 4; ++i) { | |
2782 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | |
2783 | ||
2784 | ASSERT(!VALID_PAGE(root)); | |
2785 | if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { | |
2786 | pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i); | |
2787 | if (!is_present_gpte(pdptr)) { | |
2788 | vcpu->arch.mmu.pae_root[i] = 0; | |
2789 | continue; | |
2790 | } | |
2791 | root_gfn = pdptr >> PAGE_SHIFT; | |
2792 | if (mmu_check_root(vcpu, root_gfn)) | |
2793 | return 1; | |
2794 | } | |
2795 | spin_lock(&vcpu->kvm->mmu_lock); | |
2796 | kvm_mmu_free_some_pages(vcpu); | |
2797 | sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, | |
2798 | PT32_ROOT_LEVEL, 0, | |
2799 | ACC_ALL, NULL); | |
2800 | root = __pa(sp->spt); | |
2801 | ++sp->root_count; | |
2802 | spin_unlock(&vcpu->kvm->mmu_lock); | |
2803 | ||
2804 | vcpu->arch.mmu.pae_root[i] = root | pm_mask; | |
2805 | } | |
2806 | vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); | |
2807 | ||
2808 | /* | |
2809 | * If we shadow a 32 bit page table with a long mode page | |
2810 | * table we enter this path. | |
2811 | */ | |
2812 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { | |
2813 | if (vcpu->arch.mmu.lm_root == NULL) { | |
2814 | /* | |
2815 | * The additional page necessary for this is only | |
2816 | * allocated on demand. | |
2817 | */ | |
2818 | ||
2819 | u64 *lm_root; | |
2820 | ||
2821 | lm_root = (void*)get_zeroed_page(GFP_KERNEL); | |
2822 | if (lm_root == NULL) | |
2823 | return 1; | |
2824 | ||
2825 | lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask; | |
2826 | ||
2827 | vcpu->arch.mmu.lm_root = lm_root; | |
2828 | } | |
2829 | ||
2830 | vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root); | |
2831 | } | |
2832 | ||
2833 | return 0; | |
2834 | } | |
2835 | ||
2836 | static int mmu_alloc_roots(struct kvm_vcpu *vcpu) | |
2837 | { | |
2838 | if (vcpu->arch.mmu.direct_map) | |
2839 | return mmu_alloc_direct_roots(vcpu); | |
2840 | else | |
2841 | return mmu_alloc_shadow_roots(vcpu); | |
2842 | } | |
2843 | ||
2844 | static void mmu_sync_roots(struct kvm_vcpu *vcpu) | |
2845 | { | |
2846 | int i; | |
2847 | struct kvm_mmu_page *sp; | |
2848 | ||
2849 | if (vcpu->arch.mmu.direct_map) | |
2850 | return; | |
2851 | ||
2852 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) | |
2853 | return; | |
2854 | ||
2855 | vcpu_clear_mmio_info(vcpu, ~0ul); | |
2856 | kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); | |
2857 | if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { | |
2858 | hpa_t root = vcpu->arch.mmu.root_hpa; | |
2859 | sp = page_header(root); | |
2860 | mmu_sync_children(vcpu, sp); | |
2861 | kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); | |
2862 | return; | |
2863 | } | |
2864 | for (i = 0; i < 4; ++i) { | |
2865 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | |
2866 | ||
2867 | if (root && VALID_PAGE(root)) { | |
2868 | root &= PT64_BASE_ADDR_MASK; | |
2869 | sp = page_header(root); | |
2870 | mmu_sync_children(vcpu, sp); | |
2871 | } | |
2872 | } | |
2873 | kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); | |
2874 | } | |
2875 | ||
2876 | void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) | |
2877 | { | |
2878 | spin_lock(&vcpu->kvm->mmu_lock); | |
2879 | mmu_sync_roots(vcpu); | |
2880 | spin_unlock(&vcpu->kvm->mmu_lock); | |
2881 | } | |
2882 | ||
2883 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, | |
2884 | u32 access, struct x86_exception *exception) | |
2885 | { | |
2886 | if (exception) | |
2887 | exception->error_code = 0; | |
2888 | return vaddr; | |
2889 | } | |
2890 | ||
2891 | static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, | |
2892 | u32 access, | |
2893 | struct x86_exception *exception) | |
2894 | { | |
2895 | if (exception) | |
2896 | exception->error_code = 0; | |
2897 | return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); | |
2898 | } | |
2899 | ||
2900 | static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct) | |
2901 | { | |
2902 | if (direct) | |
2903 | return vcpu_match_mmio_gpa(vcpu, addr); | |
2904 | ||
2905 | return vcpu_match_mmio_gva(vcpu, addr); | |
2906 | } | |
2907 | ||
2908 | ||
2909 | /* | |
2910 | * On direct hosts, the last spte only allows two states | |
2911 | * for an mmio page fault: | |
2912 | * - It is the mmio spte | |
2913 | * - It is zapped or it is being zapped. | |
2914 | * | |
2915 | * This function completely checks the spte when the last spte | |
2916 | * is not the mmio spte. | |
2917 | */ | |
2918 | static bool check_direct_spte_mmio_pf(u64 spte) | |
2919 | { | |
2920 | return __check_direct_spte_mmio_pf(spte); | |
2921 | } | |
2922 | ||
2923 | static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr) | |
2924 | { | |
2925 | struct kvm_shadow_walk_iterator iterator; | |
2926 | u64 spte = 0ull; | |
2927 | ||
2928 | walk_shadow_page_lockless_begin(vcpu); | |
2929 | for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) | |
2930 | if (!is_shadow_present_pte(spte)) | |
2931 | break; | |
2932 | walk_shadow_page_lockless_end(vcpu); | |
2933 | ||
2934 | return spte; | |
2935 | } | |
2936 | ||
2937 | /* | |
2938 | * If it is a real mmio page fault, return 1 and emulate the instruction | |
2939 | * directly; return 0 to let the CPU fault again on the address; -1 is | |
2940 | * returned if a bug is detected. | |
2941 | */ | |
2942 | int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) | |
2943 | { | |
2944 | u64 spte; | |
2945 | ||
2946 | if (quickly_check_mmio_pf(vcpu, addr, direct)) | |
2947 | return 1; | |
2948 | ||
2949 | spte = walk_shadow_page_get_mmio_spte(vcpu, addr); | |
2950 | ||
2951 | if (is_mmio_spte(spte)) { | |
2952 | gfn_t gfn = get_mmio_spte_gfn(spte); | |
2953 | unsigned access = get_mmio_spte_access(spte); | |
2954 | ||
2955 | if (direct) | |
2956 | addr = 0; | |
2957 | ||
2958 | trace_handle_mmio_page_fault(addr, gfn, access); | |
2959 | vcpu_cache_mmio_info(vcpu, addr, gfn, access); | |
2960 | return 1; | |
2961 | } | |
2962 | ||
2963 | /* | |
2964 | * It's ok if the gva is remapped by other cpus on a shadow guest, | |
2965 | * but it's a BUG if the gfn is not an mmio page. | |
2966 | */ | |
2967 | if (direct && !check_direct_spte_mmio_pf(spte)) | |
2968 | return -1; | |
2969 | ||
2970 | /* | |
2971 | * If the page table is zapped by other cpus, let CPU fault again on | |
2972 | * the address. | |
2973 | */ | |
2974 | return 0; | |
2975 | } | |
2976 | EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common); | |
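/*
 * Editorial sketch: a hypothetical caller dispatching on the three return
 * values documented above (the in-tree users follow the same shape):
 */
static int example_dispatch_mmio_pf(struct kvm_vcpu *vcpu, u64 addr)
{
	int ret = handle_mmio_page_fault_common(vcpu, addr, true);

	if (ret < 0)	/* bug detected: spte in an impossible state */
		return ret;
	if (ret == 0)	/* zapped under us: let the CPU fault again */
		return 0;
	return 1;	/* genuine mmio access: emulate the instruction */
}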
2977 | ||
2978 | static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, | |
2979 | u32 error_code, bool direct) | |
2980 | { | |
2981 | int ret; | |
2982 | ||
2983 | ret = handle_mmio_page_fault_common(vcpu, addr, direct); | |
2984 | WARN_ON(ret < 0); | |
2985 | return ret; | |
2986 | } | |
2987 | ||
2988 | static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | |
2989 | u32 error_code, bool prefault) | |
2990 | { | |
2991 | gfn_t gfn; | |
2992 | int r; | |
2993 | ||
2994 | pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); | |
2995 | ||
2996 | if (unlikely(error_code & PFERR_RSVD_MASK)) | |
2997 | return handle_mmio_page_fault(vcpu, gva, error_code, true); | |
2998 | ||
2999 | r = mmu_topup_memory_caches(vcpu); | |
3000 | if (r) | |
3001 | return r; | |
3002 | ||
3003 | ASSERT(vcpu); | |
3004 | ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); | |
3005 | ||
3006 | gfn = gva >> PAGE_SHIFT; | |
3007 | ||
3008 | return nonpaging_map(vcpu, gva & PAGE_MASK, | |
3009 | error_code & PFERR_WRITE_MASK, gfn, prefault); | |
3010 | } | |
3011 | ||
3012 | static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) | |
3013 | { | |
3014 | struct kvm_arch_async_pf arch; | |
3015 | ||
3016 | arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; | |
3017 | arch.gfn = gfn; | |
3018 | arch.direct_map = vcpu->arch.mmu.direct_map; | |
3019 | arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu); | |
3020 | ||
3021 | return kvm_setup_async_pf(vcpu, gva, gfn, &arch); | |
3022 | } | |
3023 | ||
3024 | static bool can_do_async_pf(struct kvm_vcpu *vcpu) | |
3025 | { | |
3026 | if (unlikely(!irqchip_in_kernel(vcpu->kvm) || | |
3027 | kvm_event_needs_reinjection(vcpu))) | |
3028 | return false; | |
3029 | ||
3030 | return kvm_x86_ops->interrupt_allowed(vcpu); | |
3031 | } | |
3032 | ||
3033 | static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, | |
3034 | gva_t gva, pfn_t *pfn, bool write, bool *writable) | |
3035 | { | |
3036 | bool async; | |
3037 | ||
3038 | *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable); | |
3039 | ||
3040 | if (!async) | |
3041 | return false; /* *pfn has correct page already */ | |
3042 | ||
3043 | put_page(pfn_to_page(*pfn)); | |
3044 | ||
3045 | if (!prefault && can_do_async_pf(vcpu)) { | |
3046 | trace_kvm_try_async_get_page(gva, gfn); | |
3047 | if (kvm_find_async_pf_gfn(vcpu, gfn)) { | |
3048 | trace_kvm_async_pf_doublefault(gva, gfn); | |
3049 | kvm_make_request(KVM_REQ_APF_HALT, vcpu); | |
3050 | return true; | |
3051 | } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn)) | |
3052 | return true; | |
3053 | } | |
3054 | ||
3055 | *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable); | |
3056 | ||
3057 | return false; | |
3058 | } | |
3059 | ||
3060 | static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, | |
3061 | bool prefault) | |
3062 | { | |
3063 | pfn_t pfn; | |
3064 | int r; | |
3065 | int level; | |
3066 | int force_pt_level; | |
3067 | gfn_t gfn = gpa >> PAGE_SHIFT; | |
3068 | unsigned long mmu_seq; | |
3069 | int write = error_code & PFERR_WRITE_MASK; | |
3070 | bool map_writable; | |
3071 | ||
3072 | ASSERT(vcpu); | |
3073 | ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); | |
3074 | ||
3075 | if (unlikely(error_code & PFERR_RSVD_MASK)) | |
3076 | return handle_mmio_page_fault(vcpu, gpa, error_code, true); | |
3077 | ||
3078 | r = mmu_topup_memory_caches(vcpu); | |
3079 | if (r) | |
3080 | return r; | |
3081 | ||
3082 | force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); | |
3083 | if (likely(!force_pt_level)) { | |
3084 | level = mapping_level(vcpu, gfn); | |
3085 | gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); | |
3086 | } else | |
3087 | level = PT_PAGE_TABLE_LEVEL; | |
3088 | ||
3089 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | |
3090 | smp_rmb(); | |
3091 | ||
3092 | if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) | |
3093 | return 0; | |
3094 | ||
3095 | if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) | |
3096 | return r; | |
3097 | ||
3098 | spin_lock(&vcpu->kvm->mmu_lock); | |
3099 | if (mmu_notifier_retry(vcpu, mmu_seq)) | |
3100 | goto out_unlock; | |
3101 | kvm_mmu_free_some_pages(vcpu); | |
3102 | if (likely(!force_pt_level)) | |
3103 | transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); | |
3104 | r = __direct_map(vcpu, gpa, write, map_writable, | |
3105 | level, gfn, pfn, prefault); | |
3106 | spin_unlock(&vcpu->kvm->mmu_lock); | |
3107 | ||
3108 | return r; | |
3109 | ||
3110 | out_unlock: | |
3111 | spin_unlock(&vcpu->kvm->mmu_lock); | |
3112 | kvm_release_pfn_clean(pfn); | |
3113 | return 0; | |
3114 | } | |
3115 | ||
3116 | static void nonpaging_free(struct kvm_vcpu *vcpu) | |
3117 | { | |
3118 | mmu_free_roots(vcpu); | |
3119 | } | |
3120 | ||
3121 | static int nonpaging_init_context(struct kvm_vcpu *vcpu, | |
3122 | struct kvm_mmu *context) | |
3123 | { | |
3124 | context->new_cr3 = nonpaging_new_cr3; | |
3125 | context->page_fault = nonpaging_page_fault; | |
3126 | context->gva_to_gpa = nonpaging_gva_to_gpa; | |
3127 | context->free = nonpaging_free; | |
3128 | context->sync_page = nonpaging_sync_page; | |
3129 | context->invlpg = nonpaging_invlpg; | |
3130 | context->update_pte = nonpaging_update_pte; | |
3131 | context->root_level = 0; | |
3132 | context->shadow_root_level = PT32E_ROOT_LEVEL; | |
3133 | context->root_hpa = INVALID_PAGE; | |
3134 | context->direct_map = true; | |
3135 | context->nx = false; | |
3136 | return 0; | |
3137 | } | |
3138 | ||
3139 | void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) | |
3140 | { | |
3141 | ++vcpu->stat.tlb_flush; | |
3142 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); | |
3143 | } | |
3144 | ||
3145 | static void paging_new_cr3(struct kvm_vcpu *vcpu) | |
3146 | { | |
3147 | pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu)); | |
3148 | mmu_free_roots(vcpu); | |
3149 | } | |
3150 | ||
3151 | static unsigned long get_cr3(struct kvm_vcpu *vcpu) | |
3152 | { | |
3153 | return kvm_read_cr3(vcpu); | |
3154 | } | |
3155 | ||
3156 | static void inject_page_fault(struct kvm_vcpu *vcpu, | |
3157 | struct x86_exception *fault) | |
3158 | { | |
3159 | vcpu->arch.mmu.inject_page_fault(vcpu, fault); | |
3160 | } | |
3161 | ||
3162 | static void paging_free(struct kvm_vcpu *vcpu) | |
3163 | { | |
3164 | nonpaging_free(vcpu); | |
3165 | } | |
3166 | ||
3167 | static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) | |
3168 | { | |
3169 | int bit7; | |
3170 | ||
3171 | bit7 = (gpte >> 7) & 1; | |
3172 | return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; | |
3173 | } | |
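/*
 * Editorial note: bit 7 of a gpte is the architectural PS (page size) bit
 * in directory entries, so rsvd_bits_mask[bit7][level-1] transparently
 * selects the stricter "large page" row built by reset_rsvds_bits_mask()
 * below when the guest maps a huge page, and the normal row otherwise.
 */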
3174 | ||
3175 | static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, | |
3176 | int *nr_present) | |
3177 | { | |
3178 | if (unlikely(is_mmio_spte(*sptep))) { | |
3179 | if (gfn != get_mmio_spte_gfn(*sptep)) { | |
3180 | mmu_spte_clear_no_track(sptep); | |
3181 | return true; | |
3182 | } | |
3183 | ||
3184 | (*nr_present)++; | |
3185 | mark_mmio_spte(sptep, gfn, access); | |
3186 | return true; | |
3187 | } | |
3188 | ||
3189 | return false; | |
3190 | } | |
3191 | ||
3192 | #define PTTYPE 64 | |
3193 | #include "paging_tmpl.h" | |
3194 | #undef PTTYPE | |
3195 | ||
3196 | #define PTTYPE 32 | |
3197 | #include "paging_tmpl.h" | |
3198 | #undef PTTYPE | |
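/*
 * Editorial note: paging_tmpl.h is a poor man's C template - the same
 * guest-walker code is compiled twice, with PTTYPE selecting 64-bit
 * (long mode / PAE) or 32-bit guest ptes and an FNAME() paste macro
 * prefixing every symbol, which is where paging64_page_fault,
 * paging32_gva_to_gpa etc. used below come from.  A minimal sketch of
 * the idiom with a hypothetical header tmpl.h:
 *
 *	-- tmpl.h --
 *	static int FNAME(entry_bytes)(void) { return PTTYPE / 8; }
 *
 *	-- user --
 *	#define PTTYPE 64
 *	#define FNAME(name) paging64_##name
 *	#include "tmpl.h"
 *	#undef FNAME
 *	#undef PTTYPE
 */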
3199 | ||
3200 | static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, | |
3201 | struct kvm_mmu *context, | |
3202 | int level) | |
3203 | { | |
3204 | int maxphyaddr = cpuid_maxphyaddr(vcpu); | |
3205 | u64 exb_bit_rsvd = 0; | |
3206 | ||
3207 | if (!context->nx) | |
3208 | exb_bit_rsvd = rsvd_bits(63, 63); | |
3209 | switch (level) { | |
3210 | case PT32_ROOT_LEVEL: | |
3211 | /* no rsvd bits for 2 level 4K page table entries */ | |
3212 | context->rsvd_bits_mask[0][1] = 0; | |
3213 | context->rsvd_bits_mask[0][0] = 0; | |
3214 | context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; | |
3215 | ||
3216 | if (!is_pse(vcpu)) { | |
3217 | context->rsvd_bits_mask[1][1] = 0; | |
3218 | break; | |
3219 | } | |
3220 | ||
3221 | if (is_cpuid_PSE36()) | |
3222 | /* 36bits PSE 4MB page */ | |
3223 | context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); | |
3224 | else | |
3225 | /* 32 bits PSE 4MB page */ | |
3226 | context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); | |
3227 | break; | |
3228 | case PT32E_ROOT_LEVEL: | |
3229 | context->rsvd_bits_mask[0][2] = | |
3230 | rsvd_bits(maxphyaddr, 63) | | |
3231 | rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */ | |
3232 | context->rsvd_bits_mask[0][1] = exb_bit_rsvd | | |
3233 | rsvd_bits(maxphyaddr, 62); /* PDE */ | |
3234 | context->rsvd_bits_mask[0][0] = exb_bit_rsvd | | |
3235 | rsvd_bits(maxphyaddr, 62); /* PTE */ | |
3236 | context->rsvd_bits_mask[1][1] = exb_bit_rsvd | | |
3237 | rsvd_bits(maxphyaddr, 62) | | |
3238 | rsvd_bits(13, 20); /* large page */ | |
3239 | context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; | |
3240 | break; | |
3241 | case PT64_ROOT_LEVEL: | |
3242 | context->rsvd_bits_mask[0][3] = exb_bit_rsvd | | |
3243 | rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); | |
3244 | context->rsvd_bits_mask[0][2] = exb_bit_rsvd | | |
3245 | rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); | |
3246 | context->rsvd_bits_mask[0][1] = exb_bit_rsvd | | |
3247 | rsvd_bits(maxphyaddr, 51); | |
3248 | context->rsvd_bits_mask[0][0] = exb_bit_rsvd | | |
3249 | rsvd_bits(maxphyaddr, 51); | |
3250 | context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; | |
3251 | context->rsvd_bits_mask[1][2] = exb_bit_rsvd | | |
3252 | rsvd_bits(maxphyaddr, 51) | | |
3253 | rsvd_bits(13, 29); | |
3254 | context->rsvd_bits_mask[1][1] = exb_bit_rsvd | | |
3255 | rsvd_bits(maxphyaddr, 51) | | |
3256 | rsvd_bits(13, 20); /* large page */ | |
3257 | context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; | |
3258 | break; | |
3259 | } | |
3260 | } | |
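/*
 * Worked example (editorial): rsvd_bits(s, e) builds the mask of bits
 * s..e inclusive, so on a CPU with maxphyaddr = 36 the PT64 masks above
 * flag bits 36..51 as reserved in every entry:
 *
 *	rsvd_bits(36, 51) = ((1ULL << 16) - 1) << 36 = 0x000ffff000000000
 *
 * is_rsvd_bits_set() (above) checks guest ptes against these masks during
 * the software walk, so the guest observes the architectural reserved-bit
 * page fault (PFERR_RSVD_MASK) it would see on bare metal.
 */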
3261 | ||
3262 | static int paging64_init_context_common(struct kvm_vcpu *vcpu, | |
3263 | struct kvm_mmu *context, | |
3264 | int level) | |
3265 | { | |
3266 | context->nx = is_nx(vcpu); | |
3267 | ||
3268 | reset_rsvds_bits_mask(vcpu, context, level); | |
3269 | ||
3270 | ASSERT(is_pae(vcpu)); | |
3271 | context->new_cr3 = paging_new_cr3; | |
3272 | context->page_fault = paging64_page_fault; | |
3273 | context->gva_to_gpa = paging64_gva_to_gpa; | |
3274 | context->sync_page = paging64_sync_page; | |
3275 | context->invlpg = paging64_invlpg; | |
3276 | context->update_pte = paging64_update_pte; | |
3277 | context->free = paging_free; | |
3278 | context->root_level = level; | |
3279 | context->shadow_root_level = level; | |
3280 | context->root_hpa = INVALID_PAGE; | |
3281 | context->direct_map = false; | |
3282 | return 0; | |
3283 | } | |
3284 | ||
3285 | static int paging64_init_context(struct kvm_vcpu *vcpu, | |
3286 | struct kvm_mmu *context) | |
3287 | { | |
3288 | return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL); | |
3289 | } | |
3290 | ||
3291 | static int paging32_init_context(struct kvm_vcpu *vcpu, | |
3292 | struct kvm_mmu *context) | |
3293 | { | |
3294 | context->nx = false; | |
3295 | ||
3296 | reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); | |
3297 | ||
3298 | context->new_cr3 = paging_new_cr3; | |
3299 | context->page_fault = paging32_page_fault; | |
3300 | context->gva_to_gpa = paging32_gva_to_gpa; | |
3301 | context->free = paging_free; | |
3302 | context->sync_page = paging32_sync_page; | |
3303 | context->invlpg = paging32_invlpg; | |
3304 | context->update_pte = paging32_update_pte; | |
3305 | context->root_level = PT32_ROOT_LEVEL; | |
3306 | context->shadow_root_level = PT32E_ROOT_LEVEL; | |
3307 | context->root_hpa = INVALID_PAGE; | |
3308 | context->direct_map = false; | |
3309 | return 0; | |
3310 | } | |
3311 | ||
3312 | static int paging32E_init_context(struct kvm_vcpu *vcpu, | |
3313 | struct kvm_mmu *context) | |
3314 | { | |
3315 | return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); | |
3316 | } | |
3317 | ||
3318 | static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) | |
3319 | { | |
3320 | struct kvm_mmu *context = vcpu->arch.walk_mmu; | |
3321 | ||
3322 | context->base_role.word = 0; | |
3323 | context->new_cr3 = nonpaging_new_cr3; | |
3324 | context->page_fault = tdp_page_fault; | |
3325 | context->free = nonpaging_free; | |
3326 | context->sync_page = nonpaging_sync_page; | |
3327 | context->invlpg = nonpaging_invlpg; | |
3328 | context->update_pte = nonpaging_update_pte; | |
3329 | context->shadow_root_level = kvm_x86_ops->get_tdp_level(); | |
3330 | context->root_hpa = INVALID_PAGE; | |
3331 | context->direct_map = true; | |
3332 | context->set_cr3 = kvm_x86_ops->set_tdp_cr3; | |
3333 | context->get_cr3 = get_cr3; | |
3334 | context->get_pdptr = kvm_pdptr_read; | |
3335 | context->inject_page_fault = kvm_inject_page_fault; | |
3336 | context->nx = is_nx(vcpu); | |
3337 | ||
3338 | if (!is_paging(vcpu)) { | |
3339 | context->nx = false; | |
3340 | context->gva_to_gpa = nonpaging_gva_to_gpa; | |
3341 | context->root_level = 0; | |
3342 | } else if (is_long_mode(vcpu)) { | |
3343 | context->nx = is_nx(vcpu); | |
3344 | reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL); | |
3345 | context->gva_to_gpa = paging64_gva_to_gpa; | |
3346 | context->root_level = PT64_ROOT_LEVEL; | |
3347 | } else if (is_pae(vcpu)) { | |
3348 | context->nx = is_nx(vcpu); | |
3349 | reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL); | |
3350 | context->gva_to_gpa = paging64_gva_to_gpa; | |
3351 | context->root_level = PT32E_ROOT_LEVEL; | |
3352 | } else { | |
3353 | context->nx = false; | |
3354 | reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); | |
3355 | context->gva_to_gpa = paging32_gva_to_gpa; | |
3356 | context->root_level = PT32_ROOT_LEVEL; | |
3357 | } | |
3358 | ||
3359 | return 0; | |
3360 | } | |
3361 | ||
3362 | int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) | |
3363 | { | |
3364 | int r; | |
3365 | bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); | |
3366 | ASSERT(vcpu); | |
3367 | ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); | |
3368 | ||
3369 | if (!is_paging(vcpu)) | |
3370 | r = nonpaging_init_context(vcpu, context); | |
3371 | else if (is_long_mode(vcpu)) | |
3372 | r = paging64_init_context(vcpu, context); | |
3373 | else if (is_pae(vcpu)) | |
3374 | r = paging32E_init_context(vcpu, context); | |
3375 | else | |
3376 | r = paging32_init_context(vcpu, context); | |
3377 | ||
3378 | vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); | |
3379 | vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); | |
3380 | vcpu->arch.mmu.base_role.smep_andnot_wp | |
3381 | = smep && !is_write_protection(vcpu); | |
3382 | ||
3383 | return r; | |
3384 | } | |
3385 | EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); | |
3386 | ||
3387 | static int init_kvm_softmmu(struct kvm_vcpu *vcpu) | |
3388 | { | |
3389 | int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); | |
3390 | ||
3391 | vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; | |
3392 | vcpu->arch.walk_mmu->get_cr3 = get_cr3; | |
3393 | vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read; | |
3394 | vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; | |
3395 | ||
3396 | return r; | |
3397 | } | |
3398 | ||
3399 | static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) | |
3400 | { | |
3401 | struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; | |
3402 | ||
3403 | g_context->get_cr3 = get_cr3; | |
3404 | g_context->get_pdptr = kvm_pdptr_read; | |
3405 | g_context->inject_page_fault = kvm_inject_page_fault; | |
3406 | ||
/*
* Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The
* translation of l2_gpa to l1_gpa is done using the
* arch.nested_mmu.gva_to_gpa function. In effect, the gva_to_gpa
* functions of mmu and nested_mmu are swapped.
*/
3413 | if (!is_paging(vcpu)) { | |
3414 | g_context->nx = false; | |
3415 | g_context->root_level = 0; | |
3416 | g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; | |
3417 | } else if (is_long_mode(vcpu)) { | |
3418 | g_context->nx = is_nx(vcpu); | |
3419 | reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL); | |
3420 | g_context->root_level = PT64_ROOT_LEVEL; | |
3421 | g_context->gva_to_gpa = paging64_gva_to_gpa_nested; | |
3422 | } else if (is_pae(vcpu)) { | |
3423 | g_context->nx = is_nx(vcpu); | |
3424 | reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL); | |
3425 | g_context->root_level = PT32E_ROOT_LEVEL; | |
3426 | g_context->gva_to_gpa = paging64_gva_to_gpa_nested; | |
3427 | } else { | |
3428 | g_context->nx = false; | |
3429 | reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL); | |
3430 | g_context->root_level = PT32_ROOT_LEVEL; | |
3431 | g_context->gva_to_gpa = paging32_gva_to_gpa_nested; | |
3432 | } | |
3433 | ||
3434 | return 0; | |
3435 | } | |
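
/*
* In sketch form, the translation functions set up above:
*
*   l2_gva --(arch.mmu.gva_to_gpa)---------> l1_gpa
*   l2_gpa --(arch.nested_mmu.gva_to_gpa)--> l1_gpa
*
* This restates the comment in init_kvm_nested_mmu() for quick
* reference; no new behavior is implied.
*/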
3436 | ||
3437 | static int init_kvm_mmu(struct kvm_vcpu *vcpu) | |
3438 | { | |
3439 | if (mmu_is_nested(vcpu)) | |
3440 | return init_kvm_nested_mmu(vcpu); | |
3441 | else if (tdp_enabled) | |
3442 | return init_kvm_tdp_mmu(vcpu); | |
3443 | else | |
3444 | return init_kvm_softmmu(vcpu); | |
3445 | } | |
3446 | ||
3447 | static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) | |
3448 | { | |
3449 | ASSERT(vcpu); | |
3450 | if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) | |
3451 | /* mmu.free() should set root_hpa = INVALID_PAGE */ | |
3452 | vcpu->arch.mmu.free(vcpu); | |
3453 | } | |
3454 | ||
3455 | int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) | |
3456 | { | |
3457 | destroy_kvm_mmu(vcpu); | |
3458 | return init_kvm_mmu(vcpu); | |
3459 | } | |
3460 | EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); | |
3461 | ||
3462 | int kvm_mmu_load(struct kvm_vcpu *vcpu) | |
3463 | { | |
3464 | int r; | |
3465 | ||
3466 | r = mmu_topup_memory_caches(vcpu); | |
3467 | if (r) | |
3468 | goto out; | |
3469 | r = mmu_alloc_roots(vcpu); | |
3470 | spin_lock(&vcpu->kvm->mmu_lock); | |
3471 | mmu_sync_roots(vcpu); | |
3472 | spin_unlock(&vcpu->kvm->mmu_lock); | |
3473 | if (r) | |
3474 | goto out; | |
3475 | /* set_cr3() should ensure TLB has been flushed */ | |
3476 | vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa); | |
3477 | out: | |
3478 | return r; | |
3479 | } | |
3480 | EXPORT_SYMBOL_GPL(kvm_mmu_load); | |
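
/*
* Typical usage, in sketch form: callers go through kvm_mmu_reload()
* (mmu.h), which only pays for a load once the root has been thrown
* away, roughly:
*
*   if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
*       r = kvm_mmu_load(vcpu);
*/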
3481 | ||
3482 | void kvm_mmu_unload(struct kvm_vcpu *vcpu) | |
3483 | { | |
3484 | mmu_free_roots(vcpu); | |
3485 | } | |
3486 | EXPORT_SYMBOL_GPL(kvm_mmu_unload); | |
3487 | ||
3488 | static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, | |
3489 | struct kvm_mmu_page *sp, u64 *spte, | |
3490 | const void *new) | |
3491 | { | |
3492 | if (sp->role.level != PT_PAGE_TABLE_LEVEL) { | |
3493 | ++vcpu->kvm->stat.mmu_pde_zapped; | |
3494 | return; | |
3495 | } | |
3496 | ||
3497 | ++vcpu->kvm->stat.mmu_pte_updated; | |
3498 | vcpu->arch.mmu.update_pte(vcpu, sp, spte, new); | |
3499 | } | |
3500 | ||
3501 | static bool need_remote_flush(u64 old, u64 new) | |
3502 | { | |
3503 | if (!is_shadow_present_pte(old)) | |
3504 | return false; | |
3505 | if (!is_shadow_present_pte(new)) | |
3506 | return true; | |
3507 | if ((old ^ new) & PT64_BASE_ADDR_MASK) | |
3508 | return true; | |
3509 | old ^= PT64_NX_MASK; | |
3510 | new ^= PT64_NX_MASK; | |
3511 | return (old & ~new & PT64_PERM_MASK) != 0; | |
3512 | } | |
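
/*
* Worked example for need_remote_flush(): downgrading a present,
* writable spte to a read-only mapping of the same frame leaves
* PT_WRITABLE_MASK set in old but clear in new, so
* (old & ~new & PT64_PERM_MASK) != 0 and remote TLBs must be flushed.
* NX is flipped first because it is a "deny" bit: setting NX in new
* removes execute permission, and the flip makes that look like any
* other dropped permission bit.
*/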
3513 | ||
3514 | static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page, | |
3515 | bool remote_flush, bool local_flush) | |
3516 | { | |
3517 | if (zap_page) | |
3518 | return; | |
3519 | ||
3520 | if (remote_flush) | |
3521 | kvm_flush_remote_tlbs(vcpu->kvm); | |
3522 | else if (local_flush) | |
3523 | kvm_mmu_flush_tlb(vcpu); | |
3524 | } | |
3525 | ||
3526 | static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, | |
3527 | const u8 *new, int *bytes) | |
3528 | { | |
3529 | u64 gentry; | |
3530 | int r; | |
3531 | ||
/*
* Assume that the pte write is to a page table of the same type as the
* current vcpu paging mode, since we update the sptes only when the
* modes match.
*/
3537 | if (is_pae(vcpu) && *bytes == 4) { | |
3538 | /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ | |
3539 | *gpa &= ~(gpa_t)7; | |
3540 | *bytes = 8; | |
3541 | r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8)); | |
3542 | if (r) | |
3543 | gentry = 0; | |
3544 | new = (const u8 *)&gentry; | |
3545 | } | |
3546 | ||
3547 | switch (*bytes) { | |
3548 | case 4: | |
3549 | gentry = *(const u32 *)new; | |
3550 | break; | |
3551 | case 8: | |
3552 | gentry = *(const u64 *)new; | |
3553 | break; | |
3554 | default: | |
3555 | gentry = 0; | |
3556 | break; | |
3557 | } | |
3558 | ||
3559 | return gentry; | |
3560 | } | |
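
/*
* Example: a PAE guest updating the low dword of a 64-bit gpte with a
* 4-byte write to gpa 0x1004 is widened above to an 8-byte read of the
* whole gpte at gpa 0x1000, so the caller always sees a complete
* candidate entry rather than half of one.
*/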
3561 | ||
3562 | /* | |
3563 | * If we're seeing too many writes to a page, it may no longer be a page table, | |
3564 | * or we may be forking, in which case it is better to unmap the page. | |
3565 | */ | |
3566 | static bool detect_write_flooding(struct kvm_mmu_page *sp, u64 *spte) | |
3567 | { | |
/*
* Skip write-flooding detection for sps at level 1, because such an sp
* can become unsync, in which case the guest page is no longer
* write-protected and writes to it are expected.
*/
3572 | if (sp->role.level == 1) | |
3573 | return false; | |
3574 | ||
3575 | return ++sp->write_flooding_count >= 3; | |
3576 | } | |
3577 | ||
3578 | /* | |
3579 | * Misaligned accesses are too much trouble to fix up; also, they usually | |
3580 | * indicate a page is not used as a page table. | |
3581 | */ | |
3582 | static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa, | |
3583 | int bytes) | |
3584 | { | |
3585 | unsigned offset, pte_size, misaligned; | |
3586 | ||
3587 | pgprintk("misaligned: gpa %llx bytes %d role %x\n", | |
3588 | gpa, bytes, sp->role.word); | |
3589 | ||
3590 | offset = offset_in_page(gpa); | |
3591 | pte_size = sp->role.cr4_pae ? 8 : 4; | |
3592 | ||
/*
* Sometimes the guest OS updates only the status bits of a gpte with a
* single-byte write to its low byte; for example, Linux's clear_bit()
* uses an andb instruction.
*/
3597 | if (!(offset & (pte_size - 1)) && bytes == 1) | |
3598 | return false; | |
3599 | ||
3600 | misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); | |
3601 | misaligned |= bytes < 4; | |
3602 | ||
3603 | return misaligned; | |
3604 | } | |
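
/*
* Worked example with 8-byte gptes (pte_size == 8): a 4-byte write at
* page offset 6 spans offsets 6..9 and crosses a gpte boundary, so
* (6 ^ 9) & ~7 == 8 flags it as misaligned; a 4-byte write at offset 8
* stays inside a single gpte and (8 ^ 11) & ~7 == 0.
*/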
3605 | ||
3606 | static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) | |
3607 | { | |
3608 | unsigned page_offset, quadrant; | |
3609 | u64 *spte; | |
3610 | int level; | |
3611 | ||
3612 | page_offset = offset_in_page(gpa); | |
3613 | level = sp->role.level; | |
3614 | *nspte = 1; | |
3615 | if (!sp->role.cr4_pae) { | |
3616 | page_offset <<= 1; /* 32->64 */ | |
3617 | /* | |
3618 | * A 32-bit pde maps 4MB while the shadow pdes map | |
3619 | * only 2MB. So we need to double the offset again | |
3620 | * and zap two pdes instead of one. | |
3621 | */ | |
3622 | if (level == PT32_ROOT_LEVEL) { | |
3623 | page_offset &= ~7; /* kill rounding error */ | |
3624 | page_offset <<= 1; | |
3625 | *nspte = 2; | |
3626 | } | |
3627 | quadrant = page_offset >> PAGE_SHIFT; | |
3628 | page_offset &= ~PAGE_MASK; | |
3629 | if (quadrant != sp->role.quadrant) | |
3630 | return NULL; | |
3631 | } | |
3632 | ||
3633 | spte = &sp->spt[page_offset / sizeof(*spte)]; | |
3634 | return spte; | |
3635 | } | |
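
/*
* Example for the !cr4_pae path: a write to the 32-bit gpte at page
* offset 0x7fc (entry 511 of the guest table) doubles to byte offset
* 0xff8, which lies in quadrant 0; only the shadow page with
* role.quadrant == 0 is written through, any other sp gets NULL.
*/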
3636 | ||
3637 | void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |
3638 | const u8 *new, int bytes) | |
3639 | { | |
3640 | gfn_t gfn = gpa >> PAGE_SHIFT; | |
3641 | union kvm_mmu_page_role mask = { .word = 0 }; | |
3642 | struct kvm_mmu_page *sp; | |
3643 | struct hlist_node *node; | |
3644 | LIST_HEAD(invalid_list); | |
3645 | u64 entry, gentry, *spte; | |
3646 | int npte; | |
3647 | bool remote_flush, local_flush, zap_page; | |
3648 | ||
/*
* If we don't have any indirect shadow pages, no guest page is
* write-protected, so we can simply return.
*/
3653 | if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) | |
3654 | return; | |
3655 | ||
3656 | zap_page = remote_flush = local_flush = false; | |
3657 | ||
3658 | pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); | |
3659 | ||
3660 | gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, new, &bytes); | |
3661 | ||
/*
* No need to check whether the memory allocation succeeded: pte
* prefetch is skipped if the cache does not hold enough objects.
*/
3667 | mmu_topup_memory_caches(vcpu); | |
3668 | ||
3669 | spin_lock(&vcpu->kvm->mmu_lock); | |
3670 | ++vcpu->kvm->stat.mmu_pte_write; | |
3671 | kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); | |
3672 | ||
3673 | mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; | |
3674 | for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { | |
3675 | spte = get_written_sptes(sp, gpa, &npte); | |
3676 | ||
3677 | if (detect_write_misaligned(sp, gpa, bytes) || | |
3678 | detect_write_flooding(sp, spte)) { | |
3679 | zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, | |
3680 | &invalid_list); | |
3681 | ++vcpu->kvm->stat.mmu_flooded; | |
3682 | continue; | |
3683 | } | |
3684 | ||
if (!spte)
continue;
3688 | ||
3689 | local_flush = true; | |
3690 | while (npte--) { | |
3691 | entry = *spte; | |
3692 | mmu_page_zap_pte(vcpu->kvm, sp, spte); | |
3693 | if (gentry && | |
3694 | !((sp->role.word ^ vcpu->arch.mmu.base_role.word) | |
3695 | & mask.word) && rmap_can_add(vcpu)) | |
3696 | mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); | |
3697 | if (!remote_flush && need_remote_flush(entry, *spte)) | |
3698 | remote_flush = true; | |
3699 | ++spte; | |
3700 | } | |
3701 | } | |
3702 | mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); | |
3703 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); | |
3704 | kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); | |
3705 | spin_unlock(&vcpu->kvm->mmu_lock); | |
3706 | } | |
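
/*
* kvm_mmu_pte_write() is reached from the instruction emulator's
* guest-write path (emulator_write_phys() in x86.c at the time of
* writing), i.e. whenever an emulated write may have touched a
* write-protected guest page table. The mask.word test above ensures
* a cached gpte is only propagated to sptes whose role matches the
* vcpu's current cr0_wp/cr4_pae/nxe configuration.
*/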
3707 | ||
3708 | int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) | |
3709 | { | |
3710 | gpa_t gpa; | |
3711 | int r; | |
3712 | ||
3713 | if (vcpu->arch.mmu.direct_map) | |
3714 | return 0; | |
3715 | ||
3716 | gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); | |
3717 | ||
3718 | r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); | |
3719 | ||
3720 | return r; | |
3721 | } | |
3722 | EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); | |
3723 | ||
3724 | void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | |
3725 | { | |
3726 | LIST_HEAD(invalid_list); | |
3727 | ||
3728 | while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES && | |
3729 | !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { | |
3730 | struct kvm_mmu_page *sp; | |
3731 | ||
3732 | sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, | |
3733 | struct kvm_mmu_page, link); | |
3734 | kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); | |
3735 | ++vcpu->kvm->stat.mmu_recycled; | |
3736 | } | |
3737 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); | |
3738 | } | |
3739 | ||
3740 | static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr) | |
3741 | { | |
3742 | if (vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu)) | |
3743 | return vcpu_match_mmio_gpa(vcpu, addr); | |
3744 | ||
3745 | return vcpu_match_mmio_gva(vcpu, addr); | |
3746 | } | |
3747 | ||
3748 | int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, | |
3749 | void *insn, int insn_len) | |
3750 | { | |
3751 | int r, emulation_type = EMULTYPE_RETRY; | |
3752 | enum emulation_result er; | |
3753 | ||
3754 | r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); | |
3755 | if (r < 0) | |
3756 | goto out; | |
3757 | ||
3758 | if (!r) { | |
3759 | r = 1; | |
3760 | goto out; | |
3761 | } | |
3762 | ||
3763 | if (is_mmio_page_fault(vcpu, cr2)) | |
3764 | emulation_type = 0; | |
3765 | ||
3766 | er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len); | |
3767 | ||
3768 | switch (er) { | |
3769 | case EMULATE_DONE: | |
3770 | return 1; | |
3771 | case EMULATE_DO_MMIO: | |
3772 | ++vcpu->stat.mmio_exits; | |
3773 | /* fall through */ | |
3774 | case EMULATE_FAIL: | |
3775 | return 0; | |
3776 | default: | |
3777 | BUG(); | |
3778 | } | |
3779 | out: | |
3780 | return r; | |
3781 | } | |
3782 | EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); | |
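
/*
* A note on emulation_type above: EMULTYPE_RETRY lets
* x86_emulate_instruction() unprotect the faulting page and re-execute
* the guest instruction instead of emulating it. A fault that matches
* a cached MMIO access has no backing page to unprotect, so the flag
* is dropped in that case.
*/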
3783 | ||
3784 | void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) | |
3785 | { | |
3786 | vcpu->arch.mmu.invlpg(vcpu, gva); | |
3787 | kvm_mmu_flush_tlb(vcpu); | |
3788 | ++vcpu->stat.invlpg; | |
3789 | } | |
3790 | EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); | |
3791 | ||
3792 | void kvm_enable_tdp(void) | |
3793 | { | |
3794 | tdp_enabled = true; | |
3795 | } | |
3796 | EXPORT_SYMBOL_GPL(kvm_enable_tdp); | |
3797 | ||
3798 | void kvm_disable_tdp(void) | |
3799 | { | |
3800 | tdp_enabled = false; | |
3801 | } | |
3802 | EXPORT_SYMBOL_GPL(kvm_disable_tdp); | |
3803 | ||
3804 | static void free_mmu_pages(struct kvm_vcpu *vcpu) | |
3805 | { | |
3806 | free_page((unsigned long)vcpu->arch.mmu.pae_root); | |
3807 | if (vcpu->arch.mmu.lm_root != NULL) | |
3808 | free_page((unsigned long)vcpu->arch.mmu.lm_root); | |
3809 | } | |
3810 | ||
3811 | static int alloc_mmu_pages(struct kvm_vcpu *vcpu) | |
3812 | { | |
3813 | struct page *page; | |
3814 | int i; | |
3815 | ||
3816 | ASSERT(vcpu); | |
3817 | ||
3818 | /* | |
3819 | * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. | |
3820 | * Therefore we need to allocate shadow page tables in the first | |
3821 | * 4GB of memory, which happens to fit the DMA32 zone. | |
3822 | */ | |
3823 | page = alloc_page(GFP_KERNEL | __GFP_DMA32); | |
3824 | if (!page) | |
3825 | return -ENOMEM; | |
3826 | ||
3827 | vcpu->arch.mmu.pae_root = page_address(page); | |
3828 | for (i = 0; i < 4; ++i) | |
3829 | vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; | |
3830 | ||
3831 | return 0; | |
3832 | } | |
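
/*
* The four pae_root entries allocated above hold the top-level entries
* whenever the shadow tree is PAE-shaped (shadow_root_level ==
* PT32E_ROOT_LEVEL); for a guest actually running in PAE mode they
* correspond to its four PDPTEs. They are filled in lazily when roots
* are built (see mmu_alloc_roots()).
*/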
3833 | ||
3834 | int kvm_mmu_create(struct kvm_vcpu *vcpu) | |
3835 | { | |
3836 | ASSERT(vcpu); | |
3837 | ||
3838 | vcpu->arch.walk_mmu = &vcpu->arch.mmu; | |
3839 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | |
3840 | vcpu->arch.mmu.translate_gpa = translate_gpa; | |
3841 | vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; | |
3842 | ||
3843 | return alloc_mmu_pages(vcpu); | |
3844 | } | |
3845 | ||
3846 | int kvm_mmu_setup(struct kvm_vcpu *vcpu) | |
3847 | { | |
3848 | ASSERT(vcpu); | |
3849 | ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); | |
3850 | ||
3851 | return init_kvm_mmu(vcpu); | |
3852 | } | |
3853 | ||
3854 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | |
3855 | { | |
3856 | struct kvm_mmu_page *sp; | |
3857 | ||
3858 | list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { | |
3859 | int i; | |
3860 | u64 *pt; | |
3861 | ||
3862 | if (!test_bit(slot, sp->slot_bitmap)) | |
3863 | continue; | |
3864 | ||
3865 | pt = sp->spt; | |
3866 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | |
3867 | if (!is_shadow_present_pte(pt[i]) || | |
3868 | !is_last_spte(pt[i], sp->role.level)) | |
3869 | continue; | |
3870 | ||
3871 | if (is_large_pte(pt[i])) { | |
3872 | drop_spte(kvm, &pt[i]); | |
3873 | --kvm->stat.lpages; | |
3874 | continue; | |
3875 | } | |
3876 | ||
/* use mmu_spte_update() to avoid a racy read-modify-write */
3878 | if (is_writable_pte(pt[i])) | |
3879 | mmu_spte_update(&pt[i], | |
3880 | pt[i] & ~PT_WRITABLE_MASK); | |
3881 | } | |
3882 | } | |
3883 | kvm_flush_remote_tlbs(kvm); | |
3884 | } | |
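
/*
* Note that large sptes are dropped above rather than write-protected:
* this path runs when dirty logging is turned on for the slot, and the
* subsequent refault remaps the range with 4k pages so writes can be
* tracked at page granularity.
*/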
3885 | ||
3886 | void kvm_mmu_zap_all(struct kvm *kvm) | |
3887 | { | |
3888 | struct kvm_mmu_page *sp, *node; | |
3889 | LIST_HEAD(invalid_list); | |
3890 | ||
3891 | spin_lock(&kvm->mmu_lock); | |
3892 | restart: | |
3893 | list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) | |
3894 | if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) | |
3895 | goto restart; | |
3896 | ||
3897 | kvm_mmu_commit_zap_page(kvm, &invalid_list); | |
3898 | spin_unlock(&kvm->mmu_lock); | |
3899 | } | |
3900 | ||
3901 | static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, | |
3902 | struct list_head *invalid_list) | |
3903 | { | |
3904 | struct kvm_mmu_page *page; | |
3905 | ||
3906 | page = container_of(kvm->arch.active_mmu_pages.prev, | |
3907 | struct kvm_mmu_page, link); | |
3908 | return kvm_mmu_prepare_zap_page(kvm, page, invalid_list); | |
3909 | } | |
3910 | ||
3911 | static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) | |
3912 | { | |
3913 | struct kvm *kvm; | |
3914 | struct kvm *kvm_freed = NULL; | |
3915 | int nr_to_scan = sc->nr_to_scan; | |
3916 | ||
3917 | if (nr_to_scan == 0) | |
3918 | goto out; | |
3919 | ||
3920 | raw_spin_lock(&kvm_lock); | |
3921 | ||
3922 | list_for_each_entry(kvm, &vm_list, vm_list) { | |
int idx;
3924 | LIST_HEAD(invalid_list); | |
3925 | ||
3926 | idx = srcu_read_lock(&kvm->srcu); | |
3927 | spin_lock(&kvm->mmu_lock); | |
3928 | if (!kvm_freed && nr_to_scan > 0 && | |
3929 | kvm->arch.n_used_mmu_pages > 0) { | |
kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list);
3932 | kvm_freed = kvm; | |
3933 | } | |
3934 | nr_to_scan--; | |
3935 | ||
3936 | kvm_mmu_commit_zap_page(kvm, &invalid_list); | |
3937 | spin_unlock(&kvm->mmu_lock); | |
3938 | srcu_read_unlock(&kvm->srcu, idx); | |
3939 | } | |
3940 | if (kvm_freed) | |
3941 | list_move_tail(&kvm_freed->vm_list, &vm_list); | |
3942 | ||
3943 | raw_spin_unlock(&kvm_lock); | |
3944 | ||
3945 | out: | |
3946 | return percpu_counter_read_positive(&kvm_total_used_mmu_pages); | |
3947 | } | |
3948 | ||
3949 | static struct shrinker mmu_shrinker = { | |
3950 | .shrink = mmu_shrink, | |
3951 | .seeks = DEFAULT_SEEKS * 10, | |
3952 | }; | |
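
/*
* seeks is ten times DEFAULT_SEEKS: shadow pages are costly to rebuild
* (every zapped page must be refaulted in by the guest), so the
* shrinker is deliberately biased toward reclaiming other caches
* first.
*/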
3953 | ||
3954 | static void mmu_destroy_caches(void) | |
3955 | { | |
3956 | if (pte_list_desc_cache) | |
3957 | kmem_cache_destroy(pte_list_desc_cache); | |
3958 | if (mmu_page_header_cache) | |
3959 | kmem_cache_destroy(mmu_page_header_cache); | |
3960 | } | |
3961 | ||
3962 | int kvm_mmu_module_init(void) | |
3963 | { | |
3964 | pte_list_desc_cache = kmem_cache_create("pte_list_desc", | |
3965 | sizeof(struct pte_list_desc), | |
3966 | 0, 0, NULL); | |
3967 | if (!pte_list_desc_cache) | |
3968 | goto nomem; | |
3969 | ||
3970 | mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", | |
3971 | sizeof(struct kvm_mmu_page), | |
3972 | 0, 0, NULL); | |
3973 | if (!mmu_page_header_cache) | |
3974 | goto nomem; | |
3975 | ||
3976 | if (percpu_counter_init(&kvm_total_used_mmu_pages, 0)) | |
3977 | goto nomem; | |
3978 | ||
3979 | register_shrinker(&mmu_shrinker); | |
3980 | ||
3981 | return 0; | |
3982 | ||
3983 | nomem: | |
3984 | mmu_destroy_caches(); | |
3985 | return -ENOMEM; | |
3986 | } | |
3987 | ||
/*
* Calculate the number of mmu pages needed for this kvm instance.
*/
3991 | unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) | |
3992 | { | |
3993 | unsigned int nr_mmu_pages; | |
3994 | unsigned int nr_pages = 0; | |
3995 | struct kvm_memslots *slots; | |
3996 | struct kvm_memory_slot *memslot; | |
3997 | ||
3998 | slots = kvm_memslots(kvm); | |
3999 | ||
4000 | kvm_for_each_memslot(memslot, slots) | |
4001 | nr_pages += memslot->npages; | |
4002 | ||
4003 | nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; | |
4004 | nr_mmu_pages = max(nr_mmu_pages, | |
4005 | (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); | |
4006 | ||
4007 | return nr_mmu_pages; | |
4008 | } | |
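
/*
* Worked example (illustrative numbers): a guest with 4GB of memslots
* has nr_pages == 1048576; with KVM_PERMILLE_MMU_PAGES == 20 (its
* value at the time of writing) this yields 1048576 * 20 / 1000 ==
* 20971 mmu pages, well above the KVM_MIN_ALLOC_MMU_PAGES floor.
*/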
4009 | ||
4010 | int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) | |
4011 | { | |
4012 | struct kvm_shadow_walk_iterator iterator; | |
4013 | u64 spte; | |
4014 | int nr_sptes = 0; | |
4015 | ||
4016 | walk_shadow_page_lockless_begin(vcpu); | |
4017 | for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { | |
4018 | sptes[iterator.level-1] = spte; | |
4019 | nr_sptes++; | |
4020 | if (!is_shadow_present_pte(spte)) | |
4021 | break; | |
4022 | } | |
4023 | walk_shadow_page_lockless_end(vcpu); | |
4024 | ||
4025 | return nr_sptes; | |
4026 | } | |
4027 | EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); | |
4028 | ||
4029 | void kvm_mmu_destroy(struct kvm_vcpu *vcpu) | |
4030 | { | |
4031 | ASSERT(vcpu); | |
4032 | ||
4033 | destroy_kvm_mmu(vcpu); | |
4034 | free_mmu_pages(vcpu); | |
4035 | mmu_free_memory_caches(vcpu); | |
4036 | } | |
4037 | ||
4038 | #ifdef CONFIG_KVM_MMU_AUDIT | |
4039 | #include "mmu_audit.c" | |
4040 | #else | |
4041 | static void mmu_audit_disable(void) { } | |
4042 | #endif | |
4043 | ||
4044 | void kvm_mmu_module_exit(void) | |
4045 | { | |
4046 | mmu_destroy_caches(); | |
4047 | percpu_counter_destroy(&kvm_total_used_mmu_pages); | |
4048 | unregister_shrinker(&mmu_shrinker); | |
4049 | mmu_audit_disable(); | |
4050 | } |