// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/sched/signal.h>
#include <trace/events/kvm.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_ras.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/virt.h>

#include "trace.h"
static pgd_t *boot_hyp_pgd;
static pgd_t *hyp_pgd;
static pgd_t *merged_hyp_pgd;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);

static unsigned long hyp_idmap_start;
static unsigned long hyp_idmap_end;
static phys_addr_t hyp_idmap_vector;

static unsigned long io_map_base;

#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))

#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
#define KVM_S2_FLAG_LOGGING_ACTIVE	(1UL << 1)
static bool is_iomap(unsigned long flags)
{
	return flags & KVM_S2PTE_FLAG_IS_IOMAP;
}

static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
}
/**
 * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
 * @kvm:	pointer to kvm structure.
 *
 * Interface to HYP function to flush all VM TLB entries
 */
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
}
static void kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
				   int level)
{
	kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ipa, level);
}
/*
 * D-Cache management functions. They take the page table entries by
 * value, as they are flushing the cache using the kernel mapping (or
 * kmap on 32bit).
 */
static void kvm_flush_dcache_pte(pte_t pte)
{
	__kvm_flush_dcache_pte(pte);
}

static void kvm_flush_dcache_pmd(pmd_t pmd)
{
	__kvm_flush_dcache_pmd(pmd);
}

static void kvm_flush_dcache_pud(pud_t pud)
{
	__kvm_flush_dcache_pud(pud);
}
static bool kvm_is_device_pfn(unsigned long pfn)
{
	return !pfn_valid(pfn);
}
/**
 * stage2_dissolve_pmd() - clear and flush huge PMD entry
 * @mmu:	pointer to mmu structure to operate on
 * @addr:	IPA
 * @pmd:	pmd pointer for IPA
 *
 * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs.
 */
static void stage2_dissolve_pmd(struct kvm_s2_mmu *mmu, phys_addr_t addr, pmd_t *pmd)
{
	if (!pmd_thp_or_huge(*pmd))
		return;

	pmd_clear(pmd);
	kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
	put_page(virt_to_page(pmd));
}
/**
 * stage2_dissolve_pud() - clear and flush huge PUD entry
 * @mmu:	pointer to mmu structure to operate on
 * @addr:	IPA
 * @pud:	pud pointer for IPA
 *
 * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs.
 */
static void stage2_dissolve_pud(struct kvm_s2_mmu *mmu, phys_addr_t addr, pud_t *pudp)
{
	struct kvm *kvm = mmu->kvm;

	if (!stage2_pud_huge(kvm, *pudp))
		return;

	stage2_pud_clear(kvm, pudp);
	kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
	put_page(virt_to_page(pudp));
}
static void clear_stage2_pgd_entry(struct kvm_s2_mmu *mmu, pgd_t *pgd, phys_addr_t addr)
{
	struct kvm *kvm = mmu->kvm;
	p4d_t *p4d_table __maybe_unused = stage2_p4d_offset(kvm, pgd, 0UL);

	stage2_pgd_clear(kvm, pgd);
	kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
	stage2_p4d_free(kvm, p4d_table);
	put_page(virt_to_page(pgd));
}
static void clear_stage2_p4d_entry(struct kvm_s2_mmu *mmu, p4d_t *p4d, phys_addr_t addr)
{
	struct kvm *kvm = mmu->kvm;
	pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, p4d, 0);

	stage2_p4d_clear(kvm, p4d);
	kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
	stage2_pud_free(kvm, pud_table);
	put_page(virt_to_page(p4d));
}
static void clear_stage2_pud_entry(struct kvm_s2_mmu *mmu, pud_t *pud, phys_addr_t addr)
{
	struct kvm *kvm = mmu->kvm;
	pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);

	VM_BUG_ON(stage2_pud_huge(kvm, *pud));
	stage2_pud_clear(kvm, pud);
	kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
	stage2_pmd_free(kvm, pmd_table);
	put_page(virt_to_page(pud));
}
static void clear_stage2_pmd_entry(struct kvm_s2_mmu *mmu, pmd_t *pmd, phys_addr_t addr)
{
	pte_t *pte_table = pte_offset_kernel(pmd, 0);

	VM_BUG_ON(pmd_thp_or_huge(*pmd));
	pmd_clear(pmd);
	kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
	free_page((unsigned long)pte_table);
	put_page(virt_to_page(pmd));
}
static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte)
{
	WRITE_ONCE(*ptep, new_pte);
	dsb(ishst);
}

static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd)
{
	WRITE_ONCE(*pmdp, new_pmd);
	dsb(ishst);
}

static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep)
{
	kvm_set_pmd(pmdp, kvm_mk_pmd(ptep));
}

static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp)
{
	WRITE_ONCE(*pudp, kvm_mk_pud(pmdp));
	dsb(ishst);
}

static inline void kvm_p4d_populate(p4d_t *p4dp, pud_t *pudp)
{
	WRITE_ONCE(*p4dp, kvm_mk_p4d(pudp));
	dsb(ishst);
}

static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp)
{
#ifndef __PAGETABLE_P4D_FOLDED
	WRITE_ONCE(*pgdp, kvm_mk_pgd(p4dp));
	dsb(ishst);
#endif
}
/*
 * Unmapping vs dcache management:
 *
 * If a guest maps certain memory pages as uncached, all writes will
 * bypass the data cache and go directly to RAM.  However, the CPUs
 * can still speculate reads (not writes) and fill cache lines with
 * data.
 *
 * Those cache lines will be *clean* cache lines though, so a
 * clean+invalidate operation is equivalent to an invalidate
 * operation, because no cache lines are marked dirty.
 *
 * Those clean cache lines could be filled prior to an uncached write
 * by the guest, and the cache coherent IO subsystem would therefore
 * end up writing old data to disk.
 *
 * This is why right after unmapping a page/section and invalidating
 * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
 * the IO subsystem will never hit in the cache.
 *
 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
 * we then fully enforce cacheability of RAM, no matter what the guest
 * does.
 */
static void unmap_stage2_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
			      phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t start_addr = addr;
	pte_t *pte, *start_pte;

	start_pte = pte = pte_offset_kernel(pmd, addr);
	do {
		if (!pte_none(*pte)) {
			pte_t old_pte = *pte;

			kvm_set_pte(pte, __pte(0));
			kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);

			/* No need to invalidate the cache for device mappings */
			if (!kvm_is_device_pfn(pte_pfn(old_pte)))
				kvm_flush_dcache_pte(old_pte);

			put_page(virt_to_page(pte));
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);

	if (stage2_pte_table_empty(mmu->kvm, start_pte))
		clear_stage2_pmd_entry(mmu, pmd, start_addr);
}
static void unmap_stage2_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
			      phys_addr_t addr, phys_addr_t end)
{
	struct kvm *kvm = mmu->kvm;
	phys_addr_t next, start_addr = addr;
	pmd_t *pmd, *start_pmd;

	start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr);
	do {
		next = stage2_pmd_addr_end(kvm, addr, end);
		if (!pmd_none(*pmd)) {
			if (pmd_thp_or_huge(*pmd)) {
				pmd_t old_pmd = *pmd;

				pmd_clear(pmd);
				kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);

				kvm_flush_dcache_pmd(old_pmd);

				put_page(virt_to_page(pmd));
			} else {
				unmap_stage2_ptes(mmu, pmd, addr, next);
			}
		}
	} while (pmd++, addr = next, addr != end);

	if (stage2_pmd_table_empty(kvm, start_pmd))
		clear_stage2_pud_entry(mmu, pud, start_addr);
}
static void unmap_stage2_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
			      phys_addr_t addr, phys_addr_t end)
{
	struct kvm *kvm = mmu->kvm;
	phys_addr_t next, start_addr = addr;
	pud_t *pud, *start_pud;

	start_pud = pud = stage2_pud_offset(kvm, p4d, addr);
	do {
		next = stage2_pud_addr_end(kvm, addr, end);
		if (!stage2_pud_none(kvm, *pud)) {
			if (stage2_pud_huge(kvm, *pud)) {
				pud_t old_pud = *pud;

				stage2_pud_clear(kvm, pud);
				kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
				kvm_flush_dcache_pud(old_pud);
				put_page(virt_to_page(pud));
			} else {
				unmap_stage2_pmds(mmu, pud, addr, next);
			}
		}
	} while (pud++, addr = next, addr != end);

	if (stage2_pud_table_empty(kvm, start_pud))
		clear_stage2_p4d_entry(mmu, p4d, start_addr);
}
static void unmap_stage2_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
			      phys_addr_t addr, phys_addr_t end)
{
	struct kvm *kvm = mmu->kvm;
	phys_addr_t next, start_addr = addr;
	p4d_t *p4d, *start_p4d;

	start_p4d = p4d = stage2_p4d_offset(kvm, pgd, addr);
	do {
		next = stage2_p4d_addr_end(kvm, addr, end);
		if (!stage2_p4d_none(kvm, *p4d))
			unmap_stage2_puds(mmu, p4d, addr, next);
	} while (p4d++, addr = next, addr != end);

	if (stage2_p4d_table_empty(kvm, start_p4d))
		clear_stage2_pgd_entry(mmu, pgd, start_addr);
}
/**
 * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
 * @mmu:   The KVM stage-2 MMU pointer
 * @start: The intermediate physical base address of the range to unmap
 * @size:  The size of the area to unmap
 *
 * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
 * destroying the VM), otherwise another faulting VCPU may come in and mess
 * with things behind our backs.
 */
static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
{
	struct kvm *kvm = mmu->kvm;
	pgd_t *pgd;
	phys_addr_t addr = start, end = start + size;
	phys_addr_t next;

	assert_spin_locked(&kvm->mmu_lock);
	WARN_ON(size & ~PAGE_MASK);

	pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
	do {
		/*
		 * Make sure the page table is still active, as another thread
		 * could have possibly freed the page table, while we released
		 * the lock.
		 */
		if (!READ_ONCE(mmu->pgd))
			break;
		next = stage2_pgd_addr_end(kvm, addr, end);
		if (!stage2_pgd_none(kvm, *pgd))
			unmap_stage2_p4ds(mmu, pgd, addr, next);
		/*
		 * If the range is too large, release the kvm->mmu_lock
		 * to prevent starvation and lockup detector warnings.
		 */
		if (next != end)
			cond_resched_lock(&kvm->mmu_lock);
	} while (pgd++, addr = next, addr != end);
}
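/*
 * Editor's note: an illustrative, standalone sketch (not part of this file
 * or the kernel build, hence the #if 0 guard) of the "addr_end" stepping
 * pattern shared by every do/while walker above. ex_pgd_addr_end() mirrors
 * the generic pgd_addr_end() clamping logic; the ex_/EX_ names and the 1GiB
 * per-entry coverage are assumptions made only for this example.
 */
#if 0
#include <stdio.h>

#define EX_PGDIR_SIZE	(1UL << 30)	/* assumed per-entry coverage */
#define EX_PGDIR_MASK	(~(EX_PGDIR_SIZE - 1))

static unsigned long ex_pgd_addr_end(unsigned long addr, unsigned long end)
{
	unsigned long boundary = (addr + EX_PGDIR_SIZE) & EX_PGDIR_MASK;

	/* clamp to 'end', and survive wrap-around at the top of the space */
	return (boundary - 1 < end - 1) ? boundary : end;
}

int main(void)
{
	unsigned long addr = 0x20000000UL, end = 0xa0000000UL;

	/* visits each entry-sized chunk exactly once, like the loops above */
	do {
		unsigned long next = ex_pgd_addr_end(addr, end);

		printf("chunk: %#lx..%#lx\n", addr, next);
		addr = next;
	} while (addr != end);
	return 0;
}
#endif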
static void stage2_flush_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
			      phys_addr_t addr, phys_addr_t end)
{
	pte_t *pte;

	pte = pte_offset_kernel(pmd, addr);
	do {
		if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
			kvm_flush_dcache_pte(*pte);
	} while (pte++, addr += PAGE_SIZE, addr != end);
}
static void stage2_flush_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
			      phys_addr_t addr, phys_addr_t end)
{
	struct kvm *kvm = mmu->kvm;
	pmd_t *pmd;
	phys_addr_t next;

	pmd = stage2_pmd_offset(kvm, pud, addr);
	do {
		next = stage2_pmd_addr_end(kvm, addr, end);
		if (!pmd_none(*pmd)) {
			if (pmd_thp_or_huge(*pmd))
				kvm_flush_dcache_pmd(*pmd);
			else
				stage2_flush_ptes(mmu, pmd, addr, next);
		}
	} while (pmd++, addr = next, addr != end);
}
static void stage2_flush_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
			      phys_addr_t addr, phys_addr_t end)
{
	struct kvm *kvm = mmu->kvm;
	pud_t *pud;
	phys_addr_t next;

	pud = stage2_pud_offset(kvm, p4d, addr);
	do {
		next = stage2_pud_addr_end(kvm, addr, end);
		if (!stage2_pud_none(kvm, *pud)) {
			if (stage2_pud_huge(kvm, *pud))
				kvm_flush_dcache_pud(*pud);
			else
				stage2_flush_pmds(mmu, pud, addr, next);
		}
	} while (pud++, addr = next, addr != end);
}
static void stage2_flush_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
			      phys_addr_t addr, phys_addr_t end)
{
	struct kvm *kvm = mmu->kvm;
	p4d_t *p4d;
	phys_addr_t next;

	p4d = stage2_p4d_offset(kvm, pgd, addr);
	do {
		next = stage2_p4d_addr_end(kvm, addr, end);
		if (!stage2_p4d_none(kvm, *p4d))
			stage2_flush_puds(mmu, p4d, addr, next);
	} while (p4d++, addr = next, addr != end);
}
static void stage2_flush_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
	phys_addr_t next;
	pgd_t *pgd;

	pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
	do {
		next = stage2_pgd_addr_end(kvm, addr, end);
		if (!stage2_pgd_none(kvm, *pgd))
			stage2_flush_p4ds(mmu, pgd, addr, next);

		if (next != end)
			cond_resched_lock(&kvm->mmu_lock);
	} while (pgd++, addr = next, addr != end);
}
/**
 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
 * @kvm: The struct kvm pointer
 *
 * Go through the stage 2 page tables and invalidate any cache lines
 * backing memory already mapped to the VM.
 */
static void stage2_flush_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, slots)
		stage2_flush_memslot(kvm, memslot);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}
static void clear_hyp_pgd_entry(pgd_t *pgd)
{
	p4d_t *p4d_table __maybe_unused = p4d_offset(pgd, 0UL);

	pgd_clear(pgd);
	p4d_free(NULL, p4d_table);
	put_page(virt_to_page(pgd));
}

static void clear_hyp_p4d_entry(p4d_t *p4d)
{
	pud_t *pud_table __maybe_unused = pud_offset(p4d, 0UL);

	VM_BUG_ON(p4d_huge(*p4d));
	p4d_clear(p4d);
	pud_free(NULL, pud_table);
	put_page(virt_to_page(p4d));
}

static void clear_hyp_pud_entry(pud_t *pud)
{
	pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);

	VM_BUG_ON(pud_huge(*pud));
	pud_clear(pud);
	pmd_free(NULL, pmd_table);
	put_page(virt_to_page(pud));
}

static void clear_hyp_pmd_entry(pmd_t *pmd)
{
	pte_t *pte_table = pte_offset_kernel(pmd, 0);

	VM_BUG_ON(pmd_thp_or_huge(*pmd));
	pmd_clear(pmd);
	pte_free_kernel(NULL, pte_table);
	put_page(virt_to_page(pmd));
}
static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
{
	pte_t *pte, *start_pte;

	start_pte = pte = pte_offset_kernel(pmd, addr);
	do {
		if (!pte_none(*pte)) {
			kvm_set_pte(pte, __pte(0));
			put_page(virt_to_page(pte));
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);

	if (hyp_pte_table_empty(start_pte))
		clear_hyp_pmd_entry(pmd);
}
static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t next;
	pmd_t *pmd, *start_pmd;

	start_pmd = pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		/* Hyp doesn't use huge pmds */
		if (!pmd_none(*pmd))
			unmap_hyp_ptes(pmd, addr, next);
	} while (pmd++, addr = next, addr != end);

	if (hyp_pmd_table_empty(start_pmd))
		clear_hyp_pud_entry(pud);
}
static void unmap_hyp_puds(p4d_t *p4d, phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t next;
	pud_t *pud, *start_pud;

	start_pud = pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		/* Hyp doesn't use huge puds */
		if (!pud_none(*pud))
			unmap_hyp_pmds(pud, addr, next);
	} while (pud++, addr = next, addr != end);

	if (hyp_pud_table_empty(start_pud))
		clear_hyp_p4d_entry(p4d);
}
static void unmap_hyp_p4ds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t next;
	p4d_t *p4d, *start_p4d;

	start_p4d = p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		/* Hyp doesn't use huge p4ds */
		if (!p4d_none(*p4d))
			unmap_hyp_puds(p4d, addr, next);
	} while (p4d++, addr = next, addr != end);

	if (hyp_p4d_table_empty(start_p4d))
		clear_hyp_pgd_entry(pgd);
}
static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd)
{
	return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1);
}
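/*
 * Editor's note: an illustrative, standalone sketch (not part of this file
 * or the kernel build, hence the #if 0 guard) of how kvm_pgd_index() picks
 * a PGD slot. The ex_/EX_ names, the PGDIR_SHIFT of 30 and the 64-entry
 * table are assumptions made only for this example.
 */
#if 0
#include <stdio.h>

#define EX_PGDIR_SHIFT	30	/* assumed: each PGD entry covers 1GiB */

static unsigned int ex_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd)
{
	/* ptrs_per_pgd is a power of two, so the AND is a cheap modulo */
	return (addr >> EX_PGDIR_SHIFT) & (ptrs_per_pgd - 1);
}

int main(void)
{
	/* 0x80000000 >> 30 == 2, so this lands in slot 2 of a 64-entry table */
	printf("slot = %u\n", ex_pgd_index(0x80000000UL, 64));
	return 0;
}
#endif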
static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd,
			      phys_addr_t start, u64 size)
{
	pgd_t *pgd;
	phys_addr_t addr = start, end = start + size;
	phys_addr_t next;

	/*
	 * We don't unmap anything from HYP, except at the hyp tear down.
	 * Hence, we don't have to invalidate the TLBs here.
	 */
	pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
	do {
		next = pgd_addr_end(addr, end);
		if (!pgd_none(*pgd))
			unmap_hyp_p4ds(pgd, addr, next);
	} while (pgd++, addr = next, addr != end);
}
static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
{
	__unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size);
}

static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size)
{
	__unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size);
}
/**
 * free_hyp_pgds - free Hyp-mode page tables
 *
 * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
 * therefore contains either mappings in the kernel memory area (above
 * PAGE_OFFSET), or device mappings in the idmap range.
 *
 * boot_hyp_pgd should only map the idmap range, and is only used in
 * the extended idmap case.
 */
void free_hyp_pgds(void)
{
	pgd_t *id_pgd;

	mutex_lock(&kvm_hyp_pgd_mutex);

	id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd;

	if (id_pgd) {
		/* In case we never called hyp_mmu_init() */
		if (!io_map_base)
			io_map_base = hyp_idmap_start;
		unmap_hyp_idmap_range(id_pgd, io_map_base,
				      hyp_idmap_start + PAGE_SIZE - io_map_base);
	}

	if (boot_hyp_pgd) {
		free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
		boot_hyp_pgd = NULL;
	}

	if (hyp_pgd) {
		unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET),
				(uintptr_t)high_memory - PAGE_OFFSET);

		free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
		hyp_pgd = NULL;
	}
	if (merged_hyp_pgd) {
		clear_page(merged_hyp_pgd);
		free_page((unsigned long)merged_hyp_pgd);
		merged_hyp_pgd = NULL;
	}

	mutex_unlock(&kvm_hyp_pgd_mutex);
}
static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
				    unsigned long end, unsigned long pfn,
				    pgprot_t prot)
{
	pte_t *pte;
	unsigned long addr;

	addr = start;
	do {
		pte = pte_offset_kernel(pmd, addr);
		kvm_set_pte(pte, kvm_pfn_pte(pfn, prot));
		get_page(virt_to_page(pte));
		pfn++;
	} while (addr += PAGE_SIZE, addr != end);
}
static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
				   unsigned long end, unsigned long pfn,
				   pgprot_t prot)
{
	pmd_t *pmd;
	pte_t *pte;
	unsigned long addr, next;

	addr = start;
	do {
		pmd = pmd_offset(pud, addr);

		BUG_ON(pmd_sect(*pmd));

		if (pmd_none(*pmd)) {
			pte = pte_alloc_one_kernel(NULL);
			if (!pte) {
				kvm_err("Cannot allocate Hyp pte\n");
				return -ENOMEM;
			}
			kvm_pmd_populate(pmd, pte);
			get_page(virt_to_page(pmd));
		}

		next = pmd_addr_end(addr, end);

		create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
		pfn += (next - addr) >> PAGE_SHIFT;
	} while (addr = next, addr != end);

	return 0;
}
static int create_hyp_pud_mappings(p4d_t *p4d, unsigned long start,
				   unsigned long end, unsigned long pfn,
				   pgprot_t prot)
{
	pud_t *pud;
	pmd_t *pmd;
	unsigned long addr, next;
	int ret;

	addr = start;
	do {
		pud = pud_offset(p4d, addr);

		if (pud_none_or_clear_bad(pud)) {
			pmd = pmd_alloc_one(NULL, addr);
			if (!pmd) {
				kvm_err("Cannot allocate Hyp pmd\n");
				return -ENOMEM;
			}
			kvm_pud_populate(pud, pmd);
			get_page(virt_to_page(pud));
		}

		next = pud_addr_end(addr, end);
		ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
		if (ret)
			return ret;
		pfn += (next - addr) >> PAGE_SHIFT;
	} while (addr = next, addr != end);

	return 0;
}
static int create_hyp_p4d_mappings(pgd_t *pgd, unsigned long start,
				   unsigned long end, unsigned long pfn,
				   pgprot_t prot)
{
	p4d_t *p4d;
	pud_t *pud;
	unsigned long addr, next;
	int ret;

	addr = start;
	do {
		p4d = p4d_offset(pgd, addr);

		if (p4d_none(*p4d)) {
			pud = pud_alloc_one(NULL, addr);
			if (!pud) {
				kvm_err("Cannot allocate Hyp pud\n");
				return -ENOMEM;
			}
			kvm_p4d_populate(p4d, pud);
			get_page(virt_to_page(p4d));
		}

		next = p4d_addr_end(addr, end);
		ret = create_hyp_pud_mappings(p4d, addr, next, pfn, prot);
		if (ret)
			return ret;
		pfn += (next - addr) >> PAGE_SHIFT;
	} while (addr = next, addr != end);

	return 0;
}
static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd,
				 unsigned long start, unsigned long end,
				 unsigned long pfn, pgprot_t prot)
{
	pgd_t *pgd;
	p4d_t *p4d;
	unsigned long addr, next;
	int err = 0;

	mutex_lock(&kvm_hyp_pgd_mutex);
	addr = start & PAGE_MASK;
	end = PAGE_ALIGN(end);
	do {
		pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);

		if (pgd_none(*pgd)) {
			p4d = p4d_alloc_one(NULL, addr);
			if (!p4d) {
				kvm_err("Cannot allocate Hyp p4d\n");
				err = -ENOMEM;
				goto out;
			}
			kvm_pgd_populate(pgd, p4d);
			get_page(virt_to_page(pgd));
		}

		next = pgd_addr_end(addr, end);
		err = create_hyp_p4d_mappings(pgd, addr, next, pfn, prot);
		if (err)
			goto out;
		pfn += (next - addr) >> PAGE_SHIFT;
	} while (addr = next, addr != end);
out:
	mutex_unlock(&kvm_hyp_pgd_mutex);
	return err;
}
static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
{
	if (!is_vmalloc_addr(kaddr)) {
		BUG_ON(!virt_addr_valid(kaddr));
		return __pa(kaddr);
	} else {
		return page_to_phys(vmalloc_to_page(kaddr)) +
		       offset_in_page(kaddr);
	}
}
/**
 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
 * @from:	The virtual kernel start address of the range
 * @to:		The virtual kernel end address of the range (exclusive)
 * @prot:	The protection to be applied to this range
 *
 * The same virtual address as the kernel virtual address is also used
 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
 * physical pages.
 */
int create_hyp_mappings(void *from, void *to, pgprot_t prot)
{
	phys_addr_t phys_addr;
	unsigned long virt_addr;
	unsigned long start = kern_hyp_va((unsigned long)from);
	unsigned long end = kern_hyp_va((unsigned long)to);

	if (is_kernel_in_hyp_mode())
		return 0;

	start = start & PAGE_MASK;
	end = PAGE_ALIGN(end);

	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
		int err;

		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
		err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD,
					    virt_addr, virt_addr + PAGE_SIZE,
					    __phys_to_pfn(phys_addr),
					    prot);
		if (err)
			return err;
	}

	return 0;
}
static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
					unsigned long *haddr, pgprot_t prot)
{
	pgd_t *pgd = hyp_pgd;
	unsigned long base;
	int ret = 0;

	mutex_lock(&kvm_hyp_pgd_mutex);

	/*
	 * This assumes that we have enough space below the idmap
	 * page to allocate our VAs. If not, the check below will
	 * kick. A potential alternative would be to detect that
	 * overflow and switch to an allocation above the idmap.
	 *
	 * The allocated size is always a multiple of PAGE_SIZE.
	 */
	size = PAGE_ALIGN(size + offset_in_page(phys_addr));
	base = io_map_base - size;

	/*
	 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
	 * allocating the new area, as it would indicate we've
	 * overflowed the idmap/IO address range.
	 */
	if ((base ^ io_map_base) & BIT(VA_BITS - 1))
		ret = -ENOMEM;
	else
		io_map_base = base;

	mutex_unlock(&kvm_hyp_pgd_mutex);

	if (ret)
		goto out;

	if (__kvm_cpu_uses_extended_idmap())
		pgd = boot_hyp_pgd;

	ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
				    base, base + size,
				    __phys_to_pfn(phys_addr), prot);
	if (ret)
		goto out;

	*haddr = base + offset_in_page(phys_addr);
out:
	return ret;
}
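/*
 * Editor's note: an illustrative, standalone sketch (not part of this file
 * or the kernel build, hence the #if 0 guard) of the BIT(VA_BITS - 1) trick
 * above. Allocating downwards from io_map_base must not flip the top VA bit;
 * if old and new base differ in that bit, the subtraction underflowed out of
 * the idmap/IO half of the address range. The ex_/EX_ names and the 48-bit
 * VA size are assumptions made only for this example.
 */
#if 0
#include <stdio.h>

#define EX_VA_BITS	48
#define EX_BIT(n)	(1UL << (n))

static int ex_alloc_below(unsigned long *io_map_base, unsigned long size)
{
	unsigned long base = *io_map_base - size;

	if ((base ^ *io_map_base) & EX_BIT(EX_VA_BITS - 1))
		return -1;	/* would overflow the allocation range */

	*io_map_base = base;
	return 0;
}

int main(void)
{
	unsigned long io_map_base = EX_BIT(EX_VA_BITS - 1) + 0x2000;

	printf("small: %d\n", ex_alloc_below(&io_map_base, 0x1000));  /* ok */
	printf("huge:  %d\n", ex_alloc_below(&io_map_base, 0x10000)); /* fails */
	return 0;
}
#endif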
/**
 * create_hyp_io_mappings - Map IO into both kernel and HYP
 * @phys_addr:	The physical start address which gets mapped
 * @size:	Size of the region being mapped
 * @kaddr:	Kernel VA for this mapping
 * @haddr:	HYP VA for this mapping
 */
int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
			   void __iomem **kaddr,
			   void __iomem **haddr)
{
	unsigned long addr;
	int ret;

	*kaddr = ioremap(phys_addr, size);
	if (!*kaddr)
		return -ENOMEM;

	if (is_kernel_in_hyp_mode()) {
		*haddr = *kaddr;
		return 0;
	}

	ret = __create_hyp_private_mapping(phys_addr, size,
					   &addr, PAGE_HYP_DEVICE);
	if (ret) {
		iounmap(*kaddr);
		*kaddr = NULL;
		*haddr = NULL;
		return ret;
	}

	*haddr = (void __iomem *)addr;
	return 0;
}
/**
 * create_hyp_exec_mappings - Map an executable range into HYP
 * @phys_addr:	The physical start address which gets mapped
 * @size:	Size of the region being mapped
 * @haddr:	HYP VA for this mapping
 */
int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
			     void **haddr)
{
	unsigned long addr;
	int ret;

	BUG_ON(is_kernel_in_hyp_mode());

	ret = __create_hyp_private_mapping(phys_addr, size,
					   &addr, PAGE_HYP_EXEC);
	if (ret) {
		*haddr = NULL;
		return ret;
	}

	*haddr = (void *)addr;
	return 0;
}
/**
 * kvm_init_stage2_mmu - Initialise a S2 MMU structure
 * @kvm:	The pointer to the KVM structure
 * @mmu:	The pointer to the s2 MMU structure
 *
 * Allocates only the stage-2 HW PGD level table(s) of size defined by
 * stage2_pgd_size(mmu->kvm).
 *
 * Note we don't need locking here as this is only called when the VM is
 * created, which can only be done once.
 */
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
{
	phys_addr_t pgd_phys;
	pgd_t *pgd;
	int cpu;

	if (mmu->pgd != NULL) {
		kvm_err("kvm_arch already initialized?\n");
		return -EINVAL;
	}

	/* Allocate the HW PGD, making sure that each page gets its own refcount */
	pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO);
	if (!pgd)
		return -ENOMEM;

	pgd_phys = virt_to_phys(pgd);
	if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)))
		return -EINVAL;

	mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
	if (!mmu->last_vcpu_ran) {
		free_pages_exact(pgd, stage2_pgd_size(kvm));
		return -ENOMEM;
	}

	for_each_possible_cpu(cpu)
		*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;

	mmu->kvm = kvm;
	mmu->pgd = pgd;
	mmu->pgd_phys = pgd_phys;
	mmu->vmid.vmid_gen = 0;

	return 0;
}
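/*
 * Editor's note: an illustrative, standalone sketch (not part of this file
 * or the kernel build, hence the #if 0 guard) of the kvm_vttbr_baddr_mask()
 * check above: the PGD base must be aligned so it fits the VTTBR BADDR
 * field. The ex_/EX_ names and the 64KiB alignment requirement are
 * assumptions made only for this example.
 */
#if 0
#include <stdio.h>

#define EX_BADDR_ALIGN	(1UL << 16)	/* assumed required alignment */
#define EX_BADDR_MASK	(~(EX_BADDR_ALIGN - 1))

int main(void)
{
	unsigned long ok = 0x80000000UL;	/* 64KiB aligned */
	unsigned long bad = 0x80001000UL;	/* only 4KiB aligned */

	/* any bits outside the mask mean the base cannot be programmed */
	printf("ok:  %s\n", (ok & ~EX_BADDR_MASK) ? "reject" : "accept");
	printf("bad: %s\n", (bad & ~EX_BADDR_MASK) ? "reject" : "accept");
	return 0;
}
#endif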
static void stage2_unmap_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	hva_t hva = memslot->userspace_addr;
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = PAGE_SIZE * memslot->npages;
	hva_t reg_end = hva + size;

	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them to find out if we should
	 * unmap any of them.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma = find_vma(current->mm, hva);
		hva_t vm_start, vm_end;

		if (!vma || vma->vm_start >= reg_end)
			break;

		/*
		 * Take the intersection of this VMA with the memory region
		 */
		vm_start = max(hva, vma->vm_start);
		vm_end = min(reg_end, vma->vm_end);

		if (!(vma->vm_flags & VM_PFNMAP)) {
			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
			unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
		}
		hva = vm_end;
	} while (hva < reg_end);
}
/**
 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
 * @kvm: The struct kvm pointer
 *
 * Go through the memregions and unmap any regular RAM
 * backing memory already mapped to the VM.
 */
void stage2_unmap_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	mmap_read_lock(current->mm);
	spin_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, slots)
		stage2_unmap_memslot(kvm, memslot);

	spin_unlock(&kvm->mmu_lock);
	mmap_read_unlock(current->mm);
	srcu_read_unlock(&kvm->srcu, idx);
}
void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
{
	struct kvm *kvm = mmu->kvm;
	void *pgd = NULL;

	spin_lock(&kvm->mmu_lock);
	if (mmu->pgd) {
		unmap_stage2_range(mmu, 0, kvm_phys_size(kvm));
		pgd = READ_ONCE(mmu->pgd);
		mmu->pgd = NULL;
	}
	spin_unlock(&kvm->mmu_lock);

	/* Free the HW pgd, one page at a time */
	if (pgd) {
		free_pages_exact(pgd, stage2_pgd_size(kvm));
		free_percpu(mmu->last_vcpu_ran);
	}
}
static p4d_t *stage2_get_p4d(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
			     phys_addr_t addr)
{
	struct kvm *kvm = mmu->kvm;
	pgd_t *pgd;
	p4d_t *p4d;

	pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
	if (stage2_pgd_none(kvm, *pgd)) {
		if (!cache)
			return NULL;
		p4d = kvm_mmu_memory_cache_alloc(cache);
		stage2_pgd_populate(kvm, pgd, p4d);
		get_page(virt_to_page(pgd));
	}

	return stage2_p4d_offset(kvm, pgd, addr);
}
static pud_t *stage2_get_pud(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
			     phys_addr_t addr)
{
	struct kvm *kvm = mmu->kvm;
	p4d_t *p4d;
	pud_t *pud;

	p4d = stage2_get_p4d(mmu, cache, addr);
	if (stage2_p4d_none(kvm, *p4d)) {
		if (!cache)
			return NULL;
		pud = kvm_mmu_memory_cache_alloc(cache);
		stage2_p4d_populate(kvm, p4d, pud);
		get_page(virt_to_page(p4d));
	}

	return stage2_pud_offset(kvm, p4d, addr);
}
static pmd_t *stage2_get_pmd(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
			     phys_addr_t addr)
{
	struct kvm *kvm = mmu->kvm;
	pud_t *pud;
	pmd_t *pmd;

	pud = stage2_get_pud(mmu, cache, addr);
	if (!pud || stage2_pud_huge(kvm, *pud))
		return NULL;

	if (stage2_pud_none(kvm, *pud)) {
		if (!cache)
			return NULL;
		pmd = kvm_mmu_memory_cache_alloc(cache);
		stage2_pud_populate(kvm, pud, pmd);
		get_page(virt_to_page(pud));
	}

	return stage2_pmd_offset(kvm, pud, addr);
}
static int stage2_set_pmd_huge(struct kvm_s2_mmu *mmu,
			       struct kvm_mmu_memory_cache *cache,
			       phys_addr_t addr, const pmd_t *new_pmd)
{
	pmd_t *pmd, old_pmd;

retry:
	pmd = stage2_get_pmd(mmu, cache, addr);
	VM_BUG_ON(!pmd);

	old_pmd = *pmd;
	/*
	 * Multiple vcpus faulting on the same PMD entry, can
	 * lead to them sequentially updating the PMD with the
	 * same value. Following the break-before-make
	 * (pmd_clear() followed by tlb_flush()) process can
	 * hinder forward progress due to refaults generated
	 * on missing translations.
	 *
	 * Skip updating the page table if the entry is
	 * unchanged.
	 */
	if (pmd_val(old_pmd) == pmd_val(*new_pmd))
		return 0;

	if (pmd_present(old_pmd)) {
		/*
		 * If we already have PTE level mapping for this block,
		 * we must unmap it to avoid inconsistent TLB state and
		 * leaking the table page. We could end up in this situation
		 * if the memory slot was marked for dirty logging and was
		 * reverted, leaving PTE level mappings for the pages accessed
		 * during the period. So, unmap the PTE level mapping for this
		 * block and retry, as we could have released the upper level
		 * table in the process.
		 *
		 * Normal THP split/merge follows mmu_notifier callbacks and do
		 * get handled accordingly.
		 */
		if (!pmd_thp_or_huge(old_pmd)) {
			unmap_stage2_range(mmu, addr & S2_PMD_MASK, S2_PMD_SIZE);
			goto retry;
		}
		/*
		 * Mapping in huge pages should only happen through a
		 * fault.  If a page is merged into a transparent huge
		 * page, the individual subpages of that huge page
		 * should be unmapped through MMU notifiers before we
		 * get here.
		 *
		 * Merging of CompoundPages is not supported; they
		 * should become splitting first, unmapped, merged,
		 * and mapped back in on-demand.
		 */
		WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
		pmd_clear(pmd);
		kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
	} else {
		get_page(virt_to_page(pmd));
	}

	kvm_set_pmd(pmd, *new_pmd);
	return 0;
}
static int stage2_set_pud_huge(struct kvm_s2_mmu *mmu,
			       struct kvm_mmu_memory_cache *cache,
			       phys_addr_t addr, const pud_t *new_pudp)
{
	struct kvm *kvm = mmu->kvm;
	pud_t *pudp, old_pud;

retry:
	pudp = stage2_get_pud(mmu, cache, addr);
	VM_BUG_ON(!pudp);

	old_pud = *pudp;

	/*
	 * A large number of vcpus faulting on the same stage 2 entry,
	 * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
	 * Skip updating the page tables if there is no change.
	 */
	if (pud_val(old_pud) == pud_val(*new_pudp))
		return 0;

	if (stage2_pud_present(kvm, old_pud)) {
		/*
		 * If we already have table level mapping for this block, unmap
		 * the range for this block and retry.
		 */
		if (!stage2_pud_huge(kvm, old_pud)) {
			unmap_stage2_range(mmu, addr & S2_PUD_MASK, S2_PUD_SIZE);
			goto retry;
		}

		WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
		stage2_pud_clear(kvm, pudp);
		kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
	} else {
		get_page(virt_to_page(pudp));
	}

	kvm_set_pud(pudp, *new_pudp);
	return 0;
}
/*
 * stage2_get_leaf_entry - walk the stage2 VM page tables and return
 * true if a valid and present leaf-entry is found. A pointer to the
 * leaf-entry is returned in the appropriate level variable - pudpp,
 * pmdpp, ptepp.
 */
static bool stage2_get_leaf_entry(struct kvm_s2_mmu *mmu, phys_addr_t addr,
				  pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
{
	struct kvm *kvm = mmu->kvm;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	*pudpp = NULL;
	*pmdpp = NULL;
	*ptepp = NULL;

	pudp = stage2_get_pud(mmu, NULL, addr);
	if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
		return false;

	if (stage2_pud_huge(kvm, *pudp)) {
		*pudpp = pudp;
		return true;
	}

	pmdp = stage2_pmd_offset(kvm, pudp, addr);
	if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
		return false;

	if (pmd_thp_or_huge(*pmdp)) {
		*pmdpp = pmdp;
		return true;
	}

	ptep = pte_offset_kernel(pmdp, addr);
	if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
		return false;

	*ptepp = ptep;
	return true;
}
static bool stage2_is_exec(struct kvm_s2_mmu *mmu, phys_addr_t addr, unsigned long sz)
{
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;
	bool found;

	found = stage2_get_leaf_entry(mmu, addr, &pudp, &pmdp, &ptep);
	if (!found)
		return false;

	if (pudp)
		return sz <= PUD_SIZE && kvm_s2pud_exec(pudp);
	else if (pmdp)
		return sz <= PMD_SIZE && kvm_s2pmd_exec(pmdp);
	else
		return sz == PAGE_SIZE && kvm_s2pte_exec(ptep);
}
static int stage2_set_pte(struct kvm_s2_mmu *mmu,
			  struct kvm_mmu_memory_cache *cache,
			  phys_addr_t addr, const pte_t *new_pte,
			  unsigned long flags)
{
	struct kvm *kvm = mmu->kvm;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte, old_pte;
	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
	bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;

	VM_BUG_ON(logging_active && !cache);

	/* Create stage-2 page table mapping - Levels 0 and 1 */
	pud = stage2_get_pud(mmu, cache, addr);
	if (!pud) {
		/*
		 * Ignore calls from kvm_set_spte_hva for unallocated
		 * address ranges.
		 */
		return 0;
	}

	/*
	 * While dirty page logging - dissolve huge PUD, then continue
	 * on to allocate page.
	 */
	if (logging_active)
		stage2_dissolve_pud(mmu, addr, pud);

	if (stage2_pud_none(kvm, *pud)) {
		if (!cache)
			return 0; /* ignore calls from kvm_set_spte_hva */
		pmd = kvm_mmu_memory_cache_alloc(cache);
		stage2_pud_populate(kvm, pud, pmd);
		get_page(virt_to_page(pud));
	}

	pmd = stage2_pmd_offset(kvm, pud, addr);
	if (!pmd) {
		/*
		 * Ignore calls from kvm_set_spte_hva for unallocated
		 * address ranges.
		 */
		return 0;
	}

	/*
	 * While dirty page logging - dissolve huge PMD, then continue on to
	 * allocate page.
	 */
	if (logging_active)
		stage2_dissolve_pmd(mmu, addr, pmd);

	/* Create stage-2 page mappings - Level 2 */
	if (pmd_none(*pmd)) {
		if (!cache)
			return 0; /* ignore calls from kvm_set_spte_hva */
		pte = kvm_mmu_memory_cache_alloc(cache);
		kvm_pmd_populate(pmd, pte);
		get_page(virt_to_page(pmd));
	}

	pte = pte_offset_kernel(pmd, addr);

	if (iomap && pte_present(*pte))
		return -EFAULT;

	/* Create 2nd stage page table mapping - Level 3 */
	old_pte = *pte;
	if (pte_present(old_pte)) {
		/* Skip page table update if there is no change */
		if (pte_val(old_pte) == pte_val(*new_pte))
			return 0;

		kvm_set_pte(pte, __pte(0));
		kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
	} else {
		get_page(virt_to_page(pte));
	}

	kvm_set_pte(pte, *new_pte);
	return 0;
}
#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
static int stage2_ptep_test_and_clear_young(pte_t *pte)
{
	if (pte_young(*pte)) {
		*pte = pte_mkold(*pte);
		return 1;
	}
	return 0;
}
#else
static int stage2_ptep_test_and_clear_young(pte_t *pte)
{
	return __ptep_test_and_clear_young(pte);
}
#endif

static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
{
	return stage2_ptep_test_and_clear_young((pte_t *)pmd);
}

static int stage2_pudp_test_and_clear_young(pud_t *pud)
{
	return stage2_ptep_test_and_clear_young((pte_t *)pud);
}
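/*
 * Editor's note: an illustrative, standalone sketch (not part of this file
 * or the kernel build, hence the #if 0 guard) of the test-and-clear aging
 * idiom above, applied to a fake "young" bit. The ex_/EX_ names and bit 10
 * as the accessed-flag position are assumptions made only for this example.
 */
#if 0
#include <stdio.h>

#define EX_PTE_AF	(1UL << 10)	/* assumed accessed-flag position */

static int ex_test_and_clear_young(unsigned long *pte)
{
	if (*pte & EX_PTE_AF) {
		*pte &= ~EX_PTE_AF;	/* page was referenced: age it */
		return 1;
	}
	return 0;
}

int main(void)
{
	unsigned long pte = 0x40000000UL | EX_PTE_AF;

	printf("%d %d\n", ex_test_and_clear_young(&pte),
	       ex_test_and_clear_young(&pte));	/* prints "1 0" */
	return 0;
}
#endif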
/**
 * kvm_phys_addr_ioremap - map a device range to guest IPA
 *
 * @kvm:	The KVM pointer
 * @guest_ipa:	The IPA at which to insert the mapping
 * @pa:		The physical address of the device
 * @size:	The size of the mapping
 * @writable:	Whether or not to create a writable mapping
 */
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
			  phys_addr_t pa, unsigned long size, bool writable)
{
	phys_addr_t addr, end;
	int ret = 0;
	unsigned long pfn;
	struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };

	end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
	pfn = __phys_to_pfn(pa);

	for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
		pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE);

		if (writable)
			pte = kvm_s2pte_mkwrite(pte);

		ret = kvm_mmu_topup_memory_cache(&cache,
						 kvm_mmu_cache_min_pages(kvm));
		if (ret)
			goto out;
		spin_lock(&kvm->mmu_lock);
		ret = stage2_set_pte(&kvm->arch.mmu, &cache, addr, &pte,
				     KVM_S2PTE_FLAG_IS_IOMAP);
		spin_unlock(&kvm->mmu_lock);
		if (ret)
			goto out;

		pfn++;
	}

out:
	kvm_mmu_free_memory_cache(&cache);
	return ret;
}
/**
 * stage2_wp_ptes - write protect PMD range
 * @pmd:	pointer to pmd entry
 * @addr:	range start address
 * @end:	range end address
 */
static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
{
	pte_t *pte;

	pte = pte_offset_kernel(pmd, addr);
	do {
		if (!pte_none(*pte)) {
			if (!kvm_s2pte_readonly(pte))
				kvm_set_s2pte_readonly(pte);
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);
}
/**
 * stage2_wp_pmds - write protect PUD range
 * @mmu:	stage-2 MMU instance for the VM
 * @pud:	pointer to pud entry
 * @addr:	range start address
 * @end:	range end address
 */
static void stage2_wp_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
			   phys_addr_t addr, phys_addr_t end)
{
	struct kvm *kvm = mmu->kvm;
	pmd_t *pmd;
	phys_addr_t next;

	pmd = stage2_pmd_offset(kvm, pud, addr);

	do {
		next = stage2_pmd_addr_end(kvm, addr, end);
		if (!pmd_none(*pmd)) {
			if (pmd_thp_or_huge(*pmd)) {
				if (!kvm_s2pmd_readonly(pmd))
					kvm_set_s2pmd_readonly(pmd);
			} else {
				stage2_wp_ptes(pmd, addr, next);
			}
		}
	} while (pmd++, addr = next, addr != end);
}
/**
 * stage2_wp_puds - write protect P4D range
 * @mmu:	stage-2 MMU instance for the VM
 * @p4d:	pointer to p4d entry
 * @addr:	range start address
 * @end:	range end address
 */
static void stage2_wp_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
			   phys_addr_t addr, phys_addr_t end)
{
	struct kvm *kvm = mmu->kvm;
	pud_t *pud;
	phys_addr_t next;

	pud = stage2_pud_offset(kvm, p4d, addr);
	do {
		next = stage2_pud_addr_end(kvm, addr, end);
		if (!stage2_pud_none(kvm, *pud)) {
			if (stage2_pud_huge(kvm, *pud)) {
				if (!kvm_s2pud_readonly(pud))
					kvm_set_s2pud_readonly(pud);
			} else {
				stage2_wp_pmds(mmu, pud, addr, next);
			}
		}
	} while (pud++, addr = next, addr != end);
}
/**
 * stage2_wp_p4ds - write protect PGD range
 * @mmu:	stage-2 MMU instance for the VM
 * @pgd:	pointer to pgd entry
 * @addr:	range start address
 * @end:	range end address
 */
static void stage2_wp_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
			   phys_addr_t addr, phys_addr_t end)
{
	struct kvm *kvm = mmu->kvm;
	p4d_t *p4d;
	phys_addr_t next;

	p4d = stage2_p4d_offset(kvm, pgd, addr);
	do {
		next = stage2_p4d_addr_end(kvm, addr, end);
		if (!stage2_p4d_none(kvm, *p4d))
			stage2_wp_puds(mmu, p4d, addr, next);
	} while (p4d++, addr = next, addr != end);
}
/**
 * stage2_wp_range() - write protect stage2 memory region range
 * @mmu:	stage-2 MMU instance for the VM
 * @addr:	Start address of range
 * @end:	End address of range
 */
static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
	struct kvm *kvm = mmu->kvm;
	pgd_t *pgd;
	phys_addr_t next;

	pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
	do {
		/*
		 * Release kvm_mmu_lock periodically if the memory region is
		 * large. Otherwise, we may see kernel panics with
		 * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
		 * CONFIG_LOCKDEP. Additionally, holding the lock too long
		 * will also starve other vCPUs. We have to also make sure
		 * that the page tables are not freed while we released
		 * the lock.
		 */
		cond_resched_lock(&kvm->mmu_lock);
		if (!READ_ONCE(mmu->pgd))
			break;
		next = stage2_pgd_addr_end(kvm, addr, end);
		if (stage2_pgd_present(kvm, *pgd))
			stage2_wp_p4ds(mmu, pgd, addr, next);
	} while (pgd++, addr = next, addr != end);
}
/**
 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
 * @kvm:	The KVM pointer
 * @slot:	The memory slot to write protect
 *
 * Called to start logging dirty pages after memory region
 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
 * all present PUD, PMD and PTEs are write protected in the memory region.
 * Afterwards read of dirty page log can be called.
 *
 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
 * serializing operations for VM memory regions.
 */
void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);
	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
	phys_addr_t start, end;

	if (WARN_ON_ONCE(!memslot))
		return;

	start = memslot->base_gfn << PAGE_SHIFT;
	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;

	spin_lock(&kvm->mmu_lock);
	stage2_wp_range(&kvm->arch.mmu, start, end);
	spin_unlock(&kvm->mmu_lock);
	kvm_flush_remote_tlbs(kvm);
}
/**
 * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
 * @kvm:	The KVM pointer
 * @slot:	The memory slot associated with mask
 * @gfn_offset:	The gfn offset in memory slot
 * @mask:	The mask of dirty pages at offset 'gfn_offset' in this memory
 *		slot to be write protected
 *
 * Walks bits set in mask write protects the associated pte's. Caller must
 * acquire kvm_mmu_lock.
 */
static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
					    struct kvm_memory_slot *slot,
					    gfn_t gfn_offset, unsigned long mask)
{
	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
	phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;

	stage2_wp_range(&kvm->arch.mmu, start, end);
}
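/*
 * Editor's note: an illustrative, standalone sketch (not part of this file
 * or the kernel build, hence the #if 0 guard) of how a dirty-bitmap word is
 * turned into one IPA range above. __ffs()/__fls() find the lowest/highest
 * set bit; they are emulated here with GCC builtins. The ex_/EX_ names and
 * the sample values are assumptions made only for this example.
 */
#if 0
#include <stdio.h>

#define EX_PAGE_SHIFT	12

static void ex_mask_to_range(unsigned long base_gfn, unsigned long mask)
{
	unsigned long first = __builtin_ctzl(mask);	/* ~ __ffs(mask) */
	unsigned long last = 63 - __builtin_clzl(mask);	/* ~ __fls(mask) */
	unsigned long start = (base_gfn + first) << EX_PAGE_SHIFT;
	unsigned long end = (base_gfn + last + 1) << EX_PAGE_SHIFT;

	/* every dirty page flagged in 'mask' falls inside [start, end) */
	printf("wp range: %#lx..%#lx\n", start, end);
}

int main(void)
{
	ex_mask_to_range(0x1000, 0x00f0);	/* pages 4..7 of this word */
	return 0;
}
#endif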
/**
 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
 *		dirty pages.
 *
 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
 * enable dirty logging for them.
 */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
					     struct kvm_memory_slot *slot,
					     gfn_t gfn_offset, unsigned long mask)
{
	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}
static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
{
	__clean_dcache_guest_page(pfn, size);
}

static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
{
	__invalidate_icache_guest_page(pfn, size);
}

static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
{
	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
}
static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
					       unsigned long hva,
					       unsigned long map_size)
{
	gpa_t gpa_start;
	hva_t uaddr_start, uaddr_end;
	size_t size;

	/* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
	if (map_size == PAGE_SIZE)
		return true;

	size = memslot->npages * PAGE_SIZE;

	gpa_start = memslot->base_gfn << PAGE_SHIFT;

	uaddr_start = memslot->userspace_addr;
	uaddr_end = uaddr_start + size;

	/*
	 * Pages belonging to memslots that don't have the same alignment
	 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
	 * PMD/PUD entries, because we'll end up mapping the wrong pages.
	 *
	 * Consider a layout like the following:
	 *
	 *    memslot->userspace_addr:
	 *    +-----+--------------------+--------------------+---+
	 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
	 *    +-----+--------------------+--------------------+---+
	 *
	 *    memslot->base_gfn << PAGE_SHIFT:
	 *      +---+--------------------+--------------------+-----+
	 *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
	 *      +---+--------------------+--------------------+-----+
	 *
	 * If we create those stage-2 blocks, we'll end up with this incorrect
	 * mapping:
	 *   d -> f
	 *   e -> g
	 *   f -> h
	 */
	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
		return false;

	/*
	 * Next, let's make sure we're not trying to map anything not covered
	 * by the memslot. This means we have to prohibit block size mappings
	 * for the beginning and end of a non-block aligned and non-block sized
	 * memory slot (illustrated by the head and tail parts of the
	 * userspace view above containing pages 'abcde' and 'xyz',
	 * respectively).
	 *
	 * Note that it doesn't matter if we do the check using the
	 * userspace_addr or the base_gfn, as both are equally aligned (per
	 * the check above) and equally sized.
	 */
	return (hva & ~(map_size - 1)) >= uaddr_start &&
	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
}
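/*
 * Editor's note: an illustrative, standalone sketch (not part of this file
 * or the kernel build, hence the #if 0 guard) of the two conditions checked
 * above, reduced to plain arithmetic. The ex_/EX_ names, the 2MiB block size
 * and the sample addresses are assumptions made only for this example.
 */
#if 0
#include <stdio.h>
#include <stdbool.h>

#define EX_BLOCK_SIZE	(2UL << 20)	/* assumed PMD-sized block */

static bool ex_supports_block(unsigned long gpa_start, unsigned long uaddr_start,
			      unsigned long uaddr_end, unsigned long hva)
{
	/* IPA and HVA must be congruent modulo the block size... */
	if ((gpa_start & (EX_BLOCK_SIZE - 1)) != (uaddr_start & (EX_BLOCK_SIZE - 1)))
		return false;

	/* ...and the whole block around hva must sit inside the memslot */
	return (hva & ~(EX_BLOCK_SIZE - 1)) >= uaddr_start &&
	       (hva & ~(EX_BLOCK_SIZE - 1)) + EX_BLOCK_SIZE <= uaddr_end;
}

int main(void)
{
	/* same offset within a 2MiB block on both sides: block mapping is ok */
	printf("%d\n", ex_supports_block(0x40000000, 0x7f0000000000,
					 0x7f0000000000 + (64UL << 20),
					 0x7f0000200000));
	/* misaligned by one page: must fall back to PAGE_SIZE mappings */
	printf("%d\n", ex_supports_block(0x40001000, 0x7f0000000000,
					 0x7f0000000000 + (64UL << 20),
					 0x7f0000200000));
	return 0;
}
#endif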
/*
 * Check if the given hva is backed by a transparent huge page (THP) and
 * whether it can be mapped using block mapping in stage2. If so, adjust
 * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently
 * supported. This will need to be updated to support other THP sizes.
 *
 * Returns the size of the mapping.
 */
static unsigned long
transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
			    unsigned long hva, kvm_pfn_t *pfnp,
			    phys_addr_t *ipap)
{
	kvm_pfn_t pfn = *pfnp;

	/*
	 * Make sure the adjustment is done only for THP pages. Also make
	 * sure that the HVA and IPA are sufficiently aligned and that the
	 * block map is contained within the memslot.
	 */
	if (kvm_is_transparent_hugepage(pfn) &&
	    fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
		/*
		 * The address we faulted on is backed by a transparent huge
		 * page.  However, because we map the compound huge page and
		 * not the individual tail page, we need to transfer the
		 * refcount to the head page.  We have to be careful that the
		 * THP doesn't start to split while we are adjusting the
		 * refcounts.
		 *
		 * We are sure this doesn't happen, because mmu_notifier_retry
		 * was successful and we are holding the mmu_lock, so if this
		 * THP is trying to split, it will be blocked in the mmu
		 * notifier before touching any of the pages, specifically
		 * before being able to call __split_huge_page_refcount().
		 *
		 * We can therefore safely transfer the refcount from PG_tail
		 * to PG_head and switch the pfn from a tail page to the head
		 * page accordingly.
		 */
		*ipap &= PMD_MASK;
		kvm_release_pfn_clean(pfn);
		pfn &= ~(PTRS_PER_PMD - 1);
		kvm_get_pfn(pfn);
		*pfnp = pfn;

		return PMD_SIZE;
	}

	/* Use page mapping if we cannot use block mapping. */
	return PAGE_SIZE;
}
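/*
 * Editor's note: an illustrative, standalone sketch (not part of this file
 * or the kernel build, hence the #if 0 guard) of the pfn adjustment above.
 * Clearing the low bits moves a tail-page pfn back to the head of its 2MiB
 * compound page, and the IPA is rounded down with it. The ex_/EX_ names and
 * the 512-entries-per-PMD figure (a 4KiB-page configuration) are assumptions
 * made only for this example.
 */
#if 0
#include <stdio.h>

#define EX_PTRS_PER_PMD	512UL

int main(void)
{
	unsigned long tail_pfn = 0x12345;	/* some page inside a THP */
	unsigned long head_pfn = tail_pfn & ~(EX_PTRS_PER_PMD - 1);
	unsigned long ipa = 0x40145000UL;
	unsigned long ipa_block = ipa & ~((EX_PTRS_PER_PMD << 12) - 1); /* ~PMD_MASK */

	/* pfn and IPA are rounded together, so head_pfn still maps ipa */
	printf("head pfn %#lx maps block IPA %#lx\n", head_pfn, ipa_block);
	return 0;
}
#endif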
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
			  struct kvm_memory_slot *memslot, unsigned long hva,
			  unsigned long fault_status)
{
	int ret;
	bool write_fault, writable, force_pte = false;
	bool exec_fault, needs_exec;
	unsigned long mmu_seq;
	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
	struct vm_area_struct *vma;
	short vma_shift;
	kvm_pfn_t pfn;
	pgprot_t mem_type = PAGE_S2;
	bool logging_active = memslot_is_logging(memslot);
	unsigned long vma_pagesize, flags = 0;
	struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu;

	write_fault = kvm_is_write_fault(vcpu);
	exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
	VM_BUG_ON(write_fault && exec_fault);

	if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
		kvm_err("Unexpected L2 read permission error\n");
		return -EFAULT;
	}

	/* Let's check if we will get back a huge page backed by hugetlbfs */
	mmap_read_lock(current->mm);
	vma = find_vma_intersection(current->mm, hva, hva + 1);
	if (unlikely(!vma)) {
		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
		mmap_read_unlock(current->mm);
		return -EFAULT;
	}

	if (is_vm_hugetlb_page(vma))
		vma_shift = huge_page_shift(hstate_vma(vma));
	else
		vma_shift = PAGE_SHIFT;

	vma_pagesize = 1ULL << vma_shift;
	if (logging_active ||
	    (vma->vm_flags & VM_PFNMAP) ||
	    !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
		force_pte = true;
		vma_pagesize = PAGE_SIZE;
		vma_shift = PAGE_SHIFT;
	}

	/*
	 * The stage2 has a minimum of 2 level table (For arm64 see
	 * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
	 * use PMD_SIZE huge mappings (even when the PMD is folded into PGD).
	 * As for PUD huge maps, we must make sure that we have at least
	 * 3 levels, i.e, PMD is not folded.
	 */
	if (vma_pagesize == PMD_SIZE ||
	    (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
		gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
	mmap_read_unlock(current->mm);

	/* We need minimum second+third level pages */
	ret = kvm_mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm));
	if (ret)
		return ret;

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	/*
	 * Ensure the read of mmu_notifier_seq happens before we call
	 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
	 * the page we just got a reference to gets unmapped before we have a
	 * chance to grab the mmu_lock, which ensure that if the page gets
	 * unmapped afterwards, the call to kvm_unmap_hva will take it away
	 * from us again properly. This smp_rmb() interacts with the smp_wmb()
	 * in kvm_mmu_notifier_invalidate_<page|range_end>.
	 */
	smp_rmb();

	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
	if (pfn == KVM_PFN_ERR_HWPOISON) {
		kvm_send_hwpoison_signal(hva, vma_shift);
		return 0;
	}
	if (is_error_noslot_pfn(pfn))
		return -EFAULT;

	if (kvm_is_device_pfn(pfn)) {
		mem_type = PAGE_S2_DEVICE;
		flags |= KVM_S2PTE_FLAG_IS_IOMAP;
	} else if (logging_active) {
		/*
		 * Faults on pages in a memslot with logging enabled
		 * should not be mapped with huge pages (it introduces churn
		 * and performance degradation), so force a pte mapping.
		 */
		flags |= KVM_S2_FLAG_LOGGING_ACTIVE;

		/*
		 * Only actually map the page as writable if this was a write
		 * fault.
		 */
		if (!write_fault)
			writable = false;
	}

	if (exec_fault && is_iomap(flags))
		return -ENOEXEC;

	spin_lock(&kvm->mmu_lock);
	if (mmu_notifier_retry(kvm, mmu_seq))
		goto out_unlock;

	/*
	 * If we are not forced to use page mapping, check if we are
	 * backed by a THP and thus use block mapping if possible.
	 */
	if (vma_pagesize == PAGE_SIZE && !force_pte)
		vma_pagesize = transparent_hugepage_adjust(memslot, hva,
							   &pfn, &fault_ipa);
	if (writable)
		kvm_set_pfn_dirty(pfn);

	if (fault_status != FSC_PERM && !is_iomap(flags))
		clean_dcache_guest_page(pfn, vma_pagesize);

	if (exec_fault)
		invalidate_icache_guest_page(pfn, vma_pagesize);

	/*
	 * If we took an execution fault we have made the
	 * icache/dcache coherent above and should now let the s2
	 * mapping be executable.
	 *
	 * Write faults (!exec_fault && FSC_PERM) are orthogonal to
	 * execute permissions, and we preserve whatever we have.
	 */
	needs_exec = exec_fault ||
		(fault_status == FSC_PERM &&
		 stage2_is_exec(mmu, fault_ipa, vma_pagesize));

	/*
	 * If PUD_SIZE == PMD_SIZE, there is no real PUD level, and
	 * all we have is a 2-level page table. Trying to map a PUD in
	 * this case would be fatally wrong.
	 */
	if (PUD_SIZE != PMD_SIZE && vma_pagesize == PUD_SIZE) {
		pud_t new_pud = kvm_pfn_pud(pfn, mem_type);

		new_pud = kvm_pud_mkhuge(new_pud);
		if (writable)
			new_pud = kvm_s2pud_mkwrite(new_pud);

		if (needs_exec)
			new_pud = kvm_s2pud_mkexec(new_pud);

		ret = stage2_set_pud_huge(mmu, memcache, fault_ipa, &new_pud);
	} else if (vma_pagesize == PMD_SIZE) {
		pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);

		new_pmd = kvm_pmd_mkhuge(new_pmd);

		if (writable)
			new_pmd = kvm_s2pmd_mkwrite(new_pmd);

		if (needs_exec)
			new_pmd = kvm_s2pmd_mkexec(new_pmd);

		ret = stage2_set_pmd_huge(mmu, memcache, fault_ipa, &new_pmd);
	} else {
		pte_t new_pte = kvm_pfn_pte(pfn, mem_type);

		if (writable) {
			new_pte = kvm_s2pte_mkwrite(new_pte);
			mark_page_dirty(kvm, gfn);
		}

		if (needs_exec)
			new_pte = kvm_s2pte_mkexec(new_pte);

		ret = stage2_set_pte(mmu, memcache, fault_ipa, &new_pte, flags);
	}

out_unlock:
	spin_unlock(&kvm->mmu_lock);
	kvm_set_pfn_accessed(pfn);
	kvm_release_pfn_clean(pfn);
	return ret;
}
/*
 * Resolve the access fault by making the page young again.
 * Note that because the faulting entry is guaranteed not to be
 * cached in the TLB, we don't need to invalidate anything.
 * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
 * so there is no need for atomic (pte|pmd)_mkyoung operations.
 */
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	kvm_pfn_t pfn;
	bool pfn_valid = false;

	trace_kvm_access_fault(fault_ipa);

	spin_lock(&vcpu->kvm->mmu_lock);

	if (!stage2_get_leaf_entry(vcpu->arch.hw_mmu, fault_ipa, &pud, &pmd, &pte))
		goto out;

	if (pud) {		/* HugeTLB */
		*pud = kvm_s2pud_mkyoung(*pud);
		pfn = kvm_pud_pfn(*pud);
		pfn_valid = true;
	} else	if (pmd) {	/* THP, HugeTLB */
		*pmd = pmd_mkyoung(*pmd);
		pfn = pmd_pfn(*pmd);
		pfn_valid = true;
	} else {
		*pte = pte_mkyoung(*pte);	/* Just a page... */
		pfn = pte_pfn(*pte);
		pfn_valid = true;
	}

out:
	spin_unlock(&vcpu->kvm->mmu_lock);
	if (pfn_valid)
		kvm_set_pfn_accessed(pfn);
}
/**
 * kvm_handle_guest_abort - handles all 2nd stage aborts
 * @vcpu:	the VCPU pointer
 *
 * Any abort that gets to the host is almost guaranteed to be caused by a
 * missing second stage translation table entry, which can mean that either the
 * guest simply needs more memory and we must allocate an appropriate page or it
 * can mean that the guest tried to access I/O memory, which is emulated by user
 * space. The distinction is based on the IPA causing the fault and whether this
 * memory region has been registered as standard RAM by user space.
 */
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
{
	unsigned long fault_status;
	phys_addr_t fault_ipa;
	struct kvm_memory_slot *memslot;
	unsigned long hva;
	bool is_iabt, write_fault, writable;
	gfn_t gfn;
	int ret, idx;

	fault_status = kvm_vcpu_trap_get_fault_type(vcpu);

	fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);

	/* Synchronous External Abort? */
	if (kvm_vcpu_abt_issea(vcpu)) {
		/*
		 * For RAS the host kernel may handle this abort.
		 * There is no need to pass the error into the guest.
		 */
		if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu)))
			kvm_inject_vabt(vcpu);

		return 1;
	}

	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
			      kvm_vcpu_get_hfar(vcpu), fault_ipa);

	/* Check the stage-2 fault is trans. fault or write fault */
	if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
	    fault_status != FSC_ACCESS) {
		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
			kvm_vcpu_trap_get_class(vcpu),
			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
			(unsigned long)kvm_vcpu_get_esr(vcpu));
		return -EFAULT;
	}

	idx = srcu_read_lock(&vcpu->kvm->srcu);

	gfn = fault_ipa >> PAGE_SHIFT;
	memslot = gfn_to_memslot(vcpu->kvm, gfn);
	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
	write_fault = kvm_is_write_fault(vcpu);
	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
		/*
		 * The guest has put either its instructions or its page-tables
		 * somewhere it shouldn't have. Userspace won't be able to do
		 * anything about this (there's no syndrome for a start), so
		 * re-inject the abort back into the guest.
		 */
		if (is_iabt) {
			ret = -ENOEXEC;
			goto out;
		}

		if (kvm_vcpu_dabt_iss1tw(vcpu)) {
			kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
			ret = 1;
			goto out_unlock;
		}

		/*
		 * Check for a cache maintenance operation. Since we
		 * ended-up here, we know it is outside of any memory
		 * slot. But we can't find out if that is for a device,
		 * or if the guest is just being stupid. The only thing
		 * we know for sure is that this range cannot be cached.
		 *
		 * So let's assume that the guest is just being
		 * cautious, and skip the instruction.
		 */
		if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
			kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
			ret = 1;
			goto out_unlock;
		}

		/*
		 * The IPA is reported as [MAX:12], so we need to
		 * complement it with the bottom 12 bits from the
		 * faulting VA. This is always 12 bits, irrespective
		 * of the page size.
		 */
		fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
		ret = io_mem_abort(vcpu, fault_ipa);
		goto out_unlock;
	}

	/* Userspace should not be able to register out-of-bounds IPAs */
	VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));

	if (fault_status == FSC_ACCESS) {
		handle_access_fault(vcpu, fault_ipa);
		ret = 1;
		goto out_unlock;
	}

	ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
	if (ret == 0)
		ret = 1;
out:
	if (ret == -ENOEXEC) {
		kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
		ret = 1;
	}
out_unlock:
	srcu_read_unlock(&vcpu->kvm->srcu, idx);
	return ret;
}
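/*
 * Editor's note: an illustrative, standalone sketch (not part of this file
 * or the kernel build, hence the #if 0 guard) of the [MAX:12] IPA complement
 * above. The fault IPA is reported at page granularity, so the low 12 bits
 * are recovered from the faulting VA. The sample register values are
 * assumptions made only for this example.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned long fault_ipa = 0x40120000UL;		/* page-aligned, as reported */
	unsigned long hfar = 0xffff800010345abcUL;	/* faulting VA */

	fault_ipa |= hfar & ((1UL << 12) - 1);	/* splice in the page offset */
	printf("full IPA: %#lx\n", fault_ipa);	/* 0x40120abc */
	return 0;
}
#endif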
static int handle_hva_to_gpa(struct kvm *kvm,
			     unsigned long start,
			     unsigned long end,
			     int (*handler)(struct kvm *kvm,
					    gpa_t gpa, u64 size,
					    void *data),
			     void *data)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int ret = 0;

	slots = kvm_memslots(kvm);

	/* we only care about the pages that the guest sees */
	kvm_for_each_memslot(memslot, slots) {
		unsigned long hva_start, hva_end;
		gfn_t gpa;

		hva_start = max(start, memslot->userspace_addr);
		hva_end = min(end, memslot->userspace_addr +
					(memslot->npages << PAGE_SHIFT));
		if (hva_start >= hva_end)
			continue;

		gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
		ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
	}

	return ret;
}
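/*
 * Editor's note: an illustrative, standalone sketch (not part of this file
 * or the kernel build, hence the #if 0 guard) of the hva-to-gpa clipping
 * performed above for one memslot. The ex_ names and the slot layout are
 * assumptions made only for this example.
 */
#if 0
#include <stdio.h>

#define EX_PAGE_SHIFT	12

struct ex_memslot {
	unsigned long userspace_addr;	/* hva of slot start */
	unsigned long base_gfn;		/* gfn of slot start */
	unsigned long npages;
};

int main(void)
{
	struct ex_memslot slot = { 0x7f0000000000, 0x40000, 256 };
	unsigned long start = 0x7f0000010000, end = 0x7f0000030000;
	unsigned long hva_start, hva_end, gpa;

	hva_start = start > slot.userspace_addr ? start : slot.userspace_addr;
	hva_end = slot.userspace_addr + (slot.npages << EX_PAGE_SHIFT);
	if (end < hva_end)
		hva_end = end;

	if (hva_start < hva_end) {
		/* translate the clipped hva back to a guest physical address */
		gpa = (slot.base_gfn +
		       ((hva_start - slot.userspace_addr) >> EX_PAGE_SHIFT))
		      << EX_PAGE_SHIFT;
		printf("gpa %#lx, size %#lx\n", gpa, hva_end - hva_start);
	}
	return 0;
}
#endif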
static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
	unmap_stage2_range(&kvm->arch.mmu, gpa, size);
	return 0;
}

int kvm_unmap_hva_range(struct kvm *kvm,
			unsigned long start, unsigned long end)
{
	if (!kvm->arch.mmu.pgd)
		return 0;

	trace_kvm_unmap_hva_range(start, end);
	handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
	return 0;
}
2232 static int kvm_set_spte_handler(struct kvm
*kvm
, gpa_t gpa
, u64 size
, void *data
)
2234 pte_t
*pte
= (pte_t
*)data
;
2236 WARN_ON(size
!= PAGE_SIZE
);
2238 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
2239 * flag clear because MMU notifiers will have unmapped a huge PMD before
2240 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
2241 * therefore stage2_set_pte() never needs to clear out a huge PMD
2242 * through this calling path.
2244 stage2_set_pte(&kvm
->arch
.mmu
, NULL
, gpa
, pte
, 0);
int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
	unsigned long end = hva + PAGE_SIZE;
	kvm_pfn_t pfn = pte_pfn(pte);
	pte_t stage2_pte;

	if (!kvm->arch.mmu.pgd)
		return 0;

	trace_kvm_set_spte_hva(hva);

	/*
	 * We've moved a page around, probably through CoW, so let's treat it
	 * just like a translation fault and clean the cache to the PoC.
	 */
	clean_dcache_guest_page(pfn, PAGE_SIZE);
	stage2_pte = kvm_pfn_pte(pfn, PAGE_S2);
	handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);

	return 0;
}
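
/*
 * Typical trigger (sketch, not an exhaustive list): the VMM's page is
 * COWed, so the ->change_pte() notifier fires with the replacement pte.
 * We pre-clean the new page to PoC exactly as a translation fault
 * would, then install the new stage 2 pte at every GPA the hva maps
 * to, which spares the guest an extra abort on its next access.
 */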
static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
	if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte))
		return 0;

	if (pud)
		return stage2_pudp_test_and_clear_young(pud);
	else if (pmd)
		return stage2_pmdp_test_and_clear_young(pmd);
	else
		return stage2_ptep_test_and_clear_young(pte);
}
static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
	if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte))
		return 0;

	if (pud)
		return kvm_s2pud_young(*pud);
	else if (pmd)
		return pmd_young(*pmd);
	else
		return pte_young(*pte);
}
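
/*
 * The two handlers above differ only in side effects: the aging handler
 * clears the access flag, so the next guest access either takes an
 * access fault (FSC_ACCESS, handled by handle_access_fault()) or has AF
 * set again by hardware, while the test handler merely reports it.
 * Both rely on stage2_get_leaf_entry() returning the single leaf (PUD,
 * PMD or PTE) mapping the GPA, hence the mutually exclusive checks.
 */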
int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
{
	if (!kvm->arch.mmu.pgd)
		return 0;
	trace_kvm_age_hva(start, end);
	return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
}
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
{
	if (!kvm->arch.mmu.pgd)
		return 0;
	trace_kvm_test_age_hva(hva);
	return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
				 kvm_test_age_hva_handler, NULL);
}
void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
}
phys_addr_t kvm_mmu_get_httbr(void)
{
	if (__kvm_cpu_uses_extended_idmap())
		return virt_to_phys(merged_hyp_pgd);
	else
		return virt_to_phys(hyp_pgd);
}
phys_addr_t kvm_get_idmap_vector(void)
{
	return hyp_idmap_vector;
}
static int kvm_map_idmap_text(pgd_t *pgd)
{
	int err;

	/* Create the idmap in the boot page tables */
	err = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
				    hyp_idmap_start, hyp_idmap_end,
				    __phys_to_pfn(hyp_idmap_start),
				    PAGE_HYP_EXEC);
	if (err)
		kvm_err("Failed to idmap %lx-%lx\n",
			hyp_idmap_start, hyp_idmap_end);

	return err;
}
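
/*
 * The idmap is the VA == PA mapping needed while the EL2 MMU is being
 * turned on: the init code at __kvm_hyp_init is entered with the MMU
 * off, running from physical addresses, and must keep executing from
 * those same addresses once translation is enabled, hence the 1:1
 * mapping of the idmap text in the boot page tables.
 */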
int kvm_mmu_init(void)
{
	int err;

	hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
	hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
	hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
	hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
	hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);

	/*
	 * We rely on the linker script to ensure at build time that the HYP
	 * init code does not cross a page boundary.
	 */
	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);

	kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
	kvm_debug("HYP VA range: %lx:%lx\n",
		  kern_hyp_va(PAGE_OFFSET),
		  kern_hyp_va((unsigned long)high_memory - 1));

	if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
	    hyp_idmap_start <  kern_hyp_va((unsigned long)high_memory - 1) &&
	    hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
		/*
		 * The idmap page is intersecting with the VA space,
		 * it is not safe to continue further.
		 */
		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
		err = -EINVAL;
		goto out;
	}

	hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
	if (!hyp_pgd) {
		kvm_err("Hyp mode PGD not allocated\n");
		err = -ENOMEM;
		goto out;
	}

	if (__kvm_cpu_uses_extended_idmap()) {
		boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
							 hyp_pgd_order);
		if (!boot_hyp_pgd) {
			kvm_err("Hyp boot PGD not allocated\n");
			err = -ENOMEM;
			goto out;
		}

		err = kvm_map_idmap_text(boot_hyp_pgd);
		if (err)
			goto out;

		merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
		if (!merged_hyp_pgd) {
			kvm_err("Failed to allocate extra HYP pgd\n");
			err = -ENOMEM;
			goto out;
		}
		__kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
				    hyp_idmap_start);
	} else {
		err = kvm_map_idmap_text(hyp_pgd);
		if (err)
			goto out;
	}

	io_map_base = hyp_idmap_start;
	return 0;
out:
	free_hyp_pgds();
	return err;
}
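
/*
 * Illustrative check of the BUG_ON above (addresses made up): with
 * hyp_idmap_start = 0x40201000 and hyp_idmap_end = 0x40202000,
 * (0x40201000 ^ 0x40201fff) = 0xfff, which ANDed with PAGE_MASK is 0:
 * the idmap text fits in a single 4K page. Had the text spilled into
 * the next page (end = 0x40203000 after ALIGN), the XOR would keep
 * bits above bit 11 and the boot would BUG() rather than run with a
 * partially idmapped trampoline.
 */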
void kvm_arch_commit_memory_region(struct kvm *kvm,
				   const struct kvm_userspace_memory_region *mem,
				   struct kvm_memory_slot *old,
				   const struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	/*
	 * At this point memslot has been committed and there is an
	 * allocated dirty_bitmap[], dirty pages will be tracked while the
	 * memory slot is write protected.
	 */
	if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
		/*
		 * If we're with initial-all-set, we don't need to write
		 * protect any pages because they're all reported as dirty.
		 * Huge pages and normal pages will be write protect gradually.
		 */
		if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) {
			kvm_mmu_wp_memory_region(kvm, mem->slot);
		}
	}
}
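
/*
 * "initial-all-set" refers to the KVM_DIRTY_LOG_INITIALLY_SET manual
 * protection mode: the dirty bitmap starts out all-ones, and userspace
 * write-protects ranges piecemeal through KVM_CLEAR_DIRTY_LOG, so
 * there is nothing to write-protect up front here.
 */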
int kvm_arch_prepare_memory_region(struct kvm *kvm,
				   struct kvm_memory_slot *memslot,
				   const struct kvm_userspace_memory_region *mem,
				   enum kvm_mr_change change)
{
	hva_t hva = mem->userspace_addr;
	hva_t reg_end = hva + mem->memory_size;
	bool writable = !(mem->flags & KVM_MEM_READONLY);
	int ret = 0;

	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
			change != KVM_MR_FLAGS_ONLY)
		return 0;

	/*
	 * Prevent userspace from creating a memory region outside of the IPA
	 * space addressable by the KVM guest IPA space.
	 */
	if (memslot->base_gfn + memslot->npages >=
	    (kvm_phys_size(kvm) >> PAGE_SHIFT))
		return -EFAULT;

	mmap_read_lock(current->mm);
	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them to find out if we can map
	 * any of them right now.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma = find_vma(current->mm, hva);
		hva_t vm_start, vm_end;

		if (!vma || vma->vm_start >= reg_end)
			break;

		/*
		 * Take the intersection of this VMA with the memory region
		 */
		vm_start = max(hva, vma->vm_start);
		vm_end = min(reg_end, vma->vm_end);

		if (vma->vm_flags & VM_PFNMAP) {
			gpa_t gpa = mem->guest_phys_addr +
				    (vm_start - mem->userspace_addr);
			phys_addr_t pa;

			pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
			pa += vm_start - vma->vm_start;

			/* IO region dirty page logging not allowed */
			if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
				ret = -EINVAL;
				goto out;
			}

			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
						    vm_end - vm_start,
						    writable);
			if (ret)
				break;
		}
		hva = vm_end;
	} while (hva < reg_end);

	if (change == KVM_MR_FLAGS_ONLY)
		goto out;

	spin_lock(&kvm->mmu_lock);
	if (ret)
		unmap_stage2_range(&kvm->arch.mmu, mem->guest_phys_addr, mem->memory_size);
	else
		stage2_flush_memslot(kvm, memslot);
	spin_unlock(&kvm->mmu_lock);
out:
	mmap_read_unlock(current->mm);
	return ret;
}
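
/*
 * PFNMAP example (values made up): a VMM maps a device BAR with
 * vm_pgoff = 0x100000 and vm_start = 0x7f0000000000. For a slot whose
 * start sits 0x2000 into that VMA, the code above computes
 * pa = (0x100000 << 12) + 0x2000 = 0x100002000, and that physical
 * range is pre-populated into stage 2 as device memory via
 * kvm_phys_addr_ioremap() instead of being demand-faulted.
 */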
void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
}

void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
}
void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
	kvm_free_stage2_pgd(&kvm->arch.mmu);
}
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
				   struct kvm_memory_slot *slot)
{
	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = slot->npages << PAGE_SHIFT;

	spin_lock(&kvm->mmu_lock);
	unmap_stage2_range(&kvm->arch.mmu, gpa, size);
	spin_unlock(&kvm->mmu_lock);
}
/*
 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
 *
 * Main problems:
 * - S/W ops are local to a CPU (not broadcast)
 * - We have line migration behind our back (speculation)
 * - System caches don't support S/W at all (damn!)
 *
 * In the face of the above, the best we can do is to try and convert
 * S/W ops to VA ops. Because the guest is not allowed to infer the
 * S/W to PA mapping, it can only use S/W to nuke the whole cache,
 * which is a rather good thing for us.
 *
 * Also, it is only used when turning caches on/off ("The expected
 * usage of the cache maintenance instructions that operate by set/way
 * is associated with the cache maintenance instructions associated
 * with the powerdown and powerup of caches, if this is required by
 * the implementation.").
 *
 * We use the following policy:
 *
 * - If we trap a S/W operation, we enable VM trapping to detect
 *   caches being turned on/off, and do a full clean.
 *
 * - We flush the caches on both caches being turned on and off.
 *
 * - Once the caches are enabled, we stop trapping VM ops.
 */
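
/*
 * The policy plays out as follows (sketch): the guest issues a set/way
 * op (e.g. DC CSW) before turning its caches off; we trap it, do a full
 * stage2_flush_vm() and set HCR_TVM. The guest's subsequent SCTLR write
 * disabling the caches now traps too; we flush again and keep trapping
 * VM ops until the caches come back on, at which point TVM is dropped.
 */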
void kvm_set_way_flush(struct kvm_vcpu *vcpu)
{
	unsigned long hcr = *vcpu_hcr(vcpu);

	/*
	 * If this is the first time we do a S/W operation
	 * (i.e. HCR_TVM not set) flush the whole memory, and set the
	 * VM trapping.
	 *
	 * Otherwise, rely on the VM trapping to wait for the MMU +
	 * Caches to be turned off. At that point, we'll be able to
	 * clean the caches again.
	 */
	if (!(hcr & HCR_TVM)) {
		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
					vcpu_has_cache_enabled(vcpu));
		stage2_flush_vm(vcpu->kvm);
		*vcpu_hcr(vcpu) = hcr | HCR_TVM;
	}
}
void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
{
	bool now_enabled = vcpu_has_cache_enabled(vcpu);

	/*
	 * If switching the MMU+caches on, need to invalidate the caches.
	 * If switching it off, need to clean the caches.
	 * Clean + invalidate does the trick always.
	 */
	if (now_enabled != was_enabled)
		stage2_flush_vm(vcpu->kvm);

	/* Caches are now on, stop trapping VM ops (until a S/W op) */
	if (now_enabled)
		*vcpu_hcr(vcpu) &= ~HCR_TVM;

	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
}
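
/*
 * Summary of the transitions handled above (derived from the code):
 *
 *   was_enabled | now_enabled | action
 *   ------------+-------------+------------------------------------
 *        0      |      1      | flush (clean+invalidate), stop
 *               |             | trapping VM ops
 *        1      |      0      | flush (clean to PoC), keep trapping
 *        0      |      0      | no flush, keep trapping
 *        1      |      1      | no flush, stop trapping
 */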