/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
#include <trace/events/kvm.h>
#include <asm/idmap.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_mmio.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/mach/map.h>

#include "trace.h"

extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[];

static DEFINE_MUTEX(kvm_hyp_pgd_mutex);

static void kvm_tlb_flush_vmid(struct kvm *kvm)
{
	kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
}

static void kvm_set_pte(pte_t *pte, pte_t new_pte)
{
	pte_val(*pte) = new_pte;
	/*
	 * flush_pmd_entry just takes a void pointer and cleans the necessary
	 * cache entries, so we can reuse the function for ptes.
	 */
	flush_pmd_entry(pte);
}

static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
				  int min, int max)
{
	void *page;

	BUG_ON(max > KVM_NR_MEM_OBJS);
	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < max) {
		page = (void *)__get_free_page(PGALLOC_GFP);
		if (!page)
			return -ENOMEM;
		cache->objects[cache->nobjs++] = page;
	}
	return 0;
}

static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs)
		free_page((unsigned long)mc->objects[--mc->nobjs]);
}

static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
{
	void *p;

	BUG_ON(!mc || !mc->nobjs);
	p = mc->objects[--mc->nobjs];
	return p;
}

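/*
 * Illustrative sketch (not from this file): callers are expected to top up
 * the cache with the page-table pages they might need *before* taking
 * mmu_lock, and then consume them via mmu_memory_cache_alloc() while the
 * lock is held, so no allocation has to happen inside the spinlock. A
 * hypothetical caller could look like:
 *
 *	ret = mmu_topup_memory_cache(&cache, 2, KVM_NR_MEM_OBJS);
 *	if (ret)
 *		return ret;
 *	spin_lock(&kvm->mmu_lock);
 *	ret = stage2_set_pte(kvm, &cache, addr, &pte, false);
 *	spin_unlock(&kvm->mmu_lock);
 *
 * kvm_phys_addr_ioremap() and user_mem_abort() below follow this pattern.
 */
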
static void free_ptes(pmd_t *pmd, unsigned long addr)
{
	pte_t *pte;
	unsigned int i;

	for (i = 0; i < PTRS_PER_PMD; i++, addr += PMD_SIZE) {
		if (!pmd_none(*pmd) && pmd_table(*pmd)) {
			pte = pte_offset_kernel(pmd, addr);
			pte_free_kernel(NULL, pte);
		}
		pmd++;
	}
}

/**
 * free_hyp_pmds - free the Hyp-mode level-2 tables and child level-3 tables
 *
 * Assumes this is a page table used strictly in Hyp-mode and therefore contains
 * only mappings in the kernel memory area, which is above PAGE_OFFSET.
 */
void free_hyp_pmds(void)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	unsigned long addr;

	mutex_lock(&kvm_hyp_pgd_mutex);
	for (addr = PAGE_OFFSET; addr != 0; addr += PGDIR_SIZE) {
		pgd = hyp_pgd + pgd_index(addr);
		pud = pud_offset(pgd, addr);

		if (pud_none(*pud))
			continue;
		BUG_ON(pud_bad(*pud));

		pmd = pmd_offset(pud, addr);
		free_ptes(pmd, addr);

		pmd_free(NULL, pmd);
		pud_clear(pud);
	}
	mutex_unlock(&kvm_hyp_pgd_mutex);
}

static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
				    unsigned long end)
{
	pte_t *pte;
	struct page *page;
	unsigned long addr;

	for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
		pte = pte_offset_kernel(pmd, addr);
		BUG_ON(!virt_addr_valid(addr));
		page = virt_to_page(addr);
		kvm_set_pte(pte, mk_pte(page, PAGE_HYP));
	}
}

static void create_hyp_io_pte_mappings(pmd_t *pmd, unsigned long start,
				       unsigned long end,
				       unsigned long *pfn_base)
{
	pte_t *pte;
	unsigned long addr;

	for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
		pte = pte_offset_kernel(pmd, addr);
		BUG_ON(pfn_valid(*pfn_base));
		kvm_set_pte(pte, pfn_pte(*pfn_base, PAGE_HYP_DEVICE));
		(*pfn_base)++;
	}
}

static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
				   unsigned long end, unsigned long *pfn_base)
{
	pmd_t *pmd;
	pte_t *pte;
	unsigned long addr, next;

	for (addr = start; addr < end; addr = next) {
		pmd = pmd_offset(pud, addr);

		BUG_ON(pmd_sect(*pmd));

		if (pmd_none(*pmd)) {
			pte = pte_alloc_one_kernel(NULL, addr);
			if (!pte) {
				kvm_err("Cannot allocate Hyp pte\n");
				return -ENOMEM;
			}
			pmd_populate_kernel(NULL, pmd, pte);
		}

		next = pmd_addr_end(addr, end);

		/*
		 * If pfn_base is NULL, we map kernel pages into HYP with the
		 * virtual address. Otherwise, this is considered an I/O
		 * mapping and we map the physical region starting at
		 * *pfn_base to [start, end[.
		 */
		if (!pfn_base)
			create_hyp_pte_mappings(pmd, addr, next);
		else
			create_hyp_io_pte_mappings(pmd, addr, next, pfn_base);
	}

	return 0;
}

static int __create_hyp_mappings(void *from, void *to, unsigned long *pfn_base)
{
	unsigned long start = (unsigned long)from;
	unsigned long end = (unsigned long)to;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	unsigned long addr, next;
	int err = 0;

	BUG_ON(start > end);
	if (start < PAGE_OFFSET)
		return -EINVAL;

	mutex_lock(&kvm_hyp_pgd_mutex);
	for (addr = start; addr < end; addr = next) {
		pgd = hyp_pgd + pgd_index(addr);
		pud = pud_offset(pgd, addr);

		if (pud_none_or_clear_bad(pud)) {
			pmd = pmd_alloc_one(NULL, addr);
			if (!pmd) {
				kvm_err("Cannot allocate Hyp pmd\n");
				err = -ENOMEM;
				goto out;
			}
			pud_populate(NULL, pud, pmd);
		}

		next = pgd_addr_end(addr, end);
		err = create_hyp_pmd_mappings(pud, addr, next, pfn_base);
		if (err)
			goto out;
	}
out:
	mutex_unlock(&kvm_hyp_pgd_mutex);
	return err;
}

/**
 * create_hyp_mappings - map a kernel virtual address range in Hyp mode
 * @from:	The virtual kernel start address of the range
 * @to:		The virtual kernel end address of the range (exclusive)
 *
 * The same virtual address as the kernel virtual address is also used in
 * Hyp-mode, mapping to the same underlying physical pages.
 *
 * Note: Wrapping around zero in the "to" address is not supported.
 */
int create_hyp_mappings(void *from, void *to)
{
	return __create_hyp_mappings(from, to, NULL);
}

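/*
 * Illustrative sketch (not part of this file): the arch init code is the
 * typical caller, mapping the kernel sections that Hyp mode must be able to
 * reach at their kernel virtual addresses, e.g. (symbol names here are
 * assumptions for illustration only):
 *
 *	err = create_hyp_mappings(__kvm_hyp_code_start, __kvm_hyp_code_end);
 *	if (err)
 *		goto out_err;
 */
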
/**
 * create_hyp_io_mappings - map a physical IO range in Hyp mode
 * @from:	The virtual HYP start address of the range
 * @to:		The virtual HYP end address of the range (exclusive)
 * @addr:	The physical start address which gets mapped
 */
int create_hyp_io_mappings(void *from, void *to, phys_addr_t addr)
{
	unsigned long pfn = __phys_to_pfn(addr);
	return __create_hyp_mappings(from, to, &pfn);
}

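/*
 * Illustrative sketch (not part of this file): a caller that has reserved a
 * window of Hyp virtual addresses can back it with a device's registers,
 * e.g. mapping a (hypothetical) 8K interrupt-controller control region:
 *
 *	err = create_hyp_io_mappings(vctrl_va, vctrl_va + SZ_8K, vctrl_pa);
 *
 * where vctrl_va/vctrl_pa are assumed names for the virtual window and the
 * physical base taken from the device tree.
 */
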
/**
 * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
 * @kvm:	The KVM struct pointer for the VM.
 *
 * Allocates only the 1st level table, whose size is defined by S2_PGD_ORDER
 * (it can support either full 40-bit input addresses or be limited to 32-bit
 * input addresses). Clears the allocated pages.
 *
 * Note we don't need locking here as this is only called when the VM is
 * created, which can only be done once.
 */
int kvm_alloc_stage2_pgd(struct kvm *kvm)
{
	pgd_t *pgd;

	if (kvm->arch.pgd != NULL) {
		kvm_err("kvm_arch already initialized?\n");
		return -EINVAL;
	}

	pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, S2_PGD_ORDER);
	if (!pgd)
		return -ENOMEM;

	/* stage-2 pgd must be aligned to its size */
	VM_BUG_ON((unsigned long)pgd & (S2_PGD_SIZE - 1));

	memset(pgd, 0, PTRS_PER_S2_PGD * sizeof(pgd_t));
	clean_dcache_area(pgd, PTRS_PER_S2_PGD * sizeof(pgd_t));
	kvm->arch.pgd = pgd;

	return 0;
}

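/*
 * Sizing note (an assumption based on the usual 40-bit stage-2 input-address
 * configuration, not spelled out in this file): with a 40-bit IPA space and
 * 8-byte LPAE level-1 descriptors, PTRS_PER_S2_PGD is 1024, so the table is
 * 8 KB and S2_PGD_ORDER is an order-1 (two page) allocation. That is also
 * why the VM_BUG_ON above checks alignment to S2_PGD_SIZE rather than to
 * PAGE_SIZE.
 */
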
static void clear_pud_entry(pud_t *pud)
{
	pmd_t *pmd_table = pmd_offset(pud, 0);
	pud_clear(pud);
	pmd_free(NULL, pmd_table);
	put_page(virt_to_page(pud));
}

static void clear_pmd_entry(pmd_t *pmd)
{
	pte_t *pte_table = pte_offset_kernel(pmd, 0);
	pmd_clear(pmd);
	pte_free_kernel(NULL, pte_table);
	put_page(virt_to_page(pmd));
}

static bool pmd_empty(pmd_t *pmd)
{
	struct page *pmd_page = virt_to_page(pmd);
	return page_count(pmd_page) == 1;
}

static void clear_pte_entry(pte_t *pte)
{
	if (pte_present(*pte)) {
		kvm_set_pte(pte, __pte(0));
		put_page(virt_to_page(pte));
	}
}

static bool pte_empty(pte_t *pte)
{
	struct page *pte_page = virt_to_page(pte);
	return page_count(pte_page) == 1;
}

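/*
 * Descriptive note: the stage-2 tables are reference counted through their
 * struct pages. stage2_set_pte() below does a get_page() on the table page
 * each time a new entry is installed, and clear_pte_entry()/clear_pmd_entry()
 * drop that reference again, so a table page whose page_count() has fallen
 * back to 1 (the allocation's own reference) holds no live entries and can
 * be freed, which is what pte_empty()/pmd_empty() test for.
 */
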
/**
 * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
 * @kvm:   The VM pointer
 * @start: The intermediate physical base address of the range to unmap
 * @size:  The size of the area to unmap
 *
 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
 * destroying the VM), otherwise another faulting VCPU may come in and mess
 * with things behind our backs.
 */
static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	phys_addr_t addr = start, end = start + size;
	u64 range;

	while (addr < end) {
		pgd = kvm->arch.pgd + pgd_index(addr);
		pud = pud_offset(pgd, addr);
		if (pud_none(*pud)) {
			addr += PUD_SIZE;
			continue;
		}

		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd)) {
			addr += PMD_SIZE;
			continue;
		}

		pte = pte_offset_kernel(pmd, addr);
		clear_pte_entry(pte);
		range = PAGE_SIZE;

		/* If we emptied the pte, walk back up the ladder */
		if (pte_empty(pte)) {
			clear_pmd_entry(pmd);
			range = PMD_SIZE;
			if (pmd_empty(pmd)) {
				clear_pud_entry(pud);
				range = PUD_SIZE;
			}
		}

		addr += range;
	}
}

/**
 * kvm_free_stage2_pgd - free all stage-2 tables
 * @kvm:	The KVM struct pointer for the VM.
 *
 * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
 * underlying level-2 and level-3 tables before freeing the actual level-1
 * table and setting the struct pointer to NULL.
 *
 * Note we don't need locking here as this is only called when the VM is
 * destroyed, which can only be done once.
 */
void kvm_free_stage2_pgd(struct kvm *kvm)
{
	if (kvm->arch.pgd == NULL)
		return;

	unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE);
	free_pages((unsigned long)kvm->arch.pgd, S2_PGD_ORDER);
	kvm->arch.pgd = NULL;
}

static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte, old_pte;

	/* Create 2nd stage page table mapping - Level 1 */
	pgd = kvm->arch.pgd + pgd_index(addr);
	pud = pud_offset(pgd, addr);
	if (pud_none(*pud)) {
		if (!cache)
			return 0; /* ignore calls from kvm_set_spte_hva */
		pmd = mmu_memory_cache_alloc(cache);
		pud_populate(NULL, pud, pmd);
		pmd += pmd_index(addr);
		get_page(virt_to_page(pud));
	} else
		pmd = pmd_offset(pud, addr);

	/* Create 2nd stage page table mapping - Level 2 */
	if (pmd_none(*pmd)) {
		if (!cache)
			return 0; /* ignore calls from kvm_set_spte_hva */
		pte = mmu_memory_cache_alloc(cache);
		clean_pte_table(pte);
		pmd_populate_kernel(NULL, pmd, pte);
		pte += pte_index(addr);
		get_page(virt_to_page(pmd));
	} else
		pte = pte_offset_kernel(pmd, addr);

	if (iomap && pte_present(*pte))
		return -EFAULT;

	/* Create 2nd stage page table mapping - Level 3 */
	old_pte = *pte;
	kvm_set_pte(pte, *new_pte);
	if (pte_present(old_pte))
		kvm_tlb_flush_vmid(kvm);
	else
		get_page(virt_to_page(pte));

	return 0;
}

/**
 * kvm_phys_addr_ioremap - map a device range to guest IPA
 *
 * @kvm:	The KVM pointer
 * @guest_ipa:	The IPA at which to insert the mapping
 * @pa:		The physical address of the device
 * @size:	The size of the mapping
 */
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
			  phys_addr_t pa, unsigned long size)
{
	phys_addr_t addr, end;
	int ret = 0;
	unsigned long pfn;
	struct kvm_mmu_memory_cache cache = { 0, };

	end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
	pfn = __phys_to_pfn(pa);

	for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
		pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE | L_PTE_S2_RDWR);

		ret = mmu_topup_memory_cache(&cache, 2, 2);
		if (ret)
			goto out;
		spin_lock(&kvm->mmu_lock);
		ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
		spin_unlock(&kvm->mmu_lock);
		if (ret)
			goto out;

		pfn++;
	}

out:
	mmu_free_memory_cache(&cache);
	return ret;
}

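/*
 * Illustrative sketch (not part of this file): an in-kernel device model
 * such as a virtual interrupt controller can use this to expose a host MMIO
 * region to the guest at a fixed IPA, e.g. (names are assumptions for
 * illustration only):
 *
 *	ret = kvm_phys_addr_ioremap(kvm, guest_cpu_if_ipa,
 *				    host_vcpu_if_pa, SZ_4K);
 *
 * The pages are installed with PAGE_S2_DEVICE | L_PTE_S2_RDWR, so the guest
 * gets a read/write device mapping rather than normal memory.
 */
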
static void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
{
	/*
	 * If we are going to insert an instruction page and the icache is
	 * either VIPT or PIPT, there is a potential problem where the host
	 * (or another VM) may have used the same page as this guest, and we
	 * read incorrect data from the icache.  If we're using a PIPT cache,
	 * we can invalidate just that page, but if we are using a VIPT cache
	 * we need to invalidate the entire icache - damn shame - as written
	 * in the ARM ARM (DDI 0406C.b - Page B3-1393).
	 *
	 * VIVT caches are tagged using both the ASID and the VMID and don't
	 * need any kind of flushing (DDI 0406C.b - Page B3-1392).
	 */
	if (icache_is_pipt()) {
		unsigned long hva = gfn_to_hva(kvm, gfn);
		__cpuc_coherent_user_range(hva, hva + PAGE_SIZE);
	} else if (!icache_is_vivt_asid_tagged()) {
		/* any kind of VIPT cache */
		__flush_icache_all();
	}
}

static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
			  gfn_t gfn, struct kvm_memory_slot *memslot,
			  unsigned long fault_status)
{
	pte_t new_pte;
	pfn_t pfn;
	int ret;
	bool write_fault, writable;
	unsigned long mmu_seq;
	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;

	write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu));
	if (fault_status == FSC_PERM && !write_fault) {
		kvm_err("Unexpected L2 read permission error\n");
		return -EFAULT;
	}

	/* We need minimum second+third level pages */
	ret = mmu_topup_memory_cache(memcache, 2, KVM_NR_MEM_OBJS);
	if (ret)
		return ret;

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	/*
	 * Ensure the read of mmu_notifier_seq happens before we call
	 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
	 * the page we just got a reference to getting unmapped before we have
	 * a chance to grab the mmu_lock, which ensures that if the page gets
	 * unmapped afterwards, the call to kvm_unmap_hva will take it away
	 * from us again properly. This smp_rmb() interacts with the smp_wmb()
	 * in kvm_mmu_notifier_invalidate_<page|range_end>.
	 */
	smp_rmb();

	pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write_fault, &writable);
	if (is_error_pfn(pfn))
		return -EFAULT;

	new_pte = pfn_pte(pfn, PAGE_S2);
	coherent_icache_guest_page(vcpu->kvm, gfn);

	spin_lock(&vcpu->kvm->mmu_lock);
	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
		goto out_unlock;
	if (writable) {
		pte_val(new_pte) |= L_PTE_S2_RDWR;
		kvm_set_pfn_dirty(pfn);
	}
	stage2_set_pte(vcpu->kvm, memcache, fault_ipa, &new_pte, false);

out_unlock:
	spin_unlock(&vcpu->kvm->mmu_lock);
	kvm_release_pfn_clean(pfn);
	return 0;
}

/**
 * kvm_handle_guest_abort - handles all 2nd stage aborts
 * @vcpu:	the VCPU pointer
 * @run:	the kvm_run structure
 *
 * Any abort that gets to the host is almost guaranteed to be caused by a
 * missing second stage translation table entry, which can mean either that the
 * guest simply needs more memory and we must allocate an appropriate page, or
 * that the guest tried to access I/O memory, which is emulated by user space.
 * The distinction is based on the IPA causing the fault and whether this
 * memory region has been registered as standard RAM by user space.
 */
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
{
	unsigned long hsr_ec;
	unsigned long fault_status;
	phys_addr_t fault_ipa;
	struct kvm_memory_slot *memslot;
	bool is_iabt;
	gfn_t gfn;
	int ret, idx;

	hsr_ec = kvm_vcpu_trap_get_class(vcpu);
	is_iabt = (hsr_ec == HSR_EC_IABT);
	fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);

	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
			      kvm_vcpu_get_hfar(vcpu), fault_ipa);

	/* Check the stage-2 fault is trans. fault or write fault */
	fault_status = kvm_vcpu_trap_get_fault(vcpu);
	if (fault_status != FSC_FAULT && fault_status != FSC_PERM) {
		kvm_err("Unsupported fault status: EC=%#lx DFCS=%#lx\n",
			hsr_ec, fault_status);
		return -EFAULT;
	}

	idx = srcu_read_lock(&vcpu->kvm->srcu);

	gfn = fault_ipa >> PAGE_SHIFT;
	if (!kvm_is_visible_gfn(vcpu->kvm, gfn)) {
		if (is_iabt) {
			/* Prefetch Abort on I/O address */
			kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
			ret = 1;
			goto out_unlock;
		}

		if (fault_status != FSC_FAULT) {
			kvm_err("Unsupported fault status on io memory: %#lx\n",
				fault_status);
			ret = -EFAULT;
			goto out_unlock;
		}

		/* Adjust page offset */
		fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ~PAGE_MASK;
		ret = io_mem_abort(vcpu, run, fault_ipa);
		goto out_unlock;
	}

	memslot = gfn_to_memslot(vcpu->kvm, gfn);

	ret = user_mem_abort(vcpu, fault_ipa, gfn, memslot, fault_status);
	if (ret == 0)
		ret = 1;
out_unlock:
	srcu_read_unlock(&vcpu->kvm->srcu, idx);
	return ret;
}

static void handle_hva_to_gpa(struct kvm *kvm,
			      unsigned long start,
			      unsigned long end,
			      void (*handler)(struct kvm *kvm,
					      gpa_t gpa, void *data),
			      void *data)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;

	slots = kvm_memslots(kvm);

	/* we only care about the pages that the guest sees */
	kvm_for_each_memslot(memslot, slots) {
		unsigned long hva_start, hva_end;
		gfn_t gfn, gfn_end;

		hva_start = max(start, memslot->userspace_addr);
		hva_end = min(end, memslot->userspace_addr +
					(memslot->npages << PAGE_SHIFT));
		if (hva_start >= hva_end)
			continue;

		/*
		 * {gfn(page) | page intersects with [hva_start, hva_end)} =
		 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
		 */
		gfn = hva_to_gfn_memslot(hva_start, memslot);
		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);

		for (; gfn < gfn_end; ++gfn) {
			gpa_t gpa = gfn << PAGE_SHIFT;
			handler(kvm, gpa, data);
		}
	}
}

static void kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
{
	unmap_stage2_range(kvm, gpa, PAGE_SIZE);
	kvm_tlb_flush_vmid(kvm);
}

int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
{
	unsigned long end = hva + PAGE_SIZE;

	if (!kvm->arch.pgd)
		return 0;

	trace_kvm_unmap_hva(hva);
	handle_hva_to_gpa(kvm, hva, end, &kvm_unmap_hva_handler, NULL);
	return 0;
}

int kvm_unmap_hva_range(struct kvm *kvm,
			unsigned long start, unsigned long end)
{
	if (!kvm->arch.pgd)
		return 0;

	trace_kvm_unmap_hva_range(start, end);
	handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
	return 0;
}

static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
{
	pte_t *pte = (pte_t *)data;

	stage2_set_pte(kvm, NULL, gpa, pte, false);
}

void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
	unsigned long end = hva + PAGE_SIZE;
	pte_t stage2_pte;

	if (!kvm->arch.pgd)
		return;

	trace_kvm_set_spte_hva(hva);
	stage2_pte = pfn_pte(pte_pfn(pte), PAGE_S2);
	handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
}

void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
	mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
}

phys_addr_t kvm_mmu_get_httbr(void)
{
	VM_BUG_ON(!virt_addr_valid(hyp_pgd));
	return virt_to_phys(hyp_pgd);
}

int kvm_mmu_init(void)
{
	if (!hyp_pgd) {
		kvm_err("Hyp mode PGD not allocated\n");
		return -ENOMEM;
	}

	return 0;
}

/**
 * kvm_clear_hyp_idmap - remove all idmaps from the hyp pgd
 *
 * Free the underlying pmds for all pgds in range and clear the pgds (but
 * don't free them) afterwards.
 */
void kvm_clear_hyp_idmap(void)
{
	unsigned long addr, end;
	unsigned long next;
	pgd_t *pgd = hyp_pgd;
	pud_t *pud;
	pmd_t *pmd;

	addr = virt_to_phys(__hyp_idmap_text_start);
	end = virt_to_phys(__hyp_idmap_text_end);

	pgd += pgd_index(addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		pud = pud_offset(pgd, addr);
		pmd = pmd_offset(pud, addr);

		pud_clear(pud);
		clean_pmd_entry(pmd);
		pmd_free(NULL, (pmd_t *)((unsigned long)pmd & PAGE_MASK));
	} while (pgd++, addr = next, addr < end);
}