/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/debugfs.h>

#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/page.h>
#include <asm/mmu.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/pte-walk.h>

/*
 * Supported radix tree geometry.
 * Like p9, we support either 5 or 9 bits at the first (lowest) level,
 * for a page size of 64k or 4k.
 */
static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };

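/*
 * Worked example (derived from the walk in kvmppc_mmu_walk_radix_tree()
 * below): with the 52-bit address space supported here, the per-level
 * index bits compose as 13 + 9 + 9 + 9 + 12 = 52 for a 4k leaf page, or
 * 13 + 9 + 9 + 5 + 16 = 52 for a 64k leaf page, matching the table above
 * (index 0 is the lowest, leaf-side level).
 */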
unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
					      gva_t eaddr, void *to, void *from,
					      unsigned long n)
{
	int uninitialized_var(old_pid), old_lpid;
	unsigned long quadrant, ret = n;
	bool is_load = !!to;

	/* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */
	if (kvmhv_on_pseries())
		return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr,
					  __pa(to), __pa(from), n);

	quadrant = 1;
	if (!pid)
		quadrant = 2;
	if (is_load)
		from = (void *) (eaddr | (quadrant << 62));
	else
		to = (void *) (eaddr | (quadrant << 62));

	preempt_disable();

	/* switch the lpid first to avoid running host with unallocated pid */
	old_lpid = mfspr(SPRN_LPID);
	if (old_lpid != lpid)
		mtspr(SPRN_LPID, lpid);
	if (quadrant == 1) {
		old_pid = mfspr(SPRN_PID);
		if (old_pid != pid)
			mtspr(SPRN_PID, pid);
	}
	isync();

	pagefault_disable();
	if (is_load)
		ret = raw_copy_from_user(to, from, n);
	else
		ret = raw_copy_to_user(to, from, n);
	pagefault_enable();

	/* switch the pid first to avoid running host with unallocated pid */
	if (quadrant == 1 && pid != old_pid)
		mtspr(SPRN_PID, old_pid);
	if (lpid != old_lpid)
		mtspr(SPRN_LPID, old_lpid);
	isync();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL_GPL(__kvmhv_copy_tofrom_guest_radix);

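/*
 * Note on the quadrant selection used by the helper above: the top two
 * bits of the effective address select the translation context, and
 * __kvmhv_copy_tofrom_guest_radix() ORs in quadrant 1 when accessing the
 * guest with a non-zero PID and quadrant 2 when the PID is zero.
 */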
static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
					  void *to, void *from, unsigned long n)
{
	int lpid = vcpu->kvm->arch.lpid;
	int pid = vcpu->arch.pid;

	/* This would cause a data segment intr so don't allow the access */
	if (eaddr & (0x3FFUL << 52))
		return -EINVAL;

	/* Should we be using the nested lpid? */
	if (vcpu->arch.nested)
		lpid = vcpu->arch.nested->shadow_lpid;

	/* If accessing quadrant 3 then pid is expected to be 0 */
	if (((eaddr >> 62) & 0x3) == 0x3)
		pid = 0;

	eaddr &= ~(0xFFFUL << 52);

	return __kvmhv_copy_tofrom_guest_radix(lpid, pid, eaddr, to, from, n);
}

long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to,
				 unsigned long n)
{
	long ret;

	ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n);
	if (ret > 0)
		memset(to + (n - ret), 0, ret);

	return ret;
}
EXPORT_SYMBOL_GPL(kvmhv_copy_from_guest_radix);

long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *from,
			       unsigned long n)
{
	return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n);
}
EXPORT_SYMBOL_GPL(kvmhv_copy_to_guest_radix);

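/*
 * Walk the radix tree rooted at 'root' in guest memory to translate the
 * effective address 'eaddr', filling in *gpte and, optionally, returning
 * the raw leaf PTE (or the faulting table address) via pte_ret_p.
 *
 * Worked example under the geometry above: rts must decode to 52 - 31 = 21,
 * so the walk starts with offset = 52 and consumes 13, 9, 9 and then 5 or
 * 9 bits of index per level, leaving offset as the leaf page shift
 * (16 for 64k or 12 for 4k pages) once a leaf PTE is found.
 */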
int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
			       struct kvmppc_pte *gpte, u64 root,
			       u64 *pte_ret_p)
{
	struct kvm *kvm = vcpu->kvm;
	int ret, level, ps;
	unsigned long rts, bits, offset, index;
	u64 pte, base, gpa;
	__be64 rpte;

	rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
		((root & RTS2_MASK) >> RTS2_SHIFT);
	bits = root & RPDS_MASK;
	base = root & RPDB_MASK;

	offset = rts + 31;

	/* Current implementations only support 52-bit space */
	if (offset != 52)
		return -EINVAL;

	/* Walk each level of the radix tree */
	for (level = 3; level >= 0; --level) {
		u64 addr;
		/* Check for a valid size */
		if (level && bits != p9_supported_radix_bits[level])
			return -EINVAL;
		if (level == 0 && !(bits == 5 || bits == 9))
			return -EINVAL;
		offset -= bits;
		index = (eaddr >> offset) & ((1UL << bits) - 1);
		/* Check that low bits of page table base are zero */
		if (base & ((1UL << (bits + 3)) - 1))
			return -EINVAL;
		/* Read the entry from guest memory */
		addr = base + (index * sizeof(rpte));
		ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
		if (ret) {
			if (pte_ret_p)
				*pte_ret_p = addr;
			return ret;
		}
		pte = __be64_to_cpu(rpte);
		if (!(pte & _PAGE_PRESENT))
			return -ENOENT;
		/* Check if a leaf entry */
		if (pte & _PAGE_PTE)
			break;
		/* Get ready to walk the next level */
		base = pte & RPDB_MASK;
		bits = pte & RPDS_MASK;
	}

	/* Need a leaf at lowest level; 512GB pages not supported */
	if (level < 0 || level == 3)
		return -EINVAL;

	/* We found a valid leaf PTE */
	/* Offset is now log base 2 of the page size */
	gpa = pte & 0x01fffffffffff000ul;
	if (gpa & ((1ul << offset) - 1))
		return -EINVAL;
	gpa |= eaddr & ((1ul << offset) - 1);
	for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
		if (offset == mmu_psize_defs[ps].shift)
			break;
	gpte->page_size = ps;
	gpte->page_shift = offset;

	gpte->eaddr = eaddr;
	gpte->raddr = gpa;

	/* Work out permissions */
	gpte->may_read = !!(pte & _PAGE_READ);
	gpte->may_write = !!(pte & _PAGE_WRITE);
	gpte->may_execute = !!(pte & _PAGE_EXEC);

	gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);

	if (pte_ret_p)
		*pte_ret_p = pte;

	return 0;
}

/*
 * Used to walk a partition or process table radix tree in guest memory.
 * Note: we exploit the fact that a partition table and a process
 * table have the same layout, that a partition-scoped page table and a
 * process-scoped page table have the same layout, and that the 2nd
 * doubleword of a partition table entry has the same layout as
 * the PTCR register.
 */
int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
				     struct kvmppc_pte *gpte, u64 table,
				     int table_index, u64 *pte_ret_p)
{
	struct kvm *kvm = vcpu->kvm;
	int ret;
	unsigned long size, ptbl, root;
	struct prtb_entry entry;

	if ((table & PRTS_MASK) > 24)
		return -EINVAL;
	size = 1ul << ((table & PRTS_MASK) + 12);

	/* Is the table big enough to contain this entry? */
	if ((table_index * sizeof(entry)) >= size)
		return -EINVAL;

	/* Read the table to find the root of the radix tree */
	ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
	ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
	if (ret)
		return ret;

	/* Root is stored in the first double word */
	root = be64_to_cpu(entry.prtb0);

	return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
}
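/*
 * Sizing note for the check above: the PRTS field gives the table size as
 * 1 << (PRTS + 12) bytes, so e.g. PRTS = 0 describes a 4kB table holding
 * 256 of the two-doubleword (16-byte) prtb_entry structures indexed by
 * table_index.
 */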

int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
			   struct kvmppc_pte *gpte, bool data, bool iswrite)
{
	u32 pid;
	u64 pte;
	int ret;

	/* Work out effective PID */
	switch (eaddr >> 62) {
	case 0:
		pid = vcpu->arch.pid;
		break;
	case 3:
		pid = 0;
		break;
	default:
		return -EINVAL;
	}

	ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
				vcpu->kvm->arch.process_table, pid, &pte);
	if (ret)
		return ret;

	/* Check privilege (applies only to process scoped translations) */
	if (kvmppc_get_msr(vcpu) & MSR_PR) {
		if (pte & _PAGE_PRIVILEGED) {
			gpte->may_read = 0;
			gpte->may_write = 0;
			gpte->may_execute = 0;
		}
	} else {
		if (!(pte & _PAGE_PRIVILEGED)) {
			/* Check AMR/IAMR to see if strict mode is in force */
			if (vcpu->arch.amr & (1ul << 62))
				gpte->may_read = 0;
			if (vcpu->arch.amr & (1ul << 63))
				gpte->may_write = 0;
			if (vcpu->arch.iamr & (1ul << 62))
				gpte->may_execute = 0;
		}
	}

	return 0;
}

void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
			     unsigned int pshift, unsigned int lpid)
{
	unsigned long psize = PAGE_SIZE;
	int psi;
	long rc;
	unsigned long rb;

	if (pshift)
		psize = 1UL << pshift;
	else
		pshift = PAGE_SHIFT;

	addr &= ~(psize - 1);

	if (!kvmhv_on_pseries()) {
		radix__flush_tlb_lpid_page(lpid, addr, psize);
		return;
	}

	psi = shift_to_mmu_psize(pshift);
	rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
	rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
				lpid, rb);
	if (rc)
		pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
}

static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
{
	long rc;

	if (!kvmhv_on_pseries()) {
		radix__flush_pwc_lpid(lpid);
		return;
	}

	rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
				lpid, TLBIEL_INVAL_SET_LPID);
	if (rc)
		pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
}

static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
				unsigned long clr, unsigned long set,
				unsigned long addr, unsigned int shift)
{
	return __radix_pte_update(ptep, clr, set);
}

void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
			     pte_t *ptep, pte_t pte)
{
	radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
}

static struct kmem_cache *kvm_pte_cache;
static struct kmem_cache *kvm_pmd_cache;

static pte_t *kvmppc_pte_alloc(void)
{
	return kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
}

static void kvmppc_pte_free(pte_t *ptep)
{
	kmem_cache_free(kvm_pte_cache, ptep);
}

/* Like pmd_huge() and pmd_large(), but works regardless of config options */
static inline int pmd_is_leaf(pmd_t pmd)
{
	return !!(pmd_val(pmd) & _PAGE_PTE);
}

static pmd_t *kvmppc_pmd_alloc(void)
{
	return kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL);
}

static void kvmppc_pmd_free(pmd_t *pmdp)
{
	kmem_cache_free(kvm_pmd_cache, pmdp);
}

/* Called with kvm->mmu_lock held */
void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
		      unsigned int shift,
		      const struct kvm_memory_slot *memslot,
		      unsigned int lpid)
{
	unsigned long old;
	unsigned long gfn = gpa >> PAGE_SHIFT;
	unsigned long page_size = PAGE_SIZE;
	unsigned long hpa;

	old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
	kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);

	/* The following only applies to L1 entries */
	if (lpid != kvm->arch.lpid)
		return;

	if (!memslot) {
		memslot = gfn_to_memslot(kvm, gfn);
		if (!memslot)
			return;
	}
	if (shift) { /* 1GB or 2MB page */
		page_size = 1ul << shift;
		if (shift == PMD_SHIFT)
			kvm->stat.num_2M_pages--;
		else if (shift == PUD_SHIFT)
			kvm->stat.num_1G_pages--;
	}

	gpa &= ~(page_size - 1);
	hpa = old & PTE_RPN_MASK;
	kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);

	if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
		kvmppc_update_dirty_map(memslot, gfn, page_size);
}

/*
 * kvmppc_free_p?d are used to free existing page tables, and recursively
 * descend and clear and free children.
 * Callers are responsible for flushing the PWC.
 *
 * When page tables are being unmapped/freed as part of page fault path
 * (full == false), ptes are not expected. There is code to unmap them
 * and emit a warning if encountered, but there may already be data
 * corruption due to the unexpected mappings.
 */
static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
				  unsigned int lpid)
{
	if (full) {
		memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE);
	} else {
		pte_t *p = pte;
		unsigned long it;

		for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
			if (pte_val(*p) == 0)
				continue;
			WARN_ON_ONCE(1);
			kvmppc_unmap_pte(kvm, p,
					 pte_pfn(*p) << PAGE_SHIFT,
					 PAGE_SHIFT, NULL, lpid);
		}
	}

	kvmppc_pte_free(pte);
}

static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
				  unsigned int lpid)
{
	unsigned long im;
	pmd_t *p = pmd;

	for (im = 0; im < PTRS_PER_PMD; ++im, ++p) {
		if (!pmd_present(*p))
			continue;
		if (pmd_is_leaf(*p)) {
			if (full) {
				pmd_clear(p);
			} else {
				WARN_ON_ONCE(1);
				kvmppc_unmap_pte(kvm, (pte_t *)p,
					 pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
					 PMD_SHIFT, NULL, lpid);
			}
		} else {
			pte_t *pte;

			pte = pte_offset_map(p, 0);
			kvmppc_unmap_free_pte(kvm, pte, full, lpid);
			pmd_clear(p);
		}
	}
	kvmppc_pmd_free(pmd);
}

static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
				  unsigned int lpid)
{
	unsigned long iu;
	pud_t *p = pud;

	for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++p) {
		if (!pud_present(*p))
			continue;
		if (pud_huge(*p)) {
			pud_clear(p);
		} else {
			pmd_t *pmd;

			pmd = pmd_offset(p, 0);
			kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
			pud_clear(p);
		}
	}
	pud_free(kvm->mm, pud);
}

void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
{
	unsigned long ig;

	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
		pud_t *pud;

		if (!pgd_present(*pgd))
			continue;
		pud = pud_offset(pgd, 0);
		kvmppc_unmap_free_pud(kvm, pud, lpid);
		pgd_clear(pgd);
	}
}

void kvmppc_free_radix(struct kvm *kvm)
{
	if (kvm->arch.pgtable) {
		kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
					  kvm->arch.lpid);
		pgd_free(kvm->mm, kvm->arch.pgtable);
		kvm->arch.pgtable = NULL;
	}
}

static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
				unsigned long gpa, unsigned int lpid)
{
	pte_t *pte = pte_offset_kernel(pmd, 0);

	/*
	 * Clearing the pmd entry then flushing the PWC ensures that the pte
	 * page will no longer be cached by the MMU, so it can be freed
	 * without flushing the PWC again.
	 */
	pmd_clear(pmd);
	kvmppc_radix_flush_pwc(kvm, lpid);

	kvmppc_unmap_free_pte(kvm, pte, false, lpid);
}

static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
				unsigned long gpa, unsigned int lpid)
{
	pmd_t *pmd = pmd_offset(pud, 0);

	/*
	 * Clearing the pud entry then flushing the PWC ensures that the pmd
	 * page and any children pte pages will no longer be cached by the
	 * MMU, so they can be freed without flushing the PWC again.
	 */
	pud_clear(pud);
	kvmppc_radix_flush_pwc(kvm, lpid);

	kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
}

/*
 * There are a number of bits which may differ between different faults to
 * the same partition scope entry. The RC bits can change in the course of
 * cleaning and aging, and the write bit can change: either the access could
 * have been upgraded, or a read fault could happen concurrently with a
 * write fault that sets those bits first.
 */
#define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))

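/*
 * kvmppc_create_pte() below uses PTE_BITS_MUST_MATCH when it finds a valid
 * entry already installed at the level it wants: for example, a racing
 * fault that differs only in _PAGE_DIRTY or _PAGE_ACCESSED does not trip
 * the WARN_ON_ONCE and simply has its extra bits OR-ed into the existing
 * PTE via kvmppc_radix_update_pte().
 */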
int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
		      unsigned long gpa, unsigned int level,
		      unsigned long mmu_seq, unsigned int lpid,
		      unsigned long *rmapp, struct rmap_nested **n_rmap)
{
	pgd_t *pgd;
	pud_t *pud, *new_pud = NULL;
	pmd_t *pmd, *new_pmd = NULL;
	pte_t *ptep, *new_ptep = NULL;
	int ret;

	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
	pgd = pgtable + pgd_index(gpa);
	pud = NULL;
	if (pgd_present(*pgd))
		pud = pud_offset(pgd, gpa);
	else
		new_pud = pud_alloc_one(kvm->mm, gpa);

	pmd = NULL;
	if (pud && pud_present(*pud) && !pud_huge(*pud))
		pmd = pmd_offset(pud, gpa);
	else if (level <= 1)
		new_pmd = kvmppc_pmd_alloc();

	if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd)))
		new_ptep = kvmppc_pte_alloc();

	/* Check if we might have been invalidated; let the guest retry if so */
	spin_lock(&kvm->mmu_lock);
	ret = -EAGAIN;
	if (mmu_notifier_retry(kvm, mmu_seq))
		goto out_unlock;

	/* Now traverse again under the lock and change the tree */
	ret = -ENOMEM;
	if (pgd_none(*pgd)) {
		if (!new_pud)
			goto out_unlock;
		pgd_populate(kvm->mm, pgd, new_pud);
		new_pud = NULL;
	}
	pud = pud_offset(pgd, gpa);
	if (pud_huge(*pud)) {
		unsigned long hgpa = gpa & PUD_MASK;

		/* Check if we raced and someone else has set the same thing */
		if (level == 2) {
			if (pud_raw(*pud) == pte_raw(pte)) {
				ret = 0;
				goto out_unlock;
			}
			/* Valid 1GB page here already, add our extra bits */
			WARN_ON_ONCE((pud_val(*pud) ^ pte_val(pte)) &
							PTE_BITS_MUST_MATCH);
			kvmppc_radix_update_pte(kvm, (pte_t *)pud,
					      0, pte_val(pte), hgpa, PUD_SHIFT);
			ret = 0;
			goto out_unlock;
		}
		/*
		 * If we raced with another CPU which has just put
		 * a 1GB pte in after we saw a pmd page, try again.
		 */
		if (!new_pmd) {
			ret = -EAGAIN;
			goto out_unlock;
		}
		/* Valid 1GB page here already, remove it */
		kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
				 lpid);
	}
	if (level == 2) {
		if (!pud_none(*pud)) {
			/*
			 * There's a page table page here, but we wanted to
			 * install a large page, so remove and free the page
			 * table page.
			 */
			kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
		}
		kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
		if (rmapp && n_rmap)
			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
		ret = 0;
		goto out_unlock;
	}
	if (pud_none(*pud)) {
		if (!new_pmd)
			goto out_unlock;
		pud_populate(kvm->mm, pud, new_pmd);
		new_pmd = NULL;
	}
	pmd = pmd_offset(pud, gpa);
	if (pmd_is_leaf(*pmd)) {
		unsigned long lgpa = gpa & PMD_MASK;

		/* Check if we raced and someone else has set the same thing */
		if (level == 1) {
			if (pmd_raw(*pmd) == pte_raw(pte)) {
				ret = 0;
				goto out_unlock;
			}
			/* Valid 2MB page here already, add our extra bits */
			WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
							PTE_BITS_MUST_MATCH);
			kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
					0, pte_val(pte), lgpa, PMD_SHIFT);
			ret = 0;
			goto out_unlock;
		}

		/*
		 * If we raced with another CPU which has just put
		 * a 2MB pte in after we saw a pte page, try again.
		 */
		if (!new_ptep) {
			ret = -EAGAIN;
			goto out_unlock;
		}
		/* Valid 2MB page here already, remove it */
		kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
				 lpid);
	}
	if (level == 1) {
		if (!pmd_none(*pmd)) {
			/*
			 * There's a page table page here, but we wanted to
			 * install a large page, so remove and free the page
			 * table page.
			 */
			kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
		}
		kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
		if (rmapp && n_rmap)
			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
		ret = 0;
		goto out_unlock;
	}
	if (pmd_none(*pmd)) {
		if (!new_ptep)
			goto out_unlock;
		pmd_populate(kvm->mm, pmd, new_ptep);
		new_ptep = NULL;
	}
	ptep = pte_offset_kernel(pmd, gpa);
	if (pte_present(*ptep)) {
		/* Check if someone else set the same thing */
		if (pte_raw(*ptep) == pte_raw(pte)) {
			ret = 0;
			goto out_unlock;
		}
		/* Valid page here already, add our extra bits */
		WARN_ON_ONCE((pte_val(*ptep) ^ pte_val(pte)) &
							PTE_BITS_MUST_MATCH);
		kvmppc_radix_update_pte(kvm, ptep, 0, pte_val(pte), gpa, 0);
		ret = 0;
		goto out_unlock;
	}
	kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
	if (rmapp && n_rmap)
		kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
	ret = 0;

 out_unlock:
	spin_unlock(&kvm->mmu_lock);
	if (new_pud)
		pud_free(kvm->mm, new_pud);
	if (new_pmd)
		kvmppc_pmd_free(new_pmd);
	if (new_ptep)
		kvmppc_pte_free(new_ptep);
	return ret;
}

bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, bool writing,
			     unsigned long gpa, unsigned int lpid)
{
	unsigned long pgflags;
	unsigned int shift;
	pte_t *ptep;

	/*
	 * Need to set an R or C bit in the 2nd-level tables;
	 * since we are just helping out the hardware here,
	 * it is sufficient to do what the hardware does.
	 */
	pgflags = _PAGE_ACCESSED;
	if (writing)
		pgflags |= _PAGE_DIRTY;
	/*
	 * We are walking the secondary (partition-scoped) page table here.
	 * We can do this without disabling irq because the Linux MM
	 * subsystem doesn't do THP splits and collapses on this tree.
	 */
	ptep = __find_linux_pte(pgtable, gpa, NULL, &shift);
	if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
		kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
		return true;
	}
	return false;
}

int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
				   unsigned long gpa,
				   struct kvm_memory_slot *memslot,
				   bool writing, bool kvm_ro,
				   pte_t *inserted_pte, unsigned int *levelp)
{
	struct kvm *kvm = vcpu->kvm;
	struct page *page = NULL;
	unsigned long mmu_seq;
	unsigned long hva, gfn = gpa >> PAGE_SHIFT;
	bool upgrade_write = false;
	bool *upgrade_p = &upgrade_write;
	pte_t pte, *ptep;
	unsigned int shift, level;
	int ret;
	bool large_enable;

	/* used to check for invalidations in progress */
	mmu_seq = kvm->mmu_notifier_seq;
	smp_rmb();

	/*
	 * Do a fast check first, since __gfn_to_pfn_memslot doesn't
	 * do it with !atomic && !async, which is how we call it.
	 * We always ask for write permission since the common case
	 * is that the page is writable.
	 */
	hva = gfn_to_hva_memslot(memslot, gfn);
	if (!kvm_ro && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
		upgrade_write = true;
	} else {
		unsigned long pfn;

		/* Call KVM generic code to do the slow-path check */
		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
					   writing, upgrade_p);
		if (is_error_noslot_pfn(pfn))
			return -EFAULT;
		page = NULL;
		if (pfn_valid(pfn)) {
			page = pfn_to_page(pfn);
			if (PageReserved(page))
				page = NULL;
		}
	}

	/*
	 * Read the PTE from the process' radix tree and use that
	 * so we get the shift and attribute bits.
	 */
	local_irq_disable();
	ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
	/*
	 * If the PTE disappeared temporarily due to a THP
	 * collapse, just return and let the guest try again.
	 */
	if (!ptep) {
		local_irq_enable();
		if (page)
			put_page(page);
		return RESUME_GUEST;
	}
	pte = *ptep;
	local_irq_enable();

	/* If we're logging dirty pages, always map single pages */
	large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES);

	/* Get pte level from shift/size */
	if (large_enable && shift == PUD_SHIFT &&
	    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
	    (hva & (PUD_SIZE - PAGE_SIZE))) {
		level = 2;
	} else if (large_enable && shift == PMD_SHIFT &&
		   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
		   (hva & (PMD_SIZE - PAGE_SIZE))) {
		level = 1;
	} else {
		level = 0;
		if (shift > PAGE_SHIFT) {
			/*
			 * If the pte maps more than one page, bring over
			 * bits from the virtual address to get the real
			 * address of the specific single page we want.
			 */
			unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
			pte = __pte(pte_val(pte) | (hva & rpnmask));
		}
	}

	pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
	if (writing || upgrade_write) {
		if (pte_val(pte) & _PAGE_WRITE)
			pte = __pte(pte_val(pte) | _PAGE_DIRTY);
	} else {
		pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
	}

	/* Allocate space in the tree and write the PTE */
	ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
				mmu_seq, kvm->arch.lpid, NULL, NULL);
	if (inserted_pte)
		*inserted_pte = pte;
	if (levelp)
		*levelp = level;

	if (page) {
		if (!ret && (pte_val(pte) & _PAGE_WRITE))
			set_page_dirty_lock(page);
		put_page(page);
	}

	/* Increment number of large pages if we (successfully) inserted one */
	if (!ret) {
		if (level == 1)
			kvm->stat.num_2M_pages++;
		else if (level == 2)
			kvm->stat.num_1G_pages++;
	}

	return ret;
}
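/*
 * Mapping-size selection above, by example: a Linux PTE with
 * shift == PMD_SHIFT whose gpa and hva are congruent modulo PMD_SIZE is
 * inserted as a level-1 (2MB) mapping; otherwise level 0 is used and, if
 * the source pte covers more than one page, the low virtual-address bits
 * are folded in so the specific PAGE_SIZE page wanted is mapped.
 */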

int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
				   unsigned long ea, unsigned long dsisr)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long gpa, gfn;
	struct kvm_memory_slot *memslot;
	long ret;
	bool writing = !!(dsisr & DSISR_ISSTORE);
	bool kvm_ro = false;

	/* Check for unusual errors */
	if (dsisr & DSISR_UNSUPP_MMU) {
		pr_err("KVM: Got unsupported MMU fault\n");
		return -EFAULT;
	}
	if (dsisr & DSISR_BADACCESS) {
		/* Reflect to the guest as DSI */
		pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
		kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
		return RESUME_GUEST;
	}

	/* Translate the logical address */
	gpa = vcpu->arch.fault_gpa & ~0xfffUL;
	gpa &= ~0xF000000000000000ul;
	gfn = gpa >> PAGE_SHIFT;
	if (!(dsisr & DSISR_PRTABLE_FAULT))
		gpa |= ea & 0xfff;

	/* Get the corresponding memslot */
	memslot = gfn_to_memslot(kvm, gfn);

	/* No memslot means it's an emulated MMIO region */
	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
		if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
			     DSISR_SET_RC)) {
			/*
			 * Bad address in guest page table tree, or other
			 * unusual error - reflect it to the guest as DSI.
			 */
			kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
			return RESUME_GUEST;
		}
		return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, writing);
	}

	if (memslot->flags & KVM_MEM_READONLY) {
		if (writing) {
			/* give the guest a DSI */
			kvmppc_core_queue_data_storage(vcpu, ea, DSISR_ISSTORE |
						       DSISR_PROTFAULT);
			return RESUME_GUEST;
		}
		kvm_ro = true;
	}

	/* Failed to set the reference/change bits */
	if (dsisr & DSISR_SET_RC) {
		spin_lock(&kvm->mmu_lock);
		if (kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable,
					    writing, gpa, kvm->arch.lpid))
			dsisr &= ~DSISR_SET_RC;
		spin_unlock(&kvm->mmu_lock);

		if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
			       DSISR_PROTFAULT | DSISR_SET_RC)))
			return RESUME_GUEST;
	}

	/* Try to insert a pte */
	ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
					     kvm_ro, NULL, NULL);

	if (ret == 0 || ret == -EAGAIN)
		ret = RESUME_GUEST;
	return ret;
}

/* Called with kvm->mmu_lock held */
int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
		    unsigned long gfn)
{
	pte_t *ptep;
	unsigned long gpa = gfn << PAGE_SHIFT;
	unsigned int shift;

	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
	if (ptep && pte_present(*ptep))
		kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
				 kvm->arch.lpid);
	return 0;
}

/* Called with kvm->mmu_lock held */
int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
		  unsigned long gfn)
{
	pte_t *ptep;
	unsigned long gpa = gfn << PAGE_SHIFT;
	unsigned int shift;
	int ref = 0;
	unsigned long old, *rmapp;

	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
	if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
					      gpa, shift);
		/* XXX need to flush tlb here? */
		/* Also clear bit in ptes in shadow pgtable for nested guests */
		rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
		kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0,
					       old & PTE_RPN_MASK,
					       1UL << shift);
		ref = 1;
	}
	return ref;
}

/* Called with kvm->mmu_lock held */
int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
		       unsigned long gfn)
{
	pte_t *ptep;
	unsigned long gpa = gfn << PAGE_SHIFT;
	unsigned int shift;
	int ref = 0;

	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
	if (ptep && pte_present(*ptep) && pte_young(*ptep))
		ref = 1;
	return ref;
}

/* Returns the number of PAGE_SIZE pages that are dirty */
static int kvm_radix_test_clear_dirty(struct kvm *kvm,
				struct kvm_memory_slot *memslot, int pagenum)
{
	unsigned long gfn = memslot->base_gfn + pagenum;
	unsigned long gpa = gfn << PAGE_SHIFT;
	pte_t *ptep;
	unsigned int shift;
	int ret = 0;
	unsigned long old, *rmapp;

	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
	if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) {
		ret = 1;
		if (shift)
			ret = 1 << (shift - PAGE_SHIFT);
		spin_lock(&kvm->mmu_lock);
		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
					      gpa, shift);
		kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
		/* Also clear bit in ptes in shadow pgtable for nested guests */
		rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
		kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_DIRTY, 0,
					       old & PTE_RPN_MASK,
					       1UL << shift);
		spin_unlock(&kvm->mmu_lock);
	}
	return ret;
}
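/*
 * The return value above counts PAGE_SIZE units: for example, a dirty 2MB
 * mapping (shift == 21 with 4k base pages) reports 1 << (21 - 12) = 512
 * dirty pages, which kvmppc_hv_get_dirty_log_radix() below uses to set
 * that many bits and skip ahead by the same amount.
 */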

long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
			struct kvm_memory_slot *memslot, unsigned long *map)
{
	unsigned long i, j;
	int npages;

	for (i = 0; i < memslot->npages; i = j) {
		npages = kvm_radix_test_clear_dirty(kvm, memslot, i);

		/*
		 * Note that if npages > 0 then i must be a multiple of npages,
		 * since huge pages are only used to back the guest at guest
		 * real addresses that are a multiple of their size.
		 * Since we have at most one PTE covering any given guest
		 * real address, if npages > 1 we can skip to i + npages.
		 */
		j = i + 1;
		if (npages) {
			set_dirty_bits(map, i, npages);
			j = i + npages;
		}
	}
	return 0;
}

void kvmppc_radix_flush_memslot(struct kvm *kvm,
				const struct kvm_memory_slot *memslot)
{
	unsigned long n;
	pte_t *ptep;
	unsigned long gpa;
	unsigned int shift;

	gpa = memslot->base_gfn << PAGE_SHIFT;
	spin_lock(&kvm->mmu_lock);
	for (n = memslot->npages; n; --n) {
		ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
		if (ptep && pte_present(*ptep))
			kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
					 kvm->arch.lpid);
		gpa += PAGE_SIZE;
	}
	spin_unlock(&kvm->mmu_lock);
}

static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
				 int psize, int *indexp)
{
	if (!mmu_psize_defs[psize].shift)
		return;
	info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift |
		(mmu_psize_defs[psize].ap << 29);
	++(*indexp);
}

int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
{
	int i;

	if (!radix_enabled())
		return -EINVAL;
	memset(info, 0, sizeof(*info));

	/* 4k page size */
	info->geometries[0].page_shift = 12;
	info->geometries[0].level_bits[0] = 9;
	for (i = 1; i < 4; ++i)
		info->geometries[0].level_bits[i] = p9_supported_radix_bits[i];
	/* 64k page size */
	info->geometries[1].page_shift = 16;
	for (i = 0; i < 4; ++i)
		info->geometries[1].level_bits[i] = p9_supported_radix_bits[i];

	i = 0;
	add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i);
	add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i);
	add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i);
	add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i);

	return 0;
}
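/*
 * Each ap_encodings entry above packs the page shift into the low bits and
 * the actual-page-size (AP) value, shifted left by 29 bits, into the high
 * bits; e.g. the 64k entry carries shift 16 together with
 * mmu_psize_defs[MMU_PAGE_64K].ap.
 */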

int kvmppc_init_vm_radix(struct kvm *kvm)
{
	kvm->arch.pgtable = pgd_alloc(kvm->mm);
	if (!kvm->arch.pgtable)
		return -ENOMEM;
	return 0;
}

static void pte_ctor(void *addr)
{
	memset(addr, 0, RADIX_PTE_TABLE_SIZE);
}

static void pmd_ctor(void *addr)
{
	memset(addr, 0, RADIX_PMD_TABLE_SIZE);
}

struct debugfs_radix_state {
	struct kvm	*kvm;
	struct mutex	mutex;
	unsigned long	gpa;
	int		lpid;
	int		chars_left;
	int		buf_index;
	char		buf[128];
	u8		hdr;
};

static int debugfs_radix_open(struct inode *inode, struct file *file)
{
	struct kvm *kvm = inode->i_private;
	struct debugfs_radix_state *p;

	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	kvm_get_kvm(kvm);
	p->kvm = kvm;
	mutex_init(&p->mutex);
	file->private_data = p;

	return nonseekable_open(inode, file);
}

static int debugfs_radix_release(struct inode *inode, struct file *file)
{
	struct debugfs_radix_state *p = file->private_data;

	kvm_put_kvm(p->kvm);
	kfree(p);
	return 0;
}

static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
				  size_t len, loff_t *ppos)
{
	struct debugfs_radix_state *p = file->private_data;
	ssize_t ret, r;
	unsigned long n;
	struct kvm *kvm;
	unsigned long gpa;
	pgd_t *pgt;
	struct kvm_nested_guest *nested;
	pgd_t pgd, *pgdp;
	pud_t pud, *pudp;
	pmd_t pmd, *pmdp;
	pte_t *ptep;
	int shift;
	unsigned long pte;

	kvm = p->kvm;
	if (!kvm_is_radix(kvm))
		return 0;

	ret = mutex_lock_interruptible(&p->mutex);
	if (ret)
		return ret;

	if (p->chars_left) {
		n = p->chars_left;
		if (n > len)
			n = len;
		r = copy_to_user(buf, p->buf + p->buf_index, n);
		n -= r;
		p->chars_left -= n;
		p->buf_index += n;
		buf += n;
		len -= n;
		ret = n;
		if (r) {
			if (!n)
				ret = -EFAULT;
			goto out;
		}
	}

	gpa = p->gpa;
	nested = NULL;
	pgt = NULL;
	while (len != 0 && p->lpid >= 0) {
		if (gpa >= RADIX_PGTABLE_RANGE) {
			gpa = 0;
			pgt = NULL;
			if (nested) {
				kvmhv_put_nested(nested);
				nested = NULL;
			}
			p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
			p->hdr = 0;
			if (p->lpid < 0)
				break;
		}
		if (!pgt) {
			if (p->lpid == 0) {
				pgt = kvm->arch.pgtable;
			} else {
				nested = kvmhv_get_nested(kvm, p->lpid, false);
				if (!nested) {
					gpa = RADIX_PGTABLE_RANGE;
					continue;
				}
				pgt = nested->shadow_pgtable;
			}
		}
		n = 0;
		if (!p->hdr) {
			if (p->lpid > 0)
				n = scnprintf(p->buf, sizeof(p->buf),
					      "\nNested LPID %d: ", p->lpid);
			n += scnprintf(p->buf + n, sizeof(p->buf) - n,
				      "pgdir: %lx\n", (unsigned long)pgt);
			p->hdr = 1;
			goto copy;
		}

		pgdp = pgt + pgd_index(gpa);
		pgd = READ_ONCE(*pgdp);
		if (!(pgd_val(pgd) & _PAGE_PRESENT)) {
			gpa = (gpa & PGDIR_MASK) + PGDIR_SIZE;
			continue;
		}

		pudp = pud_offset(&pgd, gpa);
		pud = READ_ONCE(*pudp);
		if (!(pud_val(pud) & _PAGE_PRESENT)) {
			gpa = (gpa & PUD_MASK) + PUD_SIZE;
			continue;
		}
		if (pud_val(pud) & _PAGE_PTE) {
			pte = pud_val(pud);
			shift = PUD_SHIFT;
			goto leaf;
		}

		pmdp = pmd_offset(&pud, gpa);
		pmd = READ_ONCE(*pmdp);
		if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
			gpa = (gpa & PMD_MASK) + PMD_SIZE;
			continue;
		}
		if (pmd_val(pmd) & _PAGE_PTE) {
			pte = pmd_val(pmd);
			shift = PMD_SHIFT;
			goto leaf;
		}

		ptep = pte_offset_kernel(&pmd, gpa);
		pte = pte_val(READ_ONCE(*ptep));
		if (!(pte & _PAGE_PRESENT)) {
			gpa += PAGE_SIZE;
			continue;
		}
		shift = PAGE_SHIFT;
	leaf:
		n = scnprintf(p->buf, sizeof(p->buf),
			      " %lx: %lx %d\n", gpa, pte, shift);
		gpa += 1ul << shift;
	copy:
		p->chars_left = n;
		if (n > len)
			n = len;
		r = copy_to_user(buf, p->buf, n);
		n -= r;
		p->chars_left -= n;
		p->buf_index = n;
		buf += n;
		len -= n;
		ret += n;
		if (r) {
			if (!ret)
				ret = -EFAULT;
			break;
		}
	}
	p->gpa = gpa;
	if (nested)
		kvmhv_put_nested(nested);

 out:
	mutex_unlock(&p->mutex);
	return ret;
}

static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
				   size_t len, loff_t *ppos)
{
	return -EACCES;
}

static const struct file_operations debugfs_radix_fops = {
	.owner	 = THIS_MODULE,
	.open	 = debugfs_radix_open,
	.release = debugfs_radix_release,
	.read	 = debugfs_radix_read,
	.write	 = debugfs_radix_write,
	.llseek	 = generic_file_llseek,
};

void kvmhv_radix_debugfs_init(struct kvm *kvm)
{
	kvm->arch.radix_dentry = debugfs_create_file("radix", 0400,
						     kvm->arch.debugfs_dir, kvm,
						     &debugfs_radix_fops);
}

int kvmppc_radix_init(void)
{
	unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;

	kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
	if (!kvm_pte_cache)
		return -ENOMEM;

	size = sizeof(void *) << RADIX_PMD_INDEX_SIZE;

	kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor);
	if (!kvm_pmd_cache) {
		kmem_cache_destroy(kvm_pte_cache);
		return -ENOMEM;
	}

	return 0;
}

void kvmppc_radix_exit(void)
{
	kmem_cache_destroy(kvm_pte_cache);
	kmem_cache_destroy(kvm_pmd_cache);
}