]>
Commit | Line | Data |
---|---|---|
9e04ba69 PM |
1 | /* |
2 | * This program is free software; you can redistribute it and/or modify | |
3 | * it under the terms of the GNU General Public License, version 2, as | |
4 | * published by the Free Software Foundation. | |
5 | * | |
6 | * Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> | |
7 | */ | |
8 | ||
9 | #include <linux/types.h> | |
10 | #include <linux/string.h> | |
11 | #include <linux/kvm.h> | |
12 | #include <linux/kvm_host.h> | |
9a94d3ee PM |
13 | #include <linux/anon_inodes.h> |
14 | #include <linux/file.h> | |
15 | #include <linux/debugfs.h> | |
9e04ba69 PM |
16 | |
17 | #include <asm/kvm_ppc.h> | |
18 | #include <asm/kvm_book3s.h> | |
19 | #include <asm/page.h> | |
20 | #include <asm/mmu.h> | |
21 | #include <asm/pgtable.h> | |
22 | #include <asm/pgalloc.h> | |
94171b19 | 23 | #include <asm/pte-walk.h> |
9e04ba69 PM |
24 | |
25 | /* | |
26 | * Supported radix tree geometry. | |
27 | * Like p9, we support either 5 or 9 bits at the first (lowest) level, | |
28 | * for a page size of 64k or 4k. | |
29 | */ | |
/* Index bits required at radix tree levels 1..3; level 0 is checked separately. */
static const int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
31 | ||
6ff887b8 SJS |
32 | unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid, |
33 | gva_t eaddr, void *to, void *from, | |
34 | unsigned long n) | |
d7b45615 | 35 | { |
f4607722 | 36 | int uninitialized_var(old_pid), old_lpid; |
d7b45615 | 37 | unsigned long quadrant, ret = n; |
d7b45615 SJS |
38 | bool is_load = !!to; |
39 | ||
95d386c2 SJS |
40 | /* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */ |
41 | if (kvmhv_on_pseries()) | |
42 | return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr, | |
43 | __pa(to), __pa(from), n); | |
d7b45615 SJS |
44 | |
45 | quadrant = 1; | |
46 | if (!pid) | |
47 | quadrant = 2; | |
48 | if (is_load) | |
49 | from = (void *) (eaddr | (quadrant << 62)); | |
50 | else | |
51 | to = (void *) (eaddr | (quadrant << 62)); | |
52 | ||
53 | preempt_disable(); | |
54 | ||
55 | /* switch the lpid first to avoid running host with unallocated pid */ | |
56 | old_lpid = mfspr(SPRN_LPID); | |
57 | if (old_lpid != lpid) | |
58 | mtspr(SPRN_LPID, lpid); | |
59 | if (quadrant == 1) { | |
60 | old_pid = mfspr(SPRN_PID); | |
61 | if (old_pid != pid) | |
62 | mtspr(SPRN_PID, pid); | |
63 | } | |
64 | isync(); | |
65 | ||
66 | pagefault_disable(); | |
67 | if (is_load) | |
68 | ret = raw_copy_from_user(to, from, n); | |
69 | else | |
70 | ret = raw_copy_to_user(to, from, n); | |
71 | pagefault_enable(); | |
72 | ||
73 | /* switch the pid first to avoid running host with unallocated pid */ | |
74 | if (quadrant == 1 && pid != old_pid) | |
75 | mtspr(SPRN_PID, old_pid); | |
76 | if (lpid != old_lpid) | |
77 | mtspr(SPRN_LPID, old_lpid); | |
78 | isync(); | |
79 | ||
80 | preempt_enable(); | |
81 | ||
82 | return ret; | |
83 | } | |
6ff887b8 | 84 | EXPORT_SYMBOL_GPL(__kvmhv_copy_tofrom_guest_radix); |
d7b45615 SJS |
85 | |
86 | static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, | |
87 | void *to, void *from, unsigned long n) | |
88 | { | |
89 | int lpid = vcpu->kvm->arch.lpid; | |
90 | int pid = vcpu->arch.pid; | |
91 | ||
92 | /* This would cause a data segment intr so don't allow the access */ | |
93 | if (eaddr & (0x3FFUL << 52)) | |
94 | return -EINVAL; | |
95 | ||
96 | /* Should we be using the nested lpid */ | |
97 | if (vcpu->arch.nested) | |
98 | lpid = vcpu->arch.nested->shadow_lpid; | |
99 | ||
100 | /* If accessing quadrant 3 then pid is expected to be 0 */ | |
101 | if (((eaddr >> 62) & 0x3) == 0x3) | |
102 | pid = 0; | |
103 | ||
104 | eaddr &= ~(0xFFFUL << 52); | |
105 | ||
106 | return __kvmhv_copy_tofrom_guest_radix(lpid, pid, eaddr, to, from, n); | |
107 | } | |
108 | ||
109 | long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to, | |
110 | unsigned long n) | |
111 | { | |
112 | long ret; | |
113 | ||
114 | ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n); | |
115 | if (ret > 0) | |
116 | memset(to + (n - ret), 0, ret); | |
117 | ||
118 | return ret; | |
119 | } | |
120 | EXPORT_SYMBOL_GPL(kvmhv_copy_from_guest_radix); | |
121 | ||
122 | long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *from, | |
123 | unsigned long n) | |
124 | { | |
125 | return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n); | |
126 | } | |
127 | EXPORT_SYMBOL_GPL(kvmhv_copy_to_guest_radix); | |
128 | ||
fd10be25 SJS |
129 | int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr, |
130 | struct kvmppc_pte *gpte, u64 root, | |
131 | u64 *pte_ret_p) | |
9e04ba69 PM |
132 | { |
133 | struct kvm *kvm = vcpu->kvm; | |
9e04ba69 | 134 | int ret, level, ps; |
fd10be25 | 135 | unsigned long rts, bits, offset, index; |
9811c78e SJS |
136 | u64 pte, base, gpa; |
137 | __be64 rpte; | |
9e04ba69 | 138 | |
9e04ba69 PM |
139 | rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) | |
140 | ((root & RTS2_MASK) >> RTS2_SHIFT); | |
141 | bits = root & RPDS_MASK; | |
9811c78e | 142 | base = root & RPDB_MASK; |
9e04ba69 | 143 | |
9e04ba69 | 144 | offset = rts + 31; |
9e04ba69 | 145 | |
9811c78e | 146 | /* Current implementations only support 52-bit space */ |
9e04ba69 PM |
147 | if (offset != 52) |
148 | return -EINVAL; | |
149 | ||
9811c78e | 150 | /* Walk each level of the radix tree */ |
9e04ba69 | 151 | for (level = 3; level >= 0; --level) { |
fd10be25 | 152 | u64 addr; |
9811c78e | 153 | /* Check a valid size */ |
9e04ba69 PM |
154 | if (level && bits != p9_supported_radix_bits[level]) |
155 | return -EINVAL; | |
156 | if (level == 0 && !(bits == 5 || bits == 9)) | |
157 | return -EINVAL; | |
158 | offset -= bits; | |
159 | index = (eaddr >> offset) & ((1UL << bits) - 1); | |
9811c78e SJS |
160 | /* Check that low bits of page table base are zero */ |
161 | if (base & ((1UL << (bits + 3)) - 1)) | |
9e04ba69 | 162 | return -EINVAL; |
9811c78e | 163 | /* Read the entry from guest memory */ |
fd10be25 SJS |
164 | addr = base + (index * sizeof(rpte)); |
165 | ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte)); | |
166 | if (ret) { | |
167 | if (pte_ret_p) | |
168 | *pte_ret_p = addr; | |
9e04ba69 | 169 | return ret; |
fd10be25 | 170 | } |
9e04ba69 PM |
171 | pte = __be64_to_cpu(rpte); |
172 | if (!(pte & _PAGE_PRESENT)) | |
173 | return -ENOENT; | |
9811c78e | 174 | /* Check if a leaf entry */ |
9e04ba69 PM |
175 | if (pte & _PAGE_PTE) |
176 | break; | |
9811c78e SJS |
177 | /* Get ready to walk the next level */ |
178 | base = pte & RPDB_MASK; | |
179 | bits = pte & RPDS_MASK; | |
9e04ba69 | 180 | } |
9811c78e SJS |
181 | |
182 | /* Need a leaf at lowest level; 512GB pages not supported */ | |
9e04ba69 PM |
183 | if (level < 0 || level == 3) |
184 | return -EINVAL; | |
185 | ||
9811c78e SJS |
186 | /* We found a valid leaf PTE */ |
187 | /* Offset is now log base 2 of the page size */ | |
9e04ba69 PM |
188 | gpa = pte & 0x01fffffffffff000ul; |
189 | if (gpa & ((1ul << offset) - 1)) | |
190 | return -EINVAL; | |
9811c78e | 191 | gpa |= eaddr & ((1ul << offset) - 1); |
9e04ba69 PM |
192 | for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps) |
193 | if (offset == mmu_psize_defs[ps].shift) | |
194 | break; | |
195 | gpte->page_size = ps; | |
fd10be25 | 196 | gpte->page_shift = offset; |
9e04ba69 PM |
197 | |
198 | gpte->eaddr = eaddr; | |
199 | gpte->raddr = gpa; | |
200 | ||
201 | /* Work out permissions */ | |
202 | gpte->may_read = !!(pte & _PAGE_READ); | |
203 | gpte->may_write = !!(pte & _PAGE_WRITE); | |
204 | gpte->may_execute = !!(pte & _PAGE_EXEC); | |
9811c78e | 205 | |
fd10be25 SJS |
206 | gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY); |
207 | ||
9811c78e SJS |
208 | if (pte_ret_p) |
209 | *pte_ret_p = pte; | |
210 | ||
211 | return 0; | |
212 | } | |
213 | ||
fd10be25 SJS |
214 | /* |
215 | * Used to walk a partition or process table radix tree in guest memory | |
216 | * Note: We exploit the fact that a partition table and a process | |
217 | * table have the same layout, a partition-scoped page table and a | |
218 | * process-scoped page table have the same layout, and the 2nd | |
219 | * doubleword of a partition table entry has the same layout as | |
220 | * the PTCR register. | |
221 | */ | |
222 | int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr, | |
223 | struct kvmppc_pte *gpte, u64 table, | |
224 | int table_index, u64 *pte_ret_p) | |
225 | { | |
226 | struct kvm *kvm = vcpu->kvm; | |
227 | int ret; | |
228 | unsigned long size, ptbl, root; | |
229 | struct prtb_entry entry; | |
230 | ||
231 | if ((table & PRTS_MASK) > 24) | |
232 | return -EINVAL; | |
233 | size = 1ul << ((table & PRTS_MASK) + 12); | |
234 | ||
235 | /* Is the table big enough to contain this entry? */ | |
236 | if ((table_index * sizeof(entry)) >= size) | |
237 | return -EINVAL; | |
238 | ||
239 | /* Read the table to find the root of the radix tree */ | |
240 | ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry)); | |
241 | ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry)); | |
242 | if (ret) | |
243 | return ret; | |
244 | ||
245 | /* Root is stored in the first double word */ | |
246 | root = be64_to_cpu(entry.prtb0); | |
247 | ||
248 | return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p); | |
249 | } | |
250 | ||
9811c78e SJS |
251 | int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, |
252 | struct kvmppc_pte *gpte, bool data, bool iswrite) | |
253 | { | |
254 | u32 pid; | |
255 | u64 pte; | |
256 | int ret; | |
257 | ||
258 | /* Work out effective PID */ | |
259 | switch (eaddr >> 62) { | |
260 | case 0: | |
261 | pid = vcpu->arch.pid; | |
262 | break; | |
263 | case 3: | |
264 | pid = 0; | |
265 | break; | |
266 | default: | |
267 | return -EINVAL; | |
268 | } | |
269 | ||
270 | ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte, | |
271 | vcpu->kvm->arch.process_table, pid, &pte); | |
272 | if (ret) | |
273 | return ret; | |
274 | ||
275 | /* Check privilege (applies only to process scoped translations) */ | |
9e04ba69 PM |
276 | if (kvmppc_get_msr(vcpu) & MSR_PR) { |
277 | if (pte & _PAGE_PRIVILEGED) { | |
278 | gpte->may_read = 0; | |
279 | gpte->may_write = 0; | |
280 | gpte->may_execute = 0; | |
281 | } | |
282 | } else { | |
283 | if (!(pte & _PAGE_PRIVILEGED)) { | |
284 | /* Check AMR/IAMR to see if strict mode is in force */ | |
285 | if (vcpu->arch.amr & (1ul << 62)) | |
286 | gpte->may_read = 0; | |
287 | if (vcpu->arch.amr & (1ul << 63)) | |
288 | gpte->may_write = 0; | |
289 | if (vcpu->arch.iamr & (1ul << 62)) | |
290 | gpte->may_execute = 0; | |
291 | } | |
292 | } | |
293 | ||
294 | return 0; | |
295 | } | |
296 | ||
90165d3d SJS |
297 | void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr, |
298 | unsigned int pshift, unsigned int lpid) | |
5a319350 | 299 | { |
d91cb39f | 300 | unsigned long psize = PAGE_SIZE; |
690ed4ca PM |
301 | int psi; |
302 | long rc; | |
303 | unsigned long rb; | |
d91cb39f NP |
304 | |
305 | if (pshift) | |
306 | psize = 1UL << pshift; | |
690ed4ca PM |
307 | else |
308 | pshift = PAGE_SHIFT; | |
d91cb39f NP |
309 | |
310 | addr &= ~(psize - 1); | |
690ed4ca PM |
311 | |
312 | if (!kvmhv_on_pseries()) { | |
313 | radix__flush_tlb_lpid_page(lpid, addr, psize); | |
314 | return; | |
315 | } | |
316 | ||
317 | psi = shift_to_mmu_psize(pshift); | |
318 | rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58)); | |
319 | rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1), | |
320 | lpid, rb); | |
321 | if (rc) | |
322 | pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc); | |
5a319350 PM |
323 | } |
324 | ||
fd10be25 | 325 | static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid) |
c4c8a764 | 326 | { |
690ed4ca PM |
327 | long rc; |
328 | ||
329 | if (!kvmhv_on_pseries()) { | |
330 | radix__flush_pwc_lpid(lpid); | |
331 | return; | |
332 | } | |
333 | ||
334 | rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1), | |
335 | lpid, TLBIEL_INVAL_SET_LPID); | |
336 | if (rc) | |
337 | pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc); | |
c4c8a764 PM |
338 | } |
339 | ||
878cf2bb | 340 | static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep, |
8f7b79b8 PM |
341 | unsigned long clr, unsigned long set, |
342 | unsigned long addr, unsigned int shift) | |
5a319350 | 343 | { |
2bf1071a | 344 | return __radix_pte_update(ptep, clr, set); |
5a319350 PM |
345 | } |
346 | ||
347 | void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr, | |
348 | pte_t *ptep, pte_t pte) | |
349 | { | |
350 | radix__set_pte_at(kvm->mm, addr, ptep, pte, 0); | |
351 | } | |
352 | ||
/*
 * Slab caches from which kvmppc_pte_alloc() and kvmppc_pmd_alloc() take
 * their pte_t / pmd_t objects, and to which the matching free helpers
 * return them.
 */
static struct kmem_cache *kvm_pte_cache;
static struct kmem_cache *kvm_pmd_cache;
5a319350 PM |
355 | |
356 | static pte_t *kvmppc_pte_alloc(void) | |
357 | { | |
358 | return kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL); | |
359 | } | |
360 | ||
361 | static void kvmppc_pte_free(pte_t *ptep) | |
362 | { | |
363 | kmem_cache_free(kvm_pte_cache, ptep); | |
364 | } | |
365 | ||
c3856aeb PM |
366 | /* Like pmd_huge() and pmd_large(), but works regardless of config options */ |
367 | static inline int pmd_is_leaf(pmd_t pmd) | |
368 | { | |
369 | return !!(pmd_val(pmd) & _PAGE_PTE); | |
370 | } | |
371 | ||
21828c99 AK |
372 | static pmd_t *kvmppc_pmd_alloc(void) |
373 | { | |
374 | return kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL); | |
375 | } | |
376 | ||
377 | static void kvmppc_pmd_free(pmd_t *pmdp) | |
378 | { | |
379 | kmem_cache_free(kvm_pmd_cache, pmdp); | |
380 | } | |
381 | ||
8cf531ed SJS |
382 | /* Called with kvm->mmu_lock held */ |
383 | void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa, | |
c43c3a86 PM |
384 | unsigned int shift, |
385 | const struct kvm_memory_slot *memslot, | |
fd10be25 | 386 | unsigned int lpid) |
a5fad1e9 NP |
387 | |
388 | { | |
a5fad1e9 | 389 | unsigned long old; |
8cf531ed SJS |
390 | unsigned long gfn = gpa >> PAGE_SHIFT; |
391 | unsigned long page_size = PAGE_SIZE; | |
392 | unsigned long hpa; | |
a5fad1e9 NP |
393 | |
394 | old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift); | |
fd10be25 | 395 | kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid); |
a5fad1e9 | 396 | |
8cf531ed SJS |
397 | /* The following only applies to L1 entries */ |
398 | if (lpid != kvm->arch.lpid) | |
399 | return; | |
a5fad1e9 | 400 | |
8cf531ed | 401 | if (!memslot) { |
a5fad1e9 | 402 | memslot = gfn_to_memslot(kvm, gfn); |
f0f825f0 | 403 | if (!memslot) |
8cf531ed | 404 | return; |
a5fad1e9 | 405 | } |
8f1f7b9b | 406 | if (shift) { /* 1GB or 2MB page */ |
8cf531ed | 407 | page_size = 1ul << shift; |
8f1f7b9b SJS |
408 | if (shift == PMD_SHIFT) |
409 | kvm->stat.num_2M_pages--; | |
410 | else if (shift == PUD_SHIFT) | |
411 | kvm->stat.num_1G_pages--; | |
412 | } | |
8cf531ed SJS |
413 | |
414 | gpa &= ~(page_size - 1); | |
415 | hpa = old & PTE_RPN_MASK; | |
416 | kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size); | |
417 | ||
418 | if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) | |
419 | kvmppc_update_dirty_map(memslot, gfn, page_size); | |
a5fad1e9 NP |
420 | } |
421 | ||
a5704e83 NP |
422 | /* |
423 | * kvmppc_free_p?d are used to free existing page tables, and recursively | |
424 | * descend and clear and free children. | |
425 | * Callers are responsible for flushing the PWC. | |
426 | * | |
427 | * When page tables are being unmapped/freed as part of page fault path | |
428 | * (full == false), ptes are not expected. There is code to unmap them | |
429 | * and emit a warning if encountered, but there may already be data | |
430 | * corruption due to the unexpected mappings. | |
431 | */ | |
fd10be25 SJS |
432 | static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full, |
433 | unsigned int lpid) | |
a5704e83 NP |
434 | { |
435 | if (full) { | |
436 | memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE); | |
437 | } else { | |
438 | pte_t *p = pte; | |
439 | unsigned long it; | |
440 | ||
441 | for (it = 0; it < PTRS_PER_PTE; ++it, ++p) { | |
442 | if (pte_val(*p) == 0) | |
443 | continue; | |
444 | WARN_ON_ONCE(1); | |
445 | kvmppc_unmap_pte(kvm, p, | |
446 | pte_pfn(*p) << PAGE_SHIFT, | |
fd10be25 | 447 | PAGE_SHIFT, NULL, lpid); |
a5704e83 NP |
448 | } |
449 | } | |
450 | ||
451 | kvmppc_pte_free(pte); | |
452 | } | |
453 | ||
fd10be25 SJS |
454 | static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full, |
455 | unsigned int lpid) | |
a5704e83 NP |
456 | { |
457 | unsigned long im; | |
458 | pmd_t *p = pmd; | |
459 | ||
460 | for (im = 0; im < PTRS_PER_PMD; ++im, ++p) { | |
461 | if (!pmd_present(*p)) | |
462 | continue; | |
463 | if (pmd_is_leaf(*p)) { | |
464 | if (full) { | |
465 | pmd_clear(p); | |
466 | } else { | |
467 | WARN_ON_ONCE(1); | |
468 | kvmppc_unmap_pte(kvm, (pte_t *)p, | |
469 | pte_pfn(*(pte_t *)p) << PAGE_SHIFT, | |
fd10be25 | 470 | PMD_SHIFT, NULL, lpid); |
a5704e83 NP |
471 | } |
472 | } else { | |
473 | pte_t *pte; | |
474 | ||
475 | pte = pte_offset_map(p, 0); | |
fd10be25 | 476 | kvmppc_unmap_free_pte(kvm, pte, full, lpid); |
a5704e83 NP |
477 | pmd_clear(p); |
478 | } | |
479 | } | |
480 | kvmppc_pmd_free(pmd); | |
481 | } | |
482 | ||
fd10be25 SJS |
483 | static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud, |
484 | unsigned int lpid) | |
a5704e83 NP |
485 | { |
486 | unsigned long iu; | |
487 | pud_t *p = pud; | |
488 | ||
489 | for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++p) { | |
490 | if (!pud_present(*p)) | |
491 | continue; | |
492 | if (pud_huge(*p)) { | |
493 | pud_clear(p); | |
494 | } else { | |
495 | pmd_t *pmd; | |
496 | ||
497 | pmd = pmd_offset(p, 0); | |
fd10be25 | 498 | kvmppc_unmap_free_pmd(kvm, pmd, true, lpid); |
a5704e83 NP |
499 | pud_clear(p); |
500 | } | |
501 | } | |
502 | pud_free(kvm->mm, pud); | |
503 | } | |
504 | ||
fd10be25 | 505 | void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid) |
a5704e83 NP |
506 | { |
507 | unsigned long ig; | |
a5704e83 | 508 | |
a5704e83 NP |
509 | for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) { |
510 | pud_t *pud; | |
511 | ||
512 | if (!pgd_present(*pgd)) | |
513 | continue; | |
514 | pud = pud_offset(pgd, 0); | |
fd10be25 | 515 | kvmppc_unmap_free_pud(kvm, pud, lpid); |
a5704e83 NP |
516 | pgd_clear(pgd); |
517 | } | |
fd10be25 SJS |
518 | } |
519 | ||
520 | void kvmppc_free_radix(struct kvm *kvm) | |
521 | { | |
522 | if (kvm->arch.pgtable) { | |
523 | kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable, | |
524 | kvm->arch.lpid); | |
525 | pgd_free(kvm->mm, kvm->arch.pgtable); | |
526 | kvm->arch.pgtable = NULL; | |
527 | } | |
a5704e83 NP |
528 | } |
529 | ||
530 | static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd, | |
fd10be25 | 531 | unsigned long gpa, unsigned int lpid) |
a5704e83 NP |
532 | { |
533 | pte_t *pte = pte_offset_kernel(pmd, 0); | |
534 | ||
535 | /* | |
536 | * Clearing the pmd entry then flushing the PWC ensures that the pte | |
537 | * page no longer be cached by the MMU, so can be freed without | |
538 | * flushing the PWC again. | |
539 | */ | |
540 | pmd_clear(pmd); | |
fd10be25 | 541 | kvmppc_radix_flush_pwc(kvm, lpid); |
a5704e83 | 542 | |
fd10be25 | 543 | kvmppc_unmap_free_pte(kvm, pte, false, lpid); |
a5704e83 NP |
544 | } |
545 | ||
546 | static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud, | |
fd10be25 | 547 | unsigned long gpa, unsigned int lpid) |
a5704e83 NP |
548 | { |
549 | pmd_t *pmd = pmd_offset(pud, 0); | |
550 | ||
551 | /* | |
552 | * Clearing the pud entry then flushing the PWC ensures that the pmd | |
553 | * page and any children pte pages will no longer be cached by the MMU, | |
554 | * so can be freed without flushing the PWC again. | |
555 | */ | |
556 | pud_clear(pud); | |
fd10be25 | 557 | kvmppc_radix_flush_pwc(kvm, lpid); |
a5704e83 | 558 | |
fd10be25 | 559 | kvmppc_unmap_free_pmd(kvm, pmd, false, lpid); |
a5704e83 NP |
560 | } |
561 | ||
878cf2bb NP |
562 | /* |
563 | * There are a number of bits which may differ between different faults to | |
564 | * the same partition scope entry. RC bits, in the course of cleaning and | |
565 | * aging. And the write bit can change, either the access could have been | |
566 | * upgraded, or a read fault could happen concurrently with a write fault | |
567 | * that sets those bits first. | |
568 | */ | |
569 | #define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED)) | |
570 | ||
fd10be25 SJS |
/*
 * Insert @pte for guest physical address @gpa into the partition-scoped
 * tree rooted at @pgtable.
 *
 * @level selects where the entry goes: 2 installs it at the PUD level,
 * 1 at the PMD level and 0 in a PTE page.
 * @mmu_seq is passed to mmu_notifier_retry() after taking kvm->mmu_lock;
 * if that reports a retry is needed, nothing is changed.
 * @lpid is passed on to the unmap/flush helpers.
 * @rmapp / @n_rmap: when both are non-NULL, kvmhv_insert_nest_rmap() is
 * called after a new entry is installed (not when an existing entry is
 * merely updated or found identical).
 *
 * Missing intermediate pages are allocated before the lock is taken and
 * any that end up unused are freed on the way out.
 *
 * Returns 0 on success (including when an identical or compatible entry
 * was already present), -EAGAIN if the caller should retry, or -ENOMEM if
 * a needed page table page could not be allocated.
 */
int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
		      unsigned long gpa, unsigned int level,
		      unsigned long mmu_seq, unsigned int lpid,
		      unsigned long *rmapp, struct rmap_nested **n_rmap)
{
	pgd_t *pgd;
	pud_t *pud, *new_pud = NULL;
	pmd_t *pmd, *new_pmd = NULL;
	pte_t *ptep, *new_ptep = NULL;
	int ret;

	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
	pgd = pgtable + pgd_index(gpa);
	pud = NULL;
	if (pgd_present(*pgd))
		pud = pud_offset(pgd, gpa);
	else
		new_pud = pud_alloc_one(kvm->mm, gpa);

	/* A pmd page is only needed when installing below the PUD level */
	pmd = NULL;
	if (pud && pud_present(*pud) && !pud_huge(*pud))
		pmd = pmd_offset(pud, gpa);
	else if (level <= 1)
		new_pmd = kvmppc_pmd_alloc();

	/* A pte page is only needed for a level 0 insertion */
	if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd)))
		new_ptep = kvmppc_pte_alloc();

	/* Check if we might have been invalidated; let the guest retry if so */
	spin_lock(&kvm->mmu_lock);
	ret = -EAGAIN;
	if (mmu_notifier_retry(kvm, mmu_seq))
		goto out_unlock;

	/*
	 * Now traverse again under the lock and change the tree.
	 * From here on a bare "goto out_unlock" reports -ENOMEM: it is only
	 * taken when a page needed at this point was not allocated above.
	 */
	ret = -ENOMEM;
	if (pgd_none(*pgd)) {
		if (!new_pud)
			goto out_unlock;
		pgd_populate(kvm->mm, pgd, new_pud);
		/* Ownership moved into the tree; don't free it on exit */
		new_pud = NULL;
	}
	pud = pud_offset(pgd, gpa);
	if (pud_huge(*pud)) {
		unsigned long hgpa = gpa & PUD_MASK;

		/* Check if we raced and someone else has set the same thing */
		if (level == 2) {
			if (pud_raw(*pud) == pte_raw(pte)) {
				ret = 0;
				goto out_unlock;
			}
			/* Valid 1GB page here already, add our extra bits */
			WARN_ON_ONCE((pud_val(*pud) ^ pte_val(pte)) &
							PTE_BITS_MUST_MATCH);
			kvmppc_radix_update_pte(kvm, (pte_t *)pud,
					      0, pte_val(pte), hgpa, PUD_SHIFT);
			ret = 0;
			goto out_unlock;
		}
		/*
		 * If we raced with another CPU which has just put
		 * a 1GB pte in after we saw a pmd page, try again.
		 */
		if (!new_pmd) {
			ret = -EAGAIN;
			goto out_unlock;
		}
		/* Valid 1GB page here already, remove it */
		kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
				 lpid);
	}
	if (level == 2) {
		if (!pud_none(*pud)) {
			/*
			 * There's a page table page here, but we wanted to
			 * install a large page, so remove and free the page
			 * table page.
			 */
			kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
		}
		kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
		if (rmapp && n_rmap)
			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
		ret = 0;
		goto out_unlock;
	}
	if (pud_none(*pud)) {
		if (!new_pmd)
			goto out_unlock;
		pud_populate(kvm->mm, pud, new_pmd);
		/* Ownership moved into the tree; don't free it on exit */
		new_pmd = NULL;
	}
	pmd = pmd_offset(pud, gpa);
	if (pmd_is_leaf(*pmd)) {
		unsigned long lgpa = gpa & PMD_MASK;

		/* Check if we raced and someone else has set the same thing */
		if (level == 1) {
			if (pmd_raw(*pmd) == pte_raw(pte)) {
				ret = 0;
				goto out_unlock;
			}
			/* Valid 2MB page here already, add our extra bits */
			WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
							PTE_BITS_MUST_MATCH);
			kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
					0, pte_val(pte), lgpa, PMD_SHIFT);
			ret = 0;
			goto out_unlock;
		}

		/*
		 * If we raced with another CPU which has just put
		 * a 2MB pte in after we saw a pte page, try again.
		 */
		if (!new_ptep) {
			ret = -EAGAIN;
			goto out_unlock;
		}
		/* Valid 2MB page here already, remove it */
		kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
				 lpid);
	}
	if (level == 1) {
		if (!pmd_none(*pmd)) {
			/*
			 * There's a page table page here, but we wanted to
			 * install a large page, so remove and free the page
			 * table page.
			 */
			kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
		}
		kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
		if (rmapp && n_rmap)
			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
		ret = 0;
		goto out_unlock;
	}
	if (pmd_none(*pmd)) {
		if (!new_ptep)
			goto out_unlock;
		pmd_populate(kvm->mm, pmd, new_ptep);
		/* Ownership moved into the tree; don't free it on exit */
		new_ptep = NULL;
	}
	ptep = pte_offset_kernel(pmd, gpa);
	if (pte_present(*ptep)) {
		/* Check if someone else set the same thing */
		if (pte_raw(*ptep) == pte_raw(pte)) {
			ret = 0;
			goto out_unlock;
		}
		/* Valid page here already, add our extra bits */
		WARN_ON_ONCE((pte_val(*ptep) ^ pte_val(pte)) &
							PTE_BITS_MUST_MATCH);
		kvmppc_radix_update_pte(kvm, ptep, 0, pte_val(pte), gpa, 0);
		ret = 0;
		goto out_unlock;
	}
	kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
	if (rmapp && n_rmap)
		kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
	ret = 0;

 out_unlock:
	spin_unlock(&kvm->mmu_lock);
	/* Free whichever preallocated pages were not linked into the tree */
	if (new_pud)
		pud_free(kvm->mm, new_pud);
	if (new_pmd)
		kvmppc_pmd_free(new_pmd);
	if (new_ptep)
		kvmppc_pte_free(new_ptep);
	return ret;
}
745 | ||
fd10be25 SJS |
746 | bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, bool writing, |
747 | unsigned long gpa, unsigned int lpid) | |
04bae9d5 SJS |
748 | { |
749 | unsigned long pgflags; | |
750 | unsigned int shift; | |
751 | pte_t *ptep; | |
752 | ||
753 | /* | |
754 | * Need to set an R or C bit in the 2nd-level tables; | |
755 | * since we are just helping out the hardware here, | |
756 | * it is sufficient to do what the hardware does. | |
757 | */ | |
758 | pgflags = _PAGE_ACCESSED; | |
759 | if (writing) | |
760 | pgflags |= _PAGE_DIRTY; | |
761 | /* | |
762 | * We are walking the secondary (partition-scoped) page table here. | |
763 | * We can do this without disabling irq because the Linux MM | |
764 | * subsystem doesn't do THP splits and collapses on this tree. | |
765 | */ | |
766 | ptep = __find_linux_pte(pgtable, gpa, NULL, &shift); | |
767 | if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) { | |
768 | kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift); | |
769 | return true; | |
770 | } | |
771 | return false; | |
772 | } | |
773 | ||
fd10be25 SJS |
/*
 * Map the host page backing guest physical address @gpa (which lies in
 * @memslot) into this kvm's partition-scoped page table.
 *
 * @writing: the access being handled is a write.
 * @kvm_ro: when set, the fast writable get_user_pages path is skipped.
 * @inserted_pte / @levelp: if non-NULL, receive the PTE value and level
 * that were passed to kvmppc_create_pte(), regardless of its result.
 *
 * Returns -EFAULT if no usable pfn could be obtained, RESUME_GUEST if the
 * host PTE could not be found (the guest is expected to retry), otherwise
 * the return value of kvmppc_create_pte().
 */
int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
				   unsigned long gpa,
				   struct kvm_memory_slot *memslot,
				   bool writing, bool kvm_ro,
				   pte_t *inserted_pte, unsigned int *levelp)
{
	struct kvm *kvm = vcpu->kvm;
	struct page *page = NULL;
	unsigned long mmu_seq;
	unsigned long hva, gfn = gpa >> PAGE_SHIFT;
	bool upgrade_write = false;
	bool *upgrade_p = &upgrade_write;
	pte_t pte, *ptep;
	unsigned int shift, level;
	int ret;
	bool large_enable;

	/* used to check for invalidations in progress */
	mmu_seq = kvm->mmu_notifier_seq;
	smp_rmb();

	/*
	 * Do a fast check first, since __gfn_to_pfn_memslot doesn't
	 * do it with !atomic && !async, which is how we call it.
	 * We always ask for write permission since the common case
	 * is that the page is writable.
	 */
	hva = gfn_to_hva_memslot(memslot, gfn);
	if (!kvm_ro && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
		upgrade_write = true;
	} else {
		unsigned long pfn;

		/* Call KVM generic code to do the slow-path check */
		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
					   writing, upgrade_p);
		if (is_error_noslot_pfn(pfn))
			return -EFAULT;
		/* Only keep a struct page for valid, non-reserved pfns */
		page = NULL;
		if (pfn_valid(pfn)) {
			page = pfn_to_page(pfn);
			if (PageReserved(page))
				page = NULL;
		}
	}

	/*
	 * Read the PTE from the process' radix tree and use that
	 * so we get the shift and attribute bits.
	 */
	local_irq_disable();
	ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
	/*
	 * If the PTE disappeared temporarily due to a THP
	 * collapse, just return and let the guest try again.
	 */
	if (!ptep) {
		local_irq_enable();
		if (page)
			put_page(page);
		return RESUME_GUEST;
	}
	/* Take a copy while irqs are still off; work on the copy afterwards */
	pte = *ptep;
	local_irq_enable();

	/* If we're logging dirty pages, always map single pages */
	large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES);

	/*
	 * Get pte level from shift/size.  A large mapping is only used when
	 * gpa and hva have the same offset within the large page.
	 */
	if (large_enable && shift == PUD_SHIFT &&
	    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
	    (hva & (PUD_SIZE - PAGE_SIZE))) {
		level = 2;
	} else if (large_enable && shift == PMD_SHIFT &&
		   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
		   (hva & (PMD_SIZE - PAGE_SIZE))) {
		level = 1;
	} else {
		level = 0;
		if (shift > PAGE_SHIFT) {
			/*
			 * If the pte maps more than one page, bring over
			 * bits from the virtual address to get the real
			 * address of the specific single page we want.
			 */
			unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
			pte = __pte(pte_val(pte) | (hva & rpnmask));
		}
	}

	/*
	 * Always grant exec and mark accessed.  Writable entries are marked
	 * dirty when this is a write (or write access was obtained above);
	 * otherwise the entry is made read-only and clean.
	 */
	pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
	if (writing || upgrade_write) {
		if (pte_val(pte) & _PAGE_WRITE)
			pte = __pte(pte_val(pte) | _PAGE_DIRTY);
	} else {
		pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
	}

	/* Allocate space in the tree and write the PTE */
	ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
				mmu_seq, kvm->arch.lpid, NULL, NULL);
	if (inserted_pte)
		*inserted_pte = pte;
	if (levelp)
		*levelp = level;

	/* Drop the page reference; mark it dirty first if mapped writable */
	if (page) {
		if (!ret && (pte_val(pte) & _PAGE_WRITE))
			set_page_dirty_lock(page);
		put_page(page);
	}

	/* Increment number of large pages if we (successfully) inserted one */
	if (!ret) {
		if (level == 1)
			kvm->stat.num_2M_pages++;
		else if (level == 2)
			kvm->stat.num_1G_pages++;
	}

	return ret;
}
896 | ||
897 | int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, | |
898 | unsigned long ea, unsigned long dsisr) | |
899 | { | |
900 | struct kvm *kvm = vcpu->kvm; | |
901 | unsigned long gpa, gfn; | |
902 | struct kvm_memory_slot *memslot; | |
903 | long ret; | |
904 | bool writing = !!(dsisr & DSISR_ISSTORE); | |
905 | bool kvm_ro = false; | |
906 | ||
907 | /* Check for unusual errors */ | |
908 | if (dsisr & DSISR_UNSUPP_MMU) { | |
909 | pr_err("KVM: Got unsupported MMU fault\n"); | |
910 | return -EFAULT; | |
911 | } | |
912 | if (dsisr & DSISR_BADACCESS) { | |
913 | /* Reflect to the guest as DSI */ | |
914 | pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr); | |
915 | kvmppc_core_queue_data_storage(vcpu, ea, dsisr); | |
916 | return RESUME_GUEST; | |
917 | } | |
918 | ||
919 | /* Translate the logical address */ | |
920 | gpa = vcpu->arch.fault_gpa & ~0xfffUL; | |
921 | gpa &= ~0xF000000000000000ul; | |
922 | gfn = gpa >> PAGE_SHIFT; | |
923 | if (!(dsisr & DSISR_PRTABLE_FAULT)) | |
924 | gpa |= ea & 0xfff; | |
925 | ||
926 | /* Get the corresponding memslot */ | |
927 | memslot = gfn_to_memslot(kvm, gfn); | |
928 | ||
929 | /* No memslot means it's an emulated MMIO region */ | |
930 | if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) { | |
931 | if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS | | |
932 | DSISR_SET_RC)) { | |
933 | /* | |
934 | * Bad address in guest page table tree, or other | |
935 | * unusual error - reflect it to the guest as DSI. | |
936 | */ | |
937 | kvmppc_core_queue_data_storage(vcpu, ea, dsisr); | |
938 | return RESUME_GUEST; | |
939 | } | |
940 | return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, writing); | |
941 | } | |
942 | ||
943 | if (memslot->flags & KVM_MEM_READONLY) { | |
944 | if (writing) { | |
945 | /* give the guest a DSI */ | |
946 | kvmppc_core_queue_data_storage(vcpu, ea, DSISR_ISSTORE | | |
947 | DSISR_PROTFAULT); | |
948 | return RESUME_GUEST; | |
949 | } | |
950 | kvm_ro = true; | |
951 | } | |
952 | ||
953 | /* Failed to set the reference/change bits */ | |
954 | if (dsisr & DSISR_SET_RC) { | |
955 | spin_lock(&kvm->mmu_lock); | |
956 | if (kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable, | |
fd10be25 | 957 | writing, gpa, kvm->arch.lpid)) |
04bae9d5 SJS |
958 | dsisr &= ~DSISR_SET_RC; |
959 | spin_unlock(&kvm->mmu_lock); | |
960 | ||
961 | if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE | | |
962 | DSISR_PROTFAULT | DSISR_SET_RC))) | |
963 | return RESUME_GUEST; | |
964 | } | |
965 | ||
966 | /* Try to insert a pte */ | |
967 | ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing, | |
968 | kvm_ro, NULL, NULL); | |
969 | ||
c3856aeb PM |
970 | if (ret == 0 || ret == -EAGAIN) |
971 | ret = RESUME_GUEST; | |
5a319350 PM |
972 | return ret; |
973 | } | |
974 | ||
c43c3a86 | 975 | /* Called with kvm->mmu_lock held */ |
01756099 PM |
976 | int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, |
977 | unsigned long gfn) | |
978 | { | |
979 | pte_t *ptep; | |
980 | unsigned long gpa = gfn << PAGE_SHIFT; | |
981 | unsigned int shift; | |
982 | ||
94171b19 | 983 | ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); |
f0f825f0 | 984 | if (ptep && pte_present(*ptep)) |
fd10be25 SJS |
985 | kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot, |
986 | kvm->arch.lpid); | |
01756099 PM |
987 | return 0; |
988 | } | |
989 | ||
c43c3a86 | 990 | /* Called with kvm->mmu_lock held */ |
01756099 PM |
991 | int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, |
992 | unsigned long gfn) | |
993 | { | |
994 | pte_t *ptep; | |
995 | unsigned long gpa = gfn << PAGE_SHIFT; | |
996 | unsigned int shift; | |
997 | int ref = 0; | |
ae59a7e1 | 998 | unsigned long old, *rmapp; |
01756099 | 999 | |
94171b19 | 1000 | ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); |
01756099 | 1001 | if (ptep && pte_present(*ptep) && pte_young(*ptep)) { |
ae59a7e1 SJS |
1002 | old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0, |
1003 | gpa, shift); | |
01756099 | 1004 | /* XXX need to flush tlb here? */ |
ae59a7e1 SJS |
1005 | /* Also clear bit in ptes in shadow pgtable for nested guests */ |
1006 | rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; | |
1007 | kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0, | |
1008 | old & PTE_RPN_MASK, | |
1009 | 1UL << shift); | |
01756099 PM |
1010 | ref = 1; |
1011 | } | |
1012 | return ref; | |
1013 | } | |
1014 | ||
c43c3a86 | 1015 | /* Called with kvm->mmu_lock held */ |
01756099 PM |
1016 | int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, |
1017 | unsigned long gfn) | |
1018 | { | |
1019 | pte_t *ptep; | |
1020 | unsigned long gpa = gfn << PAGE_SHIFT; | |
1021 | unsigned int shift; | |
1022 | int ref = 0; | |
1023 | ||
94171b19 | 1024 | ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); |
01756099 PM |
1025 | if (ptep && pte_present(*ptep) && pte_young(*ptep)) |
1026 | ref = 1; | |
1027 | return ref; | |
1028 | } | |
1029 | ||
8f7b79b8 PM |
1030 | /* Returns the number of PAGE_SIZE pages that are dirty */ |
1031 | static int kvm_radix_test_clear_dirty(struct kvm *kvm, | |
1032 | struct kvm_memory_slot *memslot, int pagenum) | |
1033 | { | |
1034 | unsigned long gfn = memslot->base_gfn + pagenum; | |
1035 | unsigned long gpa = gfn << PAGE_SHIFT; | |
1036 | pte_t *ptep; | |
1037 | unsigned int shift; | |
1038 | int ret = 0; | |
ae59a7e1 | 1039 | unsigned long old, *rmapp; |
8f7b79b8 | 1040 | |
94171b19 | 1041 | ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); |
8f7b79b8 PM |
1042 | if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) { |
1043 | ret = 1; | |
1044 | if (shift) | |
1045 | ret = 1 << (shift - PAGE_SHIFT); | |
ae59a7e1 SJS |
1046 | spin_lock(&kvm->mmu_lock); |
1047 | old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0, | |
1048 | gpa, shift); | |
fd10be25 | 1049 | kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid); |
ae59a7e1 SJS |
1050 | /* Also clear bit in ptes in shadow pgtable for nested guests */ |
1051 | rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; | |
1052 | kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_DIRTY, 0, | |
1053 | old & PTE_RPN_MASK, | |
1054 | 1UL << shift); | |
1055 | spin_unlock(&kvm->mmu_lock); | |
8f7b79b8 PM |
1056 | } |
1057 | return ret; | |
1058 | } | |
1059 | ||
1060 | long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, | |
1061 | struct kvm_memory_slot *memslot, unsigned long *map) | |
1062 | { | |
1063 | unsigned long i, j; | |
8f7b79b8 PM |
1064 | int npages; |
1065 | ||
8f7b79b8 PM |
1066 | for (i = 0; i < memslot->npages; i = j) { |
1067 | npages = kvm_radix_test_clear_dirty(kvm, memslot, i); | |
1068 | ||
1069 | /* | |
1070 | * Note that if npages > 0 then i must be a multiple of npages, | |
1071 | * since huge pages are only used to back the guest at guest | |
1072 | * real addresses that are a multiple of their size. | |
1073 | * Since we have at most one PTE covering any given guest | |
1074 | * real address, if npages > 1 we can skip to i + npages. | |
1075 | */ | |
1076 | j = i + 1; | |
e641a317 PM |
1077 | if (npages) { |
1078 | set_dirty_bits(map, i, npages); | |
117647ff | 1079 | j = i + npages; |
e641a317 | 1080 | } |
8f7b79b8 PM |
1081 | } |
1082 | return 0; | |
1083 | } | |
1084 | ||
5af3e9d0 PM |
1085 | void kvmppc_radix_flush_memslot(struct kvm *kvm, |
1086 | const struct kvm_memory_slot *memslot) | |
1087 | { | |
1088 | unsigned long n; | |
1089 | pte_t *ptep; | |
1090 | unsigned long gpa; | |
1091 | unsigned int shift; | |
1092 | ||
1093 | gpa = memslot->base_gfn << PAGE_SHIFT; | |
1094 | spin_lock(&kvm->mmu_lock); | |
1095 | for (n = memslot->npages; n; --n) { | |
1096 | ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); | |
1097 | if (ptep && pte_present(*ptep)) | |
1098 | kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot, | |
1099 | kvm->arch.lpid); | |
1100 | gpa += PAGE_SIZE; | |
1101 | } | |
1102 | spin_unlock(&kvm->mmu_lock); | |
1103 | } | |
1104 | ||
8cf4ecc0 PM |
1105 | static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info, |
1106 | int psize, int *indexp) | |
1107 | { | |
1108 | if (!mmu_psize_defs[psize].shift) | |
1109 | return; | |
1110 | info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift | | |
1111 | (mmu_psize_defs[psize].ap << 29); | |
1112 | ++(*indexp); | |
1113 | } | |
1114 | ||
1115 | int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info) | |
1116 | { | |
1117 | int i; | |
1118 | ||
1119 | if (!radix_enabled()) | |
1120 | return -EINVAL; | |
1121 | memset(info, 0, sizeof(*info)); | |
1122 | ||
1123 | /* 4k page size */ | |
1124 | info->geometries[0].page_shift = 12; | |
1125 | info->geometries[0].level_bits[0] = 9; | |
1126 | for (i = 1; i < 4; ++i) | |
1127 | info->geometries[0].level_bits[i] = p9_supported_radix_bits[i]; | |
1128 | /* 64k page size */ | |
1129 | info->geometries[1].page_shift = 16; | |
1130 | for (i = 0; i < 4; ++i) | |
1131 | info->geometries[1].level_bits[i] = p9_supported_radix_bits[i]; | |
1132 | ||
1133 | i = 0; | |
1134 | add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i); | |
1135 | add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i); | |
1136 | add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i); | |
1137 | add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i); | |
1138 | ||
1139 | return 0; | |
1140 | } | |
1141 | ||
1142 | int kvmppc_init_vm_radix(struct kvm *kvm) | |
1143 | { | |
1144 | kvm->arch.pgtable = pgd_alloc(kvm->mm); | |
1145 | if (!kvm->arch.pgtable) | |
1146 | return -ENOMEM; | |
1147 | return 0; | |
1148 | } | |
1149 | ||
5a319350 PM |
1150 | static void pte_ctor(void *addr) |
1151 | { | |
21828c99 AK |
1152 | memset(addr, 0, RADIX_PTE_TABLE_SIZE); |
1153 | } | |
1154 | ||
1155 | static void pmd_ctor(void *addr) | |
1156 | { | |
1157 | memset(addr, 0, RADIX_PMD_TABLE_SIZE); | |
5a319350 PM |
1158 | } |
1159 | ||
9a94d3ee PM |
1160 | struct debugfs_radix_state { |
1161 | struct kvm *kvm; | |
1162 | struct mutex mutex; | |
1163 | unsigned long gpa; | |
83a05510 | 1164 | int lpid; |
9a94d3ee PM |
1165 | int chars_left; |
1166 | int buf_index; | |
1167 | char buf[128]; | |
1168 | u8 hdr; | |
1169 | }; | |
1170 | ||
1171 | static int debugfs_radix_open(struct inode *inode, struct file *file) | |
1172 | { | |
1173 | struct kvm *kvm = inode->i_private; | |
1174 | struct debugfs_radix_state *p; | |
1175 | ||
1176 | p = kzalloc(sizeof(*p), GFP_KERNEL); | |
1177 | if (!p) | |
1178 | return -ENOMEM; | |
1179 | ||
1180 | kvm_get_kvm(kvm); | |
1181 | p->kvm = kvm; | |
1182 | mutex_init(&p->mutex); | |
1183 | file->private_data = p; | |
1184 | ||
1185 | return nonseekable_open(inode, file); | |
1186 | } | |
1187 | ||
1188 | static int debugfs_radix_release(struct inode *inode, struct file *file) | |
1189 | { | |
1190 | struct debugfs_radix_state *p = file->private_data; | |
1191 | ||
1192 | kvm_put_kvm(p->kvm); | |
1193 | kfree(p); | |
1194 | return 0; | |
1195 | } | |
1196 | ||
1197 | static ssize_t debugfs_radix_read(struct file *file, char __user *buf, | |
1198 | size_t len, loff_t *ppos) | |
1199 | { | |
1200 | struct debugfs_radix_state *p = file->private_data; | |
1201 | ssize_t ret, r; | |
1202 | unsigned long n; | |
1203 | struct kvm *kvm; | |
1204 | unsigned long gpa; | |
1205 | pgd_t *pgt; | |
83a05510 | 1206 | struct kvm_nested_guest *nested; |
9a94d3ee PM |
1207 | pgd_t pgd, *pgdp; |
1208 | pud_t pud, *pudp; | |
1209 | pmd_t pmd, *pmdp; | |
1210 | pte_t *ptep; | |
1211 | int shift; | |
1212 | unsigned long pte; | |
1213 | ||
1214 | kvm = p->kvm; | |
1215 | if (!kvm_is_radix(kvm)) | |
1216 | return 0; | |
1217 | ||
1218 | ret = mutex_lock_interruptible(&p->mutex); | |
1219 | if (ret) | |
1220 | return ret; | |
1221 | ||
1222 | if (p->chars_left) { | |
1223 | n = p->chars_left; | |
1224 | if (n > len) | |
1225 | n = len; | |
1226 | r = copy_to_user(buf, p->buf + p->buf_index, n); | |
1227 | n -= r; | |
1228 | p->chars_left -= n; | |
1229 | p->buf_index += n; | |
1230 | buf += n; | |
1231 | len -= n; | |
1232 | ret = n; | |
1233 | if (r) { | |
1234 | if (!n) | |
1235 | ret = -EFAULT; | |
1236 | goto out; | |
1237 | } | |
1238 | } | |
1239 | ||
1240 | gpa = p->gpa; | |
83a05510 PM |
1241 | nested = NULL; |
1242 | pgt = NULL; | |
1243 | while (len != 0 && p->lpid >= 0) { | |
1244 | if (gpa >= RADIX_PGTABLE_RANGE) { | |
1245 | gpa = 0; | |
1246 | pgt = NULL; | |
1247 | if (nested) { | |
1248 | kvmhv_put_nested(nested); | |
1249 | nested = NULL; | |
1250 | } | |
1251 | p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid); | |
1252 | p->hdr = 0; | |
1253 | if (p->lpid < 0) | |
1254 | break; | |
1255 | } | |
1256 | if (!pgt) { | |
1257 | if (p->lpid == 0) { | |
1258 | pgt = kvm->arch.pgtable; | |
1259 | } else { | |
1260 | nested = kvmhv_get_nested(kvm, p->lpid, false); | |
1261 | if (!nested) { | |
1262 | gpa = RADIX_PGTABLE_RANGE; | |
1263 | continue; | |
1264 | } | |
1265 | pgt = nested->shadow_pgtable; | |
1266 | } | |
1267 | } | |
1268 | n = 0; | |
9a94d3ee | 1269 | if (!p->hdr) { |
83a05510 PM |
1270 | if (p->lpid > 0) |
1271 | n = scnprintf(p->buf, sizeof(p->buf), | |
1272 | "\nNested LPID %d: ", p->lpid); | |
1273 | n += scnprintf(p->buf + n, sizeof(p->buf) - n, | |
9a94d3ee PM |
1274 | "pgdir: %lx\n", (unsigned long)pgt); |
1275 | p->hdr = 1; | |
1276 | goto copy; | |
1277 | } | |
1278 | ||
1279 | pgdp = pgt + pgd_index(gpa); | |
1280 | pgd = READ_ONCE(*pgdp); | |
1281 | if (!(pgd_val(pgd) & _PAGE_PRESENT)) { | |
1282 | gpa = (gpa & PGDIR_MASK) + PGDIR_SIZE; | |
1283 | continue; | |
1284 | } | |
1285 | ||
1286 | pudp = pud_offset(&pgd, gpa); | |
1287 | pud = READ_ONCE(*pudp); | |
1288 | if (!(pud_val(pud) & _PAGE_PRESENT)) { | |
1289 | gpa = (gpa & PUD_MASK) + PUD_SIZE; | |
1290 | continue; | |
1291 | } | |
1292 | if (pud_val(pud) & _PAGE_PTE) { | |
1293 | pte = pud_val(pud); | |
1294 | shift = PUD_SHIFT; | |
1295 | goto leaf; | |
1296 | } | |
1297 | ||
1298 | pmdp = pmd_offset(&pud, gpa); | |
1299 | pmd = READ_ONCE(*pmdp); | |
1300 | if (!(pmd_val(pmd) & _PAGE_PRESENT)) { | |
1301 | gpa = (gpa & PMD_MASK) + PMD_SIZE; | |
1302 | continue; | |
1303 | } | |
1304 | if (pmd_val(pmd) & _PAGE_PTE) { | |
1305 | pte = pmd_val(pmd); | |
1306 | shift = PMD_SHIFT; | |
1307 | goto leaf; | |
1308 | } | |
1309 | ||
1310 | ptep = pte_offset_kernel(&pmd, gpa); | |
1311 | pte = pte_val(READ_ONCE(*ptep)); | |
1312 | if (!(pte & _PAGE_PRESENT)) { | |
1313 | gpa += PAGE_SIZE; | |
1314 | continue; | |
1315 | } | |
1316 | shift = PAGE_SHIFT; | |
1317 | leaf: | |
1318 | n = scnprintf(p->buf, sizeof(p->buf), | |
1319 | " %lx: %lx %d\n", gpa, pte, shift); | |
1320 | gpa += 1ul << shift; | |
1321 | copy: | |
1322 | p->chars_left = n; | |
1323 | if (n > len) | |
1324 | n = len; | |
1325 | r = copy_to_user(buf, p->buf, n); | |
1326 | n -= r; | |
1327 | p->chars_left -= n; | |
1328 | p->buf_index = n; | |
1329 | buf += n; | |
1330 | len -= n; | |
1331 | ret += n; | |
1332 | if (r) { | |
1333 | if (!ret) | |
1334 | ret = -EFAULT; | |
1335 | break; | |
1336 | } | |
1337 | } | |
1338 | p->gpa = gpa; | |
83a05510 PM |
1339 | if (nested) |
1340 | kvmhv_put_nested(nested); | |
9a94d3ee PM |
1341 | |
1342 | out: | |
1343 | mutex_unlock(&p->mutex); | |
1344 | return ret; | |
1345 | } | |
1346 | ||
1347 | static ssize_t debugfs_radix_write(struct file *file, const char __user *buf, | |
1348 | size_t len, loff_t *ppos) | |
1349 | { | |
1350 | return -EACCES; | |
1351 | } | |
1352 | ||
1353 | static const struct file_operations debugfs_radix_fops = { | |
1354 | .owner = THIS_MODULE, | |
1355 | .open = debugfs_radix_open, | |
1356 | .release = debugfs_radix_release, | |
1357 | .read = debugfs_radix_read, | |
1358 | .write = debugfs_radix_write, | |
1359 | .llseek = generic_file_llseek, | |
1360 | }; | |
1361 | ||
1362 | void kvmhv_radix_debugfs_init(struct kvm *kvm) | |
1363 | { | |
1364 | kvm->arch.radix_dentry = debugfs_create_file("radix", 0400, | |
1365 | kvm->arch.debugfs_dir, kvm, | |
1366 | &debugfs_radix_fops); | |
1367 | } | |
1368 | ||
5a319350 PM |
1369 | int kvmppc_radix_init(void) |
1370 | { | |
21828c99 | 1371 | unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE; |
5a319350 PM |
1372 | |
1373 | kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor); | |
1374 | if (!kvm_pte_cache) | |
1375 | return -ENOMEM; | |
21828c99 AK |
1376 | |
1377 | size = sizeof(void *) << RADIX_PMD_INDEX_SIZE; | |
1378 | ||
1379 | kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor); | |
1380 | if (!kvm_pmd_cache) { | |
1381 | kmem_cache_destroy(kvm_pte_cache); | |
1382 | return -ENOMEM; | |
1383 | } | |
1384 | ||
5a319350 PM |
1385 | return 0; |
1386 | } | |
1387 | ||
1388 | void kvmppc_radix_exit(void) | |
1389 | { | |
1390 | kmem_cache_destroy(kvm_pte_cache); | |
21828c99 | 1391 | kmem_cache_destroy(kvm_pmd_cache); |
5a319350 | 1392 | } |