/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/highmem.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/srcu.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>

#include <asm/tlbflush.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/mmu-hash64.h>
#include <asm/hvcall.h>
#include <asm/synch.h>
#include <asm/ppc-opcode.h>
#include <asm/cputable.h>

/* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
#define MAX_LPID_970	63

/* Power architecture requires HPT is at least 256kB */
#define PPC_MIN_HPT_ORDER	18

static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
				long pte_index, unsigned long pteh,
				unsigned long ptel, unsigned long *pte_idx_ret);

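/*
 * Allocate the guest's hashed page table (HPT) and the associated
 * reverse-map (revmap) array, recording their size and location in
 * the VM's arch struct.
 */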
long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
{
	unsigned long hpt;
	struct revmap_entry *rev;
	struct kvmppc_linear_info *li;
	long order = kvm_hpt_order;

	if (htab_orderp) {
		order = *htab_orderp;
		if (order < PPC_MIN_HPT_ORDER)
			order = PPC_MIN_HPT_ORDER;
	}

	/*
	 * If the user wants a different size from default,
	 * try first to allocate it from the kernel page allocator.
	 */
	hpt = 0;
	if (order != kvm_hpt_order) {
		hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
				       __GFP_NOWARN, order - PAGE_SHIFT);
		if (!hpt)
			--order;
	}

	/* Next try to allocate from the preallocated pool */
	if (!hpt) {
		li = kvm_alloc_hpt();
		if (li) {
			hpt = (ulong)li->base_virt;
			kvm->arch.hpt_li = li;
			order = kvm_hpt_order;
		}
	}

	/* Lastly try successively smaller sizes from the page allocator */
	while (!hpt && order > PPC_MIN_HPT_ORDER) {
		hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
				       __GFP_NOWARN, order - PAGE_SHIFT);
		if (!hpt)
			--order;
	}

	if (!hpt)
		return -ENOMEM;

	kvm->arch.hpt_virt = hpt;
	kvm->arch.hpt_order = order;
	/* HPTEs are 2**4 bytes long */
	kvm->arch.hpt_npte = 1ul << (order - 4);
	/* 128 (2**7) bytes in each HPTEG */
	kvm->arch.hpt_mask = (1ul << (order - 7)) - 1;

	/* Allocate reverse map array */
	rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte);
	if (!rev) {
		pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n");
		goto out_freehpt;
	}
	kvm->arch.revmap = rev;
	kvm->arch.sdr1 = __pa(hpt) | (order - 18);

	pr_info("KVM guest htab at %lx (order %ld), LPID %x\n",
		hpt, order, kvm->arch.lpid);

	if (htab_orderp)
		*htab_orderp = order;
	return 0;

 out_freehpt:
	if (kvm->arch.hpt_li)
		kvm_release_hpt(kvm->arch.hpt_li);
	else
		free_pages(hpt, order - PAGE_SHIFT);
	return -ENOMEM;
}

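/*
 * Clear the existing HPT (or allocate one if none exists yet) so the
 * guest can be reset.  Fails with -EBUSY if any vcpu is running.
 */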
long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
{
	long err = -EBUSY;
	long order;

	mutex_lock(&kvm->lock);
	if (kvm->arch.rma_setup_done) {
		kvm->arch.rma_setup_done = 0;
		/* order rma_setup_done vs. vcpus_running */
		smp_mb();
		if (atomic_read(&kvm->arch.vcpus_running)) {
			kvm->arch.rma_setup_done = 1;
			goto out;
		}
	}
	if (kvm->arch.hpt_virt) {
		order = kvm->arch.hpt_order;
		/* Set the entire HPT to 0, i.e. invalid HPTEs */
		memset((void *)kvm->arch.hpt_virt, 0, 1ul << order);
		/*
		 * Set the whole last_vcpu array to an invalid vcpu number.
		 * This ensures that each vcpu will flush its TLB on next entry.
		 */
		memset(kvm->arch.last_vcpu, 0xff, sizeof(kvm->arch.last_vcpu));
		*htab_orderp = order;
		err = 0;
	} else {
		err = kvmppc_alloc_hpt(kvm, htab_orderp);
		order = *htab_orderp;
	}
 out:
	mutex_unlock(&kvm->lock);
	return err;
}

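/* Release the LPID, the revmap array and the HPT itself. */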
void kvmppc_free_hpt(struct kvm *kvm)
{
	kvmppc_free_lpid(kvm->arch.lpid);
	vfree(kvm->arch.revmap);
	if (kvm->arch.hpt_li)
		kvm_release_hpt(kvm->arch.hpt_li);
	else
		free_pages(kvm->arch.hpt_virt,
			   kvm->arch.hpt_order - PAGE_SHIFT);
}

/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */
static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize)
{
	return (pgsize > 0x1000) ? HPTE_V_LARGE : 0;
}

/* Bits in second HPTE dword for pagesize 4k, 64k or 16M */
static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize)
{
	return (pgsize == 0x10000) ? 0x1000 : 0;
}

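/*
 * Create bolted HPTEs mapping the virtual real mode area (VRMA),
 * using at most one HPTE per HPTEG (entry 7 of each group).
 */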
void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
		     unsigned long porder)
{
	unsigned long i;
	unsigned long npages;
	unsigned long hp_v, hp_r;
	unsigned long addr, hash;
	unsigned long psize;
	unsigned long hp0, hp1;
	unsigned long idx_ret;
	long ret;
	struct kvm *kvm = vcpu->kvm;

	psize = 1ul << porder;
	npages = memslot->npages >> (porder - PAGE_SHIFT);

	/* VRMA can't be > 1TB */
	if (npages > 1ul << (40 - porder))
		npages = 1ul << (40 - porder);
	/* Can't use more than 1 HPTE per HPTEG */
	if (npages > kvm->arch.hpt_mask + 1)
		npages = kvm->arch.hpt_mask + 1;

	hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
		HPTE_V_BOLTED | hpte0_pgsize_encoding(psize);
	hp1 = hpte1_pgsize_encoding(psize) |
		HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX;

	for (i = 0; i < npages; ++i) {
		addr = i << porder;
		/* can't use hpt_hash since va > 64 bits */
		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt_mask;
		/*
		 * We assume that the hash table is empty and no
		 * vcpus are using it at this stage.  Since we create
		 * at most one HPTE per HPTEG, we just assume entry 7
		 * is available and use it.
		 */
		hash = (hash << 3) + 7;
		hp_v = hp0 | ((addr >> 16) & ~0x7fUL);
		hp_r = hp1 | addr;
		ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, hash, hp_v, hp_r,
						 &idx_ret);
		if (ret != H_SUCCESS) {
			pr_err("KVM: map_vrma at %lx failed, ret=%ld\n",
			       addr, ret);
			break;
		}
	}
}

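/*
 * One-time MMU initialisation: work out which LPID values the host and
 * the partition-switch code use, and reserve them.
 */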
int kvmppc_mmu_hv_init(void)
{
	unsigned long host_lpid, rsvd_lpid;

	if (!cpu_has_feature(CPU_FTR_HVMODE))
		return -EINVAL;

	/* POWER7 has 10-bit LPIDs, PPC970 and e500mc have 6-bit LPIDs */
	if (cpu_has_feature(CPU_FTR_ARCH_206)) {
		host_lpid = mfspr(SPRN_LPID);	/* POWER7 */
		rsvd_lpid = LPID_RSVD;
	} else {
		host_lpid = 0;			/* PPC970 */
		rsvd_lpid = MAX_LPID_970;
	}

	kvmppc_init_lpid(rsvd_lpid + 1);

	kvmppc_claim_lpid(host_lpid);
	/* rsvd_lpid is reserved for use in partition switching */
	kvmppc_claim_lpid(rsvd_lpid);

	return 0;
}

void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
{
}

static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
{
	kvmppc_set_msr(vcpu, MSR_SF | MSR_ME);
}

/*
 * This is called to get a reference to a guest page if there isn't
 * one already in the memslot->arch.slot_phys[] array.
 */
static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn,
				  struct kvm_memory_slot *memslot,
				  unsigned long psize)
{
	unsigned long start;
	long np, err;
	struct page *page, *hpage, *pages[1];
	unsigned long s, pgsize;
	unsigned long *physp;
	unsigned int is_io, got, pgorder;
	struct vm_area_struct *vma;
	unsigned long pfn, i, npages;

	physp = memslot->arch.slot_phys;
	if (!physp)
		return -EINVAL;
	if (physp[gfn - memslot->base_gfn])
		return 0;

	is_io = 0;
	got = 0;
	page = NULL;
	pgsize = psize;
	err = -EINVAL;
	start = gfn_to_hva_memslot(memslot, gfn);

	/* Instantiate and get the page we want access to */
	np = get_user_pages_fast(start, 1, 1, pages);
	if (np != 1) {
		/* Look up the vma for the page */
		down_read(&current->mm->mmap_sem);
		vma = find_vma(current->mm, start);
		if (!vma || vma->vm_start > start ||
		    start + psize > vma->vm_end ||
		    !(vma->vm_flags & VM_PFNMAP))
			goto up_err;
		is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot));
		pfn = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
		/* check alignment of pfn vs. requested page size */
		if (psize > PAGE_SIZE && (pfn & ((psize >> PAGE_SHIFT) - 1)))
			goto up_err;
		up_read(&current->mm->mmap_sem);

	} else {
		page = pages[0];
		got = KVMPPC_GOT_PAGE;

		/* See if this is a large page */
		s = PAGE_SIZE;
		if (PageHuge(page)) {
			hpage = compound_head(page);
			s <<= compound_order(hpage);
			/* Get the whole large page if slot alignment is ok */
			if (s > psize && slot_is_aligned(memslot, s) &&
			    !(memslot->userspace_addr & (s - 1))) {
				start &= ~(s - 1);
				pgsize = s;
				get_page(hpage);
				put_page(page);
				page = hpage;
			}
		}
		if (s < psize)
			goto out;
		pfn = page_to_pfn(page);
	}

	npages = pgsize >> PAGE_SHIFT;
	pgorder = __ilog2(npages);
	physp += (gfn - memslot->base_gfn) & ~(npages - 1);
	spin_lock(&kvm->arch.slot_phys_lock);
	for (i = 0; i < npages; ++i) {
		if (!physp[i]) {
			physp[i] = ((pfn + i) << PAGE_SHIFT) +
				got + is_io + pgorder;
			got = 0;
		}
	}
	spin_unlock(&kvm->arch.slot_phys_lock);
	err = 0;

 out:
	if (got)
		put_page(page);
	return err;

 up_err:
	up_read(&current->mm->mmap_sem);
	return err;
}

long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
				long pte_index, unsigned long pteh,
				unsigned long ptel, unsigned long *pte_idx_ret)
{
	unsigned long psize, gpa, gfn;
	struct kvm_memory_slot *memslot;
	long ret;

	if (kvm->arch.using_mmu_notifiers)
		goto do_insert;

	psize = hpte_page_size(pteh, ptel);
	if (!psize)
		return H_PARAMETER;

	pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID);

	/* Find the memslot (if any) for this address */
	gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
	gfn = gpa >> PAGE_SHIFT;
	memslot = gfn_to_memslot(kvm, gfn);
	if (memslot && !(memslot->flags & KVM_MEMSLOT_INVALID)) {
		if (!slot_is_aligned(memslot, psize))
			return H_PARAMETER;
		if (kvmppc_get_guest_page(kvm, gfn, memslot, psize) < 0)
			return H_PARAMETER;
	}

 do_insert:
	/* Protect linux PTE lookup from page table destruction */
	rcu_read_lock_sched();	/* this disables preemption too */
	ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel,
				current->mm->pgd, false, pte_idx_ret);
	rcu_read_unlock_sched();
	if (ret == H_TOO_HARD) {
		/* this can't happen */
		pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n");
		ret = H_RESOURCE;	/* or something */
	}
	return ret;
}

/*
 * We come here on an H_ENTER call from the guest when we are not
 * using mmu notifiers and we don't have the requested page pinned
 * already.
 */
long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
			     long pte_index, unsigned long pteh,
			     unsigned long ptel)
{
	return kvmppc_virtmode_do_h_enter(vcpu->kvm, flags, pte_index,
					  pteh, ptel, &vcpu->arch.gpr[4]);
}

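/* Find the guest SLB entry, if any, that translates the given effective address. */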
static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu,
							 gva_t eaddr)
{
	u64 mask;
	int i;

	for (i = 0; i < vcpu->arch.slb_nr; i++) {
		if (!(vcpu->arch.slb[i].orige & SLB_ESID_V))
			continue;

		if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T)
			mask = ESID_MASK_1T;
		else
			mask = ESID_MASK;

		if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0)
			return &vcpu->arch.slb[i];
	}
	return NULL;
}

static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r,
			unsigned long ea)
{
	unsigned long ra_mask;

	ra_mask = hpte_page_size(v, r) - 1;
	return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask);
}

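/*
 * Translate a guest effective address to a guest real address and
 * access permissions, using the guest SLB and HPT.
 */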
static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
			struct kvmppc_pte *gpte, bool data)
{
	struct kvm *kvm = vcpu->kvm;
	struct kvmppc_slb *slbe;
	unsigned long slb_v;
	unsigned long pp, key;
	unsigned long v, gr;
	unsigned long *hptep;
	int index;
	int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);

	/* Get SLB entry */
	if (virtmode) {
		slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr);
		if (!slbe)
			return -EINVAL;
		slb_v = slbe->origv;
	} else {
		/* real mode access */
		slb_v = vcpu->kvm->arch.vrma_slb_v;
	}

	/* Find the HPTE in the hash table */
	index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v,
					 HPTE_V_VALID | HPTE_V_ABSENT);
	if (index < 0)
		return -ENOENT;
	hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
	v = hptep[0] & ~HPTE_V_HVLOCK;
	gr = kvm->arch.revmap[index].guest_rpte;

	/* Unlock the HPTE */
	asm volatile("lwsync" : : : "memory");
	hptep[0] = v;

	gpte->eaddr = eaddr;
	gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff);

	/* Get PP bits and key for permission check */
	pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
	key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
	key &= slb_v;

	/* Calculate permissions */
	gpte->may_read = hpte_read_permission(pp, key);
	gpte->may_write = hpte_write_permission(pp, key);
	gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G));

	/* Storage key permission check for POWER7 */
	if (data && virtmode && cpu_has_feature(CPU_FTR_ARCH_206)) {
		int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr);
		if (amrfield & 1)
			gpte->may_read = 0;
		if (amrfield & 2)
			gpte->may_write = 0;
	}

	/* Get the guest physical address */
	gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr);
	return 0;
}

/*
 * Quick test for whether an instruction is a load or a store.
 * If the instruction is a load or a store, then this will indicate
 * which it is, at least on server processors.  (Embedded processors
 * have some external PID instructions that don't follow the rule
 * embodied here.)  If the instruction isn't a load or store, then
 * this doesn't return anything useful.
 */
static int instruction_is_store(unsigned int instr)
{
	unsigned int mask;

	mask = 0x10000000;
	if ((instr & 0xfc000000) == 0x7c000000)
		mask = 0x100;		/* major opcode 31 */
	return (instr & mask) != 0;
}

static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
				  unsigned long gpa, gva_t ea, int is_store)
{
	int ret;
	u32 last_inst;
	unsigned long srr0 = kvmppc_get_pc(vcpu);

	/* We try to load the last instruction.  We don't let
	 * emulate_instruction do it as it doesn't check what
	 * kvmppc_ld returns.
	 * If we fail, we just return to the guest and try executing it again.
	 */
	if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED) {
		ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false);
		if (ret != EMULATE_DONE || last_inst == KVM_INST_FETCH_FAILED)
			return RESUME_GUEST;
		vcpu->arch.last_inst = last_inst;
	}

	/*
	 * WARNING: We do not know for sure whether the instruction we just
	 * read from memory is the same that caused the fault in the first
	 * place.  If the instruction we read is neither a load nor a store,
	 * then it can't access memory, so we don't need to worry about
	 * enforcing access permissions.  So, assuming it is a load or
	 * store, we just check that its direction (load or store) is
	 * consistent with the original fault, since that's what we
	 * checked the access permissions against.  If there is a mismatch
	 * we just return and retry the instruction.
	 */

	if (instruction_is_store(vcpu->arch.last_inst) != !!is_store)
		return RESUME_GUEST;

	/*
	 * Emulated accesses are emulated by looking at the hash for
	 * translation once, then performing the access later.  The
	 * translation could be invalidated in the meantime, at which
	 * point performing the subsequent memory access on the old
	 * physical address could possibly be a security hole for the
	 * guest (but not the host).
	 *
	 * This is less of an issue for MMIO stores since they aren't
	 * globally visible.  It could be an issue for MMIO loads to
	 * a certain extent but we'll ignore it for now.
	 */

	vcpu->arch.paddr_accessed = gpa;
	vcpu->arch.vaddr_accessed = ea;
	return kvmppc_emulate_mmio(run, vcpu);
}

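/*
 * Handle a guest page fault that the real-mode handler could not resolve:
 * pin the backing host page (or emulate MMIO if there is no memslot for
 * the address) and update the HPTE to point to it.
 */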
int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
				unsigned long ea, unsigned long dsisr)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long *hptep, hpte[3], r;
	unsigned long mmu_seq, psize, pte_size;
	unsigned long gpa, gfn, hva, pfn;
	struct kvm_memory_slot *memslot;
	unsigned long *rmap;
	struct revmap_entry *rev;
	struct page *page, *pages[1];
	long index, ret, npages;
	unsigned long is_io;
	unsigned int writing, write_ok;
	struct vm_area_struct *vma;
	unsigned long rcbits;

	/*
	 * Real-mode code has already searched the HPT and found the
	 * entry we're interested in.  Lock the entry and check that
	 * it hasn't changed.  If it has, just return and re-execute the
	 * instruction.
	 */
	if (ea != vcpu->arch.pgfault_addr)
		return RESUME_GUEST;
	index = vcpu->arch.pgfault_index;
	hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
	rev = &kvm->arch.revmap[index];
	preempt_disable();
	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
		cpu_relax();
	hpte[0] = hptep[0] & ~HPTE_V_HVLOCK;
	hpte[1] = hptep[1];
	hpte[2] = r = rev->guest_rpte;
	asm volatile("lwsync" : : : "memory");
	hptep[0] = hpte[0];
	preempt_enable();

	if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
	    hpte[1] != vcpu->arch.pgfault_hpte[1])
		return RESUME_GUEST;

	/* Translate the logical address and get the page */
	psize = hpte_page_size(hpte[0], r);
	gpa = (r & HPTE_R_RPN & ~(psize - 1)) | (ea & (psize - 1));
	gfn = gpa >> PAGE_SHIFT;
	memslot = gfn_to_memslot(kvm, gfn);

	/* No memslot means it's an emulated MMIO region */
	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
		return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
					      dsisr & DSISR_ISSTORE);

	if (!kvm->arch.using_mmu_notifiers)
		return -EFAULT;		/* should never get here */

	/* used to check for invalidations in progress */
	mmu_seq = kvm->mmu_notifier_seq;
	smp_rmb();

	is_io = 0;
	pfn = 0;
	page = NULL;
	pte_size = PAGE_SIZE;
	writing = (dsisr & DSISR_ISSTORE) != 0;
	/* If writing != 0, then the HPTE must allow writing, if we get here */
	write_ok = writing;
	hva = gfn_to_hva_memslot(memslot, gfn);
	npages = get_user_pages_fast(hva, 1, writing, pages);
	if (npages < 1) {
		/* Check if it's an I/O mapping */
		down_read(&current->mm->mmap_sem);
		vma = find_vma(current->mm, hva);
		if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end &&
		    (vma->vm_flags & VM_PFNMAP)) {
			pfn = vma->vm_pgoff +
				((hva - vma->vm_start) >> PAGE_SHIFT);
			pte_size = psize;
			is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot));
			write_ok = vma->vm_flags & VM_WRITE;
		}
		up_read(&current->mm->mmap_sem);
		if (!pfn)
			return -EFAULT;
	} else {
		page = pages[0];
		if (PageHuge(page)) {
			page = compound_head(page);
			pte_size <<= compound_order(page);
		}
		/* if the guest wants write access, see if that is OK */
		if (!writing && hpte_is_writable(r)) {
			pte_t *ptep, pte;

			/*
			 * We need to protect against page table destruction
			 * while looking up and updating the pte.
			 */
			rcu_read_lock_sched();
			ptep = find_linux_pte_or_hugepte(current->mm->pgd,
							 hva, NULL);
			if (ptep && pte_present(*ptep)) {
				pte = kvmppc_read_update_linux_pte(ptep, 1);
				if (pte_write(pte))
					write_ok = 1;
			}
			rcu_read_unlock_sched();
		}
		pfn = page_to_pfn(page);
	}

	ret = -EFAULT;
	if (psize > pte_size)
		goto out_put;

	/* Check WIMG vs. the actual page we're accessing */
	if (!hpte_cache_flags_ok(r, is_io)) {
		if (is_io)
			return -EFAULT;
		/*
		 * Allow guest to map emulated device memory as
		 * uncacheable, but actually make it cacheable.
		 */
		r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M;
	}

	/* Set the HPTE to point to pfn */
	r = (r & ~(HPTE_R_PP0 - pte_size)) | (pfn << PAGE_SHIFT);
	if (hpte_is_writable(r) && !write_ok)
		r = hpte_make_readonly(r);
	ret = RESUME_GUEST;
	preempt_disable();
	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
		cpu_relax();
	if ((hptep[0] & ~HPTE_V_HVLOCK) != hpte[0] || hptep[1] != hpte[1] ||
	    rev->guest_rpte != hpte[2])
		/* HPTE has been changed under us; let the guest retry */
		goto out_unlock;
	hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;

	rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
	lock_rmap(rmap);

	/* Check if we might have been invalidated; let the guest retry if so */
	ret = RESUME_GUEST;
	if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) {
		unlock_rmap(rmap);
		goto out_unlock;
	}

	/* Only set R/C in real HPTE if set in both *rmap and guest_rpte */
	rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
	r &= rcbits | ~(HPTE_R_R | HPTE_R_C);

	if (hptep[0] & HPTE_V_VALID) {
		/* HPTE was previously valid, so we need to invalidate it */
		unlock_rmap(rmap);
		hptep[0] |= HPTE_V_ABSENT;
		kvmppc_invalidate_hpte(kvm, hptep, index);
		/* don't lose previous R and C bits */
		r |= hptep[1] & (HPTE_R_R | HPTE_R_C);
	} else {
		kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0);
	}

	hptep[1] = r;
	eieio();
	hptep[0] = hpte[0];
	asm volatile("ptesync" : : : "memory");
	preempt_enable();
	if (page && hpte_is_writable(r))
		SetPageDirty(page);

 out_put:
	if (page) {
		/*
		 * We drop pages[0] here, not page because page might
		 * have been set to the head page of a compound, but
		 * we have to drop the reference on the correct tail
		 * page to match the get inside gup()
		 */
		put_page(pages[0]);
	}
	return ret;

 out_unlock:
	hptep[0] &= ~HPTE_V_HVLOCK;
	preempt_enable();
	goto out_put;
}

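/*
 * Apply a handler to the rmap chain of every guest page whose host
 * virtual address falls in [start, end), across all memslots.
 */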
static int kvm_handle_hva_range(struct kvm *kvm,
				unsigned long start,
				unsigned long end,
				int (*handler)(struct kvm *kvm,
					       unsigned long *rmapp,
					       unsigned long gfn))
{
	int ret;
	int retval = 0;
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, slots) {
		unsigned long hva_start, hva_end;
		gfn_t gfn, gfn_end;

		hva_start = max(start, memslot->userspace_addr);
		hva_end = min(end, memslot->userspace_addr +
					(memslot->npages << PAGE_SHIFT));
		if (hva_start >= hva_end)
			continue;
		/*
		 * {gfn(page) | page intersects with [hva_start, hva_end)} =
		 * {gfn, gfn+1, ..., gfn_end-1}.
		 */
		gfn = hva_to_gfn_memslot(hva_start, memslot);
		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);

		for (; gfn < gfn_end; ++gfn) {
			gfn_t gfn_offset = gfn - memslot->base_gfn;

			ret = handler(kvm, &memslot->arch.rmap[gfn_offset], gfn);
			retval |= ret;
		}
	}

	return retval;
}

static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
					 unsigned long gfn))
{
	return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
}

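/*
 * MMU notifier unmap callback for one rmap chain: invalidate every HPTE
 * that maps the given guest page, preserving the referenced and changed
 * bits in the guest view.
 */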
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
			   unsigned long gfn)
{
	struct revmap_entry *rev = kvm->arch.revmap;
	unsigned long h, i, j;
	unsigned long *hptep;
	unsigned long ptel, psize, rcbits;

	for (;;) {
		lock_rmap(rmapp);
		if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
			unlock_rmap(rmapp);
			break;
		}

		/*
		 * To avoid an ABBA deadlock with the HPTE lock bit,
		 * we can't spin on the HPTE lock while holding the
		 * rmap chain lock.
		 */
		i = *rmapp & KVMPPC_RMAP_INDEX;
		hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
			/* unlock rmap before spinning on the HPTE lock */
			unlock_rmap(rmapp);
			while (hptep[0] & HPTE_V_HVLOCK)
				cpu_relax();
			continue;
		}
		j = rev[i].forw;
		if (j == i) {
			/* chain is now empty */
			*rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
		} else {
			/* remove i from chain */
			h = rev[i].back;
			rev[h].forw = j;
			rev[j].back = h;
			rev[i].forw = rev[i].back = i;
			*rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j;
		}

		/* Now check and modify the HPTE */
		ptel = rev[i].guest_rpte;
		psize = hpte_page_size(hptep[0], ptel);
		if ((hptep[0] & HPTE_V_VALID) &&
		    hpte_rpn(ptel, psize) == gfn) {
			if (kvm->arch.using_mmu_notifiers)
				hptep[0] |= HPTE_V_ABSENT;
			kvmppc_invalidate_hpte(kvm, hptep, i);
			/* Harvest R and C */
			rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C);
			*rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
			rev[i].guest_rpte = ptel | rcbits;
		}
		unlock_rmap(rmapp);
		hptep[0] &= ~HPTE_V_HVLOCK;
	}
	return 0;
}

int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
{
	if (kvm->arch.using_mmu_notifiers)
		kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
	return 0;
}

int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
{
	if (kvm->arch.using_mmu_notifiers)
		kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp);
	return 0;
}

void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
	unsigned long *rmapp;
	unsigned long gfn;
	unsigned long n;

	rmapp = memslot->arch.rmap;
	gfn = memslot->base_gfn;
	for (n = memslot->npages; n; --n) {
		/*
		 * Testing the present bit without locking is OK because
		 * the memslot has been marked invalid already, and hence
		 * no new HPTEs referencing this page can be created,
		 * thus the present bit can't go from 0 to 1.
		 */
		if (*rmapp & KVMPPC_RMAP_PRESENT)
			kvm_unmap_rmapp(kvm, rmapp, gfn);
		++rmapp;
		++gfn;
	}
}

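/*
 * Test and clear the referenced (R) bit on every HPTE in this rmap chain;
 * returns 1 if the page had been referenced.
 */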
static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
			 unsigned long gfn)
{
	struct revmap_entry *rev = kvm->arch.revmap;
	unsigned long head, i, j;
	unsigned long *hptep;
	int ret = 0;

 retry:
	lock_rmap(rmapp);
	if (*rmapp & KVMPPC_RMAP_REFERENCED) {
		*rmapp &= ~KVMPPC_RMAP_REFERENCED;
		ret = 1;
	}
	if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
		unlock_rmap(rmapp);
		return ret;
	}

	i = head = *rmapp & KVMPPC_RMAP_INDEX;
	do {
		hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
		j = rev[i].forw;

		/* If this HPTE isn't referenced, ignore it */
		if (!(hptep[1] & HPTE_R_R))
			continue;

		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
			/* unlock rmap before spinning on the HPTE lock */
			unlock_rmap(rmapp);
			while (hptep[0] & HPTE_V_HVLOCK)
				cpu_relax();
			goto retry;
		}

		/* Now check and modify the HPTE */
		if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_R)) {
			kvmppc_clear_ref_hpte(kvm, hptep, i);
			rev[i].guest_rpte |= HPTE_R_R;
			ret = 1;
		}
		hptep[0] &= ~HPTE_V_HVLOCK;
	} while ((i = j) != head);

	unlock_rmap(rmapp);
	return ret;
}

int kvm_age_hva(struct kvm *kvm, unsigned long hva)
{
	if (!kvm->arch.using_mmu_notifiers)
		return 0;
	return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
}

static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
			      unsigned long gfn)
{
	struct revmap_entry *rev = kvm->arch.revmap;
	unsigned long head, i, j;
	unsigned long *hp;
	int ret = 1;

	if (*rmapp & KVMPPC_RMAP_REFERENCED)
		return 1;

	lock_rmap(rmapp);
	if (*rmapp & KVMPPC_RMAP_REFERENCED)
		goto out;

	if (*rmapp & KVMPPC_RMAP_PRESENT) {
		i = head = *rmapp & KVMPPC_RMAP_INDEX;
		do {
			hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4));
			j = rev[i].forw;
			if (hp[1] & HPTE_R_R)
				goto out;
		} while ((i = j) != head);
	}
	ret = 0;

 out:
	unlock_rmap(rmapp);
	return ret;
}

int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
{
	if (!kvm->arch.using_mmu_notifiers)
		return 0;
	return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp);
}

void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
	if (!kvm->arch.using_mmu_notifiers)
		return;
	kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
}

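/*
 * Test and clear the changed (C) bit on every HPTE in this rmap chain;
 * returns 1 if the page has been modified.
 */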
static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
{
	struct revmap_entry *rev = kvm->arch.revmap;
	unsigned long head, i, j;
	unsigned long *hptep;
	int ret = 0;

 retry:
	lock_rmap(rmapp);
	if (*rmapp & KVMPPC_RMAP_CHANGED) {
		*rmapp &= ~KVMPPC_RMAP_CHANGED;
		ret = 1;
	}
	if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
		unlock_rmap(rmapp);
		return ret;
	}

	i = head = *rmapp & KVMPPC_RMAP_INDEX;
	do {
		hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
		j = rev[i].forw;

		if (!(hptep[1] & HPTE_R_C))
			continue;

		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
			/* unlock rmap before spinning on the HPTE lock */
			unlock_rmap(rmapp);
			while (hptep[0] & HPTE_V_HVLOCK)
				cpu_relax();
			goto retry;
		}

		/* Now check and modify the HPTE */
		if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_C)) {
			/* need to make it temporarily absent to clear C */
			hptep[0] |= HPTE_V_ABSENT;
			kvmppc_invalidate_hpte(kvm, hptep, i);
			hptep[1] &= ~HPTE_R_C;
			eieio();
			hptep[0] = (hptep[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
			rev[i].guest_rpte |= HPTE_R_C;
			ret = 1;
		}
		hptep[0] &= ~HPTE_V_HVLOCK;
	} while ((i = j) != head);

	unlock_rmap(rmapp);
	return ret;
}

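/* Harvest the hardware changed bits into the dirty bitmap for a memslot. */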
long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
			     unsigned long *map)
{
	unsigned long i;
	unsigned long *rmapp;

	preempt_disable();
	rmapp = memslot->arch.rmap;
	for (i = 0; i < memslot->npages; ++i) {
		if (kvm_test_clear_dirty(kvm, rmapp) && map)
			__set_bit_le(i, map);
		++rmapp;
	}
	preempt_enable();
	return 0;
}

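/*
 * Get a kernel mapping for a page of guest memory at the given guest
 * physical address, taking a reference that kvmppc_unpin_guest_page()
 * later drops.
 */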
void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
			    unsigned long *nb_ret)
{
	struct kvm_memory_slot *memslot;
	unsigned long gfn = gpa >> PAGE_SHIFT;
	struct page *page, *pages[1];
	int npages;
	unsigned long hva, psize, offset;
	unsigned long pa;
	unsigned long *physp;
	int srcu_idx;

	srcu_idx = srcu_read_lock(&kvm->srcu);
	memslot = gfn_to_memslot(kvm, gfn);
	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
		goto err;
	if (!kvm->arch.using_mmu_notifiers) {
		physp = memslot->arch.slot_phys;
		if (!physp)
			goto err;
		physp += gfn - memslot->base_gfn;
		pa = *physp;
		if (!pa) {
			if (kvmppc_get_guest_page(kvm, gfn, memslot,
						  PAGE_SIZE) < 0)
				goto err;
			pa = *physp;
		}
		page = pfn_to_page(pa >> PAGE_SHIFT);
		get_page(page);
	} else {
		hva = gfn_to_hva_memslot(memslot, gfn);
		npages = get_user_pages_fast(hva, 1, 1, pages);
		if (npages < 1)
			goto err;
		page = pages[0];
	}
	srcu_read_unlock(&kvm->srcu, srcu_idx);

	psize = PAGE_SIZE;
	if (PageHuge(page)) {
		page = compound_head(page);
		psize <<= compound_order(page);
	}
	offset = gpa & (psize - 1);
	if (nb_ret)
		*nb_ret = psize - offset;
	return page_address(page) + offset;

 err:
	srcu_read_unlock(&kvm->srcu, srcu_idx);
	return NULL;
}

void kvmppc_unpin_guest_page(struct kvm *kvm, void *va)
{
	struct page *page = virt_to_page(va);

	put_page(page);
}

/*
 * Functions for reading and writing the hash table via reads and
 * writes on a file descriptor.
 *
 * Reads return the guest view of the hash table, which has to be
 * pieced together from the real hash table and the guest_rpte
 * values in the revmap array.
 *
 * On writes, each HPTE written is considered in turn, and if it
 * is valid, it is written to the HPT as if an H_ENTER with the
 * exact flag set was done.  When the invalid count is non-zero
 * in the header written to the stream, the kernel will make
 * sure that that many HPTEs are invalid, and invalidate them
 * if not.
 */

struct kvm_htab_ctx {
	unsigned long index;
	unsigned long flags;
	struct kvm *kvm;
	int first_pass;
};

#define HPTE_SIZE	(2 * sizeof(unsigned long))

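/*
 * Capture one HPTE (combined with its guest view from the revmap) into
 * hpte[], clearing its MODIFIED flag; returns 1 if the entry is of the
 * wanted kind (valid or invalid) and should be emitted, 0 otherwise.
 */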
static long record_hpte(unsigned long flags, unsigned long *hptp,
			unsigned long *hpte, struct revmap_entry *revp,
			int want_valid, int first_pass)
{
	unsigned long v, r;
	int ok = 1;
	int valid, dirty;

	/* Unmodified entries are uninteresting except on the first pass */
	dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
	if (!first_pass && !dirty)
		return 0;

	valid = 0;
	if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) {
		valid = 1;
		if ((flags & KVM_GET_HTAB_BOLTED_ONLY) &&
		    !(hptp[0] & HPTE_V_BOLTED))
			valid = 0;
	}
	if (valid != want_valid)
		return 0;

	v = r = 0;
	if (valid || dirty) {
		/* lock the HPTE so it's stable and read it */
		preempt_disable();
		while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
			cpu_relax();
		v = hptp[0];
		if (v & HPTE_V_ABSENT) {
			v &= ~HPTE_V_ABSENT;
			v |= HPTE_V_VALID;
		}
		/* re-evaluate valid and dirty from synchronized HPTE value */
		valid = !!(v & HPTE_V_VALID);
		if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED))
			valid = 0;
		r = revp->guest_rpte | (hptp[1] & (HPTE_R_R | HPTE_R_C));
		dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
		/* only clear modified if this is the right sort of entry */
		if (valid == want_valid && dirty) {
			r &= ~HPTE_GR_MODIFIED;
			revp->guest_rpte = r;
		}
		asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
		hptp[0] &= ~HPTE_V_HVLOCK;
		preempt_enable();
		if (!(valid == want_valid && (first_pass || dirty)))
			ok = 0;
	}
	hpte[0] = v;
	hpte[1] = r;
	return ok;
}

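/*
 * read() handler for the HPT file descriptor: stream out runs of valid
 * and invalid HPTEs, each run preceded by a kvm_get_htab_header.  After
 * the first pass, only modified entries are reported.
 */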
static ssize_t kvm_htab_read(struct file *file, char __user *buf,
			     size_t count, loff_t *ppos)
{
	struct kvm_htab_ctx *ctx = file->private_data;
	struct kvm *kvm = ctx->kvm;
	struct kvm_get_htab_header hdr;
	unsigned long *hptp;
	struct revmap_entry *revp;
	unsigned long i, nb, nw;
	unsigned long __user *lbuf;
	struct kvm_get_htab_header __user *hptr;
	unsigned long flags;
	int first_pass;
	unsigned long hpte[2];

	if (!access_ok(VERIFY_WRITE, buf, count))
		return -EFAULT;

	first_pass = ctx->first_pass;
	flags = ctx->flags;

	i = ctx->index;
	hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
	revp = kvm->arch.revmap + i;
	lbuf = (unsigned long __user *)buf;

	nb = 0;
	while (nb + sizeof(hdr) + HPTE_SIZE < count) {
		/* Initialize header */
		hptr = (struct kvm_get_htab_header __user *)buf;
		hdr.index = i;
		hdr.n_valid = 0;
		hdr.n_invalid = 0;
		nw = nb;
		nb += sizeof(hdr);
		lbuf = (unsigned long __user *)(buf + sizeof(hdr));

		/* Skip uninteresting entries, i.e. clean on not-first pass */
		if (!first_pass) {
			while (i < kvm->arch.hpt_npte &&
			       !(revp->guest_rpte & HPTE_GR_MODIFIED)) {
				++i;
				hptp += 2;
				++revp;
			}
		}

		/* Grab a series of valid entries */
		while (i < kvm->arch.hpt_npte &&
		       hdr.n_valid < 0xffff &&
		       nb + HPTE_SIZE < count &&
		       record_hpte(flags, hptp, hpte, revp, 1, first_pass)) {
			/* valid entry, write it out */
			++hdr.n_valid;
			if (__put_user(hpte[0], lbuf) ||
			    __put_user(hpte[1], lbuf + 1))
				return -EFAULT;
			nb += HPTE_SIZE;
			lbuf += 2;
			++i;
			hptp += 2;
			++revp;
		}
		/* Now skip invalid entries while we can */
		while (i < kvm->arch.hpt_npte &&
		       hdr.n_invalid < 0xffff &&
		       record_hpte(flags, hptp, hpte, revp, 0, first_pass)) {
			/* found an invalid entry */
			++hdr.n_invalid;
			++i;
			hptp += 2;
			++revp;
		}

		if (hdr.n_valid || hdr.n_invalid) {
			/* write back the header */
			if (__copy_to_user(hptr, &hdr, sizeof(hdr)))
				return -EFAULT;
			nw = nb;
			buf = (char __user *)lbuf;
		} else {
			nb = nw;
		}

		/* Check if we've wrapped around the hash table */
		if (i >= kvm->arch.hpt_npte) {
			i = 0;
			ctx->first_pass = 0;
			break;
		}
	}

	ctx->index = i;

	return nb;
}

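/*
 * write() handler for the HPT file descriptor: consume headers and HPTE
 * values from userspace, entering the valid ones into the HPT and
 * invalidating the requested number of following entries.
 */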
static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
			      size_t count, loff_t *ppos)
{
	struct kvm_htab_ctx *ctx = file->private_data;
	struct kvm *kvm = ctx->kvm;
	struct kvm_get_htab_header hdr;
	unsigned long i, j;
	unsigned long v, r;
	unsigned long __user *lbuf;
	unsigned long *hptp;
	unsigned long tmp[2];
	ssize_t nb;
	long int err, ret;
	int rma_setup;

	if (!access_ok(VERIFY_READ, buf, count))
		return -EFAULT;

	/* lock out vcpus from running while we're doing this */
	mutex_lock(&kvm->lock);
	rma_setup = kvm->arch.rma_setup_done;
	if (rma_setup) {
		kvm->arch.rma_setup_done = 0;	/* temporarily */
		/* order rma_setup_done vs. vcpus_running */
		smp_mb();
		if (atomic_read(&kvm->arch.vcpus_running)) {
			kvm->arch.rma_setup_done = 1;
			mutex_unlock(&kvm->lock);
			return -EBUSY;
		}
	}

	err = 0;
	for (nb = 0; nb + sizeof(hdr) <= count; ) {
		err = -EFAULT;
		if (__copy_from_user(&hdr, buf, sizeof(hdr)))
			break;

		err = 0;
		if (nb + hdr.n_valid * HPTE_SIZE > count)
			break;

		nb += sizeof(hdr);
		buf += sizeof(hdr);

		err = -EINVAL;
		i = hdr.index;
		if (i >= kvm->arch.hpt_npte ||
		    i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte)
			break;

		hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
		lbuf = (unsigned long __user *)buf;
		for (j = 0; j < hdr.n_valid; ++j) {
			err = -EFAULT;
			if (__get_user(v, lbuf) || __get_user(r, lbuf + 1))
				goto out;
			err = -EINVAL;
			if (!(v & HPTE_V_VALID))
				goto out;
			lbuf += 2;
			nb += HPTE_SIZE;

			if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT))
				kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
			err = -EIO;
			ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r,
							 tmp);
			if (ret != H_SUCCESS) {
				pr_err("kvm_htab_write ret %ld i=%ld v=%lx "
				       "r=%lx\n", ret, i, v, r);
				goto out;
			}
			if (!rma_setup && is_vrma_hpte(v)) {
				unsigned long psize = hpte_page_size(v, r);
				unsigned long senc = slb_pgsize_encoding(psize);
				unsigned long lpcr;

				kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
					(VRMA_VSID << SLB_VSID_SHIFT_1T);
				lpcr = kvm->arch.lpcr & ~LPCR_VRMASD;
				lpcr |= senc << (LPCR_VRMASD_SH - 4);
				kvm->arch.lpcr = lpcr;
				rma_setup = 1;
			}
			++i;
			hptp += 2;
		}

		for (j = 0; j < hdr.n_invalid; ++j) {
			if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT))
				kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
			++i;
			hptp += 2;
		}
		err = 0;
	}

 out:
	/* Order HPTE updates vs. rma_setup_done */
	smp_wmb();
	kvm->arch.rma_setup_done = rma_setup;
	mutex_unlock(&kvm->lock);

	if (err)
		return err;
	return nb;
}

static int kvm_htab_release(struct inode *inode, struct file *filp)
{
	struct kvm_htab_ctx *ctx = filp->private_data;

	filp->private_data = NULL;
	if (!(ctx->flags & KVM_GET_HTAB_WRITE))
		atomic_dec(&ctx->kvm->arch.hpte_mod_interest);
	kvm_put_kvm(ctx->kvm);
	kfree(ctx);
	return 0;
}

static struct file_operations kvm_htab_fops = {
	.read = kvm_htab_read,
	.write = kvm_htab_write,
	.llseek = default_llseek,
	.release = kvm_htab_release,
};

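/*
 * Implement the KVM_PPC_GET_HTAB_FD ioctl: create an anonymous file
 * descriptor through which userspace can read or write the guest HPT.
 */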
int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
{
	int ret;
	struct kvm_htab_ctx *ctx;
	int rwflag;

	/* reject flags we don't recognize */
	if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE))
		return -EINVAL;
	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;
	kvm_get_kvm(kvm);
	ctx->kvm = kvm;
	ctx->index = ghf->start_index;
	ctx->flags = ghf->flags;
	ctx->first_pass = 1;

	rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY;
	ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag);
	if (ret < 0) {
		kvm_put_kvm(kvm);
		return ret;
	}

	if (rwflag == O_RDONLY) {
		mutex_lock(&kvm->slots_lock);
		atomic_inc(&kvm->arch.hpte_mod_interest);
		/* make sure kvmppc_do_h_enter etc. see the increment */
		synchronize_srcu_expedited(&kvm->srcu);
		mutex_unlock(&kvm->slots_lock);
	}

	return ret;
}

void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
{
	struct kvmppc_mmu *mmu = &vcpu->arch.mmu;

	if (cpu_has_feature(CPU_FTR_ARCH_206))
		vcpu->arch.slb_nr = 32;		/* POWER7 */
	else
		vcpu->arch.slb_nr = 64;

	mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
	mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;

	vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
}