/*
 * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
 *
 * Authors:
 *    Paul Mackerras <paulus@au1.ibm.com>
 *    Alexander Graf <agraf@suse.de>
 *    Kevin Wolf <mail@kevin-wolf.de>
 *
 * Description: KVM functions specific to running on Book 3S
 * processors in hypervisor mode (specifically POWER7 and later).
 *
 * This file is derived from arch/powerpc/kvm/book3s.c,
 * by Alexander Graf <agraf@suse.de>.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 */

#include <linux/kvm_host.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/preempt.h>
#include <linux/sched/signal.h>
#include <linux/sched/stat.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/anon_inodes.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/spinlock.h>
#include <linux/page-flags.h>
#include <linux/srcu.h>
#include <linux/miscdevice.h>
#include <linux/debugfs.h>
#include <linux/gfp.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kvm_irqfd.h>
#include <linux/irqbypass.h>
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/of.h>

#include <asm/reg.h>
#include <asm/ppc-opcode.h>
#include <asm/disassemble.h>
#include <asm/cputable.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <linux/uaccess.h>
#include <asm/io.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/mmu_context.h>
#include <asm/lppaca.h>
#include <asm/processor.h>
#include <asm/cputhreads.h>
#include <asm/page.h>
#include <asm/hvcall.h>
#include <asm/switch_to.h>
#include <asm/smp.h>
#include <asm/dbell.h>
#include <asm/hmi.h>
#include <asm/pnv-pci.h>
#include <asm/mmu.h>
#include <asm/opal.h>
#include <asm/xics.h>
#include <asm/xive.h>

#include "book3s.h"

#define CREATE_TRACE_POINTS
#include "trace_hv.h"

/* #define EXIT_DEBUG */
/* #define EXIT_DEBUG_SIMPLE */
/* #define EXIT_DEBUG_INT */

/* Used to indicate that a guest page fault needs to be handled */
#define RESUME_PAGE_FAULT	(RESUME_GUEST | RESUME_FLAG_ARCH1)
/* Used to indicate that a guest passthrough interrupt needs to be handled */
#define RESUME_PASSTHROUGH	(RESUME_GUEST | RESUME_FLAG_ARCH2)

/* Used as a "null" value for timebase values */
#define TB_NIL	(~(u64)0)

static DECLARE_BITMAP(default_enabled_hcalls, MAX_HCALL_OPCODE/4 + 1);

static int dynamic_mt_modes = 6;
module_param(dynamic_mt_modes, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(dynamic_mt_modes, "Set of allowed dynamic micro-threading modes: 0 (= none), 2, 4, or 6 (= 2 or 4)");
static int target_smt_mode;
module_param(target_smt_mode, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");

#ifdef CONFIG_KVM_XICS
static struct kernel_param_ops module_param_ops = {
	.set = param_set_int,
	.get = param_get_int,
};

module_param_cb(kvm_irq_bypass, &module_param_ops, &kvm_irq_bypass,
		S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(kvm_irq_bypass, "Bypass passthrough interrupt optimization");

module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
		S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
#endif

static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);

static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc,
		int *ip)
{
	int i = *ip;
	struct kvm_vcpu *vcpu;

	while (++i < MAX_SMT_THREADS) {
		vcpu = READ_ONCE(vc->runnable_threads[i]);
		if (vcpu) {
			*ip = i;
			return vcpu;
		}
	}
	return NULL;
}

/* Used to traverse the list of runnable threads for a given vcore */
#define for_each_runnable_thread(i, vcpu, vc) \
	for (i = -1; (vcpu = next_runnable_thread(vc, &i)); )

static bool kvmppc_ipi_thread(int cpu)
{
	unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);

	/* On POWER9 we can use msgsnd to IPI any cpu */
	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		msg |= get_hard_smp_processor_id(cpu);
		smp_mb();
		__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
		return true;
	}

	/* On POWER8 for IPIs to threads in the same core, use msgsnd */
	if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
		preempt_disable();
		if (cpu_first_thread_sibling(cpu) ==
		    cpu_first_thread_sibling(smp_processor_id())) {
			msg |= cpu_thread_in_core(cpu);
			smp_mb();
			__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
			preempt_enable();
			return true;
		}
		preempt_enable();
	}

#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
	if (cpu >= 0 && cpu < nr_cpu_ids) {
		if (paca[cpu].kvm_hstate.xics_phys) {
			xics_wake_cpu(cpu);
			return true;
		}
		opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY);
		return true;
	}
#endif

	return false;
}

static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
{
	int cpu;
	struct swait_queue_head *wqp;

	wqp = kvm_arch_vcpu_wq(vcpu);
	if (swait_active(wqp)) {
		swake_up(wqp);
		++vcpu->stat.halt_wakeup;
	}

	cpu = READ_ONCE(vcpu->arch.thread_cpu);
	if (cpu >= 0 && kvmppc_ipi_thread(cpu))
		return;

	/* CPU points to the first thread of the core */
	cpu = vcpu->cpu;
	if (cpu >= 0 && cpu < nr_cpu_ids && cpu_online(cpu))
		smp_send_reschedule(cpu);
}

/*
 * We use the vcpu_load/put functions to measure stolen time.
 * Stolen time is counted as time when either the vcpu is able to
 * run as part of a virtual core, but the task running the vcore
 * is preempted or sleeping, or when the vcpu needs something done
 * in the kernel by the task running the vcpu, but that task is
 * preempted or sleeping.  Those two things have to be counted
 * separately, since one of the vcpu tasks will take on the job
 * of running the core, and the other vcpu tasks in the vcore will
 * sleep waiting for it to do that, but that sleep shouldn't count
 * as stolen time.
 *
 * Hence we accumulate stolen time when the vcpu can run as part of
 * a vcore using vc->stolen_tb, and the stolen time when the vcpu
 * needs its task to do other things in the kernel (for example,
 * service a page fault) in busy_stolen.  We don't accumulate
 * stolen time for a vcore when it is inactive, or for a vcpu
 * when it is in state RUNNING or NOTREADY.  NOTREADY is a bit of
 * a misnomer; it means that the vcpu task is not executing in
 * the KVM_VCPU_RUN ioctl, i.e. it is in userspace or elsewhere in
 * the kernel.  We don't have any way of dividing up that time
 * between time that the vcpu is genuinely stopped, time that
 * the task is actively working on behalf of the vcpu, and time
 * that the task is preempted, so we don't count any of it as
 * stolen.
 *
 * Updates to busy_stolen are protected by arch.tbacct_lock;
 * updates to vc->stolen_tb are protected by the vcore->stoltb_lock
 * lock.  The stolen times are measured in units of timebase ticks.
 * (Note that the != TB_NIL checks below are purely defensive;
 * they should never fail.)
 */

static void kvmppc_core_start_stolen(struct kvmppc_vcore *vc)
{
	unsigned long flags;

	spin_lock_irqsave(&vc->stoltb_lock, flags);
	vc->preempt_tb = mftb();
	spin_unlock_irqrestore(&vc->stoltb_lock, flags);
}

static void kvmppc_core_end_stolen(struct kvmppc_vcore *vc)
{
	unsigned long flags;

	spin_lock_irqsave(&vc->stoltb_lock, flags);
	if (vc->preempt_tb != TB_NIL) {
		vc->stolen_tb += mftb() - vc->preempt_tb;
		vc->preempt_tb = TB_NIL;
	}
	spin_unlock_irqrestore(&vc->stoltb_lock, flags);
}

static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu)
{
	struct kvmppc_vcore *vc = vcpu->arch.vcore;
	unsigned long flags;

	/*
	 * We can test vc->runner without taking the vcore lock,
	 * because only this task ever sets vc->runner to this
	 * vcpu, and once it is set to this vcpu, only this task
	 * ever sets it to NULL.
	 */
	if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
		kvmppc_core_end_stolen(vc);

	spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
	if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST &&
	    vcpu->arch.busy_preempt != TB_NIL) {
		vcpu->arch.busy_stolen += mftb() - vcpu->arch.busy_preempt;
		vcpu->arch.busy_preempt = TB_NIL;
	}
	spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
}

static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu)
{
	struct kvmppc_vcore *vc = vcpu->arch.vcore;
	unsigned long flags;

	if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
		kvmppc_core_start_stolen(vc);

	spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
	if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST)
		vcpu->arch.busy_preempt = mftb();
	spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
}

static void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr)
{
	/*
	 * Check for illegal transactional state bit combination
	 * and if we find it, force the TS field to a safe state.
	 */
	if ((msr & MSR_TS_MASK) == MSR_TS_MASK)
		msr &= ~MSR_TS_MASK;
	vcpu->arch.shregs.msr = msr;
	kvmppc_end_cede(vcpu);
}

static void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr)
{
	vcpu->arch.pvr = pvr;
}

/* Dummy value used in computing PCR value below */
#define PCR_ARCH_300	(PCR_ARCH_207 << 1)

static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
{
	unsigned long host_pcr_bit = 0, guest_pcr_bit = 0;
	struct kvmppc_vcore *vc = vcpu->arch.vcore;

	/* We can (emulate) our own architecture version and anything older */
	if (cpu_has_feature(CPU_FTR_ARCH_300))
		host_pcr_bit = PCR_ARCH_300;
	else if (cpu_has_feature(CPU_FTR_ARCH_207S))
		host_pcr_bit = PCR_ARCH_207;
	else if (cpu_has_feature(CPU_FTR_ARCH_206))
		host_pcr_bit = PCR_ARCH_206;
	else
		host_pcr_bit = PCR_ARCH_205;

	/* Determine lowest PCR bit needed to run guest in given PVR level */
	guest_pcr_bit = host_pcr_bit;
	if (arch_compat) {
		switch (arch_compat) {
		case PVR_ARCH_205:
			guest_pcr_bit = PCR_ARCH_205;
			break;
		case PVR_ARCH_206:
		case PVR_ARCH_206p:
			guest_pcr_bit = PCR_ARCH_206;
			break;
		case PVR_ARCH_207:
			guest_pcr_bit = PCR_ARCH_207;
			break;
		case PVR_ARCH_300:
			guest_pcr_bit = PCR_ARCH_300;
			break;
		default:
			return -EINVAL;
		}
	}

	/* Check requested PCR bits don't exceed our capabilities */
	if (guest_pcr_bit > host_pcr_bit)
		return -EINVAL;

	spin_lock(&vc->lock);
	vc->arch_compat = arch_compat;
	/* Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit */
	vc->pcr = host_pcr_bit - guest_pcr_bit;
	spin_unlock(&vc->lock);

	return 0;
}

static void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
{
	int r;

	pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id);
	pr_err("pc = %.16lx msr = %.16llx trap = %x\n",
	       vcpu->arch.pc, vcpu->arch.shregs.msr, vcpu->arch.trap);
	for (r = 0; r < 16; ++r)
		pr_err("r%2d = %.16lx r%d = %.16lx\n",
		       r, kvmppc_get_gpr(vcpu, r),
		       r+16, kvmppc_get_gpr(vcpu, r+16));
	pr_err("ctr = %.16lx lr = %.16lx\n",
	       vcpu->arch.ctr, vcpu->arch.lr);
	pr_err("srr0 = %.16llx srr1 = %.16llx\n",
	       vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1);
	pr_err("sprg0 = %.16llx sprg1 = %.16llx\n",
	       vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
	pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
	       vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
	pr_err("cr = %.8x xer = %.16lx dsisr = %.8x\n",
	       vcpu->arch.cr, vcpu->arch.xer, vcpu->arch.shregs.dsisr);
	pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
	pr_err("fault dar = %.16lx dsisr = %.8x\n",
	       vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
	pr_err("SLB (%d entries):\n", vcpu->arch.slb_max);
	for (r = 0; r < vcpu->arch.slb_max; ++r)
		pr_err("  ESID = %.16llx VSID = %.16llx\n",
		       vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
	pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
	       vcpu->arch.vcore->lpcr, vcpu->kvm->arch.sdr1,
	       vcpu->arch.last_inst);
}

static struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
{
	struct kvm_vcpu *ret;

	mutex_lock(&kvm->lock);
	ret = kvm_get_vcpu_by_id(kvm, id);
	mutex_unlock(&kvm->lock);
	return ret;
}

static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
{
	vpa->__old_status |= LPPACA_OLD_SHARED_PROC;
	vpa->yield_count = cpu_to_be32(1);
}

static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v,
		   unsigned long addr, unsigned long len)
{
	/* check address is cacheline aligned */
	if (addr & (L1_CACHE_BYTES - 1))
		return -EINVAL;
	spin_lock(&vcpu->arch.vpa_update_lock);
	if (v->next_gpa != addr || v->len != len) {
		v->next_gpa = addr;
		v->len = addr ? len : 0;
		v->update_pending = 1;
	}
	spin_unlock(&vcpu->arch.vpa_update_lock);
	return 0;
}

/* Length for a per-processor buffer is passed in at offset 4 in the buffer */
struct reg_vpa {
	u32 dummy;
	union {
		__be16 hword;
		__be32 word;
	} length;
};

static int vpa_is_registered(struct kvmppc_vpa *vpap)
{
	if (vpap->update_pending)
		return vpap->next_gpa != 0;
	return vpap->pinned_addr != NULL;
}

static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
				       unsigned long flags,
				       unsigned long vcpuid, unsigned long vpa)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long len, nb;
	void *va;
	struct kvm_vcpu *tvcpu;
	int err;
	int subfunc;
	struct kvmppc_vpa *vpap;

	tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
	if (!tvcpu)
		return H_PARAMETER;

	subfunc = (flags >> H_VPA_FUNC_SHIFT) & H_VPA_FUNC_MASK;
	if (subfunc == H_VPA_REG_VPA || subfunc == H_VPA_REG_DTL ||
	    subfunc == H_VPA_REG_SLB) {
		/* Registering new area - address must be cache-line aligned */
		if ((vpa & (L1_CACHE_BYTES - 1)) || !vpa)
			return H_PARAMETER;

		/* convert logical addr to kernel addr and read length */
		va = kvmppc_pin_guest_page(kvm, vpa, &nb);
		if (va == NULL)
			return H_PARAMETER;
		if (subfunc == H_VPA_REG_VPA)
			len = be16_to_cpu(((struct reg_vpa *)va)->length.hword);
		else
			len = be32_to_cpu(((struct reg_vpa *)va)->length.word);
		kvmppc_unpin_guest_page(kvm, va, vpa, false);

		/* Check length */
		if (len > nb || len < sizeof(struct reg_vpa))
			return H_PARAMETER;
	} else {
		vpa = 0;
		len = 0;
	}

	err = H_PARAMETER;
	vpap = NULL;
	spin_lock(&tvcpu->arch.vpa_update_lock);

	switch (subfunc) {
	case H_VPA_REG_VPA:		/* register VPA */
		if (len < sizeof(struct lppaca))
			break;
		vpap = &tvcpu->arch.vpa;
		err = 0;
		break;

	case H_VPA_REG_DTL:		/* register DTL */
		if (len < sizeof(struct dtl_entry))
			break;
		len -= len % sizeof(struct dtl_entry);

		/* Check that they have previously registered a VPA */
		err = H_RESOURCE;
		if (!vpa_is_registered(&tvcpu->arch.vpa))
			break;

		vpap = &tvcpu->arch.dtl;
		err = 0;
		break;

	case H_VPA_REG_SLB:		/* register SLB shadow buffer */
		/* Check that they have previously registered a VPA */
		err = H_RESOURCE;
		if (!vpa_is_registered(&tvcpu->arch.vpa))
			break;

		vpap = &tvcpu->arch.slb_shadow;
		err = 0;
		break;

	case H_VPA_DEREG_VPA:		/* deregister VPA */
		/* Check they don't still have a DTL or SLB buf registered */
		err = H_RESOURCE;
		if (vpa_is_registered(&tvcpu->arch.dtl) ||
		    vpa_is_registered(&tvcpu->arch.slb_shadow))
			break;

		vpap = &tvcpu->arch.vpa;
		err = 0;
		break;

	case H_VPA_DEREG_DTL:		/* deregister DTL */
		vpap = &tvcpu->arch.dtl;
		err = 0;
		break;

	case H_VPA_DEREG_SLB:		/* deregister SLB shadow buffer */
		vpap = &tvcpu->arch.slb_shadow;
		err = 0;
		break;
	}

	if (vpap) {
		vpap->next_gpa = vpa;
		vpap->len = len;
		vpap->update_pending = 1;
	}

	spin_unlock(&tvcpu->arch.vpa_update_lock);

	return err;
}

static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap)
{
	struct kvm *kvm = vcpu->kvm;
	void *va;
	unsigned long nb;
	unsigned long gpa;

	/*
	 * We need to pin the page pointed to by vpap->next_gpa,
	 * but we can't call kvmppc_pin_guest_page under the lock
	 * as it does get_user_pages() and down_read().  So we
	 * have to drop the lock, pin the page, then get the lock
	 * again and check that a new area didn't get registered
	 * in the meantime.
	 */
	for (;;) {
		gpa = vpap->next_gpa;
		spin_unlock(&vcpu->arch.vpa_update_lock);
		va = NULL;
		nb = 0;
		if (gpa)
			va = kvmppc_pin_guest_page(kvm, gpa, &nb);
		spin_lock(&vcpu->arch.vpa_update_lock);
		if (gpa == vpap->next_gpa)
			break;
		/* sigh... unpin that one and try again */
		if (va)
			kvmppc_unpin_guest_page(kvm, va, gpa, false);
	}

	vpap->update_pending = 0;
	if (va && nb < vpap->len) {
		/*
		 * If it's now too short, it must be that userspace
		 * has changed the mappings underlying guest memory,
		 * so unregister the region.
		 */
		kvmppc_unpin_guest_page(kvm, va, gpa, false);
		va = NULL;
	}
	if (vpap->pinned_addr)
		kvmppc_unpin_guest_page(kvm, vpap->pinned_addr, vpap->gpa,
					vpap->dirty);
	vpap->gpa = gpa;
	vpap->pinned_addr = va;
	vpap->dirty = false;
	if (va)
		vpap->pinned_end = va + vpap->len;
}

static void kvmppc_update_vpas(struct kvm_vcpu *vcpu)
{
	if (!(vcpu->arch.vpa.update_pending ||
	      vcpu->arch.slb_shadow.update_pending ||
	      vcpu->arch.dtl.update_pending))
		return;

	spin_lock(&vcpu->arch.vpa_update_lock);
	if (vcpu->arch.vpa.update_pending) {
		kvmppc_update_vpa(vcpu, &vcpu->arch.vpa);
		if (vcpu->arch.vpa.pinned_addr)
			init_vpa(vcpu, vcpu->arch.vpa.pinned_addr);
	}
	if (vcpu->arch.dtl.update_pending) {
		kvmppc_update_vpa(vcpu, &vcpu->arch.dtl);
		vcpu->arch.dtl_ptr = vcpu->arch.dtl.pinned_addr;
		vcpu->arch.dtl_index = 0;
	}
	if (vcpu->arch.slb_shadow.update_pending)
		kvmppc_update_vpa(vcpu, &vcpu->arch.slb_shadow);
	spin_unlock(&vcpu->arch.vpa_update_lock);
}

/*
 * Return the accumulated stolen time for the vcore up until `now'.
 * The caller should hold the vcore lock.
 */
static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now)
{
	u64 p;
	unsigned long flags;

	spin_lock_irqsave(&vc->stoltb_lock, flags);
	p = vc->stolen_tb;
	if (vc->vcore_state != VCORE_INACTIVE &&
	    vc->preempt_tb != TB_NIL)
		p += now - vc->preempt_tb;
	spin_unlock_irqrestore(&vc->stoltb_lock, flags);
	return p;
}

static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
				    struct kvmppc_vcore *vc)
{
	struct dtl_entry *dt;
	struct lppaca *vpa;
	unsigned long stolen;
	unsigned long core_stolen;
	u64 now;

	dt = vcpu->arch.dtl_ptr;
	vpa = vcpu->arch.vpa.pinned_addr;
	now = mftb();
	core_stolen = vcore_stolen_time(vc, now);
	stolen = core_stolen - vcpu->arch.stolen_logged;
	vcpu->arch.stolen_logged = core_stolen;
	spin_lock_irq(&vcpu->arch.tbacct_lock);
	stolen += vcpu->arch.busy_stolen;
	vcpu->arch.busy_stolen = 0;
	spin_unlock_irq(&vcpu->arch.tbacct_lock);
	if (!dt || !vpa)
		return;
	memset(dt, 0, sizeof(struct dtl_entry));
	dt->dispatch_reason = 7;
	dt->processor_id = cpu_to_be16(vc->pcpu + vcpu->arch.ptid);
	dt->timebase = cpu_to_be64(now + vc->tb_offset);
	dt->enqueue_to_dispatch_time = cpu_to_be32(stolen);
	dt->srr0 = cpu_to_be64(kvmppc_get_pc(vcpu));
	dt->srr1 = cpu_to_be64(vcpu->arch.shregs.msr);
	++dt;
	if (dt == vcpu->arch.dtl.pinned_end)
		dt = vcpu->arch.dtl.pinned_addr;
	vcpu->arch.dtl_ptr = dt;
	/* order writing *dt vs. writing vpa->dtl_idx */
	smp_wmb();
	vpa->dtl_idx = cpu_to_be64(++vcpu->arch.dtl_index);
	vcpu->arch.dtl.dirty = true;
}

/* See if there is a doorbell interrupt pending for a vcpu */
static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu)
{
	int thr;
	struct kvmppc_vcore *vc;

	if (vcpu->arch.doorbell_request)
		return true;
	/*
	 * Ensure that the read of vcore->dpdes comes after the read
	 * of vcpu->doorbell_request.  This barrier matches the
	 * lwsync in book3s_hv_rmhandlers.S just before the
	 * fast_guest_return label.
	 */
	smp_rmb();
	vc = vcpu->arch.vcore;
	thr = vcpu->vcpu_id - vc->first_vcpuid;
	return !!(vc->dpdes & (1 << thr));
}

static bool kvmppc_power8_compatible(struct kvm_vcpu *vcpu)
{
	if (vcpu->arch.vcore->arch_compat >= PVR_ARCH_207)
		return true;
	if ((!vcpu->arch.vcore->arch_compat) &&
	    cpu_has_feature(CPU_FTR_ARCH_207S))
		return true;
	return false;
}

static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags,
			     unsigned long resource, unsigned long value1,
			     unsigned long value2)
{
	switch (resource) {
	case H_SET_MODE_RESOURCE_SET_CIABR:
		if (!kvmppc_power8_compatible(vcpu))
			return H_P2;
		if (value2)
			return H_P4;
		if (mflags)
			return H_UNSUPPORTED_FLAG_START;
		/* Guests can't breakpoint the hypervisor */
		if ((value1 & CIABR_PRIV) == CIABR_PRIV_HYPER)
			return H_P3;
		vcpu->arch.ciabr = value1;
		return H_SUCCESS;
	case H_SET_MODE_RESOURCE_SET_DAWR:
		if (!kvmppc_power8_compatible(vcpu))
			return H_P2;
		if (mflags)
			return H_UNSUPPORTED_FLAG_START;
		if (value2 & DABRX_HYP)
			return H_P4;
		vcpu->arch.dawr = value1;
		vcpu->arch.dawrx = value2;
		return H_SUCCESS;
	default:
		return H_TOO_HARD;
	}
}

static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target)
{
	struct kvmppc_vcore *vcore = target->arch.vcore;

	/*
	 * We expect to have been called by the real mode handler
	 * (kvmppc_rm_h_confer()) which would have directly returned
	 * H_SUCCESS if the source vcore wasn't idle (e.g. if it may
	 * have useful work to do and should not confer) so we don't
	 * recheck that here.
	 */

	spin_lock(&vcore->lock);
	if (target->arch.state == KVMPPC_VCPU_RUNNABLE &&
	    vcore->vcore_state != VCORE_INACTIVE &&
	    vcore->runner)
		target = vcore->runner;
	spin_unlock(&vcore->lock);

	return kvm_vcpu_yield_to(target);
}

static int kvmppc_get_yield_count(struct kvm_vcpu *vcpu)
{
	int yield_count = 0;
	struct lppaca *lppaca;

	spin_lock(&vcpu->arch.vpa_update_lock);
	lppaca = (struct lppaca *)vcpu->arch.vpa.pinned_addr;
	if (lppaca)
		yield_count = be32_to_cpu(lppaca->yield_count);
	spin_unlock(&vcpu->arch.vpa_update_lock);
	return yield_count;
}

int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
{
	unsigned long req = kvmppc_get_gpr(vcpu, 3);
	unsigned long target, ret = H_SUCCESS;
	int yield_count;
	struct kvm_vcpu *tvcpu;
	int idx, rc;

	if (req <= MAX_HCALL_OPCODE &&
	    !test_bit(req/4, vcpu->kvm->arch.enabled_hcalls))
		return RESUME_HOST;

	switch (req) {
	case H_CEDE:
		break;
	case H_PROD:
		target = kvmppc_get_gpr(vcpu, 4);
		tvcpu = kvmppc_find_vcpu(vcpu->kvm, target);
		if (!tvcpu) {
			ret = H_PARAMETER;
			break;
		}
		tvcpu->arch.prodded = 1;
		smp_mb();
		if (tvcpu->arch.ceded)
			kvmppc_fast_vcpu_kick_hv(tvcpu);
		break;
	case H_CONFER:
		target = kvmppc_get_gpr(vcpu, 4);
		if (target == -1)
			break;
		tvcpu = kvmppc_find_vcpu(vcpu->kvm, target);
		if (!tvcpu) {
			ret = H_PARAMETER;
			break;
		}
		yield_count = kvmppc_get_gpr(vcpu, 5);
		if (kvmppc_get_yield_count(tvcpu) != yield_count)
			break;
		kvm_arch_vcpu_yield_to(tvcpu);
		break;
	case H_REGISTER_VPA:
		ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4),
					kvmppc_get_gpr(vcpu, 5),
					kvmppc_get_gpr(vcpu, 6));
		break;
	case H_RTAS:
		if (list_empty(&vcpu->kvm->arch.rtas_tokens))
			return RESUME_HOST;

		idx = srcu_read_lock(&vcpu->kvm->srcu);
		rc = kvmppc_rtas_hcall(vcpu);
		srcu_read_unlock(&vcpu->kvm->srcu, idx);

		if (rc == -ENOENT)
			return RESUME_HOST;
		else if (rc == 0)
			break;

		/* Send the error out to userspace via KVM_RUN */
		return rc;
	case H_LOGICAL_CI_LOAD:
		ret = kvmppc_h_logical_ci_load(vcpu);
		if (ret == H_TOO_HARD)
			return RESUME_HOST;
		break;
	case H_LOGICAL_CI_STORE:
		ret = kvmppc_h_logical_ci_store(vcpu);
		if (ret == H_TOO_HARD)
			return RESUME_HOST;
		break;
	case H_SET_MODE:
		ret = kvmppc_h_set_mode(vcpu, kvmppc_get_gpr(vcpu, 4),
					kvmppc_get_gpr(vcpu, 5),
					kvmppc_get_gpr(vcpu, 6),
					kvmppc_get_gpr(vcpu, 7));
		if (ret == H_TOO_HARD)
			return RESUME_HOST;
		break;
	case H_XIRR:
	case H_CPPR:
	case H_EOI:
	case H_IPI:
	case H_IPOLL:
	case H_XIRR_X:
		if (kvmppc_xics_enabled(vcpu)) {
			if (xive_enabled()) {
				ret = H_NOT_AVAILABLE;
				return RESUME_GUEST;
			}
			ret = kvmppc_xics_hcall(vcpu, req);
			break;
		}
		return RESUME_HOST;
	case H_PUT_TCE:
		ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
					kvmppc_get_gpr(vcpu, 5),
					kvmppc_get_gpr(vcpu, 6));
		if (ret == H_TOO_HARD)
			return RESUME_HOST;
		break;
	case H_PUT_TCE_INDIRECT:
		ret = kvmppc_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4),
					kvmppc_get_gpr(vcpu, 5),
					kvmppc_get_gpr(vcpu, 6),
					kvmppc_get_gpr(vcpu, 7));
		if (ret == H_TOO_HARD)
			return RESUME_HOST;
		break;
	case H_STUFF_TCE:
		ret = kvmppc_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
					kvmppc_get_gpr(vcpu, 5),
					kvmppc_get_gpr(vcpu, 6),
					kvmppc_get_gpr(vcpu, 7));
		if (ret == H_TOO_HARD)
			return RESUME_HOST;
		break;
	default:
		return RESUME_HOST;
	}
	kvmppc_set_gpr(vcpu, 3, ret);
	vcpu->arch.hcall_needed = 0;
	return RESUME_GUEST;
}

static int kvmppc_hcall_impl_hv(unsigned long cmd)
{
	switch (cmd) {
	case H_CEDE:
	case H_PROD:
	case H_CONFER:
	case H_REGISTER_VPA:
	case H_SET_MODE:
	case H_LOGICAL_CI_LOAD:
	case H_LOGICAL_CI_STORE:
#ifdef CONFIG_KVM_XICS
	case H_XIRR:
	case H_CPPR:
	case H_EOI:
	case H_IPI:
	case H_IPOLL:
	case H_XIRR_X:
#endif
		return 1;
	}

	/* See if it's in the real-mode table */
	return kvmppc_hcall_impl_hv_realmode(cmd);
}

static int kvmppc_emulate_debug_inst(struct kvm_run *run,
				     struct kvm_vcpu *vcpu)
{
	u32 last_inst;

	if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) !=
	    EMULATE_DONE) {
		/*
		 * Fetch failed, so return to guest and
		 * try executing it again.
		 */
		return RESUME_GUEST;
	}

	if (last_inst == KVMPPC_INST_SW_BREAKPOINT) {
		run->exit_reason = KVM_EXIT_DEBUG;
		run->debug.arch.address = kvmppc_get_pc(vcpu);
		return RESUME_HOST;
	} else {
		kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
		return RESUME_GUEST;
	}
}

static void do_nothing(void *x)
{
}

static unsigned long kvmppc_read_dpdes(struct kvm_vcpu *vcpu)
{
	int thr, cpu, pcpu, nthreads;
	struct kvm_vcpu *v;
	unsigned long dpdes;

	nthreads = vcpu->kvm->arch.emul_smt_mode;
	dpdes = 0;
	cpu = vcpu->vcpu_id & ~(nthreads - 1);
	for (thr = 0; thr < nthreads; ++thr, ++cpu) {
		v = kvmppc_find_vcpu(vcpu->kvm, cpu);
		if (!v)
			continue;
		/*
		 * If the vcpu is currently running on a physical cpu thread,
		 * interrupt it in order to pull it out of the guest briefly,
		 * which will update its vcore->dpdes value.
		 */
		pcpu = READ_ONCE(v->cpu);
		if (pcpu >= 0)
			smp_call_function_single(pcpu, do_nothing, NULL, 1);
		if (kvmppc_doorbell_pending(v))
			dpdes |= 1 << thr;
	}
	return dpdes;
}

/*
 * On POWER9, emulate doorbell-related instructions in order to
 * give the guest the illusion of running on a multi-threaded core.
 * The instructions emulated are msgsndp, msgclrp, mfspr TIR,
 * and mfspr DPDES.
 */
static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu)
{
	u32 inst, rb, thr;
	unsigned long arg;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_vcpu *tvcpu;

	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		return EMULATE_FAIL;
	if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &inst) != EMULATE_DONE)
		return RESUME_GUEST;
	if (get_op(inst) != 31)
		return EMULATE_FAIL;
	rb = get_rb(inst);
	thr = vcpu->vcpu_id & (kvm->arch.emul_smt_mode - 1);
	switch (get_xop(inst)) {
	case OP_31_XOP_MSGSNDP:
		arg = kvmppc_get_gpr(vcpu, rb);
		if (((arg >> 27) & 0xf) != PPC_DBELL_SERVER)
			break;
		arg &= 0x3f;
		if (arg >= kvm->arch.emul_smt_mode)
			break;
		tvcpu = kvmppc_find_vcpu(kvm, vcpu->vcpu_id - thr + arg);
		if (!tvcpu)
			break;
		if (!tvcpu->arch.doorbell_request) {
			tvcpu->arch.doorbell_request = 1;
			kvmppc_fast_vcpu_kick_hv(tvcpu);
		}
		break;
	case OP_31_XOP_MSGCLRP:
		arg = kvmppc_get_gpr(vcpu, rb);
		if (((arg >> 27) & 0xf) != PPC_DBELL_SERVER)
			break;
		vcpu->arch.vcore->dpdes = 0;
		vcpu->arch.doorbell_request = 0;
		break;
	case OP_31_XOP_MFSPR:
		switch (get_sprn(inst)) {
		case SPRN_TIR:
			arg = thr;
			break;
		case SPRN_DPDES:
			arg = kvmppc_read_dpdes(vcpu);
			break;
		default:
			return EMULATE_FAIL;
		}
		kvmppc_set_gpr(vcpu, get_rt(inst), arg);
		break;
	default:
		return EMULATE_FAIL;
	}
	kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4);
	return RESUME_GUEST;
}

static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
				 struct task_struct *tsk)
{
	int r = RESUME_HOST;

	vcpu->stat.sum_exits++;

	/*
	 * This can happen if an interrupt occurs in the last stages
	 * of guest entry or the first stages of guest exit (i.e. after
	 * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
	 * and before setting it to KVM_GUEST_MODE_HOST_HV).
	 * That can happen due to a bug, or due to a machine check
	 * occurring at just the wrong time.
	 */
	if (vcpu->arch.shregs.msr & MSR_HV) {
		printk(KERN_EMERG "KVM trap in HV mode!\n");
		printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
			vcpu->arch.trap, kvmppc_get_pc(vcpu),
			vcpu->arch.shregs.msr);
		kvmppc_dump_regs(vcpu);
		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
		run->hw.hardware_exit_reason = vcpu->arch.trap;
		return RESUME_HOST;
	}
	run->exit_reason = KVM_EXIT_UNKNOWN;
	run->ready_for_interrupt_injection = 1;
	switch (vcpu->arch.trap) {
	/* We're good on these - the host merely wanted to get our attention */
	case BOOK3S_INTERRUPT_HV_DECREMENTER:
		vcpu->stat.dec_exits++;
		r = RESUME_GUEST;
		break;
	case BOOK3S_INTERRUPT_EXTERNAL:
	case BOOK3S_INTERRUPT_H_DOORBELL:
	case BOOK3S_INTERRUPT_H_VIRT:
		vcpu->stat.ext_intr_exits++;
		r = RESUME_GUEST;
		break;
	/* HMI is hypervisor interrupt and host has handled it. Resume guest.*/
	case BOOK3S_INTERRUPT_HMI:
	case BOOK3S_INTERRUPT_PERFMON:
		r = RESUME_GUEST;
		break;
	case BOOK3S_INTERRUPT_MACHINE_CHECK:
		/* Exit to guest with KVM_EXIT_NMI as exit reason */
		run->exit_reason = KVM_EXIT_NMI;
		run->hw.hardware_exit_reason = vcpu->arch.trap;
		/* Clear out the old NMI status from run->flags */
		run->flags &= ~KVM_RUN_PPC_NMI_DISP_MASK;
		/* Now set the NMI status */
		if (vcpu->arch.mce_evt.disposition == MCE_DISPOSITION_RECOVERED)
			run->flags |= KVM_RUN_PPC_NMI_DISP_FULLY_RECOV;
		else
			run->flags |= KVM_RUN_PPC_NMI_DISP_NOT_RECOV;

		r = RESUME_HOST;
		/* Print the MCE event to host console. */
		machine_check_print_event_info(&vcpu->arch.mce_evt, false);
		break;
	case BOOK3S_INTERRUPT_PROGRAM:
	{
		ulong flags;
		/*
		 * Normally program interrupts are delivered directly
		 * to the guest by the hardware, but we can get here
		 * as a result of a hypervisor emulation interrupt
		 * (e40) getting turned into a 700 by BML RTAS.
		 */
		flags = vcpu->arch.shregs.msr & 0x1f0000ull;
		kvmppc_core_queue_program(vcpu, flags);
		r = RESUME_GUEST;
		break;
	}
	case BOOK3S_INTERRUPT_SYSCALL:
	{
		/* hcall - punt to userspace */
		int i;

		/* hypercall with MSR_PR has already been handled in rmode,
		 * and never reaches here.
		 */

		run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
		for (i = 0; i < 9; ++i)
			run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
		run->exit_reason = KVM_EXIT_PAPR_HCALL;
		vcpu->arch.hcall_needed = 1;
		r = RESUME_HOST;
		break;
	}
	/*
	 * We get these next two if the guest accesses a page which it thinks
	 * it has mapped but which is not actually present, either because
	 * it is for an emulated I/O device or because the corresonding
	 * host page has been paged out.  Any other HDSI/HISI interrupts
	 * have been handled already.
	 */
	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
		r = RESUME_PAGE_FAULT;
		break;
	case BOOK3S_INTERRUPT_H_INST_STORAGE:
		vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
		vcpu->arch.fault_dsisr = 0;
		r = RESUME_PAGE_FAULT;
		break;
	/*
	 * This occurs if the guest executes an illegal instruction.
	 * If the guest debug is disabled, generate a program interrupt
	 * to the guest. If guest debug is enabled, we need to check
	 * whether the instruction is a software breakpoint instruction.
	 * Accordingly return to Guest or Host.
	 */
	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
		if (vcpu->arch.emul_inst != KVM_INST_FETCH_FAILED)
			vcpu->arch.last_inst = kvmppc_need_byteswap(vcpu) ?
				swab32(vcpu->arch.emul_inst) :
				vcpu->arch.emul_inst;
		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) {
			r = kvmppc_emulate_debug_inst(run, vcpu);
		} else {
			kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
			r = RESUME_GUEST;
		}
		break;
	/*
	 * This occurs if the guest (kernel or userspace), does something that
	 * is prohibited by HFSCR.
	 * On POWER9, this could be a doorbell instruction that we need
	 * to emulate.
	 * Otherwise, we just generate a program interrupt to the guest.
	 */
	case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
		r = EMULATE_FAIL;
		if ((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG)
			r = kvmppc_emulate_doorbell_instr(vcpu);
		if (r == EMULATE_FAIL) {
			kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
			r = RESUME_GUEST;
		}
		break;
	case BOOK3S_INTERRUPT_HV_RM_HARD:
		r = RESUME_PASSTHROUGH;
		break;
	default:
		kvmppc_dump_regs(vcpu);
		printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
			vcpu->arch.trap, kvmppc_get_pc(vcpu),
			vcpu->arch.shregs.msr);
		run->hw.hardware_exit_reason = vcpu->arch.trap;
		r = RESUME_HOST;
		break;
	}

	return r;
}

static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu,
					    struct kvm_sregs *sregs)
{
	int i;

	memset(sregs, 0, sizeof(struct kvm_sregs));
	sregs->pvr = vcpu->arch.pvr;
	for (i = 0; i < vcpu->arch.slb_max; i++) {
		sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
		sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
	}

	return 0;
}

static int kvm_arch_vcpu_ioctl_set_sregs_hv(struct kvm_vcpu *vcpu,
					    struct kvm_sregs *sregs)
{
	int i, j;

	/* Only accept the same PVR as the host's, since we can't spoof it */
	if (sregs->pvr != vcpu->arch.pvr)
		return -EINVAL;

	j = 0;
	for (i = 0; i < vcpu->arch.slb_nr; i++) {
		if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) {
			vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe;
			vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv;
			++j;
		}
	}
	vcpu->arch.slb_max = j;

	return 0;
}

static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr,
		bool preserve_top32)
{
	struct kvm *kvm = vcpu->kvm;
	struct kvmppc_vcore *vc = vcpu->arch.vcore;
	u64 mask;

	mutex_lock(&kvm->lock);
	spin_lock(&vc->lock);
	/*
	 * If ILE (interrupt little-endian) has changed, update the
	 * MSR_LE bit in the intr_msr for each vcpu in this vcore.
	 */
	if ((new_lpcr & LPCR_ILE) != (vc->lpcr & LPCR_ILE)) {
		struct kvm_vcpu *vcpu;
		int i;

		kvm_for_each_vcpu(i, vcpu, kvm) {
			if (vcpu->arch.vcore != vc)
				continue;
			if (new_lpcr & LPCR_ILE)
				vcpu->arch.intr_msr |= MSR_LE;
			else
				vcpu->arch.intr_msr &= ~MSR_LE;
		}
	}

	/*
	 * Userspace can only modify DPFD (default prefetch depth),
	 * ILE (interrupt little-endian) and TC (translation control).
	 * On POWER8 and POWER9 userspace can also modify AIL (alt. interrupt loc.).
	 */
	mask = LPCR_DPFD | LPCR_ILE | LPCR_TC;
	if (cpu_has_feature(CPU_FTR_ARCH_207S))
		mask |= LPCR_AIL;
	/*
	 * On POWER9, allow userspace to enable large decrementer for the
	 * guest, whether or not the host has it enabled.
	 */
	if (cpu_has_feature(CPU_FTR_ARCH_300))
		mask |= LPCR_LD;

	/* Broken 32-bit version of LPCR must not clear top bits */
	if (preserve_top32)
		mask &= 0xFFFFFFFF;
	vc->lpcr = (vc->lpcr & ~mask) | (new_lpcr & mask);
	spin_unlock(&vc->lock);
	mutex_unlock(&kvm->lock);
}

static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
				 union kvmppc_one_reg *val)
{
	int r = 0;
	long int i;

	switch (id) {
	case KVM_REG_PPC_DEBUG_INST:
		*val = get_reg_val(id, KVMPPC_INST_SW_BREAKPOINT);
		break;
	case KVM_REG_PPC_HIOR:
		*val = get_reg_val(id, 0);
		break;
	case KVM_REG_PPC_DABR:
		*val = get_reg_val(id, vcpu->arch.dabr);
		break;
	case KVM_REG_PPC_DABRX:
		*val = get_reg_val(id, vcpu->arch.dabrx);
		break;
	case KVM_REG_PPC_DSCR:
		*val = get_reg_val(id, vcpu->arch.dscr);
		break;
	case KVM_REG_PPC_PURR:
		*val = get_reg_val(id, vcpu->arch.purr);
		break;
	case KVM_REG_PPC_SPURR:
		*val = get_reg_val(id, vcpu->arch.spurr);
		break;
	case KVM_REG_PPC_AMR:
		*val = get_reg_val(id, vcpu->arch.amr);
		break;
	case KVM_REG_PPC_UAMOR:
		*val = get_reg_val(id, vcpu->arch.uamor);
		break;
	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRS:
		i = id - KVM_REG_PPC_MMCR0;
		*val = get_reg_val(id, vcpu->arch.mmcr[i]);
		break;
	case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
		i = id - KVM_REG_PPC_PMC1;
		*val = get_reg_val(id, vcpu->arch.pmc[i]);
		break;
	case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2:
		i = id - KVM_REG_PPC_SPMC1;
		*val = get_reg_val(id, vcpu->arch.spmc[i]);
		break;
	case KVM_REG_PPC_SIAR:
		*val = get_reg_val(id, vcpu->arch.siar);
		break;
	case KVM_REG_PPC_SDAR:
		*val = get_reg_val(id, vcpu->arch.sdar);
		break;
	case KVM_REG_PPC_SIER:
		*val = get_reg_val(id, vcpu->arch.sier);
		break;
	case KVM_REG_PPC_IAMR:
		*val = get_reg_val(id, vcpu->arch.iamr);
		break;
	case KVM_REG_PPC_PSPB:
		*val = get_reg_val(id, vcpu->arch.pspb);
		break;
	case KVM_REG_PPC_DPDES:
		*val = get_reg_val(id, vcpu->arch.vcore->dpdes);
		break;
	case KVM_REG_PPC_VTB:
		*val = get_reg_val(id, vcpu->arch.vcore->vtb);
		break;
	case KVM_REG_PPC_DAWR:
		*val = get_reg_val(id, vcpu->arch.dawr);
		break;
	case KVM_REG_PPC_DAWRX:
		*val = get_reg_val(id, vcpu->arch.dawrx);
		break;
	case KVM_REG_PPC_CIABR:
		*val = get_reg_val(id, vcpu->arch.ciabr);
		break;
	case KVM_REG_PPC_CSIGR:
		*val = get_reg_val(id, vcpu->arch.csigr);
		break;
	case KVM_REG_PPC_TACR:
		*val = get_reg_val(id, vcpu->arch.tacr);
		break;
	case KVM_REG_PPC_TCSCR:
		*val = get_reg_val(id, vcpu->arch.tcscr);
		break;
	case KVM_REG_PPC_PID:
		*val = get_reg_val(id, vcpu->arch.pid);
		break;
	case KVM_REG_PPC_ACOP:
		*val = get_reg_val(id, vcpu->arch.acop);
		break;
	case KVM_REG_PPC_WORT:
		*val = get_reg_val(id, vcpu->arch.wort);
		break;
	case KVM_REG_PPC_TIDR:
		*val = get_reg_val(id, vcpu->arch.tid);
		break;
	case KVM_REG_PPC_PSSCR:
		*val = get_reg_val(id, vcpu->arch.psscr);
		break;
	case KVM_REG_PPC_VPA_ADDR:
		spin_lock(&vcpu->arch.vpa_update_lock);
		*val = get_reg_val(id, vcpu->arch.vpa.next_gpa);
		spin_unlock(&vcpu->arch.vpa_update_lock);
		break;
	case KVM_REG_PPC_VPA_SLB:
		spin_lock(&vcpu->arch.vpa_update_lock);
		val->vpaval.addr = vcpu->arch.slb_shadow.next_gpa;
		val->vpaval.length = vcpu->arch.slb_shadow.len;
		spin_unlock(&vcpu->arch.vpa_update_lock);
		break;
	case KVM_REG_PPC_VPA_DTL:
		spin_lock(&vcpu->arch.vpa_update_lock);
		val->vpaval.addr = vcpu->arch.dtl.next_gpa;
		val->vpaval.length = vcpu->arch.dtl.len;
		spin_unlock(&vcpu->arch.vpa_update_lock);
		break;
	case KVM_REG_PPC_TB_OFFSET:
		*val = get_reg_val(id, vcpu->arch.vcore->tb_offset);
		break;
	case KVM_REG_PPC_LPCR:
	case KVM_REG_PPC_LPCR_64:
		*val = get_reg_val(id, vcpu->arch.vcore->lpcr);
		break;
	case KVM_REG_PPC_PPR:
		*val = get_reg_val(id, vcpu->arch.ppr);
		break;
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
	case KVM_REG_PPC_TFHAR:
		*val = get_reg_val(id, vcpu->arch.tfhar);
		break;
	case KVM_REG_PPC_TFIAR:
		*val = get_reg_val(id, vcpu->arch.tfiar);
		break;
	case KVM_REG_PPC_TEXASR:
		*val = get_reg_val(id, vcpu->arch.texasr);
		break;
	case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
		i = id - KVM_REG_PPC_TM_GPR0;
		*val = get_reg_val(id, vcpu->arch.gpr_tm[i]);
		break;
	case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
	{
		int j;
		i = id - KVM_REG_PPC_TM_VSR0;
		if (i < 32)
			for (j = 0; j < TS_FPRWIDTH; j++)
				val->vsxval[j] = vcpu->arch.fp_tm.fpr[i][j];
		else {
			if (cpu_has_feature(CPU_FTR_ALTIVEC))
				val->vval = vcpu->arch.vr_tm.vr[i-32];
			else
				r = -ENXIO;
		}
		break;
	}
	case KVM_REG_PPC_TM_CR:
		*val = get_reg_val(id, vcpu->arch.cr_tm);
		break;
	case KVM_REG_PPC_TM_XER:
		*val = get_reg_val(id, vcpu->arch.xer_tm);
		break;
	case KVM_REG_PPC_TM_LR:
		*val = get_reg_val(id, vcpu->arch.lr_tm);
		break;
	case KVM_REG_PPC_TM_CTR:
		*val = get_reg_val(id, vcpu->arch.ctr_tm);
		break;
	case KVM_REG_PPC_TM_FPSCR:
		*val = get_reg_val(id, vcpu->arch.fp_tm.fpscr);
		break;
	case KVM_REG_PPC_TM_AMR:
		*val = get_reg_val(id, vcpu->arch.amr_tm);
		break;
	case KVM_REG_PPC_TM_PPR:
		*val = get_reg_val(id, vcpu->arch.ppr_tm);
		break;
	case KVM_REG_PPC_TM_VRSAVE:
		*val = get_reg_val(id, vcpu->arch.vrsave_tm);
		break;
	case KVM_REG_PPC_TM_VSCR:
		if (cpu_has_feature(CPU_FTR_ALTIVEC))
			*val = get_reg_val(id, vcpu->arch.vr_tm.vscr.u[3]);
		else
			r = -ENXIO;
		break;
	case KVM_REG_PPC_TM_DSCR:
		*val = get_reg_val(id, vcpu->arch.dscr_tm);
		break;
	case KVM_REG_PPC_TM_TAR:
		*val = get_reg_val(id, vcpu->arch.tar_tm);
		break;
#endif
	case KVM_REG_PPC_ARCH_COMPAT:
		*val = get_reg_val(id, vcpu->arch.vcore->arch_compat);
		break;
	default:
		r = -EINVAL;
		break;
	}

	return r;
}

static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
				 union kvmppc_one_reg *val)
{
	int r = 0;
	long int i;
	unsigned long addr, len;

	switch (id) {
	case KVM_REG_PPC_HIOR:
		/* Only allow this to be set to zero */
		if (set_reg_val(id, *val))
			r = -EINVAL;
		break;
	case KVM_REG_PPC_DABR:
		vcpu->arch.dabr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_DABRX:
		vcpu->arch.dabrx = set_reg_val(id, *val) & ~DABRX_HYP;
		break;
	case KVM_REG_PPC_DSCR:
		vcpu->arch.dscr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_PURR:
		vcpu->arch.purr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_SPURR:
		vcpu->arch.spurr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_AMR:
		vcpu->arch.amr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_UAMOR:
		vcpu->arch.uamor = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRS:
		i = id - KVM_REG_PPC_MMCR0;
		vcpu->arch.mmcr[i] = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
		i = id - KVM_REG_PPC_PMC1;
		vcpu->arch.pmc[i] = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2:
		i = id - KVM_REG_PPC_SPMC1;
		vcpu->arch.spmc[i] = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_SIAR:
		vcpu->arch.siar = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_SDAR:
		vcpu->arch.sdar = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_SIER:
		vcpu->arch.sier = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_IAMR:
		vcpu->arch.iamr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_PSPB:
		vcpu->arch.pspb = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_DPDES:
		vcpu->arch.vcore->dpdes = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_VTB:
		vcpu->arch.vcore->vtb = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_DAWR:
		vcpu->arch.dawr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_DAWRX:
		vcpu->arch.dawrx = set_reg_val(id, *val) & ~DAWRX_HYP;
		break;
	case KVM_REG_PPC_CIABR:
		vcpu->arch.ciabr = set_reg_val(id, *val);
		/* Don't allow setting breakpoints in hypervisor code */
		if ((vcpu->arch.ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER)
			vcpu->arch.ciabr &= ~CIABR_PRIV;	/* disable */
		break;
	case KVM_REG_PPC_CSIGR:
		vcpu->arch.csigr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TACR:
		vcpu->arch.tacr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TCSCR:
		vcpu->arch.tcscr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_PID:
		vcpu->arch.pid = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_ACOP:
		vcpu->arch.acop = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_WORT:
		vcpu->arch.wort = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TIDR:
		vcpu->arch.tid = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_PSSCR:
		vcpu->arch.psscr = set_reg_val(id, *val) & PSSCR_GUEST_VIS;
		break;
	case KVM_REG_PPC_VPA_ADDR:
		addr = set_reg_val(id, *val);
		r = -EINVAL;
		if (!addr && (vcpu->arch.slb_shadow.next_gpa ||
			      vcpu->arch.dtl.next_gpa))
			break;
		r = set_vpa(vcpu, &vcpu->arch.vpa, addr, sizeof(struct lppaca));
		break;
	case KVM_REG_PPC_VPA_SLB:
		addr = val->vpaval.addr;
		len = val->vpaval.length;
		r = -EINVAL;
		if (addr && !vcpu->arch.vpa.next_gpa)
			break;
		r = set_vpa(vcpu, &vcpu->arch.slb_shadow, addr, len);
		break;
	case KVM_REG_PPC_VPA_DTL:
		addr = val->vpaval.addr;
		len = val->vpaval.length;
		r = -EINVAL;
		if (addr && (len < sizeof(struct dtl_entry) ||
			     !vcpu->arch.vpa.next_gpa))
			break;
		len -= len % sizeof(struct dtl_entry);
		r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len);
		break;
	case KVM_REG_PPC_TB_OFFSET:
		/*
		 * POWER9 DD1 has an erratum where writing TBU40 causes
		 * the timebase to lose ticks.  So we don't let the
		 * timebase offset be changed on P9 DD1.  (It is
		 * initialized to zero.)
		 */
		if (cpu_has_feature(CPU_FTR_POWER9_DD1))
			break;
		/* round up to multiple of 2^24 */
		vcpu->arch.vcore->tb_offset =
			ALIGN(set_reg_val(id, *val), 1UL << 24);
		break;
	case KVM_REG_PPC_LPCR:
		kvmppc_set_lpcr(vcpu, set_reg_val(id, *val), true);
		break;
	case KVM_REG_PPC_LPCR_64:
		kvmppc_set_lpcr(vcpu, set_reg_val(id, *val), false);
		break;
	case KVM_REG_PPC_PPR:
		vcpu->arch.ppr = set_reg_val(id, *val);
		break;
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
	case KVM_REG_PPC_TFHAR:
		vcpu->arch.tfhar = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TFIAR:
		vcpu->arch.tfiar = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TEXASR:
		vcpu->arch.texasr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
		i = id - KVM_REG_PPC_TM_GPR0;
		vcpu->arch.gpr_tm[i] = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
	{
		int j;
		i = id - KVM_REG_PPC_TM_VSR0;
		if (i < 32)
			for (j = 0; j < TS_FPRWIDTH; j++)
				vcpu->arch.fp_tm.fpr[i][j] = val->vsxval[j];
		else
			if (cpu_has_feature(CPU_FTR_ALTIVEC))
				vcpu->arch.vr_tm.vr[i-32] = val->vval;
			else
				r = -ENXIO;
		break;
	}
	case KVM_REG_PPC_TM_CR:
		vcpu->arch.cr_tm = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_XER:
		vcpu->arch.xer_tm = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_LR:
		vcpu->arch.lr_tm = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_CTR:
		vcpu->arch.ctr_tm = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_FPSCR:
		vcpu->arch.fp_tm.fpscr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_AMR:
		vcpu->arch.amr_tm = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_PPR:
		vcpu->arch.ppr_tm = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_VRSAVE:
		vcpu->arch.vrsave_tm = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_VSCR:
		if (cpu_has_feature(CPU_FTR_ALTIVEC))
			vcpu->arch.vr.vscr.u[3] = set_reg_val(id, *val);
		else
			r = -ENXIO;
		break;
	case KVM_REG_PPC_TM_DSCR:
		vcpu->arch.dscr_tm = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_TAR:
		vcpu->arch.tar_tm = set_reg_val(id, *val);
		break;
#endif
	case KVM_REG_PPC_ARCH_COMPAT:
		r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val));
		break;
	default:
		r = -EINVAL;
		break;
	}

	return r;
}

45c940ba
PM
1721/*
1722 * On POWER9, threads are independent and can be in different partitions.
1723 * Therefore we consider each thread to be a subcore.
1724 * There is a restriction that all threads have to be in the same
1725 * MMU mode (radix or HPT), unfortunately, but since we only support
1726 * HPT guests on a HPT host so far, that isn't an impediment yet.
1727 */
1728static int threads_per_vcore(void)
1729{
1730 if (cpu_has_feature(CPU_FTR_ARCH_300))
1731 return 1;
1732 return threads_per_subcore;
1733}
1734
de9bdd1a
SS
1735static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
1736{
1737 struct kvmppc_vcore *vcore;
1738
1739 vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL);
1740
1741 if (vcore == NULL)
1742 return NULL;
1743
de9bdd1a 1744 spin_lock_init(&vcore->lock);
2711e248 1745 spin_lock_init(&vcore->stoltb_lock);
8577370f 1746 init_swait_queue_head(&vcore->wq);
de9bdd1a
SS
1747 vcore->preempt_tb = TB_NIL;
1748 vcore->lpcr = kvm->arch.lpcr;
3c313524 1749 vcore->first_vcpuid = core * kvm->arch.smt_mode;
de9bdd1a 1750 vcore->kvm = kvm;
ec257165 1751 INIT_LIST_HEAD(&vcore->preempt_list);
de9bdd1a
SS
1752
1753 return vcore;
1754}
1755
b6c295df
PM
1756#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
1757static struct debugfs_timings_element {
1758 const char *name;
1759 size_t offset;
1760} timings[] = {
1761 {"rm_entry", offsetof(struct kvm_vcpu, arch.rm_entry)},
1762 {"rm_intr", offsetof(struct kvm_vcpu, arch.rm_intr)},
1763 {"rm_exit", offsetof(struct kvm_vcpu, arch.rm_exit)},
1764 {"guest", offsetof(struct kvm_vcpu, arch.guest_time)},
1765 {"cede", offsetof(struct kvm_vcpu, arch.cede_time)},
1766};
1767
1768#define N_TIMINGS (sizeof(timings) / sizeof(timings[0]))
1769
1770struct debugfs_timings_state {
1771 struct kvm_vcpu *vcpu;
1772 unsigned int buflen;
1773 char buf[N_TIMINGS * 100];
1774};
1775
1776static int debugfs_timings_open(struct inode *inode, struct file *file)
1777{
1778 struct kvm_vcpu *vcpu = inode->i_private;
1779 struct debugfs_timings_state *p;
1780
1781 p = kzalloc(sizeof(*p), GFP_KERNEL);
1782 if (!p)
1783 return -ENOMEM;
1784
1785 kvm_get_kvm(vcpu->kvm);
1786 p->vcpu = vcpu;
1787 file->private_data = p;
1788
1789 return nonseekable_open(inode, file);
1790}
1791
1792static int debugfs_timings_release(struct inode *inode, struct file *file)
1793{
1794 struct debugfs_timings_state *p = file->private_data;
1795
1796 kvm_put_kvm(p->vcpu->kvm);
1797 kfree(p);
1798 return 0;
1799}
1800
1801static ssize_t debugfs_timings_read(struct file *file, char __user *buf,
1802 size_t len, loff_t *ppos)
1803{
1804 struct debugfs_timings_state *p = file->private_data;
1805 struct kvm_vcpu *vcpu = p->vcpu;
1806 char *s, *buf_end;
1807 struct kvmhv_tb_accumulator tb;
1808 u64 count;
1809 loff_t pos;
1810 ssize_t n;
1811 int i, loops;
1812 bool ok;
1813
1814 if (!p->buflen) {
1815 s = p->buf;
1816 buf_end = s + sizeof(p->buf);
1817 for (i = 0; i < N_TIMINGS; ++i) {
1818 struct kvmhv_tb_accumulator *acc;
1819
1820 acc = (struct kvmhv_tb_accumulator *)
1821 ((unsigned long)vcpu + timings[i].offset);
1822 ok = false;
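			/*
			 * An odd seqcount means an update is in progress, so only
			 * snapshot the accumulator when the count is even and
			 * unchanged across the copy, retrying with udelay(1).
			 */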
1823 for (loops = 0; loops < 1000; ++loops) {
1824 count = acc->seqcount;
1825 if (!(count & 1)) {
1826 smp_rmb();
1827 tb = *acc;
1828 smp_rmb();
1829 if (count == acc->seqcount) {
1830 ok = true;
1831 break;
1832 }
1833 }
1834 udelay(1);
1835 }
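			/* Line format: name, number of samples, then total/min/max times in ns */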
1836 if (!ok)
1837 snprintf(s, buf_end - s, "%s: stuck\n",
1838 timings[i].name);
1839 else
1840 snprintf(s, buf_end - s,
1841 "%s: %llu %llu %llu %llu\n",
1842 timings[i].name, count / 2,
1843 tb_to_ns(tb.tb_total),
1844 tb_to_ns(tb.tb_min),
1845 tb_to_ns(tb.tb_max));
1846 s += strlen(s);
1847 }
1848 p->buflen = s - p->buf;
1849 }
1850
1851 pos = *ppos;
1852 if (pos >= p->buflen)
1853 return 0;
1854 if (len > p->buflen - pos)
1855 len = p->buflen - pos;
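	/* copy_to_user() returns the number of bytes that could not be copied */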
1856 n = copy_to_user(buf, p->buf + pos, len);
1857 if (n) {
1858 if (n == len)
1859 return -EFAULT;
1860 len -= n;
1861 }
1862 *ppos = pos + len;
1863 return len;
1864}
1865
1866static ssize_t debugfs_timings_write(struct file *file, const char __user *buf,
1867 size_t len, loff_t *ppos)
1868{
1869 return -EACCES;
1870}
1871
1872static const struct file_operations debugfs_timings_ops = {
1873 .owner = THIS_MODULE,
1874 .open = debugfs_timings_open,
1875 .release = debugfs_timings_release,
1876 .read = debugfs_timings_read,
1877 .write = debugfs_timings_write,
1878 .llseek = generic_file_llseek,
1879};
1880
1881/* Create a debugfs directory for the vcpu */
1882static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id)
1883{
1884 char buf[16];
1885 struct kvm *kvm = vcpu->kvm;
1886
1887 snprintf(buf, sizeof(buf), "vcpu%u", id);
1888 if (IS_ERR_OR_NULL(kvm->arch.debugfs_dir))
1889 return;
1890 vcpu->arch.debugfs_dir = debugfs_create_dir(buf, kvm->arch.debugfs_dir);
1891 if (IS_ERR_OR_NULL(vcpu->arch.debugfs_dir))
1892 return;
1893 vcpu->arch.debugfs_timings =
1894 debugfs_create_file("timings", 0444, vcpu->arch.debugfs_dir,
1895 vcpu, &debugfs_timings_ops);
1896}
1897
1898#else /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
1899static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id)
1900{
1901}
1902#endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
1903
3a167bea
AK
1904static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
1905 unsigned int id)
de56a948
PM
1906{
1907 struct kvm_vcpu *vcpu;
3c313524 1908 int err;
371fefd6
PM
1909 int core;
1910 struct kvmppc_vcore *vcore;
de56a948 1911
371fefd6 1912 err = -ENOMEM;
6b75e6bf 1913 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
de56a948
PM
1914 if (!vcpu)
1915 goto out;
1916
1917 err = kvm_vcpu_init(vcpu, kvm, id);
1918 if (err)
1919 goto free_vcpu;
1920
1921 vcpu->arch.shared = &vcpu->arch.shregs;
5deb8e7a
AG
1922#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
1923 /*
1924 * The shared struct is never shared on HV,
1925 * so we can always use host endianness
1926 */
1927#ifdef __BIG_ENDIAN__
1928 vcpu->arch.shared_big_endian = true;
1929#else
1930 vcpu->arch.shared_big_endian = false;
1931#endif
1932#endif
de56a948
PM
1933 vcpu->arch.mmcr[0] = MMCR0_FC;
1934 vcpu->arch.ctrl = CTRL_RUNLATCH;
1935 /* default to host PVR, since we can't spoof it */
3a167bea 1936 kvmppc_set_pvr_hv(vcpu, mfspr(SPRN_PVR));
2e25aa5f 1937 spin_lock_init(&vcpu->arch.vpa_update_lock);
c7b67670
PM
1938 spin_lock_init(&vcpu->arch.tbacct_lock);
1939 vcpu->arch.busy_preempt = TB_NIL;
d682916a 1940 vcpu->arch.intr_msr = MSR_SF | MSR_ME;
de56a948 1941
769377f7
PM
1942 /*
1943 * Set the default HFSCR for the guest from the host value.
1944 * This value is only used on POWER9.
1945 * On POWER9 DD1, TM doesn't work, so we make sure to
1946 * prevent the guest from using it.
57900694
PM
1947 * On POWER9, we want to virtualize the doorbell facility, so we
1948 * clear the HFSCR_MSGP bit, which causes the doorbell instructions to trap.
769377f7
PM
1949 */
1950 vcpu->arch.hfscr = mfspr(SPRN_HFSCR);
1951 if (!cpu_has_feature(CPU_FTR_TM))
1952 vcpu->arch.hfscr &= ~HFSCR_TM;
57900694
PM
1953 if (cpu_has_feature(CPU_FTR_ARCH_300))
1954 vcpu->arch.hfscr &= ~HFSCR_MSGP;
769377f7 1955
de56a948
PM
1956 kvmppc_mmu_book3s_hv_init(vcpu);
1957
8455d79e 1958 vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
371fefd6
PM
1959
1960 init_waitqueue_head(&vcpu->arch.cpu_run);
1961
1962 mutex_lock(&kvm->lock);
3c313524
PM
1963 vcore = NULL;
1964 err = -EINVAL;
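	/* vcpu ids are grouped into virtual cores of smt_mode ids each */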
1965 core = id / kvm->arch.smt_mode;
1966 if (core < KVM_MAX_VCORES) {
1967 vcore = kvm->arch.vcores[core];
1968 if (!vcore) {
1969 err = -ENOMEM;
1970 vcore = kvmppc_vcore_create(kvm, core);
1971 kvm->arch.vcores[core] = vcore;
1972 kvm->arch.online_vcores++;
1973 }
371fefd6
PM
1974 }
1975 mutex_unlock(&kvm->lock);
1976
1977 if (!vcore)
1978 goto free_vcpu;
1979
1980 spin_lock(&vcore->lock);
1981 ++vcore->num_threads;
371fefd6
PM
1982 spin_unlock(&vcore->lock);
1983 vcpu->arch.vcore = vcore;
e0b7ec05 1984 vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid;
ec257165 1985 vcpu->arch.thread_cpu = -1;
a29ebeaf 1986 vcpu->arch.prev_cpu = -1;
371fefd6 1987
af8f38b3
AG
1988 vcpu->arch.cpu_type = KVM_CPU_3S_64;
1989 kvmppc_sanity_check(vcpu);
1990
b6c295df
PM
1991 debugfs_vcpu_init(vcpu, id);
1992
de56a948
PM
1993 return vcpu;
1994
1995free_vcpu:
6b75e6bf 1996 kmem_cache_free(kvm_vcpu_cache, vcpu);
de56a948
PM
1997out:
1998 return ERR_PTR(err);
1999}
2000
3c313524
PM
2001static int kvmhv_set_smt_mode(struct kvm *kvm, unsigned long smt_mode,
2002 unsigned long flags)
2003{
2004 int err;
57900694 2005 int esmt = 0;
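	/* SMT mode to emulate for the guest on POWER9, where each vcpu gets its own vcore */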
3c313524
PM
2006
2007 if (flags)
2008 return -EINVAL;
2009 if (smt_mode > MAX_SMT_THREADS || !is_power_of_2(smt_mode))
2010 return -EINVAL;
2011 if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
2012 /*
2013 * On POWER8 (or POWER7), the threading mode is "strict",
2014 * so we pack smt_mode vcpus per vcore.
2015 */
2016 if (smt_mode > threads_per_subcore)
2017 return -EINVAL;
2018 } else {
2019 /*
2020 * On POWER9, the threading mode is "loose",
2021 * so each vcpu gets its own vcore.
2022 */
57900694 2023 esmt = smt_mode;
3c313524
PM
2024 smt_mode = 1;
2025 }
2026 mutex_lock(&kvm->lock);
2027 err = -EBUSY;
2028 if (!kvm->arch.online_vcores) {
2029 kvm->arch.smt_mode = smt_mode;
57900694 2030 kvm->arch.emul_smt_mode = esmt;
3c313524
PM
2031 err = 0;
2032 }
2033 mutex_unlock(&kvm->lock);
2034
2035 return err;
2036}
2037
c35635ef
PM
2038static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa)
2039{
2040 if (vpa->pinned_addr)
2041 kvmppc_unpin_guest_page(kvm, vpa->pinned_addr, vpa->gpa,
2042 vpa->dirty);
2043}
2044
3a167bea 2045static void kvmppc_core_vcpu_free_hv(struct kvm_vcpu *vcpu)
de56a948 2046{
2e25aa5f 2047 spin_lock(&vcpu->arch.vpa_update_lock);
c35635ef
PM
2048 unpin_vpa(vcpu->kvm, &vcpu->arch.dtl);
2049 unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow);
2050 unpin_vpa(vcpu->kvm, &vcpu->arch.vpa);
2e25aa5f 2051 spin_unlock(&vcpu->arch.vpa_update_lock);
de56a948 2052 kvm_vcpu_uninit(vcpu);
6b75e6bf 2053 kmem_cache_free(kvm_vcpu_cache, vcpu);
de56a948
PM
2054}
2055
3a167bea
AK
2056static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu)
2057{
2058 /* Indicate we want to get back into the guest */
2059 return 1;
2060}
2061
19ccb76a 2062static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
371fefd6 2063{
19ccb76a 2064 unsigned long dec_nsec, now;
371fefd6 2065
19ccb76a
PM
2066 now = get_tb();
2067 if (now > vcpu->arch.dec_expires) {
2068 /* decrementer has already gone negative */
2069 kvmppc_core_queue_dec(vcpu);
7e28e60e 2070 kvmppc_core_prepare_to_enter(vcpu);
19ccb76a 2071 return;
371fefd6 2072 }
19ccb76a
PM
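	/* Convert the remaining timebase ticks to nanoseconds for the hrtimer */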
2073 dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC
2074 / tb_ticks_per_sec;
8b0e1953 2075 hrtimer_start(&vcpu->arch.dec_timer, dec_nsec, HRTIMER_MODE_REL);
19ccb76a 2076 vcpu->arch.timer_running = 1;
371fefd6
PM
2077}
2078
19ccb76a 2079static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
371fefd6 2080{
19ccb76a
PM
2081 vcpu->arch.ceded = 0;
2082 if (vcpu->arch.timer_running) {
2083 hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
2084 vcpu->arch.timer_running = 0;
2085 }
371fefd6
PM
2086}
2087
e0b7ec05 2088extern void __kvmppc_vcore_entry(void);
de56a948 2089
371fefd6
PM
2090static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
2091 struct kvm_vcpu *vcpu)
de56a948 2092{
c7b67670
PM
2093 u64 now;
2094
371fefd6
PM
2095 if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
2096 return;
bf3d32e1 2097 spin_lock_irq(&vcpu->arch.tbacct_lock);
c7b67670
PM
2098 now = mftb();
2099 vcpu->arch.busy_stolen += vcore_stolen_time(vc, now) -
2100 vcpu->arch.stolen_logged;
2101 vcpu->arch.busy_preempt = now;
2102 vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
bf3d32e1 2103 spin_unlock_irq(&vcpu->arch.tbacct_lock);
371fefd6 2104 --vc->n_runnable;
7b5f8272 2105 WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], NULL);
371fefd6
PM
2106}
2107
f0888f70
PM
2108static int kvmppc_grab_hwthread(int cpu)
2109{
2110 struct paca_struct *tpaca;
b754c739 2111 long timeout = 10000;
f0888f70
PM
2112
2113 tpaca = &paca[cpu];
2114
2115 /* Ensure the thread won't go into the kernel if it wakes */
7b444c67 2116 tpaca->kvm_hstate.kvm_vcpu = NULL;
b4deba5c 2117 tpaca->kvm_hstate.kvm_vcore = NULL;
5d5b99cd
PM
2118 tpaca->kvm_hstate.napping = 0;
2119 smp_wmb();
2120 tpaca->kvm_hstate.hwthread_req = 1;
f0888f70
PM
2121
2122 /*
2123 * If the thread is already executing in the kernel (e.g. handling
2124 * a stray interrupt), wait for it to get back to nap mode.
2125 * The smp_mb() is to ensure that our setting of hwthread_req
2126 * is visible before we look at hwthread_state, so if this
2127 * races with the code at system_reset_pSeries and the thread
2128 * misses our setting of hwthread_req, we are sure to see its
2129 * setting of hwthread_state, and vice versa.
2130 */
2131 smp_mb();
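	/* timeout counts iterations of udelay(1), so this waits up to ~10ms */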
2132 while (tpaca->kvm_hstate.hwthread_state == KVM_HWTHREAD_IN_KERNEL) {
2133 if (--timeout <= 0) {
2134 pr_err("KVM: couldn't grab cpu %d\n", cpu);
2135 return -EBUSY;
2136 }
2137 udelay(1);
2138 }
2139 return 0;
2140}
2141
2142static void kvmppc_release_hwthread(int cpu)
2143{
2144 struct paca_struct *tpaca;
2145
2146 tpaca = &paca[cpu];
2147 tpaca->kvm_hstate.hwthread_req = 0;
2148 tpaca->kvm_hstate.kvm_vcpu = NULL;
b4deba5c
PM
2149 tpaca->kvm_hstate.kvm_vcore = NULL;
2150 tpaca->kvm_hstate.kvm_split_mode = NULL;
f0888f70
PM
2151}
2152
a29ebeaf
PM
2153static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
2154{
2155 int i;
2156
2157 cpu = cpu_first_thread_sibling(cpu);
2158 cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
2159 /*
2160 * Make sure setting of bit in need_tlb_flush precedes
2161 * testing of cpu_in_guest bits. The matching barrier on
2162 * the other side is the first smp_mb() in kvmppc_run_core().
2163 */
2164 smp_mb();
2165 for (i = 0; i < threads_per_core; ++i)
2166 if (cpumask_test_cpu(cpu + i, &kvm->arch.cpu_in_guest))
2167 smp_call_function_single(cpu + i, do_nothing, NULL, 1);
2168}
2169
b4deba5c 2170static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
371fefd6
PM
2171{
2172 int cpu;
2173 struct paca_struct *tpaca;
ec257165 2174 struct kvmppc_vcore *mvc = vc->master_vcore;
a29ebeaf 2175 struct kvm *kvm = vc->kvm;
371fefd6 2176
b4deba5c
PM
2177 cpu = vc->pcpu;
2178 if (vcpu) {
2179 if (vcpu->arch.timer_running) {
2180 hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
2181 vcpu->arch.timer_running = 0;
2182 }
2183 cpu += vcpu->arch.ptid;
2184 vcpu->cpu = mvc->pcpu;
2185 vcpu->arch.thread_cpu = cpu;
a29ebeaf
PM
2186
2187 /*
2188 * With radix, the guest can do TLB invalidations itself,
2189 * and it could choose to use the local form (tlbiel) if
2190 * it is invalidating a translation that has only ever been
2191 * used on one vcpu. However, that doesn't mean it has
2192 * only ever been used on one physical cpu, since vcpus
2193 * can move around between pcpus. To cope with this, when
2194 * a vcpu moves from one pcpu to another, we need to tell
2195 * any vcpus running on the same core as this vcpu previously
2196 * ran to flush the TLB. The TLB is shared between threads,
2197 * so we use a single bit in .need_tlb_flush for all 4 threads.
2198 */
2199 if (kvm_is_radix(kvm) && vcpu->arch.prev_cpu != cpu) {
2200 if (vcpu->arch.prev_cpu >= 0 &&
2201 cpu_first_thread_sibling(vcpu->arch.prev_cpu) !=
2202 cpu_first_thread_sibling(cpu))
2203 radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu);
2204 vcpu->arch.prev_cpu = cpu;
2205 }
2206 cpumask_set_cpu(cpu, &kvm->arch.cpu_in_guest);
19ccb76a 2207 }
371fefd6 2208 tpaca = &paca[cpu];
5d5b99cd 2209 tpaca->kvm_hstate.kvm_vcpu = vcpu;
ec257165 2210 tpaca->kvm_hstate.ptid = cpu - mvc->pcpu;
ec257165 2211 /* Order stores to hstate.kvm_vcpu etc. before store to kvm_vcore */
371fefd6 2212 smp_wmb();
b4deba5c 2213 tpaca->kvm_hstate.kvm_vcore = mvc;
5d5b99cd 2214 if (cpu != smp_processor_id())
66feed61 2215 kvmppc_ipi_thread(cpu);
371fefd6 2216}
de56a948 2217
5d5b99cd 2218static void kvmppc_wait_for_nap(void)
371fefd6 2219{
5d5b99cd
PM
2220 int cpu = smp_processor_id();
2221 int i, loops;
45c940ba 2222 int n_threads = threads_per_vcore();
371fefd6 2223
45c940ba
PM
2224 if (n_threads <= 1)
2225 return;
5d5b99cd
PM
2226 for (loops = 0; loops < 1000000; ++loops) {
2227 /*
2228 * Check if all threads are finished.
b4deba5c 2229 * We set the vcore pointer when starting a thread
5d5b99cd 2230 * and the thread clears it when finished, so we look
b4deba5c 2231 * for any threads that still have a non-NULL vcore ptr.
5d5b99cd 2232 */
45c940ba 2233 for (i = 1; i < n_threads; ++i)
b4deba5c 2234 if (paca[cpu + i].kvm_hstate.kvm_vcore)
5d5b99cd 2235 break;
45c940ba 2236 if (i == n_threads) {
5d5b99cd
PM
2237 HMT_medium();
2238 return;
371fefd6 2239 }
5d5b99cd 2240 HMT_low();
371fefd6
PM
2241 }
2242 HMT_medium();
45c940ba 2243 for (i = 1; i < n_threads; ++i)
b4deba5c 2244 if (paca[cpu + i].kvm_hstate.kvm_vcore)
5d5b99cd 2245 pr_err("KVM: CPU %d seems to be stuck\n", cpu + i);
371fefd6
PM
2246}
2247
2248/*
2249 * Check that we are on thread 0 and that any other threads in
7b444c67
PM
2250 * this core are offline. Then grab the threads so they can't
2251 * enter the kernel.
371fefd6
PM
2252 */
2253static int on_primary_thread(void)
2254{
2255 int cpu = smp_processor_id();
3102f784 2256 int thr;
371fefd6 2257
3102f784
ME
2258 /* Are we on a primary subcore? */
2259 if (cpu_thread_in_subcore(cpu))
371fefd6 2260 return 0;
3102f784
ME
2261
2262 thr = 0;
2263 while (++thr < threads_per_subcore)
371fefd6
PM
2264 if (cpu_online(cpu + thr))
2265 return 0;
7b444c67
PM
2266
2267 /* Grab all hw threads so they can't go into the kernel */
3102f784 2268 for (thr = 1; thr < threads_per_subcore; ++thr) {
7b444c67
PM
2269 if (kvmppc_grab_hwthread(cpu + thr)) {
2270 /* Couldn't grab one; let the others go */
2271 do {
2272 kvmppc_release_hwthread(cpu + thr);
2273 } while (--thr > 0);
2274 return 0;
2275 }
2276 }
371fefd6
PM
2277 return 1;
2278}
2279
ec257165
PM
2280/*
2281 * A list of virtual cores for each physical CPU.
2282 * These are vcores that could run but their runner VCPU tasks are
2283 * (or may be) preempted.
2284 */
2285struct preempted_vcore_list {
2286 struct list_head list;
2287 spinlock_t lock;
2288};
2289
2290static DEFINE_PER_CPU(struct preempted_vcore_list, preempted_vcores);
2291
2292static void init_vcore_lists(void)
2293{
2294 int cpu;
2295
2296 for_each_possible_cpu(cpu) {
2297 struct preempted_vcore_list *lp = &per_cpu(preempted_vcores, cpu);
2298 spin_lock_init(&lp->lock);
2299 INIT_LIST_HEAD(&lp->list);
2300 }
2301}
2302
2303static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc)
2304{
2305 struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);
2306
2307 vc->vcore_state = VCORE_PREEMPT;
2308 vc->pcpu = smp_processor_id();
45c940ba 2309 if (vc->num_threads < threads_per_vcore()) {
ec257165
PM
2310 spin_lock(&lp->lock);
2311 list_add_tail(&vc->preempt_list, &lp->list);
2312 spin_unlock(&lp->lock);
2313 }
2314
2315 /* Start accumulating stolen time */
2316 kvmppc_core_start_stolen(vc);
2317}
2318
2319static void kvmppc_vcore_end_preempt(struct kvmppc_vcore *vc)
2320{
402813fe 2321 struct preempted_vcore_list *lp;
ec257165
PM
2322
2323 kvmppc_core_end_stolen(vc);
2324 if (!list_empty(&vc->preempt_list)) {
402813fe 2325 lp = &per_cpu(preempted_vcores, vc->pcpu);
ec257165
PM
2326 spin_lock(&lp->lock);
2327 list_del_init(&vc->preempt_list);
2328 spin_unlock(&lp->lock);
2329 }
2330 vc->vcore_state = VCORE_INACTIVE;
2331}
2332
b4deba5c
PM
2333/*
2334 * This stores information about the virtual cores currently
2335 * assigned to a physical core.
2336 */
ec257165 2337struct core_info {
b4deba5c
PM
2338 int n_subcores;
2339 int max_subcore_threads;
ec257165 2340 int total_threads;
b4deba5c
PM
2341 int subcore_threads[MAX_SUBCORES];
2342 struct kvm *subcore_vm[MAX_SUBCORES];
2343 struct list_head vcs[MAX_SUBCORES];
ec257165
PM
2344};
2345
b4deba5c
PM
2346/*
2347 * This mapping means subcores 0 and 1 can use threads 0-3 and 4-7 respectively
2348 * in 2-way micro-threading mode; in 4-way mode subcores 0-3 start at threads 0, 4, 2 and 6.
2349 */
2350static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 };
2351
ec257165
PM
2352static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc)
2353{
b4deba5c
PM
2354 int sub;
2355
ec257165 2356 memset(cip, 0, sizeof(*cip));
b4deba5c
PM
2357 cip->n_subcores = 1;
2358 cip->max_subcore_threads = vc->num_threads;
ec257165 2359 cip->total_threads = vc->num_threads;
b4deba5c
PM
2360 cip->subcore_threads[0] = vc->num_threads;
2361 cip->subcore_vm[0] = vc->kvm;
2362 for (sub = 0; sub < MAX_SUBCORES; ++sub)
2363 INIT_LIST_HEAD(&cip->vcs[sub]);
2364 list_add_tail(&vc->preempt_list, &cip->vcs[0]);
2365}
2366
2367static bool subcore_config_ok(int n_subcores, int n_threads)
2368{
2369 /* Can only dynamically split if unsplit to begin with */
2370 if (n_subcores > 1 && threads_per_subcore < MAX_SMT_THREADS)
2371 return false;
2372 if (n_subcores > MAX_SUBCORES)
2373 return false;
2374 if (n_subcores > 1) {
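		/*
		 * dynamic_mt_modes: the 2 bit allows 2-way splits and the 4 bit
		 * allows 4-way splits; if 2-way is not allowed, force 4-way.
		 */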
2375 if (!(dynamic_mt_modes & 2))
2376 n_subcores = 4;
2377 if (n_subcores > 2 && !(dynamic_mt_modes & 4))
2378 return false;
2379 }
2380
2381 return n_subcores * roundup_pow_of_two(n_threads) <= MAX_SMT_THREADS;
ec257165
PM
2382}
2383
2384static void init_master_vcore(struct kvmppc_vcore *vc)
2385{
2386 vc->master_vcore = vc;
2387 vc->entry_exit_map = 0;
2388 vc->in_guest = 0;
2389 vc->napping_threads = 0;
2390 vc->conferring_threads = 0;
2391}
2392
b4deba5c
PM
2393static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
2394{
2395 int n_threads = vc->num_threads;
2396 int sub;
2397
2398 if (!cpu_has_feature(CPU_FTR_ARCH_207S))
2399 return false;
2400
2401 if (n_threads < cip->max_subcore_threads)
2402 n_threads = cip->max_subcore_threads;
b009031f 2403 if (!subcore_config_ok(cip->n_subcores + 1, n_threads))
b4deba5c 2404 return false;
b009031f 2405 cip->max_subcore_threads = n_threads;
b4deba5c
PM
2406
2407 sub = cip->n_subcores;
2408 ++cip->n_subcores;
2409 cip->total_threads += vc->num_threads;
2410 cip->subcore_threads[sub] = vc->num_threads;
2411 cip->subcore_vm[sub] = vc->kvm;
2412 init_master_vcore(vc);
28d057c8 2413 list_move_tail(&vc->preempt_list, &cip->vcs[sub]);
b4deba5c
PM
2414
2415 return true;
2416}
2417
b4deba5c
PM
2418/*
2419 * Work out whether it is possible to piggyback the execution of
2420 * vcore *pvc onto the execution of the other vcores described in *cip.
2421 */
2422static bool can_piggyback(struct kvmppc_vcore *pvc, struct core_info *cip,
2423 int target_threads)
2424{
b4deba5c
PM
2425 if (cip->total_threads + pvc->num_threads > target_threads)
2426 return false;
b4deba5c 2427
b009031f 2428 return can_dynamic_split(pvc, cip);
b4deba5c
PM
2429}
2430
d911f0be
PM
2431static void prepare_threads(struct kvmppc_vcore *vc)
2432{
7b5f8272
SJS
2433 int i;
2434 struct kvm_vcpu *vcpu;
d911f0be 2435
7b5f8272 2436 for_each_runnable_thread(i, vcpu, vc) {
d911f0be
PM
2437 if (signal_pending(vcpu->arch.run_task))
2438 vcpu->arch.ret = -EINTR;
2439 else if (vcpu->arch.vpa.update_pending ||
2440 vcpu->arch.slb_shadow.update_pending ||
2441 vcpu->arch.dtl.update_pending)
2442 vcpu->arch.ret = RESUME_GUEST;
2443 else
2444 continue;
2445 kvmppc_remove_runnable(vc, vcpu);
2446 wake_up(&vcpu->arch.cpu_run);
2447 }
2448}
2449
ec257165
PM
2450static void collect_piggybacks(struct core_info *cip, int target_threads)
2451{
2452 struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);
2453 struct kvmppc_vcore *pvc, *vcnext;
2454
2455 spin_lock(&lp->lock);
2456 list_for_each_entry_safe(pvc, vcnext, &lp->list, preempt_list) {
2457 if (!spin_trylock(&pvc->lock))
2458 continue;
2459 prepare_threads(pvc);
2460 if (!pvc->n_runnable) {
2461 list_del_init(&pvc->preempt_list);
2462 if (pvc->runner == NULL) {
2463 pvc->vcore_state = VCORE_INACTIVE;
2464 kvmppc_core_end_stolen(pvc);
2465 }
2466 spin_unlock(&pvc->lock);
2467 continue;
2468 }
2469 if (!can_piggyback(pvc, cip, target_threads)) {
2470 spin_unlock(&pvc->lock);
2471 continue;
2472 }
2473 kvmppc_core_end_stolen(pvc);
2474 pvc->vcore_state = VCORE_PIGGYBACK;
2475 if (cip->total_threads >= target_threads)
2476 break;
2477 }
2478 spin_unlock(&lp->lock);
2479}
2480
2481static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
25fedfca 2482{
7b5f8272 2483 int still_running = 0, i;
25fedfca
PM
2484 u64 now;
2485 long ret;
7b5f8272 2486 struct kvm_vcpu *vcpu;
25fedfca 2487
ec257165 2488 spin_lock(&vc->lock);
25fedfca 2489 now = get_tb();
7b5f8272 2490 for_each_runnable_thread(i, vcpu, vc) {
25fedfca
PM
2491 /* cancel pending dec exception if dec is positive */
2492 if (now < vcpu->arch.dec_expires &&
2493 kvmppc_core_pending_dec(vcpu))
2494 kvmppc_core_dequeue_dec(vcpu);
2495
2496 trace_kvm_guest_exit(vcpu);
2497
2498 ret = RESUME_GUEST;
2499 if (vcpu->arch.trap)
2500 ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu,
2501 vcpu->arch.run_task);
2502
2503 vcpu->arch.ret = ret;
2504 vcpu->arch.trap = 0;
2505
ec257165
PM
2506 if (is_kvmppc_resume_guest(vcpu->arch.ret)) {
2507 if (vcpu->arch.pending_exceptions)
2508 kvmppc_core_prepare_to_enter(vcpu);
2509 if (vcpu->arch.ceded)
25fedfca 2510 kvmppc_set_timer(vcpu);
ec257165
PM
2511 else
2512 ++still_running;
2513 } else {
25fedfca
PM
2514 kvmppc_remove_runnable(vc, vcpu);
2515 wake_up(&vcpu->arch.cpu_run);
2516 }
2517 }
ec257165
PM
2518 list_del_init(&vc->preempt_list);
2519 if (!is_master) {
563a1e93 2520 if (still_running > 0) {
ec257165 2521 kvmppc_vcore_preempt(vc);
563a1e93
PM
2522 } else if (vc->runner) {
2523 vc->vcore_state = VCORE_PREEMPT;
2524 kvmppc_core_start_stolen(vc);
2525 } else {
2526 vc->vcore_state = VCORE_INACTIVE;
2527 }
ec257165
PM
2528 if (vc->n_runnable > 0 && vc->runner == NULL) {
2529 /* make sure there's a candidate runner awake */
7b5f8272
SJS
2530 i = -1;
2531 vcpu = next_runnable_thread(vc, &i);
ec257165
PM
2532 wake_up(&vcpu->arch.cpu_run);
2533 }
2534 }
2535 spin_unlock(&vc->lock);
25fedfca
PM
2536}
2537
b8e6a87c
SW
2538/*
2539 * Clear core from the list of active host cores as we are about to
2540 * enter the guest. Only do this if it is the primary thread of the
2541 * core (not if a subcore) that is entering the guest.
2542 */
3f7cd919 2543static inline int kvmppc_clear_host_core(unsigned int cpu)
b8e6a87c
SW
2544{
2545 int core;
2546
2547 if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
3f7cd919 2548 return 0;
b8e6a87c
SW
2549 /*
2550 * Memory barrier can be omitted here as we will do a smp_wmb()
2551 * later in kvmppc_start_thread and we need to ensure that the state is
2552 * visible to other CPUs only after we enter the guest.
2553 */
2554 core = cpu >> threads_shift;
2555 kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 0;
3f7cd919 2556 return 0;
b8e6a87c
SW
2557}
2558
2559/*
2560 * Advertise this core as an active host core, since we have exited the
2561 * guest. We only need to do this if it is the primary thread of the
2562 * core that is exiting.
2563 */
3f7cd919 2564static inline int kvmppc_set_host_core(unsigned int cpu)
b8e6a87c
SW
2565{
2566 int core;
2567
2568 if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
3f7cd919 2569 return 0;
b8e6a87c
SW
2570
2571 /*
2572 * Memory barrier can be omitted here because we do a spin_unlock
2573 * immediately after this, which provides the memory barrier.
2574 */
2575 core = cpu >> threads_shift;
2576 kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 1;
3f7cd919 2577 return 0;
b8e6a87c
SW
2578}
2579
371fefd6
PM
2580/*
2581 * Run a set of guest threads on a physical core.
2582 * Called with vc->lock held.
2583 */
66feed61 2584static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
371fefd6 2585{
7b5f8272 2586 struct kvm_vcpu *vcpu;
d911f0be 2587 int i;
2c9097e4 2588 int srcu_idx;
ec257165
PM
2589 struct core_info core_info;
2590 struct kvmppc_vcore *pvc, *vcnext;
b4deba5c
PM
2591 struct kvm_split_mode split_info, *sip;
2592 int split, subcore_size, active;
2593 int sub;
2594 bool thr0_done;
2595 unsigned long cmd_bit, stat_bit;
ec257165
PM
2596 int pcpu, thr;
2597 int target_threads;
45c940ba 2598 int controlled_threads;
371fefd6 2599
d911f0be
PM
2600 /*
2601 * Remove from the list any threads that have a signal pending
2602 * or need a VPA update done
2603 */
2604 prepare_threads(vc);
2605
2606 /* if the runner is no longer runnable, let the caller pick a new one */
2607 if (vc->runner->arch.state != KVMPPC_VCPU_RUNNABLE)
2608 return;
081f323b
PM
2609
2610 /*
d911f0be 2611 * Initialize *vc.
081f323b 2612 */
ec257165 2613 init_master_vcore(vc);
2711e248 2614 vc->preempt_tb = TB_NIL;
081f323b 2615
45c940ba
PM
2616 /*
2617 * Number of threads that we will be controlling: the same as
2618 * the number of threads per subcore, except on POWER9,
2619 * where it's 1 because the threads are (mostly) independent.
2620 */
2621 controlled_threads = threads_per_vcore();
2622
7b444c67 2623 /*
3102f784
ME
2624 * Make sure we are running on primary threads, and that secondary
2625 * threads are offline. Also check whether the number of threads in this
2626 * guest is greater than the current system threads per guest.
7b444c67 2627 */
45c940ba 2628 if ((controlled_threads > 1) &&
3102f784 2629 ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
7b5f8272 2630 for_each_runnable_thread(i, vcpu, vc) {
7b444c67 2631 vcpu->arch.ret = -EBUSY;
25fedfca
PM
2632 kvmppc_remove_runnable(vc, vcpu);
2633 wake_up(&vcpu->arch.cpu_run);
2634 }
7b444c67
PM
2635 goto out;
2636 }
2637
ec257165
PM
2638 /*
2639 * See if we could run any other vcores on the physical core
2640 * along with this one.
2641 */
2642 init_core_info(&core_info, vc);
2643 pcpu = smp_processor_id();
45c940ba 2644 target_threads = controlled_threads;
ec257165
PM
2645 if (target_smt_mode && target_smt_mode < target_threads)
2646 target_threads = target_smt_mode;
2647 if (vc->num_threads < target_threads)
2648 collect_piggybacks(&core_info, target_threads);
3102f784 2649
b4deba5c
PM
2650 /* Decide on micro-threading (split-core) mode */
2651 subcore_size = threads_per_subcore;
2652 cmd_bit = stat_bit = 0;
2653 split = core_info.n_subcores;
2654 sip = NULL;
2655 if (split > 1) {
2656 /* threads_per_subcore must be MAX_SMT_THREADS (8) here */
2657 if (split == 2 && (dynamic_mt_modes & 2)) {
2658 cmd_bit = HID0_POWER8_1TO2LPAR;
2659 stat_bit = HID0_POWER8_2LPARMODE;
2660 } else {
2661 split = 4;
2662 cmd_bit = HID0_POWER8_1TO4LPAR;
2663 stat_bit = HID0_POWER8_4LPARMODE;
2664 }
2665 subcore_size = MAX_SMT_THREADS / split;
2666 sip = &split_info;
2667 memset(&split_info, 0, sizeof(split_info));
2668 split_info.rpr = mfspr(SPRN_RPR);
2669 split_info.pmmar = mfspr(SPRN_PMMAR);
2670 split_info.ldbar = mfspr(SPRN_LDBAR);
2671 split_info.subcore_size = subcore_size;
2672 for (sub = 0; sub < core_info.n_subcores; ++sub)
2673 split_info.master_vcs[sub] =
2674 list_first_entry(&core_info.vcs[sub],
2675 struct kvmppc_vcore, preempt_list);
2676 /* order writes to split_info before kvm_split_mode pointer */
2677 smp_wmb();
2678 }
2679 pcpu = smp_processor_id();
45c940ba 2680 for (thr = 0; thr < controlled_threads; ++thr)
b4deba5c
PM
2681 paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip;
2682
2683 /* Initiate micro-threading (split-core) if required */
2684 if (cmd_bit) {
2685 unsigned long hid0 = mfspr(SPRN_HID0);
2686
2687 hid0 |= cmd_bit | HID0_POWER8_DYNLPARDIS;
2688 mb();
2689 mtspr(SPRN_HID0, hid0);
2690 isync();
2691 for (;;) {
2692 hid0 = mfspr(SPRN_HID0);
2693 if (hid0 & stat_bit)
2694 break;
2695 cpu_relax();
ec257165 2696 }
2e25aa5f 2697 }
3102f784 2698
b8e6a87c
SW
2699 kvmppc_clear_host_core(pcpu);
2700
b4deba5c
PM
2701 /* Start all the threads */
2702 active = 0;
2703 for (sub = 0; sub < core_info.n_subcores; ++sub) {
2704 thr = subcore_thread_map[sub];
2705 thr0_done = false;
2706 active |= 1 << thr;
2707 list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) {
2708 pvc->pcpu = pcpu + thr;
7b5f8272 2709 for_each_runnable_thread(i, vcpu, pvc) {
b4deba5c
PM
2710 kvmppc_start_thread(vcpu, pvc);
2711 kvmppc_create_dtl_entry(vcpu, pvc);
2712 trace_kvm_guest_enter(vcpu);
2713 if (!vcpu->arch.ptid)
2714 thr0_done = true;
2715 active |= 1 << (thr + vcpu->arch.ptid);
2716 }
2717 /*
2718 * We need to start the first thread of each subcore
2719 * even if it doesn't have a vcpu.
2720 */
2721 if (pvc->master_vcore == pvc && !thr0_done)
2722 kvmppc_start_thread(NULL, pvc);
2723 thr += pvc->num_threads;
2724 }
2e25aa5f 2725 }
371fefd6 2726
7f235328
GS
2727 /*
2728 * Ensure that split_info.do_nap is set after setting
2729 * the vcore pointer in the PACA of the secondaries.
2730 */
2731 smp_mb();
2732 if (cmd_bit)
2733 split_info.do_nap = 1; /* ask secondaries to nap when done */
2734
b4deba5c
PM
2735 /*
2736 * When doing micro-threading, poke the inactive threads as well.
2737 * This gets them to the nap instruction after kvm_do_nap,
2738 * which reduces the time taken to unsplit later.
2739 */
2740 if (split > 1)
2741 for (thr = 1; thr < threads_per_subcore; ++thr)
2742 if (!(active & (1 << thr)))
2743 kvmppc_ipi_thread(pcpu + thr);
e0b7ec05 2744
2f12f034 2745 vc->vcore_state = VCORE_RUNNING;
19ccb76a 2746 preempt_disable();
3c78f78a
SW
2747
2748 trace_kvmppc_run_core(vc, 0);
2749
b4deba5c
PM
2750 for (sub = 0; sub < core_info.n_subcores; ++sub)
2751 list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list)
2752 spin_unlock(&pvc->lock);
de56a948 2753
6edaa530 2754 guest_enter();
2c9097e4 2755
e0b7ec05 2756 srcu_idx = srcu_read_lock(&vc->kvm->srcu);
2c9097e4 2757
e0b7ec05 2758 __kvmppc_vcore_entry();
de56a948 2759
ec257165
PM
2760 srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
2761
2762 spin_lock(&vc->lock);
371fefd6 2763 /* prevent other vcpu threads from doing kvmppc_start_thread() now */
19ccb76a 2764 vc->vcore_state = VCORE_EXITING;
371fefd6 2765
19ccb76a 2766 /* wait for secondary threads to finish writing their state to memory */
5d5b99cd 2767 kvmppc_wait_for_nap();
b4deba5c
PM
2768
2769 /* Return to whole-core mode if we split the core earlier */
2770 if (split > 1) {
2771 unsigned long hid0 = mfspr(SPRN_HID0);
2772 unsigned long loops = 0;
2773
2774 hid0 &= ~HID0_POWER8_DYNLPARDIS;
2775 stat_bit = HID0_POWER8_2LPARMODE | HID0_POWER8_4LPARMODE;
2776 mb();
2777 mtspr(SPRN_HID0, hid0);
2778 isync();
2779 for (;;) {
2780 hid0 = mfspr(SPRN_HID0);
2781 if (!(hid0 & stat_bit))
2782 break;
2783 cpu_relax();
2784 ++loops;
2785 }
2786 split_info.do_nap = 0;
2787 }
2788
2789 /* Let secondaries go back to the offline loop */
45c940ba 2790 for (i = 0; i < controlled_threads; ++i) {
b4deba5c
PM
2791 kvmppc_release_hwthread(pcpu + i);
2792 if (sip && sip->napped[i])
2793 kvmppc_ipi_thread(pcpu + i);
a29ebeaf 2794 cpumask_clear_cpu(pcpu + i, &vc->kvm->arch.cpu_in_guest);
b4deba5c
PM
2795 }
2796
b8e6a87c
SW
2797 kvmppc_set_host_core(pcpu);
2798
371fefd6 2799 spin_unlock(&vc->lock);
2c9097e4 2800
371fefd6
PM
2801 /* make sure updates to secondary vcpu structs are visible now */
2802 smp_mb();
6edaa530 2803 guest_exit();
de56a948 2804
b4deba5c
PM
2805 for (sub = 0; sub < core_info.n_subcores; ++sub)
2806 list_for_each_entry_safe(pvc, vcnext, &core_info.vcs[sub],
2807 preempt_list)
2808 post_guest_process(pvc, pvc == vc);
de56a948 2809
913d3ff9 2810 spin_lock(&vc->lock);
ec257165 2811 preempt_enable();
de56a948
PM
2812
2813 out:
19ccb76a 2814 vc->vcore_state = VCORE_INACTIVE;
3c78f78a 2815 trace_kvmppc_run_core(vc, 1);
371fefd6
PM
2816}
2817
19ccb76a
PM
2818/*
2819 * Wait for some other vcpu thread to execute us, and
2820 * wake us up when we need to handle something in the host.
2821 */
ec257165
PM
2822static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc,
2823 struct kvm_vcpu *vcpu, int wait_state)
371fefd6 2824{
371fefd6
PM
2825 DEFINE_WAIT(wait);
2826
19ccb76a 2827 prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
ec257165
PM
2828 if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
2829 spin_unlock(&vc->lock);
19ccb76a 2830 schedule();
ec257165
PM
2831 spin_lock(&vc->lock);
2832 }
19ccb76a
PM
2833 finish_wait(&vcpu->arch.cpu_run, &wait);
2834}
2835
0cda69dd
SJS
2836static void grow_halt_poll_ns(struct kvmppc_vcore *vc)
2837{
2838 /* 10us base */
2839 if (vc->halt_poll_ns == 0 && halt_poll_ns_grow)
2840 vc->halt_poll_ns = 10000;
2841 else
2842 vc->halt_poll_ns *= halt_poll_ns_grow;
0cda69dd
SJS
2843}
2844
2845static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
2846{
2847 if (halt_poll_ns_shrink == 0)
2848 vc->halt_poll_ns = 0;
2849 else
2850 vc->halt_poll_ns /= halt_poll_ns_shrink;
2851}
2852
ee3308a2
PM
2853#ifdef CONFIG_KVM_XICS
2854static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
2855{
2856 if (!xive_enabled())
2857 return false;
2858 return vcpu->arch.xive_saved_state.pipr <
2859 vcpu->arch.xive_saved_state.cppr;
2860}
2861#else
2862static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
2863{
2864 return false;
2865}
2866#endif /* CONFIG_KVM_XICS */
2867
1da4e2f4
PM
2868static bool kvmppc_vcpu_woken(struct kvm_vcpu *vcpu)
2869{
2870 if (vcpu->arch.pending_exceptions || vcpu->arch.prodded ||
ee3308a2 2871 kvmppc_doorbell_pending(vcpu) || xive_interrupt_pending(vcpu))
1da4e2f4
PM
2872 return true;
2873
2874 return false;
2875}
2876
908a0935
SJS
2877/*
2878 * Check to see if any of the runnable vcpus on the vcore have pending
0cda69dd
SJS
2879 * exceptions or are no longer ceded
2880 */
2881static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
2882{
2883 struct kvm_vcpu *vcpu;
2884 int i;
2885
2886 for_each_runnable_thread(i, vcpu, vc) {
1da4e2f4 2887 if (!vcpu->arch.ceded || kvmppc_vcpu_woken(vcpu))
0cda69dd
SJS
2888 return 1;
2889 }
2890
2891 return 0;
2892}
2893
19ccb76a
PM
2894/*
2895 * All the vcpus in this vcore are idle, so wait for a decrementer
2896 * or external interrupt to one of the vcpus. vc->lock is held.
2897 */
2898static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
2899{
2a27f514 2900 ktime_t cur, start_poll, start_wait;
0cda69dd 2901 int do_sleep = 1;
0cda69dd 2902 u64 block_ns;
8577370f 2903 DECLARE_SWAITQUEUE(wait);
1bc5d59c 2904
0cda69dd 2905 /* Poll for pending exceptions and ceded state */
2a27f514 2906 cur = start_poll = ktime_get();
0cda69dd 2907 if (vc->halt_poll_ns) {
2a27f514
SJS
2908 ktime_t stop = ktime_add_ns(start_poll, vc->halt_poll_ns);
2909 ++vc->runner->stat.halt_attempted_poll;
1bc5d59c 2910
0cda69dd
SJS
2911 vc->vcore_state = VCORE_POLLING;
2912 spin_unlock(&vc->lock);
2913
2914 do {
2915 if (kvmppc_vcore_check_block(vc)) {
2916 do_sleep = 0;
2917 break;
2918 }
2919 cur = ktime_get();
2920 } while (single_task_running() && ktime_before(cur, stop));
2921
2922 spin_lock(&vc->lock);
2923 vc->vcore_state = VCORE_INACTIVE;
2924
2a27f514
SJS
2925 if (!do_sleep) {
2926 ++vc->runner->stat.halt_successful_poll;
0cda69dd 2927 goto out;
2a27f514 2928 }
1bc5d59c
SW
2929 }
2930
0cda69dd
SJS
2931 prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
2932
2933 if (kvmppc_vcore_check_block(vc)) {
8577370f 2934 finish_swait(&vc->wq, &wait);
0cda69dd 2935 do_sleep = 0;
2a27f514
SJS
2936 /* If we polled, count this as a successful poll */
2937 if (vc->halt_poll_ns)
2938 ++vc->runner->stat.halt_successful_poll;
0cda69dd 2939 goto out;
1bc5d59c
SW
2940 }
2941
2a27f514
SJS
2942 start_wait = ktime_get();
2943
19ccb76a 2944 vc->vcore_state = VCORE_SLEEPING;
3c78f78a 2945 trace_kvmppc_vcore_blocked(vc, 0);
19ccb76a 2946 spin_unlock(&vc->lock);
913d3ff9 2947 schedule();
8577370f 2948 finish_swait(&vc->wq, &wait);
19ccb76a
PM
2949 spin_lock(&vc->lock);
2950 vc->vcore_state = VCORE_INACTIVE;
3c78f78a 2951 trace_kvmppc_vcore_blocked(vc, 1);
2a27f514 2952 ++vc->runner->stat.halt_successful_wait;
0cda69dd
SJS
2953
2954 cur = ktime_get();
2955
2956out:
2a27f514
SJS
2957 block_ns = ktime_to_ns(cur) - ktime_to_ns(start_poll);
2958
2959 /* Attribute wait time */
2960 if (do_sleep) {
2961 vc->runner->stat.halt_wait_ns +=
2962 ktime_to_ns(cur) - ktime_to_ns(start_wait);
2963 /* Attribute failed poll time */
2964 if (vc->halt_poll_ns)
2965 vc->runner->stat.halt_poll_fail_ns +=
2966 ktime_to_ns(start_wait) -
2967 ktime_to_ns(start_poll);
2968 } else {
2969 /* Attribute successful poll time */
2970 if (vc->halt_poll_ns)
2971 vc->runner->stat.halt_poll_success_ns +=
2972 ktime_to_ns(cur) -
2973 ktime_to_ns(start_poll);
2974 }
0cda69dd
SJS
2975
2976 /* Adjust poll time */
307d93e4 2977 if (halt_poll_ns) {
0cda69dd
SJS
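		/* The block time fit within the current poll window: leave it alone */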
2978 if (block_ns <= vc->halt_poll_ns)
2979 ;
2980 /* We slept and blocked for longer than the max halt time */
307d93e4 2981 else if (vc->halt_poll_ns && block_ns > halt_poll_ns)
0cda69dd
SJS
2982 shrink_halt_poll_ns(vc);
2983 /* We slept and our poll time is too small */
307d93e4
SJS
2984 else if (vc->halt_poll_ns < halt_poll_ns &&
2985 block_ns < halt_poll_ns)
0cda69dd 2986 grow_halt_poll_ns(vc);
e03f3921
SJS
2987 if (vc->halt_poll_ns > halt_poll_ns)
2988 vc->halt_poll_ns = halt_poll_ns;
0cda69dd
SJS
2989 } else
2990 vc->halt_poll_ns = 0;
2991
2992 trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
19ccb76a 2993}
371fefd6 2994
19ccb76a
PM
2995static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2996{
7b5f8272 2997 int n_ceded, i;
19ccb76a 2998 struct kvmppc_vcore *vc;
7b5f8272 2999 struct kvm_vcpu *v;
9e368f29 3000
3c78f78a
SW
3001 trace_kvmppc_run_vcpu_enter(vcpu);
3002
371fefd6
PM
3003 kvm_run->exit_reason = 0;
3004 vcpu->arch.ret = RESUME_GUEST;
3005 vcpu->arch.trap = 0;
2f12f034 3006 kvmppc_update_vpas(vcpu);
371fefd6 3007
371fefd6
PM
3008 /*
3009 * Synchronize with other threads in this virtual core
3010 */
3011 vc = vcpu->arch.vcore;
3012 spin_lock(&vc->lock);
19ccb76a 3013 vcpu->arch.ceded = 0;
371fefd6
PM
3014 vcpu->arch.run_task = current;
3015 vcpu->arch.kvm_run = kvm_run;
c7b67670 3016 vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
19ccb76a 3017 vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
c7b67670 3018 vcpu->arch.busy_preempt = TB_NIL;
7b5f8272 3019 WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], vcpu);
371fefd6
PM
3020 ++vc->n_runnable;
3021
19ccb76a
PM
3022 /*
3023 * This happens the first time this is called for a vcpu.
3024 * If the vcore is already running, we may be able to start
3025 * this thread straight away and have it join in.
3026 */
8455d79e 3027 if (!signal_pending(current)) {
ec257165
PM
3028 if (vc->vcore_state == VCORE_PIGGYBACK) {
3029 struct kvmppc_vcore *mvc = vc->master_vcore;
3030 if (spin_trylock(&mvc->lock)) {
3031 if (mvc->vcore_state == VCORE_RUNNING &&
3032 !VCORE_IS_EXITING(mvc)) {
3033 kvmppc_create_dtl_entry(vcpu, vc);
b4deba5c 3034 kvmppc_start_thread(vcpu, vc);
ec257165
PM
3035 trace_kvm_guest_enter(vcpu);
3036 }
3037 spin_unlock(&mvc->lock);
3038 }
3039 } else if (vc->vcore_state == VCORE_RUNNING &&
3040 !VCORE_IS_EXITING(vc)) {
2f12f034 3041 kvmppc_create_dtl_entry(vcpu, vc);
b4deba5c 3042 kvmppc_start_thread(vcpu, vc);
3c78f78a 3043 trace_kvm_guest_enter(vcpu);
8455d79e 3044 } else if (vc->vcore_state == VCORE_SLEEPING) {
8577370f 3045 swake_up(&vc->wq);
371fefd6
PM
3046 }
3047
8455d79e 3048 }
371fefd6 3049
19ccb76a
PM
3050 while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
3051 !signal_pending(current)) {
ec257165
PM
3052 if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL)
3053 kvmppc_vcore_end_preempt(vc);
3054
8455d79e 3055 if (vc->vcore_state != VCORE_INACTIVE) {
ec257165 3056 kvmppc_wait_for_exec(vc, vcpu, TASK_INTERRUPTIBLE);
19ccb76a
PM
3057 continue;
3058 }
7b5f8272 3059 for_each_runnable_thread(i, v, vc) {
7e28e60e 3060 kvmppc_core_prepare_to_enter(v);
19ccb76a
PM
3061 if (signal_pending(v->arch.run_task)) {
3062 kvmppc_remove_runnable(vc, v);
3063 v->stat.signal_exits++;
3064 v->arch.kvm_run->exit_reason = KVM_EXIT_INTR;
3065 v->arch.ret = -EINTR;
3066 wake_up(&v->arch.cpu_run);
3067 }
3068 }
8455d79e
PM
3069 if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
3070 break;
8455d79e 3071 n_ceded = 0;
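		/* Count ceded vcpus with nothing pending; clear ceded if a wakeup is due */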
7b5f8272 3072 for_each_runnable_thread(i, v, vc) {
1da4e2f4 3073 if (!kvmppc_vcpu_woken(v))
8455d79e 3074 n_ceded += v->arch.ceded;
4619ac88
PM
3075 else
3076 v->arch.ceded = 0;
3077 }
25fedfca
PM
3078 vc->runner = vcpu;
3079 if (n_ceded == vc->n_runnable) {
8455d79e 3080 kvmppc_vcore_blocked(vc);
c56dadf3 3081 } else if (need_resched()) {
ec257165 3082 kvmppc_vcore_preempt(vc);
25fedfca
PM
3083 /* Let something else run */
3084 cond_resched_lock(&vc->lock);
ec257165
PM
3085 if (vc->vcore_state == VCORE_PREEMPT)
3086 kvmppc_vcore_end_preempt(vc);
25fedfca 3087 } else {
8455d79e 3088 kvmppc_run_core(vc);
25fedfca 3089 }
0456ec4f 3090 vc->runner = NULL;
19ccb76a 3091 }
371fefd6 3092
8455d79e
PM
3093 while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
3094 (vc->vcore_state == VCORE_RUNNING ||
5fc3e64f
PM
3095 vc->vcore_state == VCORE_EXITING ||
3096 vc->vcore_state == VCORE_PIGGYBACK))
ec257165 3097 kvmppc_wait_for_exec(vc, vcpu, TASK_UNINTERRUPTIBLE);
8455d79e 3098
5fc3e64f
PM
3099 if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL)
3100 kvmppc_vcore_end_preempt(vc);
3101
8455d79e
PM
3102 if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
3103 kvmppc_remove_runnable(vc, vcpu);
3104 vcpu->stat.signal_exits++;
3105 kvm_run->exit_reason = KVM_EXIT_INTR;
3106 vcpu->arch.ret = -EINTR;
3107 }
3108
3109 if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) {
3110 /* Wake up some vcpu to run the core */
7b5f8272
SJS
3111 i = -1;
3112 v = next_runnable_thread(vc, &i);
8455d79e 3113 wake_up(&v->arch.cpu_run);
371fefd6
PM
3114 }
3115
3c78f78a 3116 trace_kvmppc_run_vcpu_exit(vcpu, kvm_run);
371fefd6 3117 spin_unlock(&vc->lock);
371fefd6 3118 return vcpu->arch.ret;
de56a948
PM
3119}
3120
3a167bea 3121static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
a8606e20
PM
3122{
3123 int r;
913d3ff9 3124 int srcu_idx;
ca8efa1d 3125 unsigned long ebb_regs[3] = {}; /* shut up GCC */
4c3bb4cc
PM
3126 unsigned long user_tar = 0;
3127 unsigned int user_vrsave;
a8606e20 3128
af8f38b3
AG
3129 if (!vcpu->arch.sane) {
3130 run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3131 return -EINVAL;
3132 }
3133
46a704f8
PM
3134 /*
3135 * Don't allow entry with a suspended transaction, because
3136 * the guest entry/exit code will lose it.
3137 * If the guest has TM enabled, save away their TM-related SPRs
3138 * (they will get restored by the TM unavailable interrupt).
3139 */
3140#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
3141 if (cpu_has_feature(CPU_FTR_TM) && current->thread.regs &&
3142 (current->thread.regs->msr & MSR_TM)) {
3143 if (MSR_TM_ACTIVE(current->thread.regs->msr)) {
3144 run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3145 run->fail_entry.hardware_entry_failure_reason = 0;
3146 return -EINVAL;
3147 }
3148 current->thread.tm_tfhar = mfspr(SPRN_TFHAR);
3149 current->thread.tm_tfiar = mfspr(SPRN_TFIAR);
3150 current->thread.tm_texasr = mfspr(SPRN_TEXASR);
3151 current->thread.regs->msr &= ~MSR_TM;
3152 }
3153#endif
3154
25051b5a
SW
3155 kvmppc_core_prepare_to_enter(vcpu);
3156
19ccb76a
PM
3157 /* No need to go into the guest when all we'll do is come back out */
3158 if (signal_pending(current)) {
3159 run->exit_reason = KVM_EXIT_INTR;
3160 return -EINTR;
3161 }
3162
32fad281 3163 atomic_inc(&vcpu->kvm->arch.vcpus_running);
31037eca 3164 /* Order vcpus_running vs. hpte_setup_done, see kvmppc_alloc_reset_hpt */
32fad281
PM
3165 smp_mb();
3166
c17b98cf 3167 /* On the first time here, set up HTAB and VRMA */
8cf4ecc0 3168 if (!kvm_is_radix(vcpu->kvm) && !vcpu->kvm->arch.hpte_setup_done) {
32fad281 3169 r = kvmppc_hv_setup_htab_rma(vcpu);
c77162de 3170 if (r)
32fad281 3171 goto out;
c77162de 3172 }
19ccb76a 3173
579e633e
AB
3174 flush_all_to_thread(current);
3175
4c3bb4cc 3176 /* Save userspace EBB and other register values */
ca8efa1d
PM
3177 if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
3178 ebb_regs[0] = mfspr(SPRN_EBBHR);
3179 ebb_regs[1] = mfspr(SPRN_EBBRR);
3180 ebb_regs[2] = mfspr(SPRN_BESCR);
4c3bb4cc 3181 user_tar = mfspr(SPRN_TAR);
ca8efa1d 3182 }
4c3bb4cc 3183 user_vrsave = mfspr(SPRN_VRSAVE);
ca8efa1d 3184
19ccb76a 3185 vcpu->arch.wqp = &vcpu->arch.vcore->wq;
342d3db7 3186 vcpu->arch.pgdir = current->mm->pgd;
c7b67670 3187 vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
19ccb76a 3188
a8606e20
PM
3189 do {
3190 r = kvmppc_run_vcpu(run, vcpu);
3191
3192 if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
3193 !(vcpu->arch.shregs.msr & MSR_PR)) {
3c78f78a 3194 trace_kvm_hcall_enter(vcpu);
a8606e20 3195 r = kvmppc_pseries_do_hcall(vcpu);
3c78f78a 3196 trace_kvm_hcall_exit(vcpu, r);
7e28e60e 3197 kvmppc_core_prepare_to_enter(vcpu);
913d3ff9
PM
3198 } else if (r == RESUME_PAGE_FAULT) {
3199 srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
3200 r = kvmppc_book3s_hv_page_fault(run, vcpu,
3201 vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
3202 srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
5af50993
BH
3203 } else if (r == RESUME_PASSTHROUGH) {
3204 if (WARN_ON(xive_enabled()))
3205 r = H_SUCCESS;
3206 else
3207 r = kvmppc_xics_rm_complete(vcpu, 0);
3208 }
e59d24e6 3209 } while (is_kvmppc_resume_guest(r));
32fad281 3210
4c3bb4cc 3211 /* Restore userspace EBB and other register values */
ca8efa1d
PM
3212 if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
3213 mtspr(SPRN_EBBHR, ebb_regs[0]);
3214 mtspr(SPRN_EBBRR, ebb_regs[1]);
3215 mtspr(SPRN_BESCR, ebb_regs[2]);
4c3bb4cc
PM
3216 mtspr(SPRN_TAR, user_tar);
3217 mtspr(SPRN_FSCR, current->thread.fscr);
ca8efa1d 3218 }
4c3bb4cc 3219 mtspr(SPRN_VRSAVE, user_vrsave);
ca8efa1d 3220
32fad281 3221 out:
c7b67670 3222 vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
32fad281 3223 atomic_dec(&vcpu->kvm->arch.vcpus_running);
a8606e20
PM
3224 return r;
3225}
3226
5b74716e
BH
3227static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
3228 int linux_psize)
3229{
3230 struct mmu_psize_def *def = &mmu_psize_defs[linux_psize];
3231
3232 if (!def->shift)
3233 return;
3234 (*sps)->page_shift = def->shift;
3235 (*sps)->slb_enc = def->sllp;
3236 (*sps)->enc[0].page_shift = def->shift;
b1022fbd 3237 (*sps)->enc[0].pte_enc = def->penc[linux_psize];
1f365bb0
AK
3238 /*
3239 * Add 16MB MPSS support if host supports it
3240 */
3241 if (linux_psize != MMU_PAGE_16M && def->penc[MMU_PAGE_16M] != -1) {
3242 (*sps)->enc[1].page_shift = 24;
3243 (*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M];
3244 }
5b74716e
BH
3245 (*sps)++;
3246}
3247
3a167bea
AK
3248static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
3249 struct kvm_ppc_smmu_info *info)
5b74716e
BH
3250{
3251 struct kvm_ppc_one_seg_page_size *sps;
3252
8cf4ecc0
PM
3253 /*
3254 * Since we don't yet support HPT guests on a radix host,
3255 * return an error if the host uses radix.
3256 */
3257 if (radix_enabled())
3258 return -EINVAL;
3259
5b74716e
BH
3260 info->flags = KVM_PPC_PAGE_SIZES_REAL;
3261 if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
3262 info->flags |= KVM_PPC_1T_SEGMENTS;
3263 info->slb_size = mmu_slb_size;
3264
3265 /* We only support these sizes for now, and no multi-size segments */
3266 sps = &info->sps[0];
3267 kvmppc_add_seg_page_size(&sps, MMU_PAGE_4K);
3268 kvmppc_add_seg_page_size(&sps, MMU_PAGE_64K);
3269 kvmppc_add_seg_page_size(&sps, MMU_PAGE_16M);
3270
3271 return 0;
3272}
3273
82ed3616
PM
3274/*
3275 * Get (and clear) the dirty memory log for a memory slot.
3276 */
3a167bea
AK
3277static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
3278 struct kvm_dirty_log *log)
82ed3616 3279{
9f6b8029 3280 struct kvm_memslots *slots;
82ed3616 3281 struct kvm_memory_slot *memslot;
8f7b79b8 3282 int i, r;
82ed3616 3283 unsigned long n;
8f7b79b8
PM
3284 unsigned long *buf;
3285 struct kvm_vcpu *vcpu;
82ed3616
PM
3286
3287 mutex_lock(&kvm->slots_lock);
3288
3289 r = -EINVAL;
bbacc0c1 3290 if (log->slot >= KVM_USER_MEM_SLOTS)
82ed3616
PM
3291 goto out;
3292
9f6b8029
PB
3293 slots = kvm_memslots(kvm);
3294 memslot = id_to_memslot(slots, log->slot);
82ed3616
PM
3295 r = -ENOENT;
3296 if (!memslot->dirty_bitmap)
3297 goto out;
3298
8f7b79b8
PM
3299 /*
3300 * Use second half of bitmap area because radix accumulates
3301 * bits in the first half.
3302 */
82ed3616 3303 n = kvm_dirty_bitmap_bytes(memslot);
8f7b79b8
PM
3304 buf = memslot->dirty_bitmap + n / sizeof(long);
3305 memset(buf, 0, n);
82ed3616 3306
8f7b79b8
PM
3307 if (kvm_is_radix(kvm))
3308 r = kvmppc_hv_get_dirty_log_radix(kvm, memslot, buf);
3309 else
3310 r = kvmppc_hv_get_dirty_log_hpt(kvm, memslot, buf);
82ed3616
PM
3311 if (r)
3312 goto out;
3313
8f7b79b8
PM
3314 /* Harvest dirty bits from VPA and DTL updates */
3315 /* Note: we never modify the SLB shadow buffer areas */
3316 kvm_for_each_vcpu(i, vcpu, kvm) {
3317 spin_lock(&vcpu->arch.vpa_update_lock);
3318 kvmppc_harvest_vpa_dirty(&vcpu->arch.vpa, memslot, buf);
3319 kvmppc_harvest_vpa_dirty(&vcpu->arch.dtl, memslot, buf);
3320 spin_unlock(&vcpu->arch.vpa_update_lock);
3321 }
3322
82ed3616 3323 r = -EFAULT;
8f7b79b8 3324 if (copy_to_user(log->dirty_bitmap, buf, n))
82ed3616
PM
3325 goto out;
3326
3327 r = 0;
3328out:
3329 mutex_unlock(&kvm->slots_lock);
3330 return r;
3331}
3332
3a167bea
AK
3333static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free,
3334 struct kvm_memory_slot *dont)
a66b48c3
PM
3335{
3336 if (!dont || free->arch.rmap != dont->arch.rmap) {
3337 vfree(free->arch.rmap);
3338 free->arch.rmap = NULL;
b2b2f165 3339 }
a66b48c3
PM
3340}
3341
3a167bea
AK
3342static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot,
3343 unsigned long npages)
a66b48c3 3344{
8cf4ecc0
PM
3345 /*
3346 * For now, if radix_enabled() then we only support radix guests,
3347 * and in that case we don't need the rmap array.
3348 */
3349 if (radix_enabled()) {
3350 slot->arch.rmap = NULL;
3351 return 0;
3352 }
3353
a66b48c3
PM
3354 slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap));
3355 if (!slot->arch.rmap)
3356 return -ENOMEM;
aa04b4cc 3357
c77162de
PM
3358 return 0;
3359}
aa04b4cc 3360
3a167bea
AK
3361static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm,
3362 struct kvm_memory_slot *memslot,
09170a49 3363 const struct kvm_userspace_memory_region *mem)
c77162de 3364{
a66b48c3 3365 return 0;
c77162de
PM
3366}
3367
3a167bea 3368static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
09170a49 3369 const struct kvm_userspace_memory_region *mem,
f36f3f28
PB
3370 const struct kvm_memory_slot *old,
3371 const struct kvm_memory_slot *new)
c77162de 3372{
dfe49dbd 3373 unsigned long npages = mem->memory_size >> PAGE_SHIFT;
9f6b8029 3374 struct kvm_memslots *slots;
dfe49dbd
PM
3375 struct kvm_memory_slot *memslot;
3376
a56ee9f8
YX
3377 /*
3378 * If we are making a new memslot, addresses that were
3379 * previously cached as emulated MMIO may no longer be
3380 * emulated MMIO, so invalidate all the cached emulated
3381 * MMIO translations.
3382 */
3383 if (npages)
3384 atomic64_inc(&kvm->arch.mmio_update);
3385
8f7b79b8 3386 if (npages && old->npages && !kvm_is_radix(kvm)) {
dfe49dbd
PM
3387 /*
3388 * If modifying a memslot, reset all the rmap dirty bits.
3389 * If this is a new memslot, we don't need to do anything
3390 * since the rmap array starts out as all zeroes,
3391 * i.e. no pages are dirty.
3392 */
9f6b8029
PB
3393 slots = kvm_memslots(kvm);
3394 memslot = id_to_memslot(slots, mem->slot);
8f7b79b8 3395 kvmppc_hv_get_dirty_log_hpt(kvm, memslot, NULL);
dfe49dbd 3396 }
c77162de
PM
3397}
3398
a0144e2a
PM
3399/*
3400 * Update LPCR values in kvm->arch and in vcores.
3401 * Caller must hold kvm->lock.
3402 */
3403void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask)
3404{
3405 long int i;
3406 u32 cores_done = 0;
3407
3408 if ((kvm->arch.lpcr & mask) == lpcr)
3409 return;
3410
3411 kvm->arch.lpcr = (kvm->arch.lpcr & ~mask) | lpcr;
3412
3413 for (i = 0; i < KVM_MAX_VCORES; ++i) {
3414 struct kvmppc_vcore *vc = kvm->arch.vcores[i];
3415 if (!vc)
3416 continue;
3417 spin_lock(&vc->lock);
3418 vc->lpcr = (vc->lpcr & ~mask) | lpcr;
3419 spin_unlock(&vc->lock);
3420 if (++cores_done >= kvm->arch.online_vcores)
3421 break;
3422 }
3423}
3424
3a167bea
AK
3425static void kvmppc_mmu_destroy_hv(struct kvm_vcpu *vcpu)
3426{
3427 return;
3428}
3429
7a84084c
PM
3430static void kvmppc_setup_partition_table(struct kvm *kvm)
3431{
3432 unsigned long dw0, dw1;
3433
8cf4ecc0
PM
3434 if (!kvm_is_radix(kvm)) {
3435 /* PS field - page size for VRMA */
3436 dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) |
3437 ((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1);
3438 /* HTABSIZE and HTABORG fields */
3439 dw0 |= kvm->arch.sdr1;
7a84084c 3440
8cf4ecc0
PM
3441 /* Second dword as set by userspace */
3442 dw1 = kvm->arch.process_table;
3443 } else {
3444 dw0 = PATB_HR | radix__get_tree_size() |
3445 __pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE;
3446 dw1 = PATB_GR | kvm->arch.process_table;
3447 }
7a84084c
PM
3448
3449 mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
3450}
3451
32fad281 3452static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
c77162de
PM
3453{
3454 int err = 0;
3455 struct kvm *kvm = vcpu->kvm;
c77162de
PM
3456 unsigned long hva;
3457 struct kvm_memory_slot *memslot;
3458 struct vm_area_struct *vma;
a0144e2a 3459 unsigned long lpcr = 0, senc;
c77162de 3460 unsigned long psize, porder;
2c9097e4 3461 int srcu_idx;
c77162de
PM
3462
3463 mutex_lock(&kvm->lock);
31037eca 3464 if (kvm->arch.hpte_setup_done)
c77162de 3465 goto out; /* another vcpu beat us to it */
aa04b4cc 3466
32fad281 3467 /* Allocate hashed page table (if not done already) and reset it */
3f9d4f5a 3468 if (!kvm->arch.hpt.virt) {
aae0777f
DG
3469 int order = KVM_DEFAULT_HPT_ORDER;
3470 struct kvm_hpt_info info;
3471
3472 err = kvmppc_allocate_hpt(&info, order);
3473 /* If we get here, it means userspace didn't specify a
3474 * size explicitly. So, try successively smaller
3475 * sizes if the default failed. */
3476 while ((err == -ENOMEM) && --order >= PPC_MIN_HPT_ORDER)
3477 err = kvmppc_allocate_hpt(&info, order);
3478
3479 if (err < 0) {
32fad281
PM
3480 pr_err("KVM: Couldn't alloc HPT\n");
3481 goto out;
3482 }
aae0777f
DG
3483
3484 kvmppc_set_hpt(kvm, &info);
32fad281
PM
3485 }
3486
c77162de 3487 /* Look up the memslot for guest physical address 0 */
2c9097e4 3488 srcu_idx = srcu_read_lock(&kvm->srcu);
c77162de 3489 memslot = gfn_to_memslot(kvm, 0);
aa04b4cc 3490
c77162de
PM
3491 /* We must have some memory at 0 by now */
3492 err = -EINVAL;
3493 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
2c9097e4 3494 goto out_srcu;
c77162de
PM
3495
3496 /* Look up the VMA for the start of this memory slot */
3497 hva = memslot->userspace_addr;
3498 down_read(&current->mm->mmap_sem);
3499 vma = find_vma(current->mm, hva);
3500 if (!vma || vma->vm_start > hva || (vma->vm_flags & VM_IO))
3501 goto up_out;
3502
3503 psize = vma_kernel_pagesize(vma);
da9d1d7f 3504 porder = __ilog2(psize);
c77162de 3505
c77162de
PM
3506 up_read(&current->mm->mmap_sem);
3507
c17b98cf
PM
3508 /* We can handle 4k, 64k or 16M pages in the VRMA */
3509 err = -EINVAL;
3510 if (!(psize == 0x1000 || psize == 0x10000 ||
3511 psize == 0x1000000))
3512 goto out_srcu;
c77162de 3513
c17b98cf
PM
3514 senc = slb_pgsize_encoding(psize);
3515 kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
3516 (VRMA_VSID << SLB_VSID_SHIFT_1T);
c17b98cf
PM
3517 /* Create HPTEs in the hash page table for the VRMA */
3518 kvmppc_map_vrma(vcpu, memslot, porder);
aa04b4cc 3519
7a84084c
PM
3520 /* Update VRMASD field in the LPCR */
3521 if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
3522 /* the -4 is to account for senc values starting at 0x10 */
3523 lpcr = senc << (LPCR_VRMASD_SH - 4);
3524 kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
3525 } else {
3526 kvmppc_setup_partition_table(kvm);
3527 }
a0144e2a 3528
31037eca 3529 /* Order updates to kvm->arch.lpcr etc. vs. hpte_setup_done */
c77162de 3530 smp_wmb();
31037eca 3531 kvm->arch.hpte_setup_done = 1;
c77162de 3532 err = 0;
2c9097e4
PM
3533 out_srcu:
3534 srcu_read_unlock(&kvm->srcu, srcu_idx);
c77162de
PM
3535 out:
3536 mutex_unlock(&kvm->lock);
3537 return err;
b2b2f165 3538
c77162de
PM
3539 up_out:
3540 up_read(&current->mm->mmap_sem);
505d6421 3541 goto out_srcu;
de56a948
PM
3542}
3543
79b6c247
SW
3544#ifdef CONFIG_KVM_XICS
3545/*
3546 * Allocate a per-core structure for managing state about which cores are
3547 * running in the host versus the guest and for exchanging data between
 3548 * real mode KVM and CPUs running in the host.
3549 * This is only done for the first VM.
3550 * The allocated structure stays even if all VMs have stopped.
3551 * It is only freed when the kvm-hv module is unloaded.
 3552 * It's OK for this routine to fail; we just won't support host
 3553 * core operations like redirecting H_IPI wakeups.
3554 */
3555void kvmppc_alloc_host_rm_ops(void)
3556{
3557 struct kvmppc_host_rm_ops *ops;
3558 unsigned long l_ops;
3559 int cpu, core;
3560 int size;
3561
 3562 /* Not the first time here? */
3563 if (kvmppc_host_rm_ops_hv != NULL)
3564 return;
3565
3566 ops = kzalloc(sizeof(struct kvmppc_host_rm_ops), GFP_KERNEL);
3567 if (!ops)
3568 return;
3569
3570 size = cpu_nr_cores() * sizeof(struct kvmppc_host_rm_core);
3571 ops->rm_core = kzalloc(size, GFP_KERNEL);
3572
3573 if (!ops->rm_core) {
3574 kfree(ops);
3575 return;
3576 }
3577
6f3bb809
SW
3578 get_online_cpus();
3579
79b6c247
SW
3580 for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) {
3581 if (!cpu_online(cpu))
3582 continue;
3583
3584 core = cpu >> threads_shift;
3585 ops->rm_core[core].rm_state.in_host = 1;
3586 }
3587
0c2a6606
SW
3588 ops->vcpu_kick = kvmppc_fast_vcpu_kick_hv;
3589
79b6c247
SW
3590 /*
3591 * Make the contents of the kvmppc_host_rm_ops structure visible
3592 * to other CPUs before we assign it to the global variable.
3593 * Do an atomic assignment (no locks used here), but if someone
3594 * beats us to it, just free our copy and return.
3595 */
3596 smp_wmb();
3597 l_ops = (unsigned long) ops;
3598
3599 if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) {
6f3bb809 3600 put_online_cpus();
79b6c247
SW
3601 kfree(ops->rm_core);
3602 kfree(ops);
6f3bb809 3603 return;
79b6c247 3604 }
6f3bb809 3605
3f7cd919
AMG
3606 cpuhp_setup_state_nocalls(CPUHP_KVM_PPC_BOOK3S_PREPARE,
3607 "ppc/kvm_book3s:prepare",
3608 kvmppc_set_host_core,
3609 kvmppc_clear_host_core);
6f3bb809 3610 put_online_cpus();
79b6c247
SW
3611}
3612
3613void kvmppc_free_host_rm_ops(void)
3614{
3615 if (kvmppc_host_rm_ops_hv) {
3f7cd919 3616 cpuhp_remove_state_nocalls(CPUHP_KVM_PPC_BOOK3S_PREPARE);
79b6c247
SW
3617 kfree(kvmppc_host_rm_ops_hv->rm_core);
3618 kfree(kvmppc_host_rm_ops_hv);
3619 kvmppc_host_rm_ops_hv = NULL;
3620 }
3621}
3622#endif
3623
3a167bea 3624static int kvmppc_core_init_vm_hv(struct kvm *kvm)
de56a948 3625{
32fad281 3626 unsigned long lpcr, lpid;
e23a808b 3627 char buf[32];
8cf4ecc0 3628 int ret;
de56a948 3629
32fad281
PM
3630 /* Allocate the guest's logical partition ID */
3631
3632 lpid = kvmppc_alloc_lpid();
5d226ae5 3633 if ((long)lpid < 0)
32fad281
PM
3634 return -ENOMEM;
3635 kvm->arch.lpid = lpid;
de56a948 3636
79b6c247
SW
3637 kvmppc_alloc_host_rm_ops();
3638
1b400ba0
PM
3639 /*
3640 * Since we don't flush the TLB when tearing down a VM,
3641 * and this lpid might have previously been used,
3642 * make sure we flush on each core before running the new VM.
7c5b06ca
PM
3643 * On POWER9, the tlbie in mmu_partition_table_set_entry()
3644 * does this flush for us.
1b400ba0 3645 */
7c5b06ca
PM
3646 if (!cpu_has_feature(CPU_FTR_ARCH_300))
3647 cpumask_setall(&kvm->arch.need_tlb_flush);
1b400ba0 3648
699a0ea0
PM
3649 /* Start out with the default set of hcalls enabled */
3650 memcpy(kvm->arch.enabled_hcalls, default_enabled_hcalls,
3651 sizeof(kvm->arch.enabled_hcalls));
3652
7a84084c
PM
3653 if (!cpu_has_feature(CPU_FTR_ARCH_300))
3654 kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
aa04b4cc 3655
c17b98cf
PM
3656 /* Init LPCR for virtual RMA mode */
3657 kvm->arch.host_lpid = mfspr(SPRN_LPID);
3658 kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
3659 lpcr &= LPCR_PECE | LPCR_LPES;
3660 lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
3661 LPCR_VPM0 | LPCR_VPM1;
3662 kvm->arch.vrma_slb_v = SLB_VSID_B_1T |
3663 (VRMA_VSID << SLB_VSID_SHIFT_1T);
3664 /* On POWER8 turn on online bit to enable PURR/SPURR */
3665 if (cpu_has_feature(CPU_FTR_ARCH_207S))
3666 lpcr |= LPCR_ONL;
84f7139c
PM
3667 /*
 3668 * On POWER9, the VPM0 bit is reserved (VPM0=1 behaviour is assumed).
 3669 * Set the HVICE bit to enable hypervisor virtualization interrupts.
5af50993
BH
 3670 * Set HEIC to prevent OS interrupts from going to the hypervisor (should
3671 * be unnecessary but better safe than sorry in case we re-enable
3672 * EE in HV mode with this LPCR still set)
84f7139c
PM
3673 */
3674 if (cpu_has_feature(CPU_FTR_ARCH_300)) {
7a84084c 3675 lpcr &= ~LPCR_VPM0;
5af50993
BH
3676 lpcr |= LPCR_HVICE | LPCR_HEIC;
3677
3678 /*
3679 * If xive is enabled, we route 0x500 interrupts directly
3680 * to the guest.
3681 */
3682 if (xive_enabled())
3683 lpcr |= LPCR_LPES;
84f7139c
PM
3684 }
3685
8cf4ecc0
PM
3686 /*
3687 * For now, if the host uses radix, the guest must be radix.
3688 */
3689 if (radix_enabled()) {
3690 kvm->arch.radix = 1;
3691 lpcr &= ~LPCR_VPM1;
3692 lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR;
3693 ret = kvmppc_init_vm_radix(kvm);
3694 if (ret) {
3695 kvmppc_free_lpid(kvm->arch.lpid);
3696 return ret;
3697 }
3698 kvmppc_setup_partition_table(kvm);
3699 }
3700
9e368f29 3701 kvm->arch.lpcr = lpcr;
aa04b4cc 3702
5e985969
DG
3703 /* Initialization for future HPT resizes */
3704 kvm->arch.resize_hpt = NULL;
3705
7c5b06ca
PM
3706 /*
3707 * Work out how many sets the TLB has, for the use of
3708 * the TLB invalidation loop in book3s_hv_rmhandlers.S.
3709 */
8cf4ecc0
PM
3710 if (kvm_is_radix(kvm))
3711 kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX; /* 128 */
3712 else if (cpu_has_feature(CPU_FTR_ARCH_300))
7c5b06ca
PM
3713 kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH; /* 256 */
3714 else if (cpu_has_feature(CPU_FTR_ARCH_207S))
3715 kvm->arch.tlb_sets = POWER8_TLB_SETS; /* 512 */
3716 else
3717 kvm->arch.tlb_sets = POWER7_TLB_SETS; /* 128 */
3718
512691d4 3719 /*
441c19c8
ME
3720 * Track that we now have a HV mode VM active. This blocks secondary
3721 * CPU threads from coming online.
8cf4ecc0
PM
3722 * On POWER9, we only need to do this for HPT guests on a radix
3723 * host, which is not yet supported.
512691d4 3724 */
8cf4ecc0
PM
3725 if (!cpu_has_feature(CPU_FTR_ARCH_300))
3726 kvm_hv_vm_activated();
512691d4 3727
3c313524
PM
3728 /*
3729 * Initialize smt_mode depending on processor.
3730 * POWER8 and earlier have to use "strict" threading, where
3731 * all vCPUs in a vcore have to run on the same (sub)core,
3732 * whereas on POWER9 the threads can each run a different
3733 * guest.
3734 */
3735 if (!cpu_has_feature(CPU_FTR_ARCH_300))
3736 kvm->arch.smt_mode = threads_per_subcore;
3737 else
3738 kvm->arch.smt_mode = 1;
57900694 3739 kvm->arch.emul_smt_mode = 1;
3c313524 3740
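	/*
	 * Userspace side, sketched as an assumption (the KVM_ENABLE_CAP
	 * dispatch lives in generic PPC KVM code, not in this file): a VMM
	 * requests a virtual SMT mode with the KVM_CAP_PPC_SMT VM capability,
	 * which reaches kvmhv_set_smt_mode() through the .set_smt_mode hook
	 * registered below:
	 *
	 *	struct kvm_enable_cap cap = {
	 *		.cap  = KVM_CAP_PPC_SMT,
	 *		.args = { 4, 0 },	(args[0] = requested mode, args[1] = flags)
	 *	};
	 *	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
	 */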
e23a808b
PM
3741 /*
3742 * Create a debugfs directory for the VM
3743 */
3744 snprintf(buf, sizeof(buf), "vm%d", current->pid);
3745 kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir);
3746 if (!IS_ERR_OR_NULL(kvm->arch.debugfs_dir))
3747 kvmppc_mmu_debugfs_init(kvm);
3748
54738c09 3749 return 0;
de56a948
PM
3750}
3751
f1378b1c
PM
3752static void kvmppc_free_vcores(struct kvm *kvm)
3753{
3754 long int i;
3755
23316316 3756 for (i = 0; i < KVM_MAX_VCORES; ++i)
f1378b1c
PM
3757 kfree(kvm->arch.vcores[i]);
3758 kvm->arch.online_vcores = 0;
3759}
3760
3a167bea 3761static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
de56a948 3762{
e23a808b
PM
3763 debugfs_remove_recursive(kvm->arch.debugfs_dir);
3764
8cf4ecc0
PM
3765 if (!cpu_has_feature(CPU_FTR_ARCH_300))
3766 kvm_hv_vm_deactivated();
512691d4 3767
f1378b1c 3768 kvmppc_free_vcores(kvm);
aa04b4cc 3769
8cf4ecc0
PM
3770 kvmppc_free_lpid(kvm->arch.lpid);
3771
5a319350
PM
3772 if (kvm_is_radix(kvm))
3773 kvmppc_free_radix(kvm);
3774 else
aae0777f 3775 kvmppc_free_hpt(&kvm->arch.hpt);
c57875f5
SW
3776
3777 kvmppc_free_pimap(kvm);
de56a948
PM
3778}
3779
3a167bea
AK
3780/* We don't need to emulate any privileged instructions or dcbz */
3781static int kvmppc_core_emulate_op_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
3782 unsigned int inst, int *advance)
de56a948 3783{
3a167bea 3784 return EMULATE_FAIL;
de56a948
PM
3785}
3786
3a167bea
AK
3787static int kvmppc_core_emulate_mtspr_hv(struct kvm_vcpu *vcpu, int sprn,
3788 ulong spr_val)
de56a948
PM
3789{
3790 return EMULATE_FAIL;
3791}
3792
3a167bea
AK
3793static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn,
3794 ulong *spr_val)
de56a948
PM
3795{
3796 return EMULATE_FAIL;
3797}
3798
3a167bea 3799static int kvmppc_core_check_processor_compat_hv(void)
de56a948 3800{
c17b98cf
PM
3801 if (!cpu_has_feature(CPU_FTR_HVMODE) ||
3802 !cpu_has_feature(CPU_FTR_ARCH_206))
3a167bea 3803 return -EIO;
50de596d 3804
3a167bea 3805 return 0;
de56a948
PM
3806}
3807
8daaafc8
SW
3808#ifdef CONFIG_KVM_XICS
3809
3810void kvmppc_free_pimap(struct kvm *kvm)
3811{
3812 kfree(kvm->arch.pimap);
3813}
3814
c57875f5 3815static struct kvmppc_passthru_irqmap *kvmppc_alloc_pimap(void)
8daaafc8
SW
3816{
3817 return kzalloc(sizeof(struct kvmppc_passthru_irqmap), GFP_KERNEL);
3818}
c57875f5
SW
3819
3820static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
3821{
3822 struct irq_desc *desc;
3823 struct kvmppc_irq_map *irq_map;
3824 struct kvmppc_passthru_irqmap *pimap;
3825 struct irq_chip *chip;
5af50993 3826 int i, rc = 0;
c57875f5 3827
644abbb2
SW
3828 if (!kvm_irq_bypass)
3829 return 1;
3830
c57875f5
SW
3831 desc = irq_to_desc(host_irq);
3832 if (!desc)
3833 return -EIO;
3834
3835 mutex_lock(&kvm->lock);
3836
3837 pimap = kvm->arch.pimap;
3838 if (pimap == NULL) {
3839 /* First call, allocate structure to hold IRQ map */
3840 pimap = kvmppc_alloc_pimap();
3841 if (pimap == NULL) {
3842 mutex_unlock(&kvm->lock);
3843 return -ENOMEM;
3844 }
3845 kvm->arch.pimap = pimap;
3846 }
3847
3848 /*
3849 * For now, we only support interrupts for which the EOI operation
 3850 * is an OPAL call followed by a write to XIRR (since that's
5af50993 3851 * what our real-mode EOI code does), or a XIVE interrupt.
c57875f5
SW
3852 */
3853 chip = irq_data_get_irq_chip(&desc->irq_data);
5af50993 3854 if (!chip || !(is_pnv_opal_msi(chip) || is_xive_irq(chip))) {
c57875f5
SW
3855 pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n",
3856 host_irq, guest_gsi);
3857 mutex_unlock(&kvm->lock);
3858 return -ENOENT;
3859 }
3860
3861 /*
3862 * See if we already have an entry for this guest IRQ number.
 3863 * If it's mapped to a hardware IRQ number, that's an error;
 3864 * otherwise, re-use this entry.
3865 */
3866 for (i = 0; i < pimap->n_mapped; i++) {
3867 if (guest_gsi == pimap->mapped[i].v_hwirq) {
3868 if (pimap->mapped[i].r_hwirq) {
3869 mutex_unlock(&kvm->lock);
3870 return -EINVAL;
3871 }
3872 break;
3873 }
3874 }
3875
3876 if (i == KVMPPC_PIRQ_MAPPED) {
3877 mutex_unlock(&kvm->lock);
3878 return -EAGAIN; /* table is full */
3879 }
3880
3881 irq_map = &pimap->mapped[i];
3882
3883 irq_map->v_hwirq = guest_gsi;
c57875f5
SW
3884 irq_map->desc = desc;
3885
e3c13e56
SW
3886 /*
3887 * Order the above two stores before the next to serialize with
3888 * the KVM real mode handler.
3889 */
3890 smp_wmb();
3891 irq_map->r_hwirq = desc->irq_data.hwirq;
3892
c57875f5
SW
3893 if (i == pimap->n_mapped)
3894 pimap->n_mapped++;
3895
5af50993
BH
3896 if (xive_enabled())
3897 rc = kvmppc_xive_set_mapped(kvm, guest_gsi, desc);
3898 else
3899 kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq);
3900 if (rc)
3901 irq_map->r_hwirq = 0;
5d375199 3902
c57875f5
SW
3903 mutex_unlock(&kvm->lock);
3904
3905 return 0;
3906}
3907
3908static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
3909{
3910 struct irq_desc *desc;
3911 struct kvmppc_passthru_irqmap *pimap;
5af50993 3912 int i, rc = 0;
c57875f5 3913
644abbb2
SW
3914 if (!kvm_irq_bypass)
3915 return 0;
3916
c57875f5
SW
3917 desc = irq_to_desc(host_irq);
3918 if (!desc)
3919 return -EIO;
3920
3921 mutex_lock(&kvm->lock);
a1c52e1c
ME
3922 if (!kvm->arch.pimap)
3923 goto unlock;
c57875f5 3924
c57875f5
SW
3925 pimap = kvm->arch.pimap;
3926
3927 for (i = 0; i < pimap->n_mapped; i++) {
3928 if (guest_gsi == pimap->mapped[i].v_hwirq)
3929 break;
3930 }
3931
3932 if (i == pimap->n_mapped) {
3933 mutex_unlock(&kvm->lock);
3934 return -ENODEV;
3935 }
3936
5af50993
BH
3937 if (xive_enabled())
3938 rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, pimap->mapped[i].desc);
3939 else
3940 kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq);
5d375199 3941
5af50993 3942 /* invalidate the entry (what to do on error from the above?) */
c57875f5
SW
3943 pimap->mapped[i].r_hwirq = 0;
3944
3945 /*
3946 * We don't free this structure even when the count goes to
3947 * zero. The structure is freed when we destroy the VM.
3948 */
a1c52e1c 3949 unlock:
c57875f5 3950 mutex_unlock(&kvm->lock);
5af50993 3951 return rc;
c57875f5
SW
3952}
3953
3954static int kvmppc_irq_bypass_add_producer_hv(struct irq_bypass_consumer *cons,
3955 struct irq_bypass_producer *prod)
3956{
3957 int ret = 0;
3958 struct kvm_kernel_irqfd *irqfd =
3959 container_of(cons, struct kvm_kernel_irqfd, consumer);
3960
3961 irqfd->producer = prod;
3962
3963 ret = kvmppc_set_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
3964 if (ret)
3965 pr_info("kvmppc_set_passthru_irq (irq %d, gsi %d) fails: %d\n",
3966 prod->irq, irqfd->gsi, ret);
3967
3968 return ret;
3969}
3970
3971static void kvmppc_irq_bypass_del_producer_hv(struct irq_bypass_consumer *cons,
3972 struct irq_bypass_producer *prod)
3973{
3974 int ret;
3975 struct kvm_kernel_irqfd *irqfd =
3976 container_of(cons, struct kvm_kernel_irqfd, consumer);
3977
3978 irqfd->producer = NULL;
3979
3980 /*
 3981 * When the producer for this consumer is unregistered, we change
 3982 * back to the default external interrupt handling mode - KVM real
 3983 * mode will hand interrupts back to the host.
3984 */
3985 ret = kvmppc_clr_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
3986 if (ret)
3987 pr_warn("kvmppc_clr_passthru_irq (irq %d, gsi %d) fails: %d\n",
3988 prod->irq, irqfd->gsi, ret);
3989}
8daaafc8
SW
3990#endif
3991
3a167bea
AK
3992static long kvm_arch_vm_ioctl_hv(struct file *filp,
3993 unsigned int ioctl, unsigned long arg)
3994{
3995 struct kvm *kvm __maybe_unused = filp->private_data;
3996 void __user *argp = (void __user *)arg;
3997 long r;
3998
3999 switch (ioctl) {
4000
3a167bea
AK
4001 case KVM_PPC_ALLOCATE_HTAB: {
4002 u32 htab_order;
4003
4004 r = -EFAULT;
4005 if (get_user(htab_order, (u32 __user *)argp))
4006 break;
f98a8bf9 4007 r = kvmppc_alloc_reset_hpt(kvm, htab_order);
3a167bea
AK
4008 if (r)
4009 break;
3a167bea
AK
4010 r = 0;
4011 break;
4012 }
4013
4014 case KVM_PPC_GET_HTAB_FD: {
4015 struct kvm_get_htab_fd ghf;
4016
4017 r = -EFAULT;
4018 if (copy_from_user(&ghf, argp, sizeof(ghf)))
4019 break;
4020 r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf);
4021 break;
4022 }
4023
5e985969
DG
4024 case KVM_PPC_RESIZE_HPT_PREPARE: {
4025 struct kvm_ppc_resize_hpt rhpt;
4026
4027 r = -EFAULT;
4028 if (copy_from_user(&rhpt, argp, sizeof(rhpt)))
4029 break;
4030
4031 r = kvm_vm_ioctl_resize_hpt_prepare(kvm, &rhpt);
4032 break;
4033 }
4034
4035 case KVM_PPC_RESIZE_HPT_COMMIT: {
4036 struct kvm_ppc_resize_hpt rhpt;
4037
4038 r = -EFAULT;
4039 if (copy_from_user(&rhpt, argp, sizeof(rhpt)))
4040 break;
4041
4042 r = kvm_vm_ioctl_resize_hpt_commit(kvm, &rhpt);
4043 break;
4044 }
4045
3a167bea
AK
4046 default:
4047 r = -ENOTTY;
4048 }
4049
4050 return r;
4051}
4052
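/*
 * Userspace side of KVM_PPC_ALLOCATE_HTAB, sketched under the assumption
 * that the VMM simply passes a pointer to the desired HPT order (log2 of
 * the HPT size in bytes), matching the get_user() in the case above:
 *
 *	__u32 order = 28;		(2^28 bytes = 256 MB HPT)
 *	if (ioctl(vm_fd, KVM_PPC_ALLOCATE_HTAB, &order) < 0)
 *		perror("KVM_PPC_ALLOCATE_HTAB");
 */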
699a0ea0
PM
4053/*
4054 * List of hcall numbers to enable by default.
4055 * For compatibility with old userspace, we enable by default
4056 * all hcalls that were implemented before the hcall-enabling
4057 * facility was added. Note this list should not include H_RTAS.
4058 */
4059static unsigned int default_hcall_list[] = {
4060 H_REMOVE,
4061 H_ENTER,
4062 H_READ,
4063 H_PROTECT,
4064 H_BULK_REMOVE,
4065 H_GET_TCE,
4066 H_PUT_TCE,
4067 H_SET_DABR,
4068 H_SET_XDABR,
4069 H_CEDE,
4070 H_PROD,
4071 H_CONFER,
4072 H_REGISTER_VPA,
4073#ifdef CONFIG_KVM_XICS
4074 H_EOI,
4075 H_CPPR,
4076 H_IPI,
4077 H_IPOLL,
4078 H_XIRR,
4079 H_XIRR_X,
4080#endif
4081 0
4082};
4083
4084static void init_default_hcalls(void)
4085{
4086 int i;
ae2113a4 4087 unsigned int hcall;
699a0ea0 4088
ae2113a4
PM
4089 for (i = 0; default_hcall_list[i]; ++i) {
4090 hcall = default_hcall_list[i];
4091 WARN_ON(!kvmppc_hcall_impl_hv(hcall));
4092 __set_bit(hcall / 4, default_enabled_hcalls);
4093 }
699a0ea0
PM
4094}
4095
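/*
 * The bitmap above is indexed by hcall number / 4 because PAPR hcall
 * tokens are multiples of 4.  Userspace sketch (an assumption about the
 * generic enable-cap path, not code from this file): hcalls outside the
 * default list can be enabled per VM with KVM_CAP_PPC_ENABLE_HCALL:
 *
 *	struct kvm_enable_cap cap = {
 *		.cap  = KVM_CAP_PPC_ENABLE_HCALL,
 *		.args = { H_LOGICAL_CI_LOAD, 1 },	(args[0] = hcall, args[1] = enable)
 *	};
 *	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
 */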
c9270132
PM
4096static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
4097{
468808bd 4098 unsigned long lpcr;
8cf4ecc0 4099 int radix;
468808bd
PM
4100
4101 /* If not on a POWER9, reject it */
4102 if (!cpu_has_feature(CPU_FTR_ARCH_300))
4103 return -ENODEV;
4104
4105 /* If any unknown flags set, reject it */
4106 if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE))
4107 return -EINVAL;
4108
8cf4ecc0
PM
4109 /* We can't change a guest to/from radix yet */
4110 radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
4111 if (radix != kvm_is_radix(kvm))
468808bd
PM
4112 return -EINVAL;
4113
4114 /* GR (guest radix) bit in process_table field must match */
8cf4ecc0 4115 if (!!(cfg->process_table & PATB_GR) != radix)
468808bd
PM
4116 return -EINVAL;
4117
4118 /* Process table size field must be reasonable, i.e. <= 24 */
4119 if ((cfg->process_table & PRTS_MASK) > 24)
4120 return -EINVAL;
4121
4122 kvm->arch.process_table = cfg->process_table;
4123 kvmppc_setup_partition_table(kvm);
4124
4125 lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0;
4126 kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE);
4127
4128 return 0;
c9270132
PM
4129}
4130
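/*
 * Userspace side, sketched as an assumption (the ioctl plumbing that
 * calls .configure_mmu lives in generic PPC KVM code): a radix guest's
 * VMM registers its process table roughly like this, with PATB_GR set
 * to match the KVM_PPC_MMUV3_RADIX flag as required above; proc_tbl_base
 * and prts are placeholder values, and the PRTS field must be <= 24:
 *
 *	struct kvm_ppc_mmuv3_cfg cfg = {
 *		.flags		= KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE,
 *		.process_table	= PATB_GR | proc_tbl_base | prts,
 *	};
 *	ioctl(vm_fd, KVM_PPC_CONFIGURE_V3_MMU, &cfg);
 */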
cbbc58d4 4131static struct kvmppc_ops kvm_ops_hv = {
3a167bea
AK
4132 .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
4133 .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
4134 .get_one_reg = kvmppc_get_one_reg_hv,
4135 .set_one_reg = kvmppc_set_one_reg_hv,
4136 .vcpu_load = kvmppc_core_vcpu_load_hv,
4137 .vcpu_put = kvmppc_core_vcpu_put_hv,
4138 .set_msr = kvmppc_set_msr_hv,
4139 .vcpu_run = kvmppc_vcpu_run_hv,
4140 .vcpu_create = kvmppc_core_vcpu_create_hv,
4141 .vcpu_free = kvmppc_core_vcpu_free_hv,
4142 .check_requests = kvmppc_core_check_requests_hv,
4143 .get_dirty_log = kvm_vm_ioctl_get_dirty_log_hv,
4144 .flush_memslot = kvmppc_core_flush_memslot_hv,
4145 .prepare_memory_region = kvmppc_core_prepare_memory_region_hv,
4146 .commit_memory_region = kvmppc_core_commit_memory_region_hv,
4147 .unmap_hva = kvm_unmap_hva_hv,
4148 .unmap_hva_range = kvm_unmap_hva_range_hv,
4149 .age_hva = kvm_age_hva_hv,
4150 .test_age_hva = kvm_test_age_hva_hv,
4151 .set_spte_hva = kvm_set_spte_hva_hv,
4152 .mmu_destroy = kvmppc_mmu_destroy_hv,
4153 .free_memslot = kvmppc_core_free_memslot_hv,
4154 .create_memslot = kvmppc_core_create_memslot_hv,
4155 .init_vm = kvmppc_core_init_vm_hv,
4156 .destroy_vm = kvmppc_core_destroy_vm_hv,
3a167bea
AK
4157 .get_smmu_info = kvm_vm_ioctl_get_smmu_info_hv,
4158 .emulate_op = kvmppc_core_emulate_op_hv,
4159 .emulate_mtspr = kvmppc_core_emulate_mtspr_hv,
4160 .emulate_mfspr = kvmppc_core_emulate_mfspr_hv,
4161 .fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv,
4162 .arch_vm_ioctl = kvm_arch_vm_ioctl_hv,
ae2113a4 4163 .hcall_implemented = kvmppc_hcall_impl_hv,
c57875f5
SW
4164#ifdef CONFIG_KVM_XICS
4165 .irq_bypass_add_producer = kvmppc_irq_bypass_add_producer_hv,
4166 .irq_bypass_del_producer = kvmppc_irq_bypass_del_producer_hv,
4167#endif
c9270132
PM
4168 .configure_mmu = kvmhv_configure_mmu,
4169 .get_rmmu_info = kvmhv_get_rmmu_info,
3c313524 4170 .set_smt_mode = kvmhv_set_smt_mode,
3a167bea
AK
4171};
4172
fd7bacbc
MS
4173static int kvm_init_subcore_bitmap(void)
4174{
4175 int i, j;
4176 int nr_cores = cpu_nr_cores();
4177 struct sibling_subcore_state *sibling_subcore_state;
4178
4179 for (i = 0; i < nr_cores; i++) {
4180 int first_cpu = i * threads_per_core;
4181 int node = cpu_to_node(first_cpu);
4182
4183 /* Ignore if it is already allocated. */
4184 if (paca[first_cpu].sibling_subcore_state)
4185 continue;
4186
4187 sibling_subcore_state =
4188 kmalloc_node(sizeof(struct sibling_subcore_state),
4189 GFP_KERNEL, node);
4190 if (!sibling_subcore_state)
4191 return -ENOMEM;
4192
4193 memset(sibling_subcore_state, 0,
4194 sizeof(struct sibling_subcore_state));
4195
4196 for (j = 0; j < threads_per_core; j++) {
4197 int cpu = first_cpu + j;
4198
4199 paca[cpu].sibling_subcore_state = sibling_subcore_state;
4200 }
4201 }
4202 return 0;
4203}
4204
5a319350
PM
4205static int kvmppc_radix_possible(void)
4206{
4207 return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled();
4208}
4209
3a167bea 4210static int kvmppc_book3s_init_hv(void)
de56a948
PM
4211{
4212 int r;
cbbc58d4
AK
4213 /*
 4214 * FIXME!! Do we need to check on all cpus?
4215 */
4216 r = kvmppc_core_check_processor_compat_hv();
4217 if (r < 0)
739e2425 4218 return -ENODEV;
de56a948 4219
fd7bacbc
MS
4220 r = kvm_init_subcore_bitmap();
4221 if (r)
4222 return r;
4223
f725758b
PM
4224 /*
4225 * We need a way of accessing the XICS interrupt controller,
4226 * either directly, via paca[cpu].kvm_hstate.xics_phys, or
4227 * indirectly, via OPAL.
4228 */
4229#ifdef CONFIG_SMP
fb7dcf72 4230 if (!xive_enabled() && !local_paca->kvm_hstate.xics_phys) {
f725758b
PM
4231 struct device_node *np;
4232
4233 np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc");
4234 if (!np) {
4235 pr_err("KVM-HV: Cannot determine method for accessing XICS\n");
4236 return -ENODEV;
4237 }
4238 }
4239#endif
4240
cbbc58d4
AK
4241 kvm_ops_hv.owner = THIS_MODULE;
4242 kvmppc_hv_ops = &kvm_ops_hv;
de56a948 4243
699a0ea0
PM
4244 init_default_hcalls();
4245
ec257165
PM
4246 init_vcore_lists();
4247
cbbc58d4 4248 r = kvmppc_mmu_hv_init();
5a319350
PM
4249 if (r)
4250 return r;
4251
4252 if (kvmppc_radix_possible())
4253 r = kvmppc_radix_init();
de56a948
PM
4254 return r;
4255}
4256
3a167bea 4257static void kvmppc_book3s_exit_hv(void)
de56a948 4258{
79b6c247 4259 kvmppc_free_host_rm_ops();
5a319350
PM
4260 if (kvmppc_radix_possible())
4261 kvmppc_radix_exit();
cbbc58d4 4262 kvmppc_hv_ops = NULL;
de56a948
PM
4263}
4264
3a167bea
AK
4265module_init(kvmppc_book3s_init_hv);
4266module_exit(kvmppc_book3s_exit_hv);
2ba9f0d8 4267MODULE_LICENSE("GPL");
398a76c6
AG
4268MODULE_ALIAS_MISCDEV(KVM_MINOR);
4269MODULE_ALIAS("devname:kvm");
7c5b06ca 4270