Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 773b35d16a0b61ddc3b13f02fd8a7eaca9d4976b..0b436df746fcb094d8bd8b3c928c0862d6a90df5 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -46,6 +46,8 @@
 #include <linux/of.h>
 
 #include <asm/reg.h>
+#include <asm/ppc-opcode.h>
+#include <asm/disassemble.h>
 #include <asm/cputable.h>
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
@@ -645,6 +647,7 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
        unsigned long stolen;
        unsigned long core_stolen;
        u64 now;
+       unsigned long flags;
 
        dt = vcpu->arch.dtl_ptr;
        vpa = vcpu->arch.vpa.pinned_addr;
@@ -652,10 +655,10 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
        core_stolen = vcore_stolen_time(vc, now);
        stolen = core_stolen - vcpu->arch.stolen_logged;
        vcpu->arch.stolen_logged = core_stolen;
-       spin_lock_irq(&vcpu->arch.tbacct_lock);
+       spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
        stolen += vcpu->arch.busy_stolen;
        vcpu->arch.busy_stolen = 0;
-       spin_unlock_irq(&vcpu->arch.tbacct_lock);
+       spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
        if (!dt || !vpa)
                return;
        memset(dt, 0, sizeof(struct dtl_entry));
@@ -675,6 +678,26 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
        vcpu->arch.dtl.dirty = true;
 }
 
+/* See if there is a doorbell interrupt pending for a vcpu */
+static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu)
+{
+       int thr;
+       struct kvmppc_vcore *vc;
+
+       if (vcpu->arch.doorbell_request)
+               return true;
+       /*
+        * Ensure that the read of vcore->dpdes comes after the read
+        * of vcpu->doorbell_request.  This barrier matches the
+        * lwsync in book3s_hv_rmhandlers.S just before the
+        * fast_guest_return label.
+        */
+       smp_rmb();
+       vc = vcpu->arch.vcore;
+       thr = vcpu->vcpu_id - vc->first_vcpuid;
+       return !!(vc->dpdes & (1 << thr));
+}
+
 static bool kvmppc_power8_compatible(struct kvm_vcpu *vcpu)
 {
        if (vcpu->arch.vcore->arch_compat >= PVR_ARCH_207)
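The smp_rmb() in kvmppc_doorbell_pending() above is the read side of a classic message-passing pattern. A minimal sketch of the pairing it relies on, written as C purely for illustration (the actual write side is the lwsync sequence in book3s_hv_rmhandlers.S, so the helper below is hypothetical):

```c
/*
 * Hypothetical C rendering of the write side: on guest entry, a
 * pending request is folded into the vcore's DPDES image *before*
 * the per-vcpu flag is cleared.  A reader that finds
 * doorbell_request already clear is then guaranteed to observe the
 * DPDES bit, matching the smp_rmb() in kvmppc_doorbell_pending().
 */
static void fold_doorbell_into_dpdes(struct kvmppc_vcore *vc,
				     struct kvm_vcpu *vcpu, int thr)
{
	vc->dpdes |= 1ul << thr;		/* publish the doorbell */
	smp_wmb();				/* stands in for the lwsync */
	vcpu->arch.doorbell_request = 0;	/* then retire the request */
}
```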
@@ -926,6 +949,101 @@ static int kvmppc_emulate_debug_inst(struct kvm_run *run,
        }
 }
 
+static void do_nothing(void *x)
+{
+}
+
+static unsigned long kvmppc_read_dpdes(struct kvm_vcpu *vcpu)
+{
+       int thr, cpu, pcpu, nthreads;
+       struct kvm_vcpu *v;
+       unsigned long dpdes;
+
+       nthreads = vcpu->kvm->arch.emul_smt_mode;
+       dpdes = 0;
+       cpu = vcpu->vcpu_id & ~(nthreads - 1);
+       for (thr = 0; thr < nthreads; ++thr, ++cpu) {
+               v = kvmppc_find_vcpu(vcpu->kvm, cpu);
+               if (!v)
+                       continue;
+               /*
+                * If the vcpu is currently running on a physical cpu thread,
+                * interrupt it in order to pull it out of the guest briefly,
+                * which will update its vcore->dpdes value.
+                */
+               pcpu = READ_ONCE(v->cpu);
+               if (pcpu >= 0)
+                       smp_call_function_single(pcpu, do_nothing, NULL, 1);
+               if (kvmppc_doorbell_pending(v))
+                       dpdes |= 1 << thr;
+       }
+       return dpdes;
+}
+
+/*
+ * On POWER9, emulate doorbell-related instructions in order to
+ * give the guest the illusion of running on a multi-threaded core.
+ * The instructions emulated are msgsndp, msgclrp, mfspr TIR,
+ * and mfspr DPDES.
+ */
+static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu)
+{
+       u32 inst, rb, thr;
+       unsigned long arg;
+       struct kvm *kvm = vcpu->kvm;
+       struct kvm_vcpu *tvcpu;
+
+       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+               return EMULATE_FAIL;
+       if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &inst) != EMULATE_DONE)
+               return RESUME_GUEST;
+       if (get_op(inst) != 31)
+               return EMULATE_FAIL;
+       rb = get_rb(inst);
+       thr = vcpu->vcpu_id & (kvm->arch.emul_smt_mode - 1);
+       switch (get_xop(inst)) {
+       case OP_31_XOP_MSGSNDP:
+               arg = kvmppc_get_gpr(vcpu, rb);
+               if (((arg >> 27) & 0xf) != PPC_DBELL_SERVER)
+                       break;
+               arg &= 0x3f;
+               if (arg >= kvm->arch.emul_smt_mode)
+                       break;
+               tvcpu = kvmppc_find_vcpu(kvm, vcpu->vcpu_id - thr + arg);
+               if (!tvcpu)
+                       break;
+               if (!tvcpu->arch.doorbell_request) {
+                       tvcpu->arch.doorbell_request = 1;
+                       kvmppc_fast_vcpu_kick_hv(tvcpu);
+               }
+               break;
+       case OP_31_XOP_MSGCLRP:
+               arg = kvmppc_get_gpr(vcpu, rb);
+               if (((arg >> 27) & 0xf) != PPC_DBELL_SERVER)
+                       break;
+               vcpu->arch.vcore->dpdes = 0;
+               vcpu->arch.doorbell_request = 0;
+               break;
+       case OP_31_XOP_MFSPR:
+               switch (get_sprn(inst)) {
+               case SPRN_TIR:
+                       arg = thr;
+                       break;
+               case SPRN_DPDES:
+                       arg = kvmppc_read_dpdes(vcpu);
+                       break;
+               default:
+                       return EMULATE_FAIL;
+               }
+               kvmppc_set_gpr(vcpu, get_rt(inst), arg);
+               break;
+       default:
+               return EMULATE_FAIL;
+       }
+       kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4);
+       return RESUME_GUEST;
+}
+
 static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                 struct task_struct *tsk)
 {
@@ -971,15 +1089,20 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
                r = RESUME_GUEST;
                break;
        case BOOK3S_INTERRUPT_MACHINE_CHECK:
-               /*
-                * Deliver a machine check interrupt to the guest.
-                * We have to do this, even if the host has handled the
-                * machine check, because machine checks use SRR0/1 and
-                * the interrupt might have trashed guest state in them.
-                */
-               kvmppc_book3s_queue_irqprio(vcpu,
-                                           BOOK3S_INTERRUPT_MACHINE_CHECK);
-               r = RESUME_GUEST;
+               /* Exit to guest with KVM_EXIT_NMI as exit reason */
+               run->exit_reason = KVM_EXIT_NMI;
+               run->hw.hardware_exit_reason = vcpu->arch.trap;
+               /* Clear out the old NMI status from run->flags */
+               run->flags &= ~KVM_RUN_PPC_NMI_DISP_MASK;
+               /* Now set the NMI status */
+               if (vcpu->arch.mce_evt.disposition == MCE_DISPOSITION_RECOVERED)
+                       run->flags |= KVM_RUN_PPC_NMI_DISP_FULLY_RECOV;
+               else
+                       run->flags |= KVM_RUN_PPC_NMI_DISP_NOT_RECOV;
+
+               r = RESUME_HOST;
+               /* Print the MCE event to host console. */
+               machine_check_print_event_info(&vcpu->arch.mce_evt, false);
                break;
        case BOOK3S_INTERRUPT_PROGRAM:
        {
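With the hunk above, machine checks are no longer reflected straight back into the guest; the vcpu thread returns to userspace with KVM_EXIT_NMI and a disposition flag. A sketch of the consumer side (only the flag names come from this series; the handler itself is illustrative):

```c
/* Hypothetical VMM-side handling of the new exit. */
static void handle_nmi_exit(struct kvm_run *run)
{
	switch (run->flags & KVM_RUN_PPC_NMI_DISP_MASK) {
	case KVM_RUN_PPC_NMI_DISP_FULLY_RECOV:
		/* host recovered the error: inject a recovered MCE */
		break;
	default:
		/* KVM_RUN_PPC_NMI_DISP_NOT_RECOV: treat the guest as
		 * unrecoverable, e.g. stop or checkstop it */
		break;
	}
}
```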
@@ -1048,12 +1171,19 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
                break;
        /*
         * This occurs if the guest (kernel or userspace) does something that
-        * is prohibited by HFSCR.  We just generate a program interrupt to
-        * the guest.
+        * is prohibited by HFSCR.
+        * On POWER9, this could be a doorbell instruction that we need
+        * to emulate.
+        * Otherwise, we just generate a program interrupt to the guest.
         */
        case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
-               kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
-               r = RESUME_GUEST;
+               r = EMULATE_FAIL;
+               if ((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG)
+                       r = kvmppc_emulate_doorbell_instr(vcpu);
+               if (r == EMULATE_FAIL) {
+                       kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
+                       r = RESUME_GUEST;
+               }
                break;
        case BOOK3S_INTERRUPT_HV_RM_HARD:
                r = RESUME_PASSTHROUGH;
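The (vcpu->arch.hfscr >> 56) test in the H_FAC_UNAVAIL case reads the interrupt-cause field that hardware deposits in the top byte of HFSCR; FSCR_MSGP_LG is the bit number of the message-send facility, and HFSCR_MSGP (cleared in the vcpu-create hunk below) is the corresponding enable bit. A sketch:

```c
/*
 * Sketch only: on a Hypervisor Facility Unavailable interrupt, the
 * top byte of HFSCR holds the bit number of the offending facility,
 * so comparing against FSCR_MSGP_LG identifies doorbell instructions.
 */
static bool hfac_trap_was_doorbell(struct kvm_vcpu *vcpu)
{
	u8 cause = vcpu->arch.hfscr >> 56;	/* interrupt cause field */

	return cause == FSCR_MSGP_LG;
}
```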
@@ -1143,6 +1273,12 @@ static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr,
        mask = LPCR_DPFD | LPCR_ILE | LPCR_TC;
        if (cpu_has_feature(CPU_FTR_ARCH_207S))
                mask |= LPCR_AIL;
+       /*
+        * On POWER9, allow userspace to enable large decrementer for the
+        * guest, whether or not the host has it enabled.
+        */
+       if (cpu_has_feature(CPU_FTR_ARCH_300))
+               mask |= LPCR_LD;
 
        /* Broken 32-bit version of LPCR must not clear top bits */
        if (preserve_top32)
@@ -1611,7 +1747,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
        init_swait_queue_head(&vcore->wq);
        vcore->preempt_tb = TB_NIL;
        vcore->lpcr = kvm->arch.lpcr;
-       vcore->first_vcpuid = core * threads_per_vcore();
+       vcore->first_vcpuid = core * kvm->arch.smt_mode;
        vcore->kvm = kvm;
        INIT_LIST_HEAD(&vcore->preempt_list);
 
@@ -1770,14 +1906,10 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
                                                   unsigned int id)
 {
        struct kvm_vcpu *vcpu;
-       int err = -EINVAL;
+       int err;
        int core;
        struct kvmppc_vcore *vcore;
 
-       core = id / threads_per_vcore();
-       if (core >= KVM_MAX_VCORES)
-               goto out;
-
        err = -ENOMEM;
        vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
        if (!vcpu)
@@ -1808,6 +1940,20 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
        vcpu->arch.busy_preempt = TB_NIL;
        vcpu->arch.intr_msr = MSR_SF | MSR_ME;
 
+       /*
+        * Set the default HFSCR for the guest from the host value.
+        * This value is only used on POWER9.
+        * On POWER9 DD1, TM doesn't work, so we make sure to
+        * prevent the guest from using it.
+        * On POWER9, we want to virtualize the doorbell facility, so we
+        * turn off the HFSCR bit, which causes those instructions to trap.
+        */
+       vcpu->arch.hfscr = mfspr(SPRN_HFSCR);
+       if (!cpu_has_feature(CPU_FTR_TM))
+               vcpu->arch.hfscr &= ~HFSCR_TM;
+       if (cpu_has_feature(CPU_FTR_ARCH_300))
+               vcpu->arch.hfscr &= ~HFSCR_MSGP;
+
        kvmppc_mmu_book3s_hv_init(vcpu);
 
        vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
@@ -1815,11 +1961,17 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
        init_waitqueue_head(&vcpu->arch.cpu_run);
 
        mutex_lock(&kvm->lock);
-       vcore = kvm->arch.vcores[core];
-       if (!vcore) {
-               vcore = kvmppc_vcore_create(kvm, core);
-               kvm->arch.vcores[core] = vcore;
-               kvm->arch.online_vcores++;
+       vcore = NULL;
+       err = -EINVAL;
+       core = id / kvm->arch.smt_mode;
+       if (core < KVM_MAX_VCORES) {
+               vcore = kvm->arch.vcores[core];
+               if (!vcore) {
+                       err = -ENOMEM;
+                       vcore = kvmppc_vcore_create(kvm, core);
+                       kvm->arch.vcores[core] = vcore;
+                       kvm->arch.online_vcores++;
+               }
        }
        mutex_unlock(&kvm->lock);
 
@@ -1847,6 +1999,43 @@ out:
        return ERR_PTR(err);
 }
 
+static int kvmhv_set_smt_mode(struct kvm *kvm, unsigned long smt_mode,
+                             unsigned long flags)
+{
+       int err;
+       int esmt = 0;
+
+       if (flags)
+               return -EINVAL;
+       if (smt_mode > MAX_SMT_THREADS || !is_power_of_2(smt_mode))
+               return -EINVAL;
+       if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+               /*
+                * On POWER8 (or POWER7), the threading mode is "strict",
+                * so we pack smt_mode vcpus per vcore.
+                */
+               if (smt_mode > threads_per_subcore)
+                       return -EINVAL;
+       } else {
+               /*
+                * On POWER9, the threading mode is "loose",
+                * so each vcpu gets its own vcore.
+                */
+               esmt = smt_mode;
+               smt_mode = 1;
+       }
+       mutex_lock(&kvm->lock);
+       err = -EBUSY;
+       if (!kvm->arch.online_vcores) {
+               kvm->arch.smt_mode = smt_mode;
+               kvm->arch.emul_smt_mode = esmt;
+               err = 0;
+       }
+       mutex_unlock(&kvm->lock);
+
+       return err;
+}
+
 static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa)
 {
        if (vpa->pinned_addr)
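kvmhv_set_smt_mode() in the hunk above refuses changes once any vcore exists, so the mode must be chosen before vcpus are created. In mainline this hook is reached from KVM_ENABLE_CAP with KVM_CAP_PPC_SMT on the VM fd; assuming that wiring holds for this tree too, the userspace side looks roughly like:

```c
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: ask KVM to present a 4-way SMT core to the guest.  Must be
 * issued before any vcpu is created (the online_vcores check above). */
static int enable_smt4(int vm_fd)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_PPC_SMT,
		.args = { 4, 0 },	/* args[0] = smt_mode, args[1] = flags */
	};

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}
```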
@@ -1897,7 +2086,7 @@ static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
        }
 }
 
-extern void __kvmppc_vcore_entry(void);
+extern int __kvmppc_vcore_entry(void);
 
 static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
                                   struct kvm_vcpu *vcpu)
@@ -1962,10 +2151,6 @@ static void kvmppc_release_hwthread(int cpu)
        tpaca->kvm_hstate.kvm_split_mode = NULL;
 }
 
-static void do_nothing(void *x)
-{
-}
-
 static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
 {
        int i;
@@ -1983,11 +2168,35 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
                        smp_call_function_single(cpu + i, do_nothing, NULL, 1);
 }
 
+static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
+{
+       struct kvm *kvm = vcpu->kvm;
+
+       /*
+        * With radix, the guest can do TLB invalidations itself,
+        * and it could choose to use the local form (tlbiel) if
+        * it is invalidating a translation that has only ever been
+        * used on one vcpu.  However, that doesn't mean it has
+        * only ever been used on one physical cpu, since vcpus
+        * can move around between pcpus.  To cope with this, when
+        * a vcpu moves from one pcpu to another, we need to tell
+        * any vcpus running on the same core as this vcpu previously
+        * ran to flush the TLB.  The TLB is shared between threads,
+        * so we use a single bit in .need_tlb_flush for all 4 threads.
+        */
+       if (vcpu->arch.prev_cpu != pcpu) {
+               if (vcpu->arch.prev_cpu >= 0 &&
+                   cpu_first_thread_sibling(vcpu->arch.prev_cpu) !=
+                   cpu_first_thread_sibling(pcpu))
+                       radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu);
+               vcpu->arch.prev_cpu = pcpu;
+       }
+}
+
 static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
 {
        int cpu;
        struct paca_struct *tpaca;
-       struct kvmppc_vcore *mvc = vc->master_vcore;
        struct kvm *kvm = vc->kvm;
 
        cpu = vc->pcpu;
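The condition in kvmppc_prepare_radix_vcpu() above compares first-thread siblings, so thread-to-thread moves within a core never flush. An illustrative predicate, assuming cpu_first_thread_sibling() reduces to masking off the thread bits:

```c
/*
 * Illustrative only.  With 4 threads per core: prev_cpu 5 -> pcpu 6
 * stays on the core whose first thread is 4, so no flush; prev_cpu 5
 * -> pcpu 9 crosses to the core starting at 8, so the old core's TLB
 * is flushed via radix_flush_cpu().
 */
static bool radix_move_needs_flush(int prev_cpu, int pcpu, int threads_per_core)
{
	int mask = ~(threads_per_core - 1);

	return prev_cpu >= 0 && (prev_cpu & mask) != (pcpu & mask);
}
```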
@@ -1997,36 +2206,16 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
                        vcpu->arch.timer_running = 0;
                }
                cpu += vcpu->arch.ptid;
-               vcpu->cpu = mvc->pcpu;
+               vcpu->cpu = vc->pcpu;
                vcpu->arch.thread_cpu = cpu;
-
-               /*
-                * With radix, the guest can do TLB invalidations itself,
-                * and it could choose to use the local form (tlbiel) if
-                * it is invalidating a translation that has only ever been
-                * used on one vcpu.  However, that doesn't mean it has
-                * only ever been used on one physical cpu, since vcpus
-                * can move around between pcpus.  To cope with this, when
-                * a vcpu moves from one pcpu to another, we need to tell
-                * any vcpus running on the same core as this vcpu previously
-                * ran to flush the TLB.  The TLB is shared between threads,
-                * so we use a single bit in .need_tlb_flush for all 4 threads.
-                */
-               if (kvm_is_radix(kvm) && vcpu->arch.prev_cpu != cpu) {
-                       if (vcpu->arch.prev_cpu >= 0 &&
-                           cpu_first_thread_sibling(vcpu->arch.prev_cpu) !=
-                           cpu_first_thread_sibling(cpu))
-                               radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu);
-                       vcpu->arch.prev_cpu = cpu;
-               }
                cpumask_set_cpu(cpu, &kvm->arch.cpu_in_guest);
        }
        tpaca = &paca[cpu];
        tpaca->kvm_hstate.kvm_vcpu = vcpu;
-       tpaca->kvm_hstate.ptid = cpu - mvc->pcpu;
+       tpaca->kvm_hstate.ptid = cpu - vc->pcpu;
        /* Order stores to hstate.kvm_vcpu etc. before store to kvm_vcore */
        smp_wmb();
-       tpaca->kvm_hstate.kvm_vcore = mvc;
+       tpaca->kvm_hstate.kvm_vcore = vc;
        if (cpu != smp_processor_id())
                kvmppc_ipi_thread(cpu);
 }
@@ -2155,8 +2344,7 @@ struct core_info {
        int             max_subcore_threads;
        int             total_threads;
        int             subcore_threads[MAX_SUBCORES];
-       struct kvm      *subcore_vm[MAX_SUBCORES];
-       struct list_head vcs[MAX_SUBCORES];
+       struct kvmppc_vcore *vc[MAX_SUBCORES];
 };
 
 /*
@@ -2167,17 +2355,12 @@ static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 };
 
 static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc)
 {
-       int sub;
-
        memset(cip, 0, sizeof(*cip));
        cip->n_subcores = 1;
        cip->max_subcore_threads = vc->num_threads;
        cip->total_threads = vc->num_threads;
        cip->subcore_threads[0] = vc->num_threads;
-       cip->subcore_vm[0] = vc->kvm;
-       for (sub = 0; sub < MAX_SUBCORES; ++sub)
-               INIT_LIST_HEAD(&cip->vcs[sub]);
-       list_add_tail(&vc->preempt_list, &cip->vcs[0]);
+       cip->vc[0] = vc;
 }
 
 static bool subcore_config_ok(int n_subcores, int n_threads)
@@ -2197,9 +2380,8 @@ static bool subcore_config_ok(int n_subcores, int n_threads)
        return n_subcores * roundup_pow_of_two(n_threads) <= MAX_SMT_THREADS;
 }
 
-static void init_master_vcore(struct kvmppc_vcore *vc)
+static void init_vcore_to_run(struct kvmppc_vcore *vc)
 {
-       vc->master_vcore = vc;
        vc->entry_exit_map = 0;
        vc->in_guest = 0;
        vc->napping_threads = 0;
@@ -2224,9 +2406,9 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
        ++cip->n_subcores;
        cip->total_threads += vc->num_threads;
        cip->subcore_threads[sub] = vc->num_threads;
-       cip->subcore_vm[sub] = vc->kvm;
-       init_master_vcore(vc);
-       list_move_tail(&vc->preempt_list, &cip->vcs[sub]);
+       cip->vc[sub] = vc;
+       init_vcore_to_run(vc);
+       list_del_init(&vc->preempt_list);
 
        return true;
 }
@@ -2294,6 +2476,18 @@ static void collect_piggybacks(struct core_info *cip, int target_threads)
        spin_unlock(&lp->lock);
 }
 
+static bool recheck_signals(struct core_info *cip)
+{
+       int sub, i;
+       struct kvm_vcpu *vcpu;
+
+       for (sub = 0; sub < cip->n_subcores; ++sub)
+               for_each_runnable_thread(i, vcpu, cip->vc[sub])
+                       if (signal_pending(vcpu->arch.run_task))
+                               return true;
+       return false;
+}
+
 static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
 {
        int still_running = 0, i;
@@ -2331,7 +2525,6 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
                        wake_up(&vcpu->arch.cpu_run);
                }
        }
-       list_del_init(&vc->preempt_list);
        if (!is_master) {
                if (still_running > 0) {
                        kvmppc_vcore_preempt(vc);
@@ -2393,6 +2586,21 @@ static inline int kvmppc_set_host_core(unsigned int cpu)
        return 0;
 }
 
+static void set_irq_happened(int trap)
+{
+       switch (trap) {
+       case BOOK3S_INTERRUPT_EXTERNAL:
+               local_paca->irq_happened |= PACA_IRQ_EE;
+               break;
+       case BOOK3S_INTERRUPT_H_DOORBELL:
+               local_paca->irq_happened |= PACA_IRQ_DBELL;
+               break;
+       case BOOK3S_INTERRUPT_HMI:
+               local_paca->irq_happened |= PACA_IRQ_HMI;
+               break;
+       }
+}
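set_irq_happened() feeds powerpc's lazy interrupt handling: the interrupt that caused the guest exit was taken while the host was hard-disabled, so no host handler has run yet. Recording it in paca->irq_happened makes the later local_irq_enable() replay it. A comment-form sketch of the sequence in kvmppc_run_core() below:

```c
/*
 * Sketch of the flow this enables (see kvmppc_run_core() below):
 *
 *	local_irq_disable();
 *	hard_irq_disable();
 *	trap = __kvmppc_vcore_entry();	// interrupt arrives as a guest exit
 *	set_irq_happened(trap);		// pend it for the lazy-irq code
 *	...
 *	local_irq_enable();		// replays the pended interrupt
 */
```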
+
 /*
  * Run a set of guest threads on a physical core.
  * Called with vc->lock held.
@@ -2403,7 +2611,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
        int i;
        int srcu_idx;
        struct core_info core_info;
-       struct kvmppc_vcore *pvc, *vcnext;
+       struct kvmppc_vcore *pvc;
        struct kvm_split_mode split_info, *sip;
        int split, subcore_size, active;
        int sub;
@@ -2412,6 +2620,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
        int pcpu, thr;
        int target_threads;
        int controlled_threads;
+       int trap;
 
        /*
         * Remove from the list any threads that have a signal pending
@@ -2426,7 +2635,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
        /*
         * Initialize *vc.
         */
-       init_master_vcore(vc);
+       init_vcore_to_run(vc);
        vc->preempt_tb = TB_NIL;
 
        /*
@@ -2463,6 +2672,43 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
        if (vc->num_threads < target_threads)
                collect_piggybacks(&core_info, target_threads);
 
+       /*
+        * On radix, arrange for TLB flushing if necessary.
+        * This has to be done before disabling interrupts since
+        * it uses smp_call_function().
+        */
+       pcpu = smp_processor_id();
+       if (kvm_is_radix(vc->kvm)) {
+               for (sub = 0; sub < core_info.n_subcores; ++sub)
+                       for_each_runnable_thread(i, vcpu, core_info.vc[sub])
+                               kvmppc_prepare_radix_vcpu(vcpu, pcpu);
+       }
+
+       /*
+        * Hard-disable interrupts, and check resched flag and signals.
+        * If we need to reschedule or deliver a signal, clean up
+        * and return without going into the guest(s).
+        */
+       local_irq_disable();
+       hard_irq_disable();
+       if (lazy_irq_pending() || need_resched() ||
+           recheck_signals(&core_info)) {
+               local_irq_enable();
+               vc->vcore_state = VCORE_INACTIVE;
+               /* Unlock all except the primary vcore */
+               for (sub = 1; sub < core_info.n_subcores; ++sub) {
+                       pvc = core_info.vc[sub];
+                       /* Put back on to the preempted vcores list */
+                       kvmppc_vcore_preempt(pvc);
+                       spin_unlock(&pvc->lock);
+               }
+               for (i = 0; i < controlled_threads; ++i)
+                       kvmppc_release_hwthread(pcpu + i);
+               return;
+       }
+
+       kvmppc_clear_host_core(pcpu);
+
        /* Decide on micro-threading (split-core) mode */
        subcore_size = threads_per_subcore;
        cmd_bit = stat_bit = 0;
@@ -2486,13 +2732,10 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
                split_info.ldbar = mfspr(SPRN_LDBAR);
                split_info.subcore_size = subcore_size;
                for (sub = 0; sub < core_info.n_subcores; ++sub)
-                       split_info.master_vcs[sub] =
-                               list_first_entry(&core_info.vcs[sub],
-                                       struct kvmppc_vcore, preempt_list);
+                       split_info.vc[sub] = core_info.vc[sub];
                /* order writes to split_info before kvm_split_mode pointer */
                smp_wmb();
        }
-       pcpu = smp_processor_id();
        for (thr = 0; thr < controlled_threads; ++thr)
                paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip;
 
@@ -2512,32 +2755,29 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
                }
        }
 
-       kvmppc_clear_host_core(pcpu);
-
        /* Start all the threads */
        active = 0;
        for (sub = 0; sub < core_info.n_subcores; ++sub) {
                thr = subcore_thread_map[sub];
                thr0_done = false;
                active |= 1 << thr;
-               list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) {
-                       pvc->pcpu = pcpu + thr;
-                       for_each_runnable_thread(i, vcpu, pvc) {
-                               kvmppc_start_thread(vcpu, pvc);
-                               kvmppc_create_dtl_entry(vcpu, pvc);
-                               trace_kvm_guest_enter(vcpu);
-                               if (!vcpu->arch.ptid)
-                                       thr0_done = true;
-                               active |= 1 << (thr + vcpu->arch.ptid);
-                       }
-                       /*
-                        * We need to start the first thread of each subcore
-                        * even if it doesn't have a vcpu.
-                        */
-                       if (pvc->master_vcore == pvc && !thr0_done)
-                               kvmppc_start_thread(NULL, pvc);
-                       thr += pvc->num_threads;
+               pvc = core_info.vc[sub];
+               pvc->pcpu = pcpu + thr;
+               for_each_runnable_thread(i, vcpu, pvc) {
+                       kvmppc_start_thread(vcpu, pvc);
+                       kvmppc_create_dtl_entry(vcpu, pvc);
+                       trace_kvm_guest_enter(vcpu);
+                       if (!vcpu->arch.ptid)
+                               thr0_done = true;
+                       active |= 1 << (thr + vcpu->arch.ptid);
                }
+               /*
+                * We need to start the first thread of each subcore
+                * even if it doesn't have a vcpu.
+                */
+               if (!thr0_done)
+                       kvmppc_start_thread(NULL, pvc);
+               thr += pvc->num_threads;
        }
 
        /*
@@ -2564,17 +2804,27 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
        trace_kvmppc_run_core(vc, 0);
 
        for (sub = 0; sub < core_info.n_subcores; ++sub)
-               list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list)
-                       spin_unlock(&pvc->lock);
+               spin_unlock(&core_info.vc[sub]->lock);
+
+       /*
+        * Interrupts will be enabled once we get into the guest,
+        * so tell lockdep that we're about to enable interrupts.
+        */
+       trace_hardirqs_on();
 
        guest_enter();
 
        srcu_idx = srcu_read_lock(&vc->kvm->srcu);
 
-       __kvmppc_vcore_entry();
+       trap = __kvmppc_vcore_entry();
 
        srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
 
+       guest_exit();
+
+       trace_hardirqs_off();
+       set_irq_happened(trap);
+
        spin_lock(&vc->lock);
        /* prevent other vcpu threads from doing kvmppc_start_thread() now */
        vc->vcore_state = VCORE_EXITING;
@@ -2602,6 +2852,10 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
                split_info.do_nap = 0;
        }
 
+       kvmppc_set_host_core(pcpu);
+
+       local_irq_enable();
+
        /* Let secondaries go back to the offline loop */
        for (i = 0; i < controlled_threads; ++i) {
                kvmppc_release_hwthread(pcpu + i);
@@ -2610,18 +2864,15 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
                cpumask_clear_cpu(pcpu + i, &vc->kvm->arch.cpu_in_guest);
        }
 
-       kvmppc_set_host_core(pcpu);
-
        spin_unlock(&vc->lock);
 
        /* make sure updates to secondary vcpu structs are visible now */
        smp_mb();
-       guest_exit();
 
-       for (sub = 0; sub < core_info.n_subcores; ++sub)
-               list_for_each_entry_safe(pvc, vcnext, &core_info.vcs[sub],
-                                        preempt_list)
-                       post_guest_process(pvc, pvc == vc);
+       for (sub = 0; sub < core_info.n_subcores; ++sub) {
+               pvc = core_info.vc[sub];
+               post_guest_process(pvc, pvc == vc);
+       }
 
        spin_lock(&vc->lock);
        preempt_enable();
@@ -2666,6 +2917,30 @@ static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
                vc->halt_poll_ns /= halt_poll_ns_shrink;
 }
 
+#ifdef CONFIG_KVM_XICS
+static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
+{
+       if (!xive_enabled())
+               return false;
+       return vcpu->arch.xive_saved_state.pipr <
+               vcpu->arch.xive_saved_state.cppr;
+}
+#else
+static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
+{
+       return false;
+}
+#endif /* CONFIG_KVM_XICS */
+
+static bool kvmppc_vcpu_woken(struct kvm_vcpu *vcpu)
+{
+       if (vcpu->arch.pending_exceptions || vcpu->arch.prodded ||
+           kvmppc_doorbell_pending(vcpu) || xive_interrupt_pending(vcpu))
+               return true;
+
+       return false;
+}
+
 /*
  * Check to see if any of the runnable vcpus on the vcore have pending
  * exceptions or are no longer ceded
@@ -2676,8 +2951,7 @@ static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
        int i;
 
        for_each_runnable_thread(i, vcpu, vc) {
-               if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded ||
-                   vcpu->arch.prodded)
+               if (!vcpu->arch.ceded || kvmppc_vcpu_woken(vcpu))
                        return 1;
        }
 
@@ -2819,15 +3093,14 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
         */
        if (!signal_pending(current)) {
                if (vc->vcore_state == VCORE_PIGGYBACK) {
-                       struct kvmppc_vcore *mvc = vc->master_vcore;
-                       if (spin_trylock(&mvc->lock)) {
-                               if (mvc->vcore_state == VCORE_RUNNING &&
-                                   !VCORE_IS_EXITING(mvc)) {
+                       if (spin_trylock(&vc->lock)) {
+                               if (vc->vcore_state == VCORE_RUNNING &&
+                                   !VCORE_IS_EXITING(vc)) {
                                        kvmppc_create_dtl_entry(vcpu, vc);
                                        kvmppc_start_thread(vcpu, vc);
                                        trace_kvm_guest_enter(vcpu);
                                }
-                               spin_unlock(&mvc->lock);
+                               spin_unlock(&vc->lock);
                        }
                } else if (vc->vcore_state == VCORE_RUNNING &&
                           !VCORE_IS_EXITING(vc)) {
@@ -2863,7 +3136,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
                        break;
                n_ceded = 0;
                for_each_runnable_thread(i, v, vc) {
-                       if (!v->arch.pending_exceptions && !v->arch.prodded)
+                       if (!kvmppc_vcpu_woken(v))
                                n_ceded += v->arch.ceded;
                        else
                                v->arch.ceded = 0;
@@ -3518,6 +3791,19 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
        if (!cpu_has_feature(CPU_FTR_ARCH_300))
                kvm_hv_vm_activated();
 
+       /*
+        * Initialize smt_mode depending on processor.
+        * POWER8 and earlier have to use "strict" threading, where
+        * all vCPUs in a vcore have to run on the same (sub)core,
+        * whereas on POWER9 the threads can each run a different
+        * guest.
+        */
+       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+               kvm->arch.smt_mode = threads_per_subcore;
+       else
+               kvm->arch.smt_mode = 1;
+       kvm->arch.emul_smt_mode = 1;
+
        /*
         * Create a debugfs directory for the VM
         */
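Together with kvmhv_set_smt_mode(), these defaults give two regimes: on POWER8, smt_mode = threads_per_subcore packs several vcpus into one vcore; on POWER9, smt_mode = 1 gives every vcpu its own vcore, and emul_smt_mode only groups vcpus for doorbell emulation. A small sketch of the resulting id arithmetic (it matches the expressions used elsewhere in this file):

```c
/*
 * Illustrative mapping.  POWER8, smt_mode = 8: vcpus 0-7 share vcore
 * 0.  POWER9, smt_mode = 1, emul_smt_mode = 4: vcpu 5 is vcore 5 on
 * its own, but emulated thread 1 of the core spanning vcpus 4-7.
 */
static void vcpu_id_mapping(struct kvm *kvm, int vcpu_id,
			    int *vcore, int *emul_thread)
{
	*vcore = vcpu_id / kvm->arch.smt_mode;
	*emul_thread = vcpu_id & (kvm->arch.emul_smt_mode - 1);
}
```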
@@ -3947,6 +4233,7 @@ static struct kvmppc_ops kvm_ops_hv = {
 #endif
        .configure_mmu = kvmhv_configure_mmu,
        .get_rmmu_info = kvmhv_get_rmmu_info,
+       .set_smt_mode = kvmhv_set_smt_mode,
 };
 
 static int kvm_init_subcore_bitmap(void)