git.proxmox.com Git - mirror_ubuntu-jammy-kernel.git/blobdiff - arch/x86/kvm/vmx/pmu_intel.c
KVM: vmx/pmu: Fix dummy check if lbr_desc->event is created
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index a886a47daebdadffa556e44e8e9dd6848abbe733..9efc1a6b86930ad5879faa487270c8ab459a2da6 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -29,7 +29,7 @@ static struct kvm_event_hw_type_mapping intel_arch_events[] = {
        [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
        [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
        [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
-       [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES },
+       [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES },
 };
 
 /* mapping between fixed pmc index and intel_arch_events array */
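
For context on the hunk above: KVM matches the event-select/unit-mask pair a guest programs into IA32_PERFEVTSELx against this table to pick a generic perf event, and the architectural reference-cycles event is encoded as event 0x00 with unit mask 0x03 (CPU_CLK_UNHALTED.REF_TSC), so the old {0x00, 0x30} entry could never match. A minimal stand-alone sketch of that kind of lookup (the in-tree helper, intel_find_arch_event(), additionally filters on pmu->available_event_types):

struct arch_event {
	unsigned char eventsel;
	unsigned char unit_mask;
	unsigned int event_type;	/* a PERF_COUNT_HW_* value */
};

/* Return the generic perf event mapped to a guest eventsel/umask pair,
 * or ~0u if the pair is not an architectural event and has to be
 * programmed as a raw event instead. */
static unsigned int find_arch_event(const struct arch_event *events, int nr,
				    unsigned char eventsel, unsigned char unit_mask)
{
	int i;

	for (i = 0; i < nr; i++)
		if (events[i].eventsel == eventsel &&
		    events[i].unit_mask == unit_mask)
			return events[i].event_type;

	return ~0u;
}
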
@@ -152,12 +152,17 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
        return &counters[array_index_nospec(idx, num_counters)];
 }
 
-static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu)
+static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu)
 {
        if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
-               return false;
+               return 0;
 
-       return vcpu->arch.perf_capabilities & PMU_CAP_FW_WRITES;
+       return vcpu->arch.perf_capabilities;
+}
+
+static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu)
+{
+       return (vcpu_get_perf_capabilities(vcpu) & PMU_CAP_FW_WRITES) != 0;
 }
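
Both helpers above derive from the guest-visible IA32_PERF_CAPABILITIES value, which is only exposed when CPUID.01H:ECX.PDCM (bit 15) is set for the guest. A hedged guest-side sketch of the same bits (MSR number and bit layout per the Intel SDM, not taken from this patch; rdmsr needs CPL0): bit 13 is full-width counter writes and bits 5:0 encode the LBR format.

#include <stdbool.h>
#include <stdint.h>

#define MSR_IA32_PERF_CAPABILITIES	0x345
#define PERF_CAP_FW_WRITES		(1ULL << 13)	/* aka PMU_CAP_FW_WRITES */
#define PERF_CAP_LBR_FMT		0x3fULL		/* aka PMU_CAP_LBR_FMT */

/* Plain rdmsr; must run at CPL0 inside the guest. */
static inline uint64_t rdmsr64(uint32_t msr)
{
	uint32_t lo, hi;

	__asm__ volatile("rdmsr" : "=a"(lo), "=d"(hi) : "c"(msr));
	return ((uint64_t)hi << 32) | lo;
}

static bool guest_sees_fw_writes(void)
{
	/* Only meaningful after checking CPUID.01H:ECX.PDCM (bit 15). */
	return rdmsr64(MSR_IA32_PERF_CAPABILITIES) & PERF_CAP_FW_WRITES;
}
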
 
 static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr)
@@ -168,6 +173,41 @@ static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr)
        return get_gp_pmc(pmu, msr, MSR_IA32_PMC0);
 }
 
+bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu)
+{
+       /*
+        * As a first step, a guest can only enable the LBR feature if its
+        * cpu model is the same as the host's, because the LBR registers
+        * are passed through to the guest and they are model specific.
+        */
+       return boot_cpu_data.x86_model == guest_cpuid_model(vcpu);
+}
+
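
The compatibility check above compares the host's boot_cpu_data.x86_model against the model the guest sees via CPUID, since the passed-through LBR MSRs are model specific. Both sides use the standard CPUID.01H:EAX decoding; a small stand-alone decoder (generic x86 decoding, not code from this patch) shows what is actually compared:

#include <stdint.h>

/* Decode the display model from CPUID.01H:EAX: for families 6 and 15
 * the extended-model field (bits 19:16) is combined with the base
 * model field (bits 7:4). */
static unsigned int cpuid_eax_to_model(uint32_t eax)
{
	unsigned int family = (eax >> 8) & 0xf;
	unsigned int model = (eax >> 4) & 0xf;

	if (family == 0x6 || family == 0xf)
		model += ((eax >> 16) & 0xf) << 4;

	return model;
}
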
+bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu)
+{
+       struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu);
+
+       return lbr->nr && (vcpu_get_perf_capabilities(vcpu) & PMU_CAP_LBR_FMT);
+}
+
+static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index)
+{
+       struct x86_pmu_lbr *records = vcpu_to_lbr_records(vcpu);
+       bool ret = false;
+
+       if (!intel_pmu_lbr_is_enabled(vcpu))
+               return ret;
+
+       ret = (index == MSR_LBR_SELECT) || (index == MSR_LBR_TOS) ||
+               (index >= records->from && index < records->from + records->nr) ||
+               (index >= records->to && index < records->to + records->nr);
+
+       if (!ret && records->info)
+               ret = (index >= records->info && index < records->info + records->nr);
+
+       return ret;
+}
+
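
The ranges accepted by intel_pmu_is_valid_lbr_msr() come from the host's LBR description. Purely as an illustration (a Skylake-style layout is assumed here; base MSRs and depth vary by model), with records->nr = 32 the valid set would be MSR_LBR_SELECT (0x1c8), MSR_LBR_TOS (0x1c9), 0x680-0x69f (FROM), 0x6c0-0x6df (TO) and 0xdc0-0xddf (INFO). A self-contained version of the same range test:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define LBR_NR		32	/* assumed stack depth */
#define LBR_FROM_BASE	0x680	/* MSR_LBR_NHM_FROM */
#define LBR_TO_BASE	0x6c0	/* MSR_LBR_NHM_TO */
#define LBR_INFO_BASE	0xdc0	/* MSR_LBR_INFO_0 */
#define LBR_SELECT	0x1c8	/* MSR_LBR_SELECT */
#define LBR_TOS		0x1c9	/* MSR_LBR_TOS */

static bool is_lbr_msr(uint32_t idx)
{
	return idx == LBR_SELECT || idx == LBR_TOS ||
	       (idx >= LBR_FROM_BASE && idx < LBR_FROM_BASE + LBR_NR) ||
	       (idx >= LBR_TO_BASE && idx < LBR_TO_BASE + LBR_NR) ||
	       (idx >= LBR_INFO_BASE && idx < LBR_INFO_BASE + LBR_NR);
}

int main(void)
{
	/* prints "0x68f -> 1, 0x6e0 -> 0" */
	printf("0x68f -> %d, 0x6e0 -> %d\n", is_lbr_msr(0x68f), is_lbr_msr(0x6e0));
	return 0;
}
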
 static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
 {
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -183,7 +223,8 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
        default:
                ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
                        get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
-                       get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr);
+                       get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr) ||
+                       intel_pmu_is_valid_lbr_msr(vcpu, msr);
                break;
        }
 
@@ -202,6 +243,111 @@ static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr)
        return pmc;
 }
 
+static inline void intel_pmu_release_guest_lbr_event(struct kvm_vcpu *vcpu)
+{
+       struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+       if (lbr_desc->event) {
+               perf_event_release_kernel(lbr_desc->event);
+               lbr_desc->event = NULL;
+               vcpu_to_pmu(vcpu)->event_count--;
+       }
+}
+
+int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu)
+{
+       struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+       struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+       struct perf_event *event;
+
+       /*
+        * The perf_event_attr is constructed with the minimum set of fields:
+        * - set 'pinned = true' to make it task pinned so that if another
+        *   cpu-pinned event reclaims the LBR, event->oncpu will be set to -1;
+        * - set '.exclude_host = true' to record only guest branch behavior;
+        *
+        * - set '.config = INTEL_FIXED_VLBR_EVENT' to indicate that host perf
+        *   should schedule the event with a fake HW counter rather than a
+        *   real one; check is_guest_lbr_event() and
+        *   __intel_get_event_constraints();
+        *
+        * - set 'sample_type = PERF_SAMPLE_BRANCH_STACK' and
+        *   'branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK |
+        *   PERF_SAMPLE_BRANCH_USER' to configure it as an LBR callstack
+        *   event, which helps KVM save/restore guest LBR records across
+        *   host context switches and avoids quite a lot of overhead;
+        *   check branch_user_callstack() and intel_pmu_lbr_sched_task();
+        */
+       struct perf_event_attr attr = {
+               .type = PERF_TYPE_RAW,
+               .size = sizeof(attr),
+               .config = INTEL_FIXED_VLBR_EVENT,
+               .sample_type = PERF_SAMPLE_BRANCH_STACK,
+               .pinned = true,
+               .exclude_host = true,
+               .branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK |
+                                       PERF_SAMPLE_BRANCH_USER,
+       };
+
+       if (unlikely(lbr_desc->event)) {
+               __set_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use);
+               return 0;
+       }
+
+       event = perf_event_create_kernel_counter(&attr, -1,
+                                               current, NULL, NULL);
+       if (IS_ERR(event)) {
+               pr_debug_ratelimited("%s: failed %ld\n",
+                                       __func__, PTR_ERR(event));
+               return PTR_ERR(event);
+       }
+       lbr_desc->event = event;
+       pmu->event_count++;
+       __set_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use);
+       return 0;
+}
+
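
The attr recipe above has a close user-space analogue: a pinned event that asks perf for LBR call-stack data through perf_event_open(2). The sketch below is that analogue, not the KVM path; it uses an ordinary hardware event with a sampling period, whereas the in-kernel event uses the fake INTEL_FIXED_VLBR_EVENT config plus exclude_host so that no real counter is consumed and only guest branches are captured.

#define _GNU_SOURCE
#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Open a task-pinned, user-only LBR call-stack sampling event for the
 * calling thread on any CPU.  Returns the perf fd, or -1 on error. */
static int open_lbr_callstack_event(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.size = sizeof(attr);
	attr.pinned = 1;
	attr.exclude_kernel = 1;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_BRANCH_STACK;
	attr.branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK |
				  PERF_SAMPLE_BRANCH_USER;

	return syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
}
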
+/*
+ * It's safe to access LBR MSRs from the guest while they are not
+ * passed through, since the host restores or resets the LBR MSR
+ * records when the guest LBR event is scheduled in.
+ */
+static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu,
+                                    struct msr_data *msr_info, bool read)
+{
+       struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+       u32 index = msr_info->index;
+
+       if (!intel_pmu_is_valid_lbr_msr(vcpu, index))
+               return false;
+
+       if (!lbr_desc->event && intel_pmu_create_guest_lbr_event(vcpu) < 0)
+               goto dummy;
+
+       /*
+        * Disable IRQs to ensure the LBR feature doesn't get reclaimed by the
+        * host while the MSR is being accessed, which prevents host LBR
+        * values from being leaked to the guest. If the LBR has been
+        * reclaimed, return 0 on guest reads.
+        */
+       local_irq_disable();
+       if (lbr_desc->event->state == PERF_EVENT_STATE_ACTIVE) {
+               if (read)
+                       rdmsrl(index, msr_info->data);
+               else
+                       wrmsrl(index, msr_info->data);
+               __set_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use);
+               local_irq_enable();
+               return true;
+       }
+       clear_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use);
+       local_irq_enable();
+
+dummy:
+       if (read)
+               msr_info->data = 0;
+       return true;
+}
+
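
Whether the access above reaches real hardware hinges on the perf event state: PERF_EVENT_STATE_ACTIVE (value 1 in <linux/perf_event.h>) means the vLBR event is currently scheduled on this CPU and the LBR MSRs hold guest data; any lower state means a higher-priority host event reclaimed the LBR and the access is emulated as a dummy one. A tiny restatement of that decision, with illustrative names (not KVM's):

#include <stdint.h>

enum { LBR_EVENT_ACTIVE = 1 };	/* mirrors PERF_EVENT_STATE_ACTIVE */

/* Dummy semantics: reads of a reclaimed LBR MSR return 0 so host branch
 * records never leak into the guest; writes are silently dropped. */
static uint64_t lbr_msr_read(int event_state, uint64_t hw_value)
{
	return event_state == LBR_EVENT_ACTIVE ? hw_value : 0;
}
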
 static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -236,7 +382,8 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
                        msr_info->data = pmc->eventsel;
                        return 0;
-               }
+               } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, true))
+                       return 0;
        }
 
        return 1;
@@ -307,7 +454,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                                reprogram_gp_counter(pmc, data);
                                return 0;
                        }
-               }
+               } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false))
+                       return 0;
        }
 
        return 1;
@@ -316,6 +464,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
 {
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+       struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
        struct x86_pmu_capability x86_pmu;
        struct kvm_cpuid_entry2 *entry;
        union cpuid10_eax eax;
@@ -327,7 +477,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
        pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
        pmu->version = 0;
        pmu->reserved_bits = 0xffffffff00200000ull;
-       vcpu->arch.perf_capabilities = 0;
 
        entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
        if (!entry)
@@ -340,12 +489,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
                return;
 
        perf_get_x86_pmu_capability(&x86_pmu);
-       if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
-               vcpu->arch.perf_capabilities = vmx_get_perf_capabilities();
 
        pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
                                         x86_pmu.num_counters_gp);
+       eax.split.bit_width = min_t(int, eax.split.bit_width, x86_pmu.bit_width_gp);
        pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1;
+       eax.split.mask_length = min_t(int, eax.split.mask_length, x86_pmu.events_mask_len);
        pmu->available_event_types = ~entry->ebx &
                                        ((1ull << eax.split.mask_length) - 1);
 
@@ -355,6 +504,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
                pmu->nr_arch_fixed_counters =
                        min_t(int, edx.split.num_counters_fixed,
                              x86_pmu.num_counters_fixed);
+               edx.split.bit_width_fixed = min_t(int,
+                       edx.split.bit_width_fixed, x86_pmu.bit_width_fixed);
                pmu->counter_bitmask[KVM_PMC_FIXED] =
                        ((u64)1 << edx.split.bit_width_fixed) - 1;
        }
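
The min_t() clamps added above keep the guest-advertised counter widths within what the host PMU actually implements before the width is turned into a bitmask. A worked example with typical numbers (not taken from this patch):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned int guest_bit_width = 57;	/* what the guest's CPUID.0AH claims */
	unsigned int host_bit_width = 48;	/* what perf reports for the host PMU */
	unsigned int bit_width = guest_bit_width < host_bit_width ?
				 guest_bit_width : host_bit_width;
	uint64_t counter_bitmask = (UINT64_C(1) << bit_width) - 1;

	/* prints "counter_bitmask = 0xffffffffffff" */
	printf("counter_bitmask = %#" PRIx64 "\n", counter_bitmask);
	return 0;
}
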
@@ -381,12 +532,21 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
                INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters);
 
        nested_vmx_pmu_entry_exit_ctls_update(vcpu);
+
+       if (intel_pmu_lbr_is_compatible(vcpu))
+               x86_perf_get_lbr(&lbr_desc->records);
+       else
+               lbr_desc->records.nr = 0;
+
+       if (lbr_desc->records.nr)
+               bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1);
 }
 
 static void intel_pmu_init(struct kvm_vcpu *vcpu)
 {
        int i;
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+       struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
 
        for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
                pmu->gp_counters[i].type = KVM_PMC_GP;
@@ -401,6 +561,11 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
                pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
                pmu->fixed_counters[i].current_config = 0;
        }
+
+       vcpu->arch.perf_capabilities = vmx_get_perf_capabilities();
+       lbr_desc->records.nr = 0;
+       lbr_desc->event = NULL;
+       lbr_desc->msr_passthrough = false;
 }
 
 static void intel_pmu_reset(struct kvm_vcpu *vcpu)
@@ -425,6 +590,119 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
 
        pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
                pmu->global_ovf_ctrl = 0;
+
+       intel_pmu_release_guest_lbr_event(vcpu);
+}
+
+/*
+ * Emulate Freeze_LBR_On_PMI behavior for 1 < pmu.version < 4.
+ *
+ * If Freeze_LBR_On_PMI = 1, the LBR stack is frozen on PMI and
+ * KVM emulates this by clearing the LBR bit (bit 0) in IA32_DEBUGCTL.
+ *
+ * The guest needs to re-enable LBR to resume branch recording.
+ */
+static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu)
+{
+       u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+
+       if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) {
+               data &= ~DEBUGCTLMSR_LBR;
+               vmcs_write64(GUEST_IA32_DEBUGCTL, data);
+       }
+}
+
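
The bit positions used above come from the Intel SDM rather than this hunk: IA32_DEBUGCTL is MSR 0x1d9, bit 0 (DEBUGCTLMSR_LBR) enables branch recording, and bit 11 (DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) asks for the LBR stack to be frozen when a PMI fires. For PMU versions 2 and 3 the freeze is emulated by clearing bit 0, as this stand-alone restatement shows:

#include <stdint.h>

#define DEBUGCTL_LBR			(1ULL << 0)
#define DEBUGCTL_FREEZE_LBRS_ON_PMI	(1ULL << 11)

/* Return the IA32_DEBUGCTL value after an emulated freeze-on-PMI. */
static uint64_t freeze_lbrs_on_pmi(uint64_t debugctl)
{
	if (debugctl & DEBUGCTL_FREEZE_LBRS_ON_PMI)
		debugctl &= ~DEBUGCTL_LBR;	/* guest must set it again to resume */

	return debugctl;
}
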
+static void intel_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
+{
+       u8 version = vcpu_to_pmu(vcpu)->version;
+
+       if (!intel_pmu_lbr_is_enabled(vcpu))
+               return;
+
+       if (version > 1 && version < 4)
+               intel_pmu_legacy_freezing_lbrs_on_pmi(vcpu);
+}
+
+static void vmx_update_intercept_for_lbr_msrs(struct kvm_vcpu *vcpu, bool set)
+{
+       struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu);
+       int i;
+
+       for (i = 0; i < lbr->nr; i++) {
+               vmx_set_intercept_for_msr(vcpu, lbr->from + i, MSR_TYPE_RW, set);
+               vmx_set_intercept_for_msr(vcpu, lbr->to + i, MSR_TYPE_RW, set);
+               if (lbr->info)
+                       vmx_set_intercept_for_msr(vcpu, lbr->info + i, MSR_TYPE_RW, set);
+       }
+
+       vmx_set_intercept_for_msr(vcpu, MSR_LBR_SELECT, MSR_TYPE_RW, set);
+       vmx_set_intercept_for_msr(vcpu, MSR_LBR_TOS, MSR_TYPE_RW, set);
+}
+
+static inline void vmx_disable_lbr_msrs_passthrough(struct kvm_vcpu *vcpu)
+{
+       struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+       if (!lbr_desc->msr_passthrough)
+               return;
+
+       vmx_update_intercept_for_lbr_msrs(vcpu, true);
+       lbr_desc->msr_passthrough = false;
+}
+
+static inline void vmx_enable_lbr_msrs_passthrough(struct kvm_vcpu *vcpu)
+{
+       struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+       if (lbr_desc->msr_passthrough)
+               return;
+
+       vmx_update_intercept_for_lbr_msrs(vcpu, false);
+       lbr_desc->msr_passthrough = true;
+}
+
+/*
+ * Higher priority host perf events (e.g. cpu pinned) can reclaim the
+ * PMU resources (e.g. LBR) that were assigned to the guest. This is
+ * usually done via IPI calls (more details in perf_install_in_context).
+ *
+ * Before entering non-root mode (with IRQs disabled here), double-check
+ * that the PMU features exposed to the guest have not been reclaimed
+ * by higher priority host events. Otherwise, disallow the vCPU's access
+ * to the reclaimed features.
+ */
+void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
+{
+       struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+       struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+       if (!lbr_desc->event) {
+               vmx_disable_lbr_msrs_passthrough(vcpu);
+               if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)
+                       goto warn;
+               if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use))
+                       goto warn;
+               return;
+       }
+
+       if (lbr_desc->event->state < PERF_EVENT_STATE_ACTIVE) {
+               vmx_disable_lbr_msrs_passthrough(vcpu);
+               __clear_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use);
+               goto warn;
+       } else
+               vmx_enable_lbr_msrs_passthrough(vcpu);
+
+       return;
+
+warn:
+       pr_warn_ratelimited("kvm: vcpu-%d: fail to passthrough LBR.\n",
+               vcpu->vcpu_id);
+}
+
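
Per the comment above, vmx_passthrough_lbr_msrs() is meant to run on the VM-entry path with IRQs already disabled, so perf cannot reclaim the LBR between the ownership check and the actual entry. A hedged sketch of such a call site (the real caller lives in the corresponding vmx.c change, which is not part of this blob; names other than the two pmu_intel.c functions are illustrative):

static void vcpu_enter_guest_sketch(struct kvm_vcpu *vcpu)
{
	local_irq_disable();

	/* Last-minute re-check of LBR ownership before non-root mode. */
	if (intel_pmu_lbr_is_enabled(vcpu))
		vmx_passthrough_lbr_msrs(vcpu);

	/* ... __vmx_vcpu_run() and VM-exit handling would follow ... */

	local_irq_enable();
}
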
+static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
+{
+       if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR))
+               intel_pmu_release_guest_lbr_event(vcpu);
 }
 
 struct kvm_pmu_ops intel_pmu_ops = {
@@ -441,4 +719,6 @@ struct kvm_pmu_ops intel_pmu_ops = {
        .refresh = intel_pmu_refresh,
        .init = intel_pmu_init,
        .reset = intel_pmu_reset,
+       .deliver_pmi = intel_pmu_deliver_pmi,
+       .cleanup = intel_pmu_cleanup,
 };