struct kvm_vcpu_stat {
u32 halt_successful_poll;
u32 halt_attempted_poll;
+ u32 halt_poll_invalid;
u32 halt_wakeup;
u32 hvc_exit_stat;
u64 wfe_exit_stat;
kvm_call_hyp(__init_stage2_translation);
}
+static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr,
+ phys_addr_t phys_idmap_start)
+{
+ /*
+ * TODO
+ * kvm_call_reset(boot_pgd_ptr, phys_idmap_start);
+ */
+}
+
static inline int kvm_arch_dev_ioctl_check_extension(long ext)
{
return 0;
struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
-static inline void kvm_arch_hardware_disable(void) {}
static inline void kvm_arch_hardware_unsetup(void) {}
static inline void kvm_arch_sync_events(struct kvm *kvm) {}
static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
+ static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
static inline void kvm_arm_init_debug(void) {}
static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {}
#include <linux/highmem.h>
#include <asm/cacheflush.h>
#include <asm/pgalloc.h>
+ #include <asm/stage2_pgtable.h>
int create_hyp_mappings(void *from, void *to);
int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
phys_addr_t kvm_mmu_get_httbr(void);
phys_addr_t kvm_mmu_get_boot_httbr(void);
phys_addr_t kvm_get_idmap_vector(void);
+phys_addr_t kvm_get_idmap_start(void);
int kvm_mmu_init(void);
void kvm_clear_hyp_idmap(void);
clean_pte_table(pte);
}
- static inline void kvm_set_s2pte_writable(pte_t *pte)
+ static inline pte_t kvm_s2pte_mkwrite(pte_t pte)
{
- pte_val(*pte) |= L_PTE_S2_RDWR;
+ pte_val(pte) |= L_PTE_S2_RDWR;
+ return pte;
}
- static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
+ static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd)
{
- pmd_val(*pmd) |= L_PMD_S2_RDWR;
+ pmd_val(pmd) |= L_PMD_S2_RDWR;
+ return pmd;
}
static inline void kvm_set_s2pte_readonly(pte_t *pte)
return (pmd_val(*pmd) & L_PMD_S2_RDWR) == L_PMD_S2_RDONLY;
}
-
- /* Open coded p*d_addr_end that can deal with 64bit addresses */
- #define kvm_pgd_addr_end(addr, end) \
- ({ u64 __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \
- (__boundary - 1 < (end) - 1)? __boundary: (end); \
- })
-
- #define kvm_pud_addr_end(addr,end) (end)
-
- #define kvm_pmd_addr_end(addr, end) \
- ({ u64 __boundary = ((addr) + PMD_SIZE) & PMD_MASK; \
- (__boundary - 1 < (end) - 1)? __boundary: (end); \
- })
-
- #define kvm_pgd_index(addr) pgd_index(addr)
-
static inline bool kvm_page_empty(void *ptr)
{
struct page *ptr_page = virt_to_page(ptr);
#define kvm_pte_table_empty(kvm, ptep) kvm_page_empty(ptep)
#define kvm_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp)
- #define kvm_pud_table_empty(kvm, pudp) (0)
-
- #define KVM_PREALLOC_LEVEL 0
+ #define kvm_pud_table_empty(kvm, pudp) false
- static inline void *kvm_get_hwpgd(struct kvm *kvm)
- {
- return kvm->arch.pgd;
- }
-
- static inline unsigned int kvm_get_hwpgd_size(void)
- {
- return PTRS_PER_S2_PGD * sizeof(pgd_t);
- }
+ #define hyp_pte_table_empty(ptep) kvm_page_empty(ptep)
+ #define hyp_pmd_table_empty(pmdp) kvm_page_empty(pmdp)
+ #define hyp_pud_table_empty(pudp) false
struct kvm;
* Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
-#include <linux/cpu.h>
#include <linux/cpu_pm.h>
#include <linux/errno.h>
#include <linux/err.h>
static bool vgic_present;
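+/* Per-CPU flag: non-zero once HYP/EL2 has been initialised on this CPU. */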
+static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
+
static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
{
BUG_ON(preemptible());
return &kvm_arm_running_vcpu;
}
-int kvm_arch_hardware_enable(void)
-{
- return 0;
-}
-
int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
{
return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
kvm_next_vmid &= (1 << kvm_vmid_bits) - 1;
/* update vttbr to be used with the new vmid */
- pgd_phys = virt_to_phys(kvm_get_hwpgd(kvm));
+ pgd_phys = virt_to_phys(kvm->arch.pgd);
BUG_ON(pgd_phys & ~VTTBR_BADDR_MASK);
vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits);
kvm->arch.vttbr = pgd_phys | vmid;
}
}
-static void cpu_init_stage2(void *dummy)
-{
- __cpu_init_stage2();
-}
-
static void cpu_init_hyp_mode(void *dummy)
{
phys_addr_t boot_pgd_ptr;
{
if (is_kernel_in_hyp_mode()) {
/*
- * cpu_init_stage2() is safe to call even if the PM
+ * __cpu_init_stage2() is safe to call even if the PM
* event was cancelled before the CPU was reset.
*/
- cpu_init_stage2(NULL);
+ __cpu_init_stage2();
} else {
if (__hyp_get_vectors() == hyp_default_vectors)
cpu_init_hyp_mode(NULL);
}
}
-static int hyp_init_cpu_notify(struct notifier_block *self,
- unsigned long action, void *cpu)
+static void cpu_hyp_reset(void)
{
- switch (action) {
- case CPU_STARTING:
- case CPU_STARTING_FROZEN:
+ phys_addr_t boot_pgd_ptr;
+ phys_addr_t phys_idmap_start;
+
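+ /* When the kernel itself runs at EL2 (VHE), there is nothing to reset. */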
+ if (!is_kernel_in_hyp_mode()) {
+ boot_pgd_ptr = kvm_mmu_get_boot_httbr();
+ phys_idmap_start = kvm_get_idmap_start();
+
+ __cpu_reset_hyp_mode(boot_pgd_ptr, phys_idmap_start);
+ }
+}
+
+static void _kvm_arch_hardware_enable(void *discard)
+{
+ if (!__this_cpu_read(kvm_arm_hardware_enabled)) {
cpu_hyp_reinit();
+ __this_cpu_write(kvm_arm_hardware_enabled, 1);
}
+}
- return NOTIFY_OK;
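+/*
+ * Arch hooks called by the KVM core (e.g. on CPU hotplug); they simply
+ * delegate to the per-CPU helpers above.
+ */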
+int kvm_arch_hardware_enable(void)
+{
+ _kvm_arch_hardware_enable(NULL);
+ return 0;
}
-static struct notifier_block hyp_init_cpu_nb = {
- .notifier_call = hyp_init_cpu_notify,
-};
+static void _kvm_arch_hardware_disable(void *discard)
+{
+ if (__this_cpu_read(kvm_arm_hardware_enabled)) {
+ cpu_hyp_reset();
+ __this_cpu_write(kvm_arm_hardware_enabled, 0);
+ }
+}
+
+void kvm_arch_hardware_disable(void)
+{
+ _kvm_arch_hardware_disable(NULL);
+}
#ifdef CONFIG_CPU_PM
static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
unsigned long cmd,
void *v)
{
- if (cmd == CPU_PM_EXIT) {
- cpu_hyp_reinit();
+ /*
+ * kvm_arm_hardware_enabled is left with its old value over
+ * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should
+ * re-enable hyp.
+ */
+ switch (cmd) {
+ case CPU_PM_ENTER:
+ if (__this_cpu_read(kvm_arm_hardware_enabled))
+ /*
+ * don't update kvm_arm_hardware_enabled here
+ * so that the hardware will be re-enabled
+ * when we resume. See below.
+ */
+ cpu_hyp_reset();
+
return NOTIFY_OK;
- }
+ case CPU_PM_EXIT:
+ if (__this_cpu_read(kvm_arm_hardware_enabled))
+ /* The hardware was enabled before suspend. */
+ cpu_hyp_reinit();
- return NOTIFY_DONE;
+ return NOTIFY_OK;
+
+ default:
+ return NOTIFY_DONE;
+ }
}
static struct notifier_block hyp_init_cpu_pm_nb = {
static int init_subsystems(void)
{
- int err;
+ int err = 0;
/*
- * Register CPU Hotplug notifier
+ * Enable hardware so that subsystem initialisation can access EL2.
*/
- err = register_cpu_notifier(&hyp_init_cpu_nb);
- if (err) {
- kvm_err("Cannot register KVM init CPU notifier (%d)\n", err);
- return err;
- }
+ on_each_cpu(_kvm_arch_hardware_enable, NULL, 1);
/*
* Register CPU lower-power notifier
case -ENODEV:
case -ENXIO:
vgic_present = false;
+ err = 0;
break;
default:
- return err;
+ goto out;
}
/*
*/
err = kvm_timer_hyp_init();
if (err)
- return err;
+ goto out;
kvm_perf_init();
kvm_coproc_table_init();
- return 0;
+out:
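+ /* Disable the hardware again; kvm_arch_hardware_enable() re-enables it when needed. */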
+ on_each_cpu(_kvm_arch_hardware_disable, NULL, 1);
+
+ return err;
}
static void teardown_hyp_mode(void)
free_hyp_pgds();
for_each_possible_cpu(cpu)
free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
- unregister_cpu_notifier(&hyp_init_cpu_nb);
hyp_cpu_pm_exit();
}
static int init_vhe_mode(void)
{
- /*
- * Execute the init code on each CPU.
- */
- on_each_cpu(cpu_init_stage2, NULL, 1);
-
/* set size of VMID supported by CPU */
kvm_vmid_bits = kvm_get_vmid_bits();
kvm_info("%d-bit VMID\n", kvm_vmid_bits);
}
}
- /*
- * Execute the init code on each CPU.
- */
- on_each_cpu(cpu_init_hyp_mode, NULL, 1);
-
#ifndef CONFIG_HOTPLUG_CPU
free_boot_hyp_pgd();
#endif
static unsigned long hyp_idmap_end;
static phys_addr_t hyp_idmap_vector;
+ #define S2_PGD_SIZE (PTRS_PER_S2_PGD * sizeof(pgd_t))
#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
- #define kvm_pmd_huge(_x) (pmd_huge(_x) || pmd_trans_huge(_x))
- #define kvm_pud_huge(_x) pud_huge(_x)
-
#define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0)
#define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1)
static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
{
- /*
- * This function also gets called when dealing with HYP page
- * tables. As HYP doesn't have an associated struct kvm (and
- * the HYP page tables are fairly static), we don't do
- * anything there.
- */
- if (kvm)
- kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
}
/*
*/
static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
{
- if (!kvm_pmd_huge(*pmd))
+ if (!pmd_thp_or_huge(*pmd))
return;
pmd_clear(pmd);
return p;
}
- static void clear_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
+ static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
{
- pud_t *pud_table __maybe_unused = pud_offset(pgd, 0);
- pgd_clear(pgd);
+ pud_t *pud_table __maybe_unused = stage2_pud_offset(pgd, 0UL);
+ stage2_pgd_clear(pgd);
kvm_tlb_flush_vmid_ipa(kvm, addr);
- pud_free(NULL, pud_table);
+ stage2_pud_free(pud_table);
put_page(virt_to_page(pgd));
}
- static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
+ static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
{
- pmd_t *pmd_table = pmd_offset(pud, 0);
- VM_BUG_ON(pud_huge(*pud));
- pud_clear(pud);
+ pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(pud, 0);
+ VM_BUG_ON(stage2_pud_huge(*pud));
+ stage2_pud_clear(pud);
kvm_tlb_flush_vmid_ipa(kvm, addr);
- pmd_free(NULL, pmd_table);
+ stage2_pmd_free(pmd_table);
put_page(virt_to_page(pud));
}
- static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
+ static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
{
pte_t *pte_table = pte_offset_kernel(pmd, 0);
- VM_BUG_ON(kvm_pmd_huge(*pmd));
+ VM_BUG_ON(pmd_thp_or_huge(*pmd));
pmd_clear(pmd);
kvm_tlb_flush_vmid_ipa(kvm, addr);
pte_free_kernel(NULL, pte_table);
* the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
* the IO subsystem will never hit in the cache.
*/
- static void unmap_ptes(struct kvm *kvm, pmd_t *pmd,
+ static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd,
phys_addr_t addr, phys_addr_t end)
{
phys_addr_t start_addr = addr;
}
} while (pte++, addr += PAGE_SIZE, addr != end);
- if (kvm_pte_table_empty(kvm, start_pte))
- clear_pmd_entry(kvm, pmd, start_addr);
+ if (stage2_pte_table_empty(start_pte))
+ clear_stage2_pmd_entry(kvm, pmd, start_addr);
}
- static void unmap_pmds(struct kvm *kvm, pud_t *pud,
+ static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud,
phys_addr_t addr, phys_addr_t end)
{
phys_addr_t next, start_addr = addr;
pmd_t *pmd, *start_pmd;
- start_pmd = pmd = pmd_offset(pud, addr);
+ start_pmd = pmd = stage2_pmd_offset(pud, addr);
do {
- next = kvm_pmd_addr_end(addr, end);
+ next = stage2_pmd_addr_end(addr, end);
if (!pmd_none(*pmd)) {
- if (kvm_pmd_huge(*pmd)) {
+ if (pmd_thp_or_huge(*pmd)) {
pmd_t old_pmd = *pmd;
pmd_clear(pmd);
put_page(virt_to_page(pmd));
} else {
- unmap_ptes(kvm, pmd, addr, next);
+ unmap_stage2_ptes(kvm, pmd, addr, next);
}
}
} while (pmd++, addr = next, addr != end);
- if (kvm_pmd_table_empty(kvm, start_pmd))
- clear_pud_entry(kvm, pud, start_addr);
+ if (stage2_pmd_table_empty(start_pmd))
+ clear_stage2_pud_entry(kvm, pud, start_addr);
}
- static void unmap_puds(struct kvm *kvm, pgd_t *pgd,
+ static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd,
phys_addr_t addr, phys_addr_t end)
{
phys_addr_t next, start_addr = addr;
pud_t *pud, *start_pud;
- start_pud = pud = pud_offset(pgd, addr);
+ start_pud = pud = stage2_pud_offset(pgd, addr);
do {
- next = kvm_pud_addr_end(addr, end);
- if (!pud_none(*pud)) {
- if (pud_huge(*pud)) {
+ next = stage2_pud_addr_end(addr, end);
+ if (!stage2_pud_none(*pud)) {
+ if (stage2_pud_huge(*pud)) {
pud_t old_pud = *pud;
- pud_clear(pud);
+ stage2_pud_clear(pud);
kvm_tlb_flush_vmid_ipa(kvm, addr);
-
kvm_flush_dcache_pud(old_pud);
-
put_page(virt_to_page(pud));
} else {
- unmap_pmds(kvm, pud, addr, next);
+ unmap_stage2_pmds(kvm, pud, addr, next);
}
}
} while (pud++, addr = next, addr != end);
- if (kvm_pud_table_empty(kvm, start_pud))
- clear_pgd_entry(kvm, pgd, start_addr);
+ if (stage2_pud_table_empty(start_pud))
+ clear_stage2_pgd_entry(kvm, pgd, start_addr);
}
-
- static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
- phys_addr_t start, u64 size)
+ /**
+ * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
+ * @kvm: The VM pointer
+ * @start: The intermediate physical base address of the range to unmap
+ * @size: The size of the area to unmap
+ *
+ * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
+ * be called while holding mmu_lock (unless for freeing the stage2 pgd before
+ * destroying the VM), otherwise another faulting VCPU may come in and mess
+ * with things behind our backs.
+ */
+ static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
{
pgd_t *pgd;
phys_addr_t addr = start, end = start + size;
phys_addr_t next;
- pgd = pgdp + kvm_pgd_index(addr);
+ pgd = kvm->arch.pgd + stage2_pgd_index(addr);
do {
- next = kvm_pgd_addr_end(addr, end);
- if (!pgd_none(*pgd))
- unmap_puds(kvm, pgd, addr, next);
+ next = stage2_pgd_addr_end(addr, end);
+ if (!stage2_pgd_none(*pgd))
+ unmap_stage2_puds(kvm, pgd, addr, next);
} while (pgd++, addr = next, addr != end);
}
pmd_t *pmd;
phys_addr_t next;
- pmd = pmd_offset(pud, addr);
+ pmd = stage2_pmd_offset(pud, addr);
do {
- next = kvm_pmd_addr_end(addr, end);
+ next = stage2_pmd_addr_end(addr, end);
if (!pmd_none(*pmd)) {
- if (kvm_pmd_huge(*pmd))
+ if (pmd_thp_or_huge(*pmd))
kvm_flush_dcache_pmd(*pmd);
else
stage2_flush_ptes(kvm, pmd, addr, next);
pud_t *pud;
phys_addr_t next;
- pud = pud_offset(pgd, addr);
+ pud = stage2_pud_offset(pgd, addr);
do {
- next = kvm_pud_addr_end(addr, end);
- if (!pud_none(*pud)) {
- if (pud_huge(*pud))
+ next = stage2_pud_addr_end(addr, end);
+ if (!stage2_pud_none(*pud)) {
+ if (stage2_pud_huge(*pud))
kvm_flush_dcache_pud(*pud);
else
stage2_flush_pmds(kvm, pud, addr, next);
phys_addr_t next;
pgd_t *pgd;
- pgd = kvm->arch.pgd + kvm_pgd_index(addr);
+ pgd = kvm->arch.pgd + stage2_pgd_index(addr);
do {
- next = kvm_pgd_addr_end(addr, end);
+ next = stage2_pgd_addr_end(addr, end);
stage2_flush_puds(kvm, pgd, addr, next);
} while (pgd++, addr = next, addr != end);
}
srcu_read_unlock(&kvm->srcu, idx);
}
+ static void clear_hyp_pgd_entry(pgd_t *pgd)
+ {
+ pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL);
+ pgd_clear(pgd);
+ pud_free(NULL, pud_table);
+ put_page(virt_to_page(pgd));
+ }
+
+ static void clear_hyp_pud_entry(pud_t *pud)
+ {
+ pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
+ VM_BUG_ON(pud_huge(*pud));
+ pud_clear(pud);
+ pmd_free(NULL, pmd_table);
+ put_page(virt_to_page(pud));
+ }
+
+ static void clear_hyp_pmd_entry(pmd_t *pmd)
+ {
+ pte_t *pte_table = pte_offset_kernel(pmd, 0);
+ VM_BUG_ON(pmd_thp_or_huge(*pmd));
+ pmd_clear(pmd);
+ pte_free_kernel(NULL, pte_table);
+ put_page(virt_to_page(pmd));
+ }
+
+ static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
+ {
+ pte_t *pte, *start_pte;
+
+ start_pte = pte = pte_offset_kernel(pmd, addr);
+ do {
+ if (!pte_none(*pte)) {
+ kvm_set_pte(pte, __pte(0));
+ put_page(virt_to_page(pte));
+ }
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+
+ if (hyp_pte_table_empty(start_pte))
+ clear_hyp_pmd_entry(pmd);
+ }
+
+ static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
+ {
+ phys_addr_t next;
+ pmd_t *pmd, *start_pmd;
+
+ start_pmd = pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ /* Hyp doesn't use huge pmds */
+ if (!pmd_none(*pmd))
+ unmap_hyp_ptes(pmd, addr, next);
+ } while (pmd++, addr = next, addr != end);
+
+ if (hyp_pmd_table_empty(start_pmd))
+ clear_hyp_pud_entry(pud);
+ }
+
+ static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
+ {
+ phys_addr_t next;
+ pud_t *pud, *start_pud;
+
+ start_pud = pud = pud_offset(pgd, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ /* Hyp doesn't use huge puds */
+ if (!pud_none(*pud))
+ unmap_hyp_pmds(pud, addr, next);
+ } while (pud++, addr = next, addr != end);
+
+ if (hyp_pud_table_empty(start_pud))
+ clear_hyp_pgd_entry(pgd);
+ }
+
+ static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
+ {
+ pgd_t *pgd;
+ phys_addr_t addr = start, end = start + size;
+ phys_addr_t next;
+
+ /*
+ * We don't unmap anything from HYP, except at hyp teardown.

+ * Hence, we don't have to invalidate the TLBs here.
+ */
+ pgd = pgdp + pgd_index(addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (!pgd_none(*pgd))
+ unmap_hyp_puds(pgd, addr, next);
+ } while (pgd++, addr = next, addr != end);
+ }
+
/**
* free_boot_hyp_pgd - free HYP boot page tables
*
mutex_lock(&kvm_hyp_pgd_mutex);
if (boot_hyp_pgd) {
- unmap_range(NULL, boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
- unmap_range(NULL, boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
+ unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
+ unmap_hyp_range(boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
boot_hyp_pgd = NULL;
}
if (hyp_pgd)
- unmap_range(NULL, hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
+ unmap_hyp_range(hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
mutex_unlock(&kvm_hyp_pgd_mutex);
}
if (hyp_pgd) {
for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
- unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
+ unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
- unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
+ unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
hyp_pgd = NULL;
__phys_to_pfn(phys_addr), PAGE_HYP_DEVICE);
}
- /* Free the HW pgd, one page at a time */
- static void kvm_free_hwpgd(void *hwpgd)
- {
- free_pages_exact(hwpgd, kvm_get_hwpgd_size());
- }
-
- /* Allocate the HW PGD, making sure that each page gets its own refcount */
- static void *kvm_alloc_hwpgd(void)
- {
- unsigned int size = kvm_get_hwpgd_size();
-
- return alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
- }
-
/**
* kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
* @kvm: The KVM struct pointer for the VM.
int kvm_alloc_stage2_pgd(struct kvm *kvm)
{
pgd_t *pgd;
- void *hwpgd;
if (kvm->arch.pgd != NULL) {
kvm_err("kvm_arch already initialized?\n");
return -EINVAL;
}
- hwpgd = kvm_alloc_hwpgd();
- if (!hwpgd)
+ /* Allocate the HW PGD, making sure that each page gets its own refcount */
+ pgd = alloc_pages_exact(S2_PGD_SIZE, GFP_KERNEL | __GFP_ZERO);
+ if (!pgd)
return -ENOMEM;
- /* When the kernel uses more levels of page tables than the
- * guest, we allocate a fake PGD and pre-populate it to point
- * to the next-level page table, which will be the real
- * initial page table pointed to by the VTTBR.
- *
- * When KVM_PREALLOC_LEVEL==2, we allocate a single page for
- * the PMD and the kernel will use folded pud.
- * When KVM_PREALLOC_LEVEL==1, we allocate 2 consecutive PUD
- * pages.
- */
- if (KVM_PREALLOC_LEVEL > 0) {
- int i;
-
- /*
- * Allocate fake pgd for the page table manipulation macros to
- * work. This is not used by the hardware and we have no
- * alignment requirement for this allocation.
- */
- pgd = kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t),
- GFP_KERNEL | __GFP_ZERO);
-
- if (!pgd) {
- kvm_free_hwpgd(hwpgd);
- return -ENOMEM;
- }
-
- /* Plug the HW PGD into the fake one. */
- for (i = 0; i < PTRS_PER_S2_PGD; i++) {
- if (KVM_PREALLOC_LEVEL == 1)
- pgd_populate(NULL, pgd + i,
- (pud_t *)hwpgd + i * PTRS_PER_PUD);
- else if (KVM_PREALLOC_LEVEL == 2)
- pud_populate(NULL, pud_offset(pgd, 0) + i,
- (pmd_t *)hwpgd + i * PTRS_PER_PMD);
- }
- } else {
- /*
- * Allocate actual first-level Stage-2 page table used by the
- * hardware for Stage-2 page table walks.
- */
- pgd = (pgd_t *)hwpgd;
- }
-
kvm_clean_pgd(pgd);
kvm->arch.pgd = pgd;
return 0;
}
- /**
- * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
- * @kvm: The VM pointer
- * @start: The intermediate physical base address of the range to unmap
- * @size: The size of the area to unmap
- *
- * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
- * be called while holding mmu_lock (unless for freeing the stage2 pgd before
- * destroying the VM), otherwise another faulting VCPU may come in and mess
- * with things behind our backs.
- */
- static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
- {
- unmap_range(kvm, kvm->arch.pgd, start, size);
- }
-
static void stage2_unmap_memslot(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
return;
unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE);
- kvm_free_hwpgd(kvm_get_hwpgd(kvm));
- if (KVM_PREALLOC_LEVEL > 0)
- kfree(kvm->arch.pgd);
-
+ /* Free the HW pgd, one page at a time */
+ free_pages_exact(kvm->arch.pgd, S2_PGD_SIZE);
kvm->arch.pgd = NULL;
}
pgd_t *pgd;
pud_t *pud;
- pgd = kvm->arch.pgd + kvm_pgd_index(addr);
- if (WARN_ON(pgd_none(*pgd))) {
+ pgd = kvm->arch.pgd + stage2_pgd_index(addr);
+ if (WARN_ON(stage2_pgd_none(*pgd))) {
if (!cache)
return NULL;
pud = mmu_memory_cache_alloc(cache);
- pgd_populate(NULL, pgd, pud);
+ stage2_pgd_populate(pgd, pud);
get_page(virt_to_page(pgd));
}
- return pud_offset(pgd, addr);
+ return stage2_pud_offset(pgd, addr);
}
static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
pmd_t *pmd;
pud = stage2_get_pud(kvm, cache, addr);
- if (pud_none(*pud)) {
+ if (stage2_pud_none(*pud)) {
if (!cache)
return NULL;
pmd = mmu_memory_cache_alloc(cache);
- pud_populate(NULL, pud, pmd);
+ stage2_pud_populate(pud, pmd);
get_page(virt_to_page(pud));
}
- return pmd_offset(pud, addr);
+ return stage2_pmd_offset(pud, addr);
}
static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd));
old_pmd = *pmd;
- kvm_set_pmd(pmd, *new_pmd);
- if (pmd_present(old_pmd))
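+ /*
+  * Break-before-make: clear the old entry and invalidate its TLB entry
+  * before installing the new one.
+  */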
+ if (pmd_present(old_pmd)) {
+ pmd_clear(pmd);
kvm_tlb_flush_vmid_ipa(kvm, addr);
- else
+ } else {
get_page(virt_to_page(pmd));
+ }
+
+ kvm_set_pmd(pmd, *new_pmd);
return 0;
}
/* Create 2nd stage page table mapping - Level 3 */
old_pte = *pte;
- kvm_set_pte(pte, *new_pte);
- if (pte_present(old_pte))
+ if (pte_present(old_pte)) {
+ kvm_set_pte(pte, __pte(0));
kvm_tlb_flush_vmid_ipa(kvm, addr);
- else
+ } else {
get_page(virt_to_page(pte));
+ }
+
+ kvm_set_pte(pte, *new_pte);
+ return 0;
+ }
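+ /*
+  * Software fallback for architectures that do not provide an atomic
+  * __ptep_test_and_clear_young() helper (e.g. 32-bit ARM).
+  */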
+ #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+ static int stage2_ptep_test_and_clear_young(pte_t *pte)
+ {
+ if (pte_young(*pte)) {
+ *pte = pte_mkold(*pte);
+ return 1;
+ }
return 0;
}
+ #else
+ static int stage2_ptep_test_and_clear_young(pte_t *pte)
+ {
+ return __ptep_test_and_clear_young(pte);
+ }
+ #endif
+
+ static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
+ {
+ return stage2_ptep_test_and_clear_young((pte_t *)pmd);
+ }
/**
* kvm_phys_addr_ioremap - map a device range to guest IPA
pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE);
if (writable)
- kvm_set_s2pte_writable(&pte);
+ pte = kvm_s2pte_mkwrite(pte);
ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES,
KVM_NR_MEM_OBJS);
kvm_pfn_t pfn = *pfnp;
gfn_t gfn = *ipap >> PAGE_SHIFT;
- if (PageTransCompound(pfn_to_page(pfn))) {
+ if (PageTransCompoundMap(pfn_to_page(pfn))) {
unsigned long mask;
/*
* The address we faulted on is backed by a transparent huge
pmd_t *pmd;
phys_addr_t next;
- pmd = pmd_offset(pud, addr);
+ pmd = stage2_pmd_offset(pud, addr);
do {
- next = kvm_pmd_addr_end(addr, end);
+ next = stage2_pmd_addr_end(addr, end);
if (!pmd_none(*pmd)) {
- if (kvm_pmd_huge(*pmd)) {
+ if (pmd_thp_or_huge(*pmd)) {
if (!kvm_s2pmd_readonly(pmd))
kvm_set_s2pmd_readonly(pmd);
} else {
pud_t *pud;
phys_addr_t next;
- pud = pud_offset(pgd, addr);
+ pud = stage2_pud_offset(pgd, addr);
do {
- next = kvm_pud_addr_end(addr, end);
- if (!pud_none(*pud)) {
+ next = stage2_pud_addr_end(addr, end);
+ if (!stage2_pud_none(*pud)) {
/* TODO:PUD not supported, revisit later if supported */
- BUG_ON(kvm_pud_huge(*pud));
+ BUG_ON(stage2_pud_huge(*pud));
stage2_wp_pmds(pud, addr, next);
}
} while (pud++, addr = next, addr != end);
pgd_t *pgd;
phys_addr_t next;
- pgd = kvm->arch.pgd + kvm_pgd_index(addr);
+ pgd = kvm->arch.pgd + stage2_pgd_index(addr);
do {
/*
* Release kvm_mmu_lock periodically if the memory region is
if (need_resched() || spin_needbreak(&kvm->mmu_lock))
cond_resched_lock(&kvm->mmu_lock);
- next = kvm_pgd_addr_end(addr, end);
- if (pgd_present(*pgd))
+ next = stage2_pgd_addr_end(addr, end);
+ if (stage2_pgd_present(*pgd))
stage2_wp_puds(pgd, addr, next);
} while (pgd++, addr = next, addr != end);
}
pmd_t new_pmd = pfn_pmd(pfn, mem_type);
new_pmd = pmd_mkhuge(new_pmd);
if (writable) {
- kvm_set_s2pmd_writable(&new_pmd);
+ new_pmd = kvm_s2pmd_mkwrite(new_pmd);
kvm_set_pfn_dirty(pfn);
}
coherent_cache_guest_page(vcpu, pfn, PMD_SIZE, fault_ipa_uncached);
pte_t new_pte = pfn_pte(pfn, mem_type);
if (writable) {
- kvm_set_s2pte_writable(&new_pte);
+ new_pte = kvm_s2pte_mkwrite(new_pte);
kvm_set_pfn_dirty(pfn);
mark_page_dirty(kvm, gfn);
}
* Resolve the access fault by making the page young again.
* Note that because the faulting entry is guaranteed not to be
* cached in the TLB, we don't need to invalidate anything.
+ * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
+ * so there is no need for atomic (pte|pmd)_mkyoung operations.
*/
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
if (!pmd || pmd_none(*pmd)) /* Nothing there */
goto out;
- if (kvm_pmd_huge(*pmd)) { /* THP, HugeTLB */
+ if (pmd_thp_or_huge(*pmd)) { /* THP, HugeTLB */
*pmd = pmd_mkyoung(*pmd);
pfn = pmd_pfn(*pmd);
pfn_valid = true;
if (!pmd || pmd_none(*pmd)) /* Nothing there */
return 0;
- if (kvm_pmd_huge(*pmd)) { /* THP, HugeTLB */
- if (pmd_young(*pmd)) {
- *pmd = pmd_mkold(*pmd);
- return 1;
- }
-
- return 0;
- }
+ if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */
+ return stage2_pmdp_test_and_clear_young(pmd);
pte = pte_offset_kernel(pmd, gpa);
if (pte_none(*pte))
return 0;
- if (pte_young(*pte)) {
- *pte = pte_mkold(*pte); /* Just a page... */
- return 1;
- }
-
- return 0;
+ return stage2_ptep_test_and_clear_young(pte);
}
static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
if (!pmd || pmd_none(*pmd)) /* Nothing there */
return 0;
- if (kvm_pmd_huge(*pmd)) /* THP, HugeTLB */
+ if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */
return pmd_young(*pmd);
pte = pte_offset_kernel(pmd, gpa);
return hyp_idmap_vector;
}
+phys_addr_t kvm_get_idmap_start(void)
+{
+ return hyp_idmap_start;
+}
+
int kvm_mmu_init(void)
{
int err;
#define HCR_INT_OVERRIDE (HCR_FMO | HCR_IMO)
#define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H)
-/* Hyp System Control Register (SCTLR_EL2) bits */
-#define SCTLR_EL2_EE (1 << 25)
-#define SCTLR_EL2_WXN (1 << 19)
-#define SCTLR_EL2_I (1 << 12)
-#define SCTLR_EL2_SA (1 << 3)
-#define SCTLR_EL2_C (1 << 2)
-#define SCTLR_EL2_A (1 << 1)
-#define SCTLR_EL2_M 1
-#define SCTLR_EL2_FLAGS (SCTLR_EL2_M | SCTLR_EL2_A | SCTLR_EL2_C | \
- SCTLR_EL2_SA | SCTLR_EL2_I)
-
/* TCR_EL2 register bits */
- #define TCR_EL2_RES1 ((1 << 31) | (1 << 23))
- #define TCR_EL2_TBI (1 << 20)
- #define TCR_EL2_PS (7 << 16)
- #define TCR_EL2_PS_40B (2 << 16)
- #define TCR_EL2_TG0 (1 << 14)
- #define TCR_EL2_SH0 (3 << 12)
- #define TCR_EL2_ORGN0 (3 << 10)
- #define TCR_EL2_IRGN0 (3 << 8)
- #define TCR_EL2_T0SZ 0x3f
- #define TCR_EL2_MASK (TCR_EL2_TG0 | TCR_EL2_SH0 | \
- TCR_EL2_ORGN0 | TCR_EL2_IRGN0 | TCR_EL2_T0SZ)
+ #define TCR_EL2_RES1 ((1 << 31) | (1 << 23))
+ #define TCR_EL2_TBI (1 << 20)
+ #define TCR_EL2_PS_SHIFT 16
+ #define TCR_EL2_PS_MASK (7 << TCR_EL2_PS_SHIFT)
+ #define TCR_EL2_PS_40B (2 << TCR_EL2_PS_SHIFT)
+ #define TCR_EL2_TG0_MASK TCR_TG0_MASK
+ #define TCR_EL2_SH0_MASK TCR_SH0_MASK
+ #define TCR_EL2_ORGN0_MASK TCR_ORGN0_MASK
+ #define TCR_EL2_IRGN0_MASK TCR_IRGN0_MASK
+ #define TCR_EL2_T0SZ_MASK 0x3f
+ #define TCR_EL2_MASK (TCR_EL2_TG0_MASK | TCR_EL2_SH0_MASK | \
+ TCR_EL2_ORGN0_MASK | TCR_EL2_IRGN0_MASK | TCR_EL2_T0SZ_MASK)
/* VTCR_EL2 register bits */
#define VTCR_EL2_RES1 (1 << 31)
- #define VTCR_EL2_PS_MASK (7 << 16)
- #define VTCR_EL2_TG0_MASK (1 << 14)
- #define VTCR_EL2_TG0_4K (0 << 14)
- #define VTCR_EL2_TG0_64K (1 << 14)
- #define VTCR_EL2_SH0_MASK (3 << 12)
- #define VTCR_EL2_SH0_INNER (3 << 12)
- #define VTCR_EL2_ORGN0_MASK (3 << 10)
- #define VTCR_EL2_ORGN0_WBWA (1 << 10)
- #define VTCR_EL2_IRGN0_MASK (3 << 8)
- #define VTCR_EL2_IRGN0_WBWA (1 << 8)
- #define VTCR_EL2_SL0_MASK (3 << 6)
- #define VTCR_EL2_SL0_LVL1 (1 << 6)
+ #define VTCR_EL2_HD (1 << 22)
+ #define VTCR_EL2_HA (1 << 21)
+ #define VTCR_EL2_PS_MASK TCR_EL2_PS_MASK
+ #define VTCR_EL2_TG0_MASK TCR_TG0_MASK
+ #define VTCR_EL2_TG0_4K TCR_TG0_4K
+ #define VTCR_EL2_TG0_16K TCR_TG0_16K
+ #define VTCR_EL2_TG0_64K TCR_TG0_64K
+ #define VTCR_EL2_SH0_MASK TCR_SH0_MASK
+ #define VTCR_EL2_SH0_INNER TCR_SH0_INNER
+ #define VTCR_EL2_ORGN0_MASK TCR_ORGN0_MASK
+ #define VTCR_EL2_ORGN0_WBWA TCR_ORGN0_WBWA
+ #define VTCR_EL2_IRGN0_MASK TCR_IRGN0_MASK
+ #define VTCR_EL2_IRGN0_WBWA TCR_IRGN0_WBWA
+ #define VTCR_EL2_SL0_SHIFT 6
+ #define VTCR_EL2_SL0_MASK (3 << VTCR_EL2_SL0_SHIFT)
+ #define VTCR_EL2_SL0_LVL1 (1 << VTCR_EL2_SL0_SHIFT)
#define VTCR_EL2_T0SZ_MASK 0x3f
#define VTCR_EL2_T0SZ_40B 24
#define VTCR_EL2_VS_SHIFT 19
* (see hyp-init.S).
*
* Note that when using 4K pages, we concatenate two first level page tables
- * together.
+ * together. With 16K pages, we concatenate 16 first level page tables.
*
* The magic numbers used for VTTBR_X in this patch can be found in Tables
* D4-23 and D4-25 in ARM DDI 0487A.b.
*/
+
+ #define VTCR_EL2_T0SZ_IPA VTCR_EL2_T0SZ_40B
+ #define VTCR_EL2_COMMON_BITS (VTCR_EL2_SH0_INNER | VTCR_EL2_ORGN0_WBWA | \
+ VTCR_EL2_IRGN0_WBWA | VTCR_EL2_RES1)
+
#ifdef CONFIG_ARM64_64K_PAGES
/*
* Stage2 translation configuration:
- * 40bits input (T0SZ = 24)
* 64kB pages (TG0 = 1)
* 2 level page tables (SL = 1)
*/
- #define VTCR_EL2_FLAGS (VTCR_EL2_TG0_64K | VTCR_EL2_SH0_INNER | \
- VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
- VTCR_EL2_SL0_LVL1 | VTCR_EL2_RES1)
- #define VTTBR_X (38 - VTCR_EL2_T0SZ_40B)
- #else
+ #define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_64K | VTCR_EL2_SL0_LVL1)
+ #define VTTBR_X_TGRAN_MAGIC 38
+ #elif defined(CONFIG_ARM64_16K_PAGES)
+ /*
+ * Stage2 translation configuration:
+ * 16kB pages (TG0 = 2)
+ * 2 level page tables (SL = 1)
+ */
+ #define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_16K | VTCR_EL2_SL0_LVL1)
+ #define VTTBR_X_TGRAN_MAGIC 42
+ #else /* 4K */
/*
* Stage2 translation configuration:
- * 40bits input (T0SZ = 24)
* 4kB pages (TG0 = 0)
* 3 level page tables (SL = 1)
*/
- #define VTCR_EL2_FLAGS (VTCR_EL2_TG0_4K | VTCR_EL2_SH0_INNER | \
- VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
- VTCR_EL2_SL0_LVL1 | VTCR_EL2_RES1)
- #define VTTBR_X (37 - VTCR_EL2_T0SZ_40B)
+ #define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_4K | VTCR_EL2_SL0_LVL1)
+ #define VTTBR_X_TGRAN_MAGIC 37
#endif
+ #define VTCR_EL2_FLAGS (VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN_FLAGS)
+ #define VTTBR_X (VTTBR_X_TGRAN_MAGIC - VTCR_EL2_T0SZ_IPA)
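+ /*
+  * For example, with 4K pages and a 40-bit IPA (T0SZ = 24):
+  * VTTBR_X = 37 - 24 = 13, matching the 8K footprint of the two
+  * concatenated level-1 tables used for the initial lookup.
+  */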
+
#define VTTBR_BADDR_SHIFT (VTTBR_X - 1)
#define VTTBR_BADDR_MASK (((UL(1) << (PHYS_MASK_SHIFT - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT)
#define VTTBR_VMID_SHIFT (UL(48))
int __attribute_const__ kvm_target_cpu(void);
int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
int kvm_arch_dev_ioctl_check_extension(long ext);
+unsigned long kvm_hyp_reset_entry(void);
+void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start);
struct kvm_arch {
/* The VMID generation used for the virt. memory system */
struct kvm_vcpu_stat {
u32 halt_successful_poll;
u32 halt_attempted_poll;
+ u32 halt_poll_invalid;
u32 halt_wakeup;
u32 hvc_exit_stat;
u64 wfe_exit_stat;
hyp_stack_ptr, vector_ptr);
}
-static inline void kvm_arch_hardware_disable(void) {}
+static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr,
+ phys_addr_t phys_idmap_start)
+{
+ /*
+ * Call reset code, and switch back to stub hyp vectors.
+ * Uses __kvm_call_hyp() to avoid kaslr's kvm_ksym_ref() translation.
+ */
+ __kvm_call_hyp((void *)kvm_hyp_reset_entry(),
+ boot_pgd_ptr, phys_idmap_start);
+}
+
static inline void kvm_arch_hardware_unsetup(void) {}
static inline void kvm_arch_sync_events(struct kvm *kvm) {}
static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
+ static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
void kvm_arm_init_debug(void);
void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
*/
#define TRAMPOLINE_VA (HYP_PAGE_OFFSET_MASK & PAGE_MASK)
- /*
- * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation
- * levels in addition to the PGD and potentially the PUD which are
- * pre-allocated (we pre-allocate the fake PGD and the PUD when the Stage-2
- * tables use one level of tables less than the kernel.
- */
- #ifdef CONFIG_ARM64_64K_PAGES
- #define KVM_MMU_CACHE_MIN_PAGES 1
- #else
- #define KVM_MMU_CACHE_MIN_PAGES 2
- #endif
-
#ifdef __ASSEMBLY__
#include <asm/alternative.h>
#define KVM_PHYS_SIZE (1UL << KVM_PHYS_SHIFT)
#define KVM_PHYS_MASK (KVM_PHYS_SIZE - 1UL)
+ #include <asm/stage2_pgtable.h>
+
int create_hyp_mappings(void *from, void *to);
int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
void free_boot_hyp_pgd(void);
phys_addr_t kvm_mmu_get_httbr(void);
phys_addr_t kvm_mmu_get_boot_httbr(void);
phys_addr_t kvm_get_idmap_vector(void);
+phys_addr_t kvm_get_idmap_start(void);
int kvm_mmu_init(void);
void kvm_clear_hyp_idmap(void);
static inline void kvm_clean_pte(pte_t *pte) {}
static inline void kvm_clean_pte_entry(pte_t *pte) {}
- static inline void kvm_set_s2pte_writable(pte_t *pte)
+ static inline pte_t kvm_s2pte_mkwrite(pte_t pte)
{
- pte_val(*pte) |= PTE_S2_RDWR;
+ pte_val(pte) |= PTE_S2_RDWR;
+ return pte;
}
- static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
+ static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd)
{
- pmd_val(*pmd) |= PMD_S2_RDWR;
+ pmd_val(pmd) |= PMD_S2_RDWR;
+ return pmd;
}
static inline void kvm_set_s2pte_readonly(pte_t *pte)
{
- pte_val(*pte) = (pte_val(*pte) & ~PTE_S2_RDWR) | PTE_S2_RDONLY;
+ pteval_t pteval;
+ unsigned long tmp;
+
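+ /*
+  * Use ldxr/stxr so the update is atomic: with VTCR_EL2_HA the CPU may
+  * set the stage-2 Access flag concurrently, and a non-atomic
+  * read-modify-write could lose that hardware update.
+  */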
+ asm volatile("// kvm_set_s2pte_readonly\n"
+ " prfm pstl1strm, %2\n"
+ "1: ldxr %0, %2\n"
+ " and %0, %0, %3 // clear PTE_S2_RDWR\n"
+ " orr %0, %0, %4 // set PTE_S2_RDONLY\n"
+ " stxr %w1, %0, %2\n"
+ " cbnz %w1, 1b\n"
+ : "=&r" (pteval), "=&r" (tmp), "+Q" (pte_val(*pte))
+ : "L" (~PTE_S2_RDWR), "L" (PTE_S2_RDONLY));
}
static inline bool kvm_s2pte_readonly(pte_t *pte)
static inline void kvm_set_s2pmd_readonly(pmd_t *pmd)
{
- pmd_val(*pmd) = (pmd_val(*pmd) & ~PMD_S2_RDWR) | PMD_S2_RDONLY;
+ kvm_set_s2pte_readonly((pte_t *)pmd);
}
static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
{
- return (pmd_val(*pmd) & PMD_S2_RDWR) == PMD_S2_RDONLY;
- }
-
-
- #define kvm_pgd_addr_end(addr, end) pgd_addr_end(addr, end)
- #define kvm_pud_addr_end(addr, end) pud_addr_end(addr, end)
- #define kvm_pmd_addr_end(addr, end) pmd_addr_end(addr, end)
-
- /*
- * In the case where PGDIR_SHIFT is larger than KVM_PHYS_SHIFT, we can address
- * the entire IPA input range with a single pgd entry, and we would only need
- * one pgd entry. Note that in this case, the pgd is actually not used by
- * the MMU for Stage-2 translations, but is merely a fake pgd used as a data
- * structure for the kernel pgtable macros to work.
- */
- #if PGDIR_SHIFT > KVM_PHYS_SHIFT
- #define PTRS_PER_S2_PGD_SHIFT 0
- #else
- #define PTRS_PER_S2_PGD_SHIFT (KVM_PHYS_SHIFT - PGDIR_SHIFT)
- #endif
- #define PTRS_PER_S2_PGD (1 << PTRS_PER_S2_PGD_SHIFT)
-
- #define kvm_pgd_index(addr) (((addr) >> PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1))
-
- /*
- * If we are concatenating first level stage-2 page tables, we would have less
- * than or equal to 16 pointers in the fake PGD, because that's what the
- * architecture allows. In this case, (4 - CONFIG_PGTABLE_LEVELS)
- * represents the first level for the host, and we add 1 to go to the next
- * level (which uses contatenation) for the stage-2 tables.
- */
- #if PTRS_PER_S2_PGD <= 16
- #define KVM_PREALLOC_LEVEL (4 - CONFIG_PGTABLE_LEVELS + 1)
- #else
- #define KVM_PREALLOC_LEVEL (0)
- #endif
-
- static inline void *kvm_get_hwpgd(struct kvm *kvm)
- {
- pgd_t *pgd = kvm->arch.pgd;
- pud_t *pud;
-
- if (KVM_PREALLOC_LEVEL == 0)
- return pgd;
-
- pud = pud_offset(pgd, 0);
- if (KVM_PREALLOC_LEVEL == 1)
- return pud;
-
- BUG_ON(KVM_PREALLOC_LEVEL != 2);
- return pmd_offset(pud, 0);
- }
-
- static inline unsigned int kvm_get_hwpgd_size(void)
- {
- if (KVM_PREALLOC_LEVEL > 0)
- return PTRS_PER_S2_PGD * PAGE_SIZE;
- return PTRS_PER_S2_PGD * sizeof(pgd_t);
+ return kvm_s2pte_readonly((pte_t *)pmd);
}
static inline bool kvm_page_empty(void *ptr)
return page_count(ptr_page) == 1;
}
- #define kvm_pte_table_empty(kvm, ptep) kvm_page_empty(ptep)
+ #define hyp_pte_table_empty(ptep) kvm_page_empty(ptep)
#ifdef __PAGETABLE_PMD_FOLDED
- #define kvm_pmd_table_empty(kvm, pmdp) (0)
+ #define hyp_pmd_table_empty(pmdp) (0)
#else
- #define kvm_pmd_table_empty(kvm, pmdp) \
- (kvm_page_empty(pmdp) && (!(kvm) || KVM_PREALLOC_LEVEL < 2))
+ #define hyp_pmd_table_empty(pmdp) kvm_page_empty(pmdp)
#endif
#ifdef __PAGETABLE_PUD_FOLDED
- #define kvm_pud_table_empty(kvm, pudp) (0)
+ #define hyp_pud_table_empty(pudp) (0)
#else
- #define kvm_pud_table_empty(kvm, pudp) \
- (kvm_page_empty(pudp) && (!(kvm) || KVM_PREALLOC_LEVEL < 1))
+ #define hyp_pud_table_empty(pudp) kvm_page_empty(pudp)
#endif
-
struct kvm;
#define kvm_flush_dcache_to_poc(a,l) __flush_dcache_area((a), (l))
* Section
*/
#define PMD_SECT_VALID (_AT(pmdval_t, 1) << 0)
-#define PMD_SECT_PROT_NONE (_AT(pmdval_t, 1) << 58)
#define PMD_SECT_USER (_AT(pmdval_t, 1) << 6) /* AP[1] */
#define PMD_SECT_RDONLY (_AT(pmdval_t, 1) << 7) /* AP[2] */
#define PMD_SECT_S (_AT(pmdval_t, 3) << 8)
#define TCR_T1SZ(x) ((UL(64) - (x)) << TCR_T1SZ_OFFSET)
#define TCR_TxSZ(x) (TCR_T0SZ(x) | TCR_T1SZ(x))
#define TCR_TxSZ_WIDTH 6
- #define TCR_IRGN_NC ((UL(0) << 8) | (UL(0) << 24))
- #define TCR_IRGN_WBWA ((UL(1) << 8) | (UL(1) << 24))
- #define TCR_IRGN_WT ((UL(2) << 8) | (UL(2) << 24))
- #define TCR_IRGN_WBnWA ((UL(3) << 8) | (UL(3) << 24))
- #define TCR_IRGN_MASK ((UL(3) << 8) | (UL(3) << 24))
- #define TCR_ORGN_NC ((UL(0) << 10) | (UL(0) << 26))
- #define TCR_ORGN_WBWA ((UL(1) << 10) | (UL(1) << 26))
- #define TCR_ORGN_WT ((UL(2) << 10) | (UL(2) << 26))
- #define TCR_ORGN_WBnWA ((UL(3) << 10) | (UL(3) << 26))
- #define TCR_ORGN_MASK ((UL(3) << 10) | (UL(3) << 26))
- #define TCR_SHARED ((UL(3) << 12) | (UL(3) << 28))
- #define TCR_TG0_4K (UL(0) << 14)
- #define TCR_TG0_64K (UL(1) << 14)
- #define TCR_TG0_16K (UL(2) << 14)
- #define TCR_TG1_16K (UL(1) << 30)
- #define TCR_TG1_4K (UL(2) << 30)
- #define TCR_TG1_64K (UL(3) << 30)
+
+ #define TCR_IRGN0_SHIFT 8
+ #define TCR_IRGN0_MASK (UL(3) << TCR_IRGN0_SHIFT)
+ #define TCR_IRGN0_NC (UL(0) << TCR_IRGN0_SHIFT)
+ #define TCR_IRGN0_WBWA (UL(1) << TCR_IRGN0_SHIFT)
+ #define TCR_IRGN0_WT (UL(2) << TCR_IRGN0_SHIFT)
+ #define TCR_IRGN0_WBnWA (UL(3) << TCR_IRGN0_SHIFT)
+
+ #define TCR_IRGN1_SHIFT 24
+ #define TCR_IRGN1_MASK (UL(3) << TCR_IRGN1_SHIFT)
+ #define TCR_IRGN1_NC (UL(0) << TCR_IRGN1_SHIFT)
+ #define TCR_IRGN1_WBWA (UL(1) << TCR_IRGN1_SHIFT)
+ #define TCR_IRGN1_WT (UL(2) << TCR_IRGN1_SHIFT)
+ #define TCR_IRGN1_WBnWA (UL(3) << TCR_IRGN1_SHIFT)
+
+ #define TCR_IRGN_NC (TCR_IRGN0_NC | TCR_IRGN1_NC)
+ #define TCR_IRGN_WBWA (TCR_IRGN0_WBWA | TCR_IRGN1_WBWA)
+ #define TCR_IRGN_WT (TCR_IRGN0_WT | TCR_IRGN1_WT)
+ #define TCR_IRGN_WBnWA (TCR_IRGN0_WBnWA | TCR_IRGN1_WBnWA)
+ #define TCR_IRGN_MASK (TCR_IRGN0_MASK | TCR_IRGN1_MASK)
+
+
+ #define TCR_ORGN0_SHIFT 10
+ #define TCR_ORGN0_MASK (UL(3) << TCR_ORGN0_SHIFT)
+ #define TCR_ORGN0_NC (UL(0) << TCR_ORGN0_SHIFT)
+ #define TCR_ORGN0_WBWA (UL(1) << TCR_ORGN0_SHIFT)
+ #define TCR_ORGN0_WT (UL(2) << TCR_ORGN0_SHIFT)
+ #define TCR_ORGN0_WBnWA (UL(3) << TCR_ORGN0_SHIFT)
+
+ #define TCR_ORGN1_SHIFT 26
+ #define TCR_ORGN1_MASK (UL(3) << TCR_ORGN1_SHIFT)
+ #define TCR_ORGN1_NC (UL(0) << TCR_ORGN1_SHIFT)
+ #define TCR_ORGN1_WBWA (UL(1) << TCR_ORGN1_SHIFT)
+ #define TCR_ORGN1_WT (UL(2) << TCR_ORGN1_SHIFT)
+ #define TCR_ORGN1_WBnWA (UL(3) << TCR_ORGN1_SHIFT)
+
+ #define TCR_ORGN_NC (TCR_ORGN0_NC | TCR_ORGN1_NC)
+ #define TCR_ORGN_WBWA (TCR_ORGN0_WBWA | TCR_ORGN1_WBWA)
+ #define TCR_ORGN_WT (TCR_ORGN0_WT | TCR_ORGN1_WT)
+ #define TCR_ORGN_WBnWA (TCR_ORGN0_WBnWA | TCR_ORGN1_WBnWA)
+ #define TCR_ORGN_MASK (TCR_ORGN0_MASK | TCR_ORGN1_MASK)
+
+ #define TCR_SH0_SHIFT 12
+ #define TCR_SH0_MASK (UL(3) << TCR_SH0_SHIFT)
+ #define TCR_SH0_INNER (UL(3) << TCR_SH0_SHIFT)
+
+ #define TCR_SH1_SHIFT 28
+ #define TCR_SH1_MASK (UL(3) << TCR_SH1_SHIFT)
+ #define TCR_SH1_INNER (UL(3) << TCR_SH1_SHIFT)
+ #define TCR_SHARED (TCR_SH0_INNER | TCR_SH1_INNER)
+
+ #define TCR_TG0_SHIFT 14
+ #define TCR_TG0_MASK (UL(3) << TCR_TG0_SHIFT)
+ #define TCR_TG0_4K (UL(0) << TCR_TG0_SHIFT)
+ #define TCR_TG0_64K (UL(1) << TCR_TG0_SHIFT)
+ #define TCR_TG0_16K (UL(2) << TCR_TG0_SHIFT)
+
+ #define TCR_TG1_SHIFT 30
+ #define TCR_TG1_MASK (UL(3) << TCR_TG1_SHIFT)
+ #define TCR_TG1_16K (UL(1) << TCR_TG1_SHIFT)
+ #define TCR_TG1_4K (UL(2) << TCR_TG1_SHIFT)
+ #define TCR_TG1_64K (UL(3) << TCR_TG1_SHIFT)
+
#define TCR_ASID16 (UL(1) << 36)
#define TCR_TBI0 (UL(1) << 37)
#define TCR_HA (UL(1) << 39)
#include <asm/pgtable-prot.h>
/*
- * VMALLOC and SPARSEMEM_VMEMMAP ranges.
+ * VMALLOC range.
*
- * VMEMAP_SIZE: allows the whole linear region to be covered by a struct page array
- * (rounded up to PUD_SIZE).
* VMALLOC_START: beginning of the kernel vmalloc space
- * VMALLOC_END: extends to the available space below vmmemmap, PCI I/O space,
- * fixed mappings and modules
+ * VMALLOC_END: extends to the available space below vmemmap, PCI I/O space
+ * and fixed mappings
*/
-#define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT)) * sizeof(struct page), PUD_SIZE)
-
#define VMALLOC_START (MODULES_END)
#define VMALLOC_END (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K)
-#define VMEMMAP_START (VMALLOC_END + SZ_64K)
-#define vmemmap ((struct page *)VMEMMAP_START - \
- SECTION_ALIGN_DOWN(memstart_addr >> PAGE_SHIFT))
+#define vmemmap ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT))
#define FIRST_USER_ADDRESS 0UL
* for zero-mapped memory areas etc..
*/
extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
-#define ZERO_PAGE(vaddr) virt_to_page(empty_zero_page)
+#define ZERO_PAGE(vaddr) pfn_to_page(PHYS_PFN(__pa(empty_zero_page)))
#define pte_ERROR(pte) __pte_error(__FILE__, __LINE__, pte_val(pte))
return __pgprot(pgprot_val(prot) & ~PTE_TABLE_BIT);
}
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * See the comment in include/asm-generic/pgtable.h
+ */
+static inline int pte_protnone(pte_t pte)
+{
+ return (pte_val(pte) & (PTE_VALID | PTE_PROT_NONE)) == PTE_PROT_NONE;
+}
+
+static inline int pmd_protnone(pmd_t pmd)
+{
+ return pte_protnone(pmd_pte(pmd));
+}
+#endif
+
/*
* THP definitions.
*/
#define pmd_trans_huge(pmd) (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#define pmd_present(pmd) pte_present(pmd_pte(pmd))
#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
#define pmd_young(pmd) pte_young(pmd_pte(pmd))
#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd)))
#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd)))
-#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
+#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
-#define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_TYPE_MASK))
+#define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_SECT_VALID))
+ #define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd))
+
#define __HAVE_ARCH_PMD_WRITE
#define pmd_write(pmd) pte_write(pmd_pte(pmd))
unsigned long size, pgprot_t vma_prot);
#define pmd_none(pmd) (!pmd_val(pmd))
-#define pmd_present(pmd) (pmd_val(pmd))
-#define pmd_bad(pmd) (!(pmd_val(pmd) & 2))
+#define pmd_bad(pmd) (!(pmd_val(pmd) & PMD_TABLE_BIT))
#define pmd_table(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \
PMD_TYPE_TABLE)
#define pmd_ERROR(pmd) __pmd_error(__FILE__, __LINE__, pmd_val(pmd))
#define pud_none(pud) (!pud_val(pud))
-#define pud_bad(pud) (!(pud_val(pud) & 2))
+#define pud_bad(pud) (!(pud_val(pud) & PUD_TABLE_BIT))
#define pud_present(pud) (pud_val(pud))
static inline void set_pud(pud_t *pudp, pud_t pud)
}
#ifdef CONFIG_ARM64_HW_AFDBM
+#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+extern int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty);
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
+static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp,
+ pmd_t entry, int dirty)
+{
+ return ptep_set_access_flags(vma, address, (pte_t *)pmdp, pmd_pte(entry), dirty);
+}
+#endif
+
/*
* Atomic pte/pmd modifications.
*/
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
- static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
- unsigned long address,
- pte_t *ptep)
+ static inline int __ptep_test_and_clear_young(pte_t *ptep)
{
pteval_t pteval;
unsigned int tmp, res;
- asm volatile("// ptep_test_and_clear_young\n"
+ asm volatile("// __ptep_test_and_clear_young\n"
" prfm pstl1strm, %2\n"
"1: ldxr %0, %2\n"
" ubfx %w3, %w0, %5, #1 // extract PTE_AF (young)\n"
return res;
}
+ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address,
+ pte_t *ptep)
+ {
+ return __ptep_test_and_clear_young(ptep);
+ }
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
- unsigned long address, pmd_t *pmdp)
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long address, pmd_t *pmdp)
{
return pte_pmd(ptep_get_and_clear(mm, address, (pte_t *)pmdp));
}
u32 flush_dcache_exits;
u32 halt_successful_poll;
u32 halt_attempted_poll;
+ u32 halt_poll_invalid;
u32 halt_wakeup;
};
#define MIPS3_PG_FRAME 0x3fffffc0
#define VPN2_MASK 0xffffe000
+#define KVM_ENTRYHI_ASID MIPS_ENTRYHI_ASID
#define TLB_IS_GLOBAL(x) (((x).tlb_lo0 & MIPS3_PG_G) && \
((x).tlb_lo1 & MIPS3_PG_G))
#define TLB_VPN2(x) ((x).tlb_hi & VPN2_MASK)
-#define TLB_ASID(x) ((x).tlb_hi & ASID_MASK)
+#define TLB_ASID(x) ((x).tlb_hi & KVM_ENTRYHI_ASID)
#define TLB_IS_VALID(x, va) (((va) & (1 << PAGE_SHIFT)) \
? ((x).tlb_lo1 & MIPS3_PG_V) \
: ((x).tlb_lo0 & MIPS3_PG_V))
#define TLB_HI_VPN2_HIT(x, y) ((TLB_VPN2(x) & ~(x).tlb_mask) == \
((y) & VPN2_MASK & ~(x).tlb_mask))
#define TLB_HI_ASID_HIT(x, y) (TLB_IS_GLOBAL(x) || \
- TLB_ASID(x) == ((y) & ASID_MASK))
+ TLB_ASID(x) == ((y) & KVM_ENTRYHI_ASID))
struct kvm_mips_tlb {
long tlb_mask;
uint32_t kvm_mips_read_count(struct kvm_vcpu *vcpu);
void kvm_mips_write_count(struct kvm_vcpu *vcpu, uint32_t count);
- void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare);
+ void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare, bool ack);
void kvm_mips_init_count(struct kvm_vcpu *vcpu);
int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl);
int kvm_mips_set_count_resume(struct kvm_vcpu *vcpu, s64 count_resume);
static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
+ static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
#endif /* __MIPS_KVM_HOST_H__ */
*/
static uint32_t kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now)
{
- ktime_t expires;
+ struct mips_coproc *cop0 = vcpu->arch.cop0;
+ ktime_t expires, threshold;
+ uint32_t count, compare;
int running;
- /* Is the hrtimer pending? */
+ /* Calculate the biased and scaled guest CP0_Count */
+ count = vcpu->arch.count_bias + kvm_mips_ktime_to_count(vcpu, now);
+ compare = kvm_read_c0_guest_compare(cop0);
+
+ /*
+ * Find whether CP0_Count has reached the closest timer interrupt. If
+ * not, we shouldn't inject it.
+ */
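+ /* The signed 32-bit difference keeps this comparison wrap-safe. */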
+ if ((int32_t)(count - compare) < 0)
+ return count;
+
+ /*
+ * The CP0_Count we're going to return has already reached the closest
+ * timer interrupt. Quickly check if it really is a new interrupt by
+ * looking at whether the interval until the hrtimer expiry time is
+ * less than 1/4 of the timer period.
+ */
expires = hrtimer_get_expires(&vcpu->arch.comparecount_timer);
- if (ktime_compare(now, expires) >= 0) {
+ threshold = ktime_add_ns(now, vcpu->arch.count_period / 4);
+ if (ktime_before(expires, threshold)) {
/*
* Cancel it while we handle it so there's no chance of
* interference with the timeout handler.
}
}
- /* Return the biased and scaled guest CP0_Count */
- return vcpu->arch.count_bias + kvm_mips_ktime_to_count(vcpu, now);
+ return count;
}
/**
hrtimer_start(&vcpu->arch.comparecount_timer, expire, HRTIMER_MODE_ABS);
}
- /**
- * kvm_mips_update_hrtimer() - Update next expiry time of hrtimer.
- * @vcpu: Virtual CPU.
- *
- * Recalculates and updates the expiry time of the hrtimer. This can be used
- * after timer parameters have been altered which do not depend on the time that
- * the change occurs (in those cases kvm_mips_freeze_hrtimer() and
- * kvm_mips_resume_hrtimer() are used directly).
- *
- * It is guaranteed that no timer interrupts will be lost in the process.
- *
- * Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is running).
- */
- static void kvm_mips_update_hrtimer(struct kvm_vcpu *vcpu)
- {
- ktime_t now;
- uint32_t count;
-
- /*
- * freeze_hrtimer takes care of a timer interrupts <= count, and
- * resume_hrtimer the hrtimer takes care of a timer interrupts > count.
- */
- now = kvm_mips_freeze_hrtimer(vcpu, &count);
- kvm_mips_resume_hrtimer(vcpu, now, count);
- }
-
/**
* kvm_mips_write_count() - Modify the count and update timer.
* @vcpu: Virtual CPU.
* kvm_mips_write_compare() - Modify compare and update timer.
* @vcpu: Virtual CPU.
* @compare: New CP0_Compare value.
+ * @ack: Whether to acknowledge timer interrupt.
*
* Update CP0_Compare to a new value and update the timeout.
+ * If @ack, atomically acknowledge any pending timer interrupt, otherwise ensure
+ * any pending timer interrupt is preserved.
*/
- void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare)
+ void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare, bool ack)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
+ int dc;
+ u32 old_compare = kvm_read_c0_guest_compare(cop0);
+ ktime_t now;
+ uint32_t count;
/* if unchanged, must just be an ack */
- if (kvm_read_c0_guest_compare(cop0) == compare)
+ if (old_compare == compare) {
+ if (!ack)
+ return;
+ kvm_mips_callbacks->dequeue_timer_int(vcpu);
+ kvm_write_c0_guest_compare(cop0, compare);
return;
+ }
+
+ /* freeze_hrtimer() takes care of timer interrupts <= count */
+ dc = kvm_mips_count_disabled(vcpu);
+ if (!dc)
+ now = kvm_mips_freeze_hrtimer(vcpu, &count);
+
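+ /* Acknowledge any timer interrupt pending against the old compare, if requested. */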
+ if (ack)
+ kvm_mips_callbacks->dequeue_timer_int(vcpu);
- /* Update compare */
kvm_write_c0_guest_compare(cop0, compare);
- /* Update timeout if count enabled */
- if (!kvm_mips_count_disabled(vcpu))
- kvm_mips_update_hrtimer(vcpu);
+ /* resume_hrtimer() takes care of timer interrupts > count */
+ if (!dc)
+ kvm_mips_resume_hrtimer(vcpu, now, count);
}
/**
kvm_read_c0_guest_ebase(cop0));
} else if (rd == MIPS_CP0_TLB_HI && sel == 0) {
uint32_t nasid =
- vcpu->arch.gprs[rt] & ASID_MASK;
+ vcpu->arch.gprs[rt] & KVM_ENTRYHI_ASID;
if ((KSEGX(vcpu->arch.gprs[rt]) != CKSEG0) &&
((kvm_read_c0_guest_entryhi(cop0) &
- ASID_MASK) != nasid)) {
+ KVM_ENTRYHI_ASID) != nasid)) {
kvm_debug("MTCz, change ASID from %#lx to %#lx\n",
kvm_read_c0_guest_entryhi(cop0)
- & ASID_MASK,
+ & KVM_ENTRYHI_ASID,
vcpu->arch.gprs[rt]
- & ASID_MASK);
+ & KVM_ENTRYHI_ASID);
/* Blow away the shadow host TLBs */
kvm_mips_flush_host_tlb(1);
/* If we are writing to COMPARE */
/* Clear pending timer interrupt, if any */
- kvm_mips_callbacks->dequeue_timer_int(vcpu);
kvm_mips_write_compare(vcpu,
- vcpu->arch.gprs[rt]);
+ vcpu->arch.gprs[rt],
+ true);
} else if ((rd == MIPS_CP0_STATUS) && (sel == 0)) {
unsigned int old_val, val, change;
*/
index = kvm_mips_guest_tlb_lookup(vcpu, (va & VPN2_MASK) |
(kvm_read_c0_guest_entryhi
- (cop0) & ASID_MASK));
+ (cop0) & KVM_ENTRYHI_ASID));
if (index < 0) {
vcpu->arch.host_cp0_entryhi = (va & VPN2_MASK);
struct mips_coproc *cop0 = vcpu->arch.cop0;
struct kvm_vcpu_arch *arch = &vcpu->arch;
unsigned long entryhi = (vcpu->arch. host_cp0_badvaddr & VPN2_MASK) |
- (kvm_read_c0_guest_entryhi(cop0) & ASID_MASK);
+ (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID);
if ((kvm_read_c0_guest_status(cop0) & ST0_EXL) == 0) {
/* save old pc */
struct kvm_vcpu_arch *arch = &vcpu->arch;
unsigned long entryhi =
(vcpu->arch.host_cp0_badvaddr & VPN2_MASK) |
- (kvm_read_c0_guest_entryhi(cop0) & ASID_MASK);
+ (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID);
if ((kvm_read_c0_guest_status(cop0) & ST0_EXL) == 0) {
/* save old pc */
struct mips_coproc *cop0 = vcpu->arch.cop0;
struct kvm_vcpu_arch *arch = &vcpu->arch;
unsigned long entryhi = (vcpu->arch.host_cp0_badvaddr & VPN2_MASK) |
- (kvm_read_c0_guest_entryhi(cop0) & ASID_MASK);
+ (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID);
if ((kvm_read_c0_guest_status(cop0) & ST0_EXL) == 0) {
/* save old pc */
struct mips_coproc *cop0 = vcpu->arch.cop0;
struct kvm_vcpu_arch *arch = &vcpu->arch;
unsigned long entryhi = (vcpu->arch.host_cp0_badvaddr & VPN2_MASK) |
- (kvm_read_c0_guest_entryhi(cop0) & ASID_MASK);
+ (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID);
if ((kvm_read_c0_guest_status(cop0) & ST0_EXL) == 0) {
/* save old pc */
#ifdef DEBUG
struct mips_coproc *cop0 = vcpu->arch.cop0;
unsigned long entryhi = (vcpu->arch.host_cp0_badvaddr & VPN2_MASK) |
- (kvm_read_c0_guest_entryhi(cop0) & ASID_MASK);
+ (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID);
int index;
/* If address not in the guest TLB, then we are in trouble */
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
unsigned long entryhi = (vcpu->arch.host_cp0_badvaddr & VPN2_MASK) |
- (kvm_read_c0_guest_entryhi(cop0) & ASID_MASK);
+ (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID);
struct kvm_vcpu_arch *arch = &vcpu->arch;
if ((kvm_read_c0_guest_status(cop0) & ST0_EXL) == 0) {
*/
index = kvm_mips_guest_tlb_lookup(vcpu,
(va & VPN2_MASK) |
- (kvm_read_c0_guest_entryhi(vcpu->arch.cop0) & ASID_MASK));
+ (kvm_read_c0_guest_entryhi(vcpu->arch.cop0) &
+ KVM_ENTRYHI_ASID));
if (index < 0) {
if (exccode == EXCCODE_TLBL) {
er = kvm_mips_emulate_tlbmiss_ld(cause, opc, run, vcpu);
uint32_t kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.guest_kernel_asid[smp_processor_id()] & ASID_MASK;
+ int cpu = smp_processor_id();
+
+ return vcpu->arch.guest_kernel_asid[cpu] &
+ cpu_asid_mask(&cpu_data[cpu]);
}
uint32_t kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.guest_user_asid[smp_processor_id()] & ASID_MASK;
+ int cpu = smp_processor_id();
+
+ return vcpu->arch.guest_user_asid[cpu] &
+ cpu_asid_mask(&cpu_data[cpu]);
}
inline uint32_t kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu)
old_pagemask = read_c0_pagemask();
kvm_info("HOST TLBs:\n");
- kvm_info("ASID: %#lx\n", read_c0_entryhi() & ASID_MASK);
+ kvm_info("ASID: %#lx\n", read_c0_entryhi() &
+ cpu_asid_mask(&current_cpu_data));
for (i = 0; i < current_cpu_data.tlbsize; i++) {
write_c0_index(i);
int even;
struct kvm *kvm = vcpu->kvm;
const int flush_dcache_mask = 0;
+ int ret;
if (KVM_GUEST_KSEGX(badvaddr) != KVM_GUEST_KSEG0) {
kvm_err("%s: Invalid BadVaddr: %#lx\n", __func__, badvaddr);
pfn1 = kvm->arch.guest_pmap[gfn];
}
- entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu));
entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) |
(1 << 2) | (0x1 << 1);
entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) |
(1 << 2) | (0x1 << 1);
- return kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
- flush_dcache_mask);
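+ /*
+  * Disable preemption so the ASID is read and the host TLB is written
+  * on the same CPU.
+  */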
+ preempt_disable();
+ entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu));
+ ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
+ flush_dcache_mask);
+ preempt_enable();
+
+ return ret;
}
EXPORT_SYMBOL_GPL(kvm_mips_handle_kseg0_tlb_fault);
unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
struct kvm *kvm = vcpu->kvm;
kvm_pfn_t pfn0, pfn1;
+ int ret;
if ((tlb->tlb_hi & VPN2_MASK) == 0) {
pfn0 = 0;
*hpa1 = pfn1 << PAGE_SHIFT;
/* Get attributes from the Guest TLB */
- entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ?
- kvm_mips_get_kernel_asid(vcpu) :
- kvm_mips_get_user_asid(vcpu));
entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) |
(tlb->tlb_lo0 & MIPS3_PG_D) | (tlb->tlb_lo0 & MIPS3_PG_V);
entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) |
kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc,
tlb->tlb_lo0, tlb->tlb_lo1);
- return kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
- tlb->tlb_mask);
+ preempt_disable();
+ entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ?
+ kvm_mips_get_kernel_asid(vcpu) :
+ kvm_mips_get_user_asid(vcpu));
+ ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
+ tlb->tlb_mask);
+ preempt_enable();
+
+ return ret;
}
EXPORT_SYMBOL_GPL(kvm_mips_handle_mapped_seg_tlb_fault);
{
unsigned long asid = asid_cache(cpu);
- asid += ASID_INC;
- if (!(asid & ASID_MASK)) {
+ asid += cpu_asid_inc();
+ if (!(asid & cpu_asid_mask(&cpu_data[cpu]))) {
if (cpu_has_vtag_icache)
flush_icache_all();
kvm_local_flush_tlb_all(); /* start new asid cycle */
if (!asid) /* fix version if needed */
- asid = ASID_FIRST_VERSION;
+ asid = asid_first_version(cpu);
}
cpu_context(cpu, mm) = asid_cache(cpu) = asid;
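/*
 * Illustrative sketch, not part of the patch: an ASID cache value packs a
 * generation ("version") in the upper bits and the hardware ASID in the low
 * bits, so the checks below compare only the version part to decide whether
 * a cached guest ASID is stale. Standalone C with a hypothetical 8-bit
 * hardware ASID width.
 */
#include <stdbool.h>

#define EXAMPLE_ASID_MASK		0xffUL
#define EXAMPLE_ASID_VERSION_MASK	(~EXAMPLE_ASID_MASK)

static bool example_asid_still_valid(unsigned long cached_asid,
				     unsigned long asid_cache)
{
	/* Same generation: only the low (hardware ASID) bits may differ. */
	return !((cached_asid ^ asid_cache) & EXAMPLE_ASID_VERSION_MASK);
}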
/* Restore ASID once we are scheduled back after preemption */
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
+ unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]);
unsigned long flags;
int newasid = 0;
local_irq_save(flags);
if ((vcpu->arch.guest_kernel_asid[cpu] ^ asid_cache(cpu)) &
- ASID_VERSION_MASK) {
+ asid_version_mask(cpu)) {
kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu);
vcpu->arch.guest_kernel_asid[cpu] =
vcpu->arch.guest_kernel_mm.context.asid[cpu];
*/
if (current->flags & PF_VCPU) {
write_c0_entryhi(vcpu->arch.
- preempt_entryhi & ASID_MASK);
+ preempt_entryhi & asid_mask);
ehb();
}
} else {
if (KVM_GUEST_KERNEL_MODE(vcpu))
write_c0_entryhi(vcpu->arch.
guest_kernel_asid[cpu] &
- ASID_MASK);
+ asid_mask);
else
write_c0_entryhi(vcpu->arch.
guest_user_asid[cpu] &
- ASID_MASK);
+ asid_mask);
ehb();
}
}
kvm_mips_callbacks->vcpu_get_regs(vcpu);
if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) &
- ASID_VERSION_MASK)) {
+ asid_version_mask(cpu))) {
kvm_debug("%s: Dropping MMU Context: %#lx\n", __func__,
cpu_context(cpu, current->mm));
drop_mmu_context(current->mm, cpu);
inst = *(opc);
} else {
vpn2 = (unsigned long) opc & VPN2_MASK;
- asid = kvm_read_c0_guest_entryhi(cop0) & ASID_MASK;
+ asid = kvm_read_c0_guest_entryhi(cop0) &
+ KVM_ENTRYHI_ASID;
index = kvm_mips_guest_tlb_lookup(vcpu, vpn2 | asid);
if (index < 0) {
kvm_err("%s: get_user_failed for %p, vcpu: %p, ASID: %#lx\n",
kvm_write_c0_guest_intctl(cop0, 0xFC000000);
/* Put in vcpu id as CPUNum into Ebase Reg to handle SMP Guests */
- kvm_write_c0_guest_ebase(cop0, KVM_GUEST_KSEG0 | (vcpu_id & 0xFF));
+ kvm_write_c0_guest_ebase(cop0, KVM_GUEST_KSEG0 |
+ (vcpu_id & MIPS_EBASE_CPUNUM));
return 0;
}
kvm_mips_write_count(vcpu, v);
break;
case KVM_REG_MIPS_CP0_COMPARE:
- kvm_mips_write_compare(vcpu, v);
+ kvm_mips_write_compare(vcpu, v, false);
break;
case KVM_REG_MIPS_CP0_CAUSE:
/*
unsigned int max_cores;
unsigned long hsa_size;
unsigned long facilities;
+ unsigned int hmfai;
};
extern struct sclp_info sclp;
+struct zpci_report_error_header {
+ u8 version; /* Interface version byte */
+ u8 action; /* Action qualifier byte
+ * 1: Deconfigure and repair action requested
+ * (OpenCrypto Problem Call Home)
+ * 2: Informational Report
+ * (OpenCrypto Successful Diagnostics Execution)
+ */
+ u16 length; /* Length of Subsequent Data (up to 4K – SCLP header */
+ u8 data[0]; /* Subsequent Data passed verbatim to SCLP ET 24 */
+} __packed;
+
int sclp_get_core_info(struct sclp_core_info *info);
int sclp_core_configure(u8 core);
int sclp_core_deconfigure(u8 core);
void sclp_get_ipl_info(struct sclp_ipl_info *info);
int sclp_pci_configure(u32 fid);
int sclp_pci_deconfigure(u32 fid);
+int sclp_pci_report(struct zpci_report_error_header *report, u32 fh, u32 fid);
int memcpy_hsa_kernel(void *dest, unsigned long src, size_t count);
int memcpy_hsa_user(void __user *dest, unsigned long src, size_t count);
void sclp_early_detect(void);
* since it has been deleted from active_mmu_pages but still can be found
* at hash list.
*
- * for_each_gfn_indirect_valid_sp has skipped that kind of page and
- * kvm_mmu_get_page(), the only user of for_each_gfn_sp(), has skipped
- * all the obsolete pages.
+ * for_each_gfn_valid_sp() has skipped that kind of pages.
*/
- #define for_each_gfn_sp(_kvm, _sp, _gfn) \
+ #define for_each_gfn_valid_sp(_kvm, _sp, _gfn) \
hlist_for_each_entry(_sp, \
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
- if ((_sp)->gfn != (_gfn)) {} else
+ if ((_sp)->gfn != (_gfn) || is_obsolete_sp((_kvm), (_sp)) \
+ || (_sp)->role.invalid) {} else
#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
- for_each_gfn_sp(_kvm, _sp, _gfn) \
- if ((_sp)->role.direct || (_sp)->role.invalid) {} else
+ for_each_gfn_valid_sp(_kvm, _sp, _gfn) \
+ if ((_sp)->role.direct) {} else
/* @sp->gfn should be write-protected at the call site */
static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
static void mmu_audit_disable(void) { }
#endif
+ static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+ {
+ return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+ }
+
static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
struct list_head *invalid_list)
{
__clear_sp_write_flooding_count(sp);
}
- static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
- {
- return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
- }
-
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gfn_t gfn,
gva_t gaddr,
quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
}
- for_each_gfn_sp(vcpu->kvm, sp, gfn) {
- if (is_obsolete_sp(vcpu->kvm, sp))
- continue;
-
+ for_each_gfn_valid_sp(vcpu->kvm, sp, gfn) {
if (!need_sync && sp->unsync)
need_sync = true;
*/
if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
level == PT_PAGE_TABLE_LEVEL &&
- PageTransCompound(pfn_to_page(pfn)) &&
+ PageTransCompoundMap(pfn_to_page(pfn)) &&
!mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
unsigned long mask;
/*
__reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check,
boot_cpu_data.x86_phys_bits,
context->shadow_root_level, false,
- cpu_has_gbpages, true, true);
+ boot_cpu_has(X86_FEATURE_GBPAGES),
+ true, true);
else
__reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
boot_cpu_data.x86_phys_bits,
*/
if (sp->role.direct &&
!kvm_is_reserved_pfn(pfn) &&
- PageTransCompound(pfn_to_page(pfn))) {
+ PageTransCompoundMap(pfn_to_page(pfn))) {
drop_spte(kvm, sptep);
need_tlb_flush = 1;
goto restart;
* the COPYING file in the top-level directory.
*
*/
+
+ #define pr_fmt(fmt) "SVM: " fmt
+
#include <linux/kvm_host.h>
#include "irq.h"
#include <linux/trace_events.h>
#include <linux/slab.h>
+ #include <asm/apic.h>
#include <asm/perf_event.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
#define SVM_FEATURE_DECODE_ASSIST (1 << 7)
#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
+ #define SVM_AVIC_DOORBELL 0xc001011b
+
#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */
#define TSC_RATIO_MIN 0x0000000000000001ULL
#define TSC_RATIO_MAX 0x000000ffffffffffULL
+ #define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)
+
+ /*
+ * 0xff is broadcast, so the max index allowed for physical APIC ID
+ * table is 0xfe. APIC IDs above 0xff are reserved.
+ */
+ #define AVIC_MAX_PHYSICAL_ID_COUNT 255
+
+ #define AVIC_UNACCEL_ACCESS_WRITE_MASK 1
+ #define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0
+ #define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF
+
static bool erratum_383_found __read_mostly;
static const u32 host_save_user_msrs[] = {
/* cached guest cpuid flags for faster access */
bool nrips_enabled : 1;
+
+ u32 ldr_reg;
+ struct page *avic_backing_page;
+ u64 *avic_physical_id_cache;
+ bool avic_is_running;
};
+ #define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF)
+ #define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
+
+ #define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL)
+ #define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
+ #define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
+ #define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
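/*
 * Illustrative sketch, not part of the patch: how a physical APIC ID table
 * entry is laid out according to the masks above - a 4KB-aligned backing
 * page address, the host physical APIC ID, an is-running bit and a valid
 * bit. Standalone C; the patch manipulates live table entries with
 * READ_ONCE/WRITE_ONCE instead of building them from scratch.
 */
#include <stdint.h>

static inline uint64_t example_avic_physical_id_entry(uint64_t backing_page_pa,
						       uint8_t host_apic_id,
						       int is_running)
{
	uint64_t entry = 0;

	entry |= backing_page_pa & (0xFFFFFFFFFFULL << 12);	/* BACKING_PAGE */
	entry |= host_apic_id & 0xFFULL;			/* HOST_PHYSICAL_ID */
	if (is_running)
		entry |= 1ULL << 62;				/* IS_RUNNING */
	entry |= 1ULL << 63;					/* VALID */

	return entry;
}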
+
static DEFINE_PER_CPU(u64, current_tsc_ratio);
#define TSC_RATIO_DEFAULT 0x0100000000ULL
static int nested = true;
module_param(nested, int, S_IRUGO);
+ /* enable / disable AVIC */
+ static int avic;
+ module_param(avic, int, S_IRUGO);
+
static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
static void svm_flush_tlb(struct kvm_vcpu *vcpu);
static void svm_complete_interrupts(struct vcpu_svm *svm);
VMCB_SEG, /* CS, DS, SS, ES, CPL */
VMCB_CR2, /* CR2 only */
VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
+ VMCB_AVIC, /* AVIC APIC_BAR, AVIC APIC_BACKING_PAGE,
+ * AVIC PHYSICAL_TABLE pointer,
+ * AVIC LOGICAL_TABLE pointer
+ */
VMCB_DIRTY_MAX,
};
/* TPR and CR2 are always written before VMRUN */
#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))
+ #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
+
static inline void mark_all_dirty(struct vmcb *vmcb)
{
vmcb->control.clean = 0;
return container_of(vcpu, struct vcpu_svm, vcpu);
}
+ static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data)
+ {
+ svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK;
+ mark_dirty(svm->vmcb, VMCB_AVIC);
+ }
+
+ static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
+ {
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 *entry = svm->avic_physical_id_cache;
+
+ if (!entry)
+ return false;
+
+ return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
+ }
+
static void recalc_intercepts(struct vcpu_svm *svm)
{
struct vmcb_control_area *c, *h;
} else
kvm_disable_tdp();
+ if (avic && (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC)))
+ avic = false;
+
+ if (avic)
+ pr_info("AVIC enabled\n");
+
return 0;
err:
mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
+ static void avic_init_vmcb(struct vcpu_svm *svm)
+ {
+ struct vmcb *vmcb = svm->vmcb;
+ struct kvm_arch *vm_data = &svm->vcpu.kvm->arch;
+ phys_addr_t bpa = page_to_phys(svm->avic_backing_page);
+ phys_addr_t lpa = page_to_phys(vm_data->avic_logical_id_table_page);
+ phys_addr_t ppa = page_to_phys(vm_data->avic_physical_id_table_page);
+
+ vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
+ vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
+ vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
+ vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
+ vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
+ svm->vcpu.arch.apicv_active = true;
+ }
+
static void init_vmcb(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
- set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+ if (!kvm_vcpu_apicv_active(&svm->vcpu))
+ set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
set_dr_intercepts(svm);
set_intercept(svm, INTERCEPT_PAUSE);
}
+ if (avic)
+ avic_init_vmcb(svm);
+
mark_all_dirty(svm->vmcb);
enable_gif(svm);
+
+ }
+
+ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, int index)
+ {
+ u64 *avic_physical_id_table;
+ struct kvm_arch *vm_data = &vcpu->kvm->arch;
+
+ if (index >= AVIC_MAX_PHYSICAL_ID_COUNT)
+ return NULL;
+
+ avic_physical_id_table = page_address(vm_data->avic_physical_id_table_page);
+
+ return &avic_physical_id_table[index];
+ }
+
+ /**
+ * Note:
+ * AVIC hardware walks the nested page table to check permissions,
+ * but does not use the SPA address specified in the leaf page
+ * table entry since it uses address in the AVIC_BACKING_PAGE pointer
+ * field of the VMCB. Therefore, we set up the
+ * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
+ */
+ static int avic_init_access_page(struct kvm_vcpu *vcpu)
+ {
+ struct kvm *kvm = vcpu->kvm;
+ int ret;
+
+ if (kvm->arch.apic_access_page_done)
+ return 0;
+
+ ret = x86_set_memory_region(kvm,
+ APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+ APIC_DEFAULT_PHYS_BASE,
+ PAGE_SIZE);
+ if (ret)
+ return ret;
+
+ kvm->arch.apic_access_page_done = true;
+ return 0;
+ }
+
+ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
+ {
+ int ret;
+ u64 *entry, new_entry;
+ int id = vcpu->vcpu_id;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ ret = avic_init_access_page(vcpu);
+ if (ret)
+ return ret;
+
+ if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
+ return -EINVAL;
+
+ if (!svm->vcpu.arch.apic->regs)
+ return -EINVAL;
+
+ svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs);
+
+ /* Setting AVIC backing page address in the phy APIC ID table */
+ entry = avic_get_physical_id_entry(vcpu, id);
+ if (!entry)
+ return -EINVAL;
+
+ new_entry = READ_ONCE(*entry);
+ new_entry = (page_to_phys(svm->avic_backing_page) &
+ AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
+ AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
+ WRITE_ONCE(*entry, new_entry);
+
+ svm->avic_physical_id_cache = entry;
+
+ return 0;
+ }
+
+ static void avic_vm_destroy(struct kvm *kvm)
+ {
+ struct kvm_arch *vm_data = &kvm->arch;
+
+ if (vm_data->avic_logical_id_table_page)
+ __free_page(vm_data->avic_logical_id_table_page);
+ if (vm_data->avic_physical_id_table_page)
+ __free_page(vm_data->avic_physical_id_table_page);
+ }
+
+ static int avic_vm_init(struct kvm *kvm)
+ {
+ int err = -ENOMEM;
+ struct kvm_arch *vm_data = &kvm->arch;
+ struct page *p_page;
+ struct page *l_page;
+
+ if (!avic)
+ return 0;
+
+ /* Allocating physical APIC ID table (4KB) */
+ p_page = alloc_page(GFP_KERNEL);
+ if (!p_page)
+ goto free_avic;
+
+ vm_data->avic_physical_id_table_page = p_page;
+ clear_page(page_address(p_page));
+
+ /* Allocating logical APIC ID table (4KB) */
+ l_page = alloc_page(GFP_KERNEL);
+ if (!l_page)
+ goto free_avic;
+
+ vm_data->avic_logical_id_table_page = l_page;
+ clear_page(page_address(l_page));
+
+ return 0;
+
+ free_avic:
+ avic_vm_destroy(kvm);
+ return err;
+ }
+
+ /**
+ * This function is called during VCPU halt/unhalt.
+ */
+ static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+ {
+ u64 entry;
+ int h_physical_id = __default_cpu_present_to_apicid(vcpu->cpu);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ svm->avic_is_running = is_run;
+
+ /* ID = 0xff (broadcast), ID > 0xff (reserved) */
+ if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT))
+ return;
+
+ entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ WARN_ON(is_run == !!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK));
+
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+ if (is_run)
+ entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+ WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+ }
+
+ static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+ {
+ u64 entry;
+ /* ID = 0xff (broadcast), ID > 0xff (reserved) */
+ int h_physical_id = __default_cpu_present_to_apicid(cpu);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT))
+ return;
+
+ entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
+
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+ entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
+
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+ if (svm->avic_is_running)
+ entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+
+ WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+ }
+
+ static void avic_vcpu_put(struct kvm_vcpu *vcpu)
+ {
+ u64 entry;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+ WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
}
static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
+
+ if (kvm_vcpu_apicv_active(vcpu) && !init_event)
+ avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
}
static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
if (!hsave_page)
goto free_page3;
+ if (avic) {
+ err = avic_init_backing_page(&svm->vcpu);
+ if (err)
+ goto free_page4;
+ }
+
+ /* We initialize this flag to true to make sure that the is_running
+ * bit would be set the first time the vcpu is loaded.
+ */
+ svm->avic_is_running = true;
+
svm->nested.hsave = page_address(hsave_page);
svm->msrpm = page_address(msrpm_pages);
return &svm->vcpu;
+ free_page4:
+ __free_page(hsave_page);
free_page3:
__free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
free_page2:
/* This assumes that the kernel never uses MSR_TSC_AUX */
if (static_cpu_has(X86_FEATURE_RDTSCP))
wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
+
+ avic_vcpu_load(vcpu, cpu);
}
static void svm_vcpu_put(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
int i;
+ avic_vcpu_put(vcpu);
+
++vcpu->stat.host_state_reload;
kvm_load_ldt(svm->host.ldt);
#ifdef CONFIG_X86_64
loadsegment(fs, svm->host.fs);
- wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
+ wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase);
load_gs_index(svm->host.gs);
#else
#ifdef CONFIG_X86_32_LAZY_GS
wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
}
+ static void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
+ {
+ avic_set_running(vcpu, false);
+ }
+
+ static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
+ {
+ avic_set_running(vcpu, true);
+ }
+
static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
{
return to_svm(vcpu)->vmcb->save.rflags;
disable_gif(svm);
/* After a CLGI no interrupts should come */
- svm_clear_vintr(svm);
- svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
-
- mark_dirty(svm->vmcb, VMCB_INTR);
+ if (!kvm_vcpu_apicv_active(&svm->vcpu)) {
+ svm_clear_vintr(svm);
+ svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
+ mark_dirty(svm->vmcb, VMCB_INTR);
+ }
return 1;
}
case MSR_VM_IGNNE:
vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
break;
+ case MSR_IA32_APICBASE:
+ if (kvm_vcpu_apicv_active(vcpu))
+ avic_update_vapic_bar(to_svm(vcpu), data);
+ /* Fall through */
default:
return kvm_set_msr_common(vcpu, msr);
}
return nop_interception(svm);
}
+ enum avic_ipi_failure_cause {
+ AVIC_IPI_FAILURE_INVALID_INT_TYPE,
+ AVIC_IPI_FAILURE_TARGET_NOT_RUNNING,
+ AVIC_IPI_FAILURE_INVALID_TARGET,
+ AVIC_IPI_FAILURE_INVALID_BACKING_PAGE,
+ };
+
+ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
+ {
+ u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
+ u32 icrl = svm->vmcb->control.exit_info_1;
+ u32 id = svm->vmcb->control.exit_info_2 >> 32;
+ u32 index = svm->vmcb->control.exit_info_2 & 0xFF;
+ struct kvm_lapic *apic = svm->vcpu.arch.apic;
+
+ trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index);
+
+ switch (id) {
+ case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
+ /*
+ * AVIC hardware handles the generation of
+ * IPIs when the specified Message Type is Fixed
+ * (also known as fixed delivery mode) and
+ * the Trigger Mode is edge-triggered. The hardware
+ * also supports self and broadcast delivery modes
+ * specified via the Destination Shorthand(DSH)
+ * field of the ICRL. Logical and physical APIC ID
+ * formats are supported. All other IPI types cause
+ * a #VMEXIT, which needs to be emulated.
+ */
+ kvm_lapic_reg_write(apic, APIC_ICR2, icrh);
+ kvm_lapic_reg_write(apic, APIC_ICR, icrl);
+ break;
+ case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: {
+ int i;
+ struct kvm_vcpu *vcpu;
+ struct kvm *kvm = svm->vcpu.kvm;
+ struct kvm_lapic *apic = svm->vcpu.arch.apic;
+
+ /*
+ * At this point, we expect that the AVIC HW has already
+ * set the appropriate IRR bits on the valid target
+ * vcpus. So, we just need to kick the appropriate vcpu.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ bool m = kvm_apic_match_dest(vcpu, apic,
+ icrl & KVM_APIC_SHORT_MASK,
+ GET_APIC_DEST_FIELD(icrh),
+ icrl & KVM_APIC_DEST_MASK);
+
+ if (m && !avic_vcpu_is_running(vcpu))
+ kvm_vcpu_wake_up(vcpu);
+ }
+ break;
+ }
+ case AVIC_IPI_FAILURE_INVALID_TARGET:
+ break;
+ case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
+ WARN_ONCE(1, "Invalid backing page\n");
+ break;
+ default:
+ pr_err("Unknown IPI interception\n");
+ }
+
+ return 1;
+ }
+
+ static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
+ {
+ struct kvm_arch *vm_data = &vcpu->kvm->arch;
+ int index;
+ u32 *logical_apic_id_table;
+ int dlid = GET_APIC_LOGICAL_ID(ldr);
+
+ if (!dlid)
+ return NULL;
+
+ if (flat) { /* flat */
+ index = ffs(dlid) - 1;
+ if (index > 7)
+ return NULL;
+ } else { /* cluster */
+ int cluster = (dlid & 0xf0) >> 4;
+ int apic = ffs(dlid & 0x0f) - 1;
+
+ if ((apic < 0) || (apic > 7) ||
+ (cluster >= 0xf))
+ return NULL;
+ index = (cluster << 2) + apic;
+ }
+
+ logical_apic_id_table = (u32 *) page_address(vm_data->avic_logical_id_table_page);
+
+ return &logical_apic_id_table[index];
+ }
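/*
 * Illustrative sketch, not part of the patch: the logical-ID-to-index
 * mapping used by avic_get_logical_id_entry() above, as a standalone
 * helper (using the compiler's ffs builtin in place of the kernel's
 * ffs()). Flat mode uses one bit per APIC (indices 0-7); cluster mode
 * packs four APICs per cluster (index = cluster * 4 + apic).
 */
static int example_avic_logical_index(unsigned int dlid, int flat)
{
	if (!dlid)
		return -1;

	if (flat) {
		int index = __builtin_ffs(dlid) - 1;

		return index > 7 ? -1 : index;
	} else {
		int cluster = (dlid & 0xf0) >> 4;
		int apic = __builtin_ffs(dlid & 0x0f) - 1;

		if (apic < 0 || apic > 7 || cluster >= 0xf)
			return -1;
		return (cluster << 2) + apic;
	}
}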
+
+ static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr,
+ bool valid)
+ {
+ bool flat;
+ u32 *entry, new_entry;
+
+ flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
+ entry = avic_get_logical_id_entry(vcpu, ldr, flat);
+ if (!entry)
+ return -EINVAL;
+
+ new_entry = READ_ONCE(*entry);
+ new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
+ new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
+ if (valid)
+ new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
+ else
+ new_entry &= ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
+ WRITE_ONCE(*entry, new_entry);
+
+ return 0;
+ }
+
+ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
+ {
+ int ret;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
+
+ if (!ldr)
+ return 1;
+
+ ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr, true);
+ if (ret && svm->ldr_reg) {
+ avic_ldr_write(vcpu, 0, svm->ldr_reg, false);
+ svm->ldr_reg = 0;
+ } else {
+ svm->ldr_reg = ldr;
+ }
+ return ret;
+ }
+
+ static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
+ {
+ u64 *old, *new;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 apic_id_reg = kvm_lapic_get_reg(vcpu->arch.apic, APIC_ID);
+ u32 id = (apic_id_reg >> 24) & 0xff;
+
+ if (vcpu->vcpu_id == id)
+ return 0;
+
+ old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id);
+ new = avic_get_physical_id_entry(vcpu, id);
+ if (!new || !old)
+ return 1;
+
+ /* We need to move physical_id_entry to new offset */
+ *new = *old;
+ *old = 0ULL;
+ to_svm(vcpu)->avic_physical_id_cache = new;
+
+ /*
+ * Also update the guest physical APIC ID in the logical
+ * APIC ID table entry if the LDR has already been set up.
+ */
+ if (svm->ldr_reg)
+ avic_handle_ldr_update(vcpu);
+
+ return 0;
+ }
+
+ static int avic_handle_dfr_update(struct kvm_vcpu *vcpu)
+ {
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_arch *vm_data = &vcpu->kvm->arch;
+ u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
+ u32 mod = (dfr >> 28) & 0xf;
+
+ /*
+ * We assume that all local APICs are using the same type.
+ * If this changes, we need to flush the AVIC logical
+ * APIC ID table.
+ */
+ if (vm_data->ldr_mode == mod)
+ return 0;
+
+ clear_page(page_address(vm_data->avic_logical_id_table_page));
+ vm_data->ldr_mode = mod;
+
+ if (svm->ldr_reg)
+ avic_handle_ldr_update(vcpu);
+ return 0;
+ }
+
+ static int avic_unaccel_trap_write(struct vcpu_svm *svm)
+ {
+ struct kvm_lapic *apic = svm->vcpu.arch.apic;
+ u32 offset = svm->vmcb->control.exit_info_1 &
+ AVIC_UNACCEL_ACCESS_OFFSET_MASK;
+
+ switch (offset) {
+ case APIC_ID:
+ if (avic_handle_apic_id_update(&svm->vcpu))
+ return 0;
+ break;
+ case APIC_LDR:
+ if (avic_handle_ldr_update(&svm->vcpu))
+ return 0;
+ break;
+ case APIC_DFR:
+ avic_handle_dfr_update(&svm->vcpu);
+ break;
+ default:
+ break;
+ }
+
+ kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
+
+ return 1;
+ }
+
+ static bool is_avic_unaccelerated_access_trap(u32 offset)
+ {
+ bool ret = false;
+
+ switch (offset) {
+ case APIC_ID:
+ case APIC_EOI:
+ case APIC_RRR:
+ case APIC_LDR:
+ case APIC_DFR:
+ case APIC_SPIV:
+ case APIC_ESR:
+ case APIC_ICR:
+ case APIC_LVTT:
+ case APIC_LVTTHMR:
+ case APIC_LVTPC:
+ case APIC_LVT0:
+ case APIC_LVT1:
+ case APIC_LVTERR:
+ case APIC_TMICT:
+ case APIC_TDCR:
+ ret = true;
+ break;
+ default:
+ break;
+ }
+ return ret;
+ }
+
+ static int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
+ {
+ int ret = 0;
+ u32 offset = svm->vmcb->control.exit_info_1 &
+ AVIC_UNACCEL_ACCESS_OFFSET_MASK;
+ u32 vector = svm->vmcb->control.exit_info_2 &
+ AVIC_UNACCEL_ACCESS_VECTOR_MASK;
+ bool write = (svm->vmcb->control.exit_info_1 >> 32) &
+ AVIC_UNACCEL_ACCESS_WRITE_MASK;
+ bool trap = is_avic_unaccelerated_access_trap(offset);
+
+ trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset,
+ trap, write, vector);
+ if (trap) {
+ /* Handling Trap */
+ WARN_ONCE(!write, "svm: Handling trap read.\n");
+ ret = avic_unaccel_trap_write(svm);
+ } else {
+ /* Handling Fault */
+ ret = (emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE);
+ }
+
+ return ret;
+ }
+
static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_READ_CR0] = cr_interception,
[SVM_EXIT_READ_CR3] = cr_interception,
[SVM_EXIT_XSETBV] = xsetbv_interception,
[SVM_EXIT_NPF] = pf_interception,
[SVM_EXIT_RSM] = emulate_on_interception,
+ [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
+ [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
};
static void dump_vmcb(struct kvm_vcpu *vcpu)
pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
+ pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl);
pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
+ pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
+ pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
+ pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
pr_err("VMCB State Save Area:\n");
pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
"es:",
{
struct vmcb_control_area *control;
+ /* The following fields are ignored when AVIC is enabled */
control = &svm->vmcb->control;
control->int_vector = irq;
control->int_ctl &= ~V_INTR_PRIO_MASK;
SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
}
+ static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu)
+ {
+ return is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK);
+ }
+
static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
+ if (svm_nested_virtualize_tpr(vcpu) ||
+ kvm_vcpu_apicv_active(vcpu))
return;
clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
static bool svm_get_enable_apicv(void)
{
- return false;
+ return avic;
+ }
+
+ static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
+ {
}
+ static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
+ {
+ }
+
+ /* Note: Currently only used by Hyper-V. */
static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+
+ if (!avic)
+ return;
+
+ vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
+ mark_dirty(vmcb, VMCB_INTR);
}
static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
return;
}
+ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
+ {
+ kvm_lapic_set_irr(vec, vcpu->arch.apic);
+ smp_mb__after_atomic();
+
+ if (avic_vcpu_is_running(vcpu))
+ wrmsrl(SVM_AVIC_DOORBELL,
+ __default_cpu_present_to_apicid(vcpu->cpu));
+ else
+ kvm_vcpu_wake_up(vcpu);
+ }
+
static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
{
struct vcpu_svm *svm = to_svm(vcpu);
+ if (kvm_vcpu_apicv_active(vcpu))
+ return;
+
/*
* In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
* 1, because that's a separate STGI/VMRUN intercept. The next time we
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
+ if (svm_nested_virtualize_tpr(vcpu))
return;
if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
struct vcpu_svm *svm = to_svm(vcpu);
u64 cr8;
- if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
+ if (svm_nested_virtualize_tpr(vcpu) ||
+ kvm_vcpu_apicv_active(vcpu))
return;
cr8 = kvm_get_cr8(vcpu);
static void svm_cpuid_update(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_cpuid_entry2 *entry;
/* Update nrips enabled cache */
svm->nrips_enabled = !!guest_cpuid_has_nrips(&svm->vcpu);
+
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ entry = kvm_find_cpuid_entry(vcpu, 1, 0);
+ if (entry)
+ entry->ecx &= ~bit(X86_FEATURE_X2APIC);
}
static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
{
switch (func) {
+ case 0x1:
+ if (avic)
+ entry->ecx &= ~bit(X86_FEATURE_X2APIC);
+ break;
case 0x80000001:
if (nested)
entry->ecx |= (1 << 2); /* Set SVM bit */
{
}
+ static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
+ {
+ if (avic_handle_apic_id_update(vcpu) != 0)
+ return;
+ if (avic_handle_dfr_update(vcpu) != 0)
+ return;
+ avic_handle_ldr_update(vcpu);
+ }
+
static struct kvm_x86_ops svm_x86_ops = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
.vcpu_free = svm_free_vcpu,
.vcpu_reset = svm_vcpu_reset,
+ .vm_init = avic_vm_init,
+ .vm_destroy = avic_vm_destroy,
+
.prepare_guest_switch = svm_prepare_guest_switch,
.vcpu_load = svm_vcpu_load,
.vcpu_put = svm_vcpu_put,
+ .vcpu_blocking = svm_vcpu_blocking,
+ .vcpu_unblocking = svm_vcpu_unblocking,
.update_bp_intercept = update_bp_intercept,
.get_msr = svm_get_msr,
.refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
.load_eoi_exitmap = svm_load_eoi_exitmap,
.sync_pir_to_irr = svm_sync_pir_to_irr,
+ .hwapic_irr_update = svm_hwapic_irr_update,
+ .hwapic_isr_update = svm_hwapic_isr_update,
+ .apicv_post_state_restore = avic_post_state_restore,
.set_tss_addr = svm_set_tss_addr,
.get_tdp_level = get_npt_level,
.sched_in = svm_sched_in,
.pmu_ops = &amd_pmu_ops,
+ .deliver_posted_interrupt = svm_deliver_avic_intr,
};
static int __init svm_init(void)
#define host_clocks \
{VCLOCK_NONE, "none"}, \
- {VCLOCK_TSC, "tsc"}, \
- {VCLOCK_HPET, "hpet"} \
+ {VCLOCK_TSC, "tsc"} \
TRACE_EVENT(kvm_update_master_clock,
TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched),
__entry->vcpu_id, __entry->timer_index)
);
+ /*
+ * Tracepoint for AMD AVIC
+ */
+ TRACE_EVENT(kvm_avic_incomplete_ipi,
+ TP_PROTO(u32 vcpu, u32 icrh, u32 icrl, u32 id, u32 index),
+ TP_ARGS(vcpu, icrh, icrl, id, index),
+
+ TP_STRUCT__entry(
+ __field(u32, vcpu)
+ __field(u32, icrh)
+ __field(u32, icrl)
+ __field(u32, id)
+ __field(u32, index)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu = vcpu;
+ __entry->icrh = icrh;
+ __entry->icrl = icrl;
+ __entry->id = id;
+ __entry->index = index;
+ ),
+
+ TP_printk("vcpu=%u, icrh:icrl=%#010x:%08x, id=%u, index=%u\n",
+ __entry->vcpu, __entry->icrh, __entry->icrl,
+ __entry->id, __entry->index)
+ );
+
+ TRACE_EVENT(kvm_avic_unaccelerated_access,
+ TP_PROTO(u32 vcpu, u32 offset, bool ft, bool rw, u32 vec),
+ TP_ARGS(vcpu, offset, ft, rw, vec),
+
+ TP_STRUCT__entry(
+ __field(u32, vcpu)
+ __field(u32, offset)
+ __field(bool, ft)
+ __field(bool, rw)
+ __field(u32, vec)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu = vcpu;
+ __entry->offset = offset;
+ __entry->ft = ft;
+ __entry->rw = rw;
+ __entry->vec = vec;
+ ),
+
+ TP_printk("vcpu=%u, offset=%#x(%s), %s, %s, vec=%#x\n",
+ __entry->vcpu,
+ __entry->offset,
+ __print_symbolic(__entry->offset, kvm_trace_symbol_apic),
+ __entry->ft ? "trap" : "fault",
+ __entry->rw ? "write" : "read",
+ __entry->vec)
+ );
+
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
static void kvm_cpu_vmxon(u64 addr)
{
+ intel_pt_handle_vmx(1);
+
asm volatile (ASM_VMX_VMXON_RAX
: : "a"(&addr), "m"(addr)
: "memory", "cc");
static void kvm_cpu_vmxoff(void)
{
asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
+
+ intel_pt_handle_vmx(0);
}
static void hardware_disable(void)
}
}
- if (cpu_has_xsaves)
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
rdmsrl(MSR_IA32_XSS, host_xss);
return 0;
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
- vmx_set_cr0(vcpu, cr0); /* enter rmode */
vmx->vcpu.arch.cr0 = cr0;
+ vmx_set_cr0(vcpu, cr0); /* enter rmode */
vmx_set_cr4(vcpu, 0);
vmx_set_efer(vcpu, 0);
vmx_fpu_activate(vcpu);
vmcs_write64(APIC_ACCESS_ADDR, hpa);
}
- static void vmx_hwapic_isr_update(struct kvm *kvm, int isr)
+ static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
{
u16 status;
u8 old;
- if (isr == -1)
- isr = 0;
+ if (max_isr == -1)
+ max_isr = 0;
status = vmcs_read16(GUEST_INTR_STATUS);
old = status >> 8;
- if (isr != old) {
+ if (max_isr != old) {
status &= 0xff;
- status |= isr << 8;
+ status |= max_isr << 8;
vmcs_write16(GUEST_INTR_STATUS, status);
}
}
{ "halt_exits", VCPU_STAT(halt_exits) },
{ "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
+ { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
{ "hypercalls", VCPU_STAT(hypercalls) },
{ "request_irq", VCPU_STAT(request_irq_exits) },
vcpu->arch.pv_time_enabled = false;
}
- static void accumulate_steal_time(struct kvm_vcpu *vcpu)
- {
- u64 delta;
-
- if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
- return;
-
- delta = current->sched_info.run_delay - vcpu->arch.st.last_steal;
- vcpu->arch.st.last_steal = current->sched_info.run_delay;
- vcpu->arch.st.accum_steal = delta;
- }
-
static void record_steal_time(struct kvm_vcpu *vcpu)
{
- accumulate_steal_time(vcpu);
-
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
return;
- vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal;
- vcpu->arch.st.steal.version += 2;
- vcpu->arch.st.accum_steal = 0;
+ if (vcpu->arch.st.steal.version & 1)
+ vcpu->arch.st.steal.version += 1; /* first time write, random junk */
+
+ vcpu->arch.st.steal.version += 1;
+
+ kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+ &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
+
+ smp_wmb();
+
+ vcpu->arch.st.steal.steal += current->sched_info.run_delay -
+ vcpu->arch.st.last_steal;
+ vcpu->arch.st.last_steal = current->sched_info.run_delay;
+
+ kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+ &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
+
+ smp_wmb();
+
+ vcpu->arch.st.steal.version += 1;
kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
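/*
 * Illustrative sketch, not part of the patch: the version field above works
 * like a seqcount - odd while the host is updating the record, even when it
 * is stable - so a guest-side reader (hypothetical helper below) retries if
 * it observes an odd version or a version change across its read.
 */
#include <stdint.h>

struct example_steal_time {
	uint64_t steal;
	uint32_t version;
};

static uint64_t example_read_steal(const volatile struct example_steal_time *st)
{
	uint32_t version;
	uint64_t steal;

	do {
		version = st->version;
		__sync_synchronize();	/* pairs with the host's smp_wmb() */
		steal = st->steal;
		__sync_synchronize();
	} while ((version & 1) || version != st->version);

	return steal;
}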
r = KVM_MAX_MCE_BANKS;
break;
case KVM_CAP_XCRS:
- r = cpu_has_xsave;
+ r = boot_cpu_has(X86_FEATURE_XSAVE);
break;
case KVM_CAP_TSC_CONTROL:
r = kvm_has_tsc_control;
/* Set XSTATE_BV and possibly XCOMP_BV. */
xsave->header.xfeatures = xstate_bv;
- if (cpu_has_xsaves)
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED;
/*
static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
struct kvm_xsave *guest_xsave)
{
- if (cpu_has_xsave) {
+ if (boot_cpu_has(X86_FEATURE_XSAVE)) {
memset(guest_xsave, 0, sizeof(struct kvm_xsave));
fill_xsave((u8 *) guest_xsave->region, vcpu);
} else {
u64 xstate_bv =
*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
- if (cpu_has_xsave) {
+ if (boot_cpu_has(X86_FEATURE_XSAVE)) {
/*
* Here we allow setting states that are not present in
* CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility
static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
struct kvm_xcrs *guest_xcrs)
{
- if (!cpu_has_xsave) {
+ if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
guest_xcrs->nr_xcrs = 0;
return;
}
{
int i, r = 0;
- if (!cpu_has_xsave)
+ if (!boot_cpu_has(X86_FEATURE_XSAVE))
return -EINVAL;
if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
perf_register_guest_info_callbacks(&kvm_guest_cbs);
- if (cpu_has_xsave)
+ if (boot_cpu_has(X86_FEATURE_XSAVE))
host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
kvm_lapic_init();
static void fx_init(struct kvm_vcpu *vcpu)
{
fpstate_init(&vcpu->arch.guest_fpu.state);
- if (cpu_has_xsaves)
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv =
host_xcr0 | XSTATE_COMPACTION_ENABLED;
kvm_page_track_init(kvm);
kvm_mmu_init_vm(kvm);
+ if (kvm_x86_ops->vm_init)
+ return kvm_x86_ops->vm_init(kvm);
+
return 0;
}
x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 0, 0);
x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
}
+ if (kvm_x86_ops->vm_destroy)
+ kvm_x86_ops->vm_destroy(kvm);
kvm_iommu_unmap_guest(kvm);
kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic);
}
EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
+ bool kvm_arch_has_irq_bypass(void)
+ {
+ return kvm_x86_ops->update_pi_irte != NULL;
+ }
+
int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
struct irq_bypass_producer *prod)
{
struct kvm_kernel_irqfd *irqfd =
container_of(cons, struct kvm_kernel_irqfd, consumer);
- if (kvm_x86_ops->update_pi_irte) {
- irqfd->producer = prod;
- return kvm_x86_ops->update_pi_irte(irqfd->kvm,
- prod->irq, irqfd->gsi, 1);
- }
+ irqfd->producer = prod;
- return -EINVAL;
+ return kvm_x86_ops->update_pi_irte(irqfd->kvm,
+ prod->irq, irqfd->gsi, 1);
}
void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
struct kvm_kernel_irqfd *irqfd =
container_of(cons, struct kvm_kernel_irqfd, consumer);
- if (!kvm_x86_ops->update_pi_irte) {
- WARN_ON(irqfd->producer != NULL);
- return;
- }
-
WARN_ON(irqfd->producer != prod);
irqfd->producer = NULL;
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
#include "irq-gic-common.h"
+ static const struct gic_kvm_info *gic_kvm_info;
+
+ const struct gic_kvm_info *gic_get_kvm_info(void)
+ {
+ return gic_kvm_info;
+ }
+
+ void gic_set_kvm_info(const struct gic_kvm_info *info)
+ {
+ BUG_ON(gic_kvm_info != NULL);
+ gic_kvm_info = info;
+ }
+
void gic_enable_quirks(u32 iidr, const struct gic_quirk *quirks,
void *data)
{
else if (type & IRQ_TYPE_EDGE_BOTH)
val |= confmask;
+ /* If the current configuration is the same, then we are done */
+ if (val == oldval)
+ return 0;
+
/*
* Write back the new configuration, and possibly re-enable
- * the interrupt. If we tried to write a new configuration and failed,
- * return an error.
+ * the interrupt. If we fail to write a new configuration for
+ * an SPI then WARN and return an error. If we fail to write the
+ * configuration for a PPI this is most likely because the GIC
+ * does not allow us to set the configuration or we are in a
+ * non-secure mode, and hence it may not be catastrophic.
*/
writel_relaxed(val, base + GIC_DIST_CONFIG + confoff);
- if (readl_relaxed(base + GIC_DIST_CONFIG + confoff) != val && val != oldval)
- ret = -EINVAL;
+ if (readl_relaxed(base + GIC_DIST_CONFIG + confoff) != val) {
+ if (WARN_ON(irq >= 32))
+ ret = -EINVAL;
+ else
+ pr_warn("GIC: PPI%d is secure or misconfigured\n",
+ irq - 16);
+ }
if (sync_access)
sync_access();
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+ #define pr_fmt(fmt) "GICv3: " fmt
+
#include <linux/acpi.h>
#include <linux/cpu.h>
#include <linux/cpu_pm.h>
#include <linux/slab.h>
#include <linux/irqchip.h>
+ #include <linux/irqchip/arm-gic-common.h>
#include <linux/irqchip/arm-gic-v3.h>
+#include <linux/irqchip/irq-partition-percpu.h>
#include <asm/cputype.h>
#include <asm/exception.h>
};
struct gic_chip_data {
+ struct fwnode_handle *fwnode;
void __iomem *dist_base;
struct redist_region *redist_regions;
struct rdists rdists;
u64 redist_stride;
u32 nr_redist_regions;
unsigned int irq_nr;
+ struct partition_desc *ppi_descs[16];
};
static struct gic_chip_data gic_data __read_mostly;
static struct static_key supports_deactivate = STATIC_KEY_INIT_TRUE;
+ static struct gic_kvm_info gic_v3_kvm_info;
+
#define gic_data_rdist() (this_cpu_ptr(gic_data.rdists.rdist))
#define gic_data_rdist_rd_base() (gic_data_rdist()->rd_base)
#define gic_data_rdist_sgi_base() (gic_data_rdist_rd_base() + SZ_64K)
if (static_key_true(&supports_deactivate))
gic_write_dir(irqnr);
#ifdef CONFIG_SMP
+ /*
+ * Unlike GICv2, we don't need an smp_rmb() here.
+ * The control dependency from gic_read_iar to
+ * the ISB in gic_write_eoir is enough to ensure
+ * that any shared data read by handle_IPI will
+ * be read after the ACK.
+ */
handle_IPI(irqnr, regs);
#else
WARN_ONCE(true, "Unexpected SGI received!\n");
writel_relaxed(0, base + GICD_CTLR);
gic_dist_wait_for_rwp();
+ /*
+ * Configure SPIs as non-secure Group-1. This will only matter
+ * if the GIC only has a single security state. This will not
+ * do the right thing if the kernel is running in secure mode,
+ * but that's not the intended use case anyway.
+ */
+ for (i = 32; i < gic_data.irq_nr; i += 32)
+ writel_relaxed(~0, base + GICD_IGROUPR + i / 8);
+
gic_dist_config(base, gic_data.irq_nr, gic_dist_wait_for_rwp);
/* Enable distributor with ARE, Group1 */
rbase = gic_data_rdist_sgi_base();
+ /* Configure SGIs/PPIs as non-secure Group-1 */
+ writel_relaxed(~0, rbase + GICR_IGROUPR0);
+
gic_cpu_config(rbase, gic_redist_wait_for_rwp);
/* Give LPIs a spin */
}
}
+static int gic_irq_domain_select(struct irq_domain *d,
+ struct irq_fwspec *fwspec,
+ enum irq_domain_bus_token bus_token)
+{
+ /* Not for us */
+ if (fwspec->fwnode != d->fwnode)
+ return 0;
+
+ /* If this is not DT, then we have a single domain */
+ if (!is_of_node(fwspec->fwnode))
+ return 1;
+
+ /*
+ * If this is a PPI and we have a 4th (non-null) parameter,
+ * then we need to match the partition domain.
+ */
+ if (fwspec->param_count >= 4 &&
+ fwspec->param[0] == 1 && fwspec->param[3] != 0)
+ return d == partition_get_domain(gic_data.ppi_descs[fwspec->param[1]]);
+
+ return d == gic_data.domain;
+}
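/*
 * Illustrative sketch, not part of the patch: a 4-cell interrupt specifier
 * as gic_irq_domain_select() above would see it in the fwspec. Cell 0
 * selects a PPI (1), cell 1 is the PPI number, cell 2 the trigger type and
 * a non-zero cell 3 is the phandle of the ppi-partition node to match.
 * The concrete values are made up for illustration.
 */
static const unsigned int example_ppi_partition_fwspec[4] = {
	1,	/* GIC_PPI */
	9,	/* PPI 9, e.g. a per-partition maintenance interrupt */
	4,	/* IRQ_TYPE_LEVEL_HIGH */
	0x42,	/* phandle of the matching ppi-partition child node */
};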
+
static const struct irq_domain_ops gic_irq_domain_ops = {
.translate = gic_irq_domain_translate,
.alloc = gic_irq_domain_alloc,
.free = gic_irq_domain_free,
+ .select = gic_irq_domain_select,
+};
+
+static int partition_domain_translate(struct irq_domain *d,
+ struct irq_fwspec *fwspec,
+ unsigned long *hwirq,
+ unsigned int *type)
+{
+ struct device_node *np;
+ int ret;
+
+ np = of_find_node_by_phandle(fwspec->param[3]);
+ if (WARN_ON(!np))
+ return -EINVAL;
+
+ ret = partition_translate_id(gic_data.ppi_descs[fwspec->param[1]],
+ of_node_to_fwnode(np));
+ if (ret < 0)
+ return ret;
+
+ *hwirq = ret;
+ *type = fwspec->param[2] & IRQ_TYPE_SENSE_MASK;
+
+ return 0;
+}
+
+static const struct irq_domain_ops partition_domain_ops = {
+ .translate = partition_domain_translate,
+ .select = gic_irq_domain_select,
};
static void gicv3_enable_quirks(void)
if (static_key_true(&supports_deactivate))
pr_info("GIC: Using split EOI/Deactivate mode\n");
+ gic_data.fwnode = handle;
gic_data.dist_base = dist_base;
gic_data.redist_regions = rdist_regs;
gic_data.nr_redist_regions = nr_redist_regions;
return 0;
}
- static void gic_populate_ppi_partitions(struct device_node *gic_node)
+static int get_cpu_number(struct device_node *dn)
+{
+ const __be32 *cell;
+ u64 hwid;
+ int i;
+
+ cell = of_get_property(dn, "reg", NULL);
+ if (!cell)
+ return -1;
+
+ hwid = of_read_number(cell, of_n_addr_cells(dn));
+
+ /*
+ * Non affinity bits must be set to 0 in the DT
+ */
+ if (hwid & ~MPIDR_HWID_BITMASK)
+ return -1;
+
+ for (i = 0; i < num_possible_cpus(); i++)
+ if (cpu_logical_map(i) == hwid)
+ return i;
+
+ return -1;
+}
+
+/* Create all possible partitions at boot time */
+static void __init gic_populate_ppi_partitions(struct device_node *gic_node)
+{
+ struct device_node *parts_node, *child_part;
+ int part_idx = 0, i;
+ int nr_parts;
+ struct partition_affinity *parts;
+
+ parts_node = of_find_node_by_name(gic_node, "ppi-partitions");
+ if (!parts_node)
+ return;
+
+ nr_parts = of_get_child_count(parts_node);
+
+ if (!nr_parts)
+ return;
+
+ parts = kzalloc(sizeof(*parts) * nr_parts, GFP_KERNEL);
+ if (WARN_ON(!parts))
+ return;
+
+ for_each_child_of_node(parts_node, child_part) {
+ struct partition_affinity *part;
+ int n;
+
+ part = &parts[part_idx];
+
+ part->partition_id = of_node_to_fwnode(child_part);
+
+ pr_info("GIC: PPI partition %s[%d] { ",
+ child_part->name, part_idx);
+
+ n = of_property_count_elems_of_size(child_part, "affinity",
+ sizeof(u32));
+ WARN_ON(n <= 0);
+
+ for (i = 0; i < n; i++) {
+ int err, cpu;
+ u32 cpu_phandle;
+ struct device_node *cpu_node;
+
+ err = of_property_read_u32_index(child_part, "affinity",
+ i, &cpu_phandle);
+ if (WARN_ON(err))
+ continue;
+
+ cpu_node = of_find_node_by_phandle(cpu_phandle);
+ if (WARN_ON(!cpu_node))
+ continue;
+
+ cpu = get_cpu_number(cpu_node);
+ if (WARN_ON(cpu == -1))
+ continue;
+
+ pr_cont("%s[%d] ", cpu_node->full_name, cpu);
+
+ cpumask_set_cpu(cpu, &part->mask);
+ }
+
+ pr_cont("}\n");
+ part_idx++;
+ }
+
+ for (i = 0; i < 16; i++) {
+ unsigned int irq;
+ struct partition_desc *desc;
+ struct irq_fwspec ppi_fwspec = {
+ .fwnode = gic_data.fwnode,
+ .param_count = 3,
+ .param = {
+ [0] = 1,
+ [1] = i,
+ [2] = IRQ_TYPE_NONE,
+ },
+ };
+
+ irq = irq_create_fwspec_mapping(&ppi_fwspec);
+ if (WARN_ON(!irq))
+ continue;
+ desc = partition_create_desc(gic_data.fwnode, parts, nr_parts,
+ irq, &partition_domain_ops);
+ if (WARN_ON(!desc))
+ continue;
+
+ gic_data.ppi_descs[i] = desc;
+ }
+}
+
+ static void __init gic_of_setup_kvm_info(struct device_node *node)
+ {
+ int ret;
+ struct resource r;
+ u32 gicv_idx;
+
+ gic_v3_kvm_info.type = GIC_V3;
+
+ gic_v3_kvm_info.maint_irq = irq_of_parse_and_map(node, 0);
+ if (!gic_v3_kvm_info.maint_irq)
+ return;
+
+ if (of_property_read_u32(node, "#redistributor-regions",
+ &gicv_idx))
+ gicv_idx = 1;
+
+ gicv_idx += 3; /* Also skip GICD, GICC, GICH */
+ ret = of_address_to_resource(node, gicv_idx, &r);
+ if (!ret)
+ gic_v3_kvm_info.vcpu = r;
+
+ gic_set_kvm_info(&gic_v3_kvm_info);
+ }
+
static int __init gic_of_init(struct device_node *node, struct device_node *parent)
{
void __iomem *dist_base;
err = gic_init_bases(dist_base, rdist_regs, nr_redist_regions,
redist_stride, &node->fwnode);
- if (!err) {
- gic_of_setup_kvm_info(node);
- return 0;
- }
+ if (err)
+ goto out_unmap_rdist;
+
+ gic_populate_ppi_partitions(node);
+ gic_of_setup_kvm_info(node);
+ return 0;
out_unmap_rdist:
for (i = 0; i < nr_redist_regions; i++)
IRQCHIP_DECLARE(gic_v3, "arm,gic-v3", gic_of_init);
#ifdef CONFIG_ACPI
- static void __iomem *dist_base;
- static struct redist_region *redist_regs __initdata;
- static u32 nr_redist_regions __initdata;
- static bool single_redist;
+ static struct
+ {
+ void __iomem *dist_base;
+ struct redist_region *redist_regs;
+ u32 nr_redist_regions;
+ bool single_redist;
+ u32 maint_irq;
+ int maint_irq_mode;
+ phys_addr_t vcpu_base;
+ } acpi_data __initdata;
static void __init
gic_acpi_register_redist(phys_addr_t phys_base, void __iomem *redist_base)
{
static int count = 0;
- redist_regs[count].phys_base = phys_base;
- redist_regs[count].redist_base = redist_base;
- redist_regs[count].single_redist = single_redist;
+ acpi_data.redist_regs[count].phys_base = phys_base;
+ acpi_data.redist_regs[count].redist_base = redist_base;
+ acpi_data.redist_regs[count].single_redist = acpi_data.single_redist;
count++;
}
{
struct acpi_madt_generic_interrupt *gicc =
(struct acpi_madt_generic_interrupt *)header;
- u32 reg = readl_relaxed(dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK;
+ u32 reg = readl_relaxed(acpi_data.dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK;
u32 size = reg == GIC_PIDR2_ARCH_GICv4 ? SZ_64K * 4 : SZ_64K * 2;
void __iomem *redist_base;
acpi_tbl_entry_handler redist_parser;
enum acpi_madt_type type;
- if (single_redist) {
+ if (acpi_data.single_redist) {
type = ACPI_MADT_TYPE_GENERIC_INTERRUPT;
redist_parser = gic_acpi_parse_madt_gicc;
} else {
count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR,
gic_acpi_match_gicr, 0);
if (count > 0) {
- single_redist = false;
+ acpi_data.single_redist = false;
return count;
}
count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT,
gic_acpi_match_gicc, 0);
if (count > 0)
- single_redist = true;
+ acpi_data.single_redist = true;
return count;
}
if (count <= 0)
return false;
- nr_redist_regions = count;
+ acpi_data.nr_redist_regions = count;
return true;
}
+ static int __init gic_acpi_parse_virt_madt_gicc(struct acpi_subtable_header *header,
+ const unsigned long end)
+ {
+ struct acpi_madt_generic_interrupt *gicc =
+ (struct acpi_madt_generic_interrupt *)header;
+ int maint_irq_mode;
+ static int first_madt = true;
+
+ /* Skip unusable CPUs */
+ if (!(gicc->flags & ACPI_MADT_ENABLED))
+ return 0;
+
+ maint_irq_mode = (gicc->flags & ACPI_MADT_VGIC_IRQ_MODE) ?
+ ACPI_EDGE_SENSITIVE : ACPI_LEVEL_SENSITIVE;
+
+ if (first_madt) {
+ first_madt = false;
+
+ acpi_data.maint_irq = gicc->vgic_interrupt;
+ acpi_data.maint_irq_mode = maint_irq_mode;
+ acpi_data.vcpu_base = gicc->gicv_base_address;
+
+ return 0;
+ }
+
+ /*
+ * The maintenance interrupt and GICV should be the same for every CPU
+ */
+ if ((acpi_data.maint_irq != gicc->vgic_interrupt) ||
+ (acpi_data.maint_irq_mode != maint_irq_mode) ||
+ (acpi_data.vcpu_base != gicc->gicv_base_address))
+ return -EINVAL;
+
+ return 0;
+ }
+
+ static bool __init gic_acpi_collect_virt_info(void)
+ {
+ int count;
+
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT,
+ gic_acpi_parse_virt_madt_gicc, 0);
+
+ return (count > 0);
+ }
+
#define ACPI_GICV3_DIST_MEM_SIZE (SZ_64K)
+ #define ACPI_GICV2_VCTRL_MEM_SIZE (SZ_4K)
+ #define ACPI_GICV2_VCPU_MEM_SIZE (SZ_8K)
+
+ static void __init gic_acpi_setup_kvm_info(void)
+ {
+ int irq;
+
+ if (!gic_acpi_collect_virt_info()) {
+ pr_warn("Unable to get hardware information used for virtualization\n");
+ return;
+ }
+
+ gic_v3_kvm_info.type = GIC_V3;
+
+ irq = acpi_register_gsi(NULL, acpi_data.maint_irq,
+ acpi_data.maint_irq_mode,
+ ACPI_ACTIVE_HIGH);
+ if (irq <= 0)
+ return;
+
+ gic_v3_kvm_info.maint_irq = irq;
+
+ if (acpi_data.vcpu_base) {
+ struct resource *vcpu = &gic_v3_kvm_info.vcpu;
+
+ vcpu->flags = IORESOURCE_MEM;
+ vcpu->start = acpi_data.vcpu_base;
+ vcpu->end = vcpu->start + ACPI_GICV2_VCPU_MEM_SIZE - 1;
+ }
+
+ gic_set_kvm_info(&gic_v3_kvm_info);
+ }
static int __init
gic_acpi_init(struct acpi_subtable_header *header, const unsigned long end)
{
struct acpi_madt_generic_distributor *dist;
struct fwnode_handle *domain_handle;
+ size_t size;
int i, err;
/* Get distributor base address */
dist = (struct acpi_madt_generic_distributor *)header;
- dist_base = ioremap(dist->base_address, ACPI_GICV3_DIST_MEM_SIZE);
- if (!dist_base) {
+ acpi_data.dist_base = ioremap(dist->base_address,
+ ACPI_GICV3_DIST_MEM_SIZE);
+ if (!acpi_data.dist_base) {
pr_err("Unable to map GICD registers\n");
return -ENOMEM;
}
- err = gic_validate_dist_version(dist_base);
+ err = gic_validate_dist_version(acpi_data.dist_base);
if (err) {
- pr_err("No distributor detected at @%p, giving up", dist_base);
+ pr_err("No distributor detected at @%p, giving up",
+ acpi_data.dist_base);
goto out_dist_unmap;
}
- redist_regs = kzalloc(sizeof(*redist_regs) * nr_redist_regions,
- GFP_KERNEL);
- if (!redist_regs) {
+ size = sizeof(*acpi_data.redist_regs) * acpi_data.nr_redist_regions;
+ acpi_data.redist_regs = kzalloc(size, GFP_KERNEL);
+ if (!acpi_data.redist_regs) {
err = -ENOMEM;
goto out_dist_unmap;
}
if (err)
goto out_redist_unmap;
- domain_handle = irq_domain_alloc_fwnode(dist_base);
+ domain_handle = irq_domain_alloc_fwnode(acpi_data.dist_base);
if (!domain_handle) {
err = -ENOMEM;
goto out_redist_unmap;
}
- err = gic_init_bases(dist_base, redist_regs, nr_redist_regions, 0,
- domain_handle);
+ err = gic_init_bases(acpi_data.dist_base, acpi_data.redist_regs,
+ acpi_data.nr_redist_regions, 0, domain_handle);
if (err)
goto out_fwhandle_free;
acpi_set_irq_model(ACPI_IRQ_MODEL_GIC, domain_handle);
+ gic_acpi_setup_kvm_info();
+
return 0;
out_fwhandle_free:
irq_domain_free_fwnode(domain_handle);
out_redist_unmap:
- for (i = 0; i < nr_redist_regions; i++)
- if (redist_regs[i].redist_base)
- iounmap(redist_regs[i].redist_base);
- kfree(redist_regs);
+ for (i = 0; i < acpi_data.nr_redist_regions; i++)
+ if (acpi_data.redist_regs[i].redist_base)
+ iounmap(acpi_data.redist_regs[i].redist_base);
+ kfree(acpi_data.redist_regs);
out_dist_unmap:
- iounmap(dist_base);
+ iounmap(acpi_data.dist_base);
return err;
}
IRQCHIP_ACPI_DECLARE(gic_v3, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR,
static void gic_check_cpu_features(void)
{
- WARN_TAINT_ONCE(cpus_have_cap(ARM64_HAS_SYSREG_GIC_CPUIF),
+ WARN_TAINT_ONCE(this_cpu_has_cap(ARM64_HAS_SYSREG_GIC_CPUIF),
TAINT_CPU_OUT_OF_SPEC,
"GICv3 system registers enabled, broken firmware!\n");
}
struct irq_chip chip;
union gic_base dist_base;
union gic_base cpu_base;
+ void __iomem *raw_dist_base;
+ void __iomem *raw_cpu_base;
+ u32 percpu_offset;
#ifdef CONFIG_CPU_PM
u32 saved_spi_enable[DIV_ROUND_UP(1020, 32)];
u32 saved_spi_active[DIV_ROUND_UP(1020, 32)];
static struct gic_chip_data gic_data[CONFIG_ARM_GIC_MAX_NR] __read_mostly;
+ static struct gic_kvm_info gic_v2_kvm_info;
+
#ifdef CONFIG_GIC_NON_BANKED
static void __iomem *gic_get_percpu_base(union gic_base *base)
{
if (static_key_true(&supports_deactivate))
writel_relaxed(irqstat, cpu_base + GIC_CPU_DEACTIVATE);
#ifdef CONFIG_SMP
+ /*
+ * Ensure any shared data written by the CPU sending
+ * the IPI is read after we've read the ACK register
+ * on the GIC.
+ *
+ * Pairs with the write barrier in gic_raise_softirq
+ */
+ smp_rmb();
handle_IPI(irqnr, regs);
#endif
continue;
IRQCHIP_MASK_ON_SUSPEND,
};
-static struct irq_chip gic_eoimode1_chip = {
- .name = "GICv2",
- .irq_mask = gic_eoimode1_mask_irq,
- .irq_unmask = gic_unmask_irq,
- .irq_eoi = gic_eoimode1_eoi_irq,
- .irq_set_type = gic_set_type,
- .irq_get_irqchip_state = gic_irq_get_irqchip_state,
- .irq_set_irqchip_state = gic_irq_set_irqchip_state,
- .irq_set_vcpu_affinity = gic_irq_set_vcpu_affinity,
- .flags = IRQCHIP_SET_TYPE_MASKED |
- IRQCHIP_SKIP_SET_WAKE |
- IRQCHIP_MASK_ON_SUSPEND,
-};
-
void __init gic_cascade_irq(unsigned int gic_nr, unsigned int irq)
{
BUG_ON(gic_nr >= CONFIG_ARM_GIC_MAX_NR);
writel_relaxed(GICD_ENABLE, base + GIC_DIST_CTRL);
}
-static void gic_cpu_init(struct gic_chip_data *gic)
+static int gic_cpu_init(struct gic_chip_data *gic)
{
void __iomem *dist_base = gic_data_dist_base(gic);
void __iomem *base = gic_data_cpu_base(gic);
/*
* Get what the GIC says our CPU mask is.
*/
- BUG_ON(cpu >= NR_GIC_CPU_IF);
+ if (WARN_ON(cpu >= NR_GIC_CPU_IF))
+ return -EINVAL;
+
+ gic_check_cpu_features();
cpu_mask = gic_get_cpumask(gic);
gic_cpu_map[cpu] = cpu_mask;
writel_relaxed(GICC_INT_PRI_THRESHOLD, base + GIC_CPU_PRIMASK);
gic_cpu_if_up(gic);
+
+ return 0;
}
int gic_cpu_if_down(unsigned int gic_nr)
* this function, no interrupts will be delivered by the GIC, and another
* platform-specific wakeup source must be enabled.
*/
-static void gic_dist_save(unsigned int gic_nr)
+static void gic_dist_save(struct gic_chip_data *gic)
{
unsigned int gic_irqs;
void __iomem *dist_base;
int i;
- BUG_ON(gic_nr >= CONFIG_ARM_GIC_MAX_NR);
+ if (WARN_ON(!gic))
+ return;
- gic_irqs = gic_data[gic_nr].gic_irqs;
- dist_base = gic_data_dist_base(&gic_data[gic_nr]);
+ gic_irqs = gic->gic_irqs;
+ dist_base = gic_data_dist_base(gic);
if (!dist_base)
return;
for (i = 0; i < DIV_ROUND_UP(gic_irqs, 16); i++)
- gic_data[gic_nr].saved_spi_conf[i] =
+ gic->saved_spi_conf[i] =
readl_relaxed(dist_base + GIC_DIST_CONFIG + i * 4);
for (i = 0; i < DIV_ROUND_UP(gic_irqs, 4); i++)
- gic_data[gic_nr].saved_spi_target[i] =
+ gic->saved_spi_target[i] =
readl_relaxed(dist_base + GIC_DIST_TARGET + i * 4);
for (i = 0; i < DIV_ROUND_UP(gic_irqs, 32); i++)
- gic_data[gic_nr].saved_spi_enable[i] =
+ gic->saved_spi_enable[i] =
readl_relaxed(dist_base + GIC_DIST_ENABLE_SET + i * 4);
for (i = 0; i < DIV_ROUND_UP(gic_irqs, 32); i++)
- gic_data[gic_nr].saved_spi_active[i] =
+ gic->saved_spi_active[i] =
readl_relaxed(dist_base + GIC_DIST_ACTIVE_SET + i * 4);
}
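The divisors in the loops above reflect the GICv2 distributor register layout: GIC_DIST_CONFIG holds 2 bits per interrupt (16 interrupts per 32-bit register), GIC_DIST_TARGET 8 bits (4 per register), and the enable/active set registers 1 bit (32 per register). A standalone sketch of that arithmetic, with an example interrupt count chosen arbitrarily:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int gic_irqs = 160;	/* example SPI count, not from real hardware */

	/* 2 config bits per irq  -> 16 irqs per 32-bit GIC_DIST_CONFIG register */
	printf("config registers: %u\n", DIV_ROUND_UP(gic_irqs, 16));
	/* 8 target bits per irq  ->  4 irqs per 32-bit GIC_DIST_TARGET register */
	printf("target registers: %u\n", DIV_ROUND_UP(gic_irqs, 4));
	/* 1 enable/active bit    -> 32 irqs per 32-bit set/clear register */
	printf("enable registers: %u\n", DIV_ROUND_UP(gic_irqs, 32));
	return 0;
}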
* handled normally, but any edge interrupts that occurred will not be seen by
* the GIC and need to be handled by the platform-specific wakeup source.
*/
-static void gic_dist_restore(unsigned int gic_nr)
+static void gic_dist_restore(struct gic_chip_data *gic)
{
unsigned int gic_irqs;
unsigned int i;
void __iomem *dist_base;
- BUG_ON(gic_nr >= CONFIG_ARM_GIC_MAX_NR);
+ if (WARN_ON(!gic))
+ return;
- gic_irqs = gic_data[gic_nr].gic_irqs;
- dist_base = gic_data_dist_base(&gic_data[gic_nr]);
+ gic_irqs = gic->gic_irqs;
+ dist_base = gic_data_dist_base(gic);
if (!dist_base)
return;
writel_relaxed(GICD_DISABLE, dist_base + GIC_DIST_CTRL);
for (i = 0; i < DIV_ROUND_UP(gic_irqs, 16); i++)
- writel_relaxed(gic_data[gic_nr].saved_spi_conf[i],
+ writel_relaxed(gic->saved_spi_conf[i],
dist_base + GIC_DIST_CONFIG + i * 4);
for (i = 0; i < DIV_ROUND_UP(gic_irqs, 4); i++)
dist_base + GIC_DIST_PRI + i * 4);
for (i = 0; i < DIV_ROUND_UP(gic_irqs, 4); i++)
- writel_relaxed(gic_data[gic_nr].saved_spi_target[i],
+ writel_relaxed(gic->saved_spi_target[i],
dist_base + GIC_DIST_TARGET + i * 4);
for (i = 0; i < DIV_ROUND_UP(gic_irqs, 32); i++) {
writel_relaxed(GICD_INT_EN_CLR_X32,
dist_base + GIC_DIST_ENABLE_CLEAR + i * 4);
- writel_relaxed(gic_data[gic_nr].saved_spi_enable[i],
+ writel_relaxed(gic->saved_spi_enable[i],
dist_base + GIC_DIST_ENABLE_SET + i * 4);
}
for (i = 0; i < DIV_ROUND_UP(gic_irqs, 32); i++) {
writel_relaxed(GICD_INT_EN_CLR_X32,
dist_base + GIC_DIST_ACTIVE_CLEAR + i * 4);
- writel_relaxed(gic_data[gic_nr].saved_spi_active[i],
+ writel_relaxed(gic->saved_spi_active[i],
dist_base + GIC_DIST_ACTIVE_SET + i * 4);
}
writel_relaxed(GICD_ENABLE, dist_base + GIC_DIST_CTRL);
}
-static void gic_cpu_save(unsigned int gic_nr)
+static void gic_cpu_save(struct gic_chip_data *gic)
{
int i;
u32 *ptr;
void __iomem *dist_base;
void __iomem *cpu_base;
- BUG_ON(gic_nr >= CONFIG_ARM_GIC_MAX_NR);
+ if (WARN_ON(!gic))
+ return;
- dist_base = gic_data_dist_base(&gic_data[gic_nr]);
- cpu_base = gic_data_cpu_base(&gic_data[gic_nr]);
+ dist_base = gic_data_dist_base(gic);
+ cpu_base = gic_data_cpu_base(gic);
if (!dist_base || !cpu_base)
return;
- ptr = raw_cpu_ptr(gic_data[gic_nr].saved_ppi_enable);
+ ptr = raw_cpu_ptr(gic->saved_ppi_enable);
for (i = 0; i < DIV_ROUND_UP(32, 32); i++)
ptr[i] = readl_relaxed(dist_base + GIC_DIST_ENABLE_SET + i * 4);
- ptr = raw_cpu_ptr(gic_data[gic_nr].saved_ppi_active);
+ ptr = raw_cpu_ptr(gic->saved_ppi_active);
for (i = 0; i < DIV_ROUND_UP(32, 32); i++)
ptr[i] = readl_relaxed(dist_base + GIC_DIST_ACTIVE_SET + i * 4);
- ptr = raw_cpu_ptr(gic_data[gic_nr].saved_ppi_conf);
+ ptr = raw_cpu_ptr(gic->saved_ppi_conf);
for (i = 0; i < DIV_ROUND_UP(32, 16); i++)
ptr[i] = readl_relaxed(dist_base + GIC_DIST_CONFIG + i * 4);
}
-static void gic_cpu_restore(unsigned int gic_nr)
+static void gic_cpu_restore(struct gic_chip_data *gic)
{
int i;
u32 *ptr;
void __iomem *dist_base;
void __iomem *cpu_base;
- BUG_ON(gic_nr >= CONFIG_ARM_GIC_MAX_NR);
+ if (WARN_ON(!gic))
+ return;
- dist_base = gic_data_dist_base(&gic_data[gic_nr]);
- cpu_base = gic_data_cpu_base(&gic_data[gic_nr]);
+ dist_base = gic_data_dist_base(gic);
+ cpu_base = gic_data_cpu_base(gic);
if (!dist_base || !cpu_base)
return;
- ptr = raw_cpu_ptr(gic_data[gic_nr].saved_ppi_enable);
+ ptr = raw_cpu_ptr(gic->saved_ppi_enable);
for (i = 0; i < DIV_ROUND_UP(32, 32); i++) {
writel_relaxed(GICD_INT_EN_CLR_X32,
dist_base + GIC_DIST_ENABLE_CLEAR + i * 4);
writel_relaxed(ptr[i], dist_base + GIC_DIST_ENABLE_SET + i * 4);
}
- ptr = raw_cpu_ptr(gic_data[gic_nr].saved_ppi_active);
+ ptr = raw_cpu_ptr(gic->saved_ppi_active);
for (i = 0; i < DIV_ROUND_UP(32, 32); i++) {
writel_relaxed(GICD_INT_EN_CLR_X32,
dist_base + GIC_DIST_ACTIVE_CLEAR + i * 4);
writel_relaxed(ptr[i], dist_base + GIC_DIST_ACTIVE_SET + i * 4);
}
- ptr = raw_cpu_ptr(gic_data[gic_nr].saved_ppi_conf);
+ ptr = raw_cpu_ptr(gic->saved_ppi_conf);
for (i = 0; i < DIV_ROUND_UP(32, 16); i++)
writel_relaxed(ptr[i], dist_base + GIC_DIST_CONFIG + i * 4);
dist_base + GIC_DIST_PRI + i * 4);
writel_relaxed(GICC_INT_PRI_THRESHOLD, cpu_base + GIC_CPU_PRIMASK);
- gic_cpu_if_up(&gic_data[gic_nr]);
+ gic_cpu_if_up(gic);
}
static int gic_notifier(struct notifier_block *self, unsigned long cmd, void *v)
#endif
switch (cmd) {
case CPU_PM_ENTER:
- gic_cpu_save(i);
+ gic_cpu_save(&gic_data[i]);
break;
case CPU_PM_ENTER_FAILED:
case CPU_PM_EXIT:
- gic_cpu_restore(i);
+ gic_cpu_restore(&gic_data[i]);
break;
case CPU_CLUSTER_PM_ENTER:
- gic_dist_save(i);
+ gic_dist_save(&gic_data[i]);
break;
case CPU_CLUSTER_PM_ENTER_FAILED:
case CPU_CLUSTER_PM_EXIT:
- gic_dist_restore(i);
+ gic_dist_restore(&gic_data[i]);
break;
}
}
.notifier_call = gic_notifier,
};
-static void __init gic_pm_init(struct gic_chip_data *gic)
+static int __init gic_pm_init(struct gic_chip_data *gic)
{
gic->saved_ppi_enable = __alloc_percpu(DIV_ROUND_UP(32, 32) * 4,
sizeof(u32));
- BUG_ON(!gic->saved_ppi_enable);
+ if (WARN_ON(!gic->saved_ppi_enable))
+ return -ENOMEM;
gic->saved_ppi_active = __alloc_percpu(DIV_ROUND_UP(32, 32) * 4,
sizeof(u32));
- BUG_ON(!gic->saved_ppi_active);
+ if (WARN_ON(!gic->saved_ppi_active))
+ goto free_ppi_enable;
gic->saved_ppi_conf = __alloc_percpu(DIV_ROUND_UP(32, 16) * 4,
sizeof(u32));
- BUG_ON(!gic->saved_ppi_conf);
+ if (WARN_ON(!gic->saved_ppi_conf))
+ goto free_ppi_active;
if (gic == &gic_data[0])
cpu_pm_register_notifier(&gic_notifier_block);
+
+ return 0;
+
+free_ppi_active:
+ free_percpu(gic->saved_ppi_active);
+free_ppi_enable:
+ free_percpu(gic->saved_ppi_enable);
+
+ return -ENOMEM;
}
#else
-static void __init gic_pm_init(struct gic_chip_data *gic)
+static int __init gic_pm_init(struct gic_chip_data *gic)
{
+ return 0;
}
#endif
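The BUG_ON()s in gic_pm_init() above become WARN_ON() plus the usual unwind-on-error idiom: a failed allocation jumps past its own cleanup label, so only the allocations that actually succeeded are freed. A minimal userspace sketch of the same pattern (illustrative only, names invented):

#include <errno.h>
#include <stdlib.h>

struct state { void *a, *b, *c; };

static int setup(struct state *s)
{
	s->a = malloc(32);
	if (!s->a)
		return -ENOMEM;

	s->b = malloc(32);
	if (!s->b)
		goto free_a;

	s->c = malloc(32);
	if (!s->c)
		goto free_b;

	return 0;

free_b:
	free(s->b);
free_a:
	free(s->a);
	return -ENOMEM;
}

int main(void)
{
	struct state s = { 0 };

	if (setup(&s))
		return 1;
	free(s.c);
	free(s.b);
	free(s.a);
	return 0;
}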
.unmap = gic_irq_domain_unmap,
};
-static void __init __gic_init_bases(unsigned int gic_nr, int irq_start,
- void __iomem *dist_base, void __iomem *cpu_base,
- u32 percpu_offset, struct fwnode_handle *handle)
+static int __init __gic_init_bases(struct gic_chip_data *gic, int irq_start,
+ struct fwnode_handle *handle)
{
irq_hw_number_t hwirq_base;
- struct gic_chip_data *gic;
- int gic_irqs, irq_base, i;
-
- BUG_ON(gic_nr >= CONFIG_ARM_GIC_MAX_NR);
+ int gic_irqs, irq_base, i, ret;
- gic_check_cpu_features();
-
- gic = &gic_data[gic_nr];
+ if (WARN_ON(!gic || gic->domain))
+ return -EINVAL;
/* Initialize irq_chip */
- if (static_key_true(&supports_deactivate) && gic_nr == 0) {
- gic->chip = gic_eoimode1_chip;
+ gic->chip = gic_chip;
+
+ if (static_key_true(&supports_deactivate) && gic == &gic_data[0]) {
+ gic->chip.irq_mask = gic_eoimode1_mask_irq;
+ gic->chip.irq_eoi = gic_eoimode1_eoi_irq;
+ gic->chip.irq_set_vcpu_affinity = gic_irq_set_vcpu_affinity;
+ gic->chip.name = kasprintf(GFP_KERNEL, "GICv2");
} else {
- gic->chip = gic_chip;
- gic->chip.name = kasprintf(GFP_KERNEL, "GIC-%d", gic_nr);
+ gic->chip.name = kasprintf(GFP_KERNEL, "GIC-%d",
+ (int)(gic - &gic_data[0]));
}
#ifdef CONFIG_SMP
- if (gic_nr == 0)
+ if (gic == &gic_data[0])
gic->chip.irq_set_affinity = gic_set_affinity;
#endif
-#ifdef CONFIG_GIC_NON_BANKED
- if (percpu_offset) { /* Frankein-GIC without banked registers... */
+ if (IS_ENABLED(CONFIG_GIC_NON_BANKED) && gic->percpu_offset) {
+ /* Frankein-GIC without banked registers... */
unsigned int cpu;
gic->dist_base.percpu_base = alloc_percpu(void __iomem *);
gic->cpu_base.percpu_base = alloc_percpu(void __iomem *);
if (WARN_ON(!gic->dist_base.percpu_base ||
!gic->cpu_base.percpu_base)) {
- free_percpu(gic->dist_base.percpu_base);
- free_percpu(gic->cpu_base.percpu_base);
- return;
+ ret = -ENOMEM;
+ goto error;
}
for_each_possible_cpu(cpu) {
u32 mpidr = cpu_logical_map(cpu);
u32 core_id = MPIDR_AFFINITY_LEVEL(mpidr, 0);
- unsigned long offset = percpu_offset * core_id;
- *per_cpu_ptr(gic->dist_base.percpu_base, cpu) = dist_base + offset;
- *per_cpu_ptr(gic->cpu_base.percpu_base, cpu) = cpu_base + offset;
+ unsigned long offset = gic->percpu_offset * core_id;
+ *per_cpu_ptr(gic->dist_base.percpu_base, cpu) =
+ gic->raw_dist_base + offset;
+ *per_cpu_ptr(gic->cpu_base.percpu_base, cpu) =
+ gic->raw_cpu_base + offset;
}
gic_set_base_accessor(gic, gic_get_percpu_base);
- } else
-#endif
- { /* Normal, sane GIC... */
- WARN(percpu_offset,
+ } else {
+ /* Normal, sane GIC... */
+ WARN(gic->percpu_offset,
"GIC_NON_BANKED not enabled, ignoring %08x offset!",
- percpu_offset);
- gic->dist_base.common_base = dist_base;
- gic->cpu_base.common_base = cpu_base;
+ gic->percpu_offset);
+ gic->dist_base.common_base = gic->raw_dist_base;
+ gic->cpu_base.common_base = gic->raw_cpu_base;
gic_set_base_accessor(gic, gic_get_common_base);
}
* For primary GICs, skip over SGIs.
* For secondary GICs, skip over PPIs, too.
*/
- if (gic_nr == 0 && (irq_start & 31) > 0) {
+ if (gic == &gic_data[0] && (irq_start & 31) > 0) {
hwirq_base = 16;
if (irq_start != -1)
irq_start = (irq_start & ~31) + 16;
hwirq_base, &gic_irq_domain_ops, gic);
}
- if (WARN_ON(!gic->domain))
- return;
+ if (WARN_ON(!gic->domain)) {
+ ret = -ENODEV;
+ goto error;
+ }
- if (gic_nr == 0) {
+ if (gic == &gic_data[0]) {
/*
* Initialize the CPU interface map to all CPUs.
* It will be refined as each CPU probes its ID.
}
gic_dist_init(gic);
- gic_cpu_init(gic);
- gic_pm_init(gic);
+ ret = gic_cpu_init(gic);
+ if (ret)
+ goto error;
+
+ ret = gic_pm_init(gic);
+ if (ret)
+ goto error;
+
+ return 0;
+
+error:
+ if (IS_ENABLED(CONFIG_GIC_NON_BANKED) && gic->percpu_offset) {
+ free_percpu(gic->dist_base.percpu_base);
+ free_percpu(gic->cpu_base.percpu_base);
+ }
+
+ kfree(gic->chip.name);
+
+ return ret;
}
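In the non-banked ("Frankein-GIC") branch of __gic_init_bases() above, each CPU's distributor and CPU-interface addresses are derived from a single mapping plus percpu_offset scaled by the CPU's affinity-level-0 ID. A standalone sketch of that address arithmetic (all values are invented examples; the loop index stands in for MPIDR Aff0):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uintptr_t raw_dist_base = 0x2c001000;	/* example address, not real hardware */
	uintptr_t raw_cpu_base  = 0x2c002000;
	uint32_t  percpu_offset = 0x8000;	/* from the "cpu-offset" DT property */

	for (unsigned int core_id = 0; core_id < 4; core_id++) {
		uintptr_t offset = (uintptr_t)percpu_offset * core_id;

		printf("core%u: dist %#lx cpu %#lx\n", core_id,
		       (unsigned long)(raw_dist_base + offset),
		       (unsigned long)(raw_cpu_base + offset));
	}
	return 0;
}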
void __init gic_init(unsigned int gic_nr, int irq_start,
void __iomem *dist_base, void __iomem *cpu_base)
{
+ struct gic_chip_data *gic;
+
+ if (WARN_ON(gic_nr >= CONFIG_ARM_GIC_MAX_NR))
+ return;
+
/*
* Non-DT/ACPI systems won't run a hypervisor, so let's not
* bother with these...
*/
static_key_slow_dec(&supports_deactivate);
- __gic_init_bases(gic_nr, irq_start, dist_base, cpu_base, 0, NULL);
+
+ gic = &gic_data[gic_nr];
+ gic->raw_dist_base = dist_base;
+ gic->raw_cpu_base = cpu_base;
+
+ __gic_init_bases(gic, irq_start, NULL);
+}
+
+static void gic_teardown(struct gic_chip_data *gic)
+{
+ if (WARN_ON(!gic))
+ return;
+
+ if (gic->raw_dist_base)
+ iounmap(gic->raw_dist_base);
+ if (gic->raw_cpu_base)
+ iounmap(gic->raw_cpu_base);
}
#ifdef CONFIG_OF
return true;
}
+static int __init gic_of_setup(struct gic_chip_data *gic, struct device_node *node)
+{
+ if (!gic || !node)
+ return -EINVAL;
+
+ gic->raw_dist_base = of_iomap(node, 0);
+ if (WARN(!gic->raw_dist_base, "unable to map gic dist registers\n"))
+ goto error;
+
+ gic->raw_cpu_base = of_iomap(node, 1);
+ if (WARN(!gic->raw_cpu_base, "unable to map gic cpu registers\n"))
+ goto error;
+
+ if (of_property_read_u32(node, "cpu-offset", &gic->percpu_offset))
+ gic->percpu_offset = 0;
+
+ return 0;
+
+error:
+ gic_teardown(gic);
+
+ return -ENOMEM;
+}
+
+ static void __init gic_of_setup_kvm_info(struct device_node *node)
+ {
+ int ret;
+ struct resource *vctrl_res = &gic_v2_kvm_info.vctrl;
+ struct resource *vcpu_res = &gic_v2_kvm_info.vcpu;
+
+ gic_v2_kvm_info.type = GIC_V2;
+
+ gic_v2_kvm_info.maint_irq = irq_of_parse_and_map(node, 0);
+ if (!gic_v2_kvm_info.maint_irq)
+ return;
+
+ ret = of_address_to_resource(node, 2, vctrl_res);
+ if (ret)
+ return;
+
+ ret = of_address_to_resource(node, 3, vcpu_res);
+ if (ret)
+ return;
+
+ gic_set_kvm_info(&gic_v2_kvm_info);
+ }
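gic_of_setup_kvm_info() above follows the standard arm,gic devicetree binding: reg entry 2 is the virtual interface control block (GICH), reg entry 3 the virtual CPU interface (GICV), and interrupt 0 the maintenance interrupt; if any of them is missing, no KVM info is published. As a hedged sketch (the function below is invented for illustration and is not part of this patch; the header path is assumed), a consumer of gic_set_kvm_info() might read the structure back like this:

#include <linux/irqchip/arm-gic-common.h>
#include <linux/printk.h>

/* Illustrative only: dump the fields populated by the code above. */
static void dump_gic_v2_kvm_info(const struct gic_kvm_info *info)
{
	if (info->type != GIC_V2 || !info->maint_irq)
		return;

	pr_info("GICH %pR, GICV %pR, maintenance IRQ %u\n",
		&info->vctrl, &info->vcpu, info->maint_irq);
}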
+
int __init
gic_of_init(struct device_node *node, struct device_node *parent)
{
- void __iomem *cpu_base;
- void __iomem *dist_base;
- u32 percpu_offset;
- int irq;
+ struct gic_chip_data *gic;
+ int irq, ret;
if (WARN_ON(!node))
return -ENODEV;
- dist_base = of_iomap(node, 0);
- WARN(!dist_base, "unable to map gic dist registers\n");
+ if (WARN_ON(gic_cnt >= CONFIG_ARM_GIC_MAX_NR))
+ return -EINVAL;
+
+ gic = &gic_data[gic_cnt];
- cpu_base = of_iomap(node, 1);
- WARN(!cpu_base, "unable to map gic cpu registers\n");
+ ret = gic_of_setup(gic, node);
+ if (ret)
+ return ret;
/*
* Disable split EOI/Deactivate if either HYP is not available
* or the CPU interface is too small.
*/
- if (gic_cnt == 0 && !gic_check_eoimode(node, &cpu_base))
+ if (gic_cnt == 0 && !gic_check_eoimode(node, &gic->raw_cpu_base))
static_key_slow_dec(&supports_deactivate);
- if (of_property_read_u32(node, "cpu-offset", &percpu_offset))
- percpu_offset = 0;
+ ret = __gic_init_bases(gic, -1, &node->fwnode);
+ if (ret) {
+ gic_teardown(gic);
+ return ret;
+ }
- __gic_init_bases(gic_cnt, -1, dist_base, cpu_base, percpu_offset,
- &node->fwnode);
- if (!gic_cnt)
+ if (!gic_cnt) {
gic_init_physaddr(node);
+ gic_of_setup_kvm_info(node);
+ }
if (parent) {
irq = irq_of_parse_and_map(node, 0);
#endif
#ifdef CONFIG_ACPI
- static phys_addr_t cpu_phy_base __initdata;
+ static struct
+ {
+ phys_addr_t cpu_phys_base;
+ u32 maint_irq;
+ int maint_irq_mode;
+ phys_addr_t vctrl_base;
+ phys_addr_t vcpu_base;
+ } acpi_data __initdata;
static int __init
gic_acpi_parse_madt_cpu(struct acpi_subtable_header *header,
* All CPU interface addresses have to be the same.
*/
gic_cpu_base = processor->base_address;
- if (cpu_base_assigned && gic_cpu_base != cpu_phy_base)
+ if (cpu_base_assigned && gic_cpu_base != acpi_data.cpu_phys_base)
return -EINVAL;
- cpu_phy_base = gic_cpu_base;
+ acpi_data.cpu_phys_base = gic_cpu_base;
+ acpi_data.maint_irq = processor->vgic_interrupt;
+ acpi_data.maint_irq_mode = (processor->flags & ACPI_MADT_VGIC_IRQ_MODE) ?
+ ACPI_EDGE_SENSITIVE : ACPI_LEVEL_SENSITIVE;
+ acpi_data.vctrl_base = processor->gich_base_address;
+ acpi_data.vcpu_base = processor->gicv_base_address;
+
cpu_base_assigned = 1;
return 0;
}
#define ACPI_GICV2_DIST_MEM_SIZE (SZ_4K)
#define ACPI_GIC_CPU_IF_MEM_SIZE (SZ_8K)
+ #define ACPI_GICV2_VCTRL_MEM_SIZE (SZ_4K)
+ #define ACPI_GICV2_VCPU_MEM_SIZE (SZ_8K)
+
+ static void __init gic_acpi_setup_kvm_info(void)
+ {
+ int irq;
+ struct resource *vctrl_res = &gic_v2_kvm_info.vctrl;
+ struct resource *vcpu_res = &gic_v2_kvm_info.vcpu;
+
+ gic_v2_kvm_info.type = GIC_V2;
+
+ if (!acpi_data.vctrl_base)
+ return;
+
+ vctrl_res->flags = IORESOURCE_MEM;
+ vctrl_res->start = acpi_data.vctrl_base;
+ vctrl_res->end = vctrl_res->start + ACPI_GICV2_VCTRL_MEM_SIZE - 1;
+
+ if (!acpi_data.vcpu_base)
+ return;
+
+ vcpu_res->flags = IORESOURCE_MEM;
+ vcpu_res->start = acpi_data.vcpu_base;
+ vcpu_res->end = vcpu_res->start + ACPI_GICV2_VCPU_MEM_SIZE - 1;
+
+ irq = acpi_register_gsi(NULL, acpi_data.maint_irq,
+ acpi_data.maint_irq_mode,
+ ACPI_ACTIVE_HIGH);
+ if (irq <= 0)
+ return;
+
+ gic_v2_kvm_info.maint_irq = irq;
+
+ gic_set_kvm_info(&gic_v2_kvm_info);
+ }
static int __init gic_v2_acpi_init(struct acpi_subtable_header *header,
const unsigned long end)
{
struct acpi_madt_generic_distributor *dist;
- void __iomem *cpu_base, *dist_base;
struct fwnode_handle *domain_handle;
- int count;
+ struct gic_chip_data *gic = &gic_data[0];
+ int count, ret;
/* Collect CPU base addresses */
count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT,
return -EINVAL;
}
- cpu_base = ioremap(cpu_phy_base, ACPI_GIC_CPU_IF_MEM_SIZE);
- if (!cpu_base) {
+ gic->raw_cpu_base = ioremap(acpi_data.cpu_phys_base, ACPI_GIC_CPU_IF_MEM_SIZE);
+ if (!gic->raw_cpu_base) {
pr_err("Unable to map GICC registers\n");
return -ENOMEM;
}
dist = (struct acpi_madt_generic_distributor *)header;
- dist_base = ioremap(dist->base_address, ACPI_GICV2_DIST_MEM_SIZE);
- if (!dist_base) {
+ gic->raw_dist_base = ioremap(dist->base_address,
+ ACPI_GICV2_DIST_MEM_SIZE);
+ if (!gic->raw_dist_base) {
pr_err("Unable to map GICD registers\n");
- iounmap(cpu_base);
+ gic_teardown(gic);
return -ENOMEM;
}
/*
* Initialize GIC instance zero (no multi-GIC support).
*/
- domain_handle = irq_domain_alloc_fwnode(dist_base);
+ domain_handle = irq_domain_alloc_fwnode(gic->raw_dist_base);
if (!domain_handle) {
pr_err("Unable to allocate domain handle\n");
- iounmap(cpu_base);
- iounmap(dist_base);
+ gic_teardown(gic);
return -ENOMEM;
}
- __gic_init_bases(0, -1, dist_base, cpu_base, 0, domain_handle);
+ ret = __gic_init_bases(gic, -1, domain_handle);
+ if (ret) {
+ pr_err("Failed to initialise GIC\n");
+ irq_domain_free_fwnode(domain_handle);
+ gic_teardown(gic);
+ return ret;
+ }
acpi_set_irq_model(ACPI_IRQ_MODEL_GIC, domain_handle);
if (IS_ENABLED(CONFIG_ARM_GIC_V2M))
gicv2m_init(NULL, gic_data[0].domain);
+ gic_acpi_setup_kvm_info();
+
return 0;
}
IRQCHIP_ACPI_DECLARE(gic_v2, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR,