Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 19 May 2016 18:27:09 +0000 (11:27 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 19 May 2016 18:27:09 +0000 (11:27 -0700)
Pull KVM updates from Paolo Bonzini:
 "Small release overall.

  x86:
   - miscellaneous fixes
   - AVIC support (local APIC virtualization, AMD version)

  s390:
   - polling for interrupts after a VCPU goes to halted state is now
     enabled for s390
   - use hardware provided information about facility bits that do not
     need any hypervisor activity, and other fixes for cpu models and
     facilities
   - improve perf output
   - floating interrupt controller improvements.

  MIPS:
   - miscellaneous fixes

  PPC:
   - bugfixes only

  ARM:
   - 16K page size support
   - generic firmware probing layer for timer and GIC

  Christoffer Dall (KVM-ARM maintainer) says:
    "There are a few changes in this pull request touching things
     outside KVM, but they should all carry the necessary acks and it
     made the merge process much easier to do it this way."

  though actually the irqchip maintainers' acks didn't make it into the
  patches.  Marc Zyngier, who is both irqchip and KVM-ARM maintainer,
  later acked at http://mid.gmane.org/573351D1.4060303@arm.com ('more
  formally and for documentation purposes')"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (82 commits)
  KVM: MTRR: remove MSR 0x2f8
  KVM: x86: make hwapic_isr_update and hwapic_irr_update look the same
  svm: Manage vcpu load/unload when enable AVIC
  svm: Do not intercept CR8 when enable AVIC
  svm: Do not expose x2APIC when enable AVIC
  KVM: x86: Introducing kvm_x86_ops.apicv_post_state_restore
  svm: Add VMEXIT handlers for AVIC
  svm: Add interrupt injection via AVIC
  KVM: x86: Detect and Initialize AVIC support
  svm: Introduce new AVIC VMCB registers
  KVM: split kvm_vcpu_wake_up from kvm_vcpu_kick
  KVM: x86: Introducing kvm_x86_ops VCPU blocking/unblocking hooks
  KVM: x86: Introducing kvm_x86_ops VM init/destroy hooks
  KVM: x86: Rename kvm_apic_get_reg to kvm_lapic_get_reg
  KVM: x86: Misc LAPIC changes to expose helper functions
  KVM: shrink halt polling even more for invalid wakeups
  KVM: s390: set halt polling to 80 microseconds
  KVM: halt_polling: provide a way to qualify wakeups during poll
  KVM: PPC: Book3S HV: Re-enable XICS fast path for irqfd-generated interrupts
  kvm: Conditionally register IRQ bypass consumer
  ...

22 files changed:
arch/arm/include/asm/kvm_host.h
arch/arm/include/asm/kvm_mmu.h
arch/arm/kvm/arm.c
arch/arm/kvm/mmu.c
arch/arm64/include/asm/kvm_arm.h
arch/arm64/include/asm/kvm_host.h
arch/arm64/include/asm/kvm_mmu.h
arch/arm64/include/asm/pgtable-hwdef.h
arch/arm64/include/asm/pgtable.h
arch/mips/include/asm/kvm_host.h
arch/mips/kvm/emulate.c
arch/mips/kvm/tlb.c
arch/mips/kvm/trap_emul.c
arch/s390/include/asm/sclp.h
arch/x86/kvm/mmu.c
arch/x86/kvm/svm.c
arch/x86/kvm/trace.h
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
drivers/irqchip/irq-gic-common.c
drivers/irqchip/irq-gic-v3.c
drivers/irqchip/irq-gic.c

diff --combined arch/arm/include/asm/kvm_host.h
index 738d5eee91de0167e9ffa9062cc55b79ed3b4905,4cd8732796ab6ff3cafef1f6b11462d2a25215ea..0df6b1fc965571116ed4ae2366aff451873888e5
@@@ -187,6 -187,7 +187,7 @@@ struct kvm_vm_stat 
  struct kvm_vcpu_stat {
        u32 halt_successful_poll;
        u32 halt_attempted_poll;
+       u32 halt_poll_invalid;
        u32 halt_wakeup;
        u32 hvc_exit_stat;
        u64 wfe_exit_stat;
@@@ -265,15 -266,6 +266,15 @@@ static inline void __cpu_init_stage2(vo
        kvm_call_hyp(__init_stage2_translation);
  }
  
 +static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr,
 +                                      phys_addr_t phys_idmap_start)
 +{
 +      /*
 +       * TODO
 +       * kvm_call_reset(boot_pgd_ptr, phys_idmap_start);
 +       */
 +}
 +
  static inline int kvm_arch_dev_ioctl_check_extension(long ext)
  {
        return 0;
@@@ -286,10 -278,12 +287,11 @@@ void kvm_mmu_wp_memory_region(struct kv
  
  struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
  
 -static inline void kvm_arch_hardware_disable(void) {}
  static inline void kvm_arch_hardware_unsetup(void) {}
  static inline void kvm_arch_sync_events(struct kvm *kvm) {}
  static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
  static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
+ static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
  
  static inline void kvm_arm_init_debug(void) {}
  static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {}
diff --combined arch/arm/include/asm/kvm_mmu.h
index f17a8d41822caf89c896ccf3aa2c839fc96f3e75,ef0b276d97fc1076ac0b157156f41f71944e3ae0..f9a65061130b66a402fb6d2772b44f9321d926e6
@@@ -47,6 -47,7 +47,7 @@@
  #include <linux/highmem.h>
  #include <asm/cacheflush.h>
  #include <asm/pgalloc.h>
+ #include <asm/stage2_pgtable.h>
  
  int create_hyp_mappings(void *from, void *to);
  int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
@@@ -66,7 -67,6 +67,7 @@@ void kvm_mmu_free_memory_caches(struct 
  phys_addr_t kvm_mmu_get_httbr(void);
  phys_addr_t kvm_mmu_get_boot_httbr(void);
  phys_addr_t kvm_get_idmap_vector(void);
 +phys_addr_t kvm_get_idmap_start(void);
  int kvm_mmu_init(void);
  void kvm_clear_hyp_idmap(void);
  
@@@ -106,14 -106,16 +107,16 @@@ static inline void kvm_clean_pte(pte_t 
        clean_pte_table(pte);
  }
  
- static inline void kvm_set_s2pte_writable(pte_t *pte)
+ static inline pte_t kvm_s2pte_mkwrite(pte_t pte)
  {
-       pte_val(*pte) |= L_PTE_S2_RDWR;
+       pte_val(pte) |= L_PTE_S2_RDWR;
+       return pte;
  }
  
- static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
+ static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd)
  {
-       pmd_val(*pmd) |= L_PMD_S2_RDWR;
+       pmd_val(pmd) |= L_PMD_S2_RDWR;
+       return pmd;
  }
  
  static inline void kvm_set_s2pte_readonly(pte_t *pte)
@@@ -136,22 -138,6 +139,6 @@@ static inline bool kvm_s2pmd_readonly(p
        return (pmd_val(*pmd) & L_PMD_S2_RDWR) == L_PMD_S2_RDONLY;
  }
  
- /* Open coded p*d_addr_end that can deal with 64bit addresses */
- #define kvm_pgd_addr_end(addr, end)                                   \
- ({    u64 __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK;            \
-       (__boundary - 1 < (end) - 1)? __boundary: (end);                \
- })
- #define kvm_pud_addr_end(addr,end)            (end)
- #define kvm_pmd_addr_end(addr, end)                                   \
- ({    u64 __boundary = ((addr) + PMD_SIZE) & PMD_MASK;                \
-       (__boundary - 1 < (end) - 1)? __boundary: (end);                \
- })
- #define kvm_pgd_index(addr)                   pgd_index(addr)
  static inline bool kvm_page_empty(void *ptr)
  {
        struct page *ptr_page = virt_to_page(ptr);
  
  #define kvm_pte_table_empty(kvm, ptep) kvm_page_empty(ptep)
  #define kvm_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp)
- #define kvm_pud_table_empty(kvm, pudp) (0)
- #define KVM_PREALLOC_LEVEL    0
+ #define kvm_pud_table_empty(kvm, pudp) false
  
- static inline void *kvm_get_hwpgd(struct kvm *kvm)
- {
-       return kvm->arch.pgd;
- }
- static inline unsigned int kvm_get_hwpgd_size(void)
- {
-       return PTRS_PER_S2_PGD * sizeof(pgd_t);
- }
+ #define hyp_pte_table_empty(ptep) kvm_page_empty(ptep)
+ #define hyp_pmd_table_empty(pmdp) kvm_page_empty(pmdp)
+ #define hyp_pud_table_empty(pudp) false
  
  struct kvm;
  
diff --combined arch/arm/kvm/arm.c
index 9ef013d86cc5c7a5f3924071bbe8044e8d17a783,be4b6394a0620de3037a68a98f364fa941b2c958..237d5d82f0afd6f1749a12f46872dc5170b2c3c6
@@@ -16,6 -16,7 +16,6 @@@
   * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
   */
  
 -#include <linux/cpu.h>
  #include <linux/cpu_pm.h>
  #include <linux/errno.h>
  #include <linux/err.h>
@@@ -65,8 -66,6 +65,8 @@@ static DEFINE_SPINLOCK(kvm_vmid_lock)
  
  static bool vgic_present;
  
 +static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
 +
  static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
  {
        BUG_ON(preemptible());
@@@ -91,6 -90,11 +91,6 @@@ struct kvm_vcpu * __percpu *kvm_get_run
        return &kvm_arm_running_vcpu;
  }
  
 -int kvm_arch_hardware_enable(void)
 -{
 -      return 0;
 -}
 -
  int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
  {
        return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
@@@ -444,7 -448,7 +444,7 @@@ static void update_vttbr(struct kvm *kv
        kvm_next_vmid &= (1 << kvm_vmid_bits) - 1;
  
        /* update vttbr to be used with the new vmid */
-       pgd_phys = virt_to_phys(kvm_get_hwpgd(kvm));
+       pgd_phys = virt_to_phys(kvm->arch.pgd);
        BUG_ON(pgd_phys & ~VTTBR_BADDR_MASK);
        vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits);
        kvm->arch.vttbr = pgd_phys | vmid;
@@@ -1029,6 -1033,11 +1029,6 @@@ long kvm_arch_vm_ioctl(struct file *fil
        }
  }
  
 -static void cpu_init_stage2(void *dummy)
 -{
 -      __cpu_init_stage2();
 -}
 -
  static void cpu_init_hyp_mode(void *dummy)
  {
        phys_addr_t boot_pgd_ptr;
@@@ -1056,87 -1065,43 +1056,87 @@@ static void cpu_hyp_reinit(void
  {
        if (is_kernel_in_hyp_mode()) {
                /*
 -               * cpu_init_stage2() is safe to call even if the PM
 +               * __cpu_init_stage2() is safe to call even if the PM
                 * event was cancelled before the CPU was reset.
                 */
 -              cpu_init_stage2(NULL);
 +              __cpu_init_stage2();
        } else {
                if (__hyp_get_vectors() == hyp_default_vectors)
                        cpu_init_hyp_mode(NULL);
        }
  }
  
 -static int hyp_init_cpu_notify(struct notifier_block *self,
 -                             unsigned long action, void *cpu)
 +static void cpu_hyp_reset(void)
  {
 -      switch (action) {
 -      case CPU_STARTING:
 -      case CPU_STARTING_FROZEN:
 +      phys_addr_t boot_pgd_ptr;
 +      phys_addr_t phys_idmap_start;
 +
 +      if (!is_kernel_in_hyp_mode()) {
 +              boot_pgd_ptr = kvm_mmu_get_boot_httbr();
 +              phys_idmap_start = kvm_get_idmap_start();
 +
 +              __cpu_reset_hyp_mode(boot_pgd_ptr, phys_idmap_start);
 +      }
 +}
 +
 +static void _kvm_arch_hardware_enable(void *discard)
 +{
 +      if (!__this_cpu_read(kvm_arm_hardware_enabled)) {
                cpu_hyp_reinit();
 +              __this_cpu_write(kvm_arm_hardware_enabled, 1);
        }
 +}
  
 -      return NOTIFY_OK;
 +int kvm_arch_hardware_enable(void)
 +{
 +      _kvm_arch_hardware_enable(NULL);
 +      return 0;
  }
  
 -static struct notifier_block hyp_init_cpu_nb = {
 -      .notifier_call = hyp_init_cpu_notify,
 -};
 +static void _kvm_arch_hardware_disable(void *discard)
 +{
 +      if (__this_cpu_read(kvm_arm_hardware_enabled)) {
 +              cpu_hyp_reset();
 +              __this_cpu_write(kvm_arm_hardware_enabled, 0);
 +      }
 +}
 +
 +void kvm_arch_hardware_disable(void)
 +{
 +      _kvm_arch_hardware_disable(NULL);
 +}
  
  #ifdef CONFIG_CPU_PM
  static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
                                    unsigned long cmd,
                                    void *v)
  {
 -      if (cmd == CPU_PM_EXIT) {
 -              cpu_hyp_reinit();
 +      /*
 +       * kvm_arm_hardware_enabled is left with its old value over
 +       * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should
 +       * re-enable hyp.
 +       */
 +      switch (cmd) {
 +      case CPU_PM_ENTER:
 +              if (__this_cpu_read(kvm_arm_hardware_enabled))
 +                      /*
 +                       * don't update kvm_arm_hardware_enabled here
 +                       * so that the hardware will be re-enabled
 +                       * when we resume. See below.
 +                       */
 +                      cpu_hyp_reset();
 +
                return NOTIFY_OK;
 -      }
 +      case CPU_PM_EXIT:
 +              if (__this_cpu_read(kvm_arm_hardware_enabled))
 +                      /* The hardware was enabled before suspend. */
 +                      cpu_hyp_reinit();
  
 -      return NOTIFY_DONE;
 +              return NOTIFY_OK;
 +
 +      default:
 +              return NOTIFY_DONE;
 +      }
  }
  
  static struct notifier_block hyp_init_cpu_pm_nb = {
@@@ -1178,12 -1143,16 +1178,12 @@@ static int init_common_resources(void
  
  static int init_subsystems(void)
  {
 -      int err;
 +      int err = 0;
  
        /*
 -       * Register CPU Hotplug notifier
 +       * Enable hardware so that subsystem initialisation can access EL2.
         */
 -      err = register_cpu_notifier(&hyp_init_cpu_nb);
 -      if (err) {
 -              kvm_err("Cannot register KVM init CPU notifier (%d)\n", err);
 -              return err;
 -      }
 +      on_each_cpu(_kvm_arch_hardware_enable, NULL, 1);
  
        /*
         * Register CPU lower-power notifier
        case -ENODEV:
        case -ENXIO:
                vgic_present = false;
 +              err = 0;
                break;
        default:
 -              return err;
 +              goto out;
        }
  
        /*
         */
        err = kvm_timer_hyp_init();
        if (err)
 -              return err;
 +              goto out;
  
        kvm_perf_init();
        kvm_coproc_table_init();
  
 -      return 0;
 +out:
 +      on_each_cpu(_kvm_arch_hardware_disable, NULL, 1);
 +
 +      return err;
  }
  
  static void teardown_hyp_mode(void)
        free_hyp_pgds();
        for_each_possible_cpu(cpu)
                free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
 -      unregister_cpu_notifier(&hyp_init_cpu_nb);
        hyp_cpu_pm_exit();
  }
  
  static int init_vhe_mode(void)
  {
 -      /*
 -       * Execute the init code on each CPU.
 -       */
 -      on_each_cpu(cpu_init_stage2, NULL, 1);
 -
        /* set size of VMID supported by CPU */
        kvm_vmid_bits = kvm_get_vmid_bits();
        kvm_info("%d-bit VMID\n", kvm_vmid_bits);
@@@ -1324,6 -1295,11 +1324,6 @@@ static int init_hyp_mode(void
                }
        }
  
 -      /*
 -       * Execute the init code on each CPU.
 -       */
 -      on_each_cpu(cpu_init_hyp_mode, NULL, 1);
 -
  #ifndef CONFIG_HOTPLUG_CPU
        free_boot_hyp_pgd();
  #endif
diff --combined arch/arm/kvm/mmu.c
index be302128c5d7f57b4545d592261edd6d0db9f5e7,783e5ff0b32e189c4cd8f4733d9f858b0d0c76cf..45c43aecb8f2f30997015f454d3d85fb25d7ac7a
@@@ -43,11 -43,9 +43,9 @@@ static unsigned long hyp_idmap_start
  static unsigned long hyp_idmap_end;
  static phys_addr_t hyp_idmap_vector;
  
+ #define S2_PGD_SIZE   (PTRS_PER_S2_PGD * sizeof(pgd_t))
  #define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
  
- #define kvm_pmd_huge(_x)      (pmd_huge(_x) || pmd_trans_huge(_x))
- #define kvm_pud_huge(_x)      pud_huge(_x)
  #define KVM_S2PTE_FLAG_IS_IOMAP               (1UL << 0)
  #define KVM_S2_FLAG_LOGGING_ACTIVE    (1UL << 1)
  
@@@ -69,14 -67,7 +67,7 @@@ void kvm_flush_remote_tlbs(struct kvm *
  
  static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
  {
-       /*
-        * This function also gets called when dealing with HYP page
-        * tables. As HYP doesn't have an associated struct kvm (and
-        * the HYP page tables are fairly static), we don't do
-        * anything there.
-        */
-       if (kvm)
-               kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
+       kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
  }
  
  /*
@@@ -115,7 -106,7 +106,7 @@@ static bool kvm_is_device_pfn(unsigned 
   */
  static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
  {
-       if (!kvm_pmd_huge(*pmd))
+       if (!pmd_thp_or_huge(*pmd))
                return;
  
        pmd_clear(pmd);
@@@ -155,29 -146,29 +146,29 @@@ static void *mmu_memory_cache_alloc(str
        return p;
  }
  
- static void clear_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
+ static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
  {
-       pud_t *pud_table __maybe_unused = pud_offset(pgd, 0);
-       pgd_clear(pgd);
+       pud_t *pud_table __maybe_unused = stage2_pud_offset(pgd, 0UL);
+       stage2_pgd_clear(pgd);
        kvm_tlb_flush_vmid_ipa(kvm, addr);
-       pud_free(NULL, pud_table);
+       stage2_pud_free(pud_table);
        put_page(virt_to_page(pgd));
  }
  
- static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
+ static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
  {
-       pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
-       VM_BUG_ON(pud_huge(*pud));
-       pud_clear(pud);
+       pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(pud, 0);
+       VM_BUG_ON(stage2_pud_huge(*pud));
+       stage2_pud_clear(pud);
        kvm_tlb_flush_vmid_ipa(kvm, addr);
-       pmd_free(NULL, pmd_table);
+       stage2_pmd_free(pmd_table);
        put_page(virt_to_page(pud));
  }
  
- static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
+ static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
  {
        pte_t *pte_table = pte_offset_kernel(pmd, 0);
-       VM_BUG_ON(kvm_pmd_huge(*pmd));
+       VM_BUG_ON(pmd_thp_or_huge(*pmd));
        pmd_clear(pmd);
        kvm_tlb_flush_vmid_ipa(kvm, addr);
        pte_free_kernel(NULL, pte_table);
   * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
   * the IO subsystem will never hit in the cache.
   */
- static void unmap_ptes(struct kvm *kvm, pmd_t *pmd,
+ static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd,
                       phys_addr_t addr, phys_addr_t end)
  {
        phys_addr_t start_addr = addr;
                }
        } while (pte++, addr += PAGE_SIZE, addr != end);
  
-       if (kvm_pte_table_empty(kvm, start_pte))
-               clear_pmd_entry(kvm, pmd, start_addr);
+       if (stage2_pte_table_empty(start_pte))
+               clear_stage2_pmd_entry(kvm, pmd, start_addr);
  }
  
- static void unmap_pmds(struct kvm *kvm, pud_t *pud,
+ static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud,
                       phys_addr_t addr, phys_addr_t end)
  {
        phys_addr_t next, start_addr = addr;
        pmd_t *pmd, *start_pmd;
  
-       start_pmd = pmd = pmd_offset(pud, addr);
+       start_pmd = pmd = stage2_pmd_offset(pud, addr);
        do {
-               next = kvm_pmd_addr_end(addr, end);
+               next = stage2_pmd_addr_end(addr, end);
                if (!pmd_none(*pmd)) {
-                       if (kvm_pmd_huge(*pmd)) {
+                       if (pmd_thp_or_huge(*pmd)) {
                                pmd_t old_pmd = *pmd;
  
                                pmd_clear(pmd);
  
                                put_page(virt_to_page(pmd));
                        } else {
-                               unmap_ptes(kvm, pmd, addr, next);
+                               unmap_stage2_ptes(kvm, pmd, addr, next);
                        }
                }
        } while (pmd++, addr = next, addr != end);
  
-       if (kvm_pmd_table_empty(kvm, start_pmd))
-               clear_pud_entry(kvm, pud, start_addr);
+       if (stage2_pmd_table_empty(start_pmd))
+               clear_stage2_pud_entry(kvm, pud, start_addr);
  }
  
- static void unmap_puds(struct kvm *kvm, pgd_t *pgd,
+ static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd,
                       phys_addr_t addr, phys_addr_t end)
  {
        phys_addr_t next, start_addr = addr;
        pud_t *pud, *start_pud;
  
-       start_pud = pud = pud_offset(pgd, addr);
+       start_pud = pud = stage2_pud_offset(pgd, addr);
        do {
-               next = kvm_pud_addr_end(addr, end);
-               if (!pud_none(*pud)) {
-                       if (pud_huge(*pud)) {
+               next = stage2_pud_addr_end(addr, end);
+               if (!stage2_pud_none(*pud)) {
+                       if (stage2_pud_huge(*pud)) {
                                pud_t old_pud = *pud;
  
-                               pud_clear(pud);
+                               stage2_pud_clear(pud);
                                kvm_tlb_flush_vmid_ipa(kvm, addr);
                                kvm_flush_dcache_pud(old_pud);
                                put_page(virt_to_page(pud));
                        } else {
-                               unmap_pmds(kvm, pud, addr, next);
+                               unmap_stage2_pmds(kvm, pud, addr, next);
                        }
                }
        } while (pud++, addr = next, addr != end);
  
-       if (kvm_pud_table_empty(kvm, start_pud))
-               clear_pgd_entry(kvm, pgd, start_addr);
+       if (stage2_pud_table_empty(start_pud))
+               clear_stage2_pgd_entry(kvm, pgd, start_addr);
  }
  
- static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
-                       phys_addr_t start, u64 size)
+ /**
+  * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
+  * @kvm:   The VM pointer
+  * @start: The intermediate physical base address of the range to unmap
+  * @size:  The size of the area to unmap
+  *
+  * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
+  * be called while holding mmu_lock (unless for freeing the stage2 pgd before
+  * destroying the VM), otherwise another faulting VCPU may come in and mess
+  * with things behind our backs.
+  */
+ static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
  {
        pgd_t *pgd;
        phys_addr_t addr = start, end = start + size;
        phys_addr_t next;
  
-       pgd = pgdp + kvm_pgd_index(addr);
+       pgd = kvm->arch.pgd + stage2_pgd_index(addr);
        do {
-               next = kvm_pgd_addr_end(addr, end);
-               if (!pgd_none(*pgd))
-                       unmap_puds(kvm, pgd, addr, next);
+               next = stage2_pgd_addr_end(addr, end);
+               if (!stage2_pgd_none(*pgd))
+                       unmap_stage2_puds(kvm, pgd, addr, next);
        } while (pgd++, addr = next, addr != end);
  }
  
@@@ -322,11 -320,11 +320,11 @@@ static void stage2_flush_pmds(struct kv
        pmd_t *pmd;
        phys_addr_t next;
  
-       pmd = pmd_offset(pud, addr);
+       pmd = stage2_pmd_offset(pud, addr);
        do {
-               next = kvm_pmd_addr_end(addr, end);
+               next = stage2_pmd_addr_end(addr, end);
                if (!pmd_none(*pmd)) {
-                       if (kvm_pmd_huge(*pmd))
+                       if (pmd_thp_or_huge(*pmd))
                                kvm_flush_dcache_pmd(*pmd);
                        else
                                stage2_flush_ptes(kvm, pmd, addr, next);
@@@ -340,11 -338,11 +338,11 @@@ static void stage2_flush_puds(struct kv
        pud_t *pud;
        phys_addr_t next;
  
-       pud = pud_offset(pgd, addr);
+       pud = stage2_pud_offset(pgd, addr);
        do {
-               next = kvm_pud_addr_end(addr, end);
-               if (!pud_none(*pud)) {
-                       if (pud_huge(*pud))
+               next = stage2_pud_addr_end(addr, end);
+               if (!stage2_pud_none(*pud)) {
+                       if (stage2_pud_huge(*pud))
                                kvm_flush_dcache_pud(*pud);
                        else
                                stage2_flush_pmds(kvm, pud, addr, next);
@@@ -360,9 -358,9 +358,9 @@@ static void stage2_flush_memslot(struc
        phys_addr_t next;
        pgd_t *pgd;
  
-       pgd = kvm->arch.pgd + kvm_pgd_index(addr);
+       pgd = kvm->arch.pgd + stage2_pgd_index(addr);
        do {
-               next = kvm_pgd_addr_end(addr, end);
+               next = stage2_pgd_addr_end(addr, end);
                stage2_flush_puds(kvm, pgd, addr, next);
        } while (pgd++, addr = next, addr != end);
  }
@@@ -391,6 -389,100 +389,100 @@@ static void stage2_flush_vm(struct kvm 
        srcu_read_unlock(&kvm->srcu, idx);
  }
  
+ static void clear_hyp_pgd_entry(pgd_t *pgd)
+ {
+       pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL);
+       pgd_clear(pgd);
+       pud_free(NULL, pud_table);
+       put_page(virt_to_page(pgd));
+ }
+ static void clear_hyp_pud_entry(pud_t *pud)
+ {
+       pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
+       VM_BUG_ON(pud_huge(*pud));
+       pud_clear(pud);
+       pmd_free(NULL, pmd_table);
+       put_page(virt_to_page(pud));
+ }
+ static void clear_hyp_pmd_entry(pmd_t *pmd)
+ {
+       pte_t *pte_table = pte_offset_kernel(pmd, 0);
+       VM_BUG_ON(pmd_thp_or_huge(*pmd));
+       pmd_clear(pmd);
+       pte_free_kernel(NULL, pte_table);
+       put_page(virt_to_page(pmd));
+ }
+ static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
+ {
+       pte_t *pte, *start_pte;
+       start_pte = pte = pte_offset_kernel(pmd, addr);
+       do {
+               if (!pte_none(*pte)) {
+                       kvm_set_pte(pte, __pte(0));
+                       put_page(virt_to_page(pte));
+               }
+       } while (pte++, addr += PAGE_SIZE, addr != end);
+       if (hyp_pte_table_empty(start_pte))
+               clear_hyp_pmd_entry(pmd);
+ }
+ static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
+ {
+       phys_addr_t next;
+       pmd_t *pmd, *start_pmd;
+       start_pmd = pmd = pmd_offset(pud, addr);
+       do {
+               next = pmd_addr_end(addr, end);
+               /* Hyp doesn't use huge pmds */
+               if (!pmd_none(*pmd))
+                       unmap_hyp_ptes(pmd, addr, next);
+       } while (pmd++, addr = next, addr != end);
+       if (hyp_pmd_table_empty(start_pmd))
+               clear_hyp_pud_entry(pud);
+ }
+ static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
+ {
+       phys_addr_t next;
+       pud_t *pud, *start_pud;
+       start_pud = pud = pud_offset(pgd, addr);
+       do {
+               next = pud_addr_end(addr, end);
+               /* Hyp doesn't use huge puds */
+               if (!pud_none(*pud))
+                       unmap_hyp_pmds(pud, addr, next);
+       } while (pud++, addr = next, addr != end);
+       if (hyp_pud_table_empty(start_pud))
+               clear_hyp_pgd_entry(pgd);
+ }
+ static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
+ {
+       pgd_t *pgd;
+       phys_addr_t addr = start, end = start + size;
+       phys_addr_t next;
+       /*
+        * We don't unmap anything from HYP, except at the hyp tear down.
+        * Hence, we don't have to invalidate the TLBs here.
+        */
+       pgd = pgdp + pgd_index(addr);
+       do {
+               next = pgd_addr_end(addr, end);
+               if (!pgd_none(*pgd))
+                       unmap_hyp_puds(pgd, addr, next);
+       } while (pgd++, addr = next, addr != end);
+ }
  /**
   * free_boot_hyp_pgd - free HYP boot page tables
   *
@@@ -401,14 -493,14 +493,14 @@@ void free_boot_hyp_pgd(void
        mutex_lock(&kvm_hyp_pgd_mutex);
  
        if (boot_hyp_pgd) {
-               unmap_range(NULL, boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
-               unmap_range(NULL, boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
+               unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
+               unmap_hyp_range(boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
                free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
                boot_hyp_pgd = NULL;
        }
  
        if (hyp_pgd)
-               unmap_range(NULL, hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
+               unmap_hyp_range(hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
  
        mutex_unlock(&kvm_hyp_pgd_mutex);
  }
@@@ -433,9 -525,9 +525,9 @@@ void free_hyp_pgds(void
  
        if (hyp_pgd) {
                for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
-                       unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
+                       unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
                for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
-                       unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
+                       unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
  
                free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
                hyp_pgd = NULL;
@@@ -645,20 -737,6 +737,6 @@@ int create_hyp_io_mappings(void *from, 
                                     __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE);
  }
  
- /* Free the HW pgd, one page at a time */
- static void kvm_free_hwpgd(void *hwpgd)
- {
-       free_pages_exact(hwpgd, kvm_get_hwpgd_size());
- }
- /* Allocate the HW PGD, making sure that each page gets its own refcount */
- static void *kvm_alloc_hwpgd(void)
- {
-       unsigned int size = kvm_get_hwpgd_size();
-       return alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
- }
  /**
   * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
   * @kvm:      The KVM struct pointer for the VM.
  int kvm_alloc_stage2_pgd(struct kvm *kvm)
  {
        pgd_t *pgd;
-       void *hwpgd;
  
        if (kvm->arch.pgd != NULL) {
                kvm_err("kvm_arch already initialized?\n");
                return -EINVAL;
        }
  
-       hwpgd = kvm_alloc_hwpgd();
-       if (!hwpgd)
+       /* Allocate the HW PGD, making sure that each page gets its own refcount */
+       pgd = alloc_pages_exact(S2_PGD_SIZE, GFP_KERNEL | __GFP_ZERO);
+       if (!pgd)
                return -ENOMEM;
  
-       /* When the kernel uses more levels of page tables than the
-        * guest, we allocate a fake PGD and pre-populate it to point
-        * to the next-level page table, which will be the real
-        * initial page table pointed to by the VTTBR.
-        *
-        * When KVM_PREALLOC_LEVEL==2, we allocate a single page for
-        * the PMD and the kernel will use folded pud.
-        * When KVM_PREALLOC_LEVEL==1, we allocate 2 consecutive PUD
-        * pages.
-        */
-       if (KVM_PREALLOC_LEVEL > 0) {
-               int i;
-               /*
-                * Allocate fake pgd for the page table manipulation macros to
-                * work.  This is not used by the hardware and we have no
-                * alignment requirement for this allocation.
-                */
-               pgd = kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t),
-                               GFP_KERNEL | __GFP_ZERO);
-               if (!pgd) {
-                       kvm_free_hwpgd(hwpgd);
-                       return -ENOMEM;
-               }
-               /* Plug the HW PGD into the fake one. */
-               for (i = 0; i < PTRS_PER_S2_PGD; i++) {
-                       if (KVM_PREALLOC_LEVEL == 1)
-                               pgd_populate(NULL, pgd + i,
-                                            (pud_t *)hwpgd + i * PTRS_PER_PUD);
-                       else if (KVM_PREALLOC_LEVEL == 2)
-                               pud_populate(NULL, pud_offset(pgd, 0) + i,
-                                            (pmd_t *)hwpgd + i * PTRS_PER_PMD);
-               }
-       } else {
-               /*
-                * Allocate actual first-level Stage-2 page table used by the
-                * hardware for Stage-2 page table walks.
-                */
-               pgd = (pgd_t *)hwpgd;
-       }
        kvm_clean_pgd(pgd);
        kvm->arch.pgd = pgd;
        return 0;
  }
  
- /**
-  * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
-  * @kvm:   The VM pointer
-  * @start: The intermediate physical base address of the range to unmap
-  * @size:  The size of the area to unmap
-  *
-  * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
-  * be called while holding mmu_lock (unless for freeing the stage2 pgd before
-  * destroying the VM), otherwise another faulting VCPU may come in and mess
-  * with things behind our backs.
-  */
- static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
- {
-       unmap_range(kvm, kvm->arch.pgd, start, size);
- }
  static void stage2_unmap_memslot(struct kvm *kvm,
                                 struct kvm_memory_slot *memslot)
  {
@@@ -830,10 -849,8 +849,8 @@@ void kvm_free_stage2_pgd(struct kvm *kv
                return;
  
        unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE);
-       kvm_free_hwpgd(kvm_get_hwpgd(kvm));
-       if (KVM_PREALLOC_LEVEL > 0)
-               kfree(kvm->arch.pgd);
+       /* Free the HW pgd, one page at a time */
+       free_pages_exact(kvm->arch.pgd, S2_PGD_SIZE);
        kvm->arch.pgd = NULL;
  }
  
@@@ -843,16 -860,16 +860,16 @@@ static pud_t *stage2_get_pud(struct kv
        pgd_t *pgd;
        pud_t *pud;
  
-       pgd = kvm->arch.pgd + kvm_pgd_index(addr);
-       if (WARN_ON(pgd_none(*pgd))) {
+       pgd = kvm->arch.pgd + stage2_pgd_index(addr);
+       if (WARN_ON(stage2_pgd_none(*pgd))) {
                if (!cache)
                        return NULL;
                pud = mmu_memory_cache_alloc(cache);
-               pgd_populate(NULL, pgd, pud);
+               stage2_pgd_populate(pgd, pud);
                get_page(virt_to_page(pgd));
        }
  
-       return pud_offset(pgd, addr);
+       return stage2_pud_offset(pgd, addr);
  }
  
  static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
        pmd_t *pmd;
  
        pud = stage2_get_pud(kvm, cache, addr);
-       if (pud_none(*pud)) {
+       if (stage2_pud_none(*pud)) {
                if (!cache)
                        return NULL;
                pmd = mmu_memory_cache_alloc(cache);
-               pud_populate(NULL, pud, pmd);
+               stage2_pud_populate(pud, pmd);
                get_page(virt_to_page(pud));
        }
  
-       return pmd_offset(pud, addr);
+       return stage2_pmd_offset(pud, addr);
  }
  
  static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
        VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd));
  
        old_pmd = *pmd;
-       kvm_set_pmd(pmd, *new_pmd);
-       if (pmd_present(old_pmd))
+       if (pmd_present(old_pmd)) {
+               pmd_clear(pmd);
                kvm_tlb_flush_vmid_ipa(kvm, addr);
-       else
+       } else {
                get_page(virt_to_page(pmd));
+       }
+       kvm_set_pmd(pmd, *new_pmd);
        return 0;
  }
  
@@@ -946,14 -966,37 +966,37 @@@ static int stage2_set_pte(struct kvm *k
  
        /* Create 2nd stage page table mapping - Level 3 */
        old_pte = *pte;
-       kvm_set_pte(pte, *new_pte);
-       if (pte_present(old_pte))
+       if (pte_present(old_pte)) {
+               kvm_set_pte(pte, __pte(0));
                kvm_tlb_flush_vmid_ipa(kvm, addr);
-       else
+       } else {
                get_page(virt_to_page(pte));
+       }
+       kvm_set_pte(pte, *new_pte);
+       return 0;
+ }
  
+ #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+ static int stage2_ptep_test_and_clear_young(pte_t *pte)
+ {
+       if (pte_young(*pte)) {
+               *pte = pte_mkold(*pte);
+               return 1;
+       }
        return 0;
  }
+ #else
+ static int stage2_ptep_test_and_clear_young(pte_t *pte)
+ {
+       return __ptep_test_and_clear_young(pte);
+ }
+ #endif
+ static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
+ {
+       return stage2_ptep_test_and_clear_young((pte_t *)pmd);
+ }
  
  /**
   * kvm_phys_addr_ioremap - map a device range to guest IPA
@@@ -978,7 -1021,7 +1021,7 @@@ int kvm_phys_addr_ioremap(struct kvm *k
                pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE);
  
                if (writable)
-                       kvm_set_s2pte_writable(&pte);
+                       pte = kvm_s2pte_mkwrite(pte);
  
                ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES,
                                                KVM_NR_MEM_OBJS);
@@@ -1004,7 -1047,7 +1047,7 @@@ static bool transparent_hugepage_adjust
        kvm_pfn_t pfn = *pfnp;
        gfn_t gfn = *ipap >> PAGE_SHIFT;
  
 -      if (PageTransCompound(pfn_to_page(pfn))) {
 +      if (PageTransCompoundMap(pfn_to_page(pfn))) {
                unsigned long mask;
                /*
                 * The address we faulted on is backed by a transparent huge
@@@ -1078,12 -1121,12 +1121,12 @@@ static void stage2_wp_pmds(pud_t *pud, 
        pmd_t *pmd;
        phys_addr_t next;
  
-       pmd = pmd_offset(pud, addr);
+       pmd = stage2_pmd_offset(pud, addr);
  
        do {
-               next = kvm_pmd_addr_end(addr, end);
+               next = stage2_pmd_addr_end(addr, end);
                if (!pmd_none(*pmd)) {
-                       if (kvm_pmd_huge(*pmd)) {
+                       if (pmd_thp_or_huge(*pmd)) {
                                if (!kvm_s2pmd_readonly(pmd))
                                        kvm_set_s2pmd_readonly(pmd);
                        } else {
@@@ -1106,12 -1149,12 +1149,12 @@@ static void  stage2_wp_puds(pgd_t *pgd
        pud_t *pud;
        phys_addr_t next;
  
-       pud = pud_offset(pgd, addr);
+       pud = stage2_pud_offset(pgd, addr);
        do {
-               next = kvm_pud_addr_end(addr, end);
-               if (!pud_none(*pud)) {
+               next = stage2_pud_addr_end(addr, end);
+               if (!stage2_pud_none(*pud)) {
                        /* TODO:PUD not supported, revisit later if supported */
-                       BUG_ON(kvm_pud_huge(*pud));
+                       BUG_ON(stage2_pud_huge(*pud));
                        stage2_wp_pmds(pud, addr, next);
                }
        } while (pud++, addr = next, addr != end);
@@@ -1128,7 -1171,7 +1171,7 @@@ static void stage2_wp_range(struct kvm 
        pgd_t *pgd;
        phys_addr_t next;
  
-       pgd = kvm->arch.pgd + kvm_pgd_index(addr);
+       pgd = kvm->arch.pgd + stage2_pgd_index(addr);
        do {
                /*
                 * Release kvm_mmu_lock periodically if the memory region is
                if (need_resched() || spin_needbreak(&kvm->mmu_lock))
                        cond_resched_lock(&kvm->mmu_lock);
  
-               next = kvm_pgd_addr_end(addr, end);
-               if (pgd_present(*pgd))
+               next = stage2_pgd_addr_end(addr, end);
+               if (stage2_pgd_present(*pgd))
                        stage2_wp_puds(pgd, addr, next);
        } while (pgd++, addr = next, addr != end);
  }
@@@ -1320,7 -1363,7 +1363,7 @@@ static int user_mem_abort(struct kvm_vc
                pmd_t new_pmd = pfn_pmd(pfn, mem_type);
                new_pmd = pmd_mkhuge(new_pmd);
                if (writable) {
-                       kvm_set_s2pmd_writable(&new_pmd);
+                       new_pmd = kvm_s2pmd_mkwrite(new_pmd);
                        kvm_set_pfn_dirty(pfn);
                }
                coherent_cache_guest_page(vcpu, pfn, PMD_SIZE, fault_ipa_uncached);
                pte_t new_pte = pfn_pte(pfn, mem_type);
  
                if (writable) {
-                       kvm_set_s2pte_writable(&new_pte);
+                       new_pte = kvm_s2pte_mkwrite(new_pte);
                        kvm_set_pfn_dirty(pfn);
                        mark_page_dirty(kvm, gfn);
                }
@@@ -1348,6 -1391,8 +1391,8 @@@ out_unlock
   * Resolve the access fault by making the page young again.
   * Note that because the faulting entry is guaranteed not to be
   * cached in the TLB, we don't need to invalidate anything.
+  * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
+  * so there is no need for atomic (pte|pmd)_mkyoung operations.
   */
  static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
  {
        if (!pmd || pmd_none(*pmd))     /* Nothing there */
                goto out;
  
-       if (kvm_pmd_huge(*pmd)) {       /* THP, HugeTLB */
+       if (pmd_thp_or_huge(*pmd)) {    /* THP, HugeTLB */
                *pmd = pmd_mkyoung(*pmd);
                pfn = pmd_pfn(*pmd);
                pfn_valid = true;
@@@ -1588,25 -1633,14 +1633,14 @@@ static int kvm_age_hva_handler(struct k
        if (!pmd || pmd_none(*pmd))     /* Nothing there */
                return 0;
  
-       if (kvm_pmd_huge(*pmd)) {       /* THP, HugeTLB */
-               if (pmd_young(*pmd)) {
-                       *pmd = pmd_mkold(*pmd);
-                       return 1;
-               }
-               return 0;
-       }
+       if (pmd_thp_or_huge(*pmd))      /* THP, HugeTLB */
+               return stage2_pmdp_test_and_clear_young(pmd);
  
        pte = pte_offset_kernel(pmd, gpa);
        if (pte_none(*pte))
                return 0;
  
-       if (pte_young(*pte)) {
-               *pte = pte_mkold(*pte); /* Just a page... */
-               return 1;
-       }
-       return 0;
+       return stage2_ptep_test_and_clear_young(pte);
  }
  
  static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
        if (!pmd || pmd_none(*pmd))     /* Nothing there */
                return 0;
  
-       if (kvm_pmd_huge(*pmd))         /* THP, HugeTLB */
+       if (pmd_thp_or_huge(*pmd))              /* THP, HugeTLB */
                return pmd_young(*pmd);
  
        pte = pte_offset_kernel(pmd, gpa);
@@@ -1666,11 -1700,6 +1700,11 @@@ phys_addr_t kvm_get_idmap_vector(void
        return hyp_idmap_vector;
  }
  
 +phys_addr_t kvm_get_idmap_start(void)
 +{
 +      return hyp_idmap_start;
 +}
 +
  int kvm_mmu_init(void)
  {
        int err;
diff --combined arch/arm64/include/asm/kvm_arm.h
index 1b3dc9df5257fa3f6ad2235a926f6b410e18517a,ffde15fed3e1bcb6b21d28044803736d1e39cf76..2cdb6b551ac6206ce37691f61a11f4dca8c15f4a
  #define HCR_INT_OVERRIDE   (HCR_FMO | HCR_IMO)
  #define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H)
  
 -/* Hyp System Control Register (SCTLR_EL2) bits */
 -#define SCTLR_EL2_EE  (1 << 25)
 -#define SCTLR_EL2_WXN (1 << 19)
 -#define SCTLR_EL2_I   (1 << 12)
 -#define SCTLR_EL2_SA  (1 << 3)
 -#define SCTLR_EL2_C   (1 << 2)
 -#define SCTLR_EL2_A   (1 << 1)
 -#define SCTLR_EL2_M   1
 -#define SCTLR_EL2_FLAGS       (SCTLR_EL2_M | SCTLR_EL2_A | SCTLR_EL2_C |      \
 -                       SCTLR_EL2_SA | SCTLR_EL2_I)
 -
  /* TCR_EL2 Registers bits */
- #define TCR_EL2_RES1  ((1 << 31) | (1 << 23))
- #define TCR_EL2_TBI   (1 << 20)
- #define TCR_EL2_PS    (7 << 16)
- #define TCR_EL2_PS_40B        (2 << 16)
- #define TCR_EL2_TG0   (1 << 14)
- #define TCR_EL2_SH0   (3 << 12)
- #define TCR_EL2_ORGN0 (3 << 10)
- #define TCR_EL2_IRGN0 (3 << 8)
- #define TCR_EL2_T0SZ  0x3f
- #define TCR_EL2_MASK  (TCR_EL2_TG0 | TCR_EL2_SH0 | \
-                        TCR_EL2_ORGN0 | TCR_EL2_IRGN0 | TCR_EL2_T0SZ)
+ #define TCR_EL2_RES1          ((1 << 31) | (1 << 23))
+ #define TCR_EL2_TBI           (1 << 20)
+ #define TCR_EL2_PS_SHIFT      16
+ #define TCR_EL2_PS_MASK               (7 << TCR_EL2_PS_SHIFT)
+ #define TCR_EL2_PS_40B                (2 << TCR_EL2_PS_SHIFT)
+ #define TCR_EL2_TG0_MASK      TCR_TG0_MASK
+ #define TCR_EL2_SH0_MASK      TCR_SH0_MASK
+ #define TCR_EL2_ORGN0_MASK    TCR_ORGN0_MASK
+ #define TCR_EL2_IRGN0_MASK    TCR_IRGN0_MASK
+ #define TCR_EL2_T0SZ_MASK     0x3f
+ #define TCR_EL2_MASK  (TCR_EL2_TG0_MASK | TCR_EL2_SH0_MASK | \
+                        TCR_EL2_ORGN0_MASK | TCR_EL2_IRGN0_MASK | TCR_EL2_T0SZ_MASK)
  
  /* VTCR_EL2 Registers bits */
  #define VTCR_EL2_RES1         (1 << 31)
- #define VTCR_EL2_PS_MASK      (7 << 16)
- #define VTCR_EL2_TG0_MASK     (1 << 14)
- #define VTCR_EL2_TG0_4K               (0 << 14)
- #define VTCR_EL2_TG0_64K      (1 << 14)
- #define VTCR_EL2_SH0_MASK     (3 << 12)
- #define VTCR_EL2_SH0_INNER    (3 << 12)
- #define VTCR_EL2_ORGN0_MASK   (3 << 10)
- #define VTCR_EL2_ORGN0_WBWA   (1 << 10)
- #define VTCR_EL2_IRGN0_MASK   (3 << 8)
- #define VTCR_EL2_IRGN0_WBWA   (1 << 8)
- #define VTCR_EL2_SL0_MASK     (3 << 6)
- #define VTCR_EL2_SL0_LVL1     (1 << 6)
+ #define VTCR_EL2_HD           (1 << 22)
+ #define VTCR_EL2_HA           (1 << 21)
+ #define VTCR_EL2_PS_MASK      TCR_EL2_PS_MASK
+ #define VTCR_EL2_TG0_MASK     TCR_TG0_MASK
+ #define VTCR_EL2_TG0_4K               TCR_TG0_4K
+ #define VTCR_EL2_TG0_16K      TCR_TG0_16K
+ #define VTCR_EL2_TG0_64K      TCR_TG0_64K
+ #define VTCR_EL2_SH0_MASK     TCR_SH0_MASK
+ #define VTCR_EL2_SH0_INNER    TCR_SH0_INNER
+ #define VTCR_EL2_ORGN0_MASK   TCR_ORGN0_MASK
+ #define VTCR_EL2_ORGN0_WBWA   TCR_ORGN0_WBWA
+ #define VTCR_EL2_IRGN0_MASK   TCR_IRGN0_MASK
+ #define VTCR_EL2_IRGN0_WBWA   TCR_IRGN0_WBWA
+ #define VTCR_EL2_SL0_SHIFT    6
+ #define VTCR_EL2_SL0_MASK     (3 << VTCR_EL2_SL0_SHIFT)
+ #define VTCR_EL2_SL0_LVL1     (1 << VTCR_EL2_SL0_SHIFT)
  #define VTCR_EL2_T0SZ_MASK    0x3f
  #define VTCR_EL2_T0SZ_40B     24
  #define VTCR_EL2_VS_SHIFT     19
   * (see hyp-init.S).
   *
   * Note that when using 4K pages, we concatenate two first level page tables
-  * together.
+  * together. With 16K pages, we concatenate 16 first level page tables.
   *
   * The magic numbers used for VTTBR_X in this patch can be found in Tables
   * D4-23 and D4-25 in ARM DDI 0487A.b.
   */
+ #define VTCR_EL2_T0SZ_IPA     VTCR_EL2_T0SZ_40B
+ #define VTCR_EL2_COMMON_BITS  (VTCR_EL2_SH0_INNER | VTCR_EL2_ORGN0_WBWA | \
+                                VTCR_EL2_IRGN0_WBWA | VTCR_EL2_RES1)
  #ifdef CONFIG_ARM64_64K_PAGES
  /*
   * Stage2 translation configuration:
-  * 40bits input  (T0SZ = 24)
   * 64kB pages (TG0 = 1)
   * 2 level page tables (SL = 1)
   */
- #define VTCR_EL2_FLAGS                (VTCR_EL2_TG0_64K | VTCR_EL2_SH0_INNER | \
-                                VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
-                                VTCR_EL2_SL0_LVL1 | VTCR_EL2_RES1)
- #define VTTBR_X               (38 - VTCR_EL2_T0SZ_40B)
- #else
+ #define VTCR_EL2_TGRAN_FLAGS          (VTCR_EL2_TG0_64K | VTCR_EL2_SL0_LVL1)
+ #define VTTBR_X_TGRAN_MAGIC           38
+ #elif defined(CONFIG_ARM64_16K_PAGES)
+ /*
+  * Stage2 translation configuration:
+  * 16kB pages (TG0 = 2)
+  * 2 level page tables (SL = 1)
+  */
+ #define VTCR_EL2_TGRAN_FLAGS          (VTCR_EL2_TG0_16K | VTCR_EL2_SL0_LVL1)
+ #define VTTBR_X_TGRAN_MAGIC           42
+ #else /* 4K */
  /*
   * Stage2 translation configuration:
-  * 40bits input  (T0SZ = 24)
   * 4kB pages (TG0 = 0)
   * 3 level page tables (SL = 1)
   */
- #define VTCR_EL2_FLAGS                (VTCR_EL2_TG0_4K | VTCR_EL2_SH0_INNER | \
-                                VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
-                                VTCR_EL2_SL0_LVL1 | VTCR_EL2_RES1)
- #define VTTBR_X               (37 - VTCR_EL2_T0SZ_40B)
+ #define VTCR_EL2_TGRAN_FLAGS          (VTCR_EL2_TG0_4K | VTCR_EL2_SL0_LVL1)
+ #define VTTBR_X_TGRAN_MAGIC           37
  #endif
  
+ #define VTCR_EL2_FLAGS                        (VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN_FLAGS)
+ #define VTTBR_X                               (VTTBR_X_TGRAN_MAGIC - VTCR_EL2_T0SZ_IPA)
  #define VTTBR_BADDR_SHIFT (VTTBR_X - 1)
  #define VTTBR_BADDR_MASK  (((UL(1) << (PHYS_MASK_SHIFT - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT)
  #define VTTBR_VMID_SHIFT  (UL(48))
diff --combined arch/arm64/include/asm/kvm_host.h
index 90a8d2336cebff5eb00c0c507811f973cb112017,d49399d9890d358ad027a15e0a3e44ee13ba744f..e63d23bad36ea2a932723449b1cee6eedd05da77
@@@ -46,8 -46,6 +46,8 @@@
  int __attribute_const__ kvm_target_cpu(void);
  int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
  int kvm_arch_dev_ioctl_check_extension(long ext);
 +unsigned long kvm_hyp_reset_entry(void);
 +void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start);
  
  struct kvm_arch {
        /* The VMID generation used for the virt. memory system */
@@@ -295,6 -293,7 +295,7 @@@ struct kvm_vm_stat 
  struct kvm_vcpu_stat {
        u32 halt_successful_poll;
        u32 halt_attempted_poll;
+       u32 halt_poll_invalid;
        u32 halt_wakeup;
        u32 hvc_exit_stat;
        u64 wfe_exit_stat;
@@@ -354,21 -353,12 +355,22 @@@ static inline void __cpu_init_hyp_mode(
                       hyp_stack_ptr, vector_ptr);
  }
  
 -static inline void kvm_arch_hardware_disable(void) {}
 +static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr,
 +                                      phys_addr_t phys_idmap_start)
 +{
 +      /*
 +       * Call reset code, and switch back to stub hyp vectors.
 +       * Uses __kvm_call_hyp() to avoid kaslr's kvm_ksym_ref() translation.
 +       */
 +      __kvm_call_hyp((void *)kvm_hyp_reset_entry(),
 +                     boot_pgd_ptr, phys_idmap_start);
 +}
 +
  static inline void kvm_arch_hardware_unsetup(void) {}
  static inline void kvm_arch_sync_events(struct kvm *kvm) {}
  static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
  static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
+ static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
  
  void kvm_arm_init_debug(void);
  void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
diff --combined arch/arm64/include/asm/kvm_mmu.h
index e8d39d4f86b61a74ac44c8ff01351ce7a8d1dab9,844fe5d5ff44454f1c5d6c97a9098625241b5c51..f05ac27d033ed8419d36b871f9e607c87362298f
   */
  #define TRAMPOLINE_VA         (HYP_PAGE_OFFSET_MASK & PAGE_MASK)
  
- /*
-  * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation
-  * levels in addition to the PGD and potentially the PUD which are
-  * pre-allocated (we pre-allocate the fake PGD and the PUD when the Stage-2
-  * tables use one level of tables less than the kernel.
-  */
- #ifdef CONFIG_ARM64_64K_PAGES
- #define KVM_MMU_CACHE_MIN_PAGES       1
- #else
- #define KVM_MMU_CACHE_MIN_PAGES       2
- #endif
  #ifdef __ASSEMBLY__
  
  #include <asm/alternative.h>
@@@ -91,6 -79,8 +79,8 @@@ alternative_endi
  #define KVM_PHYS_SIZE (1UL << KVM_PHYS_SHIFT)
  #define KVM_PHYS_MASK (KVM_PHYS_SIZE - 1UL)
  
+ #include <asm/stage2_pgtable.h>
  int create_hyp_mappings(void *from, void *to);
  int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
  void free_boot_hyp_pgd(void);
@@@ -109,7 -99,6 +99,7 @@@ void kvm_mmu_free_memory_caches(struct 
  phys_addr_t kvm_mmu_get_httbr(void);
  phys_addr_t kvm_mmu_get_boot_httbr(void);
  phys_addr_t kvm_get_idmap_vector(void);
 +phys_addr_t kvm_get_idmap_start(void);
  int kvm_mmu_init(void);
  void kvm_clear_hyp_idmap(void);
  
@@@ -122,19 -111,32 +112,32 @@@ static inline void kvm_clean_pmd_entry(
  static inline void kvm_clean_pte(pte_t *pte) {}
  static inline void kvm_clean_pte_entry(pte_t *pte) {}
  
- static inline void kvm_set_s2pte_writable(pte_t *pte)
+ static inline pte_t kvm_s2pte_mkwrite(pte_t pte)
  {
-       pte_val(*pte) |= PTE_S2_RDWR;
+       pte_val(pte) |= PTE_S2_RDWR;
+       return pte;
  }
  
- static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
+ static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd)
  {
-       pmd_val(*pmd) |= PMD_S2_RDWR;
+       pmd_val(pmd) |= PMD_S2_RDWR;
+       return pmd;
  }
  
  static inline void kvm_set_s2pte_readonly(pte_t *pte)
  {
-       pte_val(*pte) = (pte_val(*pte) & ~PTE_S2_RDWR) | PTE_S2_RDONLY;
+       pteval_t pteval;
+       unsigned long tmp;
+       asm volatile("//        kvm_set_s2pte_readonly\n"
+       "       prfm    pstl1strm, %2\n"
+       "1:     ldxr    %0, %2\n"
+       "       and     %0, %0, %3              // clear PTE_S2_RDWR\n"
+       "       orr     %0, %0, %4              // set PTE_S2_RDONLY\n"
+       "       stxr    %w1, %0, %2\n"
+       "       cbnz    %w1, 1b\n"
+       : "=&r" (pteval), "=&r" (tmp), "+Q" (pte_val(*pte))
+       : "L" (~PTE_S2_RDWR), "L" (PTE_S2_RDONLY));
  }
  
  static inline bool kvm_s2pte_readonly(pte_t *pte)
  
  static inline void kvm_set_s2pmd_readonly(pmd_t *pmd)
  {
-       pmd_val(*pmd) = (pmd_val(*pmd) & ~PMD_S2_RDWR) | PMD_S2_RDONLY;
+       kvm_set_s2pte_readonly((pte_t *)pmd);
  }
  
  static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
  {
-       return (pmd_val(*pmd) & PMD_S2_RDWR) == PMD_S2_RDONLY;
- }
- #define kvm_pgd_addr_end(addr, end)   pgd_addr_end(addr, end)
- #define kvm_pud_addr_end(addr, end)   pud_addr_end(addr, end)
- #define kvm_pmd_addr_end(addr, end)   pmd_addr_end(addr, end)
- /*
-  * In the case where PGDIR_SHIFT is larger than KVM_PHYS_SHIFT, we can address
-  * the entire IPA input range with a single pgd entry, and we would only need
-  * one pgd entry.  Note that in this case, the pgd is actually not used by
-  * the MMU for Stage-2 translations, but is merely a fake pgd used as a data
-  * structure for the kernel pgtable macros to work.
-  */
- #if PGDIR_SHIFT > KVM_PHYS_SHIFT
- #define PTRS_PER_S2_PGD_SHIFT 0
- #else
- #define PTRS_PER_S2_PGD_SHIFT (KVM_PHYS_SHIFT - PGDIR_SHIFT)
- #endif
- #define PTRS_PER_S2_PGD               (1 << PTRS_PER_S2_PGD_SHIFT)
- #define kvm_pgd_index(addr)   (((addr) >> PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1))
- /*
-  * If we are concatenating first level stage-2 page tables, we would have less
-  * than or equal to 16 pointers in the fake PGD, because that's what the
-  * architecture allows.  In this case, (4 - CONFIG_PGTABLE_LEVELS)
-  * represents the first level for the host, and we add 1 to go to the next
-  * level (which uses contatenation) for the stage-2 tables.
-  */
- #if PTRS_PER_S2_PGD <= 16
- #define KVM_PREALLOC_LEVEL    (4 - CONFIG_PGTABLE_LEVELS + 1)
- #else
- #define KVM_PREALLOC_LEVEL    (0)
- #endif
- static inline void *kvm_get_hwpgd(struct kvm *kvm)
- {
-       pgd_t *pgd = kvm->arch.pgd;
-       pud_t *pud;
-       if (KVM_PREALLOC_LEVEL == 0)
-               return pgd;
-       pud = pud_offset(pgd, 0);
-       if (KVM_PREALLOC_LEVEL == 1)
-               return pud;
-       BUG_ON(KVM_PREALLOC_LEVEL != 2);
-       return pmd_offset(pud, 0);
- }
- static inline unsigned int kvm_get_hwpgd_size(void)
- {
-       if (KVM_PREALLOC_LEVEL > 0)
-               return PTRS_PER_S2_PGD * PAGE_SIZE;
-       return PTRS_PER_S2_PGD * sizeof(pgd_t);
+       return kvm_s2pte_readonly((pte_t *)pmd);
  }
  
  static inline bool kvm_page_empty(void *ptr)
        return page_count(ptr_page) == 1;
  }
  
- #define kvm_pte_table_empty(kvm, ptep) kvm_page_empty(ptep)
+ #define hyp_pte_table_empty(ptep) kvm_page_empty(ptep)
  
  #ifdef __PAGETABLE_PMD_FOLDED
- #define kvm_pmd_table_empty(kvm, pmdp) (0)
+ #define hyp_pmd_table_empty(pmdp) (0)
  #else
- #define kvm_pmd_table_empty(kvm, pmdp) \
-       (kvm_page_empty(pmdp) && (!(kvm) || KVM_PREALLOC_LEVEL < 2))
+ #define hyp_pmd_table_empty(pmdp) kvm_page_empty(pmdp)
  #endif
  
  #ifdef __PAGETABLE_PUD_FOLDED
- #define kvm_pud_table_empty(kvm, pudp) (0)
+ #define hyp_pud_table_empty(pudp) (0)
  #else
- #define kvm_pud_table_empty(kvm, pudp) \
-       (kvm_page_empty(pudp) && (!(kvm) || KVM_PREALLOC_LEVEL < 1))
+ #define hyp_pud_table_empty(pudp) kvm_page_empty(pudp)
  #endif
  
  struct kvm;
  
  #define kvm_flush_dcache_to_poc(a,l)  __flush_dcache_area((a), (l))
diff --combined arch/arm64/include/asm/pgtable-hwdef.h
index 9786f770088df41e919921b3a18024f045bfd707,936f1732727c11d3b7cd00dfb7f990648043617b..2813748e2f242c7cd5466783ff55e59947e9e202
   * Section
   */
  #define PMD_SECT_VALID                (_AT(pmdval_t, 1) << 0)
 -#define PMD_SECT_PROT_NONE    (_AT(pmdval_t, 1) << 58)
  #define PMD_SECT_USER         (_AT(pmdval_t, 1) << 6)         /* AP[1] */
  #define PMD_SECT_RDONLY               (_AT(pmdval_t, 1) << 7)         /* AP[2] */
  #define PMD_SECT_S            (_AT(pmdval_t, 3) << 8)
  #define TCR_T1SZ(x)           ((UL(64) - (x)) << TCR_T1SZ_OFFSET)
  #define TCR_TxSZ(x)           (TCR_T0SZ(x) | TCR_T1SZ(x))
  #define TCR_TxSZ_WIDTH                6
- #define TCR_IRGN_NC           ((UL(0) << 8) | (UL(0) << 24))
- #define TCR_IRGN_WBWA         ((UL(1) << 8) | (UL(1) << 24))
- #define TCR_IRGN_WT           ((UL(2) << 8) | (UL(2) << 24))
- #define TCR_IRGN_WBnWA                ((UL(3) << 8) | (UL(3) << 24))
- #define TCR_IRGN_MASK         ((UL(3) << 8) | (UL(3) << 24))
- #define TCR_ORGN_NC           ((UL(0) << 10) | (UL(0) << 26))
- #define TCR_ORGN_WBWA         ((UL(1) << 10) | (UL(1) << 26))
- #define TCR_ORGN_WT           ((UL(2) << 10) | (UL(2) << 26))
- #define TCR_ORGN_WBnWA                ((UL(3) << 10) | (UL(3) << 26))
- #define TCR_ORGN_MASK         ((UL(3) << 10) | (UL(3) << 26))
- #define TCR_SHARED            ((UL(3) << 12) | (UL(3) << 28))
- #define TCR_TG0_4K            (UL(0) << 14)
- #define TCR_TG0_64K           (UL(1) << 14)
- #define TCR_TG0_16K           (UL(2) << 14)
- #define TCR_TG1_16K           (UL(1) << 30)
- #define TCR_TG1_4K            (UL(2) << 30)
- #define TCR_TG1_64K           (UL(3) << 30)
+ #define TCR_IRGN0_SHIFT               8
+ #define TCR_IRGN0_MASK                (UL(3) << TCR_IRGN0_SHIFT)
+ #define TCR_IRGN0_NC          (UL(0) << TCR_IRGN0_SHIFT)
+ #define TCR_IRGN0_WBWA                (UL(1) << TCR_IRGN0_SHIFT)
+ #define TCR_IRGN0_WT          (UL(2) << TCR_IRGN0_SHIFT)
+ #define TCR_IRGN0_WBnWA               (UL(3) << TCR_IRGN0_SHIFT)
+ #define TCR_IRGN1_SHIFT               24
+ #define TCR_IRGN1_MASK                (UL(3) << TCR_IRGN1_SHIFT)
+ #define TCR_IRGN1_NC          (UL(0) << TCR_IRGN1_SHIFT)
+ #define TCR_IRGN1_WBWA                (UL(1) << TCR_IRGN1_SHIFT)
+ #define TCR_IRGN1_WT          (UL(2) << TCR_IRGN1_SHIFT)
+ #define TCR_IRGN1_WBnWA               (UL(3) << TCR_IRGN1_SHIFT)
+ #define TCR_IRGN_NC           (TCR_IRGN0_NC | TCR_IRGN1_NC)
+ #define TCR_IRGN_WBWA         (TCR_IRGN0_WBWA | TCR_IRGN1_WBWA)
+ #define TCR_IRGN_WT           (TCR_IRGN0_WT | TCR_IRGN1_WT)
+ #define TCR_IRGN_WBnWA                (TCR_IRGN0_WBnWA | TCR_IRGN1_WBnWA)
+ #define TCR_IRGN_MASK         (TCR_IRGN0_MASK | TCR_IRGN1_MASK)
+ #define TCR_ORGN0_SHIFT               10
+ #define TCR_ORGN0_MASK                (UL(3) << TCR_ORGN0_SHIFT)
+ #define TCR_ORGN0_NC          (UL(0) << TCR_ORGN0_SHIFT)
+ #define TCR_ORGN0_WBWA                (UL(1) << TCR_ORGN0_SHIFT)
+ #define TCR_ORGN0_WT          (UL(2) << TCR_ORGN0_SHIFT)
+ #define TCR_ORGN0_WBnWA               (UL(3) << TCR_ORGN0_SHIFT)
+ #define TCR_ORGN1_SHIFT               26
+ #define TCR_ORGN1_MASK                (UL(3) << TCR_ORGN1_SHIFT)
+ #define TCR_ORGN1_NC          (UL(0) << TCR_ORGN1_SHIFT)
+ #define TCR_ORGN1_WBWA                (UL(1) << TCR_ORGN1_SHIFT)
+ #define TCR_ORGN1_WT          (UL(2) << TCR_ORGN1_SHIFT)
+ #define TCR_ORGN1_WBnWA               (UL(3) << TCR_ORGN1_SHIFT)
+ #define TCR_ORGN_NC           (TCR_ORGN0_NC | TCR_ORGN1_NC)
+ #define TCR_ORGN_WBWA         (TCR_ORGN0_WBWA | TCR_ORGN1_WBWA)
+ #define TCR_ORGN_WT           (TCR_ORGN0_WT | TCR_ORGN1_WT)
+ #define TCR_ORGN_WBnWA                (TCR_ORGN0_WBnWA | TCR_ORGN1_WBnWA)
+ #define TCR_ORGN_MASK         (TCR_ORGN0_MASK | TCR_ORGN1_MASK)
+ #define TCR_SH0_SHIFT         12
+ #define TCR_SH0_MASK          (UL(3) << TCR_SH0_SHIFT)
+ #define TCR_SH0_INNER         (UL(3) << TCR_SH0_SHIFT)
+ #define TCR_SH1_SHIFT         28
+ #define TCR_SH1_MASK          (UL(3) << TCR_SH1_SHIFT)
+ #define TCR_SH1_INNER         (UL(3) << TCR_SH1_SHIFT)
+ #define TCR_SHARED            (TCR_SH0_INNER | TCR_SH1_INNER)
+ #define TCR_TG0_SHIFT         14
+ #define TCR_TG0_MASK          (UL(3) << TCR_TG0_SHIFT)
+ #define TCR_TG0_4K            (UL(0) << TCR_TG0_SHIFT)
+ #define TCR_TG0_64K           (UL(1) << TCR_TG0_SHIFT)
+ #define TCR_TG0_16K           (UL(2) << TCR_TG0_SHIFT)
+ #define TCR_TG1_SHIFT         30
+ #define TCR_TG1_MASK          (UL(3) << TCR_TG1_SHIFT)
+ #define TCR_TG1_16K           (UL(1) << TCR_TG1_SHIFT)
+ #define TCR_TG1_4K            (UL(2) << TCR_TG1_SHIFT)
+ #define TCR_TG1_64K           (UL(3) << TCR_TG1_SHIFT)
  #define TCR_ASID16            (UL(1) << 36)
  #define TCR_TBI0              (UL(1) << 37)
  #define TCR_HA                        (UL(1) << 39)
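
As an illustrative aside (not part of this patch): a minimal, standalone C sketch of how the reworked shift-based TCR field macros above compose a translation-control value. The macro subset, the chosen attributes and the printed output are assumptions for demonstration only.

#include <stdint.h>
#include <stdio.h>

#define UL(x)           ((unsigned long)(x))
#define TCR_IRGN0_WBWA  (UL(1) << 8)    /* inner write-back write-allocate, TTBR0 walks */
#define TCR_ORGN0_WBWA  (UL(1) << 10)   /* outer write-back write-allocate, TTBR0 walks */
#define TCR_SH0_INNER   (UL(3) << 12)   /* inner shareable, TTBR0 walks */
#define TCR_TG0_16K     (UL(2) << 14)   /* 16K granule for TTBR0 */
#define TCR_TG1_16K     (UL(1) << 30)   /* 16K granule for TTBR1 (note the different encoding) */

int main(void)
{
        /* 16K pages on both halves, write-back caches, inner shareable walks */
        unsigned long tcr = TCR_TG0_16K | TCR_TG1_16K |
                            TCR_IRGN0_WBWA | TCR_ORGN0_WBWA | TCR_SH0_INNER;

        printf("TCR fields: 0x%lx\n", tcr);
        return 0;
}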
index 2da46ae9c991e3fbbfd8337b39738b7b467789b3,f1d5afdb12dbf6d09dcabca6b9ac3d88158ddc12..1910bf47d4a316af5c66af46f3a04555cdc10211
  #include <asm/pgtable-prot.h>
  
  /*
 - * VMALLOC and SPARSEMEM_VMEMMAP ranges.
 + * VMALLOC range.
   *
 - * VMEMAP_SIZE: allows the whole linear region to be covered by a struct page array
 - *    (rounded up to PUD_SIZE).
   * VMALLOC_START: beginning of the kernel vmalloc space
 - * VMALLOC_END: extends to the available space below vmmemmap, PCI I/O space,
 - *    fixed mappings and modules
 +  * VMALLOC_END: extends to the available space below vmemmap, PCI I/O space
 + *    and fixed mappings
   */
 -#define VMEMMAP_SIZE          ALIGN((1UL << (VA_BITS - PAGE_SHIFT)) * sizeof(struct page), PUD_SIZE)
 -
  #define VMALLOC_START         (MODULES_END)
  #define VMALLOC_END           (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K)
  
 -#define VMEMMAP_START         (VMALLOC_END + SZ_64K)
 -#define vmemmap                       ((struct page *)VMEMMAP_START - \
 -                               SECTION_ALIGN_DOWN(memstart_addr >> PAGE_SHIFT))
 +#define vmemmap                       ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT))
  
  #define FIRST_USER_ADDRESS    0UL
  
@@@ -52,7 -58,7 +52,7 @@@ extern void __pgd_error(const char *fil
   * for zero-mapped memory areas etc..
   */
  extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
 -#define ZERO_PAGE(vaddr)      virt_to_page(empty_zero_page)
 +#define ZERO_PAGE(vaddr)      pfn_to_page(PHYS_PFN(__pa(empty_zero_page)))
  
  #define pte_ERROR(pte)                __pte_error(__FILE__, __LINE__, pte_val(pte))
  
@@@ -266,21 -272,6 +266,21 @@@ static inline pgprot_t mk_sect_prot(pgp
        return __pgprot(pgprot_val(prot) & ~PTE_TABLE_BIT);
  }
  
 +#ifdef CONFIG_NUMA_BALANCING
 +/*
 + * See the comment in include/asm-generic/pgtable.h
 + */
 +static inline int pte_protnone(pte_t pte)
 +{
 +      return (pte_val(pte) & (PTE_VALID | PTE_PROT_NONE)) == PTE_PROT_NONE;
 +}
 +
 +static inline int pmd_protnone(pmd_t pmd)
 +{
 +      return pte_protnone(pmd_pte(pmd));
 +}
 +#endif
 +
  /*
   * THP definitions.
   */
  #define pmd_trans_huge(pmd)   (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  
 +#define pmd_present(pmd)      pte_present(pmd_pte(pmd))
  #define pmd_dirty(pmd)                pte_dirty(pmd_pte(pmd))
  #define pmd_young(pmd)                pte_young(pmd_pte(pmd))
  #define pmd_wrprotect(pmd)    pte_pmd(pte_wrprotect(pmd_pte(pmd)))
  #define pmd_mkold(pmd)                pte_pmd(pte_mkold(pmd_pte(pmd)))
  #define pmd_mkwrite(pmd)      pte_pmd(pte_mkwrite(pmd_pte(pmd)))
 -#define pmd_mkclean(pmd)       pte_pmd(pte_mkclean(pmd_pte(pmd)))
 +#define pmd_mkclean(pmd)      pte_pmd(pte_mkclean(pmd_pte(pmd)))
  #define pmd_mkdirty(pmd)      pte_pmd(pte_mkdirty(pmd_pte(pmd)))
  #define pmd_mkyoung(pmd)      pte_pmd(pte_mkyoung(pmd_pte(pmd)))
 -#define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_TYPE_MASK))
 +#define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_SECT_VALID))
  
+ #define pmd_thp_or_huge(pmd)  (pmd_huge(pmd) || pmd_trans_huge(pmd))
  #define __HAVE_ARCH_PMD_WRITE
  #define pmd_write(pmd)                pte_write(pmd_pte(pmd))
  
@@@ -337,8 -329,9 +339,8 @@@ extern pgprot_t phys_mem_access_prot(st
                                     unsigned long size, pgprot_t vma_prot);
  
  #define pmd_none(pmd)         (!pmd_val(pmd))
 -#define pmd_present(pmd)      (pmd_val(pmd))
  
 -#define pmd_bad(pmd)          (!(pmd_val(pmd) & 2))
 +#define pmd_bad(pmd)          (!(pmd_val(pmd) & PMD_TABLE_BIT))
  
  #define pmd_table(pmd)                ((pmd_val(pmd) & PMD_TYPE_MASK) == \
                                 PMD_TYPE_TABLE)
@@@ -403,7 -396,7 +405,7 @@@ static inline phys_addr_t pmd_page_padd
  #define pmd_ERROR(pmd)                __pmd_error(__FILE__, __LINE__, pmd_val(pmd))
  
  #define pud_none(pud)         (!pud_val(pud))
 -#define pud_bad(pud)          (!(pud_val(pud) & 2))
 +#define pud_bad(pud)          (!(pud_val(pud) & PUD_TABLE_BIT))
  #define pud_present(pud)      (pud_val(pud))
  
  static inline void set_pud(pud_t *pudp, pud_t pud)
@@@ -535,33 -528,16 +537,31 @@@ static inline pmd_t pmd_modify(pmd_t pm
  }
  
  #ifdef CONFIG_ARM64_HW_AFDBM
 +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
 +extern int ptep_set_access_flags(struct vm_area_struct *vma,
 +                               unsigned long address, pte_t *ptep,
 +                               pte_t entry, int dirty);
 +
 +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 +#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
 +static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
 +                                      unsigned long address, pmd_t *pmdp,
 +                                      pmd_t entry, int dirty)
 +{
 +      return ptep_set_access_flags(vma, address, (pte_t *)pmdp, pmd_pte(entry), dirty);
 +}
 +#endif
 +
  /*
   * Atomic pte/pmd modifications.
   */
  #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
- static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
-                                           unsigned long address,
-                                           pte_t *ptep)
+ static inline int __ptep_test_and_clear_young(pte_t *ptep)
  {
        pteval_t pteval;
        unsigned int tmp, res;
  
-       asm volatile("//        ptep_test_and_clear_young\n"
+       asm volatile("//        __ptep_test_and_clear_young\n"
        "       prfm    pstl1strm, %2\n"
        "1:     ldxr    %0, %2\n"
        "       ubfx    %w3, %w0, %5, #1        // extract PTE_AF (young)\n"
        return res;
  }
  
+ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
+                                           unsigned long address,
+                                           pte_t *ptep)
+ {
+       return __ptep_test_and_clear_young(ptep);
+ }
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
  static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
@@@ -602,9 -585,9 +609,9 @@@ static inline pte_t ptep_get_and_clear(
  }
  
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 -#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
 -static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
 -                                     unsigned long address, pmd_t *pmdp)
 +#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
 +static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 +                                          unsigned long address, pmd_t *pmdp)
  {
        return pte_pmd(ptep_get_and_clear(mm, address, (pte_t *)pmdp));
  }
index b76e132c87e4b1c74c6472ebfda5858b790f87d5,9a37a1044032dc45733607d901ac20c4f8465798..6733ac575da4fd13d7cee0e06e4e45e0c2d9b0a6
@@@ -122,6 -122,7 +122,7 @@@ struct kvm_vcpu_stat 
        u32 flush_dcache_exits;
        u32 halt_successful_poll;
        u32 halt_attempted_poll;
+       u32 halt_poll_invalid;
        u32 halt_wakeup;
  };
  
@@@ -311,18 -312,17 +312,18 @@@ enum emulation_result 
  #define MIPS3_PG_FRAME                0x3fffffc0
  
  #define VPN2_MASK             0xffffe000
 +#define KVM_ENTRYHI_ASID      MIPS_ENTRYHI_ASID
  #define TLB_IS_GLOBAL(x)      (((x).tlb_lo0 & MIPS3_PG_G) &&          \
                                 ((x).tlb_lo1 & MIPS3_PG_G))
  #define TLB_VPN2(x)           ((x).tlb_hi & VPN2_MASK)
 -#define TLB_ASID(x)           ((x).tlb_hi & ASID_MASK)
 +#define TLB_ASID(x)           ((x).tlb_hi & KVM_ENTRYHI_ASID)
  #define TLB_IS_VALID(x, va)   (((va) & (1 << PAGE_SHIFT))             \
                                 ? ((x).tlb_lo1 & MIPS3_PG_V)           \
                                 : ((x).tlb_lo0 & MIPS3_PG_V))
  #define TLB_HI_VPN2_HIT(x, y) ((TLB_VPN2(x) & ~(x).tlb_mask) ==       \
                                 ((y) & VPN2_MASK & ~(x).tlb_mask))
  #define TLB_HI_ASID_HIT(x, y) (TLB_IS_GLOBAL(x) ||                    \
 -                               TLB_ASID(x) == ((y) & ASID_MASK))
 +                               TLB_ASID(x) == ((y) & KVM_ENTRYHI_ASID))
  
  struct kvm_mips_tlb {
        long tlb_mask;
@@@ -748,7 -748,7 +749,7 @@@ extern enum emulation_result kvm_mips_c
  
  uint32_t kvm_mips_read_count(struct kvm_vcpu *vcpu);
  void kvm_mips_write_count(struct kvm_vcpu *vcpu, uint32_t count);
- void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare);
+ void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare, bool ack);
  void kvm_mips_init_count(struct kvm_vcpu *vcpu);
  int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl);
  int kvm_mips_set_count_resume(struct kvm_vcpu *vcpu, s64 count_resume);
@@@ -813,5 -813,6 +814,6 @@@ static inline void kvm_arch_vcpu_uninit
  static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
  static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
  static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
+ static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
  
  #endif /* __MIPS_KVM_HOST_H__ */
diff --combined arch/mips/kvm/emulate.c
index 8e945e866a7379ecc2cf770d8c3b0f2646b6e06c,b8b7860ec1a8e3fe266ff5de751ff91e7a49294c..396df6eb0a12d9bdb9a886f00be348c4b8698ec2
@@@ -302,12 -302,31 +302,31 @@@ static inline ktime_t kvm_mips_count_ti
   */
  static uint32_t kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now)
  {
-       ktime_t expires;
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+       ktime_t expires, threshold;
+       uint32_t count, compare;
        int running;
  
-       /* Is the hrtimer pending? */
+       /* Calculate the biased and scaled guest CP0_Count */
+       count = vcpu->arch.count_bias + kvm_mips_ktime_to_count(vcpu, now);
+       compare = kvm_read_c0_guest_compare(cop0);
+       /*
+        * Find whether CP0_Count has reached the closest timer interrupt. If
+        * not, we shouldn't inject it.
+        */
+       if ((int32_t)(count - compare) < 0)
+               return count;
+       /*
+        * The CP0_Count we're going to return has already reached the closest
+        * timer interrupt. Quickly check if it really is a new interrupt by
+        * looking at whether the interval until the hrtimer expiry time is
+        * less than 1/4 of the timer period.
+        */
        expires = hrtimer_get_expires(&vcpu->arch.comparecount_timer);
-       if (ktime_compare(now, expires) >= 0) {
+       threshold = ktime_add_ns(now, vcpu->arch.count_period / 4);
+       if (ktime_before(expires, threshold)) {
                /*
                 * Cancel it while we handle it so there's no chance of
                 * interference with the timeout handler.
                }
        }
  
-       /* Return the biased and scaled guest CP0_Count */
-       return vcpu->arch.count_bias + kvm_mips_ktime_to_count(vcpu, now);
+       return count;
  }
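
An illustrative, standalone sketch (not kernel code) of the signed-difference test used above: casting (count - compare) to int32_t keeps the "has CP0_Count reached CP0_Compare yet?" check correct across 32-bit wraparound. The helper name and sample values are made up for the example.

#include <stdint.h>
#include <stdio.h>

static int compare_still_ahead(uint32_t count, uint32_t compare)
{
        /* The difference is taken modulo 2^32 first, then interpreted as signed. */
        return (int32_t)(count - compare) < 0;
}

int main(void)
{
        /* Values straddling the 32-bit wrap point of CP0_Count. */
        printf("%d\n", compare_still_ahead(0x00000005, 0xFFFFFFF0)); /* 0: count already wrapped past compare */
        printf("%d\n", compare_still_ahead(0xFFFFFFF0, 0x00000005)); /* 1: compare still ahead, no interrupt yet */
        return 0;
}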
  
  /**
@@@ -419,32 -437,6 +437,6 @@@ static void kvm_mips_resume_hrtimer(str
        hrtimer_start(&vcpu->arch.comparecount_timer, expire, HRTIMER_MODE_ABS);
  }
  
- /**
-  * kvm_mips_update_hrtimer() - Update next expiry time of hrtimer.
-  * @vcpu:     Virtual CPU.
-  *
-  * Recalculates and updates the expiry time of the hrtimer. This can be used
-  * after timer parameters have been altered which do not depend on the time that
-  * the change occurs (in those cases kvm_mips_freeze_hrtimer() and
-  * kvm_mips_resume_hrtimer() are used directly).
-  *
-  * It is guaranteed that no timer interrupts will be lost in the process.
-  *
-  * Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is running).
-  */
- static void kvm_mips_update_hrtimer(struct kvm_vcpu *vcpu)
- {
-       ktime_t now;
-       uint32_t count;
-       /*
-        * freeze_hrtimer takes care of a timer interrupts <= count, and
-        * resume_hrtimer the hrtimer takes care of a timer interrupts > count.
-        */
-       now = kvm_mips_freeze_hrtimer(vcpu, &count);
-       kvm_mips_resume_hrtimer(vcpu, now, count);
- }
  /**
   * kvm_mips_write_count() - Modify the count and update timer.
   * @vcpu:     Virtual CPU.
@@@ -540,23 -532,42 +532,42 @@@ int kvm_mips_set_count_hz(struct kvm_vc
   * kvm_mips_write_compare() - Modify compare and update timer.
   * @vcpu:     Virtual CPU.
   * @compare:  New CP0_Compare value.
+  * @ack:      Whether to acknowledge timer interrupt.
   *
   * Update CP0_Compare to a new value and update the timeout.
+  * If @ack, atomically acknowledge any pending timer interrupt, otherwise ensure
+  * any pending timer interrupt is preserved.
   */
- void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare)
+ void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare, bool ack)
  {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
+       int dc;
+       u32 old_compare = kvm_read_c0_guest_compare(cop0);
+       ktime_t now;
+       uint32_t count;
  
        /* if unchanged, must just be an ack */
-       if (kvm_read_c0_guest_compare(cop0) == compare)
+       if (old_compare == compare) {
+               if (!ack)
+                       return;
+               kvm_mips_callbacks->dequeue_timer_int(vcpu);
+               kvm_write_c0_guest_compare(cop0, compare);
                return;
+       }
+       /* freeze_hrtimer() takes care of timer interrupts <= count */
+       dc = kvm_mips_count_disabled(vcpu);
+       if (!dc)
+               now = kvm_mips_freeze_hrtimer(vcpu, &count);
+       if (ack)
+               kvm_mips_callbacks->dequeue_timer_int(vcpu);
  
-       /* Update compare */
        kvm_write_c0_guest_compare(cop0, compare);
  
-       /* Update timeout if count enabled */
-       if (!kvm_mips_count_disabled(vcpu))
-               kvm_mips_update_hrtimer(vcpu);
+       /* resume_hrtimer() takes care of timer interrupts > count */
+       if (!dc)
+               kvm_mips_resume_hrtimer(vcpu, now, count);
  }
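
A toy, standalone sketch (hypothetical types and helper, not kernel code) of the ack semantics introduced above: the guest MTC0-to-CP0_Compare path passes ack = true and so acknowledges any pending timer interrupt, while the userspace register-restore path passes ack = false and preserves it.

#include <stdbool.h>
#include <stdio.h>

struct fake_vcpu {
        unsigned compare;
        bool timer_int_pending;
};

static void write_compare(struct fake_vcpu *v, unsigned compare, bool ack)
{
        if (ack)
                v->timer_int_pending = false;   /* models dequeue_timer_int() */
        v->compare = compare;                   /* hrtimer would be frozen/resumed around this */
}

int main(void)
{
        struct fake_vcpu v = { .compare = 100, .timer_int_pending = true };

        write_compare(&v, 200, false);          /* register-restore path: interrupt preserved */
        printf("pending after restore: %d\n", v.timer_int_pending);

        write_compare(&v, 300, true);           /* guest MTC0 path: interrupt acknowledged */
        printf("pending after guest write: %d\n", v.timer_int_pending);
        return 0;
}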
  
  /**
@@@ -1068,15 -1079,15 +1079,15 @@@ enum emulation_result kvm_mips_emulate_
                                        kvm_read_c0_guest_ebase(cop0));
                        } else if (rd == MIPS_CP0_TLB_HI && sel == 0) {
                                uint32_t nasid =
 -                                      vcpu->arch.gprs[rt] & ASID_MASK;
 +                                      vcpu->arch.gprs[rt] & KVM_ENTRYHI_ASID;
                                if ((KSEGX(vcpu->arch.gprs[rt]) != CKSEG0) &&
                                    ((kvm_read_c0_guest_entryhi(cop0) &
 -                                    ASID_MASK) != nasid)) {
 +                                    KVM_ENTRYHI_ASID) != nasid)) {
                                        kvm_debug("MTCz, change ASID from %#lx to %#lx\n",
                                                kvm_read_c0_guest_entryhi(cop0)
 -                                              & ASID_MASK,
 +                                              & KVM_ENTRYHI_ASID,
                                                vcpu->arch.gprs[rt]
 -                                              & ASID_MASK);
 +                                              & KVM_ENTRYHI_ASID);
  
                                        /* Blow away the shadow host TLBs */
                                        kvm_mips_flush_host_tlb(1);
  
                                /* If we are writing to COMPARE */
                                /* Clear pending timer interrupt, if any */
-                               kvm_mips_callbacks->dequeue_timer_int(vcpu);
                                kvm_mips_write_compare(vcpu,
-                                                      vcpu->arch.gprs[rt]);
+                                                      vcpu->arch.gprs[rt],
+                                                      true);
                        } else if ((rd == MIPS_CP0_STATUS) && (sel == 0)) {
                                unsigned int old_val, val, change;
  
@@@ -1620,7 -1631,7 +1631,7 @@@ enum emulation_result kvm_mips_emulate_
                 */
                index = kvm_mips_guest_tlb_lookup(vcpu, (va & VPN2_MASK) |
                                                  (kvm_read_c0_guest_entryhi
 -                                                 (cop0) & ASID_MASK));
 +                                                 (cop0) & KVM_ENTRYHI_ASID));
  
                if (index < 0) {
                        vcpu->arch.host_cp0_entryhi = (va & VPN2_MASK);
@@@ -1786,7 -1797,7 +1797,7 @@@ enum emulation_result kvm_mips_emulate_
        struct mips_coproc *cop0 = vcpu->arch.cop0;
        struct kvm_vcpu_arch *arch = &vcpu->arch;
        unsigned long entryhi = (vcpu->arch.  host_cp0_badvaddr & VPN2_MASK) |
 -                              (kvm_read_c0_guest_entryhi(cop0) & ASID_MASK);
 +                      (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID);
  
        if ((kvm_read_c0_guest_status(cop0) & ST0_EXL) == 0) {
                /* save old pc */
@@@ -1833,7 -1844,7 +1844,7 @@@ enum emulation_result kvm_mips_emulate_
        struct kvm_vcpu_arch *arch = &vcpu->arch;
        unsigned long entryhi =
                (vcpu->arch.host_cp0_badvaddr & VPN2_MASK) |
 -              (kvm_read_c0_guest_entryhi(cop0) & ASID_MASK);
 +              (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID);
  
        if ((kvm_read_c0_guest_status(cop0) & ST0_EXL) == 0) {
                /* save old pc */
@@@ -1878,7 -1889,7 +1889,7 @@@ enum emulation_result kvm_mips_emulate_
        struct mips_coproc *cop0 = vcpu->arch.cop0;
        struct kvm_vcpu_arch *arch = &vcpu->arch;
        unsigned long entryhi = (vcpu->arch.host_cp0_badvaddr & VPN2_MASK) |
 -                              (kvm_read_c0_guest_entryhi(cop0) & ASID_MASK);
 +                      (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID);
  
        if ((kvm_read_c0_guest_status(cop0) & ST0_EXL) == 0) {
                /* save old pc */
@@@ -1922,7 -1933,7 +1933,7 @@@ enum emulation_result kvm_mips_emulate_
        struct mips_coproc *cop0 = vcpu->arch.cop0;
        struct kvm_vcpu_arch *arch = &vcpu->arch;
        unsigned long entryhi = (vcpu->arch.host_cp0_badvaddr & VPN2_MASK) |
 -              (kvm_read_c0_guest_entryhi(cop0) & ASID_MASK);
 +              (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID);
  
        if ((kvm_read_c0_guest_status(cop0) & ST0_EXL) == 0) {
                /* save old pc */
@@@ -1967,7 -1978,7 +1978,7 @@@ enum emulation_result kvm_mips_handle_t
  #ifdef DEBUG
        struct mips_coproc *cop0 = vcpu->arch.cop0;
        unsigned long entryhi = (vcpu->arch.host_cp0_badvaddr & VPN2_MASK) |
 -                              (kvm_read_c0_guest_entryhi(cop0) & ASID_MASK);
 +                      (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID);
        int index;
  
        /* If address not in the guest TLB, then we are in trouble */
@@@ -1994,7 -2005,7 +2005,7 @@@ enum emulation_result kvm_mips_emulate_
  {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
        unsigned long entryhi = (vcpu->arch.host_cp0_badvaddr & VPN2_MASK) |
 -                              (kvm_read_c0_guest_entryhi(cop0) & ASID_MASK);
 +                      (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID);
        struct kvm_vcpu_arch *arch = &vcpu->arch;
  
        if ((kvm_read_c0_guest_status(cop0) & ST0_EXL) == 0) {
@@@ -2569,8 -2580,7 +2580,8 @@@ enum emulation_result kvm_mips_handle_t
         */
        index = kvm_mips_guest_tlb_lookup(vcpu,
                      (va & VPN2_MASK) |
 -                    (kvm_read_c0_guest_entryhi(vcpu->arch.cop0) & ASID_MASK));
 +                    (kvm_read_c0_guest_entryhi(vcpu->arch.cop0) &
 +                     KVM_ENTRYHI_ASID));
        if (index < 0) {
                if (exccode == EXCCODE_TLBL) {
                        er = kvm_mips_emulate_tlbmiss_ld(cause, opc, run, vcpu);
diff --combined arch/mips/kvm/tlb.c
index b9c52c1d35d6a37f181d843819d835b341f5958b,60e4ad0016e7595ec2032a681381e15ae4dc4063..ed021ae7867a797d327b3c1041393467c7c97da2
@@@ -49,18 -49,12 +49,18 @@@ EXPORT_SYMBOL_GPL(kvm_mips_is_error_pfn
  
  uint32_t kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
  {
 -      return vcpu->arch.guest_kernel_asid[smp_processor_id()] & ASID_MASK;
 +      int cpu = smp_processor_id();
 +
 +      return vcpu->arch.guest_kernel_asid[cpu] &
 +                      cpu_asid_mask(&cpu_data[cpu]);
  }
  
  uint32_t kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
  {
 -      return vcpu->arch.guest_user_asid[smp_processor_id()] & ASID_MASK;
 +      int cpu = smp_processor_id();
 +
 +      return vcpu->arch.guest_user_asid[cpu] &
 +                      cpu_asid_mask(&cpu_data[cpu]);
  }
  
  inline uint32_t kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu)
@@@ -84,8 -78,7 +84,8 @@@ void kvm_mips_dump_host_tlbs(void
        old_pagemask = read_c0_pagemask();
  
        kvm_info("HOST TLBs:\n");
 -      kvm_info("ASID: %#lx\n", read_c0_entryhi() & ASID_MASK);
 +      kvm_info("ASID: %#lx\n", read_c0_entryhi() &
 +               cpu_asid_mask(&current_cpu_data));
  
        for (i = 0; i < current_cpu_data.tlbsize; i++) {
                write_c0_index(i);
@@@ -275,6 -268,7 +275,7 @@@ int kvm_mips_handle_kseg0_tlb_fault(uns
        int even;
        struct kvm *kvm = vcpu->kvm;
        const int flush_dcache_mask = 0;
+       int ret;
  
        if (KVM_GUEST_KSEGX(badvaddr) != KVM_GUEST_KSEG0) {
                kvm_err("%s: Invalid BadVaddr: %#lx\n", __func__, badvaddr);
                pfn1 = kvm->arch.guest_pmap[gfn];
        }
  
-       entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu));
        entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) |
                   (1 << 2) | (0x1 << 1);
        entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) |
                   (1 << 2) | (0x1 << 1);
  
-       return kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
-                                      flush_dcache_mask);
+       preempt_disable();
+       entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu));
+       ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
+                                     flush_dcache_mask);
+       preempt_enable();
+       return ret;
  }
  EXPORT_SYMBOL_GPL(kvm_mips_handle_kseg0_tlb_fault);
  
@@@ -368,6 -366,7 +373,7 @@@ int kvm_mips_handle_mapped_seg_tlb_faul
        unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
        struct kvm *kvm = vcpu->kvm;
        kvm_pfn_t pfn0, pfn1;
+       int ret;
  
        if ((tlb->tlb_hi & VPN2_MASK) == 0) {
                pfn0 = 0;
                *hpa1 = pfn1 << PAGE_SHIFT;
  
        /* Get attributes from the Guest TLB */
-       entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ?
-                                              kvm_mips_get_kernel_asid(vcpu) :
-                                              kvm_mips_get_user_asid(vcpu));
        entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) |
                   (tlb->tlb_lo0 & MIPS3_PG_D) | (tlb->tlb_lo0 & MIPS3_PG_V);
        entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) |
        kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc,
                  tlb->tlb_lo0, tlb->tlb_lo1);
  
-       return kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
-                                      tlb->tlb_mask);
+       preempt_disable();
+       entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ?
+                                              kvm_mips_get_kernel_asid(vcpu) :
+                                              kvm_mips_get_user_asid(vcpu));
+       ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
+                                     tlb->tlb_mask);
+       preempt_enable();
+       return ret;
  }
  EXPORT_SYMBOL_GPL(kvm_mips_handle_mapped_seg_tlb_fault);
  
@@@ -571,15 -574,15 +581,15 @@@ void kvm_get_new_mmu_context(struct mm_
  {
        unsigned long asid = asid_cache(cpu);
  
 -      asid += ASID_INC;
 -      if (!(asid & ASID_MASK)) {
 +      asid += cpu_asid_inc();
 +      if (!(asid & cpu_asid_mask(&cpu_data[cpu]))) {
                if (cpu_has_vtag_icache)
                        flush_icache_all();
  
                kvm_local_flush_tlb_all();      /* start new asid cycle */
  
                if (!asid)      /* fix version if needed */
 -                      asid = ASID_FIRST_VERSION;
 +                      asid = asid_first_version(cpu);
        }
  
        cpu_context(cpu, mm) = asid_cache(cpu) = asid;
@@@ -634,7 -637,6 +644,7 @@@ static void kvm_mips_migrate_count(stru
  /* Restore ASID once we are scheduled back after preemption */
  void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
  {
 +      unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]);
        unsigned long flags;
        int newasid = 0;
  
        local_irq_save(flags);
  
        if ((vcpu->arch.guest_kernel_asid[cpu] ^ asid_cache(cpu)) &
 -                                                      ASID_VERSION_MASK) {
 +                                              asid_version_mask(cpu)) {
                kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu);
                vcpu->arch.guest_kernel_asid[cpu] =
                    vcpu->arch.guest_kernel_mm.context.asid[cpu];
                 */
                if (current->flags & PF_VCPU) {
                        write_c0_entryhi(vcpu->arch.
 -                                       preempt_entryhi & ASID_MASK);
 +                                       preempt_entryhi & asid_mask);
                        ehb();
                }
        } else {
                        if (KVM_GUEST_KERNEL_MODE(vcpu))
                                write_c0_entryhi(vcpu->arch.
                                                 guest_kernel_asid[cpu] &
 -                                               ASID_MASK);
 +                                               asid_mask);
                        else
                                write_c0_entryhi(vcpu->arch.
                                                 guest_user_asid[cpu] &
 -                                               ASID_MASK);
 +                                               asid_mask);
                        ehb();
                }
        }
@@@ -729,7 -731,7 +739,7 @@@ void kvm_arch_vcpu_put(struct kvm_vcpu 
        kvm_mips_callbacks->vcpu_get_regs(vcpu);
  
        if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) &
 -           ASID_VERSION_MASK)) {
 +           asid_version_mask(cpu))) {
                kvm_debug("%s: Dropping MMU Context:  %#lx\n", __func__,
                          cpu_context(cpu, current->mm));
                drop_mmu_context(current->mm, cpu);
@@@ -756,8 -758,7 +766,8 @@@ uint32_t kvm_get_inst(uint32_t *opc, st
                        inst = *(opc);
                } else {
                        vpn2 = (unsigned long) opc & VPN2_MASK;
 -                      asid = kvm_read_c0_guest_entryhi(cop0) & ASID_MASK;
 +                      asid = kvm_read_c0_guest_entryhi(cop0) &
 +                                              KVM_ENTRYHI_ASID;
                        index = kvm_mips_guest_tlb_lookup(vcpu, vpn2 | asid);
                        if (index < 0) {
                                kvm_err("%s: get_user_failed for %p, vcpu: %p, ASID: %#lx\n",
index fd43f0afdb9f0b2f31a78a4f8b6b89d34a906315,caa5ea1038a08059b2ae92a804a4201cc73b9a04..6ba0fafcecbc9e24a5933e48ef62b7349a60c43b
@@@ -505,8 -505,7 +505,8 @@@ static int kvm_trap_emul_vcpu_setup(str
        kvm_write_c0_guest_intctl(cop0, 0xFC000000);
  
        /* Put in vcpu id as CPUNum into Ebase Reg to handle SMP Guests */
 -      kvm_write_c0_guest_ebase(cop0, KVM_GUEST_KSEG0 | (vcpu_id & 0xFF));
 +      kvm_write_c0_guest_ebase(cop0, KVM_GUEST_KSEG0 |
 +                                     (vcpu_id & MIPS_EBASE_CPUNUM));
  
        return 0;
  }
@@@ -547,7 -546,7 +547,7 @@@ static int kvm_trap_emul_set_one_reg(st
                kvm_mips_write_count(vcpu, v);
                break;
        case KVM_REG_MIPS_CP0_COMPARE:
-               kvm_mips_write_compare(vcpu, v);
+               kvm_mips_write_compare(vcpu, v, false);
                break;
        case KVM_REG_MIPS_CP0_CAUSE:
                /*
index bd7893d274fabb086f15edb5e7c7122ba4eb772e,994a66c09e8f16f1c256294da1d934791dca255e..e4f6f73afe2f91018ade009782534041d03b52c5
@@@ -69,21 -69,10 +69,22 @@@ struct sclp_info 
        unsigned int max_cores;
        unsigned long hsa_size;
        unsigned long facilities;
+       unsigned int hmfai;
  };
  extern struct sclp_info sclp;
  
 +struct zpci_report_error_header {
 +      u8 version;     /* Interface version byte */
 +      u8 action;      /* Action qualifier byte
 +                       * 1: Deconfigure and repair action requested
 +                       *      (OpenCrypto Problem Call Home)
 +                       * 2: Informational Report
 +                       *      (OpenCrypto Successful Diagnostics Execution)
 +                       */
 +      u16 length;     /* Length of Subsequent Data (up to 4K – SCLP header */
 +      u8 data[0];     /* Subsequent Data passed verbatim to SCLP ET 24 */
 +} __packed;
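
An illustrative, standalone sketch (not kernel code) of how a variable-length report built on a packed header like the one above is laid out. The struct name, payload and sizes are invented for the example; it uses a C99 flexible array member where the kernel declaration uses data[0], and the packed attribute is GCC/Clang specific.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct report_header {
        uint8_t  version;
        uint8_t  action;
        uint16_t length;        /* length of the payload that follows */
        uint8_t  data[];        /* payload passed verbatim after the header */
} __attribute__((packed));

int main(void)
{
        const char payload[] = "diagnostic blob";
        struct report_header *r = malloc(sizeof(*r) + sizeof(payload));

        if (!r)
                return 1;
        r->version = 1;
        r->action = 2;                          /* informational report */
        r->length = sizeof(payload);
        memcpy(r->data, payload, sizeof(payload));

        printf("header is %zu bytes, total %zu bytes\n",
               sizeof(*r), sizeof(*r) + (size_t)r->length);
        free(r);
        return 0;
}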
 +
  int sclp_get_core_info(struct sclp_core_info *info);
  int sclp_core_configure(u8 core);
  int sclp_core_deconfigure(u8 core);
@@@ -95,7 -84,6 +96,7 @@@ int sclp_chp_read_info(struct sclp_chp_
  void sclp_get_ipl_info(struct sclp_ipl_info *info);
  int sclp_pci_configure(u32 fid);
  int sclp_pci_deconfigure(u32 fid);
 +int sclp_pci_report(struct zpci_report_error_header *report, u32 fh, u32 fid);
  int memcpy_hsa_kernel(void *dest, unsigned long src, size_t count);
  int memcpy_hsa_user(void __user *dest, unsigned long src, size_t count);
  void sclp_early_detect(void);
diff --combined arch/x86/kvm/mmu.c
index 38c0c32926c96bc154c2ce6c7c6cb06a30b03ac2,850335a71d9f3ac2b69413a447b160285152657b..24e800116ab4a25d750dca67023633b16f648972
@@@ -1909,18 -1909,17 +1909,17 @@@ static void kvm_mmu_commit_zap_page(str
   * since it has been deleted from active_mmu_pages but still can be found
   * at hash list.
   *
-  * for_each_gfn_indirect_valid_sp has skipped that kind of page and
-  * kvm_mmu_get_page(), the only user of for_each_gfn_sp(), has skipped
-  * all the obsolete pages.
+  * for_each_gfn_valid_sp() has skipped that kind of pages.
   */
- #define for_each_gfn_sp(_kvm, _sp, _gfn)                              \
+ #define for_each_gfn_valid_sp(_kvm, _sp, _gfn)                                \
        hlist_for_each_entry(_sp,                                       \
          &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
-               if ((_sp)->gfn != (_gfn)) {} else
+               if ((_sp)->gfn != (_gfn) || is_obsolete_sp((_kvm), (_sp)) \
+                       || (_sp)->role.invalid) {} else
  
  #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)                       \
-       for_each_gfn_sp(_kvm, _sp, _gfn)                                \
-               if ((_sp)->role.direct || (_sp)->role.invalid) {} else
+       for_each_gfn_valid_sp(_kvm, _sp, _gfn)                          \
+               if ((_sp)->role.direct) {} else
  
  /* @sp->gfn should be write-protected at the call site */
  static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@@ -1961,6 -1960,11 +1960,11 @@@ static void kvm_mmu_audit(struct kvm_vc
  static void mmu_audit_disable(void) { }
  #endif
  
+ static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+ {
+       return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+ }
  static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                         struct list_head *invalid_list)
  {
@@@ -2105,11 -2109,6 +2109,6 @@@ static void clear_sp_write_flooding_cou
        __clear_sp_write_flooding_count(sp);
  }
  
- static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
- {
-       return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
- }
  static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                                             gfn_t gfn,
                                             gva_t gaddr,
                quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
                role.quadrant = quadrant;
        }
-       for_each_gfn_sp(vcpu->kvm, sp, gfn) {
-               if (is_obsolete_sp(vcpu->kvm, sp))
-                       continue;
+       for_each_gfn_valid_sp(vcpu->kvm, sp, gfn) {
                if (!need_sync && sp->unsync)
                        need_sync = true;
  
@@@ -2823,7 -2819,7 +2819,7 @@@ static void transparent_hugepage_adjust
         */
        if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
            level == PT_PAGE_TABLE_LEVEL &&
 -          PageTransCompound(pfn_to_page(pfn)) &&
 +          PageTransCompoundMap(pfn_to_page(pfn)) &&
            !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
                unsigned long mask;
                /*
@@@ -3844,8 -3840,7 +3840,8 @@@ reset_tdp_shadow_zero_bits_mask(struct 
                __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check,
                                        boot_cpu_data.x86_phys_bits,
                                        context->shadow_root_level, false,
 -                                      cpu_has_gbpages, true, true);
 +                                      boot_cpu_has(X86_FEATURE_GBPAGES),
 +                                      true, true);
        else
                __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
                                            boot_cpu_data.x86_phys_bits,
@@@ -4786,7 -4781,7 +4782,7 @@@ restart
                 */
                if (sp->role.direct &&
                        !kvm_is_reserved_pfn(pfn) &&
 -                      PageTransCompound(pfn_to_page(pfn))) {
 +                      PageTransCompoundMap(pfn_to_page(pfn))) {
                        drop_spte(kvm, sptep);
                        need_tlb_flush = 1;
                        goto restart;
diff --combined arch/x86/kvm/svm.c
index fafd720ce10a12cbe6e70da6c3dc1796af3bd447,b0dd90338de77532c6dfc67fd148ac418e23b5fd..2214214c786b2f11295c0d7df835b2381a3b7ea0
@@@ -14,6 -14,9 +14,9 @@@
   * the COPYING file in the top-level directory.
   *
   */
+ #define pr_fmt(fmt) "SVM: " fmt
  #include <linux/kvm_host.h>
  
  #include "irq.h"
@@@ -32,6 -35,7 +35,7 @@@
  #include <linux/trace_events.h>
  #include <linux/slab.h>
  
+ #include <asm/apic.h>
  #include <asm/perf_event.h>
  #include <asm/tlbflush.h>
  #include <asm/desc.h>
@@@ -68,6 -72,8 +72,8 @@@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id)
  #define SVM_FEATURE_DECODE_ASSIST  (1 <<  7)
  #define SVM_FEATURE_PAUSE_FILTER   (1 << 10)
  
+ #define SVM_AVIC_DOORBELL     0xc001011b
  #define NESTED_EXIT_HOST      0       /* Exit handled on host level */
  #define NESTED_EXIT_DONE      1       /* Exit caused nested vmexit  */
  #define NESTED_EXIT_CONTINUE  2       /* Further checks needed      */
  #define TSC_RATIO_MIN         0x0000000000000001ULL
  #define TSC_RATIO_MAX         0x000000ffffffffffULL
  
+ #define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)
+ /*
+  * 0xff is broadcast, so the max index allowed for physical APIC ID
+  * table is 0xfe.  APIC IDs above 0xff are reserved.
+  */
+ #define AVIC_MAX_PHYSICAL_ID_COUNT    255
+ #define AVIC_UNACCEL_ACCESS_WRITE_MASK                1
+ #define AVIC_UNACCEL_ACCESS_OFFSET_MASK               0xFF0
+ #define AVIC_UNACCEL_ACCESS_VECTOR_MASK               0xFFFFFFFF
  static bool erratum_383_found __read_mostly;
  
  static const u32 host_save_user_msrs[] = {
@@@ -162,8 -180,21 +180,21 @@@ struct vcpu_svm 
  
        /* cached guest cpuid flags for faster access */
        bool nrips_enabled      : 1;
+       u32 ldr_reg;
+       struct page *avic_backing_page;
+       u64 *avic_physical_id_cache;
+       bool avic_is_running;
  };
  
+ #define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK  (0xFF)
+ #define AVIC_LOGICAL_ID_ENTRY_VALID_MASK              (1 << 31)
+ #define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK  (0xFFULL)
+ #define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK      (0xFFFFFFFFFFULL << 12)
+ #define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK                (1ULL << 62)
+ #define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK             (1ULL << 63)
  static DEFINE_PER_CPU(u64, current_tsc_ratio);
  #define TSC_RATIO_DEFAULT     0x0100000000ULL
  
@@@ -205,6 -236,10 +236,10 @@@ module_param(npt, int, S_IRUGO)
  static int nested = true;
  module_param(nested, int, S_IRUGO);
  
+ /* enable / disable AVIC */
+ static int avic;
+ module_param(avic, int, S_IRUGO);
  static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
  static void svm_flush_tlb(struct kvm_vcpu *vcpu);
  static void svm_complete_interrupts(struct vcpu_svm *svm);
@@@ -228,12 -263,18 +263,18 @@@ enum 
        VMCB_SEG,        /* CS, DS, SS, ES, CPL */
        VMCB_CR2,        /* CR2 only */
        VMCB_LBR,        /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
+       VMCB_AVIC,       /* AVIC APIC_BAR, AVIC APIC_BACKING_PAGE,
+                         * AVIC PHYSICAL_TABLE pointer,
+                         * AVIC LOGICAL_TABLE pointer
+                         */
        VMCB_DIRTY_MAX,
  };
  
  /* TPR and CR2 are always written before VMRUN */
  #define VMCB_ALWAYS_DIRTY_MASK        ((1U << VMCB_INTR) | (1U << VMCB_CR2))
  
+ #define VMCB_AVIC_APIC_BAR_MASK               0xFFFFFFFFFF000ULL
  static inline void mark_all_dirty(struct vmcb *vmcb)
  {
        vmcb->control.clean = 0;
@@@ -255,6 -296,23 +296,23 @@@ static inline struct vcpu_svm *to_svm(s
        return container_of(vcpu, struct vcpu_svm, vcpu);
  }
  
+ static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data)
+ {
+       svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK;
+       mark_dirty(svm->vmcb, VMCB_AVIC);
+ }
+ static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_svm *svm = to_svm(vcpu);
+       u64 *entry = svm->avic_physical_id_cache;
+       if (!entry)
+               return false;
+       return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
+ }
  static void recalc_intercepts(struct vcpu_svm *svm)
  {
        struct vmcb_control_area *c, *h;
@@@ -923,6 -981,12 +981,12 @@@ static __init int svm_hardware_setup(vo
        } else
                kvm_disable_tdp();
  
+       if (avic && (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC)))
+               avic = false;
+       if (avic)
+               pr_info("AVIC enabled\n");
        return 0;
  
  err:
@@@ -1000,6 -1064,22 +1064,22 @@@ static void svm_adjust_tsc_offset_guest
        mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
  }
  
+ static void avic_init_vmcb(struct vcpu_svm *svm)
+ {
+       struct vmcb *vmcb = svm->vmcb;
+       struct kvm_arch *vm_data = &svm->vcpu.kvm->arch;
+       phys_addr_t bpa = page_to_phys(svm->avic_backing_page);
+       phys_addr_t lpa = page_to_phys(vm_data->avic_logical_id_table_page);
+       phys_addr_t ppa = page_to_phys(vm_data->avic_physical_id_table_page);
+       vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
+       vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
+       vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
+       vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
+       vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
+       svm->vcpu.arch.apicv_active = true;
+ }
  static void init_vmcb(struct vcpu_svm *svm)
  {
        struct vmcb_control_area *control = &svm->vmcb->control;
        set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
        set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
        set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
-       set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+       if (!kvm_vcpu_apicv_active(&svm->vcpu))
+               set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
  
        set_dr_intercepts(svm);
  
                set_intercept(svm, INTERCEPT_PAUSE);
        }
  
+       if (avic)
+               avic_init_vmcb(svm);
        mark_all_dirty(svm->vmcb);
  
        enable_gif(svm);
+ }
+ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, int index)
+ {
+       u64 *avic_physical_id_table;
+       struct kvm_arch *vm_data = &vcpu->kvm->arch;
+       if (index >= AVIC_MAX_PHYSICAL_ID_COUNT)
+               return NULL;
+       avic_physical_id_table = page_address(vm_data->avic_physical_id_table_page);
+       return &avic_physical_id_table[index];
+ }
+ /**
+  * Note:
+  * AVIC hardware walks the nested page table to check permissions,
+  * but does not use the SPA address specified in the leaf page
+  * table entry since it uses the address in the AVIC_BACKING_PAGE pointer
+  * field of the VMCB. Therefore, we set up the
+  * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
+  */
+ static int avic_init_access_page(struct kvm_vcpu *vcpu)
+ {
+       struct kvm *kvm = vcpu->kvm;
+       int ret;
+       if (kvm->arch.apic_access_page_done)
+               return 0;
+       ret = x86_set_memory_region(kvm,
+                                   APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+                                   APIC_DEFAULT_PHYS_BASE,
+                                   PAGE_SIZE);
+       if (ret)
+               return ret;
+       kvm->arch.apic_access_page_done = true;
+       return 0;
+ }
+ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
+ {
+       int ret;
+       u64 *entry, new_entry;
+       int id = vcpu->vcpu_id;
+       struct vcpu_svm *svm = to_svm(vcpu);
+       ret = avic_init_access_page(vcpu);
+       if (ret)
+               return ret;
+       if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
+               return -EINVAL;
+       if (!svm->vcpu.arch.apic->regs)
+               return -EINVAL;
+       svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs);
+       /* Setting AVIC backing page address in the phy APIC ID table */
+       entry = avic_get_physical_id_entry(vcpu, id);
+       if (!entry)
+               return -EINVAL;
+       new_entry = READ_ONCE(*entry);
+       new_entry = (page_to_phys(svm->avic_backing_page) &
+                    AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
+                    AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
+       WRITE_ONCE(*entry, new_entry);
+       svm->avic_physical_id_cache = entry;
+       return 0;
+ }
+ static void avic_vm_destroy(struct kvm *kvm)
+ {
+       struct kvm_arch *vm_data = &kvm->arch;
+       if (vm_data->avic_logical_id_table_page)
+               __free_page(vm_data->avic_logical_id_table_page);
+       if (vm_data->avic_physical_id_table_page)
+               __free_page(vm_data->avic_physical_id_table_page);
+ }
+ static int avic_vm_init(struct kvm *kvm)
+ {
+       int err = -ENOMEM;
+       struct kvm_arch *vm_data = &kvm->arch;
+       struct page *p_page;
+       struct page *l_page;
+       if (!avic)
+               return 0;
+       /* Allocating physical APIC ID table (4KB) */
+       p_page = alloc_page(GFP_KERNEL);
+       if (!p_page)
+               goto free_avic;
+       vm_data->avic_physical_id_table_page = p_page;
+       clear_page(page_address(p_page));
+       /* Allocating logical APIC ID table (4KB) */
+       l_page = alloc_page(GFP_KERNEL);
+       if (!l_page)
+               goto free_avic;
+       vm_data->avic_logical_id_table_page = l_page;
+       clear_page(page_address(l_page));
+       return 0;
+ free_avic:
+       avic_vm_destroy(kvm);
+       return err;
+ }
+ /**
+  * This function is called during VCPU halt/unhalt.
+  */
+ static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+ {
+       u64 entry;
+       int h_physical_id = __default_cpu_present_to_apicid(vcpu->cpu);
+       struct vcpu_svm *svm = to_svm(vcpu);
+       if (!kvm_vcpu_apicv_active(vcpu))
+               return;
+       svm->avic_is_running = is_run;
+       /* ID = 0xff (broadcast), ID > 0xff (reserved) */
+       if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT))
+               return;
+       entry = READ_ONCE(*(svm->avic_physical_id_cache));
+       WARN_ON(is_run == !!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK));
+       entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+       if (is_run)
+               entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+       WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+ }
+ static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+ {
+       u64 entry;
+       /* ID = 0xff (broadcast), ID > 0xff (reserved) */
+       int h_physical_id = __default_cpu_present_to_apicid(cpu);
+       struct vcpu_svm *svm = to_svm(vcpu);
+       if (!kvm_vcpu_apicv_active(vcpu))
+               return;
+       if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT))
+               return;
+       entry = READ_ONCE(*(svm->avic_physical_id_cache));
+       WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
+       entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+       entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
+       entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+       if (svm->avic_is_running)
+               entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+       WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+ }
+ static void avic_vcpu_put(struct kvm_vcpu *vcpu)
+ {
+       u64 entry;
+       struct vcpu_svm *svm = to_svm(vcpu);
+       if (!kvm_vcpu_apicv_active(vcpu))
+               return;
+       entry = READ_ONCE(*(svm->avic_physical_id_cache));
+       entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+       WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
  }
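
As an aside, a minimal standalone sketch (not kernel code) of the physical APIC ID table entry layout manipulated by avic_set_running(), avic_vcpu_load() and avic_vcpu_put() above: host APIC ID in the low byte, backing-page address in bits 51:12, is_running at bit 62 and valid at bit 63. The sample address and host APIC ID are made up.

#include <stdint.h>
#include <stdio.h>

#define HOST_ID_MASK       (0xFFULL)
#define BACKING_PAGE_MASK  (0xFFFFFFFFFFULL << 12)
#define IS_RUNNING_MASK    (1ULL << 62)
#define VALID_MASK         (1ULL << 63)

int main(void)
{
        uint64_t backing_pa = 0x123456000ULL;           /* page-aligned backing page address */
        uint64_t entry = (backing_pa & BACKING_PAGE_MASK) | VALID_MASK;

        /* vcpu loaded on the host CPU with APIC ID 5 and marked running */
        entry = (entry & ~HOST_ID_MASK) | (5 & HOST_ID_MASK);
        entry |= IS_RUNNING_MASK;

        printf("entry = 0x%016llx\n", (unsigned long long)entry);
        printf("running = %d, host id = %llu\n",
               !!(entry & IS_RUNNING_MASK),
               (unsigned long long)(entry & HOST_ID_MASK));
        return 0;
}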
  
  static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
  
        kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
        kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
+       if (kvm_vcpu_apicv_active(vcpu) && !init_event)
+               avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
  }
  
  static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
        if (!hsave_page)
                goto free_page3;
  
+       if (avic) {
+               err = avic_init_backing_page(&svm->vcpu);
+               if (err)
+                       goto free_page4;
+       }
+       /* We initialize this flag to true to make sure that the is_running
+        * bit would be set the first time the vcpu is loaded.
+        */
+       svm->avic_is_running = true;
        svm->nested.hsave = page_address(hsave_page);
  
        svm->msrpm = page_address(msrpm_pages);
  
        return &svm->vcpu;
  
+ free_page4:
+       __free_page(hsave_page);
  free_page3:
        __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
  free_page2:
@@@ -1243,6 -1528,8 +1528,8 @@@ static void svm_vcpu_load(struct kvm_vc
        /* This assumes that the kernel never uses MSR_TSC_AUX */
        if (static_cpu_has(X86_FEATURE_RDTSCP))
                wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
+       avic_vcpu_load(vcpu, cpu);
  }
  
  static void svm_vcpu_put(struct kvm_vcpu *vcpu)
        struct vcpu_svm *svm = to_svm(vcpu);
        int i;
  
+       avic_vcpu_put(vcpu);
        ++vcpu->stat.host_state_reload;
        kvm_load_ldt(svm->host.ldt);
  #ifdef CONFIG_X86_64
        loadsegment(fs, svm->host.fs);
 -      wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
 +      wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase);
        load_gs_index(svm->host.gs);
  #else
  #ifdef CONFIG_X86_32_LAZY_GS
                wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
  }
  
+ static void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
+ {
+       avic_set_running(vcpu, false);
+ }
+ static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
+ {
+       avic_set_running(vcpu, true);
+ }
  static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
  {
        return to_svm(vcpu)->vmcb->save.rflags;
@@@ -2673,10 -2972,11 +2972,11 @@@ static int clgi_interception(struct vcp
        disable_gif(svm);
  
        /* After a CLGI no interrupts should come */
-       svm_clear_vintr(svm);
-       svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
-       mark_dirty(svm->vmcb, VMCB_INTR);
+       if (!kvm_vcpu_apicv_active(&svm->vcpu)) {
+               svm_clear_vintr(svm);
+               svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
+               mark_dirty(svm->vmcb, VMCB_INTR);
+       }
  
        return 1;
  }
@@@ -3212,6 -3512,10 +3512,10 @@@ static int svm_set_msr(struct kvm_vcpu 
        case MSR_VM_IGNNE:
                vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
                break;
+       case MSR_IA32_APICBASE:
+               if (kvm_vcpu_apicv_active(vcpu))
+                       avic_update_vapic_bar(to_svm(vcpu), data);
+               /* Fall through */
        default:
                return kvm_set_msr_common(vcpu, msr);
        }
@@@ -3281,6 -3585,278 +3585,278 @@@ static int mwait_interception(struct vc
        return nop_interception(svm);
  }
  
+ enum avic_ipi_failure_cause {
+       AVIC_IPI_FAILURE_INVALID_INT_TYPE,
+       AVIC_IPI_FAILURE_TARGET_NOT_RUNNING,
+       AVIC_IPI_FAILURE_INVALID_TARGET,
+       AVIC_IPI_FAILURE_INVALID_BACKING_PAGE,
+ };
+ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
+ {
+       u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
+       u32 icrl = svm->vmcb->control.exit_info_1;
+       u32 id = svm->vmcb->control.exit_info_2 >> 32;
+       u32 index = svm->vmcb->control.exit_info_2 & 0xFF;
+       struct kvm_lapic *apic = svm->vcpu.arch.apic;
+       trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index);
+       switch (id) {
+       case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
+               /*
+                * AVIC hardware handles the generation of
+                * IPIs when the specified Message Type is Fixed
+                * (also known as fixed delivery mode) and
+                * the Trigger Mode is edge-triggered. The hardware
+                * also supports self and broadcast delivery modes
+                * specified via the Destination Shorthand (DSH)
+                * field of the ICRL. Logical and physical APIC ID
+                * formats are supported. All other IPI types cause
+                * a #VMEXIT, which needs to be emulated.
+                */
+               kvm_lapic_reg_write(apic, APIC_ICR2, icrh);
+               kvm_lapic_reg_write(apic, APIC_ICR, icrl);
+               break;
+       case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: {
+               int i;
+               struct kvm_vcpu *vcpu;
+               struct kvm *kvm = svm->vcpu.kvm;
+               struct kvm_lapic *apic = svm->vcpu.arch.apic;
+               /*
+                * At this point, we expect that the AVIC HW has already
+                * set the appropriate IRR bits on the valid target
+                * vcpus. So, we just need to kick the appropriate vcpu.
+                */
+               kvm_for_each_vcpu(i, vcpu, kvm) {
+                       bool m = kvm_apic_match_dest(vcpu, apic,
+                                                    icrl & KVM_APIC_SHORT_MASK,
+                                                    GET_APIC_DEST_FIELD(icrh),
+                                                    icrl & KVM_APIC_DEST_MASK);
+                       if (m && !avic_vcpu_is_running(vcpu))
+                               kvm_vcpu_wake_up(vcpu);
+               }
+               break;
+       }
+       case AVIC_IPI_FAILURE_INVALID_TARGET:
+               break;
+       case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
+               WARN_ONCE(1, "Invalid backing page\n");
+               break;
+       default:
+               pr_err("Unknown IPI interception\n");
+       }
+       return 1;
+ }
+ static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
+ {
+       struct kvm_arch *vm_data = &vcpu->kvm->arch;
+       int index;
+       u32 *logical_apic_id_table;
+       int dlid = GET_APIC_LOGICAL_ID(ldr);
+       if (!dlid)
+               return NULL;
+       if (flat) { /* flat */
+               index = ffs(dlid) - 1;
+               if (index > 7)
+                       return NULL;
+       } else { /* cluster */
+               int cluster = (dlid & 0xf0) >> 4;
+               int apic = ffs(dlid & 0x0f) - 1;
+               if ((apic < 0) || (apic > 7) ||
+                   (cluster >= 0xf))
+                       return NULL;
+               index = (cluster << 2) + apic;
+       }
+       logical_apic_id_table = (u32 *) page_address(vm_data->avic_logical_id_table_page);
+       return &logical_apic_id_table[index];
+ }
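
An illustrative, standalone sketch (not kernel code) of the index computation performed by avic_get_logical_id_entry() above, with worked examples for flat and cluster logical destination modes; the helper name and the sample LDR values are assumptions.

#include <stdio.h>
#include <strings.h>    /* ffs() */

static int logical_id_index(unsigned dlid, int flat)
{
        if (!dlid)
                return -1;
        if (flat) {
                /* flat mode: the set bit position is the index (8 CPUs max) */
                int index = ffs(dlid) - 1;
                return index > 7 ? -1 : index;
        } else {
                /* cluster mode: 4-bit cluster number plus bit position within the cluster */
                int cluster = (dlid & 0xf0) >> 4;
                int apic = ffs(dlid & 0x0f) - 1;

                if (apic < 0 || apic > 7 || cluster >= 0xf)
                        return -1;
                return (cluster << 2) + apic;
        }
}

int main(void)
{
        printf("flat 0x08    -> %d\n", logical_id_index(0x08, 1)); /* bit 3 set -> index 3 */
        printf("cluster 0x24 -> %d\n", logical_id_index(0x24, 0)); /* cluster 2, apic 2 -> index 10 */
        return 0;
}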
+ static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr,
+                         bool valid)
+ {
+       bool flat;
+       u32 *entry, new_entry;
+       flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
+       entry = avic_get_logical_id_entry(vcpu, ldr, flat);
+       if (!entry)
+               return -EINVAL;
+       new_entry = READ_ONCE(*entry);
+       new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
+       new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
+       if (valid)
+               new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
+       else
+               new_entry &= ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
+       WRITE_ONCE(*entry, new_entry);
+       return 0;
+ }
+ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
+ {
+       int ret;
+       struct vcpu_svm *svm = to_svm(vcpu);
+       u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
+       if (!ldr)
+               return 1;
+       ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr, true);
+       if (ret && svm->ldr_reg) {
+               avic_ldr_write(vcpu, 0, svm->ldr_reg, false);
+               svm->ldr_reg = 0;
+       } else {
+               svm->ldr_reg = ldr;
+       }
+       return ret;
+ }
+ static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
+ {
+       u64 *old, *new;
+       struct vcpu_svm *svm = to_svm(vcpu);
+       u32 apic_id_reg = kvm_lapic_get_reg(vcpu->arch.apic, APIC_ID);
+       u32 id = (apic_id_reg >> 24) & 0xff;
+       if (vcpu->vcpu_id == id)
+               return 0;
+       old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id);
+       new = avic_get_physical_id_entry(vcpu, id);
+       if (!new || !old)
+               return 1;
+       /* We need to move physical_id_entry to new offset */
+       *new = *old;
+       *old = 0ULL;
+       to_svm(vcpu)->avic_physical_id_cache = new;
+       /*
+        * Also update the guest physical APIC ID in the logical
+        * APIC ID table entry if the LDR has already been set up.
+        */
+       if (svm->ldr_reg)
+               avic_handle_ldr_update(vcpu);
+       return 0;
+ }
+ static int avic_handle_dfr_update(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_svm *svm = to_svm(vcpu);
+       struct kvm_arch *vm_data = &vcpu->kvm->arch;
+       u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
+       u32 mod = (dfr >> 28) & 0xf;
+       /*
+        * We assume that all local APICs are using the same type.
+        * If this changes, we need to flush the AVIC logical
+        * APIC ID table.
+        */
+       if (vm_data->ldr_mode == mod)
+               return 0;
+       clear_page(page_address(vm_data->avic_logical_id_table_page));
+       vm_data->ldr_mode = mod;
+       if (svm->ldr_reg)
+               avic_handle_ldr_update(vcpu);
+       return 0;
+ }
+ static int avic_unaccel_trap_write(struct vcpu_svm *svm)
+ {
+       struct kvm_lapic *apic = svm->vcpu.arch.apic;
+       u32 offset = svm->vmcb->control.exit_info_1 &
+                               AVIC_UNACCEL_ACCESS_OFFSET_MASK;
+       switch (offset) {
+       case APIC_ID:
+               if (avic_handle_apic_id_update(&svm->vcpu))
+                       return 0;
+               break;
+       case APIC_LDR:
+               if (avic_handle_ldr_update(&svm->vcpu))
+                       return 0;
+               break;
+       case APIC_DFR:
+               avic_handle_dfr_update(&svm->vcpu);
+               break;
+       default:
+               break;
+       }
+       kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
+       return 1;
+ }
+ static bool is_avic_unaccelerated_access_trap(u32 offset)
+ {
+       bool ret = false;
+       switch (offset) {
+       case APIC_ID:
+       case APIC_EOI:
+       case APIC_RRR:
+       case APIC_LDR:
+       case APIC_DFR:
+       case APIC_SPIV:
+       case APIC_ESR:
+       case APIC_ICR:
+       case APIC_LVTT:
+       case APIC_LVTTHMR:
+       case APIC_LVTPC:
+       case APIC_LVT0:
+       case APIC_LVT1:
+       case APIC_LVTERR:
+       case APIC_TMICT:
+       case APIC_TDCR:
+               ret = true;
+               break;
+       default:
+               break;
+       }
+       return ret;
+ }
+ static int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
+ {
+       int ret = 0;
+       u32 offset = svm->vmcb->control.exit_info_1 &
+                    AVIC_UNACCEL_ACCESS_OFFSET_MASK;
+       u32 vector = svm->vmcb->control.exit_info_2 &
+                    AVIC_UNACCEL_ACCESS_VECTOR_MASK;
+       bool write = (svm->vmcb->control.exit_info_1 >> 32) &
+                    AVIC_UNACCEL_ACCESS_WRITE_MASK;
+       bool trap = is_avic_unaccelerated_access_trap(offset);
+       trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset,
+                                           trap, write, vector);
+       if (trap) {
+               /* Handling Trap */
+               WARN_ONCE(!write, "svm: Handling trap read.\n");
+               ret = avic_unaccel_trap_write(svm);
+       } else {
+               /* Handling Fault */
+               ret = (emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE);
+       }
+       return ret;
+ }
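For orientation, a rough decode of the exit information used above (example registers only; the constants come from the standard APIC register map): the register offset is taken from the low bits of exit_info_1, the write flag from its upper 32 bits, and the vector from exit_info_2. A write to APIC_TMICT (offset 0x380) is in the trap list, so it is replayed through avic_unaccel_trap_write(); an access to a register outside that list, for example APIC_TMCCT (0x390), takes the fault path and is handled by emulate_instruction().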
  static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
        [SVM_EXIT_READ_CR0]                     = cr_interception,
        [SVM_EXIT_READ_CR3]                     = cr_interception,
        [SVM_EXIT_XSETBV]                       = xsetbv_interception,
        [SVM_EXIT_NPF]                          = pf_interception,
        [SVM_EXIT_RSM]                          = emulate_on_interception,
+       [SVM_EXIT_AVIC_INCOMPLETE_IPI]          = avic_incomplete_ipi_interception,
+       [SVM_EXIT_AVIC_UNACCELERATED_ACCESS]    = avic_unaccelerated_access_interception,
  };
  
  static void dump_vmcb(struct kvm_vcpu *vcpu)
        pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
        pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
        pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
+       pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
        pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
        pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
        pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl);
        pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
+       pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
+       pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
+       pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
        pr_err("VMCB State Save Area:\n");
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "es:",
@@@ -3562,6 -4144,7 +4144,7 @@@ static inline void svm_inject_irq(struc
  {
        struct vmcb_control_area *control;
  
+       /* The following fields are ignored when AVIC is enabled */
        control = &svm->vmcb->control;
        control->int_vector = irq;
        control->int_ctl &= ~V_INTR_PRIO_MASK;
@@@ -3583,11 -4166,17 +4166,17 @@@ static void svm_set_irq(struct kvm_vcp
                SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
  }
  
+ static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu)
+ {
+       return is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK);
+ }
  static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
  {
        struct vcpu_svm *svm = to_svm(vcpu);
  
-       if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
+       if (svm_nested_virtualize_tpr(vcpu) ||
+           kvm_vcpu_apicv_active(vcpu))
                return;
  
        clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
@@@ -3606,11 -4195,28 +4195,28 @@@ static void svm_set_virtual_x2apic_mode
  
  static bool svm_get_enable_apicv(void)
  {
-       return false;
+       return avic;
+ }
+ static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
+ {
  }
  
+ static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
+ {
+ }
+ /* Note: Currently only used by Hyper-V. */
  static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
  {
+       struct vcpu_svm *svm = to_svm(vcpu);
+       struct vmcb *vmcb = svm->vmcb;
+       if (!avic)
+               return;
+       vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
+       mark_dirty(vmcb, VMCB_INTR);
  }
  
  static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
@@@ -3623,6 -4229,18 +4229,18 @@@ static void svm_sync_pir_to_irr(struct 
        return;
  }
  
+ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
+ {
+       kvm_lapic_set_irr(vec, vcpu->arch.apic);
+       smp_mb__after_atomic();
+       if (avic_vcpu_is_running(vcpu))
+               wrmsrl(SVM_AVIC_DOORBELL,
+                      __default_cpu_present_to_apicid(vcpu->cpu));
+       else
+               kvm_vcpu_wake_up(vcpu);
+ }
  static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
  {
        struct vcpu_svm *svm = to_svm(vcpu);
@@@ -3677,6 -4295,9 +4295,9 @@@ static void enable_irq_window(struct kv
  {
        struct vcpu_svm *svm = to_svm(vcpu);
  
+       if (kvm_vcpu_apicv_active(vcpu))
+               return;
        /*
         * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
         * 1, because that's a separate STGI/VMRUN intercept.  The next time we
@@@ -3728,7 -4349,7 +4349,7 @@@ static inline void sync_cr8_to_lapic(st
  {
        struct vcpu_svm *svm = to_svm(vcpu);
  
-       if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
+       if (svm_nested_virtualize_tpr(vcpu))
                return;
  
        if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
@@@ -3742,7 -4363,8 +4363,8 @@@ static inline void sync_lapic_to_cr8(st
        struct vcpu_svm *svm = to_svm(vcpu);
        u64 cr8;
  
-       if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
+       if (svm_nested_virtualize_tpr(vcpu) ||
+           kvm_vcpu_apicv_active(vcpu))
                return;
  
        cr8 = kvm_get_cr8(vcpu);
@@@ -4045,14 -4667,26 +4667,26 @@@ static u64 svm_get_mt_mask(struct kvm_v
  static void svm_cpuid_update(struct kvm_vcpu *vcpu)
  {
        struct vcpu_svm *svm = to_svm(vcpu);
+       struct kvm_cpuid_entry2 *entry;
  
        /* Update nrips enabled cache */
        svm->nrips_enabled = !!guest_cpuid_has_nrips(&svm->vcpu);
+       if (!kvm_vcpu_apicv_active(vcpu))
+               return;
+       entry = kvm_find_cpuid_entry(vcpu, 1, 0);
+       if (entry)
+               entry->ecx &= ~bit(X86_FEATURE_X2APIC);
  }
  
  static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
  {
        switch (func) {
+       case 0x1:
+               if (avic)
+                       entry->ecx &= ~bit(X86_FEATURE_X2APIC);
+               break;
        case 0x80000001:
                if (nested)
                        entry->ecx |= (1 << 2); /* Set SVM bit */
@@@ -4307,6 -4941,15 +4941,15 @@@ static void svm_sched_in(struct kvm_vcp
  {
  }
  
+ static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
+ {
+       if (avic_handle_apic_id_update(vcpu) != 0)
+               return;
+       if (avic_handle_dfr_update(vcpu) != 0)
+               return;
+       avic_handle_ldr_update(vcpu);
+ }
  static struct kvm_x86_ops svm_x86_ops = {
        .cpu_has_kvm_support = has_svm,
        .disabled_by_bios = is_disabled,
        .vcpu_free = svm_free_vcpu,
        .vcpu_reset = svm_vcpu_reset,
  
+       .vm_init = avic_vm_init,
+       .vm_destroy = avic_vm_destroy,
        .prepare_guest_switch = svm_prepare_guest_switch,
        .vcpu_load = svm_vcpu_load,
        .vcpu_put = svm_vcpu_put,
+       .vcpu_blocking = svm_vcpu_blocking,
+       .vcpu_unblocking = svm_vcpu_unblocking,
  
        .update_bp_intercept = update_bp_intercept,
        .get_msr = svm_get_msr,
        .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
        .load_eoi_exitmap = svm_load_eoi_exitmap,
        .sync_pir_to_irr = svm_sync_pir_to_irr,
+       .hwapic_irr_update = svm_hwapic_irr_update,
+       .hwapic_isr_update = svm_hwapic_isr_update,
+       .apicv_post_state_restore = avic_post_state_restore,
  
        .set_tss_addr = svm_set_tss_addr,
        .get_tdp_level = get_npt_level,
        .sched_in = svm_sched_in,
  
        .pmu_ops = &amd_pmu_ops,
+       .deliver_posted_interrupt = svm_deliver_avic_intr,
  };
  
  static int __init svm_init(void)
diff --combined arch/x86/kvm/trace.h
index b72743c5668d3d55387a55d06c0c886cf2b7b1b1,39f264cbda71a294c8a1257599415009b0b4bfb8..8de925031b5cb41b6447fac108ba8f8a4a9e3193
@@@ -809,7 -809,8 +809,7 @@@ TRACE_EVENT(kvm_write_tsc_offset
  
  #define host_clocks                                   \
        {VCLOCK_NONE, "none"},                          \
 -      {VCLOCK_TSC,  "tsc"},                           \
 -      {VCLOCK_HPET, "hpet"}                           \
 +      {VCLOCK_TSC,  "tsc"}                            \
  
  TRACE_EVENT(kvm_update_master_clock,
        TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched),
@@@ -1291,6 -1292,63 +1291,63 @@@ TRACE_EVENT(kvm_hv_stimer_cleanup
                  __entry->vcpu_id, __entry->timer_index)
  );
  
+ /*
+  * Tracepoint for AMD AVIC
+  */
+ TRACE_EVENT(kvm_avic_incomplete_ipi,
+           TP_PROTO(u32 vcpu, u32 icrh, u32 icrl, u32 id, u32 index),
+           TP_ARGS(vcpu, icrh, icrl, id, index),
+       TP_STRUCT__entry(
+               __field(u32, vcpu)
+               __field(u32, icrh)
+               __field(u32, icrl)
+               __field(u32, id)
+               __field(u32, index)
+       ),
+       TP_fast_assign(
+               __entry->vcpu = vcpu;
+               __entry->icrh = icrh;
+               __entry->icrl = icrl;
+               __entry->id = id;
+               __entry->index = index;
+       ),
+       TP_printk("vcpu=%u, icrh:icrl=%#010x:%08x, id=%u, index=%u\n",
+                 __entry->vcpu, __entry->icrh, __entry->icrl,
+                 __entry->id, __entry->index)
+ );
+ TRACE_EVENT(kvm_avic_unaccelerated_access,
+           TP_PROTO(u32 vcpu, u32 offset, bool ft, bool rw, u32 vec),
+           TP_ARGS(vcpu, offset, ft, rw, vec),
+       TP_STRUCT__entry(
+               __field(u32, vcpu)
+               __field(u32, offset)
+               __field(bool, ft)
+               __field(bool, rw)
+               __field(u32, vec)
+       ),
+       TP_fast_assign(
+               __entry->vcpu = vcpu;
+               __entry->offset = offset;
+               __entry->ft = ft;
+               __entry->rw = rw;
+               __entry->vec = vec;
+       ),
+       TP_printk("vcpu=%u, offset=%#x(%s), %s, %s, vec=%#x\n",
+                 __entry->vcpu,
+                 __entry->offset,
+                 __print_symbolic(__entry->offset, kvm_trace_symbol_apic),
+                 __entry->ft ? "trap" : "fault",
+                 __entry->rw ? "write" : "read",
+                 __entry->vec)
+ );
  #endif /* _TRACE_KVM_H */
  
  #undef TRACE_INCLUDE_PATH
diff --combined arch/x86/kvm/vmx.c
index cb47fe3da2926b3c1c17df41625bef9492353554,7ebf27bafe5cd217d3cdb403f5f283dd56fcaefe..e605d1ed334ff5550cb07310815e546028c26514
@@@ -3103,8 -3103,6 +3103,8 @@@ static __init int vmx_disabled_by_bios(
  
  static void kvm_cpu_vmxon(u64 addr)
  {
 +      intel_pt_handle_vmx(1);
 +
        asm volatile (ASM_VMX_VMXON_RAX
                        : : "a"(&addr), "m"(addr)
                        : "memory", "cc");
@@@ -3174,8 -3172,6 +3174,8 @@@ static void vmclear_local_loaded_vmcss(
  static void kvm_cpu_vmxoff(void)
  {
        asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
 +
 +      intel_pt_handle_vmx(0);
  }
  
  static void hardware_disable(void)
@@@ -3390,7 -3386,7 +3390,7 @@@ static __init int setup_vmcs_config(str
                }
        }
  
 -      if (cpu_has_xsaves)
 +      if (boot_cpu_has(X86_FEATURE_XSAVES))
                rdmsrl(MSR_IA32_XSS, host_xss);
  
        return 0;
@@@ -5050,8 -5046,8 +5050,8 @@@ static void vmx_vcpu_reset(struct kvm_v
                vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
  
        cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
-       vmx_set_cr0(vcpu, cr0); /* enter rmode */
        vmx->vcpu.arch.cr0 = cr0;
+       vmx_set_cr0(vcpu, cr0); /* enter rmode */
        vmx_set_cr4(vcpu, 0);
        vmx_set_efer(vcpu, 0);
        vmx_fpu_activate(vcpu);
@@@ -8318,19 -8314,19 +8318,19 @@@ static void vmx_set_apic_access_page_ad
                vmcs_write64(APIC_ACCESS_ADDR, hpa);
  }
  
- static void vmx_hwapic_isr_update(struct kvm *kvm, int isr)
+ static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
  {
        u16 status;
        u8 old;
  
-       if (isr == -1)
-               isr = 0;
+       if (max_isr == -1)
+               max_isr = 0;
  
        status = vmcs_read16(GUEST_INTR_STATUS);
        old = status >> 8;
-       if (isr != old) {
+       if (max_isr != old) {
                status &= 0xff;
-               status |= isr << 8;
+               status |= max_isr << 8;
                vmcs_write16(GUEST_INTR_STATUS, status);
        }
  }
diff --combined arch/x86/kvm/x86.c
index 12f33e6623826dfcd0af660a534e8240683bc1a2,a8c7ca34ee5d30b88725474159abd64eee95781e..c805cf494154f8e7609ab89e148e210703690d90
@@@ -161,6 -161,7 +161,7 @@@ struct kvm_stats_debugfs_item debugfs_e
        { "halt_exits", VCPU_STAT(halt_exits) },
        { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
        { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
+       { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
        { "halt_wakeup", VCPU_STAT(halt_wakeup) },
        { "hypercalls", VCPU_STAT(hypercalls) },
        { "request_irq", VCPU_STAT(request_irq_exits) },
@@@ -2002,22 -2003,8 +2003,8 @@@ static void kvmclock_reset(struct kvm_v
        vcpu->arch.pv_time_enabled = false;
  }
  
- static void accumulate_steal_time(struct kvm_vcpu *vcpu)
- {
-       u64 delta;
-       if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
-               return;
-       delta = current->sched_info.run_delay - vcpu->arch.st.last_steal;
-       vcpu->arch.st.last_steal = current->sched_info.run_delay;
-       vcpu->arch.st.accum_steal = delta;
- }
  static void record_steal_time(struct kvm_vcpu *vcpu)
  {
-       accumulate_steal_time(vcpu);
        if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
                return;
  
                &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
                return;
  
-       vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal;
-       vcpu->arch.st.steal.version += 2;
-       vcpu->arch.st.accum_steal = 0;
+       if (vcpu->arch.st.steal.version & 1)
+               vcpu->arch.st.steal.version += 1;  /* first time write, random junk */
+       vcpu->arch.st.steal.version += 1;
+       kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+               &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
+       smp_wmb();
+       vcpu->arch.st.steal.steal += current->sched_info.run_delay -
+               vcpu->arch.st.last_steal;
+       vcpu->arch.st.last_steal = current->sched_info.run_delay;
+       kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+               &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
+       smp_wmb();
+       vcpu->arch.st.steal.version += 1;
  
        kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
                &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
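The rewritten update follows a seqcount-style protocol: version is pushed to guest memory as an odd value before the payload changes and returns to an even value afterwards, with smp_wmb() ordering each step. A minimal sketch of the matching guest-side reader, with placeholder naming (the real consumer is the guest's paravirtual steal-time accounting, which is not part of this diff):

        static u64 steal_time_read(struct kvm_steal_time *st)
        {
                u32 version;
                u64 steal;

                do {
                        version = READ_ONCE(st->version);
                        smp_rmb();      /* pairs with the host's smp_wmb() */
                        steal = READ_ONCE(st->steal);
                        smp_rmb();
                        /* retry while an update is in flight or we raced with one */
                } while ((version & 1) || version != READ_ONCE(st->version));

                return steal;
        }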
@@@ -2611,7 -2615,7 +2615,7 @@@ int kvm_vm_ioctl_check_extension(struc
                r = KVM_MAX_MCE_BANKS;
                break;
        case KVM_CAP_XCRS:
 -              r = cpu_has_xsave;
 +              r = boot_cpu_has(X86_FEATURE_XSAVE);
                break;
        case KVM_CAP_TSC_CONTROL:
                r = kvm_has_tsc_control;
@@@ -3094,7 -3098,7 +3098,7 @@@ static void load_xsave(struct kvm_vcpu 
  
        /* Set XSTATE_BV and possibly XCOMP_BV.  */
        xsave->header.xfeatures = xstate_bv;
 -      if (cpu_has_xsaves)
 +      if (boot_cpu_has(X86_FEATURE_XSAVES))
                xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED;
  
        /*
  static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
                                         struct kvm_xsave *guest_xsave)
  {
 -      if (cpu_has_xsave) {
 +      if (boot_cpu_has(X86_FEATURE_XSAVE)) {
                memset(guest_xsave, 0, sizeof(struct kvm_xsave));
                fill_xsave((u8 *) guest_xsave->region, vcpu);
        } else {
@@@ -3139,7 -3143,7 +3143,7 @@@ static int kvm_vcpu_ioctl_x86_set_xsave
        u64 xstate_bv =
                *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
  
 -      if (cpu_has_xsave) {
 +      if (boot_cpu_has(X86_FEATURE_XSAVE)) {
                /*
                 * Here we allow setting states that are not present in
                 * CPUID leaf 0xD, index 0, EDX:EAX.  This is for compatibility
  static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
                                        struct kvm_xcrs *guest_xcrs)
  {
 -      if (!cpu_has_xsave) {
 +      if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
                guest_xcrs->nr_xcrs = 0;
                return;
        }
@@@ -3176,7 -3180,7 +3180,7 @@@ static int kvm_vcpu_ioctl_x86_set_xcrs(
  {
        int i, r = 0;
  
 -      if (!cpu_has_xsave)
 +      if (!boot_cpu_has(X86_FEATURE_XSAVE))
                return -EINVAL;
  
        if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
@@@ -5865,7 -5869,7 +5869,7 @@@ int kvm_arch_init(void *opaque
  
        perf_register_guest_info_callbacks(&kvm_guest_cbs);
  
 -      if (cpu_has_xsave)
 +      if (boot_cpu_has(X86_FEATURE_XSAVE))
                host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
  
        kvm_lapic_init();
@@@ -7293,7 -7297,7 +7297,7 @@@ int kvm_arch_vcpu_ioctl_set_fpu(struct 
  static void fx_init(struct kvm_vcpu *vcpu)
  {
        fpstate_init(&vcpu->arch.guest_fpu.state);
 -      if (cpu_has_xsaves)
 +      if (boot_cpu_has(X86_FEATURE_XSAVES))
                vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv =
                        host_xcr0 | XSTATE_COMPACTION_ENABLED;
  
@@@ -7752,6 -7756,9 +7756,9 @@@ int kvm_arch_init_vm(struct kvm *kvm, u
        kvm_page_track_init(kvm);
        kvm_mmu_init_vm(kvm);
  
+       if (kvm_x86_ops->vm_init)
+               return kvm_x86_ops->vm_init(kvm);
        return 0;
  }
  
@@@ -7873,6 -7880,8 +7880,8 @@@ void kvm_arch_destroy_vm(struct kvm *kv
                x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 0, 0);
                x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
        }
+       if (kvm_x86_ops->vm_destroy)
+               kvm_x86_ops->vm_destroy(kvm);
        kvm_iommu_unmap_guest(kvm);
        kfree(kvm->arch.vpic);
        kfree(kvm->arch.vioapic);
@@@ -8355,19 -8364,21 +8364,21 @@@ bool kvm_arch_has_noncoherent_dma(struc
  }
  EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
  
+ bool kvm_arch_has_irq_bypass(void)
+ {
+       return kvm_x86_ops->update_pi_irte != NULL;
+ }
  int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
                                      struct irq_bypass_producer *prod)
  {
        struct kvm_kernel_irqfd *irqfd =
                container_of(cons, struct kvm_kernel_irqfd, consumer);
  
-       if (kvm_x86_ops->update_pi_irte) {
-               irqfd->producer = prod;
-               return kvm_x86_ops->update_pi_irte(irqfd->kvm,
-                               prod->irq, irqfd->gsi, 1);
-       }
+       irqfd->producer = prod;
  
-       return -EINVAL;
+       return kvm_x86_ops->update_pi_irte(irqfd->kvm,
+                                          prod->irq, irqfd->gsi, 1);
  }
  
  void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
        struct kvm_kernel_irqfd *irqfd =
                container_of(cons, struct kvm_kernel_irqfd, consumer);
  
-       if (!kvm_x86_ops->update_pi_irte) {
-               WARN_ON(irqfd->producer != NULL);
-               return;
-       }
        WARN_ON(irqfd->producer != prod);
        irqfd->producer = NULL;
  
@@@ -8429,3 -8435,5 +8435,5 @@@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
index 97c0028e83889303e49be5a0abc52fd76d2b5dca,2e9443be2b147d87c941e8c4dbb5a4a1f5e0b923..89e7423f0ebbafdf3ed2f7eb693d38d348e9fe35
  
  #include "irq-gic-common.h"
  
+ static const struct gic_kvm_info *gic_kvm_info;
+ const struct gic_kvm_info *gic_get_kvm_info(void)
+ {
+       return gic_kvm_info;
+ }
+ void gic_set_kvm_info(const struct gic_kvm_info *info)
+ {
+       BUG_ON(gic_kvm_info != NULL);
+       gic_kvm_info = info;
+ }
  void gic_enable_quirks(u32 iidr, const struct gic_quirk *quirks,
                void *data)
  {
@@@ -50,26 -63,14 +63,26 @@@ int gic_configure_irq(unsigned int irq
        else if (type & IRQ_TYPE_EDGE_BOTH)
                val |= confmask;
  
 +      /* If the current configuration is the same, then we are done */
 +      if (val == oldval)
 +              return 0;
 +
        /*
         * Write back the new configuration, and possibly re-enable
 -       * the interrupt. If we tried to write a new configuration and failed,
 -       * return an error.
 +       * the interrupt. If we fail to write a new configuration for
 +       * an SPI then WARN and return an error. If we fail to write the
 +       * configuration for a PPI this is most likely because the GIC
 +       * does not allow us to set the configuration or we are in a
 +       * non-secure mode, and hence it may not be catastrophic.
         */
        writel_relaxed(val, base + GIC_DIST_CONFIG + confoff);
 -      if (readl_relaxed(base + GIC_DIST_CONFIG + confoff) != val && val != oldval)
 -              ret = -EINVAL;
 +      if (readl_relaxed(base + GIC_DIST_CONFIG + confoff) != val) {
 +              if (WARN_ON(irq >= 32))
 +                      ret = -EINVAL;
 +              else
 +                      pr_warn("GIC: PPI%d is secure or misconfigured\n",
 +                              irq - 16);
 +      }
  
        if (sync_access)
                sync_access();
index 1a1ea4f733c1f6d3b1f71097135f96e5bc706afb,05a856073714aa2060922e13577c307337214962..fb042ba9a3dbbd239e6cbe6751be24978185eab8
@@@ -15,6 -15,8 +15,8 @@@
   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
   */
  
+ #define pr_fmt(fmt)   "GICv3: " fmt
  #include <linux/acpi.h>
  #include <linux/cpu.h>
  #include <linux/cpu_pm.h>
@@@ -28,8 -30,8 +30,9 @@@
  #include <linux/slab.h>
  
  #include <linux/irqchip.h>
+ #include <linux/irqchip/arm-gic-common.h>
  #include <linux/irqchip/arm-gic-v3.h>
 +#include <linux/irqchip/irq-partition-percpu.h>
  
  #include <asm/cputype.h>
  #include <asm/exception.h>
@@@ -45,7 -47,6 +48,7 @@@ struct redist_region 
  };
  
  struct gic_chip_data {
 +      struct fwnode_handle    *fwnode;
        void __iomem            *dist_base;
        struct redist_region    *redist_regions;
        struct rdists           rdists;
        u64                     redist_stride;
        u32                     nr_redist_regions;
        unsigned int            irq_nr;
 +      struct partition_desc   *ppi_descs[16];
  };
  
  static struct gic_chip_data gic_data __read_mostly;
  static struct static_key supports_deactivate = STATIC_KEY_INIT_TRUE;
  
+ static struct gic_kvm_info gic_v3_kvm_info;
  #define gic_data_rdist()              (this_cpu_ptr(gic_data.rdists.rdist))
  #define gic_data_rdist_rd_base()      (gic_data_rdist()->rd_base)
  #define gic_data_rdist_sgi_base()     (gic_data_rdist_rd_base() + SZ_64K)
@@@ -367,13 -369,6 +372,13 @@@ static asmlinkage void __exception_irq_
                        if (static_key_true(&supports_deactivate))
                                gic_write_dir(irqnr);
  #ifdef CONFIG_SMP
 +                      /*
 +                       * Unlike GICv2, we don't need an smp_rmb() here.
 +                       * The control dependency from gic_read_iar to
 +                       * the ISB in gic_write_eoir is enough to ensure
 +                       * that any shared data read by handle_IPI will
 +                       * be read after the ACK.
 +                       */
                        handle_IPI(irqnr, regs);
  #else
                        WARN_ONCE(true, "Unexpected SGI received!\n");
@@@ -393,15 -388,6 +398,15 @@@ static void __init gic_dist_init(void
        writel_relaxed(0, base + GICD_CTLR);
        gic_dist_wait_for_rwp();
  
 +      /*
 +       * Configure SPIs as non-secure Group-1. This will only matter
 +       * if the GIC only has a single security state. This will not
 +       * do the right thing if the kernel is running in secure mode,
 +       * but that's not the intended use case anyway.
 +       */
 +      for (i = 32; i < gic_data.irq_nr; i += 32)
 +              writel_relaxed(~0, base + GICD_IGROUPR + i / 8);
 +
        gic_dist_config(base, gic_data.irq_nr, gic_dist_wait_for_rwp);
  
        /* Enable distributor with ARE, Group1 */
@@@ -519,9 -505,6 +524,9 @@@ static void gic_cpu_init(void
  
        rbase = gic_data_rdist_sgi_base();
  
 +      /* Configure SGIs/PPIs as non-secure Group-1 */
 +      writel_relaxed(~0, rbase + GICR_IGROUPR0);
 +
        gic_cpu_config(rbase, gic_redist_wait_for_rwp);
  
        /* Give LPIs a spin */
@@@ -834,62 -817,10 +839,62 @@@ static void gic_irq_domain_free(struct 
        }
  }
  
 +static int gic_irq_domain_select(struct irq_domain *d,
 +                               struct irq_fwspec *fwspec,
 +                               enum irq_domain_bus_token bus_token)
 +{
 +      /* Not for us */
 +      if (fwspec->fwnode != d->fwnode)
 +              return 0;
 +
 +      /* If this is not DT, then we have a single domain */
 +      if (!is_of_node(fwspec->fwnode))
 +              return 1;
 +
 +      /*
 +       * If this is a PPI and we have a 4th (non-null) parameter,
 +       * then we need to match the partition domain.
 +       */
 +      if (fwspec->param_count >= 4 &&
 +          fwspec->param[0] == 1 && fwspec->param[3] != 0)
 +              return d == partition_get_domain(gic_data.ppi_descs[fwspec->param[1]]);
 +
 +      return d == gic_data.domain;
 +}
 +
  static const struct irq_domain_ops gic_irq_domain_ops = {
        .translate = gic_irq_domain_translate,
        .alloc = gic_irq_domain_alloc,
        .free = gic_irq_domain_free,
 +      .select = gic_irq_domain_select,
 +};
 +
 +static int partition_domain_translate(struct irq_domain *d,
 +                                    struct irq_fwspec *fwspec,
 +                                    unsigned long *hwirq,
 +                                    unsigned int *type)
 +{
 +      struct device_node *np;
 +      int ret;
 +
 +      np = of_find_node_by_phandle(fwspec->param[3]);
 +      if (WARN_ON(!np))
 +              return -EINVAL;
 +
 +      ret = partition_translate_id(gic_data.ppi_descs[fwspec->param[1]],
 +                                   of_node_to_fwnode(np));
 +      if (ret < 0)
 +              return ret;
 +
 +      *hwirq = ret;
 +      *type = fwspec->param[2] & IRQ_TYPE_SENSE_MASK;
 +
 +      return 0;
 +}
 +
 +static const struct irq_domain_ops partition_domain_ops = {
 +      .translate = partition_domain_translate,
 +      .select = gic_irq_domain_select,
  };
  
  static void gicv3_enable_quirks(void)
@@@ -917,7 -848,6 +922,7 @@@ static int __init gic_init_bases(void _
        if (static_key_true(&supports_deactivate))
                pr_info("GIC: Using split EOI/Deactivate mode\n");
  
 +      gic_data.fwnode = handle;
        gic_data.dist_base = dist_base;
        gic_data.redist_regions = rdist_regs;
        gic_data.nr_redist_regions = nr_redist_regions;
@@@ -976,119 -906,30 +981,143 @@@ static int __init gic_validate_dist_ver
        return 0;
  }
  
- static void gic_populate_ppi_partitions(struct device_node *gic_node)
 +static int get_cpu_number(struct device_node *dn)
 +{
 +      const __be32 *cell;
 +      u64 hwid;
 +      int i;
 +
 +      cell = of_get_property(dn, "reg", NULL);
 +      if (!cell)
 +              return -1;
 +
 +      hwid = of_read_number(cell, of_n_addr_cells(dn));
 +
 +      /*
 +       * Non affinity bits must be set to 0 in the DT
 +       */
 +      if (hwid & ~MPIDR_HWID_BITMASK)
 +              return -1;
 +
 +      for (i = 0; i < num_possible_cpus(); i++)
 +              if (cpu_logical_map(i) == hwid)
 +                      return i;
 +
 +      return -1;
 +}
 +
 +/* Create all possible partitions at boot time */
++static void __init gic_populate_ppi_partitions(struct device_node *gic_node)
 +{
 +      struct device_node *parts_node, *child_part;
 +      int part_idx = 0, i;
 +      int nr_parts;
 +      struct partition_affinity *parts;
 +
 +      parts_node = of_find_node_by_name(gic_node, "ppi-partitions");
 +      if (!parts_node)
 +              return;
 +
 +      nr_parts = of_get_child_count(parts_node);
 +
 +      if (!nr_parts)
 +              return;
 +
 +      parts = kzalloc(sizeof(*parts) * nr_parts, GFP_KERNEL);
 +      if (WARN_ON(!parts))
 +              return;
 +
 +      for_each_child_of_node(parts_node, child_part) {
 +              struct partition_affinity *part;
 +              int n;
 +
 +              part = &parts[part_idx];
 +
 +              part->partition_id = of_node_to_fwnode(child_part);
 +
 +              pr_info("GIC: PPI partition %s[%d] { ",
 +                      child_part->name, part_idx);
 +
 +              n = of_property_count_elems_of_size(child_part, "affinity",
 +                                                  sizeof(u32));
 +              WARN_ON(n <= 0);
 +
 +              for (i = 0; i < n; i++) {
 +                      int err, cpu;
 +                      u32 cpu_phandle;
 +                      struct device_node *cpu_node;
 +
 +                      err = of_property_read_u32_index(child_part, "affinity",
 +                                                       i, &cpu_phandle);
 +                      if (WARN_ON(err))
 +                              continue;
 +
 +                      cpu_node = of_find_node_by_phandle(cpu_phandle);
 +                      if (WARN_ON(!cpu_node))
 +                              continue;
 +
 +                      cpu = get_cpu_number(cpu_node);
 +                      if (WARN_ON(cpu == -1))
 +                              continue;
 +
 +                      pr_cont("%s[%d] ", cpu_node->full_name, cpu);
 +
 +                      cpumask_set_cpu(cpu, &part->mask);
 +              }
 +
 +              pr_cont("}\n");
 +              part_idx++;
 +      }
 +
 +      for (i = 0; i < 16; i++) {
 +              unsigned int irq;
 +              struct partition_desc *desc;
 +              struct irq_fwspec ppi_fwspec = {
 +                      .fwnode         = gic_data.fwnode,
 +                      .param_count    = 3,
 +                      .param          = {
 +                              [0]     = 1,
 +                              [1]     = i,
 +                              [2]     = IRQ_TYPE_NONE,
 +                      },
 +              };
 +
 +              irq = irq_create_fwspec_mapping(&ppi_fwspec);
 +              if (WARN_ON(!irq))
 +                      continue;
 +              desc = partition_create_desc(gic_data.fwnode, parts, nr_parts,
 +                                           irq, &partition_domain_ops);
 +              if (WARN_ON(!desc))
 +                      continue;
 +
 +              gic_data.ppi_descs[i] = desc;
 +      }
 +}
 +
+ static void __init gic_of_setup_kvm_info(struct device_node *node)
+ {
+       int ret;
+       struct resource r;
+       u32 gicv_idx;
+       gic_v3_kvm_info.type = GIC_V3;
+       gic_v3_kvm_info.maint_irq = irq_of_parse_and_map(node, 0);
+       if (!gic_v3_kvm_info.maint_irq)
+               return;
+       if (of_property_read_u32(node, "#redistributor-regions",
+                                &gicv_idx))
+               gicv_idx = 1;
+       gicv_idx += 3;  /* Also skip GICD, GICC, GICH */
+       ret = of_address_to_resource(node, gicv_idx, &r);
+       if (!ret)
+               gic_v3_kvm_info.vcpu = r;
+       gic_set_kvm_info(&gic_v3_kvm_info);
+ }
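As a worked example of the index computed above, assuming the conventional GICv3 devicetree layout (GICD, the redistributor regions, then GICC, GICH and GICV): with the default of a single redistributor region, gicv_idx becomes 1 + 3 = 4, so of_address_to_resource() picks the fifth reg entry, and that GICV window is what gets recorded as the vcpu resource before gic_set_kvm_info() publishes it.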
  static int __init gic_of_init(struct device_node *node, struct device_node *parent)
  {
        void __iomem *dist_base;
  
        err = gic_init_bases(dist_base, rdist_regs, nr_redist_regions,
                             redist_stride, &node->fwnode);
 -      if (!err) {
 -              gic_of_setup_kvm_info(node);
 -              return 0;
 -      }
 +      if (err)
 +              goto out_unmap_rdist;
 +
 +      gic_populate_ppi_partitions(node);
++      gic_of_setup_kvm_info(node);
 +      return 0;
  
  out_unmap_rdist:
        for (i = 0; i < nr_redist_regions; i++)
@@@ -1159,19 -999,25 +1189,25 @@@ out_unmap_dist
  IRQCHIP_DECLARE(gic_v3, "arm,gic-v3", gic_of_init);
  
  #ifdef CONFIG_ACPI
- static void __iomem *dist_base;
- static struct redist_region *redist_regs __initdata;
- static u32 nr_redist_regions __initdata;
- static bool single_redist;
+ static struct
+ {
+       void __iomem *dist_base;
+       struct redist_region *redist_regs;
+       u32 nr_redist_regions;
+       bool single_redist;
+       u32 maint_irq;
+       int maint_irq_mode;
+       phys_addr_t vcpu_base;
+ } acpi_data __initdata;
  
  static void __init
  gic_acpi_register_redist(phys_addr_t phys_base, void __iomem *redist_base)
  {
        static int count = 0;
  
-       redist_regs[count].phys_base = phys_base;
-       redist_regs[count].redist_base = redist_base;
-       redist_regs[count].single_redist = single_redist;
+       acpi_data.redist_regs[count].phys_base = phys_base;
+       acpi_data.redist_regs[count].redist_base = redist_base;
+       acpi_data.redist_regs[count].single_redist = acpi_data.single_redist;
        count++;
  }
  
@@@ -1199,7 -1045,7 +1235,7 @@@ gic_acpi_parse_madt_gicc(struct acpi_su
  {
        struct acpi_madt_generic_interrupt *gicc =
                                (struct acpi_madt_generic_interrupt *)header;
-       u32 reg = readl_relaxed(dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK;
+       u32 reg = readl_relaxed(acpi_data.dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK;
        u32 size = reg == GIC_PIDR2_ARCH_GICv4 ? SZ_64K * 4 : SZ_64K * 2;
        void __iomem *redist_base;
  
@@@ -1216,7 -1062,7 +1252,7 @@@ static int __init gic_acpi_collect_gicr
        acpi_tbl_entry_handler redist_parser;
        enum acpi_madt_type type;
  
-       if (single_redist) {
+       if (acpi_data.single_redist) {
                type = ACPI_MADT_TYPE_GENERIC_INTERRUPT;
                redist_parser = gic_acpi_parse_madt_gicc;
        } else {
@@@ -1267,14 -1113,14 +1303,14 @@@ static int __init gic_acpi_count_gicr_r
        count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR,
                                      gic_acpi_match_gicr, 0);
        if (count > 0) {
-               single_redist = false;
+               acpi_data.single_redist = false;
                return count;
        }
  
        count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT,
                                      gic_acpi_match_gicc, 0);
        if (count > 0)
-               single_redist = true;
+               acpi_data.single_redist = true;
  
        return count;
  }
@@@ -1294,36 -1140,117 +1330,117 @@@ static bool __init acpi_validate_gic_ta
        if (count <= 0)
                return false;
  
-       nr_redist_regions = count;
+       acpi_data.nr_redist_regions = count;
        return true;
  }
  
+ static int __init gic_acpi_parse_virt_madt_gicc(struct acpi_subtable_header *header,
+                                               const unsigned long end)
+ {
+       struct acpi_madt_generic_interrupt *gicc =
+               (struct acpi_madt_generic_interrupt *)header;
+       int maint_irq_mode;
+       static int first_madt = true;
+       /* Skip unusable CPUs */
+       if (!(gicc->flags & ACPI_MADT_ENABLED))
+               return 0;
+       maint_irq_mode = (gicc->flags & ACPI_MADT_VGIC_IRQ_MODE) ?
+               ACPI_EDGE_SENSITIVE : ACPI_LEVEL_SENSITIVE;
+       if (first_madt) {
+               first_madt = false;
+               acpi_data.maint_irq = gicc->vgic_interrupt;
+               acpi_data.maint_irq_mode = maint_irq_mode;
+               acpi_data.vcpu_base = gicc->gicv_base_address;
+               return 0;
+       }
+       /*
+        * The maintenance interrupt and GICV should be the same for every CPU
+        */
+       if ((acpi_data.maint_irq != gicc->vgic_interrupt) ||
+           (acpi_data.maint_irq_mode != maint_irq_mode) ||
+           (acpi_data.vcpu_base != gicc->gicv_base_address))
+               return -EINVAL;
+       return 0;
+ }
+ static bool __init gic_acpi_collect_virt_info(void)
+ {
+       int count;
+       count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT,
+                                     gic_acpi_parse_virt_madt_gicc, 0);
+       return (count > 0);
+ }
  #define ACPI_GICV3_DIST_MEM_SIZE (SZ_64K)
+ #define ACPI_GICV2_VCTRL_MEM_SIZE     (SZ_4K)
+ #define ACPI_GICV2_VCPU_MEM_SIZE      (SZ_8K)
+ static void __init gic_acpi_setup_kvm_info(void)
+ {
+       int irq;
+       if (!gic_acpi_collect_virt_info()) {
+               pr_warn("Unable to get hardware information used for virtualization\n");
+               return;
+       }
+       gic_v3_kvm_info.type = GIC_V3;
+       irq = acpi_register_gsi(NULL, acpi_data.maint_irq,
+                               acpi_data.maint_irq_mode,
+                               ACPI_ACTIVE_HIGH);
+       if (irq <= 0)
+               return;
+       gic_v3_kvm_info.maint_irq = irq;
+       if (acpi_data.vcpu_base) {
+               struct resource *vcpu = &gic_v3_kvm_info.vcpu;
+               vcpu->flags = IORESOURCE_MEM;
+               vcpu->start = acpi_data.vcpu_base;
+               vcpu->end = vcpu->start + ACPI_GICV2_VCPU_MEM_SIZE - 1;
+       }
+       gic_set_kvm_info(&gic_v3_kvm_info);
+ }
  
  static int __init
  gic_acpi_init(struct acpi_subtable_header *header, const unsigned long end)
  {
        struct acpi_madt_generic_distributor *dist;
        struct fwnode_handle *domain_handle;
+       size_t size;
        int i, err;
  
        /* Get distributor base address */
        dist = (struct acpi_madt_generic_distributor *)header;
-       dist_base = ioremap(dist->base_address, ACPI_GICV3_DIST_MEM_SIZE);
-       if (!dist_base) {
+       acpi_data.dist_base = ioremap(dist->base_address,
+                                     ACPI_GICV3_DIST_MEM_SIZE);
+       if (!acpi_data.dist_base) {
                pr_err("Unable to map GICD registers\n");
                return -ENOMEM;
        }
  
-       err = gic_validate_dist_version(dist_base);
+       err = gic_validate_dist_version(acpi_data.dist_base);
        if (err) {
-               pr_err("No distributor detected at @%p, giving up", dist_base);
+               pr_err("No distributor detected at @%p, giving up",
+                      acpi_data.dist_base);
                goto out_dist_unmap;
        }
  
-       redist_regs = kzalloc(sizeof(*redist_regs) * nr_redist_regions,
-                             GFP_KERNEL);
-       if (!redist_regs) {
+       size = sizeof(*acpi_data.redist_regs) * acpi_data.nr_redist_regions;
+       acpi_data.redist_regs = kzalloc(size, GFP_KERNEL);
+       if (!acpi_data.redist_regs) {
                err = -ENOMEM;
                goto out_dist_unmap;
        }
        if (err)
                goto out_redist_unmap;
  
-       domain_handle = irq_domain_alloc_fwnode(dist_base);
+       domain_handle = irq_domain_alloc_fwnode(acpi_data.dist_base);
        if (!domain_handle) {
                err = -ENOMEM;
                goto out_redist_unmap;
        }
  
-       err = gic_init_bases(dist_base, redist_regs, nr_redist_regions, 0,
-                            domain_handle);
+       err = gic_init_bases(acpi_data.dist_base, acpi_data.redist_regs,
+                            acpi_data.nr_redist_regions, 0, domain_handle);
        if (err)
                goto out_fwhandle_free;
  
        acpi_set_irq_model(ACPI_IRQ_MODEL_GIC, domain_handle);
+       gic_acpi_setup_kvm_info();
        return 0;
  
  out_fwhandle_free:
        irq_domain_free_fwnode(domain_handle);
  out_redist_unmap:
-       for (i = 0; i < nr_redist_regions; i++)
-               if (redist_regs[i].redist_base)
-                       iounmap(redist_regs[i].redist_base);
-       kfree(redist_regs);
+       for (i = 0; i < acpi_data.nr_redist_regions; i++)
+               if (acpi_data.redist_regs[i].redist_base)
+                       iounmap(acpi_data.redist_regs[i].redist_base);
+       kfree(acpi_data.redist_regs);
  out_dist_unmap:
-       iounmap(dist_base);
+       iounmap(acpi_data.dist_base);
        return err;
  }
  IRQCHIP_ACPI_DECLARE(gic_v3, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR,
index 1de20e14a721111737bdab39c575ab7a45ddc63b,3f1d9fd3a4620bbcd25e3719615920e639471bd8..b4e647179346611a5a296a373a0aacdabf9fc9b0
@@@ -55,7 -55,7 +55,7 @@@
  
  static void gic_check_cpu_features(void)
  {
 -      WARN_TAINT_ONCE(cpus_have_cap(ARM64_HAS_SYSREG_GIC_CPUIF),
 +      WARN_TAINT_ONCE(this_cpu_has_cap(ARM64_HAS_SYSREG_GIC_CPUIF),
                        TAINT_CPU_OUT_OF_SPEC,
                        "GICv3 system registers enabled, broken firmware!\n");
  }
@@@ -72,9 -72,6 +72,9 @@@ struct gic_chip_data 
        struct irq_chip chip;
        union gic_base dist_base;
        union gic_base cpu_base;
 +      void __iomem *raw_dist_base;
 +      void __iomem *raw_cpu_base;
 +      u32 percpu_offset;
  #ifdef CONFIG_CPU_PM
        u32 saved_spi_enable[DIV_ROUND_UP(1020, 32)];
        u32 saved_spi_active[DIV_ROUND_UP(1020, 32)];
@@@ -105,6 -102,8 +105,8 @@@ static struct static_key supports_deact
  
  static struct gic_chip_data gic_data[CONFIG_ARM_GIC_MAX_NR] __read_mostly;
  
+ static struct gic_kvm_info gic_v2_kvm_info;
  #ifdef CONFIG_GIC_NON_BANKED
  static void __iomem *gic_get_percpu_base(union gic_base *base)
  {
@@@ -347,14 -346,6 +349,14 @@@ static void __exception_irq_entry gic_h
                        if (static_key_true(&supports_deactivate))
                                writel_relaxed(irqstat, cpu_base + GIC_CPU_DEACTIVATE);
  #ifdef CONFIG_SMP
 +                      /*
 +                       * Ensure any shared data written by the CPU sending
 +                       * the IPI is read after we've read the ACK register
 +                       * on the GIC.
 +                       *
 +                       * Pairs with the write barrier in gic_raise_softirq
 +                       */
 +                      smp_rmb();
                        handle_IPI(irqnr, regs);
  #endif
                        continue;
@@@ -402,6 -393,20 +404,6 @@@ static struct irq_chip gic_chip = 
                                  IRQCHIP_MASK_ON_SUSPEND,
  };
  
 -static struct irq_chip gic_eoimode1_chip = {
 -      .name                   = "GICv2",
 -      .irq_mask               = gic_eoimode1_mask_irq,
 -      .irq_unmask             = gic_unmask_irq,
 -      .irq_eoi                = gic_eoimode1_eoi_irq,
 -      .irq_set_type           = gic_set_type,
 -      .irq_get_irqchip_state  = gic_irq_get_irqchip_state,
 -      .irq_set_irqchip_state  = gic_irq_set_irqchip_state,
 -      .irq_set_vcpu_affinity  = gic_irq_set_vcpu_affinity,
 -      .flags                  = IRQCHIP_SET_TYPE_MASKED |
 -                                IRQCHIP_SKIP_SET_WAKE |
 -                                IRQCHIP_MASK_ON_SUSPEND,
 -};
 -
  void __init gic_cascade_irq(unsigned int gic_nr, unsigned int irq)
  {
        BUG_ON(gic_nr >= CONFIG_ARM_GIC_MAX_NR);
@@@ -470,7 -475,7 +472,7 @@@ static void __init gic_dist_init(struc
        writel_relaxed(GICD_ENABLE, base + GIC_DIST_CTRL);
  }
  
 -static void gic_cpu_init(struct gic_chip_data *gic)
 +static int gic_cpu_init(struct gic_chip_data *gic)
  {
        void __iomem *dist_base = gic_data_dist_base(gic);
        void __iomem *base = gic_data_cpu_base(gic);
                /*
                 * Get what the GIC says our CPU mask is.
                 */
 -              BUG_ON(cpu >= NR_GIC_CPU_IF);
 +              if (WARN_ON(cpu >= NR_GIC_CPU_IF))
 +                      return -EINVAL;
 +
 +              gic_check_cpu_features();
                cpu_mask = gic_get_cpumask(gic);
                gic_cpu_map[cpu] = cpu_mask;
  
  
        writel_relaxed(GICC_INT_PRI_THRESHOLD, base + GIC_CPU_PRIMASK);
        gic_cpu_if_up(gic);
 +
 +      return 0;
  }
  
  int gic_cpu_if_down(unsigned int gic_nr)
   * this function, no interrupts will be delivered by the GIC, and another
   * platform-specific wakeup source must be enabled.
   */
 -static void gic_dist_save(unsigned int gic_nr)
 +static void gic_dist_save(struct gic_chip_data *gic)
  {
        unsigned int gic_irqs;
        void __iomem *dist_base;
        int i;
  
 -      BUG_ON(gic_nr >= CONFIG_ARM_GIC_MAX_NR);
 +      if (WARN_ON(!gic))
 +              return;
  
 -      gic_irqs = gic_data[gic_nr].gic_irqs;
 -      dist_base = gic_data_dist_base(&gic_data[gic_nr]);
 +      gic_irqs = gic->gic_irqs;
 +      dist_base = gic_data_dist_base(gic);
  
        if (!dist_base)
                return;
  
        for (i = 0; i < DIV_ROUND_UP(gic_irqs, 16); i++)
 -              gic_data[gic_nr].saved_spi_conf[i] =
 +              gic->saved_spi_conf[i] =
                        readl_relaxed(dist_base + GIC_DIST_CONFIG + i * 4);
  
        for (i = 0; i < DIV_ROUND_UP(gic_irqs, 4); i++)
 -              gic_data[gic_nr].saved_spi_target[i] =
 +              gic->saved_spi_target[i] =
                        readl_relaxed(dist_base + GIC_DIST_TARGET + i * 4);
  
        for (i = 0; i < DIV_ROUND_UP(gic_irqs, 32); i++)
 -              gic_data[gic_nr].saved_spi_enable[i] =
 +              gic->saved_spi_enable[i] =
                        readl_relaxed(dist_base + GIC_DIST_ENABLE_SET + i * 4);
  
        for (i = 0; i < DIV_ROUND_UP(gic_irqs, 32); i++)
 -              gic_data[gic_nr].saved_spi_active[i] =
 +              gic->saved_spi_active[i] =
                        readl_relaxed(dist_base + GIC_DIST_ACTIVE_SET + i * 4);
  }
  
   * handled normally, but any edge interrupts that occurred will not be seen by
   * the GIC and need to be handled by the platform-specific wakeup source.
   */
 -static void gic_dist_restore(unsigned int gic_nr)
 +static void gic_dist_restore(struct gic_chip_data *gic)
  {
        unsigned int gic_irqs;
        unsigned int i;
        void __iomem *dist_base;
  
 -      BUG_ON(gic_nr >= CONFIG_ARM_GIC_MAX_NR);
 +      if (WARN_ON(!gic))
 +              return;
  
 -      gic_irqs = gic_data[gic_nr].gic_irqs;
 -      dist_base = gic_data_dist_base(&gic_data[gic_nr]);
 +      gic_irqs = gic->gic_irqs;
 +      dist_base = gic_data_dist_base(gic);
  
        if (!dist_base)
                return;
        writel_relaxed(GICD_DISABLE, dist_base + GIC_DIST_CTRL);
  
        for (i = 0; i < DIV_ROUND_UP(gic_irqs, 16); i++)
 -              writel_relaxed(gic_data[gic_nr].saved_spi_conf[i],
 +              writel_relaxed(gic->saved_spi_conf[i],
                        dist_base + GIC_DIST_CONFIG + i * 4);
  
        for (i = 0; i < DIV_ROUND_UP(gic_irqs, 4); i++)
                        dist_base + GIC_DIST_PRI + i * 4);
  
        for (i = 0; i < DIV_ROUND_UP(gic_irqs, 4); i++)
 -              writel_relaxed(gic_data[gic_nr].saved_spi_target[i],
 +              writel_relaxed(gic->saved_spi_target[i],
                        dist_base + GIC_DIST_TARGET + i * 4);
  
        for (i = 0; i < DIV_ROUND_UP(gic_irqs, 32); i++) {
                writel_relaxed(GICD_INT_EN_CLR_X32,
                        dist_base + GIC_DIST_ENABLE_CLEAR + i * 4);
 -              writel_relaxed(gic_data[gic_nr].saved_spi_enable[i],
 +              writel_relaxed(gic->saved_spi_enable[i],
                        dist_base + GIC_DIST_ENABLE_SET + i * 4);
        }
  
        for (i = 0; i < DIV_ROUND_UP(gic_irqs, 32); i++) {
                writel_relaxed(GICD_INT_EN_CLR_X32,
                        dist_base + GIC_DIST_ACTIVE_CLEAR + i * 4);
 -              writel_relaxed(gic_data[gic_nr].saved_spi_active[i],
 +              writel_relaxed(gic->saved_spi_active[i],
                        dist_base + GIC_DIST_ACTIVE_SET + i * 4);
        }
  
        writel_relaxed(GICD_ENABLE, dist_base + GIC_DIST_CTRL);
  }
  
 -static void gic_cpu_save(unsigned int gic_nr)
 +static void gic_cpu_save(struct gic_chip_data *gic)
  {
        int i;
        u32 *ptr;
        void __iomem *dist_base;
        void __iomem *cpu_base;
  
 -      BUG_ON(gic_nr >= CONFIG_ARM_GIC_MAX_NR);
 +      if (WARN_ON(!gic))
 +              return;
  
 -      dist_base = gic_data_dist_base(&gic_data[gic_nr]);
 -      cpu_base = gic_data_cpu_base(&gic_data[gic_nr]);
 +      dist_base = gic_data_dist_base(gic);
 +      cpu_base = gic_data_cpu_base(gic);
  
        if (!dist_base || !cpu_base)
                return;
  
 -      ptr = raw_cpu_ptr(gic_data[gic_nr].saved_ppi_enable);
 +      ptr = raw_cpu_ptr(gic->saved_ppi_enable);
        for (i = 0; i < DIV_ROUND_UP(32, 32); i++)
                ptr[i] = readl_relaxed(dist_base + GIC_DIST_ENABLE_SET + i * 4);
  
 -      ptr = raw_cpu_ptr(gic_data[gic_nr].saved_ppi_active);
 +      ptr = raw_cpu_ptr(gic->saved_ppi_active);
        for (i = 0; i < DIV_ROUND_UP(32, 32); i++)
                ptr[i] = readl_relaxed(dist_base + GIC_DIST_ACTIVE_SET + i * 4);
  
 -      ptr = raw_cpu_ptr(gic_data[gic_nr].saved_ppi_conf);
 +      ptr = raw_cpu_ptr(gic->saved_ppi_conf);
        for (i = 0; i < DIV_ROUND_UP(32, 16); i++)
                ptr[i] = readl_relaxed(dist_base + GIC_DIST_CONFIG + i * 4);
  
  }
  
 -static void gic_cpu_restore(unsigned int gic_nr)
 +static void gic_cpu_restore(struct gic_chip_data *gic)
  {
        int i;
        u32 *ptr;
        void __iomem *dist_base;
        void __iomem *cpu_base;
  
 -      BUG_ON(gic_nr >= CONFIG_ARM_GIC_MAX_NR);
 +      if (WARN_ON(!gic))
 +              return;
  
 -      dist_base = gic_data_dist_base(&gic_data[gic_nr]);
 -      cpu_base = gic_data_cpu_base(&gic_data[gic_nr]);
 +      dist_base = gic_data_dist_base(gic);
 +      cpu_base = gic_data_cpu_base(gic);
  
        if (!dist_base || !cpu_base)
                return;
  
 -      ptr = raw_cpu_ptr(gic_data[gic_nr].saved_ppi_enable);
 +      ptr = raw_cpu_ptr(gic->saved_ppi_enable);
        for (i = 0; i < DIV_ROUND_UP(32, 32); i++) {
                writel_relaxed(GICD_INT_EN_CLR_X32,
                               dist_base + GIC_DIST_ENABLE_CLEAR + i * 4);
                writel_relaxed(ptr[i], dist_base + GIC_DIST_ENABLE_SET + i * 4);
        }
  
 -      ptr = raw_cpu_ptr(gic_data[gic_nr].saved_ppi_active);
 +      ptr = raw_cpu_ptr(gic->saved_ppi_active);
        for (i = 0; i < DIV_ROUND_UP(32, 32); i++) {
                writel_relaxed(GICD_INT_EN_CLR_X32,
                               dist_base + GIC_DIST_ACTIVE_CLEAR + i * 4);
                writel_relaxed(ptr[i], dist_base + GIC_DIST_ACTIVE_SET + i * 4);
        }
  
 -      ptr = raw_cpu_ptr(gic_data[gic_nr].saved_ppi_conf);
 +      ptr = raw_cpu_ptr(gic->saved_ppi_conf);
        for (i = 0; i < DIV_ROUND_UP(32, 16); i++)
                writel_relaxed(ptr[i], dist_base + GIC_DIST_CONFIG + i * 4);
  
                                        dist_base + GIC_DIST_PRI + i * 4);
  
        writel_relaxed(GICC_INT_PRI_THRESHOLD, cpu_base + GIC_CPU_PRIMASK);
 -      gic_cpu_if_up(&gic_data[gic_nr]);
 +      gic_cpu_if_up(gic);
  }
  
  static int gic_notifier(struct notifier_block *self, unsigned long cmd,       void *v)
  #endif
                switch (cmd) {
                case CPU_PM_ENTER:
 -                      gic_cpu_save(i);
 +                      gic_cpu_save(&gic_data[i]);
                        break;
                case CPU_PM_ENTER_FAILED:
                case CPU_PM_EXIT:
 -                      gic_cpu_restore(i);
 +                      gic_cpu_restore(&gic_data[i]);
                        break;
                case CPU_CLUSTER_PM_ENTER:
 -                      gic_dist_save(i);
 +                      gic_dist_save(&gic_data[i]);
                        break;
                case CPU_CLUSTER_PM_ENTER_FAILED:
                case CPU_CLUSTER_PM_EXIT:
 -                      gic_dist_restore(i);
 +                      gic_dist_restore(&gic_data[i]);
                        break;
                }
        }
@@@ -725,39 -721,26 +727,39 @@@ static struct notifier_block gic_notifi
        .notifier_call = gic_notifier,
  };
  
 -static void __init gic_pm_init(struct gic_chip_data *gic)
 +static int __init gic_pm_init(struct gic_chip_data *gic)
  {
        gic->saved_ppi_enable = __alloc_percpu(DIV_ROUND_UP(32, 32) * 4,
                sizeof(u32));
 -      BUG_ON(!gic->saved_ppi_enable);
 +      if (WARN_ON(!gic->saved_ppi_enable))
 +              return -ENOMEM;
  
        gic->saved_ppi_active = __alloc_percpu(DIV_ROUND_UP(32, 32) * 4,
                sizeof(u32));
 -      BUG_ON(!gic->saved_ppi_active);
 +      if (WARN_ON(!gic->saved_ppi_active))
 +              goto free_ppi_enable;
  
        gic->saved_ppi_conf = __alloc_percpu(DIV_ROUND_UP(32, 16) * 4,
                sizeof(u32));
 -      BUG_ON(!gic->saved_ppi_conf);
 +      if (WARN_ON(!gic->saved_ppi_conf))
 +              goto free_ppi_active;
  
        if (gic == &gic_data[0])
                cpu_pm_register_notifier(&gic_notifier_block);
 +
 +      return 0;
 +
 +free_ppi_active:
 +      free_percpu(gic->saved_ppi_active);
 +free_ppi_enable:
 +      free_percpu(gic->saved_ppi_enable);
 +
 +      return -ENOMEM;
  }
  #else
 -static void __init gic_pm_init(struct gic_chip_data *gic)
 +static int __init gic_pm_init(struct gic_chip_data *gic)
  {
 +      return 0;
  }
  #endif
  
@@@ -1030,63 -1013,63 +1032,63 @@@ static const struct irq_domain_ops gic_
        .unmap = gic_irq_domain_unmap,
  };
  
 -static void __init __gic_init_bases(unsigned int gic_nr, int irq_start,
 -                         void __iomem *dist_base, void __iomem *cpu_base,
 -                         u32 percpu_offset, struct fwnode_handle *handle)
 +static int __init __gic_init_bases(struct gic_chip_data *gic, int irq_start,
 +                                 struct fwnode_handle *handle)
  {
        irq_hw_number_t hwirq_base;
 -      struct gic_chip_data *gic;
 -      int gic_irqs, irq_base, i;
 -
 -      BUG_ON(gic_nr >= CONFIG_ARM_GIC_MAX_NR);
 +      int gic_irqs, irq_base, i, ret;
  
 -      gic_check_cpu_features();
 -
 -      gic = &gic_data[gic_nr];
 +      if (WARN_ON(!gic || gic->domain))
 +              return -EINVAL;
  
        /* Initialize irq_chip */
 -      if (static_key_true(&supports_deactivate) && gic_nr == 0) {
 -              gic->chip = gic_eoimode1_chip;
 +      gic->chip = gic_chip;
 +
 +      if (static_key_true(&supports_deactivate) && gic == &gic_data[0]) {
 +              gic->chip.irq_mask = gic_eoimode1_mask_irq;
 +              gic->chip.irq_eoi = gic_eoimode1_eoi_irq;
 +              gic->chip.irq_set_vcpu_affinity = gic_irq_set_vcpu_affinity;
 +              gic->chip.name = kasprintf(GFP_KERNEL, "GICv2");
        } else {
 -              gic->chip = gic_chip;
 -              gic->chip.name = kasprintf(GFP_KERNEL, "GIC-%d", gic_nr);
 +              gic->chip.name = kasprintf(GFP_KERNEL, "GIC-%d",
 +                                         (int)(gic - &gic_data[0]));
        }
  
  #ifdef CONFIG_SMP
 -      if (gic_nr == 0)
 +      if (gic == &gic_data[0])
                gic->chip.irq_set_affinity = gic_set_affinity;
  #endif
  
 -#ifdef CONFIG_GIC_NON_BANKED
 -      if (percpu_offset) { /* Frankein-GIC without banked registers... */
 +      if (IS_ENABLED(CONFIG_GIC_NON_BANKED) && gic->percpu_offset) {
 +              /* Frankein-GIC without banked registers... */
                unsigned int cpu;
  
                gic->dist_base.percpu_base = alloc_percpu(void __iomem *);
                gic->cpu_base.percpu_base = alloc_percpu(void __iomem *);
                if (WARN_ON(!gic->dist_base.percpu_base ||
                            !gic->cpu_base.percpu_base)) {
 -                      free_percpu(gic->dist_base.percpu_base);
 -                      free_percpu(gic->cpu_base.percpu_base);
 -                      return;
 +                      ret = -ENOMEM;
 +                      goto error;
                }
  
                for_each_possible_cpu(cpu) {
                        u32 mpidr = cpu_logical_map(cpu);
                        u32 core_id = MPIDR_AFFINITY_LEVEL(mpidr, 0);
 -                      unsigned long offset = percpu_offset * core_id;
 -                      *per_cpu_ptr(gic->dist_base.percpu_base, cpu) = dist_base + offset;
 -                      *per_cpu_ptr(gic->cpu_base.percpu_base, cpu) = cpu_base + offset;
 +                      unsigned long offset = gic->percpu_offset * core_id;
 +                      *per_cpu_ptr(gic->dist_base.percpu_base, cpu) =
 +                              gic->raw_dist_base + offset;
 +                      *per_cpu_ptr(gic->cpu_base.percpu_base, cpu) =
 +                              gic->raw_cpu_base + offset;
                }
  
                gic_set_base_accessor(gic, gic_get_percpu_base);
 -      } else
 -#endif
 -      {                       /* Normal, sane GIC... */
 -              WARN(percpu_offset,
 +      } else {
 +              /* Normal, sane GIC... */
 +              WARN(gic->percpu_offset,
                     "GIC_NON_BANKED not enabled, ignoring %08x offset!",
 -                   percpu_offset);
 -              gic->dist_base.common_base = dist_base;
 -              gic->cpu_base.common_base = cpu_base;
 +                   gic->percpu_offset);
 +              gic->dist_base.common_base = gic->raw_dist_base;
 +              gic->cpu_base.common_base = gic->raw_cpu_base;
                gic_set_base_accessor(gic, gic_get_common_base);
        }
  
                 * For primary GICs, skip over SGIs.
                 * For secondary GICs, skip over PPIs, too.
                 */
 -              if (gic_nr == 0 && (irq_start & 31) > 0) {
 +              if (gic == &gic_data[0] && (irq_start & 31) > 0) {
                        hwirq_base = 16;
                        if (irq_start != -1)
                                irq_start = (irq_start & ~31) + 16;
                                        hwirq_base, &gic_irq_domain_ops, gic);
        }
  
 -      if (WARN_ON(!gic->domain))
 -              return;
 +      if (WARN_ON(!gic->domain)) {
 +              ret = -ENODEV;
 +              goto error;
 +      }
  
 -      if (gic_nr == 0) {
 +      if (gic == &gic_data[0]) {
                /*
                 * Initialize the CPU interface map to all CPUs.
                 * It will be refined as each CPU probes its ID.
        }
  
        gic_dist_init(gic);
 -      gic_cpu_init(gic);
 -      gic_pm_init(gic);
 +      ret = gic_cpu_init(gic);
 +      if (ret)
 +              goto error;
 +
 +      ret = gic_pm_init(gic);
 +      if (ret)
 +              goto error;
 +
 +      return 0;
 +
 +error:
 +      if (IS_ENABLED(CONFIG_GIC_NON_BANKED) && gic->percpu_offset) {
 +              free_percpu(gic->dist_base.percpu_base);
 +              free_percpu(gic->cpu_base.percpu_base);
 +      }
 +
 +      kfree(gic->chip.name);
 +
 +      return ret;
  }
  
  void __init gic_init(unsigned int gic_nr, int irq_start,
                     void __iomem *dist_base, void __iomem *cpu_base)
  {
 +      struct gic_chip_data *gic;
 +
 +      if (WARN_ON(gic_nr >= CONFIG_ARM_GIC_MAX_NR))
 +              return;
 +
        /*
         * Non-DT/ACPI systems won't run a hypervisor, so let's not
         * bother with these...
         */
        static_key_slow_dec(&supports_deactivate);
 -      __gic_init_bases(gic_nr, irq_start, dist_base, cpu_base, 0, NULL);
 +
 +      gic = &gic_data[gic_nr];
 +      gic->raw_dist_base = dist_base;
 +      gic->raw_cpu_base = cpu_base;
 +
 +      __gic_init_bases(gic, irq_start, NULL);
 +}
 +
 +static void gic_teardown(struct gic_chip_data *gic)
 +{
 +      if (WARN_ON(!gic))
 +              return;
 +
 +      if (gic->raw_dist_base)
 +              iounmap(gic->raw_dist_base);
 +      if (gic->raw_cpu_base)
 +              iounmap(gic->raw_cpu_base);
  }
  
  #ifdef CONFIG_OF
@@@ -1248,63 -1191,62 +1250,88 @@@ static bool gic_check_eoimode(struct de
        return true;
  }
  
- static int gic_of_setup(struct gic_chip_data *gic, struct device_node *node)
++static int __init gic_of_setup(struct gic_chip_data *gic, struct device_node *node)
 +{
 +      if (!gic || !node)
 +              return -EINVAL;
 +
 +      gic->raw_dist_base = of_iomap(node, 0);
 +      if (WARN(!gic->raw_dist_base, "unable to map gic dist registers\n"))
 +              goto error;
 +
 +      gic->raw_cpu_base = of_iomap(node, 1);
 +      if (WARN(!gic->raw_cpu_base, "unable to map gic cpu registers\n"))
 +              goto error;
 +
 +      if (of_property_read_u32(node, "cpu-offset", &gic->percpu_offset))
 +              gic->percpu_offset = 0;
 +
 +      return 0;
 +
 +error:
 +      gic_teardown(gic);
 +
 +      return -ENOMEM;
 +}
 +
+ static void __init gic_of_setup_kvm_info(struct device_node *node)
+ {
+       int ret;
+       struct resource *vctrl_res = &gic_v2_kvm_info.vctrl;
+       struct resource *vcpu_res = &gic_v2_kvm_info.vcpu;
+
+       gic_v2_kvm_info.type = GIC_V2;
+
+       gic_v2_kvm_info.maint_irq = irq_of_parse_and_map(node, 0);
+       if (!gic_v2_kvm_info.maint_irq)
+               return;
+
+       ret = of_address_to_resource(node, 2, vctrl_res);
+       if (ret)
+               return;
+
+       ret = of_address_to_resource(node, 3, vcpu_res);
+       if (ret)
+               return;
+
+       gic_set_kvm_info(&gic_v2_kvm_info);
+ }
+
  int __init
  gic_of_init(struct device_node *node, struct device_node *parent)
  {
 -      void __iomem *cpu_base;
 -      void __iomem *dist_base;
 -      u32 percpu_offset;
 -      int irq;
 +      struct gic_chip_data *gic;
 +      int irq, ret;
  
        if (WARN_ON(!node))
                return -ENODEV;
  
 -      dist_base = of_iomap(node, 0);
 -      WARN(!dist_base, "unable to map gic dist registers\n");
 +      if (WARN_ON(gic_cnt >= CONFIG_ARM_GIC_MAX_NR))
 +              return -EINVAL;
 +
 +      gic = &gic_data[gic_cnt];
  
 -      cpu_base = of_iomap(node, 1);
 -      WARN(!cpu_base, "unable to map gic cpu registers\n");
 +      ret = gic_of_setup(gic, node);
 +      if (ret)
 +              return ret;
  
        /*
         * Disable split EOI/Deactivate if either HYP is not available
         * or the CPU interface is too small.
         */
 -      if (gic_cnt == 0 && !gic_check_eoimode(node, &cpu_base))
 +      if (gic_cnt == 0 && !gic_check_eoimode(node, &gic->raw_cpu_base))
                static_key_slow_dec(&supports_deactivate);
  
 -      if (of_property_read_u32(node, "cpu-offset", &percpu_offset))
 -              percpu_offset = 0;
 +      ret = __gic_init_bases(gic, -1, &node->fwnode);
 +      if (ret) {
 +              gic_teardown(gic);
 +              return ret;
 +      }
  
-       if (!gic_cnt)
 -      __gic_init_bases(gic_cnt, -1, dist_base, cpu_base, percpu_offset,
 -                       &node->fwnode);
+       if (!gic_cnt) {
                gic_init_physaddr(node);
+               gic_of_setup_kvm_info(node);
+       }
  
        if (parent) {
                irq = irq_of_parse_and_map(node, 0);
@@@ -1330,7 -1272,14 +1357,14 @@@ IRQCHIP_DECLARE(pl390, "arm,pl390", gic
  #endif
  
  #ifdef CONFIG_ACPI
- static phys_addr_t cpu_phy_base __initdata;
+ static struct
+ {
+       phys_addr_t cpu_phys_base;
+       u32 maint_irq;
+       int maint_irq_mode;
+       phys_addr_t vctrl_base;
+       phys_addr_t vcpu_base;
+ } acpi_data __initdata;
  
  static int __init
  gic_acpi_parse_madt_cpu(struct acpi_subtable_header *header,
         * All CPU interface addresses have to be the same.
         */
        gic_cpu_base = processor->base_address;
-       if (cpu_base_assigned && gic_cpu_base != cpu_phy_base)
+       if (cpu_base_assigned && gic_cpu_base != acpi_data.cpu_phys_base)
                return -EINVAL;
  
-       cpu_phy_base = gic_cpu_base;
+       acpi_data.cpu_phys_base = gic_cpu_base;
+       acpi_data.maint_irq = processor->vgic_interrupt;
+       acpi_data.maint_irq_mode = (processor->flags & ACPI_MADT_VGIC_IRQ_MODE) ?
+                                   ACPI_EDGE_SENSITIVE : ACPI_LEVEL_SENSITIVE;
+       acpi_data.vctrl_base = processor->gich_base_address;
+       acpi_data.vcpu_base = processor->gicv_base_address;
        cpu_base_assigned = 1;
        return 0;
  }
@@@ -1384,14 -1339,49 +1424,49 @@@ static bool __init gic_validate_dist(st
  
  #define ACPI_GICV2_DIST_MEM_SIZE      (SZ_4K)
  #define ACPI_GIC_CPU_IF_MEM_SIZE      (SZ_8K)
+ #define ACPI_GICV2_VCTRL_MEM_SIZE     (SZ_4K)
+ #define ACPI_GICV2_VCPU_MEM_SIZE      (SZ_8K)
+
+ static void __init gic_acpi_setup_kvm_info(void)
+ {
+       int irq;
+       struct resource *vctrl_res = &gic_v2_kvm_info.vctrl;
+       struct resource *vcpu_res = &gic_v2_kvm_info.vcpu;
+
+       gic_v2_kvm_info.type = GIC_V2;
+
+       if (!acpi_data.vctrl_base)
+               return;
+
+       vctrl_res->flags = IORESOURCE_MEM;
+       vctrl_res->start = acpi_data.vctrl_base;
+       vctrl_res->end = vctrl_res->start + ACPI_GICV2_VCTRL_MEM_SIZE - 1;
+
+       if (!acpi_data.vcpu_base)
+               return;
+
+       vcpu_res->flags = IORESOURCE_MEM;
+       vcpu_res->start = acpi_data.vcpu_base;
+       vcpu_res->end = vcpu_res->start + ACPI_GICV2_VCPU_MEM_SIZE - 1;
+
+       irq = acpi_register_gsi(NULL, acpi_data.maint_irq,
+                               acpi_data.maint_irq_mode,
+                               ACPI_ACTIVE_HIGH);
+       if (irq <= 0)
+               return;
+
+       gic_v2_kvm_info.maint_irq = irq;
+
+       gic_set_kvm_info(&gic_v2_kvm_info);
+ }
  
  static int __init gic_v2_acpi_init(struct acpi_subtable_header *header,
                                   const unsigned long end)
  {
        struct acpi_madt_generic_distributor *dist;
 -      void __iomem *cpu_base, *dist_base;
        struct fwnode_handle *domain_handle;
 -      int count;
 +      struct gic_chip_data *gic = &gic_data[0];
 +      int count, ret;
  
        /* Collect CPU base addresses */
        count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT,
                return -EINVAL;
        }
  
-       gic->raw_cpu_base = ioremap(cpu_phy_base, ACPI_GIC_CPU_IF_MEM_SIZE);
 -      cpu_base = ioremap(acpi_data.cpu_phys_base, ACPI_GIC_CPU_IF_MEM_SIZE);
 -      if (!cpu_base) {
++      gic->raw_cpu_base = ioremap(acpi_data.cpu_phys_base, ACPI_GIC_CPU_IF_MEM_SIZE);
 +      if (!gic->raw_cpu_base) {
                pr_err("Unable to map GICC registers\n");
                return -ENOMEM;
        }
  
        dist = (struct acpi_madt_generic_distributor *)header;
 -      dist_base = ioremap(dist->base_address, ACPI_GICV2_DIST_MEM_SIZE);
 -      if (!dist_base) {
 +      gic->raw_dist_base = ioremap(dist->base_address,
 +                                   ACPI_GICV2_DIST_MEM_SIZE);
 +      if (!gic->raw_dist_base) {
                pr_err("Unable to map GICD registers\n");
 -              iounmap(cpu_base);
 +              gic_teardown(gic);
                return -ENOMEM;
        }
  
        /*
         * Initialize GIC instance zero (no multi-GIC support).
         */
 -      domain_handle = irq_domain_alloc_fwnode(dist_base);
 +      domain_handle = irq_domain_alloc_fwnode(gic->raw_dist_base);
        if (!domain_handle) {
                pr_err("Unable to allocate domain handle\n");
 -              iounmap(cpu_base);
 -              iounmap(dist_base);
 +              gic_teardown(gic);
                return -ENOMEM;
        }
  
 -      __gic_init_bases(0, -1, dist_base, cpu_base, 0, domain_handle);
 +      ret = __gic_init_bases(gic, -1, domain_handle);
 +      if (ret) {
 +              pr_err("Failed to initialise GIC\n");
 +              irq_domain_free_fwnode(domain_handle);
 +              gic_teardown(gic);
 +              return ret;
 +      }
  
        acpi_set_irq_model(ACPI_IRQ_MODEL_GIC, domain_handle);
  
        if (IS_ENABLED(CONFIG_ARM_GIC_V2M))
                gicv2m_init(NULL, gic_data[0].domain);
  
+       gic_acpi_setup_kvm_info();
+
        return 0;
  }
  IRQCHIP_ACPI_DECLARE(gic_v2, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR,