struct kvm_vcpu_stat {
u32 halt_successful_poll;
u32 halt_attempted_poll;
+ u32 halt_poll_invalid;
u32 halt_wakeup;
u32 hvc_exit_stat;
u64 wfe_exit_stat;
kvm_call_hyp(__init_stage2_translation);
}
+static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr,
+ phys_addr_t phys_idmap_start)
+{
+ /*
+ * TODO
+ * kvm_call_reset(boot_pgd_ptr, phys_idmap_start);
+ */
+}
+
static inline int kvm_arch_dev_ioctl_check_extension(long ext)
{
return 0;
struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
-static inline void kvm_arch_hardware_disable(void) {}
static inline void kvm_arch_hardware_unsetup(void) {}
static inline void kvm_arch_sync_events(struct kvm *kvm) {}
static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
+ static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
static inline void kvm_arm_init_debug(void) {}
static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {}
#include <linux/highmem.h>
#include <asm/cacheflush.h>
#include <asm/pgalloc.h>
+ #include <asm/stage2_pgtable.h>
int create_hyp_mappings(void *from, void *to);
int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
phys_addr_t kvm_mmu_get_httbr(void);
phys_addr_t kvm_mmu_get_boot_httbr(void);
phys_addr_t kvm_get_idmap_vector(void);
+phys_addr_t kvm_get_idmap_start(void);
int kvm_mmu_init(void);
void kvm_clear_hyp_idmap(void);
clean_pte_table(pte);
}
- static inline void kvm_set_s2pte_writable(pte_t *pte)
+ static inline pte_t kvm_s2pte_mkwrite(pte_t pte)
{
- pte_val(*pte) |= L_PTE_S2_RDWR;
+ pte_val(pte) |= L_PTE_S2_RDWR;
+ return pte;
}
- static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
+ static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd)
{
- pmd_val(*pmd) |= L_PMD_S2_RDWR;
+ pmd_val(pmd) |= L_PMD_S2_RDWR;
+ return pmd;
}
static inline void kvm_set_s2pte_readonly(pte_t *pte)
return (pmd_val(*pmd) & L_PMD_S2_RDWR) == L_PMD_S2_RDONLY;
}
-
- /* Open coded p*d_addr_end that can deal with 64bit addresses */
- #define kvm_pgd_addr_end(addr, end) \
- ({ u64 __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \
- (__boundary - 1 < (end) - 1)? __boundary: (end); \
- })
-
- #define kvm_pud_addr_end(addr,end) (end)
-
- #define kvm_pmd_addr_end(addr, end) \
- ({ u64 __boundary = ((addr) + PMD_SIZE) & PMD_MASK; \
- (__boundary - 1 < (end) - 1)? __boundary: (end); \
- })
-
- #define kvm_pgd_index(addr) pgd_index(addr)
-
static inline bool kvm_page_empty(void *ptr)
{
struct page *ptr_page = virt_to_page(ptr);
#define kvm_pte_table_empty(kvm, ptep) kvm_page_empty(ptep)
#define kvm_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp)
- #define kvm_pud_table_empty(kvm, pudp) (0)
-
- #define KVM_PREALLOC_LEVEL 0
+ #define kvm_pud_table_empty(kvm, pudp) false
- static inline void *kvm_get_hwpgd(struct kvm *kvm)
- {
- return kvm->arch.pgd;
- }
-
- static inline unsigned int kvm_get_hwpgd_size(void)
- {
- return PTRS_PER_S2_PGD * sizeof(pgd_t);
- }
+ #define hyp_pte_table_empty(ptep) kvm_page_empty(ptep)
+ #define hyp_pmd_table_empty(pmdp) kvm_page_empty(pmdp)
+ #define hyp_pud_table_empty(pudp) false
struct kvm;
* Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
-#include <linux/cpu.h>
#include <linux/cpu_pm.h>
#include <linux/errno.h>
#include <linux/err.h>
static bool vgic_present;
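+/* Per-CPU flag: non-zero once HYP/EL2 has been initialised on this CPU. */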
+static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
+
static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
{
BUG_ON(preemptible());
return &kvm_arm_running_vcpu;
}
-int kvm_arch_hardware_enable(void)
-{
- return 0;
-}
-
int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
{
return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
kvm_next_vmid &= (1 << kvm_vmid_bits) - 1;
/* update vttbr to be used with the new vmid */
- pgd_phys = virt_to_phys(kvm_get_hwpgd(kvm));
+ pgd_phys = virt_to_phys(kvm->arch.pgd);
BUG_ON(pgd_phys & ~VTTBR_BADDR_MASK);
vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits);
kvm->arch.vttbr = pgd_phys | vmid;
}
}
-static void cpu_init_stage2(void *dummy)
-{
- __cpu_init_stage2();
-}
-
static void cpu_init_hyp_mode(void *dummy)
{
phys_addr_t boot_pgd_ptr;
{
if (is_kernel_in_hyp_mode()) {
/*
- * cpu_init_stage2() is safe to call even if the PM
+ * __cpu_init_stage2() is safe to call even if the PM
* event was cancelled before the CPU was reset.
*/
- cpu_init_stage2(NULL);
+ __cpu_init_stage2();
} else {
if (__hyp_get_vectors() == hyp_default_vectors)
cpu_init_hyp_mode(NULL);
}
}
-static int hyp_init_cpu_notify(struct notifier_block *self,
- unsigned long action, void *cpu)
+static void cpu_hyp_reset(void)
{
- switch (action) {
- case CPU_STARTING:
- case CPU_STARTING_FROZEN:
+ phys_addr_t boot_pgd_ptr;
+ phys_addr_t phys_idmap_start;
+
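+ /* When the kernel itself runs at EL2 (VHE), there is nothing to reset. */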
+ if (!is_kernel_in_hyp_mode()) {
+ boot_pgd_ptr = kvm_mmu_get_boot_httbr();
+ phys_idmap_start = kvm_get_idmap_start();
+
+ __cpu_reset_hyp_mode(boot_pgd_ptr, phys_idmap_start);
+ }
+}
+
+static void _kvm_arch_hardware_enable(void *discard)
+{
+ if (!__this_cpu_read(kvm_arm_hardware_enabled)) {
cpu_hyp_reinit();
+ __this_cpu_write(kvm_arm_hardware_enabled, 1);
}
+}
- return NOTIFY_OK;
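+/*
+ * Arch hooks called by the KVM core (e.g. on CPU hotplug); they simply
+ * delegate to the per-CPU helpers above.
+ */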
+int kvm_arch_hardware_enable(void)
+{
+ _kvm_arch_hardware_enable(NULL);
+ return 0;
}
-static struct notifier_block hyp_init_cpu_nb = {
- .notifier_call = hyp_init_cpu_notify,
-};
+static void _kvm_arch_hardware_disable(void *discard)
+{
+ if (__this_cpu_read(kvm_arm_hardware_enabled)) {
+ cpu_hyp_reset();
+ __this_cpu_write(kvm_arm_hardware_enabled, 0);
+ }
+}
+
+void kvm_arch_hardware_disable(void)
+{
+ _kvm_arch_hardware_disable(NULL);
+}
#ifdef CONFIG_CPU_PM
static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
unsigned long cmd,
void *v)
{
- if (cmd == CPU_PM_EXIT) {
- cpu_hyp_reinit();
+ /*
+ * kvm_arm_hardware_enabled is left with its old value over
+ * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should
+ * re-enable hyp.
+ */
+ switch (cmd) {
+ case CPU_PM_ENTER:
+ if (__this_cpu_read(kvm_arm_hardware_enabled))
+ /*
+ * don't update kvm_arm_hardware_enabled here
+ * so that the hardware will be re-enabled
+ * when we resume. See below.
+ */
+ cpu_hyp_reset();
+
return NOTIFY_OK;
- }
+ case CPU_PM_EXIT:
+ if (__this_cpu_read(kvm_arm_hardware_enabled))
+ /* The hardware was enabled before suspend. */
+ cpu_hyp_reinit();
- return NOTIFY_DONE;
+ return NOTIFY_OK;
+
+ default:
+ return NOTIFY_DONE;
+ }
}
static struct notifier_block hyp_init_cpu_pm_nb = {
static int init_subsystems(void)
{
- int err;
+ int err = 0;
/*
- * Register CPU Hotplug notifier
+ * Enable hardware so that subsystem initialisation can access EL2.
*/
- err = register_cpu_notifier(&hyp_init_cpu_nb);
- if (err) {
- kvm_err("Cannot register KVM init CPU notifier (%d)\n", err);
- return err;
- }
+ on_each_cpu(_kvm_arch_hardware_enable, NULL, 1);
/*
* Register CPU lower-power notifier
case -ENODEV:
case -ENXIO:
vgic_present = false;
+ err = 0;
break;
default:
- return err;
+ goto out;
}
/*
*/
err = kvm_timer_hyp_init();
if (err)
- return err;
+ goto out;
kvm_perf_init();
kvm_coproc_table_init();
- return 0;
+out:
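+ /* Disable the hardware again; kvm_arch_hardware_enable() re-enables it when needed. */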
+ on_each_cpu(_kvm_arch_hardware_disable, NULL, 1);
+
+ return err;
}
static void teardown_hyp_mode(void)
free_hyp_pgds();
for_each_possible_cpu(cpu)
free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
- unregister_cpu_notifier(&hyp_init_cpu_nb);
hyp_cpu_pm_exit();
}
static int init_vhe_mode(void)
{
- /*
- * Execute the init code on each CPU.
- */
- on_each_cpu(cpu_init_stage2, NULL, 1);
-
/* set size of VMID supported by CPU */
kvm_vmid_bits = kvm_get_vmid_bits();
kvm_info("%d-bit VMID\n", kvm_vmid_bits);
}
}
- /*
- * Execute the init code on each CPU.
- */
- on_each_cpu(cpu_init_hyp_mode, NULL, 1);
-
#ifndef CONFIG_HOTPLUG_CPU
free_boot_hyp_pgd();
#endif
static unsigned long hyp_idmap_end;
static phys_addr_t hyp_idmap_vector;
+ #define S2_PGD_SIZE (PTRS_PER_S2_PGD * sizeof(pgd_t))
#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
- #define kvm_pmd_huge(_x) (pmd_huge(_x) || pmd_trans_huge(_x))
- #define kvm_pud_huge(_x) pud_huge(_x)
-
#define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0)
#define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1)
static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
{
- /*
- * This function also gets called when dealing with HYP page
- * tables. As HYP doesn't have an associated struct kvm (and
- * the HYP page tables are fairly static), we don't do
- * anything there.
- */
- if (kvm)
- kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
}
/*
*/
static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
{
- if (!kvm_pmd_huge(*pmd))
+ if (!pmd_thp_or_huge(*pmd))
return;
pmd_clear(pmd);
return p;
}
- static void clear_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
+ static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
{
- pud_t *pud_table __maybe_unused = pud_offset(pgd, 0);
- pgd_clear(pgd);
+ pud_t *pud_table __maybe_unused = stage2_pud_offset(pgd, 0UL);
+ stage2_pgd_clear(pgd);
kvm_tlb_flush_vmid_ipa(kvm, addr);
- pud_free(NULL, pud_table);
+ stage2_pud_free(pud_table);
put_page(virt_to_page(pgd));
}
- static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
+ static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
{
- pmd_t *pmd_table = pmd_offset(pud, 0);
- VM_BUG_ON(pud_huge(*pud));
- pud_clear(pud);
+ pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(pud, 0);
+ VM_BUG_ON(stage2_pud_huge(*pud));
+ stage2_pud_clear(pud);
kvm_tlb_flush_vmid_ipa(kvm, addr);
- pmd_free(NULL, pmd_table);
+ stage2_pmd_free(pmd_table);
put_page(virt_to_page(pud));
}
- static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
+ static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
{
pte_t *pte_table = pte_offset_kernel(pmd, 0);
- VM_BUG_ON(kvm_pmd_huge(*pmd));
+ VM_BUG_ON(pmd_thp_or_huge(*pmd));
pmd_clear(pmd);
kvm_tlb_flush_vmid_ipa(kvm, addr);
pte_free_kernel(NULL, pte_table);
* the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
* the IO subsystem will never hit in the cache.
*/
- static void unmap_ptes(struct kvm *kvm, pmd_t *pmd,
+ static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd,
phys_addr_t addr, phys_addr_t end)
{
phys_addr_t start_addr = addr;
}
} while (pte++, addr += PAGE_SIZE, addr != end);
- if (kvm_pte_table_empty(kvm, start_pte))
- clear_pmd_entry(kvm, pmd, start_addr);
+ if (stage2_pte_table_empty(start_pte))
+ clear_stage2_pmd_entry(kvm, pmd, start_addr);
}
- static void unmap_pmds(struct kvm *kvm, pud_t *pud,
+ static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud,
phys_addr_t addr, phys_addr_t end)
{
phys_addr_t next, start_addr = addr;
pmd_t *pmd, *start_pmd;
- start_pmd = pmd = pmd_offset(pud, addr);
+ start_pmd = pmd = stage2_pmd_offset(pud, addr);
do {
- next = kvm_pmd_addr_end(addr, end);
+ next = stage2_pmd_addr_end(addr, end);
if (!pmd_none(*pmd)) {
- if (kvm_pmd_huge(*pmd)) {
+ if (pmd_thp_or_huge(*pmd)) {
pmd_t old_pmd = *pmd;
pmd_clear(pmd);
put_page(virt_to_page(pmd));
} else {
- unmap_ptes(kvm, pmd, addr, next);
+ unmap_stage2_ptes(kvm, pmd, addr, next);
}
}
} while (pmd++, addr = next, addr != end);
- if (kvm_pmd_table_empty(kvm, start_pmd))
- clear_pud_entry(kvm, pud, start_addr);
+ if (stage2_pmd_table_empty(start_pmd))
+ clear_stage2_pud_entry(kvm, pud, start_addr);
}
- static void unmap_puds(struct kvm *kvm, pgd_t *pgd,
+ static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd,
phys_addr_t addr, phys_addr_t end)
{
phys_addr_t next, start_addr = addr;
pud_t *pud, *start_pud;
- start_pud = pud = pud_offset(pgd, addr);
+ start_pud = pud = stage2_pud_offset(pgd, addr);
do {
- next = kvm_pud_addr_end(addr, end);
- if (!pud_none(*pud)) {
- if (pud_huge(*pud)) {
+ next = stage2_pud_addr_end(addr, end);
+ if (!stage2_pud_none(*pud)) {
+ if (stage2_pud_huge(*pud)) {
pud_t old_pud = *pud;
- pud_clear(pud);
+ stage2_pud_clear(pud);
kvm_tlb_flush_vmid_ipa(kvm, addr);
-
kvm_flush_dcache_pud(old_pud);
-
put_page(virt_to_page(pud));
} else {
- unmap_pmds(kvm, pud, addr, next);
+ unmap_stage2_pmds(kvm, pud, addr, next);
}
}
} while (pud++, addr = next, addr != end);
- if (kvm_pud_table_empty(kvm, start_pud))
- clear_pgd_entry(kvm, pgd, start_addr);
+ if (stage2_pud_table_empty(start_pud))
+ clear_stage2_pgd_entry(kvm, pgd, start_addr);
}
-
- static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
- phys_addr_t start, u64 size)
+ /**
+ * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
+ * @kvm: The VM pointer
+ * @start: The intermediate physical base address of the range to unmap
+ * @size: The size of the area to unmap
+ *
+ * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
+ * be called while holding mmu_lock (unless for freeing the stage2 pgd before
+ * destroying the VM), otherwise another faulting VCPU may come in and mess
+ * with things behind our backs.
+ */
+ static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
{
pgd_t *pgd;
phys_addr_t addr = start, end = start + size;
phys_addr_t next;
- pgd = pgdp + kvm_pgd_index(addr);
+ pgd = kvm->arch.pgd + stage2_pgd_index(addr);
do {
- next = kvm_pgd_addr_end(addr, end);
- if (!pgd_none(*pgd))
- unmap_puds(kvm, pgd, addr, next);
+ next = stage2_pgd_addr_end(addr, end);
+ if (!stage2_pgd_none(*pgd))
+ unmap_stage2_puds(kvm, pgd, addr, next);
} while (pgd++, addr = next, addr != end);
}
pmd_t *pmd;
phys_addr_t next;
- pmd = pmd_offset(pud, addr);
+ pmd = stage2_pmd_offset(pud, addr);
do {
- next = kvm_pmd_addr_end(addr, end);
+ next = stage2_pmd_addr_end(addr, end);
if (!pmd_none(*pmd)) {
- if (kvm_pmd_huge(*pmd))
+ if (pmd_thp_or_huge(*pmd))
kvm_flush_dcache_pmd(*pmd);
else
stage2_flush_ptes(kvm, pmd, addr, next);
pud_t *pud;
phys_addr_t next;
- pud = pud_offset(pgd, addr);
+ pud = stage2_pud_offset(pgd, addr);
do {
- next = kvm_pud_addr_end(addr, end);
- if (!pud_none(*pud)) {
- if (pud_huge(*pud))
+ next = stage2_pud_addr_end(addr, end);
+ if (!stage2_pud_none(*pud)) {
+ if (stage2_pud_huge(*pud))
kvm_flush_dcache_pud(*pud);
else
stage2_flush_pmds(kvm, pud, addr, next);
phys_addr_t next;
pgd_t *pgd;
- pgd = kvm->arch.pgd + kvm_pgd_index(addr);
+ pgd = kvm->arch.pgd + stage2_pgd_index(addr);
do {
- next = kvm_pgd_addr_end(addr, end);
+ next = stage2_pgd_addr_end(addr, end);
stage2_flush_puds(kvm, pgd, addr, next);
} while (pgd++, addr = next, addr != end);
}
srcu_read_unlock(&kvm->srcu, idx);
}
+ static void clear_hyp_pgd_entry(pgd_t *pgd)
+ {
+ pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL);
+ pgd_clear(pgd);
+ pud_free(NULL, pud_table);
+ put_page(virt_to_page(pgd));
+ }
+
+ static void clear_hyp_pud_entry(pud_t *pud)
+ {
+ pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
+ VM_BUG_ON(pud_huge(*pud));
+ pud_clear(pud);
+ pmd_free(NULL, pmd_table);
+ put_page(virt_to_page(pud));
+ }
+
+ static void clear_hyp_pmd_entry(pmd_t *pmd)
+ {
+ pte_t *pte_table = pte_offset_kernel(pmd, 0);
+ VM_BUG_ON(pmd_thp_or_huge(*pmd));
+ pmd_clear(pmd);
+ pte_free_kernel(NULL, pte_table);
+ put_page(virt_to_page(pmd));
+ }
+
+ static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
+ {
+ pte_t *pte, *start_pte;
+
+ start_pte = pte = pte_offset_kernel(pmd, addr);
+ do {
+ if (!pte_none(*pte)) {
+ kvm_set_pte(pte, __pte(0));
+ put_page(virt_to_page(pte));
+ }
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+
+ if (hyp_pte_table_empty(start_pte))
+ clear_hyp_pmd_entry(pmd);
+ }
+
+ static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
+ {
+ phys_addr_t next;
+ pmd_t *pmd, *start_pmd;
+
+ start_pmd = pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ /* Hyp doesn't use huge pmds */
+ if (!pmd_none(*pmd))
+ unmap_hyp_ptes(pmd, addr, next);
+ } while (pmd++, addr = next, addr != end);
+
+ if (hyp_pmd_table_empty(start_pmd))
+ clear_hyp_pud_entry(pud);
+ }
+
+ static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
+ {
+ phys_addr_t next;
+ pud_t *pud, *start_pud;
+
+ start_pud = pud = pud_offset(pgd, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ /* Hyp doesn't use huge puds */
+ if (!pud_none(*pud))
+ unmap_hyp_pmds(pud, addr, next);
+ } while (pud++, addr = next, addr != end);
+
+ if (hyp_pud_table_empty(start_pud))
+ clear_hyp_pgd_entry(pgd);
+ }
+
+ static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
+ {
+ pgd_t *pgd;
+ phys_addr_t addr = start, end = start + size;
+ phys_addr_t next;
+
+ /*
+ * We don't unmap anything from HYP, except at hyp teardown.

+ * Hence, we don't have to invalidate the TLBs here.
+ */
+ pgd = pgdp + pgd_index(addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (!pgd_none(*pgd))
+ unmap_hyp_puds(pgd, addr, next);
+ } while (pgd++, addr = next, addr != end);
+ }
+
/**
* free_boot_hyp_pgd - free HYP boot page tables
*
mutex_lock(&kvm_hyp_pgd_mutex);
if (boot_hyp_pgd) {
- unmap_range(NULL, boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
- unmap_range(NULL, boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
+ unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
+ unmap_hyp_range(boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
boot_hyp_pgd = NULL;
}
if (hyp_pgd)
- unmap_range(NULL, hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
+ unmap_hyp_range(hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
mutex_unlock(&kvm_hyp_pgd_mutex);
}
if (hyp_pgd) {
for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
- unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
+ unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
- unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
+ unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
hyp_pgd = NULL;
__phys_to_pfn(phys_addr), PAGE_HYP_DEVICE);
}
- /* Free the HW pgd, one page at a time */
- static void kvm_free_hwpgd(void *hwpgd)
- {
- free_pages_exact(hwpgd, kvm_get_hwpgd_size());
- }
-
- /* Allocate the HW PGD, making sure that each page gets its own refcount */
- static void *kvm_alloc_hwpgd(void)
- {
- unsigned int size = kvm_get_hwpgd_size();
-
- return alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
- }
-
/**
* kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
* @kvm: The KVM struct pointer for the VM.
int kvm_alloc_stage2_pgd(struct kvm *kvm)
{
pgd_t *pgd;
- void *hwpgd;
if (kvm->arch.pgd != NULL) {
kvm_err("kvm_arch already initialized?\n");
return -EINVAL;
}
- hwpgd = kvm_alloc_hwpgd();
- if (!hwpgd)
+ /* Allocate the HW PGD, making sure that each page gets its own refcount */
+ pgd = alloc_pages_exact(S2_PGD_SIZE, GFP_KERNEL | __GFP_ZERO);
+ if (!pgd)
return -ENOMEM;
- /* When the kernel uses more levels of page tables than the
- * guest, we allocate a fake PGD and pre-populate it to point
- * to the next-level page table, which will be the real
- * initial page table pointed to by the VTTBR.
- *
- * When KVM_PREALLOC_LEVEL==2, we allocate a single page for
- * the PMD and the kernel will use folded pud.
- * When KVM_PREALLOC_LEVEL==1, we allocate 2 consecutive PUD
- * pages.
- */
- if (KVM_PREALLOC_LEVEL > 0) {
- int i;
-
- /*
- * Allocate fake pgd for the page table manipulation macros to
- * work. This is not used by the hardware and we have no
- * alignment requirement for this allocation.
- */
- pgd = kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t),
- GFP_KERNEL | __GFP_ZERO);
-
- if (!pgd) {
- kvm_free_hwpgd(hwpgd);
- return -ENOMEM;
- }
-
- /* Plug the HW PGD into the fake one. */
- for (i = 0; i < PTRS_PER_S2_PGD; i++) {
- if (KVM_PREALLOC_LEVEL == 1)
- pgd_populate(NULL, pgd + i,
- (pud_t *)hwpgd + i * PTRS_PER_PUD);
- else if (KVM_PREALLOC_LEVEL == 2)
- pud_populate(NULL, pud_offset(pgd, 0) + i,
- (pmd_t *)hwpgd + i * PTRS_PER_PMD);
- }
- } else {
- /*
- * Allocate actual first-level Stage-2 page table used by the
- * hardware for Stage-2 page table walks.
- */
- pgd = (pgd_t *)hwpgd;
- }
-
kvm_clean_pgd(pgd);
kvm->arch.pgd = pgd;
return 0;
}
- /**
- * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
- * @kvm: The VM pointer
- * @start: The intermediate physical base address of the range to unmap
- * @size: The size of the area to unmap
- *
- * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
- * be called while holding mmu_lock (unless for freeing the stage2 pgd before
- * destroying the VM), otherwise another faulting VCPU may come in and mess
- * with things behind our backs.
- */
- static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
- {
- unmap_range(kvm, kvm->arch.pgd, start, size);
- }
-
static void stage2_unmap_memslot(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
return;
unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE);
- kvm_free_hwpgd(kvm_get_hwpgd(kvm));
- if (KVM_PREALLOC_LEVEL > 0)
- kfree(kvm->arch.pgd);
-
+ /* Free the HW pgd, one page at a time */
+ free_pages_exact(kvm->arch.pgd, S2_PGD_SIZE);
kvm->arch.pgd = NULL;
}
pgd_t *pgd;
pud_t *pud;
- pgd = kvm->arch.pgd + kvm_pgd_index(addr);
- if (WARN_ON(pgd_none(*pgd))) {
+ pgd = kvm->arch.pgd + stage2_pgd_index(addr);
+ if (WARN_ON(stage2_pgd_none(*pgd))) {
if (!cache)
return NULL;
pud = mmu_memory_cache_alloc(cache);
- pgd_populate(NULL, pgd, pud);
+ stage2_pgd_populate(pgd, pud);
get_page(virt_to_page(pgd));
}
- return pud_offset(pgd, addr);
+ return stage2_pud_offset(pgd, addr);
}
static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
pmd_t *pmd;
pud = stage2_get_pud(kvm, cache, addr);
- if (pud_none(*pud)) {
+ if (stage2_pud_none(*pud)) {
if (!cache)
return NULL;
pmd = mmu_memory_cache_alloc(cache);
- pud_populate(NULL, pud, pmd);
+ stage2_pud_populate(pud, pmd);
get_page(virt_to_page(pud));
}
- return pmd_offset(pud, addr);
+ return stage2_pmd_offset(pud, addr);
}
static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd));
old_pmd = *pmd;
- kvm_set_pmd(pmd, *new_pmd);
- if (pmd_present(old_pmd))
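+ /*
+  * Break-before-make: clear the old entry and invalidate its TLB entry
+  * before installing the new one.
+  */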
+ if (pmd_present(old_pmd)) {
+ pmd_clear(pmd);
kvm_tlb_flush_vmid_ipa(kvm, addr);
- else
+ } else {
get_page(virt_to_page(pmd));
+ }
+
+ kvm_set_pmd(pmd, *new_pmd);
return 0;
}
/* Create 2nd stage page table mapping - Level 3 */
old_pte = *pte;
- kvm_set_pte(pte, *new_pte);
- if (pte_present(old_pte))
+ if (pte_present(old_pte)) {
+ kvm_set_pte(pte, __pte(0));
kvm_tlb_flush_vmid_ipa(kvm, addr);
- else
+ } else {
get_page(virt_to_page(pte));
+ }
+
+ kvm_set_pte(pte, *new_pte);
+ return 0;
+ }
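+ /*
+  * Software fallback for architectures that do not provide an atomic
+  * __ptep_test_and_clear_young() helper (e.g. 32-bit ARM).
+  */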
+ #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+ static int stage2_ptep_test_and_clear_young(pte_t *pte)
+ {
+ if (pte_young(*pte)) {
+ *pte = pte_mkold(*pte);
+ return 1;
+ }
return 0;
}
+ #else
+ static int stage2_ptep_test_and_clear_young(pte_t *pte)
+ {
+ return __ptep_test_and_clear_young(pte);
+ }
+ #endif
+
+ static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
+ {
+ return stage2_ptep_test_and_clear_young((pte_t *)pmd);
+ }
/**
* kvm_phys_addr_ioremap - map a device range to guest IPA
pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE);
if (writable)
- kvm_set_s2pte_writable(&pte);
+ pte = kvm_s2pte_mkwrite(pte);
ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES,
KVM_NR_MEM_OBJS);
kvm_pfn_t pfn = *pfnp;
gfn_t gfn = *ipap >> PAGE_SHIFT;
- if (PageTransCompound(pfn_to_page(pfn))) {
+ if (PageTransCompoundMap(pfn_to_page(pfn))) {
unsigned long mask;
/*
* The address we faulted on is backed by a transparent huge
pmd_t *pmd;
phys_addr_t next;
- pmd = pmd_offset(pud, addr);
+ pmd = stage2_pmd_offset(pud, addr);
do {
- next = kvm_pmd_addr_end(addr, end);
+ next = stage2_pmd_addr_end(addr, end);
if (!pmd_none(*pmd)) {
- if (kvm_pmd_huge(*pmd)) {
+ if (pmd_thp_or_huge(*pmd)) {
if (!kvm_s2pmd_readonly(pmd))
kvm_set_s2pmd_readonly(pmd);
} else {
pud_t *pud;
phys_addr_t next;
- pud = pud_offset(pgd, addr);
+ pud = stage2_pud_offset(pgd, addr);
do {
- next = kvm_pud_addr_end(addr, end);
- if (!pud_none(*pud)) {
+ next = stage2_pud_addr_end(addr, end);
+ if (!stage2_pud_none(*pud)) {
/* TODO:PUD not supported, revisit later if supported */
- BUG_ON(kvm_pud_huge(*pud));
+ BUG_ON(stage2_pud_huge(*pud));
stage2_wp_pmds(pud, addr, next);
}
} while (pud++, addr = next, addr != end);
pgd_t *pgd;
phys_addr_t next;
- pgd = kvm->arch.pgd + kvm_pgd_index(addr);
+ pgd = kvm->arch.pgd + stage2_pgd_index(addr);
do {
/*
* Release kvm_mmu_lock periodically if the memory region is
if (need_resched() || spin_needbreak(&kvm->mmu_lock))
cond_resched_lock(&kvm->mmu_lock);
- next = kvm_pgd_addr_end(addr, end);
- if (pgd_present(*pgd))
+ next = stage2_pgd_addr_end(addr, end);
+ if (stage2_pgd_present(*pgd))
stage2_wp_puds(pgd, addr, next);
} while (pgd++, addr = next, addr != end);
}
pmd_t new_pmd = pfn_pmd(pfn, mem_type);
new_pmd = pmd_mkhuge(new_pmd);
if (writable) {
- kvm_set_s2pmd_writable(&new_pmd);
+ new_pmd = kvm_s2pmd_mkwrite(new_pmd);
kvm_set_pfn_dirty(pfn);
}
coherent_cache_guest_page(vcpu, pfn, PMD_SIZE, fault_ipa_uncached);
pte_t new_pte = pfn_pte(pfn, mem_type);
if (writable) {
- kvm_set_s2pte_writable(&new_pte);
+ new_pte = kvm_s2pte_mkwrite(new_pte);
kvm_set_pfn_dirty(pfn);
mark_page_dirty(kvm, gfn);
}
* Resolve the access fault by making the page young again.
* Note that because the faulting entry is guaranteed not to be
* cached in the TLB, we don't need to invalidate anything.
+ * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
+ * so there is no need for atomic (pte|pmd)_mkyoung operations.
*/
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
if (!pmd || pmd_none(*pmd)) /* Nothing there */
goto out;
- if (kvm_pmd_huge(*pmd)) { /* THP, HugeTLB */
+ if (pmd_thp_or_huge(*pmd)) { /* THP, HugeTLB */
*pmd = pmd_mkyoung(*pmd);
pfn = pmd_pfn(*pmd);
pfn_valid = true;
if (!pmd || pmd_none(*pmd)) /* Nothing there */
return 0;
- if (kvm_pmd_huge(*pmd)) { /* THP, HugeTLB */
- if (pmd_young(*pmd)) {
- *pmd = pmd_mkold(*pmd);
- return 1;
- }
-
- return 0;
- }
+ if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */
+ return stage2_pmdp_test_and_clear_young(pmd);
pte = pte_offset_kernel(pmd, gpa);
if (pte_none(*pte))
return 0;
- if (pte_young(*pte)) {
- *pte = pte_mkold(*pte); /* Just a page... */
- return 1;
- }
-
- return 0;
+ return stage2_ptep_test_and_clear_young(pte);
}
static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
if (!pmd || pmd_none(*pmd)) /* Nothing there */
return 0;
- if (kvm_pmd_huge(*pmd)) /* THP, HugeTLB */
+ if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */
return pmd_young(*pmd);
pte = pte_offset_kernel(pmd, gpa);
return hyp_idmap_vector;
}
+phys_addr_t kvm_get_idmap_start(void)
+{
+ return hyp_idmap_start;
+}
+
int kvm_mmu_init(void)
{
int err;
#define HCR_INT_OVERRIDE (HCR_FMO | HCR_IMO)
#define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H)
-/* Hyp System Control Register (SCTLR_EL2) bits */
-#define SCTLR_EL2_EE (1 << 25)
-#define SCTLR_EL2_WXN (1 << 19)
-#define SCTLR_EL2_I (1 << 12)
-#define SCTLR_EL2_SA (1 << 3)
-#define SCTLR_EL2_C (1 << 2)
-#define SCTLR_EL2_A (1 << 1)
-#define SCTLR_EL2_M 1
-#define SCTLR_EL2_FLAGS (SCTLR_EL2_M | SCTLR_EL2_A | SCTLR_EL2_C | \
- SCTLR_EL2_SA | SCTLR_EL2_I)
-
/* TCR_EL2 register bits */
- #define TCR_EL2_RES1 ((1 << 31) | (1 << 23))
- #define TCR_EL2_TBI (1 << 20)
- #define TCR_EL2_PS (7 << 16)
- #define TCR_EL2_PS_40B (2 << 16)
- #define TCR_EL2_TG0 (1 << 14)
- #define TCR_EL2_SH0 (3 << 12)
- #define TCR_EL2_ORGN0 (3 << 10)
- #define TCR_EL2_IRGN0 (3 << 8)
- #define TCR_EL2_T0SZ 0x3f
- #define TCR_EL2_MASK (TCR_EL2_TG0 | TCR_EL2_SH0 | \
- TCR_EL2_ORGN0 | TCR_EL2_IRGN0 | TCR_EL2_T0SZ)
+ #define TCR_EL2_RES1 ((1 << 31) | (1 << 23))
+ #define TCR_EL2_TBI (1 << 20)
+ #define TCR_EL2_PS_SHIFT 16
+ #define TCR_EL2_PS_MASK (7 << TCR_EL2_PS_SHIFT)
+ #define TCR_EL2_PS_40B (2 << TCR_EL2_PS_SHIFT)
+ #define TCR_EL2_TG0_MASK TCR_TG0_MASK
+ #define TCR_EL2_SH0_MASK TCR_SH0_MASK
+ #define TCR_EL2_ORGN0_MASK TCR_ORGN0_MASK
+ #define TCR_EL2_IRGN0_MASK TCR_IRGN0_MASK
+ #define TCR_EL2_T0SZ_MASK 0x3f
+ #define TCR_EL2_MASK (TCR_EL2_TG0_MASK | TCR_EL2_SH0_MASK | \
+ TCR_EL2_ORGN0_MASK | TCR_EL2_IRGN0_MASK | TCR_EL2_T0SZ_MASK)
/* VTCR_EL2 register bits */
#define VTCR_EL2_RES1 (1 << 31)
- #define VTCR_EL2_PS_MASK (7 << 16)
- #define VTCR_EL2_TG0_MASK (1 << 14)
- #define VTCR_EL2_TG0_4K (0 << 14)
- #define VTCR_EL2_TG0_64K (1 << 14)
- #define VTCR_EL2_SH0_MASK (3 << 12)
- #define VTCR_EL2_SH0_INNER (3 << 12)
- #define VTCR_EL2_ORGN0_MASK (3 << 10)
- #define VTCR_EL2_ORGN0_WBWA (1 << 10)
- #define VTCR_EL2_IRGN0_MASK (3 << 8)
- #define VTCR_EL2_IRGN0_WBWA (1 << 8)
- #define VTCR_EL2_SL0_MASK (3 << 6)
- #define VTCR_EL2_SL0_LVL1 (1 << 6)
+ #define VTCR_EL2_HD (1 << 22)
+ #define VTCR_EL2_HA (1 << 21)
+ #define VTCR_EL2_PS_MASK TCR_EL2_PS_MASK
+ #define VTCR_EL2_TG0_MASK TCR_TG0_MASK
+ #define VTCR_EL2_TG0_4K TCR_TG0_4K
+ #define VTCR_EL2_TG0_16K TCR_TG0_16K
+ #define VTCR_EL2_TG0_64K TCR_TG0_64K
+ #define VTCR_EL2_SH0_MASK TCR_SH0_MASK
+ #define VTCR_EL2_SH0_INNER TCR_SH0_INNER
+ #define VTCR_EL2_ORGN0_MASK TCR_ORGN0_MASK
+ #define VTCR_EL2_ORGN0_WBWA TCR_ORGN0_WBWA
+ #define VTCR_EL2_IRGN0_MASK TCR_IRGN0_MASK
+ #define VTCR_EL2_IRGN0_WBWA TCR_IRGN0_WBWA
+ #define VTCR_EL2_SL0_SHIFT 6
+ #define VTCR_EL2_SL0_MASK (3 << VTCR_EL2_SL0_SHIFT)
+ #define VTCR_EL2_SL0_LVL1 (1 << VTCR_EL2_SL0_SHIFT)
#define VTCR_EL2_T0SZ_MASK 0x3f
#define VTCR_EL2_T0SZ_40B 24
#define VTCR_EL2_VS_SHIFT 19
* (see hyp-init.S).
*
* Note that when using 4K pages, we concatenate two first level page tables
- * together.
+ * together. With 16K pages, we concatenate 16 first level page tables.
*
* The magic numbers used for VTTBR_X in this patch can be found in Tables
* D4-23 and D4-25 in ARM DDI 0487A.b.
*/
+
+ #define VTCR_EL2_T0SZ_IPA VTCR_EL2_T0SZ_40B
+ #define VTCR_EL2_COMMON_BITS (VTCR_EL2_SH0_INNER | VTCR_EL2_ORGN0_WBWA | \
+ VTCR_EL2_IRGN0_WBWA | VTCR_EL2_RES1)
+
#ifdef CONFIG_ARM64_64K_PAGES
/*
* Stage2 translation configuration:
- * 40bits input (T0SZ = 24)
* 64kB pages (TG0 = 1)
* 2 level page tables (SL = 1)
*/
- #define VTCR_EL2_FLAGS (VTCR_EL2_TG0_64K | VTCR_EL2_SH0_INNER | \
- VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
- VTCR_EL2_SL0_LVL1 | VTCR_EL2_RES1)
- #define VTTBR_X (38 - VTCR_EL2_T0SZ_40B)
- #else
+ #define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_64K | VTCR_EL2_SL0_LVL1)
+ #define VTTBR_X_TGRAN_MAGIC 38
+ #elif defined(CONFIG_ARM64_16K_PAGES)
+ /*
+ * Stage2 translation configuration:
+ * 16kB pages (TG0 = 2)
+ * 2 level page tables (SL = 1)
+ */
+ #define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_16K | VTCR_EL2_SL0_LVL1)
+ #define VTTBR_X_TGRAN_MAGIC 42
+ #else /* 4K */
/*
* Stage2 translation configuration:
- * 40bits input (T0SZ = 24)
* 4kB pages (TG0 = 0)
* 3 level page tables (SL = 1)
*/
- #define VTCR_EL2_FLAGS (VTCR_EL2_TG0_4K | VTCR_EL2_SH0_INNER | \
- VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
- VTCR_EL2_SL0_LVL1 | VTCR_EL2_RES1)
- #define VTTBR_X (37 - VTCR_EL2_T0SZ_40B)
+ #define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_4K | VTCR_EL2_SL0_LVL1)
+ #define VTTBR_X_TGRAN_MAGIC 37
#endif
+ #define VTCR_EL2_FLAGS (VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN_FLAGS)
+ #define VTTBR_X (VTTBR_X_TGRAN_MAGIC - VTCR_EL2_T0SZ_IPA)
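+ /*
+  * For example, with 4K pages and a 40-bit IPA (T0SZ = 24):
+  * VTTBR_X = 37 - 24 = 13, matching the 8K footprint of the two
+  * concatenated level-1 tables used for the initial lookup.
+  */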
+
#define VTTBR_BADDR_SHIFT (VTTBR_X - 1)
#define VTTBR_BADDR_MASK (((UL(1) << (PHYS_MASK_SHIFT - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT)
#define VTTBR_VMID_SHIFT (UL(48))
int __attribute_const__ kvm_target_cpu(void);
int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
int kvm_arch_dev_ioctl_check_extension(long ext);
+unsigned long kvm_hyp_reset_entry(void);
+void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start);
struct kvm_arch {
/* The VMID generation used for the virt. memory system */
struct kvm_vcpu_stat {
u32 halt_successful_poll;
u32 halt_attempted_poll;
+ u32 halt_poll_invalid;
u32 halt_wakeup;
u32 hvc_exit_stat;
u64 wfe_exit_stat;
hyp_stack_ptr, vector_ptr);
}
-static inline void kvm_arch_hardware_disable(void) {}
+static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr,
+ phys_addr_t phys_idmap_start)
+{
+ /*
+ * Call reset code, and switch back to stub hyp vectors.
+ * Uses __kvm_call_hyp() to avoid kaslr's kvm_ksym_ref() translation.
+ */
+ __kvm_call_hyp((void *)kvm_hyp_reset_entry(),
+ boot_pgd_ptr, phys_idmap_start);
+}
+
static inline void kvm_arch_hardware_unsetup(void) {}
static inline void kvm_arch_sync_events(struct kvm *kvm) {}
static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
+ static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
void kvm_arm_init_debug(void);
void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
*/
#define TRAMPOLINE_VA (HYP_PAGE_OFFSET_MASK & PAGE_MASK)
- /*
- * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation
- * levels in addition to the PGD and potentially the PUD which are
- * pre-allocated (we pre-allocate the fake PGD and the PUD when the Stage-2
- * tables use one level of tables less than the kernel.
- */
- #ifdef CONFIG_ARM64_64K_PAGES
- #define KVM_MMU_CACHE_MIN_PAGES 1
- #else
- #define KVM_MMU_CACHE_MIN_PAGES 2
- #endif
-
#ifdef __ASSEMBLY__
#include <asm/alternative.h>
#define KVM_PHYS_SIZE (1UL << KVM_PHYS_SHIFT)
#define KVM_PHYS_MASK (KVM_PHYS_SIZE - 1UL)
+ #include <asm/stage2_pgtable.h>
+
int create_hyp_mappings(void *from, void *to);
int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
void free_boot_hyp_pgd(void);
phys_addr_t kvm_mmu_get_httbr(void);
phys_addr_t kvm_mmu_get_boot_httbr(void);
phys_addr_t kvm_get_idmap_vector(void);
+phys_addr_t kvm_get_idmap_start(void);
int kvm_mmu_init(void);
void kvm_clear_hyp_idmap(void);
static inline void kvm_clean_pte(pte_t *pte) {}
static inline void kvm_clean_pte_entry(pte_t *pte) {}
- static inline void kvm_set_s2pte_writable(pte_t *pte)
+ static inline pte_t kvm_s2pte_mkwrite(pte_t pte)
{
- pte_val(*pte) |= PTE_S2_RDWR;
+ pte_val(pte) |= PTE_S2_RDWR;
+ return pte;
}
- static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
+ static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd)
{
- pmd_val(*pmd) |= PMD_S2_RDWR;
+ pmd_val(pmd) |= PMD_S2_RDWR;
+ return pmd;
}
static inline void kvm_set_s2pte_readonly(pte_t *pte)
{
- pte_val(*pte) = (pte_val(*pte) & ~PTE_S2_RDWR) | PTE_S2_RDONLY;
+ pteval_t pteval;
+ unsigned long tmp;
+
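+ /*
+  * Use ldxr/stxr so the update is atomic: with VTCR_EL2_HA the CPU may
+  * set the stage-2 Access flag concurrently, and a non-atomic
+  * read-modify-write could lose that hardware update.
+  */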
+ asm volatile("// kvm_set_s2pte_readonly\n"
+ " prfm pstl1strm, %2\n"
+ "1: ldxr %0, %2\n"
+ " and %0, %0, %3 // clear PTE_S2_RDWR\n"
+ " orr %0, %0, %4 // set PTE_S2_RDONLY\n"
+ " stxr %w1, %0, %2\n"
+ " cbnz %w1, 1b\n"
+ : "=&r" (pteval), "=&r" (tmp), "+Q" (pte_val(*pte))
+ : "L" (~PTE_S2_RDWR), "L" (PTE_S2_RDONLY));
}
static inline bool kvm_s2pte_readonly(pte_t *pte)
static inline void kvm_set_s2pmd_readonly(pmd_t *pmd)
{
- pmd_val(*pmd) = (pmd_val(*pmd) & ~PMD_S2_RDWR) | PMD_S2_RDONLY;
+ kvm_set_s2pte_readonly((pte_t *)pmd);
}
static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
{
- return (pmd_val(*pmd) & PMD_S2_RDWR) == PMD_S2_RDONLY;
- }
-
-
- #define kvm_pgd_addr_end(addr, end) pgd_addr_end(addr, end)
- #define kvm_pud_addr_end(addr, end) pud_addr_end(addr, end)
- #define kvm_pmd_addr_end(addr, end) pmd_addr_end(addr, end)
-
- /*
- * In the case where PGDIR_SHIFT is larger than KVM_PHYS_SHIFT, we can address
- * the entire IPA input range with a single pgd entry, and we would only need
- * one pgd entry. Note that in this case, the pgd is actually not used by
- * the MMU for Stage-2 translations, but is merely a fake pgd used as a data
- * structure for the kernel pgtable macros to work.
- */
- #if PGDIR_SHIFT > KVM_PHYS_SHIFT
- #define PTRS_PER_S2_PGD_SHIFT 0
- #else
- #define PTRS_PER_S2_PGD_SHIFT (KVM_PHYS_SHIFT - PGDIR_SHIFT)
- #endif
- #define PTRS_PER_S2_PGD (1 << PTRS_PER_S2_PGD_SHIFT)
-
- #define kvm_pgd_index(addr) (((addr) >> PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1))
-
- /*
- * If we are concatenating first level stage-2 page tables, we would have less
- * than or equal to 16 pointers in the fake PGD, because that's what the
- * architecture allows. In this case, (4 - CONFIG_PGTABLE_LEVELS)
- * represents the first level for the host, and we add 1 to go to the next
- * level (which uses contatenation) for the stage-2 tables.
- */
- #if PTRS_PER_S2_PGD <= 16
- #define KVM_PREALLOC_LEVEL (4 - CONFIG_PGTABLE_LEVELS + 1)
- #else
- #define KVM_PREALLOC_LEVEL (0)
- #endif
-
- static inline void *kvm_get_hwpgd(struct kvm *kvm)
- {
- pgd_t *pgd = kvm->arch.pgd;
- pud_t *pud;
-
- if (KVM_PREALLOC_LEVEL == 0)
- return pgd;
-
- pud = pud_offset(pgd, 0);
- if (KVM_PREALLOC_LEVEL == 1)
- return pud;
-
- BUG_ON(KVM_PREALLOC_LEVEL != 2);
- return pmd_offset(pud, 0);
- }
-
- static inline unsigned int kvm_get_hwpgd_size(void)
- {
- if (KVM_PREALLOC_LEVEL > 0)
- return PTRS_PER_S2_PGD * PAGE_SIZE;
- return PTRS_PER_S2_PGD * sizeof(pgd_t);
+ return kvm_s2pte_readonly((pte_t *)pmd);
}
static inline bool kvm_page_empty(void *ptr)
return page_count(ptr_page) == 1;
}
- #define kvm_pte_table_empty(kvm, ptep) kvm_page_empty(ptep)
+ #define hyp_pte_table_empty(ptep) kvm_page_empty(ptep)
#ifdef __PAGETABLE_PMD_FOLDED
- #define kvm_pmd_table_empty(kvm, pmdp) (0)
+ #define hyp_pmd_table_empty(pmdp) (0)
#else
- #define kvm_pmd_table_empty(kvm, pmdp) \
- (kvm_page_empty(pmdp) && (!(kvm) || KVM_PREALLOC_LEVEL < 2))
+ #define hyp_pmd_table_empty(pmdp) kvm_page_empty(pmdp)
#endif
#ifdef __PAGETABLE_PUD_FOLDED
- #define kvm_pud_table_empty(kvm, pudp) (0)
+ #define hyp_pud_table_empty(pudp) (0)
#else
- #define kvm_pud_table_empty(kvm, pudp) \
- (kvm_page_empty(pudp) && (!(kvm) || KVM_PREALLOC_LEVEL < 1))
+ #define hyp_pud_table_empty(pudp) kvm_page_empty(pudp)
#endif
-
struct kvm;
#define kvm_flush_dcache_to_poc(a,l) __flush_dcache_area((a), (l))
* Section
*/
#define PMD_SECT_VALID (_AT(pmdval_t, 1) << 0)
-#define PMD_SECT_PROT_NONE (_AT(pmdval_t, 1) << 58)
#define PMD_SECT_USER (_AT(pmdval_t, 1) << 6) /* AP[1] */
#define PMD_SECT_RDONLY (_AT(pmdval_t, 1) << 7) /* AP[2] */
#define PMD_SECT_S (_AT(pmdval_t, 3) << 8)
#define TCR_T1SZ(x) ((UL(64) - (x)) << TCR_T1SZ_OFFSET)
#define TCR_TxSZ(x) (TCR_T0SZ(x) | TCR_T1SZ(x))
#define TCR_TxSZ_WIDTH 6
- #define TCR_IRGN_NC ((UL(0) << 8) | (UL(0) << 24))
- #define TCR_IRGN_WBWA ((UL(1) << 8) | (UL(1) << 24))
- #define TCR_IRGN_WT ((UL(2) << 8) | (UL(2) << 24))
- #define TCR_IRGN_WBnWA ((UL(3) << 8) | (UL(3) << 24))
- #define TCR_IRGN_MASK ((UL(3) << 8) | (UL(3) << 24))
- #define TCR_ORGN_NC ((UL(0) << 10) | (UL(0) << 26))
- #define TCR_ORGN_WBWA ((UL(1) << 10) | (UL(1) << 26))
- #define TCR_ORGN_WT ((UL(2) << 10) | (UL(2) << 26))
- #define TCR_ORGN_WBnWA ((UL(3) << 10) | (UL(3) << 26))
- #define TCR_ORGN_MASK ((UL(3) << 10) | (UL(3) << 26))
- #define TCR_SHARED ((UL(3) << 12) | (UL(3) << 28))
- #define TCR_TG0_4K (UL(0) << 14)
- #define TCR_TG0_64K (UL(1) << 14)
- #define TCR_TG0_16K (UL(2) << 14)
- #define TCR_TG1_16K (UL(1) << 30)
- #define TCR_TG1_4K (UL(2) << 30)
- #define TCR_TG1_64K (UL(3) << 30)
+
+ #define TCR_IRGN0_SHIFT 8
+ #define TCR_IRGN0_MASK (UL(3) << TCR_IRGN0_SHIFT)
+ #define TCR_IRGN0_NC (UL(0) << TCR_IRGN0_SHIFT)
+ #define TCR_IRGN0_WBWA (UL(1) << TCR_IRGN0_SHIFT)
+ #define TCR_IRGN0_WT (UL(2) << TCR_IRGN0_SHIFT)
+ #define TCR_IRGN0_WBnWA (UL(3) << TCR_IRGN0_SHIFT)
+
+ #define TCR_IRGN1_SHIFT 24
+ #define TCR_IRGN1_MASK (UL(3) << TCR_IRGN1_SHIFT)
+ #define TCR_IRGN1_NC (UL(0) << TCR_IRGN1_SHIFT)
+ #define TCR_IRGN1_WBWA (UL(1) << TCR_IRGN1_SHIFT)
+ #define TCR_IRGN1_WT (UL(2) << TCR_IRGN1_SHIFT)
+ #define TCR_IRGN1_WBnWA (UL(3) << TCR_IRGN1_SHIFT)
+
+ #define TCR_IRGN_NC (TCR_IRGN0_NC | TCR_IRGN1_NC)
+ #define TCR_IRGN_WBWA (TCR_IRGN0_WBWA | TCR_IRGN1_WBWA)
+ #define TCR_IRGN_WT (TCR_IRGN0_WT | TCR_IRGN1_WT)
+ #define TCR_IRGN_WBnWA (TCR_IRGN0_WBnWA | TCR_IRGN1_WBnWA)
+ #define TCR_IRGN_MASK (TCR_IRGN0_MASK | TCR_IRGN1_MASK)
+
+
+ #define TCR_ORGN0_SHIFT 10
+ #define TCR_ORGN0_MASK (UL(3) << TCR_ORGN0_SHIFT)
+ #define TCR_ORGN0_NC (UL(0) << TCR_ORGN0_SHIFT)
+ #define TCR_ORGN0_WBWA (UL(1) << TCR_ORGN0_SHIFT)
+ #define TCR_ORGN0_WT (UL(2) << TCR_ORGN0_SHIFT)
+ #define TCR_ORGN0_WBnWA (UL(3) << TCR_ORGN0_SHIFT)
+
+ #define TCR_ORGN1_SHIFT 26
+ #define TCR_ORGN1_MASK (UL(3) << TCR_ORGN1_SHIFT)
+ #define TCR_ORGN1_NC (UL(0) << TCR_ORGN1_SHIFT)
+ #define TCR_ORGN1_WBWA (UL(1) << TCR_ORGN1_SHIFT)
+ #define TCR_ORGN1_WT (UL(2) << TCR_ORGN1_SHIFT)
+ #define TCR_ORGN1_WBnWA (UL(3) << TCR_ORGN1_SHIFT)
+
+ #define TCR_ORGN_NC (TCR_ORGN0_NC | TCR_ORGN1_NC)
+ #define TCR_ORGN_WBWA (TCR_ORGN0_WBWA | TCR_ORGN1_WBWA)
+ #define TCR_ORGN_WT (TCR_ORGN0_WT | TCR_ORGN1_WT)
+ #define TCR_ORGN_WBnWA (TCR_ORGN0_WBnWA | TCR_ORGN1_WBnWA)
+ #define TCR_ORGN_MASK (TCR_ORGN0_MASK | TCR_ORGN1_MASK)
+
+ #define TCR_SH0_SHIFT 12
+ #define TCR_SH0_MASK (UL(3) << TCR_SH0_SHIFT)
+ #define TCR_SH0_INNER (UL(3) << TCR_SH0_SHIFT)
+
+ #define TCR_SH1_SHIFT 28
+ #define TCR_SH1_MASK (UL(3) << TCR_SH1_SHIFT)
+ #define TCR_SH1_INNER (UL(3) << TCR_SH1_SHIFT)
+ #define TCR_SHARED (TCR_SH0_INNER | TCR_SH1_INNER)
+
+ #define TCR_TG0_SHIFT 14
+ #define TCR_TG0_MASK (UL(3) << TCR_TG0_SHIFT)
+ #define TCR_TG0_4K (UL(0) << TCR_TG0_SHIFT)
+ #define TCR_TG0_64K (UL(1) << TCR_TG0_SHIFT)
+ #define TCR_TG0_16K (UL(2) << TCR_TG0_SHIFT)
+
+ #define TCR_TG1_SHIFT 30
+ #define TCR_TG1_MASK (UL(3) << TCR_TG1_SHIFT)
+ #define TCR_TG1_16K (UL(1) << TCR_TG1_SHIFT)
+ #define TCR_TG1_4K (UL(2) << TCR_TG1_SHIFT)
+ #define TCR_TG1_64K (UL(3) << TCR_TG1_SHIFT)
+
#define TCR_ASID16 (UL(1) << 36)
#define TCR_TBI0 (UL(1) << 37)
#define TCR_HA (UL(1) << 39)
#include <asm/pgtable-prot.h>
/*
- * VMALLOC and SPARSEMEM_VMEMMAP ranges.
+ * VMALLOC range.
*
- * VMEMAP_SIZE: allows the whole linear region to be covered by a struct page array
- * (rounded up to PUD_SIZE).
* VMALLOC_START: beginning of the kernel vmalloc space
- * VMALLOC_END: extends to the available space below vmmemmap, PCI I/O space,
- * fixed mappings and modules
+ * VMALLOC_END: extends to the available space below vmemmap, PCI I/O space
+ * and fixed mappings
*/
-#define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT)) * sizeof(struct page), PUD_SIZE)
-
#define VMALLOC_START (MODULES_END)
#define VMALLOC_END (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K)
-#define VMEMMAP_START (VMALLOC_END + SZ_64K)
-#define vmemmap ((struct page *)VMEMMAP_START - \
- SECTION_ALIGN_DOWN(memstart_addr >> PAGE_SHIFT))
+#define vmemmap ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT))
#define FIRST_USER_ADDRESS 0UL
* for zero-mapped memory areas etc..
*/
extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
-#define ZERO_PAGE(vaddr) virt_to_page(empty_zero_page)
+#define ZERO_PAGE(vaddr) pfn_to_page(PHYS_PFN(__pa(empty_zero_page)))
#define pte_ERROR(pte) __pte_error(__FILE__, __LINE__, pte_val(pte))
return __pgprot(pgprot_val(prot) & ~PTE_TABLE_BIT);
}
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * See the comment in include/asm-generic/pgtable.h
+ */
+static inline int pte_protnone(pte_t pte)
+{
+ return (pte_val(pte) & (PTE_VALID | PTE_PROT_NONE)) == PTE_PROT_NONE;
+}
+
+static inline int pmd_protnone(pmd_t pmd)
+{
+ return pte_protnone(pmd_pte(pmd));
+}
+#endif
+
/*
* THP definitions.
*/
#define pmd_trans_huge(pmd) (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#define pmd_present(pmd) pte_present(pmd_pte(pmd))
#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
#define pmd_young(pmd) pte_young(pmd_pte(pmd))
#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd)))
#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd)))
-#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
+#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
-#define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_TYPE_MASK))
+#define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_SECT_VALID))
+ #define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd))
+
#define __HAVE_ARCH_PMD_WRITE
#define pmd_write(pmd) pte_write(pmd_pte(pmd))
unsigned long size, pgprot_t vma_prot);
#define pmd_none(pmd) (!pmd_val(pmd))
-#define pmd_present(pmd) (pmd_val(pmd))
-#define pmd_bad(pmd) (!(pmd_val(pmd) & 2))
+#define pmd_bad(pmd) (!(pmd_val(pmd) & PMD_TABLE_BIT))
#define pmd_table(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \
PMD_TYPE_TABLE)
#define pmd_ERROR(pmd) __pmd_error(__FILE__, __LINE__, pmd_val(pmd))
#define pud_none(pud) (!pud_val(pud))
-#define pud_bad(pud) (!(pud_val(pud) & 2))
+#define pud_bad(pud) (!(pud_val(pud) & PUD_TABLE_BIT))
#define pud_present(pud) (pud_val(pud))
static inline void set_pud(pud_t *pudp, pud_t pud)
}
#ifdef CONFIG_ARM64_HW_AFDBM
+#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+extern int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty);
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
+static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp,
+ pmd_t entry, int dirty)
+{
+ return ptep_set_access_flags(vma, address, (pte_t *)pmdp, pmd_pte(entry), dirty);
+}
+#endif
+
/*
* Atomic pte/pmd modifications.
*/
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
- static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
- unsigned long address,
- pte_t *ptep)
+ static inline int __ptep_test_and_clear_young(pte_t *ptep)
{
pteval_t pteval;
unsigned int tmp, res;
- asm volatile("// ptep_test_and_clear_young\n"
+ asm volatile("// __ptep_test_and_clear_young\n"
" prfm pstl1strm, %2\n"
"1: ldxr %0, %2\n"
" ubfx %w3, %w0, %5, #1 // extract PTE_AF (young)\n"
return res;
}
+ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address,
+ pte_t *ptep)
+ {
+ return __ptep_test_and_clear_young(ptep);
+ }
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
- unsigned long address, pmd_t *pmdp)
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long address, pmd_t *pmdp)
{
return pte_pmd(ptep_get_and_clear(mm, address, (pte_t *)pmdp));
}
u32 flush_dcache_exits;
u32 halt_successful_poll;
u32 halt_attempted_poll;
+ u32 halt_poll_invalid;
u32 halt_wakeup;
};
#define MIPS3_PG_FRAME 0x3fffffc0
#define VPN2_MASK 0xffffe000
+#define KVM_ENTRYHI_ASID MIPS_ENTRYHI_ASID
#define TLB_IS_GLOBAL(x) (((x).tlb_lo0 & MIPS3_PG_G) && \
((x).tlb_lo1 & MIPS3_PG_G))
#define TLB_VPN2(x) ((x).tlb_hi & VPN2_MASK)
-#define TLB_ASID(x) ((x).tlb_hi & ASID_MASK)
+#define TLB_ASID(x) ((x).tlb_hi & KVM_ENTRYHI_ASID)
#define TLB_IS_VALID(x, va) (((va) & (1 << PAGE_SHIFT)) \
? ((x).tlb_lo1 & MIPS3_PG_V) \
: ((x).tlb_lo0 & MIPS3_PG_V))
#define TLB_HI_VPN2_HIT(x, y) ((TLB_VPN2(x) & ~(x).tlb_mask) == \
((y) & VPN2_MASK & ~(x).tlb_mask))
#define TLB_HI_ASID_HIT(x, y) (TLB_IS_GLOBAL(x) || \
- TLB_ASID(x) == ((y) & ASID_MASK))
+ TLB_ASID(x) == ((y) & KVM_ENTRYHI_ASID))
struct kvm_mips_tlb {
long tlb_mask;
uint32_t kvm_mips_read_count(struct kvm_vcpu *vcpu);
void kvm_mips_write_count(struct kvm_vcpu *vcpu, uint32_t count);
- void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare);
+ void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare, bool ack);
void kvm_mips_init_count(struct kvm_vcpu *vcpu);
int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl);
int kvm_mips_set_count_resume(struct kvm_vcpu *vcpu, s64 count_resume);
static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
+ static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
#endif /* __MIPS_KVM_HOST_H__ */
*/
static uint32_t kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now)
{
- ktime_t expires;
+ struct mips_coproc *cop0 = vcpu->arch.cop0;
+ ktime_t expires, threshold;
+ uint32_t count, compare;
int running;
- /* Is the hrtimer pending? */
+ /* Calculate the biased and scaled guest CP0_Count */
+ count = vcpu->arch.count_bias + kvm_mips_ktime_to_count(vcpu, now);
+ compare = kvm_read_c0_guest_compare(cop0);
+
+ /*
+ * Find whether CP0_Count has reached the closest timer interrupt. If
+ * not, we shouldn't inject it.
+ */
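+ /* The signed 32-bit difference keeps this comparison wrap-safe. */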
+ if ((int32_t)(count - compare) < 0)
+ return count;
+
+ /*
+ * The CP0_Count we're going to return has already reached the closest
+ * timer interrupt. Quickly check if it really is a new interrupt by
+ * looking at whether the interval until the hrtimer expiry time is
+ * less than 1/4 of the timer period.
+ */
expires = hrtimer_get_expires(&vcpu->arch.comparecount_timer);
- if (ktime_compare(now, expires) >= 0) {
+ threshold = ktime_add_ns(now, vcpu->arch.count_period / 4);
+ if (ktime_before(expires, threshold)) {
/*
* Cancel it while we handle it so there's no chance of
* interference with the timeout handler.
}
}
- /* Return the biased and scaled guest CP0_Count */
- return vcpu->arch.count_bias + kvm_mips_ktime_to_count(vcpu, now);
+ return count;
}
/**
hrtimer_start(&vcpu->arch.comparecount_timer, expire, HRTIMER_MODE_ABS);
}
- /**
- * kvm_mips_update_hrtimer() - Update next expiry time of hrtimer.
- * @vcpu: Virtual CPU.
- *
- * Recalculates and updates the expiry time of the hrtimer. This can be used
- * after timer parameters have been altered which do not depend on the time that
- * the change occurs (in those cases kvm_mips_freeze_hrtimer() and
- * kvm_mips_resume_hrtimer() are used directly).
- *
- * It is guaranteed that no timer interrupts will be lost in the process.
- *
- * Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is running).
- */
- static void kvm_mips_update_hrtimer(struct kvm_vcpu *vcpu)
- {
- ktime_t now;
- uint32_t count;
-
- /*
- * freeze_hrtimer takes care of a timer interrupts <= count, and
- * resume_hrtimer the hrtimer takes care of a timer interrupts > count.
- */
- now = kvm_mips_freeze_hrtimer(vcpu, &count);
- kvm_mips_resume_hrtimer(vcpu, now, count);
- }
-
/**
* kvm_mips_write_count() - Modify the count and update timer.
* @vcpu: Virtual CPU.
* kvm_mips_write_compare() - Modify compare and update timer.
* @vcpu: Virtual CPU.
* @compare: New CP0_Compare value.
+ * @ack: Whether to acknowledge timer interrupt.
*
* Update CP0_Compare to a new value and update the timeout.
+ * If @ack, atomically acknowledge any pending timer interrupt, otherwise ensure
+ * any pending timer interrupt is preserved.
*/
- void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare)
+ void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare, bool ack)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
+ int dc;
+ u32 old_compare = kvm_read_c0_guest_compare(cop0);
+ ktime_t now;
+ uint32_t count;
/* if unchanged, must just be an ack */
- if (kvm_read_c0_guest_compare(cop0) == compare)
+ if (old_compare == compare) {
+ if (!ack)
+ return;
+ kvm_mips_callbacks->dequeue_timer_int(vcpu);
+ kvm_write_c0_guest_compare(cop0, compare);
return;
+ }
+
+ /* freeze_hrtimer() takes care of timer interrupts <= count */
+ dc = kvm_mips_count_disabled(vcpu);
+ if (!dc)
+ now = kvm_mips_freeze_hrtimer(vcpu, &count);
+
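+ /* Acknowledge any timer interrupt pending against the old compare, if requested. */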
+ if (ack)
+ kvm_mips_callbacks->dequeue_timer_int(vcpu);
- /* Update compare */
kvm_write_c0_guest_compare(cop0, compare);
- /* Update timeout if count enabled */
- if (!kvm_mips_count_disabled(vcpu))
- kvm_mips_update_hrtimer(vcpu);
+ /* resume_hrtimer() takes care of timer interrupts > count */
+ if (!dc)
+ kvm_mips_resume_hrtimer(vcpu, now, count);
}
/**
kvm_read_c0_guest_ebase(cop0));
} else if (rd == MIPS_CP0_TLB_HI && sel == 0) {
uint32_t nasid =
- vcpu->arch.gprs[rt] & ASID_MASK;
+ vcpu->arch.gprs[rt] & KVM_ENTRYHI_ASID;
if ((KSEGX(vcpu->arch.gprs[rt]) != CKSEG0) &&
((kvm_read_c0_guest_entryhi(cop0) &
- ASID_MASK) != nasid)) {
+ KVM_ENTRYHI_ASID) != nasid)) {
kvm_debug("MTCz, change ASID from %#lx to %#lx\n",
kvm_read_c0_guest_entryhi(cop0)
- & ASID_MASK,
+ & KVM_ENTRYHI_ASID,
vcpu->arch.gprs[rt]
- & ASID_MASK);
+ & KVM_ENTRYHI_ASID);
/* Blow away the shadow host TLBs */
kvm_mips_flush_host_tlb(1);
/* If we are writing to COMPARE */
/* Clear pending timer interrupt, if any */
- kvm_mips_callbacks->dequeue_timer_int(vcpu);
kvm_mips_write_compare(vcpu,
- vcpu->arch.gprs[rt]);
+ vcpu->arch.gprs[rt],
+ true);
} else if ((rd == MIPS_CP0_STATUS) && (sel == 0)) {
unsigned int old_val, val, change;
*/
index = kvm_mips_guest_tlb_lookup(vcpu, (va & VPN2_MASK) |
(kvm_read_c0_guest_entryhi
- (cop0) & ASID_MASK));
+ (cop0) & KVM_ENTRYHI_ASID));
if (index < 0) {
vcpu->arch.host_cp0_entryhi = (va & VPN2_MASK);
struct mips_coproc *cop0 = vcpu->arch.cop0;
struct kvm_vcpu_arch *arch = &vcpu->arch;
unsigned long entryhi = (vcpu->arch. host_cp0_badvaddr & VPN2_MASK) |
- (kvm_read_c0_guest_entryhi(cop0) & ASID_MASK);
+ (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID);
if ((kvm_read_c0_guest_status(cop0) & ST0_EXL) == 0) {
/* save old pc */
struct kvm_vcpu_arch *arch = &vcpu->arch;
unsigned long entryhi =
(vcpu->arch.host_cp0_badvaddr & VPN2_MASK) |
- (kvm_read_c0_guest_entryhi(cop0) & ASID_MASK);
+ (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID);
if ((kvm_read_c0_guest_status(cop0) & ST0_EXL) == 0) {
/* save old pc */
struct mips_coproc *cop0 = vcpu->arch.cop0;
struct kvm_vcpu_arch *arch = &vcpu->arch;
unsigned long entryhi = (vcpu->arch.host_cp0_badvaddr & VPN2_MASK) |
- (kvm_read_c0_guest_entryhi(cop0) & ASID_MASK);
+ (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID);
if ((kvm_read_c0_guest_status(cop0) & ST0_EXL) == 0) {
/* save old pc */
struct mips_coproc *cop0 = vcpu->arch.cop0;
struct kvm_vcpu_arch *arch = &vcpu->arch;
unsigned long entryhi = (vcpu->arch.host_cp0_badvaddr & VPN2_MASK) |
- (kvm_read_c0_guest_entryhi(cop0) & ASID_MASK);
+ (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID);
if ((kvm_read_c0_guest_status(cop0) & ST0_EXL) == 0) {
/* save old pc */
#ifdef DEBUG
struct mips_coproc *cop0 = vcpu->arch.cop0;
unsigned long entryhi = (vcpu->arch.host_cp0_badvaddr & VPN2_MASK) |
- (kvm_read_c0_guest_entryhi(cop0) & ASID_MASK);
+ (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID);
int index;
/* If address not in the guest TLB, then we are in trouble */
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
unsigned long entryhi = (vcpu->arch.host_cp0_badvaddr & VPN2_MASK) |
- (kvm_read_c0_guest_entryhi(cop0) & ASID_MASK);
+ (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID);
struct kvm_vcpu_arch *arch = &vcpu->arch;
if ((kvm_read_c0_guest_status(cop0) & ST0_EXL) == 0) {
*/
index = kvm_mips_guest_tlb_lookup(vcpu,
(va & VPN2_MASK) |
- (kvm_read_c0_guest_entryhi(vcpu->arch.cop0) & ASID_MASK));
+ (kvm_read_c0_guest_entryhi(vcpu->arch.cop0) &
+ KVM_ENTRYHI_ASID));
if (index < 0) {
if (exccode == EXCCODE_TLBL) {
er = kvm_mips_emulate_tlbmiss_ld(cause, opc, run, vcpu);
uint32_t kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.guest_kernel_asid[smp_processor_id()] & ASID_MASK;
+ int cpu = smp_processor_id();
+
+ return vcpu->arch.guest_kernel_asid[cpu] &
+ cpu_asid_mask(&cpu_data[cpu]);
}
uint32_t kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.guest_user_asid[smp_processor_id()] & ASID_MASK;
+ int cpu = smp_processor_id();
+
+ return vcpu->arch.guest_user_asid[cpu] &
+ cpu_asid_mask(&cpu_data[cpu]);
}
inline uint32_t kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu)
old_pagemask = read_c0_pagemask();
kvm_info("HOST TLBs:\n");
- kvm_info("ASID: %#lx\n", read_c0_entryhi() & ASID_MASK);
+ kvm_info("ASID: %#lx\n", read_c0_entryhi() &
+ cpu_asid_mask(&current_cpu_data));
for (i = 0; i < current_cpu_data.tlbsize; i++) {
write_c0_index(i);
int even;
struct kvm *kvm = vcpu->kvm;
const int flush_dcache_mask = 0;
+ int ret;
if (KVM_GUEST_KSEGX(badvaddr) != KVM_GUEST_KSEG0) {
kvm_err("%s: Invalid BadVaddr: %#lx\n", __func__, badvaddr);
pfn1 = kvm->arch.guest_pmap[gfn];
}
- entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu));
entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) |
(1 << 2) | (0x1 << 1);
entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) |
(1 << 2) | (0x1 << 1);
- return kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
- flush_dcache_mask);
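+ /*
+  * Disable preemption so the ASID is read and the host TLB is written
+  * on the same CPU.
+  */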
+ preempt_disable();
+ entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu));
+ ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
+ flush_dcache_mask);
+ preempt_enable();
+
+ return ret;
}
EXPORT_SYMBOL_GPL(kvm_mips_handle_kseg0_tlb_fault);
unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
struct kvm *kvm = vcpu->kvm;
kvm_pfn_t pfn0, pfn1;
+ int ret;
if ((tlb->tlb_hi & VPN2_MASK) == 0) {
pfn0 = 0;
*hpa1 = pfn1 << PAGE_SHIFT;
/* Get attributes from the Guest TLB */
- entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ?
- kvm_mips_get_kernel_asid(vcpu) :
- kvm_mips_get_user_asid(vcpu));
entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) |
(tlb->tlb_lo0 & MIPS3_PG_D) | (tlb->tlb_lo0 & MIPS3_PG_V);
entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) |
kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc,
tlb->tlb_lo0, tlb->tlb_lo1);
- return kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
- tlb->tlb_mask);
+ preempt_disable();
+ entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ?
+ kvm_mips_get_kernel_asid(vcpu) :
+ kvm_mips_get_user_asid(vcpu));
+ ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
+ tlb->tlb_mask);
+ preempt_enable();
+
+ return ret;
}
EXPORT_SYMBOL_GPL(kvm_mips_handle_mapped_seg_tlb_fault);
{
unsigned long asid = asid_cache(cpu);
- asid += ASID_INC;
- if (!(asid & ASID_MASK)) {
+ asid += cpu_asid_inc();
+ if (!(asid & cpu_asid_mask(&cpu_data[cpu]))) {
if (cpu_has_vtag_icache)
flush_icache_all();
kvm_local_flush_tlb_all(); /* start new asid cycle */
if (!asid) /* fix version if needed */
- asid = ASID_FIRST_VERSION;
+ asid = asid_first_version(cpu);
}
cpu_context(cpu, mm) = asid_cache(cpu) = asid;
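/*
 * Illustrative sketch, not part of the patch: an ASID cache value packs a
 * generation ("version") in the upper bits and the hardware ASID in the low
 * bits, so the checks below compare only the version part to decide whether
 * a cached guest ASID is stale. Standalone C with a hypothetical 8-bit
 * hardware ASID width.
 */
#include <stdbool.h>

#define EXAMPLE_ASID_MASK		0xffUL
#define EXAMPLE_ASID_VERSION_MASK	(~EXAMPLE_ASID_MASK)

static bool example_asid_still_valid(unsigned long cached_asid,
				     unsigned long asid_cache)
{
	/* Same generation: only the low (hardware ASID) bits may differ. */
	return !((cached_asid ^ asid_cache) & EXAMPLE_ASID_VERSION_MASK);
}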
/* Restore ASID once we are scheduled back after preemption */
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
+ unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]);
unsigned long flags;
int newasid = 0;
local_irq_save(flags);
if ((vcpu->arch.guest_kernel_asid[cpu] ^ asid_cache(cpu)) &
- ASID_VERSION_MASK) {
+ asid_version_mask(cpu)) {
kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu);
vcpu->arch.guest_kernel_asid[cpu] =
vcpu->arch.guest_kernel_mm.context.asid[cpu];
*/
if (current->flags & PF_VCPU) {
write_c0_entryhi(vcpu->arch.
- preempt_entryhi & ASID_MASK);
+ preempt_entryhi & asid_mask);
ehb();
}
} else {
if (KVM_GUEST_KERNEL_MODE(vcpu))
write_c0_entryhi(vcpu->arch.
guest_kernel_asid[cpu] &
- ASID_MASK);
+ asid_mask);
else
write_c0_entryhi(vcpu->arch.
guest_user_asid[cpu] &
- ASID_MASK);
+ asid_mask);
ehb();
}
}
kvm_mips_callbacks->vcpu_get_regs(vcpu);
if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) &
- ASID_VERSION_MASK)) {
+ asid_version_mask(cpu))) {
kvm_debug("%s: Dropping MMU Context: %#lx\n", __func__,
cpu_context(cpu, current->mm));
drop_mmu_context(current->mm, cpu);
inst = *(opc);
} else {
vpn2 = (unsigned long) opc & VPN2_MASK;
- asid = kvm_read_c0_guest_entryhi(cop0) & ASID_MASK;
+ asid = kvm_read_c0_guest_entryhi(cop0) &
+ KVM_ENTRYHI_ASID;
index = kvm_mips_guest_tlb_lookup(vcpu, vpn2 | asid);
if (index < 0) {
kvm_err("%s: get_user_failed for %p, vcpu: %p, ASID: %#lx\n",
kvm_write_c0_guest_intctl(cop0, 0xFC000000);
/* Put in vcpu id as CPUNum into Ebase Reg to handle SMP Guests */
- kvm_write_c0_guest_ebase(cop0, KVM_GUEST_KSEG0 | (vcpu_id & 0xFF));
+ kvm_write_c0_guest_ebase(cop0, KVM_GUEST_KSEG0 |
+ (vcpu_id & MIPS_EBASE_CPUNUM));
return 0;
}
kvm_mips_write_count(vcpu, v);
break;
case KVM_REG_MIPS_CP0_COMPARE:
- kvm_mips_write_compare(vcpu, v);
+ kvm_mips_write_compare(vcpu, v, false);
break;
case KVM_REG_MIPS_CP0_CAUSE:
/*
unsigned int max_cores;
unsigned long hsa_size;
unsigned long facilities;
+ unsigned int hmfai;
};
extern struct sclp_info sclp;
+struct zpci_report_error_header {
+ u8 version; /* Interface version byte */
+ u8 action; /* Action qualifier byte
+ * 1: Deconfigure and repair action requested
+ * (OpenCrypto Problem Call Home)
+ * 2: Informational Report
+ * (OpenCrypto Successful Diagnostics Execution)
+ */
+ u16 length; /* Length of Subsequent Data (up to 4K – SCLP header */
+ u8 data[0]; /* Subsequent Data passed verbatim to SCLP ET 24 */
+} __packed;
+
int sclp_get_core_info(struct sclp_core_info *info);
int sclp_core_configure(u8 core);
int sclp_core_deconfigure(u8 core);
void sclp_get_ipl_info(struct sclp_ipl_info *info);
int sclp_pci_configure(u32 fid);
int sclp_pci_deconfigure(u32 fid);
+int sclp_pci_report(struct zpci_report_error_header *report, u32 fh, u32 fid);
int memcpy_hsa_kernel(void *dest, unsigned long src, size_t count);
int memcpy_hsa_user(void __user *dest, unsigned long src, size_t count);
void sclp_early_detect(void);
* since it has been deleted from active_mmu_pages but still can be found
* at hash list.
*
- * for_each_gfn_indirect_valid_sp has skipped that kind of page and
- * kvm_mmu_get_page(), the only user of for_each_gfn_sp(), has skipped
- * all the obsolete pages.
+ * for_each_gfn_valid_sp() has skipped that kind of pages.
*/
- #define for_each_gfn_sp(_kvm, _sp, _gfn) \
+ #define for_each_gfn_valid_sp(_kvm, _sp, _gfn) \
hlist_for_each_entry(_sp, \
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
- if ((_sp)->gfn != (_gfn)) {} else
+ if ((_sp)->gfn != (_gfn) || is_obsolete_sp((_kvm), (_sp)) \
+ || (_sp)->role.invalid) {} else
#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
- for_each_gfn_sp(_kvm, _sp, _gfn) \
- if ((_sp)->role.direct || (_sp)->role.invalid) {} else
+ for_each_gfn_valid_sp(_kvm, _sp, _gfn) \
+ if ((_sp)->role.direct) {} else
/* @sp->gfn should be write-protected at the call site */
static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
static void mmu_audit_disable(void) { }
#endif
+ static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+ {
+ return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+ }
+
static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
struct list_head *invalid_list)
{
__clear_sp_write_flooding_count(sp);
}
- static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
- {
- return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
- }
-
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gfn_t gfn,
gva_t gaddr,
quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
}
- for_each_gfn_sp(vcpu->kvm, sp, gfn) {
- if (is_obsolete_sp(vcpu->kvm, sp))
- continue;
-
+ for_each_gfn_valid_sp(vcpu->kvm, sp, gfn) {
if (!need_sync && sp->unsync)
need_sync = true;
*/
if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
level == PT_PAGE_TABLE_LEVEL &&
- PageTransCompound(pfn_to_page(pfn)) &&
+ PageTransCompoundMap(pfn_to_page(pfn)) &&
!mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
unsigned long mask;
/*
__reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check,
boot_cpu_data.x86_phys_bits,
context->shadow_root_level, false,
- cpu_has_gbpages, true, true);
+ boot_cpu_has(X86_FEATURE_GBPAGES),
+ true, true);
else
__reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
boot_cpu_data.x86_phys_bits,
*/
if (sp->role.direct &&
!kvm_is_reserved_pfn(pfn) &&
- PageTransCompound(pfn_to_page(pfn))) {
+ PageTransCompoundMap(pfn_to_page(pfn))) {
drop_spte(kvm, sptep);
need_tlb_flush = 1;
goto restart;
* the COPYING file in the top-level directory.
*
*/
+
+ #define pr_fmt(fmt) "SVM: " fmt
+
#include <linux/kvm_host.h>
#include "irq.h"
#include <linux/trace_events.h>
#include <linux/slab.h>
+ #include <asm/apic.h>
#include <asm/perf_event.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
#define SVM_FEATURE_DECODE_ASSIST (1 << 7)
#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
+ #define SVM_AVIC_DOORBELL 0xc001011b
+
#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */
#define TSC_RATIO_MIN 0x0000000000000001ULL
#define TSC_RATIO_MAX 0x000000ffffffffffULL
+ #define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)
+
+ /*
+ * 0xff is broadcast, so the max index allowed for physical APIC ID
+ * table is 0xfe. APIC IDs above 0xff are reserved.
+ */
+ #define AVIC_MAX_PHYSICAL_ID_COUNT 255
+
+ #define AVIC_UNACCEL_ACCESS_WRITE_MASK 1
+ #define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0
+ #define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF
+
static bool erratum_383_found __read_mostly;
static const u32 host_save_user_msrs[] = {
/* cached guest cpuid flags for faster access */
bool nrips_enabled : 1;
+
+ u32 ldr_reg;
+ struct page *avic_backing_page;
+ u64 *avic_physical_id_cache;
+ bool avic_is_running;
};
+ #define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF)
+ #define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
+
+ #define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL)
+ #define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
+ #define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
+ #define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
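/*
 * Illustrative sketch, not part of the patch: how a physical APIC ID table
 * entry is laid out according to the masks above - a 4KB-aligned backing
 * page address, the host physical APIC ID, an is-running bit and a valid
 * bit. Standalone C; the patch manipulates live table entries with
 * READ_ONCE/WRITE_ONCE instead of building them from scratch.
 */
#include <stdint.h>

static inline uint64_t example_avic_physical_id_entry(uint64_t backing_page_pa,
						       uint8_t host_apic_id,
						       int is_running)
{
	uint64_t entry = 0;

	entry |= backing_page_pa & (0xFFFFFFFFFFULL << 12);	/* BACKING_PAGE */
	entry |= host_apic_id & 0xFFULL;			/* HOST_PHYSICAL_ID */
	if (is_running)
		entry |= 1ULL << 62;				/* IS_RUNNING */
	entry |= 1ULL << 63;					/* VALID */

	return entry;
}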
+
static DEFINE_PER_CPU(u64, current_tsc_ratio);
#define TSC_RATIO_DEFAULT 0x0100000000ULL
static int nested = true;
module_param(nested, int, S_IRUGO);
+ /* enable / disable AVIC */
+ static int avic;
+ module_param(avic, int, S_IRUGO);
+
static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
static void svm_flush_tlb(struct kvm_vcpu *vcpu);
static void svm_complete_interrupts(struct vcpu_svm *svm);
VMCB_SEG, /* CS, DS, SS, ES, CPL */
VMCB_CR2, /* CR2 only */
VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
+ VMCB_AVIC, /* AVIC APIC_BAR, AVIC APIC_BACKING_PAGE,
+ * AVIC PHYSICAL_TABLE pointer,
+ * AVIC LOGICAL_TABLE pointer
+ */
VMCB_DIRTY_MAX,
};
/* TPR and CR2 are always written before VMRUN */
#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))
+ #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
+
static inline void mark_all_dirty(struct vmcb *vmcb)
{
vmcb->control.clean = 0;
return container_of(vcpu, struct vcpu_svm, vcpu);
}
+ static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data)
+ {
+ svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK;
+ mark_dirty(svm->vmcb, VMCB_AVIC);
+ }
+
+ static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
+ {
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 *entry = svm->avic_physical_id_cache;
+
+ if (!entry)
+ return false;
+
+ return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
+ }
+
static void recalc_intercepts(struct vcpu_svm *svm)
{
struct vmcb_control_area *c, *h;
} else
kvm_disable_tdp();
+ if (avic && (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC)))
+ avic = false;
+
+ if (avic)
+ pr_info("AVIC enabled\n");
+
return 0;
err:
mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
+ static void avic_init_vmcb(struct vcpu_svm *svm)
+ {
+ struct vmcb *vmcb = svm->vmcb;
+ struct kvm_arch *vm_data = &svm->vcpu.kvm->arch;
+ phys_addr_t bpa = page_to_phys(svm->avic_backing_page);
+ phys_addr_t lpa = page_to_phys(vm_data->avic_logical_id_table_page);
+ phys_addr_t ppa = page_to_phys(vm_data->avic_physical_id_table_page);
+
+ vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
+ vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
+ vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
+ vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
+ vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
+ svm->vcpu.arch.apicv_active = true;
+ }
+
static void init_vmcb(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
- set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+ if (!kvm_vcpu_apicv_active(&svm->vcpu))
+ set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
set_dr_intercepts(svm);
set_intercept(svm, INTERCEPT_PAUSE);
}
+ if (avic)
+ avic_init_vmcb(svm);
+
mark_all_dirty(svm->vmcb);
enable_gif(svm);
+
+ }
+
+ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, int index)
+ {
+ u64 *avic_physical_id_table;
+ struct kvm_arch *vm_data = &vcpu->kvm->arch;
+
+ if (index >= AVIC_MAX_PHYSICAL_ID_COUNT)
+ return NULL;
+
+ avic_physical_id_table = page_address(vm_data->avic_physical_id_table_page);
+
+ return &avic_physical_id_table[index];
+ }
+
+ /**
+ * Note:
+ * AVIC hardware walks the nested page table to check permissions,
+ * but does not use the SPA address specified in the leaf page
+ * table entry since it uses address in the AVIC_BACKING_PAGE pointer
+ * field of the VMCB. Therefore, we set up the
+ * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
+ */
+ static int avic_init_access_page(struct kvm_vcpu *vcpu)
+ {
+ struct kvm *kvm = vcpu->kvm;
+ int ret;
+
+ if (kvm->arch.apic_access_page_done)
+ return 0;
+
+ ret = x86_set_memory_region(kvm,
+ APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+ APIC_DEFAULT_PHYS_BASE,
+ PAGE_SIZE);
+ if (ret)
+ return ret;
+
+ kvm->arch.apic_access_page_done = true;
+ return 0;
+ }
+
+ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
+ {
+ int ret;
+ u64 *entry, new_entry;
+ int id = vcpu->vcpu_id;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ ret = avic_init_access_page(vcpu);
+ if (ret)
+ return ret;
+
+ if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
+ return -EINVAL;
+
+ if (!svm->vcpu.arch.apic->regs)
+ return -EINVAL;
+
+ svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs);
+
+ /* Setting AVIC backing page address in the phy APIC ID table */
+ entry = avic_get_physical_id_entry(vcpu, id);
+ if (!entry)
+ return -EINVAL;
+
+ new_entry = READ_ONCE(*entry);
+ new_entry = (page_to_phys(svm->avic_backing_page) &
+ AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
+ AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
+ WRITE_ONCE(*entry, new_entry);
+
+ svm->avic_physical_id_cache = entry;
+
+ return 0;
+ }
+
+ static void avic_vm_destroy(struct kvm *kvm)
+ {
+ struct kvm_arch *vm_data = &kvm->arch;
+
+ if (vm_data->avic_logical_id_table_page)
+ __free_page(vm_data->avic_logical_id_table_page);
+ if (vm_data->avic_physical_id_table_page)
+ __free_page(vm_data->avic_physical_id_table_page);
+ }
+
+ static int avic_vm_init(struct kvm *kvm)
+ {
+ int err = -ENOMEM;
+ struct kvm_arch *vm_data = &kvm->arch;
+ struct page *p_page;
+ struct page *l_page;
+
+ if (!avic)
+ return 0;
+
+ /* Allocating physical APIC ID table (4KB) */
+ p_page = alloc_page(GFP_KERNEL);
+ if (!p_page)
+ goto free_avic;
+
+ vm_data->avic_physical_id_table_page = p_page;
+ clear_page(page_address(p_page));
+
+ /* Allocating logical APIC ID table (4KB) */
+ l_page = alloc_page(GFP_KERNEL);
+ if (!l_page)
+ goto free_avic;
+
+ vm_data->avic_logical_id_table_page = l_page;
+ clear_page(page_address(l_page));
+
+ return 0;
+
+ free_avic:
+ avic_vm_destroy(kvm);
+ return err;
+ }
+
+ /**
+ * This function is called during VCPU halt/unhalt.
+ */
+ static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+ {
+ u64 entry;
+ int h_physical_id = __default_cpu_present_to_apicid(vcpu->cpu);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ svm->avic_is_running = is_run;
+
+ /* ID = 0xff (broadcast), ID > 0xff (reserved) */
+ if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT))
+ return;
+
+ entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ WARN_ON(is_run == !!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK));
+
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+ if (is_run)
+ entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+ WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+ }
+
+ static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+ {
+ u64 entry;
+ /* ID = 0xff (broadcast), ID > 0xff (reserved) */
+ int h_physical_id = __default_cpu_present_to_apicid(cpu);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT))
+ return;
+
+ entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
+
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+ entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
+
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+ if (svm->avic_is_running)
+ entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+
+ WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+ }
+
+ static void avic_vcpu_put(struct kvm_vcpu *vcpu)
+ {
+ u64 entry;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+ WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
}
static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
+
+ if (kvm_vcpu_apicv_active(vcpu) && !init_event)
+ avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
}
static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
if (!hsave_page)
goto free_page3;
+ if (avic) {
+ err = avic_init_backing_page(&svm->vcpu);
+ if (err)
+ goto free_page4;
+ }
+
+ /* We initialize this flag to true to make sure that the is_running
+ * bit would be set the first time the vcpu is loaded.
+ */
+ svm->avic_is_running = true;
+
svm->nested.hsave = page_address(hsave_page);
svm->msrpm = page_address(msrpm_pages);
return &svm->vcpu;
+ free_page4:
+ __free_page(hsave_page);
free_page3:
__free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
free_page2:
/* This assumes that the kernel never uses MSR_TSC_AUX */
if (static_cpu_has(X86_FEATURE_RDTSCP))
wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
+
+ avic_vcpu_load(vcpu, cpu);
}
static void svm_vcpu_put(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
int i;
+ avic_vcpu_put(vcpu);
+
++vcpu->stat.host_state_reload;
kvm_load_ldt(svm->host.ldt);
#ifdef CONFIG_X86_64
loadsegment(fs, svm->host.fs);
- wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
+ wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase);
load_gs_index(svm->host.gs);
#else
#ifdef CONFIG_X86_32_LAZY_GS
wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
}
+ static void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
+ {
+ avic_set_running(vcpu, false);
+ }
+
+ static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
+ {
+ avic_set_running(vcpu, true);
+ }
+
static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
{
return to_svm(vcpu)->vmcb->save.rflags;
disable_gif(svm);
/* After a CLGI no interrupts should come */
- svm_clear_vintr(svm);
- svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
-
- mark_dirty(svm->vmcb, VMCB_INTR);
+ if (!kvm_vcpu_apicv_active(&svm->vcpu)) {
+ svm_clear_vintr(svm);
+ svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
+ mark_dirty(svm->vmcb, VMCB_INTR);
+ }
return 1;
}
case MSR_VM_IGNNE:
vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
break;
+ case MSR_IA32_APICBASE:
+ if (kvm_vcpu_apicv_active(vcpu))
+ avic_update_vapic_bar(to_svm(vcpu), data);
+ /* Fall through */
default:
return kvm_set_msr_common(vcpu, msr);
}
return nop_interception(svm);
}
+ enum avic_ipi_failure_cause {
+ AVIC_IPI_FAILURE_INVALID_INT_TYPE,
+ AVIC_IPI_FAILURE_TARGET_NOT_RUNNING,
+ AVIC_IPI_FAILURE_INVALID_TARGET,
+ AVIC_IPI_FAILURE_INVALID_BACKING_PAGE,
+ };
+
+ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
+ {
+ u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
+ u32 icrl = svm->vmcb->control.exit_info_1;
+ u32 id = svm->vmcb->control.exit_info_2 >> 32;
+ u32 index = svm->vmcb->control.exit_info_2 & 0xFF;
+ struct kvm_lapic *apic = svm->vcpu.arch.apic;
+
+ trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index);
+
+ switch (id) {
+ case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
+ /*
+ * AVIC hardware handles the generation of
+ * IPIs when the specified Message Type is Fixed
+ * (also known as fixed delivery mode) and
+ * the Trigger Mode is edge-triggered. The hardware
+ * also supports self and broadcast delivery modes
+ * specified via the Destination Shorthand(DSH)
+ * field of the ICRL. Logical and physical APIC ID
+ * formats are supported. All other IPI types cause
+ * a #VMEXIT, which needs to be emulated.
+ */
+ kvm_lapic_reg_write(apic, APIC_ICR2, icrh);
+ kvm_lapic_reg_write(apic, APIC_ICR, icrl);
+ break;
+ case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: {
+ int i;
+ struct kvm_vcpu *vcpu;
+ struct kvm *kvm = svm->vcpu.kvm;
+ struct kvm_lapic *apic = svm->vcpu.arch.apic;
+
+ /*
+ * At this point, we expect that the AVIC HW has already
+ * set the appropriate IRR bits on the valid target
+ * vcpus. So, we just need to kick the appropriate vcpu.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ bool m = kvm_apic_match_dest(vcpu, apic,
+ icrl & KVM_APIC_SHORT_MASK,
+ GET_APIC_DEST_FIELD(icrh),
+ icrl & KVM_APIC_DEST_MASK);
+
+ if (m && !avic_vcpu_is_running(vcpu))
+ kvm_vcpu_wake_up(vcpu);
+ }
+ break;
+ }
+ case AVIC_IPI_FAILURE_INVALID_TARGET:
+ break;
+ case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
+ WARN_ONCE(1, "Invalid backing page\n");
+ break;
+ default:
+ pr_err("Unknown IPI interception\n");
+ }
+
+ return 1;
+ }
+
+ static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
+ {
+ struct kvm_arch *vm_data = &vcpu->kvm->arch;
+ int index;
+ u32 *logical_apic_id_table;
+ int dlid = GET_APIC_LOGICAL_ID(ldr);
+
+ if (!dlid)
+ return NULL;
+
+ if (flat) { /* flat */
+ index = ffs(dlid) - 1;
+ if (index > 7)
+ return NULL;
+ } else { /* cluster */
+ int cluster = (dlid & 0xf0) >> 4;
+ int apic = ffs(dlid & 0x0f) - 1;
+
+ if ((apic < 0) || (apic > 7) ||
+ (cluster >= 0xf))
+ return NULL;
+ index = (cluster << 2) + apic;
+ }
+
+ logical_apic_id_table = (u32 *) page_address(vm_data->avic_logical_id_table_page);
+
+ return &logical_apic_id_table[index];
+ }
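/*
 * Illustrative sketch, not part of the patch: the logical-ID-to-index
 * mapping used by avic_get_logical_id_entry() above, as a standalone
 * helper (using the compiler's ffs builtin in place of the kernel's
 * ffs()). Flat mode uses one bit per APIC (indices 0-7); cluster mode
 * packs four APICs per cluster (index = cluster * 4 + apic).
 */
static int example_avic_logical_index(unsigned int dlid, int flat)
{
	if (!dlid)
		return -1;

	if (flat) {
		int index = __builtin_ffs(dlid) - 1;

		return index > 7 ? -1 : index;
	} else {
		int cluster = (dlid & 0xf0) >> 4;
		int apic = __builtin_ffs(dlid & 0x0f) - 1;

		if (apic < 0 || apic > 7 || cluster >= 0xf)
			return -1;
		return (cluster << 2) + apic;
	}
}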
+
+ static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr,
+ bool valid)
+ {
+ bool flat;
+ u32 *entry, new_entry;
+
+ flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
+ entry = avic_get_logical_id_entry(vcpu, ldr, flat);
+ if (!entry)
+ return -EINVAL;
+
+ new_entry = READ_ONCE(*entry);
+ new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
+ new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
+ if (valid)
+ new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
+ else
+ new_entry &= ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
+ WRITE_ONCE(*entry, new_entry);
+
+ return 0;
+ }
+
+ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
+ {
+ int ret;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
+
+ if (!ldr)
+ return 1;
+
+ ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr, true);
+ if (ret && svm->ldr_reg) {
+ avic_ldr_write(vcpu, 0, svm->ldr_reg, false);
+ svm->ldr_reg = 0;
+ } else {
+ svm->ldr_reg = ldr;
+ }
+ return ret;
+ }
+
+ static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
+ {
+ u64 *old, *new;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 apic_id_reg = kvm_lapic_get_reg(vcpu->arch.apic, APIC_ID);
+ u32 id = (apic_id_reg >> 24) & 0xff;
+
+ if (vcpu->vcpu_id == id)
+ return 0;
+
+ old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id);
+ new = avic_get_physical_id_entry(vcpu, id);
+ if (!new || !old)
+ return 1;
+
+ /* We need to move physical_id_entry to new offset */
+ *new = *old;
+ *old = 0ULL;
+ to_svm(vcpu)->avic_physical_id_cache = new;
+
+ /*
+ * Also update the guest physical APIC ID in the logical
+ * APIC ID table entry if the LDR has already been set up.
+ */
+ if (svm->ldr_reg)
+ avic_handle_ldr_update(vcpu);
+
+ return 0;
+ }
+
+ static int avic_handle_dfr_update(struct kvm_vcpu *vcpu)
+ {
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_arch *vm_data = &vcpu->kvm->arch;
+ u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
+ u32 mod = (dfr >> 28) & 0xf;
+
+ /*
+ * We assume that all local APICs are using the same type.
+ * If this changes, we need to flush the AVIC logical
+ * APIC ID table.
+ */
+ if (vm_data->ldr_mode == mod)
+ return 0;
+
+ clear_page(page_address(vm_data->avic_logical_id_table_page));
+ vm_data->ldr_mode = mod;
+
+ if (svm->ldr_reg)
+ avic_handle_ldr_update(vcpu);
+ return 0;
+ }
+
+ static int avic_unaccel_trap_write(struct vcpu_svm *svm)
+ {
+ struct kvm_lapic *apic = svm->vcpu.arch.apic;
+ u32 offset = svm->vmcb->control.exit_info_1 &
+ AVIC_UNACCEL_ACCESS_OFFSET_MASK;
+
+ switch (offset) {
+ case APIC_ID:
+ if (avic_handle_apic_id_update(&svm->vcpu))
+ return 0;
+ break;
+ case APIC_LDR:
+ if (avic_handle_ldr_update(&svm->vcpu))
+ return 0;
+ break;
+ case APIC_DFR:
+ avic_handle_dfr_update(&svm->vcpu);
+ break;
+ default:
+ break;
+ }
+
+ kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
+
+ return 1;
+ }
+
+ static bool is_avic_unaccelerated_access_trap(u32 offset)
+ {
+ bool ret = false;
+
+ switch (offset) {
+ case APIC_ID:
+ case APIC_EOI:
+ case APIC_RRR:
+ case APIC_LDR:
+ case APIC_DFR:
+ case APIC_SPIV:
+ case APIC_ESR:
+ case APIC_ICR:
+ case APIC_LVTT:
+ case APIC_LVTTHMR:
+ case APIC_LVTPC:
+ case APIC_LVT0:
+ case APIC_LVT1:
+ case APIC_LVTERR:
+ case APIC_TMICT:
+ case APIC_TDCR:
+ ret = true;
+ break;
+ default:
+ break;
+ }
+ return ret;
+ }
+
+ static int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
+ {
+ int ret = 0;
+ u32 offset = svm->vmcb->control.exit_info_1 &
+ AVIC_UNACCEL_ACCESS_OFFSET_MASK;
+ u32 vector = svm->vmcb->control.exit_info_2 &
+ AVIC_UNACCEL_ACCESS_VECTOR_MASK;
+ bool write = (svm->vmcb->control.exit_info_1 >> 32) &
+ AVIC_UNACCEL_ACCESS_WRITE_MASK;
+ bool trap = is_avic_unaccelerated_access_trap(offset);
+
+ trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset,
+ trap, write, vector);
+ if (trap) {
+ /* Handling Trap */
+ WARN_ONCE(!write, "svm: Handling trap read.\n");
+ ret = avic_unaccel_trap_write(svm);
+ } else {
+ /* Handling Fault */
+ ret = (emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE);
+ }
+
+ return ret;
+ }
+
static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_READ_CR0] = cr_interception,
[SVM_EXIT_READ_CR3] = cr_interception,
[SVM_EXIT_XSETBV] = xsetbv_interception,
[SVM_EXIT_NPF] = pf_interception,
[SVM_EXIT_RSM] = emulate_on_interception,
+ [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
+ [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
};
static void dump_vmcb(struct kvm_vcpu *vcpu)
pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
+ pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl);
pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
+ pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
+ pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
+ pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
pr_err("VMCB State Save Area:\n");
pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
"es:",
{
struct vmcb_control_area *control;
+ /* The following fields are ignored when AVIC is enabled */
control = &svm->vmcb->control;
control->int_vector = irq;
control->int_ctl &= ~V_INTR_PRIO_MASK;
SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
}
+ static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu)
+ {
+ return is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK);
+ }
+
static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
+ if (svm_nested_virtualize_tpr(vcpu) ||
+ kvm_vcpu_apicv_active(vcpu))
return;
clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
static bool svm_get_enable_apicv(void)
{
- return false;
+ return avic;
+ }
+
+ static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
+ {
}
+ static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
+ {
+ }
+
+ /* Note: Currently only used by Hyper-V. */
static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+
+ if (!avic)
+ return;
+
+ vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
+ mark_dirty(vmcb, VMCB_INTR);
}
static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
return;
}
+ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
+ {
+ kvm_lapic_set_irr(vec, vcpu->arch.apic);
+ smp_mb__after_atomic();
+
+ if (avic_vcpu_is_running(vcpu))
+ wrmsrl(SVM_AVIC_DOORBELL,
+ __default_cpu_present_to_apicid(vcpu->cpu));
+ else
+ kvm_vcpu_wake_up(vcpu);
+ }
+
static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
{
struct vcpu_svm *svm = to_svm(vcpu);
+ if (kvm_vcpu_apicv_active(vcpu))
+ return;
+
/*
* In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
* 1, because that's a separate STGI/VMRUN intercept. The next time we
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
+ if (svm_nested_virtualize_tpr(vcpu))
return;
if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
struct vcpu_svm *svm = to_svm(vcpu);
u64 cr8;
- if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
+ if (svm_nested_virtualize_tpr(vcpu) ||
+ kvm_vcpu_apicv_active(vcpu))
return;
cr8 = kvm_get_cr8(vcpu);
static void svm_cpuid_update(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_cpuid_entry2 *entry;
/* Update nrips enabled cache */
svm->nrips_enabled = !!guest_cpuid_has_nrips(&svm->vcpu);
+
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ entry = kvm_find_cpuid_entry(vcpu, 1, 0);
+ if (entry)
+ entry->ecx &= ~bit(X86_FEATURE_X2APIC);
}
static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
{
switch (func) {
+ case 0x1:
+ if (avic)
+ entry->ecx &= ~bit(X86_FEATURE_X2APIC);
+ break;
case 0x80000001:
if (nested)
entry->ecx |= (1 << 2); /* Set SVM bit */
{
}
+ static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
+ {
+ if (avic_handle_apic_id_update(vcpu) != 0)
+ return;
+ if (avic_handle_dfr_update(vcpu) != 0)
+ return;
+ avic_handle_ldr_update(vcpu);
+ }
+
static struct kvm_x86_ops svm_x86_ops = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
.vcpu_free = svm_free_vcpu,
.vcpu_reset = svm_vcpu_reset,
+ .vm_init = avic_vm_init,
+ .vm_destroy = avic_vm_destroy,
+
.prepare_guest_switch = svm_prepare_guest_switch,
.vcpu_load = svm_vcpu_load,
.vcpu_put = svm_vcpu_put,
+ .vcpu_blocking = svm_vcpu_blocking,
+ .vcpu_unblocking = svm_vcpu_unblocking,
.update_bp_intercept = update_bp_intercept,
.get_msr = svm_get_msr,
.refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
.load_eoi_exitmap = svm_load_eoi_exitmap,
.sync_pir_to_irr = svm_sync_pir_to_irr,
+ .hwapic_irr_update = svm_hwapic_irr_update,
+ .hwapic_isr_update = svm_hwapic_isr_update,
+ .apicv_post_state_restore = avic_post_state_restore,
.set_tss_addr = svm_set_tss_addr,
.get_tdp_level = get_npt_level,
.sched_in = svm_sched_in,
.pmu_ops = &amd_pmu_ops,
+ .deliver_posted_interrupt = svm_deliver_avic_intr,
};
static int __init svm_init(void)
#define host_clocks \
{VCLOCK_NONE, "none"}, \
- {VCLOCK_TSC, "tsc"}, \
- {VCLOCK_HPET, "hpet"} \
+ {VCLOCK_TSC, "tsc"} \
TRACE_EVENT(kvm_update_master_clock,
TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched),
__entry->vcpu_id, __entry->timer_index)
);
+ /*
+ * Tracepoint for AMD AVIC
+ */
+ TRACE_EVENT(kvm_avic_incomplete_ipi,
+ TP_PROTO(u32 vcpu, u32 icrh, u32 icrl, u32 id, u32 index),
+ TP_ARGS(vcpu, icrh, icrl, id, index),
+
+ TP_STRUCT__entry(
+ __field(u32, vcpu)
+ __field(u32, icrh)
+ __field(u32, icrl)
+ __field(u32, id)
+ __field(u32, index)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu = vcpu;
+ __entry->icrh = icrh;
+ __entry->icrl = icrl;
+ __entry->id = id;
+ __entry->index = index;
+ ),
+
+ TP_printk("vcpu=%u, icrh:icrl=%#010x:%08x, id=%u, index=%u\n",
+ __entry->vcpu, __entry->icrh, __entry->icrl,
+ __entry->id, __entry->index)
+ );
+
+ TRACE_EVENT(kvm_avic_unaccelerated_access,
+ TP_PROTO(u32 vcpu, u32 offset, bool ft, bool rw, u32 vec),
+ TP_ARGS(vcpu, offset, ft, rw, vec),
+
+ TP_STRUCT__entry(
+ __field(u32, vcpu)
+ __field(u32, offset)
+ __field(bool, ft)
+ __field(bool, rw)
+ __field(u32, vec)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu = vcpu;
+ __entry->offset = offset;
+ __entry->ft = ft;
+ __entry->rw = rw;
+ __entry->vec = vec;
+ ),
+
+ TP_printk("vcpu=%u, offset=%#x(%s), %s, %s, vec=%#x\n",
+ __entry->vcpu,
+ __entry->offset,
+ __print_symbolic(__entry->offset, kvm_trace_symbol_apic),
+ __entry->ft ? "trap" : "fault",
+ __entry->rw ? "write" : "read",
+ __entry->vec)
+ );
+
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
static void kvm_cpu_vmxon(u64 addr)
{
+ intel_pt_handle_vmx(1);
+
asm volatile (ASM_VMX_VMXON_RAX
: : "a"(&addr), "m"(addr)
: "memory", "cc");
static void kvm_cpu_vmxoff(void)
{
asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
+
+ intel_pt_handle_vmx(0);
}
static void hardware_disable(void)
}
}
- if (cpu_has_xsaves)
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
rdmsrl(MSR_IA32_XSS, host_xss);
return 0;
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
- vmx_set_cr0(vcpu, cr0); /* enter rmode */
vmx->vcpu.arch.cr0 = cr0;
+ vmx_set_cr0(vcpu, cr0); /* enter rmode */
vmx_set_cr4(vcpu, 0);
vmx_set_efer(vcpu, 0);
vmx_fpu_activate(vcpu);
vmcs_write64(APIC_ACCESS_ADDR, hpa);
}
- static void vmx_hwapic_isr_update(struct kvm *kvm, int isr)
+ static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
{
u16 status;
u8 old;
- if (isr == -1)
- isr = 0;
+ if (max_isr == -1)
+ max_isr = 0;
status = vmcs_read16(GUEST_INTR_STATUS);
old = status >> 8;
- if (isr != old) {
+ if (max_isr != old) {
status &= 0xff;
- status |= isr << 8;
+ status |= max_isr << 8;
vmcs_write16(GUEST_INTR_STATUS, status);
}
}
{ "halt_exits", VCPU_STAT(halt_exits) },
{ "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
+ { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
{ "hypercalls", VCPU_STAT(hypercalls) },
{ "request_irq", VCPU_STAT(request_irq_exits) },
vcpu->arch.pv_time_enabled = false;
}
- static void accumulate_steal_time(struct kvm_vcpu *vcpu)
- {
- u64 delta;
-
- if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
- return;
-
- delta = current->sched_info.run_delay - vcpu->arch.st.last_steal;
- vcpu->arch.st.last_steal = current->sched_info.run_delay;
- vcpu->arch.st.accum_steal = delta;
- }
-
static void record_steal_time(struct kvm_vcpu *vcpu)
{
- accumulate_steal_time(vcpu);
-
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
return;
- vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal;
- vcpu->arch.st.steal.version += 2;
- vcpu->arch.st.accum_steal = 0;
+ if (vcpu->arch.st.steal.version & 1)
+ vcpu->arch.st.steal.version += 1; /* first time write, random junk */
+
+ vcpu->arch.st.steal.version += 1;
+
+ kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+ &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
+
+ smp_wmb();
+
+ vcpu->arch.st.steal.steal += current->sched_info.run_delay -
+ vcpu->arch.st.last_steal;
+ vcpu->arch.st.last_steal = current->sched_info.run_delay;
+
+ kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+ &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
+
+ smp_wmb();
+
+ vcpu->arch.st.steal.version += 1;
kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
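/*
 * Illustrative sketch, not part of the patch: the version field above works
 * like a seqcount - odd while the host is updating the record, even when it
 * is stable - so a guest-side reader (hypothetical helper below) retries if
 * it observes an odd version or a version change across its read.
 */
#include <stdint.h>

struct example_steal_time {
	uint64_t steal;
	uint32_t version;
};

static uint64_t example_read_steal(const volatile struct example_steal_time *st)
{
	uint32_t version;
	uint64_t steal;

	do {
		version = st->version;
		__sync_synchronize();	/* pairs with the host's smp_wmb() */
		steal = st->steal;
		__sync_synchronize();
	} while ((version & 1) || version != st->version);

	return steal;
}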
r = KVM_MAX_MCE_BANKS;
break;
case KVM_CAP_XCRS:
- r = cpu_has_xsave;
+ r = boot_cpu_has(X86_FEATURE_XSAVE);
break;
case KVM_CAP_TSC_CONTROL:
r = kvm_has_tsc_control;
/* Set XSTATE_BV and possibly XCOMP_BV. */
xsave->header.xfeatures = xstate_bv;
- if (cpu_has_xsaves)
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED;
/*
static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
struct kvm_xsave *guest_xsave)
{
- if (cpu_has_xsave) {
+ if (boot_cpu_has(X86_FEATURE_XSAVE)) {
memset(guest_xsave, 0, sizeof(struct kvm_xsave));
fill_xsave((u8 *) guest_xsave->region, vcpu);
} else {
u64 xstate_bv =
*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
- if (cpu_has_xsave) {
+ if (boot_cpu_has(X86_FEATURE_XSAVE)) {
/*
* Here we allow setting states that are not present in
* CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility
static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
struct kvm_xcrs *guest_xcrs)
{
- if (!cpu_has_xsave) {
+ if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
guest_xcrs->nr_xcrs = 0;
return;
}
{
int i, r = 0;
- if (!cpu_has_xsave)
+ if (!boot_cpu_has(X86_FEATURE_XSAVE))
return -EINVAL;
if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
perf_register_guest_info_callbacks(&kvm_guest_cbs);
- if (cpu_has_xsave)
+ if (boot_cpu_has(X86_FEATURE_XSAVE))
host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
kvm_lapic_init();
static void fx_init(struct kvm_vcpu *vcpu)
{
fpstate_init(&vcpu->arch.guest_fpu.state);
- if (cpu_has_xsaves)
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv =
host_xcr0 | XSTATE_COMPACTION_ENABLED;
kvm_page_track_init(kvm);
kvm_mmu_init_vm(kvm);
+ if (kvm_x86_ops->vm_init)
+ return kvm_x86_ops->vm_init(kvm);
+
return 0;
}
x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 0, 0);
x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
}
+ if (kvm_x86_ops->vm_destroy)
+ kvm_x86_ops->vm_destroy(kvm);
kvm_iommu_unmap_guest(kvm);
kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic);
}
EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
+ bool kvm_arch_has_irq_bypass(void)
+ {
+ return kvm_x86_ops->update_pi_irte != NULL;
+ }
+
int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
struct irq_bypass_producer *prod)
{
struct kvm_kernel_irqfd *irqfd =
container_of(cons, struct kvm_kernel_irqfd, consumer);
- if (kvm_x86_ops->update_pi_irte) {
- irqfd->producer = prod;
- return kvm_x86_ops->update_pi_irte(irqfd->kvm,
- prod->irq, irqfd->gsi, 1);
- }
+ irqfd->producer = prod;
- return -EINVAL;
+ return kvm_x86_ops->update_pi_irte(irqfd->kvm,
+ prod->irq, irqfd->gsi, 1);
}
void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
struct kvm_kernel_irqfd *irqfd =
container_of(cons, struct kvm_kernel_irqfd, consumer);
- if (!kvm_x86_ops->update_pi_irte) {
- WARN_ON(irqfd->producer != NULL);
- return;
- }
-
WARN_ON(irqfd->producer != prod);
irqfd->producer = NULL;
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
#include "irq-gic-common.h"
+ static const struct gic_kvm_info *gic_kvm_info;
+
+ const struct gic_kvm_info *gic_get_kvm_info(void)
+ {
+ return gic_kvm_info;
+ }
+
+ void gic_set_kvm_info(const struct gic_kvm_info *info)
+ {
+ BUG_ON(gic_kvm_info != NULL);
+ gic_kvm_info = info;
+ }
+
void gic_enable_quirks(u32 iidr, const struct gic_quirk *quirks,
void *data)
{
else if (type & IRQ_TYPE_EDGE_BOTH)
val |= confmask;
+ /* If the current configuration is the same, then we are done */
+ if (val == oldval)
+ return 0;
+
/*
* Write back the new configuration, and possibly re-enable
- * the interrupt. If we tried to write a new configuration and failed,
- * return an error.
+ * the interrupt. If we fail to write a new configuration for
+ * an SPI then WARN and return an error. If we fail to write the
+ * configuration for a PPI this is most likely because the GIC
+ * does not allow us to set the configuration or we are in a
+ * non-secure mode, and hence it may not be catastrophic.
*/
writel_relaxed(val, base + GIC_DIST_CONFIG + confoff);
- if (readl_relaxed(base + GIC_DIST_CONFIG + confoff) != val && val != oldval)
- ret = -EINVAL;
+ if (readl_relaxed(base + GIC_DIST_CONFIG + confoff) != val) {
+ if (WARN_ON(irq >= 32))
+ ret = -EINVAL;
+ else
+ pr_warn("GIC: PPI%d is secure or misconfigured\n",
+ irq - 16);
+ }
if (sync_access)
sync_access();
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+ #define pr_fmt(fmt) "GICv3: " fmt
+
#include <linux/acpi.h>
#include <linux/cpu.h>
#include <linux/cpu_pm.h>
#include <linux/slab.h>
#include <linux/irqchip.h>
+ #include <linux/irqchip/arm-gic-common.h>
#include <linux/irqchip/arm-gic-v3.h>
+#include <linux/irqchip/irq-partition-percpu.h>
#include <asm/cputype.h>
#include <asm/exception.h>
};
struct gic_chip_data {
+ struct fwnode_handle *fwnode;
void __iomem *dist_base;
struct redist_region *redist_regions;
struct rdists rdists;
u64 redist_stride;
u32 nr_redist_regions;
unsigned int irq_nr;
+ struct partition_desc *ppi_descs[16];
};
static struct gic_chip_data gic_data __read_mostly;
static struct static_key supports_deactivate = STATIC_KEY_INIT_TRUE;
+ static struct gic_kvm_info gic_v3_kvm_info;
+
#define gic_data_rdist() (this_cpu_ptr(gic_data.rdists.rdist))
#define gic_data_rdist_rd_base() (gic_data_rdist()->rd_base)
#define gic_data_rdist_sgi_base() (gic_data_rdist_rd_base() + SZ_64K)
if (static_key_true(&supports_deactivate))
gic_write_dir(irqnr);
#ifdef CONFIG_SMP
+ /*
+ * Unlike GICv2, we don't need an smp_rmb() here.
+ * The control dependency from gic_read_iar to
+ * the ISB in gic_write_eoir is enough to ensure
+ * that any shared data read by handle_IPI will
+ * be read after the ACK.
+ */
handle_IPI(irqnr, regs);
#else
WARN_ONCE(true, "Unexpected SGI received!\n");
writel_relaxed(0, base + GICD_CTLR);
gic_dist_wait_for_rwp();
+ /*
+ * Configure SPIs as non-secure Group-1. This will only matter
+ * if the GIC only has a single security state. This will not
+ * do the right thing if the kernel is running in secure mode,
+ * but that's not the intended use case anyway.
+ */
+ for (i = 32; i < gic_data.irq_nr; i += 32)
+ writel_relaxed(~0, base + GICD_IGROUPR + i / 8);
+
gic_dist_config(base, gic_data.irq_nr, gic_dist_wait_for_rwp);
/* Enable distributor with ARE, Group1 */
rbase = gic_data_rdist_sgi_base();
+ /* Configure SGIs/PPIs as non-secure Group-1 */
+ writel_relaxed(~0, rbase + GICR_IGROUPR0);
+
gic_cpu_config(rbase, gic_redist_wait_for_rwp);
/* Give LPIs a spin */
}
}
+static int gic_irq_domain_select(struct irq_domain *d,
+ struct irq_fwspec *fwspec,
+ enum irq_domain_bus_token bus_token)
+{
+ /* Not for us */
+ if (fwspec->fwnode != d->fwnode)
+ return 0;
+
+ /* If this is not DT, then we have a single domain */
+ if (!is_of_node(fwspec->fwnode))
+ return 1;
+
+ /*
+ * If this is a PPI and we have a 4th (non-null) parameter,
+ * then we need to match the partition domain.
+ */
+ if (fwspec->param_count >= 4 &&
+ fwspec->param[0] == 1 && fwspec->param[3] != 0)
+ return d == partition_get_domain(gic_data.ppi_descs[fwspec->param[1]]);
+
+ return d == gic_data.domain;
+}
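/*
 * Illustrative sketch, not part of the patch: a 4-cell interrupt specifier
 * as gic_irq_domain_select() above would see it in the fwspec. Cell 0
 * selects a PPI (1), cell 1 is the PPI number, cell 2 the trigger type and
 * a non-zero cell 3 is the phandle of the ppi-partition node to match.
 * The concrete values are made up for illustration.
 */
static const unsigned int example_ppi_partition_fwspec[4] = {
	1,	/* GIC_PPI */
	9,	/* PPI 9, e.g. a per-partition maintenance interrupt */
	4,	/* IRQ_TYPE_LEVEL_HIGH */
	0x42,	/* phandle of the matching ppi-partition child node */
};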
+
static const struct irq_domain_ops gic_irq_domain_ops = {
.translate = gic_irq_domain_translate,
.alloc = gic_irq_domain_alloc,
.free = gic_irq_domain_free,
+ .select = gic_irq_domain_select,
+};
+
+static int partition_domain_translate(struct irq_domain *d,
+ struct irq_fwspec *fwspec,
+ unsigned long *hwirq,
+ unsigned int *type)
+{
+ struct device_node *np;
+ int ret;
+
+ np = of_find_node_by_phandle(fwspec->param[3]);
+ if (WARN_ON(!np))
+ return -EINVAL;
+
+ ret = partition_translate_id(gic_data.ppi_descs[fwspec->param[1]],
+ of_node_to_fwnode(np));
+ if (ret < 0)
+ return ret;
+
+ *hwirq = ret;
+ *type = fwspec->param[2] & IRQ_TYPE_SENSE_MASK;
+
+ return 0;
+}
+
+static const struct irq_domain_ops partition_domain_ops = {
+ .translate = partition_domain_translate,
+ .select = gic_irq_domain_select,
};
static void gicv3_enable_quirks(void)
if (static_key_true(&supports_deactivate))
pr_info("GIC: Using split EOI/Deactivate mode\n");
+ gic_data.fwnode = handle;
gic_data.dist_base = dist_base;
gic_data.redist_regions = rdist_regs;
gic_data.nr_redist_regions = nr_redist_regions;
return 0;
}
- static void gic_populate_ppi_partitions(struct device_node *gic_node)
+static int get_cpu_number(struct device_node *dn)
+{
+ const __be32 *cell;
+ u64 hwid;
+ int i;
+
+ cell = of_get_property(dn, "reg", NULL);
+ if (!cell)
+ return -1;
+
+ hwid = of_read_number(cell, of_n_addr_cells(dn));
+
+ /*
+ * Non affinity bits must be set to 0 in the DT
+ */
+ if (hwid & ~MPIDR_HWID_BITMASK)
+ return -1;
+
+ for (i = 0; i < num_possible_cpus(); i++)
+ if (cpu_logical_map(i) == hwid)
+ return i;
+
+ return -1;
+}
+
+/* Create all possible partitions at boot time */
+static void __init gic_populate_ppi_partitions(struct device_node *gic_node)
+{
+ struct device_node *parts_node, *child_part;
+ int part_idx = 0, i;
+ int nr_parts;
+ struct partition_affinity *parts;
+
+ parts_node = of_find_node_by_name(gic_node, "ppi-partitions");
+ if (!parts_node)
+ return;
+
+ nr_parts = of_get_child_count(parts_node);
+
+ if (!nr_parts)
+ return;
+
+ parts = kzalloc(sizeof(*parts) * nr_parts, GFP_KERNEL);
+ if (WARN_ON(!parts))
+ return;
+
+ for_each_child_of_node(parts_node, child_part) {
+ struct partition_affinity *part;
+ int n;
+
+ part = &parts[part_idx];
+
+ part->partition_id = of_node_to_fwnode(child_part);
+
+ pr_info("GIC: PPI partition %s[%d] { ",
+ child_part->name, part_idx);
+
+ n = of_property_count_elems_of_size(child_part, "affinity",
+ sizeof(u32));
+ WARN_ON(n <= 0);
+
+ for (i = 0; i < n; i++) {
+ int err, cpu;
+ u32 cpu_phandle;
+ struct device_node *cpu_node;
+
+ err = of_property_read_u32_index(child_part, "affinity",
+ i, &cpu_phandle);
+ if (WARN_ON(err))
+ continue;
+
+ cpu_node = of_find_node_by_phandle(cpu_phandle);
+ if (WARN_ON(!cpu_node))
+ continue;
+
+ cpu = get_cpu_number(cpu_node);
+ if (WARN_ON(cpu == -1))
+ continue;
+
+ pr_cont("%s[%d] ", cpu_node->full_name, cpu);
+
+ cpumask_set_cpu(cpu, &part->mask);
+ }
+
+ pr_cont("}\n");
+ part_idx++;
+ }
+
+ for (i = 0; i < 16; i++) {
+ unsigned int irq;
+ struct partition_desc *desc;
+ struct irq_fwspec ppi_fwspec = {
+ .fwnode = gic_data.fwnode,
+ .param_count = 3,
+ .param = {
+ [0] = 1,
+ [1] = i,
+ [2] = IRQ_TYPE_NONE,
+ },
+ };
+
+ irq = irq_create_fwspec_mapping(&ppi_fwspec);
+ if (WARN_ON(!irq))
+ continue;
+ desc = partition_create_desc(gic_data.fwnode, parts, nr_parts,
+ irq, &partition_domain_ops);
+ if (WARN_ON(!desc))
+ continue;
+
+ gic_data.ppi_descs[i] = desc;
+ }
+}
+
+ static void __init gic_of_setup_kvm_info(struct device_node *node)
+ {
+ int ret;
+ struct resource r;
+ u32 gicv_idx;
+
+ gic_v3_kvm_info.type = GIC_V3;
+
+ gic_v3_kvm_info.maint_irq = irq_of_parse_and_map(node, 0);
+ if (!gic_v3_kvm_info.maint_irq)
+ return;
+
+ if (of_property_read_u32(node, "#redistributor-regions",
+ &gicv_idx))
+ gicv_idx = 1;
+
+ gicv_idx += 3; /* Also skip GICD, GICC, GICH */
+ ret = of_address_to_resource(node, gicv_idx, &r);
+ if (!ret)
+ gic_v3_kvm_info.vcpu = r;
+
+ gic_set_kvm_info(&gic_v3_kvm_info);
+ }
+
static int __init gic_of_init(struct device_node *node, struct device_node *parent)
{
void __iomem *dist_base;
err = gic_init_bases(dist_base, rdist_regs, nr_redist_regions,
redist_stride, &node->fwnode);
- if (!err) {
- gic_of_setup_kvm_info(node);
- return 0;
- }
+ if (err)
+ goto out_unmap_rdist;
+
+ gic_populate_ppi_partitions(node);
+ gic_of_setup_kvm_info(node);
+ return 0;
out_unmap_rdist:
for (i = 0; i < nr_redist_regions; i++)
IRQCHIP_DECLARE(gic_v3, "arm,gic-v3", gic_of_init);
#ifdef CONFIG_ACPI
- static void __iomem *dist_base;
- static struct redist_region *redist_regs __initdata;
- static u32 nr_redist_regions __initdata;
- static bool single_redist;
+ static struct
+ {
+ void __iomem *dist_base;
+ struct redist_region *redist_regs;
+ u32 nr_redist_regions;
+ bool single_redist;
+ u32 maint_irq;
+ int maint_irq_mode;
+ phys_addr_t vcpu_base;
+ } acpi_data __initdata;
static void __init
gic_acpi_register_redist(phys_addr_t phys_base, void __iomem *redist_base)
{
static int count = 0;
- redist_regs[count].phys_base = phys_base;
- redist_regs[count].redist_base = redist_base;
- redist_regs[count].single_redist = single_redist;
+ acpi_data.redist_regs[count].phys_base = phys_base;
+ acpi_data.redist_regs[count].redist_base = redist_base;
+ acpi_data.redist_regs[count].single_redist = acpi_data.single_redist;
count++;
}
{
struct acpi_madt_generic_interrupt *gicc =
(struct acpi_madt_generic_interrupt *)header;
- u32 reg = readl_relaxed(dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK;
+ u32 reg = readl_relaxed(acpi_data.dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK;
u32 size = reg == GIC_PIDR2_ARCH_GICv4 ? SZ_64K * 4 : SZ_64K * 2;
void __iomem *redist_base;
acpi_tbl_entry_handler redist_parser;
enum acpi_madt_type type;
- if (single_redist) {
+ if (acpi_data.single_redist) {
type = ACPI_MADT_TYPE_GENERIC_INTERRUPT;
redist_parser = gic_acpi_parse_madt_gicc;
} else {
count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR,
gic_acpi_match_gicr, 0);
if (count > 0) {
- single_redist = false;
+ acpi_data.single_redist = false;
return count;
}
count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT,
gic_acpi_match_gicc, 0);
if (count > 0)
- single_redist = true;
+ acpi_data.single_redist = true;
return count;
}
if (count <= 0)
return false;
- nr_redist_regions = count;
+ acpi_data.nr_redist_regions = count;
return true;
}
+ static int __init gic_acpi_parse_virt_madt_gicc(struct acpi_subtable_header *header,
+ const unsigned long end)
+ {
+ struct acpi_madt_generic_interrupt *gicc =
+ (struct acpi_madt_generic_interrupt *)header;
+ int maint_irq_mode;
+ static int first_madt = true;
+
+ /* Skip unusable CPUs */
+ if (!(gicc->flags & ACPI_MADT_ENABLED))
+ return 0;
+
+ maint_irq_mode = (gicc->flags & ACPI_MADT_VGIC_IRQ_MODE) ?
+ ACPI_EDGE_SENSITIVE : ACPI_LEVEL_SENSITIVE;
+
+ if (first_madt) {
+ first_madt = false;
+
+ acpi_data.maint_irq = gicc->vgic_interrupt;
+ acpi_data.maint_irq_mode = maint_irq_mode;
+ acpi_data.vcpu_base = gicc->gicv_base_address;
+
+ return 0;
+ }
+
+ /*
+ * The maintenance interrupt and GICV should be the same for every CPU
+ */
+ if ((acpi_data.maint_irq != gicc->vgic_interrupt) ||
+ (acpi_data.maint_irq_mode != maint_irq_mode) ||
+ (acpi_data.vcpu_base != gicc->gicv_base_address))
+ return -EINVAL;
+
+ return 0;
+ }
+
+ static bool __init gic_acpi_collect_virt_info(void)
+ {
+ int count;
+
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT,
+ gic_acpi_parse_virt_madt_gicc, 0);
+
+ return (count > 0);
+ }
+
#define ACPI_GICV3_DIST_MEM_SIZE (SZ_64K)
+ #define ACPI_GICV2_VCTRL_MEM_SIZE (SZ_4K)
+ #define ACPI_GICV2_VCPU_MEM_SIZE (SZ_8K)
+
+ static void __init gic_acpi_setup_kvm_info(void)
+ {
+ int irq;
+
+ if (!gic_acpi_collect_virt_info()) {
+ pr_warn("Unable to get hardware information used for virtualization\n");
+ return;
+ }
+
+ gic_v3_kvm_info.type = GIC_V3;
+
+ irq = acpi_register_gsi(NULL, acpi_data.maint_irq,
+ acpi_data.maint_irq_mode,
+ ACPI_ACTIVE_HIGH);
+ if (irq <= 0)
+ return;
+
+ gic_v3_kvm_info.maint_irq = irq;
+
+ if (acpi_data.vcpu_base) {
+ struct resource *vcpu = &gic_v3_kvm_info.vcpu;
+
+ vcpu->flags = IORESOURCE_MEM;
+ vcpu->start = acpi_data.vcpu_base;
+ vcpu->end = vcpu->start + ACPI_GICV2_VCPU_MEM_SIZE - 1;
+ }
+
+ gic_set_kvm_info(&gic_v3_kvm_info);
+ }
static int __init
gic_acpi_init(struct acpi_subtable_header *header, const unsigned long end)
{
struct acpi_madt_generic_distributor *dist;
struct fwnode_handle *domain_handle;
+ size_t size;
int i, err;
/* Get distributor base address */
dist = (struct acpi_madt_generic_distributor *)header;
- dist_base = ioremap(dist->base_address, ACPI_GICV3_DIST_MEM_SIZE);
- if (!dist_base) {
+ acpi_data.dist_base = ioremap(dist->base_address,
+ ACPI_GICV3_DIST_MEM_SIZE);
+ if (!acpi_data.dist_base) {
pr_err("Unable to map GICD registers\n");
return -ENOMEM;
}
- err = gic_validate_dist_version(dist_base);
+ err = gic_validate_dist_version(acpi_data.dist_base);
if (err) {
- pr_err("No distributor detected at @%p, giving up", dist_base);
+ pr_err("No distributor detected at @%p, giving up",
+ acpi_data.dist_base);
goto out_dist_unmap;
}
- redist_regs = kzalloc(sizeof(*redist_regs) * nr_redist_regions,
- GFP_KERNEL);
- if (!redist_regs) {
+ size = sizeof(*acpi_data.redist_regs) * acpi_data.nr_redist_regions;
+ acpi_data.redist_regs = kzalloc(size, GFP_KERNEL);
+ if (!acpi_data.redist_regs) {
err = -ENOMEM;
goto out_dist_unmap;
}
if (err)
goto out_redist_unmap;
- domain_handle = irq_domain_alloc_fwnode(dist_base);
+ domain_handle = irq_domain_alloc_fwnode(acpi_data.dist_base);
if (!domain_handle) {
err = -ENOMEM;
goto out_redist_unmap;
}
- err = gic_init_bases(dist_base, redist_regs, nr_redist_regions, 0,
- domain_handle);
+ err = gic_init_bases(acpi_data.dist_base, acpi_data.redist_regs,
+ acpi_data.nr_redist_regions, 0, domain_handle);
if (err)
goto out_fwhandle_free;
acpi_set_irq_model(ACPI_IRQ_MODEL_GIC, domain_handle);
+ gic_acpi_setup_kvm_info();
+
return 0;
out_fwhandle_free:
irq_domain_free_fwnode(domain_handle);
out_redist_unmap:
- for (i = 0; i < nr_redist_regions; i++)
- if (redist_regs[i].redist_base)
- iounmap(redist_regs[i].redist_base);
- kfree(redist_regs);
+ for (i = 0; i < acpi_data.nr_redist_regions; i++)
+ if (acpi_data.redist_regs[i].redist_base)
+ iounmap(acpi_data.redist_regs[i].redist_base);
+ kfree(acpi_data.redist_regs);
out_dist_unmap:
- iounmap(dist_base);
+ iounmap(acpi_data.dist_base);
return err;
}
IRQCHIP_ACPI_DECLARE(gic_v3, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR,
static void gic_check_cpu_features(void)
{
- WARN_TAINT_ONCE(cpus_have_cap(ARM64_HAS_SYSREG_GIC_CPUIF),
+ WARN_TAINT_ONCE(this_cpu_has_cap(ARM64_HAS_SYSREG_GIC_CPUIF),
TAINT_CPU_OUT_OF_SPEC,
"GICv3 system registers enabled, broken firmware!\n");
}
struct irq_chip chip;
union gic_base dist_base;
union gic_base cpu_base;
+ void __iomem *raw_dist_base;
+ void __iomem *raw_cpu_base;
+ u32 percpu_offset;
#ifdef CONFIG_CPU_PM
u32 saved_spi_enable[DIV_ROUND_UP(1020, 32)];
u32 saved_spi_active[DIV_ROUND_UP(1020, 32)];
static struct gic_chip_data gic_data[CONFIG_ARM_GIC_MAX_NR] __read_mostly;
+ static struct gic_kvm_info gic_v2_kvm_info;
+
#ifdef CONFIG_GIC_NON_BANKED
static void __iomem *gic_get_percpu_base(union gic_base *base)
{
if (static_key_true(&supports_deactivate))
writel_relaxed(irqstat, cpu_base + GIC_CPU_DEACTIVATE);
#ifdef CONFIG_SMP
+ /*
+ * Ensure any shared data written by the CPU sending
+ * the IPI is read after we've read the ACK register
+ * on the GIC.
+ *
+ * Pairs with the write barrier in gic_raise_softirq
+ */
+ smp_rmb();
handle_IPI(irqnr, regs);
#endif
continue;
IRQCHIP_MASK_ON_SUSPEND,
};
-static struct irq_chip gic_eoimode1_chip = {
- .name = "GICv2",
- .irq_mask = gic_eoimode1_mask_irq,
- .irq_unmask = gic_unmask_irq,
- .irq_eoi = gic_eoimode1_eoi_irq,
- .irq_set_type = gic_set_type,
- .irq_get_irqchip_state = gic_irq_get_irqchip_state,
- .irq_set_irqchip_state = gic_irq_set_irqchip_state,
- .irq_set_vcpu_affinity = gic_irq_set_vcpu_affinity,
- .flags = IRQCHIP_SET_TYPE_MASKED |
- IRQCHIP_SKIP_SET_WAKE |
- IRQCHIP_MASK_ON_SUSPEND,
-};
-
void __init gic_cascade_irq(unsigned int gic_nr, unsigned int irq)
{
BUG_ON(gic_nr >= CONFIG_ARM_GIC_MAX_NR);
writel_relaxed(GICD_ENABLE, base + GIC_DIST_CTRL);
}
-static void gic_cpu_init(struct gic_chip_data *gic)
+static int gic_cpu_init(struct gic_chip_data *gic)
{
void __iomem *dist_base = gic_data_dist_base(gic);
void __iomem *base = gic_data_cpu_base(gic);
/*
* Get what the GIC says our CPU mask is.
*/
- BUG_ON(cpu >= NR_GIC_CPU_IF);
+ if (WARN_ON(cpu >= NR_GIC_CPU_IF))
+ return -EINVAL;
+
+ gic_check_cpu_features();
cpu_mask = gic_get_cpumask(gic);
gic_cpu_map[cpu] = cpu_mask;
writel_relaxed(GICC_INT_PRI_THRESHOLD, base + GIC_CPU_PRIMASK);
gic_cpu_if_up(gic);
+
+ return 0;
}
int gic_cpu_if_down(unsigned int gic_nr)
* this function, no interrupts will be delivered by the GIC, and another
* platform-specific wakeup source must be enabled.
*/
-static void gic_dist_save(unsigned int gic_nr)
+static void gic_dist_save(struct gic_chip_data *gic)
{
unsigned int gic_irqs;
void __iomem *dist_base;
int i;
- BUG_ON(gic_nr >= CONFIG_ARM_GIC_MAX_NR);
+ if (WARN_ON(!gic))
+ return;
- gic_irqs = gic_data[gic_nr].gic_irqs;
- dist_base = gic_data_dist_base(&gic_data[gic_nr]);
+ gic_irqs = gic->gic_irqs;
+ dist_base = gic_data_dist_base(gic);
if (!dist_base)
return;
for (i = 0; i < DIV_ROUND_UP(gic_irqs, 16); i++)
- gic_data[gic_nr].saved_spi_conf[i] =
+ gic->saved_spi_conf[i] =
readl_relaxed(dist_base + GIC_DIST_CONFIG + i * 4);
for (i = 0; i < DIV_ROUND_UP(gic_irqs, 4); i++)
- gic_data[gic_nr].saved_spi_target[i] =
+ gic->saved_spi_target[i] =
readl_relaxed(dist_base + GIC_DIST_TARGET + i * 4);
for (i = 0; i < DIV_ROUND_UP(gic_irqs, 32); i++)
- gic_data[gic_nr].saved_spi_enable[i] =
+ gic->saved_spi_enable[i] =
readl_relaxed(dist_base + GIC_DIST_ENABLE_SET + i * 4);
for (i = 0; i < DIV_ROUND_UP(gic_irqs, 32); i++)
- gic_data[gic_nr].saved_spi_active[i] =
+ gic->saved_spi_active[i] =
readl_relaxed(dist_base + GIC_DIST_ACTIVE_SET + i * 4);
}
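The divisors in the loops above reflect the GICv2 distributor register layout: GIC_DIST_CONFIG holds 2 bits per interrupt (16 interrupts per 32-bit register), GIC_DIST_TARGET 8 bits (4 per register), and the enable/active set registers 1 bit (32 per register). A standalone sketch of that arithmetic, with an example interrupt count chosen arbitrarily:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int gic_irqs = 160;	/* example SPI count, not from real hardware */

	/* 2 config bits per irq  -> 16 irqs per 32-bit GIC_DIST_CONFIG register */
	printf("config registers: %u\n", DIV_ROUND_UP(gic_irqs, 16));
	/* 8 target bits per irq  ->  4 irqs per 32-bit GIC_DIST_TARGET register */
	printf("target registers: %u\n", DIV_ROUND_UP(gic_irqs, 4));
	/* 1 enable/active bit    -> 32 irqs per 32-bit set/clear register */
	printf("enable registers: %u\n", DIV_ROUND_UP(gic_irqs, 32));
	return 0;
}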
* handled normally, but any edge interrupts that occurred will not be seen by
* the GIC and need to be handled by the platform-specific wakeup source.
*/
-static void gic_dist_restore(unsigned int gic_nr)
+static void gic_dist_restore(struct gic_chip_data *gic)
{
unsigned int gic_irqs;
unsigned int i;
void __iomem *dist_base;
- BUG_ON(gic_nr >= CONFIG_ARM_GIC_MAX_NR);
+ if (WARN_ON(!gic))
+ return;
- gic_irqs = gic_data[gic_nr].gic_irqs;
- dist_base = gic_data_dist_base(&gic_data[gic_nr]);
+ gic_irqs = gic->gic_irqs;
+ dist_base = gic_data_dist_base(gic);
if (!dist_base)
return;
writel_relaxed(GICD_DISABLE, dist_base + GIC_DIST_CTRL);
for (i = 0; i < DIV_ROUND_UP(gic_irqs, 16); i++)
- writel_relaxed(gic_data[gic_nr].saved_spi_conf[i],
+ writel_relaxed(gic->saved_spi_conf[i],
dist_base + GIC_DIST_CONFIG + i * 4);
for (i = 0; i < DIV_ROUND_UP(gic_irqs, 4); i++)
dist_base + GIC_DIST_PRI + i * 4);
for (i = 0; i < DIV_ROUND_UP(gic_irqs, 4); i++)
- writel_relaxed(gic_data[gic_nr].saved_spi_target[i],
+ writel_relaxed(gic->saved_spi_target[i],
dist_base + GIC_DIST_TARGET + i * 4);
for (i = 0; i < DIV_ROUND_UP(gic_irqs, 32); i++) {
writel_relaxed(GICD_INT_EN_CLR_X32,
dist_base + GIC_DIST_ENABLE_CLEAR + i * 4);
- writel_relaxed(gic_data[gic_nr].saved_spi_enable[i],
+ writel_relaxed(gic->saved_spi_enable[i],
dist_base + GIC_DIST_ENABLE_SET + i * 4);
}
for (i = 0; i < DIV_ROUND_UP(gic_irqs, 32); i++) {
writel_relaxed(GICD_INT_EN_CLR_X32,
dist_base + GIC_DIST_ACTIVE_CLEAR + i * 4);
- writel_relaxed(gic_data[gic_nr].saved_spi_active[i],
+ writel_relaxed(gic->saved_spi_active[i],
dist_base + GIC_DIST_ACTIVE_SET + i * 4);
}
writel_relaxed(GICD_ENABLE, dist_base + GIC_DIST_CTRL);
}
-static void gic_cpu_save(unsigned int gic_nr)
+static void gic_cpu_save(struct gic_chip_data *gic)
{
int i;
u32 *ptr;
void __iomem *dist_base;
void __iomem *cpu_base;
- BUG_ON(gic_nr >= CONFIG_ARM_GIC_MAX_NR);
+ if (WARN_ON(!gic))
+ return;
- dist_base = gic_data_dist_base(&gic_data[gic_nr]);
- cpu_base = gic_data_cpu_base(&gic_data[gic_nr]);
+ dist_base = gic_data_dist_base(gic);
+ cpu_base = gic_data_cpu_base(gic);
if (!dist_base || !cpu_base)
return;
- ptr = raw_cpu_ptr(gic_data[gic_nr].saved_ppi_enable);
+ ptr = raw_cpu_ptr(gic->saved_ppi_enable);
for (i = 0; i < DIV_ROUND_UP(32, 32); i++)
ptr[i] = readl_relaxed(dist_base + GIC_DIST_ENABLE_SET + i * 4);
- ptr = raw_cpu_ptr(gic_data[gic_nr].saved_ppi_active);
+ ptr = raw_cpu_ptr(gic->saved_ppi_active);
for (i = 0; i < DIV_ROUND_UP(32, 32); i++)
ptr[i] = readl_relaxed(dist_base + GIC_DIST_ACTIVE_SET + i * 4);
- ptr = raw_cpu_ptr(gic_data[gic_nr].saved_ppi_conf);
+ ptr = raw_cpu_ptr(gic->saved_ppi_conf);
for (i = 0; i < DIV_ROUND_UP(32, 16); i++)
ptr[i] = readl_relaxed(dist_base + GIC_DIST_CONFIG + i * 4);
}
-static void gic_cpu_restore(unsigned int gic_nr)
+static void gic_cpu_restore(struct gic_chip_data *gic)
{
int i;
u32 *ptr;
void __iomem *dist_base;
void __iomem *cpu_base;
- BUG_ON(gic_nr >= CONFIG_ARM_GIC_MAX_NR);
+ if (WARN_ON(!gic))
+ return;
- dist_base = gic_data_dist_base(&gic_data[gic_nr]);
- cpu_base = gic_data_cpu_base(&gic_data[gic_nr]);
+ dist_base = gic_data_dist_base(gic);
+ cpu_base = gic_data_cpu_base(gic);
if (!dist_base || !cpu_base)
return;
- ptr = raw_cpu_ptr(gic_data[gic_nr].saved_ppi_enable);
+ ptr = raw_cpu_ptr(gic->saved_ppi_enable);
for (i = 0; i < DIV_ROUND_UP(32, 32); i++) {
writel_relaxed(GICD_INT_EN_CLR_X32,
dist_base + GIC_DIST_ENABLE_CLEAR + i * 4);
writel_relaxed(ptr[i], dist_base + GIC_DIST_ENABLE_SET + i * 4);
}
- ptr = raw_cpu_ptr(gic_data[gic_nr].saved_ppi_active);
+ ptr = raw_cpu_ptr(gic->saved_ppi_active);
for (i = 0; i < DIV_ROUND_UP(32, 32); i++) {
writel_relaxed(GICD_INT_EN_CLR_X32,
dist_base + GIC_DIST_ACTIVE_CLEAR + i * 4);
writel_relaxed(ptr[i], dist_base + GIC_DIST_ACTIVE_SET + i * 4);
}
- ptr = raw_cpu_ptr(gic_data[gic_nr].saved_ppi_conf);
+ ptr = raw_cpu_ptr(gic->saved_ppi_conf);
for (i = 0; i < DIV_ROUND_UP(32, 16); i++)
writel_relaxed(ptr[i], dist_base + GIC_DIST_CONFIG + i * 4);
dist_base + GIC_DIST_PRI + i * 4);
writel_relaxed(GICC_INT_PRI_THRESHOLD, cpu_base + GIC_CPU_PRIMASK);
- gic_cpu_if_up(&gic_data[gic_nr]);
+ gic_cpu_if_up(gic);
}
static int gic_notifier(struct notifier_block *self, unsigned long cmd, void *v)
#endif
switch (cmd) {
case CPU_PM_ENTER:
- gic_cpu_save(i);
+ gic_cpu_save(&gic_data[i]);
break;
case CPU_PM_ENTER_FAILED:
case CPU_PM_EXIT:
- gic_cpu_restore(i);
+ gic_cpu_restore(&gic_data[i]);
break;
case CPU_CLUSTER_PM_ENTER:
- gic_dist_save(i);
+ gic_dist_save(&gic_data[i]);
break;
case CPU_CLUSTER_PM_ENTER_FAILED:
case CPU_CLUSTER_PM_EXIT:
- gic_dist_restore(i);
+ gic_dist_restore(&gic_data[i]);
break;
}
}
.notifier_call = gic_notifier,
};
-static void __init gic_pm_init(struct gic_chip_data *gic)
+static int __init gic_pm_init(struct gic_chip_data *gic)
{
gic->saved_ppi_enable = __alloc_percpu(DIV_ROUND_UP(32, 32) * 4,
sizeof(u32));
- BUG_ON(!gic->saved_ppi_enable);
+ if (WARN_ON(!gic->saved_ppi_enable))
+ return -ENOMEM;
gic->saved_ppi_active = __alloc_percpu(DIV_ROUND_UP(32, 32) * 4,
sizeof(u32));
- BUG_ON(!gic->saved_ppi_active);
+ if (WARN_ON(!gic->saved_ppi_active))
+ goto free_ppi_enable;
gic->saved_ppi_conf = __alloc_percpu(DIV_ROUND_UP(32, 16) * 4,
sizeof(u32));
- BUG_ON(!gic->saved_ppi_conf);
+ if (WARN_ON(!gic->saved_ppi_conf))
+ goto free_ppi_active;
if (gic == &gic_data[0])
cpu_pm_register_notifier(&gic_notifier_block);
+
+ return 0;
+
+free_ppi_active:
+ free_percpu(gic->saved_ppi_active);
+free_ppi_enable:
+ free_percpu(gic->saved_ppi_enable);
+
+ return -ENOMEM;
}
#else
-static void __init gic_pm_init(struct gic_chip_data *gic)
+static int __init gic_pm_init(struct gic_chip_data *gic)
{
+ return 0;
}
#endif
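The BUG_ON()s in gic_pm_init() above become WARN_ON() plus the usual unwind-on-error idiom: a failed allocation jumps past its own cleanup label, so only the allocations that actually succeeded are freed. A minimal userspace sketch of the same pattern (illustrative only, names invented):

#include <errno.h>
#include <stdlib.h>

struct state { void *a, *b, *c; };

static int setup(struct state *s)
{
	s->a = malloc(32);
	if (!s->a)
		return -ENOMEM;

	s->b = malloc(32);
	if (!s->b)
		goto free_a;

	s->c = malloc(32);
	if (!s->c)
		goto free_b;

	return 0;

free_b:
	free(s->b);
free_a:
	free(s->a);
	return -ENOMEM;
}

int main(void)
{
	struct state s = { 0 };

	if (setup(&s))
		return 1;
	free(s.c);
	free(s.b);
	free(s.a);
	return 0;
}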
.unmap = gic_irq_domain_unmap,
};
-static void __init __gic_init_bases(unsigned int gic_nr, int irq_start,
- void __iomem *dist_base, void __iomem *cpu_base,
- u32 percpu_offset, struct fwnode_handle *handle)
+static int __init __gic_init_bases(struct gic_chip_data *gic, int irq_start,
+ struct fwnode_handle *handle)
{
irq_hw_number_t hwirq_base;
- struct gic_chip_data *gic;
- int gic_irqs, irq_base, i;
-
- BUG_ON(gic_nr >= CONFIG_ARM_GIC_MAX_NR);
+ int gic_irqs, irq_base, i, ret;
- gic_check_cpu_features();
-
- gic = &gic_data[gic_nr];
+ if (WARN_ON(!gic || gic->domain))
+ return -EINVAL;
/* Initialize irq_chip */
- if (static_key_true(&supports_deactivate) && gic_nr == 0) {
- gic->chip = gic_eoimode1_chip;
+ gic->chip = gic_chip;
+
+ if (static_key_true(&supports_deactivate) && gic == &gic_data[0]) {
+ gic->chip.irq_mask = gic_eoimode1_mask_irq;
+ gic->chip.irq_eoi = gic_eoimode1_eoi_irq;
+ gic->chip.irq_set_vcpu_affinity = gic_irq_set_vcpu_affinity;
+ gic->chip.name = kasprintf(GFP_KERNEL, "GICv2");
} else {
- gic->chip = gic_chip;
- gic->chip.name = kasprintf(GFP_KERNEL, "GIC-%d", gic_nr);
+ gic->chip.name = kasprintf(GFP_KERNEL, "GIC-%d",
+ (int)(gic - &gic_data[0]));
}
#ifdef CONFIG_SMP
- if (gic_nr == 0)
+ if (gic == &gic_data[0])
gic->chip.irq_set_affinity = gic_set_affinity;
#endif
-#ifdef CONFIG_GIC_NON_BANKED
- if (percpu_offset) { /* Frankein-GIC without banked registers... */
+ if (IS_ENABLED(CONFIG_GIC_NON_BANKED) && gic->percpu_offset) {
+ /* Frankein-GIC without banked registers... */
unsigned int cpu;
gic->dist_base.percpu_base = alloc_percpu(void __iomem *);
gic->cpu_base.percpu_base = alloc_percpu(void __iomem *);
if (WARN_ON(!gic->dist_base.percpu_base ||
!gic->cpu_base.percpu_base)) {
- free_percpu(gic->dist_base.percpu_base);
- free_percpu(gic->cpu_base.percpu_base);
- return;
+ ret = -ENOMEM;
+ goto error;
}
for_each_possible_cpu(cpu) {
u32 mpidr = cpu_logical_map(cpu);
u32 core_id = MPIDR_AFFINITY_LEVEL(mpidr, 0);
- unsigned long offset = percpu_offset * core_id;
- *per_cpu_ptr(gic->dist_base.percpu_base, cpu) = dist_base + offset;
- *per_cpu_ptr(gic->cpu_base.percpu_base, cpu) = cpu_base + offset;
+ unsigned long offset = gic->percpu_offset * core_id;
+ *per_cpu_ptr(gic->dist_base.percpu_base, cpu) =
+ gic->raw_dist_base + offset;
+ *per_cpu_ptr(gic->cpu_base.percpu_base, cpu) =
+ gic->raw_cpu_base + offset;
}
gic_set_base_accessor(gic, gic_get_percpu_base);
- } else
-#endif
- { /* Normal, sane GIC... */
- WARN(percpu_offset,
+ } else {
+ /* Normal, sane GIC... */
+ WARN(gic->percpu_offset,
"GIC_NON_BANKED not enabled, ignoring %08x offset!",
- percpu_offset);
- gic->dist_base.common_base = dist_base;
- gic->cpu_base.common_base = cpu_base;
+ gic->percpu_offset);
+ gic->dist_base.common_base = gic->raw_dist_base;
+ gic->cpu_base.common_base = gic->raw_cpu_base;
gic_set_base_accessor(gic, gic_get_common_base);
}
* For primary GICs, skip over SGIs.
* For secondary GICs, skip over PPIs, too.
*/
- if (gic_nr == 0 && (irq_start & 31) > 0) {
+ if (gic == &gic_data[0] && (irq_start & 31) > 0) {
hwirq_base = 16;
if (irq_start != -1)
irq_start = (irq_start & ~31) + 16;
hwirq_base, &gic_irq_domain_ops, gic);
}
- if (WARN_ON(!gic->domain))
- return;
+ if (WARN_ON(!gic->domain)) {
+ ret = -ENODEV;
+ goto error;
+ }
- if (gic_nr == 0) {
+ if (gic == &gic_data[0]) {
/*
* Initialize the CPU interface map to all CPUs.
* It will be refined as each CPU probes its ID.
}
gic_dist_init(gic);
- gic_cpu_init(gic);
- gic_pm_init(gic);
+ ret = gic_cpu_init(gic);
+ if (ret)
+ goto error;
+
+ ret = gic_pm_init(gic);
+ if (ret)
+ goto error;
+
+ return 0;
+
+error:
+ if (IS_ENABLED(CONFIG_GIC_NON_BANKED) && gic->percpu_offset) {
+ free_percpu(gic->dist_base.percpu_base);
+ free_percpu(gic->cpu_base.percpu_base);
+ }
+
+ kfree(gic->chip.name);
+
+ return ret;
}
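In the non-banked ("Frankein-GIC") branch of __gic_init_bases() above, each CPU's distributor and CPU-interface addresses are derived from a single mapping plus percpu_offset scaled by the CPU's affinity-level-0 ID. A standalone sketch of that address arithmetic (all values are invented examples; the loop index stands in for MPIDR Aff0):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uintptr_t raw_dist_base = 0x2c001000;	/* example address, not real hardware */
	uintptr_t raw_cpu_base  = 0x2c002000;
	uint32_t  percpu_offset = 0x8000;	/* from the "cpu-offset" DT property */

	for (unsigned int core_id = 0; core_id < 4; core_id++) {
		uintptr_t offset = (uintptr_t)percpu_offset * core_id;

		printf("core%u: dist %#lx cpu %#lx\n", core_id,
		       (unsigned long)(raw_dist_base + offset),
		       (unsigned long)(raw_cpu_base + offset));
	}
	return 0;
}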
void __init gic_init(unsigned int gic_nr, int irq_start,
void __iomem *dist_base, void __iomem *cpu_base)
{
+ struct gic_chip_data *gic;
+
+ if (WARN_ON(gic_nr >= CONFIG_ARM_GIC_MAX_NR))
+ return;
+
/*
* Non-DT/ACPI systems won't run a hypervisor, so let's not
* bother with these...
*/
static_key_slow_dec(&supports_deactivate);
- __gic_init_bases(gic_nr, irq_start, dist_base, cpu_base, 0, NULL);
+
+ gic = &gic_data[gic_nr];
+ gic->raw_dist_base = dist_base;
+ gic->raw_cpu_base = cpu_base;
+
+ __gic_init_bases(gic, irq_start, NULL);
+}
+
+static void gic_teardown(struct gic_chip_data *gic)
+{
+ if (WARN_ON(!gic))
+ return;
+
+ if (gic->raw_dist_base)
+ iounmap(gic->raw_dist_base);
+ if (gic->raw_cpu_base)
+ iounmap(gic->raw_cpu_base);
}
#ifdef CONFIG_OF
return true;
}
+static int __init gic_of_setup(struct gic_chip_data *gic, struct device_node *node)
+{
+ if (!gic || !node)
+ return -EINVAL;
+
+ gic->raw_dist_base = of_iomap(node, 0);
+ if (WARN(!gic->raw_dist_base, "unable to map gic dist registers\n"))
+ goto error;
+
+ gic->raw_cpu_base = of_iomap(node, 1);
+ if (WARN(!gic->raw_cpu_base, "unable to map gic cpu registers\n"))
+ goto error;
+
+ if (of_property_read_u32(node, "cpu-offset", &gic->percpu_offset))
+ gic->percpu_offset = 0;
+
+ return 0;
+
+error:
+ gic_teardown(gic);
+
+ return -ENOMEM;
+}
+
+ static void __init gic_of_setup_kvm_info(struct device_node *node)
+ {
+ int ret;
+ struct resource *vctrl_res = &gic_v2_kvm_info.vctrl;
+ struct resource *vcpu_res = &gic_v2_kvm_info.vcpu;
+
+ gic_v2_kvm_info.type = GIC_V2;
+
+ gic_v2_kvm_info.maint_irq = irq_of_parse_and_map(node, 0);
+ if (!gic_v2_kvm_info.maint_irq)
+ return;
+
+ ret = of_address_to_resource(node, 2, vctrl_res);
+ if (ret)
+ return;
+
+ ret = of_address_to_resource(node, 3, vcpu_res);
+ if (ret)
+ return;
+
+ gic_set_kvm_info(&gic_v2_kvm_info);
+ }
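gic_of_setup_kvm_info() above follows the standard arm,gic devicetree binding: reg entry 2 is the virtual interface control block (GICH), reg entry 3 the virtual CPU interface (GICV), and interrupt 0 the maintenance interrupt; if any of them is missing, no KVM info is published. As a hedged sketch (the function below is invented for illustration and is not part of this patch; the header path is assumed), a consumer of gic_set_kvm_info() might read the structure back like this:

#include <linux/irqchip/arm-gic-common.h>
#include <linux/printk.h>

/* Illustrative only: dump the fields populated by the code above. */
static void dump_gic_v2_kvm_info(const struct gic_kvm_info *info)
{
	if (info->type != GIC_V2 || !info->maint_irq)
		return;

	pr_info("GICH %pR, GICV %pR, maintenance IRQ %u\n",
		&info->vctrl, &info->vcpu, info->maint_irq);
}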
+
int __init
gic_of_init(struct device_node *node, struct device_node *parent)
{
- void __iomem *cpu_base;
- void __iomem *dist_base;
- u32 percpu_offset;
- int irq;
+ struct gic_chip_data *gic;
+ int irq, ret;
if (WARN_ON(!node))
return -ENODEV;
- dist_base = of_iomap(node, 0);
- WARN(!dist_base, "unable to map gic dist registers\n");
+ if (WARN_ON(gic_cnt >= CONFIG_ARM_GIC_MAX_NR))
+ return -EINVAL;
+
+ gic = &gic_data[gic_cnt];
- cpu_base = of_iomap(node, 1);
- WARN(!cpu_base, "unable to map gic cpu registers\n");
+ ret = gic_of_setup(gic, node);
+ if (ret)
+ return ret;
/*
* Disable split EOI/Deactivate if either HYP is not available
* or the CPU interface is too small.
*/
- if (gic_cnt == 0 && !gic_check_eoimode(node, &cpu_base))
+ if (gic_cnt == 0 && !gic_check_eoimode(node, &gic->raw_cpu_base))
static_key_slow_dec(&supports_deactivate);
- if (of_property_read_u32(node, "cpu-offset", &percpu_offset))
- percpu_offset = 0;
+ ret = __gic_init_bases(gic, -1, &node->fwnode);
+ if (ret) {
+ gic_teardown(gic);
+ return ret;
+ }
- __gic_init_bases(gic_cnt, -1, dist_base, cpu_base, percpu_offset,
- &node->fwnode);
- if (!gic_cnt)
+ if (!gic_cnt) {
gic_init_physaddr(node);
+ gic_of_setup_kvm_info(node);
+ }
if (parent) {
irq = irq_of_parse_and_map(node, 0);
#endif
#ifdef CONFIG_ACPI
- static phys_addr_t cpu_phy_base __initdata;
+ static struct
+ {
+ phys_addr_t cpu_phys_base;
+ u32 maint_irq;
+ int maint_irq_mode;
+ phys_addr_t vctrl_base;
+ phys_addr_t vcpu_base;
+ } acpi_data __initdata;
static int __init
gic_acpi_parse_madt_cpu(struct acpi_subtable_header *header,
* All CPU interface addresses have to be the same.
*/
gic_cpu_base = processor->base_address;
- if (cpu_base_assigned && gic_cpu_base != cpu_phy_base)
+ if (cpu_base_assigned && gic_cpu_base != acpi_data.cpu_phys_base)
return -EINVAL;
- cpu_phy_base = gic_cpu_base;
+ acpi_data.cpu_phys_base = gic_cpu_base;
+ acpi_data.maint_irq = processor->vgic_interrupt;
+ acpi_data.maint_irq_mode = (processor->flags & ACPI_MADT_VGIC_IRQ_MODE) ?
+ ACPI_EDGE_SENSITIVE : ACPI_LEVEL_SENSITIVE;
+ acpi_data.vctrl_base = processor->gich_base_address;
+ acpi_data.vcpu_base = processor->gicv_base_address;
+
cpu_base_assigned = 1;
return 0;
}
#define ACPI_GICV2_DIST_MEM_SIZE (SZ_4K)
#define ACPI_GIC_CPU_IF_MEM_SIZE (SZ_8K)
+ #define ACPI_GICV2_VCTRL_MEM_SIZE (SZ_4K)
+ #define ACPI_GICV2_VCPU_MEM_SIZE (SZ_8K)
+
+ static void __init gic_acpi_setup_kvm_info(void)
+ {
+ int irq;
+ struct resource *vctrl_res = &gic_v2_kvm_info.vctrl;
+ struct resource *vcpu_res = &gic_v2_kvm_info.vcpu;
+
+ gic_v2_kvm_info.type = GIC_V2;
+
+ if (!acpi_data.vctrl_base)
+ return;
+
+ vctrl_res->flags = IORESOURCE_MEM;
+ vctrl_res->start = acpi_data.vctrl_base;
+ vctrl_res->end = vctrl_res->start + ACPI_GICV2_VCTRL_MEM_SIZE - 1;
+
+ if (!acpi_data.vcpu_base)
+ return;
+
+ vcpu_res->flags = IORESOURCE_MEM;
+ vcpu_res->start = acpi_data.vcpu_base;
+ vcpu_res->end = vcpu_res->start + ACPI_GICV2_VCPU_MEM_SIZE - 1;
+
+ irq = acpi_register_gsi(NULL, acpi_data.maint_irq,
+ acpi_data.maint_irq_mode,
+ ACPI_ACTIVE_HIGH);
+ if (irq <= 0)
+ return;
+
+ gic_v2_kvm_info.maint_irq = irq;
+
+ gic_set_kvm_info(&gic_v2_kvm_info);
+ }
static int __init gic_v2_acpi_init(struct acpi_subtable_header *header,
const unsigned long end)
{
struct acpi_madt_generic_distributor *dist;
- void __iomem *cpu_base, *dist_base;
struct fwnode_handle *domain_handle;
- int count;
+ struct gic_chip_data *gic = &gic_data[0];
+ int count, ret;
/* Collect CPU base addresses */
count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT,
return -EINVAL;
}
- cpu_base = ioremap(cpu_phy_base, ACPI_GIC_CPU_IF_MEM_SIZE);
- if (!cpu_base) {
+ gic->raw_cpu_base = ioremap(acpi_data.cpu_phys_base, ACPI_GIC_CPU_IF_MEM_SIZE);
+ if (!gic->raw_cpu_base) {
pr_err("Unable to map GICC registers\n");
return -ENOMEM;
}
dist = (struct acpi_madt_generic_distributor *)header;
- dist_base = ioremap(dist->base_address, ACPI_GICV2_DIST_MEM_SIZE);
- if (!dist_base) {
+ gic->raw_dist_base = ioremap(dist->base_address,
+ ACPI_GICV2_DIST_MEM_SIZE);
+ if (!gic->raw_dist_base) {
pr_err("Unable to map GICD registers\n");
- iounmap(cpu_base);
+ gic_teardown(gic);
return -ENOMEM;
}
/*
* Initialize GIC instance zero (no multi-GIC support).
*/
- domain_handle = irq_domain_alloc_fwnode(dist_base);
+ domain_handle = irq_domain_alloc_fwnode(gic->raw_dist_base);
if (!domain_handle) {
pr_err("Unable to allocate domain handle\n");
- iounmap(cpu_base);
- iounmap(dist_base);
+ gic_teardown(gic);
return -ENOMEM;
}
- __gic_init_bases(0, -1, dist_base, cpu_base, 0, domain_handle);
+ ret = __gic_init_bases(gic, -1, domain_handle);
+ if (ret) {
+ pr_err("Failed to initialise GIC\n");
+ irq_domain_free_fwnode(domain_handle);
+ gic_teardown(gic);
+ return ret;
+ }
acpi_set_irq_model(ACPI_IRQ_MODEL_GIC, domain_handle);
if (IS_ENABLED(CONFIG_ARM_GIC_V2M))
gicv2m_init(NULL, gic_data[0].domain);
+ gic_acpi_setup_kvm_info();
+
return 0;
}
IRQCHIP_ACPI_DECLARE(gic_v2, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR,