Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 16 Mar 2016 16:55:35 +0000 (09:55 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 16 Mar 2016 16:55:35 +0000 (09:55 -0700)
Pull KVM updates from Paolo Bonzini:
 "One of the largest releases for KVM...  Hardly any generic
  changes, but lots of architecture-specific updates.

  ARM:
   - VHE support so that we can run the kernel at EL2 on ARMv8.1 systems
   - PMU support for guests
   - 32bit world switch rewritten in C
   - various optimizations to the vgic save/restore code.

  PPC:
   - enabled KVM-VFIO integration ("VFIO device")
   - optimizations to speed up IPIs between vcpus
   - in-kernel handling of IOMMU hypercalls
   - support for dynamic DMA windows (DDW).

  s390:
   - provide the floating point registers via sync regs;
   - separated instruction vs.  data accesses
   - dirty log improvements for huge guests
   - bugfixes and documentation improvements.

  x86:
   - Hyper-V VMBus hypercall userspace exit
   - alternative implementation of lowest-priority interrupts using
     vector hashing (for better VT-d posted interrupt support)
   - fixed guest debugging with nested virtualization
   - improved interrupt tracking in the in-kernel IOAPIC
   - generic infrastructure for tracking writes to guest
     memory - currently its only use is to speed up the legacy shadow
     paging (pre-EPT) case, but in the future it will be used for
     virtual GPUs as well
   - much cleanup (LAPIC, kvmclock, MMU, PIT), including ubsan fixes"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (217 commits)
  KVM: x86: remove eager_fpu field of struct kvm_vcpu_arch
  KVM: x86: disable MPX if host did not enable MPX XSAVE features
  arm64: KVM: vgic-v3: Only wipe LRs on vcpu exit
  arm64: KVM: vgic-v3: Reset LRs at boot time
  arm64: KVM: vgic-v3: Do not save an LR known to be empty
  arm64: KVM: vgic-v3: Save maintenance interrupt state only if required
  arm64: KVM: vgic-v3: Avoid accessing ICH registers
  KVM: arm/arm64: vgic-v2: Make GICD_SGIR quicker to hit
  KVM: arm/arm64: vgic-v2: Only wipe LRs on vcpu exit
  KVM: arm/arm64: vgic-v2: Reset LRs at boot time
  KVM: arm/arm64: vgic-v2: Do not save an LR known to be empty
  KVM: arm/arm64: vgic-v2: Move GICH_ELRSR saving to its own function
  KVM: arm/arm64: vgic-v2: Save maintenance interrupt state only if required
  KVM: arm/arm64: vgic-v2: Avoid accessing GICH registers
  KVM: s390: allocate only one DMA page per VM
  KVM: s390: enable STFLE interpretation only if enabled for the guest
  KVM: s390: wake up when the VCPU cpu timer expires
  KVM: s390: step the VCPU timer while in enabled wait
  KVM: s390: protect VCPU cpu timer with a seqcount
  KVM: s390: step VCPU cpu timer during kvm_run ioctl
  ...

18 files changed:
Documentation/virtual/kvm/mmu.txt
arch/arm/kvm/arm.c
arch/arm/kvm/guest.c
arch/arm64/kvm/guest.c
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/kernel/smp.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/s390/include/asm/kvm_host.h
arch/s390/kvm/interrupt.c
arch/s390/kvm/kvm-s390.c
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
drivers/clocksource/arm_arch_timer.c
virt/kvm/async_pf.c
virt/kvm/kvm_main.c

index c81731096a4338bcec4d43b8049a26118d10f260,dda2e9316701d75b479d05215c1d60614acef4b8..481b6a9c25d5a1737aef54f55a3aa712c859f970
@@@ -358,8 -358,7 +358,8 @@@ In the first case there are two additio
  - if CR4.SMEP is enabled: since we've turned the page into a kernel page,
    the kernel may now execute it.  We handle this by also setting spte.nx.
    If we get a user fetch or read fault, we'll change spte.u=1 and
 -  spte.nx=gpte.nx back.
 +  spte.nx=gpte.nx back.  For this to work, KVM forces EFER.NX to 1 when
 +  shadow paging is in use.
  - if CR4.SMAP is disabled: since the page has been changed to a kernel
    page, it can not be reused when CR4.SMAP is enabled. We set
    CR4.SMAP && !CR0.WP into shadow page's role to avoid this case. Note,
@@@ -392,11 -391,11 +392,11 @@@ To instantiate a large spte, four const
    write-protected pages
  - the guest page must be wholly contained by a single memory slot
  
- To check the last two conditions, the mmu maintains a ->write_count set of
+ To check the last two conditions, the mmu maintains a ->disallow_lpage set of
  arrays for each memory slot and large page size.  Every write protected page
- causes its write_count to be incremented, thus preventing instantiation of
+ causes its disallow_lpage to be incremented, thus preventing instantiation of
  a large spte.  The frames at the end of an unaligned memory slot have
- artificially inflated ->write_counts so they can never be instantiated.
+ artificially inflated ->disallow_lpages so they can never be instantiated.
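
As described above, the renamed ->disallow_lpage counters are simple per-large-frame reference counts: anything that forbids a huge mapping bumps the counter, and a large spte may only be created while the counter reads zero. The toy program below sketches that bookkeeping; the names and sizes are invented for illustration and do not match KVM's real structures.

/*
 * Toy model of the ->disallow_lpage counters described above.
 * Purely illustrative; names and sizes are made up.
 */
#include <stdio.h>

#define NR_LARGE_FRAMES 8

static int disallow_lpage[NR_LARGE_FRAMES];

/* A small page inside large frame 'lframe' became write-protected. */
static void write_protect_small_page(int lframe)
{
	disallow_lpage[lframe]++;
}

/* Write protection on one of its small pages was removed again. */
static void unprotect_small_page(int lframe)
{
	disallow_lpage[lframe]--;
}

/* May a large spte be instantiated for this frame? */
static int can_map_large(int lframe)
{
	return disallow_lpage[lframe] == 0;
}

int main(void)
{
	/* Frames at the end of an unaligned slot start artificially inflated. */
	disallow_lpage[NR_LARGE_FRAMES - 1] = 1;

	write_protect_small_page(2);
	printf("frame 2 large-mappable: %d\n", can_map_large(2));                 /* 0 */
	unprotect_small_page(2);
	printf("frame 2 large-mappable: %d\n", can_map_large(2));                 /* 1 */
	printf("unaligned tail frame:   %d\n", can_map_large(NR_LARGE_FRAMES - 1)); /* 0 */
	return 0;
}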
  
  Zapping all pages (page generation count)
  =========================================
diff --combined arch/arm/kvm/arm.c
index 08e49c423c24147a2f2c40b011866c8d2d144de9,9ca653e34d8ca7a85af3aeb56b556d6b4cc358dc..76552b51c7aea64fb9b8cc84436abad526991732
@@@ -28,6 -28,7 +28,7 @@@
  #include <linux/sched.h>
  #include <linux/kvm.h>
  #include <trace/events/kvm.h>
+ #include <kvm/arm_pmu.h>
  
  #define CREATE_TRACE_POINTS
  #include "trace.h"
@@@ -265,6 -266,7 +266,7 @@@ void kvm_arch_vcpu_free(struct kvm_vcp
        kvm_mmu_free_memory_caches(vcpu);
        kvm_timer_vcpu_terminate(vcpu);
        kvm_vgic_vcpu_destroy(vcpu);
+       kvm_pmu_vcpu_destroy(vcpu);
        kmem_cache_free(kvm_vcpu_cache, vcpu);
  }
  
@@@ -320,6 -322,7 +322,7 @@@ void kvm_arch_vcpu_put(struct kvm_vcpu 
        vcpu->cpu = -1;
  
        kvm_arm_set_running_vcpu(NULL);
+       kvm_timer_vcpu_put(vcpu);
  }
  
  int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
@@@ -506,18 -509,18 +509,18 @@@ static void kvm_arm_resume_guest(struc
        struct kvm_vcpu *vcpu;
  
        kvm_for_each_vcpu(i, vcpu, kvm) {
 -              wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
 +              struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
  
                vcpu->arch.pause = false;
 -              wake_up_interruptible(wq);
 +              swake_up(wq);
        }
  }
  
  static void vcpu_sleep(struct kvm_vcpu *vcpu)
  {
 -      wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
 +      struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
  
 -      wait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
 +      swait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
                                       (!vcpu->arch.pause)));
  }
  
@@@ -577,6 -580,7 +580,7 @@@ int kvm_arch_vcpu_ioctl_run(struct kvm_
                 * non-preemptible context.
                 */
                preempt_disable();
+               kvm_pmu_flush_hwstate(vcpu);
                kvm_timer_flush_hwstate(vcpu);
                kvm_vgic_flush_hwstate(vcpu);
  
                if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) ||
                        vcpu->arch.power_off || vcpu->arch.pause) {
                        local_irq_enable();
+                       kvm_pmu_sync_hwstate(vcpu);
                        kvm_timer_sync_hwstate(vcpu);
                        kvm_vgic_sync_hwstate(vcpu);
                        preempt_enable();
                trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
  
                /*
-                * We must sync the timer state before the vgic state so that
-                * the vgic can properly sample the updated state of the
+                * We must sync the PMU and timer state before the vgic state so
+                * that the vgic can properly sample the updated state of the
                 * interrupt line.
                 */
+               kvm_pmu_sync_hwstate(vcpu);
                kvm_timer_sync_hwstate(vcpu);
  
                kvm_vgic_sync_hwstate(vcpu);
@@@ -823,11 -829,54 +829,54 @@@ static int kvm_arch_vcpu_ioctl_vcpu_ini
        return 0;
  }
  
+ static int kvm_arm_vcpu_set_attr(struct kvm_vcpu *vcpu,
+                                struct kvm_device_attr *attr)
+ {
+       int ret = -ENXIO;
+       switch (attr->group) {
+       default:
+               ret = kvm_arm_vcpu_arch_set_attr(vcpu, attr);
+               break;
+       }
+       return ret;
+ }
+ static int kvm_arm_vcpu_get_attr(struct kvm_vcpu *vcpu,
+                                struct kvm_device_attr *attr)
+ {
+       int ret = -ENXIO;
+       switch (attr->group) {
+       default:
+               ret = kvm_arm_vcpu_arch_get_attr(vcpu, attr);
+               break;
+       }
+       return ret;
+ }
+ static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu,
+                                struct kvm_device_attr *attr)
+ {
+       int ret = -ENXIO;
+       switch (attr->group) {
+       default:
+               ret = kvm_arm_vcpu_arch_has_attr(vcpu, attr);
+               break;
+       }
+       return ret;
+ }
  long kvm_arch_vcpu_ioctl(struct file *filp,
                         unsigned int ioctl, unsigned long arg)
  {
        struct kvm_vcpu *vcpu = filp->private_data;
        void __user *argp = (void __user *)arg;
+       struct kvm_device_attr attr;
  
        switch (ioctl) {
        case KVM_ARM_VCPU_INIT: {
                        return -E2BIG;
                return kvm_arm_copy_reg_indices(vcpu, user_list->reg);
        }
+       case KVM_SET_DEVICE_ATTR: {
+               if (copy_from_user(&attr, argp, sizeof(attr)))
+                       return -EFAULT;
+               return kvm_arm_vcpu_set_attr(vcpu, &attr);
+       }
+       case KVM_GET_DEVICE_ATTR: {
+               if (copy_from_user(&attr, argp, sizeof(attr)))
+                       return -EFAULT;
+               return kvm_arm_vcpu_get_attr(vcpu, &attr);
+       }
+       case KVM_HAS_DEVICE_ATTR: {
+               if (copy_from_user(&attr, argp, sizeof(attr)))
+                       return -EFAULT;
+               return kvm_arm_vcpu_has_attr(vcpu, &attr);
+       }
        default:
                return -EINVAL;
        }
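
The three new vcpu ioctls plumbed in above reuse the existing struct kvm_device_attr from the KVM device API. The fragment below is a hedged sketch of how userspace might drive them for the new PMUv3 group; it assumes an arm64 host whose UAPI headers provide KVM_ARM_VCPU_PMU_V3_CTRL and KVM_ARM_VCPU_PMU_V3_IRQ, and that vcpu creation and KVM_ARM_VCPU_INIT with the PMU feature happen elsewhere.

/* Sketch: probing and setting a vcpu device attribute from userspace. */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdint.h>

static int vcpu_has_pmu_attr(int vcpu_fd, uint64_t attr)
{
	struct kvm_device_attr da = {
		.group = KVM_ARM_VCPU_PMU_V3_CTRL,
		.attr  = attr,
	};

	/* KVM_HAS_DEVICE_ATTR has no side effects: 0 if supported, ENXIO if not. */
	return ioctl(vcpu_fd, KVM_HAS_DEVICE_ATTR, &da) == 0;
}

static int vcpu_set_pmu_irq(int vcpu_fd, int irq)
{
	struct kvm_device_attr da = {
		.group = KVM_ARM_VCPU_PMU_V3_CTRL,
		.attr  = KVM_ARM_VCPU_PMU_V3_IRQ,
		.addr  = (uint64_t)&irq,      /* kernel reads an int from here */
	};

	return ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &da);
}

int main(void)
{
	/*
	 * A real VMM would obtain vcpu_fd via KVM_CREATE_VM / KVM_CREATE_VCPU
	 * and init the vcpu with the KVM_ARM_VCPU_PMU_V3 feature first; that
	 * setup is omitted to keep the sketch focused on the new ioctls.
	 */
	int vcpu_fd = -1;

	if (vcpu_fd >= 0 && vcpu_has_pmu_attr(vcpu_fd, KVM_ARM_VCPU_PMU_V3_IRQ))
		vcpu_set_pmu_irq(vcpu_fd, 23 /* example PPI number */);
	return 0;
}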
@@@ -967,6 -1031,11 +1031,11 @@@ long kvm_arch_vm_ioctl(struct file *fil
        }
  }
  
+ static void cpu_init_stage2(void *dummy)
+ {
+       __cpu_init_stage2();
+ }
  static void cpu_init_hyp_mode(void *dummy)
  {
        phys_addr_t boot_pgd_ptr;
        vector_ptr = (unsigned long)__kvm_hyp_vector;
  
        __cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr);
+       __cpu_init_stage2();
  
        kvm_arm_init_debug();
  }
@@@ -1035,6 -1105,82 +1105,82 @@@ static inline void hyp_cpu_pm_init(void
  }
  #endif
  
+ static void teardown_common_resources(void)
+ {
+       free_percpu(kvm_host_cpu_state);
+ }
+ static int init_common_resources(void)
+ {
+       kvm_host_cpu_state = alloc_percpu(kvm_cpu_context_t);
+       if (!kvm_host_cpu_state) {
+               kvm_err("Cannot allocate host CPU state\n");
+               return -ENOMEM;
+       }
+       return 0;
+ }
+ static int init_subsystems(void)
+ {
+       int err;
+       /*
+        * Init HYP view of VGIC
+        */
+       err = kvm_vgic_hyp_init();
+       switch (err) {
+       case 0:
+               vgic_present = true;
+               break;
+       case -ENODEV:
+       case -ENXIO:
+               vgic_present = false;
+               break;
+       default:
+               return err;
+       }
+       /*
+        * Init HYP architected timer support
+        */
+       err = kvm_timer_hyp_init();
+       if (err)
+               return err;
+       kvm_perf_init();
+       kvm_coproc_table_init();
+       return 0;
+ }
+ static void teardown_hyp_mode(void)
+ {
+       int cpu;
+       if (is_kernel_in_hyp_mode())
+               return;
+       free_hyp_pgds();
+       for_each_possible_cpu(cpu)
+               free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
+ }
+ static int init_vhe_mode(void)
+ {
+       /*
+        * Execute the init code on each CPU.
+        */
+       on_each_cpu(cpu_init_stage2, NULL, 1);
+       /* set size of VMID supported by CPU */
+       kvm_vmid_bits = kvm_get_vmid_bits();
+       kvm_info("%d-bit VMID\n", kvm_vmid_bits);
+       kvm_info("VHE mode initialized successfully\n");
+       return 0;
+ }
  /**
   * Inits Hyp-mode on all online CPUs
   */
@@@ -1065,7 -1211,7 +1211,7 @@@ static int init_hyp_mode(void
                stack_page = __get_free_page(GFP_KERNEL);
                if (!stack_page) {
                        err = -ENOMEM;
-                       goto out_free_stack_pages;
+                       goto out_err;
                }
  
                per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
        /*
         * Map the Hyp-code called directly from the host
         */
-       err = create_hyp_mappings(__kvm_hyp_code_start, __kvm_hyp_code_end);
+       err = create_hyp_mappings(__hyp_text_start, __hyp_text_end);
        if (err) {
                kvm_err("Cannot map world-switch code\n");
-               goto out_free_mappings;
+               goto out_err;
        }
  
        err = create_hyp_mappings(__start_rodata, __end_rodata);
        if (err) {
                kvm_err("Cannot map rodata section\n");
-               goto out_free_mappings;
+               goto out_err;
        }
  
        /*
  
                if (err) {
                        kvm_err("Cannot map hyp stack\n");
-                       goto out_free_mappings;
+                       goto out_err;
                }
        }
  
-       /*
-        * Map the host CPU structures
-        */
-       kvm_host_cpu_state = alloc_percpu(kvm_cpu_context_t);
-       if (!kvm_host_cpu_state) {
-               err = -ENOMEM;
-               kvm_err("Cannot allocate host CPU state\n");
-               goto out_free_mappings;
-       }
        for_each_possible_cpu(cpu) {
                kvm_cpu_context_t *cpu_ctxt;
  
  
                if (err) {
                        kvm_err("Cannot map host CPU state: %d\n", err);
-                       goto out_free_context;
+                       goto out_err;
                }
        }
  
         */
        on_each_cpu(cpu_init_hyp_mode, NULL, 1);
  
-       /*
-        * Init HYP view of VGIC
-        */
-       err = kvm_vgic_hyp_init();
-       switch (err) {
-       case 0:
-               vgic_present = true;
-               break;
-       case -ENODEV:
-       case -ENXIO:
-               vgic_present = false;
-               break;
-       default:
-               goto out_free_context;
-       }
-       /*
-        * Init HYP architected timer support
-        */
-       err = kvm_timer_hyp_init();
-       if (err)
-               goto out_free_context;
  #ifndef CONFIG_HOTPLUG_CPU
        free_boot_hyp_pgd();
  #endif
  
-       kvm_perf_init();
+       cpu_notifier_register_begin();
+       err = __register_cpu_notifier(&hyp_init_cpu_nb);
+       cpu_notifier_register_done();
+       if (err) {
+               kvm_err("Cannot register HYP init CPU notifier (%d)\n", err);
+               goto out_err;
+       }
+       hyp_cpu_pm_init();
  
        /* set size of VMID supported by CPU */
        kvm_vmid_bits = kvm_get_vmid_bits();
        kvm_info("Hyp mode initialized successfully\n");
  
        return 0;
- out_free_context:
-       free_percpu(kvm_host_cpu_state);
- out_free_mappings:
-       free_hyp_pgds();
- out_free_stack_pages:
-       for_each_possible_cpu(cpu)
-               free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
  out_err:
+       teardown_hyp_mode();
        kvm_err("error initializing Hyp mode: %d\n", err);
        return err;
  }
@@@ -1213,26 -1332,27 +1332,27 @@@ int kvm_arch_init(void *opaque
                }
        }
  
-       cpu_notifier_register_begin();
-       err = init_hyp_mode();
+       err = init_common_resources();
        if (err)
-               goto out_err;
+               return err;
  
-       err = __register_cpu_notifier(&hyp_init_cpu_nb);
-       if (err) {
-               kvm_err("Cannot register HYP init CPU notifier (%d)\n", err);
+       if (is_kernel_in_hyp_mode())
+               err = init_vhe_mode();
+       else
+               err = init_hyp_mode();
+       if (err)
                goto out_err;
-       }
-       cpu_notifier_register_done();
  
-       hyp_cpu_pm_init();
+       err = init_subsystems();
+       if (err)
+               goto out_hyp;
  
-       kvm_coproc_table_init();
        return 0;
+ out_hyp:
+       teardown_hyp_mode();
  out_err:
-       cpu_notifier_register_done();
+       teardown_common_resources();
        return err;
  }
  
diff --combined arch/arm/kvm/guest.c
index 99361f11354a0d8928c9aaead7fdf5e09436b3ac,12cbb68244435d277961a1cd24488d77a300c7d2..9093ed0f8b2a71e1d226fd31832a81d764242da6
@@@ -25,7 -25,6 +25,6 @@@
  #include <asm/cputype.h>
  #include <asm/uaccess.h>
  #include <asm/kvm.h>
- #include <asm/kvm_asm.h>
  #include <asm/kvm_emulate.h>
  #include <asm/kvm_coproc.h>
  
@@@ -55,7 -54,7 +54,7 @@@ static u64 core_reg_offset_from_id(u64 
  static int get_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
  {
        u32 __user *uaddr = (u32 __user *)(long)reg->addr;
-       struct kvm_regs *regs = &vcpu->arch.regs;
+       struct kvm_regs *regs = &vcpu->arch.ctxt.gp_regs;
        u64 off;
  
        if (KVM_REG_SIZE(reg->id) != 4)
@@@ -72,7 -71,7 +71,7 @@@
  static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
  {
        u32 __user *uaddr = (u32 __user *)(long)reg->addr;
-       struct kvm_regs *regs = &vcpu->arch.regs;
+       struct kvm_regs *regs = &vcpu->arch.ctxt.gp_regs;
        u64 off, val;
  
        if (KVM_REG_SIZE(reg->id) != 4)
@@@ -161,7 -160,7 +160,7 @@@ static int get_timer_reg(struct kvm_vcp
        u64 val;
  
        val = kvm_arm_timer_get_reg(vcpu, reg->id);
 -      return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id));
 +      return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)) ? -EFAULT : 0;
  }
  
  static unsigned long num_core_regs(void)
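
The get_timer_reg() change above (repeated for arm64 further down) fixes a classic return-value bug: copy_to_user() returns the number of bytes it could not copy, not a negative errno, so passing its result straight through turns a faulting copy into a bogus positive return instead of -EFAULT. The standalone program below illustrates the pattern with a fake copy_to_user() standing in for the real one; everything in it is invented for the demo.

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Stand-in for the kernel's copy_to_user(): returns bytes NOT copied. */
static unsigned long fake_copy_to_user(void *to, const void *from,
				       unsigned long n, unsigned long faults)
{
	memcpy(to, from, n - faults);
	return faults;                 /* 0 means the whole copy succeeded */
}

/* Buggy: leaks a positive byte count to the caller. */
static long get_reg_buggy(void *to, const void *from, unsigned long n)
{
	return fake_copy_to_user(to, from, n, 4);
}

/* Fixed: any shortfall becomes -EFAULT, a full copy becomes 0. */
static long get_reg_fixed(void *to, const void *from, unsigned long n)
{
	return fake_copy_to_user(to, from, n, 4) ? -EFAULT : 0;
}

int main(void)
{
	unsigned long val = 0x1234, out;

	printf("buggy returns %ld (positive, looks like success to some callers)\n",
	       get_reg_buggy(&out, &val, sizeof(val)));
	printf("fixed returns %ld (-EFAULT)\n",
	       get_reg_fixed(&out, &val, sizeof(val)));
	return 0;
}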
diff --combined arch/arm64/kvm/guest.c
index 9e54ad7c240ac5d2239ba5140b04c6727b20377e,dbe45c364bbb150977696b6fa9ff2ac8751aff6e..32fad75bb9ff5960242596685c317ba4086689e5
@@@ -194,7 -194,7 +194,7 @@@ static int get_timer_reg(struct kvm_vcp
        u64 val;
  
        val = kvm_arm_timer_get_reg(vcpu, reg->id);
 -      return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id));
 +      return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)) ? -EFAULT : 0;
  }
  
  /**
@@@ -380,3 -380,54 +380,54 @@@ int kvm_arch_vcpu_ioctl_set_guest_debug
        }
        return 0;
  }
+ int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu,
+                              struct kvm_device_attr *attr)
+ {
+       int ret;
+       switch (attr->group) {
+       case KVM_ARM_VCPU_PMU_V3_CTRL:
+               ret = kvm_arm_pmu_v3_set_attr(vcpu, attr);
+               break;
+       default:
+               ret = -ENXIO;
+               break;
+       }
+       return ret;
+ }
+ int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
+                              struct kvm_device_attr *attr)
+ {
+       int ret;
+       switch (attr->group) {
+       case KVM_ARM_VCPU_PMU_V3_CTRL:
+               ret = kvm_arm_pmu_v3_get_attr(vcpu, attr);
+               break;
+       default:
+               ret = -ENXIO;
+               break;
+       }
+       return ret;
+ }
+ int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
+                              struct kvm_device_attr *attr)
+ {
+       int ret;
+       switch (attr->group) {
+       case KVM_ARM_VCPU_PMU_V3_CTRL:
+               ret = kvm_arm_pmu_v3_has_attr(vcpu, attr);
+               break;
+       default:
+               ret = -ENXIO;
+               break;
+       }
+       return ret;
+ }
index c98afa538b3aeca91901e858c02884e820ca3dfa,2e7c79101652ef863ee049da91b6047a193a440b..d7b343170453df82b4f31429020c8680e1f949bf
@@@ -182,7 -182,10 +182,10 @@@ struct kvmppc_spapr_tce_table 
        struct list_head list;
        struct kvm *kvm;
        u64 liobn;
-       u32 window_size;
+       struct rcu_head rcu;
+       u32 page_shift;
+       u64 offset;             /* in pages */
+       u64 size;               /* window size in pages */
        struct page *pages[0];
  };
  
@@@ -289,7 -292,7 +292,7 @@@ struct kvmppc_vcore 
        struct list_head runnable_threads;
        struct list_head preempt_list;
        spinlock_t lock;
 -      wait_queue_head_t wq;
 +      struct swait_queue_head wq;
        spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
        u64 stolen_tb;
        u64 preempt_tb;
@@@ -629,7 -632,7 +632,7 @@@ struct kvm_vcpu_arch 
        u8 prodded;
        u32 last_inst;
  
 -      wait_queue_head_t *wqp;
 +      struct swait_queue_head *wqp;
        struct kvmppc_vcore *vcore;
        int ret;
        int trap;
index cc13d4c832916bc13203ba308949ff13870782c8,cb8be5dc118a72876dc0e93c5bb510bf4e307a49..b7dea05f07259558089205cebef69f40a7985a2f
@@@ -206,7 -206,7 +206,7 @@@ int smp_request_message_ipi(int virq, i
  
  #ifdef CONFIG_PPC_SMP_MUXED_IPI
  struct cpu_messages {
-       int messages;                   /* current messages */
+       long messages;                  /* current messages */
        unsigned long data;             /* data for cause ipi */
  };
  static DEFINE_PER_CPU_SHARED_ALIGNED(struct cpu_messages, ipi_message);
@@@ -218,7 -218,7 +218,7 @@@ void smp_muxed_ipi_set_data(int cpu, un
        info->data = data;
  }
  
- void smp_muxed_ipi_message_pass(int cpu, int msg)
+ void smp_muxed_ipi_set_message(int cpu, int msg)
  {
        struct cpu_messages *info = &per_cpu(ipi_message, cpu);
        char *message = (char *)&info->messages;
         */
        smp_mb();
        message[msg] = 1;
+ }
+ void smp_muxed_ipi_message_pass(int cpu, int msg)
+ {
+       struct cpu_messages *info = &per_cpu(ipi_message, cpu);
+       smp_muxed_ipi_set_message(cpu, msg);
        /*
         * cause_ipi functions are required to include a full barrier
         * before doing whatever causes the IPI.
  }
  
  #ifdef __BIG_ENDIAN__
- #define IPI_MESSAGE(A) (1 << (24 - 8 * (A)))
+ #define IPI_MESSAGE(A) (1uL << ((BITS_PER_LONG - 8) - 8 * (A)))
  #else
- #define IPI_MESSAGE(A) (1 << (8 * (A)))
+ #define IPI_MESSAGE(A) (1uL << (8 * (A)))
  #endif
  
  irqreturn_t smp_ipi_demux(void)
  {
        struct cpu_messages *info = this_cpu_ptr(&ipi_message);
-       unsigned int all;
+       unsigned long all;
  
        mb();   /* order any irq clear */
  
        do {
                all = xchg(&info->messages, 0);
+ #if defined(CONFIG_KVM_XICS) && defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE)
+               /*
+                * Must check for PPC_MSG_RM_HOST_ACTION messages
+                * before PPC_MSG_CALL_FUNCTION messages because when
+                * a VM is destroyed, we call kick_all_cpus_sync()
+                * to ensure that any pending PPC_MSG_RM_HOST_ACTION
+                * messages have completed before we free any VCPUs.
+                */
+               if (all & IPI_MESSAGE(PPC_MSG_RM_HOST_ACTION))
+                       kvmppc_xics_ipi_action();
+ #endif
                if (all & IPI_MESSAGE(PPC_MSG_CALL_FUNCTION))
                        generic_smp_call_function_interrupt();
                if (all & IPI_MESSAGE(PPC_MSG_RESCHEDULE))
@@@ -727,7 -745,7 +745,7 @@@ void start_secondary(void *unused
  
        local_irq_enable();
  
 -      cpu_startup_entry(CPUHP_ONLINE);
 +      cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
  
        BUG();
  }
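
The IPI_MESSAGE() rework above goes hand in hand with widening cpu_messages::messages from int to long: each message type owns one byte of that word, so posting a message is a single byte store and smp_ipi_demux() drains everything with one xchg() of the whole long, while the extra bytes of a 64-bit long make room for the new PPC_MSG_RM_HOST_ACTION message. The standalone demo below only illustrates the bit layout; BITS_PER_LONG and the DEMO_BIG_ENDIAN flag are local stand-ins invented for the sketch.

/* Illustration of the IPI_MESSAGE() byte-per-message layout (sketch only). */
#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

#ifdef DEMO_BIG_ENDIAN                  /* build with -DDEMO_BIG_ENDIAN to compare */
#define IPI_MESSAGE(A) (1uL << ((BITS_PER_LONG - 8) - 8 * (A)))
#else
#define IPI_MESSAGE(A) (1uL << (8 * (A)))
#endif

int main(void)
{
	int max_msgs = (int)(BITS_PER_LONG / 8);   /* one byte per message type */

	/*
	 * Each message gets its own byte, so senders post with a plain byte
	 * store and the receiver drains all pending messages with one xchg()
	 * of the whole word, without taking any locks.
	 */
	for (int msg = 0; msg < max_msgs; msg++)
		printf("msg %d -> mask 0x%016lx\n", msg, IPI_MESSAGE(msg));

	printf("a %zu-bit long holds %d byte-sized messages\n",
	       BITS_PER_LONG, max_msgs);
	return 0;
}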
index f1187bb6dd4d7f5960e57aea111bd1c12021408d,f47fffefadc1fb8f0a53d13bfdb9d7cbcf55d80d..84fb4fcfaa41b802a614515c67539b0d2d7ee3cf
@@@ -81,6 -81,17 +81,17 @@@ static int target_smt_mode
  module_param(target_smt_mode, int, S_IRUGO | S_IWUSR);
  MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
  
+ #ifdef CONFIG_KVM_XICS
+ static struct kernel_param_ops module_param_ops = {
+       .set = param_set_int,
+       .get = param_get_int,
+ };
+ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
+                                                       S_IRUGO | S_IWUSR);
+ MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
+ #endif
  static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
  static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
  
@@@ -114,11 -125,11 +125,11 @@@ static bool kvmppc_ipi_thread(int cpu
  static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
  {
        int cpu;
 -      wait_queue_head_t *wqp;
 +      struct swait_queue_head *wqp;
  
        wqp = kvm_arch_vcpu_wq(vcpu);
 -      if (waitqueue_active(wqp)) {
 -              wake_up_interruptible(wqp);
 +      if (swait_active(wqp)) {
 +              swake_up(wqp);
                ++vcpu->stat.halt_wakeup;
        }
  
@@@ -701,8 -712,8 +712,8 @@@ int kvmppc_pseries_do_hcall(struct kvm_
                tvcpu->arch.prodded = 1;
                smp_mb();
                if (vcpu->arch.ceded) {
 -                      if (waitqueue_active(&vcpu->wq)) {
 -                              wake_up_interruptible(&vcpu->wq);
 +                      if (swait_active(&vcpu->wq)) {
 +                              swake_up(&vcpu->wq);
                                vcpu->stat.halt_wakeup++;
                        }
                }
                if (kvmppc_xics_enabled(vcpu)) {
                        ret = kvmppc_xics_hcall(vcpu, req);
                        break;
-               } /* fallthrough */
+               }
+               return RESUME_HOST;
+       case H_PUT_TCE:
+               ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                               kvmppc_get_gpr(vcpu, 5),
+                                               kvmppc_get_gpr(vcpu, 6));
+               if (ret == H_TOO_HARD)
+                       return RESUME_HOST;
+               break;
+       case H_PUT_TCE_INDIRECT:
+               ret = kvmppc_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                               kvmppc_get_gpr(vcpu, 5),
+                                               kvmppc_get_gpr(vcpu, 6),
+                                               kvmppc_get_gpr(vcpu, 7));
+               if (ret == H_TOO_HARD)
+                       return RESUME_HOST;
+               break;
+       case H_STUFF_TCE:
+               ret = kvmppc_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                               kvmppc_get_gpr(vcpu, 5),
+                                               kvmppc_get_gpr(vcpu, 6),
+                                               kvmppc_get_gpr(vcpu, 7));
+               if (ret == H_TOO_HARD)
+                       return RESUME_HOST;
+               break;
        default:
                return RESUME_HOST;
        }
@@@ -1459,7 -1494,7 +1494,7 @@@ static struct kvmppc_vcore *kvmppc_vcor
        INIT_LIST_HEAD(&vcore->runnable_threads);
        spin_lock_init(&vcore->lock);
        spin_lock_init(&vcore->stoltb_lock);
 -      init_waitqueue_head(&vcore->wq);
 +      init_swait_queue_head(&vcore->wq);
        vcore->preempt_tb = TB_NIL;
        vcore->lpcr = kvm->arch.lpcr;
        vcore->first_vcpuid = core * threads_per_subcore;
@@@ -2278,6 -2313,46 +2313,46 @@@ static void post_guest_process(struct k
        spin_unlock(&vc->lock);
  }
  
+ /*
+  * Clear core from the list of active host cores as we are about to
+  * enter the guest. Only do this if it is the primary thread of the
+  * core (not if a subcore) that is entering the guest.
+  */
+ static inline void kvmppc_clear_host_core(int cpu)
+ {
+       int core;
+       if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
+               return;
+       /*
+        * Memory barrier can be omitted here as we will do a smp_wmb()
+        * later in kvmppc_start_thread and we need ensure that state is
+        * visible to other CPUs only after we enter guest.
+        */
+       core = cpu >> threads_shift;
+       kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 0;
+ }
+ /*
+  * Advertise this core as an active host core since we exited the guest
+  * Only need to do this if it is the primary thread of the core that is
+  * exiting.
+  */
+ static inline void kvmppc_set_host_core(int cpu)
+ {
+       int core;
+       if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
+               return;
+       /*
+        * Memory barrier can be omitted here because we do a spin_unlock
+        * immediately after this which provides the memory barrier.
+        */
+       core = cpu >> threads_shift;
+       kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 1;
+ }
  /*
   * Run a set of guest threads on a physical core.
   * Called with vc->lock held.
@@@ -2390,6 -2465,8 +2465,8 @@@ static noinline void kvmppc_run_core(st
                }
        }
  
+       kvmppc_clear_host_core(pcpu);
        /* Start all the threads */
        active = 0;
        for (sub = 0; sub < core_info.n_subcores; ++sub) {
                        kvmppc_ipi_thread(pcpu + i);
        }
  
+       kvmppc_set_host_core(pcpu);
        spin_unlock(&vc->lock);
  
        /* make sure updates to secondary vcpu structs are visible now */
@@@ -2531,9 -2610,10 +2610,9 @@@ static void kvmppc_vcore_blocked(struc
  {
        struct kvm_vcpu *vcpu;
        int do_sleep = 1;
 +      DECLARE_SWAITQUEUE(wait);
  
 -      DEFINE_WAIT(wait);
 -
 -      prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
 +      prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
  
        /*
         * Check one last time for pending exceptions and ceded state after
        }
  
        if (!do_sleep) {
 -              finish_wait(&vc->wq, &wait);
 +              finish_swait(&vc->wq, &wait);
                return;
        }
  
        trace_kvmppc_vcore_blocked(vc, 0);
        spin_unlock(&vc->lock);
        schedule();
 -      finish_wait(&vc->wq, &wait);
 +      finish_swait(&vc->wq, &wait);
        spin_lock(&vc->lock);
        vc->vcore_state = VCORE_INACTIVE;
        trace_kvmppc_vcore_blocked(vc, 1);
@@@ -2611,7 -2691,7 +2690,7 @@@ static int kvmppc_run_vcpu(struct kvm_r
                        kvmppc_start_thread(vcpu, vc);
                        trace_kvm_guest_enter(vcpu);
                } else if (vc->vcore_state == VCORE_SLEEPING) {
 -                      wake_up(&vc->wq);
 +                      swake_up(&vc->wq);
                }
  
        }
@@@ -2983,6 -3063,114 +3062,114 @@@ static int kvmppc_hv_setup_htab_rma(str
        goto out_srcu;
  }
  
+ #ifdef CONFIG_KVM_XICS
+ static int kvmppc_cpu_notify(struct notifier_block *self, unsigned long action,
+                       void *hcpu)
+ {
+       unsigned long cpu = (long)hcpu;
+       switch (action) {
+       case CPU_UP_PREPARE:
+       case CPU_UP_PREPARE_FROZEN:
+               kvmppc_set_host_core(cpu);
+               break;
+ #ifdef CONFIG_HOTPLUG_CPU
+       case CPU_DEAD:
+       case CPU_DEAD_FROZEN:
+       case CPU_UP_CANCELED:
+       case CPU_UP_CANCELED_FROZEN:
+               kvmppc_clear_host_core(cpu);
+               break;
+ #endif
+       default:
+               break;
+       }
+       return NOTIFY_OK;
+ }
+ static struct notifier_block kvmppc_cpu_notifier = {
+           .notifier_call = kvmppc_cpu_notify,
+ };
+ /*
+  * Allocate a per-core structure for managing state about which cores are
+  * running in the host versus the guest and for exchanging data between
+  * real mode KVM and CPU running in the host.
+  * This is only done for the first VM.
+  * The allocated structure stays even if all VMs have stopped.
+  * It is only freed when the kvm-hv module is unloaded.
+  * It's OK for this routine to fail, we just don't support host
+  * core operations like redirecting H_IPI wakeups.
+  */
+ void kvmppc_alloc_host_rm_ops(void)
+ {
+       struct kvmppc_host_rm_ops *ops;
+       unsigned long l_ops;
+       int cpu, core;
+       int size;
+       /* Not the first time here ? */
+       if (kvmppc_host_rm_ops_hv != NULL)
+               return;
+       ops = kzalloc(sizeof(struct kvmppc_host_rm_ops), GFP_KERNEL);
+       if (!ops)
+               return;
+       size = cpu_nr_cores() * sizeof(struct kvmppc_host_rm_core);
+       ops->rm_core = kzalloc(size, GFP_KERNEL);
+       if (!ops->rm_core) {
+               kfree(ops);
+               return;
+       }
+       get_online_cpus();
+       for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) {
+               if (!cpu_online(cpu))
+                       continue;
+               core = cpu >> threads_shift;
+               ops->rm_core[core].rm_state.in_host = 1;
+       }
+       ops->vcpu_kick = kvmppc_fast_vcpu_kick_hv;
+       /*
+        * Make the contents of the kvmppc_host_rm_ops structure visible
+        * to other CPUs before we assign it to the global variable.
+        * Do an atomic assignment (no locks used here), but if someone
+        * beats us to it, just free our copy and return.
+        */
+       smp_wmb();
+       l_ops = (unsigned long) ops;
+       if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) {
+               put_online_cpus();
+               kfree(ops->rm_core);
+               kfree(ops);
+               return;
+       }
+       register_cpu_notifier(&kvmppc_cpu_notifier);
+       put_online_cpus();
+ }
+ void kvmppc_free_host_rm_ops(void)
+ {
+       if (kvmppc_host_rm_ops_hv) {
+               unregister_cpu_notifier(&kvmppc_cpu_notifier);
+               kfree(kvmppc_host_rm_ops_hv->rm_core);
+               kfree(kvmppc_host_rm_ops_hv);
+               kvmppc_host_rm_ops_hv = NULL;
+       }
+ }
+ #endif
  static int kvmppc_core_init_vm_hv(struct kvm *kvm)
  {
        unsigned long lpcr, lpid;
                return -ENOMEM;
        kvm->arch.lpid = lpid;
  
+       kvmppc_alloc_host_rm_ops();
        /*
         * Since we don't flush the TLB when tearing down a VM,
         * and this lpid might have previously been used,
@@@ -3228,6 -3418,7 +3417,7 @@@ static int kvmppc_book3s_init_hv(void
  
  static void kvmppc_book3s_exit_hv(void)
  {
+       kvmppc_free_host_rm_ops();
        kvmppc_hv_ops = NULL;
  }
  
index 25ae2c9913c39c2fae34fb60dd7ee655ba821b6e,ed16182a008b7f10b7aa53c3af2d0f3fd167fe25..85b32f16fa74e02a2fa753a62ecaf2c728e3eb2a
@@@ -1370,20 -1370,6 +1370,20 @@@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S
        std     r6, VCPU_ACOP(r9)
        stw     r7, VCPU_GUEST_PID(r9)
        std     r8, VCPU_WORT(r9)
 +      /*
 +       * Restore various registers to 0, where non-zero values
 +       * set by the guest could disrupt the host.
 +       */
 +      li      r0, 0
 +      mtspr   SPRN_IAMR, r0
 +      mtspr   SPRN_CIABR, r0
 +      mtspr   SPRN_DAWRX, r0
 +      mtspr   SPRN_TCSCR, r0
 +      mtspr   SPRN_WORT, r0
 +      /* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */
 +      li      r0, 1
 +      sldi    r0, r0, 31
 +      mtspr   SPRN_MMCRS, r0
  8:
  
        /* Save and reset AMR and UAMOR before turning on the MMU */
@@@ -2020,8 -2006,8 +2020,8 @@@ hcall_real_table
        .long   0               /* 0x12c */
        .long   0               /* 0x130 */
        .long   DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table
-       .long   0               /* 0x138 */
-       .long   0               /* 0x13c */
+       .long   DOTSYM(kvmppc_h_stuff_tce) - hcall_real_table
+       .long   DOTSYM(kvmppc_rm_h_put_tce_indirect) - hcall_real_table
        .long   0               /* 0x140 */
        .long   0               /* 0x144 */
        .long   0               /* 0x148 */
index b0c8ad0799c7f0c09607420441ea87735c88c6d9,3c254952d3a7c95a031fdb93247fe9c26a78cdac..6da41fab70fbe951c957d1b5542101d22980839b
@@@ -20,6 -20,7 +20,7 @@@
  #include <linux/kvm_types.h>
  #include <linux/kvm_host.h>
  #include <linux/kvm.h>
+ #include <linux/seqlock.h>
  #include <asm/debug.h>
  #include <asm/cpu.h>
  #include <asm/fpu/api.h>
@@@ -229,17 -230,11 +230,11 @@@ struct kvm_s390_itdb 
        __u8    data[256];
  } __packed;
  
- struct kvm_s390_vregs {
-       __vector128 vrs[32];
-       __u8    reserved200[512];       /* for future vector expansion */
- } __packed;
  struct sie_page {
        struct kvm_s390_sie_block sie_block;
        __u8 reserved200[1024];         /* 0x0200 */
        struct kvm_s390_itdb itdb;      /* 0x0600 */
-       __u8 reserved700[1280];         /* 0x0700 */
-       struct kvm_s390_vregs vregs;    /* 0x0c00 */
+       __u8 reserved700[2304];         /* 0x0700 */
  } __packed;
  
  struct kvm_vcpu_stat {
@@@ -467,7 -462,7 +462,7 @@@ struct kvm_s390_irq_payload 
  struct kvm_s390_local_interrupt {
        spinlock_t lock;
        struct kvm_s390_float_interrupt *float_int;
 -      wait_queue_head_t *wq;
 +      struct swait_queue_head *wq;
        atomic_t *cpuflags;
        DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS);
        struct kvm_s390_irq_payload irq;
@@@ -558,6 -553,15 +553,15 @@@ struct kvm_vcpu_arch 
        unsigned long pfault_token;
        unsigned long pfault_select;
        unsigned long pfault_compare;
+       bool cputm_enabled;
+       /*
+        * The seqcount protects updates to cputm_start and sie_block.cputm,
+        * this way we can have non-blocking reads with consistent values.
+        * Only the owning VCPU thread (vcpu->cpu) is allowed to change these
+        * values and to start/stop/enable/disable cpu timer accounting.
+        */
+       seqcount_t cputm_seqcount;
+       __u64 cputm_start;
  };
  
  struct kvm_vm_stat {
@@@ -596,15 -600,11 +600,11 @@@ struct s390_io_adapter 
  #define S390_ARCH_FAC_MASK_SIZE_U64 \
        (S390_ARCH_FAC_MASK_SIZE_BYTE / sizeof(u64))
  
- struct kvm_s390_fac {
-       /* facility list requested by guest */
-       __u64 list[S390_ARCH_FAC_LIST_SIZE_U64];
-       /* facility mask supported by kvm & hosting machine */
-       __u64 mask[S390_ARCH_FAC_LIST_SIZE_U64];
- };
  struct kvm_s390_cpu_model {
-       struct kvm_s390_fac *fac;
+       /* facility mask supported by kvm & hosting machine */
+       __u64 fac_mask[S390_ARCH_FAC_LIST_SIZE_U64];
+       /* facility list requested by guest (in dma page) */
+       __u64 *fac_list;
        struct cpuid cpu_id;
        unsigned short ibc;
  };
@@@ -623,6 -623,16 +623,16 @@@ struct kvm_s390_crypto_cb 
        __u8    reserved80[128];                /* 0x0080 */
  };
  
+ /*
+  * sie_page2 has to be allocated as DMA because fac_list and crycb need
+  * 31bit addresses in the sie control block.
+  */
+ struct sie_page2 {
+       __u64 fac_list[S390_ARCH_FAC_LIST_SIZE_U64];    /* 0x0000 */
+       struct kvm_s390_crypto_cb crycb;                /* 0x0800 */
+       u8 reserved900[0x1000 - 0x900];                 /* 0x0900 */
+ } __packed;
  struct kvm_arch{
        void *sca;
        int use_esca;
        int ipte_lock_count;
        struct mutex ipte_mutex;
        spinlock_t start_stop_lock;
+       struct sie_page2 *sie_page2;
        struct kvm_s390_cpu_model model;
        struct kvm_s390_crypto crypto;
        u64 epoch;
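
The comment above is the whole point of the new struct sie_page2: fac_list and crycb must share one DMA page below 2 GB because the SIE control block only carries 31-bit addresses for them, so their offsets within the page are fixed. The snippet below is a hedged, userspace-compilable check of that intended layout; it assumes S390_ARCH_FAC_LIST_SIZE_U64 is 256 (16 kbit of facility bits) and models crycb as an opaque 0x100-byte blob, which matches the offset comments in the hunk but is not the real kvm_s390_crypto_cb definition.

/* Offset check for the sie_page2 layout sketched above (illustrative only). */
#include <stddef.h>
#include <stdint.h>

#define S390_ARCH_FAC_LIST_SIZE_U64 256            /* assumed: 16 kbit of facility bits */

struct demo_crypto_cb {                            /* stand-in, 0x100 bytes */
	uint8_t opaque[0x100];
};

struct demo_sie_page2 {
	uint64_t fac_list[S390_ARCH_FAC_LIST_SIZE_U64];   /* 0x0000 */
	struct demo_crypto_cb crycb;                       /* 0x0800 */
	uint8_t reserved900[0x1000 - 0x900];               /* 0x0900 */
} __attribute__((packed));

_Static_assert(offsetof(struct demo_sie_page2, crycb) == 0x800,
	       "facility list must fill the first 2k of the page");
_Static_assert(offsetof(struct demo_sie_page2, reserved900) == 0x900,
	       "crycb occupies 0x800-0x8ff");
_Static_assert(sizeof(struct demo_sie_page2) == 0x1000,
	       "the whole structure is exactly one 4k DMA page");

int main(void) { return 0; }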
index 9ffc7322179213f031939fa184bc6c93545af559,ef84a803433eeea0a3992f0aa5eaed3fe440bcdc..704809d91dddf759d9d7e4b33b86e4c3dc29441e
@@@ -182,8 -182,9 +182,9 @@@ static int cpu_timer_interrupts_enabled
  
  static int cpu_timer_irq_pending(struct kvm_vcpu *vcpu)
  {
-       return (vcpu->arch.sie_block->cputm >> 63) &&
-              cpu_timer_interrupts_enabled(vcpu);
+       if (!cpu_timer_interrupts_enabled(vcpu))
+               return 0;
+       return kvm_s390_get_cpu_timer(vcpu) >> 63;
  }
  
  static inline int is_ioirq(unsigned long irq_type)
@@@ -335,23 -336,6 +336,6 @@@ static void set_intercept_indicators(st
        set_intercept_indicators_stop(vcpu);
  }
  
- static u16 get_ilc(struct kvm_vcpu *vcpu)
- {
-       switch (vcpu->arch.sie_block->icptcode) {
-       case ICPT_INST:
-       case ICPT_INSTPROGI:
-       case ICPT_OPEREXC:
-       case ICPT_PARTEXEC:
-       case ICPT_IOINST:
-               /* last instruction only stored for these icptcodes */
-               return insn_length(vcpu->arch.sie_block->ipa >> 8);
-       case ICPT_PROGI:
-               return vcpu->arch.sie_block->pgmilc;
-       default:
-               return 0;
-       }
- }
  static int __must_check __deliver_cpu_timer(struct kvm_vcpu *vcpu)
  {
        struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@@ -588,7 -572,7 +572,7 @@@ static int __must_check __deliver_prog(
        struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
        struct kvm_s390_pgm_info pgm_info;
        int rc = 0, nullifying = false;
-       u16 ilc = get_ilc(vcpu);
+       u16 ilen;
  
        spin_lock(&li->lock);
        pgm_info = li->irq.pgm;
        memset(&li->irq.pgm, 0, sizeof(pgm_info));
        spin_unlock(&li->lock);
  
-       VCPU_EVENT(vcpu, 3, "deliver: program irq code 0x%x, ilc:%d",
-                  pgm_info.code, ilc);
+       ilen = pgm_info.flags & KVM_S390_PGM_FLAGS_ILC_MASK;
+       VCPU_EVENT(vcpu, 3, "deliver: program irq code 0x%x, ilen:%d",
+                  pgm_info.code, ilen);
        vcpu->stat.deliver_program_int++;
        trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
                                         pgm_info.code, 0);
                                   (u8 *) __LC_PER_ACCESS_ID);
        }
  
-       if (nullifying && vcpu->arch.sie_block->icptcode == ICPT_INST)
-               kvm_s390_rewind_psw(vcpu, ilc);
+       if (nullifying && !(pgm_info.flags & KVM_S390_PGM_FLAGS_NO_REWIND))
+               kvm_s390_rewind_psw(vcpu, ilen);
  
-       rc |= put_guest_lc(vcpu, ilc, (u16 *) __LC_PGM_ILC);
+       /* bit 1+2 of the target are the ilc, so we can directly use ilen */
+       rc |= put_guest_lc(vcpu, ilen, (u16 *) __LC_PGM_ILC);
        rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->gbea,
                                 (u64 *) __LC_LAST_BREAK);
        rc |= put_guest_lc(vcpu, pgm_info.code,
@@@ -923,9 -909,35 +909,35 @@@ int kvm_cpu_has_pending_timer(struct kv
        return ckc_irq_pending(vcpu) || cpu_timer_irq_pending(vcpu);
  }
  
+ static u64 __calculate_sltime(struct kvm_vcpu *vcpu)
+ {
+       u64 now, cputm, sltime = 0;
+       if (ckc_interrupts_enabled(vcpu)) {
+               now = kvm_s390_get_tod_clock_fast(vcpu->kvm);
+               sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now);
+               /* already expired or overflow? */
+               if (!sltime || vcpu->arch.sie_block->ckc <= now)
+                       return 0;
+               if (cpu_timer_interrupts_enabled(vcpu)) {
+                       cputm = kvm_s390_get_cpu_timer(vcpu);
+                       /* already expired? */
+                       if (cputm >> 63)
+                               return 0;
+                       return min(sltime, tod_to_ns(cputm));
+               }
+       } else if (cpu_timer_interrupts_enabled(vcpu)) {
+               sltime = kvm_s390_get_cpu_timer(vcpu);
+               /* already expired? */
+               if (sltime >> 63)
+                       return 0;
+       }
+       return sltime;
+ }
  int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
  {
-       u64 now, sltime;
+       u64 sltime;
  
        vcpu->stat.exit_wait_state++;
  
                return -EOPNOTSUPP; /* disabled wait */
        }
  
-       if (!ckc_interrupts_enabled(vcpu)) {
+       if (!ckc_interrupts_enabled(vcpu) &&
+           !cpu_timer_interrupts_enabled(vcpu)) {
                VCPU_EVENT(vcpu, 3, "%s", "enabled wait w/o timer");
                __set_cpu_idle(vcpu);
                goto no_timer;
        }
  
-       now = kvm_s390_get_tod_clock_fast(vcpu->kvm);
-       sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now);
-       /* underflow */
-       if (vcpu->arch.sie_block->ckc < now)
+       sltime = __calculate_sltime(vcpu);
+       if (!sltime)
                return 0;
  
        __set_cpu_idle(vcpu);
        hrtimer_start(&vcpu->arch.ckc_timer, ktime_set (0, sltime) , HRTIMER_MODE_REL);
-       VCPU_EVENT(vcpu, 4, "enabled wait via clock comparator: %llu ns", sltime);
+       VCPU_EVENT(vcpu, 4, "enabled wait: %llu ns", sltime);
  no_timer:
        srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
        kvm_vcpu_block(vcpu);
  
  void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
  {
 -      if (waitqueue_active(&vcpu->wq)) {
 +      if (swait_active(&vcpu->wq)) {
                /*
                 * The vcpu gave up the cpu voluntarily, mark it as a good
                 * yield-candidate.
                 */
                vcpu->preempted = true;
 -              wake_up_interruptible(&vcpu->wq);
 +              swake_up(&vcpu->wq);
                vcpu->stat.halt_wakeup++;
        }
  }
  enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer)
  {
        struct kvm_vcpu *vcpu;
-       u64 now, sltime;
+       u64 sltime;
  
        vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer);
-       now = kvm_s390_get_tod_clock_fast(vcpu->kvm);
-       sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now);
+       sltime = __calculate_sltime(vcpu);
  
        /*
         * If the monotonic clock runs faster than the tod clock we might be
         * woken up too early and have to go back to sleep to avoid deadlocks.
         */
-       if (vcpu->arch.sie_block->ckc > now &&
-           hrtimer_forward_now(timer, ns_to_ktime(sltime)))
+       if (sltime && hrtimer_forward_now(timer, ns_to_ktime(sltime)))
                return HRTIMER_RESTART;
        kvm_s390_vcpu_wakeup(vcpu);
        return HRTIMER_NORESTART;
@@@ -1059,8 -1067,16 +1067,16 @@@ static int __inject_prog(struct kvm_vcp
        trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
                                   irq->u.pgm.code, 0);
  
+       if (!(irq->u.pgm.flags & KVM_S390_PGM_FLAGS_ILC_VALID)) {
+               /* auto detection if no valid ILC was given */
+               irq->u.pgm.flags &= ~KVM_S390_PGM_FLAGS_ILC_MASK;
+               irq->u.pgm.flags |= kvm_s390_get_ilen(vcpu);
+               irq->u.pgm.flags |= KVM_S390_PGM_FLAGS_ILC_VALID;
+       }
        if (irq->u.pgm.code == PGM_PER) {
                li->irq.pgm.code |= PGM_PER;
+               li->irq.pgm.flags = irq->u.pgm.flags;
                /* only modify PER related information */
                li->irq.pgm.per_address = irq->u.pgm.per_address;
                li->irq.pgm.per_code = irq->u.pgm.per_code;
        } else if (!(irq->u.pgm.code & PGM_PER)) {
                li->irq.pgm.code = (li->irq.pgm.code & PGM_PER) |
                                   irq->u.pgm.code;
+               li->irq.pgm.flags = irq->u.pgm.flags;
                /* only modify non-PER information */
                li->irq.pgm.trans_exc_code = irq->u.pgm.trans_exc_code;
                li->irq.pgm.mon_code = irq->u.pgm.mon_code;
diff --combined arch/s390/kvm/kvm-s390.c
index 03dfe9c667f4eb944705787e54ff7e6ac3c08afb,c186d55b87ac3db66d74d7e36e0cb283fa7b2516..e196582fe87d4631ab9335b57c7d21f3f4a2ba0f
@@@ -158,6 -158,8 +158,8 @@@ static int kvm_clock_sync(struct notifi
                kvm->arch.epoch -= *delta;
                kvm_for_each_vcpu(i, vcpu, kvm) {
                        vcpu->arch.sie_block->epoch -= *delta;
+                       if (vcpu->arch.cputm_enabled)
+                               vcpu->arch.cputm_start += *delta;
                }
        }
        return NOTIFY_OK;
@@@ -274,7 -276,6 +276,6 @@@ static void kvm_s390_sync_dirty_log(str
        unsigned long address;
        struct gmap *gmap = kvm->arch.gmap;
  
-       down_read(&gmap->mm->mmap_sem);
        /* Loop over all guest pages */
        last_gfn = memslot->base_gfn + memslot->npages;
        for (cur_gfn = memslot->base_gfn; cur_gfn <= last_gfn; cur_gfn++) {
  
                if (gmap_test_and_clear_dirty(address, gmap))
                        mark_page_dirty(kvm, cur_gfn);
+               if (fatal_signal_pending(current))
+                       return;
+               cond_resched();
        }
-       up_read(&gmap->mm->mmap_sem);
  }
  
  /* Section: vm related */
@@@ -352,8 -355,8 +355,8 @@@ static int kvm_vm_ioctl_enable_cap(stru
                if (atomic_read(&kvm->online_vcpus)) {
                        r = -EBUSY;
                } else if (MACHINE_HAS_VX) {
-                       set_kvm_facility(kvm->arch.model.fac->mask, 129);
-                       set_kvm_facility(kvm->arch.model.fac->list, 129);
+                       set_kvm_facility(kvm->arch.model.fac_mask, 129);
+                       set_kvm_facility(kvm->arch.model.fac_list, 129);
                        r = 0;
                } else
                        r = -EINVAL;
                if (atomic_read(&kvm->online_vcpus)) {
                        r = -EBUSY;
                } else if (test_facility(64)) {
-                       set_kvm_facility(kvm->arch.model.fac->mask, 64);
-                       set_kvm_facility(kvm->arch.model.fac->list, 64);
+                       set_kvm_facility(kvm->arch.model.fac_mask, 64);
+                       set_kvm_facility(kvm->arch.model.fac_list, 64);
                        r = 0;
                }
                mutex_unlock(&kvm->lock);
@@@ -651,7 -654,7 +654,7 @@@ static int kvm_s390_set_processor(struc
                memcpy(&kvm->arch.model.cpu_id, &proc->cpuid,
                       sizeof(struct cpuid));
                kvm->arch.model.ibc = proc->ibc;
-               memcpy(kvm->arch.model.fac->list, proc->fac_list,
+               memcpy(kvm->arch.model.fac_list, proc->fac_list,
                       S390_ARCH_FAC_LIST_SIZE_BYTE);
        } else
                ret = -EFAULT;
@@@ -685,7 -688,8 +688,8 @@@ static int kvm_s390_get_processor(struc
        }
        memcpy(&proc->cpuid, &kvm->arch.model.cpu_id, sizeof(struct cpuid));
        proc->ibc = kvm->arch.model.ibc;
-       memcpy(&proc->fac_list, kvm->arch.model.fac->list, S390_ARCH_FAC_LIST_SIZE_BYTE);
+       memcpy(&proc->fac_list, kvm->arch.model.fac_list,
+              S390_ARCH_FAC_LIST_SIZE_BYTE);
        if (copy_to_user((void __user *)attr->addr, proc, sizeof(*proc)))
                ret = -EFAULT;
        kfree(proc);
@@@ -705,7 -709,7 +709,7 @@@ static int kvm_s390_get_machine(struct 
        }
        get_cpu_id((struct cpuid *) &mach->cpuid);
        mach->ibc = sclp.ibc;
-       memcpy(&mach->fac_mask, kvm->arch.model.fac->mask,
+       memcpy(&mach->fac_mask, kvm->arch.model.fac_mask,
               S390_ARCH_FAC_LIST_SIZE_BYTE);
        memcpy((unsigned long *)&mach->fac_list, S390_lowcore.stfle_fac_list,
               S390_ARCH_FAC_LIST_SIZE_BYTE);
@@@ -1082,16 -1086,12 +1086,12 @@@ static void kvm_s390_get_cpu_id(struct 
        cpu_id->version = 0xff;
  }
  
- static int kvm_s390_crypto_init(struct kvm *kvm)
+ static void kvm_s390_crypto_init(struct kvm *kvm)
  {
        if (!test_kvm_facility(kvm, 76))
-               return 0;
-       kvm->arch.crypto.crycb = kzalloc(sizeof(*kvm->arch.crypto.crycb),
-                                        GFP_KERNEL | GFP_DMA);
-       if (!kvm->arch.crypto.crycb)
-               return -ENOMEM;
+               return;
  
+       kvm->arch.crypto.crycb = &kvm->arch.sie_page2->crycb;
        kvm_s390_set_crycb_format(kvm);
  
        /* Enable AES/DEA protected key functions by default */
                         sizeof(kvm->arch.crypto.crycb->aes_wrapping_key_mask));
        get_random_bytes(kvm->arch.crypto.crycb->dea_wrapping_key_mask,
                         sizeof(kvm->arch.crypto.crycb->dea_wrapping_key_mask));
-       return 0;
  }
  
  static void sca_dispose(struct kvm *kvm)
@@@ -1156,37 -1154,30 +1154,30 @@@ int kvm_arch_init_vm(struct kvm *kvm, u
        if (!kvm->arch.dbf)
                goto out_err;
  
-       /*
-        * The architectural maximum amount of facilities is 16 kbit. To store
-        * this amount, 2 kbyte of memory is required. Thus we need a full
-        * page to hold the guest facility list (arch.model.fac->list) and the
-        * facility mask (arch.model.fac->mask). Its address size has to be
-        * 31 bits and word aligned.
-        */
-       kvm->arch.model.fac =
-               (struct kvm_s390_fac *) get_zeroed_page(GFP_KERNEL | GFP_DMA);
-       if (!kvm->arch.model.fac)
+       kvm->arch.sie_page2 =
+            (struct sie_page2 *) get_zeroed_page(GFP_KERNEL | GFP_DMA);
+       if (!kvm->arch.sie_page2)
                goto out_err;
  
        /* Populate the facility mask initially. */
-       memcpy(kvm->arch.model.fac->mask, S390_lowcore.stfle_fac_list,
+       memcpy(kvm->arch.model.fac_mask, S390_lowcore.stfle_fac_list,
               S390_ARCH_FAC_LIST_SIZE_BYTE);
        for (i = 0; i < S390_ARCH_FAC_LIST_SIZE_U64; i++) {
                if (i < kvm_s390_fac_list_mask_size())
-                       kvm->arch.model.fac->mask[i] &= kvm_s390_fac_list_mask[i];
+                       kvm->arch.model.fac_mask[i] &= kvm_s390_fac_list_mask[i];
                else
-                       kvm->arch.model.fac->mask[i] = 0UL;
+                       kvm->arch.model.fac_mask[i] = 0UL;
        }
  
        /* Populate the facility list initially. */
-       memcpy(kvm->arch.model.fac->list, kvm->arch.model.fac->mask,
+       kvm->arch.model.fac_list = kvm->arch.sie_page2->fac_list;
+       memcpy(kvm->arch.model.fac_list, kvm->arch.model.fac_mask,
               S390_ARCH_FAC_LIST_SIZE_BYTE);
  
        kvm_s390_get_cpu_id(&kvm->arch.model.cpu_id);
        kvm->arch.model.ibc = sclp.ibc & 0x0fff;
  
-       if (kvm_s390_crypto_init(kvm) < 0)
-               goto out_err;
+       kvm_s390_crypto_init(kvm);
  
        spin_lock_init(&kvm->arch.float_int.lock);
        for (i = 0; i < FIRQ_LIST_COUNT; i++)
  
        return 0;
  out_err:
-       kfree(kvm->arch.crypto.crycb);
-       free_page((unsigned long)kvm->arch.model.fac);
+       free_page((unsigned long)kvm->arch.sie_page2);
        debug_unregister(kvm->arch.dbf);
        sca_dispose(kvm);
        KVM_EVENT(3, "creation of vm failed: %d", rc);
@@@ -1269,10 -1259,9 +1259,9 @@@ static void kvm_free_vcpus(struct kvm *
  void kvm_arch_destroy_vm(struct kvm *kvm)
  {
        kvm_free_vcpus(kvm);
-       free_page((unsigned long)kvm->arch.model.fac);
        sca_dispose(kvm);
        debug_unregister(kvm->arch.dbf);
-       kfree(kvm->arch.crypto.crycb);
+       free_page((unsigned long)kvm->arch.sie_page2);
        if (!kvm_is_ucontrol(kvm))
                gmap_free(kvm->arch.gmap);
        kvm_s390_destroy_adapters(kvm);
@@@ -1414,8 -1403,13 +1403,13 @@@ int kvm_arch_vcpu_init(struct kvm_vcpu 
                                    KVM_SYNC_PFAULT;
        if (test_kvm_facility(vcpu->kvm, 64))
                vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB;
-       if (test_kvm_facility(vcpu->kvm, 129))
+       /* fprs can be synchronized via vrs, even if the guest has no vx. With
+        * MACHINE_HAS_VX, (load|store)_fpu_regs() will work with vrs format.
+        */
+       if (MACHINE_HAS_VX)
                vcpu->run->kvm_valid_regs |= KVM_SYNC_VRS;
+       else
+               vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS;
  
        if (kvm_is_ucontrol(vcpu->kvm))
                return __kvm_ucontrol_vcpu_init(vcpu);
        return 0;
  }
  
+ /* needs disabled preemption to protect from TOD sync and vcpu_load/put */
+ static void __start_cpu_timer_accounting(struct kvm_vcpu *vcpu)
+ {
+       WARN_ON_ONCE(vcpu->arch.cputm_start != 0);
+       raw_write_seqcount_begin(&vcpu->arch.cputm_seqcount);
+       vcpu->arch.cputm_start = get_tod_clock_fast();
+       raw_write_seqcount_end(&vcpu->arch.cputm_seqcount);
+ }
+ /* needs disabled preemption to protect from TOD sync and vcpu_load/put */
+ static void __stop_cpu_timer_accounting(struct kvm_vcpu *vcpu)
+ {
+       WARN_ON_ONCE(vcpu->arch.cputm_start == 0);
+       raw_write_seqcount_begin(&vcpu->arch.cputm_seqcount);
+       vcpu->arch.sie_block->cputm -= get_tod_clock_fast() - vcpu->arch.cputm_start;
+       vcpu->arch.cputm_start = 0;
+       raw_write_seqcount_end(&vcpu->arch.cputm_seqcount);
+ }
+ /* needs disabled preemption to protect from TOD sync and vcpu_load/put */
+ static void __enable_cpu_timer_accounting(struct kvm_vcpu *vcpu)
+ {
+       WARN_ON_ONCE(vcpu->arch.cputm_enabled);
+       vcpu->arch.cputm_enabled = true;
+       __start_cpu_timer_accounting(vcpu);
+ }
+ /* needs disabled preemption to protect from TOD sync and vcpu_load/put */
+ static void __disable_cpu_timer_accounting(struct kvm_vcpu *vcpu)
+ {
+       WARN_ON_ONCE(!vcpu->arch.cputm_enabled);
+       __stop_cpu_timer_accounting(vcpu);
+       vcpu->arch.cputm_enabled = false;
+ }
+ static void enable_cpu_timer_accounting(struct kvm_vcpu *vcpu)
+ {
+       preempt_disable(); /* protect from TOD sync and vcpu_load/put */
+       __enable_cpu_timer_accounting(vcpu);
+       preempt_enable();
+ }
+ static void disable_cpu_timer_accounting(struct kvm_vcpu *vcpu)
+ {
+       preempt_disable(); /* protect from TOD sync and vcpu_load/put */
+       __disable_cpu_timer_accounting(vcpu);
+       preempt_enable();
+ }
+ /* set the cpu timer - may only be called from the VCPU thread itself */
+ void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm)
+ {
+       preempt_disable(); /* protect from TOD sync and vcpu_load/put */
+       raw_write_seqcount_begin(&vcpu->arch.cputm_seqcount);
+       if (vcpu->arch.cputm_enabled)
+               vcpu->arch.cputm_start = get_tod_clock_fast();
+       vcpu->arch.sie_block->cputm = cputm;
+       raw_write_seqcount_end(&vcpu->arch.cputm_seqcount);
+       preempt_enable();
+ }
+ /* update and get the cpu timer - can also be called from other VCPU threads */
+ __u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu)
+ {
+       unsigned int seq;
+       __u64 value;
+       if (unlikely(!vcpu->arch.cputm_enabled))
+               return vcpu->arch.sie_block->cputm;
+       preempt_disable(); /* protect from TOD sync and vcpu_load/put */
+       do {
+               seq = raw_read_seqcount(&vcpu->arch.cputm_seqcount);
+               /*
+                * If the writer would ever execute a read in the critical
+                * section, e.g. in irq context, we have a deadlock.
+                */
+               WARN_ON_ONCE((seq & 1) && smp_processor_id() == vcpu->cpu);
+               value = vcpu->arch.sie_block->cputm;
+               /* if cputm_start is 0, accounting is being started/stopped */
+               if (likely(vcpu->arch.cputm_start))
+                       value -= get_tod_clock_fast() - vcpu->arch.cputm_start;
+       } while (read_seqcount_retry(&vcpu->arch.cputm_seqcount, seq & ~1));
+       preempt_enable();
+       return value;
+ }
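The setter and getter above pair a seqcount with disabled preemption so that other VCPU threads can compute a consistent view of the running CPU timer without taking a lock: the writer bumps the sequence to an odd value, updates cputm/cputm_start, then bumps it back to even, while readers retry if they observe an odd or changed sequence. A minimal user-space sketch of the same read-retry pattern is shown below; the names (timer_sample, timer_set, timer_get) are illustrative rather than kernel API, and a production seqlock needs stronger fencing around the plain data accesses than this simplified version provides.

    #include <stdatomic.h>
    #include <stdint.h>

    struct timer_sample {
            atomic_uint seq;        /* even = stable, odd = writer in progress */
            uint64_t cputm;         /* remaining CPU timer value */
            uint64_t start;         /* timestamp when accounting started, 0 if stopped */
    };

    /* writer: make seq odd, update the payload, make seq even again */
    static void timer_set(struct timer_sample *t, uint64_t now, uint64_t cputm)
    {
            atomic_fetch_add_explicit(&t->seq, 1, memory_order_acq_rel);
            t->start = now;
            t->cputm = cputm;
            atomic_fetch_add_explicit(&t->seq, 1, memory_order_release);
    }

    /* reader: retry while a writer is active or has run concurrently */
    static uint64_t timer_get(struct timer_sample *t, uint64_t now)
    {
            unsigned int seq;
            uint64_t value;

            do {
                    seq = atomic_load_explicit(&t->seq, memory_order_acquire);
                    value = t->cputm;
                    if (t->start)           /* accounting currently running */
                            value -= now - t->start;
            } while ((seq & 1) ||
                     seq != atomic_load_explicit(&t->seq, memory_order_acquire));
            return value;
    }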
  void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
  {
        /* Save host register state */
        vcpu->arch.host_fpregs.fpc = current->thread.fpu.fpc;
        vcpu->arch.host_fpregs.regs = current->thread.fpu.regs;
  
-       /* Depending on MACHINE_HAS_VX, data stored to vrs either
-        * has vector register or floating point register format.
-        */
-       current->thread.fpu.regs = vcpu->run->s.regs.vrs;
+       if (MACHINE_HAS_VX)
+               current->thread.fpu.regs = vcpu->run->s.regs.vrs;
+       else
+               current->thread.fpu.regs = vcpu->run->s.regs.fprs;
        current->thread.fpu.fpc = vcpu->run->s.regs.fpc;
        if (test_fp_ctl(current->thread.fpu.fpc))
                /* User space provided an invalid FPC, let's clear it */
        restore_access_regs(vcpu->run->s.regs.acrs);
        gmap_enable(vcpu->arch.gmap);
        atomic_or(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
+       if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
+               __start_cpu_timer_accounting(vcpu);
+       vcpu->cpu = cpu;
  }
  
  void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
  {
+       vcpu->cpu = -1;
+       if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
+               __stop_cpu_timer_accounting(vcpu);
        atomic_andnot(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
        gmap_disable(vcpu->arch.gmap);
  
@@@ -1468,7 -1555,7 +1555,7 @@@ static void kvm_s390_vcpu_initial_reset
        vcpu->arch.sie_block->gpsw.mask = 0UL;
        vcpu->arch.sie_block->gpsw.addr = 0UL;
        kvm_s390_set_prefix(vcpu, 0);
-       vcpu->arch.sie_block->cputm     = 0UL;
+       kvm_s390_set_cpu_timer(vcpu, 0);
        vcpu->arch.sie_block->ckc       = 0UL;
        vcpu->arch.sie_block->todpr     = 0;
        memset(vcpu->arch.sie_block->gcr, 0, 16 * sizeof(__u64));
@@@ -1538,7 -1625,8 +1625,8 @@@ static void kvm_s390_vcpu_setup_model(s
  
        vcpu->arch.cpu_id = model->cpu_id;
        vcpu->arch.sie_block->ibc = model->ibc;
-       vcpu->arch.sie_block->fac = (int) (long) model->fac->list;
+       if (test_kvm_facility(vcpu->kvm, 7))
+               vcpu->arch.sie_block->fac = (u32)(u64) model->fac_list;
  }
  
  int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
@@@ -1616,6 -1704,7 +1704,7 @@@ struct kvm_vcpu *kvm_arch_vcpu_create(s
        vcpu->arch.local_int.float_int = &kvm->arch.float_int;
        vcpu->arch.local_int.wq = &vcpu->wq;
        vcpu->arch.local_int.cpuflags = &vcpu->arch.sie_block->cpuflags;
+       seqcount_init(&vcpu->arch.cputm_seqcount);
  
        rc = kvm_vcpu_init(vcpu, kvm, id);
        if (rc)
@@@ -1715,7 -1804,7 +1804,7 @@@ static int kvm_arch_vcpu_ioctl_get_one_
                             (u64 __user *)reg->addr);
                break;
        case KVM_REG_S390_CPU_TIMER:
-               r = put_user(vcpu->arch.sie_block->cputm,
+               r = put_user(kvm_s390_get_cpu_timer(vcpu),
                             (u64 __user *)reg->addr);
                break;
        case KVM_REG_S390_CLOCK_COMP:
@@@ -1753,6 -1842,7 +1842,7 @@@ static int kvm_arch_vcpu_ioctl_set_one_
                                           struct kvm_one_reg *reg)
  {
        int r = -EINVAL;
+       __u64 val;
  
        switch (reg->id) {
        case KVM_REG_S390_TODPR:
                             (u64 __user *)reg->addr);
                break;
        case KVM_REG_S390_CPU_TIMER:
-               r = get_user(vcpu->arch.sie_block->cputm,
-                            (u64 __user *)reg->addr);
+               r = get_user(val, (u64 __user *)reg->addr);
+               if (!r)
+                       kvm_s390_set_cpu_timer(vcpu, val);
                break;
        case KVM_REG_S390_CLOCK_COMP:
                r = get_user(vcpu->arch.sie_block->ckc,
@@@ -2158,8 -2249,10 +2249,10 @@@ static int vcpu_pre_run(struct kvm_vcp
  
  static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu)
  {
-       psw_t *psw = &vcpu->arch.sie_block->gpsw;
-       u8 opcode;
+       struct kvm_s390_pgm_info pgm_info = {
+               .code = PGM_ADDRESSING,
+       };
+       u8 opcode, ilen;
        int rc;
  
        VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction");
         * to look up the current opcode to get the length of the instruction
         * to be able to forward the PSW.
         */
-       rc = read_guest(vcpu, psw->addr, 0, &opcode, 1);
-       if (rc)
-               return kvm_s390_inject_prog_cond(vcpu, rc);
-       psw->addr = __rewind_psw(*psw, -insn_length(opcode));
-       return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+       rc = read_guest_instr(vcpu, &opcode, 1);
+       ilen = insn_length(opcode);
+       if (rc < 0) {
+               return rc;
+       } else if (rc) {
+               /* Instruction-Fetching Exceptions - we can't detect the ilen.
+                * Forward by arbitrary ilc, injection will take care of
+                * nullification if necessary.
+                */
+               pgm_info = vcpu->arch.pgm;
+               ilen = 4;
+       }
+       pgm_info.flags = ilen | KVM_S390_PGM_FLAGS_ILC_VALID;
+       kvm_s390_forward_psw(vcpu, ilen);
+       return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
  }
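The rewind/forward logic above only needs the instruction length, which s390 encodes in the two most significant bits of the first opcode byte (00 = 2 bytes, 01 or 10 = 4 bytes, 11 = 6 bytes); the kernel's insn_length() helper computes this branchlessly. A stand-alone sketch of the same decode, for illustration only:

    #include <stdint.h>

    /* Decode the s390 instruction length from the first opcode byte. */
    static unsigned int s390_insn_length(uint8_t first_byte)
    {
            switch (first_byte >> 6) {
            case 0:                 /* top bits 00 -> 2-byte instruction */
                    return 2;
            case 1:
            case 2:                 /* top bits 01 or 10 -> 4-byte instruction */
                    return 4;
            default:                /* top bits 11 -> 6-byte instruction */
                    return 6;
            }
    }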
  
  static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
@@@ -2244,10 -2346,12 +2346,12 @@@ static int __vcpu_run(struct kvm_vcpu *
                 */
                local_irq_disable();
                __kvm_guest_enter();
+               __disable_cpu_timer_accounting(vcpu);
                local_irq_enable();
                exit_reason = sie64a(vcpu->arch.sie_block,
                                     vcpu->run->s.regs.gprs);
                local_irq_disable();
+               __enable_cpu_timer_accounting(vcpu);
                __kvm_guest_exit();
                local_irq_enable();
                vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
@@@ -2271,7 -2375,7 +2375,7 @@@ static void sync_regs(struct kvm_vcpu *
                kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
        }
        if (kvm_run->kvm_dirty_regs & KVM_SYNC_ARCH0) {
-               vcpu->arch.sie_block->cputm = kvm_run->s.regs.cputm;
+               kvm_s390_set_cpu_timer(vcpu, kvm_run->s.regs.cputm);
                vcpu->arch.sie_block->ckc = kvm_run->s.regs.ckc;
                vcpu->arch.sie_block->todpr = kvm_run->s.regs.todpr;
                vcpu->arch.sie_block->pp = kvm_run->s.regs.pp;
@@@ -2293,7 -2397,7 +2397,7 @@@ static void store_regs(struct kvm_vcpu 
        kvm_run->psw_addr = vcpu->arch.sie_block->gpsw.addr;
        kvm_run->s.regs.prefix = kvm_s390_get_prefix(vcpu);
        memcpy(&kvm_run->s.regs.crs, &vcpu->arch.sie_block->gcr, 128);
-       kvm_run->s.regs.cputm = vcpu->arch.sie_block->cputm;
+       kvm_run->s.regs.cputm = kvm_s390_get_cpu_timer(vcpu);
        kvm_run->s.regs.ckc = vcpu->arch.sie_block->ckc;
        kvm_run->s.regs.todpr = vcpu->arch.sie_block->todpr;
        kvm_run->s.regs.pp = vcpu->arch.sie_block->pp;
@@@ -2325,6 -2429,7 +2429,7 @@@ int kvm_arch_vcpu_ioctl_run(struct kvm_
        }
  
        sync_regs(vcpu, kvm_run);
+       enable_cpu_timer_accounting(vcpu);
  
        might_fault();
        rc = __vcpu_run(vcpu);
                rc = 0;
        }
  
+       disable_cpu_timer_accounting(vcpu);
        store_regs(vcpu, kvm_run);
  
        if (vcpu->sigset_active)
@@@ -2364,7 -2470,7 +2470,7 @@@ int kvm_s390_store_status_unloaded(stru
        unsigned char archmode = 1;
        freg_t fprs[NUM_FPRS];
        unsigned int px;
-       u64 clkcomp;
+       u64 clkcomp, cputm;
        int rc;
  
        px = kvm_s390_get_prefix(vcpu);
  
        /* manually convert vector registers if necessary */
        if (MACHINE_HAS_VX) {
 -              convert_vx_to_fp(fprs, current->thread.fpu.vxrs);
 +              convert_vx_to_fp(fprs, (__vector128 *) vcpu->run->s.regs.vrs);
                rc = write_guest_abs(vcpu, gpa + __LC_FPREGS_SAVE_AREA,
                                     fprs, 128);
        } else {
                rc = write_guest_abs(vcpu, gpa + __LC_FPREGS_SAVE_AREA,
-                                    vcpu->run->s.regs.vrs, 128);
+                                    vcpu->run->s.regs.fprs, 128);
        }
        rc |= write_guest_abs(vcpu, gpa + __LC_GPREGS_SAVE_AREA,
                              vcpu->run->s.regs.gprs, 128);
                              &vcpu->run->s.regs.fpc, 4);
        rc |= write_guest_abs(vcpu, gpa + __LC_TOD_PROGREG_SAVE_AREA,
                              &vcpu->arch.sie_block->todpr, 4);
+       cputm = kvm_s390_get_cpu_timer(vcpu);
        rc |= write_guest_abs(vcpu, gpa + __LC_CPU_TIMER_SAVE_AREA,
-                             &vcpu->arch.sie_block->cputm, 8);
+                             &cputm, 8);
        clkcomp = vcpu->arch.sie_block->ckc >> 8;
        rc |= write_guest_abs(vcpu, gpa + __LC_CLOCK_COMP_SAVE_AREA,
                              &clkcomp, 8);
@@@ -2605,7 -2712,8 +2712,8 @@@ static long kvm_s390_guest_mem_op(struc
        switch (mop->op) {
        case KVM_S390_MEMOP_LOGICAL_READ:
                if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
-                       r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, false);
+                       r = check_gva_range(vcpu, mop->gaddr, mop->ar,
+                                           mop->size, GACC_FETCH);
                        break;
                }
                r = read_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size);
                break;
        case KVM_S390_MEMOP_LOGICAL_WRITE:
                if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
-                       r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, true);
+                       r = check_gva_range(vcpu, mop->gaddr, mop->ar,
+                                           mop->size, GACC_STORE);
                        break;
                }
                if (copy_from_user(tmpbuf, uaddr, mop->size)) {
diff --combined arch/x86/kvm/lapic.c
index 3a045f39ed8114e24e375521135cb7d2296e9e7e,d9ae1ce2a6a03e0e8ebac52ea88c94dd913fe5f7..443d2a57ad3d9620246097a48ed3cd7de9e02f50
@@@ -281,7 -281,7 +281,7 @@@ void kvm_apic_set_version(struct kvm_vc
        struct kvm_cpuid_entry2 *feat;
        u32 v = APIC_VERSION;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
+       if (!lapic_in_kernel(vcpu))
                return;
  
        feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
@@@ -475,26 -475,20 +475,20 @@@ static inline void apic_clear_isr(int v
  
  int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
  {
-       int highest_irr;
        /* This may race with setting of irr in __apic_accept_irq() and
         * the value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq
         * will cause a vmexit immediately and the value will be recalculated
         * on the next vmentry.
         */
-       if (!kvm_vcpu_has_lapic(vcpu))
-               return 0;
-       highest_irr = apic_find_highest_irr(vcpu->arch.apic);
-       return highest_irr;
+       return apic_find_highest_irr(vcpu->arch.apic);
  }
  
  static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
                             int vector, int level, int trig_mode,
-                            unsigned long *dest_map);
+                            struct dest_map *dest_map);
  
  int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
-               unsigned long *dest_map)
+                    struct dest_map *dest_map)
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
  
@@@ -675,8 -669,33 +669,33 @@@ bool kvm_apic_match_dest(struct kvm_vcp
        }
  }
  
+ int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
+                      const unsigned long *bitmap, u32 bitmap_size)
+ {
+       u32 mod;
+       int i, idx = -1;
+       mod = vector % dest_vcpus;
+       for (i = 0; i <= mod; i++) {
+               idx = find_next_bit(bitmap, bitmap_size, idx + 1);
+               BUG_ON(idx == bitmap_size);
+       }
+       return idx;
+ }
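kvm_vector_to_index() above implements the vector-hashing policy described later in this file: hash the guest vector modulo the number of candidate destinations, then walk the destination bitmap to the (hash + 1)-th set bit. A self-contained sketch of that selection over a 16-entry logical map; pick_dest_by_vector_hash is a hypothetical name, not a kernel function:

    #include <stdint.h>

    /* Returns the chosen bit index in dest_bitmap, or -1 if it is empty. */
    static int pick_dest_by_vector_hash(uint32_t vector, uint16_t dest_bitmap)
    {
            int set_bits = __builtin_popcount(dest_bitmap);
            int target, i;

            if (!set_bits)
                    return -1;

            target = vector % set_bits;     /* 0-based rank of the wanted set bit */
            for (i = 0; i < 16; i++) {
                    if (!(dest_bitmap & (1u << i)))
                            continue;
                    if (target-- == 0)
                            return i;
            }
            return -1;                      /* not reached if popcount was correct */
    }

For example, with destinations {0, 2, 3} (bitmap 0x000d) and vector 33, 33 % 3 == 0, so bit 0 is selected.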
+ static void kvm_apic_disabled_lapic_found(struct kvm *kvm)
+ {
+       if (!kvm->arch.disabled_lapic_found) {
+               kvm->arch.disabled_lapic_found = true;
+               printk(KERN_INFO
+                      "Disabled LAPIC found during irq injection\n");
+       }
+ }
  bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
-               struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map)
+               struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map)
  {
        struct kvm_apic_map *map;
        unsigned long bitmap = 1;
  
                dst = map->logical_map[cid];
  
-               if (kvm_lowest_prio_delivery(irq)) {
+               if (!kvm_lowest_prio_delivery(irq))
+                       goto set_irq;
+               if (!kvm_vector_hashing_enabled()) {
                        int l = -1;
                        for_each_set_bit(i, &bitmap, 16) {
                                if (!dst[i])
                                        continue;
                                if (l < 0)
                                        l = i;
-                               else if (kvm_apic_compare_prio(dst[i]->vcpu, dst[l]->vcpu) < 0)
+                               else if (kvm_apic_compare_prio(dst[i]->vcpu,
+                                                       dst[l]->vcpu) < 0)
                                        l = i;
                        }
                        bitmap = (l >= 0) ? 1 << l : 0;
+               } else {
+                       int idx;
+                       unsigned int dest_vcpus;
+                       dest_vcpus = hweight16(bitmap);
+                       if (dest_vcpus == 0)
+                               goto out;
+                       idx = kvm_vector_to_index(irq->vector,
+                               dest_vcpus, &bitmap, 16);
+                       if (!dst[idx]) {
+                               kvm_apic_disabled_lapic_found(kvm);
+                               goto out;
+                       }
+                       bitmap = (idx >= 0) ? 1 << idx : 0;
                }
        }
  
+ set_irq:
        for_each_set_bit(i, &bitmap, 16) {
                if (!dst[i])
                        continue;
@@@ -754,6 -794,20 +794,20 @@@ out
        return ret;
  }
  
+ /*
+  * This routine tries to handle interrupts in posted mode; here is how
+  * it deals with different cases:
+  * - For a single-destination interrupt, handle it in posted mode
+  * - Else if vector hashing is enabled and it is a lowest-priority
+  *   interrupt, handle it in posted mode and use the following mechanism
+  *   to find the destination vCPU.
+  *    1. For lowest-priority interrupts, store all the possible
+  *       destination vCPUs in an array.
+  *    2. Use "guest vector % max number of destination vCPUs" to find
+  *       the right destination vCPU in the array for the lowest-priority
+  *       interrupt.
+  * - Otherwise, use remapped mode to inject the interrupt.
+  */
  bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
                        struct kvm_vcpu **dest_vcpu)
  {
                if (cid >= ARRAY_SIZE(map->logical_map))
                        goto out;
  
-               for_each_set_bit(i, &bitmap, 16) {
-                       dst = map->logical_map[cid][i];
-                       if (++r == 2)
+               if (kvm_vector_hashing_enabled() &&
+                               kvm_lowest_prio_delivery(irq)) {
+                       int idx;
+                       unsigned int dest_vcpus;
+                       dest_vcpus = hweight16(bitmap);
+                       if (dest_vcpus == 0)
                                goto out;
-               }
  
-               if (dst && kvm_apic_present(dst->vcpu))
+                       idx = kvm_vector_to_index(irq->vector, dest_vcpus,
+                                                 &bitmap, 16);
+                       dst = map->logical_map[cid][idx];
+                       if (!dst) {
+                               kvm_apic_disabled_lapic_found(kvm);
+                               goto out;
+                       }
                        *dest_vcpu = dst->vcpu;
-               else
-                       goto out;
+               } else {
+                       for_each_set_bit(i, &bitmap, 16) {
+                               dst = map->logical_map[cid][i];
+                               if (++r == 2)
+                                       goto out;
+                       }
+                       if (dst && kvm_apic_present(dst->vcpu))
+                               *dest_vcpu = dst->vcpu;
+                       else
+                               goto out;
+               }
        }
  
        ret = true;
@@@ -819,7 -894,7 +894,7 @@@ out
   */
  static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
                             int vector, int level, int trig_mode,
-                            unsigned long *dest_map)
+                            struct dest_map *dest_map)
  {
        int result = 0;
        struct kvm_vcpu *vcpu = apic->vcpu;
  
                result = 1;
  
-               if (dest_map)
-                       __set_bit(vcpu->vcpu_id, dest_map);
+               if (dest_map) {
+                       __set_bit(vcpu->vcpu_id, dest_map->map);
+                       dest_map->vectors[vcpu->vcpu_id] = vector;
+               }
  
                if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
                        if (trig_mode)
@@@ -1195,7 -1272,7 +1272,7 @@@ static void apic_update_lvtt(struct kvm
  static void apic_timer_expired(struct kvm_lapic *apic)
  {
        struct kvm_vcpu *vcpu = apic->vcpu;
 -      wait_queue_head_t *q = &vcpu->wq;
 +      struct swait_queue_head *q = &vcpu->wq;
        struct kvm_timer *ktimer = &apic->lapic_timer;
  
        if (atomic_read(&apic->lapic_timer.pending))
        atomic_inc(&apic->lapic_timer.pending);
        kvm_set_pending_timer(vcpu);
  
 -      if (waitqueue_active(q))
 -              wake_up_interruptible(q);
 +      if (swait_active(q))
 +              swake_up(q);
  
        if (apic_lvtt_tscdeadline(apic))
                ktimer->expired_tscdeadline = ktimer->tscdeadline;
@@@ -1239,7 -1316,7 +1316,7 @@@ void wait_lapic_expire(struct kvm_vcpu 
        struct kvm_lapic *apic = vcpu->arch.apic;
        u64 guest_tsc, tsc_deadline;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
+       if (!lapic_in_kernel(vcpu))
                return;
  
        if (apic->lapic_timer.expired_tscdeadline == 0)
@@@ -1515,8 -1592,7 +1592,7 @@@ static int apic_mmio_write(struct kvm_v
  
  void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
  {
-       if (kvm_vcpu_has_lapic(vcpu))
-               apic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
+       apic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
  }
  EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
  
@@@ -1566,7 -1642,7 +1642,7 @@@ u64 kvm_get_lapic_tscdeadline_msr(struc
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
  
-       if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
+       if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) ||
                        apic_lvtt_period(apic))
                return 0;
  
@@@ -1577,7 -1653,7 +1653,7 @@@ void kvm_set_lapic_tscdeadline_msr(stru
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
  
-       if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
+       if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) ||
                        apic_lvtt_period(apic))
                return;
  
@@@ -1590,9 -1666,6 +1666,6 @@@ void kvm_lapic_set_tpr(struct kvm_vcpu 
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
-               return;
        apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
                     | (kvm_apic_get_reg(apic, APIC_TASKPRI) & 4));
  }
@@@ -1601,9 -1674,6 +1674,6 @@@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *
  {
        u64 tpr;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
-               return 0;
        tpr = (u64) kvm_apic_get_reg(vcpu->arch.apic, APIC_TASKPRI);
  
        return (tpr & 0xf0) >> 4;
@@@ -1728,8 -1798,7 +1798,7 @@@ int apic_has_pending_timer(struct kvm_v
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
  
-       if (kvm_vcpu_has_lapic(vcpu) && apic_enabled(apic) &&
-                       apic_lvt_enabled(apic, APIC_LVTT))
+       if (apic_enabled(apic) && apic_lvt_enabled(apic, APIC_LVTT))
                return atomic_read(&apic->lapic_timer.pending);
  
        return 0;
@@@ -1826,7 -1895,7 +1895,7 @@@ int kvm_apic_has_interrupt(struct kvm_v
        struct kvm_lapic *apic = vcpu->arch.apic;
        int highest_irr;
  
-       if (!kvm_vcpu_has_lapic(vcpu) || !apic_enabled(apic))
+       if (!apic_enabled(apic))
                return -1;
  
        apic_update_ppr(apic);
@@@ -1854,9 -1923,6 +1923,6 @@@ void kvm_inject_apic_timer_irqs(struct 
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
-               return;
        if (atomic_read(&apic->lapic_timer.pending) > 0) {
                kvm_apic_local_deliver(apic, APIC_LVTT);
                if (apic_lvtt_tscdeadline(apic))
@@@ -1932,7 -1998,7 +1998,7 @@@ void __kvm_migrate_apic_timer(struct kv
  {
        struct hrtimer *timer;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
+       if (!lapic_in_kernel(vcpu))
                return;
  
        timer = &vcpu->arch.apic->lapic_timer.timer;
@@@ -2105,7 -2171,7 +2171,7 @@@ int kvm_hv_vapic_msr_write(struct kvm_v
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
+       if (!lapic_in_kernel(vcpu))
                return 1;
  
        /* if this is ICR write vector before command */
@@@ -2119,7 -2185,7 +2185,7 @@@ int kvm_hv_vapic_msr_read(struct kvm_vc
        struct kvm_lapic *apic = vcpu->arch.apic;
        u32 low, high = 0;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
+       if (!lapic_in_kernel(vcpu))
                return 1;
  
        if (apic_reg_read(apic, reg, 4, &low))
@@@ -2151,7 -2217,7 +2217,7 @@@ void kvm_apic_accept_events(struct kvm_
        u8 sipi_vector;
        unsigned long pe;
  
-       if (!kvm_vcpu_has_lapic(vcpu) || !apic->pending_events)
+       if (!lapic_in_kernel(vcpu) || !apic->pending_events)
                return;
  
        /*
diff --combined arch/x86/kvm/mmu.c
index 1e7a49bfc94fb323cbb11782693cab89743f1133,2463de0b935cea06967faa9a56c17a65eede15fb..c512f095cdac82b9e2ba258ae052a9a4199dc13c
@@@ -41,6 -41,7 +41,7 @@@
  #include <asm/cmpxchg.h>
  #include <asm/io.h>
  #include <asm/vmx.h>
+ #include <asm/kvm_page_track.h>
  
  /*
   * When setting this variable to true it enables Two-Dimensional-Paging
@@@ -776,62 -777,85 +777,85 @@@ static struct kvm_lpage_info *lpage_inf
        return &slot->arch.lpage_info[level - 2][idx];
  }
  
+ static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
+                                           gfn_t gfn, int count)
+ {
+       struct kvm_lpage_info *linfo;
+       int i;
+       for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
+               linfo = lpage_info_slot(gfn, slot, i);
+               linfo->disallow_lpage += count;
+               WARN_ON(linfo->disallow_lpage < 0);
+       }
+ }
+ void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+ {
+       update_gfn_disallow_lpage_count(slot, gfn, 1);
+ }
+ void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+ {
+       update_gfn_disallow_lpage_count(slot, gfn, -1);
+ }
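account_shadowed() below bumps this counter directly for last-level shadow pages (for upper levels it goes through the new write-tracking machinery, which is expected to do the same), and a large mapping at a gfn is permitted only while its disallow_lpage count is zero. A simplified model of that bookkeeping, assuming x86's 9 address bits per paging level and using purely illustrative names and array sizes:

    #include <assert.h>
    #include <stdint.h>

    #define MAX_LEVEL       3               /* 1 = 4K, 2 = 2M, 3 = 1G in this sketch */
    #define SLOT_GFNS       (1u << 12)      /* gfns covered by the toy memslot */

    static int disallow_lpage[MAX_LEVEL + 1][SLOT_GFNS];

    /* index of the large frame that contains gfn at the given level */
    static unsigned int lpage_index(uint64_t gfn, int level)
    {
            return (unsigned int)(gfn >> (9 * (level - 1)));
    }

    static void update_disallow_lpage(uint64_t gfn, int count)
    {
            for (int level = 2; level <= MAX_LEVEL; level++) {
                    disallow_lpage[level][lpage_index(gfn, level)] += count;
                    assert(disallow_lpage[level][lpage_index(gfn, level)] >= 0);
            }
    }

    static int lpage_allowed(uint64_t gfn, int level)
    {
            return disallow_lpage[level][lpage_index(gfn, level)] == 0;
    }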
  static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
  {
        struct kvm_memslots *slots;
        struct kvm_memory_slot *slot;
-       struct kvm_lpage_info *linfo;
        gfn_t gfn;
-       int i;
  
+       kvm->arch.indirect_shadow_pages++;
        gfn = sp->gfn;
        slots = kvm_memslots_for_spte_role(kvm, sp->role);
        slot = __gfn_to_memslot(slots, gfn);
-       for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
-               linfo = lpage_info_slot(gfn, slot, i);
-               linfo->write_count += 1;
-       }
-       kvm->arch.indirect_shadow_pages++;
+       /* the non-leaf shadow pages are kept read-only. */
+       if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+               return kvm_slot_page_track_add_page(kvm, slot, gfn,
+                                                   KVM_PAGE_TRACK_WRITE);
+       kvm_mmu_gfn_disallow_lpage(slot, gfn);
  }
  
  static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
  {
        struct kvm_memslots *slots;
        struct kvm_memory_slot *slot;
-       struct kvm_lpage_info *linfo;
        gfn_t gfn;
-       int i;
  
+       kvm->arch.indirect_shadow_pages--;
        gfn = sp->gfn;
        slots = kvm_memslots_for_spte_role(kvm, sp->role);
        slot = __gfn_to_memslot(slots, gfn);
-       for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
-               linfo = lpage_info_slot(gfn, slot, i);
-               linfo->write_count -= 1;
-               WARN_ON(linfo->write_count < 0);
-       }
-       kvm->arch.indirect_shadow_pages--;
+       if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+               return kvm_slot_page_track_remove_page(kvm, slot, gfn,
+                                                      KVM_PAGE_TRACK_WRITE);
+       kvm_mmu_gfn_allow_lpage(slot, gfn);
  }
  
- static int __has_wrprotected_page(gfn_t gfn, int level,
-                                 struct kvm_memory_slot *slot)
+ static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
+                                         struct kvm_memory_slot *slot)
  {
        struct kvm_lpage_info *linfo;
  
        if (slot) {
                linfo = lpage_info_slot(gfn, slot, level);
-               return linfo->write_count;
+               return !!linfo->disallow_lpage;
        }
  
-       return 1;
+       return true;
  }
  
- static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
+ static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
+                                       int level)
  {
        struct kvm_memory_slot *slot;
  
        slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
-       return __has_wrprotected_page(gfn, level, slot);
+       return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
  }
  
  static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
@@@ -897,7 -921,7 +921,7 @@@ static int mapping_level(struct kvm_vcp
        max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
  
        for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
-               if (__has_wrprotected_page(large_gfn, level, slot))
+               if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
                        break;
  
        return level - 1;
@@@ -1323,23 -1347,29 +1347,29 @@@ void kvm_arch_mmu_enable_log_dirty_pt_m
                kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
  }
  
- static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
+                                   struct kvm_memory_slot *slot, u64 gfn)
  {
-       struct kvm_memory_slot *slot;
        struct kvm_rmap_head *rmap_head;
        int i;
        bool write_protected = false;
  
-       slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
        for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
                rmap_head = __gfn_to_rmap(gfn, i, slot);
-               write_protected |= __rmap_write_protect(vcpu->kvm, rmap_head, true);
+               write_protected |= __rmap_write_protect(kvm, rmap_head, true);
        }
  
        return write_protected;
  }
  
+ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+ {
+       struct kvm_memory_slot *slot;
+       slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+       return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
+ }
  static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
  {
        u64 *sptep;
@@@ -1754,7 -1784,7 +1784,7 @@@ static void mark_unsync(u64 *spte
  static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
                               struct kvm_mmu_page *sp)
  {
-       return 1;
+       return 0;
  }
  
  static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
@@@ -1840,13 -1870,16 +1870,16 @@@ static int __mmu_unsync_walk(struct kvm
        return nr_unsync_leaf;
  }
  
+ #define INVALID_INDEX (-1)
  static int mmu_unsync_walk(struct kvm_mmu_page *sp,
                           struct kvm_mmu_pages *pvec)
  {
+       pvec->nr = 0;
        if (!sp->unsync_children)
                return 0;
  
-       mmu_pages_add(pvec, sp, 0);
+       mmu_pages_add(pvec, sp, INVALID_INDEX);
        return __mmu_unsync_walk(sp, pvec);
  }
  
@@@ -1883,37 -1916,35 +1916,35 @@@ static void kvm_mmu_commit_zap_page(str
                if ((_sp)->role.direct || (_sp)->role.invalid) {} else
  
  /* @sp->gfn should be write-protected at the call site */
- static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-                          struct list_head *invalid_list, bool clear_unsync)
+ static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+                           struct list_head *invalid_list)
  {
        if (sp->role.cr4_pae != !!is_pae(vcpu)) {
                kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
-               return 1;
+               return false;
        }
  
-       if (clear_unsync)
-               kvm_unlink_unsync_page(vcpu->kvm, sp);
-       if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
+       if (vcpu->arch.mmu.sync_page(vcpu, sp) == 0) {
                kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
-               return 1;
+               return false;
        }
  
-       kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
-       return 0;
+       return true;
  }
  
- static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
-                                  struct kvm_mmu_page *sp)
+ static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
+                                struct list_head *invalid_list,
+                                bool remote_flush, bool local_flush)
  {
-       LIST_HEAD(invalid_list);
-       int ret;
-       ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
-       if (ret)
-               kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+       if (!list_empty(invalid_list)) {
+               kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list);
+               return;
+       }
  
-       return ret;
+       if (remote_flush)
+               kvm_flush_remote_tlbs(vcpu->kvm);
+       else if (local_flush)
+               kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
  }
  
  #ifdef CONFIG_KVM_MMU_AUDIT
@@@ -1923,46 -1954,38 +1954,38 @@@ static void kvm_mmu_audit(struct kvm_vc
  static void mmu_audit_disable(void) { }
  #endif
  
- static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                         struct list_head *invalid_list)
  {
-       return __kvm_sync_page(vcpu, sp, invalid_list, true);
+       kvm_unlink_unsync_page(vcpu->kvm, sp);
+       return __kvm_sync_page(vcpu, sp, invalid_list);
  }
  
  /* @gfn should be write-protected at the call site */
- static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
+ static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
+                          struct list_head *invalid_list)
  {
        struct kvm_mmu_page *s;
-       LIST_HEAD(invalid_list);
-       bool flush = false;
+       bool ret = false;
  
        for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
                if (!s->unsync)
                        continue;
  
                WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
-               kvm_unlink_unsync_page(vcpu->kvm, s);
-               if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
-                       (vcpu->arch.mmu.sync_page(vcpu, s))) {
-                       kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
-                       continue;
-               }
-               flush = true;
+               ret |= kvm_sync_page(vcpu, s, invalid_list);
        }
  
-       kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
-       if (flush)
-               kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+       return ret;
  }
  
  struct mmu_page_path {
-       struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
-       unsigned int idx[PT64_ROOT_LEVEL-1];
+       struct kvm_mmu_page *parent[PT64_ROOT_LEVEL];
+       unsigned int idx[PT64_ROOT_LEVEL];
  };
  
  #define for_each_sp(pvec, sp, parents, i)                     \
-               for (i = mmu_pages_next(&pvec, &parents, -1),   \
-                       sp = pvec.page[i].sp;                   \
+               for (i = mmu_pages_first(&pvec, &parents);      \
                        i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});   \
                        i = mmu_pages_next(&pvec, &parents, i))
  
@@@ -1974,19 -1997,43 +1997,43 @@@ static int mmu_pages_next(struct kvm_mm
  
        for (n = i+1; n < pvec->nr; n++) {
                struct kvm_mmu_page *sp = pvec->page[n].sp;
+               unsigned idx = pvec->page[n].idx;
+               int level = sp->role.level;
  
-               if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
-                       parents->idx[0] = pvec->page[n].idx;
-                       return n;
-               }
+               parents->idx[level-1] = idx;
+               if (level == PT_PAGE_TABLE_LEVEL)
+                       break;
  
-               parents->parent[sp->role.level-2] = sp;
-               parents->idx[sp->role.level-1] = pvec->page[n].idx;
+               parents->parent[level-2] = sp;
        }
  
        return n;
  }
  
+ static int mmu_pages_first(struct kvm_mmu_pages *pvec,
+                          struct mmu_page_path *parents)
+ {
+       struct kvm_mmu_page *sp;
+       int level;
+       if (pvec->nr == 0)
+               return 0;
+       WARN_ON(pvec->page[0].idx != INVALID_INDEX);
+       sp = pvec->page[0].sp;
+       level = sp->role.level;
+       WARN_ON(level == PT_PAGE_TABLE_LEVEL);
+       parents->parent[level-2] = sp;
+       /* Also set up a sentinel.  Further entries in pvec are all
+        * children of sp, so this element is never overwritten.
+        */
+       parents->parent[level-1] = NULL;
+       return mmu_pages_next(pvec, parents, 0);
+ }
  static void mmu_pages_clear_parents(struct mmu_page_path *parents)
  {
        struct kvm_mmu_page *sp;
  
        do {
                unsigned int idx = parents->idx[level];
                sp = parents->parent[level];
                if (!sp)
                        return;
  
+               WARN_ON(idx == INVALID_INDEX);
                clear_unsync_child_bit(sp, idx);
                level++;
-       } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
- }
- static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
-                              struct mmu_page_path *parents,
-                              struct kvm_mmu_pages *pvec)
- {
-       parents->parent[parent->role.level-1] = NULL;
-       pvec->nr = 0;
+       } while (!sp->unsync_children);
  }
  
  static void mmu_sync_children(struct kvm_vcpu *vcpu,
        struct mmu_page_path parents;
        struct kvm_mmu_pages pages;
        LIST_HEAD(invalid_list);
+       bool flush = false;
  
-       kvm_mmu_pages_init(parent, &parents, &pages);
        while (mmu_unsync_walk(parent, &pages)) {
                bool protected = false;
  
                for_each_sp(pages, sp, parents, i)
                        protected |= rmap_write_protect(vcpu, sp->gfn);
  
-               if (protected)
+               if (protected) {
                        kvm_flush_remote_tlbs(vcpu->kvm);
+                       flush = false;
+               }
  
                for_each_sp(pages, sp, parents, i) {
-                       kvm_sync_page(vcpu, sp, &invalid_list);
+                       flush |= kvm_sync_page(vcpu, sp, &invalid_list);
                        mmu_pages_clear_parents(&parents);
                }
-               kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
-               cond_resched_lock(&vcpu->kvm->mmu_lock);
-               kvm_mmu_pages_init(parent, &parents, &pages);
+               if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
+                       kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+                       cond_resched_lock(&vcpu->kvm->mmu_lock);
+                       flush = false;
+               }
        }
+       kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
  }
  
  static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
  {
-       sp->write_flooding_count = 0;
+       atomic_set(&sp->write_flooding_count,  0);
  }
  
  static void clear_sp_write_flooding_count(u64 *spte)
@@@ -2069,6 -2114,8 +2114,8 @@@ static struct kvm_mmu_page *kvm_mmu_get
        unsigned quadrant;
        struct kvm_mmu_page *sp;
        bool need_sync = false;
+       bool flush = false;
+       LIST_HEAD(invalid_list);
  
        role = vcpu->arch.mmu.base_role;
        role.level = level;
                if (sp->role.word != role.word)
                        continue;
  
-               if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
-                       break;
+               if (sp->unsync) {
+                       /* The page is good, but __kvm_sync_page might still end
+                        * up zapping it.  If so, break in order to rebuild it.
+                        */
+                       if (!__kvm_sync_page(vcpu, sp, &invalid_list))
+                               break;
+                       WARN_ON(!list_empty(&invalid_list));
+                       kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+               }
  
                if (sp->unsync_children)
                        kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
        hlist_add_head(&sp->hash_link,
                &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
        if (!direct) {
-               if (rmap_write_protect(vcpu, gfn))
+               /*
+                * We should write-protect the gfn before syncing pages;
+                * otherwise the content of the synced shadow page may
+                * be inconsistent with the guest page table.
+                */
+               account_shadowed(vcpu->kvm, sp);
+               if (level == PT_PAGE_TABLE_LEVEL &&
+                     rmap_write_protect(vcpu, gfn))
                        kvm_flush_remote_tlbs(vcpu->kvm);
-               if (level > PT_PAGE_TABLE_LEVEL && need_sync)
-                       kvm_sync_pages(vcpu, gfn);
  
-               account_shadowed(vcpu->kvm, sp);
+               if (level > PT_PAGE_TABLE_LEVEL && need_sync)
+                       flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
        }
        sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
        clear_page(sp->spt);
        trace_kvm_mmu_get_page(sp, true);
+       kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
        return sp;
  }
  
@@@ -2269,7 -2332,6 +2332,6 @@@ static int mmu_zap_unsync_children(stru
        if (parent->role.level == PT_PAGE_TABLE_LEVEL)
                return 0;
  
-       kvm_mmu_pages_init(parent, &parents, &pages);
        while (mmu_unsync_walk(parent, &pages)) {
                struct kvm_mmu_page *sp;
  
                        mmu_pages_clear_parents(&parents);
                        zapped++;
                }
-               kvm_mmu_pages_init(parent, &parents, &pages);
        }
  
        return zapped;
@@@ -2354,8 -2415,8 +2415,8 @@@ static bool prepare_zap_oldest_mmu_page
        if (list_empty(&kvm->arch.active_mmu_pages))
                return false;
  
-       sp = list_entry(kvm->arch.active_mmu_pages.prev,
-                       struct kvm_mmu_page, link);
+       sp = list_last_entry(&kvm->arch.active_mmu_pages,
+                            struct kvm_mmu_page, link);
        kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
  
        return true;
@@@ -2408,7 -2469,7 +2469,7 @@@ int kvm_mmu_unprotect_page(struct kvm *
  }
  EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
  
- static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
  {
        trace_kvm_mmu_unsync_page(sp);
        ++vcpu->kvm->stat.mmu_unsync;
        kvm_mmu_mark_parents_unsync(sp);
  }
  
- static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
+ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
+                                  bool can_unsync)
  {
-       struct kvm_mmu_page *s;
-       for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
-               if (s->unsync)
-                       continue;
-               WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
-               __kvm_unsync_page(vcpu, s);
-       }
- }
+       struct kvm_mmu_page *sp;
  
- static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
-                                 bool can_unsync)
- {
-       struct kvm_mmu_page *s;
-       bool need_unsync = false;
+       if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+               return true;
  
-       for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
+       for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
                if (!can_unsync)
-                       return 1;
+                       return true;
  
-               if (s->role.level != PT_PAGE_TABLE_LEVEL)
-                       return 1;
+               if (sp->unsync)
+                       continue;
  
-               if (!s->unsync)
-                       need_unsync = true;
+               WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
+               kvm_unsync_page(vcpu, sp);
        }
-       if (need_unsync)
-               kvm_unsync_pages(vcpu, gfn);
-       return 0;
+       return false;
  }
  
  static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
@@@ -2503,7 -2553,7 +2553,7 @@@ static int set_spte(struct kvm_vcpu *vc
                 * be fixed if guest refault.
                 */
                if (level > PT_PAGE_TABLE_LEVEL &&
-                   has_wrprotected_page(vcpu, gfn, level))
+                   mmu_gfn_lpage_is_disallowed(vcpu, gfn, level))
                        goto done;
  
                spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
@@@ -2768,7 -2818,7 +2818,7 @@@ static void transparent_hugepage_adjust
        if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
            level == PT_PAGE_TABLE_LEVEL &&
            PageTransCompound(pfn_to_page(pfn)) &&
-           !has_wrprotected_page(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
+           !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
                unsigned long mask;
                /*
                 * mmu_notifier_retry was successful and we hold the
  static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
                                kvm_pfn_t pfn, unsigned access, int *ret_val)
  {
-       bool ret = true;
        /* The pfn is invalid, report the error! */
        if (unlikely(is_error_pfn(pfn))) {
                *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
-               goto exit;
+               return true;
        }
  
        if (unlikely(is_noslot_pfn(pfn)))
                vcpu_cache_mmio_info(vcpu, gva, gfn, access);
  
-       ret = false;
- exit:
-       return ret;
+       return false;
  }
  
  static bool page_fault_can_be_fast(u32 error_code)
@@@ -3273,7 -3319,7 +3319,7 @@@ static bool is_shadow_zero_bits_set(str
        return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
  }
  
- static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
  {
        if (direct)
                return vcpu_match_mmio_gpa(vcpu, addr);
@@@ -3332,7 -3378,7 +3378,7 @@@ int handle_mmio_page_fault(struct kvm_v
        u64 spte;
        bool reserved;
  
-       if (quickly_check_mmio_pf(vcpu, addr, direct))
+       if (mmio_info_in_cache(vcpu, addr, direct))
                return RET_MMIO_PF_EMULATE;
  
        reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
  }
  EXPORT_SYMBOL_GPL(handle_mmio_page_fault);
  
+ static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
+                                        u32 error_code, gfn_t gfn)
+ {
+       if (unlikely(error_code & PFERR_RSVD_MASK))
+               return false;
+       if (!(error_code & PFERR_PRESENT_MASK) ||
+             !(error_code & PFERR_WRITE_MASK))
+               return false;
+       /*
+        * The guest is writing to a write-tracked page; such faults cannot
+        * be fixed by the page fault handler.
+        */
+       if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+               return true;
+       return false;
+ }
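page_fault_handle_page_track() only short-circuits ordinary write faults: reserved-bit faults go to the MMIO path, and non-present or read faults can still be fixed by the shadow MMU. Assuming the standard x86 page-fault error-code bits (bit 0 = present, bit 1 = write, bit 3 = reserved), the filter can be modelled as the following sketch, where the names are illustrative:

    #include <stdbool.h>
    #include <stdint.h>

    #define PF_PRESENT      (1u << 0)
    #define PF_WRITE        (1u << 1)
    #define PF_RSVD         (1u << 3)

    /* true if the fault must be emulated because the gfn is write-tracked */
    static bool tracked_gfn_needs_emulation(uint32_t error_code, bool write_tracked)
    {
            if (error_code & PF_RSVD)                       /* MMIO, not tracking */
                    return false;
            if (!(error_code & PF_PRESENT) || !(error_code & PF_WRITE))
                    return false;                           /* MMU can fix this one */
            return write_tracked;
    }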
+ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
+ {
+       struct kvm_shadow_walk_iterator iterator;
+       u64 spte;
+       if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+               return;
+       walk_shadow_page_lockless_begin(vcpu);
+       for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
+               clear_sp_write_flooding_count(iterator.sptep);
+               if (!is_shadow_present_pte(spte))
+                       break;
+       }
+       walk_shadow_page_lockless_end(vcpu);
+ }
  static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
                                u32 error_code, bool prefault)
  {
-       gfn_t gfn;
+       gfn_t gfn = gva >> PAGE_SHIFT;
        int r;
  
        pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
  
-       if (unlikely(error_code & PFERR_RSVD_MASK)) {
-               r = handle_mmio_page_fault(vcpu, gva, true);
-               if (likely(r != RET_MMIO_PF_INVALID))
-                       return r;
-       }
+       if (page_fault_handle_page_track(vcpu, error_code, gfn))
+               return 1;
  
        r = mmu_topup_memory_caches(vcpu);
        if (r)
  
        MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
  
-       gfn = gva >> PAGE_SHIFT;
  
        return nonpaging_map(vcpu, gva & PAGE_MASK,
                             error_code, gfn, prefault);
@@@ -3460,12 -3538,8 +3538,8 @@@ static int tdp_page_fault(struct kvm_vc
  
        MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
  
-       if (unlikely(error_code & PFERR_RSVD_MASK)) {
-               r = handle_mmio_page_fault(vcpu, gpa, true);
-               if (likely(r != RET_MMIO_PF_INVALID))
-                       return r;
-       }
+       if (page_fault_handle_page_track(vcpu, error_code, gfn))
+               return 1;
  
        r = mmu_topup_memory_caches(vcpu);
        if (r)
@@@ -3558,13 -3632,24 +3632,24 @@@ static bool sync_mmio_spte(struct kvm_v
        return false;
  }
  
- static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
+ static inline bool is_last_gpte(struct kvm_mmu *mmu,
+                               unsigned level, unsigned gpte)
  {
-       unsigned index;
+       /*
+        * PT_PAGE_TABLE_LEVEL always terminates.  The RHS has bit 7 set
+        * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
+        * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
+        */
+       gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
  
-       index = level - 1;
-       index |= (gpte & PT_PAGE_SIZE_MASK) >> (PT_PAGE_SIZE_SHIFT - 2);
-       return mmu->last_pte_bitmap & (1 << index);
+       /*
+        * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
+        * If it is clear, there are no large pages at this level, so clear
+        * PT_PAGE_SIZE_MASK in gpte if that is the case.
+        */
+       gpte &= level - mmu->last_nonleaf_level;
+       return gpte & PT_PAGE_SIZE_MASK;
  }
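The replacement above drops the old last_pte_bitmap in favour of two arithmetic tricks: since PT_PAGE_TABLE_LEVEL is 1 and PT_PAGE_SIZE_MASK is bit 7, "level - PT_PAGE_TABLE_LEVEL - 1" underflows so that bit 7 is forced on exactly when level == 1, and "level - last_nonleaf_level" keeps bit 7 set exactly when level is below the last non-leaf level, clearing the PS bit otherwise. The stand-alone check below exhaustively compares the branchless form with a naive version; the constants mirror the kernel's definitions but are restated here as assumptions.

    #include <assert.h>
    #include <stdbool.h>

    #define PT_PAGE_TABLE_LEVEL  1
    #define PT_PAGE_SIZE_MASK    (1u << 7)   /* PS bit in a guest PTE/PDE */

    static bool is_last_gpte_branchless(unsigned level, unsigned last_nonleaf, unsigned gpte)
    {
            gpte |= level - PT_PAGE_TABLE_LEVEL - 1;  /* bit 7 forced on iff level == 1 */
            gpte &= level - last_nonleaf;             /* bit 7 kept iff level < last_nonleaf */
            return gpte & PT_PAGE_SIZE_MASK;
    }

    static bool is_last_gpte_naive(unsigned level, unsigned last_nonleaf, unsigned gpte)
    {
            if (level == PT_PAGE_TABLE_LEVEL)
                    return true;                      /* 4K PTEs always terminate */
            if (level >= last_nonleaf)
                    return false;                     /* no large pages at or above this level */
            return gpte & PT_PAGE_SIZE_MASK;
    }

    int main(void)
    {
            for (unsigned last_nonleaf = 2; last_nonleaf <= 4; last_nonleaf++)
                    for (unsigned level = 1; level <= 4; level++)
                            for (unsigned gpte = 0; gpte < 256; gpte++)
                                    assert(is_last_gpte_branchless(level, last_nonleaf, gpte) ==
                                           is_last_gpte_naive(level, last_nonleaf, gpte));
            return 0;
    }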
  
  #define PTTYPE_EPT 18 /* arbitrary */
@@@ -3721,15 -3806,13 +3806,15 @@@ static void reset_rsvds_bits_mask_ept(s
  void
  reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
  {
 +      bool uses_nx = context->nx || context->base_role.smep_andnot_wp;
 +
        /*
         * Passing "true" to the last argument is okay; it adds a check
         * on bit 8 of the SPTEs which KVM doesn't use anyway.
         */
        __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check,
                                boot_cpu_data.x86_phys_bits,
 -                              context->shadow_root_level, context->nx,
 +                              context->shadow_root_level, uses_nx,
                                guest_cpuid_has_gbpages(vcpu), is_pse(vcpu),
                                true);
  }
@@@ -3838,22 -3921,13 +3923,13 @@@ static void update_permission_bitmask(s
        }
  }
  
- static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+ static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
  {
-       u8 map;
-       unsigned level, root_level = mmu->root_level;
-       const unsigned ps_set_index = 1 << 2;  /* bit 2 of index: ps */
-       if (root_level == PT32E_ROOT_LEVEL)
-               --root_level;
-       /* PT_PAGE_TABLE_LEVEL always terminates */
-       map = 1 | (1 << ps_set_index);
-       for (level = PT_DIRECTORY_LEVEL; level <= root_level; ++level) {
-               if (level <= PT_PDPE_LEVEL
-                   && (mmu->root_level >= PT32E_ROOT_LEVEL || is_pse(vcpu)))
-                       map |= 1 << (ps_set_index | (level - 1));
-       }
-       mmu->last_pte_bitmap = map;
+       unsigned root_level = mmu->root_level;
+       mmu->last_nonleaf_level = root_level;
+       if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
+               mmu->last_nonleaf_level++;
  }
  
  static void paging64_init_context_common(struct kvm_vcpu *vcpu,
  
        reset_rsvds_bits_mask(vcpu, context);
        update_permission_bitmask(vcpu, context, false);
-       update_last_pte_bitmap(vcpu, context);
+       update_last_nonleaf_level(vcpu, context);
  
        MMU_WARN_ON(!is_pae(vcpu));
        context->page_fault = paging64_page_fault;
@@@ -3892,7 -3966,7 +3968,7 @@@ static void paging32_init_context(struc
  
        reset_rsvds_bits_mask(vcpu, context);
        update_permission_bitmask(vcpu, context, false);
-       update_last_pte_bitmap(vcpu, context);
+       update_last_nonleaf_level(vcpu, context);
  
        context->page_fault = paging32_page_fault;
        context->gva_to_gpa = paging32_gva_to_gpa;
@@@ -3950,7 -4024,7 +4026,7 @@@ static void init_kvm_tdp_mmu(struct kvm
        }
  
        update_permission_bitmask(vcpu, context, false);
-       update_last_pte_bitmap(vcpu, context);
+       update_last_nonleaf_level(vcpu, context);
        reset_tdp_shadow_zero_bits_mask(vcpu, context);
  }
  
@@@ -4056,7 -4130,7 +4132,7 @@@ static void init_kvm_nested_mmu(struct 
        }
  
        update_permission_bitmask(vcpu, g_context, false);
-       update_last_pte_bitmap(vcpu, g_context);
+       update_last_nonleaf_level(vcpu, g_context);
  }
  
  static void init_kvm_mmu(struct kvm_vcpu *vcpu)
@@@ -4127,18 -4201,6 +4203,6 @@@ static bool need_remote_flush(u64 old, 
        return (old & ~new & PT64_PERM_MASK) != 0;
  }
  
- static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
-                                   bool remote_flush, bool local_flush)
- {
-       if (zap_page)
-               return;
-       if (remote_flush)
-               kvm_flush_remote_tlbs(vcpu->kvm);
-       else if (local_flush)
-               kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
- }
  static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
                                    const u8 *new, int *bytes)
  {
@@@ -4188,7 -4250,8 +4252,8 @@@ static bool detect_write_flooding(struc
        if (sp->role.level == PT_PAGE_TABLE_LEVEL)
                return false;
  
-       return ++sp->write_flooding_count >= 3;
+       atomic_inc(&sp->write_flooding_count);
+       return atomic_read(&sp->write_flooding_count) >= 3;
  }
  
  /*
@@@ -4250,15 -4313,15 +4315,15 @@@ static u64 *get_written_sptes(struct kv
        return spte;
  }
  
- void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-                      const u8 *new, int bytes)
static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+                             const u8 *new, int bytes)
  {
        gfn_t gfn = gpa >> PAGE_SHIFT;
        struct kvm_mmu_page *sp;
        LIST_HEAD(invalid_list);
        u64 entry, gentry, *spte;
        int npte;
-       bool remote_flush, local_flush, zap_page;
+       bool remote_flush, local_flush;
        union kvm_mmu_page_role mask = { };
  
        mask.cr0_wp = 1;
        if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
                return;
  
-       zap_page = remote_flush = local_flush = false;
+       remote_flush = local_flush = false;
  
        pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
  
        for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
                if (detect_write_misaligned(sp, gpa, bytes) ||
                      detect_write_flooding(sp)) {
-                       zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
-                                                    &invalid_list);
+                       kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
                        ++vcpu->kvm->stat.mmu_flooded;
                        continue;
                }
                        ++spte;
                }
        }
-       mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
-       kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+       kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
        kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
        spin_unlock(&vcpu->kvm->mmu_lock);
  }
@@@ -4356,32 -4417,34 +4419,34 @@@ static void make_mmu_pages_available(st
        kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
  }
  
- static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr)
- {
-       if (vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu))
-               return vcpu_match_mmio_gpa(vcpu, addr);
-       return vcpu_match_mmio_gva(vcpu, addr);
- }
  int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
                       void *insn, int insn_len)
  {
        int r, emulation_type = EMULTYPE_RETRY;
        enum emulation_result er;
+       bool direct = vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu);
+       if (unlikely(error_code & PFERR_RSVD_MASK)) {
+               r = handle_mmio_page_fault(vcpu, cr2, direct);
+               if (r == RET_MMIO_PF_EMULATE) {
+                       emulation_type = 0;
+                       goto emulate;
+               }
+               if (r == RET_MMIO_PF_RETRY)
+                       return 1;
+               if (r < 0)
+                       return r;
+       }
  
        r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
        if (r < 0)
-               goto out;
-       if (!r) {
-               r = 1;
-               goto out;
-       }
+               return r;
+       if (!r)
+               return 1;
  
-       if (is_mmio_page_fault(vcpu, cr2))
+       if (mmio_info_in_cache(vcpu, cr2, direct))
                emulation_type = 0;
+ emulate:
        er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
  
        switch (er) {
        default:
                BUG();
        }
- out:
-       return r;
  }
  EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
  
@@@ -4465,6 -4526,21 +4528,21 @@@ void kvm_mmu_setup(struct kvm_vcpu *vcp
        init_kvm_mmu(vcpu);
  }
  
+ void kvm_mmu_init_vm(struct kvm *kvm)
+ {
+       struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+ 
+       node->track_write = kvm_mmu_pte_write;
+       kvm_page_track_register_notifier(kvm, node);
+ }
+ 
+ void kvm_mmu_uninit_vm(struct kvm *kvm)
+ {
+       struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+ 
+       kvm_page_track_unregister_notifier(kvm, node);
+ }
+ 
  /* The return value indicates if tlb flush on all vcpus is needed. */
  typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
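
kvm_mmu_init_vm() and kvm_mmu_uninit_vm() above attach the shadow MMU to the new page-track notifier list, so a write to a tracked guest page is fanned out to whoever registered a callback instead of being hard-wired to kvm_mmu_pte_write(). A rough userspace sketch of that register/notify/unregister shape (every name below is invented; this is not the kvm_page_track API):

    #include <stdio.h>

    struct track_node {
            void (*track_write)(unsigned long gpa, const void *data, int bytes);
            struct track_node *next;
    };

    static struct track_node *track_head;   /* stand-in for the per-VM notifier list */

    static void track_register(struct track_node *n)
    {
            n->next = track_head;
            track_head = n;
    }

    static void track_unregister(struct track_node *n)
    {
            for (struct track_node **p = &track_head; *p; p = &(*p)->next)
                    if (*p == n) {
                            *p = n->next;
                            return;
                    }
    }

    /* Conceptually what a tracked write does: notify every listener. */
    static void track_write_event(unsigned long gpa, const void *data, int bytes)
    {
            for (struct track_node *n = track_head; n; n = n->next)
                    n->track_write(gpa, data, bytes);
    }

    static void mmu_pte_write(unsigned long gpa, const void *data, int bytes)
    {
            printf("MMU notified: %d-byte write at gpa 0x%lx\n", bytes, gpa);
    }

    int main(void)
    {
            struct track_node mmu = { .track_write = mmu_pte_write };

            track_register(&mmu);
            track_write_event(0x1000, "x", 1);
            track_unregister(&mmu);
            return 0;
    }
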
  
diff --combined arch/x86/kvm/vmx.c
index 9bd8f44baded2318e8b5bc2a4773068a45a8723b,e512aa7ed8748ff8de66c053348572ce574ac756..5e45c2731a5d60ba6a9adfc58841ca075cd90834
@@@ -596,8 -596,6 +596,8 @@@ struct vcpu_vmx 
        /* Support for PML */
  #define PML_ENTITY_NUM                512
        struct page *pml_pg;
 +
 +      u64 current_tsc_ratio;
  };
  
  enum segment_cache_field {
@@@ -863,7 -861,6 +863,6 @@@ static unsigned long nested_ept_get_cr3
  static u64 construct_eptp(unsigned long root_hpa);
  static void kvm_cpu_vmxon(u64 addr);
  static void kvm_cpu_vmxoff(void);
- static bool vmx_mpx_supported(void);
  static bool vmx_xsaves_supported(void);
  static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
  static void vmx_set_segment(struct kvm_vcpu *vcpu,
@@@ -963,25 -960,36 +962,36 @@@ static const u32 vmx_msr_index[] = 
        MSR_EFER, MSR_TSC_AUX, MSR_STAR,
  };
  
- static inline bool is_page_fault(u32 intr_info)
+ static inline bool is_exception_n(u32 intr_info, u8 vector)
  {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
                             INTR_INFO_VALID_MASK)) ==
-               (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
+               (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
+ }
+ 
+ static inline bool is_debug(u32 intr_info)
+ {
+       return is_exception_n(intr_info, DB_VECTOR);
+ }
+ 
+ static inline bool is_breakpoint(u32 intr_info)
+ {
+       return is_exception_n(intr_info, BP_VECTOR);
+ }
+ 
+ static inline bool is_page_fault(u32 intr_info)
+ {
+       return is_exception_n(intr_info, PF_VECTOR);
  }
  
  static inline bool is_no_device(u32 intr_info)
  {
-       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
-                            INTR_INFO_VALID_MASK)) ==
-               (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
+       return is_exception_n(intr_info, NM_VECTOR);
  }
  
  static inline bool is_invalid_opcode(u32 intr_info)
  {
-       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
-                            INTR_INFO_VALID_MASK)) ==
-               (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
+       return is_exception_n(intr_info, UD_VECTOR);
  }
  
  static inline bool is_external_interrupt(u32 intr_info)
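
is_exception_n() above folds the repeated intr_info comparisons into a single helper keyed by the vector number, so is_debug(), is_breakpoint(), is_page_fault() and friends become one-liners. The test relies on the VM-exit interruption-information layout (vector in bits 7:0, type in bits 10:8 with 3 meaning hardware exception, valid in bit 31); the constants below restate that layout for a standalone check and are not taken from this hunk:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define INTR_INFO_VECTOR_MASK     0xffu
    #define INTR_INFO_INTR_TYPE_MASK  0x700u
    #define INTR_INFO_VALID_MASK      0x80000000u
    #define INTR_TYPE_HARD_EXCEPTION  (3u << 8)

    #define DB_VECTOR 1
    #define BP_VECTOR 3
    #define PF_VECTOR 14

    static bool is_exception_n(uint32_t intr_info, uint8_t vector)
    {
            return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
                                 INTR_INFO_VALID_MASK)) ==
                   (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
    }

    int main(void)
    {
            uint32_t pf = INTR_INFO_VALID_MASK | INTR_TYPE_HARD_EXCEPTION | PF_VECTOR;

            printf("page fault? %d\n", is_exception_n(pf, PF_VECTOR));   /* 1 */
            printf("breakpoint? %d\n", is_exception_n(pf, BP_VECTOR));   /* 0 */
            printf("debug?      %d\n", is_exception_n(pf, DB_VECTOR));   /* 0 */
            return 0;
    }
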
@@@ -1813,13 -1821,6 +1823,13 @@@ static void add_atomic_switch_msr(struc
                        return;
                }
                break;
 +      case MSR_IA32_PEBS_ENABLE:
 +              /* PEBS needs a quiescent period after being disabled (to write
 +               * a record).  Disabling PEBS through VMX MSR swapping doesn't
 +               * provide that period, so a CPU could write host's record into
 +               * guest's memory.
 +               */
 +              wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
        }
  
        for (i = 0; i < m->nr; ++i)
@@@ -1857,31 -1858,26 +1867,31 @@@ static void reload_tss(void
  
  static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
  {
 -      u64 guest_efer;
 -      u64 ignore_bits;
 +      u64 guest_efer = vmx->vcpu.arch.efer;
 +      u64 ignore_bits = 0;
  
 -      guest_efer = vmx->vcpu.arch.efer;
 +      if (!enable_ept) {
 +              /*
 +               * NX is needed to handle CR0.WP=1, CR4.SMEP=1.  Testing
 +               * host CPUID is more efficient than testing guest CPUID
 +               * or CR4.  Host SMEP is anyway a requirement for guest SMEP.
 +               */
 +              if (boot_cpu_has(X86_FEATURE_SMEP))
 +                      guest_efer |= EFER_NX;
 +              else if (!(guest_efer & EFER_NX))
 +                      ignore_bits |= EFER_NX;
 +      }
  
        /*
 -       * NX is emulated; LMA and LME handled by hardware; SCE meaningless
 -       * outside long mode
 +       * LMA and LME handled by hardware; SCE meaningless outside long mode.
         */
 -      ignore_bits = EFER_NX | EFER_SCE;
 +      ignore_bits |= EFER_SCE;
  #ifdef CONFIG_X86_64
        ignore_bits |= EFER_LMA | EFER_LME;
        /* SCE is meaningful only in long mode on Intel */
        if (guest_efer & EFER_LMA)
                ignore_bits &= ~(u64)EFER_SCE;
  #endif
 -      guest_efer &= ~ignore_bits;
 -      guest_efer |= host_efer & ignore_bits;
 -      vmx->guest_msrs[efer_offset].data = guest_efer;
 -      vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
  
        clear_atomic_switch_msr(vmx, MSR_EFER);
  
         */
        if (cpu_has_load_ia32_efer ||
            (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
 -              guest_efer = vmx->vcpu.arch.efer;
                if (!(guest_efer & EFER_LMA))
                        guest_efer &= ~EFER_LME;
                if (guest_efer != host_efer)
                        add_atomic_switch_msr(vmx, MSR_EFER,
                                              guest_efer, host_efer);
                return false;
 -      }
 +      } else {
 +              guest_efer &= ~ignore_bits;
 +              guest_efer |= host_efer & ignore_bits;
 +
 +              vmx->guest_msrs[efer_offset].data = guest_efer;
 +              vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
  
 -      return true;
 +              return true;
 +      }
  }
  
  static unsigned long segment_base(u16 selector)
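
In the reworked update_transition_efer() above, the no-EFER-swap branch hands hardware the guest's EFER with the bits listed in ignore_bits taken from the host instead; with EPT off, NX is either forced on (host SMEP present) or added to ignore_bits when the guest does not use it. The merge itself is two bit operations, shown here in isolation (the bit positions used are only illustrative):

    #include <stdint.h>
    #include <stdio.h>

    /* Keep the guest's bits except those in 'ignore', which come from the host. */
    static uint64_t merge_efer(uint64_t guest, uint64_t host, uint64_t ignore)
    {
            guest &= ~ignore;
            guest |= host & ignore;
            return guest;
    }

    int main(void)
    {
            uint64_t guest  = 0x101;           /* made-up guest EFER             */
            uint64_t host   = 0xd01;           /* made-up host EFER              */
            uint64_t ignore = 1ull << 11;      /* pretend bit 11 is "don't care" */

            printf("effective EFER: 0x%llx\n",
                   (unsigned long long)merge_efer(guest, host, ignore));  /* 0x901 */
            return 0;
    }
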
@@@ -2146,16 -2137,14 +2156,16 @@@ static void vmx_vcpu_load(struct kvm_vc
                rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
                vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
  
 -              /* Setup TSC multiplier */
 -              if (cpu_has_vmx_tsc_scaling())
 -                      vmcs_write64(TSC_MULTIPLIER,
 -                                   vcpu->arch.tsc_scaling_ratio);
 -
                vmx->loaded_vmcs->cpu = cpu;
        }
  
 +      /* Setup TSC multiplier */
 +      if (kvm_has_tsc_control &&
 +          vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) {
 +              vmx->current_tsc_ratio = vcpu->arch.tsc_scaling_ratio;
 +              vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
 +      }
 +
        vmx_vcpu_pi_load(vcpu, cpu);
  }
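
The TSC-multiplier setup above moves out of the per-cpu reload block and is now guarded by the cached vmx->current_tsc_ratio, so TSC_MULTIPLIER is rewritten only when the scaling ratio actually changes rather than on every vcpu_load. The write-if-changed caching pattern on its own (names and values below are invented):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t vmcs_tsc_multiplier;   /* stand-in for the VMCS field          */
    static uint64_t cached_ratio;          /* stand-in for vmx->current_tsc_ratio  */
    static int vmcs_writes;

    static void vcpu_load(uint64_t wanted_ratio)
    {
            if (cached_ratio != wanted_ratio) {    /* only touch the VMCS on change */
                    cached_ratio = wanted_ratio;
                    vmcs_tsc_multiplier = wanted_ratio;
                    vmcs_writes++;
            }
    }

    int main(void)
    {
            for (int i = 0; i < 5; i++)
                    vcpu_load(1ull << 48);         /* same ratio every time  */
            vcpu_load(3ull << 47);                 /* ratio actually changes */

            printf("VMCS writes: %d out of 6 loads\n", vmcs_writes);   /* 2 */
            return 0;
    }
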
  
@@@ -2605,7 -2594,7 +2615,7 @@@ static void nested_vmx_setup_ctls_msrs(
                VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
                VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
  
-       if (vmx_mpx_supported())
+       if (kvm_mpx_supported())
                vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
  
        /* We support free control of debug control saving. */
                VM_ENTRY_LOAD_IA32_PAT;
        vmx->nested.nested_vmx_entry_ctls_high |=
                (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
-       if (vmx_mpx_supported())
+       if (kvm_mpx_supported())
                vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
  
        /* We support free control of debug control loading. */
@@@ -2870,7 -2859,7 +2880,7 @@@ static int vmx_get_msr(struct kvm_vcpu 
                msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
                break;
        case MSR_IA32_BNDCFGS:
-               if (!vmx_mpx_supported())
+               if (!kvm_mpx_supported())
                        return 1;
                msr_info->data = vmcs_read64(GUEST_BNDCFGS);
                break;
@@@ -2947,7 -2936,7 +2957,7 @@@ static int vmx_set_msr(struct kvm_vcpu 
                vmcs_writel(GUEST_SYSENTER_ESP, data);
                break;
        case MSR_IA32_BNDCFGS:
-               if (!vmx_mpx_supported())
+               if (!kvm_mpx_supported())
                        return 1;
                vmcs_write64(GUEST_BNDCFGS, data);
                break;
@@@ -3420,7 -3409,7 +3430,7 @@@ static void init_vmcs_shadow_fields(voi
        for (i = j = 0; i < max_shadow_read_write_fields; i++) {
                switch (shadow_read_write_fields[i]) {
                case GUEST_BNDCFGS:
-                       if (!vmx_mpx_supported())
+                       if (!kvm_mpx_supported())
                                continue;
                        break;
                default:
@@@ -5629,11 -5618,8 +5639,8 @@@ static int handle_dr(struct kvm_vcpu *v
        }
  
        if (vcpu->guest_debug == 0) {
-               u32 cpu_based_vm_exec_control;
-               cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-               cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING;
-               vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+               vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+                               CPU_BASED_MOV_DR_EXITING);
  
                /*
                 * No more DR vmexits; force a reload of the debug registers
@@@ -5670,8 -5656,6 +5677,6 @@@ static void vmx_set_dr6(struct kvm_vcp
  
  static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
  {
-       u32 cpu_based_vm_exec_control;
        get_debugreg(vcpu->arch.db[0], 0);
        get_debugreg(vcpu->arch.db[1], 1);
        get_debugreg(vcpu->arch.db[2], 2);
        vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
  
        vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
-       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-       cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING;
-       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+       vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING);
  }
  
  static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
@@@ -5768,8 -5749,7 +5770,7 @@@ static int handle_halt(struct kvm_vcpu 
  
  static int handle_vmcall(struct kvm_vcpu *vcpu)
  {
-       kvm_emulate_hypercall(vcpu);
-       return 1;
+       return kvm_emulate_hypercall(vcpu);
  }
  
  static int handle_invd(struct kvm_vcpu *vcpu)
@@@ -6456,8 -6436,8 +6457,8 @@@ static struct loaded_vmcs *nested_get_c
  
        if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
                /* Recycle the least recently used VMCS. */
-               item = list_entry(vmx->nested.vmcs02_pool.prev,
-                       struct vmcs02_list, list);
+               item = list_last_entry(&vmx->nested.vmcs02_pool,
+                                      struct vmcs02_list, list);
                item->vmptr = vmx->nested.current_vmptr;
                list_move(&item->list, &vmx->nested.vmcs02_pool);
                return &item->vmcs02;
@@@ -7773,6 -7753,13 +7774,13 @@@ static bool nested_vmx_exit_handled(str
                else if (is_no_device(intr_info) &&
                         !(vmcs12->guest_cr0 & X86_CR0_TS))
                        return false;
+               else if (is_debug(intr_info) &&
+                        vcpu->guest_debug &
+                        (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+                       return false;
+               else if (is_breakpoint(intr_info) &&
+                        vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+                       return false;
                return vmcs12->exception_bitmap &
                                (1u << (intr_info & INTR_INFO_VECTOR_MASK));
        case EXIT_REASON_EXTERNAL_INTERRUPT:
@@@ -10277,7 -10264,7 +10285,7 @@@ static void prepare_vmcs12(struct kvm_v
        vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
        vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
        vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
-       if (vmx_mpx_supported())
+       if (kvm_mpx_supported())
                vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
        if (nested_cpu_has_xsaves(vmcs12))
                vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP);
@@@ -10785,13 -10772,26 +10793,26 @@@ static int vmx_update_pi_irte(struct kv
                 */
  
                kvm_set_msi_irq(e, &irq);
-               if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu))
+               if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+                       /*
+                        * Make sure the IRTE is in remapped mode if
+                        * we don't handle it in posted mode.
+                        */
+                       ret = irq_set_vcpu_affinity(host_irq, NULL);
+                       if (ret < 0) {
+                               printk(KERN_INFO
+                                  "failed to back to remapped mode, irq: %u\n",
+                                  host_irq);
+                               goto out;
+                       }
                        continue;
+               }
  
                vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
                vcpu_info.vector = irq.vector;
  
-               trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi,
+               trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi,
                                vcpu_info.vector, vcpu_info.pi_desc_addr, set);
  
                if (set)
diff --combined arch/x86/kvm/x86.c
index eaf6ee8c28b8f1619e7404bfd7efbf78beecc574,bcbce0fa0bc278b0c17fddf92e89395ea9c11d4f..7236bd3a4c3d7a0c5a6148decc6fad276eb18bb7
@@@ -123,6 -123,9 +123,9 @@@ module_param(tsc_tolerance_ppm, uint, S
  unsigned int __read_mostly lapic_timer_advance_ns = 0;
  module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
  
+ static bool __read_mostly vector_hashing = true;
+ module_param(vector_hashing, bool, S_IRUGO);
+ 
  static bool __read_mostly backwards_tsc_observed = false;
  
  #define KVM_NR_SHARED_MSRS 16
@@@ -1196,17 -1199,11 +1199,11 @@@ static void kvm_write_wall_clock(struc
  
  static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
  {
-       uint32_t quotient, remainder;
-       /* Don't try to replace with do_div(), this one calculates
-        * "(dividend << 32) / divisor" */
-       __asm__ ( "divl %4"
-                 : "=a" (quotient), "=d" (remainder)
-                 : "0" (0), "1" (dividend), "r" (divisor) );
-       return quotient;
+       do_shl32_div32(dividend, divisor);
+       return dividend;
  }
  
- static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
+ static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
                               s8 *pshift, u32 *pmultiplier)
  {
        uint64_t scaled64;
        uint64_t tps64;
        uint32_t tps32;
  
-       tps64 = base_khz * 1000LL;
-       scaled64 = scaled_khz * 1000LL;
+       tps64 = base_hz;
+       scaled64 = scaled_hz;
        while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
                tps64 >>= 1;
                shift--;
        *pshift = shift;
        *pmultiplier = div_frac(scaled64, tps32);
  
-       pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
-                __func__, base_khz, scaled_khz, shift, *pmultiplier);
+       pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n",
+                __func__, base_hz, scaled_hz, shift, *pmultiplier);
  }
  
  #ifdef CONFIG_X86_64
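
kvm_get_time_scale() now takes Hz rather than kHz, but its job is unchanged: produce a shift plus a 32-bit multiplier so a cycle count on the base clock can be converted to the scaled clock as ((cycles << shift) * mul) >> 32, with a right shift when shift is negative. The sketch below derives a valid pair by brute-force normalisation (deliberately not the kernel's loop, which is only partly visible in this hunk) and checks that one second of a pretend 2.6 GHz TSC comes back as roughly 10^9 ns; it uses the gcc/clang __int128 extension for the wide intermediates:

    #include <stdint.h>
    #include <stdio.h>

    /* Find (shift, mul) such that scaled_hz ~= base_hz * (mul / 2^32) * 2^shift. */
    static void get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
                               int *pshift, uint32_t *pmul)
    {
            unsigned __int128 ratio = ((unsigned __int128)scaled_hz << 32) / base_hz;
            int shift = 0;

            while (ratio >= ((unsigned __int128)1 << 32)) {   /* too big for u32 */
                    ratio >>= 1;
                    shift++;
            }
            while (ratio < ((unsigned __int128)1 << 31)) {    /* use all 32 bits */
                    ratio <<= 1;
                    shift--;
            }
            *pshift = shift;
            *pmul = (uint32_t)ratio;
    }

    /* Apply the pair the way a pvclock-style reader would. */
    static uint64_t scale_cycles(uint64_t cycles, int shift, uint32_t mul)
    {
            unsigned __int128 v = cycles;

            if (shift >= 0)
                    v <<= shift;
            else
                    v >>= -shift;
            return (uint64_t)((v * mul) >> 32);
    }

    int main(void)
    {
            uint64_t tsc_hz = 2600000000ull;   /* pretend 2.6 GHz TSC */
            uint32_t mul;
            int shift;

            get_time_scale(1000000000ull, tsc_hz, &shift, &mul);   /* ns per cycle */
            printf("shift=%d mul=%u\n", shift, mul);
            printf("one second of TSC -> %llu ns\n",
                   (unsigned long long)scale_cycles(tsc_hz, shift, mul));
            return 0;
    }
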
@@@ -1293,23 -1290,23 +1290,23 @@@ static int set_tsc_khz(struct kvm_vcpu 
        return 0;
  }
  
- static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
+ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
  {
        u32 thresh_lo, thresh_hi;
        int use_scaling = 0;
  
        /* tsc_khz can be zero if TSC calibration fails */
-       if (this_tsc_khz == 0) {
+       if (user_tsc_khz == 0) {
                /* set tsc_scaling_ratio to a safe value */
                vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
                return -1;
        }
  
        /* Compute a scale to convert nanoseconds in TSC cycles */
-       kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
+       kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
                           &vcpu->arch.virtual_tsc_shift,
                           &vcpu->arch.virtual_tsc_mult);
-       vcpu->arch.virtual_tsc_khz = this_tsc_khz;
+       vcpu->arch.virtual_tsc_khz = user_tsc_khz;
  
        /*
         * Compute the variation in TSC rate which is acceptable
         */
        thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
        thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
-       if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
-               pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
+       if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
+               pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
                use_scaling = 1;
        }
-       return set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
+       return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
  }
  
  static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
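
The renamed user_tsc_khz above is accepted as-is when it sits inside a tolerance window of tsc_tolerance_ppm around the host rate; only outside that window does KVM resort to TSC scaling. The window arithmetic, standalone (250 ppm and the rates below are just example numbers):

    #include <stdint.h>
    #include <stdio.h>

    /* A kHz value adjusted by a signed parts-per-million delta. */
    static uint32_t adjust_khz(uint32_t khz, int32_t ppm)
    {
            return (uint32_t)(((int64_t)khz * (1000000 + ppm)) / 1000000);
    }

    int main(void)
    {
            uint32_t host_khz = 2600000, user_khz = 2600100;
            int32_t  tol_ppm  = 250;
            uint32_t lo = adjust_khz(host_khz, -tol_ppm);
            uint32_t hi = adjust_khz(host_khz, tol_ppm);

            printf("tolerance window: [%u, %u] kHz\n", lo, hi);
            printf("requested %u kHz -> %s\n", user_khz,
                   (user_khz < lo || user_khz > hi) ? "scaling needed"
                                                    : "within tolerance, no scaling");
            return 0;
    }
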
@@@ -1716,7 -1713,7 +1713,7 @@@ static void kvm_gen_update_masterclock(
  
  static int kvm_guest_time_update(struct kvm_vcpu *v)
  {
-       unsigned long flags, this_tsc_khz, tgt_tsc_khz;
+       unsigned long flags, tgt_tsc_khz;
        struct kvm_vcpu_arch *vcpu = &v->arch;
        struct kvm_arch *ka = &v->kvm->arch;
        s64 kernel_ns;
  
        /* Keep irq disabled to prevent changes to the clock */
        local_irq_save(flags);
-       this_tsc_khz = __this_cpu_read(cpu_tsc_khz);
-       if (unlikely(this_tsc_khz == 0)) {
+       tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
+       if (unlikely(tgt_tsc_khz == 0)) {
                local_irq_restore(flags);
                kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
                return 1;
        if (!vcpu->pv_time_enabled)
                return 0;
  
-       if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
-               tgt_tsc_khz = kvm_has_tsc_control ?
-                       vcpu->virtual_tsc_khz : this_tsc_khz;
-               kvm_get_time_scale(NSEC_PER_SEC / 1000, tgt_tsc_khz,
+       if (kvm_has_tsc_control)
+               tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
+       if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
+               kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
                                   &vcpu->hv_clock.tsc_shift,
                                   &vcpu->hv_clock.tsc_to_system_mul);
-               vcpu->hw_tsc_khz = this_tsc_khz;
+               vcpu->hw_tsc_khz = tgt_tsc_khz;
        }
  
        /* With all the info we got, fill in the values */
@@@ -2752,6 -2750,7 +2750,6 @@@ void kvm_arch_vcpu_load(struct kvm_vcp
        }
  
        kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
 -      vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
  }
  
  void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@@ -2987,7 -2986,7 +2985,7 @@@ static int kvm_vcpu_ioctl_x86_set_vcpu_
        kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
  
        if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
-           kvm_vcpu_has_lapic(vcpu))
+           lapic_in_kernel(vcpu))
                vcpu->arch.apic->sipi_vector = events->sipi_vector;
  
        if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
                        vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
                else
                        vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
-               if (kvm_vcpu_has_lapic(vcpu)) {
+               if (lapic_in_kernel(vcpu)) {
                        if (events->smi.latched_init)
                                set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
                        else
@@@ -3240,7 -3239,7 +3238,7 @@@ long kvm_arch_vcpu_ioctl(struct file *f
        switch (ioctl) {
        case KVM_GET_LAPIC: {
                r = -EINVAL;
-               if (!vcpu->arch.apic)
+               if (!lapic_in_kernel(vcpu))
                        goto out;
                u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
  
        }
        case KVM_SET_LAPIC: {
                r = -EINVAL;
-               if (!vcpu->arch.apic)
+               if (!lapic_in_kernel(vcpu))
                        goto out;
                u.lapic = memdup_user(argp, sizeof(*u.lapic));
                if (IS_ERR(u.lapic))
@@@ -3605,20 -3604,26 +3603,26 @@@ static int kvm_vm_ioctl_set_irqchip(str
  
  static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
  {
-       mutex_lock(&kvm->arch.vpit->pit_state.lock);
-       memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
-       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+       struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
+       BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
+       mutex_lock(&kps->lock);
+       memcpy(ps, &kps->channels, sizeof(*ps));
+       mutex_unlock(&kps->lock);
        return 0;
  }
  
  static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
  {
        int i;
-       mutex_lock(&kvm->arch.vpit->pit_state.lock);
-       memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
+       struct kvm_pit *pit = kvm->arch.vpit;
+       mutex_lock(&pit->pit_state.lock);
+       memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
        for (i = 0; i < 3; i++)
-               kvm_pit_load_count(kvm, i, ps->channels[i].count, 0);
-       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+               kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
+       mutex_unlock(&pit->pit_state.lock);
        return 0;
  }
  
@@@ -3638,29 -3643,39 +3642,39 @@@ static int kvm_vm_ioctl_set_pit2(struc
        int start = 0;
        int i;
        u32 prev_legacy, cur_legacy;
-       mutex_lock(&kvm->arch.vpit->pit_state.lock);
-       prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
+       struct kvm_pit *pit = kvm->arch.vpit;
+       mutex_lock(&pit->pit_state.lock);
+       prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
        cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
        if (!prev_legacy && cur_legacy)
                start = 1;
-       memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
-              sizeof(kvm->arch.vpit->pit_state.channels));
-       kvm->arch.vpit->pit_state.flags = ps->flags;
+       memcpy(&pit->pit_state.channels, &ps->channels,
+              sizeof(pit->pit_state.channels));
+       pit->pit_state.flags = ps->flags;
        for (i = 0; i < 3; i++)
-               kvm_pit_load_count(kvm, i, kvm->arch.vpit->pit_state.channels[i].count,
+               kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
                                   start && i == 0);
-       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+       mutex_unlock(&pit->pit_state.lock);
        return 0;
  }
  
  static int kvm_vm_ioctl_reinject(struct kvm *kvm,
                                 struct kvm_reinject_control *control)
  {
-       if (!kvm->arch.vpit)
+       struct kvm_pit *pit = kvm->arch.vpit;
+       if (!pit)
                return -ENXIO;
-       mutex_lock(&kvm->arch.vpit->pit_state.lock);
-       kvm->arch.vpit->pit_state.reinject = control->pit_reinject;
-       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+       /* pit->pit_state.lock was overloaded to prevent userspace from getting
+        * an inconsistent state after running multiple KVM_REINJECT_CONTROL
+        * ioctls in parallel.  Use a separate lock if that ioctl isn't rare.
+        */
+       mutex_lock(&pit->pit_state.lock);
+       kvm_pit_set_reinject(pit, control->pit_reinject);
+       mutex_unlock(&pit->pit_state.lock);
        return 0;
  }
  
@@@ -4093,7 -4108,7 +4107,7 @@@ static int vcpu_mmio_write(struct kvm_v
  
        do {
                n = min(len, 8);
-               if (!(vcpu->arch.apic &&
+               if (!(lapic_in_kernel(vcpu) &&
                      !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
                    && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
                        break;
@@@ -4113,7 -4128,7 +4127,7 @@@ static int vcpu_mmio_read(struct kvm_vc
  
        do {
                n = min(len, 8);
-               if (!(vcpu->arch.apic &&
+               if (!(lapic_in_kernel(vcpu) &&
                      !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
                                         addr, n, v))
                    && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
@@@ -4346,7 -4361,7 +4360,7 @@@ int emulator_write_phys(struct kvm_vcp
        ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
        if (ret < 0)
                return 0;
-       kvm_mmu_pte_write(vcpu, gpa, val, bytes);
+       kvm_page_track_write(vcpu, gpa, val, bytes);
        return 1;
  }
  
@@@ -4604,7 -4619,7 +4618,7 @@@ static int emulator_cmpxchg_emulated(st
                return X86EMUL_CMPXCHG_FAILED;
  
        kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
-       kvm_mmu_pte_write(vcpu, gpa, new, bytes);
+       kvm_page_track_write(vcpu, gpa, new, bytes);
  
        return X86EMUL_CONTINUE;
  
@@@ -6010,7 -6025,7 +6024,7 @@@ static void update_cr8_intercept(struc
        if (!kvm_x86_ops->update_cr8_intercept)
                return;
  
-       if (!vcpu->arch.apic)
+       if (!lapic_in_kernel(vcpu))
                return;
  
        if (vcpu->arch.apicv_active)
@@@ -6618,12 -6633,12 +6632,12 @@@ static int vcpu_enter_guest(struct kvm_
         * KVM_DEBUGREG_WONT_EXIT again.
         */
        if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
 -              int i;
 -
                WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
                kvm_x86_ops->sync_dirty_debug_regs(vcpu);
 -              for (i = 0; i < KVM_NR_DB_REGS; i++)
 -                      vcpu->arch.eff_db[i] = vcpu->arch.db[i];
 +              kvm_update_dr0123(vcpu);
 +              kvm_update_dr6(vcpu);
 +              kvm_update_dr7(vcpu);
 +              vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
        }
  
        /*
@@@ -7038,7 -7053,7 +7052,7 @@@ int kvm_arch_vcpu_ioctl_get_mpstate(str
  int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
  {
-       if (!kvm_vcpu_has_lapic(vcpu) &&
+       if (!lapic_in_kernel(vcpu) &&
            mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
                return -EINVAL;
  
@@@ -7314,7 -7329,7 +7328,7 @@@ void kvm_put_guest_fpu(struct kvm_vcpu 
         * Every 255 times fpu_counter rolls over to 0; a guest that uses
         * the FPU in bursts will revert to loading it on demand.
         */
-       if (!vcpu->arch.eager_fpu) {
+       if (!use_eager_fpu()) {
                if (++vcpu->fpu_counter < 5)
                        kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
        }
@@@ -7593,6 -7608,7 +7607,7 @@@ bool kvm_vcpu_compatible(struct kvm_vcp
  }
  
  struct static_key kvm_no_apic_vcpu __read_mostly;
+ EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
  
  int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
  {
@@@ -7724,6 -7740,9 +7739,9 @@@ int kvm_arch_init_vm(struct kvm *kvm, u
        INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
        INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
  
+       kvm_page_track_init(kvm);
+       kvm_mmu_init_vm(kvm);
        return 0;
  }
  
@@@ -7850,6 -7869,7 +7868,7 @@@ void kvm_arch_destroy_vm(struct kvm *kv
        kfree(kvm->arch.vioapic);
        kvm_free_vcpus(kvm);
        kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
+       kvm_mmu_uninit_vm(kvm);
  }
  
  void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
                        free->arch.lpage_info[i - 1] = NULL;
                }
        }
+       kvm_page_track_free_memslot(free, dont);
  }
  
  int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
        int i;
  
        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+               struct kvm_lpage_info *linfo;
                unsigned long ugfn;
                int lpages;
                int level = i + 1;
                if (i == 0)
                        continue;
  
-               slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages *
-                                       sizeof(*slot->arch.lpage_info[i - 1]));
-               if (!slot->arch.lpage_info[i - 1])
+               linfo = kvm_kvzalloc(lpages * sizeof(*linfo));
+               if (!linfo)
                        goto out_free;
  
+               slot->arch.lpage_info[i - 1] = linfo;
                if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
-                       slot->arch.lpage_info[i - 1][0].write_count = 1;
+                       linfo[0].disallow_lpage = 1;
                if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
-                       slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1;
+                       linfo[lpages - 1].disallow_lpage = 1;
                ugfn = slot->userspace_addr >> PAGE_SHIFT;
                /*
                 * If the gfn and userspace address are not aligned wrt each
                        unsigned long j;
  
                        for (j = 0; j < lpages; ++j)
-                               slot->arch.lpage_info[i - 1][j].write_count = 1;
+                               linfo[j].disallow_lpage = 1;
                }
        }
  
+       if (kvm_page_track_create_memslot(slot, npages))
+               goto out_free;
        return 0;
  
  out_free:
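
The memslot code above renames write_count to disallow_lpage and keeps the same edge rule: if a slot's first or last guest frame is not aligned to a large-page boundary, the partial large page at that edge can never be mapped huge (the new kvm_page_track_create_memslot() call then layers write tracking on top). A quick standalone version of that boundary check, assuming 4 KiB base pages and 2 MiB large pages, with made-up slot numbers:

    #include <stdio.h>

    #define PAGES_PER_2M 512ul     /* 2 MiB / 4 KiB */

    int main(void)
    {
            unsigned long base_gfn = 0x100a;    /* deliberately not 2 MiB aligned */
            unsigned long npages   = 4096;
            unsigned long lpages   =
                    (base_gfn + npages + PAGES_PER_2M - 1) / PAGES_PER_2M -
                    base_gfn / PAGES_PER_2M;

            int head = (base_gfn & (PAGES_PER_2M - 1)) != 0;
            int tail = ((base_gfn + npages) & (PAGES_PER_2M - 1)) != 0;

            printf("%lu candidate 2M mappings; disallow first: %d, disallow last: %d\n",
                   lpages, head, tail);
            return 0;
    }
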
@@@ -8370,6 -8397,12 +8396,12 @@@ int kvm_arch_update_irqfd_routing(struc
        return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
  }
  
+ bool kvm_vector_hashing_enabled(void)
+ {
+       return vector_hashing;
+ }
+ EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled);
+ 
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
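
The new vector_hashing parameter and kvm_vector_hashing_enabled() helper above only gate the alternative lowest-priority delivery mode; the selection logic itself lives elsewhere. As a hedged illustration of the general idea (assumed scheme: hash the vector number over the eligible vCPUs, here simply vector modulo their count, so a given vector always lands on the same destination):

    #include <stdio.h>

    static int pick_dest(unsigned int vector, const int *vcpus, int count)
    {
            return vcpus[vector % count];   /* assumed hash: plain modulo */
    }

    int main(void)
    {
            int candidates[] = { 0, 2, 5, 7 };   /* vCPUs eligible for delivery */

            for (unsigned int vec = 32; vec < 36; vec++)
                    printf("vector %u -> vCPU %d\n", vec,
                           pick_dest(vec, candidates, 4));
            return 0;
    }
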
index f0dd9d42bc7b1c9b55f48719cd0f3ded16bbe00f,ffe9d1c6b5884637c3c5c00153565a7fc8730235..5152b389815500a77a95cffe69a0602379d4f570
  #define CNTTIDR               0x08
  #define CNTTIDR_VIRT(n)       (BIT(1) << ((n) * 4))
  
 +#define CNTACR(n)     (0x40 + ((n) * 4))
 +#define CNTACR_RPCT   BIT(0)
 +#define CNTACR_RVCT   BIT(1)
 +#define CNTACR_RFRQ   BIT(2)
 +#define CNTACR_RVOFF  BIT(3)
 +#define CNTACR_RWVT   BIT(4)
 +#define CNTACR_RWPT   BIT(5)
 +
  #define CNTVCT_LO     0x08
  #define CNTVCT_HI     0x0c
  #define CNTFRQ                0x10
@@@ -75,7 -67,7 +75,7 @@@ static int arch_timer_ppi[MAX_TIMER_PPI
  
  static struct clock_event_device __percpu *arch_timer_evt;
  
- static bool arch_timer_use_virtual = true;
+ static enum ppi_nr arch_timer_uses_ppi = VIRT_PPI;
  static bool arch_timer_c3stop;
  static bool arch_timer_mem_use_virtual;
  
@@@ -271,16 -263,20 +271,22 @@@ static void __arch_timer_setup(unsigne
                clk->name = "arch_sys_timer";
                clk->rating = 450;
                clk->cpumask = cpumask_of(smp_processor_id());
-               if (arch_timer_use_virtual) {
-                       clk->irq = arch_timer_ppi[VIRT_PPI];
+               clk->irq = arch_timer_ppi[arch_timer_uses_ppi];
+               switch (arch_timer_uses_ppi) {
+               case VIRT_PPI:
                        clk->set_state_shutdown = arch_timer_shutdown_virt;
 +                      clk->set_state_oneshot_stopped = arch_timer_shutdown_virt;
                        clk->set_next_event = arch_timer_set_next_event_virt;
-               } else {
-                       clk->irq = arch_timer_ppi[PHYS_SECURE_PPI];
+                       break;
+               case PHYS_SECURE_PPI:
+               case PHYS_NONSECURE_PPI:
+               case HYP_PPI:
                        clk->set_state_shutdown = arch_timer_shutdown_phys;
 +                      clk->set_state_oneshot_stopped = arch_timer_shutdown_phys;
                        clk->set_next_event = arch_timer_set_next_event_phys;
+                       break;
+               default:
+                       BUG();
                }
        } else {
                clk->features |= CLOCK_EVT_FEAT_DYNIRQ;
                clk->cpumask = cpu_all_mask;
                if (arch_timer_mem_use_virtual) {
                        clk->set_state_shutdown = arch_timer_shutdown_virt_mem;
 +                      clk->set_state_oneshot_stopped = arch_timer_shutdown_virt_mem;
                        clk->set_next_event =
                                arch_timer_set_next_event_virt_mem;
                } else {
                        clk->set_state_shutdown = arch_timer_shutdown_phys_mem;
 +                      clk->set_state_oneshot_stopped = arch_timer_shutdown_phys_mem;
                        clk->set_next_event =
                                arch_timer_set_next_event_phys_mem;
                }
@@@ -350,17 -344,20 +356,20 @@@ static void arch_counter_set_user_acces
        arch_timer_set_cntkctl(cntkctl);
  }
  
+ static bool arch_timer_has_nonsecure_ppi(void)
+ {
+       return (arch_timer_uses_ppi == PHYS_SECURE_PPI &&
+               arch_timer_ppi[PHYS_NONSECURE_PPI]);
+ }
+ 
  static int arch_timer_setup(struct clock_event_device *clk)
  {
        __arch_timer_setup(ARCH_CP15_TIMER, clk);
  
-       if (arch_timer_use_virtual)
-               enable_percpu_irq(arch_timer_ppi[VIRT_PPI], 0);
-       else {
-               enable_percpu_irq(arch_timer_ppi[PHYS_SECURE_PPI], 0);
-               if (arch_timer_ppi[PHYS_NONSECURE_PPI])
-                       enable_percpu_irq(arch_timer_ppi[PHYS_NONSECURE_PPI], 0);
-       }
+       enable_percpu_irq(arch_timer_ppi[arch_timer_uses_ppi], 0);
+       if (arch_timer_has_nonsecure_ppi())
+               enable_percpu_irq(arch_timer_ppi[PHYS_NONSECURE_PPI], 0);
  
        arch_counter_set_user_access();
        if (IS_ENABLED(CONFIG_ARM_ARCH_TIMER_EVTSTREAM))
@@@ -402,7 -399,7 +411,7 @@@ static void arch_timer_banner(unsigned 
                     (unsigned long)arch_timer_rate / 1000000,
                     (unsigned long)(arch_timer_rate / 10000) % 100,
                     type & ARCH_CP15_TIMER ?
-                       arch_timer_use_virtual ? "virt" : "phys" :
+                    (arch_timer_uses_ppi == VIRT_PPI) ? "virt" : "phys" :
                        "",
                     type == (ARCH_CP15_TIMER | ARCH_MEM_TIMER) ?  "/" : "",
                     type & ARCH_MEM_TIMER ?
@@@ -472,7 -469,7 +481,7 @@@ static void __init arch_counter_registe
  
        /* Register the CP15 based counter if we have one */
        if (type & ARCH_CP15_TIMER) {
-               if (IS_ENABLED(CONFIG_ARM64) || arch_timer_use_virtual)
+               if (IS_ENABLED(CONFIG_ARM64) || arch_timer_uses_ppi == VIRT_PPI)
                        arch_timer_read_counter = arch_counter_get_cntvct;
                else
                        arch_timer_read_counter = arch_counter_get_cntpct;
@@@ -502,13 -499,9 +511,9 @@@ static void arch_timer_stop(struct cloc
        pr_debug("arch_timer_teardown disable IRQ%d cpu #%d\n",
                 clk->irq, smp_processor_id());
  
-       if (arch_timer_use_virtual)
-               disable_percpu_irq(arch_timer_ppi[VIRT_PPI]);
-       else {
-               disable_percpu_irq(arch_timer_ppi[PHYS_SECURE_PPI]);
-               if (arch_timer_ppi[PHYS_NONSECURE_PPI])
-                       disable_percpu_irq(arch_timer_ppi[PHYS_NONSECURE_PPI]);
-       }
+       disable_percpu_irq(arch_timer_ppi[arch_timer_uses_ppi]);
+       if (arch_timer_has_nonsecure_ppi())
+               disable_percpu_irq(arch_timer_ppi[PHYS_NONSECURE_PPI]);
  
        clk->set_state_shutdown(clk);
  }
@@@ -574,12 -567,14 +579,14 @@@ static int __init arch_timer_register(v
                goto out;
        }
  
-       if (arch_timer_use_virtual) {
-               ppi = arch_timer_ppi[VIRT_PPI];
+       ppi = arch_timer_ppi[arch_timer_uses_ppi];
+       switch (arch_timer_uses_ppi) {
+       case VIRT_PPI:
                err = request_percpu_irq(ppi, arch_timer_handler_virt,
                                         "arch_timer", arch_timer_evt);
-       } else {
-               ppi = arch_timer_ppi[PHYS_SECURE_PPI];
+               break;
+       case PHYS_SECURE_PPI:
+       case PHYS_NONSECURE_PPI:
                err = request_percpu_irq(ppi, arch_timer_handler_phys,
                                         "arch_timer", arch_timer_evt);
                if (!err && arch_timer_ppi[PHYS_NONSECURE_PPI]) {
                                free_percpu_irq(arch_timer_ppi[PHYS_SECURE_PPI],
                                                arch_timer_evt);
                }
+               break;
+       case HYP_PPI:
+               err = request_percpu_irq(ppi, arch_timer_handler_phys,
+                                        "arch_timer", arch_timer_evt);
+               break;
+       default:
+               BUG();
        }
  
        if (err) {
  out_unreg_notify:
        unregister_cpu_notifier(&arch_timer_cpu_nb);
  out_free_irq:
-       if (arch_timer_use_virtual)
-               free_percpu_irq(arch_timer_ppi[VIRT_PPI], arch_timer_evt);
-       else {
-               free_percpu_irq(arch_timer_ppi[PHYS_SECURE_PPI],
+       free_percpu_irq(arch_timer_ppi[arch_timer_uses_ppi], arch_timer_evt);
+       if (arch_timer_has_nonsecure_ppi())
+               free_percpu_irq(arch_timer_ppi[PHYS_NONSECURE_PPI],
                                arch_timer_evt);
-               if (arch_timer_ppi[PHYS_NONSECURE_PPI])
-                       free_percpu_irq(arch_timer_ppi[PHYS_NONSECURE_PPI],
-                                       arch_timer_evt);
-       }
  
  out_free:
        free_percpu(arch_timer_evt);
@@@ -709,12 -706,25 +718,25 @@@ static void __init arch_timer_init(void
         *
         * If no interrupt provided for virtual timer, we'll have to
         * stick to the physical timer. It'd better be accessible...
+        *
+        * On ARMv8.1 with VH extensions, the kernel runs in HYP. VHE
+        * accesses to CNTP_*_EL1 registers are silently redirected to
+        * their CNTHP_*_EL2 counterparts, and use a different PPI
+        * number.
         */
        if (is_hyp_mode_available() || !arch_timer_ppi[VIRT_PPI]) {
-               arch_timer_use_virtual = false;
+               bool has_ppi;
+               if (is_kernel_in_hyp_mode()) {
+                       arch_timer_uses_ppi = HYP_PPI;
+                       has_ppi = !!arch_timer_ppi[HYP_PPI];
+               } else {
+                       arch_timer_uses_ppi = PHYS_SECURE_PPI;
+                       has_ppi = (!!arch_timer_ppi[PHYS_SECURE_PPI] ||
+                                  !!arch_timer_ppi[PHYS_NONSECURE_PPI]);
+               }
  
-               if (!arch_timer_ppi[PHYS_SECURE_PPI] ||
-                   !arch_timer_ppi[PHYS_NONSECURE_PPI]) {
+               if (!has_ppi) {
                        pr_warn("arch_timer: No interrupt available, giving up\n");
                        return;
                }
@@@ -747,7 -757,7 +769,7 @@@ static void __init arch_timer_of_init(s
         */
        if (IS_ENABLED(CONFIG_ARM) &&
            of_property_read_bool(np, "arm,cpu-registers-not-fw-configured"))
-                       arch_timer_use_virtual = false;
+               arch_timer_uses_ppi = PHYS_SECURE_PPI;
  
        arch_timer_init();
  }
@@@ -769,6 -779,7 +791,6 @@@ static void __init arch_timer_mem_init(
        }
  
        cnttidr = readl_relaxed(cntctlbase + CNTTIDR);
 -      iounmap(cntctlbase);
  
        /*
         * Try to find a virtual capable frame. Otherwise fall back to a
         */
        for_each_available_child_of_node(np, frame) {
                int n;
 +              u32 cntacr;
  
                if (of_property_read_u32(frame, "frame-number", &n)) {
                        pr_err("arch_timer: Missing frame-number\n");
 -                      of_node_put(best_frame);
                        of_node_put(frame);
 -                      return;
 +                      goto out;
                }
  
 -              if (cnttidr & CNTTIDR_VIRT(n)) {
 +              /* Try enabling everything, and see what sticks */
 +              cntacr = CNTACR_RFRQ | CNTACR_RWPT | CNTACR_RPCT |
 +                       CNTACR_RWVT | CNTACR_RVOFF | CNTACR_RVCT;
 +              writel_relaxed(cntacr, cntctlbase + CNTACR(n));
 +              cntacr = readl_relaxed(cntctlbase + CNTACR(n));
 +
 +              if ((cnttidr & CNTTIDR_VIRT(n)) &&
 +                  !(~cntacr & (CNTACR_RWVT | CNTACR_RVCT))) {
                        of_node_put(best_frame);
                        best_frame = frame;
                        arch_timer_mem_use_virtual = true;
                        break;
                }
 +
 +              if (~cntacr & (CNTACR_RWPT | CNTACR_RPCT))
 +                      continue;
 +
                of_node_put(best_frame);
                best_frame = of_node_get(frame);
        }
        base = arch_counter_base = of_iomap(best_frame, 0);
        if (!base) {
                pr_err("arch_timer: Can't map frame's registers\n");
 -              of_node_put(best_frame);
 -              return;
 +              goto out;
        }
  
        if (arch_timer_mem_use_virtual)
                irq = irq_of_parse_and_map(best_frame, 1);
        else
                irq = irq_of_parse_and_map(best_frame, 0);
 -      of_node_put(best_frame);
 +
        if (!irq) {
                pr_err("arch_timer: Frame missing %s irq",
                       arch_timer_mem_use_virtual ? "virt" : "phys");
 -              return;
 +              goto out;
        }
  
        arch_timer_detect_rate(base, np);
        arch_timer_mem_register(base, irq);
        arch_timer_common_init();
 +out:
 +      iounmap(cntctlbase);
 +      of_node_put(best_frame);
  }
  CLOCKSOURCE_OF_DECLARE(armv7_arch_timer_mem, "arm,armv7-timer-mem",
                       arch_timer_mem_init);
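
The frame-selection loop above now probes each timer frame's CNTACR: write the full access mask, read it back, and only treat the frame as virtual-capable if RVCT and RWVT both stuck (or as physical-capable if RPCT and RWPT did). A mock of that write-then-read-back probe, reusing the CNTACR_* bit definitions added at the top of the file but against a simulated register:

    #include <stdint.h>
    #include <stdio.h>

    #define CNTACR_RPCT  (1u << 0)
    #define CNTACR_RVCT  (1u << 1)
    #define CNTACR_RFRQ  (1u << 2)
    #define CNTACR_RVOFF (1u << 3)
    #define CNTACR_RWVT  (1u << 4)
    #define CNTACR_RWPT  (1u << 5)

    /* Fake access-control register: only some bits are implemented/writable. */
    static uint32_t fake_cntacr;
    static const uint32_t implemented = CNTACR_RPCT | CNTACR_RWPT | CNTACR_RFRQ;

    static void reg_write(uint32_t val) { fake_cntacr = val & implemented; }
    static uint32_t reg_read(void)      { return fake_cntacr; }

    int main(void)
    {
            /* Try enabling everything, and see what sticks. */
            reg_write(CNTACR_RFRQ | CNTACR_RWPT | CNTACR_RPCT |
                      CNTACR_RWVT | CNTACR_RVOFF | CNTACR_RVCT);

            uint32_t acr = reg_read();
            int virt_ok = !(~acr & (CNTACR_RWVT | CNTACR_RVCT));
            int phys_ok = !(~acr & (CNTACR_RWPT | CNTACR_RPCT));

            printf("virtual timer usable: %d, physical timer usable: %d\n",
                   virt_ok, phys_ok);   /* 0 and 1 for this mock */
            return 0;
    }
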
diff --combined virt/kvm/async_pf.c
index 65da997b430a8e8c2b3d8eb9de93fb6b42d00f80,b866374282be995ae37364784fb8903e3b0ebd98..f0d061f92674c674803720a0cadfdc04df8a4a0c
@@@ -97,8 -97,8 +97,8 @@@ static void async_pf_execute(struct wor
         * This memory barrier pairs with prepare_to_wait's set_current_state()
         */
        smp_mb();
 -      if (waitqueue_active(&vcpu->wq))
 -              wake_up_interruptible(&vcpu->wq);
 +      if (swait_active(&vcpu->wq))
 +              swake_up(&vcpu->wq);
  
        mmput(mm);
        kvm_put_kvm(vcpu->kvm);
@@@ -109,8 -109,8 +109,8 @@@ void kvm_clear_async_pf_completion_queu
        /* cancel outstanding work queue item */
        while (!list_empty(&vcpu->async_pf.queue)) {
                struct kvm_async_pf *work =
-                       list_entry(vcpu->async_pf.queue.next,
-                                  typeof(*work), queue);
+                       list_first_entry(&vcpu->async_pf.queue,
+                                        typeof(*work), queue);
                list_del(&work->queue);
  
  #ifdef CONFIG_KVM_ASYNC_PF_SYNC
        spin_lock(&vcpu->async_pf.lock);
        while (!list_empty(&vcpu->async_pf.done)) {
                struct kvm_async_pf *work =
-                       list_entry(vcpu->async_pf.done.next,
-                                  typeof(*work), link);
+                       list_first_entry(&vcpu->async_pf.done,
+                                        typeof(*work), link);
                list_del(&work->link);
                kmem_cache_free(async_pf_cache, work);
        }
diff --combined virt/kvm/kvm_main.c
index 5af50c3ddd535a5094b8de788bd88cc17085fb17,1eae05236347f1d1c4c6cc9912a524a845c53218..7ba1d10ffed2d5a416701153caea619c7ab22f8d
@@@ -72,11 -72,11 +72,11 @@@ module_param(halt_poll_ns, uint, S_IRUG
  
  /* Default doubles per-vcpu halt_poll_ns. */
  static unsigned int halt_poll_ns_grow = 2;
- module_param(halt_poll_ns_grow, int, S_IRUGO);
+ module_param(halt_poll_ns_grow, uint, S_IRUGO | S_IWUSR);
  
  /* Default resets per-vcpu halt_poll_ns . */
  static unsigned int halt_poll_ns_shrink;
- module_param(halt_poll_ns_shrink, int, S_IRUGO);
+ module_param(halt_poll_ns_shrink, uint, S_IRUGO | S_IWUSR);
  
  /*
   * Ordering of locks:
@@@ -216,7 -216,8 +216,7 @@@ int kvm_vcpu_init(struct kvm_vcpu *vcpu
        vcpu->kvm = kvm;
        vcpu->vcpu_id = id;
        vcpu->pid = NULL;
 -      vcpu->halt_poll_ns = 0;
 -      init_waitqueue_head(&vcpu->wq);
 +      init_swait_queue_head(&vcpu->wq);
        kvm_async_pf_vcpu_init(vcpu);
  
        vcpu->pre_pcpu = -1;
@@@ -619,13 -620,10 +619,10 @@@ void *kvm_kvzalloc(unsigned long size
  
  static void kvm_destroy_devices(struct kvm *kvm)
  {
-       struct list_head *node, *tmp;
+       struct kvm_device *dev, *tmp;
  
-       list_for_each_safe(node, tmp, &kvm->devices) {
-               struct kvm_device *dev =
-                       list_entry(node, struct kvm_device, vm_node);
-               list_del(node);
+       list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
+               list_del(&dev->vm_node);
                dev->ops->destroy(dev);
        }
  }
@@@ -1436,11 -1434,17 +1433,17 @@@ kvm_pfn_t __gfn_to_pfn_memslot(struct k
  {
        unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
  
-       if (addr == KVM_HVA_ERR_RO_BAD)
+       if (addr == KVM_HVA_ERR_RO_BAD) {
+               if (writable)
+                       *writable = false;
                return KVM_PFN_ERR_RO_FAULT;
+       }
  
-       if (kvm_is_error_hva(addr))
+       if (kvm_is_error_hva(addr)) {
+               if (writable)
+                       *writable = false;
                return KVM_PFN_NOSLOT;
+       }
  
        /* Do not map writable pfn in the readonly memslot. */
        if (writable && memslot_is_readonly(slot)) {
@@@ -1942,31 -1946,30 +1945,33 @@@ EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_di
  
  static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
  {
-       int old, val;
+       unsigned int old, val, grow;
  
        old = val = vcpu->halt_poll_ns;
+       grow = READ_ONCE(halt_poll_ns_grow);
        /* 10us base */
-       if (val == 0 && halt_poll_ns_grow)
+       if (val == 0 && grow)
                val = 10000;
        else
-               val *= halt_poll_ns_grow;
+               val *= grow;
  
 +      if (val > halt_poll_ns)
 +              val = halt_poll_ns;
 +
        vcpu->halt_poll_ns = val;
        trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
  }
  
  static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
  {
-       int old, val;
+       unsigned int old, val, shrink;
  
        old = val = vcpu->halt_poll_ns;
-       if (halt_poll_ns_shrink == 0)
+       shrink = READ_ONCE(halt_poll_ns_shrink);
+       if (shrink == 0)
                val = 0;
        else
-               val /= halt_poll_ns_shrink;
+               val /= shrink;
  
        vcpu->halt_poll_ns = val;
        trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
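
With the clamp added in grow_halt_poll_ns() above, the per-vcpu polling window starts from a 10 us base, is multiplied by halt_poll_ns_grow on grow, capped at the global halt_poll_ns, and divided (or reset to zero) on shrink. A standalone run of that policy (the 500 us cap, grow of 2 and shrink of 0 below are example values; the module parameters are now also writable at runtime):

    #include <stdio.h>

    static unsigned int halt_poll_ns  = 500000;   /* global cap            */
    static unsigned int grow_factor   = 2;        /* halt_poll_ns_grow     */
    static unsigned int shrink_factor = 0;        /* 0 means reset to zero */

    static unsigned int grow(unsigned int val)
    {
            if (val == 0 && grow_factor)
                    val = 10000;                  /* 10us base       */
            else
                    val *= grow_factor;
            if (val > halt_poll_ns)
                    val = halt_poll_ns;           /* new upper clamp */
            return val;
    }

    static unsigned int shrink(unsigned int val)
    {
            return shrink_factor ? val / shrink_factor : 0;
    }

    int main(void)
    {
            unsigned int v = 0;

            for (int i = 1; i <= 8; i++) {
                    v = grow(v);
                    printf("after grow %d: %u ns\n", i, v);
            }
            printf("after shrink:  %u ns\n", shrink(v));
            return 0;
    }
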
@@@ -1992,7 -1995,7 +1997,7 @@@ static int kvm_vcpu_check_block(struct 
  void kvm_vcpu_block(struct kvm_vcpu *vcpu)
  {
        ktime_t start, cur;
 -      DEFINE_WAIT(wait);
 +      DECLARE_SWAITQUEUE(wait);
        bool waited = false;
        u64 block_ns;
  
        kvm_arch_vcpu_blocking(vcpu);
  
        for (;;) {
 -              prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
 +              prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
  
                if (kvm_vcpu_check_block(vcpu) < 0)
                        break;
                schedule();
        }
  
 -      finish_wait(&vcpu->wq, &wait);
 +      finish_swait(&vcpu->wq, &wait);
        cur = ktime_get();
  
        kvm_arch_vcpu_unblocking(vcpu);
@@@ -2058,11 -2061,11 +2063,11 @@@ void kvm_vcpu_kick(struct kvm_vcpu *vcp
  {
        int me;
        int cpu = vcpu->cpu;
 -      wait_queue_head_t *wqp;
 +      struct swait_queue_head *wqp;
  
        wqp = kvm_arch_vcpu_wq(vcpu);
 -      if (waitqueue_active(wqp)) {
 -              wake_up_interruptible(wqp);
 +      if (swait_active(wqp)) {
 +              swake_up(wqp);
                ++vcpu->stat.halt_wakeup;
        }
  
@@@ -2163,7 -2166,7 +2168,7 @@@ void kvm_vcpu_on_spin(struct kvm_vcpu *
                                continue;
                        if (vcpu == me)
                                continue;
 -                      if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
 +                      if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
                                continue;
                        if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
                                continue;