Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 16 Mar 2016 16:55:35 +0000 (09:55 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 16 Mar 2016 16:55:35 +0000 (09:55 -0700)
Pull KVM updates from Paolo Bonzini:
 "One of the largest releases for KVM...  Hardly any generic
  changes, but lots of architecture-specific updates.

  ARM:
   - VHE support so that we can run the kernel at EL2 on ARMv8.1 systems
   - PMU support for guests
   - 32bit world switch rewritten in C
   - various optimizations to the vgic save/restore code.

  PPC:
   - enabled KVM-VFIO integration ("VFIO device")
   - optimizations to speed up IPIs between vcpus
   - in-kernel handling of IOMMU hypercalls
   - support for dynamic DMA windows (DDW).

  s390:
   - provide the floating point registers via sync regs;
   - separated instruction vs.  data accesses
   - dirty log improvements for huge guests
   - bugfixes and documentation improvements.

  x86:
   - Hyper-V VMBus hypercall userspace exit
   - alternative implementation of lowest-priority interrupts using
     vector hashing (for better VT-d posted interrupt support)
   - fixed guest debugging with nested virtualization
   - improved interrupt tracking in the in-kernel IOAPIC
   - generic infrastructure for tracking writes to guest
     memory - currently its only use is to speed up the legacy shadow
     paging (pre-EPT) case, but in the future it will be used for
     virtual GPUs as well
   - much cleanup (LAPIC, kvmclock, MMU, PIT), including ubsan fixes"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (217 commits)
  KVM: x86: remove eager_fpu field of struct kvm_vcpu_arch
  KVM: x86: disable MPX if host did not enable MPX XSAVE features
  arm64: KVM: vgic-v3: Only wipe LRs on vcpu exit
  arm64: KVM: vgic-v3: Reset LRs at boot time
  arm64: KVM: vgic-v3: Do not save an LR known to be empty
  arm64: KVM: vgic-v3: Save maintenance interrupt state only if required
  arm64: KVM: vgic-v3: Avoid accessing ICH registers
  KVM: arm/arm64: vgic-v2: Make GICD_SGIR quicker to hit
  KVM: arm/arm64: vgic-v2: Only wipe LRs on vcpu exit
  KVM: arm/arm64: vgic-v2: Reset LRs at boot time
  KVM: arm/arm64: vgic-v2: Do not save an LR known to be empty
  KVM: arm/arm64: vgic-v2: Move GICH_ELRSR saving to its own function
  KVM: arm/arm64: vgic-v2: Save maintenance interrupt state only if required
  KVM: arm/arm64: vgic-v2: Avoid accessing GICH registers
  KVM: s390: allocate only one DMA page per VM
  KVM: s390: enable STFLE interpretation only if enabled for the guest
  KVM: s390: wake up when the VCPU cpu timer expires
  KVM: s390: step the VCPU timer while in enabled wait
  KVM: s390: protect VCPU cpu timer with a seqcount
  KVM: s390: step VCPU cpu timer during kvm_run ioctl
  ...

18 files changed:
Documentation/virtual/kvm/mmu.txt
arch/arm/kvm/arm.c
arch/arm/kvm/guest.c
arch/arm64/kvm/guest.c
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/kernel/smp.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/s390/include/asm/kvm_host.h
arch/s390/kvm/interrupt.c
arch/s390/kvm/kvm-s390.c
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
drivers/clocksource/arm_arch_timer.c
virt/kvm/async_pf.c
virt/kvm/kvm_main.c

index c81731096a4338bcec4d43b8049a26118d10f260,dda2e9316701d75b479d05215c1d60614acef4b8..481b6a9c25d5a1737aef54f55a3aa712c859f970
@@@ -358,8 -358,7 +358,8 @@@ In the first case there are two additio
  - if CR4.SMEP is enabled: since we've turned the page into a kernel page,
    the kernel may now execute it.  We handle this by also setting spte.nx.
    If we get a user fetch or read fault, we'll change spte.u=1 and
 -  spte.nx=gpte.nx back.
 +  spte.nx=gpte.nx back.  For this to work, KVM forces EFER.NX to 1 when
 +  shadow paging is in use.
  - if CR4.SMAP is disabled: since the page has been changed to a kernel
    page, it can not be reused when CR4.SMAP is enabled. We set
    CR4.SMAP && !CR0.WP into shadow page's role to avoid this case. Note,
@@@ -392,11 -391,11 +392,11 @@@ To instantiate a large spte, four const
    write-protected pages
  - the guest page must be wholly contained by a single memory slot
  
- To check the last two conditions, the mmu maintains a ->write_count set of
+ To check the last two conditions, the mmu maintains a ->disallow_lpage set of
  arrays for each memory slot and large page size.  Every write protected page
- causes its write_count to be incremented, thus preventing instantiation of
+ causes its disallow_lpage to be incremented, thus preventing instantiation of
  a large spte.  The frames at the end of an unaligned memory slot have
- artificially inflated ->write_counts so they can never be instantiated.
+ artificially inflated ->disallow_lpages so they can never be instantiated.
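
As described above, the renamed ->disallow_lpage counters are simple per-large-frame reference counts: anything that forbids a huge mapping bumps the counter, and a large spte may only be created while the counter reads zero. The toy program below sketches that bookkeeping; the names and sizes are invented for illustration and do not match KVM's real structures.

/*
 * Toy model of the ->disallow_lpage counters described above.
 * Purely illustrative; names and sizes are made up.
 */
#include <stdio.h>

#define NR_LARGE_FRAMES 8

static int disallow_lpage[NR_LARGE_FRAMES];

/* A small page inside large frame 'lframe' became write-protected. */
static void write_protect_small_page(int lframe)
{
	disallow_lpage[lframe]++;
}

/* Write protection on one of its small pages was removed again. */
static void unprotect_small_page(int lframe)
{
	disallow_lpage[lframe]--;
}

/* May a large spte be instantiated for this frame? */
static int can_map_large(int lframe)
{
	return disallow_lpage[lframe] == 0;
}

int main(void)
{
	/* Frames at the end of an unaligned slot start artificially inflated. */
	disallow_lpage[NR_LARGE_FRAMES - 1] = 1;

	write_protect_small_page(2);
	printf("frame 2 large-mappable: %d\n", can_map_large(2));                 /* 0 */
	unprotect_small_page(2);
	printf("frame 2 large-mappable: %d\n", can_map_large(2));                 /* 1 */
	printf("unaligned tail frame:   %d\n", can_map_large(NR_LARGE_FRAMES - 1)); /* 0 */
	return 0;
}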
  
  Zapping all pages (page generation count)
  =========================================
diff --combined arch/arm/kvm/arm.c
index 08e49c423c24147a2f2c40b011866c8d2d144de9,9ca653e34d8ca7a85af3aeb56b556d6b4cc358dc..76552b51c7aea64fb9b8cc84436abad526991732
@@@ -28,6 -28,7 +28,7 @@@
  #include <linux/sched.h>
  #include <linux/kvm.h>
  #include <trace/events/kvm.h>
+ #include <kvm/arm_pmu.h>
  
  #define CREATE_TRACE_POINTS
  #include "trace.h"
@@@ -265,6 -266,7 +266,7 @@@ void kvm_arch_vcpu_free(struct kvm_vcp
        kvm_mmu_free_memory_caches(vcpu);
        kvm_timer_vcpu_terminate(vcpu);
        kvm_vgic_vcpu_destroy(vcpu);
+       kvm_pmu_vcpu_destroy(vcpu);
        kmem_cache_free(kvm_vcpu_cache, vcpu);
  }
  
@@@ -320,6 -322,7 +322,7 @@@ void kvm_arch_vcpu_put(struct kvm_vcpu 
        vcpu->cpu = -1;
  
        kvm_arm_set_running_vcpu(NULL);
+       kvm_timer_vcpu_put(vcpu);
  }
  
  int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
@@@ -506,18 -509,18 +509,18 @@@ static void kvm_arm_resume_guest(struc
        struct kvm_vcpu *vcpu;
  
        kvm_for_each_vcpu(i, vcpu, kvm) {
 -              wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
 +              struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
  
                vcpu->arch.pause = false;
 -              wake_up_interruptible(wq);
 +              swake_up(wq);
        }
  }
  
  static void vcpu_sleep(struct kvm_vcpu *vcpu)
  {
 -      wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
 +      struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
  
 -      wait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
 +      swait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
                                       (!vcpu->arch.pause)));
  }
  
@@@ -577,6 -580,7 +580,7 @@@ int kvm_arch_vcpu_ioctl_run(struct kvm_
                 * non-preemptible context.
                 */
                preempt_disable();
+               kvm_pmu_flush_hwstate(vcpu);
                kvm_timer_flush_hwstate(vcpu);
                kvm_vgic_flush_hwstate(vcpu);
  
                if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) ||
                        vcpu->arch.power_off || vcpu->arch.pause) {
                        local_irq_enable();
+                       kvm_pmu_sync_hwstate(vcpu);
                        kvm_timer_sync_hwstate(vcpu);
                        kvm_vgic_sync_hwstate(vcpu);
                        preempt_enable();
                trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
  
                /*
-                * We must sync the timer state before the vgic state so that
-                * the vgic can properly sample the updated state of the
+                * We must sync the PMU and timer state before the vgic state so
+                * that the vgic can properly sample the updated state of the
                 * interrupt line.
                 */
+               kvm_pmu_sync_hwstate(vcpu);
                kvm_timer_sync_hwstate(vcpu);
  
                kvm_vgic_sync_hwstate(vcpu);
@@@ -823,11 -829,54 +829,54 @@@ static int kvm_arch_vcpu_ioctl_vcpu_ini
        return 0;
  }
  
+ static int kvm_arm_vcpu_set_attr(struct kvm_vcpu *vcpu,
+                                struct kvm_device_attr *attr)
+ {
+       int ret = -ENXIO;
+       switch (attr->group) {
+       default:
+               ret = kvm_arm_vcpu_arch_set_attr(vcpu, attr);
+               break;
+       }
+       return ret;
+ }
+ static int kvm_arm_vcpu_get_attr(struct kvm_vcpu *vcpu,
+                                struct kvm_device_attr *attr)
+ {
+       int ret = -ENXIO;
+       switch (attr->group) {
+       default:
+               ret = kvm_arm_vcpu_arch_get_attr(vcpu, attr);
+               break;
+       }
+       return ret;
+ }
+ static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu,
+                                struct kvm_device_attr *attr)
+ {
+       int ret = -ENXIO;
+       switch (attr->group) {
+       default:
+               ret = kvm_arm_vcpu_arch_has_attr(vcpu, attr);
+               break;
+       }
+       return ret;
+ }
  long kvm_arch_vcpu_ioctl(struct file *filp,
                         unsigned int ioctl, unsigned long arg)
  {
        struct kvm_vcpu *vcpu = filp->private_data;
        void __user *argp = (void __user *)arg;
+       struct kvm_device_attr attr;
  
        switch (ioctl) {
        case KVM_ARM_VCPU_INIT: {
                        return -E2BIG;
                return kvm_arm_copy_reg_indices(vcpu, user_list->reg);
        }
+       case KVM_SET_DEVICE_ATTR: {
+               if (copy_from_user(&attr, argp, sizeof(attr)))
+                       return -EFAULT;
+               return kvm_arm_vcpu_set_attr(vcpu, &attr);
+       }
+       case KVM_GET_DEVICE_ATTR: {
+               if (copy_from_user(&attr, argp, sizeof(attr)))
+                       return -EFAULT;
+               return kvm_arm_vcpu_get_attr(vcpu, &attr);
+       }
+       case KVM_HAS_DEVICE_ATTR: {
+               if (copy_from_user(&attr, argp, sizeof(attr)))
+                       return -EFAULT;
+               return kvm_arm_vcpu_has_attr(vcpu, &attr);
+       }
        default:
                return -EINVAL;
        }
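
The three new vcpu ioctls plumbed in above reuse the existing struct kvm_device_attr from the KVM device API. The fragment below is a hedged sketch of how userspace might drive them for the new PMUv3 group; it assumes an arm64 host whose UAPI headers provide KVM_ARM_VCPU_PMU_V3_CTRL and KVM_ARM_VCPU_PMU_V3_IRQ, and that vcpu creation and KVM_ARM_VCPU_INIT with the PMU feature happen elsewhere.

/* Sketch: probing and setting a vcpu device attribute from userspace. */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdint.h>

static int vcpu_has_pmu_attr(int vcpu_fd, uint64_t attr)
{
	struct kvm_device_attr da = {
		.group = KVM_ARM_VCPU_PMU_V3_CTRL,
		.attr  = attr,
	};

	/* KVM_HAS_DEVICE_ATTR has no side effects: 0 if supported, ENXIO if not. */
	return ioctl(vcpu_fd, KVM_HAS_DEVICE_ATTR, &da) == 0;
}

static int vcpu_set_pmu_irq(int vcpu_fd, int irq)
{
	struct kvm_device_attr da = {
		.group = KVM_ARM_VCPU_PMU_V3_CTRL,
		.attr  = KVM_ARM_VCPU_PMU_V3_IRQ,
		.addr  = (uint64_t)&irq,      /* kernel reads an int from here */
	};

	return ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &da);
}

int main(void)
{
	/*
	 * A real VMM would obtain vcpu_fd via KVM_CREATE_VM / KVM_CREATE_VCPU
	 * and init the vcpu with the KVM_ARM_VCPU_PMU_V3 feature first; that
	 * setup is omitted to keep the sketch focused on the new ioctls.
	 */
	int vcpu_fd = -1;

	if (vcpu_fd >= 0 && vcpu_has_pmu_attr(vcpu_fd, KVM_ARM_VCPU_PMU_V3_IRQ))
		vcpu_set_pmu_irq(vcpu_fd, 23 /* example PPI number */);
	return 0;
}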
@@@ -967,6 -1031,11 +1031,11 @@@ long kvm_arch_vm_ioctl(struct file *fil
        }
  }
  
+ static void cpu_init_stage2(void *dummy)
+ {
+       __cpu_init_stage2();
+ }
  static void cpu_init_hyp_mode(void *dummy)
  {
        phys_addr_t boot_pgd_ptr;
        vector_ptr = (unsigned long)__kvm_hyp_vector;
  
        __cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr);
+       __cpu_init_stage2();
  
        kvm_arm_init_debug();
  }
@@@ -1035,6 -1105,82 +1105,82 @@@ static inline void hyp_cpu_pm_init(void
  }
  #endif
  
+ static void teardown_common_resources(void)
+ {
+       free_percpu(kvm_host_cpu_state);
+ }
+ static int init_common_resources(void)
+ {
+       kvm_host_cpu_state = alloc_percpu(kvm_cpu_context_t);
+       if (!kvm_host_cpu_state) {
+               kvm_err("Cannot allocate host CPU state\n");
+               return -ENOMEM;
+       }
+       return 0;
+ }
+ static int init_subsystems(void)
+ {
+       int err;
+       /*
+        * Init HYP view of VGIC
+        */
+       err = kvm_vgic_hyp_init();
+       switch (err) {
+       case 0:
+               vgic_present = true;
+               break;
+       case -ENODEV:
+       case -ENXIO:
+               vgic_present = false;
+               break;
+       default:
+               return err;
+       }
+       /*
+        * Init HYP architected timer support
+        */
+       err = kvm_timer_hyp_init();
+       if (err)
+               return err;
+       kvm_perf_init();
+       kvm_coproc_table_init();
+       return 0;
+ }
+ static void teardown_hyp_mode(void)
+ {
+       int cpu;
+       if (is_kernel_in_hyp_mode())
+               return;
+       free_hyp_pgds();
+       for_each_possible_cpu(cpu)
+               free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
+ }
+ static int init_vhe_mode(void)
+ {
+       /*
+        * Execute the init code on each CPU.
+        */
+       on_each_cpu(cpu_init_stage2, NULL, 1);
+       /* set size of VMID supported by CPU */
+       kvm_vmid_bits = kvm_get_vmid_bits();
+       kvm_info("%d-bit VMID\n", kvm_vmid_bits);
+       kvm_info("VHE mode initialized successfully\n");
+       return 0;
+ }
  /**
   * Inits Hyp-mode on all online CPUs
   */
@@@ -1065,7 -1211,7 +1211,7 @@@ static int init_hyp_mode(void
                stack_page = __get_free_page(GFP_KERNEL);
                if (!stack_page) {
                        err = -ENOMEM;
-                       goto out_free_stack_pages;
+                       goto out_err;
                }
  
                per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
        /*
         * Map the Hyp-code called directly from the host
         */
-       err = create_hyp_mappings(__kvm_hyp_code_start, __kvm_hyp_code_end);
+       err = create_hyp_mappings(__hyp_text_start, __hyp_text_end);
        if (err) {
                kvm_err("Cannot map world-switch code\n");
-               goto out_free_mappings;
+               goto out_err;
        }
  
        err = create_hyp_mappings(__start_rodata, __end_rodata);
        if (err) {
                kvm_err("Cannot map rodata section\n");
-               goto out_free_mappings;
+               goto out_err;
        }
  
        /*
  
                if (err) {
                        kvm_err("Cannot map hyp stack\n");
-                       goto out_free_mappings;
+                       goto out_err;
                }
        }
  
-       /*
-        * Map the host CPU structures
-        */
-       kvm_host_cpu_state = alloc_percpu(kvm_cpu_context_t);
-       if (!kvm_host_cpu_state) {
-               err = -ENOMEM;
-               kvm_err("Cannot allocate host CPU state\n");
-               goto out_free_mappings;
-       }
        for_each_possible_cpu(cpu) {
                kvm_cpu_context_t *cpu_ctxt;
  
  
                if (err) {
                        kvm_err("Cannot map host CPU state: %d\n", err);
-                       goto out_free_context;
+                       goto out_err;
                }
        }
  
         */
        on_each_cpu(cpu_init_hyp_mode, NULL, 1);
  
-       /*
-        * Init HYP view of VGIC
-        */
-       err = kvm_vgic_hyp_init();
-       switch (err) {
-       case 0:
-               vgic_present = true;
-               break;
-       case -ENODEV:
-       case -ENXIO:
-               vgic_present = false;
-               break;
-       default:
-               goto out_free_context;
-       }
-       /*
-        * Init HYP architected timer support
-        */
-       err = kvm_timer_hyp_init();
-       if (err)
-               goto out_free_context;
  #ifndef CONFIG_HOTPLUG_CPU
        free_boot_hyp_pgd();
  #endif
  
-       kvm_perf_init();
+       cpu_notifier_register_begin();
+       err = __register_cpu_notifier(&hyp_init_cpu_nb);
+       cpu_notifier_register_done();
+       if (err) {
+               kvm_err("Cannot register HYP init CPU notifier (%d)\n", err);
+               goto out_err;
+       }
+       hyp_cpu_pm_init();
  
        /* set size of VMID supported by CPU */
        kvm_vmid_bits = kvm_get_vmid_bits();
        kvm_info("Hyp mode initialized successfully\n");
  
        return 0;
- out_free_context:
-       free_percpu(kvm_host_cpu_state);
- out_free_mappings:
-       free_hyp_pgds();
- out_free_stack_pages:
-       for_each_possible_cpu(cpu)
-               free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
  out_err:
+       teardown_hyp_mode();
        kvm_err("error initializing Hyp mode: %d\n", err);
        return err;
  }
@@@ -1213,26 -1332,27 +1332,27 @@@ int kvm_arch_init(void *opaque
                }
        }
  
-       cpu_notifier_register_begin();
-       err = init_hyp_mode();
+       err = init_common_resources();
        if (err)
-               goto out_err;
+               return err;
  
-       err = __register_cpu_notifier(&hyp_init_cpu_nb);
-       if (err) {
-               kvm_err("Cannot register HYP init CPU notifier (%d)\n", err);
+       if (is_kernel_in_hyp_mode())
+               err = init_vhe_mode();
+       else
+               err = init_hyp_mode();
+       if (err)
                goto out_err;
-       }
-       cpu_notifier_register_done();
  
-       hyp_cpu_pm_init();
+       err = init_subsystems();
+       if (err)
+               goto out_hyp;
  
-       kvm_coproc_table_init();
        return 0;
+ out_hyp:
+       teardown_hyp_mode();
  out_err:
-       cpu_notifier_register_done();
+       teardown_common_resources();
        return err;
  }
  
diff --combined arch/arm/kvm/guest.c
index 99361f11354a0d8928c9aaead7fdf5e09436b3ac,12cbb68244435d277961a1cd24488d77a300c7d2..9093ed0f8b2a71e1d226fd31832a81d764242da6
@@@ -25,7 -25,6 +25,6 @@@
  #include <asm/cputype.h>
  #include <asm/uaccess.h>
  #include <asm/kvm.h>
- #include <asm/kvm_asm.h>
  #include <asm/kvm_emulate.h>
  #include <asm/kvm_coproc.h>
  
@@@ -55,7 -54,7 +54,7 @@@ static u64 core_reg_offset_from_id(u64 
  static int get_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
  {
        u32 __user *uaddr = (u32 __user *)(long)reg->addr;
-       struct kvm_regs *regs = &vcpu->arch.regs;
+       struct kvm_regs *regs = &vcpu->arch.ctxt.gp_regs;
        u64 off;
  
        if (KVM_REG_SIZE(reg->id) != 4)
@@@ -72,7 -71,7 +71,7 @@@
  static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
  {
        u32 __user *uaddr = (u32 __user *)(long)reg->addr;
-       struct kvm_regs *regs = &vcpu->arch.regs;
+       struct kvm_regs *regs = &vcpu->arch.ctxt.gp_regs;
        u64 off, val;
  
        if (KVM_REG_SIZE(reg->id) != 4)
@@@ -161,7 -160,7 +160,7 @@@ static int get_timer_reg(struct kvm_vcp
        u64 val;
  
        val = kvm_arm_timer_get_reg(vcpu, reg->id);
 -      return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id));
 +      return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)) ? -EFAULT : 0;
  }
  
  static unsigned long num_core_regs(void)
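
The get_timer_reg() change above (repeated for arm64 further down) fixes a classic return-value bug: copy_to_user() returns the number of bytes it could not copy, not a negative errno, so passing its result straight through turns a faulting copy into a bogus positive return instead of -EFAULT. The standalone program below illustrates the pattern with a fake copy_to_user() standing in for the real one; everything in it is invented for the demo.

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Stand-in for the kernel's copy_to_user(): returns bytes NOT copied. */
static unsigned long fake_copy_to_user(void *to, const void *from,
				       unsigned long n, unsigned long faults)
{
	memcpy(to, from, n - faults);
	return faults;                 /* 0 means the whole copy succeeded */
}

/* Buggy: leaks a positive byte count to the caller. */
static long get_reg_buggy(void *to, const void *from, unsigned long n)
{
	return fake_copy_to_user(to, from, n, 4);
}

/* Fixed: any shortfall becomes -EFAULT, a full copy becomes 0. */
static long get_reg_fixed(void *to, const void *from, unsigned long n)
{
	return fake_copy_to_user(to, from, n, 4) ? -EFAULT : 0;
}

int main(void)
{
	unsigned long val = 0x1234, out;

	printf("buggy returns %ld (positive, looks like success to some callers)\n",
	       get_reg_buggy(&out, &val, sizeof(val)));
	printf("fixed returns %ld (-EFAULT)\n",
	       get_reg_fixed(&out, &val, sizeof(val)));
	return 0;
}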
diff --combined arch/arm64/kvm/guest.c
index 9e54ad7c240ac5d2239ba5140b04c6727b20377e,dbe45c364bbb150977696b6fa9ff2ac8751aff6e..32fad75bb9ff5960242596685c317ba4086689e5
@@@ -194,7 -194,7 +194,7 @@@ static int get_timer_reg(struct kvm_vcp
        u64 val;
  
        val = kvm_arm_timer_get_reg(vcpu, reg->id);
 -      return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id));
 +      return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)) ? -EFAULT : 0;
  }
  
  /**
@@@ -380,3 -380,54 +380,54 @@@ int kvm_arch_vcpu_ioctl_set_guest_debug
        }
        return 0;
  }
+ int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu,
+                              struct kvm_device_attr *attr)
+ {
+       int ret;
+       switch (attr->group) {
+       case KVM_ARM_VCPU_PMU_V3_CTRL:
+               ret = kvm_arm_pmu_v3_set_attr(vcpu, attr);
+               break;
+       default:
+               ret = -ENXIO;
+               break;
+       }
+       return ret;
+ }
+ int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
+                              struct kvm_device_attr *attr)
+ {
+       int ret;
+       switch (attr->group) {
+       case KVM_ARM_VCPU_PMU_V3_CTRL:
+               ret = kvm_arm_pmu_v3_get_attr(vcpu, attr);
+               break;
+       default:
+               ret = -ENXIO;
+               break;
+       }
+       return ret;
+ }
+ int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
+                              struct kvm_device_attr *attr)
+ {
+       int ret;
+       switch (attr->group) {
+       case KVM_ARM_VCPU_PMU_V3_CTRL:
+               ret = kvm_arm_pmu_v3_has_attr(vcpu, attr);
+               break;
+       default:
+               ret = -ENXIO;
+               break;
+       }
+       return ret;
+ }
index c98afa538b3aeca91901e858c02884e820ca3dfa,2e7c79101652ef863ee049da91b6047a193a440b..d7b343170453df82b4f31429020c8680e1f949bf
@@@ -182,7 -182,10 +182,10 @@@ struct kvmppc_spapr_tce_table 
        struct list_head list;
        struct kvm *kvm;
        u64 liobn;
-       u32 window_size;
+       struct rcu_head rcu;
+       u32 page_shift;
+       u64 offset;             /* in pages */
+       u64 size;               /* window size in pages */
        struct page *pages[0];
  };
  
@@@ -289,7 -292,7 +292,7 @@@ struct kvmppc_vcore 
        struct list_head runnable_threads;
        struct list_head preempt_list;
        spinlock_t lock;
 -      wait_queue_head_t wq;
 +      struct swait_queue_head wq;
        spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
        u64 stolen_tb;
        u64 preempt_tb;
@@@ -629,7 -632,7 +632,7 @@@ struct kvm_vcpu_arch 
        u8 prodded;
        u32 last_inst;
  
 -      wait_queue_head_t *wqp;
 +      struct swait_queue_head *wqp;
        struct kvmppc_vcore *vcore;
        int ret;
        int trap;
index cc13d4c832916bc13203ba308949ff13870782c8,cb8be5dc118a72876dc0e93c5bb510bf4e307a49..b7dea05f07259558089205cebef69f40a7985a2f
@@@ -206,7 -206,7 +206,7 @@@ int smp_request_message_ipi(int virq, i
  
  #ifdef CONFIG_PPC_SMP_MUXED_IPI
  struct cpu_messages {
-       int messages;                   /* current messages */
+       long messages;                  /* current messages */
        unsigned long data;             /* data for cause ipi */
  };
  static DEFINE_PER_CPU_SHARED_ALIGNED(struct cpu_messages, ipi_message);
@@@ -218,7 -218,7 +218,7 @@@ void smp_muxed_ipi_set_data(int cpu, un
        info->data = data;
  }
  
- void smp_muxed_ipi_message_pass(int cpu, int msg)
+ void smp_muxed_ipi_set_message(int cpu, int msg)
  {
        struct cpu_messages *info = &per_cpu(ipi_message, cpu);
        char *message = (char *)&info->messages;
         */
        smp_mb();
        message[msg] = 1;
+ }
+ void smp_muxed_ipi_message_pass(int cpu, int msg)
+ {
+       struct cpu_messages *info = &per_cpu(ipi_message, cpu);
+       smp_muxed_ipi_set_message(cpu, msg);
        /*
         * cause_ipi functions are required to include a full barrier
         * before doing whatever causes the IPI.
  }
  
  #ifdef __BIG_ENDIAN__
- #define IPI_MESSAGE(A) (1 << (24 - 8 * (A)))
+ #define IPI_MESSAGE(A) (1uL << ((BITS_PER_LONG - 8) - 8 * (A)))
  #else
- #define IPI_MESSAGE(A) (1 << (8 * (A)))
+ #define IPI_MESSAGE(A) (1uL << (8 * (A)))
  #endif
  
  irqreturn_t smp_ipi_demux(void)
  {
        struct cpu_messages *info = this_cpu_ptr(&ipi_message);
-       unsigned int all;
+       unsigned long all;
  
        mb();   /* order any irq clear */
  
        do {
                all = xchg(&info->messages, 0);
+ #if defined(CONFIG_KVM_XICS) && defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE)
+               /*
+                * Must check for PPC_MSG_RM_HOST_ACTION messages
+                * before PPC_MSG_CALL_FUNCTION messages because when
+                * a VM is destroyed, we call kick_all_cpus_sync()
+                * to ensure that any pending PPC_MSG_RM_HOST_ACTION
+                * messages have completed before we free any VCPUs.
+                */
+               if (all & IPI_MESSAGE(PPC_MSG_RM_HOST_ACTION))
+                       kvmppc_xics_ipi_action();
+ #endif
                if (all & IPI_MESSAGE(PPC_MSG_CALL_FUNCTION))
                        generic_smp_call_function_interrupt();
                if (all & IPI_MESSAGE(PPC_MSG_RESCHEDULE))
@@@ -727,7 -745,7 +745,7 @@@ void start_secondary(void *unused
  
        local_irq_enable();
  
 -      cpu_startup_entry(CPUHP_ONLINE);
 +      cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
  
        BUG();
  }
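
The IPI_MESSAGE() rework above goes hand in hand with widening cpu_messages::messages from int to long: each message type owns one byte of that word, so posting a message is a single byte store and smp_ipi_demux() drains everything with one xchg() of the whole long, while the extra bytes of a 64-bit long make room for the new PPC_MSG_RM_HOST_ACTION message. The standalone demo below only illustrates the bit layout; BITS_PER_LONG and the DEMO_BIG_ENDIAN flag are local stand-ins invented for the sketch.

/* Illustration of the IPI_MESSAGE() byte-per-message layout (sketch only). */
#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

#ifdef DEMO_BIG_ENDIAN                  /* build with -DDEMO_BIG_ENDIAN to compare */
#define IPI_MESSAGE(A) (1uL << ((BITS_PER_LONG - 8) - 8 * (A)))
#else
#define IPI_MESSAGE(A) (1uL << (8 * (A)))
#endif

int main(void)
{
	int max_msgs = (int)(BITS_PER_LONG / 8);   /* one byte per message type */

	/*
	 * Each message gets its own byte, so senders post with a plain byte
	 * store and the receiver drains all pending messages with one xchg()
	 * of the whole word, without taking any locks.
	 */
	for (int msg = 0; msg < max_msgs; msg++)
		printf("msg %d -> mask 0x%016lx\n", msg, IPI_MESSAGE(msg));

	printf("a %zu-bit long holds %d byte-sized messages\n",
	       BITS_PER_LONG, max_msgs);
	return 0;
}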
index f1187bb6dd4d7f5960e57aea111bd1c12021408d,f47fffefadc1fb8f0a53d13bfdb9d7cbcf55d80d..84fb4fcfaa41b802a614515c67539b0d2d7ee3cf
@@@ -81,6 -81,17 +81,17 @@@ static int target_smt_mode
  module_param(target_smt_mode, int, S_IRUGO | S_IWUSR);
  MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
  
+ #ifdef CONFIG_KVM_XICS
+ static struct kernel_param_ops module_param_ops = {
+       .set = param_set_int,
+       .get = param_get_int,
+ };
+ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
+                                                       S_IRUGO | S_IWUSR);
+ MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
+ #endif
  static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
  static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
  
@@@ -114,11 -125,11 +125,11 @@@ static bool kvmppc_ipi_thread(int cpu
  static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
  {
        int cpu;
 -      wait_queue_head_t *wqp;
 +      struct swait_queue_head *wqp;
  
        wqp = kvm_arch_vcpu_wq(vcpu);
 -      if (waitqueue_active(wqp)) {
 -              wake_up_interruptible(wqp);
 +      if (swait_active(wqp)) {
 +              swake_up(wqp);
                ++vcpu->stat.halt_wakeup;
        }
  
@@@ -701,8 -712,8 +712,8 @@@ int kvmppc_pseries_do_hcall(struct kvm_
                tvcpu->arch.prodded = 1;
                smp_mb();
                if (vcpu->arch.ceded) {
 -                      if (waitqueue_active(&vcpu->wq)) {
 -                              wake_up_interruptible(&vcpu->wq);
 +                      if (swait_active(&vcpu->wq)) {
 +                              swake_up(&vcpu->wq);
                                vcpu->stat.halt_wakeup++;
                        }
                }
                if (kvmppc_xics_enabled(vcpu)) {
                        ret = kvmppc_xics_hcall(vcpu, req);
                        break;
-               } /* fallthrough */
+               }
+               return RESUME_HOST;
+       case H_PUT_TCE:
+               ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                               kvmppc_get_gpr(vcpu, 5),
+                                               kvmppc_get_gpr(vcpu, 6));
+               if (ret == H_TOO_HARD)
+                       return RESUME_HOST;
+               break;
+       case H_PUT_TCE_INDIRECT:
+               ret = kvmppc_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                               kvmppc_get_gpr(vcpu, 5),
+                                               kvmppc_get_gpr(vcpu, 6),
+                                               kvmppc_get_gpr(vcpu, 7));
+               if (ret == H_TOO_HARD)
+                       return RESUME_HOST;
+               break;
+       case H_STUFF_TCE:
+               ret = kvmppc_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                               kvmppc_get_gpr(vcpu, 5),
+                                               kvmppc_get_gpr(vcpu, 6),
+                                               kvmppc_get_gpr(vcpu, 7));
+               if (ret == H_TOO_HARD)
+                       return RESUME_HOST;
+               break;
        default:
                return RESUME_HOST;
        }
@@@ -1459,7 -1494,7 +1494,7 @@@ static struct kvmppc_vcore *kvmppc_vcor
        INIT_LIST_HEAD(&vcore->runnable_threads);
        spin_lock_init(&vcore->lock);
        spin_lock_init(&vcore->stoltb_lock);
 -      init_waitqueue_head(&vcore->wq);
 +      init_swait_queue_head(&vcore->wq);
        vcore->preempt_tb = TB_NIL;
        vcore->lpcr = kvm->arch.lpcr;
        vcore->first_vcpuid = core * threads_per_subcore;
@@@ -2278,6 -2313,46 +2313,46 @@@ static void post_guest_process(struct k
        spin_unlock(&vc->lock);
  }
  
+ /*
+  * Clear core from the list of active host cores as we are about to
+  * enter the guest. Only do this if it is the primary thread of the
+  * core (not if a subcore) that is entering the guest.
+  */
+ static inline void kvmppc_clear_host_core(int cpu)
+ {
+       int core;
+       if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
+               return;
+       /*
+        * Memory barrier can be omitted here as we will do a smp_wmb()
+        * later in kvmppc_start_thread and we need ensure that state is
+        * visible to other CPUs only after we enter guest.
+        */
+       core = cpu >> threads_shift;
+       kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 0;
+ }
+ /*
+  * Advertise this core as an active host core since we exited the guest
+  * Only need to do this if it is the primary thread of the core that is
+  * exiting.
+  */
+ static inline void kvmppc_set_host_core(int cpu)
+ {
+       int core;
+       if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
+               return;
+       /*
+        * Memory barrier can be omitted here because we do a spin_unlock
+        * immediately after this which provides the memory barrier.
+        */
+       core = cpu >> threads_shift;
+       kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 1;
+ }
  /*
   * Run a set of guest threads on a physical core.
   * Called with vc->lock held.
@@@ -2390,6 -2465,8 +2465,8 @@@ static noinline void kvmppc_run_core(st
                }
        }
  
+       kvmppc_clear_host_core(pcpu);
        /* Start all the threads */
        active = 0;
        for (sub = 0; sub < core_info.n_subcores; ++sub) {
                        kvmppc_ipi_thread(pcpu + i);
        }
  
+       kvmppc_set_host_core(pcpu);
        spin_unlock(&vc->lock);
  
        /* make sure updates to secondary vcpu structs are visible now */
@@@ -2531,9 -2610,10 +2610,9 @@@ static void kvmppc_vcore_blocked(struc
  {
        struct kvm_vcpu *vcpu;
        int do_sleep = 1;
 +      DECLARE_SWAITQUEUE(wait);
  
 -      DEFINE_WAIT(wait);
 -
 -      prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
 +      prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
  
        /*
         * Check one last time for pending exceptions and ceded state after
        }
  
        if (!do_sleep) {
 -              finish_wait(&vc->wq, &wait);
 +              finish_swait(&vc->wq, &wait);
                return;
        }
  
        trace_kvmppc_vcore_blocked(vc, 0);
        spin_unlock(&vc->lock);
        schedule();
 -      finish_wait(&vc->wq, &wait);
 +      finish_swait(&vc->wq, &wait);
        spin_lock(&vc->lock);
        vc->vcore_state = VCORE_INACTIVE;
        trace_kvmppc_vcore_blocked(vc, 1);
@@@ -2611,7 -2691,7 +2690,7 @@@ static int kvmppc_run_vcpu(struct kvm_r
                        kvmppc_start_thread(vcpu, vc);
                        trace_kvm_guest_enter(vcpu);
                } else if (vc->vcore_state == VCORE_SLEEPING) {
 -                      wake_up(&vc->wq);
 +                      swake_up(&vc->wq);
                }
  
        }
@@@ -2983,6 -3063,114 +3062,114 @@@ static int kvmppc_hv_setup_htab_rma(str
        goto out_srcu;
  }
  
+ #ifdef CONFIG_KVM_XICS
+ static int kvmppc_cpu_notify(struct notifier_block *self, unsigned long action,
+                       void *hcpu)
+ {
+       unsigned long cpu = (long)hcpu;
+       switch (action) {
+       case CPU_UP_PREPARE:
+       case CPU_UP_PREPARE_FROZEN:
+               kvmppc_set_host_core(cpu);
+               break;
+ #ifdef CONFIG_HOTPLUG_CPU
+       case CPU_DEAD:
+       case CPU_DEAD_FROZEN:
+       case CPU_UP_CANCELED:
+       case CPU_UP_CANCELED_FROZEN:
+               kvmppc_clear_host_core(cpu);
+               break;
+ #endif
+       default:
+               break;
+       }
+       return NOTIFY_OK;
+ }
+ static struct notifier_block kvmppc_cpu_notifier = {
+           .notifier_call = kvmppc_cpu_notify,
+ };
+ /*
+  * Allocate a per-core structure for managing state about which cores are
+  * running in the host versus the guest and for exchanging data between
+  * real mode KVM and CPU running in the host.
+  * This is only done for the first VM.
+  * The allocated structure stays even if all VMs have stopped.
+  * It is only freed when the kvm-hv module is unloaded.
+  * It's OK for this routine to fail, we just don't support host
+  * core operations like redirecting H_IPI wakeups.
+  */
+ void kvmppc_alloc_host_rm_ops(void)
+ {
+       struct kvmppc_host_rm_ops *ops;
+       unsigned long l_ops;
+       int cpu, core;
+       int size;
+       /* Not the first time here ? */
+       if (kvmppc_host_rm_ops_hv != NULL)
+               return;
+       ops = kzalloc(sizeof(struct kvmppc_host_rm_ops), GFP_KERNEL);
+       if (!ops)
+               return;
+       size = cpu_nr_cores() * sizeof(struct kvmppc_host_rm_core);
+       ops->rm_core = kzalloc(size, GFP_KERNEL);
+       if (!ops->rm_core) {
+               kfree(ops);
+               return;
+       }
+       get_online_cpus();
+       for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) {
+               if (!cpu_online(cpu))
+                       continue;
+               core = cpu >> threads_shift;
+               ops->rm_core[core].rm_state.in_host = 1;
+       }
+       ops->vcpu_kick = kvmppc_fast_vcpu_kick_hv;
+       /*
+        * Make the contents of the kvmppc_host_rm_ops structure visible
+        * to other CPUs before we assign it to the global variable.
+        * Do an atomic assignment (no locks used here), but if someone
+        * beats us to it, just free our copy and return.
+        */
+       smp_wmb();
+       l_ops = (unsigned long) ops;
+       if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) {
+               put_online_cpus();
+               kfree(ops->rm_core);
+               kfree(ops);
+               return;
+       }
+       register_cpu_notifier(&kvmppc_cpu_notifier);
+       put_online_cpus();
+ }
+ void kvmppc_free_host_rm_ops(void)
+ {
+       if (kvmppc_host_rm_ops_hv) {
+               unregister_cpu_notifier(&kvmppc_cpu_notifier);
+               kfree(kvmppc_host_rm_ops_hv->rm_core);
+               kfree(kvmppc_host_rm_ops_hv);
+               kvmppc_host_rm_ops_hv = NULL;
+       }
+ }
+ #endif
  static int kvmppc_core_init_vm_hv(struct kvm *kvm)
  {
        unsigned long lpcr, lpid;
                return -ENOMEM;
        kvm->arch.lpid = lpid;
  
+       kvmppc_alloc_host_rm_ops();
        /*
         * Since we don't flush the TLB when tearing down a VM,
         * and this lpid might have previously been used,
@@@ -3228,6 -3418,7 +3417,7 @@@ static int kvmppc_book3s_init_hv(void
  
  static void kvmppc_book3s_exit_hv(void)
  {
+       kvmppc_free_host_rm_ops();
        kvmppc_hv_ops = NULL;
  }
  
index 25ae2c9913c39c2fae34fb60dd7ee655ba821b6e,ed16182a008b7f10b7aa53c3af2d0f3fd167fe25..85b32f16fa74e02a2fa753a62ecaf2c728e3eb2a
@@@ -1370,20 -1370,6 +1370,20 @@@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S
        std     r6, VCPU_ACOP(r9)
        stw     r7, VCPU_GUEST_PID(r9)
        std     r8, VCPU_WORT(r9)
 +      /*
 +       * Restore various registers to 0, where non-zero values
 +       * set by the guest could disrupt the host.
 +       */
 +      li      r0, 0
 +      mtspr   SPRN_IAMR, r0
 +      mtspr   SPRN_CIABR, r0
 +      mtspr   SPRN_DAWRX, r0
 +      mtspr   SPRN_TCSCR, r0
 +      mtspr   SPRN_WORT, r0
 +      /* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */
 +      li      r0, 1
 +      sldi    r0, r0, 31
 +      mtspr   SPRN_MMCRS, r0
  8:
  
        /* Save and reset AMR and UAMOR before turning on the MMU */
@@@ -2020,8 -2006,8 +2020,8 @@@ hcall_real_table
        .long   0               /* 0x12c */
        .long   0               /* 0x130 */
        .long   DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table
-       .long   0               /* 0x138 */
-       .long   0               /* 0x13c */
+       .long   DOTSYM(kvmppc_h_stuff_tce) - hcall_real_table
+       .long   DOTSYM(kvmppc_rm_h_put_tce_indirect) - hcall_real_table
        .long   0               /* 0x140 */
        .long   0               /* 0x144 */
        .long   0               /* 0x148 */
index b0c8ad0799c7f0c09607420441ea87735c88c6d9,3c254952d3a7c95a031fdb93247fe9c26a78cdac..6da41fab70fbe951c957d1b5542101d22980839b
@@@ -20,6 -20,7 +20,7 @@@
  #include <linux/kvm_types.h>
  #include <linux/kvm_host.h>
  #include <linux/kvm.h>
+ #include <linux/seqlock.h>
  #include <asm/debug.h>
  #include <asm/cpu.h>
  #include <asm/fpu/api.h>
@@@ -229,17 -230,11 +230,11 @@@ struct kvm_s390_itdb 
        __u8    data[256];
  } __packed;
  
- struct kvm_s390_vregs {
-       __vector128 vrs[32];
-       __u8    reserved200[512];       /* for future vector expansion */
- } __packed;
  struct sie_page {
        struct kvm_s390_sie_block sie_block;
        __u8 reserved200[1024];         /* 0x0200 */
        struct kvm_s390_itdb itdb;      /* 0x0600 */
-       __u8 reserved700[1280];         /* 0x0700 */
-       struct kvm_s390_vregs vregs;    /* 0x0c00 */
+       __u8 reserved700[2304];         /* 0x0700 */
  } __packed;
  
  struct kvm_vcpu_stat {
@@@ -467,7 -462,7 +462,7 @@@ struct kvm_s390_irq_payload 
  struct kvm_s390_local_interrupt {
        spinlock_t lock;
        struct kvm_s390_float_interrupt *float_int;
 -      wait_queue_head_t *wq;
 +      struct swait_queue_head *wq;
        atomic_t *cpuflags;
        DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS);
        struct kvm_s390_irq_payload irq;
@@@ -558,6 -553,15 +553,15 @@@ struct kvm_vcpu_arch 
        unsigned long pfault_token;
        unsigned long pfault_select;
        unsigned long pfault_compare;
+       bool cputm_enabled;
+       /*
+        * The seqcount protects updates to cputm_start and sie_block.cputm,
+        * this way we can have non-blocking reads with consistent values.
+        * Only the owning VCPU thread (vcpu->cpu) is allowed to change these
+        * values and to start/stop/enable/disable cpu timer accounting.
+        */
+       seqcount_t cputm_seqcount;
+       __u64 cputm_start;
  };
  
  struct kvm_vm_stat {
@@@ -596,15 -600,11 +600,11 @@@ struct s390_io_adapter 
  #define S390_ARCH_FAC_MASK_SIZE_U64 \
        (S390_ARCH_FAC_MASK_SIZE_BYTE / sizeof(u64))
  
- struct kvm_s390_fac {
-       /* facility list requested by guest */
-       __u64 list[S390_ARCH_FAC_LIST_SIZE_U64];
-       /* facility mask supported by kvm & hosting machine */
-       __u64 mask[S390_ARCH_FAC_LIST_SIZE_U64];
- };
  struct kvm_s390_cpu_model {
-       struct kvm_s390_fac *fac;
+       /* facility mask supported by kvm & hosting machine */
+       __u64 fac_mask[S390_ARCH_FAC_LIST_SIZE_U64];
+       /* facility list requested by guest (in dma page) */
+       __u64 *fac_list;
        struct cpuid cpu_id;
        unsigned short ibc;
  };
@@@ -623,6 -623,16 +623,16 @@@ struct kvm_s390_crypto_cb 
        __u8    reserved80[128];                /* 0x0080 */
  };
  
+ /*
+  * sie_page2 has to be allocated as DMA because fac_list and crycb need
+  * 31bit addresses in the sie control block.
+  */
+ struct sie_page2 {
+       __u64 fac_list[S390_ARCH_FAC_LIST_SIZE_U64];    /* 0x0000 */
+       struct kvm_s390_crypto_cb crycb;                /* 0x0800 */
+       u8 reserved900[0x1000 - 0x900];                 /* 0x0900 */
+ } __packed;
  struct kvm_arch{
        void *sca;
        int use_esca;
        int ipte_lock_count;
        struct mutex ipte_mutex;
        spinlock_t start_stop_lock;
+       struct sie_page2 *sie_page2;
        struct kvm_s390_cpu_model model;
        struct kvm_s390_crypto crypto;
        u64 epoch;
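
The comment above is the whole point of the new struct sie_page2: fac_list and crycb must share one DMA page below 2 GB because the SIE control block only carries 31-bit addresses for them, so their offsets within the page are fixed. The snippet below is a hedged, userspace-compilable check of that intended layout; it assumes S390_ARCH_FAC_LIST_SIZE_U64 is 256 (16 kbit of facility bits) and models crycb as an opaque 0x100-byte blob, which matches the offset comments in the hunk but is not the real kvm_s390_crypto_cb definition.

/* Offset check for the sie_page2 layout sketched above (illustrative only). */
#include <stddef.h>
#include <stdint.h>

#define S390_ARCH_FAC_LIST_SIZE_U64 256            /* assumed: 16 kbit of facility bits */

struct demo_crypto_cb {                            /* stand-in, 0x100 bytes */
	uint8_t opaque[0x100];
};

struct demo_sie_page2 {
	uint64_t fac_list[S390_ARCH_FAC_LIST_SIZE_U64];   /* 0x0000 */
	struct demo_crypto_cb crycb;                       /* 0x0800 */
	uint8_t reserved900[0x1000 - 0x900];               /* 0x0900 */
} __attribute__((packed));

_Static_assert(offsetof(struct demo_sie_page2, crycb) == 0x800,
	       "facility list must fill the first 2k of the page");
_Static_assert(offsetof(struct demo_sie_page2, reserved900) == 0x900,
	       "crycb occupies 0x800-0x8ff");
_Static_assert(sizeof(struct demo_sie_page2) == 0x1000,
	       "the whole structure is exactly one 4k DMA page");

int main(void) { return 0; }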
index 9ffc7322179213f031939fa184bc6c93545af559,ef84a803433eeea0a3992f0aa5eaed3fe440bcdc..704809d91dddf759d9d7e4b33b86e4c3dc29441e
@@@ -182,8 -182,9 +182,9 @@@ static int cpu_timer_interrupts_enabled
  
  static int cpu_timer_irq_pending(struct kvm_vcpu *vcpu)
  {
-       return (vcpu->arch.sie_block->cputm >> 63) &&
-              cpu_timer_interrupts_enabled(vcpu);
+       if (!cpu_timer_interrupts_enabled(vcpu))
+               return 0;
+       return kvm_s390_get_cpu_timer(vcpu) >> 63;
  }
  
  static inline int is_ioirq(unsigned long irq_type)
@@@ -335,23 -336,6 +336,6 @@@ static void set_intercept_indicators(st
        set_intercept_indicators_stop(vcpu);
  }
  
- static u16 get_ilc(struct kvm_vcpu *vcpu)
- {
-       switch (vcpu->arch.sie_block->icptcode) {
-       case ICPT_INST:
-       case ICPT_INSTPROGI:
-       case ICPT_OPEREXC:
-       case ICPT_PARTEXEC:
-       case ICPT_IOINST:
-               /* last instruction only stored for these icptcodes */
-               return insn_length(vcpu->arch.sie_block->ipa >> 8);
-       case ICPT_PROGI:
-               return vcpu->arch.sie_block->pgmilc;
-       default:
-               return 0;
-       }
- }
  static int __must_check __deliver_cpu_timer(struct kvm_vcpu *vcpu)
  {
        struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@@ -588,7 -572,7 +572,7 @@@ static int __must_check __deliver_prog(
        struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
        struct kvm_s390_pgm_info pgm_info;
        int rc = 0, nullifying = false;
-       u16 ilc = get_ilc(vcpu);
+       u16 ilen;
  
        spin_lock(&li->lock);
        pgm_info = li->irq.pgm;
        memset(&li->irq.pgm, 0, sizeof(pgm_info));
        spin_unlock(&li->lock);
  
-       VCPU_EVENT(vcpu, 3, "deliver: program irq code 0x%x, ilc:%d",
-                  pgm_info.code, ilc);
+       ilen = pgm_info.flags & KVM_S390_PGM_FLAGS_ILC_MASK;
+       VCPU_EVENT(vcpu, 3, "deliver: program irq code 0x%x, ilen:%d",
+                  pgm_info.code, ilen);
        vcpu->stat.deliver_program_int++;
        trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
                                         pgm_info.code, 0);
                                   (u8 *) __LC_PER_ACCESS_ID);
        }
  
-       if (nullifying && vcpu->arch.sie_block->icptcode == ICPT_INST)
-               kvm_s390_rewind_psw(vcpu, ilc);
+       if (nullifying && !(pgm_info.flags & KVM_S390_PGM_FLAGS_NO_REWIND))
+               kvm_s390_rewind_psw(vcpu, ilen);
  
-       rc |= put_guest_lc(vcpu, ilc, (u16 *) __LC_PGM_ILC);
+       /* bit 1+2 of the target are the ilc, so we can directly use ilen */
+       rc |= put_guest_lc(vcpu, ilen, (u16 *) __LC_PGM_ILC);
        rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->gbea,
                                 (u64 *) __LC_LAST_BREAK);
        rc |= put_guest_lc(vcpu, pgm_info.code,
@@@ -923,9 -909,35 +909,35 @@@ int kvm_cpu_has_pending_timer(struct kv
        return ckc_irq_pending(vcpu) || cpu_timer_irq_pending(vcpu);
  }
  
+ static u64 __calculate_sltime(struct kvm_vcpu *vcpu)
+ {
+       u64 now, cputm, sltime = 0;
+       if (ckc_interrupts_enabled(vcpu)) {
+               now = kvm_s390_get_tod_clock_fast(vcpu->kvm);
+               sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now);
+               /* already expired or overflow? */
+               if (!sltime || vcpu->arch.sie_block->ckc <= now)
+                       return 0;
+               if (cpu_timer_interrupts_enabled(vcpu)) {
+                       cputm = kvm_s390_get_cpu_timer(vcpu);
+                       /* already expired? */
+                       if (cputm >> 63)
+                               return 0;
+                       return min(sltime, tod_to_ns(cputm));
+               }
+       } else if (cpu_timer_interrupts_enabled(vcpu)) {
+               sltime = kvm_s390_get_cpu_timer(vcpu);
+               /* already expired? */
+               if (sltime >> 63)
+                       return 0;
+       }
+       return sltime;
+ }
  int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
  {
-       u64 now, sltime;
+       u64 sltime;
  
        vcpu->stat.exit_wait_state++;
  
                return -EOPNOTSUPP; /* disabled wait */
        }
  
-       if (!ckc_interrupts_enabled(vcpu)) {
+       if (!ckc_interrupts_enabled(vcpu) &&
+           !cpu_timer_interrupts_enabled(vcpu)) {
                VCPU_EVENT(vcpu, 3, "%s", "enabled wait w/o timer");
                __set_cpu_idle(vcpu);
                goto no_timer;
        }
  
-       now = kvm_s390_get_tod_clock_fast(vcpu->kvm);
-       sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now);
-       /* underflow */
-       if (vcpu->arch.sie_block->ckc < now)
+       sltime = __calculate_sltime(vcpu);
+       if (!sltime)
                return 0;
  
        __set_cpu_idle(vcpu);
        hrtimer_start(&vcpu->arch.ckc_timer, ktime_set (0, sltime) , HRTIMER_MODE_REL);
-       VCPU_EVENT(vcpu, 4, "enabled wait via clock comparator: %llu ns", sltime);
+       VCPU_EVENT(vcpu, 4, "enabled wait: %llu ns", sltime);
  no_timer:
        srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
        kvm_vcpu_block(vcpu);
  
  void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
  {
 -      if (waitqueue_active(&vcpu->wq)) {
 +      if (swait_active(&vcpu->wq)) {
                /*
                 * The vcpu gave up the cpu voluntarily, mark it as a good
                 * yield-candidate.
                 */
                vcpu->preempted = true;
 -              wake_up_interruptible(&vcpu->wq);
 +              swake_up(&vcpu->wq);
                vcpu->stat.halt_wakeup++;
        }
  }
  enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer)
  {
        struct kvm_vcpu *vcpu;
-       u64 now, sltime;
+       u64 sltime;
  
        vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer);
-       now = kvm_s390_get_tod_clock_fast(vcpu->kvm);
-       sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now);
+       sltime = __calculate_sltime(vcpu);
  
        /*
         * If the monotonic clock runs faster than the tod clock we might be
         * woken up too early and have to go back to sleep to avoid deadlocks.
         */
-       if (vcpu->arch.sie_block->ckc > now &&
-           hrtimer_forward_now(timer, ns_to_ktime(sltime)))
+       if (sltime && hrtimer_forward_now(timer, ns_to_ktime(sltime)))
                return HRTIMER_RESTART;
        kvm_s390_vcpu_wakeup(vcpu);
        return HRTIMER_NORESTART;
@@@ -1059,8 -1067,16 +1067,16 @@@ static int __inject_prog(struct kvm_vcp
        trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
                                   irq->u.pgm.code, 0);
  
+       if (!(irq->u.pgm.flags & KVM_S390_PGM_FLAGS_ILC_VALID)) {
+               /* auto detection if no valid ILC was given */
+               irq->u.pgm.flags &= ~KVM_S390_PGM_FLAGS_ILC_MASK;
+               irq->u.pgm.flags |= kvm_s390_get_ilen(vcpu);
+               irq->u.pgm.flags |= KVM_S390_PGM_FLAGS_ILC_VALID;
+       }
        if (irq->u.pgm.code == PGM_PER) {
                li->irq.pgm.code |= PGM_PER;
+               li->irq.pgm.flags = irq->u.pgm.flags;
                /* only modify PER related information */
                li->irq.pgm.per_address = irq->u.pgm.per_address;
                li->irq.pgm.per_code = irq->u.pgm.per_code;
        } else if (!(irq->u.pgm.code & PGM_PER)) {
                li->irq.pgm.code = (li->irq.pgm.code & PGM_PER) |
                                   irq->u.pgm.code;
+               li->irq.pgm.flags = irq->u.pgm.flags;
                /* only modify non-PER information */
                li->irq.pgm.trans_exc_code = irq->u.pgm.trans_exc_code;
                li->irq.pgm.mon_code = irq->u.pgm.mon_code;
diff --combined arch/s390/kvm/kvm-s390.c
index 03dfe9c667f4eb944705787e54ff7e6ac3c08afb,c186d55b87ac3db66d74d7e36e0cb283fa7b2516..e196582fe87d4631ab9335b57c7d21f3f4a2ba0f
@@@ -158,6 -158,8 +158,8 @@@ static int kvm_clock_sync(struct notifi
                kvm->arch.epoch -= *delta;
                kvm_for_each_vcpu(i, vcpu, kvm) {
                        vcpu->arch.sie_block->epoch -= *delta;
+                       if (vcpu->arch.cputm_enabled)
+                               vcpu->arch.cputm_start += *delta;
                }
        }
        return NOTIFY_OK;
@@@ -274,7 -276,6 +276,6 @@@ static void kvm_s390_sync_dirty_log(str
        unsigned long address;
        struct gmap *gmap = kvm->arch.gmap;
  
-       down_read(&gmap->mm->mmap_sem);
        /* Loop over all guest pages */
        last_gfn = memslot->base_gfn + memslot->npages;
        for (cur_gfn = memslot->base_gfn; cur_gfn <= last_gfn; cur_gfn++) {
  
                if (gmap_test_and_clear_dirty(address, gmap))
                        mark_page_dirty(kvm, cur_gfn);
+               if (fatal_signal_pending(current))
+                       return;
+               cond_resched();
        }
-       up_read(&gmap->mm->mmap_sem);
  }
  
  /* Section: vm related */
@@@ -352,8 -355,8 +355,8 @@@ static int kvm_vm_ioctl_enable_cap(stru
                if (atomic_read(&kvm->online_vcpus)) {
                        r = -EBUSY;
                } else if (MACHINE_HAS_VX) {
-                       set_kvm_facility(kvm->arch.model.fac->mask, 129);
-                       set_kvm_facility(kvm->arch.model.fac->list, 129);
+                       set_kvm_facility(kvm->arch.model.fac_mask, 129);
+                       set_kvm_facility(kvm->arch.model.fac_list, 129);
                        r = 0;
                } else
                        r = -EINVAL;
                if (atomic_read(&kvm->online_vcpus)) {
                        r = -EBUSY;
                } else if (test_facility(64)) {
-                       set_kvm_facility(kvm->arch.model.fac->mask, 64);
-                       set_kvm_facility(kvm->arch.model.fac->list, 64);
+                       set_kvm_facility(kvm->arch.model.fac_mask, 64);
+                       set_kvm_facility(kvm->arch.model.fac_list, 64);
                        r = 0;
                }
                mutex_unlock(&kvm->lock);
@@@ -651,7 -654,7 +654,7 @@@ static int kvm_s390_set_processor(struc
                memcpy(&kvm->arch.model.cpu_id, &proc->cpuid,
                       sizeof(struct cpuid));
                kvm->arch.model.ibc = proc->ibc;
-               memcpy(kvm->arch.model.fac->list, proc->fac_list,
+               memcpy(kvm->arch.model.fac_list, proc->fac_list,
                       S390_ARCH_FAC_LIST_SIZE_BYTE);
        } else
                ret = -EFAULT;
@@@ -685,7 -688,8 +688,8 @@@ static int kvm_s390_get_processor(struc
        }
        memcpy(&proc->cpuid, &kvm->arch.model.cpu_id, sizeof(struct cpuid));
        proc->ibc = kvm->arch.model.ibc;
-       memcpy(&proc->fac_list, kvm->arch.model.fac->list, S390_ARCH_FAC_LIST_SIZE_BYTE);
+       memcpy(&proc->fac_list, kvm->arch.model.fac_list,
+              S390_ARCH_FAC_LIST_SIZE_BYTE);
        if (copy_to_user((void __user *)attr->addr, proc, sizeof(*proc)))
                ret = -EFAULT;
        kfree(proc);
@@@ -705,7 -709,7 +709,7 @@@ static int kvm_s390_get_machine(struct 
        }
        get_cpu_id((struct cpuid *) &mach->cpuid);
        mach->ibc = sclp.ibc;
-       memcpy(&mach->fac_mask, kvm->arch.model.fac->mask,
+       memcpy(&mach->fac_mask, kvm->arch.model.fac_mask,
               S390_ARCH_FAC_LIST_SIZE_BYTE);
        memcpy((unsigned long *)&mach->fac_list, S390_lowcore.stfle_fac_list,
               S390_ARCH_FAC_LIST_SIZE_BYTE);
@@@ -1082,16 -1086,12 +1086,12 @@@ static void kvm_s390_get_cpu_id(struct 
        cpu_id->version = 0xff;
  }
  
- static int kvm_s390_crypto_init(struct kvm *kvm)
+ static void kvm_s390_crypto_init(struct kvm *kvm)
  {
        if (!test_kvm_facility(kvm, 76))
-               return 0;
-       kvm->arch.crypto.crycb = kzalloc(sizeof(*kvm->arch.crypto.crycb),
-                                        GFP_KERNEL | GFP_DMA);
-       if (!kvm->arch.crypto.crycb)
-               return -ENOMEM;
+               return;
  
+       kvm->arch.crypto.crycb = &kvm->arch.sie_page2->crycb;
        kvm_s390_set_crycb_format(kvm);
  
        /* Enable AES/DEA protected key functions by default */
                         sizeof(kvm->arch.crypto.crycb->aes_wrapping_key_mask));
        get_random_bytes(kvm->arch.crypto.crycb->dea_wrapping_key_mask,
                         sizeof(kvm->arch.crypto.crycb->dea_wrapping_key_mask));
-       return 0;
  }
  
  static void sca_dispose(struct kvm *kvm)
@@@ -1156,37 -1154,30 +1154,30 @@@ int kvm_arch_init_vm(struct kvm *kvm, u
        if (!kvm->arch.dbf)
                goto out_err;
  
-       /*
-        * The architectural maximum amount of facilities is 16 kbit. To store
-        * this amount, 2 kbyte of memory is required. Thus we need a full
-        * page to hold the guest facility list (arch.model.fac->list) and the
-        * facility mask (arch.model.fac->mask). Its address size has to be
-        * 31 bits and word aligned.
-        */
-       kvm->arch.model.fac =
-               (struct kvm_s390_fac *) get_zeroed_page(GFP_KERNEL | GFP_DMA);
-       if (!kvm->arch.model.fac)
+       kvm->arch.sie_page2 =
+            (struct sie_page2 *) get_zeroed_page(GFP_KERNEL | GFP_DMA);
+       if (!kvm->arch.sie_page2)
                goto out_err;
  
        /* Populate the facility mask initially. */
-       memcpy(kvm->arch.model.fac->mask, S390_lowcore.stfle_fac_list,
+       memcpy(kvm->arch.model.fac_mask, S390_lowcore.stfle_fac_list,
               S390_ARCH_FAC_LIST_SIZE_BYTE);
        for (i = 0; i < S390_ARCH_FAC_LIST_SIZE_U64; i++) {
                if (i < kvm_s390_fac_list_mask_size())
-                       kvm->arch.model.fac->mask[i] &= kvm_s390_fac_list_mask[i];
+                       kvm->arch.model.fac_mask[i] &= kvm_s390_fac_list_mask[i];
                else
-                       kvm->arch.model.fac->mask[i] = 0UL;
+                       kvm->arch.model.fac_mask[i] = 0UL;
        }
  
        /* Populate the facility list initially. */
-       memcpy(kvm->arch.model.fac->list, kvm->arch.model.fac->mask,
+       kvm->arch.model.fac_list = kvm->arch.sie_page2->fac_list;
+       memcpy(kvm->arch.model.fac_list, kvm->arch.model.fac_mask,
               S390_ARCH_FAC_LIST_SIZE_BYTE);
  
        kvm_s390_get_cpu_id(&kvm->arch.model.cpu_id);
        kvm->arch.model.ibc = sclp.ibc & 0x0fff;
  
-       if (kvm_s390_crypto_init(kvm) < 0)
-               goto out_err;
+       kvm_s390_crypto_init(kvm);
  
        spin_lock_init(&kvm->arch.float_int.lock);
        for (i = 0; i < FIRQ_LIST_COUNT; i++)
  
        return 0;
  out_err:
-       kfree(kvm->arch.crypto.crycb);
-       free_page((unsigned long)kvm->arch.model.fac);
+       free_page((unsigned long)kvm->arch.sie_page2);
        debug_unregister(kvm->arch.dbf);
        sca_dispose(kvm);
        KVM_EVENT(3, "creation of vm failed: %d", rc);
@@@ -1269,10 -1259,9 +1259,9 @@@ static void kvm_free_vcpus(struct kvm *
  void kvm_arch_destroy_vm(struct kvm *kvm)
  {
        kvm_free_vcpus(kvm);
-       free_page((unsigned long)kvm->arch.model.fac);
        sca_dispose(kvm);
        debug_unregister(kvm->arch.dbf);
-       kfree(kvm->arch.crypto.crycb);
+       free_page((unsigned long)kvm->arch.sie_page2);
        if (!kvm_is_ucontrol(kvm))
                gmap_free(kvm->arch.gmap);
        kvm_s390_destroy_adapters(kvm);
@@@ -1414,8 -1403,13 +1403,13 @@@ int kvm_arch_vcpu_init(struct kvm_vcpu 
                                    KVM_SYNC_PFAULT;
        if (test_kvm_facility(vcpu->kvm, 64))
                vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB;
-       if (test_kvm_facility(vcpu->kvm, 129))
+       /* fprs can be synchronized via vrs, even if the guest has no vx. With
+        * MACHINE_HAS_VX, (load|store)_fpu_regs() will work with vrs format.
+        */
+       if (MACHINE_HAS_VX)
                vcpu->run->kvm_valid_regs |= KVM_SYNC_VRS;
+       else
+               vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS;
  
        if (kvm_is_ucontrol(vcpu->kvm))
                return __kvm_ucontrol_vcpu_init(vcpu);
        return 0;
  }
  
+ /* needs disabled preemption to protect from TOD sync and vcpu_load/put */
+ static void __start_cpu_timer_accounting(struct kvm_vcpu *vcpu)
+ {
+       WARN_ON_ONCE(vcpu->arch.cputm_start != 0);
+       raw_write_seqcount_begin(&vcpu->arch.cputm_seqcount);
+       vcpu->arch.cputm_start = get_tod_clock_fast();
+       raw_write_seqcount_end(&vcpu->arch.cputm_seqcount);
+ }
+ /* needs disabled preemption to protect from TOD sync and vcpu_load/put */
+ static void __stop_cpu_timer_accounting(struct kvm_vcpu *vcpu)
+ {
+       WARN_ON_ONCE(vcpu->arch.cputm_start == 0);
+       raw_write_seqcount_begin(&vcpu->arch.cputm_seqcount);
+       vcpu->arch.sie_block->cputm -= get_tod_clock_fast() - vcpu->arch.cputm_start;
+       vcpu->arch.cputm_start = 0;
+       raw_write_seqcount_end(&vcpu->arch.cputm_seqcount);
+ }
+ /* needs disabled preemption to protect from TOD sync and vcpu_load/put */
+ static void __enable_cpu_timer_accounting(struct kvm_vcpu *vcpu)
+ {
+       WARN_ON_ONCE(vcpu->arch.cputm_enabled);
+       vcpu->arch.cputm_enabled = true;
+       __start_cpu_timer_accounting(vcpu);
+ }
+ /* needs disabled preemption to protect from TOD sync and vcpu_load/put */
+ static void __disable_cpu_timer_accounting(struct kvm_vcpu *vcpu)
+ {
+       WARN_ON_ONCE(!vcpu->arch.cputm_enabled);
+       __stop_cpu_timer_accounting(vcpu);
+       vcpu->arch.cputm_enabled = false;
+ }
+ static void enable_cpu_timer_accounting(struct kvm_vcpu *vcpu)
+ {
+       preempt_disable(); /* protect from TOD sync and vcpu_load/put */
+       __enable_cpu_timer_accounting(vcpu);
+       preempt_enable();
+ }
+ static void disable_cpu_timer_accounting(struct kvm_vcpu *vcpu)
+ {
+       preempt_disable(); /* protect from TOD sync and vcpu_load/put */
+       __disable_cpu_timer_accounting(vcpu);
+       preempt_enable();
+ }
+ /* set the cpu timer - may only be called from the VCPU thread itself */
+ void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm)
+ {
+       preempt_disable(); /* protect from TOD sync and vcpu_load/put */
+       raw_write_seqcount_begin(&vcpu->arch.cputm_seqcount);
+       if (vcpu->arch.cputm_enabled)
+               vcpu->arch.cputm_start = get_tod_clock_fast();
+       vcpu->arch.sie_block->cputm = cputm;
+       raw_write_seqcount_end(&vcpu->arch.cputm_seqcount);
+       preempt_enable();
+ }
+ /* update and get the cpu timer - can also be called from other VCPU threads */
+ __u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu)
+ {
+       unsigned int seq;
+       __u64 value;
+       if (unlikely(!vcpu->arch.cputm_enabled))
+               return vcpu->arch.sie_block->cputm;
+       preempt_disable(); /* protect from TOD sync and vcpu_load/put */
+       do {
+               seq = raw_read_seqcount(&vcpu->arch.cputm_seqcount);
+               /*
+                * If the writer would ever execute a read in the critical
+                * section, e.g. in irq context, we have a deadlock.
+                */
+               WARN_ON_ONCE((seq & 1) && smp_processor_id() == vcpu->cpu);
+               value = vcpu->arch.sie_block->cputm;
+               /* if cputm_start is 0, accounting is being started/stopped */
+               if (likely(vcpu->arch.cputm_start))
+                       value -= get_tod_clock_fast() - vcpu->arch.cputm_start;
+       } while (read_seqcount_retry(&vcpu->arch.cputm_seqcount, seq & ~1));
+       preempt_enable();
+       return value;
+ }
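The setter and getter above pair a seqcount with disabled preemption so that other VCPU threads can compute a consistent view of the running CPU timer without taking a lock: the writer bumps the sequence to an odd value, updates cputm/cputm_start, then bumps it back to even, while readers retry if they observe an odd or changed sequence. A minimal user-space sketch of the same read-retry pattern is shown below; the names (timer_sample, timer_set, timer_get) are illustrative rather than kernel API, and a production seqlock needs stronger fencing around the plain data accesses than this simplified version provides.

    #include <stdatomic.h>
    #include <stdint.h>

    struct timer_sample {
            atomic_uint seq;        /* even = stable, odd = writer in progress */
            uint64_t cputm;         /* remaining CPU timer value */
            uint64_t start;         /* timestamp when accounting started, 0 if stopped */
    };

    /* writer: make seq odd, update the payload, make seq even again */
    static void timer_set(struct timer_sample *t, uint64_t now, uint64_t cputm)
    {
            atomic_fetch_add_explicit(&t->seq, 1, memory_order_acq_rel);
            t->start = now;
            t->cputm = cputm;
            atomic_fetch_add_explicit(&t->seq, 1, memory_order_release);
    }

    /* reader: retry while a writer is active or has run concurrently */
    static uint64_t timer_get(struct timer_sample *t, uint64_t now)
    {
            unsigned int seq;
            uint64_t value;

            do {
                    seq = atomic_load_explicit(&t->seq, memory_order_acquire);
                    value = t->cputm;
                    if (t->start)           /* accounting currently running */
                            value -= now - t->start;
            } while ((seq & 1) ||
                     seq != atomic_load_explicit(&t->seq, memory_order_acquire));
            return value;
    }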
  void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
  {
        /* Save host register state */
        vcpu->arch.host_fpregs.fpc = current->thread.fpu.fpc;
        vcpu->arch.host_fpregs.regs = current->thread.fpu.regs;
  
-       /* Depending on MACHINE_HAS_VX, data stored to vrs either
-        * has vector register or floating point register format.
-        */
-       current->thread.fpu.regs = vcpu->run->s.regs.vrs;
+       if (MACHINE_HAS_VX)
+               current->thread.fpu.regs = vcpu->run->s.regs.vrs;
+       else
+               current->thread.fpu.regs = vcpu->run->s.regs.fprs;
        current->thread.fpu.fpc = vcpu->run->s.regs.fpc;
        if (test_fp_ctl(current->thread.fpu.fpc))
                /* User space provided an invalid FPC, let's clear it */
        restore_access_regs(vcpu->run->s.regs.acrs);
        gmap_enable(vcpu->arch.gmap);
        atomic_or(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
+       if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
+               __start_cpu_timer_accounting(vcpu);
+       vcpu->cpu = cpu;
  }
  
  void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
  {
+       vcpu->cpu = -1;
+       if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
+               __stop_cpu_timer_accounting(vcpu);
        atomic_andnot(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
        gmap_disable(vcpu->arch.gmap);
  
@@@ -1468,7 -1555,7 +1555,7 @@@ static void kvm_s390_vcpu_initial_reset
        vcpu->arch.sie_block->gpsw.mask = 0UL;
        vcpu->arch.sie_block->gpsw.addr = 0UL;
        kvm_s390_set_prefix(vcpu, 0);
-       vcpu->arch.sie_block->cputm     = 0UL;
+       kvm_s390_set_cpu_timer(vcpu, 0);
        vcpu->arch.sie_block->ckc       = 0UL;
        vcpu->arch.sie_block->todpr     = 0;
        memset(vcpu->arch.sie_block->gcr, 0, 16 * sizeof(__u64));
@@@ -1538,7 -1625,8 +1625,8 @@@ static void kvm_s390_vcpu_setup_model(s
  
        vcpu->arch.cpu_id = model->cpu_id;
        vcpu->arch.sie_block->ibc = model->ibc;
-       vcpu->arch.sie_block->fac = (int) (long) model->fac->list;
+       if (test_kvm_facility(vcpu->kvm, 7))
+               vcpu->arch.sie_block->fac = (u32)(u64) model->fac_list;
  }
  
  int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
@@@ -1616,6 -1704,7 +1704,7 @@@ struct kvm_vcpu *kvm_arch_vcpu_create(s
        vcpu->arch.local_int.float_int = &kvm->arch.float_int;
        vcpu->arch.local_int.wq = &vcpu->wq;
        vcpu->arch.local_int.cpuflags = &vcpu->arch.sie_block->cpuflags;
+       seqcount_init(&vcpu->arch.cputm_seqcount);
  
        rc = kvm_vcpu_init(vcpu, kvm, id);
        if (rc)
@@@ -1715,7 -1804,7 +1804,7 @@@ static int kvm_arch_vcpu_ioctl_get_one_
                             (u64 __user *)reg->addr);
                break;
        case KVM_REG_S390_CPU_TIMER:
-               r = put_user(vcpu->arch.sie_block->cputm,
+               r = put_user(kvm_s390_get_cpu_timer(vcpu),
                             (u64 __user *)reg->addr);
                break;
        case KVM_REG_S390_CLOCK_COMP:
@@@ -1753,6 -1842,7 +1842,7 @@@ static int kvm_arch_vcpu_ioctl_set_one_
                                           struct kvm_one_reg *reg)
  {
        int r = -EINVAL;
+       __u64 val;
  
        switch (reg->id) {
        case KVM_REG_S390_TODPR:
                             (u64 __user *)reg->addr);
                break;
        case KVM_REG_S390_CPU_TIMER:
-               r = get_user(vcpu->arch.sie_block->cputm,
-                            (u64 __user *)reg->addr);
+               r = get_user(val, (u64 __user *)reg->addr);
+               if (!r)
+                       kvm_s390_set_cpu_timer(vcpu, val);
                break;
        case KVM_REG_S390_CLOCK_COMP:
                r = get_user(vcpu->arch.sie_block->ckc,
@@@ -2158,8 -2249,10 +2249,10 @@@ static int vcpu_pre_run(struct kvm_vcp
  
  static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu)
  {
-       psw_t *psw = &vcpu->arch.sie_block->gpsw;
-       u8 opcode;
+       struct kvm_s390_pgm_info pgm_info = {
+               .code = PGM_ADDRESSING,
+       };
+       u8 opcode, ilen;
        int rc;
  
        VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction");
         * to look up the current opcode to get the length of the instruction
         * to be able to forward the PSW.
         */
-       rc = read_guest(vcpu, psw->addr, 0, &opcode, 1);
-       if (rc)
-               return kvm_s390_inject_prog_cond(vcpu, rc);
-       psw->addr = __rewind_psw(*psw, -insn_length(opcode));
-       return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+       rc = read_guest_instr(vcpu, &opcode, 1);
+       ilen = insn_length(opcode);
+       if (rc < 0) {
+               return rc;
+       } else if (rc) {
+               /* Instruction-Fetching Exceptions - we can't detect the ilen.
+                * Forward by arbitrary ilc, injection will take care of
+                * nullification if necessary.
+                */
+               pgm_info = vcpu->arch.pgm;
+               ilen = 4;
+       }
+       pgm_info.flags = ilen | KVM_S390_PGM_FLAGS_ILC_VALID;
+       kvm_s390_forward_psw(vcpu, ilen);
+       return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
  }
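The rewind/forward logic above only needs the instruction length, which s390 encodes in the two most significant bits of the first opcode byte (00 = 2 bytes, 01 or 10 = 4 bytes, 11 = 6 bytes); the kernel's insn_length() helper computes this branchlessly. A stand-alone sketch of the same decode, for illustration only:

    #include <stdint.h>

    /* Decode the s390 instruction length from the first opcode byte. */
    static unsigned int s390_insn_length(uint8_t first_byte)
    {
            switch (first_byte >> 6) {
            case 0:                 /* top bits 00 -> 2-byte instruction */
                    return 2;
            case 1:
            case 2:                 /* top bits 01 or 10 -> 4-byte instruction */
                    return 4;
            default:                /* top bits 11 -> 6-byte instruction */
                    return 6;
            }
    }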
  
  static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
@@@ -2244,10 -2346,12 +2346,12 @@@ static int __vcpu_run(struct kvm_vcpu *
                 */
                local_irq_disable();
                __kvm_guest_enter();
+               __disable_cpu_timer_accounting(vcpu);
                local_irq_enable();
                exit_reason = sie64a(vcpu->arch.sie_block,
                                     vcpu->run->s.regs.gprs);
                local_irq_disable();
+               __enable_cpu_timer_accounting(vcpu);
                __kvm_guest_exit();
                local_irq_enable();
                vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
@@@ -2271,7 -2375,7 +2375,7 @@@ static void sync_regs(struct kvm_vcpu *
                kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
        }
        if (kvm_run->kvm_dirty_regs & KVM_SYNC_ARCH0) {
-               vcpu->arch.sie_block->cputm = kvm_run->s.regs.cputm;
+               kvm_s390_set_cpu_timer(vcpu, kvm_run->s.regs.cputm);
                vcpu->arch.sie_block->ckc = kvm_run->s.regs.ckc;
                vcpu->arch.sie_block->todpr = kvm_run->s.regs.todpr;
                vcpu->arch.sie_block->pp = kvm_run->s.regs.pp;
@@@ -2293,7 -2397,7 +2397,7 @@@ static void store_regs(struct kvm_vcpu 
        kvm_run->psw_addr = vcpu->arch.sie_block->gpsw.addr;
        kvm_run->s.regs.prefix = kvm_s390_get_prefix(vcpu);
        memcpy(&kvm_run->s.regs.crs, &vcpu->arch.sie_block->gcr, 128);
-       kvm_run->s.regs.cputm = vcpu->arch.sie_block->cputm;
+       kvm_run->s.regs.cputm = kvm_s390_get_cpu_timer(vcpu);
        kvm_run->s.regs.ckc = vcpu->arch.sie_block->ckc;
        kvm_run->s.regs.todpr = vcpu->arch.sie_block->todpr;
        kvm_run->s.regs.pp = vcpu->arch.sie_block->pp;
@@@ -2325,6 -2429,7 +2429,7 @@@ int kvm_arch_vcpu_ioctl_run(struct kvm_
        }
  
        sync_regs(vcpu, kvm_run);
+       enable_cpu_timer_accounting(vcpu);
  
        might_fault();
        rc = __vcpu_run(vcpu);
                rc = 0;
        }
  
+       disable_cpu_timer_accounting(vcpu);
        store_regs(vcpu, kvm_run);
  
        if (vcpu->sigset_active)
@@@ -2364,7 -2470,7 +2470,7 @@@ int kvm_s390_store_status_unloaded(stru
        unsigned char archmode = 1;
        freg_t fprs[NUM_FPRS];
        unsigned int px;
-       u64 clkcomp;
+       u64 clkcomp, cputm;
        int rc;
  
        px = kvm_s390_get_prefix(vcpu);
  
        /* manually convert vector registers if necessary */
        if (MACHINE_HAS_VX) {
 -              convert_vx_to_fp(fprs, current->thread.fpu.vxrs);
 +              convert_vx_to_fp(fprs, (__vector128 *) vcpu->run->s.regs.vrs);
                rc = write_guest_abs(vcpu, gpa + __LC_FPREGS_SAVE_AREA,
                                     fprs, 128);
        } else {
                rc = write_guest_abs(vcpu, gpa + __LC_FPREGS_SAVE_AREA,
-                                    vcpu->run->s.regs.vrs, 128);
+                                    vcpu->run->s.regs.fprs, 128);
        }
        rc |= write_guest_abs(vcpu, gpa + __LC_GPREGS_SAVE_AREA,
                              vcpu->run->s.regs.gprs, 128);
                              &vcpu->run->s.regs.fpc, 4);
        rc |= write_guest_abs(vcpu, gpa + __LC_TOD_PROGREG_SAVE_AREA,
                              &vcpu->arch.sie_block->todpr, 4);
+       cputm = kvm_s390_get_cpu_timer(vcpu);
        rc |= write_guest_abs(vcpu, gpa + __LC_CPU_TIMER_SAVE_AREA,
-                             &vcpu->arch.sie_block->cputm, 8);
+                             &cputm, 8);
        clkcomp = vcpu->arch.sie_block->ckc >> 8;
        rc |= write_guest_abs(vcpu, gpa + __LC_CLOCK_COMP_SAVE_AREA,
                              &clkcomp, 8);
@@@ -2605,7 -2712,8 +2712,8 @@@ static long kvm_s390_guest_mem_op(struc
        switch (mop->op) {
        case KVM_S390_MEMOP_LOGICAL_READ:
                if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
-                       r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, false);
+                       r = check_gva_range(vcpu, mop->gaddr, mop->ar,
+                                           mop->size, GACC_FETCH);
                        break;
                }
                r = read_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size);
                break;
        case KVM_S390_MEMOP_LOGICAL_WRITE:
                if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
-                       r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, true);
+                       r = check_gva_range(vcpu, mop->gaddr, mop->ar,
+                                           mop->size, GACC_STORE);
                        break;
                }
                if (copy_from_user(tmpbuf, uaddr, mop->size)) {
diff --combined arch/x86/kvm/lapic.c
index 3a045f39ed8114e24e375521135cb7d2296e9e7e,d9ae1ce2a6a03e0e8ebac52ea88c94dd913fe5f7..443d2a57ad3d9620246097a48ed3cd7de9e02f50
@@@ -281,7 -281,7 +281,7 @@@ void kvm_apic_set_version(struct kvm_vc
        struct kvm_cpuid_entry2 *feat;
        u32 v = APIC_VERSION;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
+       if (!lapic_in_kernel(vcpu))
                return;
  
        feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
@@@ -475,26 -475,20 +475,20 @@@ static inline void apic_clear_isr(int v
  
  int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
  {
-       int highest_irr;
        /* This may race with setting of irr in __apic_accept_irq() and
         * the value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq
         * will cause a vmexit immediately and the value will be recalculated
         * on the next vmentry.
         */
-       if (!kvm_vcpu_has_lapic(vcpu))
-               return 0;
-       highest_irr = apic_find_highest_irr(vcpu->arch.apic);
-       return highest_irr;
+       return apic_find_highest_irr(vcpu->arch.apic);
  }
  
  static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
                             int vector, int level, int trig_mode,
-                            unsigned long *dest_map);
+                            struct dest_map *dest_map);
  
  int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
-               unsigned long *dest_map)
+                    struct dest_map *dest_map)
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
  
@@@ -675,8 -669,33 +669,33 @@@ bool kvm_apic_match_dest(struct kvm_vcp
        }
  }
  
+ int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
+                      const unsigned long *bitmap, u32 bitmap_size)
+ {
+       u32 mod;
+       int i, idx = -1;
+       mod = vector % dest_vcpus;
+       for (i = 0; i <= mod; i++) {
+               idx = find_next_bit(bitmap, bitmap_size, idx + 1);
+               BUG_ON(idx == bitmap_size);
+       }
+       return idx;
+ }
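kvm_vector_to_index() above implements the vector-hashing policy described later in this file: hash the guest vector modulo the number of candidate destinations, then walk the destination bitmap to the (hash + 1)-th set bit. A self-contained sketch of that selection over a 16-entry logical map; pick_dest_by_vector_hash is a hypothetical name, not a kernel function:

    #include <stdint.h>

    /* Returns the chosen bit index in dest_bitmap, or -1 if it is empty. */
    static int pick_dest_by_vector_hash(uint32_t vector, uint16_t dest_bitmap)
    {
            int set_bits = __builtin_popcount(dest_bitmap);
            int target, i;

            if (!set_bits)
                    return -1;

            target = vector % set_bits;     /* 0-based rank of the wanted set bit */
            for (i = 0; i < 16; i++) {
                    if (!(dest_bitmap & (1u << i)))
                            continue;
                    if (target-- == 0)
                            return i;
            }
            return -1;                      /* not reached if popcount was correct */
    }

For example, with destinations {0, 2, 3} (bitmap 0x000d) and vector 33, 33 % 3 == 0, so bit 0 is selected.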
+ static void kvm_apic_disabled_lapic_found(struct kvm *kvm)
+ {
+       if (!kvm->arch.disabled_lapic_found) {
+               kvm->arch.disabled_lapic_found = true;
+               printk(KERN_INFO
+                      "Disabled LAPIC found during irq injection\n");
+       }
+ }
  bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
-               struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map)
+               struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map)
  {
        struct kvm_apic_map *map;
        unsigned long bitmap = 1;
  
                dst = map->logical_map[cid];
  
-               if (kvm_lowest_prio_delivery(irq)) {
+               if (!kvm_lowest_prio_delivery(irq))
+                       goto set_irq;
+               if (!kvm_vector_hashing_enabled()) {
                        int l = -1;
                        for_each_set_bit(i, &bitmap, 16) {
                                if (!dst[i])
                                        continue;
                                if (l < 0)
                                        l = i;
-                               else if (kvm_apic_compare_prio(dst[i]->vcpu, dst[l]->vcpu) < 0)
+                               else if (kvm_apic_compare_prio(dst[i]->vcpu,
+                                                       dst[l]->vcpu) < 0)
                                        l = i;
                        }
                        bitmap = (l >= 0) ? 1 << l : 0;
+               } else {
+                       int idx;
+                       unsigned int dest_vcpus;
+                       dest_vcpus = hweight16(bitmap);
+                       if (dest_vcpus == 0)
+                               goto out;
+                       idx = kvm_vector_to_index(irq->vector,
+                               dest_vcpus, &bitmap, 16);
+                       if (!dst[idx]) {
+                               kvm_apic_disabled_lapic_found(kvm);
+                               goto out;
+                       }
+                       bitmap = (idx >= 0) ? 1 << idx : 0;
                }
        }
  
+ set_irq:
        for_each_set_bit(i, &bitmap, 16) {
                if (!dst[i])
                        continue;
@@@ -754,6 -794,20 +794,20 @@@ out
        return ret;
  }
  
+ /*
+  * This routine tries to handle interrupts in posted mode; here is how
+  * it deals with different cases:
+  * - For a single-destination interrupt, handle it in posted mode
+  * - Else if vector hashing is enabled and it is a lowest-priority
+  *   interrupt, handle it in posted mode and use the following mechanism
+  *   to find the destination vCPU.
+  *    1. For lowest-priority interrupts, store all the possible
+  *       destination vCPUs in an array.
+  *    2. Use "guest vector % max number of destination vCPUs" to find
+  *       the right destination vCPU in the array for the lowest-priority
+  *       interrupt.
+  * - Otherwise, use remapped mode to inject the interrupt.
+  */
  bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
                        struct kvm_vcpu **dest_vcpu)
  {
                if (cid >= ARRAY_SIZE(map->logical_map))
                        goto out;
  
-               for_each_set_bit(i, &bitmap, 16) {
-                       dst = map->logical_map[cid][i];
-                       if (++r == 2)
+               if (kvm_vector_hashing_enabled() &&
+                               kvm_lowest_prio_delivery(irq)) {
+                       int idx;
+                       unsigned int dest_vcpus;
+                       dest_vcpus = hweight16(bitmap);
+                       if (dest_vcpus == 0)
                                goto out;
-               }
  
-               if (dst && kvm_apic_present(dst->vcpu))
+                       idx = kvm_vector_to_index(irq->vector, dest_vcpus,
+                                                 &bitmap, 16);
+                       dst = map->logical_map[cid][idx];
+                       if (!dst) {
+                               kvm_apic_disabled_lapic_found(kvm);
+                               goto out;
+                       }
                        *dest_vcpu = dst->vcpu;
-               else
-                       goto out;
+               } else {
+                       for_each_set_bit(i, &bitmap, 16) {
+                               dst = map->logical_map[cid][i];
+                               if (++r == 2)
+                                       goto out;
+                       }
+                       if (dst && kvm_apic_present(dst->vcpu))
+                               *dest_vcpu = dst->vcpu;
+                       else
+                               goto out;
+               }
        }
  
        ret = true;
@@@ -819,7 -894,7 +894,7 @@@ out
   */
  static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
                             int vector, int level, int trig_mode,
-                            unsigned long *dest_map)
+                            struct dest_map *dest_map)
  {
        int result = 0;
        struct kvm_vcpu *vcpu = apic->vcpu;
  
                result = 1;
  
-               if (dest_map)
-                       __set_bit(vcpu->vcpu_id, dest_map);
+               if (dest_map) {
+                       __set_bit(vcpu->vcpu_id, dest_map->map);
+                       dest_map->vectors[vcpu->vcpu_id] = vector;
+               }
  
                if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
                        if (trig_mode)
@@@ -1195,7 -1272,7 +1272,7 @@@ static void apic_update_lvtt(struct kvm
  static void apic_timer_expired(struct kvm_lapic *apic)
  {
        struct kvm_vcpu *vcpu = apic->vcpu;
 -      wait_queue_head_t *q = &vcpu->wq;
 +      struct swait_queue_head *q = &vcpu->wq;
        struct kvm_timer *ktimer = &apic->lapic_timer;
  
        if (atomic_read(&apic->lapic_timer.pending))
        atomic_inc(&apic->lapic_timer.pending);
        kvm_set_pending_timer(vcpu);
  
 -      if (waitqueue_active(q))
 -              wake_up_interruptible(q);
 +      if (swait_active(q))
 +              swake_up(q);
  
        if (apic_lvtt_tscdeadline(apic))
                ktimer->expired_tscdeadline = ktimer->tscdeadline;
@@@ -1239,7 -1316,7 +1316,7 @@@ void wait_lapic_expire(struct kvm_vcpu 
        struct kvm_lapic *apic = vcpu->arch.apic;
        u64 guest_tsc, tsc_deadline;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
+       if (!lapic_in_kernel(vcpu))
                return;
  
        if (apic->lapic_timer.expired_tscdeadline == 0)
@@@ -1515,8 -1592,7 +1592,7 @@@ static int apic_mmio_write(struct kvm_v
  
  void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
  {
-       if (kvm_vcpu_has_lapic(vcpu))
-               apic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
+       apic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
  }
  EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
  
@@@ -1566,7 -1642,7 +1642,7 @@@ u64 kvm_get_lapic_tscdeadline_msr(struc
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
  
-       if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
+       if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) ||
                        apic_lvtt_period(apic))
                return 0;
  
@@@ -1577,7 -1653,7 +1653,7 @@@ void kvm_set_lapic_tscdeadline_msr(stru
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
  
-       if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
+       if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) ||
                        apic_lvtt_period(apic))
                return;
  
@@@ -1590,9 -1666,6 +1666,6 @@@ void kvm_lapic_set_tpr(struct kvm_vcpu 
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
-               return;
        apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
                     | (kvm_apic_get_reg(apic, APIC_TASKPRI) & 4));
  }
@@@ -1601,9 -1674,6 +1674,6 @@@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *
  {
        u64 tpr;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
-               return 0;
        tpr = (u64) kvm_apic_get_reg(vcpu->arch.apic, APIC_TASKPRI);
  
        return (tpr & 0xf0) >> 4;
@@@ -1728,8 -1798,7 +1798,7 @@@ int apic_has_pending_timer(struct kvm_v
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
  
-       if (kvm_vcpu_has_lapic(vcpu) && apic_enabled(apic) &&
-                       apic_lvt_enabled(apic, APIC_LVTT))
+       if (apic_enabled(apic) && apic_lvt_enabled(apic, APIC_LVTT))
                return atomic_read(&apic->lapic_timer.pending);
  
        return 0;
@@@ -1826,7 -1895,7 +1895,7 @@@ int kvm_apic_has_interrupt(struct kvm_v
        struct kvm_lapic *apic = vcpu->arch.apic;
        int highest_irr;
  
-       if (!kvm_vcpu_has_lapic(vcpu) || !apic_enabled(apic))
+       if (!apic_enabled(apic))
                return -1;
  
        apic_update_ppr(apic);
@@@ -1854,9 -1923,6 +1923,6 @@@ void kvm_inject_apic_timer_irqs(struct 
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
-               return;
        if (atomic_read(&apic->lapic_timer.pending) > 0) {
                kvm_apic_local_deliver(apic, APIC_LVTT);
                if (apic_lvtt_tscdeadline(apic))
@@@ -1932,7 -1998,7 +1998,7 @@@ void __kvm_migrate_apic_timer(struct kv
  {
        struct hrtimer *timer;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
+       if (!lapic_in_kernel(vcpu))
                return;
  
        timer = &vcpu->arch.apic->lapic_timer.timer;
@@@ -2105,7 -2171,7 +2171,7 @@@ int kvm_hv_vapic_msr_write(struct kvm_v
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
+       if (!lapic_in_kernel(vcpu))
                return 1;
  
        /* if this is ICR write vector before command */
@@@ -2119,7 -2185,7 +2185,7 @@@ int kvm_hv_vapic_msr_read(struct kvm_vc
        struct kvm_lapic *apic = vcpu->arch.apic;
        u32 low, high = 0;
  
-       if (!kvm_vcpu_has_lapic(vcpu))
+       if (!lapic_in_kernel(vcpu))
                return 1;
  
        if (apic_reg_read(apic, reg, 4, &low))
@@@ -2151,7 -2217,7 +2217,7 @@@ void kvm_apic_accept_events(struct kvm_
        u8 sipi_vector;
        unsigned long pe;
  
-       if (!kvm_vcpu_has_lapic(vcpu) || !apic->pending_events)
+       if (!lapic_in_kernel(vcpu) || !apic->pending_events)
                return;
  
        /*
diff --combined arch/x86/kvm/mmu.c
index 1e7a49bfc94fb323cbb11782693cab89743f1133,2463de0b935cea06967faa9a56c17a65eede15fb..c512f095cdac82b9e2ba258ae052a9a4199dc13c
@@@ -41,6 -41,7 +41,7 @@@
  #include <asm/cmpxchg.h>
  #include <asm/io.h>
  #include <asm/vmx.h>
+ #include <asm/kvm_page_track.h>
  
  /*
   * When setting this variable to true it enables Two-Dimensional-Paging
@@@ -776,62 -777,85 +777,85 @@@ static struct kvm_lpage_info *lpage_inf
        return &slot->arch.lpage_info[level - 2][idx];
  }
  
+ static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
+                                           gfn_t gfn, int count)
+ {
+       struct kvm_lpage_info *linfo;
+       int i;
+       for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
+               linfo = lpage_info_slot(gfn, slot, i);
+               linfo->disallow_lpage += count;
+               WARN_ON(linfo->disallow_lpage < 0);
+       }
+ }
+ void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+ {
+       update_gfn_disallow_lpage_count(slot, gfn, 1);
+ }
+ void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+ {
+       update_gfn_disallow_lpage_count(slot, gfn, -1);
+ }
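account_shadowed() below bumps this counter directly for last-level shadow pages (for upper levels it goes through the new write-tracking machinery, which is expected to do the same), and a large mapping at a gfn is permitted only while its disallow_lpage count is zero. A simplified model of that bookkeeping, assuming x86's 9 address bits per paging level and using purely illustrative names and array sizes:

    #include <assert.h>
    #include <stdint.h>

    #define MAX_LEVEL       3               /* 1 = 4K, 2 = 2M, 3 = 1G in this sketch */
    #define SLOT_GFNS       (1u << 12)      /* gfns covered by the toy memslot */

    static int disallow_lpage[MAX_LEVEL + 1][SLOT_GFNS];

    /* index of the large frame that contains gfn at the given level */
    static unsigned int lpage_index(uint64_t gfn, int level)
    {
            return (unsigned int)(gfn >> (9 * (level - 1)));
    }

    static void update_disallow_lpage(uint64_t gfn, int count)
    {
            for (int level = 2; level <= MAX_LEVEL; level++) {
                    disallow_lpage[level][lpage_index(gfn, level)] += count;
                    assert(disallow_lpage[level][lpage_index(gfn, level)] >= 0);
            }
    }

    static int lpage_allowed(uint64_t gfn, int level)
    {
            return disallow_lpage[level][lpage_index(gfn, level)] == 0;
    }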
  static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
  {
        struct kvm_memslots *slots;
        struct kvm_memory_slot *slot;
-       struct kvm_lpage_info *linfo;
        gfn_t gfn;
-       int i;
  
+       kvm->arch.indirect_shadow_pages++;
        gfn = sp->gfn;
        slots = kvm_memslots_for_spte_role(kvm, sp->role);
        slot = __gfn_to_memslot(slots, gfn);
-       for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
-               linfo = lpage_info_slot(gfn, slot, i);
-               linfo->write_count += 1;
-       }
-       kvm->arch.indirect_shadow_pages++;
+       /* the non-leaf shadow pages are kept read-only. */
+       if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+               return kvm_slot_page_track_add_page(kvm, slot, gfn,
+                                                   KVM_PAGE_TRACK_WRITE);
+       kvm_mmu_gfn_disallow_lpage(slot, gfn);
  }
  
  static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
  {
        struct kvm_memslots *slots;
        struct kvm_memory_slot *slot;
-       struct kvm_lpage_info *linfo;
        gfn_t gfn;
-       int i;
  
+       kvm->arch.indirect_shadow_pages--;
        gfn = sp->gfn;
        slots = kvm_memslots_for_spte_role(kvm, sp->role);
        slot = __gfn_to_memslot(slots, gfn);
-       for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
-               linfo = lpage_info_slot(gfn, slot, i);
-               linfo->write_count -= 1;
-               WARN_ON(linfo->write_count < 0);
-       }
-       kvm->arch.indirect_shadow_pages--;
+       if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+               return kvm_slot_page_track_remove_page(kvm, slot, gfn,
+                                                      KVM_PAGE_TRACK_WRITE);
+       kvm_mmu_gfn_allow_lpage(slot, gfn);
  }
  
- static int __has_wrprotected_page(gfn_t gfn, int level,
-                                 struct kvm_memory_slot *slot)
+ static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
+                                         struct kvm_memory_slot *slot)
  {
        struct kvm_lpage_info *linfo;
  
        if (slot) {
                linfo = lpage_info_slot(gfn, slot, level);
-               return linfo->write_count;
+               return !!linfo->disallow_lpage;
        }
  
-       return 1;
+       return true;
  }
  
- static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
+ static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
+                                       int level)
  {
        struct kvm_memory_slot *slot;
  
        slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
-       return __has_wrprotected_page(gfn, level, slot);
+       return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
  }
  
  static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
@@@ -897,7 -921,7 +921,7 @@@ static int mapping_level(struct kvm_vcp
        max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
  
        for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
-               if (__has_wrprotected_page(large_gfn, level, slot))
+               if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
                        break;
  
        return level - 1;
@@@ -1323,23 -1347,29 +1347,29 @@@ void kvm_arch_mmu_enable_log_dirty_pt_m
                kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
  }
  
- static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
+                                   struct kvm_memory_slot *slot, u64 gfn)
  {
-       struct kvm_memory_slot *slot;
        struct kvm_rmap_head *rmap_head;
        int i;
        bool write_protected = false;
  
-       slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
        for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
                rmap_head = __gfn_to_rmap(gfn, i, slot);
-               write_protected |= __rmap_write_protect(vcpu->kvm, rmap_head, true);
+               write_protected |= __rmap_write_protect(kvm, rmap_head, true);
        }
  
        return write_protected;
  }
  
+ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+ {
+       struct kvm_memory_slot *slot;
+       slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+       return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
+ }
  static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
  {
        u64 *sptep;
@@@ -1754,7 -1784,7 +1784,7 @@@ static void mark_unsync(u64 *spte
  static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
                               struct kvm_mmu_page *sp)
  {
-       return 1;
+       return 0;
  }
  
  static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
@@@ -1840,13 -1870,16 +1870,16 @@@ static int __mmu_unsync_walk(struct kvm
        return nr_unsync_leaf;
  }
  
+ #define INVALID_INDEX (-1)
  static int mmu_unsync_walk(struct kvm_mmu_page *sp,
                           struct kvm_mmu_pages *pvec)
  {
+       pvec->nr = 0;
        if (!sp->unsync_children)
                return 0;
  
-       mmu_pages_add(pvec, sp, 0);
+       mmu_pages_add(pvec, sp, INVALID_INDEX);
        return __mmu_unsync_walk(sp, pvec);
  }
  
@@@ -1883,37 -1916,35 +1916,35 @@@ static void kvm_mmu_commit_zap_page(str
                if ((_sp)->role.direct || (_sp)->role.invalid) {} else
  
  /* @sp->gfn should be write-protected at the call site */
- static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-                          struct list_head *invalid_list, bool clear_unsync)
+ static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+                           struct list_head *invalid_list)
  {
        if (sp->role.cr4_pae != !!is_pae(vcpu)) {
                kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
-               return 1;
+               return false;
        }
  
-       if (clear_unsync)
-               kvm_unlink_unsync_page(vcpu->kvm, sp);
-       if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
+       if (vcpu->arch.mmu.sync_page(vcpu, sp) == 0) {
                kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
-               return 1;
+               return false;
        }
  
-       kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
-       return 0;
+       return true;
  }
  
- static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
-                                  struct kvm_mmu_page *sp)
+ static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
+                                struct list_head *invalid_list,
+                                bool remote_flush, bool local_flush)
  {
-       LIST_HEAD(invalid_list);
-       int ret;
-       ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
-       if (ret)
-               kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+       if (!list_empty(invalid_list)) {
+               kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list);
+               return;
+       }
  
-       return ret;
+       if (remote_flush)
+               kvm_flush_remote_tlbs(vcpu->kvm);
+       else if (local_flush)
+               kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
  }
  
  #ifdef CONFIG_KVM_MMU_AUDIT
@@@ -1923,46 -1954,38 +1954,38 @@@ static void kvm_mmu_audit(struct kvm_vc
  static void mmu_audit_disable(void) { }
  #endif
  
- static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                         struct list_head *invalid_list)
  {
-       return __kvm_sync_page(vcpu, sp, invalid_list, true);
+       kvm_unlink_unsync_page(vcpu->kvm, sp);
+       return __kvm_sync_page(vcpu, sp, invalid_list);
  }
  
  /* @gfn should be write-protected at the call site */
- static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
+ static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
+                          struct list_head *invalid_list)
  {
        struct kvm_mmu_page *s;
-       LIST_HEAD(invalid_list);
-       bool flush = false;
+       bool ret = false;
  
        for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
                if (!s->unsync)
                        continue;
  
                WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
-               kvm_unlink_unsync_page(vcpu->kvm, s);
-               if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
-                       (vcpu->arch.mmu.sync_page(vcpu, s))) {
-                       kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
-                       continue;
-               }
-               flush = true;
+               ret |= kvm_sync_page(vcpu, s, invalid_list);
        }
  
-       kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
-       if (flush)
-               kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+       return ret;
  }
  
  struct mmu_page_path {
-       struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
-       unsigned int idx[PT64_ROOT_LEVEL-1];
+       struct kvm_mmu_page *parent[PT64_ROOT_LEVEL];
+       unsigned int idx[PT64_ROOT_LEVEL];
  };
  
  #define for_each_sp(pvec, sp, parents, i)                     \
-               for (i = mmu_pages_next(&pvec, &parents, -1),   \
-                       sp = pvec.page[i].sp;                   \
+               for (i = mmu_pages_first(&pvec, &parents);      \
                        i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});   \
                        i = mmu_pages_next(&pvec, &parents, i))
  
@@@ -1974,19 -1997,43 +1997,43 @@@ static int mmu_pages_next(struct kvm_mm
  
        for (n = i+1; n < pvec->nr; n++) {
                struct kvm_mmu_page *sp = pvec->page[n].sp;
+               unsigned idx = pvec->page[n].idx;
+               int level = sp->role.level;
  
-               if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
-                       parents->idx[0] = pvec->page[n].idx;
-                       return n;
-               }
+               parents->idx[level-1] = idx;
+               if (level == PT_PAGE_TABLE_LEVEL)
+                       break;
  
-               parents->parent[sp->role.level-2] = sp;
-               parents->idx[sp->role.level-1] = pvec->page[n].idx;
+               parents->parent[level-2] = sp;
        }
  
        return n;
  }
  
+ static int mmu_pages_first(struct kvm_mmu_pages *pvec,
+                          struct mmu_page_path *parents)
+ {
+       struct kvm_mmu_page *sp;
+       int level;
+       if (pvec->nr == 0)
+               return 0;
+       WARN_ON(pvec->page[0].idx != INVALID_INDEX);
+       sp = pvec->page[0].sp;
+       level = sp->role.level;
+       WARN_ON(level == PT_PAGE_TABLE_LEVEL);
+       parents->parent[level-2] = sp;
+       /* Also set up a sentinel.  Further entries in pvec are all
+        * children of sp, so this element is never overwritten.
+        */
+       parents->parent[level-1] = NULL;
+       return mmu_pages_next(pvec, parents, 0);
+ }
  static void mmu_pages_clear_parents(struct mmu_page_path *parents)
  {
        struct kvm_mmu_page *sp;
  
        do {
                unsigned int idx = parents->idx[level];
                sp = parents->parent[level];
                if (!sp)
                        return;
  
+               WARN_ON(idx == INVALID_INDEX);
                clear_unsync_child_bit(sp, idx);
                level++;
-       } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
- }
- static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
-                              struct mmu_page_path *parents,
-                              struct kvm_mmu_pages *pvec)
- {
-       parents->parent[parent->role.level-1] = NULL;
-       pvec->nr = 0;
+       } while (!sp->unsync_children);
  }
  
  static void mmu_sync_children(struct kvm_vcpu *vcpu,
        struct mmu_page_path parents;
        struct kvm_mmu_pages pages;
        LIST_HEAD(invalid_list);
+       bool flush = false;
  
-       kvm_mmu_pages_init(parent, &parents, &pages);
        while (mmu_unsync_walk(parent, &pages)) {
                bool protected = false;
  
                for_each_sp(pages, sp, parents, i)
                        protected |= rmap_write_protect(vcpu, sp->gfn);
  
-               if (protected)
+               if (protected) {
                        kvm_flush_remote_tlbs(vcpu->kvm);
+                       flush = false;
+               }
  
                for_each_sp(pages, sp, parents, i) {
-                       kvm_sync_page(vcpu, sp, &invalid_list);
+                       flush |= kvm_sync_page(vcpu, sp, &invalid_list);
                        mmu_pages_clear_parents(&parents);
                }
-               kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
-               cond_resched_lock(&vcpu->kvm->mmu_lock);
-               kvm_mmu_pages_init(parent, &parents, &pages);
+               if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
+                       kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+                       cond_resched_lock(&vcpu->kvm->mmu_lock);
+                       flush = false;
+               }
        }
+       kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
  }
  
  static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
  {
-       sp->write_flooding_count = 0;
+       atomic_set(&sp->write_flooding_count,  0);
  }
  
  static void clear_sp_write_flooding_count(u64 *spte)
@@@ -2069,6 -2114,8 +2114,8 @@@ static struct kvm_mmu_page *kvm_mmu_get
        unsigned quadrant;
        struct kvm_mmu_page *sp;
        bool need_sync = false;
+       bool flush = false;
+       LIST_HEAD(invalid_list);
  
        role = vcpu->arch.mmu.base_role;
        role.level = level;
                if (sp->role.word != role.word)
                        continue;
  
-               if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
-                       break;
+               if (sp->unsync) {
+                       /* The page is good, but __kvm_sync_page might still end
+                        * up zapping it.  If so, break in order to rebuild it.
+                        */
+                       if (!__kvm_sync_page(vcpu, sp, &invalid_list))
+                               break;
+                       WARN_ON(!list_empty(&invalid_list));
+                       kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+               }
  
                if (sp->unsync_children)
                        kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
        hlist_add_head(&sp->hash_link,
                &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
        if (!direct) {
-               if (rmap_write_protect(vcpu, gfn))
+               /*
+                * We should write-protect the gfn before syncing pages;
+                * otherwise the content of the synced shadow page may
+                * be inconsistent with the guest page table.
+                */
+               account_shadowed(vcpu->kvm, sp);
+               if (level == PT_PAGE_TABLE_LEVEL &&
+                     rmap_write_protect(vcpu, gfn))
                        kvm_flush_remote_tlbs(vcpu->kvm);
-               if (level > PT_PAGE_TABLE_LEVEL && need_sync)
-                       kvm_sync_pages(vcpu, gfn);
  
-               account_shadowed(vcpu->kvm, sp);
+               if (level > PT_PAGE_TABLE_LEVEL && need_sync)
+                       flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
        }
        sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
        clear_page(sp->spt);
        trace_kvm_mmu_get_page(sp, true);
+       kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
        return sp;
  }
  
@@@ -2269,7 -2332,6 +2332,6 @@@ static int mmu_zap_unsync_children(stru
        if (parent->role.level == PT_PAGE_TABLE_LEVEL)
                return 0;
  
-       kvm_mmu_pages_init(parent, &parents, &pages);
        while (mmu_unsync_walk(parent, &pages)) {
                struct kvm_mmu_page *sp;
  
                        mmu_pages_clear_parents(&parents);
                        zapped++;
                }
-               kvm_mmu_pages_init(parent, &parents, &pages);
        }
  
        return zapped;
@@@ -2354,8 -2415,8 +2415,8 @@@ static bool prepare_zap_oldest_mmu_page
        if (list_empty(&kvm->arch.active_mmu_pages))
                return false;
  
-       sp = list_entry(kvm->arch.active_mmu_pages.prev,
-                       struct kvm_mmu_page, link);
+       sp = list_last_entry(&kvm->arch.active_mmu_pages,
+                            struct kvm_mmu_page, link);
        kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
  
        return true;
@@@ -2408,7 -2469,7 +2469,7 @@@ int kvm_mmu_unprotect_page(struct kvm *
  }
  EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
  
- static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
  {
        trace_kvm_mmu_unsync_page(sp);
        ++vcpu->kvm->stat.mmu_unsync;
        kvm_mmu_mark_parents_unsync(sp);
  }
  
- static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
+ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
+                                  bool can_unsync)
  {
-       struct kvm_mmu_page *s;
-       for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
-               if (s->unsync)
-                       continue;
-               WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
-               __kvm_unsync_page(vcpu, s);
-       }
- }
+       struct kvm_mmu_page *sp;
  
- static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
-                                 bool can_unsync)
- {
-       struct kvm_mmu_page *s;
-       bool need_unsync = false;
+       if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+               return true;
  
-       for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
+       for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
                if (!can_unsync)
-                       return 1;
+                       return true;
  
-               if (s->role.level != PT_PAGE_TABLE_LEVEL)
-                       return 1;
+               if (sp->unsync)
+                       continue;
  
-               if (!s->unsync)
-                       need_unsync = true;
+               WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
+               kvm_unsync_page(vcpu, sp);
        }
-       if (need_unsync)
-               kvm_unsync_pages(vcpu, gfn);
-       return 0;
+       return false;
  }
  
  static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
@@@ -2503,7 -2553,7 +2553,7 @@@ static int set_spte(struct kvm_vcpu *vc
                 * be fixed if guest refault.
                 */
                if (level > PT_PAGE_TABLE_LEVEL &&
-                   has_wrprotected_page(vcpu, gfn, level))
+                   mmu_gfn_lpage_is_disallowed(vcpu, gfn, level))
                        goto done;
  
                spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
@@@ -2768,7 -2818,7 +2818,7 @@@ static void transparent_hugepage_adjust
        if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
            level == PT_PAGE_TABLE_LEVEL &&
            PageTransCompound(pfn_to_page(pfn)) &&
-           !has_wrprotected_page(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
+           !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
                unsigned long mask;
                /*
                 * mmu_notifier_retry was successful and we hold the
  static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
                                kvm_pfn_t pfn, unsigned access, int *ret_val)
  {
-       bool ret = true;
        /* The pfn is invalid, report the error! */
        if (unlikely(is_error_pfn(pfn))) {
                *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
-               goto exit;
+               return true;
        }
  
        if (unlikely(is_noslot_pfn(pfn)))
                vcpu_cache_mmio_info(vcpu, gva, gfn, access);
  
-       ret = false;
- exit:
-       return ret;
+       return false;
  }
  
  static bool page_fault_can_be_fast(u32 error_code)
@@@ -3273,7 -3319,7 +3319,7 @@@ static bool is_shadow_zero_bits_set(str
        return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
  }
  
- static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
  {
        if (direct)
                return vcpu_match_mmio_gpa(vcpu, addr);
@@@ -3332,7 -3378,7 +3378,7 @@@ int handle_mmio_page_fault(struct kvm_v
        u64 spte;
        bool reserved;
  
-       if (quickly_check_mmio_pf(vcpu, addr, direct))
+       if (mmio_info_in_cache(vcpu, addr, direct))
                return RET_MMIO_PF_EMULATE;
  
        reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
  }
  EXPORT_SYMBOL_GPL(handle_mmio_page_fault);
  
+ static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
+                                        u32 error_code, gfn_t gfn)
+ {
+       if (unlikely(error_code & PFERR_RSVD_MASK))
+               return false;
+       if (!(error_code & PFERR_PRESENT_MASK) ||
+             !(error_code & PFERR_WRITE_MASK))
+               return false;
+       /*
+        * The guest is writing to a write-tracked page; such faults cannot
+        * be fixed by the page fault handler.
+        */
+       if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+               return true;
+       return false;
+ }
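page_fault_handle_page_track() only short-circuits ordinary write faults: reserved-bit faults go to the MMIO path, and non-present or read faults can still be fixed by the shadow MMU. Assuming the standard x86 page-fault error-code bits (bit 0 = present, bit 1 = write, bit 3 = reserved), the filter can be modelled as the following sketch, where the names are illustrative:

    #include <stdbool.h>
    #include <stdint.h>

    #define PF_PRESENT      (1u << 0)
    #define PF_WRITE        (1u << 1)
    #define PF_RSVD         (1u << 3)

    /* true if the fault must be emulated because the gfn is write-tracked */
    static bool tracked_gfn_needs_emulation(uint32_t error_code, bool write_tracked)
    {
            if (error_code & PF_RSVD)                       /* MMIO, not tracking */
                    return false;
            if (!(error_code & PF_PRESENT) || !(error_code & PF_WRITE))
                    return false;                           /* MMU can fix this one */
            return write_tracked;
    }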
+ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
+ {
+       struct kvm_shadow_walk_iterator iterator;
+       u64 spte;
+       if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+               return;
+       walk_shadow_page_lockless_begin(vcpu);
+       for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
+               clear_sp_write_flooding_count(iterator.sptep);
+               if (!is_shadow_present_pte(spte))
+                       break;
+       }
+       walk_shadow_page_lockless_end(vcpu);
+ }
  static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
                                u32 error_code, bool prefault)
  {
-       gfn_t gfn;
+       gfn_t gfn = gva >> PAGE_SHIFT;
        int r;
  
        pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
  
-       if (unlikely(error_code & PFERR_RSVD_MASK)) {
-               r = handle_mmio_page_fault(vcpu, gva, true);
-               if (likely(r != RET_MMIO_PF_INVALID))
-                       return r;
-       }
+       if (page_fault_handle_page_track(vcpu, error_code, gfn))
+               return 1;
  
        r = mmu_topup_memory_caches(vcpu);
        if (r)
  
        MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
  
-       gfn = gva >> PAGE_SHIFT;
  
        return nonpaging_map(vcpu, gva & PAGE_MASK,
                             error_code, gfn, prefault);
@@@ -3460,12 -3538,8 +3538,8 @@@ static int tdp_page_fault(struct kvm_vc
  
        MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
  
-       if (unlikely(error_code & PFERR_RSVD_MASK)) {
-               r = handle_mmio_page_fault(vcpu, gpa, true);
-               if (likely(r != RET_MMIO_PF_INVALID))
-                       return r;
-       }
+       if (page_fault_handle_page_track(vcpu, error_code, gfn))
+               return 1;
  
        r = mmu_topup_memory_caches(vcpu);
        if (r)
@@@ -3558,13 -3632,24 +3632,24 @@@ static bool sync_mmio_spte(struct kvm_v
        return false;
  }
  
- static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
+ static inline bool is_last_gpte(struct kvm_mmu *mmu,
+                               unsigned level, unsigned gpte)
  {
-       unsigned index;
+       /*
+        * PT_PAGE_TABLE_LEVEL always terminates.  The RHS has bit 7 set
+        * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
+        * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
+        */
+       gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
  
-       index = level - 1;
-       index |= (gpte & PT_PAGE_SIZE_MASK) >> (PT_PAGE_SIZE_SHIFT - 2);
-       return mmu->last_pte_bitmap & (1 << index);
+       /*
+        * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
+        * If it is clear, there are no large pages at this level, so clear
+        * PT_PAGE_SIZE_MASK in gpte if that is the case.
+        */
+       gpte &= level - mmu->last_nonleaf_level;
+       return gpte & PT_PAGE_SIZE_MASK;
  }
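The replacement above drops the old last_pte_bitmap in favour of two arithmetic tricks: since PT_PAGE_TABLE_LEVEL is 1 and PT_PAGE_SIZE_MASK is bit 7, "level - PT_PAGE_TABLE_LEVEL - 1" underflows so that bit 7 is forced on exactly when level == 1, and "level - last_nonleaf_level" keeps bit 7 set exactly when level is below the last non-leaf level, clearing the PS bit otherwise. The stand-alone check below exhaustively compares the branchless form with a naive version; the constants mirror the kernel's definitions but are restated here as assumptions.

    #include <assert.h>
    #include <stdbool.h>

    #define PT_PAGE_TABLE_LEVEL  1
    #define PT_PAGE_SIZE_MASK    (1u << 7)   /* PS bit in a guest PTE/PDE */

    static bool is_last_gpte_branchless(unsigned level, unsigned last_nonleaf, unsigned gpte)
    {
            gpte |= level - PT_PAGE_TABLE_LEVEL - 1;  /* bit 7 forced on iff level == 1 */
            gpte &= level - last_nonleaf;             /* bit 7 kept iff level < last_nonleaf */
            return gpte & PT_PAGE_SIZE_MASK;
    }

    static bool is_last_gpte_naive(unsigned level, unsigned last_nonleaf, unsigned gpte)
    {
            if (level == PT_PAGE_TABLE_LEVEL)
                    return true;                      /* 4K PTEs always terminate */
            if (level >= last_nonleaf)
                    return false;                     /* no large pages at or above this level */
            return gpte & PT_PAGE_SIZE_MASK;
    }

    int main(void)
    {
            for (unsigned last_nonleaf = 2; last_nonleaf <= 4; last_nonleaf++)
                    for (unsigned level = 1; level <= 4; level++)
                            for (unsigned gpte = 0; gpte < 256; gpte++)
                                    assert(is_last_gpte_branchless(level, last_nonleaf, gpte) ==
                                           is_last_gpte_naive(level, last_nonleaf, gpte));
            return 0;
    }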
  
  #define PTTYPE_EPT 18 /* arbitrary */
@@@ -3721,15 -3806,13 +3806,15 @@@ static void reset_rsvds_bits_mask_ept(s
  void
  reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
  {
 +      bool uses_nx = context->nx || context->base_role.smep_andnot_wp;
 +
        /*
         * Passing "true" to the last argument is okay; it adds a check
         * on bit 8 of the SPTEs which KVM doesn't use anyway.
         */
        __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check,
                                boot_cpu_data.x86_phys_bits,
 -                              context->shadow_root_level, context->nx,
 +                              context->shadow_root_level, uses_nx,
                                guest_cpuid_has_gbpages(vcpu), is_pse(vcpu),
                                true);
  }
@@@ -3838,22 -3921,13 +3923,13 @@@ static void update_permission_bitmask(s
        }
  }
  
- static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+ static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
  {
-       u8 map;
-       unsigned level, root_level = mmu->root_level;
-       const unsigned ps_set_index = 1 << 2;  /* bit 2 of index: ps */
-       if (root_level == PT32E_ROOT_LEVEL)
-               --root_level;
-       /* PT_PAGE_TABLE_LEVEL always terminates */
-       map = 1 | (1 << ps_set_index);
-       for (level = PT_DIRECTORY_LEVEL; level <= root_level; ++level) {
-               if (level <= PT_PDPE_LEVEL
-                   && (mmu->root_level >= PT32E_ROOT_LEVEL || is_pse(vcpu)))
-                       map |= 1 << (ps_set_index | (level - 1));
-       }
-       mmu->last_pte_bitmap = map;
+       unsigned root_level = mmu->root_level;
+       mmu->last_nonleaf_level = root_level;
+       if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
+               mmu->last_nonleaf_level++;
  }
  
  static void paging64_init_context_common(struct kvm_vcpu *vcpu,
  
        reset_rsvds_bits_mask(vcpu, context);
        update_permission_bitmask(vcpu, context, false);
-       update_last_pte_bitmap(vcpu, context);
+       update_last_nonleaf_level(vcpu, context);
  
        MMU_WARN_ON(!is_pae(vcpu));
        context->page_fault = paging64_page_fault;
@@@ -3892,7 -3966,7 +3968,7 @@@ static void paging32_init_context(struc
  
        reset_rsvds_bits_mask(vcpu, context);
        update_permission_bitmask(vcpu, context, false);
-       update_last_pte_bitmap(vcpu, context);
+       update_last_nonleaf_level(vcpu, context);
  
        context->page_fault = paging32_page_fault;
        context->gva_to_gpa = paging32_gva_to_gpa;
@@@ -3950,7 -4024,7 +4026,7 @@@ static void init_kvm_tdp_mmu(struct kvm
        }
  
        update_permission_bitmask(vcpu, context, false);
-       update_last_pte_bitmap(vcpu, context);
+       update_last_nonleaf_level(vcpu, context);
        reset_tdp_shadow_zero_bits_mask(vcpu, context);
  }
  
@@@ -4056,7 -4130,7 +4132,7 @@@ static void init_kvm_nested_mmu(struct 
        }
  
        update_permission_bitmask(vcpu, g_context, false);
-       update_last_pte_bitmap(vcpu, g_context);
+       update_last_nonleaf_level(vcpu, g_context);
  }
  
  static void init_kvm_mmu(struct kvm_vcpu *vcpu)
@@@ -4127,18 -4201,6 +4203,6 @@@ static bool need_remote_flush(u64 old, 
        return (old & ~new & PT64_PERM_MASK) != 0;
  }
  
- static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
-                                   bool remote_flush, bool local_flush)
- {
-       if (zap_page)
-               return;
-       if (remote_flush)
-               kvm_flush_remote_tlbs(vcpu->kvm);
-       else if (local_flush)
-               kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
- }
  static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
                                    const u8 *new, int *bytes)
  {
@@@ -4188,7 -4250,8 +4252,8 @@@ static bool detect_write_flooding(struc
        if (sp->role.level == PT_PAGE_TABLE_LEVEL)
                return false;
  
-       return ++sp->write_flooding_count >= 3;
+       atomic_inc(&sp->write_flooding_count);
+       return atomic_read(&sp->write_flooding_count) >= 3;
  }
  
  /*
@@@ -4250,15 -4313,15 +4315,15 @@@ static u64 *get_written_sptes(struct kv
        return spte;
  }
  
- void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-                      const u8 *new, int bytes)
static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+                             const u8 *new, int bytes)
  {
        gfn_t gfn = gpa >> PAGE_SHIFT;
        struct kvm_mmu_page *sp;
        LIST_HEAD(invalid_list);
        u64 entry, gentry, *spte;
        int npte;
-       bool remote_flush, local_flush, zap_page;
+       bool remote_flush, local_flush;
        union kvm_mmu_page_role mask = { };
  
        mask.cr0_wp = 1;
        if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
                return;
  
-       zap_page = remote_flush = local_flush = false;
+       remote_flush = local_flush = false;
  
        pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
  
        for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
                if (detect_write_misaligned(sp, gpa, bytes) ||
                      detect_write_flooding(sp)) {
-                       zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
-                                                    &invalid_list);
+                       kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
                        ++vcpu->kvm->stat.mmu_flooded;
                        continue;
                }
                        ++spte;
                }
        }
-       mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
-       kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+       kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
        kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
        spin_unlock(&vcpu->kvm->mmu_lock);
  }
@@@ -4356,32 -4417,34 +4419,34 @@@ static void make_mmu_pages_available(st
        kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
  }
  
- static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr)
- {
-       if (vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu))
-               return vcpu_match_mmio_gpa(vcpu, addr);
-       return vcpu_match_mmio_gva(vcpu, addr);
- }
  int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
                       void *insn, int insn_len)
  {
        int r, emulation_type = EMULTYPE_RETRY;
        enum emulation_result er;
+       bool direct = vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu);
+       if (unlikely(error_code & PFERR_RSVD_MASK)) {
+               r = handle_mmio_page_fault(vcpu, cr2, direct);
+               if (r == RET_MMIO_PF_EMULATE) {
+                       emulation_type = 0;
+                       goto emulate;
+               }
+               if (r == RET_MMIO_PF_RETRY)
+                       return 1;
+               if (r < 0)
+                       return r;
+       }
  
        r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
        if (r < 0)
-               goto out;
-       if (!r) {
-               r = 1;
-               goto out;
-       }
+               return r;
+       if (!r)
+               return 1;
  
-       if (is_mmio_page_fault(vcpu, cr2))
+       if (mmio_info_in_cache(vcpu, cr2, direct))
                emulation_type = 0;
+ emulate:
        er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
  
        switch (er) {
        default:
                BUG();
        }
- out:
-       return r;
  }
  EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
  
@@@ -4465,6 -4526,21 +4528,21 @@@ void kvm_mmu_setup(struct kvm_vcpu *vcp
        init_kvm_mmu(vcpu);
  }
  
+ void kvm_mmu_init_vm(struct kvm *kvm)
+ {
+       struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+ 
+       node->track_write = kvm_mmu_pte_write;
+       kvm_page_track_register_notifier(kvm, node);
+ }
+ 
+ void kvm_mmu_uninit_vm(struct kvm *kvm)
+ {
+       struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+ 
+       kvm_page_track_unregister_notifier(kvm, node);
+ }
+ 
  /* The return value indicates if tlb flush on all vcpus is needed. */
  typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
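
kvm_mmu_init_vm() and kvm_mmu_uninit_vm() above attach the shadow MMU to the new page-track notifier list, so a write to a tracked guest page is fanned out to whoever registered a callback instead of being hard-wired to kvm_mmu_pte_write(). A rough userspace sketch of that register/notify/unregister shape (every name below is invented; this is not the kvm_page_track API):

    #include <stdio.h>

    struct track_node {
            void (*track_write)(unsigned long gpa, const void *data, int bytes);
            struct track_node *next;
    };

    static struct track_node *track_head;   /* stand-in for the per-VM notifier list */

    static void track_register(struct track_node *n)
    {
            n->next = track_head;
            track_head = n;
    }

    static void track_unregister(struct track_node *n)
    {
            for (struct track_node **p = &track_head; *p; p = &(*p)->next)
                    if (*p == n) {
                            *p = n->next;
                            return;
                    }
    }

    /* Conceptually what a tracked write does: notify every listener. */
    static void track_write_event(unsigned long gpa, const void *data, int bytes)
    {
            for (struct track_node *n = track_head; n; n = n->next)
                    n->track_write(gpa, data, bytes);
    }

    static void mmu_pte_write(unsigned long gpa, const void *data, int bytes)
    {
            printf("MMU notified: %d-byte write at gpa 0x%lx\n", bytes, gpa);
    }

    int main(void)
    {
            struct track_node mmu = { .track_write = mmu_pte_write };

            track_register(&mmu);
            track_write_event(0x1000, "x", 1);
            track_unregister(&mmu);
            return 0;
    }
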
  
diff --combined arch/x86/kvm/vmx.c
index 9bd8f44baded2318e8b5bc2a4773068a45a8723b,e512aa7ed8748ff8de66c053348572ce574ac756..5e45c2731a5d60ba6a9adfc58841ca075cd90834
@@@ -596,8 -596,6 +596,8 @@@ struct vcpu_vmx 
        /* Support for PML */
  #define PML_ENTITY_NUM                512
        struct page *pml_pg;
 +
 +      u64 current_tsc_ratio;
  };
  
  enum segment_cache_field {
@@@ -863,7 -861,6 +863,6 @@@ static unsigned long nested_ept_get_cr3
  static u64 construct_eptp(unsigned long root_hpa);
  static void kvm_cpu_vmxon(u64 addr);
  static void kvm_cpu_vmxoff(void);
- static bool vmx_mpx_supported(void);
  static bool vmx_xsaves_supported(void);
  static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
  static void vmx_set_segment(struct kvm_vcpu *vcpu,
@@@ -963,25 -960,36 +962,36 @@@ static const u32 vmx_msr_index[] = 
        MSR_EFER, MSR_TSC_AUX, MSR_STAR,
  };
  
- static inline bool is_page_fault(u32 intr_info)
+ static inline bool is_exception_n(u32 intr_info, u8 vector)
  {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
                             INTR_INFO_VALID_MASK)) ==
-               (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
+               (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
+ }
+ 
+ static inline bool is_debug(u32 intr_info)
+ {
+       return is_exception_n(intr_info, DB_VECTOR);
+ }
+ 
+ static inline bool is_breakpoint(u32 intr_info)
+ {
+       return is_exception_n(intr_info, BP_VECTOR);
+ }
+ 
+ static inline bool is_page_fault(u32 intr_info)
+ {
+       return is_exception_n(intr_info, PF_VECTOR);
  }
  
  static inline bool is_no_device(u32 intr_info)
  {
-       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
-                            INTR_INFO_VALID_MASK)) ==
-               (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
+       return is_exception_n(intr_info, NM_VECTOR);
  }
  
  static inline bool is_invalid_opcode(u32 intr_info)
  {
-       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
-                            INTR_INFO_VALID_MASK)) ==
-               (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
+       return is_exception_n(intr_info, UD_VECTOR);
  }
  
  static inline bool is_external_interrupt(u32 intr_info)
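
is_exception_n() above folds the repeated intr_info comparisons into a single helper keyed by the vector number, so is_debug(), is_breakpoint(), is_page_fault() and friends become one-liners. The test relies on the VM-exit interruption-information layout (vector in bits 7:0, type in bits 10:8 with 3 meaning hardware exception, valid in bit 31); the constants below restate that layout for a standalone check and are not taken from this hunk:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define INTR_INFO_VECTOR_MASK     0xffu
    #define INTR_INFO_INTR_TYPE_MASK  0x700u
    #define INTR_INFO_VALID_MASK      0x80000000u
    #define INTR_TYPE_HARD_EXCEPTION  (3u << 8)

    #define DB_VECTOR 1
    #define BP_VECTOR 3
    #define PF_VECTOR 14

    static bool is_exception_n(uint32_t intr_info, uint8_t vector)
    {
            return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
                                 INTR_INFO_VALID_MASK)) ==
                   (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
    }

    int main(void)
    {
            uint32_t pf = INTR_INFO_VALID_MASK | INTR_TYPE_HARD_EXCEPTION | PF_VECTOR;

            printf("page fault? %d\n", is_exception_n(pf, PF_VECTOR));   /* 1 */
            printf("breakpoint? %d\n", is_exception_n(pf, BP_VECTOR));   /* 0 */
            printf("debug?      %d\n", is_exception_n(pf, DB_VECTOR));   /* 0 */
            return 0;
    }
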
@@@ -1813,13 -1821,6 +1823,13 @@@ static void add_atomic_switch_msr(struc
                        return;
                }
                break;
 +      case MSR_IA32_PEBS_ENABLE:
 +              /* PEBS needs a quiescent period after being disabled (to write
 +               * a record).  Disabling PEBS through VMX MSR swapping doesn't
 +               * provide that period, so a CPU could write host's record into
 +               * guest's memory.
 +               */
 +              wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
        }
  
        for (i = 0; i < m->nr; ++i)
@@@ -1857,31 -1858,26 +1867,31 @@@ static void reload_tss(void
  
  static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
  {
 -      u64 guest_efer;
 -      u64 ignore_bits;
 +      u64 guest_efer = vmx->vcpu.arch.efer;
 +      u64 ignore_bits = 0;
  
 -      guest_efer = vmx->vcpu.arch.efer;
 +      if (!enable_ept) {
 +              /*
 +               * NX is needed to handle CR0.WP=1, CR4.SMEP=1.  Testing
 +               * host CPUID is more efficient than testing guest CPUID
 +               * or CR4.  Host SMEP is anyway a requirement for guest SMEP.
 +               */
 +              if (boot_cpu_has(X86_FEATURE_SMEP))
 +                      guest_efer |= EFER_NX;
 +              else if (!(guest_efer & EFER_NX))
 +                      ignore_bits |= EFER_NX;
 +      }
  
        /*
 -       * NX is emulated; LMA and LME handled by hardware; SCE meaningless
 -       * outside long mode
 +       * LMA and LME handled by hardware; SCE meaningless outside long mode.
         */
 -      ignore_bits = EFER_NX | EFER_SCE;
 +      ignore_bits |= EFER_SCE;
  #ifdef CONFIG_X86_64
        ignore_bits |= EFER_LMA | EFER_LME;
        /* SCE is meaningful only in long mode on Intel */
        if (guest_efer & EFER_LMA)
                ignore_bits &= ~(u64)EFER_SCE;
  #endif
 -      guest_efer &= ~ignore_bits;
 -      guest_efer |= host_efer & ignore_bits;
 -      vmx->guest_msrs[efer_offset].data = guest_efer;
 -      vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
  
        clear_atomic_switch_msr(vmx, MSR_EFER);
  
         */
        if (cpu_has_load_ia32_efer ||
            (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
 -              guest_efer = vmx->vcpu.arch.efer;
                if (!(guest_efer & EFER_LMA))
                        guest_efer &= ~EFER_LME;
                if (guest_efer != host_efer)
                        add_atomic_switch_msr(vmx, MSR_EFER,
                                              guest_efer, host_efer);
                return false;
 -      }
 +      } else {
 +              guest_efer &= ~ignore_bits;
 +              guest_efer |= host_efer & ignore_bits;
 +
 +              vmx->guest_msrs[efer_offset].data = guest_efer;
 +              vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
  
 -      return true;
 +              return true;
 +      }
  }
  
  static unsigned long segment_base(u16 selector)
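
In the reworked update_transition_efer() above, the no-EFER-swap branch hands hardware the guest's EFER with the bits listed in ignore_bits taken from the host instead; with EPT off, NX is either forced on (host SMEP present) or added to ignore_bits when the guest does not use it. The merge itself is two bit operations, shown here in isolation (the bit positions used are only illustrative):

    #include <stdint.h>
    #include <stdio.h>

    /* Keep the guest's bits except those in 'ignore', which come from the host. */
    static uint64_t merge_efer(uint64_t guest, uint64_t host, uint64_t ignore)
    {
            guest &= ~ignore;
            guest |= host & ignore;
            return guest;
    }

    int main(void)
    {
            uint64_t guest  = 0x101;           /* made-up guest EFER             */
            uint64_t host   = 0xd01;           /* made-up host EFER              */
            uint64_t ignore = 1ull << 11;      /* pretend bit 11 is "don't care" */

            printf("effective EFER: 0x%llx\n",
                   (unsigned long long)merge_efer(guest, host, ignore));  /* 0x901 */
            return 0;
    }
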
@@@ -2146,16 -2137,14 +2156,16 @@@ static void vmx_vcpu_load(struct kvm_vc
                rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
                vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
  
 -              /* Setup TSC multiplier */
 -              if (cpu_has_vmx_tsc_scaling())
 -                      vmcs_write64(TSC_MULTIPLIER,
 -                                   vcpu->arch.tsc_scaling_ratio);
 -
                vmx->loaded_vmcs->cpu = cpu;
        }
  
 +      /* Setup TSC multiplier */
 +      if (kvm_has_tsc_control &&
 +          vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) {
 +              vmx->current_tsc_ratio = vcpu->arch.tsc_scaling_ratio;
 +              vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
 +      }
 +
        vmx_vcpu_pi_load(vcpu, cpu);
  }
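
The TSC-multiplier setup above moves out of the per-cpu reload block and is now guarded by the cached vmx->current_tsc_ratio, so TSC_MULTIPLIER is rewritten only when the scaling ratio actually changes rather than on every vcpu_load. The write-if-changed caching pattern on its own (names and values below are invented):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t vmcs_tsc_multiplier;   /* stand-in for the VMCS field          */
    static uint64_t cached_ratio;          /* stand-in for vmx->current_tsc_ratio  */
    static int vmcs_writes;

    static void vcpu_load(uint64_t wanted_ratio)
    {
            if (cached_ratio != wanted_ratio) {    /* only touch the VMCS on change */
                    cached_ratio = wanted_ratio;
                    vmcs_tsc_multiplier = wanted_ratio;
                    vmcs_writes++;
            }
    }

    int main(void)
    {
            for (int i = 0; i < 5; i++)
                    vcpu_load(1ull << 48);         /* same ratio every time  */
            vcpu_load(3ull << 47);                 /* ratio actually changes */

            printf("VMCS writes: %d out of 6 loads\n", vmcs_writes);   /* 2 */
            return 0;
    }
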
  
@@@ -2605,7 -2594,7 +2615,7 @@@ static void nested_vmx_setup_ctls_msrs(
                VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
                VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
  
-       if (vmx_mpx_supported())
+       if (kvm_mpx_supported())
                vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
  
        /* We support free control of debug control saving. */
                VM_ENTRY_LOAD_IA32_PAT;
        vmx->nested.nested_vmx_entry_ctls_high |=
                (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
-       if (vmx_mpx_supported())
+       if (kvm_mpx_supported())
                vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
  
        /* We support free control of debug control loading. */
@@@ -2870,7 -2859,7 +2880,7 @@@ static int vmx_get_msr(struct kvm_vcpu 
                msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
                break;
        case MSR_IA32_BNDCFGS:
-               if (!vmx_mpx_supported())
+               if (!kvm_mpx_supported())
                        return 1;
                msr_info->data = vmcs_read64(GUEST_BNDCFGS);
                break;
@@@ -2947,7 -2936,7 +2957,7 @@@ static int vmx_set_msr(struct kvm_vcpu 
                vmcs_writel(GUEST_SYSENTER_ESP, data);
                break;
        case MSR_IA32_BNDCFGS:
-               if (!vmx_mpx_supported())
+               if (!kvm_mpx_supported())
                        return 1;
                vmcs_write64(GUEST_BNDCFGS, data);
                break;
@@@ -3420,7 -3409,7 +3430,7 @@@ static void init_vmcs_shadow_fields(voi
        for (i = j = 0; i < max_shadow_read_write_fields; i++) {
                switch (shadow_read_write_fields[i]) {
                case GUEST_BNDCFGS:
-                       if (!vmx_mpx_supported())
+                       if (!kvm_mpx_supported())
                                continue;
                        break;
                default:
@@@ -5629,11 -5618,8 +5639,8 @@@ static int handle_dr(struct kvm_vcpu *v
        }
  
        if (vcpu->guest_debug == 0) {
-               u32 cpu_based_vm_exec_control;
-               cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-               cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING;
-               vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+               vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+                               CPU_BASED_MOV_DR_EXITING);
  
                /*
                 * No more DR vmexits; force a reload of the debug registers
@@@ -5670,8 -5656,6 +5677,6 @@@ static void vmx_set_dr6(struct kvm_vcp
  
  static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
  {
-       u32 cpu_based_vm_exec_control;
        get_debugreg(vcpu->arch.db[0], 0);
        get_debugreg(vcpu->arch.db[1], 1);
        get_debugreg(vcpu->arch.db[2], 2);
        vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
  
        vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
-       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-       cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING;
-       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+       vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING);
  }
  
  static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
@@@ -5768,8 -5749,7 +5770,7 @@@ static int handle_halt(struct kvm_vcpu 
  
  static int handle_vmcall(struct kvm_vcpu *vcpu)
  {
-       kvm_emulate_hypercall(vcpu);
-       return 1;
+       return kvm_emulate_hypercall(vcpu);
  }
  
  static int handle_invd(struct kvm_vcpu *vcpu)
@@@ -6456,8 -6436,8 +6457,8 @@@ static struct loaded_vmcs *nested_get_c
  
        if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
                /* Recycle the least recently used VMCS. */
-               item = list_entry(vmx->nested.vmcs02_pool.prev,
-                       struct vmcs02_list, list);
+               item = list_last_entry(&vmx->nested.vmcs02_pool,
+                                      struct vmcs02_list, list);
                item->vmptr = vmx->nested.current_vmptr;
                list_move(&item->list, &vmx->nested.vmcs02_pool);
                return &item->vmcs02;
@@@ -7773,6 -7753,13 +7774,13 @@@ static bool nested_vmx_exit_handled(str
                else if (is_no_device(intr_info) &&
                         !(vmcs12->guest_cr0 & X86_CR0_TS))
                        return false;
+               else if (is_debug(intr_info) &&
+                        vcpu->guest_debug &
+                        (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+                       return false;
+               else if (is_breakpoint(intr_info) &&
+                        vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+                       return false;
                return vmcs12->exception_bitmap &
                                (1u << (intr_info & INTR_INFO_VECTOR_MASK));
        case EXIT_REASON_EXTERNAL_INTERRUPT:
@@@ -10277,7 -10264,7 +10285,7 @@@ static void prepare_vmcs12(struct kvm_v
        vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
        vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
        vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
-       if (vmx_mpx_supported())
+       if (kvm_mpx_supported())
                vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
        if (nested_cpu_has_xsaves(vmcs12))
                vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP);
@@@ -10785,13 -10772,26 +10793,26 @@@ static int vmx_update_pi_irte(struct kv
                 */
  
                kvm_set_msi_irq(e, &irq);
-               if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu))
+               if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+                       /*
+                        * Make sure the IRTE is in remapped mode if
+                        * we don't handle it in posted mode.
+                        */
+                       ret = irq_set_vcpu_affinity(host_irq, NULL);
+                       if (ret < 0) {
+                               printk(KERN_INFO
+                                  "failed to back to remapped mode, irq: %u\n",
+                                  host_irq);
+                               goto out;
+                       }
                        continue;
+               }
  
                vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
                vcpu_info.vector = irq.vector;
  
-               trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi,
+               trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi,
                                vcpu_info.vector, vcpu_info.pi_desc_addr, set);
  
                if (set)
diff --combined arch/x86/kvm/x86.c
index eaf6ee8c28b8f1619e7404bfd7efbf78beecc574,bcbce0fa0bc278b0c17fddf92e89395ea9c11d4f..7236bd3a4c3d7a0c5a6148decc6fad276eb18bb7
@@@ -123,6 -123,9 +123,9 @@@ module_param(tsc_tolerance_ppm, uint, S
  unsigned int __read_mostly lapic_timer_advance_ns = 0;
  module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
  
+ static bool __read_mostly vector_hashing = true;
+ module_param(vector_hashing, bool, S_IRUGO);
+ 
  static bool __read_mostly backwards_tsc_observed = false;
  
  #define KVM_NR_SHARED_MSRS 16
@@@ -1196,17 -1199,11 +1199,11 @@@ static void kvm_write_wall_clock(struc
  
  static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
  {
-       uint32_t quotient, remainder;
-       /* Don't try to replace with do_div(), this one calculates
-        * "(dividend << 32) / divisor" */
-       __asm__ ( "divl %4"
-                 : "=a" (quotient), "=d" (remainder)
-                 : "0" (0), "1" (dividend), "r" (divisor) );
-       return quotient;
+       do_shl32_div32(dividend, divisor);
+       return dividend;
  }
  
- static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
+ static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
                               s8 *pshift, u32 *pmultiplier)
  {
        uint64_t scaled64;
        uint64_t tps64;
        uint32_t tps32;
  
-       tps64 = base_khz * 1000LL;
-       scaled64 = scaled_khz * 1000LL;
+       tps64 = base_hz;
+       scaled64 = scaled_hz;
        while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
                tps64 >>= 1;
                shift--;
        *pshift = shift;
        *pmultiplier = div_frac(scaled64, tps32);
  
-       pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
-                __func__, base_khz, scaled_khz, shift, *pmultiplier);
+       pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n",
+                __func__, base_hz, scaled_hz, shift, *pmultiplier);
  }
  
  #ifdef CONFIG_X86_64
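
kvm_get_time_scale() now takes Hz rather than kHz, but its job is unchanged: produce a shift plus a 32-bit multiplier so a cycle count on the base clock can be converted to the scaled clock as ((cycles << shift) * mul) >> 32, with a right shift when shift is negative. The sketch below derives a valid pair by brute-force normalisation (deliberately not the kernel's loop, which is only partly visible in this hunk) and checks that one second of a pretend 2.6 GHz TSC comes back as roughly 10^9 ns; it uses the gcc/clang __int128 extension for the wide intermediates:

    #include <stdint.h>
    #include <stdio.h>

    /* Find (shift, mul) such that scaled_hz ~= base_hz * (mul / 2^32) * 2^shift. */
    static void get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
                               int *pshift, uint32_t *pmul)
    {
            unsigned __int128 ratio = ((unsigned __int128)scaled_hz << 32) / base_hz;
            int shift = 0;

            while (ratio >= ((unsigned __int128)1 << 32)) {   /* too big for u32 */
                    ratio >>= 1;
                    shift++;
            }
            while (ratio < ((unsigned __int128)1 << 31)) {    /* use all 32 bits */
                    ratio <<= 1;
                    shift--;
            }
            *pshift = shift;
            *pmul = (uint32_t)ratio;
    }

    /* Apply the pair the way a pvclock-style reader would. */
    static uint64_t scale_cycles(uint64_t cycles, int shift, uint32_t mul)
    {
            unsigned __int128 v = cycles;

            if (shift >= 0)
                    v <<= shift;
            else
                    v >>= -shift;
            return (uint64_t)((v * mul) >> 32);
    }

    int main(void)
    {
            uint64_t tsc_hz = 2600000000ull;   /* pretend 2.6 GHz TSC */
            uint32_t mul;
            int shift;

            get_time_scale(1000000000ull, tsc_hz, &shift, &mul);   /* ns per cycle */
            printf("shift=%d mul=%u\n", shift, mul);
            printf("one second of TSC -> %llu ns\n",
                   (unsigned long long)scale_cycles(tsc_hz, shift, mul));
            return 0;
    }
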
@@@ -1293,23 -1290,23 +1290,23 @@@ static int set_tsc_khz(struct kvm_vcpu 
        return 0;
  }
  
- static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
+ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
  {
        u32 thresh_lo, thresh_hi;
        int use_scaling = 0;
  
        /* tsc_khz can be zero if TSC calibration fails */
-       if (this_tsc_khz == 0) {
+       if (user_tsc_khz == 0) {
                /* set tsc_scaling_ratio to a safe value */
                vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
                return -1;
        }
  
        /* Compute a scale to convert nanoseconds in TSC cycles */
-       kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
+       kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
                           &vcpu->arch.virtual_tsc_shift,
                           &vcpu->arch.virtual_tsc_mult);
-       vcpu->arch.virtual_tsc_khz = this_tsc_khz;
+       vcpu->arch.virtual_tsc_khz = user_tsc_khz;
  
        /*
         * Compute the variation in TSC rate which is acceptable
         */
        thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
        thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
-       if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
-               pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
+       if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
+               pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
                use_scaling = 1;
        }
-       return set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
+       return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
  }
  
  static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
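
The renamed user_tsc_khz above is accepted as-is when it sits inside a tolerance window of tsc_tolerance_ppm around the host rate; only outside that window does KVM resort to TSC scaling. The window arithmetic, standalone (250 ppm and the rates below are just example numbers):

    #include <stdint.h>
    #include <stdio.h>

    /* A kHz value adjusted by a signed parts-per-million delta. */
    static uint32_t adjust_khz(uint32_t khz, int32_t ppm)
    {
            return (uint32_t)(((int64_t)khz * (1000000 + ppm)) / 1000000);
    }

    int main(void)
    {
            uint32_t host_khz = 2600000, user_khz = 2600100;
            int32_t  tol_ppm  = 250;
            uint32_t lo = adjust_khz(host_khz, -tol_ppm);
            uint32_t hi = adjust_khz(host_khz, tol_ppm);

            printf("tolerance window: [%u, %u] kHz\n", lo, hi);
            printf("requested %u kHz -> %s\n", user_khz,
                   (user_khz < lo || user_khz > hi) ? "scaling needed"
                                                    : "within tolerance, no scaling");
            return 0;
    }
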
@@@ -1716,7 -1713,7 +1713,7 @@@ static void kvm_gen_update_masterclock(
  
  static int kvm_guest_time_update(struct kvm_vcpu *v)
  {
-       unsigned long flags, this_tsc_khz, tgt_tsc_khz;
+       unsigned long flags, tgt_tsc_khz;
        struct kvm_vcpu_arch *vcpu = &v->arch;
        struct kvm_arch *ka = &v->kvm->arch;
        s64 kernel_ns;
  
        /* Keep irq disabled to prevent changes to the clock */
        local_irq_save(flags);
-       this_tsc_khz = __this_cpu_read(cpu_tsc_khz);
-       if (unlikely(this_tsc_khz == 0)) {
+       tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
+       if (unlikely(tgt_tsc_khz == 0)) {
                local_irq_restore(flags);
                kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
                return 1;
        if (!vcpu->pv_time_enabled)
                return 0;
  
-       if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
-               tgt_tsc_khz = kvm_has_tsc_control ?
-                       vcpu->virtual_tsc_khz : this_tsc_khz;
-               kvm_get_time_scale(NSEC_PER_SEC / 1000, tgt_tsc_khz,
+       if (kvm_has_tsc_control)
+               tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
+       if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
+               kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
                                   &vcpu->hv_clock.tsc_shift,
                                   &vcpu->hv_clock.tsc_to_system_mul);
-               vcpu->hw_tsc_khz = this_tsc_khz;
+               vcpu->hw_tsc_khz = tgt_tsc_khz;
        }
  
        /* With all the info we got, fill in the values */
@@@ -2752,6 -2750,7 +2750,6 @@@ void kvm_arch_vcpu_load(struct kvm_vcp
        }
  
        kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
 -      vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
  }
  
  void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@@ -2987,7 -2986,7 +2985,7 @@@ static int kvm_vcpu_ioctl_x86_set_vcpu_
        kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
  
        if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
-           kvm_vcpu_has_lapic(vcpu))
+           lapic_in_kernel(vcpu))
                vcpu->arch.apic->sipi_vector = events->sipi_vector;
  
        if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
                        vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
                else
                        vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
-               if (kvm_vcpu_has_lapic(vcpu)) {
+               if (lapic_in_kernel(vcpu)) {
                        if (events->smi.latched_init)
                                set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
                        else
@@@ -3240,7 -3239,7 +3238,7 @@@ long kvm_arch_vcpu_ioctl(struct file *f
        switch (ioctl) {
        case KVM_GET_LAPIC: {
                r = -EINVAL;
-               if (!vcpu->arch.apic)
+               if (!lapic_in_kernel(vcpu))
                        goto out;
                u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
  
        }
        case KVM_SET_LAPIC: {
                r = -EINVAL;
-               if (!vcpu->arch.apic)
+               if (!lapic_in_kernel(vcpu))
                        goto out;
                u.lapic = memdup_user(argp, sizeof(*u.lapic));
                if (IS_ERR(u.lapic))
@@@ -3605,20 -3604,26 +3603,26 @@@ static int kvm_vm_ioctl_set_irqchip(str
  
  static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
  {
-       mutex_lock(&kvm->arch.vpit->pit_state.lock);
-       memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
-       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+       struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
+       BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
+       mutex_lock(&kps->lock);
+       memcpy(ps, &kps->channels, sizeof(*ps));
+       mutex_unlock(&kps->lock);
        return 0;
  }
  
  static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
  {
        int i;
-       mutex_lock(&kvm->arch.vpit->pit_state.lock);
-       memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
+       struct kvm_pit *pit = kvm->arch.vpit;
+       mutex_lock(&pit->pit_state.lock);
+       memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
        for (i = 0; i < 3; i++)
-               kvm_pit_load_count(kvm, i, ps->channels[i].count, 0);
-       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+               kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
+       mutex_unlock(&pit->pit_state.lock);
        return 0;
  }
  
@@@ -3638,29 -3643,39 +3642,39 @@@ static int kvm_vm_ioctl_set_pit2(struc
        int start = 0;
        int i;
        u32 prev_legacy, cur_legacy;
-       mutex_lock(&kvm->arch.vpit->pit_state.lock);
-       prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
+       struct kvm_pit *pit = kvm->arch.vpit;
+       mutex_lock(&pit->pit_state.lock);
+       prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
        cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
        if (!prev_legacy && cur_legacy)
                start = 1;
-       memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
-              sizeof(kvm->arch.vpit->pit_state.channels));
-       kvm->arch.vpit->pit_state.flags = ps->flags;
+       memcpy(&pit->pit_state.channels, &ps->channels,
+              sizeof(pit->pit_state.channels));
+       pit->pit_state.flags = ps->flags;
        for (i = 0; i < 3; i++)
-               kvm_pit_load_count(kvm, i, kvm->arch.vpit->pit_state.channels[i].count,
+               kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
                                   start && i == 0);
-       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+       mutex_unlock(&pit->pit_state.lock);
        return 0;
  }
  
  static int kvm_vm_ioctl_reinject(struct kvm *kvm,
                                 struct kvm_reinject_control *control)
  {
-       if (!kvm->arch.vpit)
+       struct kvm_pit *pit = kvm->arch.vpit;
+       if (!pit)
                return -ENXIO;
-       mutex_lock(&kvm->arch.vpit->pit_state.lock);
-       kvm->arch.vpit->pit_state.reinject = control->pit_reinject;
-       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+       /* pit->pit_state.lock was overloaded to prevent userspace from getting
+        * an inconsistent state after running multiple KVM_REINJECT_CONTROL
+        * ioctls in parallel.  Use a separate lock if that ioctl isn't rare.
+        */
+       mutex_lock(&pit->pit_state.lock);
+       kvm_pit_set_reinject(pit, control->pit_reinject);
+       mutex_unlock(&pit->pit_state.lock);
        return 0;
  }
  
@@@ -4093,7 -4108,7 +4107,7 @@@ static int vcpu_mmio_write(struct kvm_v
  
        do {
                n = min(len, 8);
-               if (!(vcpu->arch.apic &&
+               if (!(lapic_in_kernel(vcpu) &&
                      !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
                    && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
                        break;
@@@ -4113,7 -4128,7 +4127,7 @@@ static int vcpu_mmio_read(struct kvm_vc
  
        do {
                n = min(len, 8);
-               if (!(vcpu->arch.apic &&
+               if (!(lapic_in_kernel(vcpu) &&
                      !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
                                         addr, n, v))
                    && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
@@@ -4346,7 -4361,7 +4360,7 @@@ int emulator_write_phys(struct kvm_vcp
        ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
        if (ret < 0)
                return 0;
-       kvm_mmu_pte_write(vcpu, gpa, val, bytes);
+       kvm_page_track_write(vcpu, gpa, val, bytes);
        return 1;
  }
  
@@@ -4604,7 -4619,7 +4618,7 @@@ static int emulator_cmpxchg_emulated(st
                return X86EMUL_CMPXCHG_FAILED;
  
        kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
-       kvm_mmu_pte_write(vcpu, gpa, new, bytes);
+       kvm_page_track_write(vcpu, gpa, new, bytes);
  
        return X86EMUL_CONTINUE;
  
@@@ -6010,7 -6025,7 +6024,7 @@@ static void update_cr8_intercept(struc
        if (!kvm_x86_ops->update_cr8_intercept)
                return;
  
-       if (!vcpu->arch.apic)
+       if (!lapic_in_kernel(vcpu))
                return;
  
        if (vcpu->arch.apicv_active)
@@@ -6618,12 -6633,12 +6632,12 @@@ static int vcpu_enter_guest(struct kvm_
         * KVM_DEBUGREG_WONT_EXIT again.
         */
        if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
 -              int i;
 -
                WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
                kvm_x86_ops->sync_dirty_debug_regs(vcpu);
 -              for (i = 0; i < KVM_NR_DB_REGS; i++)
 -                      vcpu->arch.eff_db[i] = vcpu->arch.db[i];
 +              kvm_update_dr0123(vcpu);
 +              kvm_update_dr6(vcpu);
 +              kvm_update_dr7(vcpu);
 +              vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
        }
  
        /*
@@@ -7038,7 -7053,7 +7052,7 @@@ int kvm_arch_vcpu_ioctl_get_mpstate(str
  int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
  {
-       if (!kvm_vcpu_has_lapic(vcpu) &&
+       if (!lapic_in_kernel(vcpu) &&
            mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
                return -EINVAL;
  
@@@ -7314,7 -7329,7 +7328,7 @@@ void kvm_put_guest_fpu(struct kvm_vcpu 
         * Every 255 times fpu_counter rolls over to 0; a guest that uses
         * the FPU in bursts will revert to loading it on demand.
         */
-       if (!vcpu->arch.eager_fpu) {
+       if (!use_eager_fpu()) {
                if (++vcpu->fpu_counter < 5)
                        kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
        }
@@@ -7593,6 -7608,7 +7607,7 @@@ bool kvm_vcpu_compatible(struct kvm_vcp
  }
  
  struct static_key kvm_no_apic_vcpu __read_mostly;
+ EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
  
  int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
  {
@@@ -7724,6 -7740,9 +7739,9 @@@ int kvm_arch_init_vm(struct kvm *kvm, u
        INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
        INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
  
+       kvm_page_track_init(kvm);
+       kvm_mmu_init_vm(kvm);
        return 0;
  }
  
@@@ -7850,6 -7869,7 +7868,7 @@@ void kvm_arch_destroy_vm(struct kvm *kv
        kfree(kvm->arch.vioapic);
        kvm_free_vcpus(kvm);
        kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
+       kvm_mmu_uninit_vm(kvm);
  }
  
  void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
                        free->arch.lpage_info[i - 1] = NULL;
                }
        }
+       kvm_page_track_free_memslot(free, dont);
  }
  
  int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
        int i;
  
        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+               struct kvm_lpage_info *linfo;
                unsigned long ugfn;
                int lpages;
                int level = i + 1;
                if (i == 0)
                        continue;
  
-               slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages *
-                                       sizeof(*slot->arch.lpage_info[i - 1]));
-               if (!slot->arch.lpage_info[i - 1])
+               linfo = kvm_kvzalloc(lpages * sizeof(*linfo));
+               if (!linfo)
                        goto out_free;
  
+               slot->arch.lpage_info[i - 1] = linfo;
                if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
-                       slot->arch.lpage_info[i - 1][0].write_count = 1;
+                       linfo[0].disallow_lpage = 1;
                if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
-                       slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1;
+                       linfo[lpages - 1].disallow_lpage = 1;
                ugfn = slot->userspace_addr >> PAGE_SHIFT;
                /*
                 * If the gfn and userspace address are not aligned wrt each
                        unsigned long j;
  
                        for (j = 0; j < lpages; ++j)
-                               slot->arch.lpage_info[i - 1][j].write_count = 1;
+                               linfo[j].disallow_lpage = 1;
                }
        }
  
+       if (kvm_page_track_create_memslot(slot, npages))
+               goto out_free;
        return 0;
  
  out_free:
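
The memslot code above renames write_count to disallow_lpage and keeps the same edge rule: if a slot's first or last guest frame is not aligned to a large-page boundary, the partial large page at that edge can never be mapped huge (the new kvm_page_track_create_memslot() call then layers write tracking on top). A quick standalone version of that boundary check, assuming 4 KiB base pages and 2 MiB large pages, with made-up slot numbers:

    #include <stdio.h>

    #define PAGES_PER_2M 512ul     /* 2 MiB / 4 KiB */

    int main(void)
    {
            unsigned long base_gfn = 0x100a;    /* deliberately not 2 MiB aligned */
            unsigned long npages   = 4096;
            unsigned long lpages   =
                    (base_gfn + npages + PAGES_PER_2M - 1) / PAGES_PER_2M -
                    base_gfn / PAGES_PER_2M;

            int head = (base_gfn & (PAGES_PER_2M - 1)) != 0;
            int tail = ((base_gfn + npages) & (PAGES_PER_2M - 1)) != 0;

            printf("%lu candidate 2M mappings; disallow first: %d, disallow last: %d\n",
                   lpages, head, tail);
            return 0;
    }
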
@@@ -8370,6 -8397,12 +8396,12 @@@ int kvm_arch_update_irqfd_routing(struc
        return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
  }
  
+ bool kvm_vector_hashing_enabled(void)
+ {
+       return vector_hashing;
+ }
+ EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled);
+ 
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
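
The new vector_hashing parameter and kvm_vector_hashing_enabled() helper above only gate the alternative lowest-priority delivery mode; the selection logic itself lives elsewhere. As a hedged illustration of the general idea (assumed scheme: hash the vector number over the eligible vCPUs, here simply vector modulo their count, so a given vector always lands on the same destination):

    #include <stdio.h>

    static int pick_dest(unsigned int vector, const int *vcpus, int count)
    {
            return vcpus[vector % count];   /* assumed hash: plain modulo */
    }

    int main(void)
    {
            int candidates[] = { 0, 2, 5, 7 };   /* vCPUs eligible for delivery */

            for (unsigned int vec = 32; vec < 36; vec++)
                    printf("vector %u -> vCPU %d\n", vec,
                           pick_dest(vec, candidates, 4));
            return 0;
    }
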
index f0dd9d42bc7b1c9b55f48719cd0f3ded16bbe00f,ffe9d1c6b5884637c3c5c00153565a7fc8730235..5152b389815500a77a95cffe69a0602379d4f570
  #define CNTTIDR               0x08
  #define CNTTIDR_VIRT(n)       (BIT(1) << ((n) * 4))
  
 +#define CNTACR(n)     (0x40 + ((n) * 4))
 +#define CNTACR_RPCT   BIT(0)
 +#define CNTACR_RVCT   BIT(1)
 +#define CNTACR_RFRQ   BIT(2)
 +#define CNTACR_RVOFF  BIT(3)
 +#define CNTACR_RWVT   BIT(4)
 +#define CNTACR_RWPT   BIT(5)
 +
  #define CNTVCT_LO     0x08
  #define CNTVCT_HI     0x0c
  #define CNTFRQ                0x10
@@@ -75,7 -67,7 +75,7 @@@ static int arch_timer_ppi[MAX_TIMER_PPI
  
  static struct clock_event_device __percpu *arch_timer_evt;
  
- static bool arch_timer_use_virtual = true;
+ static enum ppi_nr arch_timer_uses_ppi = VIRT_PPI;
  static bool arch_timer_c3stop;
  static bool arch_timer_mem_use_virtual;
  
@@@ -271,16 -263,20 +271,22 @@@ static void __arch_timer_setup(unsigne
                clk->name = "arch_sys_timer";
                clk->rating = 450;
                clk->cpumask = cpumask_of(smp_processor_id());
-               if (arch_timer_use_virtual) {
-                       clk->irq = arch_timer_ppi[VIRT_PPI];
+               clk->irq = arch_timer_ppi[arch_timer_uses_ppi];
+               switch (arch_timer_uses_ppi) {
+               case VIRT_PPI:
                        clk->set_state_shutdown = arch_timer_shutdown_virt;
 +                      clk->set_state_oneshot_stopped = arch_timer_shutdown_virt;
                        clk->set_next_event = arch_timer_set_next_event_virt;
-               } else {
-                       clk->irq = arch_timer_ppi[PHYS_SECURE_PPI];
+                       break;
+               case PHYS_SECURE_PPI:
+               case PHYS_NONSECURE_PPI:
+               case HYP_PPI:
                        clk->set_state_shutdown = arch_timer_shutdown_phys;
 +                      clk->set_state_oneshot_stopped = arch_timer_shutdown_phys;
                        clk->set_next_event = arch_timer_set_next_event_phys;
+                       break;
+               default:
+                       BUG();
                }
        } else {
                clk->features |= CLOCK_EVT_FEAT_DYNIRQ;
                clk->cpumask = cpu_all_mask;
                if (arch_timer_mem_use_virtual) {
                        clk->set_state_shutdown = arch_timer_shutdown_virt_mem;
 +                      clk->set_state_oneshot_stopped = arch_timer_shutdown_virt_mem;
                        clk->set_next_event =
                                arch_timer_set_next_event_virt_mem;
                } else {
                        clk->set_state_shutdown = arch_timer_shutdown_phys_mem;
 +                      clk->set_state_oneshot_stopped = arch_timer_shutdown_phys_mem;
                        clk->set_next_event =
                                arch_timer_set_next_event_phys_mem;
                }
@@@ -350,17 -344,20 +356,20 @@@ static void arch_counter_set_user_acces
        arch_timer_set_cntkctl(cntkctl);
  }
  
+ static bool arch_timer_has_nonsecure_ppi(void)
+ {
+       return (arch_timer_uses_ppi == PHYS_SECURE_PPI &&
+               arch_timer_ppi[PHYS_NONSECURE_PPI]);
+ }
+ 
  static int arch_timer_setup(struct clock_event_device *clk)
  {
        __arch_timer_setup(ARCH_CP15_TIMER, clk);
  
-       if (arch_timer_use_virtual)
-               enable_percpu_irq(arch_timer_ppi[VIRT_PPI], 0);
-       else {
-               enable_percpu_irq(arch_timer_ppi[PHYS_SECURE_PPI], 0);
-               if (arch_timer_ppi[PHYS_NONSECURE_PPI])
-                       enable_percpu_irq(arch_timer_ppi[PHYS_NONSECURE_PPI], 0);
-       }
+       enable_percpu_irq(arch_timer_ppi[arch_timer_uses_ppi], 0);
+       if (arch_timer_has_nonsecure_ppi())
+               enable_percpu_irq(arch_timer_ppi[PHYS_NONSECURE_PPI], 0);
  
        arch_counter_set_user_access();
        if (IS_ENABLED(CONFIG_ARM_ARCH_TIMER_EVTSTREAM))
@@@ -402,7 -399,7 +411,7 @@@ static void arch_timer_banner(unsigned 
                     (unsigned long)arch_timer_rate / 1000000,
                     (unsigned long)(arch_timer_rate / 10000) % 100,
                     type & ARCH_CP15_TIMER ?
-                       arch_timer_use_virtual ? "virt" : "phys" :
+                    (arch_timer_uses_ppi == VIRT_PPI) ? "virt" : "phys" :
                        "",
                     type == (ARCH_CP15_TIMER | ARCH_MEM_TIMER) ?  "/" : "",
                     type & ARCH_MEM_TIMER ?
@@@ -472,7 -469,7 +481,7 @@@ static void __init arch_counter_registe
  
        /* Register the CP15 based counter if we have one */
        if (type & ARCH_CP15_TIMER) {
-               if (IS_ENABLED(CONFIG_ARM64) || arch_timer_use_virtual)
+               if (IS_ENABLED(CONFIG_ARM64) || arch_timer_uses_ppi == VIRT_PPI)
                        arch_timer_read_counter = arch_counter_get_cntvct;
                else
                        arch_timer_read_counter = arch_counter_get_cntpct;
@@@ -502,13 -499,9 +511,9 @@@ static void arch_timer_stop(struct cloc
        pr_debug("arch_timer_teardown disable IRQ%d cpu #%d\n",
                 clk->irq, smp_processor_id());
  
-       if (arch_timer_use_virtual)
-               disable_percpu_irq(arch_timer_ppi[VIRT_PPI]);
-       else {
-               disable_percpu_irq(arch_timer_ppi[PHYS_SECURE_PPI]);
-               if (arch_timer_ppi[PHYS_NONSECURE_PPI])
-                       disable_percpu_irq(arch_timer_ppi[PHYS_NONSECURE_PPI]);
-       }
+       disable_percpu_irq(arch_timer_ppi[arch_timer_uses_ppi]);
+       if (arch_timer_has_nonsecure_ppi())
+               disable_percpu_irq(arch_timer_ppi[PHYS_NONSECURE_PPI]);
  
        clk->set_state_shutdown(clk);
  }
@@@ -574,12 -567,14 +579,14 @@@ static int __init arch_timer_register(v
                goto out;
        }
  
-       if (arch_timer_use_virtual) {
-               ppi = arch_timer_ppi[VIRT_PPI];
+       ppi = arch_timer_ppi[arch_timer_uses_ppi];
+       switch (arch_timer_uses_ppi) {
+       case VIRT_PPI:
                err = request_percpu_irq(ppi, arch_timer_handler_virt,
                                         "arch_timer", arch_timer_evt);
-       } else {
-               ppi = arch_timer_ppi[PHYS_SECURE_PPI];
+               break;
+       case PHYS_SECURE_PPI:
+       case PHYS_NONSECURE_PPI:
                err = request_percpu_irq(ppi, arch_timer_handler_phys,
                                         "arch_timer", arch_timer_evt);
                if (!err && arch_timer_ppi[PHYS_NONSECURE_PPI]) {
                                free_percpu_irq(arch_timer_ppi[PHYS_SECURE_PPI],
                                                arch_timer_evt);
                }
+               break;
+       case HYP_PPI:
+               err = request_percpu_irq(ppi, arch_timer_handler_phys,
+                                        "arch_timer", arch_timer_evt);
+               break;
+       default:
+               BUG();
        }
  
        if (err) {
  out_unreg_notify:
        unregister_cpu_notifier(&arch_timer_cpu_nb);
  out_free_irq:
-       if (arch_timer_use_virtual)
-               free_percpu_irq(arch_timer_ppi[VIRT_PPI], arch_timer_evt);
-       else {
-               free_percpu_irq(arch_timer_ppi[PHYS_SECURE_PPI],
+       free_percpu_irq(arch_timer_ppi[arch_timer_uses_ppi], arch_timer_evt);
+       if (arch_timer_has_nonsecure_ppi())
+               free_percpu_irq(arch_timer_ppi[PHYS_NONSECURE_PPI],
                                arch_timer_evt);
-               if (arch_timer_ppi[PHYS_NONSECURE_PPI])
-                       free_percpu_irq(arch_timer_ppi[PHYS_NONSECURE_PPI],
-                                       arch_timer_evt);
-       }
  
  out_free:
        free_percpu(arch_timer_evt);
@@@ -709,12 -706,25 +718,25 @@@ static void __init arch_timer_init(void
         *
         * If no interrupt provided for virtual timer, we'll have to
         * stick to the physical timer. It'd better be accessible...
+        *
+        * On ARMv8.1 with VH extensions, the kernel runs in HYP. VHE
+        * accesses to CNTP_*_EL1 registers are silently redirected to
+        * their CNTHP_*_EL2 counterparts, and use a different PPI
+        * number.
         */
        if (is_hyp_mode_available() || !arch_timer_ppi[VIRT_PPI]) {
-               arch_timer_use_virtual = false;
+               bool has_ppi;
+               if (is_kernel_in_hyp_mode()) {
+                       arch_timer_uses_ppi = HYP_PPI;
+                       has_ppi = !!arch_timer_ppi[HYP_PPI];
+               } else {
+                       arch_timer_uses_ppi = PHYS_SECURE_PPI;
+                       has_ppi = (!!arch_timer_ppi[PHYS_SECURE_PPI] ||
+                                  !!arch_timer_ppi[PHYS_NONSECURE_PPI]);
+               }
  
-               if (!arch_timer_ppi[PHYS_SECURE_PPI] ||
-                   !arch_timer_ppi[PHYS_NONSECURE_PPI]) {
+               if (!has_ppi) {
                        pr_warn("arch_timer: No interrupt available, giving up\n");
                        return;
                }
@@@ -747,7 -757,7 +769,7 @@@ static void __init arch_timer_of_init(s
         */
        if (IS_ENABLED(CONFIG_ARM) &&
            of_property_read_bool(np, "arm,cpu-registers-not-fw-configured"))
-                       arch_timer_use_virtual = false;
+               arch_timer_uses_ppi = PHYS_SECURE_PPI;
  
        arch_timer_init();
  }
@@@ -769,6 -779,7 +791,6 @@@ static void __init arch_timer_mem_init(
        }
  
        cnttidr = readl_relaxed(cntctlbase + CNTTIDR);
 -      iounmap(cntctlbase);
  
        /*
         * Try to find a virtual capable frame. Otherwise fall back to a
         */
        for_each_available_child_of_node(np, frame) {
                int n;
 +              u32 cntacr;
  
                if (of_property_read_u32(frame, "frame-number", &n)) {
                        pr_err("arch_timer: Missing frame-number\n");
 -                      of_node_put(best_frame);
                        of_node_put(frame);
 -                      return;
 +                      goto out;
                }
  
 -              if (cnttidr & CNTTIDR_VIRT(n)) {
 +              /* Try enabling everything, and see what sticks */
 +              cntacr = CNTACR_RFRQ | CNTACR_RWPT | CNTACR_RPCT |
 +                       CNTACR_RWVT | CNTACR_RVOFF | CNTACR_RVCT;
 +              writel_relaxed(cntacr, cntctlbase + CNTACR(n));
 +              cntacr = readl_relaxed(cntctlbase + CNTACR(n));
 +
 +              if ((cnttidr & CNTTIDR_VIRT(n)) &&
 +                  !(~cntacr & (CNTACR_RWVT | CNTACR_RVCT))) {
                        of_node_put(best_frame);
                        best_frame = frame;
                        arch_timer_mem_use_virtual = true;
                        break;
                }
 +
 +              if (~cntacr & (CNTACR_RWPT | CNTACR_RPCT))
 +                      continue;
 +
                of_node_put(best_frame);
                best_frame = of_node_get(frame);
        }
        base = arch_counter_base = of_iomap(best_frame, 0);
        if (!base) {
                pr_err("arch_timer: Can't map frame's registers\n");
 -              of_node_put(best_frame);
 -              return;
 +              goto out;
        }
  
        if (arch_timer_mem_use_virtual)
                irq = irq_of_parse_and_map(best_frame, 1);
        else
                irq = irq_of_parse_and_map(best_frame, 0);
 -      of_node_put(best_frame);
 +
        if (!irq) {
                pr_err("arch_timer: Frame missing %s irq",
                       arch_timer_mem_use_virtual ? "virt" : "phys");
 -              return;
 +              goto out;
        }
  
        arch_timer_detect_rate(base, np);
        arch_timer_mem_register(base, irq);
        arch_timer_common_init();
 +out:
 +      iounmap(cntctlbase);
 +      of_node_put(best_frame);
  }
  CLOCKSOURCE_OF_DECLARE(armv7_arch_timer_mem, "arm,armv7-timer-mem",
                       arch_timer_mem_init);
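
The frame-selection loop above now probes each timer frame's CNTACR: write the full access mask, read it back, and only treat the frame as virtual-capable if RVCT and RWVT both stuck (or as physical-capable if RPCT and RWPT did). A mock of that write-then-read-back probe, reusing the CNTACR_* bit definitions added at the top of the file but against a simulated register:

    #include <stdint.h>
    #include <stdio.h>

    #define CNTACR_RPCT  (1u << 0)
    #define CNTACR_RVCT  (1u << 1)
    #define CNTACR_RFRQ  (1u << 2)
    #define CNTACR_RVOFF (1u << 3)
    #define CNTACR_RWVT  (1u << 4)
    #define CNTACR_RWPT  (1u << 5)

    /* Fake access-control register: only some bits are implemented/writable. */
    static uint32_t fake_cntacr;
    static const uint32_t implemented = CNTACR_RPCT | CNTACR_RWPT | CNTACR_RFRQ;

    static void reg_write(uint32_t val) { fake_cntacr = val & implemented; }
    static uint32_t reg_read(void)      { return fake_cntacr; }

    int main(void)
    {
            /* Try enabling everything, and see what sticks. */
            reg_write(CNTACR_RFRQ | CNTACR_RWPT | CNTACR_RPCT |
                      CNTACR_RWVT | CNTACR_RVOFF | CNTACR_RVCT);

            uint32_t acr = reg_read();
            int virt_ok = !(~acr & (CNTACR_RWVT | CNTACR_RVCT));
            int phys_ok = !(~acr & (CNTACR_RWPT | CNTACR_RPCT));

            printf("virtual timer usable: %d, physical timer usable: %d\n",
                   virt_ok, phys_ok);   /* 0 and 1 for this mock */
            return 0;
    }
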
diff --combined virt/kvm/async_pf.c
index 65da997b430a8e8c2b3d8eb9de93fb6b42d00f80,b866374282be995ae37364784fb8903e3b0ebd98..f0d061f92674c674803720a0cadfdc04df8a4a0c
@@@ -97,8 -97,8 +97,8 @@@ static void async_pf_execute(struct wor
         * This memory barrier pairs with prepare_to_wait's set_current_state()
         */
        smp_mb();
 -      if (waitqueue_active(&vcpu->wq))
 -              wake_up_interruptible(&vcpu->wq);
 +      if (swait_active(&vcpu->wq))
 +              swake_up(&vcpu->wq);
  
        mmput(mm);
        kvm_put_kvm(vcpu->kvm);
@@@ -109,8 -109,8 +109,8 @@@ void kvm_clear_async_pf_completion_queu
        /* cancel outstanding work queue item */
        while (!list_empty(&vcpu->async_pf.queue)) {
                struct kvm_async_pf *work =
-                       list_entry(vcpu->async_pf.queue.next,
-                                  typeof(*work), queue);
+                       list_first_entry(&vcpu->async_pf.queue,
+                                        typeof(*work), queue);
                list_del(&work->queue);
  
  #ifdef CONFIG_KVM_ASYNC_PF_SYNC
        spin_lock(&vcpu->async_pf.lock);
        while (!list_empty(&vcpu->async_pf.done)) {
                struct kvm_async_pf *work =
-                       list_entry(vcpu->async_pf.done.next,
-                                  typeof(*work), link);
+                       list_first_entry(&vcpu->async_pf.done,
+                                        typeof(*work), link);
                list_del(&work->link);
                kmem_cache_free(async_pf_cache, work);
        }
diff --combined virt/kvm/kvm_main.c
index 5af50c3ddd535a5094b8de788bd88cc17085fb17,1eae05236347f1d1c4c6cc9912a524a845c53218..7ba1d10ffed2d5a416701153caea619c7ab22f8d
@@@ -72,11 -72,11 +72,11 @@@ module_param(halt_poll_ns, uint, S_IRUG
  
  /* Default doubles per-vcpu halt_poll_ns. */
  static unsigned int halt_poll_ns_grow = 2;
- module_param(halt_poll_ns_grow, int, S_IRUGO);
+ module_param(halt_poll_ns_grow, uint, S_IRUGO | S_IWUSR);
  
  /* Default resets per-vcpu halt_poll_ns . */
  static unsigned int halt_poll_ns_shrink;
- module_param(halt_poll_ns_shrink, int, S_IRUGO);
+ module_param(halt_poll_ns_shrink, uint, S_IRUGO | S_IWUSR);
  
  /*
   * Ordering of locks:
@@@ -216,7 -216,8 +216,7 @@@ int kvm_vcpu_init(struct kvm_vcpu *vcpu
        vcpu->kvm = kvm;
        vcpu->vcpu_id = id;
        vcpu->pid = NULL;
 -      vcpu->halt_poll_ns = 0;
 -      init_waitqueue_head(&vcpu->wq);
 +      init_swait_queue_head(&vcpu->wq);
        kvm_async_pf_vcpu_init(vcpu);
  
        vcpu->pre_pcpu = -1;
@@@ -619,13 -620,10 +619,10 @@@ void *kvm_kvzalloc(unsigned long size
  
  static void kvm_destroy_devices(struct kvm *kvm)
  {
-       struct list_head *node, *tmp;
+       struct kvm_device *dev, *tmp;
  
-       list_for_each_safe(node, tmp, &kvm->devices) {
-               struct kvm_device *dev =
-                       list_entry(node, struct kvm_device, vm_node);
-               list_del(node);
+       list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
+               list_del(&dev->vm_node);
                dev->ops->destroy(dev);
        }
  }
@@@ -1436,11 -1434,17 +1433,17 @@@ kvm_pfn_t __gfn_to_pfn_memslot(struct k
  {
        unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
  
-       if (addr == KVM_HVA_ERR_RO_BAD)
+       if (addr == KVM_HVA_ERR_RO_BAD) {
+               if (writable)
+                       *writable = false;
                return KVM_PFN_ERR_RO_FAULT;
+       }
  
-       if (kvm_is_error_hva(addr))
+       if (kvm_is_error_hva(addr)) {
+               if (writable)
+                       *writable = false;
                return KVM_PFN_NOSLOT;
+       }
  
        /* Do not map writable pfn in the readonly memslot. */
        if (writable && memslot_is_readonly(slot)) {
@@@ -1942,31 -1946,30 +1945,33 @@@ EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_di
  
  static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
  {
-       int old, val;
+       unsigned int old, val, grow;
  
        old = val = vcpu->halt_poll_ns;
+       grow = READ_ONCE(halt_poll_ns_grow);
        /* 10us base */
-       if (val == 0 && halt_poll_ns_grow)
+       if (val == 0 && grow)
                val = 10000;
        else
-               val *= halt_poll_ns_grow;
+               val *= grow;
  
 +      if (val > halt_poll_ns)
 +              val = halt_poll_ns;
 +
        vcpu->halt_poll_ns = val;
        trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
  }
  
  static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
  {
-       int old, val;
+       unsigned int old, val, shrink;
  
        old = val = vcpu->halt_poll_ns;
-       if (halt_poll_ns_shrink == 0)
+       shrink = READ_ONCE(halt_poll_ns_shrink);
+       if (shrink == 0)
                val = 0;
        else
-               val /= halt_poll_ns_shrink;
+               val /= shrink;
  
        vcpu->halt_poll_ns = val;
        trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
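
With the clamp added in grow_halt_poll_ns() above, the per-vcpu polling window starts from a 10 us base, is multiplied by halt_poll_ns_grow on grow, capped at the global halt_poll_ns, and divided (or reset to zero) on shrink. A standalone run of that policy (the 500 us cap, grow of 2 and shrink of 0 below are example values; the module parameters are now also writable at runtime):

    #include <stdio.h>

    static unsigned int halt_poll_ns  = 500000;   /* global cap            */
    static unsigned int grow_factor   = 2;        /* halt_poll_ns_grow     */
    static unsigned int shrink_factor = 0;        /* 0 means reset to zero */

    static unsigned int grow(unsigned int val)
    {
            if (val == 0 && grow_factor)
                    val = 10000;                  /* 10us base       */
            else
                    val *= grow_factor;
            if (val > halt_poll_ns)
                    val = halt_poll_ns;           /* new upper clamp */
            return val;
    }

    static unsigned int shrink(unsigned int val)
    {
            return shrink_factor ? val / shrink_factor : 0;
    }

    int main(void)
    {
            unsigned int v = 0;

            for (int i = 1; i <= 8; i++) {
                    v = grow(v);
                    printf("after grow %d: %u ns\n", i, v);
            }
            printf("after shrink:  %u ns\n", shrink(v));
            return 0;
    }
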
@@@ -1992,7 -1995,7 +1997,7 @@@ static int kvm_vcpu_check_block(struct 
  void kvm_vcpu_block(struct kvm_vcpu *vcpu)
  {
        ktime_t start, cur;
 -      DEFINE_WAIT(wait);
 +      DECLARE_SWAITQUEUE(wait);
        bool waited = false;
        u64 block_ns;
  
        kvm_arch_vcpu_blocking(vcpu);
  
        for (;;) {
 -              prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
 +              prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
  
                if (kvm_vcpu_check_block(vcpu) < 0)
                        break;
                schedule();
        }
  
 -      finish_wait(&vcpu->wq, &wait);
 +      finish_swait(&vcpu->wq, &wait);
        cur = ktime_get();
  
        kvm_arch_vcpu_unblocking(vcpu);
@@@ -2058,11 -2061,11 +2063,11 @@@ void kvm_vcpu_kick(struct kvm_vcpu *vcp
  {
        int me;
        int cpu = vcpu->cpu;
 -      wait_queue_head_t *wqp;
 +      struct swait_queue_head *wqp;
  
        wqp = kvm_arch_vcpu_wq(vcpu);
 -      if (waitqueue_active(wqp)) {
 -              wake_up_interruptible(wqp);
 +      if (swait_active(wqp)) {
 +              swake_up(wqp);
                ++vcpu->stat.halt_wakeup;
        }
  
@@@ -2163,7 -2166,7 +2168,7 @@@ void kvm_vcpu_on_spin(struct kvm_vcpu *
                                continue;
                        if (vcpu == me)
                                continue;
 -                      if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
 +                      if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
                                continue;
                        if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
                                continue;