UBUNTU: SAUCE: kvm: x86: mmu: Recovery of shattered NX large pages

author Junaid Shahid <junaids@google.com>
Thu, 31 Oct 2019 23:33:47 +0000 (00:33 +0100)
committer Stefan Bader <stefan.bader@canonical.com>
Mon, 4 Nov 2019 17:13:16 +0000 (18:13 +0100)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt

index da8209d6f3b7efa8545e9d2335e1579ab161c850..f11e255a1c4015453d67f0508e2604649fc64e55 100644 (file)
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2055,6 +2055,12 @@
                         If the sw workaround is enabled for the host, guests
                         need not enable it for nested guests.
  
+       kvm.nx_huge_pages_recovery_ratio=
+                       [KVM] Controls how many 4KiB pages are periodically zapped
+                       back to huge pages.  0 disables the recovery; otherwise, if
+                       the value is N, KVM will zap 1/Nth of the 4KiB pages every
+                       minute.  The default is 60.
+
         kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM.
                         Default is 1 (enabled)
  
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h

index 9608924e2e4ffbbd40d7565963f91295fd420db2..f68e174f452f879fb066e38b45f538aed74e3a00 100644 (file)
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -319,6 +319,8 @@ struct kvm_rmap_head {
  struct kvm_mmu_page {
         struct list_head link;
         struct hlist_node hash_link;
+       struct list_head lpage_disallowed_link;
+
         bool unsync;
         bool mmio_cached;
         bool lpage_disallowed; /* Can't be replaced by an equiv large page */
@@ -864,6 +866,7 @@ struct kvm_arch {
          * Hash table of struct kvm_mmu_page.
          */
         struct list_head active_mmu_pages;
+       struct list_head lpage_disallowed_mmu_pages;
         struct kvm_page_track_notifier_node mmu_sp_tracker;
         struct kvm_page_track_notifier_head track_notifier_head;
  
@@ -938,6 +941,7 @@ struct kvm_arch {
         bool exception_payload_enabled;
  
         struct kvm_pmu_event_filter *pmu_event_filter;
+       struct task_struct *nx_lpage_recovery_thread;
  };
  
  struct kvm_vm_stat {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c

index 6987c84b7a62bf016e67b13c79b6bd880d75b069..9f08bbd0bf3aef04f50649bbd8060100b8778fd8 100644 (file)
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -37,6 +37,7 @@
  #include <linux/uaccess.h>
  #include <linux/hash.h>
  #include <linux/kern_levels.h>
+#include <linux/kthread.h>
  
  #include <asm/page.h>
  #include <asm/pat.h>
@@ -50,16 +51,26 @@
  extern bool itlb_multihit_kvm_mitigation;
  
  static int __read_mostly nx_huge_pages = -1;
+static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
  
  static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
+static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);
  
  static struct kernel_param_ops nx_huge_pages_ops = {
         .set = set_nx_huge_pages,
         .get = param_get_bool,
  };
  
+static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
+       .set = set_nx_huge_pages_recovery_ratio,
+       .get = param_get_uint,
+};
+
  module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
  __MODULE_PARM_TYPE(nx_huge_pages, "bool");
+module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
+               &nx_huge_pages_recovery_ratio, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
  
  /*
   * When setting this variable to true it enables Two-Dimensional-Paging
@@ -1187,6 +1198,8 @@ static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
                 return;
  
         ++kvm->stat.nx_lpage_splits;
+       list_add_tail(&sp->lpage_disallowed_link,
+                     &kvm->arch.lpage_disallowed_mmu_pages);
         sp->lpage_disallowed = true;
  }
  
@@ -1211,6 +1224,7 @@ static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
  {
         --kvm->stat.nx_lpage_splits;
         sp->lpage_disallowed = false;
+       list_del(&sp->lpage_disallowed_link);
  }
  
  static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
@@ -6212,6 +6226,8 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
                         idx = srcu_read_lock(&kvm->srcu);
                         kvm_mmu_zap_all_fast(kvm);
                         srcu_read_unlock(&kvm->srcu, idx);
+
+                       wake_up_process(kvm->arch.nx_lpage_recovery_thread);
                 }
                 mutex_unlock(&kvm_lock);
         }
@@ -6305,3 +6321,116 @@ void kvm_mmu_module_exit(void)
         unregister_shrinker(&mmu_shrinker);
         mmu_audit_disable();
  }
+
+static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp)
+{
+       unsigned int old_val;
+       int err;
+
+       old_val = nx_huge_pages_recovery_ratio;
+       err = param_set_uint(val, kp);
+       if (err)
+               return err;
+
+       if (READ_ONCE(nx_huge_pages) &&
+           !old_val && nx_huge_pages_recovery_ratio) {
+               struct kvm *kvm;
+
+               mutex_lock(&kvm_lock);
+
+               list_for_each_entry(kvm, &vm_list, vm_list)
+                       wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+
+               mutex_unlock(&kvm_lock);
+       }
+
+       return err;
+}
+
+static void kvm_recover_nx_lpages(struct kvm *kvm)
+{
+       int rcu_idx;
+       struct kvm_mmu_page *sp;
+       unsigned int ratio;
+       LIST_HEAD(invalid_list);
+       ulong to_zap;
+
+       rcu_idx = srcu_read_lock(&kvm->srcu);
+       spin_lock(&kvm->mmu_lock);
+
+       ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+       to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
+       while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) {
+               /*
+                * We use a separate list instead of just using active_mmu_pages
+                * because the number of lpage_disallowed pages is expected to
+                * be relatively small compared to the total.
+                */
+               sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
+                                     struct kvm_mmu_page,
+                                     lpage_disallowed_link);
+               WARN_ON_ONCE(!sp->lpage_disallowed);
+               kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+               WARN_ON_ONCE(sp->lpage_disallowed);
+
+               if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+                       kvm_mmu_commit_zap_page(kvm, &invalid_list);
+                       if (to_zap)
+                               cond_resched_lock(&kvm->mmu_lock);
+               }
+       }
+
+       spin_unlock(&kvm->mmu_lock);
+       srcu_read_unlock(&kvm->srcu, rcu_idx);
+}
+
+static long get_nx_lpage_recovery_timeout(u64 start_time)
+{
+       return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio)
+               ? start_time + 60 * HZ - get_jiffies_64()
+               : MAX_SCHEDULE_TIMEOUT;
+}
+
+static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
+{
+       u64 start_time;
+       long remaining_time;
+
+       while (true) {
+               start_time = get_jiffies_64();
+               remaining_time = get_nx_lpage_recovery_timeout(start_time);
+
+               set_current_state(TASK_INTERRUPTIBLE);
+               while (!kthread_should_stop() && remaining_time > 0) {
+                       schedule_timeout(remaining_time);
+                       remaining_time = get_nx_lpage_recovery_timeout(start_time);
+                       set_current_state(TASK_INTERRUPTIBLE);
+               }
+
+               set_current_state(TASK_RUNNING);
+
+               if (kthread_should_stop())
+                       return 0;
+
+               kvm_recover_nx_lpages(kvm);
+       }
+}
+
+int kvm_mmu_post_init_vm(struct kvm *kvm)
+{
+       int err;
+
+       err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
+                                         "kvm-nx-lpage-recovery",
+                                         &kvm->arch.nx_lpage_recovery_thread);
+       if (!err)
+               kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
+
+       return err;
+}
+
+void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
+{
+       if (kvm->arch.nx_lpage_recovery_thread)
+               kthread_stop(kvm->arch.nx_lpage_recovery_thread);
+}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h

index 54c2a377795be6920bee9676e58555110c3a56b9..4610230ddaeab9add37cb8c9110bb47d54865c5e 100644 (file)
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -210,4 +210,8 @@ void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
  bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
                                     struct kvm_memory_slot *slot, u64 gfn);
  int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
+
+int kvm_mmu_post_init_vm(struct kvm *kvm);
+void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
+
  #endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c

index 38b80b1d740d455cc67e5332fd7e55fc21a9fab8..778b3a89976927e29d37c83b749161a2ef7841fe 100644 (file)
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9342,6 +9342,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
  
         INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
         INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+       INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
         INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
         atomic_set(&kvm->arch.noncoherent_dma_count, 0);
  
@@ -9373,6 +9374,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
         return 0;
  }
  
+int kvm_arch_post_init_vm(struct kvm *kvm)
+{
+       return kvm_mmu_post_init_vm(kvm);
+}
+
  static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
  {
         vcpu_load(vcpu);
@@ -9474,6 +9480,11 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
  }
  EXPORT_SYMBOL_GPL(x86_set_memory_region);
  
+void kvm_arch_pre_destroy_vm(struct kvm *kvm)
+{
+       kvm_mmu_pre_destroy_vm(kvm);
+}
+
  void kvm_arch_destroy_vm(struct kvm *kvm)
  {
         if (current->mm == kvm->mm) {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c

index 0b8d9ff5a4b0fbdecfb45d02913b95590e84292b..9d4e03eddccf5c901fd7a66419ca7dc2685bca07 100644 (file)
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -626,6 +626,23 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
         return 0;
  }
  
+/*
+ * Called after the VM is otherwise initialized, but just before adding it to
+ * the vm_list.
+ */
+int __weak kvm_arch_post_init_vm(struct kvm *kvm)
+{
+       return 0;
+}
+
+/*
+ * Called just after removing the VM from the vm_list, but before doing any
+ * other destruction.
+ */
+void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
+{
+}
+
  static struct kvm *kvm_create_vm(unsigned long type)
  {
         int r, i;
@@ -676,10 +693,14 @@ static struct kvm *kvm_create_vm(unsigned long type)
                 rcu_assign_pointer(kvm->buses[i],
                         kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
                 if (!kvm->buses[i])
-                       goto out_err;
+                       goto out_err_no_mmu_notifier;
         }
  
         r = kvm_init_mmu_notifier(kvm);
+       if (r)
+               goto out_err_no_mmu_notifier;
+
+       r = kvm_arch_post_init_vm(kvm);
         if (r)
                 goto out_err;
  
@@ -692,6 +713,11 @@ static struct kvm *kvm_create_vm(unsigned long type)
         return kvm;
  
  out_err:
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+       if (kvm->mmu_notifier.ops)
+               mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
+#endif
+out_err_no_mmu_notifier:
         cleanup_srcu_struct(&kvm->irq_srcu);
  out_err_no_irq_srcu:
         cleanup_srcu_struct(&kvm->srcu);
@@ -734,6 +760,8 @@ static void kvm_destroy_vm(struct kvm *kvm)
         mutex_lock(&kvm_lock);
         list_del(&kvm->vm_list);
         mutex_unlock(&kvm_lock);
+       kvm_arch_pre_destroy_vm(kvm);
+
         kvm_free_irq_routing(kvm);
         for (i = 0; i < KVM_NR_BUSES; i++) {
                 struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
author	Junaid Shahid <junaids@google.com>
	Thu, 31 Oct 2019 23:33:47 +0000 (00:33 +0100)
committer	Stefan Bader <stefan.bader@canonical.com>
	Mon, 4 Nov 2019 17:13:16 +0000 (18:13 +0100)
Documentation/admin-guide/kernel-parameters.txt		patch \| blob \| blame \| history
arch/x86/include/asm/kvm_host.h		patch \| blob \| blame \| history
arch/x86/kvm/mmu.c		patch \| blob \| blame \| history
arch/x86/kvm/mmu.h		patch \| blob \| blame \| history
arch/x86/kvm/x86.c		patch \| blob \| blame \| history
virt/kvm/kvm_main.c		patch \| blob \| blame \| history