From 9a9f6e04a7532522df6c78c1e8c4bc121a2265ea Mon Sep 17 00:00:00 2001 From: =?utf8?q?Fabian=20Gr=C3=BCnbichler?= Date: Mon, 4 Dec 2017 09:07:35 +0100 Subject: [PATCH] cherry-pick final KVM BSOD fix --- ...7-KVM-x86-fix-APIC-page-invalidation.patch | 90 ++ ...vert-Merge-branch-mmu_notifier_fixes.patch | 825 ------------------ 2 files changed, 90 insertions(+), 825 deletions(-) create mode 100644 patches/kernel/0007-KVM-x86-fix-APIC-page-invalidation.patch delete mode 100644 patches/kernel/0007-Revert-Merge-branch-mmu_notifier_fixes.patch diff --git a/patches/kernel/0007-KVM-x86-fix-APIC-page-invalidation.patch b/patches/kernel/0007-KVM-x86-fix-APIC-page-invalidation.patch new file mode 100644 index 0000000..1f00dd7 --- /dev/null +++ b/patches/kernel/0007-KVM-x86-fix-APIC-page-invalidation.patch @@ -0,0 +1,90 @@ +From c7e2dabc8654e9bf08849bd33d3aa0ba9a13f2b2 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= +Date: Thu, 30 Nov 2017 19:05:45 +0100 +Subject: [PATCH 07/13] KVM: x86: fix APIC page invalidation +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Implementation of the unpinned APIC page didn't update the VMCS address +cache when invalidation was done through range mmu notifiers. +This became a problem when the page notifier was removed. + +Re-introduce the arch-specific helper and call it from ...range_start. + +Fixes: 38b9917350cb ("kvm: vmx: Implement set_apic_access_page_addr") +Fixes: 369ea8242c0f ("mm/rmap: update to new mmu_notifier semantic v2") +Signed-off-by: Radim Krčmář +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/kvm_host.h | 3 +++ + arch/x86/kvm/x86.c | 14 ++++++++++++++ + virt/kvm/kvm_main.c | 8 ++++++++ + 3 files changed, 25 insertions(+) + +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 92c9032502d8..b69af3df978a 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1437,4 +1437,7 @@ static inline int kvm_cpu_get_apicid(int mps_cpu) + #endif + } + ++void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, ++ unsigned long start, unsigned long end); ++ + #endif /* _ASM_X86_KVM_HOST_H */ +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 7351cdc46cc7..703cd4171921 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -6711,6 +6711,20 @@ static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu) + kvm_x86_ops->tlb_flush(vcpu); + } + ++void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, ++ unsigned long start, unsigned long end) ++{ ++ unsigned long apic_address; ++ ++ /* ++ * The physical address of apic access page is stored in the VMCS. ++ * Update it when it becomes invalid. ++ */ ++ apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); ++ if (start <= apic_address && apic_address < end) ++ kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); ++} ++ + void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) + { + struct page *page = NULL; +diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c +index bfa9c4d34102..d0085c9d6297 100644 +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -136,6 +136,11 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); + static unsigned long long kvm_createvm_count; + static unsigned long long kvm_active_vms; + ++__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, ++ unsigned long start, unsigned long end) ++{ ++} ++ + bool kvm_is_reserved_pfn(kvm_pfn_t pfn) + { + if (pfn_valid(pfn)) +@@ -361,6 +366,9 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, + kvm_flush_remote_tlbs(kvm); + + spin_unlock(&kvm->mmu_lock); ++ ++ kvm_arch_mmu_notifier_invalidate_range(kvm, start, end); ++ + srcu_read_unlock(&kvm->srcu, idx); + } + +-- +2.14.2 + diff --git a/patches/kernel/0007-Revert-Merge-branch-mmu_notifier_fixes.patch b/patches/kernel/0007-Revert-Merge-branch-mmu_notifier_fixes.patch deleted file mode 100644 index 0608d3a..0000000 --- a/patches/kernel/0007-Revert-Merge-branch-mmu_notifier_fixes.patch +++ /dev/null @@ -1,825 +0,0 @@ -From dfd4ec1fd8d1d09930e9cf9ed7ebd07a66813337 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Fabian=20Gr=C3=BCnbichler?= -Date: Wed, 29 Nov 2017 09:45:44 +0100 -Subject: [PATCH 7/7] Revert "Merge branch 'mmu_notifier_fixes'" -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -This reverts commit ea25c43179462e342d4a0e66c3f6a5f53514da05, reversing -changes made to c227390c91a355300f47f9bef0aefbdfaaca1500. - -This series causes blue screens in Windows VMs running under heavy -memory/swap pressure. - -Signed-off-by: Fabian Grünbichler ---- - arch/arm/include/asm/kvm_host.h | 6 +++++ - arch/arm64/include/asm/kvm_host.h | 6 +++++ - arch/mips/include/asm/kvm_host.h | 5 ++++ - arch/powerpc/include/asm/kvm_host.h | 5 ++++ - arch/x86/include/asm/kvm_host.h | 2 ++ - include/linux/mm.h | 1 - - include/linux/mmu_notifier.h | 25 +++++++++++++++++++ - arch/powerpc/platforms/powernv/npu-dma.c | 10 ++++++++ - arch/x86/kvm/x86.c | 11 +++++++++ - drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 31 +++++++++++++++++++++++ - drivers/infiniband/core/umem_odp.c | 19 +++++++++++++++ - drivers/infiniband/hw/hfi1/mmu_rb.c | 9 +++++++ - drivers/iommu/amd_iommu_v2.c | 8 ++++++ - drivers/iommu/intel-svm.c | 9 +++++++ - drivers/misc/mic/scif/scif_dma.c | 11 +++++++++ - drivers/misc/sgi-gru/grutlbpurge.c | 12 +++++++++ - drivers/xen/gntdev.c | 8 ++++++ - fs/dax.c | 19 ++++++--------- - mm/memory.c | 26 ++++---------------- - mm/mmu_notifier.c | 14 +++++++++++ - mm/rmap.c | 35 +++----------------------- - virt/kvm/kvm_main.c | 42 ++++++++++++++++++++++++++++++++ - 22 files changed, 249 insertions(+), 65 deletions(-) - -diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h -index 4a879f6ff13b..127e2dd2e21c 100644 ---- a/arch/arm/include/asm/kvm_host.h -+++ b/arch/arm/include/asm/kvm_host.h -@@ -225,6 +225,12 @@ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); - int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); - int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); - -+/* We do not have shadow page tables, hence the empty hooks */ -+static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, -+ unsigned long address) -+{ -+} -+ - struct kvm_vcpu *kvm_arm_get_running_vcpu(void); - struct kvm_vcpu __percpu **kvm_get_running_vcpus(void); - void kvm_arm_halt_guest(struct kvm *kvm); -diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h -index e923b58606e2..d68630007b14 100644 ---- a/arch/arm64/include/asm/kvm_host.h -+++ b/arch/arm64/include/asm/kvm_host.h -@@ -326,6 +326,12 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); - int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); - int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); - -+/* We do not have shadow page tables, hence the empty hooks */ -+static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, -+ unsigned long address) -+{ -+} -+ - struct kvm_vcpu *kvm_arm_get_running_vcpu(void); - struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); - void kvm_arm_halt_guest(struct kvm *kvm); -diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h -index a9af1d2dcd69..2998479fd4e8 100644 ---- a/arch/mips/include/asm/kvm_host.h -+++ b/arch/mips/include/asm/kvm_host.h -@@ -938,6 +938,11 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); - int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); - int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); - -+static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, -+ unsigned long address) -+{ -+} -+ - /* Emulation */ - int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out); - enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause); -diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h -index e372ed871c51..8b3f1238d07f 100644 ---- a/arch/powerpc/include/asm/kvm_host.h -+++ b/arch/powerpc/include/asm/kvm_host.h -@@ -67,6 +67,11 @@ extern int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); - extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); - extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); - -+static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, -+ unsigned long address) -+{ -+} -+ - #define HPTEG_CACHE_NUM (1 << 15) - #define HPTEG_HASH_BITS_PTE 13 - #define HPTEG_HASH_BITS_PTE_LONG 12 -diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h -index 92c9032502d8..f4d120a3e22e 100644 ---- a/arch/x86/include/asm/kvm_host.h -+++ b/arch/x86/include/asm/kvm_host.h -@@ -1375,6 +1375,8 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); - int kvm_cpu_get_interrupt(struct kvm_vcpu *v); - void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); - void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu); -+void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, -+ unsigned long address); - - void kvm_define_shared_msr(unsigned index, u32 msr); - int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); -diff --git a/include/linux/mm.h b/include/linux/mm.h -index 07630442bbf2..701de4b55ece 100644 ---- a/include/linux/mm.h -+++ b/include/linux/mm.h -@@ -1260,7 +1260,6 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, - void unmap_mapping_range(struct address_space *mapping, - loff_t const holebegin, loff_t const holelen, int even_cows); - int follow_pte_pmd(struct mm_struct *mm, unsigned long address, -- unsigned long *start, unsigned long *end, - pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); - int follow_pfn(struct vm_area_struct *vma, unsigned long address, - unsigned long *pfn); -diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h -index 6866e8126982..947f21b451d2 100644 ---- a/include/linux/mmu_notifier.h -+++ b/include/linux/mmu_notifier.h -@@ -94,6 +94,17 @@ struct mmu_notifier_ops { - unsigned long address, - pte_t pte); - -+ /* -+ * Before this is invoked any secondary MMU is still ok to -+ * read/write to the page previously pointed to by the Linux -+ * pte because the page hasn't been freed yet and it won't be -+ * freed until this returns. If required set_page_dirty has to -+ * be called internally to this method. -+ */ -+ void (*invalidate_page)(struct mmu_notifier *mn, -+ struct mm_struct *mm, -+ unsigned long address); -+ - /* - * invalidate_range_start() and invalidate_range_end() must be - * paired and are called only when the mmap_sem and/or the -@@ -209,6 +220,8 @@ extern int __mmu_notifier_test_young(struct mm_struct *mm, - unsigned long address); - extern void __mmu_notifier_change_pte(struct mm_struct *mm, - unsigned long address, pte_t pte); -+extern void __mmu_notifier_invalidate_page(struct mm_struct *mm, -+ unsigned long address); - extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, - unsigned long start, unsigned long end); - extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, -@@ -255,6 +268,13 @@ static inline void mmu_notifier_change_pte(struct mm_struct *mm, - __mmu_notifier_change_pte(mm, address, pte); - } - -+static inline void mmu_notifier_invalidate_page(struct mm_struct *mm, -+ unsigned long address) -+{ -+ if (mm_has_notifiers(mm)) -+ __mmu_notifier_invalidate_page(mm, address); -+} -+ - static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm, - unsigned long start, unsigned long end) - { -@@ -427,6 +447,11 @@ static inline void mmu_notifier_change_pte(struct mm_struct *mm, - { - } - -+static inline void mmu_notifier_invalidate_page(struct mm_struct *mm, -+ unsigned long address) -+{ -+} -+ - static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm, - unsigned long start, unsigned long end) - { -diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c -index 2cb6cbea4b3b..3d4f879e687c 100644 ---- a/arch/powerpc/platforms/powernv/npu-dma.c -+++ b/arch/powerpc/platforms/powernv/npu-dma.c -@@ -614,6 +614,15 @@ static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn, - mmio_invalidate(npu_context, 1, address, true); - } - -+static void pnv_npu2_mn_invalidate_page(struct mmu_notifier *mn, -+ struct mm_struct *mm, -+ unsigned long address) -+{ -+ struct npu_context *npu_context = mn_to_npu_context(mn); -+ -+ mmio_invalidate(npu_context, 1, address, true); -+} -+ - static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, unsigned long end) -@@ -631,6 +640,7 @@ static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, - static const struct mmu_notifier_ops nv_nmmu_notifier_ops = { - .release = pnv_npu2_mn_release, - .change_pte = pnv_npu2_mn_change_pte, -+ .invalidate_page = pnv_npu2_mn_invalidate_page, - .invalidate_range = pnv_npu2_mn_invalidate_range, - }; - -diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c -index 7351cdc46cc7..a669b4dd51e7 100644 ---- a/arch/x86/kvm/x86.c -+++ b/arch/x86/kvm/x86.c -@@ -6734,6 +6734,17 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) - } - EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page); - -+void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, -+ unsigned long address) -+{ -+ /* -+ * The physical address of apic access page is stored in the VMCS. -+ * Update it when it becomes invalid. -+ */ -+ if (address == gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT)) -+ kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); -+} -+ - /* - * Returns 1 to let vcpu_run() continue the guest execution loop without - * exiting to the userspace. Otherwise, the value will be returned to the -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c -index e1cde6b80027..6558a3ed57a7 100644 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c -@@ -146,6 +146,36 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node, - } - } - -+/** -+ * amdgpu_mn_invalidate_page - callback to notify about mm change -+ * -+ * @mn: our notifier -+ * @mn: the mm this callback is about -+ * @address: address of invalidate page -+ * -+ * Invalidation of a single page. Blocks for all BOs mapping it -+ * and unmap them by move them into system domain again. -+ */ -+static void amdgpu_mn_invalidate_page(struct mmu_notifier *mn, -+ struct mm_struct *mm, -+ unsigned long address) -+{ -+ struct amdgpu_mn *rmn = container_of(mn, struct amdgpu_mn, mn); -+ struct interval_tree_node *it; -+ -+ mutex_lock(&rmn->lock); -+ -+ it = interval_tree_iter_first(&rmn->objects, address, address); -+ if (it) { -+ struct amdgpu_mn_node *node; -+ -+ node = container_of(it, struct amdgpu_mn_node, it); -+ amdgpu_mn_invalidate_node(node, address, address); -+ } -+ -+ mutex_unlock(&rmn->lock); -+} -+ - /** - * amdgpu_mn_invalidate_range_start - callback to notify about mm change - * -@@ -185,6 +215,7 @@ static void amdgpu_mn_invalidate_range_start(struct mmu_notifier *mn, - - static const struct mmu_notifier_ops amdgpu_mn_ops = { - .release = amdgpu_mn_release, -+ .invalidate_page = amdgpu_mn_invalidate_page, - .invalidate_range_start = amdgpu_mn_invalidate_range_start, - }; - -diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c -index 55e8f5ed8b3c..8c4ec564e495 100644 ---- a/drivers/infiniband/core/umem_odp.c -+++ b/drivers/infiniband/core/umem_odp.c -@@ -166,6 +166,24 @@ static int invalidate_page_trampoline(struct ib_umem *item, u64 start, - return 0; - } - -+static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn, -+ struct mm_struct *mm, -+ unsigned long address) -+{ -+ struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); -+ -+ if (!context->invalidate_range) -+ return; -+ -+ ib_ucontext_notifier_start_account(context); -+ down_read(&context->umem_rwsem); -+ rbt_ib_umem_for_each_in_range(&context->umem_tree, address, -+ address + PAGE_SIZE, -+ invalidate_page_trampoline, NULL); -+ up_read(&context->umem_rwsem); -+ ib_ucontext_notifier_end_account(context); -+} -+ - static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start, - u64 end, void *cookie) - { -@@ -219,6 +237,7 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, - - static const struct mmu_notifier_ops ib_umem_notifiers = { - .release = ib_umem_notifier_release, -+ .invalidate_page = ib_umem_notifier_invalidate_page, - .invalidate_range_start = ib_umem_notifier_invalidate_range_start, - .invalidate_range_end = ib_umem_notifier_invalidate_range_end, - }; -diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c -index e4b56a0dd6d0..ccbf52c8ff6f 100644 ---- a/drivers/infiniband/hw/hfi1/mmu_rb.c -+++ b/drivers/infiniband/hw/hfi1/mmu_rb.c -@@ -67,6 +67,8 @@ struct mmu_rb_handler { - - static unsigned long mmu_node_start(struct mmu_rb_node *); - static unsigned long mmu_node_last(struct mmu_rb_node *); -+static inline void mmu_notifier_page(struct mmu_notifier *, struct mm_struct *, -+ unsigned long); - static inline void mmu_notifier_range_start(struct mmu_notifier *, - struct mm_struct *, - unsigned long, unsigned long); -@@ -80,6 +82,7 @@ static void do_remove(struct mmu_rb_handler *handler, - static void handle_remove(struct work_struct *work); - - static const struct mmu_notifier_ops mn_opts = { -+ .invalidate_page = mmu_notifier_page, - .invalidate_range_start = mmu_notifier_range_start, - }; - -@@ -282,6 +285,12 @@ void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler, - handler->ops->remove(handler->ops_arg, node); - } - -+static inline void mmu_notifier_page(struct mmu_notifier *mn, -+ struct mm_struct *mm, unsigned long addr) -+{ -+ mmu_notifier_mem_invalidate(mn, mm, addr, addr + PAGE_SIZE); -+} -+ - static inline void mmu_notifier_range_start(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, -diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c -index dccf5b76eff2..6629c472eafd 100644 ---- a/drivers/iommu/amd_iommu_v2.c -+++ b/drivers/iommu/amd_iommu_v2.c -@@ -391,6 +391,13 @@ static int mn_clear_flush_young(struct mmu_notifier *mn, - return 0; - } - -+static void mn_invalidate_page(struct mmu_notifier *mn, -+ struct mm_struct *mm, -+ unsigned long address) -+{ -+ __mn_flush_page(mn, address); -+} -+ - static void mn_invalidate_range(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, unsigned long end) -@@ -429,6 +436,7 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm) - static const struct mmu_notifier_ops iommu_mn = { - .release = mn_release, - .clear_flush_young = mn_clear_flush_young, -+ .invalidate_page = mn_invalidate_page, - .invalidate_range = mn_invalidate_range, - }; - -diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c -index f620dccec8ee..f167c0d84ebf 100644 ---- a/drivers/iommu/intel-svm.c -+++ b/drivers/iommu/intel-svm.c -@@ -223,6 +223,14 @@ static void intel_change_pte(struct mmu_notifier *mn, struct mm_struct *mm, - intel_flush_svm_range(svm, address, 1, 1, 0); - } - -+static void intel_invalidate_page(struct mmu_notifier *mn, struct mm_struct *mm, -+ unsigned long address) -+{ -+ struct intel_svm *svm = container_of(mn, struct intel_svm, notifier); -+ -+ intel_flush_svm_range(svm, address, 1, 1, 0); -+} -+ - /* Pages have been freed at this point */ - static void intel_invalidate_range(struct mmu_notifier *mn, - struct mm_struct *mm, -@@ -277,6 +285,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) - static const struct mmu_notifier_ops intel_mmuops = { - .release = intel_mm_release, - .change_pte = intel_change_pte, -+ .invalidate_page = intel_invalidate_page, - .invalidate_range = intel_invalidate_range, - }; - -diff --git a/drivers/misc/mic/scif/scif_dma.c b/drivers/misc/mic/scif/scif_dma.c -index 63d6246d6dff..64d5760d069a 100644 ---- a/drivers/misc/mic/scif/scif_dma.c -+++ b/drivers/misc/mic/scif/scif_dma.c -@@ -200,6 +200,16 @@ static void scif_mmu_notifier_release(struct mmu_notifier *mn, - schedule_work(&scif_info.misc_work); - } - -+static void scif_mmu_notifier_invalidate_page(struct mmu_notifier *mn, -+ struct mm_struct *mm, -+ unsigned long address) -+{ -+ struct scif_mmu_notif *mmn; -+ -+ mmn = container_of(mn, struct scif_mmu_notif, ep_mmu_notifier); -+ scif_rma_destroy_tcw(mmn, address, PAGE_SIZE); -+} -+ - static void scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, -@@ -225,6 +235,7 @@ static void scif_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, - static const struct mmu_notifier_ops scif_mmu_notifier_ops = { - .release = scif_mmu_notifier_release, - .clear_flush_young = NULL, -+ .invalidate_page = scif_mmu_notifier_invalidate_page, - .invalidate_range_start = scif_mmu_notifier_invalidate_range_start, - .invalidate_range_end = scif_mmu_notifier_invalidate_range_end}; - -diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c -index 9918eda0e05f..e936d43895d2 100644 ---- a/drivers/misc/sgi-gru/grutlbpurge.c -+++ b/drivers/misc/sgi-gru/grutlbpurge.c -@@ -247,6 +247,17 @@ static void gru_invalidate_range_end(struct mmu_notifier *mn, - gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx\n", gms, start, end); - } - -+static void gru_invalidate_page(struct mmu_notifier *mn, struct mm_struct *mm, -+ unsigned long address) -+{ -+ struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct, -+ ms_notifier); -+ -+ STAT(mmu_invalidate_page); -+ gru_flush_tlb_range(gms, address, PAGE_SIZE); -+ gru_dbg(grudev, "gms %p, address 0x%lx\n", gms, address); -+} -+ - static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm) - { - struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct, -@@ -258,6 +269,7 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm) - - - static const struct mmu_notifier_ops gru_mmuops = { -+ .invalidate_page = gru_invalidate_page, - .invalidate_range_start = gru_invalidate_range_start, - .invalidate_range_end = gru_invalidate_range_end, - .release = gru_release, -diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c -index 82360594fa8e..f3bf8f4e2d6c 100644 ---- a/drivers/xen/gntdev.c -+++ b/drivers/xen/gntdev.c -@@ -484,6 +484,13 @@ static void mn_invl_range_start(struct mmu_notifier *mn, - mutex_unlock(&priv->lock); - } - -+static void mn_invl_page(struct mmu_notifier *mn, -+ struct mm_struct *mm, -+ unsigned long address) -+{ -+ mn_invl_range_start(mn, mm, address, address + PAGE_SIZE); -+} -+ - static void mn_release(struct mmu_notifier *mn, - struct mm_struct *mm) - { -@@ -515,6 +522,7 @@ static void mn_release(struct mmu_notifier *mn, - - static const struct mmu_notifier_ops gntdev_mmu_ops = { - .release = mn_release, -+ .invalidate_page = mn_invl_page, - .invalidate_range_start = mn_invl_range_start, - }; - -diff --git a/fs/dax.c b/fs/dax.c -index fa8e358c3c6b..57da1d0a6a40 100644 ---- a/fs/dax.c -+++ b/fs/dax.c -@@ -591,10 +591,11 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, - pte_t pte, *ptep = NULL; - pmd_t *pmdp = NULL; - spinlock_t *ptl; -+ bool changed; - - i_mmap_lock_read(mapping); - vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { -- unsigned long address, start, end; -+ unsigned long address; - - cond_resched(); - -@@ -602,13 +603,8 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, - continue; - - address = pgoff_address(index, vma); -- -- /* -- * Note because we provide start/end to follow_pte_pmd it will -- * call mmu_notifier_invalidate_range_start() on our behalf -- * before taking any lock. -- */ -- if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl)) -+ changed = false; -+ if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl)) - continue; - - if (pmdp) { -@@ -625,7 +621,7 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, - pmd = pmd_wrprotect(pmd); - pmd = pmd_mkclean(pmd); - set_pmd_at(vma->vm_mm, address, pmdp, pmd); -- mmu_notifier_invalidate_range(vma->vm_mm, start, end); -+ changed = true; - unlock_pmd: - spin_unlock(ptl); - #endif -@@ -640,12 +636,13 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, - pte = pte_wrprotect(pte); - pte = pte_mkclean(pte); - set_pte_at(vma->vm_mm, address, ptep, pte); -- mmu_notifier_invalidate_range(vma->vm_mm, start, end); -+ changed = true; - unlock_pte: - pte_unmap_unlock(ptep, ptl); - } - -- mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); -+ if (changed) -+ mmu_notifier_invalidate_page(vma->vm_mm, address); - } - i_mmap_unlock_read(mapping); - } -diff --git a/mm/memory.c b/mm/memory.c -index 969c5bf31997..7834310a6b64 100644 ---- a/mm/memory.c -+++ b/mm/memory.c -@@ -4044,8 +4044,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) - #endif /* __PAGETABLE_PMD_FOLDED */ - - static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, -- unsigned long *start, unsigned long *end, -- pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) -+ pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) - { - pgd_t *pgd; - p4d_t *p4d; -@@ -4072,29 +4071,17 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, - if (!pmdpp) - goto out; - -- if (start && end) { -- *start = address & PMD_MASK; -- *end = *start + PMD_SIZE; -- mmu_notifier_invalidate_range_start(mm, *start, *end); -- } - *ptlp = pmd_lock(mm, pmd); - if (pmd_huge(*pmd)) { - *pmdpp = pmd; - return 0; - } - spin_unlock(*ptlp); -- if (start && end) -- mmu_notifier_invalidate_range_end(mm, *start, *end); - } - - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - goto out; - -- if (start && end) { -- *start = address & PAGE_MASK; -- *end = *start + PAGE_SIZE; -- mmu_notifier_invalidate_range_start(mm, *start, *end); -- } - ptep = pte_offset_map_lock(mm, pmd, address, ptlp); - if (!pte_present(*ptep)) - goto unlock; -@@ -4102,8 +4089,6 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, - return 0; - unlock: - pte_unmap_unlock(ptep, *ptlp); -- if (start && end) -- mmu_notifier_invalidate_range_end(mm, *start, *end); - out: - return -EINVAL; - } -@@ -4115,21 +4100,20 @@ static inline int follow_pte(struct mm_struct *mm, unsigned long address, - - /* (void) is needed to make gcc happy */ - (void) __cond_lock(*ptlp, -- !(res = __follow_pte_pmd(mm, address, NULL, NULL, -- ptepp, NULL, ptlp))); -+ !(res = __follow_pte_pmd(mm, address, ptepp, NULL, -+ ptlp))); - return res; - } - - int follow_pte_pmd(struct mm_struct *mm, unsigned long address, -- unsigned long *start, unsigned long *end, - pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) - { - int res; - - /* (void) is needed to make gcc happy */ - (void) __cond_lock(*ptlp, -- !(res = __follow_pte_pmd(mm, address, start, end, -- ptepp, pmdpp, ptlp))); -+ !(res = __follow_pte_pmd(mm, address, ptepp, pmdpp, -+ ptlp))); - return res; - } - EXPORT_SYMBOL(follow_pte_pmd); -diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c -index 314285284e6e..54ca54562928 100644 ---- a/mm/mmu_notifier.c -+++ b/mm/mmu_notifier.c -@@ -174,6 +174,20 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, - srcu_read_unlock(&srcu, id); - } - -+void __mmu_notifier_invalidate_page(struct mm_struct *mm, -+ unsigned long address) -+{ -+ struct mmu_notifier *mn; -+ int id; -+ -+ id = srcu_read_lock(&srcu); -+ hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { -+ if (mn->ops->invalidate_page) -+ mn->ops->invalidate_page(mn, mm, address); -+ } -+ srcu_read_unlock(&srcu, id); -+} -+ - void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, - unsigned long start, unsigned long end) - { -diff --git a/mm/rmap.c b/mm/rmap.c -index c570f82e6827..c8993c63eb25 100644 ---- a/mm/rmap.c -+++ b/mm/rmap.c -@@ -887,21 +887,11 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, - .address = address, - .flags = PVMW_SYNC, - }; -- unsigned long start = address, end; - int *cleaned = arg; - -- /* -- * We have to assume the worse case ie pmd for invalidation. Note that -- * the page can not be free from this function. -- */ -- end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); -- mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); -- - while (page_vma_mapped_walk(&pvmw)) { -- unsigned long cstart, cend; - int ret = 0; -- -- cstart = address = pvmw.address; -+ address = pvmw.address; - if (pvmw.pte) { - pte_t entry; - pte_t *pte = pvmw.pte; -@@ -914,7 +904,6 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, - entry = pte_wrprotect(entry); - entry = pte_mkclean(entry); - set_pte_at(vma->vm_mm, address, pte, entry); -- cend = cstart + PAGE_SIZE; - ret = 1; - } else { - #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE -@@ -929,8 +918,6 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, - entry = pmd_wrprotect(entry); - entry = pmd_mkclean(entry); - set_pmd_at(vma->vm_mm, address, pmd, entry); -- cstart &= PMD_MASK; -- cend = cstart + PMD_SIZE; - ret = 1; - #else - /* unexpected pmd-mapped page? */ -@@ -939,13 +926,11 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, - } - - if (ret) { -- mmu_notifier_invalidate_range(vma->vm_mm, cstart, cend); -+ mmu_notifier_invalidate_page(vma->vm_mm, address); - (*cleaned)++; - } - } - -- mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); -- - return true; - } - -@@ -1339,7 +1324,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, - pte_t pteval; - struct page *subpage; - bool ret = true; -- unsigned long start = address, end; - enum ttu_flags flags = (enum ttu_flags)arg; - - /* munlock has nothing to gain from examining un-locked vmas */ -@@ -1351,14 +1335,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, - flags & TTU_MIGRATION, page); - } - -- /* -- * We have to assume the worse case ie pmd for invalidation. Note that -- * the page can not be free in this function as call of try_to_unmap() -- * must hold a reference on the page. -- */ -- end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); -- mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); -- - while (page_vma_mapped_walk(&pvmw)) { - /* - * If the page is mlock()d, we cannot swap it out. -@@ -1469,7 +1445,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, - if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) { - WARN_ON_ONCE(1); - ret = false; -- /* We have to invalidate as we cleared the pte */ - page_vma_mapped_walk_done(&pvmw); - break; - } -@@ -1515,12 +1490,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, - discard: - page_remove_rmap(subpage, PageHuge(page)); - put_page(page); -- mmu_notifier_invalidate_range(mm, address, -- address + PAGE_SIZE); -+ mmu_notifier_invalidate_page(mm, address); - } -- -- mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); -- - return ret; - } - -diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c -index bfa9c4d34102..1d048ef969a8 100644 ---- a/virt/kvm/kvm_main.c -+++ b/virt/kvm/kvm_main.c -@@ -322,6 +322,47 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) - return container_of(mn, struct kvm, mmu_notifier); - } - -+static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, -+ struct mm_struct *mm, -+ unsigned long address) -+{ -+ struct kvm *kvm = mmu_notifier_to_kvm(mn); -+ int need_tlb_flush, idx; -+ -+ /* -+ * When ->invalidate_page runs, the linux pte has been zapped -+ * already but the page is still allocated until -+ * ->invalidate_page returns. So if we increase the sequence -+ * here the kvm page fault will notice if the spte can't be -+ * established because the page is going to be freed. If -+ * instead the kvm page fault establishes the spte before -+ * ->invalidate_page runs, kvm_unmap_hva will release it -+ * before returning. -+ * -+ * The sequence increase only need to be seen at spin_unlock -+ * time, and not at spin_lock time. -+ * -+ * Increasing the sequence after the spin_unlock would be -+ * unsafe because the kvm page fault could then establish the -+ * pte after kvm_unmap_hva returned, without noticing the page -+ * is going to be freed. -+ */ -+ idx = srcu_read_lock(&kvm->srcu); -+ spin_lock(&kvm->mmu_lock); -+ -+ kvm->mmu_notifier_seq++; -+ need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty; -+ /* we've to flush the tlb before the pages can be freed */ -+ if (need_tlb_flush) -+ kvm_flush_remote_tlbs(kvm); -+ -+ spin_unlock(&kvm->mmu_lock); -+ -+ kvm_arch_mmu_notifier_invalidate_page(kvm, address); -+ -+ srcu_read_unlock(&kvm->srcu, idx); -+} -+ - static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long address, -@@ -469,6 +510,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn, - } - - static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { -+ .invalidate_page = kvm_mmu_notifier_invalidate_page, - .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, - .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, - .clear_flush_young = kvm_mmu_notifier_clear_flush_young, --- -2.14.2 - -- 2.39.5