From dfd4ec1fd8d1d09930e9cf9ed7ebd07a66813337 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fabian=20Gr=C3=BCnbichler?= <f.gruenbichler@proxmox.com>
Date: Wed, 29 Nov 2017 09:45:44 +0100
Subject: [PATCH 7/7] Revert "Merge branch 'mmu_notifier_fixes'"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit ea25c43179462e342d4a0e66c3f6a5f53514da05, reversing
changes made to c227390c91a355300f47f9bef0aefbdfaaca1500.

This series causes blue screens in Windows VMs running under heavy
memory/swap pressure.

Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/arm/include/asm/kvm_host.h | 6 +++++
arch/arm64/include/asm/kvm_host.h | 6 +++++
arch/mips/include/asm/kvm_host.h | 5 ++++
arch/powerpc/include/asm/kvm_host.h | 5 ++++
arch/x86/include/asm/kvm_host.h | 2 ++
include/linux/mm.h | 1 -
include/linux/mmu_notifier.h | 25 +++++++++++++++++++
arch/powerpc/platforms/powernv/npu-dma.c | 10 ++++++++
arch/x86/kvm/x86.c | 11 +++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 31 +++++++++++++++++++++++
drivers/infiniband/core/umem_odp.c | 19 +++++++++++++++
drivers/infiniband/hw/hfi1/mmu_rb.c | 9 +++++++
drivers/iommu/amd_iommu_v2.c | 8 ++++++
drivers/iommu/intel-svm.c | 9 +++++++
drivers/misc/mic/scif/scif_dma.c | 11 +++++++++
drivers/misc/sgi-gru/grutlbpurge.c | 12 +++++++++
drivers/xen/gntdev.c | 8 ++++++
fs/dax.c | 19 ++++++---------
mm/memory.c | 26 ++++----------------
mm/mmu_notifier.c | 14 +++++++++++
mm/rmap.c | 35 +++-----------------------
virt/kvm/kvm_main.c | 42 ++++++++++++++++++++++++++++++++
22 files changed, 249 insertions(+), 65 deletions(-)

diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 4a879f6ff13b..127e2dd2e21c 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -225,6 +225,12 @@ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices);
int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);

+/* We do not have shadow page tables, hence the empty hooks */
+static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
+ unsigned long address)
+{
+}
+
struct kvm_vcpu *kvm_arm_get_running_vcpu(void);
struct kvm_vcpu __percpu **kvm_get_running_vcpus(void);
void kvm_arm_halt_guest(struct kvm *kvm);
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index e923b58606e2..d68630007b14 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -326,6 +326,12 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);

+/* We do not have shadow page tables, hence the empty hooks */
+static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
+ unsigned long address)
+{
+}
+
struct kvm_vcpu *kvm_arm_get_running_vcpu(void);
struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void);
void kvm_arm_halt_guest(struct kvm *kvm);
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index a9af1d2dcd69..2998479fd4e8 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -938,6 +938,11 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);

+static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
+ unsigned long address)
+{
+}
+
/* Emulation */
int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out);
enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause);
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index e372ed871c51..8b3f1238d07f 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -67,6 +67,11 @@ extern int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);

+static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
+ unsigned long address)
+{
+}
+
#define HPTEG_CACHE_NUM (1 << 15)
#define HPTEG_HASH_BITS_PTE 13
#define HPTEG_HASH_BITS_PTE_LONG 12
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 92c9032502d8..f4d120a3e22e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1375,6 +1375,8 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
+void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
+ unsigned long address);

void kvm_define_shared_msr(unsigned index, u32 msr);
int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 07630442bbf2..701de4b55ece 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1260,7 +1260,6 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows);
int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
- unsigned long *start, unsigned long *end,
pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp);
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
unsigned long *pfn);
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 6866e8126982..947f21b451d2 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -94,6 +94,17 @@ struct mmu_notifier_ops {
unsigned long address,
pte_t pte);

+ /*
+ * Before this is invoked any secondary MMU is still ok to
+ * read/write to the page previously pointed to by the Linux
+ * pte because the page hasn't been freed yet and it won't be
+ * freed until this returns. If required set_page_dirty has to
+ * be called internally to this method.
+ */
+ void (*invalidate_page)(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address);
+
/*
* invalidate_range_start() and invalidate_range_end() must be
* paired and are called only when the mmap_sem and/or the
@@ -209,6 +220,8 @@ extern int __mmu_notifier_test_young(struct mm_struct *mm,
unsigned long address);
extern void __mmu_notifier_change_pte(struct mm_struct *mm,
unsigned long address, pte_t pte);
+extern void __mmu_notifier_invalidate_page(struct mm_struct *mm,
+ unsigned long address);
extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
unsigned long start, unsigned long end);
extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
@@ -255,6 +268,13 @@ static inline void mmu_notifier_change_pte(struct mm_struct *mm,
__mmu_notifier_change_pte(mm, address, pte);
}

+static inline void mmu_notifier_invalidate_page(struct mm_struct *mm,
+ unsigned long address)
+{
+ if (mm_has_notifiers(mm))
+ __mmu_notifier_invalidate_page(mm, address);
+}
+
static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm,
unsigned long start, unsigned long end)
{
@@ -427,6 +447,11 @@ static inline void mmu_notifier_change_pte(struct mm_struct *mm,
{
}

+static inline void mmu_notifier_invalidate_page(struct mm_struct *mm,
+ unsigned long address)
+{
+}
+
static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm,
unsigned long start, unsigned long end)
{
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index 2cb6cbea4b3b..3d4f879e687c 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -614,6 +614,15 @@ static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
mmio_invalidate(npu_context, 1, address, true);
}

+static void pnv_npu2_mn_invalidate_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ struct npu_context *npu_context = mn_to_npu_context(mn);
+
+ mmio_invalidate(npu_context, 1, address, true);
+}
+
static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start, unsigned long end)
@@ -631,6 +640,7 @@ static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
.release = pnv_npu2_mn_release,
.change_pte = pnv_npu2_mn_change_pte,
+ .invalidate_page = pnv_npu2_mn_invalidate_page,
.invalidate_range = pnv_npu2_mn_invalidate_range,
};

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7351cdc46cc7..a669b4dd51e7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6734,6 +6734,17 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);

+void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
+ unsigned long address)
+{
+ /*
+ * The physical address of apic access page is stored in the VMCS.
+ * Update it when it becomes invalid.
+ */
+ if (address == gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT))
+ kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
+}
+
/*
* Returns 1 to let vcpu_run() continue the guest execution loop without
* exiting to the userspace. Otherwise, the value will be returned to the
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
index e1cde6b80027..6558a3ed57a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
@@ -146,6 +146,36 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node,
}
}

+/**
+ * amdgpu_mn_invalidate_page - callback to notify about mm change
+ *
+ * @mn: our notifier
+ * @mn: the mm this callback is about
+ * @address: address of invalidate page
+ *
+ * Invalidation of a single page. Blocks for all BOs mapping it
+ * and unmap them by move them into system domain again.
+ */
+static void amdgpu_mn_invalidate_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ struct amdgpu_mn *rmn = container_of(mn, struct amdgpu_mn, mn);
+ struct interval_tree_node *it;
+
+ mutex_lock(&rmn->lock);
+
+ it = interval_tree_iter_first(&rmn->objects, address, address);
+ if (it) {
+ struct amdgpu_mn_node *node;
+
+ node = container_of(it, struct amdgpu_mn_node, it);
+ amdgpu_mn_invalidate_node(node, address, address);
+ }
+
+ mutex_unlock(&rmn->lock);
+}
+
/**
* amdgpu_mn_invalidate_range_start - callback to notify about mm change
*
@@ -185,6 +215,7 @@ static void amdgpu_mn_invalidate_range_start(struct mmu_notifier *mn,

static const struct mmu_notifier_ops amdgpu_mn_ops = {
.release = amdgpu_mn_release,
+ .invalidate_page = amdgpu_mn_invalidate_page,
.invalidate_range_start = amdgpu_mn_invalidate_range_start,
};

diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index 55e8f5ed8b3c..8c4ec564e495 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -166,6 +166,24 @@ static int invalidate_page_trampoline(struct ib_umem *item, u64 start,
return 0;
}

+static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
+
+ if (!context->invalidate_range)
+ return;
+
+ ib_ucontext_notifier_start_account(context);
+ down_read(&context->umem_rwsem);
+ rbt_ib_umem_for_each_in_range(&context->umem_tree, address,
+ address + PAGE_SIZE,
+ invalidate_page_trampoline, NULL);
+ up_read(&context->umem_rwsem);
+ ib_ucontext_notifier_end_account(context);
+}
+
static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start,
u64 end, void *cookie)
{
@@ -219,6 +237,7 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,

static const struct mmu_notifier_ops ib_umem_notifiers = {
.release = ib_umem_notifier_release,
+ .invalidate_page = ib_umem_notifier_invalidate_page,
.invalidate_range_start = ib_umem_notifier_invalidate_range_start,
.invalidate_range_end = ib_umem_notifier_invalidate_range_end,
};
diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
index e4b56a0dd6d0..ccbf52c8ff6f 100644
--- a/drivers/infiniband/hw/hfi1/mmu_rb.c
+++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
@@ -67,6 +67,8 @@ struct mmu_rb_handler {

static unsigned long mmu_node_start(struct mmu_rb_node *);
static unsigned long mmu_node_last(struct mmu_rb_node *);
+static inline void mmu_notifier_page(struct mmu_notifier *, struct mm_struct *,
+ unsigned long);
static inline void mmu_notifier_range_start(struct mmu_notifier *,
struct mm_struct *,
unsigned long, unsigned long);
@@ -80,6 +82,7 @@ static void do_remove(struct mmu_rb_handler *handler,
static void handle_remove(struct work_struct *work);

static const struct mmu_notifier_ops mn_opts = {
+ .invalidate_page = mmu_notifier_page,
.invalidate_range_start = mmu_notifier_range_start,
};

@@ -282,6 +285,12 @@ void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler,
handler->ops->remove(handler->ops_arg, node);
}

+static inline void mmu_notifier_page(struct mmu_notifier *mn,
+ struct mm_struct *mm, unsigned long addr)
+{
+ mmu_notifier_mem_invalidate(mn, mm, addr, addr + PAGE_SIZE);
+}
+
static inline void mmu_notifier_range_start(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start,
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
index dccf5b76eff2..6629c472eafd 100644
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -391,6 +391,13 @@ static int mn_clear_flush_young(struct mmu_notifier *mn,
return 0;
}

+static void mn_invalidate_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ __mn_flush_page(mn, address);
+}
+
static void mn_invalidate_range(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start, unsigned long end)
@@ -429,6 +436,7 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
static const struct mmu_notifier_ops iommu_mn = {
.release = mn_release,
.clear_flush_young = mn_clear_flush_young,
+ .invalidate_page = mn_invalidate_page,
.invalidate_range = mn_invalidate_range,
};

diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index f620dccec8ee..f167c0d84ebf 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -223,6 +223,14 @@ static void intel_change_pte(struct mmu_notifier *mn, struct mm_struct *mm,
intel_flush_svm_range(svm, address, 1, 1, 0);
}

+static void intel_invalidate_page(struct mmu_notifier *mn, struct mm_struct *mm,
+ unsigned long address)
+{
+ struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
+
+ intel_flush_svm_range(svm, address, 1, 1, 0);
+}
+
/* Pages have been freed at this point */
static void intel_invalidate_range(struct mmu_notifier *mn,
struct mm_struct *mm,
@@ -277,6 +285,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
static const struct mmu_notifier_ops intel_mmuops = {
.release = intel_mm_release,
.change_pte = intel_change_pte,
+ .invalidate_page = intel_invalidate_page,
.invalidate_range = intel_invalidate_range,
};

diff --git a/drivers/misc/mic/scif/scif_dma.c b/drivers/misc/mic/scif/scif_dma.c
index 63d6246d6dff..64d5760d069a 100644
--- a/drivers/misc/mic/scif/scif_dma.c
+++ b/drivers/misc/mic/scif/scif_dma.c
@@ -200,6 +200,16 @@ static void scif_mmu_notifier_release(struct mmu_notifier *mn,
schedule_work(&scif_info.misc_work);
}

+static void scif_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ struct scif_mmu_notif *mmn;
+
+ mmn = container_of(mn, struct scif_mmu_notif, ep_mmu_notifier);
+ scif_rma_destroy_tcw(mmn, address, PAGE_SIZE);
+}
+
static void scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start,
@@ -225,6 +235,7 @@ static void scif_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
static const struct mmu_notifier_ops scif_mmu_notifier_ops = {
.release = scif_mmu_notifier_release,
.clear_flush_young = NULL,
+ .invalidate_page = scif_mmu_notifier_invalidate_page,
.invalidate_range_start = scif_mmu_notifier_invalidate_range_start,
.invalidate_range_end = scif_mmu_notifier_invalidate_range_end};

diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
index 9918eda0e05f..e936d43895d2 100644
--- a/drivers/misc/sgi-gru/grutlbpurge.c
+++ b/drivers/misc/sgi-gru/grutlbpurge.c
@@ -247,6 +247,17 @@ static void gru_invalidate_range_end(struct mmu_notifier *mn,
gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx\n", gms, start, end);
}

+static void gru_invalidate_page(struct mmu_notifier *mn, struct mm_struct *mm,
+ unsigned long address)
+{
+ struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
+ ms_notifier);
+
+ STAT(mmu_invalidate_page);
+ gru_flush_tlb_range(gms, address, PAGE_SIZE);
+ gru_dbg(grudev, "gms %p, address 0x%lx\n", gms, address);
+}
+
static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
@@ -258,6 +269,7 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)


static const struct mmu_notifier_ops gru_mmuops = {
+ .invalidate_page = gru_invalidate_page,
.invalidate_range_start = gru_invalidate_range_start,
.invalidate_range_end = gru_invalidate_range_end,
.release = gru_release,
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 82360594fa8e..f3bf8f4e2d6c 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -484,6 +484,13 @@ static void mn_invl_range_start(struct mmu_notifier *mn,
mutex_unlock(&priv->lock);
}

+static void mn_invl_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ mn_invl_range_start(mn, mm, address, address + PAGE_SIZE);
+}
+
static void mn_release(struct mmu_notifier *mn,
struct mm_struct *mm)
{
@@ -515,6 +522,7 @@ static void mn_release(struct mmu_notifier *mn,

static const struct mmu_notifier_ops gntdev_mmu_ops = {
.release = mn_release,
+ .invalidate_page = mn_invl_page,
.invalidate_range_start = mn_invl_range_start,
};

diff --git a/fs/dax.c b/fs/dax.c
index fa8e358c3c6b..57da1d0a6a40 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -591,10 +591,11 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
pte_t pte, *ptep = NULL;
pmd_t *pmdp = NULL;
spinlock_t *ptl;
+ bool changed;

i_mmap_lock_read(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
- unsigned long address, start, end;
+ unsigned long address;

cond_resched();

@@ -602,13 +603,8 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
continue;

address = pgoff_address(index, vma);
-
- /*
- * Note because we provide start/end to follow_pte_pmd it will
- * call mmu_notifier_invalidate_range_start() on our behalf
- * before taking any lock.
- */
- if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl))
+ changed = false;
+ if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl))
continue;

if (pmdp) {
@@ -625,7 +621,7 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
pmd = pmd_wrprotect(pmd);
pmd = pmd_mkclean(pmd);
set_pmd_at(vma->vm_mm, address, pmdp, pmd);
- mmu_notifier_invalidate_range(vma->vm_mm, start, end);
+ changed = true;
unlock_pmd:
spin_unlock(ptl);
#endif
@@ -640,12 +636,13 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
pte = pte_wrprotect(pte);
pte = pte_mkclean(pte);
set_pte_at(vma->vm_mm, address, ptep, pte);
- mmu_notifier_invalidate_range(vma->vm_mm, start, end);
+ changed = true;
unlock_pte:
pte_unmap_unlock(ptep, ptl);
}

- mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
+ if (changed)
+ mmu_notifier_invalidate_page(vma->vm_mm, address);
}
i_mmap_unlock_read(mapping);
}
diff --git a/mm/memory.c b/mm/memory.c
index 969c5bf31997..7834310a6b64 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4044,8 +4044,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
#endif /* __PAGETABLE_PMD_FOLDED */

static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
- unsigned long *start, unsigned long *end,
- pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
+ pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -4072,29 +4071,17 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
if (!pmdpp)
goto out;

- if (start && end) {
- *start = address & PMD_MASK;
- *end = *start + PMD_SIZE;
- mmu_notifier_invalidate_range_start(mm, *start, *end);
- }
*ptlp = pmd_lock(mm, pmd);
if (pmd_huge(*pmd)) {
*pmdpp = pmd;
return 0;
}
spin_unlock(*ptlp);
- if (start && end)
- mmu_notifier_invalidate_range_end(mm, *start, *end);
}

if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;

- if (start && end) {
- *start = address & PAGE_MASK;
- *end = *start + PAGE_SIZE;
- mmu_notifier_invalidate_range_start(mm, *start, *end);
- }
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
if (!pte_present(*ptep))
goto unlock;
@@ -4102,8 +4089,6 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
return 0;
unlock:
pte_unmap_unlock(ptep, *ptlp);
- if (start && end)
- mmu_notifier_invalidate_range_end(mm, *start, *end);
out:
return -EINVAL;
}
@@ -4115,21 +4100,20 @@ static inline int follow_pte(struct mm_struct *mm, unsigned long address,

/* (void) is needed to make gcc happy */
(void) __cond_lock(*ptlp,
- !(res = __follow_pte_pmd(mm, address, NULL, NULL,
- ptepp, NULL, ptlp)));
+ !(res = __follow_pte_pmd(mm, address, ptepp, NULL,
+ ptlp)));
return res;
}

int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
- unsigned long *start, unsigned long *end,
pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
{
int res;

/* (void) is needed to make gcc happy */
(void) __cond_lock(*ptlp,
- !(res = __follow_pte_pmd(mm, address, start, end,
- ptepp, pmdpp, ptlp)));
+ !(res = __follow_pte_pmd(mm, address, ptepp, pmdpp,
+ ptlp)));
return res;
}
EXPORT_SYMBOL(follow_pte_pmd);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 314285284e6e..54ca54562928 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -174,6 +174,20 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
srcu_read_unlock(&srcu, id);
}

+void __mmu_notifier_invalidate_page(struct mm_struct *mm,
+ unsigned long address)
+{
+ struct mmu_notifier *mn;
+ int id;
+
+ id = srcu_read_lock(&srcu);
+ hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+ if (mn->ops->invalidate_page)
+ mn->ops->invalidate_page(mn, mm, address);
+ }
+ srcu_read_unlock(&srcu, id);
+}
+
void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
unsigned long start, unsigned long end)
{
diff --git a/mm/rmap.c b/mm/rmap.c
index c570f82e6827..c8993c63eb25 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -887,21 +887,11 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
.address = address,
.flags = PVMW_SYNC,
};
- unsigned long start = address, end;
int *cleaned = arg;

- /*
- * We have to assume the worse case ie pmd for invalidation. Note that
- * the page can not be free from this function.
- */
- end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page)));
- mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
-
while (page_vma_mapped_walk(&pvmw)) {
- unsigned long cstart, cend;
int ret = 0;
-
- cstart = address = pvmw.address;
+ address = pvmw.address;
if (pvmw.pte) {
pte_t entry;
pte_t *pte = pvmw.pte;
@@ -914,7 +904,6 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
entry = pte_wrprotect(entry);
entry = pte_mkclean(entry);
set_pte_at(vma->vm_mm, address, pte, entry);
- cend = cstart + PAGE_SIZE;
ret = 1;
} else {
#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
@@ -929,8 +918,6 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
entry = pmd_wrprotect(entry);
entry = pmd_mkclean(entry);
set_pmd_at(vma->vm_mm, address, pmd, entry);
- cstart &= PMD_MASK;
- cend = cstart + PMD_SIZE;
ret = 1;
#else
/* unexpected pmd-mapped page? */
@@ -939,13 +926,11 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
}

if (ret) {
- mmu_notifier_invalidate_range(vma->vm_mm, cstart, cend);
+ mmu_notifier_invalidate_page(vma->vm_mm, address);
(*cleaned)++;
}
}

- mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
-
return true;
}

@@ -1339,7 +1324,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
pte_t pteval;
struct page *subpage;
bool ret = true;
- unsigned long start = address, end;
enum ttu_flags flags = (enum ttu_flags)arg;

/* munlock has nothing to gain from examining un-locked vmas */
@@ -1351,14 +1335,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
flags & TTU_MIGRATION, page);
}

- /*
- * We have to assume the worse case ie pmd for invalidation. Note that
- * the page can not be free in this function as call of try_to_unmap()
- * must hold a reference on the page.
- */
- end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page)));
- mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
-
while (page_vma_mapped_walk(&pvmw)) {
/*
* If the page is mlock()d, we cannot swap it out.
@@ -1469,7 +1445,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) {
WARN_ON_ONCE(1);
ret = false;
- /* We have to invalidate as we cleared the pte */
page_vma_mapped_walk_done(&pvmw);
break;
}
@@ -1515,12 +1490,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
discard:
page_remove_rmap(subpage, PageHuge(page));
put_page(page);
- mmu_notifier_invalidate_range(mm, address,
- address + PAGE_SIZE);
+ mmu_notifier_invalidate_page(mm, address);
}
-
- mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
-
return ret;
}

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index bfa9c4d34102..1d048ef969a8 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -322,6 +322,47 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
return container_of(mn, struct kvm, mmu_notifier);
}

+static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ int need_tlb_flush, idx;
+
+ /*
+ * When ->invalidate_page runs, the linux pte has been zapped
+ * already but the page is still allocated until
+ * ->invalidate_page returns. So if we increase the sequence
+ * here the kvm page fault will notice if the spte can't be
+ * established because the page is going to be freed. If
+ * instead the kvm page fault establishes the spte before
+ * ->invalidate_page runs, kvm_unmap_hva will release it
+ * before returning.
+ *
+ * The sequence increase only need to be seen at spin_unlock
+ * time, and not at spin_lock time.
+ *
+ * Increasing the sequence after the spin_unlock would be
+ * unsafe because the kvm page fault could then establish the
+ * pte after kvm_unmap_hva returned, without noticing the page
+ * is going to be freed.
+ */
+ idx = srcu_read_lock(&kvm->srcu);
+ spin_lock(&kvm->mmu_lock);
+
+ kvm->mmu_notifier_seq++;
+ need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
+ /* we've to flush the tlb before the pages can be freed */
+ if (need_tlb_flush)
+ kvm_flush_remote_tlbs(kvm);
+
+ spin_unlock(&kvm->mmu_lock);
+
+ kvm_arch_mmu_notifier_invalidate_page(kvm, address);
+
+ srcu_read_unlock(&kvm->srcu, idx);
+}
+
static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long address,
@@ -469,6 +510,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
}

static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+ .invalidate_page = kvm_mmu_notifier_invalidate_page,
.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
.clear_flush_young = kvm_mmu_notifier_clear_flush_young,
--
2.14.2
