// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/mm/mmu_notifier.c
 *
 * Copyright (C) 2008 Qumranet, Inc.
 * Copyright (C) 2008 SGI
 *             Christoph Lameter <cl@linux.com>
 */

#include <linux/rculist.h>
#include <linux/mmu_notifier.h>
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/interval_tree.h>
#include <linux/srcu.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>

/* global SRCU for all MMs */
DEFINE_STATIC_SRCU(srcu);

#ifdef CONFIG_LOCKDEP
struct lockdep_map __mmu_notifier_invalidate_range_start_map = {
	.name = "mmu_notifier_invalidate_range_start"
};
#endif

/*
 * The mmu_notifier_subscriptions structure is allocated and installed in
 * mm->notifier_subscriptions inside the mm_take_all_locks() protected
 * critical section and it's released only when mm_count reaches zero
 * in mmdrop().
 */
struct mmu_notifier_subscriptions {
	/* all mmu notifiers registered in this mm are queued in this list */
	struct hlist_head list;
	bool has_itree;
	/* to serialize the list modifications and hlist_unhashed */
	spinlock_t lock;
	unsigned long invalidate_seq;
	unsigned long active_invalidate_ranges;
	struct rb_root_cached itree;
	wait_queue_head_t wq;
	struct hlist_head deferred_list;
};

/*
 * This is a collision-retry read-side/write-side 'lock', a lot like a
 * seqcount, however this allows multiple write-sides to hold it at
 * once. Conceptually the write side is protecting the values of the PTEs in
 * this mm, such that PTEs cannot be read into SPTEs (shadow PTEs) while any
 * writer exists.
 *
 * Note that the core mm creates nested invalidate_range_start()/end() regions
 * within the same thread, and runs invalidate_range_start()/end() in parallel
 * on multiple CPUs. This is designed to not reduce concurrency or block
 * progress on the mm side.
 *
 * As a secondary function, holding the full write side also serves to prevent
 * writers for the itree; this is an optimization to avoid extra locking
 * during invalidate_range_start/end notifiers.
 *
 * The write side has two states, fully excluded:
 *  - mm->active_invalidate_ranges != 0
 *  - subscriptions->invalidate_seq & 1 == True (odd)
 *  - some range on the mm_struct is being invalidated
 *  - the itree is not allowed to change
 *
 * And partially excluded:
 *  - mm->active_invalidate_ranges != 0
 *  - subscriptions->invalidate_seq & 1 == False (even)
 *  - some range on the mm_struct is being invalidated
 *  - the itree is allowed to change
 *
 * Operations on notifier_subscriptions->invalidate_seq (under spinlock):
 *    seq |= 1  # Begin writing
 *    seq++     # Release the writing state
 *    seq & 1   # True if a writer exists
 *
 * The latter state avoids some expensive work on inv_end in the common case
 * of no mni monitoring the VA.
 */
static bool
mn_itree_is_invalidating(struct mmu_notifier_subscriptions *subscriptions)
{
	lockdep_assert_held(&subscriptions->lock);
	return subscriptions->invalidate_seq & 1;
}

static struct mmu_interval_notifier *
mn_itree_inv_start_range(struct mmu_notifier_subscriptions *subscriptions,
			 const struct mmu_notifier_range *range,
			 unsigned long *seq)
{
	struct interval_tree_node *node;
	struct mmu_interval_notifier *res = NULL;

	spin_lock(&subscriptions->lock);
	subscriptions->active_invalidate_ranges++;
	node = interval_tree_iter_first(&subscriptions->itree, range->start,
					range->end - 1);
	if (node) {
		subscriptions->invalidate_seq |= 1;
		res = container_of(node, struct mmu_interval_notifier,
				   interval_tree);
	}

	*seq = subscriptions->invalidate_seq;
	spin_unlock(&subscriptions->lock);
	return res;
}

static struct mmu_interval_notifier *
mn_itree_inv_next(struct mmu_interval_notifier *mni,
		  const struct mmu_notifier_range *range)
{
	struct interval_tree_node *node;

	node = interval_tree_iter_next(&mni->interval_tree, range->start,
				       range->end - 1);
	if (!node)
		return NULL;
	return container_of(node, struct mmu_interval_notifier, interval_tree);
}

static void mn_itree_inv_end(struct mmu_notifier_subscriptions *subscriptions)
{
	struct mmu_interval_notifier *mni;
	struct hlist_node *next;

	spin_lock(&subscriptions->lock);
	if (--subscriptions->active_invalidate_ranges ||
	    !mn_itree_is_invalidating(subscriptions)) {
		spin_unlock(&subscriptions->lock);
		return;
	}

	/* Make invalidate_seq even */
	subscriptions->invalidate_seq++;

	/*
	 * The inv_end incorporates a deferred mechanism like rtnl_unlock().
	 * Adds and removes are queued until the final inv_end happens, and
	 * then they are processed. This arrangement for tree updates is used
	 * to avoid using a blocking lock during invalidate_range_start.
	 */
	hlist_for_each_entry_safe(mni, next, &subscriptions->deferred_list,
				  deferred_item) {
		if (RB_EMPTY_NODE(&mni->interval_tree.rb))
			interval_tree_insert(&mni->interval_tree,
					     &subscriptions->itree);
		else
			interval_tree_remove(&mni->interval_tree,
					     &subscriptions->itree);
		hlist_del(&mni->deferred_item);
	}
	spin_unlock(&subscriptions->lock);

	wake_up_all(&subscriptions->wq);
}

/**
 * mmu_interval_read_begin - Begin a read side critical section against a VA
 *                           range
 * @mni: The interval notifier for the range to use
 *
 * mmu_interval_read_begin()/mmu_interval_read_retry() implement a
 * collision-retry scheme similar to seqcount for the VA range under mni. If
 * the mm invokes invalidation during the critical section then
 * mmu_interval_read_retry() will return true.
 *
 * This is useful to obtain shadow PTEs where teardown or setup of the SPTEs
 * requires a blocking context. The critical region formed by this can sleep,
 * and the required 'user_lock' can also be a sleeping lock.
 *
 * The caller is required to provide a 'user_lock' to serialize both teardown
 * and setup.
 *
 * The return value should be passed to mmu_interval_read_retry().
 */
unsigned long mmu_interval_read_begin(struct mmu_interval_notifier *mni)
{
	struct mmu_notifier_subscriptions *subscriptions =
		mni->mm->notifier_subscriptions;
	unsigned long seq;
	bool is_invalidating;

	/*
	 * If the mni has a different seq value under the user_lock than we
	 * started with then it has collided.
	 *
	 * If the mni currently has the same seq value as the subscriptions
	 * seq, then it is currently between invalidate_start/end and is
	 * colliding.
	 *
	 * The locking looks broadly like this:
	 *  mn_tree_invalidate_start():          mmu_interval_read_begin():
	 *                                         spin_lock
	 *                                          seq = READ_ONCE(mni->invalidate_seq);
	 *                                          seq == subs->invalidate_seq
	 *                                         spin_unlock
	 *    spin_lock
	 *     seq = ++subscriptions->invalidate_seq
	 *    spin_unlock
	 *   op->invalidate_range():
	 *     user_lock
	 *      mmu_interval_set_seq()
	 *       mni->invalidate_seq = seq
	 *     user_unlock
	 *
	 *                     [Required: mmu_interval_read_retry() == true]
	 *
	 *   mn_itree_inv_end():
	 *    spin_lock
	 *     seq = ++subscriptions->invalidate_seq
	 *    spin_unlock
	 *
	 *                                        user_lock
	 *                                         mmu_interval_read_retry():
	 *                                          mni->invalidate_seq != seq
	 *                                        user_unlock
	 *
	 * Barriers are not needed here as any races here are closed by an
	 * eventual mmu_interval_read_retry(), which provides a barrier via the
	 * user_lock.
	 */
	spin_lock(&subscriptions->lock);
	/* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */
	seq = READ_ONCE(mni->invalidate_seq);
	is_invalidating = seq == subscriptions->invalidate_seq;
	spin_unlock(&subscriptions->lock);

	/*
	 * mni->invalidate_seq must always be set to an odd value via
	 * mmu_interval_set_seq() using the provided cur_seq from
	 * mn_itree_inv_start_range(). This ensures that if seq does wrap we
	 * will always clear the below sleep in some reasonable time as
	 * subscriptions->invalidate_seq is even in the idle state.
	 */
	lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
	lock_map_release(&__mmu_notifier_invalidate_range_start_map);
	if (is_invalidating)
		wait_event(subscriptions->wq,
			   READ_ONCE(subscriptions->invalidate_seq) != seq);

	/*
	 * Notice that mmu_interval_read_retry() can already be true at this
	 * point, avoiding loops here allows the caller to provide a global
	 * time bound.
	 */

	return seq;
}
EXPORT_SYMBOL_GPL(mmu_interval_read_begin);
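
/*
 * Illustrative usage sketch (editorial addition, not part of the kernel
 * source): the collision-retry loop a driver is expected to build around
 * mmu_interval_read_begin()/mmu_interval_read_retry(). The names drv,
 * driver_lock()/driver_unlock(), make_sptes() and program_hw() are
 * hypothetical placeholders for the caller's 'user_lock' and its own SPTE
 * handling.
 *
 *	unsigned long seq;
 *
 * again:
 *	seq = mmu_interval_read_begin(&drv->mni);
 *	make_sptes(drv);		// may sleep: fault pages, copy PTEs
 *
 *	driver_lock(drv);
 *	if (mmu_interval_read_retry(&drv->mni, seq)) {
 *		driver_unlock(drv);	// an invalidation collided, start over
 *		goto again;
 *	}
 *	program_hw(drv);		// SPTEs valid while driver_lock is held
 *	driver_unlock(drv);
 *
 * The matching ops->invalidate() callback takes the same driver_lock and
 * calls mmu_interval_set_seq() with the cur_seq it was passed, which is what
 * makes the retry test above detect the collision.
 */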

static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions,
			     struct mm_struct *mm)
{
	struct mmu_notifier_range range = {
		.flags = MMU_NOTIFIER_RANGE_BLOCKABLE,
		.event = MMU_NOTIFY_RELEASE,
		.mm = mm,
		.start = 0,
		.end = ULONG_MAX,
	};
	struct mmu_interval_notifier *mni;
	unsigned long cur_seq;
	bool ret;

	for (mni = mn_itree_inv_start_range(subscriptions, &range, &cur_seq);
	     mni; mni = mn_itree_inv_next(mni, &range)) {
		ret = mni->ops->invalidate(mni, &range, cur_seq);
		WARN_ON(!ret);
	}

	mn_itree_inv_end(subscriptions);
}

/*
 * This function can't run concurrently against mmu_notifier_register
 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
 * runs with mm_users == 0. Other tasks may still invoke mmu notifiers
 * in parallel despite there being no task using this mm any more,
 * through the vmas outside of the exit_mmap context, such as with
 * vmtruncate. This serializes against mmu_notifier_unregister with
 * the notifier_subscriptions->lock in addition to SRCU and it serializes
 * against the other mmu notifiers with SRCU. struct mmu_notifier_subscriptions
 * can't go away from under us as exit_mmap holds an mm_count pin
 * itself.
 */
static void mn_hlist_release(struct mmu_notifier_subscriptions *subscriptions,
			     struct mm_struct *mm)
{
	struct mmu_notifier *mn;
	int id;

	/*
	 * SRCU here will block mmu_notifier_unregister until
	 * ->release returns.
	 */
	id = srcu_read_lock(&srcu);
	hlist_for_each_entry_rcu(mn, &subscriptions->list, hlist)
		/*
		 * If ->release runs before mmu_notifier_unregister it must be
		 * handled, as it's the only way for the driver to flush all
		 * existing sptes and stop the driver from establishing any more
		 * sptes before all the pages in the mm are freed.
		 */
		if (mn->ops->release)
			mn->ops->release(mn, mm);

	spin_lock(&subscriptions->lock);
	while (unlikely(!hlist_empty(&subscriptions->list))) {
		mn = hlist_entry(subscriptions->list.first, struct mmu_notifier,
				 hlist);
		/*
		 * We arrived before mmu_notifier_unregister so
		 * mmu_notifier_unregister will do nothing other than to wait
		 * for ->release to finish and for mmu_notifier_unregister to
		 * return.
		 */
		hlist_del_init_rcu(&mn->hlist);
	}
	spin_unlock(&subscriptions->lock);
	srcu_read_unlock(&srcu, id);

	/*
	 * synchronize_srcu here prevents mmu_notifier_release from returning to
	 * exit_mmap (which would proceed with freeing all pages in the mm)
	 * until the ->release method returns, if it was invoked by
	 * mmu_notifier_unregister.
	 *
	 * The notifier_subscriptions can't go away from under us because
	 * one mm_count is held by exit_mmap.
	 */
	synchronize_srcu(&srcu);
}

void __mmu_notifier_release(struct mm_struct *mm)
{
	struct mmu_notifier_subscriptions *subscriptions =
		mm->notifier_subscriptions;

	if (subscriptions->has_itree)
		mn_itree_release(subscriptions, mm);

	if (!hlist_empty(&subscriptions->list))
		mn_hlist_release(subscriptions, mm);
}

/*
 * If no young bitflag is supported by the hardware, ->clear_flush_young can
 * unmap the address and return 1 or 0 depending on whether the mapping
 * previously existed or not.
 */
int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
				     unsigned long start,
				     unsigned long end)
{
	struct mmu_notifier *mn;
	int young = 0, id;

	id = srcu_read_lock(&srcu);
	hlist_for_each_entry_rcu(mn, &mm->notifier_subscriptions->list, hlist) {
		if (mn->ops->clear_flush_young)
			young |= mn->ops->clear_flush_young(mn, mm, start, end);
	}
	srcu_read_unlock(&srcu, id);

	return young;
}

int __mmu_notifier_clear_young(struct mm_struct *mm,
			       unsigned long start,
			       unsigned long end)
{
	struct mmu_notifier *mn;
	int young = 0, id;

	id = srcu_read_lock(&srcu);
	hlist_for_each_entry_rcu(mn, &mm->notifier_subscriptions->list, hlist) {
		if (mn->ops->clear_young)
			young |= mn->ops->clear_young(mn, mm, start, end);
	}
	srcu_read_unlock(&srcu, id);

	return young;
}

int __mmu_notifier_test_young(struct mm_struct *mm,
			      unsigned long address)
{
	struct mmu_notifier *mn;
	int young = 0, id;

	id = srcu_read_lock(&srcu);
	hlist_for_each_entry_rcu(mn, &mm->notifier_subscriptions->list, hlist) {
		if (mn->ops->test_young) {
			young = mn->ops->test_young(mn, mm, address);
			if (young)
				break;
		}
	}
	srcu_read_unlock(&srcu, id);

	return young;
}

void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
			       pte_t pte)
{
	struct mmu_notifier *mn;
	int id;

	id = srcu_read_lock(&srcu);
	hlist_for_each_entry_rcu(mn, &mm->notifier_subscriptions->list,
				 hlist) {
		if (mn->ops->change_pte)
			mn->ops->change_pte(mn, mm, address, pte);
	}
	srcu_read_unlock(&srcu, id);
}

static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions,
			       const struct mmu_notifier_range *range)
{
	struct mmu_interval_notifier *mni;
	unsigned long cur_seq;

	for (mni = mn_itree_inv_start_range(subscriptions, range, &cur_seq);
	     mni; mni = mn_itree_inv_next(mni, range)) {
		bool ret;

		ret = mni->ops->invalidate(mni, range, cur_seq);
		if (!ret) {
			if (WARN_ON(mmu_notifier_range_blockable(range)))
				continue;
			goto out_would_block;
		}
	}
	return 0;

out_would_block:
	/*
	 * On -EAGAIN the non-blocking caller is not allowed to call
	 * invalidate_range_end()
	 */
	mn_itree_inv_end(subscriptions);
	return -EAGAIN;
}

static int mn_hlist_invalidate_range_start(
	struct mmu_notifier_subscriptions *subscriptions,
	struct mmu_notifier_range *range)
{
	struct mmu_notifier *mn;
	int ret = 0;
	int id;

	id = srcu_read_lock(&srcu);
	hlist_for_each_entry_rcu(mn, &subscriptions->list, hlist) {
		if (mn->ops->invalidate_range_start) {
			int _ret;

			if (!mmu_notifier_range_blockable(range))
				non_block_start();
			_ret = mn->ops->invalidate_range_start(mn, range);
			if (!mmu_notifier_range_blockable(range))
				non_block_end();
			if (_ret) {
				pr_info("%pS callback failed with %d in %sblockable context.\n",
					mn->ops->invalidate_range_start, _ret,
					!mmu_notifier_range_blockable(range) ? "non-" : "");
				WARN_ON(mmu_notifier_range_blockable(range) ||
					_ret != -EAGAIN);
				ret = _ret;
			}
		}
	}
	srcu_read_unlock(&srcu, id);

	return ret;
}

int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
	struct mmu_notifier_subscriptions *subscriptions =
		range->mm->notifier_subscriptions;
	int ret;

	if (subscriptions->has_itree) {
		ret = mn_itree_invalidate(subscriptions, range);
		if (ret)
			return ret;
	}
	if (!hlist_empty(&subscriptions->list))
		return mn_hlist_invalidate_range_start(subscriptions, range);
	return 0;
}

static void
mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions,
			struct mmu_notifier_range *range, bool only_end)
{
	struct mmu_notifier *mn;
	int id;

	id = srcu_read_lock(&srcu);
	hlist_for_each_entry_rcu(mn, &subscriptions->list, hlist) {
		/*
		 * Call invalidate_range here too to avoid the need for the
		 * subsystem to register an invalidate_range_end call-back
		 * when there is invalidate_range already. Usually a
		 * subsystem registers either invalidate_range_start()/end() or
		 * invalidate_range(), so this will be no additional overhead
		 * (besides the pointer check).
		 *
		 * We skip the call to invalidate_range() if we know it is
		 * safe, i.e. the call site used
		 * mmu_notifier_invalidate_range_only_end(), which is safe when
		 * a call to invalidate_range() already happened under the
		 * page table lock.
		 */
		if (!only_end && mn->ops->invalidate_range)
			mn->ops->invalidate_range(mn, range->mm,
						  range->start,
						  range->end);
		if (mn->ops->invalidate_range_end) {
			if (!mmu_notifier_range_blockable(range))
				non_block_start();
			mn->ops->invalidate_range_end(mn, range);
			if (!mmu_notifier_range_blockable(range))
				non_block_end();
		}
	}
	srcu_read_unlock(&srcu, id);
}

void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
					 bool only_end)
{
	struct mmu_notifier_subscriptions *subscriptions =
		range->mm->notifier_subscriptions;

	lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
	if (subscriptions->has_itree)
		mn_itree_inv_end(subscriptions);

	if (!hlist_empty(&subscriptions->list))
		mn_hlist_invalidate_end(subscriptions, range, only_end);
	lock_map_release(&__mmu_notifier_invalidate_range_start_map);
}

void __mmu_notifier_invalidate_range(struct mm_struct *mm,
				     unsigned long start, unsigned long end)
{
	struct mmu_notifier *mn;
	int id;

	id = srcu_read_lock(&srcu);
	hlist_for_each_entry_rcu(mn, &mm->notifier_subscriptions->list, hlist) {
		if (mn->ops->invalidate_range)
			mn->ops->invalidate_range(mn, mm, start, end);
	}
	srcu_read_unlock(&srcu, id);
}

/*
 * Same as mmu_notifier_register but here the caller must hold the mmap_sem in
 * write mode. A NULL mn signals the notifier is being registered for itree
 * mode.
 */
int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
{
	struct mmu_notifier_subscriptions *subscriptions = NULL;
	int ret;

	lockdep_assert_held_write(&mm->mmap_sem);
	BUG_ON(atomic_read(&mm->mm_users) <= 0);

	if (IS_ENABLED(CONFIG_LOCKDEP)) {
		fs_reclaim_acquire(GFP_KERNEL);
		lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
		lock_map_release(&__mmu_notifier_invalidate_range_start_map);
		fs_reclaim_release(GFP_KERNEL);
	}

	if (!mm->notifier_subscriptions) {
		/*
		 * kmalloc cannot be called under mm_take_all_locks(), but we
		 * know that mm->notifier_subscriptions can't change while we
		 * hold the write side of the mmap_sem.
		 */
		subscriptions = kzalloc(
			sizeof(struct mmu_notifier_subscriptions), GFP_KERNEL);
		if (!subscriptions)
			return -ENOMEM;

		INIT_HLIST_HEAD(&subscriptions->list);
		spin_lock_init(&subscriptions->lock);
		subscriptions->invalidate_seq = 2;
		subscriptions->itree = RB_ROOT_CACHED;
		init_waitqueue_head(&subscriptions->wq);
		INIT_HLIST_HEAD(&subscriptions->deferred_list);
	}

	ret = mm_take_all_locks(mm);
	if (unlikely(ret))
		goto out_clean;

	/*
	 * Serialize the update against mmu_notifier_unregister. A
	 * side note: mmu_notifier_release can't run concurrently with
	 * us because we hold the mm_users pin (either implicitly as
	 * current->mm or explicitly with get_task_mm() or similar).
	 * We can't race against any other mmu notifier method either
	 * thanks to mm_take_all_locks().
	 *
	 * release semantics on the initialization of the
	 * mmu_notifier_subscriptions's contents are provided for unlocked
	 * readers. acquire can only be used while holding the mmgrab or
	 * mmget, and is safe because once created the
	 * mmu_notifier_subscriptions is not freed until the mm is destroyed.
	 * As above, users holding the mmap_sem or one of the
	 * mm_take_all_locks() do not need to use acquire semantics.
	 */
	if (subscriptions)
		smp_store_release(&mm->notifier_subscriptions, subscriptions);

	if (mn) {
		/* Pairs with the mmdrop in mmu_notifier_unregister_* */
		mmgrab(mm);
		mn->mm = mm;
		mn->users = 1;

		spin_lock(&mm->notifier_subscriptions->lock);
		hlist_add_head_rcu(&mn->hlist,
				   &mm->notifier_subscriptions->list);
		spin_unlock(&mm->notifier_subscriptions->lock);
	} else
		mm->notifier_subscriptions->has_itree = true;

	mm_drop_all_locks(mm);
	BUG_ON(atomic_read(&mm->mm_users) <= 0);
	return 0;

out_clean:
	kfree(subscriptions);
	return ret;
}
EXPORT_SYMBOL_GPL(__mmu_notifier_register);

/**
 * mmu_notifier_register - Register a notifier on a mm
 * @mn: The notifier to attach
 * @mm: The mm to attach the notifier to
 *
 * Must not hold mmap_sem nor any other VM related lock when calling
 * this registration function. Must also ensure mm_users can't go down
 * to zero while this runs to avoid races with mmu_notifier_release,
 * so mm has to be current->mm or the mm should be pinned safely such
 * as with get_task_mm(). If the mm is not current->mm, the mm_users
 * pin should be released by calling mmput after mmu_notifier_register
 * returns.
 *
 * mmu_notifier_unregister() or mmu_notifier_put() must always be called to
 * unregister the notifier.
 *
 * While the caller holds a get on the mmu_notifier, the mn->mm pointer will
 * remain valid, and can be converted to an active mm pointer via
 * mmget_not_zero().
 */
int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
{
	int ret;

	down_write(&mm->mmap_sem);
	ret = __mmu_notifier_register(mn, mm);
	up_write(&mm->mmap_sem);
	return ret;
}
EXPORT_SYMBOL_GPL(mmu_notifier_register);
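
/*
 * Illustrative usage sketch (editorial addition, not part of the kernel
 * source): a minimal register/unregister pairing. "my_ops", "my_release" and
 * the embedding structure are hypothetical; a real user embeds a
 * struct mmu_notifier in its own per-mm state, sets ->ops before registering,
 * and implements only the callbacks it needs.
 *
 *	static const struct mmu_notifier_ops my_ops = {
 *		.release		= my_release,
 *		.invalidate_range_start	= my_invalidate_range_start,
 *	};
 *
 *	sub->mn.ops = &my_ops;
 *	ret = mmu_notifier_register(&sub->mn, current->mm);
 *	if (ret)
 *		return ret;
 *	...
 *	mmu_notifier_unregister(&sub->mn, current->mm);
 */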

static struct mmu_notifier *
find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops)
{
	struct mmu_notifier *mn;

	spin_lock(&mm->notifier_subscriptions->lock);
	hlist_for_each_entry_rcu(mn, &mm->notifier_subscriptions->list,
				 hlist) {
		if (mn->ops != ops)
			continue;

		if (likely(mn->users != UINT_MAX))
			mn->users++;
		else
			mn = ERR_PTR(-EOVERFLOW);
		spin_unlock(&mm->notifier_subscriptions->lock);
		return mn;
	}
	spin_unlock(&mm->notifier_subscriptions->lock);
	return NULL;
}

/**
 * mmu_notifier_get_locked - Return the single struct mmu_notifier for
 *                           the mm & ops
 * @ops: The operations struct being subscribed with
 * @mm: The mm to attach notifiers to
 *
 * This function either allocates a new mmu_notifier via
 * ops->alloc_notifier(), or returns an already existing notifier on the
 * list. The value of the ops pointer is used to determine when two notifiers
 * are the same.
 *
 * Each call to mmu_notifier_get() must be paired with a call to
 * mmu_notifier_put(). The caller must hold the write side of mm->mmap_sem.
 *
 * While the caller holds a get on the mmu_notifier, the mm pointer will
 * remain valid, and can be converted to an active mm pointer via
 * mmget_not_zero().
 */
struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops,
					     struct mm_struct *mm)
{
	struct mmu_notifier *mn;
	int ret;

	lockdep_assert_held_write(&mm->mmap_sem);

	if (mm->notifier_subscriptions) {
		mn = find_get_mmu_notifier(mm, ops);
		if (mn)
			return mn;
	}

	mn = ops->alloc_notifier(mm);
	if (IS_ERR(mn))
		return mn;
	mn->ops = ops;
	ret = __mmu_notifier_register(mn, mm);
	if (ret)
		goto out_free;
	return mn;
out_free:
	mn->ops->free_notifier(mn);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(mmu_notifier_get_locked);

/* this is called after the last mmu_notifier_unregister() returned */
void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
{
	BUG_ON(!hlist_empty(&mm->notifier_subscriptions->list));
	kfree(mm->notifier_subscriptions);
	mm->notifier_subscriptions = LIST_POISON1; /* debug */
}

/*
 * This releases the mm_count pin automatically and frees the mm
 * structure if it was the last user of it. It serializes against
 * running mmu notifiers with SRCU and against mmu_notifier_unregister
 * with the unregister lock + SRCU. All sptes must be dropped before
 * calling mmu_notifier_unregister. ->release or any other notifier
 * method may be invoked concurrently with mmu_notifier_unregister,
 * and only after mmu_notifier_unregister returned we're guaranteed
 * that ->release or any other method can't run anymore.
 */
void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
{
	BUG_ON(atomic_read(&mm->mm_count) <= 0);

	if (!hlist_unhashed(&mn->hlist)) {
		/*
		 * SRCU here will force exit_mmap to wait for ->release to
		 * finish before freeing the pages.
		 */
		int id;

		id = srcu_read_lock(&srcu);
		/*
		 * exit_mmap will block in mmu_notifier_release to guarantee
		 * that ->release is called before freeing the pages.
		 */
		if (mn->ops->release)
			mn->ops->release(mn, mm);
		srcu_read_unlock(&srcu, id);

		spin_lock(&mm->notifier_subscriptions->lock);
		/*
		 * Can not use list_del_rcu() since __mmu_notifier_release
		 * can delete it before we hold the lock.
		 */
		hlist_del_init_rcu(&mn->hlist);
		spin_unlock(&mm->notifier_subscriptions->lock);
	}

	/*
	 * Wait for any running method to finish, of course including
	 * ->release if it was run by mmu_notifier_release instead of us.
	 */
	synchronize_srcu(&srcu);

	BUG_ON(atomic_read(&mm->mm_count) <= 0);

	mmdrop(mm);
}
EXPORT_SYMBOL_GPL(mmu_notifier_unregister);

static void mmu_notifier_free_rcu(struct rcu_head *rcu)
{
	struct mmu_notifier *mn = container_of(rcu, struct mmu_notifier, rcu);
	struct mm_struct *mm = mn->mm;

	mn->ops->free_notifier(mn);
	/* Pairs with the get in __mmu_notifier_register() */
	mmdrop(mm);
}

/**
 * mmu_notifier_put - Release the reference on the notifier
 * @mn: The notifier to act on
 *
 * This function must be paired with each mmu_notifier_get(); it releases the
 * reference obtained by the get. If this is the last reference then the
 * process to free the notifier will run asynchronously.
 *
 * Unlike mmu_notifier_unregister() the get/put flow only calls ops->release
 * when the mm_struct is destroyed. Instead free_notifier is always called to
 * release any resources held by the user.
 *
 * As ops->release is not guaranteed to be called, the user must ensure that
 * all sptes are dropped, and no new sptes can be established before
 * mmu_notifier_put() is called.
 *
 * This function can be called from the ops->release callback, however the
 * caller must still ensure it is called pairwise with mmu_notifier_get().
 *
 * Modules calling this function must call mmu_notifier_synchronize() in
 * their __exit functions to ensure the async work is completed.
 */
void mmu_notifier_put(struct mmu_notifier *mn)
{
	struct mm_struct *mm = mn->mm;

	spin_lock(&mm->notifier_subscriptions->lock);
	if (WARN_ON(!mn->users) || --mn->users)
		goto out_unlock;
	hlist_del_init_rcu(&mn->hlist);
	spin_unlock(&mm->notifier_subscriptions->lock);

	call_srcu(&srcu, &mn->rcu, mmu_notifier_free_rcu);
	return;

out_unlock:
	spin_unlock(&mm->notifier_subscriptions->lock);
}
EXPORT_SYMBOL_GPL(mmu_notifier_put);
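
/*
 * Illustrative usage sketch (editorial addition, not part of the kernel
 * source): the get/put flow. "my_ops" must provide ->alloc_notifier() and
 * ->free_notifier(); "struct my_state" is a hypothetical structure embedding
 * the struct mmu_notifier returned by alloc_notifier().
 *
 *	down_write(&mm->mmap_sem);
 *	mn = mmu_notifier_get_locked(&my_ops, mm);
 *	up_write(&mm->mmap_sem);
 *	if (IS_ERR(mn))
 *		return PTR_ERR(mn);
 *	state = container_of(mn, struct my_state, mn);
 *	...
 *	mmu_notifier_put(mn);		// pairs with the get above
 *
 * A module using this flow must also call mmu_notifier_synchronize() from its
 * __exit function so the deferred free_notifier() work has finished before
 * the module text goes away.
 */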

static int __mmu_interval_notifier_insert(
	struct mmu_interval_notifier *mni, struct mm_struct *mm,
	struct mmu_notifier_subscriptions *subscriptions, unsigned long start,
	unsigned long length, const struct mmu_interval_notifier_ops *ops)
{
	mni->mm = mm;
	mni->ops = ops;
	RB_CLEAR_NODE(&mni->interval_tree.rb);
	mni->interval_tree.start = start;
	/*
	 * Note that the representation of the intervals in the interval tree
	 * considers the ending point as contained in the interval.
	 */
	if (length == 0 ||
	    check_add_overflow(start, length - 1, &mni->interval_tree.last))
		return -EOVERFLOW;

	/* Must call with a mmget() held */
	if (WARN_ON(atomic_read(&mm->mm_count) <= 0))
		return -EINVAL;

	/* pairs with mmdrop in mmu_interval_notifier_remove() */
	mmgrab(mm);

	/*
	 * If some invalidate_range_start/end region is going on in parallel
	 * we don't know what VA ranges are affected, so we must assume this
	 * new range is included.
	 *
	 * If the itree is invalidating then we are not allowed to change
	 * it. Retrying until invalidation is done is tricky due to the
	 * possibility for live lock, instead defer the add to
	 * mn_itree_inv_end() so this algorithm is deterministic.
	 *
	 * In all cases the value for the mni->invalidate_seq should be
	 * odd, see mmu_interval_read_begin()
	 */
	spin_lock(&subscriptions->lock);
	if (subscriptions->active_invalidate_ranges) {
		if (mn_itree_is_invalidating(subscriptions))
			hlist_add_head(&mni->deferred_item,
				       &subscriptions->deferred_list);
		else {
			subscriptions->invalidate_seq |= 1;
			interval_tree_insert(&mni->interval_tree,
					     &subscriptions->itree);
		}
		mni->invalidate_seq = subscriptions->invalidate_seq;
	} else {
		WARN_ON(mn_itree_is_invalidating(subscriptions));
		/*
		 * The starting seq for a mni not under invalidation should be
		 * odd, not equal to the current invalidate_seq and
		 * invalidate_seq should not 'wrap' to the new seq any time
		 * soon.
		 */
		mni->invalidate_seq = subscriptions->invalidate_seq - 1;
		interval_tree_insert(&mni->interval_tree,
				     &subscriptions->itree);
	}
	spin_unlock(&subscriptions->lock);
	return 0;
}

/**
 * mmu_interval_notifier_insert - Insert an interval notifier
 * @mni: Interval notifier to register
 * @start: Starting virtual address to monitor
 * @length: Length of the range to monitor
 * @mm: mm_struct to attach to
 *
 * This function subscribes the interval notifier for notifications from the
 * mm. Upon return the ops related to mmu_interval_notifier will be called
 * whenever an event that intersects with the given range occurs.
 *
 * Upon return the range_notifier may not be present in the interval tree yet.
 * The caller must use the normal interval notifier read flow via
 * mmu_interval_read_begin() to establish SPTEs for this range.
 */
int mmu_interval_notifier_insert(struct mmu_interval_notifier *mni,
				 struct mm_struct *mm, unsigned long start,
				 unsigned long length,
				 const struct mmu_interval_notifier_ops *ops)
{
	struct mmu_notifier_subscriptions *subscriptions;
	int ret;

	might_lock(&mm->mmap_sem);

	subscriptions = smp_load_acquire(&mm->notifier_subscriptions);
	if (!subscriptions || !subscriptions->has_itree) {
		ret = mmu_notifier_register(NULL, mm);
		if (ret)
			return ret;
		subscriptions = mm->notifier_subscriptions;
	}
	return __mmu_interval_notifier_insert(mni, mm, subscriptions, start,
					      length, ops);
}
EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert);
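
/*
 * Illustrative usage sketch (editorial addition, not part of the kernel
 * source): pairing insert with remove. "my_interval_ops" and the embedding
 * object are hypothetical; the ops only need to provide ->invalidate().
 *
 *	ret = mmu_interval_notifier_insert(&obj->mni, current->mm, start,
 *					   length, &my_interval_ops);
 *	if (ret)
 *		return ret;
 *	// establish SPTEs via the mmu_interval_read_begin() flow shown above
 *	...
 *	mmu_interval_notifier_remove(&obj->mni);	// may sleep
 */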

int mmu_interval_notifier_insert_locked(
	struct mmu_interval_notifier *mni, struct mm_struct *mm,
	unsigned long start, unsigned long length,
	const struct mmu_interval_notifier_ops *ops)
{
	struct mmu_notifier_subscriptions *subscriptions =
		mm->notifier_subscriptions;
	int ret;

	lockdep_assert_held_write(&mm->mmap_sem);

	if (!subscriptions || !subscriptions->has_itree) {
		ret = __mmu_notifier_register(NULL, mm);
		if (ret)
			return ret;
		subscriptions = mm->notifier_subscriptions;
	}
	return __mmu_interval_notifier_insert(mni, mm, subscriptions, start,
					      length, ops);
}
EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked);

/**
 * mmu_interval_notifier_remove - Remove an interval notifier
 * @mni: Interval notifier to unregister
 *
 * This function must be paired with mmu_interval_notifier_insert(). It cannot
 * be called from any ops callback.
 *
 * Once this returns ops callbacks are no longer running on other CPUs and
 * will not be called in the future.
 */
void mmu_interval_notifier_remove(struct mmu_interval_notifier *mni)
{
	struct mm_struct *mm = mni->mm;
	struct mmu_notifier_subscriptions *subscriptions =
		mm->notifier_subscriptions;
	unsigned long seq = 0;

	might_sleep();

	spin_lock(&subscriptions->lock);
	if (mn_itree_is_invalidating(subscriptions)) {
		/*
		 * remove is being called after insert already put this
		 * notifier on the deferred list, but before the deferred list
		 * was processed.
		 */
		if (RB_EMPTY_NODE(&mni->interval_tree.rb)) {
			hlist_del(&mni->deferred_item);
		} else {
			hlist_add_head(&mni->deferred_item,
				       &subscriptions->deferred_list);
			seq = subscriptions->invalidate_seq;
		}
	} else {
		WARN_ON(RB_EMPTY_NODE(&mni->interval_tree.rb));
		interval_tree_remove(&mni->interval_tree,
				     &subscriptions->itree);
	}
	spin_unlock(&subscriptions->lock);

	/*
	 * The possible sleep on progress in the invalidation requires that the
	 * caller not hold any locks held by invalidation callbacks.
	 */
	lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
	lock_map_release(&__mmu_notifier_invalidate_range_start_map);
	if (seq)
		wait_event(subscriptions->wq,
			   READ_ONCE(subscriptions->invalidate_seq) != seq);

	/* pairs with mmgrab in mmu_interval_notifier_insert() */
	mmdrop(mm);
}
EXPORT_SYMBOL_GPL(mmu_interval_notifier_remove);

/**
 * mmu_notifier_synchronize - Ensure all mmu_notifiers are freed
 *
 * This function ensures that all outstanding async SRCU work from
 * mmu_notifier_put() is completed. After it returns any mmu_notifier_ops
 * associated with an unused mmu_notifier will no longer be called.
 *
 * Before using this function the caller must ensure that all of its
 * mmu_notifiers have been fully released via mmu_notifier_put().
 *
 * Modules using the mmu_notifier_put() API should call this in their __exit
 * function to avoid module unloading races.
 */
void mmu_notifier_synchronize(void)
{
	synchronize_srcu(&srcu);
}
EXPORT_SYMBOL_GPL(mmu_notifier_synchronize);
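
/*
 * Illustrative usage sketch (editorial addition, not part of the kernel
 * source): a module using the get/put API typically ends its (hypothetical)
 * exit routine like this, after every notifier has been released with
 * mmu_notifier_put():
 *
 *	static void __exit my_exit(void)
 *	{
 *		mmu_notifier_synchronize();
 *	}
 */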

bool
mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range)
{
	if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA)
		return false;
	/* Return true if the vma still has the read flag set. */
	return range->vma->vm_flags & VM_READ;
}
EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only);