kernel/sched/membarrier.c ("sched: membarrier: cover kthread_use_mm (v4)")
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * membarrier system call
 */
#include "sched.h"

/*
 * Bitmask made from an OR of all commands within enum membarrier_cmd,
 * except MEMBARRIER_CMD_QUERY.
 */
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK			\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE			\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0
#endif

#ifdef CONFIG_RSEQ
#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK			\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ				\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ)
#else
#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK	0
#endif

#define MEMBARRIER_CMD_BITMASK						\
	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED	\
	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED			\
	| MEMBARRIER_CMD_PRIVATE_EXPEDITED				\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED			\
	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK		\
	| MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK)

static void ipi_mb(void *info)
{
	smp_mb();	/* IPIs should be serializing, but be paranoid. */
}

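/*
 * IPI handler for MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: flag an rseq
 * preemption event so the interrupted thread restarts its rseq critical
 * section (if any) before returning to userspace.
 */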
static void ipi_rseq(void *info)
{
	rseq_preempt(current);
}

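/*
 * IPI handler used when registering a process: copy the membarrier state of
 * the mm passed as @info into this CPU's runqueue, but only if this CPU is
 * currently running a thread of that mm.
 */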
static void ipi_sync_rq_state(void *info)
{
	struct mm_struct *mm = (struct mm_struct *) info;

	if (current->mm != mm)
		return;
	this_cpu_write(runqueues.membarrier_state,
		       atomic_read(&mm->membarrier_state));
	/*
	 * Issue a memory barrier after setting
	 * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
	 * guarantee that no memory access following registration is reordered
	 * before registration.
	 */
	smp_mb();
}

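/*
 * Called at exec time: membarrier registration does not carry over exec, so
 * clear the mm's membarrier state as well as the cached copy in the caller
 * CPU's runqueue.
 */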
void membarrier_exec_mmap(struct mm_struct *mm)
{
	/*
	 * Issue a memory barrier before clearing membarrier_state to
	 * guarantee that no memory access prior to exec is reordered after
	 * clearing this state.
	 */
	smp_mb();
	atomic_set(&mm->membarrier_state, 0);
	/*
	 * Keep the runqueue membarrier_state in sync with this mm
	 * membarrier_state.
	 */
	this_cpu_write(runqueues.membarrier_state, 0);
}

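/*
 * Keep the caller CPU's runqueue membarrier_state in sync when current's mm
 * changes outside of a regular context switch, e.g. from kthread_use_mm(),
 * kthread_unuse_mm() and exec.
 */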
void membarrier_update_current_mm(struct mm_struct *next_mm)
{
	struct rq *rq = this_rq();
	int membarrier_state = 0;

	if (next_mm)
		membarrier_state = atomic_read(&next_mm->membarrier_state);
	if (READ_ONCE(rq->membarrier_state) == membarrier_state)
		return;
	WRITE_ONCE(rq->membarrier_state, membarrier_state);
}

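/*
 * MEMBARRIER_CMD_GLOBAL_EXPEDITED: IPI every online CPU currently running a
 * userspace thread whose process registered for global expedited membarrier,
 * so that each such thread observes a full memory barrier before this system
 * call returns.
 */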
static int membarrier_global_expedited(void)
{
	int cpu;
	cpumask_var_t tmpmask;

	if (num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		/*
		 * Skipping the current CPU is OK even though we can be
		 * migrated at any point. The current CPU, at the point
		 * where we read raw_smp_processor_id(), is ensured to
		 * be in program order with respect to the caller
		 * thread. Therefore, we can skip this CPU from the
		 * iteration.
		 */
		if (cpu == raw_smp_processor_id())
			continue;

		if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
		    MEMBARRIER_STATE_GLOBAL_EXPEDITED))
			continue;

		/*
		 * Skip the CPU if it runs a kernel thread which is not using
		 * a task mm.
		 */
		p = rcu_dereference(cpu_rq(cpu)->curr);
		if (!p->mm)
			continue;

		__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */
	return 0;
}

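/*
 * MEMBARRIER_CMD_PRIVATE_EXPEDITED{,_SYNC_CORE,_RSEQ}: IPI the CPUs (or only
 * the CPU selected by @cpu_id) currently running threads of the caller's mm,
 * provided the process registered for the matching command beforehand.
 */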
static int membarrier_private_expedited(int flags, int cpu_id)
{
	cpumask_var_t tmpmask;
	struct mm_struct *mm = current->mm;
	smp_call_func_t ipi_func = ipi_mb;

	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
			return -EPERM;
	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
		if (!IS_ENABLED(CONFIG_RSEQ))
			return -EINVAL;
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY))
			return -EPERM;
		ipi_func = ipi_rseq;
	} else {
		WARN_ON_ONCE(flags);
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
			return -EPERM;
	}

	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	cpus_read_lock();

	if (cpu_id >= 0) {
		struct task_struct *p;

		if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
			goto out;
		if (cpu_id == raw_smp_processor_id())
			goto out;
		rcu_read_lock();
		p = rcu_dereference(cpu_rq(cpu_id)->curr);
		if (!p || p->mm != mm) {
			rcu_read_unlock();
			goto out;
		}
		rcu_read_unlock();
	} else {
		int cpu;

		rcu_read_lock();
		for_each_online_cpu(cpu) {
			struct task_struct *p;

			/*
			 * Skipping the current CPU is OK even though we can be
			 * migrated at any point. The current CPU, at the point
			 * where we read raw_smp_processor_id(), is ensured to
			 * be in program order with respect to the caller
			 * thread. Therefore, we can skip this CPU from the
			 * iteration.
			 */
			if (cpu == raw_smp_processor_id())
				continue;
			p = rcu_dereference(cpu_rq(cpu)->curr);
			if (p && p->mm == mm)
				__cpumask_set_cpu(cpu, tmpmask);
		}
		rcu_read_unlock();
	}

	preempt_disable();
	if (cpu_id >= 0)
		smp_call_function_single(cpu_id, ipi_func, NULL, 1);
	else
		smp_call_function_many(tmpmask, ipi_func, NULL, 1);
	preempt_enable();

out:
	if (cpu_id < 0)
		free_cpumask_var(tmpmask);
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */

	return 0;
}

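/*
 * Propagate @mm's membarrier state to the runqueue of every CPU currently
 * running one of its threads, so that concurrently running threads of @mm
 * observe the newly registered state.
 */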
static int sync_runqueues_membarrier_state(struct mm_struct *mm)
{
	int membarrier_state = atomic_read(&mm->membarrier_state);
	cpumask_var_t tmpmask;
	int cpu;

	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
		this_cpu_write(runqueues.membarrier_state, membarrier_state);

		/*
		 * For a single mm user, we can simply issue a memory barrier
		 * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
		 * mm and in the current runqueue to guarantee that no memory
		 * access following registration is reordered before
		 * registration.
		 */
		smp_mb();
		return 0;
	}

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	/*
	 * For an mm with multiple users, we need to ensure all future
	 * scheduler executions will observe @mm's new membarrier
	 * state.
	 */
	synchronize_rcu();

	/*
	 * For each cpu runqueue, if the task's mm matches @mm, ensure that all
	 * of @mm's membarrier state set bits are also set in the runqueue's
	 * membarrier state. This ensures that a runqueue scheduling
	 * between threads which are users of @mm has its membarrier state
	 * updated.
	 */
	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
		struct task_struct *p;

		p = rcu_dereference(rq->curr);
		if (p && p->mm == mm)
			__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	return 0;
}

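/*
 * MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED: register the current process so
 * that later MEMBARRIER_CMD_GLOBAL_EXPEDITED calls, issued by any process,
 * cover its running threads.
 */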
static int membarrier_register_global_expedited(void)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ret;

	if (atomic_read(&mm->membarrier_state) &
	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
		return 0;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
		  &mm->membarrier_state);

	return 0;
}

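/*
 * MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED{,_SYNC_CORE,_RSEQ}: register the
 * current process (its mm, shared with all its threads and with CLONE_VM
 * users) for the corresponding private expedited command.
 */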
static int membarrier_register_private_expedited(int flags)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
	    set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
	    ret;

	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		ready_state =
			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
		if (!IS_ENABLED(CONFIG_RSEQ))
			return -EINVAL;
		ready_state =
			MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY;
	} else {
		WARN_ON_ONCE(flags);
	}

	/*
	 * We need to consider threads belonging to different thread
	 * groups, which use the same mm. (CLONE_VM but not
	 * CLONE_THREAD).
	 */
	if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
		return 0;
	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
	if (flags & MEMBARRIER_FLAG_RSEQ)
		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ;
	atomic_or(set_state, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(ready_state, &mm->membarrier_state);

	return 0;
}

/**
 * sys_membarrier - issue memory barriers on a set of threads
 * @cmd:    Takes command values defined in enum membarrier_cmd.
 * @flags:  Currently needs to be 0 for all commands other than
 *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter
 *          case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id
 *          contains the CPU on which to interrupt (= restart)
 *          the RSEQ critical section.
 * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the CPU on which
 *          the RSEQ critical section should be interrupted (@cmd must be
 *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ).
 *
 * If this system call is not implemented, -ENOSYS is returned. If the
 * command specified does not exist, is not available on the running
 * kernel, or if the command argument is invalid, this system call
 * returns -EINVAL. For a given command, with the flags argument set to 0,
 * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
 * always return the same value until reboot. In addition, it can return
 * -ENOMEM if there is not enough memory available to perform the system
 * call.
 *
 * All memory accesses performed in program order from each targeted thread
 * are guaranteed to be ordered with respect to sys_membarrier(). If we use
 * the semantic "barrier()" to represent a compiler barrier forcing memory
 * accesses to be performed in program order across the barrier, and
 * smp_mb() to represent explicit memory barriers forcing full memory
 * ordering across the barrier, we have the following ordering table for
 * each pair of barrier(), sys_membarrier() and smp_mb():
 *
 * The pair ordering is detailed as (O: ordered, X: not ordered):
 *
 *                        barrier()   smp_mb() sys_membarrier()
 *        barrier()          X           X            O
 *        smp_mb()           X           O            O
 *        sys_membarrier()   O           O            O
 */
SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
{
	switch (cmd) {
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
		if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU))
			return -EINVAL;
		break;
	default:
		if (unlikely(flags))
			return -EINVAL;
	}

	if (!(flags & MEMBARRIER_CMD_FLAG_CPU))
		cpu_id = -1;

	switch (cmd) {
	case MEMBARRIER_CMD_QUERY:
	{
		int cmd_mask = MEMBARRIER_CMD_BITMASK;

		if (tick_nohz_full_enabled())
			cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
		return cmd_mask;
	}
	case MEMBARRIER_CMD_GLOBAL:
		/* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
		if (tick_nohz_full_enabled())
			return -EINVAL;
		if (num_online_cpus() > 1)
			synchronize_rcu();
		return 0;
	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
		return membarrier_global_expedited();
	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
		return membarrier_register_global_expedited();
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
		return membarrier_private_expedited(0, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
		return membarrier_register_private_expedited(0);
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
		return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
		return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
	default:
		return -EINVAL;
	}
}
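
/*
 * Typical userspace usage, as an illustrative sketch only (the wrapper and
 * function names below are not part of any API, and error handling is
 * omitted): register once for private expedited membarrier, then issue
 * expedited barriers on the fast path.
 *
 *	#include <linux/membarrier.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static int membarrier_syscall(int cmd, unsigned int flags, int cpu_id)
 *	{
 *		return syscall(__NR_membarrier, cmd, flags, cpu_id);
 *	}
 *
 *	void init_once(void)
 *	{
 *		// Slow path, once per process: register the mm.
 *		membarrier_syscall(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0);
 *	}
 *
 *	void fast_path_barrier(void)
 *	{
 *		// Full barrier on every CPU running a thread of this mm.
 *		membarrier_syscall(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0);
 *	}
 */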