kernel/sched/membarrier.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * membarrier system call
 */
#include "sched.h"

/*
 * Bitmask made from an "or" of all commands within enum membarrier_cmd,
 * except MEMBARRIER_CMD_QUERY.
 */
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK			\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE			\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0
#endif

#ifdef CONFIG_RSEQ
#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK		\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ			\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ)
#else
#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK	0
#endif

#define MEMBARRIER_CMD_BITMASK						\
	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED	\
	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED			\
	| MEMBARRIER_CMD_PRIVATE_EXPEDITED				\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED			\
	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)

static void ipi_mb(void *info)
{
	smp_mb();	/* IPIs should be serializing but paranoid. */
}

static void ipi_rseq(void *info)
{
	rseq_preempt(current);
}

static void ipi_sync_rq_state(void *info)
{
	struct mm_struct *mm = (struct mm_struct *) info;

	if (current->mm != mm)
		return;
	this_cpu_write(runqueues.membarrier_state,
		       atomic_read(&mm->membarrier_state));
	/*
	 * Issue a memory barrier after setting
	 * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
	 * guarantee that no memory access following registration is reordered
	 * before registration.
	 */
	smp_mb();
}

void membarrier_exec_mmap(struct mm_struct *mm)
{
	/*
	 * Issue a memory barrier before clearing membarrier_state to
	 * guarantee that no memory access prior to exec is reordered after
	 * clearing this state.
	 */
	smp_mb();
	atomic_set(&mm->membarrier_state, 0);
	/*
	 * Keep the runqueue membarrier_state in sync with this mm
	 * membarrier_state.
	 */
	this_cpu_write(runqueues.membarrier_state, 0);
}

void membarrier_update_current_mm(struct mm_struct *next_mm)
{
	struct rq *rq = this_rq();
	int membarrier_state = 0;

	if (next_mm)
		membarrier_state = atomic_read(&next_mm->membarrier_state);
	if (READ_ONCE(rq->membarrier_state) == membarrier_state)
		return;
	WRITE_ONCE(rq->membarrier_state, membarrier_state);
}

static int membarrier_global_expedited(void)
{
	int cpu;
	cpumask_var_t tmpmask;

	if (num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		/*
		 * Skipping the current CPU is OK even though we can be
		 * migrated at any point. The current CPU, at the point
		 * where we read raw_smp_processor_id(), is ensured to
		 * be in program order with respect to the caller
		 * thread. Therefore, we can skip this CPU from the
		 * iteration.
		 */
		if (cpu == raw_smp_processor_id())
			continue;

		if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
		      MEMBARRIER_STATE_GLOBAL_EXPEDITED))
			continue;

		/*
		 * Skip the CPU if it runs a kernel thread which is not using
		 * a task mm.
		 */
		p = rcu_dereference(cpu_rq(cpu)->curr);
		if (!p->mm)
			continue;

		__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */
	return 0;
}

static int membarrier_private_expedited(int flags, int cpu_id)
{
	cpumask_var_t tmpmask;
	struct mm_struct *mm = current->mm;
	smp_call_func_t ipi_func = ipi_mb;

	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
			return -EPERM;
	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
		if (!IS_ENABLED(CONFIG_RSEQ))
			return -EINVAL;
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY))
			return -EPERM;
		ipi_func = ipi_rseq;
	} else {
		WARN_ON_ONCE(flags);
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
			return -EPERM;
	}

	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	cpus_read_lock();

	if (cpu_id >= 0) {
		struct task_struct *p;

		if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
			goto out;
		if (cpu_id == raw_smp_processor_id())
			goto out;
		rcu_read_lock();
		p = rcu_dereference(cpu_rq(cpu_id)->curr);
		if (!p || p->mm != mm) {
			rcu_read_unlock();
			goto out;
		}
		rcu_read_unlock();
	} else {
		int cpu;

		rcu_read_lock();
		for_each_online_cpu(cpu) {
			struct task_struct *p;

			/*
			 * Skipping the current CPU is OK even though we can be
			 * migrated at any point. The current CPU, at the point
			 * where we read raw_smp_processor_id(), is ensured to
			 * be in program order with respect to the caller
			 * thread. Therefore, we can skip this CPU from the
			 * iteration.
			 */
			if (cpu == raw_smp_processor_id())
				continue;
			p = rcu_dereference(cpu_rq(cpu)->curr);
			if (p && p->mm == mm)
				__cpumask_set_cpu(cpu, tmpmask);
		}
		rcu_read_unlock();
	}

	preempt_disable();
	if (cpu_id >= 0)
		smp_call_function_single(cpu_id, ipi_func, NULL, 1);
	else
		smp_call_function_many(tmpmask, ipi_func, NULL, 1);
	preempt_enable();

out:
	if (cpu_id < 0)
		free_cpumask_var(tmpmask);
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */

	return 0;
}

static int sync_runqueues_membarrier_state(struct mm_struct *mm)
{
	int membarrier_state = atomic_read(&mm->membarrier_state);
	cpumask_var_t tmpmask;
	int cpu;

	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
		this_cpu_write(runqueues.membarrier_state, membarrier_state);

		/*
		 * For a single mm user, we can simply issue a memory barrier
		 * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
		 * mm and in the current runqueue to guarantee that no memory
		 * access following registration is reordered before
		 * registration.
		 */
		smp_mb();
		return 0;
	}

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	/*
	 * For an mm with multiple users, we need to ensure all future
	 * scheduler executions will observe @mm's new membarrier
	 * state.
	 */
	synchronize_rcu();

	/*
	 * For each cpu runqueue, if the task's mm matches @mm, ensure that all
	 * @mm's membarrier state set bits are also set in the runqueue's
	 * membarrier state. This ensures that a runqueue scheduling
	 * between threads which are users of @mm has its membarrier state
	 * updated.
	 */
	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
		struct task_struct *p;

		p = rcu_dereference(rq->curr);
		if (p && p->mm == mm)
			__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	return 0;
}

static int membarrier_register_global_expedited(void)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ret;

	if (atomic_read(&mm->membarrier_state) &
	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
		return 0;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
		  &mm->membarrier_state);

	return 0;
}

static int membarrier_register_private_expedited(int flags)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
	    set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
	    ret;

	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		ready_state =
			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
		if (!IS_ENABLED(CONFIG_RSEQ))
			return -EINVAL;
		ready_state =
			MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY;
	} else {
		WARN_ON_ONCE(flags);
	}

	/*
	 * We need to consider threads belonging to different thread
	 * groups, which use the same mm (CLONE_VM but not CLONE_THREAD).
	 */
	if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
		return 0;
	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
	if (flags & MEMBARRIER_FLAG_RSEQ)
		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ;
	atomic_or(set_state, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(ready_state, &mm->membarrier_state);

	return 0;
}

/**
 * sys_membarrier - issue memory barriers on a set of threads
 * @cmd:    Takes command values defined in enum membarrier_cmd.
 * @flags:  Currently needs to be 0 for all commands other than
 *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter
 *          case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id
 *          contains the CPU on which to interrupt (= restart)
 *          the RSEQ critical section.
 * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which
 *          the RSEQ critical section should be interrupted (@cmd must be
 *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ).
 *
 * If this system call is not implemented, -ENOSYS is returned. If the
 * command specified does not exist, is not available on the running
 * kernel, or if the command argument is invalid, this system call
 * returns -EINVAL. For a given command, with the flags argument set to 0,
 * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
 * always return the same value until reboot. In addition, it can return
 * -ENOMEM if there is not enough memory available to perform the system
 * call.
 *
 * All memory accesses performed in program order from each targeted thread
 * are guaranteed to be ordered with respect to sys_membarrier(). If we use
 * the semantic "barrier()" to represent a compiler barrier forcing memory
 * accesses to be performed in program order across the barrier, and
 * smp_mb() to represent explicit memory barriers forcing full memory
 * ordering across the barrier, we have the following ordering table for
 * each pair of barrier(), sys_membarrier() and smp_mb():
 *
 * The pair ordering is detailed as (O: ordered, X: not ordered):
 *
 *                        barrier()   smp_mb()   sys_membarrier()
 *        barrier()          X           X              O
 *        smp_mb()           X           O              O
 *        sys_membarrier()   O           O              O
 */
SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
{
	switch (cmd) {
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
		if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU))
			return -EINVAL;
		break;
	default:
		if (unlikely(flags))
			return -EINVAL;
	}

	if (!(flags & MEMBARRIER_CMD_FLAG_CPU))
		cpu_id = -1;

	switch (cmd) {
	case MEMBARRIER_CMD_QUERY:
	{
		int cmd_mask = MEMBARRIER_CMD_BITMASK;

		if (tick_nohz_full_enabled())
			cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
		return cmd_mask;
	}
	case MEMBARRIER_CMD_GLOBAL:
		/* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
		if (tick_nohz_full_enabled())
			return -EINVAL;
		if (num_online_cpus() > 1)
			synchronize_rcu();
		return 0;
	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
		return membarrier_global_expedited();
	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
		return membarrier_register_global_expedited();
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
		return membarrier_private_expedited(0, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
		return membarrier_register_private_expedited(0);
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
		return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
		return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
	default:
		return -EINVAL;
	}
}
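
Usage illustration (not part of membarrier.c): a minimal user-space sketch of how a process drives the private expedited path implemented above, assuming the uapi header <linux/membarrier.h> for enum membarrier_cmd and invoking the system call through syscall(2). The local membarrier() wrapper and the error handling are illustrative only.

/* Hypothetical user-space example; not part of the kernel source above. */
#include <linux/membarrier.h>	/* enum membarrier_cmd, MEMBARRIER_CMD_* */
#include <stdio.h>
#include <sys/syscall.h>	/* __NR_membarrier */
#include <unistd.h>

static int membarrier(int cmd, unsigned int flags, int cpu_id)
{
	return syscall(__NR_membarrier, cmd, flags, cpu_id);
}

int main(void)
{
	/* Ask the kernel which commands are supported (MEMBARRIER_CMD_QUERY). */
	int mask = membarrier(MEMBARRIER_CMD_QUERY, 0, 0);

	if (mask < 0 || !(mask & MEMBARRIER_CMD_PRIVATE_EXPEDITED)) {
		fprintf(stderr, "private expedited membarrier not supported\n");
		return 1;
	}

	/* Register this process before using the expedited command. */
	if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0)) {
		perror("membarrier register");
		return 1;
	}

	/*
	 * Issue a full memory barrier on every thread of this process that
	 * is running at the time of the call; membarrier_private_expedited()
	 * above sends the IPIs.
	 */
	if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0)) {
		perror("membarrier");
		return 1;
	}

	return 0;
}

The registration step is what sets MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY in the mm via membarrier_register_private_expedited(); without it, membarrier_private_expedited() above returns -EPERM.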