// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * membarrier system call
 */
#include "sched.h"

/*
 * Bitmask made from an "or" of all commands within enum membarrier_cmd,
 * except MEMBARRIER_CMD_QUERY.
 */
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
        (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \
        | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
#endif

#ifdef CONFIG_RSEQ
#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK \
        (MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ \
        | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ)
#else
#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK 0
#endif

#define MEMBARRIER_CMD_BITMASK \
        (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
        | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
        | MEMBARRIER_CMD_PRIVATE_EXPEDITED \
        | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \
        | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
        | MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK)
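
/*
 * Illustrative userspace sketch (not part of this file; assumes the
 * <linux/membarrier.h> uapi header and the syscall(2) wrapper): the mask
 * built above is what MEMBARRIER_CMD_QUERY reports, so a process would
 * typically probe for support before relying on a command:
 *
 *	int mask = syscall(__NR_membarrier, MEMBARRIER_CMD_QUERY, 0, 0);
 *	int has_private = mask > 0 && (mask & MEMBARRIER_CMD_PRIVATE_EXPEDITED);
 *
 * On nohz_full kernels, MEMBARRIER_CMD_GLOBAL is cleared from the reported
 * mask (see the MEMBARRIER_CMD_QUERY case in sys_membarrier() below).
 */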

static void ipi_mb(void *info)
{
        smp_mb();       /* IPIs should be serializing but paranoid. */
}

static void ipi_rseq(void *info)
{
        rseq_preempt(current);
}

static void ipi_sync_rq_state(void *info)
{
        struct mm_struct *mm = (struct mm_struct *) info;

        if (current->mm != mm)
                return;
        this_cpu_write(runqueues.membarrier_state,
                       atomic_read(&mm->membarrier_state));
        /*
         * Issue a memory barrier after setting
         * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
         * guarantee that no memory access following registration is reordered
         * before registration.
         */
        smp_mb();
}

void membarrier_exec_mmap(struct mm_struct *mm)
{
        /*
         * Issue a memory barrier before clearing membarrier_state to
         * guarantee that no memory access prior to exec is reordered after
         * clearing this state.
         */
        smp_mb();
        atomic_set(&mm->membarrier_state, 0);
        /*
         * Keep the runqueue membarrier_state in sync with this mm
         * membarrier_state.
         */
        this_cpu_write(runqueues.membarrier_state, 0);
}

void membarrier_update_current_mm(struct mm_struct *next_mm)
{
        struct rq *rq = this_rq();
        int membarrier_state = 0;

        if (next_mm)
                membarrier_state = atomic_read(&next_mm->membarrier_state);
        if (READ_ONCE(rq->membarrier_state) == membarrier_state)
                return;
        WRITE_ONCE(rq->membarrier_state, membarrier_state);
}

static int membarrier_global_expedited(void)
{
        int cpu;
        cpumask_var_t tmpmask;

        if (num_online_cpus() == 1)
                return 0;

        /*
         * Matches memory barriers around rq->curr modification in
         * scheduler.
         */
        smp_mb();       /* system call entry is not a mb. */

        if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
                return -ENOMEM;

        cpus_read_lock();
        rcu_read_lock();
        for_each_online_cpu(cpu) {
                struct task_struct *p;

                /*
                 * Skipping the current CPU is OK even though we can be
                 * migrated at any point. The current CPU, at the point
                 * where we read raw_smp_processor_id(), is ensured to
                 * be in program order with respect to the caller
                 * thread. Therefore, we can skip this CPU from the
                 * iteration.
                 */
                if (cpu == raw_smp_processor_id())
                        continue;

                if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
                    MEMBARRIER_STATE_GLOBAL_EXPEDITED))
                        continue;

                /*
                 * Skip the CPU if it runs a kernel thread which is not using
                 * a task mm.
                 */
                p = rcu_dereference(cpu_rq(cpu)->curr);
                if (!p->mm)
                        continue;

                __cpumask_set_cpu(cpu, tmpmask);
        }
        rcu_read_unlock();

        preempt_disable();
        smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
        preempt_enable();

        free_cpumask_var(tmpmask);
        cpus_read_unlock();

        /*
         * Memory barrier on the caller thread _after_ we finished
         * waiting for the last IPI. Matches memory barriers around
         * rq->curr modification in scheduler.
         */
        smp_mb();       /* exit from system call is not a mb */
        return 0;
}
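
/*
 * Minimal usage sketch (illustrative, not kernel code) of the asymmetric
 * fence pattern this command enables, with x, y, rx, ry as volatile ints
 * initially 0. A process registered with
 * MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED (registration expresses the
 * intent to receive these barriers) keeps only a compiler barrier on its
 * fast path, while the updater pays for the ordering once on its slow path:
 *
 *	thread A (fast path, registered process):
 *		x = 1;
 *		asm volatile("" ::: "memory");	// compiler barrier only
 *		ry = y;
 *
 *	thread B (slow path, any process):
 *		y = 1;
 *		syscall(__NR_membarrier, MEMBARRIER_CMD_GLOBAL_EXPEDITED, 0, 0);
 *		rx = x;
 *
 * The outcome rx == 0 && ry == 0 is forbidden, as if both sides had used
 * smp_mb(); this is the barrier()/sys_membarrier() "O" entry in the
 * ordering table of the sys_membarrier() kernel-doc below.
 */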

static int membarrier_private_expedited(int flags, int cpu_id)
{
        cpumask_var_t tmpmask;
        struct mm_struct *mm = current->mm;
        smp_call_func_t ipi_func = ipi_mb;

        if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
                if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
                        return -EINVAL;
                if (!(atomic_read(&mm->membarrier_state) &
                      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
                        return -EPERM;
        } else if (flags == MEMBARRIER_FLAG_RSEQ) {
                if (!IS_ENABLED(CONFIG_RSEQ))
                        return -EINVAL;
                if (!(atomic_read(&mm->membarrier_state) &
                      MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY))
                        return -EPERM;
                ipi_func = ipi_rseq;
        } else {
                WARN_ON_ONCE(flags);
                if (!(atomic_read(&mm->membarrier_state) &
                      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
                        return -EPERM;
        }

        if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
                return 0;

        /*
         * Matches memory barriers around rq->curr modification in
         * scheduler.
         */
        smp_mb();       /* system call entry is not a mb. */

        if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
                return -ENOMEM;

        cpus_read_lock();

        if (cpu_id >= 0) {
                struct task_struct *p;

                if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
                        goto out;
                if (cpu_id == raw_smp_processor_id())
                        goto out;
                rcu_read_lock();
                p = rcu_dereference(cpu_rq(cpu_id)->curr);
                if (!p || p->mm != mm) {
                        rcu_read_unlock();
                        goto out;
                }
                rcu_read_unlock();
        } else {
                int cpu;

                rcu_read_lock();
                for_each_online_cpu(cpu) {
                        struct task_struct *p;

                        /*
                         * Skipping the current CPU is OK even though we can
                         * be migrated at any point. The current CPU, at the
                         * point where we read raw_smp_processor_id(), is
                         * ensured to be in program order with respect to the
                         * caller thread. Therefore, we can skip this CPU from
                         * the iteration.
                         */
                        if (cpu == raw_smp_processor_id())
                                continue;
                        p = rcu_dereference(cpu_rq(cpu)->curr);
                        if (p && p->mm == mm)
                                __cpumask_set_cpu(cpu, tmpmask);
                }
                rcu_read_unlock();
        }

        preempt_disable();
        if (cpu_id >= 0)
                smp_call_function_single(cpu_id, ipi_func, NULL, 1);
        else
                smp_call_function_many(tmpmask, ipi_func, NULL, 1);
        preempt_enable();

out:
        if (cpu_id < 0)
                free_cpumask_var(tmpmask);
        cpus_read_unlock();

        /*
         * Memory barrier on the caller thread _after_ we finished
         * waiting for the last IPI. Matches memory barriers around
         * rq->curr modification in scheduler.
         */
        smp_mb();       /* exit from system call is not a mb */

        return 0;
}
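
/*
 * Illustrative userspace sketch (not kernel code) of the cpu_id path above:
 * after registering the rseq flavour, a thread can restart the rseq critical
 * section currently running on one specific CPU of its own process instead
 * of interrupting every CPU:
 *
 *	syscall(__NR_membarrier,
 *		MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, 0, 0);
 *	...
 *	syscall(__NR_membarrier, MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
 *		MEMBARRIER_CMD_FLAG_CPU, target_cpu);
 *
 * target_cpu is a placeholder for the CPU to interrupt; without
 * MEMBARRIER_CMD_FLAG_CPU, the command IPIs every online CPU currently
 * running a thread of the calling process.
 */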

static int sync_runqueues_membarrier_state(struct mm_struct *mm)
{
        int membarrier_state = atomic_read(&mm->membarrier_state);
        cpumask_var_t tmpmask;
        int cpu;

        if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
                this_cpu_write(runqueues.membarrier_state, membarrier_state);

                /*
                 * For a single mm user, we can simply issue a memory barrier
                 * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
                 * mm and in the current runqueue to guarantee that no memory
                 * access following registration is reordered before
                 * registration.
                 */
                smp_mb();
                return 0;
        }

        if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
                return -ENOMEM;

        /*
         * For an mm with multiple users, we need to ensure all future
         * scheduler executions will observe @mm's new membarrier
         * state.
         */
        synchronize_rcu();

        /*
         * For each cpu runqueue, if the task's mm matches @mm, ensure that
         * all of @mm's membarrier state set bits are also set in the
         * runqueue's membarrier state. This ensures that a runqueue
         * scheduling between threads which are users of @mm has its
         * membarrier state updated.
         */
        cpus_read_lock();
        rcu_read_lock();
        for_each_online_cpu(cpu) {
                struct rq *rq = cpu_rq(cpu);
                struct task_struct *p;

                p = rcu_dereference(rq->curr);
                if (p && p->mm == mm)
                        __cpumask_set_cpu(cpu, tmpmask);
        }
        rcu_read_unlock();

        preempt_disable();
        smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
        preempt_enable();

        free_cpumask_var(tmpmask);
        cpus_read_unlock();

        return 0;
}

static int membarrier_register_global_expedited(void)
{
        struct task_struct *p = current;
        struct mm_struct *mm = p->mm;
        int ret;

        if (atomic_read(&mm->membarrier_state) &
            MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
                return 0;
        atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
        ret = sync_runqueues_membarrier_state(mm);
        if (ret)
                return ret;
        atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
                  &mm->membarrier_state);

        return 0;
}

static int membarrier_register_private_expedited(int flags)
{
        struct task_struct *p = current;
        struct mm_struct *mm = p->mm;
        int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
            set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
            ret;

        if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
                if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
                        return -EINVAL;
                ready_state =
                        MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
        } else if (flags == MEMBARRIER_FLAG_RSEQ) {
                if (!IS_ENABLED(CONFIG_RSEQ))
                        return -EINVAL;
                ready_state =
                        MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY;
        } else {
                WARN_ON_ONCE(flags);
        }

        /*
         * We need to consider threads belonging to different thread
         * groups, which use the same mm. (CLONE_VM but not
         * CLONE_THREAD).
         */
        if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
                return 0;
        if (flags & MEMBARRIER_FLAG_SYNC_CORE)
                set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
        if (flags & MEMBARRIER_FLAG_RSEQ)
                set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ;
        atomic_or(set_state, &mm->membarrier_state);
        ret = sync_runqueues_membarrier_state(mm);
        if (ret)
                return ret;
        atomic_or(ready_state, &mm->membarrier_state);

        return 0;
}
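
/*
 * Illustrative userspace sketch (not kernel code) of the register-then-use
 * protocol enforced by the *_READY bits above:
 *
 *	syscall(__NR_membarrier, MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0);
 *	...
 *	syscall(__NR_membarrier, MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0);
 *
 * Issuing an expedited command before the matching registration fails with
 * -EPERM (see membarrier_private_expedited()), and registration does not
 * survive execve() since membarrier_exec_mmap() clears membarrier_state.
 */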

/**
 * sys_membarrier - issue memory barriers on a set of threads
 * @cmd:    Takes command values defined in enum membarrier_cmd.
 * @flags:  Currently needs to be 0 for all commands other than
 *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter
 *          case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id
 *          contains the CPU on which to interrupt (= restart)
 *          the RSEQ critical section.
 * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which
 *          the RSEQ critical section should be interrupted (@cmd must be
 *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ).
 *
 * If this system call is not implemented, -ENOSYS is returned. If the
 * command specified does not exist, is not available on the running
 * kernel, or if the command argument is invalid, this system call
 * returns -EINVAL. For a given command, with the flags argument set to 0,
 * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
 * always return the same value until reboot. In addition, it can return
 * -ENOMEM if there is not enough memory available to perform the system
 * call.
 *
 * All memory accesses performed in program order from each targeted thread
 * are guaranteed to be ordered with respect to sys_membarrier(). If we use
 * the semantic "barrier()" to represent a compiler barrier forcing memory
 * accesses to be performed in program order across the barrier, and
 * smp_mb() to represent explicit memory barriers forcing full memory
 * ordering across the barrier, we have the following ordering table for
 * each pair of barrier(), sys_membarrier() and smp_mb():
 *
 * The pair ordering is detailed as (O: ordered, X: not ordered):
 *
 *                        barrier()   smp_mb() sys_membarrier()
 *        barrier()          X           X            O
 *        smp_mb()           X           O            O
 *        sys_membarrier()   O           O            O
 */
SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
{
        switch (cmd) {
        case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
                if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU))
                        return -EINVAL;
                break;
        default:
                if (unlikely(flags))
                        return -EINVAL;
        }

        if (!(flags & MEMBARRIER_CMD_FLAG_CPU))
                cpu_id = -1;

        switch (cmd) {
        case MEMBARRIER_CMD_QUERY:
        {
                int cmd_mask = MEMBARRIER_CMD_BITMASK;

                if (tick_nohz_full_enabled())
                        cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
                return cmd_mask;
        }
        case MEMBARRIER_CMD_GLOBAL:
                /* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
                if (tick_nohz_full_enabled())
                        return -EINVAL;
                if (num_online_cpus() > 1)
                        synchronize_rcu();
                return 0;
        case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
                return membarrier_global_expedited();
        case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
                return membarrier_register_global_expedited();
        case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
                return membarrier_private_expedited(0, cpu_id);
        case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
                return membarrier_register_private_expedited(0);
        case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
                return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id);
        case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
                return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
        case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
                return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
        case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
                return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
        default:
                return -EINVAL;
        }
}