/*
 * RCU expedited grace periods
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, you can access it online at
 * http://www.gnu.org/licenses/gpl-2.0.html.
 *
 * Copyright IBM Corporation, 2016
 *
 * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
 */

23 /* Wrapper functions for expedited grace periods. */
24 static void rcu_exp_gp_seq_start(struct rcu_state
*rsp
)
26 rcu_seq_start(&rsp
->expedited_sequence
);
28 static void rcu_exp_gp_seq_end(struct rcu_state
*rsp
)
30 rcu_seq_end(&rsp
->expedited_sequence
);
31 smp_mb(); /* Ensure that consecutive grace periods serialize. */
33 static unsigned long rcu_exp_gp_seq_snap(struct rcu_state
*rsp
)
37 smp_mb(); /* Caller's modifications seen first by other CPUs. */
38 s
= rcu_seq_snap(&rsp
->expedited_sequence
);
39 trace_rcu_exp_grace_period(rsp
->name
, s
, TPS("snap"));
42 static bool rcu_exp_gp_seq_done(struct rcu_state
*rsp
, unsigned long s
)
44 return rcu_seq_done(&rsp
->expedited_sequence
, s
);
48 * Reset the ->expmaskinit values in the rcu_node tree to reflect any
49 * recent CPU-online activity. Note that these masks are not cleared
50 * when CPUs go offline, so they reflect the union of all CPUs that have
51 * ever been online. This means that this function normally takes its
52 * no-work-to-do fastpath.
54 static void sync_exp_reset_tree_hotplug(struct rcu_state
*rsp
)
59 unsigned long oldmask
;
60 int ncpus
= READ_ONCE(rsp
->ncpus
);
62 struct rcu_node
*rnp_up
;
64 /* If no new CPUs onlined since last time, nothing to do. */
65 if (likely(ncpus
== rsp
->ncpus_snap
))
67 rsp
->ncpus_snap
= ncpus
;
70 * Each pass through the following loop propagates newly onlined
71 * CPUs for the current rcu_node structure up the rcu_node tree.
73 rcu_for_each_leaf_node(rsp
, rnp
) {
74 raw_spin_lock_irqsave_rcu_node(rnp
, flags
);
75 if (rnp
->expmaskinit
== rnp
->expmaskinitnext
) {
76 raw_spin_unlock_irqrestore_rcu_node(rnp
, flags
);
77 continue; /* No new CPUs, nothing to do. */
80 /* Update this node's mask, track old value for propagation. */
81 oldmask
= rnp
->expmaskinit
;
82 rnp
->expmaskinit
= rnp
->expmaskinitnext
;
83 raw_spin_unlock_irqrestore_rcu_node(rnp
, flags
);
85 /* If was already nonzero, nothing to propagate. */
89 /* Propagate the new CPU up the tree. */
94 raw_spin_lock_irqsave_rcu_node(rnp_up
, flags
);
95 if (rnp_up
->expmaskinit
)
97 rnp_up
->expmaskinit
|= mask
;
98 raw_spin_unlock_irqrestore_rcu_node(rnp_up
, flags
);
101 mask
= rnp_up
->grpmask
;
102 rnp_up
= rnp_up
->parent
;
108 * Reset the ->expmask values in the rcu_node tree in preparation for
109 * a new expedited grace period.
111 static void __maybe_unused
sync_exp_reset_tree(struct rcu_state
*rsp
)
114 struct rcu_node
*rnp
;
116 sync_exp_reset_tree_hotplug(rsp
);
117 rcu_for_each_node_breadth_first(rsp
, rnp
) {
118 raw_spin_lock_irqsave_rcu_node(rnp
, flags
);
119 WARN_ON_ONCE(rnp
->expmask
);
120 rnp
->expmask
= rnp
->expmaskinit
;
121 raw_spin_unlock_irqrestore_rcu_node(rnp
, flags
);
126 * Return non-zero if there is no RCU expedited grace period in progress
127 * for the specified rcu_node structure, in other words, if all CPUs and
128 * tasks covered by the specified rcu_node structure have done their bit
129 * for the current expedited grace period. Works only for preemptible
130 * RCU -- other RCU implementation use other means.
132 * Caller must hold the rcu_state's exp_mutex.
134 static int sync_rcu_preempt_exp_done(struct rcu_node
*rnp
)
136 return rnp
->exp_tasks
== NULL
&&
137 READ_ONCE(rnp
->expmask
) == 0;
141 * Report the exit from RCU read-side critical section for the last task
142 * that queued itself during or before the current expedited preemptible-RCU
143 * grace period. This event is reported either to the rcu_node structure on
144 * which the task was queued or to one of that rcu_node structure's ancestors,
145 * recursively up the tree. (Calm down, calm down, we do the recursion
148 * Caller must hold the rcu_state's exp_mutex and the specified rcu_node
149 * structure's ->lock.
151 static void __rcu_report_exp_rnp(struct rcu_state
*rsp
, struct rcu_node
*rnp
,
152 bool wake
, unsigned long flags
)
153 __releases(rnp
->lock
)
158 if (!sync_rcu_preempt_exp_done(rnp
)) {
160 rcu_initiate_boost(rnp
, flags
);
162 raw_spin_unlock_irqrestore_rcu_node(rnp
, flags
);
165 if (rnp
->parent
== NULL
) {
166 raw_spin_unlock_irqrestore_rcu_node(rnp
, flags
);
168 smp_mb(); /* EGP done before wake_up(). */
169 swake_up(&rsp
->expedited_wq
);
174 raw_spin_unlock_rcu_node(rnp
); /* irqs remain disabled */
176 raw_spin_lock_rcu_node(rnp
); /* irqs already disabled */
177 WARN_ON_ONCE(!(rnp
->expmask
& mask
));
178 rnp
->expmask
&= ~mask
;
183 * Report expedited quiescent state for specified node. This is a
184 * lock-acquisition wrapper function for __rcu_report_exp_rnp().
186 * Caller must hold the rcu_state's exp_mutex.
188 static void __maybe_unused
rcu_report_exp_rnp(struct rcu_state
*rsp
,
189 struct rcu_node
*rnp
, bool wake
)
193 raw_spin_lock_irqsave_rcu_node(rnp
, flags
);
194 __rcu_report_exp_rnp(rsp
, rnp
, wake
, flags
);
198 * Report expedited quiescent state for multiple CPUs, all covered by the
199 * specified leaf rcu_node structure. Caller must hold the rcu_state's
202 static void rcu_report_exp_cpu_mult(struct rcu_state
*rsp
, struct rcu_node
*rnp
,
203 unsigned long mask
, bool wake
)
207 raw_spin_lock_irqsave_rcu_node(rnp
, flags
);
208 if (!(rnp
->expmask
& mask
)) {
209 raw_spin_unlock_irqrestore_rcu_node(rnp
, flags
);
212 rnp
->expmask
&= ~mask
;
213 __rcu_report_exp_rnp(rsp
, rnp
, wake
, flags
); /* Releases rnp->lock. */
217 * Report expedited quiescent state for specified rcu_data (CPU).
219 static void rcu_report_exp_rdp(struct rcu_state
*rsp
, struct rcu_data
*rdp
,
222 rcu_report_exp_cpu_mult(rsp
, rdp
->mynode
, rdp
->grpmask
, wake
);
225 /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
226 static bool sync_exp_work_done(struct rcu_state
*rsp
, atomic_long_t
*stat
,
229 if (rcu_exp_gp_seq_done(rsp
, s
)) {
230 trace_rcu_exp_grace_period(rsp
->name
, s
, TPS("done"));
231 /* Ensure test happens before caller kfree(). */
232 smp_mb__before_atomic(); /* ^^^ */
233 atomic_long_inc(stat
);
240 * Funnel-lock acquisition for expedited grace periods. Returns true
241 * if some other task completed an expedited grace period that this task
242 * can piggy-back on, and with no mutex held. Otherwise, returns false
243 * with the mutex held, indicating that the caller must actually do the
244 * expedited grace period.
246 static bool exp_funnel_lock(struct rcu_state
*rsp
, unsigned long s
)
248 struct rcu_data
*rdp
= per_cpu_ptr(rsp
->rda
, raw_smp_processor_id());
249 struct rcu_node
*rnp
= rdp
->mynode
;
250 struct rcu_node
*rnp_root
= rcu_get_root(rsp
);
252 /* Low-contention fastpath. */
253 if (ULONG_CMP_LT(READ_ONCE(rnp
->exp_seq_rq
), s
) &&
255 ULONG_CMP_LT(READ_ONCE(rnp_root
->exp_seq_rq
), s
)) &&
256 !mutex_is_locked(&rsp
->exp_mutex
) &&
257 mutex_trylock(&rsp
->exp_mutex
))
261 * Each pass through the following loop works its way up
262 * the rcu_node tree, returning if others have done the work or
263 * otherwise falls through to acquire rsp->exp_mutex. The mapping
264 * from CPU to rcu_node structure can be inexact, as it is just
265 * promoting locality and is not strictly needed for correctness.
267 for (; rnp
!= NULL
; rnp
= rnp
->parent
) {
268 if (sync_exp_work_done(rsp
, &rdp
->exp_workdone1
, s
))
271 /* Work not done, either wait here or go up. */
272 spin_lock(&rnp
->exp_lock
);
273 if (ULONG_CMP_GE(rnp
->exp_seq_rq
, s
)) {
275 /* Someone else doing GP, so wait for them. */
276 spin_unlock(&rnp
->exp_lock
);
277 trace_rcu_exp_funnel_lock(rsp
->name
, rnp
->level
,
278 rnp
->grplo
, rnp
->grphi
,
280 wait_event(rnp
->exp_wq
[(s
>> 1) & 0x3],
281 sync_exp_work_done(rsp
,
282 &rdp
->exp_workdone2
, s
));
285 rnp
->exp_seq_rq
= s
; /* Followers can wait on us. */
286 spin_unlock(&rnp
->exp_lock
);
287 trace_rcu_exp_funnel_lock(rsp
->name
, rnp
->level
, rnp
->grplo
,
288 rnp
->grphi
, TPS("nxtlvl"));
290 mutex_lock(&rsp
->exp_mutex
);
292 if (sync_exp_work_done(rsp
, &rdp
->exp_workdone3
, s
)) {
293 mutex_unlock(&rsp
->exp_mutex
);
296 rcu_exp_gp_seq_start(rsp
);
297 trace_rcu_exp_grace_period(rsp
->name
, s
, TPS("start"));
301 /* Invoked on each online non-idle CPU for expedited quiescent state. */
302 static void sync_sched_exp_handler(void *data
)
304 struct rcu_data
*rdp
;
305 struct rcu_node
*rnp
;
306 struct rcu_state
*rsp
= data
;
308 rdp
= this_cpu_ptr(rsp
->rda
);
310 if (!(READ_ONCE(rnp
->expmask
) & rdp
->grpmask
) ||
311 __this_cpu_read(rcu_sched_data
.cpu_no_qs
.b
.exp
))
313 if (rcu_is_cpu_rrupt_from_idle()) {
314 rcu_report_exp_rdp(&rcu_sched_state
,
315 this_cpu_ptr(&rcu_sched_data
), true);
318 __this_cpu_write(rcu_sched_data
.cpu_no_qs
.b
.exp
, true);
319 resched_cpu(smp_processor_id());
322 /* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */
323 static void sync_sched_exp_online_cleanup(int cpu
)
325 struct rcu_data
*rdp
;
327 struct rcu_node
*rnp
;
328 struct rcu_state
*rsp
= &rcu_sched_state
;
330 rdp
= per_cpu_ptr(rsp
->rda
, cpu
);
332 if (!(READ_ONCE(rnp
->expmask
) & rdp
->grpmask
))
334 ret
= smp_call_function_single(cpu
, sync_sched_exp_handler
, rsp
, 0);
339 * Select the nodes that the upcoming expedited grace period needs
342 static void sync_rcu_exp_select_cpus(struct rcu_state
*rsp
,
343 smp_call_func_t func
)
347 unsigned long mask_ofl_test
;
348 unsigned long mask_ofl_ipi
;
350 struct rcu_node
*rnp
;
352 sync_exp_reset_tree(rsp
);
353 rcu_for_each_leaf_node(rsp
, rnp
) {
354 raw_spin_lock_irqsave_rcu_node(rnp
, flags
);
356 /* Each pass checks a CPU for identity, offline, and idle. */
358 for_each_leaf_node_possible_cpu(rnp
, cpu
) {
359 struct rcu_data
*rdp
= per_cpu_ptr(rsp
->rda
, cpu
);
360 struct rcu_dynticks
*rdtp
= &per_cpu(rcu_dynticks
, cpu
);
362 if (raw_smp_processor_id() == cpu
||
363 !(atomic_add_return(0, &rdtp
->dynticks
) & 0x1))
364 mask_ofl_test
|= rdp
->grpmask
;
366 mask_ofl_ipi
= rnp
->expmask
& ~mask_ofl_test
;
369 * Need to wait for any blocked tasks as well. Note that
370 * additional blocking tasks will also block the expedited
371 * GP until such time as the ->expmask bits are cleared.
373 if (rcu_preempt_has_tasks(rnp
))
374 rnp
->exp_tasks
= rnp
->blkd_tasks
.next
;
375 raw_spin_unlock_irqrestore_rcu_node(rnp
, flags
);
377 /* IPI the remaining CPUs for expedited quiescent state. */
378 for_each_leaf_node_possible_cpu(rnp
, cpu
) {
379 unsigned long mask
= leaf_node_cpu_bit(rnp
, cpu
);
380 if (!(mask_ofl_ipi
& mask
))
383 ret
= smp_call_function_single(cpu
, func
, rsp
, 0);
385 mask_ofl_ipi
&= ~mask
;
388 /* Failed, raced with offline. */
389 raw_spin_lock_irqsave_rcu_node(rnp
, flags
);
390 if (cpu_online(cpu
) &&
391 (rnp
->expmask
& mask
)) {
392 raw_spin_unlock_irqrestore_rcu_node(rnp
, flags
);
393 schedule_timeout_uninterruptible(1);
394 if (cpu_online(cpu
) &&
395 (rnp
->expmask
& mask
))
397 raw_spin_lock_irqsave_rcu_node(rnp
, flags
);
399 if (!(rnp
->expmask
& mask
))
400 mask_ofl_ipi
&= ~mask
;
401 raw_spin_unlock_irqrestore_rcu_node(rnp
, flags
);
403 /* Report quiescent states for those that went offline. */
404 mask_ofl_test
|= mask_ofl_ipi
;
406 rcu_report_exp_cpu_mult(rsp
, rnp
, mask_ofl_test
, false);
410 static void synchronize_sched_expedited_wait(struct rcu_state
*rsp
)
413 unsigned long jiffies_stall
;
414 unsigned long jiffies_start
;
417 struct rcu_node
*rnp
;
418 struct rcu_node
*rnp_root
= rcu_get_root(rsp
);
421 jiffies_stall
= rcu_jiffies_till_stall_check();
422 jiffies_start
= jiffies
;
425 ret
= swait_event_timeout(
427 sync_rcu_preempt_exp_done(rnp_root
),
429 if (ret
> 0 || sync_rcu_preempt_exp_done(rnp_root
))
432 /* Hit a signal, disable CPU stall warnings. */
433 swait_event(rsp
->expedited_wq
,
434 sync_rcu_preempt_exp_done(rnp_root
));
437 pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
440 rcu_for_each_leaf_node(rsp
, rnp
) {
441 ndetected
+= rcu_print_task_exp_stall(rnp
);
442 for_each_leaf_node_possible_cpu(rnp
, cpu
) {
443 struct rcu_data
*rdp
;
445 mask
= leaf_node_cpu_bit(rnp
, cpu
);
446 if (!(rnp
->expmask
& mask
))
449 rdp
= per_cpu_ptr(rsp
->rda
, cpu
);
450 pr_cont(" %d-%c%c%c", cpu
,
451 "O."[!!cpu_online(cpu
)],
452 "o."[!!(rdp
->grpmask
& rnp
->expmaskinit
)],
453 "N."[!!(rdp
->grpmask
& rnp
->expmaskinitnext
)]);
456 pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
457 jiffies
- jiffies_start
, rsp
->expedited_sequence
,
458 rnp_root
->expmask
, ".T"[!!rnp_root
->exp_tasks
]);
460 pr_err("blocking rcu_node structures:");
461 rcu_for_each_node_breadth_first(rsp
, rnp
) {
463 continue; /* printed unconditionally */
464 if (sync_rcu_preempt_exp_done(rnp
))
466 pr_cont(" l=%u:%d-%d:%#lx/%c",
467 rnp
->level
, rnp
->grplo
, rnp
->grphi
,
469 ".T"[!!rnp
->exp_tasks
]);
473 rcu_for_each_leaf_node(rsp
, rnp
) {
474 for_each_leaf_node_possible_cpu(rnp
, cpu
) {
475 mask
= leaf_node_cpu_bit(rnp
, cpu
);
476 if (!(rnp
->expmask
& mask
))
481 jiffies_stall
= 3 * rcu_jiffies_till_stall_check() + 3;
486 * Wait for the current expedited grace period to complete, and then
487 * wake up everyone who piggybacked on the just-completed expedited
488 * grace period. Also update all the ->exp_seq_rq counters as needed
489 * in order to avoid counter-wrap problems.
491 static void rcu_exp_wait_wake(struct rcu_state
*rsp
, unsigned long s
)
493 struct rcu_node
*rnp
;
495 synchronize_sched_expedited_wait(rsp
);
496 rcu_exp_gp_seq_end(rsp
);
497 trace_rcu_exp_grace_period(rsp
->name
, s
, TPS("end"));
500 * Switch over to wakeup mode, allowing the next GP, but -only- the
501 * next GP, to proceed.
503 mutex_lock(&rsp
->exp_wake_mutex
);
504 mutex_unlock(&rsp
->exp_mutex
);
506 rcu_for_each_node_breadth_first(rsp
, rnp
) {
507 if (ULONG_CMP_LT(READ_ONCE(rnp
->exp_seq_rq
), s
)) {
508 spin_lock(&rnp
->exp_lock
);
509 /* Recheck, avoid hang in case someone just arrived. */
510 if (ULONG_CMP_LT(rnp
->exp_seq_rq
, s
))
512 spin_unlock(&rnp
->exp_lock
);
514 wake_up_all(&rnp
->exp_wq
[(rsp
->expedited_sequence
>> 1) & 0x3]);
516 trace_rcu_exp_grace_period(rsp
->name
, s
, TPS("endwake"));
517 mutex_unlock(&rsp
->exp_wake_mutex
);
521 * synchronize_sched_expedited - Brute-force RCU-sched grace period
523 * Wait for an RCU-sched grace period to elapse, but use a "big hammer"
524 * approach to force the grace period to end quickly. This consumes
525 * significant time on all CPUs and is unfriendly to real-time workloads,
526 * so is thus not recommended for any sort of common-case code. In fact,
527 * if you are using synchronize_sched_expedited() in a loop, please
528 * restructure your code to batch your updates, and then use a single
529 * synchronize_sched() instead.
531 * This implementation can be thought of as an application of sequence
532 * locking to expedited grace periods, but using the sequence counter to
533 * determine when someone else has already done the work instead of for
536 void synchronize_sched_expedited(void)
539 struct rcu_state
*rsp
= &rcu_sched_state
;
541 /* If only one CPU, this is automatically a grace period. */
542 if (rcu_blocking_is_gp())
545 /* If expedited grace periods are prohibited, fall back to normal. */
546 if (rcu_gp_is_normal()) {
547 wait_rcu_gp(call_rcu_sched
);
551 /* Take a snapshot of the sequence number. */
552 s
= rcu_exp_gp_seq_snap(rsp
);
553 if (exp_funnel_lock(rsp
, s
))
554 return; /* Someone else did our work for us. */
556 /* Initialize the rcu_node tree in preparation for the wait. */
557 sync_rcu_exp_select_cpus(rsp
, sync_sched_exp_handler
);
559 /* Wait and clean up, including waking everyone. */
560 rcu_exp_wait_wake(rsp
, s
);
562 EXPORT_SYMBOL_GPL(synchronize_sched_expedited
);
564 #ifdef CONFIG_PREEMPT_RCU
567 * Remote handler for smp_call_function_single(). If there is an
568 * RCU read-side critical section in effect, request that the
569 * next rcu_read_unlock() record the quiescent state up the
570 * ->expmask fields in the rcu_node tree. Otherwise, immediately
571 * report the quiescent state.
573 static void sync_rcu_exp_handler(void *info
)
575 struct rcu_data
*rdp
;
576 struct rcu_state
*rsp
= info
;
577 struct task_struct
*t
= current
;
580 * Within an RCU read-side critical section, request that the next
581 * rcu_read_unlock() report. Unless this RCU read-side critical
582 * section has already blocked, in which case it is already set
583 * up for the expedited grace period to wait on it.
585 if (t
->rcu_read_lock_nesting
> 0 &&
586 !t
->rcu_read_unlock_special
.b
.blocked
) {
587 t
->rcu_read_unlock_special
.b
.exp_need_qs
= true;
592 * We are either exiting an RCU read-side critical section (negative
593 * values of t->rcu_read_lock_nesting) or are not in one at all
594 * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU
595 * read-side critical section that blocked before this expedited
596 * grace period started. Either way, we can immediately report
597 * the quiescent state.
599 rdp
= this_cpu_ptr(rsp
->rda
);
600 rcu_report_exp_rdp(rsp
, rdp
, true);
604 * synchronize_rcu_expedited - Brute-force RCU grace period
606 * Wait for an RCU-preempt grace period, but expedite it. The basic
607 * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler
608 * checks whether the CPU is in an RCU-preempt critical section, and
609 * if so, it sets a flag that causes the outermost rcu_read_unlock()
610 * to report the quiescent state. On the other hand, if the CPU is
611 * not in an RCU read-side critical section, the IPI handler reports
612 * the quiescent state immediately.
614 * Although this is a greate improvement over previous expedited
615 * implementations, it is still unfriendly to real-time workloads, so is
616 * thus not recommended for any sort of common-case code. In fact, if
617 * you are using synchronize_rcu_expedited() in a loop, please restructure
618 * your code to batch your updates, and then Use a single synchronize_rcu()
621 void synchronize_rcu_expedited(void)
623 struct rcu_state
*rsp
= rcu_state_p
;
626 /* If expedited grace periods are prohibited, fall back to normal. */
627 if (rcu_gp_is_normal()) {
628 wait_rcu_gp(call_rcu
);
632 s
= rcu_exp_gp_seq_snap(rsp
);
633 if (exp_funnel_lock(rsp
, s
))
634 return; /* Someone else did our work for us. */
636 /* Initialize the rcu_node tree in preparation for the wait. */
637 sync_rcu_exp_select_cpus(rsp
, sync_rcu_exp_handler
);
639 /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */
640 rcu_exp_wait_wake(rsp
, s
);
642 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited
);
644 #else /* #ifdef CONFIG_PREEMPT_RCU */
647 * Wait for an rcu-preempt grace period, but make it happen quickly.
648 * But because preemptible RCU does not exist, map to rcu-sched.
650 void synchronize_rcu_expedited(void)
652 synchronize_sched_expedited();
654 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited
);
656 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */