]> git.proxmox.com Git - mirror_ubuntu-kernels.git/blame - kernel/sched/fair.c
selftests/rseq: check if libc rseq support is registered
[mirror_ubuntu-kernels.git] / kernel / sched / fair.c
CommitLineData
b2441318 1// SPDX-License-Identifier: GPL-2.0
bf0f6f24
IM
2/*
3 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
4 *
5 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
6 *
7 * Interactivity improvements by Mike Galbraith
8 * (C) 2007 Mike Galbraith <efault@gmx.de>
9 *
10 * Various enhancements by Dmitry Adamushko.
11 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
12 *
13 * Group scheduling enhancements by Srivatsa Vaddagiri
14 * Copyright IBM Corporation, 2007
15 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
16 *
17 * Scaled math optimizations by Thomas Gleixner
18 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
21805085
PZ
19 *
20 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
90eec103 21 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
bf0f6f24 22 */
c4ad6fcb
IM
23#include <linux/energy_model.h>
24#include <linux/mmap_lock.h>
25#include <linux/hugetlb_inline.h>
26#include <linux/jiffies.h>
27#include <linux/mm_api.h>
28#include <linux/highmem.h>
29#include <linux/spinlock_api.h>
30#include <linux/cpumask_api.h>
31#include <linux/lockdep_api.h>
32#include <linux/softirq.h>
33#include <linux/refcount_api.h>
34#include <linux/topology.h>
35#include <linux/sched/clock.h>
36#include <linux/sched/cond_resched.h>
37#include <linux/sched/cputime.h>
38#include <linux/sched/isolation.h>
d664e399 39#include <linux/sched/nohz.h>
c4ad6fcb
IM
40
41#include <linux/cpuidle.h>
42#include <linux/interrupt.h>
43#include <linux/mempolicy.h>
44#include <linux/mutex_api.h>
45#include <linux/profile.h>
46#include <linux/psi.h>
47#include <linux/ratelimit.h>
1930a6e7 48#include <linux/task_work.h>
c4ad6fcb
IM
49
50#include <asm/switch_to.h>
51
52#include <linux/sched/cond_resched.h>
53
325ea10c 54#include "sched.h"
b9e9c6ca
IM
55#include "stats.h"
56#include "autogroup.h"
029632fb 57
bf0f6f24 58/*
21805085 59 * Targeted preemption latency for CPU-bound tasks:
bf0f6f24 60 *
21805085 61 * NOTE: this latency value is not the same as the concept of
d274a4ce
IM
62 * 'timeslice length' - timeslices in CFS are of variable length
63 * and have no persistent notion like in traditional, time-slice
64 * based scheduling concepts.
bf0f6f24 65 *
d274a4ce
IM
66 * (to see the precise effective timeslice length of your workload,
67 * run vmstat and monitor the context-switches (cs) field)
2b4d5b25
IM
68 *
69 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
bf0f6f24 70 */
2b4d5b25 71unsigned int sysctl_sched_latency = 6000000ULL;
ed8885a1 72static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
2bd8e6d4 73
1983a922
CE
74/*
75 * The initial- and re-scaling of tunables is configurable
1983a922
CE
76 *
77 * Options are:
2b4d5b25
IM
78 *
79 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
80 * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
81 * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
82 *
83 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
1983a922 84 */
8a99b683 85unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
1983a922 86
2bd8e6d4 87/*
b2be5e96 88 * Minimal preemption granularity for CPU-bound tasks:
2b4d5b25 89 *
864616ee 90 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
2bd8e6d4 91 */
ed8885a1
MS
92unsigned int sysctl_sched_min_granularity = 750000ULL;
93static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
21805085 94
51ce83ed
JD
95/*
96 * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks.
97 * Applies only when SCHED_IDLE tasks compete with normal tasks.
98 *
99 * (default: 0.75 msec)
100 */
101unsigned int sysctl_sched_idle_min_granularity = 750000ULL;
102
21805085 103/*
2b4d5b25 104 * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
b2be5e96 105 */
0bf377bb 106static unsigned int sched_nr_latency = 8;
b2be5e96
PZ
107
108/*
2bba22c5 109 * After fork, child runs first. If set to 0 (default) then
b2be5e96 110 * parent will (try to) run first.
21805085 111 */
2bba22c5 112unsigned int sysctl_sched_child_runs_first __read_mostly;
bf0f6f24 113
bf0f6f24
IM
114/*
115 * SCHED_OTHER wake-up granularity.
bf0f6f24
IM
116 *
117 * This option delays the preemption effects of decoupled workloads
118 * and reduces their over-scheduling. Synchronous workloads will still
119 * have immediate wakeup/sleep latencies.
2b4d5b25
IM
120 *
121 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
bf0f6f24 122 */
ed8885a1
MS
123unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
124static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
bf0f6f24 125
2b4d5b25 126const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
da84d961 127
05289b90
TG
128int sched_thermal_decay_shift;
129static int __init setup_sched_thermal_decay_shift(char *str)
130{
131 int _shift = 0;
132
133 if (kstrtoint(str, 0, &_shift))
134 pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");
135
136 sched_thermal_decay_shift = clamp(_shift, 0, 10);
137 return 1;
138}
139__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
140
afe06efd
TC
141#ifdef CONFIG_SMP
142/*
97fb7a0a 143 * For asym packing, by default the lower numbered CPU has higher priority.
afe06efd
TC
144 */
145int __weak arch_asym_cpu_priority(int cpu)
146{
147 return -cpu;
148}
6d101ba6
OJ
149
150/*
60e17f5c 151 * The margin used when comparing utilization with CPU capacity.
6d101ba6
OJ
152 *
153 * (default: ~20%)
154 */
60e17f5c
VK
155#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
156
4aed8aa4
VS
157/*
158 * The margin used when comparing CPU capacities.
159 * is 'cap1' noticeably greater than 'cap2'
160 *
161 * (default: ~5%)
162 */
163#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
afe06efd
TC
164#endif
165
ec12cb7f
PT
166#ifdef CONFIG_CFS_BANDWIDTH
167/*
168 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
169 * each time a cfs_rq requests quota.
170 *
171 * Note: in the case that the slice exceeds the runtime remaining (either due
172 * to consumption or the quota being specified to be smaller than the slice)
173 * we will always only issue the remaining available time.
174 *
2b4d5b25
IM
175 * (default: 5 msec, units: microseconds)
176 */
d4ae80ff
ZN
177static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
178#endif
179
180#ifdef CONFIG_SYSCTL
181static struct ctl_table sched_fair_sysctls[] = {
182 {
183 .procname = "sched_child_runs_first",
184 .data = &sysctl_sched_child_runs_first,
185 .maxlen = sizeof(unsigned int),
186 .mode = 0644,
187 .proc_handler = proc_dointvec,
188 },
189#ifdef CONFIG_CFS_BANDWIDTH
190 {
191 .procname = "sched_cfs_bandwidth_slice_us",
192 .data = &sysctl_sched_cfs_bandwidth_slice,
193 .maxlen = sizeof(unsigned int),
194 .mode = 0644,
195 .proc_handler = proc_dointvec_minmax,
196 .extra1 = SYSCTL_ONE,
197 },
198#endif
199 {}
200};
201
202static int __init sched_fair_sysctl_init(void)
203{
204 register_sysctl_init("kernel", sched_fair_sysctls);
205 return 0;
206}
207late_initcall(sched_fair_sysctl_init);
ec12cb7f
PT
208#endif
209
8527632d
PG
210static inline void update_load_add(struct load_weight *lw, unsigned long inc)
211{
212 lw->weight += inc;
213 lw->inv_weight = 0;
214}
215
216static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
217{
218 lw->weight -= dec;
219 lw->inv_weight = 0;
220}
221
222static inline void update_load_set(struct load_weight *lw, unsigned long w)
223{
224 lw->weight = w;
225 lw->inv_weight = 0;
226}
227
029632fb
PZ
228/*
229 * Increase the granularity value when there are more CPUs,
230 * because with more CPUs the 'effective latency' as visible
231 * to users decreases. But the relationship is not linear,
232 * so pick a second-best guess by going with the log2 of the
233 * number of CPUs.
234 *
235 * This idea comes from the SD scheduler of Con Kolivas:
236 */
58ac93e4 237static unsigned int get_update_sysctl_factor(void)
029632fb 238{
58ac93e4 239 unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
029632fb
PZ
240 unsigned int factor;
241
242 switch (sysctl_sched_tunable_scaling) {
243 case SCHED_TUNABLESCALING_NONE:
244 factor = 1;
245 break;
246 case SCHED_TUNABLESCALING_LINEAR:
247 factor = cpus;
248 break;
249 case SCHED_TUNABLESCALING_LOG:
250 default:
251 factor = 1 + ilog2(cpus);
252 break;
253 }
254
255 return factor;
256}
257
258static void update_sysctl(void)
259{
260 unsigned int factor = get_update_sysctl_factor();
261
262#define SET_SYSCTL(name) \
263 (sysctl_##name = (factor) * normalized_sysctl_##name)
264 SET_SYSCTL(sched_min_granularity);
265 SET_SYSCTL(sched_latency);
266 SET_SYSCTL(sched_wakeup_granularity);
267#undef SET_SYSCTL
268}
269
f38f12d1 270void __init sched_init_granularity(void)
029632fb
PZ
271{
272 update_sysctl();
273}
274
9dbdb155 275#define WMULT_CONST (~0U)
029632fb
PZ
276#define WMULT_SHIFT 32
277
9dbdb155
PZ
278static void __update_inv_weight(struct load_weight *lw)
279{
280 unsigned long w;
281
282 if (likely(lw->inv_weight))
283 return;
284
285 w = scale_load_down(lw->weight);
286
287 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
288 lw->inv_weight = 1;
289 else if (unlikely(!w))
290 lw->inv_weight = WMULT_CONST;
291 else
292 lw->inv_weight = WMULT_CONST / w;
293}
029632fb
PZ
294
295/*
9dbdb155
PZ
296 * delta_exec * weight / lw.weight
297 * OR
298 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
299 *
1c3de5e1 300 * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
9dbdb155
PZ
301 * we're guaranteed shift stays positive because inv_weight is guaranteed to
302 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
303 *
304 * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
305 * weight/lw.weight <= 1, and therefore our shift will also be positive.
029632fb 306 */
9dbdb155 307static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
029632fb 308{
9dbdb155 309 u64 fact = scale_load_down(weight);
1e17fb8e 310 u32 fact_hi = (u32)(fact >> 32);
9dbdb155 311 int shift = WMULT_SHIFT;
1e17fb8e 312 int fs;
029632fb 313
9dbdb155 314 __update_inv_weight(lw);
029632fb 315
1e17fb8e
CC
316 if (unlikely(fact_hi)) {
317 fs = fls(fact_hi);
318 shift -= fs;
319 fact >>= fs;
029632fb
PZ
320 }
321
2eeb01a2 322 fact = mul_u32_u32(fact, lw->inv_weight);
029632fb 323
1e17fb8e
CC
324 fact_hi = (u32)(fact >> 32);
325 if (fact_hi) {
326 fs = fls(fact_hi);
327 shift -= fs;
328 fact >>= fs;
9dbdb155 329 }
029632fb 330
9dbdb155 331 return mul_u64_u32_shr(delta_exec, fact, shift);
029632fb
PZ
332}
333
334
335const struct sched_class fair_sched_class;
a4c2f00f 336
bf0f6f24
IM
337/**************************************************************
338 * CFS operations on generic schedulable entities:
339 */
340
62160e3f 341#ifdef CONFIG_FAIR_GROUP_SCHED
8f48894f 342
b758149c
PZ
343/* Walk up scheduling entities hierarchy */
344#define for_each_sched_entity(se) \
345 for (; se; se = se->parent)
346
f6783319 347static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
3d4b47b4 348{
5d299eab
PZ
349 struct rq *rq = rq_of(cfs_rq);
350 int cpu = cpu_of(rq);
351
352 if (cfs_rq->on_list)
f6783319 353 return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
5d299eab
PZ
354
355 cfs_rq->on_list = 1;
356
357 /*
358 * Ensure we either appear before our parent (if already
359 * enqueued) or force our parent to appear after us when it is
360 * enqueued. The fact that we always enqueue bottom-up
361 * reduces this to two cases and a special case for the root
362 * cfs_rq. Furthermore, it also means that we will always reset
363 * tmp_alone_branch either when the branch is connected
364 * to a tree or when we reach the top of the tree
365 */
366 if (cfs_rq->tg->parent &&
367 cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
67e86250 368 /*
5d299eab
PZ
369 * If parent is already on the list, we add the child
370 * just before. Thanks to circular linked property of
371 * the list, this means to put the child at the tail
372 * of the list that starts by parent.
67e86250 373 */
5d299eab
PZ
374 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
375 &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
376 /*
377 * The branch is now connected to its tree so we can
378 * reset tmp_alone_branch to the beginning of the
379 * list.
380 */
381 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
f6783319 382 return true;
5d299eab 383 }
3d4b47b4 384
5d299eab
PZ
385 if (!cfs_rq->tg->parent) {
386 /*
387 * cfs rq without parent should be put
388 * at the tail of the list.
389 */
390 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
391 &rq->leaf_cfs_rq_list);
392 /*
393 * We have reach the top of a tree so we can reset
394 * tmp_alone_branch to the beginning of the list.
395 */
396 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
f6783319 397 return true;
3d4b47b4 398 }
5d299eab
PZ
399
400 /*
401 * The parent has not already been added so we want to
402 * make sure that it will be put after us.
403 * tmp_alone_branch points to the begin of the branch
404 * where we will add parent.
405 */
406 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
407 /*
408 * update tmp_alone_branch to points to the new begin
409 * of the branch
410 */
411 rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
f6783319 412 return false;
3d4b47b4
PZ
413}
414
415static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
416{
417 if (cfs_rq->on_list) {
31bc6aea
VG
418 struct rq *rq = rq_of(cfs_rq);
419
420 /*
421 * With cfs_rq being unthrottled/throttled during an enqueue,
422 * it can happen the tmp_alone_branch points the a leaf that
423 * we finally want to del. In this case, tmp_alone_branch moves
424 * to the prev element but it will point to rq->leaf_cfs_rq_list
425 * at the end of the enqueue.
426 */
427 if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
428 rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
429
3d4b47b4
PZ
430 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
431 cfs_rq->on_list = 0;
432 }
433}
434
5d299eab
PZ
435static inline void assert_list_leaf_cfs_rq(struct rq *rq)
436{
437 SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
438}
439
039ae8bc
VG
440/* Iterate thr' all leaf cfs_rq's on a runqueue */
441#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
442 list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
443 leaf_cfs_rq_list)
b758149c
PZ
444
445/* Do the two (enqueued) entities belong to the same group ? */
fed14d45 446static inline struct cfs_rq *
b758149c
PZ
447is_same_group(struct sched_entity *se, struct sched_entity *pse)
448{
449 if (se->cfs_rq == pse->cfs_rq)
fed14d45 450 return se->cfs_rq;
b758149c 451
fed14d45 452 return NULL;
b758149c
PZ
453}
454
455static inline struct sched_entity *parent_entity(struct sched_entity *se)
456{
457 return se->parent;
458}
459
464b7527
PZ
460static void
461find_matching_se(struct sched_entity **se, struct sched_entity **pse)
462{
463 int se_depth, pse_depth;
464
465 /*
466 * preemption test can be made between sibling entities who are in the
467 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
468 * both tasks until we find their ancestors who are siblings of common
469 * parent.
470 */
471
472 /* First walk up until both entities are at same depth */
fed14d45
PZ
473 se_depth = (*se)->depth;
474 pse_depth = (*pse)->depth;
464b7527
PZ
475
476 while (se_depth > pse_depth) {
477 se_depth--;
478 *se = parent_entity(*se);
479 }
480
481 while (pse_depth > se_depth) {
482 pse_depth--;
483 *pse = parent_entity(*pse);
484 }
485
486 while (!is_same_group(*se, *pse)) {
487 *se = parent_entity(*se);
488 *pse = parent_entity(*pse);
489 }
490}
491
30400039
JD
492static int tg_is_idle(struct task_group *tg)
493{
494 return tg->idle > 0;
495}
496
497static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
498{
499 return cfs_rq->idle > 0;
500}
501
502static int se_is_idle(struct sched_entity *se)
503{
504 if (entity_is_task(se))
505 return task_has_idle_policy(task_of(se));
506 return cfs_rq_is_idle(group_cfs_rq(se));
507}
508
8f48894f
PZ
509#else /* !CONFIG_FAIR_GROUP_SCHED */
510
b758149c
PZ
511#define for_each_sched_entity(se) \
512 for (; se; se = NULL)
bf0f6f24 513
f6783319 514static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
3d4b47b4 515{
f6783319 516 return true;
3d4b47b4
PZ
517}
518
519static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
520{
521}
522
5d299eab
PZ
523static inline void assert_list_leaf_cfs_rq(struct rq *rq)
524{
525}
526
039ae8bc
VG
527#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
528 for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
b758149c 529
b758149c
PZ
530static inline struct sched_entity *parent_entity(struct sched_entity *se)
531{
532 return NULL;
533}
534
464b7527
PZ
535static inline void
536find_matching_se(struct sched_entity **se, struct sched_entity **pse)
537{
538}
539
366e7ad6 540static inline int tg_is_idle(struct task_group *tg)
30400039
JD
541{
542 return 0;
543}
544
545static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
546{
547 return 0;
548}
549
550static int se_is_idle(struct sched_entity *se)
551{
552 return 0;
553}
554
b758149c
PZ
555#endif /* CONFIG_FAIR_GROUP_SCHED */
556
6c16a6dc 557static __always_inline
9dbdb155 558void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
bf0f6f24
IM
559
560/**************************************************************
561 * Scheduling class tree data structure manipulation methods:
562 */
563
1bf08230 564static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
02e0431a 565{
1bf08230 566 s64 delta = (s64)(vruntime - max_vruntime);
368059a9 567 if (delta > 0)
1bf08230 568 max_vruntime = vruntime;
02e0431a 569
1bf08230 570 return max_vruntime;
02e0431a
PZ
571}
572
0702e3eb 573static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
b0ffd246
PZ
574{
575 s64 delta = (s64)(vruntime - min_vruntime);
576 if (delta < 0)
577 min_vruntime = vruntime;
578
579 return min_vruntime;
580}
581
bf9be9a1 582static inline bool entity_before(struct sched_entity *a,
54fdc581
FC
583 struct sched_entity *b)
584{
585 return (s64)(a->vruntime - b->vruntime) < 0;
586}
587
bf9be9a1
PZ
588#define __node_2_se(node) \
589 rb_entry((node), struct sched_entity, run_node)
590
1af5f730
PZ
591static void update_min_vruntime(struct cfs_rq *cfs_rq)
592{
b60205c7 593 struct sched_entity *curr = cfs_rq->curr;
bfb06889 594 struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
b60205c7 595
1af5f730
PZ
596 u64 vruntime = cfs_rq->min_vruntime;
597
b60205c7
PZ
598 if (curr) {
599 if (curr->on_rq)
600 vruntime = curr->vruntime;
601 else
602 curr = NULL;
603 }
1af5f730 604
bfb06889 605 if (leftmost) { /* non-empty tree */
bf9be9a1 606 struct sched_entity *se = __node_2_se(leftmost);
1af5f730 607
b60205c7 608 if (!curr)
1af5f730
PZ
609 vruntime = se->vruntime;
610 else
611 vruntime = min_vruntime(vruntime, se->vruntime);
612 }
613
1bf08230 614 /* ensure we never gain time by being placed backwards. */
1af5f730 615 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
3fe1698b
PZ
616#ifndef CONFIG_64BIT
617 smp_wmb();
618 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
619#endif
1af5f730
PZ
620}
621
bf9be9a1
PZ
622static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
623{
624 return entity_before(__node_2_se(a), __node_2_se(b));
625}
626
bf0f6f24
IM
627/*
628 * Enqueue an entity into the rb-tree:
629 */
0702e3eb 630static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24 631{
bf9be9a1 632 rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
bf0f6f24
IM
633}
634
0702e3eb 635static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24 636{
bfb06889 637 rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
bf0f6f24
IM
638}
639
029632fb 640struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
bf0f6f24 641{
bfb06889 642 struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
f4b6755f
PZ
643
644 if (!left)
645 return NULL;
646
bf9be9a1 647 return __node_2_se(left);
bf0f6f24
IM
648}
649
ac53db59
RR
650static struct sched_entity *__pick_next_entity(struct sched_entity *se)
651{
652 struct rb_node *next = rb_next(&se->run_node);
653
654 if (!next)
655 return NULL;
656
bf9be9a1 657 return __node_2_se(next);
ac53db59
RR
658}
659
660#ifdef CONFIG_SCHED_DEBUG
029632fb 661struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
aeb73b04 662{
bfb06889 663 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
aeb73b04 664
70eee74b
BS
665 if (!last)
666 return NULL;
7eee3e67 667
bf9be9a1 668 return __node_2_se(last);
aeb73b04
PZ
669}
670
bf0f6f24
IM
671/**************************************************************
672 * Scheduling class statistics methods:
673 */
674
8a99b683 675int sched_update_scaling(void)
b2be5e96 676{
58ac93e4 677 unsigned int factor = get_update_sysctl_factor();
b2be5e96 678
b2be5e96
PZ
679 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
680 sysctl_sched_min_granularity);
681
acb4a848
CE
682#define WRT_SYSCTL(name) \
683 (normalized_sysctl_##name = sysctl_##name / (factor))
684 WRT_SYSCTL(sched_min_granularity);
685 WRT_SYSCTL(sched_latency);
686 WRT_SYSCTL(sched_wakeup_granularity);
acb4a848
CE
687#undef WRT_SYSCTL
688
b2be5e96
PZ
689 return 0;
690}
691#endif
647e7cac 692
a7be37ac 693/*
f9c0b095 694 * delta /= w
a7be37ac 695 */
9dbdb155 696static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
a7be37ac 697{
f9c0b095 698 if (unlikely(se->load.weight != NICE_0_LOAD))
9dbdb155 699 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
a7be37ac
PZ
700
701 return delta;
702}
703
647e7cac
IM
704/*
705 * The idea is to set a period in which each task runs once.
706 *
532b1858 707 * When there are too many tasks (sched_nr_latency) we have to stretch
647e7cac
IM
708 * this period because otherwise the slices get too small.
709 *
710 * p = (nr <= nl) ? l : l*nr/nl
711 */
4d78e7b6
PZ
712static u64 __sched_period(unsigned long nr_running)
713{
8e2b0bf3
BF
714 if (unlikely(nr_running > sched_nr_latency))
715 return nr_running * sysctl_sched_min_granularity;
716 else
717 return sysctl_sched_latency;
4d78e7b6
PZ
718}
719
51ce83ed
JD
720static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq);
721
647e7cac
IM
722/*
723 * We calculate the wall-time slice from the period by taking a part
724 * proportional to the weight.
725 *
f9c0b095 726 * s = p*P[w/rw]
647e7cac 727 */
6d0f0ebd 728static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
21805085 729{
0c2de3f0 730 unsigned int nr_running = cfs_rq->nr_running;
51ce83ed
JD
731 struct sched_entity *init_se = se;
732 unsigned int min_gran;
0c2de3f0
PZ
733 u64 slice;
734
735 if (sched_feat(ALT_PERIOD))
736 nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
737
738 slice = __sched_period(nr_running + !se->on_rq);
f9c0b095 739
0a582440 740 for_each_sched_entity(se) {
6272d68c 741 struct load_weight *load;
3104bf03 742 struct load_weight lw;
51ce83ed 743 struct cfs_rq *qcfs_rq;
6272d68c 744
51ce83ed
JD
745 qcfs_rq = cfs_rq_of(se);
746 load = &qcfs_rq->load;
f9c0b095 747
0a582440 748 if (unlikely(!se->on_rq)) {
51ce83ed 749 lw = qcfs_rq->load;
0a582440
MG
750
751 update_load_add(&lw, se->load.weight);
752 load = &lw;
753 }
9dbdb155 754 slice = __calc_delta(slice, se->load.weight, load);
0a582440 755 }
0c2de3f0 756
51ce83ed
JD
757 if (sched_feat(BASE_SLICE)) {
758 if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq))
759 min_gran = sysctl_sched_idle_min_granularity;
760 else
761 min_gran = sysctl_sched_min_granularity;
762
763 slice = max_t(u64, slice, min_gran);
764 }
0c2de3f0 765
0a582440 766 return slice;
bf0f6f24
IM
767}
768
647e7cac 769/*
660cc00f 770 * We calculate the vruntime slice of a to-be-inserted task.
647e7cac 771 *
f9c0b095 772 * vs = s/w
647e7cac 773 */
f9c0b095 774static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
67e9fb2a 775{
f9c0b095 776 return calc_delta_fair(sched_slice(cfs_rq, se), se);
a7be37ac
PZ
777}
778
c0796298 779#include "pelt.h"
23127296 780#ifdef CONFIG_SMP
283e2ed3 781
772bd008 782static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
fb13c7ee 783static unsigned long task_h_load(struct task_struct *p);
3b1baa64 784static unsigned long capacity_of(int cpu);
fb13c7ee 785
540247fb
YD
786/* Give new sched_entity start runnable values to heavy its load in infant time */
787void init_entity_runnable_average(struct sched_entity *se)
a75cdaa9 788{
540247fb 789 struct sched_avg *sa = &se->avg;
a75cdaa9 790
f207934f
PZ
791 memset(sa, 0, sizeof(*sa));
792
b5a9b340 793 /*
dfcb245e 794 * Tasks are initialized with full load to be seen as heavy tasks until
b5a9b340 795 * they get a chance to stabilize to their real load level.
dfcb245e 796 * Group entities are initialized with zero load to reflect the fact that
b5a9b340
VG
797 * nothing has been attached to the task group yet.
798 */
799 if (entity_is_task(se))
0dacee1b 800 sa->load_avg = scale_load_down(se->load.weight);
f207934f 801
9d89c257 802 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
a75cdaa9 803}
7ea241af 804
df217913 805static void attach_entity_cfs_rq(struct sched_entity *se);
7dc603c9 806
2b8c41da
YD
807/*
808 * With new tasks being created, their initial util_avgs are extrapolated
809 * based on the cfs_rq's current util_avg:
810 *
811 * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
812 *
813 * However, in many cases, the above util_avg does not give a desired
814 * value. Moreover, the sum of the util_avgs may be divergent, such
815 * as when the series is a harmonic series.
816 *
817 * To solve this problem, we also cap the util_avg of successive tasks to
818 * only 1/2 of the left utilization budget:
819 *
8fe5c5a9 820 * util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
2b8c41da 821 *
8fe5c5a9 822 * where n denotes the nth task and cpu_scale the CPU capacity.
2b8c41da 823 *
8fe5c5a9
QP
824 * For example, for a CPU with 1024 of capacity, a simplest series from
825 * the beginning would be like:
2b8c41da
YD
826 *
827 * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
828 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
829 *
830 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
831 * if util_avg > util_avg_cap.
832 */
d0fe0b9c 833void post_init_entity_util_avg(struct task_struct *p)
2b8c41da 834{
d0fe0b9c 835 struct sched_entity *se = &p->se;
2b8c41da
YD
836 struct cfs_rq *cfs_rq = cfs_rq_of(se);
837 struct sched_avg *sa = &se->avg;
8ec59c0f 838 long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
8fe5c5a9 839 long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
2b8c41da
YD
840
841 if (cap > 0) {
842 if (cfs_rq->avg.util_avg != 0) {
843 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
844 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
845
846 if (sa->util_avg > cap)
847 sa->util_avg = cap;
848 } else {
849 sa->util_avg = cap;
850 }
2b8c41da 851 }
7dc603c9 852
e21cf434 853 sa->runnable_avg = sa->util_avg;
9f683953 854
d0fe0b9c
DE
855 if (p->sched_class != &fair_sched_class) {
856 /*
857 * For !fair tasks do:
858 *
859 update_cfs_rq_load_avg(now, cfs_rq);
a4f9a0e5 860 attach_entity_load_avg(cfs_rq, se);
d0fe0b9c
DE
861 switched_from_fair(rq, p);
862 *
863 * such that the next switched_to_fair() has the
864 * expected state.
865 */
866 se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
867 return;
7dc603c9
PZ
868 }
869
df217913 870 attach_entity_cfs_rq(se);
2b8c41da
YD
871}
872
7dc603c9 873#else /* !CONFIG_SMP */
540247fb 874void init_entity_runnable_average(struct sched_entity *se)
a75cdaa9
AS
875{
876}
d0fe0b9c 877void post_init_entity_util_avg(struct task_struct *p)
2b8c41da
YD
878{
879}
fe749158 880static void update_tg_load_avg(struct cfs_rq *cfs_rq)
3d30544f
PZ
881{
882}
7dc603c9 883#endif /* CONFIG_SMP */
a75cdaa9 884
bf0f6f24 885/*
9dbdb155 886 * Update the current task's runtime statistics.
bf0f6f24 887 */
b7cc0896 888static void update_curr(struct cfs_rq *cfs_rq)
bf0f6f24 889{
429d43bc 890 struct sched_entity *curr = cfs_rq->curr;
78becc27 891 u64 now = rq_clock_task(rq_of(cfs_rq));
9dbdb155 892 u64 delta_exec;
bf0f6f24
IM
893
894 if (unlikely(!curr))
895 return;
896
9dbdb155
PZ
897 delta_exec = now - curr->exec_start;
898 if (unlikely((s64)delta_exec <= 0))
34f28ecd 899 return;
bf0f6f24 900
8ebc91d9 901 curr->exec_start = now;
d842de87 902
ceeadb83
YS
903 if (schedstat_enabled()) {
904 struct sched_statistics *stats;
905
906 stats = __schedstats_from_se(curr);
907 __schedstat_set(stats->exec_max,
908 max(delta_exec, stats->exec_max));
909 }
9dbdb155
PZ
910
911 curr->sum_exec_runtime += delta_exec;
ae92882e 912 schedstat_add(cfs_rq->exec_clock, delta_exec);
9dbdb155
PZ
913
914 curr->vruntime += calc_delta_fair(delta_exec, curr);
915 update_min_vruntime(cfs_rq);
916
d842de87
SV
917 if (entity_is_task(curr)) {
918 struct task_struct *curtask = task_of(curr);
919
f977bb49 920 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
d2cc5ed6 921 cgroup_account_cputime(curtask, delta_exec);
f06febc9 922 account_group_exec_runtime(curtask, delta_exec);
d842de87 923 }
ec12cb7f
PT
924
925 account_cfs_rq_runtime(cfs_rq, delta_exec);
bf0f6f24
IM
926}
927
6e998916
SG
928static void update_curr_fair(struct rq *rq)
929{
930 update_curr(cfs_rq_of(&rq->curr->se));
931}
932
bf0f6f24 933static inline void
60f2415e 934update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24 935{
ceeadb83 936 struct sched_statistics *stats;
60f2415e 937 struct task_struct *p = NULL;
4fa8d299
JP
938
939 if (!schedstat_enabled())
940 return;
941
ceeadb83
YS
942 stats = __schedstats_from_se(se);
943
60f2415e
YS
944 if (entity_is_task(se))
945 p = task_of(se);
3ea94de1 946
60f2415e 947 __update_stats_wait_start(rq_of(cfs_rq), p, stats);
bf0f6f24
IM
948}
949
4fa8d299 950static inline void
60f2415e 951update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
3ea94de1 952{
ceeadb83
YS
953 struct sched_statistics *stats;
954 struct task_struct *p = NULL;
cb251765 955
4fa8d299
JP
956 if (!schedstat_enabled())
957 return;
958
ceeadb83
YS
959 stats = __schedstats_from_se(se);
960
b9c88f75 961 /*
962 * When the sched_schedstat changes from 0 to 1, some sched se
963 * maybe already in the runqueue, the se->statistics.wait_start
964 * will be 0.So it will let the delta wrong. We need to avoid this
965 * scenario.
966 */
ceeadb83 967 if (unlikely(!schedstat_val(stats->wait_start)))
b9c88f75 968 return;
969
60f2415e 970 if (entity_is_task(se))
3ea94de1 971 p = task_of(se);
3ea94de1 972
60f2415e 973 __update_stats_wait_end(rq_of(cfs_rq), p, stats);
3ea94de1 974}
3ea94de1 975
4fa8d299 976static inline void
60f2415e 977update_stats_enqueue_sleeper_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
1a3d027c 978{
ceeadb83 979 struct sched_statistics *stats;
1a3d027c 980 struct task_struct *tsk = NULL;
4fa8d299
JP
981
982 if (!schedstat_enabled())
983 return;
984
ceeadb83
YS
985 stats = __schedstats_from_se(se);
986
1a3d027c
JP
987 if (entity_is_task(se))
988 tsk = task_of(se);
989
60f2415e 990 __update_stats_enqueue_sleeper(rq_of(cfs_rq), tsk, stats);
3ea94de1 991}
3ea94de1 992
bf0f6f24
IM
993/*
994 * Task is being enqueued - update stats:
995 */
cb251765 996static inline void
60f2415e 997update_stats_enqueue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
bf0f6f24 998{
4fa8d299
JP
999 if (!schedstat_enabled())
1000 return;
1001
bf0f6f24
IM
1002 /*
1003 * Are we enqueueing a waiting task? (for current tasks
1004 * a dequeue/enqueue event is a NOP)
1005 */
429d43bc 1006 if (se != cfs_rq->curr)
60f2415e 1007 update_stats_wait_start_fair(cfs_rq, se);
1a3d027c
JP
1008
1009 if (flags & ENQUEUE_WAKEUP)
60f2415e 1010 update_stats_enqueue_sleeper_fair(cfs_rq, se);
bf0f6f24
IM
1011}
1012
bf0f6f24 1013static inline void
60f2415e 1014update_stats_dequeue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
bf0f6f24 1015{
4fa8d299
JP
1016
1017 if (!schedstat_enabled())
1018 return;
1019
bf0f6f24
IM
1020 /*
1021 * Mark the end of the wait period if dequeueing a
1022 * waiting task:
1023 */
429d43bc 1024 if (se != cfs_rq->curr)
60f2415e 1025 update_stats_wait_end_fair(cfs_rq, se);
cb251765 1026
4fa8d299
JP
1027 if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
1028 struct task_struct *tsk = task_of(se);
2f064a59 1029 unsigned int state;
cb251765 1030
2f064a59
PZ
1031 /* XXX racy against TTWU */
1032 state = READ_ONCE(tsk->__state);
1033 if (state & TASK_INTERRUPTIBLE)
ceeadb83 1034 __schedstat_set(tsk->stats.sleep_start,
4fa8d299 1035 rq_clock(rq_of(cfs_rq)));
2f064a59 1036 if (state & TASK_UNINTERRUPTIBLE)
ceeadb83 1037 __schedstat_set(tsk->stats.block_start,
4fa8d299 1038 rq_clock(rq_of(cfs_rq)));
cb251765 1039 }
cb251765
MG
1040}
1041
bf0f6f24
IM
1042/*
1043 * We are picking a new current task - update its stats:
1044 */
1045static inline void
79303e9e 1046update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24
IM
1047{
1048 /*
1049 * We are starting a new run period:
1050 */
78becc27 1051 se->exec_start = rq_clock_task(rq_of(cfs_rq));
bf0f6f24
IM
1052}
1053
bf0f6f24
IM
1054/**************************************************
1055 * Scheduling class queueing methods:
1056 */
1057
cb29a5c1
MG
1058#ifdef CONFIG_NUMA
1059#define NUMA_IMBALANCE_MIN 2
1060
1061static inline long
1062adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr)
1063{
1064 /*
1065 * Allow a NUMA imbalance if busy CPUs is less than the maximum
1066 * threshold. Above this threshold, individual tasks may be contending
1067 * for both memory bandwidth and any shared HT resources. This is an
1068 * approximation as the number of running tasks may not be related to
1069 * the number of busy CPUs due to sched_setaffinity.
1070 */
1071 if (dst_running > imb_numa_nr)
1072 return imbalance;
1073
1074 /*
1075 * Allow a small imbalance based on a simple pair of communicating
1076 * tasks that remain local when the destination is lightly loaded.
1077 */
1078 if (imbalance <= NUMA_IMBALANCE_MIN)
1079 return 0;
1080
1081 return imbalance;
1082}
1083#endif /* CONFIG_NUMA */
1084
cbee9f88
PZ
1085#ifdef CONFIG_NUMA_BALANCING
1086/*
598f0ec0
MG
1087 * Approximate time to scan a full NUMA task in ms. The task scan period is
1088 * calculated based on the tasks virtual memory size and
1089 * numa_balancing_scan_size.
cbee9f88 1090 */
598f0ec0
MG
1091unsigned int sysctl_numa_balancing_scan_period_min = 1000;
1092unsigned int sysctl_numa_balancing_scan_period_max = 60000;
6e5fb223
PZ
1093
1094/* Portion of address space to scan in MB */
1095unsigned int sysctl_numa_balancing_scan_size = 256;
cbee9f88 1096
4b96a29b
PZ
1097/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
1098unsigned int sysctl_numa_balancing_scan_delay = 1000;
1099
b5dd77c8 1100struct numa_group {
c45a7795 1101 refcount_t refcount;
b5dd77c8
RR
1102
1103 spinlock_t lock; /* nr_tasks, tasks */
1104 int nr_tasks;
1105 pid_t gid;
1106 int active_nodes;
1107
1108 struct rcu_head rcu;
1109 unsigned long total_faults;
1110 unsigned long max_faults_cpu;
1111 /*
5b763a14
BR
1112 * faults[] array is split into two regions: faults_mem and faults_cpu.
1113 *
b5dd77c8
RR
1114 * Faults_cpu is used to decide whether memory should move
1115 * towards the CPU. As a consequence, these stats are weighted
1116 * more by CPU use than by memory faults.
1117 */
04f5c362 1118 unsigned long faults[];
b5dd77c8
RR
1119};
1120
cb361d8c
JH
1121/*
1122 * For functions that can be called in multiple contexts that permit reading
1123 * ->numa_group (see struct task_struct for locking rules).
1124 */
1125static struct numa_group *deref_task_numa_group(struct task_struct *p)
1126{
1127 return rcu_dereference_check(p->numa_group, p == current ||
9ef7e7e3 1128 (lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu)));
cb361d8c
JH
1129}
1130
1131static struct numa_group *deref_curr_numa_group(struct task_struct *p)
1132{
1133 return rcu_dereference_protected(p->numa_group, p == current);
1134}
1135
b5dd77c8
RR
1136static inline unsigned long group_faults_priv(struct numa_group *ng);
1137static inline unsigned long group_faults_shared(struct numa_group *ng);
1138
598f0ec0
MG
1139static unsigned int task_nr_scan_windows(struct task_struct *p)
1140{
1141 unsigned long rss = 0;
1142 unsigned long nr_scan_pages;
1143
1144 /*
1145 * Calculations based on RSS as non-present and empty pages are skipped
1146 * by the PTE scanner and NUMA hinting faults should be trapped based
1147 * on resident pages
1148 */
1149 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
1150 rss = get_mm_rss(p->mm);
1151 if (!rss)
1152 rss = nr_scan_pages;
1153
1154 rss = round_up(rss, nr_scan_pages);
1155 return rss / nr_scan_pages;
1156}
1157
3b03706f 1158/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
598f0ec0
MG
1159#define MAX_SCAN_WINDOW 2560
1160
1161static unsigned int task_scan_min(struct task_struct *p)
1162{
316c1608 1163 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
598f0ec0
MG
1164 unsigned int scan, floor;
1165 unsigned int windows = 1;
1166
64192658
KT
1167 if (scan_size < MAX_SCAN_WINDOW)
1168 windows = MAX_SCAN_WINDOW / scan_size;
598f0ec0
MG
1169 floor = 1000 / windows;
1170
1171 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1172 return max_t(unsigned int, floor, scan);
1173}
1174
b5dd77c8
RR
1175static unsigned int task_scan_start(struct task_struct *p)
1176{
1177 unsigned long smin = task_scan_min(p);
1178 unsigned long period = smin;
cb361d8c 1179 struct numa_group *ng;
b5dd77c8
RR
1180
1181 /* Scale the maximum scan period with the amount of shared memory. */
cb361d8c
JH
1182 rcu_read_lock();
1183 ng = rcu_dereference(p->numa_group);
1184 if (ng) {
b5dd77c8
RR
1185 unsigned long shared = group_faults_shared(ng);
1186 unsigned long private = group_faults_priv(ng);
1187
c45a7795 1188 period *= refcount_read(&ng->refcount);
b5dd77c8
RR
1189 period *= shared + 1;
1190 period /= private + shared + 1;
1191 }
cb361d8c 1192 rcu_read_unlock();
b5dd77c8
RR
1193
1194 return max(smin, period);
1195}
1196
598f0ec0
MG
1197static unsigned int task_scan_max(struct task_struct *p)
1198{
b5dd77c8
RR
1199 unsigned long smin = task_scan_min(p);
1200 unsigned long smax;
cb361d8c 1201 struct numa_group *ng;
598f0ec0
MG
1202
1203 /* Watch for min being lower than max due to floor calculations */
1204 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
b5dd77c8
RR
1205
1206 /* Scale the maximum scan period with the amount of shared memory. */
cb361d8c
JH
1207 ng = deref_curr_numa_group(p);
1208 if (ng) {
b5dd77c8
RR
1209 unsigned long shared = group_faults_shared(ng);
1210 unsigned long private = group_faults_priv(ng);
1211 unsigned long period = smax;
1212
c45a7795 1213 period *= refcount_read(&ng->refcount);
b5dd77c8
RR
1214 period *= shared + 1;
1215 period /= private + shared + 1;
1216
1217 smax = max(smax, period);
1218 }
1219
598f0ec0
MG
1220 return max(smin, smax);
1221}
1222
0ec8aa00
PZ
1223static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1224{
98fa15f3 1225 rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
0ec8aa00
PZ
1226 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1227}
1228
1229static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1230{
98fa15f3 1231 rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
0ec8aa00
PZ
1232 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1233}
1234
be1e4e76
RR
1235/* Shared or private faults. */
1236#define NR_NUMA_HINT_FAULT_TYPES 2
1237
1238/* Memory and CPU locality */
1239#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1240
1241/* Averaged statistics, and temporary buffers. */
1242#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1243
e29cf08b
MG
1244pid_t task_numa_group_id(struct task_struct *p)
1245{
cb361d8c
JH
1246 struct numa_group *ng;
1247 pid_t gid = 0;
1248
1249 rcu_read_lock();
1250 ng = rcu_dereference(p->numa_group);
1251 if (ng)
1252 gid = ng->gid;
1253 rcu_read_unlock();
1254
1255 return gid;
e29cf08b
MG
1256}
1257
44dba3d5 1258/*
97fb7a0a 1259 * The averaged statistics, shared & private, memory & CPU,
44dba3d5
IM
1260 * occupy the first half of the array. The second half of the
1261 * array is for current counters, which are averaged into the
1262 * first set by task_numa_placement.
1263 */
1264static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
ac8e895b 1265{
44dba3d5 1266 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
ac8e895b
MG
1267}
1268
1269static inline unsigned long task_faults(struct task_struct *p, int nid)
1270{
44dba3d5 1271 if (!p->numa_faults)
ac8e895b
MG
1272 return 0;
1273
44dba3d5
IM
1274 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1275 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
ac8e895b
MG
1276}
1277
83e1d2cd
MG
1278static inline unsigned long group_faults(struct task_struct *p, int nid)
1279{
cb361d8c
JH
1280 struct numa_group *ng = deref_task_numa_group(p);
1281
1282 if (!ng)
83e1d2cd
MG
1283 return 0;
1284
cb361d8c
JH
1285 return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1286 ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
83e1d2cd
MG
1287}
1288
20e07dea
RR
1289static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1290{
5b763a14
BR
1291 return group->faults[task_faults_idx(NUMA_CPU, nid, 0)] +
1292 group->faults[task_faults_idx(NUMA_CPU, nid, 1)];
20e07dea
RR
1293}
1294
b5dd77c8
RR
1295static inline unsigned long group_faults_priv(struct numa_group *ng)
1296{
1297 unsigned long faults = 0;
1298 int node;
1299
1300 for_each_online_node(node) {
1301 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
1302 }
1303
1304 return faults;
1305}
1306
1307static inline unsigned long group_faults_shared(struct numa_group *ng)
1308{
1309 unsigned long faults = 0;
1310 int node;
1311
1312 for_each_online_node(node) {
1313 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
1314 }
1315
1316 return faults;
1317}
1318
4142c3eb
RR
1319/*
1320 * A node triggering more than 1/3 as many NUMA faults as the maximum is
1321 * considered part of a numa group's pseudo-interleaving set. Migrations
1322 * between these nodes are slowed down, to allow things to settle down.
1323 */
1324#define ACTIVE_NODE_FRACTION 3
1325
1326static bool numa_is_active_node(int nid, struct numa_group *ng)
1327{
1328 return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1329}
1330
6c6b1193
RR
1331/* Handle placement on systems where not all nodes are directly connected. */
1332static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
0fb3978b 1333 int lim_dist, bool task)
6c6b1193
RR
1334{
1335 unsigned long score = 0;
0fb3978b 1336 int node, max_dist;
6c6b1193
RR
1337
1338 /*
1339 * All nodes are directly connected, and the same distance
1340 * from each other. No need for fancy placement algorithms.
1341 */
1342 if (sched_numa_topology_type == NUMA_DIRECT)
1343 return 0;
1344
0fb3978b
HY
1345 /* sched_max_numa_distance may be changed in parallel. */
1346 max_dist = READ_ONCE(sched_max_numa_distance);
6c6b1193
RR
1347 /*
1348 * This code is called for each node, introducing N^2 complexity,
1349 * which should be ok given the number of nodes rarely exceeds 8.
1350 */
1351 for_each_online_node(node) {
1352 unsigned long faults;
1353 int dist = node_distance(nid, node);
1354
1355 /*
1356 * The furthest away nodes in the system are not interesting
1357 * for placement; nid was already counted.
1358 */
0fb3978b 1359 if (dist >= max_dist || node == nid)
6c6b1193
RR
1360 continue;
1361
1362 /*
1363 * On systems with a backplane NUMA topology, compare groups
1364 * of nodes, and move tasks towards the group with the most
1365 * memory accesses. When comparing two nodes at distance
1366 * "hoplimit", only nodes closer by than "hoplimit" are part
1367 * of each group. Skip other nodes.
1368 */
0fb3978b 1369 if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist)
6c6b1193
RR
1370 continue;
1371
1372 /* Add up the faults from nearby nodes. */
1373 if (task)
1374 faults = task_faults(p, node);
1375 else
1376 faults = group_faults(p, node);
1377
1378 /*
1379 * On systems with a glueless mesh NUMA topology, there are
1380 * no fixed "groups of nodes". Instead, nodes that are not
1381 * directly connected bounce traffic through intermediate
1382 * nodes; a numa_group can occupy any set of nodes.
1383 * The further away a node is, the less the faults count.
1384 * This seems to result in good task placement.
1385 */
1386 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
0fb3978b
HY
1387 faults *= (max_dist - dist);
1388 faults /= (max_dist - LOCAL_DISTANCE);
6c6b1193
RR
1389 }
1390
1391 score += faults;
1392 }
1393
1394 return score;
1395}
1396
83e1d2cd
MG
1397/*
1398 * These return the fraction of accesses done by a particular task, or
1399 * task group, on a particular numa node. The group weight is given a
1400 * larger multiplier, in order to group tasks together that are almost
1401 * evenly spread out between numa nodes.
1402 */
7bd95320
RR
1403static inline unsigned long task_weight(struct task_struct *p, int nid,
1404 int dist)
83e1d2cd 1405{
7bd95320 1406 unsigned long faults, total_faults;
83e1d2cd 1407
44dba3d5 1408 if (!p->numa_faults)
83e1d2cd
MG
1409 return 0;
1410
1411 total_faults = p->total_numa_faults;
1412
1413 if (!total_faults)
1414 return 0;
1415
7bd95320 1416 faults = task_faults(p, nid);
6c6b1193
RR
1417 faults += score_nearby_nodes(p, nid, dist, true);
1418
7bd95320 1419 return 1000 * faults / total_faults;
83e1d2cd
MG
1420}
1421
7bd95320
RR
1422static inline unsigned long group_weight(struct task_struct *p, int nid,
1423 int dist)
83e1d2cd 1424{
cb361d8c 1425 struct numa_group *ng = deref_task_numa_group(p);
7bd95320
RR
1426 unsigned long faults, total_faults;
1427
cb361d8c 1428 if (!ng)
7bd95320
RR
1429 return 0;
1430
cb361d8c 1431 total_faults = ng->total_faults;
7bd95320
RR
1432
1433 if (!total_faults)
83e1d2cd
MG
1434 return 0;
1435
7bd95320 1436 faults = group_faults(p, nid);
6c6b1193
RR
1437 faults += score_nearby_nodes(p, nid, dist, false);
1438
7bd95320 1439 return 1000 * faults / total_faults;
83e1d2cd
MG
1440}
1441
10f39042
RR
1442bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1443 int src_nid, int dst_cpu)
1444{
cb361d8c 1445 struct numa_group *ng = deref_curr_numa_group(p);
10f39042
RR
1446 int dst_nid = cpu_to_node(dst_cpu);
1447 int last_cpupid, this_cpupid;
1448
1449 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
37355bdc
MG
1450 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1451
1452 /*
1453 * Allow first faults or private faults to migrate immediately early in
1454 * the lifetime of a task. The magic number 4 is based on waiting for
1455 * two full passes of the "multi-stage node selection" test that is
1456 * executed below.
1457 */
98fa15f3 1458 if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
37355bdc
MG
1459 (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
1460 return true;
10f39042
RR
1461
1462 /*
1463 * Multi-stage node selection is used in conjunction with a periodic
1464 * migration fault to build a temporal task<->page relation. By using
1465 * a two-stage filter we remove short/unlikely relations.
1466 *
1467 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1468 * a task's usage of a particular page (n_p) per total usage of this
1469 * page (n_t) (in a given time-span) to a probability.
1470 *
1471 * Our periodic faults will sample this probability and getting the
1472 * same result twice in a row, given these samples are fully
1473 * independent, is then given by P(n)^2, provided our sample period
1474 * is sufficiently short compared to the usage pattern.
1475 *
1476 * This quadric squishes small probabilities, making it less likely we
1477 * act on an unlikely task<->page relation.
1478 */
10f39042
RR
1479 if (!cpupid_pid_unset(last_cpupid) &&
1480 cpupid_to_nid(last_cpupid) != dst_nid)
1481 return false;
1482
1483 /* Always allow migrate on private faults */
1484 if (cpupid_match_pid(p, last_cpupid))
1485 return true;
1486
1487 /* A shared fault, but p->numa_group has not been set up yet. */
1488 if (!ng)
1489 return true;
1490
1491 /*
4142c3eb
RR
1492 * Destination node is much more heavily used than the source
1493 * node? Allow migration.
10f39042 1494 */
4142c3eb
RR
1495 if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1496 ACTIVE_NODE_FRACTION)
10f39042
RR
1497 return true;
1498
1499 /*
4142c3eb
RR
1500 * Distribute memory according to CPU & memory use on each node,
1501 * with 3/4 hysteresis to avoid unnecessary memory migrations:
1502 *
1503 * faults_cpu(dst) 3 faults_cpu(src)
1504 * --------------- * - > ---------------
1505 * faults_mem(dst) 4 faults_mem(src)
10f39042 1506 */
4142c3eb
RR
1507 return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
1508 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
10f39042
RR
1509}
1510
6499b1b2
VG
1511/*
1512 * 'numa_type' describes the node at the moment of load balancing.
1513 */
1514enum numa_type {
1515 /* The node has spare capacity that can be used to run more tasks. */
1516 node_has_spare = 0,
1517 /*
1518 * The node is fully used and the tasks don't compete for more CPU
1519 * cycles. Nevertheless, some tasks might wait before running.
1520 */
1521 node_fully_busy,
1522 /*
1523 * The node is overloaded and can't provide expected CPU cycles to all
1524 * tasks.
1525 */
1526 node_overloaded
1527};
58d081b5 1528
fb13c7ee 1529/* Cached statistics for all CPUs within a node */
58d081b5
MG
1530struct numa_stats {
1531 unsigned long load;
8e0e0eda 1532 unsigned long runnable;
6499b1b2 1533 unsigned long util;
fb13c7ee 1534 /* Total compute capacity of CPUs on a node */
5ef20ca1 1535 unsigned long compute_capacity;
6499b1b2
VG
1536 unsigned int nr_running;
1537 unsigned int weight;
1538 enum numa_type node_type;
ff7db0bf 1539 int idle_cpu;
58d081b5 1540};
e6628d5b 1541
ff7db0bf
MG
1542static inline bool is_core_idle(int cpu)
1543{
1544#ifdef CONFIG_SCHED_SMT
1545 int sibling;
1546
1547 for_each_cpu(sibling, cpu_smt_mask(cpu)) {
1548 if (cpu == sibling)
1549 continue;
1550
1c6829cf 1551 if (!idle_cpu(sibling))
ff7db0bf
MG
1552 return false;
1553 }
1554#endif
1555
1556 return true;
1557}
1558
58d081b5
MG
1559struct task_numa_env {
1560 struct task_struct *p;
e6628d5b 1561
58d081b5
MG
1562 int src_cpu, src_nid;
1563 int dst_cpu, dst_nid;
e496132e 1564 int imb_numa_nr;
e6628d5b 1565
58d081b5 1566 struct numa_stats src_stats, dst_stats;
e6628d5b 1567
40ea2b42 1568 int imbalance_pct;
7bd95320 1569 int dist;
fb13c7ee
MG
1570
1571 struct task_struct *best_task;
1572 long best_imp;
58d081b5
MG
1573 int best_cpu;
1574};
1575
6499b1b2 1576static unsigned long cpu_load(struct rq *rq);
8e0e0eda 1577static unsigned long cpu_runnable(struct rq *rq);
6499b1b2
VG
1578
1579static inline enum
1580numa_type numa_classify(unsigned int imbalance_pct,
1581 struct numa_stats *ns)
1582{
1583 if ((ns->nr_running > ns->weight) &&
8e0e0eda
VG
1584 (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
1585 ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
6499b1b2
VG
1586 return node_overloaded;
1587
1588 if ((ns->nr_running < ns->weight) ||
8e0e0eda
VG
1589 (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
1590 ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
6499b1b2
VG
1591 return node_has_spare;
1592
1593 return node_fully_busy;
1594}
1595
76c389ab
VS
1596#ifdef CONFIG_SCHED_SMT
1597/* Forward declarations of select_idle_sibling helpers */
1598static inline bool test_idle_cores(int cpu, bool def);
ff7db0bf
MG
1599static inline int numa_idle_core(int idle_core, int cpu)
1600{
ff7db0bf
MG
1601 if (!static_branch_likely(&sched_smt_present) ||
1602 idle_core >= 0 || !test_idle_cores(cpu, false))
1603 return idle_core;
1604
1605 /*
1606 * Prefer cores instead of packing HT siblings
1607 * and triggering future load balancing.
1608 */
1609 if (is_core_idle(cpu))
1610 idle_core = cpu;
ff7db0bf
MG
1611
1612 return idle_core;
1613}
76c389ab
VS
1614#else
1615static inline int numa_idle_core(int idle_core, int cpu)
1616{
1617 return idle_core;
1618}
1619#endif
ff7db0bf 1620
6499b1b2 1621/*
ff7db0bf
MG
1622 * Gather all necessary information to make NUMA balancing placement
1623 * decisions that are compatible with standard load balancer. This
1624 * borrows code and logic from update_sg_lb_stats but sharing a
1625 * common implementation is impractical.
6499b1b2
VG
1626 */
1627static void update_numa_stats(struct task_numa_env *env,
ff7db0bf
MG
1628 struct numa_stats *ns, int nid,
1629 bool find_idle)
6499b1b2 1630{
ff7db0bf 1631 int cpu, idle_core = -1;
6499b1b2
VG
1632
1633 memset(ns, 0, sizeof(*ns));
ff7db0bf
MG
1634 ns->idle_cpu = -1;
1635
0621df31 1636 rcu_read_lock();
6499b1b2
VG
1637 for_each_cpu(cpu, cpumask_of_node(nid)) {
1638 struct rq *rq = cpu_rq(cpu);
1639
1640 ns->load += cpu_load(rq);
8e0e0eda 1641 ns->runnable += cpu_runnable(rq);
82762d2a 1642 ns->util += cpu_util_cfs(cpu);
6499b1b2
VG
1643 ns->nr_running += rq->cfs.h_nr_running;
1644 ns->compute_capacity += capacity_of(cpu);
ff7db0bf
MG
1645
1646 if (find_idle && !rq->nr_running && idle_cpu(cpu)) {
1647 if (READ_ONCE(rq->numa_migrate_on) ||
1648 !cpumask_test_cpu(cpu, env->p->cpus_ptr))
1649 continue;
1650
1651 if (ns->idle_cpu == -1)
1652 ns->idle_cpu = cpu;
1653
1654 idle_core = numa_idle_core(idle_core, cpu);
1655 }
6499b1b2 1656 }
0621df31 1657 rcu_read_unlock();
6499b1b2
VG
1658
1659 ns->weight = cpumask_weight(cpumask_of_node(nid));
1660
1661 ns->node_type = numa_classify(env->imbalance_pct, ns);
ff7db0bf
MG
1662
1663 if (idle_core >= 0)
1664 ns->idle_cpu = idle_core;
6499b1b2
VG
1665}
1666
fb13c7ee
MG
1667static void task_numa_assign(struct task_numa_env *env,
1668 struct task_struct *p, long imp)
1669{
a4739eca
SD
1670 struct rq *rq = cpu_rq(env->dst_cpu);
1671
5fb52dd9
MG
1672 /* Check if run-queue part of active NUMA balance. */
1673 if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) {
1674 int cpu;
1675 int start = env->dst_cpu;
1676
1677 /* Find alternative idle CPU. */
1678 for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) {
1679 if (cpu == env->best_cpu || !idle_cpu(cpu) ||
1680 !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
1681 continue;
1682 }
1683
1684 env->dst_cpu = cpu;
1685 rq = cpu_rq(env->dst_cpu);
1686 if (!xchg(&rq->numa_migrate_on, 1))
1687 goto assign;
1688 }
1689
1690 /* Failed to find an alternative idle CPU */
a4739eca 1691 return;
5fb52dd9 1692 }
a4739eca 1693
5fb52dd9 1694assign:
a4739eca
SD
1695 /*
1696 * Clear previous best_cpu/rq numa-migrate flag, since task now
1697 * found a better CPU to move/swap.
1698 */
5fb52dd9 1699 if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
a4739eca
SD
1700 rq = cpu_rq(env->best_cpu);
1701 WRITE_ONCE(rq->numa_migrate_on, 0);
1702 }
1703
fb13c7ee
MG
1704 if (env->best_task)
1705 put_task_struct(env->best_task);
bac78573
ON
1706 if (p)
1707 get_task_struct(p);
fb13c7ee
MG
1708
1709 env->best_task = p;
1710 env->best_imp = imp;
1711 env->best_cpu = env->dst_cpu;
1712}
1713
28a21745 1714static bool load_too_imbalanced(long src_load, long dst_load,
e63da036
RR
1715 struct task_numa_env *env)
1716{
e4991b24
RR
1717 long imb, old_imb;
1718 long orig_src_load, orig_dst_load;
28a21745
RR
1719 long src_capacity, dst_capacity;
1720
1721 /*
1722 * The load is corrected for the CPU capacity available on each node.
1723 *
1724 * src_load dst_load
1725 * ------------ vs ---------
1726 * src_capacity dst_capacity
1727 */
1728 src_capacity = env->src_stats.compute_capacity;
1729 dst_capacity = env->dst_stats.compute_capacity;
e63da036 1730
5f95ba7a 1731 imb = abs(dst_load * src_capacity - src_load * dst_capacity);
e63da036 1732
28a21745 1733 orig_src_load = env->src_stats.load;
e4991b24 1734 orig_dst_load = env->dst_stats.load;
28a21745 1735
5f95ba7a 1736 old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
e4991b24
RR
1737
1738 /* Would this change make things worse? */
1739 return (imb > old_imb);
e63da036
RR
1740}
1741
6fd98e77
SD
1742/*
1743 * Maximum NUMA importance can be 1998 (2*999);
1744 * SMALLIMP @ 30 would be close to 1998/64.
1745 * Used to deter task migration.
1746 */
1747#define SMALLIMP 30
1748
fb13c7ee
MG
1749/*
1750 * This checks if the overall compute and NUMA accesses of the system would
1751 * be improved if the source tasks was migrated to the target dst_cpu taking
1752 * into account that it might be best if task running on the dst_cpu should
1753 * be exchanged with the source task
1754 */
a0f03b61 1755static bool task_numa_compare(struct task_numa_env *env,
305c1fac 1756 long taskimp, long groupimp, bool maymove)
fb13c7ee 1757{
cb361d8c 1758 struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
fb13c7ee 1759 struct rq *dst_rq = cpu_rq(env->dst_cpu);
cb361d8c 1760 long imp = p_ng ? groupimp : taskimp;
fb13c7ee 1761 struct task_struct *cur;
28a21745 1762 long src_load, dst_load;
7bd95320 1763 int dist = env->dist;
cb361d8c
JH
1764 long moveimp = imp;
1765 long load;
a0f03b61 1766 bool stopsearch = false;
fb13c7ee 1767
a4739eca 1768 if (READ_ONCE(dst_rq->numa_migrate_on))
a0f03b61 1769 return false;
a4739eca 1770
fb13c7ee 1771 rcu_read_lock();
154abafc 1772 cur = rcu_dereference(dst_rq->curr);
bac78573 1773 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
fb13c7ee
MG
1774 cur = NULL;
1775
7af68335
PZ
1776 /*
1777 * Because we have preemption enabled we can get migrated around and
1778 * end try selecting ourselves (current == env->p) as a swap candidate.
1779 */
a0f03b61
MG
1780 if (cur == env->p) {
1781 stopsearch = true;
7af68335 1782 goto unlock;
a0f03b61 1783 }
7af68335 1784
305c1fac 1785 if (!cur) {
6fd98e77 1786 if (maymove && moveimp >= env->best_imp)
305c1fac
SD
1787 goto assign;
1788 else
1789 goto unlock;
1790 }
1791
88cca72c
MG
1792 /* Skip this swap candidate if cannot move to the source cpu. */
1793 if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
1794 goto unlock;
1795
1796 /*
1797 * Skip this swap candidate if it is not moving to its preferred
1798 * node and the best task is.
1799 */
1800 if (env->best_task &&
1801 env->best_task->numa_preferred_nid == env->src_nid &&
1802 cur->numa_preferred_nid != env->src_nid) {
1803 goto unlock;
1804 }
1805
fb13c7ee
MG
1806 /*
1807 * "imp" is the fault differential for the source task between the
1808 * source and destination node. Calculate the total differential for
1809 * the source task and potential destination task. The more negative
305c1fac 1810 * the value is, the more remote accesses that would be expected to
fb13c7ee 1811 * be incurred if the tasks were swapped.
88cca72c 1812 *
305c1fac
SD
1813 * If dst and source tasks are in the same NUMA group, or not
1814 * in any group then look only at task weights.
1815 */
cb361d8c
JH
1816 cur_ng = rcu_dereference(cur->numa_group);
1817 if (cur_ng == p_ng) {
13ede331
MG
1818 /*
1819 * Do not swap within a group or between tasks that have
1820 * no group if there is spare capacity. Swapping does
1821 * not address the load imbalance and helps one task at
1822 * the cost of punishing another.
1823 */
1824 if (env->dst_stats.node_type == node_has_spare)
1825 goto unlock;
1826
305c1fac
SD
1827 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1828 task_weight(cur, env->dst_nid, dist);
887c290e 1829 /*
305c1fac
SD
1830 * Add some hysteresis to prevent swapping the
1831 * tasks within a group over tiny differences.
887c290e 1832 */
cb361d8c 1833 if (cur_ng)
305c1fac
SD
1834 imp -= imp / 16;
1835 } else {
1836 /*
1837 * Compare the group weights. If a task is all by itself
1838 * (not part of a group), use the task weight instead.
1839 */
cb361d8c 1840 if (cur_ng && p_ng)
305c1fac
SD
1841 imp += group_weight(cur, env->src_nid, dist) -
1842 group_weight(cur, env->dst_nid, dist);
1843 else
1844 imp += task_weight(cur, env->src_nid, dist) -
1845 task_weight(cur, env->dst_nid, dist);
fb13c7ee
MG
1846 }
1847
88cca72c
MG
1848 /* Discourage picking a task already on its preferred node */
1849 if (cur->numa_preferred_nid == env->dst_nid)
1850 imp -= imp / 16;
1851
1852 /*
1853 * Encourage picking a task that moves to its preferred node.
1854 * This potentially makes imp larger than it's maximum of
1855 * 1998 (see SMALLIMP and task_weight for why) but in this
1856 * case, it does not matter.
1857 */
1858 if (cur->numa_preferred_nid == env->src_nid)
1859 imp += imp / 8;
1860
305c1fac 1861 if (maymove && moveimp > imp && moveimp > env->best_imp) {
6fd98e77 1862 imp = moveimp;
305c1fac 1863 cur = NULL;
fb13c7ee 1864 goto assign;
305c1fac 1865 }
fb13c7ee 1866
88cca72c
MG
1867 /*
1868 * Prefer swapping with a task moving to its preferred node over a
1869 * task that is not.
1870 */
1871 if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
1872 env->best_task->numa_preferred_nid != env->src_nid) {
1873 goto assign;
1874 }
1875
6fd98e77
SD
1876 /*
1877 * If the NUMA importance is less than SMALLIMP,
1878 * task migration might only result in ping pong
1879 * of tasks and also hurt performance due to cache
1880 * misses.
1881 */
1882 if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
1883 goto unlock;
1884
fb13c7ee
MG
1885 /*
1886 * In the overloaded case, try and keep the load balanced.
1887 */
305c1fac
SD
1888 load = task_h_load(env->p) - task_h_load(cur);
1889 if (!load)
1890 goto assign;
1891
e720fff6
PZ
1892 dst_load = env->dst_stats.load + load;
1893 src_load = env->src_stats.load - load;
fb13c7ee 1894
28a21745 1895 if (load_too_imbalanced(src_load, dst_load, env))
fb13c7ee
MG
1896 goto unlock;
1897
305c1fac 1898assign:
ff7db0bf 1899 /* Evaluate an idle CPU for a task numa move. */
10e2f1ac 1900 if (!cur) {
ff7db0bf
MG
1901 int cpu = env->dst_stats.idle_cpu;
1902
1903 /* Nothing cached so current CPU went idle since the search. */
1904 if (cpu < 0)
1905 cpu = env->dst_cpu;
1906
10e2f1ac 1907 /*
ff7db0bf
MG
1908 * If the CPU is no longer truly idle and the previous best CPU
1909 * is, keep using it.
10e2f1ac 1910 */
ff7db0bf
MG
1911 if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
1912 idle_cpu(env->best_cpu)) {
1913 cpu = env->best_cpu;
1914 }
1915
ff7db0bf 1916 env->dst_cpu = cpu;
10e2f1ac 1917 }
ba7e5a27 1918
fb13c7ee 1919 task_numa_assign(env, cur, imp);
a0f03b61
MG
1920
1921 /*
1922 * If a move to idle is allowed because there is capacity or load
1923 * balance improves then stop the search. While a better swap
1924 * candidate may exist, a search is not free.
1925 */
1926 if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
1927 stopsearch = true;
1928
1929 /*
1930 * If a swap candidate must be identified and the current best task
1931 * moves its preferred node then stop the search.
1932 */
1933 if (!maymove && env->best_task &&
1934 env->best_task->numa_preferred_nid == env->src_nid) {
1935 stopsearch = true;
1936 }
fb13c7ee
MG
1937unlock:
1938 rcu_read_unlock();
a0f03b61
MG
1939
1940 return stopsearch;
fb13c7ee
MG
1941}
1942
887c290e
RR
1943static void task_numa_find_cpu(struct task_numa_env *env,
1944 long taskimp, long groupimp)
2c8a50aa 1945{
305c1fac 1946 bool maymove = false;
2c8a50aa
MG
1947 int cpu;
1948
305c1fac 1949 /*
fb86f5b2
MG
1950 * If dst node has spare capacity, then check if there is an
1951 * imbalance that would be overruled by the load balancer.
305c1fac 1952 */
fb86f5b2
MG
1953 if (env->dst_stats.node_type == node_has_spare) {
1954 unsigned int imbalance;
1955 int src_running, dst_running;
1956
1957 /*
1958 * Would movement cause an imbalance? Note that if src has
1959 * more running tasks that the imbalance is ignored as the
1960 * move improves the imbalance from the perspective of the
1961 * CPU load balancer.
1962 * */
1963 src_running = env->src_stats.nr_running - 1;
1964 dst_running = env->dst_stats.nr_running + 1;
1965 imbalance = max(0, dst_running - src_running);
7d2b5dd0 1966 imbalance = adjust_numa_imbalance(imbalance, dst_running,
e496132e 1967 env->imb_numa_nr);
fb86f5b2
MG
1968
1969 /* Use idle CPU if there is no imbalance */
ff7db0bf 1970 if (!imbalance) {
fb86f5b2 1971 maymove = true;
ff7db0bf
MG
1972 if (env->dst_stats.idle_cpu >= 0) {
1973 env->dst_cpu = env->dst_stats.idle_cpu;
1974 task_numa_assign(env, NULL, 0);
1975 return;
1976 }
1977 }
fb86f5b2
MG
1978 } else {
1979 long src_load, dst_load, load;
1980 /*
1981 * If the improvement from just moving env->p direction is better
1982 * than swapping tasks around, check if a move is possible.
1983 */
1984 load = task_h_load(env->p);
1985 dst_load = env->dst_stats.load + load;
1986 src_load = env->src_stats.load - load;
1987 maymove = !load_too_imbalanced(src_load, dst_load, env);
1988 }
305c1fac 1989
2c8a50aa
MG
1990 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1991 /* Skip this CPU if the source task cannot migrate */
3bd37062 1992 if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
2c8a50aa
MG
1993 continue;
1994
1995 env->dst_cpu = cpu;
a0f03b61
MG
1996 if (task_numa_compare(env, taskimp, groupimp, maymove))
1997 break;
2c8a50aa
MG
1998 }
1999}
2000
58d081b5
MG
2001static int task_numa_migrate(struct task_struct *p)
2002{
58d081b5
MG
2003 struct task_numa_env env = {
2004 .p = p,
fb13c7ee 2005
58d081b5 2006 .src_cpu = task_cpu(p),
b32e86b4 2007 .src_nid = task_node(p),
fb13c7ee
MG
2008
2009 .imbalance_pct = 112,
2010
2011 .best_task = NULL,
2012 .best_imp = 0,
4142c3eb 2013 .best_cpu = -1,
58d081b5 2014 };
cb361d8c 2015 unsigned long taskweight, groupweight;
58d081b5 2016 struct sched_domain *sd;
cb361d8c
JH
2017 long taskimp, groupimp;
2018 struct numa_group *ng;
a4739eca 2019 struct rq *best_rq;
7bd95320 2020 int nid, ret, dist;
e6628d5b 2021
58d081b5 2022 /*
fb13c7ee
MG
2023 * Pick the lowest SD_NUMA domain, as that would have the smallest
2024 * imbalance and would be the first to start moving tasks about.
2025 *
2026 * And we want to avoid any moving of tasks about, as that would create
2027 * random movement of tasks -- counter the numa conditions we're trying
2028 * to satisfy here.
58d081b5
MG
2029 */
2030 rcu_read_lock();
fb13c7ee 2031 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
e496132e 2032 if (sd) {
46a73e8a 2033 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
e496132e
MG
2034 env.imb_numa_nr = sd->imb_numa_nr;
2035 }
e6628d5b
MG
2036 rcu_read_unlock();
2037
46a73e8a
RR
2038 /*
2039 * Cpusets can break the scheduler domain tree into smaller
2040 * balance domains, some of which do not cross NUMA boundaries.
2041 * Tasks that are "trapped" in such domains cannot be migrated
2042 * elsewhere, so there is no point in (re)trying.
2043 */
2044 if (unlikely(!sd)) {
8cd45eee 2045 sched_setnuma(p, task_node(p));
46a73e8a
RR
2046 return -EINVAL;
2047 }
2048
2c8a50aa 2049 env.dst_nid = p->numa_preferred_nid;
7bd95320
RR
2050 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
2051 taskweight = task_weight(p, env.src_nid, dist);
2052 groupweight = group_weight(p, env.src_nid, dist);
ff7db0bf 2053 update_numa_stats(&env, &env.src_stats, env.src_nid, false);
7bd95320
RR
2054 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
2055 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
ff7db0bf 2056 update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
58d081b5 2057
a43455a1 2058 /* Try to find a spot on the preferred nid. */
2d4056fa 2059 task_numa_find_cpu(&env, taskimp, groupimp);
e1dda8a7 2060
9de05d48
RR
2061 /*
2062 * Look at other nodes in these cases:
2063 * - there is no space available on the preferred_nid
2064 * - the task is part of a numa_group that is interleaved across
2065 * multiple NUMA nodes; in order to better consolidate the group,
2066 * we need to check other locations.
2067 */
cb361d8c
JH
2068 ng = deref_curr_numa_group(p);
2069 if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
5c7b1aaf 2070 for_each_node_state(nid, N_CPU) {
2c8a50aa
MG
2071 if (nid == env.src_nid || nid == p->numa_preferred_nid)
2072 continue;
58d081b5 2073
7bd95320 2074 dist = node_distance(env.src_nid, env.dst_nid);
6c6b1193
RR
2075 if (sched_numa_topology_type == NUMA_BACKPLANE &&
2076 dist != env.dist) {
2077 taskweight = task_weight(p, env.src_nid, dist);
2078 groupweight = group_weight(p, env.src_nid, dist);
2079 }
7bd95320 2080
83e1d2cd 2081 /* Only consider nodes where both task and groups benefit */
7bd95320
RR
2082 taskimp = task_weight(p, nid, dist) - taskweight;
2083 groupimp = group_weight(p, nid, dist) - groupweight;
887c290e 2084 if (taskimp < 0 && groupimp < 0)
fb13c7ee
MG
2085 continue;
2086
7bd95320 2087 env.dist = dist;
2c8a50aa 2088 env.dst_nid = nid;
ff7db0bf 2089 update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
2d4056fa 2090 task_numa_find_cpu(&env, taskimp, groupimp);
58d081b5
MG
2091 }
2092 }
2093
68d1b02a
RR
2094 /*
2095 * If the task is part of a workload that spans multiple NUMA nodes,
2096 * and is migrating into one of the workload's active nodes, remember
2097 * this node as the task's preferred numa node, so the workload can
2098 * settle down.
2099 * A task that migrated to a second choice node will be better off
2100 * trying for a better one later. Do not set the preferred node here.
2101 */
cb361d8c 2102 if (ng) {
db015dae
RR
2103 if (env.best_cpu == -1)
2104 nid = env.src_nid;
2105 else
8cd45eee 2106 nid = cpu_to_node(env.best_cpu);
db015dae 2107
8cd45eee
SD
2108 if (nid != p->numa_preferred_nid)
2109 sched_setnuma(p, nid);
db015dae
RR
2110 }
2111
2112 /* No better CPU than the current one was found. */
f22aef4a 2113 if (env.best_cpu == -1) {
b2b2042b 2114 trace_sched_stick_numa(p, env.src_cpu, NULL, -1);
db015dae 2115 return -EAGAIN;
f22aef4a 2116 }
0ec8aa00 2117
a4739eca 2118 best_rq = cpu_rq(env.best_cpu);
fb13c7ee 2119 if (env.best_task == NULL) {
286549dc 2120 ret = migrate_task_to(p, env.best_cpu);
a4739eca 2121 WRITE_ONCE(best_rq->numa_migrate_on, 0);
286549dc 2122 if (ret != 0)
b2b2042b 2123 trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);
fb13c7ee
MG
2124 return ret;
2125 }
2126
0ad4e3df 2127 ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
a4739eca 2128 WRITE_ONCE(best_rq->numa_migrate_on, 0);
0ad4e3df 2129
286549dc 2130 if (ret != 0)
b2b2042b 2131 trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);
fb13c7ee
MG
2132 put_task_struct(env.best_task);
2133 return ret;
e6628d5b
MG
2134}
2135
6b9a7460
MG
2136/* Attempt to migrate a task to a CPU on the preferred node. */
2137static void numa_migrate_preferred(struct task_struct *p)
2138{
5085e2a3
RR
2139 unsigned long interval = HZ;
2140
2739d3ee 2141 /* This task has no NUMA fault statistics yet */
98fa15f3 2142 if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
6b9a7460
MG
2143 return;
2144
2739d3ee 2145 /* Periodically retry migrating the task to the preferred node */
5085e2a3 2146 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
789ba280 2147 p->numa_migrate_retry = jiffies + interval;
2739d3ee
RR
2148
2149 /* Success if task is already running on preferred CPU */
de1b301a 2150 if (task_node(p) == p->numa_preferred_nid)
6b9a7460
MG
2151 return;
2152
2153 /* Otherwise, try migrate to a CPU on the preferred node */
2739d3ee 2154 task_numa_migrate(p);
6b9a7460
MG
2155}
2156
20e07dea 2157/*
7d380f24 2158 * Find out how many nodes the workload is actively running on. Do this by
20e07dea
RR
2159 * tracking the nodes from which NUMA hinting faults are triggered. This can
2160 * be different from the set of nodes where the workload's memory is currently
2161 * located.
20e07dea 2162 */
4142c3eb 2163static void numa_group_count_active_nodes(struct numa_group *numa_group)
20e07dea
RR
2164{
2165 unsigned long faults, max_faults = 0;
4142c3eb 2166 int nid, active_nodes = 0;
20e07dea 2167
5c7b1aaf 2168 for_each_node_state(nid, N_CPU) {
20e07dea
RR
2169 faults = group_faults_cpu(numa_group, nid);
2170 if (faults > max_faults)
2171 max_faults = faults;
2172 }
2173
5c7b1aaf 2174 for_each_node_state(nid, N_CPU) {
20e07dea 2175 faults = group_faults_cpu(numa_group, nid);
4142c3eb
RR
2176 if (faults * ACTIVE_NODE_FRACTION > max_faults)
2177 active_nodes++;
20e07dea 2178 }
4142c3eb
RR
2179
2180 numa_group->max_faults_cpu = max_faults;
2181 numa_group->active_nodes = active_nodes;
20e07dea
RR
2182}
2183
04bb2f94
RR
2184/*
2185 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
2186 * increments. The more local the fault statistics are, the higher the scan
a22b4b01
RR
2187 * period will be for the next scan window. If local/(local+remote) ratio is
2188 * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
2189 * the scan period will decrease. Aim for 70% local accesses.
04bb2f94
RR
2190 */
2191#define NUMA_PERIOD_SLOTS 10
a22b4b01 2192#define NUMA_PERIOD_THRESHOLD 7
04bb2f94
RR
2193
2194/*
2195 * Increase the scan period (slow down scanning) if the majority of
2196 * our memory is already on our local node, or if the majority of
2197 * the page accesses are shared with other processes.
2198 * Otherwise, decrease the scan period.
2199 */
2200static void update_task_scan_period(struct task_struct *p,
2201 unsigned long shared, unsigned long private)
2202{
2203 unsigned int period_slot;
37ec97de 2204 int lr_ratio, ps_ratio;
04bb2f94
RR
2205 int diff;
2206
2207 unsigned long remote = p->numa_faults_locality[0];
2208 unsigned long local = p->numa_faults_locality[1];
2209
2210 /*
2211 * If there were no record hinting faults then either the task is
7d380f24 2212 * completely idle or all activity is in areas that are not of interest
074c2381
MG
2213 * to automatic numa balancing. Related to that, if there were failed
2214 * migration then it implies we are migrating too quickly or the local
2215 * node is overloaded. In either case, scan slower
04bb2f94 2216 */
074c2381 2217 if (local + shared == 0 || p->numa_faults_locality[2]) {
04bb2f94
RR
2218 p->numa_scan_period = min(p->numa_scan_period_max,
2219 p->numa_scan_period << 1);
2220
2221 p->mm->numa_next_scan = jiffies +
2222 msecs_to_jiffies(p->numa_scan_period);
2223
2224 return;
2225 }
2226
2227 /*
2228 * Prepare to scale scan period relative to the current period.
2229 * == NUMA_PERIOD_THRESHOLD scan period stays the same
2230 * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
2231 * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
2232 */
2233 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
37ec97de
RR
2234 lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
2235 ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
2236
2237 if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
2238 /*
2239 * Most memory accesses are local. There is no need to
2240 * do fast NUMA scanning, since memory is already local.
2241 */
2242 int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
2243 if (!slot)
2244 slot = 1;
2245 diff = slot * period_slot;
2246 } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
2247 /*
2248 * Most memory accesses are shared with other tasks.
2249 * There is no point in continuing fast NUMA scanning,
2250 * since other tasks may just move the memory elsewhere.
2251 */
2252 int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
04bb2f94
RR
2253 if (!slot)
2254 slot = 1;
2255 diff = slot * period_slot;
2256 } else {
04bb2f94 2257 /*
37ec97de
RR
2258 * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
2259 * yet they are not on the local NUMA node. Speed up
2260 * NUMA scanning to get the memory moved over.
04bb2f94 2261 */
37ec97de
RR
2262 int ratio = max(lr_ratio, ps_ratio);
2263 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
04bb2f94
RR
2264 }
2265
2266 p->numa_scan_period = clamp(p->numa_scan_period + diff,
2267 task_scan_min(p), task_scan_max(p));
2268 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2269}
2270
7e2703e6
RR
2271/*
2272 * Get the fraction of time the task has been running since the last
2273 * NUMA placement cycle. The scheduler keeps similar statistics, but
2274 * decays those on a 32ms period, which is orders of magnitude off
2275 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
2276 * stats only if the task is so new there are no NUMA statistics yet.
2277 */
2278static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
2279{
2280 u64 runtime, delta, now;
2281 /* Use the start of this time slice to avoid calculations. */
2282 now = p->se.exec_start;
2283 runtime = p->se.sum_exec_runtime;
2284
2285 if (p->last_task_numa_placement) {
2286 delta = runtime - p->last_sum_exec_runtime;
2287 *period = now - p->last_task_numa_placement;
a860fa7b
XX
2288
2289 /* Avoid time going backwards, prevent potential divide error: */
2290 if (unlikely((s64)*period < 0))
2291 *period = 0;
7e2703e6 2292 } else {
c7b50216 2293 delta = p->se.avg.load_sum;
9d89c257 2294 *period = LOAD_AVG_MAX;
7e2703e6
RR
2295 }
2296
2297 p->last_sum_exec_runtime = runtime;
2298 p->last_task_numa_placement = now;
2299
2300 return delta;
2301}
2302
54009416
RR
2303/*
2304 * Determine the preferred nid for a task in a numa_group. This needs to
2305 * be done in a way that produces consistent results with group_weight,
2306 * otherwise workloads might not converge.
2307 */
2308static int preferred_group_nid(struct task_struct *p, int nid)
2309{
2310 nodemask_t nodes;
2311 int dist;
2312
2313 /* Direct connections between all NUMA nodes. */
2314 if (sched_numa_topology_type == NUMA_DIRECT)
2315 return nid;
2316
2317 /*
2318 * On a system with glueless mesh NUMA topology, group_weight
2319 * scores nodes according to the number of NUMA hinting faults on
2320 * both the node itself, and on nearby nodes.
2321 */
2322 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
2323 unsigned long score, max_score = 0;
2324 int node, max_node = nid;
2325
2326 dist = sched_max_numa_distance;
2327
5c7b1aaf 2328 for_each_node_state(node, N_CPU) {
54009416
RR
2329 score = group_weight(p, node, dist);
2330 if (score > max_score) {
2331 max_score = score;
2332 max_node = node;
2333 }
2334 }
2335 return max_node;
2336 }
2337
2338 /*
2339 * Finding the preferred nid in a system with NUMA backplane
2340 * interconnect topology is more involved. The goal is to locate
2341 * tasks from numa_groups near each other in the system, and
2342 * untangle workloads from different sides of the system. This requires
2343 * searching down the hierarchy of node groups, recursively searching
2344 * inside the highest scoring group of nodes. The nodemask tricks
2345 * keep the complexity of the search down.
2346 */
5c7b1aaf 2347 nodes = node_states[N_CPU];
54009416
RR
2348 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
2349 unsigned long max_faults = 0;
81907478 2350 nodemask_t max_group = NODE_MASK_NONE;
54009416
RR
2351 int a, b;
2352
2353 /* Are there nodes at this distance from each other? */
2354 if (!find_numa_distance(dist))
2355 continue;
2356
2357 for_each_node_mask(a, nodes) {
2358 unsigned long faults = 0;
2359 nodemask_t this_group;
2360 nodes_clear(this_group);
2361
2362 /* Sum group's NUMA faults; includes a==b case. */
2363 for_each_node_mask(b, nodes) {
2364 if (node_distance(a, b) < dist) {
2365 faults += group_faults(p, b);
2366 node_set(b, this_group);
2367 node_clear(b, nodes);
2368 }
2369 }
2370
2371 /* Remember the top group. */
2372 if (faults > max_faults) {
2373 max_faults = faults;
2374 max_group = this_group;
2375 /*
2376 * subtle: at the smallest distance there is
2377 * just one node left in each "group", the
2378 * winner is the preferred nid.
2379 */
2380 nid = a;
2381 }
2382 }
2383 /* Next round, evaluate the nodes within max_group. */
890a5409
JB
2384 if (!max_faults)
2385 break;
54009416
RR
2386 nodes = max_group;
2387 }
2388 return nid;
2389}
2390
cbee9f88
PZ
2391static void task_numa_placement(struct task_struct *p)
2392{
98fa15f3 2393 int seq, nid, max_nid = NUMA_NO_NODE;
f03bb676 2394 unsigned long max_faults = 0;
04bb2f94 2395 unsigned long fault_types[2] = { 0, 0 };
7e2703e6
RR
2396 unsigned long total_faults;
2397 u64 runtime, period;
7dbd13ed 2398 spinlock_t *group_lock = NULL;
cb361d8c 2399 struct numa_group *ng;
cbee9f88 2400
7e5a2c17
JL
2401 /*
2402 * The p->mm->numa_scan_seq field gets updated without
2403 * exclusive access. Use READ_ONCE() here to ensure
2404 * that the field is read in a single access:
2405 */
316c1608 2406 seq = READ_ONCE(p->mm->numa_scan_seq);
cbee9f88
PZ
2407 if (p->numa_scan_seq == seq)
2408 return;
2409 p->numa_scan_seq = seq;
598f0ec0 2410 p->numa_scan_period_max = task_scan_max(p);
cbee9f88 2411
7e2703e6
RR
2412 total_faults = p->numa_faults_locality[0] +
2413 p->numa_faults_locality[1];
2414 runtime = numa_get_avg_runtime(p, &period);
2415
7dbd13ed 2416 /* If the task is part of a group prevent parallel updates to group stats */
cb361d8c
JH
2417 ng = deref_curr_numa_group(p);
2418 if (ng) {
2419 group_lock = &ng->lock;
60e69eed 2420 spin_lock_irq(group_lock);
7dbd13ed
MG
2421 }
2422
688b7585
MG
2423 /* Find the node with the highest number of faults */
2424 for_each_online_node(nid) {
44dba3d5
IM
2425 /* Keep track of the offsets in numa_faults array */
2426 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
83e1d2cd 2427 unsigned long faults = 0, group_faults = 0;
44dba3d5 2428 int priv;
745d6147 2429
be1e4e76 2430 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
7e2703e6 2431 long diff, f_diff, f_weight;
8c8a743c 2432
44dba3d5
IM
2433 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2434 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2435 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2436 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
745d6147 2437
ac8e895b 2438 /* Decay existing window, copy faults since last scan */
44dba3d5
IM
2439 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2440 fault_types[priv] += p->numa_faults[membuf_idx];
2441 p->numa_faults[membuf_idx] = 0;
fb13c7ee 2442
7e2703e6
RR
2443 /*
2444 * Normalize the faults_from, so all tasks in a group
2445 * count according to CPU use, instead of by the raw
2446 * number of faults. Tasks with little runtime have
2447 * little over-all impact on throughput, and thus their
2448 * faults are less important.
2449 */
2450 f_weight = div64_u64(runtime << 16, period + 1);
44dba3d5 2451 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
7e2703e6 2452 (total_faults + 1);
44dba3d5
IM
2453 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2454 p->numa_faults[cpubuf_idx] = 0;
50ec8a40 2455
44dba3d5
IM
2456 p->numa_faults[mem_idx] += diff;
2457 p->numa_faults[cpu_idx] += f_diff;
2458 faults += p->numa_faults[mem_idx];
83e1d2cd 2459 p->total_numa_faults += diff;
cb361d8c 2460 if (ng) {
44dba3d5
IM
2461 /*
2462 * safe because we can only change our own group
2463 *
2464 * mem_idx represents the offset for a given
2465 * nid and priv in a specific region because it
2466 * is at the beginning of the numa_faults array.
2467 */
cb361d8c 2468 ng->faults[mem_idx] += diff;
5b763a14 2469 ng->faults[cpu_idx] += f_diff;
cb361d8c
JH
2470 ng->total_faults += diff;
2471 group_faults += ng->faults[mem_idx];
8c8a743c 2472 }
ac8e895b
MG
2473 }
2474
cb361d8c 2475 if (!ng) {
f03bb676
SD
2476 if (faults > max_faults) {
2477 max_faults = faults;
2478 max_nid = nid;
2479 }
2480 } else if (group_faults > max_faults) {
2481 max_faults = group_faults;
688b7585
MG
2482 max_nid = nid;
2483 }
83e1d2cd
MG
2484 }
2485
5c7b1aaf 2486 /* Cannot migrate task to CPU-less node */
ab31c7fd 2487 if (max_nid != NUMA_NO_NODE && !node_state(max_nid, N_CPU)) {
5c7b1aaf
HY
2488 int near_nid = max_nid;
2489 int distance, near_distance = INT_MAX;
2490
2491 for_each_node_state(nid, N_CPU) {
2492 distance = node_distance(max_nid, nid);
2493 if (distance < near_distance) {
2494 near_nid = nid;
2495 near_distance = distance;
2496 }
2497 }
2498 max_nid = near_nid;
2499 }
2500
cb361d8c
JH
2501 if (ng) {
2502 numa_group_count_active_nodes(ng);
60e69eed 2503 spin_unlock_irq(group_lock);
f03bb676 2504 max_nid = preferred_group_nid(p, max_nid);
688b7585
MG
2505 }
2506
bb97fc31
RR
2507 if (max_faults) {
2508 /* Set the new preferred node */
2509 if (max_nid != p->numa_preferred_nid)
2510 sched_setnuma(p, max_nid);
3a7053b3 2511 }
30619c89
SD
2512
2513 update_task_scan_period(p, fault_types[0], fault_types[1]);
cbee9f88
PZ
2514}
2515
8c8a743c
PZ
2516static inline int get_numa_group(struct numa_group *grp)
2517{
c45a7795 2518 return refcount_inc_not_zero(&grp->refcount);
8c8a743c
PZ
2519}
2520
2521static inline void put_numa_group(struct numa_group *grp)
2522{
c45a7795 2523 if (refcount_dec_and_test(&grp->refcount))
8c8a743c
PZ
2524 kfree_rcu(grp, rcu);
2525}
2526
3e6a9418
MG
2527static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2528 int *priv)
8c8a743c
PZ
2529{
2530 struct numa_group *grp, *my_grp;
2531 struct task_struct *tsk;
2532 bool join = false;
2533 int cpu = cpupid_to_cpu(cpupid);
2534 int i;
2535
cb361d8c 2536 if (unlikely(!deref_curr_numa_group(p))) {
8c8a743c 2537 unsigned int size = sizeof(struct numa_group) +
7a2341fc
BR
2538 NR_NUMA_HINT_FAULT_STATS *
2539 nr_node_ids * sizeof(unsigned long);
8c8a743c
PZ
2540
2541 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2542 if (!grp)
2543 return;
2544
c45a7795 2545 refcount_set(&grp->refcount, 1);
4142c3eb
RR
2546 grp->active_nodes = 1;
2547 grp->max_faults_cpu = 0;
8c8a743c 2548 spin_lock_init(&grp->lock);
e29cf08b 2549 grp->gid = p->pid;
8c8a743c 2550
be1e4e76 2551 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
44dba3d5 2552 grp->faults[i] = p->numa_faults[i];
8c8a743c 2553
989348b5 2554 grp->total_faults = p->total_numa_faults;
83e1d2cd 2555
8c8a743c
PZ
2556 grp->nr_tasks++;
2557 rcu_assign_pointer(p->numa_group, grp);
2558 }
2559
2560 rcu_read_lock();
316c1608 2561 tsk = READ_ONCE(cpu_rq(cpu)->curr);
8c8a743c
PZ
2562
2563 if (!cpupid_match_pid(tsk, cpupid))
3354781a 2564 goto no_join;
8c8a743c
PZ
2565
2566 grp = rcu_dereference(tsk->numa_group);
2567 if (!grp)
3354781a 2568 goto no_join;
8c8a743c 2569
cb361d8c 2570 my_grp = deref_curr_numa_group(p);
8c8a743c 2571 if (grp == my_grp)
3354781a 2572 goto no_join;
8c8a743c
PZ
2573
2574 /*
2575 * Only join the other group if its bigger; if we're the bigger group,
2576 * the other task will join us.
2577 */
2578 if (my_grp->nr_tasks > grp->nr_tasks)
3354781a 2579 goto no_join;
8c8a743c
PZ
2580
2581 /*
2582 * Tie-break on the grp address.
2583 */
2584 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
3354781a 2585 goto no_join;
8c8a743c 2586
dabe1d99
RR
2587 /* Always join threads in the same process. */
2588 if (tsk->mm == current->mm)
2589 join = true;
2590
2591 /* Simple filter to avoid false positives due to PID collisions */
2592 if (flags & TNF_SHARED)
2593 join = true;
8c8a743c 2594
3e6a9418
MG
2595 /* Update priv based on whether false sharing was detected */
2596 *priv = !join;
2597
dabe1d99 2598 if (join && !get_numa_group(grp))
3354781a 2599 goto no_join;
8c8a743c 2600
8c8a743c
PZ
2601 rcu_read_unlock();
2602
2603 if (!join)
2604 return;
2605
60e69eed
MG
2606 BUG_ON(irqs_disabled());
2607 double_lock_irq(&my_grp->lock, &grp->lock);
989348b5 2608
be1e4e76 2609 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
44dba3d5
IM
2610 my_grp->faults[i] -= p->numa_faults[i];
2611 grp->faults[i] += p->numa_faults[i];
8c8a743c 2612 }
989348b5
MG
2613 my_grp->total_faults -= p->total_numa_faults;
2614 grp->total_faults += p->total_numa_faults;
8c8a743c 2615
8c8a743c
PZ
2616 my_grp->nr_tasks--;
2617 grp->nr_tasks++;
2618
2619 spin_unlock(&my_grp->lock);
60e69eed 2620 spin_unlock_irq(&grp->lock);
8c8a743c
PZ
2621
2622 rcu_assign_pointer(p->numa_group, grp);
2623
2624 put_numa_group(my_grp);
3354781a
PZ
2625 return;
2626
2627no_join:
2628 rcu_read_unlock();
2629 return;
8c8a743c
PZ
2630}
2631
16d51a59 2632/*
3b03706f 2633 * Get rid of NUMA statistics associated with a task (either current or dead).
16d51a59
JH
2634 * If @final is set, the task is dead and has reached refcount zero, so we can
2635 * safely free all relevant data structures. Otherwise, there might be
2636 * concurrent reads from places like load balancing and procfs, and we should
2637 * reset the data back to default state without freeing ->numa_faults.
2638 */
2639void task_numa_free(struct task_struct *p, bool final)
8c8a743c 2640{
cb361d8c
JH
2641 /* safe: p either is current or is being freed by current */
2642 struct numa_group *grp = rcu_dereference_raw(p->numa_group);
16d51a59 2643 unsigned long *numa_faults = p->numa_faults;
e9dd685c
SR
2644 unsigned long flags;
2645 int i;
8c8a743c 2646
16d51a59
JH
2647 if (!numa_faults)
2648 return;
2649
8c8a743c 2650 if (grp) {
e9dd685c 2651 spin_lock_irqsave(&grp->lock, flags);
be1e4e76 2652 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
44dba3d5 2653 grp->faults[i] -= p->numa_faults[i];
989348b5 2654 grp->total_faults -= p->total_numa_faults;
83e1d2cd 2655
8c8a743c 2656 grp->nr_tasks--;
e9dd685c 2657 spin_unlock_irqrestore(&grp->lock, flags);
35b123e2 2658 RCU_INIT_POINTER(p->numa_group, NULL);
8c8a743c
PZ
2659 put_numa_group(grp);
2660 }
2661
16d51a59
JH
2662 if (final) {
2663 p->numa_faults = NULL;
2664 kfree(numa_faults);
2665 } else {
2666 p->total_numa_faults = 0;
2667 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2668 numa_faults[i] = 0;
2669 }
8c8a743c
PZ
2670}
2671
cbee9f88
PZ
2672/*
2673 * Got a PROT_NONE fault for a page on @node.
2674 */
58b46da3 2675void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
cbee9f88
PZ
2676{
2677 struct task_struct *p = current;
6688cc05 2678 bool migrated = flags & TNF_MIGRATED;
58b46da3 2679 int cpu_node = task_node(current);
792568ec 2680 int local = !!(flags & TNF_FAULT_LOCAL);
4142c3eb 2681 struct numa_group *ng;
ac8e895b 2682 int priv;
cbee9f88 2683
2a595721 2684 if (!static_branch_likely(&sched_numa_balancing))
1a687c2e
MG
2685 return;
2686
9ff1d9ff
MG
2687 /* for example, ksmd faulting in a user's mm */
2688 if (!p->mm)
2689 return;
2690
f809ca9a 2691 /* Allocate buffer to track faults on a per-node basis */
44dba3d5
IM
2692 if (unlikely(!p->numa_faults)) {
2693 int size = sizeof(*p->numa_faults) *
be1e4e76 2694 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
f809ca9a 2695
44dba3d5
IM
2696 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2697 if (!p->numa_faults)
f809ca9a 2698 return;
745d6147 2699
83e1d2cd 2700 p->total_numa_faults = 0;
04bb2f94 2701 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
f809ca9a 2702 }
cbee9f88 2703
8c8a743c
PZ
2704 /*
2705 * First accesses are treated as private, otherwise consider accesses
2706 * to be private if the accessing pid has not changed
2707 */
2708 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2709 priv = 1;
2710 } else {
2711 priv = cpupid_match_pid(p, last_cpupid);
6688cc05 2712 if (!priv && !(flags & TNF_NO_GROUP))
3e6a9418 2713 task_numa_group(p, last_cpupid, flags, &priv);
8c8a743c
PZ
2714 }
2715
792568ec
RR
2716 /*
2717 * If a workload spans multiple NUMA nodes, a shared fault that
2718 * occurs wholly within the set of nodes that the workload is
2719 * actively using should be counted as local. This allows the
2720 * scan rate to slow down when a workload has settled down.
2721 */
cb361d8c 2722 ng = deref_curr_numa_group(p);
4142c3eb
RR
2723 if (!priv && !local && ng && ng->active_nodes > 1 &&
2724 numa_is_active_node(cpu_node, ng) &&
2725 numa_is_active_node(mem_node, ng))
792568ec
RR
2726 local = 1;
2727
2739d3ee 2728 /*
e1ff516a
YW
2729 * Retry to migrate task to preferred node periodically, in case it
2730 * previously failed, or the scheduler moved us.
2739d3ee 2731 */
b6a60cf3
SD
2732 if (time_after(jiffies, p->numa_migrate_retry)) {
2733 task_numa_placement(p);
6b9a7460 2734 numa_migrate_preferred(p);
b6a60cf3 2735 }
6b9a7460 2736
b32e86b4
IM
2737 if (migrated)
2738 p->numa_pages_migrated += pages;
074c2381
MG
2739 if (flags & TNF_MIGRATE_FAIL)
2740 p->numa_faults_locality[2] += pages;
b32e86b4 2741
44dba3d5
IM
2742 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2743 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
792568ec 2744 p->numa_faults_locality[local] += pages;
cbee9f88
PZ
2745}
2746
6e5fb223
PZ
2747static void reset_ptenuma_scan(struct task_struct *p)
2748{
7e5a2c17
JL
2749 /*
2750 * We only did a read acquisition of the mmap sem, so
2751 * p->mm->numa_scan_seq is written to without exclusive access
2752 * and the update is not guaranteed to be atomic. That's not
2753 * much of an issue though, since this is just used for
2754 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2755 * expensive, to avoid any form of compiler optimizations:
2756 */
316c1608 2757 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
6e5fb223
PZ
2758 p->mm->numa_scan_offset = 0;
2759}
2760
cbee9f88
PZ
2761/*
2762 * The expensive part of numa migration is done from task_work context.
2763 * Triggered from task_tick_numa().
2764 */
9434f9f5 2765static void task_numa_work(struct callback_head *work)
cbee9f88
PZ
2766{
2767 unsigned long migrate, next_scan, now = jiffies;
2768 struct task_struct *p = current;
2769 struct mm_struct *mm = p->mm;
51170840 2770 u64 runtime = p->se.sum_exec_runtime;
6e5fb223 2771 struct vm_area_struct *vma;
9f40604c 2772 unsigned long start, end;
598f0ec0 2773 unsigned long nr_pte_updates = 0;
4620f8c1 2774 long pages, virtpages;
cbee9f88 2775
9148a3a1 2776 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
cbee9f88 2777
b34920d4 2778 work->next = work;
cbee9f88
PZ
2779 /*
2780 * Who cares about NUMA placement when they're dying.
2781 *
2782 * NOTE: make sure not to dereference p->mm before this check,
2783 * exit_task_work() happens _after_ exit_mm() so we could be called
2784 * without p->mm even though we still had it when we enqueued this
2785 * work.
2786 */
2787 if (p->flags & PF_EXITING)
2788 return;
2789
930aa174 2790 if (!mm->numa_next_scan) {
7e8d16b6
MG
2791 mm->numa_next_scan = now +
2792 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
b8593bfd
MG
2793 }
2794
cbee9f88
PZ
2795 /*
2796 * Enforce maximal scan/migration frequency..
2797 */
2798 migrate = mm->numa_next_scan;
2799 if (time_before(now, migrate))
2800 return;
2801
598f0ec0
MG
2802 if (p->numa_scan_period == 0) {
2803 p->numa_scan_period_max = task_scan_max(p);
b5dd77c8 2804 p->numa_scan_period = task_scan_start(p);
598f0ec0 2805 }
cbee9f88 2806
fb003b80 2807 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
cbee9f88
PZ
2808 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2809 return;
2810
19a78d11
PZ
2811 /*
2812 * Delay this task enough that another task of this mm will likely win
2813 * the next time around.
2814 */
2815 p->node_stamp += 2 * TICK_NSEC;
2816
9f40604c
MG
2817 start = mm->numa_scan_offset;
2818 pages = sysctl_numa_balancing_scan_size;
2819 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
4620f8c1 2820 virtpages = pages * 8; /* Scan up to this much virtual space */
9f40604c
MG
2821 if (!pages)
2822 return;
cbee9f88 2823
4620f8c1 2824
d8ed45c5 2825 if (!mmap_read_trylock(mm))
8655d549 2826 return;
9f40604c 2827 vma = find_vma(mm, start);
6e5fb223
PZ
2828 if (!vma) {
2829 reset_ptenuma_scan(p);
9f40604c 2830 start = 0;
6e5fb223
PZ
2831 vma = mm->mmap;
2832 }
9f40604c 2833 for (; vma; vma = vma->vm_next) {
6b79c57b 2834 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
8e76d4ee 2835 is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
6e5fb223 2836 continue;
6b79c57b 2837 }
6e5fb223 2838
4591ce4f
MG
2839 /*
2840 * Shared library pages mapped by multiple processes are not
2841 * migrated as it is expected they are cache replicated. Avoid
2842 * hinting faults in read-only file-backed mappings or the vdso
2843 * as migrating the pages will be of marginal benefit.
2844 */
2845 if (!vma->vm_mm ||
2846 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2847 continue;
2848
3c67f474
MG
2849 /*
2850 * Skip inaccessible VMAs to avoid any confusion between
2851 * PROT_NONE and NUMA hinting ptes
2852 */
3122e80e 2853 if (!vma_is_accessible(vma))
3c67f474 2854 continue;
4591ce4f 2855
9f40604c
MG
2856 do {
2857 start = max(start, vma->vm_start);
2858 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2859 end = min(end, vma->vm_end);
4620f8c1 2860 nr_pte_updates = change_prot_numa(vma, start, end);
598f0ec0
MG
2861
2862 /*
4620f8c1
RR
2863 * Try to scan sysctl_numa_balancing_size worth of
2864 * hpages that have at least one present PTE that
2865 * is not already pte-numa. If the VMA contains
2866 * areas that are unused or already full of prot_numa
2867 * PTEs, scan up to virtpages, to skip through those
2868 * areas faster.
598f0ec0
MG
2869 */
2870 if (nr_pte_updates)
2871 pages -= (end - start) >> PAGE_SHIFT;
4620f8c1 2872 virtpages -= (end - start) >> PAGE_SHIFT;
6e5fb223 2873
9f40604c 2874 start = end;
4620f8c1 2875 if (pages <= 0 || virtpages <= 0)
9f40604c 2876 goto out;
3cf1962c
RR
2877
2878 cond_resched();
9f40604c 2879 } while (end != vma->vm_end);
cbee9f88 2880 }
6e5fb223 2881
9f40604c 2882out:
6e5fb223 2883 /*
c69307d5
PZ
2884 * It is possible to reach the end of the VMA list but the last few
2885 * VMAs are not guaranteed to the vma_migratable. If they are not, we
2886 * would find the !migratable VMA on the next scan but not reset the
2887 * scanner to the start so check it now.
6e5fb223
PZ
2888 */
2889 if (vma)
9f40604c 2890 mm->numa_scan_offset = start;
6e5fb223
PZ
2891 else
2892 reset_ptenuma_scan(p);
d8ed45c5 2893 mmap_read_unlock(mm);
51170840
RR
2894
2895 /*
2896 * Make sure tasks use at least 32x as much time to run other code
2897 * than they used here, to limit NUMA PTE scanning overhead to 3% max.
2898 * Usually update_task_scan_period slows down scanning enough; on an
2899 * overloaded system we need to limit overhead on a per task basis.
2900 */
2901 if (unlikely(p->se.sum_exec_runtime != runtime)) {
2902 u64 diff = p->se.sum_exec_runtime - runtime;
2903 p->node_stamp += 32 * diff;
2904 }
cbee9f88
PZ
2905}
2906
d35927a1
VS
2907void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
2908{
2909 int mm_users = 0;
2910 struct mm_struct *mm = p->mm;
2911
2912 if (mm) {
2913 mm_users = atomic_read(&mm->mm_users);
2914 if (mm_users == 1) {
2915 mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2916 mm->numa_scan_seq = 0;
2917 }
2918 }
2919 p->node_stamp = 0;
2920 p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
2921 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
70ce3ea9 2922 p->numa_migrate_retry = 0;
b34920d4 2923 /* Protect against double add, see task_tick_numa and task_numa_work */
d35927a1
VS
2924 p->numa_work.next = &p->numa_work;
2925 p->numa_faults = NULL;
12bf8a7e
HW
2926 p->numa_pages_migrated = 0;
2927 p->total_numa_faults = 0;
d35927a1
VS
2928 RCU_INIT_POINTER(p->numa_group, NULL);
2929 p->last_task_numa_placement = 0;
2930 p->last_sum_exec_runtime = 0;
2931
b34920d4
VS
2932 init_task_work(&p->numa_work, task_numa_work);
2933
d35927a1
VS
2934 /* New address space, reset the preferred nid */
2935 if (!(clone_flags & CLONE_VM)) {
2936 p->numa_preferred_nid = NUMA_NO_NODE;
2937 return;
2938 }
2939
2940 /*
2941 * New thread, keep existing numa_preferred_nid which should be copied
2942 * already by arch_dup_task_struct but stagger when scans start.
2943 */
2944 if (mm) {
2945 unsigned int delay;
2946
2947 delay = min_t(unsigned int, task_scan_max(current),
2948 current->numa_scan_period * mm_users * NSEC_PER_MSEC);
2949 delay += 2 * TICK_NSEC;
2950 p->node_stamp = delay;
2951 }
2952}
2953
cbee9f88
PZ
2954/*
2955 * Drive the periodic memory faults..
2956 */
b1546edc 2957static void task_tick_numa(struct rq *rq, struct task_struct *curr)
cbee9f88
PZ
2958{
2959 struct callback_head *work = &curr->numa_work;
2960 u64 period, now;
2961
2962 /*
2963 * We don't care about NUMA placement if we don't have memory.
2964 */
b3f9916d 2965 if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
cbee9f88
PZ
2966 return;
2967
2968 /*
2969 * Using runtime rather than walltime has the dual advantage that
2970 * we (mostly) drive the selection from busy threads and that the
2971 * task needs to have done some actual work before we bother with
2972 * NUMA placement.
2973 */
2974 now = curr->se.sum_exec_runtime;
2975 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2976
25b3e5a3 2977 if (now > curr->node_stamp + period) {
4b96a29b 2978 if (!curr->node_stamp)
b5dd77c8 2979 curr->numa_scan_period = task_scan_start(curr);
19a78d11 2980 curr->node_stamp += period;
cbee9f88 2981
b34920d4 2982 if (!time_before(jiffies, curr->mm->numa_next_scan))
91989c70 2983 task_work_add(curr, work, TWA_RESUME);
cbee9f88
PZ
2984 }
2985}
3fed382b 2986
3f9672ba
SD
2987static void update_scan_period(struct task_struct *p, int new_cpu)
2988{
2989 int src_nid = cpu_to_node(task_cpu(p));
2990 int dst_nid = cpu_to_node(new_cpu);
2991
05cbdf4f
MG
2992 if (!static_branch_likely(&sched_numa_balancing))
2993 return;
2994
3f9672ba
SD
2995 if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
2996 return;
2997
05cbdf4f
MG
2998 if (src_nid == dst_nid)
2999 return;
3000
3001 /*
3002 * Allow resets if faults have been trapped before one scan
3003 * has completed. This is most likely due to a new task that
3004 * is pulled cross-node due to wakeups or load balancing.
3005 */
3006 if (p->numa_scan_seq) {
3007 /*
3008 * Avoid scan adjustments if moving to the preferred
3009 * node or if the task was not previously running on
3010 * the preferred node.
3011 */
3012 if (dst_nid == p->numa_preferred_nid ||
98fa15f3
AK
3013 (p->numa_preferred_nid != NUMA_NO_NODE &&
3014 src_nid != p->numa_preferred_nid))
05cbdf4f
MG
3015 return;
3016 }
3017
3018 p->numa_scan_period = task_scan_start(p);
3f9672ba
SD
3019}
3020
cbee9f88
PZ
3021#else
3022static void task_tick_numa(struct rq *rq, struct task_struct *curr)
3023{
3024}
0ec8aa00
PZ
3025
3026static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
3027{
3028}
3029
3030static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
3031{
3032}
3fed382b 3033
3f9672ba
SD
3034static inline void update_scan_period(struct task_struct *p, int new_cpu)
3035{
3036}
3037
cbee9f88
PZ
3038#endif /* CONFIG_NUMA_BALANCING */
3039
30cfdcfc
DA
3040static void
3041account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
3042{
3043 update_load_add(&cfs_rq->load, se->load.weight);
367456c7 3044#ifdef CONFIG_SMP
0ec8aa00
PZ
3045 if (entity_is_task(se)) {
3046 struct rq *rq = rq_of(cfs_rq);
3047
3048 account_numa_enqueue(rq, task_of(se));
3049 list_add(&se->group_node, &rq->cfs_tasks);
3050 }
367456c7 3051#endif
30cfdcfc 3052 cfs_rq->nr_running++;
a480adde
JD
3053 if (se_is_idle(se))
3054 cfs_rq->idle_nr_running++;
30cfdcfc
DA
3055}
3056
3057static void
3058account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
3059{
3060 update_load_sub(&cfs_rq->load, se->load.weight);
bfdb198c 3061#ifdef CONFIG_SMP
0ec8aa00
PZ
3062 if (entity_is_task(se)) {
3063 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
b87f1724 3064 list_del_init(&se->group_node);
0ec8aa00 3065 }
bfdb198c 3066#endif
30cfdcfc 3067 cfs_rq->nr_running--;
a480adde
JD
3068 if (se_is_idle(se))
3069 cfs_rq->idle_nr_running--;
30cfdcfc
DA
3070}
3071
8d5b9025
PZ
3072/*
3073 * Signed add and clamp on underflow.
3074 *
3075 * Explicitly do a load-store to ensure the intermediate value never hits
3076 * memory. This allows lockless observations without ever seeing the negative
3077 * values.
3078 */
3079#define add_positive(_ptr, _val) do { \
3080 typeof(_ptr) ptr = (_ptr); \
3081 typeof(_val) val = (_val); \
3082 typeof(*ptr) res, var = READ_ONCE(*ptr); \
3083 \
3084 res = var + val; \
3085 \
3086 if (val < 0 && res > var) \
3087 res = 0; \
3088 \
3089 WRITE_ONCE(*ptr, res); \
3090} while (0)
3091
3092/*
3093 * Unsigned subtract and clamp on underflow.
3094 *
3095 * Explicitly do a load-store to ensure the intermediate value never hits
3096 * memory. This allows lockless observations without ever seeing the negative
3097 * values.
3098 */
3099#define sub_positive(_ptr, _val) do { \
3100 typeof(_ptr) ptr = (_ptr); \
3101 typeof(*ptr) val = (_val); \
3102 typeof(*ptr) res, var = READ_ONCE(*ptr); \
3103 res = var - val; \
3104 if (res > var) \
3105 res = 0; \
3106 WRITE_ONCE(*ptr, res); \
3107} while (0)
3108
b5c0ce7b
PB
3109/*
3110 * Remove and clamp on negative, from a local variable.
3111 *
3112 * A variant of sub_positive(), which does not use explicit load-store
3113 * and is thus optimized for local variable updates.
3114 */
3115#define lsub_positive(_ptr, _val) do { \
3116 typeof(_ptr) ptr = (_ptr); \
3117 *ptr -= min_t(typeof(*ptr), *ptr, _val); \
3118} while (0)
3119
8d5b9025 3120#ifdef CONFIG_SMP
8d5b9025
PZ
3121static inline void
3122enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3123{
3124 cfs_rq->avg.load_avg += se->avg.load_avg;
3125 cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
3126}
3127
3128static inline void
3129dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3130{
3131 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
2d02fa8c
VG
3132 sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
3133 /* See update_cfs_rq_load_avg() */
3134 cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
3135 cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
8d5b9025
PZ
3136}
3137#else
3138static inline void
8d5b9025
PZ
3139enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
3140static inline void
3141dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
3142#endif
3143
9059393e 3144static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
0dacee1b 3145 unsigned long weight)
9059393e
VG
3146{
3147 if (se->on_rq) {
3148 /* commit outstanding execution time */
3149 if (cfs_rq->curr == se)
3150 update_curr(cfs_rq);
1724b95b 3151 update_load_sub(&cfs_rq->load, se->load.weight);
9059393e
VG
3152 }
3153 dequeue_load_avg(cfs_rq, se);
3154
3155 update_load_set(&se->load, weight);
3156
3157#ifdef CONFIG_SMP
1ea6c46a 3158 do {
87e867b4 3159 u32 divider = get_pelt_divider(&se->avg);
1ea6c46a
PZ
3160
3161 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
1ea6c46a 3162 } while (0);
9059393e
VG
3163#endif
3164
3165 enqueue_load_avg(cfs_rq, se);
0dacee1b 3166 if (se->on_rq)
1724b95b 3167 update_load_add(&cfs_rq->load, se->load.weight);
0dacee1b 3168
9059393e
VG
3169}
3170
3171void reweight_task(struct task_struct *p, int prio)
3172{
3173 struct sched_entity *se = &p->se;
3174 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3175 struct load_weight *load = &se->load;
3176 unsigned long weight = scale_load(sched_prio_to_weight[prio]);
3177
0dacee1b 3178 reweight_entity(cfs_rq, se, weight);
9059393e
VG
3179 load->inv_weight = sched_prio_to_wmult[prio];
3180}
3181
51bf903b
CZ
3182static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
3183
3ff6dcac 3184#ifdef CONFIG_FAIR_GROUP_SCHED
387f77cc 3185#ifdef CONFIG_SMP
cef27403
PZ
3186/*
3187 * All this does is approximate the hierarchical proportion which includes that
3188 * global sum we all love to hate.
3189 *
3190 * That is, the weight of a group entity, is the proportional share of the
3191 * group weight based on the group runqueue weights. That is:
3192 *
3193 * tg->weight * grq->load.weight
3194 * ge->load.weight = ----------------------------- (1)
08f7c2f4 3195 * \Sum grq->load.weight
cef27403
PZ
3196 *
3197 * Now, because computing that sum is prohibitively expensive to compute (been
3198 * there, done that) we approximate it with this average stuff. The average
3199 * moves slower and therefore the approximation is cheaper and more stable.
3200 *
3201 * So instead of the above, we substitute:
3202 *
3203 * grq->load.weight -> grq->avg.load_avg (2)
3204 *
3205 * which yields the following:
3206 *
3207 * tg->weight * grq->avg.load_avg
3208 * ge->load.weight = ------------------------------ (3)
08f7c2f4 3209 * tg->load_avg
cef27403
PZ
3210 *
3211 * Where: tg->load_avg ~= \Sum grq->avg.load_avg
3212 *
3213 * That is shares_avg, and it is right (given the approximation (2)).
3214 *
3215 * The problem with it is that because the average is slow -- it was designed
3216 * to be exactly that of course -- this leads to transients in boundary
3217 * conditions. In specific, the case where the group was idle and we start the
3218 * one task. It takes time for our CPU's grq->avg.load_avg to build up,
3219 * yielding bad latency etc..
3220 *
3221 * Now, in that special case (1) reduces to:
3222 *
3223 * tg->weight * grq->load.weight
17de4ee0 3224 * ge->load.weight = ----------------------------- = tg->weight (4)
08f7c2f4 3225 * grp->load.weight
cef27403
PZ
3226 *
3227 * That is, the sum collapses because all other CPUs are idle; the UP scenario.
3228 *
3229 * So what we do is modify our approximation (3) to approach (4) in the (near)
3230 * UP case, like:
3231 *
3232 * ge->load.weight =
3233 *
3234 * tg->weight * grq->load.weight
3235 * --------------------------------------------------- (5)
3236 * tg->load_avg - grq->avg.load_avg + grq->load.weight
3237 *
17de4ee0
PZ
3238 * But because grq->load.weight can drop to 0, resulting in a divide by zero,
3239 * we need to use grq->avg.load_avg as its lower bound, which then gives:
3240 *
3241 *
3242 * tg->weight * grq->load.weight
3243 * ge->load.weight = ----------------------------- (6)
08f7c2f4 3244 * tg_load_avg'
17de4ee0
PZ
3245 *
3246 * Where:
3247 *
3248 * tg_load_avg' = tg->load_avg - grq->avg.load_avg +
3249 * max(grq->load.weight, grq->avg.load_avg)
cef27403
PZ
3250 *
3251 * And that is shares_weight and is icky. In the (near) UP case it approaches
3252 * (4) while in the normal case it approaches (3). It consistently
3253 * overestimates the ge->load.weight and therefore:
3254 *
3255 * \Sum ge->load.weight >= tg->weight
3256 *
3257 * hence icky!
3258 */
2c8e4dce 3259static long calc_group_shares(struct cfs_rq *cfs_rq)
cf5f0acf 3260{
7c80cfc9
PZ
3261 long tg_weight, tg_shares, load, shares;
3262 struct task_group *tg = cfs_rq->tg;
3263
3264 tg_shares = READ_ONCE(tg->shares);
cf5f0acf 3265
3d4b60d3 3266 load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
cf5f0acf 3267
ea1dc6fc 3268 tg_weight = atomic_long_read(&tg->load_avg);
3ff6dcac 3269
ea1dc6fc
PZ
3270 /* Ensure tg_weight >= load */
3271 tg_weight -= cfs_rq->tg_load_avg_contrib;
3272 tg_weight += load;
3ff6dcac 3273
7c80cfc9 3274 shares = (tg_shares * load);
cf5f0acf
PZ
3275 if (tg_weight)
3276 shares /= tg_weight;
3ff6dcac 3277
b8fd8423
DE
3278 /*
3279 * MIN_SHARES has to be unscaled here to support per-CPU partitioning
3280 * of a group with small tg->shares value. It is a floor value which is
3281 * assigned as a minimum load.weight to the sched_entity representing
3282 * the group on a CPU.
3283 *
3284 * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
3285 * on an 8-core system with 8 tasks each runnable on one CPU shares has
3286 * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
3287 * case no task is runnable on a CPU MIN_SHARES=2 should be returned
3288 * instead of 0.
3289 */
7c80cfc9 3290 return clamp_t(long, shares, MIN_SHARES, tg_shares);
3ff6dcac 3291}
387f77cc 3292#endif /* CONFIG_SMP */
ea1dc6fc 3293
1ea6c46a
PZ
3294/*
3295 * Recomputes the group entity based on the current state of its group
3296 * runqueue.
3297 */
3298static void update_cfs_group(struct sched_entity *se)
2069dd75 3299{
1ea6c46a 3300 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
0dacee1b 3301 long shares;
2069dd75 3302
1ea6c46a 3303 if (!gcfs_rq)
89ee048f
VG
3304 return;
3305
1ea6c46a 3306 if (throttled_hierarchy(gcfs_rq))
2069dd75 3307 return;
89ee048f 3308
3ff6dcac 3309#ifndef CONFIG_SMP
0dacee1b 3310 shares = READ_ONCE(gcfs_rq->tg->shares);
7c80cfc9
PZ
3311
3312 if (likely(se->load.weight == shares))
3ff6dcac 3313 return;
7c80cfc9 3314#else
2c8e4dce 3315 shares = calc_group_shares(gcfs_rq);
3ff6dcac 3316#endif
2069dd75 3317
0dacee1b 3318 reweight_entity(cfs_rq_of(se), se, shares);
2069dd75 3319}
89ee048f 3320
2069dd75 3321#else /* CONFIG_FAIR_GROUP_SCHED */
1ea6c46a 3322static inline void update_cfs_group(struct sched_entity *se)
2069dd75
PZ
3323{
3324}
3325#endif /* CONFIG_FAIR_GROUP_SCHED */
3326
ea14b57e 3327static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
a030d738 3328{
43964409
LT
3329 struct rq *rq = rq_of(cfs_rq);
3330
a4f9a0e5 3331 if (&rq->cfs == cfs_rq) {
a030d738
VK
3332 /*
3333 * There are a few boundary cases this might miss but it should
3334 * get called often enough that that should (hopefully) not be
9783be2c 3335 * a real problem.
a030d738
VK
3336 *
3337 * It will not get called when we go idle, because the idle
3338 * thread is a different class (!fair), nor will the utilization
3339 * number include things like RT tasks.
3340 *
3341 * As is, the util number is not freq-invariant (we'd have to
3342 * implement arch_scale_freq_capacity() for that).
3343 *
82762d2a 3344 * See cpu_util_cfs().
a030d738 3345 */
ea14b57e 3346 cpufreq_update_util(rq, flags);
a030d738
VK
3347 }
3348}
3349
141965c7 3350#ifdef CONFIG_SMP
c566e8e9 3351#ifdef CONFIG_FAIR_GROUP_SCHED
fdaba61e
RR
3352/*
3353 * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list
3354 * immediately before a parent cfs_rq, and cfs_rqs are removed from the list
3355 * bottom-up, we only have to test whether the cfs_rq before us on the list
3356 * is our child.
3357 * If cfs_rq is not on the list, test whether a child needs its to be added to
3358 * connect a branch to the tree * (see list_add_leaf_cfs_rq() for details).
3359 */
3360static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq)
3361{
3362 struct cfs_rq *prev_cfs_rq;
3363 struct list_head *prev;
3364
3365 if (cfs_rq->on_list) {
3366 prev = cfs_rq->leaf_cfs_rq_list.prev;
3367 } else {
3368 struct rq *rq = rq_of(cfs_rq);
3369
3370 prev = rq->tmp_alone_branch;
3371 }
3372
3373 prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list);
3374
3375 return (prev_cfs_rq->tg->parent == cfs_rq->tg);
3376}
a7b359fc
OU
3377
3378static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
3379{
3380 if (cfs_rq->load.weight)
3381 return false;
3382
3383 if (cfs_rq->avg.load_sum)
3384 return false;
3385
3386 if (cfs_rq->avg.util_sum)
3387 return false;
3388
3389 if (cfs_rq->avg.runnable_sum)
3390 return false;
3391
fdaba61e
RR
3392 if (child_cfs_rq_on_list(cfs_rq))
3393 return false;
3394
b2c0931a
IM
3395 /*
3396 * _avg must be null when _sum are null because _avg = _sum / divider
3397 * Make sure that rounding and/or propagation of PELT values never
3398 * break this.
3399 */
3400 SCHED_WARN_ON(cfs_rq->avg.load_avg ||
3401 cfs_rq->avg.util_avg ||
3402 cfs_rq->avg.runnable_avg);
3403
a7b359fc
OU
3404 return true;
3405}
3406
7c3edd2c
PZ
3407/**
3408 * update_tg_load_avg - update the tg's load avg
3409 * @cfs_rq: the cfs_rq whose avg changed
7c3edd2c
PZ
3410 *
3411 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
3412 * However, because tg->load_avg is a global value there are performance
3413 * considerations.
3414 *
3415 * In order to avoid having to look at the other cfs_rq's, we use a
3416 * differential update where we store the last value we propagated. This in
3417 * turn allows skipping updates if the differential is 'small'.
3418 *
815abf5a 3419 * Updating tg's load_avg is necessary before update_cfs_share().
bb17f655 3420 */
fe749158 3421static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
bb17f655 3422{
9d89c257 3423 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
bb17f655 3424
aa0b7ae0
WL
3425 /*
3426 * No need to update load_avg for root_task_group as it is not used.
3427 */
3428 if (cfs_rq->tg == &root_task_group)
3429 return;
3430
fe749158 3431 if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
9d89c257
YD
3432 atomic_long_add(delta, &cfs_rq->tg->load_avg);
3433 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
bb17f655 3434 }
8165e145 3435}
f5f9739d 3436
ad936d86 3437/*
97fb7a0a 3438 * Called within set_task_rq() right before setting a task's CPU. The
ad936d86
BP
3439 * caller only guarantees p->pi_lock is held; no other assumptions,
3440 * including the state of rq->lock, should be made.
3441 */
3442void set_task_rq_fair(struct sched_entity *se,
3443 struct cfs_rq *prev, struct cfs_rq *next)
3444{
0ccb977f
PZ
3445 u64 p_last_update_time;
3446 u64 n_last_update_time;
3447
ad936d86
BP
3448 if (!sched_feat(ATTACH_AGE_LOAD))
3449 return;
3450
3451 /*
3452 * We are supposed to update the task to "current" time, then its up to
3453 * date and ready to go to new CPU/cfs_rq. But we have difficulty in
3454 * getting what current time is, so simply throw away the out-of-date
3455 * time. This will result in the wakee task is less decayed, but giving
3456 * the wakee more load sounds not bad.
3457 */
0ccb977f
PZ
3458 if (!(se->avg.last_update_time && prev))
3459 return;
ad936d86
BP
3460
3461#ifndef CONFIG_64BIT
0ccb977f 3462 {
ad936d86
BP
3463 u64 p_last_update_time_copy;
3464 u64 n_last_update_time_copy;
3465
3466 do {
3467 p_last_update_time_copy = prev->load_last_update_time_copy;
3468 n_last_update_time_copy = next->load_last_update_time_copy;
3469
3470 smp_rmb();
3471
3472 p_last_update_time = prev->avg.last_update_time;
3473 n_last_update_time = next->avg.last_update_time;
3474
3475 } while (p_last_update_time != p_last_update_time_copy ||
3476 n_last_update_time != n_last_update_time_copy);
0ccb977f 3477 }
ad936d86 3478#else
0ccb977f
PZ
3479 p_last_update_time = prev->avg.last_update_time;
3480 n_last_update_time = next->avg.last_update_time;
ad936d86 3481#endif
23127296 3482 __update_load_avg_blocked_se(p_last_update_time, se);
0ccb977f 3483 se->avg.last_update_time = n_last_update_time;
ad936d86 3484}
09a43ace 3485
0e2d2aaa
PZ
3486/*
3487 * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
3488 * propagate its contribution. The key to this propagation is the invariant
3489 * that for each group:
3490 *
3491 * ge->avg == grq->avg (1)
3492 *
3493 * _IFF_ we look at the pure running and runnable sums. Because they
3494 * represent the very same entity, just at different points in the hierarchy.
3495 *
9f683953
VG
3496 * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial
3497 * and simply copies the running/runnable sum over (but still wrong, because
3498 * the group entity and group rq do not have their PELT windows aligned).
0e2d2aaa 3499 *
0dacee1b 3500 * However, update_tg_cfs_load() is more complex. So we have:
0e2d2aaa
PZ
3501 *
3502 * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
3503 *
3504 * And since, like util, the runnable part should be directly transferable,
3505 * the following would _appear_ to be the straight forward approach:
3506 *
a4c3c049 3507 * grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3)
0e2d2aaa
PZ
3508 *
3509 * And per (1) we have:
3510 *
a4c3c049 3511 * ge->avg.runnable_avg == grq->avg.runnable_avg
0e2d2aaa
PZ
3512 *
3513 * Which gives:
3514 *
3515 * ge->load.weight * grq->avg.load_avg
3516 * ge->avg.load_avg = ----------------------------------- (4)
3517 * grq->load.weight
3518 *
3519 * Except that is wrong!
3520 *
3521 * Because while for entities historical weight is not important and we
3522 * really only care about our future and therefore can consider a pure
3523 * runnable sum, runqueues can NOT do this.
3524 *
3525 * We specifically want runqueues to have a load_avg that includes
3526 * historical weights. Those represent the blocked load, the load we expect
3527 * to (shortly) return to us. This only works by keeping the weights as
3528 * integral part of the sum. We therefore cannot decompose as per (3).
3529 *
a4c3c049
VG
3530 * Another reason this doesn't work is that runnable isn't a 0-sum entity.
3531 * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
3532 * rq itself is runnable anywhere between 2/3 and 1 depending on how the
3533 * runnable section of these tasks overlap (or not). If they were to perfectly
3534 * align the rq as a whole would be runnable 2/3 of the time. If however we
3535 * always have at least 1 runnable task, the rq as a whole is always runnable.
0e2d2aaa 3536 *
a4c3c049 3537 * So we'll have to approximate.. :/
0e2d2aaa 3538 *
a4c3c049 3539 * Given the constraint:
0e2d2aaa 3540 *
a4c3c049 3541 * ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
0e2d2aaa 3542 *
a4c3c049
VG
3543 * We can construct a rule that adds runnable to a rq by assuming minimal
3544 * overlap.
0e2d2aaa 3545 *
a4c3c049 3546 * On removal, we'll assume each task is equally runnable; which yields:
0e2d2aaa 3547 *
a4c3c049 3548 * grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
0e2d2aaa 3549 *
a4c3c049 3550 * XXX: only do this for the part of runnable > running ?
0e2d2aaa 3551 *
0e2d2aaa 3552 */
09a43ace 3553static inline void
0e2d2aaa 3554update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
09a43ace 3555{
7ceb7710
VG
3556 long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg;
3557 u32 new_sum, divider;
09a43ace
VG
3558
3559 /* Nothing to update */
7ceb7710 3560 if (!delta_avg)
09a43ace
VG
3561 return;
3562
87e867b4
VG
3563 /*
3564 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3565 * See ___update_load_avg() for details.
3566 */
3567 divider = get_pelt_divider(&cfs_rq->avg);
3568
7ceb7710 3569
09a43ace
VG
3570 /* Set new sched_entity's utilization */
3571 se->avg.util_avg = gcfs_rq->avg.util_avg;
7ceb7710
VG
3572 new_sum = se->avg.util_avg * divider;
3573 delta_sum = (long)new_sum - (long)se->avg.util_sum;
3574 se->avg.util_sum = new_sum;
09a43ace
VG
3575
3576 /* Update parent cfs_rq utilization */
7ceb7710
VG
3577 add_positive(&cfs_rq->avg.util_avg, delta_avg);
3578 add_positive(&cfs_rq->avg.util_sum, delta_sum);
3579
3580 /* See update_cfs_rq_load_avg() */
3581 cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
3582 cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
09a43ace
VG
3583}
3584
9f683953
VG
3585static inline void
3586update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3587{
95246d1e
VG
3588 long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
3589 u32 new_sum, divider;
9f683953
VG
3590
3591 /* Nothing to update */
95246d1e 3592 if (!delta_avg)
9f683953
VG
3593 return;
3594
87e867b4
VG
3595 /*
3596 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3597 * See ___update_load_avg() for details.
3598 */
3599 divider = get_pelt_divider(&cfs_rq->avg);
3600
9f683953
VG
3601 /* Set new sched_entity's runnable */
3602 se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
95246d1e
VG
3603 new_sum = se->avg.runnable_avg * divider;
3604 delta_sum = (long)new_sum - (long)se->avg.runnable_sum;
3605 se->avg.runnable_sum = new_sum;
9f683953
VG
3606
3607 /* Update parent cfs_rq runnable */
95246d1e
VG
3608 add_positive(&cfs_rq->avg.runnable_avg, delta_avg);
3609 add_positive(&cfs_rq->avg.runnable_sum, delta_sum);
3610 /* See update_cfs_rq_load_avg() */
3611 cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
3612 cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
9f683953
VG
3613}
3614
09a43ace 3615static inline void
0dacee1b 3616update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
09a43ace 3617{
2d02fa8c 3618 long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
0dacee1b
VG
3619 unsigned long load_avg;
3620 u64 load_sum = 0;
2d02fa8c 3621 s64 delta_sum;
95d68593 3622 u32 divider;
09a43ace 3623
0e2d2aaa
PZ
3624 if (!runnable_sum)
3625 return;
09a43ace 3626
0e2d2aaa 3627 gcfs_rq->prop_runnable_sum = 0;
09a43ace 3628
95d68593
VG
3629 /*
3630 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3631 * See ___update_load_avg() for details.
3632 */
87e867b4 3633 divider = get_pelt_divider(&cfs_rq->avg);
95d68593 3634
a4c3c049
VG
3635 if (runnable_sum >= 0) {
3636 /*
3637 * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
3638 * the CPU is saturated running == runnable.
3639 */
3640 runnable_sum += se->avg.load_sum;
95d68593 3641 runnable_sum = min_t(long, runnable_sum, divider);
a4c3c049
VG
3642 } else {
3643 /*
3644 * Estimate the new unweighted runnable_sum of the gcfs_rq by
3645 * assuming all tasks are equally runnable.
3646 */
3647 if (scale_load_down(gcfs_rq->load.weight)) {
2d02fa8c 3648 load_sum = div_u64(gcfs_rq->avg.load_sum,
a4c3c049
VG
3649 scale_load_down(gcfs_rq->load.weight));
3650 }
3651
3652 /* But make sure to not inflate se's runnable */
3653 runnable_sum = min(se->avg.load_sum, load_sum);
3654 }
3655
3656 /*
3657 * runnable_sum can't be lower than running_sum
23127296
VG
3658 * Rescale running sum to be in the same range as runnable sum
3659 * running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT]
3660 * runnable_sum is in [0 : LOAD_AVG_MAX]
a4c3c049 3661 */
23127296 3662 running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
a4c3c049
VG
3663 runnable_sum = max(runnable_sum, running_sum);
3664
2d02fa8c
VG
3665 load_sum = se_weight(se) * runnable_sum;
3666 load_avg = div_u64(load_sum, divider);
83c5e9d5 3667
2d02fa8c
VG
3668 delta_avg = load_avg - se->avg.load_avg;
3669 if (!delta_avg)
83c5e9d5 3670 return;
09a43ace 3671
2d02fa8c 3672 delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
7c7ad626 3673
2d02fa8c
VG
3674 se->avg.load_sum = runnable_sum;
3675 se->avg.load_avg = load_avg;
3676 add_positive(&cfs_rq->avg.load_avg, delta_avg);
3677 add_positive(&cfs_rq->avg.load_sum, delta_sum);
3678 /* See update_cfs_rq_load_avg() */
3679 cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
3680 cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
09a43ace
VG
3681}
3682
0e2d2aaa 3683static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
09a43ace 3684{
0e2d2aaa
PZ
3685 cfs_rq->propagate = 1;
3686 cfs_rq->prop_runnable_sum += runnable_sum;
09a43ace
VG
3687}
3688
3689/* Update task and its cfs_rq load average */
3690static inline int propagate_entity_load_avg(struct sched_entity *se)
3691{
0e2d2aaa 3692 struct cfs_rq *cfs_rq, *gcfs_rq;
09a43ace
VG
3693
3694 if (entity_is_task(se))
3695 return 0;
3696
0e2d2aaa
PZ
3697 gcfs_rq = group_cfs_rq(se);
3698 if (!gcfs_rq->propagate)
09a43ace
VG
3699 return 0;
3700
0e2d2aaa
PZ
3701 gcfs_rq->propagate = 0;
3702
09a43ace
VG
3703 cfs_rq = cfs_rq_of(se);
3704
0e2d2aaa 3705 add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
09a43ace 3706
0e2d2aaa 3707 update_tg_cfs_util(cfs_rq, se, gcfs_rq);
9f683953 3708 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
0dacee1b 3709 update_tg_cfs_load(cfs_rq, se, gcfs_rq);
09a43ace 3710
ba19f51f 3711 trace_pelt_cfs_tp(cfs_rq);
8de6242c 3712 trace_pelt_se_tp(se);
ba19f51f 3713
09a43ace
VG
3714 return 1;
3715}
3716
bc427898
VG
3717/*
3718 * Check if we need to update the load and the utilization of a blocked
3719 * group_entity:
3720 */
3721static inline bool skip_blocked_update(struct sched_entity *se)
3722{
3723 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3724
3725 /*
3726 * If sched_entity still have not zero load or utilization, we have to
3727 * decay it:
3728 */
3729 if (se->avg.load_avg || se->avg.util_avg)
3730 return false;
3731
3732 /*
3733 * If there is a pending propagation, we have to update the load and
3734 * the utilization of the sched_entity:
3735 */
0e2d2aaa 3736 if (gcfs_rq->propagate)
bc427898
VG
3737 return false;
3738
3739 /*
3740 * Otherwise, the load and the utilization of the sched_entity is
3741 * already zero and there is no pending propagation, so it will be a
3742 * waste of time to try to decay it:
3743 */
3744 return true;
3745}
3746
6e83125c 3747#else /* CONFIG_FAIR_GROUP_SCHED */
09a43ace 3748
fe749158 3749static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
09a43ace
VG
3750
3751static inline int propagate_entity_load_avg(struct sched_entity *se)
3752{
3753 return 0;
3754}
3755
0e2d2aaa 3756static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
09a43ace 3757
6e83125c 3758#endif /* CONFIG_FAIR_GROUP_SCHED */
c566e8e9 3759
3d30544f
PZ
3760/**
3761 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
23127296 3762 * @now: current time, as per cfs_rq_clock_pelt()
3d30544f 3763 * @cfs_rq: cfs_rq to update
3d30544f
PZ
3764 *
3765 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
3766 * avg. The immediate corollary is that all (fair) tasks must be attached, see
3767 * post_init_entity_util_avg().
3768 *
3769 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
3770 *
a315da5e 3771 * Return: true if the load decayed or we removed load.
7c3edd2c
PZ
3772 *
3773 * Since both these conditions indicate a changed cfs_rq->avg.load we should
3774 * call update_tg_load_avg() when this function returns true.
3d30544f 3775 */
a2c6c91f 3776static inline int
3a123bbb 3777update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
2dac754e 3778{
9f683953 3779 unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
9d89c257 3780 struct sched_avg *sa = &cfs_rq->avg;
2a2f5d4e 3781 int decayed = 0;
2dac754e 3782
2a2f5d4e
PZ
3783 if (cfs_rq->removed.nr) {
3784 unsigned long r;
87e867b4 3785 u32 divider = get_pelt_divider(&cfs_rq->avg);
2a2f5d4e
PZ
3786
3787 raw_spin_lock(&cfs_rq->removed.lock);
3788 swap(cfs_rq->removed.util_avg, removed_util);
3789 swap(cfs_rq->removed.load_avg, removed_load);
9f683953 3790 swap(cfs_rq->removed.runnable_avg, removed_runnable);
2a2f5d4e
PZ
3791 cfs_rq->removed.nr = 0;
3792 raw_spin_unlock(&cfs_rq->removed.lock);
3793
2a2f5d4e 3794 r = removed_load;
89741892 3795 sub_positive(&sa->load_avg, r);
2d02fa8c
VG
3796 sub_positive(&sa->load_sum, r * divider);
3797 /* See sa->util_sum below */
3798 sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);
2dac754e 3799
2a2f5d4e 3800 r = removed_util;
89741892 3801 sub_positive(&sa->util_avg, r);
98b0d890
VG
3802 sub_positive(&sa->util_sum, r * divider);
3803 /*
3804 * Because of rounding, se->util_sum might ends up being +1 more than
3805 * cfs->util_sum. Although this is not a problem by itself, detaching
3806 * a lot of tasks with the rounding problem between 2 updates of
3807 * util_avg (~1ms) can make cfs->util_sum becoming null whereas
3808 * cfs_util_avg is not.
3809 * Check that util_sum is still above its lower bound for the new
3810 * util_avg. Given that period_contrib might have moved since the last
3811 * sync, we are only sure that util_sum must be above or equal to
3812 * util_avg * minimum possible divider
3813 */
3814 sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
2a2f5d4e 3815
9f683953
VG
3816 r = removed_runnable;
3817 sub_positive(&sa->runnable_avg, r);
95246d1e
VG
3818 sub_positive(&sa->runnable_sum, r * divider);
3819 /* See sa->util_sum above */
3820 sa->runnable_sum = max_t(u32, sa->runnable_sum,
3821 sa->runnable_avg * PELT_MIN_DIVIDER);
9f683953
VG
3822
3823 /*
3824 * removed_runnable is the unweighted version of removed_load so we
3825 * can use it to estimate removed_load_sum.
3826 */
3827 add_tg_cfs_propagate(cfs_rq,
3828 -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);
2a2f5d4e
PZ
3829
3830 decayed = 1;
9d89c257 3831 }
36ee28e4 3832
23127296 3833 decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
36ee28e4 3834
9d89c257
YD
3835#ifndef CONFIG_64BIT
3836 smp_wmb();
3837 cfs_rq->load_last_update_time_copy = sa->last_update_time;
3838#endif
36ee28e4 3839
2a2f5d4e 3840 return decayed;
21e96f88
SM
3841}
3842
3d30544f
PZ
3843/**
3844 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
3845 * @cfs_rq: cfs_rq to attach to
3846 * @se: sched_entity to attach
3847 *
3848 * Must call update_cfs_rq_load_avg() before this, since we rely on
3849 * cfs_rq->avg.last_update_time being current.
3850 */
a4f9a0e5 3851static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
a05e8c51 3852{
95d68593
VG
3853 /*
3854 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3855 * See ___update_load_avg() for details.
3856 */
87e867b4 3857 u32 divider = get_pelt_divider(&cfs_rq->avg);
f207934f
PZ
3858
3859 /*
3860 * When we attach the @se to the @cfs_rq, we must align the decay
3861 * window because without that, really weird and wonderful things can
3862 * happen.
3863 *
3864 * XXX illustrate
3865 */
a05e8c51 3866 se->avg.last_update_time = cfs_rq->avg.last_update_time;
f207934f
PZ
3867 se->avg.period_contrib = cfs_rq->avg.period_contrib;
3868
3869 /*
3870 * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
3871 * period_contrib. This isn't strictly correct, but since we're
3872 * entirely outside of the PELT hierarchy, nobody cares if we truncate
3873 * _sum a little.
3874 */
3875 se->avg.util_sum = se->avg.util_avg * divider;
3876
9f683953
VG
3877 se->avg.runnable_sum = se->avg.runnable_avg * divider;
3878
40f5aa4c 3879 se->avg.load_sum = se->avg.load_avg * divider;
3880 if (se_weight(se) < se->avg.load_sum)
3881 se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se));
3882 else
3883 se->avg.load_sum = 1;
f207934f 3884
8d5b9025 3885 enqueue_load_avg(cfs_rq, se);
a05e8c51
BP
3886 cfs_rq->avg.util_avg += se->avg.util_avg;
3887 cfs_rq->avg.util_sum += se->avg.util_sum;
9f683953
VG
3888 cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
3889 cfs_rq->avg.runnable_sum += se->avg.runnable_sum;
0e2d2aaa
PZ
3890
3891 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
a2c6c91f 3892
a4f9a0e5 3893 cfs_rq_util_change(cfs_rq, 0);
ba19f51f
QY
3894
3895 trace_pelt_cfs_tp(cfs_rq);
a05e8c51
BP
3896}
3897
3d30544f
PZ
3898/**
3899 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
3900 * @cfs_rq: cfs_rq to detach from
3901 * @se: sched_entity to detach
3902 *
3903 * Must call update_cfs_rq_load_avg() before this, since we rely on
3904 * cfs_rq->avg.last_update_time being current.
3905 */
a05e8c51
BP
3906static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3907{
8d5b9025 3908 dequeue_load_avg(cfs_rq, se);
89741892 3909 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
7ceb7710
VG
3910 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3911 /* See update_cfs_rq_load_avg() */
3912 cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
3913 cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
3914
9f683953 3915 sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
95246d1e
VG
3916 sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
3917 /* See update_cfs_rq_load_avg() */
3918 cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
3919 cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
0e2d2aaa
PZ
3920
3921 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
a2c6c91f 3922
ea14b57e 3923 cfs_rq_util_change(cfs_rq, 0);
ba19f51f
QY
3924
3925 trace_pelt_cfs_tp(cfs_rq);
a05e8c51
BP
3926}
3927
b382a531
PZ
3928/*
3929 * Optional action to be done while updating the load average
3930 */
3931#define UPDATE_TG 0x1
3932#define SKIP_AGE_LOAD 0x2
3933#define DO_ATTACH 0x4
3934
3935/* Update task and its cfs_rq load average */
3936static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3937{
23127296 3938 u64 now = cfs_rq_clock_pelt(cfs_rq);
b382a531
PZ
3939 int decayed;
3940
3941 /*
3942 * Track task load average for carrying it to new CPU after migrated, and
3943 * track group sched_entity load average for task_h_load calc in migration
3944 */
3945 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
23127296 3946 __update_load_avg_se(now, cfs_rq, se);
b382a531
PZ
3947
3948 decayed = update_cfs_rq_load_avg(now, cfs_rq);
3949 decayed |= propagate_entity_load_avg(se);
3950
3951 if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
3952
ea14b57e
PZ
3953 /*
3954 * DO_ATTACH means we're here from enqueue_entity().
3955 * !last_update_time means we've passed through
3956 * migrate_task_rq_fair() indicating we migrated.
3957 *
3958 * IOW we're enqueueing a task on a new CPU.
3959 */
a4f9a0e5 3960 attach_entity_load_avg(cfs_rq, se);
fe749158 3961 update_tg_load_avg(cfs_rq);
b382a531 3962
bef69dd8
VG
3963 } else if (decayed) {
3964 cfs_rq_util_change(cfs_rq, 0);
3965
3966 if (flags & UPDATE_TG)
fe749158 3967 update_tg_load_avg(cfs_rq);
bef69dd8 3968 }
b382a531
PZ
3969}
3970
9d89c257 3971#ifndef CONFIG_64BIT
0905f04e
YD
3972static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3973{
9d89c257 3974 u64 last_update_time_copy;
0905f04e 3975 u64 last_update_time;
9ee474f5 3976
9d89c257
YD
3977 do {
3978 last_update_time_copy = cfs_rq->load_last_update_time_copy;
3979 smp_rmb();
3980 last_update_time = cfs_rq->avg.last_update_time;
3981 } while (last_update_time != last_update_time_copy);
0905f04e
YD
3982
3983 return last_update_time;
3984}
9d89c257 3985#else
0905f04e
YD
3986static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3987{
3988 return cfs_rq->avg.last_update_time;
3989}
9d89c257
YD
3990#endif
3991
104cb16d
MR
3992/*
3993 * Synchronize entity load avg of dequeued entity without locking
3994 * the previous rq.
3995 */
71b47eaf 3996static void sync_entity_load_avg(struct sched_entity *se)
104cb16d
MR
3997{
3998 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3999 u64 last_update_time;
4000
4001 last_update_time = cfs_rq_last_update_time(cfs_rq);
23127296 4002 __update_load_avg_blocked_se(last_update_time, se);
104cb16d
MR
4003}
4004
0905f04e
YD
4005/*
4006 * Task first catches up with cfs_rq, and then subtract
4007 * itself from the cfs_rq (task must be off the queue now).
4008 */
71b47eaf 4009static void remove_entity_load_avg(struct sched_entity *se)
0905f04e
YD
4010{
4011 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2a2f5d4e 4012 unsigned long flags;
0905f04e
YD
4013
4014 /*
7dc603c9
PZ
4015 * tasks cannot exit without having gone through wake_up_new_task() ->
4016 * post_init_entity_util_avg() which will have added things to the
4017 * cfs_rq, so we can remove unconditionally.
0905f04e 4018 */
0905f04e 4019
104cb16d 4020 sync_entity_load_avg(se);
2a2f5d4e
PZ
4021
4022 raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
4023 ++cfs_rq->removed.nr;
4024 cfs_rq->removed.util_avg += se->avg.util_avg;
4025 cfs_rq->removed.load_avg += se->avg.load_avg;
9f683953 4026 cfs_rq->removed.runnable_avg += se->avg.runnable_avg;
2a2f5d4e 4027 raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
2dac754e 4028}
642dbc39 4029
9f683953
VG
4030static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)
4031{
4032 return cfs_rq->avg.runnable_avg;
4033}
4034
7ea241af
YD
4035static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
4036{
4037 return cfs_rq->avg.load_avg;
4038}
4039
d91cecc1
CY
4040static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
4041
7f65ea42
PB
4042static inline unsigned long task_util(struct task_struct *p)
4043{
4044 return READ_ONCE(p->se.avg.util_avg);
4045}
4046
4047static inline unsigned long _task_util_est(struct task_struct *p)
4048{
4049 struct util_est ue = READ_ONCE(p->se.avg.util_est);
4050
68d7a190 4051 return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
7f65ea42
PB
4052}
4053
4054static inline unsigned long task_util_est(struct task_struct *p)
4055{
4056 return max(task_util(p), _task_util_est(p));
4057}
4058
a7008c07
VS
4059#ifdef CONFIG_UCLAMP_TASK
4060static inline unsigned long uclamp_task_util(struct task_struct *p)
4061{
4062 return clamp(task_util_est(p),
4063 uclamp_eff_value(p, UCLAMP_MIN),
4064 uclamp_eff_value(p, UCLAMP_MAX));
4065}
4066#else
4067static inline unsigned long uclamp_task_util(struct task_struct *p)
4068{
4069 return task_util_est(p);
4070}
4071#endif
4072
7f65ea42
PB
4073static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
4074 struct task_struct *p)
4075{
4076 unsigned int enqueued;
4077
4078 if (!sched_feat(UTIL_EST))
4079 return;
4080
4081 /* Update root cfs_rq's estimated utilization */
4082 enqueued = cfs_rq->avg.util_est.enqueued;
92a801e5 4083 enqueued += _task_util_est(p);
7f65ea42 4084 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
4581bea8
VD
4085
4086 trace_sched_util_est_cfs_tp(cfs_rq);
7f65ea42
PB
4087}
4088
8c1f560c
XY
4089static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
4090 struct task_struct *p)
4091{
4092 unsigned int enqueued;
4093
4094 if (!sched_feat(UTIL_EST))
4095 return;
4096
4097 /* Update root cfs_rq's estimated utilization */
4098 enqueued = cfs_rq->avg.util_est.enqueued;
4099 enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
4100 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
4101
4102 trace_sched_util_est_cfs_tp(cfs_rq);
4103}
4104
b89997aa
VD
4105#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
4106
7f65ea42
PB
4107/*
4108 * Check if a (signed) value is within a specified (unsigned) margin,
4109 * based on the observation that:
4110 *
4111 * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
4112 *
3b03706f 4113 * NOTE: this only works when value + margin < INT_MAX.
7f65ea42
PB
4114 */
4115static inline bool within_margin(int value, int margin)
4116{
4117 return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
4118}
4119
8c1f560c
XY
4120static inline void util_est_update(struct cfs_rq *cfs_rq,
4121 struct task_struct *p,
4122 bool task_sleep)
7f65ea42 4123{
b89997aa 4124 long last_ewma_diff, last_enqueued_diff;
7f65ea42
PB
4125 struct util_est ue;
4126
4127 if (!sched_feat(UTIL_EST))
4128 return;
4129
7f65ea42
PB
4130 /*
4131 * Skip update of task's estimated utilization when the task has not
4132 * yet completed an activation, e.g. being migrated.
4133 */
4134 if (!task_sleep)
4135 return;
4136
d519329f
PB
4137 /*
4138 * If the PELT values haven't changed since enqueue time,
4139 * skip the util_est update.
4140 */
4141 ue = p->se.avg.util_est;
4142 if (ue.enqueued & UTIL_AVG_UNCHANGED)
4143 return;
4144
b89997aa
VD
4145 last_enqueued_diff = ue.enqueued;
4146
b8c96361
PB
4147 /*
4148 * Reset EWMA on utilization increases, the moving average is used only
4149 * to smooth utilization decreases.
4150 */
68d7a190 4151 ue.enqueued = task_util(p);
b8c96361
PB
4152 if (sched_feat(UTIL_EST_FASTUP)) {
4153 if (ue.ewma < ue.enqueued) {
4154 ue.ewma = ue.enqueued;
4155 goto done;
4156 }
4157 }
4158
7f65ea42 4159 /*
b89997aa 4160 * Skip update of task's estimated utilization when its members are
7f65ea42
PB
4161 * already ~1% close to its last activation value.
4162 */
7f65ea42 4163 last_ewma_diff = ue.enqueued - ue.ewma;
b89997aa
VD
4164 last_enqueued_diff -= ue.enqueued;
4165 if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
4166 if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
4167 goto done;
4168
7f65ea42 4169 return;
b89997aa 4170 }
7f65ea42 4171
10a35e68
VG
4172 /*
4173 * To avoid overestimation of actual task utilization, skip updates if
4174 * we cannot grant there is idle time in this CPU.
4175 */
8c1f560c 4176 if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
10a35e68
VG
4177 return;
4178
7f65ea42
PB
4179 /*
4180 * Update Task's estimated utilization
4181 *
4182 * When *p completes an activation we can consolidate another sample
4183 * of the task size. This is done by storing the current PELT value
4184 * as ue.enqueued and by using this value to update the Exponential
4185 * Weighted Moving Average (EWMA):
4186 *
4187 * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
4188 * = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
4189 * = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
4190 * = w * ( last_ewma_diff ) + ewma(t-1)
4191 * = w * (last_ewma_diff + ewma(t-1) / w)
4192 *
4193 * Where 'w' is the weight of new samples, which is configured to be
4194 * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
4195 */
4196 ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
4197 ue.ewma += last_ewma_diff;
4198 ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
b8c96361 4199done:
68d7a190 4200 ue.enqueued |= UTIL_AVG_UNCHANGED;
7f65ea42 4201 WRITE_ONCE(p->se.avg.util_est, ue);
4581bea8
VD
4202
4203 trace_sched_util_est_se_tp(&p->se);
7f65ea42
PB
4204}
4205
ef8df979
VD
4206static inline int task_fits_capacity(struct task_struct *p,
4207 unsigned long capacity)
3b1baa64 4208{
a7008c07 4209 return fits_capacity(uclamp_task_util(p), capacity);
3b1baa64
MR
4210}
4211
4212static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
4213{
4214 if (!static_branch_unlikely(&sched_asym_cpucapacity))
4215 return;
4216
0ae78eec 4217 if (!p || p->nr_cpus_allowed == 1) {
3b1baa64
MR
4218 rq->misfit_task_load = 0;
4219 return;
4220 }
4221
4222 if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
4223 rq->misfit_task_load = 0;
4224 return;
4225 }
4226
01cfcde9
VG
4227 /*
4228 * Make sure that misfit_task_load will not be null even if
4229 * task_h_load() returns 0.
4230 */
4231 rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
3b1baa64
MR
4232}
4233
38033c37
PZ
4234#else /* CONFIG_SMP */
4235
a7b359fc
OU
4236static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
4237{
4238 return true;
4239}
4240
d31b1a66
VG
4241#define UPDATE_TG 0x0
4242#define SKIP_AGE_LOAD 0x0
b382a531 4243#define DO_ATTACH 0x0
d31b1a66 4244
88c0616e 4245static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
536bd00c 4246{
ea14b57e 4247 cfs_rq_util_change(cfs_rq, 0);
536bd00c
RW
4248}
4249
9d89c257 4250static inline void remove_entity_load_avg(struct sched_entity *se) {}
6e83125c 4251
a05e8c51 4252static inline void
a4f9a0e5 4253attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
a05e8c51
BP
4254static inline void
4255detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4256
d91cecc1 4257static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
6e83125c
PZ
4258{
4259 return 0;
4260}
4261
7f65ea42
PB
4262static inline void
4263util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
4264
4265static inline void
8c1f560c
XY
4266util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
4267
4268static inline void
4269util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
4270 bool task_sleep) {}
3b1baa64 4271static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
7f65ea42 4272
38033c37 4273#endif /* CONFIG_SMP */
9d85f21c 4274
ddc97297
PZ
4275static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
4276{
4277#ifdef CONFIG_SCHED_DEBUG
4278 s64 d = se->vruntime - cfs_rq->min_vruntime;
4279
4280 if (d < 0)
4281 d = -d;
4282
4283 if (d > 3*sysctl_sched_latency)
ae92882e 4284 schedstat_inc(cfs_rq->nr_spread_over);
ddc97297
PZ
4285#endif
4286}
4287
aeb73b04
PZ
4288static void
4289place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
4290{
1af5f730 4291 u64 vruntime = cfs_rq->min_vruntime;
94dfb5e7 4292
2cb8600e
PZ
4293 /*
4294 * The 'current' period is already promised to the current tasks,
4295 * however the extra weight of the new task will slow them down a
4296 * little, place the new task so that it fits in the slot that
4297 * stays open at the end.
4298 */
94dfb5e7 4299 if (initial && sched_feat(START_DEBIT))
f9c0b095 4300 vruntime += sched_vslice(cfs_rq, se);
aeb73b04 4301
a2e7a7eb 4302 /* sleeps up to a single latency don't count. */
5ca9880c 4303 if (!initial) {
2cae3948
JD
4304 unsigned long thresh;
4305
4306 if (se_is_idle(se))
4307 thresh = sysctl_sched_min_granularity;
4308 else
4309 thresh = sysctl_sched_latency;
a7be37ac 4310
a2e7a7eb
MG
4311 /*
4312 * Halve their sleep time's effect, to allow
4313 * for a gentler effect of sleepers:
4314 */
4315 if (sched_feat(GENTLE_FAIR_SLEEPERS))
4316 thresh >>= 1;
51e0304c 4317
a2e7a7eb 4318 vruntime -= thresh;
aeb73b04
PZ
4319 }
4320
b5d9d734 4321 /* ensure we never gain time by being placed backwards. */
16c8f1c7 4322 se->vruntime = max_vruntime(se->vruntime, vruntime);
aeb73b04
PZ
4323}
4324
d3d9dc33
PT
4325static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
4326
fe61468b 4327static inline bool cfs_bandwidth_used(void);
b5179ac7
PZ
4328
4329/*
4330 * MIGRATION
4331 *
4332 * dequeue
4333 * update_curr()
4334 * update_min_vruntime()
4335 * vruntime -= min_vruntime
4336 *
4337 * enqueue
4338 * update_curr()
4339 * update_min_vruntime()
4340 * vruntime += min_vruntime
4341 *
4342 * this way the vruntime transition between RQs is done when both
4343 * min_vruntime are up-to-date.
4344 *
4345 * WAKEUP (remote)
4346 *
59efa0ba 4347 * ->migrate_task_rq_fair() (p->state == TASK_WAKING)
b5179ac7
PZ
4348 * vruntime -= min_vruntime
4349 *
4350 * enqueue
4351 * update_curr()
4352 * update_min_vruntime()
4353 * vruntime += min_vruntime
4354 *
4355 * this way we don't have the most up-to-date min_vruntime on the originating
4356 * CPU and an up-to-date min_vruntime on the destination CPU.
4357 */
4358
bf0f6f24 4359static void
88ec22d3 4360enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
bf0f6f24 4361{
2f950354
PZ
4362 bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
4363 bool curr = cfs_rq->curr == se;
4364
88ec22d3 4365 /*
2f950354
PZ
4366 * If we're the current task, we must renormalise before calling
4367 * update_curr().
88ec22d3 4368 */
2f950354 4369 if (renorm && curr)
88ec22d3
PZ
4370 se->vruntime += cfs_rq->min_vruntime;
4371
2f950354
PZ
4372 update_curr(cfs_rq);
4373
bf0f6f24 4374 /*
2f950354
PZ
4375 * Otherwise, renormalise after, such that we're placed at the current
4376 * moment in time, instead of some random moment in the past. Being
4377 * placed in the past could significantly boost this task to the
4378 * fairness detriment of existing tasks.
bf0f6f24 4379 */
2f950354
PZ
4380 if (renorm && !curr)
4381 se->vruntime += cfs_rq->min_vruntime;
4382
89ee048f
VG
4383 /*
4384 * When enqueuing a sched_entity, we must:
4385 * - Update loads to have both entity and cfs_rq synced with now.
9f683953 4386 * - Add its load to cfs_rq->runnable_avg
89ee048f
VG
4387 * - For group_entity, update its weight to reflect the new share of
4388 * its group cfs_rq
4389 * - Add its new weight to cfs_rq->load.weight
4390 */
b382a531 4391 update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
9f683953 4392 se_update_runnable(se);
1ea6c46a 4393 update_cfs_group(se);
17bc14b7 4394 account_entity_enqueue(cfs_rq, se);
bf0f6f24 4395
1a3d027c 4396 if (flags & ENQUEUE_WAKEUP)
aeb73b04 4397 place_entity(cfs_rq, se, 0);
bf0f6f24 4398
cb251765 4399 check_schedstat_required();
60f2415e 4400 update_stats_enqueue_fair(cfs_rq, se, flags);
4fa8d299 4401 check_spread(cfs_rq, se);
2f950354 4402 if (!curr)
83b699ed 4403 __enqueue_entity(cfs_rq, se);
2069dd75 4404 se->on_rq = 1;
3d4b47b4 4405
51bf903b 4406 if (cfs_rq->nr_running == 1) {
d3d9dc33 4407 check_enqueue_throttle(cfs_rq);
51bf903b
CZ
4408 if (!throttled_hierarchy(cfs_rq))
4409 list_add_leaf_cfs_rq(cfs_rq);
4410 }
bf0f6f24
IM
4411}
4412
2c13c919 4413static void __clear_buddies_last(struct sched_entity *se)
2002c695 4414{
2c13c919
RR
4415 for_each_sched_entity(se) {
4416 struct cfs_rq *cfs_rq = cfs_rq_of(se);
f1044799 4417 if (cfs_rq->last != se)
2c13c919 4418 break;
f1044799
PZ
4419
4420 cfs_rq->last = NULL;
2c13c919
RR
4421 }
4422}
2002c695 4423
2c13c919
RR
4424static void __clear_buddies_next(struct sched_entity *se)
4425{
4426 for_each_sched_entity(se) {
4427 struct cfs_rq *cfs_rq = cfs_rq_of(se);
f1044799 4428 if (cfs_rq->next != se)
2c13c919 4429 break;
f1044799
PZ
4430
4431 cfs_rq->next = NULL;
2c13c919 4432 }
2002c695
PZ
4433}
4434
ac53db59
RR
4435static void __clear_buddies_skip(struct sched_entity *se)
4436{
4437 for_each_sched_entity(se) {
4438 struct cfs_rq *cfs_rq = cfs_rq_of(se);
f1044799 4439 if (cfs_rq->skip != se)
ac53db59 4440 break;
f1044799
PZ
4441
4442 cfs_rq->skip = NULL;
ac53db59
RR
4443 }
4444}
4445
a571bbea
PZ
4446static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
4447{
2c13c919
RR
4448 if (cfs_rq->last == se)
4449 __clear_buddies_last(se);
4450
4451 if (cfs_rq->next == se)
4452 __clear_buddies_next(se);
ac53db59
RR
4453
4454 if (cfs_rq->skip == se)
4455 __clear_buddies_skip(se);
a571bbea
PZ
4456}
4457
6c16a6dc 4458static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
d8b4986d 4459
bf0f6f24 4460static void
371fd7e7 4461dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
bf0f6f24 4462{
a2a2d680
DA
4463 /*
4464 * Update run-time statistics of the 'current'.
4465 */
4466 update_curr(cfs_rq);
89ee048f
VG
4467
4468 /*
4469 * When dequeuing a sched_entity, we must:
4470 * - Update loads to have both entity and cfs_rq synced with now.
9f683953 4471 * - Subtract its load from the cfs_rq->runnable_avg.
dfcb245e 4472 * - Subtract its previous weight from cfs_rq->load.weight.
89ee048f
VG
4473 * - For group entity, update its weight to reflect the new share
4474 * of its group cfs_rq.
4475 */
88c0616e 4476 update_load_avg(cfs_rq, se, UPDATE_TG);
9f683953 4477 se_update_runnable(se);
a2a2d680 4478
60f2415e 4479 update_stats_dequeue_fair(cfs_rq, se, flags);
67e9fb2a 4480
2002c695 4481 clear_buddies(cfs_rq, se);
4793241b 4482
83b699ed 4483 if (se != cfs_rq->curr)
30cfdcfc 4484 __dequeue_entity(cfs_rq, se);
17bc14b7 4485 se->on_rq = 0;
30cfdcfc 4486 account_entity_dequeue(cfs_rq, se);
88ec22d3
PZ
4487
4488 /*
b60205c7
PZ
4489 * Normalize after update_curr(); which will also have moved
4490 * min_vruntime if @se is the one holding it back. But before doing
4491 * update_min_vruntime() again, which will discount @se's position and
4492 * can move min_vruntime forward still more.
88ec22d3 4493 */
371fd7e7 4494 if (!(flags & DEQUEUE_SLEEP))
88ec22d3 4495 se->vruntime -= cfs_rq->min_vruntime;
1e876231 4496
d8b4986d
PT
4497 /* return excess runtime on last dequeue */
4498 return_cfs_rq_runtime(cfs_rq);
4499
1ea6c46a 4500 update_cfs_group(se);
b60205c7
PZ
4501
4502 /*
4503 * Now advance min_vruntime if @se was the entity holding it back,
4504 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
4505 * put back on, and if we advance min_vruntime, we'll be placed back
4506 * further than we started -- ie. we'll be penalized.
4507 */
9845c49c 4508 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
b60205c7 4509 update_min_vruntime(cfs_rq);
bf0f6f24
IM
4510}
4511
4512/*
4513 * Preempt the current task with a newly woken task if needed:
4514 */
7c92e54f 4515static void
2e09bf55 4516check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
bf0f6f24 4517{
11697830 4518 unsigned long ideal_runtime, delta_exec;
f4cfb33e
WX
4519 struct sched_entity *se;
4520 s64 delta;
11697830 4521
6d0f0ebd 4522 ideal_runtime = sched_slice(cfs_rq, curr);
11697830 4523 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
a9f3e2b5 4524 if (delta_exec > ideal_runtime) {
8875125e 4525 resched_curr(rq_of(cfs_rq));
a9f3e2b5
MG
4526 /*
4527 * The current task ran long enough, ensure it doesn't get
4528 * re-elected due to buddy favours.
4529 */
4530 clear_buddies(cfs_rq, curr);
f685ceac
MG
4531 return;
4532 }
4533
4534 /*
4535 * Ensure that a task that missed wakeup preemption by a
4536 * narrow margin doesn't have to wait for a full slice.
4537 * This also mitigates buddy induced latencies under load.
4538 */
f685ceac
MG
4539 if (delta_exec < sysctl_sched_min_granularity)
4540 return;
4541
f4cfb33e
WX
4542 se = __pick_first_entity(cfs_rq);
4543 delta = curr->vruntime - se->vruntime;
f685ceac 4544
f4cfb33e
WX
4545 if (delta < 0)
4546 return;
d7d82944 4547
f4cfb33e 4548 if (delta > ideal_runtime)
8875125e 4549 resched_curr(rq_of(cfs_rq));
bf0f6f24
IM
4550}
4551
83b699ed 4552static void
8494f412 4553set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24 4554{
21f56ffe
PZ
4555 clear_buddies(cfs_rq, se);
4556
83b699ed
SV
4557 /* 'current' is not kept within the tree. */
4558 if (se->on_rq) {
4559 /*
4560 * Any task has to be enqueued before it get to execute on
4561 * a CPU. So account for the time it spent waiting on the
4562 * runqueue.
4563 */
60f2415e 4564 update_stats_wait_end_fair(cfs_rq, se);
83b699ed 4565 __dequeue_entity(cfs_rq, se);
88c0616e 4566 update_load_avg(cfs_rq, se, UPDATE_TG);
83b699ed
SV
4567 }
4568
79303e9e 4569 update_stats_curr_start(cfs_rq, se);
429d43bc 4570 cfs_rq->curr = se;
4fa8d299 4571
eba1ed4b
IM
4572 /*
4573 * Track our maximum slice length, if the CPU's load is at
4574 * least twice that of our own weight (i.e. dont track it
4575 * when there are only lesser-weight tasks around):
4576 */
f2bedc47
DE
4577 if (schedstat_enabled() &&
4578 rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
ceeadb83
YS
4579 struct sched_statistics *stats;
4580
4581 stats = __schedstats_from_se(se);
4582 __schedstat_set(stats->slice_max,
4583 max((u64)stats->slice_max,
a2dcb276 4584 se->sum_exec_runtime - se->prev_sum_exec_runtime));
eba1ed4b 4585 }
4fa8d299 4586
4a55b450 4587 se->prev_sum_exec_runtime = se->sum_exec_runtime;
bf0f6f24
IM
4588}
4589
3f3a4904
PZ
4590static int
4591wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
4592
ac53db59
RR
4593/*
4594 * Pick the next process, keeping these things in mind, in this order:
4595 * 1) keep things fair between processes/task groups
4596 * 2) pick the "next" process, since someone really wants that to run
4597 * 3) pick the "last" process, for cache locality
4598 * 4) do not run the "skip" process, if something else is available
4599 */
678d5718
PZ
4600static struct sched_entity *
4601pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
aa2ac252 4602{
678d5718
PZ
4603 struct sched_entity *left = __pick_first_entity(cfs_rq);
4604 struct sched_entity *se;
4605
4606 /*
4607 * If curr is set we have to see if its left of the leftmost entity
4608 * still in the tree, provided there was anything in the tree at all.
4609 */
4610 if (!left || (curr && entity_before(curr, left)))
4611 left = curr;
4612
4613 se = left; /* ideally we run the leftmost entity */
f4b6755f 4614
ac53db59
RR
4615 /*
4616 * Avoid running the skip buddy, if running something else can
4617 * be done without getting too unfair.
4618 */
21f56ffe 4619 if (cfs_rq->skip && cfs_rq->skip == se) {
678d5718
PZ
4620 struct sched_entity *second;
4621
4622 if (se == curr) {
4623 second = __pick_first_entity(cfs_rq);
4624 } else {
4625 second = __pick_next_entity(se);
4626 if (!second || (curr && entity_before(curr, second)))
4627 second = curr;
4628 }
4629
ac53db59
RR
4630 if (second && wakeup_preempt_entity(second, left) < 1)
4631 se = second;
4632 }
aa2ac252 4633
9abb8973
PO
4634 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
4635 /*
4636 * Someone really wants this to run. If it's not unfair, run it.
4637 */
ac53db59 4638 se = cfs_rq->next;
9abb8973
PO
4639 } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
4640 /*
4641 * Prefer last buddy, try to return the CPU to a preempted task.
4642 */
4643 se = cfs_rq->last;
4644 }
ac53db59 4645
4793241b 4646 return se;
aa2ac252
PZ
4647}
4648
678d5718 4649static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
d3d9dc33 4650
ab6cde26 4651static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
bf0f6f24
IM
4652{
4653 /*
4654 * If still on the runqueue then deactivate_task()
4655 * was not called and update_curr() has to be done:
4656 */
4657 if (prev->on_rq)
b7cc0896 4658 update_curr(cfs_rq);
bf0f6f24 4659
d3d9dc33
PT
4660 /* throttle cfs_rqs exceeding runtime */
4661 check_cfs_rq_runtime(cfs_rq);
4662
4fa8d299 4663 check_spread(cfs_rq, prev);
cb251765 4664
30cfdcfc 4665 if (prev->on_rq) {
60f2415e 4666 update_stats_wait_start_fair(cfs_rq, prev);
30cfdcfc
DA
4667 /* Put 'current' back into the tree. */
4668 __enqueue_entity(cfs_rq, prev);
9d85f21c 4669 /* in !on_rq case, update occurred at dequeue */
88c0616e 4670 update_load_avg(cfs_rq, prev, 0);
30cfdcfc 4671 }
429d43bc 4672 cfs_rq->curr = NULL;
bf0f6f24
IM
4673}
4674
8f4d37ec
PZ
4675static void
4676entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
bf0f6f24 4677{
bf0f6f24 4678 /*
30cfdcfc 4679 * Update run-time statistics of the 'current'.
bf0f6f24 4680 */
30cfdcfc 4681 update_curr(cfs_rq);
bf0f6f24 4682
9d85f21c
PT
4683 /*
4684 * Ensure that runnable average is periodically updated.
4685 */
88c0616e 4686 update_load_avg(cfs_rq, curr, UPDATE_TG);
1ea6c46a 4687 update_cfs_group(curr);
9d85f21c 4688
8f4d37ec
PZ
4689#ifdef CONFIG_SCHED_HRTICK
4690 /*
4691 * queued ticks are scheduled to match the slice, so don't bother
4692 * validating it and just reschedule.
4693 */
983ed7a6 4694 if (queued) {
8875125e 4695 resched_curr(rq_of(cfs_rq));
983ed7a6
HH
4696 return;
4697 }
8f4d37ec
PZ
4698 /*
4699 * don't let the period tick interfere with the hrtick preemption
4700 */
4701 if (!sched_feat(DOUBLE_TICK) &&
4702 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
4703 return;
4704#endif
4705
2c2efaed 4706 if (cfs_rq->nr_running > 1)
2e09bf55 4707 check_preempt_tick(cfs_rq, curr);
bf0f6f24
IM
4708}
4709
ab84d31e
PT
4710
4711/**************************************************
4712 * CFS bandwidth control machinery
4713 */
4714
4715#ifdef CONFIG_CFS_BANDWIDTH
029632fb 4716
e9666d10 4717#ifdef CONFIG_JUMP_LABEL
c5905afb 4718static struct static_key __cfs_bandwidth_used;
029632fb
PZ
4719
4720static inline bool cfs_bandwidth_used(void)
4721{
c5905afb 4722 return static_key_false(&__cfs_bandwidth_used);
029632fb
PZ
4723}
4724
1ee14e6c 4725void cfs_bandwidth_usage_inc(void)
029632fb 4726{
ce48c146 4727 static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
1ee14e6c
BS
4728}
4729
4730void cfs_bandwidth_usage_dec(void)
4731{
ce48c146 4732 static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
029632fb 4733}
e9666d10 4734#else /* CONFIG_JUMP_LABEL */
029632fb
PZ
4735static bool cfs_bandwidth_used(void)
4736{
4737 return true;
4738}
4739
1ee14e6c
BS
4740void cfs_bandwidth_usage_inc(void) {}
4741void cfs_bandwidth_usage_dec(void) {}
e9666d10 4742#endif /* CONFIG_JUMP_LABEL */
029632fb 4743
ab84d31e
PT
4744/*
4745 * default period for cfs group bandwidth.
4746 * default: 0.1s, units: nanoseconds
4747 */
4748static inline u64 default_cfs_period(void)
4749{
4750 return 100000000ULL;
4751}
ec12cb7f
PT
4752
4753static inline u64 sched_cfs_bandwidth_slice(void)
4754{
4755 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
4756}
4757
a9cf55b2 4758/*
763a9ec0
QC
4759 * Replenish runtime according to assigned quota. We use sched_clock_cpu
4760 * directly instead of rq->clock to avoid adding additional synchronization
4761 * around rq->lock.
a9cf55b2
PT
4762 *
4763 * requires cfs_b->lock
4764 */
029632fb 4765void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
a9cf55b2 4766{
bcb1704a
HC
4767 s64 runtime;
4768
f4183717
HC
4769 if (unlikely(cfs_b->quota == RUNTIME_INF))
4770 return;
4771
4772 cfs_b->runtime += cfs_b->quota;
bcb1704a
HC
4773 runtime = cfs_b->runtime_snap - cfs_b->runtime;
4774 if (runtime > 0) {
4775 cfs_b->burst_time += runtime;
4776 cfs_b->nr_burst++;
4777 }
4778
f4183717 4779 cfs_b->runtime = min(cfs_b->runtime, cfs_b->quota + cfs_b->burst);
bcb1704a 4780 cfs_b->runtime_snap = cfs_b->runtime;
a9cf55b2
PT
4781}
4782
029632fb
PZ
4783static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4784{
4785 return &tg->cfs_bandwidth;
4786}
4787
85dac906 4788/* returns 0 on failure to allocate runtime */
e98fa02c
PT
4789static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
4790 struct cfs_rq *cfs_rq, u64 target_runtime)
ec12cb7f 4791{
e98fa02c
PT
4792 u64 min_amount, amount = 0;
4793
4794 lockdep_assert_held(&cfs_b->lock);
ec12cb7f
PT
4795
4796 /* note: this is a positive sum as runtime_remaining <= 0 */
e98fa02c 4797 min_amount = target_runtime - cfs_rq->runtime_remaining;
ec12cb7f 4798
ec12cb7f
PT
4799 if (cfs_b->quota == RUNTIME_INF)
4800 amount = min_amount;
58088ad0 4801 else {
77a4d1a1 4802 start_cfs_bandwidth(cfs_b);
58088ad0
PT
4803
4804 if (cfs_b->runtime > 0) {
4805 amount = min(cfs_b->runtime, min_amount);
4806 cfs_b->runtime -= amount;
4807 cfs_b->idle = 0;
4808 }
ec12cb7f 4809 }
ec12cb7f
PT
4810
4811 cfs_rq->runtime_remaining += amount;
85dac906
PT
4812
4813 return cfs_rq->runtime_remaining > 0;
ec12cb7f
PT
4814}
4815
e98fa02c
PT
4816/* returns 0 on failure to allocate runtime */
4817static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4818{
4819 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4820 int ret;
4821
4822 raw_spin_lock(&cfs_b->lock);
4823 ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
4824 raw_spin_unlock(&cfs_b->lock);
4825
4826 return ret;
4827}
4828
9dbdb155 4829static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
a9cf55b2
PT
4830{
4831 /* dock delta_exec before expiring quota (as it could span periods) */
ec12cb7f 4832 cfs_rq->runtime_remaining -= delta_exec;
a9cf55b2
PT
4833
4834 if (likely(cfs_rq->runtime_remaining > 0))
ec12cb7f
PT
4835 return;
4836
5e2d2cc2
L
4837 if (cfs_rq->throttled)
4838 return;
85dac906
PT
4839 /*
4840 * if we're unable to extend our runtime we resched so that the active
4841 * hierarchy can be throttled
4842 */
4843 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
8875125e 4844 resched_curr(rq_of(cfs_rq));
ec12cb7f
PT
4845}
4846
6c16a6dc 4847static __always_inline
9dbdb155 4848void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
ec12cb7f 4849{
56f570e5 4850 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
ec12cb7f
PT
4851 return;
4852
4853 __account_cfs_rq_runtime(cfs_rq, delta_exec);
4854}
4855
85dac906
PT
4856static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4857{
56f570e5 4858 return cfs_bandwidth_used() && cfs_rq->throttled;
85dac906
PT
4859}
4860
64660c86
PT
4861/* check whether cfs_rq, or any parent, is throttled */
4862static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4863{
56f570e5 4864 return cfs_bandwidth_used() && cfs_rq->throttle_count;
64660c86
PT
4865}
4866
4867/*
4868 * Ensure that neither of the group entities corresponding to src_cpu or
4869 * dest_cpu are members of a throttled hierarchy when performing group
4870 * load-balance operations.
4871 */
4872static inline int throttled_lb_pair(struct task_group *tg,
4873 int src_cpu, int dest_cpu)
4874{
4875 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
4876
4877 src_cfs_rq = tg->cfs_rq[src_cpu];
4878 dest_cfs_rq = tg->cfs_rq[dest_cpu];
4879
4880 return throttled_hierarchy(src_cfs_rq) ||
4881 throttled_hierarchy(dest_cfs_rq);
4882}
4883
64660c86
PT
4884static int tg_unthrottle_up(struct task_group *tg, void *data)
4885{
4886 struct rq *rq = data;
4887 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4888
4889 cfs_rq->throttle_count--;
64660c86 4890 if (!cfs_rq->throttle_count) {
64eaf507
CZ
4891 cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
4892 cfs_rq->throttled_clock_pelt;
31bc6aea 4893
a7b359fc 4894 /* Add cfs_rq with load or one or more already running entities to the list */
0a00a354 4895 if (!cfs_rq_is_decayed(cfs_rq))
31bc6aea 4896 list_add_leaf_cfs_rq(cfs_rq);
64660c86 4897 }
64660c86
PT
4898
4899 return 0;
4900}
4901
4902static int tg_throttle_down(struct task_group *tg, void *data)
4903{
4904 struct rq *rq = data;
4905 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4906
82958366 4907 /* group is entering throttled state, stop time */
31bc6aea 4908 if (!cfs_rq->throttle_count) {
64eaf507 4909 cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
31bc6aea
VG
4910 list_del_leaf_cfs_rq(cfs_rq);
4911 }
64660c86
PT
4912 cfs_rq->throttle_count++;
4913
4914 return 0;
4915}
4916
e98fa02c 4917static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
85dac906
PT
4918{
4919 struct rq *rq = rq_of(cfs_rq);
4920 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4921 struct sched_entity *se;
43e9f7f2 4922 long task_delta, idle_task_delta, dequeue = 1;
e98fa02c
PT
4923
4924 raw_spin_lock(&cfs_b->lock);
4925 /* This will start the period timer if necessary */
4926 if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
4927 /*
4928 * We have raced with bandwidth becoming available, and if we
4929 * actually throttled the timer might not unthrottle us for an
4930 * entire period. We additionally needed to make sure that any
4931 * subsequent check_cfs_rq_runtime calls agree not to throttle
4932 * us, as we may commit to do cfs put_prev+pick_next, so we ask
4933 * for 1ns of runtime rather than just check cfs_b.
4934 */
4935 dequeue = 0;
4936 } else {
4937 list_add_tail_rcu(&cfs_rq->throttled_list,
4938 &cfs_b->throttled_cfs_rq);
4939 }
4940 raw_spin_unlock(&cfs_b->lock);
4941
4942 if (!dequeue)
4943 return false; /* Throttle no longer required. */
85dac906
PT
4944
4945 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
4946
f1b17280 4947 /* freeze hierarchy runnable averages while throttled */
64660c86
PT
4948 rcu_read_lock();
4949 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
4950 rcu_read_unlock();
85dac906
PT
4951
4952 task_delta = cfs_rq->h_nr_running;
43e9f7f2 4953 idle_task_delta = cfs_rq->idle_h_nr_running;
85dac906
PT
4954 for_each_sched_entity(se) {
4955 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
4956 /* throttled entity or throttle-on-deactivate */
4957 if (!se->on_rq)
b6d37a76 4958 goto done;
85dac906 4959
b6d37a76 4960 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
6212437f 4961
30400039
JD
4962 if (cfs_rq_is_idle(group_cfs_rq(se)))
4963 idle_task_delta = cfs_rq->h_nr_running;
4964
85dac906 4965 qcfs_rq->h_nr_running -= task_delta;
43e9f7f2 4966 qcfs_rq->idle_h_nr_running -= idle_task_delta;
85dac906 4967
b6d37a76
PW
4968 if (qcfs_rq->load.weight) {
4969 /* Avoid re-evaluating load for this entity: */
4970 se = parent_entity(se);
4971 break;
4972 }
4973 }
4974
4975 for_each_sched_entity(se) {
4976 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
4977 /* throttled entity or throttle-on-deactivate */
4978 if (!se->on_rq)
4979 goto done;
4980
4981 update_load_avg(qcfs_rq, se, 0);
4982 se_update_runnable(se);
4983
30400039
JD
4984 if (cfs_rq_is_idle(group_cfs_rq(se)))
4985 idle_task_delta = cfs_rq->h_nr_running;
4986
b6d37a76
PW
4987 qcfs_rq->h_nr_running -= task_delta;
4988 qcfs_rq->idle_h_nr_running -= idle_task_delta;
85dac906
PT
4989 }
4990
b6d37a76
PW
4991 /* At this point se is NULL and we are at root level*/
4992 sub_nr_running(rq, task_delta);
85dac906 4993
b6d37a76 4994done:
c06f04c7 4995 /*
e98fa02c
PT
4996 * Note: distribution will already see us throttled via the
4997 * throttled-list. rq->lock protects completion.
c06f04c7 4998 */
e98fa02c
PT
4999 cfs_rq->throttled = 1;
5000 cfs_rq->throttled_clock = rq_clock(rq);
5001 return true;
85dac906
PT
5002}
5003
029632fb 5004void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
671fd9da
PT
5005{
5006 struct rq *rq = rq_of(cfs_rq);
5007 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5008 struct sched_entity *se;
43e9f7f2 5009 long task_delta, idle_task_delta;
671fd9da 5010
22b958d8 5011 se = cfs_rq->tg->se[cpu_of(rq)];
671fd9da
PT
5012
5013 cfs_rq->throttled = 0;
1a55af2e
FW
5014
5015 update_rq_clock(rq);
5016
671fd9da 5017 raw_spin_lock(&cfs_b->lock);
78becc27 5018 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
671fd9da
PT
5019 list_del_rcu(&cfs_rq->throttled_list);
5020 raw_spin_unlock(&cfs_b->lock);
5021
64660c86
PT
5022 /* update hierarchical throttle state */
5023 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
5024
2630cde2 5025 if (!cfs_rq->load.weight) {
51bf903b
CZ
5026 if (!cfs_rq->on_list)
5027 return;
5028 /*
5029 * Nothing to run but something to decay (on_list)?
5030 * Complete the branch.
5031 */
5032 for_each_sched_entity(se) {
5033 if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
5034 break;
5035 }
5036 goto unthrottle_throttle;
2630cde2 5037 }
671fd9da
PT
5038
5039 task_delta = cfs_rq->h_nr_running;
43e9f7f2 5040 idle_task_delta = cfs_rq->idle_h_nr_running;
671fd9da 5041 for_each_sched_entity(se) {
30400039
JD
5042 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
5043
671fd9da 5044 if (se->on_rq)
39f23ce0 5045 break;
30400039
JD
5046 enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
5047
5048 if (cfs_rq_is_idle(group_cfs_rq(se)))
5049 idle_task_delta = cfs_rq->h_nr_running;
39f23ce0 5050
30400039
JD
5051 qcfs_rq->h_nr_running += task_delta;
5052 qcfs_rq->idle_h_nr_running += idle_task_delta;
39f23ce0
VG
5053
5054 /* end evaluation on encountering a throttled cfs_rq */
30400039 5055 if (cfs_rq_throttled(qcfs_rq))
39f23ce0
VG
5056 goto unthrottle_throttle;
5057 }
671fd9da 5058
39f23ce0 5059 for_each_sched_entity(se) {
30400039 5060 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
39f23ce0 5061
30400039 5062 update_load_avg(qcfs_rq, se, UPDATE_TG);
39f23ce0 5063 se_update_runnable(se);
6212437f 5064
30400039
JD
5065 if (cfs_rq_is_idle(group_cfs_rq(se)))
5066 idle_task_delta = cfs_rq->h_nr_running;
671fd9da 5067
30400039
JD
5068 qcfs_rq->h_nr_running += task_delta;
5069 qcfs_rq->idle_h_nr_running += idle_task_delta;
39f23ce0
VG
5070
5071 /* end evaluation on encountering a throttled cfs_rq */
30400039 5072 if (cfs_rq_throttled(qcfs_rq))
39f23ce0 5073 goto unthrottle_throttle;
671fd9da
PT
5074 }
5075
39f23ce0
VG
5076 /* At this point se is NULL and we are at root level*/
5077 add_nr_running(rq, task_delta);
671fd9da 5078
39f23ce0 5079unthrottle_throttle:
fe61468b
VG
5080 assert_list_leaf_cfs_rq(rq);
5081
97fb7a0a 5082 /* Determine whether we need to wake up potentially idle CPU: */
671fd9da 5083 if (rq->curr == rq->idle && rq->cfs.nr_running)
8875125e 5084 resched_curr(rq);
671fd9da
PT
5085}
5086
26a8b127 5087static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
671fd9da
PT
5088{
5089 struct cfs_rq *cfs_rq;
26a8b127 5090 u64 runtime, remaining = 1;
671fd9da
PT
5091
5092 rcu_read_lock();
5093 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
5094 throttled_list) {
5095 struct rq *rq = rq_of(cfs_rq);
8a8c69c3 5096 struct rq_flags rf;
671fd9da 5097
c0ad4aa4 5098 rq_lock_irqsave(rq, &rf);
671fd9da
PT
5099 if (!cfs_rq_throttled(cfs_rq))
5100 goto next;
5101
5e2d2cc2
L
5102 /* By the above check, this should never be true */
5103 SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
5104
26a8b127 5105 raw_spin_lock(&cfs_b->lock);
671fd9da 5106 runtime = -cfs_rq->runtime_remaining + 1;
26a8b127
HC
5107 if (runtime > cfs_b->runtime)
5108 runtime = cfs_b->runtime;
5109 cfs_b->runtime -= runtime;
5110 remaining = cfs_b->runtime;
5111 raw_spin_unlock(&cfs_b->lock);
671fd9da
PT
5112
5113 cfs_rq->runtime_remaining += runtime;
671fd9da
PT
5114
5115 /* we check whether we're throttled above */
5116 if (cfs_rq->runtime_remaining > 0)
5117 unthrottle_cfs_rq(cfs_rq);
5118
5119next:
c0ad4aa4 5120 rq_unlock_irqrestore(rq, &rf);
671fd9da
PT
5121
5122 if (!remaining)
5123 break;
5124 }
5125 rcu_read_unlock();
671fd9da
PT
5126}
5127
58088ad0
PT
5128/*
5129 * Responsible for refilling a task_group's bandwidth and unthrottling its
5130 * cfs_rqs as appropriate. If there has been no activity within the last
5131 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
5132 * used to track this state.
5133 */
c0ad4aa4 5134static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
58088ad0 5135{
51f2176d 5136 int throttled;
58088ad0 5137
58088ad0
PT
5138 /* no need to continue the timer with no bandwidth constraint */
5139 if (cfs_b->quota == RUNTIME_INF)
51f2176d 5140 goto out_deactivate;
58088ad0 5141
671fd9da 5142 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
e8da1b18 5143 cfs_b->nr_periods += overrun;
671fd9da 5144
f4183717
HC
5145 /* Refill extra burst quota even if cfs_b->idle */
5146 __refill_cfs_bandwidth_runtime(cfs_b);
5147
51f2176d
BS
5148 /*
5149 * idle depends on !throttled (for the case of a large deficit), and if
5150 * we're going inactive then everything else can be deferred
5151 */
5152 if (cfs_b->idle && !throttled)
5153 goto out_deactivate;
a9cf55b2 5154
671fd9da
PT
5155 if (!throttled) {
5156 /* mark as potentially idle for the upcoming period */
5157 cfs_b->idle = 1;
51f2176d 5158 return 0;
671fd9da
PT
5159 }
5160
e8da1b18
NR
5161 /* account preceding periods in which throttling occurred */
5162 cfs_b->nr_throttled += overrun;
5163
671fd9da 5164 /*
26a8b127 5165 * This check is repeated as we release cfs_b->lock while we unthrottle.
671fd9da 5166 */
ab93a4bc 5167 while (throttled && cfs_b->runtime > 0) {
c0ad4aa4 5168 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
671fd9da 5169 /* we can't nest cfs_b->lock while distributing bandwidth */
26a8b127 5170 distribute_cfs_runtime(cfs_b);
c0ad4aa4 5171 raw_spin_lock_irqsave(&cfs_b->lock, flags);
671fd9da
PT
5172
5173 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
5174 }
58088ad0 5175
671fd9da
PT
5176 /*
5177 * While we are ensured activity in the period following an
5178 * unthrottle, this also covers the case in which the new bandwidth is
5179 * insufficient to cover the existing bandwidth deficit. (Forcing the
5180 * timer to remain active while there are any throttled entities.)
5181 */
5182 cfs_b->idle = 0;
58088ad0 5183
51f2176d
BS
5184 return 0;
5185
5186out_deactivate:
51f2176d 5187 return 1;
58088ad0 5188}
d3d9dc33 5189
d8b4986d
PT
5190/* a cfs_rq won't donate quota below this amount */
5191static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
5192/* minimum remaining period time to redistribute slack quota */
5193static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
5194/* how long we wait to gather additional slack before distributing */
5195static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
5196
db06e78c
BS
5197/*
5198 * Are we near the end of the current quota period?
5199 *
5200 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
4961b6e1 5201 * hrtimer base being cleared by hrtimer_start. In the case of
db06e78c
BS
5202 * migrate_hrtimers, base is never cleared, so we are fine.
5203 */
d8b4986d
PT
5204static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
5205{
5206 struct hrtimer *refresh_timer = &cfs_b->period_timer;
72d0ad7c 5207 s64 remaining;
d8b4986d
PT
5208
5209 /* if the call-back is running a quota refresh is already occurring */
5210 if (hrtimer_callback_running(refresh_timer))
5211 return 1;
5212
5213 /* is a quota refresh about to occur? */
5214 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
72d0ad7c 5215 if (remaining < (s64)min_expire)
d8b4986d
PT
5216 return 1;
5217
5218 return 0;
5219}
5220
5221static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
5222{
5223 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
5224
5225 /* if there's a quota refresh soon don't bother with slack */
5226 if (runtime_refresh_within(cfs_b, min_left))
5227 return;
5228
66567fcb 5229 /* don't push forwards an existing deferred unthrottle */
5230 if (cfs_b->slack_started)
5231 return;
5232 cfs_b->slack_started = true;
5233
4cfafd30
PZ
5234 hrtimer_start(&cfs_b->slack_timer,
5235 ns_to_ktime(cfs_bandwidth_slack_period),
5236 HRTIMER_MODE_REL);
d8b4986d
PT
5237}
5238
5239/* we know any runtime found here is valid as update_curr() precedes return */
5240static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5241{
5242 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5243 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
5244
5245 if (slack_runtime <= 0)
5246 return;
5247
5248 raw_spin_lock(&cfs_b->lock);
de53fd7a 5249 if (cfs_b->quota != RUNTIME_INF) {
d8b4986d
PT
5250 cfs_b->runtime += slack_runtime;
5251
5252 /* we are under rq->lock, defer unthrottling using a timer */
5253 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
5254 !list_empty(&cfs_b->throttled_cfs_rq))
5255 start_cfs_slack_bandwidth(cfs_b);
5256 }
5257 raw_spin_unlock(&cfs_b->lock);
5258
5259 /* even if it's not valid for return we don't want to try again */
5260 cfs_rq->runtime_remaining -= slack_runtime;
5261}
5262
5263static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5264{
56f570e5
PT
5265 if (!cfs_bandwidth_used())
5266 return;
5267
fccfdc6f 5268 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
d8b4986d
PT
5269 return;
5270
5271 __return_cfs_rq_runtime(cfs_rq);
5272}
5273
5274/*
5275 * This is done with a timer (instead of inline with bandwidth return) since
5276 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
5277 */
5278static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
5279{
5280 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
c0ad4aa4 5281 unsigned long flags;
d8b4986d
PT
5282
5283 /* confirm we're still not at a refresh boundary */
c0ad4aa4 5284 raw_spin_lock_irqsave(&cfs_b->lock, flags);
66567fcb 5285 cfs_b->slack_started = false;
baa9be4f 5286
db06e78c 5287 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
c0ad4aa4 5288 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
d8b4986d 5289 return;
db06e78c 5290 }
d8b4986d 5291
c06f04c7 5292 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
d8b4986d 5293 runtime = cfs_b->runtime;
c06f04c7 5294
c0ad4aa4 5295 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
d8b4986d
PT
5296
5297 if (!runtime)
5298 return;
5299
26a8b127 5300 distribute_cfs_runtime(cfs_b);
d8b4986d
PT
5301}
5302
d3d9dc33
PT
5303/*
5304 * When a group wakes up we want to make sure that its quota is not already
5305 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
c034f48e 5306 * runtime as update_curr() throttling can not trigger until it's on-rq.
d3d9dc33
PT
5307 */
5308static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
5309{
56f570e5
PT
5310 if (!cfs_bandwidth_used())
5311 return;
5312
d3d9dc33
PT
5313 /* an active group must be handled by the update_curr()->put() path */
5314 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
5315 return;
5316
5317 /* ensure the group is not already throttled */
5318 if (cfs_rq_throttled(cfs_rq))
5319 return;
5320
5321 /* update runtime allocation */
5322 account_cfs_rq_runtime(cfs_rq, 0);
5323 if (cfs_rq->runtime_remaining <= 0)
5324 throttle_cfs_rq(cfs_rq);
5325}
5326
55e16d30
PZ
5327static void sync_throttle(struct task_group *tg, int cpu)
5328{
5329 struct cfs_rq *pcfs_rq, *cfs_rq;
5330
5331 if (!cfs_bandwidth_used())
5332 return;
5333
5334 if (!tg->parent)
5335 return;
5336
5337 cfs_rq = tg->cfs_rq[cpu];
5338 pcfs_rq = tg->parent->cfs_rq[cpu];
5339
5340 cfs_rq->throttle_count = pcfs_rq->throttle_count;
64eaf507 5341 cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu));
55e16d30
PZ
5342}
5343
d3d9dc33 5344/* conditionally throttle active cfs_rq's from put_prev_entity() */
678d5718 5345static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
d3d9dc33 5346{
56f570e5 5347 if (!cfs_bandwidth_used())
678d5718 5348 return false;
56f570e5 5349
d3d9dc33 5350 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
678d5718 5351 return false;
d3d9dc33
PT
5352
5353 /*
5354 * it's possible for a throttled entity to be forced into a running
5355 * state (e.g. set_curr_task), in this case we're finished.
5356 */
5357 if (cfs_rq_throttled(cfs_rq))
678d5718 5358 return true;
d3d9dc33 5359
e98fa02c 5360 return throttle_cfs_rq(cfs_rq);
d3d9dc33 5361}
029632fb 5362
029632fb
PZ
5363static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
5364{
5365 struct cfs_bandwidth *cfs_b =
5366 container_of(timer, struct cfs_bandwidth, slack_timer);
77a4d1a1 5367
029632fb
PZ
5368 do_sched_cfs_slack_timer(cfs_b);
5369
5370 return HRTIMER_NORESTART;
5371}
5372
2e8e1922
PA
5373extern const u64 max_cfs_quota_period;
5374
029632fb
PZ
5375static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
5376{
5377 struct cfs_bandwidth *cfs_b =
5378 container_of(timer, struct cfs_bandwidth, period_timer);
c0ad4aa4 5379 unsigned long flags;
029632fb
PZ
5380 int overrun;
5381 int idle = 0;
2e8e1922 5382 int count = 0;
029632fb 5383
c0ad4aa4 5384 raw_spin_lock_irqsave(&cfs_b->lock, flags);
029632fb 5385 for (;;) {
77a4d1a1 5386 overrun = hrtimer_forward_now(timer, cfs_b->period);
029632fb
PZ
5387 if (!overrun)
5388 break;
5389
5a6d6a6c
HC
5390 idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
5391
2e8e1922
PA
5392 if (++count > 3) {
5393 u64 new, old = ktime_to_ns(cfs_b->period);
5394
4929a4e6
XZ
5395 /*
5396 * Grow period by a factor of 2 to avoid losing precision.
5397 * Precision loss in the quota/period ratio can cause __cfs_schedulable
5398 * to fail.
5399 */
5400 new = old * 2;
5401 if (new < max_cfs_quota_period) {
5402 cfs_b->period = ns_to_ktime(new);
5403 cfs_b->quota *= 2;
f4183717 5404 cfs_b->burst *= 2;
4929a4e6
XZ
5405
5406 pr_warn_ratelimited(
5407 "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
5408 smp_processor_id(),
5409 div_u64(new, NSEC_PER_USEC),
5410 div_u64(cfs_b->quota, NSEC_PER_USEC));
5411 } else {
5412 pr_warn_ratelimited(
5413 "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
5414 smp_processor_id(),
5415 div_u64(old, NSEC_PER_USEC),
5416 div_u64(cfs_b->quota, NSEC_PER_USEC));
5417 }
2e8e1922
PA
5418
5419 /* reset count so we don't come right back in here */
5420 count = 0;
5421 }
029632fb 5422 }
4cfafd30
PZ
5423 if (idle)
5424 cfs_b->period_active = 0;
c0ad4aa4 5425 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
029632fb
PZ
5426
5427 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
5428}
5429
5430void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5431{
5432 raw_spin_lock_init(&cfs_b->lock);
5433 cfs_b->runtime = 0;
5434 cfs_b->quota = RUNTIME_INF;
5435 cfs_b->period = ns_to_ktime(default_cfs_period());
f4183717 5436 cfs_b->burst = 0;
029632fb
PZ
5437
5438 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
4cfafd30 5439 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
029632fb
PZ
5440 cfs_b->period_timer.function = sched_cfs_period_timer;
5441 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5442 cfs_b->slack_timer.function = sched_cfs_slack_timer;
66567fcb 5443 cfs_b->slack_started = false;
029632fb
PZ
5444}
5445
5446static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5447{
5448 cfs_rq->runtime_enabled = 0;
5449 INIT_LIST_HEAD(&cfs_rq->throttled_list);
5450}
5451
77a4d1a1 5452void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
029632fb 5453{
4cfafd30 5454 lockdep_assert_held(&cfs_b->lock);
029632fb 5455
f1d1be8a
XP
5456 if (cfs_b->period_active)
5457 return;
5458
5459 cfs_b->period_active = 1;
763a9ec0 5460 hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
f1d1be8a 5461 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
029632fb
PZ
5462}
5463
5464static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5465{
7f1a169b
TH
5466 /* init_cfs_bandwidth() was not called */
5467 if (!cfs_b->throttled_cfs_rq.next)
5468 return;
5469
029632fb
PZ
5470 hrtimer_cancel(&cfs_b->period_timer);
5471 hrtimer_cancel(&cfs_b->slack_timer);
5472}
5473
502ce005 5474/*
97fb7a0a 5475 * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
502ce005
PZ
5476 *
5477 * The race is harmless, since modifying bandwidth settings of unhooked group
5478 * bits doesn't do much.
5479 */
5480
3b03706f 5481/* cpu online callback */
0e59bdae
KT
5482static void __maybe_unused update_runtime_enabled(struct rq *rq)
5483{
502ce005 5484 struct task_group *tg;
0e59bdae 5485
5cb9eaa3 5486 lockdep_assert_rq_held(rq);
502ce005
PZ
5487
5488 rcu_read_lock();
5489 list_for_each_entry_rcu(tg, &task_groups, list) {
5490 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
5491 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
0e59bdae
KT
5492
5493 raw_spin_lock(&cfs_b->lock);
5494 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
5495 raw_spin_unlock(&cfs_b->lock);
5496 }
502ce005 5497 rcu_read_unlock();
0e59bdae
KT
5498}
5499
502ce005 5500/* cpu offline callback */
38dc3348 5501static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
029632fb 5502{
502ce005
PZ
5503 struct task_group *tg;
5504
5cb9eaa3 5505 lockdep_assert_rq_held(rq);
502ce005
PZ
5506
5507 rcu_read_lock();
5508 list_for_each_entry_rcu(tg, &task_groups, list) {
5509 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
029632fb 5510
029632fb
PZ
5511 if (!cfs_rq->runtime_enabled)
5512 continue;
5513
5514 /*
5515 * clock_task is not advancing so we just need to make sure
5516 * there's some valid quota amount
5517 */
51f2176d 5518 cfs_rq->runtime_remaining = 1;
0e59bdae 5519 /*
97fb7a0a 5520 * Offline rq is schedulable till CPU is completely disabled
0e59bdae
KT
5521 * in take_cpu_down(), so we prevent new cfs throttling here.
5522 */
5523 cfs_rq->runtime_enabled = 0;
5524
029632fb
PZ
5525 if (cfs_rq_throttled(cfs_rq))
5526 unthrottle_cfs_rq(cfs_rq);
5527 }
502ce005 5528 rcu_read_unlock();
029632fb
PZ
5529}
5530
5531#else /* CONFIG_CFS_BANDWIDTH */
f6783319
VG
5532
5533static inline bool cfs_bandwidth_used(void)
5534{
5535 return false;
5536}
5537
9dbdb155 5538static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
678d5718 5539static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
d3d9dc33 5540static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
55e16d30 5541static inline void sync_throttle(struct task_group *tg, int cpu) {}
6c16a6dc 5542static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
85dac906
PT
5543
5544static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5545{
5546 return 0;
5547}
64660c86
PT
5548
5549static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5550{
5551 return 0;
5552}
5553
5554static inline int throttled_lb_pair(struct task_group *tg,
5555 int src_cpu, int dest_cpu)
5556{
5557 return 0;
5558}
029632fb
PZ
5559
5560void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5561
5562#ifdef CONFIG_FAIR_GROUP_SCHED
5563static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
ab84d31e
PT
5564#endif
5565
029632fb
PZ
5566static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5567{
5568 return NULL;
5569}
5570static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
0e59bdae 5571static inline void update_runtime_enabled(struct rq *rq) {}
a4c96ae3 5572static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
029632fb
PZ
5573
5574#endif /* CONFIG_CFS_BANDWIDTH */
5575
bf0f6f24
IM
5576/**************************************************
5577 * CFS operations on tasks:
5578 */
5579
8f4d37ec
PZ
5580#ifdef CONFIG_SCHED_HRTICK
5581static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
5582{
8f4d37ec
PZ
5583 struct sched_entity *se = &p->se;
5584 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5585
9148a3a1 5586 SCHED_WARN_ON(task_rq(p) != rq);
8f4d37ec 5587
8bf46a39 5588 if (rq->cfs.h_nr_running > 1) {
8f4d37ec
PZ
5589 u64 slice = sched_slice(cfs_rq, se);
5590 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
5591 s64 delta = slice - ran;
5592
5593 if (delta < 0) {
65bcf072 5594 if (task_current(rq, p))
8875125e 5595 resched_curr(rq);
8f4d37ec
PZ
5596 return;
5597 }
31656519 5598 hrtick_start(rq, delta);
8f4d37ec
PZ
5599 }
5600}
a4c2f00f
PZ
5601
5602/*
5603 * called from enqueue/dequeue and updates the hrtick when the
5604 * current task is from our class and nr_running is low enough
5605 * to matter.
5606 */
5607static void hrtick_update(struct rq *rq)
5608{
5609 struct task_struct *curr = rq->curr;
5610
e0ee463c 5611 if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class)
a4c2f00f
PZ
5612 return;
5613
5614 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
5615 hrtick_start_fair(rq, curr);
5616}
55e12e5e 5617#else /* !CONFIG_SCHED_HRTICK */
8f4d37ec
PZ
5618static inline void
5619hrtick_start_fair(struct rq *rq, struct task_struct *p)
5620{
5621}
a4c2f00f
PZ
5622
5623static inline void hrtick_update(struct rq *rq)
5624{
5625}
8f4d37ec
PZ
5626#endif
5627
2802bf3c 5628#ifdef CONFIG_SMP
2802bf3c
MR
5629static inline bool cpu_overutilized(int cpu)
5630{
82762d2a 5631 return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu));
2802bf3c
MR
5632}
5633
5634static inline void update_overutilized_status(struct rq *rq)
5635{
f9f240f9 5636 if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
2802bf3c 5637 WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
f9f240f9
QY
5638 trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
5639 }
2802bf3c
MR
5640}
5641#else
5642static inline void update_overutilized_status(struct rq *rq) { }
5643#endif
5644
323af6de
VK
5645/* Runqueue only has SCHED_IDLE tasks enqueued */
5646static int sched_idle_rq(struct rq *rq)
5647{
5648 return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
5649 rq->nr_running);
5650}
5651
a480adde
JD
5652/*
5653 * Returns true if cfs_rq only has SCHED_IDLE entities enqueued. Note the use
5654 * of idle_nr_running, which does not consider idle descendants of normal
5655 * entities.
5656 */
5657static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq)
5658{
5659 return cfs_rq->nr_running &&
5660 cfs_rq->nr_running == cfs_rq->idle_nr_running;
5661}
5662
afa70d94 5663#ifdef CONFIG_SMP
323af6de
VK
5664static int sched_idle_cpu(int cpu)
5665{
5666 return sched_idle_rq(cpu_rq(cpu));
5667}
afa70d94 5668#endif
323af6de 5669
bf0f6f24
IM
5670/*
5671 * The enqueue_task method is called before nr_running is
5672 * increased. Here we update the fair scheduling stats and
5673 * then put the task into the rbtree:
5674 */
ea87bb78 5675static void
371fd7e7 5676enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
bf0f6f24
IM
5677{
5678 struct cfs_rq *cfs_rq;
62fb1851 5679 struct sched_entity *se = &p->se;
43e9f7f2 5680 int idle_h_nr_running = task_has_idle_policy(p);
8e1ac429 5681 int task_new = !(flags & ENQUEUE_WAKEUP);
bf0f6f24 5682
2539fc82
PB
5683 /*
5684 * The code below (indirectly) updates schedutil which looks at
5685 * the cfs_rq utilization to select a frequency.
5686 * Let's add the task's estimated utilization to the cfs_rq's
5687 * estimated utilization, before we update schedutil.
5688 */
5689 util_est_enqueue(&rq->cfs, p);
5690
8c34ab19
RW
5691 /*
5692 * If in_iowait is set, the code below may not trigger any cpufreq
5693 * utilization updates, so do it here explicitly with the IOWAIT flag
5694 * passed.
5695 */
5696 if (p->in_iowait)
674e7541 5697 cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
8c34ab19 5698
bf0f6f24 5699 for_each_sched_entity(se) {
62fb1851 5700 if (se->on_rq)
bf0f6f24
IM
5701 break;
5702 cfs_rq = cfs_rq_of(se);
88ec22d3 5703 enqueue_entity(cfs_rq, se, flags);
85dac906 5704
953bfcd1 5705 cfs_rq->h_nr_running++;
43e9f7f2 5706 cfs_rq->idle_h_nr_running += idle_h_nr_running;
85dac906 5707
30400039
JD
5708 if (cfs_rq_is_idle(cfs_rq))
5709 idle_h_nr_running = 1;
5710
6d4d2246
VG
5711 /* end evaluation on encountering a throttled cfs_rq */
5712 if (cfs_rq_throttled(cfs_rq))
5713 goto enqueue_throttle;
5714
88ec22d3 5715 flags = ENQUEUE_WAKEUP;
bf0f6f24 5716 }
8f4d37ec 5717
2069dd75 5718 for_each_sched_entity(se) {
0f317143 5719 cfs_rq = cfs_rq_of(se);
2069dd75 5720
88c0616e 5721 update_load_avg(cfs_rq, se, UPDATE_TG);
9f683953 5722 se_update_runnable(se);
1ea6c46a 5723 update_cfs_group(se);
6d4d2246
VG
5724
5725 cfs_rq->h_nr_running++;
5726 cfs_rq->idle_h_nr_running += idle_h_nr_running;
5ab297ba 5727
30400039
JD
5728 if (cfs_rq_is_idle(cfs_rq))
5729 idle_h_nr_running = 1;
5730
5ab297ba
VG
5731 /* end evaluation on encountering a throttled cfs_rq */
5732 if (cfs_rq_throttled(cfs_rq))
5733 goto enqueue_throttle;
2069dd75
PZ
5734 }
5735
7d148be6
VG
5736 /* At this point se is NULL and we are at root level*/
5737 add_nr_running(rq, 1);
2802bf3c 5738
7d148be6
VG
5739 /*
5740 * Since new tasks are assigned an initial util_avg equal to
5741 * half of the spare capacity of their CPU, tiny tasks have the
5742 * ability to cross the overutilized threshold, which will
5743 * result in the load balancer ruining all the task placement
5744 * done by EAS. As a way to mitigate that effect, do not account
5745 * for the first enqueue operation of new tasks during the
5746 * overutilized flag detection.
5747 *
5748 * A better way of solving this problem would be to wait for
5749 * the PELT signals of tasks to converge before taking them
5750 * into account, but that is not straightforward to implement,
5751 * and the following generally works well enough in practice.
5752 */
8e1ac429 5753 if (!task_new)
7d148be6 5754 update_overutilized_status(rq);
cd126afe 5755
7d148be6 5756enqueue_throttle:
5d299eab
PZ
5757 assert_list_leaf_cfs_rq(rq);
5758
a4c2f00f 5759 hrtick_update(rq);
bf0f6f24
IM
5760}
5761
2f36825b
VP
5762static void set_next_buddy(struct sched_entity *se);
5763
bf0f6f24
IM
5764/*
5765 * The dequeue_task method is called before nr_running is
5766 * decreased. We remove the task from the rbtree and
5767 * update the fair scheduling stats:
5768 */
371fd7e7 5769static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
bf0f6f24
IM
5770{
5771 struct cfs_rq *cfs_rq;
62fb1851 5772 struct sched_entity *se = &p->se;
2f36825b 5773 int task_sleep = flags & DEQUEUE_SLEEP;
43e9f7f2 5774 int idle_h_nr_running = task_has_idle_policy(p);
323af6de 5775 bool was_sched_idle = sched_idle_rq(rq);
bf0f6f24 5776
8c1f560c
XY
5777 util_est_dequeue(&rq->cfs, p);
5778
bf0f6f24
IM
5779 for_each_sched_entity(se) {
5780 cfs_rq = cfs_rq_of(se);
371fd7e7 5781 dequeue_entity(cfs_rq, se, flags);
85dac906 5782
953bfcd1 5783 cfs_rq->h_nr_running--;
43e9f7f2 5784 cfs_rq->idle_h_nr_running -= idle_h_nr_running;
2069dd75 5785
30400039
JD
5786 if (cfs_rq_is_idle(cfs_rq))
5787 idle_h_nr_running = 1;
5788
6d4d2246
VG
5789 /* end evaluation on encountering a throttled cfs_rq */
5790 if (cfs_rq_throttled(cfs_rq))
5791 goto dequeue_throttle;
5792
bf0f6f24 5793 /* Don't dequeue parent if it has other entities besides us */
2f36825b 5794 if (cfs_rq->load.weight) {
754bd598
KK
5795 /* Avoid re-evaluating load for this entity: */
5796 se = parent_entity(se);
2f36825b
VP
5797 /*
5798 * Bias pick_next to pick a task from this cfs_rq, as
5799 * p is sleeping when it is within its sched_slice.
5800 */
754bd598
KK
5801 if (task_sleep && se && !throttled_hierarchy(cfs_rq))
5802 set_next_buddy(se);
bf0f6f24 5803 break;
2f36825b 5804 }
371fd7e7 5805 flags |= DEQUEUE_SLEEP;
bf0f6f24 5806 }
8f4d37ec 5807
2069dd75 5808 for_each_sched_entity(se) {
0f317143 5809 cfs_rq = cfs_rq_of(se);
2069dd75 5810
88c0616e 5811 update_load_avg(cfs_rq, se, UPDATE_TG);
9f683953 5812 se_update_runnable(se);
1ea6c46a 5813 update_cfs_group(se);
6d4d2246
VG
5814
5815 cfs_rq->h_nr_running--;
5816 cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5ab297ba 5817
30400039
JD
5818 if (cfs_rq_is_idle(cfs_rq))
5819 idle_h_nr_running = 1;
5820
5ab297ba
VG
5821 /* end evaluation on encountering a throttled cfs_rq */
5822 if (cfs_rq_throttled(cfs_rq))
5823 goto dequeue_throttle;
5824
2069dd75
PZ
5825 }
5826
423d02e1
PW
5827 /* At this point se is NULL and we are at root level*/
5828 sub_nr_running(rq, 1);
cd126afe 5829
323af6de
VK
5830 /* balance early to pull high priority tasks */
5831 if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
5832 rq->next_balance = jiffies;
5833
423d02e1 5834dequeue_throttle:
8c1f560c 5835 util_est_update(&rq->cfs, p, task_sleep);
a4c2f00f 5836 hrtick_update(rq);
bf0f6f24
IM
5837}
5838
e7693a36 5839#ifdef CONFIG_SMP
10e2f1ac
PZ
5840
5841/* Working cpumask for: load_balance, load_balance_newidle. */
5842DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
5843DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
5844
9fd81dd5 5845#ifdef CONFIG_NO_HZ_COMMON
e022e0d3
PZ
5846
5847static struct {
5848 cpumask_var_t idle_cpus_mask;
5849 atomic_t nr_cpus;
f643ea22 5850 int has_blocked; /* Idle CPUS has blocked load */
7fd7a9e0 5851 int needs_update; /* Newly idle CPUs need their next_balance collated */
e022e0d3 5852 unsigned long next_balance; /* in jiffy units */
f643ea22 5853 unsigned long next_blocked; /* Next update of blocked load in jiffies */
e022e0d3
PZ
5854} nohz ____cacheline_aligned;
5855
9fd81dd5 5856#endif /* CONFIG_NO_HZ_COMMON */
3289bdb4 5857
b0fb1eb4
VG
5858static unsigned long cpu_load(struct rq *rq)
5859{
5860 return cfs_rq_load_avg(&rq->cfs);
5861}
5862
3318544b
VG
5863/*
5864 * cpu_load_without - compute CPU load without any contributions from *p
5865 * @cpu: the CPU which load is requested
5866 * @p: the task which load should be discounted
5867 *
5868 * The load of a CPU is defined by the load of tasks currently enqueued on that
5869 * CPU as well as tasks which are currently sleeping after an execution on that
5870 * CPU.
5871 *
5872 * This method returns the load of the specified CPU by discounting the load of
5873 * the specified task, whenever the task is currently contributing to the CPU
5874 * load.
5875 */
5876static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
5877{
5878 struct cfs_rq *cfs_rq;
5879 unsigned int load;
5880
5881 /* Task has no contribution or is new */
5882 if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
5883 return cpu_load(rq);
5884
5885 cfs_rq = &rq->cfs;
5886 load = READ_ONCE(cfs_rq->avg.load_avg);
5887
5888 /* Discount task's util from CPU's util */
5889 lsub_positive(&load, task_h_load(p));
5890
5891 return load;
5892}
5893
9f683953
VG
5894static unsigned long cpu_runnable(struct rq *rq)
5895{
5896 return cfs_rq_runnable_avg(&rq->cfs);
5897}
5898
070f5e86
VG
5899static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p)
5900{
5901 struct cfs_rq *cfs_rq;
5902 unsigned int runnable;
5903
5904 /* Task has no contribution or is new */
5905 if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
5906 return cpu_runnable(rq);
5907
5908 cfs_rq = &rq->cfs;
5909 runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
5910
5911 /* Discount task's runnable from CPU's runnable */
5912 lsub_positive(&runnable, p->se.avg.runnable_avg);
5913
5914 return runnable;
5915}
5916
ced549fa 5917static unsigned long capacity_of(int cpu)
029632fb 5918{
ced549fa 5919 return cpu_rq(cpu)->cpu_capacity;
029632fb
PZ
5920}
5921
c58d25f3
PZ
5922static void record_wakee(struct task_struct *p)
5923{
5924 /*
5925 * Only decay a single time; tasks that have less then 1 wakeup per
5926 * jiffy will not have built up many flips.
5927 */
5928 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
5929 current->wakee_flips >>= 1;
5930 current->wakee_flip_decay_ts = jiffies;
5931 }
5932
5933 if (current->last_wakee != p) {
5934 current->last_wakee = p;
5935 current->wakee_flips++;
5936 }
5937}
5938
63b0e9ed
MG
5939/*
5940 * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
c58d25f3 5941 *
63b0e9ed 5942 * A waker of many should wake a different task than the one last awakened
c58d25f3
PZ
5943 * at a frequency roughly N times higher than one of its wakees.
5944 *
5945 * In order to determine whether we should let the load spread vs consolidating
5946 * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
5947 * partner, and a factor of lls_size higher frequency in the other.
5948 *
5949 * With both conditions met, we can be relatively sure that the relationship is
5950 * non-monogamous, with partner count exceeding socket size.
5951 *
5952 * Waker/wakee being client/server, worker/dispatcher, interrupt source or
5953 * whatever is irrelevant, spread criteria is apparent partner count exceeds
5954 * socket size.
63b0e9ed 5955 */
62470419
MW
5956static int wake_wide(struct task_struct *p)
5957{
63b0e9ed
MG
5958 unsigned int master = current->wakee_flips;
5959 unsigned int slave = p->wakee_flips;
17c891ab 5960 int factor = __this_cpu_read(sd_llc_size);
62470419 5961
63b0e9ed
MG
5962 if (master < slave)
5963 swap(master, slave);
5964 if (slave < factor || master < slave * factor)
5965 return 0;
5966 return 1;
62470419
MW
5967}
5968
90001d67 5969/*
d153b153
PZ
5970 * The purpose of wake_affine() is to quickly determine on which CPU we can run
5971 * soonest. For the purpose of speed we only consider the waking and previous
5972 * CPU.
90001d67 5973 *
7332dec0
MG
5974 * wake_affine_idle() - only considers 'now', it check if the waking CPU is
5975 * cache-affine and is (or will be) idle.
f2cdd9cc
PZ
5976 *
5977 * wake_affine_weight() - considers the weight to reflect the average
5978 * scheduling latency of the CPUs. This seems to work
5979 * for the overloaded case.
90001d67 5980 */
3b76c4a3 5981static int
89a55f56 5982wake_affine_idle(int this_cpu, int prev_cpu, int sync)
90001d67 5983{
7332dec0
MG
5984 /*
5985 * If this_cpu is idle, it implies the wakeup is from interrupt
5986 * context. Only allow the move if cache is shared. Otherwise an
5987 * interrupt intensive workload could force all tasks onto one
5988 * node depending on the IO topology or IRQ affinity settings.
806486c3
MG
5989 *
5990 * If the prev_cpu is idle and cache affine then avoid a migration.
5991 * There is no guarantee that the cache hot data from an interrupt
5992 * is more important than cache hot data on the prev_cpu and from
5993 * a cpufreq perspective, it's better to have higher utilisation
5994 * on one CPU.
7332dec0 5995 */
943d355d
RJ
5996 if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
5997 return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
90001d67 5998
d153b153 5999 if (sync && cpu_rq(this_cpu)->nr_running == 1)
3b76c4a3 6000 return this_cpu;
90001d67 6001
d8fcb81f
JL
6002 if (available_idle_cpu(prev_cpu))
6003 return prev_cpu;
6004
3b76c4a3 6005 return nr_cpumask_bits;
90001d67
PZ
6006}
6007
3b76c4a3 6008static int
f2cdd9cc
PZ
6009wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
6010 int this_cpu, int prev_cpu, int sync)
90001d67 6011{
90001d67
PZ
6012 s64 this_eff_load, prev_eff_load;
6013 unsigned long task_load;
6014
11f10e54 6015 this_eff_load = cpu_load(cpu_rq(this_cpu));
90001d67 6016
90001d67
PZ
6017 if (sync) {
6018 unsigned long current_load = task_h_load(current);
6019
f2cdd9cc 6020 if (current_load > this_eff_load)
3b76c4a3 6021 return this_cpu;
90001d67 6022
f2cdd9cc 6023 this_eff_load -= current_load;
90001d67
PZ
6024 }
6025
90001d67
PZ
6026 task_load = task_h_load(p);
6027
f2cdd9cc
PZ
6028 this_eff_load += task_load;
6029 if (sched_feat(WA_BIAS))
6030 this_eff_load *= 100;
6031 this_eff_load *= capacity_of(prev_cpu);
90001d67 6032
11f10e54 6033 prev_eff_load = cpu_load(cpu_rq(prev_cpu));
f2cdd9cc
PZ
6034 prev_eff_load -= task_load;
6035 if (sched_feat(WA_BIAS))
6036 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
6037 prev_eff_load *= capacity_of(this_cpu);
90001d67 6038
082f764a
MG
6039 /*
6040 * If sync, adjust the weight of prev_eff_load such that if
6041 * prev_eff == this_eff that select_idle_sibling() will consider
6042 * stacking the wakee on top of the waker if no other CPU is
6043 * idle.
6044 */
6045 if (sync)
6046 prev_eff_load += 1;
6047
6048 return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
90001d67
PZ
6049}
6050
772bd008 6051static int wake_affine(struct sched_domain *sd, struct task_struct *p,
7ebb66a1 6052 int this_cpu, int prev_cpu, int sync)
098fb9db 6053{
3b76c4a3 6054 int target = nr_cpumask_bits;
098fb9db 6055
89a55f56 6056 if (sched_feat(WA_IDLE))
3b76c4a3 6057 target = wake_affine_idle(this_cpu, prev_cpu, sync);
90001d67 6058
3b76c4a3
MG
6059 if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
6060 target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
098fb9db 6061
ceeadb83 6062 schedstat_inc(p->stats.nr_wakeups_affine_attempts);
3b76c4a3
MG
6063 if (target == nr_cpumask_bits)
6064 return prev_cpu;
098fb9db 6065
3b76c4a3 6066 schedstat_inc(sd->ttwu_move_affine);
ceeadb83 6067 schedstat_inc(p->stats.nr_wakeups_affine);
3b76c4a3 6068 return target;
098fb9db
IM
6069}
6070
aaee1203 6071static struct sched_group *
45da2773 6072find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
aaee1203
PZ
6073
6074/*
97fb7a0a 6075 * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
aaee1203
PZ
6076 */
6077static int
18bd1b4b 6078find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
aaee1203
PZ
6079{
6080 unsigned long load, min_load = ULONG_MAX;
83a0a96a
NP
6081 unsigned int min_exit_latency = UINT_MAX;
6082 u64 latest_idle_timestamp = 0;
6083 int least_loaded_cpu = this_cpu;
17346452 6084 int shallowest_idle_cpu = -1;
aaee1203
PZ
6085 int i;
6086
eaecf41f
MR
6087 /* Check if we have any choice: */
6088 if (group->group_weight == 1)
ae4df9d6 6089 return cpumask_first(sched_group_span(group));
eaecf41f 6090
aaee1203 6091 /* Traverse only the allowed CPUs */
3bd37062 6092 for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
97886d9d
AL
6093 struct rq *rq = cpu_rq(i);
6094
6095 if (!sched_core_cookie_match(rq, p))
6096 continue;
6097
17346452
VK
6098 if (sched_idle_cpu(i))
6099 return i;
6100
943d355d 6101 if (available_idle_cpu(i)) {
83a0a96a
NP
6102 struct cpuidle_state *idle = idle_get_state(rq);
6103 if (idle && idle->exit_latency < min_exit_latency) {
6104 /*
6105 * We give priority to a CPU whose idle state
6106 * has the smallest exit latency irrespective
6107 * of any idle timestamp.
6108 */
6109 min_exit_latency = idle->exit_latency;
6110 latest_idle_timestamp = rq->idle_stamp;
6111 shallowest_idle_cpu = i;
6112 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
6113 rq->idle_stamp > latest_idle_timestamp) {
6114 /*
6115 * If equal or no active idle state, then
6116 * the most recently idled CPU might have
6117 * a warmer cache.
6118 */
6119 latest_idle_timestamp = rq->idle_stamp;
6120 shallowest_idle_cpu = i;
6121 }
17346452 6122 } else if (shallowest_idle_cpu == -1) {
11f10e54 6123 load = cpu_load(cpu_rq(i));
18cec7e0 6124 if (load < min_load) {
83a0a96a
NP
6125 min_load = load;
6126 least_loaded_cpu = i;
6127 }
e7693a36
GH
6128 }
6129 }
6130
17346452 6131 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
aaee1203 6132}
e7693a36 6133
18bd1b4b
BJ
6134static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
6135 int cpu, int prev_cpu, int sd_flag)
6136{
93f50f90 6137 int new_cpu = cpu;
18bd1b4b 6138
3bd37062 6139 if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
6fee85cc
BJ
6140 return prev_cpu;
6141
c976a862 6142 /*
57abff06 6143 * We need task's util for cpu_util_without, sync it up to
c469933e 6144 * prev_cpu's last_update_time.
c976a862
VK
6145 */
6146 if (!(sd_flag & SD_BALANCE_FORK))
6147 sync_entity_load_avg(&p->se);
6148
18bd1b4b
BJ
6149 while (sd) {
6150 struct sched_group *group;
6151 struct sched_domain *tmp;
6152 int weight;
6153
6154 if (!(sd->flags & sd_flag)) {
6155 sd = sd->child;
6156 continue;
6157 }
6158
45da2773 6159 group = find_idlest_group(sd, p, cpu);
18bd1b4b
BJ
6160 if (!group) {
6161 sd = sd->child;
6162 continue;
6163 }
6164
6165 new_cpu = find_idlest_group_cpu(group, p, cpu);
e90381ea 6166 if (new_cpu == cpu) {
97fb7a0a 6167 /* Now try balancing at a lower domain level of 'cpu': */
18bd1b4b
BJ
6168 sd = sd->child;
6169 continue;
6170 }
6171
97fb7a0a 6172 /* Now try balancing at a lower domain level of 'new_cpu': */
18bd1b4b
BJ
6173 cpu = new_cpu;
6174 weight = sd->span_weight;
6175 sd = NULL;
6176 for_each_domain(cpu, tmp) {
6177 if (weight <= tmp->span_weight)
6178 break;
6179 if (tmp->flags & sd_flag)
6180 sd = tmp;
6181 }
18bd1b4b
BJ
6182 }
6183
6184 return new_cpu;
6185}
6186
97886d9d 6187static inline int __select_idle_cpu(int cpu, struct task_struct *p)
9fe1f127 6188{
97886d9d
AL
6189 if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
6190 sched_cpu_cookie_match(cpu_rq(cpu), p))
9fe1f127
MG
6191 return cpu;
6192
6193 return -1;
6194}
6195
10e2f1ac 6196#ifdef CONFIG_SCHED_SMT
ba2591a5 6197DEFINE_STATIC_KEY_FALSE(sched_smt_present);
b284909a 6198EXPORT_SYMBOL_GPL(sched_smt_present);
10e2f1ac
PZ
6199
6200static inline void set_idle_cores(int cpu, int val)
6201{
6202 struct sched_domain_shared *sds;
6203
6204 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
6205 if (sds)
6206 WRITE_ONCE(sds->has_idle_cores, val);
6207}
6208
6209static inline bool test_idle_cores(int cpu, bool def)
6210{
6211 struct sched_domain_shared *sds;
6212
c722f35b
RR
6213 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
6214 if (sds)
6215 return READ_ONCE(sds->has_idle_cores);
10e2f1ac
PZ
6216
6217 return def;
6218}
6219
6220/*
6221 * Scans the local SMT mask to see if the entire core is idle, and records this
6222 * information in sd_llc_shared->has_idle_cores.
6223 *
6224 * Since SMT siblings share all cache levels, inspecting this limited remote
6225 * state should be fairly cheap.
6226 */
1b568f0a 6227void __update_idle_core(struct rq *rq)
10e2f1ac
PZ
6228{
6229 int core = cpu_of(rq);
6230 int cpu;
6231
6232 rcu_read_lock();
6233 if (test_idle_cores(core, true))
6234 goto unlock;
6235
6236 for_each_cpu(cpu, cpu_smt_mask(core)) {
6237 if (cpu == core)
6238 continue;
6239
943d355d 6240 if (!available_idle_cpu(cpu))
10e2f1ac
PZ
6241 goto unlock;
6242 }
6243
6244 set_idle_cores(core, 1);
6245unlock:
6246 rcu_read_unlock();
6247}
6248
6249/*
6250 * Scan the entire LLC domain for idle cores; this dynamically switches off if
6251 * there are no idle cores left in the system; tracked through
6252 * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
6253 */
9fe1f127 6254static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
10e2f1ac 6255{
9fe1f127
MG
6256 bool idle = true;
6257 int cpu;
10e2f1ac 6258
1b568f0a 6259 if (!static_branch_likely(&sched_smt_present))
97886d9d 6260 return __select_idle_cpu(core, p);
10e2f1ac 6261
9fe1f127
MG
6262 for_each_cpu(cpu, cpu_smt_mask(core)) {
6263 if (!available_idle_cpu(cpu)) {
6264 idle = false;
6265 if (*idle_cpu == -1) {
6266 if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) {
6267 *idle_cpu = cpu;
6268 break;
6269 }
6270 continue;
bec2860a 6271 }
9fe1f127 6272 break;
10e2f1ac 6273 }
9fe1f127
MG
6274 if (*idle_cpu == -1 && cpumask_test_cpu(cpu, p->cpus_ptr))
6275 *idle_cpu = cpu;
10e2f1ac
PZ
6276 }
6277
9fe1f127
MG
6278 if (idle)
6279 return core;
10e2f1ac 6280
9fe1f127 6281 cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
10e2f1ac
PZ
6282 return -1;
6283}
6284
c722f35b
RR
6285/*
6286 * Scan the local SMT mask for idle CPUs.
6287 */
6288static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
6289{
6290 int cpu;
6291
6292 for_each_cpu(cpu, cpu_smt_mask(target)) {
6293 if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
6294 !cpumask_test_cpu(cpu, sched_domain_span(sd)))
6295 continue;
6296 if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
6297 return cpu;
6298 }
6299
6300 return -1;
6301}
6302
10e2f1ac
PZ
6303#else /* CONFIG_SCHED_SMT */
6304
9fe1f127 6305static inline void set_idle_cores(int cpu, int val)
10e2f1ac 6306{
9fe1f127
MG
6307}
6308
6309static inline bool test_idle_cores(int cpu, bool def)
6310{
6311 return def;
6312}
6313
6314static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
6315{
97886d9d 6316 return __select_idle_cpu(core, p);
10e2f1ac
PZ
6317}
6318
c722f35b
RR
6319static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
6320{
6321 return -1;
6322}
6323
10e2f1ac
PZ
6324#endif /* CONFIG_SCHED_SMT */
6325
6326/*
6327 * Scan the LLC domain for idle CPUs; this is dynamically regulated by
6328 * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
6329 * average idle time for this rq (as found in rq->avg_idle).
a50bde51 6330 */
c722f35b 6331static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
10e2f1ac 6332{
60588bfa 6333 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
9fe1f127 6334 int i, cpu, idle_cpu = -1, nr = INT_MAX;
94aafc3e 6335 struct rq *this_rq = this_rq();
9fe1f127 6336 int this = smp_processor_id();
9cfb38a7 6337 struct sched_domain *this_sd;
94aafc3e 6338 u64 time = 0;
10e2f1ac 6339
9cfb38a7
WL
6340 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
6341 if (!this_sd)
6342 return -1;
6343
bae4ec13
MG
6344 cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
6345
c722f35b 6346 if (sched_feat(SIS_PROP) && !has_idle_core) {
e6e0dc2d 6347 u64 avg_cost, avg_idle, span_avg;
94aafc3e 6348 unsigned long now = jiffies;
1ad3aaf3 6349
e6e0dc2d 6350 /*
94aafc3e
PZ
6351 * If we're busy, the assumption that the last idle period
6352 * predicts the future is flawed; age away the remaining
6353 * predicted idle time.
e6e0dc2d 6354 */
94aafc3e
PZ
6355 if (unlikely(this_rq->wake_stamp < now)) {
6356 while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) {
6357 this_rq->wake_stamp++;
6358 this_rq->wake_avg_idle >>= 1;
6359 }
6360 }
6361
6362 avg_idle = this_rq->wake_avg_idle;
e6e0dc2d 6363 avg_cost = this_sd->avg_scan_cost + 1;
10e2f1ac 6364
e6e0dc2d 6365 span_avg = sd->span_weight * avg_idle;
1ad3aaf3
PZ
6366 if (span_avg > 4*avg_cost)
6367 nr = div_u64(span_avg, avg_cost);
6368 else
6369 nr = 4;
10e2f1ac 6370
bae4ec13
MG
6371 time = cpu_clock(this);
6372 }
60588bfa 6373
56498cfb 6374 for_each_cpu_wrap(cpu, cpus, target + 1) {
c722f35b 6375 if (has_idle_core) {
9fe1f127
MG
6376 i = select_idle_core(p, cpu, cpus, &idle_cpu);
6377 if ((unsigned int)i < nr_cpumask_bits)
6378 return i;
6379
6380 } else {
6381 if (!--nr)
6382 return -1;
97886d9d 6383 idle_cpu = __select_idle_cpu(cpu, p);
9fe1f127
MG
6384 if ((unsigned int)idle_cpu < nr_cpumask_bits)
6385 break;
6386 }
10e2f1ac
PZ
6387 }
6388
c722f35b 6389 if (has_idle_core)
02dbb724 6390 set_idle_cores(target, false);
9fe1f127 6391
c722f35b 6392 if (sched_feat(SIS_PROP) && !has_idle_core) {
bae4ec13 6393 time = cpu_clock(this) - time;
94aafc3e
PZ
6394
6395 /*
6396 * Account for the scan cost of wakeups against the average
6397 * idle time.
6398 */
6399 this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time);
6400
bae4ec13
MG
6401 update_avg(&this_sd->avg_scan_cost, time);
6402 }
10e2f1ac 6403
9fe1f127 6404 return idle_cpu;
10e2f1ac
PZ
6405}
6406
b7a33161
MR
6407/*
6408 * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
6409 * the task fits. If no CPU is big enough, but there are idle ones, try to
6410 * maximize capacity.
6411 */
6412static int
6413select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
6414{
b4c9c9f1 6415 unsigned long task_util, best_cap = 0;
b7a33161
MR
6416 int cpu, best_cpu = -1;
6417 struct cpumask *cpus;
6418
b7a33161
MR
6419 cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
6420 cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
6421
b4c9c9f1
VG
6422 task_util = uclamp_task_util(p);
6423
b7a33161
MR
6424 for_each_cpu_wrap(cpu, cpus, target) {
6425 unsigned long cpu_cap = capacity_of(cpu);
6426
6427 if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
6428 continue;
b4c9c9f1 6429 if (fits_capacity(task_util, cpu_cap))
b7a33161
MR
6430 return cpu;
6431
6432 if (cpu_cap > best_cap) {
6433 best_cap = cpu_cap;
6434 best_cpu = cpu;
6435 }
6436 }
6437
6438 return best_cpu;
6439}
6440
ef8df979 6441static inline bool asym_fits_capacity(unsigned long task_util, int cpu)
b4c9c9f1
VG
6442{
6443 if (static_branch_unlikely(&sched_asym_cpucapacity))
6444 return fits_capacity(task_util, capacity_of(cpu));
6445
6446 return true;
6447}
6448
10e2f1ac
PZ
6449/*
6450 * Try and locate an idle core/thread in the LLC cache domain.
a50bde51 6451 */
772bd008 6452static int select_idle_sibling(struct task_struct *p, int prev, int target)
a50bde51 6453{
c722f35b 6454 bool has_idle_core = false;
99bd5e2f 6455 struct sched_domain *sd;
b4c9c9f1 6456 unsigned long task_util;
32e839dd 6457 int i, recent_used_cpu;
a50bde51 6458
b7a33161 6459 /*
b4c9c9f1
VG
6460 * On asymmetric system, update task utilization because we will check
6461 * that the task fits with cpu's capacity.
b7a33161
MR
6462 */
6463 if (static_branch_unlikely(&sched_asym_cpucapacity)) {
b4c9c9f1
VG
6464 sync_entity_load_avg(&p->se);
6465 task_util = uclamp_task_util(p);
b7a33161
MR
6466 }
6467
9099a147
PZ
6468 /*
6469 * per-cpu select_idle_mask usage
6470 */
6471 lockdep_assert_irqs_disabled();
6472
b4c9c9f1
VG
6473 if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
6474 asym_fits_capacity(task_util, target))
e0a79f52 6475 return target;
99bd5e2f
SS
6476
6477 /*
97fb7a0a 6478 * If the previous CPU is cache affine and idle, don't be stupid:
99bd5e2f 6479 */
3c29e651 6480 if (prev != target && cpus_share_cache(prev, target) &&
b4c9c9f1
VG
6481 (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
6482 asym_fits_capacity(task_util, prev))
772bd008 6483 return prev;
a50bde51 6484
52262ee5
MG
6485 /*
6486 * Allow a per-cpu kthread to stack with the wakee if the
6487 * kworker thread and the tasks previous CPUs are the same.
6488 * The assumption is that the wakee queued work for the
6489 * per-cpu kthread that is now complete and the wakeup is
6490 * essentially a sync wakeup. An obvious example of this
6491 * pattern is IO completions.
6492 */
6493 if (is_per_cpu_kthread(current) &&
8b4e74cc 6494 in_task() &&
52262ee5 6495 prev == smp_processor_id() &&
014ba44e
VD
6496 this_rq()->nr_running <= 1 &&
6497 asym_fits_capacity(task_util, prev)) {
52262ee5
MG
6498 return prev;
6499 }
6500
97fb7a0a 6501 /* Check a recently used CPU as a potential idle candidate: */
32e839dd 6502 recent_used_cpu = p->recent_used_cpu;
89aafd67 6503 p->recent_used_cpu = prev;
32e839dd
MG
6504 if (recent_used_cpu != prev &&
6505 recent_used_cpu != target &&
6506 cpus_share_cache(recent_used_cpu, target) &&
3c29e651 6507 (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
b4c9c9f1
VG
6508 cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
6509 asym_fits_capacity(task_util, recent_used_cpu)) {
32e839dd
MG
6510 return recent_used_cpu;
6511 }
6512
b4c9c9f1
VG
6513 /*
6514 * For asymmetric CPU capacity systems, our domain of interest is
6515 * sd_asym_cpucapacity rather than sd_llc.
6516 */
6517 if (static_branch_unlikely(&sched_asym_cpucapacity)) {
6518 sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
6519 /*
6520 * On an asymmetric CPU capacity system where an exclusive
6521 * cpuset defines a symmetric island (i.e. one unique
6522 * capacity_orig value through the cpuset), the key will be set
6523 * but the CPUs within that cpuset will not have a domain with
6524 * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
6525 * capacity path.
6526 */
6527 if (sd) {
6528 i = select_idle_capacity(p, sd, target);
6529 return ((unsigned)i < nr_cpumask_bits) ? i : target;
6530 }
6531 }
6532
518cd623 6533 sd = rcu_dereference(per_cpu(sd_llc, target));
10e2f1ac
PZ
6534 if (!sd)
6535 return target;
772bd008 6536
c722f35b
RR
6537 if (sched_smt_active()) {
6538 has_idle_core = test_idle_cores(target, false);
6539
6540 if (!has_idle_core && cpus_share_cache(prev, target)) {
6541 i = select_idle_smt(p, sd, prev);
6542 if ((unsigned int)i < nr_cpumask_bits)
6543 return i;
6544 }
6545 }
6546
6547 i = select_idle_cpu(p, sd, has_idle_core, target);
10e2f1ac
PZ
6548 if ((unsigned)i < nr_cpumask_bits)
6549 return i;
6550
a50bde51
PZ
6551 return target;
6552}
231678b7 6553
104cb16d 6554/*
4e3c7d33
DE
6555 * Predicts what cpu_util(@cpu) would return if @p was removed from @cpu
6556 * (@dst_cpu = -1) or migrated to @dst_cpu.
390031e4
QP
6557 */
6558static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
6559{
6560 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
4e3c7d33 6561 unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
390031e4
QP
6562
6563 /*
4e3c7d33
DE
6564 * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its
6565 * contribution. If @p migrates from another CPU to @cpu add its
6566 * contribution. In all the other cases @cpu is not impacted by the
6567 * migration so its util_avg is already correct.
390031e4
QP
6568 */
6569 if (task_cpu(p) == cpu && dst_cpu != cpu)
736cc6b3 6570 lsub_positive(&util, task_util(p));
390031e4
QP
6571 else if (task_cpu(p) != cpu && dst_cpu == cpu)
6572 util += task_util(p);
6573
6574 if (sched_feat(UTIL_EST)) {
4e3c7d33
DE
6575 unsigned long util_est;
6576
390031e4
QP
6577 util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
6578
6579 /*
4e3c7d33
DE
6580 * During wake-up @p isn't enqueued yet and doesn't contribute
6581 * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued.
6582 * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
6583 * has been enqueued.
6584 *
6585 * During exec (@dst_cpu = -1) @p is enqueued and does
6586 * contribute to cpu_rq(cpu)->cfs.util_est.enqueued.
6587 * Remove it to "simulate" cpu_util without @p's contribution.
6588 *
6589 * Despite the task_on_rq_queued(@p) check there is still a
6590 * small window for a possible race when an exec
6591 * select_task_rq_fair() races with LB's detach_task().
6592 *
6593 * detach_task()
6594 * deactivate_task()
6595 * p->on_rq = TASK_ON_RQ_MIGRATING;
6596 * -------------------------------- A
6597 * dequeue_task() \
6598 * dequeue_task_fair() + Race Time
6599 * util_est_dequeue() /
6600 * -------------------------------- B
6601 *
6602 * The additional check "current == p" is required to further
6603 * reduce the race window.
390031e4
QP
6604 */
6605 if (dst_cpu == cpu)
6606 util_est += _task_util_est(p);
4e3c7d33
DE
6607 else if (unlikely(task_on_rq_queued(p) || current == p))
6608 lsub_positive(&util_est, _task_util_est(p));
390031e4
QP
6609
6610 util = max(util, util_est);
6611 }
6612
6613 return min(util, capacity_orig_of(cpu));
6614}
6615
4e3c7d33
DE
6616/*
6617 * cpu_util_without: compute cpu utilization without any contributions from *p
6618 * @cpu: the CPU which utilization is requested
6619 * @p: the task which utilization should be discounted
6620 *
6621 * The utilization of a CPU is defined by the utilization of tasks currently
6622 * enqueued on that CPU as well as tasks which are currently sleeping after an
6623 * execution on that CPU.
6624 *
6625 * This method returns the utilization of the specified CPU by discounting the
6626 * utilization of the specified task, whenever the task is currently
6627 * contributing to the CPU utilization.
6628 */
6629static unsigned long cpu_util_without(int cpu, struct task_struct *p)
6630{
6631 /* Task has no contribution or is new */
6632 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
6633 return cpu_util_cfs(cpu);
6634
6635 return cpu_util_next(cpu, p, -1);
6636}
6637
390031e4 6638/*
eb92692b 6639 * compute_energy(): Estimates the energy that @pd would consume if @p was
390031e4 6640 * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
eb92692b 6641 * landscape of @pd's CPUs after the task migration, and uses the Energy Model
390031e4
QP
6642 * to compute what would be the energy if we decided to actually migrate that
6643 * task.
6644 */
6645static long
6646compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
6647{
eb92692b
QP
6648 struct cpumask *pd_mask = perf_domain_span(pd);
6649 unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
6650 unsigned long max_util = 0, sum_util = 0;
489f1645 6651 unsigned long _cpu_cap = cpu_cap;
390031e4
QP
6652 int cpu;
6653
489f1645
LL
6654 _cpu_cap -= arch_scale_thermal_pressure(cpumask_first(pd_mask));
6655
eb92692b
QP
6656 /*
6657 * The capacity state of CPUs of the current rd can be driven by CPUs
6658 * of another rd if they belong to the same pd. So, account for the
6659 * utilization of these CPUs too by masking pd with cpu_online_mask
6660 * instead of the rd span.
6661 *
6662 * If an entire pd is outside of the current rd, it will not appear in
6663 * its pd list and will not be accounted by compute_energy().
6664 */
6665 for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
0372e1cf
VD
6666 unsigned long util_freq = cpu_util_next(cpu, p, dst_cpu);
6667 unsigned long cpu_util, util_running = util_freq;
6668 struct task_struct *tsk = NULL;
6669
6670 /*
6671 * When @p is placed on @cpu:
6672 *
6673 * util_running = max(cpu_util, cpu_util_est) +
6674 * max(task_util, _task_util_est)
6675 *
6676 * while cpu_util_next is: max(cpu_util + task_util,
6677 * cpu_util_est + _task_util_est)
6678 */
6679 if (cpu == dst_cpu) {
6680 tsk = p;
6681 util_running =
6682 cpu_util_next(cpu, p, -1) + task_util_est(p);
6683 }
af24bde8
PB
6684
6685 /*
eb92692b
QP
6686 * Busy time computation: utilization clamping is not
6687 * required since the ratio (sum_util / cpu_capacity)
6688 * is already enough to scale the EM reported power
6689 * consumption at the (eventually clamped) cpu_capacity.
af24bde8 6690 */
489f1645
LL
6691 cpu_util = effective_cpu_util(cpu, util_running, cpu_cap,
6692 ENERGY_UTIL, NULL);
6693
6694 sum_util += min(cpu_util, _cpu_cap);
af24bde8 6695
390031e4 6696 /*
eb92692b
QP
6697 * Performance domain frequency: utilization clamping
6698 * must be considered since it affects the selection
6699 * of the performance domain frequency.
6700 * NOTE: in case RT tasks are running, by default the
6701 * FREQUENCY_UTIL's utilization can be max OPP.
390031e4 6702 */
0372e1cf 6703 cpu_util = effective_cpu_util(cpu, util_freq, cpu_cap,
eb92692b 6704 FREQUENCY_UTIL, tsk);
489f1645 6705 max_util = max(max_util, min(cpu_util, _cpu_cap));
390031e4
QP
6706 }
6707
8f1b971b 6708 return em_cpu_energy(pd->em_pd, max_util, sum_util, _cpu_cap);
390031e4
QP
6709}
6710
732cd75b
QP
6711/*
6712 * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
6713 * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
6714 * spare capacity in each performance domain and uses it as a potential
6715 * candidate to execute the task. Then, it uses the Energy Model to figure
6716 * out which of the CPU candidates is the most energy-efficient.
6717 *
6718 * The rationale for this heuristic is as follows. In a performance domain,
6719 * all the most energy efficient CPU candidates (according to the Energy
6720 * Model) are those for which we'll request a low frequency. When there are
6721 * several CPUs for which the frequency request will be the same, we don't
6722 * have enough data to break the tie between them, because the Energy Model
6723 * only includes active power costs. With this model, if we assume that
6724 * frequency requests follow utilization (e.g. using schedutil), the CPU with
6725 * the maximum spare capacity in a performance domain is guaranteed to be among
6726 * the best candidates of the performance domain.
6727 *
6728 * In practice, it could be preferable from an energy standpoint to pack
6729 * small tasks on a CPU in order to let other CPUs go in deeper idle states,
6730 * but that could also hurt our chances to go cluster idle, and we have no
6731 * ways to tell with the current Energy Model if this is actually a good
6732 * idea or not. So, find_energy_efficient_cpu() basically favors
6733 * cluster-packing, and spreading inside a cluster. That should at least be
6734 * a good thing for latency, and this is consistent with the idea that most
6735 * of the energy savings of EAS come from the asymmetry of the system, and
6736 * not so much from breaking the tie between identical CPUs. That's also the
6737 * reason why EAS is enabled in the topology code only for systems where
6738 * SD_ASYM_CPUCAPACITY is set.
6739 *
6740 * NOTE: Forkees are not accepted in the energy-aware wake-up path because
6741 * they don't have any useful utilization data yet and it's not possible to
6742 * forecast their impact on energy consumption. Consequently, they will be
6743 * placed by find_idlest_cpu() on the least loaded CPU, which might turn out
6744 * to be energy-inefficient in some use-cases. The alternative would be to
6745 * bias new tasks towards specific types of CPUs first, or to try to infer
6746 * their util_avg from the parent task, but those heuristics could hurt
6747 * other use-cases too. So, until someone finds a better way to solve this,
6748 * let's keep things simple by re-using the existing slow path.
6749 */
732cd75b
QP
6750static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
6751{
eb92692b 6752 unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
732cd75b 6753 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
619e090c 6754 int cpu, best_energy_cpu = prev_cpu, target = -1;
eb92692b 6755 unsigned long cpu_cap, util, base_energy = 0;
732cd75b 6756 struct sched_domain *sd;
eb92692b 6757 struct perf_domain *pd;
732cd75b
QP
6758
6759 rcu_read_lock();
6760 pd = rcu_dereference(rd->pd);
6761 if (!pd || READ_ONCE(rd->overutilized))
619e090c 6762 goto unlock;
732cd75b
QP
6763
6764 /*
6765 * Energy-aware wake-up happens on the lowest sched_domain starting
6766 * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
6767 */
6768 sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
6769 while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
6770 sd = sd->parent;
6771 if (!sd)
619e090c
PG
6772 goto unlock;
6773
6774 target = prev_cpu;
732cd75b
QP
6775
6776 sync_entity_load_avg(&p->se);
6777 if (!task_util_est(p))
6778 goto unlock;
6779
6780 for (; pd; pd = pd->next) {
eb92692b 6781 unsigned long cur_delta, spare_cap, max_spare_cap = 0;
8d4c97c1 6782 bool compute_prev_delta = false;
eb92692b 6783 unsigned long base_energy_pd;
732cd75b
QP
6784 int max_spare_cap_cpu = -1;
6785
6786 for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
3bd37062 6787 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
732cd75b
QP
6788 continue;
6789
732cd75b
QP
6790 util = cpu_util_next(cpu, p, cpu);
6791 cpu_cap = capacity_of(cpu);
da0777d3
LL
6792 spare_cap = cpu_cap;
6793 lsub_positive(&spare_cap, util);
1d42509e
VS
6794
6795 /*
6796 * Skip CPUs that cannot satisfy the capacity request.
6797 * IOW, placing the task there would make the CPU
6798 * overutilized. Take uclamp into account to see how
6799 * much capacity we can get out of the CPU; this is
a5418be9 6800 * aligned with sched_cpu_util().
1d42509e
VS
6801 */
6802 util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
60e17f5c 6803 if (!fits_capacity(util, cpu_cap))
732cd75b
QP
6804 continue;
6805
732cd75b 6806 if (cpu == prev_cpu) {
8d4c97c1
PG
6807 /* Always use prev_cpu as a candidate. */
6808 compute_prev_delta = true;
6809 } else if (spare_cap > max_spare_cap) {
6810 /*
6811 * Find the CPU with the maximum spare capacity
6812 * in the performance domain.
6813 */
732cd75b
QP
6814 max_spare_cap = spare_cap;
6815 max_spare_cap_cpu = cpu;
6816 }
6817 }
6818
8d4c97c1
PG
6819 if (max_spare_cap_cpu < 0 && !compute_prev_delta)
6820 continue;
6821
6822 /* Compute the 'base' energy of the pd, without @p */
6823 base_energy_pd = compute_energy(p, -1, pd);
6824 base_energy += base_energy_pd;
6825
6826 /* Evaluate the energy impact of using prev_cpu. */
6827 if (compute_prev_delta) {
6828 prev_delta = compute_energy(p, prev_cpu, pd);
619e090c
PG
6829 if (prev_delta < base_energy_pd)
6830 goto unlock;
8d4c97c1
PG
6831 prev_delta -= base_energy_pd;
6832 best_delta = min(best_delta, prev_delta);
6833 }
6834
6835 /* Evaluate the energy impact of using max_spare_cap_cpu. */
6836 if (max_spare_cap_cpu >= 0) {
eb92692b 6837 cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
619e090c
PG
6838 if (cur_delta < base_energy_pd)
6839 goto unlock;
eb92692b
QP
6840 cur_delta -= base_energy_pd;
6841 if (cur_delta < best_delta) {
6842 best_delta = cur_delta;
732cd75b
QP
6843 best_energy_cpu = max_spare_cap_cpu;
6844 }
6845 }
6846 }
732cd75b
QP
6847 rcu_read_unlock();
6848
6849 /*
6850 * Pick the best CPU if prev_cpu cannot be used, or if it saves at
6851 * least 6% of the energy used by prev_cpu.
6852 */
619e090c
PG
6853 if ((prev_delta == ULONG_MAX) ||
6854 (prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
6855 target = best_energy_cpu;
732cd75b 6856
619e090c 6857 return target;
732cd75b 6858
619e090c 6859unlock:
732cd75b
QP
6860 rcu_read_unlock();
6861
619e090c 6862 return target;
732cd75b
QP
6863}
6864
aaee1203 6865/*
de91b9cb 6866 * select_task_rq_fair: Select target runqueue for the waking task in domains
3aef1551 6867 * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
de91b9cb 6868 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
aaee1203 6869 *
97fb7a0a
IM
6870 * Balances load by selecting the idlest CPU in the idlest group, or under
6871 * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
aaee1203 6872 *
97fb7a0a 6873 * Returns the target CPU number.
aaee1203 6874 */
0017d735 6875static int
3aef1551 6876select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
aaee1203 6877{
3aef1551 6878 int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
f1d88b44 6879 struct sched_domain *tmp, *sd = NULL;
c88d5910 6880 int cpu = smp_processor_id();
63b0e9ed 6881 int new_cpu = prev_cpu;
99bd5e2f 6882 int want_affine = 0;
3aef1551
VS
6883 /* SD_flags and WF_flags share the first nibble */
6884 int sd_flag = wake_flags & 0xF;
c88d5910 6885
9099a147
PZ
6886 /*
6887 * required for stable ->cpus_allowed
6888 */
6889 lockdep_assert_held(&p->pi_lock);
dc824eb8 6890 if (wake_flags & WF_TTWU) {
c58d25f3 6891 record_wakee(p);
732cd75b 6892
f8a696f2 6893 if (sched_energy_enabled()) {
732cd75b
QP
6894 new_cpu = find_energy_efficient_cpu(p, prev_cpu);
6895 if (new_cpu >= 0)
6896 return new_cpu;
6897 new_cpu = prev_cpu;
6898 }
6899
00061968 6900 want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
c58d25f3 6901 }
aaee1203 6902
dce840a0 6903 rcu_read_lock();
aaee1203 6904 for_each_domain(cpu, tmp) {
fe3bcfe1 6905 /*
97fb7a0a 6906 * If both 'cpu' and 'prev_cpu' are part of this domain,
99bd5e2f 6907 * cpu is a valid SD_WAKE_AFFINE target.
fe3bcfe1 6908 */
99bd5e2f
SS
6909 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
6910 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
f1d88b44
VK
6911 if (cpu != prev_cpu)
6912 new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
6913
6914 sd = NULL; /* Prefer wake_affine over balance flags */
29cd8bae 6915 break;
f03542a7 6916 }
29cd8bae 6917
2917406c
BS
6918 /*
6919 * Usually only true for WF_EXEC and WF_FORK, as sched_domains
6920 * usually do not have SD_BALANCE_WAKE set. That means wakeup
6921 * will usually go to the fast path.
6922 */
f03542a7 6923 if (tmp->flags & sd_flag)
29cd8bae 6924 sd = tmp;
63b0e9ed
MG
6925 else if (!want_affine)
6926 break;
29cd8bae
PZ
6927 }
6928
f1d88b44
VK
6929 if (unlikely(sd)) {
6930 /* Slow path */
18bd1b4b 6931 new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
dc824eb8 6932 } else if (wake_flags & WF_TTWU) { /* XXX always ? */
f1d88b44 6933 /* Fast path */
f1d88b44 6934 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
e7693a36 6935 }
dce840a0 6936 rcu_read_unlock();
e7693a36 6937
c88d5910 6938 return new_cpu;
e7693a36 6939}
0a74bef8 6940
144d8487
PZ
6941static void detach_entity_cfs_rq(struct sched_entity *se);
6942
0a74bef8 6943/*
97fb7a0a 6944 * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
0a74bef8 6945 * cfs_rq_of(p) references at time of call are still valid and identify the
97fb7a0a 6946 * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
0a74bef8 6947 */
3f9672ba 6948static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
0a74bef8 6949{
59efa0ba
PZ
6950 /*
6951 * As blocked tasks retain absolute vruntime the migration needs to
6952 * deal with this by subtracting the old and adding the new
6953 * min_vruntime -- the latter is done by enqueue_entity() when placing
6954 * the task on the new runqueue.
6955 */
2f064a59 6956 if (READ_ONCE(p->__state) == TASK_WAKING) {
59efa0ba
PZ
6957 struct sched_entity *se = &p->se;
6958 struct cfs_rq *cfs_rq = cfs_rq_of(se);
6959 u64 min_vruntime;
6960
6961#ifndef CONFIG_64BIT
6962 u64 min_vruntime_copy;
6963
6964 do {
6965 min_vruntime_copy = cfs_rq->min_vruntime_copy;
6966 smp_rmb();
6967 min_vruntime = cfs_rq->min_vruntime;
6968 } while (min_vruntime != min_vruntime_copy);
6969#else
6970 min_vruntime = cfs_rq->min_vruntime;
6971#endif
6972
6973 se->vruntime -= min_vruntime;
6974 }
6975
144d8487
PZ
6976 if (p->on_rq == TASK_ON_RQ_MIGRATING) {
6977 /*
6978 * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
6979 * rq->lock and can modify state directly.
6980 */
5cb9eaa3 6981 lockdep_assert_rq_held(task_rq(p));
144d8487
PZ
6982 detach_entity_cfs_rq(&p->se);
6983
6984 } else {
6985 /*
6986 * We are supposed to update the task to "current" time, then
6987 * its up to date and ready to go to new CPU/cfs_rq. But we
6988 * have difficulty in getting what current time is, so simply
6989 * throw away the out-of-date time. This will result in the
6990 * wakee task is less decayed, but giving the wakee more load
6991 * sounds not bad.
6992 */
6993 remove_entity_load_avg(&p->se);
6994 }
9d89c257
YD
6995
6996 /* Tell new CPU we are migrated */
6997 p->se.avg.last_update_time = 0;
3944a927
BS
6998
6999 /* We have migrated, no longer consider this task hot */
9d89c257 7000 p->se.exec_start = 0;
3f9672ba
SD
7001
7002 update_scan_period(p, new_cpu);
0a74bef8 7003}
12695578
YD
7004
7005static void task_dead_fair(struct task_struct *p)
7006{
7007 remove_entity_load_avg(&p->se);
7008}
6e2df058
PZ
7009
7010static int
7011balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
7012{
7013 if (rq->nr_running)
7014 return 1;
7015
7016 return newidle_balance(rq, rf) != 0;
7017}
e7693a36
GH
7018#endif /* CONFIG_SMP */
7019
a555e9d8 7020static unsigned long wakeup_gran(struct sched_entity *se)
0bbd3336
PZ
7021{
7022 unsigned long gran = sysctl_sched_wakeup_granularity;
7023
7024 /*
e52fb7c0
PZ
7025 * Since its curr running now, convert the gran from real-time
7026 * to virtual-time in his units.
13814d42
MG
7027 *
7028 * By using 'se' instead of 'curr' we penalize light tasks, so
7029 * they get preempted easier. That is, if 'se' < 'curr' then
7030 * the resulting gran will be larger, therefore penalizing the
7031 * lighter, if otoh 'se' > 'curr' then the resulting gran will
7032 * be smaller, again penalizing the lighter task.
7033 *
7034 * This is especially important for buddies when the leftmost
7035 * task is higher priority than the buddy.
0bbd3336 7036 */
f4ad9bd2 7037 return calc_delta_fair(gran, se);
0bbd3336
PZ
7038}
7039
464b7527
PZ
7040/*
7041 * Should 'se' preempt 'curr'.
7042 *
7043 * |s1
7044 * |s2
7045 * |s3
7046 * g
7047 * |<--->|c
7048 *
7049 * w(c, s1) = -1
7050 * w(c, s2) = 0
7051 * w(c, s3) = 1
7052 *
7053 */
7054static int
7055wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
7056{
7057 s64 gran, vdiff = curr->vruntime - se->vruntime;
7058
7059 if (vdiff <= 0)
7060 return -1;
7061
a555e9d8 7062 gran = wakeup_gran(se);
464b7527
PZ
7063 if (vdiff > gran)
7064 return 1;
7065
7066 return 0;
7067}
7068
02479099
PZ
7069static void set_last_buddy(struct sched_entity *se)
7070{
c5ae366e
DA
7071 for_each_sched_entity(se) {
7072 if (SCHED_WARN_ON(!se->on_rq))
7073 return;
30400039
JD
7074 if (se_is_idle(se))
7075 return;
69c80f3e 7076 cfs_rq_of(se)->last = se;
c5ae366e 7077 }
02479099
PZ
7078}
7079
7080static void set_next_buddy(struct sched_entity *se)
7081{
c5ae366e
DA
7082 for_each_sched_entity(se) {
7083 if (SCHED_WARN_ON(!se->on_rq))
7084 return;
30400039
JD
7085 if (se_is_idle(se))
7086 return;
69c80f3e 7087 cfs_rq_of(se)->next = se;
c5ae366e 7088 }
02479099
PZ
7089}
7090
ac53db59
RR
7091static void set_skip_buddy(struct sched_entity *se)
7092{
69c80f3e
VP
7093 for_each_sched_entity(se)
7094 cfs_rq_of(se)->skip = se;
ac53db59
RR
7095}
7096
bf0f6f24
IM
7097/*
7098 * Preempt the current task with a newly woken task if needed:
7099 */
5a9b86f6 7100static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
bf0f6f24
IM
7101{
7102 struct task_struct *curr = rq->curr;
8651a86c 7103 struct sched_entity *se = &curr->se, *pse = &p->se;
03e89e45 7104 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
f685ceac 7105 int scale = cfs_rq->nr_running >= sched_nr_latency;
2f36825b 7106 int next_buddy_marked = 0;
30400039 7107 int cse_is_idle, pse_is_idle;
bf0f6f24 7108
4ae7d5ce
IM
7109 if (unlikely(se == pse))
7110 return;
7111
5238cdd3 7112 /*
163122b7 7113 * This is possible from callers such as attach_tasks(), in which we
3b03706f 7114 * unconditionally check_preempt_curr() after an enqueue (which may have
5238cdd3
PT
7115 * lead to a throttle). This both saves work and prevents false
7116 * next-buddy nomination below.
7117 */
7118 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
7119 return;
7120
2f36825b 7121 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
3cb63d52 7122 set_next_buddy(pse);
2f36825b
VP
7123 next_buddy_marked = 1;
7124 }
57fdc26d 7125
aec0a514
BR
7126 /*
7127 * We can come here with TIF_NEED_RESCHED already set from new task
7128 * wake up path.
5238cdd3
PT
7129 *
7130 * Note: this also catches the edge-case of curr being in a throttled
7131 * group (e.g. via set_curr_task), since update_curr() (in the
7132 * enqueue of curr) will have resulted in resched being set. This
7133 * prevents us from potentially nominating it as a false LAST_BUDDY
7134 * below.
aec0a514
BR
7135 */
7136 if (test_tsk_need_resched(curr))
7137 return;
7138
a2f5c9ab 7139 /* Idle tasks are by definition preempted by non-idle tasks. */
1da1843f
VK
7140 if (unlikely(task_has_idle_policy(curr)) &&
7141 likely(!task_has_idle_policy(p)))
a2f5c9ab
DH
7142 goto preempt;
7143
91c234b4 7144 /*
a2f5c9ab
DH
7145 * Batch and idle tasks do not preempt non-idle tasks (their preemption
7146 * is driven by the tick):
91c234b4 7147 */
8ed92e51 7148 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
91c234b4 7149 return;
bf0f6f24 7150
464b7527 7151 find_matching_se(&se, &pse);
002f128b 7152 BUG_ON(!pse);
30400039
JD
7153
7154 cse_is_idle = se_is_idle(se);
7155 pse_is_idle = se_is_idle(pse);
7156
7157 /*
7158 * Preempt an idle group in favor of a non-idle group (and don't preempt
7159 * in the inverse case).
7160 */
7161 if (cse_is_idle && !pse_is_idle)
7162 goto preempt;
7163 if (cse_is_idle != pse_is_idle)
7164 return;
7165
7166 update_curr(cfs_rq_of(se));
2f36825b
VP
7167 if (wakeup_preempt_entity(se, pse) == 1) {
7168 /*
7169 * Bias pick_next to pick the sched entity that is
7170 * triggering this preemption.
7171 */
7172 if (!next_buddy_marked)
7173 set_next_buddy(pse);
3a7e73a2 7174 goto preempt;
2f36825b 7175 }
464b7527 7176
3a7e73a2 7177 return;
a65ac745 7178
3a7e73a2 7179preempt:
8875125e 7180 resched_curr(rq);
3a7e73a2
PZ
7181 /*
7182 * Only set the backward buddy when the current task is still
7183 * on the rq. This can happen when a wakeup gets interleaved
7184 * with schedule on the ->pre_schedule() or idle_balance()
7185 * point, either of which can * drop the rq lock.
7186 *
7187 * Also, during early boot the idle thread is in the fair class,
7188 * for obvious reasons its a bad idea to schedule back to it.
7189 */
7190 if (unlikely(!se->on_rq || curr == rq->idle))
7191 return;
7192
7193 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
7194 set_last_buddy(se);
bf0f6f24
IM
7195}
7196
21f56ffe
PZ
7197#ifdef CONFIG_SMP
7198static struct task_struct *pick_task_fair(struct rq *rq)
7199{
7200 struct sched_entity *se;
7201 struct cfs_rq *cfs_rq;
7202
7203again:
7204 cfs_rq = &rq->cfs;
7205 if (!cfs_rq->nr_running)
7206 return NULL;
7207
7208 do {
7209 struct sched_entity *curr = cfs_rq->curr;
7210
7211 /* When we pick for a remote RQ, we'll not have done put_prev_entity() */
7212 if (curr) {
7213 if (curr->on_rq)
7214 update_curr(cfs_rq);
7215 else
7216 curr = NULL;
7217
7218 if (unlikely(check_cfs_rq_runtime(cfs_rq)))
7219 goto again;
7220 }
7221
7222 se = pick_next_entity(cfs_rq, curr);
7223 cfs_rq = group_cfs_rq(se);
7224 } while (cfs_rq);
7225
7226 return task_of(se);
7227}
7228#endif
7229
5d7d6056 7230struct task_struct *
d8ac8971 7231pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
bf0f6f24
IM
7232{
7233 struct cfs_rq *cfs_rq = &rq->cfs;
7234 struct sched_entity *se;
678d5718 7235 struct task_struct *p;
37e117c0 7236 int new_tasks;
678d5718 7237
6e83125c 7238again:
6e2df058 7239 if (!sched_fair_runnable(rq))
38033c37 7240 goto idle;
678d5718 7241
9674f5ca 7242#ifdef CONFIG_FAIR_GROUP_SCHED
67692435 7243 if (!prev || prev->sched_class != &fair_sched_class)
678d5718
PZ
7244 goto simple;
7245
7246 /*
7247 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
7248 * likely that a next task is from the same cgroup as the current.
7249 *
7250 * Therefore attempt to avoid putting and setting the entire cgroup
7251 * hierarchy, only change the part that actually changes.
7252 */
7253
7254 do {
7255 struct sched_entity *curr = cfs_rq->curr;
7256
7257 /*
7258 * Since we got here without doing put_prev_entity() we also
7259 * have to consider cfs_rq->curr. If it is still a runnable
7260 * entity, update_curr() will update its vruntime, otherwise
7261 * forget we've ever seen it.
7262 */
54d27365
BS
7263 if (curr) {
7264 if (curr->on_rq)
7265 update_curr(cfs_rq);
7266 else
7267 curr = NULL;
678d5718 7268
54d27365
BS
7269 /*
7270 * This call to check_cfs_rq_runtime() will do the
7271 * throttle and dequeue its entity in the parent(s).
9674f5ca 7272 * Therefore the nr_running test will indeed
54d27365
BS
7273 * be correct.
7274 */
9674f5ca
VK
7275 if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
7276 cfs_rq = &rq->cfs;
7277
7278 if (!cfs_rq->nr_running)
7279 goto idle;
7280
54d27365 7281 goto simple;
9674f5ca 7282 }
54d27365 7283 }
678d5718
PZ
7284
7285 se = pick_next_entity(cfs_rq, curr);
7286 cfs_rq = group_cfs_rq(se);
7287 } while (cfs_rq);
7288
7289 p = task_of(se);
7290
7291 /*
7292 * Since we haven't yet done put_prev_entity and if the selected task
7293 * is a different task than we started out with, try and touch the
7294 * least amount of cfs_rqs.
7295 */
7296 if (prev != p) {
7297 struct sched_entity *pse = &prev->se;
7298
7299 while (!(cfs_rq = is_same_group(se, pse))) {
7300 int se_depth = se->depth;
7301 int pse_depth = pse->depth;
7302
7303 if (se_depth <= pse_depth) {
7304 put_prev_entity(cfs_rq_of(pse), pse);
7305 pse = parent_entity(pse);
7306 }
7307 if (se_depth >= pse_depth) {
7308 set_next_entity(cfs_rq_of(se), se);
7309 se = parent_entity(se);
7310 }
7311 }
7312
7313 put_prev_entity(cfs_rq, pse);
7314 set_next_entity(cfs_rq, se);
7315 }
7316
93824900 7317 goto done;
678d5718 7318simple:
678d5718 7319#endif
67692435
PZ
7320 if (prev)
7321 put_prev_task(rq, prev);
606dba2e 7322
bf0f6f24 7323 do {
678d5718 7324 se = pick_next_entity(cfs_rq, NULL);
f4b6755f 7325 set_next_entity(cfs_rq, se);
bf0f6f24
IM
7326 cfs_rq = group_cfs_rq(se);
7327 } while (cfs_rq);
7328
8f4d37ec 7329 p = task_of(se);
678d5718 7330
13a453c2 7331done: __maybe_unused;
93824900
UR
7332#ifdef CONFIG_SMP
7333 /*
7334 * Move the next running task to the front of
7335 * the list, so our cfs_tasks list becomes MRU
7336 * one.
7337 */
7338 list_move(&p->se.group_node, &rq->cfs_tasks);
7339#endif
7340
e0ee463c 7341 if (hrtick_enabled_fair(rq))
b39e66ea 7342 hrtick_start_fair(rq, p);
8f4d37ec 7343
3b1baa64
MR
7344 update_misfit_status(p, rq);
7345
8f4d37ec 7346 return p;
38033c37
PZ
7347
7348idle:
67692435
PZ
7349 if (!rf)
7350 return NULL;
7351
5ba553ef 7352 new_tasks = newidle_balance(rq, rf);
46f69fa3 7353
37e117c0 7354 /*
5ba553ef 7355 * Because newidle_balance() releases (and re-acquires) rq->lock, it is
37e117c0
PZ
7356 * possible for any higher priority task to appear. In that case we
7357 * must re-start the pick_next_entity() loop.
7358 */
e4aa358b 7359 if (new_tasks < 0)
37e117c0
PZ
7360 return RETRY_TASK;
7361
e4aa358b 7362 if (new_tasks > 0)
38033c37 7363 goto again;
38033c37 7364
23127296
VG
7365 /*
7366 * rq is about to be idle, check if we need to update the
7367 * lost_idle_time of clock_pelt
7368 */
7369 update_idle_rq_clock_pelt(rq);
7370
38033c37 7371 return NULL;
bf0f6f24
IM
7372}
7373
98c2f700
PZ
7374static struct task_struct *__pick_next_task_fair(struct rq *rq)
7375{
7376 return pick_next_task_fair(rq, NULL, NULL);
7377}
7378
bf0f6f24
IM
7379/*
7380 * Account for a descheduled task:
7381 */
6e2df058 7382static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
bf0f6f24
IM
7383{
7384 struct sched_entity *se = &prev->se;
7385 struct cfs_rq *cfs_rq;
7386
7387 for_each_sched_entity(se) {
7388 cfs_rq = cfs_rq_of(se);
ab6cde26 7389 put_prev_entity(cfs_rq, se);
bf0f6f24
IM
7390 }
7391}
7392
ac53db59
RR
7393/*
7394 * sched_yield() is very simple
7395 *
7396 * The magic of dealing with the ->skip buddy is in pick_next_entity.
7397 */
7398static void yield_task_fair(struct rq *rq)
7399{
7400 struct task_struct *curr = rq->curr;
7401 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
7402 struct sched_entity *se = &curr->se;
7403
7404 /*
7405 * Are we the only task in the tree?
7406 */
7407 if (unlikely(rq->nr_running == 1))
7408 return;
7409
7410 clear_buddies(cfs_rq, se);
7411
7412 if (curr->policy != SCHED_BATCH) {
7413 update_rq_clock(rq);
7414 /*
7415 * Update run-time statistics of the 'current'.
7416 */
7417 update_curr(cfs_rq);
916671c0
MG
7418 /*
7419 * Tell update_rq_clock() that we've just updated,
7420 * so we don't do microscopic update in schedule()
7421 * and double the fastpath cost.
7422 */
adcc8da8 7423 rq_clock_skip_update(rq);
ac53db59
RR
7424 }
7425
7426 set_skip_buddy(se);
7427}
7428
0900acf2 7429static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
d95f4122
MG
7430{
7431 struct sched_entity *se = &p->se;
7432
5238cdd3
PT
7433 /* throttled hierarchies are not runnable */
7434 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
d95f4122
MG
7435 return false;
7436
7437 /* Tell the scheduler that we'd really like pse to run next. */
7438 set_next_buddy(se);
7439
d95f4122
MG
7440 yield_task_fair(rq);
7441
7442 return true;
7443}
7444
681f3e68 7445#ifdef CONFIG_SMP
bf0f6f24 7446/**************************************************
e9c84cb8
PZ
7447 * Fair scheduling class load-balancing methods.
7448 *
7449 * BASICS
7450 *
7451 * The purpose of load-balancing is to achieve the same basic fairness the
97fb7a0a 7452 * per-CPU scheduler provides, namely provide a proportional amount of compute
e9c84cb8
PZ
7453 * time to each task. This is expressed in the following equation:
7454 *
7455 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
7456 *
97fb7a0a 7457 * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
e9c84cb8
PZ
7458 * W_i,0 is defined as:
7459 *
7460 * W_i,0 = \Sum_j w_i,j (2)
7461 *
97fb7a0a 7462 * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
1c3de5e1 7463 * is derived from the nice value as per sched_prio_to_weight[].
e9c84cb8
PZ
7464 *
7465 * The weight average is an exponential decay average of the instantaneous
7466 * weight:
7467 *
7468 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
7469 *
97fb7a0a 7470 * C_i is the compute capacity of CPU i, typically it is the
e9c84cb8
PZ
7471 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
7472 * can also include other factors [XXX].
7473 *
7474 * To achieve this balance we define a measure of imbalance which follows
7475 * directly from (1):
7476 *
ced549fa 7477 * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
e9c84cb8
PZ
7478 *
7479 * We them move tasks around to minimize the imbalance. In the continuous
7480 * function space it is obvious this converges, in the discrete case we get
7481 * a few fun cases generally called infeasible weight scenarios.
7482 *
7483 * [XXX expand on:
7484 * - infeasible weights;
7485 * - local vs global optima in the discrete case. ]
7486 *
7487 *
7488 * SCHED DOMAINS
7489 *
7490 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
97fb7a0a 7491 * for all i,j solution, we create a tree of CPUs that follows the hardware
e9c84cb8 7492 * topology where each level pairs two lower groups (or better). This results
97fb7a0a 7493 * in O(log n) layers. Furthermore we reduce the number of CPUs going up the
e9c84cb8 7494 * tree to only the first of the previous level and we decrease the frequency
97fb7a0a 7495 * of load-balance at each level inv. proportional to the number of CPUs in
e9c84cb8
PZ
7496 * the groups.
7497 *
7498 * This yields:
7499 *
7500 * log_2 n 1 n
7501 * \Sum { --- * --- * 2^i } = O(n) (5)
7502 * i = 0 2^i 2^i
7503 * `- size of each group
97fb7a0a 7504 * | | `- number of CPUs doing load-balance
e9c84cb8
PZ
7505 * | `- freq
7506 * `- sum over all levels
7507 *
7508 * Coupled with a limit on how many tasks we can migrate every balance pass,
7509 * this makes (5) the runtime complexity of the balancer.
7510 *
7511 * An important property here is that each CPU is still (indirectly) connected
97fb7a0a 7512 * to every other CPU in at most O(log n) steps:
e9c84cb8
PZ
7513 *
7514 * The adjacency matrix of the resulting graph is given by:
7515 *
97a7142f 7516 * log_2 n
e9c84cb8
PZ
7517 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
7518 * k = 0
7519 *
7520 * And you'll find that:
7521 *
7522 * A^(log_2 n)_i,j != 0 for all i,j (7)
7523 *
97fb7a0a 7524 * Showing there's indeed a path between every CPU in at most O(log n) steps.
e9c84cb8
PZ
7525 * The task movement gives a factor of O(m), giving a convergence complexity
7526 * of:
7527 *
7528 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
7529 *
7530 *
7531 * WORK CONSERVING
7532 *
7533 * In order to avoid CPUs going idle while there's still work to do, new idle
97fb7a0a 7534 * balancing is more aggressive and has the newly idle CPU iterate up the domain
e9c84cb8
PZ
7535 * tree itself instead of relying on other CPUs to bring it work.
7536 *
7537 * This adds some complexity to both (5) and (8) but it reduces the total idle
7538 * time.
7539 *
7540 * [XXX more?]
7541 *
7542 *
7543 * CGROUPS
7544 *
7545 * Cgroups make a horror show out of (2), instead of a simple sum we get:
7546 *
7547 * s_k,i
7548 * W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
7549 * S_k
7550 *
7551 * Where
7552 *
7553 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
7554 *
97fb7a0a 7555 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
e9c84cb8
PZ
7556 *
7557 * The big problem is S_k, its a global sum needed to compute a local (W_i)
7558 * property.
7559 *
7560 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
7561 * rewrite all of this once again.]
97a7142f 7562 */
bf0f6f24 7563
ed387b78
HS
7564static unsigned long __read_mostly max_load_balance_interval = HZ/10;
7565
0ec8aa00
PZ
7566enum fbq_type { regular, remote, all };
7567
0b0695f2 7568/*
a9723389
VG
7569 * 'group_type' describes the group of CPUs at the moment of load balancing.
7570 *
0b0695f2 7571 * The enum is ordered by pulling priority, with the group with lowest priority
a9723389
VG
7572 * first so the group_type can simply be compared when selecting the busiest
7573 * group. See update_sd_pick_busiest().
0b0695f2 7574 */
3b1baa64 7575enum group_type {
a9723389 7576 /* The group has spare capacity that can be used to run more tasks. */
0b0695f2 7577 group_has_spare = 0,
a9723389
VG
7578 /*
7579 * The group is fully used and the tasks don't compete for more CPU
7580 * cycles. Nevertheless, some tasks might wait before running.
7581 */
0b0695f2 7582 group_fully_busy,
a9723389
VG
7583 /*
7584 * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
7585 * and must be migrated to a more powerful CPU.
7586 */
3b1baa64 7587 group_misfit_task,
a9723389
VG
7588 /*
7589 * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
7590 * and the task should be migrated to it instead of running on the
7591 * current CPU.
7592 */
0b0695f2 7593 group_asym_packing,
a9723389
VG
7594 /*
7595 * The tasks' affinity constraints previously prevented the scheduler
7596 * from balancing the load across the system.
7597 */
3b1baa64 7598 group_imbalanced,
a9723389
VG
7599 /*
7600 * The CPU is overloaded and can't provide expected CPU cycles to all
7601 * tasks.
7602 */
0b0695f2
VG
7603 group_overloaded
7604};
7605
7606enum migration_type {
7607 migrate_load = 0,
7608 migrate_util,
7609 migrate_task,
7610 migrate_misfit
3b1baa64
MR
7611};
7612
ddcdf6e7 7613#define LBF_ALL_PINNED 0x01
367456c7 7614#define LBF_NEED_BREAK 0x02
6263322c
PZ
7615#define LBF_DST_PINNED 0x04
7616#define LBF_SOME_PINNED 0x08
23fb06d9 7617#define LBF_ACTIVE_LB 0x10
ddcdf6e7
PZ
7618
7619struct lb_env {
7620 struct sched_domain *sd;
7621
ddcdf6e7 7622 struct rq *src_rq;
85c1e7da 7623 int src_cpu;
ddcdf6e7
PZ
7624
7625 int dst_cpu;
7626 struct rq *dst_rq;
7627
88b8dac0
SV
7628 struct cpumask *dst_grpmask;
7629 int new_dst_cpu;
ddcdf6e7 7630 enum cpu_idle_type idle;
bd939f45 7631 long imbalance;
b9403130
MW
7632 /* The set of CPUs under consideration for load-balancing */
7633 struct cpumask *cpus;
7634
ddcdf6e7 7635 unsigned int flags;
367456c7
PZ
7636
7637 unsigned int loop;
7638 unsigned int loop_break;
7639 unsigned int loop_max;
0ec8aa00
PZ
7640
7641 enum fbq_type fbq_type;
0b0695f2 7642 enum migration_type migration_type;
163122b7 7643 struct list_head tasks;
ddcdf6e7
PZ
7644};
7645
029632fb
PZ
7646/*
7647 * Is this task likely cache-hot:
7648 */
5d5e2b1b 7649static int task_hot(struct task_struct *p, struct lb_env *env)
029632fb
PZ
7650{
7651 s64 delta;
7652
5cb9eaa3 7653 lockdep_assert_rq_held(env->src_rq);
e5673f28 7654
029632fb
PZ
7655 if (p->sched_class != &fair_sched_class)
7656 return 0;
7657
1da1843f 7658 if (unlikely(task_has_idle_policy(p)))
029632fb
PZ
7659 return 0;
7660
ec73240b
JD
7661 /* SMT siblings share cache */
7662 if (env->sd->flags & SD_SHARE_CPUCAPACITY)
7663 return 0;
7664
029632fb
PZ
7665 /*
7666 * Buddy candidates are cache hot:
7667 */
5d5e2b1b 7668 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
029632fb
PZ
7669 (&p->se == cfs_rq_of(&p->se)->next ||
7670 &p->se == cfs_rq_of(&p->se)->last))
7671 return 1;
7672
7673 if (sysctl_sched_migration_cost == -1)
7674 return 1;
97886d9d
AL
7675
7676 /*
7677 * Don't migrate task if the task's cookie does not match
7678 * with the destination CPU's core cookie.
7679 */
7680 if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p))
7681 return 1;
7682
029632fb
PZ
7683 if (sysctl_sched_migration_cost == 0)
7684 return 0;
7685
5d5e2b1b 7686 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
029632fb
PZ
7687
7688 return delta < (s64)sysctl_sched_migration_cost;
7689}
7690
3a7053b3 7691#ifdef CONFIG_NUMA_BALANCING
c1ceac62 7692/*
2a1ed24c
SD
7693 * Returns 1, if task migration degrades locality
7694 * Returns 0, if task migration improves locality i.e migration preferred.
7695 * Returns -1, if task migration is not affected by locality.
c1ceac62 7696 */
2a1ed24c 7697static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
3a7053b3 7698{
b1ad065e 7699 struct numa_group *numa_group = rcu_dereference(p->numa_group);
f35678b6
SD
7700 unsigned long src_weight, dst_weight;
7701 int src_nid, dst_nid, dist;
3a7053b3 7702
2a595721 7703 if (!static_branch_likely(&sched_numa_balancing))
2a1ed24c
SD
7704 return -1;
7705
c3b9bc5b 7706 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
2a1ed24c 7707 return -1;
7a0f3083
MG
7708
7709 src_nid = cpu_to_node(env->src_cpu);
7710 dst_nid = cpu_to_node(env->dst_cpu);
7711
83e1d2cd 7712 if (src_nid == dst_nid)
2a1ed24c 7713 return -1;
7a0f3083 7714
2a1ed24c
SD
7715 /* Migrating away from the preferred node is always bad. */
7716 if (src_nid == p->numa_preferred_nid) {
7717 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
7718 return 1;
7719 else
7720 return -1;
7721 }
b1ad065e 7722
c1ceac62
RR
7723 /* Encourage migration to the preferred node. */
7724 if (dst_nid == p->numa_preferred_nid)
2a1ed24c 7725 return 0;
b1ad065e 7726
739294fb 7727 /* Leaving a core idle is often worse than degrading locality. */
f35678b6 7728 if (env->idle == CPU_IDLE)
739294fb
RR
7729 return -1;
7730
f35678b6 7731 dist = node_distance(src_nid, dst_nid);
c1ceac62 7732 if (numa_group) {
f35678b6
SD
7733 src_weight = group_weight(p, src_nid, dist);
7734 dst_weight = group_weight(p, dst_nid, dist);
c1ceac62 7735 } else {
f35678b6
SD
7736 src_weight = task_weight(p, src_nid, dist);
7737 dst_weight = task_weight(p, dst_nid, dist);
b1ad065e
RR
7738 }
7739
f35678b6 7740 return dst_weight < src_weight;
7a0f3083
MG
7741}
7742
3a7053b3 7743#else
2a1ed24c 7744static inline int migrate_degrades_locality(struct task_struct *p,
3a7053b3
MG
7745 struct lb_env *env)
7746{
2a1ed24c 7747 return -1;
7a0f3083 7748}
3a7053b3
MG
7749#endif
7750
1e3c88bd
PZ
7751/*
7752 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
7753 */
7754static
8e45cb54 7755int can_migrate_task(struct task_struct *p, struct lb_env *env)
1e3c88bd 7756{
2a1ed24c 7757 int tsk_cache_hot;
e5673f28 7758
5cb9eaa3 7759 lockdep_assert_rq_held(env->src_rq);
e5673f28 7760
1e3c88bd
PZ
7761 /*
7762 * We do not migrate tasks that are:
d3198084 7763 * 1) throttled_lb_pair, or
3bd37062 7764 * 2) cannot be migrated to this CPU due to cpus_ptr, or
d3198084
JK
7765 * 3) running (obviously), or
7766 * 4) are cache-hot on their current CPU.
1e3c88bd 7767 */
d3198084
JK
7768 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
7769 return 0;
7770
9bcb959d 7771 /* Disregard pcpu kthreads; they are where they need to be. */
3a7956e2 7772 if (kthread_is_per_cpu(p))
9bcb959d
LC
7773 return 0;
7774
3bd37062 7775 if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
e02e60c1 7776 int cpu;
88b8dac0 7777
ceeadb83 7778 schedstat_inc(p->stats.nr_failed_migrations_affine);
88b8dac0 7779
6263322c
PZ
7780 env->flags |= LBF_SOME_PINNED;
7781
88b8dac0 7782 /*
97fb7a0a 7783 * Remember if this task can be migrated to any other CPU in
88b8dac0
SV
7784 * our sched_group. We may want to revisit it if we couldn't
7785 * meet load balance goals by pulling other tasks on src_cpu.
7786 *
23fb06d9
VS
7787 * Avoid computing new_dst_cpu
7788 * - for NEWLY_IDLE
7789 * - if we have already computed one in current iteration
7790 * - if it's an active balance
88b8dac0 7791 */
23fb06d9
VS
7792 if (env->idle == CPU_NEWLY_IDLE ||
7793 env->flags & (LBF_DST_PINNED | LBF_ACTIVE_LB))
88b8dac0
SV
7794 return 0;
7795
97fb7a0a 7796 /* Prevent to re-select dst_cpu via env's CPUs: */
e02e60c1 7797 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
3bd37062 7798 if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
6263322c 7799 env->flags |= LBF_DST_PINNED;
e02e60c1
JK
7800 env->new_dst_cpu = cpu;
7801 break;
7802 }
88b8dac0 7803 }
e02e60c1 7804
1e3c88bd
PZ
7805 return 0;
7806 }
88b8dac0 7807
3b03706f 7808 /* Record that we found at least one task that could run on dst_cpu */
8e45cb54 7809 env->flags &= ~LBF_ALL_PINNED;
1e3c88bd 7810
ddcdf6e7 7811 if (task_running(env->src_rq, p)) {
ceeadb83 7812 schedstat_inc(p->stats.nr_failed_migrations_running);
1e3c88bd
PZ
7813 return 0;
7814 }
7815
7816 /*
7817 * Aggressive migration if:
23fb06d9
VS
7818 * 1) active balance
7819 * 2) destination numa is preferred
7820 * 3) task is cache cold, or
7821 * 4) too many balance attempts have failed.
1e3c88bd 7822 */
23fb06d9
VS
7823 if (env->flags & LBF_ACTIVE_LB)
7824 return 1;
7825
2a1ed24c
SD
7826 tsk_cache_hot = migrate_degrades_locality(p, env);
7827 if (tsk_cache_hot == -1)
7828 tsk_cache_hot = task_hot(p, env);
3a7053b3 7829
2a1ed24c 7830 if (tsk_cache_hot <= 0 ||
7a96c231 7831 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
2a1ed24c 7832 if (tsk_cache_hot == 1) {
ae92882e 7833 schedstat_inc(env->sd->lb_hot_gained[env->idle]);
ceeadb83 7834 schedstat_inc(p->stats.nr_forced_migrations);
3a7053b3 7835 }
1e3c88bd
PZ
7836 return 1;
7837 }
7838
ceeadb83 7839 schedstat_inc(p->stats.nr_failed_migrations_hot);
4e2dcb73 7840 return 0;
1e3c88bd
PZ
7841}
7842
897c395f 7843/*
163122b7
KT
7844 * detach_task() -- detach the task for the migration specified in env
7845 */
7846static void detach_task(struct task_struct *p, struct lb_env *env)
7847{
5cb9eaa3 7848 lockdep_assert_rq_held(env->src_rq);
163122b7 7849
5704ac0a 7850 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
163122b7
KT
7851 set_task_cpu(p, env->dst_cpu);
7852}
7853
897c395f 7854/*
e5673f28 7855 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
897c395f 7856 * part of active balancing operations within "domain".
897c395f 7857 *
e5673f28 7858 * Returns a task if successful and NULL otherwise.
897c395f 7859 */
e5673f28 7860static struct task_struct *detach_one_task(struct lb_env *env)
897c395f 7861{
93824900 7862 struct task_struct *p;
897c395f 7863
5cb9eaa3 7864 lockdep_assert_rq_held(env->src_rq);
e5673f28 7865
93824900
UR
7866 list_for_each_entry_reverse(p,
7867 &env->src_rq->cfs_tasks, se.group_node) {
367456c7
PZ
7868 if (!can_migrate_task(p, env))
7869 continue;
897c395f 7870
163122b7 7871 detach_task(p, env);
e5673f28 7872
367456c7 7873 /*
e5673f28 7874 * Right now, this is only the second place where
163122b7 7875 * lb_gained[env->idle] is updated (other is detach_tasks)
e5673f28 7876 * so we can safely collect stats here rather than
163122b7 7877 * inside detach_tasks().
367456c7 7878 */
ae92882e 7879 schedstat_inc(env->sd->lb_gained[env->idle]);
e5673f28 7880 return p;
897c395f 7881 }
e5673f28 7882 return NULL;
897c395f
PZ
7883}
7884
eb95308e
PZ
7885static const unsigned int sched_nr_migrate_break = 32;
7886
5d6523eb 7887/*
0b0695f2 7888 * detach_tasks() -- tries to detach up to imbalance load/util/tasks from
163122b7 7889 * busiest_rq, as part of a balancing operation within domain "sd".
5d6523eb 7890 *
163122b7 7891 * Returns number of detached tasks if successful and 0 otherwise.
5d6523eb 7892 */
163122b7 7893static int detach_tasks(struct lb_env *env)
1e3c88bd 7894{
5d6523eb 7895 struct list_head *tasks = &env->src_rq->cfs_tasks;
0b0695f2 7896 unsigned long util, load;
5d6523eb 7897 struct task_struct *p;
163122b7
KT
7898 int detached = 0;
7899
5cb9eaa3 7900 lockdep_assert_rq_held(env->src_rq);
1e3c88bd 7901
acb4decc
AL
7902 /*
7903 * Source run queue has been emptied by another CPU, clear
7904 * LBF_ALL_PINNED flag as we will not test any task.
7905 */
7906 if (env->src_rq->nr_running <= 1) {
7907 env->flags &= ~LBF_ALL_PINNED;
7908 return 0;
7909 }
7910
bd939f45 7911 if (env->imbalance <= 0)
5d6523eb 7912 return 0;
1e3c88bd 7913
5d6523eb 7914 while (!list_empty(tasks)) {
985d3a4c
YD
7915 /*
7916 * We don't want to steal all, otherwise we may be treated likewise,
7917 * which could at worst lead to a livelock crash.
7918 */
7919 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
7920 break;
7921
93824900 7922 p = list_last_entry(tasks, struct task_struct, se.group_node);
1e3c88bd 7923
367456c7
PZ
7924 env->loop++;
7925 /* We've more or less seen every task there is, call it quits */
5d6523eb 7926 if (env->loop > env->loop_max)
367456c7 7927 break;
5d6523eb
PZ
7928
7929 /* take a breather every nr_migrate tasks */
367456c7 7930 if (env->loop > env->loop_break) {
eb95308e 7931 env->loop_break += sched_nr_migrate_break;
8e45cb54 7932 env->flags |= LBF_NEED_BREAK;
ee00e66f 7933 break;
a195f004 7934 }
1e3c88bd 7935
d3198084 7936 if (!can_migrate_task(p, env))
367456c7
PZ
7937 goto next;
7938
0b0695f2
VG
7939 switch (env->migration_type) {
7940 case migrate_load:
01cfcde9
VG
7941 /*
7942 * Depending of the number of CPUs and tasks and the
7943 * cgroup hierarchy, task_h_load() can return a null
7944 * value. Make sure that env->imbalance decreases
7945 * otherwise detach_tasks() will stop only after
7946 * detaching up to loop_max tasks.
7947 */
7948 load = max_t(unsigned long, task_h_load(p), 1);
5d6523eb 7949
0b0695f2
VG
7950 if (sched_feat(LB_MIN) &&
7951 load < 16 && !env->sd->nr_balance_failed)
7952 goto next;
367456c7 7953
6cf82d55
VG
7954 /*
7955 * Make sure that we don't migrate too much load.
7956 * Nevertheless, let relax the constraint if
7957 * scheduler fails to find a good waiting task to
7958 * migrate.
7959 */
39a2a6eb 7960 if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
0b0695f2
VG
7961 goto next;
7962
7963 env->imbalance -= load;
7964 break;
7965
7966 case migrate_util:
7967 util = task_util_est(p);
7968
7969 if (util > env->imbalance)
7970 goto next;
7971
7972 env->imbalance -= util;
7973 break;
7974
7975 case migrate_task:
7976 env->imbalance--;
7977 break;
7978
7979 case migrate_misfit:
c63be7be
VG
7980 /* This is not a misfit task */
7981 if (task_fits_capacity(p, capacity_of(env->src_cpu)))
0b0695f2
VG
7982 goto next;
7983
7984 env->imbalance = 0;
7985 break;
7986 }
1e3c88bd 7987
163122b7
KT
7988 detach_task(p, env);
7989 list_add(&p->se.group_node, &env->tasks);
7990
7991 detached++;
1e3c88bd 7992
c1a280b6 7993#ifdef CONFIG_PREEMPTION
ee00e66f
PZ
7994 /*
7995 * NEWIDLE balancing is a source of latency, so preemptible
163122b7 7996 * kernels will stop after the first task is detached to minimize
ee00e66f
PZ
7997 * the critical section.
7998 */
5d6523eb 7999 if (env->idle == CPU_NEWLY_IDLE)
ee00e66f 8000 break;
1e3c88bd
PZ
8001#endif
8002
ee00e66f
PZ
8003 /*
8004 * We only want to steal up to the prescribed amount of
0b0695f2 8005 * load/util/tasks.
ee00e66f 8006 */
bd939f45 8007 if (env->imbalance <= 0)
ee00e66f 8008 break;
367456c7
PZ
8009
8010 continue;
8011next:
93824900 8012 list_move(&p->se.group_node, tasks);
1e3c88bd 8013 }
5d6523eb 8014
1e3c88bd 8015 /*
163122b7
KT
8016 * Right now, this is one of only two places we collect this stat
8017 * so we can safely collect detach_one_task() stats here rather
8018 * than inside detach_one_task().
1e3c88bd 8019 */
ae92882e 8020 schedstat_add(env->sd->lb_gained[env->idle], detached);
1e3c88bd 8021
163122b7
KT
8022 return detached;
8023}
8024
8025/*
8026 * attach_task() -- attach the task detached by detach_task() to its new rq.
8027 */
8028static void attach_task(struct rq *rq, struct task_struct *p)
8029{
5cb9eaa3 8030 lockdep_assert_rq_held(rq);
163122b7
KT
8031
8032 BUG_ON(task_rq(p) != rq);
5704ac0a 8033 activate_task(rq, p, ENQUEUE_NOCLOCK);
163122b7
KT
8034 check_preempt_curr(rq, p, 0);
8035}
8036
8037/*
8038 * attach_one_task() -- attaches the task returned from detach_one_task() to
8039 * its new rq.
8040 */
8041static void attach_one_task(struct rq *rq, struct task_struct *p)
8042{
8a8c69c3
PZ
8043 struct rq_flags rf;
8044
8045 rq_lock(rq, &rf);
5704ac0a 8046 update_rq_clock(rq);
163122b7 8047 attach_task(rq, p);
8a8c69c3 8048 rq_unlock(rq, &rf);
163122b7
KT
8049}
8050
8051/*
8052 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
8053 * new rq.
8054 */
8055static void attach_tasks(struct lb_env *env)
8056{
8057 struct list_head *tasks = &env->tasks;
8058 struct task_struct *p;
8a8c69c3 8059 struct rq_flags rf;
163122b7 8060
8a8c69c3 8061 rq_lock(env->dst_rq, &rf);
5704ac0a 8062 update_rq_clock(env->dst_rq);
163122b7
KT
8063
8064 while (!list_empty(tasks)) {
8065 p = list_first_entry(tasks, struct task_struct, se.group_node);
8066 list_del_init(&p->se.group_node);
1e3c88bd 8067
163122b7
KT
8068 attach_task(env->dst_rq, p);
8069 }
8070
8a8c69c3 8071 rq_unlock(env->dst_rq, &rf);
1e3c88bd
PZ
8072}
8073
b0c79224 8074#ifdef CONFIG_NO_HZ_COMMON
1936c53c
VG
8075static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
8076{
8077 if (cfs_rq->avg.load_avg)
8078 return true;
8079
8080 if (cfs_rq->avg.util_avg)
8081 return true;
8082
8083 return false;
8084}
8085
91c27493 8086static inline bool others_have_blocked(struct rq *rq)
371bf427
VG
8087{
8088 if (READ_ONCE(rq->avg_rt.util_avg))
8089 return true;
8090
3727e0e1
VG
8091 if (READ_ONCE(rq->avg_dl.util_avg))
8092 return true;
8093
b4eccf5f
TG
8094 if (thermal_load_avg(rq))
8095 return true;
8096
11d4afd4 8097#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
91c27493
VG
8098 if (READ_ONCE(rq->avg_irq.util_avg))
8099 return true;
8100#endif
8101
371bf427
VG
8102 return false;
8103}
8104
39b6a429 8105static inline void update_blocked_load_tick(struct rq *rq)
b0c79224 8106{
39b6a429
VG
8107 WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
8108}
b0c79224 8109
39b6a429
VG
8110static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
8111{
b0c79224
VS
8112 if (!has_blocked)
8113 rq->has_blocked_load = 0;
8114}
8115#else
8116static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
8117static inline bool others_have_blocked(struct rq *rq) { return false; }
39b6a429 8118static inline void update_blocked_load_tick(struct rq *rq) {}
b0c79224
VS
8119static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
8120#endif
8121
bef69dd8
VG
8122static bool __update_blocked_others(struct rq *rq, bool *done)
8123{
8124 const struct sched_class *curr_class;
8125 u64 now = rq_clock_pelt(rq);
b4eccf5f 8126 unsigned long thermal_pressure;
bef69dd8
VG
8127 bool decayed;
8128
8129 /*
8130 * update_load_avg() can call cpufreq_update_util(). Make sure that RT,
8131 * DL and IRQ signals have been updated before updating CFS.
8132 */
8133 curr_class = rq->curr->sched_class;
8134
b4eccf5f
TG
8135 thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
8136
bef69dd8
VG
8137 decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
8138 update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
05289b90 8139 update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |
bef69dd8
VG
8140 update_irq_load_avg(rq, 0);
8141
8142 if (others_have_blocked(rq))
8143 *done = false;
8144
8145 return decayed;
8146}
8147
1936c53c
VG
8148#ifdef CONFIG_FAIR_GROUP_SCHED
8149
bef69dd8 8150static bool __update_blocked_fair(struct rq *rq, bool *done)
9e3081ca 8151{
039ae8bc 8152 struct cfs_rq *cfs_rq, *pos;
bef69dd8
VG
8153 bool decayed = false;
8154 int cpu = cpu_of(rq);
b90f7c9d 8155
9763b67f
PZ
8156 /*
8157 * Iterates the task_group tree in a bottom up fashion, see
8158 * list_add_leaf_cfs_rq() for details.
8159 */
039ae8bc 8160 for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
bc427898
VG
8161 struct sched_entity *se;
8162
bef69dd8 8163 if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
fe749158 8164 update_tg_load_avg(cfs_rq);
4e516076 8165
bef69dd8
VG
8166 if (cfs_rq == &rq->cfs)
8167 decayed = true;
8168 }
8169
bc427898
VG
8170 /* Propagate pending load changes to the parent, if any: */
8171 se = cfs_rq->tg->se[cpu];
8172 if (se && !skip_blocked_update(se))
02da26ad 8173 update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
a9e7f654 8174
039ae8bc
VG
8175 /*
8176 * There can be a lot of idle CPU cgroups. Don't let fully
8177 * decayed cfs_rqs linger on the list.
8178 */
8179 if (cfs_rq_is_decayed(cfs_rq))
8180 list_del_leaf_cfs_rq(cfs_rq);
8181
1936c53c
VG
8182 /* Don't need periodic decay once load/util_avg are null */
8183 if (cfs_rq_has_blocked(cfs_rq))
bef69dd8 8184 *done = false;
9d89c257 8185 }
12b04875 8186
bef69dd8 8187 return decayed;
9e3081ca
PZ
8188}
8189
9763b67f 8190/*
68520796 8191 * Compute the hierarchical load factor for cfs_rq and all its ascendants.
9763b67f
PZ
8192 * This needs to be done in a top-down fashion because the load of a child
8193 * group is a fraction of its parents load.
8194 */
68520796 8195static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
9763b67f 8196{
68520796
VD
8197 struct rq *rq = rq_of(cfs_rq);
8198 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
a35b6466 8199 unsigned long now = jiffies;
68520796 8200 unsigned long load;
a35b6466 8201
68520796 8202 if (cfs_rq->last_h_load_update == now)
a35b6466
PZ
8203 return;
8204
0e9f0245 8205 WRITE_ONCE(cfs_rq->h_load_next, NULL);
68520796
VD
8206 for_each_sched_entity(se) {
8207 cfs_rq = cfs_rq_of(se);
0e9f0245 8208 WRITE_ONCE(cfs_rq->h_load_next, se);
68520796
VD
8209 if (cfs_rq->last_h_load_update == now)
8210 break;
8211 }
a35b6466 8212
68520796 8213 if (!se) {
7ea241af 8214 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
68520796
VD
8215 cfs_rq->last_h_load_update = now;
8216 }
8217
0e9f0245 8218 while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
68520796 8219 load = cfs_rq->h_load;
7ea241af
YD
8220 load = div64_ul(load * se->avg.load_avg,
8221 cfs_rq_load_avg(cfs_rq) + 1);
68520796
VD
8222 cfs_rq = group_cfs_rq(se);
8223 cfs_rq->h_load = load;
8224 cfs_rq->last_h_load_update = now;
8225 }
9763b67f
PZ
8226}
8227
367456c7 8228static unsigned long task_h_load(struct task_struct *p)
230059de 8229{
367456c7 8230 struct cfs_rq *cfs_rq = task_cfs_rq(p);
230059de 8231
68520796 8232 update_cfs_rq_h_load(cfs_rq);
9d89c257 8233 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
7ea241af 8234 cfs_rq_load_avg(cfs_rq) + 1);
230059de
PZ
8235}
8236#else
bef69dd8 8237static bool __update_blocked_fair(struct rq *rq, bool *done)
9e3081ca 8238{
6c1d47c0 8239 struct cfs_rq *cfs_rq = &rq->cfs;
bef69dd8 8240 bool decayed;
b90f7c9d 8241
bef69dd8
VG
8242 decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
8243 if (cfs_rq_has_blocked(cfs_rq))
8244 *done = false;
b90f7c9d 8245
bef69dd8 8246 return decayed;
9e3081ca
PZ
8247}
8248
367456c7 8249static unsigned long task_h_load(struct task_struct *p)
1e3c88bd 8250{
9d89c257 8251 return p->se.avg.load_avg;
1e3c88bd 8252}
230059de 8253#endif
1e3c88bd 8254
bef69dd8
VG
8255static void update_blocked_averages(int cpu)
8256{
8257 bool decayed = false, done = true;
8258 struct rq *rq = cpu_rq(cpu);
8259 struct rq_flags rf;
8260
8261 rq_lock_irqsave(rq, &rf);
39b6a429 8262 update_blocked_load_tick(rq);
bef69dd8
VG
8263 update_rq_clock(rq);
8264
8265 decayed |= __update_blocked_others(rq, &done);
8266 decayed |= __update_blocked_fair(rq, &done);
8267
8268 update_blocked_load_status(rq, !done);
8269 if (decayed)
8270 cpufreq_update_util(rq, 0);
8271 rq_unlock_irqrestore(rq, &rf);
8272}
8273
1e3c88bd 8274/********** Helpers for find_busiest_group ************************/
caeb178c 8275
1e3c88bd
PZ
8276/*
8277 * sg_lb_stats - stats of a sched_group required for load_balancing
8278 */
8279struct sg_lb_stats {
8280 unsigned long avg_load; /*Avg load across the CPUs of the group */
8281 unsigned long group_load; /* Total load over the CPUs of the group */
63b2ca30 8282 unsigned long group_capacity;
070f5e86
VG
8283 unsigned long group_util; /* Total utilization over the CPUs of the group */
8284 unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
5e23e474 8285 unsigned int sum_nr_running; /* Nr of tasks running in the group */
a3498347 8286 unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
147c5fc2
PZ
8287 unsigned int idle_cpus;
8288 unsigned int group_weight;
caeb178c 8289 enum group_type group_type;
490ba971 8290 unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
3b1baa64 8291 unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
0ec8aa00
PZ
8292#ifdef CONFIG_NUMA_BALANCING
8293 unsigned int nr_numa_running;
8294 unsigned int nr_preferred_running;
8295#endif
1e3c88bd
PZ
8296};
8297
56cf515b
JK
8298/*
8299 * sd_lb_stats - Structure to store the statistics of a sched_domain
8300 * during load balancing.
8301 */
8302struct sd_lb_stats {
8303 struct sched_group *busiest; /* Busiest group in this sd */
8304 struct sched_group *local; /* Local group in this sd */
8305 unsigned long total_load; /* Total load of all groups in sd */
63b2ca30 8306 unsigned long total_capacity; /* Total capacity of all groups in sd */
56cf515b 8307 unsigned long avg_load; /* Average load across all groups in sd */
0b0695f2 8308 unsigned int prefer_sibling; /* tasks should go to sibling first */
56cf515b 8309
56cf515b 8310 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
147c5fc2 8311 struct sg_lb_stats local_stat; /* Statistics of the local group */
56cf515b
JK
8312};
8313
147c5fc2
PZ
8314static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
8315{
8316 /*
8317 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
8318 * local_stat because update_sg_lb_stats() does a full clear/assignment.
0b0695f2
VG
8319 * We must however set busiest_stat::group_type and
8320 * busiest_stat::idle_cpus to the worst busiest group because
8321 * update_sd_pick_busiest() reads these before assignment.
147c5fc2
PZ
8322 */
8323 *sds = (struct sd_lb_stats){
8324 .busiest = NULL,
8325 .local = NULL,
8326 .total_load = 0UL,
63b2ca30 8327 .total_capacity = 0UL,
147c5fc2 8328 .busiest_stat = {
0b0695f2
VG
8329 .idle_cpus = UINT_MAX,
8330 .group_type = group_has_spare,
147c5fc2
PZ
8331 },
8332 };
8333}
8334
1ca2034e 8335static unsigned long scale_rt_capacity(int cpu)
1e3c88bd
PZ
8336{
8337 struct rq *rq = cpu_rq(cpu);
8ec59c0f 8338 unsigned long max = arch_scale_cpu_capacity(cpu);
523e979d 8339 unsigned long used, free;
523e979d 8340 unsigned long irq;
b654f7de 8341
2e62c474 8342 irq = cpu_util_irq(rq);
cadefd3d 8343
523e979d
VG
8344 if (unlikely(irq >= max))
8345 return 1;
aa483808 8346
467b7d01
TG
8347 /*
8348 * avg_rt.util_avg and avg_dl.util_avg track binary signals
8349 * (running and not running) with weights 0 and 1024 respectively.
8350 * avg_thermal.load_avg tracks thermal pressure and the weighted
8351 * average uses the actual delta max capacity(load).
8352 */
523e979d
VG
8353 used = READ_ONCE(rq->avg_rt.util_avg);
8354 used += READ_ONCE(rq->avg_dl.util_avg);
467b7d01 8355 used += thermal_load_avg(rq);
1e3c88bd 8356
523e979d
VG
8357 if (unlikely(used >= max))
8358 return 1;
1e3c88bd 8359
523e979d 8360 free = max - used;
2e62c474
VG
8361
8362 return scale_irq_capacity(free, irq, max);
1e3c88bd
PZ
8363}
8364
ced549fa 8365static void update_cpu_capacity(struct sched_domain *sd, int cpu)
1e3c88bd 8366{
1ca2034e 8367 unsigned long capacity = scale_rt_capacity(cpu);
1e3c88bd
PZ
8368 struct sched_group *sdg = sd->groups;
8369
8ec59c0f 8370 cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
1e3c88bd 8371
ced549fa
NP
8372 if (!capacity)
8373 capacity = 1;
1e3c88bd 8374
ced549fa 8375 cpu_rq(cpu)->cpu_capacity = capacity;
51cf18c9
VD
8376 trace_sched_cpu_capacity_tp(cpu_rq(cpu));
8377
ced549fa 8378 sdg->sgc->capacity = capacity;
bf475ce0 8379 sdg->sgc->min_capacity = capacity;
e3d6d0cb 8380 sdg->sgc->max_capacity = capacity;
1e3c88bd
PZ
8381}
8382
63b2ca30 8383void update_group_capacity(struct sched_domain *sd, int cpu)
1e3c88bd
PZ
8384{
8385 struct sched_domain *child = sd->child;
8386 struct sched_group *group, *sdg = sd->groups;
e3d6d0cb 8387 unsigned long capacity, min_capacity, max_capacity;
4ec4412e
VG
8388 unsigned long interval;
8389
8390 interval = msecs_to_jiffies(sd->balance_interval);
8391 interval = clamp(interval, 1UL, max_load_balance_interval);
63b2ca30 8392 sdg->sgc->next_update = jiffies + interval;
1e3c88bd
PZ
8393
8394 if (!child) {
ced549fa 8395 update_cpu_capacity(sd, cpu);
1e3c88bd
PZ
8396 return;
8397 }
8398
dc7ff76e 8399 capacity = 0;
bf475ce0 8400 min_capacity = ULONG_MAX;
e3d6d0cb 8401 max_capacity = 0;
1e3c88bd 8402
74a5ce20
PZ
8403 if (child->flags & SD_OVERLAP) {
8404 /*
8405 * SD_OVERLAP domains cannot assume that child groups
8406 * span the current group.
8407 */
8408
ae4df9d6 8409 for_each_cpu(cpu, sched_group_span(sdg)) {
4c58f57f 8410 unsigned long cpu_cap = capacity_of(cpu);
863bffc8 8411
4c58f57f
PL
8412 capacity += cpu_cap;
8413 min_capacity = min(cpu_cap, min_capacity);
8414 max_capacity = max(cpu_cap, max_capacity);
863bffc8 8415 }
74a5ce20
PZ
8416 } else {
8417 /*
8418 * !SD_OVERLAP domains can assume that child groups
8419 * span the current group.
97a7142f 8420 */
74a5ce20
PZ
8421
8422 group = child->groups;
8423 do {
bf475ce0
MR
8424 struct sched_group_capacity *sgc = group->sgc;
8425
8426 capacity += sgc->capacity;
8427 min_capacity = min(sgc->min_capacity, min_capacity);
e3d6d0cb 8428 max_capacity = max(sgc->max_capacity, max_capacity);
74a5ce20
PZ
8429 group = group->next;
8430 } while (group != child->groups);
8431 }
1e3c88bd 8432
63b2ca30 8433 sdg->sgc->capacity = capacity;
bf475ce0 8434 sdg->sgc->min_capacity = min_capacity;
e3d6d0cb 8435 sdg->sgc->max_capacity = max_capacity;
1e3c88bd
PZ
8436}
8437
9d5efe05 8438/*
ea67821b
VG
8439 * Check whether the capacity of the rq has been noticeably reduced by side
8440 * activity. The imbalance_pct is used for the threshold.
8441 * Return true is the capacity is reduced
9d5efe05
SV
8442 */
8443static inline int
ea67821b 8444check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
9d5efe05 8445{
ea67821b
VG
8446 return ((rq->cpu_capacity * sd->imbalance_pct) <
8447 (rq->cpu_capacity_orig * 100));
9d5efe05
SV
8448}
8449
a0fe2cf0
VS
8450/*
8451 * Check whether a rq has a misfit task and if it looks like we can actually
8452 * help that task: we can migrate the task to a CPU of higher capacity, or
8453 * the task's current CPU is heavily pressured.
8454 */
8455static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
8456{
8457 return rq->misfit_task_load &&
8458 (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
8459 check_cpu_capacity(rq, sd));
8460}
8461
30ce5dab
PZ
8462/*
8463 * Group imbalance indicates (and tries to solve) the problem where balancing
3bd37062 8464 * groups is inadequate due to ->cpus_ptr constraints.
30ce5dab 8465 *
97fb7a0a
IM
8466 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
8467 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
30ce5dab
PZ
8468 * Something like:
8469 *
2b4d5b25
IM
8470 * { 0 1 2 3 } { 4 5 6 7 }
8471 * * * * *
30ce5dab
PZ
8472 *
8473 * If we were to balance group-wise we'd place two tasks in the first group and
8474 * two tasks in the second group. Clearly this is undesired as it will overload
97fb7a0a 8475 * cpu 3 and leave one of the CPUs in the second group unused.
30ce5dab
PZ
8476 *
8477 * The current solution to this issue is detecting the skew in the first group
6263322c
PZ
8478 * by noticing the lower domain failed to reach balance and had difficulty
8479 * moving tasks due to affinity constraints.
30ce5dab
PZ
8480 *
8481 * When this is so detected; this group becomes a candidate for busiest; see
ed1b7732 8482 * update_sd_pick_busiest(). And calculate_imbalance() and
6263322c 8483 * find_busiest_group() avoid some of the usual balance conditions to allow it
30ce5dab
PZ
8484 * to create an effective group imbalance.
8485 *
8486 * This is a somewhat tricky proposition since the next run might not find the
8487 * group imbalance and decide the groups need to be balanced again. A most
8488 * subtle and fragile situation.
8489 */
8490
6263322c 8491static inline int sg_imbalanced(struct sched_group *group)
30ce5dab 8492{
63b2ca30 8493 return group->sgc->imbalance;
30ce5dab
PZ
8494}
8495
b37d9316 8496/*
ea67821b
VG
8497 * group_has_capacity returns true if the group has spare capacity that could
8498 * be used by some tasks.
8499 * We consider that a group has spare capacity if the * number of task is
9e91d61d
DE
8500 * smaller than the number of CPUs or if the utilization is lower than the
8501 * available capacity for CFS tasks.
ea67821b
VG
8502 * For the latter, we use a threshold to stabilize the state, to take into
8503 * account the variance of the tasks' load and to return true if the available
8504 * capacity in meaningful for the load balancer.
8505 * As an example, an available capacity of 1% can appear but it doesn't make
8506 * any benefit for the load balance.
b37d9316 8507 */
ea67821b 8508static inline bool
57abff06 8509group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
b37d9316 8510{
5e23e474 8511 if (sgs->sum_nr_running < sgs->group_weight)
ea67821b 8512 return true;
c61037e9 8513
070f5e86
VG
8514 if ((sgs->group_capacity * imbalance_pct) <
8515 (sgs->group_runnable * 100))
8516 return false;
8517
ea67821b 8518 if ((sgs->group_capacity * 100) >
57abff06 8519 (sgs->group_util * imbalance_pct))
ea67821b 8520 return true;
b37d9316 8521
ea67821b
VG
8522 return false;
8523}
8524
8525/*
8526 * group_is_overloaded returns true if the group has more tasks than it can
8527 * handle.
8528 * group_is_overloaded is not equals to !group_has_capacity because a group
8529 * with the exact right number of tasks, has no more spare capacity but is not
8530 * overloaded so both group_has_capacity and group_is_overloaded return
8531 * false.
8532 */
8533static inline bool
57abff06 8534group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
ea67821b 8535{
5e23e474 8536 if (sgs->sum_nr_running <= sgs->group_weight)
ea67821b 8537 return false;
b37d9316 8538
ea67821b 8539 if ((sgs->group_capacity * 100) <
57abff06 8540 (sgs->group_util * imbalance_pct))
ea67821b 8541 return true;
b37d9316 8542
070f5e86
VG
8543 if ((sgs->group_capacity * imbalance_pct) <
8544 (sgs->group_runnable * 100))
8545 return true;
8546
ea67821b 8547 return false;
b37d9316
PZ
8548}
8549
79a89f92 8550static inline enum
57abff06 8551group_type group_classify(unsigned int imbalance_pct,
0b0695f2 8552 struct sched_group *group,
79a89f92 8553 struct sg_lb_stats *sgs)
caeb178c 8554{
57abff06 8555 if (group_is_overloaded(imbalance_pct, sgs))
caeb178c
RR
8556 return group_overloaded;
8557
8558 if (sg_imbalanced(group))
8559 return group_imbalanced;
8560
0b0695f2
VG
8561 if (sgs->group_asym_packing)
8562 return group_asym_packing;
8563
3b1baa64
MR
8564 if (sgs->group_misfit_task_load)
8565 return group_misfit_task;
8566
57abff06 8567 if (!group_has_capacity(imbalance_pct, sgs))
0b0695f2
VG
8568 return group_fully_busy;
8569
8570 return group_has_spare;
caeb178c
RR
8571}
8572
4006a72b
RN
8573/**
8574 * asym_smt_can_pull_tasks - Check whether the load balancing CPU can pull tasks
8575 * @dst_cpu: Destination CPU of the load balancing
8576 * @sds: Load-balancing data with statistics of the local group
8577 * @sgs: Load-balancing statistics of the candidate busiest group
8578 * @sg: The candidate busiest group
8579 *
8580 * Check the state of the SMT siblings of both @sds::local and @sg and decide
8581 * if @dst_cpu can pull tasks.
8582 *
8583 * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of
8584 * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks
8585 * only if @dst_cpu has higher priority.
8586 *
8587 * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more
8588 * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority.
8589 * Bigger imbalances in the number of busy CPUs will be dealt with in
8590 * update_sd_pick_busiest().
8591 *
8592 * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings
8593 * of @dst_cpu are idle and @sg has lower priority.
a315da5e
RD
8594 *
8595 * Return: true if @dst_cpu can pull tasks, false otherwise.
4006a72b
RN
8596 */
8597static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
8598 struct sg_lb_stats *sgs,
8599 struct sched_group *sg)
8600{
8601#ifdef CONFIG_SCHED_SMT
8602 bool local_is_smt, sg_is_smt;
8603 int sg_busy_cpus;
8604
8605 local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY;
8606 sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY;
8607
8608 sg_busy_cpus = sgs->group_weight - sgs->idle_cpus;
8609
8610 if (!local_is_smt) {
8611 /*
8612 * If we are here, @dst_cpu is idle and does not have SMT
8613 * siblings. Pull tasks if candidate group has two or more
8614 * busy CPUs.
8615 */
8616 if (sg_busy_cpus >= 2) /* implies sg_is_smt */
8617 return true;
8618
8619 /*
8620 * @dst_cpu does not have SMT siblings. @sg may have SMT
8621 * siblings and only one is busy. In such case, @dst_cpu
8622 * can help if it has higher priority and is idle (i.e.,
8623 * it has no running tasks).
8624 */
8625 return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
8626 }
8627
8628 /* @dst_cpu has SMT siblings. */
8629
8630 if (sg_is_smt) {
8631 int local_busy_cpus = sds->local->group_weight -
8632 sds->local_stat.idle_cpus;
8633 int busy_cpus_delta = sg_busy_cpus - local_busy_cpus;
8634
8635 if (busy_cpus_delta == 1)
8636 return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
8637
8638 return false;
8639 }
8640
8641 /*
8642 * @sg does not have SMT siblings. Ensure that @sds::local does not end
8643 * up with more than one busy SMT sibling and only pull tasks if there
8644 * are not busy CPUs (i.e., no CPU has running tasks).
8645 */
8646 if (!sds->local_stat.sum_nr_running)
8647 return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
8648
8649 return false;
8650#else
8651 /* Always return false so that callers deal with non-SMT cases. */
8652 return false;
8653#endif
8654}
8655
aafc917a
RN
8656static inline bool
8657sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs,
8658 struct sched_group *group)
8659{
4006a72b
RN
8660 /* Only do SMT checks if either local or candidate have SMT siblings */
8661 if ((sds->local->flags & SD_SHARE_CPUCAPACITY) ||
8662 (group->flags & SD_SHARE_CPUCAPACITY))
8663 return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group);
8664
aafc917a
RN
8665 return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
8666}
8667
1e3c88bd
PZ
8668/**
8669 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
cd96891d 8670 * @env: The load balancing environment.
a315da5e 8671 * @sds: Load-balancing data with statistics of the local group.
1e3c88bd 8672 * @group: sched_group whose statistics are to be updated.
1e3c88bd 8673 * @sgs: variable to hold the statistics for this group.
630246a0 8674 * @sg_status: Holds flag indicating the status of the sched_group
1e3c88bd 8675 */
bd939f45 8676static inline void update_sg_lb_stats(struct lb_env *env,
c0d14b57 8677 struct sd_lb_stats *sds,
630246a0
QP
8678 struct sched_group *group,
8679 struct sg_lb_stats *sgs,
8680 int *sg_status)
1e3c88bd 8681{
0b0695f2 8682 int i, nr_running, local_group;
1e3c88bd 8683
b72ff13c
PZ
8684 memset(sgs, 0, sizeof(*sgs));
8685
c0d14b57 8686 local_group = group == sds->local;
0b0695f2 8687
ae4df9d6 8688 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
1e3c88bd
PZ
8689 struct rq *rq = cpu_rq(i);
8690
b0fb1eb4 8691 sgs->group_load += cpu_load(rq);
82762d2a 8692 sgs->group_util += cpu_util_cfs(i);
070f5e86 8693 sgs->group_runnable += cpu_runnable(rq);
a3498347 8694 sgs->sum_h_nr_running += rq->cfs.h_nr_running;
4486edd1 8695
a426f99c 8696 nr_running = rq->nr_running;
5e23e474
VG
8697 sgs->sum_nr_running += nr_running;
8698
a426f99c 8699 if (nr_running > 1)
630246a0 8700 *sg_status |= SG_OVERLOAD;
4486edd1 8701
2802bf3c
MR
8702 if (cpu_overutilized(i))
8703 *sg_status |= SG_OVERUTILIZED;
4486edd1 8704
0ec8aa00
PZ
8705#ifdef CONFIG_NUMA_BALANCING
8706 sgs->nr_numa_running += rq->nr_numa_running;
8707 sgs->nr_preferred_running += rq->nr_preferred_running;
8708#endif
a426f99c
WL
8709 /*
8710 * No need to call idle_cpu() if nr_running is not 0
8711 */
0b0695f2 8712 if (!nr_running && idle_cpu(i)) {
aae6d3dd 8713 sgs->idle_cpus++;
0b0695f2
VG
8714 /* Idle cpu can't have misfit task */
8715 continue;
8716 }
8717
8718 if (local_group)
8719 continue;
3b1baa64 8720
0b0695f2 8721 /* Check for a misfit task on the cpu */
3b1baa64 8722 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
757ffdd7 8723 sgs->group_misfit_task_load < rq->misfit_task_load) {
3b1baa64 8724 sgs->group_misfit_task_load = rq->misfit_task_load;
630246a0 8725 *sg_status |= SG_OVERLOAD;
757ffdd7 8726 }
1e3c88bd
PZ
8727 }
8728
aafc917a
RN
8729 sgs->group_capacity = group->sgc->capacity;
8730
8731 sgs->group_weight = group->group_weight;
8732
0b0695f2 8733 /* Check if dst CPU is idle and preferred to this group */
60256435 8734 if (!local_group && env->sd->flags & SD_ASYM_PACKING &&
aafc917a
RN
8735 env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running &&
8736 sched_asym(env, sds, sgs, group)) {
0b0695f2
VG
8737 sgs->group_asym_packing = 1;
8738 }
8739
57abff06 8740 sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
0b0695f2
VG
8741
8742 /* Computing avg_load makes sense only when group is overloaded */
8743 if (sgs->group_type == group_overloaded)
8744 sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
8745 sgs->group_capacity;
1e3c88bd
PZ
8746}
8747
532cb4c4
MN
8748/**
8749 * update_sd_pick_busiest - return 1 on busiest group
cd96891d 8750 * @env: The load balancing environment.
532cb4c4
MN
8751 * @sds: sched_domain statistics
8752 * @sg: sched_group candidate to be checked for being the busiest
b6b12294 8753 * @sgs: sched_group statistics
532cb4c4
MN
8754 *
8755 * Determine if @sg is a busier group than the previously selected
8756 * busiest group.
e69f6186
YB
8757 *
8758 * Return: %true if @sg is a busier group than the previously selected
8759 * busiest group. %false otherwise.
532cb4c4 8760 */
bd939f45 8761static bool update_sd_pick_busiest(struct lb_env *env,
532cb4c4
MN
8762 struct sd_lb_stats *sds,
8763 struct sched_group *sg,
bd939f45 8764 struct sg_lb_stats *sgs)
532cb4c4 8765{
caeb178c 8766 struct sg_lb_stats *busiest = &sds->busiest_stat;
532cb4c4 8767
0b0695f2
VG
8768 /* Make sure that there is at least one task to pull */
8769 if (!sgs->sum_h_nr_running)
8770 return false;
8771
cad68e55
MR
8772 /*
8773 * Don't try to pull misfit tasks we can't help.
8774 * We can use max_capacity here as reduction in capacity on some
8775 * CPUs in the group should either be possible to resolve
8776 * internally or be covered by avg_load imbalance (eventually).
8777 */
8778 if (sgs->group_type == group_misfit_task &&
4aed8aa4 8779 (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
0b0695f2 8780 sds->local_stat.group_type != group_has_spare))
cad68e55
MR
8781 return false;
8782
caeb178c 8783 if (sgs->group_type > busiest->group_type)
532cb4c4
MN
8784 return true;
8785
caeb178c
RR
8786 if (sgs->group_type < busiest->group_type)
8787 return false;
8788
9e0994c0 8789 /*
0b0695f2
VG
8790 * The candidate and the current busiest group are the same type of
8791 * group. Let check which one is the busiest according to the type.
9e0994c0 8792 */
9e0994c0 8793
0b0695f2
VG
8794 switch (sgs->group_type) {
8795 case group_overloaded:
8796 /* Select the overloaded group with highest avg_load. */
8797 if (sgs->avg_load <= busiest->avg_load)
8798 return false;
8799 break;
8800
8801 case group_imbalanced:
8802 /*
8803 * Select the 1st imbalanced group as we don't have any way to
8804 * choose one more than another.
8805 */
9e0994c0
MR
8806 return false;
8807
0b0695f2
VG
8808 case group_asym_packing:
8809 /* Prefer to move from lowest priority CPU's work */
8810 if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
8811 return false;
8812 break;
532cb4c4 8813
0b0695f2
VG
8814 case group_misfit_task:
8815 /*
8816 * If we have more than one misfit sg go with the biggest
8817 * misfit.
8818 */
8819 if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
8820 return false;
8821 break;
532cb4c4 8822
0b0695f2
VG
8823 case group_fully_busy:
8824 /*
8825 * Select the fully busy group with highest avg_load. In
8826 * theory, there is no need to pull task from such kind of
8827 * group because tasks have all compute capacity that they need
8828 * but we can still improve the overall throughput by reducing
8829 * contention when accessing shared HW resources.
8830 *
8831 * XXX for now avg_load is not computed and always 0 so we
8832 * select the 1st one.
8833 */
8834 if (sgs->avg_load <= busiest->avg_load)
8835 return false;
8836 break;
8837
8838 case group_has_spare:
8839 /*
5f68eb19
VG
8840 * Select not overloaded group with lowest number of idle cpus
8841 * and highest number of running tasks. We could also compare
8842 * the spare capacity which is more stable but it can end up
8843 * that the group has less spare capacity but finally more idle
0b0695f2
VG
8844 * CPUs which means less opportunity to pull tasks.
8845 */
5f68eb19 8846 if (sgs->idle_cpus > busiest->idle_cpus)
0b0695f2 8847 return false;
5f68eb19
VG
8848 else if ((sgs->idle_cpus == busiest->idle_cpus) &&
8849 (sgs->sum_nr_running <= busiest->sum_nr_running))
8850 return false;
8851
0b0695f2 8852 break;
532cb4c4
MN
8853 }
8854
0b0695f2
VG
8855 /*
8856 * Candidate sg has no more than one task per CPU and has higher
8857 * per-CPU capacity. Migrating tasks to less capable CPUs may harm
8858 * throughput. Maximize throughput, power/energy consequences are not
8859 * considered.
8860 */
8861 if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
8862 (sgs->group_type <= group_fully_busy) &&
4aed8aa4 8863 (capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu))))
0b0695f2
VG
8864 return false;
8865
8866 return true;
532cb4c4
MN
8867}
8868
0ec8aa00
PZ
8869#ifdef CONFIG_NUMA_BALANCING
8870static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
8871{
a3498347 8872 if (sgs->sum_h_nr_running > sgs->nr_numa_running)
0ec8aa00 8873 return regular;
a3498347 8874 if (sgs->sum_h_nr_running > sgs->nr_preferred_running)
0ec8aa00
PZ
8875 return remote;
8876 return all;
8877}
8878
8879static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8880{
8881 if (rq->nr_running > rq->nr_numa_running)
8882 return regular;
8883 if (rq->nr_running > rq->nr_preferred_running)
8884 return remote;
8885 return all;
8886}
8887#else
8888static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
8889{
8890 return all;
8891}
8892
8893static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8894{
8895 return regular;
8896}
8897#endif /* CONFIG_NUMA_BALANCING */
8898
57abff06
VG
8899
8900struct sg_lb_stats;
8901
3318544b
VG
8902/*
8903 * task_running_on_cpu - return 1 if @p is running on @cpu.
8904 */
8905
8906static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
8907{
8908 /* Task has no contribution or is new */
8909 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
8910 return 0;
8911
8912 if (task_on_rq_queued(p))
8913 return 1;
8914
8915 return 0;
8916}
8917
8918/**
8919 * idle_cpu_without - would a given CPU be idle without p ?
8920 * @cpu: the processor on which idleness is tested.
8921 * @p: task which should be ignored.
8922 *
8923 * Return: 1 if the CPU would be idle. 0 otherwise.
8924 */
8925static int idle_cpu_without(int cpu, struct task_struct *p)
8926{
8927 struct rq *rq = cpu_rq(cpu);
8928
8929 if (rq->curr != rq->idle && rq->curr != p)
8930 return 0;
8931
8932 /*
8933 * rq->nr_running can't be used but an updated version without the
8934 * impact of p on cpu must be used instead. The updated nr_running
8935 * be computed and tested before calling idle_cpu_without().
8936 */
8937
8938#ifdef CONFIG_SMP
126c2092 8939 if (rq->ttwu_pending)
3318544b
VG
8940 return 0;
8941#endif
8942
8943 return 1;
8944}
8945
57abff06
VG
8946/*
8947 * update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
3318544b 8948 * @sd: The sched_domain level to look for idlest group.
57abff06
VG
8949 * @group: sched_group whose statistics are to be updated.
8950 * @sgs: variable to hold the statistics for this group.
3318544b 8951 * @p: The task for which we look for the idlest group/CPU.
57abff06
VG
8952 */
8953static inline void update_sg_wakeup_stats(struct sched_domain *sd,
8954 struct sched_group *group,
8955 struct sg_lb_stats *sgs,
8956 struct task_struct *p)
8957{
8958 int i, nr_running;
8959
8960 memset(sgs, 0, sizeof(*sgs));
8961
8962 for_each_cpu(i, sched_group_span(group)) {
8963 struct rq *rq = cpu_rq(i);
3318544b 8964 unsigned int local;
57abff06 8965
3318544b 8966 sgs->group_load += cpu_load_without(rq, p);
57abff06 8967 sgs->group_util += cpu_util_without(i, p);
070f5e86 8968 sgs->group_runnable += cpu_runnable_without(rq, p);
3318544b
VG
8969 local = task_running_on_cpu(i, p);
8970 sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
57abff06 8971
3318544b 8972 nr_running = rq->nr_running - local;
57abff06
VG
8973 sgs->sum_nr_running += nr_running;
8974
8975 /*
3318544b 8976 * No need to call idle_cpu_without() if nr_running is not 0
57abff06 8977 */
3318544b 8978 if (!nr_running && idle_cpu_without(i, p))
57abff06
VG
8979 sgs->idle_cpus++;
8980
57abff06
VG
8981 }
8982
8983 /* Check if task fits in the group */
8984 if (sd->flags & SD_ASYM_CPUCAPACITY &&
8985 !task_fits_capacity(p, group->sgc->max_capacity)) {
8986 sgs->group_misfit_task_load = 1;
8987 }
8988
8989 sgs->group_capacity = group->sgc->capacity;
8990
289de359
VG
8991 sgs->group_weight = group->group_weight;
8992
57abff06
VG
8993 sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
8994
8995 /*
8996 * Computing avg_load makes sense only when group is fully busy or
8997 * overloaded
8998 */
6c8116c9
TZ
8999 if (sgs->group_type == group_fully_busy ||
9000 sgs->group_type == group_overloaded)
57abff06
VG
9001 sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
9002 sgs->group_capacity;
9003}
9004
9005static bool update_pick_idlest(struct sched_group *idlest,
9006 struct sg_lb_stats *idlest_sgs,
9007 struct sched_group *group,
9008 struct sg_lb_stats *sgs)
9009{
9010 if (sgs->group_type < idlest_sgs->group_type)
9011 return true;
9012
9013 if (sgs->group_type > idlest_sgs->group_type)
9014 return false;
9015
9016 /*
9017 * The candidate and the current idlest group are the same type of
9018 * group. Let check which one is the idlest according to the type.
9019 */
9020
9021 switch (sgs->group_type) {
9022 case group_overloaded:
9023 case group_fully_busy:
9024 /* Select the group with lowest avg_load. */
9025 if (idlest_sgs->avg_load <= sgs->avg_load)
9026 return false;
9027 break;
9028
9029 case group_imbalanced:
9030 case group_asym_packing:
9031 /* Those types are not used in the slow wakeup path */
9032 return false;
9033
9034 case group_misfit_task:
9035 /* Select group with the highest max capacity */
9036 if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
9037 return false;
9038 break;
9039
9040 case group_has_spare:
9041 /* Select group with most idle CPUs */
3edecfef 9042 if (idlest_sgs->idle_cpus > sgs->idle_cpus)
57abff06 9043 return false;
3edecfef
PP
9044
9045 /* Select group with lowest group_util */
9046 if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
9047 idlest_sgs->group_util <= sgs->group_util)
9048 return false;
9049
57abff06
VG
9050 break;
9051 }
9052
9053 return true;
9054}
9055
9056/*
9057 * find_idlest_group() finds and returns the least busy CPU group within the
9058 * domain.
9059 *
9060 * Assumes p is allowed on at least one CPU in sd.
9061 */
9062static struct sched_group *
45da2773 9063find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
57abff06
VG
9064{
9065 struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
9066 struct sg_lb_stats local_sgs, tmp_sgs;
9067 struct sg_lb_stats *sgs;
9068 unsigned long imbalance;
9069 struct sg_lb_stats idlest_sgs = {
9070 .avg_load = UINT_MAX,
9071 .group_type = group_overloaded,
9072 };
9073
57abff06
VG
9074 do {
9075 int local_group;
9076
9077 /* Skip over this group if it has no CPUs allowed */
9078 if (!cpumask_intersects(sched_group_span(group),
9079 p->cpus_ptr))
9080 continue;
9081
97886d9d
AL
9082 /* Skip over this group if no cookie matched */
9083 if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group))
9084 continue;
9085
57abff06
VG
9086 local_group = cpumask_test_cpu(this_cpu,
9087 sched_group_span(group));
9088
9089 if (local_group) {
9090 sgs = &local_sgs;
9091 local = group;
9092 } else {
9093 sgs = &tmp_sgs;
9094 }
9095
9096 update_sg_wakeup_stats(sd, group, sgs, p);
9097
9098 if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
9099 idlest = group;
9100 idlest_sgs = *sgs;
9101 }
9102
9103 } while (group = group->next, group != sd->groups);
9104
9105
9106 /* There is no idlest group to push tasks to */
9107 if (!idlest)
9108 return NULL;
9109
7ed735c3
VG
9110 /* The local group has been skipped because of CPU affinity */
9111 if (!local)
9112 return idlest;
9113
57abff06
VG
9114 /*
9115 * If the local group is idler than the selected idlest group
9116 * don't try and push the task.
9117 */
9118 if (local_sgs.group_type < idlest_sgs.group_type)
9119 return NULL;
9120
9121 /*
9122 * If the local group is busier than the selected idlest group
9123 * try and push the task.
9124 */
9125 if (local_sgs.group_type > idlest_sgs.group_type)
9126 return idlest;
9127
9128 switch (local_sgs.group_type) {
9129 case group_overloaded:
9130 case group_fully_busy:
5c339005
MG
9131
9132 /* Calculate allowed imbalance based on load */
9133 imbalance = scale_load_down(NICE_0_LOAD) *
9134 (sd->imbalance_pct-100) / 100;
9135
57abff06
VG
9136 /*
9137 * When comparing groups across NUMA domains, it's possible for
9138 * the local domain to be very lightly loaded relative to the
9139 * remote domains but "imbalance" skews the comparison making
9140 * remote CPUs look much more favourable. When considering
9141 * cross-domain, add imbalance to the load on the remote node
9142 * and consider staying local.
9143 */
9144
9145 if ((sd->flags & SD_NUMA) &&
9146 ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
9147 return NULL;
9148
9149 /*
9150 * If the local group is less loaded than the selected
9151 * idlest group don't try and push any tasks.
9152 */
9153 if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
9154 return NULL;
9155
9156 if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
9157 return NULL;
9158 break;
9159
9160 case group_imbalanced:
9161 case group_asym_packing:
9162 /* Those type are not used in the slow wakeup path */
9163 return NULL;
9164
9165 case group_misfit_task:
9166 /* Select group with the highest max capacity */
9167 if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
9168 return NULL;
9169 break;
9170
9171 case group_has_spare:
cb29a5c1 9172#ifdef CONFIG_NUMA
57abff06 9173 if (sd->flags & SD_NUMA) {
f5b2eeb4 9174 int imb_numa_nr = sd->imb_numa_nr;
57abff06
VG
9175#ifdef CONFIG_NUMA_BALANCING
9176 int idlest_cpu;
9177 /*
9178 * If there is spare capacity at NUMA, try to select
9179 * the preferred node
9180 */
9181 if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
9182 return NULL;
9183
9184 idlest_cpu = cpumask_first(sched_group_span(idlest));
9185 if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
9186 return idlest;
cb29a5c1 9187#endif /* CONFIG_NUMA_BALANCING */
57abff06 9188 /*
2cfb7a1b
MG
9189 * Otherwise, keep the task close to the wakeup source
9190 * and improve locality if the number of running tasks
9191 * would remain below threshold where an imbalance is
f5b2eeb4
PN
9192 * allowed while accounting for the possibility the
9193 * task is pinned to a subset of CPUs. If there is a
9194 * real need of migration, periodic load balance will
9195 * take care of it.
57abff06 9196 */
f5b2eeb4
PN
9197 if (p->nr_cpus_allowed != NR_CPUS) {
9198 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
9199
9200 cpumask_and(cpus, sched_group_span(local), p->cpus_ptr);
9201 imb_numa_nr = min(cpumask_weight(cpus), sd->imb_numa_nr);
9202 }
9203
cb29a5c1
MG
9204 imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus);
9205 if (!adjust_numa_imbalance(imbalance,
9206 local_sgs.sum_nr_running + 1,
f5b2eeb4 9207 imb_numa_nr)) {
57abff06 9208 return NULL;
cb29a5c1 9209 }
57abff06 9210 }
cb29a5c1 9211#endif /* CONFIG_NUMA */
57abff06
VG
9212
9213 /*
9214 * Select group with highest number of idle CPUs. We could also
9215 * compare the utilization which is more stable but it can end
9216 * up that the group has less spare capacity but finally more
9217 * idle CPUs which means more opportunity to run task.
9218 */
9219 if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
9220 return NULL;
9221 break;
9222 }
9223
9224 return idlest;
9225}
9226
1e3c88bd 9227/**
461819ac 9228 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
cd96891d 9229 * @env: The load balancing environment.
1e3c88bd
PZ
9230 * @sds: variable to hold the statistics for this sched_domain.
9231 */
0b0695f2 9232
0ec8aa00 9233static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
1e3c88bd 9234{
bd939f45
PZ
9235 struct sched_domain *child = env->sd->child;
9236 struct sched_group *sg = env->sd->groups;
05b40e05 9237 struct sg_lb_stats *local = &sds->local_stat;
56cf515b 9238 struct sg_lb_stats tmp_sgs;
630246a0 9239 int sg_status = 0;
1e3c88bd 9240
1e3c88bd 9241 do {
56cf515b 9242 struct sg_lb_stats *sgs = &tmp_sgs;
1e3c88bd
PZ
9243 int local_group;
9244
ae4df9d6 9245 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
56cf515b
JK
9246 if (local_group) {
9247 sds->local = sg;
05b40e05 9248 sgs = local;
b72ff13c
PZ
9249
9250 if (env->idle != CPU_NEWLY_IDLE ||
63b2ca30
NP
9251 time_after_eq(jiffies, sg->sgc->next_update))
9252 update_group_capacity(env->sd, env->dst_cpu);
56cf515b 9253 }
1e3c88bd 9254
c0d14b57 9255 update_sg_lb_stats(env, sds, sg, sgs, &sg_status);
1e3c88bd 9256
b72ff13c
PZ
9257 if (local_group)
9258 goto next_group;
9259
1e3c88bd 9260
b72ff13c 9261 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
532cb4c4 9262 sds->busiest = sg;
56cf515b 9263 sds->busiest_stat = *sgs;
1e3c88bd
PZ
9264 }
9265
b72ff13c
PZ
9266next_group:
9267 /* Now, start updating sd_lb_stats */
9268 sds->total_load += sgs->group_load;
63b2ca30 9269 sds->total_capacity += sgs->group_capacity;
b72ff13c 9270
532cb4c4 9271 sg = sg->next;
bd939f45 9272 } while (sg != env->sd->groups);
0ec8aa00 9273
0b0695f2
VG
9274 /* Tag domain that child domain prefers tasks go to siblings first */
9275 sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
9276
f643ea22 9277
0ec8aa00
PZ
9278 if (env->sd->flags & SD_NUMA)
9279 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
4486edd1
TC
9280
9281 if (!env->sd->parent) {
2802bf3c
MR
9282 struct root_domain *rd = env->dst_rq->rd;
9283
4486edd1 9284 /* update overload indicator if we are at root domain */
2802bf3c
MR
9285 WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
9286
9287 /* Update over-utilization (tipping point, U >= 0) indicator */
9288 WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
f9f240f9 9289 trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
2802bf3c 9290 } else if (sg_status & SG_OVERUTILIZED) {
f9f240f9
QY
9291 struct root_domain *rd = env->dst_rq->rd;
9292
9293 WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
9294 trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
4486edd1 9295 }
532cb4c4
MN
9296}
9297
1e3c88bd
PZ
9298/**
9299 * calculate_imbalance - Calculate the amount of imbalance present within the
9300 * groups of a given sched_domain during load balance.
bd939f45 9301 * @env: load balance environment
1e3c88bd 9302 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
1e3c88bd 9303 */
bd939f45 9304static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
1e3c88bd 9305{
56cf515b
JK
9306 struct sg_lb_stats *local, *busiest;
9307
9308 local = &sds->local_stat;
56cf515b 9309 busiest = &sds->busiest_stat;
dd5feea1 9310
0b0695f2
VG
9311 if (busiest->group_type == group_misfit_task) {
9312 /* Set imbalance to allow misfit tasks to be balanced. */
9313 env->migration_type = migrate_misfit;
c63be7be 9314 env->imbalance = 1;
0b0695f2
VG
9315 return;
9316 }
9317
9318 if (busiest->group_type == group_asym_packing) {
9319 /*
9320 * In case of asym capacity, we will try to migrate all load to
9321 * the preferred CPU.
9322 */
9323 env->migration_type = migrate_task;
9324 env->imbalance = busiest->sum_h_nr_running;
9325 return;
9326 }
9327
9328 if (busiest->group_type == group_imbalanced) {
9329 /*
9330 * In the group_imb case we cannot rely on group-wide averages
9331 * to ensure CPU-load equilibrium, try to move any task to fix
9332 * the imbalance. The next load balance will take care of
9333 * balancing back the system.
9334 */
9335 env->migration_type = migrate_task;
9336 env->imbalance = 1;
490ba971
VG
9337 return;
9338 }
9339
1e3c88bd 9340 /*
0b0695f2 9341 * Try to use spare capacity of local group without overloading it or
a9723389 9342 * emptying busiest.
1e3c88bd 9343 */
0b0695f2 9344 if (local->group_type == group_has_spare) {
16b0a7a1
VG
9345 if ((busiest->group_type > group_fully_busy) &&
9346 !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) {
0b0695f2
VG
9347 /*
9348 * If busiest is overloaded, try to fill spare
9349 * capacity. This might end up creating spare capacity
9350 * in busiest or busiest still being overloaded but
9351 * there is no simple way to directly compute the
9352 * amount of load to migrate in order to balance the
9353 * system.
9354 */
9355 env->migration_type = migrate_util;
9356 env->imbalance = max(local->group_capacity, local->group_util) -
9357 local->group_util;
9358
9359 /*
9360 * In some cases, the group's utilization is max or even
9361 * higher than capacity because of migrations but the
9362 * local CPU is (newly) idle. There is at least one
9363 * waiting task in this overloaded busiest group. Let's
9364 * try to pull it.
9365 */
9366 if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
9367 env->migration_type = migrate_task;
9368 env->imbalance = 1;
9369 }
9370
9371 return;
9372 }
9373
9374 if (busiest->group_weight == 1 || sds->prefer_sibling) {
5e23e474 9375 unsigned int nr_diff = busiest->sum_nr_running;
0b0695f2
VG
9376 /*
9377 * When prefer sibling, evenly spread running tasks on
9378 * groups.
9379 */
9380 env->migration_type = migrate_task;
5e23e474 9381 lsub_positive(&nr_diff, local->sum_nr_running);
cb29a5c1 9382 env->imbalance = nr_diff;
b396f523 9383 } else {
0b0695f2 9384
b396f523
MG
9385 /*
9386 * If there is no overload, we just want to even the number of
9387 * idle cpus.
9388 */
9389 env->migration_type = migrate_task;
cb29a5c1
MG
9390 env->imbalance = max_t(long, 0,
9391 (local->idle_cpus - busiest->idle_cpus));
b396f523
MG
9392 }
9393
cb29a5c1 9394#ifdef CONFIG_NUMA
b396f523 9395 /* Consider allowing a small imbalance between NUMA groups */
7d2b5dd0 9396 if (env->sd->flags & SD_NUMA) {
fb86f5b2 9397 env->imbalance = adjust_numa_imbalance(env->imbalance,
cb29a5c1
MG
9398 local->sum_nr_running + 1,
9399 env->sd->imb_numa_nr);
7d2b5dd0 9400 }
cb29a5c1
MG
9401#endif
9402
9403 /* Number of tasks to move to restore balance */
9404 env->imbalance >>= 1;
b396f523 9405
fcf0553d 9406 return;
1e3c88bd
PZ
9407 }
9408
9a5d9ba6 9409 /*
0b0695f2
VG
9410 * Local is fully busy but has to take more load to relieve the
9411 * busiest group
9a5d9ba6 9412 */
0b0695f2
VG
9413 if (local->group_type < group_overloaded) {
9414 /*
9415 * Local will become overloaded so the avg_load metrics are
9416 * finally needed.
9417 */
9418
9419 local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
9420 local->group_capacity;
9421
111688ca
AL
9422 /*
9423 * If the local group is more loaded than the selected
9424 * busiest group don't try to pull any tasks.
9425 */
9426 if (local->avg_load >= busiest->avg_load) {
9427 env->imbalance = 0;
9428 return;
9429 }
06354900 9430
9431 sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
9432 sds->total_capacity;
dd5feea1
SS
9433 }
9434
9435 /*
0b0695f2
VG
9436 * Both group are or will become overloaded and we're trying to get all
9437 * the CPUs to the average_load, so we don't want to push ourselves
9438 * above the average load, nor do we wish to reduce the max loaded CPU
9439 * below the average load. At the same time, we also don't want to
9440 * reduce the group load below the group capacity. Thus we look for
9441 * the minimum possible imbalance.
dd5feea1 9442 */
0b0695f2 9443 env->migration_type = migrate_load;
56cf515b 9444 env->imbalance = min(
0b0695f2 9445 (busiest->avg_load - sds->avg_load) * busiest->group_capacity,
63b2ca30 9446 (sds->avg_load - local->avg_load) * local->group_capacity
ca8ce3d0 9447 ) / SCHED_CAPACITY_SCALE;
1e3c88bd 9448}
fab47622 9449
1e3c88bd
PZ
9450/******* find_busiest_group() helpers end here *********************/
9451
0b0695f2
VG
9452/*
9453 * Decision matrix according to the local and busiest group type:
9454 *
9455 * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
9456 * has_spare nr_idle balanced N/A N/A balanced balanced
9457 * fully_busy nr_idle nr_idle N/A N/A balanced balanced
a6583531 9458 * misfit_task force N/A N/A N/A N/A N/A
0b0695f2
VG
9459 * asym_packing force force N/A N/A force force
9460 * imbalanced force force N/A N/A force force
9461 * overloaded force force N/A N/A force avg_load
9462 *
9463 * N/A : Not Applicable because already filtered while updating
9464 * statistics.
9465 * balanced : The system is balanced for these 2 groups.
9466 * force : Calculate the imbalance as load migration is probably needed.
9467 * avg_load : Only if imbalance is significant enough.
9468 * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite
9469 * different in groups.
9470 */
9471
1e3c88bd
PZ
9472/**
9473 * find_busiest_group - Returns the busiest group within the sched_domain
0a9b23ce 9474 * if there is an imbalance.
a315da5e 9475 * @env: The load balancing environment.
1e3c88bd 9476 *
a3df0679 9477 * Also calculates the amount of runnable load which should be moved
1e3c88bd
PZ
9478 * to restore balance.
9479 *
e69f6186 9480 * Return: - The busiest group if imbalance exists.
1e3c88bd 9481 */
56cf515b 9482static struct sched_group *find_busiest_group(struct lb_env *env)
1e3c88bd 9483{
56cf515b 9484 struct sg_lb_stats *local, *busiest;
1e3c88bd
PZ
9485 struct sd_lb_stats sds;
9486
147c5fc2 9487 init_sd_lb_stats(&sds);
1e3c88bd
PZ
9488
9489 /*
b0fb1eb4 9490 * Compute the various statistics relevant for load balancing at
1e3c88bd
PZ
9491 * this level.
9492 */
23f0d209 9493 update_sd_lb_stats(env, &sds);
2802bf3c 9494
f8a696f2 9495 if (sched_energy_enabled()) {
2802bf3c
MR
9496 struct root_domain *rd = env->dst_rq->rd;
9497
9498 if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
9499 goto out_balanced;
9500 }
9501
56cf515b
JK
9502 local = &sds.local_stat;
9503 busiest = &sds.busiest_stat;
1e3c88bd 9504
cc57aa8f 9505 /* There is no busy sibling group to pull tasks from */
0b0695f2 9506 if (!sds.busiest)
1e3c88bd
PZ
9507 goto out_balanced;
9508
0b0695f2
VG
9509 /* Misfit tasks should be dealt with regardless of the avg load */
9510 if (busiest->group_type == group_misfit_task)
9511 goto force_balance;
9512
9513 /* ASYM feature bypasses nice load balance check */
9514 if (busiest->group_type == group_asym_packing)
9515 goto force_balance;
b0432d8f 9516
866ab43e
PZ
9517 /*
9518 * If the busiest group is imbalanced the below checks don't
30ce5dab 9519 * work because they assume all things are equal, which typically
3bd37062 9520 * isn't true due to cpus_ptr constraints and the like.
866ab43e 9521 */
caeb178c 9522 if (busiest->group_type == group_imbalanced)
866ab43e
PZ
9523 goto force_balance;
9524
cc57aa8f 9525 /*
9c58c79a 9526 * If the local group is busier than the selected busiest group
cc57aa8f
PZ
9527 * don't try and pull any tasks.
9528 */
0b0695f2 9529 if (local->group_type > busiest->group_type)
1e3c88bd
PZ
9530 goto out_balanced;
9531
cc57aa8f 9532 /*
0b0695f2
VG
9533 * When groups are overloaded, use the avg_load to ensure fairness
9534 * between tasks.
cc57aa8f 9535 */
0b0695f2
VG
9536 if (local->group_type == group_overloaded) {
9537 /*
9538 * If the local group is more loaded than the selected
9539 * busiest group don't try to pull any tasks.
9540 */
9541 if (local->avg_load >= busiest->avg_load)
9542 goto out_balanced;
9543
9544 /* XXX broken for overlapping NUMA groups */
9545 sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
9546 sds.total_capacity;
1e3c88bd 9547
aae6d3dd 9548 /*
0b0695f2
VG
9549 * Don't pull any tasks if this group is already above the
9550 * domain average load.
aae6d3dd 9551 */
0b0695f2 9552 if (local->avg_load >= sds.avg_load)
aae6d3dd 9553 goto out_balanced;
0b0695f2 9554
c186fafe 9555 /*
0b0695f2
VG
9556 * If the busiest group is more loaded, use imbalance_pct to be
9557 * conservative.
c186fafe 9558 */
56cf515b
JK
9559 if (100 * busiest->avg_load <=
9560 env->sd->imbalance_pct * local->avg_load)
c186fafe 9561 goto out_balanced;
aae6d3dd 9562 }
1e3c88bd 9563
0b0695f2
VG
9564 /* Try to move all excess tasks to child's sibling domain */
9565 if (sds.prefer_sibling && local->group_type == group_has_spare &&
5e23e474 9566 busiest->sum_nr_running > local->sum_nr_running + 1)
0b0695f2
VG
9567 goto force_balance;
9568
2ab4092f
VG
9569 if (busiest->group_type != group_overloaded) {
9570 if (env->idle == CPU_NOT_IDLE)
9571 /*
9572 * If the busiest group is not overloaded (and as a
9573 * result the local one too) but this CPU is already
9574 * busy, let another idle CPU try to pull task.
9575 */
9576 goto out_balanced;
9577
9578 if (busiest->group_weight > 1 &&
9579 local->idle_cpus <= (busiest->idle_cpus + 1))
9580 /*
9581 * If the busiest group is not overloaded
9582 * and there is no imbalance between this and busiest
9583 * group wrt idle CPUs, it is balanced. The imbalance
9584 * becomes significant if the diff is greater than 1
9585 * otherwise we might end up to just move the imbalance
9586 * on another group. Of course this applies only if
9587 * there is more than 1 CPU per group.
9588 */
9589 goto out_balanced;
9590
9591 if (busiest->sum_h_nr_running == 1)
9592 /*
9593 * busiest doesn't have any tasks waiting to run
9594 */
9595 goto out_balanced;
9596 }
0b0695f2 9597
fab47622 9598force_balance:
1e3c88bd 9599 /* Looks like there is an imbalance. Compute it */
bd939f45 9600 calculate_imbalance(env, &sds);
bb3485c8 9601 return env->imbalance ? sds.busiest : NULL;
1e3c88bd
PZ
9602
9603out_balanced:
bd939f45 9604 env->imbalance = 0;
1e3c88bd
PZ
9605 return NULL;
9606}
9607
9608/*
97fb7a0a 9609 * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
1e3c88bd 9610 */
bd939f45 9611static struct rq *find_busiest_queue(struct lb_env *env,
b9403130 9612 struct sched_group *group)
1e3c88bd
PZ
9613{
9614 struct rq *busiest = NULL, *rq;
0b0695f2
VG
9615 unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
9616 unsigned int busiest_nr = 0;
1e3c88bd
PZ
9617 int i;
9618
ae4df9d6 9619 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
0b0695f2
VG
9620 unsigned long capacity, load, util;
9621 unsigned int nr_running;
0ec8aa00
PZ
9622 enum fbq_type rt;
9623
9624 rq = cpu_rq(i);
9625 rt = fbq_classify_rq(rq);
1e3c88bd 9626
0ec8aa00
PZ
9627 /*
9628 * We classify groups/runqueues into three groups:
9629 * - regular: there are !numa tasks
9630 * - remote: there are numa tasks that run on the 'wrong' node
9631 * - all: there is no distinction
9632 *
9633 * In order to avoid migrating ideally placed numa tasks,
9634 * ignore those when there's better options.
9635 *
9636 * If we ignore the actual busiest queue to migrate another
9637 * task, the next balance pass can still reduce the busiest
9638 * queue by moving tasks around inside the node.
9639 *
9640 * If we cannot move enough load due to this classification
9641 * the next pass will adjust the group classification and
9642 * allow migration of more tasks.
9643 *
9644 * Both cases only affect the total convergence complexity.
9645 */
9646 if (rt > env->fbq_type)
9647 continue;
9648
0b0695f2 9649 nr_running = rq->cfs.h_nr_running;
fc488ffd
VG
9650 if (!nr_running)
9651 continue;
9652
9653 capacity = capacity_of(i);
9d5efe05 9654
4ad3831a
CR
9655 /*
9656 * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
9657 * eventually lead to active_balancing high->low capacity.
9658 * Higher per-CPU capacity is considered better than balancing
9659 * average load.
9660 */
9661 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
4aed8aa4 9662 !capacity_greater(capacity_of(env->dst_cpu), capacity) &&
0b0695f2 9663 nr_running == 1)
4ad3831a
CR
9664 continue;
9665
4006a72b
RN
9666 /* Make sure we only pull tasks from a CPU of lower priority */
9667 if ((env->sd->flags & SD_ASYM_PACKING) &&
9668 sched_asym_prefer(i, env->dst_cpu) &&
9669 nr_running == 1)
9670 continue;
9671
0b0695f2
VG
9672 switch (env->migration_type) {
9673 case migrate_load:
9674 /*
b0fb1eb4
VG
9675 * When comparing with load imbalance, use cpu_load()
9676 * which is not scaled with the CPU capacity.
0b0695f2 9677 */
b0fb1eb4 9678 load = cpu_load(rq);
1e3c88bd 9679
0b0695f2
VG
9680 if (nr_running == 1 && load > env->imbalance &&
9681 !check_cpu_capacity(rq, env->sd))
9682 break;
ea67821b 9683
0b0695f2
VG
9684 /*
9685 * For the load comparisons with the other CPUs,
b0fb1eb4
VG
9686 * consider the cpu_load() scaled with the CPU
9687 * capacity, so that the load can be moved away
9688 * from the CPU that is potentially running at a
9689 * lower capacity.
0b0695f2
VG
9690 *
9691 * Thus we're looking for max(load_i / capacity_i),
9692 * crosswise multiplication to rid ourselves of the
9693 * division works out to:
9694 * load_i * capacity_j > load_j * capacity_i;
9695 * where j is our previous maximum.
9696 */
9697 if (load * busiest_capacity > busiest_load * capacity) {
9698 busiest_load = load;
9699 busiest_capacity = capacity;
9700 busiest = rq;
9701 }
9702 break;
9703
9704 case migrate_util:
82762d2a 9705 util = cpu_util_cfs(i);
0b0695f2 9706
c32b4308
VG
9707 /*
9708 * Don't try to pull utilization from a CPU with one
9709 * running task. Whatever its utilization, we will fail
9710 * detach the task.
9711 */
9712 if (nr_running <= 1)
9713 continue;
9714
0b0695f2
VG
9715 if (busiest_util < util) {
9716 busiest_util = util;
9717 busiest = rq;
9718 }
9719 break;
9720
9721 case migrate_task:
9722 if (busiest_nr < nr_running) {
9723 busiest_nr = nr_running;
9724 busiest = rq;
9725 }
9726 break;
9727
9728 case migrate_misfit:
9729 /*
9730 * For ASYM_CPUCAPACITY domains with misfit tasks we
9731 * simply seek the "biggest" misfit task.
9732 */
9733 if (rq->misfit_task_load > busiest_load) {
9734 busiest_load = rq->misfit_task_load;
9735 busiest = rq;
9736 }
9737
9738 break;
1e3c88bd 9739
1e3c88bd
PZ
9740 }
9741 }
9742
9743 return busiest;
9744}
9745
9746/*
9747 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
9748 * so long as it is large enough.
9749 */
9750#define MAX_PINNED_INTERVAL 512
9751
46a745d9
VG
9752static inline bool
9753asym_active_balance(struct lb_env *env)
1af3ed3d 9754{
46a745d9
VG
9755 /*
9756 * ASYM_PACKING needs to force migrate tasks from busy but
9757 * lower priority CPUs in order to pack all tasks in the
9758 * highest priority CPUs.
9759 */
9760 return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
9761 sched_asym_prefer(env->dst_cpu, env->src_cpu);
9762}
bd939f45 9763
46a745d9 9764static inline bool
e9b9734b
VG
9765imbalanced_active_balance(struct lb_env *env)
9766{
9767 struct sched_domain *sd = env->sd;
9768
9769 /*
9770 * The imbalanced case includes the case of pinned tasks preventing a fair
9771 * distribution of the load on the system but also the even distribution of the
9772 * threads on a system with spare capacity
9773 */
9774 if ((env->migration_type == migrate_task) &&
9775 (sd->nr_balance_failed > sd->cache_nice_tries+2))
9776 return 1;
9777
9778 return 0;
9779}
9780
9781static int need_active_balance(struct lb_env *env)
46a745d9
VG
9782{
9783 struct sched_domain *sd = env->sd;
532cb4c4 9784
46a745d9
VG
9785 if (asym_active_balance(env))
9786 return 1;
1af3ed3d 9787
e9b9734b
VG
9788 if (imbalanced_active_balance(env))
9789 return 1;
9790
1aaf90a4
VG
9791 /*
9792 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
9793 * It's worth migrating the task if the src_cpu's capacity is reduced
9794 * because of other sched_class or IRQs if more capacity stays
9795 * available on dst_cpu.
9796 */
9797 if ((env->idle != CPU_NOT_IDLE) &&
9798 (env->src_rq->cfs.h_nr_running == 1)) {
9799 if ((check_cpu_capacity(env->src_rq, sd)) &&
9800 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
9801 return 1;
9802 }
9803
0b0695f2 9804 if (env->migration_type == migrate_misfit)
cad68e55
MR
9805 return 1;
9806
46a745d9
VG
9807 return 0;
9808}
9809
969c7921
TH
9810static int active_load_balance_cpu_stop(void *data);
9811
23f0d209
JK
9812static int should_we_balance(struct lb_env *env)
9813{
9814 struct sched_group *sg = env->sd->groups;
64297f2b 9815 int cpu;
23f0d209 9816
024c9d2f
PZ
9817 /*
9818 * Ensure the balancing environment is consistent; can happen
9819 * when the softirq triggers 'during' hotplug.
9820 */
9821 if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
9822 return 0;
9823
23f0d209 9824 /*
97fb7a0a 9825 * In the newly idle case, we will allow all the CPUs
23f0d209 9826 * to do the newly idle load balance.
792b9f65
JD
9827 *
9828 * However, we bail out if we already have tasks or a wakeup pending,
9829 * to optimize wakeup latency.
23f0d209 9830 */
792b9f65
JD
9831 if (env->idle == CPU_NEWLY_IDLE) {
9832 if (env->dst_rq->nr_running > 0 || env->dst_rq->ttwu_pending)
9833 return 0;
23f0d209 9834 return 1;
792b9f65 9835 }
23f0d209 9836
97fb7a0a 9837 /* Try to find first idle CPU */
e5c14b1f 9838 for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
af218122 9839 if (!idle_cpu(cpu))
23f0d209
JK
9840 continue;
9841
64297f2b
PW
9842 /* Are we the first idle CPU? */
9843 return cpu == env->dst_cpu;
23f0d209
JK
9844 }
9845
64297f2b
PW
9846 /* Are we the first CPU of this group ? */
9847 return group_balance_cpu(sg) == env->dst_cpu;
23f0d209
JK
9848}
9849
1e3c88bd
PZ
9850/*
9851 * Check this_cpu to ensure it is balanced within domain. Attempt to move
9852 * tasks if there is an imbalance.
9853 */
9854static int load_balance(int this_cpu, struct rq *this_rq,
9855 struct sched_domain *sd, enum cpu_idle_type idle,
23f0d209 9856 int *continue_balancing)
1e3c88bd 9857{
88b8dac0 9858 int ld_moved, cur_ld_moved, active_balance = 0;
6263322c 9859 struct sched_domain *sd_parent = sd->parent;
1e3c88bd 9860 struct sched_group *group;
1e3c88bd 9861 struct rq *busiest;
8a8c69c3 9862 struct rq_flags rf;
4ba29684 9863 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
1e3c88bd 9864
8e45cb54
PZ
9865 struct lb_env env = {
9866 .sd = sd,
ddcdf6e7
PZ
9867 .dst_cpu = this_cpu,
9868 .dst_rq = this_rq,
ae4df9d6 9869 .dst_grpmask = sched_group_span(sd->groups),
8e45cb54 9870 .idle = idle,
eb95308e 9871 .loop_break = sched_nr_migrate_break,
b9403130 9872 .cpus = cpus,
0ec8aa00 9873 .fbq_type = all,
163122b7 9874 .tasks = LIST_HEAD_INIT(env.tasks),
8e45cb54
PZ
9875 };
9876
65a4433a 9877 cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
1e3c88bd 9878
ae92882e 9879 schedstat_inc(sd->lb_count[idle]);
1e3c88bd
PZ
9880
9881redo:
23f0d209
JK
9882 if (!should_we_balance(&env)) {
9883 *continue_balancing = 0;
1e3c88bd 9884 goto out_balanced;
23f0d209 9885 }
1e3c88bd 9886
23f0d209 9887 group = find_busiest_group(&env);
1e3c88bd 9888 if (!group) {
ae92882e 9889 schedstat_inc(sd->lb_nobusyg[idle]);
1e3c88bd
PZ
9890 goto out_balanced;
9891 }
9892
b9403130 9893 busiest = find_busiest_queue(&env, group);
1e3c88bd 9894 if (!busiest) {
ae92882e 9895 schedstat_inc(sd->lb_nobusyq[idle]);
1e3c88bd
PZ
9896 goto out_balanced;
9897 }
9898
78feefc5 9899 BUG_ON(busiest == env.dst_rq);
1e3c88bd 9900
ae92882e 9901 schedstat_add(sd->lb_imbalance[idle], env.imbalance);
1e3c88bd 9902
1aaf90a4
VG
9903 env.src_cpu = busiest->cpu;
9904 env.src_rq = busiest;
9905
1e3c88bd 9906 ld_moved = 0;
8a41dfcd
VG
9907 /* Clear this flag as soon as we find a pullable task */
9908 env.flags |= LBF_ALL_PINNED;
1e3c88bd
PZ
9909 if (busiest->nr_running > 1) {
9910 /*
9911 * Attempt to move tasks. If find_busiest_group has found
9912 * an imbalance but busiest->nr_running <= 1, the group is
9913 * still unbalanced. ld_moved simply stays zero, so it is
9914 * correctly treated as an imbalance.
9915 */
c82513e5 9916 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
8e45cb54 9917
5d6523eb 9918more_balance:
8a8c69c3 9919 rq_lock_irqsave(busiest, &rf);
3bed5e21 9920 update_rq_clock(busiest);
88b8dac0
SV
9921
9922 /*
9923 * cur_ld_moved - load moved in current iteration
9924 * ld_moved - cumulative load moved across iterations
9925 */
163122b7 9926 cur_ld_moved = detach_tasks(&env);
1e3c88bd
PZ
9927
9928 /*
163122b7
KT
9929 * We've detached some tasks from busiest_rq. Every
9930 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
9931 * unlock busiest->lock, and we are able to be sure
9932 * that nobody can manipulate the tasks in parallel.
9933 * See task_rq_lock() family for the details.
1e3c88bd 9934 */
163122b7 9935
8a8c69c3 9936 rq_unlock(busiest, &rf);
163122b7
KT
9937
9938 if (cur_ld_moved) {
9939 attach_tasks(&env);
9940 ld_moved += cur_ld_moved;
9941 }
9942
8a8c69c3 9943 local_irq_restore(rf.flags);
88b8dac0 9944
f1cd0858
JK
9945 if (env.flags & LBF_NEED_BREAK) {
9946 env.flags &= ~LBF_NEED_BREAK;
9947 goto more_balance;
9948 }
9949
88b8dac0
SV
9950 /*
9951 * Revisit (affine) tasks on src_cpu that couldn't be moved to
9952 * us and move them to an alternate dst_cpu in our sched_group
9953 * where they can run. The upper limit on how many times we
97fb7a0a 9954 * iterate on same src_cpu is dependent on number of CPUs in our
88b8dac0
SV
9955 * sched_group.
9956 *
9957 * This changes load balance semantics a bit on who can move
9958 * load to a given_cpu. In addition to the given_cpu itself
9959 * (or a ilb_cpu acting on its behalf where given_cpu is
9960 * nohz-idle), we now have balance_cpu in a position to move
9961 * load to given_cpu. In rare situations, this may cause
9962 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
9963 * _independently_ and at _same_ time to move some load to
3b03706f 9964 * given_cpu) causing excess load to be moved to given_cpu.
88b8dac0
SV
9965 * This however should not happen so much in practice and
9966 * moreover subsequent load balance cycles should correct the
9967 * excess load moved.
9968 */
6263322c 9969 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
88b8dac0 9970
97fb7a0a 9971 /* Prevent to re-select dst_cpu via env's CPUs */
c89d92ed 9972 __cpumask_clear_cpu(env.dst_cpu, env.cpus);
7aff2e3a 9973
78feefc5 9974 env.dst_rq = cpu_rq(env.new_dst_cpu);
88b8dac0 9975 env.dst_cpu = env.new_dst_cpu;
6263322c 9976 env.flags &= ~LBF_DST_PINNED;
88b8dac0
SV
9977 env.loop = 0;
9978 env.loop_break = sched_nr_migrate_break;
e02e60c1 9979
88b8dac0
SV
9980 /*
9981 * Go back to "more_balance" rather than "redo" since we
9982 * need to continue with same src_cpu.
9983 */
9984 goto more_balance;
9985 }
1e3c88bd 9986
6263322c
PZ
9987 /*
9988 * We failed to reach balance because of affinity.
9989 */
9990 if (sd_parent) {
63b2ca30 9991 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
6263322c 9992
afdeee05 9993 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
6263322c 9994 *group_imbalance = 1;
6263322c
PZ
9995 }
9996
1e3c88bd 9997 /* All tasks on this runqueue were pinned by CPU affinity */
8e45cb54 9998 if (unlikely(env.flags & LBF_ALL_PINNED)) {
c89d92ed 9999 __cpumask_clear_cpu(cpu_of(busiest), cpus);
65a4433a
JH
10000 /*
10001 * Attempting to continue load balancing at the current
10002 * sched_domain level only makes sense if there are
10003 * active CPUs remaining as possible busiest CPUs to
10004 * pull load from which are not contained within the
10005 * destination group that is receiving any migrated
10006 * load.
10007 */
10008 if (!cpumask_subset(cpus, env.dst_grpmask)) {
bbf18b19
PN
10009 env.loop = 0;
10010 env.loop_break = sched_nr_migrate_break;
1e3c88bd 10011 goto redo;
bbf18b19 10012 }
afdeee05 10013 goto out_all_pinned;
1e3c88bd
PZ
10014 }
10015 }
10016
10017 if (!ld_moved) {
ae92882e 10018 schedstat_inc(sd->lb_failed[idle]);
58b26c4c
VP
10019 /*
10020 * Increment the failure counter only on periodic balance.
10021 * We do not want newidle balance, which can be very
10022 * frequent, pollute the failure counter causing
10023 * excessive cache_hot migrations and active balances.
10024 */
10025 if (idle != CPU_NEWLY_IDLE)
10026 sd->nr_balance_failed++;
1e3c88bd 10027
bd939f45 10028 if (need_active_balance(&env)) {
8a8c69c3
PZ
10029 unsigned long flags;
10030
5cb9eaa3 10031 raw_spin_rq_lock_irqsave(busiest, flags);
1e3c88bd 10032
97fb7a0a
IM
10033 /*
10034 * Don't kick the active_load_balance_cpu_stop,
10035 * if the curr task on busiest CPU can't be
10036 * moved to this_cpu:
1e3c88bd 10037 */
3bd37062 10038 if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
5cb9eaa3 10039 raw_spin_rq_unlock_irqrestore(busiest, flags);
1e3c88bd
PZ
10040 goto out_one_pinned;
10041 }
10042
8a41dfcd
VG
10043 /* Record that we found at least one task that could run on this_cpu */
10044 env.flags &= ~LBF_ALL_PINNED;
10045
969c7921
TH
10046 /*
10047 * ->active_balance synchronizes accesses to
10048 * ->active_balance_work. Once set, it's cleared
10049 * only after active load balance is finished.
10050 */
1e3c88bd
PZ
10051 if (!busiest->active_balance) {
10052 busiest->active_balance = 1;
10053 busiest->push_cpu = this_cpu;
10054 active_balance = 1;
10055 }
5cb9eaa3 10056 raw_spin_rq_unlock_irqrestore(busiest, flags);
969c7921 10057
bd939f45 10058 if (active_balance) {
969c7921
TH
10059 stop_one_cpu_nowait(cpu_of(busiest),
10060 active_load_balance_cpu_stop, busiest,
10061 &busiest->active_balance_work);
bd939f45 10062 }
1e3c88bd 10063 }
e9b9734b 10064 } else {
1e3c88bd 10065 sd->nr_balance_failed = 0;
e9b9734b 10066 }
1e3c88bd 10067
e9b9734b 10068 if (likely(!active_balance) || need_active_balance(&env)) {
1e3c88bd
PZ
10069 /* We were unbalanced, so reset the balancing interval */
10070 sd->balance_interval = sd->min_interval;
1e3c88bd
PZ
10071 }
10072
1e3c88bd
PZ
10073 goto out;
10074
10075out_balanced:
afdeee05
VG
10076 /*
10077 * We reach balance although we may have faced some affinity
f6cad8df
VG
10078 * constraints. Clear the imbalance flag only if other tasks got
10079 * a chance to move and fix the imbalance.
afdeee05 10080 */
f6cad8df 10081 if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
afdeee05
VG
10082 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
10083
10084 if (*group_imbalance)
10085 *group_imbalance = 0;
10086 }
10087
10088out_all_pinned:
10089 /*
10090 * We reach balance because all tasks are pinned at this level so
10091 * we can't migrate them. Let the imbalance flag set so parent level
10092 * can try to migrate them.
10093 */
ae92882e 10094 schedstat_inc(sd->lb_balanced[idle]);
1e3c88bd
PZ
10095
10096 sd->nr_balance_failed = 0;
10097
10098out_one_pinned:
3f130a37
VS
10099 ld_moved = 0;
10100
10101 /*
5ba553ef
PZ
10102 * newidle_balance() disregards balance intervals, so we could
10103 * repeatedly reach this code, which would lead to balance_interval
3b03706f 10104 * skyrocketing in a short amount of time. Skip the balance_interval
5ba553ef 10105 * increase logic to avoid that.
3f130a37
VS
10106 */
10107 if (env.idle == CPU_NEWLY_IDLE)
10108 goto out;
10109
1e3c88bd 10110 /* tune up the balancing interval */
47b7aee1
VS
10111 if ((env.flags & LBF_ALL_PINNED &&
10112 sd->balance_interval < MAX_PINNED_INTERVAL) ||
10113 sd->balance_interval < sd->max_interval)
1e3c88bd 10114 sd->balance_interval *= 2;
1e3c88bd 10115out:
1e3c88bd
PZ
10116 return ld_moved;
10117}
10118
52a08ef1
JL
10119static inline unsigned long
10120get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
10121{
10122 unsigned long interval = sd->balance_interval;
10123
10124 if (cpu_busy)
10125 interval *= sd->busy_factor;
10126
10127 /* scale ms to jiffies */
10128 interval = msecs_to_jiffies(interval);
e4d32e4d
VG
10129
10130 /*
10131 * Reduce likelihood of busy balancing at higher domains racing with
10132 * balancing at lower domains by preventing their balancing periods
10133 * from being multiples of each other.
10134 */
10135 if (cpu_busy)
10136 interval -= 1;
10137
52a08ef1
JL
10138 interval = clamp(interval, 1UL, max_load_balance_interval);
10139
10140 return interval;
10141}
10142
10143static inline void
31851a98 10144update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
52a08ef1
JL
10145{
10146 unsigned long interval, next;
10147
31851a98
LY
10148 /* used by idle balance, so cpu_busy = 0 */
10149 interval = get_sd_balance_interval(sd, 0);
52a08ef1
JL
10150 next = sd->last_balance + interval;
10151
10152 if (time_after(*next_balance, next))
10153 *next_balance = next;
10154}
10155
1e3c88bd 10156/*
97fb7a0a 10157 * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
969c7921
TH
10158 * running tasks off the busiest CPU onto idle CPUs. It requires at
10159 * least 1 task to be running on each physical CPU where possible, and
10160 * avoids physical / logical imbalances.
1e3c88bd 10161 */
969c7921 10162static int active_load_balance_cpu_stop(void *data)
1e3c88bd 10163{
969c7921
TH
10164 struct rq *busiest_rq = data;
10165 int busiest_cpu = cpu_of(busiest_rq);
1e3c88bd 10166 int target_cpu = busiest_rq->push_cpu;
969c7921 10167 struct rq *target_rq = cpu_rq(target_cpu);
1e3c88bd 10168 struct sched_domain *sd;
e5673f28 10169 struct task_struct *p = NULL;
8a8c69c3 10170 struct rq_flags rf;
969c7921 10171
8a8c69c3 10172 rq_lock_irq(busiest_rq, &rf);
edd8e41d
PZ
10173 /*
10174 * Between queueing the stop-work and running it is a hole in which
10175 * CPUs can become inactive. We should not move tasks from or to
10176 * inactive CPUs.
10177 */
10178 if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
10179 goto out_unlock;
969c7921 10180
97fb7a0a 10181 /* Make sure the requested CPU hasn't gone down in the meantime: */
969c7921
TH
10182 if (unlikely(busiest_cpu != smp_processor_id() ||
10183 !busiest_rq->active_balance))
10184 goto out_unlock;
1e3c88bd
PZ
10185
10186 /* Is there any task to move? */
10187 if (busiest_rq->nr_running <= 1)
969c7921 10188 goto out_unlock;
1e3c88bd
PZ
10189
10190 /*
10191 * This condition is "impossible", if it occurs
10192 * we need to fix it. Originally reported by
97fb7a0a 10193 * Bjorn Helgaas on a 128-CPU setup.
1e3c88bd
PZ
10194 */
10195 BUG_ON(busiest_rq == target_rq);
10196
1e3c88bd 10197 /* Search for an sd spanning us and the target CPU. */
dce840a0 10198 rcu_read_lock();
1e3c88bd 10199 for_each_domain(target_cpu, sd) {
e669ac8a
VS
10200 if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
10201 break;
1e3c88bd
PZ
10202 }
10203
10204 if (likely(sd)) {
8e45cb54
PZ
10205 struct lb_env env = {
10206 .sd = sd,
ddcdf6e7
PZ
10207 .dst_cpu = target_cpu,
10208 .dst_rq = target_rq,
10209 .src_cpu = busiest_rq->cpu,
10210 .src_rq = busiest_rq,
8e45cb54 10211 .idle = CPU_IDLE,
23fb06d9 10212 .flags = LBF_ACTIVE_LB,
8e45cb54
PZ
10213 };
10214
ae92882e 10215 schedstat_inc(sd->alb_count);
3bed5e21 10216 update_rq_clock(busiest_rq);
1e3c88bd 10217
e5673f28 10218 p = detach_one_task(&env);
d02c0711 10219 if (p) {
ae92882e 10220 schedstat_inc(sd->alb_pushed);
d02c0711
SD
10221 /* Active balancing done, reset the failure counter. */
10222 sd->nr_balance_failed = 0;
10223 } else {
ae92882e 10224 schedstat_inc(sd->alb_failed);
d02c0711 10225 }
1e3c88bd 10226 }
dce840a0 10227 rcu_read_unlock();
969c7921
TH
10228out_unlock:
10229 busiest_rq->active_balance = 0;
8a8c69c3 10230 rq_unlock(busiest_rq, &rf);
e5673f28
KT
10231
10232 if (p)
10233 attach_one_task(target_rq, p);
10234
10235 local_irq_enable();
10236
969c7921 10237 return 0;
1e3c88bd
PZ
10238}
10239
af3fe03c
PZ
10240static DEFINE_SPINLOCK(balancing);
10241
10242/*
10243 * Scale the max load_balance interval with the number of CPUs in the system.
10244 * This trades load-balance latency on larger machines for less cross talk.
10245 */
10246void update_max_interval(void)
10247{
10248 max_load_balance_interval = HZ*num_online_cpus()/10;
10249}
10250
e60b56e4
VG
10251static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
10252{
10253 if (cost > sd->max_newidle_lb_cost) {
10254 /*
10255 * Track max cost of a domain to make sure to not delay the
10256 * next wakeup on the CPU.
10257 */
10258 sd->max_newidle_lb_cost = cost;
10259 sd->last_decay_max_lb_cost = jiffies;
10260 } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) {
10261 /*
10262 * Decay the newidle max times by ~1% per second to ensure that
10263 * it is not outdated and the current max cost is actually
10264 * shorter.
10265 */
10266 sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256;
10267 sd->last_decay_max_lb_cost = jiffies;
10268
10269 return true;
10270 }
10271
10272 return false;
10273}
10274
af3fe03c
PZ
10275/*
10276 * It checks each scheduling domain to see if it is due to be balanced,
10277 * and initiates a balancing operation if so.
10278 *
10279 * Balancing parameters are set up in init_sched_domains.
10280 */
10281static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
10282{
10283 int continue_balancing = 1;
10284 int cpu = rq->cpu;
323af6de 10285 int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
af3fe03c
PZ
10286 unsigned long interval;
10287 struct sched_domain *sd;
10288 /* Earliest time when we have to do rebalance again */
10289 unsigned long next_balance = jiffies + 60*HZ;
10290 int update_next_balance = 0;
10291 int need_serialize, need_decay = 0;
10292 u64 max_cost = 0;
10293
10294 rcu_read_lock();
10295 for_each_domain(cpu, sd) {
10296 /*
10297 * Decay the newidle max times here because this is a regular
e60b56e4 10298 * visit to all the domains.
af3fe03c 10299 */
e60b56e4 10300 need_decay = update_newidle_cost(sd, 0);
af3fe03c
PZ
10301 max_cost += sd->max_newidle_lb_cost;
10302
af3fe03c
PZ
10303 /*
10304 * Stop the load balance at this level. There is another
10305 * CPU in our sched group which is doing load balancing more
10306 * actively.
10307 */
10308 if (!continue_balancing) {
10309 if (need_decay)
10310 continue;
10311 break;
10312 }
10313
323af6de 10314 interval = get_sd_balance_interval(sd, busy);
af3fe03c
PZ
10315
10316 need_serialize = sd->flags & SD_SERIALIZE;
10317 if (need_serialize) {
10318 if (!spin_trylock(&balancing))
10319 goto out;
10320 }
10321
10322 if (time_after_eq(jiffies, sd->last_balance + interval)) {
10323 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
10324 /*
10325 * The LBF_DST_PINNED logic could have changed
10326 * env->dst_cpu, so we can't know our idle
10327 * state even if we migrated tasks. Update it.
10328 */
10329 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
323af6de 10330 busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
af3fe03c
PZ
10331 }
10332 sd->last_balance = jiffies;
323af6de 10333 interval = get_sd_balance_interval(sd, busy);
af3fe03c
PZ
10334 }
10335 if (need_serialize)
10336 spin_unlock(&balancing);
10337out:
10338 if (time_after(next_balance, sd->last_balance + interval)) {
10339 next_balance = sd->last_balance + interval;
10340 update_next_balance = 1;
10341 }
10342 }
10343 if (need_decay) {
10344 /*
10345 * Ensure the rq-wide value also decays but keep it at a
10346 * reasonable floor to avoid funnies with rq->avg_idle.
10347 */
10348 rq->max_idle_balance_cost =
10349 max((u64)sysctl_sched_migration_cost, max_cost);
10350 }
10351 rcu_read_unlock();
10352
10353 /*
10354 * next_balance will be updated only when there is a need.
10355 * When the cpu is attached to null domain for ex, it will not be
10356 * updated.
10357 */
7a82e5f5 10358 if (likely(update_next_balance))
af3fe03c
PZ
10359 rq->next_balance = next_balance;
10360
af3fe03c
PZ
10361}
10362
d987fc7f
MG
10363static inline int on_null_domain(struct rq *rq)
10364{
10365 return unlikely(!rcu_dereference_sched(rq->sd));
10366}
10367
3451d024 10368#ifdef CONFIG_NO_HZ_COMMON
83cd4fe2
VP
10369/*
10370 * idle load balancing details
83cd4fe2
VP
10371 * - When one of the busy CPUs notice that there may be an idle rebalancing
10372 * needed, they will kick the idle load balancer, which then does idle
10373 * load balancing for all the idle CPUs.
04d4e665 10374 * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED not set
9b019acb 10375 * anywhere yet.
83cd4fe2 10376 */
1e3c88bd 10377
3dd0337d 10378static inline int find_new_ilb(void)
1e3c88bd 10379{
9b019acb 10380 int ilb;
031e3bd8 10381 const struct cpumask *hk_mask;
1e3c88bd 10382
04d4e665 10383 hk_mask = housekeeping_cpumask(HK_TYPE_MISC);
1e3c88bd 10384
031e3bd8 10385 for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {
45da7a2b
PZ
10386
10387 if (ilb == smp_processor_id())
10388 continue;
10389
9b019acb
NP
10390 if (idle_cpu(ilb))
10391 return ilb;
10392 }
786d6dc7
SS
10393
10394 return nr_cpu_ids;
1e3c88bd 10395}
1e3c88bd 10396
83cd4fe2 10397/*
9b019acb 10398 * Kick a CPU to do the nohz balancing, if it is time for it. We pick any
04d4e665 10399 * idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
83cd4fe2 10400 */
a4064fb6 10401static void kick_ilb(unsigned int flags)
83cd4fe2
VP
10402{
10403 int ilb_cpu;
10404
3ea2f097
VG
10405 /*
10406 * Increase nohz.next_balance only when if full ilb is triggered but
10407 * not if we only update stats.
10408 */
10409 if (flags & NOHZ_BALANCE_KICK)
10410 nohz.next_balance = jiffies+1;
83cd4fe2 10411
3dd0337d 10412 ilb_cpu = find_new_ilb();
83cd4fe2 10413
0b005cf5
SS
10414 if (ilb_cpu >= nr_cpu_ids)
10415 return;
83cd4fe2 10416
19a1f5ec
PZ
10417 /*
10418 * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
10419 * the first flag owns it; cleared by nohz_csd_func().
10420 */
a4064fb6 10421 flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
b7031a02 10422 if (flags & NOHZ_KICK_MASK)
1c792db7 10423 return;
4550487a 10424
1c792db7 10425 /*
90b5363a 10426 * This way we generate an IPI on the target CPU which
1c792db7
SS
10427 * is idle. And the softirq performing nohz idle load balance
10428 * will be run before returning from the IPI.
10429 */
90b5363a 10430 smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
4550487a
PZ
10431}
10432
10433/*
9f132742
VS
10434 * Current decision point for kicking the idle load balancer in the presence
10435 * of idle CPUs in the system.
4550487a
PZ
10436 */
10437static void nohz_balancer_kick(struct rq *rq)
10438{
10439 unsigned long now = jiffies;
10440 struct sched_domain_shared *sds;
10441 struct sched_domain *sd;
10442 int nr_busy, i, cpu = rq->cpu;
a4064fb6 10443 unsigned int flags = 0;
4550487a
PZ
10444
10445 if (unlikely(rq->idle_balance))
10446 return;
10447
10448 /*
10449 * We may be recently in ticked or tickless idle mode. At the first
10450 * busy tick after returning from idle, we will update the busy stats.
10451 */
00357f5e 10452 nohz_balance_exit_idle(rq);
4550487a
PZ
10453
10454 /*
10455 * None are in tickless mode and hence no need for NOHZ idle load
10456 * balancing.
10457 */
10458 if (likely(!atomic_read(&nohz.nr_cpus)))
10459 return;
10460
f643ea22
VG
10461 if (READ_ONCE(nohz.has_blocked) &&
10462 time_after(now, READ_ONCE(nohz.next_blocked)))
a4064fb6
PZ
10463 flags = NOHZ_STATS_KICK;
10464
4550487a 10465 if (time_before(now, nohz.next_balance))
a4064fb6 10466 goto out;
4550487a 10467
a0fe2cf0 10468 if (rq->nr_running >= 2) {
efd984c4 10469 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
4550487a
PZ
10470 goto out;
10471 }
10472
10473 rcu_read_lock();
4550487a
PZ
10474
10475 sd = rcu_dereference(rq->sd);
10476 if (sd) {
e25a7a94
VS
10477 /*
10478 * If there's a CFS task and the current CPU has reduced
10479 * capacity; kick the ILB to see if there's a better CPU to run
10480 * on.
10481 */
10482 if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
efd984c4 10483 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
4550487a
PZ
10484 goto unlock;
10485 }
10486 }
10487
011b27bb 10488 sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
4550487a 10489 if (sd) {
b9a7b883
VS
10490 /*
10491 * When ASYM_PACKING; see if there's a more preferred CPU
10492 * currently idle; in which case, kick the ILB to move tasks
10493 * around.
10494 */
7edab78d 10495 for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
4550487a 10496 if (sched_asym_prefer(i, cpu)) {
efd984c4 10497 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
4550487a
PZ
10498 goto unlock;
10499 }
10500 }
10501 }
b9a7b883 10502
a0fe2cf0
VS
10503 sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
10504 if (sd) {
10505 /*
10506 * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
10507 * to run the misfit task on.
10508 */
10509 if (check_misfit_status(rq, sd)) {
efd984c4 10510 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
a0fe2cf0
VS
10511 goto unlock;
10512 }
b9a7b883
VS
10513
10514 /*
10515 * For asymmetric systems, we do not want to nicely balance
10516 * cache use, instead we want to embrace asymmetry and only
10517 * ensure tasks have enough CPU capacity.
10518 *
10519 * Skip the LLC logic because it's not relevant in that case.
10520 */
10521 goto unlock;
a0fe2cf0
VS
10522 }
10523
b9a7b883
VS
10524 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
10525 if (sds) {
e25a7a94 10526 /*
b9a7b883
VS
10527 * If there is an imbalance between LLC domains (IOW we could
10528 * increase the overall cache use), we need some less-loaded LLC
10529 * domain to pull some load. Likewise, we may need to spread
10530 * load within the current LLC domain (e.g. packed SMT cores but
10531 * other CPUs are idle). We can't really know from here how busy
10532 * the others are - so just get a nohz balance going if it looks
10533 * like this LLC domain has tasks we could move.
e25a7a94 10534 */
b9a7b883
VS
10535 nr_busy = atomic_read(&sds->nr_busy_cpus);
10536 if (nr_busy > 1) {
efd984c4 10537 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
b9a7b883 10538 goto unlock;
4550487a
PZ
10539 }
10540 }
10541unlock:
10542 rcu_read_unlock();
10543out:
7fd7a9e0
VS
10544 if (READ_ONCE(nohz.needs_update))
10545 flags |= NOHZ_NEXT_KICK;
10546
a4064fb6
PZ
10547 if (flags)
10548 kick_ilb(flags);
83cd4fe2
VP
10549}
10550
00357f5e 10551static void set_cpu_sd_state_busy(int cpu)
71325960 10552{
00357f5e 10553 struct sched_domain *sd;
a22e47a4 10554
00357f5e
PZ
10555 rcu_read_lock();
10556 sd = rcu_dereference(per_cpu(sd_llc, cpu));
a22e47a4 10557
00357f5e
PZ
10558 if (!sd || !sd->nohz_idle)
10559 goto unlock;
10560 sd->nohz_idle = 0;
10561
10562 atomic_inc(&sd->shared->nr_busy_cpus);
10563unlock:
10564 rcu_read_unlock();
71325960
SS
10565}
10566
00357f5e
PZ
10567void nohz_balance_exit_idle(struct rq *rq)
10568{
10569 SCHED_WARN_ON(rq != this_rq());
10570
10571 if (likely(!rq->nohz_tick_stopped))
10572 return;
10573
10574 rq->nohz_tick_stopped = 0;
10575 cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
10576 atomic_dec(&nohz.nr_cpus);
10577
10578 set_cpu_sd_state_busy(rq->cpu);
10579}
10580
10581static void set_cpu_sd_state_idle(int cpu)
69e1e811
SS
10582{
10583 struct sched_domain *sd;
69e1e811 10584
69e1e811 10585 rcu_read_lock();
0e369d75 10586 sd = rcu_dereference(per_cpu(sd_llc, cpu));
25f55d9d
VG
10587
10588 if (!sd || sd->nohz_idle)
10589 goto unlock;
10590 sd->nohz_idle = 1;
10591
0e369d75 10592 atomic_dec(&sd->shared->nr_busy_cpus);
25f55d9d 10593unlock:
69e1e811
SS
10594 rcu_read_unlock();
10595}
10596
1e3c88bd 10597/*
97fb7a0a 10598 * This routine will record that the CPU is going idle with tick stopped.
0b005cf5 10599 * This info will be used in performing idle load balancing in the future.
1e3c88bd 10600 */
c1cc017c 10601void nohz_balance_enter_idle(int cpu)
1e3c88bd 10602{
00357f5e
PZ
10603 struct rq *rq = cpu_rq(cpu);
10604
10605 SCHED_WARN_ON(cpu != smp_processor_id());
10606
97fb7a0a 10607 /* If this CPU is going down, then nothing needs to be done: */
71325960
SS
10608 if (!cpu_active(cpu))
10609 return;
10610
387bc8b5 10611 /* Spare idle load balancing on CPUs that don't want to be disturbed: */
04d4e665 10612 if (!housekeeping_cpu(cpu, HK_TYPE_SCHED))
387bc8b5
FW
10613 return;
10614
f643ea22
VG
10615 /*
10616 * Can be set safely without rq->lock held
10617 * If a clear happens, it will have evaluated last additions because
10618 * rq->lock is held during the check and the clear
10619 */
10620 rq->has_blocked_load = 1;
10621
10622 /*
10623 * The tick is still stopped but load could have been added in the
10624 * meantime. We set the nohz.has_blocked flag to trig a check of the
10625 * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
10626 * of nohz.has_blocked can only happen after checking the new load
10627 */
00357f5e 10628 if (rq->nohz_tick_stopped)
f643ea22 10629 goto out;
1e3c88bd 10630
97fb7a0a 10631 /* If we're a completely isolated CPU, we don't play: */
00357f5e 10632 if (on_null_domain(rq))
d987fc7f
MG
10633 return;
10634
00357f5e
PZ
10635 rq->nohz_tick_stopped = 1;
10636
c1cc017c
AS
10637 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
10638 atomic_inc(&nohz.nr_cpus);
00357f5e 10639
f643ea22
VG
10640 /*
10641 * Ensures that if nohz_idle_balance() fails to observe our
10642 * @idle_cpus_mask store, it must observe the @has_blocked
7fd7a9e0 10643 * and @needs_update stores.
f643ea22
VG
10644 */
10645 smp_mb__after_atomic();
10646
00357f5e 10647 set_cpu_sd_state_idle(cpu);
f643ea22 10648
7fd7a9e0 10649 WRITE_ONCE(nohz.needs_update, 1);
f643ea22
VG
10650out:
10651 /*
10652 * Each time a cpu enter idle, we assume that it has blocked load and
10653 * enable the periodic update of the load of idle cpus
10654 */
10655 WRITE_ONCE(nohz.has_blocked, 1);
1e3c88bd 10656}
1e3c88bd 10657
3f5ad914
Y
10658static bool update_nohz_stats(struct rq *rq)
10659{
10660 unsigned int cpu = rq->cpu;
10661
10662 if (!rq->has_blocked_load)
10663 return false;
10664
10665 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
10666 return false;
10667
10668 if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
10669 return true;
10670
10671 update_blocked_averages(cpu);
10672
10673 return rq->has_blocked_load;
10674}
10675
1e3c88bd 10676/*
31e77c93
VG
10677 * Internal function that runs load balance for all idle cpus. The load balance
10678 * can be a simple update of blocked load or a complete load balance with
10679 * tasks movement depending of flags.
1e3c88bd 10680 */
ab2dde5e 10681static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
31e77c93 10682 enum cpu_idle_type idle)
83cd4fe2 10683{
c5afb6a8 10684 /* Earliest time when we have to do rebalance again */
a4064fb6
PZ
10685 unsigned long now = jiffies;
10686 unsigned long next_balance = now + 60*HZ;
f643ea22 10687 bool has_blocked_load = false;
c5afb6a8 10688 int update_next_balance = 0;
b7031a02 10689 int this_cpu = this_rq->cpu;
b7031a02
PZ
10690 int balance_cpu;
10691 struct rq *rq;
83cd4fe2 10692
b7031a02 10693 SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
83cd4fe2 10694
f643ea22
VG
10695 /*
10696 * We assume there will be no idle load after this update and clear
10697 * the has_blocked flag. If a cpu enters idle in the mean time, it will
7fd7a9e0 10698 * set the has_blocked flag and trigger another update of idle load.
f643ea22
VG
10699 * Because a cpu that becomes idle, is added to idle_cpus_mask before
10700 * setting the flag, we are sure to not clear the state and not
10701 * check the load of an idle cpu.
7fd7a9e0
VS
10702 *
10703 * Same applies to idle_cpus_mask vs needs_update.
f643ea22 10704 */
efd984c4
VS
10705 if (flags & NOHZ_STATS_KICK)
10706 WRITE_ONCE(nohz.has_blocked, 0);
7fd7a9e0
VS
10707 if (flags & NOHZ_NEXT_KICK)
10708 WRITE_ONCE(nohz.needs_update, 0);
f643ea22
VG
10709
10710 /*
10711 * Ensures that if we miss the CPU, we must see the has_blocked
10712 * store from nohz_balance_enter_idle().
10713 */
10714 smp_mb();
10715
7a82e5f5
VG
10716 /*
10717 * Start with the next CPU after this_cpu so we will end with this_cpu and let a
10718 * chance for other idle cpu to pull load.
10719 */
10720 for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) {
10721 if (!idle_cpu(balance_cpu))
83cd4fe2
VP
10722 continue;
10723
10724 /*
97fb7a0a
IM
10725 * If this CPU gets work to do, stop the load balancing
10726 * work being done for other CPUs. Next load
83cd4fe2
VP
10727 * balancing owner will pick it up.
10728 */
f643ea22 10729 if (need_resched()) {
efd984c4
VS
10730 if (flags & NOHZ_STATS_KICK)
10731 has_blocked_load = true;
7fd7a9e0
VS
10732 if (flags & NOHZ_NEXT_KICK)
10733 WRITE_ONCE(nohz.needs_update, 1);
f643ea22
VG
10734 goto abort;
10735 }
83cd4fe2 10736
5ed4f1d9
VG
10737 rq = cpu_rq(balance_cpu);
10738
efd984c4
VS
10739 if (flags & NOHZ_STATS_KICK)
10740 has_blocked_load |= update_nohz_stats(rq);
f643ea22 10741
ed61bbc6
TC
10742 /*
10743 * If time for next balance is due,
10744 * do the balance.
10745 */
10746 if (time_after_eq(jiffies, rq->next_balance)) {
8a8c69c3
PZ
10747 struct rq_flags rf;
10748
31e77c93 10749 rq_lock_irqsave(rq, &rf);
ed61bbc6 10750 update_rq_clock(rq);
31e77c93 10751 rq_unlock_irqrestore(rq, &rf);
8a8c69c3 10752
b7031a02
PZ
10753 if (flags & NOHZ_BALANCE_KICK)
10754 rebalance_domains(rq, CPU_IDLE);
ed61bbc6 10755 }
83cd4fe2 10756
c5afb6a8
VG
10757 if (time_after(next_balance, rq->next_balance)) {
10758 next_balance = rq->next_balance;
10759 update_next_balance = 1;
10760 }
83cd4fe2 10761 }
c5afb6a8 10762
3ea2f097
VG
10763 /*
10764 * next_balance will be updated only when there is a need.
10765 * When the CPU is attached to null domain for ex, it will not be
10766 * updated.
10767 */
10768 if (likely(update_next_balance))
10769 nohz.next_balance = next_balance;
10770
efd984c4
VS
10771 if (flags & NOHZ_STATS_KICK)
10772 WRITE_ONCE(nohz.next_blocked,
10773 now + msecs_to_jiffies(LOAD_AVG_PERIOD));
f643ea22
VG
10774
10775abort:
10776 /* There is still blocked load, enable periodic update */
10777 if (has_blocked_load)
10778 WRITE_ONCE(nohz.has_blocked, 1);
31e77c93
VG
10779}
10780
10781/*
10782 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
10783 * rebalancing for all the cpus for whom scheduler ticks are stopped.
10784 */
10785static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
10786{
19a1f5ec 10787 unsigned int flags = this_rq->nohz_idle_balance;
31e77c93 10788
19a1f5ec 10789 if (!flags)
31e77c93
VG
10790 return false;
10791
19a1f5ec 10792 this_rq->nohz_idle_balance = 0;
31e77c93 10793
19a1f5ec 10794 if (idle != CPU_IDLE)
31e77c93
VG
10795 return false;
10796
10797 _nohz_idle_balance(this_rq, flags, idle);
10798
b7031a02 10799 return true;
83cd4fe2 10800}
31e77c93 10801
c6f88654
VG
10802/*
10803 * Check if we need to run the ILB for updating blocked load before entering
10804 * idle state.
10805 */
10806void nohz_run_idle_balance(int cpu)
10807{
10808 unsigned int flags;
10809
10810 flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu));
10811
10812 /*
10813 * Update the blocked load only if no SCHED_SOFTIRQ is about to happen
10814 * (ie NOHZ_STATS_KICK set) and will do the same.
10815 */
10816 if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
10817 _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK, CPU_IDLE);
10818}
10819
31e77c93
VG
10820static void nohz_newidle_balance(struct rq *this_rq)
10821{
10822 int this_cpu = this_rq->cpu;
10823
10824 /*
10825 * This CPU doesn't want to be disturbed by scheduler
10826 * housekeeping
10827 */
04d4e665 10828 if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED))
31e77c93
VG
10829 return;
10830
10831 /* Will wake up very soon. No time for doing anything else*/
10832 if (this_rq->avg_idle < sysctl_sched_migration_cost)
10833 return;
10834
10835 /* Don't need to update blocked load of idle CPUs*/
10836 if (!READ_ONCE(nohz.has_blocked) ||
10837 time_before(jiffies, READ_ONCE(nohz.next_blocked)))
10838 return;
10839
31e77c93 10840 /*
c6f88654
VG
10841 * Set the need to trigger ILB in order to update blocked load
10842 * before entering idle state.
31e77c93 10843 */
c6f88654 10844 atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
31e77c93
VG
10845}
10846
dd707247
PZ
10847#else /* !CONFIG_NO_HZ_COMMON */
10848static inline void nohz_balancer_kick(struct rq *rq) { }
10849
31e77c93 10850static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
b7031a02
PZ
10851{
10852 return false;
10853}
31e77c93
VG
10854
10855static inline void nohz_newidle_balance(struct rq *this_rq) { }
dd707247 10856#endif /* CONFIG_NO_HZ_COMMON */
83cd4fe2 10857
47ea5412 10858/*
5b78f2dc 10859 * newidle_balance is called by schedule() if this_cpu is about to become
47ea5412 10860 * idle. Attempts to pull tasks from other CPUs.
7277a34c
PZ
10861 *
10862 * Returns:
10863 * < 0 - we released the lock and there are !fair tasks present
10864 * 0 - failed, no new tasks
10865 * > 0 - success, new (fair) tasks present
47ea5412 10866 */
d91cecc1 10867static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
47ea5412
PZ
10868{
10869 unsigned long next_balance = jiffies + HZ;
10870 int this_cpu = this_rq->cpu;
9e9af819 10871 u64 t0, t1, curr_cost = 0;
47ea5412
PZ
10872 struct sched_domain *sd;
10873 int pulled_task = 0;
47ea5412 10874
5ba553ef 10875 update_misfit_status(NULL, this_rq);
e5e678e4
RR
10876
10877 /*
10878 * There is a task waiting to run. No need to search for one.
10879 * Return 0; the task will be enqueued when switching to idle.
10880 */
10881 if (this_rq->ttwu_pending)
10882 return 0;
10883
47ea5412
PZ
10884 /*
10885 * We must set idle_stamp _before_ calling idle_balance(), such that we
10886 * measure the duration of idle_balance() as idle time.
10887 */
10888 this_rq->idle_stamp = rq_clock(this_rq);
10889
10890 /*
10891 * Do not pull tasks towards !active CPUs...
10892 */
10893 if (!cpu_active(this_cpu))
10894 return 0;
10895
10896 /*
10897 * This is OK, because current is on_cpu, which avoids it being picked
10898 * for load-balance and preemption/IRQs are still disabled avoiding
10899 * further scheduler activity on it and we're being very careful to
10900 * re-start the picking loop.
10901 */
10902 rq_unpin_lock(this_rq, rf);
10903
9d783c8d
VG
10904 rcu_read_lock();
10905 sd = rcu_dereference_check_sched_domain(this_rq->sd);
10906
c5b0a7ee 10907 if (!READ_ONCE(this_rq->rd->overload) ||
9d783c8d 10908 (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
31e77c93 10909
47ea5412
PZ
10910 if (sd)
10911 update_next_balance(sd, &next_balance);
10912 rcu_read_unlock();
10913
10914 goto out;
10915 }
9d783c8d 10916 rcu_read_unlock();
47ea5412 10917
5cb9eaa3 10918 raw_spin_rq_unlock(this_rq);
47ea5412 10919
9e9af819 10920 t0 = sched_clock_cpu(this_cpu);
47ea5412 10921 update_blocked_averages(this_cpu);
9e9af819 10922
47ea5412
PZ
10923 rcu_read_lock();
10924 for_each_domain(this_cpu, sd) {
10925 int continue_balancing = 1;
9e9af819 10926 u64 domain_cost;
47ea5412 10927
8ea9183d
VG
10928 update_next_balance(sd, &next_balance);
10929
10930 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
47ea5412 10931 break;
47ea5412
PZ
10932
10933 if (sd->flags & SD_BALANCE_NEWIDLE) {
47ea5412
PZ
10934
10935 pulled_task = load_balance(this_cpu, this_rq,
10936 sd, CPU_NEWLY_IDLE,
10937 &continue_balancing);
10938
9e9af819
VG
10939 t1 = sched_clock_cpu(this_cpu);
10940 domain_cost = t1 - t0;
e60b56e4 10941 update_newidle_cost(sd, domain_cost);
47ea5412
PZ
10942
10943 curr_cost += domain_cost;
9e9af819 10944 t0 = t1;
47ea5412
PZ
10945 }
10946
47ea5412
PZ
10947 /*
10948 * Stop searching for tasks to pull if there are
10949 * now runnable tasks on this rq.
10950 */
e5e678e4
RR
10951 if (pulled_task || this_rq->nr_running > 0 ||
10952 this_rq->ttwu_pending)
47ea5412
PZ
10953 break;
10954 }
10955 rcu_read_unlock();
10956
5cb9eaa3 10957 raw_spin_rq_lock(this_rq);
47ea5412
PZ
10958
10959 if (curr_cost > this_rq->max_idle_balance_cost)
10960 this_rq->max_idle_balance_cost = curr_cost;
10961
10962 /*
10963 * While browsing the domains, we released the rq lock, a task could
10964 * have been enqueued in the meantime. Since we're not going idle,
10965 * pretend we pulled a task.
10966 */
10967 if (this_rq->cfs.h_nr_running && !pulled_task)
10968 pulled_task = 1;
10969
47ea5412
PZ
10970 /* Is there a task of a high priority class? */
10971 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
10972 pulled_task = -1;
10973
6553fc18
VG
10974out:
10975 /* Move the next balance forward */
10976 if (time_after(this_rq->next_balance, next_balance))
10977 this_rq->next_balance = next_balance;
10978
47ea5412
PZ
10979 if (pulled_task)
10980 this_rq->idle_stamp = 0;
0826530d
VG
10981 else
10982 nohz_newidle_balance(this_rq);
47ea5412
PZ
10983
10984 rq_repin_lock(this_rq, rf);
10985
10986 return pulled_task;
10987}
10988
83cd4fe2
VP
10989/*
10990 * run_rebalance_domains is triggered when needed from the scheduler tick.
10991 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
10992 */
0766f788 10993static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
1e3c88bd 10994{
208cb16b 10995 struct rq *this_rq = this_rq();
6eb57e0d 10996 enum cpu_idle_type idle = this_rq->idle_balance ?
1e3c88bd
PZ
10997 CPU_IDLE : CPU_NOT_IDLE;
10998
1e3c88bd 10999 /*
97fb7a0a
IM
11000 * If this CPU has a pending nohz_balance_kick, then do the
11001 * balancing on behalf of the other idle CPUs whose ticks are
d4573c3e 11002 * stopped. Do nohz_idle_balance *before* rebalance_domains to
97fb7a0a 11003 * give the idle CPUs a chance to load balance. Else we may
d4573c3e
PM
11004 * load balance only within the local sched_domain hierarchy
11005 * and abort nohz_idle_balance altogether if we pull some load.
1e3c88bd 11006 */
b7031a02
PZ
11007 if (nohz_idle_balance(this_rq, idle))
11008 return;
11009
11010 /* normal load balance */
11011 update_blocked_averages(this_rq->cpu);
d4573c3e 11012 rebalance_domains(this_rq, idle);
1e3c88bd
PZ
11013}
11014
1e3c88bd
PZ
11015/*
11016 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
1e3c88bd 11017 */
7caff66f 11018void trigger_load_balance(struct rq *rq)
1e3c88bd 11019{
e0b257c3
AMB
11020 /*
11021 * Don't need to rebalance while attached to NULL domain or
11022 * runqueue CPU is not active
11023 */
11024 if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq))))
c726099e
DL
11025 return;
11026
11027 if (time_after_eq(jiffies, rq->next_balance))
1e3c88bd 11028 raise_softirq(SCHED_SOFTIRQ);
4550487a
PZ
11029
11030 nohz_balancer_kick(rq);
1e3c88bd
PZ
11031}
11032
0bcdcf28
CE
11033static void rq_online_fair(struct rq *rq)
11034{
11035 update_sysctl();
0e59bdae
KT
11036
11037 update_runtime_enabled(rq);
0bcdcf28
CE
11038}
11039
11040static void rq_offline_fair(struct rq *rq)
11041{
11042 update_sysctl();
a4c96ae3
PB
11043
11044 /* Ensure any throttled groups are reachable by pick_next_task */
11045 unthrottle_offline_cfs_rqs(rq);
0bcdcf28
CE
11046}
11047
55e12e5e 11048#endif /* CONFIG_SMP */
e1d1484f 11049
8039e96f
VP
11050#ifdef CONFIG_SCHED_CORE
11051static inline bool
11052__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
11053{
11054 u64 slice = sched_slice(cfs_rq_of(se), se);
11055 u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
11056
11057 return (rtime * min_nr_tasks > slice);
11058}
11059
11060#define MIN_NR_TASKS_DURING_FORCEIDLE 2
11061static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
11062{
11063 if (!sched_core_enabled(rq))
11064 return;
11065
11066 /*
11067 * If runqueue has only one task which used up its slice and
11068 * if the sibling is forced idle, then trigger schedule to
11069 * give forced idle task a chance.
11070 *
11071 * sched_slice() considers only this active rq and it gets the
11072 * whole slice. But during force idle, we have siblings acting
11073 * like a single runqueue and hence we need to consider runnable
cc00c198 11074 * tasks on this CPU and the forced idle CPU. Ideally, we should
8039e96f 11075 * go through the forced idle rq, but that would be a perf hit.
cc00c198 11076 * We can assume that the forced idle CPU has at least
8039e96f 11077 * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
cc00c198 11078 * if we need to give up the CPU.
8039e96f 11079 */
4feee7d1 11080 if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 &&
8039e96f
VP
11081 __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
11082 resched_curr(rq);
11083}
c6047c2e
JFG
11084
11085/*
11086 * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
11087 */
11088static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle)
11089{
11090 for_each_sched_entity(se) {
11091 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11092
11093 if (forceidle) {
11094 if (cfs_rq->forceidle_seq == fi_seq)
11095 break;
11096 cfs_rq->forceidle_seq = fi_seq;
11097 }
11098
11099 cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
11100 }
11101}
11102
11103void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)
11104{
11105 struct sched_entity *se = &p->se;
11106
11107 if (p->sched_class != &fair_sched_class)
11108 return;
11109
11110 se_fi_update(se, rq->core->core_forceidle_seq, in_fi);
11111}
11112
11113bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
11114{
11115 struct rq *rq = task_rq(a);
11116 struct sched_entity *sea = &a->se;
11117 struct sched_entity *seb = &b->se;
11118 struct cfs_rq *cfs_rqa;
11119 struct cfs_rq *cfs_rqb;
11120 s64 delta;
11121
11122 SCHED_WARN_ON(task_rq(b)->core != rq->core);
11123
11124#ifdef CONFIG_FAIR_GROUP_SCHED
11125 /*
11126 * Find an se in the hierarchy for tasks a and b, such that the se's
11127 * are immediate siblings.
11128 */
11129 while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
11130 int sea_depth = sea->depth;
11131 int seb_depth = seb->depth;
11132
11133 if (sea_depth >= seb_depth)
11134 sea = parent_entity(sea);
11135 if (sea_depth <= seb_depth)
11136 seb = parent_entity(seb);
11137 }
11138
11139 se_fi_update(sea, rq->core->core_forceidle_seq, in_fi);
11140 se_fi_update(seb, rq->core->core_forceidle_seq, in_fi);
11141
11142 cfs_rqa = sea->cfs_rq;
11143 cfs_rqb = seb->cfs_rq;
11144#else
11145 cfs_rqa = &task_rq(a)->cfs;
11146 cfs_rqb = &task_rq(b)->cfs;
11147#endif
11148
11149 /*
11150 * Find delta after normalizing se's vruntime with its cfs_rq's
11151 * min_vruntime_fi, which would have been updated in prior calls
11152 * to se_fi_update().
11153 */
11154 delta = (s64)(sea->vruntime - seb->vruntime) +
11155 (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
11156
11157 return delta > 0;
11158}
8039e96f
VP
11159#else
11160static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
11161#endif
11162
bf0f6f24 11163/*
d84b3131
FW
11164 * scheduler tick hitting a task of our scheduling class.
11165 *
11166 * NOTE: This function can be called remotely by the tick offload that
11167 * goes along full dynticks. Therefore no local assumption can be made
11168 * and everything must be accessed through the @rq and @curr passed in
11169 * parameters.
bf0f6f24 11170 */
8f4d37ec 11171static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
bf0f6f24
IM
11172{
11173 struct cfs_rq *cfs_rq;
11174 struct sched_entity *se = &curr->se;
11175
11176 for_each_sched_entity(se) {
11177 cfs_rq = cfs_rq_of(se);
8f4d37ec 11178 entity_tick(cfs_rq, se, queued);
bf0f6f24 11179 }
18bf2805 11180
b52da86e 11181 if (static_branch_unlikely(&sched_numa_balancing))
cbee9f88 11182 task_tick_numa(rq, curr);
3b1baa64
MR
11183
11184 update_misfit_status(curr, rq);
2802bf3c 11185 update_overutilized_status(task_rq(curr));
8039e96f
VP
11186
11187 task_tick_core(rq, curr);
bf0f6f24
IM
11188}
11189
11190/*
cd29fe6f
PZ
11191 * called on fork with the child task as argument from the parent's context
11192 * - child not yet on the tasklist
11193 * - preemption disabled
bf0f6f24 11194 */
cd29fe6f 11195static void task_fork_fair(struct task_struct *p)
bf0f6f24 11196{
4fc420c9
DN
11197 struct cfs_rq *cfs_rq;
11198 struct sched_entity *se = &p->se, *curr;
cd29fe6f 11199 struct rq *rq = this_rq();
8a8c69c3 11200 struct rq_flags rf;
bf0f6f24 11201
8a8c69c3 11202 rq_lock(rq, &rf);
861d034e
PZ
11203 update_rq_clock(rq);
11204
4fc420c9
DN
11205 cfs_rq = task_cfs_rq(current);
11206 curr = cfs_rq->curr;
e210bffd
PZ
11207 if (curr) {
11208 update_curr(cfs_rq);
b5d9d734 11209 se->vruntime = curr->vruntime;
e210bffd 11210 }
aeb73b04 11211 place_entity(cfs_rq, se, 1);
4d78e7b6 11212
cd29fe6f 11213 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
87fefa38 11214 /*
edcb60a3
IM
11215 * Upon rescheduling, sched_class::put_prev_task() will place
11216 * 'current' within the tree based on its new key value.
11217 */
4d78e7b6 11218 swap(curr->vruntime, se->vruntime);
8875125e 11219 resched_curr(rq);
4d78e7b6 11220 }
bf0f6f24 11221
88ec22d3 11222 se->vruntime -= cfs_rq->min_vruntime;
8a8c69c3 11223 rq_unlock(rq, &rf);
bf0f6f24
IM
11224}
11225
cb469845
SR
11226/*
11227 * Priority of the task has changed. Check to see if we preempt
11228 * the current task.
11229 */
da7a735e
PZ
11230static void
11231prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
cb469845 11232{
da0c1e65 11233 if (!task_on_rq_queued(p))
da7a735e
PZ
11234 return;
11235
7c2e8bbd
FW
11236 if (rq->cfs.nr_running == 1)
11237 return;
11238
cb469845
SR
11239 /*
11240 * Reschedule if we are currently running on this runqueue and
11241 * our priority decreased, or if we are not currently running on
11242 * this runqueue and our priority is higher than the current's
11243 */
65bcf072 11244 if (task_current(rq, p)) {
cb469845 11245 if (p->prio > oldprio)
8875125e 11246 resched_curr(rq);
cb469845 11247 } else
15afe09b 11248 check_preempt_curr(rq, p, 0);
cb469845
SR
11249}
11250
daa59407 11251static inline bool vruntime_normalized(struct task_struct *p)
da7a735e
PZ
11252{
11253 struct sched_entity *se = &p->se;
da7a735e
PZ
11254
11255 /*
daa59407
BP
11256 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
11257 * the dequeue_entity(.flags=0) will already have normalized the
11258 * vruntime.
11259 */
11260 if (p->on_rq)
11261 return true;
11262
11263 /*
11264 * When !on_rq, vruntime of the task has usually NOT been normalized.
11265 * But there are some cases where it has already been normalized:
da7a735e 11266 *
daa59407
BP
11267 * - A forked child which is waiting for being woken up by
11268 * wake_up_new_task().
11269 * - A task which has been woken up by try_to_wake_up() and
11270 * waiting for actually being woken up by sched_ttwu_pending().
da7a735e 11271 */
d0cdb3ce 11272 if (!se->sum_exec_runtime ||
2f064a59 11273 (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup))
daa59407
BP
11274 return true;
11275
11276 return false;
11277}
11278
09a43ace
VG
11279#ifdef CONFIG_FAIR_GROUP_SCHED
11280/*
11281 * Propagate the changes of the sched_entity across the tg tree to make it
11282 * visible to the root
11283 */
11284static void propagate_entity_cfs_rq(struct sched_entity *se)
11285{
51bf903b
CZ
11286 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11287
11288 if (cfs_rq_throttled(cfs_rq))
11289 return;
09a43ace 11290
51bf903b
CZ
11291 if (!throttled_hierarchy(cfs_rq))
11292 list_add_leaf_cfs_rq(cfs_rq);
0258bdfa 11293
09a43ace
VG
11294 /* Start to propagate at parent */
11295 se = se->parent;
11296
11297 for_each_sched_entity(se) {
11298 cfs_rq = cfs_rq_of(se);
11299
51bf903b 11300 update_load_avg(cfs_rq, se, UPDATE_TG);
09a43ace 11301
51bf903b 11302 if (cfs_rq_throttled(cfs_rq))
0258bdfa 11303 break;
51bf903b
CZ
11304
11305 if (!throttled_hierarchy(cfs_rq))
11306 list_add_leaf_cfs_rq(cfs_rq);
09a43ace
VG
11307 }
11308}
11309#else
11310static void propagate_entity_cfs_rq(struct sched_entity *se) { }
11311#endif
11312
df217913 11313static void detach_entity_cfs_rq(struct sched_entity *se)
daa59407 11314{
daa59407
BP
11315 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11316
9d89c257 11317 /* Catch up with the cfs_rq and remove our load when we leave */
88c0616e 11318 update_load_avg(cfs_rq, se, 0);
a05e8c51 11319 detach_entity_load_avg(cfs_rq, se);
fe749158 11320 update_tg_load_avg(cfs_rq);
09a43ace 11321 propagate_entity_cfs_rq(se);
da7a735e
PZ
11322}
11323
df217913 11324static void attach_entity_cfs_rq(struct sched_entity *se)
cb469845 11325{
daa59407 11326 struct cfs_rq *cfs_rq = cfs_rq_of(se);
7855a35a
BP
11327
11328#ifdef CONFIG_FAIR_GROUP_SCHED
eb7a59b2
M
11329 /*
11330 * Since the real-depth could have been changed (only FAIR
11331 * class maintain depth value), reset depth properly.
11332 */
11333 se->depth = se->parent ? se->parent->depth + 1 : 0;
11334#endif
7855a35a 11335
df217913 11336 /* Synchronize entity with its cfs_rq */
88c0616e 11337 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
a4f9a0e5 11338 attach_entity_load_avg(cfs_rq, se);
fe749158 11339 update_tg_load_avg(cfs_rq);
09a43ace 11340 propagate_entity_cfs_rq(se);
df217913
VG
11341}
11342
11343static void detach_task_cfs_rq(struct task_struct *p)
11344{
11345 struct sched_entity *se = &p->se;
11346 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11347
11348 if (!vruntime_normalized(p)) {
11349 /*
11350 * Fix up our vruntime so that the current sleep doesn't
11351 * cause 'unlimited' sleep bonus.
11352 */
11353 place_entity(cfs_rq, se, 0);
11354 se->vruntime -= cfs_rq->min_vruntime;
11355 }
11356
11357 detach_entity_cfs_rq(se);
11358}
11359
11360static void attach_task_cfs_rq(struct task_struct *p)
11361{
11362 struct sched_entity *se = &p->se;
11363 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11364
11365 attach_entity_cfs_rq(se);
daa59407
BP
11366
11367 if (!vruntime_normalized(p))
11368 se->vruntime += cfs_rq->min_vruntime;
11369}
6efdb105 11370
daa59407
BP
11371static void switched_from_fair(struct rq *rq, struct task_struct *p)
11372{
11373 detach_task_cfs_rq(p);
11374}
11375
11376static void switched_to_fair(struct rq *rq, struct task_struct *p)
11377{
11378 attach_task_cfs_rq(p);
7855a35a 11379
daa59407 11380 if (task_on_rq_queued(p)) {
7855a35a 11381 /*
daa59407
BP
11382 * We were most likely switched from sched_rt, so
11383 * kick off the schedule if running, otherwise just see
11384 * if we can still preempt the current task.
7855a35a 11385 */
65bcf072 11386 if (task_current(rq, p))
daa59407
BP
11387 resched_curr(rq);
11388 else
11389 check_preempt_curr(rq, p, 0);
7855a35a 11390 }
cb469845
SR
11391}
11392
83b699ed
SV
11393/* Account for a task changing its policy or group.
11394 *
11395 * This routine is mostly called to set cfs_rq->curr field when a task
11396 * migrates between groups/classes.
11397 */
a0e813f2 11398static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
83b699ed 11399{
03b7fad1
PZ
11400 struct sched_entity *se = &p->se;
11401
11402#ifdef CONFIG_SMP
11403 if (task_on_rq_queued(p)) {
11404 /*
11405 * Move the next running task to the front of the list, so our
11406 * cfs_tasks list becomes MRU one.
11407 */
11408 list_move(&se->group_node, &rq->cfs_tasks);
11409 }
11410#endif
83b699ed 11411
ec12cb7f
PT
11412 for_each_sched_entity(se) {
11413 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11414
11415 set_next_entity(cfs_rq, se);
11416 /* ensure bandwidth has been allocated on our new cfs_rq */
11417 account_cfs_rq_runtime(cfs_rq, 0);
11418 }
83b699ed
SV
11419}
11420
029632fb
PZ
11421void init_cfs_rq(struct cfs_rq *cfs_rq)
11422{
bfb06889 11423 cfs_rq->tasks_timeline = RB_ROOT_CACHED;
029632fb
PZ
11424 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
11425#ifndef CONFIG_64BIT
11426 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
11427#endif
141965c7 11428#ifdef CONFIG_SMP
2a2f5d4e 11429 raw_spin_lock_init(&cfs_rq->removed.lock);
9ee474f5 11430#endif
029632fb
PZ
11431}
11432
810b3817 11433#ifdef CONFIG_FAIR_GROUP_SCHED
ea86cb4b
VG
11434static void task_set_group_fair(struct task_struct *p)
11435{
11436 struct sched_entity *se = &p->se;
11437
11438 set_task_rq(p, task_cpu(p));
11439 se->depth = se->parent ? se->parent->depth + 1 : 0;
11440}
11441
bc54da21 11442static void task_move_group_fair(struct task_struct *p)
810b3817 11443{
daa59407 11444 detach_task_cfs_rq(p);
b2b5ce02 11445 set_task_rq(p, task_cpu(p));
6efdb105
BP
11446
11447#ifdef CONFIG_SMP
11448 /* Tell se's cfs_rq has been changed -- migrated */
11449 p->se.avg.last_update_time = 0;
11450#endif
daa59407 11451 attach_task_cfs_rq(p);
810b3817 11452}
029632fb 11453
ea86cb4b
VG
11454static void task_change_group_fair(struct task_struct *p, int type)
11455{
11456 switch (type) {
11457 case TASK_SET_GROUP:
11458 task_set_group_fair(p);
11459 break;
11460
11461 case TASK_MOVE_GROUP:
11462 task_move_group_fair(p);
11463 break;
11464 }
11465}
11466
029632fb
PZ
11467void free_fair_sched_group(struct task_group *tg)
11468{
11469 int i;
11470
029632fb
PZ
11471 for_each_possible_cpu(i) {
11472 if (tg->cfs_rq)
11473 kfree(tg->cfs_rq[i]);
6fe1f348 11474 if (tg->se)
029632fb
PZ
11475 kfree(tg->se[i]);
11476 }
11477
11478 kfree(tg->cfs_rq);
11479 kfree(tg->se);
11480}
11481
11482int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
11483{
029632fb 11484 struct sched_entity *se;
b7fa30c9 11485 struct cfs_rq *cfs_rq;
029632fb
PZ
11486 int i;
11487
6396bb22 11488 tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
029632fb
PZ
11489 if (!tg->cfs_rq)
11490 goto err;
6396bb22 11491 tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
029632fb
PZ
11492 if (!tg->se)
11493 goto err;
11494
11495 tg->shares = NICE_0_LOAD;
11496
11497 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
11498
11499 for_each_possible_cpu(i) {
11500 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
11501 GFP_KERNEL, cpu_to_node(i));
11502 if (!cfs_rq)
11503 goto err;
11504
ceeadb83 11505 se = kzalloc_node(sizeof(struct sched_entity_stats),
029632fb
PZ
11506 GFP_KERNEL, cpu_to_node(i));
11507 if (!se)
11508 goto err_free_rq;
11509
11510 init_cfs_rq(cfs_rq);
11511 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
540247fb 11512 init_entity_runnable_average(se);
029632fb
PZ
11513 }
11514
11515 return 1;
11516
11517err_free_rq:
11518 kfree(cfs_rq);
11519err:
11520 return 0;
11521}
11522
8663e24d
PZ
11523void online_fair_sched_group(struct task_group *tg)
11524{
11525 struct sched_entity *se;
a46d14ec 11526 struct rq_flags rf;
8663e24d
PZ
11527 struct rq *rq;
11528 int i;
11529
11530 for_each_possible_cpu(i) {
11531 rq = cpu_rq(i);
11532 se = tg->se[i];
a46d14ec 11533 rq_lock_irq(rq, &rf);
4126bad6 11534 update_rq_clock(rq);
d0326691 11535 attach_entity_cfs_rq(se);
55e16d30 11536 sync_throttle(tg, i);
a46d14ec 11537 rq_unlock_irq(rq, &rf);
8663e24d
PZ
11538 }
11539}
11540
6fe1f348 11541void unregister_fair_sched_group(struct task_group *tg)
029632fb 11542{
029632fb 11543 unsigned long flags;
6fe1f348
PZ
11544 struct rq *rq;
11545 int cpu;
029632fb 11546
b027789e
MK
11547 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
11548
6fe1f348
PZ
11549 for_each_possible_cpu(cpu) {
11550 if (tg->se[cpu])
11551 remove_entity_load_avg(tg->se[cpu]);
029632fb 11552
6fe1f348
PZ
11553 /*
11554 * Only empty task groups can be destroyed; so we can speculatively
11555 * check on_list without danger of it being re-added.
11556 */
11557 if (!tg->cfs_rq[cpu]->on_list)
11558 continue;
11559
11560 rq = cpu_rq(cpu);
11561
5cb9eaa3 11562 raw_spin_rq_lock_irqsave(rq, flags);
6fe1f348 11563 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
5cb9eaa3 11564 raw_spin_rq_unlock_irqrestore(rq, flags);
6fe1f348 11565 }
029632fb
PZ
11566}
11567
11568void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
11569 struct sched_entity *se, int cpu,
11570 struct sched_entity *parent)
11571{
11572 struct rq *rq = cpu_rq(cpu);
11573
11574 cfs_rq->tg = tg;
11575 cfs_rq->rq = rq;
029632fb
PZ
11576 init_cfs_rq_runtime(cfs_rq);
11577
11578 tg->cfs_rq[cpu] = cfs_rq;
11579 tg->se[cpu] = se;
11580
11581 /* se could be NULL for root_task_group */
11582 if (!se)
11583 return;
11584
fed14d45 11585 if (!parent) {
029632fb 11586 se->cfs_rq = &rq->cfs;
fed14d45
PZ
11587 se->depth = 0;
11588 } else {
029632fb 11589 se->cfs_rq = parent->my_q;
fed14d45
PZ
11590 se->depth = parent->depth + 1;
11591 }
029632fb
PZ
11592
11593 se->my_q = cfs_rq;
0ac9b1c2
PT
11594 /* guarantee group entities always have weight */
11595 update_load_set(&se->load, NICE_0_LOAD);
029632fb
PZ
11596 se->parent = parent;
11597}
11598
11599static DEFINE_MUTEX(shares_mutex);
11600
30400039 11601static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
029632fb
PZ
11602{
11603 int i;
029632fb 11604
30400039
JD
11605 lockdep_assert_held(&shares_mutex);
11606
029632fb
PZ
11607 /*
11608 * We can't change the weight of the root cgroup.
11609 */
11610 if (!tg->se[0])
11611 return -EINVAL;
11612
11613 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
11614
029632fb 11615 if (tg->shares == shares)
30400039 11616 return 0;
029632fb
PZ
11617
11618 tg->shares = shares;
11619 for_each_possible_cpu(i) {
11620 struct rq *rq = cpu_rq(i);
8a8c69c3
PZ
11621 struct sched_entity *se = tg->se[i];
11622 struct rq_flags rf;
029632fb 11623
029632fb 11624 /* Propagate contribution to hierarchy */
8a8c69c3 11625 rq_lock_irqsave(rq, &rf);
71b1da46 11626 update_rq_clock(rq);
89ee048f 11627 for_each_sched_entity(se) {
88c0616e 11628 update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
1ea6c46a 11629 update_cfs_group(se);
89ee048f 11630 }
8a8c69c3 11631 rq_unlock_irqrestore(rq, &rf);
029632fb
PZ
11632 }
11633
30400039
JD
11634 return 0;
11635}
11636
11637int sched_group_set_shares(struct task_group *tg, unsigned long shares)
11638{
11639 int ret;
11640
11641 mutex_lock(&shares_mutex);
11642 if (tg_is_idle(tg))
11643 ret = -EINVAL;
11644 else
11645 ret = __sched_group_set_shares(tg, shares);
11646 mutex_unlock(&shares_mutex);
11647
11648 return ret;
11649}
11650
11651int sched_group_set_idle(struct task_group *tg, long idle)
11652{
11653 int i;
11654
11655 if (tg == &root_task_group)
11656 return -EINVAL;
11657
11658 if (idle < 0 || idle > 1)
11659 return -EINVAL;
11660
11661 mutex_lock(&shares_mutex);
11662
11663 if (tg->idle == idle) {
11664 mutex_unlock(&shares_mutex);
11665 return 0;
11666 }
11667
11668 tg->idle = idle;
11669
11670 for_each_possible_cpu(i) {
11671 struct rq *rq = cpu_rq(i);
11672 struct sched_entity *se = tg->se[i];
a480adde 11673 struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i];
30400039
JD
11674 bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
11675 long idle_task_delta;
11676 struct rq_flags rf;
11677
11678 rq_lock_irqsave(rq, &rf);
11679
11680 grp_cfs_rq->idle = idle;
11681 if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
11682 goto next_cpu;
11683
a480adde
JD
11684 if (se->on_rq) {
11685 parent_cfs_rq = cfs_rq_of(se);
11686 if (cfs_rq_is_idle(grp_cfs_rq))
11687 parent_cfs_rq->idle_nr_running++;
11688 else
11689 parent_cfs_rq->idle_nr_running--;
11690 }
11691
30400039
JD
11692 idle_task_delta = grp_cfs_rq->h_nr_running -
11693 grp_cfs_rq->idle_h_nr_running;
11694 if (!cfs_rq_is_idle(grp_cfs_rq))
11695 idle_task_delta *= -1;
11696
11697 for_each_sched_entity(se) {
11698 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11699
11700 if (!se->on_rq)
11701 break;
11702
11703 cfs_rq->idle_h_nr_running += idle_task_delta;
11704
11705 /* Already accounted at parent level and above. */
11706 if (cfs_rq_is_idle(cfs_rq))
11707 break;
11708 }
11709
11710next_cpu:
11711 rq_unlock_irqrestore(rq, &rf);
11712 }
11713
11714 /* Idle groups have minimum weight. */
11715 if (tg_is_idle(tg))
11716 __sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO));
11717 else
11718 __sched_group_set_shares(tg, NICE_0_LOAD);
11719
029632fb
PZ
11720 mutex_unlock(&shares_mutex);
11721 return 0;
11722}
30400039 11723
029632fb
PZ
11724#else /* CONFIG_FAIR_GROUP_SCHED */
11725
11726void free_fair_sched_group(struct task_group *tg) { }
11727
11728int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
11729{
11730 return 1;
11731}
11732
8663e24d
PZ
11733void online_fair_sched_group(struct task_group *tg) { }
11734
6fe1f348 11735void unregister_fair_sched_group(struct task_group *tg) { }
029632fb
PZ
11736
11737#endif /* CONFIG_FAIR_GROUP_SCHED */
11738
810b3817 11739
6d686f45 11740static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
0d721cea
PW
11741{
11742 struct sched_entity *se = &task->se;
0d721cea
PW
11743 unsigned int rr_interval = 0;
11744
11745 /*
11746 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
11747 * idle runqueue:
11748 */
0d721cea 11749 if (rq->cfs.load.weight)
a59f4e07 11750 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
0d721cea
PW
11751
11752 return rr_interval;
11753}
11754
bf0f6f24
IM
11755/*
11756 * All the scheduling class methods:
11757 */
43c31ac0
PZ
11758DEFINE_SCHED_CLASS(fair) = {
11759
bf0f6f24
IM
11760 .enqueue_task = enqueue_task_fair,
11761 .dequeue_task = dequeue_task_fair,
11762 .yield_task = yield_task_fair,
d95f4122 11763 .yield_to_task = yield_to_task_fair,
bf0f6f24 11764
2e09bf55 11765 .check_preempt_curr = check_preempt_wakeup,
bf0f6f24 11766
98c2f700 11767 .pick_next_task = __pick_next_task_fair,
bf0f6f24 11768 .put_prev_task = put_prev_task_fair,
03b7fad1 11769 .set_next_task = set_next_task_fair,
bf0f6f24 11770
681f3e68 11771#ifdef CONFIG_SMP
6e2df058 11772 .balance = balance_fair,
21f56ffe 11773 .pick_task = pick_task_fair,
4ce72a2c 11774 .select_task_rq = select_task_rq_fair,
0a74bef8 11775 .migrate_task_rq = migrate_task_rq_fair,
141965c7 11776
0bcdcf28
CE
11777 .rq_online = rq_online_fair,
11778 .rq_offline = rq_offline_fair,
88ec22d3 11779
12695578 11780 .task_dead = task_dead_fair,
c5b28038 11781 .set_cpus_allowed = set_cpus_allowed_common,
681f3e68 11782#endif
bf0f6f24 11783
bf0f6f24 11784 .task_tick = task_tick_fair,
cd29fe6f 11785 .task_fork = task_fork_fair,
cb469845
SR
11786
11787 .prio_changed = prio_changed_fair,
da7a735e 11788 .switched_from = switched_from_fair,
cb469845 11789 .switched_to = switched_to_fair,
810b3817 11790
0d721cea
PW
11791 .get_rr_interval = get_rr_interval_fair,
11792
6e998916
SG
11793 .update_curr = update_curr_fair,
11794
810b3817 11795#ifdef CONFIG_FAIR_GROUP_SCHED
ea86cb4b 11796 .task_change_group = task_change_group_fair,
810b3817 11797#endif
982d9cdc
PB
11798
11799#ifdef CONFIG_UCLAMP_TASK
11800 .uclamp_enabled = 1,
11801#endif
bf0f6f24
IM
11802};
11803
11804#ifdef CONFIG_SCHED_DEBUG
029632fb 11805void print_cfs_stats(struct seq_file *m, int cpu)
bf0f6f24 11806{
039ae8bc 11807 struct cfs_rq *cfs_rq, *pos;
bf0f6f24 11808
5973e5b9 11809 rcu_read_lock();
039ae8bc 11810 for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
5cef9eca 11811 print_cfs_rq(m, cpu, cfs_rq);
5973e5b9 11812 rcu_read_unlock();
bf0f6f24 11813}
397f2378
SD
11814
11815#ifdef CONFIG_NUMA_BALANCING
11816void show_numa_stats(struct task_struct *p, struct seq_file *m)
11817{
11818 int node;
11819 unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
cb361d8c 11820 struct numa_group *ng;
397f2378 11821
cb361d8c
JH
11822 rcu_read_lock();
11823 ng = rcu_dereference(p->numa_group);
397f2378
SD
11824 for_each_online_node(node) {
11825 if (p->numa_faults) {
11826 tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
11827 tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
11828 }
cb361d8c
JH
11829 if (ng) {
11830 gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)],
11831 gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
397f2378
SD
11832 }
11833 print_numa_stats(m, node, tsf, tpf, gsf, gpf);
11834 }
cb361d8c 11835 rcu_read_unlock();
397f2378
SD
11836}
11837#endif /* CONFIG_NUMA_BALANCING */
11838#endif /* CONFIG_SCHED_DEBUG */
029632fb
PZ
11839
11840__init void init_sched_fair_class(void)
11841{
11842#ifdef CONFIG_SMP
11843 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
11844
3451d024 11845#ifdef CONFIG_NO_HZ_COMMON
554cecaf 11846 nohz.next_balance = jiffies;
f643ea22 11847 nohz.next_blocked = jiffies;
029632fb 11848 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
029632fb
PZ
11849#endif
11850#endif /* SMP */
11851
11852}