kernel/sched/fair.c

   1 /*
   2  * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
   3  *
   4  *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
   5  *
   6  *  Interactivity improvements by Mike Galbraith
   7  *  (C) 2007 Mike Galbraith <efault@gmx.de>
   8  *
   9  *  Various enhancements by Dmitry Adamushko.
  10  *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
  11  *
  12  *  Group scheduling enhancements by Srivatsa Vaddagiri
  13  *  Copyright IBM Corporation, 2007
  14  *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  15  *
  16  *  Scaled math optimizations by Thomas Gleixner
  17  *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
  18  *
  19  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
  20  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
  21  */
  22
  23 #include <linux/sched/mm.h>
  24 #include <linux/sched/topology.h>
  25
  26 #include <linux/latencytop.h>
  27 #include <linux/cpumask.h>
  28 #include <linux/cpuidle.h>
  29 #include <linux/slab.h>
  30 #include <linux/profile.h>
  31 #include <linux/interrupt.h>
  32 #include <linux/mempolicy.h>
  33 #include <linux/migrate.h>
  34 #include <linux/task_work.h>
  35
  36 #include <trace/events/sched.h>
  37
  38 #include "sched.h"
  39
  40 /*
  41  * Targeted preemption latency for CPU-bound tasks:
  42  *
  43  * NOTE: this latency value is not the same as the concept of
  44  * 'timeslice length' - timeslices in CFS are of variable length
  45  * and have no persistent notion like in traditional, time-slice
  46  * based scheduling concepts.
  47  *
  48  * (to see the precise effective timeslice length of your workload,
  49  *  run vmstat and monitor the context-switches (cs) field)
  50  *
  51  * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
  52  */
  53 unsigned int sysctl_sched_latency                       = 6000000ULL;
  54 unsigned int normalized_sysctl_sched_latency            = 6000000ULL;
  55
  56 /*
  57  * The initial- and re-scaling of tunables is configurable
  58  *
  59  * Options are:
  60  *
  61  *   SCHED_TUNABLESCALING_NONE - unscaled, always *1
  62  *   SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
  63  *   SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
  64  *
  65  * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
  66  */
  67 enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
  68
  69 /*
  70  * Minimal preemption granularity for CPU-bound tasks:
  71  *
  72  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  73  */
  74 unsigned int sysctl_sched_min_granularity               = 750000ULL;
  75 unsigned int normalized_sysctl_sched_min_granularity    = 750000ULL;
  76
  77 /*
  78  * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
  79  */
  80 static unsigned int sched_nr_latency = 8;
  81
  82 /*
  83  * After fork, child runs first. If set to 0 (default) then
  84  * parent will (try to) run first.
  85  */
  86 unsigned int sysctl_sched_child_runs_first __read_mostly;
  87
  88 /*
  89  * SCHED_OTHER wake-up granularity.
  90  *
  91  * This option delays the preemption effects of decoupled workloads
  92  * and reduces their over-scheduling. Synchronous workloads will still
  93  * have immediate wakeup/sleep latencies.
  94  *
  95  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  96  */
  97 unsigned int sysctl_sched_wakeup_granularity            = 1000000UL;
  98 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
  99
 100 const_debug unsigned int sysctl_sched_migration_cost    = 500000UL;
 101
 102 #ifdef CONFIG_SMP
 103 /*
 104  * For asym packing, by default the lower numbered cpu has higher priority.
 105  */
 106 int __weak arch_asym_cpu_priority(int cpu)
 107 {
 108         return -cpu;
 109 }
 110 #endif
 111
 112 #ifdef CONFIG_CFS_BANDWIDTH
 113 /*
 114  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 115  * each time a cfs_rq requests quota.
 116  *
 117  * Note: in the case that the slice exceeds the runtime remaining (either due
 118  * to consumption or the quota being specified to be smaller than the slice)
 119  * we will always only issue the remaining available time.
 120  *
 121  * (default: 5 msec, units: microseconds)
 122  */
 123 unsigned int sysctl_sched_cfs_bandwidth_slice           = 5000UL;
 124 #endif
 125
 126 /*
 127  * The margin used when comparing utilization with CPU capacity:
 128  * util * margin < capacity * 1024
 129  *
 130  * (default: ~20%)
 131  */
 132 unsigned int capacity_margin                            = 1280;
 133
 134 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 135 {
 136         lw->weight += inc;
 137         lw->inv_weight = 0;
 138 }
 139
 140 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
 141 {
 142         lw->weight -= dec;
 143         lw->inv_weight = 0;
 144 }
 145
 146 static inline void update_load_set(struct load_weight *lw, unsigned long w)
 147 {
 148         lw->weight = w;
 149         lw->inv_weight = 0;
 150 }
 151
 152 /*
 153  * Increase the granularity value when there are more CPUs,
 154  * because with more CPUs the 'effective latency' as visible
 155  * to users decreases. But the relationship is not linear,
 156  * so pick a second-best guess by going with the log2 of the
 157  * number of CPUs.
 158  *
 159  * This idea comes from the SD scheduler of Con Kolivas:
 160  */
 161 static unsigned int get_update_sysctl_factor(void)
 162 {
 163         unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
 164         unsigned int factor;
 165
 166         switch (sysctl_sched_tunable_scaling) {
 167         case SCHED_TUNABLESCALING_NONE:
 168                 factor = 1;
 169                 break;
 170         case SCHED_TUNABLESCALING_LINEAR:
 171                 factor = cpus;
 172                 break;
 173         case SCHED_TUNABLESCALING_LOG:
 174         default:
 175                 factor = 1 + ilog2(cpus);
 176                 break;
 177         }
 178
 179         return factor;
 180 }
 181
 182 static void update_sysctl(void)
 183 {
 184         unsigned int factor = get_update_sysctl_factor();
 185
 186 #define SET_SYSCTL(name) \
 187         (sysctl_##name = (factor) * normalized_sysctl_##name)
 188         SET_SYSCTL(sched_min_granularity);
 189         SET_SYSCTL(sched_latency);
 190         SET_SYSCTL(sched_wakeup_granularity);
 191 #undef SET_SYSCTL
 192 }
 193
 194 void sched_init_granularity(void)
 195 {
 196         update_sysctl();
 197 }
 198
 199 #define WMULT_CONST     (~0U)
 200 #define WMULT_SHIFT     32
 201
 202 static void __update_inv_weight(struct load_weight *lw)
 203 {
 204         unsigned long w;
 205
 206         if (likely(lw->inv_weight))
 207                 return;
 208
 209         w = scale_load_down(lw->weight);
 210
 211         if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
 212                 lw->inv_weight = 1;
 213         else if (unlikely(!w))
 214                 lw->inv_weight = WMULT_CONST;
 215         else
 216                 lw->inv_weight = WMULT_CONST / w;
 217 }
 218
 219 /*
 220  * delta_exec * weight / lw.weight
 221  *   OR
 222  * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
 223  *
 224  * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
 225  * we're guaranteed shift stays positive because inv_weight is guaranteed to
 226  * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
 227  *
 228  * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
 229  * weight/lw.weight <= 1, and therefore our shift will also be positive.
 230  */
 231 static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
 232 {
 233         u64 fact = scale_load_down(weight);
 234         int shift = WMULT_SHIFT;
 235
 236         __update_inv_weight(lw);
 237
 238         if (unlikely(fact >> 32)) {
 239                 while (fact >> 32) {
 240                         fact >>= 1;
 241                         shift--;
 242                 }
 243         }
 244
 245         /* hint to use a 32x32->64 mul */
 246         fact = (u64)(u32)fact * lw->inv_weight;
 247
 248         while (fact >> 32) {
 249                 fact >>= 1;
 250                 shift--;
 251         }
 252
 253         return mul_u64_u32_shr(delta_exec, fact, shift);
 254 }
 255
 256
 257 const struct sched_class fair_sched_class;
 258
 259 /**************************************************************
 260  * CFS operations on generic schedulable entities:
 261  */
 262
 263 #ifdef CONFIG_FAIR_GROUP_SCHED
 264
 265 /* cpu runqueue to which this cfs_rq is attached */
 266 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 267 {
 268         return cfs_rq->rq;
 269 }
 270
 271 /* An entity is a task if it doesn't "own" a runqueue */
 272 #define entity_is_task(se)      (!se->my_q)
 273
 274 static inline struct task_struct *task_of(struct sched_entity *se)
 275 {
 276         SCHED_WARN_ON(!entity_is_task(se));
 277         return container_of(se, struct task_struct, se);
 278 }
 279
 280 /* Walk up scheduling entities hierarchy */
 281 #define for_each_sched_entity(se) \
 282                 for (; se; se = se->parent)
 283
 284 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 285 {
 286         return p->se.cfs_rq;
 287 }
 288
 289 /* runqueue on which this entity is (to be) queued */
 290 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 291 {
 292         return se->cfs_rq;
 293 }
 294
 295 /* runqueue "owned" by this group */
 296 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 297 {
 298         return grp->my_q;
 299 }
 300
 301 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 302 {
 303         if (!cfs_rq->on_list) {
 304                 struct rq *rq = rq_of(cfs_rq);
 305                 int cpu = cpu_of(rq);
 306                 /*
 307                  * Ensure we either appear before our parent (if already
 308                  * enqueued) or force our parent to appear after us when it is
 309                  * enqueued. The fact that we always enqueue bottom-up
 310                  * reduces this to two cases and a special case for the root
 311                  * cfs_rq. Furthermore, it also means that we will always reset
 312                  * tmp_alone_branch either when the branch is connected
 313                  * to a tree or when we reach the beg of the tree
 314                  */
 315                 if (cfs_rq->tg->parent &&
 316                     cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
 317                         /*
 318                          * If parent is already on the list, we add the child
 319                          * just before. Thanks to circular linked property of
 320                          * the list, this means to put the child at the tail
 321                          * of the list that starts by parent.
 322                          */
 323                         list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
 324                                 &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
 325                         /*
 326                          * The branch is now connected to its tree so we can
 327                          * reset tmp_alone_branch to the beginning of the
 328                          * list.
 329                          */
 330                         rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
 331                 } else if (!cfs_rq->tg->parent) {
 332                         /*
 333                          * cfs rq without parent should be put
 334                          * at the tail of the list.
 335                          */
 336                         list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
 337                                 &rq->leaf_cfs_rq_list);
 338                         /*
 339                          * We have reach the beg of a tree so we can reset
 340                          * tmp_alone_branch to the beginning of the list.
 341                          */
 342                         rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
 343                 } else {
 344                         /*
 345                          * The parent has not already been added so we want to
 346                          * make sure that it will be put after us.
 347                          * tmp_alone_branch points to the beg of the branch
 348                          * where we will add parent.
 349                          */
 350                         list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
 351                                 rq->tmp_alone_branch);
 352                         /*
 353                          * update tmp_alone_branch to points to the new beg
 354                          * of the branch
 355                          */
 356                         rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
 357                 }
 358
 359                 cfs_rq->on_list = 1;
 360         }
 361 }
 362
 363 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 364 {
 365         if (cfs_rq->on_list) {
 366                 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
 367                 cfs_rq->on_list = 0;
 368         }
 369 }
 370
 371 /* Iterate thr' all leaf cfs_rq's on a runqueue */
 372 #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)                      \
 373         list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,    \
 374                                  leaf_cfs_rq_list)
 375
 376 /* Do the two (enqueued) entities belong to the same group ? */
 377 static inline struct cfs_rq *
 378 is_same_group(struct sched_entity *se, struct sched_entity *pse)
 379 {
 380         if (se->cfs_rq == pse->cfs_rq)
 381                 return se->cfs_rq;
 382
 383         return NULL;
 384 }
 385
 386 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 387 {
 388         return se->parent;
 389 }
 390
 391 static void
 392 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 393 {
 394         int se_depth, pse_depth;
 395
 396         /*
 397          * preemption test can be made between sibling entities who are in the
 398          * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
 399          * both tasks until we find their ancestors who are siblings of common
 400          * parent.
 401          */
 402
 403         /* First walk up until both entities are at same depth */
 404         se_depth = (*se)->depth;
 405         pse_depth = (*pse)->depth;
 406
 407         while (se_depth > pse_depth) {
 408                 se_depth--;
 409                 *se = parent_entity(*se);
 410         }
 411
 412         while (pse_depth > se_depth) {
 413                 pse_depth--;
 414                 *pse = parent_entity(*pse);
 415         }
 416
 417         while (!is_same_group(*se, *pse)) {
 418                 *se = parent_entity(*se);
 419                 *pse = parent_entity(*pse);
 420         }
 421 }
 422
 423 #else   /* !CONFIG_FAIR_GROUP_SCHED */
 424
 425 static inline struct task_struct *task_of(struct sched_entity *se)
 426 {
 427         return container_of(se, struct task_struct, se);
 428 }
 429
 430 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 431 {
 432         return container_of(cfs_rq, struct rq, cfs);
 433 }
 434
 435 #define entity_is_task(se)      1
 436
 437 #define for_each_sched_entity(se) \
 438                 for (; se; se = NULL)
 439
 440 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 441 {
 442         return &task_rq(p)->cfs;
 443 }
 444
 445 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 446 {
 447         struct task_struct *p = task_of(se);
 448         struct rq *rq = task_rq(p);
 449
 450         return &rq->cfs;
 451 }
 452
 453 /* runqueue "owned" by this group */
 454 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 455 {
 456         return NULL;
 457 }
 458
 459 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 460 {
 461 }
 462
 463 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 464 {
 465 }
 466
 467 #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)      \
 468                 for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
 469
 470 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 471 {
 472         return NULL;
 473 }
 474
 475 static inline void
 476 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 477 {
 478 }
 479
 480 #endif  /* CONFIG_FAIR_GROUP_SCHED */
 481
 482 static __always_inline
 483 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
 484
 485 /**************************************************************
 486  * Scheduling class tree data structure manipulation methods:
 487  */
 488
 489 static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
 490 {
 491         s64 delta = (s64)(vruntime - max_vruntime);
 492         if (delta > 0)
 493                 max_vruntime = vruntime;
 494
 495         return max_vruntime;
 496 }
 497
 498 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
 499 {
 500         s64 delta = (s64)(vruntime - min_vruntime);
 501         if (delta < 0)
 502                 min_vruntime = vruntime;
 503
 504         return min_vruntime;
 505 }
 506
 507 static inline int entity_before(struct sched_entity *a,
 508                                 struct sched_entity *b)
 509 {
 510         return (s64)(a->vruntime - b->vruntime) < 0;
 511 }
 512
 513 static void update_min_vruntime(struct cfs_rq *cfs_rq)
 514 {
 515         struct sched_entity *curr = cfs_rq->curr;
 516         struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
 517
 518         u64 vruntime = cfs_rq->min_vruntime;
 519
 520         if (curr) {
 521                 if (curr->on_rq)
 522                         vruntime = curr->vruntime;
 523                 else
 524                         curr = NULL;
 525         }
 526
 527         if (leftmost) { /* non-empty tree */
 528                 struct sched_entity *se;
 529                 se = rb_entry(leftmost, struct sched_entity, run_node);
 530
 531                 if (!curr)
 532                         vruntime = se->vruntime;
 533                 else
 534                         vruntime = min_vruntime(vruntime, se->vruntime);
 535         }
 536
 537         /* ensure we never gain time by being placed backwards. */
 538         cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
 539 #ifndef CONFIG_64BIT
 540         smp_wmb();
 541         cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 542 #endif
 543 }
 544
 545 /*
 546  * Enqueue an entity into the rb-tree:
 547  */
 548 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 549 {
 550         struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
 551         struct rb_node *parent = NULL;
 552         struct sched_entity *entry;
 553         bool leftmost = true;
 554
 555         /*
 556          * Find the right place in the rbtree:
 557          */
 558         while (*link) {
 559                 parent = *link;
 560                 entry = rb_entry(parent, struct sched_entity, run_node);
 561                 /*
 562                  * We dont care about collisions. Nodes with
 563                  * the same key stay together.
 564                  */
 565                 if (entity_before(se, entry)) {
 566                         link = &parent->rb_left;
 567                 } else {
 568                         link = &parent->rb_right;
 569                         leftmost = false;
 570                 }
 571         }
 572
 573         rb_link_node(&se->run_node, parent, link);
 574         rb_insert_color_cached(&se->run_node,
 575                                &cfs_rq->tasks_timeline, leftmost);
 576 }
 577
 578 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 579 {
 580         rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
 581 }
 582
 583 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
 584 {
 585         struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
 586
 587         if (!left)
 588                 return NULL;
 589
 590         return rb_entry(left, struct sched_entity, run_node);
 591 }
 592
 593 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
 594 {
 595         struct rb_node *next = rb_next(&se->run_node);
 596
 597         if (!next)
 598                 return NULL;
 599
 600         return rb_entry(next, struct sched_entity, run_node);
 601 }
 602
 603 #ifdef CONFIG_SCHED_DEBUG
 604 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 605 {
 606         struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
 607
 608         if (!last)
 609                 return NULL;
 610
 611         return rb_entry(last, struct sched_entity, run_node);
 612 }
 613
 614 /**************************************************************
 615  * Scheduling class statistics methods:
 616  */
 617
 618 int sched_proc_update_handler(struct ctl_table *table, int write,
 619                 void __user *buffer, size_t *lenp,
 620                 loff_t *ppos)
 621 {
 622         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 623         unsigned int factor = get_update_sysctl_factor();
 624
 625         if (ret || !write)
 626                 return ret;
 627
 628         sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
 629                                         sysctl_sched_min_granularity);
 630
 631 #define WRT_SYSCTL(name) \
 632         (normalized_sysctl_##name = sysctl_##name / (factor))
 633         WRT_SYSCTL(sched_min_granularity);
 634         WRT_SYSCTL(sched_latency);
 635         WRT_SYSCTL(sched_wakeup_granularity);
 636 #undef WRT_SYSCTL
 637
 638         return 0;
 639 }
 640 #endif
 641
 642 /*
 643  * delta /= w
 644  */
 645 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
 646 {
 647         if (unlikely(se->load.weight != NICE_0_LOAD))
 648                 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
 649
 650         return delta;
 651 }
 652
 653 /*
 654  * The idea is to set a period in which each task runs once.
 655  *
 656  * When there are too many tasks (sched_nr_latency) we have to stretch
 657  * this period because otherwise the slices get too small.
 658  *
 659  * p = (nr <= nl) ? l : l*nr/nl
 660  */
 661 static u64 __sched_period(unsigned long nr_running)
 662 {
 663         if (unlikely(nr_running > sched_nr_latency))
 664                 return nr_running * sysctl_sched_min_granularity;
 665         else
 666                 return sysctl_sched_latency;
 667 }
 668
 669 /*
 670  * We calculate the wall-time slice from the period by taking a part
 671  * proportional to the weight.
 672  *
 673  * s = p*P[w/rw]
 674  */
 675 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 676 {
 677         u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
 678
 679         for_each_sched_entity(se) {
 680                 struct load_weight *load;
 681                 struct load_weight lw;
 682
 683                 cfs_rq = cfs_rq_of(se);
 684                 load = &cfs_rq->load;
 685
 686                 if (unlikely(!se->on_rq)) {
 687                         lw = cfs_rq->load;
 688
 689                         update_load_add(&lw, se->load.weight);
 690                         load = &lw;
 691                 }
 692                 slice = __calc_delta(slice, se->load.weight, load);
 693         }
 694         return slice;
 695 }
 696
 697 /*
 698  * We calculate the vruntime slice of a to-be-inserted task.
 699  *
 700  * vs = s/w
 701  */
 702 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 703 {
 704         return calc_delta_fair(sched_slice(cfs_rq, se), se);
 705 }
 706
 707 #ifdef CONFIG_SMP
 708
 709 #include "sched-pelt.h"
 710
 711 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
 712 static unsigned long task_h_load(struct task_struct *p);
 713
 714 /* Give new sched_entity start runnable values to heavy its load in infant time */
 715 void init_entity_runnable_average(struct sched_entity *se)
 716 {
 717         struct sched_avg *sa = &se->avg;
 718
 719         sa->last_update_time = 0;
 720         /*
 721          * sched_avg's period_contrib should be strictly less then 1024, so
 722          * we give it 1023 to make sure it is almost a period (1024us), and
 723          * will definitely be update (after enqueue).
 724          */
 725         sa->period_contrib = 1023;
 726         /*
 727          * Tasks are intialized with full load to be seen as heavy tasks until
 728          * they get a chance to stabilize to their real load level.
 729          * Group entities are intialized with zero load to reflect the fact that
 730          * nothing has been attached to the task group yet.
 731          */
 732         if (entity_is_task(se))
 733                 sa->load_avg = scale_load_down(se->load.weight);
 734         sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
 735         /*
 736          * At this point, util_avg won't be used in select_task_rq_fair anyway
 737          */
 738         sa->util_avg = 0;
 739         sa->util_sum = 0;
 740         /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
 741 }
 742
 743 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
 744 static void attach_entity_cfs_rq(struct sched_entity *se);
 745
 746 /*
 747  * With new tasks being created, their initial util_avgs are extrapolated
 748  * based on the cfs_rq's current util_avg:
 749  *
 750  *   util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
 751  *
 752  * However, in many cases, the above util_avg does not give a desired
 753  * value. Moreover, the sum of the util_avgs may be divergent, such
 754  * as when the series is a harmonic series.
 755  *
 756  * To solve this problem, we also cap the util_avg of successive tasks to
 757  * only 1/2 of the left utilization budget:
 758  *
 759  *   util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
 760  *
 761  * where n denotes the nth task.
 762  *
 763  * For example, a simplest series from the beginning would be like:
 764  *
 765  *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
 766  * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
 767  *
 768  * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
 769  * if util_avg > util_avg_cap.
 770  */
 771 void post_init_entity_util_avg(struct sched_entity *se)
 772 {
 773         struct cfs_rq *cfs_rq = cfs_rq_of(se);
 774         struct sched_avg *sa = &se->avg;
 775         long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
 776
 777         if (cap > 0) {
 778                 if (cfs_rq->avg.util_avg != 0) {
 779                         sa->util_avg  = cfs_rq->avg.util_avg * se->load.weight;
 780                         sa->util_avg /= (cfs_rq->avg.load_avg + 1);
 781
 782                         if (sa->util_avg > cap)
 783                                 sa->util_avg = cap;
 784                 } else {
 785                         sa->util_avg = cap;
 786                 }
 787                 sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
 788         }
 789
 790         if (entity_is_task(se)) {
 791                 struct task_struct *p = task_of(se);
 792                 if (p->sched_class != &fair_sched_class) {
 793                         /*
 794                          * For !fair tasks do:
 795                          *
 796                         update_cfs_rq_load_avg(now, cfs_rq);
 797                         attach_entity_load_avg(cfs_rq, se);
 798                         switched_from_fair(rq, p);
 799                          *
 800                          * such that the next switched_to_fair() has the
 801                          * expected state.
 802                          */
 803                         se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
 804                         return;
 805                 }
 806         }
 807
 808         attach_entity_cfs_rq(se);
 809 }
 810
 811 #else /* !CONFIG_SMP */
 812 void init_entity_runnable_average(struct sched_entity *se)
 813 {
 814 }
 815 void post_init_entity_util_avg(struct sched_entity *se)
 816 {
 817 }
 818 static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
 819 {
 820 }
 821 #endif /* CONFIG_SMP */
 822
 823 /*
 824  * Update the current task's runtime statistics.
 825  */
 826 static void update_curr(struct cfs_rq *cfs_rq)
 827 {
 828         struct sched_entity *curr = cfs_rq->curr;
 829         u64 now = rq_clock_task(rq_of(cfs_rq));
 830         u64 delta_exec;
 831
 832         if (unlikely(!curr))
 833                 return;
 834
 835         delta_exec = now - curr->exec_start;
 836         if (unlikely((s64)delta_exec <= 0))
 837                 return;
 838
 839         curr->exec_start = now;
 840
 841         schedstat_set(curr->statistics.exec_max,
 842                       max(delta_exec, curr->statistics.exec_max));
 843
 844         curr->sum_exec_runtime += delta_exec;
 845         schedstat_add(cfs_rq->exec_clock, delta_exec);
 846
 847         curr->vruntime += calc_delta_fair(delta_exec, curr);
 848         update_min_vruntime(cfs_rq);
 849
 850         if (entity_is_task(curr)) {
 851                 struct task_struct *curtask = task_of(curr);
 852
 853                 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
 854                 cpuacct_charge(curtask, delta_exec);
 855                 account_group_exec_runtime(curtask, delta_exec);
 856         }
 857
 858         account_cfs_rq_runtime(cfs_rq, delta_exec);
 859 }
 860
 861 static void update_curr_fair(struct rq *rq)
 862 {
 863         update_curr(cfs_rq_of(&rq->curr->se));
 864 }
 865
 866 static inline void
 867 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 868 {
 869         u64 wait_start, prev_wait_start;
 870
 871         if (!schedstat_enabled())
 872                 return;
 873
 874         wait_start = rq_clock(rq_of(cfs_rq));
 875         prev_wait_start = schedstat_val(se->statistics.wait_start);
 876
 877         if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
 878             likely(wait_start > prev_wait_start))
 879                 wait_start -= prev_wait_start;
 880
 881         schedstat_set(se->statistics.wait_start, wait_start);
 882 }
 883
 884 static inline void
 885 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 886 {
 887         struct task_struct *p;
 888         u64 delta;
 889
 890         if (!schedstat_enabled())
 891                 return;
 892
 893         delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
 894
 895         if (entity_is_task(se)) {
 896                 p = task_of(se);
 897                 if (task_on_rq_migrating(p)) {
 898                         /*
 899                          * Preserve migrating task's wait time so wait_start
 900                          * time stamp can be adjusted to accumulate wait time
 901                          * prior to migration.
 902                          */
 903                         schedstat_set(se->statistics.wait_start, delta);
 904                         return;
 905                 }
 906                 trace_sched_stat_wait(p, delta);
 907         }
 908
 909         schedstat_set(se->statistics.wait_max,
 910                       max(schedstat_val(se->statistics.wait_max), delta));
 911         schedstat_inc(se->statistics.wait_count);
 912         schedstat_add(se->statistics.wait_sum, delta);
 913         schedstat_set(se->statistics.wait_start, 0);
 914 }
 915
 916 static inline void
 917 update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 918 {
 919         struct task_struct *tsk = NULL;
 920         u64 sleep_start, block_start;
 921
 922         if (!schedstat_enabled())
 923                 return;
 924
 925         sleep_start = schedstat_val(se->statistics.sleep_start);
 926         block_start = schedstat_val(se->statistics.block_start);
 927
 928         if (entity_is_task(se))
 929                 tsk = task_of(se);
 930
 931         if (sleep_start) {
 932                 u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
 933
 934                 if ((s64)delta < 0)
 935                         delta = 0;
 936
 937                 if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
 938                         schedstat_set(se->statistics.sleep_max, delta);
 939
 940                 schedstat_set(se->statistics.sleep_start, 0);
 941                 schedstat_add(se->statistics.sum_sleep_runtime, delta);
 942
 943                 if (tsk) {
 944                         account_scheduler_latency(tsk, delta >> 10, 1);
 945                         trace_sched_stat_sleep(tsk, delta);
 946                 }
 947         }
 948         if (block_start) {
 949                 u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
 950
 951                 if ((s64)delta < 0)
 952                         delta = 0;
 953
 954                 if (unlikely(delta > schedstat_val(se->statistics.block_max)))
 955                         schedstat_set(se->statistics.block_max, delta);
 956
 957                 schedstat_set(se->statistics.block_start, 0);
 958                 schedstat_add(se->statistics.sum_sleep_runtime, delta);
 959
 960                 if (tsk) {
 961                         if (tsk->in_iowait) {
 962                                 schedstat_add(se->statistics.iowait_sum, delta);
 963                                 schedstat_inc(se->statistics.iowait_count);
 964                                 trace_sched_stat_iowait(tsk, delta);
 965                         }
 966
 967                         trace_sched_stat_blocked(tsk, delta);
 968
 969                         /*
 970                          * Blocking time is in units of nanosecs, so shift by
 971                          * 20 to get a milliseconds-range estimation of the
 972                          * amount of time that the task spent sleeping:
 973                          */
 974                         if (unlikely(prof_on == SLEEP_PROFILING)) {
 975                                 profile_hits(SLEEP_PROFILING,
 976                                                 (void *)get_wchan(tsk),
 977                                                 delta >> 20);
 978                         }
 979                         account_scheduler_latency(tsk, delta >> 10, 0);
 980                 }
 981         }
 982 }
 983
 984 /*
 985  * Task is being enqueued - update stats:
 986  */
 987 static inline void
 988 update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 989 {
 990         if (!schedstat_enabled())
 991                 return;
 992
 993         /*
 994          * Are we enqueueing a waiting task? (for current tasks
 995          * a dequeue/enqueue event is a NOP)
 996          */
 997         if (se != cfs_rq->curr)
 998                 update_stats_wait_start(cfs_rq, se);
 999
1000         if (flags & ENQUEUE_WAKEUP)
1001                 update_stats_enqueue_sleeper(cfs_rq, se);
1002 }
1003
1004 static inline void
1005 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1006 {
1007
1008         if (!schedstat_enabled())
1009                 return;
1010
1011         /*
1012          * Mark the end of the wait period if dequeueing a
1013          * waiting task:
1014          */
1015         if (se != cfs_rq->curr)
1016                 update_stats_wait_end(cfs_rq, se);
1017
1018         if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
1019                 struct task_struct *tsk = task_of(se);
1020
1021                 if (tsk->state & TASK_INTERRUPTIBLE)
1022                         schedstat_set(se->statistics.sleep_start,
1023                                       rq_clock(rq_of(cfs_rq)));
1024                 if (tsk->state & TASK_UNINTERRUPTIBLE)
1025                         schedstat_set(se->statistics.block_start,
1026                                       rq_clock(rq_of(cfs_rq)));
1027         }
1028 }
1029
1030 /*
1031  * We are picking a new current task - update its stats:
1032  */
1033 static inline void
1034 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
1035 {
1036         /*
1037          * We are starting a new run period:
1038          */
1039         se->exec_start = rq_clock_task(rq_of(cfs_rq));
1040 }
1041
1042 /**************************************************
1043  * Scheduling class queueing methods:
1044  */
1045
1046 #ifdef CONFIG_NUMA_BALANCING
/*
 * Approximate time to scan a full NUMA task in ms. The task scan period is
 * calculated based on the task's virtual memory size and
 * numa_balancing_scan_size. (task_nr_scan_windows() below derives the
 * number of scan windows from the task's RSS.)
 */
unsigned int sysctl_numa_balancing_scan_period_min = 1000;
unsigned int sysctl_numa_balancing_scan_period_max = 60000;

/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;

/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
1060
1061 struct numa_group {
1062         atomic_t refcount;
1063
1064         spinlock_t lock; /* nr_tasks, tasks */
1065         int nr_tasks;
1066         pid_t gid;
1067         int active_nodes;
1068
1069         struct rcu_head rcu;
1070         unsigned long total_faults;
1071         unsigned long max_faults_cpu;
1072         /*
1073          * Faults_cpu is used to decide whether memory should move
1074          * towards the CPU. As a consequence, these stats are weighted
1075          * more by CPU use than by memory faults.
1076          */
1077         unsigned long *faults_cpu;
1078         unsigned long faults[0];
1079 };
1080
/*
 * Forward declarations: task_scan_start() and task_scan_max() call these
 * before their definitions further down.
 */
static inline unsigned long group_faults_priv(struct numa_group *ng);
static inline unsigned long group_faults_shared(struct numa_group *ng);
1083
1084 static unsigned int task_nr_scan_windows(struct task_struct *p)
1085 {
1086         unsigned long rss = 0;
1087         unsigned long nr_scan_pages;
1088
1089         /*
1090          * Calculations based on RSS as non-present and empty pages are skipped
1091          * by the PTE scanner and NUMA hinting faults should be trapped based
1092          * on resident pages
1093          */
1094         nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
1095         rss = get_mm_rss(p->mm);
1096         if (!rss)
1097                 rss = nr_scan_pages;
1098
1099         rss = round_up(rss, nr_scan_pages);
1100         return rss / nr_scan_pages;
1101 }
1102
/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
#define MAX_SCAN_WINDOW 2560
1105
1106 static unsigned int task_scan_min(struct task_struct *p)
1107 {
1108         unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
1109         unsigned int scan, floor;
1110         unsigned int windows = 1;
1111
1112         if (scan_size < MAX_SCAN_WINDOW)
1113                 windows = MAX_SCAN_WINDOW / scan_size;
1114         floor = 1000 / windows;
1115
1116         scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1117         return max_t(unsigned int, floor, scan);
1118 }
1119
1120 static unsigned int task_scan_start(struct task_struct *p)
1121 {
1122         unsigned long smin = task_scan_min(p);
1123         unsigned long period = smin;
1124
1125         /* Scale the maximum scan period with the amount of shared memory. */
1126         if (p->numa_group) {
1127                 struct numa_group *ng = p->numa_group;
1128                 unsigned long shared = group_faults_shared(ng);
1129                 unsigned long private = group_faults_priv(ng);
1130
1131                 period *= atomic_read(&ng->refcount);
1132                 period *= shared + 1;
1133                 period /= private + shared + 1;
1134         }
1135
1136         return max(smin, period);
1137 }
1138
1139 static unsigned int task_scan_max(struct task_struct *p)
1140 {
1141         unsigned long smin = task_scan_min(p);
1142         unsigned long smax;
1143
1144         /* Watch for min being lower than max due to floor calculations */
1145         smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1146
1147         /* Scale the maximum scan period with the amount of shared memory. */
1148         if (p->numa_group) {
1149                 struct numa_group *ng = p->numa_group;
1150                 unsigned long shared = group_faults_shared(ng);
1151                 unsigned long private = group_faults_priv(ng);
1152                 unsigned long period = smax;
1153
1154                 period *= atomic_read(&ng->refcount);
1155                 period *= shared + 1;
1156                 period /= private + shared + 1;
1157
1158                 smax = max(smax, period);
1159         }
1160
1161         return max(smin, smax);
1162 }
1163
1164 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1165 {
1166         rq->nr_numa_running += (p->numa_preferred_nid != -1);
1167         rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1168 }
1169
1170 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1171 {
1172         rq->nr_numa_running -= (p->numa_preferred_nid != -1);
1173         rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1174 }
1175
/*
 * Sizing constants for the NUMA hinting fault arrays; each level doubles
 * the previous one (2 types -> 4 stats -> 8 buckets).
 */

/* Shared or private faults. */
#define NR_NUMA_HINT_FAULT_TYPES 2

/* Memory and CPU locality */
#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)

/* Averaged statistics, and temporary buffers. */
#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1184
1185 pid_t task_numa_group_id(struct task_struct *p)
1186 {
1187         return p->numa_group ? p->numa_group->gid : 0;
1188 }
1189
1190 /*
1191  * The averaged statistics, shared & private, memory & cpu,
1192  * occupy the first half of the array. The second half of the
1193  * array is for current counters, which are averaged into the
1194  * first set by task_numa_placement.
1195  */
1196 static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
1197 {
1198         return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
1199 }
1200
1201 static inline unsigned long task_faults(struct task_struct *p, int nid)
1202 {
1203         if (!p->numa_faults)
1204                 return 0;
1205
1206         return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1207                 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
1208 }
1209
1210 static inline unsigned long group_faults(struct task_struct *p, int nid)
1211 {
1212         if (!p->numa_group)
1213                 return 0;
1214
1215         return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1216                 p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
1217 }
1218
1219 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1220 {
1221         return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
1222                 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
1223 }
1224
1225 static inline unsigned long group_faults_priv(struct numa_group *ng)
1226 {
1227         unsigned long faults = 0;
1228         int node;
1229
1230         for_each_online_node(node) {
1231                 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
1232         }
1233
1234         return faults;
1235 }
1236
1237 static inline unsigned long group_faults_shared(struct numa_group *ng)
1238 {
1239         unsigned long faults = 0;
1240         int node;
1241
1242         for_each_online_node(node) {
1243                 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
1244         }
1245
1246         return faults;
1247 }
1248
1249 /*
1250  * A node triggering more than 1/3 as many NUMA faults as the maximum is
1251  * considered part of a numa group's pseudo-interleaving set. Migrations
1252  * between these nodes are slowed down, to allow things to settle down.
1253  */
1254 #define ACTIVE_NODE_FRACTION 3
1255
1256 static bool numa_is_active_node(int nid, struct numa_group *ng)
1257 {
1258         return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1259 }
1260
1261 /* Handle placement on systems where not all nodes are directly connected. */
1262 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1263                                         int maxdist, bool task)
1264 {
1265         unsigned long score = 0;
1266         int node;
1267
1268         /*
1269          * All nodes are directly connected, and the same distance
1270          * from each other. No need for fancy placement algorithms.
1271          */
1272         if (sched_numa_topology_type == NUMA_DIRECT)
1273                 return 0;
1274
1275         /*
1276          * This code is called for each node, introducing N^2 complexity,
1277          * which should be ok given the number of nodes rarely exceeds 8.
1278          */
1279         for_each_online_node(node) {
1280                 unsigned long faults;
1281                 int dist = node_distance(nid, node);
1282
1283                 /*
1284                  * The furthest away nodes in the system are not interesting
1285                  * for placement; nid was already counted.
1286                  */
1287                 if (dist == sched_max_numa_distance || node == nid)
1288                         continue;
1289
1290                 /*
1291                  * On systems with a backplane NUMA topology, compare groups
1292                  * of nodes, and move tasks towards the group with the most
1293                  * memory accesses. When comparing two nodes at distance
1294                  * "hoplimit", only nodes closer by than "hoplimit" are part
1295                  * of each group. Skip other nodes.
1296                  */
1297                 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1298                                         dist > maxdist)
1299                         continue;
1300
1301                 /* Add up the faults from nearby nodes. */
1302                 if (task)
1303                         faults = task_faults(p, node);
1304                 else
1305                         faults = group_faults(p, node);
1306
1307                 /*
1308                  * On systems with a glueless mesh NUMA topology, there are
1309                  * no fixed "groups of nodes". Instead, nodes that are not
1310                  * directly connected bounce traffic through intermediate
1311                  * nodes; a numa_group can occupy any set of nodes.
1312                  * The further away a node is, the less the faults count.
1313                  * This seems to result in good task placement.
1314                  */
1315                 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1316                         faults *= (sched_max_numa_distance - dist);
1317                         faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1318                 }
1319
1320                 score += faults;
1321         }
1322
1323         return score;
1324 }
1325
1326 /*
1327  * These return the fraction of accesses done by a particular task, or
1328  * task group, on a particular numa node.  The group weight is given a
1329  * larger multiplier, in order to group tasks together that are almost
1330  * evenly spread out between numa nodes.
1331  */
1332 static inline unsigned long task_weight(struct task_struct *p, int nid,
1333                                         int dist)
1334 {
1335         unsigned long faults, total_faults;
1336
1337         if (!p->numa_faults)
1338                 return 0;
1339
1340         total_faults = p->total_numa_faults;
1341
1342         if (!total_faults)
1343                 return 0;
1344
1345         faults = task_faults(p, nid);
1346         faults += score_nearby_nodes(p, nid, dist, true);
1347
1348         return 1000 * faults / total_faults;
1349 }
1350
1351 static inline unsigned long group_weight(struct task_struct *p, int nid,
1352                                          int dist)
1353 {
1354         unsigned long faults, total_faults;
1355
1356         if (!p->numa_group)
1357                 return 0;
1358
1359         total_faults = p->numa_group->total_faults;
1360
1361         if (!total_faults)
1362                 return 0;
1363
1364         faults = group_faults(p, nid);
1365         faults += score_nearby_nodes(p, nid, dist, false);
1366
1367         return 1000 * faults / total_faults;
1368 }
1369
/*
 * Decide whether @page should be migrated towards the node of @dst_cpu
 * after a NUMA hinting fault taken by @p.
 *
 * @p:       faulting task
 * @page:    page that faulted; its last cpupid is replaced as a side effect
 * @src_nid: node the page currently lives on
 * @dst_cpu: CPU the fault happened on; its node is the migration target
 *
 * Returns true if the page may be migrated, false otherwise.
 */
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
                                int src_nid, int dst_cpu)
{
        struct numa_group *ng = p->numa_group;
        int dst_nid = cpu_to_node(dst_cpu);
        int last_cpupid, this_cpupid;

        /* NOTE(review): built from current->pid, not p->pid - confirm callers pass current. */
        this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);

        /*
         * Multi-stage node selection is used in conjunction with a periodic
         * migration fault to build a temporal task<->page relation. By using
         * a two-stage filter we remove short/unlikely relations.
         *
         * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
         * a task's usage of a particular page (n_p) per total usage of this
         * page (n_t) (in a given time-span) to a probability.
         *
         * Our periodic faults will sample this probability and getting the
         * same result twice in a row, given these samples are fully
         * independent, is then given by P(n)^2, provided our sample period
         * is sufficiently short compared to the usage pattern.
         *
         * This squaring squishes small probabilities, making it less likely we
         * act on an unlikely task<->page relation.
         */
        last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
        if (!cpupid_pid_unset(last_cpupid) &&
                                cpupid_to_nid(last_cpupid) != dst_nid)
                return false;

        /* Always allow migrate on private faults */
        if (cpupid_match_pid(p, last_cpupid))
                return true;

        /* A shared fault, but p->numa_group has not been set up yet. */
        if (!ng)
                return true;

        /*
         * Destination node is much more heavily used than the source
         * node? Allow migration.
         */
        if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
                                        ACTIVE_NODE_FRACTION)
                return true;

        /*
         * Distribute memory according to CPU & memory use on each node,
         * with 3/4 hysteresis to avoid unnecessary memory migrations:
         *
         * faults_cpu(dst)   3   faults_cpu(src)
         * --------------- * - > ---------------
         * faults_mem(dst)   4   faults_mem(src)
         *
         * The comparison below is the same inequality cross-multiplied, so
         * no division is needed.
         */
        return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
               group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
1428
/*
 * Forward declarations of load/capacity helpers; update_numa_stats() below
 * calls weighted_cpuload() and capacity_of() before their definitions.
 */
static unsigned long weighted_cpuload(struct rq *rq);
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
static unsigned long capacity_of(int cpu);
1433
/*
 * Cached statistics for all CPUs within a node.
 * Filled in by update_numa_stats().
 */
struct numa_stats {
        /* Sum of rq->nr_running over the node's CPUs */
        unsigned long nr_running;
        /* Sum of weighted_cpuload() over the node's CPUs */
        unsigned long load;

        /* Total compute capacity of CPUs on a node */
        unsigned long compute_capacity;

        /* Approximate capacity in terms of runnable tasks on a node */
        unsigned long task_capacity;
        /* Non-zero when nr_running is below task_capacity */
        int has_free_capacity;
};
1446
1447 /*
1448  * XXX borrowed from update_sg_lb_stats
1449  */
1450 static void update_numa_stats(struct numa_stats *ns, int nid)
1451 {
1452         int smt, cpu, cpus = 0;
1453         unsigned long capacity;
1454
1455         memset(ns, 0, sizeof(*ns));
1456         for_each_cpu(cpu, cpumask_of_node(nid)) {
1457                 struct rq *rq = cpu_rq(cpu);
1458
1459                 ns->nr_running += rq->nr_running;
1460                 ns->load += weighted_cpuload(rq);
1461                 ns->compute_capacity += capacity_of(cpu);
1462
1463                 cpus++;
1464         }
1465
1466         /*
1467          * If we raced with hotplug and there are no CPUs left in our mask
1468          * the @ns structure is NULL'ed and task_numa_compare() will
1469          * not find this node attractive.
1470          *
1471          * We'll either bail at !has_free_capacity, or we'll detect a huge
1472          * imbalance and bail there.
1473          */
1474         if (!cpus)
1475                 return;
1476
1477         /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1478         smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1479         capacity = cpus / smt; /* cores */
1480
1481         ns->task_capacity = min_t(unsigned, capacity,
1482                 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1483         ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
1484 }
1485
/*
 * Working state for one NUMA migration attempt of task @p: where the task
 * is, which destination is being evaluated, and the best candidate found
 * so far.
 */
struct task_numa_env {
        /* Task that is to be moved or swapped */
        struct task_struct *p;

        int src_cpu, src_nid;
        /* Destination currently being evaluated */
        int dst_cpu, dst_nid;

        /* Node statistics as filled in by update_numa_stats() */
        struct numa_stats src_stats, dst_stats;

        /* Threshold (in percent) used by the load imbalance checks */
        int imbalance_pct;
        int dist;

        /*
         * Best candidate so far, recorded by task_numa_assign():
         * best_task is the task to swap with (NULL for a plain move),
         * best_imp its improvement score, best_cpu the chosen dst_cpu.
         */
        struct task_struct *best_task;
        long best_imp;
        int best_cpu;
};
1501
1502 static void task_numa_assign(struct task_numa_env *env,
1503                              struct task_struct *p, long imp)
1504 {
1505         if (env->best_task)
1506                 put_task_struct(env->best_task);
1507         if (p)
1508                 get_task_struct(p);
1509
1510         env->best_task = p;
1511         env->best_imp = imp;
1512         env->best_cpu = env->dst_cpu;
1513 }
1514
1515 static bool load_too_imbalanced(long src_load, long dst_load,
1516                                 struct task_numa_env *env)
1517 {
1518         long imb, old_imb;
1519         long orig_src_load, orig_dst_load;
1520         long src_capacity, dst_capacity;
1521
1522         /*
1523          * The load is corrected for the CPU capacity available on each node.
1524          *
1525          * src_load        dst_load
1526          * ------------ vs ---------
1527          * src_capacity    dst_capacity
1528          */
1529         src_capacity = env->src_stats.compute_capacity;
1530         dst_capacity = env->dst_stats.compute_capacity;
1531
1532         /* We care about the slope of the imbalance, not the direction. */
1533         if (dst_load < src_load)
1534                 swap(dst_load, src_load);
1535
1536         /* Is the difference below the threshold? */
1537         imb = dst_load * src_capacity * 100 -
1538               src_load * dst_capacity * env->imbalance_pct;
1539         if (imb <= 0)
1540                 return false;
1541
1542         /*
1543          * The imbalance is above the allowed threshold.
1544          * Compare it with the old imbalance.
1545          */
1546         orig_src_load = env->src_stats.load;
1547         orig_dst_load = env->dst_stats.load;
1548
1549         if (orig_dst_load < orig_src_load)
1550                 swap(orig_dst_load, orig_src_load);
1551
1552         old_imb = orig_dst_load * src_capacity * 100 -
1553                   orig_src_load * dst_capacity * env->imbalance_pct;
1554
1555         /* Would this change make things worse? */
1556         return (imb > old_imb);
1557 }
1558
/*
 * This checks if the overall compute and NUMA accesses of the system would
 * be improved if the source task were migrated to the target dst_cpu, taking
 * into account that it might be best if the task running on the dst_cpu
 * should be exchanged with the source task.
 *
 * @env:      migration state; env->dst_cpu is the CPU being evaluated
 * @taskimp:  improvement for env->p based on its own faults
 * @groupimp: improvement for env->p based on its numa group's faults
 *
 * If this destination beats env->best_imp, the result is recorded through
 * task_numa_assign(); otherwise @env is left untouched.
 */
static void task_numa_compare(struct task_numa_env *env,
                              long taskimp, long groupimp)
{
        struct rq *src_rq = cpu_rq(env->src_cpu);
        struct rq *dst_rq = cpu_rq(env->dst_cpu);
        struct task_struct *cur;
        long src_load, dst_load;
        long load;
        long imp = env->p->numa_group ? groupimp : taskimp;
        long moveimp = imp;
        int dist = env->dist;

        rcu_read_lock();
        cur = task_rcu_dereference(&dst_rq->curr);
        /* An exiting or idle task on dst_cpu is treated as "no task". */
        if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
                cur = NULL;

        /*
         * Because we have preemption enabled we can get migrated around and
         * end up trying to select ourselves (current == env->p) as a swap
         * candidate.
         */
        if (cur == env->p)
                goto unlock;

        /*
         * "imp" is the fault differential for the source task between the
         * source and destination node. Calculate the total differential for
         * the source task and potential destination task. The more negative
         * the value is, the more remote accesses that would be expected to
         * be incurred if the tasks were swapped.
         */
        if (cur) {
                /* Skip this swap candidate if cannot move to the source cpu */
                if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
                        goto unlock;

                /*
                 * If dst and source tasks are in the same NUMA group, or not
                 * in any group then look only at task weights.
                 */
                if (cur->numa_group == env->p->numa_group) {
                        imp = taskimp + task_weight(cur, env->src_nid, dist) -
                              task_weight(cur, env->dst_nid, dist);
                        /*
                         * Add some hysteresis to prevent swapping the
                         * tasks within a group over tiny differences.
                         */
                        if (cur->numa_group)
                                imp -= imp/16;
                } else {
                        /*
                         * Compare the group weights. If a task is all by
                         * itself (not part of a group), use the task weight
                         * instead.
                         */
                        if (cur->numa_group)
                                imp += group_weight(cur, env->src_nid, dist) -
                                       group_weight(cur, env->dst_nid, dist);
                        else
                                imp += task_weight(cur, env->src_nid, dist) -
                                       task_weight(cur, env->dst_nid, dist);
                }
        }

        /* Neither swapping nor plainly moving beats the best so far. */
        if (imp <= env->best_imp && moveimp <= env->best_imp)
                goto unlock;

        if (!cur) {
                /* Is there capacity at our destination? */
                if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
                    !env->dst_stats.has_free_capacity)
                        goto unlock;

                goto balance;
        }

        /* Balance doesn't matter much if we're running a task per cpu */
        if (imp > env->best_imp && src_rq->nr_running == 1 &&
                        dst_rq->nr_running == 1)
                goto assign;

        /*
         * In the overloaded case, try and keep the load balanced.
         */
balance:
        /* Node loads as they would be after moving env->p to the destination. */
        load = task_h_load(env->p);
        dst_load = env->dst_stats.load + load;
        src_load = env->src_stats.load - load;

        if (moveimp > imp && moveimp > env->best_imp) {
                /*
                 * If the improvement from just moving env->p directly is
                 * better than swapping tasks around, check if a move is
                 * possible. Store a slightly smaller score than moveimp,
                 * so an actually idle CPU will win.
                 */
                if (!load_too_imbalanced(src_load, dst_load, env)) {
                        imp = moveimp - 1;
                        cur = NULL;
                        goto assign;
                }
        }

        if (imp <= env->best_imp)
                goto unlock;

        /* For a swap, also account for cur moving back to the source. */
        if (cur) {
                load = task_h_load(cur);
                dst_load -= load;
                src_load += load;
        }

        if (load_too_imbalanced(src_load, dst_load, env))
                goto unlock;

        /*
         * One idle CPU per node is evaluated for a task numa move.
         * Call select_idle_sibling to maybe find a better one.
         */
        if (!cur) {
                /*
                 * select_idle_siblings() uses a per-cpu cpumask that
                 * can be used from IRQ context.
                 */
                local_irq_disable();
                env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
                                                   env->dst_cpu);
                local_irq_enable();
        }

assign:
        task_numa_assign(env, cur, imp);
unlock:
        rcu_read_unlock();
}
1700
1701 static void task_numa_find_cpu(struct task_numa_env *env,
1702                                 long taskimp, long groupimp)
1703 {
1704         int cpu;
1705
1706         for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1707                 /* Skip this CPU if the source task cannot migrate */
1708                 if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
1709                         continue;
1710
1711                 env->dst_cpu = cpu;
1712                 task_numa_compare(env, taskimp, groupimp);
1713         }
1714 }
1715
1716 /* Only move tasks to a NUMA node less busy than the current node. */
1717 static bool numa_has_capacity(struct task_numa_env *env)
1718 {
1719         struct numa_stats *src = &env->src_stats;
1720         struct numa_stats *dst = &env->dst_stats;
1721
1722         if (src->has_free_capacity && !dst->has_free_capacity)
1723                 return false;
1724
1725         /*
1726          * Only consider a task move if the source has a higher load
1727          * than the destination, corrected for CPU capacity on each node.
1728          *
1729          *      src->load                dst->load
1730          * --------------------- vs ---------------------
1731          * src->compute_capacity    dst->compute_capacity
1732          */
1733         if (src->load * dst->compute_capacity * env->imbalance_pct >
1734
1735             dst->load * src->compute_capacity * 100)
1736                 return true;
1737
1738         return false;
1739 }
1740
1741 static int task_numa_migrate(struct task_struct *p)
1742 {
1743         struct task_numa_env env = {
1744                 .p = p,
1745
1746                 .src_cpu = task_cpu(p),
1747                 .src_nid = task_node(p),
1748
1749                 .imbalance_pct = 112,
1750
1751                 .best_task = NULL,
1752                 .best_imp = 0,
1753                 .best_cpu = -1,
1754         };
1755         struct sched_domain *sd;
1756         unsigned long taskweight, groupweight;
1757         int nid, ret, dist;
1758         long taskimp, groupimp;
1759
1760         /*
1761          * Pick the lowest SD_NUMA domain, as that would have the smallest
1762          * imbalance and would be the first to start moving tasks about.
1763          *
1764          * And we want to avoid any moving of tasks about, as that would create
1765          * random movement of tasks -- counter the numa conditions we're trying
1766          * to satisfy here.
1767          */
1768         rcu_read_lock();
1769         sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1770         if (sd)
1771                 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1772         rcu_read_unlock();
1773
1774         /*
1775          * Cpusets can break the scheduler domain tree into smaller
1776          * balance domains, some of which do not cross NUMA boundaries.
1777          * Tasks that are "trapped" in such domains cannot be migrated
1778          * elsewhere, so there is no point in (re)trying.
1779          */
1780         if (unlikely(!sd)) {
1781                 p->numa_preferred_nid = task_node(p);
1782                 return -EINVAL;
1783         }
1784
1785         env.dst_nid = p->numa_preferred_nid;
1786         dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1787         taskweight = task_weight(p, env.src_nid, dist);
1788         groupweight = group_weight(p, env.src_nid, dist);
1789         update_numa_stats(&env.src_stats, env.src_nid);
1790         taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1791         groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1792         update_numa_stats(&env.dst_stats, env.dst_nid);
1793
1794         /* Try to find a spot on the preferred nid. */
1795         if (numa_has_capacity(&env))
1796                 task_numa_find_cpu(&env, taskimp, groupimp);
1797
1798         /*
1799          * Look at other nodes in these cases:
1800          * - there is no space available on the preferred_nid
1801          * - the task is part of a numa_group that is interleaved across
1802          *   multiple NUMA nodes; in order to better consolidate the group,
1803          *   we need to check other locations.
1804          */
1805         if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
1806                 for_each_online_node(nid) {
1807                         if (nid == env.src_nid || nid == p->numa_preferred_nid)
1808                                 continue;
1809
1810                         dist = node_distance(env.src_nid, env.dst_nid);
1811                         if (sched_numa_topology_type == NUMA_BACKPLANE &&
1812                                                 dist != env.dist) {
1813                                 taskweight = task_weight(p, env.src_nid, dist);
1814                                 groupweight = group_weight(p, env.src_nid, dist);
1815                         }
1816
1817                         /* Only consider nodes where both task and groups benefit */
1818                         taskimp = task_weight(p, nid, dist) - taskweight;
1819                         groupimp = group_weight(p, nid, dist) - groupweight;
1820                         if (taskimp < 0 && groupimp < 0)
1821                                 continue;
1822
1823                         env.dist = dist;
1824                         env.dst_nid = nid;
1825                         update_numa_stats(&env.dst_stats, env.dst_nid);
1826                         if (numa_has_capacity(&env))
1827                                 task_numa_find_cpu(&env, taskimp, groupimp);
1828                 }
1829         }
1830
1831         /*
1832          * If the task is part of a workload that spans multiple NUMA nodes,
1833          * and is migrating into one of the workload's active nodes, remember
1834          * this node as the task's preferred numa node, so the workload can
1835          * settle down.
1836          * A task that migrated to a second choice node will be better off
1837          * trying for a better one later. Do not set the preferred node here.
1838          */
1839         if (p->numa_group) {
1840                 struct numa_group *ng = p->numa_group;
1841
1842                 if (env.best_cpu == -1)
1843                         nid = env.src_nid;
1844                 else
1845                         nid = env.dst_nid;
1846
1847                 if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
1848                         sched_setnuma(p, env.dst_nid);
1849         }
1850
1851         /* No better CPU than the current one was found. */
1852         if (env.best_cpu == -1)
1853                 return -EAGAIN;
1854
1855         /*
1856          * Reset the scan period if the task is being rescheduled on an
1857          * alternative node to recheck if the tasks is now properly placed.
1858          */
1859         p->numa_scan_period = task_scan_start(p);
1860
1861         if (env.best_task == NULL) {
1862                 ret = migrate_task_to(p, env.best_cpu);
1863                 if (ret != 0)
1864                         trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1865                 return ret;
1866         }
1867
1868         ret = migrate_swap(p, env.best_task);
1869         if (ret != 0)
1870                 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1871         put_task_struct(env.best_task);
1872         return ret;
1873 }
1874
1875 /* Attempt to migrate a task to a CPU on the preferred node. */
1876 static void numa_migrate_preferred(struct task_struct *p)
1877 {
1878         unsigned long interval = HZ;
1879
1880         /* This task has no NUMA fault statistics yet */
1881         if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1882                 return;
1883
1884         /* Periodically retry migrating the task to the preferred node */
1885         interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1886         p->numa_migrate_retry = jiffies + interval;
1887
1888         /* Success if task is already running on preferred CPU */
1889         if (task_node(p) == p->numa_preferred_nid)
1890                 return;
1891
1892         /* Otherwise, try migrate to a CPU on the preferred node */
1893         task_numa_migrate(p);
1894 }
1895
1896 /*
1897  * Find out how many nodes on the workload is actively running on. Do this by
1898  * tracking the nodes from which NUMA hinting faults are triggered. This can
1899  * be different from the set of nodes where the workload's memory is currently
1900  * located.
1901  */
1902 static void numa_group_count_active_nodes(struct numa_group *numa_group)
1903 {
1904         unsigned long faults, max_faults = 0;
1905         int nid, active_nodes = 0;
1906
1907         for_each_online_node(nid) {
1908                 faults = group_faults_cpu(numa_group, nid);
1909                 if (faults > max_faults)
1910                         max_faults = faults;
1911         }
1912
1913         for_each_online_node(nid) {
1914                 faults = group_faults_cpu(numa_group, nid);
1915                 if (faults * ACTIVE_NODE_FRACTION > max_faults)
1916                         active_nodes++;
1917         }
1918
1919         numa_group->max_faults_cpu = max_faults;
1920         numa_group->active_nodes = active_nodes;
1921 }
1922
/*
 * Scan rate adaptation works in steps: the current scan period is split
 * into NUMA_PERIOD_SLOTS increments, and fault ratios are expressed on the
 * same 0..NUMA_PERIOD_SLOTS scale. A ratio at or above
 * NUMA_PERIOD_THRESHOLD lengthens the next scan period, a ratio below it
 * shortens the period. With 7 out of 10 slots the target is 70%.
 */
#define NUMA_PERIOD_SLOTS	10
#define NUMA_PERIOD_THRESHOLD	7
1932
1933 /*
1934  * Increase the scan period (slow down scanning) if the majority of
1935  * our memory is already on our local node, or if the majority of
1936  * the page accesses are shared with other processes.
1937  * Otherwise, decrease the scan period.
1938  */
1939 static void update_task_scan_period(struct task_struct *p,
1940                         unsigned long shared, unsigned long private)
1941 {
1942         unsigned int period_slot;
1943         int lr_ratio, ps_ratio;
1944         int diff;
1945
1946         unsigned long remote = p->numa_faults_locality[0];
1947         unsigned long local = p->numa_faults_locality[1];
1948
1949         /*
1950          * If there were no record hinting faults then either the task is
1951          * completely idle or all activity is areas that are not of interest
1952          * to automatic numa balancing. Related to that, if there were failed
1953          * migration then it implies we are migrating too quickly or the local
1954          * node is overloaded. In either case, scan slower
1955          */
1956         if (local + shared == 0 || p->numa_faults_locality[2]) {
1957                 p->numa_scan_period = min(p->numa_scan_period_max,
1958                         p->numa_scan_period << 1);
1959
1960                 p->mm->numa_next_scan = jiffies +
1961                         msecs_to_jiffies(p->numa_scan_period);
1962
1963                 return;
1964         }
1965
1966         /*
1967          * Prepare to scale scan period relative to the current period.
1968          *       == NUMA_PERIOD_THRESHOLD scan period stays the same
1969          *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1970          *       >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1971          */
1972         period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1973         lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1974         ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
1975
1976         if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
1977                 /*
1978                  * Most memory accesses are local. There is no need to
1979                  * do fast NUMA scanning, since memory is already local.
1980                  */
1981                 int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
1982                 if (!slot)
1983                         slot = 1;
1984                 diff = slot * period_slot;
1985         } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
1986                 /*
1987                  * Most memory accesses are shared with other tasks.
1988                  * There is no point in continuing fast NUMA scanning,
1989                  * since other tasks may just move the memory elsewhere.
1990                  */
1991                 int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
1992                 if (!slot)
1993                         slot = 1;
1994                 diff = slot * period_slot;
1995         } else {
1996                 /*
1997                  * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
1998                  * yet they are not on the local NUMA node. Speed up
1999                  * NUMA scanning to get the memory moved over.
2000                  */
2001                 int ratio = max(lr_ratio, ps_ratio);
2002                 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
2003         }
2004
2005         p->numa_scan_period = clamp(p->numa_scan_period + diff,
2006                         task_scan_min(p), task_scan_max(p));
2007         memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2008 }
2009
2010 /*
2011  * Get the fraction of time the task has been running since the last
2012  * NUMA placement cycle. The scheduler keeps similar statistics, but
2013  * decays those on a 32ms period, which is orders of magnitude off
2014  * from the dozens-of-seconds NUMA balancing period. Use the scheduler
2015  * stats only if the task is so new there are no NUMA statistics yet.
2016  */
2017 static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
2018 {
2019         u64 runtime, delta, now;
2020         /* Use the start of this time slice to avoid calculations. */
2021         now = p->se.exec_start;
2022         runtime = p->se.sum_exec_runtime;
2023
2024         if (p->last_task_numa_placement) {
2025                 delta = runtime - p->last_sum_exec_runtime;
2026                 *period = now - p->last_task_numa_placement;
2027         } else {
2028                 delta = p->se.avg.load_sum / p->se.load.weight;
2029                 *period = LOAD_AVG_MAX;
2030         }
2031
2032         p->last_sum_exec_runtime = runtime;
2033         p->last_task_numa_placement = now;
2034
2035         return delta;
2036 }
2037
2038 /*
2039  * Determine the preferred nid for a task in a numa_group. This needs to
2040  * be done in a way that produces consistent results with group_weight,
2041  * otherwise workloads might not converge.
2042  */
2043 static int preferred_group_nid(struct task_struct *p, int nid)
2044 {
2045         nodemask_t nodes;
2046         int dist;
2047
2048         /* Direct connections between all NUMA nodes. */
2049         if (sched_numa_topology_type == NUMA_DIRECT)
2050                 return nid;
2051
2052         /*
2053          * On a system with glueless mesh NUMA topology, group_weight
2054          * scores nodes according to the number of NUMA hinting faults on
2055          * both the node itself, and on nearby nodes.
2056          */
2057         if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
2058                 unsigned long score, max_score = 0;
2059                 int node, max_node = nid;
2060
2061                 dist = sched_max_numa_distance;
2062
2063                 for_each_online_node(node) {
2064                         score = group_weight(p, node, dist);
2065                         if (score > max_score) {
2066                                 max_score = score;
2067                                 max_node = node;
2068                         }
2069                 }
2070                 return max_node;
2071         }
2072
2073         /*
2074          * Finding the preferred nid in a system with NUMA backplane
2075          * interconnect topology is more involved. The goal is to locate
2076          * tasks from numa_groups near each other in the system, and
2077          * untangle workloads from different sides of the system. This requires
2078          * searching down the hierarchy of node groups, recursively searching
2079          * inside the highest scoring group of nodes. The nodemask tricks
2080          * keep the complexity of the search down.
2081          */
2082         nodes = node_online_map;
2083         for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
2084                 unsigned long max_faults = 0;
2085                 nodemask_t max_group = NODE_MASK_NONE;
2086                 int a, b;
2087
2088                 /* Are there nodes at this distance from each other? */
2089                 if (!find_numa_distance(dist))
2090                         continue;
2091
2092                 for_each_node_mask(a, nodes) {
2093                         unsigned long faults = 0;
2094                         nodemask_t this_group;
2095                         nodes_clear(this_group);
2096
2097                         /* Sum group's NUMA faults; includes a==b case. */
2098                         for_each_node_mask(b, nodes) {
2099                                 if (node_distance(a, b) < dist) {
2100                                         faults += group_faults(p, b);
2101                                         node_set(b, this_group);
2102                                         node_clear(b, nodes);
2103                                 }
2104                         }
2105
2106                         /* Remember the top group. */
2107                         if (faults > max_faults) {
2108                                 max_faults = faults;
2109                                 max_group = this_group;
2110                                 /*
2111                                  * subtle: at the smallest distance there is
2112                                  * just one node left in each "group", the
2113                                  * winner is the preferred nid.
2114                                  */
2115                                 nid = a;
2116                         }
2117                 }
2118                 /* Next round, evaluate the nodes within max_group. */
2119                 if (!max_faults)
2120                         break;
2121                 nodes = max_group;
2122         }
2123         return nid;
2124 }
2125
2126 static void task_numa_placement(struct task_struct *p)
2127 {
2128         int seq, nid, max_nid = -1, max_group_nid = -1;
2129         unsigned long max_faults = 0, max_group_faults = 0;
2130         unsigned long fault_types[2] = { 0, 0 };
2131         unsigned long total_faults;
2132         u64 runtime, period;
2133         spinlock_t *group_lock = NULL;
2134
2135         /*
2136          * The p->mm->numa_scan_seq field gets updated without
2137          * exclusive access. Use READ_ONCE() here to ensure
2138          * that the field is read in a single access:
2139          */
2140         seq = READ_ONCE(p->mm->numa_scan_seq);
2141         if (p->numa_scan_seq == seq)
2142                 return;
2143         p->numa_scan_seq = seq;
2144         p->numa_scan_period_max = task_scan_max(p);
2145
2146         total_faults = p->numa_faults_locality[0] +
2147                        p->numa_faults_locality[1];
2148         runtime = numa_get_avg_runtime(p, &period);
2149
2150         /* If the task is part of a group prevent parallel updates to group stats */
2151         if (p->numa_group) {
2152                 group_lock = &p->numa_group->lock;
2153                 spin_lock_irq(group_lock);
2154         }
2155
2156         /* Find the node with the highest number of faults */
2157         for_each_online_node(nid) {
2158                 /* Keep track of the offsets in numa_faults array */
2159                 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
2160                 unsigned long faults = 0, group_faults = 0;
2161                 int priv;
2162
2163                 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
2164                         long diff, f_diff, f_weight;
2165
2166                         mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2167                         membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2168                         cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2169                         cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
2170
2171                         /* Decay existing window, copy faults since last scan */
2172                         diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2173                         fault_types[priv] += p->numa_faults[membuf_idx];
2174                         p->numa_faults[membuf_idx] = 0;
2175
2176                         /*
2177                          * Normalize the faults_from, so all tasks in a group
2178                          * count according to CPU use, instead of by the raw
2179                          * number of faults. Tasks with little runtime have
2180                          * little over-all impact on throughput, and thus their
2181                          * faults are less important.
2182                          */
2183                         f_weight = div64_u64(runtime << 16, period + 1);
2184                         f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
2185                                    (total_faults + 1);
2186                         f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2187                         p->numa_faults[cpubuf_idx] = 0;
2188
2189                         p->numa_faults[mem_idx] += diff;
2190                         p->numa_faults[cpu_idx] += f_diff;
2191                         faults += p->numa_faults[mem_idx];
2192                         p->total_numa_faults += diff;
2193                         if (p->numa_group) {
2194                                 /*
2195                                  * safe because we can only change our own group
2196                                  *
2197                                  * mem_idx represents the offset for a given
2198                                  * nid and priv in a specific region because it
2199                                  * is at the beginning of the numa_faults array.
2200                                  */
2201                                 p->numa_group->faults[mem_idx] += diff;
2202                                 p->numa_group->faults_cpu[mem_idx] += f_diff;
2203                                 p->numa_group->total_faults += diff;
2204                                 group_faults += p->numa_group->faults[mem_idx];
2205                         }
2206                 }
2207
2208                 if (faults > max_faults) {
2209                         max_faults = faults;
2210                         max_nid = nid;
2211                 }
2212
2213                 if (group_faults > max_group_faults) {
2214                         max_group_faults = group_faults;
2215                         max_group_nid = nid;
2216                 }
2217         }
2218
2219         update_task_scan_period(p, fault_types[0], fault_types[1]);
2220
2221         if (p->numa_group) {
2222                 numa_group_count_active_nodes(p->numa_group);
2223                 spin_unlock_irq(group_lock);
2224                 max_nid = preferred_group_nid(p, max_group_nid);
2225         }
2226
2227         if (max_faults) {
2228                 /* Set the new preferred node */
2229                 if (max_nid != p->numa_preferred_nid)
2230                         sched_setnuma(p, max_nid);
2231
2232                 if (task_node(p) != p->numa_preferred_nid)
2233                         numa_migrate_preferred(p);
2234         }
2235 }
2236
2237 static inline int get_numa_group(struct numa_group *grp)
2238 {
2239         return atomic_inc_not_zero(&grp->refcount);
2240 }
2241
2242 static inline void put_numa_group(struct numa_group *grp)
2243 {
2244         if (atomic_dec_and_test(&grp->refcount))
2245                 kfree_rcu(grp, rcu);
2246 }
2247
2248 static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2249                         int *priv)
2250 {
2251         struct numa_group *grp, *my_grp;
2252         struct task_struct *tsk;
2253         bool join = false;
2254         int cpu = cpupid_to_cpu(cpupid);
2255         int i;
2256
2257         if (unlikely(!p->numa_group)) {
2258                 unsigned int size = sizeof(struct numa_group) +
2259                                     4*nr_node_ids*sizeof(unsigned long);
2260
2261                 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2262                 if (!grp)
2263                         return;
2264
2265                 atomic_set(&grp->refcount, 1);
2266                 grp->active_nodes = 1;
2267                 grp->max_faults_cpu = 0;
2268                 spin_lock_init(&grp->lock);
2269                 grp->gid = p->pid;
2270                 /* Second half of the array tracks nids where faults happen */
2271                 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
2272                                                 nr_node_ids;
2273
2274                 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2275                         grp->faults[i] = p->numa_faults[i];
2276
2277                 grp->total_faults = p->total_numa_faults;
2278
2279                 grp->nr_tasks++;
2280                 rcu_assign_pointer(p->numa_group, grp);
2281         }
2282
2283         rcu_read_lock();
2284         tsk = READ_ONCE(cpu_rq(cpu)->curr);
2285
2286         if (!cpupid_match_pid(tsk, cpupid))
2287                 goto no_join;
2288
2289         grp = rcu_dereference(tsk->numa_group);
2290         if (!grp)
2291                 goto no_join;
2292
2293         my_grp = p->numa_group;
2294         if (grp == my_grp)
2295                 goto no_join;
2296
2297         /*
2298          * Only join the other group if its bigger; if we're the bigger group,
2299          * the other task will join us.
2300          */
2301         if (my_grp->nr_tasks > grp->nr_tasks)
2302                 goto no_join;
2303
2304         /*
2305          * Tie-break on the grp address.
2306          */
2307         if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
2308                 goto no_join;
2309
2310         /* Always join threads in the same process. */
2311         if (tsk->mm == current->mm)
2312                 join = true;
2313
2314         /* Simple filter to avoid false positives due to PID collisions */
2315         if (flags & TNF_SHARED)
2316                 join = true;
2317
2318         /* Update priv based on whether false sharing was detected */
2319         *priv = !join;
2320
2321         if (join && !get_numa_group(grp))
2322                 goto no_join;
2323
2324         rcu_read_unlock();
2325
2326         if (!join)
2327                 return;
2328
2329         BUG_ON(irqs_disabled());
2330         double_lock_irq(&my_grp->lock, &grp->lock);
2331
2332         for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
2333                 my_grp->faults[i] -= p->numa_faults[i];
2334                 grp->faults[i] += p->numa_faults[i];
2335         }
2336         my_grp->total_faults -= p->total_numa_faults;
2337         grp->total_faults += p->total_numa_faults;
2338
2339         my_grp->nr_tasks--;
2340         grp->nr_tasks++;
2341
2342         spin_unlock(&my_grp->lock);
2343         spin_unlock_irq(&grp->lock);
2344
2345         rcu_assign_pointer(p->numa_group, grp);
2346
2347         put_numa_group(my_grp);
2348         return;
2349
2350 no_join:
2351         rcu_read_unlock();
2352         return;
2353 }
2354
2355 void task_numa_free(struct task_struct *p)
2356 {
2357         struct numa_group *grp = p->numa_group;
2358         void *numa_faults = p->numa_faults;
2359         unsigned long flags;
2360         int i;
2361
2362         if (grp) {
2363                 spin_lock_irqsave(&grp->lock, flags);
2364                 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2365                         grp->faults[i] -= p->numa_faults[i];
2366                 grp->total_faults -= p->total_numa_faults;
2367
2368                 grp->nr_tasks--;
2369                 spin_unlock_irqrestore(&grp->lock, flags);
2370                 RCU_INIT_POINTER(p->numa_group, NULL);
2371                 put_numa_group(grp);
2372         }
2373
2374         p->numa_faults = NULL;
2375         kfree(numa_faults);
2376 }
2377
2378 /*
2379  * Got a PROT_NONE fault for a page on @node.
2380  */
2381 void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2382 {
2383         struct task_struct *p = current;
2384         bool migrated = flags & TNF_MIGRATED;
2385         int cpu_node = task_node(current);
2386         int local = !!(flags & TNF_FAULT_LOCAL);
2387         struct numa_group *ng;
2388         int priv;
2389
2390         if (!static_branch_likely(&sched_numa_balancing))
2391                 return;
2392
2393         /* for example, ksmd faulting in a user's mm */
2394         if (!p->mm)
2395                 return;
2396
2397         /* Allocate buffer to track faults on a per-node basis */
2398         if (unlikely(!p->numa_faults)) {
2399                 int size = sizeof(*p->numa_faults) *
2400                            NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2401
2402                 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2403                 if (!p->numa_faults)
2404                         return;
2405
2406                 p->total_numa_faults = 0;
2407                 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2408         }
2409
2410         /*
2411          * First accesses are treated as private, otherwise consider accesses
2412          * to be private if the accessing pid has not changed
2413          */
2414         if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2415                 priv = 1;
2416         } else {
2417                 priv = cpupid_match_pid(p, last_cpupid);
2418                 if (!priv && !(flags & TNF_NO_GROUP))
2419                         task_numa_group(p, last_cpupid, flags, &priv);
2420         }
2421
2422         /*
2423          * If a workload spans multiple NUMA nodes, a shared fault that
2424          * occurs wholly within the set of nodes that the workload is
2425          * actively using should be counted as local. This allows the
2426          * scan rate to slow down when a workload has settled down.
2427          */
2428         ng = p->numa_group;
2429         if (!priv && !local && ng && ng->active_nodes > 1 &&
2430                                 numa_is_active_node(cpu_node, ng) &&
2431                                 numa_is_active_node(mem_node, ng))
2432                 local = 1;
2433
2434         task_numa_placement(p);
2435
2436         /*
2437          * Retry task to preferred node migration periodically, in case it
2438          * case it previously failed, or the scheduler moved us.
2439          */
2440         if (time_after(jiffies, p->numa_migrate_retry))
2441                 numa_migrate_preferred(p);
2442
2443         if (migrated)
2444                 p->numa_pages_migrated += pages;
2445         if (flags & TNF_MIGRATE_FAIL)
2446                 p->numa_faults_locality[2] += pages;
2447
2448         p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2449         p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2450         p->numa_faults_locality[local] += pages;
2451 }
2452
2453 static void reset_ptenuma_scan(struct task_struct *p)
2454 {
2455         /*
2456          * We only did a read acquisition of the mmap sem, so
2457          * p->mm->numa_scan_seq is written to without exclusive access
2458          * and the update is not guaranteed to be atomic. That's not
2459          * much of an issue though, since this is just used for
2460          * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2461          * expensive, to avoid any form of compiler optimizations:
2462          */
2463         WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2464         p->mm->numa_scan_offset = 0;
2465 }
2466
/*
 * task_numa_work - scan part of current's address space for NUMA hinting.
 * @work: the callback_head embedded in current's task_struct as numa_work
 *
 * The expensive part of numa migration is done from task_work context.
 * Triggered from task_tick_numa().
 *
 * Walks the VMAs of current->mm starting at mm->numa_scan_offset and calls
 * change_prot_numa() on eligible ranges until the page budget derived from
 * sysctl_numa_balancing_scan_size (or eight times that much virtual space)
 * has been consumed, then records where the next pass should resume.
 */
void task_numa_work(struct callback_head *work)
{
        unsigned long migrate, next_scan, now = jiffies;
        struct task_struct *p = current;
        struct mm_struct *mm = p->mm;
        u64 runtime = p->se.sum_exec_runtime;
        struct vm_area_struct *vma;
        unsigned long start, end;
        unsigned long nr_pte_updates = 0;
        long pages, virtpages;

        SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));

        /*
         * Point the work item back at itself; task_tick_numa() only queues
         * the work when work->next == work.
         */
        work->next = work; /* protect against double add */
        /*
         * Who cares about NUMA placement when they're dying.
         *
         * NOTE: make sure not to dereference p->mm before this check,
         * exit_task_work() happens _after_ exit_mm() so we could be called
         * without p->mm even though we still had it when we enqueued this
         * work.
         */
        if (p->flags & PF_EXITING)
                return;

        /* First scan for this mm: arm the initial scan delay. */
        if (!mm->numa_next_scan) {
                mm->numa_next_scan = now +
                        msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
        }

        /*
         * Enforce maximal scan/migration frequency..
         */
        migrate = mm->numa_next_scan;
        if (time_before(now, migrate))
                return;

        /* Lazily initialize this task's scan period on first use. */
        if (p->numa_scan_period == 0) {
                p->numa_scan_period_max = task_scan_max(p);
                p->numa_scan_period = task_scan_start(p);
        }

        /*
         * Only the caller that manages to advance mm->numa_next_scan from
         * the value sampled above performs the scan; everyone else backs
         * off.
         */
        next_scan = now + msecs_to_jiffies(p->numa_scan_period);
        if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
                return;

        /*
         * Delay this task enough that another task of this mm will likely win
         * the next time around.
         */
        p->node_stamp += 2 * TICK_NSEC;

        start = mm->numa_scan_offset;
        pages = sysctl_numa_balancing_scan_size;
        pages <<= 20 - PAGE_SHIFT; /* MB in pages */
        virtpages = pages * 8;     /* Scan up to this much virtual space */
        if (!pages)
                return;


        /* Best effort only: skip this round if the mmap sem is contended. */
        if (!down_read_trylock(&mm->mmap_sem))
                return;
        vma = find_vma(mm, start);
        if (!vma) {
                /* Nothing at or above the offset: wrap to the first VMA. */
                reset_ptenuma_scan(p);
                start = 0;
                vma = mm->mmap;
        }
        for (; vma; vma = vma->vm_next) {
                if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
                        is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
                        continue;
                }

                /*
                 * Shared library pages mapped by multiple processes are not
                 * migrated as it is expected they are cache replicated. Avoid
                 * hinting faults in read-only file-backed mappings or the vdso
                 * as migrating the pages will be of marginal benefit.
                 */
                if (!vma->vm_mm ||
                    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
                        continue;

                /*
                 * Skip inaccessible VMAs to avoid any confusion between
                 * PROT_NONE and NUMA hinting ptes
                 */
                if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
                        continue;

                do {
                        /*
                         * Scan one chunk: from the cursor up to the next
                         * HPAGE_SIZE-aligned boundary past the remaining page
                         * budget, clipped to the end of this VMA.
                         */
                        start = max(start, vma->vm_start);
                        end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
                        end = min(end, vma->vm_end);
                        nr_pte_updates = change_prot_numa(vma, start, end);

                        /*
                         * Try to scan sysctl_numa_balancing_scan_size worth of
                         * hpages that have at least one present PTE that
                         * is not already pte-numa. If the VMA contains
                         * areas that are unused or already full of prot_numa
                         * PTEs, scan up to virtpages, to skip through those
                         * areas faster.
                         */
                        if (nr_pte_updates)
                                pages -= (end - start) >> PAGE_SHIFT;
                        virtpages -= (end - start) >> PAGE_SHIFT;

                        start = end;
                        if (pages <= 0 || virtpages <= 0)
                                goto out;

                        cond_resched();
                } while (end != vma->vm_end);
        }

out:
        /*
         * It is possible to reach the end of the VMA list but the last few
         * VMAs are not guaranteed to be vma_migratable. If they are not, we
         * would find the !migratable VMA on the next scan but not reset the
         * scanner to the start so check it now.
         */
        if (vma)
                mm->numa_scan_offset = start;
        else
                reset_ptenuma_scan(p);
        up_read(&mm->mmap_sem);

        /*
         * Make sure tasks use at least 32x as much time to run other code
         * than they used here, to limit NUMA PTE scanning overhead to 3% max.
         * Usually update_task_scan_period slows down scanning enough; on an
         * overloaded system we need to limit overhead on a per task basis.
         */
        if (unlikely(p->se.sum_exec_runtime != runtime)) {
                u64 diff = p->se.sum_exec_runtime - runtime;
                p->node_stamp += 32 * diff;
        }
}
2612
2613 /*
2614  * Drive the periodic memory faults..
2615  */
2616 void task_tick_numa(struct rq *rq, struct task_struct *curr)
2617 {
2618         struct callback_head *work = &curr->numa_work;
2619         u64 period, now;
2620
2621         /*
2622          * We don't care about NUMA placement if we don't have memory.
2623          */
2624         if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
2625                 return;
2626
2627         /*
2628          * Using runtime rather than walltime has the dual advantage that
2629          * we (mostly) drive the selection from busy threads and that the
2630          * task needs to have done some actual work before we bother with
2631          * NUMA placement.
2632          */
2633         now = curr->se.sum_exec_runtime;
2634         period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2635
2636         if (now > curr->node_stamp + period) {
2637                 if (!curr->node_stamp)
2638                         curr->numa_scan_period = task_scan_start(curr);
2639                 curr->node_stamp += period;
2640
2641                 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2642                         init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2643                         task_work_add(curr, work, true);
2644                 }
2645         }
2646 }
2647
2648 #else
/* CONFIG_NUMA_BALANCING is off: the periodic NUMA tick is a no-op. */
static void task_tick_numa(struct rq *rq, struct task_struct *curr) {}

/* CONFIG_NUMA_BALANCING is off: nothing to account on enqueue. */
static inline void
account_numa_enqueue(struct rq *rq, struct task_struct *p) {}

/* CONFIG_NUMA_BALANCING is off: nothing to account on dequeue. */
static inline void
account_numa_dequeue(struct rq *rq, struct task_struct *p) {}
2660
2661 #endif /* CONFIG_NUMA_BALANCING */
2662
2663 static void
2664 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2665 {
2666         update_load_add(&cfs_rq->load, se->load.weight);
2667         if (!parent_entity(se))
2668                 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
2669 #ifdef CONFIG_SMP
2670         if (entity_is_task(se)) {
2671                 struct rq *rq = rq_of(cfs_rq);
2672
2673                 account_numa_enqueue(rq, task_of(se));
2674                 list_add(&se->group_node, &rq->cfs_tasks);
2675         }
2676 #endif
2677         cfs_rq->nr_running++;
2678 }
2679
2680 static void
2681 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2682 {
2683         update_load_sub(&cfs_rq->load, se->load.weight);
2684         if (!parent_entity(se))
2685                 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
2686 #ifdef CONFIG_SMP
2687         if (entity_is_task(se)) {
2688                 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2689                 list_del_init(&se->group_node);
2690         }
2691 #endif
2692         cfs_rq->nr_running--;
2693 }
2694
2695 #ifdef CONFIG_FAIR_GROUP_SCHED
2696 # ifdef CONFIG_SMP
2697 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2698 {
2699         long tg_weight, load, shares;
2700
2701         /*
2702          * This really should be: cfs_rq->avg.load_avg, but instead we use
2703          * cfs_rq->load.weight, which is its upper bound. This helps ramp up
2704          * the shares for small weight interactive tasks.
2705          */
2706         load = scale_load_down(cfs_rq->load.weight);
2707
2708         tg_weight = atomic_long_read(&tg->load_avg);
2709
2710         /* Ensure tg_weight >= load */
2711         tg_weight -= cfs_rq->tg_load_avg_contrib;
2712         tg_weight += load;
2713
2714         shares = (tg->shares * load);
2715         if (tg_weight)
2716                 shares /= tg_weight;
2717
2718         /*
2719          * MIN_SHARES has to be unscaled here to support per-CPU partitioning
2720          * of a group with small tg->shares value. It is a floor value which is
2721          * assigned as a minimum load.weight to the sched_entity representing
2722          * the group on a CPU.
2723          *
2724          * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
2725          * on an 8-core system with 8 tasks each runnable on one CPU shares has
2726          * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
2727          * case no task is runnable on a CPU MIN_SHARES=2 should be returned
2728          * instead of 0.
2729          */
2730         if (shares < MIN_SHARES)
2731                 shares = MIN_SHARES;
2732         if (shares > tg->shares)
2733                 shares = tg->shares;
2734
2735         return shares;
2736 }
2737 # else /* CONFIG_SMP */
2738 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2739 {
2740         return tg->shares;
2741 }
2742 # endif /* CONFIG_SMP */
2743
2744 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2745                             unsigned long weight)
2746 {
2747         if (se->on_rq) {
2748                 /* commit outstanding execution time */
2749                 if (cfs_rq->curr == se)
2750                         update_curr(cfs_rq);
2751                 account_entity_dequeue(cfs_rq, se);
2752         }
2753
2754         update_load_set(&se->load, weight);
2755
2756         if (se->on_rq)
2757                 account_entity_enqueue(cfs_rq, se);
2758 }
2759
2760 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2761
2762 static void update_cfs_shares(struct sched_entity *se)
2763 {
2764         struct cfs_rq *cfs_rq = group_cfs_rq(se);
2765         struct task_group *tg;
2766         long shares;
2767
2768         if (!cfs_rq)
2769                 return;
2770
2771         if (throttled_hierarchy(cfs_rq))
2772                 return;
2773
2774         tg = cfs_rq->tg;
2775
2776 #ifndef CONFIG_SMP
2777         if (likely(se->load.weight == tg->shares))
2778                 return;
2779 #endif
2780         shares = calc_cfs_shares(cfs_rq, tg);
2781
2782         reweight_entity(cfs_rq_of(se), se, shares);
2783 }
2784
2785 #else /* CONFIG_FAIR_GROUP_SCHED */
/* !CONFIG_FAIR_GROUP_SCHED: there are no group shares to update. */
static inline void update_cfs_shares(struct sched_entity *se) {}
2789 #endif /* CONFIG_FAIR_GROUP_SCHED */
2790
2791 static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
2792 {
2793         struct rq *rq = rq_of(cfs_rq);
2794
2795         if (&rq->cfs == cfs_rq) {
2796                 /*
2797                  * There are a few boundary cases this might miss but it should
2798                  * get called often enough that that should (hopefully) not be
2799                  * a real problem -- added to that it only calls on the local
2800                  * CPU, so if we enqueue remotely we'll miss an update, but
2801                  * the next tick/schedule should update.
2802                  *
2803                  * It will not get called when we go idle, because the idle
2804                  * thread is a different class (!fair), nor will the utilization
2805                  * number include things like RT tasks.
2806                  *
2807                  * As is, the util number is not freq-invariant (we'd have to
2808                  * implement arch_scale_freq_capacity() for that).
2809                  *
2810                  * See cpu_util().
2811                  */
2812                 cpufreq_update_util(rq, 0);
2813         }
2814 }
2815
2816 #ifdef CONFIG_SMP
2817 /*
2818  * Approximate:
2819  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
2820  */
2821 static u64 decay_load(u64 val, u64 n)
2822 {
2823         unsigned int local_n;
2824
2825         if (unlikely(n > LOAD_AVG_PERIOD * 63))
2826                 return 0;
2827
2828         /* after bounds checking we can collapse to 32-bit */
2829         local_n = n;
2830
2831         /*
2832          * As y^PERIOD = 1/2, we can combine
2833          *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
2834          * With a look-up table which covers y^n (n<PERIOD)
2835          *
2836          * To achieve constant time decay_load.
2837          */
2838         if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
2839                 val >>= local_n / LOAD_AVG_PERIOD;
2840                 local_n %= LOAD_AVG_PERIOD;
2841         }
2842
2843         val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
2844         return val;
2845 }
2846
2847 static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
2848 {
2849         u32 c1, c2, c3 = d3; /* y^0 == 1 */
2850
2851         /*
2852          * c1 = d1 y^p
2853          */
2854         c1 = decay_load((u64)d1, periods);
2855
2856         /*
2857          *            p-1
2858          * c2 = 1024 \Sum y^n
2859          *            n=1
2860          *
2861          *              inf        inf
2862          *    = 1024 ( \Sum y^n - \Sum y^n - y^0 )
2863          *              n=0        n=p
2864          */
2865         c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
2866
2867         return c1 + c2 + c3;
2868 }
2869
/*
 * Multiply @v by the fixed-point factor @s and shift the product right by
 * SCHED_CAPACITY_SHIFT.
 */
#define cap_scale(v, s) (((v) * (s)) >> SCHED_CAPACITY_SHIFT)
2871
2872 /*
2873  * Accumulate the three separate parts of the sum; d1 the remainder
2874  * of the last (incomplete) period, d2 the span of full periods and d3
2875  * the remainder of the (incomplete) current period.
2876  *
2877  *           d1          d2           d3
2878  *           ^           ^            ^
2879  *           |           |            |
2880  *         |<->|<----------------->|<--->|
2881  * ... |---x---|------| ... |------|-----x (now)
2882  *
2883  *                           p-1
2884  * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
2885  *                           n=1
2886  *
2887  *    = u y^p +                                 (Step 1)
2888  *
2889  *                     p-1
2890  *      d1 y^p + 1024 \Sum y^n + d3 y^0         (Step 2)
2891  *                     n=1
2892  */
2893 static __always_inline u32
2894 accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
2895                unsigned long weight, int running, struct cfs_rq *cfs_rq)
2896 {
2897         unsigned long scale_freq, scale_cpu;
2898         u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
2899         u64 periods;
2900
2901         scale_freq = arch_scale_freq_capacity(NULL, cpu);
2902         scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
2903
2904         delta += sa->period_contrib;
2905         periods = delta / 1024; /* A period is 1024us (~1ms) */
2906
2907         /*
2908          * Step 1: decay old *_sum if we crossed period boundaries.
2909          */
2910         if (periods) {
2911                 sa->load_sum = decay_load(sa->load_sum, periods);
2912                 if (cfs_rq) {
2913                         cfs_rq->runnable_load_sum =
2914                                 decay_load(cfs_rq->runnable_load_sum, periods);
2915                 }
2916                 sa->util_sum = decay_load((u64)(sa->util_sum), periods);
2917
2918                 /*
2919                  * Step 2
2920                  */
2921                 delta %= 1024;
2922                 contrib = __accumulate_pelt_segments(periods,
2923                                 1024 - sa->period_contrib, delta);
2924         }
2925         sa->period_contrib = delta;
2926
2927         contrib = cap_scale(contrib, scale_freq);
2928         if (weight) {
2929                 sa->load_sum += weight * contrib;
2930                 if (cfs_rq)
2931                         cfs_rq->runnable_load_sum += weight * contrib;
2932         }
2933         if (running)
2934                 sa->util_sum += contrib * scale_cpu;
2935
2936         return periods;
2937 }
2938
2939 /*
2940  * We can represent the historical contribution to runnable average as the
2941  * coefficients of a geometric series.  To do this we sub-divide our runnable
2942  * history into segments of approximately 1ms (1024us); label the segment that
2943  * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
2944  *
2945  * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
2946  *      p0            p1           p2
2947  *     (now)       (~1ms ago)  (~2ms ago)
2948  *
2949  * Let u_i denote the fraction of p_i that the entity was runnable.
2950  *
2951  * We then designate the fractions u_i as our co-efficients, yielding the
2952  * following representation of historical load:
2953  *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
2954  *
 * We choose y based on the width of a reasonable scheduling period, fixing:
2956  *   y^32 = 0.5
2957  *
2958  * This means that the contribution to load ~32ms ago (u_32) will be weighted
2959  * approximately half as much as the contribution to load within the last ms
2960  * (u_0).
2961  *
2962  * When a period "rolls over" and we have new u_0`, multiplying the previous
2963  * sum again by y is sufficient to update:
2964  *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
2965  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
2966  */
2967 static __always_inline int
2968 ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2969                   unsigned long weight, int running, struct cfs_rq *cfs_rq)
2970 {
2971         u64 delta;
2972
2973         delta = now - sa->last_update_time;
2974         /*
2975          * This should only happen when time goes backwards, which it
2976          * unfortunately does during sched clock init when we swap over to TSC.
2977          */
2978         if ((s64)delta < 0) {
2979                 sa->last_update_time = now;
2980                 return 0;
2981         }
2982
2983         /*
2984          * Use 1024ns as the unit of measurement since it's a reasonable
2985          * approximation of 1us and fast to compute.
2986          */
2987         delta >>= 10;
2988         if (!delta)
2989                 return 0;
2990
2991         sa->last_update_time += delta << 10;
2992
2993         /*
2994          * running is a subset of runnable (weight) so running can't be set if
2995          * runnable is clear. But there are some corner cases where the current
2996          * se has been already dequeued but cfs_rq->curr still points to it.
2997          * This means that weight will be 0 but not running for a sched_entity
2998          * but also for a cfs_rq if the latter becomes idle. As an example,
2999          * this happens during idle_balance() which calls
3000          * update_blocked_averages()
3001          */
3002         if (!weight)
3003                 running = 0;
3004
3005         /*
3006          * Now we know we crossed measurement unit boundaries. The *_avg
3007          * accrues by two steps:
3008          *
3009          * Step 1: accumulate *_sum since last_update_time. If we haven't
3010          * crossed period boundaries, finish.
3011          */
3012         if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
3013                 return 0;
3014
3015         /*
3016          * Step 2: update *_avg.
3017          */
3018         sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
3019         if (cfs_rq) {
3020                 cfs_rq->runnable_load_avg =
3021                         div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
3022         }
3023         sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib);
3024
3025         return 1;
3026 }
3027
3028 static int
3029 __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
3030 {
3031         return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL);
3032 }
3033
3034 static int
3035 __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
3036 {
3037         return ___update_load_avg(now, cpu, &se->avg,
3038                                   se->on_rq * scale_load_down(se->load.weight),
3039                                   cfs_rq->curr == se, NULL);
3040 }
3041
3042 static int
3043 __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
3044 {
3045         return ___update_load_avg(now, cpu, &cfs_rq->avg,
3046                         scale_load_down(cfs_rq->load.weight),
3047                         cfs_rq->curr != NULL, cfs_rq);
3048 }
3049
/*
 * add_positive - signed add and clamp on underflow.
 * @_ptr: pointer to the value to update
 * @_val: signed amount to add; may be negative
 *
 * Stores *@_ptr + @_val back into *@_ptr. If @_val is negative and the sum
 * comes out larger than the old value (i.e. it wrapped around), 0 is stored
 * instead. The wrap test only works when *@_ptr has an unsigned type.
 *
 * Explicitly do a load-store to ensure the intermediate value never hits
 * memory. This allows lockless observations without ever seeing the negative
 * values.
 *
 * The macro declares locals named ptr, val, res and var, so arguments must
 * not be expressions that use those names.
 */
#define add_positive(_ptr, _val) do {                           \
        typeof(_ptr) ptr = (_ptr);                              \
        typeof(_val) val = (_val);                              \
        typeof(*ptr) res, var = READ_ONCE(*ptr);                \
                                                                \
        res = var + val;                                        \
                                                                \
        if (val < 0 && res > var)                               \
                res = 0;                                        \
                                                                \
        WRITE_ONCE(*ptr, res);                                  \
} while (0)
3069
3070 #ifdef CONFIG_FAIR_GROUP_SCHED
3071 /**
3072  * update_tg_load_avg - update the tg's load avg
3073  * @cfs_rq: the cfs_rq whose avg changed
3074  * @force: update regardless of how small the difference
3075  *
3076  * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
3077  * However, because tg->load_avg is a global value there are performance
3078  * considerations.
3079  *
3080  * In order to avoid having to look at the other cfs_rq's, we use a
3081  * differential update where we store the last value we propagated. This in
3082  * turn allows skipping updates if the differential is 'small'.
3083  *
3084  * Updating tg's load_avg is necessary before update_cfs_share().
3085  */
3086 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
3087 {
3088         long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
3089
3090         /*
3091          * No need to update load_avg for root_task_group as it is not used.
3092          */
3093         if (cfs_rq->tg == &root_task_group)
3094                 return;
3095
3096         if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
3097                 atomic_long_add(delta, &cfs_rq->tg->load_avg);
3098                 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
3099         }
3100 }
3101
3102 /*
3103  * Called within set_task_rq() right before setting a task's cpu. The
3104  * caller only guarantees p->pi_lock is held; no other assumptions,
3105  * including the state of rq->lock, should be made.
3106  */
3107 void set_task_rq_fair(struct sched_entity *se,
3108                       struct cfs_rq *prev, struct cfs_rq *next)
3109 {
3110         u64 p_last_update_time;
3111         u64 n_last_update_time;
3112
3113         if (!sched_feat(ATTACH_AGE_LOAD))
3114                 return;
3115
3116         /*
3117          * We are supposed to update the task to "current" time, then its up to
3118          * date and ready to go to new CPU/cfs_rq. But we have difficulty in
3119          * getting what current time is, so simply throw away the out-of-date
3120          * time. This will result in the wakee task is less decayed, but giving
3121          * the wakee more load sounds not bad.
3122          */
3123         if (!(se->avg.last_update_time && prev))
3124                 return;
3125
3126 #ifndef CONFIG_64BIT
3127         {
3128                 u64 p_last_update_time_copy;
3129                 u64 n_last_update_time_copy;
3130
3131                 do {
3132                         p_last_update_time_copy = prev->load_last_update_time_copy;
3133                         n_last_update_time_copy = next->load_last_update_time_copy;
3134
3135                         smp_rmb();
3136
3137                         p_last_update_time = prev->avg.last_update_time;
3138                         n_last_update_time = next->avg.last_update_time;
3139
3140                 } while (p_last_update_time != p_last_update_time_copy ||
3141                          n_last_update_time != n_last_update_time_copy);
3142         }
3143 #else
3144         p_last_update_time = prev->avg.last_update_time;
3145         n_last_update_time = next->avg.last_update_time;
3146 #endif
3147         __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se);
3148         se->avg.last_update_time = n_last_update_time;
3149 }
3150
3151 /* Take into account change of utilization of a child task group */
3152 static inline void
3153 update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
3154 {
3155         struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3156         long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
3157
3158         /* Nothing to update */
3159         if (!delta)
3160                 return;
3161
3162         /* Set new sched_entity's utilization */
3163         se->avg.util_avg = gcfs_rq->avg.util_avg;
3164         se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
3165
3166         /* Update parent cfs_rq utilization */
3167         add_positive(&cfs_rq->avg.util_avg, delta);
3168         cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
3169 }
3170
3171 /* Take into account change of load of a child task group */
3172 static inline void
3173 update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
3174 {
3175         struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3176         long delta, load = gcfs_rq->avg.load_avg;
3177
3178         /*
3179          * If the load of group cfs_rq is null, the load of the
3180          * sched_entity will also be null so we can skip the formula
3181          */
3182         if (load) {
3183                 long tg_load;
3184
3185                 /* Get tg's load and ensure tg_load > 0 */
3186                 tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
3187
3188                 /* Ensure tg_load >= load and updated with current load*/
3189                 tg_load -= gcfs_rq->tg_load_avg_contrib;
3190                 tg_load += load;
3191
3192                 /*
3193                  * We need to compute a correction term in the case that the
3194                  * task group is consuming more CPU than a task of equal
3195                  * weight. A task with a weight equals to tg->shares will have
3196                  * a load less or equal to scale_load_down(tg->shares).
3197                  * Similarly, the sched_entities that represent the task group
3198                  * at parent level, can't have a load higher than
3199                  * scale_load_down(tg->shares). And the Sum of sched_entities'
3200                  * load must be <= scale_load_down(tg->shares).
3201                  */
3202                 if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
3203                         /* scale gcfs_rq's load into tg's shares*/
3204                         load *= scale_load_down(gcfs_rq->tg->shares);
3205                         load /= tg_load;
3206                 }
3207         }
3208
3209         delta = load - se->avg.load_avg;
3210
3211         /* Nothing to update */
3212         if (!delta)
3213                 return;
3214
3215         /* Set new sched_entity's load */
3216         se->avg.load_avg = load;
3217         se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
3218
3219         /* Update parent cfs_rq load */
3220         add_positive(&cfs_rq->avg.load_avg, delta);
3221         cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
3222
3223         /*
3224          * If the sched_entity is already enqueued, we also have to update the
3225          * runnable load avg.
3226          */
3227         if (se->on_rq) {
3228                 /* Update parent cfs_rq runnable_load_avg */
3229                 add_positive(&cfs_rq->runnable_load_avg, delta);
3230                 cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
3231         }
3232 }
3233
/*
 * Mark @cfs_rq as having a change that still needs to be propagated. The
 * flag is consumed by test_and_clear_tg_cfs_propagate().
 */
static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
{
        cfs_rq->propagate_avg = 1;
}
3238
3239 static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
3240 {
3241         struct cfs_rq *cfs_rq = group_cfs_rq(se);
3242
3243         if (!cfs_rq->propagate_avg)
3244                 return 0;
3245
3246         cfs_rq->propagate_avg = 0;
3247         return 1;
3248 }
3249
/*
 * Propagate a pending load/utilization change of the group cfs_rq owned by
 * @se into @se itself and into the cfs_rq @se is queued on. Returns 1 if
 * something was propagated; 0 for task entities or when nothing is pending.
 */
static inline int propagate_entity_load_avg(struct sched_entity *se)
{
        struct cfs_rq *parent_rq;

        if (entity_is_task(se) || !test_and_clear_tg_cfs_propagate(se))
                return 0;

        parent_rq = cfs_rq_of(se);

        /* The parent changes as well, so flag it for the next level up. */
        set_tg_cfs_propagate(parent_rq);

        update_tg_cfs_util(parent_rq, se);
        update_tg_cfs_load(parent_rq, se);

        return 1;
}
3270
3271 /*
3272  * Check if we need to update the load and the utilization of a blocked
3273  * group_entity:
3274  */
3275 static inline bool skip_blocked_update(struct sched_entity *se)
3276 {
3277         struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3278
3279         /*
3280          * If sched_entity still have not zero load or utilization, we have to
3281          * decay it:
3282          */
3283         if (se->avg.load_avg || se->avg.util_avg)
3284                 return false;
3285
3286         /*
3287          * If there is a pending propagation, we have to update the load and
3288          * the utilization of the sched_entity:
3289          */
3290         if (gcfs_rq->propagate_avg)
3291                 return false;
3292
3293         /*
3294          * Otherwise, the load and the utilization of the sched_entity is
3295          * already zero and there is no pending propagation, so it will be a
3296          * waste of time to try to decay it:
3297          */
3298         return true;
3299 }
3300
3301 #else /* CONFIG_FAIR_GROUP_SCHED */
3302
/*
 * Without CONFIG_FAIR_GROUP_SCHED the task-group bookkeeping helpers
 * below do nothing.
 */
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
}

/* Nothing is ever propagated in this configuration: always returns 0. */
static inline int propagate_entity_load_avg(struct sched_entity *se)
{
	return 0;
}

static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
{
}
3311
3312 #endif /* CONFIG_FAIR_GROUP_SCHED */
3313
/*
 * sub_positive - subtract @_val from *@_ptr, clamping the result at zero.
 *
 * The value is loaded once, the difference is computed in a local, and the
 * final (already clamped) result is stored once, so the wrapped
 * intermediate value never hits memory. Lockless observers therefore
 * never get to see a "negative" value.
 */
#define sub_positive(_ptr, _val) do {					\
	typeof(_ptr) __sp_ptr = (_ptr);					\
	typeof(*__sp_ptr) __sp_sub = (_val);				\
	typeof(*__sp_ptr) __sp_new, __sp_old = READ_ONCE(*__sp_ptr);	\
									\
	__sp_new = __sp_old - __sp_sub;					\
	/* a result above the starting value means we wrapped */	\
	if (__sp_new > __sp_old)					\
		__sp_new = 0;						\
	WRITE_ONCE(*__sp_ptr, __sp_new);				\
} while (0)
3330
3331 /**
3332  * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
3333  * @now: current time, as per cfs_rq_clock_task()
3334  * @cfs_rq: cfs_rq to update
3335  *
3336  * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
3337  * avg. The immediate corollary is that all (fair) tasks must be attached, see
3338  * post_init_entity_util_avg().
3339  *
3340  * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
3341  *
3342  * Returns true if the load decayed or we removed load.
3343  *
3344  * Since both these conditions indicate a changed cfs_rq->avg.load we should
3345  * call update_tg_load_avg() when this function returns true.
3346  */
3347 static inline int
3348 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3349 {
3350         struct sched_avg *sa = &cfs_rq->avg;
3351         int decayed, removed_load = 0, removed_util = 0;
3352
3353         if (atomic_long_read(&cfs_rq->removed_load_avg)) {
3354                 s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
3355                 sub_positive(&sa->load_avg, r);
3356                 sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
3357                 removed_load = 1;
3358                 set_tg_cfs_propagate(cfs_rq);
3359         }
3360
3361         if (atomic_long_read(&cfs_rq->removed_util_avg)) {
3362                 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
3363                 sub_positive(&sa->util_avg, r);
3364                 sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
3365                 removed_util = 1;
3366                 set_tg_cfs_propagate(cfs_rq);
3367         }
3368
3369         decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
3370
3371 #ifndef CONFIG_64BIT
3372         smp_wmb();
3373         cfs_rq->load_last_update_time_copy = sa->last_update_time;
3374 #endif
3375
3376         if (decayed || removed_util)
3377                 cfs_rq_util_change(cfs_rq);
3378
3379         return decayed || removed_load;
3380 }
3381
3382 /*
3383  * Optional action to be done while updating the load average
3384  */
3385 #define UPDATE_TG       0x1
3386 #define SKIP_AGE_LOAD   0x2
3387
3388 /* Update task and its cfs_rq load average */
3389 static inline void update_load_avg(struct sched_entity *se, int flags)
3390 {
3391         struct cfs_rq *cfs_rq = cfs_rq_of(se);
3392         u64 now = cfs_rq_clock_task(cfs_rq);
3393         struct rq *rq = rq_of(cfs_rq);
3394         int cpu = cpu_of(rq);
3395         int decayed;
3396
3397         /*
3398          * Track task load average for carrying it to new CPU after migrated, and
3399          * track group sched_entity load average for task_h_load calc in migration
3400          */
3401         if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
3402                 __update_load_avg_se(now, cpu, cfs_rq, se);
3403
3404         decayed  = update_cfs_rq_load_avg(now, cfs_rq);
3405         decayed |= propagate_entity_load_avg(se);
3406
3407         if (decayed && (flags & UPDATE_TG))
3408                 update_tg_load_avg(cfs_rq, 0);
3409 }
3410
3411 /**
3412  * attach_entity_load_avg - attach this entity to its cfs_rq load avg
3413  * @cfs_rq: cfs_rq to attach to
3414  * @se: sched_entity to attach
3415  *
3416  * Must call update_cfs_rq_load_avg() before this, since we rely on
3417  * cfs_rq->avg.last_update_time being current.
3418  */
3419 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3420 {
3421         se->avg.last_update_time = cfs_rq->avg.last_update_time;
3422         cfs_rq->avg.load_avg += se->avg.load_avg;
3423         cfs_rq->avg.load_sum += se->avg.load_sum;
3424         cfs_rq->avg.util_avg += se->avg.util_avg;
3425         cfs_rq->avg.util_sum += se->avg.util_sum;
3426         set_tg_cfs_propagate(cfs_rq);
3427
3428         cfs_rq_util_change(cfs_rq);
3429 }
3430
3431 /**
3432  * detach_entity_load_avg - detach this entity from its cfs_rq load avg
3433  * @cfs_rq: cfs_rq to detach from
3434  * @se: sched_entity to detach
3435  *
3436  * Must call update_cfs_rq_load_avg() before this, since we rely on
3437  * cfs_rq->avg.last_update_time being current.
3438  */
3439 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3440 {
3441
3442         sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
3443         sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
3444         sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3445         sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3446         set_tg_cfs_propagate(cfs_rq);
3447
3448         cfs_rq_util_change(cfs_rq);
3449 }
3450
3451 /* Add the load generated by se into cfs_rq's load average */
3452 static inline void
3453 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3454 {
3455         struct sched_avg *sa = &se->avg;
3456
3457         cfs_rq->runnable_load_avg += sa->load_avg;
3458         cfs_rq->runnable_load_sum += sa->load_sum;
3459
3460         if (!sa->last_update_time) {
3461                 attach_entity_load_avg(cfs_rq, se);
3462                 update_tg_load_avg(cfs_rq, 0);
3463         }
3464 }
3465
3466 /* Remove the runnable load generated by se from cfs_rq's runnable load average */
3467 static inline void
3468 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3469 {
3470         cfs_rq->runnable_load_avg =
3471                 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
3472         cfs_rq->runnable_load_sum =
3473                 max_t(s64,  cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
3474 }
3475
3476 #ifndef CONFIG_64BIT
3477 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3478 {
3479         u64 last_update_time_copy;
3480         u64 last_update_time;
3481
3482         do {
3483                 last_update_time_copy = cfs_rq->load_last_update_time_copy;
3484                 smp_rmb();
3485                 last_update_time = cfs_rq->avg.last_update_time;
3486         } while (last_update_time != last_update_time_copy);
3487
3488         return last_update_time;
3489 }
3490 #else
3491 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3492 {
3493         return cfs_rq->avg.last_update_time;
3494 }
3495 #endif
3496
3497 /*
3498  * Synchronize entity load avg of dequeued entity without locking
3499  * the previous rq.
3500  */
3501 void sync_entity_load_avg(struct sched_entity *se)
3502 {
3503         struct cfs_rq *cfs_rq = cfs_rq_of(se);
3504         u64 last_update_time;
3505
3506         last_update_time = cfs_rq_last_update_time(cfs_rq);
3507         __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se);
3508 }
3509
3510 /*
3511  * Task first catches up with cfs_rq, and then subtract
3512  * itself from the cfs_rq (task must be off the queue now).
3513  */
3514 void remove_entity_load_avg(struct sched_entity *se)
3515 {
3516         struct cfs_rq *cfs_rq = cfs_rq_of(se);
3517
3518         /*
3519          * tasks cannot exit without having gone through wake_up_new_task() ->
3520          * post_init_entity_util_avg() which will have added things to the
3521          * cfs_rq, so we can remove unconditionally.
3522          *
3523          * Similarly for groups, they will have passed through
3524          * post_init_entity_util_avg() before unregister_sched_fair_group()
3525          * calls this.
3526          */
3527
3528         sync_entity_load_avg(se);
3529         atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
3530         atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
3531 }
3532
3533 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
3534 {
3535         return cfs_rq->runnable_load_avg;
3536 }
3537
3538 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3539 {
3540         return cfs_rq->avg.load_avg;
3541 }
3542
3543 static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
3544
3545 #else /* CONFIG_SMP */
3546
3547 static inline int
3548 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3549 {
3550         return 0;
3551 }
3552
/* !CONFIG_SMP: the update flags are defined as zero. */
#define UPDATE_TG	0x0
#define SKIP_AGE_LOAD	0x0

/*
 * !CONFIG_SMP variant: only notifies cfs_rq_util_change() for the cfs_rq
 * of @se; the flags argument is ignored.
 */
static inline void update_load_avg(struct sched_entity *se, int not_used1)
{
	struct cfs_rq *cfs_rq = cfs_rq_of(se);

	cfs_rq_util_change(cfs_rq);
}
3560
/* !CONFIG_SMP: the load-avg helpers below are empty. */
static inline void
enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
}

static inline void
dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
}

static inline void remove_entity_load_avg(struct sched_entity *se)
{
}

static inline void
attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
}

static inline void
detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
}
3571
/* !CONFIG_SMP: idle balancing is a no-op that always returns 0. */
static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
{
	return 0;
}
3576
3577 #endif /* CONFIG_SMP */
3578
/*
 * Debug-only statistic: bump cfs_rq->nr_spread_over when the vruntime of
 * @se is further than three times sysctl_sched_latency away from the
 * min_vruntime of @cfs_rq (in either direction). Compiles to nothing
 * without CONFIG_SCHED_DEBUG.
 */
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	s64 spread = se->vruntime - cfs_rq->min_vruntime;

	/* only the distance matters, not the direction */
	if (spread < 0)
		spread = -spread;

	if (spread > 3*sysctl_sched_latency)
		schedstat_inc(cfs_rq->nr_spread_over);
#endif
}
3591
3592 static void
3593 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
3594 {
3595         u64 vruntime = cfs_rq->min_vruntime;
3596
3597         /*
3598          * The 'current' period is already promised to the current tasks,
3599          * however the extra weight of the new task will slow them down a
3600          * little, place the new task so that it fits in the slot that
3601          * stays open at the end.
3602          */
3603         if (initial && sched_feat(START_DEBIT))
3604                 vruntime += sched_vslice(cfs_rq, se);
3605
3606         /* sleeps up to a single latency don't count. */
3607         if (!initial) {
3608                 unsigned long thresh = sysctl_sched_latency;
3609
3610                 /*
3611                  * Halve their sleep time's effect, to allow
3612                  * for a gentler effect of sleepers:
3613                  */
3614                 if (sched_feat(GENTLE_FAIR_SLEEPERS))
3615                         thresh >>= 1;
3616
3617                 vruntime -= thresh;
3618         }
3619
3620         /* ensure we never gain time by being placed backwards. */
3621         se->vruntime = max_vruntime(se->vruntime, vruntime);
3622 }
3623
3624 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
3625
/*
 * Print a one-time (deferred) notice when one of the sched_stat_*
 * tracepoints is enabled while schedstats are disabled. Does nothing
 * without CONFIG_SCHEDSTATS.
 */
static inline void check_schedstat_required(void)
{
#ifdef CONFIG_SCHEDSTATS
	bool tracing;

	if (schedstat_enabled())
		return;

	/* Is any tracepoint that depends on schedstats active? */
	tracing = trace_sched_stat_wait_enabled()    ||
		  trace_sched_stat_sleep_enabled()   ||
		  trace_sched_stat_iowait_enabled()  ||
		  trace_sched_stat_blocked_enabled() ||
		  trace_sched_stat_runtime_enabled();

	if (tracing) {
		printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
			     "stat_blocked and stat_runtime require the "
			     "kernel parameter schedstats=enable or "
			     "kernel.sched_schedstats=1\n");
	}
#endif
}
3645
3646
3647 /*
3648  * MIGRATION
3649  *
3650  *      dequeue
3651  *        update_curr()
3652  *          update_min_vruntime()
3653  *        vruntime -= min_vruntime
3654  *
3655  *      enqueue
3656  *        update_curr()
3657  *          update_min_vruntime()
3658  *        vruntime += min_vruntime
3659  *
3660  * this way the vruntime transition between RQs is done when both
3661  * min_vruntime are up-to-date.
3662  *
3663  * WAKEUP (remote)
3664  *
3665  *      ->migrate_task_rq_fair() (p->state == TASK_WAKING)
3666  *        vruntime -= min_vruntime
3667  *
3668  *      enqueue
3669  *        update_curr()
3670  *          update_min_vruntime()
3671  *        vruntime += min_vruntime
3672  *
3673  * this way we don't have the most up-to-date min_vruntime on the originating
3674  * CPU and an up-to-date min_vruntime on the destination CPU.
3675  */
3676
3677 static void
3678 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3679 {
3680         bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
3681         bool curr = cfs_rq->curr == se;
3682
3683         /*
3684          * If we're the current task, we must renormalise before calling
3685          * update_curr().
3686          */
3687         if (renorm && curr)
3688                 se->vruntime += cfs_rq->min_vruntime;
3689
3690         update_curr(cfs_rq);
3691
3692         /*
3693          * Otherwise, renormalise after, such that we're placed at the current
3694          * moment in time, instead of some random moment in the past. Being
3695          * placed in the past could significantly boost this task to the
3696          * fairness detriment of existing tasks.
3697          */
3698         if (renorm && !curr)
3699                 se->vruntime += cfs_rq->min_vruntime;
3700
3701         /*
3702          * When enqueuing a sched_entity, we must:
3703          *   - Update loads to have both entity and cfs_rq synced with now.
3704          *   - Add its load to cfs_rq->runnable_avg
3705          *   - For group_entity, update its weight to reflect the new share of
3706          *     its group cfs_rq
3707          *   - Add its new weight to cfs_rq->load.weight
3708          */
3709         update_load_avg(se, UPDATE_TG);
3710         enqueue_entity_load_avg(cfs_rq, se);
3711         update_cfs_shares(se);
3712         account_entity_enqueue(cfs_rq, se);
3713
3714         if (flags & ENQUEUE_WAKEUP)
3715                 place_entity(cfs_rq, se, 0);
3716
3717         check_schedstat_required();
3718         update_stats_enqueue(cfs_rq, se, flags);
3719         check_spread(cfs_rq, se);
3720         if (!curr)
3721                 __enqueue_entity(cfs_rq, se);
3722         se->on_rq = 1;
3723
3724         if (cfs_rq->nr_running == 1) {
3725                 list_add_leaf_cfs_rq(cfs_rq);
3726                 check_enqueue_throttle(cfs_rq);
3727         }
3728 }
3729
3730 static void __clear_buddies_last(struct sched_entity *se)
3731 {
3732         for_each_sched_entity(se) {
3733                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3734                 if (cfs_rq->last != se)
3735                         break;
3736
3737                 cfs_rq->last = NULL;
3738         }
3739 }
3740
3741 static void __clear_buddies_next(struct sched_entity *se)
3742 {
3743         for_each_sched_entity(se) {
3744                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3745                 if (cfs_rq->next != se)
3746                         break;
3747
3748                 cfs_rq->next = NULL;
3749         }
3750 }
3751
3752 static void __clear_buddies_skip(struct sched_entity *se)
3753 {
3754         for_each_sched_entity(se) {
3755                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3756                 if (cfs_rq->skip != se)
3757                         break;
3758
3759                 cfs_rq->skip = NULL;
3760         }
3761 }
3762
3763 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
3764 {
3765         if (cfs_rq->last == se)
3766                 __clear_buddies_last(se);
3767
3768         if (cfs_rq->next == se)
3769                 __clear_buddies_next(se);
3770
3771         if (cfs_rq->skip == se)
3772                 __clear_buddies_skip(se);
3773 }
3774
3775 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3776
3777 static void
3778 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3779 {
3780         /*
3781          * Update run-time statistics of the 'current'.
3782          */
3783         update_curr(cfs_rq);
3784
3785         /*
3786          * When dequeuing a sched_entity, we must:
3787          *   - Update loads to have both entity and cfs_rq synced with now.
3788          *   - Substract its load from the cfs_rq->runnable_avg.
3789          *   - Substract its previous weight from cfs_rq->load.weight.
3790          *   - For group entity, update its weight to reflect the new share
3791          *     of its group cfs_rq.
3792          */
3793         update_load_avg(se, UPDATE_TG);
3794         dequeue_entity_load_avg(cfs_rq, se);
3795
3796         update_stats_dequeue(cfs_rq, se, flags);
3797
3798         clear_buddies(cfs_rq, se);
3799
3800         if (se != cfs_rq->curr)
3801                 __dequeue_entity(cfs_rq, se);
3802         se->on_rq = 0;
3803         account_entity_dequeue(cfs_rq, se);
3804
3805         /*
3806          * Normalize after update_curr(); which will also have moved
3807          * min_vruntime if @se is the one holding it back. But before doing
3808          * update_min_vruntime() again, which will discount @se's position and
3809          * can move min_vruntime forward still more.
3810          */
3811         if (!(flags & DEQUEUE_SLEEP))
3812                 se->vruntime -= cfs_rq->min_vruntime;
3813
3814         /* return excess runtime on last dequeue */
3815         return_cfs_rq_runtime(cfs_rq);
3816
3817         update_cfs_shares(se);
3818
3819         /*
3820          * Now advance min_vruntime if @se was the entity holding it back,
3821          * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
3822          * put back on, and if we advance min_vruntime, we'll be placed back
3823          * further than we started -- ie. we'll be penalized.
3824          */
3825         if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
3826                 update_min_vruntime(cfs_rq);
3827 }
3828
3829 /*
3830  * Preempt the current task with a newly woken task if needed:
3831  */
3832 static void
3833 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
3834 {
3835         unsigned long ideal_runtime, delta_exec;
3836         struct sched_entity *se;
3837         s64 delta;
3838
3839         ideal_runtime = sched_slice(cfs_rq, curr);
3840         delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
3841         if (delta_exec > ideal_runtime) {
3842                 resched_curr(rq_of(cfs_rq));
3843                 /*
3844                  * The current task ran long enough, ensure it doesn't get
3845                  * re-elected due to buddy favours.
3846                  */
3847                 clear_buddies(cfs_rq, curr);
3848                 return;
3849         }
3850
3851         /*
3852          * Ensure that a task that missed wakeup preemption by a
3853          * narrow margin doesn't have to wait for a full slice.
3854          * This also mitigates buddy induced latencies under load.
3855          */
3856         if (delta_exec < sysctl_sched_min_granularity)
3857                 return;
3858
3859         se = __pick_first_entity(cfs_rq);
3860         delta = curr->vruntime - se->vruntime;
3861
3862         if (delta < 0)
3863                 return;
3864
3865         if (delta > ideal_runtime)
3866                 resched_curr(rq_of(cfs_rq));
3867 }
3868
3869 static void
3870 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
3871 {
3872         /* 'current' is not kept within the tree. */
3873         if (se->on_rq) {
3874                 /*
3875                  * Any task has to be enqueued before it get to execute on
3876                  * a CPU. So account for the time it spent waiting on the
3877                  * runqueue.
3878                  */
3879                 update_stats_wait_end(cfs_rq, se);
3880                 __dequeue_entity(cfs_rq, se);
3881                 update_load_avg(se, UPDATE_TG);
3882         }
3883
3884         update_stats_curr_start(cfs_rq, se);
3885         cfs_rq->curr = se;
3886
3887         /*
3888          * Track our maximum slice length, if the CPU's load is at
3889          * least twice that of our own weight (i.e. dont track it
3890          * when there are only lesser-weight tasks around):
3891          */
3892         if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
3893                 schedstat_set(se->statistics.slice_max,
3894                         max((u64)schedstat_val(se->statistics.slice_max),
3895                             se->sum_exec_runtime - se->prev_sum_exec_runtime));
3896         }
3897
3898         se->prev_sum_exec_runtime = se->sum_exec_runtime;
3899 }
3900
3901 static int
3902 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
3903
3904 /*
3905  * Pick the next process, keeping these things in mind, in this order:
3906  * 1) keep things fair between processes/task groups
3907  * 2) pick the "next" process, since someone really wants that to run
3908  * 3) pick the "last" process, for cache locality
3909  * 4) do not run the "skip" process, if something else is available
3910  */
3911 static struct sched_entity *
3912 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
3913 {
3914         struct sched_entity *left = __pick_first_entity(cfs_rq);
3915         struct sched_entity *se;
3916
3917         /*
3918          * If curr is set we have to see if its left of the leftmost entity
3919          * still in the tree, provided there was anything in the tree at all.
3920          */
3921         if (!left || (curr && entity_before(curr, left)))
3922                 left = curr;
3923
3924         se = left; /* ideally we run the leftmost entity */
3925
3926         /*
3927          * Avoid running the skip buddy, if running something else can
3928          * be done without getting too unfair.
3929          */
3930         if (cfs_rq->skip == se) {
3931                 struct sched_entity *second;
3932
3933                 if (se == curr) {
3934                         second = __pick_first_entity(cfs_rq);
3935                 } else {
3936                         second = __pick_next_entity(se);
3937                         if (!second || (curr && entity_before(curr, second)))
3938                                 second = curr;
3939                 }
3940
3941                 if (second && wakeup_preempt_entity(second, left) < 1)
3942                         se = second;
3943         }
3944
3945         /*
3946          * Prefer last buddy, try to return the CPU to a preempted task.
3947          */
3948         if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
3949                 se = cfs_rq->last;
3950
3951         /*
3952          * Someone really wants this to run. If it's not unfair, run it.
3953          */
3954         if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
3955                 se = cfs_rq->next;
3956
3957         clear_buddies(cfs_rq, se);
3958
3959         return se;
3960 }
3961
3962 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3963
3964 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
3965 {
3966         /*
3967          * If still on the runqueue then deactivate_task()
3968          * was not called and update_curr() has to be done:
3969          */
3970         if (prev->on_rq)
3971                 update_curr(cfs_rq);
3972
3973         /* throttle cfs_rqs exceeding runtime */
3974         check_cfs_rq_runtime(cfs_rq);
3975
3976         check_spread(cfs_rq, prev);
3977
3978         if (prev->on_rq) {
3979                 update_stats_wait_start(cfs_rq, prev);
3980                 /* Put 'current' back into the tree. */
3981                 __enqueue_entity(cfs_rq, prev);
3982                 /* in !on_rq case, update occurred at dequeue */
3983                 update_load_avg(prev, 0);
3984         }
3985         cfs_rq->curr = NULL;
3986 }
3987
3988 static void
3989 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
3990 {
3991         /*
3992          * Update run-time statistics of the 'current'.
3993          */
3994         update_curr(cfs_rq);
3995
3996         /*
3997          * Ensure that runnable average is periodically updated.
3998          */
3999         update_load_avg(curr, UPDATE_TG);
4000         update_cfs_shares(curr);
4001
4002 #ifdef CONFIG_SCHED_HRTICK
4003         /*
4004          * queued ticks are scheduled to match the slice, so don't bother
4005          * validating it and just reschedule.
4006          */
4007         if (queued) {
4008                 resched_curr(rq_of(cfs_rq));
4009                 return;
4010         }
4011         /*
4012          * don't let the period tick interfere with the hrtick preemption
4013          */
4014         if (!sched_feat(DOUBLE_TICK) &&
4015                         hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
4016                 return;
4017 #endif
4018
4019         if (cfs_rq->nr_running > 1)
4020                 check_preempt_tick(cfs_rq, curr);
4021 }
4022
4023
4024 /**************************************************
4025  * CFS bandwidth control machinery
4026  */
4027
4028 #ifdef CONFIG_CFS_BANDWIDTH
4029
4030 #ifdef HAVE_JUMP_LABEL
4031 static struct static_key __cfs_bandwidth_used;
4032
4033 static inline bool cfs_bandwidth_used(void)
4034 {
4035         return static_key_false(&__cfs_bandwidth_used);
4036 }
4037
4038 void cfs_bandwidth_usage_inc(void)
4039 {
4040         static_key_slow_inc(&__cfs_bandwidth_used);
4041 }
4042
4043 void cfs_bandwidth_usage_dec(void)
4044 {
4045         static_key_slow_dec(&__cfs_bandwidth_used);
4046 }
4047 #else /* HAVE_JUMP_LABEL */
4048 static bool cfs_bandwidth_used(void)
4049 {
4050         return true;
4051 }
4052
4053 void cfs_bandwidth_usage_inc(void) {}
4054 void cfs_bandwidth_usage_dec(void) {}
4055 #endif /* HAVE_JUMP_LABEL */
4056
4057 /*
4058  * default period for cfs group bandwidth.
4059  * default: 0.1s, units: nanoseconds
4060  */
4061 static inline u64 default_cfs_period(void)
4062 {
4063         return 100000000ULL;
4064 }
4065
4066 static inline u64 sched_cfs_bandwidth_slice(void)
4067 {
4068         return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
4069 }
4070
4071 /*
4072  * Replenish runtime according to assigned quota and update expiration time.
4073  * We use sched_clock_cpu directly instead of rq->clock to avoid adding
4074  * additional synchronization around rq->lock.
4075  *
4076  * requires cfs_b->lock
4077  */
4078 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
4079 {
4080         u64 now;
4081
4082         if (cfs_b->quota == RUNTIME_INF)
4083                 return;
4084
4085         now = sched_clock_cpu(smp_processor_id());
4086         cfs_b->runtime = cfs_b->quota;
4087         cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
4088 }
4089
4090 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4091 {
4092         return &tg->cfs_bandwidth;
4093 }
4094
4095 /* rq->task_clock normalized against any time this cfs_rq has spent throttled */
4096 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4097 {
4098         if (unlikely(cfs_rq->throttle_count))
4099                 return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
4100
4101         return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
4102 }
4103
4104 /* returns 0 on failure to allocate runtime */
4105 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4106 {
4107         struct task_group *tg = cfs_rq->tg;
4108         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
4109         u64 amount = 0, min_amount, expires;
4110
4111         /* note: this is a positive sum as runtime_remaining <= 0 */
4112         min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
4113
4114         raw_spin_lock(&cfs_b->lock);
4115         if (cfs_b->quota == RUNTIME_INF)
4116                 amount = min_amount;
4117         else {
4118                 start_cfs_bandwidth(cfs_b);
4119
4120                 if (cfs_b->runtime > 0) {
4121                         amount = min(cfs_b->runtime, min_amount);
4122                         cfs_b->runtime -= amount;
4123                         cfs_b->idle = 0;
4124                 }
4125         }
4126         expires = cfs_b->runtime_expires;
4127         raw_spin_unlock(&cfs_b->lock);
4128
4129         cfs_rq->runtime_remaining += amount;
4130         /*
4131          * we may have advanced our local expiration to account for allowed
4132          * spread between our sched_clock and the one on which runtime was
4133          * issued.
4134          */
4135         if ((s64)(expires - cfs_rq->runtime_expires) > 0)
4136                 cfs_rq->runtime_expires = expires;
4137
4138         return cfs_rq->runtime_remaining > 0;
4139 }
4140
4141 /*
4142  * Note: This depends on the synchronization provided by sched_clock and the
4143  * fact that rq->clock snapshots this value.
4144  */
4145 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4146 {
4147         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4148
4149         /* if the deadline is ahead of our clock, nothing to do */
4150         if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
4151                 return;
4152
4153         if (cfs_rq->runtime_remaining < 0)
4154                 return;
4155
4156         /*
4157          * If the local deadline has passed we have to consider the
4158          * possibility that our sched_clock is 'fast' and the global deadline
4159          * has not truly expired.
4160          *
4161          * Fortunately we can check determine whether this the case by checking
4162          * whether the global deadline has advanced. It is valid to compare
4163          * cfs_b->runtime_expires without any locks since we only care about
4164          * exact equality, so a partial write will still work.
4165          */
4166
4167         if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
4168                 /* extend local deadline, drift is bounded above by 2 ticks */
4169                 cfs_rq->runtime_expires += TICK_NSEC;
4170         } else {
4171                 /* global deadline is ahead, expiration has passed */
4172                 cfs_rq->runtime_remaining = 0;
4173         }
4174 }
4175
4176 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4177 {
4178         /* dock delta_exec before expiring quota (as it could span periods) */
4179         cfs_rq->runtime_remaining -= delta_exec;
4180         expire_cfs_rq_runtime(cfs_rq);
4181
4182         if (likely(cfs_rq->runtime_remaining > 0))
4183                 return;
4184
4185         /*
4186          * if we're unable to extend our runtime we resched so that the active
4187          * hierarchy can be throttled
4188          */
4189         if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
4190                 resched_curr(rq_of(cfs_rq));
4191 }
4192
4193 static __always_inline
4194 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4195 {
4196         if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
4197                 return;
4198
4199         __account_cfs_rq_runtime(cfs_rq, delta_exec);
4200 }
4201
4202 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4203 {
4204         return cfs_bandwidth_used() && cfs_rq->throttled;
4205 }
4206
4207 /* check whether cfs_rq, or any parent, is throttled */
4208 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4209 {
4210         return cfs_bandwidth_used() && cfs_rq->throttle_count;
4211 }
4212
4213 /*
4214  * Ensure that neither of the group entities corresponding to src_cpu or
4215  * dest_cpu are members of a throttled hierarchy when performing group
4216  * load-balance operations.
4217  */
4218 static inline int throttled_lb_pair(struct task_group *tg,
4219                                     int src_cpu, int dest_cpu)
4220 {
4221         struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
4222
4223         src_cfs_rq = tg->cfs_rq[src_cpu];
4224         dest_cfs_rq = tg->cfs_rq[dest_cpu];
4225
4226         return throttled_hierarchy(src_cfs_rq) ||
4227                throttled_hierarchy(dest_cfs_rq);
4228 }
4229
4230 /* updated child weight may affect parent so we have to do this bottom up */
4231 static int tg_unthrottle_up(struct task_group *tg, void *data)
4232 {
4233         struct rq *rq = data;
4234         struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4235
4236         cfs_rq->throttle_count--;
4237         if (!cfs_rq->throttle_count) {
4238                 /* adjust cfs_rq_clock_task() */
4239                 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
4240                                              cfs_rq->throttled_clock_task;
4241         }
4242
4243         return 0;
4244 }
4245
4246 static int tg_throttle_down(struct task_group *tg, void *data)
4247 {
4248         struct rq *rq = data;
4249         struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4250
4251         /* group is entering throttled state, stop time */
4252         if (!cfs_rq->throttle_count)
4253                 cfs_rq->throttled_clock_task = rq_clock_task(rq);
4254         cfs_rq->throttle_count++;
4255
4256         return 0;
4257 }
4258
4259 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
4260 {
4261         struct rq *rq = rq_of(cfs_rq);
4262         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4263         struct sched_entity *se;
4264         long task_delta, dequeue = 1;
4265         bool empty;
4266
4267         se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
4268
4269         /* freeze hierarchy runnable averages while throttled */
4270         rcu_read_lock();
4271         walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
4272         rcu_read_unlock();
4273
4274         task_delta = cfs_rq->h_nr_running;
4275         for_each_sched_entity(se) {
4276                 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
4277                 /* throttled entity or throttle-on-deactivate */
4278                 if (!se->on_rq)
4279                         break;
4280
4281                 if (dequeue)
4282                         dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
4283                 qcfs_rq->h_nr_running -= task_delta;
4284
4285                 if (qcfs_rq->load.weight)
4286                         dequeue = 0;
4287         }
4288
4289         if (!se)
4290                 sub_nr_running(rq, task_delta);
4291
4292         cfs_rq->throttled = 1;
4293         cfs_rq->throttled_clock = rq_clock(rq);
4294         raw_spin_lock(&cfs_b->lock);
4295         empty = list_empty(&cfs_b->throttled_cfs_rq);
4296
4297         /*
4298          * Add to the _head_ of the list, so that an already-started
4299          * distribute_cfs_runtime will not see us
4300          */
4301         list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4302
4303         /*
4304          * If we're the first throttled task, make sure the bandwidth
4305          * timer is running.
4306          */
4307         if (empty)
4308                 start_cfs_bandwidth(cfs_b);
4309
4310         raw_spin_unlock(&cfs_b->lock);
4311 }
4312
4313 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
4314 {
4315         struct rq *rq = rq_of(cfs_rq);
4316         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4317         struct sched_entity *se;
4318         int enqueue = 1;
4319         long task_delta;
4320
4321         se = cfs_rq->tg->se[cpu_of(rq)];
4322
4323         cfs_rq->throttled = 0;
4324
4325         update_rq_clock(rq);
4326
4327         raw_spin_lock(&cfs_b->lock);
4328         cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
4329         list_del_rcu(&cfs_rq->throttled_list);
4330         raw_spin_unlock(&cfs_b->lock);
4331
4332         /* update hierarchical throttle state */
4333         walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
4334
4335         if (!cfs_rq->load.weight)
4336                 return;
4337
4338         task_delta = cfs_rq->h_nr_running;
4339         for_each_sched_entity(se) {
4340                 if (se->on_rq)
4341                         enqueue = 0;
4342
4343                 cfs_rq = cfs_rq_of(se);
4344                 if (enqueue)
4345                         enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
4346                 cfs_rq->h_nr_running += task_delta;
4347
4348                 if (cfs_rq_throttled(cfs_rq))
4349                         break;
4350         }
4351
4352         if (!se)
4353                 add_nr_running(rq, task_delta);
4354
4355         /* determine whether we need to wake up potentially idle cpu */
4356         if (rq->curr == rq->idle && rq->cfs.nr_running)
4357                 resched_curr(rq);
4358 }
4359
4360 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
4361                 u64 remaining, u64 expires)
4362 {
4363         struct cfs_rq *cfs_rq;
4364         u64 runtime;
4365         u64 starting_runtime = remaining;
4366
4367         rcu_read_lock();
4368         list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
4369                                 throttled_list) {
4370                 struct rq *rq = rq_of(cfs_rq);
4371                 struct rq_flags rf;
4372
4373                 rq_lock(rq, &rf);
4374                 if (!cfs_rq_throttled(cfs_rq))
4375                         goto next;
4376
4377                 runtime = -cfs_rq->runtime_remaining + 1;
4378                 if (runtime > remaining)
4379                         runtime = remaining;
4380                 remaining -= runtime;
4381
4382                 cfs_rq->runtime_remaining += runtime;
4383                 cfs_rq->runtime_expires = expires;
4384
4385                 /* we check whether we're throttled above */
4386                 if (cfs_rq->runtime_remaining > 0)
4387                         unthrottle_cfs_rq(cfs_rq);
4388
4389 next:
4390                 rq_unlock(rq, &rf);
4391
4392                 if (!remaining)
4393                         break;
4394         }
4395         rcu_read_unlock();
4396
4397         return starting_runtime - remaining;
4398 }
4399
4400 /*
4401  * Responsible for refilling a task_group's bandwidth and unthrottling its
4402  * cfs_rqs as appropriate. If there has been no activity within the last
4403  * period the timer is deactivated until scheduling resumes; cfs_b->idle is
4404  * used to track this state.
4405  */
4406 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
4407 {
4408         u64 runtime, runtime_expires;
4409         int throttled;
4410
4411         /* no need to continue the timer with no bandwidth constraint */
4412         if (cfs_b->quota == RUNTIME_INF)
4413                 goto out_deactivate;
4414
4415         throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4416         cfs_b->nr_periods += overrun;
4417
4418         /*
4419          * idle depends on !throttled (for the case of a large deficit), and if
4420          * we're going inactive then everything else can be deferred
4421          */
4422         if (cfs_b->idle && !throttled)
4423                 goto out_deactivate;
4424
4425         __refill_cfs_bandwidth_runtime(cfs_b);
4426
4427         if (!throttled) {
4428                 /* mark as potentially idle for the upcoming period */
4429                 cfs_b->idle = 1;
4430                 return 0;
4431         }
4432
4433         /* account preceding periods in which throttling occurred */
4434         cfs_b->nr_throttled += overrun;
4435
4436         runtime_expires = cfs_b->runtime_expires;
4437
4438         /*
4439          * This check is repeated as we are holding onto the new bandwidth while
4440          * we unthrottle. This can potentially race with an unthrottled group
4441          * trying to acquire new bandwidth from the global pool. This can result
4442          * in us over-using our runtime if it is all used during this loop, but
4443          * only by limited amounts in that extreme case.
4444          */
4445         while (throttled && cfs_b->runtime > 0) {
4446                 runtime = cfs_b->runtime;
4447                 raw_spin_unlock(&cfs_b->lock);
4448                 /* we can't nest cfs_b->lock while distributing bandwidth */
4449                 runtime = distribute_cfs_runtime(cfs_b, runtime,
4450                                                  runtime_expires);
4451                 raw_spin_lock(&cfs_b->lock);
4452
4453                 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4454
4455                 cfs_b->runtime -= min(runtime, cfs_b->runtime);
4456         }
4457
4458         /*
4459          * While we are ensured activity in the period following an
4460          * unthrottle, this also covers the case in which the new bandwidth is
4461          * insufficient to cover the existing bandwidth deficit.  (Forcing the
4462          * timer to remain active while there are any throttled entities.)
4463          */
4464         cfs_b->idle = 0;
4465
4466         return 0;
4467
4468 out_deactivate:
4469         return 1;
4470 }
4471
4472 /*
      * Runtime (in ns) a cfs_rq always keeps for itself: a cfs_rq won't donate
      * quota below this amount, __return_cfs_rq_runtime() only hands back what
      * it holds above it.
      */
4473 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
4474 /*
      * Minimum remaining period time (in ns) to redistribute slack quota; the
      * slack timer gives up when the next refresh is closer than this.
      */
4475 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
4476 /*
      * Delay (in ns) of the slack timer: how long we wait to gather additional
      * slack before distributing.
      */
4477 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
4478
4479 /*
4480  * Are we near the end of the current quota period?
4481  *
4482  * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
4483  * hrtimer base being cleared by hrtimer_start. In the case of
4484  * migrate_hrtimers, base is never cleared, so we are fine.
4485  */
4486 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
4487 {
4488         struct hrtimer *refresh_timer = &cfs_b->period_timer;
4489         u64 remaining;
4490
4491         /* if the call-back is running a quota refresh is already occurring */
4492         if (hrtimer_callback_running(refresh_timer))
4493                 return 1;
4494
4495         /* is a quota refresh about to occur? */
4496         remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
4497         if (remaining < min_expire)
4498                 return 1;
4499
4500         return 0;
4501 }
4502
4503 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
4504 {
4505         u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
4506
4507         /* if there's a quota refresh soon don't bother with slack */
4508         if (runtime_refresh_within(cfs_b, min_left))
4509                 return;
4510
4511         hrtimer_start(&cfs_b->slack_timer,
4512                         ns_to_ktime(cfs_bandwidth_slack_period),
4513                         HRTIMER_MODE_REL);
4514 }
4515
4516 /* we know any runtime found here is valid as update_curr() precedes return */
4517 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4518 {
4519         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4520         s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
4521
4522         if (slack_runtime <= 0)
4523                 return;
4524
4525         raw_spin_lock(&cfs_b->lock);
4526         if (cfs_b->quota != RUNTIME_INF &&
4527             cfs_rq->runtime_expires == cfs_b->runtime_expires) {
4528                 cfs_b->runtime += slack_runtime;
4529
4530                 /* we are under rq->lock, defer unthrottling using a timer */
4531                 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
4532                     !list_empty(&cfs_b->throttled_cfs_rq))
4533                         start_cfs_slack_bandwidth(cfs_b);
4534         }
4535         raw_spin_unlock(&cfs_b->lock);
4536
4537         /* even if it's not valid for return we don't want to try again */
4538         cfs_rq->runtime_remaining -= slack_runtime;
4539 }
4540
4541 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4542 {
4543         if (!cfs_bandwidth_used())
4544                 return;
4545
4546         if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
4547                 return;
4548
4549         __return_cfs_rq_runtime(cfs_rq);
4550 }
4551
4552 /*
4553  * This is done with a timer (instead of inline with bandwidth return) since
4554  * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
4555  */
4556 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
4557 {
4558         u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
4559         u64 expires;
4560
4561         /* confirm we're still not at a refresh boundary */
4562         raw_spin_lock(&cfs_b->lock);
4563         if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
4564                 raw_spin_unlock(&cfs_b->lock);
4565                 return;
4566         }
4567
4568         if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
4569                 runtime = cfs_b->runtime;
4570
4571         expires = cfs_b->runtime_expires;
4572         raw_spin_unlock(&cfs_b->lock);
4573
4574         if (!runtime)
4575                 return;
4576
4577         runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
4578
4579         raw_spin_lock(&cfs_b->lock);
4580         if (expires == cfs_b->runtime_expires)
4581                 cfs_b->runtime -= min(runtime, cfs_b->runtime);
4582         raw_spin_unlock(&cfs_b->lock);
4583 }
4584
4585 /*
4586  * When a group wakes up we want to make sure that its quota is not already
4587  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
4588  * runtime as update_curr() throttling can not not trigger until it's on-rq.
4589  */
4590 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
4591 {
4592         if (!cfs_bandwidth_used())
4593                 return;
4594
4595         /* an active group must be handled by the update_curr()->put() path */
4596         if (!cfs_rq->runtime_enabled || cfs_rq->curr)
4597                 return;
4598
4599         /* ensure the group is not already throttled */
4600         if (cfs_rq_throttled(cfs_rq))
4601                 return;
4602
4603         /* update runtime allocation */
4604         account_cfs_rq_runtime(cfs_rq, 0);
4605         if (cfs_rq->runtime_remaining <= 0)
4606                 throttle_cfs_rq(cfs_rq);
4607 }
4608
4609 static void sync_throttle(struct task_group *tg, int cpu)
4610 {
4611         struct cfs_rq *pcfs_rq, *cfs_rq;
4612
4613         if (!cfs_bandwidth_used())
4614                 return;
4615
4616         if (!tg->parent)
4617                 return;
4618
4619         cfs_rq = tg->cfs_rq[cpu];
4620         pcfs_rq = tg->parent->cfs_rq[cpu];
4621
4622         cfs_rq->throttle_count = pcfs_rq->throttle_count;
4623         cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
4624 }
4625
4626 /* conditionally throttle active cfs_rq's from put_prev_entity() */
4627 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4628 {
4629         if (!cfs_bandwidth_used())
4630                 return false;
4631
4632         if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
4633                 return false;
4634
4635         /*
4636          * it's possible for a throttled entity to be forced into a running
4637          * state (e.g. set_curr_task), in this case we're finished.
4638          */
4639         if (cfs_rq_throttled(cfs_rq))
4640                 return true;
4641
4642         throttle_cfs_rq(cfs_rq);
4643         return true;
4644 }
4645
4646 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
4647 {
4648         struct cfs_bandwidth *cfs_b =
4649                 container_of(timer, struct cfs_bandwidth, slack_timer);
4650
4651         do_sched_cfs_slack_timer(cfs_b);
4652
4653         return HRTIMER_NORESTART;
4654 }
4655
4656 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
4657 {
4658         struct cfs_bandwidth *cfs_b =
4659                 container_of(timer, struct cfs_bandwidth, period_timer);
4660         int overrun;
4661         int idle = 0;
4662
4663         raw_spin_lock(&cfs_b->lock);
4664         for (;;) {
4665                 overrun = hrtimer_forward_now(timer, cfs_b->period);
4666                 if (!overrun)
4667                         break;
4668
4669                 idle = do_sched_cfs_period_timer(cfs_b, overrun);
4670         }
4671         if (idle)
4672                 cfs_b->period_active = 0;
4673         raw_spin_unlock(&cfs_b->lock);
4674
4675         return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
4676 }
4677
4678 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4679 {
4680         raw_spin_lock_init(&cfs_b->lock);
4681         cfs_b->runtime = 0;
4682         cfs_b->quota = RUNTIME_INF;
4683         cfs_b->period = ns_to_ktime(default_cfs_period());
4684
4685         INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
4686         hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
4687         cfs_b->period_timer.function = sched_cfs_period_timer;
4688         hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4689         cfs_b->slack_timer.function = sched_cfs_slack_timer;
4690 }
4691
4692 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4693 {
4694         cfs_rq->runtime_enabled = 0;
4695         INIT_LIST_HEAD(&cfs_rq->throttled_list);
4696 }
4697
4698 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4699 {
4700         lockdep_assert_held(&cfs_b->lock);
4701
4702         if (!cfs_b->period_active) {
4703                 cfs_b->period_active = 1;
4704                 hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
4705                 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
4706         }
4707 }
4708
4709 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4710 {
4711         /* init_cfs_bandwidth() was not called */
4712         if (!cfs_b->throttled_cfs_rq.next)
4713                 return;
4714
4715         hrtimer_cancel(&cfs_b->period_timer);
4716         hrtimer_cancel(&cfs_b->slack_timer);
4717 }
4718
4719 /*
4720  * Both these cpu hotplug callbacks race against unregister_fair_sched_group()
4721  *
4722  * The race is harmless, since modifying bandwidth settings of unhooked group
4723  * bits doesn't do much.
4724  */
4725
4726 /* cpu online calback */
4727 static void __maybe_unused update_runtime_enabled(struct rq *rq)
4728 {
4729         struct task_group *tg;
4730
4731         lockdep_assert_held(&rq->lock);
4732
4733         rcu_read_lock();
4734         list_for_each_entry_rcu(tg, &task_groups, list) {
4735                 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
4736                 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4737
4738                 raw_spin_lock(&cfs_b->lock);
4739                 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
4740                 raw_spin_unlock(&cfs_b->lock);
4741         }
4742         rcu_read_unlock();
4743 }
4744
4745 /* cpu offline callback */
4746 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
4747 {
4748         struct task_group *tg;
4749
4750         lockdep_assert_held(&rq->lock);
4751
4752         rcu_read_lock();
4753         list_for_each_entry_rcu(tg, &task_groups, list) {
4754                 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4755
4756                 if (!cfs_rq->runtime_enabled)
4757                         continue;
4758
4759                 /*
4760                  * clock_task is not advancing so we just need to make sure
4761                  * there's some valid quota amount
4762                  */
4763                 cfs_rq->runtime_remaining = 1;
4764                 /*
4765                  * Offline rq is schedulable till cpu is completely disabled
4766                  * in take_cpu_down(), so we prevent new cfs throttling here.
4767                  */
4768                 cfs_rq->runtime_enabled = 0;
4769
4770                 if (cfs_rq_throttled(cfs_rq))
4771                         unthrottle_cfs_rq(cfs_rq);
4772         }
4773         rcu_read_unlock();
4774 }
4775
4776 #else /* CONFIG_CFS_BANDWIDTH */
4777 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4778 {
4779         return rq_clock_task(rq_of(cfs_rq));
4780 }
4781
/*
 * No-op stand-ins used when CONFIG_CFS_BANDWIDTH is not set: no runtime is
 * accounted or returned, and check_cfs_rq_runtime() always reports false.
 */
4782 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
4783 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
4784 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
4785 static inline void sync_throttle(struct task_group *tg, int cpu) {}
4786 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4787
/* Without CONFIG_CFS_BANDWIDTH a cfs_rq is never throttled. */
4788 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4789 {
4790         return 0;
4791 }
4792
     /* Without CONFIG_CFS_BANDWIDTH no hierarchy is ever throttled. */
4793 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4794 {
4795         return 0;
4796 }
4797
     /* Without CONFIG_CFS_BANDWIDTH neither side of a pair is ever throttled. */
4798 static inline int throttled_lb_pair(struct task_group *tg,
4799                                     int src_cpu, int dest_cpu)
4800 {
4801         return 0;
4802 }
4803
/* Without CONFIG_CFS_BANDWIDTH there is no bandwidth pool state to set up. */
4804 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
4805
4806 #ifdef CONFIG_FAIR_GROUP_SCHED
     /* empty stub, only provided when CONFIG_FAIR_GROUP_SCHED is set */
4807 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4808 #endif
4809
     /* Without CONFIG_CFS_BANDWIDTH a task group has no bandwidth pool. */
4810 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4811 {
4812         return NULL;
4813 }
     /* teardown and cpu hotplug hooks have nothing to do without bandwidth control */
4814 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
4815 static inline void update_runtime_enabled(struct rq *rq) {}
4816 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
4817
4818 #endif /* CONFIG_CFS_BANDWIDTH */
4819
4820 /**************************************************
4821  * CFS operations on tasks:
4822  */
4823
4824 #ifdef CONFIG_SCHED_HRTICK
4825 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
4826 {
4827         struct sched_entity *se = &p->se;
4828         struct cfs_rq *cfs_rq = cfs_rq_of(se);
4829
4830         SCHED_WARN_ON(task_rq(p) != rq);
4831
4832         if (rq->cfs.h_nr_running > 1) {
4833                 u64 slice = sched_slice(cfs_rq, se);
4834                 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
4835                 s64 delta = slice - ran;
4836
4837                 if (delta < 0) {
4838                         if (rq->curr == p)
4839                                 resched_curr(rq);
4840                         return;
4841                 }
4842                 hrtick_start(rq, delta);
4843         }
4844 }
4845
4846 /*
4847  * called from enqueue/dequeue and updates the hrtick when the
4848  * current task is from our class and nr_running is low enough
4849  * to matter.
4850  */
4851 static void hrtick_update(struct rq *rq)
4852 {
4853         struct task_struct *curr = rq->curr;
4854
4855         if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
4856                 return;
4857
4858         if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
4859                 hrtick_start_fair(rq, curr);
4860 }
4861 #else /* !CONFIG_SCHED_HRTICK */
/* Without CONFIG_SCHED_HRTICK there is no hrtick to program. */
4862 static inline void
4863 hrtick_start_fair(struct rq *rq, struct task_struct *p)
4864 {
4865 }
4866
     /* Without CONFIG_SCHED_HRTICK there is no hrtick to update. */
4867 static inline void hrtick_update(struct rq *rq)
4868 {
4869 }
4870 #endif
4871
4872 /*
4873  * The enqueue_task method is called before nr_running is
4874  * increased. Here we update the fair scheduling stats and
4875  * then put the task into the rbtree:
      *
      * @rq:    runqueue the task is added to
      * @p:     task being enqueued
      * @flags: flags handed to enqueue_entity() for the task's own entity;
      *         every entity enqueued above it gets ENQUEUE_WAKEUP instead
4876  */
4877 static void
4878 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4879 {
4880         struct cfs_rq *cfs_rq;
4881         struct sched_entity *se = &p->se;
4882
4883         /*
4884          * If in_iowait is set, the code below may not trigger any cpufreq
4885          * utilization updates, so do it here explicitly with the IOWAIT flag
4886          * passed.
4887          */
4888         if (p->in_iowait)
4889                 cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
4890
             /*
              * First walk: starting at the task, enqueue each entity that is
              * not on a runqueue yet and count the task in h_nr_running.
              * Stops at the first entity that is already queued or at a
              * throttled cfs_rq.
              */
4891         for_each_sched_entity(se) {
4892                 if (se->on_rq)
4893                         break;
4894                 cfs_rq = cfs_rq_of(se);
4895                 enqueue_entity(cfs_rq, se, flags);
4896
4897                 /*
4898                  * end evaluation on encountering a throttled cfs_rq
4899                  *
4900                  * note: in the case of encountering a throttled cfs_rq we will
4901                  * post the final h_nr_running increment below.
4902                  */
4903                 if (cfs_rq_throttled(cfs_rq))
4904                         break;
4905                 cfs_rq->h_nr_running++;
4906
4907                 flags = ENQUEUE_WAKEUP;
4908         }
4909
             /*
              * Second walk: resumes at the entity where the first walk
              * stopped (se is not reset). Nothing is enqueued here; the task
              * is only counted and load averages/shares are refreshed, again
              * stopping at a throttled cfs_rq.
              */
4910         for_each_sched_entity(se) {
4911                 cfs_rq = cfs_rq_of(se);
4912                 cfs_rq->h_nr_running++;
4913
4914                 if (cfs_rq_throttled(cfs_rq))
4915                         break;
4916
4917                 update_load_avg(se, UPDATE_TG);
4918                 update_cfs_shares(se);
4919         }
4920
             /* se is NULL here only if no walk stopped at a throttled cfs_rq */
4921         if (!se)
4922                 add_nr_running(rq, 1);
4923
4924         hrtick_update(rq);
4925 }
4926
4927 static void set_next_buddy(struct sched_entity *se); /* forward declaration for dequeue_task_fair() */
4928
4929 /*
4930  * The dequeue_task method is called before nr_running is
4931  * decreased. We remove the task from the rbtree and
4932  * update the fair scheduling stats:
      *
      * @rq:    runqueue the task is removed from
      * @p:     task being dequeued
      * @flags: flags handed to dequeue_entity(); DEQUEUE_SLEEP is added for
      *         every entity dequeued above the task's own
4933  */
4934 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4935 {
4936         struct cfs_rq *cfs_rq;
4937         struct sched_entity *se = &p->se;
             /* remember the caller's intent, flags is modified in the loop */
4938         int task_sleep = flags & DEQUEUE_SLEEP;
4939
             /*
              * First walk: starting at the task, dequeue each entity and
              * uncount the task from h_nr_running. Stops at a throttled
              * cfs_rq or at the first cfs_rq that still has load left.
              */
4940         for_each_sched_entity(se) {
4941                 cfs_rq = cfs_rq_of(se);
4942                 dequeue_entity(cfs_rq, se, flags);
4943
4944                 /*
4945                  * end evaluation on encountering a throttled cfs_rq
4946                  *
4947                  * note: in the case of encountering a throttled cfs_rq we will
4948                  * post the final h_nr_running decrement below.
4949                  */
4950                 if (cfs_rq_throttled(cfs_rq))
4951                         break;
4952                 cfs_rq->h_nr_running--;
4953
4954                 /* Don't dequeue parent if it has other entities besides us */
4955                 if (cfs_rq->load.weight) {
4956                         /* Avoid re-evaluating load for this entity: */
4957                         se = parent_entity(se);
4958                         /*
4959                          * Bias pick_next to pick a task from this cfs_rq, as
4960                          * p is sleeping when it is within its sched_slice.
4961                          */
4962                         if (task_sleep && se && !throttled_hierarchy(cfs_rq))
4963                                 set_next_buddy(se);
4964                         break;
4965                 }
4966                 flags |= DEQUEUE_SLEEP;
4967         }
4968
             /*
              * Second walk: resumes at the entity where the first walk
              * stopped (se is not reset). Nothing is dequeued here; the task
              * is only uncounted and load averages/shares are refreshed,
              * again stopping at a throttled cfs_rq.
              */
4969         for_each_sched_entity(se) {
4970                 cfs_rq = cfs_rq_of(se);
4971                 cfs_rq->h_nr_running--;
4972
4973                 if (cfs_rq_throttled(cfs_rq))
4974                         break;
4975
4976                 update_load_avg(se, UPDATE_TG);
4977                 update_cfs_shares(se);
4978         }
4979
             /* se is NULL here only if no walk stopped at a throttled cfs_rq */
4980         if (!se)
4981                 sub_nr_running(rq, 1);
4982
4983         hrtick_update(rq);
4984 }
4985
4986 #ifdef CONFIG_SMP
4987
4988 /* Working cpumask for: load_balance, load_balance_newidle. */
4989 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
     /*
      * NOTE(review): presumably the per-CPU working cpumask of the idle-CPU
      * selection code; its users are outside this chunk - confirm.
      */
4990 DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
4991
4992 #ifdef CONFIG_NO_HZ_COMMON
4993 /*
4994  * per rq 'load' array crap; XXX kill this.
4995  */
4996
4997 /*
4998  * The exact cpuload calculated at every tick would be:
4999  *
5000  *   load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
5001  *
5002  * If a cpu misses updates for n ticks (as it was idle) and update gets
5003  * called on the n+1-th tick when cpu may be busy, then we have:
5004  *
5005  *   load_n   = (1 - 1/2^i)^n * load_0
5006  *   load_n+1 = (1 - 1/2^i)   * load_n + (1/2^i) * cur_load
5007  *
5008  * decay_load_missed() below does efficient calculation of
5009  *
5010  *   load' = (1 - 1/2^i)^n * load
5011  *
5012  * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
5013  * This allows us to precompute the above in said factors, thereby allowing the
5014  * reduction of an arbitrary n in O(log_2 n) steps. (See also
5015  * fixed_power_int())
5016  *
5017  * The calculation is approximated on a 128 point scale.
5018  */
5019 #define DEGRADE_SHIFT           7       /* factors below are scaled by 1 << 7 = 128 */
5020
     /*
      * degrade_zero_ticks[idx]: number of missed updates at or beyond which
      * decay_load_missed() treats the load at index idx as fully decayed and
      * returns 0.
      */
5021 static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
     /*
      * degrade_factor[idx][j]: multiplier on the 128 point scale that
      * decay_load_missed() applies when bit j of the missed-update count is
      * set, i.e. an approximation of (1 - 1/2^idx)^(2^j) * 128.
      */
5022 static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
5023         {   0,   0,  0,  0,  0,  0, 0, 0 },
5024         {  64,  32,  8,  0,  0,  0, 0, 0 },
5025         {  96,  72, 40, 12,  1,  0, 0, 0 },
5026         { 112,  98, 75, 43, 15,  1, 0, 0 },
5027         { 120, 112, 98, 76, 45, 16, 2, 0 }
5028 };
5029
5030 /*
5031  * Update cpu_load for any missed ticks, due to tickless idle. The backlog
5032  * would be when CPU is idle and so we just decay the old load without
5033  * adding any new load.
5034  */
5035 static unsigned long
5036 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
5037 {
5038         int j = 0;
5039
5040         if (!missed_updates)
5041                 return load;
5042
5043         if (missed_updates >= degrade_zero_ticks[idx])
5044                 return 0;
5045
5046         if (idx == 1)
5047                 return load >> missed_updates;
5048
5049         while (missed_updates) {
5050                 if (missed_updates % 2)
5051                         load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
5052
5053                 missed_updates >>= 1;
5054                 j++;
5055         }
5056         return load;
5057 }
5058 #endif /* CONFIG_NO_HZ_COMMON */
5059
5060 /**
5061  * cpu_load_update - update the rq->cpu_load[] statistics
5062  * @this_rq: The rq to update statistics for
5063  * @this_load: The current load
5064  * @pending_updates: The number of missed updates
5065  *
5066  * Update rq->cpu_load[] statistics. This function is usually called every
5067  * scheduler tick (TICK_NSEC).
5068  *
5069  * This function computes a decaying average:
5070  *
5071  *   load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
5072  *
5073  * Because of NOHZ it might not get called on every tick which gives need for
5074  * the @pending_updates argument.
5075  *
5076  *   load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
5077  *             = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
5078  *             = A * (A * load[i]_n-2 + B) + B
5079  *             = A * (A * (A * load[i]_n-3 + B) + B) + B
5080  *             = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
5081  *             = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
5082  *             = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
5083  *             = (1 - 1/2^i)^n * (load[i]_0 - load) + load
5084  *
5085  * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
5086  * any change in load would have resulted in the tick being turned back on.
5087  *
5088  * For regular NOHZ, this reduces to:
5089  *
5090  *   load[i]_n = (1 - 1/2^i)^n * load[i]_0
5091  *
5092  * see decay_load_missed(). For NOHZ_FULL we get to subtract and add the extra
5093  * term.
5094  */
5095 static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
5096                             unsigned long pending_updates)
5097 {
5098         unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
5099         int i, scale;
5100
5101         this_rq->nr_load_updates++;
5102
5103         /* Update our load: */
5104         this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
5105         for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
5106                 unsigned long old_load, new_load;
5107
5108                 /* scale is effectively 1 << i now, and >> i divides by scale */
5109
5110                 old_load = this_rq->cpu_load[i];
5111 #ifdef CONFIG_NO_HZ_COMMON
5112                 old_load = decay_load_missed(old_load, pending_updates - 1, i);
5113                 if (tickless_load) {
5114                         old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
5115                         /*
5116                          * old_load can never be a negative value because a
5117                          * decayed tickless_load cannot be greater than the
5118                          * original tickless_load.
5119                          */
5120                         old_load += tickless_load;
5121                 }
5122 #endif
5123                 new_load = this_load;
5124                 /*
5125                  * Round up the averaging division if load is increasing. This
5126                  * prevents us from getting stuck on 9 if the load is 10, for
5127                  * example.
5128                  */
5129                 if (new_load > old_load)
5130                         new_load += scale - 1;
5131
5132                 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
5133         }
5134
5135         sched_avg_update(this_rq);
5136 }
5137
5138 /* Used instead of source_load when we know the type == 0 */
5139 static unsigned long weighted_cpuload(struct rq *rq)
5140 {
5141         return cfs_rq_runnable_load_avg(&rq->cfs);
5142 }
5143
5144 #ifdef CONFIG_NO_HZ_COMMON
5145 /*
5146  * There is no sane way to deal with nohz on smp when using jiffies because the
5147  * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
5148  * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
5149  *
5150  * Therefore we need to avoid the delta approach from the regular tick when
5151  * possible since that would seriously skew the load calculation. This is why we
5152  * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
5153  * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
5154  * loop exit, nohz_idle_balance, nohz full exit...)
5155  *
5156  * This means we might still be one tick off for nohz periods.
5157  */
5158
5159 static void cpu_load_update_nohz(struct rq *this_rq,
5160                                  unsigned long curr_jiffies,
5161                                  unsigned long load)
5162 {
5163         unsigned long pending_updates;
5164
5165         pending_updates = curr_jiffies - this_rq->last_load_update_tick;
5166         if (pending_updates) {
5167                 this_rq->last_load_update_tick = curr_jiffies;
5168                 /*
5169                  * In the regular NOHZ case, we were idle, this means load 0.
5170                  * In the NOHZ_FULL case, we were non-idle, we should consider
5171                  * its weighted load.
5172                  */
5173                 cpu_load_update(this_rq, load, pending_updates);
5174         }
5175 }
5176
5177 /*
5178  * Called from nohz_idle_balance() to update the load ratings before doing the
5179  * idle balance.
5180  */
5181 static void cpu_load_update_idle(struct rq *this_rq)
5182 {
5183         /*
5184          * bail if there's load or we're actually up-to-date.
5185          */
5186         if (weighted_cpuload(this_rq))
5187                 return;
5188
5189         cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
5190 }
5191
5192 /*
5193  * Record CPU load on nohz entry so we know the tickless load to account
5194  * on nohz exit. cpu_load[0] happens then to be updated more frequently
5195  * than other cpu_load[idx] but it should be fine as cpu_load readers
5196  * shouldn't rely on synchronized cpu_load[*] updates.
5197  */
5198 void cpu_load_update_nohz_start(void)
5199 {
5200         struct rq *this_rq = this_rq();
5201
5202         /*
5203          * This is all lockless but should be fine. If weighted_cpuload changes
5204          * concurrently we'll exit nohz. And cpu_load write can race with
5205          * cpu_load_update_idle() but both updater would be writing the same.
5206          */
5207         this_rq->cpu_load[0] = weighted_cpuload(this_rq);
5208 }
5209
5210 /*
5211  * Account the tickless load in the end of a nohz frame.
5212  */
5213 void cpu_load_update_nohz_stop(void)
5214 {
5215         unsigned long curr_jiffies = READ_ONCE(jiffies);
5216         struct rq *this_rq = this_rq();
5217         unsigned long load;
5218         struct rq_flags rf;
5219
5220         if (curr_jiffies == this_rq->last_load_update_tick)
5221                 return;
5222
5223         load = weighted_cpuload(this_rq);
5224         rq_lock(this_rq, &rf);
5225         update_rq_clock(this_rq);
5226         cpu_load_update_nohz(this_rq, curr_jiffies, load);
5227         rq_unlock(this_rq, &rf);
5228 }
5229 #else /* !CONFIG_NO_HZ_COMMON */
/* Without CONFIG_NO_HZ_COMMON there are no missed ticks to account. */
static inline void
cpu_load_update_nohz(struct rq *this_rq, unsigned long curr_jiffies,
		     unsigned long load)
{
}
5233 #endif /* CONFIG_NO_HZ_COMMON */
5234
/* Regular-tick update: exactly one pending update is accounted. */
static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
{
#ifdef CONFIG_NO_HZ_COMMON
	/* Keep the nohz bookkeeping in sync; see cpu_load_update_nohz(). */
	unsigned long now = READ_ONCE(jiffies);

	this_rq->last_load_update_tick = now;
#endif
	cpu_load_update(this_rq, load, 1);
}
5243
5244 /*
5245  * Called from scheduler_tick()
5246  */
5247 void cpu_load_update_active(struct rq *this_rq)
5248 {
5249         unsigned long load = weighted_cpuload(this_rq);
5250
5251         if (tick_nohz_tick_stopped())
5252                 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
5253         else
5254                 cpu_load_update_periodic(this_rq, load);
5255 }
5256
5257 /*
5258  * Return a low guess at the load of a migration-source cpu weighted
5259  * according to the scheduling class and "nice" value.
5260  *
5261  * We want to under-estimate the load of migration sources, to
5262  * balance conservatively.
5263  */
5264 static unsigned long source_load(int cpu, int type)
5265 {
5266         struct rq *rq = cpu_rq(cpu);
5267         unsigned long total = weighted_cpuload(rq);
5268
5269         if (type == 0 || !sched_feat(LB_BIAS))
5270                 return total;
5271
5272         return min(rq->cpu_load[type-1], total);
5273 }
5274
5275 /*
5276  * Return a high guess at the load of a migration-target cpu weighted
5277  * according to the scheduling class and "nice" value.
5278  */
5279 static unsigned long target_load(int cpu, int type)
5280 {
5281         struct rq *rq = cpu_rq(cpu);
5282         unsigned long total = weighted_cpuload(rq);
5283
5284         if (type == 0 || !sched_feat(LB_BIAS))
5285                 return total;
5286
5287         return max(rq->cpu_load[type-1], total);
5288 }
5289
5290 static unsigned long capacity_of(int cpu)
5291 {
5292         return cpu_rq(cpu)->cpu_capacity;
5293 }
5294
5295 static unsigned long capacity_orig_of(int cpu)
5296 {
5297         return cpu_rq(cpu)->cpu_capacity_orig;
5298 }
5299
5300 static unsigned long cpu_avg_load_per_task(int cpu)
5301 {
5302         struct rq *rq = cpu_rq(cpu);
5303         unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
5304         unsigned long load_avg = weighted_cpuload(rq);
5305
5306         if (nr_running)
5307                 return load_avg / nr_running;
5308
5309         return 0;
5310 }
5311
5312 static void record_wakee(struct task_struct *p)
5313 {
5314         /*
5315          * Only decay a single time; tasks that have less then 1 wakeup per
5316          * jiffy will not have built up many flips.
5317          */
5318         if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
5319                 current->wakee_flips >>= 1;
5320                 current->wakee_flip_decay_ts = jiffies;
5321         }
5322
5323         if (current->last_wakee != p) {
5324                 current->last_wakee = p;
5325                 current->wakee_flips++;
5326         }
5327 }
5328
5329 /*
5330  * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
5331  *
5332  * A waker of many should wake a different task than the one last awakened
5333  * at a frequency roughly N times higher than one of its wakees.
5334  *
5335  * In order to determine whether we should let the load spread vs consolidating
5336  * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
5337  * partner, and a factor of llc_size higher frequency in the other.
5338  *
5339  * With both conditions met, we can be relatively sure that the relationship is
5340  * non-monogamous, with partner count exceeding socket size.
5341  *
5342  * Waker/wakee being client/server, worker/dispatcher, interrupt source or
5343  * whatever is irrelevant, spread criteria is apparent partner count exceeds
5344  * socket size.
5345  */
5346 static int wake_wide(struct task_struct *p)
5347 {
5348         unsigned int master = current->wakee_flips;
5349         unsigned int slave = p->wakee_flips;
5350         int factor = this_cpu_read(sd_llc_size);
5351
5352         if (master < slave)
5353                 swap(master, slave);
5354         if (slave < factor || master < slave * factor)
5355                 return 0;
5356         return 1;
5357 }
5358
/* Snapshot of one LLC domain's shared counters, filled by get_llc_stats(). */
struct llc_stats {
	/* copy of sched_domain_shared::nr_running */
	unsigned long	nr_running;
	/* copy of sched_domain_shared::load */
	unsigned long	load;
	/* copy of sched_domain_shared::capacity */
	unsigned long	capacity;
	/* non-zero when nr_running is below the LLC's sd_llc_size */
	int		has_capacity;
};
5365
5366 static bool get_llc_stats(struct llc_stats *stats, int cpu)
5367 {
5368         struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
5369
5370         if (!sds)
5371                 return false;
5372
5373         stats->nr_running       = READ_ONCE(sds->nr_running);
5374         stats->load             = READ_ONCE(sds->load);
5375         stats->capacity         = READ_ONCE(sds->capacity);
5376         stats->has_capacity     = stats->nr_running < per_cpu(sd_llc_size, cpu);
5377
5378         return true;
5379 }
5380
5381 /*
5382  * Can a task be moved from prev_cpu to this_cpu without causing a load
5383  * imbalance that would trigger the load balancer?
5384  *
5385  * Since we're running on 'stale' values, we might in fact create an imbalance
5386  * but recomputing these values is expensive, as that'd mean iteration 2 cache
5387  * domains worth of CPUs.
5388  */
5389 static bool
5390 wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
5391                 int this_cpu, int prev_cpu, int sync)
5392 {
5393         struct llc_stats prev_stats, this_stats;
5394         s64 this_eff_load, prev_eff_load;
5395         unsigned long task_load;
5396
5397         if (!get_llc_stats(&prev_stats, prev_cpu) ||
5398             !get_llc_stats(&this_stats, this_cpu))
5399                 return false;
5400
5401         /*
5402          * If sync wakeup then subtract the (maximum possible)
5403          * effect of the currently running task from the load
5404          * of the current LLC.
5405          */
5406         if (sync) {
5407                 unsigned long current_load = task_h_load(current);
5408
5409                 /* in this case load hits 0 and this LLC is considered 'idle' */
5410                 if (current_load > this_stats.load)
5411                         return true;
5412
5413                 this_stats.load -= current_load;
5414         }
5415
5416         /*
5417          * The has_capacity stuff is not SMT aware, but by trying to balance
5418          * the nr_running on both ends we try and fill the domain at equal
5419          * rates, thereby first consuming cores before siblings.
5420          */
5421
5422         /* if the old cache has capacity, stay there */
5423         if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
5424                 return false;
5425
5426         /* if this cache has capacity, come here */
5427         if (this_stats.has_capacity && this_stats.nr_running < prev_stats.nr_running+1)
5428                 return true;
5429
5430         /*
5431          * Check to see if we can move the load without causing too much
5432          * imbalance.
5433          */
5434         task_load = task_h_load(p);
5435
5436         this_eff_load = 100;
5437         this_eff_load *= prev_stats.capacity;
5438
5439         prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
5440         prev_eff_load *= this_stats.capacity;
5441
5442         this_eff_load *= this_stats.load + task_load;
5443         prev_eff_load *= prev_stats.load - task_load;
5444
5445         return this_eff_load <= prev_eff_load;
5446 }
5447
5448 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5449                        int prev_cpu, int sync)
5450 {
5451         int this_cpu = smp_processor_id();
5452         bool affine;
5453
5454         /*
5455          * Default to no affine wakeups; wake_affine() should not effect a task
5456          * placement the load-balancer feels inclined to undo. The conservative
5457          * option is therefore to not move tasks when they wake up.
5458          */
5459         affine = false;
5460
5461         /*
5462          * If the wakeup is across cache domains, try to evaluate if movement
5463          * makes sense, otherwise rely on select_idle_siblings() to do
5464          * placement inside the cache domain.
5465          */
5466         if (!cpus_share_cache(prev_cpu, this_cpu))
5467                 affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
5468
5469         schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
5470         if (affine) {
5471                 schedstat_inc(sd->ttwu_move_affine);
5472                 schedstat_inc(p->se.statistics.nr_wakeups_affine);
5473         }
5474
5475         return affine;
5476 }
5477
/* Forward declarations; both are defined further down in this file. */
static inline int task_util(struct task_struct *p);
static int cpu_util_wake(int cpu, struct task_struct *p);

/*
 * Spare capacity of @cpu for a wakeup of @p: the CPU's original capacity
 * minus the utilization reported by cpu_util_wake().
 */
static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
{
	unsigned long cap = capacity_orig_of(cpu);

	return cap - cpu_util_wake(cpu, p);
}
5485
5486 /*
5487  * find_idlest_group finds and returns the least busy CPU group within the
5488  * domain.
      *
      * @sd:       domain whose circular list of groups (sd->groups) is scanned
      * @p:        task being placed; groups with no CPU in p->cpus_allowed are
      *            skipped and task_util(p) drives the spare-capacity shortcut
      * @this_cpu: the group containing this CPU is treated as the local group
      * @sd_flag:  SD_BALANCE_WAKE selects sd->wake_idx instead of
      *            sd->forkexec_idx as load index; SD_BALANCE_FORK skips the
      *            spare-capacity shortcut
      *
      * Returns the chosen non-local group, or NULL when no remote group is
      * preferred over the local one.
5489  */
5490 static struct sched_group *
5491 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5492                   int this_cpu, int sd_flag)
5493 {
5494         struct sched_group *idlest = NULL, *group = sd->groups;
5495         struct sched_group *most_spare_sg = NULL;
5496         unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0;
5497         unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0;
5498         unsigned long most_spare = 0, this_spare = 0;
5499         int load_idx = sd->forkexec_idx;
             /*
              * Two tolerances derived from sd->imbalance_pct: a percentage
              * scale (half the configured excess over 100) used for the
              * avg_load and spare-capacity comparisons, and an absolute
              * margin used for the runnable_load comparisons.
              */
5500         int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
5501         unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
5502                                 (sd->imbalance_pct-100) / 100;
5503
5504         if (sd_flag & SD_BALANCE_WAKE)
5505                 load_idx = sd->wake_idx;
5506
5507         do {
5508                 unsigned long load, avg_load, runnable_load;
5509                 unsigned long spare_cap, max_spare_cap;
5510                 int local_group;
5511                 int i;
5512
5513                 /* Skip over this group if it has no CPUs allowed */
                     /*
                      * Note: 'continue' in a do-while jumps to the loop
                      * condition, so the group pointer still advances.
                      */
5514                 if (!cpumask_intersects(sched_group_span(group),
5515                                         &p->cpus_allowed))
5516                         continue;
5517
5518                 local_group = cpumask_test_cpu(this_cpu,
5519                                                sched_group_span(group));
5520
5521                 /*
5522                  * Tally up the load of all CPUs in the group and find
5523                  * the group containing the CPU with most spare capacity.
5524                  */
5525                 avg_load = 0;
5526                 runnable_load = 0;
5527                 max_spare_cap = 0;
5528
5529                 for_each_cpu(i, sched_group_span(group)) {
5530                         /* Bias balancing toward cpus of our domain */
5531                         if (local_group)
5532                                 load = source_load(i, load_idx);
5533                         else
5534                                 load = target_load(i, load_idx);
5535
5536                         runnable_load += load;
5537
5538                         avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
5539
5540                         spare_cap = capacity_spare_wake(i, p);
5541
5542                         if (spare_cap > max_spare_cap)
5543                                 max_spare_cap = spare_cap;
5544                 }
5545
5546                 /* Adjust by relative CPU capacity of the group */
5547                 avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
5548                                         group->sgc->capacity;
5549                 runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
5550                                         group->sgc->capacity;
5551
                     /*
                      * The local group's figures are only recorded; remote
                      * groups compete for 'idlest' and 'most_spare_sg'.
                      */
5552                 if (local_group) {
5553                         this_runnable_load = runnable_load;
5554                         this_avg_load = avg_load;
5555                         this_spare = max_spare_cap;
5556                 } else {
5557                         if (min_runnable_load > (runnable_load + imbalance)) {
5558                                 /*
5559                                  * The runnable load is significantly smaller
5560                                  * so we can pick this new cpu
5561                                  */
5562                                 min_runnable_load = runnable_load;
5563                                 min_avg_load = avg_load;
5564                                 idlest = group;
5565                         } else if ((runnable_load < (min_runnable_load + imbalance)) &&
5566                                    (100*min_avg_load > imbalance_scale*avg_load)) {
5567                                 /*
5568                                  * The runnable loads are close so take the
5569                                  * blocked load into account through avg_load.
5570                                  */
5571                                 min_avg_load = avg_load;
5572                                 idlest = group;
5573                         }
5574
5575                         if (most_spare < max_spare_cap) {
5576                                 most_spare = max_spare_cap;
5577                                 most_spare_sg = group;
5578                         }
5579                 }
5580         } while (group = group->next, group != sd->groups);
5581
5582         /*
5583          * The cross-over point between using spare capacity or least load
5584          * is too conservative for high utilization tasks on partially
5585          * utilized systems if we require spare_capacity > task_util(p),
5586          * so we allow for some task stuffing by using
5587          * spare_capacity > task_util(p)/2.
5588          *
5589          * Spare capacity can't be used for fork because the utilization has
5590          * not been set yet, we must first select a rq to compute the initial
5591          * utilization.
5592          */
5593         if (sd_flag & SD_BALANCE_FORK)
5594                 goto skip_spare;
5595
             /* Local group has enough spare capacity and is not clearly beaten. */
5596         if (this_spare > task_util(p) / 2 &&
5597             imbalance_scale*this_spare > 100*most_spare)
5598                 return NULL;
5599
             /* Otherwise take the remote group with the most spare capacity. */
5600         if (most_spare > task_util(p) / 2)
5601                 return most_spare_sg;
5602
5603 skip_spare:
             /* Fall back to load: no remote candidate was recorded at all. */
5604         if (!idlest)
5605                 return NULL;
5606
             /* The best remote group is significantly more loaded than local. */
5607         if (min_runnable_load > (this_runnable_load + imbalance))
5608                 return NULL;
5609
             /* Runnable loads are close and local avg_load is not worse. */
5610         if ((this_runnable_load < (min_runnable_load + imbalance)) &&
5611              (100*this_avg_load < imbalance_scale*min_avg_load))
5612                 return NULL;
5613
5614         return idlest;
5615 }
5616
5617 /*
5618  * find_idlest_cpu - find the idlest cpu among the cpus in group.
5619  */
5620 static int
5621 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
5622 {
5623         unsigned long load, min_load = ULONG_MAX;
5624         unsigned int min_exit_latency = UINT_MAX;
5625         u64 latest_idle_timestamp = 0;
5626         int least_loaded_cpu = this_cpu;
5627         int shallowest_idle_cpu = -1;
5628         int i;
5629
5630         /* Check if we have any choice: */
5631         if (group->group_weight == 1)
5632                 return cpumask_first(sched_group_span(group));
5633
5634         /* Traverse only the allowed CPUs */
5635         for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
5636                 if (idle_cpu(i)) {
5637                         struct rq *rq = cpu_rq(i);
5638                         struct cpuidle_state *idle = idle_get_state(rq);
5639                         if (idle && idle->exit_latency < min_exit_latency) {
5640                                 /*
5641                                  * We give priority to a CPU whose idle state
5642                                  * has the smallest exit latency irrespective
5643                                  * of any idle timestamp.
5644                                  */
5645                                 min_exit_latency = idle->exit_latency;
5646                                 latest_idle_timestamp = rq->idle_stamp;
5647                                 shallowest_idle_cpu = i;
5648                         } else if ((!idle || idle->exit_latency == min_exit_latency) &&
5649                                    rq->idle_stamp > latest_idle_timestamp) {
5650                                 /*
5651                                  * If equal or no active idle state, then
5652                                  * the most recently idled CPU might have
5653                                  * a warmer cache.
5654                                  */
5655                                 latest_idle_timestamp = rq->idle_stamp;
5656                                 shallowest_idle_cpu = i;
5657                         }
5658                 } else if (shallowest_idle_cpu == -1) {
5659                         load = weighted_cpuload(cpu_rq(i));
5660                         if (load < min_load || (load == min_load && i == this_cpu)) {
5661                                 min_load = load;
5662                                 least_loaded_cpu = i;
5663                         }
5664                 }
5665         }
5666
5667         return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
5668 }
5669
5670 #ifdef CONFIG_SCHED_SMT
5671
5672 static inline void set_idle_cores(int cpu, int val)
5673 {
5674         struct sched_domain_shared *sds;
5675
5676         sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
5677         if (sds)
5678                 WRITE_ONCE(sds->has_idle_cores, val);
5679 }
5680
5681 static inline bool test_idle_cores(int cpu, bool def)
5682 {
5683         struct sched_domain_shared *sds;
5684
5685         sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
5686         if (sds)
5687                 return READ_ONCE(sds->has_idle_cores);
5688
5689         return def;
5690 }
5691
5692 /*
5693  * Scans the local SMT mask to see if the entire core is idle, and records this
5694  * information in sd_llc_shared->has_idle_cores.
5695  *
5696  * Since SMT siblings share all cache levels, inspecting this limited remote
5697  * state should be fairly cheap.
5698  */
5699 void __update_idle_core(struct rq *rq)
5700 {
5701         int core = cpu_of(rq);
5702         int cpu;
5703
5704         rcu_read_lock();
5705         if (test_idle_cores(core, true))
5706                 goto unlock;
5707
5708         for_each_cpu(cpu, cpu_smt_mask(core)) {
5709                 if (cpu == core)
5710                         continue;
5711
5712                 if (!idle_cpu(cpu))
5713                         goto unlock;
5714         }
5715
5716         set_idle_cores(core, 1);
5717 unlock:
5718         rcu_read_unlock();
5719 }
5720
5721 /*
5722  * Scan the entire LLC domain for idle cores; this dynamically switches off if
5723  * there are no idle cores left in the system; tracked through
5724  * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
5725  */
5726 static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
5727 {
5728         struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
5729         int core, cpu;
5730
5731         if (!static_branch_likely(&sched_smt_present))
5732                 return -1;
5733
5734         if (!test_idle_cores(target, false))
5735                 return -1;
5736
5737         cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
5738
5739         for_each_cpu_wrap(core, cpus, target) {
5740                 bool idle = true;
5741
5742                 for_each_cpu(cpu, cpu_smt_mask(core)) {
5743                         cpumask_clear_cpu(cpu, cpus);
5744                         if (!idle_cpu(cpu))
5745                                 idle = false;
5746                 }
5747
5748                 if (idle)
5749                         return core;
5750         }
5751
5752         /*
5753          * Failed to find an idle core; stop looking for one.
5754          */
5755         set_idle_cores(target, 0);
5756
5757         return -1;
5758 }
5759
5760 /*
5761  * Scan the local SMT mask for idle CPUs.
5762  */
5763 static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
5764 {
5765         int cpu;
5766
5767         if (!static_branch_likely(&sched_smt_present))
5768                 return -1;
5769
5770         for_each_cpu(cpu, cpu_smt_mask(target)) {
5771                 if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
5772                         continue;
5773                 if (idle_cpu(cpu))
5774                         return cpu;
5775         }
5776
5777         return -1;
5778 }
5779
5780 #else /* CONFIG_SCHED_SMT */
5781
/* !CONFIG_SCHED_SMT: there are no SMT cores to scan, so report no result. */
static inline int
select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
{
	return -1;
}
5786
/* !CONFIG_SCHED_SMT: there are no SMT siblings to scan, so report no result. */
static inline int
select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
	return -1;
}
5791
5792 #endif /* CONFIG_SCHED_SMT */
5793
5794 /*
5795  * Scan the LLC domain for idle CPUs; this is dynamically regulated by
5796  * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
5797  * average idle time for this rq (as found in rq->avg_idle).
5798  */
5799 static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
5800 {
5801         struct sched_domain *this_sd;
5802         u64 avg_cost, avg_idle;
5803         u64 time, cost;
5804         s64 delta;
5805         int cpu, nr = INT_MAX;
5806
5807         this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
5808         if (!this_sd)
5809                 return -1;
5810
5811         /*
5812          * Due to large variance we need a large fuzz factor; hackbench in
5813          * particularly is sensitive here.
5814          */
5815         avg_idle = this_rq()->avg_idle / 512;
5816         avg_cost = this_sd->avg_scan_cost + 1;
5817
5818         if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
5819                 return -1;
5820
5821         if (sched_feat(SIS_PROP)) {
5822                 u64 span_avg = sd->span_weight * avg_idle;
5823                 if (span_avg > 4*avg_cost)
5824                         nr = div_u64(span_avg, avg_cost);
5825                 else
5826                         nr = 4;
5827         }
5828
5829         time = local_clock();
5830
5831         for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
5832                 if (!--nr)
5833                         return -1;
5834                 if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
5835                         continue;
5836                 if (idle_cpu(cpu))
5837                         break;
5838         }
5839
5840         time = local_clock() - time;
5841         cost = this_sd->avg_scan_cost;
5842         delta = (s64)(time - cost) / 8;
5843         this_sd->avg_scan_cost += delta;
5844
5845         return cpu;
5846 }
5847
5848 /*
5849  * Try and locate an idle core/thread in the LLC cache domain.
5850  */
5851 static int select_idle_sibling(struct task_struct *p, int prev, int target)
5852 {
5853         struct sched_domain *sd;
5854         int i;
5855
5856         if (idle_cpu(target))
5857                 return target;
5858
5859         /*
5860          * If the previous cpu is cache affine and idle, don't be stupid.
5861          */
5862         if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
5863                 return prev;
5864
5865         sd = rcu_dereference(per_cpu(sd_llc, target));
5866         if (!sd)
5867                 return target;
5868
5869         i = select_idle_core(p, sd, target);
5870         if ((unsigned)i < nr_cpumask_bits)
5871                 return i;
5872
5873         i = select_idle_cpu(p, sd, target);
5874         if ((unsigned)i < nr_cpumask_bits)
5875                 return i;
5876
5877         i = select_idle_smt(p, sd, target);
5878         if ((unsigned)i < nr_cpumask_bits)
5879                 return i;
5880
5881         return target;
5882 }
5883
5884 /*
5885  * cpu_util returns the amount of capacity of a CPU that is used by CFS
5886  * tasks. The unit of the return value must be the one of capacity so we can
5887  * compare the utilization with the capacity of the CPU that is available for
5888  * CFS task (ie cpu_capacity).
5889  *
5890  * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
5891  * recent utilization of currently non-runnable tasks on a CPU. It represents
5892  * the amount of utilization of a CPU in the range [0..capacity_orig] where
5893  * capacity_orig is the cpu_capacity available at the highest frequency
5894  * (arch_scale_freq_capacity()).
5895  * The utilization of a CPU converges towards a sum equal to or less than the
5896  * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
5897  * the running time on this CPU scaled by capacity_curr.
5898  *
5899  * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
5900  * higher than capacity_orig because of unfortunate rounding in
5901  * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
5902  * the average stabilizes with the new running time. We need to check that the
5903  * utilization stays within the range of [0..capacity_orig] and cap it if
5904  * necessary. Without utilization capping, a group could be seen as overloaded
5905  * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
5906  * available capacity. We allow utilization to overshoot capacity_curr (but not
5907  * capacity_orig) as it useful for predicting the capacity required after task
5908  * migrations (scheduler-driven DVFS).
5909  */
5910 static int cpu_util(int cpu)
5911 {
5912         unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
5913         unsigned long capacity = capacity_orig_of(cpu);
5914
5915         return (util >= capacity) ? capacity : util;
5916 }
5917
5918 static inline int task_util(struct task_struct *p)
5919 {
5920         return p->se.avg.util_avg;
5921 }
5922
5923 /*
5924  * cpu_util_wake: Compute cpu utilization with any contributions from
5925  * the waking task p removed.
5926  */
5927 static int cpu_util_wake(int cpu, struct task_struct *p)
5928 {
5929         unsigned long util, capacity;
5930
5931         /* Task has no contribution or is new */
5932         if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
5933                 return cpu_util(cpu);
5934
5935         capacity = capacity_orig_of(cpu);
5936         util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0);
5937
5938         return (util >= capacity) ? capacity : util;
5939 }
5940
5941 /*
5942  * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
5943  * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
5944  *
5945  * In that case WAKE_AFFINE doesn't make sense and we'll let
5946  * BALANCE_WAKE sort things out.
5947  */
5948 static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
5949 {
5950         long min_cap, max_cap;
5951
5952         min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
5953         max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
5954
5955         /* Minimum capacity is close to max, no need to abort wake_affine */
5956         if (max_cap - min_cap < max_cap >> 3)
5957                 return 0;
5958
5959         /* Bring task utilization in sync with prev_cpu */
5960         sync_entity_load_avg(&p->se);
5961
5962         return min_cap * 1024 < task_util(p) * capacity_margin;
5963 }
5964
5965 /*
5966  * select_task_rq_fair: Select target runqueue for the waking task in domains
5967  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
5968  * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
5969  *
5970  * Balances load by selecting the idlest cpu in the idlest group, or under
5971  * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
5972  *
5973  * Returns the target cpu number.
5974  *
5975  * preempt must be disabled.
5976  */
5977 static int
5978 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
5979 {
5980         struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
5981         int cpu = smp_processor_id();
5982         int new_cpu = prev_cpu;
5983         int want_affine = 0;
5984         int sync = wake_flags & WF_SYNC;
5985
5986         if (sd_flag & SD_BALANCE_WAKE) {
5987                 record_wakee(p);
5988                 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
5989                               && cpumask_test_cpu(cpu, &p->cpus_allowed);
5990         }
5991
5992         rcu_read_lock();
5993         for_each_domain(cpu, tmp) {
5994                 if (!(tmp->flags & SD_LOAD_BALANCE))
5995                         break;
5996
5997                 /*
5998                  * If both cpu and prev_cpu are part of this domain,
5999                  * cpu is a valid SD_WAKE_AFFINE target.
6000                  */
6001                 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
6002                     cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
6003                         affine_sd = tmp;
6004                         break;
6005                 }
6006
6007                 if (tmp->flags & sd_flag)
6008                         sd = tmp;
6009                 else if (!want_affine)
6010                         break;
6011         }
6012
6013         if (affine_sd) {
6014                 sd = NULL; /* Prefer wake_affine over balance flags */
6015                 if (cpu == prev_cpu)
6016                         goto pick_cpu;
6017
6018                 if (wake_affine(affine_sd, p, prev_cpu, sync))
6019                         new_cpu = cpu;
6020         }
6021
6022         if (!sd) {
6023  pick_cpu:
6024                 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
6025                         new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
6026
6027         } else while (sd) {
6028                 struct sched_group *group;
6029                 int weight;
6030
6031                 if (!(sd->flags & sd_flag)) {
6032                         sd = sd->child;
6033                         continue;
6034                 }
6035
6036                 group = find_idlest_group(sd, p, cpu, sd_flag);
6037                 if (!group) {
6038                         sd = sd->child;
6039                         continue;
6040                 }
6041
6042                 new_cpu = find_idlest_cpu(group, p, cpu);
6043                 if (new_cpu == -1 || new_cpu == cpu) {
6044                         /* Now try balancing at a lower domain level of cpu */
6045                         sd = sd->child;
6046                         continue;
6047                 }
6048
6049                 /* Now try balancing at a lower domain level of new_cpu */
6050                 cpu = new_cpu;
6051                 weight = sd->span_weight;
6052                 sd = NULL;
6053                 for_each_domain(cpu, tmp) {
6054                         if (weight <= tmp->span_weight)
6055                                 break;
6056                         if (tmp->flags & sd_flag)
6057                                 sd = tmp;
6058                 }
6059                 /* while loop will break here if sd == NULL */
6060         }
6061         rcu_read_unlock();
6062
6063         return new_cpu;
6064 }
6065
6066 /*
6067  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
6068  * cfs_rq_of(p) references at time of call are still valid and identify the
6069  * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
6070  */
6071 static void migrate_task_rq_fair(struct task_struct *p)
6072 {
6073         /*
6074          * As blocked tasks retain absolute vruntime the migration needs to
6075          * deal with this by subtracting the old and adding the new
6076          * min_vruntime -- the latter is done by enqueue_entity() when placing
6077          * the task on the new runqueue.
6078          */
6079         if (p->state == TASK_WAKING) {
6080                 struct sched_entity *se = &p->se;
6081                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
6082                 u64 min_vruntime;
6083
6084 #ifndef CONFIG_64BIT
6085                 u64 min_vruntime_copy;
6086
6087                 do {
6088                         min_vruntime_copy = cfs_rq->min_vruntime_copy;
6089                         smp_rmb();
6090                         min_vruntime = cfs_rq->min_vruntime;
6091                 } while (min_vruntime != min_vruntime_copy);
6092 #else
6093                 min_vruntime = cfs_rq->min_vruntime;
6094 #endif
6095
6096                 se->vruntime -= min_vruntime;
6097         }
6098
6099         /*
6100          * We are supposed to update the task to "current" time, then its up to date
6101          * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
6102          * what current time is, so simply throw away the out-of-date time. This
6103          * will result in the wakee task is less decayed, but giving the wakee more
6104          * load sounds not bad.
6105          */
6106         remove_entity_load_avg(&p->se);
6107
6108         /* Tell new CPU we are migrated */
6109         p->se.avg.last_update_time = 0;
6110
6111         /* We have migrated, no longer consider this task hot */
6112         p->se.exec_start = 0;
6113 }
6114
6115 static void task_dead_fair(struct task_struct *p)
6116 {
6117         remove_entity_load_avg(&p->se);
6118 }
6119 #endif /* CONFIG_SMP */
6120
6121 static unsigned long
6122 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
6123 {
6124         unsigned long gran = sysctl_sched_wakeup_granularity;
6125
6126         /*
6127          * Since its curr running now, convert the gran from real-time
6128          * to virtual-time in his units.
6129          *
6130          * By using 'se' instead of 'curr' we penalize light tasks, so
6131          * they get preempted easier. That is, if 'se' < 'curr' then
6132          * the resulting gran will be larger, therefore penalizing the
6133          * lighter, if otoh 'se' > 'curr' then the resulting gran will
6134          * be smaller, again penalizing the lighter task.
6135          *
6136          * This is especially important for buddies when the leftmost
6137          * task is higher priority than the buddy.
6138          */
6139         return calc_delta_fair(gran, se);
6140 }
6141
6142 /*
6143  * Should 'se' preempt 'curr'.
6144  *
6145  *             |s1
6146  *        |s2
6147  *   |s3
6148  *         g
6149  *      |<--->|c
6150  *
6151  *  w(c, s1) = -1
6152  *  w(c, s2) =  0
6153  *  w(c, s3) =  1
6154  *
6155  */
6156 static int
6157 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
6158 {
6159         s64 gran, vdiff = curr->vruntime - se->vruntime;
6160
6161         if (vdiff <= 0)
6162                 return -1;
6163
6164         gran = wakeup_gran(curr, se);
6165         if (vdiff > gran)
6166                 return 1;
6167
6168         return 0;
6169 }
6170
6171 static void set_last_buddy(struct sched_entity *se)
6172 {
6173         if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
6174                 return;
6175
6176         for_each_sched_entity(se) {
6177                 if (SCHED_WARN_ON(!se->on_rq))
6178                         return;
6179                 cfs_rq_of(se)->last = se;
6180         }
6181 }
6182
6183 static void set_next_buddy(struct sched_entity *se)
6184 {
6185         if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
6186                 return;
6187
6188         for_each_sched_entity(se) {
6189                 if (SCHED_WARN_ON(!se->on_rq))
6190                         return;
6191                 cfs_rq_of(se)->next = se;
6192         }
6193 }
6194
6195 static void set_skip_buddy(struct sched_entity *se)
6196 {
6197         for_each_sched_entity(se)
6198                 cfs_rq_of(se)->skip = se;
6199 }
6200
6201 /*
6202  * Preempt the current task with a newly woken task if needed:
6203  */
6204 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
6205 {
6206         struct task_struct *curr = rq->curr;
6207         struct sched_entity *se = &curr->se, *pse = &p->se;
6208         struct cfs_rq *cfs_rq = task_cfs_rq(curr);
6209         int scale = cfs_rq->nr_running >= sched_nr_latency;
6210         int next_buddy_marked = 0;
6211
6212         if (unlikely(se == pse))
6213                 return;
6214
6215         /*
6216          * This is possible from callers such as attach_tasks(), in which we
6217          * unconditionally check_prempt_curr() after an enqueue (which may have
6218          * lead to a throttle).  This both saves work and prevents false
6219          * next-buddy nomination below.
6220          */
6221         if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
6222                 return;
6223
6224         if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
6225                 set_next_buddy(pse);
6226                 next_buddy_marked = 1;
6227         }
6228
6229         /*
6230          * We can come here with TIF_NEED_RESCHED already set from new task
6231          * wake up path.
6232          *
6233          * Note: this also catches the edge-case of curr being in a throttled
6234          * group (e.g. via set_curr_task), since update_curr() (in the
6235          * enqueue of curr) will have resulted in resched being set.  This
6236          * prevents us from potentially nominating it as a false LAST_BUDDY
6237          * below.
6238          */
6239         if (test_tsk_need_resched(curr))
6240                 return;
6241
6242         /* Idle tasks are by definition preempted by non-idle tasks. */
6243         if (unlikely(curr->policy == SCHED_IDLE) &&
6244             likely(p->policy != SCHED_IDLE))
6245                 goto preempt;
6246
6247         /*
6248          * Batch and idle tasks do not preempt non-idle tasks (their preemption
6249          * is driven by the tick):
6250          */
6251         if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
6252                 return;
6253
6254         find_matching_se(&se, &pse);
6255         update_curr(cfs_rq_of(se));
6256         BUG_ON(!pse);
6257         if (wakeup_preempt_entity(se, pse) == 1) {
6258                 /*
6259                  * Bias pick_next to pick the sched entity that is
6260                  * triggering this preemption.
6261                  */
6262                 if (!next_buddy_marked)
6263                         set_next_buddy(pse);
6264                 goto preempt;
6265         }
6266
6267         return;
6268
6269 preempt:
6270         resched_curr(rq);
6271         /*
6272          * Only set the backward buddy when the current task is still
6273          * on the rq. This can happen when a wakeup gets interleaved
6274          * with schedule on the ->pre_schedule() or idle_balance()
6275          * point, either of which can * drop the rq lock.
6276          *
6277          * Also, during early boot the idle thread is in the fair class,
6278          * for obvious reasons its a bad idea to schedule back to it.
6279          */
6280         if (unlikely(!se->on_rq || curr == rq->idle))
6281                 return;
6282
6283         if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
6284                 set_last_buddy(se);
6285 }
6286
6287 static struct task_struct *
6288 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
6289 {
6290         struct cfs_rq *cfs_rq = &rq->cfs;
6291         struct sched_entity *se;
6292         struct task_struct *p;
6293         int new_tasks;
6294
6295 again:
6296         if (!cfs_rq->nr_running)
6297                 goto idle;
6298
6299 #ifdef CONFIG_FAIR_GROUP_SCHED
6300         if (prev->sched_class != &fair_sched_class)
6301                 goto simple;
6302
6303         /*
6304          * Because of the set_next_buddy() in dequeue_task_fair() it is rather
6305          * likely that a next task is from the same cgroup as the current.
6306          *
6307          * Therefore attempt to avoid putting and setting the entire cgroup
6308          * hierarchy, only change the part that actually changes.
6309          */
6310
6311         do {
6312                 struct sched_entity *curr = cfs_rq->curr;
6313
6314                 /*
6315                  * Since we got here without doing put_prev_entity() we also
6316                  * have to consider cfs_rq->curr. If it is still a runnable
6317                  * entity, update_curr() will update its vruntime, otherwise
6318                  * forget we've ever seen it.
6319                  */
6320                 if (curr) {
6321                         if (curr->on_rq)
6322                                 update_curr(cfs_rq);
6323                         else
6324                                 curr = NULL;
6325
6326                         /*
6327                          * This call to check_cfs_rq_runtime() will do the
6328                          * throttle and dequeue its entity in the parent(s).
6329                          * Therefore the nr_running test will indeed
6330                          * be correct.
6331                          */
6332                         if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
6333                                 cfs_rq = &rq->cfs;
6334
6335                                 if (!cfs_rq->nr_running)
6336                                         goto idle;
6337
6338                                 goto simple;
6339                         }
6340                 }
6341
6342                 se = pick_next_entity(cfs_rq, curr);
6343                 cfs_rq = group_cfs_rq(se);
6344         } while (cfs_rq);
6345
6346         p = task_of(se);
6347
6348         /*
6349          * Since we haven't yet done put_prev_entity and if the selected task
6350          * is a different task than we started out with, try and touch the
6351          * least amount of cfs_rqs.
6352          */
6353         if (prev != p) {
6354                 struct sched_entity *pse = &prev->se;
6355
6356                 while (!(cfs_rq = is_same_group(se, pse))) {
6357                         int se_depth = se->depth;
6358                         int pse_depth = pse->depth;
6359
6360                         if (se_depth <= pse_depth) {
6361                                 put_prev_entity(cfs_rq_of(pse), pse);
6362                                 pse = parent_entity(pse);
6363                         }
6364                         if (se_depth >= pse_depth) {
6365                                 set_next_entity(cfs_rq_of(se), se);
6366                                 se = parent_entity(se);
6367                         }
6368                 }
6369
6370                 put_prev_entity(cfs_rq, pse);
6371                 set_next_entity(cfs_rq, se);
6372         }
6373
6374         if (hrtick_enabled(rq))
6375                 hrtick_start_fair(rq, p);
6376
6377         return p;
6378 simple:
6379 #endif
6380
6381         put_prev_task(rq, prev);
6382
6383         do {
6384                 se = pick_next_entity(cfs_rq, NULL);
6385                 set_next_entity(cfs_rq, se);
6386                 cfs_rq = group_cfs_rq(se);
6387         } while (cfs_rq);
6388
6389         p = task_of(se);
6390
6391         if (hrtick_enabled(rq))
6392                 hrtick_start_fair(rq, p);
6393
6394         return p;
6395
6396 idle:
6397         new_tasks = idle_balance(rq, rf);
6398
6399         /*
6400          * Because idle_balance() releases (and re-acquires) rq->lock, it is
6401          * possible for any higher priority task to appear. In that case we
6402          * must re-start the pick_next_entity() loop.
6403          */
6404         if (new_tasks < 0)
6405                 return RETRY_TASK;
6406
6407         if (new_tasks > 0)
6408                 goto again;
6409
6410         return NULL;
6411 }
6412
6413 /*
6414  * Account for a descheduled task:
6415  */
6416 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
6417 {
6418         struct sched_entity *se = &prev->se;
6419         struct cfs_rq *cfs_rq;
6420
6421         for_each_sched_entity(se) {
6422                 cfs_rq = cfs_rq_of(se);
6423                 put_prev_entity(cfs_rq, se);
6424         }
6425 }
6426
6427 /*
6428  * sched_yield() is very simple
6429  *
6430  * The magic of dealing with the ->skip buddy is in pick_next_entity.
6431  */
6432 static void yield_task_fair(struct rq *rq)
6433 {
6434         struct task_struct *curr = rq->curr;
6435         struct cfs_rq *cfs_rq = task_cfs_rq(curr);
6436         struct sched_entity *se = &curr->se;
6437
6438         /*
6439          * Are we the only task in the tree?
6440          */
6441         if (unlikely(rq->nr_running == 1))
6442                 return;
6443
6444         clear_buddies(cfs_rq, se);
6445
6446         if (curr->policy != SCHED_BATCH) {
6447                 update_rq_clock(rq);
6448                 /*
6449                  * Update run-time statistics of the 'current'.
6450                  */
6451                 update_curr(cfs_rq);
6452                 /*
6453                  * Tell update_rq_clock() that we've just updated,
6454                  * so we don't do microscopic update in schedule()
6455                  * and double the fastpath cost.
6456                  */
6457                 rq_clock_skip_update(rq, true);
6458         }
6459
6460         set_skip_buddy(se);
6461 }
6462
6463 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
6464 {
6465         struct sched_entity *se = &p->se;
6466
6467         /* throttled hierarchies are not runnable */
6468         if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
6469                 return false;
6470
6471         /* Tell the scheduler that we'd really like pse to run next. */
6472         set_next_buddy(se);
6473
6474         yield_task_fair(rq);
6475
6476         return true;
6477 }
6478
6479 #ifdef CONFIG_SMP
6480 /**************************************************
6481  * Fair scheduling class load-balancing methods.
6482  *
6483  * BASICS
6484  *
6485  * The purpose of load-balancing is to achieve the same basic fairness the
6486  * per-cpu scheduler provides, namely provide a proportional amount of compute
6487  * time to each task. This is expressed in the following equation:
6488  *
6489  *   W_i,n/C_i == W_j,n/C_j for all i,j                               (1)
6490  *
6491  * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
6492  * W_i,0 is defined as:
6493  *
6494  *   W_i,0 = \Sum_j w_i,j                                             (2)
6495  *
6496  * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
6497  * is derived from the nice value as per sched_prio_to_weight[].
6498  *
6499  * The weight average is an exponential decay average of the instantaneous
6500  * weight:
6501  *
6502  *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
6503  *
6504  * C_i is the compute capacity of cpu i, typically it is the
6505  * fraction of 'recent' time available for SCHED_OTHER task execution. But it
6506  * can also include other factors [XXX].
6507  *
6508  * To achieve this balance we define a measure of imbalance which follows
6509  * directly from (1):
6510  *
6511  *   imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j }    (4)
6512  *
6513  * We then move tasks around to minimize the imbalance. In the continuous
6514  * function space it is obvious this converges, in the discrete case we get
6515  * a few fun cases generally called infeasible weight scenarios.
6516  *
6517  * [XXX expand on:
6518  *     - infeasible weights;
6519  *     - local vs global optima in the discrete case. ]
6520  *
6521  *
6522  * SCHED DOMAINS
6523  *
6524  * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
6525  * for all i,j solution, we create a tree of cpus that follows the hardware
6526  * topology where each level pairs two lower groups (or better). This results
6527  * in O(log n) layers. Furthermore we reduce the number of cpus going up the
6528  * tree to only the first of the previous level and we decrease the frequency
6529  * of load-balance at each level inv. proportional to the number of cpus in
6530  * the groups.
6531  *
6532  * This yields:
6533  *
6534  *     log_2 n     1     n
6535  *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
6536  *     i = 0      2^i   2^i
6537  *                               `- size of each group
6538  *         |         |     `- number of cpus doing load-balance
6539  *         |         `- freq
6540  *         `- sum over all levels
6541  *
6542  * Coupled with a limit on how many tasks we can migrate every balance pass,
6543  * this makes (5) the runtime complexity of the balancer.
6544  *
6545  * An important property here is that each CPU is still (indirectly) connected
6546  * to every other cpu in at most O(log n) steps:
6547  *
6548  * The adjacency matrix of the resulting graph is given by:
6549  *
6550  *             log_2 n
6551  *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
6552  *             k = 0
6553  *
6554  * And you'll find that:
6555  *
6556  *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
6557  *
6558  * Showing there's indeed a path between every cpu in at most O(log n) steps.
6559  * The task movement gives a factor of O(m), giving a convergence complexity
6560  * of:
6561  *
6562  *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
6563  *
6564  *
6565  * WORK CONSERVING
6566  *
6567  * In order to avoid CPUs going idle while there's still work to do, new idle
6568  * balancing is more aggressive and has the newly idle cpu iterate up the domain
6569  * tree itself instead of relying on other CPUs to bring it work.
6570  *
6571  * This adds some complexity to both (5) and (8) but it reduces the total idle
6572  * time.
6573  *
6574  * [XXX more?]
6575  *
6576  *
6577  * CGROUPS
6578  *
6579  * Cgroups make a horror show out of (2), instead of a simple sum we get:
6580  *
6581  *                                s_k,i
6582  *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
6583  *                                 S_k
6584  *
6585  * Where
6586  *
6587  *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
6588  *
6589  * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
6590  *
6591  * The big problem is S_k, it's a global sum needed to compute a local (W_i)
6592  * property.
6593  *
6594  * [XXX write more on how we solve this.. _after_ merging pjt's patches that
6595  *      rewrite all of this once again.]
6596  */
6597
6598 static unsigned long __read_mostly max_load_balance_interval = HZ/10; /* NOTE(review): HZ/10 is presumably in jiffies (~100ms) -- confirm at the use sites */
6599
6600 enum fbq_type { regular, remote, all }; /* type of lb_env.fbq_type; NOTE(review): meaning of the values is not visible here -- confirm */
6601
6602 #define LBF_ALL_PINNED  0x01 /* lb_env.flags bit: cleared by can_migrate_task() once a task allowed on dst_cpu is seen */
6603 #define LBF_NEED_BREAK  0x02 /* lb_env.flags bit: NOTE(review): not used in the visible code, presumably tied to loop_break -- confirm */
6604 #define LBF_DST_PINNED  0x04 /* lb_env.flags bit: set by can_migrate_task() when it records an alternative new_dst_cpu */
6605 #define LBF_SOME_PINNED 0x08 /* lb_env.flags bit: set by can_migrate_task() when cpus_allowed of a task excludes dst_cpu */
6606
6607 struct lb_env { /* context handed to the load-balancing helpers (task_hot(), can_migrate_task(), ...) */
6608         struct sched_domain     *sd; /* domain being balanced; its flags and nr_balance_failed are consulted */
6609
6610         struct rq               *src_rq; /* runqueue tasks are taken from; its lock is asserted held by the helpers */
6611         int                     src_cpu; /* NOTE(review): presumably the CPU owning src_rq -- confirm */
6612
6613         int                     dst_cpu; /* CPU tasks are moved to; checked against cpus_allowed of each task */
6614         struct rq               *dst_rq; /* NOTE(review): presumably the runqueue of dst_cpu -- confirm */
6615
6616         struct cpumask          *dst_grpmask; /* searched by can_migrate_task() for an alternative destination */
6617         int                     new_dst_cpu; /* alternative destination recorded together with LBF_DST_PINNED */
6618         enum cpu_idle_type      idle; /* compared against CPU_NEWLY_IDLE / CPU_NOT_IDLE by the helpers */
6619         long                    imbalance; /* NOTE(review): not used in the visible code; presumably the load left to move -- confirm */
6620         /* The set of CPUs under consideration for load-balancing */
6621         struct cpumask          *cpus;
6622
6623         unsigned int            flags; /* bitwise OR of the LBF_* values */
6624
6625         unsigned int            loop; /* NOTE(review): loop, loop_break and loop_max are not used in the visible code; */
6626         unsigned int            loop_break; /* presumably they bound the task-scan loop -- confirm */
6627         unsigned int            loop_max;
6628
6629         enum fbq_type           fbq_type; /* NOTE(review): not used in the visible code -- confirm semantics */
6630         struct list_head        tasks; /* NOTE(review): presumably the list of detached tasks -- confirm */
6631 };
6632
6633 /*
6634  * Is this task likely cache-hot:
6635  */
6636 static int task_hot(struct task_struct *p, struct lb_env *env)
6637 {
6638         s64 delta;
6639
6640         lockdep_assert_held(&env->src_rq->lock);
6641
6642         if (p->sched_class != &fair_sched_class)
6643                 return 0;
6644
6645         if (unlikely(p->policy == SCHED_IDLE))
6646                 return 0;
6647
6648         /*
6649          * Buddy candidates are cache hot:
6650          */
6651         if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
6652                         (&p->se == cfs_rq_of(&p->se)->next ||
6653                          &p->se == cfs_rq_of(&p->se)->last))
6654                 return 1;
6655
6656         if (sysctl_sched_migration_cost == -1)
6657                 return 1;
6658         if (sysctl_sched_migration_cost == 0)
6659                 return 0;
6660
6661         delta = rq_clock_task(env->src_rq) - p->se.exec_start;
6662
6663         return delta < (s64)sysctl_sched_migration_cost;
6664 }
6665
#ifdef CONFIG_NUMA_BALANCING
/*
 * Returns 1, if task migration degrades locality
 * Returns 0, if task migration improves locality i.e migration preferred.
 * Returns -1, if task migration is not affected by locality.
 */
static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
	struct numa_group *ng = rcu_dereference(p->numa_group);
	unsigned long src_faults, dst_faults;
	int src_nid, dst_nid;

	if (!static_branch_likely(&sched_numa_balancing))
		return -1;

	/* No fault statistics, or the domain being balanced is not NUMA. */
	if (!(p->numa_faults && (env->sd->flags & SD_NUMA)))
		return -1;

	src_nid = cpu_to_node(env->src_cpu);
	dst_nid = cpu_to_node(env->dst_cpu);

	if (src_nid == dst_nid)
		return -1;

	/* Migrating away from the preferred node is always bad. */
	if (src_nid == p->numa_preferred_nid)
		return env->src_rq->nr_running > env->src_rq->nr_preferred_running ?
		       1 : -1;

	/* Encourage migration to the preferred node. */
	if (dst_nid == p->numa_preferred_nid)
		return 0;

	/* Leaving a core idle is often worse than degrading locality. */
	if (env->idle != CPU_NOT_IDLE)
		return -1;

	/* Group fault counts are used when the task has a numa_group. */
	src_faults = ng ? group_faults(p, src_nid) : task_faults(p, src_nid);
	dst_faults = ng ? group_faults(p, dst_nid) : task_faults(p, dst_nid);

	return dst_faults < src_faults;
}

#else
/* Without NUMA balancing, locality never affects the migration decision. */
static inline int migrate_degrades_locality(struct task_struct *p,
					     struct lb_env *env)
{
	return -1;
}
#endif
6724
6725 /*
6726  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
6727  */
6728 static
6729 int can_migrate_task(struct task_struct *p, struct lb_env *env)
6730 {
6731         int tsk_cache_hot;
6732
6733         lockdep_assert_held(&env->src_rq->lock);
6734
6735         /*
6736          * We do not migrate tasks that are:
6737          * 1) throttled_lb_pair, or
6738          * 2) cannot be migrated to this CPU due to cpus_allowed, or
6739          * 3) running (obviously), or
6740          * 4) are cache-hot on their current CPU.
6741          */
6742         if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
6743                 return 0;
6744
6745         if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
6746                 int cpu;
6747
6748                 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
6749
6750                 env->flags |= LBF_SOME_PINNED;
6751
6752                 /*
6753                  * Remember if this task can be migrated to any other cpu in
6754                  * our sched_group. We may want to revisit it if we couldn't
6755                  * meet load balance goals by pulling other tasks on src_cpu.
6756                  *
6757                  * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
6758                  * already computed one in current iteration.
6759                  */
6760                 if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
6761                         return 0;
6762
6763                 /* Prevent to re-select dst_cpu via env's cpus */
6764                 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
6765                         if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
6766                                 env->flags |= LBF_DST_PINNED;
6767                                 env->new_dst_cpu = cpu;
6768                                 break;
6769                         }
6770                 }
6771
6772                 return 0;
6773         }
6774
6775         /* Record that we found atleast one task that could run on dst_cpu */
6776         env->flags &= ~LBF_ALL_PINNED;
6777
6778         if (task_running(env->src_rq, p)) {
6779                 schedstat_inc(p->se.statistics.nr_failed_migrations_running);
6780                 return 0;
6781         }
6782
6783         /*
6784          * Aggressive migration if:
6785          * 1) destination numa is preferred
6786          * 2) task is cache cold, or
6787          * 3) too many balance attempts have failed.
6788          */
6789         tsk_cache_hot = migrate_degrades_locality(p, env);
6790         if (tsk_cache_hot == -1)
6791                 tsk_cache_hot = task_hot(p, env);
6792
6793         if (tsk_cache_hot <= 0 ||
6794             env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
6795                 if (tsk_cache_hot == 1) {
6796                         schedstat_inc(env->sd->lb_hot_gained[env->idle]);
6797                         schedstat_inc(p->se.statistics.nr_forced_migrations);
6798                 }
6799                 return 1;
6800         }
6801
6802         schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
6803         return 0;
6804 }
6805
6806 /*
6807  * detach_task() -- detach the task for the migration specified in env
6808  */
6809 static void detach_task(struct task_struct *p, struct lb_env *env)
6810 {
6811         lockdep_assert_held(&env->src_rq->lock);
6812
6813         p->on_rq = TASK_ON_RQ_MIGRATING;
6814         deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
6815         set_task_cpu(p, env->dst_cpu);
6816 }
6817
6818 /*
6819  * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
6820  * part of active balancing operations within "domain".
6821  *
6822  * Returns a task if successful and NULL otherwise.
6823  */
6824 static struct task_struct *detach_one_task(struct lb_env *env)
6825 {
6826         struct task_struct *p, *n;
6827
6828         lockdep_assert_held(&env->src_rq->lock);
6829
6830         list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
6831                 if (!can_migrate_task(p, env))
6832                         continue;
6833
6834                 detach_task(p, env);
6835
6836                 /*
6837                  * Right now, this is only the second place where
6838                  * lb_gained[env->idle] is updated (other is detach_tasks)
6839                  * so we can safely collect stats here rather than
6840                  * inside detach_tasks().
6841                  */
6842                 schedstat_inc(env->sd->lb_gained[env->idle]);
6843                 return p;
6844         }
6845         return NULL;
6846 }
6847
/*
 * Step by which detach_tasks() advances env->loop_break each time it stops
 * early and sets LBF_NEED_BREAK.
 */
static const unsigned int sched_nr_migrate_break = 32;
6849
/*
 * detach_tasks() -- tries to detach up to env->imbalance weighted load from
 * env->src_rq, as part of a balancing operation within domain "sd".
 *
 * Caller must hold env->src_rq->lock. Detached tasks are collected on
 * env->tasks, and env->imbalance is reduced by the load of each one.
 *
 * Returns number of detached tasks if successful and 0 otherwise.
 */
static int detach_tasks(struct lb_env *env)
{
        struct list_head *tasks = &env->src_rq->cfs_tasks;
        struct task_struct *p;
        unsigned long load;
        int detached = 0;

        lockdep_assert_held(&env->src_rq->lock);

        /* Nothing to move */
        if (env->imbalance <= 0)
                return 0;

        while (!list_empty(tasks)) {
                /*
                 * We don't want to steal all, otherwise we may be treated likewise,
                 * which could at worst lead to a livelock crash.
                 */
                if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
                        break;

                /* Always examine the head; rejected tasks are rotated to the tail */
                p = list_first_entry(tasks, struct task_struct, se.group_node);

                env->loop++;
                /* We've more or less seen every task there is, call it quits */
                if (env->loop > env->loop_max)
                        break;

                /* take a breather every nr_migrate tasks */
                if (env->loop > env->loop_break) {
                        env->loop_break += sched_nr_migrate_break;
                        env->flags |= LBF_NEED_BREAK;
                        break;
                }

                if (!can_migrate_task(p, env))
                        goto next;

                load = task_h_load(p);

                /* With LB_MIN, ignore tiny tasks until balancing has failed at least once */
                if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
                        goto next;

                /* Skip a task whose load is more than twice the remaining imbalance */
                if ((load / 2) > env->imbalance)
                        goto next;

                detach_task(p, env);
                list_add(&p->se.group_node, &env->tasks);

                detached++;
                env->imbalance -= load;

#ifdef CONFIG_PREEMPT
                /*
                 * NEWIDLE balancing is a source of latency, so preemptible
                 * kernels will stop after the first task is detached to minimize
                 * the critical section.
                 */
                if (env->idle == CPU_NEWLY_IDLE)
                        break;
#endif

                /*
                 * We only want to steal up to the prescribed amount of
                 * weighted load.
                 */
                if (env->imbalance <= 0)
                        break;

                continue;
next:
                /* Rotate the rejected task out of the way so the next head is examined */
                list_move_tail(&p->se.group_node, tasks);
        }

        /*
         * lb_gained[env->idle] is only updated here and in detach_one_task(),
         * so the whole batch can be accounted with a single add once the
         * loop has finished.
         */
        schedstat_add(env->sd->lb_gained[env->idle], detached);

        return detached;
}
6938
/*
 * attach_task() -- attach the task detached by detach_task() to its new rq.
 *
 * Caller must hold rq->lock, and p must already be assigned to rq (it is a
 * BUG otherwise). The task is enqueued, marked TASK_ON_RQ_QUEUED again, and
 * then checked against the task currently running on rq for preemption.
 */
static void attach_task(struct rq *rq, struct task_struct *p)
{
        lockdep_assert_held(&rq->lock);

        BUG_ON(task_rq(p) != rq);
        activate_task(rq, p, ENQUEUE_NOCLOCK);
        p->on_rq = TASK_ON_RQ_QUEUED;
        check_preempt_curr(rq, p, 0);
}
6951
/*
 * attach_one_task() -- attaches the task returned from detach_one_task() to
 * its new rq.
 *
 * Takes and releases rq->lock itself, and refreshes the rq clock before the
 * attach because attach_task() enqueues with ENQUEUE_NOCLOCK.
 */
static void attach_one_task(struct rq *rq, struct task_struct *p)
{
        struct rq_flags rf;

        rq_lock(rq, &rf);
        update_rq_clock(rq);
        attach_task(rq, p);
        rq_unlock(rq, &rf);
}
6965
6966 /*
6967  * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
6968  * new rq.
6969  */
6970 static void attach_tasks(struct lb_env *env)
6971 {
6972         struct list_head *tasks = &env->tasks;
6973         struct task_struct *p;
6974         struct rq_flags rf;
6975
6976         rq_lock(env->dst_rq, &rf);
6977         update_rq_clock(env->dst_rq);
6978
6979         while (!list_empty(tasks)) {
6980                 p = list_first_entry(tasks, struct task_struct, se.group_node);
6981                 list_del_init(&p->se.group_node);
6982
6983                 attach_task(env->dst_rq, p);
6984         }
6985
6986         rq_unlock(env->dst_rq, &rf);
6987 }
6988
6989 #ifdef CONFIG_FAIR_GROUP_SCHED
6990
6991 static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
6992 {
6993         if (cfs_rq->load.weight)
6994                 return false;
6995
6996         if (cfs_rq->avg.load_sum)
6997                 return false;
6998
6999         if (cfs_rq->avg.util_sum)
7000                 return false;
7001
7002         if (cfs_rq->runnable_load_sum)
7003                 return false;
7004
7005         return true;
7006 }
7007
/*
 * update_blocked_averages - bring the load averages of every leaf cfs_rq on
 * @cpu's runqueue up to date and drop the ones that have fully decayed.
 *
 * Holds rq->lock with interrupts disabled for the whole walk.
 */
static void update_blocked_averages(int cpu)
{
        struct rq *rq = cpu_rq(cpu);
        struct cfs_rq *cfs_rq, *pos;
        struct rq_flags rf;

        rq_lock_irqsave(rq, &rf);
        update_rq_clock(rq);

        /*
         * Iterates the task_group tree in a bottom up fashion, see
         * list_add_leaf_cfs_rq() for details.
         *
         * The _safe iterator is used because the current cfs_rq may be
         * removed from the list at the bottom of the loop body.
         */
        for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
                struct sched_entity *se;

                /* throttled entities do not contribute to load */
                if (throttled_hierarchy(cfs_rq))
                        continue;

                /* Only refresh the task_group average when this cfs_rq's own average changed */
                if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
                        update_tg_load_avg(cfs_rq, 0);

                /* Propagate pending load changes to the parent, if any: */
                se = cfs_rq->tg->se[cpu];
                if (se && !skip_blocked_update(se))
                        update_load_avg(se, 0);

                /*
                 * There can be a lot of idle CPU cgroups.  Don't let fully
                 * decayed cfs_rqs linger on the list.
                 */
                if (cfs_rq_is_decayed(cfs_rq))
                        list_del_leaf_cfs_rq(cfs_rq);
        }
        rq_unlock_irqrestore(rq, &rf);
}
7045
/*
 * Compute the hierarchical load factor for cfs_rq and all its ascendants.
 * This needs to be done in a top-down fashion because the load of a child
 * group is a fraction of its parents load.
 *
 * The result is cached per jiffy in cfs_rq->last_h_load_update, so repeated
 * calls within the same jiffy return immediately.
 */
static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
{
        struct rq *rq = rq_of(cfs_rq);
        struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
        unsigned long now = jiffies;
        unsigned long load;

        /* Already computed during this jiffy */
        if (cfs_rq->last_h_load_update == now)
                return;

        /*
         * Phase 1: walk up from cfs_rq, recording in each ancestor the
         * entity to descend through later (h_load_next). The walk stops
         * early at the first ancestor that is already up to date. The
         * starting cfs_rq gets a NULL link so that the descent below
         * terminates there.
         */
        cfs_rq->h_load_next = NULL;
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                cfs_rq->h_load_next = se;
                if (cfs_rq->last_h_load_update == now)
                        break;
        }

        /*
         * The walk ran off the top of the hierarchy: cfs_rq is now the
         * topmost queue, whose hierarchical load is simply its own load
         * average.
         */
        if (!se) {
                cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
                cfs_rq->last_h_load_update = now;
        }

        /*
         * Phase 2: walk back down along the recorded links. Each child gets
         * the parent's h_load scaled by its entity's share of the parent's
         * load average; the +1 keeps the divisor non-zero.
         */
        while ((se = cfs_rq->h_load_next) != NULL) {
                load = cfs_rq->h_load;
                load = div64_ul(load * se->avg.load_avg,
                        cfs_rq_load_avg(cfs_rq) + 1);
                cfs_rq = group_cfs_rq(se);
                cfs_rq->h_load = load;
                cfs_rq->last_h_load_update = now;
        }
}
7083
7084 static unsigned long task_h_load(struct task_struct *p)
7085 {
7086         struct cfs_rq *cfs_rq = task_cfs_rq(p);
7087
7088         update_cfs_rq_h_load(cfs_rq);
7089         return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
7090                         cfs_rq_load_avg(cfs_rq) + 1);
7091 }
7092 #else
/*
 * Variant used without CONFIG_FAIR_GROUP_SCHED: only the runqueue's own
 * cfs_rq (rq->cfs) is brought up to date, under rq->lock with interrupts
 * disabled.
 */
static inline void update_blocked_averages(int cpu)
{
        struct rq *rq = cpu_rq(cpu);
        struct cfs_rq *cfs_rq = &rq->cfs;
        struct rq_flags rf;

        rq_lock_irqsave(rq, &rf);
        update_rq_clock(rq);
        update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
        rq_unlock_irqrestore(rq, &rf);
}
7104
/*
 * Variant used without CONFIG_FAIR_GROUP_SCHED: the hierarchical load of a
 * task is just the load average of its scheduling entity.
 */
static unsigned long task_h_load(struct task_struct *p)
{
        return p->se.avg.load_avg;
}
7109 #endif
7110
7111 /********** Helpers for find_busiest_group ************************/
7112
/*
 * Classification of a sched_group, as produced by group_classify().
 *
 * The numeric order matters: update_sd_pick_busiest() compares these values
 * directly and prefers the group with the higher one.
 */
enum group_type {
        group_other = 0,        /* neither of the conditions below */
        group_imbalanced,       /* sg_imbalanced() is set for the group */
        group_overloaded,       /* group_is_overloaded() reported no capacity */
};
7118
/*
 * sg_lb_stats - stats of a sched_group required for load_balancing
 *
 * Filled in from scratch by update_sg_lb_stats() for each group examined.
 */
struct sg_lb_stats {
        unsigned long avg_load; /* group_load scaled by SCHED_CAPACITY_SCALE / group_capacity */
        unsigned long group_load; /* Total load over the CPUs of the group */
        unsigned long sum_weighted_load; /* Sum of weighted_cpuload() over the group's CPUs */
        unsigned long load_per_task; /* sum_weighted_load / sum_nr_running, 0 if nothing runs */
        unsigned long group_capacity; /* Copy of the group's sgc->capacity */
        unsigned long group_util; /* Total utilization of the group */
        unsigned int sum_nr_running; /* Sum of rq->cfs.h_nr_running over the group's CPUs */
        unsigned int idle_cpus; /* CPUs with nothing running for which idle_cpu() holds */
        unsigned int group_weight; /* Copy of the group's group_weight */
        enum group_type group_type; /* Result of group_classify() */
        int group_no_capacity; /* Result of group_is_overloaded() */
#ifdef CONFIG_NUMA_BALANCING
        unsigned int nr_numa_running; /* Sum of rq->nr_numa_running */
        unsigned int nr_preferred_running; /* Sum of rq->nr_preferred_running */
#endif
};
7139
/*
 * sd_lb_stats - Structure to store the statistics of a sched_domain
 *               during load balancing.
 *
 * Reset by init_sd_lb_stats() before each use.
 */
struct sd_lb_stats {
        struct sched_group *busiest;    /* Busiest group in this sd */
        struct sched_group *local;      /* Local group in this sd */
        unsigned long total_running;    /* NOTE(review): presumably nr of running tasks summed over all groups; set outside this excerpt, confirm */
        unsigned long total_load;       /* Total load of all groups in sd */
        unsigned long total_capacity;   /* Total capacity of all groups in sd */
        unsigned long avg_load; /* Average load across all groups in sd */

        struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
        struct sg_lb_stats local_stat;  /* Statistics of the local group */
};
7155
/*
 * init_sd_lb_stats - reset @sds before a new load-balancing pass.
 */
static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
{
        /*
         * Only the members that are read before being assigned need an
         * explicit initial value: local_stat is fully rewritten by
         * update_sg_lb_stats(), while busiest_stat::avg_load is read by
         * update_sd_pick_busiest() before anything is stored in it.
         *
         * Note that, as far as the C language is concerned, members not
         * named in the compound literal below (local_stat and sds->avg_load
         * included) are zero-initialized by this assignment as well.
         */
        *sds = (struct sd_lb_stats){
                .busiest = NULL,
                .local = NULL,
                .total_running = 0UL,
                .total_load = 0UL,
                .total_capacity = 0UL,
                .busiest_stat = {
                        .avg_load = 0UL,
                        .sum_nr_running = 0,
                        .group_type = group_other,
                },
        };
}
7177
7178 /**
7179  * get_sd_load_idx - Obtain the load index for a given sched domain.
7180  * @sd: The sched_domain whose load_idx is to be obtained.
7181  * @idle: The idle status of the CPU for whose sd load_idx is obtained.
7182  *
7183  * Return: The load index.
7184  */
7185 static inline int get_sd_load_idx(struct sched_domain *sd,
7186                                         enum cpu_idle_type idle)
7187 {
7188         int load_idx;
7189
7190         switch (idle) {
7191         case CPU_NOT_IDLE:
7192                 load_idx = sd->busy_idx;
7193                 break;
7194
7195         case CPU_NEWLY_IDLE:
7196                 load_idx = sd->newidle_idx;
7197                 break;
7198         default:
7199                 load_idx = sd->idle_idx;
7200                 break;
7201         }
7202
7203         return load_idx;
7204 }
7205
7206 static unsigned long scale_rt_capacity(int cpu)
7207 {
7208         struct rq *rq = cpu_rq(cpu);
7209         u64 total, used, age_stamp, avg;
7210         s64 delta;
7211
7212         /*
7213          * Since we're reading these variables without serialization make sure
7214          * we read them once before doing sanity checks on them.
7215          */
7216         age_stamp = READ_ONCE(rq->age_stamp);
7217         avg = READ_ONCE(rq->rt_avg);
7218         delta = __rq_clock_broken(rq) - age_stamp;
7219
7220         if (unlikely(delta < 0))
7221                 delta = 0;
7222
7223         total = sched_avg_period() + delta;
7224
7225         used = div_u64(avg, total);
7226
7227         if (likely(used < SCHED_CAPACITY_SCALE))
7228                 return SCHED_CAPACITY_SCALE - used;
7229
7230         return 1;
7231 }
7232
7233 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
7234 {
7235         unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
7236         struct sched_group *sdg = sd->groups;
7237
7238         cpu_rq(cpu)->cpu_capacity_orig = capacity;
7239
7240         capacity *= scale_rt_capacity(cpu);
7241         capacity >>= SCHED_CAPACITY_SHIFT;
7242
7243         if (!capacity)
7244                 capacity = 1;
7245
7246         cpu_rq(cpu)->cpu_capacity = capacity;
7247         sdg->sgc->capacity = capacity;
7248         sdg->sgc->min_capacity = capacity;
7249 }
7250
7251 void update_group_capacity(struct sched_domain *sd, int cpu)
7252 {
7253         struct sched_domain *child = sd->child;
7254         struct sched_group *group, *sdg = sd->groups;
7255         unsigned long capacity, min_capacity;
7256         unsigned long interval;
7257
7258         interval = msecs_to_jiffies(sd->balance_interval);
7259         interval = clamp(interval, 1UL, max_load_balance_interval);
7260         sdg->sgc->next_update = jiffies + interval;
7261
7262         if (!child) {
7263                 update_cpu_capacity(sd, cpu);
7264                 return;
7265         }
7266
7267         capacity = 0;
7268         min_capacity = ULONG_MAX;
7269
7270         if (child->flags & SD_OVERLAP) {
7271                 /*
7272                  * SD_OVERLAP domains cannot assume that child groups
7273                  * span the current group.
7274                  */
7275
7276                 for_each_cpu(cpu, sched_group_span(sdg)) {
7277                         struct sched_group_capacity *sgc;
7278                         struct rq *rq = cpu_rq(cpu);
7279
7280                         /*
7281                          * build_sched_domains() -> init_sched_groups_capacity()
7282                          * gets here before we've attached the domains to the
7283                          * runqueues.
7284                          *
7285                          * Use capacity_of(), which is set irrespective of domains
7286                          * in update_cpu_capacity().
7287                          *
7288                          * This avoids capacity from being 0 and
7289                          * causing divide-by-zero issues on boot.
7290                          */
7291                         if (unlikely(!rq->sd)) {
7292                                 capacity += capacity_of(cpu);
7293                         } else {
7294                                 sgc = rq->sd->groups->sgc;
7295                                 capacity += sgc->capacity;
7296                         }
7297
7298                         min_capacity = min(capacity, min_capacity);
7299                 }
7300         } else  {
7301                 /*
7302                  * !SD_OVERLAP domains can assume that child groups
7303                  * span the current group.
7304                  */
7305
7306                 group = child->groups;
7307                 do {
7308                         struct sched_group_capacity *sgc = group->sgc;
7309
7310                         capacity += sgc->capacity;
7311                         min_capacity = min(sgc->min_capacity, min_capacity);
7312                         group = group->next;
7313                 } while (group != child->groups);
7314         }
7315
7316         sdg->sgc->capacity = capacity;
7317         sdg->sgc->min_capacity = min_capacity;
7318 }
7319
7320 /*
7321  * Check whether the capacity of the rq has been noticeably reduced by side
7322  * activity. The imbalance_pct is used for the threshold.
7323  * Return true is the capacity is reduced
7324  */
7325 static inline int
7326 check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
7327 {
7328         return ((rq->cpu_capacity * sd->imbalance_pct) <
7329                                 (rq->cpu_capacity_orig * 100));
7330 }
7331
7332 /*
7333  * Group imbalance indicates (and tries to solve) the problem where balancing
7334  * groups is inadequate due to ->cpus_allowed constraints.
7335  *
7336  * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
7337  * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
7338  * Something like:
7339  *
7340  *      { 0 1 2 3 } { 4 5 6 7 }
7341  *              *     * * *
7342  *
7343  * If we were to balance group-wise we'd place two tasks in the first group and
7344  * two tasks in the second group. Clearly this is undesired as it will overload
7345  * cpu 3 and leave one of the cpus in the second group unused.
7346  *
7347  * The current solution to this issue is detecting the skew in the first group
7348  * by noticing the lower domain failed to reach balance and had difficulty
7349  * moving tasks due to affinity constraints.
7350  *
7351  * When this is so detected; this group becomes a candidate for busiest; see
7352  * update_sd_pick_busiest(). And calculate_imbalance() and
7353  * find_busiest_group() avoid some of the usual balance conditions to allow it
7354  * to create an effective group imbalance.
7355  *
7356  * This is a somewhat tricky proposition since the next run might not find the
7357  * group imbalance and decide the groups need to be balanced again. A most
7358  * subtle and fragile situation.
7359  */
7360
/*
 * Returns the imbalance flag stored in the group's shared capacity
 * structure (group->sgc->imbalance). group_classify() reports
 * group_imbalanced for a group where this is non-zero, unless the group is
 * already classified as overloaded.
 */
static inline int sg_imbalanced(struct sched_group *group)
{
        return group->sgc->imbalance;
}
7365
7366 /*
7367  * group_has_capacity returns true if the group has spare capacity that could
7368  * be used by some tasks.
7369  * We consider that a group has spare capacity if the  * number of task is
7370  * smaller than the number of CPUs or if the utilization is lower than the
7371  * available capacity for CFS tasks.
7372  * For the latter, we use a threshold to stabilize the state, to take into
7373  * account the variance of the tasks' load and to return true if the available
7374  * capacity in meaningful for the load balancer.
7375  * As an example, an available capacity of 1% can appear but it doesn't make
7376  * any benefit for the load balance.
7377  */
7378 static inline bool
7379 group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
7380 {
7381         if (sgs->sum_nr_running < sgs->group_weight)
7382                 return true;
7383
7384         if ((sgs->group_capacity * 100) >
7385                         (sgs->group_util * env->sd->imbalance_pct))
7386                 return true;
7387
7388         return false;
7389 }
7390
7391 /*
7392  *  group_is_overloaded returns true if the group has more tasks than it can
7393  *  handle.
7394  *  group_is_overloaded is not equals to !group_has_capacity because a group
7395  *  with the exact right number of tasks, has no more spare capacity but is not
7396  *  overloaded so both group_has_capacity and group_is_overloaded return
7397  *  false.
7398  */
7399 static inline bool
7400 group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
7401 {
7402         if (sgs->sum_nr_running <= sgs->group_weight)
7403                 return false;
7404
7405         if ((sgs->group_capacity * 100) <
7406                         (sgs->group_util * env->sd->imbalance_pct))
7407                 return true;
7408
7409         return false;
7410 }
7411
7412 /*
7413  * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
7414  * per-CPU capacity than sched_group ref.
7415  */
7416 static inline bool
7417 group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
7418 {
7419         return sg->sgc->min_capacity * capacity_margin <
7420                                                 ref->sgc->min_capacity * 1024;
7421 }
7422
7423 static inline enum
7424 group_type group_classify(struct sched_group *group,
7425                           struct sg_lb_stats *sgs)
7426 {
7427         if (sgs->group_no_capacity)
7428                 return group_overloaded;
7429
7430         if (sg_imbalanced(group))
7431                 return group_imbalanced;
7432
7433         return group_other;
7434 }
7435
7436 /**
7437  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
7438  * @env: The load balancing environment.
7439  * @group: sched_group whose statistics are to be updated.
7440  * @load_idx: Load index of sched_domain of this_cpu for load calc.
7441  * @local_group: Does group contain this_cpu.
7442  * @sgs: variable to hold the statistics for this group.
7443  * @overload: Indicate more than one runnable task for any CPU.
7444  */
7445 static inline void update_sg_lb_stats(struct lb_env *env,
7446                         struct sched_group *group, int load_idx,
7447                         int local_group, struct sg_lb_stats *sgs,
7448                         bool *overload)
7449 {
7450         unsigned long load;
7451         int i, nr_running;
7452
7453         memset(sgs, 0, sizeof(*sgs));
7454
7455         for_each_cpu_and(i, sched_group_span(group), env->cpus) {
7456                 struct rq *rq = cpu_rq(i);
7457
7458                 /* Bias balancing toward cpus of our domain */
7459                 if (local_group)
7460                         load = target_load(i, load_idx);
7461                 else
7462                         load = source_load(i, load_idx);
7463
7464                 sgs->group_load += load;
7465                 sgs->group_util += cpu_util(i);
7466                 sgs->sum_nr_running += rq->cfs.h_nr_running;
7467
7468                 nr_running = rq->nr_running;
7469                 if (nr_running > 1)
7470                         *overload = true;
7471
7472 #ifdef CONFIG_NUMA_BALANCING
7473                 sgs->nr_numa_running += rq->nr_numa_running;
7474                 sgs->nr_preferred_running += rq->nr_preferred_running;
7475 #endif
7476                 sgs->sum_weighted_load += weighted_cpuload(rq);
7477                 /*
7478                  * No need to call idle_cpu() if nr_running is not 0
7479                  */
7480                 if (!nr_running && idle_cpu(i))
7481                         sgs->idle_cpus++;
7482         }
7483
7484         /* Adjust by relative CPU capacity of the group */
7485         sgs->group_capacity = group->sgc->capacity;
7486         sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
7487
7488         if (sgs->sum_nr_running)
7489                 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
7490
7491         sgs->group_weight = group->group_weight;
7492
7493         sgs->group_no_capacity = group_is_overloaded(env, sgs);
7494         sgs->group_type = group_classify(group, sgs);
7495 }
7496
/**
 * update_sd_pick_busiest - decide whether a group replaces the busiest one
 * @env: The load balancing environment.
 * @sds: sched_domain statistics
 * @sg: sched_group candidate to be checked for being the busiest
 * @sgs: sched_group statistics
 *
 * Determine if @sg is a busier group than the previously selected
 * busiest group.
 *
 * Return: %true if @sg is a busier group than the previously selected
 * busiest group. %false otherwise.
 */
static bool update_sd_pick_busiest(struct lb_env *env,
                                   struct sd_lb_stats *sds,
                                   struct sched_group *sg,
                                   struct sg_lb_stats *sgs)
{
        struct sg_lb_stats *busiest = &sds->busiest_stat;

        /* A higher group_type always wins, a lower one always loses */
        if (sgs->group_type > busiest->group_type)
                return true;

        if (sgs->group_type < busiest->group_type)
                return false;

        /* Same type: the candidate must carry strictly more average load */
        if (sgs->avg_load <= busiest->avg_load)
                return false;

        /* The capacity check below only applies to asymmetric-capacity domains */
        if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
                goto asym_packing;

        /*
         * Candidate sg has no more than one task per CPU and
         * has higher per-CPU capacity. Migrating tasks to less
         * capable CPUs may harm throughput. Maximize throughput,
         * power/energy consequences are not considered.
         */
        if (sgs->sum_nr_running <= sgs->group_weight &&
            group_smaller_cpu_capacity(sds->local, sg))
                return false;

asym_packing:
        /* This is the busiest node in its class. */
        if (!(env->sd->flags & SD_ASYM_PACKING))
                return true;

        /* No ASYM_PACKING if target cpu is already busy */
        if (env->idle == CPU_NOT_IDLE)
                return true;
        /*
         * ASYM_PACKING needs to move all the work to the highest
         * priority CPUs in the group, therefore mark all groups
         * of lower priority than ourself as busy.
         */
        if (sgs->sum_nr_running &&
            sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
                /* No busiest group chosen yet: take this one */
                if (!sds->busiest)
                        return true;

                /* Prefer to move from lowest priority cpu's work */
                if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
                                      sg->asym_prefer_cpu))
                        return true;
        }

        return false;
}
7565
7566 #ifdef CONFIG_NUMA_BALANCING
7567 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
7568 {
7569         if (sgs->sum_nr_running > sgs->nr_numa_running)
7570                 return regular;
7571         if (sgs->sum_nr_running > sgs->nr_preferred_running)
7572                 return remote;
7573         return all;
7574 }
7575
7576 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
7577 {
7578         if (rq->nr_running > rq->nr_numa_running)
7579                 return regular;
7580         if (rq->nr_running > rq->nr_preferred_running)
7581                 return remote;
7582         return all;
7583 }
7584 #else
/* Without CONFIG_NUMA_BALANCING every group is classified as "all". */
static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
{
        return all;
}
7589
/* Without CONFIG_NUMA_BALANCING every runqueue is classified as "regular". */
static inline enum fbq_type fbq_classify_rq(struct rq *rq)
{
        return regular;
}
7594 #endif /* CONFIG_NUMA_BALANCING */
7595
7596 /**
7597  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
7598  * @env: The load balancing environment.
7599  * @sds: variable to hold the statistics for this sched_domain.
7600  */
7601 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
7602 {
7603         struct sched_domain_shared *shared = env->sd->shared;
7604         struct sched_domain *child = env->sd->child;
7605         struct sched_group *sg = env->sd->groups;
7606         struct sg_lb_stats *local = &sds->local_stat;
7607         struct sg_lb_stats tmp_sgs;
7608         int load_idx, prefer_sibling = 0;
7609         bool overload = false;
7610
7611         if (child && child->flags & SD_PREFER_SIBLING)
7612                 prefer_sibling = 1;
7613
7614         load_idx = get_sd_load_idx(env->sd, env->idle);
7615
7616         do {
7617                 struct sg_lb_stats *sgs = &tmp_sgs;
7618                 int local_group;
7619
7620                 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
7621                 if (local_group) {
7622                         sds->local = sg;
7623                         sgs = local;
7624
7625                         if (env->idle != CPU_NEWLY_IDLE ||
7626                             time_after_eq(jiffies, sg->sgc->next_update))
7627                                 update_group_capacity(env->sd, env->dst_cpu);
7628                 }
7629
7630                 update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
7631                                                 &overload);
7632
7633                 if (local_group)
7634                         goto next_group;
7635
7636                 /*
7637                  * In case the child domain prefers tasks go to siblings
7638                  * first, lower the sg capacity so that we'll try
7639                  * and move all the excess tasks away. We lower the capacity
7640                  * of a group only if the local group has the capacity to fit
7641                  * these excess tasks. The extra check prevents the case where
7642                  * you always pull from the heaviest group when it is already
7643                  * under-utilized (possible with a large weight task outweighs
7644                  * the tasks on the system).
7645                  */
7646                 if (prefer_sibling && sds->local &&
7647                     group_has_capacity(env, local) &&
7648                     (sgs->sum_nr_running > local->sum_nr_running + 1)) {
7649                         sgs->group_no_capacity = 1;
7650                         sgs->group_type = group_classify(sg, sgs);
7651                 }
7652
7653                 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
7654                         sds->busiest = sg;
7655                         sds->busiest_stat = *sgs;
7656                 }
7657
7658 next_group:
7659                 /* Now, start updating sd_lb_stats */
7660                 sds->total_running += sgs->sum_nr_running;
7661                 sds->total_load += sgs->group_load;
7662                 sds->total_capacity += sgs->group_capacity;
7663
7664                 sg = sg->next;
7665         } while (sg != env->sd->groups);
7666
7667         if (env->sd->flags & SD_NUMA)
7668                 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
7669
7670         if (!env->sd->parent) {
7671                 /* update overload indicator if we are at root domain */
7672                 if (env->dst_rq->rd->overload != overload)
7673                         env->dst_rq->rd->overload = overload;
7674         }
7675
7676         if (!shared)
7677                 return;
7678
7679         /*
7680          * Since these are sums over groups they can contain some CPUs
7681          * multiple times for the NUMA domains.
7682          *
7683          * Currently only wake_affine_llc() and find_busiest_group()
7684          * uses these numbers, only the last is affected by this problem.
7685          *
7686          * XXX fix that.
7687          */
7688         WRITE_ONCE(shared->nr_running,  sds->total_running);
7689         WRITE_ONCE(shared->load,        sds->total_load);
7690         WRITE_ONCE(shared->capacity,    sds->total_capacity);
7691 }
7692
7693 /**
7694  * check_asym_packing - Check to see if the group is packed into the
7695  *                      sched domain.
7696  *
7697  * This is primarily intended to used at the sibling level.  Some
7698  * cores like POWER7 prefer to use lower numbered SMT threads.  In the
7699  * case of POWER7, it can move to lower SMT modes only when higher
7700  * threads are idle.  When in lower SMT modes, the threads will
7701  * perform better since they share less core resources.  Hence when we
7702  * have idle threads, we want them to be the higher ones.
7703  *
7704  * This packing function is run on idle threads.  It checks to see if
7705  * the busiest CPU in this domain (core in the P7 case) has a higher
7706  * CPU number than the packing function is being run on.  Here we are
7707  * assuming lower CPU number will be equivalent to lower a SMT thread
7708  * number.
7709  *
7710  * Return: 1 when packing is required and a task should be moved to
7711  * this CPU.  The amount of the imbalance is returned in *imbalance.
7712  *
7713  * @env: The load balancing environment.
7714  * @sds: Statistics of the sched_domain which is to be packed
7715  */
7716 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
7717 {
7718         int busiest_cpu;
7719
7720         if (!(env->sd->flags & SD_ASYM_PACKING))
7721                 return 0;
7722
7723         if (env->idle == CPU_NOT_IDLE)
7724                 return 0;
7725
7726         if (!sds->busiest)
7727                 return 0;
7728
7729         busiest_cpu = sds->busiest->asym_prefer_cpu;
7730         if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
7731                 return 0;
7732
7733         env->imbalance = DIV_ROUND_CLOSEST(
7734                 sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
7735                 SCHED_CAPACITY_SCALE);
7736
7737         return 1;
7738 }
7739
7740 /**
7741  * fix_small_imbalance - Calculate the minor imbalance that exists
7742  *                      amongst the groups of a sched_domain, during
7743  *                      load balancing.
7744  * @env: The load balancing environment.
7745  * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
7746  */
7747 static inline
7748 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
7749 {
7750         unsigned long tmp, capa_now = 0, capa_move = 0;
7751         unsigned int imbn = 2;
7752         unsigned long scaled_busy_load_per_task;
7753         struct sg_lb_stats *local, *busiest;
7754
7755         local = &sds->local_stat;
7756         busiest = &sds->busiest_stat;
7757
7758         if (!local->sum_nr_running)
7759                 local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
7760         else if (busiest->load_per_task > local->load_per_task)
7761                 imbn = 1;
7762
7763         scaled_busy_load_per_task =
7764                 (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
7765                 busiest->group_capacity;
7766
7767         if (busiest->avg_load + scaled_busy_load_per_task >=
7768             local->avg_load + (scaled_busy_load_per_task * imbn)) {
7769                 env->imbalance = busiest->load_per_task;
7770                 return;
7771         }
7772
7773         /*
7774          * OK, we don't have enough imbalance to justify moving tasks,
7775          * however we may be able to increase total CPU capacity used by
7776          * moving them.
7777          */
7778
7779         capa_now += busiest->group_capacity *
7780                         min(busiest->load_per_task, busiest->avg_load);
7781         capa_now += local->group_capacity *
7782                         min(local->load_per_task, local->avg_load);
7783         capa_now /= SCHED_CAPACITY_SCALE;
7784
7785         /* Amount of load we'd subtract */
7786         if (busiest->avg_load > scaled_busy_load_per_task) {
7787                 capa_move += busiest->group_capacity *
7788                             min(busiest->load_per_task,
7789                                 busiest->avg_load - scaled_busy_load_per_task);
7790         }
7791
7792         /* Amount of load we'd add */
7793         if (busiest->avg_load * busiest->group_capacity <
7794             busiest->load_per_task * SCHED_CAPACITY_SCALE) {
7795                 tmp = (busiest->avg_load * busiest->group_capacity) /
7796                       local->group_capacity;
7797         } else {
7798                 tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
7799                       local->group_capacity;
7800         }
7801         capa_move += local->group_capacity *
7802                     min(local->load_per_task, local->avg_load + tmp);
7803         capa_move /= SCHED_CAPACITY_SCALE;
7804
7805         /* Move if we gain throughput */
7806         if (capa_move > capa_now)
7807                 env->imbalance = busiest->load_per_task;
7808 }
7809
7810 /**
7811  * calculate_imbalance - Calculate the amount of imbalance present within the
7812  *                       groups of a given sched_domain during load balance.
7813  * @env: load balance environment
7814  * @sds: statistics of the sched_domain whose imbalance is to be calculated.
7815  */
7816 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
7817 {
7818         unsigned long max_pull, load_above_capacity = ~0UL;
7819         struct sg_lb_stats *local, *busiest;
7820
7821         local = &sds->local_stat;
7822         busiest = &sds->busiest_stat;
7823
7824         if (busiest->group_type == group_imbalanced) {
7825                 /*
7826                  * In the group_imb case we cannot rely on group-wide averages
7827                  * to ensure cpu-load equilibrium, look at wider averages. XXX
7828                  */
7829                 busiest->load_per_task =
7830                         min(busiest->load_per_task, sds->avg_load);
7831         }
7832
7833         /*
7834          * Avg load of busiest sg can be less and avg load of local sg can
7835          * be greater than avg load across all sgs of sd because avg load
7836          * factors in sg capacity and sgs with smaller group_type are
7837          * skipped when updating the busiest sg:
7838          */
7839         if (busiest->avg_load <= sds->avg_load ||
7840             local->avg_load >= sds->avg_load) {
7841                 env->imbalance = 0;
7842                 return fix_small_imbalance(env, sds);
7843         }
7844
7845         /*
7846          * If there aren't any idle cpus, avoid creating some.
7847          */
7848         if (busiest->group_type == group_overloaded &&
7849             local->group_type   == group_overloaded) {
7850                 load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
7851                 if (load_above_capacity > busiest->group_capacity) {
7852                         load_above_capacity -= busiest->group_capacity;
7853                         load_above_capacity *= scale_load_down(NICE_0_LOAD);
7854                         load_above_capacity /= busiest->group_capacity;
7855                 } else
7856                         load_above_capacity = ~0UL;
7857         }
7858
7859         /*
7860          * We're trying to get all the cpus to the average_load, so we don't
7861          * want to push ourselves above the average load, nor do we wish to
7862          * reduce the max loaded cpu below the average load. At the same time,
7863          * we also don't want to reduce the group load below the group
7864          * capacity. Thus we look for the minimum possible imbalance.
7865          */
7866         max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
7867
7868         /* How much load to actually move to equalise the imbalance */
7869         env->imbalance = min(
7870                 max_pull * busiest->group_capacity,
7871                 (sds->avg_load - local->avg_load) * local->group_capacity
7872         ) / SCHED_CAPACITY_SCALE;
7873
7874         /*
7875          * if *imbalance is less than the average load per runnable task
7876          * there is no guarantee that any tasks will be moved so we'll have
7877          * a think about bumping its value to force at least one task to be
7878          * moved
7879          */
7880         if (env->imbalance < busiest->load_per_task)
7881                 return fix_small_imbalance(env, sds);
7882 }
7883
7884 /******* find_busiest_group() helpers end here *********************/
7885
7886 /**
7887  * find_busiest_group - Returns the busiest group within the sched_domain
7888  * if there is an imbalance.
7889  *
7890  * Also calculates the amount of weighted load which should be moved
7891  * to restore balance.
7892  *
7893  * @env: The load balancing environment.
7894  *
7895  * Return:      - The busiest group if imbalance exists.
7896  */
7897 static struct sched_group *find_busiest_group(struct lb_env *env)
7898 {
7899         struct sg_lb_stats *local, *busiest;
7900         struct sd_lb_stats sds;
7901
7902         init_sd_lb_stats(&sds);
7903
7904         /*
7905          * Compute the various statistics relavent for load balancing at
7906          * this level.
7907          */
7908         update_sd_lb_stats(env, &sds);
7909         local = &sds.local_stat;
7910         busiest = &sds.busiest_stat;
7911
7912         /* ASYM feature bypasses nice load balance check */
7913         if (check_asym_packing(env, &sds))
7914                 return sds.busiest;
7915
7916         /* There is no busy sibling group to pull tasks from */
7917         if (!sds.busiest || busiest->sum_nr_running == 0)
7918                 goto out_balanced;
7919
7920         /* XXX broken for overlapping NUMA groups */
7921         sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
7922                                                 / sds.total_capacity;
7923
7924         /*
7925          * If the busiest group is imbalanced the below checks don't
7926          * work because they assume all things are equal, which typically
7927          * isn't true due to cpus_allowed constraints and the like.
7928          */
7929         if (busiest->group_type == group_imbalanced)
7930                 goto force_balance;
7931
7932         /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
7933         if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
7934             busiest->group_no_capacity)
7935                 goto force_balance;
7936
7937         /*
7938          * If the local group is busier than the selected busiest group
7939          * don't try and pull any tasks.
7940          */
7941         if (local->avg_load >= busiest->avg_load)
7942                 goto out_balanced;
7943
7944         /*
7945          * Don't pull any tasks if this group is already above the domain
7946          * average load.
7947          */
7948         if (local->avg_load >= sds.avg_load)
7949                 goto out_balanced;
7950
7951         if (env->idle == CPU_IDLE) {
7952                 /*
7953                  * This cpu is idle. If the busiest group is not overloaded
7954                  * and there is no imbalance between this and busiest group
7955                  * wrt idle cpus, it is balanced. The imbalance becomes
7956                  * significant if the diff is greater than 1 otherwise we
7957                  * might end up to just move the imbalance on another group
7958                  */
7959                 if ((busiest->group_type != group_overloaded) &&
7960                                 (local->idle_cpus <= (busiest->idle_cpus + 1)))
7961                         goto out_balanced;
7962         } else {
7963                 /*
7964                  * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
7965                  * imbalance_pct to be conservative.
7966                  */
7967                 if (100 * busiest->avg_load <=
7968                                 env->sd->imbalance_pct * local->avg_load)
7969                         goto out_balanced;
7970         }
7971
7972 force_balance:
7973         /* Looks like there is an imbalance. Compute it */
7974         calculate_imbalance(env, &sds);
7975         return sds.busiest;
7976
7977 out_balanced:
7978         env->imbalance = 0;
7979         return NULL;
7980 }
7981
7982 /*
7983  * find_busiest_queue - find the busiest runqueue among the cpus in group.
7984  */
7985 static struct rq *find_busiest_queue(struct lb_env *env,
7986                                      struct sched_group *group)
7987 {
7988         struct rq *busiest = NULL, *rq;
7989         unsigned long busiest_load = 0, busiest_capacity = 1;
7990         int i;
7991
7992         for_each_cpu_and(i, sched_group_span(group), env->cpus) {
7993                 unsigned long capacity, wl;
7994                 enum fbq_type rt;
7995
7996                 rq = cpu_rq(i);
7997                 rt = fbq_classify_rq(rq);
7998
7999                 /*
8000                  * We classify groups/runqueues into three groups:
8001                  *  - regular: there are !numa tasks
8002                  *  - remote:  there are numa tasks that run on the 'wrong' node
8003                  *  - all:     there is no distinction
8004                  *
8005                  * In order to avoid migrating ideally placed numa tasks,
8006                  * ignore those when there's better options.
8007                  *
8008                  * If we ignore the actual busiest queue to migrate another
8009                  * task, the next balance pass can still reduce the busiest
8010                  * queue by moving tasks around inside the node.
8011                  *
8012                  * If we cannot move enough load due to this classification
8013                  * the next pass will adjust the group classification and
8014                  * allow migration of more tasks.
8015                  *
8016                  * Both cases only affect the total convergence complexity.
8017                  */
8018                 if (rt > env->fbq_type)
8019                         continue;
8020
8021                 capacity = capacity_of(i);
8022
8023                 wl = weighted_cpuload(rq);
8024
8025                 /*
8026                  * When comparing with imbalance, use weighted_cpuload()
8027                  * which is not scaled with the cpu capacity.
8028                  */
8029
8030                 if (rq->nr_running == 1 && wl > env->imbalance &&
8031                     !check_cpu_capacity(rq, env->sd))
8032                         continue;
8033
8034                 /*
8035                  * For the load comparisons with the other cpu's, consider
8036                  * the weighted_cpuload() scaled with the cpu capacity, so
8037                  * that the load can be moved away from the cpu that is
8038                  * potentially running at a lower capacity.
8039                  *
8040                  * Thus we're looking for max(wl_i / capacity_i), crosswise
8041                  * multiplication to rid ourselves of the division works out
8042                  * to: wl_i * capacity_j > wl_j * capacity_i;  where j is
8043                  * our previous maximum.
8044                  */
8045                 if (wl * busiest_capacity > busiest_load * capacity) {
8046                         busiest_load = wl;
8047                         busiest_capacity = capacity;
8048                         busiest = rq;
8049                 }
8050         }
8051
8052         return busiest;
8053 }
8054
8055 /*
8056  * Max backoff if we encounter pinned tasks. The value is fairly arbitrary;
8057  * it only needs to be large enough.
8058  */
8059 #define MAX_PINNED_INTERVAL     512
8060
8061 static int need_active_balance(struct lb_env *env)
8062 {
8063         struct sched_domain *sd = env->sd;
8064
8065         if (env->idle == CPU_NEWLY_IDLE) {
8066
8067                 /*
8068                  * ASYM_PACKING needs to force migrate tasks from busy but
8069                  * lower priority CPUs in order to pack all tasks in the
8070                  * highest priority CPUs.
8071                  */
8072                 if ((sd->flags & SD_ASYM_PACKING) &&
8073                     sched_asym_prefer(env->dst_cpu, env->src_cpu))
8074                         return 1;
8075         }
8076
8077         /*
8078          * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
8079          * It's worth migrating the task if the src_cpu's capacity is reduced
8080          * because of other sched_class or IRQs if more capacity stays
8081          * available on dst_cpu.
8082          */
8083         if ((env->idle != CPU_NOT_IDLE) &&
8084             (env->src_rq->cfs.h_nr_running == 1)) {
8085                 if ((check_cpu_capacity(env->src_rq, sd)) &&
8086                     (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
8087                         return 1;
8088         }
8089
8090         return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
8091 }
8092
8093 static int active_load_balance_cpu_stop(void *data);
8094
8095 static int should_we_balance(struct lb_env *env)
8096 {
8097         struct sched_group *sg = env->sd->groups;
8098         int cpu, balance_cpu = -1;
8099
8100         /*
8101          * In the newly idle case, we will allow all the cpu's
8102          * to do the newly idle load balance.
8103          */
8104         if (env->idle == CPU_NEWLY_IDLE)
8105                 return 1;
8106
8107         /* Try to find first idle cpu */
8108         for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
8109                 if (!idle_cpu(cpu))
8110                         continue;
8111
8112                 balance_cpu = cpu;
8113                 break;
8114         }
8115
8116         if (balance_cpu == -1)
8117                 balance_cpu = group_balance_cpu(sg);
8118
8119         /*
8120          * First idle cpu or the first cpu(busiest) in this sched group
8121          * is eligible for doing load balancing at this and above domains.
8122          */
8123         return balance_cpu == env->dst_cpu;
8124 }
8125
8126 /*
8127  * Check this_cpu to ensure it is balanced within domain. Attempt to move
8128  * tasks if there is an imbalance.
8129  */
8130 static int load_balance(int this_cpu, struct rq *this_rq,
8131                         struct sched_domain *sd, enum cpu_idle_type idle,
8132                         int *continue_balancing)
8133 {
8134         int ld_moved, cur_ld_moved, active_balance = 0;
8135         struct sched_domain *sd_parent = sd->parent;
8136         struct sched_group *group;
8137         struct rq *busiest;
8138         struct rq_flags rf;
8139         struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
8140
8141         struct lb_env env = {
8142                 .sd             = sd,
8143                 .dst_cpu        = this_cpu,
8144                 .dst_rq         = this_rq,
8145                 .dst_grpmask    = sched_group_span(sd->groups),
8146                 .idle           = idle,
8147                 .loop_break     = sched_nr_migrate_break,
8148                 .cpus           = cpus,
8149                 .fbq_type       = all,
8150                 .tasks          = LIST_HEAD_INIT(env.tasks),
8151         };
8152
8153         cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
8154
8155         schedstat_inc(sd->lb_count[idle]);
8156
8157 redo:
8158         if (!should_we_balance(&env)) {
8159                 *continue_balancing = 0;
8160                 goto out_balanced;
8161         }
8162
8163         group = find_busiest_group(&env);
8164         if (!group) {
8165                 schedstat_inc(sd->lb_nobusyg[idle]);
8166                 goto out_balanced;
8167         }
8168
8169         busiest = find_busiest_queue(&env, group);
8170         if (!busiest) {
8171                 schedstat_inc(sd->lb_nobusyq[idle]);
8172                 goto out_balanced;
8173         }
8174
8175         BUG_ON(busiest == env.dst_rq);
8176
8177         schedstat_add(sd->lb_imbalance[idle], env.imbalance);
8178
8179         env.src_cpu = busiest->cpu;
8180         env.src_rq = busiest;
8181
8182         ld_moved = 0;
8183         if (busiest->nr_running > 1) {
8184                 /*
8185                  * Attempt to move tasks. If find_busiest_group has found
8186                  * an imbalance but busiest->nr_running <= 1, the group is
8187                  * still unbalanced. ld_moved simply stays zero, so it is
8188                  * correctly treated as an imbalance.
8189                  */
8190                 env.flags |= LBF_ALL_PINNED;
8191                 env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
8192
8193 more_balance:
8194                 rq_lock_irqsave(busiest, &rf);
8195                 update_rq_clock(busiest);
8196
8197                 /*
8198                  * cur_ld_moved - load moved in current iteration
8199                  * ld_moved     - cumulative load moved across iterations
8200                  */
8201                 cur_ld_moved = detach_tasks(&env);
8202
8203                 /*
8204                  * We've detached some tasks from busiest_rq. Every
8205                  * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
8206                  * unlock busiest->lock, and we are able to be sure
8207                  * that nobody can manipulate the tasks in parallel.
8208                  * See task_rq_lock() family for the details.
8209                  */
8210
8211                 rq_unlock(busiest, &rf);
8212
8213                 if (cur_ld_moved) {
8214                         attach_tasks(&env);
8215                         ld_moved += cur_ld_moved;
8216                 }
8217
8218                 local_irq_restore(rf.flags);
8219
8220                 if (env.flags & LBF_NEED_BREAK) {
8221                         env.flags &= ~LBF_NEED_BREAK;
8222                         goto more_balance;
8223                 }
8224
8225                 /*
8226                  * Revisit (affine) tasks on src_cpu that couldn't be moved to
8227                  * us and move them to an alternate dst_cpu in our sched_group
8228                  * where they can run. The upper limit on how many times we
8229                  * iterate on same src_cpu is dependent on number of cpus in our
8230                  * sched_group.
8231                  *
8232                  * This changes load balance semantics a bit on who can move
8233                  * load to a given_cpu. In addition to the given_cpu itself
8234                  * (or a ilb_cpu acting on its behalf where given_cpu is
8235                  * nohz-idle), we now have balance_cpu in a position to move
8236                  * load to given_cpu. In rare situations, this may cause
8237                  * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
8238                  * _independently_ and at _same_ time to move some load to
8239                  * given_cpu) causing excess load to be moved to given_cpu.
8240                  * This however should not happen so much in practice and
8241                  * moreover subsequent load balance cycles should correct the
8242                  * excess load moved.
8243                  */
8244                 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
8245
8246                         /* Prevent to re-select dst_cpu via env's cpus */
8247                         cpumask_clear_cpu(env.dst_cpu, env.cpus);
8248
8249                         env.dst_rq       = cpu_rq(env.new_dst_cpu);
8250                         env.dst_cpu      = env.new_dst_cpu;
8251                         env.flags       &= ~LBF_DST_PINNED;
8252                         env.loop         = 0;
8253                         env.loop_break   = sched_nr_migrate_break;
8254
8255                         /*
8256                          * Go back to "more_balance" rather than "redo" since we
8257                          * need to continue with same src_cpu.
8258                          */
8259                         goto more_balance;
8260                 }
8261
8262                 /*
8263                  * We failed to reach balance because of affinity.
8264                  */
8265                 if (sd_parent) {
8266                         int *group_imbalance = &sd_parent->groups->sgc->imbalance;
8267
8268                         if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
8269                                 *group_imbalance = 1;
8270                 }
8271
8272                 /* All tasks on this runqueue were pinned by CPU affinity */
8273                 if (unlikely(env.flags & LBF_ALL_PINNED)) {
8274                         cpumask_clear_cpu(cpu_of(busiest), cpus);
8275                         /*
8276                          * Attempting to continue load balancing at the current
8277                          * sched_domain level only makes sense if there are
8278                          * active CPUs remaining as possible busiest CPUs to
8279                          * pull load from which are not contained within the
8280                          * destination group that is receiving any migrated
8281                          * load.
8282                          */
8283                         if (!cpumask_subset(cpus, env.dst_grpmask)) {
8284                                 env.loop = 0;
8285                                 env.loop_break = sched_nr_migrate_break;
8286                                 goto redo;
8287                         }
8288                         goto out_all_pinned;
8289                 }
8290         }
8291
8292         if (!ld_moved) {
8293                 schedstat_inc(sd->lb_failed[idle]);
8294                 /*
8295                  * Increment the failure counter only on periodic balance.
8296                  * We do not want newidle balance, which can be very
8297                  * frequent, pollute the failure counter causing
8298                  * excessive cache_hot migrations and active balances.
8299                  */
8300                 if (idle != CPU_NEWLY_IDLE)
8301                         sd->nr_balance_failed++;
8302
8303                 if (need_active_balance(&env)) {
8304                         unsigned long flags;
8305
8306                         raw_spin_lock_irqsave(&busiest->lock, flags);
8307
8308                         /* don't kick the active_load_balance_cpu_stop,
8309                          * if the curr task on busiest cpu can't be
8310                          * moved to this_cpu
8311                          */
8312                         if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
8313                                 raw_spin_unlock_irqrestore(&busiest->lock,
8314                                                             flags);
8315                                 env.flags |= LBF_ALL_PINNED;
8316                                 goto out_one_pinned;
8317                         }
8318
8319                         /*
8320                          * ->active_balance synchronizes accesses to
8321                          * ->active_balance_work.  Once set, it's cleared
8322                          * only after active load balance is finished.
8323                          */
8324                         if (!busiest->active_balance) {
8325                                 busiest->active_balance = 1;
8326                                 busiest->push_cpu = this_cpu;
8327                                 active_balance = 1;
8328                         }
8329                         raw_spin_unlock_irqrestore(&busiest->lock, flags);
8330
8331                         if (active_balance) {
8332                                 stop_one_cpu_nowait(cpu_of(busiest),
8333                                         active_load_balance_cpu_stop, busiest,
8334                                         &busiest->active_balance_work);
8335                         }
8336
8337                         /* We've kicked active balancing, force task migration. */
8338                         sd->nr_balance_failed = sd->cache_nice_tries+1;
8339                 }
8340         } else
8341                 sd->nr_balance_failed = 0;
8342
8343         if (likely(!active_balance)) {
8344                 /* We were unbalanced, so reset the balancing interval */
8345                 sd->balance_interval = sd->min_interval;
8346         } else {
8347                 /*
8348                  * If we've begun active balancing, start to back off. This
8349                  * case may not be covered by the all_pinned logic if there
8350                  * is only 1 task on the busy runqueue (because we don't call
8351                  * detach_tasks).
8352                  */
8353                 if (sd->balance_interval < sd->max_interval)
8354                         sd->balance_interval *= 2;
8355         }
8356
8357         goto out;
8358
8359 out_balanced:
8360         /*
8361          * We reach balance although we may have faced some affinity
8362          * constraints. Clear the imbalance flag if it was set.
8363          */
8364         if (sd_parent) {
8365                 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
8366
8367                 if (*group_imbalance)
8368                         *group_imbalance = 0;
8369         }
8370
8371 out_all_pinned:
8372         /*
8373          * We reach balance because all tasks are pinned at this level so
8374          * we can't migrate them. Let the imbalance flag set so parent level
8375          * can try to migrate them.
8376          */
8377         schedstat_inc(sd->lb_balanced[idle]);
8378
8379         sd->nr_balance_failed = 0;
8380
8381 out_one_pinned:
8382         /* tune up the balancing interval */
8383         if (((env.flags & LBF_ALL_PINNED) &&
8384                         sd->balance_interval < MAX_PINNED_INTERVAL) ||
8385                         (sd->balance_interval < sd->max_interval))
8386                 sd->balance_interval *= 2;
8387
8388         ld_moved = 0;
8389 out:
8390         return ld_moved;
8391 }
8392
8393 static inline unsigned long
8394 get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
8395 {
8396         unsigned long interval = sd->balance_interval;
8397
8398         if (cpu_busy)
8399                 interval *= sd->busy_factor;
8400
8401         /* scale ms to jiffies */
8402         interval = msecs_to_jiffies(interval);
8403         interval = clamp(interval, 1UL, max_load_balance_interval);
8404
8405         return interval;
8406 }
8407
8408 static inline void
8409 update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
8410 {
8411         unsigned long interval, next;
8412
8413         /* used by idle balance, so cpu_busy = 0 */
8414         interval = get_sd_balance_interval(sd, 0);
8415         next = sd->last_balance + interval;
8416
8417         if (time_after(*next_balance, next))
8418                 *next_balance = next;
8419 }
8420
8421 /*
8422  * idle_balance is called by schedule() if this_cpu is about to become
8423  * idle. Attempts to pull tasks from other CPUs.
8424  */
8425 static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
8426 {
8427         unsigned long next_balance = jiffies + HZ;
8428         int this_cpu = this_rq->cpu;
8429         struct sched_domain *sd;
8430         int pulled_task = 0;
8431         u64 curr_cost = 0;
8432
8433         /*
8434          * We must set idle_stamp _before_ calling idle_balance(), such that we
8435          * measure the duration of idle_balance() as idle time.
8436          */
8437         this_rq->idle_stamp = rq_clock(this_rq);
8438
8439         /*
8440          * This is OK, because current is on_cpu, which avoids it being picked
8441          * for load-balance and preemption/IRQs are still disabled avoiding
8442          * further scheduler activity on it and we're being very careful to
8443          * re-start the picking loop.
8444          */
8445         rq_unpin_lock(this_rq, rf);
8446
8447         if (this_rq->avg_idle < sysctl_sched_migration_cost ||
8448             !this_rq->rd->overload) {
8449                 rcu_read_lock();
8450                 sd = rcu_dereference_check_sched_domain(this_rq->sd);
8451                 if (sd)
8452                         update_next_balance(sd, &next_balance);
8453                 rcu_read_unlock();
8454
8455                 goto out;
8456         }
8457
8458         raw_spin_unlock(&this_rq->lock);
8459
8460         update_blocked_averages(this_cpu);
8461         rcu_read_lock();
8462         for_each_domain(this_cpu, sd) {
8463                 int continue_balancing = 1;
8464                 u64 t0, domain_cost;
8465
8466                 if (!(sd->flags & SD_LOAD_BALANCE))
8467                         continue;
8468
8469                 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
8470                         update_next_balance(sd, &next_balance);
8471                         break;
8472                 }
8473
8474                 if (sd->flags & SD_BALANCE_NEWIDLE) {
8475                         t0 = sched_clock_cpu(this_cpu);
8476
8477                         pulled_task = load_balance(this_cpu, this_rq,
8478                                                    sd, CPU_NEWLY_IDLE,
8479                                                    &continue_balancing);
8480
8481                         domain_cost = sched_clock_cpu(this_cpu) - t0;
8482                         if (domain_cost > sd->max_newidle_lb_cost)
8483                                 sd->max_newidle_lb_cost = domain_cost;
8484
8485                         curr_cost += domain_cost;
8486                 }
8487
8488                 update_next_balance(sd, &next_balance);
8489
8490                 /*
8491                  * Stop searching for tasks to pull if there are
8492                  * now runnable tasks on this rq.
8493                  */
8494                 if (pulled_task || this_rq->nr_running > 0)
8495                         break;
8496         }
8497         rcu_read_unlock();
8498
8499         raw_spin_lock(&this_rq->lock);
8500
8501         if (curr_cost > this_rq->max_idle_balance_cost)
8502                 this_rq->max_idle_balance_cost = curr_cost;
8503
8504         /*
8505          * While browsing the domains, we released the rq lock, a task could
8506          * have been enqueued in the meantime. Since we're not going idle,
8507          * pretend we pulled a task.
8508          */
8509         if (this_rq->cfs.h_nr_running && !pulled_task)
8510                 pulled_task = 1;
8511
8512 out:
8513         /* Move the next balance forward */
8514         if (time_after(this_rq->next_balance, next_balance))
8515                 this_rq->next_balance = next_balance;
8516
8517         /* Is there a task of a high priority class? */
8518         if (this_rq->nr_running != this_rq->cfs.h_nr_running)
8519                 pulled_task = -1;
8520
8521         if (pulled_task)
8522                 this_rq->idle_stamp = 0;
8523
8524         rq_repin_lock(this_rq, rf);
8525
8526         return pulled_task;
8527 }
8528
8529 /*
8530  * active_load_balance_cpu_stop is run by cpu stopper. It pushes
8531  * running tasks off the busiest CPU onto idle CPUs. It requires at
8532  * least 1 task to be running on each physical CPU where possible, and
8533  * avoids physical / logical imbalances.
8534  */
8535 static int active_load_balance_cpu_stop(void *data)
8536 {
8537         struct rq *busiest_rq = data;
8538         int busiest_cpu = cpu_of(busiest_rq);
8539         int target_cpu = busiest_rq->push_cpu;
8540         struct rq *target_rq = cpu_rq(target_cpu);
8541         struct sched_domain *sd;
8542         struct task_struct *p = NULL;
8543         struct rq_flags rf;
8544
8545         rq_lock_irq(busiest_rq, &rf);
8546
8547         /* make sure the requested cpu hasn't gone down in the meantime */
8548         if (unlikely(busiest_cpu != smp_processor_id() ||
8549                      !busiest_rq->active_balance))
8550                 goto out_unlock;
8551
8552         /* Is there any task to move? */
8553         if (busiest_rq->nr_running <= 1)
8554                 goto out_unlock;
8555
8556         /*
8557          * This condition is "impossible", if it occurs
8558          * we need to fix it. Originally reported by
8559          * Bjorn Helgaas on a 128-cpu setup.
8560          */
8561         BUG_ON(busiest_rq == target_rq);
8562
8563         /* Search for an sd spanning us and the target CPU. */
8564         rcu_read_lock();
8565         for_each_domain(target_cpu, sd) {
8566                 if ((sd->flags & SD_LOAD_BALANCE) &&
8567                     cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
8568                                 break;
8569         }
8570
8571         if (likely(sd)) {
8572                 struct lb_env env = {
8573                         .sd             = sd,
8574                         .dst_cpu        = target_cpu,
8575                         .dst_rq         = target_rq,
8576                         .src_cpu        = busiest_rq->cpu,
8577                         .src_rq         = busiest_rq,
8578                         .idle           = CPU_IDLE,
8579                         /*
8580                          * can_migrate_task() doesn't need to compute new_dst_cpu
8581                          * for active balancing. Since we have CPU_IDLE, but no
8582                          * @dst_grpmask we need to make that test go away with lying
8583                          * about DST_PINNED.
8584                          */
8585                         .flags          = LBF_DST_PINNED,
8586                 };
8587
8588                 schedstat_inc(sd->alb_count);
8589                 update_rq_clock(busiest_rq);
8590
8591                 p = detach_one_task(&env);
8592                 if (p) {
8593                         schedstat_inc(sd->alb_pushed);
8594                         /* Active balancing done, reset the failure counter. */
8595                         sd->nr_balance_failed = 0;
8596                 } else {
8597                         schedstat_inc(sd->alb_failed);
8598                 }
8599         }
8600         rcu_read_unlock();
8601 out_unlock:
8602         busiest_rq->active_balance = 0;
8603         rq_unlock(busiest_rq, &rf);
8604
8605         if (p)
8606                 attach_one_task(target_rq, p);
8607
8608         local_irq_enable();
8609
8610         return 0;
8611 }
8612
8613 static inline int on_null_domain(struct rq *rq)
8614 {
8615         return unlikely(!rcu_dereference_sched(rq->sd));
8616 }
8617
8618 #ifdef CONFIG_NO_HZ_COMMON
8619 /*
8620  * idle load balancing details
8621  * - When one of the busy CPUs notice that there may be an idle rebalancing
8622  *   needed, they will kick the idle load balancer, which then does idle
8623  *   load balancing for all the idle CPUs.
8624  */
8625 static struct {
8626         cpumask_var_t idle_cpus_mask;
8627         atomic_t nr_cpus;
8628         unsigned long next_balance;     /* in jiffy units */
8629 } nohz ____cacheline_aligned;
8630
8631 static inline int find_new_ilb(void)
8632 {
8633         int ilb = cpumask_first(nohz.idle_cpus_mask);
8634
8635         if (ilb < nr_cpu_ids && idle_cpu(ilb))
8636                 return ilb;
8637
8638         return nr_cpu_ids;
8639 }
8640
8641 /*
8642  * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
8643  * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
8644  * CPU (if there is one).
8645  */
8646 static void nohz_balancer_kick(void)
8647 {
8648         int ilb_cpu;
8649
8650         nohz.next_balance++;
8651
8652         ilb_cpu = find_new_ilb();
8653
8654         if (ilb_cpu >= nr_cpu_ids)
8655                 return;
8656
8657         if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
8658                 return;
8659         /*
8660          * Use smp_send_reschedule() instead of resched_cpu().
8661          * This way we generate a sched IPI on the target cpu which
8662          * is idle. And the softirq performing nohz idle load balance
8663          * will be run before returning from the IPI.
8664          */
8665         smp_send_reschedule(ilb_cpu);
8666         return;
8667 }
8668
8669 void nohz_balance_exit_idle(unsigned int cpu)
8670 {
8671         if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
8672                 /*
8673                  * Completely isolated CPUs don't ever set, so we must test.
8674                  */
8675                 if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
8676                         cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
8677                         atomic_dec(&nohz.nr_cpus);
8678                 }
8679                 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
8680         }
8681 }
8682
8683 static inline void set_cpu_sd_state_busy(void)
8684 {
8685         struct sched_domain *sd;
8686         int cpu = smp_processor_id();
8687
8688         rcu_read_lock();
8689         sd = rcu_dereference(per_cpu(sd_llc, cpu));
8690
8691         if (!sd || !sd->nohz_idle)
8692                 goto unlock;
8693         sd->nohz_idle = 0;
8694
8695         atomic_inc(&sd->shared->nr_busy_cpus);
8696 unlock:
8697         rcu_read_unlock();
8698 }
8699
8700 void set_cpu_sd_state_idle(void)
8701 {
8702         struct sched_domain *sd;
8703         int cpu = smp_processor_id();
8704
8705         rcu_read_lock();
8706         sd = rcu_dereference(per_cpu(sd_llc, cpu));
8707
8708         if (!sd || sd->nohz_idle)
8709                 goto unlock;
8710         sd->nohz_idle = 1;
8711
8712         atomic_dec(&sd->shared->nr_busy_cpus);
8713 unlock:
8714         rcu_read_unlock();
8715 }
8716
8717 /*
8718  * This routine will record that the cpu is going idle with tick stopped.
8719  * This info will be used in performing idle load balancing in the future.
8720  */
8721 void nohz_balance_enter_idle(int cpu)
8722 {
8723         /*
8724          * If this cpu is going down, then nothing needs to be done.
8725          */
8726         if (!cpu_active(cpu))
8727                 return;
8728
8729         /* Spare idle load balancing on CPUs that don't want to be disturbed: */
8730         if (!is_housekeeping_cpu(cpu))
8731                 return;
8732
8733         if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
8734                 return;
8735
8736         /*
8737          * If we're a completely isolated CPU, we don't play.
8738          */
8739         if (on_null_domain(cpu_rq(cpu)))
8740                 return;
8741
8742         cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
8743         atomic_inc(&nohz.nr_cpus);
8744         set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
8745 }
8746 #endif
8747
/* Taken around load_balance() for domains flagged SD_SERIALIZE. */
static DEFINE_SPINLOCK(balancing);
8749
8750 /*
8751  * Scale the max load_balance interval with the number of CPUs in the system.
8752  * This trades load-balance latency on larger machines for less cross talk.
8753  */
8754 void update_max_interval(void)
8755 {
8756         max_load_balance_interval = HZ*num_online_cpus()/10;
8757 }
8758
8759 /*
8760  * It checks each scheduling domain to see if it is due to be balanced,
8761  * and initiates a balancing operation if so.
8762  *
8763  * Balancing parameters are set up in init_sched_domains.
8764  */
8765 static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
8766 {
8767         int continue_balancing = 1;
8768         int cpu = rq->cpu;
8769         unsigned long interval;
8770         struct sched_domain *sd;
8771         /* Earliest time when we have to do rebalance again */
8772         unsigned long next_balance = jiffies + 60*HZ;
8773         int update_next_balance = 0;
8774         int need_serialize, need_decay = 0;
8775         u64 max_cost = 0;
8776
8777         update_blocked_averages(cpu);
8778
8779         rcu_read_lock();
8780         for_each_domain(cpu, sd) {
8781                 /*
8782                  * Decay the newidle max times here because this is a regular
8783                  * visit to all the domains. Decay ~1% per second.
8784                  */
8785                 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
8786                         sd->max_newidle_lb_cost =
8787                                 (sd->max_newidle_lb_cost * 253) / 256;
8788                         sd->next_decay_max_lb_cost = jiffies + HZ;
8789                         need_decay = 1;
8790                 }
8791                 max_cost += sd->max_newidle_lb_cost;
8792
8793                 if (!(sd->flags & SD_LOAD_BALANCE))
8794                         continue;
8795
8796                 /*
8797                  * Stop the load balance at this level. There is another
8798                  * CPU in our sched group which is doing load balancing more
8799                  * actively.
8800                  */
8801                 if (!continue_balancing) {
8802                         if (need_decay)
8803                                 continue;
8804                         break;
8805                 }
8806
8807                 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
8808
8809                 need_serialize = sd->flags & SD_SERIALIZE;
8810                 if (need_serialize) {
8811                         if (!spin_trylock(&balancing))
8812                                 goto out;
8813                 }
8814
8815                 if (time_after_eq(jiffies, sd->last_balance + interval)) {
8816                         if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
8817                                 /*
8818                                  * The LBF_DST_PINNED logic could have changed
8819                                  * env->dst_cpu, so we can't know our idle
8820                                  * state even if we migrated tasks. Update it.
8821                                  */
8822                                 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
8823                         }
8824                         sd->last_balance = jiffies;
8825                         interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
8826                 }
8827                 if (need_serialize)
8828                         spin_unlock(&balancing);
8829 out:
8830                 if (time_after(next_balance, sd->last_balance + interval)) {
8831                         next_balance = sd->last_balance + interval;
8832                         update_next_balance = 1;
8833                 }
8834         }
8835         if (need_decay) {
8836                 /*
8837                  * Ensure the rq-wide value also decays but keep it at a
8838                  * reasonable floor to avoid funnies with rq->avg_idle.
8839                  */
8840                 rq->max_idle_balance_cost =
8841                         max((u64)sysctl_sched_migration_cost, max_cost);
8842         }
8843         rcu_read_unlock();
8844
8845         /*
8846          * next_balance will be updated only when there is a need.
8847          * When the cpu is attached to null domain for ex, it will not be
8848          * updated.
8849          */
8850         if (likely(update_next_balance)) {
8851                 rq->next_balance = next_balance;
8852
8853 #ifdef CONFIG_NO_HZ_COMMON
8854                 /*
8855                  * If this CPU has been elected to perform the nohz idle
8856                  * balance. Other idle CPUs have already rebalanced with
8857                  * nohz_idle_balance() and nohz.next_balance has been
8858                  * updated accordingly. This CPU is now running the idle load
8859                  * balance for itself and we need to update the
8860                  * nohz.next_balance accordingly.
8861                  */
8862                 if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
8863                         nohz.next_balance = rq->next_balance;
8864 #endif
8865         }
8866 }
8867
8868 #ifdef CONFIG_NO_HZ_COMMON
8869 /*
8870  * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
8871  * rebalancing for all the cpus for whom scheduler ticks are stopped.
8872  */
8873 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
8874 {
8875         int this_cpu = this_rq->cpu;
8876         struct rq *rq;
8877         int balance_cpu;
8878         /* Earliest time when we have to do rebalance again */
8879         unsigned long next_balance = jiffies + 60*HZ;
8880         int update_next_balance = 0;
8881
8882         if (idle != CPU_IDLE ||
8883             !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
8884                 goto end;
8885
8886         for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
8887                 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
8888                         continue;
8889
8890                 /*
8891                  * If this cpu gets work to do, stop the load balancing
8892                  * work being done for other cpus. Next load
8893                  * balancing owner will pick it up.
8894                  */
8895                 if (need_resched())
8896                         break;
8897
8898                 rq = cpu_rq(balance_cpu);
8899
8900                 /*
8901                  * If time for next balance is due,
8902                  * do the balance.
8903                  */
8904                 if (time_after_eq(jiffies, rq->next_balance)) {
8905                         struct rq_flags rf;
8906
8907                         rq_lock_irq(rq, &rf);
8908                         update_rq_clock(rq);
8909                         cpu_load_update_idle(rq);
8910                         rq_unlock_irq(rq, &rf);
8911
8912                         rebalance_domains(rq, CPU_IDLE);
8913                 }
8914
8915                 if (time_after(next_balance, rq->next_balance)) {
8916                         next_balance = rq->next_balance;
8917                         update_next_balance = 1;
8918                 }
8919         }
8920
8921         /*
8922          * next_balance will be updated only when there is a need.
8923          * When the CPU is attached to null domain for ex, it will not be
8924          * updated.
8925          */
8926         if (likely(update_next_balance))
8927                 nohz.next_balance = next_balance;
8928 end:
8929         clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
8930 }
8931
8932 /*
8933  * Current heuristic for kicking the idle load balancer in the presence
8934  * of an idle cpu in the system.
8935  *   - This rq has more than one task.
8936  *   - This rq has at least one CFS task and the capacity of the CPU is
8937  *     significantly reduced because of RT tasks or IRQs.
8938  *   - At parent of LLC scheduler domain level, this cpu's scheduler group has
8939  *     multiple busy cpu.
8940  *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
8941  *     domain span are idle.
8942  */
8943 static inline bool nohz_kick_needed(struct rq *rq)
8944 {
8945         unsigned long now = jiffies;
8946         struct sched_domain_shared *sds;
8947         struct sched_domain *sd;
8948         int nr_busy, i, cpu = rq->cpu;
8949         bool kick = false;
8950
8951         if (unlikely(rq->idle_balance))
8952                 return false;
8953
8954        /*
8955         * We may be recently in ticked or tickless idle mode. At the first
8956         * busy tick after returning from idle, we will update the busy stats.
8957         */
8958         set_cpu_sd_state_busy();
8959         nohz_balance_exit_idle(cpu);
8960
8961         /*
8962          * None are in tickless mode and hence no need for NOHZ idle load
8963          * balancing.
8964          */
8965         if (likely(!atomic_read(&nohz.nr_cpus)))
8966                 return false;
8967
8968         if (time_before(now, nohz.next_balance))
8969                 return false;
8970
8971         if (rq->nr_running >= 2)
8972                 return true;
8973
8974         rcu_read_lock();
8975         sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
8976         if (sds) {
8977                 /*
8978                  * XXX: write a coherent comment on why we do this.
8979                  * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
8980                  */
8981                 nr_busy = atomic_read(&sds->nr_busy_cpus);
8982                 if (nr_busy > 1) {
8983                         kick = true;
8984                         goto unlock;
8985                 }
8986
8987         }
8988
8989         sd = rcu_dereference(rq->sd);
8990         if (sd) {
8991                 if ((rq->cfs.h_nr_running >= 1) &&
8992                                 check_cpu_capacity(rq, sd)) {
8993                         kick = true;
8994                         goto unlock;
8995                 }
8996         }
8997
8998         sd = rcu_dereference(per_cpu(sd_asym, cpu));
8999         if (sd) {
9000                 for_each_cpu(i, sched_domain_span(sd)) {
9001                         if (i == cpu ||
9002                             !cpumask_test_cpu(i, nohz.idle_cpus_mask))
9003                                 continue;
9004
9005                         if (sched_asym_prefer(i, cpu)) {
9006                                 kick = true;
9007                                 goto unlock;
9008                         }
9009                 }
9010         }
9011 unlock:
9012         rcu_read_unlock();
9013         return kick;
9014 }
9015 #else
9016 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
9017 #endif
9018
9019 /*
9020  * run_rebalance_domains is triggered when needed from the scheduler tick.
9021  * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
9022  */
9023 static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
9024 {
9025         struct rq *this_rq = this_rq();
9026         enum cpu_idle_type idle = this_rq->idle_balance ?
9027                                                 CPU_IDLE : CPU_NOT_IDLE;
9028
9029         /*
9030          * If this cpu has a pending nohz_balance_kick, then do the
9031          * balancing on behalf of the other idle cpus whose ticks are
9032          * stopped. Do nohz_idle_balance *before* rebalance_domains to
9033          * give the idle cpus a chance to load balance. Else we may
9034          * load balance only within the local sched_domain hierarchy
9035          * and abort nohz_idle_balance altogether if we pull some load.
9036          */
9037         nohz_idle_balance(this_rq, idle);
9038         rebalance_domains(this_rq, idle);
9039 }
9040
9041 /*
9042  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
9043  */
9044 void trigger_load_balance(struct rq *rq)
9045 {
9046         /* Don't need to rebalance while attached to NULL domain */
9047         if (unlikely(on_null_domain(rq)))
9048                 return;
9049
9050         if (time_after_eq(jiffies, rq->next_balance))
9051                 raise_softirq(SCHED_SOFTIRQ);
9052 #ifdef CONFIG_NO_HZ_COMMON
9053         if (nohz_kick_needed(rq))
9054                 nohz_balancer_kick();
9055 #endif
9056 }
9057
/* Fair-class hook for @rq coming online: refresh sysctls and runtime state. */
static void rq_online_fair(struct rq *rq)
{
        update_sysctl();
        update_runtime_enabled(rq);
}
9064
/* Fair-class hook for @rq going offline. */
static void rq_offline_fair(struct rq *rq)
{
        update_sysctl();

        /* Throttled groups must stay reachable by pick_next_task */
        unthrottle_offline_cfs_rqs(rq);
}
9072
9073 #endif /* CONFIG_SMP */
9074
9075 /*
9076  * scheduler tick hitting a task of our scheduling class:
9077  */
9078 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
9079 {
9080         struct cfs_rq *cfs_rq;
9081         struct sched_entity *se = &curr->se;
9082
9083         for_each_sched_entity(se) {
9084                 cfs_rq = cfs_rq_of(se);
9085                 entity_tick(cfs_rq, se, queued);
9086         }
9087
9088         if (static_branch_unlikely(&sched_numa_balancing))
9089                 task_tick_numa(rq, curr);
9090 }
9091
9092 /*
9093  * called on fork with the child task as argument from the parent's context
9094  *  - child not yet on the tasklist
9095  *  - preemption disabled
9096  */
9097 static void task_fork_fair(struct task_struct *p)
9098 {
9099         struct cfs_rq *cfs_rq;
9100         struct sched_entity *se = &p->se, *curr;
9101         struct rq *rq = this_rq();
9102         struct rq_flags rf;
9103
9104         rq_lock(rq, &rf);
9105         update_rq_clock(rq);
9106
9107         cfs_rq = task_cfs_rq(current);
9108         curr = cfs_rq->curr;
9109         if (curr) {
9110                 update_curr(cfs_rq);
9111                 se->vruntime = curr->vruntime;
9112         }
9113         place_entity(cfs_rq, se, 1);
9114
9115         if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
9116                 /*
9117                  * Upon rescheduling, sched_class::put_prev_task() will place
9118                  * 'current' within the tree based on its new key value.
9119                  */
9120                 swap(curr->vruntime, se->vruntime);
9121                 resched_curr(rq);
9122         }
9123
9124         se->vruntime -= cfs_rq->min_vruntime;
9125         rq_unlock(rq, &rf);
9126 }
9127
9128 /*
9129  * Priority of the task has changed. Check to see if we preempt
9130  * the current task.
9131  */
9132 static void
9133 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
9134 {
9135         if (!task_on_rq_queued(p))
9136                 return;
9137
9138         /*
9139          * Reschedule if we are currently running on this runqueue and
9140          * our priority decreased, or if we are not currently running on
9141          * this runqueue and our priority is higher than the current's
9142          */
9143         if (rq->curr == p) {
9144                 if (p->prio > oldprio)
9145                         resched_curr(rq);
9146         } else
9147                 check_preempt_curr(rq, p, 0);
9148 }
9149
9150 static inline bool vruntime_normalized(struct task_struct *p)
9151 {
9152         struct sched_entity *se = &p->se;
9153
9154         /*
9155          * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
9156          * the dequeue_entity(.flags=0) will already have normalized the
9157          * vruntime.
9158          */
9159         if (p->on_rq)
9160                 return true;
9161
9162         /*
9163          * When !on_rq, vruntime of the task has usually NOT been normalized.
9164          * But there are some cases where it has already been normalized:
9165          *
9166          * - A forked child which is waiting for being woken up by
9167          *   wake_up_new_task().
9168          * - A task which has been woken up by try_to_wake_up() and
9169          *   waiting for actually being woken up by sched_ttwu_pending().
9170          */
9171         if (!se->sum_exec_runtime || p->state == TASK_WAKING)
9172                 return true;
9173
9174         return false;
9175 }
9176
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * Push a change of @se up the task-group tree so that it becomes visible at
 * the root. The walk begins at @se's parent and ends early at the first
 * throttled cfs_rq.
 */
static void propagate_entity_cfs_rq(struct sched_entity *se)
{
        /* The entity itself was handled by the caller; start one level up */
        se = se->parent;

        for_each_sched_entity(se) {
                if (cfs_rq_throttled(cfs_rq_of(se)))
                        break;

                update_load_avg(se, UPDATE_TG);
        }
}
#else
/* Without group scheduling there is no hierarchy to propagate through. */
static void propagate_entity_cfs_rq(struct sched_entity *se)
{
}
#endif
9201
9202 static void detach_entity_cfs_rq(struct sched_entity *se)
9203 {
9204         struct cfs_rq *cfs_rq = cfs_rq_of(se);
9205
9206         /* Catch up with the cfs_rq and remove our load when we leave */
9207         update_load_avg(se, 0);
9208         detach_entity_load_avg(cfs_rq, se);
9209         update_tg_load_avg(cfs_rq, false);
9210         propagate_entity_cfs_rq(se);
9211 }
9212
9213 static void attach_entity_cfs_rq(struct sched_entity *se)
9214 {
9215         struct cfs_rq *cfs_rq = cfs_rq_of(se);
9216
9217 #ifdef CONFIG_FAIR_GROUP_SCHED
9218         /*
9219          * Since the real-depth could have been changed (only FAIR
9220          * class maintain depth value), reset depth properly.
9221          */
9222         se->depth = se->parent ? se->parent->depth + 1 : 0;
9223 #endif
9224
9225         /* Synchronize entity with its cfs_rq */
9226         update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
9227         attach_entity_load_avg(cfs_rq, se);
9228         update_tg_load_avg(cfs_rq, false);
9229         propagate_entity_cfs_rq(se);
9230 }
9231
9232 static void detach_task_cfs_rq(struct task_struct *p)
9233 {
9234         struct sched_entity *se = &p->se;
9235         struct cfs_rq *cfs_rq = cfs_rq_of(se);
9236
9237         if (!vruntime_normalized(p)) {
9238                 /*
9239                  * Fix up our vruntime so that the current sleep doesn't
9240                  * cause 'unlimited' sleep bonus.
9241                  */
9242                 place_entity(cfs_rq, se, 0);
9243                 se->vruntime -= cfs_rq->min_vruntime;
9244         }
9245
9246         detach_entity_cfs_rq(se);
9247 }
9248
9249 static void attach_task_cfs_rq(struct task_struct *p)
9250 {
9251         struct sched_entity *se = &p->se;
9252         struct cfs_rq *cfs_rq = cfs_rq_of(se);
9253
9254         attach_entity_cfs_rq(se);
9255
9256         if (!vruntime_normalized(p))
9257                 se->vruntime += cfs_rq->min_vruntime;
9258 }
9259
/*
 * sched_class::switched_from hook: @p leaves the fair class, so detach it
 * from its cfs_rq.  @rq is not used here.
 */
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
        detach_task_cfs_rq(p);
}
9264
9265 static void switched_to_fair(struct rq *rq, struct task_struct *p)
9266 {
9267         attach_task_cfs_rq(p);
9268
9269         if (task_on_rq_queued(p)) {
9270                 /*
9271                  * We were most likely switched from sched_rt, so
9272                  * kick off the schedule if running, otherwise just see
9273                  * if we can still preempt the current task.
9274                  */
9275                 if (rq->curr == p)
9276                         resched_curr(rq);
9277                 else
9278                         check_preempt_curr(rq, p, 0);
9279         }
9280 }
9281
9282 /* Account for a task changing its policy or group.
9283  *
9284  * This routine is mostly called to set cfs_rq->curr field when a task
9285  * migrates between groups/classes.
9286  */
9287 static void set_curr_task_fair(struct rq *rq)
9288 {
9289         struct sched_entity *se = &rq->curr->se;
9290
9291         for_each_sched_entity(se) {
9292                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
9293
9294                 set_next_entity(cfs_rq, se);
9295                 /* ensure bandwidth has been allocated on our new cfs_rq */
9296                 account_cfs_rq_runtime(cfs_rq, 0);
9297         }
9298 }
9299
9300 void init_cfs_rq(struct cfs_rq *cfs_rq)
9301 {
9302         cfs_rq->tasks_timeline = RB_ROOT_CACHED;
9303         cfs_rq->min_vruntime = (u64)(-(1LL << 20));
9304 #ifndef CONFIG_64BIT
9305         cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
9306 #endif
9307 #ifdef CONFIG_SMP
9308 #ifdef CONFIG_FAIR_GROUP_SCHED
9309         cfs_rq->propagate_avg = 0;
9310 #endif
9311         atomic_long_set(&cfs_rq->removed_load_avg, 0);
9312         atomic_long_set(&cfs_rq->removed_util_avg, 0);
9313 #endif
9314 }
9315
9316 #ifdef CONFIG_FAIR_GROUP_SCHED
9317 static void task_set_group_fair(struct task_struct *p)
9318 {
9319         struct sched_entity *se = &p->se;
9320
9321         set_task_rq(p, task_cpu(p));
9322         se->depth = se->parent ? se->parent->depth + 1 : 0;
9323 }
9324
/*
 * TASK_MOVE_GROUP handler: detach @p from its old cfs_rq, rebind it to the
 * runqueue of its CPU, and attach it to the new cfs_rq.
 */
static void task_move_group_fair(struct task_struct *p)
{
        /* Leave the old group first ... */
        detach_task_cfs_rq(p);
        set_task_rq(p, task_cpu(p));

#ifdef CONFIG_SMP
        /*
         * A zero last_update_time tells that the entity's cfs_rq has been
         * changed, i.e. that it was migrated.
         */
        p->se.avg.last_update_time = 0;
#endif
        /* ... then join the new one. */
        attach_task_cfs_rq(p);
}
9336
9337 static void task_change_group_fair(struct task_struct *p, int type)
9338 {
9339         switch (type) {
9340         case TASK_SET_GROUP:
9341                 task_set_group_fair(p);
9342                 break;
9343
9344         case TASK_MOVE_GROUP:
9345                 task_move_group_fair(p);
9346                 break;
9347         }
9348 }
9349
9350 void free_fair_sched_group(struct task_group *tg)
9351 {
9352         int i;
9353
9354         destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
9355
9356         for_each_possible_cpu(i) {
9357                 if (tg->cfs_rq)
9358                         kfree(tg->cfs_rq[i]);
9359                 if (tg->se)
9360                         kfree(tg->se[i]);
9361         }
9362
9363         kfree(tg->cfs_rq);
9364         kfree(tg->se);
9365 }
9366
9367 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9368 {
9369         struct sched_entity *se;
9370         struct cfs_rq *cfs_rq;
9371         int i;
9372
9373         tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
9374         if (!tg->cfs_rq)
9375                 goto err;
9376         tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
9377         if (!tg->se)
9378                 goto err;
9379
9380         tg->shares = NICE_0_LOAD;
9381
9382         init_cfs_bandwidth(tg_cfs_bandwidth(tg));
9383
9384         for_each_possible_cpu(i) {
9385                 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
9386                                       GFP_KERNEL, cpu_to_node(i));
9387                 if (!cfs_rq)
9388                         goto err;
9389
9390                 se = kzalloc_node(sizeof(struct sched_entity),
9391                                   GFP_KERNEL, cpu_to_node(i));
9392                 if (!se)
9393                         goto err_free_rq;
9394
9395                 init_cfs_rq(cfs_rq);
9396                 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
9397                 init_entity_runnable_average(se);
9398         }
9399
9400         return 1;
9401
9402 err_free_rq:
9403         kfree(cfs_rq);
9404 err:
9405         return 0;
9406 }
9407
9408 void online_fair_sched_group(struct task_group *tg)
9409 {
9410         struct sched_entity *se;
9411         struct rq *rq;
9412         int i;
9413
9414         for_each_possible_cpu(i) {
9415                 rq = cpu_rq(i);
9416                 se = tg->se[i];
9417
9418                 raw_spin_lock_irq(&rq->lock);
9419                 update_rq_clock(rq);
9420                 attach_entity_cfs_rq(se);
9421                 sync_throttle(tg, i);
9422                 raw_spin_unlock_irq(&rq->lock);
9423         }
9424 }
9425
9426 void unregister_fair_sched_group(struct task_group *tg)
9427 {
9428         unsigned long flags;
9429         struct rq *rq;
9430         int cpu;
9431
9432         for_each_possible_cpu(cpu) {
9433                 if (tg->se[cpu])
9434                         remove_entity_load_avg(tg->se[cpu]);
9435
9436                 /*
9437                  * Only empty task groups can be destroyed; so we can speculatively
9438                  * check on_list without danger of it being re-added.
9439                  */
9440                 if (!tg->cfs_rq[cpu]->on_list)
9441                         continue;
9442
9443                 rq = cpu_rq(cpu);
9444
9445                 raw_spin_lock_irqsave(&rq->lock, flags);
9446                 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
9447                 raw_spin_unlock_irqrestore(&rq->lock, flags);
9448         }
9449 }
9450
9451 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
9452                         struct sched_entity *se, int cpu,
9453                         struct sched_entity *parent)
9454 {
9455         struct rq *rq = cpu_rq(cpu);
9456
9457         cfs_rq->tg = tg;
9458         cfs_rq->rq = rq;
9459         init_cfs_rq_runtime(cfs_rq);
9460
9461         tg->cfs_rq[cpu] = cfs_rq;
9462         tg->se[cpu] = se;
9463
9464         /* se could be NULL for root_task_group */
9465         if (!se)
9466                 return;
9467
9468         if (!parent) {
9469                 se->cfs_rq = &rq->cfs;
9470                 se->depth = 0;
9471         } else {
9472                 se->cfs_rq = parent->my_q;
9473                 se->depth = parent->depth + 1;
9474         }
9475
9476         se->my_q = cfs_rq;
9477         /* guarantee group entities always have weight */
9478         update_load_set(&se->load, NICE_0_LOAD);
9479         se->parent = parent;
9480 }
9481
9482 static DEFINE_MUTEX(shares_mutex);
9483
9484 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
9485 {
9486         int i;
9487
9488         /*
9489          * We can't change the weight of the root cgroup.
9490          */
9491         if (!tg->se[0])
9492                 return -EINVAL;
9493
9494         shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
9495
9496         mutex_lock(&shares_mutex);
9497         if (tg->shares == shares)
9498                 goto done;
9499
9500         tg->shares = shares;
9501         for_each_possible_cpu(i) {
9502                 struct rq *rq = cpu_rq(i);
9503                 struct sched_entity *se = tg->se[i];
9504                 struct rq_flags rf;
9505
9506                 /* Propagate contribution to hierarchy */
9507                 rq_lock_irqsave(rq, &rf);
9508                 update_rq_clock(rq);
9509                 for_each_sched_entity(se) {
9510                         update_load_avg(se, UPDATE_TG);
9511                         update_cfs_shares(se);
9512                 }
9513                 rq_unlock_irqrestore(rq, &rf);
9514         }
9515
9516 done:
9517         mutex_unlock(&shares_mutex);
9518         return 0;
9519 }
9520 #else /* CONFIG_FAIR_GROUP_SCHED */
9521
/* Without CONFIG_FAIR_GROUP_SCHED there is nothing to free. */
void free_fair_sched_group(struct task_group *tg)
{
}

/* Nothing to allocate; always reports success (1). */
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
        return 1;
}

/* No per-group fair state to bring online. */
void online_fair_sched_group(struct task_group *tg)
{
}

/* No per-group fair state to unregister. */
void unregister_fair_sched_group(struct task_group *tg)
{
}
9532
9533 #endif /* CONFIG_FAIR_GROUP_SCHED */
9534
9535
9536 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
9537 {
9538         struct sched_entity *se = &task->se;
9539         unsigned int rr_interval = 0;
9540
9541         /*
9542          * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
9543          * idle runqueue:
9544          */
9545         if (rq->cfs.load.weight)
9546                 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
9547
9548         return rr_interval;
9549 }
9550
9551 /*
9552  * All the scheduling class methods:
9553  */
9554 const struct sched_class fair_sched_class = {
9555         .next                   = &idle_sched_class,
9556         .enqueue_task           = enqueue_task_fair,
9557         .dequeue_task           = dequeue_task_fair,
9558         .yield_task             = yield_task_fair,
9559         .yield_to_task          = yield_to_task_fair,
9560
9561         .check_preempt_curr     = check_preempt_wakeup,
9562
9563         .pick_next_task         = pick_next_task_fair,
9564         .put_prev_task          = put_prev_task_fair,
9565
9566 #ifdef CONFIG_SMP
9567         .select_task_rq         = select_task_rq_fair,
9568         .migrate_task_rq        = migrate_task_rq_fair,
9569
9570         .rq_online              = rq_online_fair,
9571         .rq_offline             = rq_offline_fair,
9572
9573         .task_dead              = task_dead_fair,
9574         .set_cpus_allowed       = set_cpus_allowed_common,
9575 #endif
9576
9577         .set_curr_task          = set_curr_task_fair,
9578         .task_tick              = task_tick_fair,
9579         .task_fork              = task_fork_fair,
9580
9581         .prio_changed           = prio_changed_fair,
9582         .switched_from          = switched_from_fair,
9583         .switched_to            = switched_to_fair,
9584
9585         .get_rr_interval        = get_rr_interval_fair,
9586
9587         .update_curr            = update_curr_fair,
9588
9589 #ifdef CONFIG_FAIR_GROUP_SCHED
9590         .task_change_group      = task_change_group_fair,
9591 #endif
9592 };
9593
9594 #ifdef CONFIG_SCHED_DEBUG
9595 void print_cfs_stats(struct seq_file *m, int cpu)
9596 {
9597         struct cfs_rq *cfs_rq, *pos;
9598
9599         rcu_read_lock();
9600         for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
9601                 print_cfs_rq(m, cpu, cfs_rq);
9602         rcu_read_unlock();
9603 }
9604
9605 #ifdef CONFIG_NUMA_BALANCING
9606 void show_numa_stats(struct task_struct *p, struct seq_file *m)
9607 {
9608         int node;
9609         unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
9610
9611         for_each_online_node(node) {
9612                 if (p->numa_faults) {
9613                         tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
9614                         tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
9615                 }
9616                 if (p->numa_group) {
9617                         gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
9618                         gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
9619                 }
9620                 print_numa_stats(m, node, tsf, tpf, gsf, gpf);
9621         }
9622 }
9623 #endif /* CONFIG_NUMA_BALANCING */
9624 #endif /* CONFIG_SCHED_DEBUG */
9625
9626 __init void init_sched_fair_class(void)
9627 {
9628 #ifdef CONFIG_SMP
9629         open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
9630
9631 #ifdef CONFIG_NO_HZ_COMMON
9632         nohz.next_balance = jiffies;
9633         zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
9634 #endif
9635 #endif /* SMP */
9636
9637 }