/*
 * Scheduler topology setup/handling methods
 */
#include <linux/sched.h>
#include <linux/mutex.h>

#include "sched.h"

DEFINE_MUTEX(sched_domains_mutex);

/* Protected by sched_domains_mutex: */
cpumask_var_t sched_domains_tmpmask;
cpumask_var_t sched_domains_tmpmask2;

#ifdef CONFIG_SCHED_DEBUG

static __read_mostly int sched_debug_enabled;

static int __init sched_debug_setup(char *str)
{
	sched_debug_enabled = 1;

	return 0;
}
early_param("sched_debug", sched_debug_setup);

static inline bool sched_debug(void)
{
	return sched_debug_enabled;
}

static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
				  struct cpumask *groupmask)
{
	struct sched_group *group = sd->groups;

	cpumask_clear(groupmask);

	printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);

	if (!(sd->flags & SD_LOAD_BALANCE)) {
		printk("does not load-balance\n");
		if (sd->parent)
			printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
					" has parent");
		return -1;
	}

	printk(KERN_CONT "span=%*pbl level=%s\n",
	       cpumask_pr_args(sched_domain_span(sd)), sd->name);

	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
		printk(KERN_ERR "ERROR: domain->span does not contain "
				"CPU%d\n", cpu);
	}
	if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
		printk(KERN_ERR "ERROR: domain->groups does not contain"
				" CPU%d\n", cpu);
	}

	printk(KERN_DEBUG "%*s groups:", level + 1, "");
	do {
		if (!group) {
			printk("\n");
			printk(KERN_ERR "ERROR: group is NULL\n");
			break;
		}

		if (!cpumask_weight(sched_group_cpus(group))) {
			printk(KERN_CONT "\n");
			printk(KERN_ERR "ERROR: empty group\n");
			break;
		}

		if (!(sd->flags & SD_OVERLAP) &&
		    cpumask_intersects(groupmask, sched_group_cpus(group))) {
			printk(KERN_CONT "\n");
			printk(KERN_ERR "ERROR: repeated CPUs\n");
			break;
		}

		cpumask_or(groupmask, groupmask, sched_group_cpus(group));

		printk(KERN_CONT " %d:{ span=%*pbl",
				group->sgc->id,
				cpumask_pr_args(sched_group_cpus(group)));

		if ((sd->flags & SD_OVERLAP) && !cpumask_full(sched_group_mask(group))) {
			printk(KERN_CONT " mask=%*pbl",
				cpumask_pr_args(sched_group_mask(group)));
		}

		if (group->sgc->capacity != SCHED_CAPACITY_SCALE)
			printk(KERN_CONT " cap=%lu", group->sgc->capacity);

		if (group == sd->groups && sd->child &&
		    !cpumask_equal(sched_domain_span(sd->child),
				   sched_group_cpus(group))) {
			printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
		}

		printk(KERN_CONT " }");

		group = group->next;

		if (group != sd->groups)
			printk(KERN_CONT ",");

	} while (group != sd->groups);
	printk(KERN_CONT "\n");

	if (!cpumask_equal(sched_domain_span(sd), groupmask))
		printk(KERN_ERR "ERROR: groups don't span domain->span\n");

	if (sd->parent &&
	    !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
		printk(KERN_ERR "ERROR: parent span is not a superset "
				"of domain->span\n");
	return 0;
}

static void sched_domain_debug(struct sched_domain *sd, int cpu)
{
	int level = 0;

	if (!sched_debug_enabled)
		return;

	if (!sd) {
		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
		return;
	}

	printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);

	for (;;) {
		if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
			break;
		level++;
		sd = sd->parent;
		if (!sd)
			break;
	}
}
#else /* !CONFIG_SCHED_DEBUG */

# define sched_debug_enabled 0
# define sched_domain_debug(sd, cpu) do { } while (0)
static inline bool sched_debug(void)
{
	return false;
}
#endif /* CONFIG_SCHED_DEBUG */

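/*
 * A sched_domain is "degenerate" when it cannot influence scheduling
 * decisions: it either spans a single CPU, or none of its behavioural
 * flags require more than the one group it has.
 */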
static int sd_degenerate(struct sched_domain *sd)
{
	if (cpumask_weight(sched_domain_span(sd)) == 1)
		return 1;

	/* Following flags need at least 2 groups */
	if (sd->flags & (SD_LOAD_BALANCE |
			 SD_BALANCE_NEWIDLE |
			 SD_BALANCE_FORK |
			 SD_BALANCE_EXEC |
			 SD_SHARE_CPUCAPACITY |
			 SD_ASYM_CPUCAPACITY |
			 SD_SHARE_PKG_RESOURCES |
			 SD_SHARE_POWERDOMAIN)) {
		if (sd->groups != sd->groups->next)
			return 0;
	}

	/* Following flags don't use groups */
	if (sd->flags & (SD_WAKE_AFFINE))
		return 0;

	return 1;
}

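/*
 * Check whether @parent is redundant relative to @sd: a parent that is
 * itself degenerate, or that spans the same CPUs as @sd and adds no
 * behavioural flags beyond the child's, can be collapsed away.
 */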
static int
sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
{
	unsigned long cflags = sd->flags, pflags = parent->flags;

	if (sd_degenerate(parent))
		return 1;

	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
		return 0;

	/* Flags needing groups don't count if only 1 group in parent */
	if (parent->groups == parent->groups->next) {
		pflags &= ~(SD_LOAD_BALANCE |
			    SD_BALANCE_NEWIDLE |
			    SD_BALANCE_FORK |
			    SD_BALANCE_EXEC |
			    SD_ASYM_CPUCAPACITY |
			    SD_SHARE_CPUCAPACITY |
			    SD_SHARE_PKG_RESOURCES |
			    SD_PREFER_SIBLING |
			    SD_SHARE_POWERDOMAIN);
		if (nr_node_ids == 1)
			pflags &= ~SD_SERIALIZE;
	}
	if (~cflags & pflags)
		return 0;

	return 1;
}

static void free_rootdomain(struct rcu_head *rcu)
{
	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);

	cpupri_cleanup(&rd->cpupri);
	cpudl_cleanup(&rd->cpudl);
	free_cpumask_var(rd->dlo_mask);
	free_cpumask_var(rd->rto_mask);
	free_cpumask_var(rd->online);
	free_cpumask_var(rd->span);
	kfree(rd);
}

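/*
 * Attach @rq to root domain @rd: take @rq out of its old root domain's
 * span (dropping a reference and freeing the old root domain via RCU
 * once the last reference is gone) and add it to @rd's span.
 */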
void rq_attach_root(struct rq *rq, struct root_domain *rd)
{
	struct root_domain *old_rd = NULL;
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);

	if (rq->rd) {
		old_rd = rq->rd;

		if (cpumask_test_cpu(rq->cpu, old_rd->online))
			set_rq_offline(rq);

		cpumask_clear_cpu(rq->cpu, old_rd->span);

		/*
		 * If we don't want to free the old_rd yet then
		 * set old_rd to NULL to skip the freeing later
		 * in this function:
		 */
		if (!atomic_dec_and_test(&old_rd->refcount))
			old_rd = NULL;
	}

	atomic_inc(&rd->refcount);
	rq->rd = rd;

	cpumask_set_cpu(rq->cpu, rd->span);
	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
		set_rq_online(rq);

	raw_spin_unlock_irqrestore(&rq->lock, flags);

	if (old_rd)
		call_rcu_sched(&old_rd->rcu, free_rootdomain);
}

static int init_rootdomain(struct root_domain *rd)
{
	memset(rd, 0, sizeof(*rd));

	if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
		goto out;
	if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
		goto free_span;
	if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
		goto free_online;
	if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
		goto free_dlo_mask;

	init_dl_bw(&rd->dl_bw);
	if (cpudl_init(&rd->cpudl) != 0)
		goto free_rto_mask;

	if (cpupri_init(&rd->cpupri) != 0)
		goto free_cpudl;
	return 0;

free_cpudl:
	cpudl_cleanup(&rd->cpudl);
free_rto_mask:
	free_cpumask_var(rd->rto_mask);
free_dlo_mask:
	free_cpumask_var(rd->dlo_mask);
free_online:
	free_cpumask_var(rd->online);
free_span:
	free_cpumask_var(rd->span);
out:
	return -ENOMEM;
}

/*
 * By default the system creates a single root-domain with all CPUs as
 * members (mimicking the global state we have today).
 */
struct root_domain def_root_domain;

void init_defrootdomain(void)
{
	init_rootdomain(&def_root_domain);

	atomic_set(&def_root_domain.refcount, 1);
}

static struct root_domain *alloc_rootdomain(void)
{
	struct root_domain *rd;

	rd = kmalloc(sizeof(*rd), GFP_KERNEL);
	if (!rd)
		return NULL;

	if (init_rootdomain(rd) != 0) {
		kfree(rd);
		return NULL;
	}

	return rd;
}

static void free_sched_groups(struct sched_group *sg, int free_sgc)
{
	struct sched_group *tmp, *first;

	if (!sg)
		return;

	first = sg;
	do {
		tmp = sg->next;

		if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
			kfree(sg->sgc);

		kfree(sg);
		sg = tmp;
	} while (sg != first);
}

static void destroy_sched_domain(struct sched_domain *sd)
{
	/*
	 * If it's an overlapping domain it has private groups, iterate and
	 * nuke them all.
	 */
	if (sd->flags & SD_OVERLAP) {
		free_sched_groups(sd->groups, 1);
	} else if (atomic_dec_and_test(&sd->groups->ref)) {
		kfree(sd->groups->sgc);
		kfree(sd->groups);
	}
	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
		kfree(sd->shared);
	kfree(sd);
}

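/*
 * Domain teardown is deferred through RCU so that concurrent readers of
 * the domain pointers are not disturbed; the callback then walks from the
 * given domain up through its parents, destroying each one.
 */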
static void destroy_sched_domains_rcu(struct rcu_head *rcu)
{
	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);

	while (sd) {
		struct sched_domain *parent = sd->parent;
		destroy_sched_domain(sd);
		sd = parent;
	}
}

static void destroy_sched_domains(struct sched_domain *sd)
{
	if (sd)
		call_rcu(&sd->rcu, destroy_sched_domains_rcu);
}

/*
 * Keep a special pointer to the highest sched_domain that has
 * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain) for this CPU; this
 * allows us to avoid some pointer chasing in select_idle_sibling().
 *
 * Also keep a unique ID per domain (we use the first CPU number in
 * the cpumask of the domain); this allows us to quickly tell if
 * two CPUs are in the same cache domain, see cpus_share_cache().
 */
DEFINE_PER_CPU(struct sched_domain *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
DEFINE_PER_CPU(struct sched_domain *, sd_numa);
DEFINE_PER_CPU(struct sched_domain *, sd_asym);

static void update_top_cache_domain(int cpu)
{
	struct sched_domain_shared *sds = NULL;
	struct sched_domain *sd;
	int id = cpu;
	int size = 1;

	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
	if (sd) {
		id = cpumask_first(sched_domain_span(sd));
		size = cpumask_weight(sched_domain_span(sd));
		sds = sd->shared;
	}

	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
	per_cpu(sd_llc_size, cpu) = size;
	per_cpu(sd_llc_id, cpu) = id;
	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);

	sd = lowest_flag_domain(cpu, SD_NUMA);
	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);

	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
	rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
}

/*
 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
 * hold the hotplug lock.
 */
static void
cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct sched_domain *tmp;

	/* Remove the sched domains which do not contribute to scheduling. */
	for (tmp = sd; tmp; ) {
		struct sched_domain *parent = tmp->parent;
		if (!parent)
			break;

		if (sd_parent_degenerate(tmp, parent)) {
			tmp->parent = parent->parent;
			if (parent->parent)
				parent->parent->child = tmp;
			/*
			 * Transfer SD_PREFER_SIBLING down in case of a
			 * degenerate parent; the spans match for this
			 * so the property transfers.
			 */
			if (parent->flags & SD_PREFER_SIBLING)
				tmp->flags |= SD_PREFER_SIBLING;
			destroy_sched_domain(parent);
		} else
			tmp = tmp->parent;
	}

	if (sd && sd_degenerate(sd)) {
		tmp = sd;
		sd = sd->parent;
		destroy_sched_domain(tmp);
		if (sd)
			sd->child = NULL;
	}

	sched_domain_debug(sd, cpu);

	rq_attach_root(rq, rd);
	tmp = rq->sd;
	rcu_assign_pointer(rq->sd, sd);
	destroy_sched_domains(tmp);

	update_top_cache_domain(cpu);
}

/* Setup the mask of CPUs configured for isolated domains */
static int __init isolated_cpu_setup(char *str)
{
	int ret;

	alloc_bootmem_cpumask_var(&cpu_isolated_map);
	ret = cpulist_parse(str, cpu_isolated_map);
	if (ret) {
		pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
		return 0;
	}
	return 1;
}
__setup("isolcpus=", isolated_cpu_setup);

struct s_data {
	struct sched_domain ** __percpu sd;
	struct root_domain *rd;
};

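/*
 * Stages of the domain allocation path; __free_domain_allocs() uses the
 * value to know how far the allocation got and what needs unwinding.
 */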
enum s_alloc {
	sa_rootdomain,
	sa_sd,
	sa_sd_storage,
	sa_none,
};

/*
 * Return the canonical balance CPU for this group, this is the first CPU
 * of this group that's also in the iteration mask.
 *
 * The iteration mask consists of all those CPUs that could actually end up
 * at this group. See build_group_mask().
 *
 * Also see should_we_balance().
 */
int group_balance_cpu(struct sched_group *sg)
{
	return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
}


/*
 * NUMA topology (first read the regular topology blurb below)
 *
 * Given a node-distance table, for example:
 *
 *   node   0   1   2   3
 *     0:  10  20  30  20
 *     1:  20  10  20  30
 *     2:  30  20  10  20
 *     3:  20  30  20  10
 *
 * which represents a 4 node ring topology like:
 *
 *   0 ----- 1
 *   |       |
 *   |       |
 *   |       |
 *   3 ----- 2
 *
 * We want to construct domains and groups to represent this. The way we go
 * about doing this is to build the domains on 'hops'. For each NUMA level we
 * construct the mask of all nodes reachable in @level hops.
 *
 * For the above NUMA topology that gives 3 levels:
 *
 * NUMA-2	0-3		0-3		0-3		0-3
 *  groups:	{0-1,3},{1-3}	{0-2},{0,2-3}	{1-3},{0-1,3}	{0,2-3},{0-2}
 *
 * NUMA-1	0-1,3		0-2		1-3		0,2-3
 *  groups:	{0},{1},{3}	{0},{1},{2}	{1},{2},{3}	{0},{2},{3}
 *
 * NUMA-0	0		1		2		3
 *
 *
 * As can be seen; things don't nicely line up as with the regular topology.
 * When we iterate a domain in child domain chunks some nodes can be
 * represented multiple times -- hence the "overlap" naming for this part of
 * the topology.
 *
 * In order to minimize this overlap, we only build enough groups to cover the
 * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3.
 *
 * Because:
 *
 *  - the first group of each domain is its child domain; this
 *    gets us the first 0-1,3
 *  - the only uncovered node is 2, whose child domain is 1-3.
 *
 * However, because of the overlap, computing a unique CPU for each group is
 * more complicated. Consider for instance the groups of NODE-1 NUMA-2, both
 * groups include the CPUs of Node-0, while those CPUs would not in fact ever
 * end up at those groups (they would end up in group: 0-1,3).
 *
 * To correct this we have to introduce the group iteration mask. This mask
 * will contain those CPUs in the group that can reach this group given the
 * (child) domain tree.
 *
 * With this we can once again compute balance_cpu and sched_group_capacity
 * relations.
 *
 * XXX include words on how balance_cpu is unique and therefore can be
 * used for sched_group_capacity links.
 *
 *
 * Another 'interesting' topology is:
 *
 *   node   0   1   2   3
 *     0:  10  20  20  30
 *     1:  20  10  20  20
 *     2:  20  20  10  20
 *     3:  30  20  20  10
 *
 * Which looks a little like:
 *
 *   0 ----- 1
 *   |     / |
 *   |   /   |
 *   | /     |
 *   2 ----- 3
 *
 * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3
 * are not.
 *
 * This leads to a few particularly weird cases where the sched_domain's are
 * not of the same number for each CPU. Consider:
 *
 * NUMA-2	0-3				0-3
 *  groups:	{0-2},{1-3}			{1-3},{0-2}
 *
 * NUMA-1	0-2	0-3	0-3	1-3
 *
 * NUMA-0	0	1	2	3
 *
 */


/*
 * Build an iteration mask that can exclude certain CPUs from the upwards
 * domain traversal.
 *
 * Only CPUs that can arrive at this group should be considered to continue
 * balancing.
 *
 * We do this during the group creation pass, therefore the group information
 * isn't complete yet, however since each group represents a (child) domain we
 * can fully construct this using the sched_domain bits (which are already
 * complete).
 */
static void
build_group_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
{
	const struct cpumask *sg_span = sched_group_cpus(sg);
	struct sd_data *sdd = sd->private;
	struct sched_domain *sibling;
	int i;

	cpumask_clear(mask);

	for_each_cpu(i, sg_span) {
		sibling = *per_cpu_ptr(sdd->sd, i);

		/*
		 * Can happen in the asymmetric case, where these siblings are
		 * unused. The mask will not be empty because those CPUs that
		 * do have the top domain _should_ span the domain.
		 */
		if (!sibling->child)
			continue;

		/* If we would not end up here, we can't continue from here */
		if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
			continue;

		cpumask_set_cpu(i, mask);
	}

	/* We must not have empty masks here */
	WARN_ON_ONCE(cpumask_empty(mask));
}

/*
 * XXX: This creates per-node group entries; since the load-balancer will
 * immediately access remote memory to construct this group's load-balance
 * statistics, having the groups node local is of dubious benefit.
 */
static struct sched_group *
build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
{
	struct sched_group *sg;
	struct cpumask *sg_span;

	sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
			GFP_KERNEL, cpu_to_node(cpu));

	if (!sg)
		return NULL;

	sg_span = sched_group_cpus(sg);
	if (sd->child)
		cpumask_copy(sg_span, sched_domain_span(sd->child));
	else
		cpumask_copy(sg_span, sched_domain_span(sd));

	return sg;
}

static void init_overlap_sched_group(struct sched_domain *sd,
				     struct sched_group *sg)
{
	struct cpumask *mask = sched_domains_tmpmask2;
	struct sd_data *sdd = sd->private;
	struct cpumask *sg_span;
	int cpu;

	build_group_mask(sd, sg, mask);
	cpu = cpumask_first_and(sched_group_cpus(sg), mask);

	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
	if (atomic_inc_return(&sg->sgc->ref) == 1)
		cpumask_copy(sched_group_mask(sg), mask);
	else
		WARN_ON_ONCE(!cpumask_equal(sched_group_mask(sg), mask));

	/*
	 * Initialize sgc->capacity such that even if we mess up the
	 * domains and no possible iteration will get us here, we won't
	 * die on a /0 trap.
	 */
	sg_span = sched_group_cpus(sg);
	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
}

static int
build_overlap_sched_groups(struct sched_domain *sd, int cpu)
{
	struct sched_group *first = NULL, *last = NULL, *sg;
	const struct cpumask *span = sched_domain_span(sd);
	struct cpumask *covered = sched_domains_tmpmask;
	struct sd_data *sdd = sd->private;
	struct sched_domain *sibling;
	int i;

	cpumask_clear(covered);

	for_each_cpu_wrap(i, span, cpu) {
		struct cpumask *sg_span;

		if (cpumask_test_cpu(i, covered))
			continue;

		sibling = *per_cpu_ptr(sdd->sd, i);

		/*
		 * Asymmetric node setups can result in situations where the
		 * domain tree is of unequal depth, make sure to skip domains
		 * that already cover the entire range.
		 *
		 * In that case build_sched_domains() will have terminated the
		 * iteration early and our sibling sd spans will be empty.
		 * Domains should always include the CPU they're built on, so
		 * check that.
		 */
		if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
			continue;

		sg = build_group_from_child_sched_domain(sibling, cpu);
		if (!sg)
			goto fail;

		sg_span = sched_group_cpus(sg);
		cpumask_or(covered, covered, sg_span);

		init_overlap_sched_group(sd, sg);

		if (!first)
			first = sg;
		if (last)
			last->next = sg;
		last = sg;
		last->next = first;
	}
	sd->groups = first;

	return 0;

fail:
	free_sched_groups(first, 0);

	return -ENOMEM;
}


/*
 * Package topology (also see the load-balance blurb in fair.c)
 *
 * The scheduler builds a tree structure to represent a number of important
 * topology features. By default (default_topology[]) these include:
 *
 *  - Simultaneous multithreading (SMT)
 *  - Multi-Core Cache (MC)
 *  - Package (DIE)
 *
 * Where the last one more or less denotes everything up to a NUMA node.
 *
 * The tree consists of 3 primary data structures:
 *
 *	sched_domain -> sched_group -> sched_group_capacity
 *	    ^ ^             ^ ^
 *          `-'             `-'
 *
 * The sched_domains are per-cpu and have a two way link (parent & child) and
 * denote the ever growing mask of CPUs belonging to that level of topology.
 *
 * Each sched_domain has a circular (double) linked list of sched_group's, each
 * denoting the domains of the level below (or individual CPUs in case of the
 * first domain level). The sched_group linked by a sched_domain includes the
 * CPU of that sched_domain [*].
 *
 * Take for instance a 2 threaded, 2 core, 2 cache cluster part:
 *
 * CPU   0   1   2   3   4   5   6   7
 *
 * DIE  [                             ]
 * MC   [             ] [             ]
 * SMT  [     ] [     ] [     ] [     ]
 *
 *  - or -
 *
 * DIE  0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
 * MC   0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
 * SMT  0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
 *
 * CPU   0   1   2   3   4   5   6   7
 *
 * One way to think about it is: sched_domain moves you up and down among these
 * topology levels, while sched_group moves you sideways through it, at child
 * domain granularity.
 *
 * sched_group_capacity ensures each unique sched_group has shared storage.
 *
 * There are two related construction problems, both of which require a CPU
 * that uniquely identifies each group (for a given domain):
 *
 *  - The first is the balance_cpu (see should_we_balance() and the
 *    load-balance blurb in fair.c); for each group we only want 1 CPU to
 *    continue balancing at a higher domain.
 *
 *  - The second is the sched_group_capacity; we want all identical groups
 *    to share a single sched_group_capacity.
 *
 * Since these topologies are exclusive by construction -- that is, it is
 * impossible for an SMT thread to belong to multiple cores, or for cores to
 * be part of multiple caches -- there is a very clear and unique location
 * for each CPU in the hierarchy.
 *
 * Therefore computing a unique CPU for each group is trivial (the iteration
 * mask is redundant and set all 1s; all CPUs in a group will end up at _that_
 * group), we can simply pick the first CPU in each group.
 *
 *
 * [*] in other words, the first group of each domain is its child domain.
 */

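/*
 * Return the canonical CPU identifying the group that @cpu belongs to at
 * this topology level (the first CPU of the child domain's span), and
 * optionally hook up the group and its sched_group_capacity via @sg.
 */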
static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
{
	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
	struct sched_domain *child = sd->child;

	if (child)
		cpu = cpumask_first(sched_domain_span(child));

	if (sg) {
		*sg = *per_cpu_ptr(sdd->sg, cpu);
		(*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);

		/* For claim_allocations: */
		atomic_set(&(*sg)->sgc->ref, 1);
	}

	return cpu;
}

/*
 * build_sched_groups will build a circular linked list of the groups
 * covered by the given span, and will set each group's ->cpumask correctly,
 * and ->cpu_capacity to 0.
 *
 * Assumes the sched_domain tree is fully constructed
 */
static int
build_sched_groups(struct sched_domain *sd, int cpu)
{
	struct sched_group *first = NULL, *last = NULL;
	struct sd_data *sdd = sd->private;
	const struct cpumask *span = sched_domain_span(sd);
	struct cpumask *covered;
	int i;

	get_group(cpu, sdd, &sd->groups);
	atomic_inc(&sd->groups->ref);

	if (cpu != cpumask_first(span))
		return 0;

	lockdep_assert_held(&sched_domains_mutex);
	covered = sched_domains_tmpmask;

	cpumask_clear(covered);

	for_each_cpu(i, span) {
		struct sched_group *sg;
		int group, j;

		if (cpumask_test_cpu(i, covered))
			continue;

		group = get_group(i, sdd, &sg);
		cpumask_setall(sched_group_mask(sg));

		for_each_cpu(j, span) {
			if (get_group(j, sdd, NULL) != group)
				continue;

			cpumask_set_cpu(j, covered);
			cpumask_set_cpu(j, sched_group_cpus(sg));
		}

		if (!first)
			first = sg;
		if (last)
			last->next = sg;
		last = sg;
	}
	last->next = first;

	return 0;
}

/*
 * Initialize sched groups cpu_capacity.
 *
 * cpu_capacity indicates the capacity of a sched group, which is used while
 * distributing the load between different sched groups in a sched domain.
 * Typically cpu_capacity for all the groups in a sched domain will be the same
 * unless there are asymmetries in the topology. If there are asymmetries,
 * the group having more cpu_capacity will pick up more load compared to the
 * group having less cpu_capacity.
 */
static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
{
	struct sched_group *sg = sd->groups;

	WARN_ON(!sg);

	do {
		int cpu, max_cpu = -1;

		sg->group_weight = cpumask_weight(sched_group_cpus(sg));

		if (!(sd->flags & SD_ASYM_PACKING))
			goto next;

		for_each_cpu(cpu, sched_group_cpus(sg)) {
			if (max_cpu < 0)
				max_cpu = cpu;
			else if (sched_asym_prefer(cpu, max_cpu))
				max_cpu = cpu;
		}
		sg->asym_prefer_cpu = max_cpu;

next:
		sg = sg->next;
	} while (sg != sd->groups);

	if (cpu != group_balance_cpu(sg))
		return;

	update_group_capacity(sd, cpu);
}

/*
 * Initializers for schedule domains
 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
 */

static int default_relax_domain_level = -1;
int sched_domain_level_max;

static int __init setup_relax_domain_level(char *str)
{
	if (kstrtoint(str, 0, &default_relax_domain_level))
		pr_warn("Unable to set relax_domain_level\n");

	return 1;
}
__setup("relax_domain_level=", setup_relax_domain_level);

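/*
 * relax_domain_level (from the boot parameter or a domain attribute) sets
 * the highest domain level up to which newly-idle and wakeup balancing are
 * enabled; domains above the requested level get them turned off.
 */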
static void set_domain_attribute(struct sched_domain *sd,
				 struct sched_domain_attr *attr)
{
	int request;

	if (!attr || attr->relax_domain_level < 0) {
		if (default_relax_domain_level < 0)
			return;
		else
			request = default_relax_domain_level;
	} else
		request = attr->relax_domain_level;
	if (request < sd->level) {
		/* Turn off idle balance on this domain: */
		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
	} else {
		/* Turn on idle balance on this domain: */
		sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
	}
}

static void __sdt_free(const struct cpumask *cpu_map);
static int __sdt_alloc(const struct cpumask *cpu_map);

static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
				 const struct cpumask *cpu_map)
{
	switch (what) {
	case sa_rootdomain:
		if (!atomic_read(&d->rd->refcount))
			free_rootdomain(&d->rd->rcu);
		/* Fall through */
	case sa_sd:
		free_percpu(d->sd);
		/* Fall through */
	case sa_sd_storage:
		__sdt_free(cpu_map);
		/* Fall through */
	case sa_none:
		break;
	}
}

static enum s_alloc
__visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
{
	memset(d, 0, sizeof(*d));

	if (__sdt_alloc(cpu_map))
		return sa_sd_storage;
	d->sd = alloc_percpu(struct sched_domain *);
	if (!d->sd)
		return sa_sd_storage;
	d->rd = alloc_rootdomain();
	if (!d->rd)
		return sa_sd;
	return sa_rootdomain;
}

/*
 * NULL the sd_data elements we've used to build the sched_domain and
 * sched_group structure so that the subsequent __free_domain_allocs()
 * will not free the data we're using.
 */
static void claim_allocations(int cpu, struct sched_domain *sd)
{
	struct sd_data *sdd = sd->private;

	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
	*per_cpu_ptr(sdd->sd, cpu) = NULL;

	if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
		*per_cpu_ptr(sdd->sds, cpu) = NULL;

	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
		*per_cpu_ptr(sdd->sg, cpu) = NULL;

	if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
		*per_cpu_ptr(sdd->sgc, cpu) = NULL;
}

#ifdef CONFIG_NUMA
static int sched_domains_numa_levels;
enum numa_topology_type sched_numa_topology_type;
static int *sched_domains_numa_distance;
int sched_max_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
static int sched_domains_curr_level;
#endif

/*
 * SD_flags allowed in topology descriptions.
 *
 * These flags are purely descriptive of the topology and do not prescribe
 * behaviour. Behaviour is artificial and mapped in the below sd_init()
 * function:
 *
 *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
 *   SD_SHARE_PKG_RESOURCES - describes shared caches
 *   SD_NUMA                - describes NUMA topologies
 *   SD_SHARE_POWERDOMAIN   - describes shared power domain
 *   SD_ASYM_CPUCAPACITY    - describes mixed capacity topologies
 *
 * The odd one out, which besides describing the topology also prescribes
 * the desired behaviour that goes along with it:
 *
 *   SD_ASYM_PACKING        - describes SMT quirks
 */
#define TOPOLOGY_SD_FLAGS		\
	(SD_SHARE_CPUCAPACITY |		\
	 SD_SHARE_PKG_RESOURCES |	\
	 SD_NUMA |			\
	 SD_ASYM_PACKING |		\
	 SD_ASYM_CPUCAPACITY |		\
	 SD_SHARE_POWERDOMAIN)

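/*
 * Build and initialize one sched_domain from a topology level: size the
 * balancing intervals from the level's span, sanity-check the topology
 * flags, and then translate those flags into concrete behaviour below.
 */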
static struct sched_domain *
sd_init(struct sched_domain_topology_level *tl,
	const struct cpumask *cpu_map,
	struct sched_domain *child, int cpu)
{
	struct sd_data *sdd = &tl->data;
	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
	int sd_id, sd_weight, sd_flags = 0;

#ifdef CONFIG_NUMA
	/*
	 * Ugly hack to pass state to sd_numa_mask()...
	 */
	sched_domains_curr_level = tl->numa_level;
#endif

	sd_weight = cpumask_weight(tl->mask(cpu));

	if (tl->sd_flags)
		sd_flags = (*tl->sd_flags)();
	if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
			"wrong sd_flags in topology description\n"))
		sd_flags &= ~TOPOLOGY_SD_FLAGS;

	*sd = (struct sched_domain){
		.min_interval		= sd_weight,
		.max_interval		= 2*sd_weight,
		.busy_factor		= 32,
		.imbalance_pct		= 125,

		.cache_nice_tries	= 0,
		.busy_idx		= 0,
		.idle_idx		= 0,
		.newidle_idx		= 0,
		.wake_idx		= 0,
		.forkexec_idx		= 0,

		.flags			= 1*SD_LOAD_BALANCE
					| 1*SD_BALANCE_NEWIDLE
					| 1*SD_BALANCE_EXEC
					| 1*SD_BALANCE_FORK
					| 0*SD_BALANCE_WAKE
					| 1*SD_WAKE_AFFINE
					| 0*SD_SHARE_CPUCAPACITY
					| 0*SD_SHARE_PKG_RESOURCES
					| 0*SD_SERIALIZE
					| 0*SD_PREFER_SIBLING
					| 0*SD_NUMA
					| sd_flags
					,

		.last_balance		= jiffies,
		.balance_interval	= sd_weight,
		.smt_gain		= 0,
		.max_newidle_lb_cost	= 0,
		.next_decay_max_lb_cost	= jiffies,
		.child			= child,
#ifdef CONFIG_SCHED_DEBUG
		.name			= tl->name,
#endif
	};

	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
	sd_id = cpumask_first(sched_domain_span(sd));

	/*
	 * Convert topological properties into behaviour.
	 */

	if (sd->flags & SD_ASYM_CPUCAPACITY) {
		struct sched_domain *t = sd;

		for_each_lower_domain(t)
			t->flags |= SD_BALANCE_WAKE;
	}

	if (sd->flags & SD_SHARE_CPUCAPACITY) {
		sd->flags |= SD_PREFER_SIBLING;
		sd->imbalance_pct = 110;
		sd->smt_gain = 1178; /* ~15% */

	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
		sd->imbalance_pct = 117;
		sd->cache_nice_tries = 1;
		sd->busy_idx = 2;

#ifdef CONFIG_NUMA
	} else if (sd->flags & SD_NUMA) {
		sd->cache_nice_tries = 2;
		sd->busy_idx = 3;
		sd->idle_idx = 2;

		sd->flags |= SD_SERIALIZE;
		if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
			sd->flags &= ~(SD_BALANCE_EXEC |
				       SD_BALANCE_FORK |
				       SD_WAKE_AFFINE);
		}

#endif
	} else {
		sd->flags |= SD_PREFER_SIBLING;
		sd->cache_nice_tries = 1;
		sd->busy_idx = 2;
		sd->idle_idx = 1;
	}

	/*
	 * For all levels sharing cache; connect a sched_domain_shared
	 * instance.
	 */
	if (sd->flags & SD_SHARE_PKG_RESOURCES) {
		sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
		atomic_inc(&sd->shared->ref);
		atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
	}

	sd->private = sdd;

	return sd;
}

/*
 * Topology list, bottom-up.
 */
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ NULL, },
};

static struct sched_domain_topology_level *sched_domain_topology =
	default_topology;

#define for_each_sd_topology(tl)			\
	for (tl = sched_domain_topology; tl->mask; tl++)

void set_sched_topology(struct sched_domain_topology_level *tl)
{
	if (WARN_ON_ONCE(sched_smp_initialized))
		return;

	sched_domain_topology = tl;
}

#ifdef CONFIG_NUMA

static const struct cpumask *sd_numa_mask(int cpu)
{
	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
}

static void sched_numa_warn(const char *str)
{
	static int done = false;
	int i, j;

	if (done)
		return;

	done = true;

	printk(KERN_WARNING "ERROR: %s\n\n", str);

	for (i = 0; i < nr_node_ids; i++) {
		printk(KERN_WARNING "  ");
		for (j = 0; j < nr_node_ids; j++)
			printk(KERN_CONT "%02d ", node_distance(i, j));
		printk(KERN_CONT "\n");
	}
	printk(KERN_WARNING "\n");
}

bool find_numa_distance(int distance)
{
	int i;

	if (distance == node_distance(0, 0))
		return true;

	for (i = 0; i < sched_domains_numa_levels; i++) {
		if (sched_domains_numa_distance[i] == distance)
			return true;
	}

	return false;
}

/*
 * A system can have three types of NUMA topology:
 * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
 * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
 * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
 *
 * The difference between a glueless mesh topology and a backplane
 * topology lies in whether communication between not directly
 * connected nodes goes through intermediary nodes (where programs
 * could run), or through backplane controllers. This affects
 * placement of programs.
 *
 * The type of topology can be discerned with the following tests:
 * - If the maximum distance between any nodes is 1 hop, the system
 *   is directly connected.
 * - If for two nodes A and B, located N > 1 hops away from each other,
 *   there is an intermediary node C, which is < N hops away from both
 *   nodes A and B, the system is a glueless mesh.
 */
static void init_numa_topology_type(void)
{
	int a, b, c, n;

	n = sched_max_numa_distance;

	if (sched_domains_numa_levels <= 1) {
		sched_numa_topology_type = NUMA_DIRECT;
		return;
	}

	for_each_online_node(a) {
		for_each_online_node(b) {
			/* Find two nodes furthest removed from each other. */
			if (node_distance(a, b) < n)
				continue;

			/* Is there an intermediary node between a and b? */
			for_each_online_node(c) {
				if (node_distance(a, c) < n &&
				    node_distance(b, c) < n) {
					sched_numa_topology_type =
							NUMA_GLUELESS_MESH;
					return;
				}
			}

			sched_numa_topology_type = NUMA_BACKPLANE;
			return;
		}
	}
}

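/*
 * Detect the distinct node distances in the node_distance() table, build a
 * cpumask per node and per distance level, and append one NUMA level per
 * distance to the topology list used by build_sched_domains().
 */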
void sched_init_numa(void)
{
	int next_distance, curr_distance = node_distance(0, 0);
	struct sched_domain_topology_level *tl;
	int level = 0;
	int i, j, k;

	sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
	if (!sched_domains_numa_distance)
		return;

	/*
	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
	 * unique distances in the node_distance() table.
	 *
	 * Assumes node_distance(0,j) includes all distances in
	 * node_distance(i,j) in order to avoid cubic time.
	 */
	next_distance = curr_distance;
	for (i = 0; i < nr_node_ids; i++) {
		for (j = 0; j < nr_node_ids; j++) {
			for (k = 0; k < nr_node_ids; k++) {
				int distance = node_distance(i, k);

				if (distance > curr_distance &&
				    (distance < next_distance ||
				     next_distance == curr_distance))
					next_distance = distance;

				/*
				 * While not a strong assumption it would be nice to know
				 * about cases where if node A is connected to B, B is not
				 * equally connected to A.
				 */
				if (sched_debug() && node_distance(k, i) != distance)
					sched_numa_warn("Node-distance not symmetric");

				if (sched_debug() && i && !find_numa_distance(distance))
					sched_numa_warn("Node-0 not representative");
			}
			if (next_distance != curr_distance) {
				sched_domains_numa_distance[level++] = next_distance;
				sched_domains_numa_levels = level;
				curr_distance = next_distance;
			} else break;
		}

		/*
		 * In case of sched_debug() we verify the above assumption.
		 */
		if (!sched_debug())
			break;
	}

	if (!level)
		return;

	/*
	 * 'level' contains the number of unique distances, excluding the
	 * identity distance node_distance(i,i).
	 *
	 * The sched_domains_numa_distance[] array includes the actual distance
	 * numbers.
	 */

	/*
	 * Here, we should temporarily reset sched_domains_numa_levels to 0.
	 * If it fails to allocate memory for array sched_domains_numa_masks[][],
	 * the array will contain fewer than 'level' members. This could be
	 * dangerous when we use it to iterate array sched_domains_numa_masks[][]
	 * in other functions.
	 *
	 * We reset it to 'level' at the end of this function.
	 */
	sched_domains_numa_levels = 0;

	sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
	if (!sched_domains_numa_masks)
		return;

	/*
	 * Now for each level, construct a mask per node which contains all
	 * CPUs of nodes that are that many hops away from us.
	 */
	for (i = 0; i < level; i++) {
		sched_domains_numa_masks[i] =
			kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
		if (!sched_domains_numa_masks[i])
			return;

		for (j = 0; j < nr_node_ids; j++) {
			struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
			if (!mask)
				return;

			sched_domains_numa_masks[i][j] = mask;

			for_each_node(k) {
				if (node_distance(j, k) > sched_domains_numa_distance[i])
					continue;

				cpumask_or(mask, mask, cpumask_of_node(k));
			}
		}
	}

	/* Compute default topology size */
	for (i = 0; sched_domain_topology[i].mask; i++);

	tl = kzalloc((i + level + 1) *
			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
	if (!tl)
		return;

	/*
	 * Copy the default topology bits..
	 */
	for (i = 0; sched_domain_topology[i].mask; i++)
		tl[i] = sched_domain_topology[i];

	/*
	 * .. and append 'j' levels of NUMA goodness.
	 */
	for (j = 0; j < level; i++, j++) {
		tl[i] = (struct sched_domain_topology_level){
			.mask = sd_numa_mask,
			.sd_flags = cpu_numa_flags,
			.flags = SDTL_OVERLAP,
			.numa_level = j,
			SD_INIT_NAME(NUMA)
		};
	}

	sched_domain_topology = tl;

	sched_domains_numa_levels = level;
	sched_max_numa_distance = sched_domains_numa_distance[level - 1];

	init_numa_topology_type();
}

void sched_domains_numa_masks_set(unsigned int cpu)
{
	int node = cpu_to_node(cpu);
	int i, j;

	for (i = 0; i < sched_domains_numa_levels; i++) {
		for (j = 0; j < nr_node_ids; j++) {
			if (node_distance(j, node) <= sched_domains_numa_distance[i])
				cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
		}
	}
}

void sched_domains_numa_masks_clear(unsigned int cpu)
{
	int i, j;

	for (i = 0; i < sched_domains_numa_levels; i++) {
		for (j = 0; j < nr_node_ids; j++)
			cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
	}
}

#endif /* CONFIG_NUMA */

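/*
 * Allocate the per-CPU sched_domain, sched_domain_shared, sched_group and
 * sched_group_capacity storage for every topology level; __sdt_free()
 * releases whatever claim_allocations() has not claimed.
 */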
static int __sdt_alloc(const struct cpumask *cpu_map)
{
	struct sched_domain_topology_level *tl;
	int j;

	for_each_sd_topology(tl) {
		struct sd_data *sdd = &tl->data;

		sdd->sd = alloc_percpu(struct sched_domain *);
		if (!sdd->sd)
			return -ENOMEM;

		sdd->sds = alloc_percpu(struct sched_domain_shared *);
		if (!sdd->sds)
			return -ENOMEM;

		sdd->sg = alloc_percpu(struct sched_group *);
		if (!sdd->sg)
			return -ENOMEM;

		sdd->sgc = alloc_percpu(struct sched_group_capacity *);
		if (!sdd->sgc)
			return -ENOMEM;

		for_each_cpu(j, cpu_map) {
			struct sched_domain *sd;
			struct sched_domain_shared *sds;
			struct sched_group *sg;
			struct sched_group_capacity *sgc;

			sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
					GFP_KERNEL, cpu_to_node(j));
			if (!sd)
				return -ENOMEM;

			*per_cpu_ptr(sdd->sd, j) = sd;

			sds = kzalloc_node(sizeof(struct sched_domain_shared),
					GFP_KERNEL, cpu_to_node(j));
			if (!sds)
				return -ENOMEM;

			*per_cpu_ptr(sdd->sds, j) = sds;

			sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
					GFP_KERNEL, cpu_to_node(j));
			if (!sg)
				return -ENOMEM;

			sg->next = sg;

			*per_cpu_ptr(sdd->sg, j) = sg;

			sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
					GFP_KERNEL, cpu_to_node(j));
			if (!sgc)
				return -ENOMEM;

#ifdef CONFIG_SCHED_DEBUG
			sgc->id = j;
#endif

			*per_cpu_ptr(sdd->sgc, j) = sgc;
		}
	}

	return 0;
}

static void __sdt_free(const struct cpumask *cpu_map)
{
	struct sched_domain_topology_level *tl;
	int j;

	for_each_sd_topology(tl) {
		struct sd_data *sdd = &tl->data;

		for_each_cpu(j, cpu_map) {
			struct sched_domain *sd;

			if (sdd->sd) {
				sd = *per_cpu_ptr(sdd->sd, j);
				if (sd && (sd->flags & SD_OVERLAP))
					free_sched_groups(sd->groups, 0);
				kfree(*per_cpu_ptr(sdd->sd, j));
			}

			if (sdd->sds)
				kfree(*per_cpu_ptr(sdd->sds, j));
			if (sdd->sg)
				kfree(*per_cpu_ptr(sdd->sg, j));
			if (sdd->sgc)
				kfree(*per_cpu_ptr(sdd->sgc, j));
		}
		free_percpu(sdd->sd);
		sdd->sd = NULL;
		free_percpu(sdd->sds);
		sdd->sds = NULL;
		free_percpu(sdd->sg);
		sdd->sg = NULL;
		free_percpu(sdd->sgc);
		sdd->sgc = NULL;
	}
}

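/*
 * Build one sched_domain level for @cpu from topology level @tl and link it
 * above @child, growing the span if an architecture reports a child span
 * that is not a subset of its parent's.
 */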
struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
		struct sched_domain *child, int cpu)
{
	struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);

	if (child) {
		sd->level = child->level + 1;
		sched_domain_level_max = max(sched_domain_level_max, sd->level);
		child->parent = sd;

		if (!cpumask_subset(sched_domain_span(child),
				    sched_domain_span(sd))) {
			pr_err("BUG: arch topology borken\n");
#ifdef CONFIG_SCHED_DEBUG
			pr_err("     the %s domain not a subset of the %s domain\n",
					child->name, sd->name);
#endif
			/* Fixup, ensure @sd has at least @child cpus. */
			cpumask_or(sched_domain_span(sd),
				   sched_domain_span(sd),
				   sched_domain_span(child));
		}

	}
	set_domain_attribute(sd, attr);

	return sd;
}

/*
 * Build sched domains for a given set of CPUs and attach the sched domains
 * to the individual CPUs
 */
static int
build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
{
	enum s_alloc alloc_state;
	struct sched_domain *sd;
	struct s_data d;
	struct rq *rq = NULL;
	int i, ret = -ENOMEM;

	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
	if (alloc_state != sa_rootdomain)
		goto error;

	/* Set up domains for CPUs specified by the cpu_map: */
	for_each_cpu(i, cpu_map) {
		struct sched_domain_topology_level *tl;

		sd = NULL;
		for_each_sd_topology(tl) {
			sd = build_sched_domain(tl, cpu_map, attr, sd, i);
			if (tl == sched_domain_topology)
				*per_cpu_ptr(d.sd, i) = sd;
			if (tl->flags & SDTL_OVERLAP)
				sd->flags |= SD_OVERLAP;
			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
				break;
		}
	}

	/* Build the groups for the domains */
	for_each_cpu(i, cpu_map) {
		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
			sd->span_weight = cpumask_weight(sched_domain_span(sd));
			if (sd->flags & SD_OVERLAP) {
				if (build_overlap_sched_groups(sd, i))
					goto error;
			} else {
				if (build_sched_groups(sd, i))
					goto error;
			}
		}
	}

	/* Calculate CPU capacity for physical packages and nodes */
	for (i = nr_cpumask_bits-1; i >= 0; i--) {
		if (!cpumask_test_cpu(i, cpu_map))
			continue;

		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
			claim_allocations(i, sd);
			init_sched_groups_capacity(i, sd);
		}
	}

	/* Attach the domains */
	rcu_read_lock();
	for_each_cpu(i, cpu_map) {
		rq = cpu_rq(i);
		sd = *per_cpu_ptr(d.sd, i);

		/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
		if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
			WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);

		cpu_attach_domain(sd, d.rd, i);
	}
	rcu_read_unlock();

	if (rq && sched_debug_enabled) {
		pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
			cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
	}

	ret = 0;
error:
	__free_domain_allocs(&d, alloc_state, cpu_map);
	return ret;
}

/* Current sched domains: */
static cpumask_var_t *doms_cur;

/* Number of sched domains in 'doms_cur': */
static int ndoms_cur;

/* Attributes of custom domains in 'doms_cur' */
static struct sched_domain_attr *dattr_cur;

/*
 * Special case: If a kmalloc() of a doms_cur partition (array of
 * cpumask) fails, then fallback to a single sched domain,
 * as determined by the single cpumask fallback_doms.
 */
static cpumask_var_t fallback_doms;

/*
 * arch_update_cpu_topology lets virtualized architectures update the
 * CPU core maps. It is supposed to return 1 if the topology changed
 * or 0 if it stayed the same.
 */
int __weak arch_update_cpu_topology(void)
{
	return 0;
}

cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
{
	int i;
	cpumask_var_t *doms;

	doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
	if (!doms)
		return NULL;
	for (i = 0; i < ndoms; i++) {
		if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
			free_sched_domains(doms, i);
			return NULL;
		}
	}
	return doms;
}

void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
{
	unsigned int i;
	for (i = 0; i < ndoms; i++)
		free_cpumask_var(doms[i]);
	kfree(doms);
}

/*
 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
 * For now this just excludes isolated CPUs, but could be used to
 * exclude other special cases in the future.
 */
int sched_init_domains(const struct cpumask *cpu_map)
{
	int err;

	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
	zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
	zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);

	arch_update_cpu_topology();
	ndoms_cur = 1;
	doms_cur = alloc_sched_domains(ndoms_cur);
	if (!doms_cur)
		doms_cur = &fallback_doms;
	cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
	err = build_sched_domains(doms_cur[0], NULL);
	register_sched_domain_sysctl();

	return err;
}

/*
 * Detach sched domains from a group of CPUs specified in cpu_map
 * These CPUs will now be attached to the NULL domain
 */
static void detach_destroy_domains(const struct cpumask *cpu_map)
{
	int i;

	rcu_read_lock();
	for_each_cpu(i, cpu_map)
		cpu_attach_domain(NULL, &def_root_domain, i);
	rcu_read_unlock();
}

/* handle null as "default" */
static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
			struct sched_domain_attr *new, int idx_new)
{
	struct sched_domain_attr tmp;

	/* Fast path: */
	if (!new && !cur)
		return 1;

	tmp = SD_ATTR_INIT;
	return !memcmp(cur ? (cur + idx_cur) : &tmp,
			new ? (new + idx_new) : &tmp,
			sizeof(struct sched_domain_attr));
}

/*
 * Partition sched domains as specified by the 'ndoms_new'
 * cpumasks in the array doms_new[] of cpumasks. This compares
 * doms_new[] to the current sched domain partitioning, doms_cur[].
 * It destroys each deleted domain and builds each new domain.
 *
 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
 * The masks don't intersect (don't overlap.) We should setup one
 * sched domain for each mask. CPUs not in any of the cpumasks will
 * not be load balanced. If the same cpumask appears both in the
 * current 'doms_cur' domains and in the new 'doms_new', we can leave
 * it as it is.
 *
 * The passed in 'doms_new' should be allocated using
 * alloc_sched_domains. This routine takes ownership of it and will
 * free_sched_domains it when done with it. If the caller failed the
 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
 * and partition_sched_domains() will fallback to the single partition
 * 'fallback_doms'; it also forces the domains to be rebuilt.
 *
 * If doms_new == NULL it will be replaced with cpu_online_mask.
 * ndoms_new == 0 is a special case for destroying existing domains,
 * and it will not create the default domain.
 *
 * Call with hotplug lock held
 */
void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
			     struct sched_domain_attr *dattr_new)
{
	int i, j, n;
	int new_topology;

	mutex_lock(&sched_domains_mutex);

	/* Always unregister in case we don't destroy any domains: */
	unregister_sched_domain_sysctl();

	/* Let the architecture update CPU core mappings: */
	new_topology = arch_update_cpu_topology();

	n = doms_new ? ndoms_new : 0;

	/* Destroy deleted domains: */
	for (i = 0; i < ndoms_cur; i++) {
		for (j = 0; j < n && !new_topology; j++) {
			if (cpumask_equal(doms_cur[i], doms_new[j])
			    && dattrs_equal(dattr_cur, i, dattr_new, j))
				goto match1;
		}
		/* No match - a current sched domain not in new doms_new[] */
		detach_destroy_domains(doms_cur[i]);
match1:
		;
	}

	n = ndoms_cur;
	if (doms_new == NULL) {
		n = 0;
		doms_new = &fallback_doms;
		cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
		WARN_ON_ONCE(dattr_new);
	}

	/* Build new domains: */
	for (i = 0; i < ndoms_new; i++) {
		for (j = 0; j < n && !new_topology; j++) {
			if (cpumask_equal(doms_new[i], doms_cur[j])
			    && dattrs_equal(dattr_new, i, dattr_cur, j))
				goto match2;
		}
		/* No match - add a new doms_new */
		build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
match2:
		;
	}

	/* Remember the new sched domains: */
	if (doms_cur != &fallback_doms)
		free_sched_domains(doms_cur, ndoms_cur);

	kfree(dattr_cur);
	doms_cur = doms_new;
	dattr_cur = dattr_new;
	ndoms_cur = ndoms_new;

	register_sched_domain_sysctl();

	mutex_unlock(&sched_domains_mutex);
}