X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=kernel%2Fsched.c;h=e03b45ccf789772c946562582cc4876dd7dc1da1;hb=eff766a65c60237bfa865160c3129de31fab591b;hp=770449bee6daa23fc1f71cde4fd47e95361023d1;hpb=02e2b83bd25bb05ac2e69cb31458b7d1b3c70707;p=mirror_ubuntu-kernels.git diff --git a/kernel/sched.c b/kernel/sched.c index 770449bee6da..e03b45ccf789 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -67,6 +67,7 @@ #include #include #include +#include #include #include @@ -115,6 +116,11 @@ unsigned long long __attribute__((weak)) sched_clock(void) */ #define DEF_TIMESLICE (100 * HZ / 1000) +/* + * single value that denotes runtime == period, ie unlimited time. + */ +#define RUNTIME_INF ((u64)~0ULL) + #ifdef CONFIG_SMP /* * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) @@ -156,6 +162,84 @@ struct rt_prio_array { struct list_head queue[MAX_RT_PRIO]; }; +struct rt_bandwidth { + /* nests inside the rq lock: */ + spinlock_t rt_runtime_lock; + ktime_t rt_period; + u64 rt_runtime; + struct hrtimer rt_period_timer; +}; + +static struct rt_bandwidth def_rt_bandwidth; + +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); + +static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) +{ + struct rt_bandwidth *rt_b = + container_of(timer, struct rt_bandwidth, rt_period_timer); + ktime_t now; + int overrun; + int idle = 0; + + for (;;) { + now = hrtimer_cb_get_time(timer); + overrun = hrtimer_forward(timer, now, rt_b->rt_period); + + if (!overrun) + break; + + idle = do_sched_rt_period_timer(rt_b, overrun); + } + + return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; +} + +static +void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) +{ + rt_b->rt_period = ns_to_ktime(period); + rt_b->rt_runtime = runtime; + + spin_lock_init(&rt_b->rt_runtime_lock); + + hrtimer_init(&rt_b->rt_period_timer, + CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rt_b->rt_period_timer.function = sched_rt_period_timer; + rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; +} + +static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + ktime_t now; + + if (rt_b->rt_runtime == RUNTIME_INF) + return; + + if (hrtimer_active(&rt_b->rt_period_timer)) + return; + + spin_lock(&rt_b->rt_runtime_lock); + for (;;) { + if (hrtimer_active(&rt_b->rt_period_timer)) + break; + + now = hrtimer_cb_get_time(&rt_b->rt_period_timer); + hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); + hrtimer_start(&rt_b->rt_period_timer, + rt_b->rt_period_timer.expires, + HRTIMER_MODE_ABS); + } + spin_unlock(&rt_b->rt_runtime_lock); +} + +#ifdef CONFIG_RT_GROUP_SCHED +static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + hrtimer_cancel(&rt_b->rt_period_timer); +} +#endif + #ifdef CONFIG_GROUP_SCHED #include @@ -182,29 +266,35 @@ struct task_group { struct sched_rt_entity **rt_se; struct rt_rq **rt_rq; - u64 rt_runtime; + struct rt_bandwidth rt_bandwidth; #endif struct rcu_head rcu; struct list_head list; }; +#ifdef CONFIG_USER_SCHED + +/* + * Root task group. + * Every UID task group (including init_task_group aka UID-0) will + * be a child to this group. + */ +struct task_group root_task_group; + #ifdef CONFIG_FAIR_GROUP_SCHED /* Default task group's sched entity on each cpu */ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; - -static struct sched_entity *init_sched_entity_p[NR_CPUS]; -static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; #endif #ifdef CONFIG_RT_GROUP_SCHED static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; - -static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; -static struct rt_rq *init_rt_rq_p[NR_CPUS]; +#endif +#else +#define root_task_group init_task_group #endif /* task_group_lock serializes add/remove of task groups and also changes to @@ -228,17 +318,7 @@ static int init_task_group_load = INIT_TASK_GROUP_LOAD; /* Default task group. * Every task in system belong to this group at bootup. */ -struct task_group init_task_group = { -#ifdef CONFIG_FAIR_GROUP_SCHED - .se = init_sched_entity_p, - .cfs_rq = init_cfs_rq_p, -#endif - -#ifdef CONFIG_RT_GROUP_SCHED - .rt_se = init_sched_rt_entity_p, - .rt_rq = init_rt_rq_p, -#endif -}; +struct task_group init_task_group; /* return group to which a task belongs */ static inline struct task_group *task_group(struct task_struct *p) @@ -335,6 +415,9 @@ struct rt_rq { #endif int rt_throttled; u64 rt_time; + u64 rt_runtime; + /* Nests inside the rq lock: */ + spinlock_t rt_runtime_lock; #ifdef CONFIG_RT_GROUP_SCHED unsigned long rt_nr_boosted; @@ -407,8 +490,6 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; - u64 rt_period_expire; - int rt_throttled; #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ @@ -592,23 +673,6 @@ static void update_rq_clock(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -unsigned long rt_needs_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - u64 delta; - - if (!rq->rt_throttled) - return 0; - - if (rq->clock > rq->rt_period_expire) - return 1; - - delta = rq->rt_period_expire - rq->clock; - do_div(delta, NSEC_PER_SEC / HZ); - - return (unsigned long)delta; -} - /* * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ @@ -630,6 +694,7 @@ enum { SCHED_FEAT_SYNC_WAKEUPS = 32, SCHED_FEAT_HRTICK = 64, SCHED_FEAT_DOUBLE_TICK = 128, + SCHED_FEAT_NORMALIZED_SLEEPER = 256, }; const_debug unsigned int sysctl_sched_features = @@ -640,7 +705,8 @@ const_debug unsigned int sysctl_sched_features = SCHED_FEAT_CACHE_HOT_BUDDY * 1 | SCHED_FEAT_SYNC_WAKEUPS * 1 | SCHED_FEAT_HRTICK * 1 | - SCHED_FEAT_DOUBLE_TICK * 0; + SCHED_FEAT_DOUBLE_TICK * 0 | + SCHED_FEAT_NORMALIZED_SLEEPER * 1; #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) @@ -664,10 +730,18 @@ static __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; -/* - * single value that denotes runtime == period, ie unlimited time. - */ -#define RUNTIME_INF ((u64)~0ULL) +static inline u64 global_rt_period(void) +{ + return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; +} + +static inline u64 global_rt_runtime(void) +{ + if (sysctl_sched_rt_period < 0) + return RUNTIME_INF; + + return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; +} static const unsigned long long time_sync_thresh = 100000; @@ -1811,17 +1885,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) * find_idlest_cpu - find the idlest cpu among the cpus in group. */ static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, + cpumask_t *tmp) { - cpumask_t tmp; unsigned long load, min_load = ULONG_MAX; int idlest = -1; int i; /* Traverse only the allowed CPUs */ - cpus_and(tmp, group->cpumask, p->cpus_allowed); + cpus_and(*tmp, group->cpumask, p->cpus_allowed); - for_each_cpu_mask(i, tmp) { + for_each_cpu_mask(i, *tmp) { load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { @@ -1860,7 +1934,7 @@ static int sched_balance_self(int cpu, int flag) } while (sd) { - cpumask_t span; + cpumask_t span, tmpmask; struct sched_group *group; int new_cpu, weight; @@ -1876,7 +1950,7 @@ static int sched_balance_self(int cpu, int flag) continue; } - new_cpu = find_idlest_cpu(group, t, cpu); + new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask); if (new_cpu == -1 || new_cpu == cpu) { /* Now try balancing at a lower domain level of cpu */ sd = sd->child; @@ -2760,7 +2834,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, cpumask_t *cpus, int *balance) + int *sd_idle, const cpumask_t *cpus, int *balance) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; @@ -3061,7 +3135,7 @@ ret: */ static struct rq * find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, - unsigned long imbalance, cpumask_t *cpus) + unsigned long imbalance, const cpumask_t *cpus) { struct rq *busiest = NULL, *rq; unsigned long max_load = 0; @@ -3100,15 +3174,16 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, */ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, - int *balance) + int *balance, cpumask_t *cpus) { int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; - cpumask_t cpus = CPU_MASK_ALL; unsigned long flags; + cpus_setall(*cpus); + /* * When power savings policy is enabled for the parent domain, idle * sibling can pick up load irrespective of busy siblings. In this case, @@ -3123,7 +3198,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, redo: group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, - &cpus, balance); + cpus, balance); if (*balance == 0) goto out_balanced; @@ -3133,7 +3208,7 @@ redo: goto out_balanced; } - busiest = find_busiest_queue(group, idle, imbalance, &cpus); + busiest = find_busiest_queue(group, idle, imbalance, cpus); if (!busiest) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; @@ -3166,8 +3241,8 @@ redo: /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), cpus); - if (!cpus_empty(cpus)) + cpu_clear(cpu_of(busiest), *cpus); + if (!cpus_empty(*cpus)) goto redo; goto out_balanced; } @@ -3252,7 +3327,8 @@ out_one_pinned: * this_rq is locked. */ static int -load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) +load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, + cpumask_t *cpus) { struct sched_group *group; struct rq *busiest = NULL; @@ -3260,7 +3336,8 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) int ld_moved = 0; int sd_idle = 0; int all_pinned = 0; - cpumask_t cpus = CPU_MASK_ALL; + + cpus_setall(*cpus); /* * When power savings policy is enabled for the parent domain, idle @@ -3275,14 +3352,13 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); redo: group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, - &sd_idle, &cpus, NULL); + &sd_idle, cpus, NULL); if (!group) { schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); goto out_balanced; } - busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, - &cpus); + busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus); if (!busiest) { schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); goto out_balanced; @@ -3304,8 +3380,8 @@ redo: spin_unlock(&busiest->lock); if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), cpus); - if (!cpus_empty(cpus)) + cpu_clear(cpu_of(busiest), *cpus); + if (!cpus_empty(*cpus)) goto redo; } } @@ -3339,6 +3415,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) struct sched_domain *sd; int pulled_task = -1; unsigned long next_balance = jiffies + HZ; + cpumask_t tmpmask; for_each_domain(this_cpu, sd) { unsigned long interval; @@ -3348,8 +3425,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq) if (sd->flags & SD_BALANCE_NEWIDLE) /* If we've pulled tasks over stop searching: */ - pulled_task = load_balance_newidle(this_cpu, - this_rq, sd); + pulled_task = load_balance_newidle(this_cpu, this_rq, + sd, &tmpmask); interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) @@ -3508,6 +3585,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; + cpumask_t tmp; for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -3531,7 +3609,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &balance)) { + if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@ -3647,7 +3725,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) */ int ilb = first_cpu(nohz.cpu_mask); - if (ilb != NR_CPUS) + if (ilb < nr_cpu_ids) resched_cpu(ilb); } } @@ -3854,7 +3932,6 @@ void scheduler_tick(void) update_last_tick_seen(rq); update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); - update_sched_rt_period(rq); spin_unlock(&rq->lock); #ifdef CONFIG_SMP @@ -4689,7 +4766,7 @@ recheck: * Do not allow realtime tasks into groups that have no runtime * assigned. */ - if (rt_policy(policy) && task_group(p)->rt_runtime == 0) + if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) return -EPERM; #endif @@ -4851,9 +4928,10 @@ out_unlock: return retval; } -long sched_setaffinity(pid_t pid, cpumask_t new_mask) +long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) { cpumask_t cpus_allowed; + cpumask_t new_mask = *in_mask; struct task_struct *p; int retval; @@ -4884,13 +4962,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) if (retval) goto out_unlock; - cpus_allowed = cpuset_cpus_allowed(p); + cpuset_cpus_allowed(p, &cpus_allowed); cpus_and(new_mask, new_mask, cpus_allowed); again: - retval = set_cpus_allowed(p, new_mask); + retval = set_cpus_allowed_ptr(p, &new_mask); if (!retval) { - cpus_allowed = cpuset_cpus_allowed(p); + cpuset_cpus_allowed(p, &cpus_allowed); if (!cpus_subset(new_mask, cpus_allowed)) { /* * We must have raced with a concurrent cpuset @@ -4934,7 +5012,7 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, if (retval) return retval; - return sched_setaffinity(pid, new_mask); + return sched_setaffinity(pid, &new_mask); } /* @@ -5396,7 +5474,6 @@ static inline void sched_init_granularity(void) sysctl_sched_latency = limit; sysctl_sched_wakeup_granularity *= factor; - sysctl_sched_batch_wakeup_granularity *= factor; } #ifdef CONFIG_SMP @@ -5425,7 +5502,7 @@ static inline void sched_init_granularity(void) * task must not exit() & deallocate itself prematurely. The * call is not atomic; no spinlocks may be held. */ -int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) +int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) { struct migration_req req; unsigned long flags; @@ -5433,23 +5510,23 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) int ret = 0; rq = task_rq_lock(p, &flags); - if (!cpus_intersects(new_mask, cpu_online_map)) { + if (!cpus_intersects(*new_mask, cpu_online_map)) { ret = -EINVAL; goto out; } if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, &new_mask); + p->sched_class->set_cpus_allowed(p, new_mask); else { - p->cpus_allowed = new_mask; - p->rt.nr_cpus_allowed = cpus_weight(new_mask); + p->cpus_allowed = *new_mask; + p->rt.nr_cpus_allowed = cpus_weight(*new_mask); } /* Can the task run on the task's current CPU? If so, we're done */ - if (cpu_isset(task_cpu(p), new_mask)) + if (cpu_isset(task_cpu(p), *new_mask)) goto out; - if (migrate_task(p, any_online_cpu(new_mask), &req)) { + if (migrate_task(p, any_online_cpu(*new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); @@ -5462,7 +5539,7 @@ out: return ret; } -EXPORT_SYMBOL_GPL(set_cpus_allowed); +EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); /* * Move (not current) task off this cpu, onto dest cpu. We're doing @@ -5600,12 +5677,14 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) dest_cpu = any_online_cpu(mask); /* On any allowed CPU? */ - if (dest_cpu == NR_CPUS) + if (dest_cpu >= nr_cpu_ids) dest_cpu = any_online_cpu(p->cpus_allowed); /* No more Mr. Nice Guy. */ - if (dest_cpu == NR_CPUS) { - cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p); + if (dest_cpu >= nr_cpu_ids) { + cpumask_t cpus_allowed; + + cpuset_cpus_allowed_locked(p, &cpus_allowed); /* * Try to stay on the same cpuset, where the * current cpuset may be a subset of all cpus. @@ -5641,7 +5720,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) */ static void migrate_nr_uninterruptible(struct rq *rq_src) { - struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); + struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR)); unsigned long flags; local_irq_save(flags); @@ -6053,20 +6132,16 @@ void __init migration_init(void) #ifdef CONFIG_SMP -/* Number of possible processor ids */ -int nr_cpu_ids __read_mostly = NR_CPUS; -EXPORT_SYMBOL(nr_cpu_ids); - #ifdef CONFIG_SCHED_DEBUG -static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) +static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, + cpumask_t *groupmask) { struct sched_group *group = sd->groups; - cpumask_t groupmask; - char str[NR_CPUS]; + char str[256]; - cpumask_scnprintf(str, NR_CPUS, sd->span); - cpus_clear(groupmask); + cpulist_scnprintf(str, sizeof(str), sd->span); + cpus_clear(*groupmask); printk(KERN_DEBUG "%*s domain %d: ", level, "", level); @@ -6110,25 +6185,25 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) break; } - if (cpus_intersects(groupmask, group->cpumask)) { + if (cpus_intersects(*groupmask, group->cpumask)) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); break; } - cpus_or(groupmask, groupmask, group->cpumask); + cpus_or(*groupmask, *groupmask, group->cpumask); - cpumask_scnprintf(str, NR_CPUS, group->cpumask); + cpulist_scnprintf(str, sizeof(str), group->cpumask); printk(KERN_CONT " %s", str); group = group->next; } while (group != sd->groups); printk(KERN_CONT "\n"); - if (!cpus_equal(sd->span, groupmask)) + if (!cpus_equal(sd->span, *groupmask)) printk(KERN_ERR "ERROR: groups don't span domain->span\n"); - if (sd->parent && !cpus_subset(groupmask, sd->parent->span)) + if (sd->parent && !cpus_subset(*groupmask, sd->parent->span)) printk(KERN_ERR "ERROR: parent span is not a superset " "of domain->span\n"); return 0; @@ -6136,6 +6211,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) static void sched_domain_debug(struct sched_domain *sd, int cpu) { + cpumask_t *groupmask; int level = 0; if (!sd) { @@ -6145,14 +6221,21 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + if (!groupmask) { + printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); + return; + } + for (;;) { - if (sched_domain_debug_one(sd, cpu, level)) + if (sched_domain_debug_one(sd, cpu, level, groupmask)) break; level++; sd = sd->parent; if (!sd) break; } + kfree(groupmask); } #else # define sched_domain_debug(sd, cpu) do { } while (0) @@ -6340,30 +6423,33 @@ __setup("isolcpus=", isolated_cpu_setup); * and ->cpu_power to 0. */ static void -init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, +init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, int (*group_fn)(int cpu, const cpumask_t *cpu_map, - struct sched_group **sg)) + struct sched_group **sg, + cpumask_t *tmpmask), + cpumask_t *covered, cpumask_t *tmpmask) { struct sched_group *first = NULL, *last = NULL; - cpumask_t covered = CPU_MASK_NONE; int i; - for_each_cpu_mask(i, span) { + cpus_clear(*covered); + + for_each_cpu_mask(i, *span) { struct sched_group *sg; - int group = group_fn(i, cpu_map, &sg); + int group = group_fn(i, cpu_map, &sg, tmpmask); int j; - if (cpu_isset(i, covered)) + if (cpu_isset(i, *covered)) continue; - sg->cpumask = CPU_MASK_NONE; + cpus_clear(sg->cpumask); sg->__cpu_power = 0; - for_each_cpu_mask(j, span) { - if (group_fn(j, cpu_map, NULL) != group) + for_each_cpu_mask(j, *span) { + if (group_fn(j, cpu_map, NULL, tmpmask) != group) continue; - cpu_set(j, covered); + cpu_set(j, *covered); cpu_set(j, sg->cpumask); } if (!first) @@ -6389,7 +6475,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, * * Should use nodemask_t. */ -static int find_next_best_node(int node, unsigned long *used_nodes) +static int find_next_best_node(int node, nodemask_t *used_nodes) { int i, n, val, min_val, best_node = 0; @@ -6403,7 +6489,7 @@ static int find_next_best_node(int node, unsigned long *used_nodes) continue; /* Skip already used nodes */ - if (test_bit(n, used_nodes)) + if (node_isset(n, *used_nodes)) continue; /* Simple min distance search */ @@ -6415,40 +6501,36 @@ static int find_next_best_node(int node, unsigned long *used_nodes) } } - set_bit(best_node, used_nodes); + node_set(best_node, *used_nodes); return best_node; } /** * sched_domain_node_span - get a cpumask for a node's sched_domain * @node: node whose cpumask we're constructing - * @size: number of nodes to include in this span * * Given a node, construct a good cpumask for its sched_domain to span. It * should be one that prevents unnecessary balancing, but also spreads tasks * out optimally. */ -static cpumask_t sched_domain_node_span(int node) +static void sched_domain_node_span(int node, cpumask_t *span) { - DECLARE_BITMAP(used_nodes, MAX_NUMNODES); - cpumask_t span, nodemask; + nodemask_t used_nodes; + node_to_cpumask_ptr(nodemask, node); int i; - cpus_clear(span); - bitmap_zero(used_nodes, MAX_NUMNODES); + cpus_clear(*span); + nodes_clear(used_nodes); - nodemask = node_to_cpumask(node); - cpus_or(span, span, nodemask); - set_bit(node, used_nodes); + cpus_or(*span, *span, *nodemask); + node_set(node, used_nodes); for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { - int next_node = find_next_best_node(node, used_nodes); + int next_node = find_next_best_node(node, &used_nodes); - nodemask = node_to_cpumask(next_node); - cpus_or(span, span, nodemask); + node_to_cpumask_ptr_next(nodemask, next_node); + cpus_or(*span, *span, *nodemask); } - - return span; } #endif @@ -6462,7 +6544,8 @@ static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); static int -cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) +cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, + cpumask_t *unused) { if (sg) *sg = &per_cpu(sched_group_cpus, cpu); @@ -6480,19 +6563,22 @@ static DEFINE_PER_CPU(struct sched_group, sched_group_core); #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) static int -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) +cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, + cpumask_t *mask) { int group; - cpumask_t mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(mask, mask, *cpu_map); - group = first_cpu(mask); + + *mask = per_cpu(cpu_sibling_map, cpu); + cpus_and(*mask, *mask, *cpu_map); + group = first_cpu(*mask); if (sg) *sg = &per_cpu(sched_group_core, group); return group; } #elif defined(CONFIG_SCHED_MC) static int -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) +cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, + cpumask_t *unused) { if (sg) *sg = &per_cpu(sched_group_core, cpu); @@ -6504,17 +6590,18 @@ static DEFINE_PER_CPU(struct sched_domain, phys_domains); static DEFINE_PER_CPU(struct sched_group, sched_group_phys); static int -cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) +cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, + cpumask_t *mask) { int group; #ifdef CONFIG_SCHED_MC - cpumask_t mask = cpu_coregroup_map(cpu); - cpus_and(mask, mask, *cpu_map); - group = first_cpu(mask); + *mask = cpu_coregroup_map(cpu); + cpus_and(*mask, *mask, *cpu_map); + group = first_cpu(*mask); #elif defined(CONFIG_SCHED_SMT) - cpumask_t mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(mask, mask, *cpu_map); - group = first_cpu(mask); + *mask = per_cpu(cpu_sibling_map, cpu); + cpus_and(*mask, *mask, *cpu_map); + group = first_cpu(*mask); #else group = cpu; #endif @@ -6530,19 +6617,19 @@ cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) * gets dynamically allocated. */ static DEFINE_PER_CPU(struct sched_domain, node_domains); -static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; +static struct sched_group ***sched_group_nodes_bycpu; static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, - struct sched_group **sg) + struct sched_group **sg, cpumask_t *nodemask) { - cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); int group; - cpus_and(nodemask, nodemask, *cpu_map); - group = first_cpu(nodemask); + *nodemask = node_to_cpumask(cpu_to_node(cpu)); + cpus_and(*nodemask, *nodemask, *cpu_map); + group = first_cpu(*nodemask); if (sg) *sg = &per_cpu(sched_group_allnodes, group); @@ -6578,7 +6665,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) #ifdef CONFIG_NUMA /* Free memory allocated for various sched_group structures */ -static void free_sched_groups(const cpumask_t *cpu_map) +static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) { int cpu, i; @@ -6590,11 +6677,11 @@ static void free_sched_groups(const cpumask_t *cpu_map) continue; for (i = 0; i < MAX_NUMNODES; i++) { - cpumask_t nodemask = node_to_cpumask(i); struct sched_group *oldsg, *sg = sched_group_nodes[i]; - cpus_and(nodemask, nodemask, *cpu_map); - if (cpus_empty(nodemask)) + *nodemask = node_to_cpumask(i); + cpus_and(*nodemask, *nodemask, *cpu_map); + if (cpus_empty(*nodemask)) continue; if (sg == NULL) @@ -6612,7 +6699,7 @@ next_sg: } } #else -static void free_sched_groups(const cpumask_t *cpu_map) +static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) { } #endif @@ -6669,6 +6756,65 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) } while (group != child->groups); } +/* + * Initializers for schedule domains + * Non-inlined to reduce accumulated stack pressure in build_sched_domains() + */ + +#define SD_INIT(sd, type) sd_init_##type(sd) +#define SD_INIT_FUNC(type) \ +static noinline void sd_init_##type(struct sched_domain *sd) \ +{ \ + memset(sd, 0, sizeof(*sd)); \ + *sd = SD_##type##_INIT; \ +} + +SD_INIT_FUNC(CPU) +#ifdef CONFIG_NUMA + SD_INIT_FUNC(ALLNODES) + SD_INIT_FUNC(NODE) +#endif +#ifdef CONFIG_SCHED_SMT + SD_INIT_FUNC(SIBLING) +#endif +#ifdef CONFIG_SCHED_MC + SD_INIT_FUNC(MC) +#endif + +/* + * To minimize stack usage kmalloc room for cpumasks and share the + * space as the usage in build_sched_domains() dictates. Used only + * if the amount of space is significant. + */ +struct allmasks { + cpumask_t tmpmask; /* make this one first */ + union { + cpumask_t nodemask; + cpumask_t this_sibling_map; + cpumask_t this_core_map; + }; + cpumask_t send_covered; + +#ifdef CONFIG_NUMA + cpumask_t domainspan; + cpumask_t covered; + cpumask_t notcovered; +#endif +}; + +#if NR_CPUS > 128 +#define SCHED_CPUMASK_ALLOC 1 +#define SCHED_CPUMASK_FREE(v) kfree(v) +#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v +#else +#define SCHED_CPUMASK_ALLOC 0 +#define SCHED_CPUMASK_FREE(v) +#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v +#endif + +#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ + ((unsigned long)(a) + offsetof(struct allmasks, v)) + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus @@ -6677,6 +6823,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) { int i; struct root_domain *rd; + SCHED_CPUMASK_DECLARE(allmasks); + cpumask_t *tmpmask; #ifdef CONFIG_NUMA struct sched_group **sched_group_nodes = NULL; int sd_allnodes = 0; @@ -6690,39 +6838,61 @@ static int build_sched_domains(const cpumask_t *cpu_map) printk(KERN_WARNING "Can not alloc sched group node list\n"); return -ENOMEM; } - sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; #endif rd = alloc_rootdomain(); if (!rd) { printk(KERN_WARNING "Cannot alloc root domain\n"); +#ifdef CONFIG_NUMA + kfree(sched_group_nodes); +#endif return -ENOMEM; } +#if SCHED_CPUMASK_ALLOC + /* get space for all scratch cpumask variables */ + allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL); + if (!allmasks) { + printk(KERN_WARNING "Cannot alloc cpumask array\n"); + kfree(rd); +#ifdef CONFIG_NUMA + kfree(sched_group_nodes); +#endif + return -ENOMEM; + } +#endif + tmpmask = (cpumask_t *)allmasks; + + +#ifdef CONFIG_NUMA + sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; +#endif + /* * Set up domains for cpus specified by the cpu_map. */ for_each_cpu_mask(i, *cpu_map) { struct sched_domain *sd = NULL, *p; - cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); + SCHED_CPUMASK_VAR(nodemask, allmasks); - cpus_and(nodemask, nodemask, *cpu_map); + *nodemask = node_to_cpumask(cpu_to_node(i)); + cpus_and(*nodemask, *nodemask, *cpu_map); #ifdef CONFIG_NUMA if (cpus_weight(*cpu_map) > - SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { + SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { sd = &per_cpu(allnodes_domains, i); - *sd = SD_ALLNODES_INIT; + SD_INIT(sd, ALLNODES); sd->span = *cpu_map; - cpu_to_allnodes_group(i, cpu_map, &sd->groups); + cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; sd_allnodes = 1; } else p = NULL; sd = &per_cpu(node_domains, i); - *sd = SD_NODE_INIT; - sd->span = sched_domain_node_span(cpu_to_node(i)); + SD_INIT(sd, NODE); + sched_domain_node_span(cpu_to_node(i), &sd->span); sd->parent = p; if (p) p->child = sd; @@ -6731,94 +6901,114 @@ static int build_sched_domains(const cpumask_t *cpu_map) p = sd; sd = &per_cpu(phys_domains, i); - *sd = SD_CPU_INIT; - sd->span = nodemask; + SD_INIT(sd, CPU); + sd->span = *nodemask; sd->parent = p; if (p) p->child = sd; - cpu_to_phys_group(i, cpu_map, &sd->groups); + cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); #ifdef CONFIG_SCHED_MC p = sd; sd = &per_cpu(core_domains, i); - *sd = SD_MC_INIT; + SD_INIT(sd, MC); sd->span = cpu_coregroup_map(i); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; - cpu_to_core_group(i, cpu_map, &sd->groups); + cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); #endif #ifdef CONFIG_SCHED_SMT p = sd; sd = &per_cpu(cpu_domains, i); - *sd = SD_SIBLING_INIT; + SD_INIT(sd, SIBLING); sd->span = per_cpu(cpu_sibling_map, i); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; - cpu_to_cpu_group(i, cpu_map, &sd->groups); + cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); #endif } #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ for_each_cpu_mask(i, *cpu_map) { - cpumask_t this_sibling_map = per_cpu(cpu_sibling_map, i); - cpus_and(this_sibling_map, this_sibling_map, *cpu_map); - if (i != first_cpu(this_sibling_map)) + SCHED_CPUMASK_VAR(this_sibling_map, allmasks); + SCHED_CPUMASK_VAR(send_covered, allmasks); + + *this_sibling_map = per_cpu(cpu_sibling_map, i); + cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map); + if (i != first_cpu(*this_sibling_map)) continue; init_sched_build_groups(this_sibling_map, cpu_map, - &cpu_to_cpu_group); + &cpu_to_cpu_group, + send_covered, tmpmask); } #endif #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ for_each_cpu_mask(i, *cpu_map) { - cpumask_t this_core_map = cpu_coregroup_map(i); - cpus_and(this_core_map, this_core_map, *cpu_map); - if (i != first_cpu(this_core_map)) + SCHED_CPUMASK_VAR(this_core_map, allmasks); + SCHED_CPUMASK_VAR(send_covered, allmasks); + + *this_core_map = cpu_coregroup_map(i); + cpus_and(*this_core_map, *this_core_map, *cpu_map); + if (i != first_cpu(*this_core_map)) continue; + init_sched_build_groups(this_core_map, cpu_map, - &cpu_to_core_group); + &cpu_to_core_group, + send_covered, tmpmask); } #endif /* Set up physical groups */ for (i = 0; i < MAX_NUMNODES; i++) { - cpumask_t nodemask = node_to_cpumask(i); + SCHED_CPUMASK_VAR(nodemask, allmasks); + SCHED_CPUMASK_VAR(send_covered, allmasks); - cpus_and(nodemask, nodemask, *cpu_map); - if (cpus_empty(nodemask)) + *nodemask = node_to_cpumask(i); + cpus_and(*nodemask, *nodemask, *cpu_map); + if (cpus_empty(*nodemask)) continue; - init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); + init_sched_build_groups(nodemask, cpu_map, + &cpu_to_phys_group, + send_covered, tmpmask); } #ifdef CONFIG_NUMA /* Set up node groups */ - if (sd_allnodes) - init_sched_build_groups(*cpu_map, cpu_map, - &cpu_to_allnodes_group); + if (sd_allnodes) { + SCHED_CPUMASK_VAR(send_covered, allmasks); + + init_sched_build_groups(cpu_map, cpu_map, + &cpu_to_allnodes_group, + send_covered, tmpmask); + } for (i = 0; i < MAX_NUMNODES; i++) { /* Set up node groups */ struct sched_group *sg, *prev; - cpumask_t nodemask = node_to_cpumask(i); - cpumask_t domainspan; - cpumask_t covered = CPU_MASK_NONE; + SCHED_CPUMASK_VAR(nodemask, allmasks); + SCHED_CPUMASK_VAR(domainspan, allmasks); + SCHED_CPUMASK_VAR(covered, allmasks); int j; - cpus_and(nodemask, nodemask, *cpu_map); - if (cpus_empty(nodemask)) { + *nodemask = node_to_cpumask(i); + cpus_clear(*covered); + + cpus_and(*nodemask, *nodemask, *cpu_map); + if (cpus_empty(*nodemask)) { sched_group_nodes[i] = NULL; continue; } - domainspan = sched_domain_node_span(i); - cpus_and(domainspan, domainspan, *cpu_map); + sched_domain_node_span(i, domainspan); + cpus_and(*domainspan, *domainspan, *cpu_map); sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); if (!sg) { @@ -6827,31 +7017,31 @@ static int build_sched_domains(const cpumask_t *cpu_map) goto error; } sched_group_nodes[i] = sg; - for_each_cpu_mask(j, nodemask) { + for_each_cpu_mask(j, *nodemask) { struct sched_domain *sd; sd = &per_cpu(node_domains, j); sd->groups = sg; } sg->__cpu_power = 0; - sg->cpumask = nodemask; + sg->cpumask = *nodemask; sg->next = sg; - cpus_or(covered, covered, nodemask); + cpus_or(*covered, *covered, *nodemask); prev = sg; for (j = 0; j < MAX_NUMNODES; j++) { - cpumask_t tmp, notcovered; + SCHED_CPUMASK_VAR(notcovered, allmasks); int n = (i + j) % MAX_NUMNODES; + node_to_cpumask_ptr(pnodemask, n); - cpus_complement(notcovered, covered); - cpus_and(tmp, notcovered, *cpu_map); - cpus_and(tmp, tmp, domainspan); - if (cpus_empty(tmp)) + cpus_complement(*notcovered, *covered); + cpus_and(*tmpmask, *notcovered, *cpu_map); + cpus_and(*tmpmask, *tmpmask, *domainspan); + if (cpus_empty(*tmpmask)) break; - nodemask = node_to_cpumask(n); - cpus_and(tmp, tmp, nodemask); - if (cpus_empty(tmp)) + cpus_and(*tmpmask, *tmpmask, *pnodemask); + if (cpus_empty(*tmpmask)) continue; sg = kmalloc_node(sizeof(struct sched_group), @@ -6862,9 +7052,9 @@ static int build_sched_domains(const cpumask_t *cpu_map) goto error; } sg->__cpu_power = 0; - sg->cpumask = tmp; + sg->cpumask = *tmpmask; sg->next = prev->next; - cpus_or(covered, covered, tmp); + cpus_or(*covered, *covered, *tmpmask); prev->next = sg; prev = sg; } @@ -6900,7 +7090,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) if (sd_allnodes) { struct sched_group *sg; - cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); + cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg, + tmpmask); init_numa_sched_groups_power(sg); } #endif @@ -6918,11 +7109,13 @@ static int build_sched_domains(const cpumask_t *cpu_map) cpu_attach_domain(sd, rd, i); } + SCHED_CPUMASK_FREE((void *)allmasks); return 0; #ifdef CONFIG_NUMA error: - free_sched_groups(cpu_map); + free_sched_groups(cpu_map, tmpmask); + SCHED_CPUMASK_FREE((void *)allmasks); return -ENOMEM; #endif } @@ -6962,9 +7155,10 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) return err; } -static void arch_destroy_sched_domains(const cpumask_t *cpu_map) +static void arch_destroy_sched_domains(const cpumask_t *cpu_map, + cpumask_t *tmpmask) { - free_sched_groups(cpu_map); + free_sched_groups(cpu_map, tmpmask); } /* @@ -6973,6 +7167,7 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map) */ static void detach_destroy_domains(const cpumask_t *cpu_map) { + cpumask_t tmpmask; int i; unregister_sched_domain_sysctl(); @@ -6980,7 +7175,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) for_each_cpu_mask(i, *cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); synchronize_sched(); - arch_destroy_sched_domains(cpu_map); + arch_destroy_sched_domains(cpu_map, &tmpmask); } /* @@ -7173,6 +7368,11 @@ void __init sched_init_smp(void) { cpumask_t non_isolated_cpus; +#if defined(CONFIG_NUMA) + sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), + GFP_KERNEL); + BUG_ON(sched_group_nodes_bycpu == NULL); +#endif get_online_cpus(); arch_init_sched_domains(&cpu_online_map); cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); @@ -7183,13 +7383,18 @@ void __init sched_init_smp(void) hotcpu_notifier(update_sched_domains, 0); /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed(current, non_isolated_cpus) < 0) + if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) BUG(); sched_init_granularity(); } #else void __init sched_init_smp(void) { +#if defined(CONFIG_NUMA) + sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), + GFP_KERNEL); + BUG_ON(sched_group_nodes_bycpu == NULL); +#endif sched_init_granularity(); } #endif /* CONFIG_SMP */ @@ -7233,6 +7438,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) rt_rq->rt_time = 0; rt_rq->rt_throttled = 0; + rt_rq->rt_runtime = 0; + spin_lock_init(&rt_rq->rt_runtime_lock); #ifdef CONFIG_RT_GROUP_SCHED rt_rq->rt_nr_boosted = 0; @@ -7241,10 +7448,11 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, - struct cfs_rq *cfs_rq, struct sched_entity *se, - int cpu, int add) +static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, + struct sched_entity *se, int cpu, int add, + struct sched_entity *parent) { + struct rq *rq = cpu_rq(cpu); tg->cfs_rq[cpu] = cfs_rq; init_cfs_rq(cfs_rq, rq); cfs_rq->tg = tg; @@ -7252,43 +7460,122 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); tg->se[cpu] = se; - se->cfs_rq = &rq->cfs; + /* se could be NULL for init_task_group */ + if (!se) + return; + + if (!parent) + se->cfs_rq = &rq->cfs; + else + se->cfs_rq = parent->my_q; + se->my_q = cfs_rq; se->load.weight = tg->shares; se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); - se->parent = NULL; + se->parent = parent; } #endif #ifdef CONFIG_RT_GROUP_SCHED -static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, - struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, - int cpu, int add) +static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, + struct sched_rt_entity *rt_se, int cpu, int add, + struct sched_rt_entity *parent) { + struct rq *rq = cpu_rq(cpu); + tg->rt_rq[cpu] = rt_rq; init_rt_rq(rt_rq, rq); rt_rq->tg = tg; rt_rq->rt_se = rt_se; + rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; if (add) list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); tg->rt_se[cpu] = rt_se; + if (!rt_se) + return; + + if (!parent) + rt_se->rt_rq = &rq->rt; + else + rt_se->rt_rq = parent->my_q; + rt_se->rt_rq = &rq->rt; rt_se->my_q = rt_rq; - rt_se->parent = NULL; + rt_se->parent = parent; INIT_LIST_HEAD(&rt_se->run_list); } #endif void __init sched_init(void) { - int highest_cpu = 0; int i, j; + unsigned long alloc_size = 0, ptr; + +#ifdef CONFIG_FAIR_GROUP_SCHED + alloc_size += 2 * nr_cpu_ids * sizeof(void **); +#endif +#ifdef CONFIG_RT_GROUP_SCHED + alloc_size += 2 * nr_cpu_ids * sizeof(void **); +#endif +#ifdef CONFIG_USER_SCHED + alloc_size *= 2; +#endif + /* + * As sched_init() is called before page_alloc is setup, + * we use alloc_bootmem(). + */ + if (alloc_size) { + ptr = (unsigned long)alloc_bootmem_low(alloc_size); + +#ifdef CONFIG_FAIR_GROUP_SCHED + init_task_group.se = (struct sched_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + init_task_group.cfs_rq = (struct cfs_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + +#ifdef CONFIG_USER_SCHED + root_task_group.se = (struct sched_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + root_task_group.cfs_rq = (struct cfs_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); +#endif +#endif +#ifdef CONFIG_RT_GROUP_SCHED + init_task_group.rt_se = (struct sched_rt_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + init_task_group.rt_rq = (struct rt_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + +#ifdef CONFIG_USER_SCHED + root_task_group.rt_se = (struct sched_rt_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + root_task_group.rt_rq = (struct rt_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); +#endif +#endif + } #ifdef CONFIG_SMP init_defrootdomain(); #endif + init_rt_bandwidth(&def_rt_bandwidth, + global_rt_period(), global_rt_runtime()); + +#ifdef CONFIG_RT_GROUP_SCHED + init_rt_bandwidth(&init_task_group.rt_bandwidth, + global_rt_period(), global_rt_runtime()); +#ifdef CONFIG_USER_SCHED + init_rt_bandwidth(&root_task_group.rt_bandwidth, + global_rt_period(), RUNTIME_INF); +#endif +#endif + #ifdef CONFIG_GROUP_SCHED list_add(&init_task_group.list, &task_groups); #endif @@ -7307,21 +7594,62 @@ void __init sched_init(void) #ifdef CONFIG_FAIR_GROUP_SCHED init_task_group.shares = init_task_group_load; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); - init_tg_cfs_entry(rq, &init_task_group, +#ifdef CONFIG_CGROUP_SCHED + /* + * How much cpu bandwidth does init_task_group get? + * + * In case of task-groups formed thr' the cgroup filesystem, it + * gets 100% of the cpu resources in the system. This overall + * system cpu resource is divided among the tasks of + * init_task_group and its child task-groups in a fair manner, + * based on each entity's (task or task-group's) weight + * (se->load.weight). + * + * In other words, if init_task_group has 10 tasks of weight + * 1024) and two child groups A0 and A1 (of weight 1024 each), + * then A0's share of the cpu resource is: + * + * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% + * + * We achieve this by letting init_task_group's tasks sit + * directly in rq->cfs (i.e init_task_group->se[] = NULL). + */ + init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); +#elif defined CONFIG_USER_SCHED + root_task_group.shares = NICE_0_LOAD; + init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL); + /* + * In case of task-groups formed thr' the user id of tasks, + * init_task_group represents tasks belonging to root user. + * Hence it forms a sibling of all subsequent groups formed. + * In this case, init_task_group gets only a fraction of overall + * system cpu resource, based on the weight assigned to root + * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished + * by letting tasks of init_task_group sit in a separate cfs_rq + * (init_cfs_rq) and having one entity represent this group of + * tasks in rq->cfs (i.e init_task_group->se[] != NULL). + */ + init_tg_cfs_entry(&init_task_group, &per_cpu(init_cfs_rq, i), - &per_cpu(init_sched_entity, i), i, 1); + &per_cpu(init_sched_entity, i), i, 1, + root_task_group.se[i]); #endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ + + rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #ifdef CONFIG_RT_GROUP_SCHED - init_task_group.rt_runtime = - sysctl_sched_rt_runtime * NSEC_PER_USEC; INIT_LIST_HEAD(&rq->leaf_rt_rq_list); - init_tg_rt_entry(rq, &init_task_group, +#ifdef CONFIG_CGROUP_SCHED + init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); +#elif defined CONFIG_USER_SCHED + init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); + init_tg_rt_entry(&init_task_group, &per_cpu(init_rt_rq, i), - &per_cpu(init_sched_rt_entity, i), i, 1); + &per_cpu(init_sched_rt_entity, i), i, 1, + root_task_group.rt_se[i]); +#endif #endif - rq->rt_period_expire = 0; - rq->rt_throttled = 0; for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; @@ -7338,7 +7666,6 @@ void __init sched_init(void) #endif init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); - highest_cpu = i; } set_load_weight(&init_task); @@ -7348,7 +7675,6 @@ void __init sched_init(void) #endif #ifdef CONFIG_SMP - nr_cpu_ids = highest_cpu + 1; open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); #endif @@ -7507,8 +7833,6 @@ void set_curr_task(int cpu, struct task_struct *p) #endif -#ifdef CONFIG_GROUP_SCHED - #ifdef CONFIG_FAIR_GROUP_SCHED static void free_fair_sched_group(struct task_group *tg) { @@ -7525,17 +7849,18 @@ static void free_fair_sched_group(struct task_group *tg) kfree(tg->se); } -static int alloc_fair_sched_group(struct task_group *tg) +static +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct cfs_rq *cfs_rq; - struct sched_entity *se; + struct sched_entity *se, *parent_se; struct rq *rq; int i; - tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); + tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); if (!tg->cfs_rq) goto err; - tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); + tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); if (!tg->se) goto err; @@ -7554,7 +7879,8 @@ static int alloc_fair_sched_group(struct task_group *tg) if (!se) goto err; - init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); + parent_se = parent ? parent->se[i] : NULL; + init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se); } return 1; @@ -7578,7 +7904,8 @@ static inline void free_fair_sched_group(struct task_group *tg) { } -static inline int alloc_fair_sched_group(struct task_group *tg) +static inline +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { return 1; } @@ -7597,6 +7924,8 @@ static void free_rt_sched_group(struct task_group *tg) { int i; + destroy_rt_bandwidth(&tg->rt_bandwidth); + for_each_possible_cpu(i) { if (tg->rt_rq) kfree(tg->rt_rq[i]); @@ -7608,21 +7937,23 @@ static void free_rt_sched_group(struct task_group *tg) kfree(tg->rt_se); } -static int alloc_rt_sched_group(struct task_group *tg) +static +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { struct rt_rq *rt_rq; - struct sched_rt_entity *rt_se; + struct sched_rt_entity *rt_se, *parent_se; struct rq *rq; int i; - tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); + tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); if (!tg->rt_rq) goto err; - tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); + tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); if (!tg->rt_se) goto err; - tg->rt_runtime = 0; + init_rt_bandwidth(&tg->rt_bandwidth, + ktime_to_ns(def_rt_bandwidth.rt_period), 0); for_each_possible_cpu(i) { rq = cpu_rq(i); @@ -7637,7 +7968,8 @@ static int alloc_rt_sched_group(struct task_group *tg) if (!rt_se) goto err; - init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); + parent_se = parent ? parent->rt_se[i] : NULL; + init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se); } return 1; @@ -7661,7 +7993,8 @@ static inline void free_rt_sched_group(struct task_group *tg) { } -static inline int alloc_rt_sched_group(struct task_group *tg) +static inline +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { return 1; } @@ -7675,6 +8008,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) } #endif +#ifdef CONFIG_GROUP_SCHED static void free_sched_group(struct task_group *tg) { free_fair_sched_group(tg); @@ -7683,7 +8017,7 @@ static void free_sched_group(struct task_group *tg) } /* allocate runqueue etc for a new task group */ -struct task_group *sched_create_group(void) +struct task_group *sched_create_group(struct task_group *parent) { struct task_group *tg; unsigned long flags; @@ -7693,10 +8027,10 @@ struct task_group *sched_create_group(void) if (!tg) return ERR_PTR(-ENOMEM); - if (!alloc_fair_sched_group(tg)) + if (!alloc_fair_sched_group(tg, parent)) goto err; - if (!alloc_rt_sched_group(tg)) + if (!alloc_rt_sched_group(tg, parent)) goto err; spin_lock_irqsave(&task_group_lock, flags); @@ -7776,6 +8110,7 @@ void sched_move_task(struct task_struct *tsk) task_rq_unlock(rq, &flags); } +#endif #ifdef CONFIG_FAIR_GROUP_SCHED static void set_se_shares(struct sched_entity *se, unsigned long shares) @@ -7806,6 +8141,12 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) int i; unsigned long flags; + /* + * We can't change the weight of the root cgroup. + */ + if (!tg->se[0]) + return -EINVAL; + /* * A weight of 0 or 1 can cause arithmetics problems. * (The default weight is 1024 - so there's no practical @@ -7872,16 +8213,15 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) struct task_group *tgi; unsigned long total = 0; unsigned long global_ratio = - to_ratio(sysctl_sched_rt_period, - sysctl_sched_rt_runtime < 0 ? - RUNTIME_INF : sysctl_sched_rt_runtime); + to_ratio(global_rt_period(), global_rt_runtime()); rcu_read_lock(); list_for_each_entry_rcu(tgi, &task_groups, list) { if (tgi == tg) continue; - total += to_ratio(period, tgi->rt_runtime); + total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), + tgi->rt_bandwidth.rt_runtime); } rcu_read_unlock(); @@ -7899,19 +8239,14 @@ static inline int tg_has_rt_tasks(struct task_group *tg) return 0; } -int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +static int tg_set_bandwidth(struct task_group *tg, + u64 rt_period, u64 rt_runtime) { - u64 rt_runtime, rt_period; - int err = 0; - - rt_period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC; - rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; - if (rt_runtime_us == -1) - rt_runtime = RUNTIME_INF; + int i, err = 0; mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); - if (rt_runtime_us == 0 && tg_has_rt_tasks(tg)) { + if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { err = -EBUSY; goto unlock; } @@ -7919,7 +8254,19 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) err = -EINVAL; goto unlock; } - tg->rt_runtime = rt_runtime; + + spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); + tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); + tg->rt_bandwidth.rt_runtime = rt_runtime; + + for_each_possible_cpu(i) { + struct rt_rq *rt_rq = tg->rt_rq[i]; + + spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = rt_runtime; + spin_unlock(&rt_rq->rt_runtime_lock); + } + spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); unlock: read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); @@ -7927,19 +8274,109 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) return err; } +int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +{ + u64 rt_runtime, rt_period; + + rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); + rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; + if (rt_runtime_us < 0) + rt_runtime = RUNTIME_INF; + + return tg_set_bandwidth(tg, rt_period, rt_runtime); +} + long sched_group_rt_runtime(struct task_group *tg) { u64 rt_runtime_us; - if (tg->rt_runtime == RUNTIME_INF) + if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) return -1; - rt_runtime_us = tg->rt_runtime; + rt_runtime_us = tg->rt_bandwidth.rt_runtime; do_div(rt_runtime_us, NSEC_PER_USEC); return rt_runtime_us; } + +int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) +{ + u64 rt_runtime, rt_period; + + rt_period = (u64)rt_period_us * NSEC_PER_USEC; + rt_runtime = tg->rt_bandwidth.rt_runtime; + + return tg_set_bandwidth(tg, rt_period, rt_runtime); +} + +long sched_group_rt_period(struct task_group *tg) +{ + u64 rt_period_us; + + rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); + do_div(rt_period_us, NSEC_PER_USEC); + return rt_period_us; +} + +static int sched_rt_global_constraints(void) +{ + int ret = 0; + + mutex_lock(&rt_constraints_mutex); + if (!__rt_schedulable(NULL, 1, 0)) + ret = -EINVAL; + mutex_unlock(&rt_constraints_mutex); + + return ret; +} +#else +static int sched_rt_global_constraints(void) +{ + unsigned long flags; + int i; + + spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); + for_each_possible_cpu(i) { + struct rt_rq *rt_rq = &cpu_rq(i)->rt; + + spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = global_rt_runtime(); + spin_unlock(&rt_rq->rt_runtime_lock); + } + spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); + + return 0; +} #endif -#endif /* CONFIG_GROUP_SCHED */ + +int sched_rt_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + int old_period, old_runtime; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); + old_period = sysctl_sched_rt_period; + old_runtime = sysctl_sched_rt_runtime; + + ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); + + if (!ret && write) { + ret = sched_rt_global_constraints(); + if (ret) { + sysctl_sched_rt_period = old_period; + sysctl_sched_rt_runtime = old_runtime; + } else { + def_rt_bandwidth.rt_runtime = global_rt_runtime(); + def_rt_bandwidth.rt_period = + ns_to_ktime(global_rt_period()); + } + } + mutex_unlock(&mutex); + + return ret; +} #ifdef CONFIG_CGROUP_SCHED @@ -7953,7 +8390,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) static struct cgroup_subsys_state * cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) { - struct task_group *tg; + struct task_group *tg, *parent; if (!cgrp->parent) { /* This is early initialization for the top cgroup */ @@ -7961,11 +8398,8 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) return &init_task_group.css; } - /* we support only 1-level deep hierarchical scheduler atm */ - if (cgrp->parent->parent) - return ERR_PTR(-EINVAL); - - tg = sched_create_group(); + parent = cgroup_tg(cgrp->parent); + tg = sched_create_group(parent); if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); @@ -7989,7 +8423,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, { #ifdef CONFIG_RT_GROUP_SCHED /* Don't accept realtime tasks when there is no way for them to run */ - if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0) + if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0) return -EINVAL; #else /* We don't support RT-tasks being in separate groups */ @@ -8023,7 +8457,7 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) #endif #ifdef CONFIG_RT_GROUP_SCHED -static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, +static ssize_t cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, struct file *file, const char __user *userbuf, size_t nbytes, loff_t *unused_ppos) @@ -8067,6 +8501,17 @@ static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft, return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); } + +static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, + u64 rt_period_us) +{ + return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); +} + +static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) +{ + return sched_group_rt_period(cgroup_tg(cgrp)); +} #endif static struct cftype cpu_files[] = { @@ -8083,6 +8528,11 @@ static struct cftype cpu_files[] = { .read = cpu_rt_runtime_read, .write = cpu_rt_runtime_write, }, + { + .name = "rt_period_us", + .read_uint = cpu_rt_period_read_uint, + .write_uint = cpu_rt_period_write_uint, + }, #endif }; @@ -8123,9 +8573,9 @@ struct cpuacct { struct cgroup_subsys cpuacct_subsys; /* return cpu accounting group corresponding to this container */ -static inline struct cpuacct *cgroup_ca(struct cgroup *cont) +static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) { - return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id), + return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), struct cpuacct, css); } @@ -8138,7 +8588,7 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk) /* create a new cpu accounting group */ static struct cgroup_subsys_state *cpuacct_create( - struct cgroup_subsys *ss, struct cgroup *cont) + struct cgroup_subsys *ss, struct cgroup *cgrp) { struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); @@ -8156,18 +8606,18 @@ static struct cgroup_subsys_state *cpuacct_create( /* destroy an existing cpu accounting group */ static void -cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) { - struct cpuacct *ca = cgroup_ca(cont); + struct cpuacct *ca = cgroup_ca(cgrp); free_percpu(ca->cpuusage); kfree(ca); } /* return total cpu usage (in nanoseconds) of a group */ -static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft) +static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) { - struct cpuacct *ca = cgroup_ca(cont); + struct cpuacct *ca = cgroup_ca(cgrp); u64 totalcpuusage = 0; int i; @@ -8186,16 +8636,40 @@ static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft) return totalcpuusage; } +static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, + u64 reset) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int err = 0; + int i; + + if (reset) { + err = -EINVAL; + goto out; + } + + for_each_possible_cpu(i) { + u64 *cpuusage = percpu_ptr(ca->cpuusage, i); + + spin_lock_irq(&cpu_rq(i)->lock); + *cpuusage = 0; + spin_unlock_irq(&cpu_rq(i)->lock); + } +out: + return err; +} + static struct cftype files[] = { { .name = "usage", .read_uint = cpuusage_read, + .write_uint = cpuusage_write, }, }; -static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont) +static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) { - return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); + return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); } /*