extern unsigned long this_cpu_load(void);
-extern void calc_global_load(void);
+extern void calc_global_load(unsigned long ticks);
extern unsigned long get_parent_ip(unsigned long addr);
extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos);
+#else
+/* Avoid need for ifdefs elsewhere in the code */
+enum { sysctl_hung_task_timeout_secs = 0 };
#endif
/* Attach to any functions which should be ignored in wchan output. */
int oom_adj; /* OOM kill score adjustment (bit shift) */
int oom_score_adj; /* OOM kill score adjustment */
+
+ struct mutex cred_guard_mutex; /* guard against foreign influences on
+ * credential calculations
+ * (notably. ptrace) */
};
/* Context switch must be unlocked if interrupts are to be enabled */
atomic_t inotify_watches; /* How many inotify watches does this user have? */
atomic_t inotify_devs; /* How many inotify devs does this user have opened? */
#endif
+#ifdef CONFIG_FANOTIFY
+ atomic_t fanotify_listeners;
+#endif
#ifdef CONFIG_EPOLL
atomic_t epoll_watches; /* The number of file descriptors currently watched */
#endif
* single CPU.
*/
unsigned int cpu_power, cpu_power_orig;
+ unsigned int group_weight;
/*
* The CPUs this group covers.
SD_LV_NONE = 0,
SD_LV_SIBLING,
SD_LV_MC,
+ SD_LV_BOOK,
SD_LV_CPU,
SD_LV_NODE,
SD_LV_ALLNODES,
struct task_struct *task);
#ifdef CONFIG_FAIR_GROUP_SCHED
- void (*moved_group) (struct task_struct *p, int on_rq);
+ void (*task_move_group) (struct task_struct *p, int on_rq);
#endif
};
struct rcu_node;
+enum perf_event_task_context {
+ perf_invalid_context = -1,
+ perf_hw_context = 0,
+ perf_sw_context,
+ perf_nr_task_contexts,
+};
+
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
void *stack;
#ifdef CONFIG_TREE_PREEMPT_RCU
struct rcu_node *rcu_blocked_node;
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+ #ifdef CONFIG_RCU_BOOST
+ struct rt_mutex *rcu_boost_mutex;
+ #endif /* #ifdef CONFIG_RCU_BOOST */
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info sched_info;
* credentials (COW) */
const struct cred __rcu *cred; /* effective (overridable) subjective task
* credentials (COW) */
- struct mutex cred_guard_mutex; /* guard against foreign influences on
- * credential calculations
- * (notably. ptrace) */
struct cred *replacement_session_keyring; /* for KEYCTL_SESSION_TO_PARENT */
char comm[TASK_COMM_LEN]; /* executable name excluding path
struct futex_pi_state *pi_state_cache;
#endif
#ifdef CONFIG_PERF_EVENTS
- struct perf_event_context *perf_event_ctxp;
+ struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
struct mutex perf_event_mutex;
struct list_head perf_event_list;
#endif
/*
* Per process flags
*/
-#define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */
- /* Not implemented yet, only for 486*/
+#define PF_KSOFTIRQD 0x00000001 /* I am ksoftirqd */
#define PF_STARTING 0x00000002 /* being created */
#define PF_EXITING 0x00000004 /* getting shut down */
#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
#define PF_DUMPCORE 0x00000200 /* dumped core */
#define PF_SIGNALED 0x00000400 /* killed by a signal */
#define PF_MEMALLOC 0x00000800 /* Allocating memory */
-#define PF_FLUSHER 0x00001000 /* responsible for disk writeback */
#define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */
#define PF_FREEZING 0x00004000 /* freeze in progress. do not account to load */
#define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */
#ifdef CONFIG_PREEMPT_RCU
#define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
- #define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
+ #define RCU_READ_UNLOCK_BOOSTED (1 << 1) /* boosted while in RCU read-side. */
+ #define RCU_READ_UNLOCK_NEED_QS (1 << 2) /* RCU core needs CPU response. */
static inline void rcu_copy_process(struct task_struct *p)
{
p->rcu_read_unlock_special = 0;
#ifdef CONFIG_TREE_PREEMPT_RCU
p->rcu_blocked_node = NULL;
- #endif
+ #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+ #ifdef CONFIG_RCU_BOOST
+ p->rcu_boost_mutex = NULL;
+ #endif /* #ifdef CONFIG_RCU_BOOST */
INIT_LIST_HEAD(&p->rcu_node_entry);
}
extern void sched_clock_idle_wakeup_event(u64 delta_ns);
#endif
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * An i/f to runtime opt-in for irq time accounting based off of sched_clock.
+ * The reason for this explicit opt-in is not to have perf penalty with
+ * slow sched_clocks.
+ */
+extern void enable_sched_clock_irqtime(void);
+extern void disable_sched_clock_irqtime(void);
+#else
+static inline void enable_sched_clock_irqtime(void) {}
+static inline void disable_sched_clock_irqtime(void) {}
+#endif
+
extern unsigned long long
task_sched_runtime(struct task_struct *task);
extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
spin_unlock(&p->alloc_lock);
}
-extern struct sighand_struct *lock_task_sighand(struct task_struct *tsk,
+extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
unsigned long *flags);
+#define lock_task_sighand(tsk, flags) \
+({ struct sighand_struct *__ss; \
+ __cond_lock(&(tsk)->sighand->siglock, \
+ (__ss = __lock_task_sighand(tsk, flags))); \
+ __ss; \
+}) \
+
static inline void unlock_task_sighand(struct task_struct *tsk,
unsigned long *flags)
{
extern int __cond_resched_softirq(void);
-#define cond_resched_softirq() ({ \
- __might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET); \
- __cond_resched_softirq(); \
+#define cond_resched_softirq() ({ \
+ __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \
+ __cond_resched_softirq(); \
})
/*
depends on !UML
default y
+config HAVE_IRQ_WORK
+ bool
+
+config IRQ_WORK
+ bool
+ depends on HAVE_IRQ_WORK
+
menu "General setup"
config EXPERIMENTAL
config LOCK_KERNEL
bool
- depends on SMP || PREEMPT
+ depends on (SMP || PREEMPT) && BKL
default y
config INIT_ENV_ARG_LIMIT
depends on HAVE_KERNEL_LZO
help
Its compression ratio is the poorest among the 4. The kernel
- size is about about 10% bigger than gzip; however its speed
+ size is about 10% bigger than gzip; however its speed
(both compression and decompression) is the fastest.
endchoice
depends on AUDITSYSCALL
select FSNOTIFY
+source "kernel/irq/Kconfig"
+
menu "RCU Subsystem"
choice
config RCU_TRACE
bool "Enable tracing for RCU"
- depends on TREE_RCU || TREE_PREEMPT_RCU
help
This option provides tracing in RCU which presents stats
in debugfs for debugging RCU implementation.
TREE_PREEMPT_RCU implementations, permitting Makefile to
trivially select kernel/rcutree_trace.c.
+ config RCU_BOOST
+ bool "Enable RCU priority boosting"
+ depends on RT_MUTEXES && TINY_PREEMPT_RCU
+ default n
+ help
+ This option boosts the priority of preempted RCU readers that
+ block the current preemptible RCU grace period for too long.
+ This option also prevents heavy loads from blocking RCU
+ callback invocation for all flavors of RCU.
+
+ Say Y here if you are working with real-time apps or heavy loads
+ Say N here if you are unsure.
+
+ config RCU_BOOST_PRIO
+ int "Real-time priority to boost RCU readers to"
+ range 1 99
+ depends on RCU_BOOST
+ default 1
+ help
+ This option specifies the real-time priority to which preempted
+ RCU readers are to be boosted. If you are working with CPU-bound
+ real-time applications, you should specify a priority higher then
+ the highest-priority CPU-bound application.
+
+ Specify the real-time priority, or take the default if unsure.
+
+ config RCU_BOOST_DELAY
+ int "Milliseconds to delay boosting after RCU grace-period start"
+ range 0 3000
+ depends on RCU_BOOST
+ default 500
+ help
+ This option specifies the time to wait after the beginning of
+ a given grace period before priority-boosting preempted RCU
+ readers blocking that grace period. Note that any RCU reader
+ blocking an expedited RCU grace period is boosted immediately.
+
+ Accept the default if unsure.
+
+ config SRCU_SYNCHRONIZE_DELAY
+ int "Microseconds to delay before waiting for readers"
+ range 0 20
+ default 10
+ help
+ This option controls how long SRCU delays before entering its
+ loop waiting on SRCU readers. The purpose of this loop is
+ to avoid the unconditional context-switch penalty that would
+ otherwise be incurred if there was an active SRCU reader,
+ in a manner similar to adaptive locking schemes. This should
+ be set to be a bit longer than the common-case SRCU read-side
+ critical-section overhead.
+
+ Accept the default if unsure.
+
endmenu # "RCU Subsystem"
config IKCONFIG
config CGROUP_DEBUG
bool "Example debug cgroup subsystem"
- depends on CGROUPS
default n
help
This option enables a simple cgroup subsystem that
config CGROUP_NS
bool "Namespace cgroup subsystem"
- depends on CGROUPS
help
Provides a simple namespace cgroup subsystem to
provide hierarchical naming of sets of namespaces,
config CGROUP_FREEZER
bool "Freezer cgroup subsystem"
- depends on CGROUPS
help
Provides a way to freeze and unfreeze all tasks in a
cgroup.
config CGROUP_DEVICE
bool "Device controller for cgroups"
- depends on CGROUPS && EXPERIMENTAL
help
Provides a cgroup implementing whitelists for devices which
a process in the cgroup can mknod or open.
config CPUSETS
bool "Cpuset support"
- depends on CGROUPS
help
This option will let you create and manage CPUSETs which
allow dynamically partitioning a system into sets of CPUs and
config CGROUP_CPUACCT
bool "Simple CPU accounting cgroup subsystem"
- depends on CGROUPS
help
Provides a simple Resource Controller for monitoring the
total CPU consumed by the tasks in a cgroup.
help
This option enables controller independent resource accounting
infrastructure that works with cgroups.
- depends on CGROUPS
config CGROUP_MEM_RES_CTLR
bool "Memory Resource Controller for Control Groups"
- depends on CGROUPS && RESOURCE_COUNTERS
+ depends on RESOURCE_COUNTERS
select MM_OWNER
help
Provides a memory resource controller that manages both anonymous
if boot option "noswapaccount" is set, swap will not be accounted.
Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
size is 4096bytes, 512k per 1Gbytes of swap.
+config CGROUP_MEM_RES_CTLR_SWAP_ENABLED
+ bool "Memory Resource Controller Swap Extension enabled by default"
+ depends on CGROUP_MEM_RES_CTLR_SWAP
+ default y
+ help
+ Memory Resource Controller Swap Extension comes with its price in
+ a bigger memory consumption. General purpose distribution kernels
+ which want to enable the feautre but keep it disabled by default
+ and let the user enable it by swapaccount boot command line
+ parameter should have this option unselected.
+ For those who want to have the feature enabled by default should
+ select this option (if, for some reason, they need to disable it
+ then noswapaccount does the trick).
menuconfig CGROUP_SCHED
bool "Group CPU scheduler"
- depends on EXPERIMENTAL && CGROUPS
+ depends on EXPERIMENTAL
default n
help
This feature lets CPU scheduler recognize task groups and control CPU
config BLK_CGROUP
tristate "Block IO controller"
- depends on CGROUPS && BLOCK
+ depends on BLOCK
default n
---help---
Generic block IO controller cgroup interface. This is the common
Currently, CFQ IO scheduler uses it to recognize task groups and
control disk bandwidth allocation (proportional time slice allocation)
- to such task groups.
+ to such task groups. It is also used by bio throttling logic in
+ block layer to implement upper limit in IO rates on a device.
This option only enables generic Block IO controller infrastructure.
- One needs to also enable actual IO controlling logic in CFQ for it
- to take effect. (CONFIG_CFQ_GROUP_IOSCHED=y).
+ One needs to also enable actual IO controlling logic/policy. For
+ enabling proportional weight division of disk bandwidth in CFQ seti
+ CONFIG_CFQ_GROUP_IOSCHED=y and for enabling throttling policy set
+ CONFIG_BLK_THROTTLE=y.
See Documentation/cgroups/blkio-controller.txt for more information.
endif # CGROUPS
-config MM_OWNER
- bool
-
-config SYSFS_DEPRECATED
- bool
-
-config SYSFS_DEPRECATED_V2
- bool "enable deprecated sysfs features to support old userspace tools"
- depends on SYSFS
- default n
- select SYSFS_DEPRECATED
- help
- This option switches the layout of sysfs to the deprecated
- version. Do not use it on recent distributions.
-
- The current sysfs layout features a unified device tree at
- /sys/devices/, which is able to express a hierarchy between
- class devices. If the deprecated option is set to Y, the
- unified device tree is split into a bus device tree at
- /sys/devices/ and several individual class device trees at
- /sys/class/. The class and bus devices will be connected by
- "<subsystem>:<name>" and the "device" links. The "block"
- class devices, will not show up in /sys/class/block/. Some
- subsystems will suppress the creation of some devices which
- depend on the unified device tree.
-
- This option is not a pure compatibility option that can
- be safely enabled on newer distributions. It will change the
- layout of sysfs to the non-extensible deprecated version,
- and disable some features, which can not be exported without
- confusing older userspace tools. Since 2007/2008 all major
- distributions do not enable this option, and ship no tools which
- depend on the deprecated layout or this option.
-
- If you are using a new kernel on an older distribution, or use
- older userspace tools, you might need to say Y here. Do not say Y,
- if the original kernel, that came with your distribution, has
- this option set to N.
-
-config RELAY
- bool "Kernel->user space relay support (formerly relayfs)"
- help
- This option enables support for relay interface support in
- certain file systems (such as debugfs).
- It is designed to provide an efficient mechanism for tools and
- facilities to relay large amounts of data from kernel space to
- user space.
-
- If unsure, say N.
-
-config NAMESPACES
+menuconfig NAMESPACES
bool "Namespaces support" if EMBEDDED
default !EMBEDDED
help
or same user id or pid may refer to different tasks when used in
different namespaces.
+if NAMESPACES
+
config UTS_NS
bool "UTS namespace"
- depends on NAMESPACES
+ default y
help
In this namespace tasks see different info provided with the
uname() system call
config IPC_NS
bool "IPC namespace"
- depends on NAMESPACES && (SYSVIPC || POSIX_MQUEUE)
+ depends on (SYSVIPC || POSIX_MQUEUE)
+ default y
help
In this namespace tasks work with IPC ids which correspond to
different IPC objects in different namespaces.
config USER_NS
bool "User namespace (EXPERIMENTAL)"
- depends on NAMESPACES && EXPERIMENTAL
+ depends on EXPERIMENTAL
+ default y
help
This allows containers, i.e. vservers, to use user namespaces
to provide different user info for different servers.
If unsure, say N.
config PID_NS
- bool "PID Namespaces (EXPERIMENTAL)"
- default n
- depends on NAMESPACES && EXPERIMENTAL
+ bool "PID Namespaces"
+ default y
help
Support process id namespaces. This allows having multiple
processes with the same pid as long as they are in different
pid namespaces. This is a building block of containers.
- Unless you want to work with an experimental feature
- say N here.
-
config NET_NS
bool "Network namespace"
- default n
- depends on NAMESPACES && EXPERIMENTAL && NET
+ depends on NET
+ default y
help
Allow user space to create what appear to be multiple instances
of the network stack.
+endif # NAMESPACES
+
+config MM_OWNER
+ bool
+
+config SYSFS_DEPRECATED
+ bool "enable deprecated sysfs features to support old userspace tools"
+ depends on SYSFS
+ default n
+ help
+ This option adds code that switches the layout of the "block" class
+ devices, to not show up in /sys/class/block/, but only in
+ /sys/block/.
+
+ This switch is only active when the sysfs.deprecated=1 boot option is
+ passed or the SYSFS_DEPRECATED_V2 option is set.
+
+ This option allows new kernels to run on old distributions and tools,
+ which might get confused by /sys/class/block/. Since 2007/2008 all
+ major distributions and tools handle this just fine.
+
+ Recent distributions and userspace tools after 2009/2010 depend on
+ the existence of /sys/class/block/, and will not work with this
+ option enabled.
+
+ Only if you are using a new kernel on an old distribution, you might
+ need to say Y here.
+
+config SYSFS_DEPRECATED_V2
+ bool "enabled deprecated sysfs features by default"
+ default n
+ depends on SYSFS
+ depends on SYSFS_DEPRECATED
+ help
+ Enable deprecated sysfs by default.
+
+ See the CONFIG_SYSFS_DEPRECATED option for more details about this
+ option.
+
+ Only if you are using a new kernel on an old distribution, you might
+ need to say Y here. Even then, odds are you would not need it
+ enabled, you can always pass the boot option if absolutely necessary.
+
+config RELAY
+ bool "Kernel->user space relay support (formerly relayfs)"
+ help
+ This option enables support for relay interface support in
+ certain file systems (such as debugfs).
+ It is designed to provide an efficient mechanism for tools and
+ facilities to relay large amounts of data from kernel space to
+ user space.
+
+ If unsure, say N.
+
config BLK_DEV_INITRD
bool "Initial RAM filesystem and RAM disk (initramfs/initrd) support"
depends on BROKEN || !FRV
default y if (PROFILING || PERF_COUNTERS)
depends on HAVE_PERF_EVENTS
select ANON_INODES
+ select IRQ_WORK
help
Enable kernel support for various performance events provided
by software and hardware.
*/
cpumask_var_t rto_mask;
atomic_t rto_count;
-#ifdef CONFIG_SMP
struct cpupri cpupri;
-#endif
};
/*
*/
static struct root_domain def_root_domain;
-#endif
+#endif /* CONFIG_SMP */
/*
* This is the main, per-CPU runqueue data structure.
*/
unsigned long nr_uninterruptible;
- struct task_struct *curr, *idle;
+ struct task_struct *curr, *idle, *stop;
unsigned long next_balance;
struct mm_struct *prev_mm;
u64 clock;
+ u64 clock_task;
atomic_t nr_iowait;
u64 avg_idle;
#endif
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ u64 prev_irq_time;
+#endif
+
/* calc_load related fields */
unsigned long calc_load_update;
long calc_load_active;
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-static inline
-void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
-{
- rq->curr->sched_class->check_preempt_curr(rq, p, flags);
- /*
- * A queue event has occurred, and we're going to schedule. In
- * this case, we can save a useless back to back clock update.
- */
- if (test_tsk_need_resched(p))
- rq->skip_clock_update = 1;
-}
+static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
static inline int cpu_of(struct rq *rq)
{
#endif /* CONFIG_CGROUP_SCHED */
-inline void update_rq_clock(struct rq *rq)
+static void update_rq_clock_task(struct rq *rq, s64 delta);
+
+static void update_rq_clock(struct rq *rq)
{
- if (!rq->skip_clock_update)
- rq->clock = sched_clock_cpu(cpu_of(rq));
+ s64 delta;
+
+ if (rq->skip_clock_update)
+ return;
+
+ delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+ rq->clock += delta;
+ update_rq_clock_task(rq, delta);
}
/*
size_t cnt, loff_t *ppos)
{
char buf[64];
- char *cmp = buf;
+ char *cmp;
int neg = 0;
int i;
return -EFAULT;
buf[cnt] = 0;
+ cmp = strstrip(buf);
if (strncmp(buf, "NO_", 3) == 0) {
neg = 1;
}
for (i = 0; sched_feat_names[i]; i++) {
- int len = strlen(sched_feat_names[i]);
-
- if (strncmp(cmp, sched_feat_names[i], len) == 0) {
+ if (strcmp(cmp, sched_feat_names[i]) == 0) {
if (neg)
sysctl_sched_features &= ~(1UL << i);
else
static const struct sched_class rt_sched_class;
-#define sched_class_highest (&rt_sched_class)
+#define sched_class_highest (&stop_sched_class)
#define for_each_class(class) \
for (class = sched_class_highest; class; class = class->next)
static void set_load_weight(struct task_struct *p)
{
- if (task_has_rt_policy(p)) {
- p->se.load.weight = 0;
- p->se.load.inv_weight = WMULT_CONST;
- return;
- }
-
/*
* SCHED_IDLE tasks get minimal weight:
*/
dec_nr_running(rq);
}
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+/*
+ * There are no locks covering percpu hardirq/softirq time.
+ * They are only modified in account_system_vtime, on corresponding CPU
+ * with interrupts disabled. So, writes are safe.
+ * They are read and saved off onto struct rq in update_rq_clock().
+ * This may result in other CPU reading this CPU's irq time and can
+ * race with irq/account_system_vtime on this CPU. We would either get old
+ * or new value with a side effect of accounting a slice of irq time to wrong
+ * task when irq is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each irq in account_system_time.
+ */
+static DEFINE_PER_CPU(u64, cpu_hardirq_time);
+static DEFINE_PER_CPU(u64, cpu_softirq_time);
+
+static DEFINE_PER_CPU(u64, irq_start_time);
+static int sched_clock_irqtime;
+
+void enable_sched_clock_irqtime(void)
+{
+ sched_clock_irqtime = 1;
+}
+
+void disable_sched_clock_irqtime(void)
+{
+ sched_clock_irqtime = 0;
+}
+
+#ifndef CONFIG_64BIT
+static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
+
+static inline void irq_time_write_begin(void)
+{
+ __this_cpu_inc(irq_time_seq.sequence);
+ smp_wmb();
+}
+
+static inline void irq_time_write_end(void)
+{
+ smp_wmb();
+ __this_cpu_inc(irq_time_seq.sequence);
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+ u64 irq_time;
+ unsigned seq;
+
+ do {
+ seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+ irq_time = per_cpu(cpu_softirq_time, cpu) +
+ per_cpu(cpu_hardirq_time, cpu);
+ } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+
+ return irq_time;
+}
+#else /* CONFIG_64BIT */
+static inline void irq_time_write_begin(void)
+{
+}
+
+static inline void irq_time_write_end(void)
+{
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+ return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+}
+#endif /* CONFIG_64BIT */
+
+/*
+ * Called before incrementing preempt_count on {soft,}irq_enter
+ * and before decrementing preempt_count on {soft,}irq_exit.
+ */
+void account_system_vtime(struct task_struct *curr)
+{
+ unsigned long flags;
+ s64 delta;
+ int cpu;
+
+ if (!sched_clock_irqtime)
+ return;
+
+ local_irq_save(flags);
+
+ cpu = smp_processor_id();
+ delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+ __this_cpu_add(irq_start_time, delta);
+
+ irq_time_write_begin();
+ /*
+ * We do not account for softirq time from ksoftirqd here.
+ * We want to continue accounting softirq time to ksoftirqd thread
+ * in that case, so as not to confuse scheduler with a special task
+ * that do not consume any time, but still wants to run.
+ */
+ if (hardirq_count())
+ __this_cpu_add(cpu_hardirq_time, delta);
+ else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+ __this_cpu_add(cpu_softirq_time, delta);
+
+ irq_time_write_end();
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(account_system_vtime);
+
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+ s64 irq_delta;
+
+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+
+ /*
+ * Since irq_time is only updated on {soft,}irq_exit, we might run into
+ * this case when a previous update_rq_clock() happened inside a
+ * {soft,}irq region.
+ *
+ * When this happens, we stop ->clock_task and only update the
+ * prev_irq_time stamp to account for the part that fit, so that a next
+ * update will consume the rest. This ensures ->clock_task is
+ * monotonic.
+ *
+ * It does however cause some slight miss-attribution of {soft,}irq
+ * time, a more accurate solution would be to update the irq_time using
+ * the current rq->clock timestamp, except that would require using
+ * atomic ops.
+ */
+ if (irq_delta > delta)
+ irq_delta = delta;
+
+ rq->prev_irq_time += irq_delta;
+ delta -= irq_delta;
+ rq->clock_task += delta;
+
+ if (irq_delta && sched_feat(NONIRQ_POWER))
+ sched_rt_avg_update(rq, irq_delta);
+}
+
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+ rq->clock_task += delta;
+}
+
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
#include "sched_idletask.c"
#include "sched_fair.c"
#include "sched_rt.c"
+#include "sched_stoptask.c"
#ifdef CONFIG_SCHED_DEBUG
# include "sched_debug.c"
#endif
+void sched_set_stop_task(int cpu, struct task_struct *stop)
+{
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+ struct task_struct *old_stop = cpu_rq(cpu)->stop;
+
+ if (stop) {
+ /*
+ * Make it appear like a SCHED_FIFO task, its something
+ * userspace knows about and won't get confused about.
+ *
+ * Also, it will make PI more or less work without too
+ * much confusion -- but then, stop work should not
+ * rely on PI working anyway.
+ */
+ sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m);
+
+ stop->sched_class = &stop_sched_class;
+ }
+
+ cpu_rq(cpu)->stop = stop;
+
+ if (old_stop) {
+ /*
+ * Reset it back to a normal scheduling class so that
+ * it can die in pieces.
+ */
+ old_stop->sched_class = &rt_sched_class;
+ }
+}
+
/*
* __normal_prio - return the priority that is based on the static prio
*/
p->sched_class->prio_changed(rq, p, oldprio, running);
}
+static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
+{
+ const struct sched_class *class;
+
+ if (p->sched_class == rq->curr->sched_class) {
+ rq->curr->sched_class->check_preempt_curr(rq, p, flags);
+ } else {
+ for_each_class(class) {
+ if (class == rq->curr->sched_class)
+ break;
+ if (class == p->sched_class) {
+ resched_task(rq->curr);
+ break;
+ }
+ }
+ }
+
+ /*
+ * A queue event has occurred, and we're going to schedule. In
+ * this case, we can save a useless back to back clock update.
+ */
+ if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
+ rq->skip_clock_update = 1;
+}
+
#ifdef CONFIG_SMP
/*
* Is this task likely cache-hot:
if (p->sched_class != &fair_sched_class)
return 0;
+ if (unlikely(p->policy == SCHED_IDLE))
+ return 0;
+
/*
* Buddy candidates are cache hot:
*/
*/
arch_start_context_switch(prev);
- if (likely(!mm)) {
+ if (!mm) {
next->active_mm = oldmm;
atomic_inc(&oldmm->mm_count);
enter_lazy_tlb(oldmm, next);
} else
switch_mm(oldmm, mm, next);
- if (likely(!prev->mm)) {
+ if (!prev->mm) {
prev->active_mm = NULL;
rq->prev_mm = oldmm;
}
return delta;
}
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+ load *= exp;
+ load += active * (FIXED_1 - exp);
+ load += 1UL << (FSHIFT - 1);
+ return load >> FSHIFT;
+}
+
#ifdef CONFIG_NO_HZ
/*
* For NO_HZ we delay the active fold to the next LOAD_FREQ update.
return delta;
}
+
+/**
+ * fixed_power_int - compute: x^n, in O(log n) time
+ *
+ * @x: base of the power
+ * @frac_bits: fractional bits of @x
+ * @n: power to raise @x to.
+ *
+ * By exploiting the relation between the definition of the natural power
+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ * (where: n_i \elem {0, 1}, the binary vector representing n),
+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ * of course trivially computable in O(log_2 n), the length of our binary
+ * vector.
+ */
+static unsigned long
+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
+{
+ unsigned long result = 1UL << frac_bits;
+
+ if (n) for (;;) {
+ if (n & 1) {
+ result *= x;
+ result += 1UL << (frac_bits - 1);
+ result >>= frac_bits;
+ }
+ n >>= 1;
+ if (!n)
+ break;
+ x *= x;
+ x += 1UL << (frac_bits - 1);
+ x >>= frac_bits;
+ }
+
+ return result;
+}
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ *
+ * a2 = a1 * e + a * (1 - e)
+ * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+ * = a0 * e^2 + a * (1 - e) * (1 + e)
+ *
+ * a3 = a2 * e + a * (1 - e)
+ * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+ * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+ *
+ * ...
+ *
+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
+ * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
+ * = a0 * e^n + a * (1 - e^n)
+ *
+ * [1] application of the geometric series:
+ *
+ * n 1 - x^(n+1)
+ * S_n := \Sum x^i = -------------
+ * i=0 1 - x
+ */
+static unsigned long
+calc_load_n(unsigned long load, unsigned long exp,
+ unsigned long active, unsigned int n)
+{
+
+ return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
+}
+
+/*
+ * NO_HZ can leave us missing all per-cpu ticks calling
+ * calc_load_account_active(), but since an idle CPU folds its delta into
+ * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
+ * in the pending idle delta if our idle period crossed a load cycle boundary.
+ *
+ * Once we've updated the global active value, we need to apply the exponential
+ * weights adjusted to the number of cycles missed.
+ */
+static void calc_global_nohz(unsigned long ticks)
+{
+ long delta, active, n;
+
+ if (time_before(jiffies, calc_load_update))
+ return;
+
+ /*
+ * If we crossed a calc_load_update boundary, make sure to fold
+ * any pending idle changes, the respective CPUs might have
+ * missed the tick driven calc_load_account_active() update
+ * due to NO_HZ.
+ */
+ delta = calc_load_fold_idle();
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+
+ /*
+ * If we were idle for multiple load cycles, apply them.
+ */
+ if (ticks >= LOAD_FREQ) {
+ n = ticks / LOAD_FREQ;
+
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
+
+ avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+ avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+ avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+
+ calc_load_update += n * LOAD_FREQ;
+ }
+
+ /*
+ * Its possible the remainder of the above division also crosses
+ * a LOAD_FREQ period, the regular check in calc_global_load()
+ * which comes after this will take care of that.
+ *
+ * Consider us being 11 ticks before a cycle completion, and us
+ * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
+ * age us 4 cycles, and the test in calc_global_load() will
+ * pick up the final one.
+ */
+}
#else
static void calc_load_account_idle(struct rq *this_rq)
{
{
return 0;
}
+
+static void calc_global_nohz(unsigned long ticks)
+{
+}
#endif
/**
loads[2] = (avenrun[2] + offset) << shift;
}
-static unsigned long
-calc_load(unsigned long load, unsigned long exp, unsigned long active)
-{
- load *= exp;
- load += active * (FIXED_1 - exp);
- return load >> FSHIFT;
-}
-
/*
* calc_load - update the avenrun load estimates 10 ticks after the
* CPUs have updated calc_load_tasks.
*/
-void calc_global_load(void)
+void calc_global_load(unsigned long ticks)
{
- unsigned long upd = calc_load_update + 10;
long active;
- if (time_before(jiffies, upd))
+ calc_global_nohz(ticks);
+
+ if (time_before(jiffies, calc_load_update + 10))
return;
active = atomic_long_read(&calc_load_tasks);
if (task_current(rq, p)) {
update_rq_clock(rq);
- ns = rq->clock - p->se.exec_start;
+ ns = rq->clock_task - p->se.exec_start;
if ((s64)ns < 0)
ns = 0;
}
tmp = cputime_to_cputime64(cputime);
if (hardirq_count() - hardirq_offset)
cpustat->irq = cputime64_add(cpustat->irq, tmp);
- else if (softirq_count())
+ else if (in_serving_softirq())
cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
else
cpustat->system = cputime64_add(cpustat->system, tmp);
curr->sched_class->task_tick(rq, curr, 0);
raw_spin_unlock(&rq->lock);
- perf_event_task_tick(curr);
+ perf_event_task_tick();
#ifdef CONFIG_SMP
rq->idle_at_tick = idle_cpu(cpu);
{
if (prev->se.on_rq)
update_rq_clock(rq);
- rq->skip_clock_update = 0;
prev->sched_class->put_prev_task(rq, prev);
}
return p;
}
- class = sched_class_highest;
- for ( ; ; ) {
+ for_each_class(class) {
p = class->pick_next_task(rq);
if (p)
return p;
- /*
- * Will never be NULL as the idle class always
- * returns a non-NULL p:
- */
- class = class->next;
}
+
+ BUG(); /* the idle class will always have a runnable task */
}
/*
hrtick_clear(rq);
raw_spin_lock_irq(&rq->lock);
- clear_tsk_need_resched(prev);
switch_count = &prev->nivcsw;
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
put_prev_task(rq, prev);
next = pick_next_task(rq);
+ clear_tsk_need_resched(prev);
+ rq->skip_clock_update = 0;
if (likely(prev != next)) {
sched_info_switch(prev, next);
rq = task_rq_lock(p, &flags);
+ trace_sched_pi_setprio(p, prio);
oldprio = p->prio;
prev_class = p->sched_class;
on_rq = p->se.on_rq;
}
if (user) {
- retval = security_task_setscheduler(p, policy, param);
+ retval = security_task_setscheduler(p);
if (retval)
return retval;
}
*/
rq = __task_rq_lock(p);
+ /*
+ * Changing the policy of the stop threads its a very bad idea
+ */
+ if (p == rq->stop) {
+ __task_rq_unlock(rq);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ return -EINVAL;
+ }
+
#ifdef CONFIG_RT_GROUP_SCHED
if (user) {
/*
if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
goto out_unlock;
- retval = security_task_setscheduler(p, 0, NULL);
+ retval = security_task_setscheduler(p);
if (retval)
goto out_unlock;
cpuset_cpus_allowed(p, cpus_allowed);
cpumask_and(new_mask, in_mask, cpus_allowed);
- again:
+again:
retval = set_cpus_allowed_ptr(p, new_mask);
if (!retval) {
cpumask_var_t nodemask;
cpumask_var_t this_sibling_map;
cpumask_var_t this_core_map;
+ cpumask_var_t this_book_map;
cpumask_var_t send_covered;
cpumask_var_t tmpmask;
struct sched_group **sched_group_nodes;
sa_rootdomain,
sa_tmpmask,
sa_send_covered,
+ sa_this_book_map,
sa_this_core_map,
sa_this_sibling_map,
sa_nodemask,
#ifdef CONFIG_SCHED_MC
static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
-#endif /* CONFIG_SCHED_MC */
-#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
static int
cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
struct sched_group **sg, struct cpumask *mask)
{
int group;
-
+#ifdef CONFIG_SCHED_SMT
cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
group = cpumask_first(mask);
+#else
+ group = cpu;
+#endif
if (sg)
*sg = &per_cpu(sched_group_core, group).sg;
return group;
}
-#elif defined(CONFIG_SCHED_MC)
+#endif /* CONFIG_SCHED_MC */
+
+/*
+ * book sched-domains:
+ */
+#ifdef CONFIG_SCHED_BOOK
+static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
+
static int
-cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg, struct cpumask *unused)
+cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
+ struct sched_group **sg, struct cpumask *mask)
{
+ int group = cpu;
+#ifdef CONFIG_SCHED_MC
+ cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
+ group = cpumask_first(mask);
+#elif defined(CONFIG_SCHED_SMT)
+ cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
+ group = cpumask_first(mask);
+#endif
if (sg)
- *sg = &per_cpu(sched_group_core, cpu).sg;
- return cpu;
+ *sg = &per_cpu(sched_group_book, group).sg;
+ return group;
}
-#endif
+#endif /* CONFIG_SCHED_BOOK */
static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
struct sched_group **sg, struct cpumask *mask)
{
int group;
-#ifdef CONFIG_SCHED_MC
+#ifdef CONFIG_SCHED_BOOK
+ cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
+ group = cpumask_first(mask);
+#elif defined(CONFIG_SCHED_MC)
cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
group = cpumask_first(mask);
#elif defined(CONFIG_SCHED_SMT)
if (cpu != group_first_cpu(sd->groups))
return;
+ sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
+
child = sd->child;
sd->groups->cpu_power = 0;
#ifdef CONFIG_SCHED_MC
SD_INIT_FUNC(MC)
#endif
+#ifdef CONFIG_SCHED_BOOK
+ SD_INIT_FUNC(BOOK)
+#endif
static int default_relax_domain_level = -1;
free_cpumask_var(d->tmpmask); /* fall through */
case sa_send_covered:
free_cpumask_var(d->send_covered); /* fall through */
+ case sa_this_book_map:
+ free_cpumask_var(d->this_book_map); /* fall through */
case sa_this_core_map:
free_cpumask_var(d->this_core_map); /* fall through */
case sa_this_sibling_map:
return sa_nodemask;
if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
return sa_this_sibling_map;
- if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+ if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
return sa_this_core_map;
+ if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+ return sa_this_book_map;
if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
return sa_send_covered;
d->rd = alloc_rootdomain();
return sd;
}
+static struct sched_domain *__build_book_sched_domain(struct s_data *d,
+ const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+ struct sched_domain *parent, int i)
+{
+ struct sched_domain *sd = parent;
+#ifdef CONFIG_SCHED_BOOK
+ sd = &per_cpu(book_domains, i).sd;
+ SD_INIT(sd, BOOK);
+ set_domain_attribute(sd, attr);
+ cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
+ sd->parent = parent;
+ parent->child = sd;
+ cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
+#endif
+ return sd;
+}
+
static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *parent, int i)
&cpu_to_core_group,
d->send_covered, d->tmpmask);
break;
+#endif
+#ifdef CONFIG_SCHED_BOOK
+ case SD_LV_BOOK: /* set up book groups */
+ cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
+ if (cpu == cpumask_first(d->this_book_map))
+ init_sched_build_groups(d->this_book_map, cpu_map,
+ &cpu_to_book_group,
+ d->send_covered, d->tmpmask);
+ break;
#endif
case SD_LV_CPU: /* set up physical groups */
cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
+ sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
}
for_each_cpu(i, cpu_map) {
build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
+ build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
build_sched_groups(&d, SD_LV_MC, cpu_map, i);
}
init_sched_groups_power(i, sd);
}
#endif
+#ifdef CONFIG_SCHED_BOOK
+ for_each_cpu(i, cpu_map) {
+ sd = &per_cpu(book_domains, i).sd;
+ init_sched_groups_power(i, sd);
+ }
+#endif
for_each_cpu(i, cpu_map) {
sd = &per_cpu(phys_domains, i).sd;
sd = &per_cpu(cpu_domains, i).sd;
#elif defined(CONFIG_SCHED_MC)
sd = &per_cpu(core_domains, i).sd;
+#elif defined(CONFIG_SCHED_BOOK)
+ sd = &per_cpu(book_domains, i).sd;
#else
sd = &per_cpu(phys_domains, i).sd;
#endif
return 1;
- err_free_rq:
+err_free_rq:
kfree(cfs_rq);
- err:
+err:
return 0;
}
return 1;
- err_free_rq:
+err_free_rq:
kfree(rt_rq);
- err:
+err:
return 0;
}
if (unlikely(running))
tsk->sched_class->put_prev_task(rq, tsk);
- set_task_rq(tsk, task_cpu(tsk));
-
#ifdef CONFIG_FAIR_GROUP_SCHED
- if (tsk->sched_class->moved_group)
- tsk->sched_class->moved_group(tsk, on_rq);
+ if (tsk->sched_class->task_move_group)
+ tsk->sched_class->task_move_group(tsk, on_rq);
+ else
#endif
+ set_task_rq(tsk, task_cpu(tsk));
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
raw_spin_unlock(&rt_rq->rt_runtime_lock);
}
raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
- unlock:
+unlock:
read_unlock(&tasklist_lock);
mutex_unlock(&rt_constraints_mutex);
};
#endif /* CONFIG_CGROUP_CPUACCT */
- #ifndef CONFIG_SMP
-
- void synchronize_sched_expedited(void)
- {
- barrier();
- }
- EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
- #else /* #ifndef CONFIG_SMP */
-
- static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
-
- static int synchronize_sched_expedited_cpu_stop(void *data)
- {
- /*
- * There must be a full memory barrier on each affected CPU
- * between the time that try_stop_cpus() is called and the
- * time that it returns.
- *
- * In the current initial implementation of cpu_stop, the
- * above condition is already met when the control reaches
- * this point and the following smp_mb() is not strictly
- * necessary. Do smp_mb() anyway for documentation and
- * robustness against future implementation changes.
- */
- smp_mb(); /* See above comment block. */
- return 0;
- }
-
- /*
- * Wait for an rcu-sched grace period to elapse, but use "big hammer"
- * approach to force grace period to end quickly. This consumes
- * significant time on all CPUs, and is thus not recommended for
- * any sort of common-case code.
- *
- * Note that it is illegal to call this function while holding any
- * lock that is acquired by a CPU-hotplug notifier. Failing to
- * observe this restriction will result in deadlock.
- */
- void synchronize_sched_expedited(void)
- {
- int snap, trycount = 0;
-
- smp_mb(); /* ensure prior mod happens before capturing snap. */
- snap = atomic_read(&synchronize_sched_expedited_count) + 1;
- get_online_cpus();
- while (try_stop_cpus(cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- if (trycount++ < 10)
- udelay(trycount * num_online_cpus());
- else {
- synchronize_sched();
- return;
- }
- if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
- smp_mb(); /* ensure test happens before caller kfree */
- return;
- }
- get_online_cpus();
- }
- atomic_inc(&synchronize_sched_expedited_count);
- smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
- put_online_cpus();
- }
- EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
- #endif /* #else #ifndef CONFIG_SMP */