[mirror_ubuntu-zesty-kernel.git] / kernel / sched / proc.c

/*
 *  kernel/sched/proc.c
 *
 *  Kernel load calculations, forked from sched/core.c
 */

#include <linux/export.h>

#include "sched.h"

unsigned long this_cpu_load(void)
{
	struct rq *this = this_rq();
	return this->cpu_load[0];
}


/*
 * Global load-average calculations
 *
 * We take a distributed and async approach to calculating the global load-avg
 * in order to minimize overhead.
 *
 * The global load average is an exponentially decaying average of nr_running +
 * nr_uninterruptible.
 *
 * Once every LOAD_FREQ:
 *
 *   nr_active = 0;
 *   for_each_possible_cpu(cpu)
 *	nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
 *
 *   avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
 *
 * Due to a number of reasons the above turns in the mess below:
 *
 *  - for_each_possible_cpu() is prohibitively expensive on machines with
 *    serious number of cpus, therefore we need to take a distributed approach
 *    to calculating nr_active.
 *
 *        \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
 *                      = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
 *
 *    So assuming nr_active := 0 when we start out -- true per definition, we
 *    can simply take per-cpu deltas and fold those into a global accumulate
 *    to obtain the same result. See calc_load_fold_active().
 *
 *    Furthermore, in order to avoid synchronizing all per-cpu delta folding
 *    across the machine, we assume 10 ticks is sufficient time for every
 *    cpu to have completed this task.
 *
 *    This places an upper-bound on the IRQ-off latency of the machine. Then
 *    again, being late doesn't loose the delta, just wrecks the sample.
 *
 *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
 *    this would add another cross-cpu cacheline miss and atomic operation
 *    to the wakeup path. Instead we increment on whatever cpu the task ran
 *    when it went into uninterruptible state and decrement on whatever cpu
 *    did the wakeup. This means that only the sum of nr_uninterruptible over
 *    all cpus yields the correct result.
 *
 *  This covers the NO_HZ=n code, for extra head-aches, see the comment below.
 */

/* Variables and functions for calc_load */
atomic_long_t calc_load_tasks;
unsigned long calc_load_update;
unsigned long avenrun[3];
EXPORT_SYMBOL(avenrun); /* should be removed */

/**
 * get_avenrun - get the load average array
 * @loads:	pointer to dest load array
 * @offset:	offset to add
 * @shift:	shift count to shift the result left
 *
 * These values are estimates at best, so no need for locking.
 */
void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
{
	loads[0] = (avenrun[0] + offset) << shift;
	loads[1] = (avenrun[1] + offset) << shift;
	loads[2] = (avenrun[2] + offset) << shift;
}

long calc_load_fold_active(struct rq *this_rq)
{
	long nr_active, delta = 0;

	nr_active = this_rq->nr_running;
	nr_active += (long) this_rq->nr_uninterruptible;

	if (nr_active != this_rq->calc_load_active) {
		delta = nr_active - this_rq->calc_load_active;
		this_rq->calc_load_active = nr_active;
	}

	return delta;
}

/*
 * a1 = a0 * e + a * (1 - e)
 */
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);
	return load >> FSHIFT;
}

#ifdef CONFIG_NO_HZ_COMMON
/*
 * Handle NO_HZ for the global load-average.
 *
 * Since the above described distributed algorithm to compute the global
 * load-average relies on per-cpu sampling from the tick, it is affected by
 * NO_HZ.
 *
 * The basic idea is to fold the nr_active delta into a global idle-delta upon
 * entering NO_HZ state such that we can include this as an 'extra' cpu delta
 * when we read the global state.
 *
 * Obviously reality has to ruin such a delightfully simple scheme:
 *
 *  - When we go NO_HZ idle during the window, we can negate our sample
 *    contribution, causing under-accounting.
 *
 *    We avoid this by keeping two idle-delta counters and flipping them
 *    when the window starts, thus separating old and new NO_HZ load.
 *
 *    The only trick is the slight shift in index flip for read vs write.
 *
 *        0s            5s            10s           15s
 *          +10           +10           +10           +10
 *        |-|-----------|-|-----------|-|-----------|-|
 *    r:0 0 1           1 0           0 1           1 0
 *    w:0 1 1           0 0           1 1           0 0
 *
 *    This ensures we'll fold the old idle contribution in this window while
 *    accumlating the new one.
 *
 *  - When we wake up from NO_HZ idle during the window, we push up our
 *    contribution, since we effectively move our sample point to a known
 *    busy state.
 *
 *    This is solved by pushing the window forward, and thus skipping the
 *    sample, for this cpu (effectively using the idle-delta for this cpu which
 *    was in effect at the time the window opened). This also solves the issue
 *    of having to deal with a cpu having been in NOHZ idle for multiple
 *    LOAD_FREQ intervals.
 *
 * When making the ILB scale, we should try to pull this in as well.
 */
static atomic_long_t calc_load_idle[2];
static int calc_load_idx;

static inline int calc_load_write_idx(void)
{
	int idx = calc_load_idx;

	/*
	 * See calc_global_nohz(), if we observe the new index, we also
	 * need to observe the new update time.
	 */
	smp_rmb();

	/*
	 * If the folding window started, make sure we start writing in the
	 * next idle-delta.
	 */
	if (!time_before(jiffies, calc_load_update))
		idx++;

	return idx & 1;
}

static inline int calc_load_read_idx(void)
{
	return calc_load_idx & 1;
}

void calc_load_enter_idle(void)
{
	struct rq *this_rq = this_rq();
	long delta;

	/*
	 * We're going into NOHZ mode, if there's any pending delta, fold it
	 * into the pending idle delta.
	 */
	delta = calc_load_fold_active(this_rq);
	if (delta) {
		int idx = calc_load_write_idx();
		atomic_long_add(delta, &calc_load_idle[idx]);
	}
}

void calc_load_exit_idle(void)
{
	struct rq *this_rq = this_rq();

	/*
	 * If we're still before the sample window, we're done.
	 */
	if (time_before(jiffies, this_rq->calc_load_update))
		return;

	/*
	 * We woke inside or after the sample window, this means we're already
	 * accounted through the nohz accounting, so skip the entire deal and
	 * sync up for the next window.
	 */
	this_rq->calc_load_update = calc_load_update;
	if (time_before(jiffies, this_rq->calc_load_update + 10))
		this_rq->calc_load_update += LOAD_FREQ;
}

static long calc_load_fold_idle(void)
{
	int idx = calc_load_read_idx();
	long delta = 0;

	if (atomic_long_read(&calc_load_idle[idx]))
		delta = atomic_long_xchg(&calc_load_idle[idx], 0);

	return delta;
}

/**
 * fixed_power_int - compute: x^n, in O(log n) time
 *
 * @x:         base of the power
 * @frac_bits: fractional bits of @x
 * @n:         power to raise @x to.
 *
 * By exploiting the relation between the definition of the natural power
 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
 * (where: n_i \elem {0, 1}, the binary vector representing n),
 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
 * of course trivially computable in O(log_2 n), the length of our binary
 * vector.
 */
static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
	unsigned long result = 1UL << frac_bits;

	if (n) for (;;) {
		if (n & 1) {
			result *= x;
			result += 1UL << (frac_bits - 1);
			result >>= frac_bits;
		}
		n >>= 1;
		if (!n)
			break;
		x *= x;
		x += 1UL << (frac_bits - 1);
		x >>= frac_bits;
	}

	return result;
}

/*
 * a1 = a0 * e + a * (1 - e)
 *
 * a2 = a1 * e + a * (1 - e)
 *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
 *    = a0 * e^2 + a * (1 - e) * (1 + e)
 *
 * a3 = a2 * e + a * (1 - e)
 *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
 *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
 *
 *  ...
 *
 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
 *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
 *    = a0 * e^n + a * (1 - e^n)
 *
 * [1] application of the geometric series:
 *
 *              n         1 - x^(n+1)
 *     S_n := \Sum x^i = -------------
 *             i=0          1 - x
 */
static unsigned long
calc_load_n(unsigned long load, unsigned long exp,
	    unsigned long active, unsigned int n)
{

	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}

/*
 * NO_HZ can leave us missing all per-cpu ticks calling
 * calc_load_account_active(), but since an idle CPU folds its delta into
 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
 * in the pending idle delta if our idle period crossed a load cycle boundary.
 *
 * Once we've updated the global active value, we need to apply the exponential
 * weights adjusted to the number of cycles missed.
 */
static void calc_global_nohz(void)
{
	long delta, active, n;

	if (!time_before(jiffies, calc_load_update + 10)) {
		/*
		 * Catch-up, fold however many we are behind still
		 */
		delta = jiffies - calc_load_update - 10;
		n = 1 + (delta / LOAD_FREQ);

		active = atomic_long_read(&calc_load_tasks);
		active = active > 0 ? active * FIXED_1 : 0;

		avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
		avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
		avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);

		calc_load_update += n * LOAD_FREQ;
	}

	/*
	 * Flip the idle index...
	 *
	 * Make sure we first write the new time then flip the index, so that
	 * calc_load_write_idx() will see the new time when it reads the new
	 * index, this avoids a double flip messing things up.
	 */
	smp_wmb();
	calc_load_idx++;
}
#else /* !CONFIG_NO_HZ_COMMON */

static inline long calc_load_fold_idle(void) { return 0; }
static inline void calc_global_nohz(void) { }

#endif /* CONFIG_NO_HZ_COMMON */

/*
 * calc_load - update the avenrun load estimates 10 ticks after the
 * CPUs have updated calc_load_tasks.
 */
void calc_global_load(unsigned long ticks)
{
	long active, delta;

	if (time_before(jiffies, calc_load_update + 10))
		return;

	/*
	 * Fold the 'old' idle-delta to include all NO_HZ cpus.
	 */
	delta = calc_load_fold_idle();
	if (delta)
		atomic_long_add(delta, &calc_load_tasks);

	active = atomic_long_read(&calc_load_tasks);
	active = active > 0 ? active * FIXED_1 : 0;

	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
	avenrun[2] = calc_load(avenrun[2], EXP_15, active);

	calc_load_update += LOAD_FREQ;

	/*
	 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
	 */
	calc_global_nohz();
}

/*
 * Called from update_cpu_load() to periodically update this CPU's
 * active count.
 */
static void calc_load_account_active(struct rq *this_rq)
{
	long delta;

	if (time_before(jiffies, this_rq->calc_load_update))
		return;

	delta  = calc_load_fold_active(this_rq);
	if (delta)
		atomic_long_add(delta, &calc_load_tasks);

	this_rq->calc_load_update += LOAD_FREQ;
}

/*
 * End of global load-average stuff
 */

/*
 * The exact cpuload at various idx values, calculated at every tick would be
 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
 *
 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
 * on nth tick when cpu may be busy, then we have:
 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
 * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
 *
 * decay_load_missed() below does efficient calculation of
 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
 *
 * The calculation is approximated on a 128 point scale.
 * degrade_zero_ticks is the number of ticks after which load at any
 * particular idx is approximated to be zero.
 * degrade_factor is a precomputed table, a row for each load idx.
 * Each column corresponds to degradation factor for a power of two ticks,
 * based on 128 point scale.
 * Example:
 * row 2, col 3 (=12) says that the degradation at load idx 2 after
 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
 *
 * With this power of 2 load factors, we can degrade the load n times
 * by looking at 1 bits in n and doing as many mult/shift instead of
 * n mult/shifts needed by the exact degradation.
 */
#define DEGRADE_SHIFT		7
static const unsigned char
		degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
static const unsigned char
		degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
					{0, 0, 0, 0, 0, 0, 0, 0},
					{64, 32, 8, 0, 0, 0, 0, 0},
					{96, 72, 40, 12, 1, 0, 0},
					{112, 98, 75, 43, 15, 1, 0},
					{120, 112, 98, 76, 45, 16, 2} };

/*
 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
 * would be when CPU is idle and so we just decay the old load without
 * adding any new load.
 */
static unsigned long
decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
{
	int j = 0;

	if (!missed_updates)
		return load;

	if (missed_updates >= degrade_zero_ticks[idx])
		return 0;

	if (idx == 1)
		return load >> missed_updates;

	while (missed_updates) {
		if (missed_updates % 2)
			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;

		missed_updates >>= 1;
		j++;
	}
	return load;
}

/*
 * Update rq->cpu_load[] statistics. This function is usually called every
 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
 * every tick. We fix it up based on jiffies.
 */
static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
			      unsigned long pending_updates)
{
	int i, scale;

	this_rq->nr_load_updates++;

	/* Update our load: */
	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
		unsigned long old_load, new_load;

		/* scale is effectively 1 << i now, and >> i divides by scale */

		old_load = this_rq->cpu_load[i];
		old_load = decay_load_missed(old_load, pending_updates - 1, i);
		new_load = this_load;
		/*
		 * Round up the averaging division if load is increasing. This
		 * prevents us from getting stuck on 9 if the load is 10, for
		 * example.
		 */
		if (new_load > old_load)
			new_load += scale - 1;

		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
	}

	sched_avg_update(this_rq);
}

#ifdef CONFIG_SMP
static inline unsigned long get_rq_runnable_load(struct rq *rq)
{
	return rq->cfs.runnable_load_avg;
}
#else
static inline unsigned long get_rq_runnable_load(struct rq *rq)
{
	return rq->load.weight;
}
#endif

#ifdef CONFIG_NO_HZ_COMMON
/*
 * There is no sane way to deal with nohz on smp when using jiffies because the
 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
 *
 * Therefore we cannot use the delta approach from the regular tick since that
 * would seriously skew the load calculation. However we'll make do for those
 * updates happening while idle (nohz_idle_balance) or coming out of idle
 * (tick_nohz_idle_exit).
 *
 * This means we might still be one tick off for nohz periods.
 */

/*
 * Called from nohz_idle_balance() to update the load ratings before doing the
 * idle balance.
 */
void update_idle_cpu_load(struct rq *this_rq)
{
	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
	unsigned long load = get_rq_runnable_load(this_rq);
	unsigned long pending_updates;

	/*
	 * bail if there's load or we're actually up-to-date.
	 */
	if (load || curr_jiffies == this_rq->last_load_update_tick)
		return;

	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
	this_rq->last_load_update_tick = curr_jiffies;

	__update_cpu_load(this_rq, load, pending_updates);
}

/*
 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
 */
void update_cpu_load_nohz(void)
{
	struct rq *this_rq = this_rq();
	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
	unsigned long pending_updates;

	if (curr_jiffies == this_rq->last_load_update_tick)
		return;

	raw_spin_lock(&this_rq->lock);
	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
	if (pending_updates) {
		this_rq->last_load_update_tick = curr_jiffies;
		/*
		 * We were idle, this means load 0, the current load might be
		 * !0 due to remote wakeups and the sort.
		 */
		__update_cpu_load(this_rq, 0, pending_updates);
	}
	raw_spin_unlock(&this_rq->lock);
}
#endif /* CONFIG_NO_HZ */

/*
 * Called from scheduler_tick()
 */
void update_cpu_load_active(struct rq *this_rq)
{
	unsigned long load = get_rq_runnable_load(this_rq);
	/*
	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
	 */
	this_rq->last_load_update_tick = jiffies;
	__update_cpu_load(this_rq, load, 1);

	calc_load_account_active(this_rq);
}
Commit	Line	Data
45ceebf7 PG	1	/*
	2	* kernel/sched/proc.c
	3	*
	4	* Kernel load calculations, forked from sched/core.c
	5	*/
	6
	7	#include <linux/export.h>
	8
	9	#include "sched.h"
	10
	11	unsigned long this_cpu_load(void)
	12	{
	13	struct rq *this = this_rq();
	14	return this->cpu_load[0];
	15	}
	16
	17
	18	/*
	19	* Global load-average calculations
	20	*
	21	* We take a distributed and async approach to calculating the global load-avg
	22	* in order to minimize overhead.
	23	*
	24	* The global load average is an exponentially decaying average of nr_running +
	25	* nr_uninterruptible.
	26	*
	27	* Once every LOAD_FREQ:
	28	*
	29	* nr_active = 0;
	30	* for_each_possible_cpu(cpu)
	31	* nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
	32	*
	33	* avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
	34	*
	35	* Due to a number of reasons the above turns in the mess below:
	36	*
	37	* - for_each_possible_cpu() is prohibitively expensive on machines with
	38	* serious number of cpus, therefore we need to take a distributed approach
	39	* to calculating nr_active.
	40	*
	41	* \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) \| x_i(t_0) := 0
	42	* = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
	43	*
	44	* So assuming nr_active := 0 when we start out -- true per definition, we
	45	* can simply take per-cpu deltas and fold those into a global accumulate
	46	* to obtain the same result. See calc_load_fold_active().
	47	*
	48	* Furthermore, in order to avoid synchronizing all per-cpu delta folding
	49	* across the machine, we assume 10 ticks is sufficient time for every
	50	* cpu to have completed this task.
	51	*
	52	* This places an upper-bound on the IRQ-off latency of the machine. Then
	53	* again, being late doesn't loose the delta, just wrecks the sample.
	54	*
	55	* - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
	56	* this would add another cross-cpu cacheline miss and atomic operation
	57	* to the wakeup path. Instead we increment on whatever cpu the task ran
	58	* when it went into uninterruptible state and decrement on whatever cpu
	59	* did the wakeup. This means that only the sum of nr_uninterruptible over
	60	* all cpus yields the correct result.
	61	*
	62	* This covers the NO_HZ=n code, for extra head-aches, see the comment below.
	63	*/
	64
65	/* Variables and functions for calc_load */
66	atomic_long_t calc_load_tasks;
67	unsigned long calc_load_update;
68	unsigned long avenrun[3];
69	EXPORT_SYMBOL(avenrun); /* should be removed */
70
71	/**
72	* get_avenrun - get the load average array
73	* @loads: pointer to dest load array
74	* @offset: offset to add
75	* @shift: shift count to shift the result left
76	*
77	* These values are estimates at best, so no need for locking.
78	*/
79	void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
80	{
81	loads[0] = (avenrun[0] + offset) << shift;
82	loads[1] = (avenrun[1] + offset) << shift;
83	loads[2] = (avenrun[2] + offset) << shift;
84	}
85
86	long calc_load_fold_active(struct rq *this_rq)
87	{
88	long nr_active, delta = 0;
89
90	nr_active = this_rq->nr_running;
91	nr_active += (long) this_rq->nr_uninterruptible;
92
93	if (nr_active != this_rq->calc_load_active) {
94	delta = nr_active - this_rq->calc_load_active;
95	this_rq->calc_load_active = nr_active;
96	}
97
98	return delta;
99	}
100
101	/*
102	* a1 = a0 * e + a * (1 - e)
103	*/
104	static unsigned long
105	calc_load(unsigned long load, unsigned long exp, unsigned long active)
106	{
107	load *= exp;
108	load += active * (FIXED_1 - exp);
109	load += 1UL << (FSHIFT - 1);
110	return load >> FSHIFT;
111	}
112
113	#ifdef CONFIG_NO_HZ_COMMON
114	/*
115	* Handle NO_HZ for the global load-average.
116	*
117	* Since the above described distributed algorithm to compute the global
118	* load-average relies on per-cpu sampling from the tick, it is affected by
119	* NO_HZ.
120	*
121	* The basic idea is to fold the nr_active delta into a global idle-delta upon
122	* entering NO_HZ state such that we can include this as an 'extra' cpu delta
123	* when we read the global state.
124	*
125	* Obviously reality has to ruin such a delightfully simple scheme:
126	*
127	* - When we go NO_HZ idle during the window, we can negate our sample
128	* contribution, causing under-accounting.
129	*
130	* We avoid this by keeping two idle-delta counters and flipping them
131	* when the window starts, thus separating old and new NO_HZ load.
132	*
133	* The only trick is the slight shift in index flip for read vs write.
134	*
135	* 0s 5s 10s 15s
136	* +10 +10 +10 +10
137	* \|-\|-----------\|-\|-----------\|-\|-----------\|-\|
138	* r:0 0 1 1 0 0 1 1 0
139	* w:0 1 1 0 0 1 1 0 0
140	*
141	* This ensures we'll fold the old idle contribution in this window while
142	* accumlating the new one.
143	*
144	* - When we wake up from NO_HZ idle during the window, we push up our
145	* contribution, since we effectively move our sample point to a known
146	* busy state.
147	*
148	* This is solved by pushing the window forward, and thus skipping the
149	* sample, for this cpu (effectively using the idle-delta for this cpu which
150	* was in effect at the time the window opened). This also solves the issue
151	* of having to deal with a cpu having been in NOHZ idle for multiple
152	* LOAD_FREQ intervals.
153	*
154	* When making the ILB scale, we should try to pull this in as well.
155	*/
156	static atomic_long_t calc_load_idle[2];
157	static int calc_load_idx;
158
159	static inline int calc_load_write_idx(void)
160	{
161	int idx = calc_load_idx;
162
163	/*
164	* See calc_global_nohz(), if we observe the new index, we also
165	* need to observe the new update time.
166	*/
167	smp_rmb();
168
169	/*
170	* If the folding window started, make sure we start writing in the
171	* next idle-delta.
172	*/
173	if (!time_before(jiffies, calc_load_update))
174	idx++;
175
176	return idx & 1;
177	}
178
179	static inline int calc_load_read_idx(void)
180	{
181	return calc_load_idx & 1;
182	}
183
184	void calc_load_enter_idle(void)
185	{
186	struct rq *this_rq = this_rq();
187	long delta;
188
189	/*
190	* We're going into NOHZ mode, if there's any pending delta, fold it
191	* into the pending idle delta.
192	*/
193	delta = calc_load_fold_active(this_rq);
194	if (delta) {
195	int idx = calc_load_write_idx();
196	atomic_long_add(delta, &calc_load_idle[idx]);
197	}
198	}
199
200	void calc_load_exit_idle(void)
201	{
202	struct rq *this_rq = this_rq();
203
204	/*
205	* If we're still before the sample window, we're done.
206	*/
207	if (time_before(jiffies, this_rq->calc_load_update))
208	return;
209
210	/*
211	* We woke inside or after the sample window, this means we're already
212	* accounted through the nohz accounting, so skip the entire deal and
213	* sync up for the next window.
214	*/
215	this_rq->calc_load_update = calc_load_update;
216	if (time_before(jiffies, this_rq->calc_load_update + 10))
217	this_rq->calc_load_update += LOAD_FREQ;
218	}
219
220	static long calc_load_fold_idle(void)
221	{
222	int idx = calc_load_read_idx();
223	long delta = 0;
224
225	if (atomic_long_read(&calc_load_idle[idx]))
226	delta = atomic_long_xchg(&calc_load_idle[idx], 0);
227
228	return delta;
229	}
230
231	/**
232	* fixed_power_int - compute: x^n, in O(log n) time
233	*
234	* @x: base of the power
235	* @frac_bits: fractional bits of @x
236	* @n: power to raise @x to.
237	*
238	* By exploiting the relation between the definition of the natural power
239	* function: x^n := xx...*x (x multiplied by itself for n times), and
240	* the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
241	* (where: n_i \elem {0, 1}, the binary vector representing n),
242	* we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
243	* of course trivially computable in O(log_2 n), the length of our binary
244	* vector.
245	*/
246	static unsigned long
247	fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
248	{
249	unsigned long result = 1UL << frac_bits;
250
251	if (n) for (;;) {
252	if (n & 1) {
253	result *= x;
254	result += 1UL << (frac_bits - 1);
255	result >>= frac_bits;
256	}
257	n >>= 1;
258	if (!n)
259	break;
260	x *= x;
261	x += 1UL << (frac_bits - 1);
262	x >>= frac_bits;
263	}
264
265	return result;
266	}
267
268	/*
269	* a1 = a0 * e + a * (1 - e)
270	*
271	* a2 = a1 * e + a * (1 - e)
272	* = (a0 * e + a * (1 - e)) * e + a * (1 - e)
273	* = a0 * e^2 + a * (1 - e) * (1 + e)
274	*
275	* a3 = a2 * e + a * (1 - e)
276	* = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
277	* = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
278	*
279	* ...
280	*
281	* an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
282	* = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
283	* = a0 * e^n + a * (1 - e^n)
284	*
285	* [1] application of the geometric series:
286	*
287	* n 1 - x^(n+1)
288	* S_n := \Sum x^i = -------------
289	* i=0 1 - x
290	*/
291	static unsigned long
292	calc_load_n(unsigned long load, unsigned long exp,
293	unsigned long active, unsigned int n)
294	{
295
296	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
297	}
298
299	/*
300	* NO_HZ can leave us missing all per-cpu ticks calling
301	* calc_load_account_active(), but since an idle CPU folds its delta into
302	* calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
303	* in the pending idle delta if our idle period crossed a load cycle boundary.
304	*
305	* Once we've updated the global active value, we need to apply the exponential
306	* weights adjusted to the number of cycles missed.
307	*/
308	static void calc_global_nohz(void)
309	{
310	long delta, active, n;
311
312	if (!time_before(jiffies, calc_load_update + 10)) {
313	/*
314	* Catch-up, fold however many we are behind still
315	*/
316	delta = jiffies - calc_load_update - 10;
317	n = 1 + (delta / LOAD_FREQ);
318
319	active = atomic_long_read(&calc_load_tasks);
320	active = active > 0 ? active * FIXED_1 : 0;
321
322	avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
323	avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
324	avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
325
326	calc_load_update += n * LOAD_FREQ;
327	}
328
329	/*
330	* Flip the idle index...
331	*
332	* Make sure we first write the new time then flip the index, so that
333	* calc_load_write_idx() will see the new time when it reads the new
334	* index, this avoids a double flip messing things up.
335	*/
336	smp_wmb();
337	calc_load_idx++;
338	}
339	#else /* !CONFIG_NO_HZ_COMMON */
340
341	static inline long calc_load_fold_idle(void) { return 0; }
342	static inline void calc_global_nohz(void) { }
343
344	#endif /* CONFIG_NO_HZ_COMMON */
345
346	/*
347	* calc_load - update the avenrun load estimates 10 ticks after the
348	* CPUs have updated calc_load_tasks.
349	*/
350	void calc_global_load(unsigned long ticks)
351	{
352	long active, delta;
353
354	if (time_before(jiffies, calc_load_update + 10))
355	return;
356
357	/*
358	* Fold the 'old' idle-delta to include all NO_HZ cpus.
359	*/
360	delta = calc_load_fold_idle();
361	if (delta)
362	atomic_long_add(delta, &calc_load_tasks);
363
364	active = atomic_long_read(&calc_load_tasks);
365	active = active > 0 ? active * FIXED_1 : 0;
366
367	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
368	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
369	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
370
371	calc_load_update += LOAD_FREQ;
372
373	/*
374	* In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
375	*/
376	calc_global_nohz();
377	}
378
379	/*
380	* Called from update_cpu_load() to periodically update this CPU's
381	* active count.
382	*/
383	static void calc_load_account_active(struct rq *this_rq)
384	{
385	long delta;
386
387	if (time_before(jiffies, this_rq->calc_load_update))
388	return;
389
390	delta = calc_load_fold_active(this_rq);
391	if (delta)
392	atomic_long_add(delta, &calc_load_tasks);
393
394	this_rq->calc_load_update += LOAD_FREQ;
395	}
396
397	/*
398	* End of global load-average stuff
399	*/
400
401	/*
402	* The exact cpuload at various idx values, calculated at every tick would be
403	* load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
404	*
405	* If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
406	* on nth tick when cpu may be busy, then we have:
407	* load = ((2^idx - 1) / 2^idx)^(n-1) * load
408	* load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
409	*
410	* decay_load_missed() below does efficient calculation of
411	* load = ((2^idx - 1) / 2^idx)^(n-1) * load
412	* avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
413	*
414	* The calculation is approximated on a 128 point scale.
415	* degrade_zero_ticks is the number of ticks after which load at any
416	* particular idx is approximated to be zero.
417	* degrade_factor is a precomputed table, a row for each load idx.
418	* Each column corresponds to degradation factor for a power of two ticks,
419	* based on 128 point scale.
420	* Example:
421	* row 2, col 3 (=12) says that the degradation at load idx 2 after
422	* 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
423	*
424	* With this power of 2 load factors, we can degrade the load n times
425	* by looking at 1 bits in n and doing as many mult/shift instead of
426	* n mult/shifts needed by the exact degradation.
427	*/
428	#define DEGRADE_SHIFT 7
429	static const unsigned char
430	degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
431	static const unsigned char
432	degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
433	{0, 0, 0, 0, 0, 0, 0, 0},
434	{64, 32, 8, 0, 0, 0, 0, 0},
435	{96, 72, 40, 12, 1, 0, 0},
436	{112, 98, 75, 43, 15, 1, 0},
437	{120, 112, 98, 76, 45, 16, 2} };
438
439	/*
440	* Update cpu_load for any missed ticks, due to tickless idle. The backlog
441	* would be when CPU is idle and so we just decay the old load without
442	* adding any new load.
443	*/
444	static unsigned long
445	decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
446	{
447	int j = 0;
448
449	if (!missed_updates)
450	return load;
451
452	if (missed_updates >= degrade_zero_ticks[idx])
453	return 0;
454
455	if (idx == 1)
456	return load >> missed_updates;
457
458	while (missed_updates) {
459	if (missed_updates % 2)
460	load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
461
462	missed_updates >>= 1;
463	j++;
464	}
465	return load;
466	}
467
468	/*
469	* Update rq->cpu_load[] statistics. This function is usually called every
470	* scheduler tick (TICK_NSEC). With tickless idle this will not be called
471	* every tick. We fix it up based on jiffies.
472	*/
473	static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
474	unsigned long pending_updates)
475	{
476	int i, scale;
477
478	this_rq->nr_load_updates++;
479
480	/* Update our load: */
481	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
482	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
483	unsigned long old_load, new_load;
484
485	/* scale is effectively 1 << i now, and >> i divides by scale */
486
487	old_load = this_rq->cpu_load[i];
488	old_load = decay_load_missed(old_load, pending_updates - 1, i);
489	new_load = this_load;
490	/*
491	* Round up the averaging division if load is increasing. This
492	* prevents us from getting stuck on 9 if the load is 10, for
493	* example.
494	*/
495	if (new_load > old_load)
496	new_load += scale - 1;
497
498	this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
499	}
500
501	sched_avg_update(this_rq);
502	}
503
b92486cb	504	#ifdef CONFIG_SMP
a9dc5d0e	505	static inline unsigned long get_rq_runnable_load(struct rq *rq)
b92486cb AS	506	{
	507	return rq->cfs.runnable_load_avg;
	508	}
	509	#else
a9dc5d0e	510	static inline unsigned long get_rq_runnable_load(struct rq *rq)
b92486cb AS	511	{
	512	return rq->load.weight;
	513	}
	514	#endif
	515
45ceebf7 PG	516	#ifdef CONFIG_NO_HZ_COMMON
	517	/*
	518	* There is no sane way to deal with nohz on smp when using jiffies because the
	519	* cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
	520	* causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
	521	*
	522	* Therefore we cannot use the delta approach from the regular tick since that
	523	* would seriously skew the load calculation. However we'll make do for those
	524	* updates happening while idle (nohz_idle_balance) or coming out of idle
	525	* (tick_nohz_idle_exit).
	526	*
	527	* This means we might still be one tick off for nohz periods.
	528	*/
	529
	530	/*
	531	* Called from nohz_idle_balance() to update the load ratings before doing the
	532	* idle balance.
	533	*/
	534	void update_idle_cpu_load(struct rq *this_rq)
	535	{
	536	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
b92486cb	537	unsigned long load = get_rq_runnable_load(this_rq);
45ceebf7 PG	538	unsigned long pending_updates;
	539
	540	/*
	541	* bail if there's load or we're actually up-to-date.
	542	*/
	543	if (load \|\| curr_jiffies == this_rq->last_load_update_tick)
	544	return;
	545
	546	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
	547	this_rq->last_load_update_tick = curr_jiffies;
	548
	549	__update_cpu_load(this_rq, load, pending_updates);
	550	}
	551
	552	/*
	553	* Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
	554	*/
	555	void update_cpu_load_nohz(void)
	556	{
	557	struct rq *this_rq = this_rq();
	558	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
	559	unsigned long pending_updates;
	560
	561	if (curr_jiffies == this_rq->last_load_update_tick)
	562	return;
	563
	564	raw_spin_lock(&this_rq->lock);
	565	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
	566	if (pending_updates) {
	567	this_rq->last_load_update_tick = curr_jiffies;
	568	/*
	569	* We were idle, this means load 0, the current load might be
	570	* !0 due to remote wakeups and the sort.
	571	*/
	572	__update_cpu_load(this_rq, 0, pending_updates);
	573	}
	574	raw_spin_unlock(&this_rq->lock);
	575	}
	576	#endif /* CONFIG_NO_HZ */
	577
	578	/*
	579	* Called from scheduler_tick()
	580	*/
	581	void update_cpu_load_active(struct rq *this_rq)
	582	{
b92486cb	583	unsigned long load = get_rq_runnable_load(this_rq);
45ceebf7 PG	584	/*
	585	* See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
	586	*/
	587	this_rq->last_load_update_tick = jiffies;
b92486cb	588	__update_cpu_load(this_rq, load, 1);
45ceebf7 PG	589
	590	calc_load_account_active(this_rq);
	591	}