]>
Commit | Line | Data |
---|---|---|
b26bf6ab RW |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* | |
3 | * Timer events oriented CPU idle governor | |
4 | * | |
5 | * Copyright (C) 2018 Intel Corporation | |
6 | * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> | |
7 | * | |
8 | * The idea of this governor is based on the observation that on many systems | |
9 | * timer events are two or more orders of magnitude more frequent than any | |
10 | * other interrupts, so they are likely to be the most significant source of CPU | |
11 | * wakeups from idle states. Moreover, information about what happened in the | |
12 | * (relatively recent) past can be used to estimate whether or not the deepest | |
13 | * idle state with target residency within the time to the closest timer is | |
14 | * likely to be suitable for the upcoming idle time of the CPU and, if not, then | |
15 | * which of the shallower idle states to choose. | |
16 | * | |
17 | * Of course, non-timer wakeup sources are more important in some use cases and | |
18 | * they can be covered by taking a few most recent idle time intervals of the | |
19 | * CPU into account. However, even in that case it is not necessary to consider | |
20 | * idle duration values greater than the time till the closest timer, as the | |
21 | * patterns that they may belong to produce average values close enough to | |
22 | * the time till the closest timer (sleep length) anyway. | |
23 | * | |
24 | * Thus this governor estimates whether or not the upcoming idle time of the CPU | |
25 | * is likely to be significantly shorter than the sleep length and selects an | |
26 | * idle state for it in accordance with that, as follows: | |
27 | * | |
28 | * - Find an idle state on the basis of the sleep length and state statistics | |
29 | * collected over time: | |
30 | * | |
31 | * o Find the deepest idle state whose target residency is less than or equal | |
32 | * to the sleep length. | |
33 | * | |
34 | * o Select it if it matched both the sleep length and the observed idle | |
35 | * duration in the past more often than it matched the sleep length alone | |
36 | * (i.e. the observed idle duration was significantly shorter than the sleep | |
37 | * length matched by it). | |
38 | * | |
39 | * o Otherwise, select the shallower state with the greatest matched "early" | |
40 | * wakeups metric. | |
41 | * | |
42 | * - If the majority of the most recent idle duration values are below the | |
43 | * target residency of the idle state selected so far, use those values to | |
44 | * compute the new expected idle duration and find an idle state matching it | |
45 | * (which has to be shallower than the one selected so far). | |
46 | */ | |
47 | ||
48 | #include <linux/cpuidle.h> | |
49 | #include <linux/jiffies.h> | |
50 | #include <linux/kernel.h> | |
51 | #include <linux/sched/clock.h> | |
52 | #include <linux/tick.h> | |
53 | ||
54 | /* | |
55 | * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value | |
56 | * is used for decreasing metrics on a regular basis. | |
57 | */ | |
58 | #define PULSE 1024 | |
59 | #define DECAY_SHIFT 3 | |
60 | ||
61 | /* | |
62 | * Number of the most recent idle duration values to take into consideration for | |
63 | * the detection of wakeup patterns. | |
64 | */ | |
65 | #define INTERVALS 8 | |
66 | ||
67 | /** | |
68 | * struct teo_idle_state - Idle state data used by the TEO cpuidle governor. | |
69 | * @early_hits: "Early" CPU wakeups "matching" this state. | |
70 | * @hits: "On time" CPU wakeups "matching" this state. | |
71 | * @misses: CPU wakeups "missing" this state. | |
72 | * | |
73 | * A CPU wakeup is "matched" by a given idle state if the idle duration measured | |
74 | * after the wakeup is between the target residency of that state and the target | |
75 | * residency of the next one (or if this is the deepest available idle state, it | |
76 | * "matches" a CPU wakeup when the measured idle duration is at least equal to | |
77 | * its target residency). | |
78 | * | |
79 | * Also, from the TEO governor perspective, a CPU wakeup from idle is "early" if | |
80 | * it occurs significantly earlier than the closest expected timer event (that | |
81 | * is, early enough to match an idle state shallower than the one matching the | |
82 | * time till the closest timer event). Otherwise, the wakeup is "on time", or | |
83 | * it is a "hit". | |
84 | * | |
85 | * A "miss" occurs when the given state doesn't match the wakeup, but it matches | |
86 | * the time till the closest timer event used for idle state selection. | |
87 | */ | |
88 | struct teo_idle_state { | |
89 | unsigned int early_hits; | |
90 | unsigned int hits; | |
91 | unsigned int misses; | |
92 | }; | |
93 | ||
94 | /** | |
95 | * struct teo_cpu - CPU data used by the TEO cpuidle governor. | |
96 | * @time_span_ns: Time between idle state selection and post-wakeup update. | |
97 | * @sleep_length_ns: Time till the closest timer event (at the selection time). | |
98 | * @states: Idle states data corresponding to this CPU. | |
b26bf6ab RW |
99 | * @interval_idx: Index of the most recent saved idle interval. |
100 | * @intervals: Saved idle duration values. | |
101 | */ | |
102 | struct teo_cpu { | |
103 | u64 time_span_ns; | |
104 | u64 sleep_length_ns; | |
105 | struct teo_idle_state states[CPUIDLE_STATE_MAX]; | |
b26bf6ab RW |
106 | int interval_idx; |
107 | unsigned int intervals[INTERVALS]; | |
108 | }; | |
109 | ||
110 | static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); | |
111 | ||
112 | /** | |
113 | * teo_update - Update CPU data after wakeup. | |
114 | * @drv: cpuidle driver containing state data. | |
115 | * @dev: Target CPU. | |
116 | */ | |
117 | static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) | |
118 | { | |
119 | struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); | |
120 | unsigned int sleep_length_us = ktime_to_us(cpu_data->sleep_length_ns); | |
121 | int i, idx_hit = -1, idx_timer = -1; | |
122 | unsigned int measured_us; | |
123 | ||
124 | if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) { | |
125 | /* | |
126 | * One of the safety nets has triggered or this was a timer | |
127 | * wakeup (or equivalent). | |
128 | */ | |
129 | measured_us = sleep_length_us; | |
130 | } else { | |
7d4daeed MT |
131 | unsigned int lat; |
132 | ||
133 | lat = drv->states[dev->last_state_idx].exit_latency; | |
b26bf6ab RW |
134 | |
135 | measured_us = ktime_to_us(cpu_data->time_span_ns); | |
136 | /* | |
137 | * The delay between the wakeup and the first instruction | |
138 | * executed by the CPU is not likely to be worst-case every | |
139 | * time, so take 1/2 of the exit latency as a very rough | |
140 | * approximation of the average of it. | |
141 | */ | |
142 | if (measured_us >= lat) | |
143 | measured_us -= lat / 2; | |
144 | else | |
145 | measured_us /= 2; | |
146 | } | |
147 | ||
148 | /* | |
149 | * Decay the "early hits" metric for all of the states and find the | |
150 | * states matching the sleep length and the measured idle duration. | |
151 | */ | |
152 | for (i = 0; i < drv->state_count; i++) { | |
153 | unsigned int early_hits = cpu_data->states[i].early_hits; | |
154 | ||
155 | cpu_data->states[i].early_hits -= early_hits >> DECAY_SHIFT; | |
156 | ||
157 | if (drv->states[i].target_residency <= sleep_length_us) { | |
158 | idx_timer = i; | |
159 | if (drv->states[i].target_residency <= measured_us) | |
160 | idx_hit = i; | |
161 | } | |
162 | } | |
163 | ||
164 | /* | |
165 | * Update the "hits" and "misses" data for the state matching the sleep | |
166 | * length. If it matches the measured idle duration too, this is a hit, | |
167 | * so increase the "hits" metric for it then. Otherwise, this is a | |
168 | * miss, so increase the "misses" metric for it. In the latter case | |
169 | * also increase the "early hits" metric for the state that actually | |
170 | * matches the measured idle duration. | |
171 | */ | |
172 | if (idx_timer >= 0) { | |
173 | unsigned int hits = cpu_data->states[idx_timer].hits; | |
174 | unsigned int misses = cpu_data->states[idx_timer].misses; | |
175 | ||
176 | hits -= hits >> DECAY_SHIFT; | |
177 | misses -= misses >> DECAY_SHIFT; | |
178 | ||
179 | if (idx_timer > idx_hit) { | |
180 | misses += PULSE; | |
181 | if (idx_hit >= 0) | |
182 | cpu_data->states[idx_hit].early_hits += PULSE; | |
183 | } else { | |
184 | hits += PULSE; | |
185 | } | |
186 | ||
187 | cpu_data->states[idx_timer].misses = misses; | |
188 | cpu_data->states[idx_timer].hits = hits; | |
189 | } | |
190 | ||
191 | /* | |
192 | * If the total time span between idle state selection and the "reflect" | |
193 | * callback is greater than or equal to the sleep length determined at | |
194 | * the idle state selection time, the wakeup is likely to be due to a | |
195 | * timer event. | |
196 | */ | |
197 | if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) | |
198 | measured_us = UINT_MAX; | |
199 | ||
200 | /* | |
201 | * Save idle duration values corresponding to non-timer wakeups for | |
202 | * pattern detection. | |
203 | */ | |
204 | cpu_data->intervals[cpu_data->interval_idx++] = measured_us; | |
205 | if (cpu_data->interval_idx > INTERVALS) | |
206 | cpu_data->interval_idx = 0; | |
207 | } | |
208 | ||
209 | /** | |
210 | * teo_find_shallower_state - Find shallower idle state matching given duration. | |
211 | * @drv: cpuidle driver containing state data. | |
212 | * @dev: Target CPU. | |
213 | * @state_idx: Index of the capping idle state. | |
214 | * @duration_us: Idle duration value to match. | |
215 | */ | |
216 | static int teo_find_shallower_state(struct cpuidle_driver *drv, | |
217 | struct cpuidle_device *dev, int state_idx, | |
218 | unsigned int duration_us) | |
219 | { | |
220 | int i; | |
221 | ||
222 | for (i = state_idx - 1; i >= 0; i--) { | |
223 | if (drv->states[i].disabled || dev->states_usage[i].disable) | |
224 | continue; | |
225 | ||
226 | state_idx = i; | |
227 | if (drv->states[i].target_residency <= duration_us) | |
228 | break; | |
229 | } | |
230 | return state_idx; | |
231 | } | |
232 | ||
233 | /** | |
234 | * teo_select - Selects the next idle state to enter. | |
235 | * @drv: cpuidle driver containing state data. | |
236 | * @dev: Target CPU. | |
237 | * @stop_tick: Indication on whether or not to stop the scheduler tick. | |
238 | */ | |
239 | static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, | |
240 | bool *stop_tick) | |
241 | { | |
242 | struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); | |
243 | int latency_req = cpuidle_governor_latency_req(dev->cpu); | |
244 | unsigned int duration_us, count; | |
cab09f3d | 245 | int max_early_idx, constraint_idx, idx, i; |
b26bf6ab RW |
246 | ktime_t delta_tick; |
247 | ||
7d4daeed | 248 | if (dev->last_state_idx >= 0) { |
b26bf6ab | 249 | teo_update(drv, dev); |
7d4daeed | 250 | dev->last_state_idx = -1; |
b26bf6ab RW |
251 | } |
252 | ||
253 | cpu_data->time_span_ns = local_clock(); | |
254 | ||
255 | cpu_data->sleep_length_ns = tick_nohz_get_sleep_length(&delta_tick); | |
256 | duration_us = ktime_to_us(cpu_data->sleep_length_ns); | |
257 | ||
258 | count = 0; | |
259 | max_early_idx = -1; | |
cab09f3d | 260 | constraint_idx = drv->state_count; |
b26bf6ab RW |
261 | idx = -1; |
262 | ||
263 | for (i = 0; i < drv->state_count; i++) { | |
264 | struct cpuidle_state *s = &drv->states[i]; | |
265 | struct cpuidle_state_usage *su = &dev->states_usage[i]; | |
266 | ||
267 | if (s->disabled || su->disable) { | |
268 | /* | |
269 | * If the "early hits" metric of a disabled state is | |
270 | * greater than the current maximum, it should be taken | |
271 | * into account, because it would be a mistake to select | |
272 | * a deeper state with lower "early hits" metric. The | |
273 | * index cannot be changed to point to it, however, so | |
274 | * just increase the max count alone and let the index | |
275 | * still point to a shallower idle state. | |
276 | */ | |
277 | if (max_early_idx >= 0 && | |
278 | count < cpu_data->states[i].early_hits) | |
279 | count = cpu_data->states[i].early_hits; | |
280 | ||
281 | continue; | |
282 | } | |
283 | ||
284 | if (idx < 0) | |
285 | idx = i; /* first enabled state */ | |
286 | ||
287 | if (s->target_residency > duration_us) | |
288 | break; | |
289 | ||
cab09f3d RW |
290 | if (s->exit_latency > latency_req && constraint_idx > i) |
291 | constraint_idx = i; | |
b26bf6ab RW |
292 | |
293 | idx = i; | |
294 | ||
295 | if (count < cpu_data->states[i].early_hits && | |
296 | !(tick_nohz_tick_stopped() && | |
297 | drv->states[i].target_residency < TICK_USEC)) { | |
298 | count = cpu_data->states[i].early_hits; | |
299 | max_early_idx = i; | |
300 | } | |
301 | } | |
302 | ||
303 | /* | |
304 | * If the "hits" metric of the idle state matching the sleep length is | |
305 | * greater than its "misses" metric, that is the one to use. Otherwise, | |
306 | * it is more likely that one of the shallower states will match the | |
307 | * idle duration observed after wakeup, so take the one with the maximum | |
308 | * "early hits" metric, but if that cannot be determined, just use the | |
309 | * state selected so far. | |
310 | */ | |
311 | if (cpu_data->states[idx].hits <= cpu_data->states[idx].misses && | |
312 | max_early_idx >= 0) { | |
313 | idx = max_early_idx; | |
314 | duration_us = drv->states[idx].target_residency; | |
315 | } | |
316 | ||
cab09f3d RW |
317 | /* |
318 | * If there is a latency constraint, it may be necessary to use a | |
319 | * shallower idle state than the one selected so far. | |
320 | */ | |
321 | if (constraint_idx < idx) | |
322 | idx = constraint_idx; | |
323 | ||
b26bf6ab RW |
324 | if (idx < 0) { |
325 | idx = 0; /* No states enabled. Must use 0. */ | |
326 | } else if (idx > 0) { | |
327 | u64 sum = 0; | |
328 | ||
329 | count = 0; | |
330 | ||
331 | /* | |
332 | * Count and sum the most recent idle duration values less than | |
cab09f3d | 333 | * the current expected idle duration value. |
b26bf6ab RW |
334 | */ |
335 | for (i = 0; i < INTERVALS; i++) { | |
336 | unsigned int val = cpu_data->intervals[i]; | |
337 | ||
cab09f3d | 338 | if (val >= duration_us) |
b26bf6ab RW |
339 | continue; |
340 | ||
341 | count++; | |
342 | sum += val; | |
343 | } | |
344 | ||
345 | /* | |
346 | * Give up unless the majority of the most recent idle duration | |
347 | * values are in the interesting range. | |
348 | */ | |
349 | if (count > INTERVALS / 2) { | |
350 | unsigned int avg_us = div64_u64(sum, count); | |
351 | ||
352 | /* | |
353 | * Avoid spending too much time in an idle state that | |
354 | * would be too shallow. | |
355 | */ | |
356 | if (!(tick_nohz_tick_stopped() && avg_us < TICK_USEC)) { | |
b26bf6ab | 357 | duration_us = avg_us; |
cab09f3d RW |
358 | if (drv->states[idx].target_residency > avg_us) |
359 | idx = teo_find_shallower_state(drv, dev, | |
360 | idx, avg_us); | |
b26bf6ab RW |
361 | } |
362 | } | |
363 | } | |
364 | ||
365 | /* | |
366 | * Don't stop the tick if the selected state is a polling one or if the | |
367 | * expected idle duration is shorter than the tick period length. | |
368 | */ | |
369 | if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) || | |
370 | duration_us < TICK_USEC) && !tick_nohz_tick_stopped()) { | |
371 | unsigned int delta_tick_us = ktime_to_us(delta_tick); | |
372 | ||
373 | *stop_tick = false; | |
374 | ||
375 | /* | |
376 | * The tick is not going to be stopped, so if the target | |
377 | * residency of the state to be returned is not within the time | |
378 | * till the closest timer including the tick, try to correct | |
379 | * that. | |
380 | */ | |
381 | if (idx > 0 && drv->states[idx].target_residency > delta_tick_us) | |
382 | idx = teo_find_shallower_state(drv, dev, idx, delta_tick_us); | |
383 | } | |
384 | ||
385 | return idx; | |
386 | } | |
387 | ||
388 | /** | |
389 | * teo_reflect - Note that governor data for the CPU need to be updated. | |
390 | * @dev: Target CPU. | |
391 | * @state: Entered state. | |
392 | */ | |
393 | static void teo_reflect(struct cpuidle_device *dev, int state) | |
394 | { | |
395 | struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); | |
396 | ||
7d4daeed | 397 | dev->last_state_idx = state; |
b26bf6ab RW |
398 | /* |
399 | * If the wakeup was not "natural", but triggered by one of the safety | |
400 | * nets, assume that the CPU might have been idle for the entire sleep | |
401 | * length time. | |
402 | */ | |
403 | if (dev->poll_time_limit || | |
404 | (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) { | |
405 | dev->poll_time_limit = false; | |
406 | cpu_data->time_span_ns = cpu_data->sleep_length_ns; | |
407 | } else { | |
408 | cpu_data->time_span_ns = local_clock() - cpu_data->time_span_ns; | |
409 | } | |
410 | } | |
411 | ||
412 | /** | |
413 | * teo_enable_device - Initialize the governor's data for the target CPU. | |
414 | * @drv: cpuidle driver (not used). | |
415 | * @dev: Target CPU. | |
416 | */ | |
417 | static int teo_enable_device(struct cpuidle_driver *drv, | |
418 | struct cpuidle_device *dev) | |
419 | { | |
420 | struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); | |
421 | int i; | |
422 | ||
423 | memset(cpu_data, 0, sizeof(*cpu_data)); | |
424 | ||
425 | for (i = 0; i < INTERVALS; i++) | |
426 | cpu_data->intervals[i] = UINT_MAX; | |
427 | ||
428 | return 0; | |
429 | } | |
430 | ||
431 | static struct cpuidle_governor teo_governor = { | |
432 | .name = "teo", | |
433 | .rating = 19, | |
434 | .enable = teo_enable_device, | |
435 | .select = teo_select, | |
436 | .reflect = teo_reflect, | |
437 | }; | |
438 | ||
439 | static int __init teo_governor_init(void) | |
440 | { | |
441 | return cpuidle_register_governor(&teo_governor); | |
442 | } | |
443 | ||
444 | postcore_initcall(teo_governor_init); |