]>
Commit | Line | Data |
---|---|---|
b26bf6ab RW |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* | |
3 | * Timer events oriented CPU idle governor | |
4 | * | |
5 | * Copyright (C) 2018 Intel Corporation | |
6 | * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> | |
7 | * | |
8 | * The idea of this governor is based on the observation that on many systems | |
9 | * timer events are two or more orders of magnitude more frequent than any | |
10 | * other interrupts, so they are likely to be the most significant source of CPU | |
11 | * wakeups from idle states. Moreover, information about what happened in the | |
12 | * (relatively recent) past can be used to estimate whether or not the deepest | |
13 | * idle state with target residency within the time to the closest timer is | |
14 | * likely to be suitable for the upcoming idle time of the CPU and, if not, then | |
15 | * which of the shallower idle states to choose. | |
16 | * | |
17 | * Of course, non-timer wakeup sources are more important in some use cases and | |
18 | * they can be covered by taking a few most recent idle time intervals of the | |
19 | * CPU into account. However, even in that case it is not necessary to consider | |
20 | * idle duration values greater than the time till the closest timer, as the | |
21 | * patterns that they may belong to produce average values close enough to | |
22 | * the time till the closest timer (sleep length) anyway. | |
23 | * | |
24 | * Thus this governor estimates whether or not the upcoming idle time of the CPU | |
25 | * is likely to be significantly shorter than the sleep length and selects an | |
26 | * idle state for it in accordance with that, as follows: | |
27 | * | |
28 | * - Find an idle state on the basis of the sleep length and state statistics | |
29 | * collected over time: | |
30 | * | |
31 | * o Find the deepest idle state whose target residency is less than or equal | |
32 | * to the sleep length. | |
33 | * | |
34 | * o Select it if it matched both the sleep length and the observed idle | |
35 | * duration in the past more often than it matched the sleep length alone | |
36 | * (i.e. the observed idle duration was significantly shorter than the sleep | |
37 | * length matched by it). | |
38 | * | |
39 | * o Otherwise, select the shallower state with the greatest matched "early" | |
40 | * wakeups metric. | |
41 | * | |
42 | * - If the majority of the most recent idle duration values are below the | |
43 | * target residency of the idle state selected so far, use those values to | |
44 | * compute the new expected idle duration and find an idle state matching it | |
45 | * (which has to be shallower than the one selected so far). | |
46 | */ | |
47 | ||
48 | #include <linux/cpuidle.h> | |
49 | #include <linux/jiffies.h> | |
50 | #include <linux/kernel.h> | |
51 | #include <linux/sched/clock.h> | |
52 | #include <linux/tick.h> | |
53 | ||
54 | /* | |
55 | * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value | |
56 | * is used for decreasing metrics on a regular basis. | |
57 | */ | |
58 | #define PULSE 1024 | |
59 | #define DECAY_SHIFT 3 | |
60 | ||
61 | /* | |
62 | * Number of the most recent idle duration values to take into consideration for | |
63 | * the detection of wakeup patterns. | |
64 | */ | |
65 | #define INTERVALS 8 | |
66 | ||
67 | /** | |
68 | * struct teo_idle_state - Idle state data used by the TEO cpuidle governor. | |
69 | * @early_hits: "Early" CPU wakeups "matching" this state. | |
70 | * @hits: "On time" CPU wakeups "matching" this state. | |
71 | * @misses: CPU wakeups "missing" this state. | |
72 | * | |
73 | * A CPU wakeup is "matched" by a given idle state if the idle duration measured | |
74 | * after the wakeup is between the target residency of that state and the target | |
75 | * residency of the next one (or if this is the deepest available idle state, it | |
76 | * "matches" a CPU wakeup when the measured idle duration is at least equal to | |
77 | * its target residency). | |
78 | * | |
79 | * Also, from the TEO governor perspective, a CPU wakeup from idle is "early" if | |
80 | * it occurs significantly earlier than the closest expected timer event (that | |
81 | * is, early enough to match an idle state shallower than the one matching the | |
82 | * time till the closest timer event). Otherwise, the wakeup is "on time", or | |
83 | * it is a "hit". | |
84 | * | |
85 | * A "miss" occurs when the given state doesn't match the wakeup, but it matches | |
86 | * the time till the closest timer event used for idle state selection. | |
87 | */ | |
88 | struct teo_idle_state { | |
89 | unsigned int early_hits; | |
90 | unsigned int hits; | |
91 | unsigned int misses; | |
92 | }; | |
93 | ||
94 | /** | |
95 | * struct teo_cpu - CPU data used by the TEO cpuidle governor. | |
96 | * @time_span_ns: Time between idle state selection and post-wakeup update. | |
97 | * @sleep_length_ns: Time till the closest timer event (at the selection time). | |
98 | * @states: Idle states data corresponding to this CPU. | |
b26bf6ab RW |
99 | * @interval_idx: Index of the most recent saved idle interval. |
100 | * @intervals: Saved idle duration values. | |
101 | */ | |
102 | struct teo_cpu { | |
103 | u64 time_span_ns; | |
104 | u64 sleep_length_ns; | |
105 | struct teo_idle_state states[CPUIDLE_STATE_MAX]; | |
b26bf6ab | 106 | int interval_idx; |
c1d51f68 | 107 | u64 intervals[INTERVALS]; |
b26bf6ab RW |
108 | }; |
109 | ||
110 | static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); | |
111 | ||
112 | /** | |
113 | * teo_update - Update CPU data after wakeup. | |
114 | * @drv: cpuidle driver containing state data. | |
115 | * @dev: Target CPU. | |
116 | */ | |
117 | static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) | |
118 | { | |
119 | struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); | |
b26bf6ab | 120 | int i, idx_hit = -1, idx_timer = -1; |
c1d51f68 | 121 | u64 measured_ns; |
b26bf6ab RW |
122 | |
123 | if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) { | |
124 | /* | |
b7e7fffd RW |
125 | * One of the safety nets has triggered or the wakeup was close |
126 | * enough to the closest timer event expected at the idle state | |
127 | * selection time to be discarded. | |
b26bf6ab | 128 | */ |
c1d51f68 | 129 | measured_ns = U64_MAX; |
b26bf6ab | 130 | } else { |
c1d51f68 | 131 | u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns; |
7d4daeed | 132 | |
b6495b7f RW |
133 | /* |
134 | * The computations below are to determine whether or not the | |
135 | * (saved) time till the next timer event and the measured idle | |
136 | * duration fall into the same "bin", so use last_residency_ns | |
137 | * for that instead of time_span_ns which includes the cpuidle | |
138 | * overhead. | |
139 | */ | |
140 | measured_ns = dev->last_residency_ns; | |
b26bf6ab RW |
141 | /* |
142 | * The delay between the wakeup and the first instruction | |
143 | * executed by the CPU is not likely to be worst-case every | |
144 | * time, so take 1/2 of the exit latency as a very rough | |
145 | * approximation of the average of it. | |
146 | */ | |
c1d51f68 RW |
147 | if (measured_ns >= lat_ns) |
148 | measured_ns -= lat_ns / 2; | |
b26bf6ab | 149 | else |
c1d51f68 | 150 | measured_ns /= 2; |
b26bf6ab RW |
151 | } |
152 | ||
153 | /* | |
154 | * Decay the "early hits" metric for all of the states and find the | |
155 | * states matching the sleep length and the measured idle duration. | |
156 | */ | |
157 | for (i = 0; i < drv->state_count; i++) { | |
158 | unsigned int early_hits = cpu_data->states[i].early_hits; | |
159 | ||
160 | cpu_data->states[i].early_hits -= early_hits >> DECAY_SHIFT; | |
161 | ||
c1d51f68 | 162 | if (drv->states[i].target_residency_ns <= cpu_data->sleep_length_ns) { |
b26bf6ab | 163 | idx_timer = i; |
c1d51f68 | 164 | if (drv->states[i].target_residency_ns <= measured_ns) |
b26bf6ab RW |
165 | idx_hit = i; |
166 | } | |
167 | } | |
168 | ||
169 | /* | |
170 | * Update the "hits" and "misses" data for the state matching the sleep | |
171 | * length. If it matches the measured idle duration too, this is a hit, | |
172 | * so increase the "hits" metric for it then. Otherwise, this is a | |
173 | * miss, so increase the "misses" metric for it. In the latter case | |
174 | * also increase the "early hits" metric for the state that actually | |
175 | * matches the measured idle duration. | |
176 | */ | |
177 | if (idx_timer >= 0) { | |
178 | unsigned int hits = cpu_data->states[idx_timer].hits; | |
179 | unsigned int misses = cpu_data->states[idx_timer].misses; | |
180 | ||
181 | hits -= hits >> DECAY_SHIFT; | |
182 | misses -= misses >> DECAY_SHIFT; | |
183 | ||
184 | if (idx_timer > idx_hit) { | |
185 | misses += PULSE; | |
186 | if (idx_hit >= 0) | |
187 | cpu_data->states[idx_hit].early_hits += PULSE; | |
188 | } else { | |
189 | hits += PULSE; | |
190 | } | |
191 | ||
192 | cpu_data->states[idx_timer].misses = misses; | |
193 | cpu_data->states[idx_timer].hits = hits; | |
194 | } | |
195 | ||
b26bf6ab RW |
196 | /* |
197 | * Save idle duration values corresponding to non-timer wakeups for | |
198 | * pattern detection. | |
199 | */ | |
c1d51f68 | 200 | cpu_data->intervals[cpu_data->interval_idx++] = measured_ns; |
57388a2c | 201 | if (cpu_data->interval_idx >= INTERVALS) |
b26bf6ab RW |
202 | cpu_data->interval_idx = 0; |
203 | } | |
204 | ||
85f6a17f RW |
205 | static bool teo_time_ok(u64 interval_ns) |
206 | { | |
207 | return !tick_nohz_tick_stopped() || interval_ns >= TICK_NSEC; | |
208 | } | |
209 | ||
b26bf6ab RW |
210 | /** |
211 | * teo_find_shallower_state - Find shallower idle state matching given duration. | |
212 | * @drv: cpuidle driver containing state data. | |
213 | * @dev: Target CPU. | |
214 | * @state_idx: Index of the capping idle state. | |
c1d51f68 | 215 | * @duration_ns: Idle duration value to match. |
b26bf6ab RW |
216 | */ |
217 | static int teo_find_shallower_state(struct cpuidle_driver *drv, | |
218 | struct cpuidle_device *dev, int state_idx, | |
c1d51f68 | 219 | u64 duration_ns) |
b26bf6ab RW |
220 | { |
221 | int i; | |
222 | ||
223 | for (i = state_idx - 1; i >= 0; i--) { | |
99e98d3f | 224 | if (dev->states_usage[i].disable) |
b26bf6ab RW |
225 | continue; |
226 | ||
227 | state_idx = i; | |
c1d51f68 | 228 | if (drv->states[i].target_residency_ns <= duration_ns) |
b26bf6ab RW |
229 | break; |
230 | } | |
231 | return state_idx; | |
232 | } | |
233 | ||
234 | /** | |
235 | * teo_select - Selects the next idle state to enter. | |
236 | * @drv: cpuidle driver containing state data. | |
237 | * @dev: Target CPU. | |
238 | * @stop_tick: Indication on whether or not to stop the scheduler tick. | |
239 | */ | |
240 | static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, | |
241 | bool *stop_tick) | |
242 | { | |
243 | struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); | |
c1d51f68 RW |
244 | s64 latency_req = cpuidle_governor_latency_req(dev->cpu); |
245 | u64 duration_ns; | |
246 | unsigned int hits, misses, early_hits; | |
63f202e5 | 247 | int max_early_idx, prev_max_early_idx, constraint_idx, idx, i; |
b26bf6ab RW |
248 | ktime_t delta_tick; |
249 | ||
7d4daeed | 250 | if (dev->last_state_idx >= 0) { |
b26bf6ab | 251 | teo_update(drv, dev); |
7d4daeed | 252 | dev->last_state_idx = -1; |
b26bf6ab RW |
253 | } |
254 | ||
255 | cpu_data->time_span_ns = local_clock(); | |
256 | ||
c1d51f68 RW |
257 | duration_ns = tick_nohz_get_sleep_length(&delta_tick); |
258 | cpu_data->sleep_length_ns = duration_ns; | |
b26bf6ab | 259 | |
e43dcf20 RW |
260 | hits = 0; |
261 | misses = 0; | |
4f690bb8 | 262 | early_hits = 0; |
b26bf6ab | 263 | max_early_idx = -1; |
63f202e5 | 264 | prev_max_early_idx = -1; |
cab09f3d | 265 | constraint_idx = drv->state_count; |
b26bf6ab RW |
266 | idx = -1; |
267 | ||
268 | for (i = 0; i < drv->state_count; i++) { | |
269 | struct cpuidle_state *s = &drv->states[i]; | |
b26bf6ab | 270 | |
99e98d3f | 271 | if (dev->states_usage[i].disable) { |
069ce2ef RW |
272 | /* |
273 | * Ignore disabled states with target residencies beyond | |
274 | * the anticipated idle duration. | |
275 | */ | |
c1d51f68 | 276 | if (s->target_residency_ns > duration_ns) |
069ce2ef RW |
277 | continue; |
278 | ||
e43dcf20 RW |
279 | /* |
280 | * This state is disabled, so the range of idle duration | |
281 | * values corresponding to it is covered by the current | |
282 | * candidate state, but still the "hits" and "misses" | |
283 | * metrics of the disabled state need to be used to | |
284 | * decide whether or not the state covering the range in | |
285 | * question is good enough. | |
286 | */ | |
287 | hits = cpu_data->states[i].hits; | |
288 | misses = cpu_data->states[i].misses; | |
289 | ||
159e4856 RW |
290 | if (early_hits >= cpu_data->states[i].early_hits || |
291 | idx < 0) | |
292 | continue; | |
293 | ||
294 | /* | |
295 | * If the current candidate state has been the one with | |
296 | * the maximum "early hits" metric so far, the "early | |
297 | * hits" metric of the disabled state replaces the | |
298 | * current "early hits" count to avoid selecting a | |
299 | * deeper state with lower "early hits" metric. | |
300 | */ | |
301 | if (max_early_idx == idx) { | |
302 | early_hits = cpu_data->states[i].early_hits; | |
303 | continue; | |
304 | } | |
305 | ||
b26bf6ab | 306 | /* |
159e4856 RW |
307 | * The current candidate state is closer to the disabled |
308 | * one than the current maximum "early hits" state, so | |
309 | * replace the latter with it, but in case the maximum | |
310 | * "early hits" state index has not been set so far, | |
311 | * check if the current candidate state is not too | |
312 | * shallow for that role. | |
b26bf6ab | 313 | */ |
85f6a17f | 314 | if (teo_time_ok(drv->states[idx].target_residency_ns)) { |
63f202e5 | 315 | prev_max_early_idx = max_early_idx; |
4f690bb8 | 316 | early_hits = cpu_data->states[i].early_hits; |
159e4856 RW |
317 | max_early_idx = idx; |
318 | } | |
b26bf6ab RW |
319 | |
320 | continue; | |
321 | } | |
322 | ||
e43dcf20 | 323 | if (idx < 0) { |
b26bf6ab | 324 | idx = i; /* first enabled state */ |
e43dcf20 RW |
325 | hits = cpu_data->states[i].hits; |
326 | misses = cpu_data->states[i].misses; | |
327 | } | |
b26bf6ab | 328 | |
c1d51f68 | 329 | if (s->target_residency_ns > duration_ns) |
b26bf6ab RW |
330 | break; |
331 | ||
c1d51f68 | 332 | if (s->exit_latency_ns > latency_req && constraint_idx > i) |
cab09f3d | 333 | constraint_idx = i; |
b26bf6ab RW |
334 | |
335 | idx = i; | |
e43dcf20 RW |
336 | hits = cpu_data->states[i].hits; |
337 | misses = cpu_data->states[i].misses; | |
b26bf6ab | 338 | |
4f690bb8 | 339 | if (early_hits < cpu_data->states[i].early_hits && |
85f6a17f | 340 | teo_time_ok(drv->states[i].target_residency_ns)) { |
63f202e5 | 341 | prev_max_early_idx = max_early_idx; |
4f690bb8 | 342 | early_hits = cpu_data->states[i].early_hits; |
b26bf6ab RW |
343 | max_early_idx = i; |
344 | } | |
345 | } | |
346 | ||
347 | /* | |
348 | * If the "hits" metric of the idle state matching the sleep length is | |
349 | * greater than its "misses" metric, that is the one to use. Otherwise, | |
350 | * it is more likely that one of the shallower states will match the | |
351 | * idle duration observed after wakeup, so take the one with the maximum | |
352 | * "early hits" metric, but if that cannot be determined, just use the | |
353 | * state selected so far. | |
354 | */ | |
63f202e5 RW |
355 | if (hits <= misses) { |
356 | /* | |
357 | * The current candidate state is not suitable, so take the one | |
358 | * whose "early hits" metric is the maximum for the range of | |
359 | * shallower states. | |
360 | */ | |
361 | if (idx == max_early_idx) | |
362 | max_early_idx = prev_max_early_idx; | |
363 | ||
364 | if (max_early_idx >= 0) { | |
365 | idx = max_early_idx; | |
366 | duration_ns = drv->states[idx].target_residency_ns; | |
367 | } | |
b26bf6ab RW |
368 | } |
369 | ||
cab09f3d RW |
370 | /* |
371 | * If there is a latency constraint, it may be necessary to use a | |
372 | * shallower idle state than the one selected so far. | |
373 | */ | |
374 | if (constraint_idx < idx) | |
375 | idx = constraint_idx; | |
376 | ||
b26bf6ab RW |
377 | if (idx < 0) { |
378 | idx = 0; /* No states enabled. Must use 0. */ | |
379 | } else if (idx > 0) { | |
4f690bb8 | 380 | unsigned int count = 0; |
b26bf6ab RW |
381 | u64 sum = 0; |
382 | ||
b26bf6ab RW |
383 | /* |
384 | * Count and sum the most recent idle duration values less than | |
cab09f3d | 385 | * the current expected idle duration value. |
b26bf6ab RW |
386 | */ |
387 | for (i = 0; i < INTERVALS; i++) { | |
c1d51f68 | 388 | u64 val = cpu_data->intervals[i]; |
b26bf6ab | 389 | |
c1d51f68 | 390 | if (val >= duration_ns) |
b26bf6ab RW |
391 | continue; |
392 | ||
393 | count++; | |
394 | sum += val; | |
395 | } | |
396 | ||
397 | /* | |
398 | * Give up unless the majority of the most recent idle duration | |
399 | * values are in the interesting range. | |
400 | */ | |
401 | if (count > INTERVALS / 2) { | |
c1d51f68 | 402 | u64 avg_ns = div64_u64(sum, count); |
b26bf6ab RW |
403 | |
404 | /* | |
405 | * Avoid spending too much time in an idle state that | |
406 | * would be too shallow. | |
407 | */ | |
85f6a17f | 408 | if (teo_time_ok(avg_ns)) { |
c1d51f68 RW |
409 | duration_ns = avg_ns; |
410 | if (drv->states[idx].target_residency_ns > avg_ns) | |
cab09f3d | 411 | idx = teo_find_shallower_state(drv, dev, |
c1d51f68 | 412 | idx, avg_ns); |
b26bf6ab RW |
413 | } |
414 | } | |
415 | } | |
416 | ||
417 | /* | |
418 | * Don't stop the tick if the selected state is a polling one or if the | |
419 | * expected idle duration is shorter than the tick period length. | |
420 | */ | |
421 | if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) || | |
c1d51f68 | 422 | duration_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) { |
b26bf6ab RW |
423 | *stop_tick = false; |
424 | ||
425 | /* | |
426 | * The tick is not going to be stopped, so if the target | |
427 | * residency of the state to be returned is not within the time | |
428 | * till the closest timer including the tick, try to correct | |
429 | * that. | |
430 | */ | |
c1d51f68 RW |
431 | if (idx > 0 && drv->states[idx].target_residency_ns > delta_tick) |
432 | idx = teo_find_shallower_state(drv, dev, idx, delta_tick); | |
b26bf6ab RW |
433 | } |
434 | ||
435 | return idx; | |
436 | } | |
437 | ||
438 | /** | |
439 | * teo_reflect - Note that governor data for the CPU need to be updated. | |
440 | * @dev: Target CPU. | |
441 | * @state: Entered state. | |
442 | */ | |
443 | static void teo_reflect(struct cpuidle_device *dev, int state) | |
444 | { | |
445 | struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); | |
446 | ||
7d4daeed | 447 | dev->last_state_idx = state; |
b26bf6ab RW |
448 | /* |
449 | * If the wakeup was not "natural", but triggered by one of the safety | |
450 | * nets, assume that the CPU might have been idle for the entire sleep | |
451 | * length time. | |
452 | */ | |
453 | if (dev->poll_time_limit || | |
454 | (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) { | |
455 | dev->poll_time_limit = false; | |
456 | cpu_data->time_span_ns = cpu_data->sleep_length_ns; | |
457 | } else { | |
458 | cpu_data->time_span_ns = local_clock() - cpu_data->time_span_ns; | |
459 | } | |
460 | } | |
461 | ||
462 | /** | |
463 | * teo_enable_device - Initialize the governor's data for the target CPU. | |
464 | * @drv: cpuidle driver (not used). | |
465 | * @dev: Target CPU. | |
466 | */ | |
467 | static int teo_enable_device(struct cpuidle_driver *drv, | |
468 | struct cpuidle_device *dev) | |
469 | { | |
470 | struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); | |
471 | int i; | |
472 | ||
473 | memset(cpu_data, 0, sizeof(*cpu_data)); | |
474 | ||
475 | for (i = 0; i < INTERVALS; i++) | |
c1d51f68 | 476 | cpu_data->intervals[i] = U64_MAX; |
b26bf6ab RW |
477 | |
478 | return 0; | |
479 | } | |
480 | ||
481 | static struct cpuidle_governor teo_governor = { | |
482 | .name = "teo", | |
483 | .rating = 19, | |
484 | .enable = teo_enable_device, | |
485 | .select = teo_select, | |
486 | .reflect = teo_reflect, | |
487 | }; | |
488 | ||
489 | static int __init teo_governor_init(void) | |
490 | { | |
491 | return cpuidle_register_governor(&teo_governor); | |
492 | } | |
493 | ||
494 | postcore_initcall(teo_governor_init); |