]>
Commit | Line | Data |
---|---|---|
7fe2f639 DB |
1 | /* |
2 | * (C) 2010,2011 Thomas Renninger <trenn@suse.de>, Novell Inc. | |
3 | * | |
4 | * Licensed under the terms of the GNU GPL License version 2. | |
5 | */ | |
6 | ||
7 | #if defined(__i386__) || defined(__x86_64__) | |
8 | ||
9 | #include <stdio.h> | |
10 | #include <stdint.h> | |
11 | #include <stdlib.h> | |
12 | #include <string.h> | |
13 | #include <limits.h> | |
14 | ||
15 | #include <cpufreq.h> | |
16 | ||
17 | #include "helpers/helpers.h" | |
18 | #include "idle_monitor/cpupower-monitor.h" | |
19 | ||
/* Per-CPU counter MSRs sampled at measurement start and stop */
#define MSR_APERF	0xE8
#define MSR_MPERF	0xE7

/* Time stamp counter MSR, read via mperf_get_tsc() */
#define MSR_TSC		0x10

/* AMD only: bit 24 of this MSR is tested in init_maxfreq_mode() */
#define MSR_AMD_HWCR	0xc0010015

/* Index of each statistic inside mperf_cstates[] */
enum mperf_id {
	C0 = 0,
	Cx,
	AVG_FREQ,
	MPERF_CSTATE_COUNT
};

static int mperf_get_count_percent(unsigned int self_id, double *percent,
				   unsigned int cpu);
static int mperf_get_count_freq(unsigned int id, unsigned long long *count,
				unsigned int cpu);

/* Wall clock time stamps taken in mperf_start() and mperf_stop() */
static struct timespec time_start;
static struct timespec time_end;
7fe2f639 DB |
34 | |
35 | static cstate_t mperf_cstates[MPERF_CSTATE_COUNT] = { | |
36 | { | |
37 | .name = "C0", | |
38 | .desc = N_("Processor Core not idle"), | |
39 | .id = C0, | |
40 | .range = RANGE_THREAD, | |
41 | .get_count_percent = mperf_get_count_percent, | |
42 | }, | |
43 | { | |
44 | .name = "Cx", | |
45 | .desc = N_("Processor Core in an idle state"), | |
46 | .id = Cx, | |
47 | .range = RANGE_THREAD, | |
48 | .get_count_percent = mperf_get_count_percent, | |
49 | }, | |
50 | ||
51 | { | |
52 | .name = "Freq", | |
53 | .desc = N_("Average Frequency (including boost) in MHz"), | |
54 | .id = AVG_FREQ, | |
55 | .range = RANGE_THREAD, | |
56 | .get_count = mperf_get_count_freq, | |
57 | }, | |
58 | }; | |
59 | ||
2dfc818b TR |
/* How the reference (maximum) frequency of the mperf counter is obtained */
enum MAX_FREQ_MODE {
	MAX_FREQ_SYSFS,
	MAX_FREQ_TSC_REF
};
static int max_freq_mode;

/*
 * The max frequency mperf is ticking at (in C0), either retrieved via:
 *   1) calculated after measurements if we know TSC ticks at mperf/P0
 *      frequency (TSC ticks per microsecond, see mperf_get_count_freq())
 *   2) cpufreq /sys/devices/.../cpu0/cpufreq/cpuinfo_max_freq at init time
 *      (value as returned by cpufreq_get_hardware_limits())
 * 1) is preferred as it also works without cpufreq subsystem (e.g. on Xen)
 */
static unsigned long max_frequency;

/* TSC values sampled in mperf_start() and mperf_stop() */
static unsigned long long tsc_at_measure_start;
static unsigned long long tsc_at_measure_end;

/* Per-CPU MPERF/APERF samples, allocated in mperf_register() */
static unsigned long long *mperf_previous_count;
static unsigned long long *aperf_previous_count;
static unsigned long long *mperf_current_count;
static unsigned long long *aperf_current_count;

/* Per-CPU valid flag. It is zero for a CPU whose MSR read failed */
static int *is_valid;
79 | ||
80 | static int mperf_get_tsc(unsigned long long *tsc) | |
81 | { | |
2dfc818b TR |
82 | int ret; |
83 | ret = read_msr(0, MSR_TSC, tsc); | |
84 | if (ret) | |
85 | dprint("Reading TSC MSR failed, returning %llu\n", *tsc); | |
86 | return ret; | |
7fe2f639 DB |
87 | } |
88 | ||
89 | static int mperf_init_stats(unsigned int cpu) | |
90 | { | |
91 | unsigned long long val; | |
92 | int ret; | |
93 | ||
94 | ret = read_msr(cpu, MSR_APERF, &val); | |
95 | aperf_previous_count[cpu] = val; | |
96 | ret |= read_msr(cpu, MSR_MPERF, &val); | |
97 | mperf_previous_count[cpu] = val; | |
98 | is_valid[cpu] = !ret; | |
b510b541 | 99 | |
7fe2f639 DB |
100 | return 0; |
101 | } | |
102 | ||
103 | static int mperf_measure_stats(unsigned int cpu) | |
104 | { | |
105 | unsigned long long val; | |
106 | int ret; | |
107 | ||
108 | ret = read_msr(cpu, MSR_APERF, &val); | |
109 | aperf_current_count[cpu] = val; | |
110 | ret |= read_msr(cpu, MSR_MPERF, &val); | |
111 | mperf_current_count[cpu] = val; | |
112 | is_valid[cpu] = !ret; | |
b510b541 | 113 | |
7fe2f639 DB |
114 | return 0; |
115 | } | |
116 | ||
7fe2f639 DB |
117 | static int mperf_get_count_percent(unsigned int id, double *percent, |
118 | unsigned int cpu) | |
119 | { | |
120 | unsigned long long aperf_diff, mperf_diff, tsc_diff; | |
2dfc818b | 121 | unsigned long long timediff; |
7fe2f639 DB |
122 | |
123 | if (!is_valid[cpu]) | |
124 | return -1; | |
125 | ||
126 | if (id != C0 && id != Cx) | |
127 | return -1; | |
128 | ||
129 | mperf_diff = mperf_current_count[cpu] - mperf_previous_count[cpu]; | |
130 | aperf_diff = aperf_current_count[cpu] - aperf_previous_count[cpu]; | |
7fe2f639 | 131 | |
2dfc818b TR |
132 | if (max_freq_mode == MAX_FREQ_TSC_REF) { |
133 | tsc_diff = tsc_at_measure_end - tsc_at_measure_start; | |
134 | *percent = 100.0 * mperf_diff / tsc_diff; | |
135 | dprint("%s: TSC Ref - mperf_diff: %llu, tsc_diff: %llu\n", | |
136 | mperf_cstates[id].name, mperf_diff, tsc_diff); | |
137 | } else if (max_freq_mode == MAX_FREQ_SYSFS) { | |
138 | timediff = timespec_diff_us(time_start, time_end); | |
139 | *percent = 100.0 * mperf_diff / timediff; | |
140 | dprint("%s: MAXFREQ - mperf_diff: %llu, time_diff: %llu\n", | |
141 | mperf_cstates[id].name, mperf_diff, timediff); | |
142 | } else | |
143 | return -1; | |
7fe2f639 DB |
144 | |
145 | if (id == Cx) | |
146 | *percent = 100.0 - *percent; | |
147 | ||
b510b541 DB |
148 | dprint("%s: previous: %llu - current: %llu - (%u)\n", |
149 | mperf_cstates[id].name, mperf_diff, aperf_diff, cpu); | |
7fe2f639 DB |
150 | dprint("%s: %f\n", mperf_cstates[id].name, *percent); |
151 | return 0; | |
152 | } | |
153 | ||
154 | static int mperf_get_count_freq(unsigned int id, unsigned long long *count, | |
b510b541 | 155 | unsigned int cpu) |
7fe2f639 | 156 | { |
2dfc818b | 157 | unsigned long long aperf_diff, mperf_diff, time_diff, tsc_diff; |
7fe2f639 DB |
158 | |
159 | if (id != AVG_FREQ) | |
160 | return 1; | |
161 | ||
162 | if (!is_valid[cpu]) | |
163 | return -1; | |
164 | ||
165 | mperf_diff = mperf_current_count[cpu] - mperf_previous_count[cpu]; | |
166 | aperf_diff = aperf_current_count[cpu] - aperf_previous_count[cpu]; | |
167 | ||
2dfc818b TR |
168 | if (max_freq_mode == MAX_FREQ_TSC_REF) { |
169 | /* Calculate max_freq from TSC count */ | |
170 | tsc_diff = tsc_at_measure_end - tsc_at_measure_start; | |
171 | time_diff = timespec_diff_us(time_start, time_end); | |
172 | max_frequency = tsc_diff / time_diff; | |
173 | } | |
7fe2f639 | 174 | |
2dfc818b TR |
175 | *count = max_frequency * ((double)aperf_diff / mperf_diff); |
176 | dprint("%s: Average freq based on %s maximum frequency:\n", | |
177 | mperf_cstates[id].name, | |
178 | (max_freq_mode == MAX_FREQ_TSC_REF) ? "TSC calculated" : "sysfs read"); | |
179 | dprint("%max_frequency: %lu", max_frequency); | |
180 | dprint("aperf_diff: %llu\n", aperf_diff); | |
181 | dprint("mperf_diff: %llu\n", mperf_diff); | |
182 | dprint("avg freq: %llu\n", *count); | |
7fe2f639 DB |
183 | return 0; |
184 | } | |
185 | ||
186 | static int mperf_start(void) | |
187 | { | |
188 | int cpu; | |
189 | unsigned long long dbg; | |
190 | ||
2dfc818b | 191 | clock_gettime(CLOCK_REALTIME, &time_start); |
7fe2f639 DB |
192 | mperf_get_tsc(&tsc_at_measure_start); |
193 | ||
194 | for (cpu = 0; cpu < cpu_count; cpu++) | |
195 | mperf_init_stats(cpu); | |
196 | ||
197 | mperf_get_tsc(&dbg); | |
198 | dprint("TSC diff: %llu\n", dbg - tsc_at_measure_start); | |
199 | return 0; | |
200 | } | |
201 | ||
202 | static int mperf_stop(void) | |
203 | { | |
204 | unsigned long long dbg; | |
205 | int cpu; | |
206 | ||
7fe2f639 DB |
207 | for (cpu = 0; cpu < cpu_count; cpu++) |
208 | mperf_measure_stats(cpu); | |
209 | ||
2dfc818b TR |
210 | mperf_get_tsc(&tsc_at_measure_end); |
211 | clock_gettime(CLOCK_REALTIME, &time_end); | |
212 | ||
7fe2f639 DB |
213 | mperf_get_tsc(&dbg); |
214 | dprint("TSC diff: %llu\n", dbg - tsc_at_measure_end); | |
215 | ||
216 | return 0; | |
217 | } | |
218 | ||
2dfc818b TR |
219 | /* |
220 | * Mperf register is defined to tick at P0 (maximum) frequency | |
221 | * | |
222 | * Instead of reading out P0 which can be tricky to read out from HW, | |
223 | * we use TSC counter if it reliably ticks at P0/mperf frequency. | |
224 | * | |
225 | * Still try to fall back to: | |
226 | * /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq | |
227 | * on older Intel HW without invariant TSC feature. | |
228 | * Or on AMD machines where TSC does not tick at P0 (do not exist yet, but | |
229 | * it's still double checked (MSR_AMD_HWCR)). | |
230 | * | |
231 | * On these machines the user would still get useful mperf | |
232 | * stats when acpi-cpufreq driver is loaded. | |
233 | */ | |
234 | static int init_maxfreq_mode(void) | |
b510b541 | 235 | { |
2dfc818b TR |
236 | int ret; |
237 | unsigned long long hwcr; | |
7fe2f639 DB |
238 | unsigned long min; |
239 | ||
97fa1c5c | 240 | if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_INV_TSC)) |
2dfc818b TR |
241 | goto use_sysfs; |
242 | ||
243 | if (cpupower_cpu_info.vendor == X86_VENDOR_AMD) { | |
244 | /* MSR_AMD_HWCR tells us whether TSC runs at P0/mperf | |
245 | * freq. | |
246 | * A test whether hwcr is accessable/available would be: | |
247 | * (cpupower_cpu_info.family > 0x10 || | |
248 | * cpupower_cpu_info.family == 0x10 && | |
249 | * cpupower_cpu_info.model >= 0x2)) | |
250 | * This should be the case for all aperf/mperf | |
251 | * capable AMD machines and is therefore safe to test here. | |
252 | * Compare with Linus kernel git commit: acf01734b1747b1ec4 | |
253 | */ | |
254 | ret = read_msr(0, MSR_AMD_HWCR, &hwcr); | |
255 | /* | |
256 | * If the MSR read failed, assume a Xen system that did | |
257 | * not explicitly provide access to it and assume TSC works | |
258 | */ | |
259 | if (ret != 0) { | |
260 | dprint("TSC read 0x%x failed - assume TSC working\n", | |
261 | MSR_AMD_HWCR); | |
262 | return 0; | |
263 | } else if (1 & (hwcr >> 24)) { | |
264 | max_freq_mode = MAX_FREQ_TSC_REF; | |
265 | return 0; | |
266 | } else { /* Use sysfs max frequency if available */ } | |
267 | } else if (cpupower_cpu_info.vendor == X86_VENDOR_INTEL) { | |
268 | /* | |
269 | * On Intel we assume mperf (in C0) is ticking at same | |
270 | * rate than TSC | |
271 | */ | |
272 | max_freq_mode = MAX_FREQ_TSC_REF; | |
273 | return 0; | |
274 | } | |
275 | use_sysfs: | |
7fe2f639 DB |
276 | if (cpufreq_get_hardware_limits(0, &min, &max_frequency)) { |
277 | dprint("Cannot retrieve max freq from cpufreq kernel " | |
278 | "subsystem\n"); | |
2dfc818b | 279 | return -1; |
7fe2f639 | 280 | } |
2dfc818b TR |
281 | max_freq_mode = MAX_FREQ_SYSFS; |
282 | return 0; | |
283 | } | |
284 | ||
285 | /* | |
286 | * This monitor provides: | |
287 | * | |
288 | * 1) Average frequency a CPU resided in | |
289 | * This always works if the CPU has aperf/mperf capabilities | |
290 | * | |
291 | * 2) C0 and Cx (any sleep state) time a CPU resided in | |
292 | * Works if mperf timer stops ticking in sleep states which | |
293 | * seem to be the case on all current HW. | |
294 | * Both is directly retrieved from HW registers and is independent | |
295 | * from kernel statistics. | |
296 | */ | |
297 | struct cpuidle_monitor mperf_monitor; | |
298 | struct cpuidle_monitor *mperf_register(void) | |
299 | { | |
300 | if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_APERF)) | |
301 | return NULL; | |
302 | ||
303 | if (init_maxfreq_mode()) | |
304 | return NULL; | |
7fe2f639 DB |
305 | |
306 | /* Free this at program termination */ | |
b510b541 DB |
307 | is_valid = calloc(cpu_count, sizeof(int)); |
308 | mperf_previous_count = calloc(cpu_count, sizeof(unsigned long long)); | |
309 | aperf_previous_count = calloc(cpu_count, sizeof(unsigned long long)); | |
310 | mperf_current_count = calloc(cpu_count, sizeof(unsigned long long)); | |
311 | aperf_current_count = calloc(cpu_count, sizeof(unsigned long long)); | |
312 | ||
7fe2f639 DB |
313 | mperf_monitor.name_len = strlen(mperf_monitor.name); |
314 | return &mperf_monitor; | |
315 | } | |
316 | ||
b510b541 DB |
317 | void mperf_unregister(void) |
318 | { | |
7fe2f639 DB |
319 | free(mperf_previous_count); |
320 | free(aperf_previous_count); | |
321 | free(mperf_current_count); | |
322 | free(aperf_current_count); | |
323 | free(is_valid); | |
324 | } | |
325 | ||
326 | struct cpuidle_monitor mperf_monitor = { | |
327 | .name = "Mperf", | |
328 | .hw_states_num = MPERF_CSTATE_COUNT, | |
329 | .hw_states = mperf_cstates, | |
330 | .start = mperf_start, | |
331 | .stop = mperf_stop, | |
332 | .do_register = mperf_register, | |
333 | .unregister = mperf_unregister, | |
334 | .needs_root = 1, | |
335 | .overflow_s = 922000000 /* 922337203 seconds TSC overflow | |
336 | at 20GHz */ | |
337 | }; | |
338 | #endif /* #if defined(__i386__) || defined(__x86_64__) */ |