]>
Commit | Line | Data |
---|---|---|
7db79172 MF |
1 | /* |
2 | * Blackfin performance counters | |
3 | * | |
4 | * Copyright 2011 Analog Devices Inc. | |
5 | * | |
6 | * Ripped from SuperH version: | |
7 | * | |
8 | * Copyright (C) 2009 Paul Mundt | |
9 | * | |
10 | * Heavily based on the x86 and PowerPC implementations. | |
11 | * | |
12 | * x86: | |
13 | * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> | |
14 | * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar | |
15 | * Copyright (C) 2009 Jaswinder Singh Rajput | |
16 | * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter | |
90eec103 | 17 | * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra |
7db79172 MF |
18 | * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> |
19 | * | |
20 | * ppc: | |
21 | * Copyright 2008-2009 Paul Mackerras, IBM Corporation. | |
22 | * | |
23 | * Licensed under the GPL-2 or later. | |
24 | */ | |
25 | ||
26 | #include <linux/kernel.h> | |
8dc7a9c8 | 27 | #include <linux/export.h> |
7db79172 MF |
28 | #include <linux/init.h> |
29 | #include <linux/perf_event.h> | |
30 | #include <asm/bfin_pfmon.h> | |
31 | ||
32 | /* | |
33 | * We have two counters, and each counter can support an event type. | |
34 | * The 'o' is PFCNTx=1 and 's' is PFCNTx=0 | |
35 | * | |
36 | * 0x04 o pc invariant branches | |
37 | * 0x06 o mispredicted branches | |
38 | * 0x09 o predicted branches taken | |
39 | * 0x0B o EXCPT insn | |
40 | * 0x0C o CSYNC/SSYNC insn | |
41 | * 0x0D o Insns committed | |
42 | * 0x0E o Interrupts taken | |
43 | * 0x0F o Misaligned address exceptions | |
44 | * 0x80 o Code memory fetches stalled due to DMA | |
45 | * 0x83 o 64bit insn fetches delivered | |
46 | * 0x9A o data cache fills (bank a) | |
47 | * 0x9B o data cache fills (bank b) | |
48 | * 0x9C o data cache lines evicted (bank a) | |
49 | * 0x9D o data cache lines evicted (bank b) | |
50 | * 0x9E o data cache high priority fills | |
51 | * 0x9F o data cache low priority fills | |
52 | * 0x00 s loop 0 iterations | |
53 | * 0x01 s loop 1 iterations | |
54 | * 0x0A s CSYNC/SSYNC stalls | |
55 | * 0x10 s DAG read/after write hazards | |
56 | * 0x13 s RAW data hazards | |
57 | * 0x81 s code TAG stalls | |
58 | * 0x82 s code fill stalls | |
59 | * 0x90 s processor to memory stalls | |
60 | * 0x91 s data memory stalls not hidden by 0x90 | |
61 | * 0x92 s data store buffer full stalls | |
62 | * 0x93 s data memory write buffer full stalls due to high->low priority | |
63 | * 0x95 s data memory fill buffer stalls | |
64 | * 0x96 s data TAG collision stalls | |
65 | * 0x97 s data collision stalls | |
66 | * 0x98 s data stalls | |
67 | * 0x99 s data stalls sent to processor | |
68 | */ | |
69 | ||
70 | static const int event_map[] = { | |
71 | /* use CYCLES cpu register */ | |
72 | [PERF_COUNT_HW_CPU_CYCLES] = -1, | |
73 | [PERF_COUNT_HW_INSTRUCTIONS] = 0x0D, | |
74 | [PERF_COUNT_HW_CACHE_REFERENCES] = -1, | |
75 | [PERF_COUNT_HW_CACHE_MISSES] = 0x83, | |
76 | [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x09, | |
77 | [PERF_COUNT_HW_BRANCH_MISSES] = 0x06, | |
78 | [PERF_COUNT_HW_BUS_CYCLES] = -1, | |
79 | }; | |
80 | ||
81 | #define C(x) PERF_COUNT_HW_CACHE_##x | |
82 | ||
83 | static const int cache_events[PERF_COUNT_HW_CACHE_MAX] | |
84 | [PERF_COUNT_HW_CACHE_OP_MAX] | |
85 | [PERF_COUNT_HW_CACHE_RESULT_MAX] = | |
86 | { | |
87 | [C(L1D)] = { /* Data bank A */ | |
88 | [C(OP_READ)] = { | |
89 | [C(RESULT_ACCESS)] = 0, | |
90 | [C(RESULT_MISS) ] = 0x9A, | |
91 | }, | |
92 | [C(OP_WRITE)] = { | |
93 | [C(RESULT_ACCESS)] = 0, | |
94 | [C(RESULT_MISS) ] = 0, | |
95 | }, | |
96 | [C(OP_PREFETCH)] = { | |
97 | [C(RESULT_ACCESS)] = 0, | |
98 | [C(RESULT_MISS) ] = 0, | |
99 | }, | |
100 | }, | |
101 | ||
102 | [C(L1I)] = { | |
103 | [C(OP_READ)] = { | |
104 | [C(RESULT_ACCESS)] = 0, | |
105 | [C(RESULT_MISS) ] = 0x83, | |
106 | }, | |
107 | [C(OP_WRITE)] = { | |
108 | [C(RESULT_ACCESS)] = -1, | |
109 | [C(RESULT_MISS) ] = -1, | |
110 | }, | |
111 | [C(OP_PREFETCH)] = { | |
112 | [C(RESULT_ACCESS)] = 0, | |
113 | [C(RESULT_MISS) ] = 0, | |
114 | }, | |
115 | }, | |
116 | ||
117 | [C(LL)] = { | |
118 | [C(OP_READ)] = { | |
119 | [C(RESULT_ACCESS)] = -1, | |
120 | [C(RESULT_MISS) ] = -1, | |
121 | }, | |
122 | [C(OP_WRITE)] = { | |
123 | [C(RESULT_ACCESS)] = -1, | |
124 | [C(RESULT_MISS) ] = -1, | |
125 | }, | |
126 | [C(OP_PREFETCH)] = { | |
127 | [C(RESULT_ACCESS)] = -1, | |
128 | [C(RESULT_MISS) ] = -1, | |
129 | }, | |
130 | }, | |
131 | ||
132 | [C(DTLB)] = { | |
133 | [C(OP_READ)] = { | |
134 | [C(RESULT_ACCESS)] = -1, | |
135 | [C(RESULT_MISS) ] = -1, | |
136 | }, | |
137 | [C(OP_WRITE)] = { | |
138 | [C(RESULT_ACCESS)] = -1, | |
139 | [C(RESULT_MISS) ] = -1, | |
140 | }, | |
141 | [C(OP_PREFETCH)] = { | |
142 | [C(RESULT_ACCESS)] = -1, | |
143 | [C(RESULT_MISS) ] = -1, | |
144 | }, | |
145 | }, | |
146 | ||
147 | [C(ITLB)] = { | |
148 | [C(OP_READ)] = { | |
149 | [C(RESULT_ACCESS)] = -1, | |
150 | [C(RESULT_MISS) ] = -1, | |
151 | }, | |
152 | [C(OP_WRITE)] = { | |
153 | [C(RESULT_ACCESS)] = -1, | |
154 | [C(RESULT_MISS) ] = -1, | |
155 | }, | |
156 | [C(OP_PREFETCH)] = { | |
157 | [C(RESULT_ACCESS)] = -1, | |
158 | [C(RESULT_MISS) ] = -1, | |
159 | }, | |
160 | }, | |
161 | ||
162 | [C(BPU)] = { | |
163 | [C(OP_READ)] = { | |
164 | [C(RESULT_ACCESS)] = -1, | |
165 | [C(RESULT_MISS) ] = -1, | |
166 | }, | |
167 | [C(OP_WRITE)] = { | |
168 | [C(RESULT_ACCESS)] = -1, | |
169 | [C(RESULT_MISS) ] = -1, | |
170 | }, | |
171 | [C(OP_PREFETCH)] = { | |
172 | [C(RESULT_ACCESS)] = -1, | |
173 | [C(RESULT_MISS) ] = -1, | |
174 | }, | |
175 | }, | |
176 | }; | |
177 | ||
/* Report the name of this PMU implementation: always "bfin". */
const char *perf_pmu_name(void)
{
	static const char bfin_pmu_name[] = "bfin";

	return bfin_pmu_name;
}
EXPORT_SYMBOL(perf_pmu_name);
183 | ||
184 | int perf_num_counters(void) | |
185 | { | |
186 | return ARRAY_SIZE(event_map); | |
187 | } | |
188 | EXPORT_SYMBOL(perf_num_counters); | |
189 | ||
190 | static u64 bfin_pfmon_read(int idx) | |
191 | { | |
192 | return bfin_read32(PFCNTR0 + (idx * 4)); | |
193 | } | |
194 | ||
195 | static void bfin_pfmon_disable(struct hw_perf_event *hwc, int idx) | |
196 | { | |
197 | bfin_write_PFCTL(bfin_read_PFCTL() & ~PFCEN(idx, PFCEN_MASK)); | |
198 | } | |
199 | ||
200 | static void bfin_pfmon_enable(struct hw_perf_event *hwc, int idx) | |
201 | { | |
202 | u32 val, mask; | |
203 | ||
204 | val = PFPWR; | |
205 | if (idx) { | |
206 | mask = ~(PFCNT1 | PFMON1 | PFCEN1 | PEMUSW1); | |
207 | /* The packed config is for event0, so shift it to event1 slots */ | |
208 | val |= (hwc->config << (PFMON1_P - PFMON0_P)); | |
209 | val |= (hwc->config & PFCNT0) << (PFCNT1_P - PFCNT0_P); | |
210 | bfin_write_PFCNTR1(0); | |
211 | } else { | |
212 | mask = ~(PFCNT0 | PFMON0 | PFCEN0 | PEMUSW0); | |
213 | val |= hwc->config; | |
214 | bfin_write_PFCNTR0(0); | |
215 | } | |
216 | ||
217 | bfin_write_PFCTL((bfin_read_PFCTL() & mask) | val); | |
218 | } | |
219 | ||
220 | static void bfin_pfmon_disable_all(void) | |
221 | { | |
222 | bfin_write_PFCTL(bfin_read_PFCTL() & ~PFPWR); | |
223 | } | |
224 | ||
225 | static void bfin_pfmon_enable_all(void) | |
226 | { | |
227 | bfin_write_PFCTL(bfin_read_PFCTL() | PFPWR); | |
228 | } | |
229 | ||
230 | struct cpu_hw_events { | |
231 | struct perf_event *events[MAX_HWEVENTS]; | |
232 | unsigned long used_mask[BITS_TO_LONGS(MAX_HWEVENTS)]; | |
233 | }; | |
234 | DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events); | |
235 | ||
236 | static int hw_perf_cache_event(int config, int *evp) | |
237 | { | |
238 | unsigned long type, op, result; | |
239 | int ev; | |
240 | ||
241 | /* unpack config */ | |
242 | type = config & 0xff; | |
243 | op = (config >> 8) & 0xff; | |
244 | result = (config >> 16) & 0xff; | |
245 | ||
246 | if (type >= PERF_COUNT_HW_CACHE_MAX || | |
247 | op >= PERF_COUNT_HW_CACHE_OP_MAX || | |
248 | result >= PERF_COUNT_HW_CACHE_RESULT_MAX) | |
249 | return -EINVAL; | |
250 | ||
251 | ev = cache_events[type][op][result]; | |
252 | if (ev == 0) | |
253 | return -EOPNOTSUPP; | |
254 | if (ev == -1) | |
255 | return -EINVAL; | |
256 | *evp = ev; | |
257 | return 0; | |
258 | } | |
259 | ||
260 | static void bfin_perf_event_update(struct perf_event *event, | |
261 | struct hw_perf_event *hwc, int idx) | |
262 | { | |
263 | u64 prev_raw_count, new_raw_count; | |
264 | s64 delta; | |
265 | int shift = 0; | |
266 | ||
267 | /* | |
268 | * Depending on the counter configuration, they may or may not | |
269 | * be chained, in which case the previous counter value can be | |
270 | * updated underneath us if the lower-half overflows. | |
271 | * | |
272 | * Our tactic to handle this is to first atomically read and | |
273 | * exchange a new raw count - then add that new-prev delta | |
274 | * count to the generic counter atomically. | |
275 | * | |
276 | * As there is no interrupt associated with the overflow events, | |
277 | * this is the simplest approach for maintaining consistency. | |
278 | */ | |
279 | again: | |
280 | prev_raw_count = local64_read(&hwc->prev_count); | |
281 | new_raw_count = bfin_pfmon_read(idx); | |
282 | ||
283 | if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, | |
284 | new_raw_count) != prev_raw_count) | |
285 | goto again; | |
286 | ||
287 | /* | |
288 | * Now we have the new raw value and have updated the prev | |
289 | * timestamp already. We can now calculate the elapsed delta | |
290 | * (counter-)time and add that to the generic counter. | |
291 | * | |
292 | * Careful, not all hw sign-extends above the physical width | |
293 | * of the count. | |
294 | */ | |
295 | delta = (new_raw_count << shift) - (prev_raw_count << shift); | |
296 | delta >>= shift; | |
297 | ||
298 | local64_add(delta, &event->count); | |
299 | } | |
300 | ||
301 | static void bfin_pmu_stop(struct perf_event *event, int flags) | |
302 | { | |
7e788ab1 | 303 | struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); |
7db79172 MF |
304 | struct hw_perf_event *hwc = &event->hw; |
305 | int idx = hwc->idx; | |
306 | ||
307 | if (!(event->hw.state & PERF_HES_STOPPED)) { | |
308 | bfin_pfmon_disable(hwc, idx); | |
309 | cpuc->events[idx] = NULL; | |
310 | event->hw.state |= PERF_HES_STOPPED; | |
311 | } | |
312 | ||
313 | if ((flags & PERF_EF_UPDATE) && !(event->hw.state & PERF_HES_UPTODATE)) { | |
314 | bfin_perf_event_update(event, &event->hw, idx); | |
315 | event->hw.state |= PERF_HES_UPTODATE; | |
316 | } | |
317 | } | |
318 | ||
319 | static void bfin_pmu_start(struct perf_event *event, int flags) | |
320 | { | |
7e788ab1 | 321 | struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); |
7db79172 MF |
322 | struct hw_perf_event *hwc = &event->hw; |
323 | int idx = hwc->idx; | |
324 | ||
325 | if (WARN_ON_ONCE(idx == -1)) | |
326 | return; | |
327 | ||
328 | if (flags & PERF_EF_RELOAD) | |
329 | WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); | |
330 | ||
331 | cpuc->events[idx] = event; | |
332 | event->hw.state = 0; | |
333 | bfin_pfmon_enable(hwc, idx); | |
334 | } | |
335 | ||
336 | static void bfin_pmu_del(struct perf_event *event, int flags) | |
337 | { | |
7e788ab1 | 338 | struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); |
7db79172 MF |
339 | |
340 | bfin_pmu_stop(event, PERF_EF_UPDATE); | |
341 | __clear_bit(event->hw.idx, cpuc->used_mask); | |
342 | ||
343 | perf_event_update_userpage(event); | |
344 | } | |
345 | ||
346 | static int bfin_pmu_add(struct perf_event *event, int flags) | |
347 | { | |
7e788ab1 | 348 | struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); |
7db79172 MF |
349 | struct hw_perf_event *hwc = &event->hw; |
350 | int idx = hwc->idx; | |
351 | int ret = -EAGAIN; | |
352 | ||
353 | perf_pmu_disable(event->pmu); | |
354 | ||
355 | if (__test_and_set_bit(idx, cpuc->used_mask)) { | |
356 | idx = find_first_zero_bit(cpuc->used_mask, MAX_HWEVENTS); | |
357 | if (idx == MAX_HWEVENTS) | |
358 | goto out; | |
359 | ||
360 | __set_bit(idx, cpuc->used_mask); | |
361 | hwc->idx = idx; | |
362 | } | |
363 | ||
364 | bfin_pfmon_disable(hwc, idx); | |
365 | ||
366 | event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; | |
367 | if (flags & PERF_EF_START) | |
368 | bfin_pmu_start(event, PERF_EF_RELOAD); | |
369 | ||
370 | perf_event_update_userpage(event); | |
371 | ret = 0; | |
372 | out: | |
373 | perf_pmu_enable(event->pmu); | |
374 | return ret; | |
375 | } | |
376 | ||
377 | static void bfin_pmu_read(struct perf_event *event) | |
378 | { | |
379 | bfin_perf_event_update(event, &event->hw, event->hw.idx); | |
380 | } | |
381 | ||
382 | static int bfin_pmu_event_init(struct perf_event *event) | |
383 | { | |
384 | struct perf_event_attr *attr = &event->attr; | |
385 | struct hw_perf_event *hwc = &event->hw; | |
386 | int config = -1; | |
387 | int ret; | |
388 | ||
389 | if (attr->exclude_hv || attr->exclude_idle) | |
390 | return -EPERM; | |
391 | ||
7db79172 MF |
392 | ret = 0; |
393 | switch (attr->type) { | |
394 | case PERF_TYPE_RAW: | |
395 | config = PFMON(0, attr->config & PFMON_MASK) | | |
396 | PFCNT(0, !(attr->config & 0x100)); | |
397 | break; | |
398 | case PERF_TYPE_HW_CACHE: | |
399 | ret = hw_perf_cache_event(attr->config, &config); | |
400 | break; | |
401 | case PERF_TYPE_HARDWARE: | |
402 | if (attr->config >= ARRAY_SIZE(event_map)) | |
403 | return -EINVAL; | |
404 | ||
405 | config = event_map[attr->config]; | |
406 | break; | |
407 | } | |
408 | ||
409 | if (config == -1) | |
410 | return -EINVAL; | |
411 | ||
412 | if (!attr->exclude_kernel) | |
413 | config |= PFCEN(0, PFCEN_ENABLE_SUPV); | |
414 | if (!attr->exclude_user) | |
415 | config |= PFCEN(0, PFCEN_ENABLE_USER); | |
416 | ||
417 | hwc->config |= config; | |
418 | ||
419 | return ret; | |
420 | } | |
421 | ||
422 | static void bfin_pmu_enable(struct pmu *pmu) | |
423 | { | |
7e788ab1 | 424 | struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); |
7db79172 MF |
425 | struct perf_event *event; |
426 | struct hw_perf_event *hwc; | |
427 | int i; | |
428 | ||
429 | for (i = 0; i < MAX_HWEVENTS; ++i) { | |
430 | event = cpuc->events[i]; | |
431 | if (!event) | |
432 | continue; | |
433 | hwc = &event->hw; | |
434 | bfin_pfmon_enable(hwc, hwc->idx); | |
435 | } | |
436 | ||
437 | bfin_pfmon_enable_all(); | |
438 | } | |
439 | ||
/*
 * pmu::pmu_disable callback: clear the PFPWR bit via
 * bfin_pfmon_disable_all().  @pmu is not used.
 */
static void bfin_pmu_disable(struct pmu *pmu)
{
	bfin_pfmon_disable_all();
}
444 | ||
445 | static struct pmu pmu = { | |
446 | .pmu_enable = bfin_pmu_enable, | |
447 | .pmu_disable = bfin_pmu_disable, | |
448 | .event_init = bfin_pmu_event_init, | |
449 | .add = bfin_pmu_add, | |
450 | .del = bfin_pmu_del, | |
451 | .start = bfin_pmu_start, | |
452 | .stop = bfin_pmu_stop, | |
453 | .read = bfin_pmu_read, | |
454 | }; | |
455 | ||
456 | static void bfin_pmu_setup(int cpu) | |
457 | { | |
458 | struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu); | |
459 | ||
460 | memset(cpuhw, 0, sizeof(struct cpu_hw_events)); | |
461 | } | |
462 | ||
13dff62d | 463 | static int |
7db79172 MF |
464 | bfin_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) |
465 | { | |
466 | unsigned int cpu = (long)hcpu; | |
467 | ||
468 | switch (action & ~CPU_TASKS_FROZEN) { | |
469 | case CPU_UP_PREPARE: | |
470 | bfin_write_PFCTL(0); | |
471 | bfin_pmu_setup(cpu); | |
472 | break; | |
473 | ||
474 | default: | |
475 | break; | |
476 | } | |
477 | ||
478 | return NOTIFY_OK; | |
479 | } | |
480 | ||
481 | static int __init bfin_pmu_init(void) | |
482 | { | |
483 | int ret; | |
484 | ||
6e316f9c VW |
485 | /* |
486 | * All of the on-chip counters are "limited", in that they have | |
487 | * no interrupts, and are therefore unable to do sampling without | |
488 | * further work and timer assistance. | |
489 | */ | |
490 | pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; | |
491 | ||
7db79172 MF |
492 | ret = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); |
493 | if (!ret) | |
494 | perf_cpu_notifier(bfin_pmu_notifier); | |
495 | ||
496 | return ret; | |
497 | } | |
498 | early_initcall(bfin_pmu_init); |