2 * kerneltop.c: show top kernel functions - performance counters showcase
6 cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
10 ------------------------------------------------------------------------------
11 KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2)
12 ------------------------------------------------------------------------------
14 weight RIP kernel function
15 ______ ________________ _______________
17 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18 33.00 - ffffffff804cb740 : sock_alloc_send_skb
19 31.26 - ffffffff804ce808 : skb_push
20 22.43 - ffffffff80510004 : tcp_established_options
21 19.00 - ffffffff8027d250 : find_get_page
22 15.76 - ffffffff804e4fc9 : eth_type_trans
23 15.20 - ffffffff804d8baa : dst_release
24 14.86 - ffffffff804cf5d8 : skb_release_head_state
25 14.00 - ffffffff802217d5 : read_hpet
26 12.00 - ffffffff804ffb7f : __ip_local_out
27 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28 8.54 - ffffffff805001a3 : ip_queue_xmit
32 * perfstat: /usr/bin/time -alike performance counter statistics utility
34 It summarizes the counter events of all tasks (and child tasks),
35 covering all CPUs that the command (or workload) executes on.
36 It only counts the per-task events of the workload started,
37 independent of how many other tasks run on those CPUs.
41 $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
43 Performance counter stats for 'ls':
45 163516953 instructions
51 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
53 * Improvements and fixes by:
55 * Arjan van de Ven <arjan@linux.intel.com>
56 * Yanmin Zhang <yanmin.zhang@intel.com>
57 * Wu Fengguang <fengguang.wu@intel.com>
58 * Mike Galbraith <efault@gmx.de>
59 * Paul Mackerras <paulus@samba.org>
61 * Released under the GPL v2. (and only v2, not any later version)
64 #include "util/util.h"
75 #include <sys/syscall.h>
76 #include <sys/ioctl.h>
78 #include <sys/prctl.h>
83 #include <linux/unistd.h>
84 #include <linux/types.h>
86 #include "../../include/linux/perf_counter.h"
/*
 * Per-counter privilege-mask bits, stored in event_mask[] by the event
 * parser and applied to hw_event.exclude_kernel / .exclude_user in
 * create_perfstat_counter().
 */
90 #define EVENT_MASK_KERNEL 1
91 #define EVENT_MASK_USER 2
/*
 * NOTE(review): this extraction is incomplete — original line numbers are
 * fused into the text and several lines (e.g. the array element lists of
 * default_count/hw_event_names/sw_event_names) are missing here.
 */
/* -a: count system-wide per CPU instead of per started task (presumably). */
93 static int system_wide
= 0;
/* Number of counters requested so far; index of the next free slot. */
95 static int nr_counters
= 0;
/*
 * Packed (type,event) config for each counter; defaults cover the common
 * software and hardware events when the user specifies none.
 */
96 static __u64 event_id
[MAX_COUNTERS
] = {
97 EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_TASK_CLOCK
),
98 EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_CONTEXT_SWITCHES
),
99 EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_CPU_MIGRATIONS
),
100 EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_PAGE_FAULTS
),
102 EID(PERF_TYPE_HARDWARE
, PERF_COUNT_CPU_CYCLES
),
103 EID(PERF_TYPE_HARDWARE
, PERF_COUNT_INSTRUCTIONS
),
104 EID(PERF_TYPE_HARDWARE
, PERF_COUNT_CACHE_REFERENCES
),
105 EID(PERF_TYPE_HARDWARE
, PERF_COUNT_CACHE_MISSES
),
/* -c: sample period applied to any counter without an explicit count. */
107 static int default_interval
= 100000;
108 static int event_count
[MAX_COUNTERS
];
/* Open perf-counter file descriptors, one per (cpu, counter) pair. */
109 static int fd
[MAX_NR_CPUS
][MAX_COUNTERS
];
/* EVENT_MASK_KERNEL/EVENT_MASK_USER exclusion bits per counter. */
110 static int event_mask
[MAX_COUNTERS
];
/* -C: restrict profiling to one CPU; -1 means all CPUs. */
113 static int profile_cpu
= -1;
114 static int nr_cpus
= 0;
/* -g: counter-group setting (value semantics not visible here). */
116 static int group
= 0;
117 static unsigned int page_size
;
/* -l: report counts scaled by time-enabled/time-running (on by default). */
121 static int scale
= 1;
123 static const unsigned int default_count
[] = {
/* Symbolic names printed for hardware/software event ids, indexed by id. */
132 static char *hw_event_names
[] = {
142 static char *sw_event_names
[] = {
/*
 * Table mapping user-visible event names (including short aliases such as
 * "cycles", "cs", "faults") to packed EID(type, id) configs. Looked up by
 * prefix match in match_event_symbols().
 * NOTE(review): struct body and the table's closing brace are missing from
 * this extraction.
 */
152 struct event_symbol
{
157 static struct event_symbol event_symbols
[] = {
158 {EID(PERF_TYPE_HARDWARE
, PERF_COUNT_CPU_CYCLES
), "cpu-cycles", },
159 {EID(PERF_TYPE_HARDWARE
, PERF_COUNT_CPU_CYCLES
), "cycles", },
160 {EID(PERF_TYPE_HARDWARE
, PERF_COUNT_INSTRUCTIONS
), "instructions", },
161 {EID(PERF_TYPE_HARDWARE
, PERF_COUNT_CACHE_REFERENCES
), "cache-references", },
162 {EID(PERF_TYPE_HARDWARE
, PERF_COUNT_CACHE_MISSES
), "cache-misses", },
163 {EID(PERF_TYPE_HARDWARE
, PERF_COUNT_BRANCH_INSTRUCTIONS
), "branch-instructions", },
164 {EID(PERF_TYPE_HARDWARE
, PERF_COUNT_BRANCH_INSTRUCTIONS
), "branches", },
165 {EID(PERF_TYPE_HARDWARE
, PERF_COUNT_BRANCH_MISSES
), "branch-misses", },
166 {EID(PERF_TYPE_HARDWARE
, PERF_COUNT_BUS_CYCLES
), "bus-cycles", },
168 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_CPU_CLOCK
), "cpu-clock", },
169 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_TASK_CLOCK
), "task-clock", },
170 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_PAGE_FAULTS
), "page-faults", },
171 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_PAGE_FAULTS
), "faults", },
172 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_PAGE_FAULTS_MIN
), "minor-faults", },
173 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_PAGE_FAULTS_MAJ
), "major-faults", },
174 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_CONTEXT_SWITCHES
), "context-switches", },
175 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_CONTEXT_SWITCHES
), "cs", },
176 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_CPU_MIGRATIONS
), "cpu-migrations", },
177 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_CPU_MIGRATIONS
), "migrations", },
/*
 * Extract a named bit-field from a packed __u64 event config by masking
 * and shifting with PERF_COUNTER_<name>_MASK/_SHIFT (declared in
 * perf_counter.h). RAW flags a raw PMU config; TYPE/EVENT recover the
 * (type, id) pair packed by EID().
 */
180 #define __PERF_COUNTER_FIELD(config, name) \
181 ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
183 #define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW)
184 #define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG)
185 #define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE)
186 #define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT)
/*
 * Print the -e/--event help text: one "type:id: name" line per entry in
 * event_symbols[], plus the raw-event ("rNNN") syntax.
 * NOTE(review): several interior lines (local declarations, surrounding
 * printf calls) are missing from this extraction.
 */
188 static void display_events_help(void)
194 " -e EVENT --event=EVENT # symbolic-name abbreviations");
196 for (i
= 0; i
< ARRAY_SIZE(event_symbols
); i
++) {
/* Decode the packed config back into its (type, id) pair for display. */
199 e
= event_symbols
[i
].event
;
200 type
= PERF_COUNTER_TYPE(e
);
201 id
= PERF_COUNTER_ID(e
);
203 printf("\n %d:%d: %-20s",
204 type
, id
, event_symbols
[i
].symbol
);
208 " rNNN: raw PMU events (eventsel+umask)\n\n");
/*
 * Print overall usage for perfstat, delegating the event list to
 * display_events_help(). NOTE(review): interior lines (the printf calls
 * these format strings belong to, and the exit path, if any) are missing.
 */
211 static void display_help(void)
214 "Usage: perfstat [<events...>] <cmd...>\n\n"
215 "PerfStat Options (up to %d event types can be specified):\n\n",
218 display_events_help();
221 " -l # scale counter values\n"
222 " -a # system-wide collection\n");
/*
 * Return a human-readable name for counter slot 'ctr': a "raw 0x..."
 * string for raw PMU configs, otherwise a lookup into hw_event_names[] /
 * sw_event_names[] by decoded event id, with "unknown-*" fallbacks.
 * NOTE(review): the declaration of 'buf' (presumably a static buffer —
 * returning it would not be thread-safe; confirm), the switch head and
 * the default case are missing from this extraction.
 */
226 static char *event_name(int ctr
)
228 __u64 config
= event_id
[ctr
];
229 int type
= PERF_COUNTER_TYPE(config
);
230 int id
= PERF_COUNTER_ID(config
);
233 if (PERF_COUNTER_RAW(config
)) {
234 sprintf(buf
, "raw 0x%llx", PERF_COUNTER_CONFIG(config
));
239 case PERF_TYPE_HARDWARE
:
240 if (id
< PERF_HW_EVENTS_MAX
)
241 return hw_event_names
[id
];
242 return "unknown-hardware";
244 case PERF_TYPE_SOFTWARE
:
245 if (id
< PERF_SW_EVENTS_MAX
)
246 return sw_event_names
[id
];
247 return "unknown-software";
/*
 * Parse one event spec into a packed __u64 config. Three accepted forms:
 *  - "rNNN"        : raw PMU config, returned with PERF_COUNTER_RAW_MASK set;
 *  - "type:id[:uk]": numeric type/id with an optional privilege-mask suffix
 *                    recorded into event_mask[nr_counters];
 *  - symbolic name : prefix-matched against event_symbols[].
 * NOTE(review): local declarations, the switch case labels and the
 * no-match fallback return are missing from this extraction. Note the
 * visible mask logic sets EVENT_MASK_USER for 'u' and EVENT_MASK_KERNEL
 * for 'k', which create_perfstat_counter() turns into exclude_* bits —
 * i.e. the suffix appears to name the level to EXCLUDE; confirm intent.
 */
257 * Each event can have multiple symbolic names.
258 * Symbolic names are (almost) exactly matched.
260 static __u64
match_event_symbols(char *str
)
267 if (sscanf(str
, "r%llx", &config
) == 1)
268 return config
| PERF_COUNTER_RAW_MASK
;
270 switch (sscanf(str
, "%d:%llu:%2s", &type
, &id
, mask_str
)) {
272 if (strchr(mask_str
, 'u'))
273 event_mask
[nr_counters
] |= EVENT_MASK_USER
;
274 if (strchr(mask_str
, 'k'))
275 event_mask
[nr_counters
] |= EVENT_MASK_KERNEL
;
277 return EID(type
, id
);
/* Fall back to symbolic names; strncmp() makes this a prefix match. */
283 for (i
= 0; i
< ARRAY_SIZE(event_symbols
); i
++) {
284 if (!strncmp(str
, event_symbols
[i
].symbol
,
285 strlen(event_symbols
[i
].symbol
)))
286 return event_symbols
[i
].event
;
/*
 * -e handler: parse a (possibly comma-separated — see the strstr(",")
 * below) event string, appending each resolved config to event_id[] up to
 * MAX_COUNTERS. Returns an int status consumed as 'error' by
 * process_options(). NOTE(review): the error returns, nr_counters
 * increment and the loop structure around the comma scan are missing from
 * this extraction.
 */
292 static int parse_events(char *str
)
297 if (nr_counters
== MAX_COUNTERS
)
300 config
= match_event_symbols(str
)
;
304 event_id
[nr_counters
] = config
;
307 str
= strstr(str
, ",");
/*
 * 1 MB global buffer — presumably touched at startup to trigger page
 * faults deliberately (e.g. so fault counters show activity); the code
 * that touches it is not visible in this extraction — confirm.
 */
321 char fault_here
[1000000];
/*
 * Open the perf counter for slot 'counter'. Builds a zeroed
 * perf_counter_hw_event with the slot's config, no sampling
 * (record_type = 0), privilege exclusions from event_mask[], and
 * time-enabled/time-running in the read format (for scaling in
 * do_perfstat()). Two open strategies are visible: one fd per CPU
 * (pid = -1, cpu = N), or a single inherited, initially-disabled counter
 * attached to the current task (pid = 0, cpu = -1) — the branch condition
 * selecting between them (presumably system_wide) is missing from this
 * extraction, as are the exit paths after the error printfs.
 */
323 static void create_perfstat_counter(int counter
)
325 struct perf_counter_hw_event hw_event
;
327 memset(&hw_event
, 0, sizeof(hw_event
));
328 hw_event
.config
= event_id
[counter
];
/* Counting-only: no sampling records requested. */
329 hw_event
.record_type
= 0;
331 hw_event
.exclude_kernel
= event_mask
[counter
] & EVENT_MASK_KERNEL
;
332 hw_event
.exclude_user
= event_mask
[counter
] & EVENT_MASK_USER
;
/* Ask the kernel for enabled/running times so counts can be scaled. */
335 hw_event
.read_format
= PERF_FORMAT_TOTAL_TIME_ENABLED
|
336 PERF_FORMAT_TOTAL_TIME_RUNNING
;
/* System-wide path (presumably): one counter per online CPU. */
340 for (cpu
= 0; cpu
< nr_cpus
; cpu
++) {
341 fd
[cpu
][counter
] = sys_perf_counter_open(&hw_event
, -1, cpu
, -1, 0);
342 if (fd
[cpu
][counter
] < 0) {
343 printf("perfstat error: syscall returned with %d (%s)\n",
344 fd
[cpu
][counter
], strerror(errno
));
/*
 * Per-task path: inherit into children so the whole workload is counted;
 * start disabled and enable via prctl() just before exec (see
 * do_perfstat()).
 */
349 hw_event
.inherit
= 1;
350 hw_event
.disabled
= 1;
352 fd
[0][counter
] = sys_perf_counter_open(&hw_event
, 0, -1, -1, 0);
353 if (fd
[0][counter
] < 0) {
354 printf("perfstat error: syscall returned with %d (%s)\n",
355 fd
[0][counter
], strerror(errno
));
/*
 * Run the workload and report counter totals:
 *  1. open all requested counters;
 *  2. enable counting via prctl(), fork+exec the command (argv);
 *  3. wait for it, disable counting;
 *  4. for each counter, sum per-CPU reads of {count, time_enabled,
 *     time_running}, scale the count if the counter was multiplexed,
 *     and print either msecs (clock events) or raw event counts.
 * NOTE(review): this extraction is missing several lines — the t0/t1
 * timestamp reads, the child/parent fork branches, the 'nv' setup
 * (presumably 3 when 'scale' is set, 1 otherwise — confirm), and the
 * branch heads around the "<not counted>" and scaling cases.
 */
361 int do_perfstat(int argc
, char *argv
[])
363 unsigned long long t0
, t1
;
372 for (counter
= 0; counter
< nr_counters
; counter
++)
373 create_perfstat_counter(counter
);
382 * Enable counters and exec the command:
385 prctl(PR_TASK_PERF_COUNTERS_ENABLE
);
387 if ((pid
= fork()) < 0)
388 perror("failed to fork");
390 if (execvp(argv
[0], argv
)) {
/* Reap the workload (and any children) before reading the counters. */
395 while (wait(&status
) >= 0)
397 prctl(PR_TASK_PERF_COUNTERS_DISABLE
);
402 fprintf(stderr
, "\n");
403 fprintf(stderr
, " Performance counter stats for \'%s\':\n",
405 fprintf(stderr
, "\n");
407 for (counter
= 0; counter
< nr_counters
; counter
++) {
/* [0]=count, [1]=time_enabled, [2]=time_running (per read_format). */
409 __u64 count
[3], single_count
[3];
412 count
[0] = count
[1] = count
[2] = 0;
414 for (cpu
= 0; cpu
= 0; cpu
< nr_cpus
; cpu
++) {
415 res
= read(fd
[cpu
][counter
],
416 single_count
, nv
* sizeof(__u64
));
417 assert(res
== nv
* sizeof(__u64
));
419 count
[0] += single_count
[0];
421 count
[1] += single_count
[1];
422 count
[2] += single_count
[2];
429 fprintf(stderr
, " %14s %-20s\n",
430 "<not counted>", event_name(counter
));
/* time_running < time_enabled: counter was multiplexed — extrapolate. */
433 if (count
[2] < count
[1]) {
435 count
[0] = (unsigned long long)
436 ((double)count
[0] * count
[1] / count
[2] + 0.5);
/* Clock events are reported in milliseconds (counts are nanoseconds). */
440 if (event_id
[counter
] == EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_CPU_CLOCK
) ||
441 event_id
[counter
] == EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_TASK_CLOCK
)) {
443 double msecs
= (double)count
[0] / 1000000;
445 fprintf(stderr
, " %14.6f %-20s (msecs)",
446 msecs
, event_name(counter
));
448 fprintf(stderr
, " %14Ld %-20s (events)",
449 count
[0], event_name(counter
));
452 fprintf(stderr
, " (scaled from %.2f%%)",
453 (double) count
[2] / count
[1] * 100);
454 fprintf(stderr
, "\n");
456 fprintf(stderr
, "\n");
457 fprintf(stderr
, " Wall-clock time elapsed: %12.6f msecs\n",
458 (double)(t1
-t0
)/1e6
);
459 fprintf(stderr
, "\n");
/*
 * Parse command-line options with getopt_long() into the file-scope
 * settings, then give every counter without an explicit count the
 * default_interval. The "+:" prefix in the optstring stops parsing at
 * the first non-option so everything after it is the workload command.
 * NOTE(review): the option loop head, several cases ('d', 'D', 'f',
 * 'm', 'r', 's', 'S', 'x', 'M', 'U'), the error-exit path, and the
 * declarations of nmi/tid/zero are missing from this extraction. The
 * long_options table also lists entries (dump_symtab, vmlinux, symbol,
 * realtime, ...) inherited from kerneltop that no visible case handles.
 */
464 static void process_options(int argc
, char **argv
)
466 int error
= 0, counter
;
469 int option_index
= 0;
470 /** Options for getopt */
471 static struct option long_options
[] = {
472 {"count", required_argument
, NULL
, 'c'},
473 {"cpu", required_argument
, NULL
, 'C'},
474 {"delay", required_argument
, NULL
, 'd'},
475 {"dump_symtab", no_argument
, NULL
, 'D'},
476 {"event", required_argument
, NULL
, 'e'},
477 {"filter", required_argument
, NULL
, 'f'},
478 {"group", required_argument
, NULL
, 'g'},
479 {"help", no_argument
, NULL
, 'h'},
480 {"nmi", required_argument
, NULL
, 'n'},
481 {"munmap_info", no_argument
, NULL
, 'U'},
482 {"pid", required_argument
, NULL
, 'p'},
483 {"realtime", required_argument
, NULL
, 'r'},
484 {"scale", no_argument
, NULL
, 'l'},
485 {"symbol", required_argument
, NULL
, 's'},
486 {"stat", no_argument
, NULL
, 'S'},
487 {"vmlinux", required_argument
, NULL
, 'x'},
488 {"zero", no_argument
, NULL
, 'z'},
491 int c
= getopt_long(argc
, argv
, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
492 long_options
, &option_index
);
497 case 'a': system_wide
= 1; break;
498 case 'c': default_interval
= atoi(optarg
); break;
500 /* CPU and PID are mutually exclusive */
502 printf("WARNING: CPU switch overriding PID\n");
506 profile_cpu
= atoi(optarg
); break;
508 case 'e': error
= parse_events(optarg
); break;
510 case 'g': group
= atoi(optarg
); break;
511 case 'h': display_help(); break;
512 case 'l': scale
= 1; break;
513 case 'n': nmi
= atoi(optarg
); break;
515 /* CPU and PID are mutually exclusive */
516 if (profile_cpu
!= -1) {
517 printf("WARNING: PID switch overriding CPU\n");
521 tid
= atoi(optarg
); break;
522 case 'z': zero
= 1; break;
523 default: error
= 1; break;
/* Backfill: any counter without an explicit count gets the default. */
533 for (counter
= 0; counter
< nr_counters
; counter
++) {
534 if (event_count
[counter
])
537 event_count
[counter
] = default_interval
;
/*
 * Entry point for 'perf stat' (the 'prefix' argument is unused in the
 * visible code): cache the page size, parse options, discover the online
 * CPU count (bounded by MAX_NR_CPUS), and hand off to do_perfstat().
 * NOTE(review): braces and blank lines are missing from this extraction.
 */
541 int cmd_stat(int argc
, char **argv
, const char *prefix
)
543 page_size
= sysconf(_SC_PAGE_SIZE
);
545 process_options(argc
, argv
);
547 nr_cpus
= sysconf(_SC_NPROCESSORS_ONLN
);
548 assert(nr_cpus
<= MAX_NR_CPUS
);
549 assert(nr_cpus
>= 0);
551 return do_perfstat(argc
, argv
);