]> git.proxmox.com Git - mirror_ubuntu-hirsute-kernel.git/blob - tools/perf/builtin-trace.c
tools: Introduce str_error_r()
[mirror_ubuntu-hirsute-kernel.git] / tools / perf / builtin-trace.c
1 /*
2 * builtin-trace.c
3 *
4 * Builtin 'trace' command:
5 *
6 * Display a continuously updated trace of any workload, CPU, specific PID,
7 * system wide, etc. Default format is loosely strace like, but any other
8 * event may be specified using --event.
9 *
10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11 *
12 * Initially based on the 'trace' prototype by Thomas Gleixner:
13 *
14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15 *
16 * Released under the GPL v2. (and only v2, not any later version)
17 */
18
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include <subcmd/exec-cmd.h>
26 #include "util/machine.h"
27 #include "util/session.h"
28 #include "util/thread.h"
29 #include <subcmd/parse-options.h>
30 #include "util/strlist.h"
31 #include "util/intlist.h"
32 #include "util/thread_map.h"
33 #include "util/stat.h"
34 #include "trace-event.h"
35 #include "util/parse-events.h"
36 #include "util/bpf-loader.h"
37 #include "callchain.h"
38 #include "syscalltbl.h"
39 #include "rb_resort.h"
40
41 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
42 #include <stdlib.h>
43 #include <linux/err.h>
44 #include <linux/filter.h>
45 #include <linux/audit.h>
46 #include <sys/ptrace.h>
47 #include <linux/random.h>
48 #include <linux/stringify.h>
49
50 #ifndef O_CLOEXEC
51 # define O_CLOEXEC 02000000
52 #endif
53
/*
 * Global per-session state for 'perf trace': the tracepoint event tables,
 * target/record configuration, output stream and the filtering/formatting
 * knobs parsed from the command line.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;		/* syscall id <-> name table */
	struct {
		int		max;		/* highest syscall id in 'table' */
		struct syscall  *table;
		struct {
			/* the raw_syscalls (or syscalls) enter/exit tracepoints */
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;	/* NOTE(review): presumably the last thread seen entering a syscall — confirm */
	u64			base_time;	/* first timestamp; subtracted to print relative tstamps */
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* syscall names from -e */
	struct {
		size_t		nr;
		int		*entries;	/* ev_qualifier resolved to syscall ids */
	}			ev_qualifier_ids;
	struct intlist		*tid_list;
	struct intlist		*pid_list;
	struct {
		size_t		nr;
		pid_t		*entries;	/* NOTE(review): presumably pids whose samples get discarded — confirm */
	}			filter_pids;
	double			duration_filter;	/* in ms, see trace__filter_duration() */
	double			runtime_ms;
	struct {
		u64		vfs_getname,	/* filenames resolved via the vfs_getname probe */
				proc_getname;	/* filenames resolved by readlinking /proc */
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* -e list was negated */
	bool			live;		/* tracing live vs processing a perf.data file */
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;	/* a vfs_getname probe is armed */
	int			trace_pgfaults;	/* TRACE_PFMAJ | TRACE_PFMIN bits */
	int			open_id;	/* NOTE(review): presumably the syscall id of open(2) — confirm at use site */
};
107
/*
 * Accessor for one tracepoint field: its byte offset into the sample's
 * raw data plus a reader, either returning the value widened to u64
 * (integer fields) or a pointer into the raw data (string/array fields).
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
115
/*
 * TP_UINT_FIELD(bits) emits tp_field__u<bits>(): read an unsigned
 * <bits>-bit integer from the sample's raw data at the field's offset.
 * memcpy() is used so the load works even if raw_data is misaligned.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value; \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);

/*
 * Byte-swapping variants, selected when the perf.data file was recorded
 * on a host with the opposite endianness (evsel->needs_swap).
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
140
141 static int tp_field__init_uint(struct tp_field *field,
142 struct format_field *format_field,
143 bool needs_swap)
144 {
145 field->offset = format_field->offset;
146
147 switch (format_field->size) {
148 case 1:
149 field->integer = tp_field__u8;
150 break;
151 case 2:
152 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
153 break;
154 case 4:
155 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
156 break;
157 case 8:
158 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
159 break;
160 default:
161 return -1;
162 }
163
164 return 0;
165 }
166
/* Return a pointer into the sample's raw data at this field's offset. */
static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}

/* Set 'field' up to be read via tp_field__ptr(); always succeeds. */
static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
{
	field->offset = format_field->offset;
	field->pointer = tp_field__ptr;
	return 0;
}
178
/*
 * Field accessors for the sys_enter/sys_exit tracepoints: the syscall
 * id, plus either the argument array (sys_enter) or the return value
 * (sys_exit) — the two never coexist, hence the union.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
185
186 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
187 struct tp_field *field,
188 const char *name)
189 {
190 struct format_field *format_field = perf_evsel__field(evsel, name);
191
192 if (format_field == NULL)
193 return -1;
194
195 return tp_field__init_uint(field, format_field, evsel->needs_swap);
196 }
197
/*
 * Initialize the integer accessor for member 'name' of the syscall_tp
 * hanging off evsel->priv; #name stringifies the member, which must
 * match the tracepoint field name.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
201
/*
 * Look up tracepoint field 'name' on 'evsel' and wire up a pointer
 * accessor for it in 'field'.
 *
 * Return: 0 on success, -1 if the field does not exist.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *fmt = perf_evsel__field(evsel, name);

	return fmt ? tp_field__init_ptr(field, fmt) : -1;
}
213
/*
 * Pointer-field counterpart of perf_evsel__init_sc_tp_uint_field():
 * binds member 'name' of the syscall_tp in evsel->priv to the
 * tracepoint field of the same name.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
217
/* Free the evsel's private syscall_tp, then the evsel itself. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
223
224 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
225 {
226 evsel->priv = malloc(sizeof(struct syscall_tp));
227 if (evsel->priv != NULL) {
228 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
229 goto out_delete;
230
231 evsel->handler = handler;
232 return 0;
233 }
234
235 return -ENOMEM;
236
237 out_delete:
238 zfree(&evsel->priv);
239 return -ENOENT;
240 }
241
242 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
243 {
244 struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
245
246 /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
247 if (IS_ERR(evsel))
248 evsel = perf_evsel__newtp("syscalls", direction);
249
250 if (IS_ERR(evsel))
251 return NULL;
252
253 if (perf_evsel__init_syscall_tp(evsel, handler))
254 goto out_delete;
255
256 return evsel;
257
258 out_delete:
259 perf_evsel__delete_priv(evsel);
260 return NULL;
261 }
262
/* Read tracepoint field 'name' from 'sample' as an integer. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* Get a pointer to tracepoint field 'name' inside 'sample''s raw data. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
270
/*
 * Context handed to the per-argument pretty printers ("beautifiers").
 */
struct syscall_arg {
	unsigned long val;	/* raw argument value */
	struct thread *thread;
	struct trace  *trace;
	void	      *parm;	/* printer-specific parameter, e.g. a strarray */
	u8	      idx;	/* NOTE(review): presumably the 0-based argument index — confirm */
	u8	      mask;	/* NOTE(review): semantics not visible in this chunk — confirm at use site */
};

/*
 * Table mapping small integers to names; 'offset' is subtracted from the
 * value before indexing 'entries' (see __syscall_arg__scnprintf_strarray()).
 */
struct strarray {
	int offset;
	int nr_entries;
	const char **entries;
};
285
/* Wrap a const char *array[] in a strarray named strarray__<array>. */
#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

/* Same, for tables whose first entry maps to value 'off' instead of 0. */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.offset	    = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
296
297 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
298 const char *intfmt,
299 struct syscall_arg *arg)
300 {
301 struct strarray *sa = arg->parm;
302 int idx = arg->val - sa->offset;
303
304 if (idx < 0 || idx >= sa->nr_entries)
305 return scnprintf(bf, size, intfmt, arg->val);
306
307 return scnprintf(bf, size, "%s", sa->entries[idx]);
308 }
309
/* strarray printer with a decimal fallback for out-of-range values. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
317
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 * gets rewritten to support all arches.
 */
/* strarray printer with a hex fallback; used for ioctl cmd numbers. */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
331
332 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
333 struct syscall_arg *arg);
334
335 #define SCA_FD syscall_arg__scnprintf_fd
336
337 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
338 struct syscall_arg *arg)
339 {
340 int fd = arg->val;
341
342 if (fd == AT_FDCWD)
343 return scnprintf(bf, size, "CWD");
344
345 return syscall_arg__scnprintf_fd(bf, size, arg);
346 }
347
348 #define SCA_FDAT syscall_arg__scnprintf_fd_at
349
350 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
351 struct syscall_arg *arg);
352
353 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
354
/* Print an argument (address, flags word, ...) as 0x-prefixed hex. */
static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
					 struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

#define SCA_HEX syscall_arg__scnprintf_hex
362
363 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
364 struct syscall_arg *arg)
365 {
366 return scnprintf(bf, size, "%d", arg->val);
367 }
368
369 #define SCA_INT syscall_arg__scnprintf_int
370
/*
 * Name tables for enum-like syscall arguments, each wrapped in a
 * strarray so SCA_STRARRAY can map values to names. Entries are indexed
 * by value (minus the strarray offset, where one is given).
 */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* EPOLL_CTL_ADD starts at 1, hence the offset */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
	"F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
	"F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
436
437 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
438 struct syscall_arg *arg)
439 {
440 size_t printed = 0;
441 int mode = arg->val;
442
443 if (mode == F_OK) /* 0 */
444 return scnprintf(bf, size, "F");
445 #define P_MODE(n) \
446 if (mode & n##_OK) { \
447 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
448 mode &= ~n##_OK; \
449 }
450
451 P_MODE(R);
452 P_MODE(W);
453 P_MODE(X);
454 #undef P_MODE
455
456 if (mode)
457 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
458
459 return printed;
460 }
461
462 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
463
464 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
465 struct syscall_arg *arg);
466
467 #define SCA_FILENAME syscall_arg__scnprintf_filename
468
/*
 * Print a pipe2(2) flags argument as "CLOEXEC|NONBLOCK"; leftover
 * unknown bits are appended in hex.
 */
static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

	/* append #n, separated by '|' if something was already printed */
#define	P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
491
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
#define TCGETS		0x5401

/*
 * Terminal ioctl request names, indexed from TCGETS (0x5401) via the
 * strarray offset below. The designated initializers ([0x27], [0x50],
 * [0x60]) skip gaps in the request-number space.
 */
static const char *tioctls[] = {
	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
	[0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
	"TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
};

static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
#endif /* defined(__i386__) || defined(__x86_64__) */

/* getrandom(2) flag fallbacks for libcs/headers that lack them */
#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM	0x0002
#endif
525
526 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
527 struct syscall_arg *arg)
528 {
529 int printed = 0, flags = arg->val;
530
531 #define P_FLAG(n) \
532 if (flags & GRND_##n) { \
533 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
534 flags &= ~GRND_##n; \
535 }
536
537 P_FLAG(RANDOM);
538 P_FLAG(NONBLOCK);
539 #undef P_FLAG
540
541 if (flags)
542 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
543
544 return printed;
545 }
546
547 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
548
/*
 * Shorthand for a syscall_fmts entry that prints argument 'arg' via the
 * SCA_STRARRAY beautifier backed by strarray__<array>.
 */
#define STRARRAY(arg, name, array) \
	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
	  .arg_parm	 = { [arg] = &strarray__##array, }
552
553 #include "trace/beauty/eventfd.c"
554 #include "trace/beauty/flock.c"
555 #include "trace/beauty/futex_op.c"
556 #include "trace/beauty/mmap.c"
557 #include "trace/beauty/mode_t.c"
558 #include "trace/beauty/msg_flags.c"
559 #include "trace/beauty/open_flags.c"
560 #include "trace/beauty/perf_event_open.c"
561 #include "trace/beauty/pid.c"
562 #include "trace/beauty/sched_policy.c"
563 #include "trace/beauty/seccomp.c"
564 #include "trace/beauty/signum.c"
565 #include "trace/beauty/socket_type.c"
566 #include "trace/beauty/waitid_options.c"
567
/*
 * Per-syscall formatting overrides.
 *
 * NOTE: the table must be kept sorted by .name — syscall_fmt__find()
 * looks entries up with bsearch().
 *
 * .alias:	alternative syscall name to match — TODO confirm direction
 * .errmsg:	NOTE(review): presumably decode negative returns as errno — confirm
 * .errpid:	NOTE(review): presumably the return value is a pid — confirm
 * .timeout:	NOTE(review): presumably zero return means timeout — confirm
 * .hexret:	print the return value in hex (mmap, brk, mremap)
 * .arg_scnprintf[i]: beautifier for argument i
 * .arg_parm[i]:      opaque parameter passed to that beautifier
 */
static struct syscall_fmt {
	const char *name;
	const char *alias;
	size_t	   (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
	void	   *arg_parm[6];
	bool	   errmsg;
	bool	   errpid;
	bool	   timeout;
	bool	   hexret;
} syscall_fmts[] = {
	{ .name	    = "access",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_ACCMODE, /* mode */ }, },
	{ .name	    = "arch_prctl", .errmsg = true, .alias = "prctl", },
	{ .name	    = "bpf",	    .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
	{ .name	    = "brk",	    .hexret = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
	{ .name	    = "chdir",	    .errmsg = true, },
	{ .name	    = "chmod",	    .errmsg = true, },
	{ .name	    = "chroot",	    .errmsg = true, },
	{ .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
	{ .name	    = "clone",	    .errpid = true, },
	{ .name	    = "close",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
	{ .name	    = "connect",    .errmsg = true, },
	{ .name	    = "creat",	    .errmsg = true, },
	{ .name	    = "dup",	    .errmsg = true, },
	{ .name	    = "dup2",	    .errmsg = true, },
	{ .name	    = "dup3",	    .errmsg = true, },
	{ .name	    = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
	{ .name	    = "eventfd2",   .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
	{ .name	    = "faccessat",  .errmsg = true, },
	{ .name	    = "fadvise64",  .errmsg = true, },
	{ .name	    = "fallocate",  .errmsg = true, },
	{ .name	    = "fchdir",	    .errmsg = true, },
	{ .name	    = "fchmod",	    .errmsg = true, },
	{ .name	    = "fchmodat",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "fchown",	    .errmsg = true, },
	{ .name	    = "fchownat",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "fcntl",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
	  .arg_parm	 = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
	{ .name	    = "fdatasync",  .errmsg = true, },
	{ .name	    = "flock",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
	{ .name	    = "fsetxattr",  .errmsg = true, },
	{ .name	    = "fstat",	    .errmsg = true, .alias = "newfstat", },
	{ .name	    = "fstatat",    .errmsg = true, .alias = "newfstatat", },
	{ .name	    = "fstatfs",    .errmsg = true, },
	{ .name	    = "fsync",    .errmsg = true, },
	{ .name	    = "ftruncate", .errmsg = true, },
	{ .name	    = "futex",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
	{ .name	    = "futimesat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "getdents",   .errmsg = true, },
	{ .name	    = "getdents64", .errmsg = true, },
	{ .name	    = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
	{ .name	    = "getpid",	    .errpid = true, },
	{ .name	    = "getpgid",    .errpid = true, },
	{ .name	    = "getppid",    .errpid = true, },
	{ .name	    = "getrandom",  .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
	{ .name	    = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
	{ .name	    = "getxattr",   .errmsg = true, },
	{ .name	    = "inotify_add_watch",	    .errmsg = true, },
	{ .name	    = "ioctl",	    .errmsg = true,
	  .arg_scnprintf = {
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
			     [1] = SCA_STRHEXARRAY, /* cmd */
			     [2] = SCA_HEX, /* arg */ },
	  .arg_parm	 = { [1] = &strarray__tioctls, /* cmd */ }, },
#else
			     [2] = SCA_HEX, /* arg */ }, },
#endif
	{ .name	    = "keyctl",	    .errmsg = true, STRARRAY(0, option, keyctl_options), },
	{ .name	    = "kill",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "lchown",    .errmsg = true, },
	{ .name	    = "lgetxattr",  .errmsg = true, },
	{ .name	    = "linkat",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "listxattr",  .errmsg = true, },
	{ .name	    = "llistxattr", .errmsg = true, },
	{ .name	    = "lremovexattr",  .errmsg = true, },
	{ .name	    = "lseek",	    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
	  .arg_parm	 = { [2] = &strarray__whences, /* whence */ }, },
	{ .name	    = "lsetxattr",  .errmsg = true, },
	{ .name	    = "lstat",	    .errmsg = true, .alias = "newlstat", },
	{ .name	    = "lsxattr",    .errmsg = true, },
	{ .name     = "madvise",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX,	 /* start */
			     [2] = SCA_MADV_BHV, /* behavior */ }, },
	{ .name	    = "mkdir",    .errmsg = true, },
	{ .name	    = "mkdirat",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "mknod",      .errmsg = true, },
	{ .name	    = "mknodat",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "mlock",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name	    = "mlockall",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name	    = "mmap",	    .hexret = true,
	  .arg_scnprintf = { [0] = SCA_HEX,	  /* addr */
			     [2] = SCA_MMAP_PROT, /* prot */
			     [3] = SCA_MMAP_FLAGS, /* flags */ }, },
	{ .name	    = "mprotect",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* start */
			     [2] = SCA_MMAP_PROT, /* prot */ }, },
	{ .name	    = "mq_unlink", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
	{ .name	    = "mremap",	    .hexret = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
			     [3] = SCA_MREMAP_FLAGS, /* flags */
			     [4] = SCA_HEX, /* new_addr */ }, },
	{ .name	    = "munlock",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name	    = "munmap",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name	    = "name_to_handle_at", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "newfstatat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "open",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
	{ .name	    = "open_by_handle_at", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
	{ .name	    = "openat",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
	{ .name	    = "perf_event_open", .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_INT, /* cpu */
			     [3] = SCA_FD,  /* group_fd */
			     [4] = SCA_PERF_FLAGS, /* flags */ }, },
	{ .name	    = "pipe2",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
	{ .name	    = "poll",	    .errmsg = true, .timeout = true, },
	{ .name	    = "ppoll",	    .errmsg = true, .timeout = true, },
	{ .name	    = "pread",	    .errmsg = true, .alias = "pread64", },
	{ .name	    = "preadv",	    .errmsg = true, .alias = "pread", },
	{ .name	    = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
	{ .name	    = "pwrite",	    .errmsg = true, .alias = "pwrite64", },
	{ .name	    = "pwritev",    .errmsg = true, },
	{ .name	    = "read",	    .errmsg = true, },
	{ .name	    = "readlink",   .errmsg = true, },
	{ .name	    = "readlinkat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "readv",	    .errmsg = true, },
	{ .name	    = "recvfrom",   .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "recvmmsg",   .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "recvmsg",    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "removexattr", .errmsg = true, },
	{ .name	    = "renameat",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "rmdir",    .errmsg = true, },
	{ .name	    = "rt_sigaction", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
	{ .name	    = "rt_sigqueueinfo", .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "rt_tgsigqueueinfo", .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "sched_setscheduler",   .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
	{ .name	    = "seccomp", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
			     [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
	{ .name	    = "select",	    .errmsg = true, .timeout = true, },
	{ .name	    = "sendmmsg",    .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "sendmsg",    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "sendto",	    .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "set_tid_address", .errpid = true, },
	{ .name	    = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
	{ .name	    = "setpgid",    .errmsg = true, },
	{ .name	    = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
	{ .name	    = "setxattr",   .errmsg = true, },
	{ .name	    = "shutdown",   .errmsg = true, },
	{ .name	    = "socket",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
			     [1] = SCA_SK_TYPE, /* type */ },
	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
	{ .name	    = "socketpair", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
			     [1] = SCA_SK_TYPE, /* type */ },
	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
	{ .name	    = "stat",	    .errmsg = true, .alias = "newstat", },
	{ .name	    = "statfs",	    .errmsg = true, },
	{ .name	    = "swapoff",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
	{ .name	    = "swapon",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
	{ .name	    = "symlinkat",  .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "tgkill",	    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "tkill",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "truncate",   .errmsg = true, },
	{ .name	    = "uname",	    .errmsg = true, .alias = "newuname", },
	{ .name	    = "unlinkat",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "utime",  .errmsg = true, },
	{ .name	    = "utimensat",  .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
	{ .name	    = "utimes",  .errmsg = true, },
	{ .name	    = "vmsplice",  .errmsg = true, },
	{ .name	    = "wait4",	    .errpid = true,
	  .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
	{ .name	    = "waitid",	    .errpid = true,
	  .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
	{ .name	    = "write",	    .errmsg = true, },
	{ .name	    = "writev",	    .errmsg = true, },
};
795
796 static int syscall_fmt__cmp(const void *name, const void *fmtp)
797 {
798 const struct syscall_fmt *fmt = fmtp;
799 return strcmp(name, fmt->name);
800 }
801
/*
 * Binary-search the (alphabetically sorted) syscall_fmts table.
 * Returns NULL when the syscall has no formatting overrides.
 */
static struct syscall_fmt *syscall_fmt__find(const char *name)
{
	const int nmemb = ARRAY_SIZE(syscall_fmts);
	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
}
807
/*
 * Everything known about one syscall: its tracepoint format, argument
 * list and the formatting overrides from syscall_fmts, with the
 * beautifier/parm arrays flattened out for per-argument lookup.
 */
struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;
	const char	    *name;
	bool		    is_exit;	/* NOTE(review): presumably exit/exit_group, which never return — confirm */
	struct syscall_fmt  *fmt;
	size_t		    (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void		    **arg_parm;
};
818
819 static size_t fprintf_duration(unsigned long t, FILE *fp)
820 {
821 double duration = (double)t / NSEC_PER_MSEC;
822 size_t printed = fprintf(fp, "(");
823
824 if (duration >= 1.0)
825 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
826 else if (duration >= 0.01)
827 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
828 else
829 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
830 return printed + fprintf(fp, "): ");
831 }
832
/**
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 */
/* Per-thread tracing state, hung off thread->priv (see thread__trace()). */
struct thread_trace {
	u64		  entry_time;	/* timestamp of the pending sys_enter */
	u64		  exit_time;
	bool		  entry_pending;	/* sys_enter seen, waiting for sys_exit */
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;	/* major/minor page-fault counts */
	char		  *entry_str;	/* staged sys_enter line, printed at sys_exit */
	double		  runtime_ms;
	struct {
		unsigned long ptr;
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {
		int	  max;		/* highest fd in 'table'; -1 when empty */
		char	  **table;	/* fd -> pathname cache, grown on demand */
	} paths;

	struct intlist *syscall_stats;
};
860
861 static struct thread_trace *thread_trace__new(void)
862 {
863 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
864
865 if (ttrace)
866 ttrace->paths.max = -1;
867
868 ttrace->syscall_stats = intlist__new(NULL);
869
870 return ttrace;
871 }
872
/*
 * Get the thread_trace for 'thread', lazily allocating it on first use,
 * and bump its event counter. On any failure (NULL thread or allocation
 * failure) a warning goes to 'fp' and NULL is returned — the caller is
 * expected to drop the sample rather than abort.
 */
static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
{
	struct thread_trace *ttrace;

	if (thread == NULL)
		goto fail;

	if (thread__priv(thread) == NULL)
		thread__set_priv(thread, thread_trace__new());

	/* re-check: thread_trace__new() may have failed */
	if (thread__priv(thread) == NULL)
		goto fail;

	ttrace = thread__priv(thread);
	++ttrace->nr_events;

	return ttrace;
fail:
	color_fprintf(fp, PERF_COLOR_RED,
		      "WARNING: not enough memory, dropping samples!\n");
	return NULL;
}
895
/* trace->trace_pgfaults bits: trace major and/or minor page faults */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* NOTE(review): presumably the allocation size for thread_trace->entry_str — confirm at use site */
static const size_t trace__entry_str_size = 2048;
900
/*
 * Cache 'pathname' for 'fd' in the thread's fd->path table, growing the
 * table as needed. Returns 0 on success, -1 on allocation failure.
 */
static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (fd > ttrace->paths.max) {
		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));

		if (npath == NULL)
			return -1;

		/* zero only the newly added slots; realloc leaves them indeterminate */
		if (ttrace->paths.max != -1) {
			memset(npath + ttrace->paths.max + 1, 0,
			       (fd - ttrace->paths.max) * sizeof(char *));
		} else {
			/* first allocation: the whole table is new */
			memset(npath, 0, (fd + 1) * sizeof(char *));
		}

		ttrace->paths.table = npath;
		ttrace->paths.max   = fd;
	}

	ttrace->paths.table[fd] = strdup(pathname);

	return ttrace->paths.table[fd] != NULL ? 0 : -1;
}
926
/*
 * Resolve 'fd' to its pathname by readlinking the thread's /proc fd
 * entry and cache the result. Returns 0 on success, -1 if the link
 * cannot be read or the target doesn't fit in the buffer.
 */
static int thread__read_fd_path(struct thread *thread, int fd)
{
	char linkname[PATH_MAX], pathname[PATH_MAX];
	struct stat st;
	int ret;

	/* thread group leaders have their fds directly under /proc/<pid> */
	if (thread->pid_ == thread->tid) {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread->pid_, fd);
	} else {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
	}

	/* st.st_size of a symlink is the length of its target */
	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
		return -1;

	ret = readlink(linkname, pathname, sizeof(pathname));

	if (ret < 0 || ret > st.st_size)
		return -1;

	/* readlink() does not NUL-terminate */
	pathname[ret] = '\0';
	return trace__set_fd_pathname(thread, fd, pathname);
}
952
/*
 * Return the cached pathname for 'fd' on 'thread', resolving it via
 * /proc on a cache miss — but only when tracing live, since /proc state
 * is meaningless when replaying a perf.data file. Returns NULL when the
 * path is unknown.
 */
static const char *thread__fd_path(struct thread *thread, int fd,
				   struct trace *trace)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (ttrace == NULL)
		return NULL;

	if (fd < 0)
		return NULL;

	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
		if (!trace->live)
			return NULL;
		++trace->stats.proc_getname;
		if (thread__read_fd_path(thread, fd))
			return NULL;
	}

	return ttrace->paths.table[fd];
}
974
975 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
976 struct syscall_arg *arg)
977 {
978 int fd = arg->val;
979 size_t printed = scnprintf(bf, size, "%d", fd);
980 const char *path = thread__fd_path(arg->thread, fd, arg->trace);
981
982 if (path)
983 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
984
985 return printed;
986 }
987
988 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
989 struct syscall_arg *arg)
990 {
991 int fd = arg->val;
992 size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
993 struct thread_trace *ttrace = thread__priv(arg->thread);
994
995 if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
996 zfree(&ttrace->paths.table[fd]);
997
998 return printed;
999 }
1000
/*
 * Remember where in this thread's entry_str a filename will later be
 * spliced in (by trace__vfs_getname), and the userspace pointer that
 * identifies which filename that is.
 */
static void thread__set_filename_pos(struct thread *thread, const char *bf,
				     unsigned long ptr)
{
	struct thread_trace *ttrace = thread__priv(thread);

	ttrace->filename.ptr = ptr;
	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
}
1009
1010 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1011 struct syscall_arg *arg)
1012 {
1013 unsigned long ptr = arg->val;
1014
1015 if (!arg->trace->vfs_getname)
1016 return scnprintf(bf, size, "%#x", ptr);
1017
1018 thread__set_filename_pos(arg->thread, bf, ptr);
1019 return 0;
1020 }
1021
1022 static bool trace__filter_duration(struct trace *trace, double t)
1023 {
1024 return t < (trace->duration_filter * NSEC_PER_MSEC);
1025 }
1026
1027 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1028 {
1029 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1030
1031 return fprintf(fp, "%10.3f ", ts);
1032 }
1033
static bool done = false;
static bool interrupted = false;

/*
 * Async signal handler: record that the main loop should stop, and
 * whether the stop came from the user (SIGINT) so the caller can decide
 * what to print on the way out.
 * NOTE(review): these flags are plain bools rather than volatile
 * sig_atomic_t; this relies on the main loop re-reading them around
 * blocking calls — consider sig_atomic_t for strict conformance.
 */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1042
1043 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1044 u64 duration, u64 tstamp, FILE *fp)
1045 {
1046 size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1047 printed += fprintf_duration(duration, fp);
1048
1049 if (trace->multiple_threads) {
1050 if (trace->show_comm)
1051 printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1052 printed += fprintf(fp, "%d ", thread->tid);
1053 }
1054
1055 return printed;
1056 }
1057
1058 static int trace__process_event(struct trace *trace, struct machine *machine,
1059 union perf_event *event, struct perf_sample *sample)
1060 {
1061 int ret = 0;
1062
1063 switch (event->header.type) {
1064 case PERF_RECORD_LOST:
1065 color_fprintf(trace->output, PERF_COLOR_RED,
1066 "LOST %" PRIu64 " events!\n", event->lost.lost);
1067 ret = machine__process_lost_event(machine, event, sample);
1068 break;
1069 default:
1070 ret = machine__process_event(machine, event, sample);
1071 break;
1072 }
1073
1074 return ret;
1075 }
1076
1077 static int trace__tool_process(struct perf_tool *tool,
1078 union perf_event *event,
1079 struct perf_sample *sample,
1080 struct machine *machine)
1081 {
1082 struct trace *trace = container_of(tool, struct trace, tool);
1083 return trace__process_event(trace, machine, event, sample);
1084 }
1085
1086 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1087 {
1088 struct machine *machine = vmachine;
1089
1090 if (machine->kptr_restrict_warned)
1091 return NULL;
1092
1093 if (symbol_conf.kptr_restrict) {
1094 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1095 "Check /proc/sys/kernel/kptr_restrict.\n\n"
1096 "Kernel samples will not be resolved.\n");
1097 machine->kptr_restrict_warned = true;
1098 return NULL;
1099 }
1100
1101 return machine__resolve_kernel_addr(vmachine, addrp, modp);
1102 }
1103
1104 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1105 {
1106 int err = symbol__init(NULL);
1107
1108 if (err)
1109 return err;
1110
1111 trace->host = machine__new_host();
1112 if (trace->host == NULL)
1113 return -ENOMEM;
1114
1115 if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
1116 return -errno;
1117
1118 err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1119 evlist->threads, trace__tool_process, false,
1120 trace->opts.proc_map_timeout);
1121 if (err)
1122 symbol__exit();
1123
1124 return err;
1125 }
1126
/*
 * Pick a pretty-printer for each argument of syscall 'sc', in field
 * order: an explicit per-syscall formatter from the fmt table wins,
 * then heuristics on the tracepoint field's type/name (filename-like
 * strings, pointers, pid_t, umode_t, *fd integers).  Fields with no
 * formatter are printed as plain longs by the caller.
 * Returns 0 on success, -1 on allocation failure.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
	if (sc->arg_scnprintf == NULL)
		return -1;

	if (sc->fmt)
		sc->arg_parm = sc->fmt->arg_parm;

	for (field = sc->args; field; field = field->next) {
		/* An explicit formatter in the fmt table takes precedence. */
		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
		else if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_scnprintf[idx] = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_scnprintf[idx] = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_scnprintf[idx] = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_scnprintf[idx] = SCA_FD;
		}
		++idx;
	}

	return 0;
}
1172
/*
 * Lazily populate trace->syscalls.table[id]: resolve the syscall name,
 * look up any format override, open the syscalls:sys_enter_<name>
 * tracepoint (falling back to the fmt table's alias if needed), strip
 * the leading syscall-number field and choose argument formatters.
 * Returns 0 on success, -1 on any failure.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	if (id > trace->syscalls.max) {
		/*
		 * Grow the table to hold entries [0..id]; realloc into a
		 * temporary so the old table survives allocation failure.
		 */
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			/* Zero only the newly appended slots. */
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			/* First allocation: zero the whole table. */
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Some syscalls' tracepoints go by an alias in the fmt table. */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	sc->nr_args = sc->tp_format->format.nr_fields;
	/*
	 * We need to check and discard the first variable '__syscall_nr'
	 * or 'nr' that mean the syscall number. It is needless here.
	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}
1231
/*
 * Translate the user-supplied syscall-name list (trace->ev_qualifier)
 * into syscall ids in trace->ev_qualifier_ids.  All invalid names are
 * collected into a single error message before bailing out.
 * Returns 0 on success, -EINVAL on allocation failure or bad names.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		      trace->output);
		err = -EINVAL;
		goto out;
	}

	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc);

		if (id < 0) {
			/* First bad name opens the message, later ones append. */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}

		trace->ev_qualifier_ids.entries[i++] = id;
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1277
1278 /*
1279 * args is to be interpreted as a series of longs but we need to handle
1280 * 8-byte unaligned accesses. args points to raw_data within the event
1281 * and raw_data is guaranteed to be 8-byte unaligned because it is
1282 * preceded by raw_size which is a u32. So we need to copy args to a temp
1283 * variable to read it. Most notably this avoids extended load instructions
1284 * on unaligned addresses
1285 */
1286
/*
 * Format the arguments of syscall 'sc' from the raw 'args' payload into
 * 'bf'.  When the tracepoint format was parsed, each field gets its
 * chosen formatter (falling back to "%ld"); zero-valued args with no
 * strarray mapping are suppressed.  When the format could not be read,
 * the six raw longs are dumped instead.  Returns chars printed.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned char *p;
	unsigned long val;

	if (sc->args != NULL) {
		struct format_field *field;
		u8 bit = 1;
		struct syscall_arg arg = {
			.idx = 0,
			.mask = 0,
			.trace = trace,
			.thread = thread,
		};

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/* A formatter may consume several args and mask them. */
			if (arg.mask & bit)
				continue;

			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * arg.idx;
			memcpy(&val, p, sizeof(val));

			/*
			 * Suppress this argument if its value is zero and
			 * we don't have a string associated in an
			 * strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_scnprintf &&
			      sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
			      sc->arg_parm[arg.idx]))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
				arg.val = val;
				if (sc->arg_parm)
					arg.parm = sc->arg_parm[arg.idx];
				printed += sc->arg_scnprintf[arg.idx](bf + printed,
								      size - printed, &arg);
			} else {
				printed += scnprintf(bf + printed, size - printed,
						     "%ld", val);
			}
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		int i = 0;

		while (i < 6) {
			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * i;
			memcpy(&val, p, sizeof(val));
			printed += scnprintf(bf + printed, size - printed,
					     "%sarg%d: %ld",
					     printed ? ", " : "", i, val);
			++i;
		}
	}

	return printed;
}
1359
1360 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1361 union perf_event *event,
1362 struct perf_sample *sample);
1363
/*
 * Return the struct syscall for 'id', reading its tracepoint info on
 * first use.  Returns NULL (with a diagnostic at sufficient verbosity)
 * for invalid ids or when the info cannot be read.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	/* Re-check: trace__read_syscall_info() may still have left no entry. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1406
1407 static void thread__update_stats(struct thread_trace *ttrace,
1408 int id, struct perf_sample *sample)
1409 {
1410 struct int_node *inode;
1411 struct stats *stats;
1412 u64 duration = 0;
1413
1414 inode = intlist__findnew(ttrace->syscall_stats, id);
1415 if (inode == NULL)
1416 return;
1417
1418 stats = inode->priv;
1419 if (stats == NULL) {
1420 stats = malloc(sizeof(struct stats));
1421 if (stats == NULL)
1422 return;
1423 init_stats(stats);
1424 inode->priv = stats;
1425 }
1426
1427 if (ttrace->entry_time && sample->time > ttrace->entry_time)
1428 duration = sample->time - ttrace->entry_time;
1429
1430 update_stats(stats, duration);
1431 }
1432
1433 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1434 {
1435 struct thread_trace *ttrace;
1436 u64 duration;
1437 size_t printed;
1438
1439 if (trace->current == NULL)
1440 return 0;
1441
1442 ttrace = thread__priv(trace->current);
1443
1444 if (!ttrace->entry_pending)
1445 return 0;
1446
1447 duration = sample->time - ttrace->entry_time;
1448
1449 printed = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
1450 printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1451 ttrace->entry_pending = false;
1452
1453 return printed;
1454 }
1455
/*
 * raw_syscalls:sys_enter handler: format "name(args" into the
 * per-thread entry_str.  Exit-like syscalls (which never return) are
 * printed immediately; everything else is left pending for
 * trace__sys_exit()/trace__printf_interrupted_entry() to complete.
 * Returns 0 on success, -1 on failure.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the buffer that accumulates the entry line. */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace, sample);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* exit()/exit_group() have no sys_exit: print the line now. */
		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Track the last thread seen, keeping a reference on it. */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1514
1515 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1516 struct perf_sample *sample,
1517 struct callchain_cursor *cursor)
1518 {
1519 struct addr_location al;
1520
1521 if (machine__resolve(trace->host, &al, sample) < 0 ||
1522 thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1523 return -1;
1524
1525 return 0;
1526 }
1527
/*
 * Print the previously-resolved callchain (in the global
 * callchain_cursor) under the current output line.
 * The 38 is the left-alignment width passed to the printer —
 * presumably chosen to line up under the trace columns; confirm
 * against sample__fprintf_callchain() before changing.
 */
static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
{
	/* TODO: user-configurable print_opts */
	const unsigned int print_opts = EVSEL__PRINT_SYM |
					EVSEL__PRINT_DSO |
					EVSEL__PRINT_UNKNOWN_AS_ADDR;

	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
}
1537
/*
 * raw_syscalls:sys_exit handler: complete the pending entry line (or
 * print a "continued" marker if the entry was flushed), format the
 * return value per the syscall's fmt (errno name, pid+comm, hex, plain
 * long), update summary stats and optionally print the callchain.
 * Returns 0 on success, -1 on failure.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/*
	 * A successful open() whose pathname was captured by
	 * probe:vfs_getname: cache the fd -> path association.
	 */
	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	ttrace->exit_time = sample->time;

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Too shallow for --min-stack: skip this event. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only)
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* Entry was flushed earlier (see trace__printf_interrupted_entry). */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	if (sc->fmt == NULL) {
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
		/* Negative return: print as -1 ERRNAME (message). */
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = audit_errno_to_name(-ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* Return value is a pid (e.g. fork/clone): show its comm. */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1637
/*
 * probe:vfs_getname handler: capture the pathname resolved in the
 * kernel and (a) remember it for the fd cache (see trace__sys_exit),
 * (b) splice it into the pending entry_str at the position recorded by
 * thread__set_filename_pos(), truncating from the left if it does not
 * fit.  Always returns 0.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out;

	filename_len = strlen(filename);

	/* Grow the per-thread filename buffer if this name is longer. */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	/* No filename argument position recorded: nothing to splice. */
	if (!ttrace->filename.ptr)
		goto out;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out;

	/* Keep the tail of the path when it doesn't fit. */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Shift the rest of the entry line right and insert the name. */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out:
	return 0;
}
1694
/*
 * sched:sched_stat_runtime handler: accumulate per-thread and global
 * CPU runtime for the summary.  If the thread has no trace state, dump
 * the raw event fields instead.  Always returns 0.
 */
static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
				     union perf_event *event __maybe_unused,
				     struct perf_sample *sample)
{
	u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
	struct thread *thread = machine__findnew_thread(trace->host,
							sample->pid,
							sample->tid);
	struct thread_trace *ttrace = thread__trace(thread, trace->output);

	if (ttrace == NULL)
		goto out_dump;

	ttrace->runtime_ms += runtime_ms;
	trace->runtime_ms += runtime_ms;
	thread__put(thread);
	return 0;

out_dump:
	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
	       evsel->name,
	       perf_evsel__strval(evsel, sample, "comm"),
	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
	       runtime,
	       perf_evsel__intval(evsel, sample, "vruntime"));
	thread__put(thread);
	return 0;
}
1724
1725 static void bpf_output__printer(enum binary_printer_ops op,
1726 unsigned int val, void *extra)
1727 {
1728 FILE *output = extra;
1729 unsigned char ch = (unsigned char)val;
1730
1731 switch (op) {
1732 case BINARY_PRINT_CHAR_DATA:
1733 fprintf(output, "%c", isprint(ch) ? ch : '.');
1734 break;
1735 case BINARY_PRINT_DATA_BEGIN:
1736 case BINARY_PRINT_LINE_BEGIN:
1737 case BINARY_PRINT_ADDR:
1738 case BINARY_PRINT_NUM_DATA:
1739 case BINARY_PRINT_NUM_PAD:
1740 case BINARY_PRINT_SEP:
1741 case BINARY_PRINT_CHAR_PAD:
1742 case BINARY_PRINT_LINE_END:
1743 case BINARY_PRINT_DATA_END:
1744 default:
1745 break;
1746 }
1747 }
1748
1749 static void bpf_output__fprintf(struct trace *trace,
1750 struct perf_sample *sample)
1751 {
1752 print_binary(sample->raw_data, sample->raw_size, 8,
1753 bpf_output__printer, trace->output);
1754 }
1755
/*
 * Generic handler for non-syscall tracepoints and BPF output events:
 * flush any pending syscall entry, print a timestamped line with the
 * event's fields (or its raw payload for bpf-output) and, when
 * available, the callchain.  Always returns 0.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	int callchain_ret = 0;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Too shallow for --min-stack: skip this event. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	trace__printf_interrupted_entry(trace, sample);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* Placeholder duration column, to align with syscall lines. */
	if (trace->trace_syscalls)
		fprintf(trace->output, "(         ): ");

	fprintf(trace->output, "%s:", evsel->name);

	if (perf_evsel__is_bpf_output(evsel)) {
		bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		event_format__fprintf(evsel->tp_format, sample->cpu,
				      sample->raw_data, sample->raw_size,
				      trace->output);
	}

	fprintf(trace->output, ")\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	return 0;
}
1796
1797 static void print_location(FILE *f, struct perf_sample *sample,
1798 struct addr_location *al,
1799 bool print_dso, bool print_sym)
1800 {
1801
1802 if ((verbose || print_dso) && al->map)
1803 fprintf(f, "%s@", al->map->dso->long_name);
1804
1805 if ((verbose || print_sym) && al->sym)
1806 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1807 al->addr - al->sym->start);
1808 else if (al->map)
1809 fprintf(f, "0x%" PRIx64, al->addr);
1810 else
1811 fprintf(f, "0x%" PRIx64, sample->addr);
1812 }
1813
/*
 * Page-fault event handler: bump the thread's maj/min fault counters
 * and, unless --summary-only, print "majfault [code-loc] => data-loc"
 * with a map-type letter ('d' data, 'x' executable, '?' unknown) and
 * optionally the callchain.  Returns 0 on success, -1 on failure.
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Too shallow for --min-stack: skip this event. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		goto out;

	/* Where the faulting instruction lives. */
	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
			      sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	/* Where the faulting access landed: try data maps first... */
	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
				   sample->addr, &al);

	if (!al.map) {
		/* ...then executable maps (e.g. faulting in code pages). */
		thread__find_addr_location(thread, sample->cpumode,
					   MAP__FUNCTION, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1889
1890 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
1891 {
1892 if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
1893 (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
1894 return false;
1895
1896 if (trace->pid_list || trace->tid_list)
1897 return true;
1898
1899 return false;
1900 }
1901
1902 static void trace__set_base_time(struct trace *trace,
1903 struct perf_evsel *evsel,
1904 struct perf_sample *sample)
1905 {
1906 /*
1907 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1908 * and don't use sample->time unconditionally, we may end up having
1909 * some other event in the future without PERF_SAMPLE_TIME for good
1910 * reason, i.e. we may not be interested in its timestamps, just in
1911 * it taking place, picking some piece of information when it
1912 * appears in our event stream (vfs_getname comes to mind).
1913 */
1914 if (trace->base_time == 0 && !trace->full_time &&
1915 (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1916 trace->base_time = sample->time;
1917 }
1918
1919 static int trace__process_sample(struct perf_tool *tool,
1920 union perf_event *event,
1921 struct perf_sample *sample,
1922 struct perf_evsel *evsel,
1923 struct machine *machine __maybe_unused)
1924 {
1925 struct trace *trace = container_of(tool, struct trace, tool);
1926 int err = 0;
1927
1928 tracepoint_handler handler = evsel->handler;
1929
1930 if (skip_sample(trace, sample))
1931 return 0;
1932
1933 trace__set_base_time(trace, evsel, sample);
1934
1935 if (handler) {
1936 ++trace->nr_events;
1937 handler(trace, evsel, event, sample);
1938 }
1939
1940 return err;
1941 }
1942
1943 static int parse_target_str(struct trace *trace)
1944 {
1945 if (trace->opts.target.pid) {
1946 trace->pid_list = intlist__new(trace->opts.target.pid);
1947 if (trace->pid_list == NULL) {
1948 pr_err("Error parsing process id string\n");
1949 return -EINVAL;
1950 }
1951 }
1952
1953 if (trace->opts.target.tid) {
1954 trace->tid_list = intlist__new(trace->opts.target.tid);
1955 if (trace->tid_list == NULL) {
1956 pr_err("Error parsing thread id string\n");
1957 return -EINVAL;
1958 }
1959 }
1960
1961 return 0;
1962 }
1963
1964 static int trace__record(struct trace *trace, int argc, const char **argv)
1965 {
1966 unsigned int rec_argc, i, j;
1967 const char **rec_argv;
1968 const char * const record_args[] = {
1969 "record",
1970 "-R",
1971 "-m", "1024",
1972 "-c", "1",
1973 };
1974
1975 const char * const sc_args[] = { "-e", };
1976 unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
1977 const char * const majpf_args[] = { "-e", "major-faults" };
1978 unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
1979 const char * const minpf_args[] = { "-e", "minor-faults" };
1980 unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
1981
1982 /* +1 is for the event string below */
1983 rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
1984 majpf_args_nr + minpf_args_nr + argc;
1985 rec_argv = calloc(rec_argc + 1, sizeof(char *));
1986
1987 if (rec_argv == NULL)
1988 return -ENOMEM;
1989
1990 j = 0;
1991 for (i = 0; i < ARRAY_SIZE(record_args); i++)
1992 rec_argv[j++] = record_args[i];
1993
1994 if (trace->trace_syscalls) {
1995 for (i = 0; i < sc_args_nr; i++)
1996 rec_argv[j++] = sc_args[i];
1997
1998 /* event string may be different for older kernels - e.g., RHEL6 */
1999 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2000 rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2001 else if (is_valid_tracepoint("syscalls:sys_enter"))
2002 rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2003 else {
2004 pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2005 return -1;
2006 }
2007 }
2008
2009 if (trace->trace_pgfaults & TRACE_PFMAJ)
2010 for (i = 0; i < majpf_args_nr; i++)
2011 rec_argv[j++] = majpf_args[i];
2012
2013 if (trace->trace_pgfaults & TRACE_PFMIN)
2014 for (i = 0; i < minpf_args_nr; i++)
2015 rec_argv[j++] = minpf_args[i];
2016
2017 for (i = 0; i < (unsigned int)argc; i++)
2018 rec_argv[j++] = argv[i];
2019
2020 return cmd_record(j, rec_argv, NULL);
2021 }
2022
2023 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2024
2025 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2026 {
2027 struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2028
2029 if (IS_ERR(evsel))
2030 return false;
2031
2032 if (perf_evsel__field(evsel, "pathname") == NULL) {
2033 perf_evsel__delete(evsel);
2034 return false;
2035 }
2036
2037 evsel->handler = trace__vfs_getname;
2038 perf_evlist__add(evlist, evsel);
2039 return true;
2040 }
2041
2042 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2043 {
2044 struct perf_evsel *evsel;
2045 struct perf_event_attr attr = {
2046 .type = PERF_TYPE_SOFTWARE,
2047 .mmap_data = 1,
2048 };
2049
2050 attr.config = config;
2051 attr.sample_period = 1;
2052
2053 event_attr_init(&attr);
2054
2055 evsel = perf_evsel__new(&attr);
2056 if (evsel)
2057 evsel->handler = trace__pgfault;
2058
2059 return evsel;
2060 }
2061
/*
 * Dispatch one ring-buffer event: non-sample records go to the generic
 * machine state machinery, samples are routed to the handler registered
 * on the evsel they belong to.
 */
static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
{
	const u32 type = event->header.type;
	struct perf_evsel *evsel;

	/* MMAP/COMM/FORK/EXIT etc. update the machine/thread state. */
	if (type != PERF_RECORD_SAMPLE) {
		trace__process_event(trace, trace->host, event, sample);
		return;
	}

	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
	if (evsel == NULL) {
		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
		return;
	}

	trace__set_base_time(trace, evsel, sample);

	/* A tracepoint sample without raw payload cannot be decoded. */
	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
	    sample->raw_data == NULL) {
		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
		       perf_evsel__name(evsel), sample->tid,
		       sample->cpu, sample->raw_size);
	} else {
		tracepoint_handler handler = evsel->handler;
		handler(trace, evsel, event, sample);
	}
}
2090
/*
 * Create and add the raw_syscalls:sys_enter/sys_exit tracepoint evsels to
 * the evlist, caching pointers to them in trace->syscalls.events.
 * On partial failure the already-created evsels are torn down via the
 * goto chain below.  Returns 0 on success, -1 on failure.
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	/* "args" carries the six syscall argument registers. */
	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	/* "ret" is the syscall return value. */
	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit = sys_exit;

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}
2136
/*
 * Build a tracepoint filter expression ("id == X || id == Y ..." or its
 * negation, depending on trace->not_ev_qualifier) from the qualified
 * syscall ids and append it to both the sys_enter and sys_exit evsels.
 * Returns 0 on success; -1 with errno = ENOMEM when the expression
 * cannot be allocated.
 */
static int trace__set_ev_qualifier_filter(struct trace *trace)
{
	int err = -1;
	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
						trace->ev_qualifier_ids.nr,
						trace->ev_qualifier_ids.entries);

	if (filter == NULL)
		goto out_enomem;

	/* Only filter sys_exit if appending to sys_enter succeeded (0). */
	if (!perf_evsel__append_filter(trace->syscalls.events.sys_enter, "&&", filter))
		err = perf_evsel__append_filter(trace->syscalls.events.sys_exit, "&&", filter);

	free(filter);
out:
	return err;
out_enomem:
	errno = ENOMEM;
	goto out;
}
2157
/*
 * Live-tracing main loop.  Builds the event list (syscall tracepoints,
 * optional vfs_getname probe, page fault and sched_stat_runtime events),
 * configures callchains and filters, forks/attaches the workload, mmaps
 * the ring buffers and consumes samples until the workload exits or the
 * user interrupts.  Returns 0 on success, negative on error.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;	/* leftover argv is the workload */
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_min);
	}

	/* --sched: account time spent blocked via sched_stat_runtime. */
	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, NULL);

	if (callchain_param.enabled) {
		bool use_identifier = false;

		if (trace->syscalls.events.sys_exit) {
			perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
						     &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_maj) {
			perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_min) {
			perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (use_identifier) {
		       /*
			* Now we have evsels with different sample_ids, use
			* PERF_SAMPLE_IDENTIFIER to map from sample to evsel
			* from a fixed position in each ring buffer record.
			*
			* As of this the changeset introducing this comment, this
			* isn't strictly needed, as the fields that can come before
			* PERF_SAMPLE_ID are all used, but we'll probably disable
			* some of those for things like copying the payload of
			* pointer syscall arguments, and for vfs_getname we don't
			* need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
			* here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
			*/
			perf_evlist__set_sample_bit(evlist, IDENTIFIER);
			perf_evlist__reset_sample_bit(evlist, ID);
		}
	}

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
		       errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = perf_evlist__set_filter_pid(evlist, getpid());

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target))
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	/* Show tids in the output when more than one thread may show up. */
	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;
again:
	before = trace->nr_events;

	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;

		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_evlist__mmap_consume(evlist, i);

			if (interrupted)
				goto out_disable;

			/* Workload done: stop producing, drain what's left. */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
	}

	/* No new events this pass: poll (bounded once 'done') or exit. */
	if (trace->nr_events == before) {
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	perf_evlist__delete(evlist);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * Error labels below live in a nested block so they can share errbuf
 * without it occupying stack space on the hot path above; each formats a
 * message and jumps back to the common cleanup.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2416
/*
 * Implements 'perf trace -i <file>': replay a previously recorded
 * perf.data session, wiring the same sample handlers used by live
 * tracing onto the events found in the file.  Returns 0 on success,
 * negative on error.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname", trace__vfs_getname, },
	};
	struct perf_data_file file = {
		.path  = input_name,
		.mode  = PERF_DATA_MODE_READ,
		.force = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.mmap2	  = perf_event__process_mmap2;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&file, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Route any recorded page-fault software events to our handler. */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	err = parse_target_str(trace);
	if (err != 0)
		goto out;

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
2512
/* Print the per-thread summary banner; returns the byte count written. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2521
/*
 * Re-sort the per-thread syscall stats intlist into a red-black tree
 * ordered by total time spent (msecs, descending).  The macro generates
 * the comparator/boilerplate; the body fills one sorted entry from an
 * intlist node whose ->priv holds the struct stats.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats 	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total = nr calls * average duration, converted ns -> ms */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2535
/*
 * Print one thread's syscall statistics table (calls, total/min/avg/max
 * duration and stddev), sorted by total time spent.  Returns the number
 * of bytes written to @fp (0 if the resort tree could not be built).
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* durations are accumulated in ns; report in ms */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* stddev as a percentage of the mean */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2578
/*
 * Print the summary line for one thread (comm, tid, event count and its
 * share of all events, fault counts, optional runtime) followed by its
 * syscall statistics table.  Returns bytes written; 0 when the thread
 * has no thread_trace private data.
 */
static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
{
	size_t printed = 0;
	struct thread_trace *ttrace = thread__priv(thread);
	double ratio;

	if (ttrace == NULL)
		return 0;

	/* this thread's share of all events seen, in percent */
	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;

	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
	printed += fprintf(fp, "%.1f%%", ratio);
	if (ttrace->pfmaj)
		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
	if (ttrace->pfmin)
		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
	if (trace->sched)
		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
	else if (fputc('\n', fp) != EOF)
		++printed;

	printed += thread__dump_stats(ttrace, trace, fp);

	return printed;
}
2606
2607 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2608 {
2609 return ttrace ? ttrace->nr_events : 0;
2610 }
2611
/*
 * Re-sort the machine's threads into a red-black tree ordered by event
 * count (busiest threads first, per the '<' comparator on a resort that
 * iterates in reverse insertion order of equal keys).
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2618
/*
 * Print the end-of-run summary: a header followed by one section per
 * thread, ordered by number of events.  Returns bytes written, or 0
 * when the threads could not be resorted.
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;

	if (threads == NULL) {
		fprintf(fp, "%s", "Error sorting output by nr_events!\n");
		return 0;
	}

	resort_rb__for_each_entry(nd, threads)
		printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

	resort_rb__delete(threads);

	return printed;
}
2637
2638 static int trace__set_duration(const struct option *opt, const char *str,
2639 int unset __maybe_unused)
2640 {
2641 struct trace *trace = opt->value;
2642
2643 trace->duration_filter = atof(str);
2644 return 0;
2645 }
2646
2647 static int trace__set_filter_pids(const struct option *opt, const char *str,
2648 int unset __maybe_unused)
2649 {
2650 int ret = -1;
2651 size_t i;
2652 struct trace *trace = opt->value;
2653 /*
2654 * FIXME: introduce a intarray class, plain parse csv and create a
2655 * { int nr, int entries[] } struct...
2656 */
2657 struct intlist *list = intlist__new(str);
2658
2659 if (list == NULL)
2660 return -1;
2661
2662 i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2663 trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2664
2665 if (trace->filter_pids.entries == NULL)
2666 goto out;
2667
2668 trace->filter_pids.entries[0] = getpid();
2669
2670 for (i = 1; i < trace->filter_pids.nr; ++i)
2671 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2672
2673 intlist__delete(list);
2674 ret = 0;
2675 out:
2676 return ret;
2677 }
2678
2679 static int trace__open_output(struct trace *trace, const char *filename)
2680 {
2681 struct stat st;
2682
2683 if (!stat(filename, &st) && st.st_size) {
2684 char oldname[PATH_MAX];
2685
2686 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2687 unlink(oldname);
2688 rename(filename, oldname);
2689 }
2690
2691 trace->output = fopen(filename, "w");
2692
2693 return trace->output == NULL ? -errno : 0;
2694 }
2695
2696 static int parse_pagefaults(const struct option *opt, const char *str,
2697 int unset __maybe_unused)
2698 {
2699 int *trace_pgfaults = opt->value;
2700
2701 if (strcmp(str, "all") == 0)
2702 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2703 else if (strcmp(str, "maj") == 0)
2704 *trace_pgfaults |= TRACE_PFMAJ;
2705 else if (strcmp(str, "min") == 0)
2706 *trace_pgfaults |= TRACE_PFMIN;
2707 else
2708 return -1;
2709
2710 return 0;
2711 }
2712
/* Install the same sample handler on every evsel in the list. */
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel)
		evsel->handler = handler;
}
2720
/*
 * Entry point for 'perf trace': parse options, validate the target and
 * event qualifier, then dispatch to record (trace__record), replay
 * (trace__replay, when -i was given) or live tracing (trace__run).
 */
int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
{
	const char *trace_usage[] = {
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
		NULL
	};
	struct trace trace = {
		.syscalls = {
			. max = -1,	/* -1: not yet initialized from the syscall table */
		},
		.opts = {
			.target = {
				.uid	   = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq     = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering  = true,
			.mmap_pages    = UINT_MAX,
			.proc_map_timeout  = 500,
		},
		.output = stderr,
		.show_comm = true,
		.trace_syscalls = true,
		.kernel_syscallchains = false,
		.max_stack = UINT_MAX,	/* UINT_MAX: user did not set --max-stack */
	};
	const char *output_name = NULL;
	const char *ev_qualifier_str = NULL;
	const struct option trace_options[] = {
	OPT_CALLBACK(0, "event", &trace.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		    "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_END()
	};
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	const char * const trace_subcommands[] = { "record", NULL };
	int err;
	char bf[BUFSIZ];

	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	err = -1;

	/* Page fault samples need address and time to be useful. */
	if (trace.trace_pgfaults) {
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	if (trace.max_stack == UINT_MAX) {
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	/* --min/max-stack implies callchains; default to dwarf unwinding. */
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
#endif

	if (callchain_param.enabled) {
		/* root can afford bigger buffers for stack samples */
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	if (trace.evlist->nr_entries > 0)
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);

	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		pr_err("Please specify something to trace.\n");
		return -1;
	}

	if (!trace.trace_syscalls && ev_qualifier_str) {
		pr_err("The -e option can't be used with --no-syscalls.\n");
		goto out;
	}

	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	trace.open_id = syscalltbl__id(trace.sctbl, "open");

	if (ev_qualifier_str != NULL) {
		const char *s = ev_qualifier_str;
		struct strlist_config slist_config = {
			.dirname = system_path(STRACE_GROUPS_DIR),
		};

		/* a leading '!' negates the qualifier list */
		trace.not_ev_qualifier = *s == '!';
		if (trace.not_ev_qualifier)
			++s;
		trace.ev_qualifier = strlist__new(s, &slist_config);
		if (trace.ev_qualifier == NULL) {
			fputs("Not enough memory to parse event qualifier",
			      trace.output);
			err = -ENOMEM;
			goto out_close;
		}

		err = trace__validate_ev_qualifier(&trace);
		if (err)
			goto out_close;
	}

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* no workload and no target given: trace the whole system */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}