]> git.proxmox.com Git - mirror_ubuntu-artful-kernel.git/blob - tools/perf/builtin-trace.c
iio: imu: inv_mpu6050: test whoami first and against all known values
[mirror_ubuntu-artful-kernel.git] / tools / perf / builtin-trace.c
1 /*
2 * builtin-trace.c
3 *
4 * Builtin 'trace' command:
5 *
6 * Display a continuously updated trace of any workload, CPU, specific PID,
7 * system wide, etc. Default format is loosely strace like, but any other
8 * event may be specified using --event.
9 *
10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11 *
12 * Initially based on the 'trace' prototype by Thomas Gleixner:
13 *
14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15 *
16 * Released under the GPL v2. (and only v2, not any later version)
17 */
18
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/event.h"
25 #include "util/evlist.h"
26 #include <subcmd/exec-cmd.h>
27 #include "util/machine.h"
28 #include "util/path.h"
29 #include "util/session.h"
30 #include "util/thread.h"
31 #include <subcmd/parse-options.h>
32 #include "util/strlist.h"
33 #include "util/intlist.h"
34 #include "util/thread_map.h"
35 #include "util/stat.h"
36 #include "trace/beauty/beauty.h"
37 #include "trace-event.h"
38 #include "util/parse-events.h"
39 #include "util/bpf-loader.h"
40 #include "callchain.h"
41 #include "print_binary.h"
42 #include "string2.h"
43 #include "syscalltbl.h"
44 #include "rb_resort.h"
45
46 #include <errno.h>
47 #include <inttypes.h>
48 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
49 #include <poll.h>
50 #include <signal.h>
51 #include <stdlib.h>
52 #include <string.h>
53 #include <linux/err.h>
54 #include <linux/filter.h>
55 #include <linux/audit.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
60
61 #include "sane_ctype.h"
62
/* Fallback for toolchains whose headers predate O_CLOEXEC. */
#ifndef O_CLOEXEC
# define O_CLOEXEC		02000000
#endif
66
/*
 * Per-invocation state of 'perf trace': tool callbacks, the syscall table
 * and its enter/exit tracepoint evsels, recording options, the output
 * stream, event/pid filters and assorted counters and mode flags.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;
	struct {
		int		max;	/* upper bound of 'table' (see thread paths for same idiom) */
		struct syscall  *table;
		struct {
			struct perf_evsel *sys_enter,	/* raw_syscalls:sys_enter */
					  *sys_exit;	/* raw_syscalls:sys_exit */
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;
	u64			base_time;
	FILE			*output;	/* where formatted events are printed */
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* syscall name qualifier list */
	struct {
		size_t		nr;
		int		*entries;	/* syscall ids resolved from ev_qualifier */
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;
	}			filter_pids;
	double			duration_filter;
	double			runtime_ms;
	struct {
		/* tool self-stats: how filenames got resolved */
		u64		vfs_getname,
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* ev_qualifier is an exclude list */
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;
	int			trace_pgfaults;	/* TRACE_PFMAJ | TRACE_PFMIN bits */
	int			open_id;
};
118
/*
 * Accessor for one tracepoint payload field: 'offset' locates it inside
 * sample->raw_data, and exactly one of the union members is set depending
 * on whether the field is an integer or raw pointer/blob.
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
126
/*
 * Generate tp_field__u{8,16,32,64}(): read an unsigned integer of the
 * given width from the sample's raw data at the field's offset.  memcpy()
 * is used so unaligned raw_data offsets are safe on all arches.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value; \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);
139
/*
 * Same as TP_UINT_FIELD(), but byte-swaps the value — used when the
 * perf.data file was recorded on a host of the opposite endianness
 * (no 8-bit variant: a single byte needs no swap).
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
151
152 static int tp_field__init_uint(struct tp_field *field,
153 struct format_field *format_field,
154 bool needs_swap)
155 {
156 field->offset = format_field->offset;
157
158 switch (format_field->size) {
159 case 1:
160 field->integer = tp_field__u8;
161 break;
162 case 2:
163 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
164 break;
165 case 4:
166 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
167 break;
168 case 8:
169 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
170 break;
171 default:
172 return -1;
173 }
174
175 return 0;
176 }
177
/* Return a pointer straight into the sample payload at the field's offset. */
static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}

/* Bind a pointer/blob tracepoint field; cannot fail, returns 0 for symmetry. */
static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
{
	field->offset = format_field->offset;
	field->pointer = tp_field__ptr;
	return 0;
}
189
/*
 * Per-evsel private data for the raw_syscalls tracepoints: a given evsel
 * is either the enter hook (uses 'args') or the exit hook (uses 'ret'),
 * never both, hence the union.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
196
/*
 * Look up the named field in the evsel's tracepoint format and bind an
 * integer reader to it.  Returns -1 if the field does not exist.
 */
static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
					  struct tp_field *field,
					  const char *name)
{
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_uint(field, format_field, evsel->needs_swap);
}

/* Bind the syscall_tp member with the same name as the tracepoint field. */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
212
/*
 * Look up the named field in the evsel's tracepoint format and bind a
 * raw-pointer reader to it.  Returns -1 if the field does not exist.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_ptr(field, format_field);
}

/* Bind the syscall_tp member with the same name as the tracepoint field. */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
228
/* Free the evsel's private syscall_tp data, then the evsel itself. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
234
235 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
236 {
237 evsel->priv = malloc(sizeof(struct syscall_tp));
238 if (evsel->priv != NULL) {
239 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
240 goto out_delete;
241
242 evsel->handler = handler;
243 return 0;
244 }
245
246 return -ENOMEM;
247
248 out_delete:
249 zfree(&evsel->priv);
250 return -ENOENT;
251 }
252
/*
 * Create an evsel for raw_syscalls:sys_{enter,exit} ('direction' selects
 * which) and initialize its syscall_tp private data.  Returns NULL on any
 * failure; IS_ERR() is used because perf_evsel__newtp() encodes errors in
 * the pointer.
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
	if (IS_ERR(evsel))
		evsel = perf_evsel__newtp("syscalls", direction);

	if (IS_ERR(evsel))
		return NULL;

	if (perf_evsel__init_syscall_tp(evsel, handler))
		goto out_delete;

	return evsel;

out_delete:
	perf_evsel__delete_priv(evsel);
	return NULL;
}
273
/* Read the named syscall_tp integer field out of a sample via evsel->priv. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* Same, but return a raw pointer into the sample payload. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
281
/*
 * A value -> name lookup table for syscall argument beautifying.
 * 'offset' is subtracted from the value before indexing, for tables
 * whose first entry does not correspond to value 0.
 */
struct strarray {
	int	    offset;
	int	    nr_entries;
	const char **entries;
};

#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

/* Variant for tables starting at a non-zero value, e.g. the tioctls. */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.offset	    = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
298
299 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
300 const char *intfmt,
301 struct syscall_arg *arg)
302 {
303 struct strarray *sa = arg->parm;
304 int idx = arg->val - sa->offset;
305
306 if (idx < 0 || idx >= sa->nr_entries)
307 return scnprintf(bf, size, intfmt, arg->val);
308
309 return scnprintf(bf, size, "%s", sa->entries[idx]);
310 }
311
/* Default strarray printer: out-of-range values fall back to decimal. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
319
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 * gets rewritten to support all arches.
 */
/* Like SCA_STRARRAY, but unknown values print in hex (used for ioctl cmds). */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
333
/* Forward declaration: the definition needs thread__fd_path(), further down. */
static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
					struct syscall_arg *arg);

#define SCA_FD syscall_arg__scnprintf_fd

/* Fallback for libc headers lacking the *at() "current dir" sentinel. */
#ifndef AT_FDCWD
#define AT_FDCWD	-100
#endif
342
343 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
344 struct syscall_arg *arg)
345 {
346 int fd = arg->val;
347
348 if (fd == AT_FDCWD)
349 return scnprintf(bf, size, "CWD");
350
351 return syscall_arg__scnprintf_fd(bf, size, arg);
352 }
353
354 #define SCA_FDAT syscall_arg__scnprintf_fd_at
355
/* Forward declaration: needs thread_trace paths, defined further down. */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd

/* Print an argument as a hex constant (addresses, opaque values). */
static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
					 struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

#define SCA_HEX syscall_arg__scnprintf_hex

/* Print an argument as a plain signed decimal integer. */
static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
					 struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

#define SCA_INT syscall_arg__scnprintf_int
376
/*
 * Symbolic-name tables consumed by the SCA_STRARRAY beautifier.  Entries
 * are indexed by the raw argument value (minus the strarray offset), so
 * the order must follow the kernel's numbering for each constant set.
 */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* EPOLL_CTL_ADD starts at 1, hence the offset. */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* SEEK_DATA/SEEK_HOLE only exist on newer systems, hence the guards. */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
	"F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
	"F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
442
/*
 * Decode an access(2) mode: F_OK (0) prints "F", otherwise "R", "W" and
 * "X" are concatenated for the bits present, with any leftover bits
 * appended in hex.
 */
static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F");
#define	P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	/* Bits we don't know a name for. */
	if (mode)
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

	return printed;
}

#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
469
/* Forward declaration: definition depends on vfs_getname machinery below. */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename

/*
 * Decode pipe2(2) flags: only O_CLOEXEC and O_NONBLOCK are valid there;
 * names are joined with '|' and unknown leftover bits printed in hex.
 */
static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
497
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
#define TCGETS		0x5401

/*
 * Terminal ioctl command names, indexed from TCGETS (0x5401) via the
 * strarray offset below; designated initializers ([0x27], [0x50], [0x60])
 * skip gaps in the command number space.
 */
static const char *tioctls[] = {
	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
	[0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
	"TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
};

static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
#endif /* defined(__i386__) || defined(__x86_64__) */

/* Fallbacks for libc headers that predate getrandom(2). */
#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM	0x0002
#endif
531
532 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
533 struct syscall_arg *arg)
534 {
535 int printed = 0, flags = arg->val;
536
537 #define P_FLAG(n) \
538 if (flags & GRND_##n) { \
539 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
540 flags &= ~GRND_##n; \
541 }
542
543 P_FLAG(RANDOM);
544 P_FLAG(NONBLOCK);
545 #undef P_FLAG
546
547 if (flags)
548 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
549
550 return printed;
551 }
552
553 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
554
/*
 * Convenience initializer for syscall_fmts[]: wire argument 'arg' to the
 * SCA_STRARRAY printer with strarray__<array> as its lookup table.
 */
#define STRARRAY(arg, name, array) \
	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
	  .arg_parm	 = { [arg] = &strarray__##array, }
558
559 #include "trace/beauty/eventfd.c"
560 #include "trace/beauty/flock.c"
561 #include "trace/beauty/futex_op.c"
562 #include "trace/beauty/mmap.c"
563 #include "trace/beauty/mode_t.c"
564 #include "trace/beauty/msg_flags.c"
565 #include "trace/beauty/open_flags.c"
566 #include "trace/beauty/perf_event_open.c"
567 #include "trace/beauty/pid.c"
568 #include "trace/beauty/sched_policy.c"
569 #include "trace/beauty/seccomp.c"
570 #include "trace/beauty/signum.c"
571 #include "trace/beauty/socket_type.c"
572 #include "trace/beauty/waitid_options.c"
573
/*
 * Per-syscall formatting hints: optional alias, per-argument pretty
 * printers (and their lookup-table parms), and flags steering return
 * value formatting (errno message, pid, timeout, hex).
 *
 * NOTE: keep entries sorted by .name — syscall_fmt__find() does a
 * bsearch() over this array.
 */
static struct syscall_fmt {
	const char *name;
	const char *alias;
	size_t	   (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
	void	   *arg_parm[6];
	bool	   errmsg;
	bool	   errpid;
	bool	   timeout;
	bool	   hexret;
} syscall_fmts[] = {
	{ .name	    = "access",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_ACCMODE,  /* mode */ }, },
	{ .name	    = "arch_prctl", .errmsg = true, .alias = "prctl", },
	{ .name	    = "bpf",	    .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
	{ .name	    = "brk",	    .hexret = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
	{ .name	    = "chdir",	    .errmsg = true, },
	{ .name	    = "chmod",	    .errmsg = true, },
	{ .name	    = "chroot",	    .errmsg = true, },
	{ .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
	{ .name	    = "clone",	    .errpid = true, },
	{ .name	    = "close",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
	{ .name	    = "connect",    .errmsg = true, },
	{ .name	    = "creat",	    .errmsg = true, },
	{ .name	    = "dup",	    .errmsg = true, },
	{ .name	    = "dup2",	    .errmsg = true, },
	{ .name	    = "dup3",	    .errmsg = true, },
	{ .name	    = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
	{ .name	    = "eventfd2",   .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
	{ .name	    = "faccessat",  .errmsg = true, },
	{ .name	    = "fadvise64",  .errmsg = true, },
	{ .name	    = "fallocate",  .errmsg = true, },
	{ .name	    = "fchdir",	    .errmsg = true, },
	{ .name	    = "fchmod",	    .errmsg = true, },
	{ .name	    = "fchmodat",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "fchown",	    .errmsg = true, },
	{ .name	    = "fchownat",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "fcntl",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
	  .arg_parm	 = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
	{ .name	    = "fdatasync",  .errmsg = true, },
	{ .name	    = "flock",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
	{ .name	    = "fsetxattr",  .errmsg = true, },
	{ .name	    = "fstat",	    .errmsg = true, .alias = "newfstat", },
	{ .name	    = "fstatat",    .errmsg = true, .alias = "newfstatat", },
	{ .name	    = "fstatfs",    .errmsg = true, },
	{ .name	    = "fsync",    .errmsg = true, },
	{ .name	    = "ftruncate", .errmsg = true, },
	{ .name	    = "futex",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
	{ .name	    = "futimesat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "getdents",   .errmsg = true, },
	{ .name	    = "getdents64", .errmsg = true, },
	{ .name	    = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
	{ .name	    = "getpid",	    .errpid = true, },
	{ .name	    = "getpgid",    .errpid = true, },
	{ .name	    = "getppid",    .errpid = true, },
	{ .name	    = "getrandom",  .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
	{ .name	    = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
	{ .name	    = "getxattr",   .errmsg = true, },
	{ .name	    = "inotify_add_watch",	    .errmsg = true, },
	/*
	 * ioctl: the cmd beautifier (tioctls table) only exists on x86 for
	 * now, hence the preprocessor split inside the initializer.
	 */
	{ .name	    = "ioctl",	    .errmsg = true,
	  .arg_scnprintf = {
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
			     [1] = SCA_STRHEXARRAY, /* cmd */
			     [2] = SCA_HEX, /* arg */ },
	  .arg_parm	 = { [1] = &strarray__tioctls, /* cmd */ }, },
#else
			     [2] = SCA_HEX, /* arg */ }, },
#endif
	{ .name	    = "keyctl",	    .errmsg = true, STRARRAY(0, option, keyctl_options), },
	{ .name	    = "kill",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "lchown",    .errmsg = true, },
	{ .name	    = "lgetxattr",  .errmsg = true, },
	{ .name	    = "linkat",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "listxattr",  .errmsg = true, },
	{ .name	    = "llistxattr", .errmsg = true, },
	{ .name	    = "lremovexattr",  .errmsg = true, },
	{ .name	    = "lseek",	    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
	  .arg_parm	 = { [2] = &strarray__whences, /* whence */ }, },
	{ .name	    = "lsetxattr",  .errmsg = true, },
	{ .name	    = "lstat",	    .errmsg = true, .alias = "newlstat", },
	{ .name	    = "lsxattr",    .errmsg = true, },
	{ .name     = "madvise",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX,	 /* start */
			     [2] = SCA_MADV_BHV, /* behavior */ }, },
	{ .name	    = "mkdir",    .errmsg = true, },
	{ .name	    = "mkdirat",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "mknod",      .errmsg = true, },
	{ .name	    = "mknodat",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "mlock",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name	    = "mlockall",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name	    = "mmap",	    .hexret = true,
	  .arg_scnprintf = { [0] = SCA_HEX,	  /* addr */
			     [2] = SCA_MMAP_PROT, /* prot */
			     [3] = SCA_MMAP_FLAGS, /* flags */ }, },
	{ .name	    = "mprotect",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* start */
			     [2] = SCA_MMAP_PROT, /* prot */ }, },
	{ .name	    = "mq_unlink", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
	{ .name	    = "mremap",	    .hexret = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
			     [3] = SCA_MREMAP_FLAGS, /* flags */
			     [4] = SCA_HEX, /* new_addr */ }, },
	{ .name	    = "munlock",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name	    = "munmap",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name	    = "name_to_handle_at", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "newfstatat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "open",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
	{ .name	    = "open_by_handle_at", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
	{ .name	    = "openat",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
	{ .name	    = "perf_event_open", .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_INT, /* cpu */
			     [3] = SCA_FD,  /* group_fd */
			     [4] = SCA_PERF_FLAGS, /* flags */ }, },
	{ .name	    = "pipe2",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
	{ .name	    = "poll",	    .errmsg = true, .timeout = true, },
	{ .name	    = "ppoll",	    .errmsg = true, .timeout = true, },
	{ .name	    = "pread",	    .errmsg = true, .alias = "pread64", },
	{ .name	    = "preadv",	    .errmsg = true, .alias = "pread", },
	{ .name	    = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
	{ .name	    = "pwrite",	    .errmsg = true, .alias = "pwrite64", },
	{ .name	    = "pwritev",    .errmsg = true, },
	{ .name	    = "read",	    .errmsg = true, },
	{ .name	    = "readlink",   .errmsg = true, },
	{ .name	    = "readlinkat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "readv",	    .errmsg = true, },
	{ .name	    = "recvfrom",   .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "recvmmsg",   .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "recvmsg",    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "removexattr", .errmsg = true, },
	{ .name	    = "renameat",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "rmdir",    .errmsg = true, },
	{ .name	    = "rt_sigaction", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
	{ .name	    = "rt_sigqueueinfo", .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "rt_tgsigqueueinfo", .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "sched_getattr",	      .errmsg = true, },
	{ .name	    = "sched_setattr",	      .errmsg = true, },
	{ .name	    = "sched_setscheduler",   .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
	{ .name	    = "seccomp", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
			     [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
	{ .name	    = "select",	    .errmsg = true, .timeout = true, },
	{ .name	    = "sendmmsg",    .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "sendmsg",    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "sendto",	    .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "set_tid_address", .errpid = true, },
	{ .name	    = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
	{ .name	    = "setpgid",    .errmsg = true, },
	{ .name	    = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
	{ .name	    = "setxattr",   .errmsg = true, },
	{ .name	    = "shutdown",   .errmsg = true, },
	{ .name	    = "socket",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
			     [1] = SCA_SK_TYPE, /* type */ },
	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
	{ .name	    = "socketpair", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
			     [1] = SCA_SK_TYPE, /* type */ },
	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
	{ .name	    = "stat",	    .errmsg = true, .alias = "newstat", },
	{ .name	    = "statfs",	    .errmsg = true, },
	{ .name	    = "statx",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* flags */
			     [2] = SCA_STATX_FLAGS, /* flags */
			     [3] = SCA_STATX_MASK, /* mask */ }, },
	{ .name	    = "swapoff",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
	{ .name	    = "swapon",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
	{ .name	    = "symlinkat",  .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "tgkill",	    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "tkill",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "truncate",   .errmsg = true, },
	{ .name	    = "uname",	    .errmsg = true, .alias = "newuname", },
	{ .name	    = "unlinkat",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "utime",  .errmsg = true, },
	{ .name	    = "utimensat",  .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
	{ .name	    = "utimes",  .errmsg = true, },
	{ .name	    = "vmsplice",  .errmsg = true, },
	{ .name	    = "wait4",	    .errpid = true,
	  .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
	{ .name	    = "waitid",	    .errpid = true,
	  .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
	{ .name	    = "write",	    .errmsg = true, },
	{ .name	    = "writev",	    .errmsg = true, },
};
807
/* bsearch() comparator: 'name' is the key, 'fmtp' a syscall_fmts[] entry. */
static int syscall_fmt__cmp(const void *name, const void *fmtp)
{
	const struct syscall_fmt *fmt = fmtp;
	return strcmp(name, fmt->name);
}

/*
 * Look up a syscall's formatting hints by name.  Relies on syscall_fmts[]
 * being sorted by .name; returns NULL when there is no entry.
 */
static struct syscall_fmt *syscall_fmt__find(const char *name)
{
	const int nmemb = ARRAY_SIZE(syscall_fmts);
	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
}
819
/*
 * Everything needed to pretty-print one syscall: its tracepoint format,
 * argument fields, formatting hints (fmt) and the resolved per-argument
 * printer/parm tables.
 */
struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;
	const char	    *name;
	bool		    is_exit;	/* e.g. exit_group: never returns, print at enter */
	struct syscall_fmt  *fmt;
	size_t		    (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void		    **arg_parm;
};
830
/*
 * We need to have this 'calculated' boolean because in some cases we really
 * don't know what is the duration of a syscall, for instance, when we start
 * a session and some threads are waiting for a syscall to finish, say 'poll',
 * in which case all we can do is to print "( ? ) for duration and for the
 * start timestamp.
 */
static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
{
	double duration = (double)t / NSEC_PER_MSEC;
	size_t printed = fprintf(fp, "(");

	if (!calculated)
		printed += fprintf(fp, " ? ");
	else if (duration >= 1.0)		/* slow: >= 1ms, flag in red */
		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
	else if (duration >= 0.01)		/* noticeable: >= 10us, yellow */
		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
	else
		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
	return printed + fprintf(fp, "): ");
}
853
/**
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 */
/* Per-thread state, hung off struct thread via thread__{set_,}priv(). */
struct thread_trace {
	u64		  entry_time;		/* timestamp of pending sys_enter */
	bool		  entry_pending;	/* enter printed, waiting for exit */
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;		/* major/minor page fault counts */
	char		  *entry_str;		/* staged enter line (trace__entry_str_size) */
	double		  runtime_ms;
	struct {
		unsigned long ptr;
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {
		int	  max;			/* -1 until the table is first grown */
		char	  **table;		/* fd -> pathname cache */
	} paths;

	struct intlist *syscall_stats;
};
880
881 static struct thread_trace *thread_trace__new(void)
882 {
883 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
884
885 if (ttrace)
886 ttrace->paths.max = -1;
887
888 ttrace->syscall_stats = intlist__new(NULL);
889
890 return ttrace;
891 }
892
/*
 * Return the thread's trace state, lazily allocating it on first use and
 * bumping its event counter.  On NULL thread or allocation failure a
 * warning is printed to 'fp' and NULL returned (sample gets dropped).
 */
static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
{
	struct thread_trace *ttrace;

	if (thread == NULL)
		goto fail;

	if (thread__priv(thread) == NULL)
		thread__set_priv(thread, thread_trace__new());

	if (thread__priv(thread) == NULL)
		goto fail;

	ttrace = thread__priv(thread);
	++ttrace->nr_events;

	return ttrace;
fail:
	color_fprintf(fp, PERF_COLOR_RED,
		      "WARNING: not enough memory, dropping samples!\n");
	return NULL;
}
915
/* Bits for trace->trace_pgfaults: record major and/or minor page faults. */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* Size of the per-thread buffer (ttrace->entry_str) staging the enter line. */
static const size_t trace__entry_str_size = 2048;
920
/*
 * Cache 'pathname' for 'fd' in the thread's fd->path table, growing the
 * table as needed.  Returns 0 on success, -1 on allocation failure (the
 * old table stays valid in that case).
 *
 * NOTE(review): an existing string at table[fd] is overwritten without
 * being freed — looks like a small leak on fd reuse; confirm intent.
 */
static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (fd > ttrace->paths.max) {
		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));

		if (npath == NULL)
			return -1;

		/* Zero only the newly added slots (or all of them on first grow). */
		if (ttrace->paths.max != -1) {
			memset(npath + ttrace->paths.max + 1, 0,
			       (fd - ttrace->paths.max) * sizeof(char *));
		} else {
			memset(npath, 0, (fd + 1) * sizeof(char *));
		}

		ttrace->paths.table = npath;
		ttrace->paths.max   = fd;
	}

	ttrace->paths.table[fd] = strdup(pathname);

	return ttrace->paths.table[fd] != NULL ? 0 : -1;
}
946
/*
 * Resolve 'fd' to a pathname by reading the /proc symlink and cache the
 * result via trace__set_fd_pathname().  Returns 0 on success, -1 on error.
 */
static int thread__read_fd_path(struct thread *thread, int fd)
{
	char linkname[PATH_MAX], pathname[PATH_MAX];
	struct stat st;
	int ret;

	/* Main thread fds live under /proc/pid/fd, others under task/tid. */
	if (thread->pid_ == thread->tid) {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread->pid_, fd);
	} else {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
	}

	/* lstat() gives the link target length; bail if it won't fit. */
	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
		return -1;

	ret = readlink(linkname, pathname, sizeof(pathname));

	if (ret < 0 || ret > st.st_size)
		return -1;

	/* readlink() does not NUL-terminate. */
	pathname[ret] = '\0';
	return trace__set_fd_pathname(thread, fd, pathname);
}
972
973 static const char *thread__fd_path(struct thread *thread, int fd,
974 struct trace *trace)
975 {
976 struct thread_trace *ttrace = thread__priv(thread);
977
978 if (ttrace == NULL)
979 return NULL;
980
981 if (fd < 0)
982 return NULL;
983
984 if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
985 if (!trace->live)
986 return NULL;
987 ++trace->stats.proc_getname;
988 if (thread__read_fd_path(thread, fd))
989 return NULL;
990 }
991
992 return ttrace->paths.table[fd];
993 }
994
995 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
996 struct syscall_arg *arg)
997 {
998 int fd = arg->val;
999 size_t printed = scnprintf(bf, size, "%d", fd);
1000 const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1001
1002 if (path)
1003 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1004
1005 return printed;
1006 }
1007
1008 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1009 struct syscall_arg *arg)
1010 {
1011 int fd = arg->val;
1012 size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1013 struct thread_trace *ttrace = thread__priv(arg->thread);
1014
1015 if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1016 zfree(&ttrace->paths.table[fd]);
1017
1018 return printed;
1019 }
1020
1021 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1022 unsigned long ptr)
1023 {
1024 struct thread_trace *ttrace = thread__priv(thread);
1025
1026 ttrace->filename.ptr = ptr;
1027 ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1028 }
1029
1030 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1031 struct syscall_arg *arg)
1032 {
1033 unsigned long ptr = arg->val;
1034
1035 if (!arg->trace->vfs_getname)
1036 return scnprintf(bf, size, "%#x", ptr);
1037
1038 thread__set_filename_pos(arg->thread, bf, ptr);
1039 return 0;
1040 }
1041
1042 static bool trace__filter_duration(struct trace *trace, double t)
1043 {
1044 return t < (trace->duration_filter * NSEC_PER_MSEC);
1045 }
1046
1047 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1048 {
1049 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1050
1051 return fprintf(fp, "%10.3f ", ts);
1052 }
1053
1054 /*
1055 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1056 * using ttrace->entry_time for a thread that receives a sys_exit without
1057 * first having received a sys_enter ("poll" issued before tracing session
1058 * starts, lost sys_enter exit due to ring buffer overflow).
1059 */
1060 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1061 {
1062 if (tstamp > 0)
1063 return __trace__fprintf_tstamp(trace, tstamp, fp);
1064
1065 return fprintf(fp, " ? ");
1066 }
1067
/*
 * Termination flags written from the signal handler.  Only objects of
 * type volatile sig_atomic_t may portably be modified from a handler
 * (CERT SIG31-C); plain bool gives no such guarantee.
 */
static volatile sig_atomic_t done;
static volatile sig_atomic_t interrupted;

static void sig_handler(int sig)
{
	done = 1;
	/* SIGINT (Ctrl-C) is distinguished so the summary can note it. */
	interrupted = sig == SIGINT;
}
1076
1077 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1078 u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1079 {
1080 size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1081 printed += fprintf_duration(duration, duration_calculated, fp);
1082
1083 if (trace->multiple_threads) {
1084 if (trace->show_comm)
1085 printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1086 printed += fprintf(fp, "%d ", thread->tid);
1087 }
1088
1089 return printed;
1090 }
1091
1092 static int trace__process_event(struct trace *trace, struct machine *machine,
1093 union perf_event *event, struct perf_sample *sample)
1094 {
1095 int ret = 0;
1096
1097 switch (event->header.type) {
1098 case PERF_RECORD_LOST:
1099 color_fprintf(trace->output, PERF_COLOR_RED,
1100 "LOST %" PRIu64 " events!\n", event->lost.lost);
1101 ret = machine__process_lost_event(machine, event, sample);
1102 break;
1103 default:
1104 ret = machine__process_event(machine, event, sample);
1105 break;
1106 }
1107
1108 return ret;
1109 }
1110
1111 static int trace__tool_process(struct perf_tool *tool,
1112 union perf_event *event,
1113 struct perf_sample *sample,
1114 struct machine *machine)
1115 {
1116 struct trace *trace = container_of(tool, struct trace, tool);
1117 return trace__process_event(trace, machine, event, sample);
1118 }
1119
1120 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1121 {
1122 struct machine *machine = vmachine;
1123
1124 if (machine->kptr_restrict_warned)
1125 return NULL;
1126
1127 if (symbol_conf.kptr_restrict) {
1128 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1129 "Check /proc/sys/kernel/kptr_restrict.\n\n"
1130 "Kernel samples will not be resolved.\n");
1131 machine->kptr_restrict_warned = true;
1132 return NULL;
1133 }
1134
1135 return machine__resolve_kernel_addr(vmachine, addrp, modp);
1136 }
1137
1138 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1139 {
1140 int err = symbol__init(NULL);
1141
1142 if (err)
1143 return err;
1144
1145 trace->host = machine__new_host();
1146 if (trace->host == NULL)
1147 return -ENOMEM;
1148
1149 if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
1150 return -errno;
1151
1152 err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1153 evlist->threads, trace__tool_process, false,
1154 trace->opts.proc_map_timeout);
1155 if (err)
1156 symbol__exit();
1157
1158 return err;
1159 }
1160
/*
 * Choose a pretty-printer for each argument of syscall 'sc'.
 *
 * Priority order: an explicit formatter from the per-syscall fmt table,
 * then filename-looking "const char *" fields, then raw pointers as hex,
 * then pid_t/umode_t, and finally integer fields whose name ends in "fd".
 * Returns 0 on success, -1 if the formatter array cannot be allocated.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
	if (sc->arg_scnprintf == NULL)
		return -1;

	if (sc->fmt)
		sc->arg_parm = sc->fmt->arg_parm;

	for (field = sc->args; field; field = field->next) {
		/* A hand-written formatter always wins over the heuristics below. */
		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
		else if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_scnprintf[idx] = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_scnprintf[idx] = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_scnprintf[idx] = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_scnprintf[idx] = SCA_FD;
		}
		++idx;
	}

	return 0;
}
1206
/*
 * Lazily fill trace->syscalls.table[id] with everything needed to pretty
 * print syscall 'id': its name, optional fmt-table entry and the
 * sys_enter tracepoint format (used to enumerate the arguments).
 *
 * The table is grown on demand with realloc(); newly exposed slots are
 * zeroed so 'name == NULL' identifies not-yet-read entries.  Returns 0 on
 * success, -1 on unknown id, allocation failure or an unreadable
 * tracepoint format.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			/* Growing an existing table: zero only the new slots. */
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			/* First allocation (max == -1): zero everything. */
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Some syscalls are traced under an alias, retry with it. */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	sc->nr_args = sc->tp_format->format.nr_fields;
	/*
	 * We need to check and discard the first variable '__syscall_nr'
	 * or 'nr' that mean the syscall number. It is needless here.
	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	/* exit()/exit_group() never return, so no sys_exit will pair with them. */
	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}
1265
/*
 * Translate the syscall name list in trace->ev_qualifier into an array of
 * syscall ids (trace->ev_qualifier_ids).  All invalid names are collected
 * into a single error message; on any error the ids array is freed and
 * -EINVAL returned.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		      trace->output);
		err = -EINVAL;
		goto out;
	}

	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc);

		if (id < 0) {
			/* First bad name opens the message, later ones are appended. */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}

		trace->ev_qualifier_ids.entries[i++] = id;
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1311
/*
 * args is to be interpreted as a series of longs but we need to handle
 * 8-byte unaligned accesses. args points to raw_data within the event
 * and raw_data is guaranteed to be 8-byte unaligned because it is
 * preceded by raw_size which is a u32. So we need to copy args to a temp
 * variable to read it. Most notably this avoids extended load instructions
 * on unaligned addresses
 */

/*
 * Format the syscall's arguments into bf using the per-argument
 * beautifiers chosen by syscall__set_arg_fmts().  When no tracepoint
 * format is available (sc->tp_format is an error pointer) fall back to
 * dumping the six raw longs.  Returns the number of characters written.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned char *p;
	unsigned long val;

	if (sc->args != NULL) {
		struct format_field *field;
		u8 bit = 1;
		struct syscall_arg arg = {
			.idx	= 0,
			.mask	= 0,
			.trace  = trace,
			.thread = thread,
		};

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/* A formatter for an earlier arg may consume later ones. */
			if (arg.mask & bit)
				continue;

			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * arg.idx;
			memcpy(&val, p, sizeof(val));

			/*
			 * Suppress this argument if its value is zero and
			 * we don't have a string associated in an
			 * strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_scnprintf &&
			      sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
			      sc->arg_parm[arg.idx]))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
				arg.val = val;
				if (sc->arg_parm)
					arg.parm = sc->arg_parm[arg.idx];
				printed += sc->arg_scnprintf[arg.idx](bf + printed,
								      size - printed, &arg);
			} else {
				printed += scnprintf(bf + printed, size - printed,
						     "%ld", val);
			}
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		int i = 0;

		while (i < 6) {
			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * i;
			memcpy(&val, p, sizeof(val));
			printed += scnprintf(bf + printed, size - printed,
					     "%sarg%d: %ld",
					     printed ? ", " : "", i, val);
			++i;
		}
	}

	return printed;
}
1393
1394 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1395 union perf_event *event,
1396 struct perf_sample *sample);
1397
/*
 * Return the struct syscall for 'id', reading its tracepoint info on
 * first use (see trace__read_syscall_info).  Returns NULL for negative
 * or unreadable ids, with diagnostics at the appropriate verbosity.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	/* Grow/fill the table on first sight of this syscall id. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	/* Re-check: trace__read_syscall_info() may have failed to set the name. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1440
1441 static void thread__update_stats(struct thread_trace *ttrace,
1442 int id, struct perf_sample *sample)
1443 {
1444 struct int_node *inode;
1445 struct stats *stats;
1446 u64 duration = 0;
1447
1448 inode = intlist__findnew(ttrace->syscall_stats, id);
1449 if (inode == NULL)
1450 return;
1451
1452 stats = inode->priv;
1453 if (stats == NULL) {
1454 stats = malloc(sizeof(struct stats));
1455 if (stats == NULL)
1456 return;
1457 init_stats(stats);
1458 inode->priv = stats;
1459 }
1460
1461 if (ttrace->entry_time && sample->time > ttrace->entry_time)
1462 duration = sample->time - ttrace->entry_time;
1463
1464 update_stats(stats, duration);
1465 }
1466
1467 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1468 {
1469 struct thread_trace *ttrace;
1470 u64 duration;
1471 size_t printed;
1472
1473 if (trace->current == NULL)
1474 return 0;
1475
1476 ttrace = thread__priv(trace->current);
1477
1478 if (!ttrace->entry_pending)
1479 return 0;
1480
1481 duration = sample->time - ttrace->entry_time;
1482
1483 printed = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
1484 printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1485 ttrace->entry_pending = false;
1486
1487 return printed;
1488 }
1489
/*
 * raw_syscalls:sys_enter handler: format "name(args" into the thread's
 * entry_str.  For normal syscalls printing is deferred until the matching
 * sys_exit arrives (so the duration can be shown); exit/exit_group never
 * return and are printed immediately.  Returns 0, or -1 on failure.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the buffer the entry line is composed in. */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	/* Flush another thread's pending line unless output is being filtered. */
	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace, sample);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* exit()/exit_group() have no sys_exit: print right away. */
		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Track the last thread seen, for trace__printf_interrupted_entry(). */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1548
1549 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1550 struct perf_sample *sample,
1551 struct callchain_cursor *cursor)
1552 {
1553 struct addr_location al;
1554
1555 if (machine__resolve(trace->host, &al, sample) < 0 ||
1556 thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1557 return -1;
1558
1559 return 0;
1560 }
1561
1562 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1563 {
1564 /* TODO: user-configurable print_opts */
1565 const unsigned int print_opts = EVSEL__PRINT_SYM |
1566 EVSEL__PRINT_DSO |
1567 EVSEL__PRINT_UNKNOWN_AS_ADDR;
1568
1569 return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1570 }
1571
/*
 * raw_syscalls:sys_exit handler: pair with the pending sys_enter for this
 * thread, compute the duration and print the completed "name(args) = ret"
 * line, beautifying the return value (errno name+message, child comm for
 * pid-returning calls, hex, "Timeout").  Also feeds the summary stats and
 * optional callchain printing.  Returns 0, or -1 on failure.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/* Successful open(): cache the getname'd path for the new fd. */
	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	/* entry_time == 0 means the sys_enter was never seen (see tstamp note). */
	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Resolved but shallower than --min-stack: skip it. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only)
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* The sys_enter was lost or filtered: mark as a continuation. */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	if (sc->fmt == NULL) {
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = audit_errno_to_name(-ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* Return value is a pid: try to show the child's comm too. */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1671
/*
 * probe:vfs_getname handler: capture the filename the kernel resolved for
 * the in-flight syscall and splice it into the thread's pending entry_str
 * at the position recorded by thread__set_filename_pos().  The name is
 * also stashed in ttrace->filename.name so a subsequent successful open()
 * can cache it as the fd's path (see trace__sys_exit).
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* Grow the per-thread name buffer to fit this filename. */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	/* No pointer recorded: no entry line is waiting for this name. */
	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* Keep the tail of overlong names — it is the most informative part. */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Open a gap at the insertion point and copy the name into it. */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
1732
1733 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1734 union perf_event *event __maybe_unused,
1735 struct perf_sample *sample)
1736 {
1737 u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1738 double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1739 struct thread *thread = machine__findnew_thread(trace->host,
1740 sample->pid,
1741 sample->tid);
1742 struct thread_trace *ttrace = thread__trace(thread, trace->output);
1743
1744 if (ttrace == NULL)
1745 goto out_dump;
1746
1747 ttrace->runtime_ms += runtime_ms;
1748 trace->runtime_ms += runtime_ms;
1749 out_put:
1750 thread__put(thread);
1751 return 0;
1752
1753 out_dump:
1754 fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1755 evsel->name,
1756 perf_evsel__strval(evsel, sample, "comm"),
1757 (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1758 runtime,
1759 perf_evsel__intval(evsel, sample, "vruntime"));
1760 goto out_put;
1761 }
1762
1763 static void bpf_output__printer(enum binary_printer_ops op,
1764 unsigned int val, void *extra)
1765 {
1766 FILE *output = extra;
1767 unsigned char ch = (unsigned char)val;
1768
1769 switch (op) {
1770 case BINARY_PRINT_CHAR_DATA:
1771 fprintf(output, "%c", isprint(ch) ? ch : '.');
1772 break;
1773 case BINARY_PRINT_DATA_BEGIN:
1774 case BINARY_PRINT_LINE_BEGIN:
1775 case BINARY_PRINT_ADDR:
1776 case BINARY_PRINT_NUM_DATA:
1777 case BINARY_PRINT_NUM_PAD:
1778 case BINARY_PRINT_SEP:
1779 case BINARY_PRINT_CHAR_PAD:
1780 case BINARY_PRINT_LINE_END:
1781 case BINARY_PRINT_DATA_END:
1782 default:
1783 break;
1784 }
1785 }
1786
1787 static void bpf_output__fprintf(struct trace *trace,
1788 struct perf_sample *sample)
1789 {
1790 print_binary(sample->raw_data, sample->raw_size, 8,
1791 bpf_output__printer, trace->output);
1792 }
1793
/*
 * Generic handler for non-syscall events (--event): print the timestamp,
 * the event name and either the BPF output payload or the tracepoint
 * fields, plus an optional callchain.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	int callchain_ret = 0;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Resolved but shallower than --min-stack: skip it. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	trace__printf_interrupted_entry(trace, sample);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* Keep columns aligned with the syscall lines' duration field. */
	if (trace->trace_syscalls)
		fprintf(trace->output, "( ): ");

	fprintf(trace->output, "%s:", evsel->name);

	if (perf_evsel__is_bpf_output(evsel)) {
		bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		event_format__fprintf(evsel->tp_format, sample->cpu,
				      sample->raw_data, sample->raw_size,
				      trace->output);
	}

	fprintf(trace->output, ")\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	return 0;
}
1834
1835 static void print_location(FILE *f, struct perf_sample *sample,
1836 struct addr_location *al,
1837 bool print_dso, bool print_sym)
1838 {
1839
1840 if ((verbose > 0 || print_dso) && al->map)
1841 fprintf(f, "%s@", al->map->dso->long_name);
1842
1843 if ((verbose > 0 || print_sym) && al->sym)
1844 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1845 al->addr - al->sym->start);
1846 else if (al->map)
1847 fprintf(f, "0x%" PRIx64, al->addr);
1848 else
1849 fprintf(f, "0x%" PRIx64, sample->addr);
1850 }
1851
/*
 * Page-fault software event handler: count the fault in the per-thread
 * stats and, unless summary-only, print "maj/minfault [ip] => addr (tc)"
 * with both locations resolved to dso/symbol where possible.
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Resolved but shallower than --min-stack: skip it. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	/* Account the fault in the per-thread summary counters. */
	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		goto out;

	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
				   sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
				   sample->addr, &al);

	/*
	 * Faulting address not found in a data map: retry as code ('x'),
	 * otherwise mark the map type unknown ('?').
	 */
	if (!al.map) {
		thread__find_addr_location(thread, sample->cpumode,
					   MAP__FUNCTION, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1927
1928 static void trace__set_base_time(struct trace *trace,
1929 struct perf_evsel *evsel,
1930 struct perf_sample *sample)
1931 {
1932 /*
1933 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1934 * and don't use sample->time unconditionally, we may end up having
1935 * some other event in the future without PERF_SAMPLE_TIME for good
1936 * reason, i.e. we may not be interested in its timestamps, just in
1937 * it taking place, picking some piece of information when it
1938 * appears in our event stream (vfs_getname comes to mind).
1939 */
1940 if (trace->base_time == 0 && !trace->full_time &&
1941 (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1942 trace->base_time = sample->time;
1943 }
1944
1945 static int trace__process_sample(struct perf_tool *tool,
1946 union perf_event *event,
1947 struct perf_sample *sample,
1948 struct perf_evsel *evsel,
1949 struct machine *machine __maybe_unused)
1950 {
1951 struct trace *trace = container_of(tool, struct trace, tool);
1952 struct thread *thread;
1953 int err = 0;
1954
1955 tracepoint_handler handler = evsel->handler;
1956
1957 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1958 if (thread && thread__is_filtered(thread))
1959 goto out;
1960
1961 trace__set_base_time(trace, evsel, sample);
1962
1963 if (handler) {
1964 ++trace->nr_events;
1965 handler(trace, evsel, event, sample);
1966 }
1967 out:
1968 thread__put(thread);
1969 return err;
1970 }
1971
1972 static int trace__record(struct trace *trace, int argc, const char **argv)
1973 {
1974 unsigned int rec_argc, i, j;
1975 const char **rec_argv;
1976 const char * const record_args[] = {
1977 "record",
1978 "-R",
1979 "-m", "1024",
1980 "-c", "1",
1981 };
1982
1983 const char * const sc_args[] = { "-e", };
1984 unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
1985 const char * const majpf_args[] = { "-e", "major-faults" };
1986 unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
1987 const char * const minpf_args[] = { "-e", "minor-faults" };
1988 unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
1989
1990 /* +1 is for the event string below */
1991 rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
1992 majpf_args_nr + minpf_args_nr + argc;
1993 rec_argv = calloc(rec_argc + 1, sizeof(char *));
1994
1995 if (rec_argv == NULL)
1996 return -ENOMEM;
1997
1998 j = 0;
1999 for (i = 0; i < ARRAY_SIZE(record_args); i++)
2000 rec_argv[j++] = record_args[i];
2001
2002 if (trace->trace_syscalls) {
2003 for (i = 0; i < sc_args_nr; i++)
2004 rec_argv[j++] = sc_args[i];
2005
2006 /* event string may be different for older kernels - e.g., RHEL6 */
2007 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2008 rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2009 else if (is_valid_tracepoint("syscalls:sys_enter"))
2010 rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2011 else {
2012 pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2013 return -1;
2014 }
2015 }
2016
2017 if (trace->trace_pgfaults & TRACE_PFMAJ)
2018 for (i = 0; i < majpf_args_nr; i++)
2019 rec_argv[j++] = majpf_args[i];
2020
2021 if (trace->trace_pgfaults & TRACE_PFMIN)
2022 for (i = 0; i < minpf_args_nr; i++)
2023 rec_argv[j++] = minpf_args[i];
2024
2025 for (i = 0; i < (unsigned int)argc; i++)
2026 rec_argv[j++] = argv[i];
2027
2028 return cmd_record(j, rec_argv);
2029 }
2030
2031 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2032
2033 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2034 {
2035 struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2036
2037 if (IS_ERR(evsel))
2038 return false;
2039
2040 if (perf_evsel__field(evsel, "pathname") == NULL) {
2041 perf_evsel__delete(evsel);
2042 return false;
2043 }
2044
2045 evsel->handler = trace__vfs_getname;
2046 perf_evlist__add(evlist, evsel);
2047 return true;
2048 }
2049
2050 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2051 {
2052 struct perf_evsel *evsel;
2053 struct perf_event_attr attr = {
2054 .type = PERF_TYPE_SOFTWARE,
2055 .mmap_data = 1,
2056 };
2057
2058 attr.config = config;
2059 attr.sample_period = 1;
2060
2061 event_attr_init(&attr);
2062
2063 evsel = perf_evsel__new(&attr);
2064 if (evsel)
2065 evsel->handler = trace__pgfault;
2066
2067 return evsel;
2068 }
2069
2070 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2071 {
2072 const u32 type = event->header.type;
2073 struct perf_evsel *evsel;
2074
2075 if (type != PERF_RECORD_SAMPLE) {
2076 trace__process_event(trace, trace->host, event, sample);
2077 return;
2078 }
2079
2080 evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2081 if (evsel == NULL) {
2082 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2083 return;
2084 }
2085
2086 trace__set_base_time(trace, evsel, sample);
2087
2088 if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2089 sample->raw_data == NULL) {
2090 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2091 perf_evsel__name(evsel), sample->tid,
2092 sample->cpu, sample->raw_size);
2093 } else {
2094 tracepoint_handler handler = evsel->handler;
2095 handler(trace, evsel, event, sample);
2096 }
2097 }
2098
/*
 * Create the raw_syscalls:sys_enter/sys_exit tracepoint evsels, cache the
 * offsets of their "args"/"ret" payload fields, and add both to the evlist.
 *
 * Returns 0 on success, -1 on any failure; on failure nothing is left in
 * the evlist and the partially set up evsels are freed via the goto ladder.
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	/* Cache the offset of the "args" payload field for fast access. */
	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	/* Likewise for the "ret" field of sys_exit. */
	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit = sys_exit;

	ret = 0;
out:
	return ret;

	/* Error unwinding: free in reverse order of creation. */
out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}
2144
/*
 * Turn the -e syscall list into an "id in (...)" (or negated, for '!')
 * tracepoint filter and append it to both sys_enter and sys_exit, so the
 * kernel discards uninteresting syscalls before they hit the ring buffer.
 *
 * Returns 0 on success, -1 on failure; errno is set to ENOMEM when the
 * filter expression itself could not be allocated.
 */
static int trace__set_ev_qualifier_filter(struct trace *trace)
{
	int err = -1;
	struct perf_evsel *sys_exit;
	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
						trace->ev_qualifier_ids.nr,
						trace->ev_qualifier_ids.entries);

	if (filter == NULL)
		goto out_enomem;

	/* append_tp_filter() returns 0 on success, hence the '!'. */
	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
					  filter)) {
		sys_exit = trace->syscalls.events.sys_exit;
		err = perf_evsel__append_tp_filter(sys_exit, filter);
	}

	free(filter);
out:
	return err;
out_enomem:
	errno = ENOMEM;
	goto out;
}
2169
/*
 * Live tracing main loop.
 *
 * Sets up every requested event source (raw syscall tracepoints,
 * probe:vfs_getname, software page faults, sched_stat_runtime), creates
 * the cpu/thread maps for the target, opens and mmaps the events,
 * optionally forks the workload, then consumes the per-cpu ring buffers
 * until the workload exits or the user interrupts.
 *
 * Returns 0 on success or a negative error; all error paths funnel
 * through out_delete_evlist so the evlist is always torn down.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;	/* leftover argv is the workload */
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, NULL);

	if (callchain_param.enabled) {
		bool use_identifier = false;

		if (trace->syscalls.events.sys_exit) {
			perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
						     &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_maj) {
			perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_min) {
			perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (use_identifier) {
			/*
			 * Now we have evsels with different sample_ids, use
			 * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
			 * from a fixed position in each ring buffer record.
			 *
			 * As of this the changeset introducing this comment, this
			 * isn't strictly needed, as the fields that can come before
			 * PERF_SAMPLE_ID are all used, but we'll probably disable
			 * some of those for things like copying the payload of
			 * pointer syscall arguments, and for vfs_getname we don't
			 * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
			 * here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
			 */
			perf_evlist__set_sample_bit(evlist, IDENTIFIER);
			perf_evlist__reset_sample_bit(evlist, ID);
		}
	}

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		/* Fork the workload stopped; started only after events are live. */
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
		       errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = perf_evlist__set_filter_pid(evlist, getpid());

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	if (trace->opts.initial_delay) {
		/* --delay: let the workload warm up before counting. */
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	/* Show tids in the output when more than one thread may appear. */
	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;
again:
	before = trace->nr_events;

	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;

		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_evlist__mmap_consume(evlist, i);

			if (interrupted)
				goto out_disable;

			if (done && !draining) {
				/* Workload exited: stop producing, drain what's left. */
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
	}

	if (trace->nr_events == before) {
		/* Nothing new this pass: poll (bounded once we are done). */
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	perf_evlist__delete(evlist);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * Unreachable except via goto: a bare block so all the strerror-style
 * error labels can share one errbuf without it living on the hot path's
 * stack frame.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2433
/*
 * Replay mode (-i): process an existing perf.data file instead of live
 * tracing.  Wires up the perf_tool callbacks, resolves the syscall
 * enter/exit tracepoints recorded in the file (falling back from
 * raw_syscalls to the older syscalls group), installs the page-fault
 * handlers and runs the session.  Returns 0 on success, negative on error.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname", trace__vfs_getname, },
	};
	struct perf_data_file file = {
		.path  = input_name,
		.mode  = PERF_DATA_MODE_READ,
		.force = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.mmap2	  = perf_event__process_mmap2;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;
	trace->tool.namespaces	  = perf_event__process_namespaces;

	/* Events in the file may be out of order across cpus; resort by time. */
	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&file, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Any software page-fault event in the file gets the pgfault handler. */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
2532
/* Emit the banner that precedes the per-thread summary; returns chars printed. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2541
/*
 * Re-sort a thread's per-syscall stats intlist by total time spent,
 * descending.  DEFINE_RESORT_RB generates the sorted-copy machinery; the
 * braced body below is the per-node copy callback, where 'nd' walks the
 * source rbtree and 'entry' is the node in the sorted copy.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats *stats;
	double msecs;
	int syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats = stats;
	/* total msecs = call count * average; stats are kept in nanoseconds */
	entry->msecs = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2555
/*
 * Print one thread's strace -c style syscall summary table (calls, total,
 * min/avg/max and relative stddev, all in msec), sorted by total time.
 * Returns the number of characters printed.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	/* Sorted copy of ttrace->syscall_stats, see DEFINE_RESORT_RB above. */
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* Convert from nanoseconds to the printed msec unit. */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* stddev as a percentage of the mean */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2598
/*
 * Print one thread's summary line (comm, tid, event count, share of all
 * events, fault counts, runtime when --sched) followed by its per-syscall
 * stats table.  Returns the number of characters printed; 0 when the
 * thread never accumulated trace state.
 */
static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
{
	size_t printed = 0;
	struct thread_trace *ttrace = thread__priv(thread);
	double ratio;

	if (ttrace == NULL)
		return 0;

	/* This thread's share of all events seen in the session. */
	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;

	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
	printed += fprintf(fp, "%.1f%%", ratio);
	if (ttrace->pfmaj)
		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
	if (ttrace->pfmin)
		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
	if (trace->sched)
		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
	else if (fputc('\n', fp) != EOF)
		++printed;

	printed += thread__dump_stats(ttrace, trace, fp);

	return printed;
}
2626
2627 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2628 {
2629 return ttrace ? ttrace->nr_events : 0;
2630 }
2631
/*
 * Re-sort the machine's threads rbtree by number of traced events
 * (busiest first - note the '<' with the resort macro's ordering).
 * The braced body is the per-node copy callback.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2638
/*
 * Print the end-of-session summary: a header followed by one entry per
 * traced thread, busiest threads first.  Returns the number of characters
 * printed, or 0 if the sorted thread copy could not be allocated.
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;

	if (threads == NULL) {
		fprintf(fp, "%s", "Error sorting output by nr_events!\n");
		return 0;
	}

	resort_rb__for_each_entry(nd, threads)
		printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

	resort_rb__delete(threads);

	return printed;
}
2657
2658 static int trace__set_duration(const struct option *opt, const char *str,
2659 int unset __maybe_unused)
2660 {
2661 struct trace *trace = opt->value;
2662
2663 trace->duration_filter = atof(str);
2664 return 0;
2665 }
2666
2667 static int trace__set_filter_pids(const struct option *opt, const char *str,
2668 int unset __maybe_unused)
2669 {
2670 int ret = -1;
2671 size_t i;
2672 struct trace *trace = opt->value;
2673 /*
2674 * FIXME: introduce a intarray class, plain parse csv and create a
2675 * { int nr, int entries[] } struct...
2676 */
2677 struct intlist *list = intlist__new(str);
2678
2679 if (list == NULL)
2680 return -1;
2681
2682 i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2683 trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2684
2685 if (trace->filter_pids.entries == NULL)
2686 goto out;
2687
2688 trace->filter_pids.entries[0] = getpid();
2689
2690 for (i = 1; i < trace->filter_pids.nr; ++i)
2691 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2692
2693 intlist__delete(list);
2694 ret = 0;
2695 out:
2696 return ret;
2697 }
2698
2699 static int trace__open_output(struct trace *trace, const char *filename)
2700 {
2701 struct stat st;
2702
2703 if (!stat(filename, &st) && st.st_size) {
2704 char oldname[PATH_MAX];
2705
2706 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2707 unlink(oldname);
2708 rename(filename, oldname);
2709 }
2710
2711 trace->output = fopen(filename, "w");
2712
2713 return trace->output == NULL ? -errno : 0;
2714 }
2715
2716 static int parse_pagefaults(const struct option *opt, const char *str,
2717 int unset __maybe_unused)
2718 {
2719 int *trace_pgfaults = opt->value;
2720
2721 if (strcmp(str, "all") == 0)
2722 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2723 else if (strcmp(str, "maj") == 0)
2724 *trace_pgfaults |= TRACE_PFMAJ;
2725 else if (strcmp(str, "min") == 0)
2726 *trace_pgfaults |= TRACE_PFMIN;
2727 else
2728 return -1;
2729
2730 return 0;
2731 }
2732
/* Install the same sample handler on every evsel in the evlist. */
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel)
		evsel->handler = handler;
}
2740
2741 /*
2742 * XXX: Hackish, just splitting the combined -e+--event (syscalls
2743 * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2744 * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2745 *
2746 * It'd be better to introduce a parse_options() variant that would return a
2747 * list with the terms it didn't match to an event...
2748 */
2749 static int trace__parse_events_option(const struct option *opt, const char *str,
2750 int unset __maybe_unused)
2751 {
2752 struct trace *trace = (struct trace *)opt->value;
2753 const char *s = str;
2754 char *sep = NULL, *lists[2] = { NULL, NULL, };
2755 int len = strlen(str), err = -1, list;
2756 char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2757 char group_name[PATH_MAX];
2758
2759 if (strace_groups_dir == NULL)
2760 return -1;
2761
2762 if (*s == '!') {
2763 ++s;
2764 trace->not_ev_qualifier = true;
2765 }
2766
2767 while (1) {
2768 if ((sep = strchr(s, ',')) != NULL)
2769 *sep = '\0';
2770
2771 list = 0;
2772 if (syscalltbl__id(trace->sctbl, s) >= 0) {
2773 list = 1;
2774 } else {
2775 path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2776 if (access(group_name, R_OK) == 0)
2777 list = 1;
2778 }
2779
2780 if (lists[list]) {
2781 sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2782 } else {
2783 lists[list] = malloc(len);
2784 if (lists[list] == NULL)
2785 goto out;
2786 strcpy(lists[list], s);
2787 }
2788
2789 if (!sep)
2790 break;
2791
2792 *sep = ',';
2793 s = sep + 1;
2794 }
2795
2796 if (lists[1] != NULL) {
2797 struct strlist_config slist_config = {
2798 .dirname = strace_groups_dir,
2799 };
2800
2801 trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2802 if (trace->ev_qualifier == NULL) {
2803 fputs("Not enough memory to parse event qualifier", trace->output);
2804 goto out;
2805 }
2806
2807 if (trace__validate_ev_qualifier(trace))
2808 goto out;
2809 }
2810
2811 err = 0;
2812
2813 if (lists[0]) {
2814 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2815 "event selector. use 'perf list' to list available events",
2816 parse_events_option);
2817 err = parse_events_option(&o, lists[0], 0);
2818 }
2819 out:
2820 if (sep)
2821 *sep = ',';
2822
2823 return err;
2824 }
2825
/*
 * Entry point for 'perf trace': parse options, dispatch to the 'record'
 * subcommand, replay mode (-i) or live tracing (trace__run).
 * Returns 0 on success, negative on error.
 */
int cmd_trace(int argc, const char **argv)
{
	const char *trace_usage[] = {
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
		NULL
	};
	struct trace trace = {
		.syscalls = {
			. max = -1,
		},
		.opts = {
			.target = {
				.uid	   = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq     = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering  = true,
			.mmap_pages    = UINT_MAX,
			.proc_map_timeout  = 500,
		},
		.output = stderr,
		.show_comm = true,
		.trace_syscalls = true,
		.kernel_syscallchains = false,
		.max_stack = UINT_MAX,
	};
	const char *output_name = NULL;
	const struct option trace_options[] = {
	OPT_CALLBACK('e', "event", &trace, "event",
		     "event/syscall selector. use 'perf list' to list available events",
		     trace__parse_events_option),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
		     trace__parse_events_option),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		    "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
		     "ms to wait before starting measurement after program "
		     "start"),
	OPT_END()
	};
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	const char * const trace_subcommands[] = { "record", NULL };
	int err;
	char bf[BUFSIZ];

	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	err = -1;

	if (trace.trace_pgfaults) {
		/* Page fault beautifying needs the fault address and timestamps. */
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	if (trace.max_stack == UINT_MAX) {
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	/* Stack depth options imply callchains; default to DWARF unwinding. */
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
#endif

	if (callchain_param.enabled) {
		/* Bigger buffers reduce lost samples when collecting callchains. */
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	if (trace.evlist->nr_entries > 0)
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);

	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		pr_err("Please specify something to trace.\n");
		return -1;
	}

	if (!trace.trace_syscalls && trace.ev_qualifier) {
		pr_err("The -e option can't be used with --no-syscalls.\n");
		goto out;
	}

	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	/* Cached for the open/openat pathname beautifier. */
	trace.open_id = syscalltbl__id(trace.sctbl, "open");

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* No workload and no target: trace the whole system. */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}