tools/perf/builtin-record.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * builtin-record.c
4 *
5 * Builtin record command: Record the profile of a workload
6 * (or a CPU, or a PID) into the perf.data output file - for
7 * later analysis via perf report.
8 */
9 #include "builtin.h"
10
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/mmap.h"
24 #include "util/target.h"
25 #include "util/session.h"
26 #include "util/tool.h"
27 #include "util/symbol.h"
28 #include "util/record.h"
29 #include "util/cpumap.h"
30 #include "util/thread_map.h"
31 #include "util/data.h"
32 #include "util/perf_regs.h"
33 #include "util/auxtrace.h"
34 #include "util/tsc.h"
35 #include "util/parse-branch-options.h"
36 #include "util/parse-regs-options.h"
37 #include "util/llvm-utils.h"
38 #include "util/bpf-loader.h"
39 #include "util/trigger.h"
40 #include "util/perf-hooks.h"
41 #include "util/cpu-set-sched.h"
42 #include "util/synthetic-events.h"
43 #include "util/time-utils.h"
44 #include "util/units.h"
45 #include "util/bpf-event.h"
46 #include "asm/bug.h"
47 #include "perf.h"
48
49 #include <errno.h>
50 #include <inttypes.h>
51 #include <locale.h>
52 #include <poll.h>
53 #include <unistd.h>
54 #include <sched.h>
55 #include <signal.h>
56 #include <sys/mman.h>
57 #include <sys/wait.h>
58 #include <sys/types.h>
59 #include <sys/stat.h>
60 #include <fcntl.h>
61 #include <linux/err.h>
62 #include <linux/string.h>
63 #include <linux/time64.h>
64 #include <linux/zalloc.h>
65
66 struct switch_output {
67 bool enabled;
68 bool signal;
69 unsigned long size;
70 unsigned long time;
71 const char *str;
72 bool set;
73 char **filenames;
74 int num_files;
75 int cur_file;
76 };
77
78 struct record {
79 struct perf_tool tool;
80 struct record_opts opts;
81 u64 bytes_written;
82 struct perf_data data;
83 struct auxtrace_record *itr;
84 struct evlist *evlist;
85 struct perf_session *session;
86 int realtime_prio;
87 bool no_buildid;
88 bool no_buildid_set;
89 bool no_buildid_cache;
90 bool no_buildid_cache_set;
91 bool buildid_all;
92 bool timestamp_filename;
93 bool timestamp_boundary;
94 struct switch_output switch_output;
95 unsigned long long samples;
96 cpu_set_t affinity_mask;
97 unsigned long output_max_size; /* = 0: unlimited */
98 };
99
100 static volatile int done;
101
102 static volatile int auxtrace_record__snapshot_started;
103 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
104 static DEFINE_TRIGGER(switch_output_trigger);
105
106 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
107 "SYS", "NODE", "CPU"
108 };
109
110 static bool switch_output_signal(struct record *rec)
111 {
112 return rec->switch_output.signal &&
113 trigger_is_ready(&switch_output_trigger);
114 }
115
116 static bool switch_output_size(struct record *rec)
117 {
118 return rec->switch_output.size &&
119 trigger_is_ready(&switch_output_trigger) &&
120 (rec->bytes_written >= rec->switch_output.size);
121 }
122
123 static bool switch_output_time(struct record *rec)
124 {
125 return rec->switch_output.time &&
126 trigger_is_ready(&switch_output_trigger);
127 }
128
129 static bool record__output_max_size_exceeded(struct record *rec)
130 {
131 return rec->output_max_size &&
132 (rec->bytes_written >= rec->output_max_size);
133 }
134
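/*
 * Append @size bytes from @bf to the perf.data output file, account them in
 * rec->bytes_written, stop the session once the configured output size limit
 * is exceeded, and fire the switch-output trigger when its size threshold is
 * reached.
 */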
135 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
136 void *bf, size_t size)
137 {
138 struct perf_data_file *file = &rec->session->data->file;
139
140 if (perf_data_file__write(file, bf, size) < 0) {
141 pr_err("failed to write perf data, error: %m\n");
142 return -1;
143 }
144
145 rec->bytes_written += size;
146
147 if (record__output_max_size_exceeded(rec) && !done) {
148 fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
149 " stopping session ]\n",
150 rec->bytes_written >> 10);
151 done = 1;
152 }
153
154 if (switch_output_size(rec))
155 trigger_hit(&switch_output_trigger);
156
157 return 0;
158 }
159
160 static int record__aio_enabled(struct record *rec);
161 static int record__comp_enabled(struct record *rec);
162 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
163 void *src, size_t src_size);
164
165 #ifdef HAVE_AIO_SUPPORT
166 static int record__aio_write(struct aiocb *cblock, int trace_fd,
167 void *buf, size_t size, off_t off)
168 {
169 int rc;
170
171 cblock->aio_fildes = trace_fd;
172 cblock->aio_buf = buf;
173 cblock->aio_nbytes = size;
174 cblock->aio_offset = off;
175 cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
176
177 do {
178 rc = aio_write(cblock);
179 if (rc == 0) {
180 break;
181 } else if (errno != EAGAIN) {
182 cblock->aio_fildes = -1;
183 pr_err("failed to queue perf data, error: %m\n");
184 break;
185 }
186 } while (1);
187
188 return rc;
189 }
190
191 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
192 {
193 void *rem_buf;
194 off_t rem_off;
195 size_t rem_size;
196 int rc, aio_errno;
197 ssize_t aio_ret, written;
198
199 aio_errno = aio_error(cblock);
200 if (aio_errno == EINPROGRESS)
201 return 0;
202
203 written = aio_ret = aio_return(cblock);
204 if (aio_ret < 0) {
205 if (aio_errno != EINTR)
206 pr_err("failed to write perf data, error: %m\n");
207 written = 0;
208 }
209
210 rem_size = cblock->aio_nbytes - written;
211
212 if (rem_size == 0) {
213 cblock->aio_fildes = -1;
214 /*
215 * md->refcount is incremented in record__aio_pushfn() for
216 * every aio write request started in record__aio_push() so
217 * decrement it because the request is now complete.
218 */
219 perf_mmap__put(&md->core);
220 rc = 1;
221 } else {
222 /*
223 * An aio write request may need to be restarted with
224 * the remainder if the kernel didn't write the whole
225 * chunk at once.
226 */
227 rem_off = cblock->aio_offset + written;
228 rem_buf = (void *)(cblock->aio_buf + written);
229 record__aio_write(cblock, cblock->aio_fildes,
230 rem_buf, rem_size, rem_off);
231 rc = 0;
232 }
233
234 return rc;
235 }
236
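/*
 * Wait for in-flight aio write requests on @md. When @sync_all is false,
 * return the index of the first control block that is free or has just
 * completed; when @sync_all is true, wait until all outstanding requests
 * have completed and return -1.
 */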
237 static int record__aio_sync(struct mmap *md, bool sync_all)
238 {
239 struct aiocb **aiocb = md->aio.aiocb;
240 struct aiocb *cblocks = md->aio.cblocks;
241 struct timespec timeout = { 0, 1000 * 1000 * 1 }; /* 1ms */
242 int i, do_suspend;
243
244 do {
245 do_suspend = 0;
246 for (i = 0; i < md->aio.nr_cblocks; ++i) {
247 if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
248 if (sync_all)
249 aiocb[i] = NULL;
250 else
251 return i;
252 } else {
253 /*
254 * The started aio write is not complete yet,
255 * so it has to be waited on before the
256 * next allocation.
257 */
258 aiocb[i] = &cblocks[i];
259 do_suspend = 1;
260 }
261 }
262 if (!do_suspend)
263 return -1;
264
265 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
266 if (!(errno == EAGAIN || errno == EINTR))
267 pr_err("failed to sync perf data, error: %m\n");
268 }
269 } while (1);
270 }
271
272 struct record_aio {
273 struct record *rec;
274 void *data;
275 size_t size;
276 };
277
278 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
279 {
280 struct record_aio *aio = to;
281
282 /*
283 * The map->core.base data pointed to by buf is copied into a free map->aio.data[]
284 * buffer to release space in the kernel buffer as fast as possible, by calling
285 * perf_mmap__consume() from the perf_mmap__push() function.
286 *
287 * That lets the kernel proceed with storing more profiling data into
288 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
289 *
290 * Copying may be done in two steps when the chunk of profiling data
291 * crosses the upper bound of the kernel buffer. In that case we first copy
292 * the part of the data from map->start up to the upper bound, and then the
293 * remainder from the beginning of the kernel buffer up to the end of the data chunk.
294 */
295
296 if (record__comp_enabled(aio->rec)) {
297 size = zstd_compress(aio->rec->session, aio->data + aio->size,
298 mmap__mmap_len(map) - aio->size,
299 buf, size);
300 } else {
301 memcpy(aio->data + aio->size, buf, size);
302 }
303
304 if (!aio->size) {
305 /*
306 * Increment map->refcount to guard the map->aio.data[] buffer
307 * from premature deallocation, because the map object can be
308 * released before the aio write request started on the
309 * map->aio.data[] buffer completes.
310 *
311 * perf_mmap__put() is called from record__aio_complete()
312 * once the started aio request completes, or from record__aio_push()
313 * if the request failed to start.
314 */
315 perf_mmap__get(&map->core);
316 }
317
318 aio->size += size;
319
320 return size;
321 }
322
323 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
324 {
325 int ret, idx;
326 int trace_fd = rec->session->data->file.fd;
327 struct record_aio aio = { .rec = rec, .size = 0 };
328
329 /*
330 * Call record__aio_sync() to wait until a map->aio.data[] buffer
331 * becomes available after the previous aio write operation.
332 */
333
334 idx = record__aio_sync(map, false);
335 aio.data = map->aio.data[idx];
336 ret = perf_mmap__push(map, &aio, record__aio_pushfn);
337 if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
338 return ret;
339
340 rec->samples++;
341 ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
342 if (!ret) {
343 *off += aio.size;
344 rec->bytes_written += aio.size;
345 if (switch_output_size(rec))
346 trigger_hit(&switch_output_trigger);
347 } else {
348 /*
349 * Decrement the map->refcount that record__aio_pushfn() incremented,
350 * since the record__aio_write() operation failed to start; otherwise
351 * map->refcount is decremented in record__aio_complete() after the
352 * aio write operation finishes successfully.
353 */
354 perf_mmap__put(&map->core);
355 }
356
357 return ret;
358 }
359
360 static off_t record__aio_get_pos(int trace_fd)
361 {
362 return lseek(trace_fd, 0, SEEK_CUR);
363 }
364
365 static void record__aio_set_pos(int trace_fd, off_t pos)
366 {
367 lseek(trace_fd, pos, SEEK_SET);
368 }
369
370 static void record__aio_mmap_read_sync(struct record *rec)
371 {
372 int i;
373 struct evlist *evlist = rec->evlist;
374 struct mmap *maps = evlist->mmap;
375
376 if (!record__aio_enabled(rec))
377 return;
378
379 for (i = 0; i < evlist->core.nr_mmaps; i++) {
380 struct mmap *map = &maps[i];
381
382 if (map->core.base)
383 record__aio_sync(map, true);
384 }
385 }
386
387 static int nr_cblocks_default = 1;
388 static int nr_cblocks_max = 4;
389
390 static int record__aio_parse(const struct option *opt,
391 const char *str,
392 int unset)
393 {
394 struct record_opts *opts = (struct record_opts *)opt->value;
395
396 if (unset) {
397 opts->nr_cblocks = 0;
398 } else {
399 if (str)
400 opts->nr_cblocks = strtol(str, NULL, 0);
401 if (!opts->nr_cblocks)
402 opts->nr_cblocks = nr_cblocks_default;
403 }
404
405 return 0;
406 }
407 #else /* HAVE_AIO_SUPPORT */
408 static int nr_cblocks_max = 0;
409
410 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
411 off_t *off __maybe_unused)
412 {
413 return -1;
414 }
415
416 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
417 {
418 return -1;
419 }
420
421 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
422 {
423 }
424
425 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
426 {
427 }
428 #endif
429
430 static int record__aio_enabled(struct record *rec)
431 {
432 return rec->opts.nr_cblocks > 0;
433 }
434
435 #define MMAP_FLUSH_DEFAULT 1
436 static int record__mmap_flush_parse(const struct option *opt,
437 const char *str,
438 int unset)
439 {
440 int flush_max;
441 struct record_opts *opts = (struct record_opts *)opt->value;
442 static struct parse_tag tags[] = {
443 { .tag = 'B', .mult = 1 },
444 { .tag = 'K', .mult = 1 << 10 },
445 { .tag = 'M', .mult = 1 << 20 },
446 { .tag = 'G', .mult = 1 << 30 },
447 { .tag = 0 },
448 };
449
450 if (unset)
451 return 0;
452
453 if (str) {
454 opts->mmap_flush = parse_tag_value(str, tags);
455 if (opts->mmap_flush == (int)-1)
456 opts->mmap_flush = strtol(str, NULL, 0);
457 }
458
459 if (!opts->mmap_flush)
460 opts->mmap_flush = MMAP_FLUSH_DEFAULT;
461
462 flush_max = evlist__mmap_size(opts->mmap_pages);
463 flush_max /= 4;
464 if (opts->mmap_flush > flush_max)
465 opts->mmap_flush = flush_max;
466
467 return 0;
468 }
469
470 #ifdef HAVE_ZSTD_SUPPORT
471 static unsigned int comp_level_default = 1;
472
473 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
474 {
475 struct record_opts *opts = opt->value;
476
477 if (unset) {
478 opts->comp_level = 0;
479 } else {
480 if (str)
481 opts->comp_level = strtol(str, NULL, 0);
482 if (!opts->comp_level)
483 opts->comp_level = comp_level_default;
484 }
485
486 return 0;
487 }
488 #endif
489 static unsigned int comp_level_max = 22;
490
491 static int record__comp_enabled(struct record *rec)
492 {
493 return rec->opts.comp_level > 0;
494 }
495
496 static int process_synthesized_event(struct perf_tool *tool,
497 union perf_event *event,
498 struct perf_sample *sample __maybe_unused,
499 struct machine *machine __maybe_unused)
500 {
501 struct record *rec = container_of(tool, struct record, tool);
502 return record__write(rec, NULL, event, event->header.size);
503 }
504
505 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
506 {
507 struct record *rec = to;
508
509 if (record__comp_enabled(rec)) {
510 size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size);
511 bf = map->data;
512 }
513
514 rec->samples++;
515 return record__write(rec, map, bf, size);
516 }
517
518 static volatile int signr = -1;
519 static volatile int child_finished;
520
521 static void sig_handler(int sig)
522 {
523 if (sig == SIGCHLD)
524 child_finished = 1;
525 else
526 signr = sig;
527
528 done = 1;
529 }
530
531 static void sigsegv_handler(int sig)
532 {
533 perf_hooks__recover();
534 sighandler_dump_stack(sig);
535 }
536
537 static void record__sig_exit(void)
538 {
539 if (signr == -1)
540 return;
541
542 signal(signr, SIG_DFL);
543 raise(signr);
544 }
545
546 #ifdef HAVE_AUXTRACE_SUPPORT
547
548 static int record__process_auxtrace(struct perf_tool *tool,
549 struct mmap *map,
550 union perf_event *event, void *data1,
551 size_t len1, void *data2, size_t len2)
552 {
553 struct record *rec = container_of(tool, struct record, tool);
554 struct perf_data *data = &rec->data;
555 size_t padding;
556 u8 pad[8] = {0};
557
558 if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
559 off_t file_offset;
560 int fd = perf_data__fd(data);
561 int err;
562
563 file_offset = lseek(fd, 0, SEEK_CUR);
564 if (file_offset == -1)
565 return -1;
566 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
567 event, file_offset);
568 if (err)
569 return err;
570 }
571
572 /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
573 padding = (len1 + len2) & 7;
574 if (padding)
575 padding = 8 - padding;
576
577 record__write(rec, map, event, event->header.size);
578 record__write(rec, map, data1, len1);
579 if (len2)
580 record__write(rec, map, data2, len2);
581 record__write(rec, map, &pad, padding);
582
583 return 0;
584 }
585
586 static int record__auxtrace_mmap_read(struct record *rec,
587 struct mmap *map)
588 {
589 int ret;
590
591 ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
592 record__process_auxtrace);
593 if (ret < 0)
594 return ret;
595
596 if (ret)
597 rec->samples++;
598
599 return 0;
600 }
601
602 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
603 struct mmap *map)
604 {
605 int ret;
606
607 ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
608 record__process_auxtrace,
609 rec->opts.auxtrace_snapshot_size);
610 if (ret < 0)
611 return ret;
612
613 if (ret)
614 rec->samples++;
615
616 return 0;
617 }
618
619 static int record__auxtrace_read_snapshot_all(struct record *rec)
620 {
621 int i;
622 int rc = 0;
623
624 for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
625 struct mmap *map = &rec->evlist->mmap[i];
626
627 if (!map->auxtrace_mmap.base)
628 continue;
629
630 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
631 rc = -1;
632 goto out;
633 }
634 }
635 out:
636 return rc;
637 }
638
639 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
640 {
641 pr_debug("Recording AUX area tracing snapshot\n");
642 if (record__auxtrace_read_snapshot_all(rec) < 0) {
643 trigger_error(&auxtrace_snapshot_trigger);
644 } else {
645 if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
646 trigger_error(&auxtrace_snapshot_trigger);
647 else
648 trigger_ready(&auxtrace_snapshot_trigger);
649 }
650 }
651
652 static int record__auxtrace_snapshot_exit(struct record *rec)
653 {
654 if (trigger_is_error(&auxtrace_snapshot_trigger))
655 return 0;
656
657 if (!auxtrace_record__snapshot_started &&
658 auxtrace_record__snapshot_start(rec->itr))
659 return -1;
660
661 record__read_auxtrace_snapshot(rec, true);
662 if (trigger_is_error(&auxtrace_snapshot_trigger))
663 return -1;
664
665 return 0;
666 }
667
668 static int record__auxtrace_init(struct record *rec)
669 {
670 int err;
671
672 if (!rec->itr) {
673 rec->itr = auxtrace_record__init(rec->evlist, &err);
674 if (err)
675 return err;
676 }
677
678 err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
679 rec->opts.auxtrace_snapshot_opts);
680 if (err)
681 return err;
682
683 err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
684 rec->opts.auxtrace_sample_opts);
685 if (err)
686 return err;
687
688 return auxtrace_parse_filters(rec->evlist);
689 }
690
691 #else
692
693 static inline
694 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
695 struct mmap *map __maybe_unused)
696 {
697 return 0;
698 }
699
700 static inline
701 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
702 bool on_exit __maybe_unused)
703 {
704 }
705
706 static inline
707 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
708 {
709 return 0;
710 }
711
712 static inline
713 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
714 {
715 return 0;
716 }
717
718 static int record__auxtrace_init(struct record *rec __maybe_unused)
719 {
720 return 0;
721 }
722
723 #endif
724
725 static bool record__kcore_readable(struct machine *machine)
726 {
727 char kcore[PATH_MAX];
728 int fd;
729
730 scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
731
732 fd = open(kcore, O_RDONLY);
733 if (fd < 0)
734 return false;
735
736 close(fd);
737
738 return true;
739 }
740
741 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
742 {
743 char from_dir[PATH_MAX];
744 char kcore_dir[PATH_MAX];
745 int ret;
746
747 snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
748
749 ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
750 if (ret)
751 return ret;
752
753 return kcore_copy(from_dir, kcore_dir);
754 }
755
756 static int record__mmap_evlist(struct record *rec,
757 struct evlist *evlist)
758 {
759 struct record_opts *opts = &rec->opts;
760 bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
761 opts->auxtrace_sample_mode;
762 char msg[512];
763
764 if (opts->affinity != PERF_AFFINITY_SYS)
765 cpu__setup_cpunode_map();
766
767 if (evlist__mmap_ex(evlist, opts->mmap_pages,
768 opts->auxtrace_mmap_pages,
769 auxtrace_overwrite,
770 opts->nr_cblocks, opts->affinity,
771 opts->mmap_flush, opts->comp_level) < 0) {
772 if (errno == EPERM) {
773 pr_err("Permission error mapping pages.\n"
774 "Consider increasing "
775 "/proc/sys/kernel/perf_event_mlock_kb,\n"
776 "or try again with a smaller value of -m/--mmap_pages.\n"
777 "(current value: %u,%u)\n",
778 opts->mmap_pages, opts->auxtrace_mmap_pages);
779 return -errno;
780 } else {
781 pr_err("failed to mmap with %d (%s)\n", errno,
782 str_error_r(errno, msg, sizeof(msg)));
783 if (errno)
784 return -errno;
785 else
786 return -EINVAL;
787 }
788 }
789 return 0;
790 }
791
792 static int record__mmap(struct record *rec)
793 {
794 return record__mmap_evlist(rec, rec->evlist);
795 }
796
797 static int record__open(struct record *rec)
798 {
799 char msg[BUFSIZ];
800 struct evsel *pos;
801 struct evlist *evlist = rec->evlist;
802 struct perf_session *session = rec->session;
803 struct record_opts *opts = &rec->opts;
804 int rc = 0;
805
806 /*
807 * For initial_delay we need to add a dummy event so that we can track
808 * PERF_RECORD_MMAP while we wait for the initial delay to enable the
809 * real events, the ones asked by the user.
810 */
811 if (opts->initial_delay) {
812 if (perf_evlist__add_dummy(evlist))
813 return -ENOMEM;
814
815 pos = evlist__first(evlist);
816 pos->tracking = 0;
817 pos = evlist__last(evlist);
818 pos->tracking = 1;
819 pos->core.attr.enable_on_exec = 1;
820 }
821
822 perf_evlist__config(evlist, opts, &callchain_param);
823
824 evlist__for_each_entry(evlist, pos) {
825 try_again:
826 if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
827 if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
828 if (verbose > 0)
829 ui__warning("%s\n", msg);
830 goto try_again;
831 }
832 if ((errno == EINVAL || errno == EBADF) &&
833 pos->leader != pos &&
834 pos->weak_group) {
835 pos = perf_evlist__reset_weak_group(evlist, pos);
836 goto try_again;
837 }
838 rc = -errno;
839 perf_evsel__open_strerror(pos, &opts->target,
840 errno, msg, sizeof(msg));
841 ui__error("%s\n", msg);
842 goto out;
843 }
844
845 pos->supported = true;
846 }
847
848 if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(evlist)) {
849 pr_warning(
850 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
851 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
852 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
853 "file is not found in the buildid cache or in the vmlinux path.\n\n"
854 "Samples in kernel modules won't be resolved at all.\n\n"
855 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
856 "even with a suitable vmlinux or kallsyms file.\n\n");
857 }
858
859 if (perf_evlist__apply_filters(evlist, &pos)) {
860 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
861 pos->filter, perf_evsel__name(pos), errno,
862 str_error_r(errno, msg, sizeof(msg)));
863 rc = -1;
864 goto out;
865 }
866
867 rc = record__mmap(rec);
868 if (rc)
869 goto out;
870
871 session->evlist = evlist;
872 perf_session__set_id_hdr_size(session);
873 out:
874 return rc;
875 }
876
877 static int process_sample_event(struct perf_tool *tool,
878 union perf_event *event,
879 struct perf_sample *sample,
880 struct evsel *evsel,
881 struct machine *machine)
882 {
883 struct record *rec = container_of(tool, struct record, tool);
884
885 if (rec->evlist->first_sample_time == 0)
886 rec->evlist->first_sample_time = sample->time;
887
888 rec->evlist->last_sample_time = sample->time;
889
890 if (rec->buildid_all)
891 return 0;
892
893 rec->samples++;
894 return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
895 }
896
897 static int process_buildids(struct record *rec)
898 {
899 struct perf_session *session = rec->session;
900
901 if (perf_data__size(&rec->data) == 0)
902 return 0;
903
904 /*
905 * During this process, it'll load the kernel map and replace
906 * dso->long_name with the real pathname it found. In this case
907 * we prefer the vmlinux path like
908 * /lib/modules/3.16.4/build/vmlinux
909 *
910 * rather than build-id path (in debug directory).
911 * $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
912 */
913 symbol_conf.ignore_vmlinux_buildid = true;
914
915 /*
916 * If --buildid-all is given, it marks all DSOs regardless of hits,
917 * so there is no need to process samples. But if timestamp_boundary
918 * is enabled, it still needs to walk all samples to get the timestamps
919 * of the first/last samples.
920 */
921 if (rec->buildid_all && !rec->timestamp_boundary)
922 rec->tool.sample = NULL;
923
924 return perf_session__process_events(session);
925 }
926
927 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
928 {
929 int err;
930 struct perf_tool *tool = data;
931 /*
932 * As for the guest kernel, when processing the record & report subcommands
933 * we arrange the module mmaps prior to the guest kernel mmap and trigger
934 * a dso preload, because by default guest module symbols are loaded
935 * from guest kallsyms instead of /lib/modules/XXX/XXX. This
936 * method is used to avoid missing symbols when the first address is
937 * in a module instead of in the guest kernel.
938 */
939 err = perf_event__synthesize_modules(tool, process_synthesized_event,
940 machine);
941 if (err < 0)
942 pr_err("Couldn't record guest kernel [%d]'s reference"
943 " relocation symbol.\n", machine->pid);
944
945 /*
946 * We use _stext for the guest kernel because the guest kernel's
947 * /proc/kallsyms sometimes has no _text.
948 */
949 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
950 machine);
951 if (err < 0)
952 pr_err("Couldn't record guest kernel [%d]'s reference"
953 " relocation symbol.\n", machine->pid);
954 }
955
956 static struct perf_event_header finished_round_event = {
957 .size = sizeof(struct perf_event_header),
958 .type = PERF_RECORD_FINISHED_ROUND,
959 };
960
961 static void record__adjust_affinity(struct record *rec, struct mmap *map)
962 {
963 if (rec->opts.affinity != PERF_AFFINITY_SYS &&
964 !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) {
965 CPU_ZERO(&rec->affinity_mask);
966 CPU_OR(&rec->affinity_mask, &rec->affinity_mask, &map->affinity_mask);
967 sched_setaffinity(0, sizeof(rec->affinity_mask), &rec->affinity_mask);
968 }
969 }
970
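/*
 * Callback for zstd_compress_stream_to_records(): the first call
 * (increment == 0) initializes a PERF_RECORD_COMPRESSED header and returns
 * its size, subsequent calls grow header.size by each compressed increment.
 */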
971 static size_t process_comp_header(void *record, size_t increment)
972 {
973 struct perf_record_compressed *event = record;
974 size_t size = sizeof(*event);
975
976 if (increment) {
977 event->header.size += increment;
978 return increment;
979 }
980
981 event->header.type = PERF_RECORD_COMPRESSED;
982 event->header.size = size;
983
984 return size;
985 }
986
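/*
 * Compress @src_size bytes from @src into PERF_RECORD_COMPRESSED records at
 * @dst, update the session's transferred/compressed byte counters and return
 * the compressed size.
 */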
987 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
988 void *src, size_t src_size)
989 {
990 size_t compressed;
991 size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
992
993 compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
994 max_record_size, process_comp_header);
995
996 session->bytes_transferred += src_size;
997 session->bytes_compressed += compressed;
998
999 return compressed;
1000 }
1001
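/*
 * Push the data pending in every mmapped ring buffer of @evlist (the regular
 * set or, with @overwrite, the overwritable one) to the output file, either
 * synchronously or via aio, read AUX area data where present, and emit a
 * PERF_RECORD_FINISHED_ROUND event if anything was written.
 */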
1002 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1003 bool overwrite, bool synch)
1004 {
1005 u64 bytes_written = rec->bytes_written;
1006 int i;
1007 int rc = 0;
1008 struct mmap *maps;
1009 int trace_fd = rec->data.file.fd;
1010 off_t off = 0;
1011
1012 if (!evlist)
1013 return 0;
1014
1015 maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
1016 if (!maps)
1017 return 0;
1018
1019 if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1020 return 0;
1021
1022 if (record__aio_enabled(rec))
1023 off = record__aio_get_pos(trace_fd);
1024
1025 for (i = 0; i < evlist->core.nr_mmaps; i++) {
1026 u64 flush = 0;
1027 struct mmap *map = &maps[i];
1028
1029 if (map->core.base) {
1030 record__adjust_affinity(rec, map);
1031 if (synch) {
1032 flush = map->core.flush;
1033 map->core.flush = 1;
1034 }
1035 if (!record__aio_enabled(rec)) {
1036 if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1037 if (synch)
1038 map->core.flush = flush;
1039 rc = -1;
1040 goto out;
1041 }
1042 } else {
1043 if (record__aio_push(rec, map, &off) < 0) {
1044 record__aio_set_pos(trace_fd, off);
1045 if (synch)
1046 map->core.flush = flush;
1047 rc = -1;
1048 goto out;
1049 }
1050 }
1051 if (synch)
1052 map->core.flush = flush;
1053 }
1054
1055 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1056 !rec->opts.auxtrace_sample_mode &&
1057 record__auxtrace_mmap_read(rec, map) != 0) {
1058 rc = -1;
1059 goto out;
1060 }
1061 }
1062
1063 if (record__aio_enabled(rec))
1064 record__aio_set_pos(trace_fd, off);
1065
1066 /*
1067 * Mark the round finished in case we wrote
1068 * at least one event.
1069 */
1070 if (bytes_written != rec->bytes_written)
1071 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1072
1073 if (overwrite)
1074 perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1075 out:
1076 return rc;
1077 }
1078
1079 static int record__mmap_read_all(struct record *rec, bool synch)
1080 {
1081 int err;
1082
1083 err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1084 if (err)
1085 return err;
1086
1087 return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1088 }
1089
1090 static void record__init_features(struct record *rec)
1091 {
1092 struct perf_session *session = rec->session;
1093 int feat;
1094
1095 for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1096 perf_header__set_feat(&session->header, feat);
1097
1098 if (rec->no_buildid)
1099 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1100
1101 if (!have_tracepoints(&rec->evlist->core.entries))
1102 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1103
1104 if (!rec->opts.branch_stack)
1105 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1106
1107 if (!rec->opts.full_auxtrace)
1108 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1109
1110 if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1111 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1112
1113 perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1114 if (!record__comp_enabled(rec))
1115 perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1116
1117 perf_header__clear_feat(&session->header, HEADER_STAT);
1118 }
1119
1120 static void
1121 record__finish_output(struct record *rec)
1122 {
1123 struct perf_data *data = &rec->data;
1124 int fd = perf_data__fd(data);
1125
1126 if (data->is_pipe)
1127 return;
1128
1129 rec->session->header.data_size += rec->bytes_written;
1130 data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1131
1132 if (!rec->no_buildid) {
1133 process_buildids(rec);
1134
1135 if (rec->buildid_all)
1136 dsos__hit_all(rec->session);
1137 }
1138 perf_session__write_header(rec->session, rec->evlist, fd, true);
1139
1140 return;
1141 }
1142
1143 static int record__synthesize_workload(struct record *rec, bool tail)
1144 {
1145 int err;
1146 struct perf_thread_map *thread_map;
1147
1148 if (rec->opts.tail_synthesize != tail)
1149 return 0;
1150
1151 thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1152 if (thread_map == NULL)
1153 return -1;
1154
1155 err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1156 process_synthesized_event,
1157 &rec->session->machines.host,
1158 rec->opts.sample_address);
1159 perf_thread_map__put(thread_map);
1160 return err;
1161 }
1162
1163 static int record__synthesize(struct record *rec, bool tail);
1164
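/*
 * Finish the current perf.data output (synthesizing tail-time events when
 * requested and writing out the header) and switch to a new, timestamped
 * output file; called when the switch-output trigger fires and, with
 * rec->timestamp_filename set, once more at exit.
 */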
1165 static int
1166 record__switch_output(struct record *rec, bool at_exit)
1167 {
1168 struct perf_data *data = &rec->data;
1169 int fd, err;
1170 char *new_filename;
1171
1172 /* Same size as a real timestamp, e.g. "2015122520103046" */
1173 char timestamp[] = "InvalidTimestamp";
1174
1175 record__aio_mmap_read_sync(rec);
1176
1177 record__synthesize(rec, true);
1178 if (target__none(&rec->opts.target))
1179 record__synthesize_workload(rec, true);
1180
1181 rec->samples = 0;
1182 record__finish_output(rec);
1183 err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1184 if (err) {
1185 pr_err("Failed to get current timestamp\n");
1186 return -EINVAL;
1187 }
1188
1189 fd = perf_data__switch(data, timestamp,
1190 rec->session->header.data_offset,
1191 at_exit, &new_filename);
1192 if (fd >= 0 && !at_exit) {
1193 rec->bytes_written = 0;
1194 rec->session->header.data_size = 0;
1195 }
1196
1197 if (!quiet)
1198 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1199 data->path, timestamp);
1200
1201 if (rec->switch_output.num_files) {
1202 int n = rec->switch_output.cur_file + 1;
1203
1204 if (n >= rec->switch_output.num_files)
1205 n = 0;
1206 rec->switch_output.cur_file = n;
1207 if (rec->switch_output.filenames[n]) {
1208 remove(rec->switch_output.filenames[n]);
1209 zfree(&rec->switch_output.filenames[n]);
1210 }
1211 rec->switch_output.filenames[n] = new_filename;
1212 } else {
1213 free(new_filename);
1214 }
1215
1216 /* Output tracking events */
1217 if (!at_exit) {
1218 record__synthesize(rec, false);
1219
1220 /*
1221 * In 'perf record --switch-output' without -a,
1222 * record__synthesize() in record__switch_output() won't
1223 * generate tracking events because there's no thread_map
1224 * in the evlist, which causes the newly created perf.data
1225 * to lack map and comm information.
1226 * Create a fake thread_map and directly call
1227 * perf_event__synthesize_thread_map() for those events.
1228 */
1229 if (target__none(&rec->opts.target))
1230 record__synthesize_workload(rec, false);
1231 }
1232 return fd;
1233 }
1234
1235 static volatile int workload_exec_errno;
1236
1237 /*
1238 * perf_evlist__prepare_workload will send a SIGUSR1
1239 * if the fork fails, since we asked for it by setting its
1240 * want_signal to true.
1241 */
1242 static void workload_exec_failed_signal(int signo __maybe_unused,
1243 siginfo_t *info,
1244 void *ucontext __maybe_unused)
1245 {
1246 workload_exec_errno = info->si_value.sival_int;
1247 done = 1;
1248 child_finished = 1;
1249 }
1250
1251 static void snapshot_sig_handler(int sig);
1252 static void alarm_sig_handler(int sig);
1253
1254 static const struct perf_event_mmap_page *
1255 perf_evlist__pick_pc(struct evlist *evlist)
1256 {
1257 if (evlist) {
1258 if (evlist->mmap && evlist->mmap[0].core.base)
1259 return evlist->mmap[0].core.base;
1260 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1261 return evlist->overwrite_mmap[0].core.base;
1262 }
1263 return NULL;
1264 }
1265
1266 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1267 {
1268 const struct perf_event_mmap_page *pc;
1269
1270 pc = perf_evlist__pick_pc(rec->evlist);
1271 if (pc)
1272 return pc;
1273 return NULL;
1274 }
1275
1276 static int record__synthesize(struct record *rec, bool tail)
1277 {
1278 struct perf_session *session = rec->session;
1279 struct machine *machine = &session->machines.host;
1280 struct perf_data *data = &rec->data;
1281 struct record_opts *opts = &rec->opts;
1282 struct perf_tool *tool = &rec->tool;
1283 int fd = perf_data__fd(data);
1284 int err = 0;
1285
1286 if (rec->opts.tail_synthesize != tail)
1287 return 0;
1288
1289 if (data->is_pipe) {
1290 /*
1291 * We need to synthesize events first, because some
1292 * features work on top of them (on the report side).
1293 */
1294 err = perf_event__synthesize_attrs(tool, rec->evlist,
1295 process_synthesized_event);
1296 if (err < 0) {
1297 pr_err("Couldn't synthesize attrs.\n");
1298 goto out;
1299 }
1300
1301 err = perf_event__synthesize_features(tool, session, rec->evlist,
1302 process_synthesized_event);
1303 if (err < 0) {
1304 pr_err("Couldn't synthesize features.\n");
1305 return err;
1306 }
1307
1308 if (have_tracepoints(&rec->evlist->core.entries)) {
1309 /*
1310 * FIXME err <= 0 here actually means that
1311 * there were no tracepoints, so it's not really
1312 * an error, just that we don't need to
1313 * synthesize anything. We really have to
1314 * return this more properly and also
1315 * propagate errors that currently call die()
1316 */
1317 err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
1318 process_synthesized_event);
1319 if (err <= 0) {
1320 pr_err("Couldn't record tracing data.\n");
1321 goto out;
1322 }
1323 rec->bytes_written += err;
1324 }
1325 }
1326
1327 err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1328 process_synthesized_event, machine);
1329 if (err)
1330 goto out;
1331
1332 /* Synthesize id_index before auxtrace_info */
1333 if (rec->opts.auxtrace_sample_mode) {
1334 err = perf_event__synthesize_id_index(tool,
1335 process_synthesized_event,
1336 session->evlist, machine);
1337 if (err)
1338 goto out;
1339 }
1340
1341 if (rec->opts.full_auxtrace) {
1342 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1343 session, process_synthesized_event);
1344 if (err)
1345 goto out;
1346 }
1347
1348 if (!perf_evlist__exclude_kernel(rec->evlist)) {
1349 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1350 machine);
1351 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1352 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1353 "Check /proc/kallsyms permission or run as root.\n");
1354
1355 err = perf_event__synthesize_modules(tool, process_synthesized_event,
1356 machine);
1357 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1358 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1359 "Check /proc/modules permission or run as root.\n");
1360 }
1361
1362 if (perf_guest) {
1363 machines__process_guests(&session->machines,
1364 perf_event__synthesize_guest_os, tool);
1365 }
1366
1367 err = perf_event__synthesize_extra_attr(&rec->tool,
1368 rec->evlist,
1369 process_synthesized_event,
1370 data->is_pipe);
1371 if (err)
1372 goto out;
1373
1374 err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1375 process_synthesized_event,
1376 NULL);
1377 if (err < 0) {
1378 pr_err("Couldn't synthesize thread map.\n");
1379 return err;
1380 }
1381
1382 err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1383 process_synthesized_event, NULL);
1384 if (err < 0) {
1385 pr_err("Couldn't synthesize cpu map.\n");
1386 return err;
1387 }
1388
1389 err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1390 machine, opts);
1391 if (err < 0)
1392 pr_warning("Couldn't synthesize bpf events.\n");
1393
1394 err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
1395 process_synthesized_event, opts->sample_address,
1396 1);
1397 out:
1398 return err;
1399 }
1400
1401 static int __cmd_record(struct record *rec, int argc, const char **argv)
1402 {
1403 int err;
1404 int status = 0;
1405 unsigned long waking = 0;
1406 const bool forks = argc > 0;
1407 struct perf_tool *tool = &rec->tool;
1408 struct record_opts *opts = &rec->opts;
1409 struct perf_data *data = &rec->data;
1410 struct perf_session *session;
1411 bool disabled = false, draining = false;
1412 struct evlist *sb_evlist = NULL;
1413 int fd;
1414 float ratio = 0;
1415
1416 atexit(record__sig_exit);
1417 signal(SIGCHLD, sig_handler);
1418 signal(SIGINT, sig_handler);
1419 signal(SIGTERM, sig_handler);
1420 signal(SIGSEGV, sigsegv_handler);
1421
1422 if (rec->opts.record_namespaces)
1423 tool->namespace_events = true;
1424
1425 if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1426 signal(SIGUSR2, snapshot_sig_handler);
1427 if (rec->opts.auxtrace_snapshot_mode)
1428 trigger_on(&auxtrace_snapshot_trigger);
1429 if (rec->switch_output.enabled)
1430 trigger_on(&switch_output_trigger);
1431 } else {
1432 signal(SIGUSR2, SIG_IGN);
1433 }
1434
1435 session = perf_session__new(data, false, tool);
1436 if (IS_ERR(session)) {
1437 pr_err("Perf session creation failed.\n");
1438 return PTR_ERR(session);
1439 }
1440
1441 fd = perf_data__fd(data);
1442 rec->session = session;
1443
1444 if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1445 pr_err("Compression initialization failed.\n");
1446 return -1;
1447 }
1448
1449 session->header.env.comp_type = PERF_COMP_ZSTD;
1450 session->header.env.comp_level = rec->opts.comp_level;
1451
1452 if (rec->opts.kcore &&
1453 !record__kcore_readable(&session->machines.host)) {
1454 pr_err("ERROR: kcore is not readable.\n");
1455 return -1;
1456 }
1457
1458 record__init_features(rec);
1459
1460 if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1461 session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;
1462
1463 if (forks) {
1464 err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
1465 argv, data->is_pipe,
1466 workload_exec_failed_signal);
1467 if (err < 0) {
1468 pr_err("Couldn't run the workload!\n");
1469 status = err;
1470 goto out_delete_session;
1471 }
1472 }
1473
1474 /*
1475 * If we have just a single event and are sending data
1476 * through a pipe, we need to force the id allocation,
1477 * because we synthesize the event name through the pipe
1478 * and need the id for that.
1479 */
1480 if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1481 rec->opts.sample_id = true;
1482
1483 if (record__open(rec) != 0) {
1484 err = -1;
1485 goto out_child;
1486 }
1487 session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
1488
1489 if (rec->opts.kcore) {
1490 err = record__kcore_copy(&session->machines.host, data);
1491 if (err) {
1492 pr_err("ERROR: Failed to copy kcore\n");
1493 goto out_child;
1494 }
1495 }
1496
1497 err = bpf__apply_obj_config();
1498 if (err) {
1499 char errbuf[BUFSIZ];
1500
1501 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1502 pr_err("ERROR: Apply config to BPF failed: %s\n",
1503 errbuf);
1504 goto out_child;
1505 }
1506
1507 /*
1508 * Normally perf_session__new would do this, but it doesn't have the
1509 * evlist.
1510 */
1511 if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
1512 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1513 rec->tool.ordered_events = false;
1514 }
1515
1516 if (!rec->evlist->nr_groups)
1517 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1518
1519 if (data->is_pipe) {
1520 err = perf_header__write_pipe(fd);
1521 if (err < 0)
1522 goto out_child;
1523 } else {
1524 err = perf_session__write_header(session, rec->evlist, fd, false);
1525 if (err < 0)
1526 goto out_child;
1527 }
1528
1529 if (!rec->no_buildid
1530 && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1531 pr_err("Couldn't generate buildids. "
1532 "Use --no-buildid to profile anyway.\n");
1533 err = -1;
1534 goto out_child;
1535 }
1536
1537 if (!opts->no_bpf_event)
1538 bpf_event__add_sb_event(&sb_evlist, &session->header.env);
1539
1540 if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) {
1541 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1542 opts->no_bpf_event = true;
1543 }
1544
1545 err = record__synthesize(rec, false);
1546 if (err < 0)
1547 goto out_child;
1548
1549 if (rec->realtime_prio) {
1550 struct sched_param param;
1551
1552 param.sched_priority = rec->realtime_prio;
1553 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1554 pr_err("Could not set realtime priority.\n");
1555 err = -1;
1556 goto out_child;
1557 }
1558 }
1559
1560 /*
1561 * When perf is starting the traced process, all the events
1562 * (apart from group members) have enable_on_exec=1 set,
1563 * so don't spoil it by prematurely enabling them.
1564 */
1565 if (!target__none(&opts->target) && !opts->initial_delay)
1566 evlist__enable(rec->evlist);
1567
1568 /*
1569 * Let the child rip
1570 */
1571 if (forks) {
1572 struct machine *machine = &session->machines.host;
1573 union perf_event *event;
1574 pid_t tgid;
1575
1576 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1577 if (event == NULL) {
1578 err = -ENOMEM;
1579 goto out_child;
1580 }
1581
1582 /*
1583 * Some H/W events are generated before the COMM event,
1584 * which is emitted during exec(), so perf script
1585 * cannot see a correct process name for those events.
1586 * Synthesize a COMM event to prevent it.
1587 */
1588 tgid = perf_event__synthesize_comm(tool, event,
1589 rec->evlist->workload.pid,
1590 process_synthesized_event,
1591 machine);
1592 free(event);
1593
1594 if (tgid == -1)
1595 goto out_child;
1596
1597 event = malloc(sizeof(event->namespaces) +
1598 (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1599 machine->id_hdr_size);
1600 if (event == NULL) {
1601 err = -ENOMEM;
1602 goto out_child;
1603 }
1604
1605 /*
1606 * Synthesize NAMESPACES event for the command specified.
1607 */
1608 perf_event__synthesize_namespaces(tool, event,
1609 rec->evlist->workload.pid,
1610 tgid, process_synthesized_event,
1611 machine);
1612 free(event);
1613
1614 perf_evlist__start_workload(rec->evlist);
1615 }
1616
1617 if (opts->initial_delay) {
1618 usleep(opts->initial_delay * USEC_PER_MSEC);
1619 evlist__enable(rec->evlist);
1620 }
1621
1622 trigger_ready(&auxtrace_snapshot_trigger);
1623 trigger_ready(&switch_output_trigger);
1624 perf_hooks__invoke_record_start();
1625 for (;;) {
1626 unsigned long long hits = rec->samples;
1627
1628 /*
1629 * rec->evlist->bkw_mmap_state can be BKW_MMAP_EMPTY
1630 * here: when done == true and hits != rec->samples
1631 * in the previous round.
1632 *
1633 * perf_evlist__toggle_bkw_mmap() ensures we never
1634 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1635 */
1636 if (trigger_is_hit(&switch_output_trigger) || done || draining)
1637 perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1638
1639 if (record__mmap_read_all(rec, false) < 0) {
1640 trigger_error(&auxtrace_snapshot_trigger);
1641 trigger_error(&switch_output_trigger);
1642 err = -1;
1643 goto out_child;
1644 }
1645
1646 if (auxtrace_record__snapshot_started) {
1647 auxtrace_record__snapshot_started = 0;
1648 if (!trigger_is_error(&auxtrace_snapshot_trigger))
1649 record__read_auxtrace_snapshot(rec, false);
1650 if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1651 pr_err("AUX area tracing snapshot failed\n");
1652 err = -1;
1653 goto out_child;
1654 }
1655 }
1656
1657 if (trigger_is_hit(&switch_output_trigger)) {
1658 /*
1659 * If switch_output_trigger is hit, the data in the
1660 * overwritable ring buffer should have been collected,
1661 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1662 *
1663 * If SIGUSR2 is raised after or during record__mmap_read_all(),
1664 * record__mmap_read_all() didn't collect data from the
1665 * overwritable ring buffer. Read again.
1666 */
1667 if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1668 continue;
1669 trigger_ready(&switch_output_trigger);
1670
1671 /*
1672 * Reenable events in overwrite ring buffer after
1673 * record__mmap_read_all(): we should have collected
1674 * data from it.
1675 */
1676 perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1677
1678 if (!quiet)
1679 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1680 waking);
1681 waking = 0;
1682 fd = record__switch_output(rec, false);
1683 if (fd < 0) {
1684 pr_err("Failed to switch to new file\n");
1685 trigger_error(&switch_output_trigger);
1686 err = fd;
1687 goto out_child;
1688 }
1689
1690 /* re-arm the alarm */
1691 if (rec->switch_output.time)
1692 alarm(rec->switch_output.time);
1693 }
1694
1695 if (hits == rec->samples) {
1696 if (done || draining)
1697 break;
1698 err = evlist__poll(rec->evlist, -1);
1699 /*
1700 * Propagate the error only if there is one. Ignore a positive
1701 * number of returned events and interrupt errors.
1702 */
1703 if (err > 0 || (err < 0 && errno == EINTR))
1704 err = 0;
1705 waking++;
1706
1707 if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1708 draining = true;
1709 }
1710
1711 /*
1712 * When perf is starting the traced process, at the end events
1713 * die with the process and we wait for that. Thus no need to
1714 * disable events in this case.
1715 */
1716 if (done && !disabled && !target__none(&opts->target)) {
1717 trigger_off(&auxtrace_snapshot_trigger);
1718 evlist__disable(rec->evlist);
1719 disabled = true;
1720 }
1721 }
1722
1723 trigger_off(&auxtrace_snapshot_trigger);
1724 trigger_off(&switch_output_trigger);
1725
1726 if (opts->auxtrace_snapshot_on_exit)
1727 record__auxtrace_snapshot_exit(rec);
1728
1729 if (forks && workload_exec_errno) {
1730 char msg[STRERR_BUFSIZE];
1731 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1732 pr_err("Workload failed: %s\n", emsg);
1733 err = -1;
1734 goto out_child;
1735 }
1736
1737 if (!quiet)
1738 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1739
1740 if (target__none(&rec->opts.target))
1741 record__synthesize_workload(rec, true);
1742
1743 out_child:
1744 record__mmap_read_all(rec, true);
1745 record__aio_mmap_read_sync(rec);
1746
1747 if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
1748 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
1749 session->header.env.comp_ratio = ratio + 0.5;
1750 }
1751
1752 if (forks) {
1753 int exit_status;
1754
1755 if (!child_finished)
1756 kill(rec->evlist->workload.pid, SIGTERM);
1757
1758 wait(&exit_status);
1759
1760 if (err < 0)
1761 status = err;
1762 else if (WIFEXITED(exit_status))
1763 status = WEXITSTATUS(exit_status);
1764 else if (WIFSIGNALED(exit_status))
1765 signr = WTERMSIG(exit_status);
1766 } else
1767 status = err;
1768
1769 record__synthesize(rec, true);
1770 /* this will be recalculated during process_buildids() */
1771 rec->samples = 0;
1772
1773 if (!err) {
1774 if (!rec->timestamp_filename) {
1775 record__finish_output(rec);
1776 } else {
1777 fd = record__switch_output(rec, true);
1778 if (fd < 0) {
1779 status = fd;
1780 goto out_delete_session;
1781 }
1782 }
1783 }
1784
1785 perf_hooks__invoke_record_end();
1786
1787 if (!err && !quiet) {
1788 char samples[128];
1789 const char *postfix = rec->timestamp_filename ?
1790 ".<timestamp>" : "";
1791
1792 if (rec->samples && !rec->opts.full_auxtrace)
1793 scnprintf(samples, sizeof(samples),
1794 " (%" PRIu64 " samples)", rec->samples);
1795 else
1796 samples[0] = '\0';
1797
1798 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
1799 perf_data__size(data) / 1024.0 / 1024.0,
1800 data->path, postfix, samples);
1801 if (ratio) {
1802 fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
1803 rec->session->bytes_transferred / 1024.0 / 1024.0,
1804 ratio);
1805 }
1806 fprintf(stderr, " ]\n");
1807 }
1808
1809 out_delete_session:
1810 zstd_fini(&session->zstd_data);
1811 perf_session__delete(session);
1812
1813 if (!opts->no_bpf_event)
1814 perf_evlist__stop_sb_thread(sb_evlist);
1815 return status;
1816 }
1817
1818 static void callchain_debug(struct callchain_param *callchain)
1819 {
1820 static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
1821
1822 pr_debug("callchain: type %s\n", str[callchain->record_mode]);
1823
1824 if (callchain->record_mode == CALLCHAIN_DWARF)
1825 pr_debug("callchain: stack dump size %d\n",
1826 callchain->dump_size);
1827 }
1828
1829 int record_opts__parse_callchain(struct record_opts *record,
1830 struct callchain_param *callchain,
1831 const char *arg, bool unset)
1832 {
1833 int ret;
1834 callchain->enabled = !unset;
1835
1836 /* --no-call-graph */
1837 if (unset) {
1838 callchain->record_mode = CALLCHAIN_NONE;
1839 pr_debug("callchain: disabled\n");
1840 return 0;
1841 }
1842
1843 ret = parse_callchain_record_opt(arg, callchain);
1844 if (!ret) {
1845 /* Enable data address sampling for DWARF unwind. */
1846 if (callchain->record_mode == CALLCHAIN_DWARF)
1847 record->sample_address = true;
1848 callchain_debug(callchain);
1849 }
1850
1851 return ret;
1852 }
1853
1854 int record_parse_callchain_opt(const struct option *opt,
1855 const char *arg,
1856 int unset)
1857 {
1858 return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
1859 }
1860
1861 int record_callchain_opt(const struct option *opt,
1862 const char *arg __maybe_unused,
1863 int unset __maybe_unused)
1864 {
1865 struct callchain_param *callchain = opt->value;
1866
1867 callchain->enabled = true;
1868
1869 if (callchain->record_mode == CALLCHAIN_NONE)
1870 callchain->record_mode = CALLCHAIN_FP;
1871
1872 callchain_debug(callchain);
1873 return 0;
1874 }
1875
1876 static int perf_record_config(const char *var, const char *value, void *cb)
1877 {
1878 struct record *rec = cb;
1879
1880 if (!strcmp(var, "record.build-id")) {
1881 if (!strcmp(value, "cache"))
1882 rec->no_buildid_cache = false;
1883 else if (!strcmp(value, "no-cache"))
1884 rec->no_buildid_cache = true;
1885 else if (!strcmp(value, "skip"))
1886 rec->no_buildid = true;
1887 else
1888 return -1;
1889 return 0;
1890 }
1891 if (!strcmp(var, "record.call-graph")) {
1892 var = "call-graph.record-mode";
1893 return perf_default_config(var, value, cb);
1894 }
1895 #ifdef HAVE_AIO_SUPPORT
1896 if (!strcmp(var, "record.aio")) {
1897 rec->opts.nr_cblocks = strtol(value, NULL, 0);
1898 if (!rec->opts.nr_cblocks)
1899 rec->opts.nr_cblocks = nr_cblocks_default;
1900 }
1901 #endif
1902
1903 return 0;
1904 }
1905
1906 struct clockid_map {
1907 const char *name;
1908 int clockid;
1909 };
1910
1911 #define CLOCKID_MAP(n, c) \
1912 { .name = n, .clockid = (c), }
1913
1914 #define CLOCKID_END { .name = NULL, }
1915
1916
1917 /*
1918 * Add the missing ones, we need to build on many distros...
1919 */
1920 #ifndef CLOCK_MONOTONIC_RAW
1921 #define CLOCK_MONOTONIC_RAW 4
1922 #endif
1923 #ifndef CLOCK_BOOTTIME
1924 #define CLOCK_BOOTTIME 7
1925 #endif
1926 #ifndef CLOCK_TAI
1927 #define CLOCK_TAI 11
1928 #endif
1929
1930 static const struct clockid_map clockids[] = {
1931 /* available for all events, NMI safe */
1932 CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
1933 CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),
1934
1935 /* available for some events */
1936 CLOCKID_MAP("realtime", CLOCK_REALTIME),
1937 CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
1938 CLOCKID_MAP("tai", CLOCK_TAI),
1939
1940 /* available for the lazy */
1941 CLOCKID_MAP("mono", CLOCK_MONOTONIC),
1942 CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
1943 CLOCKID_MAP("real", CLOCK_REALTIME),
1944 CLOCKID_MAP("boot", CLOCK_BOOTTIME),
1945
1946 CLOCKID_END,
1947 };
1948
1949 static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
1950 {
1951 struct timespec res;
1952
1953 *res_ns = 0;
1954 if (!clock_getres(clk_id, &res))
1955 *res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
1956 else
1957 pr_warning("WARNING: Failed to determine specified clock resolution.\n");
1958
1959 return 0;
1960 }
1961
1962 static int parse_clockid(const struct option *opt, const char *str, int unset)
1963 {
1964 struct record_opts *opts = (struct record_opts *)opt->value;
1965 const struct clockid_map *cm;
1966 const char *ostr = str;
1967
1968 if (unset) {
1969 opts->use_clockid = 0;
1970 return 0;
1971 }
1972
1973 /* no arg passed */
1974 if (!str)
1975 return 0;
1976
1977 /* no setting it twice */
1978 if (opts->use_clockid)
1979 return -1;
1980
1981 opts->use_clockid = true;
1982
1983 /* if it's a number, we're done */
1984 if (sscanf(str, "%d", &opts->clockid) == 1)
1985 return get_clockid_res(opts->clockid, &opts->clockid_res_ns);
1986
1987 /* allow a "CLOCK_" prefix to the name */
1988 if (!strncasecmp(str, "CLOCK_", 6))
1989 str += 6;
1990
1991 for (cm = clockids; cm->name; cm++) {
1992 if (!strcasecmp(str, cm->name)) {
1993 opts->clockid = cm->clockid;
1994 return get_clockid_res(opts->clockid,
1995 &opts->clockid_res_ns);
1996 }
1997 }
1998
1999 opts->use_clockid = false;
2000 ui__warning("unknown clockid %s, check man page\n", ostr);
2001 return -1;
2002 }
2003
2004 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2005 {
2006 struct record_opts *opts = (struct record_opts *)opt->value;
2007
2008 if (unset || !str)
2009 return 0;
2010
2011 if (!strcasecmp(str, "node"))
2012 opts->affinity = PERF_AFFINITY_NODE;
2013 else if (!strcasecmp(str, "cpu"))
2014 opts->affinity = PERF_AFFINITY_CPU;
2015
2016 return 0;
2017 }
2018
2019 static int parse_output_max_size(const struct option *opt,
2020 const char *str, int unset)
2021 {
2022 unsigned long *s = (unsigned long *)opt->value;
2023 static struct parse_tag tags_size[] = {
2024 { .tag = 'B', .mult = 1 },
2025 { .tag = 'K', .mult = 1 << 10 },
2026 { .tag = 'M', .mult = 1 << 20 },
2027 { .tag = 'G', .mult = 1 << 30 },
2028 { .tag = 0 },
2029 };
2030 unsigned long val;
2031
2032 if (unset) {
2033 *s = 0;
2034 return 0;
2035 }
2036
2037 val = parse_tag_value(str, tags_size);
2038 if (val != (unsigned long) -1) {
2039 *s = val;
2040 return 0;
2041 }
2042
2043 return -1;
2044 }
2045
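/*
 * Parse a mmap pages argument of the form "<pages>[,<auxtrace pages>]":
 * the part before the comma sets opts->mmap_pages, the optional part
 * after it sets opts->auxtrace_mmap_pages.
 */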
2046 static int record__parse_mmap_pages(const struct option *opt,
2047 const char *str,
2048 int unset __maybe_unused)
2049 {
2050 struct record_opts *opts = opt->value;
2051 char *s, *p;
2052 unsigned int mmap_pages;
2053 int ret;
2054
2055 if (!str)
2056 return -EINVAL;
2057
2058 s = strdup(str);
2059 if (!s)
2060 return -ENOMEM;
2061
2062 p = strchr(s, ',');
2063 if (p)
2064 *p = '\0';
2065
2066 if (*s) {
2067 ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
2068 if (ret)
2069 goto out_free;
2070 opts->mmap_pages = mmap_pages;
2071 }
2072
2073 if (!p) {
2074 ret = 0;
2075 goto out_free;
2076 }
2077
2078 ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
2079 if (ret)
2080 goto out_free;
2081
2082 opts->auxtrace_mmap_pages = mmap_pages;
2083
2084 out_free:
2085 free(s);
2086 return ret;
2087 }
2088
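/*
 * Illustrative numbers: if evlist__mmap_size() reports a 1 MiB ring buffer,
 * wakeup_size ends up as 512 KiB, so "--switch-output=100K" triggers the
 * warning below while "--switch-output=1M" does not.
 */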
2089 static void switch_output_size_warn(struct record *rec)
2090 {
2091 u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
2092 struct switch_output *s = &rec->switch_output;
2093
2094 wakeup_size /= 2;
2095
2096 if (s->size < wakeup_size) {
2097 char buf[100];
2098
2099 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
2100 pr_warning("WARNING: switch-output data size is lower than the "
2101 "wakeup kernel buffer size (%s), "
2102 "expect bigger perf.data sizes\n", buf);
2103 }
2104 }
2105
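/*
 * Accepted --switch-output values (see the option table further down):
 *
 *   --switch-output            defaults to "signal", rotate on SIGUSR2
 *   --switch-output=10M        rotate after ~10 MiB of written data
 *   --switch-output=30s        rotate every 30 seconds (via SIGALRM)
 *
 * Any accepted value also enables timestamped output file names.
 */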
2106 static int switch_output_setup(struct record *rec)
2107 {
2108 struct switch_output *s = &rec->switch_output;
2109 static struct parse_tag tags_size[] = {
2110 { .tag = 'B', .mult = 1 },
2111 { .tag = 'K', .mult = 1 << 10 },
2112 { .tag = 'M', .mult = 1 << 20 },
2113 { .tag = 'G', .mult = 1 << 30 },
2114 { .tag = 0 },
2115 };
2116 static struct parse_tag tags_time[] = {
2117 { .tag = 's', .mult = 1 },
2118 { .tag = 'm', .mult = 60 },
2119 { .tag = 'h', .mult = 60*60 },
2120 { .tag = 'd', .mult = 60*60*24 },
2121 { .tag = 0 },
2122 };
2123 unsigned long val;
2124
2125 if (!s->set)
2126 return 0;
2127
2128 if (!strcmp(s->str, "signal")) {
2129 s->signal = true;
2130 pr_debug("switch-output with SIGUSR2 signal\n");
2131 goto enabled;
2132 }
2133
2134 val = parse_tag_value(s->str, tags_size);
2135 if (val != (unsigned long) -1) {
2136 s->size = val;
2137 pr_debug("switch-output with %s size threshold\n", s->str);
2138 goto enabled;
2139 }
2140
2141 val = parse_tag_value(s->str, tags_time);
2142 if (val != (unsigned long) -1) {
2143 s->time = val;
2144 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2145 s->str, s->time);
2146 goto enabled;
2147 }
2148
2149 return -1;
2150
2151 enabled:
2152 rec->timestamp_filename = true;
2153 s->enabled = true;
2154
2155 if (s->size && !rec->opts.no_buffering)
2156 switch_output_size_warn(rec);
2157
2158 return 0;
2159 }
2160
2161 static const char * const __record_usage[] = {
2162 "perf record [<options>] [<command>]",
2163 "perf record [<options>] -- <command> [<options>]",
2164 NULL
2165 };
2166 const char * const *record_usage = __record_usage;
2167
2168 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
2169 struct perf_sample *sample, struct machine *machine)
2170 {
2171 /*
2172 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
2173 * so there is no need to add them twice.
2174 */
2175 if (!(event->header.misc & PERF_RECORD_MISC_USER))
2176 return 0;
2177 return perf_event__process_mmap(tool, event, sample, machine);
2178 }
2179
2180 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
2181 struct perf_sample *sample, struct machine *machine)
2182 {
2183 /*
2184 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
2185 * so there is no need to add them twice.
2186 */
2187 if (!(event->header.misc & PERF_RECORD_MISC_USER))
2188 return 0;
2189
2190 return perf_event__process_mmap2(tool, event, sample, machine);
2191 }
2192
2193 /*
2194 * XXX Ideally this would be local to cmd_record() and passed to record__new(),
2195 * because we need to have access to it in record__exit(), which is called
2196 * after cmd_record() exits, but since record_options needs to be accessible to
2197 * builtin-script, leave it here.
2198 *
2199 * At least we don't touch it in all the other functions here directly.
2200 *
2201 * Just say no to tons of global variables, sigh.
2202 */
2203 static struct record record = {
2204 .opts = {
2205 .sample_time = true,
2206 .mmap_pages = UINT_MAX,
2207 .user_freq = UINT_MAX,
2208 .user_interval = ULLONG_MAX,
2209 .freq = 4000,
2210 .target = {
2211 .uses_mmap = true,
2212 .default_per_cpu = true,
2213 },
2214 .mmap_flush = MMAP_FLUSH_DEFAULT,
2215 },
2216 .tool = {
2217 .sample = process_sample_event,
2218 .fork = perf_event__process_fork,
2219 .exit = perf_event__process_exit,
2220 .comm = perf_event__process_comm,
2221 .namespaces = perf_event__process_namespaces,
2222 .mmap = build_id__process_mmap,
2223 .mmap2 = build_id__process_mmap2,
2224 .ordered_events = true,
2225 },
2226 };
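/*
 * With these defaults perf record samples at 4000 Hz, records timestamps and
 * uses per-cpu mmaps.  mmap_pages, user_freq and user_interval start at
 * UINT_MAX/ULLONG_MAX sentinels meaning "not set on the command line" until
 * -m/-F/-c override them; the sentinels are resolved later, presumably in
 * record_opts__config().
 */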
2227
2228 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2229 "\n\t\t\t\tDefault: fp";
2230
2231 static bool dry_run;
2232
2233 /*
2234 * XXX Will stay a global variable until we fix builtin-script.c to stop messing
2235 * with it and switch to using the library functions in perf_evlist that came
2236 * from builtin-record.c, i.e. use record_opts,
2237 * perf_evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record',
2238 * using pipes, etc.
2239 */
2240 static struct option __record_options[] = {
2241 OPT_CALLBACK('e', "event", &record.evlist, "event",
2242 "event selector. use 'perf list' to list available events",
2243 parse_events_option),
2244 OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2245 "event filter", parse_filter),
2246 OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2247 NULL, "don't record events from perf itself",
2248 exclude_perf),
2249 OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2250 "record events on existing process id"),
2251 OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2252 "record events on existing thread id"),
2253 OPT_INTEGER('r', "realtime", &record.realtime_prio,
2254 "collect data with this RT SCHED_FIFO priority"),
2255 OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2256 "collect data without buffering"),
2257 OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2258 "collect raw sample records from all opened counters"),
2259 OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2260 "system-wide collection from all CPUs"),
2261 OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2262 "list of cpus to monitor"),
2263 OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2264 OPT_STRING('o', "output", &record.data.path, "file",
2265 "output file name"),
2266 OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2267 &record.opts.no_inherit_set,
2268 "child tasks do not inherit counters"),
2269 OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2270 "synthesize non-sample events at the end of output"),
2271 OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2272 OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "record bpf events"),
2273 OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2274 "Fail if the specified frequency can't be used"),
2275 OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2276 "profile at this frequency",
2277 record__parse_freq),
2278 OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2279 "number of mmap data pages and AUX area tracing mmap pages",
2280 record__parse_mmap_pages),
2281 OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2282 "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
2283 record__mmap_flush_parse),
2284 OPT_BOOLEAN(0, "group", &record.opts.group,
2285 "put the counters into a counter group"),
2286 OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2287 NULL, "enables call-graph recording" ,
2288 &record_callchain_opt),
2289 OPT_CALLBACK(0, "call-graph", &record.opts,
2290 "record_mode[,record_size]", record_callchain_help,
2291 &record_parse_callchain_opt),
2292 OPT_INCR('v', "verbose", &verbose,
2293 "be more verbose (show counter open errors, etc)"),
2294 OPT_BOOLEAN('q', "quiet", &quiet, "don't print any messages"),
2295 OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2296 "per thread counts"),
2297 OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2298 OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2299 "Record the sample physical addresses"),
2300 OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2301 OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2302 &record.opts.sample_time_set,
2303 "Record the sample timestamps"),
2304 OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2305 "Record the sample period"),
2306 OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2307 "don't sample"),
2308 OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2309 &record.no_buildid_cache_set,
2310 "do not update the buildid cache"),
2311 OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2312 &record.no_buildid_set,
2313 "do not collect buildids in perf.data"),
2314 OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2315 "monitor event in cgroup name only",
2316 parse_cgroups),
2317 OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
2318 "ms to wait before starting measurement after program start"),
2319 OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
2320 OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2321 "user to profile"),
2322
2323 OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2324 "branch any", "sample any taken branches",
2325 parse_branch_stack),
2326
2327 OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2328 "branch filter mask", "branch stack filter modes",
2329 parse_branch_stack),
2330 OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2331 "sample by weight (on special events only)"),
2332 OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2333 "sample transaction flags (special events only)"),
2334 OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2335 "use per-thread mmaps"),
2336 OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2337 "sample selected machine registers on interrupt,"
2338 " use '-I?' to list register names", parse_intr_regs),
2339 OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2340 "sample selected machine registers on interrupt,"
2341 " use '--user-regs=?' to list register names", parse_user_regs),
2342 OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2343 "Record running/enabled time of read (:S) events"),
2344 OPT_CALLBACK('k', "clockid", &record.opts,
2345 "clockid", "clockid to use for events, see clock_gettime()",
2346 parse_clockid),
2347 OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2348 "opts", "AUX area tracing Snapshot Mode", ""),
2349 OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
2350 "opts", "sample AUX area", ""),
2351 OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2352 "per thread proc mmap processing timeout in ms"),
2353 OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2354 "Record namespaces events"),
2355 OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
2356 "Record context switch events"),
2357 OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2358 "Configure all used events to run in kernel space.",
2359 PARSE_OPT_EXCLUSIVE),
2360 OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2361 "Configure all used events to run in user space.",
2362 PARSE_OPT_EXCLUSIVE),
2363 OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2364 "collect kernel callchains"),
2365 OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2366 "collect user callchains"),
2367 OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2368 "clang binary to use for compiling BPF scriptlets"),
2369 OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2370 "options passed to clang when compiling BPF scriptlets"),
2371 OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2372 "file", "vmlinux pathname"),
2373 OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2374 "Record build-id of all DSOs regardless of hits"),
2375 OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2376 "append timestamp to output filename"),
2377 OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2378 "Record timestamp boundary (time of first/last samples)"),
2379 OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2380 &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2381 "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
2382 "signal"),
2383 OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2384 "Limit number of switch output generated files"),
2385 OPT_BOOLEAN(0, "dry-run", &dry_run,
2386 "Parse options then exit"),
2387 #ifdef HAVE_AIO_SUPPORT
2388 OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2389 &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2390 record__aio_parse),
2391 #endif
2392 OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2393 "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2394 record__parse_affinity),
2395 #ifdef HAVE_ZSTD_SUPPORT
2396 OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2397 "n", "Compressed records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
2398 record__parse_comp_level),
2399 #endif
2400 OPT_CALLBACK(0, "max-size", &record.output_max_size,
2401 "size", "Limit the maximum size of the output file", parse_output_max_size),
2402 OPT_END()
2403 };
2404
2405 struct option *record_options = __record_options;
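/*
 * A few illustrative invocations exercising the options above (workload
 * names and pids are made up):
 *
 *   perf record -F 99 -g -- ./myworkload      # 99 Hz, with call graphs
 *   perf record -a --switch-output=1G -k monotonic
 *   perf record -p 1234 --max-size=500M -o pid.data
 */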
2406
2407 int cmd_record(int argc, const char **argv)
2408 {
2409 int err;
2410 struct record *rec = &record;
2411 char errbuf[BUFSIZ];
2412
2413 setlocale(LC_ALL, "");
2414
2415 #ifndef HAVE_LIBBPF_SUPPORT
2416 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2417 set_nobuild('\0', "clang-path", true);
2418 set_nobuild('\0', "clang-opt", true);
2419 # undef set_nobuild
2420 #endif
2421
2422 #ifndef HAVE_BPF_PROLOGUE
2423 # if !defined (HAVE_DWARF_SUPPORT)
2424 # define REASON "NO_DWARF=1"
2425 # elif !defined (HAVE_LIBBPF_SUPPORT)
2426 # define REASON "NO_LIBBPF=1"
2427 # else
2428 # define REASON "this architecture doesn't support BPF prologue"
2429 # endif
2430 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2431 set_nobuild('\0', "vmlinux", true);
2432 # undef set_nobuild
2433 # undef REASON
2434 #endif
2435
2436 CPU_ZERO(&rec->affinity_mask);
2437 rec->opts.affinity = PERF_AFFINITY_SYS;
2438
2439 rec->evlist = evlist__new();
2440 if (rec->evlist == NULL)
2441 return -ENOMEM;
2442
2443 err = perf_config(perf_record_config, rec);
2444 if (err)
2445 return err;
2446
2447 argc = parse_options(argc, argv, record_options, record_usage,
2448 PARSE_OPT_STOP_AT_NON_OPTION);
2449 if (quiet)
2450 perf_quiet_option();
2451
2452 /* Make system wide (-a) the default target. */
2453 if (!argc && target__none(&rec->opts.target))
2454 rec->opts.target.system_wide = true;
2455
2456 if (nr_cgroups && !rec->opts.target.system_wide) {
2457 usage_with_options_msg(record_usage, record_options,
2458 "cgroup monitoring only available in system-wide mode");
2459
2460 }
2461
2462 if (rec->opts.kcore)
2463 rec->data.is_dir = true;
2464
2465 if (rec->opts.comp_level != 0) {
2466 pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2467 rec->no_buildid = true;
2468 }
2469
2470 if (rec->opts.record_switch_events &&
2471 !perf_can_record_switch_events()) {
2472 ui__error("kernel does not support recording context switch events\n");
2473 parse_options_usage(record_usage, record_options, "switch-events", 0);
2474 return -EINVAL;
2475 }
2476
2477 if (switch_output_setup(rec)) {
2478 parse_options_usage(record_usage, record_options, "switch-output", 0);
2479 return -EINVAL;
2480 }
2481
2482 if (rec->switch_output.time) {
2483 signal(SIGALRM, alarm_sig_handler);
2484 alarm(rec->switch_output.time);
2485 }
2486
2487 if (rec->switch_output.num_files) {
2488 rec->switch_output.filenames = calloc(rec->switch_output.num_files,
2489 sizeof(char *));
2490 if (!rec->switch_output.filenames)
2491 return -ENOMEM;
2492 }
2493
2494 /*
2495 * Allow aliases to facilitate the lookup of symbols for address
2496 * filters. Refer to auxtrace_parse_filters().
2497 */
2498 symbol_conf.allow_aliases = true;
2499
2500 symbol__init(NULL);
2501
2502 err = record__auxtrace_init(rec);
2503 if (err)
2504 goto out;
2505
2506 if (dry_run)
2507 goto out;
2508
2509 err = bpf__setup_stdout(rec->evlist);
2510 if (err) {
2511 bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2512 pr_err("ERROR: Setup BPF stdout failed: %s\n",
2513 errbuf);
2514 goto out;
2515 }
2516
2517 err = -ENOMEM;
2518
2519 if (rec->no_buildid_cache || rec->no_buildid) {
2520 disable_buildid_cache();
2521 } else if (rec->switch_output.enabled) {
2522 /*
2523 * In 'perf record --switch-output', disable buildid
2524 * generation by default to reduce data file switching
2525 * overhead. Still generate buildids if they are explicitly
2526 * required, using
2527 *
2528 * perf record --switch-output --no-no-buildid \
2529 * --no-no-buildid-cache
2530 *
2531 * The following code is equivalent to:
2532 *
2533 * if ((rec->no_buildid || !rec->no_buildid_set) &&
2534 * (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2535 * disable_buildid_cache();
2536 */
2537 bool disable = true;
2538
2539 if (rec->no_buildid_set && !rec->no_buildid)
2540 disable = false;
2541 if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2542 disable = false;
2543 if (disable) {
2544 rec->no_buildid = true;
2545 rec->no_buildid_cache = true;
2546 disable_buildid_cache();
2547 }
2548 }
2549
2550 if (record.opts.overwrite)
2551 record.opts.tail_synthesize = true;
2552
2553 if (rec->evlist->core.nr_entries == 0 &&
2554 __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2555 pr_err("Not enough memory for event selector list\n");
2556 goto out;
2557 }
2558
2559 if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2560 rec->opts.no_inherit = true;
2561
2562 err = target__validate(&rec->opts.target);
2563 if (err) {
2564 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2565 ui__warning("%s\n", errbuf);
2566 }
2567
2568 err = target__parse_uid(&rec->opts.target);
2569 if (err) {
2570 int saved_errno = errno;
2571
2572 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2573 ui__error("%s", errbuf);
2574
2575 err = -saved_errno;
2576 goto out;
2577 }
2578
2579 /* Enable ignoring missing threads when the -u or -p option is given. */
2580 rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2581
2582 err = -ENOMEM;
2583 if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2584 usage_with_options(record_usage, record_options);
2585
2586 err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2587 if (err)
2588 goto out;
2589
2590 /*
2591 * We take all buildids when the file contains
2592 * AUX area tracing data, because we do not decode the
2593 * trace, as that would take too long.
2594 */
2595 if (rec->opts.full_auxtrace)
2596 rec->buildid_all = true;
2597
2598 if (record_opts__config(&rec->opts)) {
2599 err = -EINVAL;
2600 goto out;
2601 }
2602
2603 if (rec->opts.nr_cblocks > nr_cblocks_max)
2604 rec->opts.nr_cblocks = nr_cblocks_max;
2605 pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2606
2607 pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2608 pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2609
2610 if (rec->opts.comp_level > comp_level_max)
2611 rec->opts.comp_level = comp_level_max;
2612 pr_debug("comp level: %d\n", rec->opts.comp_level);
2613
2614 err = __cmd_record(&record, argc, argv);
2615 out:
2616 evlist__delete(rec->evlist);
2617 symbol__exit();
2618 auxtrace_record__free(rec->itr);
2619 return err;
2620 }
2621
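/*
 * Runs in signal context.  It is expected to be installed for SIGUSR2 by
 * __cmd_record() when AUX area snapshots or --switch-output=signal are in
 * use: it hits the triggers (and starts the AUX snapshot), and the main
 * record loop then finishes the snapshot and/or switches the output file.
 */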
2622 static void snapshot_sig_handler(int sig __maybe_unused)
2623 {
2624 struct record *rec = &record;
2625
2626 if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2627 trigger_hit(&auxtrace_snapshot_trigger);
2628 auxtrace_record__snapshot_started = 1;
2629 if (auxtrace_record__snapshot_start(record.itr))
2630 trigger_error(&auxtrace_snapshot_trigger);
2631 }
2632
2633 if (switch_output_signal(rec))
2634 trigger_hit(&switch_output_trigger);
2635 }
2636
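/*
 * SIGALRM handler armed by cmd_record() above when --switch-output is given
 * a time threshold; like the SIGUSR2 handler it only fires the trigger and
 * leaves the actual file switch to the main record loop.
 */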
2637 static void alarm_sig_handler(int sig __maybe_unused)
2638 {
2639 struct record *rec = &record;
2640
2641 if (switch_output_time(rec))
2642 trigger_hit(&switch_output_trigger);
2643 }