]> git.proxmox.com Git - mirror_ubuntu-hirsute-kernel.git/blame - kernel/events/core.c
perf/core: Don't re-schedule CPU flexible events needlessly
[mirror_ubuntu-hirsute-kernel.git] / kernel / events / core.c
CommitLineData
0793a61d 1/*
57c0c15b 2 * Performance events core code:
0793a61d 3 *
98144511 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
e7e7ee2e 5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
90eec103 6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
d36b6910 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
7b732a75 8 *
57c0c15b 9 * For licensing details see kernel-base/COPYING
0793a61d
TG
10 */
11
12#include <linux/fs.h>
b9cacc7b 13#include <linux/mm.h>
0793a61d
TG
14#include <linux/cpu.h>
15#include <linux/smp.h>
2e80a82a 16#include <linux/idr.h>
04289bb9 17#include <linux/file.h>
0793a61d 18#include <linux/poll.h>
5a0e3ad6 19#include <linux/slab.h>
76e1d904 20#include <linux/hash.h>
12351ef8 21#include <linux/tick.h>
0793a61d 22#include <linux/sysfs.h>
22a4f650 23#include <linux/dcache.h>
0793a61d 24#include <linux/percpu.h>
22a4f650 25#include <linux/ptrace.h>
c277443c 26#include <linux/reboot.h>
b9cacc7b 27#include <linux/vmstat.h>
abe43400 28#include <linux/device.h>
6e5fdeed 29#include <linux/export.h>
906010b2 30#include <linux/vmalloc.h>
b9cacc7b
PZ
31#include <linux/hardirq.h>
32#include <linux/rculist.h>
0793a61d
TG
33#include <linux/uaccess.h>
34#include <linux/syscalls.h>
35#include <linux/anon_inodes.h>
aa9c4c0f 36#include <linux/kernel_stat.h>
39bed6cb 37#include <linux/cgroup.h>
cdd6c482 38#include <linux/perf_event.h>
af658dca 39#include <linux/trace_events.h>
3c502e7a 40#include <linux/hw_breakpoint.h>
c5ebcedb 41#include <linux/mm_types.h>
c464c76e 42#include <linux/module.h>
f972eb63 43#include <linux/mman.h>
b3f20785 44#include <linux/compat.h>
2541517c
AS
45#include <linux/bpf.h>
46#include <linux/filter.h>
375637bc
AS
47#include <linux/namei.h>
48#include <linux/parser.h>
0793a61d 49
76369139
FW
50#include "internal.h"
51
4e193bd4
TB
52#include <asm/irq_regs.h>
53
272325c4
PZ
54typedef int (*remote_function_f)(void *);
55
fe4b04fa 56struct remote_function_call {
e7e7ee2e 57 struct task_struct *p;
272325c4 58 remote_function_f func;
e7e7ee2e
IM
59 void *info;
60 int ret;
fe4b04fa
PZ
61};
62
63static void remote_function(void *data)
64{
65 struct remote_function_call *tfc = data;
66 struct task_struct *p = tfc->p;
67
68 if (p) {
0da4cf3e
PZ
69 /* -EAGAIN */
70 if (task_cpu(p) != smp_processor_id())
71 return;
72
73 /*
74 * Now that we're on right CPU with IRQs disabled, we can test
75 * if we hit the right task without races.
76 */
77
78 tfc->ret = -ESRCH; /* No such (running) process */
79 if (p != current)
fe4b04fa
PZ
80 return;
81 }
82
83 tfc->ret = tfc->func(tfc->info);
84}
85
86/**
87 * task_function_call - call a function on the cpu on which a task runs
88 * @p: the task to evaluate
89 * @func: the function to be called
90 * @info: the function call argument
91 *
92 * Calls the function @func when the task is currently running. This might
93 * be on the current CPU, which just calls the function directly
94 *
95 * returns: @func return value, or
96 * -ESRCH - when the process isn't running
97 * -EAGAIN - when the process moved away
98 */
99static int
272325c4 100task_function_call(struct task_struct *p, remote_function_f func, void *info)
fe4b04fa
PZ
101{
102 struct remote_function_call data = {
e7e7ee2e
IM
103 .p = p,
104 .func = func,
105 .info = info,
0da4cf3e 106 .ret = -EAGAIN,
fe4b04fa 107 };
0da4cf3e 108 int ret;
fe4b04fa 109
0da4cf3e
PZ
110 do {
111 ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
112 if (!ret)
113 ret = data.ret;
114 } while (ret == -EAGAIN);
fe4b04fa 115
0da4cf3e 116 return ret;
fe4b04fa
PZ
117}
118
119/**
120 * cpu_function_call - call a function on the cpu
121 * @func: the function to be called
122 * @info: the function call argument
123 *
124 * Calls the function @func on the remote cpu.
125 *
126 * returns: @func return value or -ENXIO when the cpu is offline
127 */
272325c4 128static int cpu_function_call(int cpu, remote_function_f func, void *info)
fe4b04fa
PZ
129{
130 struct remote_function_call data = {
e7e7ee2e
IM
131 .p = NULL,
132 .func = func,
133 .info = info,
134 .ret = -ENXIO, /* No such CPU */
fe4b04fa
PZ
135 };
136
137 smp_call_function_single(cpu, remote_function, &data, 1);
138
139 return data.ret;
140}
141
fae3fde6
PZ
142static inline struct perf_cpu_context *
143__get_cpu_context(struct perf_event_context *ctx)
144{
145 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
146}
147
148static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
149 struct perf_event_context *ctx)
0017960f 150{
fae3fde6
PZ
151 raw_spin_lock(&cpuctx->ctx.lock);
152 if (ctx)
153 raw_spin_lock(&ctx->lock);
154}
155
156static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
157 struct perf_event_context *ctx)
158{
159 if (ctx)
160 raw_spin_unlock(&ctx->lock);
161 raw_spin_unlock(&cpuctx->ctx.lock);
162}
163
63b6da39
PZ
164#define TASK_TOMBSTONE ((void *)-1L)
165
166static bool is_kernel_event(struct perf_event *event)
167{
f47c02c0 168 return READ_ONCE(event->owner) == TASK_TOMBSTONE;
63b6da39
PZ
169}
170
39a43640
PZ
171/*
172 * On task ctx scheduling...
173 *
174 * When !ctx->nr_events a task context will not be scheduled. This means
175 * we can disable the scheduler hooks (for performance) without leaving
176 * pending task ctx state.
177 *
178 * This however results in two special cases:
179 *
180 * - removing the last event from a task ctx; this is relatively straight
181 * forward and is done in __perf_remove_from_context.
182 *
183 * - adding the first event to a task ctx; this is tricky because we cannot
184 * rely on ctx->is_active and therefore cannot use event_function_call().
185 * See perf_install_in_context().
186 *
39a43640
PZ
187 * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
188 */
189
fae3fde6
PZ
190typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
191 struct perf_event_context *, void *);
192
193struct event_function_struct {
194 struct perf_event *event;
195 event_f func;
196 void *data;
197};
198
199static int event_function(void *info)
200{
201 struct event_function_struct *efs = info;
202 struct perf_event *event = efs->event;
0017960f 203 struct perf_event_context *ctx = event->ctx;
fae3fde6
PZ
204 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
205 struct perf_event_context *task_ctx = cpuctx->task_ctx;
63b6da39 206 int ret = 0;
fae3fde6
PZ
207
208 WARN_ON_ONCE(!irqs_disabled());
209
63b6da39 210 perf_ctx_lock(cpuctx, task_ctx);
fae3fde6
PZ
211 /*
212 * Since we do the IPI call without holding ctx->lock things can have
213 * changed, double check we hit the task we set out to hit.
fae3fde6
PZ
214 */
215 if (ctx->task) {
63b6da39 216 if (ctx->task != current) {
0da4cf3e 217 ret = -ESRCH;
63b6da39
PZ
218 goto unlock;
219 }
fae3fde6 220
fae3fde6
PZ
221 /*
222 * We only use event_function_call() on established contexts,
223 * and event_function() is only ever called when active (or
224 * rather, we'll have bailed in task_function_call() or the
225 * above ctx->task != current test), therefore we must have
226 * ctx->is_active here.
227 */
228 WARN_ON_ONCE(!ctx->is_active);
229 /*
230 * And since we have ctx->is_active, cpuctx->task_ctx must
231 * match.
232 */
63b6da39
PZ
233 WARN_ON_ONCE(task_ctx != ctx);
234 } else {
235 WARN_ON_ONCE(&cpuctx->ctx != ctx);
fae3fde6 236 }
63b6da39 237
fae3fde6 238 efs->func(event, cpuctx, ctx, efs->data);
63b6da39 239unlock:
fae3fde6
PZ
240 perf_ctx_unlock(cpuctx, task_ctx);
241
63b6da39 242 return ret;
fae3fde6
PZ
243}
244
fae3fde6 245static void event_function_call(struct perf_event *event, event_f func, void *data)
0017960f
PZ
246{
247 struct perf_event_context *ctx = event->ctx;
63b6da39 248 struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
fae3fde6
PZ
249 struct event_function_struct efs = {
250 .event = event,
251 .func = func,
252 .data = data,
253 };
0017960f 254
c97f4736
PZ
255 if (!event->parent) {
256 /*
257 * If this is a !child event, we must hold ctx::mutex to
258 * stabilize the the event->ctx relation. See
259 * perf_event_ctx_lock().
260 */
261 lockdep_assert_held(&ctx->mutex);
262 }
0017960f
PZ
263
264 if (!task) {
fae3fde6 265 cpu_function_call(event->cpu, event_function, &efs);
0017960f
PZ
266 return;
267 }
268
63b6da39
PZ
269 if (task == TASK_TOMBSTONE)
270 return;
271
a096309b 272again:
fae3fde6 273 if (!task_function_call(task, event_function, &efs))
0017960f
PZ
274 return;
275
276 raw_spin_lock_irq(&ctx->lock);
63b6da39
PZ
277 /*
278 * Reload the task pointer, it might have been changed by
279 * a concurrent perf_event_context_sched_out().
280 */
281 task = ctx->task;
a096309b
PZ
282 if (task == TASK_TOMBSTONE) {
283 raw_spin_unlock_irq(&ctx->lock);
284 return;
0017960f 285 }
a096309b
PZ
286 if (ctx->is_active) {
287 raw_spin_unlock_irq(&ctx->lock);
288 goto again;
289 }
290 func(event, NULL, ctx, data);
0017960f
PZ
291 raw_spin_unlock_irq(&ctx->lock);
292}
293
cca20946
PZ
294/*
295 * Similar to event_function_call() + event_function(), but hard assumes IRQs
296 * are already disabled and we're on the right CPU.
297 */
298static void event_function_local(struct perf_event *event, event_f func, void *data)
299{
300 struct perf_event_context *ctx = event->ctx;
301 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
302 struct task_struct *task = READ_ONCE(ctx->task);
303 struct perf_event_context *task_ctx = NULL;
304
305 WARN_ON_ONCE(!irqs_disabled());
306
307 if (task) {
308 if (task == TASK_TOMBSTONE)
309 return;
310
311 task_ctx = ctx;
312 }
313
314 perf_ctx_lock(cpuctx, task_ctx);
315
316 task = ctx->task;
317 if (task == TASK_TOMBSTONE)
318 goto unlock;
319
320 if (task) {
321 /*
322 * We must be either inactive or active and the right task,
323 * otherwise we're screwed, since we cannot IPI to somewhere
324 * else.
325 */
326 if (ctx->is_active) {
327 if (WARN_ON_ONCE(task != current))
328 goto unlock;
329
330 if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
331 goto unlock;
332 }
333 } else {
334 WARN_ON_ONCE(&cpuctx->ctx != ctx);
335 }
336
337 func(event, cpuctx, ctx, data);
338unlock:
339 perf_ctx_unlock(cpuctx, task_ctx);
340}
341
e5d1367f
SE
342#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
343 PERF_FLAG_FD_OUTPUT |\
a21b0b35
YD
344 PERF_FLAG_PID_CGROUP |\
345 PERF_FLAG_FD_CLOEXEC)
e5d1367f 346
bce38cd5
SE
347/*
348 * branch priv levels that need permission checks
349 */
350#define PERF_SAMPLE_BRANCH_PERM_PLM \
351 (PERF_SAMPLE_BRANCH_KERNEL |\
352 PERF_SAMPLE_BRANCH_HV)
353
0b3fcf17
SE
354enum event_type_t {
355 EVENT_FLEXIBLE = 0x1,
356 EVENT_PINNED = 0x2,
3cbaa590 357 EVENT_TIME = 0x4,
0b3fcf17
SE
358 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
359};
360
e5d1367f
SE
361/*
362 * perf_sched_events : >0 events exist
363 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
364 */
9107c89e
PZ
365
366static void perf_sched_delayed(struct work_struct *work);
367DEFINE_STATIC_KEY_FALSE(perf_sched_events);
368static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
369static DEFINE_MUTEX(perf_sched_mutex);
370static atomic_t perf_sched_count;
371
e5d1367f 372static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
ba532500 373static DEFINE_PER_CPU(int, perf_sched_cb_usages);
f2fb6bef 374static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
e5d1367f 375
cdd6c482
IM
376static atomic_t nr_mmap_events __read_mostly;
377static atomic_t nr_comm_events __read_mostly;
378static atomic_t nr_task_events __read_mostly;
948b26b6 379static atomic_t nr_freq_events __read_mostly;
45ac1403 380static atomic_t nr_switch_events __read_mostly;
9ee318a7 381
108b02cf
PZ
382static LIST_HEAD(pmus);
383static DEFINE_MUTEX(pmus_lock);
384static struct srcu_struct pmus_srcu;
385
0764771d 386/*
cdd6c482 387 * perf event paranoia level:
0fbdea19
IM
388 * -1 - not paranoid at all
389 * 0 - disallow raw tracepoint access for unpriv
cdd6c482 390 * 1 - disallow cpu events for unpriv
0fbdea19 391 * 2 - disallow kernel profiling for unpriv
0764771d 392 */
0161028b 393int sysctl_perf_event_paranoid __read_mostly = 2;
0764771d 394
20443384
FW
395/* Minimum for 512 kiB + 1 user control page */
396int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
df58ab24
PZ
397
398/*
cdd6c482 399 * max perf event sample rate
df58ab24 400 */
14c63f17
DH
401#define DEFAULT_MAX_SAMPLE_RATE 100000
402#define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
403#define DEFAULT_CPU_TIME_MAX_PERCENT 25
404
405int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
406
407static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
408static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
409
d9494cb4
PZ
410static int perf_sample_allowed_ns __read_mostly =
411 DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
14c63f17 412
18ab2cd3 413static void update_perf_cpu_limits(void)
14c63f17
DH
414{
415 u64 tmp = perf_sample_period_ns;
416
417 tmp *= sysctl_perf_cpu_time_max_percent;
91a612ee
PZ
418 tmp = div_u64(tmp, 100);
419 if (!tmp)
420 tmp = 1;
421
422 WRITE_ONCE(perf_sample_allowed_ns, tmp);
14c63f17 423}
163ec435 424
9e630205
SE
425static int perf_rotate_context(struct perf_cpu_context *cpuctx);
426
163ec435
PZ
427int perf_proc_update_handler(struct ctl_table *table, int write,
428 void __user *buffer, size_t *lenp,
429 loff_t *ppos)
430{
723478c8 431 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
163ec435
PZ
432
433 if (ret || !write)
434 return ret;
435
ab7fdefb
KL
436 /*
437 * If throttling is disabled don't allow the write:
438 */
439 if (sysctl_perf_cpu_time_max_percent == 100 ||
440 sysctl_perf_cpu_time_max_percent == 0)
441 return -EINVAL;
442
163ec435 443 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
14c63f17
DH
444 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
445 update_perf_cpu_limits();
446
447 return 0;
448}
449
450int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
451
452int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
453 void __user *buffer, size_t *lenp,
454 loff_t *ppos)
455{
456 int ret = proc_dointvec(table, write, buffer, lenp, ppos);
457
458 if (ret || !write)
459 return ret;
460
b303e7c1
PZ
461 if (sysctl_perf_cpu_time_max_percent == 100 ||
462 sysctl_perf_cpu_time_max_percent == 0) {
91a612ee
PZ
463 printk(KERN_WARNING
464 "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
465 WRITE_ONCE(perf_sample_allowed_ns, 0);
466 } else {
467 update_perf_cpu_limits();
468 }
163ec435
PZ
469
470 return 0;
471}
1ccd1549 472
14c63f17
DH
473/*
474 * perf samples are done in some very critical code paths (NMIs).
475 * If they take too much CPU time, the system can lock up and not
476 * get any real work done. This will drop the sample rate when
477 * we detect that events are taking too long.
478 */
479#define NR_ACCUMULATED_SAMPLES 128
d9494cb4 480static DEFINE_PER_CPU(u64, running_sample_length);
14c63f17 481
91a612ee
PZ
482static u64 __report_avg;
483static u64 __report_allowed;
484
6a02ad66 485static void perf_duration_warn(struct irq_work *w)
14c63f17 486{
0d87d7ec 487 printk_ratelimited(KERN_INFO
91a612ee
PZ
488 "perf: interrupt took too long (%lld > %lld), lowering "
489 "kernel.perf_event_max_sample_rate to %d\n",
490 __report_avg, __report_allowed,
491 sysctl_perf_event_sample_rate);
6a02ad66
PZ
492}
493
494static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
495
496void perf_sample_event_took(u64 sample_len_ns)
497{
91a612ee
PZ
498 u64 max_len = READ_ONCE(perf_sample_allowed_ns);
499 u64 running_len;
500 u64 avg_len;
501 u32 max;
14c63f17 502
91a612ee 503 if (max_len == 0)
14c63f17
DH
504 return;
505
91a612ee
PZ
506 /* Decay the counter by 1 average sample. */
507 running_len = __this_cpu_read(running_sample_length);
508 running_len -= running_len/NR_ACCUMULATED_SAMPLES;
509 running_len += sample_len_ns;
510 __this_cpu_write(running_sample_length, running_len);
14c63f17
DH
511
512 /*
91a612ee
PZ
513 * Note: this will be biased artifically low until we have
514 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
14c63f17
DH
515 * from having to maintain a count.
516 */
91a612ee
PZ
517 avg_len = running_len/NR_ACCUMULATED_SAMPLES;
518 if (avg_len <= max_len)
14c63f17
DH
519 return;
520
91a612ee
PZ
521 __report_avg = avg_len;
522 __report_allowed = max_len;
14c63f17 523
91a612ee
PZ
524 /*
525 * Compute a throttle threshold 25% below the current duration.
526 */
527 avg_len += avg_len / 4;
528 max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
529 if (avg_len < max)
530 max /= (u32)avg_len;
531 else
532 max = 1;
14c63f17 533
91a612ee
PZ
534 WRITE_ONCE(perf_sample_allowed_ns, avg_len);
535 WRITE_ONCE(max_samples_per_tick, max);
536
537 sysctl_perf_event_sample_rate = max * HZ;
538 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
6a02ad66 539
cd578abb 540 if (!irq_work_queue(&perf_duration_work)) {
91a612ee 541 early_printk("perf: interrupt took too long (%lld > %lld), lowering "
cd578abb 542 "kernel.perf_event_max_sample_rate to %d\n",
91a612ee 543 __report_avg, __report_allowed,
cd578abb
PZ
544 sysctl_perf_event_sample_rate);
545 }
14c63f17
DH
546}
547
cdd6c482 548static atomic64_t perf_event_id;
a96bbc16 549
0b3fcf17
SE
550static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
551 enum event_type_t event_type);
552
553static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
e5d1367f
SE
554 enum event_type_t event_type,
555 struct task_struct *task);
556
557static void update_context_time(struct perf_event_context *ctx);
558static u64 perf_event_time(struct perf_event *event);
0b3fcf17 559
cdd6c482 560void __weak perf_event_print_debug(void) { }
0793a61d 561
84c79910 562extern __weak const char *perf_pmu_name(void)
0793a61d 563{
84c79910 564 return "pmu";
0793a61d
TG
565}
566
0b3fcf17
SE
567static inline u64 perf_clock(void)
568{
569 return local_clock();
570}
571
34f43927
PZ
572static inline u64 perf_event_clock(struct perf_event *event)
573{
574 return event->clock();
575}
576
e5d1367f
SE
577#ifdef CONFIG_CGROUP_PERF
578
e5d1367f
SE
579static inline bool
580perf_cgroup_match(struct perf_event *event)
581{
582 struct perf_event_context *ctx = event->ctx;
583 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
584
ef824fa1
TH
585 /* @event doesn't care about cgroup */
586 if (!event->cgrp)
587 return true;
588
589 /* wants specific cgroup scope but @cpuctx isn't associated with any */
590 if (!cpuctx->cgrp)
591 return false;
592
593 /*
594 * Cgroup scoping is recursive. An event enabled for a cgroup is
595 * also enabled for all its descendant cgroups. If @cpuctx's
596 * cgroup is a descendant of @event's (the test covers identity
597 * case), it's a match.
598 */
599 return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
600 event->cgrp->css.cgroup);
e5d1367f
SE
601}
602
e5d1367f
SE
603static inline void perf_detach_cgroup(struct perf_event *event)
604{
4e2ba650 605 css_put(&event->cgrp->css);
e5d1367f
SE
606 event->cgrp = NULL;
607}
608
609static inline int is_cgroup_event(struct perf_event *event)
610{
611 return event->cgrp != NULL;
612}
613
614static inline u64 perf_cgroup_event_time(struct perf_event *event)
615{
616 struct perf_cgroup_info *t;
617
618 t = per_cpu_ptr(event->cgrp->info, event->cpu);
619 return t->time;
620}
621
622static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
623{
624 struct perf_cgroup_info *info;
625 u64 now;
626
627 now = perf_clock();
628
629 info = this_cpu_ptr(cgrp->info);
630
631 info->time += now - info->timestamp;
632 info->timestamp = now;
633}
634
635static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
636{
637 struct perf_cgroup *cgrp_out = cpuctx->cgrp;
638 if (cgrp_out)
639 __update_cgrp_time(cgrp_out);
640}
641
642static inline void update_cgrp_time_from_event(struct perf_event *event)
643{
3f7cce3c
SE
644 struct perf_cgroup *cgrp;
645
e5d1367f 646 /*
3f7cce3c
SE
647 * ensure we access cgroup data only when needed and
648 * when we know the cgroup is pinned (css_get)
e5d1367f 649 */
3f7cce3c 650 if (!is_cgroup_event(event))
e5d1367f
SE
651 return;
652
614e4c4e 653 cgrp = perf_cgroup_from_task(current, event->ctx);
3f7cce3c
SE
654 /*
655 * Do not update time when cgroup is not active
656 */
657 if (cgrp == event->cgrp)
658 __update_cgrp_time(event->cgrp);
e5d1367f
SE
659}
660
661static inline void
3f7cce3c
SE
662perf_cgroup_set_timestamp(struct task_struct *task,
663 struct perf_event_context *ctx)
e5d1367f
SE
664{
665 struct perf_cgroup *cgrp;
666 struct perf_cgroup_info *info;
667
3f7cce3c
SE
668 /*
669 * ctx->lock held by caller
670 * ensure we do not access cgroup data
671 * unless we have the cgroup pinned (css_get)
672 */
673 if (!task || !ctx->nr_cgroups)
e5d1367f
SE
674 return;
675
614e4c4e 676 cgrp = perf_cgroup_from_task(task, ctx);
e5d1367f 677 info = this_cpu_ptr(cgrp->info);
3f7cce3c 678 info->timestamp = ctx->timestamp;
e5d1367f
SE
679}
680
058fe1c0
DCC
681static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
682
e5d1367f
SE
683#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
684#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
685
686/*
687 * reschedule events based on the cgroup constraint of task.
688 *
689 * mode SWOUT : schedule out everything
690 * mode SWIN : schedule in based on cgroup for next
691 */
18ab2cd3 692static void perf_cgroup_switch(struct task_struct *task, int mode)
e5d1367f
SE
693{
694 struct perf_cpu_context *cpuctx;
058fe1c0 695 struct list_head *list;
e5d1367f
SE
696 unsigned long flags;
697
698 /*
058fe1c0
DCC
699 * Disable interrupts and preemption to avoid this CPU's
700 * cgrp_cpuctx_entry to change under us.
e5d1367f
SE
701 */
702 local_irq_save(flags);
703
058fe1c0
DCC
704 list = this_cpu_ptr(&cgrp_cpuctx_list);
705 list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
706 WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
e5d1367f 707
058fe1c0
DCC
708 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
709 perf_pmu_disable(cpuctx->ctx.pmu);
e5d1367f 710
058fe1c0
DCC
711 if (mode & PERF_CGROUP_SWOUT) {
712 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
713 /*
714 * must not be done before ctxswout due
715 * to event_filter_match() in event_sched_out()
716 */
717 cpuctx->cgrp = NULL;
718 }
e5d1367f 719
058fe1c0
DCC
720 if (mode & PERF_CGROUP_SWIN) {
721 WARN_ON_ONCE(cpuctx->cgrp);
722 /*
723 * set cgrp before ctxsw in to allow
724 * event_filter_match() to not have to pass
725 * task around
726 * we pass the cpuctx->ctx to perf_cgroup_from_task()
727 * because cgorup events are only per-cpu
728 */
729 cpuctx->cgrp = perf_cgroup_from_task(task,
730 &cpuctx->ctx);
731 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
e5d1367f 732 }
058fe1c0
DCC
733 perf_pmu_enable(cpuctx->ctx.pmu);
734 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
e5d1367f
SE
735 }
736
e5d1367f
SE
737 local_irq_restore(flags);
738}
739
a8d757ef
SE
740static inline void perf_cgroup_sched_out(struct task_struct *task,
741 struct task_struct *next)
e5d1367f 742{
a8d757ef
SE
743 struct perf_cgroup *cgrp1;
744 struct perf_cgroup *cgrp2 = NULL;
745
ddaaf4e2 746 rcu_read_lock();
a8d757ef
SE
747 /*
748 * we come here when we know perf_cgroup_events > 0
614e4c4e
SE
749 * we do not need to pass the ctx here because we know
750 * we are holding the rcu lock
a8d757ef 751 */
614e4c4e 752 cgrp1 = perf_cgroup_from_task(task, NULL);
70a01657 753 cgrp2 = perf_cgroup_from_task(next, NULL);
a8d757ef
SE
754
755 /*
756 * only schedule out current cgroup events if we know
757 * that we are switching to a different cgroup. Otherwise,
758 * do no touch the cgroup events.
759 */
760 if (cgrp1 != cgrp2)
761 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
ddaaf4e2
SE
762
763 rcu_read_unlock();
e5d1367f
SE
764}
765
a8d757ef
SE
766static inline void perf_cgroup_sched_in(struct task_struct *prev,
767 struct task_struct *task)
e5d1367f 768{
a8d757ef
SE
769 struct perf_cgroup *cgrp1;
770 struct perf_cgroup *cgrp2 = NULL;
771
ddaaf4e2 772 rcu_read_lock();
a8d757ef
SE
773 /*
774 * we come here when we know perf_cgroup_events > 0
614e4c4e
SE
775 * we do not need to pass the ctx here because we know
776 * we are holding the rcu lock
a8d757ef 777 */
614e4c4e 778 cgrp1 = perf_cgroup_from_task(task, NULL);
614e4c4e 779 cgrp2 = perf_cgroup_from_task(prev, NULL);
a8d757ef
SE
780
781 /*
782 * only need to schedule in cgroup events if we are changing
783 * cgroup during ctxsw. Cgroup events were not scheduled
784 * out of ctxsw out if that was not the case.
785 */
786 if (cgrp1 != cgrp2)
787 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
ddaaf4e2
SE
788
789 rcu_read_unlock();
e5d1367f
SE
790}
791
792static inline int perf_cgroup_connect(int fd, struct perf_event *event,
793 struct perf_event_attr *attr,
794 struct perf_event *group_leader)
795{
796 struct perf_cgroup *cgrp;
797 struct cgroup_subsys_state *css;
2903ff01
AV
798 struct fd f = fdget(fd);
799 int ret = 0;
e5d1367f 800
2903ff01 801 if (!f.file)
e5d1367f
SE
802 return -EBADF;
803
b583043e 804 css = css_tryget_online_from_dir(f.file->f_path.dentry,
ec903c0c 805 &perf_event_cgrp_subsys);
3db272c0
LZ
806 if (IS_ERR(css)) {
807 ret = PTR_ERR(css);
808 goto out;
809 }
e5d1367f
SE
810
811 cgrp = container_of(css, struct perf_cgroup, css);
812 event->cgrp = cgrp;
813
814 /*
815 * all events in a group must monitor
816 * the same cgroup because a task belongs
817 * to only one perf cgroup at a time
818 */
819 if (group_leader && group_leader->cgrp != cgrp) {
820 perf_detach_cgroup(event);
821 ret = -EINVAL;
e5d1367f 822 }
3db272c0 823out:
2903ff01 824 fdput(f);
e5d1367f
SE
825 return ret;
826}
827
828static inline void
829perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
830{
831 struct perf_cgroup_info *t;
832 t = per_cpu_ptr(event->cgrp->info, event->cpu);
833 event->shadow_ctx_time = now - t->timestamp;
834}
835
836static inline void
837perf_cgroup_defer_enabled(struct perf_event *event)
838{
839 /*
840 * when the current task's perf cgroup does not match
841 * the event's, we need to remember to call the
842 * perf_mark_enable() function the first time a task with
843 * a matching perf cgroup is scheduled in.
844 */
845 if (is_cgroup_event(event) && !perf_cgroup_match(event))
846 event->cgrp_defer_enabled = 1;
847}
848
849static inline void
850perf_cgroup_mark_enabled(struct perf_event *event,
851 struct perf_event_context *ctx)
852{
853 struct perf_event *sub;
854 u64 tstamp = perf_event_time(event);
855
856 if (!event->cgrp_defer_enabled)
857 return;
858
859 event->cgrp_defer_enabled = 0;
860
861 event->tstamp_enabled = tstamp - event->total_time_enabled;
862 list_for_each_entry(sub, &event->sibling_list, group_entry) {
863 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
864 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
865 sub->cgrp_defer_enabled = 0;
866 }
867 }
868}
db4a8356
DCC
869
870/*
871 * Update cpuctx->cgrp so that it is set when first cgroup event is added and
872 * cleared when last cgroup event is removed.
873 */
874static inline void
875list_update_cgroup_event(struct perf_event *event,
876 struct perf_event_context *ctx, bool add)
877{
878 struct perf_cpu_context *cpuctx;
058fe1c0 879 struct list_head *cpuctx_entry;
db4a8356
DCC
880
881 if (!is_cgroup_event(event))
882 return;
883
884 if (add && ctx->nr_cgroups++)
885 return;
886 else if (!add && --ctx->nr_cgroups)
887 return;
888 /*
889 * Because cgroup events are always per-cpu events,
890 * this will always be called from the right CPU.
891 */
892 cpuctx = __get_cpu_context(ctx);
058fe1c0
DCC
893 cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
894 /* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/
895 if (add) {
896 list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
897 if (perf_cgroup_from_task(current, ctx) == event->cgrp)
898 cpuctx->cgrp = event->cgrp;
899 } else {
900 list_del(cpuctx_entry);
8fc31ce8 901 cpuctx->cgrp = NULL;
058fe1c0 902 }
db4a8356
DCC
903}
904
e5d1367f
SE
905#else /* !CONFIG_CGROUP_PERF */
906
907static inline bool
908perf_cgroup_match(struct perf_event *event)
909{
910 return true;
911}
912
913static inline void perf_detach_cgroup(struct perf_event *event)
914{}
915
916static inline int is_cgroup_event(struct perf_event *event)
917{
918 return 0;
919}
920
921static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
922{
923 return 0;
924}
925
926static inline void update_cgrp_time_from_event(struct perf_event *event)
927{
928}
929
930static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
931{
932}
933
a8d757ef
SE
934static inline void perf_cgroup_sched_out(struct task_struct *task,
935 struct task_struct *next)
e5d1367f
SE
936{
937}
938
a8d757ef
SE
939static inline void perf_cgroup_sched_in(struct task_struct *prev,
940 struct task_struct *task)
e5d1367f
SE
941{
942}
943
944static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
945 struct perf_event_attr *attr,
946 struct perf_event *group_leader)
947{
948 return -EINVAL;
949}
950
951static inline void
3f7cce3c
SE
952perf_cgroup_set_timestamp(struct task_struct *task,
953 struct perf_event_context *ctx)
e5d1367f
SE
954{
955}
956
957void
958perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
959{
960}
961
962static inline void
963perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
964{
965}
966
967static inline u64 perf_cgroup_event_time(struct perf_event *event)
968{
969 return 0;
970}
971
972static inline void
973perf_cgroup_defer_enabled(struct perf_event *event)
974{
975}
976
977static inline void
978perf_cgroup_mark_enabled(struct perf_event *event,
979 struct perf_event_context *ctx)
980{
981}
db4a8356
DCC
982
983static inline void
984list_update_cgroup_event(struct perf_event *event,
985 struct perf_event_context *ctx, bool add)
986{
987}
988
e5d1367f
SE
989#endif
990
9e630205
SE
991/*
992 * set default to be dependent on timer tick just
993 * like original code
994 */
995#define PERF_CPU_HRTIMER (1000 / HZ)
996/*
997 * function must be called with interrupts disbled
998 */
272325c4 999static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
9e630205
SE
1000{
1001 struct perf_cpu_context *cpuctx;
9e630205
SE
1002 int rotations = 0;
1003
1004 WARN_ON(!irqs_disabled());
1005
1006 cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
9e630205
SE
1007 rotations = perf_rotate_context(cpuctx);
1008
4cfafd30
PZ
1009 raw_spin_lock(&cpuctx->hrtimer_lock);
1010 if (rotations)
9e630205 1011 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
4cfafd30
PZ
1012 else
1013 cpuctx->hrtimer_active = 0;
1014 raw_spin_unlock(&cpuctx->hrtimer_lock);
9e630205 1015
4cfafd30 1016 return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
9e630205
SE
1017}
1018
272325c4 1019static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
9e630205 1020{
272325c4 1021 struct hrtimer *timer = &cpuctx->hrtimer;
9e630205 1022 struct pmu *pmu = cpuctx->ctx.pmu;
272325c4 1023 u64 interval;
9e630205
SE
1024
1025 /* no multiplexing needed for SW PMU */
1026 if (pmu->task_ctx_nr == perf_sw_context)
1027 return;
1028
62b85639
SE
1029 /*
1030 * check default is sane, if not set then force to
1031 * default interval (1/tick)
1032 */
272325c4
PZ
1033 interval = pmu->hrtimer_interval_ms;
1034 if (interval < 1)
1035 interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
62b85639 1036
272325c4 1037 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
9e630205 1038
4cfafd30
PZ
1039 raw_spin_lock_init(&cpuctx->hrtimer_lock);
1040 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
272325c4 1041 timer->function = perf_mux_hrtimer_handler;
9e630205
SE
1042}
1043
272325c4 1044static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
9e630205 1045{
272325c4 1046 struct hrtimer *timer = &cpuctx->hrtimer;
9e630205 1047 struct pmu *pmu = cpuctx->ctx.pmu;
4cfafd30 1048 unsigned long flags;
9e630205
SE
1049
1050 /* not for SW PMU */
1051 if (pmu->task_ctx_nr == perf_sw_context)
272325c4 1052 return 0;
9e630205 1053
4cfafd30
PZ
1054 raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
1055 if (!cpuctx->hrtimer_active) {
1056 cpuctx->hrtimer_active = 1;
1057 hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
1058 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
1059 }
1060 raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
9e630205 1061
272325c4 1062 return 0;
9e630205
SE
1063}
1064
33696fc0 1065void perf_pmu_disable(struct pmu *pmu)
9e35ad38 1066{
33696fc0
PZ
1067 int *count = this_cpu_ptr(pmu->pmu_disable_count);
1068 if (!(*count)++)
1069 pmu->pmu_disable(pmu);
9e35ad38 1070}
9e35ad38 1071
33696fc0 1072void perf_pmu_enable(struct pmu *pmu)
9e35ad38 1073{
33696fc0
PZ
1074 int *count = this_cpu_ptr(pmu->pmu_disable_count);
1075 if (!--(*count))
1076 pmu->pmu_enable(pmu);
9e35ad38 1077}
9e35ad38 1078
2fde4f94 1079static DEFINE_PER_CPU(struct list_head, active_ctx_list);
e9d2b064
PZ
1080
1081/*
2fde4f94
MR
1082 * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
1083 * perf_event_task_tick() are fully serialized because they're strictly cpu
1084 * affine and perf_event_ctx{activate,deactivate} are called with IRQs
1085 * disabled, while perf_event_task_tick is called from IRQ context.
e9d2b064 1086 */
2fde4f94 1087static void perf_event_ctx_activate(struct perf_event_context *ctx)
9e35ad38 1088{
2fde4f94 1089 struct list_head *head = this_cpu_ptr(&active_ctx_list);
b5ab4cd5 1090
e9d2b064 1091 WARN_ON(!irqs_disabled());
b5ab4cd5 1092
2fde4f94
MR
1093 WARN_ON(!list_empty(&ctx->active_ctx_list));
1094
1095 list_add(&ctx->active_ctx_list, head);
1096}
1097
1098static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
1099{
1100 WARN_ON(!irqs_disabled());
1101
1102 WARN_ON(list_empty(&ctx->active_ctx_list));
1103
1104 list_del_init(&ctx->active_ctx_list);
9e35ad38 1105}
9e35ad38 1106
cdd6c482 1107static void get_ctx(struct perf_event_context *ctx)
a63eaf34 1108{
e5289d4a 1109 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
a63eaf34
PM
1110}
1111
4af57ef2
YZ
1112static void free_ctx(struct rcu_head *head)
1113{
1114 struct perf_event_context *ctx;
1115
1116 ctx = container_of(head, struct perf_event_context, rcu_head);
1117 kfree(ctx->task_ctx_data);
1118 kfree(ctx);
1119}
1120
cdd6c482 1121static void put_ctx(struct perf_event_context *ctx)
a63eaf34 1122{
564c2b21
PM
1123 if (atomic_dec_and_test(&ctx->refcount)) {
1124 if (ctx->parent_ctx)
1125 put_ctx(ctx->parent_ctx);
63b6da39 1126 if (ctx->task && ctx->task != TASK_TOMBSTONE)
c93f7669 1127 put_task_struct(ctx->task);
4af57ef2 1128 call_rcu(&ctx->rcu_head, free_ctx);
564c2b21 1129 }
a63eaf34
PM
1130}
1131
f63a8daa
PZ
1132/*
1133 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
1134 * perf_pmu_migrate_context() we need some magic.
1135 *
1136 * Those places that change perf_event::ctx will hold both
1137 * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
1138 *
8b10c5e2
PZ
1139 * Lock ordering is by mutex address. There are two other sites where
1140 * perf_event_context::mutex nests and those are:
1141 *
1142 * - perf_event_exit_task_context() [ child , 0 ]
8ba289b8
PZ
1143 * perf_event_exit_event()
1144 * put_event() [ parent, 1 ]
8b10c5e2
PZ
1145 *
1146 * - perf_event_init_context() [ parent, 0 ]
1147 * inherit_task_group()
1148 * inherit_group()
1149 * inherit_event()
1150 * perf_event_alloc()
1151 * perf_init_event()
1152 * perf_try_init_event() [ child , 1 ]
1153 *
1154 * While it appears there is an obvious deadlock here -- the parent and child
1155 * nesting levels are inverted between the two. This is in fact safe because
1156 * life-time rules separate them. That is an exiting task cannot fork, and a
1157 * spawning task cannot (yet) exit.
1158 *
1159 * But remember that that these are parent<->child context relations, and
1160 * migration does not affect children, therefore these two orderings should not
1161 * interact.
f63a8daa
PZ
1162 *
1163 * The change in perf_event::ctx does not affect children (as claimed above)
1164 * because the sys_perf_event_open() case will install a new event and break
1165 * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
1166 * concerned with cpuctx and that doesn't have children.
1167 *
1168 * The places that change perf_event::ctx will issue:
1169 *
1170 * perf_remove_from_context();
1171 * synchronize_rcu();
1172 * perf_install_in_context();
1173 *
1174 * to affect the change. The remove_from_context() + synchronize_rcu() should
1175 * quiesce the event, after which we can install it in the new location. This
1176 * means that only external vectors (perf_fops, prctl) can perturb the event
1177 * while in transit. Therefore all such accessors should also acquire
1178 * perf_event_context::mutex to serialize against this.
1179 *
1180 * However; because event->ctx can change while we're waiting to acquire
1181 * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
1182 * function.
1183 *
1184 * Lock order:
79c9ce57 1185 * cred_guard_mutex
f63a8daa
PZ
1186 * task_struct::perf_event_mutex
1187 * perf_event_context::mutex
f63a8daa 1188 * perf_event::child_mutex;
07c4a776 1189 * perf_event_context::lock
f63a8daa
PZ
1190 * perf_event::mmap_mutex
1191 * mmap_sem
1192 */
a83fe28e
PZ
1193static struct perf_event_context *
1194perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
f63a8daa
PZ
1195{
1196 struct perf_event_context *ctx;
1197
1198again:
1199 rcu_read_lock();
1200 ctx = ACCESS_ONCE(event->ctx);
1201 if (!atomic_inc_not_zero(&ctx->refcount)) {
1202 rcu_read_unlock();
1203 goto again;
1204 }
1205 rcu_read_unlock();
1206
a83fe28e 1207 mutex_lock_nested(&ctx->mutex, nesting);
f63a8daa
PZ
1208 if (event->ctx != ctx) {
1209 mutex_unlock(&ctx->mutex);
1210 put_ctx(ctx);
1211 goto again;
1212 }
1213
1214 return ctx;
1215}
1216
a83fe28e
PZ
1217static inline struct perf_event_context *
1218perf_event_ctx_lock(struct perf_event *event)
1219{
1220 return perf_event_ctx_lock_nested(event, 0);
1221}
1222
f63a8daa
PZ
1223static void perf_event_ctx_unlock(struct perf_event *event,
1224 struct perf_event_context *ctx)
1225{
1226 mutex_unlock(&ctx->mutex);
1227 put_ctx(ctx);
1228}
1229
211de6eb
PZ
1230/*
1231 * This must be done under the ctx->lock, such as to serialize against
1232 * context_equiv(), therefore we cannot call put_ctx() since that might end up
1233 * calling scheduler related locks and ctx->lock nests inside those.
1234 */
1235static __must_check struct perf_event_context *
1236unclone_ctx(struct perf_event_context *ctx)
71a851b4 1237{
211de6eb
PZ
1238 struct perf_event_context *parent_ctx = ctx->parent_ctx;
1239
1240 lockdep_assert_held(&ctx->lock);
1241
1242 if (parent_ctx)
71a851b4 1243 ctx->parent_ctx = NULL;
5a3126d4 1244 ctx->generation++;
211de6eb
PZ
1245
1246 return parent_ctx;
71a851b4
PZ
1247}
1248
6844c09d
ACM
1249static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1250{
1251 /*
1252 * only top level events have the pid namespace they were created in
1253 */
1254 if (event->parent)
1255 event = event->parent;
1256
1257 return task_tgid_nr_ns(p, event->ns);
1258}
1259
1260static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1261{
1262 /*
1263 * only top level events have the pid namespace they were created in
1264 */
1265 if (event->parent)
1266 event = event->parent;
1267
1268 return task_pid_nr_ns(p, event->ns);
1269}
1270
7f453c24 1271/*
cdd6c482 1272 * If we inherit events we want to return the parent event id
7f453c24
PZ
1273 * to userspace.
1274 */
cdd6c482 1275static u64 primary_event_id(struct perf_event *event)
7f453c24 1276{
cdd6c482 1277 u64 id = event->id;
7f453c24 1278
cdd6c482
IM
1279 if (event->parent)
1280 id = event->parent->id;
7f453c24
PZ
1281
1282 return id;
1283}
1284
25346b93 1285/*
cdd6c482 1286 * Get the perf_event_context for a task and lock it.
63b6da39 1287 *
25346b93
PM
1288 * This has to cope with with the fact that until it is locked,
1289 * the context could get moved to another task.
1290 */
cdd6c482 1291static struct perf_event_context *
8dc85d54 1292perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
25346b93 1293{
cdd6c482 1294 struct perf_event_context *ctx;
25346b93 1295
9ed6060d 1296retry:
058ebd0e
PZ
1297 /*
1298 * One of the few rules of preemptible RCU is that one cannot do
1299 * rcu_read_unlock() while holding a scheduler (or nested) lock when
2fd59077 1300 * part of the read side critical section was irqs-enabled -- see
058ebd0e
PZ
1301 * rcu_read_unlock_special().
1302 *
1303 * Since ctx->lock nests under rq->lock we must ensure the entire read
2fd59077 1304 * side critical section has interrupts disabled.
058ebd0e 1305 */
2fd59077 1306 local_irq_save(*flags);
058ebd0e 1307 rcu_read_lock();
8dc85d54 1308 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
25346b93
PM
1309 if (ctx) {
1310 /*
1311 * If this context is a clone of another, it might
1312 * get swapped for another underneath us by
cdd6c482 1313 * perf_event_task_sched_out, though the
25346b93
PM
1314 * rcu_read_lock() protects us from any context
1315 * getting freed. Lock the context and check if it
1316 * got swapped before we could get the lock, and retry
1317 * if so. If we locked the right context, then it
1318 * can't get swapped on us any more.
1319 */
2fd59077 1320 raw_spin_lock(&ctx->lock);
8dc85d54 1321 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
2fd59077 1322 raw_spin_unlock(&ctx->lock);
058ebd0e 1323 rcu_read_unlock();
2fd59077 1324 local_irq_restore(*flags);
25346b93
PM
1325 goto retry;
1326 }
b49a9e7e 1327
63b6da39
PZ
1328 if (ctx->task == TASK_TOMBSTONE ||
1329 !atomic_inc_not_zero(&ctx->refcount)) {
2fd59077 1330 raw_spin_unlock(&ctx->lock);
b49a9e7e 1331 ctx = NULL;
828b6f0e
PZ
1332 } else {
1333 WARN_ON_ONCE(ctx->task != task);
b49a9e7e 1334 }
25346b93
PM
1335 }
1336 rcu_read_unlock();
2fd59077
PM
1337 if (!ctx)
1338 local_irq_restore(*flags);
25346b93
PM
1339 return ctx;
1340}
1341
1342/*
1343 * Get the context for a task and increment its pin_count so it
1344 * can't get swapped to another task. This also increments its
1345 * reference count so that the context can't get freed.
1346 */
8dc85d54
PZ
1347static struct perf_event_context *
1348perf_pin_task_context(struct task_struct *task, int ctxn)
25346b93 1349{
cdd6c482 1350 struct perf_event_context *ctx;
25346b93
PM
1351 unsigned long flags;
1352
8dc85d54 1353 ctx = perf_lock_task_context(task, ctxn, &flags);
25346b93
PM
1354 if (ctx) {
1355 ++ctx->pin_count;
e625cce1 1356 raw_spin_unlock_irqrestore(&ctx->lock, flags);
25346b93
PM
1357 }
1358 return ctx;
1359}
1360
cdd6c482 1361static void perf_unpin_context(struct perf_event_context *ctx)
25346b93
PM
1362{
1363 unsigned long flags;
1364
e625cce1 1365 raw_spin_lock_irqsave(&ctx->lock, flags);
25346b93 1366 --ctx->pin_count;
e625cce1 1367 raw_spin_unlock_irqrestore(&ctx->lock, flags);
25346b93
PM
1368}
1369
f67218c3
PZ
1370/*
1371 * Update the record of the current time in a context.
1372 */
1373static void update_context_time(struct perf_event_context *ctx)
1374{
1375 u64 now = perf_clock();
1376
1377 ctx->time += now - ctx->timestamp;
1378 ctx->timestamp = now;
1379}
1380
4158755d
SE
1381static u64 perf_event_time(struct perf_event *event)
1382{
1383 struct perf_event_context *ctx = event->ctx;
e5d1367f
SE
1384
1385 if (is_cgroup_event(event))
1386 return perf_cgroup_event_time(event);
1387
4158755d
SE
1388 return ctx ? ctx->time : 0;
1389}
1390
f67218c3
PZ
1391/*
1392 * Update the total_time_enabled and total_time_running fields for a event.
1393 */
1394static void update_event_times(struct perf_event *event)
1395{
1396 struct perf_event_context *ctx = event->ctx;
1397 u64 run_end;
1398
3cbaa590
PZ
1399 lockdep_assert_held(&ctx->lock);
1400
f67218c3
PZ
1401 if (event->state < PERF_EVENT_STATE_INACTIVE ||
1402 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
1403 return;
3cbaa590 1404
e5d1367f
SE
1405 /*
1406 * in cgroup mode, time_enabled represents
1407 * the time the event was enabled AND active
1408 * tasks were in the monitored cgroup. This is
1409 * independent of the activity of the context as
1410 * there may be a mix of cgroup and non-cgroup events.
1411 *
1412 * That is why we treat cgroup events differently
1413 * here.
1414 */
1415 if (is_cgroup_event(event))
46cd6a7f 1416 run_end = perf_cgroup_event_time(event);
e5d1367f
SE
1417 else if (ctx->is_active)
1418 run_end = ctx->time;
acd1d7c1
PZ
1419 else
1420 run_end = event->tstamp_stopped;
1421
1422 event->total_time_enabled = run_end - event->tstamp_enabled;
f67218c3
PZ
1423
1424 if (event->state == PERF_EVENT_STATE_INACTIVE)
1425 run_end = event->tstamp_stopped;
1426 else
4158755d 1427 run_end = perf_event_time(event);
f67218c3
PZ
1428
1429 event->total_time_running = run_end - event->tstamp_running;
e5d1367f 1430
f67218c3
PZ
1431}
1432
96c21a46
PZ
1433/*
1434 * Update total_time_enabled and total_time_running for all events in a group.
1435 */
1436static void update_group_times(struct perf_event *leader)
1437{
1438 struct perf_event *event;
1439
1440 update_event_times(leader);
1441 list_for_each_entry(event, &leader->sibling_list, group_entry)
1442 update_event_times(event);
1443}
1444
889ff015
FW
1445static struct list_head *
1446ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
1447{
1448 if (event->attr.pinned)
1449 return &ctx->pinned_groups;
1450 else
1451 return &ctx->flexible_groups;
1452}
1453
fccc714b 1454/*
cdd6c482 1455 * Add a event from the lists for its context.
fccc714b
PZ
1456 * Must be called with ctx->mutex and ctx->lock held.
1457 */
04289bb9 1458static void
cdd6c482 1459list_add_event(struct perf_event *event, struct perf_event_context *ctx)
04289bb9 1460{
c994d613
PZ
1461 lockdep_assert_held(&ctx->lock);
1462
8a49542c
PZ
1463 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1464 event->attach_state |= PERF_ATTACH_CONTEXT;
04289bb9
IM
1465
1466 /*
8a49542c
PZ
1467 * If we're a stand alone event or group leader, we go to the context
1468 * list, group events are kept attached to the group so that
1469 * perf_group_detach can, at all times, locate all siblings.
04289bb9 1470 */
8a49542c 1471 if (event->group_leader == event) {
889ff015
FW
1472 struct list_head *list;
1473
4ff6a8de 1474 event->group_caps = event->event_caps;
d6f962b5 1475
889ff015
FW
1476 list = ctx_group_list(event, ctx);
1477 list_add_tail(&event->group_entry, list);
5c148194 1478 }
592903cd 1479
db4a8356 1480 list_update_cgroup_event(event, ctx, true);
e5d1367f 1481
cdd6c482
IM
1482 list_add_rcu(&event->event_entry, &ctx->event_list);
1483 ctx->nr_events++;
1484 if (event->attr.inherit_stat)
bfbd3381 1485 ctx->nr_stat++;
5a3126d4
PZ
1486
1487 ctx->generation++;
04289bb9
IM
1488}
1489
0231bb53
JO
1490/*
1491 * Initialize event state based on the perf_event_attr::disabled.
1492 */
1493static inline void perf_event__state_init(struct perf_event *event)
1494{
1495 event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1496 PERF_EVENT_STATE_INACTIVE;
1497}
1498
a723968c 1499static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
c320c7b7
ACM
1500{
1501 int entry = sizeof(u64); /* value */
1502 int size = 0;
1503 int nr = 1;
1504
1505 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1506 size += sizeof(u64);
1507
1508 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1509 size += sizeof(u64);
1510
1511 if (event->attr.read_format & PERF_FORMAT_ID)
1512 entry += sizeof(u64);
1513
1514 if (event->attr.read_format & PERF_FORMAT_GROUP) {
a723968c 1515 nr += nr_siblings;
c320c7b7
ACM
1516 size += sizeof(u64);
1517 }
1518
1519 size += entry * nr;
1520 event->read_size = size;
1521}
1522
a723968c 1523static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
c320c7b7
ACM
1524{
1525 struct perf_sample_data *data;
c320c7b7
ACM
1526 u16 size = 0;
1527
c320c7b7
ACM
1528 if (sample_type & PERF_SAMPLE_IP)
1529 size += sizeof(data->ip);
1530
6844c09d
ACM
1531 if (sample_type & PERF_SAMPLE_ADDR)
1532 size += sizeof(data->addr);
1533
1534 if (sample_type & PERF_SAMPLE_PERIOD)
1535 size += sizeof(data->period);
1536
c3feedf2
AK
1537 if (sample_type & PERF_SAMPLE_WEIGHT)
1538 size += sizeof(data->weight);
1539
6844c09d
ACM
1540 if (sample_type & PERF_SAMPLE_READ)
1541 size += event->read_size;
1542
d6be9ad6
SE
1543 if (sample_type & PERF_SAMPLE_DATA_SRC)
1544 size += sizeof(data->data_src.val);
1545
fdfbbd07
AK
1546 if (sample_type & PERF_SAMPLE_TRANSACTION)
1547 size += sizeof(data->txn);
1548
6844c09d
ACM
1549 event->header_size = size;
1550}
1551
a723968c
PZ
1552/*
1553 * Called at perf_event creation and when events are attached/detached from a
1554 * group.
1555 */
1556static void perf_event__header_size(struct perf_event *event)
1557{
1558 __perf_event_read_size(event,
1559 event->group_leader->nr_siblings);
1560 __perf_event_header_size(event, event->attr.sample_type);
1561}
1562
6844c09d
ACM
1563static void perf_event__id_header_size(struct perf_event *event)
1564{
1565 struct perf_sample_data *data;
1566 u64 sample_type = event->attr.sample_type;
1567 u16 size = 0;
1568
c320c7b7
ACM
1569 if (sample_type & PERF_SAMPLE_TID)
1570 size += sizeof(data->tid_entry);
1571
1572 if (sample_type & PERF_SAMPLE_TIME)
1573 size += sizeof(data->time);
1574
ff3d527c
AH
1575 if (sample_type & PERF_SAMPLE_IDENTIFIER)
1576 size += sizeof(data->id);
1577
c320c7b7
ACM
1578 if (sample_type & PERF_SAMPLE_ID)
1579 size += sizeof(data->id);
1580
1581 if (sample_type & PERF_SAMPLE_STREAM_ID)
1582 size += sizeof(data->stream_id);
1583
1584 if (sample_type & PERF_SAMPLE_CPU)
1585 size += sizeof(data->cpu_entry);
1586
6844c09d 1587 event->id_header_size = size;
c320c7b7
ACM
1588}
1589
a723968c
PZ
1590static bool perf_event_validate_size(struct perf_event *event)
1591{
1592 /*
1593 * The values computed here will be over-written when we actually
1594 * attach the event.
1595 */
1596 __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
1597 __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
1598 perf_event__id_header_size(event);
1599
1600 /*
1601 * Sum the lot; should not exceed the 64k limit we have on records.
1602 * Conservative limit to allow for callchains and other variable fields.
1603 */
1604 if (event->read_size + event->header_size +
1605 event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
1606 return false;
1607
1608 return true;
1609}
1610
8a49542c
PZ
1611static void perf_group_attach(struct perf_event *event)
1612{
c320c7b7 1613 struct perf_event *group_leader = event->group_leader, *pos;
8a49542c 1614
a76a82a3
PZ
1615 lockdep_assert_held(&event->ctx->lock);
1616
74c3337c
PZ
1617 /*
1618 * We can have double attach due to group movement in perf_event_open.
1619 */
1620 if (event->attach_state & PERF_ATTACH_GROUP)
1621 return;
1622
8a49542c
PZ
1623 event->attach_state |= PERF_ATTACH_GROUP;
1624
1625 if (group_leader == event)
1626 return;
1627
652884fe
PZ
1628 WARN_ON_ONCE(group_leader->ctx != event->ctx);
1629
4ff6a8de 1630 group_leader->group_caps &= event->event_caps;
8a49542c
PZ
1631
1632 list_add_tail(&event->group_entry, &group_leader->sibling_list);
1633 group_leader->nr_siblings++;
c320c7b7
ACM
1634
1635 perf_event__header_size(group_leader);
1636
1637 list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
1638 perf_event__header_size(pos);
8a49542c
PZ
1639}
1640
a63eaf34 1641/*
cdd6c482 1642 * Remove a event from the lists for its context.
fccc714b 1643 * Must be called with ctx->mutex and ctx->lock held.
a63eaf34 1644 */
04289bb9 1645static void
cdd6c482 1646list_del_event(struct perf_event *event, struct perf_event_context *ctx)
04289bb9 1647{
652884fe
PZ
1648 WARN_ON_ONCE(event->ctx != ctx);
1649 lockdep_assert_held(&ctx->lock);
1650
8a49542c
PZ
1651 /*
1652 * We can have double detach due to exit/hot-unplug + close.
1653 */
1654 if (!(event->attach_state & PERF_ATTACH_CONTEXT))
a63eaf34 1655 return;
8a49542c
PZ
1656
1657 event->attach_state &= ~PERF_ATTACH_CONTEXT;
1658
db4a8356 1659 list_update_cgroup_event(event, ctx, false);
e5d1367f 1660
cdd6c482
IM
1661 ctx->nr_events--;
1662 if (event->attr.inherit_stat)
bfbd3381 1663 ctx->nr_stat--;
8bc20959 1664
cdd6c482 1665 list_del_rcu(&event->event_entry);
04289bb9 1666
8a49542c
PZ
1667 if (event->group_leader == event)
1668 list_del_init(&event->group_entry);
5c148194 1669
96c21a46 1670 update_group_times(event);
b2e74a26
SE
1671
1672 /*
1673 * If event was in error state, then keep it
1674 * that way, otherwise bogus counts will be
1675 * returned on read(). The only way to get out
1676 * of error state is by explicit re-enabling
1677 * of the event
1678 */
1679 if (event->state > PERF_EVENT_STATE_OFF)
1680 event->state = PERF_EVENT_STATE_OFF;
5a3126d4
PZ
1681
1682 ctx->generation++;
050735b0
PZ
1683}
1684
8a49542c 1685static void perf_group_detach(struct perf_event *event)
050735b0
PZ
1686{
1687 struct perf_event *sibling, *tmp;
8a49542c
PZ
1688 struct list_head *list = NULL;
1689
a76a82a3
PZ
1690 lockdep_assert_held(&event->ctx->lock);
1691
8a49542c
PZ
1692 /*
1693 * We can have double detach due to exit/hot-unplug + close.
1694 */
1695 if (!(event->attach_state & PERF_ATTACH_GROUP))
1696 return;
1697
1698 event->attach_state &= ~PERF_ATTACH_GROUP;
1699
1700 /*
1701 * If this is a sibling, remove it from its group.
1702 */
1703 if (event->group_leader != event) {
1704 list_del_init(&event->group_entry);
1705 event->group_leader->nr_siblings--;
c320c7b7 1706 goto out;
8a49542c
PZ
1707 }
1708
1709 if (!list_empty(&event->group_entry))
1710 list = &event->group_entry;
2e2af50b 1711
04289bb9 1712 /*
cdd6c482
IM
1713 * If this was a group event with sibling events then
1714 * upgrade the siblings to singleton events by adding them
8a49542c 1715 * to whatever list we are on.
04289bb9 1716 */
cdd6c482 1717 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
8a49542c
PZ
1718 if (list)
1719 list_move_tail(&sibling->group_entry, list);
04289bb9 1720 sibling->group_leader = sibling;
d6f962b5
FW
1721
1722 /* Inherit group flags from the previous leader */
4ff6a8de 1723 sibling->group_caps = event->group_caps;
652884fe
PZ
1724
1725 WARN_ON_ONCE(sibling->ctx != event->ctx);
04289bb9 1726 }
c320c7b7
ACM
1727
1728out:
1729 perf_event__header_size(event->group_leader);
1730
1731 list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
1732 perf_event__header_size(tmp);
04289bb9
IM
1733}
1734
fadfe7be
JO
1735static bool is_orphaned_event(struct perf_event *event)
1736{
a69b0ca4 1737 return event->state == PERF_EVENT_STATE_DEAD;
fadfe7be
JO
1738}
1739
2c81a647 1740static inline int __pmu_filter_match(struct perf_event *event)
66eb579e
MR
1741{
1742 struct pmu *pmu = event->pmu;
1743 return pmu->filter_match ? pmu->filter_match(event) : 1;
1744}
1745
2c81a647
MR
1746/*
1747 * Check whether we should attempt to schedule an event group based on
1748 * PMU-specific filtering. An event group can consist of HW and SW events,
1749 * potentially with a SW leader, so we must check all the filters, to
1750 * determine whether a group is schedulable:
1751 */
1752static inline int pmu_filter_match(struct perf_event *event)
1753{
1754 struct perf_event *child;
1755
1756 if (!__pmu_filter_match(event))
1757 return 0;
1758
1759 list_for_each_entry(child, &event->sibling_list, group_entry) {
1760 if (!__pmu_filter_match(child))
1761 return 0;
1762 }
1763
1764 return 1;
1765}
1766
fa66f07a
SE
1767static inline int
1768event_filter_match(struct perf_event *event)
1769{
0b8f1e2e
PZ
1770 return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
1771 perf_cgroup_match(event) && pmu_filter_match(event);
fa66f07a
SE
1772}
1773
9ffcfa6f
SE
1774static void
1775event_sched_out(struct perf_event *event,
3b6f9e5c 1776 struct perf_cpu_context *cpuctx,
cdd6c482 1777 struct perf_event_context *ctx)
3b6f9e5c 1778{
4158755d 1779 u64 tstamp = perf_event_time(event);
fa66f07a 1780 u64 delta;
652884fe
PZ
1781
1782 WARN_ON_ONCE(event->ctx != ctx);
1783 lockdep_assert_held(&ctx->lock);
1784
fa66f07a
SE
1785 /*
1786 * An event which could not be activated because of
1787 * filter mismatch still needs to have its timings
1788 * maintained, otherwise bogus information is return
1789 * via read() for time_enabled, time_running:
1790 */
0b8f1e2e
PZ
1791 if (event->state == PERF_EVENT_STATE_INACTIVE &&
1792 !event_filter_match(event)) {
e5d1367f 1793 delta = tstamp - event->tstamp_stopped;
fa66f07a 1794 event->tstamp_running += delta;
4158755d 1795 event->tstamp_stopped = tstamp;
fa66f07a
SE
1796 }
1797
cdd6c482 1798 if (event->state != PERF_EVENT_STATE_ACTIVE)
9ffcfa6f 1799 return;
3b6f9e5c 1800
44377277
AS
1801 perf_pmu_disable(event->pmu);
1802
28a967c3
PZ
1803 event->tstamp_stopped = tstamp;
1804 event->pmu->del(event, 0);
1805 event->oncpu = -1;
cdd6c482
IM
1806 event->state = PERF_EVENT_STATE_INACTIVE;
1807 if (event->pending_disable) {
1808 event->pending_disable = 0;
1809 event->state = PERF_EVENT_STATE_OFF;
970892a9 1810 }
3b6f9e5c 1811
cdd6c482 1812 if (!is_software_event(event))
3b6f9e5c 1813 cpuctx->active_oncpu--;
2fde4f94
MR
1814 if (!--ctx->nr_active)
1815 perf_event_ctx_deactivate(ctx);
0f5a2601
PZ
1816 if (event->attr.freq && event->attr.sample_freq)
1817 ctx->nr_freq--;
cdd6c482 1818 if (event->attr.exclusive || !cpuctx->active_oncpu)
3b6f9e5c 1819 cpuctx->exclusive = 0;
44377277
AS
1820
1821 perf_pmu_enable(event->pmu);
3b6f9e5c
PM
1822}
1823
d859e29f 1824static void
cdd6c482 1825group_sched_out(struct perf_event *group_event,
d859e29f 1826 struct perf_cpu_context *cpuctx,
cdd6c482 1827 struct perf_event_context *ctx)
d859e29f 1828{
cdd6c482 1829 struct perf_event *event;
fa66f07a 1830 int state = group_event->state;
d859e29f 1831
3f005e7d
MR
1832 perf_pmu_disable(ctx->pmu);
1833
cdd6c482 1834 event_sched_out(group_event, cpuctx, ctx);
d859e29f
PM
1835
1836 /*
1837 * Schedule out siblings (if any):
1838 */
cdd6c482
IM
1839 list_for_each_entry(event, &group_event->sibling_list, group_entry)
1840 event_sched_out(event, cpuctx, ctx);
d859e29f 1841
3f005e7d
MR
1842 perf_pmu_enable(ctx->pmu);
1843
fa66f07a 1844 if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
d859e29f
PM
1845 cpuctx->exclusive = 0;
1846}
1847
45a0e07a 1848#define DETACH_GROUP 0x01UL
0017960f 1849
0793a61d 1850/*
cdd6c482 1851 * Cross CPU call to remove a performance event
0793a61d 1852 *
cdd6c482 1853 * We disable the event on the hardware level first. After that we
0793a61d
TG
1854 * remove it from the context list.
1855 */
fae3fde6
PZ
1856static void
1857__perf_remove_from_context(struct perf_event *event,
1858 struct perf_cpu_context *cpuctx,
1859 struct perf_event_context *ctx,
1860 void *info)
0793a61d 1861{
45a0e07a 1862 unsigned long flags = (unsigned long)info;
0793a61d 1863
cdd6c482 1864 event_sched_out(event, cpuctx, ctx);
45a0e07a 1865 if (flags & DETACH_GROUP)
46ce0fe9 1866 perf_group_detach(event);
cdd6c482 1867 list_del_event(event, ctx);
39a43640
PZ
1868
1869 if (!ctx->nr_events && ctx->is_active) {
64ce3126 1870 ctx->is_active = 0;
39a43640
PZ
1871 if (ctx->task) {
1872 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
1873 cpuctx->task_ctx = NULL;
1874 }
64ce3126 1875 }
0793a61d
TG
1876}
1877
0793a61d 1878/*
cdd6c482 1879 * Remove the event from a task's (or a CPU's) list of events.
0793a61d 1880 *
cdd6c482
IM
1881 * If event->ctx is a cloned context, callers must make sure that
1882 * every task struct that event->ctx->task could possibly point to
c93f7669
PM
1883 * remains valid. This is OK when called from perf_release since
1884 * that only calls us on the top-level context, which can't be a clone.
cdd6c482 1885 * When called from perf_event_exit_task, it's OK because the
c93f7669 1886 * context has been detached from its task.
0793a61d 1887 */
45a0e07a 1888static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
0793a61d 1889{
a76a82a3
PZ
1890 struct perf_event_context *ctx = event->ctx;
1891
1892 lockdep_assert_held(&ctx->mutex);
0793a61d 1893
45a0e07a 1894 event_function_call(event, __perf_remove_from_context, (void *)flags);
a76a82a3
PZ
1895
1896 /*
1897 * The above event_function_call() can NO-OP when it hits
1898 * TASK_TOMBSTONE. In that case we must already have been detached
1899 * from the context (by perf_event_exit_event()) but the grouping
1900 * might still be in-tact.
1901 */
1902 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1903 if ((flags & DETACH_GROUP) &&
1904 (event->attach_state & PERF_ATTACH_GROUP)) {
1905 /*
1906 * Since in that case we cannot possibly be scheduled, simply
1907 * detach now.
1908 */
1909 raw_spin_lock_irq(&ctx->lock);
1910 perf_group_detach(event);
1911 raw_spin_unlock_irq(&ctx->lock);
1912 }
0793a61d
TG
1913}
1914
d859e29f 1915/*
cdd6c482 1916 * Cross CPU call to disable a performance event
d859e29f 1917 */
fae3fde6
PZ
1918static void __perf_event_disable(struct perf_event *event,
1919 struct perf_cpu_context *cpuctx,
1920 struct perf_event_context *ctx,
1921 void *info)
7b648018 1922{
fae3fde6
PZ
1923 if (event->state < PERF_EVENT_STATE_INACTIVE)
1924 return;
7b648018 1925
fae3fde6
PZ
1926 update_context_time(ctx);
1927 update_cgrp_time_from_event(event);
1928 update_group_times(event);
1929 if (event == event->group_leader)
1930 group_sched_out(event, cpuctx, ctx);
1931 else
1932 event_sched_out(event, cpuctx, ctx);
1933 event->state = PERF_EVENT_STATE_OFF;
7b648018
PZ
1934}
1935
d859e29f 1936/*
cdd6c482 1937 * Disable a event.
c93f7669 1938 *
cdd6c482
IM
1939 * If event->ctx is a cloned context, callers must make sure that
1940 * every task struct that event->ctx->task could possibly point to
c93f7669 1941 * remains valid. This condition is satisifed when called through
cdd6c482
IM
1942 * perf_event_for_each_child or perf_event_for_each because they
1943 * hold the top-level event's child_mutex, so any descendant that
8ba289b8
PZ
1944 * goes to exit will block in perf_event_exit_event().
1945 *
cdd6c482 1946 * When called from perf_pending_event it's OK because event->ctx
c93f7669 1947 * is the current context on this CPU and preemption is disabled,
cdd6c482 1948 * hence we can't get into perf_event_task_sched_out for this context.
d859e29f 1949 */
f63a8daa 1950static void _perf_event_disable(struct perf_event *event)
d859e29f 1951{
cdd6c482 1952 struct perf_event_context *ctx = event->ctx;
d859e29f 1953
e625cce1 1954 raw_spin_lock_irq(&ctx->lock);
7b648018 1955 if (event->state <= PERF_EVENT_STATE_OFF) {
e625cce1 1956 raw_spin_unlock_irq(&ctx->lock);
7b648018 1957 return;
53cfbf59 1958 }
e625cce1 1959 raw_spin_unlock_irq(&ctx->lock);
7b648018 1960
fae3fde6
PZ
1961 event_function_call(event, __perf_event_disable, NULL);
1962}
1963
1964void perf_event_disable_local(struct perf_event *event)
1965{
1966 event_function_local(event, __perf_event_disable, NULL);
d859e29f 1967}
f63a8daa
PZ
1968
1969/*
1970 * Strictly speaking kernel users cannot create groups and therefore this
1971 * interface does not need the perf_event_ctx_lock() magic.
1972 */
1973void perf_event_disable(struct perf_event *event)
1974{
1975 struct perf_event_context *ctx;
1976
1977 ctx = perf_event_ctx_lock(event);
1978 _perf_event_disable(event);
1979 perf_event_ctx_unlock(event, ctx);
1980}
dcfce4a0 1981EXPORT_SYMBOL_GPL(perf_event_disable);
d859e29f 1982
5aab90ce
JO
1983void perf_event_disable_inatomic(struct perf_event *event)
1984{
1985 event->pending_disable = 1;
1986 irq_work_queue(&event->pending);
1987}
1988
e5d1367f
SE
1989static void perf_set_shadow_time(struct perf_event *event,
1990 struct perf_event_context *ctx,
1991 u64 tstamp)
1992{
1993 /*
1994 * use the correct time source for the time snapshot
1995 *
1996 * We could get by without this by leveraging the
1997 * fact that to get to this function, the caller
1998 * has most likely already called update_context_time()
1999 * and update_cgrp_time_xx() and thus both timestamp
2000 * are identical (or very close). Given that tstamp is,
2001 * already adjusted for cgroup, we could say that:
2002 * tstamp - ctx->timestamp
2003 * is equivalent to
2004 * tstamp - cgrp->timestamp.
2005 *
2006 * Then, in perf_output_read(), the calculation would
2007 * work with no changes because:
2008 * - event is guaranteed scheduled in
2009 * - no scheduled out in between
2010 * - thus the timestamp would be the same
2011 *
2012 * But this is a bit hairy.
2013 *
2014 * So instead, we have an explicit cgroup call to remain
2015 * within the time time source all along. We believe it
2016 * is cleaner and simpler to understand.
2017 */
2018 if (is_cgroup_event(event))
2019 perf_cgroup_set_shadow_time(event, tstamp);
2020 else
2021 event->shadow_ctx_time = tstamp - ctx->timestamp;
2022}
2023
4fe757dd
PZ
2024#define MAX_INTERRUPTS (~0ULL)
2025
2026static void perf_log_throttle(struct perf_event *event, int enable);
ec0d7729 2027static void perf_log_itrace_start(struct perf_event *event);
4fe757dd 2028
235c7fc7 2029static int
9ffcfa6f 2030event_sched_in(struct perf_event *event,
235c7fc7 2031 struct perf_cpu_context *cpuctx,
6e37738a 2032 struct perf_event_context *ctx)
235c7fc7 2033{
4158755d 2034 u64 tstamp = perf_event_time(event);
44377277 2035 int ret = 0;
4158755d 2036
63342411
PZ
2037 lockdep_assert_held(&ctx->lock);
2038
cdd6c482 2039 if (event->state <= PERF_EVENT_STATE_OFF)
235c7fc7
IM
2040 return 0;
2041
95ff4ca2
AS
2042 WRITE_ONCE(event->oncpu, smp_processor_id());
2043 /*
2044 * Order event::oncpu write to happen before the ACTIVE state
2045 * is visible.
2046 */
2047 smp_wmb();
2048 WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
4fe757dd
PZ
2049
2050 /*
2051 * Unthrottle events, since we scheduled we might have missed several
2052 * ticks already, also for a heavily scheduling task there is little
2053 * guarantee it'll get a tick in a timely manner.
2054 */
2055 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
2056 perf_log_throttle(event, 1);
2057 event->hw.interrupts = 0;
2058 }
2059
235c7fc7
IM
2060 /*
2061 * The new state must be visible before we turn it on in the hardware:
2062 */
2063 smp_wmb();
2064
44377277
AS
2065 perf_pmu_disable(event->pmu);
2066
72f669c0
SL
2067 perf_set_shadow_time(event, ctx, tstamp);
2068
ec0d7729
AS
2069 perf_log_itrace_start(event);
2070
a4eaf7f1 2071 if (event->pmu->add(event, PERF_EF_START)) {
cdd6c482
IM
2072 event->state = PERF_EVENT_STATE_INACTIVE;
2073 event->oncpu = -1;
44377277
AS
2074 ret = -EAGAIN;
2075 goto out;
235c7fc7
IM
2076 }
2077
00a2916f
PZ
2078 event->tstamp_running += tstamp - event->tstamp_stopped;
2079
cdd6c482 2080 if (!is_software_event(event))
3b6f9e5c 2081 cpuctx->active_oncpu++;
2fde4f94
MR
2082 if (!ctx->nr_active++)
2083 perf_event_ctx_activate(ctx);
0f5a2601
PZ
2084 if (event->attr.freq && event->attr.sample_freq)
2085 ctx->nr_freq++;
235c7fc7 2086
cdd6c482 2087 if (event->attr.exclusive)
3b6f9e5c
PM
2088 cpuctx->exclusive = 1;
2089
44377277
AS
2090out:
2091 perf_pmu_enable(event->pmu);
2092
2093 return ret;
235c7fc7
IM
2094}
2095
6751b71e 2096static int
cdd6c482 2097group_sched_in(struct perf_event *group_event,
6751b71e 2098 struct perf_cpu_context *cpuctx,
6e37738a 2099 struct perf_event_context *ctx)
6751b71e 2100{
6bde9b6c 2101 struct perf_event *event, *partial_group = NULL;
4a234593 2102 struct pmu *pmu = ctx->pmu;
d7842da4
SE
2103 u64 now = ctx->time;
2104 bool simulate = false;
6751b71e 2105
cdd6c482 2106 if (group_event->state == PERF_EVENT_STATE_OFF)
6751b71e
PM
2107 return 0;
2108
fbbe0701 2109 pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
6bde9b6c 2110
9ffcfa6f 2111 if (event_sched_in(group_event, cpuctx, ctx)) {
ad5133b7 2112 pmu->cancel_txn(pmu);
272325c4 2113 perf_mux_hrtimer_restart(cpuctx);
6751b71e 2114 return -EAGAIN;
90151c35 2115 }
6751b71e
PM
2116
2117 /*
2118 * Schedule in siblings as one group (if any):
2119 */
cdd6c482 2120 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
9ffcfa6f 2121 if (event_sched_in(event, cpuctx, ctx)) {
cdd6c482 2122 partial_group = event;
6751b71e
PM
2123 goto group_error;
2124 }
2125 }
2126
9ffcfa6f 2127 if (!pmu->commit_txn(pmu))
6e85158c 2128 return 0;
9ffcfa6f 2129
6751b71e
PM
2130group_error:
2131 /*
2132 * Groups can be scheduled in as one unit only, so undo any
2133 * partial group before returning:
d7842da4
SE
2134 * The events up to the failed event are scheduled out normally,
2135 * tstamp_stopped will be updated.
2136 *
2137 * The failed events and the remaining siblings need to have
2138 * their timings updated as if they had gone thru event_sched_in()
2139 * and event_sched_out(). This is required to get consistent timings
2140 * across the group. This also takes care of the case where the group
2141 * could never be scheduled by ensuring tstamp_stopped is set to mark
2142 * the time the event was actually stopped, such that time delta
2143 * calculation in update_event_times() is correct.
6751b71e 2144 */
cdd6c482
IM
2145 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
2146 if (event == partial_group)
d7842da4
SE
2147 simulate = true;
2148
2149 if (simulate) {
2150 event->tstamp_running += now - event->tstamp_stopped;
2151 event->tstamp_stopped = now;
2152 } else {
2153 event_sched_out(event, cpuctx, ctx);
2154 }
6751b71e 2155 }
9ffcfa6f 2156 event_sched_out(group_event, cpuctx, ctx);
6751b71e 2157
ad5133b7 2158 pmu->cancel_txn(pmu);
90151c35 2159
272325c4 2160 perf_mux_hrtimer_restart(cpuctx);
9e630205 2161
6751b71e
PM
2162 return -EAGAIN;
2163}
2164
3b6f9e5c 2165/*
cdd6c482 2166 * Work out whether we can put this event group on the CPU now.
3b6f9e5c 2167 */
cdd6c482 2168static int group_can_go_on(struct perf_event *event,
3b6f9e5c
PM
2169 struct perf_cpu_context *cpuctx,
2170 int can_add_hw)
2171{
2172 /*
cdd6c482 2173 * Groups consisting entirely of software events can always go on.
3b6f9e5c 2174 */
4ff6a8de 2175 if (event->group_caps & PERF_EV_CAP_SOFTWARE)
3b6f9e5c
PM
2176 return 1;
2177 /*
2178 * If an exclusive group is already on, no other hardware
cdd6c482 2179 * events can go on.
3b6f9e5c
PM
2180 */
2181 if (cpuctx->exclusive)
2182 return 0;
2183 /*
2184 * If this group is exclusive and there are already
cdd6c482 2185 * events on the CPU, it can't go on.
3b6f9e5c 2186 */
cdd6c482 2187 if (event->attr.exclusive && cpuctx->active_oncpu)
3b6f9e5c
PM
2188 return 0;
2189 /*
2190 * Otherwise, try to add it if all previous groups were able
2191 * to go on.
2192 */
2193 return can_add_hw;
2194}
2195
cdd6c482
IM
2196static void add_event_to_ctx(struct perf_event *event,
2197 struct perf_event_context *ctx)
53cfbf59 2198{
4158755d
SE
2199 u64 tstamp = perf_event_time(event);
2200
cdd6c482 2201 list_add_event(event, ctx);
8a49542c 2202 perf_group_attach(event);
4158755d
SE
2203 event->tstamp_enabled = tstamp;
2204 event->tstamp_running = tstamp;
2205 event->tstamp_stopped = tstamp;
53cfbf59
PM
2206}
2207
bd2afa49
PZ
2208static void ctx_sched_out(struct perf_event_context *ctx,
2209 struct perf_cpu_context *cpuctx,
2210 enum event_type_t event_type);
2c29ef0f
PZ
2211static void
2212ctx_sched_in(struct perf_event_context *ctx,
2213 struct perf_cpu_context *cpuctx,
2214 enum event_type_t event_type,
2215 struct task_struct *task);
fe4b04fa 2216
bd2afa49
PZ
2217static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
2218 struct perf_event_context *ctx)
2219{
2220 if (!cpuctx->task_ctx)
2221 return;
2222
2223 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2224 return;
2225
2226 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2227}
2228
dce5855b
PZ
2229static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2230 struct perf_event_context *ctx,
2231 struct task_struct *task)
2232{
2233 cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2234 if (ctx)
2235 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2236 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2237 if (ctx)
2238 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2239}
2240
3e349507
PZ
2241static void ctx_resched(struct perf_cpu_context *cpuctx,
2242 struct perf_event_context *task_ctx)
0017960f 2243{
3e349507
PZ
2244 perf_pmu_disable(cpuctx->ctx.pmu);
2245 if (task_ctx)
2246 task_ctx_sched_out(cpuctx, task_ctx);
2247 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
2248 perf_event_sched_in(cpuctx, task_ctx, current);
2249 perf_pmu_enable(cpuctx->ctx.pmu);
0017960f
PZ
2250}
2251
0793a61d 2252/*
cdd6c482 2253 * Cross CPU call to install and enable a performance event
682076ae 2254 *
a096309b
PZ
2255 * Very similar to remote_function() + event_function() but cannot assume that
2256 * things like ctx->is_active and cpuctx->task_ctx are set.
0793a61d 2257 */
fe4b04fa 2258static int __perf_install_in_context(void *info)
0793a61d 2259{
a096309b
PZ
2260 struct perf_event *event = info;
2261 struct perf_event_context *ctx = event->ctx;
108b02cf 2262 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2c29ef0f 2263 struct perf_event_context *task_ctx = cpuctx->task_ctx;
63cae12b 2264 bool reprogram = true;
a096309b 2265 int ret = 0;
0793a61d 2266
63b6da39 2267 raw_spin_lock(&cpuctx->ctx.lock);
39a43640 2268 if (ctx->task) {
b58f6b0d
PZ
2269 raw_spin_lock(&ctx->lock);
2270 task_ctx = ctx;
a096309b 2271
63cae12b 2272 reprogram = (ctx->task == current);
b58f6b0d 2273
39a43640 2274 /*
63cae12b
PZ
2275 * If the task is running, it must be running on this CPU,
2276 * otherwise we cannot reprogram things.
2277 *
2278 * If its not running, we don't care, ctx->lock will
2279 * serialize against it becoming runnable.
39a43640 2280 */
63cae12b
PZ
2281 if (task_curr(ctx->task) && !reprogram) {
2282 ret = -ESRCH;
2283 goto unlock;
2284 }
a096309b 2285
63cae12b 2286 WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
63b6da39
PZ
2287 } else if (task_ctx) {
2288 raw_spin_lock(&task_ctx->lock);
2c29ef0f 2289 }
b58f6b0d 2290
63cae12b 2291 if (reprogram) {
a096309b
PZ
2292 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2293 add_event_to_ctx(event, ctx);
2294 ctx_resched(cpuctx, task_ctx);
2295 } else {
2296 add_event_to_ctx(event, ctx);
2297 }
2298
63b6da39 2299unlock:
2c29ef0f 2300 perf_ctx_unlock(cpuctx, task_ctx);
fe4b04fa 2301
a096309b 2302 return ret;
0793a61d
TG
2303}
2304
2305/*
a096309b
PZ
2306 * Attach a performance event to a context.
2307 *
2308 * Very similar to event_function_call, see comment there.
0793a61d
TG
2309 */
2310static void
cdd6c482
IM
2311perf_install_in_context(struct perf_event_context *ctx,
2312 struct perf_event *event,
0793a61d
TG
2313 int cpu)
2314{
a096309b 2315 struct task_struct *task = READ_ONCE(ctx->task);
39a43640 2316
fe4b04fa
PZ
2317 lockdep_assert_held(&ctx->mutex);
2318
0cda4c02
YZ
2319 if (event->cpu != -1)
2320 event->cpu = cpu;
c3f00c70 2321
0b8f1e2e
PZ
2322 /*
2323 * Ensures that if we can observe event->ctx, both the event and ctx
2324 * will be 'complete'. See perf_iterate_sb_cpu().
2325 */
2326 smp_store_release(&event->ctx, ctx);
2327
a096309b
PZ
2328 if (!task) {
2329 cpu_function_call(cpu, __perf_install_in_context, event);
2330 return;
2331 }
2332
2333 /*
2334 * Should not happen, we validate the ctx is still alive before calling.
2335 */
2336 if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2337 return;
2338
39a43640
PZ
2339 /*
2340 * Installing events is tricky because we cannot rely on ctx->is_active
2341 * to be set in case this is the nr_events 0 -> 1 transition.
63cae12b
PZ
2342 *
2343 * Instead we use task_curr(), which tells us if the task is running.
2344 * However, since we use task_curr() outside of rq::lock, we can race
2345 * against the actual state. This means the result can be wrong.
2346 *
2347 * If we get a false positive, we retry, this is harmless.
2348 *
2349 * If we get a false negative, things are complicated. If we are after
2350 * perf_event_context_sched_in() ctx::lock will serialize us, and the
2351 * value must be correct. If we're before, it doesn't matter since
2352 * perf_event_context_sched_in() will program the counter.
2353 *
2354 * However, this hinges on the remote context switch having observed
2355 * our task->perf_event_ctxp[] store, such that it will in fact take
2356 * ctx::lock in perf_event_context_sched_in().
2357 *
2358 * We do this by task_function_call(), if the IPI fails to hit the task
2359 * we know any future context switch of task must see the
2360 * perf_event_ctpx[] store.
39a43640 2361 */
63cae12b 2362
63b6da39 2363 /*
63cae12b
PZ
2364 * This smp_mb() orders the task->perf_event_ctxp[] store with the
2365 * task_cpu() load, such that if the IPI then does not find the task
2366 * running, a future context switch of that task must observe the
2367 * store.
63b6da39 2368 */
63cae12b
PZ
2369 smp_mb();
2370again:
2371 if (!task_function_call(task, __perf_install_in_context, event))
a096309b
PZ
2372 return;
2373
2374 raw_spin_lock_irq(&ctx->lock);
2375 task = ctx->task;
84c4e620 2376 if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
a096309b
PZ
2377 /*
2378 * Cannot happen because we already checked above (which also
2379 * cannot happen), and we hold ctx->mutex, which serializes us
2380 * against perf_event_exit_task_context().
2381 */
63b6da39
PZ
2382 raw_spin_unlock_irq(&ctx->lock);
2383 return;
2384 }
39a43640 2385 /*
63cae12b
PZ
2386 * If the task is not running, ctx->lock will avoid it becoming so,
2387 * thus we can safely install the event.
39a43640 2388 */
63cae12b
PZ
2389 if (task_curr(task)) {
2390 raw_spin_unlock_irq(&ctx->lock);
2391 goto again;
2392 }
2393 add_event_to_ctx(event, ctx);
2394 raw_spin_unlock_irq(&ctx->lock);
0793a61d
TG
2395}
2396
fa289bec 2397/*
cdd6c482 2398 * Put a event into inactive state and update time fields.
fa289bec
PM
2399 * Enabling the leader of a group effectively enables all
2400 * the group members that aren't explicitly disabled, so we
2401 * have to update their ->tstamp_enabled also.
2402 * Note: this works for group members as well as group leaders
2403 * since the non-leader members' sibling_lists will be empty.
2404 */
1d9b482e 2405static void __perf_event_mark_enabled(struct perf_event *event)
fa289bec 2406{
cdd6c482 2407 struct perf_event *sub;
4158755d 2408 u64 tstamp = perf_event_time(event);
fa289bec 2409
cdd6c482 2410 event->state = PERF_EVENT_STATE_INACTIVE;
4158755d 2411 event->tstamp_enabled = tstamp - event->total_time_enabled;
9ed6060d 2412 list_for_each_entry(sub, &event->sibling_list, group_entry) {
4158755d
SE
2413 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
2414 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
9ed6060d 2415 }
fa289bec
PM
2416}
2417
d859e29f 2418/*
cdd6c482 2419 * Cross CPU call to enable a performance event
d859e29f 2420 */
fae3fde6
PZ
2421static void __perf_event_enable(struct perf_event *event,
2422 struct perf_cpu_context *cpuctx,
2423 struct perf_event_context *ctx,
2424 void *info)
04289bb9 2425{
cdd6c482 2426 struct perf_event *leader = event->group_leader;
fae3fde6 2427 struct perf_event_context *task_ctx;
04289bb9 2428
6e801e01
PZ
2429 if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2430 event->state <= PERF_EVENT_STATE_ERROR)
fae3fde6 2431 return;
3cbed429 2432
bd2afa49
PZ
2433 if (ctx->is_active)
2434 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2435
1d9b482e 2436 __perf_event_mark_enabled(event);
04289bb9 2437
fae3fde6
PZ
2438 if (!ctx->is_active)
2439 return;
2440
e5d1367f 2441 if (!event_filter_match(event)) {
bd2afa49 2442 if (is_cgroup_event(event))
e5d1367f 2443 perf_cgroup_defer_enabled(event);
bd2afa49 2444 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
fae3fde6 2445 return;
e5d1367f 2446 }
f4c4176f 2447
04289bb9 2448 /*
cdd6c482 2449 * If the event is in a group and isn't the group leader,
d859e29f 2450 * then don't put it on unless the group is on.
04289bb9 2451 */
bd2afa49
PZ
2452 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
2453 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
fae3fde6 2454 return;
bd2afa49 2455 }
fe4b04fa 2456
fae3fde6
PZ
2457 task_ctx = cpuctx->task_ctx;
2458 if (ctx->task)
2459 WARN_ON_ONCE(task_ctx != ctx);
d859e29f 2460
fae3fde6 2461 ctx_resched(cpuctx, task_ctx);
7b648018
PZ
2462}
2463
d859e29f 2464/*
cdd6c482 2465 * Enable a event.
c93f7669 2466 *
cdd6c482
IM
2467 * If event->ctx is a cloned context, callers must make sure that
2468 * every task struct that event->ctx->task could possibly point to
c93f7669 2469 * remains valid. This condition is satisfied when called through
cdd6c482
IM
2470 * perf_event_for_each_child or perf_event_for_each as described
2471 * for perf_event_disable.
d859e29f 2472 */
f63a8daa 2473static void _perf_event_enable(struct perf_event *event)
d859e29f 2474{
cdd6c482 2475 struct perf_event_context *ctx = event->ctx;
d859e29f 2476
7b648018 2477 raw_spin_lock_irq(&ctx->lock);
6e801e01
PZ
2478 if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2479 event->state < PERF_EVENT_STATE_ERROR) {
7b648018 2480 raw_spin_unlock_irq(&ctx->lock);
d859e29f
PM
2481 return;
2482 }
2483
d859e29f 2484 /*
cdd6c482 2485 * If the event is in error state, clear that first.
7b648018
PZ
2486 *
2487 * That way, if we see the event in error state below, we know that it
2488 * has gone back into error state, as distinct from the task having
2489 * been scheduled away before the cross-call arrived.
d859e29f 2490 */
cdd6c482
IM
2491 if (event->state == PERF_EVENT_STATE_ERROR)
2492 event->state = PERF_EVENT_STATE_OFF;
e625cce1 2493 raw_spin_unlock_irq(&ctx->lock);
fe4b04fa 2494
fae3fde6 2495 event_function_call(event, __perf_event_enable, NULL);
d859e29f 2496}
f63a8daa
PZ
2497
2498/*
2499 * See perf_event_disable();
2500 */
2501void perf_event_enable(struct perf_event *event)
2502{
2503 struct perf_event_context *ctx;
2504
2505 ctx = perf_event_ctx_lock(event);
2506 _perf_event_enable(event);
2507 perf_event_ctx_unlock(event, ctx);
2508}
dcfce4a0 2509EXPORT_SYMBOL_GPL(perf_event_enable);
d859e29f 2510
375637bc
AS
2511struct stop_event_data {
2512 struct perf_event *event;
2513 unsigned int restart;
2514};
2515
95ff4ca2
AS
2516static int __perf_event_stop(void *info)
2517{
375637bc
AS
2518 struct stop_event_data *sd = info;
2519 struct perf_event *event = sd->event;
95ff4ca2 2520
375637bc 2521 /* if it's already INACTIVE, do nothing */
95ff4ca2
AS
2522 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2523 return 0;
2524
2525 /* matches smp_wmb() in event_sched_in() */
2526 smp_rmb();
2527
2528 /*
2529 * There is a window with interrupts enabled before we get here,
2530 * so we need to check again lest we try to stop another CPU's event.
2531 */
2532 if (READ_ONCE(event->oncpu) != smp_processor_id())
2533 return -EAGAIN;
2534
2535 event->pmu->stop(event, PERF_EF_UPDATE);
2536
375637bc
AS
2537 /*
2538 * May race with the actual stop (through perf_pmu_output_stop()),
2539 * but it is only used for events with AUX ring buffer, and such
2540 * events will refuse to restart because of rb::aux_mmap_count==0,
2541 * see comments in perf_aux_output_begin().
2542 *
2543 * Since this is happening on a event-local CPU, no trace is lost
2544 * while restarting.
2545 */
2546 if (sd->restart)
c9bbdd48 2547 event->pmu->start(event, 0);
375637bc 2548
95ff4ca2
AS
2549 return 0;
2550}
2551
767ae086 2552static int perf_event_stop(struct perf_event *event, int restart)
375637bc
AS
2553{
2554 struct stop_event_data sd = {
2555 .event = event,
767ae086 2556 .restart = restart,
375637bc
AS
2557 };
2558 int ret = 0;
2559
2560 do {
2561 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2562 return 0;
2563
2564 /* matches smp_wmb() in event_sched_in() */
2565 smp_rmb();
2566
2567 /*
2568 * We only want to restart ACTIVE events, so if the event goes
2569 * inactive here (event->oncpu==-1), there's nothing more to do;
2570 * fall through with ret==-ENXIO.
2571 */
2572 ret = cpu_function_call(READ_ONCE(event->oncpu),
2573 __perf_event_stop, &sd);
2574 } while (ret == -EAGAIN);
2575
2576 return ret;
2577}
2578
2579/*
2580 * In order to contain the amount of racy and tricky in the address filter
2581 * configuration management, it is a two part process:
2582 *
2583 * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
2584 * we update the addresses of corresponding vmas in
2585 * event::addr_filters_offs array and bump the event::addr_filters_gen;
2586 * (p2) when an event is scheduled in (pmu::add), it calls
2587 * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
2588 * if the generation has changed since the previous call.
2589 *
2590 * If (p1) happens while the event is active, we restart it to force (p2).
2591 *
2592 * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
2593 * pre-existing mappings, called once when new filters arrive via SET_FILTER
2594 * ioctl;
2595 * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
2596 * registered mapping, called for every new mmap(), with mm::mmap_sem down
2597 * for reading;
2598 * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
2599 * of exec.
2600 */
2601void perf_event_addr_filters_sync(struct perf_event *event)
2602{
2603 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
2604
2605 if (!has_addr_filter(event))
2606 return;
2607
2608 raw_spin_lock(&ifh->lock);
2609 if (event->addr_filters_gen != event->hw.addr_filters_gen) {
2610 event->pmu->addr_filters_sync(event);
2611 event->hw.addr_filters_gen = event->addr_filters_gen;
2612 }
2613 raw_spin_unlock(&ifh->lock);
2614}
2615EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
2616
f63a8daa 2617static int _perf_event_refresh(struct perf_event *event, int refresh)
79f14641 2618{
2023b359 2619 /*
cdd6c482 2620 * not supported on inherited events
2023b359 2621 */
2e939d1d 2622 if (event->attr.inherit || !is_sampling_event(event))
2023b359
PZ
2623 return -EINVAL;
2624
cdd6c482 2625 atomic_add(refresh, &event->event_limit);
f63a8daa 2626 _perf_event_enable(event);
2023b359
PZ
2627
2628 return 0;
79f14641 2629}
f63a8daa
PZ
2630
2631/*
2632 * See perf_event_disable()
2633 */
2634int perf_event_refresh(struct perf_event *event, int refresh)
2635{
2636 struct perf_event_context *ctx;
2637 int ret;
2638
2639 ctx = perf_event_ctx_lock(event);
2640 ret = _perf_event_refresh(event, refresh);
2641 perf_event_ctx_unlock(event, ctx);
2642
2643 return ret;
2644}
26ca5c11 2645EXPORT_SYMBOL_GPL(perf_event_refresh);
79f14641 2646
5b0311e1
FW
2647static void ctx_sched_out(struct perf_event_context *ctx,
2648 struct perf_cpu_context *cpuctx,
2649 enum event_type_t event_type)
235c7fc7 2650{
db24d33e 2651 int is_active = ctx->is_active;
c994d613 2652 struct perf_event *event;
235c7fc7 2653
c994d613 2654 lockdep_assert_held(&ctx->lock);
235c7fc7 2655
39a43640
PZ
2656 if (likely(!ctx->nr_events)) {
2657 /*
2658 * See __perf_remove_from_context().
2659 */
2660 WARN_ON_ONCE(ctx->is_active);
2661 if (ctx->task)
2662 WARN_ON_ONCE(cpuctx->task_ctx);
facc4307 2663 return;
39a43640
PZ
2664 }
2665
db24d33e 2666 ctx->is_active &= ~event_type;
3cbaa590
PZ
2667 if (!(ctx->is_active & EVENT_ALL))
2668 ctx->is_active = 0;
2669
63e30d3e
PZ
2670 if (ctx->task) {
2671 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2672 if (!ctx->is_active)
2673 cpuctx->task_ctx = NULL;
2674 }
facc4307 2675
8fdc6539
PZ
2676 /*
2677 * Always update time if it was set; not only when it changes.
2678 * Otherwise we can 'forget' to update time for any but the last
2679 * context we sched out. For example:
2680 *
2681 * ctx_sched_out(.event_type = EVENT_FLEXIBLE)
2682 * ctx_sched_out(.event_type = EVENT_PINNED)
2683 *
2684 * would only update time for the pinned events.
2685 */
3cbaa590
PZ
2686 if (is_active & EVENT_TIME) {
2687 /* update (and stop) ctx time */
2688 update_context_time(ctx);
2689 update_cgrp_time_from_cpuctx(cpuctx);
2690 }
2691
8fdc6539
PZ
2692 is_active ^= ctx->is_active; /* changed bits */
2693
3cbaa590 2694 if (!ctx->nr_active || !(is_active & EVENT_ALL))
facc4307 2695 return;
5b0311e1 2696
075e0b00 2697 perf_pmu_disable(ctx->pmu);
3cbaa590 2698 if (is_active & EVENT_PINNED) {
889ff015
FW
2699 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
2700 group_sched_out(event, cpuctx, ctx);
9ed6060d 2701 }
889ff015 2702
3cbaa590 2703 if (is_active & EVENT_FLEXIBLE) {
889ff015 2704 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
8c9ed8e1 2705 group_sched_out(event, cpuctx, ctx);
9ed6060d 2706 }
1b9a644f 2707 perf_pmu_enable(ctx->pmu);
235c7fc7
IM
2708}
2709
564c2b21 2710/*
5a3126d4
PZ
2711 * Test whether two contexts are equivalent, i.e. whether they have both been
2712 * cloned from the same version of the same context.
2713 *
2714 * Equivalence is measured using a generation number in the context that is
2715 * incremented on each modification to it; see unclone_ctx(), list_add_event()
2716 * and list_del_event().
564c2b21 2717 */
cdd6c482
IM
2718static int context_equiv(struct perf_event_context *ctx1,
2719 struct perf_event_context *ctx2)
564c2b21 2720{
211de6eb
PZ
2721 lockdep_assert_held(&ctx1->lock);
2722 lockdep_assert_held(&ctx2->lock);
2723
5a3126d4
PZ
2724 /* Pinning disables the swap optimization */
2725 if (ctx1->pin_count || ctx2->pin_count)
2726 return 0;
2727
2728 /* If ctx1 is the parent of ctx2 */
2729 if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2730 return 1;
2731
2732 /* If ctx2 is the parent of ctx1 */
2733 if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2734 return 1;
2735
2736 /*
2737 * If ctx1 and ctx2 have the same parent; we flatten the parent
2738 * hierarchy, see perf_event_init_context().
2739 */
2740 if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
2741 ctx1->parent_gen == ctx2->parent_gen)
2742 return 1;
2743
2744 /* Unmatched */
2745 return 0;
564c2b21
PM
2746}
2747
cdd6c482
IM
2748static void __perf_event_sync_stat(struct perf_event *event,
2749 struct perf_event *next_event)
bfbd3381
PZ
2750{
2751 u64 value;
2752
cdd6c482 2753 if (!event->attr.inherit_stat)
bfbd3381
PZ
2754 return;
2755
2756 /*
cdd6c482 2757 * Update the event value, we cannot use perf_event_read()
bfbd3381
PZ
2758 * because we're in the middle of a context switch and have IRQs
2759 * disabled, which upsets smp_call_function_single(), however
cdd6c482 2760 * we know the event must be on the current CPU, therefore we
bfbd3381
PZ
2761 * don't need to use it.
2762 */
cdd6c482
IM
2763 switch (event->state) {
2764 case PERF_EVENT_STATE_ACTIVE:
3dbebf15
PZ
2765 event->pmu->read(event);
2766 /* fall-through */
bfbd3381 2767
cdd6c482
IM
2768 case PERF_EVENT_STATE_INACTIVE:
2769 update_event_times(event);
bfbd3381
PZ
2770 break;
2771
2772 default:
2773 break;
2774 }
2775
2776 /*
cdd6c482 2777 * In order to keep per-task stats reliable we need to flip the event
bfbd3381
PZ
2778 * values when we flip the contexts.
2779 */
e7850595
PZ
2780 value = local64_read(&next_event->count);
2781 value = local64_xchg(&event->count, value);
2782 local64_set(&next_event->count, value);
bfbd3381 2783
cdd6c482
IM
2784 swap(event->total_time_enabled, next_event->total_time_enabled);
2785 swap(event->total_time_running, next_event->total_time_running);
19d2e755 2786
bfbd3381 2787 /*
19d2e755 2788 * Since we swizzled the values, update the user visible data too.
bfbd3381 2789 */
cdd6c482
IM
2790 perf_event_update_userpage(event);
2791 perf_event_update_userpage(next_event);
bfbd3381
PZ
2792}
2793
cdd6c482
IM
2794static void perf_event_sync_stat(struct perf_event_context *ctx,
2795 struct perf_event_context *next_ctx)
bfbd3381 2796{
cdd6c482 2797 struct perf_event *event, *next_event;
bfbd3381
PZ
2798
2799 if (!ctx->nr_stat)
2800 return;
2801
02ffdbc8
PZ
2802 update_context_time(ctx);
2803
cdd6c482
IM
2804 event = list_first_entry(&ctx->event_list,
2805 struct perf_event, event_entry);
bfbd3381 2806
cdd6c482
IM
2807 next_event = list_first_entry(&next_ctx->event_list,
2808 struct perf_event, event_entry);
bfbd3381 2809
cdd6c482
IM
2810 while (&event->event_entry != &ctx->event_list &&
2811 &next_event->event_entry != &next_ctx->event_list) {
bfbd3381 2812
cdd6c482 2813 __perf_event_sync_stat(event, next_event);
bfbd3381 2814
cdd6c482
IM
2815 event = list_next_entry(event, event_entry);
2816 next_event = list_next_entry(next_event, event_entry);
bfbd3381
PZ
2817 }
2818}
2819
fe4b04fa
PZ
2820static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2821 struct task_struct *next)
0793a61d 2822{
8dc85d54 2823 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
cdd6c482 2824 struct perf_event_context *next_ctx;
5a3126d4 2825 struct perf_event_context *parent, *next_parent;
108b02cf 2826 struct perf_cpu_context *cpuctx;
c93f7669 2827 int do_switch = 1;
0793a61d 2828
108b02cf
PZ
2829 if (likely(!ctx))
2830 return;
10989fb2 2831
108b02cf
PZ
2832 cpuctx = __get_cpu_context(ctx);
2833 if (!cpuctx->task_ctx)
0793a61d
TG
2834 return;
2835
c93f7669 2836 rcu_read_lock();
8dc85d54 2837 next_ctx = next->perf_event_ctxp[ctxn];
5a3126d4
PZ
2838 if (!next_ctx)
2839 goto unlock;
2840
2841 parent = rcu_dereference(ctx->parent_ctx);
2842 next_parent = rcu_dereference(next_ctx->parent_ctx);
2843
2844 /* If neither context have a parent context; they cannot be clones. */
802c8a61 2845 if (!parent && !next_parent)
5a3126d4
PZ
2846 goto unlock;
2847
2848 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
c93f7669
PM
2849 /*
2850 * Looks like the two contexts are clones, so we might be
2851 * able to optimize the context switch. We lock both
2852 * contexts and check that they are clones under the
2853 * lock (including re-checking that neither has been
2854 * uncloned in the meantime). It doesn't matter which
2855 * order we take the locks because no other cpu could
2856 * be trying to lock both of these tasks.
2857 */
e625cce1
TG
2858 raw_spin_lock(&ctx->lock);
2859 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
c93f7669 2860 if (context_equiv(ctx, next_ctx)) {
63b6da39
PZ
2861 WRITE_ONCE(ctx->task, next);
2862 WRITE_ONCE(next_ctx->task, task);
5a158c3c
YZ
2863
2864 swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
2865
63b6da39
PZ
2866 /*
2867 * RCU_INIT_POINTER here is safe because we've not
2868 * modified the ctx and the above modification of
2869 * ctx->task and ctx->task_ctx_data are immaterial
2870 * since those values are always verified under
2871 * ctx->lock which we're now holding.
2872 */
2873 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
2874 RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
2875
c93f7669 2876 do_switch = 0;
bfbd3381 2877
cdd6c482 2878 perf_event_sync_stat(ctx, next_ctx);
c93f7669 2879 }
e625cce1
TG
2880 raw_spin_unlock(&next_ctx->lock);
2881 raw_spin_unlock(&ctx->lock);
564c2b21 2882 }
5a3126d4 2883unlock:
c93f7669 2884 rcu_read_unlock();
564c2b21 2885
c93f7669 2886 if (do_switch) {
facc4307 2887 raw_spin_lock(&ctx->lock);
8833d0e2 2888 task_ctx_sched_out(cpuctx, ctx);
facc4307 2889 raw_spin_unlock(&ctx->lock);
c93f7669 2890 }
0793a61d
TG
2891}
2892
e48c1788
PZ
2893static DEFINE_PER_CPU(struct list_head, sched_cb_list);
2894
ba532500
YZ
2895void perf_sched_cb_dec(struct pmu *pmu)
2896{
e48c1788
PZ
2897 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2898
ba532500 2899 this_cpu_dec(perf_sched_cb_usages);
e48c1788
PZ
2900
2901 if (!--cpuctx->sched_cb_usage)
2902 list_del(&cpuctx->sched_cb_entry);
ba532500
YZ
2903}
2904
e48c1788 2905
ba532500
YZ
2906void perf_sched_cb_inc(struct pmu *pmu)
2907{
e48c1788
PZ
2908 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2909
2910 if (!cpuctx->sched_cb_usage++)
2911 list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
2912
ba532500
YZ
2913 this_cpu_inc(perf_sched_cb_usages);
2914}
2915
2916/*
2917 * This function provides the context switch callback to the lower code
2918 * layer. It is invoked ONLY when the context switch callback is enabled.
09e61b4f
PZ
2919 *
2920 * This callback is relevant even to per-cpu events; for example multi event
2921 * PEBS requires this to provide PID/TID information. This requires we flush
2922 * all queued PEBS records before we context switch to a new task.
ba532500
YZ
2923 */
2924static void perf_pmu_sched_task(struct task_struct *prev,
2925 struct task_struct *next,
2926 bool sched_in)
2927{
2928 struct perf_cpu_context *cpuctx;
2929 struct pmu *pmu;
ba532500
YZ
2930
2931 if (prev == next)
2932 return;
2933
e48c1788 2934 list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
1fd7e416 2935 pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
ba532500 2936
e48c1788
PZ
2937 if (WARN_ON_ONCE(!pmu->sched_task))
2938 continue;
ba532500 2939
e48c1788
PZ
2940 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2941 perf_pmu_disable(pmu);
ba532500 2942
e48c1788 2943 pmu->sched_task(cpuctx->task_ctx, sched_in);
ba532500 2944
e48c1788
PZ
2945 perf_pmu_enable(pmu);
2946 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
ba532500 2947 }
ba532500
YZ
2948}
2949
45ac1403
AH
2950static void perf_event_switch(struct task_struct *task,
2951 struct task_struct *next_prev, bool sched_in);
2952
8dc85d54
PZ
2953#define for_each_task_context_nr(ctxn) \
2954 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
2955
2956/*
2957 * Called from scheduler to remove the events of the current task,
2958 * with interrupts disabled.
2959 *
2960 * We stop each event and update the event value in event->count.
2961 *
2962 * This does not protect us against NMI, but disable()
2963 * sets the disabled bit in the control field of event _before_
2964 * accessing the event control register. If a NMI hits, then it will
2965 * not restart the event.
2966 */
ab0cce56
JO
2967void __perf_event_task_sched_out(struct task_struct *task,
2968 struct task_struct *next)
8dc85d54
PZ
2969{
2970 int ctxn;
2971
ba532500
YZ
2972 if (__this_cpu_read(perf_sched_cb_usages))
2973 perf_pmu_sched_task(task, next, false);
2974
45ac1403
AH
2975 if (atomic_read(&nr_switch_events))
2976 perf_event_switch(task, next, false);
2977
8dc85d54
PZ
2978 for_each_task_context_nr(ctxn)
2979 perf_event_context_sched_out(task, ctxn, next);
e5d1367f
SE
2980
2981 /*
2982 * if cgroup events exist on this CPU, then we need
2983 * to check if we have to switch out PMU state.
2984 * cgroup event are system-wide mode only
2985 */
4a32fea9 2986 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
a8d757ef 2987 perf_cgroup_sched_out(task, next);
8dc85d54
PZ
2988}
2989
5b0311e1
FW
2990/*
2991 * Called with IRQs disabled
2992 */
2993static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
2994 enum event_type_t event_type)
2995{
2996 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
04289bb9
IM
2997}
2998
235c7fc7 2999static void
5b0311e1 3000ctx_pinned_sched_in(struct perf_event_context *ctx,
6e37738a 3001 struct perf_cpu_context *cpuctx)
0793a61d 3002{
cdd6c482 3003 struct perf_event *event;
0793a61d 3004
889ff015
FW
3005 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
3006 if (event->state <= PERF_EVENT_STATE_OFF)
3b6f9e5c 3007 continue;
5632ab12 3008 if (!event_filter_match(event))
3b6f9e5c
PM
3009 continue;
3010
e5d1367f
SE
3011 /* may need to reset tstamp_enabled */
3012 if (is_cgroup_event(event))
3013 perf_cgroup_mark_enabled(event, ctx);
3014
8c9ed8e1 3015 if (group_can_go_on(event, cpuctx, 1))
6e37738a 3016 group_sched_in(event, cpuctx, ctx);
3b6f9e5c
PM
3017
3018 /*
3019 * If this pinned group hasn't been scheduled,
3020 * put it in error state.
3021 */
cdd6c482
IM
3022 if (event->state == PERF_EVENT_STATE_INACTIVE) {
3023 update_group_times(event);
3024 event->state = PERF_EVENT_STATE_ERROR;
53cfbf59 3025 }
3b6f9e5c 3026 }
5b0311e1
FW
3027}
3028
3029static void
3030ctx_flexible_sched_in(struct perf_event_context *ctx,
6e37738a 3031 struct perf_cpu_context *cpuctx)
5b0311e1
FW
3032{
3033 struct perf_event *event;
3034 int can_add_hw = 1;
3b6f9e5c 3035
889ff015
FW
3036 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
3037 /* Ignore events in OFF or ERROR state */
3038 if (event->state <= PERF_EVENT_STATE_OFF)
3b6f9e5c 3039 continue;
04289bb9
IM
3040 /*
3041 * Listen to the 'cpu' scheduling filter constraint
cdd6c482 3042 * of events:
04289bb9 3043 */
5632ab12 3044 if (!event_filter_match(event))
0793a61d
TG
3045 continue;
3046
e5d1367f
SE
3047 /* may need to reset tstamp_enabled */
3048 if (is_cgroup_event(event))
3049 perf_cgroup_mark_enabled(event, ctx);
3050
9ed6060d 3051 if (group_can_go_on(event, cpuctx, can_add_hw)) {
6e37738a 3052 if (group_sched_in(event, cpuctx, ctx))
dd0e6ba2 3053 can_add_hw = 0;
9ed6060d 3054 }
0793a61d 3055 }
5b0311e1
FW
3056}
3057
3058static void
3059ctx_sched_in(struct perf_event_context *ctx,
3060 struct perf_cpu_context *cpuctx,
e5d1367f
SE
3061 enum event_type_t event_type,
3062 struct task_struct *task)
5b0311e1 3063{
db24d33e 3064 int is_active = ctx->is_active;
c994d613
PZ
3065 u64 now;
3066
3067 lockdep_assert_held(&ctx->lock);
e5d1367f 3068
5b0311e1 3069 if (likely(!ctx->nr_events))
facc4307 3070 return;
5b0311e1 3071
3cbaa590 3072 ctx->is_active |= (event_type | EVENT_TIME);
63e30d3e
PZ
3073 if (ctx->task) {
3074 if (!is_active)
3075 cpuctx->task_ctx = ctx;
3076 else
3077 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3078 }
3079
3cbaa590
PZ
3080 is_active ^= ctx->is_active; /* changed bits */
3081
3082 if (is_active & EVENT_TIME) {
3083 /* start ctx time */
3084 now = perf_clock();
3085 ctx->timestamp = now;
3086 perf_cgroup_set_timestamp(task, ctx);
3087 }
3088
5b0311e1
FW
3089 /*
3090 * First go through the list and put on any pinned groups
3091 * in order to give them the best chance of going on.
3092 */
3cbaa590 3093 if (is_active & EVENT_PINNED)
6e37738a 3094 ctx_pinned_sched_in(ctx, cpuctx);
5b0311e1
FW
3095
3096 /* Then walk through the lower prio flexible groups */
3cbaa590 3097 if (is_active & EVENT_FLEXIBLE)
6e37738a 3098 ctx_flexible_sched_in(ctx, cpuctx);
235c7fc7
IM
3099}
3100
329c0e01 3101static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
e5d1367f
SE
3102 enum event_type_t event_type,
3103 struct task_struct *task)
329c0e01
FW
3104{
3105 struct perf_event_context *ctx = &cpuctx->ctx;
3106
e5d1367f 3107 ctx_sched_in(ctx, cpuctx, event_type, task);
329c0e01
FW
3108}
3109
e5d1367f
SE
3110static void perf_event_context_sched_in(struct perf_event_context *ctx,
3111 struct task_struct *task)
235c7fc7 3112{
108b02cf 3113 struct perf_cpu_context *cpuctx;
235c7fc7 3114
108b02cf 3115 cpuctx = __get_cpu_context(ctx);
329c0e01
FW
3116 if (cpuctx->task_ctx == ctx)
3117 return;
3118
facc4307 3119 perf_ctx_lock(cpuctx, ctx);
1b9a644f 3120 perf_pmu_disable(ctx->pmu);
329c0e01
FW
3121 /*
3122 * We want to keep the following priority order:
3123 * cpu pinned (that don't need to move), task pinned,
3124 * cpu flexible, task flexible.
fe45bafb
AS
3125 *
3126 * However, if task's ctx is not carrying any pinned
3127 * events, no need to flip the cpuctx's events around.
329c0e01 3128 */
fe45bafb
AS
3129 if (!list_empty(&ctx->pinned_groups))
3130 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
63e30d3e 3131 perf_event_sched_in(cpuctx, ctx, task);
facc4307
PZ
3132 perf_pmu_enable(ctx->pmu);
3133 perf_ctx_unlock(cpuctx, ctx);
235c7fc7
IM
3134}
3135
8dc85d54
PZ
3136/*
3137 * Called from scheduler to add the events of the current task
3138 * with interrupts disabled.
3139 *
3140 * We restore the event value and then enable it.
3141 *
3142 * This does not protect us against NMI, but enable()
3143 * sets the enabled bit in the control field of event _before_
3144 * accessing the event control register. If a NMI hits, then it will
3145 * keep the event running.
3146 */
ab0cce56
JO
3147void __perf_event_task_sched_in(struct task_struct *prev,
3148 struct task_struct *task)
8dc85d54
PZ
3149{
3150 struct perf_event_context *ctx;
3151 int ctxn;
3152
7e41d177
PZ
3153 /*
3154 * If cgroup events exist on this CPU, then we need to check if we have
3155 * to switch in PMU state; cgroup event are system-wide mode only.
3156 *
3157 * Since cgroup events are CPU events, we must schedule these in before
3158 * we schedule in the task events.
3159 */
3160 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3161 perf_cgroup_sched_in(prev, task);
3162
8dc85d54
PZ
3163 for_each_task_context_nr(ctxn) {
3164 ctx = task->perf_event_ctxp[ctxn];
3165 if (likely(!ctx))
3166 continue;
3167
e5d1367f 3168 perf_event_context_sched_in(ctx, task);
8dc85d54 3169 }
d010b332 3170
45ac1403
AH
3171 if (atomic_read(&nr_switch_events))
3172 perf_event_switch(task, prev, true);
3173
ba532500
YZ
3174 if (__this_cpu_read(perf_sched_cb_usages))
3175 perf_pmu_sched_task(prev, task, true);
235c7fc7
IM
3176}
3177
abd50713
PZ
3178static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
3179{
3180 u64 frequency = event->attr.sample_freq;
3181 u64 sec = NSEC_PER_SEC;
3182 u64 divisor, dividend;
3183
3184 int count_fls, nsec_fls, frequency_fls, sec_fls;
3185
3186 count_fls = fls64(count);
3187 nsec_fls = fls64(nsec);
3188 frequency_fls = fls64(frequency);
3189 sec_fls = 30;
3190
3191 /*
3192 * We got @count in @nsec, with a target of sample_freq HZ
3193 * the target period becomes:
3194 *
3195 * @count * 10^9
3196 * period = -------------------
3197 * @nsec * sample_freq
3198 *
3199 */
3200
3201 /*
3202 * Reduce accuracy by one bit such that @a and @b converge
3203 * to a similar magnitude.
3204 */
fe4b04fa 3205#define REDUCE_FLS(a, b) \
abd50713
PZ
3206do { \
3207 if (a##_fls > b##_fls) { \
3208 a >>= 1; \
3209 a##_fls--; \
3210 } else { \
3211 b >>= 1; \
3212 b##_fls--; \
3213 } \
3214} while (0)
3215
3216 /*
3217 * Reduce accuracy until either term fits in a u64, then proceed with
3218 * the other, so that finally we can do a u64/u64 division.
3219 */
3220 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
3221 REDUCE_FLS(nsec, frequency);
3222 REDUCE_FLS(sec, count);
3223 }
3224
3225 if (count_fls + sec_fls > 64) {
3226 divisor = nsec * frequency;
3227
3228 while (count_fls + sec_fls > 64) {
3229 REDUCE_FLS(count, sec);
3230 divisor >>= 1;
3231 }
3232
3233 dividend = count * sec;
3234 } else {
3235 dividend = count * sec;
3236
3237 while (nsec_fls + frequency_fls > 64) {
3238 REDUCE_FLS(nsec, frequency);
3239 dividend >>= 1;
3240 }
3241
3242 divisor = nsec * frequency;
3243 }
3244
f6ab91ad
PZ
3245 if (!divisor)
3246 return dividend;
3247
abd50713
PZ
3248 return div64_u64(dividend, divisor);
3249}
3250
e050e3f0
SE
3251static DEFINE_PER_CPU(int, perf_throttled_count);
3252static DEFINE_PER_CPU(u64, perf_throttled_seq);
3253
f39d47ff 3254static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
bd2b5b12 3255{
cdd6c482 3256 struct hw_perf_event *hwc = &event->hw;
f6ab91ad 3257 s64 period, sample_period;
bd2b5b12
PZ
3258 s64 delta;
3259
abd50713 3260 period = perf_calculate_period(event, nsec, count);
bd2b5b12
PZ
3261
3262 delta = (s64)(period - hwc->sample_period);
3263 delta = (delta + 7) / 8; /* low pass filter */
3264
3265 sample_period = hwc->sample_period + delta;
3266
3267 if (!sample_period)
3268 sample_period = 1;
3269
bd2b5b12 3270 hwc->sample_period = sample_period;
abd50713 3271
e7850595 3272 if (local64_read(&hwc->period_left) > 8*sample_period) {
f39d47ff
SE
3273 if (disable)
3274 event->pmu->stop(event, PERF_EF_UPDATE);
3275
e7850595 3276 local64_set(&hwc->period_left, 0);
f39d47ff
SE
3277
3278 if (disable)
3279 event->pmu->start(event, PERF_EF_RELOAD);
abd50713 3280 }
bd2b5b12
PZ
3281}
3282
e050e3f0
SE
3283/*
3284 * combine freq adjustment with unthrottling to avoid two passes over the
3285 * events. At the same time, make sure, having freq events does not change
3286 * the rate of unthrottling as that would introduce bias.
3287 */
3288static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
3289 int needs_unthr)
60db5e09 3290{
cdd6c482
IM
3291 struct perf_event *event;
3292 struct hw_perf_event *hwc;
e050e3f0 3293 u64 now, period = TICK_NSEC;
abd50713 3294 s64 delta;
60db5e09 3295
e050e3f0
SE
3296 /*
3297 * only need to iterate over all events iff:
3298 * - context have events in frequency mode (needs freq adjust)
3299 * - there are events to unthrottle on this cpu
3300 */
3301 if (!(ctx->nr_freq || needs_unthr))
0f5a2601
PZ
3302 return;
3303
e050e3f0 3304 raw_spin_lock(&ctx->lock);
f39d47ff 3305 perf_pmu_disable(ctx->pmu);
e050e3f0 3306
03541f8b 3307 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
cdd6c482 3308 if (event->state != PERF_EVENT_STATE_ACTIVE)
60db5e09
PZ
3309 continue;
3310
5632ab12 3311 if (!event_filter_match(event))
5d27c23d
PZ
3312 continue;
3313
44377277
AS
3314 perf_pmu_disable(event->pmu);
3315
cdd6c482 3316 hwc = &event->hw;
6a24ed6c 3317
ae23bff1 3318 if (hwc->interrupts == MAX_INTERRUPTS) {
e050e3f0 3319 hwc->interrupts = 0;
cdd6c482 3320 perf_log_throttle(event, 1);
a4eaf7f1 3321 event->pmu->start(event, 0);
a78ac325
PZ
3322 }
3323
cdd6c482 3324 if (!event->attr.freq || !event->attr.sample_freq)
44377277 3325 goto next;
60db5e09 3326
e050e3f0
SE
3327 /*
3328 * stop the event and update event->count
3329 */
3330 event->pmu->stop(event, PERF_EF_UPDATE);
3331
e7850595 3332 now = local64_read(&event->count);
abd50713
PZ
3333 delta = now - hwc->freq_count_stamp;
3334 hwc->freq_count_stamp = now;
60db5e09 3335
e050e3f0
SE
3336 /*
3337 * restart the event
3338 * reload only if value has changed
f39d47ff
SE
3339 * we have stopped the event so tell that
3340 * to perf_adjust_period() to avoid stopping it
3341 * twice.
e050e3f0 3342 */
abd50713 3343 if (delta > 0)
f39d47ff 3344 perf_adjust_period(event, period, delta, false);
e050e3f0
SE
3345
3346 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
44377277
AS
3347 next:
3348 perf_pmu_enable(event->pmu);
60db5e09 3349 }
e050e3f0 3350
f39d47ff 3351 perf_pmu_enable(ctx->pmu);
e050e3f0 3352 raw_spin_unlock(&ctx->lock);
60db5e09
PZ
3353}
3354
235c7fc7 3355/*
cdd6c482 3356 * Round-robin a context's events:
235c7fc7 3357 */
cdd6c482 3358static void rotate_ctx(struct perf_event_context *ctx)
0793a61d 3359{
dddd3379
TG
3360 /*
3361 * Rotate the first entry last of non-pinned groups. Rotation might be
3362 * disabled by the inheritance code.
3363 */
3364 if (!ctx->rotate_disable)
3365 list_rotate_left(&ctx->flexible_groups);
235c7fc7
IM
3366}
3367
9e630205 3368static int perf_rotate_context(struct perf_cpu_context *cpuctx)
235c7fc7 3369{
8dc85d54 3370 struct perf_event_context *ctx = NULL;
2fde4f94 3371 int rotate = 0;
7fc23a53 3372
b5ab4cd5 3373 if (cpuctx->ctx.nr_events) {
b5ab4cd5
PZ
3374 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
3375 rotate = 1;
3376 }
235c7fc7 3377
8dc85d54 3378 ctx = cpuctx->task_ctx;
b5ab4cd5 3379 if (ctx && ctx->nr_events) {
b5ab4cd5
PZ
3380 if (ctx->nr_events != ctx->nr_active)
3381 rotate = 1;
3382 }
9717e6cd 3383
e050e3f0 3384 if (!rotate)
0f5a2601
PZ
3385 goto done;
3386
facc4307 3387 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
1b9a644f 3388 perf_pmu_disable(cpuctx->ctx.pmu);
60db5e09 3389
e050e3f0
SE
3390 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3391 if (ctx)
3392 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
0793a61d 3393
e050e3f0
SE
3394 rotate_ctx(&cpuctx->ctx);
3395 if (ctx)
3396 rotate_ctx(ctx);
235c7fc7 3397
e050e3f0 3398 perf_event_sched_in(cpuctx, ctx, current);
235c7fc7 3399
0f5a2601
PZ
3400 perf_pmu_enable(cpuctx->ctx.pmu);
3401 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
b5ab4cd5 3402done:
9e630205
SE
3403
3404 return rotate;
e9d2b064
PZ
3405}
3406
3407void perf_event_task_tick(void)
3408{
2fde4f94
MR
3409 struct list_head *head = this_cpu_ptr(&active_ctx_list);
3410 struct perf_event_context *ctx, *tmp;
e050e3f0 3411 int throttled;
b5ab4cd5 3412
e9d2b064
PZ
3413 WARN_ON(!irqs_disabled());
3414
e050e3f0
SE
3415 __this_cpu_inc(perf_throttled_seq);
3416 throttled = __this_cpu_xchg(perf_throttled_count, 0);
555e0c1e 3417 tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
e050e3f0 3418
2fde4f94 3419 list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
e050e3f0 3420 perf_adjust_freq_unthr_context(ctx, throttled);
0793a61d
TG
3421}
3422
889ff015
FW
3423static int event_enable_on_exec(struct perf_event *event,
3424 struct perf_event_context *ctx)
3425{
3426 if (!event->attr.enable_on_exec)
3427 return 0;
3428
3429 event->attr.enable_on_exec = 0;
3430 if (event->state >= PERF_EVENT_STATE_INACTIVE)
3431 return 0;
3432
1d9b482e 3433 __perf_event_mark_enabled(event);
889ff015
FW
3434
3435 return 1;
3436}
3437
57e7986e 3438/*
cdd6c482 3439 * Enable all of a task's events that have been marked enable-on-exec.
57e7986e
PM
3440 * This expects task == current.
3441 */
c1274499 3442static void perf_event_enable_on_exec(int ctxn)
57e7986e 3443{
c1274499 3444 struct perf_event_context *ctx, *clone_ctx = NULL;
3e349507 3445 struct perf_cpu_context *cpuctx;
cdd6c482 3446 struct perf_event *event;
57e7986e
PM
3447 unsigned long flags;
3448 int enabled = 0;
3449
3450 local_irq_save(flags);
c1274499 3451 ctx = current->perf_event_ctxp[ctxn];
cdd6c482 3452 if (!ctx || !ctx->nr_events)
57e7986e
PM
3453 goto out;
3454
3e349507
PZ
3455 cpuctx = __get_cpu_context(ctx);
3456 perf_ctx_lock(cpuctx, ctx);
7fce2509 3457 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
3e349507
PZ
3458 list_for_each_entry(event, &ctx->event_list, event_entry)
3459 enabled |= event_enable_on_exec(event, ctx);
57e7986e
PM
3460
3461 /*
3e349507 3462 * Unclone and reschedule this context if we enabled any event.
57e7986e 3463 */
3e349507 3464 if (enabled) {
211de6eb 3465 clone_ctx = unclone_ctx(ctx);
3e349507
PZ
3466 ctx_resched(cpuctx, ctx);
3467 }
3468 perf_ctx_unlock(cpuctx, ctx);
57e7986e 3469
9ed6060d 3470out:
57e7986e 3471 local_irq_restore(flags);
211de6eb
PZ
3472
3473 if (clone_ctx)
3474 put_ctx(clone_ctx);
57e7986e
PM
3475}
3476
0492d4c5
PZ
3477struct perf_read_data {
3478 struct perf_event *event;
3479 bool group;
7d88962e 3480 int ret;
0492d4c5
PZ
3481};
3482
d6a2f903
DCC
3483static int find_cpu_to_read(struct perf_event *event, int local_cpu)
3484{
3485 int event_cpu = event->oncpu;
3486 u16 local_pkg, event_pkg;
3487
3488 if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
3489 event_pkg = topology_physical_package_id(event_cpu);
3490 local_pkg = topology_physical_package_id(local_cpu);
3491
3492 if (event_pkg == local_pkg)
3493 return local_cpu;
3494 }
3495
3496 return event_cpu;
3497}
3498
0793a61d 3499/*
cdd6c482 3500 * Cross CPU call to read the hardware event
0793a61d 3501 */
cdd6c482 3502static void __perf_event_read(void *info)
0793a61d 3503{
0492d4c5
PZ
3504 struct perf_read_data *data = info;
3505 struct perf_event *sub, *event = data->event;
cdd6c482 3506 struct perf_event_context *ctx = event->ctx;
108b02cf 3507 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
4a00c16e 3508 struct pmu *pmu = event->pmu;
621a01ea 3509
e1ac3614
PM
3510 /*
3511 * If this is a task context, we need to check whether it is
3512 * the current task context of this cpu. If not it has been
3513 * scheduled out before the smp call arrived. In that case
cdd6c482
IM
3514 * event->count would have been updated to a recent sample
3515 * when the event was scheduled out.
e1ac3614
PM
3516 */
3517 if (ctx->task && cpuctx->task_ctx != ctx)
3518 return;
3519
e625cce1 3520 raw_spin_lock(&ctx->lock);
e5d1367f 3521 if (ctx->is_active) {
542e72fc 3522 update_context_time(ctx);
e5d1367f
SE
3523 update_cgrp_time_from_event(event);
3524 }
0492d4c5 3525
cdd6c482 3526 update_event_times(event);
4a00c16e
SB
3527 if (event->state != PERF_EVENT_STATE_ACTIVE)
3528 goto unlock;
0492d4c5 3529
4a00c16e
SB
3530 if (!data->group) {
3531 pmu->read(event);
3532 data->ret = 0;
0492d4c5 3533 goto unlock;
4a00c16e
SB
3534 }
3535
3536 pmu->start_txn(pmu, PERF_PMU_TXN_READ);
3537
3538 pmu->read(event);
0492d4c5
PZ
3539
3540 list_for_each_entry(sub, &event->sibling_list, group_entry) {
3541 update_event_times(sub);
4a00c16e
SB
3542 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
3543 /*
3544 * Use sibling's PMU rather than @event's since
3545 * sibling could be on different (eg: software) PMU.
3546 */
0492d4c5 3547 sub->pmu->read(sub);
4a00c16e 3548 }
0492d4c5 3549 }
4a00c16e
SB
3550
3551 data->ret = pmu->commit_txn(pmu);
0492d4c5
PZ
3552
3553unlock:
e625cce1 3554 raw_spin_unlock(&ctx->lock);
0793a61d
TG
3555}
3556
b5e58793
PZ
3557static inline u64 perf_event_count(struct perf_event *event)
3558{
eacd3ecc
MF
3559 if (event->pmu->count)
3560 return event->pmu->count(event);
3561
3562 return __perf_event_count(event);
b5e58793
PZ
3563}
3564
ffe8690c
KX
3565/*
3566 * NMI-safe method to read a local event, that is an event that
3567 * is:
3568 * - either for the current task, or for this CPU
3569 * - does not have inherit set, for inherited task events
3570 * will not be local and we cannot read them atomically
3571 * - must not have a pmu::count method
3572 */
3573u64 perf_event_read_local(struct perf_event *event)
3574{
3575 unsigned long flags;
3576 u64 val;
3577
3578 /*
3579 * Disabling interrupts avoids all counter scheduling (context
3580 * switches, timer based rotation and IPIs).
3581 */
3582 local_irq_save(flags);
3583
3584 /* If this is a per-task event, it must be for current */
3585 WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
3586 event->hw.target != current);
3587
3588 /* If this is a per-CPU event, it must be for this CPU */
3589 WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
3590 event->cpu != smp_processor_id());
3591
3592 /*
3593 * It must not be an event with inherit set, we cannot read
3594 * all child counters from atomic context.
3595 */
3596 WARN_ON_ONCE(event->attr.inherit);
3597
3598 /*
3599 * It must not have a pmu::count method, those are not
3600 * NMI safe.
3601 */
3602 WARN_ON_ONCE(event->pmu->count);
3603
3604 /*
3605 * If the event is currently on this CPU, its either a per-task event,
3606 * or local to this CPU. Furthermore it means its ACTIVE (otherwise
3607 * oncpu == -1).
3608 */
3609 if (event->oncpu == smp_processor_id())
3610 event->pmu->read(event);
3611
3612 val = local64_read(&event->count);
3613 local_irq_restore(flags);
3614
3615 return val;
3616}
3617
7d88962e 3618static int perf_event_read(struct perf_event *event, bool group)
0793a61d 3619{
d6a2f903 3620 int ret = 0, cpu_to_read, local_cpu;
7d88962e 3621
0793a61d 3622 /*
cdd6c482
IM
3623 * If event is enabled and currently active on a CPU, update the
3624 * value in the event structure:
0793a61d 3625 */
cdd6c482 3626 if (event->state == PERF_EVENT_STATE_ACTIVE) {
0492d4c5
PZ
3627 struct perf_read_data data = {
3628 .event = event,
3629 .group = group,
7d88962e 3630 .ret = 0,
0492d4c5 3631 };
d6a2f903
DCC
3632
3633 local_cpu = get_cpu();
3634 cpu_to_read = find_cpu_to_read(event, local_cpu);
3635 put_cpu();
3636
58763148
PZ
3637 /*
3638 * Purposely ignore the smp_call_function_single() return
3639 * value.
3640 *
3641 * If event->oncpu isn't a valid CPU it means the event got
3642 * scheduled out and that will have updated the event count.
3643 *
3644 * Therefore, either way, we'll have an up-to-date event count
3645 * after this.
3646 */
2cc53841 3647 (void)smp_call_function_single(cpu_to_read, __perf_event_read, &data, 1);
58763148 3648 ret = data.ret;
cdd6c482 3649 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
2b8988c9
PZ
3650 struct perf_event_context *ctx = event->ctx;
3651 unsigned long flags;
3652
e625cce1 3653 raw_spin_lock_irqsave(&ctx->lock, flags);
c530ccd9
SE
3654 /*
3655 * may read while context is not active
3656 * (e.g., thread is blocked), in that case
3657 * we cannot update context time
3658 */
e5d1367f 3659 if (ctx->is_active) {
c530ccd9 3660 update_context_time(ctx);
e5d1367f
SE
3661 update_cgrp_time_from_event(event);
3662 }
0492d4c5
PZ
3663 if (group)
3664 update_group_times(event);
3665 else
3666 update_event_times(event);
e625cce1 3667 raw_spin_unlock_irqrestore(&ctx->lock, flags);
0793a61d 3668 }
7d88962e
SB
3669
3670 return ret;
0793a61d
TG
3671}
3672
a63eaf34 3673/*
cdd6c482 3674 * Initialize the perf_event context in a task_struct:
a63eaf34 3675 */
eb184479 3676static void __perf_event_init_context(struct perf_event_context *ctx)
a63eaf34 3677{
e625cce1 3678 raw_spin_lock_init(&ctx->lock);
a63eaf34 3679 mutex_init(&ctx->mutex);
2fde4f94 3680 INIT_LIST_HEAD(&ctx->active_ctx_list);
889ff015
FW
3681 INIT_LIST_HEAD(&ctx->pinned_groups);
3682 INIT_LIST_HEAD(&ctx->flexible_groups);
a63eaf34
PM
3683 INIT_LIST_HEAD(&ctx->event_list);
3684 atomic_set(&ctx->refcount, 1);
eb184479
PZ
3685}
3686
3687static struct perf_event_context *
3688alloc_perf_context(struct pmu *pmu, struct task_struct *task)
3689{
3690 struct perf_event_context *ctx;
3691
3692 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
3693 if (!ctx)
3694 return NULL;
3695
3696 __perf_event_init_context(ctx);
3697 if (task) {
3698 ctx->task = task;
3699 get_task_struct(task);
0793a61d 3700 }
eb184479
PZ
3701 ctx->pmu = pmu;
3702
3703 return ctx;
a63eaf34
PM
3704}
3705
2ebd4ffb
MH
3706static struct task_struct *
3707find_lively_task_by_vpid(pid_t vpid)
3708{
3709 struct task_struct *task;
0793a61d
TG
3710
3711 rcu_read_lock();
2ebd4ffb 3712 if (!vpid)
0793a61d
TG
3713 task = current;
3714 else
2ebd4ffb 3715 task = find_task_by_vpid(vpid);
0793a61d
TG
3716 if (task)
3717 get_task_struct(task);
3718 rcu_read_unlock();
3719
3720 if (!task)
3721 return ERR_PTR(-ESRCH);
3722
2ebd4ffb 3723 return task;
2ebd4ffb
MH
3724}
3725
fe4b04fa
PZ
3726/*
3727 * Returns a matching context with refcount and pincount.
3728 */
108b02cf 3729static struct perf_event_context *
4af57ef2
YZ
3730find_get_context(struct pmu *pmu, struct task_struct *task,
3731 struct perf_event *event)
0793a61d 3732{
211de6eb 3733 struct perf_event_context *ctx, *clone_ctx = NULL;
22a4f650 3734 struct perf_cpu_context *cpuctx;
4af57ef2 3735 void *task_ctx_data = NULL;
25346b93 3736 unsigned long flags;
8dc85d54 3737 int ctxn, err;
4af57ef2 3738 int cpu = event->cpu;
0793a61d 3739
22a4ec72 3740 if (!task) {
cdd6c482 3741 /* Must be root to operate on a CPU event: */
0764771d 3742 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
0793a61d
TG
3743 return ERR_PTR(-EACCES);
3744
0793a61d 3745 /*
cdd6c482 3746 * We could be clever and allow to attach a event to an
0793a61d
TG
3747 * offline CPU and activate it when the CPU comes up, but
3748 * that's for later.
3749 */
f6325e30 3750 if (!cpu_online(cpu))
0793a61d
TG
3751 return ERR_PTR(-ENODEV);
3752
108b02cf 3753 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
0793a61d 3754 ctx = &cpuctx->ctx;
c93f7669 3755 get_ctx(ctx);
fe4b04fa 3756 ++ctx->pin_count;
0793a61d 3757
0793a61d
TG
3758 return ctx;
3759 }
3760
8dc85d54
PZ
3761 err = -EINVAL;
3762 ctxn = pmu->task_ctx_nr;
3763 if (ctxn < 0)
3764 goto errout;
3765
4af57ef2
YZ
3766 if (event->attach_state & PERF_ATTACH_TASK_DATA) {
3767 task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
3768 if (!task_ctx_data) {
3769 err = -ENOMEM;
3770 goto errout;
3771 }
3772 }
3773
9ed6060d 3774retry:
8dc85d54 3775 ctx = perf_lock_task_context(task, ctxn, &flags);
c93f7669 3776 if (ctx) {
211de6eb 3777 clone_ctx = unclone_ctx(ctx);
fe4b04fa 3778 ++ctx->pin_count;
4af57ef2
YZ
3779
3780 if (task_ctx_data && !ctx->task_ctx_data) {
3781 ctx->task_ctx_data = task_ctx_data;
3782 task_ctx_data = NULL;
3783 }
e625cce1 3784 raw_spin_unlock_irqrestore(&ctx->lock, flags);
211de6eb
PZ
3785
3786 if (clone_ctx)
3787 put_ctx(clone_ctx);
9137fb28 3788 } else {
eb184479 3789 ctx = alloc_perf_context(pmu, task);
c93f7669
PM
3790 err = -ENOMEM;
3791 if (!ctx)
3792 goto errout;
eb184479 3793
4af57ef2
YZ
3794 if (task_ctx_data) {
3795 ctx->task_ctx_data = task_ctx_data;
3796 task_ctx_data = NULL;
3797 }
3798
dbe08d82
ON
3799 err = 0;
3800 mutex_lock(&task->perf_event_mutex);
3801 /*
3802 * If it has already passed perf_event_exit_task().
3803 * we must see PF_EXITING, it takes this mutex too.
3804 */
3805 if (task->flags & PF_EXITING)
3806 err = -ESRCH;
3807 else if (task->perf_event_ctxp[ctxn])
3808 err = -EAGAIN;
fe4b04fa 3809 else {
9137fb28 3810 get_ctx(ctx);
fe4b04fa 3811 ++ctx->pin_count;
dbe08d82 3812 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
fe4b04fa 3813 }
dbe08d82
ON
3814 mutex_unlock(&task->perf_event_mutex);
3815
3816 if (unlikely(err)) {
9137fb28 3817 put_ctx(ctx);
dbe08d82
ON
3818
3819 if (err == -EAGAIN)
3820 goto retry;
3821 goto errout;
a63eaf34
PM
3822 }
3823 }
3824
4af57ef2 3825 kfree(task_ctx_data);
0793a61d 3826 return ctx;
c93f7669 3827
9ed6060d 3828errout:
4af57ef2 3829 kfree(task_ctx_data);
c93f7669 3830 return ERR_PTR(err);
0793a61d
TG
3831}
3832
6fb2915d 3833static void perf_event_free_filter(struct perf_event *event);
2541517c 3834static void perf_event_free_bpf_prog(struct perf_event *event);
6fb2915d 3835
cdd6c482 3836static void free_event_rcu(struct rcu_head *head)
592903cd 3837{
cdd6c482 3838 struct perf_event *event;
592903cd 3839
cdd6c482
IM
3840 event = container_of(head, struct perf_event, rcu_head);
3841 if (event->ns)
3842 put_pid_ns(event->ns);
6fb2915d 3843 perf_event_free_filter(event);
cdd6c482 3844 kfree(event);
592903cd
PZ
3845}
3846
b69cf536
PZ
3847static void ring_buffer_attach(struct perf_event *event,
3848 struct ring_buffer *rb);
925d519a 3849
f2fb6bef
KL
3850static void detach_sb_event(struct perf_event *event)
3851{
3852 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
3853
3854 raw_spin_lock(&pel->lock);
3855 list_del_rcu(&event->sb_list);
3856 raw_spin_unlock(&pel->lock);
3857}
3858
a4f144eb 3859static bool is_sb_event(struct perf_event *event)
f2fb6bef 3860{
a4f144eb
DCC
3861 struct perf_event_attr *attr = &event->attr;
3862
f2fb6bef 3863 if (event->parent)
a4f144eb 3864 return false;
f2fb6bef
KL
3865
3866 if (event->attach_state & PERF_ATTACH_TASK)
a4f144eb 3867 return false;
f2fb6bef 3868
a4f144eb
DCC
3869 if (attr->mmap || attr->mmap_data || attr->mmap2 ||
3870 attr->comm || attr->comm_exec ||
3871 attr->task ||
3872 attr->context_switch)
3873 return true;
3874 return false;
3875}
3876
3877static void unaccount_pmu_sb_event(struct perf_event *event)
3878{
3879 if (is_sb_event(event))
3880 detach_sb_event(event);
f2fb6bef
KL
3881}
3882
4beb31f3 3883static void unaccount_event_cpu(struct perf_event *event, int cpu)
f1600952 3884{
4beb31f3
FW
3885 if (event->parent)
3886 return;
3887
4beb31f3
FW
3888 if (is_cgroup_event(event))
3889 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
3890}
925d519a 3891
555e0c1e
FW
3892#ifdef CONFIG_NO_HZ_FULL
3893static DEFINE_SPINLOCK(nr_freq_lock);
3894#endif
3895
3896static void unaccount_freq_event_nohz(void)
3897{
3898#ifdef CONFIG_NO_HZ_FULL
3899 spin_lock(&nr_freq_lock);
3900 if (atomic_dec_and_test(&nr_freq_events))
3901 tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
3902 spin_unlock(&nr_freq_lock);
3903#endif
3904}
3905
3906static void unaccount_freq_event(void)
3907{
3908 if (tick_nohz_full_enabled())
3909 unaccount_freq_event_nohz();
3910 else
3911 atomic_dec(&nr_freq_events);
3912}
3913
4beb31f3
FW
3914static void unaccount_event(struct perf_event *event)
3915{
25432ae9
PZ
3916 bool dec = false;
3917
4beb31f3
FW
3918 if (event->parent)
3919 return;
3920
3921 if (event->attach_state & PERF_ATTACH_TASK)
25432ae9 3922 dec = true;
4beb31f3
FW
3923 if (event->attr.mmap || event->attr.mmap_data)
3924 atomic_dec(&nr_mmap_events);
3925 if (event->attr.comm)
3926 atomic_dec(&nr_comm_events);
3927 if (event->attr.task)
3928 atomic_dec(&nr_task_events);
948b26b6 3929 if (event->attr.freq)
555e0c1e 3930 unaccount_freq_event();
45ac1403 3931 if (event->attr.context_switch) {
25432ae9 3932 dec = true;
45ac1403
AH
3933 atomic_dec(&nr_switch_events);
3934 }
4beb31f3 3935 if (is_cgroup_event(event))
25432ae9 3936 dec = true;
4beb31f3 3937 if (has_branch_stack(event))
25432ae9
PZ
3938 dec = true;
3939
9107c89e
PZ
3940 if (dec) {
3941 if (!atomic_add_unless(&perf_sched_count, -1, 1))
3942 schedule_delayed_work(&perf_sched_work, HZ);
3943 }
4beb31f3
FW
3944
3945 unaccount_event_cpu(event, event->cpu);
f2fb6bef
KL
3946
3947 unaccount_pmu_sb_event(event);
4beb31f3 3948}
925d519a 3949
9107c89e
PZ
3950static void perf_sched_delayed(struct work_struct *work)
3951{
3952 mutex_lock(&perf_sched_mutex);
3953 if (atomic_dec_and_test(&perf_sched_count))
3954 static_branch_disable(&perf_sched_events);
3955 mutex_unlock(&perf_sched_mutex);
3956}
3957
bed5b25a
AS
3958/*
3959 * The following implement mutual exclusion of events on "exclusive" pmus
3960 * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
3961 * at a time, so we disallow creating events that might conflict, namely:
3962 *
3963 * 1) cpu-wide events in the presence of per-task events,
3964 * 2) per-task events in the presence of cpu-wide events,
3965 * 3) two matching events on the same context.
3966 *
3967 * The former two cases are handled in the allocation path (perf_event_alloc(),
a0733e69 3968 * _free_event()), the latter -- before the first perf_install_in_context().
bed5b25a
AS
3969 */
3970static int exclusive_event_init(struct perf_event *event)
3971{
3972 struct pmu *pmu = event->pmu;
3973
3974 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3975 return 0;
3976
3977 /*
3978 * Prevent co-existence of per-task and cpu-wide events on the
3979 * same exclusive pmu.
3980 *
3981 * Negative pmu::exclusive_cnt means there are cpu-wide
3982 * events on this "exclusive" pmu, positive means there are
3983 * per-task events.
3984 *
3985 * Since this is called in perf_event_alloc() path, event::ctx
3986 * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
3987 * to mean "per-task event", because unlike other attach states it
3988 * never gets cleared.
3989 */
3990 if (event->attach_state & PERF_ATTACH_TASK) {
3991 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
3992 return -EBUSY;
3993 } else {
3994 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
3995 return -EBUSY;
3996 }
3997
3998 return 0;
3999}
4000
4001static void exclusive_event_destroy(struct perf_event *event)
4002{
4003 struct pmu *pmu = event->pmu;
4004
4005 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
4006 return;
4007
4008 /* see comment in exclusive_event_init() */
4009 if (event->attach_state & PERF_ATTACH_TASK)
4010 atomic_dec(&pmu->exclusive_cnt);
4011 else
4012 atomic_inc(&pmu->exclusive_cnt);
4013}
4014
4015static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
4016{
3bf6215a 4017 if ((e1->pmu == e2->pmu) &&
bed5b25a
AS
4018 (e1->cpu == e2->cpu ||
4019 e1->cpu == -1 ||
4020 e2->cpu == -1))
4021 return true;
4022 return false;
4023}
4024
4025/* Called under the same ctx::mutex as perf_install_in_context() */
4026static bool exclusive_event_installable(struct perf_event *event,
4027 struct perf_event_context *ctx)
4028{
4029 struct perf_event *iter_event;
4030 struct pmu *pmu = event->pmu;
4031
4032 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
4033 return true;
4034
4035 list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
4036 if (exclusive_event_match(iter_event, event))
4037 return false;
4038 }
4039
4040 return true;
4041}
4042
375637bc
AS
4043static void perf_addr_filters_splice(struct perf_event *event,
4044 struct list_head *head);
4045
683ede43 4046static void _free_event(struct perf_event *event)
f1600952 4047{
e360adbe 4048 irq_work_sync(&event->pending);
925d519a 4049
4beb31f3 4050 unaccount_event(event);
9ee318a7 4051
76369139 4052 if (event->rb) {
9bb5d40c
PZ
4053 /*
4054 * Can happen when we close an event with re-directed output.
4055 *
4056 * Since we have a 0 refcount, perf_mmap_close() will skip
4057 * over us; possibly making our ring_buffer_put() the last.
4058 */
4059 mutex_lock(&event->mmap_mutex);
b69cf536 4060 ring_buffer_attach(event, NULL);
9bb5d40c 4061 mutex_unlock(&event->mmap_mutex);
a4be7c27
PZ
4062 }
4063
e5d1367f
SE
4064 if (is_cgroup_event(event))
4065 perf_detach_cgroup(event);
4066
a0733e69
PZ
4067 if (!event->parent) {
4068 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
4069 put_callchain_buffers();
4070 }
4071
4072 perf_event_free_bpf_prog(event);
375637bc
AS
4073 perf_addr_filters_splice(event, NULL);
4074 kfree(event->addr_filters_offs);
a0733e69
PZ
4075
4076 if (event->destroy)
4077 event->destroy(event);
4078
4079 if (event->ctx)
4080 put_ctx(event->ctx);
4081
62a92c8f
AS
4082 exclusive_event_destroy(event);
4083 module_put(event->pmu->module);
a0733e69
PZ
4084
4085 call_rcu(&event->rcu_head, free_event_rcu);
f1600952
PZ
4086}
4087
683ede43
PZ
4088/*
4089 * Used to free events which have a known refcount of 1, such as in error paths
4090 * where the event isn't exposed yet and inherited events.
4091 */
4092static void free_event(struct perf_event *event)
0793a61d 4093{
683ede43
PZ
4094 if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
4095 "unexpected event refcount: %ld; ptr=%p\n",
4096 atomic_long_read(&event->refcount), event)) {
4097 /* leak to avoid use-after-free */
4098 return;
4099 }
0793a61d 4100
683ede43 4101 _free_event(event);
0793a61d
TG
4102}
4103
a66a3052 4104/*
f8697762 4105 * Remove user event from the owner task.
a66a3052 4106 */
f8697762 4107static void perf_remove_from_owner(struct perf_event *event)
fb0459d7 4108{
8882135b 4109 struct task_struct *owner;
fb0459d7 4110
8882135b 4111 rcu_read_lock();
8882135b 4112 /*
f47c02c0
PZ
4113 * Matches the smp_store_release() in perf_event_exit_task(). If we
4114 * observe !owner it means the list deletion is complete and we can
4115 * indeed free this event, otherwise we need to serialize on
8882135b
PZ
4116 * owner->perf_event_mutex.
4117 */
f47c02c0 4118 owner = lockless_dereference(event->owner);
8882135b
PZ
4119 if (owner) {
4120 /*
4121 * Since delayed_put_task_struct() also drops the last
4122 * task reference we can safely take a new reference
4123 * while holding the rcu_read_lock().
4124 */
4125 get_task_struct(owner);
4126 }
4127 rcu_read_unlock();
4128
4129 if (owner) {
f63a8daa
PZ
4130 /*
4131 * If we're here through perf_event_exit_task() we're already
4132 * holding ctx->mutex which would be an inversion wrt. the
4133 * normal lock order.
4134 *
4135 * However we can safely take this lock because its the child
4136 * ctx->mutex.
4137 */
4138 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
4139
8882135b
PZ
4140 /*
4141 * We have to re-check the event->owner field, if it is cleared
4142 * we raced with perf_event_exit_task(), acquiring the mutex
4143 * ensured they're done, and we can proceed with freeing the
4144 * event.
4145 */
f47c02c0 4146 if (event->owner) {
8882135b 4147 list_del_init(&event->owner_entry);
f47c02c0
PZ
4148 smp_store_release(&event->owner, NULL);
4149 }
8882135b
PZ
4150 mutex_unlock(&owner->perf_event_mutex);
4151 put_task_struct(owner);
4152 }
f8697762
JO
4153}
4154
f8697762
JO
4155static void put_event(struct perf_event *event)
4156{
f8697762
JO
4157 if (!atomic_long_dec_and_test(&event->refcount))
4158 return;
4159
c6e5b732
PZ
4160 _free_event(event);
4161}
4162
4163/*
4164 * Kill an event dead; while event:refcount will preserve the event
4165 * object, it will not preserve its functionality. Once the last 'user'
4166 * gives up the object, we'll destroy the thing.
4167 */
4168int perf_event_release_kernel(struct perf_event *event)
4169{
a4f4bb6d 4170 struct perf_event_context *ctx = event->ctx;
c6e5b732
PZ
4171 struct perf_event *child, *tmp;
4172
a4f4bb6d
PZ
4173 /*
4174 * If we got here through err_file: fput(event_file); we will not have
4175 * attached to a context yet.
4176 */
4177 if (!ctx) {
4178 WARN_ON_ONCE(event->attach_state &
4179 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
4180 goto no_ctx;
4181 }
4182
f8697762
JO
4183 if (!is_kernel_event(event))
4184 perf_remove_from_owner(event);
8882135b 4185
5fa7c8ec 4186 ctx = perf_event_ctx_lock(event);
a83fe28e 4187 WARN_ON_ONCE(ctx->parent_ctx);
a69b0ca4 4188 perf_remove_from_context(event, DETACH_GROUP);
683ede43 4189
a69b0ca4 4190 raw_spin_lock_irq(&ctx->lock);
683ede43 4191 /*
a69b0ca4
PZ
4192 * Mark this even as STATE_DEAD, there is no external reference to it
4193 * anymore.
683ede43 4194 *
a69b0ca4
PZ
4195 * Anybody acquiring event->child_mutex after the below loop _must_
4196 * also see this, most importantly inherit_event() which will avoid
4197 * placing more children on the list.
683ede43 4198 *
c6e5b732
PZ
4199 * Thus this guarantees that we will in fact observe and kill _ALL_
4200 * child events.
683ede43 4201 */
a69b0ca4
PZ
4202 event->state = PERF_EVENT_STATE_DEAD;
4203 raw_spin_unlock_irq(&ctx->lock);
4204
4205 perf_event_ctx_unlock(event, ctx);
683ede43 4206
c6e5b732
PZ
4207again:
4208 mutex_lock(&event->child_mutex);
4209 list_for_each_entry(child, &event->child_list, child_list) {
a6fa941d 4210
c6e5b732
PZ
4211 /*
4212 * Cannot change, child events are not migrated, see the
4213 * comment with perf_event_ctx_lock_nested().
4214 */
4215 ctx = lockless_dereference(child->ctx);
4216 /*
4217 * Since child_mutex nests inside ctx::mutex, we must jump
4218 * through hoops. We start by grabbing a reference on the ctx.
4219 *
4220 * Since the event cannot get freed while we hold the
4221 * child_mutex, the context must also exist and have a !0
4222 * reference count.
4223 */
4224 get_ctx(ctx);
4225
4226 /*
4227 * Now that we have a ctx ref, we can drop child_mutex, and
4228 * acquire ctx::mutex without fear of it going away. Then we
4229 * can re-acquire child_mutex.
4230 */
4231 mutex_unlock(&event->child_mutex);
4232 mutex_lock(&ctx->mutex);
4233 mutex_lock(&event->child_mutex);
4234
4235 /*
4236 * Now that we hold ctx::mutex and child_mutex, revalidate our
4237 * state, if child is still the first entry, it didn't get freed
4238 * and we can continue doing so.
4239 */
4240 tmp = list_first_entry_or_null(&event->child_list,
4241 struct perf_event, child_list);
4242 if (tmp == child) {
4243 perf_remove_from_context(child, DETACH_GROUP);
4244 list_del(&child->child_list);
4245 free_event(child);
4246 /*
4247 * This matches the refcount bump in inherit_event();
4248 * this can't be the last reference.
4249 */
4250 put_event(event);
4251 }
4252
4253 mutex_unlock(&event->child_mutex);
4254 mutex_unlock(&ctx->mutex);
4255 put_ctx(ctx);
4256 goto again;
4257 }
4258 mutex_unlock(&event->child_mutex);
4259
a4f4bb6d
PZ
4260no_ctx:
4261 put_event(event); /* Must be the 'last' reference */
683ede43
PZ
4262 return 0;
4263}
4264EXPORT_SYMBOL_GPL(perf_event_release_kernel);
4265
8b10c5e2
PZ
4266/*
4267 * Called when the last reference to the file is gone.
4268 */
a6fa941d
AV
4269static int perf_release(struct inode *inode, struct file *file)
4270{
c6e5b732 4271 perf_event_release_kernel(file->private_data);
a6fa941d 4272 return 0;
fb0459d7 4273}
fb0459d7 4274
59ed446f 4275u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
e53c0994 4276{
cdd6c482 4277 struct perf_event *child;
e53c0994
PZ
4278 u64 total = 0;
4279
59ed446f
PZ
4280 *enabled = 0;
4281 *running = 0;
4282
6f10581a 4283 mutex_lock(&event->child_mutex);
01add3ea 4284
7d88962e 4285 (void)perf_event_read(event, false);
01add3ea
SB
4286 total += perf_event_count(event);
4287
59ed446f
PZ
4288 *enabled += event->total_time_enabled +
4289 atomic64_read(&event->child_total_time_enabled);
4290 *running += event->total_time_running +
4291 atomic64_read(&event->child_total_time_running);
4292
4293 list_for_each_entry(child, &event->child_list, child_list) {
7d88962e 4294 (void)perf_event_read(child, false);
01add3ea 4295 total += perf_event_count(child);
59ed446f
PZ
4296 *enabled += child->total_time_enabled;
4297 *running += child->total_time_running;
4298 }
6f10581a 4299 mutex_unlock(&event->child_mutex);
e53c0994
PZ
4300
4301 return total;
4302}
fb0459d7 4303EXPORT_SYMBOL_GPL(perf_event_read_value);
e53c0994 4304
7d88962e 4305static int __perf_read_group_add(struct perf_event *leader,
fa8c2693 4306 u64 read_format, u64 *values)
3dab77fb 4307{
fa8c2693
PZ
4308 struct perf_event *sub;
4309 int n = 1; /* skip @nr */
7d88962e 4310 int ret;
f63a8daa 4311
7d88962e
SB
4312 ret = perf_event_read(leader, true);
4313 if (ret)
4314 return ret;
abf4868b 4315
fa8c2693
PZ
4316 /*
4317 * Since we co-schedule groups, {enabled,running} times of siblings
4318 * will be identical to those of the leader, so we only publish one
4319 * set.
4320 */
4321 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
4322 values[n++] += leader->total_time_enabled +
4323 atomic64_read(&leader->child_total_time_enabled);
4324 }
3dab77fb 4325
fa8c2693
PZ
4326 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
4327 values[n++] += leader->total_time_running +
4328 atomic64_read(&leader->child_total_time_running);
4329 }
4330
4331 /*
4332 * Write {count,id} tuples for every sibling.
4333 */
4334 values[n++] += perf_event_count(leader);
abf4868b
PZ
4335 if (read_format & PERF_FORMAT_ID)
4336 values[n++] = primary_event_id(leader);
3dab77fb 4337
fa8c2693
PZ
4338 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4339 values[n++] += perf_event_count(sub);
4340 if (read_format & PERF_FORMAT_ID)
4341 values[n++] = primary_event_id(sub);
4342 }
7d88962e
SB
4343
4344 return 0;
fa8c2693 4345}
3dab77fb 4346
fa8c2693
PZ
4347static int perf_read_group(struct perf_event *event,
4348 u64 read_format, char __user *buf)
4349{
4350 struct perf_event *leader = event->group_leader, *child;
4351 struct perf_event_context *ctx = leader->ctx;
7d88962e 4352 int ret;
fa8c2693 4353 u64 *values;
3dab77fb 4354
fa8c2693 4355 lockdep_assert_held(&ctx->mutex);
3dab77fb 4356
fa8c2693
PZ
4357 values = kzalloc(event->read_size, GFP_KERNEL);
4358 if (!values)
4359 return -ENOMEM;
3dab77fb 4360
fa8c2693
PZ
4361 values[0] = 1 + leader->nr_siblings;
4362
4363 /*
4364 * By locking the child_mutex of the leader we effectively
4365 * lock the child list of all siblings.. XXX explain how.
4366 */
4367 mutex_lock(&leader->child_mutex);
abf4868b 4368
7d88962e
SB
4369 ret = __perf_read_group_add(leader, read_format, values);
4370 if (ret)
4371 goto unlock;
4372
4373 list_for_each_entry(child, &leader->child_list, child_list) {
4374 ret = __perf_read_group_add(child, read_format, values);
4375 if (ret)
4376 goto unlock;
4377 }
abf4868b 4378
fa8c2693 4379 mutex_unlock(&leader->child_mutex);
abf4868b 4380
7d88962e 4381 ret = event->read_size;
fa8c2693
PZ
4382 if (copy_to_user(buf, values, event->read_size))
4383 ret = -EFAULT;
7d88962e 4384 goto out;
fa8c2693 4385
7d88962e
SB
4386unlock:
4387 mutex_unlock(&leader->child_mutex);
4388out:
fa8c2693 4389 kfree(values);
abf4868b 4390 return ret;
3dab77fb
PZ
4391}
4392
b15f495b 4393static int perf_read_one(struct perf_event *event,
3dab77fb
PZ
4394 u64 read_format, char __user *buf)
4395{
59ed446f 4396 u64 enabled, running;
3dab77fb
PZ
4397 u64 values[4];
4398 int n = 0;
4399
59ed446f
PZ
4400 values[n++] = perf_event_read_value(event, &enabled, &running);
4401 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
4402 values[n++] = enabled;
4403 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
4404 values[n++] = running;
3dab77fb 4405 if (read_format & PERF_FORMAT_ID)
cdd6c482 4406 values[n++] = primary_event_id(event);
3dab77fb
PZ
4407
4408 if (copy_to_user(buf, values, n * sizeof(u64)))
4409 return -EFAULT;
4410
4411 return n * sizeof(u64);
4412}
4413
dc633982
JO
4414static bool is_event_hup(struct perf_event *event)
4415{
4416 bool no_children;
4417
a69b0ca4 4418 if (event->state > PERF_EVENT_STATE_EXIT)
dc633982
JO
4419 return false;
4420
4421 mutex_lock(&event->child_mutex);
4422 no_children = list_empty(&event->child_list);
4423 mutex_unlock(&event->child_mutex);
4424 return no_children;
4425}
4426
0793a61d 4427/*
cdd6c482 4428 * Read the performance event - simple non blocking version for now
0793a61d
TG
4429 */
4430static ssize_t
b15f495b 4431__perf_read(struct perf_event *event, char __user *buf, size_t count)
0793a61d 4432{
cdd6c482 4433 u64 read_format = event->attr.read_format;
3dab77fb 4434 int ret;
0793a61d 4435
3b6f9e5c 4436 /*
cdd6c482 4437 * Return end-of-file for a read on a event that is in
3b6f9e5c
PM
4438 * error state (i.e. because it was pinned but it couldn't be
4439 * scheduled on to the CPU at some point).
4440 */
cdd6c482 4441 if (event->state == PERF_EVENT_STATE_ERROR)
3b6f9e5c
PM
4442 return 0;
4443
c320c7b7 4444 if (count < event->read_size)
3dab77fb
PZ
4445 return -ENOSPC;
4446
cdd6c482 4447 WARN_ON_ONCE(event->ctx->parent_ctx);
3dab77fb 4448 if (read_format & PERF_FORMAT_GROUP)
b15f495b 4449 ret = perf_read_group(event, read_format, buf);
3dab77fb 4450 else
b15f495b 4451 ret = perf_read_one(event, read_format, buf);
0793a61d 4452
3dab77fb 4453 return ret;
0793a61d
TG
4454}
4455
0793a61d
TG
4456static ssize_t
4457perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
4458{
cdd6c482 4459 struct perf_event *event = file->private_data;
f63a8daa
PZ
4460 struct perf_event_context *ctx;
4461 int ret;
0793a61d 4462
f63a8daa 4463 ctx = perf_event_ctx_lock(event);
b15f495b 4464 ret = __perf_read(event, buf, count);
f63a8daa
PZ
4465 perf_event_ctx_unlock(event, ctx);
4466
4467 return ret;
0793a61d
TG
4468}
4469
4470static unsigned int perf_poll(struct file *file, poll_table *wait)
4471{
cdd6c482 4472 struct perf_event *event = file->private_data;
76369139 4473 struct ring_buffer *rb;
61b67684 4474 unsigned int events = POLLHUP;
c7138f37 4475
e708d7ad 4476 poll_wait(file, &event->waitq, wait);
179033b3 4477
dc633982 4478 if (is_event_hup(event))
179033b3 4479 return events;
c7138f37 4480
10c6db11 4481 /*
9bb5d40c
PZ
4482 * Pin the event->rb by taking event->mmap_mutex; otherwise
4483 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
10c6db11
PZ
4484 */
4485 mutex_lock(&event->mmap_mutex);
9bb5d40c
PZ
4486 rb = event->rb;
4487 if (rb)
76369139 4488 events = atomic_xchg(&rb->poll, 0);
10c6db11 4489 mutex_unlock(&event->mmap_mutex);
0793a61d
TG
4490 return events;
4491}
4492
f63a8daa 4493static void _perf_event_reset(struct perf_event *event)
6de6a7b9 4494{
7d88962e 4495 (void)perf_event_read(event, false);
e7850595 4496 local64_set(&event->count, 0);
cdd6c482 4497 perf_event_update_userpage(event);
3df5edad
PZ
4498}
4499
c93f7669 4500/*
cdd6c482
IM
4501 * Holding the top-level event's child_mutex means that any
4502 * descendant process that has inherited this event will block
8ba289b8 4503 * in perf_event_exit_event() if it goes to exit, thus satisfying the
cdd6c482 4504 * task existence requirements of perf_event_enable/disable.
c93f7669 4505 */
cdd6c482
IM
4506static void perf_event_for_each_child(struct perf_event *event,
4507 void (*func)(struct perf_event *))
3df5edad 4508{
cdd6c482 4509 struct perf_event *child;
3df5edad 4510
cdd6c482 4511 WARN_ON_ONCE(event->ctx->parent_ctx);
f63a8daa 4512
cdd6c482
IM
4513 mutex_lock(&event->child_mutex);
4514 func(event);
4515 list_for_each_entry(child, &event->child_list, child_list)
3df5edad 4516 func(child);
cdd6c482 4517 mutex_unlock(&event->child_mutex);
3df5edad
PZ
4518}
4519
cdd6c482
IM
4520static void perf_event_for_each(struct perf_event *event,
4521 void (*func)(struct perf_event *))
3df5edad 4522{
cdd6c482
IM
4523 struct perf_event_context *ctx = event->ctx;
4524 struct perf_event *sibling;
3df5edad 4525
f63a8daa
PZ
4526 lockdep_assert_held(&ctx->mutex);
4527
cdd6c482 4528 event = event->group_leader;
75f937f2 4529
cdd6c482 4530 perf_event_for_each_child(event, func);
cdd6c482 4531 list_for_each_entry(sibling, &event->sibling_list, group_entry)
724b6daa 4532 perf_event_for_each_child(sibling, func);
6de6a7b9
PZ
4533}
4534
fae3fde6
PZ
4535static void __perf_event_period(struct perf_event *event,
4536 struct perf_cpu_context *cpuctx,
4537 struct perf_event_context *ctx,
4538 void *info)
c7999c6f 4539{
fae3fde6 4540 u64 value = *((u64 *)info);
c7999c6f 4541 bool active;
08247e31 4542
cdd6c482 4543 if (event->attr.freq) {
cdd6c482 4544 event->attr.sample_freq = value;
08247e31 4545 } else {
cdd6c482
IM
4546 event->attr.sample_period = value;
4547 event->hw.sample_period = value;
08247e31 4548 }
bad7192b
PZ
4549
4550 active = (event->state == PERF_EVENT_STATE_ACTIVE);
4551 if (active) {
4552 perf_pmu_disable(ctx->pmu);
1e02cd40
PZ
4553 /*
4554 * We could be throttled; unthrottle now to avoid the tick
4555 * trying to unthrottle while we already re-started the event.
4556 */
4557 if (event->hw.interrupts == MAX_INTERRUPTS) {
4558 event->hw.interrupts = 0;
4559 perf_log_throttle(event, 1);
4560 }
bad7192b
PZ
4561 event->pmu->stop(event, PERF_EF_UPDATE);
4562 }
4563
4564 local64_set(&event->hw.period_left, 0);
4565
4566 if (active) {
4567 event->pmu->start(event, PERF_EF_RELOAD);
4568 perf_pmu_enable(ctx->pmu);
4569 }
c7999c6f
PZ
4570}
4571
4572static int perf_event_period(struct perf_event *event, u64 __user *arg)
4573{
c7999c6f
PZ
4574 u64 value;
4575
4576 if (!is_sampling_event(event))
4577 return -EINVAL;
4578
4579 if (copy_from_user(&value, arg, sizeof(value)))
4580 return -EFAULT;
4581
4582 if (!value)
4583 return -EINVAL;
4584
4585 if (event->attr.freq && value > sysctl_perf_event_sample_rate)
4586 return -EINVAL;
4587
fae3fde6 4588 event_function_call(event, __perf_event_period, &value);
08247e31 4589
c7999c6f 4590 return 0;
08247e31
PZ
4591}
4592
ac9721f3
PZ
4593static const struct file_operations perf_fops;
4594
2903ff01 4595static inline int perf_fget_light(int fd, struct fd *p)
ac9721f3 4596{
2903ff01
AV
4597 struct fd f = fdget(fd);
4598 if (!f.file)
4599 return -EBADF;
ac9721f3 4600
2903ff01
AV
4601 if (f.file->f_op != &perf_fops) {
4602 fdput(f);
4603 return -EBADF;
ac9721f3 4604 }
2903ff01
AV
4605 *p = f;
4606 return 0;
ac9721f3
PZ
4607}
4608
4609static int perf_event_set_output(struct perf_event *event,
4610 struct perf_event *output_event);
6fb2915d 4611static int perf_event_set_filter(struct perf_event *event, void __user *arg);
2541517c 4612static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
a4be7c27 4613
f63a8daa 4614static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
d859e29f 4615{
cdd6c482 4616 void (*func)(struct perf_event *);
3df5edad 4617 u32 flags = arg;
d859e29f
PM
4618
4619 switch (cmd) {
cdd6c482 4620 case PERF_EVENT_IOC_ENABLE:
f63a8daa 4621 func = _perf_event_enable;
d859e29f 4622 break;
cdd6c482 4623 case PERF_EVENT_IOC_DISABLE:
f63a8daa 4624 func = _perf_event_disable;
79f14641 4625 break;
cdd6c482 4626 case PERF_EVENT_IOC_RESET:
f63a8daa 4627 func = _perf_event_reset;
6de6a7b9 4628 break;
3df5edad 4629
cdd6c482 4630 case PERF_EVENT_IOC_REFRESH:
f63a8daa 4631 return _perf_event_refresh(event, arg);
08247e31 4632
cdd6c482
IM
4633 case PERF_EVENT_IOC_PERIOD:
4634 return perf_event_period(event, (u64 __user *)arg);
08247e31 4635
cf4957f1
JO
4636 case PERF_EVENT_IOC_ID:
4637 {
4638 u64 id = primary_event_id(event);
4639
4640 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
4641 return -EFAULT;
4642 return 0;
4643 }
4644
cdd6c482 4645 case PERF_EVENT_IOC_SET_OUTPUT:
ac9721f3 4646 {
ac9721f3 4647 int ret;
ac9721f3 4648 if (arg != -1) {
2903ff01
AV
4649 struct perf_event *output_event;
4650 struct fd output;
4651 ret = perf_fget_light(arg, &output);
4652 if (ret)
4653 return ret;
4654 output_event = output.file->private_data;
4655 ret = perf_event_set_output(event, output_event);
4656 fdput(output);
4657 } else {
4658 ret = perf_event_set_output(event, NULL);
ac9721f3 4659 }
ac9721f3
PZ
4660 return ret;
4661 }
a4be7c27 4662
6fb2915d
LZ
4663 case PERF_EVENT_IOC_SET_FILTER:
4664 return perf_event_set_filter(event, (void __user *)arg);
4665
2541517c
AS
4666 case PERF_EVENT_IOC_SET_BPF:
4667 return perf_event_set_bpf_prog(event, arg);
4668
86e7972f
WN
4669 case PERF_EVENT_IOC_PAUSE_OUTPUT: {
4670 struct ring_buffer *rb;
4671
4672 rcu_read_lock();
4673 rb = rcu_dereference(event->rb);
4674 if (!rb || !rb->nr_pages) {
4675 rcu_read_unlock();
4676 return -EINVAL;
4677 }
4678 rb_toggle_paused(rb, !!arg);
4679 rcu_read_unlock();
4680 return 0;
4681 }
d859e29f 4682 default:
3df5edad 4683 return -ENOTTY;
d859e29f 4684 }
3df5edad
PZ
4685
4686 if (flags & PERF_IOC_FLAG_GROUP)
cdd6c482 4687 perf_event_for_each(event, func);
3df5edad 4688 else
cdd6c482 4689 perf_event_for_each_child(event, func);
3df5edad
PZ
4690
4691 return 0;
d859e29f
PM
4692}
4693
f63a8daa
PZ
4694static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
4695{
4696 struct perf_event *event = file->private_data;
4697 struct perf_event_context *ctx;
4698 long ret;
4699
4700 ctx = perf_event_ctx_lock(event);
4701 ret = _perf_ioctl(event, cmd, arg);
4702 perf_event_ctx_unlock(event, ctx);
4703
4704 return ret;
4705}
4706
b3f20785
PM
4707#ifdef CONFIG_COMPAT
4708static long perf_compat_ioctl(struct file *file, unsigned int cmd,
4709 unsigned long arg)
4710{
4711 switch (_IOC_NR(cmd)) {
4712 case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
4713 case _IOC_NR(PERF_EVENT_IOC_ID):
4714 /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */
4715 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
4716 cmd &= ~IOCSIZE_MASK;
4717 cmd |= sizeof(void *) << IOCSIZE_SHIFT;
4718 }
4719 break;
4720 }
4721 return perf_ioctl(file, cmd, arg);
4722}
4723#else
4724# define perf_compat_ioctl NULL
4725#endif
4726
cdd6c482 4727int perf_event_task_enable(void)
771d7cde 4728{
f63a8daa 4729 struct perf_event_context *ctx;
cdd6c482 4730 struct perf_event *event;
771d7cde 4731
cdd6c482 4732 mutex_lock(&current->perf_event_mutex);
f63a8daa
PZ
4733 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4734 ctx = perf_event_ctx_lock(event);
4735 perf_event_for_each_child(event, _perf_event_enable);
4736 perf_event_ctx_unlock(event, ctx);
4737 }
cdd6c482 4738 mutex_unlock(&current->perf_event_mutex);
771d7cde
PZ
4739
4740 return 0;
4741}
4742
cdd6c482 4743int perf_event_task_disable(void)
771d7cde 4744{
f63a8daa 4745 struct perf_event_context *ctx;
cdd6c482 4746 struct perf_event *event;
771d7cde 4747
cdd6c482 4748 mutex_lock(&current->perf_event_mutex);
f63a8daa
PZ
4749 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4750 ctx = perf_event_ctx_lock(event);
4751 perf_event_for_each_child(event, _perf_event_disable);
4752 perf_event_ctx_unlock(event, ctx);
4753 }
cdd6c482 4754 mutex_unlock(&current->perf_event_mutex);
771d7cde
PZ
4755
4756 return 0;
4757}
4758
cdd6c482 4759static int perf_event_index(struct perf_event *event)
194002b2 4760{
a4eaf7f1
PZ
4761 if (event->hw.state & PERF_HES_STOPPED)
4762 return 0;
4763
cdd6c482 4764 if (event->state != PERF_EVENT_STATE_ACTIVE)
194002b2
PZ
4765 return 0;
4766
35edc2a5 4767 return event->pmu->event_idx(event);
194002b2
PZ
4768}
4769
c4794295 4770static void calc_timer_values(struct perf_event *event,
e3f3541c 4771 u64 *now,
7f310a5d
EM
4772 u64 *enabled,
4773 u64 *running)
c4794295 4774{
e3f3541c 4775 u64 ctx_time;
c4794295 4776
e3f3541c
PZ
4777 *now = perf_clock();
4778 ctx_time = event->shadow_ctx_time + *now;
c4794295
EM
4779 *enabled = ctx_time - event->tstamp_enabled;
4780 *running = ctx_time - event->tstamp_running;
4781}
4782
fa731587
PZ
4783static void perf_event_init_userpage(struct perf_event *event)
4784{
4785 struct perf_event_mmap_page *userpg;
4786 struct ring_buffer *rb;
4787
4788 rcu_read_lock();
4789 rb = rcu_dereference(event->rb);
4790 if (!rb)
4791 goto unlock;
4792
4793 userpg = rb->user_page;
4794
4795 /* Allow new userspace to detect that bit 0 is deprecated */
4796 userpg->cap_bit0_is_deprecated = 1;
4797 userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
e8c6deac
AS
4798 userpg->data_offset = PAGE_SIZE;
4799 userpg->data_size = perf_data_size(rb);
fa731587
PZ
4800
4801unlock:
4802 rcu_read_unlock();
4803}
4804
c1317ec2
AL
4805void __weak arch_perf_update_userpage(
4806 struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
e3f3541c
PZ
4807{
4808}
4809
38ff667b
PZ
4810/*
4811 * Callers need to ensure there can be no nesting of this function, otherwise
4812 * the seqlock logic goes bad. We can not serialize this because the arch
4813 * code calls this from NMI context.
4814 */
cdd6c482 4815void perf_event_update_userpage(struct perf_event *event)
37d81828 4816{
cdd6c482 4817 struct perf_event_mmap_page *userpg;
76369139 4818 struct ring_buffer *rb;
e3f3541c 4819 u64 enabled, running, now;
38ff667b
PZ
4820
4821 rcu_read_lock();
5ec4c599
PZ
4822 rb = rcu_dereference(event->rb);
4823 if (!rb)
4824 goto unlock;
4825
0d641208
EM
4826 /*
4827 * compute total_time_enabled, total_time_running
4828 * based on snapshot values taken when the event
4829 * was last scheduled in.
4830 *
4831 * we cannot simply called update_context_time()
4832 * because of locking issue as we can be called in
4833 * NMI context
4834 */
e3f3541c 4835 calc_timer_values(event, &now, &enabled, &running);
38ff667b 4836
76369139 4837 userpg = rb->user_page;
7b732a75
PZ
4838 /*
4839 * Disable preemption so as to not let the corresponding user-space
4840 * spin too long if we get preempted.
4841 */
4842 preempt_disable();
37d81828 4843 ++userpg->lock;
92f22a38 4844 barrier();
cdd6c482 4845 userpg->index = perf_event_index(event);
b5e58793 4846 userpg->offset = perf_event_count(event);
365a4038 4847 if (userpg->index)
e7850595 4848 userpg->offset -= local64_read(&event->hw.prev_count);
7b732a75 4849
0d641208 4850 userpg->time_enabled = enabled +
cdd6c482 4851 atomic64_read(&event->child_total_time_enabled);
7f8b4e4e 4852
0d641208 4853 userpg->time_running = running +
cdd6c482 4854 atomic64_read(&event->child_total_time_running);
7f8b4e4e 4855
c1317ec2 4856 arch_perf_update_userpage(event, userpg, now);
e3f3541c 4857
92f22a38 4858 barrier();
37d81828 4859 ++userpg->lock;
7b732a75 4860 preempt_enable();
38ff667b 4861unlock:
7b732a75 4862 rcu_read_unlock();
37d81828
PM
4863}
4864
906010b2
PZ
4865static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
4866{
4867 struct perf_event *event = vma->vm_file->private_data;
76369139 4868 struct ring_buffer *rb;
906010b2
PZ
4869 int ret = VM_FAULT_SIGBUS;
4870
4871 if (vmf->flags & FAULT_FLAG_MKWRITE) {
4872 if (vmf->pgoff == 0)
4873 ret = 0;
4874 return ret;
4875 }
4876
4877 rcu_read_lock();
76369139
FW
4878 rb = rcu_dereference(event->rb);
4879 if (!rb)
906010b2
PZ
4880 goto unlock;
4881
4882 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
4883 goto unlock;
4884
76369139 4885 vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
906010b2
PZ
4886 if (!vmf->page)
4887 goto unlock;
4888
4889 get_page(vmf->page);
4890 vmf->page->mapping = vma->vm_file->f_mapping;
4891 vmf->page->index = vmf->pgoff;
4892
4893 ret = 0;
4894unlock:
4895 rcu_read_unlock();
4896
4897 return ret;
4898}
4899
10c6db11
PZ
4900static void ring_buffer_attach(struct perf_event *event,
4901 struct ring_buffer *rb)
4902{
b69cf536 4903 struct ring_buffer *old_rb = NULL;
10c6db11
PZ
4904 unsigned long flags;
4905
b69cf536
PZ
4906 if (event->rb) {
4907 /*
4908 * Should be impossible, we set this when removing
4909 * event->rb_entry and wait/clear when adding event->rb_entry.
4910 */
4911 WARN_ON_ONCE(event->rcu_pending);
10c6db11 4912
b69cf536 4913 old_rb = event->rb;
b69cf536
PZ
4914 spin_lock_irqsave(&old_rb->event_lock, flags);
4915 list_del_rcu(&event->rb_entry);
4916 spin_unlock_irqrestore(&old_rb->event_lock, flags);
10c6db11 4917
2f993cf0
ON
4918 event->rcu_batches = get_state_synchronize_rcu();
4919 event->rcu_pending = 1;
b69cf536 4920 }
10c6db11 4921
b69cf536 4922 if (rb) {
2f993cf0
ON
4923 if (event->rcu_pending) {
4924 cond_synchronize_rcu(event->rcu_batches);
4925 event->rcu_pending = 0;
4926 }
4927
b69cf536
PZ
4928 spin_lock_irqsave(&rb->event_lock, flags);
4929 list_add_rcu(&event->rb_entry, &rb->event_list);
4930 spin_unlock_irqrestore(&rb->event_lock, flags);
4931 }
4932
767ae086
AS
4933 /*
4934 * Avoid racing with perf_mmap_close(AUX): stop the event
4935 * before swizzling the event::rb pointer; if it's getting
4936 * unmapped, its aux_mmap_count will be 0 and it won't
4937 * restart. See the comment in __perf_pmu_output_stop().
4938 *
4939 * Data will inevitably be lost when set_output is done in
4940 * mid-air, but then again, whoever does it like this is
4941 * not in for the data anyway.
4942 */
4943 if (has_aux(event))
4944 perf_event_stop(event, 0);
4945
b69cf536
PZ
4946 rcu_assign_pointer(event->rb, rb);
4947
4948 if (old_rb) {
4949 ring_buffer_put(old_rb);
4950 /*
4951 * Since we detached before setting the new rb, so that we
4952 * could attach the new rb, we could have missed a wakeup.
4953 * Provide it now.
4954 */
4955 wake_up_all(&event->waitq);
4956 }
10c6db11
PZ
4957}
4958
4959static void ring_buffer_wakeup(struct perf_event *event)
4960{
4961 struct ring_buffer *rb;
4962
4963 rcu_read_lock();
4964 rb = rcu_dereference(event->rb);
9bb5d40c
PZ
4965 if (rb) {
4966 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
4967 wake_up_all(&event->waitq);
4968 }
10c6db11
PZ
4969 rcu_read_unlock();
4970}
4971
fdc26706 4972struct ring_buffer *ring_buffer_get(struct perf_event *event)
7b732a75 4973{
76369139 4974 struct ring_buffer *rb;
7b732a75 4975
ac9721f3 4976 rcu_read_lock();
76369139
FW
4977 rb = rcu_dereference(event->rb);
4978 if (rb) {
4979 if (!atomic_inc_not_zero(&rb->refcount))
4980 rb = NULL;
ac9721f3
PZ
4981 }
4982 rcu_read_unlock();
4983
76369139 4984 return rb;
ac9721f3
PZ
4985}
4986
fdc26706 4987void ring_buffer_put(struct ring_buffer *rb)
ac9721f3 4988{
76369139 4989 if (!atomic_dec_and_test(&rb->refcount))
ac9721f3 4990 return;
7b732a75 4991
9bb5d40c 4992 WARN_ON_ONCE(!list_empty(&rb->event_list));
10c6db11 4993
76369139 4994 call_rcu(&rb->rcu_head, rb_free_rcu);
7b732a75
PZ
4995}
4996
4997static void perf_mmap_open(struct vm_area_struct *vma)
4998{
cdd6c482 4999 struct perf_event *event = vma->vm_file->private_data;
7b732a75 5000
cdd6c482 5001 atomic_inc(&event->mmap_count);
9bb5d40c 5002 atomic_inc(&event->rb->mmap_count);
1e0fb9ec 5003
45bfb2e5
PZ
5004 if (vma->vm_pgoff)
5005 atomic_inc(&event->rb->aux_mmap_count);
5006
1e0fb9ec
AL
5007 if (event->pmu->event_mapped)
5008 event->pmu->event_mapped(event);
7b732a75
PZ
5009}
5010
95ff4ca2
AS
5011static void perf_pmu_output_stop(struct perf_event *event);
5012
9bb5d40c
PZ
5013/*
5014 * A buffer can be mmap()ed multiple times; either directly through the same
5015 * event, or through other events by use of perf_event_set_output().
5016 *
5017 * In order to undo the VM accounting done by perf_mmap() we need to destroy
5018 * the buffer here, where we still have a VM context. This means we need
5019 * to detach all events redirecting to us.
5020 */
7b732a75
PZ
5021static void perf_mmap_close(struct vm_area_struct *vma)
5022{
cdd6c482 5023 struct perf_event *event = vma->vm_file->private_data;
7b732a75 5024
b69cf536 5025 struct ring_buffer *rb = ring_buffer_get(event);
9bb5d40c
PZ
5026 struct user_struct *mmap_user = rb->mmap_user;
5027 int mmap_locked = rb->mmap_locked;
5028 unsigned long size = perf_data_size(rb);
789f90fc 5029
1e0fb9ec
AL
5030 if (event->pmu->event_unmapped)
5031 event->pmu->event_unmapped(event);
5032
45bfb2e5
PZ
5033 /*
5034 * rb->aux_mmap_count will always drop before rb->mmap_count and
5035 * event->mmap_count, so it is ok to use event->mmap_mutex to
5036 * serialize with perf_mmap here.
5037 */
5038 if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
5039 atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
95ff4ca2
AS
5040 /*
5041 * Stop all AUX events that are writing to this buffer,
5042 * so that we can free its AUX pages and corresponding PMU
5043 * data. Note that after rb::aux_mmap_count dropped to zero,
5044 * they won't start any more (see perf_aux_output_begin()).
5045 */
5046 perf_pmu_output_stop(event);
5047
5048 /* now it's safe to free the pages */
45bfb2e5
PZ
5049 atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
5050 vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
5051
95ff4ca2 5052 /* this has to be the last one */
45bfb2e5 5053 rb_free_aux(rb);
95ff4ca2
AS
5054 WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
5055
45bfb2e5
PZ
5056 mutex_unlock(&event->mmap_mutex);
5057 }
5058
9bb5d40c
PZ
5059 atomic_dec(&rb->mmap_count);
5060
5061 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
b69cf536 5062 goto out_put;
9bb5d40c 5063
b69cf536 5064 ring_buffer_attach(event, NULL);
9bb5d40c
PZ
5065 mutex_unlock(&event->mmap_mutex);
5066
5067 /* If there's still other mmap()s of this buffer, we're done. */
b69cf536
PZ
5068 if (atomic_read(&rb->mmap_count))
5069 goto out_put;
ac9721f3 5070
9bb5d40c
PZ
5071 /*
5072 * No other mmap()s, detach from all other events that might redirect
5073 * into the now unreachable buffer. Somewhat complicated by the
5074 * fact that rb::event_lock otherwise nests inside mmap_mutex.
5075 */
5076again:
5077 rcu_read_lock();
5078 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
5079 if (!atomic_long_inc_not_zero(&event->refcount)) {
5080 /*
5081 * This event is en-route to free_event() which will
5082 * detach it and remove it from the list.
5083 */
5084 continue;
5085 }
5086 rcu_read_unlock();
789f90fc 5087
9bb5d40c
PZ
5088 mutex_lock(&event->mmap_mutex);
5089 /*
5090 * Check we didn't race with perf_event_set_output() which can
5091 * swizzle the rb from under us while we were waiting to
5092 * acquire mmap_mutex.
5093 *
5094 * If we find a different rb; ignore this event, a next
5095 * iteration will no longer find it on the list. We have to
5096 * still restart the iteration to make sure we're not now
5097 * iterating the wrong list.
5098 */
b69cf536
PZ
5099 if (event->rb == rb)
5100 ring_buffer_attach(event, NULL);
5101
cdd6c482 5102 mutex_unlock(&event->mmap_mutex);
9bb5d40c 5103 put_event(event);
ac9721f3 5104
9bb5d40c
PZ
5105 /*
5106 * Restart the iteration; either we're on the wrong list or
5107 * destroyed its integrity by doing a deletion.
5108 */
5109 goto again;
7b732a75 5110 }
9bb5d40c
PZ
5111 rcu_read_unlock();
5112
5113 /*
5114 * It could be there's still a few 0-ref events on the list; they'll
5115 * get cleaned up by free_event() -- they'll also still have their
5116 * ref on the rb and will free it whenever they are done with it.
5117 *
5118 * Aside from that, this buffer is 'fully' detached and unmapped,
5119 * undo the VM accounting.
5120 */
5121
5122 atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
5123 vma->vm_mm->pinned_vm -= mmap_locked;
5124 free_uid(mmap_user);
5125
b69cf536 5126out_put:
9bb5d40c 5127 ring_buffer_put(rb); /* could be last */
37d81828
PM
5128}
5129
f0f37e2f 5130static const struct vm_operations_struct perf_mmap_vmops = {
43a21ea8 5131 .open = perf_mmap_open,
45bfb2e5 5132 .close = perf_mmap_close, /* non mergable */
43a21ea8
PZ
5133 .fault = perf_mmap_fault,
5134 .page_mkwrite = perf_mmap_fault,
37d81828
PM
5135};
5136
5137static int perf_mmap(struct file *file, struct vm_area_struct *vma)
5138{
cdd6c482 5139 struct perf_event *event = file->private_data;
22a4f650 5140 unsigned long user_locked, user_lock_limit;
789f90fc 5141 struct user_struct *user = current_user();
22a4f650 5142 unsigned long locked, lock_limit;
45bfb2e5 5143 struct ring_buffer *rb = NULL;
7b732a75
PZ
5144 unsigned long vma_size;
5145 unsigned long nr_pages;
45bfb2e5 5146 long user_extra = 0, extra = 0;
d57e34fd 5147 int ret = 0, flags = 0;
37d81828 5148
c7920614
PZ
5149 /*
5150 * Don't allow mmap() of inherited per-task counters. This would
5151 * create a performance issue due to all children writing to the
76369139 5152 * same rb.
c7920614
PZ
5153 */
5154 if (event->cpu == -1 && event->attr.inherit)
5155 return -EINVAL;
5156
43a21ea8 5157 if (!(vma->vm_flags & VM_SHARED))
37d81828 5158 return -EINVAL;
7b732a75
PZ
5159
5160 vma_size = vma->vm_end - vma->vm_start;
45bfb2e5
PZ
5161
5162 if (vma->vm_pgoff == 0) {
5163 nr_pages = (vma_size / PAGE_SIZE) - 1;
5164 } else {
5165 /*
5166 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
5167 * mapped, all subsequent mappings should have the same size
5168 * and offset. Must be above the normal perf buffer.
5169 */
5170 u64 aux_offset, aux_size;
5171
5172 if (!event->rb)
5173 return -EINVAL;
5174
5175 nr_pages = vma_size / PAGE_SIZE;
5176
5177 mutex_lock(&event->mmap_mutex);
5178 ret = -EINVAL;
5179
5180 rb = event->rb;
5181 if (!rb)
5182 goto aux_unlock;
5183
5184 aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
5185 aux_size = ACCESS_ONCE(rb->user_page->aux_size);
5186
5187 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
5188 goto aux_unlock;
5189
5190 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
5191 goto aux_unlock;
5192
5193 /* already mapped with a different offset */
5194 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
5195 goto aux_unlock;
5196
5197 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
5198 goto aux_unlock;
5199
5200 /* already mapped with a different size */
5201 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
5202 goto aux_unlock;
5203
5204 if (!is_power_of_2(nr_pages))
5205 goto aux_unlock;
5206
5207 if (!atomic_inc_not_zero(&rb->mmap_count))
5208 goto aux_unlock;
5209
5210 if (rb_has_aux(rb)) {
5211 atomic_inc(&rb->aux_mmap_count);
5212 ret = 0;
5213 goto unlock;
5214 }
5215
5216 atomic_set(&rb->aux_mmap_count, 1);
5217 user_extra = nr_pages;
5218
5219 goto accounting;
5220 }
7b732a75 5221
7730d865 5222 /*
76369139 5223 * If we have rb pages ensure they're a power-of-two number, so we
7730d865
PZ
5224 * can do bitmasks instead of modulo.
5225 */
2ed11312 5226 if (nr_pages != 0 && !is_power_of_2(nr_pages))
37d81828
PM
5227 return -EINVAL;
5228
7b732a75 5229 if (vma_size != PAGE_SIZE * (1 + nr_pages))
37d81828
PM
5230 return -EINVAL;
5231
cdd6c482 5232 WARN_ON_ONCE(event->ctx->parent_ctx);
9bb5d40c 5233again:
cdd6c482 5234 mutex_lock(&event->mmap_mutex);
76369139 5235 if (event->rb) {
9bb5d40c 5236 if (event->rb->nr_pages != nr_pages) {
ebb3c4c4 5237 ret = -EINVAL;
9bb5d40c
PZ
5238 goto unlock;
5239 }
5240
5241 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
5242 /*
5243 * Raced against perf_mmap_close() through
5244 * perf_event_set_output(). Try again, hope for better
5245 * luck.
5246 */
5247 mutex_unlock(&event->mmap_mutex);
5248 goto again;
5249 }
5250
ebb3c4c4
PZ
5251 goto unlock;
5252 }
5253
789f90fc 5254 user_extra = nr_pages + 1;
45bfb2e5
PZ
5255
5256accounting:
cdd6c482 5257 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
a3862d3f
IM
5258
5259 /*
5260 * Increase the limit linearly with more CPUs:
5261 */
5262 user_lock_limit *= num_online_cpus();
5263
789f90fc 5264 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
c5078f78 5265
789f90fc
PZ
5266 if (user_locked > user_lock_limit)
5267 extra = user_locked - user_lock_limit;
7b732a75 5268
78d7d407 5269 lock_limit = rlimit(RLIMIT_MEMLOCK);
7b732a75 5270 lock_limit >>= PAGE_SHIFT;
bc3e53f6 5271 locked = vma->vm_mm->pinned_vm + extra;
7b732a75 5272
459ec28a
IM
5273 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
5274 !capable(CAP_IPC_LOCK)) {
ebb3c4c4
PZ
5275 ret = -EPERM;
5276 goto unlock;
5277 }
7b732a75 5278
45bfb2e5 5279 WARN_ON(!rb && event->rb);
906010b2 5280
d57e34fd 5281 if (vma->vm_flags & VM_WRITE)
76369139 5282 flags |= RING_BUFFER_WRITABLE;
d57e34fd 5283
76369139 5284 if (!rb) {
45bfb2e5
PZ
5285 rb = rb_alloc(nr_pages,
5286 event->attr.watermark ? event->attr.wakeup_watermark : 0,
5287 event->cpu, flags);
26cb63ad 5288
45bfb2e5
PZ
5289 if (!rb) {
5290 ret = -ENOMEM;
5291 goto unlock;
5292 }
43a21ea8 5293
45bfb2e5
PZ
5294 atomic_set(&rb->mmap_count, 1);
5295 rb->mmap_user = get_current_user();
5296 rb->mmap_locked = extra;
26cb63ad 5297
45bfb2e5 5298 ring_buffer_attach(event, rb);
ac9721f3 5299
45bfb2e5
PZ
5300 perf_event_init_userpage(event);
5301 perf_event_update_userpage(event);
5302 } else {
1a594131
AS
5303 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
5304 event->attr.aux_watermark, flags);
45bfb2e5
PZ
5305 if (!ret)
5306 rb->aux_mmap_locked = extra;
5307 }
9a0f05cb 5308
ebb3c4c4 5309unlock:
45bfb2e5
PZ
5310 if (!ret) {
5311 atomic_long_add(user_extra, &user->locked_vm);
5312 vma->vm_mm->pinned_vm += extra;
5313
ac9721f3 5314 atomic_inc(&event->mmap_count);
45bfb2e5
PZ
5315 } else if (rb) {
5316 atomic_dec(&rb->mmap_count);
5317 }
5318aux_unlock:
cdd6c482 5319 mutex_unlock(&event->mmap_mutex);
37d81828 5320
9bb5d40c
PZ
5321 /*
5322 * Since pinned accounting is per vm we cannot allow fork() to copy our
5323 * vma.
5324 */
26cb63ad 5325 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
37d81828 5326 vma->vm_ops = &perf_mmap_vmops;
7b732a75 5327
1e0fb9ec
AL
5328 if (event->pmu->event_mapped)
5329 event->pmu->event_mapped(event);
5330
7b732a75 5331 return ret;
37d81828
PM
5332}
5333
3c446b3d
PZ
5334static int perf_fasync(int fd, struct file *filp, int on)
5335{
496ad9aa 5336 struct inode *inode = file_inode(filp);
cdd6c482 5337 struct perf_event *event = filp->private_data;
3c446b3d
PZ
5338 int retval;
5339
5955102c 5340 inode_lock(inode);
cdd6c482 5341 retval = fasync_helper(fd, filp, on, &event->fasync);
5955102c 5342 inode_unlock(inode);
3c446b3d
PZ
5343
5344 if (retval < 0)
5345 return retval;
5346
5347 return 0;
5348}
5349
0793a61d 5350static const struct file_operations perf_fops = {
3326c1ce 5351 .llseek = no_llseek,
0793a61d
TG
5352 .release = perf_release,
5353 .read = perf_read,
5354 .poll = perf_poll,
d859e29f 5355 .unlocked_ioctl = perf_ioctl,
b3f20785 5356 .compat_ioctl = perf_compat_ioctl,
37d81828 5357 .mmap = perf_mmap,
3c446b3d 5358 .fasync = perf_fasync,
0793a61d
TG
5359};
5360
925d519a 5361/*
cdd6c482 5362 * Perf event wakeup
925d519a
PZ
5363 *
5364 * If there's data, ensure we set the poll() state and publish everything
5365 * to user-space before waking everybody up.
5366 */
5367
fed66e2c
PZ
5368static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
5369{
5370 /* only the parent has fasync state */
5371 if (event->parent)
5372 event = event->parent;
5373 return &event->fasync;
5374}
5375
cdd6c482 5376void perf_event_wakeup(struct perf_event *event)
925d519a 5377{
10c6db11 5378 ring_buffer_wakeup(event);
4c9e2542 5379
cdd6c482 5380 if (event->pending_kill) {
fed66e2c 5381 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
cdd6c482 5382 event->pending_kill = 0;
4c9e2542 5383 }
925d519a
PZ
5384}
5385
e360adbe 5386static void perf_pending_event(struct irq_work *entry)
79f14641 5387{
cdd6c482
IM
5388 struct perf_event *event = container_of(entry,
5389 struct perf_event, pending);
d525211f
PZ
5390 int rctx;
5391
5392 rctx = perf_swevent_get_recursion_context();
5393 /*
5394 * If we 'fail' here, that's OK, it means recursion is already disabled
5395 * and we won't recurse 'further'.
5396 */
79f14641 5397
cdd6c482
IM
5398 if (event->pending_disable) {
5399 event->pending_disable = 0;
fae3fde6 5400 perf_event_disable_local(event);
79f14641
PZ
5401 }
5402
cdd6c482
IM
5403 if (event->pending_wakeup) {
5404 event->pending_wakeup = 0;
5405 perf_event_wakeup(event);
79f14641 5406 }
d525211f
PZ
5407
5408 if (rctx >= 0)
5409 perf_swevent_put_recursion_context(rctx);
79f14641
PZ
5410}
5411
39447b38
ZY
5412/*
5413 * We assume there is only KVM supporting the callbacks.
5414 * Later on, we might change it to a list if there is
5415 * another virtualization implementation supporting the callbacks.
5416 */
5417struct perf_guest_info_callbacks *perf_guest_cbs;
5418
5419int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5420{
5421 perf_guest_cbs = cbs;
5422 return 0;
5423}
5424EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
5425
5426int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5427{
5428 perf_guest_cbs = NULL;
5429 return 0;
5430}
5431EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
5432
4018994f
JO
5433static void
5434perf_output_sample_regs(struct perf_output_handle *handle,
5435 struct pt_regs *regs, u64 mask)
5436{
5437 int bit;
29dd3288 5438 DECLARE_BITMAP(_mask, 64);
4018994f 5439
29dd3288
MS
5440 bitmap_from_u64(_mask, mask);
5441 for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
4018994f
JO
5442 u64 val;
5443
5444 val = perf_reg_value(regs, bit);
5445 perf_output_put(handle, val);
5446 }
5447}
5448
60e2364e 5449static void perf_sample_regs_user(struct perf_regs *regs_user,
88a7c26a
AL
5450 struct pt_regs *regs,
5451 struct pt_regs *regs_user_copy)
4018994f 5452{
88a7c26a
AL
5453 if (user_mode(regs)) {
5454 regs_user->abi = perf_reg_abi(current);
2565711f 5455 regs_user->regs = regs;
88a7c26a
AL
5456 } else if (current->mm) {
5457 perf_get_regs_user(regs_user, regs, regs_user_copy);
2565711f
PZ
5458 } else {
5459 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
5460 regs_user->regs = NULL;
4018994f
JO
5461 }
5462}
5463
60e2364e
SE
5464static void perf_sample_regs_intr(struct perf_regs *regs_intr,
5465 struct pt_regs *regs)
5466{
5467 regs_intr->regs = regs;
5468 regs_intr->abi = perf_reg_abi(current);
5469}
5470
5471
c5ebcedb
JO
5472/*
5473 * Get remaining task size from user stack pointer.
5474 *
5475 * It'd be better to take stack vma map and limit this more
5476 * precisly, but there's no way to get it safely under interrupt,
5477 * so using TASK_SIZE as limit.
5478 */
5479static u64 perf_ustack_task_size(struct pt_regs *regs)
5480{
5481 unsigned long addr = perf_user_stack_pointer(regs);
5482
5483 if (!addr || addr >= TASK_SIZE)
5484 return 0;
5485
5486 return TASK_SIZE - addr;
5487}
5488
5489static u16
5490perf_sample_ustack_size(u16 stack_size, u16 header_size,
5491 struct pt_regs *regs)
5492{
5493 u64 task_size;
5494
5495 /* No regs, no stack pointer, no dump. */
5496 if (!regs)
5497 return 0;
5498
5499 /*
5500 * Check if we fit in with the requested stack size into the:
5501 * - TASK_SIZE
5502 * If we don't, we limit the size to the TASK_SIZE.
5503 *
5504 * - remaining sample size
5505 * If we don't, we customize the stack size to
5506 * fit in to the remaining sample size.
5507 */
5508
5509 task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
5510 stack_size = min(stack_size, (u16) task_size);
5511
5512 /* Current header size plus static size and dynamic size. */
5513 header_size += 2 * sizeof(u64);
5514
5515 /* Do we fit in with the current stack dump size? */
5516 if ((u16) (header_size + stack_size) < header_size) {
5517 /*
5518 * If we overflow the maximum size for the sample,
5519 * we customize the stack dump size to fit in.
5520 */
5521 stack_size = USHRT_MAX - header_size - sizeof(u64);
5522 stack_size = round_up(stack_size, sizeof(u64));
5523 }
5524
5525 return stack_size;
5526}
5527
5528static void
5529perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
5530 struct pt_regs *regs)
5531{
5532 /* Case of a kernel thread, nothing to dump */
5533 if (!regs) {
5534 u64 size = 0;
5535 perf_output_put(handle, size);
5536 } else {
5537 unsigned long sp;
5538 unsigned int rem;
5539 u64 dyn_size;
5540
5541 /*
5542 * We dump:
5543 * static size
5544 * - the size requested by user or the best one we can fit
5545 * in to the sample max size
5546 * data
5547 * - user stack dump data
5548 * dynamic size
5549 * - the actual dumped size
5550 */
5551
5552 /* Static size. */
5553 perf_output_put(handle, dump_size);
5554
5555 /* Data. */
5556 sp = perf_user_stack_pointer(regs);
5557 rem = __output_copy_user(handle, (void *) sp, dump_size);
5558 dyn_size = dump_size - rem;
5559
5560 perf_output_skip(handle, rem);
5561
5562 /* Dynamic size. */
5563 perf_output_put(handle, dyn_size);
5564 }
5565}
5566
c980d109
ACM
5567static void __perf_event_header__init_id(struct perf_event_header *header,
5568 struct perf_sample_data *data,
5569 struct perf_event *event)
6844c09d
ACM
5570{
5571 u64 sample_type = event->attr.sample_type;
5572
5573 data->type = sample_type;
5574 header->size += event->id_header_size;
5575
5576 if (sample_type & PERF_SAMPLE_TID) {
5577 /* namespace issues */
5578 data->tid_entry.pid = perf_event_pid(event, current);
5579 data->tid_entry.tid = perf_event_tid(event, current);
5580 }
5581
5582 if (sample_type & PERF_SAMPLE_TIME)
34f43927 5583 data->time = perf_event_clock(event);
6844c09d 5584
ff3d527c 5585 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
6844c09d
ACM
5586 data->id = primary_event_id(event);
5587
5588 if (sample_type & PERF_SAMPLE_STREAM_ID)
5589 data->stream_id = event->id;
5590
5591 if (sample_type & PERF_SAMPLE_CPU) {
5592 data->cpu_entry.cpu = raw_smp_processor_id();
5593 data->cpu_entry.reserved = 0;
5594 }
5595}
5596
76369139
FW
5597void perf_event_header__init_id(struct perf_event_header *header,
5598 struct perf_sample_data *data,
5599 struct perf_event *event)
c980d109
ACM
5600{
5601 if (event->attr.sample_id_all)
5602 __perf_event_header__init_id(header, data, event);
5603}
5604
5605static void __perf_event__output_id_sample(struct perf_output_handle *handle,
5606 struct perf_sample_data *data)
5607{
5608 u64 sample_type = data->type;
5609
5610 if (sample_type & PERF_SAMPLE_TID)
5611 perf_output_put(handle, data->tid_entry);
5612
5613 if (sample_type & PERF_SAMPLE_TIME)
5614 perf_output_put(handle, data->time);
5615
5616 if (sample_type & PERF_SAMPLE_ID)
5617 perf_output_put(handle, data->id);
5618
5619 if (sample_type & PERF_SAMPLE_STREAM_ID)
5620 perf_output_put(handle, data->stream_id);
5621
5622 if (sample_type & PERF_SAMPLE_CPU)
5623 perf_output_put(handle, data->cpu_entry);
ff3d527c
AH
5624
5625 if (sample_type & PERF_SAMPLE_IDENTIFIER)
5626 perf_output_put(handle, data->id);
c980d109
ACM
5627}
5628
76369139
FW
5629void perf_event__output_id_sample(struct perf_event *event,
5630 struct perf_output_handle *handle,
5631 struct perf_sample_data *sample)
c980d109
ACM
5632{
5633 if (event->attr.sample_id_all)
5634 __perf_event__output_id_sample(handle, sample);
5635}
5636
3dab77fb 5637static void perf_output_read_one(struct perf_output_handle *handle,
eed01528
SE
5638 struct perf_event *event,
5639 u64 enabled, u64 running)
3dab77fb 5640{
cdd6c482 5641 u64 read_format = event->attr.read_format;
3dab77fb
PZ
5642 u64 values[4];
5643 int n = 0;
5644
b5e58793 5645 values[n++] = perf_event_count(event);
3dab77fb 5646 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
eed01528 5647 values[n++] = enabled +
cdd6c482 5648 atomic64_read(&event->child_total_time_enabled);
3dab77fb
PZ
5649 }
5650 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
eed01528 5651 values[n++] = running +
cdd6c482 5652 atomic64_read(&event->child_total_time_running);
3dab77fb
PZ
5653 }
5654 if (read_format & PERF_FORMAT_ID)
cdd6c482 5655 values[n++] = primary_event_id(event);
3dab77fb 5656
76369139 5657 __output_copy(handle, values, n * sizeof(u64));
3dab77fb
PZ
5658}
5659
5660/*
cdd6c482 5661 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
3dab77fb
PZ
5662 */
5663static void perf_output_read_group(struct perf_output_handle *handle,
eed01528
SE
5664 struct perf_event *event,
5665 u64 enabled, u64 running)
3dab77fb 5666{
cdd6c482
IM
5667 struct perf_event *leader = event->group_leader, *sub;
5668 u64 read_format = event->attr.read_format;
3dab77fb
PZ
5669 u64 values[5];
5670 int n = 0;
5671
5672 values[n++] = 1 + leader->nr_siblings;
5673
5674 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
eed01528 5675 values[n++] = enabled;
3dab77fb
PZ
5676
5677 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
eed01528 5678 values[n++] = running;
3dab77fb 5679
cdd6c482 5680 if (leader != event)
3dab77fb
PZ
5681 leader->pmu->read(leader);
5682
b5e58793 5683 values[n++] = perf_event_count(leader);
3dab77fb 5684 if (read_format & PERF_FORMAT_ID)
cdd6c482 5685 values[n++] = primary_event_id(leader);
3dab77fb 5686
76369139 5687 __output_copy(handle, values, n * sizeof(u64));
3dab77fb 5688
65abc865 5689 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3dab77fb
PZ
5690 n = 0;
5691
6f5ab001
JO
5692 if ((sub != event) &&
5693 (sub->state == PERF_EVENT_STATE_ACTIVE))
3dab77fb
PZ
5694 sub->pmu->read(sub);
5695
b5e58793 5696 values[n++] = perf_event_count(sub);
3dab77fb 5697 if (read_format & PERF_FORMAT_ID)
cdd6c482 5698 values[n++] = primary_event_id(sub);
3dab77fb 5699
76369139 5700 __output_copy(handle, values, n * sizeof(u64));
3dab77fb
PZ
5701 }
5702}
5703
eed01528
SE
5704#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
5705 PERF_FORMAT_TOTAL_TIME_RUNNING)
5706
3dab77fb 5707static void perf_output_read(struct perf_output_handle *handle,
cdd6c482 5708 struct perf_event *event)
3dab77fb 5709{
e3f3541c 5710 u64 enabled = 0, running = 0, now;
eed01528
SE
5711 u64 read_format = event->attr.read_format;
5712
5713 /*
5714 * compute total_time_enabled, total_time_running
5715 * based on snapshot values taken when the event
5716 * was last scheduled in.
5717 *
5718 * we cannot simply called update_context_time()
5719 * because of locking issue as we are called in
5720 * NMI context
5721 */
c4794295 5722 if (read_format & PERF_FORMAT_TOTAL_TIMES)
e3f3541c 5723 calc_timer_values(event, &now, &enabled, &running);
eed01528 5724
cdd6c482 5725 if (event->attr.read_format & PERF_FORMAT_GROUP)
eed01528 5726 perf_output_read_group(handle, event, enabled, running);
3dab77fb 5727 else
eed01528 5728 perf_output_read_one(handle, event, enabled, running);
3dab77fb
PZ
5729}
5730
5622f295
MM
5731void perf_output_sample(struct perf_output_handle *handle,
5732 struct perf_event_header *header,
5733 struct perf_sample_data *data,
cdd6c482 5734 struct perf_event *event)
5622f295
MM
5735{
5736 u64 sample_type = data->type;
5737
5738 perf_output_put(handle, *header);
5739
ff3d527c
AH
5740 if (sample_type & PERF_SAMPLE_IDENTIFIER)
5741 perf_output_put(handle, data->id);
5742
5622f295
MM
5743 if (sample_type & PERF_SAMPLE_IP)
5744 perf_output_put(handle, data->ip);
5745
5746 if (sample_type & PERF_SAMPLE_TID)
5747 perf_output_put(handle, data->tid_entry);
5748
5749 if (sample_type & PERF_SAMPLE_TIME)
5750 perf_output_put(handle, data->time);
5751
5752 if (sample_type & PERF_SAMPLE_ADDR)
5753 perf_output_put(handle, data->addr);
5754
5755 if (sample_type & PERF_SAMPLE_ID)
5756 perf_output_put(handle, data->id);
5757
5758 if (sample_type & PERF_SAMPLE_STREAM_ID)
5759 perf_output_put(handle, data->stream_id);
5760
5761 if (sample_type & PERF_SAMPLE_CPU)
5762 perf_output_put(handle, data->cpu_entry);
5763
5764 if (sample_type & PERF_SAMPLE_PERIOD)
5765 perf_output_put(handle, data->period);
5766
5767 if (sample_type & PERF_SAMPLE_READ)
cdd6c482 5768 perf_output_read(handle, event);
5622f295
MM
5769
5770 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5771 if (data->callchain) {
5772 int size = 1;
5773
5774 if (data->callchain)
5775 size += data->callchain->nr;
5776
5777 size *= sizeof(u64);
5778
76369139 5779 __output_copy(handle, data->callchain, size);
5622f295
MM
5780 } else {
5781 u64 nr = 0;
5782 perf_output_put(handle, nr);
5783 }
5784 }
5785
5786 if (sample_type & PERF_SAMPLE_RAW) {
7e3f977e
DB
5787 struct perf_raw_record *raw = data->raw;
5788
5789 if (raw) {
5790 struct perf_raw_frag *frag = &raw->frag;
5791
5792 perf_output_put(handle, raw->size);
5793 do {
5794 if (frag->copy) {
5795 __output_custom(handle, frag->copy,
5796 frag->data, frag->size);
5797 } else {
5798 __output_copy(handle, frag->data,
5799 frag->size);
5800 }
5801 if (perf_raw_frag_last(frag))
5802 break;
5803 frag = frag->next;
5804 } while (1);
5805 if (frag->pad)
5806 __output_skip(handle, NULL, frag->pad);
5622f295
MM
5807 } else {
5808 struct {
5809 u32 size;
5810 u32 data;
5811 } raw = {
5812 .size = sizeof(u32),
5813 .data = 0,
5814 };
5815 perf_output_put(handle, raw);
5816 }
5817 }
a7ac67ea 5818
bce38cd5
SE
5819 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5820 if (data->br_stack) {
5821 size_t size;
5822
5823 size = data->br_stack->nr
5824 * sizeof(struct perf_branch_entry);
5825
5826 perf_output_put(handle, data->br_stack->nr);
5827 perf_output_copy(handle, data->br_stack->entries, size);
5828 } else {
5829 /*
5830 * we always store at least the value of nr
5831 */
5832 u64 nr = 0;
5833 perf_output_put(handle, nr);
5834 }
5835 }
4018994f
JO
5836
5837 if (sample_type & PERF_SAMPLE_REGS_USER) {
5838 u64 abi = data->regs_user.abi;
5839
5840 /*
5841 * If there are no regs to dump, notice it through
5842 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
5843 */
5844 perf_output_put(handle, abi);
5845
5846 if (abi) {
5847 u64 mask = event->attr.sample_regs_user;
5848 perf_output_sample_regs(handle,
5849 data->regs_user.regs,
5850 mask);
5851 }
5852 }
c5ebcedb 5853
a5cdd40c 5854 if (sample_type & PERF_SAMPLE_STACK_USER) {
c5ebcedb
JO
5855 perf_output_sample_ustack(handle,
5856 data->stack_user_size,
5857 data->regs_user.regs);
a5cdd40c 5858 }
c3feedf2
AK
5859
5860 if (sample_type & PERF_SAMPLE_WEIGHT)
5861 perf_output_put(handle, data->weight);
d6be9ad6
SE
5862
5863 if (sample_type & PERF_SAMPLE_DATA_SRC)
5864 perf_output_put(handle, data->data_src.val);
a5cdd40c 5865
fdfbbd07
AK
5866 if (sample_type & PERF_SAMPLE_TRANSACTION)
5867 perf_output_put(handle, data->txn);
5868
60e2364e
SE
5869 if (sample_type & PERF_SAMPLE_REGS_INTR) {
5870 u64 abi = data->regs_intr.abi;
5871 /*
5872 * If there are no regs to dump, notice it through
5873 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
5874 */
5875 perf_output_put(handle, abi);
5876
5877 if (abi) {
5878 u64 mask = event->attr.sample_regs_intr;
5879
5880 perf_output_sample_regs(handle,
5881 data->regs_intr.regs,
5882 mask);
5883 }
5884 }
5885
a5cdd40c
PZ
5886 if (!event->attr.watermark) {
5887 int wakeup_events = event->attr.wakeup_events;
5888
5889 if (wakeup_events) {
5890 struct ring_buffer *rb = handle->rb;
5891 int events = local_inc_return(&rb->events);
5892
5893 if (events >= wakeup_events) {
5894 local_sub(wakeup_events, &rb->events);
5895 local_inc(&rb->wakeup);
5896 }
5897 }
5898 }
5622f295
MM
5899}
5900
5901void perf_prepare_sample(struct perf_event_header *header,
5902 struct perf_sample_data *data,
cdd6c482 5903 struct perf_event *event,
5622f295 5904 struct pt_regs *regs)
7b732a75 5905{
cdd6c482 5906 u64 sample_type = event->attr.sample_type;
7b732a75 5907
cdd6c482 5908 header->type = PERF_RECORD_SAMPLE;
c320c7b7 5909 header->size = sizeof(*header) + event->header_size;
5622f295
MM
5910
5911 header->misc = 0;
5912 header->misc |= perf_misc_flags(regs);
6fab0192 5913
c980d109 5914 __perf_event_header__init_id(header, data, event);
6844c09d 5915
c320c7b7 5916 if (sample_type & PERF_SAMPLE_IP)
5622f295
MM
5917 data->ip = perf_instruction_pointer(regs);
5918
b23f3325 5919 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5622f295 5920 int size = 1;
394ee076 5921
e6dab5ff 5922 data->callchain = perf_callchain(event, regs);
5622f295
MM
5923
5924 if (data->callchain)
5925 size += data->callchain->nr;
5926
5927 header->size += size * sizeof(u64);
394ee076
PZ
5928 }
5929
3a43ce68 5930 if (sample_type & PERF_SAMPLE_RAW) {
7e3f977e
DB
5931 struct perf_raw_record *raw = data->raw;
5932 int size;
5933
5934 if (raw) {
5935 struct perf_raw_frag *frag = &raw->frag;
5936 u32 sum = 0;
5937
5938 do {
5939 sum += frag->size;
5940 if (perf_raw_frag_last(frag))
5941 break;
5942 frag = frag->next;
5943 } while (1);
5944
5945 size = round_up(sum + sizeof(u32), sizeof(u64));
5946 raw->size = size - sizeof(u32);
5947 frag->pad = raw->size - sum;
5948 } else {
5949 size = sizeof(u64);
5950 }
a044560c 5951
7e3f977e 5952 header->size += size;
7f453c24 5953 }
bce38cd5
SE
5954
5955 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5956 int size = sizeof(u64); /* nr */
5957 if (data->br_stack) {
5958 size += data->br_stack->nr
5959 * sizeof(struct perf_branch_entry);
5960 }
5961 header->size += size;
5962 }
4018994f 5963
2565711f 5964 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
88a7c26a
AL
5965 perf_sample_regs_user(&data->regs_user, regs,
5966 &data->regs_user_copy);
2565711f 5967
4018994f
JO
5968 if (sample_type & PERF_SAMPLE_REGS_USER) {
5969 /* regs dump ABI info */
5970 int size = sizeof(u64);
5971
4018994f
JO
5972 if (data->regs_user.regs) {
5973 u64 mask = event->attr.sample_regs_user;
5974 size += hweight64(mask) * sizeof(u64);
5975 }
5976
5977 header->size += size;
5978 }
c5ebcedb
JO
5979
5980 if (sample_type & PERF_SAMPLE_STACK_USER) {
5981 /*
5982 * Either we need PERF_SAMPLE_STACK_USER bit to be allways
5983 * processed as the last one or have additional check added
5984 * in case new sample type is added, because we could eat
5985 * up the rest of the sample size.
5986 */
c5ebcedb
JO
5987 u16 stack_size = event->attr.sample_stack_user;
5988 u16 size = sizeof(u64);
5989
c5ebcedb 5990 stack_size = perf_sample_ustack_size(stack_size, header->size,
2565711f 5991 data->regs_user.regs);
c5ebcedb
JO
5992
5993 /*
5994 * If there is something to dump, add space for the dump
5995 * itself and for the field that tells the dynamic size,
5996 * which is how many have been actually dumped.
5997 */
5998 if (stack_size)
5999 size += sizeof(u64) + stack_size;
6000
6001 data->stack_user_size = stack_size;
6002 header->size += size;
6003 }
60e2364e
SE
6004
6005 if (sample_type & PERF_SAMPLE_REGS_INTR) {
6006 /* regs dump ABI info */
6007 int size = sizeof(u64);
6008
6009 perf_sample_regs_intr(&data->regs_intr, regs);
6010
6011 if (data->regs_intr.regs) {
6012 u64 mask = event->attr.sample_regs_intr;
6013
6014 size += hweight64(mask) * sizeof(u64);
6015 }
6016
6017 header->size += size;
6018 }
5622f295 6019}
7f453c24 6020
9ecda41a
WN
6021static void __always_inline
6022__perf_event_output(struct perf_event *event,
6023 struct perf_sample_data *data,
6024 struct pt_regs *regs,
6025 int (*output_begin)(struct perf_output_handle *,
6026 struct perf_event *,
6027 unsigned int))
5622f295
MM
6028{
6029 struct perf_output_handle handle;
6030 struct perf_event_header header;
689802b2 6031
927c7a9e
FW
6032 /* protect the callchain buffers */
6033 rcu_read_lock();
6034
cdd6c482 6035 perf_prepare_sample(&header, data, event, regs);
5c148194 6036
9ecda41a 6037 if (output_begin(&handle, event, header.size))
927c7a9e 6038 goto exit;
0322cd6e 6039
cdd6c482 6040 perf_output_sample(&handle, &header, data, event);
f413cdb8 6041
8a057d84 6042 perf_output_end(&handle);
927c7a9e
FW
6043
6044exit:
6045 rcu_read_unlock();
0322cd6e
PZ
6046}
6047
9ecda41a
WN
6048void
6049perf_event_output_forward(struct perf_event *event,
6050 struct perf_sample_data *data,
6051 struct pt_regs *regs)
6052{
6053 __perf_event_output(event, data, regs, perf_output_begin_forward);
6054}
6055
6056void
6057perf_event_output_backward(struct perf_event *event,
6058 struct perf_sample_data *data,
6059 struct pt_regs *regs)
6060{
6061 __perf_event_output(event, data, regs, perf_output_begin_backward);
6062}
6063
6064void
6065perf_event_output(struct perf_event *event,
6066 struct perf_sample_data *data,
6067 struct pt_regs *regs)
6068{
6069 __perf_event_output(event, data, regs, perf_output_begin);
6070}
6071
38b200d6 6072/*
cdd6c482 6073 * read event_id
38b200d6
PZ
6074 */
6075
6076struct perf_read_event {
6077 struct perf_event_header header;
6078
6079 u32 pid;
6080 u32 tid;
38b200d6
PZ
6081};
6082
6083static void
cdd6c482 6084perf_event_read_event(struct perf_event *event,
38b200d6
PZ
6085 struct task_struct *task)
6086{
6087 struct perf_output_handle handle;
c980d109 6088 struct perf_sample_data sample;
dfc65094 6089 struct perf_read_event read_event = {
38b200d6 6090 .header = {
cdd6c482 6091 .type = PERF_RECORD_READ,
38b200d6 6092 .misc = 0,
c320c7b7 6093 .size = sizeof(read_event) + event->read_size,
38b200d6 6094 },
cdd6c482
IM
6095 .pid = perf_event_pid(event, task),
6096 .tid = perf_event_tid(event, task),
38b200d6 6097 };
3dab77fb 6098 int ret;
38b200d6 6099
c980d109 6100 perf_event_header__init_id(&read_event.header, &sample, event);
a7ac67ea 6101 ret = perf_output_begin(&handle, event, read_event.header.size);
38b200d6
PZ
6102 if (ret)
6103 return;
6104
dfc65094 6105 perf_output_put(&handle, read_event);
cdd6c482 6106 perf_output_read(&handle, event);
c980d109 6107 perf_event__output_id_sample(event, &handle, &sample);
3dab77fb 6108
38b200d6
PZ
6109 perf_output_end(&handle);
6110}
6111
aab5b71e 6112typedef void (perf_iterate_f)(struct perf_event *event, void *data);
52d857a8
JO
6113
6114static void
aab5b71e
PZ
6115perf_iterate_ctx(struct perf_event_context *ctx,
6116 perf_iterate_f output,
b73e4fef 6117 void *data, bool all)
52d857a8
JO
6118{
6119 struct perf_event *event;
6120
6121 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
b73e4fef
AS
6122 if (!all) {
6123 if (event->state < PERF_EVENT_STATE_INACTIVE)
6124 continue;
6125 if (!event_filter_match(event))
6126 continue;
6127 }
6128
67516844 6129 output(event, data);
52d857a8
JO
6130 }
6131}
6132
aab5b71e 6133static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
f2fb6bef
KL
6134{
6135 struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
6136 struct perf_event *event;
6137
6138 list_for_each_entry_rcu(event, &pel->list, sb_list) {
0b8f1e2e
PZ
6139 /*
6140 * Skip events that are not fully formed yet; ensure that
6141 * if we observe event->ctx, both event and ctx will be
6142 * complete enough. See perf_install_in_context().
6143 */
6144 if (!smp_load_acquire(&event->ctx))
6145 continue;
6146
f2fb6bef
KL
6147 if (event->state < PERF_EVENT_STATE_INACTIVE)
6148 continue;
6149 if (!event_filter_match(event))
6150 continue;
6151 output(event, data);
6152 }
6153}
6154
aab5b71e
PZ
6155/*
6156 * Iterate all events that need to receive side-band events.
6157 *
6158 * For new callers; ensure that account_pmu_sb_event() includes
6159 * your event, otherwise it might not get delivered.
6160 */
52d857a8 6161static void
aab5b71e 6162perf_iterate_sb(perf_iterate_f output, void *data,
52d857a8
JO
6163 struct perf_event_context *task_ctx)
6164{
52d857a8 6165 struct perf_event_context *ctx;
52d857a8
JO
6166 int ctxn;
6167
aab5b71e
PZ
6168 rcu_read_lock();
6169 preempt_disable();
6170
4e93ad60 6171 /*
aab5b71e
PZ
6172 * If we have task_ctx != NULL we only notify the task context itself.
6173 * The task_ctx is set only for EXIT events before releasing task
4e93ad60
JO
6174 * context.
6175 */
6176 if (task_ctx) {
aab5b71e
PZ
6177 perf_iterate_ctx(task_ctx, output, data, false);
6178 goto done;
4e93ad60
JO
6179 }
6180
aab5b71e 6181 perf_iterate_sb_cpu(output, data);
f2fb6bef
KL
6182
6183 for_each_task_context_nr(ctxn) {
52d857a8
JO
6184 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
6185 if (ctx)
aab5b71e 6186 perf_iterate_ctx(ctx, output, data, false);
52d857a8 6187 }
aab5b71e 6188done:
f2fb6bef 6189 preempt_enable();
52d857a8 6190 rcu_read_unlock();
95ff4ca2
AS
6191}
6192
375637bc
AS
6193/*
6194 * Clear all file-based filters at exec, they'll have to be
6195 * re-instated when/if these objects are mmapped again.
6196 */
6197static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
6198{
6199 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
6200 struct perf_addr_filter *filter;
6201 unsigned int restart = 0, count = 0;
6202 unsigned long flags;
6203
6204 if (!has_addr_filter(event))
6205 return;
6206
6207 raw_spin_lock_irqsave(&ifh->lock, flags);
6208 list_for_each_entry(filter, &ifh->list, entry) {
6209 if (filter->inode) {
6210 event->addr_filters_offs[count] = 0;
6211 restart++;
6212 }
6213
6214 count++;
6215 }
6216
6217 if (restart)
6218 event->addr_filters_gen++;
6219 raw_spin_unlock_irqrestore(&ifh->lock, flags);
6220
6221 if (restart)
767ae086 6222 perf_event_stop(event, 1);
375637bc
AS
6223}
6224
6225void perf_event_exec(void)
6226{
6227 struct perf_event_context *ctx;
6228 int ctxn;
6229
6230 rcu_read_lock();
6231 for_each_task_context_nr(ctxn) {
6232 ctx = current->perf_event_ctxp[ctxn];
6233 if (!ctx)
6234 continue;
6235
6236 perf_event_enable_on_exec(ctxn);
6237
aab5b71e 6238 perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
375637bc
AS
6239 true);
6240 }
6241 rcu_read_unlock();
6242}
6243
95ff4ca2
AS
6244struct remote_output {
6245 struct ring_buffer *rb;
6246 int err;
6247};
6248
6249static void __perf_event_output_stop(struct perf_event *event, void *data)
6250{
6251 struct perf_event *parent = event->parent;
6252 struct remote_output *ro = data;
6253 struct ring_buffer *rb = ro->rb;
375637bc
AS
6254 struct stop_event_data sd = {
6255 .event = event,
6256 };
95ff4ca2
AS
6257
6258 if (!has_aux(event))
6259 return;
6260
6261 if (!parent)
6262 parent = event;
6263
6264 /*
6265 * In case of inheritance, it will be the parent that links to the
767ae086
AS
6266 * ring-buffer, but it will be the child that's actually using it.
6267 *
6268 * We are using event::rb to determine if the event should be stopped,
6269 * however this may race with ring_buffer_attach() (through set_output),
6270 * which will make us skip the event that actually needs to be stopped.
6271 * So ring_buffer_attach() has to stop an aux event before re-assigning
6272 * its rb pointer.
95ff4ca2
AS
6273 */
6274 if (rcu_dereference(parent->rb) == rb)
375637bc 6275 ro->err = __perf_event_stop(&sd);
95ff4ca2
AS
6276}
6277
6278static int __perf_pmu_output_stop(void *info)
6279{
6280 struct perf_event *event = info;
6281 struct pmu *pmu = event->pmu;
8b6a3fe8 6282 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
95ff4ca2
AS
6283 struct remote_output ro = {
6284 .rb = event->rb,
6285 };
6286
6287 rcu_read_lock();
aab5b71e 6288 perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
95ff4ca2 6289 if (cpuctx->task_ctx)
aab5b71e 6290 perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
b73e4fef 6291 &ro, false);
95ff4ca2
AS
6292 rcu_read_unlock();
6293
6294 return ro.err;
6295}
6296
6297static void perf_pmu_output_stop(struct perf_event *event)
6298{
6299 struct perf_event *iter;
6300 int err, cpu;
6301
6302restart:
6303 rcu_read_lock();
6304 list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
6305 /*
6306 * For per-CPU events, we need to make sure that neither they
6307 * nor their children are running; for cpu==-1 events it's
6308 * sufficient to stop the event itself if it's active, since
6309 * it can't have children.
6310 */
6311 cpu = iter->cpu;
6312 if (cpu == -1)
6313 cpu = READ_ONCE(iter->oncpu);
6314
6315 if (cpu == -1)
6316 continue;
6317
6318 err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
6319 if (err == -EAGAIN) {
6320 rcu_read_unlock();
6321 goto restart;
6322 }
6323 }
6324 rcu_read_unlock();
52d857a8
JO
6325}
6326
60313ebe 6327/*
9f498cc5
PZ
6328 * task tracking -- fork/exit
6329 *
13d7a241 6330 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
60313ebe
PZ
6331 */
6332
9f498cc5 6333struct perf_task_event {
3a80b4a3 6334 struct task_struct *task;
cdd6c482 6335 struct perf_event_context *task_ctx;
60313ebe
PZ
6336
6337 struct {
6338 struct perf_event_header header;
6339
6340 u32 pid;
6341 u32 ppid;
9f498cc5
PZ
6342 u32 tid;
6343 u32 ptid;
393b2ad8 6344 u64 time;
cdd6c482 6345 } event_id;
60313ebe
PZ
6346};
6347
67516844
JO
6348static int perf_event_task_match(struct perf_event *event)
6349{
13d7a241
SE
6350 return event->attr.comm || event->attr.mmap ||
6351 event->attr.mmap2 || event->attr.mmap_data ||
6352 event->attr.task;
67516844
JO
6353}
6354
cdd6c482 6355static void perf_event_task_output(struct perf_event *event,
52d857a8 6356 void *data)
60313ebe 6357{
52d857a8 6358 struct perf_task_event *task_event = data;
60313ebe 6359 struct perf_output_handle handle;
c980d109 6360 struct perf_sample_data sample;
9f498cc5 6361 struct task_struct *task = task_event->task;
c980d109 6362 int ret, size = task_event->event_id.header.size;
8bb39f9a 6363
67516844
JO
6364 if (!perf_event_task_match(event))
6365 return;
6366
c980d109 6367 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
60313ebe 6368
c980d109 6369 ret = perf_output_begin(&handle, event,
a7ac67ea 6370 task_event->event_id.header.size);
ef60777c 6371 if (ret)
c980d109 6372 goto out;
60313ebe 6373
cdd6c482
IM
6374 task_event->event_id.pid = perf_event_pid(event, task);
6375 task_event->event_id.ppid = perf_event_pid(event, current);
60313ebe 6376
cdd6c482
IM
6377 task_event->event_id.tid = perf_event_tid(event, task);
6378 task_event->event_id.ptid = perf_event_tid(event, current);
9f498cc5 6379
34f43927
PZ
6380 task_event->event_id.time = perf_event_clock(event);
6381
cdd6c482 6382 perf_output_put(&handle, task_event->event_id);
393b2ad8 6383
c980d109
ACM
6384 perf_event__output_id_sample(event, &handle, &sample);
6385
60313ebe 6386 perf_output_end(&handle);
c980d109
ACM
6387out:
6388 task_event->event_id.header.size = size;
60313ebe
PZ
6389}
6390
cdd6c482
IM
6391static void perf_event_task(struct task_struct *task,
6392 struct perf_event_context *task_ctx,
3a80b4a3 6393 int new)
60313ebe 6394{
9f498cc5 6395 struct perf_task_event task_event;
60313ebe 6396
cdd6c482
IM
6397 if (!atomic_read(&nr_comm_events) &&
6398 !atomic_read(&nr_mmap_events) &&
6399 !atomic_read(&nr_task_events))
60313ebe
PZ
6400 return;
6401
9f498cc5 6402 task_event = (struct perf_task_event){
3a80b4a3
PZ
6403 .task = task,
6404 .task_ctx = task_ctx,
cdd6c482 6405 .event_id = {
60313ebe 6406 .header = {
cdd6c482 6407 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
573402db 6408 .misc = 0,
cdd6c482 6409 .size = sizeof(task_event.event_id),
60313ebe 6410 },
573402db
PZ
6411 /* .pid */
6412 /* .ppid */
9f498cc5
PZ
6413 /* .tid */
6414 /* .ptid */
34f43927 6415 /* .time */
60313ebe
PZ
6416 },
6417 };
6418
aab5b71e 6419 perf_iterate_sb(perf_event_task_output,
52d857a8
JO
6420 &task_event,
6421 task_ctx);
9f498cc5
PZ
6422}
6423
cdd6c482 6424void perf_event_fork(struct task_struct *task)
9f498cc5 6425{
cdd6c482 6426 perf_event_task(task, NULL, 1);
60313ebe
PZ
6427}
6428
8d1b2d93
PZ
6429/*
6430 * comm tracking
6431 */
6432
6433struct perf_comm_event {
22a4f650
IM
6434 struct task_struct *task;
6435 char *comm;
8d1b2d93
PZ
6436 int comm_size;
6437
6438 struct {
6439 struct perf_event_header header;
6440
6441 u32 pid;
6442 u32 tid;
cdd6c482 6443 } event_id;
8d1b2d93
PZ
6444};
6445
67516844
JO
6446static int perf_event_comm_match(struct perf_event *event)
6447{
6448 return event->attr.comm;
6449}
6450
cdd6c482 6451static void perf_event_comm_output(struct perf_event *event,
52d857a8 6452 void *data)
8d1b2d93 6453{
52d857a8 6454 struct perf_comm_event *comm_event = data;
8d1b2d93 6455 struct perf_output_handle handle;
c980d109 6456 struct perf_sample_data sample;
cdd6c482 6457 int size = comm_event->event_id.header.size;
c980d109
ACM
6458 int ret;
6459
67516844
JO
6460 if (!perf_event_comm_match(event))
6461 return;
6462
c980d109
ACM
6463 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
6464 ret = perf_output_begin(&handle, event,
a7ac67ea 6465 comm_event->event_id.header.size);
8d1b2d93
PZ
6466
6467 if (ret)
c980d109 6468 goto out;
8d1b2d93 6469
cdd6c482
IM
6470 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
6471 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
709e50cf 6472
cdd6c482 6473 perf_output_put(&handle, comm_event->event_id);
76369139 6474 __output_copy(&handle, comm_event->comm,
8d1b2d93 6475 comm_event->comm_size);
c980d109
ACM
6476
6477 perf_event__output_id_sample(event, &handle, &sample);
6478
8d1b2d93 6479 perf_output_end(&handle);
c980d109
ACM
6480out:
6481 comm_event->event_id.header.size = size;
8d1b2d93
PZ
6482}
6483
cdd6c482 6484static void perf_event_comm_event(struct perf_comm_event *comm_event)
8d1b2d93 6485{
413ee3b4 6486 char comm[TASK_COMM_LEN];
8d1b2d93 6487 unsigned int size;
8d1b2d93 6488
413ee3b4 6489 memset(comm, 0, sizeof(comm));
96b02d78 6490 strlcpy(comm, comm_event->task->comm, sizeof(comm));
888fcee0 6491 size = ALIGN(strlen(comm)+1, sizeof(u64));
8d1b2d93
PZ
6492
6493 comm_event->comm = comm;
6494 comm_event->comm_size = size;
6495
cdd6c482 6496 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
8dc85d54 6497
aab5b71e 6498 perf_iterate_sb(perf_event_comm_output,
52d857a8
JO
6499 comm_event,
6500 NULL);
8d1b2d93
PZ
6501}
6502
82b89778 6503void perf_event_comm(struct task_struct *task, bool exec)
8d1b2d93 6504{
9ee318a7
PZ
6505 struct perf_comm_event comm_event;
6506
cdd6c482 6507 if (!atomic_read(&nr_comm_events))
9ee318a7 6508 return;
a63eaf34 6509
9ee318a7 6510 comm_event = (struct perf_comm_event){
8d1b2d93 6511 .task = task,
573402db
PZ
6512 /* .comm */
6513 /* .comm_size */
cdd6c482 6514 .event_id = {
573402db 6515 .header = {
cdd6c482 6516 .type = PERF_RECORD_COMM,
82b89778 6517 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
573402db
PZ
6518 /* .size */
6519 },
6520 /* .pid */
6521 /* .tid */
8d1b2d93
PZ
6522 },
6523 };
6524
cdd6c482 6525 perf_event_comm_event(&comm_event);
8d1b2d93
PZ
6526}
6527
0a4a9391
PZ
6528/*
6529 * mmap tracking
6530 */
6531
6532struct perf_mmap_event {
089dd79d
PZ
6533 struct vm_area_struct *vma;
6534
6535 const char *file_name;
6536 int file_size;
13d7a241
SE
6537 int maj, min;
6538 u64 ino;
6539 u64 ino_generation;
f972eb63 6540 u32 prot, flags;
0a4a9391
PZ
6541
6542 struct {
6543 struct perf_event_header header;
6544
6545 u32 pid;
6546 u32 tid;
6547 u64 start;
6548 u64 len;
6549 u64 pgoff;
cdd6c482 6550 } event_id;
0a4a9391
PZ
6551};
6552
67516844
JO
6553static int perf_event_mmap_match(struct perf_event *event,
6554 void *data)
6555{
6556 struct perf_mmap_event *mmap_event = data;
6557 struct vm_area_struct *vma = mmap_event->vma;
6558 int executable = vma->vm_flags & VM_EXEC;
6559
6560 return (!executable && event->attr.mmap_data) ||
13d7a241 6561 (executable && (event->attr.mmap || event->attr.mmap2));
67516844
JO
6562}
6563
cdd6c482 6564static void perf_event_mmap_output(struct perf_event *event,
52d857a8 6565 void *data)
0a4a9391 6566{
52d857a8 6567 struct perf_mmap_event *mmap_event = data;
0a4a9391 6568 struct perf_output_handle handle;
c980d109 6569 struct perf_sample_data sample;
cdd6c482 6570 int size = mmap_event->event_id.header.size;
c980d109 6571 int ret;
0a4a9391 6572
67516844
JO
6573 if (!perf_event_mmap_match(event, data))
6574 return;
6575
13d7a241
SE
6576 if (event->attr.mmap2) {
6577 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
6578 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
6579 mmap_event->event_id.header.size += sizeof(mmap_event->min);
6580 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
d008d525 6581 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
f972eb63
PZ
6582 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
6583 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
13d7a241
SE
6584 }
6585
c980d109
ACM
6586 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
6587 ret = perf_output_begin(&handle, event,
a7ac67ea 6588 mmap_event->event_id.header.size);
0a4a9391 6589 if (ret)
c980d109 6590 goto out;
0a4a9391 6591
cdd6c482
IM
6592 mmap_event->event_id.pid = perf_event_pid(event, current);
6593 mmap_event->event_id.tid = perf_event_tid(event, current);
709e50cf 6594
cdd6c482 6595 perf_output_put(&handle, mmap_event->event_id);
13d7a241
SE
6596
6597 if (event->attr.mmap2) {
6598 perf_output_put(&handle, mmap_event->maj);
6599 perf_output_put(&handle, mmap_event->min);
6600 perf_output_put(&handle, mmap_event->ino);
6601 perf_output_put(&handle, mmap_event->ino_generation);
f972eb63
PZ
6602 perf_output_put(&handle, mmap_event->prot);
6603 perf_output_put(&handle, mmap_event->flags);
13d7a241
SE
6604 }
6605
76369139 6606 __output_copy(&handle, mmap_event->file_name,
0a4a9391 6607 mmap_event->file_size);
c980d109
ACM
6608
6609 perf_event__output_id_sample(event, &handle, &sample);
6610
78d613eb 6611 perf_output_end(&handle);
c980d109
ACM
6612out:
6613 mmap_event->event_id.header.size = size;
0a4a9391
PZ
6614}
6615
cdd6c482 6616static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
0a4a9391 6617{
089dd79d
PZ
6618 struct vm_area_struct *vma = mmap_event->vma;
6619 struct file *file = vma->vm_file;
13d7a241
SE
6620 int maj = 0, min = 0;
6621 u64 ino = 0, gen = 0;
f972eb63 6622 u32 prot = 0, flags = 0;
0a4a9391
PZ
6623 unsigned int size;
6624 char tmp[16];
6625 char *buf = NULL;
2c42cfbf 6626 char *name;
413ee3b4 6627
0b3589be
PZ
6628 if (vma->vm_flags & VM_READ)
6629 prot |= PROT_READ;
6630 if (vma->vm_flags & VM_WRITE)
6631 prot |= PROT_WRITE;
6632 if (vma->vm_flags & VM_EXEC)
6633 prot |= PROT_EXEC;
6634
6635 if (vma->vm_flags & VM_MAYSHARE)
6636 flags = MAP_SHARED;
6637 else
6638 flags = MAP_PRIVATE;
6639
6640 if (vma->vm_flags & VM_DENYWRITE)
6641 flags |= MAP_DENYWRITE;
6642 if (vma->vm_flags & VM_MAYEXEC)
6643 flags |= MAP_EXECUTABLE;
6644 if (vma->vm_flags & VM_LOCKED)
6645 flags |= MAP_LOCKED;
6646 if (vma->vm_flags & VM_HUGETLB)
6647 flags |= MAP_HUGETLB;
6648
0a4a9391 6649 if (file) {
13d7a241
SE
6650 struct inode *inode;
6651 dev_t dev;
3ea2f2b9 6652
2c42cfbf 6653 buf = kmalloc(PATH_MAX, GFP_KERNEL);
0a4a9391 6654 if (!buf) {
c7e548b4
ON
6655 name = "//enomem";
6656 goto cpy_name;
0a4a9391 6657 }
413ee3b4 6658 /*
3ea2f2b9 6659 * d_path() works from the end of the rb backwards, so we
413ee3b4
AB
6660 * need to add enough zero bytes after the string to handle
6661 * the 64bit alignment we do later.
6662 */
9bf39ab2 6663 name = file_path(file, buf, PATH_MAX - sizeof(u64));
0a4a9391 6664 if (IS_ERR(name)) {
c7e548b4
ON
6665 name = "//toolong";
6666 goto cpy_name;
0a4a9391 6667 }
13d7a241
SE
6668 inode = file_inode(vma->vm_file);
6669 dev = inode->i_sb->s_dev;
6670 ino = inode->i_ino;
6671 gen = inode->i_generation;
6672 maj = MAJOR(dev);
6673 min = MINOR(dev);
f972eb63 6674
c7e548b4 6675 goto got_name;
0a4a9391 6676 } else {
fbe26abe
JO
6677 if (vma->vm_ops && vma->vm_ops->name) {
6678 name = (char *) vma->vm_ops->name(vma);
6679 if (name)
6680 goto cpy_name;
6681 }
6682
2c42cfbf 6683 name = (char *)arch_vma_name(vma);
c7e548b4
ON
6684 if (name)
6685 goto cpy_name;
089dd79d 6686
32c5fb7e 6687 if (vma->vm_start <= vma->vm_mm->start_brk &&
3af9e859 6688 vma->vm_end >= vma->vm_mm->brk) {
c7e548b4
ON
6689 name = "[heap]";
6690 goto cpy_name;
32c5fb7e
ON
6691 }
6692 if (vma->vm_start <= vma->vm_mm->start_stack &&
3af9e859 6693 vma->vm_end >= vma->vm_mm->start_stack) {
c7e548b4
ON
6694 name = "[stack]";
6695 goto cpy_name;
089dd79d
PZ
6696 }
6697
c7e548b4
ON
6698 name = "//anon";
6699 goto cpy_name;
0a4a9391
PZ
6700 }
6701
c7e548b4
ON
6702cpy_name:
6703 strlcpy(tmp, name, sizeof(tmp));
6704 name = tmp;
0a4a9391 6705got_name:
2c42cfbf
PZ
6706 /*
6707 * Since our buffer works in 8 byte units we need to align our string
6708 * size to a multiple of 8. However, we must guarantee the tail end is
6709 * zero'd out to avoid leaking random bits to userspace.
6710 */
6711 size = strlen(name)+1;
6712 while (!IS_ALIGNED(size, sizeof(u64)))
6713 name[size++] = '\0';
0a4a9391
PZ
6714
6715 mmap_event->file_name = name;
6716 mmap_event->file_size = size;
13d7a241
SE
6717 mmap_event->maj = maj;
6718 mmap_event->min = min;
6719 mmap_event->ino = ino;
6720 mmap_event->ino_generation = gen;
f972eb63
PZ
6721 mmap_event->prot = prot;
6722 mmap_event->flags = flags;
0a4a9391 6723
2fe85427
SE
6724 if (!(vma->vm_flags & VM_EXEC))
6725 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
6726
cdd6c482 6727 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
0a4a9391 6728
aab5b71e 6729 perf_iterate_sb(perf_event_mmap_output,
52d857a8
JO
6730 mmap_event,
6731 NULL);
665c2142 6732
0a4a9391
PZ
6733 kfree(buf);
6734}
6735
375637bc
AS
6736/*
6737 * Check whether inode and address range match filter criteria.
6738 */
6739static bool perf_addr_filter_match(struct perf_addr_filter *filter,
6740 struct file *file, unsigned long offset,
6741 unsigned long size)
6742{
45063097 6743 if (filter->inode != file_inode(file))
375637bc
AS
6744 return false;
6745
6746 if (filter->offset > offset + size)
6747 return false;
6748
6749 if (filter->offset + filter->size < offset)
6750 return false;
6751
6752 return true;
6753}
6754
6755static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
6756{
6757 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
6758 struct vm_area_struct *vma = data;
6759 unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags;
6760 struct file *file = vma->vm_file;
6761 struct perf_addr_filter *filter;
6762 unsigned int restart = 0, count = 0;
6763
6764 if (!has_addr_filter(event))
6765 return;
6766
6767 if (!file)
6768 return;
6769
6770 raw_spin_lock_irqsave(&ifh->lock, flags);
6771 list_for_each_entry(filter, &ifh->list, entry) {
6772 if (perf_addr_filter_match(filter, file, off,
6773 vma->vm_end - vma->vm_start)) {
6774 event->addr_filters_offs[count] = vma->vm_start;
6775 restart++;
6776 }
6777
6778 count++;
6779 }
6780
6781 if (restart)
6782 event->addr_filters_gen++;
6783 raw_spin_unlock_irqrestore(&ifh->lock, flags);
6784
6785 if (restart)
767ae086 6786 perf_event_stop(event, 1);
375637bc
AS
6787}
6788
6789/*
6790 * Adjust all task's events' filters to the new vma
6791 */
6792static void perf_addr_filters_adjust(struct vm_area_struct *vma)
6793{
6794 struct perf_event_context *ctx;
6795 int ctxn;
6796
12b40a23
MP
6797 /*
6798 * Data tracing isn't supported yet and as such there is no need
6799 * to keep track of anything that isn't related to executable code:
6800 */
6801 if (!(vma->vm_flags & VM_EXEC))
6802 return;
6803
375637bc
AS
6804 rcu_read_lock();
6805 for_each_task_context_nr(ctxn) {
6806 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
6807 if (!ctx)
6808 continue;
6809
aab5b71e 6810 perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
375637bc
AS
6811 }
6812 rcu_read_unlock();
6813}
6814
3af9e859 6815void perf_event_mmap(struct vm_area_struct *vma)
0a4a9391 6816{
9ee318a7
PZ
6817 struct perf_mmap_event mmap_event;
6818
cdd6c482 6819 if (!atomic_read(&nr_mmap_events))
9ee318a7
PZ
6820 return;
6821
6822 mmap_event = (struct perf_mmap_event){
089dd79d 6823 .vma = vma,
573402db
PZ
6824 /* .file_name */
6825 /* .file_size */
cdd6c482 6826 .event_id = {
573402db 6827 .header = {
cdd6c482 6828 .type = PERF_RECORD_MMAP,
39447b38 6829 .misc = PERF_RECORD_MISC_USER,
573402db
PZ
6830 /* .size */
6831 },
6832 /* .pid */
6833 /* .tid */
089dd79d
PZ
6834 .start = vma->vm_start,
6835 .len = vma->vm_end - vma->vm_start,
3a0304e9 6836 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
0a4a9391 6837 },
13d7a241
SE
6838 /* .maj (attr_mmap2 only) */
6839 /* .min (attr_mmap2 only) */
6840 /* .ino (attr_mmap2 only) */
6841 /* .ino_generation (attr_mmap2 only) */
f972eb63
PZ
6842 /* .prot (attr_mmap2 only) */
6843 /* .flags (attr_mmap2 only) */
0a4a9391
PZ
6844 };
6845
375637bc 6846 perf_addr_filters_adjust(vma);
cdd6c482 6847 perf_event_mmap_event(&mmap_event);
0a4a9391
PZ
6848}
6849
68db7e98
AS
6850void perf_event_aux_event(struct perf_event *event, unsigned long head,
6851 unsigned long size, u64 flags)
6852{
6853 struct perf_output_handle handle;
6854 struct perf_sample_data sample;
6855 struct perf_aux_event {
6856 struct perf_event_header header;
6857 u64 offset;
6858 u64 size;
6859 u64 flags;
6860 } rec = {
6861 .header = {
6862 .type = PERF_RECORD_AUX,
6863 .misc = 0,
6864 .size = sizeof(rec),
6865 },
6866 .offset = head,
6867 .size = size,
6868 .flags = flags,
6869 };
6870 int ret;
6871
6872 perf_event_header__init_id(&rec.header, &sample, event);
6873 ret = perf_output_begin(&handle, event, rec.header.size);
6874
6875 if (ret)
6876 return;
6877
6878 perf_output_put(&handle, rec);
6879 perf_event__output_id_sample(event, &handle, &sample);
6880
6881 perf_output_end(&handle);
6882}
6883
f38b0dbb
KL
6884/*
6885 * Lost/dropped samples logging
6886 */
6887void perf_log_lost_samples(struct perf_event *event, u64 lost)
6888{
6889 struct perf_output_handle handle;
6890 struct perf_sample_data sample;
6891 int ret;
6892
6893 struct {
6894 struct perf_event_header header;
6895 u64 lost;
6896 } lost_samples_event = {
6897 .header = {
6898 .type = PERF_RECORD_LOST_SAMPLES,
6899 .misc = 0,
6900 .size = sizeof(lost_samples_event),
6901 },
6902 .lost = lost,
6903 };
6904
6905 perf_event_header__init_id(&lost_samples_event.header, &sample, event);
6906
6907 ret = perf_output_begin(&handle, event,
6908 lost_samples_event.header.size);
6909 if (ret)
6910 return;
6911
6912 perf_output_put(&handle, lost_samples_event);
6913 perf_event__output_id_sample(event, &handle, &sample);
6914 perf_output_end(&handle);
6915}
6916
45ac1403
AH
6917/*
6918 * context_switch tracking
6919 */
6920
6921struct perf_switch_event {
6922 struct task_struct *task;
6923 struct task_struct *next_prev;
6924
6925 struct {
6926 struct perf_event_header header;
6927 u32 next_prev_pid;
6928 u32 next_prev_tid;
6929 } event_id;
6930};
6931
6932static int perf_event_switch_match(struct perf_event *event)
6933{
6934 return event->attr.context_switch;
6935}
6936
6937static void perf_event_switch_output(struct perf_event *event, void *data)
6938{
6939 struct perf_switch_event *se = data;
6940 struct perf_output_handle handle;
6941 struct perf_sample_data sample;
6942 int ret;
6943
6944 if (!perf_event_switch_match(event))
6945 return;
6946
6947 /* Only CPU-wide events are allowed to see next/prev pid/tid */
6948 if (event->ctx->task) {
6949 se->event_id.header.type = PERF_RECORD_SWITCH;
6950 se->event_id.header.size = sizeof(se->event_id.header);
6951 } else {
6952 se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
6953 se->event_id.header.size = sizeof(se->event_id);
6954 se->event_id.next_prev_pid =
6955 perf_event_pid(event, se->next_prev);
6956 se->event_id.next_prev_tid =
6957 perf_event_tid(event, se->next_prev);
6958 }
6959
6960 perf_event_header__init_id(&se->event_id.header, &sample, event);
6961
6962 ret = perf_output_begin(&handle, event, se->event_id.header.size);
6963 if (ret)
6964 return;
6965
6966 if (event->ctx->task)
6967 perf_output_put(&handle, se->event_id.header);
6968 else
6969 perf_output_put(&handle, se->event_id);
6970
6971 perf_event__output_id_sample(event, &handle, &sample);
6972
6973 perf_output_end(&handle);
6974}
6975
6976static void perf_event_switch(struct task_struct *task,
6977 struct task_struct *next_prev, bool sched_in)
6978{
6979 struct perf_switch_event switch_event;
6980
6981 /* N.B. caller checks nr_switch_events != 0 */
6982
6983 switch_event = (struct perf_switch_event){
6984 .task = task,
6985 .next_prev = next_prev,
6986 .event_id = {
6987 .header = {
6988 /* .type */
6989 .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
6990 /* .size */
6991 },
6992 /* .next_prev_pid */
6993 /* .next_prev_tid */
6994 },
6995 };
6996
aab5b71e 6997 perf_iterate_sb(perf_event_switch_output,
45ac1403
AH
6998 &switch_event,
6999 NULL);
7000}
7001
a78ac325
PZ
7002/*
7003 * IRQ throttle logging
7004 */
7005
cdd6c482 7006static void perf_log_throttle(struct perf_event *event, int enable)
a78ac325
PZ
7007{
7008 struct perf_output_handle handle;
c980d109 7009 struct perf_sample_data sample;
a78ac325
PZ
7010 int ret;
7011
7012 struct {
7013 struct perf_event_header header;
7014 u64 time;
cca3f454 7015 u64 id;
7f453c24 7016 u64 stream_id;
a78ac325
PZ
7017 } throttle_event = {
7018 .header = {
cdd6c482 7019 .type = PERF_RECORD_THROTTLE,
a78ac325
PZ
7020 .misc = 0,
7021 .size = sizeof(throttle_event),
7022 },
34f43927 7023 .time = perf_event_clock(event),
cdd6c482
IM
7024 .id = primary_event_id(event),
7025 .stream_id = event->id,
a78ac325
PZ
7026 };
7027
966ee4d6 7028 if (enable)
cdd6c482 7029 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
966ee4d6 7030
c980d109
ACM
7031 perf_event_header__init_id(&throttle_event.header, &sample, event);
7032
7033 ret = perf_output_begin(&handle, event,
a7ac67ea 7034 throttle_event.header.size);
a78ac325
PZ
7035 if (ret)
7036 return;
7037
7038 perf_output_put(&handle, throttle_event);
c980d109 7039 perf_event__output_id_sample(event, &handle, &sample);
a78ac325
PZ
7040 perf_output_end(&handle);
7041}
7042
ec0d7729
AS
7043static void perf_log_itrace_start(struct perf_event *event)
7044{
7045 struct perf_output_handle handle;
7046 struct perf_sample_data sample;
7047 struct perf_aux_event {
7048 struct perf_event_header header;
7049 u32 pid;
7050 u32 tid;
7051 } rec;
7052 int ret;
7053
7054 if (event->parent)
7055 event = event->parent;
7056
7057 if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
7058 event->hw.itrace_started)
7059 return;
7060
ec0d7729
AS
7061 rec.header.type = PERF_RECORD_ITRACE_START;
7062 rec.header.misc = 0;
7063 rec.header.size = sizeof(rec);
7064 rec.pid = perf_event_pid(event, current);
7065 rec.tid = perf_event_tid(event, current);
7066
7067 perf_event_header__init_id(&rec.header, &sample, event);
7068 ret = perf_output_begin(&handle, event, rec.header.size);
7069
7070 if (ret)
7071 return;
7072
7073 perf_output_put(&handle, rec);
7074 perf_event__output_id_sample(event, &handle, &sample);
7075
7076 perf_output_end(&handle);
7077}
7078
475113d9
JO
7079static int
7080__perf_event_account_interrupt(struct perf_event *event, int throttle)
f6c7d5fe 7081{
cdd6c482 7082 struct hw_perf_event *hwc = &event->hw;
79f14641 7083 int ret = 0;
475113d9 7084 u64 seq;
96398826 7085
e050e3f0
SE
7086 seq = __this_cpu_read(perf_throttled_seq);
7087 if (seq != hwc->interrupts_seq) {
7088 hwc->interrupts_seq = seq;
7089 hwc->interrupts = 1;
7090 } else {
7091 hwc->interrupts++;
7092 if (unlikely(throttle
7093 && hwc->interrupts >= max_samples_per_tick)) {
7094 __this_cpu_inc(perf_throttled_count);
555e0c1e 7095 tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
163ec435
PZ
7096 hwc->interrupts = MAX_INTERRUPTS;
7097 perf_log_throttle(event, 0);
a78ac325
PZ
7098 ret = 1;
7099 }
e050e3f0 7100 }
60db5e09 7101
cdd6c482 7102 if (event->attr.freq) {
def0a9b2 7103 u64 now = perf_clock();
abd50713 7104 s64 delta = now - hwc->freq_time_stamp;
bd2b5b12 7105
abd50713 7106 hwc->freq_time_stamp = now;
bd2b5b12 7107
abd50713 7108 if (delta > 0 && delta < 2*TICK_NSEC)
f39d47ff 7109 perf_adjust_period(event, delta, hwc->last_period, true);
bd2b5b12
PZ
7110 }
7111
475113d9
JO
7112 return ret;
7113}
7114
7115int perf_event_account_interrupt(struct perf_event *event)
7116{
7117 return __perf_event_account_interrupt(event, 1);
7118}
7119
7120/*
7121 * Generic event overflow handling, sampling.
7122 */
7123
7124static int __perf_event_overflow(struct perf_event *event,
7125 int throttle, struct perf_sample_data *data,
7126 struct pt_regs *regs)
7127{
7128 int events = atomic_read(&event->event_limit);
7129 int ret = 0;
7130
7131 /*
7132 * Non-sampling counters might still use the PMI to fold short
7133 * hardware counters, ignore those.
7134 */
7135 if (unlikely(!is_sampling_event(event)))
7136 return 0;
7137
7138 ret = __perf_event_account_interrupt(event, throttle);
7139
2023b359
PZ
7140 /*
7141 * XXX event_limit might not quite work as expected on inherited
cdd6c482 7142 * events
2023b359
PZ
7143 */
7144
cdd6c482
IM
7145 event->pending_kill = POLL_IN;
7146 if (events && atomic_dec_and_test(&event->event_limit)) {
79f14641 7147 ret = 1;
cdd6c482 7148 event->pending_kill = POLL_HUP;
5aab90ce
JO
7149
7150 perf_event_disable_inatomic(event);
79f14641
PZ
7151 }
7152
aa6a5f3c 7153 READ_ONCE(event->overflow_handler)(event, data, regs);
453f19ee 7154
fed66e2c 7155 if (*perf_event_fasync(event) && event->pending_kill) {
a8b0ca17
PZ
7156 event->pending_wakeup = 1;
7157 irq_work_queue(&event->pending);
f506b3dc
PZ
7158 }
7159
79f14641 7160 return ret;
f6c7d5fe
PZ
7161}
7162
a8b0ca17 7163int perf_event_overflow(struct perf_event *event,
5622f295
MM
7164 struct perf_sample_data *data,
7165 struct pt_regs *regs)
850bc73f 7166{
a8b0ca17 7167 return __perf_event_overflow(event, 1, data, regs);
850bc73f
PZ
7168}
7169
15dbf27c 7170/*
cdd6c482 7171 * Generic software event infrastructure
15dbf27c
PZ
7172 */
7173
b28ab83c
PZ
7174struct swevent_htable {
7175 struct swevent_hlist *swevent_hlist;
7176 struct mutex hlist_mutex;
7177 int hlist_refcount;
7178
7179 /* Recursion avoidance in each contexts */
7180 int recursion[PERF_NR_CONTEXTS];
7181};
7182
7183static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
7184
7b4b6658 7185/*
cdd6c482
IM
7186 * We directly increment event->count and keep a second value in
7187 * event->hw.period_left to count intervals. This period event
7b4b6658
PZ
7188 * is kept in the range [-sample_period, 0] so that we can use the
7189 * sign as trigger.
7190 */
7191
ab573844 7192u64 perf_swevent_set_period(struct perf_event *event)
15dbf27c 7193{
cdd6c482 7194 struct hw_perf_event *hwc = &event->hw;
7b4b6658
PZ
7195 u64 period = hwc->last_period;
7196 u64 nr, offset;
7197 s64 old, val;
7198
7199 hwc->last_period = hwc->sample_period;
15dbf27c
PZ
7200
7201again:
e7850595 7202 old = val = local64_read(&hwc->period_left);
7b4b6658
PZ
7203 if (val < 0)
7204 return 0;
15dbf27c 7205
7b4b6658
PZ
7206 nr = div64_u64(period + val, period);
7207 offset = nr * period;
7208 val -= offset;
e7850595 7209 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
7b4b6658 7210 goto again;
15dbf27c 7211
7b4b6658 7212 return nr;
15dbf27c
PZ
7213}
7214
0cff784a 7215static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
a8b0ca17 7216 struct perf_sample_data *data,
5622f295 7217 struct pt_regs *regs)
15dbf27c 7218{
cdd6c482 7219 struct hw_perf_event *hwc = &event->hw;
850bc73f 7220 int throttle = 0;
15dbf27c 7221
0cff784a
PZ
7222 if (!overflow)
7223 overflow = perf_swevent_set_period(event);
15dbf27c 7224
7b4b6658
PZ
7225 if (hwc->interrupts == MAX_INTERRUPTS)
7226 return;
15dbf27c 7227
7b4b6658 7228 for (; overflow; overflow--) {
a8b0ca17 7229 if (__perf_event_overflow(event, throttle,
5622f295 7230 data, regs)) {
7b4b6658
PZ
7231 /*
7232 * We inhibit the overflow from happening when
7233 * hwc->interrupts == MAX_INTERRUPTS.
7234 */
7235 break;
7236 }
cf450a73 7237 throttle = 1;
7b4b6658 7238 }
15dbf27c
PZ
7239}
7240
a4eaf7f1 7241static void perf_swevent_event(struct perf_event *event, u64 nr,
a8b0ca17 7242 struct perf_sample_data *data,
5622f295 7243 struct pt_regs *regs)
7b4b6658 7244{
cdd6c482 7245 struct hw_perf_event *hwc = &event->hw;
d6d020e9 7246
e7850595 7247 local64_add(nr, &event->count);
d6d020e9 7248
0cff784a
PZ
7249 if (!regs)
7250 return;
7251
6c7e550f 7252 if (!is_sampling_event(event))
7b4b6658 7253 return;
d6d020e9 7254
5d81e5cf
AV
7255 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
7256 data->period = nr;
7257 return perf_swevent_overflow(event, 1, data, regs);
7258 } else
7259 data->period = event->hw.last_period;
7260
0cff784a 7261 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
a8b0ca17 7262 return perf_swevent_overflow(event, 1, data, regs);
0cff784a 7263
e7850595 7264 if (local64_add_negative(nr, &hwc->period_left))
7b4b6658 7265 return;
df1a132b 7266
a8b0ca17 7267 perf_swevent_overflow(event, 0, data, regs);
d6d020e9
PZ
7268}
7269
f5ffe02e
FW
7270static int perf_exclude_event(struct perf_event *event,
7271 struct pt_regs *regs)
7272{
a4eaf7f1 7273 if (event->hw.state & PERF_HES_STOPPED)
91b2f482 7274 return 1;
a4eaf7f1 7275
f5ffe02e
FW
7276 if (regs) {
7277 if (event->attr.exclude_user && user_mode(regs))
7278 return 1;
7279
7280 if (event->attr.exclude_kernel && !user_mode(regs))
7281 return 1;
7282 }
7283
7284 return 0;
7285}
7286
cdd6c482 7287static int perf_swevent_match(struct perf_event *event,
1c432d89 7288 enum perf_type_id type,
6fb2915d
LZ
7289 u32 event_id,
7290 struct perf_sample_data *data,
7291 struct pt_regs *regs)
15dbf27c 7292{
cdd6c482 7293 if (event->attr.type != type)
a21ca2ca 7294 return 0;
f5ffe02e 7295
cdd6c482 7296 if (event->attr.config != event_id)
15dbf27c
PZ
7297 return 0;
7298
f5ffe02e
FW
7299 if (perf_exclude_event(event, regs))
7300 return 0;
15dbf27c
PZ
7301
7302 return 1;
7303}
7304
76e1d904
FW
7305static inline u64 swevent_hash(u64 type, u32 event_id)
7306{
7307 u64 val = event_id | (type << 32);
7308
7309 return hash_64(val, SWEVENT_HLIST_BITS);
7310}
7311
49f135ed
FW
7312static inline struct hlist_head *
7313__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
76e1d904 7314{
49f135ed
FW
7315 u64 hash = swevent_hash(type, event_id);
7316
7317 return &hlist->heads[hash];
7318}
76e1d904 7319
49f135ed
FW
7320/* For the read side: events when they trigger */
7321static inline struct hlist_head *
b28ab83c 7322find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
49f135ed
FW
7323{
7324 struct swevent_hlist *hlist;
76e1d904 7325
b28ab83c 7326 hlist = rcu_dereference(swhash->swevent_hlist);
76e1d904
FW
7327 if (!hlist)
7328 return NULL;
7329
49f135ed
FW
7330 return __find_swevent_head(hlist, type, event_id);
7331}
7332
7333/* For the event head insertion and removal in the hlist */
7334static inline struct hlist_head *
b28ab83c 7335find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
49f135ed
FW
7336{
7337 struct swevent_hlist *hlist;
7338 u32 event_id = event->attr.config;
7339 u64 type = event->attr.type;
7340
7341 /*
7342 * Event scheduling is always serialized against hlist allocation
7343 * and release. Which makes the protected version suitable here.
7344 * The context lock guarantees that.
7345 */
b28ab83c 7346 hlist = rcu_dereference_protected(swhash->swevent_hlist,
49f135ed
FW
7347 lockdep_is_held(&event->ctx->lock));
7348 if (!hlist)
7349 return NULL;
7350
7351 return __find_swevent_head(hlist, type, event_id);
76e1d904
FW
7352}
7353
7354static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
a8b0ca17 7355 u64 nr,
76e1d904
FW
7356 struct perf_sample_data *data,
7357 struct pt_regs *regs)
15dbf27c 7358{
4a32fea9 7359 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
cdd6c482 7360 struct perf_event *event;
76e1d904 7361 struct hlist_head *head;
15dbf27c 7362
76e1d904 7363 rcu_read_lock();
b28ab83c 7364 head = find_swevent_head_rcu(swhash, type, event_id);
76e1d904
FW
7365 if (!head)
7366 goto end;
7367
b67bfe0d 7368 hlist_for_each_entry_rcu(event, head, hlist_entry) {
6fb2915d 7369 if (perf_swevent_match(event, type, event_id, data, regs))
a8b0ca17 7370 perf_swevent_event(event, nr, data, regs);
15dbf27c 7371 }
76e1d904
FW
7372end:
7373 rcu_read_unlock();
15dbf27c
PZ
7374}
7375
86038c5e
PZI
7376DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
7377
4ed7c92d 7378int perf_swevent_get_recursion_context(void)
96f6d444 7379{
4a32fea9 7380 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
96f6d444 7381
b28ab83c 7382 return get_recursion_context(swhash->recursion);
96f6d444 7383}
645e8cc0 7384EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
96f6d444 7385
98b5c2c6 7386void perf_swevent_put_recursion_context(int rctx)
15dbf27c 7387{
4a32fea9 7388 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
927c7a9e 7389
b28ab83c 7390 put_recursion_context(swhash->recursion, rctx);
ce71b9df 7391}
15dbf27c 7392
86038c5e 7393void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
b8e83514 7394{
a4234bfc 7395 struct perf_sample_data data;
4ed7c92d 7396
86038c5e 7397 if (WARN_ON_ONCE(!regs))
4ed7c92d 7398 return;
a4234bfc 7399
fd0d000b 7400 perf_sample_data_init(&data, addr, 0);
a8b0ca17 7401 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
86038c5e
PZI
7402}
7403
7404void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
7405{
7406 int rctx;
7407
7408 preempt_disable_notrace();
7409 rctx = perf_swevent_get_recursion_context();
7410 if (unlikely(rctx < 0))
7411 goto fail;
7412
7413 ___perf_sw_event(event_id, nr, regs, addr);
4ed7c92d
PZ
7414
7415 perf_swevent_put_recursion_context(rctx);
86038c5e 7416fail:
1c024eca 7417 preempt_enable_notrace();
b8e83514
PZ
7418}
7419
cdd6c482 7420static void perf_swevent_read(struct perf_event *event)
15dbf27c 7421{
15dbf27c
PZ
7422}
7423
a4eaf7f1 7424static int perf_swevent_add(struct perf_event *event, int flags)
15dbf27c 7425{
4a32fea9 7426 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
cdd6c482 7427 struct hw_perf_event *hwc = &event->hw;
76e1d904
FW
7428 struct hlist_head *head;
7429
6c7e550f 7430 if (is_sampling_event(event)) {
7b4b6658 7431 hwc->last_period = hwc->sample_period;
cdd6c482 7432 perf_swevent_set_period(event);
7b4b6658 7433 }
76e1d904 7434
a4eaf7f1
PZ
7435 hwc->state = !(flags & PERF_EF_START);
7436
b28ab83c 7437 head = find_swevent_head(swhash, event);
12ca6ad2 7438 if (WARN_ON_ONCE(!head))
76e1d904
FW
7439 return -EINVAL;
7440
7441 hlist_add_head_rcu(&event->hlist_entry, head);
6a694a60 7442 perf_event_update_userpage(event);
76e1d904 7443
15dbf27c
PZ
7444 return 0;
7445}
7446
a4eaf7f1 7447static void perf_swevent_del(struct perf_event *event, int flags)
15dbf27c 7448{
76e1d904 7449 hlist_del_rcu(&event->hlist_entry);
15dbf27c
PZ
7450}
7451
a4eaf7f1 7452static void perf_swevent_start(struct perf_event *event, int flags)
5c92d124 7453{
a4eaf7f1 7454 event->hw.state = 0;
d6d020e9 7455}
aa9c4c0f 7456
a4eaf7f1 7457static void perf_swevent_stop(struct perf_event *event, int flags)
d6d020e9 7458{
a4eaf7f1 7459 event->hw.state = PERF_HES_STOPPED;
bae43c99
IM
7460}
7461
49f135ed
FW
7462/* Deref the hlist from the update side */
7463static inline struct swevent_hlist *
b28ab83c 7464swevent_hlist_deref(struct swevent_htable *swhash)
49f135ed 7465{
b28ab83c
PZ
7466 return rcu_dereference_protected(swhash->swevent_hlist,
7467 lockdep_is_held(&swhash->hlist_mutex));
49f135ed
FW
7468}
7469
b28ab83c 7470static void swevent_hlist_release(struct swevent_htable *swhash)
76e1d904 7471{
b28ab83c 7472 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
76e1d904 7473
49f135ed 7474 if (!hlist)
76e1d904
FW
7475 return;
7476
70691d4a 7477 RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
fa4bbc4c 7478 kfree_rcu(hlist, rcu_head);
76e1d904
FW
7479}
7480
3b364d7b 7481static void swevent_hlist_put_cpu(int cpu)
76e1d904 7482{
b28ab83c 7483 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
76e1d904 7484
b28ab83c 7485 mutex_lock(&swhash->hlist_mutex);
76e1d904 7486
b28ab83c
PZ
7487 if (!--swhash->hlist_refcount)
7488 swevent_hlist_release(swhash);
76e1d904 7489
b28ab83c 7490 mutex_unlock(&swhash->hlist_mutex);
76e1d904
FW
7491}
7492
3b364d7b 7493static void swevent_hlist_put(void)
76e1d904
FW
7494{
7495 int cpu;
7496
76e1d904 7497 for_each_possible_cpu(cpu)
3b364d7b 7498 swevent_hlist_put_cpu(cpu);
76e1d904
FW
7499}
7500
3b364d7b 7501static int swevent_hlist_get_cpu(int cpu)
76e1d904 7502{
b28ab83c 7503 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
76e1d904
FW
7504 int err = 0;
7505
b28ab83c 7506 mutex_lock(&swhash->hlist_mutex);
b28ab83c 7507 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
76e1d904
FW
7508 struct swevent_hlist *hlist;
7509
7510 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
7511 if (!hlist) {
7512 err = -ENOMEM;
7513 goto exit;
7514 }
b28ab83c 7515 rcu_assign_pointer(swhash->swevent_hlist, hlist);
76e1d904 7516 }
b28ab83c 7517 swhash->hlist_refcount++;
9ed6060d 7518exit:
b28ab83c 7519 mutex_unlock(&swhash->hlist_mutex);
76e1d904
FW
7520
7521 return err;
7522}
7523
3b364d7b 7524static int swevent_hlist_get(void)
76e1d904 7525{
3b364d7b 7526 int err, cpu, failed_cpu;
76e1d904 7527
76e1d904
FW
7528 get_online_cpus();
7529 for_each_possible_cpu(cpu) {
3b364d7b 7530 err = swevent_hlist_get_cpu(cpu);
76e1d904
FW
7531 if (err) {
7532 failed_cpu = cpu;
7533 goto fail;
7534 }
7535 }
7536 put_online_cpus();
7537
7538 return 0;
9ed6060d 7539fail:
76e1d904
FW
7540 for_each_possible_cpu(cpu) {
7541 if (cpu == failed_cpu)
7542 break;
3b364d7b 7543 swevent_hlist_put_cpu(cpu);
76e1d904
FW
7544 }
7545
7546 put_online_cpus();
7547 return err;
7548}
7549
c5905afb 7550struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
95476b64 7551
b0a873eb
PZ
7552static void sw_perf_event_destroy(struct perf_event *event)
7553{
7554 u64 event_id = event->attr.config;
95476b64 7555
b0a873eb
PZ
7556 WARN_ON(event->parent);
7557
c5905afb 7558 static_key_slow_dec(&perf_swevent_enabled[event_id]);
3b364d7b 7559 swevent_hlist_put();
b0a873eb
PZ
7560}
7561
7562static int perf_swevent_init(struct perf_event *event)
7563{
8176cced 7564 u64 event_id = event->attr.config;
b0a873eb
PZ
7565
7566 if (event->attr.type != PERF_TYPE_SOFTWARE)
7567 return -ENOENT;
7568
2481c5fa
SE
7569 /*
7570 * no branch sampling for software events
7571 */
7572 if (has_branch_stack(event))
7573 return -EOPNOTSUPP;
7574
b0a873eb
PZ
7575 switch (event_id) {
7576 case PERF_COUNT_SW_CPU_CLOCK:
7577 case PERF_COUNT_SW_TASK_CLOCK:
7578 return -ENOENT;
7579
7580 default:
7581 break;
7582 }
7583
ce677831 7584 if (event_id >= PERF_COUNT_SW_MAX)
b0a873eb
PZ
7585 return -ENOENT;
7586
7587 if (!event->parent) {
7588 int err;
7589
3b364d7b 7590 err = swevent_hlist_get();
b0a873eb
PZ
7591 if (err)
7592 return err;
7593
c5905afb 7594 static_key_slow_inc(&perf_swevent_enabled[event_id]);
b0a873eb
PZ
7595 event->destroy = sw_perf_event_destroy;
7596 }
7597
7598 return 0;
7599}
7600
7601static struct pmu perf_swevent = {
89a1e187 7602 .task_ctx_nr = perf_sw_context,
95476b64 7603
34f43927
PZ
7604 .capabilities = PERF_PMU_CAP_NO_NMI,
7605
b0a873eb 7606 .event_init = perf_swevent_init,
a4eaf7f1
PZ
7607 .add = perf_swevent_add,
7608 .del = perf_swevent_del,
7609 .start = perf_swevent_start,
7610 .stop = perf_swevent_stop,
1c024eca 7611 .read = perf_swevent_read,
1c024eca
PZ
7612};
7613
b0a873eb
PZ
7614#ifdef CONFIG_EVENT_TRACING
7615
1c024eca
PZ
7616static int perf_tp_filter_match(struct perf_event *event,
7617 struct perf_sample_data *data)
7618{
7e3f977e 7619 void *record = data->raw->frag.data;
1c024eca 7620
b71b437e
PZ
7621 /* only top level events have filters set */
7622 if (event->parent)
7623 event = event->parent;
7624
1c024eca
PZ
7625 if (likely(!event->filter) || filter_match_preds(event->filter, record))
7626 return 1;
7627 return 0;
7628}
7629
7630static int perf_tp_event_match(struct perf_event *event,
7631 struct perf_sample_data *data,
7632 struct pt_regs *regs)
7633{
a0f7d0f7
FW
7634 if (event->hw.state & PERF_HES_STOPPED)
7635 return 0;
580d607c
PZ
7636 /*
7637 * All tracepoints are from kernel-space.
7638 */
7639 if (event->attr.exclude_kernel)
1c024eca
PZ
7640 return 0;
7641
7642 if (!perf_tp_filter_match(event, data))
7643 return 0;
7644
7645 return 1;
7646}
7647
85b67bcb
AS
7648void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
7649 struct trace_event_call *call, u64 count,
7650 struct pt_regs *regs, struct hlist_head *head,
7651 struct task_struct *task)
7652{
7653 struct bpf_prog *prog = call->prog;
7654
7655 if (prog) {
7656 *(struct pt_regs **)raw_data = regs;
7657 if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) {
7658 perf_swevent_put_recursion_context(rctx);
7659 return;
7660 }
7661 }
7662 perf_tp_event(call->event.type, count, raw_data, size, regs, head,
7663 rctx, task);
7664}
7665EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
7666
1e1dcd93 7667void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
e6dab5ff
AV
7668 struct pt_regs *regs, struct hlist_head *head, int rctx,
7669 struct task_struct *task)
95476b64
FW
7670{
7671 struct perf_sample_data data;
1c024eca 7672 struct perf_event *event;
1c024eca 7673
95476b64 7674 struct perf_raw_record raw = {
7e3f977e
DB
7675 .frag = {
7676 .size = entry_size,
7677 .data = record,
7678 },
95476b64
FW
7679 };
7680
1e1dcd93 7681 perf_sample_data_init(&data, 0, 0);
95476b64
FW
7682 data.raw = &raw;
7683
1e1dcd93
AS
7684 perf_trace_buf_update(record, event_type);
7685
b67bfe0d 7686 hlist_for_each_entry_rcu(event, head, hlist_entry) {
1c024eca 7687 if (perf_tp_event_match(event, &data, regs))
a8b0ca17 7688 perf_swevent_event(event, count, &data, regs);
4f41c013 7689 }
ecc55f84 7690
e6dab5ff
AV
7691 /*
7692 * If we got specified a target task, also iterate its context and
7693 * deliver this event there too.
7694 */
7695 if (task && task != current) {
7696 struct perf_event_context *ctx;
7697 struct trace_entry *entry = record;
7698
7699 rcu_read_lock();
7700 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
7701 if (!ctx)
7702 goto unlock;
7703
7704 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
7705 if (event->attr.type != PERF_TYPE_TRACEPOINT)
7706 continue;
7707 if (event->attr.config != entry->type)
7708 continue;
7709 if (perf_tp_event_match(event, &data, regs))
7710 perf_swevent_event(event, count, &data, regs);
7711 }
7712unlock:
7713 rcu_read_unlock();
7714 }
7715
ecc55f84 7716 perf_swevent_put_recursion_context(rctx);
95476b64
FW
7717}
7718EXPORT_SYMBOL_GPL(perf_tp_event);
7719
cdd6c482 7720static void tp_perf_event_destroy(struct perf_event *event)
e077df4f 7721{
1c024eca 7722 perf_trace_destroy(event);
e077df4f
PZ
7723}
7724
b0a873eb 7725static int perf_tp_event_init(struct perf_event *event)
e077df4f 7726{
76e1d904
FW
7727 int err;
7728
b0a873eb
PZ
7729 if (event->attr.type != PERF_TYPE_TRACEPOINT)
7730 return -ENOENT;
7731
2481c5fa
SE
7732 /*
7733 * no branch sampling for tracepoint events
7734 */
7735 if (has_branch_stack(event))
7736 return -EOPNOTSUPP;
7737
1c024eca
PZ
7738 err = perf_trace_init(event);
7739 if (err)
b0a873eb 7740 return err;
e077df4f 7741
cdd6c482 7742 event->destroy = tp_perf_event_destroy;
e077df4f 7743
b0a873eb
PZ
7744 return 0;
7745}
7746
7747static struct pmu perf_tracepoint = {
89a1e187
PZ
7748 .task_ctx_nr = perf_sw_context,
7749
b0a873eb 7750 .event_init = perf_tp_event_init,
a4eaf7f1
PZ
7751 .add = perf_trace_add,
7752 .del = perf_trace_del,
7753 .start = perf_swevent_start,
7754 .stop = perf_swevent_stop,
b0a873eb 7755 .read = perf_swevent_read,
b0a873eb
PZ
7756};
7757
7758static inline void perf_tp_register(void)
7759{
2e80a82a 7760 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
e077df4f 7761}
6fb2915d 7762
6fb2915d
LZ
7763static void perf_event_free_filter(struct perf_event *event)
7764{
7765 ftrace_profile_free_filter(event);
7766}
7767
aa6a5f3c
AS
7768#ifdef CONFIG_BPF_SYSCALL
7769static void bpf_overflow_handler(struct perf_event *event,
7770 struct perf_sample_data *data,
7771 struct pt_regs *regs)
7772{
7773 struct bpf_perf_event_data_kern ctx = {
7774 .data = data,
7775 .regs = regs,
7776 };
7777 int ret = 0;
7778
7779 preempt_disable();
7780 if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
7781 goto out;
7782 rcu_read_lock();
88575199 7783 ret = BPF_PROG_RUN(event->prog, &ctx);
aa6a5f3c
AS
7784 rcu_read_unlock();
7785out:
7786 __this_cpu_dec(bpf_prog_active);
7787 preempt_enable();
7788 if (!ret)
7789 return;
7790
7791 event->orig_overflow_handler(event, data, regs);
7792}
7793
7794static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
7795{
7796 struct bpf_prog *prog;
7797
7798 if (event->overflow_handler_context)
7799 /* hw breakpoint or kernel counter */
7800 return -EINVAL;
7801
7802 if (event->prog)
7803 return -EEXIST;
7804
7805 prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
7806 if (IS_ERR(prog))
7807 return PTR_ERR(prog);
7808
7809 event->prog = prog;
7810 event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
7811 WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
7812 return 0;
7813}
7814
7815static void perf_event_free_bpf_handler(struct perf_event *event)
7816{
7817 struct bpf_prog *prog = event->prog;
7818
7819 if (!prog)
7820 return;
7821
7822 WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
7823 event->prog = NULL;
7824 bpf_prog_put(prog);
7825}
7826#else
7827static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
7828{
7829 return -EOPNOTSUPP;
7830}
7831static void perf_event_free_bpf_handler(struct perf_event *event)
7832{
7833}
7834#endif
7835
2541517c
AS
7836static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
7837{
98b5c2c6 7838 bool is_kprobe, is_tracepoint;
2541517c
AS
7839 struct bpf_prog *prog;
7840
aa6a5f3c
AS
7841 if (event->attr.type == PERF_TYPE_HARDWARE ||
7842 event->attr.type == PERF_TYPE_SOFTWARE)
7843 return perf_event_set_bpf_handler(event, prog_fd);
7844
2541517c
AS
7845 if (event->attr.type != PERF_TYPE_TRACEPOINT)
7846 return -EINVAL;
7847
7848 if (event->tp_event->prog)
7849 return -EEXIST;
7850
98b5c2c6
AS
7851 is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
7852 is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
7853 if (!is_kprobe && !is_tracepoint)
7854 /* bpf programs can only be attached to u/kprobe or tracepoint */
2541517c
AS
7855 return -EINVAL;
7856
7857 prog = bpf_prog_get(prog_fd);
7858 if (IS_ERR(prog))
7859 return PTR_ERR(prog);
7860
98b5c2c6
AS
7861 if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
7862 (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
2541517c
AS
7863 /* valid fd, but invalid bpf program type */
7864 bpf_prog_put(prog);
7865 return -EINVAL;
7866 }
7867
32bbe007
AS
7868 if (is_tracepoint) {
7869 int off = trace_event_get_offsets(event->tp_event);
7870
7871 if (prog->aux->max_ctx_offset > off) {
7872 bpf_prog_put(prog);
7873 return -EACCES;
7874 }
7875 }
2541517c
AS
7876 event->tp_event->prog = prog;
7877
7878 return 0;
7879}
7880
7881static void perf_event_free_bpf_prog(struct perf_event *event)
7882{
7883 struct bpf_prog *prog;
7884
aa6a5f3c
AS
7885 perf_event_free_bpf_handler(event);
7886
2541517c
AS
7887 if (!event->tp_event)
7888 return;
7889
7890 prog = event->tp_event->prog;
7891 if (prog) {
7892 event->tp_event->prog = NULL;
1aacde3d 7893 bpf_prog_put(prog);
2541517c
AS
7894 }
7895}
7896
e077df4f 7897#else
6fb2915d 7898
b0a873eb 7899static inline void perf_tp_register(void)
e077df4f 7900{
e077df4f 7901}
6fb2915d 7902
6fb2915d
LZ
7903static void perf_event_free_filter(struct perf_event *event)
7904{
7905}
7906
2541517c
AS
7907static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
7908{
7909 return -ENOENT;
7910}
7911
7912static void perf_event_free_bpf_prog(struct perf_event *event)
7913{
7914}
07b139c8 7915#endif /* CONFIG_EVENT_TRACING */
e077df4f 7916
24f1e32c 7917#ifdef CONFIG_HAVE_HW_BREAKPOINT
f5ffe02e 7918void perf_bp_event(struct perf_event *bp, void *data)
24f1e32c 7919{
f5ffe02e
FW
7920 struct perf_sample_data sample;
7921 struct pt_regs *regs = data;
7922
fd0d000b 7923 perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
f5ffe02e 7924
a4eaf7f1 7925 if (!bp->hw.state && !perf_exclude_event(bp, regs))
a8b0ca17 7926 perf_swevent_event(bp, 1, &sample, regs);
24f1e32c
FW
7927}
7928#endif
7929
375637bc
AS
7930/*
7931 * Allocate a new address filter
7932 */
7933static struct perf_addr_filter *
7934perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
7935{
7936 int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
7937 struct perf_addr_filter *filter;
7938
7939 filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
7940 if (!filter)
7941 return NULL;
7942
7943 INIT_LIST_HEAD(&filter->entry);
7944 list_add_tail(&filter->entry, filters);
7945
7946 return filter;
7947}
7948
7949static void free_filters_list(struct list_head *filters)
7950{
7951 struct perf_addr_filter *filter, *iter;
7952
7953 list_for_each_entry_safe(filter, iter, filters, entry) {
7954 if (filter->inode)
7955 iput(filter->inode);
7956 list_del(&filter->entry);
7957 kfree(filter);
7958 }
7959}
7960
7961/*
7962 * Free existing address filters and optionally install new ones
7963 */
7964static void perf_addr_filters_splice(struct perf_event *event,
7965 struct list_head *head)
7966{
7967 unsigned long flags;
7968 LIST_HEAD(list);
7969
7970 if (!has_addr_filter(event))
7971 return;
7972
7973 /* don't bother with children, they don't have their own filters */
7974 if (event->parent)
7975 return;
7976
7977 raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
7978
7979 list_splice_init(&event->addr_filters.list, &list);
7980 if (head)
7981 list_splice(head, &event->addr_filters.list);
7982
7983 raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
7984
7985 free_filters_list(&list);
7986}
7987
7988/*
7989 * Scan through mm's vmas and see if one of them matches the
7990 * @filter; if so, adjust filter's address range.
7991 * Called with mm::mmap_sem down for reading.
7992 */
7993static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter,
7994 struct mm_struct *mm)
7995{
7996 struct vm_area_struct *vma;
7997
7998 for (vma = mm->mmap; vma; vma = vma->vm_next) {
7999 struct file *file = vma->vm_file;
8000 unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
8001 unsigned long vma_size = vma->vm_end - vma->vm_start;
8002
8003 if (!file)
8004 continue;
8005
8006 if (!perf_addr_filter_match(filter, file, off, vma_size))
8007 continue;
8008
8009 return vma->vm_start;
8010 }
8011
8012 return 0;
8013}
8014
8015/*
8016 * Update event's address range filters based on the
8017 * task's existing mappings, if any.
8018 */
8019static void perf_event_addr_filters_apply(struct perf_event *event)
8020{
8021 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
8022 struct task_struct *task = READ_ONCE(event->ctx->task);
8023 struct perf_addr_filter *filter;
8024 struct mm_struct *mm = NULL;
8025 unsigned int count = 0;
8026 unsigned long flags;
8027
8028 /*
8029 * We may observe TASK_TOMBSTONE, which means that the event tear-down
8030 * will stop on the parent's child_mutex that our caller is also holding
8031 */
8032 if (task == TASK_TOMBSTONE)
8033 return;
8034
8035 mm = get_task_mm(event->ctx->task);
8036 if (!mm)
8037 goto restart;
8038
8039 down_read(&mm->mmap_sem);
8040
8041 raw_spin_lock_irqsave(&ifh->lock, flags);
8042 list_for_each_entry(filter, &ifh->list, entry) {
8043 event->addr_filters_offs[count] = 0;
8044
99f5bc9b
MP
8045 /*
8046 * Adjust base offset if the filter is associated to a binary
8047 * that needs to be mapped:
8048 */
8049 if (filter->inode)
375637bc
AS
8050 event->addr_filters_offs[count] =
8051 perf_addr_filter_apply(filter, mm);
8052
8053 count++;
8054 }
8055
8056 event->addr_filters_gen++;
8057 raw_spin_unlock_irqrestore(&ifh->lock, flags);
8058
8059 up_read(&mm->mmap_sem);
8060
8061 mmput(mm);
8062
8063restart:
767ae086 8064 perf_event_stop(event, 1);
375637bc
AS
8065}
8066
8067/*
8068 * Address range filtering: limiting the data to certain
8069 * instruction address ranges. Filters are ioctl()ed to us from
8070 * userspace as ascii strings.
8071 *
8072 * Filter string format:
8073 *
8074 * ACTION RANGE_SPEC
8075 * where ACTION is one of the
8076 * * "filter": limit the trace to this region
8077 * * "start": start tracing from this address
8078 * * "stop": stop tracing at this address/region;
8079 * RANGE_SPEC is
8080 * * for kernel addresses: <start address>[/<size>]
8081 * * for object files: <start address>[/<size>]@</path/to/object/file>
8082 *
8083 * if <size> is not specified, the range is treated as a single address.
8084 */
8085enum {
e96271f3 8086 IF_ACT_NONE = -1,
375637bc
AS
8087 IF_ACT_FILTER,
8088 IF_ACT_START,
8089 IF_ACT_STOP,
8090 IF_SRC_FILE,
8091 IF_SRC_KERNEL,
8092 IF_SRC_FILEADDR,
8093 IF_SRC_KERNELADDR,
8094};
8095
8096enum {
8097 IF_STATE_ACTION = 0,
8098 IF_STATE_SOURCE,
8099 IF_STATE_END,
8100};
8101
8102static const match_table_t if_tokens = {
8103 { IF_ACT_FILTER, "filter" },
8104 { IF_ACT_START, "start" },
8105 { IF_ACT_STOP, "stop" },
8106 { IF_SRC_FILE, "%u/%u@%s" },
8107 { IF_SRC_KERNEL, "%u/%u" },
8108 { IF_SRC_FILEADDR, "%u@%s" },
8109 { IF_SRC_KERNELADDR, "%u" },
e96271f3 8110 { IF_ACT_NONE, NULL },
375637bc
AS
8111};
8112
8113/*
8114 * Address filter string parser
8115 */
8116static int
8117perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
8118 struct list_head *filters)
8119{
8120 struct perf_addr_filter *filter = NULL;
8121 char *start, *orig, *filename = NULL;
8122 struct path path;
8123 substring_t args[MAX_OPT_ARGS];
8124 int state = IF_STATE_ACTION, token;
8125 unsigned int kernel = 0;
8126 int ret = -EINVAL;
8127
8128 orig = fstr = kstrdup(fstr, GFP_KERNEL);
8129 if (!fstr)
8130 return -ENOMEM;
8131
8132 while ((start = strsep(&fstr, " ,\n")) != NULL) {
8133 ret = -EINVAL;
8134
8135 if (!*start)
8136 continue;
8137
8138 /* filter definition begins */
8139 if (state == IF_STATE_ACTION) {
8140 filter = perf_addr_filter_new(event, filters);
8141 if (!filter)
8142 goto fail;
8143 }
8144
8145 token = match_token(start, if_tokens, args);
8146 switch (token) {
8147 case IF_ACT_FILTER:
8148 case IF_ACT_START:
8149 filter->filter = 1;
8150
8151 case IF_ACT_STOP:
8152 if (state != IF_STATE_ACTION)
8153 goto fail;
8154
8155 state = IF_STATE_SOURCE;
8156 break;
8157
8158 case IF_SRC_KERNELADDR:
8159 case IF_SRC_KERNEL:
8160 kernel = 1;
8161
8162 case IF_SRC_FILEADDR:
8163 case IF_SRC_FILE:
8164 if (state != IF_STATE_SOURCE)
8165 goto fail;
8166
8167 if (token == IF_SRC_FILE || token == IF_SRC_KERNEL)
8168 filter->range = 1;
8169
8170 *args[0].to = 0;
8171 ret = kstrtoul(args[0].from, 0, &filter->offset);
8172 if (ret)
8173 goto fail;
8174
8175 if (filter->range) {
8176 *args[1].to = 0;
8177 ret = kstrtoul(args[1].from, 0, &filter->size);
8178 if (ret)
8179 goto fail;
8180 }
8181
4059ffd0
MP
8182 if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
8183 int fpos = filter->range ? 2 : 1;
8184
8185 filename = match_strdup(&args[fpos]);
375637bc
AS
8186 if (!filename) {
8187 ret = -ENOMEM;
8188 goto fail;
8189 }
8190 }
8191
8192 state = IF_STATE_END;
8193 break;
8194
8195 default:
8196 goto fail;
8197 }
8198
8199 /*
8200 * Filter definition is fully parsed, validate and install it.
8201 * Make sure that it doesn't contradict itself or the event's
8202 * attribute.
8203 */
8204 if (state == IF_STATE_END) {
8205 if (kernel && event->attr.exclude_kernel)
8206 goto fail;
8207
8208 if (!kernel) {
8209 if (!filename)
8210 goto fail;
8211
8212 /* look up the path and grab its inode */
8213 ret = kern_path(filename, LOOKUP_FOLLOW, &path);
8214 if (ret)
8215 goto fail_free_name;
8216
8217 filter->inode = igrab(d_inode(path.dentry));
8218 path_put(&path);
8219 kfree(filename);
8220 filename = NULL;
8221
8222 ret = -EINVAL;
8223 if (!filter->inode ||
8224 !S_ISREG(filter->inode->i_mode))
8225 /* free_filters_list() will iput() */
8226 goto fail;
8227 }
8228
8229 /* ready to consume more filters */
8230 state = IF_STATE_ACTION;
8231 filter = NULL;
8232 }
8233 }
8234
8235 if (state != IF_STATE_ACTION)
8236 goto fail;
8237
8238 kfree(orig);
8239
8240 return 0;
8241
8242fail_free_name:
8243 kfree(filename);
8244fail:
8245 free_filters_list(filters);
8246 kfree(orig);
8247
8248 return ret;
8249}
8250
8251static int
8252perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
8253{
8254 LIST_HEAD(filters);
8255 int ret;
8256
8257 /*
8258 * Since this is called in perf_ioctl() path, we're already holding
8259 * ctx::mutex.
8260 */
8261 lockdep_assert_held(&event->ctx->mutex);
8262
8263 if (WARN_ON_ONCE(event->parent))
8264 return -EINVAL;
8265
8266 /*
8267 * For now, we only support filtering in per-task events; doing so
8268 * for CPU-wide events requires additional context switching trickery,
8269 * since same object code will be mapped at different virtual
8270 * addresses in different processes.
8271 */
8272 if (!event->ctx->task)
8273 return -EOPNOTSUPP;
8274
8275 ret = perf_event_parse_addr_filter(event, filter_str, &filters);
8276 if (ret)
8277 return ret;
8278
8279 ret = event->pmu->addr_filters_validate(&filters);
8280 if (ret) {
8281 free_filters_list(&filters);
8282 return ret;
8283 }
8284
8285 /* remove existing filters, if any */
8286 perf_addr_filters_splice(event, &filters);
8287
8288 /* install new filters */
8289 perf_event_for_each_child(event, perf_event_addr_filters_apply);
8290
8291 return ret;
8292}
8293
c796bbbe
AS
8294static int perf_event_set_filter(struct perf_event *event, void __user *arg)
8295{
8296 char *filter_str;
8297 int ret = -EINVAL;
8298
375637bc
AS
8299 if ((event->attr.type != PERF_TYPE_TRACEPOINT ||
8300 !IS_ENABLED(CONFIG_EVENT_TRACING)) &&
8301 !has_addr_filter(event))
c796bbbe
AS
8302 return -EINVAL;
8303
8304 filter_str = strndup_user(arg, PAGE_SIZE);
8305 if (IS_ERR(filter_str))
8306 return PTR_ERR(filter_str);
8307
8308 if (IS_ENABLED(CONFIG_EVENT_TRACING) &&
8309 event->attr.type == PERF_TYPE_TRACEPOINT)
8310 ret = ftrace_profile_set_filter(event, event->attr.config,
8311 filter_str);
375637bc
AS
8312 else if (has_addr_filter(event))
8313 ret = perf_event_set_addr_filter(event, filter_str);
c796bbbe
AS
8314
8315 kfree(filter_str);
8316 return ret;
8317}
8318
b0a873eb
PZ
8319/*
8320 * hrtimer based swevent callback
8321 */
f29ac756 8322
b0a873eb 8323static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
f29ac756 8324{
b0a873eb
PZ
8325 enum hrtimer_restart ret = HRTIMER_RESTART;
8326 struct perf_sample_data data;
8327 struct pt_regs *regs;
8328 struct perf_event *event;
8329 u64 period;
f29ac756 8330
b0a873eb 8331 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
ba3dd36c
PZ
8332
8333 if (event->state != PERF_EVENT_STATE_ACTIVE)
8334 return HRTIMER_NORESTART;
8335
b0a873eb 8336 event->pmu->read(event);
f344011c 8337
fd0d000b 8338 perf_sample_data_init(&data, 0, event->hw.last_period);
b0a873eb
PZ
8339 regs = get_irq_regs();
8340
8341 if (regs && !perf_exclude_event(event, regs)) {
77aeeebd 8342 if (!(event->attr.exclude_idle && is_idle_task(current)))
33b07b8b 8343 if (__perf_event_overflow(event, 1, &data, regs))
b0a873eb
PZ
8344 ret = HRTIMER_NORESTART;
8345 }
24f1e32c 8346
b0a873eb
PZ
8347 period = max_t(u64, 10000, event->hw.sample_period);
8348 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
24f1e32c 8349
b0a873eb 8350 return ret;
f29ac756
PZ
8351}
8352
b0a873eb 8353static void perf_swevent_start_hrtimer(struct perf_event *event)
5c92d124 8354{
b0a873eb 8355 struct hw_perf_event *hwc = &event->hw;
5d508e82
FBH
8356 s64 period;
8357
8358 if (!is_sampling_event(event))
8359 return;
f5ffe02e 8360
5d508e82
FBH
8361 period = local64_read(&hwc->period_left);
8362 if (period) {
8363 if (period < 0)
8364 period = 10000;
fa407f35 8365
5d508e82
FBH
8366 local64_set(&hwc->period_left, 0);
8367 } else {
8368 period = max_t(u64, 10000, hwc->sample_period);
8369 }
3497d206
TG
8370 hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
8371 HRTIMER_MODE_REL_PINNED);
24f1e32c 8372}
b0a873eb
PZ
8373
8374static void perf_swevent_cancel_hrtimer(struct perf_event *event)
24f1e32c 8375{
b0a873eb
PZ
8376 struct hw_perf_event *hwc = &event->hw;
8377
6c7e550f 8378 if (is_sampling_event(event)) {
b0a873eb 8379 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
fa407f35 8380 local64_set(&hwc->period_left, ktime_to_ns(remaining));
b0a873eb
PZ
8381
8382 hrtimer_cancel(&hwc->hrtimer);
8383 }
24f1e32c
FW
8384}
8385
ba3dd36c
PZ
8386static void perf_swevent_init_hrtimer(struct perf_event *event)
8387{
8388 struct hw_perf_event *hwc = &event->hw;
8389
8390 if (!is_sampling_event(event))
8391 return;
8392
8393 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
8394 hwc->hrtimer.function = perf_swevent_hrtimer;
8395
8396 /*
8397 * Since hrtimers have a fixed rate, we can do a static freq->period
8398 * mapping and avoid the whole period adjust feedback stuff.
8399 */
8400 if (event->attr.freq) {
8401 long freq = event->attr.sample_freq;
8402
8403 event->attr.sample_period = NSEC_PER_SEC / freq;
8404 hwc->sample_period = event->attr.sample_period;
8405 local64_set(&hwc->period_left, hwc->sample_period);
778141e3 8406 hwc->last_period = hwc->sample_period;
ba3dd36c
PZ
8407 event->attr.freq = 0;
8408 }
8409}
8410
b0a873eb
PZ
8411/*
8412 * Software event: cpu wall time clock
8413 */
8414
8415static void cpu_clock_event_update(struct perf_event *event)
24f1e32c 8416{
b0a873eb
PZ
8417 s64 prev;
8418 u64 now;
8419
a4eaf7f1 8420 now = local_clock();
b0a873eb
PZ
8421 prev = local64_xchg(&event->hw.prev_count, now);
8422 local64_add(now - prev, &event->count);
24f1e32c 8423}
24f1e32c 8424
a4eaf7f1 8425static void cpu_clock_event_start(struct perf_event *event, int flags)
b0a873eb 8426{
a4eaf7f1 8427 local64_set(&event->hw.prev_count, local_clock());
b0a873eb 8428 perf_swevent_start_hrtimer(event);
b0a873eb
PZ
8429}
8430
a4eaf7f1 8431static void cpu_clock_event_stop(struct perf_event *event, int flags)
f29ac756 8432{
b0a873eb
PZ
8433 perf_swevent_cancel_hrtimer(event);
8434 cpu_clock_event_update(event);
8435}
f29ac756 8436
a4eaf7f1
PZ
8437static int cpu_clock_event_add(struct perf_event *event, int flags)
8438{
8439 if (flags & PERF_EF_START)
8440 cpu_clock_event_start(event, flags);
6a694a60 8441 perf_event_update_userpage(event);
a4eaf7f1
PZ
8442
8443 return 0;
8444}
8445
8446static void cpu_clock_event_del(struct perf_event *event, int flags)
8447{
8448 cpu_clock_event_stop(event, flags);
8449}
8450
b0a873eb
PZ
8451static void cpu_clock_event_read(struct perf_event *event)
8452{
8453 cpu_clock_event_update(event);
8454}
f344011c 8455
b0a873eb
PZ
8456static int cpu_clock_event_init(struct perf_event *event)
8457{
8458 if (event->attr.type != PERF_TYPE_SOFTWARE)
8459 return -ENOENT;
8460
8461 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
8462 return -ENOENT;
8463
2481c5fa
SE
8464 /*
8465 * no branch sampling for software events
8466 */
8467 if (has_branch_stack(event))
8468 return -EOPNOTSUPP;
8469
ba3dd36c
PZ
8470 perf_swevent_init_hrtimer(event);
8471
b0a873eb 8472 return 0;
f29ac756
PZ
8473}
8474
b0a873eb 8475static struct pmu perf_cpu_clock = {
89a1e187
PZ
8476 .task_ctx_nr = perf_sw_context,
8477
34f43927
PZ
8478 .capabilities = PERF_PMU_CAP_NO_NMI,
8479
b0a873eb 8480 .event_init = cpu_clock_event_init,
a4eaf7f1
PZ
8481 .add = cpu_clock_event_add,
8482 .del = cpu_clock_event_del,
8483 .start = cpu_clock_event_start,
8484 .stop = cpu_clock_event_stop,
b0a873eb
PZ
8485 .read = cpu_clock_event_read,
8486};
8487
8488/*
8489 * Software event: task time clock
8490 */
8491
8492static void task_clock_event_update(struct perf_event *event, u64 now)
5c92d124 8493{
b0a873eb
PZ
8494 u64 prev;
8495 s64 delta;
5c92d124 8496
b0a873eb
PZ
8497 prev = local64_xchg(&event->hw.prev_count, now);
8498 delta = now - prev;
8499 local64_add(delta, &event->count);
8500}
5c92d124 8501
a4eaf7f1 8502static void task_clock_event_start(struct perf_event *event, int flags)
b0a873eb 8503{
a4eaf7f1 8504 local64_set(&event->hw.prev_count, event->ctx->time);
b0a873eb 8505 perf_swevent_start_hrtimer(event);
b0a873eb
PZ
8506}
8507
a4eaf7f1 8508static void task_clock_event_stop(struct perf_event *event, int flags)
b0a873eb
PZ
8509{
8510 perf_swevent_cancel_hrtimer(event);
8511 task_clock_event_update(event, event->ctx->time);
a4eaf7f1
PZ
8512}
8513
8514static int task_clock_event_add(struct perf_event *event, int flags)
8515{
8516 if (flags & PERF_EF_START)
8517 task_clock_event_start(event, flags);
6a694a60 8518 perf_event_update_userpage(event);
b0a873eb 8519
a4eaf7f1
PZ
8520 return 0;
8521}
8522
8523static void task_clock_event_del(struct perf_event *event, int flags)
8524{
8525 task_clock_event_stop(event, PERF_EF_UPDATE);
b0a873eb
PZ
8526}
8527
8528static void task_clock_event_read(struct perf_event *event)
8529{
768a06e2
PZ
8530 u64 now = perf_clock();
8531 u64 delta = now - event->ctx->timestamp;
8532 u64 time = event->ctx->time + delta;
b0a873eb
PZ
8533
8534 task_clock_event_update(event, time);
8535}
8536
8537static int task_clock_event_init(struct perf_event *event)
6fb2915d 8538{
b0a873eb
PZ
8539 if (event->attr.type != PERF_TYPE_SOFTWARE)
8540 return -ENOENT;
8541
8542 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
8543 return -ENOENT;
8544
2481c5fa
SE
8545 /*
8546 * no branch sampling for software events
8547 */
8548 if (has_branch_stack(event))
8549 return -EOPNOTSUPP;
8550
ba3dd36c
PZ
8551 perf_swevent_init_hrtimer(event);
8552
b0a873eb 8553 return 0;
6fb2915d
LZ
8554}
8555
b0a873eb 8556static struct pmu perf_task_clock = {
89a1e187
PZ
8557 .task_ctx_nr = perf_sw_context,
8558
34f43927
PZ
8559 .capabilities = PERF_PMU_CAP_NO_NMI,
8560
b0a873eb 8561 .event_init = task_clock_event_init,
a4eaf7f1
PZ
8562 .add = task_clock_event_add,
8563 .del = task_clock_event_del,
8564 .start = task_clock_event_start,
8565 .stop = task_clock_event_stop,
b0a873eb
PZ
8566 .read = task_clock_event_read,
8567};
6fb2915d 8568
ad5133b7 8569static void perf_pmu_nop_void(struct pmu *pmu)
e077df4f 8570{
e077df4f 8571}
6fb2915d 8572
fbbe0701
SB
8573static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
8574{
8575}
8576
ad5133b7 8577static int perf_pmu_nop_int(struct pmu *pmu)
6fb2915d 8578{
ad5133b7 8579 return 0;
6fb2915d
LZ
8580}
8581
18ab2cd3 8582static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
fbbe0701
SB
8583
8584static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
6fb2915d 8585{
fbbe0701
SB
8586 __this_cpu_write(nop_txn_flags, flags);
8587
8588 if (flags & ~PERF_PMU_TXN_ADD)
8589 return;
8590
ad5133b7 8591 perf_pmu_disable(pmu);
6fb2915d
LZ
8592}
8593
ad5133b7
PZ
8594static int perf_pmu_commit_txn(struct pmu *pmu)
8595{
fbbe0701
SB
8596 unsigned int flags = __this_cpu_read(nop_txn_flags);
8597
8598 __this_cpu_write(nop_txn_flags, 0);
8599
8600 if (flags & ~PERF_PMU_TXN_ADD)
8601 return 0;
8602
ad5133b7
PZ
8603 perf_pmu_enable(pmu);
8604 return 0;
8605}
e077df4f 8606
ad5133b7 8607static void perf_pmu_cancel_txn(struct pmu *pmu)
24f1e32c 8608{
fbbe0701
SB
8609 unsigned int flags = __this_cpu_read(nop_txn_flags);
8610
8611 __this_cpu_write(nop_txn_flags, 0);
8612
8613 if (flags & ~PERF_PMU_TXN_ADD)
8614 return;
8615
ad5133b7 8616 perf_pmu_enable(pmu);
24f1e32c
FW
8617}
8618
35edc2a5
PZ
8619static int perf_event_idx_default(struct perf_event *event)
8620{
c719f560 8621 return 0;
35edc2a5
PZ
8622}
8623
8dc85d54
PZ
8624/*
8625 * Ensures all contexts with the same task_ctx_nr have the same
8626 * pmu_cpu_context too.
8627 */
9e317041 8628static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
24f1e32c 8629{
8dc85d54 8630 struct pmu *pmu;
b326e956 8631
8dc85d54
PZ
8632 if (ctxn < 0)
8633 return NULL;
24f1e32c 8634
8dc85d54
PZ
8635 list_for_each_entry(pmu, &pmus, entry) {
8636 if (pmu->task_ctx_nr == ctxn)
8637 return pmu->pmu_cpu_context;
8638 }
24f1e32c 8639
8dc85d54 8640 return NULL;
24f1e32c
FW
8641}
8642
51676957
PZ
8643static void free_pmu_context(struct pmu *pmu)
8644{
8dc85d54 8645 mutex_lock(&pmus_lock);
51676957 8646 free_percpu(pmu->pmu_cpu_context);
8dc85d54 8647 mutex_unlock(&pmus_lock);
24f1e32c 8648}
6e855cd4
AS
8649
8650/*
8651 * Let userspace know that this PMU supports address range filtering:
8652 */
8653static ssize_t nr_addr_filters_show(struct device *dev,
8654 struct device_attribute *attr,
8655 char *page)
8656{
8657 struct pmu *pmu = dev_get_drvdata(dev);
8658
8659 return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
8660}
8661DEVICE_ATTR_RO(nr_addr_filters);
8662
2e80a82a 8663static struct idr pmu_idr;
d6d020e9 8664
abe43400
PZ
8665static ssize_t
8666type_show(struct device *dev, struct device_attribute *attr, char *page)
8667{
8668 struct pmu *pmu = dev_get_drvdata(dev);
8669
8670 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
8671}
90826ca7 8672static DEVICE_ATTR_RO(type);
abe43400 8673
62b85639
SE
8674static ssize_t
8675perf_event_mux_interval_ms_show(struct device *dev,
8676 struct device_attribute *attr,
8677 char *page)
8678{
8679 struct pmu *pmu = dev_get_drvdata(dev);
8680
8681 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
8682}
8683
272325c4
PZ
8684static DEFINE_MUTEX(mux_interval_mutex);
8685
62b85639
SE
8686static ssize_t
8687perf_event_mux_interval_ms_store(struct device *dev,
8688 struct device_attribute *attr,
8689 const char *buf, size_t count)
8690{
8691 struct pmu *pmu = dev_get_drvdata(dev);
8692 int timer, cpu, ret;
8693
8694 ret = kstrtoint(buf, 0, &timer);
8695 if (ret)
8696 return ret;
8697
8698 if (timer < 1)
8699 return -EINVAL;
8700
8701 /* same value, noting to do */
8702 if (timer == pmu->hrtimer_interval_ms)
8703 return count;
8704
272325c4 8705 mutex_lock(&mux_interval_mutex);
62b85639
SE
8706 pmu->hrtimer_interval_ms = timer;
8707
8708 /* update all cpuctx for this PMU */
272325c4
PZ
8709 get_online_cpus();
8710 for_each_online_cpu(cpu) {
62b85639
SE
8711 struct perf_cpu_context *cpuctx;
8712 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
8713 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
8714
272325c4
PZ
8715 cpu_function_call(cpu,
8716 (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
62b85639 8717 }
272325c4
PZ
8718 put_online_cpus();
8719 mutex_unlock(&mux_interval_mutex);
62b85639
SE
8720
8721 return count;
8722}
90826ca7 8723static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
62b85639 8724
90826ca7
GKH
8725static struct attribute *pmu_dev_attrs[] = {
8726 &dev_attr_type.attr,
8727 &dev_attr_perf_event_mux_interval_ms.attr,
8728 NULL,
abe43400 8729};
90826ca7 8730ATTRIBUTE_GROUPS(pmu_dev);
abe43400
PZ
8731
8732static int pmu_bus_running;
8733static struct bus_type pmu_bus = {
8734 .name = "event_source",
90826ca7 8735 .dev_groups = pmu_dev_groups,
abe43400
PZ
8736};
8737
8738static void pmu_dev_release(struct device *dev)
8739{
8740 kfree(dev);
8741}
8742
8743static int pmu_dev_alloc(struct pmu *pmu)
8744{
8745 int ret = -ENOMEM;
8746
8747 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
8748 if (!pmu->dev)
8749 goto out;
8750
0c9d42ed 8751 pmu->dev->groups = pmu->attr_groups;
abe43400
PZ
8752 device_initialize(pmu->dev);
8753 ret = dev_set_name(pmu->dev, "%s", pmu->name);
8754 if (ret)
8755 goto free_dev;
8756
8757 dev_set_drvdata(pmu->dev, pmu);
8758 pmu->dev->bus = &pmu_bus;
8759 pmu->dev->release = pmu_dev_release;
8760 ret = device_add(pmu->dev);
8761 if (ret)
8762 goto free_dev;
8763
6e855cd4
AS
8764 /* For PMUs with address filters, throw in an extra attribute: */
8765 if (pmu->nr_addr_filters)
8766 ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
8767
8768 if (ret)
8769 goto del_dev;
8770
abe43400
PZ
8771out:
8772 return ret;
8773
6e855cd4
AS
8774del_dev:
8775 device_del(pmu->dev);
8776
abe43400
PZ
8777free_dev:
8778 put_device(pmu->dev);
8779 goto out;
8780}
8781
547e9fd7 8782static struct lock_class_key cpuctx_mutex;
facc4307 8783static struct lock_class_key cpuctx_lock;
547e9fd7 8784
03d8e80b 8785int perf_pmu_register(struct pmu *pmu, const char *name, int type)
24f1e32c 8786{
108b02cf 8787 int cpu, ret;
24f1e32c 8788
b0a873eb 8789 mutex_lock(&pmus_lock);
33696fc0
PZ
8790 ret = -ENOMEM;
8791 pmu->pmu_disable_count = alloc_percpu(int);
8792 if (!pmu->pmu_disable_count)
8793 goto unlock;
f29ac756 8794
2e80a82a
PZ
8795 pmu->type = -1;
8796 if (!name)
8797 goto skip_type;
8798 pmu->name = name;
8799
8800 if (type < 0) {
0e9c3be2
TH
8801 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
8802 if (type < 0) {
8803 ret = type;
2e80a82a
PZ
8804 goto free_pdc;
8805 }
8806 }
8807 pmu->type = type;
8808
abe43400
PZ
8809 if (pmu_bus_running) {
8810 ret = pmu_dev_alloc(pmu);
8811 if (ret)
8812 goto free_idr;
8813 }
8814
2e80a82a 8815skip_type:
26657848
PZ
8816 if (pmu->task_ctx_nr == perf_hw_context) {
8817 static int hw_context_taken = 0;
8818
5101ef20
MR
8819 /*
8820 * Other than systems with heterogeneous CPUs, it never makes
8821 * sense for two PMUs to share perf_hw_context. PMUs which are
8822 * uncore must use perf_invalid_context.
8823 */
8824 if (WARN_ON_ONCE(hw_context_taken &&
8825 !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
26657848
PZ
8826 pmu->task_ctx_nr = perf_invalid_context;
8827
8828 hw_context_taken = 1;
8829 }
8830
8dc85d54
PZ
8831 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
8832 if (pmu->pmu_cpu_context)
8833 goto got_cpu_context;
f29ac756 8834
c4814202 8835 ret = -ENOMEM;
108b02cf
PZ
8836 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
8837 if (!pmu->pmu_cpu_context)
abe43400 8838 goto free_dev;
f344011c 8839
108b02cf
PZ
8840 for_each_possible_cpu(cpu) {
8841 struct perf_cpu_context *cpuctx;
8842
8843 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
eb184479 8844 __perf_event_init_context(&cpuctx->ctx);
547e9fd7 8845 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
facc4307 8846 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
108b02cf 8847 cpuctx->ctx.pmu = pmu;
9e630205 8848
272325c4 8849 __perf_mux_hrtimer_init(cpuctx, cpu);
108b02cf 8850 }
76e1d904 8851
8dc85d54 8852got_cpu_context:
ad5133b7
PZ
8853 if (!pmu->start_txn) {
8854 if (pmu->pmu_enable) {
8855 /*
8856 * If we have pmu_enable/pmu_disable calls, install
8857 * transaction stubs that use that to try and batch
8858 * hardware accesses.
8859 */
8860 pmu->start_txn = perf_pmu_start_txn;
8861 pmu->commit_txn = perf_pmu_commit_txn;
8862 pmu->cancel_txn = perf_pmu_cancel_txn;
8863 } else {
fbbe0701 8864 pmu->start_txn = perf_pmu_nop_txn;
ad5133b7
PZ
8865 pmu->commit_txn = perf_pmu_nop_int;
8866 pmu->cancel_txn = perf_pmu_nop_void;
f344011c 8867 }
5c92d124 8868 }
15dbf27c 8869
ad5133b7
PZ
8870 if (!pmu->pmu_enable) {
8871 pmu->pmu_enable = perf_pmu_nop_void;
8872 pmu->pmu_disable = perf_pmu_nop_void;
8873 }
8874
35edc2a5
PZ
8875 if (!pmu->event_idx)
8876 pmu->event_idx = perf_event_idx_default;
8877
b0a873eb 8878 list_add_rcu(&pmu->entry, &pmus);
bed5b25a 8879 atomic_set(&pmu->exclusive_cnt, 0);
33696fc0
PZ
8880 ret = 0;
8881unlock:
b0a873eb
PZ
8882 mutex_unlock(&pmus_lock);
8883
33696fc0 8884 return ret;
108b02cf 8885
abe43400
PZ
8886free_dev:
8887 device_del(pmu->dev);
8888 put_device(pmu->dev);
8889
2e80a82a
PZ
8890free_idr:
8891 if (pmu->type >= PERF_TYPE_MAX)
8892 idr_remove(&pmu_idr, pmu->type);
8893
108b02cf
PZ
8894free_pdc:
8895 free_percpu(pmu->pmu_disable_count);
8896 goto unlock;
f29ac756 8897}
c464c76e 8898EXPORT_SYMBOL_GPL(perf_pmu_register);
f29ac756 8899
b0a873eb 8900void perf_pmu_unregister(struct pmu *pmu)
5c92d124 8901{
0933840a
JO
8902 int remove_device;
8903
b0a873eb 8904 mutex_lock(&pmus_lock);
0933840a 8905 remove_device = pmu_bus_running;
b0a873eb
PZ
8906 list_del_rcu(&pmu->entry);
8907 mutex_unlock(&pmus_lock);
5c92d124 8908
0475f9ea 8909 /*
cde8e884
PZ
8910 * We dereference the pmu list under both SRCU and regular RCU, so
8911 * synchronize against both of those.
0475f9ea 8912 */
b0a873eb 8913 synchronize_srcu(&pmus_srcu);
cde8e884 8914 synchronize_rcu();
d6d020e9 8915
33696fc0 8916 free_percpu(pmu->pmu_disable_count);
2e80a82a
PZ
8917 if (pmu->type >= PERF_TYPE_MAX)
8918 idr_remove(&pmu_idr, pmu->type);
0933840a
JO
8919 if (remove_device) {
8920 if (pmu->nr_addr_filters)
8921 device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
8922 device_del(pmu->dev);
8923 put_device(pmu->dev);
8924 }
51676957 8925 free_pmu_context(pmu);
b0a873eb 8926}
c464c76e 8927EXPORT_SYMBOL_GPL(perf_pmu_unregister);
d6d020e9 8928
cc34b98b
MR
8929static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
8930{
ccd41c86 8931 struct perf_event_context *ctx = NULL;
cc34b98b
MR
8932 int ret;
8933
8934 if (!try_module_get(pmu->module))
8935 return -ENODEV;
ccd41c86
PZ
8936
8937 if (event->group_leader != event) {
8b10c5e2
PZ
8938 /*
8939 * This ctx->mutex can nest when we're called through
8940 * inheritance. See the perf_event_ctx_lock_nested() comment.
8941 */
8942 ctx = perf_event_ctx_lock_nested(event->group_leader,
8943 SINGLE_DEPTH_NESTING);
ccd41c86
PZ
8944 BUG_ON(!ctx);
8945 }
8946
cc34b98b
MR
8947 event->pmu = pmu;
8948 ret = pmu->event_init(event);
ccd41c86
PZ
8949
8950 if (ctx)
8951 perf_event_ctx_unlock(event->group_leader, ctx);
8952
cc34b98b
MR
8953 if (ret)
8954 module_put(pmu->module);
8955
8956 return ret;
8957}
8958
18ab2cd3 8959static struct pmu *perf_init_event(struct perf_event *event)
b0a873eb
PZ
8960{
8961 struct pmu *pmu = NULL;
8962 int idx;
940c5b29 8963 int ret;
b0a873eb
PZ
8964
8965 idx = srcu_read_lock(&pmus_srcu);
2e80a82a
PZ
8966
8967 rcu_read_lock();
8968 pmu = idr_find(&pmu_idr, event->attr.type);
8969 rcu_read_unlock();
940c5b29 8970 if (pmu) {
cc34b98b 8971 ret = perf_try_init_event(pmu, event);
940c5b29
LM
8972 if (ret)
8973 pmu = ERR_PTR(ret);
2e80a82a 8974 goto unlock;
940c5b29 8975 }
2e80a82a 8976
b0a873eb 8977 list_for_each_entry_rcu(pmu, &pmus, entry) {
cc34b98b 8978 ret = perf_try_init_event(pmu, event);
b0a873eb 8979 if (!ret)
e5f4d339 8980 goto unlock;
76e1d904 8981
b0a873eb
PZ
8982 if (ret != -ENOENT) {
8983 pmu = ERR_PTR(ret);
e5f4d339 8984 goto unlock;
f344011c 8985 }
5c92d124 8986 }
e5f4d339
PZ
8987 pmu = ERR_PTR(-ENOENT);
8988unlock:
b0a873eb 8989 srcu_read_unlock(&pmus_srcu, idx);
15dbf27c 8990
4aeb0b42 8991 return pmu;
5c92d124
IM
8992}
8993
f2fb6bef
KL
8994static void attach_sb_event(struct perf_event *event)
8995{
8996 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
8997
8998 raw_spin_lock(&pel->lock);
8999 list_add_rcu(&event->sb_list, &pel->list);
9000 raw_spin_unlock(&pel->lock);
9001}
9002
aab5b71e
PZ
9003/*
9004 * We keep a list of all !task (and therefore per-cpu) events
9005 * that need to receive side-band records.
9006 *
9007 * This avoids having to scan all the various PMU per-cpu contexts
9008 * looking for them.
9009 */
f2fb6bef
KL
9010static void account_pmu_sb_event(struct perf_event *event)
9011{
a4f144eb 9012 if (is_sb_event(event))
f2fb6bef
KL
9013 attach_sb_event(event);
9014}
9015
4beb31f3
FW
9016static void account_event_cpu(struct perf_event *event, int cpu)
9017{
9018 if (event->parent)
9019 return;
9020
4beb31f3
FW
9021 if (is_cgroup_event(event))
9022 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
9023}
9024
555e0c1e
FW
9025/* Freq events need the tick to stay alive (see perf_event_task_tick). */
9026static void account_freq_event_nohz(void)
9027{
9028#ifdef CONFIG_NO_HZ_FULL
9029 /* Lock so we don't race with concurrent unaccount */
9030 spin_lock(&nr_freq_lock);
9031 if (atomic_inc_return(&nr_freq_events) == 1)
9032 tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
9033 spin_unlock(&nr_freq_lock);
9034#endif
9035}
9036
9037static void account_freq_event(void)
9038{
9039 if (tick_nohz_full_enabled())
9040 account_freq_event_nohz();
9041 else
9042 atomic_inc(&nr_freq_events);
9043}
9044
9045
766d6c07
FW
9046static void account_event(struct perf_event *event)
9047{
25432ae9
PZ
9048 bool inc = false;
9049
4beb31f3
FW
9050 if (event->parent)
9051 return;
9052
766d6c07 9053 if (event->attach_state & PERF_ATTACH_TASK)
25432ae9 9054 inc = true;
766d6c07
FW
9055 if (event->attr.mmap || event->attr.mmap_data)
9056 atomic_inc(&nr_mmap_events);
9057 if (event->attr.comm)
9058 atomic_inc(&nr_comm_events);
9059 if (event->attr.task)
9060 atomic_inc(&nr_task_events);
555e0c1e
FW
9061 if (event->attr.freq)
9062 account_freq_event();
45ac1403
AH
9063 if (event->attr.context_switch) {
9064 atomic_inc(&nr_switch_events);
25432ae9 9065 inc = true;
45ac1403 9066 }
4beb31f3 9067 if (has_branch_stack(event))
25432ae9 9068 inc = true;
4beb31f3 9069 if (is_cgroup_event(event))
25432ae9
PZ
9070 inc = true;
9071
9107c89e
PZ
9072 if (inc) {
9073 if (atomic_inc_not_zero(&perf_sched_count))
9074 goto enabled;
9075
9076 mutex_lock(&perf_sched_mutex);
9077 if (!atomic_read(&perf_sched_count)) {
9078 static_branch_enable(&perf_sched_events);
9079 /*
9080 * Guarantee that all CPUs observe they key change and
9081 * call the perf scheduling hooks before proceeding to
9082 * install events that need them.
9083 */
9084 synchronize_sched();
9085 }
9086 /*
9087 * Now that we have waited for the sync_sched(), allow further
9088 * increments to by-pass the mutex.
9089 */
9090 atomic_inc(&perf_sched_count);
9091 mutex_unlock(&perf_sched_mutex);
9092 }
9093enabled:
4beb31f3
FW
9094
9095 account_event_cpu(event, event->cpu);
f2fb6bef
KL
9096
9097 account_pmu_sb_event(event);
766d6c07
FW
9098}
9099
0793a61d 9100/*
cdd6c482 9101 * Allocate and initialize a event structure
0793a61d 9102 */
cdd6c482 9103static struct perf_event *
c3f00c70 9104perf_event_alloc(struct perf_event_attr *attr, int cpu,
d580ff86
PZ
9105 struct task_struct *task,
9106 struct perf_event *group_leader,
9107 struct perf_event *parent_event,
4dc0da86 9108 perf_overflow_handler_t overflow_handler,
79dff51e 9109 void *context, int cgroup_fd)
0793a61d 9110{
51b0fe39 9111 struct pmu *pmu;
cdd6c482
IM
9112 struct perf_event *event;
9113 struct hw_perf_event *hwc;
90983b16 9114 long err = -EINVAL;
0793a61d 9115
66832eb4
ON
9116 if ((unsigned)cpu >= nr_cpu_ids) {
9117 if (!task || cpu != -1)
9118 return ERR_PTR(-EINVAL);
9119 }
9120
c3f00c70 9121 event = kzalloc(sizeof(*event), GFP_KERNEL);
cdd6c482 9122 if (!event)
d5d2bc0d 9123 return ERR_PTR(-ENOMEM);
0793a61d 9124
04289bb9 9125 /*
cdd6c482 9126 * Single events are their own group leaders, with an
04289bb9
IM
9127 * empty sibling list:
9128 */
9129 if (!group_leader)
cdd6c482 9130 group_leader = event;
04289bb9 9131
cdd6c482
IM
9132 mutex_init(&event->child_mutex);
9133 INIT_LIST_HEAD(&event->child_list);
fccc714b 9134
cdd6c482
IM
9135 INIT_LIST_HEAD(&event->group_entry);
9136 INIT_LIST_HEAD(&event->event_entry);
9137 INIT_LIST_HEAD(&event->sibling_list);
10c6db11 9138 INIT_LIST_HEAD(&event->rb_entry);
71ad88ef 9139 INIT_LIST_HEAD(&event->active_entry);
375637bc 9140 INIT_LIST_HEAD(&event->addr_filters.list);
f3ae75de
SE
9141 INIT_HLIST_NODE(&event->hlist_entry);
9142
10c6db11 9143
cdd6c482 9144 init_waitqueue_head(&event->waitq);
e360adbe 9145 init_irq_work(&event->pending, perf_pending_event);
0793a61d 9146
cdd6c482 9147 mutex_init(&event->mmap_mutex);
375637bc 9148 raw_spin_lock_init(&event->addr_filters.lock);
7b732a75 9149
a6fa941d 9150 atomic_long_set(&event->refcount, 1);
cdd6c482
IM
9151 event->cpu = cpu;
9152 event->attr = *attr;
9153 event->group_leader = group_leader;
9154 event->pmu = NULL;
cdd6c482 9155 event->oncpu = -1;
a96bbc16 9156
cdd6c482 9157 event->parent = parent_event;
b84fbc9f 9158
17cf22c3 9159 event->ns = get_pid_ns(task_active_pid_ns(current));
cdd6c482 9160 event->id = atomic64_inc_return(&perf_event_id);
a96bbc16 9161
cdd6c482 9162 event->state = PERF_EVENT_STATE_INACTIVE;
329d876d 9163
d580ff86
PZ
9164 if (task) {
9165 event->attach_state = PERF_ATTACH_TASK;
d580ff86 9166 /*
50f16a8b
PZ
9167 * XXX pmu::event_init needs to know what task to account to
9168 * and we cannot use the ctx information because we need the
9169 * pmu before we get a ctx.
d580ff86 9170 */
50f16a8b 9171 event->hw.target = task;
d580ff86
PZ
9172 }
9173
34f43927
PZ
9174 event->clock = &local_clock;
9175 if (parent_event)
9176 event->clock = parent_event->clock;
9177
4dc0da86 9178 if (!overflow_handler && parent_event) {
b326e956 9179 overflow_handler = parent_event->overflow_handler;
4dc0da86 9180 context = parent_event->overflow_handler_context;
f1e4ba5b 9181#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
aa6a5f3c
AS
9182 if (overflow_handler == bpf_overflow_handler) {
9183 struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
9184
9185 if (IS_ERR(prog)) {
9186 err = PTR_ERR(prog);
9187 goto err_ns;
9188 }
9189 event->prog = prog;
9190 event->orig_overflow_handler =
9191 parent_event->orig_overflow_handler;
9192 }
9193#endif
4dc0da86 9194 }
66832eb4 9195
1879445d
WN
9196 if (overflow_handler) {
9197 event->overflow_handler = overflow_handler;
9198 event->overflow_handler_context = context;
9ecda41a
WN
9199 } else if (is_write_backward(event)){
9200 event->overflow_handler = perf_event_output_backward;
9201 event->overflow_handler_context = NULL;
1879445d 9202 } else {
9ecda41a 9203 event->overflow_handler = perf_event_output_forward;
1879445d
WN
9204 event->overflow_handler_context = NULL;
9205 }
97eaf530 9206
0231bb53 9207 perf_event__state_init(event);
a86ed508 9208
4aeb0b42 9209 pmu = NULL;
b8e83514 9210
cdd6c482 9211 hwc = &event->hw;
bd2b5b12 9212 hwc->sample_period = attr->sample_period;
0d48696f 9213 if (attr->freq && attr->sample_freq)
bd2b5b12 9214 hwc->sample_period = 1;
eced1dfc 9215 hwc->last_period = hwc->sample_period;
bd2b5b12 9216
e7850595 9217 local64_set(&hwc->period_left, hwc->sample_period);
60db5e09 9218
2023b359 9219 /*
cdd6c482 9220 * we currently do not support PERF_FORMAT_GROUP on inherited events
2023b359 9221 */
3dab77fb 9222 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
90983b16 9223 goto err_ns;
a46a2300
YZ
9224
9225 if (!has_branch_stack(event))
9226 event->attr.branch_sample_type = 0;
2023b359 9227
79dff51e
MF
9228 if (cgroup_fd != -1) {
9229 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
9230 if (err)
9231 goto err_ns;
9232 }
9233
b0a873eb 9234 pmu = perf_init_event(event);
4aeb0b42 9235 if (!pmu)
90983b16
FW
9236 goto err_ns;
9237 else if (IS_ERR(pmu)) {
4aeb0b42 9238 err = PTR_ERR(pmu);
90983b16 9239 goto err_ns;
621a01ea 9240 }
d5d2bc0d 9241
bed5b25a
AS
9242 err = exclusive_event_init(event);
9243 if (err)
9244 goto err_pmu;
9245
375637bc
AS
9246 if (has_addr_filter(event)) {
9247 event->addr_filters_offs = kcalloc(pmu->nr_addr_filters,
9248 sizeof(unsigned long),
9249 GFP_KERNEL);
9250 if (!event->addr_filters_offs)
9251 goto err_per_task;
9252
9253 /* force hw sync on the address filters */
9254 event->addr_filters_gen = 1;
9255 }
9256
cdd6c482 9257 if (!event->parent) {
927c7a9e 9258 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
97c79a38 9259 err = get_callchain_buffers(attr->sample_max_stack);
90983b16 9260 if (err)
375637bc 9261 goto err_addr_filters;
d010b332 9262 }
f344011c 9263 }
9ee318a7 9264
927a5570
AS
9265 /* symmetric to unaccount_event() in _free_event() */
9266 account_event(event);
9267
cdd6c482 9268 return event;
90983b16 9269
375637bc
AS
9270err_addr_filters:
9271 kfree(event->addr_filters_offs);
9272
bed5b25a
AS
9273err_per_task:
9274 exclusive_event_destroy(event);
9275
90983b16
FW
9276err_pmu:
9277 if (event->destroy)
9278 event->destroy(event);
c464c76e 9279 module_put(pmu->module);
90983b16 9280err_ns:
79dff51e
MF
9281 if (is_cgroup_event(event))
9282 perf_detach_cgroup(event);
90983b16
FW
9283 if (event->ns)
9284 put_pid_ns(event->ns);
9285 kfree(event);
9286
9287 return ERR_PTR(err);
0793a61d
TG
9288}
9289
cdd6c482
IM
9290static int perf_copy_attr(struct perf_event_attr __user *uattr,
9291 struct perf_event_attr *attr)
974802ea 9292{
974802ea 9293 u32 size;
cdf8073d 9294 int ret;
974802ea
PZ
9295
9296 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
9297 return -EFAULT;
9298
9299 /*
9300 * zero the full structure, so that a short copy will be nice.
9301 */
9302 memset(attr, 0, sizeof(*attr));
9303
9304 ret = get_user(size, &uattr->size);
9305 if (ret)
9306 return ret;
9307
9308 if (size > PAGE_SIZE) /* silly large */
9309 goto err_size;
9310
9311 if (!size) /* abi compat */
9312 size = PERF_ATTR_SIZE_VER0;
9313
9314 if (size < PERF_ATTR_SIZE_VER0)
9315 goto err_size;
9316
9317 /*
9318 * If we're handed a bigger struct than we know of,
cdf8073d
IS
9319 * ensure all the unknown bits are 0 - i.e. new
9320 * user-space does not rely on any kernel feature
9321 * extensions we dont know about yet.
974802ea
PZ
9322 */
9323 if (size > sizeof(*attr)) {
cdf8073d
IS
9324 unsigned char __user *addr;
9325 unsigned char __user *end;
9326 unsigned char val;
974802ea 9327
cdf8073d
IS
9328 addr = (void __user *)uattr + sizeof(*attr);
9329 end = (void __user *)uattr + size;
974802ea 9330
cdf8073d 9331 for (; addr < end; addr++) {
974802ea
PZ
9332 ret = get_user(val, addr);
9333 if (ret)
9334 return ret;
9335 if (val)
9336 goto err_size;
9337 }
b3e62e35 9338 size = sizeof(*attr);
974802ea
PZ
9339 }
9340
9341 ret = copy_from_user(attr, uattr, size);
9342 if (ret)
9343 return -EFAULT;
9344
cd757645 9345 if (attr->__reserved_1)
974802ea
PZ
9346 return -EINVAL;
9347
9348 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
9349 return -EINVAL;
9350
9351 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
9352 return -EINVAL;
9353
bce38cd5
SE
9354 if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
9355 u64 mask = attr->branch_sample_type;
9356
9357 /* only using defined bits */
9358 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
9359 return -EINVAL;
9360
9361 /* at least one branch bit must be set */
9362 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
9363 return -EINVAL;
9364
bce38cd5
SE
9365 /* propagate priv level, when not set for branch */
9366 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
9367
9368 /* exclude_kernel checked on syscall entry */
9369 if (!attr->exclude_kernel)
9370 mask |= PERF_SAMPLE_BRANCH_KERNEL;
9371
9372 if (!attr->exclude_user)
9373 mask |= PERF_SAMPLE_BRANCH_USER;
9374
9375 if (!attr->exclude_hv)
9376 mask |= PERF_SAMPLE_BRANCH_HV;
9377 /*
9378 * adjust user setting (for HW filter setup)
9379 */
9380 attr->branch_sample_type = mask;
9381 }
e712209a
SE
9382 /* privileged levels capture (kernel, hv): check permissions */
9383 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
2b923c8f
SE
9384 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
9385 return -EACCES;
bce38cd5 9386 }
4018994f 9387
c5ebcedb 9388 if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
4018994f 9389 ret = perf_reg_validate(attr->sample_regs_user);
c5ebcedb
JO
9390 if (ret)
9391 return ret;
9392 }
9393
9394 if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
9395 if (!arch_perf_have_user_stack_dump())
9396 return -ENOSYS;
9397
9398 /*
9399 * We have __u32 type for the size, but so far
9400 * we can only use __u16 as maximum due to the
9401 * __u16 sample size limit.
9402 */
9403 if (attr->sample_stack_user >= USHRT_MAX)
9404 ret = -EINVAL;
9405 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
9406 ret = -EINVAL;
9407 }
4018994f 9408
60e2364e
SE
9409 if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
9410 ret = perf_reg_validate(attr->sample_regs_intr);
974802ea
PZ
9411out:
9412 return ret;
9413
9414err_size:
9415 put_user(sizeof(*attr), &uattr->size);
9416 ret = -E2BIG;
9417 goto out;
9418}
9419
ac9721f3
PZ
9420static int
9421perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
a4be7c27 9422{
b69cf536 9423 struct ring_buffer *rb = NULL;
a4be7c27
PZ
9424 int ret = -EINVAL;
9425
ac9721f3 9426 if (!output_event)
a4be7c27
PZ
9427 goto set;
9428
ac9721f3
PZ
9429 /* don't allow circular references */
9430 if (event == output_event)
a4be7c27
PZ
9431 goto out;
9432
0f139300
PZ
9433 /*
9434 * Don't allow cross-cpu buffers
9435 */
9436 if (output_event->cpu != event->cpu)
9437 goto out;
9438
9439 /*
76369139 9440 * If its not a per-cpu rb, it must be the same task.
0f139300
PZ
9441 */
9442 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
9443 goto out;
9444
34f43927
PZ
9445 /*
9446 * Mixing clocks in the same buffer is trouble you don't need.
9447 */
9448 if (output_event->clock != event->clock)
9449 goto out;
9450
9ecda41a
WN
9451 /*
9452 * Either writing ring buffer from beginning or from end.
9453 * Mixing is not allowed.
9454 */
9455 if (is_write_backward(output_event) != is_write_backward(event))
9456 goto out;
9457
45bfb2e5
PZ
9458 /*
9459 * If both events generate aux data, they must be on the same PMU
9460 */
9461 if (has_aux(event) && has_aux(output_event) &&
9462 event->pmu != output_event->pmu)
9463 goto out;
9464
a4be7c27 9465set:
cdd6c482 9466 mutex_lock(&event->mmap_mutex);
ac9721f3
PZ
9467 /* Can't redirect output if we've got an active mmap() */
9468 if (atomic_read(&event->mmap_count))
9469 goto unlock;
a4be7c27 9470
ac9721f3 9471 if (output_event) {
76369139
FW
9472 /* get the rb we want to redirect to */
9473 rb = ring_buffer_get(output_event);
9474 if (!rb)
ac9721f3 9475 goto unlock;
a4be7c27
PZ
9476 }
9477
b69cf536 9478 ring_buffer_attach(event, rb);
9bb5d40c 9479
a4be7c27 9480 ret = 0;
ac9721f3
PZ
9481unlock:
9482 mutex_unlock(&event->mmap_mutex);
9483
a4be7c27 9484out:
a4be7c27
PZ
9485 return ret;
9486}
9487
f63a8daa
PZ
9488static void mutex_lock_double(struct mutex *a, struct mutex *b)
9489{
9490 if (b < a)
9491 swap(a, b);
9492
9493 mutex_lock(a);
9494 mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
9495}
9496
34f43927
PZ
9497static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
9498{
9499 bool nmi_safe = false;
9500
9501 switch (clk_id) {
9502 case CLOCK_MONOTONIC:
9503 event->clock = &ktime_get_mono_fast_ns;
9504 nmi_safe = true;
9505 break;
9506
9507 case CLOCK_MONOTONIC_RAW:
9508 event->clock = &ktime_get_raw_fast_ns;
9509 nmi_safe = true;
9510 break;
9511
9512 case CLOCK_REALTIME:
9513 event->clock = &ktime_get_real_ns;
9514 break;
9515
9516 case CLOCK_BOOTTIME:
9517 event->clock = &ktime_get_boot_ns;
9518 break;
9519
9520 case CLOCK_TAI:
9521 event->clock = &ktime_get_tai_ns;
9522 break;
9523
9524 default:
9525 return -EINVAL;
9526 }
9527
9528 if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
9529 return -EINVAL;
9530
9531 return 0;
9532}
9533
321027c1
PZ
9534/*
9535 * Variation on perf_event_ctx_lock_nested(), except we take two context
9536 * mutexes.
9537 */
9538static struct perf_event_context *
9539__perf_event_ctx_lock_double(struct perf_event *group_leader,
9540 struct perf_event_context *ctx)
9541{
9542 struct perf_event_context *gctx;
9543
9544again:
9545 rcu_read_lock();
9546 gctx = READ_ONCE(group_leader->ctx);
9547 if (!atomic_inc_not_zero(&gctx->refcount)) {
9548 rcu_read_unlock();
9549 goto again;
9550 }
9551 rcu_read_unlock();
9552
9553 mutex_lock_double(&gctx->mutex, &ctx->mutex);
9554
9555 if (group_leader->ctx != gctx) {
9556 mutex_unlock(&ctx->mutex);
9557 mutex_unlock(&gctx->mutex);
9558 put_ctx(gctx);
9559 goto again;
9560 }
9561
9562 return gctx;
9563}
9564
0793a61d 9565/**
cdd6c482 9566 * sys_perf_event_open - open a performance event, associate it to a task/cpu
9f66a381 9567 *
cdd6c482 9568 * @attr_uptr: event_id type attributes for monitoring/sampling
0793a61d 9569 * @pid: target pid
9f66a381 9570 * @cpu: target cpu
cdd6c482 9571 * @group_fd: group leader event fd
0793a61d 9572 */
cdd6c482
IM
9573SYSCALL_DEFINE5(perf_event_open,
9574 struct perf_event_attr __user *, attr_uptr,
2743a5b0 9575 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
0793a61d 9576{
b04243ef
PZ
9577 struct perf_event *group_leader = NULL, *output_event = NULL;
9578 struct perf_event *event, *sibling;
cdd6c482 9579 struct perf_event_attr attr;
f63a8daa 9580 struct perf_event_context *ctx, *uninitialized_var(gctx);
cdd6c482 9581 struct file *event_file = NULL;
2903ff01 9582 struct fd group = {NULL, 0};
38a81da2 9583 struct task_struct *task = NULL;
89a1e187 9584 struct pmu *pmu;
ea635c64 9585 int event_fd;
b04243ef 9586 int move_group = 0;
dc86cabe 9587 int err;
a21b0b35 9588 int f_flags = O_RDWR;
79dff51e 9589 int cgroup_fd = -1;
0793a61d 9590
2743a5b0 9591 /* for future expandability... */
e5d1367f 9592 if (flags & ~PERF_FLAG_ALL)
2743a5b0
PM
9593 return -EINVAL;
9594
dc86cabe
IM
9595 err = perf_copy_attr(attr_uptr, &attr);
9596 if (err)
9597 return err;
eab656ae 9598
0764771d
PZ
9599 if (!attr.exclude_kernel) {
9600 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
9601 return -EACCES;
9602 }
9603
df58ab24 9604 if (attr.freq) {
cdd6c482 9605 if (attr.sample_freq > sysctl_perf_event_sample_rate)
df58ab24 9606 return -EINVAL;
0819b2e3
PZ
9607 } else {
9608 if (attr.sample_period & (1ULL << 63))
9609 return -EINVAL;
df58ab24
PZ
9610 }
9611
97c79a38
ACM
9612 if (!attr.sample_max_stack)
9613 attr.sample_max_stack = sysctl_perf_event_max_stack;
9614
e5d1367f
SE
9615 /*
9616 * In cgroup mode, the pid argument is used to pass the fd
9617 * opened to the cgroup directory in cgroupfs. The cpu argument
9618 * designates the cpu on which to monitor threads from that
9619 * cgroup.
9620 */
9621 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
9622 return -EINVAL;
9623
a21b0b35
YD
9624 if (flags & PERF_FLAG_FD_CLOEXEC)
9625 f_flags |= O_CLOEXEC;
9626
9627 event_fd = get_unused_fd_flags(f_flags);
ea635c64
AV
9628 if (event_fd < 0)
9629 return event_fd;
9630
ac9721f3 9631 if (group_fd != -1) {
2903ff01
AV
9632 err = perf_fget_light(group_fd, &group);
9633 if (err)
d14b12d7 9634 goto err_fd;
2903ff01 9635 group_leader = group.file->private_data;
ac9721f3
PZ
9636 if (flags & PERF_FLAG_FD_OUTPUT)
9637 output_event = group_leader;
9638 if (flags & PERF_FLAG_FD_NO_GROUP)
9639 group_leader = NULL;
9640 }
9641
e5d1367f 9642 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
c6be5a5c
PZ
9643 task = find_lively_task_by_vpid(pid);
9644 if (IS_ERR(task)) {
9645 err = PTR_ERR(task);
9646 goto err_group_fd;
9647 }
9648 }
9649
1f4ee503
PZ
9650 if (task && group_leader &&
9651 group_leader->attr.inherit != attr.inherit) {
9652 err = -EINVAL;
9653 goto err_task;
9654 }
9655
fbfc623f
YZ
9656 get_online_cpus();
9657
79c9ce57
PZ
9658 if (task) {
9659 err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
9660 if (err)
9661 goto err_cpus;
9662
9663 /*
9664 * Reuse ptrace permission checks for now.
9665 *
9666 * We must hold cred_guard_mutex across this and any potential
9667 * perf_install_in_context() call for this new event to
9668 * serialize against exec() altering our credentials (and the
9669 * perf_event_exit_task() that could imply).
9670 */
9671 err = -EACCES;
9672 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
9673 goto err_cred;
9674 }
9675
79dff51e
MF
9676 if (flags & PERF_FLAG_PID_CGROUP)
9677 cgroup_fd = pid;
9678
4dc0da86 9679 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
79dff51e 9680 NULL, NULL, cgroup_fd);
d14b12d7
SE
9681 if (IS_ERR(event)) {
9682 err = PTR_ERR(event);
79c9ce57 9683 goto err_cred;
d14b12d7
SE
9684 }
9685
53b25335
VW
9686 if (is_sampling_event(event)) {
9687 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
a1396555 9688 err = -EOPNOTSUPP;
53b25335
VW
9689 goto err_alloc;
9690 }
9691 }
9692
89a1e187
PZ
9693 /*
9694 * Special case software events and allow them to be part of
9695 * any hardware group.
9696 */
9697 pmu = event->pmu;
b04243ef 9698
34f43927
PZ
9699 if (attr.use_clockid) {
9700 err = perf_event_set_clock(event, attr.clockid);
9701 if (err)
9702 goto err_alloc;
9703 }
9704
4ff6a8de
DCC
9705 if (pmu->task_ctx_nr == perf_sw_context)
9706 event->event_caps |= PERF_EV_CAP_SOFTWARE;
9707
b04243ef
PZ
9708 if (group_leader &&
9709 (is_software_event(event) != is_software_event(group_leader))) {
9710 if (is_software_event(event)) {
9711 /*
9712 * If event and group_leader are not both a software
9713 * event, and event is, then group leader is not.
9714 *
9715 * Allow the addition of software events to !software
9716 * groups, this is safe because software events never
9717 * fail to schedule.
9718 */
9719 pmu = group_leader->pmu;
9720 } else if (is_software_event(group_leader) &&
4ff6a8de 9721 (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
b04243ef
PZ
9722 /*
9723 * In case the group is a pure software group, and we
9724 * try to add a hardware event, move the whole group to
9725 * the hardware context.
9726 */
9727 move_group = 1;
9728 }
9729 }
89a1e187
PZ
9730
9731 /*
9732 * Get the target context (task or percpu):
9733 */
4af57ef2 9734 ctx = find_get_context(pmu, task, event);
89a1e187
PZ
9735 if (IS_ERR(ctx)) {
9736 err = PTR_ERR(ctx);
c6be5a5c 9737 goto err_alloc;
89a1e187
PZ
9738 }
9739
bed5b25a
AS
9740 if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
9741 err = -EBUSY;
9742 goto err_context;
9743 }
9744
ccff286d 9745 /*
cdd6c482 9746 * Look up the group leader (we will attach this event to it):
04289bb9 9747 */
ac9721f3 9748 if (group_leader) {
dc86cabe 9749 err = -EINVAL;
04289bb9 9750
04289bb9 9751 /*
ccff286d
IM
9752 * Do not allow a recursive hierarchy (this new sibling
9753 * becoming part of another group-sibling):
9754 */
9755 if (group_leader->group_leader != group_leader)
c3f00c70 9756 goto err_context;
34f43927
PZ
9757
9758 /* All events in a group should have the same clock */
9759 if (group_leader->clock != event->clock)
9760 goto err_context;
9761
ccff286d
IM
9762 /*
9763 * Do not allow to attach to a group in a different
9764 * task or CPU context:
04289bb9 9765 */
b04243ef 9766 if (move_group) {
c3c87e77
PZ
9767 /*
9768 * Make sure we're both on the same task, or both
9769 * per-cpu events.
9770 */
9771 if (group_leader->ctx->task != ctx->task)
9772 goto err_context;
9773
9774 /*
9775 * Make sure we're both events for the same CPU;
9776 * grouping events for different CPUs is broken; since
9777 * you can never concurrently schedule them anyhow.
9778 */
9779 if (group_leader->cpu != event->cpu)
b04243ef
PZ
9780 goto err_context;
9781 } else {
9782 if (group_leader->ctx != ctx)
9783 goto err_context;
9784 }
9785
3b6f9e5c
PM
9786 /*
9787 * Only a group leader can be exclusive or pinned
9788 */
0d48696f 9789 if (attr.exclusive || attr.pinned)
c3f00c70 9790 goto err_context;
ac9721f3
PZ
9791 }
9792
9793 if (output_event) {
9794 err = perf_event_set_output(event, output_event);
9795 if (err)
c3f00c70 9796 goto err_context;
ac9721f3 9797 }
0793a61d 9798
a21b0b35
YD
9799 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
9800 f_flags);
ea635c64
AV
9801 if (IS_ERR(event_file)) {
9802 err = PTR_ERR(event_file);
201c2f85 9803 event_file = NULL;
c3f00c70 9804 goto err_context;
ea635c64 9805 }
9b51f66d 9806
b04243ef 9807 if (move_group) {
321027c1
PZ
9808 gctx = __perf_event_ctx_lock_double(group_leader, ctx);
9809
84c4e620
PZ
9810 if (gctx->task == TASK_TOMBSTONE) {
9811 err = -ESRCH;
9812 goto err_locked;
9813 }
321027c1
PZ
9814
9815 /*
9816 * Check if we raced against another sys_perf_event_open() call
9817 * moving the software group underneath us.
9818 */
9819 if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
9820 /*
9821 * If someone moved the group out from under us, check
9822 * if this new event wound up on the same ctx, if so
9823 * its the regular !move_group case, otherwise fail.
9824 */
9825 if (gctx != ctx) {
9826 err = -EINVAL;
9827 goto err_locked;
9828 } else {
9829 perf_event_ctx_unlock(group_leader, gctx);
9830 move_group = 0;
9831 }
9832 }
f55fc2a5
PZ
9833 } else {
9834 mutex_lock(&ctx->mutex);
9835 }
9836
84c4e620
PZ
9837 if (ctx->task == TASK_TOMBSTONE) {
9838 err = -ESRCH;
9839 goto err_locked;
9840 }
9841
a723968c
PZ
9842 if (!perf_event_validate_size(event)) {
9843 err = -E2BIG;
9844 goto err_locked;
9845 }
9846
f55fc2a5
PZ
9847 /*
9848 * Must be under the same ctx::mutex as perf_install_in_context(),
9849 * because we need to serialize with concurrent event creation.
9850 */
9851 if (!exclusive_event_installable(event, ctx)) {
9852 /* exclusive and group stuff are assumed mutually exclusive */
9853 WARN_ON_ONCE(move_group);
f63a8daa 9854
f55fc2a5
PZ
9855 err = -EBUSY;
9856 goto err_locked;
9857 }
f63a8daa 9858
f55fc2a5
PZ
9859 WARN_ON_ONCE(ctx->parent_ctx);
9860
79c9ce57
PZ
9861 /*
9862 * This is the point on no return; we cannot fail hereafter. This is
9863 * where we start modifying current state.
9864 */
9865
f55fc2a5 9866 if (move_group) {
f63a8daa
PZ
9867 /*
9868 * See perf_event_ctx_lock() for comments on the details
9869 * of swizzling perf_event::ctx.
9870 */
45a0e07a 9871 perf_remove_from_context(group_leader, 0);
0231bb53 9872
b04243ef
PZ
9873 list_for_each_entry(sibling, &group_leader->sibling_list,
9874 group_entry) {
45a0e07a 9875 perf_remove_from_context(sibling, 0);
b04243ef
PZ
9876 put_ctx(gctx);
9877 }
b04243ef 9878
f63a8daa
PZ
9879 /*
9880 * Wait for everybody to stop referencing the events through
9881 * the old lists, before installing it on new lists.
9882 */
0cda4c02 9883 synchronize_rcu();
f63a8daa 9884
8f95b435
PZI
9885 /*
9886 * Install the group siblings before the group leader.
9887 *
9888 * Because a group leader will try and install the entire group
9889 * (through the sibling list, which is still in-tact), we can
9890 * end up with siblings installed in the wrong context.
9891 *
9892 * By installing siblings first we NO-OP because they're not
9893 * reachable through the group lists.
9894 */
b04243ef
PZ
9895 list_for_each_entry(sibling, &group_leader->sibling_list,
9896 group_entry) {
8f95b435 9897 perf_event__state_init(sibling);
9fc81d87 9898 perf_install_in_context(ctx, sibling, sibling->cpu);
b04243ef
PZ
9899 get_ctx(ctx);
9900 }
8f95b435
PZI
9901
9902 /*
9903 * Removing from the context ends up with disabled
9904 * event. What we want here is event in the initial
9905 * startup state, ready to be add into new context.
9906 */
9907 perf_event__state_init(group_leader);
9908 perf_install_in_context(ctx, group_leader, group_leader->cpu);
9909 get_ctx(ctx);
b04243ef 9910
f55fc2a5
PZ
9911 /*
9912 * Now that all events are installed in @ctx, nothing
9913 * references @gctx anymore, so drop the last reference we have
9914 * on it.
9915 */
9916 put_ctx(gctx);
bed5b25a
AS
9917 }
9918
f73e22ab
PZ
9919 /*
9920 * Precalculate sample_data sizes; do while holding ctx::mutex such
9921 * that we're serialized against further additions and before
9922 * perf_install_in_context() which is the point the event is active and
9923 * can use these values.
9924 */
9925 perf_event__header_size(event);
9926 perf_event__id_header_size(event);
9927
78cd2c74
PZ
9928 event->owner = current;
9929
e2d37cd2 9930 perf_install_in_context(ctx, event, event->cpu);
fe4b04fa 9931 perf_unpin_context(ctx);
f63a8daa 9932
f55fc2a5 9933 if (move_group)
321027c1 9934 perf_event_ctx_unlock(group_leader, gctx);
d859e29f 9935 mutex_unlock(&ctx->mutex);
9b51f66d 9936
79c9ce57
PZ
9937 if (task) {
9938 mutex_unlock(&task->signal->cred_guard_mutex);
9939 put_task_struct(task);
9940 }
9941
fbfc623f
YZ
9942 put_online_cpus();
9943
cdd6c482
IM
9944 mutex_lock(&current->perf_event_mutex);
9945 list_add_tail(&event->owner_entry, &current->perf_event_list);
9946 mutex_unlock(&current->perf_event_mutex);
082ff5a2 9947
8a49542c
PZ
9948 /*
9949 * Drop the reference on the group_event after placing the
9950 * new event on the sibling_list. This ensures destruction
9951 * of the group leader will find the pointer to itself in
9952 * perf_group_detach().
9953 */
2903ff01 9954 fdput(group);
ea635c64
AV
9955 fd_install(event_fd, event_file);
9956 return event_fd;
0793a61d 9957
f55fc2a5
PZ
9958err_locked:
9959 if (move_group)
321027c1 9960 perf_event_ctx_unlock(group_leader, gctx);
f55fc2a5
PZ
9961 mutex_unlock(&ctx->mutex);
9962/* err_file: */
9963 fput(event_file);
c3f00c70 9964err_context:
fe4b04fa 9965 perf_unpin_context(ctx);
ea635c64 9966 put_ctx(ctx);
c6be5a5c 9967err_alloc:
13005627
PZ
9968 /*
9969 * If event_file is set, the fput() above will have called ->release()
9970 * and that will take care of freeing the event.
9971 */
9972 if (!event_file)
9973 free_event(event);
79c9ce57
PZ
9974err_cred:
9975 if (task)
9976 mutex_unlock(&task->signal->cred_guard_mutex);
1f4ee503 9977err_cpus:
fbfc623f 9978 put_online_cpus();
1f4ee503 9979err_task:
e7d0bc04
PZ
9980 if (task)
9981 put_task_struct(task);
89a1e187 9982err_group_fd:
2903ff01 9983 fdput(group);
ea635c64
AV
9984err_fd:
9985 put_unused_fd(event_fd);
dc86cabe 9986 return err;
0793a61d
TG
9987}
9988
fb0459d7
AV
9989/**
9990 * perf_event_create_kernel_counter
9991 *
9992 * @attr: attributes of the counter to create
9993 * @cpu: cpu in which the counter is bound
38a81da2 9994 * @task: task to profile (NULL for percpu)
fb0459d7
AV
9995 */
9996struct perf_event *
9997perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
38a81da2 9998 struct task_struct *task,
4dc0da86
AK
9999 perf_overflow_handler_t overflow_handler,
10000 void *context)
fb0459d7 10001{
fb0459d7 10002 struct perf_event_context *ctx;
c3f00c70 10003 struct perf_event *event;
fb0459d7 10004 int err;
d859e29f 10005
fb0459d7
AV
10006 /*
10007 * Get the target context (task or percpu):
10008 */
d859e29f 10009
4dc0da86 10010 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
79dff51e 10011 overflow_handler, context, -1);
c3f00c70
PZ
10012 if (IS_ERR(event)) {
10013 err = PTR_ERR(event);
10014 goto err;
10015 }
d859e29f 10016
f8697762 10017 /* Mark owner so we could distinguish it from user events. */
63b6da39 10018 event->owner = TASK_TOMBSTONE;
f8697762 10019
4af57ef2 10020 ctx = find_get_context(event->pmu, task, event);
c6567f64
FW
10021 if (IS_ERR(ctx)) {
10022 err = PTR_ERR(ctx);
c3f00c70 10023 goto err_free;
d859e29f 10024 }
fb0459d7 10025
fb0459d7
AV
10026 WARN_ON_ONCE(ctx->parent_ctx);
10027 mutex_lock(&ctx->mutex);
84c4e620
PZ
10028 if (ctx->task == TASK_TOMBSTONE) {
10029 err = -ESRCH;
10030 goto err_unlock;
10031 }
10032
bed5b25a 10033 if (!exclusive_event_installable(event, ctx)) {
bed5b25a 10034 err = -EBUSY;
84c4e620 10035 goto err_unlock;
bed5b25a
AS
10036 }
10037
fb0459d7 10038 perf_install_in_context(ctx, event, cpu);
fe4b04fa 10039 perf_unpin_context(ctx);
fb0459d7
AV
10040 mutex_unlock(&ctx->mutex);
10041
fb0459d7
AV
10042 return event;
10043
84c4e620
PZ
10044err_unlock:
10045 mutex_unlock(&ctx->mutex);
10046 perf_unpin_context(ctx);
10047 put_ctx(ctx);
c3f00c70
PZ
10048err_free:
10049 free_event(event);
10050err:
c6567f64 10051 return ERR_PTR(err);
9b51f66d 10052}
fb0459d7 10053EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
9b51f66d 10054
0cda4c02
YZ
10055void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
10056{
10057 struct perf_event_context *src_ctx;
10058 struct perf_event_context *dst_ctx;
10059 struct perf_event *event, *tmp;
10060 LIST_HEAD(events);
10061
10062 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
10063 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
10064
f63a8daa
PZ
10065 /*
10066 * See perf_event_ctx_lock() for comments on the details
10067 * of swizzling perf_event::ctx.
10068 */
10069 mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
0cda4c02
YZ
10070 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
10071 event_entry) {
45a0e07a 10072 perf_remove_from_context(event, 0);
9a545de0 10073 unaccount_event_cpu(event, src_cpu);
0cda4c02 10074 put_ctx(src_ctx);
9886167d 10075 list_add(&event->migrate_entry, &events);
0cda4c02 10076 }
0cda4c02 10077
8f95b435
PZI
10078 /*
10079 * Wait for the events to quiesce before re-instating them.
10080 */
0cda4c02
YZ
10081 synchronize_rcu();
10082
8f95b435
PZI
10083 /*
10084 * Re-instate events in 2 passes.
10085 *
10086 * Skip over group leaders and only install siblings on this first
10087 * pass, siblings will not get enabled without a leader, however a
10088 * leader will enable its siblings, even if those are still on the old
10089 * context.
10090 */
10091 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
10092 if (event->group_leader == event)
10093 continue;
10094
10095 list_del(&event->migrate_entry);
10096 if (event->state >= PERF_EVENT_STATE_OFF)
10097 event->state = PERF_EVENT_STATE_INACTIVE;
10098 account_event_cpu(event, dst_cpu);
10099 perf_install_in_context(dst_ctx, event, dst_cpu);
10100 get_ctx(dst_ctx);
10101 }
10102
10103 /*
10104 * Once all the siblings are setup properly, install the group leaders
10105 * to make it go.
10106 */
9886167d
PZ
10107 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
10108 list_del(&event->migrate_entry);
0cda4c02
YZ
10109 if (event->state >= PERF_EVENT_STATE_OFF)
10110 event->state = PERF_EVENT_STATE_INACTIVE;
9a545de0 10111 account_event_cpu(event, dst_cpu);
0cda4c02
YZ
10112 perf_install_in_context(dst_ctx, event, dst_cpu);
10113 get_ctx(dst_ctx);
10114 }
10115 mutex_unlock(&dst_ctx->mutex);
f63a8daa 10116 mutex_unlock(&src_ctx->mutex);
0cda4c02
YZ
10117}
10118EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
10119
cdd6c482 10120static void sync_child_event(struct perf_event *child_event,
38b200d6 10121 struct task_struct *child)
d859e29f 10122{
cdd6c482 10123 struct perf_event *parent_event = child_event->parent;
8bc20959 10124 u64 child_val;
d859e29f 10125
cdd6c482
IM
10126 if (child_event->attr.inherit_stat)
10127 perf_event_read_event(child_event, child);
38b200d6 10128
b5e58793 10129 child_val = perf_event_count(child_event);
d859e29f
PM
10130
10131 /*
10132 * Add back the child's count to the parent's count:
10133 */
a6e6dea6 10134 atomic64_add(child_val, &parent_event->child_count);
cdd6c482
IM
10135 atomic64_add(child_event->total_time_enabled,
10136 &parent_event->child_total_time_enabled);
10137 atomic64_add(child_event->total_time_running,
10138 &parent_event->child_total_time_running);
d859e29f
PM
10139}
10140
9b51f66d 10141static void
8ba289b8
PZ
10142perf_event_exit_event(struct perf_event *child_event,
10143 struct perf_event_context *child_ctx,
10144 struct task_struct *child)
9b51f66d 10145{
8ba289b8
PZ
10146 struct perf_event *parent_event = child_event->parent;
10147
1903d50c
PZ
10148 /*
10149 * Do not destroy the 'original' grouping; because of the context
10150 * switch optimization the original events could've ended up in a
10151 * random child task.
10152 *
10153 * If we were to destroy the original group, all group related
10154 * operations would cease to function properly after this random
10155 * child dies.
10156 *
10157 * Do destroy all inherited groups, we don't care about those
10158 * and being thorough is better.
10159 */
32132a3d
PZ
10160 raw_spin_lock_irq(&child_ctx->lock);
10161 WARN_ON_ONCE(child_ctx->is_active);
10162
8ba289b8 10163 if (parent_event)
32132a3d
PZ
10164 perf_group_detach(child_event);
10165 list_del_event(child_event, child_ctx);
a69b0ca4 10166 child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
32132a3d 10167 raw_spin_unlock_irq(&child_ctx->lock);
0cc0c027 10168
9b51f66d 10169 /*
8ba289b8 10170 * Parent events are governed by their filedesc, retain them.
9b51f66d 10171 */
8ba289b8 10172 if (!parent_event) {
179033b3 10173 perf_event_wakeup(child_event);
8ba289b8 10174 return;
4bcf349a 10175 }
8ba289b8
PZ
10176 /*
10177 * Child events can be cleaned up.
10178 */
10179
10180 sync_child_event(child_event, child);
10181
10182 /*
10183 * Remove this event from the parent's list
10184 */
10185 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
10186 mutex_lock(&parent_event->child_mutex);
10187 list_del_init(&child_event->child_list);
10188 mutex_unlock(&parent_event->child_mutex);
10189
10190 /*
10191 * Kick perf_poll() for is_event_hup().
10192 */
10193 perf_event_wakeup(parent_event);
10194 free_event(child_event);
10195 put_event(parent_event);
9b51f66d
IM
10196}
10197
8dc85d54 10198static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
9b51f66d 10199{
211de6eb 10200 struct perf_event_context *child_ctx, *clone_ctx = NULL;
63b6da39 10201 struct perf_event *child_event, *next;
63b6da39
PZ
10202
10203 WARN_ON_ONCE(child != current);
9b51f66d 10204
6a3351b6 10205 child_ctx = perf_pin_task_context(child, ctxn);
63b6da39 10206 if (!child_ctx)
9b51f66d
IM
10207 return;
10208
ad3a37de 10209 /*
6a3351b6
PZ
10210 * In order to reduce the amount of tricky in ctx tear-down, we hold
10211 * ctx::mutex over the entire thing. This serializes against almost
10212 * everything that wants to access the ctx.
10213 *
10214 * The exception is sys_perf_event_open() /
10215 * perf_event_create_kernel_count() which does find_get_context()
10216 * without ctx::mutex (it cannot because of the move_group double mutex
10217 * lock thing). See the comments in perf_install_in_context().
ad3a37de 10218 */
6a3351b6 10219 mutex_lock(&child_ctx->mutex);
c93f7669
PM
10220
10221 /*
6a3351b6
PZ
10222 * In a single ctx::lock section, de-schedule the events and detach the
10223 * context from the task such that we cannot ever get it scheduled back
10224 * in.
c93f7669 10225 */
6a3351b6 10226 raw_spin_lock_irq(&child_ctx->lock);
63b6da39 10227 task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
4a1c0f26 10228
71a851b4 10229 /*
63b6da39
PZ
10230 * Now that the context is inactive, destroy the task <-> ctx relation
10231 * and mark the context dead.
71a851b4 10232 */
63b6da39
PZ
10233 RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
10234 put_ctx(child_ctx); /* cannot be last */
10235 WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
10236 put_task_struct(current); /* cannot be last */
4a1c0f26 10237
211de6eb 10238 clone_ctx = unclone_ctx(child_ctx);
6a3351b6 10239 raw_spin_unlock_irq(&child_ctx->lock);
9f498cc5 10240
211de6eb
PZ
10241 if (clone_ctx)
10242 put_ctx(clone_ctx);
4a1c0f26 10243
9f498cc5 10244 /*
cdd6c482
IM
10245 * Report the task dead after unscheduling the events so that we
10246 * won't get any samples after PERF_RECORD_EXIT. We can however still
10247 * get a few PERF_RECORD_READ events.
9f498cc5 10248 */
cdd6c482 10249 perf_event_task(child, child_ctx, 0);
a63eaf34 10250
ebf905fc 10251 list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
8ba289b8 10252 perf_event_exit_event(child_event, child_ctx, child);
8bc20959 10253
a63eaf34
PM
10254 mutex_unlock(&child_ctx->mutex);
10255
10256 put_ctx(child_ctx);
9b51f66d
IM
10257}
10258
8dc85d54
PZ
10259/*
10260 * When a child task exits, feed back event values to parent events.
79c9ce57
PZ
10261 *
10262 * Can be called with cred_guard_mutex held when called from
10263 * install_exec_creds().
8dc85d54
PZ
10264 */
10265void perf_event_exit_task(struct task_struct *child)
10266{
8882135b 10267 struct perf_event *event, *tmp;
8dc85d54
PZ
10268 int ctxn;
10269
8882135b
PZ
10270 mutex_lock(&child->perf_event_mutex);
10271 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
10272 owner_entry) {
10273 list_del_init(&event->owner_entry);
10274
10275 /*
10276 * Ensure the list deletion is visible before we clear
10277 * the owner, closes a race against perf_release() where
10278 * we need to serialize on the owner->perf_event_mutex.
10279 */
f47c02c0 10280 smp_store_release(&event->owner, NULL);
8882135b
PZ
10281 }
10282 mutex_unlock(&child->perf_event_mutex);
10283
8dc85d54
PZ
10284 for_each_task_context_nr(ctxn)
10285 perf_event_exit_task_context(child, ctxn);
4e93ad60
JO
10286
10287 /*
10288 * The perf_event_exit_task_context calls perf_event_task
10289 * with child's task_ctx, which generates EXIT events for
10290 * child contexts and sets child->perf_event_ctxp[] to NULL.
10291 * At this point we need to send EXIT events to cpu contexts.
10292 */
10293 perf_event_task(child, NULL, 0);
8dc85d54
PZ
10294}
10295
889ff015
FW
10296static void perf_free_event(struct perf_event *event,
10297 struct perf_event_context *ctx)
10298{
10299 struct perf_event *parent = event->parent;
10300
10301 if (WARN_ON_ONCE(!parent))
10302 return;
10303
10304 mutex_lock(&parent->child_mutex);
10305 list_del_init(&event->child_list);
10306 mutex_unlock(&parent->child_mutex);
10307
a6fa941d 10308 put_event(parent);
889ff015 10309
652884fe 10310 raw_spin_lock_irq(&ctx->lock);
8a49542c 10311 perf_group_detach(event);
889ff015 10312 list_del_event(event, ctx);
652884fe 10313 raw_spin_unlock_irq(&ctx->lock);
889ff015
FW
10314 free_event(event);
10315}
10316
bbbee908 10317/*
652884fe 10318 * Free an unexposed, unused context as created by inheritance by
8dc85d54 10319 * perf_event_init_task below, used by fork() in case of fail.
652884fe
PZ
10320 *
10321 * Not all locks are strictly required, but take them anyway to be nice and
10322 * help out with the lockdep assertions.
bbbee908 10323 */
cdd6c482 10324void perf_event_free_task(struct task_struct *task)
bbbee908 10325{
8dc85d54 10326 struct perf_event_context *ctx;
cdd6c482 10327 struct perf_event *event, *tmp;
8dc85d54 10328 int ctxn;
bbbee908 10329
8dc85d54
PZ
10330 for_each_task_context_nr(ctxn) {
10331 ctx = task->perf_event_ctxp[ctxn];
10332 if (!ctx)
10333 continue;
bbbee908 10334
8dc85d54 10335 mutex_lock(&ctx->mutex);
bbbee908 10336again:
8dc85d54
PZ
10337 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
10338 group_entry)
10339 perf_free_event(event, ctx);
bbbee908 10340
8dc85d54
PZ
10341 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
10342 group_entry)
10343 perf_free_event(event, ctx);
bbbee908 10344
8dc85d54
PZ
10345 if (!list_empty(&ctx->pinned_groups) ||
10346 !list_empty(&ctx->flexible_groups))
10347 goto again;
bbbee908 10348
8dc85d54 10349 mutex_unlock(&ctx->mutex);
bbbee908 10350
8dc85d54
PZ
10351 put_ctx(ctx);
10352 }
889ff015
FW
10353}
10354
4e231c79
PZ
10355void perf_event_delayed_put(struct task_struct *task)
10356{
10357 int ctxn;
10358
10359 for_each_task_context_nr(ctxn)
10360 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
10361}
10362
e03e7ee3 10363struct file *perf_event_get(unsigned int fd)
ffe8690c 10364{
e03e7ee3 10365 struct file *file;
ffe8690c 10366
e03e7ee3
AS
10367 file = fget_raw(fd);
10368 if (!file)
10369 return ERR_PTR(-EBADF);
ffe8690c 10370
e03e7ee3
AS
10371 if (file->f_op != &perf_fops) {
10372 fput(file);
10373 return ERR_PTR(-EBADF);
10374 }
ffe8690c 10375
e03e7ee3 10376 return file;
ffe8690c
KX
10377}
10378
10379const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
10380{
10381 if (!event)
10382 return ERR_PTR(-EINVAL);
10383
10384 return &event->attr;
10385}
10386
97dee4f3
PZ
10387/*
10388 * inherit a event from parent task to child task:
10389 */
10390static struct perf_event *
10391inherit_event(struct perf_event *parent_event,
10392 struct task_struct *parent,
10393 struct perf_event_context *parent_ctx,
10394 struct task_struct *child,
10395 struct perf_event *group_leader,
10396 struct perf_event_context *child_ctx)
10397{
1929def9 10398 enum perf_event_active_state parent_state = parent_event->state;
97dee4f3 10399 struct perf_event *child_event;
cee010ec 10400 unsigned long flags;
97dee4f3
PZ
10401
10402 /*
10403 * Instead of creating recursive hierarchies of events,
10404 * we link inherited events back to the original parent,
10405 * which has a filp for sure, which we use as the reference
10406 * count:
10407 */
10408 if (parent_event->parent)
10409 parent_event = parent_event->parent;
10410
10411 child_event = perf_event_alloc(&parent_event->attr,
10412 parent_event->cpu,
d580ff86 10413 child,
97dee4f3 10414 group_leader, parent_event,
79dff51e 10415 NULL, NULL, -1);
97dee4f3
PZ
10416 if (IS_ERR(child_event))
10417 return child_event;
a6fa941d 10418
c6e5b732
PZ
10419 /*
10420 * is_orphaned_event() and list_add_tail(&parent_event->child_list)
10421 * must be under the same lock in order to serialize against
10422 * perf_event_release_kernel(), such that either we must observe
10423 * is_orphaned_event() or they will observe us on the child_list.
10424 */
10425 mutex_lock(&parent_event->child_mutex);
fadfe7be
JO
10426 if (is_orphaned_event(parent_event) ||
10427 !atomic_long_inc_not_zero(&parent_event->refcount)) {
c6e5b732 10428 mutex_unlock(&parent_event->child_mutex);
a6fa941d
AV
10429 free_event(child_event);
10430 return NULL;
10431 }
10432
97dee4f3
PZ
10433 get_ctx(child_ctx);
10434
10435 /*
10436 * Make the child state follow the state of the parent event,
10437 * not its attr.disabled bit. We hold the parent's mutex,
10438 * so we won't race with perf_event_{en, dis}able_family.
10439 */
1929def9 10440 if (parent_state >= PERF_EVENT_STATE_INACTIVE)
97dee4f3
PZ
10441 child_event->state = PERF_EVENT_STATE_INACTIVE;
10442 else
10443 child_event->state = PERF_EVENT_STATE_OFF;
10444
10445 if (parent_event->attr.freq) {
10446 u64 sample_period = parent_event->hw.sample_period;
10447 struct hw_perf_event *hwc = &child_event->hw;
10448
10449 hwc->sample_period = sample_period;
10450 hwc->last_period = sample_period;
10451
10452 local64_set(&hwc->period_left, sample_period);
10453 }
10454
10455 child_event->ctx = child_ctx;
10456 child_event->overflow_handler = parent_event->overflow_handler;
4dc0da86
AK
10457 child_event->overflow_handler_context
10458 = parent_event->overflow_handler_context;
97dee4f3 10459
614b6780
TG
10460 /*
10461 * Precalculate sample_data sizes
10462 */
10463 perf_event__header_size(child_event);
6844c09d 10464 perf_event__id_header_size(child_event);
614b6780 10465
97dee4f3
PZ
10466 /*
10467 * Link it up in the child's context:
10468 */
cee010ec 10469 raw_spin_lock_irqsave(&child_ctx->lock, flags);
97dee4f3 10470 add_event_to_ctx(child_event, child_ctx);
cee010ec 10471 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
97dee4f3 10472
97dee4f3
PZ
10473 /*
10474 * Link this into the parent event's child list
10475 */
97dee4f3
PZ
10476 list_add_tail(&child_event->child_list, &parent_event->child_list);
10477 mutex_unlock(&parent_event->child_mutex);
10478
10479 return child_event;
10480}
10481
10482static int inherit_group(struct perf_event *parent_event,
10483 struct task_struct *parent,
10484 struct perf_event_context *parent_ctx,
10485 struct task_struct *child,
10486 struct perf_event_context *child_ctx)
10487{
10488 struct perf_event *leader;
10489 struct perf_event *sub;
10490 struct perf_event *child_ctr;
10491
10492 leader = inherit_event(parent_event, parent, parent_ctx,
10493 child, NULL, child_ctx);
10494 if (IS_ERR(leader))
10495 return PTR_ERR(leader);
10496 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
10497 child_ctr = inherit_event(sub, parent, parent_ctx,
10498 child, leader, child_ctx);
10499 if (IS_ERR(child_ctr))
10500 return PTR_ERR(child_ctr);
10501 }
10502 return 0;
889ff015
FW
10503}
10504
10505static int
10506inherit_task_group(struct perf_event *event, struct task_struct *parent,
10507 struct perf_event_context *parent_ctx,
8dc85d54 10508 struct task_struct *child, int ctxn,
889ff015
FW
10509 int *inherited_all)
10510{
10511 int ret;
8dc85d54 10512 struct perf_event_context *child_ctx;
889ff015
FW
10513
10514 if (!event->attr.inherit) {
10515 *inherited_all = 0;
10516 return 0;
bbbee908
PZ
10517 }
10518
fe4b04fa 10519 child_ctx = child->perf_event_ctxp[ctxn];
889ff015
FW
10520 if (!child_ctx) {
10521 /*
10522 * This is executed from the parent task context, so
10523 * inherit events that have been marked for cloning.
10524 * First allocate and initialize a context for the
10525 * child.
10526 */
bbbee908 10527
734df5ab 10528 child_ctx = alloc_perf_context(parent_ctx->pmu, child);
889ff015
FW
10529 if (!child_ctx)
10530 return -ENOMEM;
bbbee908 10531
8dc85d54 10532 child->perf_event_ctxp[ctxn] = child_ctx;
889ff015
FW
10533 }
10534
10535 ret = inherit_group(event, parent, parent_ctx,
10536 child, child_ctx);
10537
10538 if (ret)
10539 *inherited_all = 0;
10540
10541 return ret;
bbbee908
PZ
10542}
10543
9b51f66d 10544/*
cdd6c482 10545 * Initialize the perf_event context in task_struct
9b51f66d 10546 */
985c8dcb 10547static int perf_event_init_context(struct task_struct *child, int ctxn)
9b51f66d 10548{
889ff015 10549 struct perf_event_context *child_ctx, *parent_ctx;
cdd6c482
IM
10550 struct perf_event_context *cloned_ctx;
10551 struct perf_event *event;
9b51f66d 10552 struct task_struct *parent = current;
564c2b21 10553 int inherited_all = 1;
dddd3379 10554 unsigned long flags;
6ab423e0 10555 int ret = 0;
9b51f66d 10556
8dc85d54 10557 if (likely(!parent->perf_event_ctxp[ctxn]))
6ab423e0
PZ
10558 return 0;
10559
ad3a37de 10560 /*
25346b93
PM
10561 * If the parent's context is a clone, pin it so it won't get
10562 * swapped under us.
ad3a37de 10563 */
8dc85d54 10564 parent_ctx = perf_pin_task_context(parent, ctxn);
ffb4ef21
PZ
10565 if (!parent_ctx)
10566 return 0;
25346b93 10567
ad3a37de
PM
10568 /*
10569 * No need to check if parent_ctx != NULL here; since we saw
10570 * it non-NULL earlier, the only reason for it to become NULL
10571 * is if we exit, and since we're currently in the middle of
10572 * a fork we can't be exiting at the same time.
10573 */
ad3a37de 10574
9b51f66d
IM
10575 /*
10576 * Lock the parent list. No need to lock the child - not PID
10577 * hashed yet and not running, so nobody can access it.
10578 */
d859e29f 10579 mutex_lock(&parent_ctx->mutex);
9b51f66d
IM
10580
10581 /*
10582 * We dont have to disable NMIs - we are only looking at
10583 * the list, not manipulating it:
10584 */
889ff015 10585 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
8dc85d54
PZ
10586 ret = inherit_task_group(event, parent, parent_ctx,
10587 child, ctxn, &inherited_all);
889ff015
FW
10588 if (ret)
10589 break;
10590 }
b93f7978 10591
dddd3379
TG
10592 /*
10593 * We can't hold ctx->lock when iterating the ->flexible_group list due
10594 * to allocations, but we need to prevent rotation because
10595 * rotate_ctx() will change the list from interrupt context.
10596 */
10597 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
10598 parent_ctx->rotate_disable = 1;
10599 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
10600
889ff015 10601 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
8dc85d54
PZ
10602 ret = inherit_task_group(event, parent, parent_ctx,
10603 child, ctxn, &inherited_all);
889ff015 10604 if (ret)
9b51f66d 10605 break;
564c2b21
PM
10606 }
10607
dddd3379
TG
10608 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
10609 parent_ctx->rotate_disable = 0;
dddd3379 10610
8dc85d54 10611 child_ctx = child->perf_event_ctxp[ctxn];
889ff015 10612
05cbaa28 10613 if (child_ctx && inherited_all) {
564c2b21
PM
10614 /*
10615 * Mark the child context as a clone of the parent
10616 * context, or of whatever the parent is a clone of.
c5ed5145
PZ
10617 *
10618 * Note that if the parent is a clone, the holding of
10619 * parent_ctx->lock avoids it from being uncloned.
564c2b21 10620 */
c5ed5145 10621 cloned_ctx = parent_ctx->parent_ctx;
ad3a37de
PM
10622 if (cloned_ctx) {
10623 child_ctx->parent_ctx = cloned_ctx;
25346b93 10624 child_ctx->parent_gen = parent_ctx->parent_gen;
564c2b21
PM
10625 } else {
10626 child_ctx->parent_ctx = parent_ctx;
10627 child_ctx->parent_gen = parent_ctx->generation;
10628 }
10629 get_ctx(child_ctx->parent_ctx);
9b51f66d
IM
10630 }
10631
c5ed5145 10632 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
d859e29f 10633 mutex_unlock(&parent_ctx->mutex);
6ab423e0 10634
25346b93 10635 perf_unpin_context(parent_ctx);
fe4b04fa 10636 put_ctx(parent_ctx);
ad3a37de 10637
6ab423e0 10638 return ret;
9b51f66d
IM
10639}
10640
8dc85d54
PZ
10641/*
10642 * Initialize the perf_event context in task_struct
10643 */
10644int perf_event_init_task(struct task_struct *child)
10645{
10646 int ctxn, ret;
10647
8550d7cb
ON
10648 memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
10649 mutex_init(&child->perf_event_mutex);
10650 INIT_LIST_HEAD(&child->perf_event_list);
10651
8dc85d54
PZ
10652 for_each_task_context_nr(ctxn) {
10653 ret = perf_event_init_context(child, ctxn);
6c72e350
PZ
10654 if (ret) {
10655 perf_event_free_task(child);
8dc85d54 10656 return ret;
6c72e350 10657 }
8dc85d54
PZ
10658 }
10659
10660 return 0;
10661}
10662
220b140b
PM
10663static void __init perf_event_init_all_cpus(void)
10664{
b28ab83c 10665 struct swevent_htable *swhash;
220b140b 10666 int cpu;
220b140b
PM
10667
10668 for_each_possible_cpu(cpu) {
b28ab83c
PZ
10669 swhash = &per_cpu(swevent_htable, cpu);
10670 mutex_init(&swhash->hlist_mutex);
2fde4f94 10671 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
f2fb6bef
KL
10672
10673 INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
10674 raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
e48c1788 10675
058fe1c0
DCC
10676#ifdef CONFIG_CGROUP_PERF
10677 INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
10678#endif
e48c1788 10679 INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
220b140b
PM
10680 }
10681}
10682
00e16c3d 10683int perf_event_init_cpu(unsigned int cpu)
0793a61d 10684{
108b02cf 10685 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
0793a61d 10686
b28ab83c 10687 mutex_lock(&swhash->hlist_mutex);
059fcd8c 10688 if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
76e1d904
FW
10689 struct swevent_hlist *hlist;
10690
b28ab83c
PZ
10691 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
10692 WARN_ON(!hlist);
10693 rcu_assign_pointer(swhash->swevent_hlist, hlist);
76e1d904 10694 }
b28ab83c 10695 mutex_unlock(&swhash->hlist_mutex);
00e16c3d 10696 return 0;
0793a61d
TG
10697}
10698
2965faa5 10699#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
108b02cf 10700static void __perf_event_exit_context(void *__info)
0793a61d 10701{
108b02cf 10702 struct perf_event_context *ctx = __info;
fae3fde6
PZ
10703 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
10704 struct perf_event *event;
0793a61d 10705
fae3fde6
PZ
10706 raw_spin_lock(&ctx->lock);
10707 list_for_each_entry(event, &ctx->event_list, event_entry)
45a0e07a 10708 __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
fae3fde6 10709 raw_spin_unlock(&ctx->lock);
0793a61d 10710}
108b02cf
PZ
10711
10712static void perf_event_exit_cpu_context(int cpu)
10713{
10714 struct perf_event_context *ctx;
10715 struct pmu *pmu;
10716 int idx;
10717
10718 idx = srcu_read_lock(&pmus_srcu);
10719 list_for_each_entry_rcu(pmu, &pmus, entry) {
917bdd1c 10720 ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
108b02cf
PZ
10721
10722 mutex_lock(&ctx->mutex);
10723 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
10724 mutex_unlock(&ctx->mutex);
10725 }
10726 srcu_read_unlock(&pmus_srcu, idx);
108b02cf 10727}
00e16c3d
TG
10728#else
10729
10730static void perf_event_exit_cpu_context(int cpu) { }
10731
10732#endif
108b02cf 10733
00e16c3d 10734int perf_event_exit_cpu(unsigned int cpu)
0793a61d 10735{
e3703f8c 10736 perf_event_exit_cpu_context(cpu);
00e16c3d 10737 return 0;
0793a61d 10738}
0793a61d 10739
c277443c
PZ
10740static int
10741perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
10742{
10743 int cpu;
10744
10745 for_each_online_cpu(cpu)
10746 perf_event_exit_cpu(cpu);
10747
10748 return NOTIFY_OK;
10749}
10750
10751/*
10752 * Run the perf reboot notifier at the very last possible moment so that
10753 * the generic watchdog code runs as long as possible.
10754 */
10755static struct notifier_block perf_reboot_notifier = {
10756 .notifier_call = perf_reboot,
10757 .priority = INT_MIN,
10758};
10759
cdd6c482 10760void __init perf_event_init(void)
0793a61d 10761{
3c502e7a
JW
10762 int ret;
10763
2e80a82a
PZ
10764 idr_init(&pmu_idr);
10765
220b140b 10766 perf_event_init_all_cpus();
b0a873eb 10767 init_srcu_struct(&pmus_srcu);
2e80a82a
PZ
10768 perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
10769 perf_pmu_register(&perf_cpu_clock, NULL, -1);
10770 perf_pmu_register(&perf_task_clock, NULL, -1);
b0a873eb 10771 perf_tp_register();
00e16c3d 10772 perf_event_init_cpu(smp_processor_id());
c277443c 10773 register_reboot_notifier(&perf_reboot_notifier);
3c502e7a
JW
10774
10775 ret = init_hw_breakpoint();
10776 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
b2029520 10777
b01c3a00
JO
10778 /*
10779 * Build time assertion that we keep the data_head at the intended
10780 * location. IOW, validation we got the __reserved[] size right.
10781 */
10782 BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
10783 != 1024);
0793a61d 10784}
abe43400 10785
fd979c01
CS
10786ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
10787 char *page)
10788{
10789 struct perf_pmu_events_attr *pmu_attr =
10790 container_of(attr, struct perf_pmu_events_attr, attr);
10791
10792 if (pmu_attr->event_str)
10793 return sprintf(page, "%s\n", pmu_attr->event_str);
10794
10795 return 0;
10796}
675965b0 10797EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
fd979c01 10798
abe43400
PZ
10799static int __init perf_event_sysfs_init(void)
10800{
10801 struct pmu *pmu;
10802 int ret;
10803
10804 mutex_lock(&pmus_lock);
10805
10806 ret = bus_register(&pmu_bus);
10807 if (ret)
10808 goto unlock;
10809
10810 list_for_each_entry(pmu, &pmus, entry) {
10811 if (!pmu->name || pmu->type < 0)
10812 continue;
10813
10814 ret = pmu_dev_alloc(pmu);
10815 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
10816 }
10817 pmu_bus_running = 1;
10818 ret = 0;
10819
10820unlock:
10821 mutex_unlock(&pmus_lock);
10822
10823 return ret;
10824}
10825device_initcall(perf_event_sysfs_init);
e5d1367f
SE
10826
10827#ifdef CONFIG_CGROUP_PERF
eb95419b
TH
10828static struct cgroup_subsys_state *
10829perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
e5d1367f
SE
10830{
10831 struct perf_cgroup *jc;
e5d1367f 10832
1b15d055 10833 jc = kzalloc(sizeof(*jc), GFP_KERNEL);
e5d1367f
SE
10834 if (!jc)
10835 return ERR_PTR(-ENOMEM);
10836
e5d1367f
SE
10837 jc->info = alloc_percpu(struct perf_cgroup_info);
10838 if (!jc->info) {
10839 kfree(jc);
10840 return ERR_PTR(-ENOMEM);
10841 }
10842
e5d1367f
SE
10843 return &jc->css;
10844}
10845
eb95419b 10846static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
e5d1367f 10847{
eb95419b
TH
10848 struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
10849
e5d1367f
SE
10850 free_percpu(jc->info);
10851 kfree(jc);
10852}
10853
10854static int __perf_cgroup_move(void *info)
10855{
10856 struct task_struct *task = info;
ddaaf4e2 10857 rcu_read_lock();
e5d1367f 10858 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
ddaaf4e2 10859 rcu_read_unlock();
e5d1367f
SE
10860 return 0;
10861}
10862
1f7dd3e5 10863static void perf_cgroup_attach(struct cgroup_taskset *tset)
e5d1367f 10864{
bb9d97b6 10865 struct task_struct *task;
1f7dd3e5 10866 struct cgroup_subsys_state *css;
bb9d97b6 10867
1f7dd3e5 10868 cgroup_taskset_for_each(task, css, tset)
bb9d97b6 10869 task_function_call(task, __perf_cgroup_move, task);
e5d1367f
SE
10870}
10871
073219e9 10872struct cgroup_subsys perf_event_cgrp_subsys = {
92fb9748
TH
10873 .css_alloc = perf_cgroup_css_alloc,
10874 .css_free = perf_cgroup_css_free,
bb9d97b6 10875 .attach = perf_cgroup_attach,
e5d1367f
SE
10876};
10877#endif /* CONFIG_CGROUP_PERF */